forked from ~ljy/RK356X_SDK_RELEASE

hc · 2023-12-09 · commit b22da3d8526a935aa31e086e63f60ff3246cb61c
kernel/drivers/nvme/host/core.c
@@ -1,39 +1,31 @@
1
+// SPDX-License-Identifier: GPL-2.0
12 /*
23 * NVM Express device driver
34 * Copyright (c) 2011-2014, Intel Corporation.
4
- *
5
- * This program is free software; you can redistribute it and/or modify it
6
- * under the terms and conditions of the GNU General Public License,
7
- * version 2, as published by the Free Software Foundation.
8
- *
9
- * This program is distributed in the hope it will be useful, but WITHOUT
10
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12
- * more details.
135 */
146
157 #include <linux/blkdev.h>
168 #include <linux/blk-mq.h>
9
+#include <linux/compat.h>
1710 #include <linux/delay.h>
1811 #include <linux/errno.h>
1912 #include <linux/hdreg.h>
2013 #include <linux/kernel.h>
2114 #include <linux/module.h>
22
-#include <linux/list_sort.h>
15
+#include <linux/backing-dev.h>
2316 #include <linux/slab.h>
2417 #include <linux/types.h>
2518 #include <linux/pr.h>
2619 #include <linux/ptrace.h>
2720 #include <linux/nvme_ioctl.h>
28
-#include <linux/t10-pi.h>
2921 #include <linux/pm_qos.h>
3022 #include <asm/unaligned.h>
3123
32
-#define CREATE_TRACE_POINTS
33
-#include "trace.h"
34
-
3524 #include "nvme.h"
3625 #include "fabrics.h"
26
+
27
+#define CREATE_TRACE_POINTS
28
+#include "trace.h"
3729
3830 #define NVME_MINORS (1U << MINORBITS)
3931
@@ -73,8 +65,8 @@
7365 * nvme_reset_wq - hosts nvme reset works
7466 * nvme_delete_wq - hosts nvme delete works
7567 *
76
- * nvme_wq will host works such are scan, aen handling, fw activation,
77
- * keep-alive error recovery, periodic reconnects etc. nvme_reset_wq
68
+ * nvme_wq will host works such as scan, aen handling, fw activation,
69
+ * keep-alive, periodic reconnects etc. nvme_reset_wq
7870 * runs reset works which also flush works hosted on nvme_wq for
7971 * serialization purposes. nvme_delete_wq host controller deletion
8072 * works which flush reset works for serialization.
@@ -88,7 +80,6 @@
8880 struct workqueue_struct *nvme_delete_wq;
8981 EXPORT_SYMBOL_GPL(nvme_delete_wq);
9082
91
-static DEFINE_IDA(nvme_subsystems_ida);
9283 static LIST_HEAD(nvme_subsystems);
9384 static DEFINE_MUTEX(nvme_subsystems_lock);
9485
@@ -97,27 +88,38 @@
9788 static struct class *nvme_class;
9889 static struct class *nvme_subsys_class;
9990
100
-static void nvme_ns_remove(struct nvme_ns *ns);
101
-static int nvme_revalidate_disk(struct gendisk *disk);
10291 static void nvme_put_subsystem(struct nvme_subsystem *subsys);
10392 static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
10493 unsigned nsid);
10594
95
+static void nvme_update_bdev_size(struct gendisk *disk)
96
+{
97
+ struct block_device *bdev = bdget_disk(disk, 0);
98
+
99
+ if (bdev) {
100
+ bd_set_nr_sectors(bdev, get_capacity(disk));
101
+ bdput(bdev);
102
+ }
103
+}
104
+
105
+/*
106
+ * Prepare a queue for teardown.
107
+ *
108
+ * This must forcibly unquiesce queues to avoid blocking dispatch, and only set
109
+ * the capacity to 0 after that to avoid blocking dispatchers that may be
110
+ * holding bd_butex. This will end buffered writers dirtying pages that can't
111
+ * be synced.
112
+ */
106113 static void nvme_set_queue_dying(struct nvme_ns *ns)
107114 {
108
- /*
109
- * Revalidating a dead namespace sets capacity to 0. This will end
110
- * buffered writers dirtying pages that can't be synced.
111
- */
112
- if (!ns->disk || test_and_set_bit(NVME_NS_DEAD, &ns->flags))
115
+ if (test_and_set_bit(NVME_NS_DEAD, &ns->flags))
113116 return;
117
+
114118 blk_set_queue_dying(ns->queue);
115
- /* Forcibly unquiesce queues to avoid blocking dispatch */
116119 blk_mq_unquiesce_queue(ns->queue);
117
- /*
118
- * Revalidate after unblocking dispatchers that may be holding bd_butex
119
- */
120
- revalidate_disk(ns->disk);
120
+
121
+ set_capacity(ns->disk, 0);
122
+ nvme_update_bdev_size(ns->disk);
121123 }
122124
123125 static void nvme_queue_scan(struct nvme_ctrl *ctrl)
@@ -125,9 +127,25 @@
125127 /*
126128 * Only new queue scan work when admin and IO queues are both alive
127129 */
128
- if (ctrl->state == NVME_CTRL_LIVE)
130
+ if (ctrl->state == NVME_CTRL_LIVE && ctrl->tagset)
129131 queue_work(nvme_wq, &ctrl->scan_work);
130132 }
133
+
134
+/*
135
+ * Use this function to proceed with scheduling reset_work for a controller
136
+ * that had previously been set to the resetting state. This is intended for
137
+ * code paths that can't be interrupted by other reset attempts. A hot removal
138
+ * may prevent this from succeeding.
139
+ */
140
+int nvme_try_sched_reset(struct nvme_ctrl *ctrl)
141
+{
142
+ if (ctrl->state != NVME_CTRL_RESETTING)
143
+ return -EBUSY;
144
+ if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
145
+ return -EBUSY;
146
+ return 0;
147
+}
148
+EXPORT_SYMBOL_GPL(nvme_try_sched_reset);
131149
132150 int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
133151 {
@@ -146,8 +164,7 @@
146164 ret = nvme_reset_ctrl(ctrl);
147165 if (!ret) {
148166 flush_work(&ctrl->reset_work);
149
- if (ctrl->state != NVME_CTRL_LIVE &&
150
- ctrl->state != NVME_CTRL_ADMIN_ONLY)
167
+ if (ctrl->state != NVME_CTRL_LIVE)
151168 ret = -ENETRESET;
152169 }
153170
@@ -155,11 +172,8 @@
155172 }
156173 EXPORT_SYMBOL_GPL(nvme_reset_ctrl_sync);
157174
158
-static void nvme_delete_ctrl_work(struct work_struct *work)
175
+static void nvme_do_delete_ctrl(struct nvme_ctrl *ctrl)
159176 {
160
- struct nvme_ctrl *ctrl =
161
- container_of(work, struct nvme_ctrl, delete_work);
162
-
163177 dev_info(ctrl->device,
164178 "Removing ctrl: NQN \"%s\"\n", ctrl->opts->subsysnqn);
165179
@@ -168,7 +182,14 @@
168182 nvme_remove_namespaces(ctrl);
169183 ctrl->ops->delete_ctrl(ctrl);
170184 nvme_uninit_ctrl(ctrl);
171
- nvme_put_ctrl(ctrl);
185
+}
186
+
187
+static void nvme_delete_ctrl_work(struct work_struct *work)
188
+{
189
+ struct nvme_ctrl *ctrl =
190
+ container_of(work, struct nvme_ctrl, delete_work);
191
+
192
+ nvme_do_delete_ctrl(ctrl);
172193 }
173194
174195 int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
@@ -181,36 +202,28 @@
181202 }
182203 EXPORT_SYMBOL_GPL(nvme_delete_ctrl);
183204
184
-int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl)
205
+static void nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl)
185206 {
186
- int ret = 0;
187
-
188207 /*
189
- * Keep a reference until the work is flushed since ->delete_ctrl
190
- * can free the controller.
208
+ * Keep a reference until nvme_do_delete_ctrl() complete,
209
+ * since ->delete_ctrl can free the controller.
191210 */
192211 nvme_get_ctrl(ctrl);
193
- ret = nvme_delete_ctrl(ctrl);
194
- if (!ret)
195
- flush_work(&ctrl->delete_work);
212
+ if (nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
213
+ nvme_do_delete_ctrl(ctrl);
196214 nvme_put_ctrl(ctrl);
197
- return ret;
198
-}
199
-EXPORT_SYMBOL_GPL(nvme_delete_ctrl_sync);
200
-
201
-static inline bool nvme_ns_has_pi(struct nvme_ns *ns)
202
-{
203
- return ns->pi_type && ns->ms == sizeof(struct t10_pi_tuple);
204215 }
205216
206
-static blk_status_t nvme_error_status(struct request *req)
217
+static blk_status_t nvme_error_status(u16 status)
207218 {
208
- switch (nvme_req(req)->status & 0x7ff) {
219
+ switch (status & 0x7ff) {
209220 case NVME_SC_SUCCESS:
210221 return BLK_STS_OK;
211222 case NVME_SC_CAP_EXCEEDED:
212223 return BLK_STS_NOSPC;
213224 case NVME_SC_LBA_RANGE:
225
+ case NVME_SC_CMD_INTERRUPTED:
226
+ case NVME_SC_NS_NOT_READY:
214227 return BLK_STS_TARGET;
215228 case NVME_SC_BAD_ATTRIBUTES:
216229 case NVME_SC_ONCS_NOT_SUPPORTED:
@@ -232,52 +245,131 @@
232245 return BLK_STS_PROTECTION;
233246 case NVME_SC_RESERVATION_CONFLICT:
234247 return BLK_STS_NEXUS;
248
+ case NVME_SC_HOST_PATH_ERROR:
249
+ return BLK_STS_TRANSPORT;
250
+ case NVME_SC_ZONE_TOO_MANY_ACTIVE:
251
+ return BLK_STS_ZONE_ACTIVE_RESOURCE;
252
+ case NVME_SC_ZONE_TOO_MANY_OPEN:
253
+ return BLK_STS_ZONE_OPEN_RESOURCE;
235254 default:
236255 return BLK_STS_IOERR;
237256 }
238257 }
239258
240
-static inline bool nvme_req_needs_retry(struct request *req)
259
+static void nvme_retry_req(struct request *req)
241260 {
242
- if (blk_noretry_request(req))
243
- return false;
244
- if (nvme_req(req)->status & NVME_SC_DNR)
245
- return false;
246
- if (nvme_req(req)->retries >= nvme_max_retries)
247
- return false;
248
- return true;
261
+ struct nvme_ns *ns = req->q->queuedata;
262
+ unsigned long delay = 0;
263
+ u16 crd;
264
+
265
+ /* The mask and shift result must be <= 3 */
266
+ crd = (nvme_req(req)->status & NVME_SC_CRD) >> 11;
267
+ if (ns && crd)
268
+ delay = ns->ctrl->crdt[crd - 1] * 100;
269
+
270
+ nvme_req(req)->retries++;
271
+ blk_mq_requeue_request(req, false);
272
+ blk_mq_delay_kick_requeue_list(req->q, delay);
273
+}
274
+
275
+enum nvme_disposition {
276
+ COMPLETE,
277
+ RETRY,
278
+ FAILOVER,
279
+};
280
+
281
+static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
282
+{
283
+ if (likely(nvme_req(req)->status == 0))
284
+ return COMPLETE;
285
+
286
+ if (blk_noretry_request(req) ||
287
+ (nvme_req(req)->status & NVME_SC_DNR) ||
288
+ nvme_req(req)->retries >= nvme_max_retries)
289
+ return COMPLETE;
290
+
291
+ if (req->cmd_flags & REQ_NVME_MPATH) {
292
+ if (nvme_is_path_error(nvme_req(req)->status) ||
293
+ blk_queue_dying(req->q))
294
+ return FAILOVER;
295
+ } else {
296
+ if (blk_queue_dying(req->q))
297
+ return COMPLETE;
298
+ }
299
+
300
+ return RETRY;
301
+}
302
+
303
+static inline void nvme_end_req(struct request *req)
304
+{
305
+ blk_status_t status = nvme_error_status(nvme_req(req)->status);
306
+
307
+ if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
308
+ req_op(req) == REQ_OP_ZONE_APPEND)
309
+ req->__sector = nvme_lba_to_sect(req->q->queuedata,
310
+ le64_to_cpu(nvme_req(req)->result.u64));
311
+
312
+ nvme_trace_bio_complete(req, status);
313
+ blk_mq_end_request(req, status);
249314 }
250315
251316 void nvme_complete_rq(struct request *req)
252317 {
253
- blk_status_t status = nvme_error_status(req);
254
-
255318 trace_nvme_complete_rq(req);
319
+ nvme_cleanup_cmd(req);
256320
257
- if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
258
- if ((req->cmd_flags & REQ_NVME_MPATH) && nvme_failover_req(req))
259
- return;
321
+ if (nvme_req(req)->ctrl->kas)
322
+ nvme_req(req)->ctrl->comp_seen = true;
260323
261
- if (!blk_queue_dying(req->q)) {
262
- nvme_req(req)->retries++;
263
- blk_mq_requeue_request(req, true);
264
- return;
265
- }
324
+ switch (nvme_decide_disposition(req)) {
325
+ case COMPLETE:
326
+ nvme_end_req(req);
327
+ return;
328
+ case RETRY:
329
+ nvme_retry_req(req);
330
+ return;
331
+ case FAILOVER:
332
+ nvme_failover_req(req);
333
+ return;
266334 }
267
- blk_mq_end_request(req, status);
268335 }
269336 EXPORT_SYMBOL_GPL(nvme_complete_rq);
270337
271
-void nvme_cancel_request(struct request *req, void *data, bool reserved)
338
+bool nvme_cancel_request(struct request *req, void *data, bool reserved)
272339 {
273340 dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device,
274341 "Cancelling I/O %d", req->tag);
275342
276
- nvme_req(req)->status = NVME_SC_ABORT_REQ;
277
- blk_mq_complete_request(req);
343
+ /* don't abort one completed request */
344
+ if (blk_mq_request_completed(req))
345
+ return true;
278346
347
+ nvme_req(req)->status = NVME_SC_HOST_ABORTED_CMD;
348
+ nvme_req(req)->flags |= NVME_REQ_CANCELLED;
349
+ blk_mq_complete_request(req);
350
+ return true;
279351 }
280352 EXPORT_SYMBOL_GPL(nvme_cancel_request);
353
+
354
+void nvme_cancel_tagset(struct nvme_ctrl *ctrl)
355
+{
356
+ if (ctrl->tagset) {
357
+ blk_mq_tagset_busy_iter(ctrl->tagset,
358
+ nvme_cancel_request, ctrl);
359
+ blk_mq_tagset_wait_completed_request(ctrl->tagset);
360
+ }
361
+}
362
+EXPORT_SYMBOL_GPL(nvme_cancel_tagset);
363
+
364
+void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl)
365
+{
366
+ if (ctrl->admin_tagset) {
367
+ blk_mq_tagset_busy_iter(ctrl->admin_tagset,
368
+ nvme_cancel_request, ctrl);
369
+ blk_mq_tagset_wait_completed_request(ctrl->admin_tagset);
370
+ }
371
+}
372
+EXPORT_SYMBOL_GPL(nvme_cancel_admin_tagset);
281373
282374 bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
283375 enum nvme_ctrl_state new_state)
@@ -290,22 +382,13 @@
290382
291383 old_state = ctrl->state;
292384 switch (new_state) {
293
- case NVME_CTRL_ADMIN_ONLY:
294
- switch (old_state) {
295
- case NVME_CTRL_CONNECTING:
296
- changed = true;
297
- /* FALLTHRU */
298
- default:
299
- break;
300
- }
301
- break;
302385 case NVME_CTRL_LIVE:
303386 switch (old_state) {
304387 case NVME_CTRL_NEW:
305388 case NVME_CTRL_RESETTING:
306389 case NVME_CTRL_CONNECTING:
307390 changed = true;
308
- /* FALLTHRU */
391
+ fallthrough;
309392 default:
310393 break;
311394 }
@@ -314,9 +397,8 @@
314397 switch (old_state) {
315398 case NVME_CTRL_NEW:
316399 case NVME_CTRL_LIVE:
317
- case NVME_CTRL_ADMIN_ONLY:
318400 changed = true;
319
- /* FALLTHRU */
401
+ fallthrough;
320402 default:
321403 break;
322404 }
@@ -326,7 +408,7 @@
326408 case NVME_CTRL_NEW:
327409 case NVME_CTRL_RESETTING:
328410 changed = true;
329
- /* FALLTHRU */
411
+ fallthrough;
330412 default:
331413 break;
332414 }
@@ -334,11 +416,20 @@
334416 case NVME_CTRL_DELETING:
335417 switch (old_state) {
336418 case NVME_CTRL_LIVE:
337
- case NVME_CTRL_ADMIN_ONLY:
338419 case NVME_CTRL_RESETTING:
339420 case NVME_CTRL_CONNECTING:
340421 changed = true;
341
- /* FALLTHRU */
422
+ fallthrough;
423
+ default:
424
+ break;
425
+ }
426
+ break;
427
+ case NVME_CTRL_DELETING_NOIO:
428
+ switch (old_state) {
429
+ case NVME_CTRL_DELETING:
430
+ case NVME_CTRL_DEAD:
431
+ changed = true;
432
+ fallthrough;
342433 default:
343434 break;
344435 }
@@ -347,7 +438,7 @@
347438 switch (old_state) {
348439 case NVME_CTRL_DELETING:
349440 changed = true;
350
- /* FALLTHRU */
441
+ fallthrough;
351442 default:
352443 break;
353444 }
@@ -356,8 +447,10 @@
356447 break;
357448 }
358449
359
- if (changed)
450
+ if (changed) {
360451 ctrl->state = new_state;
452
+ wake_up_all(&ctrl->state_wq);
453
+ }
361454
362455 spin_unlock_irqrestore(&ctrl->lock, flags);
363456 if (changed && ctrl->state == NVME_CTRL_LIVE)
@@ -366,6 +459,40 @@
366459 }
367460 EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);
368461
462
+/*
463
+ * Returns true for sink states that can't ever transition back to live.
464
+ */
465
+static bool nvme_state_terminal(struct nvme_ctrl *ctrl)
466
+{
467
+ switch (ctrl->state) {
468
+ case NVME_CTRL_NEW:
469
+ case NVME_CTRL_LIVE:
470
+ case NVME_CTRL_RESETTING:
471
+ case NVME_CTRL_CONNECTING:
472
+ return false;
473
+ case NVME_CTRL_DELETING:
474
+ case NVME_CTRL_DELETING_NOIO:
475
+ case NVME_CTRL_DEAD:
476
+ return true;
477
+ default:
478
+ WARN_ONCE(1, "Unhandled ctrl state:%d", ctrl->state);
479
+ return true;
480
+ }
481
+}
482
+
483
+/*
484
+ * Waits for the controller state to be resetting, or returns false if it is
485
+ * not possible to ever transition to that state.
486
+ */
487
+bool nvme_wait_reset(struct nvme_ctrl *ctrl)
488
+{
489
+ wait_event(ctrl->state_wq,
490
+ nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING) ||
491
+ nvme_state_terminal(ctrl));
492
+ return ctrl->state == NVME_CTRL_RESETTING;
493
+}
494
+EXPORT_SYMBOL_GPL(nvme_wait_reset);
495
+
369496 static void nvme_free_ns_head(struct kref *ref)
370497 {
371498 struct nvme_ns_head *head =
@@ -373,8 +500,7 @@
373500
374501 nvme_mpath_remove_disk(head);
375502 ida_simple_remove(&head->subsys->ns_ida, head->instance);
376
- list_del_init(&head->entry);
377
- cleanup_srcu_struct_quiesced(&head->srcu);
503
+ cleanup_srcu_struct(&head->srcu);
378504 nvme_put_subsystem(head->subsys);
379505 kfree(head);
380506 }
@@ -397,42 +523,61 @@
397523 kfree(ns);
398524 }
399525
400
-static void nvme_put_ns(struct nvme_ns *ns)
526
+void nvme_put_ns(struct nvme_ns *ns)
401527 {
402528 kref_put(&ns->kref, nvme_free_ns);
403529 }
530
+EXPORT_SYMBOL_NS_GPL(nvme_put_ns, NVME_TARGET_PASSTHRU);
404531
405532 static inline void nvme_clear_nvme_request(struct request *req)
406533 {
407
- if (!(req->rq_flags & RQF_DONTPREP)) {
408
- nvme_req(req)->retries = 0;
409
- nvme_req(req)->flags = 0;
410
- req->rq_flags |= RQF_DONTPREP;
411
- }
534
+ nvme_req(req)->retries = 0;
535
+ nvme_req(req)->flags = 0;
536
+ req->rq_flags |= RQF_DONTPREP;
412537 }
413538
414
-struct request *nvme_alloc_request(struct request_queue *q,
415
- struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid)
539
+static inline unsigned int nvme_req_op(struct nvme_command *cmd)
416540 {
417
- unsigned op = nvme_is_write(cmd) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN;
418
- struct request *req;
541
+ return nvme_is_write(cmd) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN;
542
+}
419543
420
- if (qid == NVME_QID_ANY) {
421
- req = blk_mq_alloc_request(q, op, flags);
422
- } else {
423
- req = blk_mq_alloc_request_hctx(q, op, flags,
424
- qid ? qid - 1 : 0);
425
- }
426
- if (IS_ERR(req))
427
- return req;
544
+static inline void nvme_init_request(struct request *req,
545
+ struct nvme_command *cmd)
546
+{
547
+ if (req->q->queuedata)
548
+ req->timeout = NVME_IO_TIMEOUT;
549
+ else /* no queuedata implies admin queue */
550
+ req->timeout = ADMIN_TIMEOUT;
428551
429552 req->cmd_flags |= REQ_FAILFAST_DRIVER;
430553 nvme_clear_nvme_request(req);
431554 nvme_req(req)->cmd = cmd;
555
+}
432556
557
+struct request *nvme_alloc_request(struct request_queue *q,
558
+ struct nvme_command *cmd, blk_mq_req_flags_t flags)
559
+{
560
+ struct request *req;
561
+
562
+ req = blk_mq_alloc_request(q, nvme_req_op(cmd), flags);
563
+ if (!IS_ERR(req))
564
+ nvme_init_request(req, cmd);
433565 return req;
434566 }
435567 EXPORT_SYMBOL_GPL(nvme_alloc_request);
568
+
569
+struct request *nvme_alloc_request_qid(struct request_queue *q,
570
+ struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid)
571
+{
572
+ struct request *req;
573
+
574
+ req = blk_mq_alloc_request_hctx(q, nvme_req_op(cmd), flags,
575
+ qid ? qid - 1 : 0);
576
+ if (!IS_ERR(req))
577
+ nvme_init_request(req, cmd);
578
+ return req;
579
+}
580
+EXPORT_SYMBOL_GPL(nvme_alloc_request_qid);
436581
437582 static int nvme_toggle_streams(struct nvme_ctrl *ctrl, bool enable)
438583 {
@@ -470,7 +615,7 @@
470615
471616 c.directive.opcode = nvme_admin_directive_recv;
472617 c.directive.nsid = cpu_to_le32(nsid);
473
- c.directive.numd = cpu_to_le32((sizeof(*s) >> 2) - 1);
618
+ c.directive.numd = cpu_to_le32(nvme_bytes_to_numd(sizeof(*s)));
474619 c.directive.doper = NVME_DIR_RCV_ST_OP_PARAM;
475620 c.directive.dtype = NVME_DIR_STREAMS;
476621
@@ -493,19 +638,22 @@
493638
494639 ret = nvme_get_stream_params(ctrl, &s, NVME_NSID_ALL);
495640 if (ret)
496
- return ret;
641
+ goto out_disable_stream;
497642
498643 ctrl->nssa = le16_to_cpu(s.nssa);
499644 if (ctrl->nssa < BLK_MAX_WRITE_HINTS - 1) {
500645 dev_info(ctrl->device, "too few streams (%u) available\n",
501646 ctrl->nssa);
502
- nvme_disable_streams(ctrl);
503
- return 0;
647
+ goto out_disable_stream;
504648 }
505649
506
- ctrl->nr_streams = min_t(unsigned, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1);
650
+ ctrl->nr_streams = min_t(u16, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1);
507651 dev_info(ctrl->device, "Using %u streams\n", ctrl->nr_streams);
508652 return 0;
653
+
654
+out_disable_stream:
655
+ nvme_disable_streams(ctrl);
656
+ return ret;
509657 }
510658
511659 /*
@@ -533,10 +681,17 @@
533681 req->q->write_hints[streamid] += blk_rq_bytes(req) >> 9;
534682 }
535683
684
+static inline void nvme_setup_passthrough(struct request *req,
685
+ struct nvme_command *cmd)
686
+{
687
+ memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd));
688
+ /* passthru commands should let the driver set the SGL flags */
689
+ cmd->common.flags &= ~NVME_CMD_SGL_ALL;
690
+}
691
+
536692 static inline void nvme_setup_flush(struct nvme_ns *ns,
537693 struct nvme_command *cmnd)
538694 {
539
- memset(cmnd, 0, sizeof(*cmnd));
540695 cmnd->common.opcode = nvme_cmd_flush;
541696 cmnd->common.nsid = cpu_to_le32(ns->head->ns_id);
542697 }
@@ -569,7 +724,7 @@
569724 }
570725
571726 __rq_for_each_bio(bio, req) {
572
- u64 slba = nvme_block_nr(ns, bio->bi_iter.bi_sector);
727
+ u64 slba = nvme_sect_to_lba(ns, bio->bi_iter.bi_sector);
573728 u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift;
574729
575730 if (n < segments) {
@@ -588,7 +743,6 @@
588743 return BLK_STS_IOERR;
589744 }
590745
591
- memset(cmnd, 0, sizeof(*cmnd));
592746 cmnd->dsm.opcode = nvme_cmd_dsm;
593747 cmnd->dsm.nsid = cpu_to_le32(ns->head->ns_id);
594748 cmnd->dsm.nr = cpu_to_le32(segments - 1);
@@ -602,8 +756,28 @@
602756 return BLK_STS_OK;
603757 }
604758
605
-static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
759
+static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
606760 struct request *req, struct nvme_command *cmnd)
761
+{
762
+ if (ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
763
+ return nvme_setup_discard(ns, req, cmnd);
764
+
765
+ cmnd->write_zeroes.opcode = nvme_cmd_write_zeroes;
766
+ cmnd->write_zeroes.nsid = cpu_to_le32(ns->head->ns_id);
767
+ cmnd->write_zeroes.slba =
768
+ cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
769
+ cmnd->write_zeroes.length =
770
+ cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
771
+ if (nvme_ns_has_pi(ns))
772
+ cmnd->write_zeroes.control = cpu_to_le16(NVME_RW_PRINFO_PRACT);
773
+ else
774
+ cmnd->write_zeroes.control = 0;
775
+ return BLK_STS_OK;
776
+}
777
+
778
+static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
779
+ struct request *req, struct nvme_command *cmnd,
780
+ enum nvme_opcode op)
607781 {
608782 struct nvme_ctrl *ctrl = ns->ctrl;
609783 u16 control = 0;
@@ -617,10 +791,9 @@
617791 if (req->cmd_flags & REQ_RAHEAD)
618792 dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
619793
620
- memset(cmnd, 0, sizeof(*cmnd));
621
- cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
794
+ cmnd->rw.opcode = op;
622795 cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
623
- cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
796
+ cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
624797 cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
625798
626799 if (req_op(req) == REQ_OP_WRITE && ctrl->nr_streams)
@@ -637,8 +810,6 @@
637810 if (WARN_ON_ONCE(!nvme_ns_has_pi(ns)))
638811 return BLK_STS_NOTSUPP;
639812 control |= NVME_RW_PRINFO_PRACT;
640
- } else if (req_op(req) == REQ_OP_WRITE) {
641
- t10_pi_prepare(req, ns->pi_type);
642813 }
643814
644815 switch (ns->pi_type) {
@@ -649,6 +820,8 @@
649820 case NVME_NS_DPS_PI_TYPE2:
650821 control |= NVME_RW_PRINFO_PRCHK_GUARD |
651822 NVME_RW_PRINFO_PRCHK_REF;
823
+ if (op == nvme_cmd_zone_append)
824
+ control |= NVME_RW_APPEND_PIREMAP;
652825 cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
653826 break;
654827 }
@@ -661,13 +834,6 @@
661834
662835 void nvme_cleanup_cmd(struct request *req)
663836 {
664
- if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ &&
665
- nvme_req(req)->status == 0) {
666
- struct nvme_ns *ns = req->rq_disk->private_data;
667
-
668
- t10_pi_complete(req, ns->pi_type,
669
- blk_rq_bytes(req) >> ns->lba_shift);
670
- }
671837 if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
672838 struct nvme_ns *ns = req->rq_disk->private_data;
673839 struct page *page = req->special_vec.bv_page;
@@ -683,37 +849,86 @@
683849 blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
684850 struct nvme_command *cmd)
685851 {
852
+ struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;
686853 blk_status_t ret = BLK_STS_OK;
687854
688
- nvme_clear_nvme_request(req);
855
+ if (!(req->rq_flags & RQF_DONTPREP))
856
+ nvme_clear_nvme_request(req);
689857
858
+ memset(cmd, 0, sizeof(*cmd));
690859 switch (req_op(req)) {
691860 case REQ_OP_DRV_IN:
692861 case REQ_OP_DRV_OUT:
693
- memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd));
862
+ nvme_setup_passthrough(req, cmd);
694863 break;
695864 case REQ_OP_FLUSH:
696865 nvme_setup_flush(ns, cmd);
697866 break;
867
+ case REQ_OP_ZONE_RESET_ALL:
868
+ case REQ_OP_ZONE_RESET:
869
+ ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_RESET);
870
+ break;
871
+ case REQ_OP_ZONE_OPEN:
872
+ ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OPEN);
873
+ break;
874
+ case REQ_OP_ZONE_CLOSE:
875
+ ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_CLOSE);
876
+ break;
877
+ case REQ_OP_ZONE_FINISH:
878
+ ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH);
879
+ break;
698880 case REQ_OP_WRITE_ZEROES:
699
- /* currently only aliased to deallocate for a few ctrls: */
881
+ ret = nvme_setup_write_zeroes(ns, req, cmd);
882
+ break;
700883 case REQ_OP_DISCARD:
701884 ret = nvme_setup_discard(ns, req, cmd);
702885 break;
703886 case REQ_OP_READ:
887
+ ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read);
888
+ break;
704889 case REQ_OP_WRITE:
705
- ret = nvme_setup_rw(ns, req, cmd);
890
+ ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);
891
+ break;
892
+ case REQ_OP_ZONE_APPEND:
893
+ ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append);
706894 break;
707895 default:
708896 WARN_ON_ONCE(1);
709897 return BLK_STS_IOERR;
710898 }
711899
712
- cmd->common.command_id = req->tag;
900
+ if (!(ctrl->quirks & NVME_QUIRK_SKIP_CID_GEN))
901
+ nvme_req(req)->genctr++;
902
+ cmd->common.command_id = nvme_cid(req);
713903 trace_nvme_setup_cmd(req, cmd);
714904 return ret;
715905 }
716906 EXPORT_SYMBOL_GPL(nvme_setup_cmd);
907
+
908
+static void nvme_end_sync_rq(struct request *rq, blk_status_t error)
909
+{
910
+ struct completion *waiting = rq->end_io_data;
911
+
912
+ rq->end_io_data = NULL;
913
+ complete(waiting);
914
+}
915
+
916
+static void nvme_execute_rq_polled(struct request_queue *q,
917
+ struct gendisk *bd_disk, struct request *rq, int at_head)
918
+{
919
+ DECLARE_COMPLETION_ONSTACK(wait);
920
+
921
+ WARN_ON_ONCE(!test_bit(QUEUE_FLAG_POLL, &q->queue_flags));
922
+
923
+ rq->cmd_flags |= REQ_HIPRI;
924
+ rq->end_io_data = &wait;
925
+ blk_execute_rq_nowait(q, bd_disk, rq, at_head, nvme_end_sync_rq);
926
+
927
+ while (!completion_done(&wait)) {
928
+ blk_poll(q, request_to_qc_t(rq->mq_hctx, rq), true);
929
+ cond_resched();
930
+ }
931
+}
717932
718933 /*
719934 * Returns 0 on success. If the result is negative, it's a Linux error code;
@@ -722,16 +937,20 @@
722937 int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
723938 union nvme_result *result, void *buffer, unsigned bufflen,
724939 unsigned timeout, int qid, int at_head,
725
- blk_mq_req_flags_t flags)
940
+ blk_mq_req_flags_t flags, bool poll)
726941 {
727942 struct request *req;
728943 int ret;
729944
730
- req = nvme_alloc_request(q, cmd, flags, qid);
945
+ if (qid == NVME_QID_ANY)
946
+ req = nvme_alloc_request(q, cmd, flags);
947
+ else
948
+ req = nvme_alloc_request_qid(q, cmd, flags, qid);
731949 if (IS_ERR(req))
732950 return PTR_ERR(req);
733951
734
- req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
952
+ if (timeout)
953
+ req->timeout = timeout;
735954
736955 if (buffer && bufflen) {
737956 ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL);
@@ -739,7 +958,10 @@
739958 goto out;
740959 }
741960
742
- blk_execute_rq(req->q, NULL, req, at_head);
961
+ if (poll)
962
+ nvme_execute_rq_polled(req->q, NULL, req, at_head);
963
+ else
964
+ blk_execute_rq(req->q, NULL, req, at_head);
743965 if (result)
744966 *result = nvme_req(req)->result;
745967 if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
@@ -756,7 +978,7 @@
756978 void *buffer, unsigned bufflen)
757979 {
758980 return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0,
759
- NVME_QID_ANY, 0, 0);
981
+ NVME_QID_ANY, 0, 0, false);
760982 }
761983 EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd);
762984
@@ -794,10 +1016,97 @@
7941016 return ERR_PTR(ret);
7951017 }
7961018
1019
+static u32 nvme_known_admin_effects(u8 opcode)
1020
+{
1021
+ switch (opcode) {
1022
+ case nvme_admin_format_nvm:
1023
+ return NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_NCC |
1024
+ NVME_CMD_EFFECTS_CSE_MASK;
1025
+ case nvme_admin_sanitize_nvm:
1026
+ return NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK;
1027
+ default:
1028
+ break;
1029
+ }
1030
+ return 0;
1031
+}
1032
+
1033
+u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode)
1034
+{
1035
+ u32 effects = 0;
1036
+
1037
+ if (ns) {
1038
+ if (ns->head->effects)
1039
+ effects = le32_to_cpu(ns->head->effects->iocs[opcode]);
1040
+ if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC))
1041
+ dev_warn(ctrl->device,
1042
+ "IO command:%02x has unhandled effects:%08x\n",
1043
+ opcode, effects);
1044
+ return 0;
1045
+ }
1046
+
1047
+ if (ctrl->effects)
1048
+ effects = le32_to_cpu(ctrl->effects->acs[opcode]);
1049
+ effects |= nvme_known_admin_effects(opcode);
1050
+
1051
+ return effects;
1052
+}
1053
+EXPORT_SYMBOL_NS_GPL(nvme_command_effects, NVME_TARGET_PASSTHRU);
1054
+
1055
+static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
1056
+ u8 opcode)
1057
+{
1058
+ u32 effects = nvme_command_effects(ctrl, ns, opcode);
1059
+
1060
+ /*
1061
+ * For simplicity, IO to all namespaces is quiesced even if the command
1062
+ * effects say only one namespace is affected.
1063
+ */
1064
+ if (effects & NVME_CMD_EFFECTS_CSE_MASK) {
1065
+ mutex_lock(&ctrl->scan_lock);
1066
+ mutex_lock(&ctrl->subsys->lock);
1067
+ nvme_mpath_start_freeze(ctrl->subsys);
1068
+ nvme_mpath_wait_freeze(ctrl->subsys);
1069
+ nvme_start_freeze(ctrl);
1070
+ nvme_wait_freeze(ctrl);
1071
+ }
1072
+ return effects;
1073
+}
1074
+
1075
+static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
1076
+{
1077
+ if (effects & NVME_CMD_EFFECTS_CSE_MASK) {
1078
+ nvme_unfreeze(ctrl);
1079
+ nvme_mpath_unfreeze(ctrl->subsys);
1080
+ mutex_unlock(&ctrl->subsys->lock);
1081
+ nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL);
1082
+ mutex_unlock(&ctrl->scan_lock);
1083
+ }
1084
+ if (effects & NVME_CMD_EFFECTS_CCC)
1085
+ nvme_init_identify(ctrl);
1086
+ if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) {
1087
+ nvme_queue_scan(ctrl);
1088
+ flush_work(&ctrl->scan_work);
1089
+ }
1090
+}
1091
+
1092
+void nvme_execute_passthru_rq(struct request *rq)
1093
+{
1094
+ struct nvme_command *cmd = nvme_req(rq)->cmd;
1095
+ struct nvme_ctrl *ctrl = nvme_req(rq)->ctrl;
1096
+ struct nvme_ns *ns = rq->q->queuedata;
1097
+ struct gendisk *disk = ns ? ns->disk : NULL;
1098
+ u32 effects;
1099
+
1100
+ effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode);
1101
+ blk_execute_rq(rq->q, disk, rq, 0);
1102
+ nvme_passthru_end(ctrl, effects);
1103
+}
1104
+EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU);
1105
+
7971106 static int nvme_submit_user_cmd(struct request_queue *q,
7981107 struct nvme_command *cmd, void __user *ubuffer,
7991108 unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
800
- u32 meta_seed, u32 *result, unsigned timeout)
1109
+ u32 meta_seed, u64 *result, unsigned timeout)
8011110 {
8021111 bool write = nvme_is_write(cmd);
8031112 struct nvme_ns *ns = q->queuedata;
@@ -807,11 +1116,12 @@
8071116 void *meta = NULL;
8081117 int ret;
8091118
810
- req = nvme_alloc_request(q, cmd, 0, NVME_QID_ANY);
1119
+ req = nvme_alloc_request(q, cmd, 0);
8111120 if (IS_ERR(req))
8121121 return PTR_ERR(req);
8131122
814
- req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
1123
+ if (timeout)
1124
+ req->timeout = timeout;
8151125 nvme_req(req)->flags |= NVME_REQ_USERCMD;
8161126
8171127 if (ubuffer && bufflen) {
@@ -832,13 +1142,13 @@
8321142 }
8331143 }
8341144
835
- blk_execute_rq(req->q, disk, req, 0);
1145
+ nvme_execute_passthru_rq(req);
8361146 if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
8371147 ret = -EINTR;
8381148 else
8391149 ret = nvme_req(req)->status;
8401150 if (result)
841
- *result = le32_to_cpu(nvme_req(req)->result.u32);
1151
+ *result = le64_to_cpu(nvme_req(req)->result.u64);
8421152 if (meta && !ret && !write) {
8431153 if (copy_to_user(meta_buffer, meta, meta_len))
8441154 ret = -EFAULT;
@@ -867,21 +1177,22 @@
8671177 return;
8681178 }
8691179
1180
+ ctrl->comp_seen = false;
8701181 spin_lock_irqsave(&ctrl->lock, flags);
8711182 if (ctrl->state == NVME_CTRL_LIVE ||
8721183 ctrl->state == NVME_CTRL_CONNECTING)
8731184 startka = true;
8741185 spin_unlock_irqrestore(&ctrl->lock, flags);
8751186 if (startka)
876
- schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
1187
+ queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ);
8771188 }
8781189
8791190 static int nvme_keep_alive(struct nvme_ctrl *ctrl)
8801191 {
8811192 struct request *rq;
8821193
883
- rq = nvme_alloc_request(ctrl->admin_q, &ctrl->ka_cmd, BLK_MQ_REQ_RESERVED,
884
- NVME_QID_ANY);
1194
+ rq = nvme_alloc_request(ctrl->admin_q, &ctrl->ka_cmd,
1195
+ BLK_MQ_REQ_RESERVED);
8851196 if (IS_ERR(rq))
8861197 return PTR_ERR(rq);
8871198
@@ -897,6 +1208,15 @@
8971208 {
8981209 struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
8991210 struct nvme_ctrl, ka_work);
1211
+ bool comp_seen = ctrl->comp_seen;
1212
+
1213
+ if ((ctrl->ctratt & NVME_CTRL_ATTR_TBKAS) && comp_seen) {
1214
+ dev_dbg(ctrl->device,
1215
+ "reschedule traffic based keep-alive timer\n");
1216
+ ctrl->comp_seen = false;
1217
+ queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ);
1218
+ return;
1219
+ }
9001220
9011221 if (nvme_keep_alive(ctrl)) {
9021222 /* allocation failure, reset the controller */
@@ -911,7 +1231,7 @@
9111231 if (unlikely(ctrl->kato == 0))
9121232 return;
9131233
914
- schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
1234
+ queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ);
9151235 }
9161236
9171237 void nvme_stop_keep_alive(struct nvme_ctrl *ctrl)
@@ -956,14 +1276,75 @@
9561276 return error;
9571277 }
9581278
1279
+static bool nvme_multi_css(struct nvme_ctrl *ctrl)
1280
+{
1281
+ return (ctrl->ctrl_config & NVME_CC_CSS_MASK) == NVME_CC_CSS_CSI;
1282
+}
1283
+
1284
+static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids,
1285
+ struct nvme_ns_id_desc *cur, bool *csi_seen)
1286
+{
1287
+ const char *warn_str = "ctrl returned bogus length:";
1288
+ void *data = cur;
1289
+
1290
+ switch (cur->nidt) {
1291
+ case NVME_NIDT_EUI64:
1292
+ if (cur->nidl != NVME_NIDT_EUI64_LEN) {
1293
+ dev_warn(ctrl->device, "%s %d for NVME_NIDT_EUI64\n",
1294
+ warn_str, cur->nidl);
1295
+ return -1;
1296
+ }
1297
+ if (ctrl->quirks & NVME_QUIRK_BOGUS_NID)
1298
+ return NVME_NIDT_EUI64_LEN;
1299
+ memcpy(ids->eui64, data + sizeof(*cur), NVME_NIDT_EUI64_LEN);
1300
+ return NVME_NIDT_EUI64_LEN;
1301
+ case NVME_NIDT_NGUID:
1302
+ if (cur->nidl != NVME_NIDT_NGUID_LEN) {
1303
+ dev_warn(ctrl->device, "%s %d for NVME_NIDT_NGUID\n",
1304
+ warn_str, cur->nidl);
1305
+ return -1;
1306
+ }
1307
+ if (ctrl->quirks & NVME_QUIRK_BOGUS_NID)
1308
+ return NVME_NIDT_NGUID_LEN;
1309
+ memcpy(ids->nguid, data + sizeof(*cur), NVME_NIDT_NGUID_LEN);
1310
+ return NVME_NIDT_NGUID_LEN;
1311
+ case NVME_NIDT_UUID:
1312
+ if (cur->nidl != NVME_NIDT_UUID_LEN) {
1313
+ dev_warn(ctrl->device, "%s %d for NVME_NIDT_UUID\n",
1314
+ warn_str, cur->nidl);
1315
+ return -1;
1316
+ }
1317
+ if (ctrl->quirks & NVME_QUIRK_BOGUS_NID)
1318
+ return NVME_NIDT_UUID_LEN;
1319
+ uuid_copy(&ids->uuid, data + sizeof(*cur));
1320
+ return NVME_NIDT_UUID_LEN;
1321
+ case NVME_NIDT_CSI:
1322
+ if (cur->nidl != NVME_NIDT_CSI_LEN) {
1323
+ dev_warn(ctrl->device, "%s %d for NVME_NIDT_CSI\n",
1324
+ warn_str, cur->nidl);
1325
+ return -1;
1326
+ }
1327
+ memcpy(&ids->csi, data + sizeof(*cur), NVME_NIDT_CSI_LEN);
1328
+ *csi_seen = true;
1329
+ return NVME_NIDT_CSI_LEN;
1330
+ default:
1331
+ /* Skip unknown types */
1332
+ return cur->nidl;
1333
+ }
1334
+}
1335
+
9591336 static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
9601337 struct nvme_ns_ids *ids)
9611338 {
9621339 struct nvme_command c = { };
963
- int status;
1340
+ bool csi_seen = false;
1341
+ int status, pos, len;
9641342 void *data;
965
- int pos;
966
- int len;
1343
+
1344
+ if (ctrl->vs < NVME_VS(1, 3, 0) && !nvme_multi_css(ctrl))
1345
+ return 0;
1346
+ if (ctrl->quirks & NVME_QUIRK_NO_NS_DESC_LIST)
1347
+ return 0;
9671348
9681349 c.identify.opcode = nvme_admin_identify;
9691350 c.identify.nsid = cpu_to_le32(nsid);
@@ -975,8 +1356,11 @@
9751356
9761357 status = nvme_submit_sync_cmd(ctrl->admin_q, &c, data,
9771358 NVME_IDENTIFY_DATA_SIZE);
978
- if (status)
1359
+ if (status) {
1360
+ dev_warn(ctrl->device,
1361
+ "Identify Descriptors failed (%d)\n", status);
9791362 goto free_data;
1363
+ }
9801364
9811365 for (pos = 0; pos < NVME_IDENTIFY_DATA_SIZE; pos += len) {
9821366 struct nvme_ns_id_desc *cur = data + pos;
@@ -984,65 +1368,27 @@
9841368 if (cur->nidl == 0)
9851369 break;
9861370
987
- switch (cur->nidt) {
988
- case NVME_NIDT_EUI64:
989
- if (cur->nidl != NVME_NIDT_EUI64_LEN) {
990
- dev_warn(ctrl->device,
991
- "ctrl returned bogus length: %d for NVME_NIDT_EUI64\n",
992
- cur->nidl);
993
- goto free_data;
994
- }
995
- len = NVME_NIDT_EUI64_LEN;
996
- memcpy(ids->eui64, data + pos + sizeof(*cur), len);
1371
+ len = nvme_process_ns_desc(ctrl, ids, cur, &csi_seen);
1372
+ if (len < 0)
9971373 break;
998
- case NVME_NIDT_NGUID:
999
- if (cur->nidl != NVME_NIDT_NGUID_LEN) {
1000
- dev_warn(ctrl->device,
1001
- "ctrl returned bogus length: %d for NVME_NIDT_NGUID\n",
1002
- cur->nidl);
1003
- goto free_data;
1004
- }
1005
- len = NVME_NIDT_NGUID_LEN;
1006
- memcpy(ids->nguid, data + pos + sizeof(*cur), len);
1007
- break;
1008
- case NVME_NIDT_UUID:
1009
- if (cur->nidl != NVME_NIDT_UUID_LEN) {
1010
- dev_warn(ctrl->device,
1011
- "ctrl returned bogus length: %d for NVME_NIDT_UUID\n",
1012
- cur->nidl);
1013
- goto free_data;
1014
- }
1015
- len = NVME_NIDT_UUID_LEN;
1016
- uuid_copy(&ids->uuid, data + pos + sizeof(*cur));
1017
- break;
1018
- default:
1019
- /* Skip unnkown types */
1020
- len = cur->nidl;
1021
- break;
1022
- }
10231374
10241375 len += sizeof(*cur);
10251376 }
1377
+
1378
+ if (nvme_multi_css(ctrl) && !csi_seen) {
1379
+ dev_warn(ctrl->device, "Command set not reported for nsid:%d\n",
1380
+ nsid);
1381
+ status = -EINVAL;
1382
+ }
1383
+
10261384 free_data:
10271385 kfree(data);
10281386 return status;
10291387 }
10301388
1031
-static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *ns_list)
1389
+static int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
1390
+ struct nvme_ns_ids *ids, struct nvme_id_ns **id)
10321391 {
1033
- struct nvme_command c = { };
1034
-
1035
- c.identify.opcode = nvme_admin_identify;
1036
- c.identify.cns = NVME_ID_CNS_NS_ACTIVE_LIST;
1037
- c.identify.nsid = cpu_to_le32(nsid);
1038
- return nvme_submit_sync_cmd(dev->admin_q, &c, ns_list,
1039
- NVME_IDENTIFY_DATA_SIZE);
1040
-}
1041
-
1042
-static struct nvme_id_ns *nvme_identify_ns(struct nvme_ctrl *ctrl,
1043
- unsigned nsid)
1044
-{
1045
- struct nvme_id_ns *id;
10461392 struct nvme_command c = { };
10471393 int error;
10481394
@@ -1051,38 +1397,76 @@
10511397 c.identify.nsid = cpu_to_le32(nsid);
10521398 c.identify.cns = NVME_ID_CNS_NS;
10531399
1054
- id = kmalloc(sizeof(*id), GFP_KERNEL);
1055
- if (!id)
1056
- return NULL;
1400
+ *id = kmalloc(sizeof(**id), GFP_KERNEL);
1401
+ if (!*id)
1402
+ return -ENOMEM;
10571403
1058
- error = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
1404
+ error = nvme_submit_sync_cmd(ctrl->admin_q, &c, *id, sizeof(**id));
10591405 if (error) {
1060
- dev_warn(ctrl->device, "Identify namespace failed\n");
1061
- kfree(id);
1062
- return NULL;
1406
+ dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error);
1407
+ goto out_free_id;
10631408 }
10641409
1065
- return id;
1410
+ error = NVME_SC_INVALID_NS | NVME_SC_DNR;
1411
+ if ((*id)->ncap == 0) /* namespace not allocated or attached */
1412
+ goto out_free_id;
1413
+
1414
+
1415
+ if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) {
1416
+ dev_info(ctrl->device,
1417
+ "Ignoring bogus Namespace Identifiers\n");
1418
+ } else {
1419
+ if (ctrl->vs >= NVME_VS(1, 1, 0) &&
1420
+ !memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
1421
+ memcpy(ids->eui64, (*id)->eui64, sizeof(ids->eui64));
1422
+ if (ctrl->vs >= NVME_VS(1, 2, 0) &&
1423
+ !memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
1424
+ memcpy(ids->nguid, (*id)->nguid, sizeof(ids->nguid));
1425
+ }
1426
+
1427
+ return 0;
1428
+
1429
+out_free_id:
1430
+ kfree(*id);
1431
+ return error;
10661432 }
10671433
1068
-static int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11,
1069
- void *buffer, size_t buflen, u32 *result)
1434
+static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid,
1435
+ unsigned int dword11, void *buffer, size_t buflen, u32 *result)
10701436 {
10711437 union nvme_result res = { 0 };
10721438 struct nvme_command c;
10731439 int ret;
10741440
10751441 memset(&c, 0, sizeof(c));
1076
- c.features.opcode = nvme_admin_set_features;
1442
+ c.features.opcode = op;
10771443 c.features.fid = cpu_to_le32(fid);
10781444 c.features.dword11 = cpu_to_le32(dword11);
10791445
10801446 ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res,
1081
- buffer, buflen, 0, NVME_QID_ANY, 0, 0);
1447
+ buffer, buflen, 0, NVME_QID_ANY, 0, 0, false);
10821448 if (ret >= 0 && result)
10831449 *result = le32_to_cpu(res.u32);
10841450 return ret;
10851451 }
1452
+
1453
+int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid,
1454
+ unsigned int dword11, void *buffer, size_t buflen,
1455
+ u32 *result)
1456
+{
1457
+ return nvme_features(dev, nvme_admin_set_features, fid, dword11, buffer,
1458
+ buflen, result);
1459
+}
1460
+EXPORT_SYMBOL_GPL(nvme_set_features);
1461
+
1462
+int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid,
1463
+ unsigned int dword11, void *buffer, size_t buflen,
1464
+ u32 *result)
1465
+{
1466
+ return nvme_features(dev, nvme_admin_get_features, fid, dword11, buffer,
1467
+ buflen, result);
1468
+}
1469
+EXPORT_SYMBOL_GPL(nvme_get_features);
10861470
10871471 int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
10881472 {
@@ -1113,7 +1497,8 @@
11131497 EXPORT_SYMBOL_GPL(nvme_set_queue_count);
11141498
11151499 #define NVME_AEN_SUPPORTED \
1116
- (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | NVME_AEN_CFG_ANA_CHANGE)
1500
+ (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | \
1501
+ NVME_AEN_CFG_ANA_CHANGE | NVME_AEN_CFG_DISC_CHANGE)
11171502
11181503 static void nvme_enable_aen(struct nvme_ctrl *ctrl)
11191504 {
@@ -1128,6 +1513,20 @@
11281513 if (status)
11291514 dev_warn(ctrl->device, "Failed to configure AEN (cfg %x)\n",
11301515 supported_aens);
1516
+
1517
+ queue_work(nvme_wq, &ctrl->async_event_work);
1518
+}
1519
+
1520
+/*
1521
+ * Convert integer values from ioctl structures to user pointers, silently
1522
+ * ignoring the upper bits in the compat case to match behaviour of 32-bit
1523
+ * kernels.
1524
+ */
1525
+static void __user *nvme_to_user_ptr(uintptr_t ptrval)
1526
+{
1527
+ if (in_compat_syscall())
1528
+ ptrval = (compat_uptr_t)ptrval;
1529
+ return (void __user *)ptrval;
11311530 }
11321531
11331532 static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
@@ -1152,10 +1551,23 @@
11521551 }
11531552
11541553 length = (io.nblocks + 1) << ns->lba_shift;
1155
- meta_len = (io.nblocks + 1) * ns->ms;
1156
- metadata = (void __user *)(uintptr_t)io.metadata;
11571554
1158
- if (ns->ext) {
1555
+ if ((io.control & NVME_RW_PRINFO_PRACT) &&
1556
+ ns->ms == sizeof(struct t10_pi_tuple)) {
1557
+ /*
1558
+ * Protection information is stripped/inserted by the
1559
+ * controller.
1560
+ */
1561
+ if (nvme_to_user_ptr(io.metadata))
1562
+ return -EINVAL;
1563
+ meta_len = 0;
1564
+ metadata = NULL;
1565
+ } else {
1566
+ meta_len = (io.nblocks + 1) * ns->ms;
1567
+ metadata = nvme_to_user_ptr(io.metadata);
1568
+ }
1569
+
1570
+ if (ns->features & NVME_NS_EXT_LBAS) {
11591571 length += meta_len;
11601572 meta_len = 0;
11611573 } else if (meta_len) {
@@ -1176,91 +1588,8 @@
11761588 c.rw.appmask = cpu_to_le16(io.appmask);
11771589
11781590 return nvme_submit_user_cmd(ns->queue, &c,
1179
- (void __user *)(uintptr_t)io.addr, length,
1180
- metadata, meta_len, io.slba, NULL, 0);
1181
-}
1182
-
1183
-static u32 nvme_known_admin_effects(u8 opcode)
1184
-{
1185
- switch (opcode) {
1186
- case nvme_admin_format_nvm:
1187
- return NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC |
1188
- NVME_CMD_EFFECTS_CSE_MASK;
1189
- case nvme_admin_sanitize_nvm:
1190
- return NVME_CMD_EFFECTS_CSE_MASK;
1191
- default:
1192
- break;
1193
- }
1194
- return 0;
1195
-}
1196
-
1197
-static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
1198
- u8 opcode)
1199
-{
1200
- u32 effects = 0;
1201
-
1202
- if (ns) {
1203
- if (ctrl->effects)
1204
- effects = le32_to_cpu(ctrl->effects->iocs[opcode]);
1205
- if (effects & ~NVME_CMD_EFFECTS_CSUPP)
1206
- dev_warn(ctrl->device,
1207
- "IO command:%02x has unhandled effects:%08x\n",
1208
- opcode, effects);
1209
- return 0;
1210
- }
1211
-
1212
- if (ctrl->effects)
1213
- effects = le32_to_cpu(ctrl->effects->acs[opcode]);
1214
- else
1215
- effects = nvme_known_admin_effects(opcode);
1216
-
1217
- /*
1218
- * For simplicity, IO to all namespaces is quiesced even if the command
1219
- * effects say only one namespace is affected.
1220
- */
1221
- if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
1222
- mutex_lock(&ctrl->scan_lock);
1223
- mutex_lock(&ctrl->subsys->lock);
1224
- nvme_mpath_start_freeze(ctrl->subsys);
1225
- nvme_mpath_wait_freeze(ctrl->subsys);
1226
- nvme_start_freeze(ctrl);
1227
- nvme_wait_freeze(ctrl);
1228
- }
1229
- return effects;
1230
-}
1231
-
1232
-static void nvme_update_formats(struct nvme_ctrl *ctrl)
1233
-{
1234
- struct nvme_ns *ns;
1235
-
1236
- down_read(&ctrl->namespaces_rwsem);
1237
- list_for_each_entry(ns, &ctrl->namespaces, list)
1238
- if (ns->disk && nvme_revalidate_disk(ns->disk))
1239
- nvme_set_queue_dying(ns);
1240
- up_read(&ctrl->namespaces_rwsem);
1241
-
1242
- nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL);
1243
-}
1244
-
1245
-static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
1246
-{
1247
- /*
1248
- * Revalidate LBA changes prior to unfreezing. This is necessary to
1249
- * prevent memory corruption if a logical block size was changed by
1250
- * this command.
1251
- */
1252
- if (effects & NVME_CMD_EFFECTS_LBCC)
1253
- nvme_update_formats(ctrl);
1254
- if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
1255
- nvme_unfreeze(ctrl);
1256
- nvme_mpath_unfreeze(ctrl->subsys);
1257
- mutex_unlock(&ctrl->subsys->lock);
1258
- mutex_unlock(&ctrl->scan_lock);
1259
- }
1260
- if (effects & NVME_CMD_EFFECTS_CCC)
1261
- nvme_init_identify(ctrl);
1262
- if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC))
1263
- nvme_queue_scan(ctrl);
1591
+ nvme_to_user_ptr(io.addr), length,
1592
+ metadata, meta_len, lower_32_bits(io.slba), NULL, 0);
12641593 }
12651594
12661595 static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
@@ -1269,7 +1598,7 @@
12691598 struct nvme_passthru_cmd cmd;
12701599 struct nvme_command c;
12711600 unsigned timeout = 0;
1272
- u32 effects;
1601
+ u64 result;
12731602 int status;
12741603
12751604 if (!capable(CAP_SYS_ADMIN))
@@ -1285,22 +1614,64 @@
12851614 c.common.nsid = cpu_to_le32(cmd.nsid);
12861615 c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
12871616 c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
1288
- c.common.cdw10[0] = cpu_to_le32(cmd.cdw10);
1289
- c.common.cdw10[1] = cpu_to_le32(cmd.cdw11);
1290
- c.common.cdw10[2] = cpu_to_le32(cmd.cdw12);
1291
- c.common.cdw10[3] = cpu_to_le32(cmd.cdw13);
1292
- c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
1293
- c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);
1617
+ c.common.cdw10 = cpu_to_le32(cmd.cdw10);
1618
+ c.common.cdw11 = cpu_to_le32(cmd.cdw11);
1619
+ c.common.cdw12 = cpu_to_le32(cmd.cdw12);
1620
+ c.common.cdw13 = cpu_to_le32(cmd.cdw13);
1621
+ c.common.cdw14 = cpu_to_le32(cmd.cdw14);
1622
+ c.common.cdw15 = cpu_to_le32(cmd.cdw15);
12941623
12951624 if (cmd.timeout_ms)
12961625 timeout = msecs_to_jiffies(cmd.timeout_ms);
12971626
1298
- effects = nvme_passthru_start(ctrl, ns, cmd.opcode);
12991627 status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
1300
- (void __user *)(uintptr_t)cmd.addr, cmd.data_len,
1301
- (void __user *)(uintptr_t)cmd.metadata, cmd.metadata_len,
1628
+ nvme_to_user_ptr(cmd.addr), cmd.data_len,
1629
+ nvme_to_user_ptr(cmd.metadata), cmd.metadata_len,
1630
+ 0, &result, timeout);
1631
+
1632
+ if (status >= 0) {
1633
+ if (put_user(result, &ucmd->result))
1634
+ return -EFAULT;
1635
+ }
1636
+
1637
+ return status;
1638
+}
1639
+
1640
+static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
1641
+ struct nvme_passthru_cmd64 __user *ucmd)
1642
+{
1643
+ struct nvme_passthru_cmd64 cmd;
1644
+ struct nvme_command c;
1645
+ unsigned timeout = 0;
1646
+ int status;
1647
+
1648
+ if (!capable(CAP_SYS_ADMIN))
1649
+ return -EACCES;
1650
+ if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
1651
+ return -EFAULT;
1652
+ if (cmd.flags)
1653
+ return -EINVAL;
1654
+
1655
+ memset(&c, 0, sizeof(c));
1656
+ c.common.opcode = cmd.opcode;
1657
+ c.common.flags = cmd.flags;
1658
+ c.common.nsid = cpu_to_le32(cmd.nsid);
1659
+ c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
1660
+ c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
1661
+ c.common.cdw10 = cpu_to_le32(cmd.cdw10);
1662
+ c.common.cdw11 = cpu_to_le32(cmd.cdw11);
1663
+ c.common.cdw12 = cpu_to_le32(cmd.cdw12);
1664
+ c.common.cdw13 = cpu_to_le32(cmd.cdw13);
1665
+ c.common.cdw14 = cpu_to_le32(cmd.cdw14);
1666
+ c.common.cdw15 = cpu_to_le32(cmd.cdw15);
1667
+
1668
+ if (cmd.timeout_ms)
1669
+ timeout = msecs_to_jiffies(cmd.timeout_ms);
1670
+
1671
+ status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
1672
+ nvme_to_user_ptr(cmd.addr), cmd.data_len,
1673
+ nvme_to_user_ptr(cmd.metadata), cmd.metadata_len,
13021674 0, &cmd.result, timeout);
1303
- nvme_passthru_end(ctrl, effects);
13041675
13051676 if (status >= 0) {
13061677 if (put_user(cmd.result, &ucmd->result))
@@ -1314,7 +1685,7 @@
13141685 * Issue ioctl requests on the first available path. Note that unlike normal
13151686 * block layer requests we will not retry failed request on another controller.
13161687 */
1317
-static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
1688
+struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
13181689 struct nvme_ns_head **head, int *srcu_idx)
13191690 {
13201691 #ifdef CONFIG_NVME_MULTIPATH
@@ -1334,10 +1705,45 @@
13341705 return disk->private_data;
13351706 }
13361707
1337
-static void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
1708
+void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
13381709 {
13391710 if (head)
13401711 srcu_read_unlock(&head->srcu, idx);
1712
+}
1713
+
1714
+static bool is_ctrl_ioctl(unsigned int cmd)
1715
+{
1716
+ if (cmd == NVME_IOCTL_ADMIN_CMD || cmd == NVME_IOCTL_ADMIN64_CMD)
1717
+ return true;
1718
+ if (is_sed_ioctl(cmd))
1719
+ return true;
1720
+ return false;
1721
+}
1722
+
1723
+static int nvme_handle_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd,
1724
+ void __user *argp,
1725
+ struct nvme_ns_head *head,
1726
+ int srcu_idx)
1727
+{
1728
+ struct nvme_ctrl *ctrl = ns->ctrl;
1729
+ int ret;
1730
+
1731
+ nvme_get_ctrl(ns->ctrl);
1732
+ nvme_put_ns_from_disk(head, srcu_idx);
1733
+
1734
+ switch (cmd) {
1735
+ case NVME_IOCTL_ADMIN_CMD:
1736
+ ret = nvme_user_cmd(ctrl, NULL, argp);
1737
+ break;
1738
+ case NVME_IOCTL_ADMIN64_CMD:
1739
+ ret = nvme_user_cmd64(ctrl, NULL, argp);
1740
+ break;
1741
+ default:
1742
+ ret = sed_ioctl(ctrl->opal_dev, cmd, argp);
1743
+ break;
1744
+ }
1745
+ nvme_put_ctrl(ctrl);
1746
+ return ret;
13411747 }
13421748
13431749 static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
....@@ -1357,20 +1763,8 @@
13571763 * seperately and drop the ns SRCU reference early. This avoids a
13581764 * deadlock when deleting namespaces using the passthrough interface.
13591765 */
1360
- if (cmd == NVME_IOCTL_ADMIN_CMD || is_sed_ioctl(cmd)) {
1361
- struct nvme_ctrl *ctrl = ns->ctrl;
1362
-
1363
- nvme_get_ctrl(ns->ctrl);
1364
- nvme_put_ns_from_disk(head, srcu_idx);
1365
-
1366
- if (cmd == NVME_IOCTL_ADMIN_CMD)
1367
- ret = nvme_user_cmd(ctrl, NULL, argp);
1368
- else
1369
- ret = sed_ioctl(ctrl->opal_dev, cmd, argp);
1370
-
1371
- nvme_put_ctrl(ctrl);
1372
- return ret;
1373
- }
1766
+ if (is_ctrl_ioctl(cmd))
1767
+ return nvme_handle_ctrl_ioctl(ns, cmd, argp, head, srcu_idx);
13741768
13751769 switch (cmd) {
13761770 case NVME_IOCTL_ID:
@@ -1383,6 +1777,9 @@
13831777 case NVME_IOCTL_SUBMIT_IO:
13841778 ret = nvme_submit_io(ns, argp);
13851779 break;
1780
+ case NVME_IOCTL_IO64_CMD:
1781
+ ret = nvme_user_cmd64(ns->ctrl, ns, argp);
1782
+ break;
13861783 default:
13871784 if (ns->ndev)
13881785 ret = nvme_nvm_ioctl(ns, cmd, arg);
@@ -1393,6 +1790,47 @@
13931790 nvme_put_ns_from_disk(head, srcu_idx);
13941791 return ret;
13951792 }
1793
+
1794
+#ifdef CONFIG_COMPAT
1795
+struct nvme_user_io32 {
1796
+ __u8 opcode;
1797
+ __u8 flags;
1798
+ __u16 control;
1799
+ __u16 nblocks;
1800
+ __u16 rsvd;
1801
+ __u64 metadata;
1802
+ __u64 addr;
1803
+ __u64 slba;
1804
+ __u32 dsmgmt;
1805
+ __u32 reftag;
1806
+ __u16 apptag;
1807
+ __u16 appmask;
1808
+} __attribute__((__packed__));
1809
+
1810
+#define NVME_IOCTL_SUBMIT_IO32 _IOW('N', 0x42, struct nvme_user_io32)
1811
+
1812
+static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode,
1813
+ unsigned int cmd, unsigned long arg)
1814
+{
1815
+ /*
1816
+ * Corresponds to the difference of NVME_IOCTL_SUBMIT_IO
1817
+ * between 32 bit programs and 64 bit kernel.
1818
+ * The cause is that the results of sizeof(struct nvme_user_io),
1819
+ * which is used to define NVME_IOCTL_SUBMIT_IO,
1820
+ * are not same between 32 bit compiler and 64 bit compiler.
1821
+ * NVME_IOCTL_SUBMIT_IO32 is for 64 bit kernel handling
1822
+ * NVME_IOCTL_SUBMIT_IO issued from 32 bit programs.
1823
+ * Other IOCTL numbers are same between 32 bit and 64 bit.
1824
+ * So there is nothing to do regarding to other IOCTL numbers.
1825
+ */
1826
+ if (cmd == NVME_IOCTL_SUBMIT_IO32)
1827
+ return nvme_ioctl(bdev, mode, NVME_IOCTL_SUBMIT_IO, arg);
1828
+
1829
+ return nvme_ioctl(bdev, mode, cmd, arg);
1830
+}
1831
+#else
1832
+#define nvme_compat_ioctl NULL
1833
+#endif /* CONFIG_COMPAT */
13961834
13971835 static int nvme_open(struct block_device *bdev, fmode_t mode)
13981836 {
@@ -1434,7 +1872,8 @@
14341872 }
14351873
14361874 #ifdef CONFIG_BLK_DEV_INTEGRITY
1437
-static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type)
1875
+static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type,
1876
+ u32 max_integrity_segments)
14381877 {
14391878 struct blk_integrity integrity;
14401879
@@ -1457,24 +1896,19 @@
14571896 }
14581897 integrity.tuple_size = ms;
14591898 blk_integrity_register(disk, &integrity);
1460
- blk_queue_max_integrity_segments(disk->queue, 1);
1899
+ blk_queue_max_integrity_segments(disk->queue, max_integrity_segments);
14611900 }
14621901 #else
1463
-static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type)
1902
+static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type,
1903
+ u32 max_integrity_segments)
14641904 {
14651905 }
14661906 #endif /* CONFIG_BLK_DEV_INTEGRITY */
14671907
1468
-static void nvme_set_chunk_size(struct nvme_ns *ns)
1469
-{
1470
- u32 chunk_size = (((u32)ns->noiob) << (ns->lba_shift - 9));
1471
- blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(chunk_size));
1472
-}
1473
-
1474
-static void nvme_config_discard(struct nvme_ns *ns)
1908
+static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns)
14751909 {
14761910 struct nvme_ctrl *ctrl = ns->ctrl;
1477
- struct request_queue *queue = ns->queue;
1911
+ struct request_queue *queue = disk->queue;
14781912 u32 size = queue_logical_block_size(queue);
14791913
14801914 if (!(ctrl->oncs & NVME_CTRL_ONCS_DSM)) {
....@@ -1502,23 +1936,18 @@
15021936 blk_queue_max_write_zeroes_sectors(queue, UINT_MAX);
15031937 }
15041938
1505
-static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid,
1506
- struct nvme_id_ns *id, struct nvme_ns_ids *ids)
1939
+/*
1940
+ * Even though NVMe spec explicitly states that MDTS is not applicable to the
1941
+ * write-zeroes, we are cautious and limit the size to the controllers
1942
+ * max_hw_sectors value, which is based on the MDTS field and possibly other
1943
+ * limiting factors.
1944
+ */
1945
+static void nvme_config_write_zeroes(struct request_queue *q,
1946
+ struct nvme_ctrl *ctrl)
15071947 {
1508
- memset(ids, 0, sizeof(*ids));
1509
-
1510
- if (ctrl->vs >= NVME_VS(1, 1, 0))
1511
- memcpy(ids->eui64, id->eui64, sizeof(id->eui64));
1512
- if (ctrl->vs >= NVME_VS(1, 2, 0))
1513
- memcpy(ids->nguid, id->nguid, sizeof(id->nguid));
1514
- if (ctrl->vs >= NVME_VS(1, 3, 0)) {
1515
- /* Don't treat error as fatal we potentially
1516
- * already have a NGUID or EUI-64
1517
- */
1518
- if (nvme_identify_ns_descs(ctrl, nsid, ids))
1519
- dev_warn(ctrl->device,
1520
- "%s: Identify Descriptors failed\n", __func__);
1521
- }
1948
+ if ((ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES) &&
1949
+ !(ctrl->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES))
1950
+ blk_queue_max_write_zeroes_sectors(q, ctrl->max_hw_sectors);
15221951 }
15231952
15241953 static bool nvme_ns_ids_valid(struct nvme_ns_ids *ids)
....@@ -1532,110 +1961,250 @@
15321961 {
15331962 return uuid_equal(&a->uuid, &b->uuid) &&
15341963 memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 &&
1535
- memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0;
1964
+ memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0 &&
1965
+ a->csi == b->csi;
15361966 }
15371967
1538
-static void nvme_update_disk_info(struct gendisk *disk,
1539
- struct nvme_ns *ns, struct nvme_id_ns *id)
1968
+static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
1969
+ u32 *phys_bs, u32 *io_opt)
15401970 {
1541
- sector_t capacity = le64_to_cpup(&id->nsze) << (ns->lba_shift - 9);
1542
- unsigned short bs = 1 << ns->lba_shift;
1971
+ struct streams_directive_params s;
1972
+ int ret;
15431973
1544
- if (ns->lba_shift > PAGE_SHIFT) {
1545
- /* unsupported block size, set capacity to 0 later */
1546
- bs = (1 << 9);
1974
+ if (!ctrl->nr_streams)
1975
+ return 0;
1976
+
1977
+ ret = nvme_get_stream_params(ctrl, &s, ns->head->ns_id);
1978
+ if (ret)
1979
+ return ret;
1980
+
1981
+ ns->sws = le32_to_cpu(s.sws);
1982
+ ns->sgs = le16_to_cpu(s.sgs);
1983
+
1984
+ if (ns->sws) {
1985
+ *phys_bs = ns->sws * (1 << ns->lba_shift);
1986
+ if (ns->sgs)
1987
+ *io_opt = *phys_bs * ns->sgs;
15471988 }
1548
- blk_mq_freeze_queue(disk->queue);
1549
- blk_integrity_unregister(disk);
15501989
1551
- blk_queue_logical_block_size(disk->queue, bs);
1552
- blk_queue_physical_block_size(disk->queue, bs);
1553
- blk_queue_io_min(disk->queue, bs);
1554
-
1555
- if (ns->ms && !ns->ext &&
1556
- (ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
1557
- nvme_init_integrity(disk, ns->ms, ns->pi_type);
1558
- if ((ns->ms && !nvme_ns_has_pi(ns) && !blk_get_integrity(disk)) ||
1559
- ns->lba_shift > PAGE_SHIFT)
1560
- capacity = 0;
1561
-
1562
- set_capacity(disk, capacity);
1563
- nvme_config_discard(ns);
1564
-
1565
- if (id->nsattr & (1 << 0))
1566
- set_disk_ro(disk, true);
1567
- else
1568
- set_disk_ro(disk, false);
1569
-
1570
- blk_mq_unfreeze_queue(disk->queue);
1990
+ return 0;
15711991 }
15721992
1573
-static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
1993
+static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
15741994 {
1575
- struct nvme_ns *ns = disk->private_data;
1995
+ struct nvme_ctrl *ctrl = ns->ctrl;
15761996
15771997 /*
1578
- * If identify namespace failed, use default 512 byte block size so
1579
- * block layer can use before failing read/write for 0 capacity.
1998
+ * The PI implementation requires the metadata size to be equal to the
1999
+ * t10 pi tuple size.
15802000 */
1581
- ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds;
1582
- if (ns->lba_shift == 0)
1583
- ns->lba_shift = 9;
1584
- ns->noiob = le16_to_cpu(id->noiob);
15852001 ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms);
1586
- ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT);
1587
- /* the PI implementation requires metadata equal t10 pi tuple size */
15882002 if (ns->ms == sizeof(struct t10_pi_tuple))
15892003 ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
15902004 else
15912005 ns->pi_type = 0;
15922006
1593
- if (ns->noiob)
1594
- nvme_set_chunk_size(ns);
1595
- nvme_update_disk_info(disk, ns, id);
1596
- if (ns->ndev)
1597
- nvme_nvm_update_nvm_info(ns);
1598
-#ifdef CONFIG_NVME_MULTIPATH
1599
- if (ns->head->disk) {
1600
- nvme_update_disk_info(ns->head->disk, ns, id);
1601
- blk_queue_stack_limits(ns->head->disk->queue, ns->queue);
1602
- nvme_mpath_update_disk_size(ns->head->disk);
2007
+ ns->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
2008
+ if (!ns->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
2009
+ return 0;
2010
+ if (ctrl->ops->flags & NVME_F_FABRICS) {
2011
+ /*
2012
+ * The NVMe over Fabrics specification only supports metadata as
2013
+ * part of the extended data LBA. We rely on HCA/HBA support to
2014
+ * remap the separate metadata buffer from the block layer.
2015
+ */
2016
+ if (WARN_ON_ONCE(!(id->flbas & NVME_NS_FLBAS_META_EXT)))
2017
+ return -EINVAL;
2018
+ if (ctrl->max_integrity_segments)
2019
+ ns->features |=
2020
+ (NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
2021
+ } else {
2022
+ /*
2023
+ * For PCIe controllers, we can't easily remap the separate
2024
+ * metadata buffer from the block layer and thus require a
2025
+ * separate metadata buffer for block layer metadata/PI support.
2026
+ * We allow extended LBAs for the passthrough interface, though.
2027
+ */
2028
+ if (id->flbas & NVME_NS_FLBAS_META_EXT)
2029
+ ns->features |= NVME_NS_EXT_LBAS;
2030
+ else
2031
+ ns->features |= NVME_NS_METADATA_SUPPORTED;
16032032 }
1604
-#endif
2033
+
2034
+ return 0;
16052035 }
16062036
1607
-static int nvme_revalidate_disk(struct gendisk *disk)
2037
+static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
2038
+ struct request_queue *q)
16082039 {
1609
- struct nvme_ns *ns = disk->private_data;
2040
+ bool vwc = ctrl->vwc & NVME_CTRL_VWC_PRESENT;
2041
+
2042
+ if (ctrl->max_hw_sectors) {
2043
+ u32 max_segments =
2044
+ (ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> 9)) + 1;
2045
+
2046
+ max_segments = min_not_zero(max_segments, ctrl->max_segments);
2047
+ blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
2048
+ blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
2049
+ }
2050
+ blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1);
2051
+ blk_queue_dma_alignment(q, 3);
2052
+ blk_queue_write_cache(q, vwc, vwc);
2053
+}
2054
+
2055
+static void nvme_update_disk_info(struct gendisk *disk,
2056
+ struct nvme_ns *ns, struct nvme_id_ns *id)
2057
+{
2058
+ sector_t capacity = nvme_lba_to_sect(ns, le64_to_cpu(id->nsze));
2059
+ unsigned short bs = 1 << ns->lba_shift;
2060
+ u32 atomic_bs, phys_bs, io_opt = 0;
2061
+
2062
+ /*
2063
+ * The block layer can't support LBA sizes larger than the page size
2064
+ * yet, so catch this early and don't allow block I/O.
2065
+ */
2066
+ if (ns->lba_shift > PAGE_SHIFT) {
2067
+ capacity = 0;
2068
+ bs = (1 << 9);
2069
+ }
2070
+
2071
+ blk_integrity_unregister(disk);
2072
+
2073
+ atomic_bs = phys_bs = bs;
2074
+ nvme_setup_streams_ns(ns->ctrl, ns, &phys_bs, &io_opt);
2075
+ if (id->nabo == 0) {
2076
+ /*
2077
+ * Bit 1 indicates whether NAWUPF is defined for this namespace
2078
+ * and whether it should be used instead of AWUPF. If NAWUPF ==
2079
+ * 0 then AWUPF must be used instead.
2080
+ */
2081
+ if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf)
2082
+ atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs;
2083
+ else
2084
+ atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs;
2085
+ }
2086
+
2087
+ if (id->nsfeat & NVME_NS_FEAT_IO_OPT) {
2088
+ /* NPWG = Namespace Preferred Write Granularity */
2089
+ phys_bs = bs * (1 + le16_to_cpu(id->npwg));
2090
+ /* NOWS = Namespace Optimal Write Size */
2091
+ io_opt = bs * (1 + le16_to_cpu(id->nows));
2092
+ }
2093
+
2094
+ blk_queue_logical_block_size(disk->queue, bs);
2095
+ /*
2096
+ * Linux filesystems assume writing a single physical block is
2097
+ * an atomic operation. Hence limit the physical block size to the
2098
+ * value of the Atomic Write Unit Power Fail parameter.
2099
+ */
2100
+ blk_queue_physical_block_size(disk->queue, min(phys_bs, atomic_bs));
2101
+ blk_queue_io_min(disk->queue, phys_bs);
2102
+ blk_queue_io_opt(disk->queue, io_opt);
2103
+
2104
+ /*
2105
+ * Register a metadata profile for PI, or the plain non-integrity NVMe
2106
+ * metadata masquerading as Type 0 if supported, otherwise reject block
2107
+ * I/O to namespaces with metadata except when the namespace supports
2108
+ * PI, as it can strip/insert in that case.
2109
+ */
2110
+ if (ns->ms) {
2111
+ if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) &&
2112
+ (ns->features & NVME_NS_METADATA_SUPPORTED))
2113
+ nvme_init_integrity(disk, ns->ms, ns->pi_type,
2114
+ ns->ctrl->max_integrity_segments);
2115
+ else if (!nvme_ns_has_pi(ns))
2116
+ capacity = 0;
2117
+ }
2118
+
2119
+ set_capacity_revalidate_and_notify(disk, capacity, false);
2120
+
2121
+ nvme_config_discard(disk, ns);
2122
+ nvme_config_write_zeroes(disk->queue, ns->ctrl);
2123
+
2124
+ if (id->nsattr & NVME_NS_ATTR_RO)
2125
+ set_disk_ro(disk, true);
2126
+}
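As a rough worked example of the size math in nvme_update_disk_info() above, the standalone sketch below runs the same calculations on made-up Identify Namespace values (a 4 KiB LBA format, NAWUPF = 7, NPWG = 15, NOWS = 63); none of the numbers come from real hardware.

#include <stdio.h>

int main(void)
{
	unsigned int lba_shift = 12;		/* LBA data size 2^12 = 4096 bytes */
	unsigned long long nsze = 1ULL << 20;	/* namespace size in LBAs */
	unsigned int nawupf = 7;		/* 0's based: 8 LBAs per atomic write */
	unsigned int npwg = 15;			/* 0's based preferred write granularity */
	unsigned int nows = 63;			/* 0's based optimal write size */

	unsigned int bs = 1u << lba_shift;
	/* capacity in 512-byte sectors, the same shift nvme_lba_to_sect() does */
	unsigned long long capacity = nsze << (lba_shift - 9);
	unsigned int atomic_bs = (1 + nawupf) * bs;	/* 32768 */
	unsigned int phys_bs = (1 + npwg) * bs;		/* 65536 */
	unsigned int io_opt = (1 + nows) * bs;		/* 262144 */
	/* physical block size is capped at the atomic write unit */
	unsigned int phys_reported = phys_bs < atomic_bs ? phys_bs : atomic_bs;

	printf("bs=%u capacity=%llu sectors\n", bs, capacity);
	printf("atomic_bs=%u physical_block_size=%u io_min=%u io_opt=%u\n",
	       atomic_bs, phys_reported, phys_bs, io_opt);
	return 0;
}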
2127
+
2128
+static inline bool nvme_first_scan(struct gendisk *disk)
2129
+{
2130
+ /* nvme_alloc_ns() scans the disk prior to adding it */
2131
+ return !(disk->flags & GENHD_FL_UP);
2132
+}
2133
+
2134
+static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id)
2135
+{
16102136 struct nvme_ctrl *ctrl = ns->ctrl;
1611
- struct nvme_id_ns *id;
1612
- struct nvme_ns_ids ids;
1613
- int ret = 0;
2137
+ u32 iob;
16142138
1615
- if (test_bit(NVME_NS_DEAD, &ns->flags)) {
1616
- set_capacity(disk, 0);
1617
- return -ENODEV;
2139
+ if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
2140
+ is_power_of_2(ctrl->max_hw_sectors))
2141
+ iob = ctrl->max_hw_sectors;
2142
+ else
2143
+ iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob));
2144
+
2145
+ if (!iob)
2146
+ return;
2147
+
2148
+ if (!is_power_of_2(iob)) {
2149
+ if (nvme_first_scan(ns->disk))
2150
+ pr_warn("%s: ignoring unaligned IO boundary:%u\n",
2151
+ ns->disk->disk_name, iob);
2152
+ return;
16182153 }
16192154
1620
- id = nvme_identify_ns(ctrl, ns->head->ns_id);
1621
- if (!id)
1622
- return -ENODEV;
1623
-
1624
- if (id->ncap == 0) {
1625
- ret = -ENODEV;
1626
- goto out;
2155
+ if (blk_queue_is_zoned(ns->disk->queue)) {
2156
+ if (nvme_first_scan(ns->disk))
2157
+ pr_warn("%s: ignoring zoned namespace IO boundary\n",
2158
+ ns->disk->disk_name);
2159
+ return;
16272160 }
16282161
1629
- __nvme_revalidate_disk(disk, id);
1630
- nvme_report_ns_ids(ctrl, ns->head->ns_id, id, &ids);
1631
- if (!nvme_ns_ids_equal(&ns->head->ids, &ids)) {
1632
- dev_err(ctrl->device,
1633
- "identifiers changed for nsid %d\n", ns->head->ns_id);
1634
- ret = -ENODEV;
2162
+ blk_queue_chunk_sectors(ns->queue, iob);
2163
+}
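A small standalone sketch of the NOIOB handling above, using assumed values (4 KiB LBAs, NOIOB = 256): the boundary is converted to 512-byte sectors exactly as nvme_lba_to_sect() does, and it is only applied when it is a power of two.

#include <stdio.h>

static int is_pow2(unsigned int v)
{
	return v && !(v & (v - 1));
}

int main(void)
{
	unsigned int lba_shift = 12;	/* 4 KiB LBAs */
	unsigned int noiob = 256;	/* optimal I/O boundary, in LBAs */

	/* LBAs -> 512-byte sectors */
	unsigned int iob = noiob << (lba_shift - 9);

	if (!iob)
		printf("no boundary reported, chunk_sectors left alone\n");
	else if (!is_pow2(iob))
		printf("boundary %u is not a power of two, ignored\n", iob);
	else
		printf("chunk_sectors=%u (%u KiB)\n", iob, iob / 2);
	return 0;
}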
2164
+
2165
+static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id)
2166
+{
2167
+ unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
2168
+ int ret;
2169
+
2170
+ blk_mq_freeze_queue(ns->disk->queue);
2171
+ ns->lba_shift = id->lbaf[lbaf].ds;
2172
+ nvme_set_queue_limits(ns->ctrl, ns->queue);
2173
+
2174
+ if (ns->head->ids.csi == NVME_CSI_ZNS) {
2175
+ ret = nvme_update_zone_info(ns, lbaf);
2176
+ if (ret)
2177
+ goto out_unfreeze;
16352178 }
16362179
1637
-out:
1638
- kfree(id);
2180
+ ret = nvme_configure_metadata(ns, id);
2181
+ if (ret)
2182
+ goto out_unfreeze;
2183
+ nvme_set_chunk_sectors(ns, id);
2184
+ nvme_update_disk_info(ns->disk, ns, id);
2185
+ blk_mq_unfreeze_queue(ns->disk->queue);
2186
+
2187
+ if (blk_queue_is_zoned(ns->queue)) {
2188
+ ret = nvme_revalidate_zones(ns);
2189
+ if (ret && !nvme_first_scan(ns->disk))
2190
+ return ret;
2191
+ }
2192
+
2193
+#ifdef CONFIG_NVME_MULTIPATH
2194
+ if (ns->head->disk) {
2195
+ blk_mq_freeze_queue(ns->head->disk->queue);
2196
+ nvme_update_disk_info(ns->head->disk, ns, id);
2197
+ blk_stack_limits(&ns->head->disk->queue->limits,
2198
+ &ns->queue->limits, 0);
2199
+ blk_queue_update_readahead(ns->head->disk->queue);
2200
+ nvme_update_bdev_size(ns->head->disk);
2201
+ blk_mq_unfreeze_queue(ns->head->disk->queue);
2202
+ }
2203
+#endif
2204
+ return 0;
2205
+
2206
+out_unfreeze:
2207
+ blk_mq_unfreeze_queue(ns->disk->queue);
16392208 return ret;
16402209 }
16412210
....@@ -1678,7 +2247,7 @@
16782247 memset(&c, 0, sizeof(c));
16792248 c.common.opcode = op;
16802249 c.common.nsid = cpu_to_le32(ns->head->ns_id);
1681
- c.common.cdw10[0] = cpu_to_le32(cdw10);
2250
+ c.common.cdw10 = cpu_to_le32(cdw10);
16822251
16832252 ret = nvme_submit_sync_cmd(ns->queue, &c, data, 16);
16842253 nvme_put_ns_from_disk(head, srcu_idx);
....@@ -1716,18 +2285,21 @@
17162285 enum pr_type type, bool abort)
17172286 {
17182287 u32 cdw10 = nvme_pr_type(type) << 8 | (abort ? 2 : 1);
2288
+
17192289 return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire);
17202290 }
17212291
17222292 static int nvme_pr_clear(struct block_device *bdev, u64 key)
17232293 {
1724
- u32 cdw10 = 1 | (key ? 1 << 3 : 0);
1725
- return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register);
2294
+ u32 cdw10 = 1 | (key ? 0 : 1 << 3);
2295
+
2296
+ return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
17262297 }
17272298
17282299 static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
17292300 {
1730
- u32 cdw10 = nvme_pr_type(type) << 8 | (key ? 1 << 3 : 0);
2301
+ u32 cdw10 = nvme_pr_type(type) << 8 | (key ? 0 : 1 << 3);
2302
+
17312303 return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
17322304 }
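The reservation helpers above all pack their parameters into CDW10. The sketch below replays that packing for Reservation Release/Clear with made-up inputs; the field positions (RRELA in bits 2:0, IEKEY in bit 3, RTYPE in bits 15:8) follow the NVMe specification rather than anything defined in this file.

#include <stdio.h>

static unsigned int pr_release_cdw10(unsigned int rtype, int have_key, int clear)
{
	unsigned int cdw10 = clear ? 1 : 0;	/* RRELA: 0 = release, 1 = clear */

	if (!have_key)
		cdw10 |= 1 << 3;		/* IEKEY: ignore existing key */
	cdw10 |= rtype << 8;			/* RTYPE: reservation type */
	return cdw10;
}

int main(void)
{
	/* e.g. reservation type 1 is "write exclusive" in the spec */
	printf("release cdw10 = %#x\n", pr_release_cdw10(1, 1, 0));	/* 0x100 */
	printf("clear   cdw10 = %#x\n", pr_release_cdw10(0, 0, 1));	/* 0x9 */
	return 0;
}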
17332305
....@@ -1752,11 +2324,11 @@
17522324 else
17532325 cmd.common.opcode = nvme_admin_security_recv;
17542326 cmd.common.nsid = 0;
1755
- cmd.common.cdw10[0] = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8);
1756
- cmd.common.cdw10[1] = cpu_to_le32(len);
2327
+ cmd.common.cdw10 = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8);
2328
+ cmd.common.cdw11 = cpu_to_le32(len);
17572329
17582330 return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len,
1759
- ADMIN_TIMEOUT, NVME_QID_ANY, 1, 0);
2331
+ ADMIN_TIMEOUT, NVME_QID_ANY, 1, 0, false);
17602332 }
17612333 EXPORT_SYMBOL_GPL(nvme_sec_submit);
17622334 #endif /* CONFIG_BLK_SED_OPAL */
....@@ -1764,11 +2336,11 @@
17642336 static const struct block_device_operations nvme_fops = {
17652337 .owner = THIS_MODULE,
17662338 .ioctl = nvme_ioctl,
1767
- .compat_ioctl = nvme_ioctl,
2339
+ .compat_ioctl = nvme_compat_ioctl,
17682340 .open = nvme_open,
17692341 .release = nvme_release,
17702342 .getgeo = nvme_getgeo,
1771
- .revalidate_disk= nvme_revalidate_disk,
2343
+ .report_zones = nvme_report_zones,
17722344 .pr_ops = &nvme_pr_ops,
17732345 };
17742346
....@@ -1789,11 +2361,13 @@
17892361
17902362 const struct block_device_operations nvme_ns_head_ops = {
17912363 .owner = THIS_MODULE,
2364
+ .submit_bio = nvme_ns_head_submit_bio,
17922365 .open = nvme_ns_head_open,
17932366 .release = nvme_ns_head_release,
17942367 .ioctl = nvme_ioctl,
1795
- .compat_ioctl = nvme_ioctl,
2368
+ .compat_ioctl = nvme_compat_ioctl,
17962369 .getgeo = nvme_getgeo,
2370
+ .report_zones = nvme_report_zones,
17972371 .pr_ops = &nvme_pr_ops,
17982372 };
17992373 #endif /* CONFIG_NVME_MULTIPATH */
....@@ -1811,13 +2385,13 @@
18112385 if ((csts & NVME_CSTS_RDY) == bit)
18122386 break;
18132387
1814
- msleep(100);
2388
+ usleep_range(1000, 2000);
18152389 if (fatal_signal_pending(current))
18162390 return -EINTR;
18172391 if (time_after(jiffies, timeout)) {
18182392 dev_err(ctrl->device,
1819
- "Device not ready; aborting %s\n", enabled ?
1820
- "initialisation" : "reset");
2393
+ "Device not ready; aborting %s, CSTS=0x%x\n",
2394
+ enabled ? "initialisation" : "reset", csts);
18212395 return -ENODEV;
18222396 }
18232397 }
....@@ -1831,7 +2405,7 @@
18312405 * bits', but doing so may cause the device to complete commands to the
18322406 * admin queue ... and we don't know what memory that might be pointing at!
18332407 */
1834
-int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
2408
+int nvme_disable_ctrl(struct nvme_ctrl *ctrl)
18352409 {
18362410 int ret;
18372411
....@@ -1845,31 +2419,34 @@
18452419 if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY)
18462420 msleep(NVME_QUIRK_DELAY_AMOUNT);
18472421
1848
- return nvme_wait_ready(ctrl, cap, false);
2422
+ return nvme_wait_ready(ctrl, ctrl->cap, false);
18492423 }
18502424 EXPORT_SYMBOL_GPL(nvme_disable_ctrl);
18512425
1852
-int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
2426
+int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
18532427 {
1854
- /*
1855
- * Default to a 4K page size, with the intention to update this
1856
- * path in the future to accomodate architectures with differing
1857
- * kernel and IO page sizes.
1858
- */
1859
- unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12, page_shift = 12;
2428
+ unsigned dev_page_min;
18602429 int ret;
18612430
1862
- if (page_shift < dev_page_min) {
2431
+ ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
2432
+ if (ret) {
2433
+ dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret);
2434
+ return ret;
2435
+ }
2436
+ dev_page_min = NVME_CAP_MPSMIN(ctrl->cap) + 12;
2437
+
2438
+ if (NVME_CTRL_PAGE_SHIFT < dev_page_min) {
18632439 dev_err(ctrl->device,
18642440 "Minimum device page size %u too large for host (%u)\n",
1865
- 1 << dev_page_min, 1 << page_shift);
2441
+ 1 << dev_page_min, 1 << NVME_CTRL_PAGE_SHIFT);
18662442 return -ENODEV;
18672443 }
18682444
1869
- ctrl->page_size = 1 << page_shift;
1870
-
1871
- ctrl->ctrl_config = NVME_CC_CSS_NVM;
1872
- ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT;
2445
+ if (NVME_CAP_CSS(ctrl->cap) & NVME_CAP_CSS_CSI)
2446
+ ctrl->ctrl_config = NVME_CC_CSS_CSI;
2447
+ else
2448
+ ctrl->ctrl_config = NVME_CC_CSS_NVM;
2449
+ ctrl->ctrl_config |= (NVME_CTRL_PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
18732450 ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE;
18742451 ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
18752452 ctrl->ctrl_config |= NVME_CC_ENABLE;
....@@ -1877,7 +2454,7 @@
18772454 ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
18782455 if (ret)
18792456 return ret;
1880
- return nvme_wait_ready(ctrl, cap, true);
2457
+ return nvme_wait_ready(ctrl, ctrl->cap, true);
18812458 }
18822459 EXPORT_SYMBOL_GPL(nvme_enable_ctrl);
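For reference, the CC value that nvme_enable_ctrl() composes above can be reproduced by hand. The sketch below uses the CC field positions from the NVMe specification and assumes a 4 KiB controller page and the NVM command set; it is a worked example, not driver code.

#include <stdio.h>

int main(void)
{
	unsigned int page_shift = 12;		/* host uses 4 KiB controller pages */
	unsigned int cc = 0;

	cc |= 0 << 4;				/* CSS: NVM command set */
	cc |= (page_shift - 12) << 7;		/* MPS: memory page size 2^(12+MPS) */
	cc |= 0 << 11;				/* AMS: round-robin arbitration */
	cc |= 0 << 14;				/* SHN: no shutdown notification */
	cc |= 6 << 16;				/* IOSQES: 64-byte (2^6) SQ entries */
	cc |= 4 << 20;				/* IOCQES: 16-byte (2^4) CQ entries */
	cc |= 1 << 0;				/* EN: enable the controller */

	printf("CC = %#010x\n", cc);		/* 0x00460001 for these values */
	return 0;
}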
18832460
....@@ -1912,28 +2489,6 @@
19122489 }
19132490 EXPORT_SYMBOL_GPL(nvme_shutdown_ctrl);
19142491
1915
-static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
1916
- struct request_queue *q)
1917
-{
1918
- bool vwc = false;
1919
-
1920
- if (ctrl->max_hw_sectors) {
1921
- u32 max_segments =
1922
- (ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1;
1923
-
1924
- max_segments = min_not_zero(max_segments, ctrl->max_segments);
1925
- blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
1926
- blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
1927
- }
1928
- if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
1929
- is_power_of_2(ctrl->max_hw_sectors))
1930
- blk_queue_chunk_sectors(q, ctrl->max_hw_sectors);
1931
- blk_queue_virt_boundary(q, ctrl->page_size - 1);
1932
- if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
1933
- vwc = true;
1934
- blk_queue_write_cache(q, vwc, vwc);
1935
-}
1936
-
19372492 static int nvme_configure_timestamp(struct nvme_ctrl *ctrl)
19382493 {
19392494 __le64 ts;
....@@ -1948,6 +2503,26 @@
19482503 if (ret)
19492504 dev_warn_once(ctrl->device,
19502505 "could not set timestamp (%d)\n", ret);
2506
+ return ret;
2507
+}
2508
+
2509
+static int nvme_configure_acre(struct nvme_ctrl *ctrl)
2510
+{
2511
+ struct nvme_feat_host_behavior *host;
2512
+ int ret;
2513
+
2514
+ /* Don't bother enabling the feature if retry delay is not reported */
2515
+ if (!ctrl->crdt[0])
2516
+ return 0;
2517
+
2518
+ host = kzalloc(sizeof(*host), GFP_KERNEL);
2519
+ if (!host)
2520
+ return 0;
2521
+
2522
+ host->acre = NVME_ENABLE_ACRE;
2523
+ ret = nvme_set_features(ctrl, NVME_FEAT_HOST_BEHAVIOR, 0,
2524
+ host, sizeof(*host), NULL);
2525
+ kfree(host);
19512526 return ret;
19522527 }
19532528
....@@ -2117,6 +2692,44 @@
21172692 .vid = 0x1179,
21182693 .mn = "THNSF5256GPUK TOSHIBA",
21192694 .quirks = NVME_QUIRK_NO_APST,
2695
+ },
2696
+ {
2697
+ /*
2698
+ * This LiteON CL1-3D*-Q11 firmware version has a race condition
2699
+ * associated with actions related to suspend to idle.
2700
+ * LiteON has resolved the problem in later firmware.
2701
+ */
2702
+ .vid = 0x14a4,
2703
+ .fr = "22301111",
2704
+ .quirks = NVME_QUIRK_SIMPLE_SUSPEND,
2705
+ },
2706
+ {
2707
+ /*
2708
+ * This Kioxia CD6-V Series / HPE PE8030 device times out and
2709
+ * aborts I/O during any load, but the problem is most easily
2710
+ * reproduced with discards (fstrim).
2711
+ *
2712
+ * The device is left in a state where it is also not possible
2713
+ * to use "nvme set-feature" to disable APST, but booting with
2714
+ * nvme_core.default_ps_max_latency=0 works.
2715
+ */
2716
+ .vid = 0x1e0f,
2717
+ .mn = "KCD6XVUL6T40",
2718
+ .quirks = NVME_QUIRK_NO_APST,
2719
+ },
2720
+ {
2721
+ /*
2722
+ * The external Samsung X5 SSD fails initialization without a
2723
+ * delay before checking if it is ready and has a whole set of
2724
+ * other problems. To make this even more interesting, it
2725
+ * shares the PCI ID with internal Samsung 970 Evo Plus that
2726
+ * does not need or want these quirks.
2727
+ */
2728
+ .vid = 0x144d,
2729
+ .mn = "Samsung Portable SSD X5",
2730
+ .quirks = NVME_QUIRK_DELAY_BEFORE_CHK_RDY |
2731
+ NVME_QUIRK_NO_DEEPEST_PS |
2732
+ NVME_QUIRK_IGNORE_DEV_SUBNQN,
21202733 }
21212734 };
21222735
....@@ -2155,14 +2768,16 @@
21552768 size_t nqnlen;
21562769 int off;
21572770
2158
- nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE);
2159
- if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) {
2160
- strncpy(subsys->subnqn, id->subnqn, NVMF_NQN_SIZE);
2161
- return;
2162
- }
2771
+ if (!(ctrl->quirks & NVME_QUIRK_IGNORE_DEV_SUBNQN)) {
2772
+ nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE);
2773
+ if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) {
2774
+ strlcpy(subsys->subnqn, id->subnqn, NVMF_NQN_SIZE);
2775
+ return;
2776
+ }
21632777
2164
- if (ctrl->vs >= NVME_VS(1, 2, 1))
2165
- dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n");
2778
+ if (ctrl->vs >= NVME_VS(1, 2, 1))
2779
+ dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n");
2780
+ }
21662781
21672782 /* Generate a "fake" NQN per Figure 254 in NVMe 1.3 + ECN 001 */
21682783 off = snprintf(subsys->subnqn, NVMF_NQN_SIZE,
....@@ -2175,15 +2790,14 @@
21752790 memset(subsys->subnqn + off, 0, sizeof(subsys->subnqn) - off);
21762791 }
21772792
2178
-static void __nvme_release_subsystem(struct nvme_subsystem *subsys)
2179
-{
2180
- ida_simple_remove(&nvme_subsystems_ida, subsys->instance);
2181
- kfree(subsys);
2182
-}
2183
-
21842793 static void nvme_release_subsystem(struct device *dev)
21852794 {
2186
- __nvme_release_subsystem(container_of(dev, struct nvme_subsystem, dev));
2795
+ struct nvme_subsystem *subsys =
2796
+ container_of(dev, struct nvme_subsystem, dev);
2797
+
2798
+ if (subsys->instance >= 0)
2799
+ ida_simple_remove(&nvme_instance_ida, subsys->instance);
2800
+ kfree(subsys);
21872801 }
21882802
21892803 static void nvme_destroy_subsystem(struct kref *ref)
....@@ -2254,8 +2868,8 @@
22542868 { \
22552869 struct nvme_subsystem *subsys = \
22562870 container_of(dev, struct nvme_subsystem, dev); \
2257
- return sprintf(buf, "%.*s\n", \
2258
- (int)sizeof(subsys->field), subsys->field); \
2871
+ return sysfs_emit(buf, "%.*s\n", \
2872
+ (int)sizeof(subsys->field), subsys->field); \
22592873 } \
22602874 static SUBSYS_ATTR_RO(field, S_IRUGO, subsys_##field##_show);
22612875
....@@ -2268,6 +2882,9 @@
22682882 &subsys_attr_serial.attr,
22692883 &subsys_attr_firmware_rev.attr,
22702884 &subsys_attr_subsysnqn.attr,
2885
+#ifdef CONFIG_NVME_MULTIPATH
2886
+ &subsys_attr_iopolicy.attr,
2887
+#endif
22712888 NULL,
22722889 };
22732890
....@@ -2280,20 +2897,39 @@
22802897 NULL,
22812898 };
22822899
2283
-static int nvme_active_ctrls(struct nvme_subsystem *subsys)
2900
+static inline bool nvme_discovery_ctrl(struct nvme_ctrl *ctrl)
22842901 {
2285
- int count = 0;
2286
- struct nvme_ctrl *ctrl;
2902
+ return ctrl->opts && ctrl->opts->discovery_nqn;
2903
+}
22872904
2288
- mutex_lock(&subsys->lock);
2289
- list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
2290
- if (ctrl->state != NVME_CTRL_DELETING &&
2291
- ctrl->state != NVME_CTRL_DEAD)
2292
- count++;
2905
+static bool nvme_validate_cntlid(struct nvme_subsystem *subsys,
2906
+ struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
2907
+{
2908
+ struct nvme_ctrl *tmp;
2909
+
2910
+ lockdep_assert_held(&nvme_subsystems_lock);
2911
+
2912
+ list_for_each_entry(tmp, &subsys->ctrls, subsys_entry) {
2913
+ if (nvme_state_terminal(tmp))
2914
+ continue;
2915
+
2916
+ if (tmp->cntlid == ctrl->cntlid) {
2917
+ dev_err(ctrl->device,
2918
+ "Duplicate cntlid %u with %s, rejecting\n",
2919
+ ctrl->cntlid, dev_name(tmp->device));
2920
+ return false;
2921
+ }
2922
+
2923
+ if ((id->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
2924
+ nvme_discovery_ctrl(ctrl))
2925
+ continue;
2926
+
2927
+ dev_err(ctrl->device,
2928
+ "Subsystem does not support multiple controllers\n");
2929
+ return false;
22932930 }
2294
- mutex_unlock(&subsys->lock);
22952931
2296
- return count;
2932
+ return true;
22972933 }
22982934
22992935 static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
....@@ -2304,12 +2940,8 @@
23042940 subsys = kzalloc(sizeof(*subsys), GFP_KERNEL);
23052941 if (!subsys)
23062942 return -ENOMEM;
2307
- ret = ida_simple_get(&nvme_subsystems_ida, 0, 0, GFP_KERNEL);
2308
- if (ret < 0) {
2309
- kfree(subsys);
2310
- return ret;
2311
- }
2312
- subsys->instance = ret;
2943
+
2944
+ subsys->instance = -1;
23132945 mutex_init(&subsys->lock);
23142946 kref_init(&subsys->ref);
23152947 INIT_LIST_HEAD(&subsys->ctrls);
....@@ -2317,74 +2949,68 @@
23172949 nvme_init_subnqn(subsys, ctrl, id);
23182950 memcpy(subsys->serial, id->sn, sizeof(subsys->serial));
23192951 memcpy(subsys->model, id->mn, sizeof(subsys->model));
2320
- memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev));
23212952 subsys->vendor_id = le16_to_cpu(id->vid);
23222953 subsys->cmic = id->cmic;
2954
+ subsys->awupf = le16_to_cpu(id->awupf);
2955
+#ifdef CONFIG_NVME_MULTIPATH
2956
+ subsys->iopolicy = NVME_IOPOLICY_NUMA;
2957
+#endif
23232958
23242959 subsys->dev.class = nvme_subsys_class;
23252960 subsys->dev.release = nvme_release_subsystem;
23262961 subsys->dev.groups = nvme_subsys_attrs_groups;
2327
- dev_set_name(&subsys->dev, "nvme-subsys%d", subsys->instance);
2962
+ dev_set_name(&subsys->dev, "nvme-subsys%d", ctrl->instance);
23282963 device_initialize(&subsys->dev);
23292964
23302965 mutex_lock(&nvme_subsystems_lock);
23312966 found = __nvme_find_get_subsystem(subsys->subnqn);
23322967 if (found) {
2333
- /*
2334
- * Verify that the subsystem actually supports multiple
2335
- * controllers, else bail out.
2336
- */
2337
- if (!(ctrl->opts && ctrl->opts->discovery_nqn) &&
2338
- nvme_active_ctrls(found) && !(id->cmic & (1 << 1))) {
2339
- dev_err(ctrl->device,
2340
- "ignoring ctrl due to duplicate subnqn (%s).\n",
2341
- found->subnqn);
2342
- nvme_put_subsystem(found);
2343
- ret = -EINVAL;
2344
- goto out_unlock;
2345
- }
2346
-
2347
- __nvme_release_subsystem(subsys);
2968
+ put_device(&subsys->dev);
23482969 subsys = found;
2970
+
2971
+ if (!nvme_validate_cntlid(subsys, ctrl, id)) {
2972
+ ret = -EINVAL;
2973
+ goto out_put_subsystem;
2974
+ }
23492975 } else {
23502976 ret = device_add(&subsys->dev);
23512977 if (ret) {
23522978 dev_err(ctrl->device,
23532979 "failed to register subsystem device.\n");
2980
+ put_device(&subsys->dev);
23542981 goto out_unlock;
23552982 }
23562983 ida_init(&subsys->ns_ida);
23572984 list_add_tail(&subsys->entry, &nvme_subsystems);
23582985 }
23592986
2360
- ctrl->subsys = subsys;
2361
- mutex_unlock(&nvme_subsystems_lock);
2362
-
2363
- if (sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj,
2364
- dev_name(ctrl->device))) {
2987
+ ret = sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj,
2988
+ dev_name(ctrl->device));
2989
+ if (ret) {
23652990 dev_err(ctrl->device,
23662991 "failed to create sysfs link from subsystem.\n");
2367
- /* the transport driver will eventually put the subsystem */
2368
- return -EINVAL;
2992
+ goto out_put_subsystem;
23692993 }
23702994
2371
- mutex_lock(&subsys->lock);
2995
+ if (!found)
2996
+ subsys->instance = ctrl->instance;
2997
+ ctrl->subsys = subsys;
23722998 list_add_tail(&ctrl->subsys_entry, &subsys->ctrls);
2373
- mutex_unlock(&subsys->lock);
2374
-
2999
+ mutex_unlock(&nvme_subsystems_lock);
23753000 return 0;
23763001
3002
+out_put_subsystem:
3003
+ nvme_put_subsystem(subsys);
23773004 out_unlock:
23783005 mutex_unlock(&nvme_subsystems_lock);
2379
- put_device(&subsys->dev);
23803006 return ret;
23813007 }
23823008
2383
-int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp,
3009
+int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
23843010 void *log, size_t size, u64 offset)
23853011 {
23863012 struct nvme_command c = { };
2387
- unsigned long dwlen = size / 4 - 1;
3013
+ u32 dwlen = nvme_bytes_to_numd(size);
23883014
23893015 c.get_log_page.opcode = nvme_admin_get_log_page;
23903016 c.get_log_page.nsid = cpu_to_le32(nsid);
....@@ -2394,27 +3020,35 @@
23943020 c.get_log_page.numdu = cpu_to_le16(dwlen >> 16);
23953021 c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset));
23963022 c.get_log_page.lpou = cpu_to_le32(upper_32_bits(offset));
3023
+ c.get_log_page.csi = csi;
23973024
23983025 return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size);
23993026 }
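The NUMD handling above carries the log length as a zero-based dword count split across NUMDL and NUMDU. A short sketch with example sizes (4 KiB and 1 MiB logs, chosen arbitrarily):

#include <stdio.h>

int main(void)
{
	unsigned long sizes[] = { 4096, 1024 * 1024 };	/* log lengths in bytes */

	for (int i = 0; i < 2; i++) {
		/* zero-based dword count, i.e. nvme_bytes_to_numd() */
		unsigned long numd = sizes[i] / 4 - 1;
		unsigned int numdl = numd & 0xffff;	/* low 16 bits -> NUMDL */
		unsigned int numdu = numd >> 16;	/* high bits   -> NUMDU */

		printf("size=%lu -> numd=%lu numdl=%#x numdu=%#x\n",
		       sizes[i], numd, numdl, numdu);
	}
	return 0;
}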
24003027
2401
-static int nvme_get_effects_log(struct nvme_ctrl *ctrl)
3028
+static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi,
3029
+ struct nvme_effects_log **log)
24023030 {
3031
+ struct nvme_effects_log *cel = xa_load(&ctrl->cels, csi);
24033032 int ret;
24043033
2405
- if (!ctrl->effects)
2406
- ctrl->effects = kzalloc(sizeof(*ctrl->effects), GFP_KERNEL);
3034
+ if (cel)
3035
+ goto out;
24073036
2408
- if (!ctrl->effects)
2409
- return 0;
3037
+ cel = kzalloc(sizeof(*cel), GFP_KERNEL);
3038
+ if (!cel)
3039
+ return -ENOMEM;
24103040
2411
- ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CMD_EFFECTS, 0,
2412
- ctrl->effects, sizeof(*ctrl->effects), 0);
3041
+ ret = nvme_get_log(ctrl, 0x00, NVME_LOG_CMD_EFFECTS, 0, csi,
3042
+ cel, sizeof(*cel), 0);
24133043 if (ret) {
2414
- kfree(ctrl->effects);
2415
- ctrl->effects = NULL;
3044
+ kfree(cel);
3045
+ return ret;
24163046 }
2417
- return ret;
3047
+
3048
+ xa_store(&ctrl->cels, csi, cel, GFP_KERNEL);
3049
+out:
3050
+ *log = cel;
3051
+ return 0;
24183052 }
24193053
24203054 /*
....@@ -2425,7 +3059,6 @@
24253059 int nvme_init_identify(struct nvme_ctrl *ctrl)
24263060 {
24273061 struct nvme_id_ctrl *id;
2428
- u64 cap;
24293062 int ret, page_shift;
24303063 u32 max_hw_sectors;
24313064 bool prev_apst_enabled;
....@@ -2435,16 +3068,11 @@
24353068 dev_err(ctrl->device, "Reading VS failed (%d)\n", ret);
24363069 return ret;
24373070 }
2438
-
2439
- ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &cap);
2440
- if (ret) {
2441
- dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret);
2442
- return ret;
2443
- }
2444
- page_shift = NVME_CAP_MPSMIN(cap) + 12;
3071
+ page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12;
3072
+ ctrl->sqsize = min_t(u16, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);
24453073
24463074 if (ctrl->vs >= NVME_VS(1, 1, 0))
2447
- ctrl->subsystem = NVME_CAP_NSSRC(cap);
3075
+ ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap);
24483076
24493077 ret = nvme_identify_ctrl(ctrl, &id);
24503078 if (ret) {
....@@ -2453,17 +3081,16 @@
24533081 }
24543082
24553083 if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) {
2456
- ret = nvme_get_effects_log(ctrl);
3084
+ ret = nvme_get_effects_log(ctrl, NVME_CSI_NVM, &ctrl->effects);
24573085 if (ret < 0)
24583086 goto out_free;
24593087 }
24603088
3089
+ if (!(ctrl->ops->flags & NVME_F_FABRICS))
3090
+ ctrl->cntlid = le16_to_cpu(id->cntlid);
3091
+
24613092 if (!ctrl->identified) {
24623093 int i;
2463
-
2464
- ret = nvme_init_subsystem(ctrl, id);
2465
- if (ret)
2466
- goto out_free;
24673094
24683095 /*
24693096 * Check for quirks. Quirk can depend on firmware version,
....@@ -2477,19 +3104,32 @@
24773104 if (quirk_matches(id, &core_quirks[i]))
24783105 ctrl->quirks |= core_quirks[i].quirks;
24793106 }
3107
+
3108
+ ret = nvme_init_subsystem(ctrl, id);
3109
+ if (ret)
3110
+ goto out_free;
24803111 }
3112
+ memcpy(ctrl->subsys->firmware_rev, id->fr,
3113
+ sizeof(ctrl->subsys->firmware_rev));
24813114
24823115 if (force_apst && (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) {
24833116 dev_warn(ctrl->device, "forcibly allowing all power states due to nvme_core.force_apst -- use at your own risk\n");
24843117 ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS;
24853118 }
24863119
3120
+ ctrl->crdt[0] = le16_to_cpu(id->crdt1);
3121
+ ctrl->crdt[1] = le16_to_cpu(id->crdt2);
3122
+ ctrl->crdt[2] = le16_to_cpu(id->crdt3);
3123
+
24873124 ctrl->oacs = le16_to_cpu(id->oacs);
2488
- ctrl->oncs = le16_to_cpup(&id->oncs);
3125
+ ctrl->oncs = le16_to_cpu(id->oncs);
3126
+ ctrl->mtfa = le16_to_cpu(id->mtfa);
24893127 ctrl->oaes = le32_to_cpu(id->oaes);
3128
+ ctrl->wctemp = le16_to_cpu(id->wctemp);
3129
+ ctrl->cctemp = le16_to_cpu(id->cctemp);
3130
+
24903131 atomic_set(&ctrl->abort_limit, id->acl + 1);
24913132 ctrl->vwc = id->vwc;
2492
- ctrl->cntlid = le16_to_cpup(&id->cntlid);
24933133 if (id->mdts)
24943134 max_hw_sectors = 1 << (id->mdts + page_shift - 9);
24953135 else
....@@ -2501,10 +3141,11 @@
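The MDTS conversion in the hunk above turns a power-of-two count of minimum-sized controller pages into a transfer limit in 512-byte sectors. A worked example with assumed values (CAP.MPSMIN = 0, MDTS = 5):

#include <stdio.h>

int main(void)
{
	unsigned int mpsmin = 0;		/* CAP.MPSMIN: 2^(12+0) = 4 KiB pages */
	unsigned int page_shift = mpsmin + 12;
	unsigned int mdts = 5;			/* 2^5 = 32 minimum pages per transfer */

	unsigned int max_hw_sectors = 1u << (mdts + page_shift - 9);

	printf("max transfer = %u KiB (%u 512-byte sectors)\n",
	       max_hw_sectors / 2, max_hw_sectors);
	return 0;
}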
25013141 ctrl->sgls = le32_to_cpu(id->sgls);
25023142 ctrl->kas = le16_to_cpu(id->kas);
25033143 ctrl->max_namespaces = le32_to_cpu(id->mnan);
3144
+ ctrl->ctratt = le32_to_cpu(id->ctratt);
25043145
25053146 if (id->rtd3e) {
25063147 /* us -> s */
2507
- u32 transition_time = le32_to_cpu(id->rtd3e) / 1000000;
3148
+ u32 transition_time = le32_to_cpu(id->rtd3e) / USEC_PER_SEC;
25083149
25093150 ctrl->shutdown_timeout = clamp_t(unsigned int, transition_time,
25103151 shutdown_timeout, 60);
....@@ -2542,25 +3183,28 @@
25423183 * admin connect
25433184 */
25443185 if (ctrl->cntlid != le16_to_cpu(id->cntlid)) {
3186
+ dev_err(ctrl->device,
3187
+ "Mismatching cntlid: Connect %u vs Identify "
3188
+ "%u, rejecting\n",
3189
+ ctrl->cntlid, le16_to_cpu(id->cntlid));
25453190 ret = -EINVAL;
25463191 goto out_free;
25473192 }
25483193
2549
- if (!ctrl->opts->discovery_nqn && !ctrl->kas) {
3194
+ if (!nvme_discovery_ctrl(ctrl) && !ctrl->kas) {
25503195 dev_err(ctrl->device,
25513196 "keep-alive support is mandatory for fabrics\n");
25523197 ret = -EINVAL;
25533198 goto out_free;
25543199 }
25553200 } else {
2556
- ctrl->cntlid = le16_to_cpu(id->cntlid);
25573201 ctrl->hmpre = le32_to_cpu(id->hmpre);
25583202 ctrl->hmmin = le32_to_cpu(id->hmmin);
25593203 ctrl->hmminds = le32_to_cpu(id->hmminds);
25603204 ctrl->hmmaxd = le16_to_cpu(id->hmmaxd);
25613205 }
25623206
2563
- ret = nvme_mpath_init(ctrl, id);
3207
+ ret = nvme_mpath_init_identify(ctrl, id);
25643208 kfree(id);
25653209
25663210 if (ret < 0)
....@@ -2583,6 +3227,20 @@
25833227 if (ret < 0)
25843228 return ret;
25853229
3230
+ ret = nvme_configure_acre(ctrl);
3231
+ if (ret < 0)
3232
+ return ret;
3233
+
3234
+ if (!ctrl->identified && !nvme_discovery_ctrl(ctrl)) {
3235
+ /*
3236
+ * Do not return errors unless we are in a controller reset;
3237
+ * the controller works perfectly fine without hwmon.
3238
+ */
3239
+ ret = nvme_hwmon_init(ctrl);
3240
+ if (ret == -EINTR)
3241
+ return ret;
3242
+ }
3243
+
25863244 ctrl->identified = true;
25873245
25883246 return 0;
....@@ -2600,7 +3258,6 @@
26003258
26013259 switch (ctrl->state) {
26023260 case NVME_CTRL_LIVE:
2603
- case NVME_CTRL_ADMIN_ONLY:
26043261 break;
26053262 default:
26063263 return -EWOULDBLOCK;
....@@ -2668,14 +3325,22 @@
26683325 switch (cmd) {
26693326 case NVME_IOCTL_ADMIN_CMD:
26703327 return nvme_user_cmd(ctrl, NULL, argp);
3328
+ case NVME_IOCTL_ADMIN64_CMD:
3329
+ return nvme_user_cmd64(ctrl, NULL, argp);
26713330 case NVME_IOCTL_IO_CMD:
26723331 return nvme_dev_user_cmd(ctrl, argp);
26733332 case NVME_IOCTL_RESET:
3333
+ if (!capable(CAP_SYS_ADMIN))
3334
+ return -EACCES;
26743335 dev_warn(ctrl->device, "resetting controller\n");
26753336 return nvme_reset_ctrl_sync(ctrl);
26763337 case NVME_IOCTL_SUBSYS_RESET:
3338
+ if (!capable(CAP_SYS_ADMIN))
3339
+ return -EACCES;
26773340 return nvme_reset_subsystem(ctrl);
26783341 case NVME_IOCTL_RESCAN:
3342
+ if (!capable(CAP_SYS_ADMIN))
3343
+ return -EACCES;
26793344 nvme_queue_scan(ctrl);
26803345 return 0;
26813346 default:
....@@ -2688,7 +3353,7 @@
26883353 .open = nvme_dev_open,
26893354 .release = nvme_dev_release,
26903355 .unlocked_ioctl = nvme_dev_ioctl,
2691
- .compat_ioctl = nvme_dev_ioctl,
3356
+ .compat_ioctl = compat_ptr_ioctl,
26923357 };
26933358
26943359 static ssize_t nvme_sysfs_reset(struct device *dev,
....@@ -2736,13 +3401,13 @@
27363401 int model_len = sizeof(subsys->model);
27373402
27383403 if (!uuid_is_null(&ids->uuid))
2739
- return sprintf(buf, "uuid.%pU\n", &ids->uuid);
3404
+ return sysfs_emit(buf, "uuid.%pU\n", &ids->uuid);
27403405
27413406 if (memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
2742
- return sprintf(buf, "eui.%16phN\n", ids->nguid);
3407
+ return sysfs_emit(buf, "eui.%16phN\n", ids->nguid);
27433408
27443409 if (memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
2745
- return sprintf(buf, "eui.%8phN\n", ids->eui64);
3410
+ return sysfs_emit(buf, "eui.%8phN\n", ids->eui64);
27463411
27473412 while (serial_len > 0 && (subsys->serial[serial_len - 1] == ' ' ||
27483413 subsys->serial[serial_len - 1] == '\0'))
....@@ -2751,7 +3416,7 @@
27513416 subsys->model[model_len - 1] == '\0'))
27523417 model_len--;
27533418
2754
- return sprintf(buf, "nvme.%04x-%*phN-%*phN-%08x\n", subsys->vendor_id,
3419
+ return sysfs_emit(buf, "nvme.%04x-%*phN-%*phN-%08x\n", subsys->vendor_id,
27553420 serial_len, subsys->serial, model_len, subsys->model,
27563421 head->ns_id);
27573422 }
....@@ -2760,7 +3425,7 @@
27603425 static ssize_t nguid_show(struct device *dev, struct device_attribute *attr,
27613426 char *buf)
27623427 {
2763
- return sprintf(buf, "%pU\n", dev_to_ns_head(dev)->ids.nguid);
3428
+ return sysfs_emit(buf, "%pU\n", dev_to_ns_head(dev)->ids.nguid);
27643429 }
27653430 static DEVICE_ATTR_RO(nguid);
27663431
....@@ -2773,25 +3438,25 @@
27733438 * we have no UUID set
27743439 */
27753440 if (uuid_is_null(&ids->uuid)) {
2776
- printk_ratelimited(KERN_WARNING
2777
- "No UUID available providing old NGUID\n");
2778
- return sprintf(buf, "%pU\n", ids->nguid);
3441
+ dev_warn_ratelimited(dev,
3442
+ "No UUID available providing old NGUID\n");
3443
+ return sysfs_emit(buf, "%pU\n", ids->nguid);
27793444 }
2780
- return sprintf(buf, "%pU\n", &ids->uuid);
3445
+ return sysfs_emit(buf, "%pU\n", &ids->uuid);
27813446 }
27823447 static DEVICE_ATTR_RO(uuid);
27833448
27843449 static ssize_t eui_show(struct device *dev, struct device_attribute *attr,
27853450 char *buf)
27863451 {
2787
- return sprintf(buf, "%8ph\n", dev_to_ns_head(dev)->ids.eui64);
3452
+ return sysfs_emit(buf, "%8ph\n", dev_to_ns_head(dev)->ids.eui64);
27883453 }
27893454 static DEVICE_ATTR_RO(eui);
27903455
27913456 static ssize_t nsid_show(struct device *dev, struct device_attribute *attr,
27923457 char *buf)
27933458 {
2794
- return sprintf(buf, "%d\n", dev_to_ns_head(dev)->ns_id);
3459
+ return sysfs_emit(buf, "%d\n", dev_to_ns_head(dev)->ns_id);
27953460 }
27963461 static DEVICE_ATTR_RO(nsid);
27973462
....@@ -2838,9 +3503,17 @@
28383503 return a->mode;
28393504 }
28403505
2841
-const struct attribute_group nvme_ns_id_attr_group = {
3506
+static const struct attribute_group nvme_ns_id_attr_group = {
28423507 .attrs = nvme_ns_id_attrs,
28433508 .is_visible = nvme_ns_id_attrs_are_visible,
3509
+};
3510
+
3511
+const struct attribute_group *nvme_ns_id_attr_groups[] = {
3512
+ &nvme_ns_id_attr_group,
3513
+#ifdef CONFIG_NVM
3514
+ &nvme_nvm_attr_group,
3515
+#endif
3516
+ NULL,
28443517 };
28453518
28463519 #define nvme_show_str_function(field) \
....@@ -2848,7 +3521,7 @@
28483521 struct device_attribute *attr, char *buf) \
28493522 { \
28503523 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \
2851
- return sprintf(buf, "%.*s\n", \
3524
+ return sysfs_emit(buf, "%.*s\n", \
28523525 (int)sizeof(ctrl->subsys->field), ctrl->subsys->field); \
28533526 } \
28543527 static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
....@@ -2862,21 +3535,20 @@
28623535 struct device_attribute *attr, char *buf) \
28633536 { \
28643537 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \
2865
- return sprintf(buf, "%d\n", ctrl->field); \
3538
+ return sysfs_emit(buf, "%d\n", ctrl->field); \
28663539 } \
28673540 static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
28683541
28693542 nvme_show_int_function(cntlid);
3543
+nvme_show_int_function(numa_node);
3544
+nvme_show_int_function(queue_count);
3545
+nvme_show_int_function(sqsize);
28703546
28713547 static ssize_t nvme_sysfs_delete(struct device *dev,
28723548 struct device_attribute *attr, const char *buf,
28733549 size_t count)
28743550 {
28753551 struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2876
-
2877
- /* Can't delete non-created controllers */
2878
- if (!ctrl->created)
2879
- return -EBUSY;
28803552
28813553 if (device_remove_file_self(dev, attr))
28823554 nvme_delete_ctrl_sync(ctrl);
....@@ -2902,18 +3574,18 @@
29023574 static const char *const state_name[] = {
29033575 [NVME_CTRL_NEW] = "new",
29043576 [NVME_CTRL_LIVE] = "live",
2905
- [NVME_CTRL_ADMIN_ONLY] = "only-admin",
29063577 [NVME_CTRL_RESETTING] = "resetting",
29073578 [NVME_CTRL_CONNECTING] = "connecting",
29083579 [NVME_CTRL_DELETING] = "deleting",
3580
+ [NVME_CTRL_DELETING_NOIO]= "deleting (no IO)",
29093581 [NVME_CTRL_DEAD] = "dead",
29103582 };
29113583
29123584 if ((unsigned)ctrl->state < ARRAY_SIZE(state_name) &&
29133585 state_name[ctrl->state])
2914
- return sprintf(buf, "%s\n", state_name[ctrl->state]);
3586
+ return sysfs_emit(buf, "%s\n", state_name[ctrl->state]);
29153587
2916
- return sprintf(buf, "unknown state\n");
3588
+ return sysfs_emit(buf, "unknown state\n");
29173589 }
29183590
29193591 static DEVICE_ATTR(state, S_IRUGO, nvme_sysfs_show_state, NULL);
....@@ -2928,6 +3600,26 @@
29283600 }
29293601 static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL);
29303602
3603
+static ssize_t nvme_sysfs_show_hostnqn(struct device *dev,
3604
+ struct device_attribute *attr,
3605
+ char *buf)
3606
+{
3607
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3608
+
3609
+ return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->opts->host->nqn);
3610
+}
3611
+static DEVICE_ATTR(hostnqn, S_IRUGO, nvme_sysfs_show_hostnqn, NULL);
3612
+
3613
+static ssize_t nvme_sysfs_show_hostid(struct device *dev,
3614
+ struct device_attribute *attr,
3615
+ char *buf)
3616
+{
3617
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3618
+
3619
+ return snprintf(buf, PAGE_SIZE, "%pU\n", &ctrl->opts->host->id);
3620
+}
3621
+static DEVICE_ATTR(hostid, S_IRUGO, nvme_sysfs_show_hostid, NULL);
3622
+
29313623 static ssize_t nvme_sysfs_show_address(struct device *dev,
29323624 struct device_attribute *attr,
29333625 char *buf)
....@@ -2937,6 +3629,66 @@
29373629 return ctrl->ops->get_address(ctrl, buf, PAGE_SIZE);
29383630 }
29393631 static DEVICE_ATTR(address, S_IRUGO, nvme_sysfs_show_address, NULL);
3632
+
3633
+static ssize_t nvme_ctrl_loss_tmo_show(struct device *dev,
3634
+ struct device_attribute *attr, char *buf)
3635
+{
3636
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3637
+ struct nvmf_ctrl_options *opts = ctrl->opts;
3638
+
3639
+ if (ctrl->opts->max_reconnects == -1)
3640
+ return sysfs_emit(buf, "off\n");
3641
+ return sysfs_emit(buf, "%d\n",
3642
+ opts->max_reconnects * opts->reconnect_delay);
3643
+}
3644
+
3645
+static ssize_t nvme_ctrl_loss_tmo_store(struct device *dev,
3646
+ struct device_attribute *attr, const char *buf, size_t count)
3647
+{
3648
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3649
+ struct nvmf_ctrl_options *opts = ctrl->opts;
3650
+ int ctrl_loss_tmo, err;
3651
+
3652
+ err = kstrtoint(buf, 10, &ctrl_loss_tmo);
3653
+ if (err)
3654
+ return -EINVAL;
3655
+
3656
+ else if (ctrl_loss_tmo < 0)
3657
+ opts->max_reconnects = -1;
3658
+ else
3659
+ opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo,
3660
+ opts->reconnect_delay);
3661
+ return count;
3662
+}
3663
+static DEVICE_ATTR(ctrl_loss_tmo, S_IRUGO | S_IWUSR,
3664
+ nvme_ctrl_loss_tmo_show, nvme_ctrl_loss_tmo_store);
3665
+
3666
+static ssize_t nvme_ctrl_reconnect_delay_show(struct device *dev,
3667
+ struct device_attribute *attr, char *buf)
3668
+{
3669
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3670
+
3671
+ if (ctrl->opts->reconnect_delay == -1)
3672
+ return sysfs_emit(buf, "off\n");
3673
+ return sysfs_emit(buf, "%d\n", ctrl->opts->reconnect_delay);
3674
+}
3675
+
3676
+static ssize_t nvme_ctrl_reconnect_delay_store(struct device *dev,
3677
+ struct device_attribute *attr, const char *buf, size_t count)
3678
+{
3679
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3680
+ unsigned int v;
3681
+ int err;
3682
+
3683
+ err = kstrtou32(buf, 10, &v);
3684
+ if (err)
3685
+ return err;
3686
+
3687
+ ctrl->opts->reconnect_delay = v;
3688
+ return count;
3689
+}
3690
+static DEVICE_ATTR(reconnect_delay, S_IRUGO | S_IWUSR,
3691
+ nvme_ctrl_reconnect_delay_show, nvme_ctrl_reconnect_delay_store);
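The two attributes above convert between a timeout in seconds and a number of reconnect attempts. The sketch below replays that conversion with example values (10 s delay, 605 s timeout, neither taken from the driver defaults) to show the round-up on the store path and the effective value reported back on the show path.

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	int reconnect_delay = 10;	/* seconds between reconnect attempts */
	int ctrl_loss_tmo = 605;	/* seconds before giving up on the controller */

	/* store path: how many attempts fit in the timeout, rounded up */
	int max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo, reconnect_delay);
	/* show path: the effective timeout reported back through sysfs */
	int effective_tmo = max_reconnects * reconnect_delay;

	printf("max_reconnects=%d effective ctrl_loss_tmo=%ds\n",
	       max_reconnects, effective_tmo);
	return 0;
}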
29403692
29413693 static struct attribute *nvme_dev_attrs[] = {
29423694 &dev_attr_reset_controller.attr,
....@@ -2950,6 +3702,13 @@
29503702 &dev_attr_subsysnqn.attr,
29513703 &dev_attr_address.attr,
29523704 &dev_attr_state.attr,
3705
+ &dev_attr_numa_node.attr,
3706
+ &dev_attr_queue_count.attr,
3707
+ &dev_attr_sqsize.attr,
3708
+ &dev_attr_hostnqn.attr,
3709
+ &dev_attr_hostid.attr,
3710
+ &dev_attr_ctrl_loss_tmo.attr,
3711
+ &dev_attr_reconnect_delay.attr,
29533712 NULL
29543713 };
29553714
....@@ -2962,6 +3721,14 @@
29623721 if (a == &dev_attr_delete_controller.attr && !ctrl->ops->delete_ctrl)
29633722 return 0;
29643723 if (a == &dev_attr_address.attr && !ctrl->ops->get_address)
3724
+ return 0;
3725
+ if (a == &dev_attr_hostnqn.attr && !ctrl->opts)
3726
+ return 0;
3727
+ if (a == &dev_attr_hostid.attr && !ctrl->opts)
3728
+ return 0;
3729
+ if (a == &dev_attr_ctrl_loss_tmo.attr && !ctrl->opts)
3730
+ return 0;
3731
+ if (a == &dev_attr_reconnect_delay.attr && !ctrl->opts)
29653732 return 0;
29663733
29673734 return a->mode;
....@@ -2977,7 +3744,7 @@
29773744 NULL,
29783745 };
29793746
2980
-static struct nvme_ns_head *__nvme_find_ns_head(struct nvme_subsystem *subsys,
3747
+static struct nvme_ns_head *nvme_find_ns_head(struct nvme_subsystem *subsys,
29813748 unsigned nsid)
29823749 {
29833750 struct nvme_ns_head *h;
....@@ -2992,17 +3759,15 @@
29923759 return NULL;
29933760 }
29943761
2995
-static int __nvme_check_ids(struct nvme_subsystem *subsys,
2996
- struct nvme_ns_head *new)
3762
+static int nvme_subsys_check_duplicate_ids(struct nvme_subsystem *subsys,
3763
+ struct nvme_ns_ids *ids)
29973764 {
29983765 struct nvme_ns_head *h;
29993766
30003767 lockdep_assert_held(&subsys->lock);
30013768
30023769 list_for_each_entry(h, &subsys->nsheads, entry) {
3003
- if (nvme_ns_ids_valid(&new->ids) &&
3004
- !list_empty(&h->list) &&
3005
- nvme_ns_ids_equal(&new->ids, &h->ids))
3770
+ if (nvme_ns_ids_valid(ids) && nvme_ns_ids_equal(ids, &h->ids))
30063771 return -EINVAL;
30073772 }
30083773
....@@ -3010,12 +3775,17 @@
30103775 }
30113776
30123777 static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
3013
- unsigned nsid, struct nvme_id_ns *id)
3778
+ unsigned nsid, struct nvme_ns_ids *ids)
30143779 {
30153780 struct nvme_ns_head *head;
3781
+ size_t size = sizeof(*head);
30163782 int ret = -ENOMEM;
30173783
3018
- head = kzalloc(sizeof(*head), GFP_KERNEL);
3784
+#ifdef CONFIG_NVME_MULTIPATH
3785
+ size += num_possible_nodes() * sizeof(struct nvme_ns *);
3786
+#endif
3787
+
3788
+ head = kzalloc(size, GFP_KERNEL);
30193789 if (!head)
30203790 goto out;
30213791 ret = ida_simple_get(&ctrl->subsys->ns_ida, 1, 0, GFP_KERNEL);
....@@ -3028,16 +3798,22 @@
30283798 goto out_ida_remove;
30293799 head->subsys = ctrl->subsys;
30303800 head->ns_id = nsid;
3801
+ head->ids = *ids;
30313802 kref_init(&head->ref);
30323803
3033
- nvme_report_ns_ids(ctrl, nsid, id, &head->ids);
3034
-
3035
- ret = __nvme_check_ids(ctrl->subsys, head);
3804
+ ret = nvme_subsys_check_duplicate_ids(ctrl->subsys, &head->ids);
30363805 if (ret) {
30373806 dev_err(ctrl->device,
30383807 "duplicate IDs for nsid %d\n", nsid);
30393808 goto out_cleanup_srcu;
30403809 }
3810
+
3811
+ if (head->ids.csi) {
3812
+ ret = nvme_get_effects_log(ctrl, head->ids.csi, &head->effects);
3813
+ if (ret)
3814
+ goto out_cleanup_srcu;
3815
+ } else
3816
+ head->effects = ctrl->effects;
30413817
30423818 ret = nvme_mpath_alloc_disk(ctrl, head);
30433819 if (ret)
....@@ -3055,56 +3831,55 @@
30553831 out_free_head:
30563832 kfree(head);
30573833 out:
3834
+ if (ret > 0)
3835
+ ret = blk_status_to_errno(nvme_error_status(ret));
30583836 return ERR_PTR(ret);
30593837 }
30603838
30613839 static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid,
3062
- struct nvme_id_ns *id)
3840
+ struct nvme_ns_ids *ids, bool is_shared)
30633841 {
30643842 struct nvme_ctrl *ctrl = ns->ctrl;
3065
- bool is_shared = id->nmic & (1 << 0);
30663843 struct nvme_ns_head *head = NULL;
30673844 int ret = 0;
30683845
30693846 mutex_lock(&ctrl->subsys->lock);
3070
- if (is_shared)
3071
- head = __nvme_find_ns_head(ctrl->subsys, nsid);
3847
+ head = nvme_find_ns_head(ctrl->subsys, nsid);
30723848 if (!head) {
3073
- head = nvme_alloc_ns_head(ctrl, nsid, id);
3849
+ head = nvme_alloc_ns_head(ctrl, nsid, ids);
30743850 if (IS_ERR(head)) {
30753851 ret = PTR_ERR(head);
30763852 goto out_unlock;
30773853 }
3854
+ head->shared = is_shared;
30783855 } else {
3079
- struct nvme_ns_ids ids;
3080
-
3081
- nvme_report_ns_ids(ctrl, nsid, id, &ids);
3082
- if (!nvme_ns_ids_equal(&head->ids, &ids)) {
3856
+ ret = -EINVAL;
3857
+ if (!is_shared || !head->shared) {
3858
+ dev_err(ctrl->device,
3859
+ "Duplicate unshared namespace %d\n", nsid);
3860
+ goto out_put_ns_head;
3861
+ }
3862
+ if (!nvme_ns_ids_equal(&head->ids, ids)) {
30833863 dev_err(ctrl->device,
30843864 "IDs don't match for shared namespace %d\n",
30853865 nsid);
3086
- ret = -EINVAL;
3087
- goto out_unlock;
3866
+ goto out_put_ns_head;
30883867 }
30893868 }
30903869
30913870 list_add_tail(&ns->siblings, &head->list);
30923871 ns->head = head;
3872
+ mutex_unlock(&ctrl->subsys->lock);
3873
+ return 0;
30933874
3875
+out_put_ns_head:
3876
+ nvme_put_ns_head(head);
30943877 out_unlock:
30953878 mutex_unlock(&ctrl->subsys->lock);
30963879 return ret;
30973880 }
30983881
3099
-static int ns_cmp(void *priv, struct list_head *a, struct list_head *b)
3100
-{
3101
- struct nvme_ns *nsa = container_of(a, struct nvme_ns, list);
3102
- struct nvme_ns *nsb = container_of(b, struct nvme_ns, list);
3103
-
3104
- return nsa->head->ns_id - nsb->head->ns_id;
3105
-}
3106
-
3107
-static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
3882
+struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
31083883 {
31093884 struct nvme_ns *ns, *ret = NULL;
31103885
....@@ -3122,76 +3897,59 @@
31223897 up_read(&ctrl->namespaces_rwsem);
31233898 return ret;
31243899 }
3900
+EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU);
31253901
3126
-static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns)
3902
+/*
3903
+ * Add the namespace to the controller list while keeping the list ordered.
3904
+ */
3905
+static void nvme_ns_add_to_ctrl_list(struct nvme_ns *ns)
31273906 {
3128
- struct streams_directive_params s;
3129
- int ret;
3907
+ struct nvme_ns *tmp;
31303908
3131
- if (!ctrl->nr_streams)
3132
- return 0;
3133
-
3134
- ret = nvme_get_stream_params(ctrl, &s, ns->head->ns_id);
3135
- if (ret)
3136
- return ret;
3137
-
3138
- ns->sws = le32_to_cpu(s.sws);
3139
- ns->sgs = le16_to_cpu(s.sgs);
3140
-
3141
- if (ns->sws) {
3142
- unsigned int bs = 1 << ns->lba_shift;
3143
-
3144
- blk_queue_io_min(ns->queue, bs * ns->sws);
3145
- if (ns->sgs)
3146
- blk_queue_io_opt(ns->queue, bs * ns->sws * ns->sgs);
3909
+ list_for_each_entry_reverse(tmp, &ns->ctrl->namespaces, list) {
3910
+ if (tmp->head->ns_id < ns->head->ns_id) {
3911
+ list_add(&ns->list, &tmp->list);
3912
+ return;
3913
+ }
31473914 }
3148
-
3149
- return 0;
3915
+ list_add(&ns->list, &ns->ctrl->namespaces);
31503916 }
31513917
3152
-static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
3918
+static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid,
3919
+ struct nvme_ns_ids *ids)
31533920 {
31543921 struct nvme_ns *ns;
31553922 struct gendisk *disk;
31563923 struct nvme_id_ns *id;
31573924 char disk_name[DISK_NAME_LEN];
3158
- int node = dev_to_node(ctrl->dev), flags = GENHD_FL_EXT_DEVT;
3925
+ int node = ctrl->numa_node, flags = GENHD_FL_EXT_DEVT, ret;
3926
+
3927
+ if (nvme_identify_ns(ctrl, nsid, ids, &id))
3928
+ return;
31593929
31603930 ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
31613931 if (!ns)
3162
- return;
3932
+ goto out_free_id;
31633933
31643934 ns->queue = blk_mq_init_queue(ctrl->tagset);
31653935 if (IS_ERR(ns->queue))
31663936 goto out_free_ns;
3937
+
3938
+ if (ctrl->opts && ctrl->opts->data_digest)
3939
+ blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, ns->queue);
3940
+
31673941 blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue);
3942
+ if (ctrl->ops->flags & NVME_F_PCI_P2PDMA)
3943
+ blk_queue_flag_set(QUEUE_FLAG_PCI_P2PDMA, ns->queue);
3944
+
31683945 ns->queue->queuedata = ns;
31693946 ns->ctrl = ctrl;
3170
-
31713947 kref_init(&ns->kref);
3172
- ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */
31733948
3174
- blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
3175
- nvme_set_queue_limits(ctrl, ns->queue);
3176
-
3177
- id = nvme_identify_ns(ctrl, nsid);
3178
- if (!id)
3949
+ ret = nvme_init_ns_head(ns, nsid, ids, id->nmic & NVME_NS_NMIC_SHARED);
3950
+ if (ret)
31793951 goto out_free_queue;
3180
-
3181
- if (id->ncap == 0)
3182
- goto out_free_id;
3183
-
3184
- if (nvme_init_ns_head(ns, nsid, id))
3185
- goto out_free_id;
3186
- nvme_setup_streams_ns(ctrl, ns);
31873952 nvme_set_disk_name(disk_name, ns, ctrl, &flags);
3188
-
3189
- if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) {
3190
- if (nvme_nvm_register(ns, disk_name, node)) {
3191
- dev_warn(ctrl->device, "LightNVM init failure\n");
3192
- goto out_unlink_ns;
3193
- }
3194
- }
31953953
31963954 disk = alloc_disk_node(0, node);
31973955 if (!disk)
....@@ -3204,38 +3962,46 @@
32043962 memcpy(disk->disk_name, disk_name, DISK_NAME_LEN);
32053963 ns->disk = disk;
32063964
3207
- __nvme_revalidate_disk(disk, id);
3965
+ if (nvme_update_ns_info(ns, id))
3966
+ goto out_put_disk;
3967
+
3968
+ if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) {
3969
+ ret = nvme_nvm_register(ns, disk_name, node);
3970
+ if (ret) {
3971
+ dev_warn(ctrl->device, "LightNVM init failure\n");
3972
+ goto out_put_disk;
3973
+ }
3974
+ }
32083975
32093976 down_write(&ctrl->namespaces_rwsem);
3210
- list_add_tail(&ns->list, &ctrl->namespaces);
3977
+ nvme_ns_add_to_ctrl_list(ns);
32113978 up_write(&ctrl->namespaces_rwsem);
3212
-
32133979 nvme_get_ctrl(ctrl);
32143980
3215
- device_add_disk(ctrl->device, ns->disk);
3216
- if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj,
3217
- &nvme_ns_id_attr_group))
3218
- pr_warn("%s: failed to create sysfs group for identification\n",
3219
- ns->disk->disk_name);
3220
- if (ns->ndev && nvme_nvm_register_sysfs(ns))
3221
- pr_warn("%s: failed to register lightnvm sysfs group for identification\n",
3222
- ns->disk->disk_name);
3981
+ device_add_disk(ctrl->device, ns->disk, nvme_ns_id_attr_groups);
32233982
32243983 nvme_mpath_add_disk(ns, id);
3225
- nvme_fault_inject_init(ns);
3984
+ nvme_fault_inject_init(&ns->fault_inject, ns->disk->disk_name);
32263985 kfree(id);
32273986
32283987 return;
3988
+ out_put_disk:
3989
+ /* prevent double queue cleanup */
3990
+ ns->disk->queue = NULL;
3991
+ put_disk(ns->disk);
32293992 out_unlink_ns:
32303993 mutex_lock(&ctrl->subsys->lock);
32313994 list_del_rcu(&ns->siblings);
3995
+ if (list_empty(&ns->head->list))
3996
+ list_del_init(&ns->head->entry);
32323997 mutex_unlock(&ctrl->subsys->lock);
3233
- out_free_id:
3234
- kfree(id);
3998
+ nvme_put_ns_head(ns->head);
32353999 out_free_queue:
32364000 blk_cleanup_queue(ns->queue);
32374001 out_free_ns:
32384002 kfree(ns);
4003
+ out_free_id:
4004
+ kfree(id);
32394005 }
32404006
32414007 static void nvme_ns_remove(struct nvme_ns *ns)
....@@ -3243,20 +4009,20 @@
32434009 if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
32444010 return;
32454011
3246
- nvme_fault_inject_fini(ns);
4012
+ set_capacity(ns->disk, 0);
4013
+ nvme_fault_inject_fini(&ns->fault_inject);
32474014
32484015 mutex_lock(&ns->ctrl->subsys->lock);
32494016 list_del_rcu(&ns->siblings);
4017
+ if (list_empty(&ns->head->list))
4018
+ list_del_init(&ns->head->entry);
32504019 mutex_unlock(&ns->ctrl->subsys->lock);
4020
+
32514021 synchronize_rcu(); /* guarantee not available in head->list */
32524022 nvme_mpath_clear_current_path(ns);
32534023 synchronize_srcu(&ns->head->srcu); /* wait for concurrent submissions */
32544024
3255
- if (ns->disk && ns->disk->flags & GENHD_FL_UP) {
3256
- sysfs_remove_group(&disk_to_dev(ns->disk)->kobj,
3257
- &nvme_ns_id_attr_group);
3258
- if (ns->ndev)
3259
- nvme_nvm_unregister_sysfs(ns);
4025
+ if (ns->disk->flags & GENHD_FL_UP) {
32604026 del_gendisk(ns->disk);
32614027 blk_cleanup_queue(ns->queue);
32624028 if (blk_get_integrity(ns->disk))
....@@ -3271,17 +4037,91 @@
32714037 nvme_put_ns(ns);
32724038 }
32734039
3274
-static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid)
4040
+static void nvme_ns_remove_by_nsid(struct nvme_ctrl *ctrl, u32 nsid)
32754041 {
4042
+ struct nvme_ns *ns = nvme_find_get_ns(ctrl, nsid);
4043
+
4044
+ if (ns) {
4045
+ nvme_ns_remove(ns);
4046
+ nvme_put_ns(ns);
4047
+ }
4048
+}
4049
+
4050
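+/*
+ * Re-read the Identify Namespace data for an existing namespace, verify that
+ * its identifiers have not changed, and refresh the block device parameters.
+ * The namespace is only removed on a fatal (DNR) error from the device.
+ */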
+static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_ids *ids)
4051
+{
4052
+ struct nvme_id_ns *id;
4053
+ int ret = NVME_SC_INVALID_NS | NVME_SC_DNR;
4054
+
4055
+ if (test_bit(NVME_NS_DEAD, &ns->flags))
4056
+ goto out;
4057
+
4058
+ ret = nvme_identify_ns(ns->ctrl, ns->head->ns_id, ids, &id);
4059
+ if (ret)
4060
+ goto out;
4061
+
4062
+ ret = NVME_SC_INVALID_NS | NVME_SC_DNR;
4063
+ if (!nvme_ns_ids_equal(&ns->head->ids, ids)) {
4064
+ dev_err(ns->ctrl->device,
4065
+ "identifiers changed for nsid %d\n", ns->head->ns_id);
4066
+ goto out_free_id;
4067
+ }
4068
+
4069
+ ret = nvme_update_ns_info(ns, id);
4070
+
4071
+out_free_id:
4072
+ kfree(id);
4073
+out:
4074
+ /*
4075
+ * Only remove the namespace if we got a fatal error back from the
4076
+ * device, otherwise ignore the error and just move on.
4077
+ *
4078
+ * TODO: we should probably schedule a delayed retry here.
4079
+ */
4080
+ if (ret > 0 && (ret & NVME_SC_DNR))
4081
+ nvme_ns_remove(ns);
4082
+ else
4083
+ revalidate_disk_size(ns->disk, true);
4084
+}
4085
+
4086
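+/*
+ * Fetch the namespace identifier descriptors for @nsid, then either
+ * revalidate an already known namespace or allocate a new one.
+ */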
+static void nvme_validate_or_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
4087
+{
4088
+ struct nvme_ns_ids ids = { };
32764089 struct nvme_ns *ns;
4090
+
4091
+ if (nvme_identify_ns_descs(ctrl, nsid, &ids))
4092
+ return;
32774093
32784094 ns = nvme_find_get_ns(ctrl, nsid);
32794095 if (ns) {
3280
- if (ns->disk && revalidate_disk(ns->disk))
3281
- nvme_ns_remove(ns);
4096
+ nvme_validate_ns(ns, &ids);
32824097 nvme_put_ns(ns);
3283
- } else
3284
- nvme_alloc_ns(ctrl, nsid);
4098
+ return;
4099
+ }
4100
+
4101
+ switch (ids.csi) {
4102
+ case NVME_CSI_NVM:
4103
+ nvme_alloc_ns(ctrl, nsid, &ids);
4104
+ break;
4105
+ case NVME_CSI_ZNS:
4106
+ if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
4107
+ dev_warn(ctrl->device,
4108
+ "nsid %u not supported without CONFIG_BLK_DEV_ZONED\n",
4109
+ nsid);
4110
+ break;
4111
+ }
4112
+ if (!nvme_multi_css(ctrl)) {
4113
+ dev_warn(ctrl->device,
4114
+ "command set not reported for nsid: %d\n",
4115
+ nsid);
4116
+ break;
4117
+ }
4118
+ nvme_alloc_ns(ctrl, nsid, &ids);
4119
+ break;
4120
+ default:
4121
+ dev_warn(ctrl->device, "unknown csi %u for nsid %u\n",
4122
+ ids.csi, nsid);
4123
+ break;
4124
+ }
32854125 }
32864126
32874127 static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
....@@ -3302,39 +4142,41 @@
33024142
33034143 }
33044144
3305
-static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn)
4145
+static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
33064146 {
3307
- struct nvme_ns *ns;
4147
+ const int nr_entries = NVME_IDENTIFY_DATA_SIZE / sizeof(__le32);
33084148 __le32 *ns_list;
3309
- unsigned i, j, nsid, prev = 0;
3310
- unsigned num_lists = DIV_ROUND_UP_ULL((u64)nn, 1024);
3311
- int ret = 0;
4149
+ u32 prev = 0;
4150
+ int ret = 0, i;
4151
+
4152
+ if (nvme_ctrl_limited_cns(ctrl))
4153
+ return -EOPNOTSUPP;
33124154
33134155 ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
33144156 if (!ns_list)
33154157 return -ENOMEM;
33164158
3317
- for (i = 0; i < num_lists; i++) {
3318
- ret = nvme_identify_ns_list(ctrl, prev, ns_list);
4159
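+ /*
+ * Page through the Active Namespace ID list (Identify CNS 02h). Each
+ * page holds up to 1024 NSIDs; a zero entry marks the end of the list,
+ * and any NSID skipped between entries belongs to a removed namespace.
+ */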
+ for (;;) {
4160
+ struct nvme_command cmd = {
4161
+ .identify.opcode = nvme_admin_identify,
4162
+ .identify.cns = NVME_ID_CNS_NS_ACTIVE_LIST,
4163
+ .identify.nsid = cpu_to_le32(prev),
4164
+ };
4165
+
4166
+ ret = nvme_submit_sync_cmd(ctrl->admin_q, &cmd, ns_list,
4167
+ NVME_IDENTIFY_DATA_SIZE);
33194168 if (ret)
33204169 goto free;
33214170
3322
- for (j = 0; j < min(nn, 1024U); j++) {
3323
- nsid = le32_to_cpu(ns_list[j]);
3324
- if (!nsid)
4171
+ for (i = 0; i < nr_entries; i++) {
4172
+ u32 nsid = le32_to_cpu(ns_list[i]);
4173
+
4174
+ if (!nsid) /* end of the list? */
33254175 goto out;
3326
-
3327
- nvme_validate_ns(ctrl, nsid);
3328
-
3329
- while (++prev < nsid) {
3330
- ns = nvme_find_get_ns(ctrl, prev);
3331
- if (ns) {
3332
- nvme_ns_remove(ns);
3333
- nvme_put_ns(ns);
3334
- }
3335
- }
4176
+ nvme_validate_or_alloc_ns(ctrl, nsid);
4177
+ while (++prev < nsid)
4178
+ nvme_ns_remove_by_nsid(ctrl, prev);
33364179 }
3337
- nn -= j;
33384180 }
33394181 out:
33404182 nvme_remove_invalid_namespaces(ctrl, prev);
....@@ -3343,12 +4185,18 @@
33434185 return ret;
33444186 }
33454187
3346
-static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl, unsigned nn)
4188
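+/*
+ * Fallback for controllers that cannot report the Active Namespace ID list:
+ * probe every NSID from 1 up to the NN value from Identify Controller.
+ */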
+static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl)
33474189 {
3348
- unsigned i;
4190
+ struct nvme_id_ctrl *id;
4191
+ u32 nn, i;
4192
+
4193
+ if (nvme_identify_ctrl(ctrl, &id))
4194
+ return;
4195
+ nn = le32_to_cpu(id->nn);
4196
+ kfree(id);
33494197
33504198 for (i = 1; i <= nn; i++)
3351
- nvme_validate_ns(ctrl, i);
4199
+ nvme_validate_or_alloc_ns(ctrl, i);
33524200
33534201 nvme_remove_invalid_namespaces(ctrl, nn);
33544202 }
....@@ -3369,8 +4217,8 @@
33694217 * raced with us in reading the log page, which could cause us to miss
33704218 * updates.
33714219 */
3372
- error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0, log,
3373
- log_size, 0);
4220
+ error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0,
4221
+ NVME_CSI_NVM, log, log_size, 0);
33744222 if (error)
33754223 dev_warn(ctrl->device,
33764224 "reading changed ns log failed: %d\n", error);
....@@ -3382,35 +4230,20 @@
33824230 {
33834231 struct nvme_ctrl *ctrl =
33844232 container_of(work, struct nvme_ctrl, scan_work);
3385
- struct nvme_id_ctrl *id;
3386
- unsigned nn;
33874233
3388
- if (ctrl->state != NVME_CTRL_LIVE)
4234
+ /* No tagset on a live ctrl means IO queues could not be created */
4235
+ if (ctrl->state != NVME_CTRL_LIVE || !ctrl->tagset)
33894236 return;
3390
-
3391
- WARN_ON_ONCE(!ctrl->tagset);
33924237
33934238 if (test_and_clear_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events)) {
33944239 dev_info(ctrl->device, "rescanning namespaces.\n");
33954240 nvme_clear_changed_ns_log(ctrl);
33964241 }
33974242
3398
- if (nvme_identify_ctrl(ctrl, &id))
3399
- return;
3400
-
34014243 mutex_lock(&ctrl->scan_lock);
3402
- nn = le32_to_cpu(id->nn);
3403
- if (!nvme_ctrl_limited_cns(ctrl)) {
3404
- if (!nvme_scan_ns_list(ctrl, nn))
3405
- goto out_free_id;
3406
- }
3407
- nvme_scan_ns_sequential(ctrl, nn);
3408
-out_free_id:
4244
+ if (nvme_scan_ns_list(ctrl) != 0)
4245
+ nvme_scan_ns_sequential(ctrl);
34094246 mutex_unlock(&ctrl->scan_lock);
3410
- kfree(id);
3411
- down_write(&ctrl->namespaces_rwsem);
3412
- list_sort(NULL, &ctrl->namespaces, ns_cmp);
3413
- up_write(&ctrl->namespaces_rwsem);
34144247 }
34154248
34164249 /*
....@@ -3422,6 +4255,13 @@
34224255 {
34234256 struct nvme_ns *ns, *next;
34244257 LIST_HEAD(ns_list);
4258
+
4259
+ /*
4260
+ * make sure to requeue I/O to all namespaces as these
4261
+ * might result from the scan itself and must complete
4262
+ * for the scan_work to make progress
4263
+ */
4264
+ nvme_mpath_clear_ctrl_paths(ctrl);
34254265
34264266 /* prevent racing with ns scanning */
34274267 flush_work(&ctrl->scan_work);
....@@ -3435,6 +4275,9 @@
34354275 if (ctrl->state == NVME_CTRL_DEAD)
34364276 nvme_kill_queues(ctrl);
34374277
4278
+ /* this is a no-op when called from the controller reset handler */
4279
+ nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO);
4280
+
34384281 down_write(&ctrl->namespaces_rwsem);
34394282 list_splice_init(&ctrl->namespaces, &ns_list);
34404283 up_write(&ctrl->namespaces_rwsem);
....@@ -3443,6 +4286,33 @@
34434286 nvme_ns_remove(ns);
34444287 }
34454288 EXPORT_SYMBOL_GPL(nvme_remove_namespaces);
4289
+
4290
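+/*
+ * Emit the transport type and, for fabrics controllers, the target and host
+ * addresses as uevent variables so udev rules can identify the connection.
+ */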
+static int nvme_class_uevent(struct device *dev, struct kobj_uevent_env *env)
4291
+{
4292
+ struct nvme_ctrl *ctrl =
4293
+ container_of(dev, struct nvme_ctrl, ctrl_device);
4294
+ struct nvmf_ctrl_options *opts = ctrl->opts;
4295
+ int ret;
4296
+
4297
+ ret = add_uevent_var(env, "NVME_TRTYPE=%s", ctrl->ops->name);
4298
+ if (ret)
4299
+ return ret;
4300
+
4301
+ if (opts) {
4302
+ ret = add_uevent_var(env, "NVME_TRADDR=%s", opts->traddr);
4303
+ if (ret)
4304
+ return ret;
4305
+
4306
+ ret = add_uevent_var(env, "NVME_TRSVCID=%s",
4307
+ opts->trsvcid ?: "none");
4308
+ if (ret)
4309
+ return ret;
4310
+
4311
+ ret = add_uevent_var(env, "NVME_HOST_TRADDR=%s",
4312
+ opts->host_traddr ?: "none");
4313
+ }
4314
+ return ret;
4315
+}
34464316
34474317 static void nvme_aen_uevent(struct nvme_ctrl *ctrl)
34484318 {
....@@ -3498,8 +4368,8 @@
34984368 if (!log)
34994369 return;
35004370
3501
- if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, log,
3502
- sizeof(*log), 0))
4371
+ if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, NVME_CSI_NVM,
4372
+ log, sizeof(*log), 0))
35034373 dev_warn(ctrl->device, "Get FW SLOT INFO log error\n");
35044374 kfree(log);
35054375 }
....@@ -3522,13 +4392,13 @@
35224392 if (time_after(jiffies, fw_act_timeout)) {
35234393 dev_warn(ctrl->device,
35244394 "Fw activation timeout, reset controller\n");
3525
- nvme_reset_ctrl(ctrl);
3526
- break;
4395
+ nvme_try_sched_reset(ctrl);
4396
+ return;
35274397 }
35284398 msleep(100);
35294399 }
35304400
3531
- if (ctrl->state != NVME_CTRL_LIVE)
4401
+ if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE))
35324402 return;
35334403
35344404 nvme_start_queues(ctrl);
....@@ -3538,13 +4408,23 @@
35384408
35394409 static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
35404410 {
3541
- switch ((result & 0xff00) >> 8) {
4411
+ u32 aer_notice_type = (result & 0xff00) >> 8;
4412
+
4413
+ trace_nvme_async_event(ctrl, aer_notice_type);
4414
+
4415
+ switch (aer_notice_type) {
35424416 case NVME_AER_NOTICE_NS_CHANGED:
35434417 set_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events);
35444418 nvme_queue_scan(ctrl);
35454419 break;
35464420 case NVME_AER_NOTICE_FW_ACT_STARTING:
3547
- queue_work(nvme_wq, &ctrl->fw_act_work);
4421
+ /*
4422
+ * We are (ab)using the RESETTING state to prevent subsequent
4423
+ * recovery actions from interfering with the controller's
4424
+ * firmware activation.
4425
+ */
4426
+ if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
4427
+ queue_work(nvme_wq, &ctrl->fw_act_work);
35484428 break;
35494429 #ifdef CONFIG_NVME_MULTIPATH
35504430 case NVME_AER_NOTICE_ANA:
....@@ -3553,6 +4433,9 @@
35534433 queue_work(nvme_wq, &ctrl->ana_work);
35544434 break;
35554435 #endif
4436
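+ /* surface discovery log changes to userspace through the AEN uevent */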
+ case NVME_AER_NOTICE_DISC_CHANGED:
4437
+ ctrl->aen_result = result;
4438
+ break;
35564439 default:
35574440 dev_warn(ctrl->device, "async event result %08x\n", result);
35584441 }
....@@ -3562,11 +4445,12 @@
35624445 volatile union nvme_result *res)
35634446 {
35644447 u32 result = le32_to_cpu(res->u32);
4448
+ u32 aer_type = result & 0x07;
35654449
35664450 if (le16_to_cpu(status) >> 1 != NVME_SC_SUCCESS)
35674451 return;
35684452
3569
- switch (result & 0x7) {
4453
+ switch (aer_type) {
35704454 case NVME_AER_NOTICE:
35714455 nvme_handle_aen_notice(ctrl, result);
35724456 break;
....@@ -3574,6 +4458,7 @@
35744458 case NVME_AER_SMART:
35754459 case NVME_AER_CSS:
35764460 case NVME_AER_VS:
4461
+ trace_nvme_async_event(ctrl, aer_type);
35774462 ctrl->aen_result = result;
35784463 break;
35794464 default:
....@@ -3596,25 +4481,40 @@
35964481
35974482 void nvme_start_ctrl(struct nvme_ctrl *ctrl)
35984483 {
3599
- if (ctrl->kato)
3600
- nvme_start_keep_alive(ctrl);
4484
+ nvme_start_keep_alive(ctrl);
4485
+
4486
+ nvme_enable_aen(ctrl);
36014487
36024488 if (ctrl->queue_count > 1) {
36034489 nvme_queue_scan(ctrl);
3604
- nvme_enable_aen(ctrl);
3605
- queue_work(nvme_wq, &ctrl->async_event_work);
36064490 nvme_start_queues(ctrl);
4491
+ nvme_mpath_update(ctrl);
36074492 }
3608
- ctrl->created = true;
36094493 }
36104494 EXPORT_SYMBOL_GPL(nvme_start_ctrl);
36114495
36124496 void nvme_uninit_ctrl(struct nvme_ctrl *ctrl)
36134497 {
4498
+ nvme_hwmon_exit(ctrl);
4499
+ nvme_fault_inject_fini(&ctrl->fault_inject);
36144500 dev_pm_qos_hide_latency_tolerance(ctrl->device);
36154501 cdev_device_del(&ctrl->cdev, ctrl->device);
4502
+ nvme_put_ctrl(ctrl);
36164503 }
36174504 EXPORT_SYMBOL_GPL(nvme_uninit_ctrl);
4505
+
4506
+static void nvme_free_cels(struct nvme_ctrl *ctrl)
4507
+{
4508
+ struct nvme_effects_log *cel;
4509
+ unsigned long i;
4510
+
4511
+ xa_for_each(&ctrl->cels, i, cel) {
4512
+ xa_erase(&ctrl->cels, i);
4513
+ kfree(cel);
4514
+ }
4515
+
4516
+ xa_destroy(&ctrl->cels);
4517
+}
36184518
36194519 static void nvme_free_ctrl(struct device *dev)
36204520 {
....@@ -3622,16 +4522,18 @@
36224522 container_of(dev, struct nvme_ctrl, ctrl_device);
36234523 struct nvme_subsystem *subsys = ctrl->subsys;
36244524
3625
- ida_simple_remove(&nvme_instance_ida, ctrl->instance);
3626
- kfree(ctrl->effects);
4525
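+ /*
+ * The subsystem may have adopted this controller's instance number for
+ * its own name; in that case the IDA entry is released together with
+ * the subsystem, not here.
+ */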
+ if (!subsys || ctrl->instance != subsys->instance)
4526
+ ida_simple_remove(&nvme_instance_ida, ctrl->instance);
4527
+
4528
+ nvme_free_cels(ctrl);
36274529 nvme_mpath_uninit(ctrl);
36284530 __free_page(ctrl->discard_page);
36294531
36304532 if (subsys) {
3631
- mutex_lock(&subsys->lock);
4533
+ mutex_lock(&nvme_subsystems_lock);
36324534 list_del(&ctrl->subsys_entry);
3633
- mutex_unlock(&subsys->lock);
36344535 sysfs_remove_link(&subsys->dev.kobj, dev_name(ctrl->device));
4536
+ mutex_unlock(&nvme_subsystems_lock);
36354537 }
36364538
36374539 ctrl->ops->free_ctrl(ctrl);
....@@ -3654,14 +4556,17 @@
36544556 spin_lock_init(&ctrl->lock);
36554557 mutex_init(&ctrl->scan_lock);
36564558 INIT_LIST_HEAD(&ctrl->namespaces);
4559
+ xa_init(&ctrl->cels);
36574560 init_rwsem(&ctrl->namespaces_rwsem);
36584561 ctrl->dev = dev;
36594562 ctrl->ops = ops;
36604563 ctrl->quirks = quirks;
4564
+ ctrl->numa_node = NUMA_NO_NODE;
36614565 INIT_WORK(&ctrl->scan_work, nvme_scan_work);
36624566 INIT_WORK(&ctrl->async_event_work, nvme_async_event_work);
36634567 INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work);
36644568 INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work);
4569
+ init_waitqueue_head(&ctrl->state_wq);
36654570
36664571 INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
36674572 memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
....@@ -3692,6 +4597,7 @@
36924597 if (ret)
36934598 goto out_release_instance;
36944599
4600
+ nvme_get_ctrl(ctrl);
36954601 cdev_init(&ctrl->cdev, &nvme_dev_fops);
36964602 ctrl->cdev.owner = ops->module;
36974603 ret = cdev_device_add(&ctrl->cdev, ctrl->device);
....@@ -3706,8 +4612,12 @@
37064612 dev_pm_qos_update_user_latency_tolerance(ctrl->device,
37074613 min(default_ps_max_latency_us, (unsigned long)S32_MAX));
37084614
4615
+ nvme_fault_inject_init(&ctrl->fault_inject, dev_name(ctrl->device));
4616
+ nvme_mpath_init_ctrl(ctrl);
4617
+
37094618 return 0;
37104619 out_free_name:
4620
+ nvme_put_ctrl(ctrl);
37114621 kfree_const(ctrl->device->kobj.name);
37124622 out_release_instance:
37134623 ida_simple_remove(&nvme_instance_ida, ctrl->instance);
....@@ -3753,7 +4663,7 @@
37534663 }
37544664 EXPORT_SYMBOL_GPL(nvme_unfreeze);
37554665
3756
-void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
4666
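+/*
+ * Returns the remaining timeout in jiffies; 0 means not all namespace queues
+ * finished freezing before the timeout expired.
+ */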
+int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
37574667 {
37584668 struct nvme_ns *ns;
37594669
....@@ -3764,6 +4674,7 @@
37644674 break;
37654675 }
37664676 up_read(&ctrl->namespaces_rwsem);
4677
+ return timeout;
37674678 }
37684679 EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
37694680
....@@ -3811,9 +4722,65 @@
38114722 }
38124723 EXPORT_SYMBOL_GPL(nvme_start_queues);
38134724
3814
-int __init nvme_core_init(void)
4725
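+/*
+ * blk_sync_queue() waits for any pending timeout work on each namespace
+ * queue, so no timeout handler is still running once this returns.
+ */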
+void nvme_sync_io_queues(struct nvme_ctrl *ctrl)
4726
+{
4727
+ struct nvme_ns *ns;
4728
+
4729
+ down_read(&ctrl->namespaces_rwsem);
4730
+ list_for_each_entry(ns, &ctrl->namespaces, list)
4731
+ blk_sync_queue(ns->queue);
4732
+ up_read(&ctrl->namespaces_rwsem);
4733
+}
4734
+EXPORT_SYMBOL_GPL(nvme_sync_io_queues);
4735
+
4736
+void nvme_sync_queues(struct nvme_ctrl *ctrl)
4737
+{
4738
+ nvme_sync_io_queues(ctrl);
4739
+ if (ctrl->admin_q)
4740
+ blk_sync_queue(ctrl->admin_q);
4741
+}
4742
+EXPORT_SYMBOL_GPL(nvme_sync_queues);
4743
+
4744
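+/*
+ * Resolve a struct nvme_ctrl from an open /dev/nvmeX character device file;
+ * returns NULL if the file is not an nvme admin char device. Used by the
+ * NVMe target passthru code.
+ */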
+struct nvme_ctrl *nvme_ctrl_from_file(struct file *file)
4745
+{
4746
+ if (file->f_op != &nvme_dev_fops)
4747
+ return NULL;
4748
+ return file->private_data;
4749
+}
4750
+EXPORT_SYMBOL_NS_GPL(nvme_ctrl_from_file, NVME_TARGET_PASSTHRU);
4751
+
4752
+/*
4753
+ * Check we didn't inadvertently grow the command structure sizes:
4754
+ */
4755
+static inline void _nvme_check_size(void)
4756
+{
4757
+ BUILD_BUG_ON(sizeof(struct nvme_common_command) != 64);
4758
+ BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
4759
+ BUILD_BUG_ON(sizeof(struct nvme_identify) != 64);
4760
+ BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
4761
+ BUILD_BUG_ON(sizeof(struct nvme_download_firmware) != 64);
4762
+ BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
4763
+ BUILD_BUG_ON(sizeof(struct nvme_dsm_cmd) != 64);
4764
+ BUILD_BUG_ON(sizeof(struct nvme_write_zeroes_cmd) != 64);
4765
+ BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
4766
+ BUILD_BUG_ON(sizeof(struct nvme_get_log_page_command) != 64);
4767
+ BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
4768
+ BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE);
4769
+ BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE);
4770
+ BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) != NVME_IDENTIFY_DATA_SIZE);
4771
+ BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) != NVME_IDENTIFY_DATA_SIZE);
4772
+ BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
4773
+ BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
4774
+ BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
4775
+ BUILD_BUG_ON(sizeof(struct nvme_directive_cmd) != 64);
4776
+}
4777
+
4779
+static int __init nvme_core_init(void)
38154780 {
38164781 int result = -ENOMEM;
4782
+
4783
+ _nvme_check_size();
38174784
38184785 nvme_wq = alloc_workqueue("nvme-wq",
38194786 WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
....@@ -3839,6 +4806,7 @@
38394806 result = PTR_ERR(nvme_class);
38404807 goto unregister_chrdev;
38414808 }
4809
+ nvme_class->dev_uevent = nvme_class_uevent;
38424810
38434811 nvme_subsys_class = class_create(THIS_MODULE, "nvme-subsystem");
38444812 if (IS_ERR(nvme_subsys_class)) {
....@@ -3861,15 +4829,15 @@
38614829 return result;
38624830 }
38634831
3864
-void nvme_core_exit(void)
4832
+static void __exit nvme_core_exit(void)
38654833 {
3866
- ida_destroy(&nvme_subsystems_ida);
38674834 class_destroy(nvme_subsys_class);
38684835 class_destroy(nvme_class);
38694836 unregister_chrdev_region(nvme_chr_devt, NVME_MINORS);
38704837 destroy_workqueue(nvme_delete_wq);
38714838 destroy_workqueue(nvme_reset_wq);
38724839 destroy_workqueue(nvme_wq);
4840
+ ida_destroy(&nvme_instance_ida);
38734841 }
38744842
38754843 MODULE_LICENSE("GPL");