forked from ~ljy/RK356X_SDK_RELEASE

hc
2024-05-10 cde9070d9970eef1f7ec2360586c802a16230ad8
kernel/drivers/nvme/host/rdma.c
@@ -1,15 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
 * NVMe over Fabrics RDMA host code.
 * Copyright (c) 2015-2016 HGST, a Western Digital Company.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
 */
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 #include <linux/module.h>
@@ -42,6 +34,11 @@

 #define NVME_RDMA_MAX_INLINE_SEGMENTS 4

+#define NVME_RDMA_DATA_SGL_SIZE \
+ (sizeof(struct scatterlist) * NVME_INLINE_SG_CNT)
+#define NVME_RDMA_METADATA_SGL_SIZE \
+ (sizeof(struct scatterlist) * NVME_INLINE_METADATA_SG_CNT)
+
 struct nvme_rdma_device {
 struct ib_device *dev;
 struct ib_pd *pd;
@@ -56,6 +53,11 @@
 u64 dma;
 };

+struct nvme_rdma_sgl {
+ int nents;
+ struct sg_table sg_table;
+};
+
 struct nvme_rdma_queue;
 struct nvme_rdma_request {
 struct nvme_request req;
@@ -66,12 +68,12 @@
 refcount_t ref;
 struct ib_sge sge[1 + NVME_RDMA_MAX_INLINE_SEGMENTS];
 u32 num_sge;
- int nents;
 struct ib_reg_wr reg_wr;
 struct ib_cqe reg_cqe;
 struct nvme_rdma_queue *queue;
- struct sg_table sg_table;
- struct scatterlist first_sgl[];
+ struct nvme_rdma_sgl data_sgl;
+ struct nvme_rdma_sgl *metadata_sgl;
+ bool use_sig_mr;
 };

 enum nvme_rdma_queue_flags {
@@ -93,6 +95,9 @@
 struct rdma_cm_id *cm_id;
 int cm_error;
 struct completion cm_done;
+ bool pi_support;
+ int cq_size;
+ struct mutex queue_lock;
 };

 struct nvme_rdma_ctrl {
@@ -118,8 +123,8 @@
 struct sockaddr_storage src_addr;

 struct nvme_ctrl ctrl;
- struct mutex teardown_lock;
 bool use_inline_data;
+ u32 io_queues[HCTX_MAX_TYPES];
 };

 static inline struct nvme_rdma_ctrl *to_rdma_ctrl(struct nvme_ctrl *ctrl)
@@ -146,21 +151,21 @@
 static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
 struct rdma_cm_event *event);
 static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc);
+static void nvme_rdma_complete_rq(struct request *rq);

 static const struct blk_mq_ops nvme_rdma_mq_ops;
 static const struct blk_mq_ops nvme_rdma_admin_mq_ops;

-/* XXX: really should move to a generic header sooner or later.. */
-static inline void put_unaligned_le24(u32 val, u8 *p)
-{
- *p++ = val;
- *p++ = val >> 8;
- *p++ = val >> 16;
-}
-
 static inline int nvme_rdma_queue_idx(struct nvme_rdma_queue *queue)
 {
 return queue - queue->ctrl->queues;
+}
+
+static bool nvme_rdma_poll_queue(struct nvme_rdma_queue *queue)
+{
+ return nvme_rdma_queue_idx(queue) >
+ queue->ctrl->io_queues[HCTX_TYPE_DEFAULT] +
+ queue->ctrl->io_queues[HCTX_TYPE_READ];
 }

 static inline size_t nvme_rdma_inline_data_size(struct nvme_rdma_queue *queue)
@@ -214,6 +219,11 @@
 if (!ring)
 return NULL;

+ /*
+ * Bind the CQEs (post recv buffers) DMA mapping to the RDMA queue
+ * lifetime. It's safe, since any chage in the underlying RDMA device
+ * will issue error recovery and queue re-creation.
+ */
 for (i = 0; i < ib_queue_size; i++) {
 if (nvme_rdma_alloc_qe(ibdev, &ring[i], capsule_size, dir))
 goto out_free_ring;
@@ -235,8 +245,15 @@

 static int nvme_rdma_wait_for_cm(struct nvme_rdma_queue *queue)
 {
- wait_for_completion_interruptible_timeout(&queue->cm_done,
+ int ret;
+
+ ret = wait_for_completion_interruptible_timeout(&queue->cm_done,
 msecs_to_jiffies(NVME_RDMA_CONNECT_TIMEOUT_MS) + 1);
+ if (ret < 0)
+ return ret;
+ if (ret == 0)
+ return -ETIMEDOUT;
+ WARN_ON_ONCE(queue->cm_error > 0);
 return queue->cm_error;
 }

@@ -258,6 +275,9 @@
 init_attr.qp_type = IB_QPT_RC;
 init_attr.send_cq = queue->ib_cq;
 init_attr.recv_cq = queue->ib_cq;
+ if (queue->pi_support)
+ init_attr.create_flags |= IB_QP_CREATE_INTEGRITY_EN;
+ init_attr.qp_context = queue;

 ret = rdma_create_qp(queue->cm_id, dev->pd, &init_attr);

@@ -268,14 +288,9 @@
 static void nvme_rdma_exit_request(struct blk_mq_tag_set *set,
 struct request *rq, unsigned int hctx_idx)
 {
- struct nvme_rdma_ctrl *ctrl = set->driver_data;
 struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
- int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
- struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx];
- struct nvme_rdma_device *dev = queue->device;

- nvme_rdma_free_qe(dev->dev, &req->sqe, sizeof(struct nvme_command),
- DMA_TO_DEVICE);
+ kfree(req->sqe.data);
 }

 static int nvme_rdma_init_request(struct blk_mq_tag_set *set,
@@ -286,15 +301,17 @@
 struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
 int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
 struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx];
- struct nvme_rdma_device *dev = queue->device;
- struct ib_device *ibdev = dev->dev;
- int ret;

 nvme_req(rq)->ctrl = &ctrl->ctrl;
- ret = nvme_rdma_alloc_qe(ibdev, &req->sqe, sizeof(struct nvme_command),
- DMA_TO_DEVICE);
- if (ret)
- return ret;
+ req->sqe.data = kzalloc(sizeof(struct nvme_command), GFP_KERNEL);
+ if (!req->sqe.data)
+ return -ENOMEM;
+
+ /* metadata nvme_rdma_sgl struct is located after command's data SGL */
+ if (queue->pi_support)
+ req->metadata_sgl = (void *)nvme_req(rq) +
+ sizeof(struct nvme_rdma_request) +
+ NVME_RDMA_DATA_SGL_SIZE;

 req->queue = queue;

@@ -395,6 +412,14 @@
 return NULL;
 }

+static void nvme_rdma_free_cq(struct nvme_rdma_queue *queue)
+{
+ if (nvme_rdma_poll_queue(queue))
+ ib_free_cq(queue->ib_cq);
+ else
+ ib_cq_pool_put(queue->ib_cq, queue->cq_size);
+}
+
 static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue)
 {
 struct nvme_rdma_device *dev;
@@ -406,6 +431,8 @@
 dev = queue->device;
 ibdev = dev->dev;

+ if (queue->pi_support)
+ ib_mr_pool_destroy(queue->qp, &queue->qp->sig_mrs);
 ib_mr_pool_destroy(queue->qp, &queue->qp->rdma_mrs);

 /*
@@ -414,7 +441,7 @@
 * the destruction of the QP shouldn't use rdma_cm API.
 */
 ib_destroy_qp(queue->qp);
- ib_free_cq(queue->ib_cq);
+ nvme_rdma_free_cq(queue);

 nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size,
 sizeof(struct nvme_completion), DMA_FROM_DEVICE);
@@ -422,10 +449,47 @@
 nvme_rdma_dev_put(dev);
 }

-static int nvme_rdma_get_max_fr_pages(struct ib_device *ibdev)
+static int nvme_rdma_get_max_fr_pages(struct ib_device *ibdev, bool pi_support)
 {
- return min_t(u32, NVME_RDMA_MAX_SEGMENTS,
- ibdev->attrs.max_fast_reg_page_list_len);
+ u32 max_page_list_len;
+
+ if (pi_support)
+ max_page_list_len = ibdev->attrs.max_pi_fast_reg_page_list_len;
+ else
+ max_page_list_len = ibdev->attrs.max_fast_reg_page_list_len;
+
+ return min_t(u32, NVME_RDMA_MAX_SEGMENTS, max_page_list_len - 1);
+}
+
+static int nvme_rdma_create_cq(struct ib_device *ibdev,
+ struct nvme_rdma_queue *queue)
+{
+ int ret, comp_vector, idx = nvme_rdma_queue_idx(queue);
+ enum ib_poll_context poll_ctx;
+
+ /*
+ * Spread I/O queues completion vectors according their queue index.
+ * Admin queues can always go on completion vector 0.
+ */
+ comp_vector = (idx == 0 ? idx : idx - 1) % ibdev->num_comp_vectors;
+
+ /* Polling queues need direct cq polling context */
+ if (nvme_rdma_poll_queue(queue)) {
+ poll_ctx = IB_POLL_DIRECT;
+ queue->ib_cq = ib_alloc_cq(ibdev, queue, queue->cq_size,
+ comp_vector, poll_ctx);
+ } else {
+ poll_ctx = IB_POLL_SOFTIRQ;
+ queue->ib_cq = ib_cq_pool_get(ibdev, queue->cq_size,
+ comp_vector, poll_ctx);
+ }
+
+ if (IS_ERR(queue->ib_cq)) {
+ ret = PTR_ERR(queue->ib_cq);
+ return ret;
+ }
+
+ return 0;
 }

 static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
@@ -433,8 +497,7 @@
 struct ib_device *ibdev;
 const int send_wr_factor = 3; /* MR, SEND, INV */
 const int cq_factor = send_wr_factor + 1; /* + RECV */
- int comp_vector, idx = nvme_rdma_queue_idx(queue);
- int ret;
+ int ret, pages_per_mr;

 queue->device = nvme_rdma_find_get_device(queue->cm_id);
 if (!queue->device) {
@@ -444,20 +507,12 @@
 }
 ibdev = queue->device->dev;

- /*
- * Spread I/O queues completion vectors according their queue index.
- * Admin queues can always go on completion vector 0.
- */
- comp_vector = (idx == 0 ? idx : idx - 1) % ibdev->num_comp_vectors;
-
 /* +1 for ib_stop_cq */
- queue->ib_cq = ib_alloc_cq(ibdev, queue,
- cq_factor * queue->queue_size + 1,
- comp_vector, IB_POLL_SOFTIRQ);
- if (IS_ERR(queue->ib_cq)) {
- ret = PTR_ERR(queue->ib_cq);
+ queue->cq_size = cq_factor * queue->queue_size + 1;
+
+ ret = nvme_rdma_create_cq(ibdev, queue);
+ if (ret)
 goto out_put_dev;
- }

 ret = nvme_rdma_create_qp(queue, send_wr_factor);
 if (ret)
@@ -470,28 +525,48 @@
 goto out_destroy_qp;
 }

+ /*
+ * Currently we don't use SG_GAPS MR's so if the first entry is
+ * misaligned we'll end up using two entries for a single data page,
+ * so one additional entry is required.
+ */
+ pages_per_mr = nvme_rdma_get_max_fr_pages(ibdev, queue->pi_support) + 1;
 ret = ib_mr_pool_init(queue->qp, &queue->qp->rdma_mrs,
 queue->queue_size,
 IB_MR_TYPE_MEM_REG,
- nvme_rdma_get_max_fr_pages(ibdev));
+ pages_per_mr, 0);
 if (ret) {
 dev_err(queue->ctrl->ctrl.device,
 "failed to initialize MR pool sized %d for QID %d\n",
- queue->queue_size, idx);
+ queue->queue_size, nvme_rdma_queue_idx(queue));
 goto out_destroy_ring;
+ }
+
+ if (queue->pi_support) {
+ ret = ib_mr_pool_init(queue->qp, &queue->qp->sig_mrs,
+ queue->queue_size, IB_MR_TYPE_INTEGRITY,
+ pages_per_mr, pages_per_mr);
+ if (ret) {
+ dev_err(queue->ctrl->ctrl.device,
+ "failed to initialize PI MR pool sized %d for QID %d\n",
+ queue->queue_size, nvme_rdma_queue_idx(queue));
+ goto out_destroy_mr_pool;
+ }
 }

 set_bit(NVME_RDMA_Q_TR_READY, &queue->flags);

 return 0;

+out_destroy_mr_pool:
+ ib_mr_pool_destroy(queue->qp, &queue->qp->rdma_mrs);
 out_destroy_ring:
 nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size,
 sizeof(struct nvme_completion), DMA_FROM_DEVICE);
 out_destroy_qp:
 rdma_destroy_qp(queue->cm_id);
 out_destroy_ib_cq:
- ib_free_cq(queue->ib_cq);
+ nvme_rdma_free_cq(queue);
 out_put_dev:
 nvme_rdma_dev_put(queue->device);
 return ret;
@@ -505,7 +580,12 @@
 int ret;

 queue = &ctrl->queues[idx];
+ mutex_init(&queue->queue_lock);
 queue->ctrl = ctrl;
+ if (idx && ctrl->ctrl.max_integrity_segments)
+ queue->pi_support = true;
+ else
+ queue->pi_support = false;
 init_completion(&queue->cm_done);

 if (idx > 0)
@@ -520,7 +600,8 @@
 if (IS_ERR(queue->cm_id)) {
 dev_info(ctrl->ctrl.device,
 "failed to create CM ID: %ld\n", PTR_ERR(queue->cm_id));
- return PTR_ERR(queue->cm_id);
+ ret = PTR_ERR(queue->cm_id);
+ goto out_destroy_mutex;
 }

 if (ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR)
@@ -550,16 +631,23 @@
 out_destroy_cm_id:
 rdma_destroy_id(queue->cm_id);
 nvme_rdma_destroy_queue_ib(queue);
+out_destroy_mutex:
+ mutex_destroy(&queue->queue_lock);
 return ret;
+}
+
+static void __nvme_rdma_stop_queue(struct nvme_rdma_queue *queue)
+{
+ rdma_disconnect(queue->cm_id);
+ ib_drain_qp(queue->qp);
 }

 static void nvme_rdma_stop_queue(struct nvme_rdma_queue *queue)
 {
- if (!test_and_clear_bit(NVME_RDMA_Q_LIVE, &queue->flags))
- return;
-
- rdma_disconnect(queue->cm_id);
- ib_drain_qp(queue->qp);
+ mutex_lock(&queue->queue_lock);
+ if (test_and_clear_bit(NVME_RDMA_Q_LIVE, &queue->flags))
+ __nvme_rdma_stop_queue(queue);
+ mutex_unlock(&queue->queue_lock);
 }

 static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue)
@@ -567,8 +655,9 @@
 if (!test_and_clear_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags))
 return;

- nvme_rdma_destroy_queue_ib(queue);
 rdma_destroy_id(queue->cm_id);
+ nvme_rdma_destroy_queue_ib(queue);
+ mutex_destroy(&queue->queue_lock);
 }

 static void nvme_rdma_free_io_queues(struct nvme_rdma_ctrl *ctrl)
@@ -589,18 +678,23 @@

 static int nvme_rdma_start_queue(struct nvme_rdma_ctrl *ctrl, int idx)
 {
+ struct nvme_rdma_queue *queue = &ctrl->queues[idx];
+ bool poll = nvme_rdma_poll_queue(queue);
 int ret;

 if (idx)
- ret = nvmf_connect_io_queue(&ctrl->ctrl, idx);
+ ret = nvmf_connect_io_queue(&ctrl->ctrl, idx, poll);
 else
 ret = nvmf_connect_admin_queue(&ctrl->ctrl);

- if (!ret)
- set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[idx].flags);
- else
+ if (!ret) {
+ set_bit(NVME_RDMA_Q_LIVE, &queue->flags);
+ } else {
+ if (test_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags))
+ __nvme_rdma_stop_queue(queue);
 dev_info(ctrl->ctrl.device,
 "failed to connect queue: %d ret=%d\n", idx, ret);
+ }
 return ret;
 }

@@ -626,18 +720,16 @@
 {
 struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
 struct ib_device *ibdev = ctrl->device->dev;
- unsigned int nr_io_queues;
+ unsigned int nr_io_queues, nr_default_queues;
+ unsigned int nr_read_queues, nr_poll_queues;
 int i, ret;

- nr_io_queues = min(opts->nr_io_queues, num_online_cpus());
-
- /*
- * we map queues according to the device irq vectors for
- * optimal locality so we don't need more queues than
- * completion vectors.
- */
- nr_io_queues = min_t(unsigned int, nr_io_queues,
- ibdev->num_comp_vectors);
+ nr_read_queues = min_t(unsigned int, ibdev->num_comp_vectors,
+ min(opts->nr_io_queues, num_online_cpus()));
+ nr_default_queues = min_t(unsigned int, ibdev->num_comp_vectors,
+ min(opts->nr_write_queues, num_online_cpus()));
+ nr_poll_queues = min(opts->nr_poll_queues, num_online_cpus());
+ nr_io_queues = nr_read_queues + nr_default_queues + nr_poll_queues;

 ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues);
 if (ret)
@@ -652,6 +744,34 @@
 ctrl->ctrl.queue_count = nr_io_queues + 1;
 dev_info(ctrl->ctrl.device,
 "creating %d I/O queues.\n", nr_io_queues);
+
+ if (opts->nr_write_queues && nr_read_queues < nr_io_queues) {
+ /*
+ * separate read/write queues
+ * hand out dedicated default queues only after we have
+ * sufficient read queues.
+ */
+ ctrl->io_queues[HCTX_TYPE_READ] = nr_read_queues;
+ nr_io_queues -= ctrl->io_queues[HCTX_TYPE_READ];
+ ctrl->io_queues[HCTX_TYPE_DEFAULT] =
+ min(nr_default_queues, nr_io_queues);
+ nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
+ } else {
+ /*
+ * shared read/write queues
+ * either no write queues were requested, or we don't have
+ * sufficient queue count to have dedicated default queues.
+ */
+ ctrl->io_queues[HCTX_TYPE_DEFAULT] =
+ min(nr_read_queues, nr_io_queues);
+ nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
+ }
+
+ if (opts->nr_poll_queues && nr_io_queues) {
+ /* map dedicated poll queues only if we have queues left */
+ ctrl->io_queues[HCTX_TYPE_POLL] =
+ min(nr_poll_queues, nr_io_queues);
+ }

 for (i = 1; i < ctrl->ctrl.queue_count; i++) {
 ret = nvme_rdma_alloc_queue(ctrl, i,
@@ -669,15 +789,6 @@
 return ret;
 }

-static void nvme_rdma_free_tagset(struct nvme_ctrl *nctrl,
- struct blk_mq_tag_set *set)
-{
- struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
-
- blk_mq_free_tag_set(set);
- nvme_rdma_dev_put(ctrl->device);
-}
-
 static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
 bool admin)
 {
@@ -691,9 +802,9 @@
 set->ops = &nvme_rdma_admin_mq_ops;
 set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
 set->reserved_tags = 2; /* connect + keep-alive */
- set->numa_node = NUMA_NO_NODE;
+ set->numa_node = nctrl->numa_node;
 set->cmd_size = sizeof(struct nvme_rdma_request) +
- SG_CHUNK_SIZE * sizeof(struct scatterlist);
+ NVME_RDMA_DATA_SGL_SIZE;
 set->driver_data = ctrl;
 set->nr_hw_queues = 1;
 set->timeout = ADMIN_TIMEOUT;
@@ -704,35 +815,24 @@
 set->ops = &nvme_rdma_mq_ops;
 set->queue_depth = nctrl->sqsize + 1;
 set->reserved_tags = 1; /* fabric connect */
- set->numa_node = NUMA_NO_NODE;
+ set->numa_node = nctrl->numa_node;
 set->flags = BLK_MQ_F_SHOULD_MERGE;
 set->cmd_size = sizeof(struct nvme_rdma_request) +
- SG_CHUNK_SIZE * sizeof(struct scatterlist);
+ NVME_RDMA_DATA_SGL_SIZE;
+ if (nctrl->max_integrity_segments)
+ set->cmd_size += sizeof(struct nvme_rdma_sgl) +
+ NVME_RDMA_METADATA_SGL_SIZE;
 set->driver_data = ctrl;
 set->nr_hw_queues = nctrl->queue_count - 1;
 set->timeout = NVME_IO_TIMEOUT;
+ set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
 }

 ret = blk_mq_alloc_tag_set(set);
 if (ret)
- goto out;
-
- /*
- * We need a reference on the device as long as the tag_set is alive,
- * as the MRs in the request structures need a valid ib_device.
- */
- ret = nvme_rdma_dev_get(ctrl->device);
- if (!ret) {
- ret = -EINVAL;
- goto out_free_tagset;
- }
+ return ERR_PTR(ret);

 return set;
-
-out_free_tagset:
- blk_mq_free_tag_set(set);
-out:
- return ERR_PTR(ret);
 }

 static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl,
@@ -740,7 +840,8 @@
 {
 if (remove) {
 blk_cleanup_queue(ctrl->ctrl.admin_q);
- nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset);
+ blk_cleanup_queue(ctrl->ctrl.fabrics_q);
+ blk_mq_free_tag_set(ctrl->ctrl.admin_tagset);
 }
 if (ctrl->async_event_sqe.data) {
 cancel_work_sync(&ctrl->ctrl.async_event_work);
@@ -754,6 +855,7 @@
 static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
 bool new)
 {
+ bool pi_capable = false;
 int error;

 error = nvme_rdma_alloc_queue(ctrl, 0, NVME_AQ_DEPTH);
@@ -761,9 +863,21 @@
 return error;

 ctrl->device = ctrl->queues[0].device;
+ ctrl->ctrl.numa_node = ibdev_to_node(ctrl->device->dev);

- ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev);
+ /* T10-PI support */
+ if (ctrl->device->dev->attrs.device_cap_flags &
+ IB_DEVICE_INTEGRITY_HANDOVER)
+ pi_capable = true;

+ ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev,
+ pi_capable);
+
+ /*
+ * Bind the async event SQE DMA mapping to the admin queue lifetime.
+ * It's safe, since any chage in the underlying RDMA device will issue
+ * error recovery and queue re-creation.
+ */
 error = nvme_rdma_alloc_qe(ctrl->device->dev, &ctrl->async_event_sqe,
 sizeof(struct nvme_command), DMA_TO_DEVICE);
 if (error)
@@ -776,10 +890,16 @@
 goto out_free_async_qe;
 }

+ ctrl->ctrl.fabrics_q = blk_mq_init_queue(&ctrl->admin_tag_set);
+ if (IS_ERR(ctrl->ctrl.fabrics_q)) {
+ error = PTR_ERR(ctrl->ctrl.fabrics_q);
+ goto out_free_tagset;
+ }
+
 ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set);
 if (IS_ERR(ctrl->ctrl.admin_q)) {
 error = PTR_ERR(ctrl->ctrl.admin_q);
- goto out_free_tagset;
+ goto out_cleanup_fabrics_q;
 }
 }

....@@ -787,38 +907,40 @@
787907 if (error)
788908 goto out_cleanup_queue;
789909
790
- error = ctrl->ctrl.ops->reg_read64(&ctrl->ctrl, NVME_REG_CAP,
791
- &ctrl->ctrl.cap);
792
- if (error) {
793
- dev_err(ctrl->ctrl.device,
794
- "prop_get NVME_REG_CAP failed\n");
795
- goto out_stop_queue;
796
- }
797
-
798
- ctrl->ctrl.sqsize =
799
- min_t(int, NVME_CAP_MQES(ctrl->ctrl.cap), ctrl->ctrl.sqsize);
800
-
801
- error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap);
910
+ error = nvme_enable_ctrl(&ctrl->ctrl);
802911 if (error)
803912 goto out_stop_queue;
804913
805
- ctrl->ctrl.max_hw_sectors =
806
- (ctrl->max_fr_pages - 1) << (ilog2(SZ_4K) - 9);
914
+ ctrl->ctrl.max_segments = ctrl->max_fr_pages;
915
+ ctrl->ctrl.max_hw_sectors = ctrl->max_fr_pages << (ilog2(SZ_4K) - 9);
916
+ if (pi_capable)
917
+ ctrl->ctrl.max_integrity_segments = ctrl->max_fr_pages;
918
+ else
919
+ ctrl->ctrl.max_integrity_segments = 0;
920
+
921
+ blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
807922
808923 error = nvme_init_identify(&ctrl->ctrl);
809924 if (error)
810
- goto out_stop_queue;
925
+ goto out_quiesce_queue;
811926
812927 return 0;
813928
929
+out_quiesce_queue:
930
+ blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
931
+ blk_sync_queue(ctrl->ctrl.admin_q);
814932 out_stop_queue:
815933 nvme_rdma_stop_queue(&ctrl->queues[0]);
934
+ nvme_cancel_admin_tagset(&ctrl->ctrl);
816935 out_cleanup_queue:
817936 if (new)
818937 blk_cleanup_queue(ctrl->ctrl.admin_q);
938
+out_cleanup_fabrics_q:
939
+ if (new)
940
+ blk_cleanup_queue(ctrl->ctrl.fabrics_q);
819941 out_free_tagset:
820942 if (new)
821
- nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset);
943
+ blk_mq_free_tag_set(ctrl->ctrl.admin_tagset);
822944 out_free_async_qe:
823945 if (ctrl->async_event_sqe.data) {
824946 nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe,
....@@ -835,7 +957,7 @@
835957 {
836958 if (remove) {
837959 blk_cleanup_queue(ctrl->ctrl.connect_q);
838
- nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.tagset);
960
+ blk_mq_free_tag_set(ctrl->ctrl.tagset);
839961 }
840962 nvme_rdma_free_io_queues(ctrl);
841963 }
....@@ -860,23 +982,43 @@
860982 ret = PTR_ERR(ctrl->ctrl.connect_q);
861983 goto out_free_tag_set;
862984 }
863
- } else {
864
- blk_mq_update_nr_hw_queues(&ctrl->tag_set,
865
- ctrl->ctrl.queue_count - 1);
866985 }
867986
868987 ret = nvme_rdma_start_io_queues(ctrl);
869988 if (ret)
870989 goto out_cleanup_connect_q;
871990
991
+ if (!new) {
992
+ nvme_start_freeze(&ctrl->ctrl);
993
+ nvme_start_queues(&ctrl->ctrl);
994
+ if (!nvme_wait_freeze_timeout(&ctrl->ctrl, NVME_IO_TIMEOUT)) {
995
+ /*
996
+ * If we timed out waiting for freeze we are likely to
997
+ * be stuck. Fail the controller initialization just
998
+ * to be safe.
999
+ */
1000
+ ret = -ENODEV;
1001
+ nvme_unfreeze(&ctrl->ctrl);
1002
+ goto out_wait_freeze_timed_out;
1003
+ }
1004
+ blk_mq_update_nr_hw_queues(ctrl->ctrl.tagset,
1005
+ ctrl->ctrl.queue_count - 1);
1006
+ nvme_unfreeze(&ctrl->ctrl);
1007
+ }
1008
+
8721009 return 0;
8731010
1011
+out_wait_freeze_timed_out:
1012
+ nvme_stop_queues(&ctrl->ctrl);
1013
+ nvme_sync_io_queues(&ctrl->ctrl);
1014
+ nvme_rdma_stop_io_queues(ctrl);
8741015 out_cleanup_connect_q:
1016
+ nvme_cancel_tagset(&ctrl->ctrl);
8751017 if (new)
8761018 blk_cleanup_queue(ctrl->ctrl.connect_q);
8771019 out_free_tag_set:
8781020 if (new)
879
- nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.tagset);
1021
+ blk_mq_free_tag_set(ctrl->ctrl.tagset);
8801022 out_free_io_queues:
8811023 nvme_rdma_free_io_queues(ctrl);
8821024 return ret;
....@@ -885,32 +1027,35 @@
8851027 static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl,
8861028 bool remove)
8871029 {
888
- mutex_lock(&ctrl->teardown_lock);
8891030 blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
1031
+ blk_sync_queue(ctrl->ctrl.admin_q);
8901032 nvme_rdma_stop_queue(&ctrl->queues[0]);
891
- if (ctrl->ctrl.admin_tagset)
1033
+ if (ctrl->ctrl.admin_tagset) {
8921034 blk_mq_tagset_busy_iter(ctrl->ctrl.admin_tagset,
8931035 nvme_cancel_request, &ctrl->ctrl);
894
- blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
1036
+ blk_mq_tagset_wait_completed_request(ctrl->ctrl.admin_tagset);
1037
+ }
1038
+ if (remove)
1039
+ blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
8951040 nvme_rdma_destroy_admin_queue(ctrl, remove);
896
- mutex_unlock(&ctrl->teardown_lock);
8971041 }
8981042
8991043 static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
9001044 bool remove)
9011045 {
902
- mutex_lock(&ctrl->teardown_lock);
9031046 if (ctrl->ctrl.queue_count > 1) {
9041047 nvme_stop_queues(&ctrl->ctrl);
1048
+ nvme_sync_io_queues(&ctrl->ctrl);
9051049 nvme_rdma_stop_io_queues(ctrl);
906
- if (ctrl->ctrl.tagset)
1050
+ if (ctrl->ctrl.tagset) {
9071051 blk_mq_tagset_busy_iter(ctrl->ctrl.tagset,
9081052 nvme_cancel_request, &ctrl->ctrl);
1053
+ blk_mq_tagset_wait_completed_request(ctrl->ctrl.tagset);
1054
+ }
9091055 if (remove)
9101056 nvme_start_queues(&ctrl->ctrl);
9111057 nvme_rdma_destroy_io_queues(ctrl, remove);
9121058 }
913
- mutex_unlock(&ctrl->teardown_lock);
9141059 }
9151060
9161061 static void nvme_rdma_stop_ctrl(struct nvme_ctrl *nctrl)
....@@ -1003,8 +1148,14 @@
10031148
10041149 changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
10051150 if (!changed) {
1006
- /* state change failure is ok if we're in DELETING state */
1007
- WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING);
1151
+ /*
1152
+ * state change failure is ok if we started ctrl delete,
1153
+ * unless we're during creation of a new controller to
1154
+ * avoid races with teardown flow.
1155
+ */
1156
+ WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING &&
1157
+ ctrl->ctrl.state != NVME_CTRL_DELETING_NOIO);
1158
+ WARN_ON_ONCE(new);
10081159 ret = -EINVAL;
10091160 goto destroy_io;
10101161 }
....@@ -1013,10 +1164,18 @@
10131164 return 0;
10141165
10151166 destroy_io:
1016
- if (ctrl->ctrl.queue_count > 1)
1167
+ if (ctrl->ctrl.queue_count > 1) {
1168
+ nvme_stop_queues(&ctrl->ctrl);
1169
+ nvme_sync_io_queues(&ctrl->ctrl);
1170
+ nvme_rdma_stop_io_queues(ctrl);
1171
+ nvme_cancel_tagset(&ctrl->ctrl);
10171172 nvme_rdma_destroy_io_queues(ctrl, new);
1173
+ }
10181174 destroy_admin:
1175
+ blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
1176
+ blk_sync_queue(ctrl->ctrl.admin_q);
10191177 nvme_rdma_stop_queue(&ctrl->queues[0]);
1178
+ nvme_cancel_admin_tagset(&ctrl->ctrl);
10201179 nvme_rdma_destroy_admin_queue(ctrl, new);
10211180 return ret;
10221181 }
....@@ -1054,10 +1213,12 @@
10541213 nvme_rdma_teardown_io_queues(ctrl, false);
10551214 nvme_start_queues(&ctrl->ctrl);
10561215 nvme_rdma_teardown_admin_queue(ctrl, false);
1216
+ blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
10571217
10581218 if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
1059
- /* state change failure is ok if we're in DELETING state */
1060
- WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING);
1219
+ /* state change failure is ok if we started ctrl delete */
1220
+ WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING &&
1221
+ ctrl->ctrl.state != NVME_CTRL_DELETING_NOIO);
10611222 return;
10621223 }
10631224
....@@ -1069,13 +1230,24 @@
10691230 if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING))
10701231 return;
10711232
1072
- queue_work(nvme_wq, &ctrl->err_work);
1233
+ dev_warn(ctrl->ctrl.device, "starting error recovery\n");
1234
+ queue_work(nvme_reset_wq, &ctrl->err_work);
1235
+}
1236
+
1237
+static void nvme_rdma_end_request(struct nvme_rdma_request *req)
1238
+{
1239
+ struct request *rq = blk_mq_rq_from_pdu(req);
1240
+
1241
+ if (!refcount_dec_and_test(&req->ref))
1242
+ return;
1243
+ if (!nvme_try_complete_req(rq, req->status, req->result))
1244
+ nvme_rdma_complete_rq(rq);
10731245 }
10741246
10751247 static void nvme_rdma_wr_error(struct ib_cq *cq, struct ib_wc *wc,
10761248 const char *op)
10771249 {
1078
- struct nvme_rdma_queue *queue = cq->cq_context;
1250
+ struct nvme_rdma_queue *queue = wc->qp->qp_context;
10791251 struct nvme_rdma_ctrl *ctrl = queue->ctrl;
10801252
10811253 if (ctrl->ctrl.state == NVME_CTRL_LIVE)
....@@ -1096,16 +1268,11 @@
10961268 {
10971269 struct nvme_rdma_request *req =
10981270 container_of(wc->wr_cqe, struct nvme_rdma_request, reg_cqe);
1099
- struct request *rq = blk_mq_rq_from_pdu(req);
11001271
1101
- if (unlikely(wc->status != IB_WC_SUCCESS)) {
1272
+ if (unlikely(wc->status != IB_WC_SUCCESS))
11021273 nvme_rdma_wr_error(cq, wc, "LOCAL_INV");
1103
- return;
1104
- }
1105
-
1106
- if (refcount_dec_and_test(&req->ref))
1107
- nvme_end_request(rq, req->status, req->result);
1108
-
1274
+ else
1275
+ nvme_rdma_end_request(req);
11091276 }
11101277
11111278 static int nvme_rdma_inv_rkey(struct nvme_rdma_queue *queue,
....@@ -1131,21 +1298,29 @@
11311298 struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
11321299 struct nvme_rdma_device *dev = queue->device;
11331300 struct ib_device *ibdev = dev->dev;
1301
+ struct list_head *pool = &queue->qp->rdma_mrs;
11341302
1135
- if (!blk_rq_payload_bytes(rq))
1303
+ if (!blk_rq_nr_phys_segments(rq))
11361304 return;
11371305
1306
+ if (blk_integrity_rq(rq)) {
1307
+ ib_dma_unmap_sg(ibdev, req->metadata_sgl->sg_table.sgl,
1308
+ req->metadata_sgl->nents, rq_dma_dir(rq));
1309
+ sg_free_table_chained(&req->metadata_sgl->sg_table,
1310
+ NVME_INLINE_METADATA_SG_CNT);
1311
+ }
1312
+
1313
+ if (req->use_sig_mr)
1314
+ pool = &queue->qp->sig_mrs;
1315
+
11381316 if (req->mr) {
1139
- ib_mr_pool_put(queue->qp, &queue->qp->rdma_mrs, req->mr);
1317
+ ib_mr_pool_put(queue->qp, pool, req->mr);
11401318 req->mr = NULL;
11411319 }
11421320
1143
- ib_dma_unmap_sg(ibdev, req->sg_table.sgl,
1144
- req->nents, rq_data_dir(rq) ==
1145
- WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
1146
-
1147
- nvme_cleanup_cmd(rq);
1148
- sg_free_table_chained(&req->sg_table, true);
1321
+ ib_dma_unmap_sg(ibdev, req->data_sgl.sg_table.sgl, req->data_sgl.nents,
1322
+ rq_dma_dir(rq));
1323
+ sg_free_table_chained(&req->data_sgl.sg_table, NVME_INLINE_SG_CNT);
11491324 }
11501325
11511326 static int nvme_rdma_set_sg_null(struct nvme_command *c)
....@@ -1164,16 +1339,17 @@
11641339 int count)
11651340 {
11661341 struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
1167
- struct scatterlist *sgl = req->sg_table.sgl;
11681342 struct ib_sge *sge = &req->sge[1];
1343
+ struct scatterlist *sgl;
11691344 u32 len = 0;
11701345 int i;
11711346
1172
- for (i = 0; i < count; i++, sgl++, sge++) {
1347
+ for_each_sg(req->data_sgl.sg_table.sgl, sgl, count, i) {
11731348 sge->addr = sg_dma_address(sgl);
11741349 sge->length = sg_dma_len(sgl);
11751350 sge->lkey = queue->device->pd->local_dma_lkey;
11761351 len += sge->length;
1352
+ sge++;
11771353 }
11781354
11791355 sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);
....@@ -1189,8 +1365,8 @@
11891365 {
11901366 struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
11911367
1192
- sg->addr = cpu_to_le64(sg_dma_address(req->sg_table.sgl));
1193
- put_unaligned_le24(sg_dma_len(req->sg_table.sgl), sg->length);
1368
+ sg->addr = cpu_to_le64(sg_dma_address(req->data_sgl.sg_table.sgl));
1369
+ put_unaligned_le24(sg_dma_len(req->data_sgl.sg_table.sgl), sg->length);
11941370 put_unaligned_le32(queue->device->pd->unsafe_global_rkey, sg->key);
11951371 sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4;
11961372 return 0;
....@@ -1211,7 +1387,8 @@
12111387 * Align the MR to a 4K page size to match the ctrl page size and
12121388 * the block virtual boundary.
12131389 */
1214
- nr = ib_map_mr_sg(req->mr, req->sg_table.sgl, count, NULL, SZ_4K);
1390
+ nr = ib_map_mr_sg(req->mr, req->data_sgl.sg_table.sgl, count, NULL,
1391
+ SZ_4K);
12151392 if (unlikely(nr < count)) {
12161393 ib_mr_pool_put(queue->qp, &queue->qp->rdma_mrs, req->mr);
12171394 req->mr = NULL;
....@@ -1242,12 +1419,125 @@
12421419 return 0;
12431420 }
12441421
1422
+static void nvme_rdma_set_sig_domain(struct blk_integrity *bi,
1423
+ struct nvme_command *cmd, struct ib_sig_domain *domain,
1424
+ u16 control, u8 pi_type)
1425
+{
1426
+ domain->sig_type = IB_SIG_TYPE_T10_DIF;
1427
+ domain->sig.dif.bg_type = IB_T10DIF_CRC;
1428
+ domain->sig.dif.pi_interval = 1 << bi->interval_exp;
1429
+ domain->sig.dif.ref_tag = le32_to_cpu(cmd->rw.reftag);
1430
+ if (control & NVME_RW_PRINFO_PRCHK_REF)
1431
+ domain->sig.dif.ref_remap = true;
1432
+
1433
+ domain->sig.dif.app_tag = le16_to_cpu(cmd->rw.apptag);
1434
+ domain->sig.dif.apptag_check_mask = le16_to_cpu(cmd->rw.appmask);
1435
+ domain->sig.dif.app_escape = true;
1436
+ if (pi_type == NVME_NS_DPS_PI_TYPE3)
1437
+ domain->sig.dif.ref_escape = true;
1438
+}
1439
+
1440
+static void nvme_rdma_set_sig_attrs(struct blk_integrity *bi,
1441
+ struct nvme_command *cmd, struct ib_sig_attrs *sig_attrs,
1442
+ u8 pi_type)
1443
+{
1444
+ u16 control = le16_to_cpu(cmd->rw.control);
1445
+
1446
+ memset(sig_attrs, 0, sizeof(*sig_attrs));
1447
+ if (control & NVME_RW_PRINFO_PRACT) {
1448
+ /* for WRITE_INSERT/READ_STRIP no memory domain */
1449
+ sig_attrs->mem.sig_type = IB_SIG_TYPE_NONE;
1450
+ nvme_rdma_set_sig_domain(bi, cmd, &sig_attrs->wire, control,
1451
+ pi_type);
1452
+ /* Clear the PRACT bit since HCA will generate/verify the PI */
1453
+ control &= ~NVME_RW_PRINFO_PRACT;
1454
+ cmd->rw.control = cpu_to_le16(control);
1455
+ } else {
1456
+ /* for WRITE_PASS/READ_PASS both wire/memory domains exist */
1457
+ nvme_rdma_set_sig_domain(bi, cmd, &sig_attrs->wire, control,
1458
+ pi_type);
1459
+ nvme_rdma_set_sig_domain(bi, cmd, &sig_attrs->mem, control,
1460
+ pi_type);
1461
+ }
1462
+}
1463
+
1464
+static void nvme_rdma_set_prot_checks(struct nvme_command *cmd, u8 *mask)
1465
+{
1466
+ *mask = 0;
1467
+ if (le16_to_cpu(cmd->rw.control) & NVME_RW_PRINFO_PRCHK_REF)
1468
+ *mask |= IB_SIG_CHECK_REFTAG;
1469
+ if (le16_to_cpu(cmd->rw.control) & NVME_RW_PRINFO_PRCHK_GUARD)
1470
+ *mask |= IB_SIG_CHECK_GUARD;
1471
+}
1472
+
1473
+static void nvme_rdma_sig_done(struct ib_cq *cq, struct ib_wc *wc)
1474
+{
1475
+ if (unlikely(wc->status != IB_WC_SUCCESS))
1476
+ nvme_rdma_wr_error(cq, wc, "SIG");
1477
+}
1478
+
1479
+static int nvme_rdma_map_sg_pi(struct nvme_rdma_queue *queue,
1480
+ struct nvme_rdma_request *req, struct nvme_command *c,
1481
+ int count, int pi_count)
1482
+{
1483
+ struct nvme_rdma_sgl *sgl = &req->data_sgl;
1484
+ struct ib_reg_wr *wr = &req->reg_wr;
1485
+ struct request *rq = blk_mq_rq_from_pdu(req);
1486
+ struct nvme_ns *ns = rq->q->queuedata;
1487
+ struct bio *bio = rq->bio;
1488
+ struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
1489
+ int nr;
1490
+
1491
+ req->mr = ib_mr_pool_get(queue->qp, &queue->qp->sig_mrs);
1492
+ if (WARN_ON_ONCE(!req->mr))
1493
+ return -EAGAIN;
1494
+
1495
+ nr = ib_map_mr_sg_pi(req->mr, sgl->sg_table.sgl, count, NULL,
1496
+ req->metadata_sgl->sg_table.sgl, pi_count, NULL,
1497
+ SZ_4K);
1498
+ if (unlikely(nr))
1499
+ goto mr_put;
1500
+
1501
+ nvme_rdma_set_sig_attrs(blk_get_integrity(bio->bi_disk), c,
1502
+ req->mr->sig_attrs, ns->pi_type);
1503
+ nvme_rdma_set_prot_checks(c, &req->mr->sig_attrs->check_mask);
1504
+
1505
+ ib_update_fast_reg_key(req->mr, ib_inc_rkey(req->mr->rkey));
1506
+
1507
+ req->reg_cqe.done = nvme_rdma_sig_done;
1508
+ memset(wr, 0, sizeof(*wr));
1509
+ wr->wr.opcode = IB_WR_REG_MR_INTEGRITY;
1510
+ wr->wr.wr_cqe = &req->reg_cqe;
1511
+ wr->wr.num_sge = 0;
1512
+ wr->wr.send_flags = 0;
1513
+ wr->mr = req->mr;
1514
+ wr->key = req->mr->rkey;
1515
+ wr->access = IB_ACCESS_LOCAL_WRITE |
1516
+ IB_ACCESS_REMOTE_READ |
1517
+ IB_ACCESS_REMOTE_WRITE;
1518
+
1519
+ sg->addr = cpu_to_le64(req->mr->iova);
1520
+ put_unaligned_le24(req->mr->length, sg->length);
1521
+ put_unaligned_le32(req->mr->rkey, sg->key);
1522
+ sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4;
1523
+
1524
+ return 0;
1525
+
1526
+mr_put:
1527
+ ib_mr_pool_put(queue->qp, &queue->qp->sig_mrs, req->mr);
1528
+ req->mr = NULL;
1529
+ if (nr < 0)
1530
+ return nr;
1531
+ return -EINVAL;
1532
+}
1533
+
12451534 static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
12461535 struct request *rq, struct nvme_command *c)
12471536 {
12481537 struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
12491538 struct nvme_rdma_device *dev = queue->device;
12501539 struct ib_device *ibdev = dev->dev;
1540
+ int pi_count = 0;
12511541 int count, ret;
12521542
12531543 req->num_sge = 1;
....@@ -1255,22 +1545,53 @@
12551545
12561546 c->common.flags |= NVME_CMD_SGL_METABUF;
12571547
1258
- if (!blk_rq_payload_bytes(rq))
1548
+ if (!blk_rq_nr_phys_segments(rq))
12591549 return nvme_rdma_set_sg_null(c);
12601550
1261
- req->sg_table.sgl = req->first_sgl;
1262
- ret = sg_alloc_table_chained(&req->sg_table,
1263
- blk_rq_nr_phys_segments(rq), req->sg_table.sgl);
1551
+ req->data_sgl.sg_table.sgl = (struct scatterlist *)(req + 1);
1552
+ ret = sg_alloc_table_chained(&req->data_sgl.sg_table,
1553
+ blk_rq_nr_phys_segments(rq), req->data_sgl.sg_table.sgl,
1554
+ NVME_INLINE_SG_CNT);
12641555 if (ret)
12651556 return -ENOMEM;
12661557
1267
- req->nents = blk_rq_map_sg(rq->q, rq, req->sg_table.sgl);
1558
+ req->data_sgl.nents = blk_rq_map_sg(rq->q, rq,
1559
+ req->data_sgl.sg_table.sgl);
12681560
1269
- count = ib_dma_map_sg(ibdev, req->sg_table.sgl, req->nents,
1270
- rq_data_dir(rq) == WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
1561
+ count = ib_dma_map_sg(ibdev, req->data_sgl.sg_table.sgl,
1562
+ req->data_sgl.nents, rq_dma_dir(rq));
12711563 if (unlikely(count <= 0)) {
12721564 ret = -EIO;
12731565 goto out_free_table;
1566
+ }
1567
+
1568
+ if (blk_integrity_rq(rq)) {
1569
+ req->metadata_sgl->sg_table.sgl =
1570
+ (struct scatterlist *)(req->metadata_sgl + 1);
1571
+ ret = sg_alloc_table_chained(&req->metadata_sgl->sg_table,
1572
+ blk_rq_count_integrity_sg(rq->q, rq->bio),
1573
+ req->metadata_sgl->sg_table.sgl,
1574
+ NVME_INLINE_METADATA_SG_CNT);
1575
+ if (unlikely(ret)) {
1576
+ ret = -ENOMEM;
1577
+ goto out_unmap_sg;
1578
+ }
1579
+
1580
+ req->metadata_sgl->nents = blk_rq_map_integrity_sg(rq->q,
1581
+ rq->bio, req->metadata_sgl->sg_table.sgl);
1582
+ pi_count = ib_dma_map_sg(ibdev,
1583
+ req->metadata_sgl->sg_table.sgl,
1584
+ req->metadata_sgl->nents,
1585
+ rq_dma_dir(rq));
1586
+ if (unlikely(pi_count <= 0)) {
1587
+ ret = -EIO;
1588
+ goto out_free_pi_table;
1589
+ }
1590
+ }
1591
+
1592
+ if (req->use_sig_mr) {
1593
+ ret = nvme_rdma_map_sg_pi(queue, req, c, count, pi_count);
1594
+ goto out;
12741595 }
12751596
12761597 if (count <= dev->num_inline_segments) {
....@@ -1291,16 +1612,23 @@
12911612 ret = nvme_rdma_map_sg_fr(queue, req, c, count);
12921613 out:
12931614 if (unlikely(ret))
1294
- goto out_unmap_sg;
1615
+ goto out_unmap_pi_sg;
12951616
12961617 return 0;
12971618
1619
+out_unmap_pi_sg:
1620
+ if (blk_integrity_rq(rq))
1621
+ ib_dma_unmap_sg(ibdev, req->metadata_sgl->sg_table.sgl,
1622
+ req->metadata_sgl->nents, rq_dma_dir(rq));
1623
+out_free_pi_table:
1624
+ if (blk_integrity_rq(rq))
1625
+ sg_free_table_chained(&req->metadata_sgl->sg_table,
1626
+ NVME_INLINE_METADATA_SG_CNT);
12981627 out_unmap_sg:
1299
- ib_dma_unmap_sg(ibdev, req->sg_table.sgl,
1300
- req->nents, rq_data_dir(rq) ==
1301
- WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
1628
+ ib_dma_unmap_sg(ibdev, req->data_sgl.sg_table.sgl, req->data_sgl.nents,
1629
+ rq_dma_dir(rq));
13021630 out_free_table:
1303
- sg_free_table_chained(&req->sg_table, true);
1631
+ sg_free_table_chained(&req->data_sgl.sg_table, NVME_INLINE_SG_CNT);
13041632 return ret;
13051633 }
13061634
....@@ -1310,15 +1638,11 @@
13101638 container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe);
13111639 struct nvme_rdma_request *req =
13121640 container_of(qe, struct nvme_rdma_request, sqe);
1313
- struct request *rq = blk_mq_rq_from_pdu(req);
13141641
1315
- if (unlikely(wc->status != IB_WC_SUCCESS)) {
1642
+ if (unlikely(wc->status != IB_WC_SUCCESS))
13161643 nvme_rdma_wr_error(cq, wc, "SEND");
1317
- return;
1318
- }
1319
-
1320
- if (refcount_dec_and_test(&req->ref))
1321
- nvme_end_request(rq, req->status, req->result);
1644
+ else
1645
+ nvme_rdma_end_request(req);
13221646 }
13231647
13241648 static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
....@@ -1329,7 +1653,7 @@
13291653 int ret;
13301654
13311655 sge->addr = qe->dma;
1332
- sge->length = sizeof(struct nvme_command),
1656
+ sge->length = sizeof(struct nvme_command);
13331657 sge->lkey = queue->device->pd->local_dma_lkey;
13341658
13351659 wr.next = NULL;
....@@ -1420,20 +1744,19 @@
14201744 WARN_ON_ONCE(ret);
14211745 }
14221746
1423
-static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
1424
- struct nvme_completion *cqe, struct ib_wc *wc, int tag)
1747
+static void nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
1748
+ struct nvme_completion *cqe, struct ib_wc *wc)
14251749 {
14261750 struct request *rq;
14271751 struct nvme_rdma_request *req;
1428
- int ret = 0;
14291752
1430
- rq = blk_mq_tag_to_rq(nvme_rdma_tagset(queue), cqe->command_id);
1753
+ rq = nvme_find_rq(nvme_rdma_tagset(queue), cqe->command_id);
14311754 if (!rq) {
14321755 dev_err(queue->ctrl->ctrl.device,
1433
- "tag 0x%x on QP %#x not found\n",
1756
+ "got bad command_id %#x on QP %#x\n",
14341757 cqe->command_id, queue->qp->qp_num);
14351758 nvme_rdma_error_recovery(queue->ctrl);
1436
- return ret;
1759
+ return;
14371760 }
14381761 req = blk_mq_rq_to_pdu(rq);
14391762
....@@ -1441,13 +1764,16 @@
14411764 req->result = cqe->result;
14421765
14431766 if (wc->wc_flags & IB_WC_WITH_INVALIDATE) {
1444
- if (unlikely(wc->ex.invalidate_rkey != req->mr->rkey)) {
1767
+ if (unlikely(!req->mr ||
1768
+ wc->ex.invalidate_rkey != req->mr->rkey)) {
14451769 dev_err(queue->ctrl->ctrl.device,
14461770 "Bogus remote invalidation for rkey %#x\n",
1447
- req->mr->rkey);
1771
+ req->mr ? req->mr->rkey : 0);
14481772 nvme_rdma_error_recovery(queue->ctrl);
14491773 }
14501774 } else if (req->mr) {
1775
+ int ret;
1776
+
14511777 ret = nvme_rdma_inv_rkey(queue, req);
14521778 if (unlikely(ret < 0)) {
14531779 dev_err(queue->ctrl->ctrl.device,
....@@ -1456,31 +1782,32 @@
14561782 nvme_rdma_error_recovery(queue->ctrl);
14571783 }
14581784 /* the local invalidation completion will end the request */
1459
- return 0;
1785
+ return;
14601786 }
14611787
1462
- if (refcount_dec_and_test(&req->ref)) {
1463
- if (rq->tag == tag)
1464
- ret = 1;
1465
- nvme_end_request(rq, req->status, req->result);
1466
- }
1467
-
1468
- return ret;
1788
+ nvme_rdma_end_request(req);
14691789 }
14701790
1471
-static int __nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc, int tag)
1791
+static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
14721792 {
14731793 struct nvme_rdma_qe *qe =
14741794 container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe);
1475
- struct nvme_rdma_queue *queue = cq->cq_context;
1795
+ struct nvme_rdma_queue *queue = wc->qp->qp_context;
14761796 struct ib_device *ibdev = queue->device->dev;
14771797 struct nvme_completion *cqe = qe->data;
14781798 const size_t len = sizeof(struct nvme_completion);
1479
- int ret = 0;
14801799
14811800 if (unlikely(wc->status != IB_WC_SUCCESS)) {
14821801 nvme_rdma_wr_error(cq, wc, "RECV");
1483
- return 0;
1802
+ return;
1803
+ }
1804
+
1805
+ /* sanity checking for received data length */
1806
+ if (unlikely(wc->byte_len < len)) {
1807
+ dev_err(queue->ctrl->ctrl.device,
1808
+ "Unexpected nvme completion length(%d)\n", wc->byte_len);
1809
+ nvme_rdma_error_recovery(queue->ctrl);
1810
+ return;
14841811 }
14851812
14861813 ib_dma_sync_single_for_cpu(ibdev, qe->dma, len, DMA_FROM_DEVICE);
....@@ -1490,21 +1817,15 @@
14901817 * aborts. We don't even bother to allocate a struct request
14911818 * for them but rather special case them here.
14921819 */
1493
- if (unlikely(nvme_rdma_queue_idx(queue) == 0 &&
1494
- cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH))
1820
+ if (unlikely(nvme_is_aen_req(nvme_rdma_queue_idx(queue),
1821
+ cqe->command_id)))
14951822 nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
14961823 &cqe->result);
14971824 else
1498
- ret = nvme_rdma_process_nvme_rsp(queue, cqe, wc, tag);
1825
+ nvme_rdma_process_nvme_rsp(queue, cqe, wc);
14991826 ib_dma_sync_single_for_device(ibdev, qe->dma, len, DMA_FROM_DEVICE);
15001827
15011828 nvme_rdma_post_recv(queue, qe);
1502
- return ret;
1503
-}
1504
-
1505
-static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
1506
-{
1507
- __nvme_rdma_recv_done(cq, wc, -1);
15081829 }
15091830
15101831 static int nvme_rdma_conn_established(struct nvme_rdma_queue *queue)
....@@ -1514,14 +1835,10 @@
15141835 for (i = 0; i < queue->queue_size; i++) {
15151836 ret = nvme_rdma_post_recv(queue, &queue->rsp_ring[i]);
15161837 if (ret)
1517
- goto out_destroy_queue_ib;
1838
+ return ret;
15181839 }
15191840
15201841 return 0;
1521
-
1522
-out_destroy_queue_ib:
1523
- nvme_rdma_destroy_queue_ib(queue);
1524
- return ret;
15251842 }
15261843
15271844 static int nvme_rdma_conn_rejected(struct nvme_rdma_queue *queue,
....@@ -1552,16 +1869,18 @@
15521869
15531870 static int nvme_rdma_addr_resolved(struct nvme_rdma_queue *queue)
15541871 {
1872
+ struct nvme_ctrl *ctrl = &queue->ctrl->ctrl;
15551873 int ret;
15561874
15571875 ret = nvme_rdma_create_queue_ib(queue);
15581876 if (ret)
15591877 return ret;
15601878
1879
+ if (ctrl->opts->tos >= 0)
1880
+ rdma_set_service_type(queue->cm_id, ctrl->opts->tos);
15611881 ret = rdma_resolve_route(queue->cm_id, NVME_RDMA_CONNECT_TIMEOUT_MS);
15621882 if (ret) {
1563
- dev_err(queue->ctrl->ctrl.device,
1564
- "rdma_resolve_route failed (%d).\n",
1883
+ dev_err(ctrl->device, "rdma_resolve_route failed (%d).\n",
15651884 queue->cm_error);
15661885 goto out_destroy_queue;
15671886 }
....@@ -1609,18 +1928,14 @@
16091928 priv.hsqsize = cpu_to_le16(queue->ctrl->ctrl.sqsize);
16101929 }
16111930
1612
- ret = rdma_connect(queue->cm_id, &param);
1931
+ ret = rdma_connect_locked(queue->cm_id, &param);
16131932 if (ret) {
16141933 dev_err(ctrl->ctrl.device,
1615
- "rdma_connect failed (%d).\n", ret);
1616
- goto out_destroy_queue_ib;
1934
+ "rdma_connect_locked failed (%d).\n", ret);
1935
+ return ret;
16171936 }
16181937
16191938 return 0;
1620
-
1621
-out_destroy_queue_ib:
1622
- nvme_rdma_destroy_queue_ib(queue);
1623
- return ret;
16241939 }
16251940
16261941 static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
....@@ -1651,8 +1966,6 @@
16511966 case RDMA_CM_EVENT_ROUTE_ERROR:
16521967 case RDMA_CM_EVENT_CONNECT_ERROR:
16531968 case RDMA_CM_EVENT_UNREACHABLE:
1654
- nvme_rdma_destroy_queue_ib(queue);
1655
- /* fall through */
16561969 case RDMA_CM_EVENT_ADDR_ERROR:
16571970 dev_dbg(queue->ctrl->ctrl.device,
16581971 "CM error event %d\n", ev->event);
....@@ -1683,6 +1996,18 @@
16831996 return 0;
16841997 }
16851998
1999
+static void nvme_rdma_complete_timed_out(struct request *rq)
2000
+{
2001
+ struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
2002
+ struct nvme_rdma_queue *queue = req->queue;
2003
+
2004
+ nvme_rdma_stop_queue(queue);
2005
+ if (blk_mq_request_started(rq) && !blk_mq_request_completed(rq)) {
2006
+ nvme_req(rq)->status = NVME_SC_HOST_ABORTED_CMD;
2007
+ blk_mq_complete_request(rq);
2008
+ }
2009
+}
2010
+
16862011 static enum blk_eh_timer_return
16872012 nvme_rdma_timeout(struct request *rq, bool reserved)
16882013 {
....@@ -1695,19 +2020,27 @@
16952020
16962021 if (ctrl->ctrl.state != NVME_CTRL_LIVE) {
16972022 /*
1698
- * Teardown immediately if controller times out while starting
1699
- * or we are already started error recovery. all outstanding
1700
- * requests are completed on shutdown, so we return BLK_EH_DONE.
2023
+ * If we are resetting, connecting or deleting we should
2024
+ * complete immediately because we may block controller
2025
+ * teardown or setup sequence
2026
+ * - ctrl disable/shutdown fabrics requests
2027
+ * - connect requests
2028
+ * - initialization admin requests
2029
+ * - I/O requests that entered after unquiescing and
2030
+ * the controller stopped responding
2031
+ *
2032
+ * All other requests should be cancelled by the error
2033
+ * recovery work, so it's fine that we fail it here.
17012034 */
1702
- flush_work(&ctrl->err_work);
1703
- nvme_rdma_teardown_io_queues(ctrl, false);
1704
- nvme_rdma_teardown_admin_queue(ctrl, false);
2035
+ nvme_rdma_complete_timed_out(rq);
17052036 return BLK_EH_DONE;
17062037 }
17072038
1708
- dev_warn(ctrl->ctrl.device, "starting error recovery\n");
2039
+ /*
2040
+ * LIVE state should trigger the normal error recovery which will
2041
+ * handle completing this request.
2042
+ */
17092043 nvme_rdma_error_recovery(ctrl);
1710
-
17112044 return BLK_EH_RESET_TIMER;
17122045 }
17132046
....@@ -1731,20 +2064,36 @@
17312064 return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq);
17322065
17332066 dev = queue->device->dev;
2067
+
2068
+ req->sqe.dma = ib_dma_map_single(dev, req->sqe.data,
2069
+ sizeof(struct nvme_command),
2070
+ DMA_TO_DEVICE);
2071
+ err = ib_dma_mapping_error(dev, req->sqe.dma);
2072
+ if (unlikely(err))
2073
+ return BLK_STS_RESOURCE;
2074
+
17342075 ib_dma_sync_single_for_cpu(dev, sqe->dma,
17352076 sizeof(struct nvme_command), DMA_TO_DEVICE);
17362077
17372078 ret = nvme_setup_cmd(ns, rq, c);
17382079 if (ret)
1739
- return ret;
2080
+ goto unmap_qe;
17402081
17412082 blk_mq_start_request(rq);
2083
+
2084
+ if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) &&
2085
+ queue->pi_support &&
2086
+ (c->common.opcode == nvme_cmd_write ||
2087
+ c->common.opcode == nvme_cmd_read) &&
2088
+ nvme_ns_has_pi(ns))
2089
+ req->use_sig_mr = true;
2090
+ else
2091
+ req->use_sig_mr = false;
17422092
17432093 err = nvme_rdma_map_data(queue, rq, c);
17442094 if (unlikely(err < 0)) {
17452095 dev_err(queue->ctrl->ctrl.device,
17462096 "Failed to map data (%d)\n", err);
1747
- nvme_cleanup_cmd(rq);
17482097 goto err;
17492098 }
17502099
....@@ -1755,52 +2104,123 @@
17552104
17562105 err = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge,
17572106 req->mr ? &req->reg_wr.wr : NULL);
1758
- if (unlikely(err)) {
1759
- nvme_rdma_unmap_data(queue, rq);
1760
- goto err;
1761
- }
2107
+ if (unlikely(err))
2108
+ goto err_unmap;
17622109
17632110 return BLK_STS_OK;
2111
+
2112
+err_unmap:
2113
+ nvme_rdma_unmap_data(queue, rq);
17642114 err:
17652115 if (err == -ENOMEM || err == -EAGAIN)
1766
- return BLK_STS_RESOURCE;
1767
- return BLK_STS_IOERR;
2116
+ ret = BLK_STS_RESOURCE;
2117
+ else
2118
+ ret = BLK_STS_IOERR;
2119
+ nvme_cleanup_cmd(rq);
2120
+unmap_qe:
2121
+ ib_dma_unmap_single(dev, req->sqe.dma, sizeof(struct nvme_command),
2122
+ DMA_TO_DEVICE);
2123
+ return ret;
17682124 }
17692125
1770
-static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
2126
+static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx)
17712127 {
17722128 struct nvme_rdma_queue *queue = hctx->driver_data;
1773
- struct ib_cq *cq = queue->ib_cq;
1774
- struct ib_wc wc;
1775
- int found = 0;
17762129
1777
- while (ib_poll_cq(cq, 1, &wc) > 0) {
1778
- struct ib_cqe *cqe = wc.wr_cqe;
2130
+ return ib_process_cq_direct(queue->ib_cq, -1);
2131
+}
17792132
1780
- if (cqe) {
1781
- if (cqe->done == nvme_rdma_recv_done)
1782
- found |= __nvme_rdma_recv_done(cq, &wc, tag);
1783
- else
1784
- cqe->done(cq, &wc);
1785
- }
2133
+static void nvme_rdma_check_pi_status(struct nvme_rdma_request *req)
2134
+{
2135
+ struct request *rq = blk_mq_rq_from_pdu(req);
2136
+ struct ib_mr_status mr_status;
2137
+ int ret;
2138
+
2139
+ ret = ib_check_mr_status(req->mr, IB_MR_CHECK_SIG_STATUS, &mr_status);
2140
+ if (ret) {
2141
+ pr_err("ib_check_mr_status failed, ret %d\n", ret);
2142
+ nvme_req(rq)->status = NVME_SC_INVALID_PI;
2143
+ return;
17862144 }
17872145
1788
- return found;
2146
+ if (mr_status.fail_status & IB_MR_CHECK_SIG_STATUS) {
2147
+ switch (mr_status.sig_err.err_type) {
2148
+ case IB_SIG_BAD_GUARD:
2149
+ nvme_req(rq)->status = NVME_SC_GUARD_CHECK;
2150
+ break;
2151
+ case IB_SIG_BAD_REFTAG:
2152
+ nvme_req(rq)->status = NVME_SC_REFTAG_CHECK;
2153
+ break;
2154
+ case IB_SIG_BAD_APPTAG:
2155
+ nvme_req(rq)->status = NVME_SC_APPTAG_CHECK;
2156
+ break;
2157
+ }
2158
+ pr_err("PI error found type %d expected 0x%x vs actual 0x%x\n",
2159
+ mr_status.sig_err.err_type, mr_status.sig_err.expected,
2160
+ mr_status.sig_err.actual);
2161
+ }
17892162 }
17902163
17912164 static void nvme_rdma_complete_rq(struct request *rq)
17922165 {
17932166 struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
2167
+ struct nvme_rdma_queue *queue = req->queue;
2168
+ struct ib_device *ibdev = queue->device->dev;
17942169
1795
- nvme_rdma_unmap_data(req->queue, rq);
2170
+ if (req->use_sig_mr)
2171
+ nvme_rdma_check_pi_status(req);
2172
+
2173
+ nvme_rdma_unmap_data(queue, rq);
2174
+ ib_dma_unmap_single(ibdev, req->sqe.dma, sizeof(struct nvme_command),
2175
+ DMA_TO_DEVICE);
17962176 nvme_complete_rq(rq);
17972177 }
17982178
17992179 static int nvme_rdma_map_queues(struct blk_mq_tag_set *set)
18002180 {
18012181 struct nvme_rdma_ctrl *ctrl = set->driver_data;
2182
+ struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
18022183
1803
- return blk_mq_rdma_map_queues(set, ctrl->device->dev, 0);
2184
+ if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) {
2185
+ /* separate read/write queues */
2186
+ set->map[HCTX_TYPE_DEFAULT].nr_queues =
2187
+ ctrl->io_queues[HCTX_TYPE_DEFAULT];
2188
+ set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
2189
+ set->map[HCTX_TYPE_READ].nr_queues =
2190
+ ctrl->io_queues[HCTX_TYPE_READ];
2191
+ set->map[HCTX_TYPE_READ].queue_offset =
2192
+ ctrl->io_queues[HCTX_TYPE_DEFAULT];
2193
+ } else {
2194
+ /* shared read/write queues */
2195
+ set->map[HCTX_TYPE_DEFAULT].nr_queues =
2196
+ ctrl->io_queues[HCTX_TYPE_DEFAULT];
2197
+ set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
2198
+ set->map[HCTX_TYPE_READ].nr_queues =
2199
+ ctrl->io_queues[HCTX_TYPE_DEFAULT];
2200
+ set->map[HCTX_TYPE_READ].queue_offset = 0;
2201
+ }
2202
+ blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_DEFAULT],
2203
+ ctrl->device->dev, 0);
2204
+ blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_READ],
2205
+ ctrl->device->dev, 0);
2206
+
2207
+ if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) {
2208
+ /* map dedicated poll queues only if we have queues left */
2209
+ set->map[HCTX_TYPE_POLL].nr_queues =
2210
+ ctrl->io_queues[HCTX_TYPE_POLL];
2211
+ set->map[HCTX_TYPE_POLL].queue_offset =
2212
+ ctrl->io_queues[HCTX_TYPE_DEFAULT] +
2213
+ ctrl->io_queues[HCTX_TYPE_READ];
2214
+ blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]);
2215
+ }
2216
+
2217
+ dev_info(ctrl->ctrl.device,
2218
+ "mapped %d/%d/%d default/read/poll queues.\n",
2219
+ ctrl->io_queues[HCTX_TYPE_DEFAULT],
2220
+ ctrl->io_queues[HCTX_TYPE_READ],
2221
+ ctrl->io_queues[HCTX_TYPE_POLL]);
2222
+
2223
+ return 0;
18042224 }
18052225
18062226 static const struct blk_mq_ops nvme_rdma_mq_ops = {
....@@ -1809,9 +2229,9 @@
18092229 .init_request = nvme_rdma_init_request,
18102230 .exit_request = nvme_rdma_exit_request,
18112231 .init_hctx = nvme_rdma_init_hctx,
1812
- .poll = nvme_rdma_poll,
18132232 .timeout = nvme_rdma_timeout,
18142233 .map_queues = nvme_rdma_map_queues,
2234
+ .poll = nvme_rdma_poll,
18152235 };
18162236
18172237 static const struct blk_mq_ops nvme_rdma_admin_mq_ops = {
....@@ -1826,10 +2246,11 @@
18262246 static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
18272247 {
18282248 nvme_rdma_teardown_io_queues(ctrl, shutdown);
2249
+ blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
18292250 if (shutdown)
18302251 nvme_shutdown_ctrl(&ctrl->ctrl);
18312252 else
1832
- nvme_disable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap);
2253
+ nvme_disable_ctrl(&ctrl->ctrl);
18332254 nvme_rdma_teardown_admin_queue(ctrl, shutdown);
18342255 }
18352256
....@@ -1865,7 +2286,7 @@
18652286 static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = {
18662287 .name = "rdma",
18672288 .module = THIS_MODULE,
1868
- .flags = NVME_F_FABRICS,
2289
+ .flags = NVME_F_FABRICS | NVME_F_METADATA_SUPPORTED,
18692290 .reg_read32 = nvmf_reg_read32,
18702291 .reg_read64 = nvmf_reg_read64,
18712292 .reg_write32 = nvmf_reg_write32,
....@@ -1875,54 +2296,6 @@
18752296 .get_address = nvmf_get_address,
18762297 .stop_ctrl = nvme_rdma_stop_ctrl,
18772298 };
1878
-
1879
-static inline bool
1880
-__nvme_rdma_options_match(struct nvme_rdma_ctrl *ctrl,
1881
- struct nvmf_ctrl_options *opts)
1882
-{
1883
- char *stdport = __stringify(NVME_RDMA_IP_PORT);
1884
-
1885
-
1886
- if (!nvmf_ctlr_matches_baseopts(&ctrl->ctrl, opts) ||
1887
- strcmp(opts->traddr, ctrl->ctrl.opts->traddr))
1888
- return false;
1889
-
1890
- if (opts->mask & NVMF_OPT_TRSVCID &&
1891
- ctrl->ctrl.opts->mask & NVMF_OPT_TRSVCID) {
1892
- if (strcmp(opts->trsvcid, ctrl->ctrl.opts->trsvcid))
1893
- return false;
1894
- } else if (opts->mask & NVMF_OPT_TRSVCID) {
1895
- if (strcmp(opts->trsvcid, stdport))
1896
- return false;
1897
- } else if (ctrl->ctrl.opts->mask & NVMF_OPT_TRSVCID) {
1898
- if (strcmp(stdport, ctrl->ctrl.opts->trsvcid))
1899
- return false;
1900
- }
1901
- /* else, it's a match as both have stdport. Fall to next checks */
1902
-
1903
- /*
1904
- * checking the local address is rough. In most cases, one
1905
- * is not specified and the host port is selected by the stack.
1906
- *
1907
- * Assume no match if:
1908
- * local address is specified and address is not the same
1909
- * local address is not specified but remote is, or vice versa
1910
- * (admin using specific host_traddr when it matters).
1911
- */
1912
- if (opts->mask & NVMF_OPT_HOST_TRADDR &&
1913
- ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR) {
1914
- if (strcmp(opts->host_traddr, ctrl->ctrl.opts->host_traddr))
1915
- return false;
1916
- } else if (opts->mask & NVMF_OPT_HOST_TRADDR ||
1917
- ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR)
1918
- return false;
1919
- /*
1920
- * if neither controller had an host port specified, assume it's
1921
- * a match as everything else matched.
1922
- */
1923
-
1924
- return true;
1925
-}
19262299
19272300 /*
19282301 * Fails a connection request if it matches an existing controller
....@@ -1944,7 +2317,7 @@
19442317
19452318 mutex_lock(&nvme_rdma_ctrl_mutex);
19462319 list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list) {
1947
- found = __nvme_rdma_options_match(ctrl, opts);
2320
+ found = nvmf_ip_options_match(&ctrl->ctrl, opts);
19482321 if (found)
19492322 break;
19502323 }
....@@ -1959,24 +2332,28 @@
19592332 struct nvme_rdma_ctrl *ctrl;
19602333 int ret;
19612334 bool changed;
1962
- char *port;
19632335
19642336 ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
19652337 if (!ctrl)
19662338 return ERR_PTR(-ENOMEM);
19672339 ctrl->ctrl.opts = opts;
19682340 INIT_LIST_HEAD(&ctrl->list);
1969
- mutex_init(&ctrl->teardown_lock);
19702341
1971
- if (opts->mask & NVMF_OPT_TRSVCID)
1972
- port = opts->trsvcid;
1973
- else
1974
- port = __stringify(NVME_RDMA_IP_PORT);
2342
+ if (!(opts->mask & NVMF_OPT_TRSVCID)) {
2343
+ opts->trsvcid =
2344
+ kstrdup(__stringify(NVME_RDMA_IP_PORT), GFP_KERNEL);
2345
+ if (!opts->trsvcid) {
2346
+ ret = -ENOMEM;
2347
+ goto out_free_ctrl;
2348
+ }
2349
+ opts->mask |= NVMF_OPT_TRSVCID;
2350
+ }
19752351
19762352 ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
1977
- opts->traddr, port, &ctrl->addr);
2353
+ opts->traddr, opts->trsvcid, &ctrl->addr);
19782354 if (ret) {
1979
- pr_err("malformed address passed: %s:%s\n", opts->traddr, port);
2355
+ pr_err("malformed address passed: %s:%s\n",
2356
+ opts->traddr, opts->trsvcid);
19802357 goto out_free_ctrl;
19812358 }
19822359
....@@ -2000,7 +2377,8 @@
20002377 INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work);
20012378 INIT_WORK(&ctrl->ctrl.reset_work, nvme_rdma_reset_ctrl_work);
20022379
2003
- ctrl->ctrl.queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */
2380
+ ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues +
2381
+ opts->nr_poll_queues + 1;
20042382 ctrl->ctrl.sqsize = opts->queue_size - 1;
20052383 ctrl->ctrl.kato = opts->kato;
20062384
....@@ -2024,8 +2402,6 @@
20242402
20252403 dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs\n",
20262404 ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
2027
-
2028
- nvme_get_ctrl(&ctrl->ctrl);
20292405
20302406 mutex_lock(&nvme_rdma_ctrl_mutex);
20312407 list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list);
....@@ -2051,7 +2427,9 @@
20512427 .module = THIS_MODULE,
20522428 .required_opts = NVMF_OPT_TRADDR,
20532429 .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
2054
- NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO,
2430
+ NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
2431
+ NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
2432
+ NVMF_OPT_TOS,
20552433 .create_ctrl = nvme_rdma_create_ctrl,
20562434 };
20572435
....@@ -2111,8 +2489,16 @@
21112489
21122490 static void __exit nvme_rdma_cleanup_module(void)
21132491 {
2492
+ struct nvme_rdma_ctrl *ctrl;
2493
+
21142494 nvmf_unregister_transport(&nvme_rdma_transport);
21152495 ib_unregister_client(&nvme_rdma_ib_client);
2496
+
2497
+ mutex_lock(&nvme_rdma_ctrl_mutex);
2498
+ list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list)
2499
+ nvme_delete_ctrl(&ctrl->ctrl);
2500
+ mutex_unlock(&nvme_rdma_ctrl_mutex);
2501
+ flush_workqueue(nvme_delete_wq);
21162502 }
21172503
21182504 module_init(nvme_rdma_init_module);