forked from ~ljy/RK356X_SDK_RELEASE

hc
2023-12-11 1f93a7dfd1f8d5ff7a5c53246c7534fe2332d6f4
kernel/drivers/nvme/host/rdma.c
....@@ -1,15 +1,7 @@
1
+// SPDX-License-Identifier: GPL-2.0
12 /*
23 * NVMe over Fabrics RDMA host code.
34 * Copyright (c) 2015-2016 HGST, a Western Digital Company.
4
- *
5
- * This program is free software; you can redistribute it and/or modify it
6
- * under the terms and conditions of the GNU General Public License,
7
- * version 2, as published by the Free Software Foundation.
8
- *
9
- * This program is distributed in the hope it will be useful, but WITHOUT
10
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12
- * more details.
135 */
146 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
157 #include <linux/module.h>
....@@ -42,6 +34,11 @@
4234
4335 #define NVME_RDMA_MAX_INLINE_SEGMENTS 4
4436
37
+#define NVME_RDMA_DATA_SGL_SIZE \
38
+ (sizeof(struct scatterlist) * NVME_INLINE_SG_CNT)
39
+#define NVME_RDMA_METADATA_SGL_SIZE \
40
+ (sizeof(struct scatterlist) * NVME_INLINE_METADATA_SG_CNT)
41
+
4542 struct nvme_rdma_device {
4643 struct ib_device *dev;
4744 struct ib_pd *pd;
....@@ -56,6 +53,11 @@
5653 u64 dma;
5754 };
5855
56
+struct nvme_rdma_sgl {
57
+ int nents;
58
+ struct sg_table sg_table;
59
+};
60
+
5961 struct nvme_rdma_queue;
6062 struct nvme_rdma_request {
6163 struct nvme_request req;
....@@ -66,12 +68,12 @@
6668 refcount_t ref;
6769 struct ib_sge sge[1 + NVME_RDMA_MAX_INLINE_SEGMENTS];
6870 u32 num_sge;
69
- int nents;
7071 struct ib_reg_wr reg_wr;
7172 struct ib_cqe reg_cqe;
7273 struct nvme_rdma_queue *queue;
73
- struct sg_table sg_table;
74
- struct scatterlist first_sgl[];
74
+ struct nvme_rdma_sgl data_sgl;
75
+ struct nvme_rdma_sgl *metadata_sgl;
76
+ bool use_sig_mr;
7577 };
7678
7779 enum nvme_rdma_queue_flags {
....@@ -93,6 +95,9 @@
9395 struct rdma_cm_id *cm_id;
9496 int cm_error;
9597 struct completion cm_done;
98
+ bool pi_support;
99
+ int cq_size;
100
+ struct mutex queue_lock;
96101 };
97102
98103 struct nvme_rdma_ctrl {
....@@ -118,8 +123,8 @@
118123 struct sockaddr_storage src_addr;
119124
120125 struct nvme_ctrl ctrl;
121
- struct mutex teardown_lock;
122126 bool use_inline_data;
127
+ u32 io_queues[HCTX_MAX_TYPES];
123128 };
124129
125130 static inline struct nvme_rdma_ctrl *to_rdma_ctrl(struct nvme_ctrl *ctrl)
....@@ -146,21 +151,21 @@
146151 static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
147152 struct rdma_cm_event *event);
148153 static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc);
154
+static void nvme_rdma_complete_rq(struct request *rq);
149155
150156 static const struct blk_mq_ops nvme_rdma_mq_ops;
151157 static const struct blk_mq_ops nvme_rdma_admin_mq_ops;
152158
153
-/* XXX: really should move to a generic header sooner or later.. */
154
-static inline void put_unaligned_le24(u32 val, u8 *p)
155
-{
156
- *p++ = val;
157
- *p++ = val >> 8;
158
- *p++ = val >> 16;
159
-}
160
-
161159 static inline int nvme_rdma_queue_idx(struct nvme_rdma_queue *queue)
162160 {
163161 return queue - queue->ctrl->queues;
162
+}
163
+
164
+static bool nvme_rdma_poll_queue(struct nvme_rdma_queue *queue)
165
+{
166
+ return nvme_rdma_queue_idx(queue) >
167
+ queue->ctrl->io_queues[HCTX_TYPE_DEFAULT] +
168
+ queue->ctrl->io_queues[HCTX_TYPE_READ];
164169 }
165170
166171 static inline size_t nvme_rdma_inline_data_size(struct nvme_rdma_queue *queue)
....@@ -214,6 +219,11 @@
214219 if (!ring)
215220 return NULL;
216221
222
+ /*
223
+ * Bind the CQEs (post recv buffers) DMA mapping to the RDMA queue
224
+ * lifetime. It's safe, since any change in the underlying RDMA device
225
+ * will issue error recovery and queue re-creation.
226
+ */
217227 for (i = 0; i < ib_queue_size; i++) {
218228 if (nvme_rdma_alloc_qe(ibdev, &ring[i], capsule_size, dir))
219229 goto out_free_ring;
....@@ -235,8 +245,15 @@
235245
236246 static int nvme_rdma_wait_for_cm(struct nvme_rdma_queue *queue)
237247 {
238
- wait_for_completion_interruptible_timeout(&queue->cm_done,
248
+ int ret;
249
+
250
+ ret = wait_for_completion_interruptible_timeout(&queue->cm_done,
239251 msecs_to_jiffies(NVME_RDMA_CONNECT_TIMEOUT_MS) + 1);
252
+ if (ret < 0)
253
+ return ret;
254
+ if (ret == 0)
255
+ return -ETIMEDOUT;
256
+ WARN_ON_ONCE(queue->cm_error > 0);
240257 return queue->cm_error;
241258 }
242259
....@@ -258,6 +275,9 @@
258275 init_attr.qp_type = IB_QPT_RC;
259276 init_attr.send_cq = queue->ib_cq;
260277 init_attr.recv_cq = queue->ib_cq;
278
+ if (queue->pi_support)
279
+ init_attr.create_flags |= IB_QP_CREATE_INTEGRITY_EN;
280
+ init_attr.qp_context = queue;
261281
262282 ret = rdma_create_qp(queue->cm_id, dev->pd, &init_attr);
263283
....@@ -268,14 +288,9 @@
268288 static void nvme_rdma_exit_request(struct blk_mq_tag_set *set,
269289 struct request *rq, unsigned int hctx_idx)
270290 {
271
- struct nvme_rdma_ctrl *ctrl = set->driver_data;
272291 struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
273
- int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
274
- struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx];
275
- struct nvme_rdma_device *dev = queue->device;
276292
277
- nvme_rdma_free_qe(dev->dev, &req->sqe, sizeof(struct nvme_command),
278
- DMA_TO_DEVICE);
293
+ kfree(req->sqe.data);
279294 }
280295
281296 static int nvme_rdma_init_request(struct blk_mq_tag_set *set,
....@@ -286,15 +301,17 @@
286301 struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
287302 int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
288303 struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx];
289
- struct nvme_rdma_device *dev = queue->device;
290
- struct ib_device *ibdev = dev->dev;
291
- int ret;
292304
293305 nvme_req(rq)->ctrl = &ctrl->ctrl;
294
- ret = nvme_rdma_alloc_qe(ibdev, &req->sqe, sizeof(struct nvme_command),
295
- DMA_TO_DEVICE);
296
- if (ret)
297
- return ret;
306
+ req->sqe.data = kzalloc(sizeof(struct nvme_command), GFP_KERNEL);
307
+ if (!req->sqe.data)
308
+ return -ENOMEM;
309
+
310
+ /* metadata nvme_rdma_sgl struct is located after command's data SGL */
311
+ if (queue->pi_support)
312
+ req->metadata_sgl = (void *)nvme_req(rq) +
313
+ sizeof(struct nvme_rdma_request) +
314
+ NVME_RDMA_DATA_SGL_SIZE;
298315
299316 req->queue = queue;
300317
....@@ -395,6 +412,14 @@
395412 return NULL;
396413 }
397414
415
+static void nvme_rdma_free_cq(struct nvme_rdma_queue *queue)
416
+{
417
+ if (nvme_rdma_poll_queue(queue))
418
+ ib_free_cq(queue->ib_cq);
419
+ else
420
+ ib_cq_pool_put(queue->ib_cq, queue->cq_size);
421
+}
422
+
398423 static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue)
399424 {
400425 struct nvme_rdma_device *dev;
....@@ -406,6 +431,8 @@
406431 dev = queue->device;
407432 ibdev = dev->dev;
408433
434
+ if (queue->pi_support)
435
+ ib_mr_pool_destroy(queue->qp, &queue->qp->sig_mrs);
409436 ib_mr_pool_destroy(queue->qp, &queue->qp->rdma_mrs);
410437
411438 /*
....@@ -414,7 +441,7 @@
414441 * the destruction of the QP shouldn't use rdma_cm API.
415442 */
416443 ib_destroy_qp(queue->qp);
417
- ib_free_cq(queue->ib_cq);
444
+ nvme_rdma_free_cq(queue);
418445
419446 nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size,
420447 sizeof(struct nvme_completion), DMA_FROM_DEVICE);
....@@ -422,10 +449,47 @@
422449 nvme_rdma_dev_put(dev);
423450 }
424451
425
-static int nvme_rdma_get_max_fr_pages(struct ib_device *ibdev)
452
+static int nvme_rdma_get_max_fr_pages(struct ib_device *ibdev, bool pi_support)
426453 {
427
- return min_t(u32, NVME_RDMA_MAX_SEGMENTS,
428
- ibdev->attrs.max_fast_reg_page_list_len);
454
+ u32 max_page_list_len;
455
+
456
+ if (pi_support)
457
+ max_page_list_len = ibdev->attrs.max_pi_fast_reg_page_list_len;
458
+ else
459
+ max_page_list_len = ibdev->attrs.max_fast_reg_page_list_len;
460
+
461
+ return min_t(u32, NVME_RDMA_MAX_SEGMENTS, max_page_list_len - 1);
462
+}
463
+
464
+static int nvme_rdma_create_cq(struct ib_device *ibdev,
465
+ struct nvme_rdma_queue *queue)
466
+{
467
+ int ret, comp_vector, idx = nvme_rdma_queue_idx(queue);
468
+ enum ib_poll_context poll_ctx;
469
+
470
+ /*
471
+ * Spread I/O queues completion vectors according to their queue index.
472
+ * Admin queues can always go on completion vector 0.
473
+ */
474
+ comp_vector = (idx == 0 ? idx : idx - 1) % ibdev->num_comp_vectors;
475
+
476
+ /* Polling queues need direct cq polling context */
477
+ if (nvme_rdma_poll_queue(queue)) {
478
+ poll_ctx = IB_POLL_DIRECT;
479
+ queue->ib_cq = ib_alloc_cq(ibdev, queue, queue->cq_size,
480
+ comp_vector, poll_ctx);
481
+ } else {
482
+ poll_ctx = IB_POLL_SOFTIRQ;
483
+ queue->ib_cq = ib_cq_pool_get(ibdev, queue->cq_size,
484
+ comp_vector, poll_ctx);
485
+ }
486
+
487
+ if (IS_ERR(queue->ib_cq)) {
488
+ ret = PTR_ERR(queue->ib_cq);
489
+ return ret;
490
+ }
491
+
492
+ return 0;
429493 }
430494
431495 static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
....@@ -433,8 +497,7 @@
433497 struct ib_device *ibdev;
434498 const int send_wr_factor = 3; /* MR, SEND, INV */
435499 const int cq_factor = send_wr_factor + 1; /* + RECV */
436
- int comp_vector, idx = nvme_rdma_queue_idx(queue);
437
- int ret;
500
+ int ret, pages_per_mr;
438501
439502 queue->device = nvme_rdma_find_get_device(queue->cm_id);
440503 if (!queue->device) {
....@@ -444,20 +507,12 @@
444507 }
445508 ibdev = queue->device->dev;
446509
447
- /*
448
- * Spread I/O queues completion vectors according their queue index.
449
- * Admin queues can always go on completion vector 0.
450
- */
451
- comp_vector = (idx == 0 ? idx : idx - 1) % ibdev->num_comp_vectors;
452
-
453510 /* +1 for ib_stop_cq */
454
- queue->ib_cq = ib_alloc_cq(ibdev, queue,
455
- cq_factor * queue->queue_size + 1,
456
- comp_vector, IB_POLL_SOFTIRQ);
457
- if (IS_ERR(queue->ib_cq)) {
458
- ret = PTR_ERR(queue->ib_cq);
511
+ queue->cq_size = cq_factor * queue->queue_size + 1;
512
+
513
+ ret = nvme_rdma_create_cq(ibdev, queue);
514
+ if (ret)
459515 goto out_put_dev;
460
- }
461516
462517 ret = nvme_rdma_create_qp(queue, send_wr_factor);
463518 if (ret)
....@@ -470,28 +525,48 @@
470525 goto out_destroy_qp;
471526 }
472527
528
+ /*
529
+ * Currently we don't use SG_GAPS MR's so if the first entry is
530
+ * misaligned we'll end up using two entries for a single data page,
531
+ * so one additional entry is required.
532
+ */
533
+ pages_per_mr = nvme_rdma_get_max_fr_pages(ibdev, queue->pi_support) + 1;
473534 ret = ib_mr_pool_init(queue->qp, &queue->qp->rdma_mrs,
474535 queue->queue_size,
475536 IB_MR_TYPE_MEM_REG,
476
- nvme_rdma_get_max_fr_pages(ibdev));
537
+ pages_per_mr, 0);
477538 if (ret) {
478539 dev_err(queue->ctrl->ctrl.device,
479540 "failed to initialize MR pool sized %d for QID %d\n",
480
- queue->queue_size, idx);
541
+ queue->queue_size, nvme_rdma_queue_idx(queue));
481542 goto out_destroy_ring;
543
+ }
544
+
545
+ if (queue->pi_support) {
546
+ ret = ib_mr_pool_init(queue->qp, &queue->qp->sig_mrs,
547
+ queue->queue_size, IB_MR_TYPE_INTEGRITY,
548
+ pages_per_mr, pages_per_mr);
549
+ if (ret) {
550
+ dev_err(queue->ctrl->ctrl.device,
551
+ "failed to initialize PI MR pool sized %d for QID %d\n",
552
+ queue->queue_size, nvme_rdma_queue_idx(queue));
553
+ goto out_destroy_mr_pool;
554
+ }
482555 }
483556
484557 set_bit(NVME_RDMA_Q_TR_READY, &queue->flags);
485558
486559 return 0;
487560
561
+out_destroy_mr_pool:
562
+ ib_mr_pool_destroy(queue->qp, &queue->qp->rdma_mrs);
488563 out_destroy_ring:
489564 nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size,
490565 sizeof(struct nvme_completion), DMA_FROM_DEVICE);
491566 out_destroy_qp:
492567 rdma_destroy_qp(queue->cm_id);
493568 out_destroy_ib_cq:
494
- ib_free_cq(queue->ib_cq);
569
+ nvme_rdma_free_cq(queue);
495570 out_put_dev:
496571 nvme_rdma_dev_put(queue->device);
497572 return ret;
....@@ -505,7 +580,12 @@
505580 int ret;
506581
507582 queue = &ctrl->queues[idx];
583
+ mutex_init(&queue->queue_lock);
508584 queue->ctrl = ctrl;
585
+ if (idx && ctrl->ctrl.max_integrity_segments)
586
+ queue->pi_support = true;
587
+ else
588
+ queue->pi_support = false;
509589 init_completion(&queue->cm_done);
510590
511591 if (idx > 0)
....@@ -520,7 +600,8 @@
520600 if (IS_ERR(queue->cm_id)) {
521601 dev_info(ctrl->ctrl.device,
522602 "failed to create CM ID: %ld\n", PTR_ERR(queue->cm_id));
523
- return PTR_ERR(queue->cm_id);
603
+ ret = PTR_ERR(queue->cm_id);
604
+ goto out_destroy_mutex;
524605 }
525606
526607 if (ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR)
....@@ -550,16 +631,23 @@
550631 out_destroy_cm_id:
551632 rdma_destroy_id(queue->cm_id);
552633 nvme_rdma_destroy_queue_ib(queue);
634
+out_destroy_mutex:
635
+ mutex_destroy(&queue->queue_lock);
553636 return ret;
637
+}
638
+
639
+static void __nvme_rdma_stop_queue(struct nvme_rdma_queue *queue)
640
+{
641
+ rdma_disconnect(queue->cm_id);
642
+ ib_drain_qp(queue->qp);
554643 }
555644
556645 static void nvme_rdma_stop_queue(struct nvme_rdma_queue *queue)
557646 {
558
- if (!test_and_clear_bit(NVME_RDMA_Q_LIVE, &queue->flags))
559
- return;
560
-
561
- rdma_disconnect(queue->cm_id);
562
- ib_drain_qp(queue->qp);
647
+ mutex_lock(&queue->queue_lock);
648
+ if (test_and_clear_bit(NVME_RDMA_Q_LIVE, &queue->flags))
649
+ __nvme_rdma_stop_queue(queue);
650
+ mutex_unlock(&queue->queue_lock);
563651 }
564652
565653 static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue)
....@@ -567,8 +655,9 @@
567655 if (!test_and_clear_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags))
568656 return;
569657
570
- nvme_rdma_destroy_queue_ib(queue);
571658 rdma_destroy_id(queue->cm_id);
659
+ nvme_rdma_destroy_queue_ib(queue);
660
+ mutex_destroy(&queue->queue_lock);
572661 }
573662
574663 static void nvme_rdma_free_io_queues(struct nvme_rdma_ctrl *ctrl)
....@@ -589,18 +678,23 @@
589678
590679 static int nvme_rdma_start_queue(struct nvme_rdma_ctrl *ctrl, int idx)
591680 {
681
+ struct nvme_rdma_queue *queue = &ctrl->queues[idx];
682
+ bool poll = nvme_rdma_poll_queue(queue);
592683 int ret;
593684
594685 if (idx)
595
- ret = nvmf_connect_io_queue(&ctrl->ctrl, idx);
686
+ ret = nvmf_connect_io_queue(&ctrl->ctrl, idx, poll);
596687 else
597688 ret = nvmf_connect_admin_queue(&ctrl->ctrl);
598689
599
- if (!ret)
600
- set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[idx].flags);
601
- else
690
+ if (!ret) {
691
+ set_bit(NVME_RDMA_Q_LIVE, &queue->flags);
692
+ } else {
693
+ if (test_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags))
694
+ __nvme_rdma_stop_queue(queue);
602695 dev_info(ctrl->ctrl.device,
603696 "failed to connect queue: %d ret=%d\n", idx, ret);
697
+ }
604698 return ret;
605699 }
606700
....@@ -626,18 +720,16 @@
626720 {
627721 struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
628722 struct ib_device *ibdev = ctrl->device->dev;
629
- unsigned int nr_io_queues;
723
+ unsigned int nr_io_queues, nr_default_queues;
724
+ unsigned int nr_read_queues, nr_poll_queues;
630725 int i, ret;
631726
632
- nr_io_queues = min(opts->nr_io_queues, num_online_cpus());
633
-
634
- /*
635
- * we map queues according to the device irq vectors for
636
- * optimal locality so we don't need more queues than
637
- * completion vectors.
638
- */
639
- nr_io_queues = min_t(unsigned int, nr_io_queues,
640
- ibdev->num_comp_vectors);
727
+ nr_read_queues = min_t(unsigned int, ibdev->num_comp_vectors,
728
+ min(opts->nr_io_queues, num_online_cpus()));
729
+ nr_default_queues = min_t(unsigned int, ibdev->num_comp_vectors,
730
+ min(opts->nr_write_queues, num_online_cpus()));
731
+ nr_poll_queues = min(opts->nr_poll_queues, num_online_cpus());
732
+ nr_io_queues = nr_read_queues + nr_default_queues + nr_poll_queues;
641733
642734 ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues);
643735 if (ret)
....@@ -652,6 +744,34 @@
652744 ctrl->ctrl.queue_count = nr_io_queues + 1;
653745 dev_info(ctrl->ctrl.device,
654746 "creating %d I/O queues.\n", nr_io_queues);
747
+
748
+ if (opts->nr_write_queues && nr_read_queues < nr_io_queues) {
749
+ /*
750
+ * separate read/write queues
751
+ * hand out dedicated default queues only after we have
752
+ * sufficient read queues.
753
+ */
754
+ ctrl->io_queues[HCTX_TYPE_READ] = nr_read_queues;
755
+ nr_io_queues -= ctrl->io_queues[HCTX_TYPE_READ];
756
+ ctrl->io_queues[HCTX_TYPE_DEFAULT] =
757
+ min(nr_default_queues, nr_io_queues);
758
+ nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
759
+ } else {
760
+ /*
761
+ * shared read/write queues
762
+ * either no write queues were requested, or we don't have
763
+ * sufficient queue count to have dedicated default queues.
764
+ */
765
+ ctrl->io_queues[HCTX_TYPE_DEFAULT] =
766
+ min(nr_read_queues, nr_io_queues);
767
+ nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
768
+ }
769
+
770
+ if (opts->nr_poll_queues && nr_io_queues) {
771
+ /* map dedicated poll queues only if we have queues left */
772
+ ctrl->io_queues[HCTX_TYPE_POLL] =
773
+ min(nr_poll_queues, nr_io_queues);
774
+ }
655775
656776 for (i = 1; i < ctrl->ctrl.queue_count; i++) {
657777 ret = nvme_rdma_alloc_queue(ctrl, i,
....@@ -669,15 +789,6 @@
669789 return ret;
670790 }
671791
672
-static void nvme_rdma_free_tagset(struct nvme_ctrl *nctrl,
673
- struct blk_mq_tag_set *set)
674
-{
675
- struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
676
-
677
- blk_mq_free_tag_set(set);
678
- nvme_rdma_dev_put(ctrl->device);
679
-}
680
-
681792 static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
682793 bool admin)
683794 {
....@@ -691,9 +802,9 @@
691802 set->ops = &nvme_rdma_admin_mq_ops;
692803 set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
693804 set->reserved_tags = 2; /* connect + keep-alive */
694
- set->numa_node = NUMA_NO_NODE;
805
+ set->numa_node = nctrl->numa_node;
695806 set->cmd_size = sizeof(struct nvme_rdma_request) +
696
- SG_CHUNK_SIZE * sizeof(struct scatterlist);
807
+ NVME_RDMA_DATA_SGL_SIZE;
697808 set->driver_data = ctrl;
698809 set->nr_hw_queues = 1;
699810 set->timeout = ADMIN_TIMEOUT;
....@@ -704,35 +815,24 @@
704815 set->ops = &nvme_rdma_mq_ops;
705816 set->queue_depth = nctrl->sqsize + 1;
706817 set->reserved_tags = 1; /* fabric connect */
707
- set->numa_node = NUMA_NO_NODE;
818
+ set->numa_node = nctrl->numa_node;
708819 set->flags = BLK_MQ_F_SHOULD_MERGE;
709820 set->cmd_size = sizeof(struct nvme_rdma_request) +
710
- SG_CHUNK_SIZE * sizeof(struct scatterlist);
821
+ NVME_RDMA_DATA_SGL_SIZE;
822
+ if (nctrl->max_integrity_segments)
823
+ set->cmd_size += sizeof(struct nvme_rdma_sgl) +
824
+ NVME_RDMA_METADATA_SGL_SIZE;
711825 set->driver_data = ctrl;
712826 set->nr_hw_queues = nctrl->queue_count - 1;
713827 set->timeout = NVME_IO_TIMEOUT;
828
+ set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
714829 }
715830
716831 ret = blk_mq_alloc_tag_set(set);
717832 if (ret)
718
- goto out;
719
-
720
- /*
721
- * We need a reference on the device as long as the tag_set is alive,
722
- * as the MRs in the request structures need a valid ib_device.
723
- */
724
- ret = nvme_rdma_dev_get(ctrl->device);
725
- if (!ret) {
726
- ret = -EINVAL;
727
- goto out_free_tagset;
728
- }
833
+ return ERR_PTR(ret);
729834
730835 return set;
731
-
732
-out_free_tagset:
733
- blk_mq_free_tag_set(set);
734
-out:
735
- return ERR_PTR(ret);
736836 }
737837
738838 static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl,
....@@ -740,7 +840,8 @@
740840 {
741841 if (remove) {
742842 blk_cleanup_queue(ctrl->ctrl.admin_q);
743
- nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset);
843
+ blk_cleanup_queue(ctrl->ctrl.fabrics_q);
844
+ blk_mq_free_tag_set(ctrl->ctrl.admin_tagset);
744845 }
745846 if (ctrl->async_event_sqe.data) {
746847 cancel_work_sync(&ctrl->ctrl.async_event_work);
....@@ -754,6 +855,7 @@
754855 static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
755856 bool new)
756857 {
858
+ bool pi_capable = false;
757859 int error;
758860
759861 error = nvme_rdma_alloc_queue(ctrl, 0, NVME_AQ_DEPTH);
....@@ -761,9 +863,21 @@
761863 return error;
762864
763865 ctrl->device = ctrl->queues[0].device;
866
+ ctrl->ctrl.numa_node = ibdev_to_node(ctrl->device->dev);
764867
765
- ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev);
868
+ /* T10-PI support */
869
+ if (ctrl->device->dev->attrs.device_cap_flags &
870
+ IB_DEVICE_INTEGRITY_HANDOVER)
871
+ pi_capable = true;
766872
873
+ ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev,
874
+ pi_capable);
875
+
876
+ /*
877
+ * Bind the async event SQE DMA mapping to the admin queue lifetime.
878
+ * It's safe, since any change in the underlying RDMA device will issue
879
+ * error recovery and queue re-creation.
880
+ */
767881 error = nvme_rdma_alloc_qe(ctrl->device->dev, &ctrl->async_event_sqe,
768882 sizeof(struct nvme_command), DMA_TO_DEVICE);
769883 if (error)
....@@ -776,10 +890,16 @@
776890 goto out_free_async_qe;
777891 }
778892
893
+ ctrl->ctrl.fabrics_q = blk_mq_init_queue(&ctrl->admin_tag_set);
894
+ if (IS_ERR(ctrl->ctrl.fabrics_q)) {
895
+ error = PTR_ERR(ctrl->ctrl.fabrics_q);
896
+ goto out_free_tagset;
897
+ }
898
+
779899 ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set);
780900 if (IS_ERR(ctrl->ctrl.admin_q)) {
781901 error = PTR_ERR(ctrl->ctrl.admin_q);
782
- goto out_free_tagset;
902
+ goto out_cleanup_fabrics_q;
783903 }
784904 }
785905
....@@ -787,38 +907,40 @@
787907 if (error)
788908 goto out_cleanup_queue;
789909
790
- error = ctrl->ctrl.ops->reg_read64(&ctrl->ctrl, NVME_REG_CAP,
791
- &ctrl->ctrl.cap);
792
- if (error) {
793
- dev_err(ctrl->ctrl.device,
794
- "prop_get NVME_REG_CAP failed\n");
795
- goto out_stop_queue;
796
- }
797
-
798
- ctrl->ctrl.sqsize =
799
- min_t(int, NVME_CAP_MQES(ctrl->ctrl.cap), ctrl->ctrl.sqsize);
800
-
801
- error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap);
910
+ error = nvme_enable_ctrl(&ctrl->ctrl);
802911 if (error)
803912 goto out_stop_queue;
804913
805
- ctrl->ctrl.max_hw_sectors =
806
- (ctrl->max_fr_pages - 1) << (ilog2(SZ_4K) - 9);
914
+ ctrl->ctrl.max_segments = ctrl->max_fr_pages;
915
+ ctrl->ctrl.max_hw_sectors = ctrl->max_fr_pages << (ilog2(SZ_4K) - 9);
916
+ if (pi_capable)
917
+ ctrl->ctrl.max_integrity_segments = ctrl->max_fr_pages;
918
+ else
919
+ ctrl->ctrl.max_integrity_segments = 0;
920
+
921
+ blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
807922
808923 error = nvme_init_identify(&ctrl->ctrl);
809924 if (error)
810
- goto out_stop_queue;
925
+ goto out_quiesce_queue;
811926
812927 return 0;
813928
929
+out_quiesce_queue:
930
+ blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
931
+ blk_sync_queue(ctrl->ctrl.admin_q);
814932 out_stop_queue:
815933 nvme_rdma_stop_queue(&ctrl->queues[0]);
934
+ nvme_cancel_admin_tagset(&ctrl->ctrl);
816935 out_cleanup_queue:
817936 if (new)
818937 blk_cleanup_queue(ctrl->ctrl.admin_q);
938
+out_cleanup_fabrics_q:
939
+ if (new)
940
+ blk_cleanup_queue(ctrl->ctrl.fabrics_q);
819941 out_free_tagset:
820942 if (new)
821
- nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset);
943
+ blk_mq_free_tag_set(ctrl->ctrl.admin_tagset);
822944 out_free_async_qe:
823945 if (ctrl->async_event_sqe.data) {
824946 nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe,
....@@ -835,7 +957,7 @@
835957 {
836958 if (remove) {
837959 blk_cleanup_queue(ctrl->ctrl.connect_q);
838
- nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.tagset);
960
+ blk_mq_free_tag_set(ctrl->ctrl.tagset);
839961 }
840962 nvme_rdma_free_io_queues(ctrl);
841963 }
....@@ -860,23 +982,41 @@
860982 ret = PTR_ERR(ctrl->ctrl.connect_q);
861983 goto out_free_tag_set;
862984 }
863
- } else {
864
- blk_mq_update_nr_hw_queues(&ctrl->tag_set,
865
- ctrl->ctrl.queue_count - 1);
866985 }
867986
868987 ret = nvme_rdma_start_io_queues(ctrl);
869988 if (ret)
870989 goto out_cleanup_connect_q;
871990
991
+ if (!new) {
992
+ nvme_start_queues(&ctrl->ctrl);
993
+ if (!nvme_wait_freeze_timeout(&ctrl->ctrl, NVME_IO_TIMEOUT)) {
994
+ /*
995
+ * If we timed out waiting for freeze we are likely to
996
+ * be stuck. Fail the controller initialization just
997
+ * to be safe.
998
+ */
999
+ ret = -ENODEV;
1000
+ goto out_wait_freeze_timed_out;
1001
+ }
1002
+ blk_mq_update_nr_hw_queues(ctrl->ctrl.tagset,
1003
+ ctrl->ctrl.queue_count - 1);
1004
+ nvme_unfreeze(&ctrl->ctrl);
1005
+ }
1006
+
8721007 return 0;
8731008
1009
+out_wait_freeze_timed_out:
1010
+ nvme_stop_queues(&ctrl->ctrl);
1011
+ nvme_sync_io_queues(&ctrl->ctrl);
1012
+ nvme_rdma_stop_io_queues(ctrl);
8741013 out_cleanup_connect_q:
1014
+ nvme_cancel_tagset(&ctrl->ctrl);
8751015 if (new)
8761016 blk_cleanup_queue(ctrl->ctrl.connect_q);
8771017 out_free_tag_set:
8781018 if (new)
879
- nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.tagset);
1019
+ blk_mq_free_tag_set(ctrl->ctrl.tagset);
8801020 out_free_io_queues:
8811021 nvme_rdma_free_io_queues(ctrl);
8821022 return ret;
....@@ -885,32 +1025,36 @@
8851025 static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl,
8861026 bool remove)
8871027 {
888
- mutex_lock(&ctrl->teardown_lock);
8891028 blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
1029
+ blk_sync_queue(ctrl->ctrl.admin_q);
8901030 nvme_rdma_stop_queue(&ctrl->queues[0]);
891
- if (ctrl->ctrl.admin_tagset)
1031
+ if (ctrl->ctrl.admin_tagset) {
8921032 blk_mq_tagset_busy_iter(ctrl->ctrl.admin_tagset,
8931033 nvme_cancel_request, &ctrl->ctrl);
894
- blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
1034
+ blk_mq_tagset_wait_completed_request(ctrl->ctrl.admin_tagset);
1035
+ }
1036
+ if (remove)
1037
+ blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
8951038 nvme_rdma_destroy_admin_queue(ctrl, remove);
896
- mutex_unlock(&ctrl->teardown_lock);
8971039 }
8981040
8991041 static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
9001042 bool remove)
9011043 {
902
- mutex_lock(&ctrl->teardown_lock);
9031044 if (ctrl->ctrl.queue_count > 1) {
1045
+ nvme_start_freeze(&ctrl->ctrl);
9041046 nvme_stop_queues(&ctrl->ctrl);
1047
+ nvme_sync_io_queues(&ctrl->ctrl);
9051048 nvme_rdma_stop_io_queues(ctrl);
906
- if (ctrl->ctrl.tagset)
1049
+ if (ctrl->ctrl.tagset) {
9071050 blk_mq_tagset_busy_iter(ctrl->ctrl.tagset,
9081051 nvme_cancel_request, &ctrl->ctrl);
1052
+ blk_mq_tagset_wait_completed_request(ctrl->ctrl.tagset);
1053
+ }
9091054 if (remove)
9101055 nvme_start_queues(&ctrl->ctrl);
9111056 nvme_rdma_destroy_io_queues(ctrl, remove);
9121057 }
913
- mutex_unlock(&ctrl->teardown_lock);
9141058 }
9151059
9161060 static void nvme_rdma_stop_ctrl(struct nvme_ctrl *nctrl)
....@@ -1003,8 +1147,14 @@
10031147
10041148 changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
10051149 if (!changed) {
1006
- /* state change failure is ok if we're in DELETING state */
1007
- WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING);
1150
+ /*
1151
+ * state change failure is ok if we started ctrl delete,
1152
+ * unless we're during creation of a new controller to
1153
+ * avoid races with teardown flow.
1154
+ */
1155
+ WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING &&
1156
+ ctrl->ctrl.state != NVME_CTRL_DELETING_NOIO);
1157
+ WARN_ON_ONCE(new);
10081158 ret = -EINVAL;
10091159 goto destroy_io;
10101160 }
....@@ -1013,10 +1163,18 @@
10131163 return 0;
10141164
10151165 destroy_io:
1016
- if (ctrl->ctrl.queue_count > 1)
1166
+ if (ctrl->ctrl.queue_count > 1) {
1167
+ nvme_stop_queues(&ctrl->ctrl);
1168
+ nvme_sync_io_queues(&ctrl->ctrl);
1169
+ nvme_rdma_stop_io_queues(ctrl);
1170
+ nvme_cancel_tagset(&ctrl->ctrl);
10171171 nvme_rdma_destroy_io_queues(ctrl, new);
1172
+ }
10181173 destroy_admin:
1174
+ blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
1175
+ blk_sync_queue(ctrl->ctrl.admin_q);
10191176 nvme_rdma_stop_queue(&ctrl->queues[0]);
1177
+ nvme_cancel_admin_tagset(&ctrl->ctrl);
10201178 nvme_rdma_destroy_admin_queue(ctrl, new);
10211179 return ret;
10221180 }
....@@ -1054,10 +1212,12 @@
10541212 nvme_rdma_teardown_io_queues(ctrl, false);
10551213 nvme_start_queues(&ctrl->ctrl);
10561214 nvme_rdma_teardown_admin_queue(ctrl, false);
1215
+ blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
10571216
10581217 if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
1059
- /* state change failure is ok if we're in DELETING state */
1060
- WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING);
1218
+ /* state change failure is ok if we started ctrl delete */
1219
+ WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING &&
1220
+ ctrl->ctrl.state != NVME_CTRL_DELETING_NOIO);
10611221 return;
10621222 }
10631223
....@@ -1069,13 +1229,24 @@
10691229 if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING))
10701230 return;
10711231
1072
- queue_work(nvme_wq, &ctrl->err_work);
1232
+ dev_warn(ctrl->ctrl.device, "starting error recovery\n");
1233
+ queue_work(nvme_reset_wq, &ctrl->err_work);
1234
+}
1235
+
1236
+static void nvme_rdma_end_request(struct nvme_rdma_request *req)
1237
+{
1238
+ struct request *rq = blk_mq_rq_from_pdu(req);
1239
+
1240
+ if (!refcount_dec_and_test(&req->ref))
1241
+ return;
1242
+ if (!nvme_try_complete_req(rq, req->status, req->result))
1243
+ nvme_rdma_complete_rq(rq);
10731244 }
10741245
10751246 static void nvme_rdma_wr_error(struct ib_cq *cq, struct ib_wc *wc,
10761247 const char *op)
10771248 {
1078
- struct nvme_rdma_queue *queue = cq->cq_context;
1249
+ struct nvme_rdma_queue *queue = wc->qp->qp_context;
10791250 struct nvme_rdma_ctrl *ctrl = queue->ctrl;
10801251
10811252 if (ctrl->ctrl.state == NVME_CTRL_LIVE)
....@@ -1096,16 +1267,11 @@
10961267 {
10971268 struct nvme_rdma_request *req =
10981269 container_of(wc->wr_cqe, struct nvme_rdma_request, reg_cqe);
1099
- struct request *rq = blk_mq_rq_from_pdu(req);
11001270
1101
- if (unlikely(wc->status != IB_WC_SUCCESS)) {
1271
+ if (unlikely(wc->status != IB_WC_SUCCESS))
11021272 nvme_rdma_wr_error(cq, wc, "LOCAL_INV");
1103
- return;
1104
- }
1105
-
1106
- if (refcount_dec_and_test(&req->ref))
1107
- nvme_end_request(rq, req->status, req->result);
1108
-
1273
+ else
1274
+ nvme_rdma_end_request(req);
11091275 }
11101276
11111277 static int nvme_rdma_inv_rkey(struct nvme_rdma_queue *queue,
....@@ -1131,21 +1297,29 @@
11311297 struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
11321298 struct nvme_rdma_device *dev = queue->device;
11331299 struct ib_device *ibdev = dev->dev;
1300
+ struct list_head *pool = &queue->qp->rdma_mrs;
11341301
1135
- if (!blk_rq_payload_bytes(rq))
1302
+ if (!blk_rq_nr_phys_segments(rq))
11361303 return;
11371304
1305
+ if (blk_integrity_rq(rq)) {
1306
+ ib_dma_unmap_sg(ibdev, req->metadata_sgl->sg_table.sgl,
1307
+ req->metadata_sgl->nents, rq_dma_dir(rq));
1308
+ sg_free_table_chained(&req->metadata_sgl->sg_table,
1309
+ NVME_INLINE_METADATA_SG_CNT);
1310
+ }
1311
+
1312
+ if (req->use_sig_mr)
1313
+ pool = &queue->qp->sig_mrs;
1314
+
11381315 if (req->mr) {
1139
- ib_mr_pool_put(queue->qp, &queue->qp->rdma_mrs, req->mr);
1316
+ ib_mr_pool_put(queue->qp, pool, req->mr);
11401317 req->mr = NULL;
11411318 }
11421319
1143
- ib_dma_unmap_sg(ibdev, req->sg_table.sgl,
1144
- req->nents, rq_data_dir(rq) ==
1145
- WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
1146
-
1147
- nvme_cleanup_cmd(rq);
1148
- sg_free_table_chained(&req->sg_table, true);
1320
+ ib_dma_unmap_sg(ibdev, req->data_sgl.sg_table.sgl, req->data_sgl.nents,
1321
+ rq_dma_dir(rq));
1322
+ sg_free_table_chained(&req->data_sgl.sg_table, NVME_INLINE_SG_CNT);
11491323 }
11501324
11511325 static int nvme_rdma_set_sg_null(struct nvme_command *c)
....@@ -1164,16 +1338,17 @@
11641338 int count)
11651339 {
11661340 struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
1167
- struct scatterlist *sgl = req->sg_table.sgl;
11681341 struct ib_sge *sge = &req->sge[1];
1342
+ struct scatterlist *sgl;
11691343 u32 len = 0;
11701344 int i;
11711345
1172
- for (i = 0; i < count; i++, sgl++, sge++) {
1346
+ for_each_sg(req->data_sgl.sg_table.sgl, sgl, count, i) {
11731347 sge->addr = sg_dma_address(sgl);
11741348 sge->length = sg_dma_len(sgl);
11751349 sge->lkey = queue->device->pd->local_dma_lkey;
11761350 len += sge->length;
1351
+ sge++;
11771352 }
11781353
11791354 sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);
....@@ -1189,8 +1364,8 @@
11891364 {
11901365 struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
11911366
1192
- sg->addr = cpu_to_le64(sg_dma_address(req->sg_table.sgl));
1193
- put_unaligned_le24(sg_dma_len(req->sg_table.sgl), sg->length);
1367
+ sg->addr = cpu_to_le64(sg_dma_address(req->data_sgl.sg_table.sgl));
1368
+ put_unaligned_le24(sg_dma_len(req->data_sgl.sg_table.sgl), sg->length);
11941369 put_unaligned_le32(queue->device->pd->unsafe_global_rkey, sg->key);
11951370 sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4;
11961371 return 0;
....@@ -1211,7 +1386,8 @@
12111386 * Align the MR to a 4K page size to match the ctrl page size and
12121387 * the block virtual boundary.
12131388 */
1214
- nr = ib_map_mr_sg(req->mr, req->sg_table.sgl, count, NULL, SZ_4K);
1389
+ nr = ib_map_mr_sg(req->mr, req->data_sgl.sg_table.sgl, count, NULL,
1390
+ SZ_4K);
12151391 if (unlikely(nr < count)) {
12161392 ib_mr_pool_put(queue->qp, &queue->qp->rdma_mrs, req->mr);
12171393 req->mr = NULL;
....@@ -1242,12 +1418,125 @@
12421418 return 0;
12431419 }
12441420
1421
+static void nvme_rdma_set_sig_domain(struct blk_integrity *bi,
1422
+ struct nvme_command *cmd, struct ib_sig_domain *domain,
1423
+ u16 control, u8 pi_type)
1424
+{
1425
+ domain->sig_type = IB_SIG_TYPE_T10_DIF;
1426
+ domain->sig.dif.bg_type = IB_T10DIF_CRC;
1427
+ domain->sig.dif.pi_interval = 1 << bi->interval_exp;
1428
+ domain->sig.dif.ref_tag = le32_to_cpu(cmd->rw.reftag);
1429
+ if (control & NVME_RW_PRINFO_PRCHK_REF)
1430
+ domain->sig.dif.ref_remap = true;
1431
+
1432
+ domain->sig.dif.app_tag = le16_to_cpu(cmd->rw.apptag);
1433
+ domain->sig.dif.apptag_check_mask = le16_to_cpu(cmd->rw.appmask);
1434
+ domain->sig.dif.app_escape = true;
1435
+ if (pi_type == NVME_NS_DPS_PI_TYPE3)
1436
+ domain->sig.dif.ref_escape = true;
1437
+}
1438
+
1439
+static void nvme_rdma_set_sig_attrs(struct blk_integrity *bi,
1440
+ struct nvme_command *cmd, struct ib_sig_attrs *sig_attrs,
1441
+ u8 pi_type)
1442
+{
1443
+ u16 control = le16_to_cpu(cmd->rw.control);
1444
+
1445
+ memset(sig_attrs, 0, sizeof(*sig_attrs));
1446
+ if (control & NVME_RW_PRINFO_PRACT) {
1447
+ /* for WRITE_INSERT/READ_STRIP no memory domain */
1448
+ sig_attrs->mem.sig_type = IB_SIG_TYPE_NONE;
1449
+ nvme_rdma_set_sig_domain(bi, cmd, &sig_attrs->wire, control,
1450
+ pi_type);
1451
+ /* Clear the PRACT bit since HCA will generate/verify the PI */
1452
+ control &= ~NVME_RW_PRINFO_PRACT;
1453
+ cmd->rw.control = cpu_to_le16(control);
1454
+ } else {
1455
+ /* for WRITE_PASS/READ_PASS both wire/memory domains exist */
1456
+ nvme_rdma_set_sig_domain(bi, cmd, &sig_attrs->wire, control,
1457
+ pi_type);
1458
+ nvme_rdma_set_sig_domain(bi, cmd, &sig_attrs->mem, control,
1459
+ pi_type);
1460
+ }
1461
+}
1462
+
1463
+static void nvme_rdma_set_prot_checks(struct nvme_command *cmd, u8 *mask)
1464
+{
1465
+ *mask = 0;
1466
+ if (le16_to_cpu(cmd->rw.control) & NVME_RW_PRINFO_PRCHK_REF)
1467
+ *mask |= IB_SIG_CHECK_REFTAG;
1468
+ if (le16_to_cpu(cmd->rw.control) & NVME_RW_PRINFO_PRCHK_GUARD)
1469
+ *mask |= IB_SIG_CHECK_GUARD;
1470
+}
1471
+
1472
+static void nvme_rdma_sig_done(struct ib_cq *cq, struct ib_wc *wc)
1473
+{
1474
+ if (unlikely(wc->status != IB_WC_SUCCESS))
1475
+ nvme_rdma_wr_error(cq, wc, "SIG");
1476
+}
1477
+
1478
+static int nvme_rdma_map_sg_pi(struct nvme_rdma_queue *queue,
1479
+ struct nvme_rdma_request *req, struct nvme_command *c,
1480
+ int count, int pi_count)
1481
+{
1482
+ struct nvme_rdma_sgl *sgl = &req->data_sgl;
1483
+ struct ib_reg_wr *wr = &req->reg_wr;
1484
+ struct request *rq = blk_mq_rq_from_pdu(req);
1485
+ struct nvme_ns *ns = rq->q->queuedata;
1486
+ struct bio *bio = rq->bio;
1487
+ struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
1488
+ int nr;
1489
+
1490
+ req->mr = ib_mr_pool_get(queue->qp, &queue->qp->sig_mrs);
1491
+ if (WARN_ON_ONCE(!req->mr))
1492
+ return -EAGAIN;
1493
+
1494
+ nr = ib_map_mr_sg_pi(req->mr, sgl->sg_table.sgl, count, NULL,
1495
+ req->metadata_sgl->sg_table.sgl, pi_count, NULL,
1496
+ SZ_4K);
1497
+ if (unlikely(nr))
1498
+ goto mr_put;
1499
+
1500
+ nvme_rdma_set_sig_attrs(blk_get_integrity(bio->bi_disk), c,
1501
+ req->mr->sig_attrs, ns->pi_type);
1502
+ nvme_rdma_set_prot_checks(c, &req->mr->sig_attrs->check_mask);
1503
+
1504
+ ib_update_fast_reg_key(req->mr, ib_inc_rkey(req->mr->rkey));
1505
+
1506
+ req->reg_cqe.done = nvme_rdma_sig_done;
1507
+ memset(wr, 0, sizeof(*wr));
1508
+ wr->wr.opcode = IB_WR_REG_MR_INTEGRITY;
1509
+ wr->wr.wr_cqe = &req->reg_cqe;
1510
+ wr->wr.num_sge = 0;
1511
+ wr->wr.send_flags = 0;
1512
+ wr->mr = req->mr;
1513
+ wr->key = req->mr->rkey;
1514
+ wr->access = IB_ACCESS_LOCAL_WRITE |
1515
+ IB_ACCESS_REMOTE_READ |
1516
+ IB_ACCESS_REMOTE_WRITE;
1517
+
1518
+ sg->addr = cpu_to_le64(req->mr->iova);
1519
+ put_unaligned_le24(req->mr->length, sg->length);
1520
+ put_unaligned_le32(req->mr->rkey, sg->key);
1521
+ sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4;
1522
+
1523
+ return 0;
1524
+
1525
+mr_put:
1526
+ ib_mr_pool_put(queue->qp, &queue->qp->sig_mrs, req->mr);
1527
+ req->mr = NULL;
1528
+ if (nr < 0)
1529
+ return nr;
1530
+ return -EINVAL;
1531
+}
1532
+
12451533 static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
12461534 struct request *rq, struct nvme_command *c)
12471535 {
12481536 struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
12491537 struct nvme_rdma_device *dev = queue->device;
12501538 struct ib_device *ibdev = dev->dev;
1539
+ int pi_count = 0;
12511540 int count, ret;
12521541
12531542 req->num_sge = 1;
....@@ -1255,22 +1544,53 @@
12551544
12561545 c->common.flags |= NVME_CMD_SGL_METABUF;
12571546
1258
- if (!blk_rq_payload_bytes(rq))
1547
+ if (!blk_rq_nr_phys_segments(rq))
12591548 return nvme_rdma_set_sg_null(c);
12601549
1261
- req->sg_table.sgl = req->first_sgl;
1262
- ret = sg_alloc_table_chained(&req->sg_table,
1263
- blk_rq_nr_phys_segments(rq), req->sg_table.sgl);
1550
+ req->data_sgl.sg_table.sgl = (struct scatterlist *)(req + 1);
1551
+ ret = sg_alloc_table_chained(&req->data_sgl.sg_table,
1552
+ blk_rq_nr_phys_segments(rq), req->data_sgl.sg_table.sgl,
1553
+ NVME_INLINE_SG_CNT);
12641554 if (ret)
12651555 return -ENOMEM;
12661556
1267
- req->nents = blk_rq_map_sg(rq->q, rq, req->sg_table.sgl);
1557
+ req->data_sgl.nents = blk_rq_map_sg(rq->q, rq,
1558
+ req->data_sgl.sg_table.sgl);
12681559
1269
- count = ib_dma_map_sg(ibdev, req->sg_table.sgl, req->nents,
1270
- rq_data_dir(rq) == WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
1560
+ count = ib_dma_map_sg(ibdev, req->data_sgl.sg_table.sgl,
1561
+ req->data_sgl.nents, rq_dma_dir(rq));
12711562 if (unlikely(count <= 0)) {
12721563 ret = -EIO;
12731564 goto out_free_table;
1565
+ }
1566
+
1567
+ if (blk_integrity_rq(rq)) {
1568
+ req->metadata_sgl->sg_table.sgl =
1569
+ (struct scatterlist *)(req->metadata_sgl + 1);
1570
+ ret = sg_alloc_table_chained(&req->metadata_sgl->sg_table,
1571
+ blk_rq_count_integrity_sg(rq->q, rq->bio),
1572
+ req->metadata_sgl->sg_table.sgl,
1573
+ NVME_INLINE_METADATA_SG_CNT);
1574
+ if (unlikely(ret)) {
1575
+ ret = -ENOMEM;
1576
+ goto out_unmap_sg;
1577
+ }
1578
+
1579
+ req->metadata_sgl->nents = blk_rq_map_integrity_sg(rq->q,
1580
+ rq->bio, req->metadata_sgl->sg_table.sgl);
1581
+ pi_count = ib_dma_map_sg(ibdev,
1582
+ req->metadata_sgl->sg_table.sgl,
1583
+ req->metadata_sgl->nents,
1584
+ rq_dma_dir(rq));
1585
+ if (unlikely(pi_count <= 0)) {
1586
+ ret = -EIO;
1587
+ goto out_free_pi_table;
1588
+ }
1589
+ }
1590
+
1591
+ if (req->use_sig_mr) {
1592
+ ret = nvme_rdma_map_sg_pi(queue, req, c, count, pi_count);
1593
+ goto out;
12741594 }
12751595
12761596 if (count <= dev->num_inline_segments) {
....@@ -1291,16 +1611,23 @@
12911611 ret = nvme_rdma_map_sg_fr(queue, req, c, count);
12921612 out:
12931613 if (unlikely(ret))
1294
- goto out_unmap_sg;
1614
+ goto out_unmap_pi_sg;
12951615
12961616 return 0;
12971617
1618
+out_unmap_pi_sg:
1619
+ if (blk_integrity_rq(rq))
1620
+ ib_dma_unmap_sg(ibdev, req->metadata_sgl->sg_table.sgl,
1621
+ req->metadata_sgl->nents, rq_dma_dir(rq));
1622
+out_free_pi_table:
1623
+ if (blk_integrity_rq(rq))
1624
+ sg_free_table_chained(&req->metadata_sgl->sg_table,
1625
+ NVME_INLINE_METADATA_SG_CNT);
12981626 out_unmap_sg:
1299
- ib_dma_unmap_sg(ibdev, req->sg_table.sgl,
1300
- req->nents, rq_data_dir(rq) ==
1301
- WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
1627
+ ib_dma_unmap_sg(ibdev, req->data_sgl.sg_table.sgl, req->data_sgl.nents,
1628
+ rq_dma_dir(rq));
13021629 out_free_table:
1303
- sg_free_table_chained(&req->sg_table, true);
1630
+ sg_free_table_chained(&req->data_sgl.sg_table, NVME_INLINE_SG_CNT);
13041631 return ret;
13051632 }
13061633
....@@ -1310,15 +1637,11 @@
13101637 container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe);
13111638 struct nvme_rdma_request *req =
13121639 container_of(qe, struct nvme_rdma_request, sqe);
1313
- struct request *rq = blk_mq_rq_from_pdu(req);
13141640
1315
- if (unlikely(wc->status != IB_WC_SUCCESS)) {
1641
+ if (unlikely(wc->status != IB_WC_SUCCESS))
13161642 nvme_rdma_wr_error(cq, wc, "SEND");
1317
- return;
1318
- }
1319
-
1320
- if (refcount_dec_and_test(&req->ref))
1321
- nvme_end_request(rq, req->status, req->result);
1643
+ else
1644
+ nvme_rdma_end_request(req);
13221645 }
13231646
13241647 static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
....@@ -1329,7 +1652,7 @@
13291652 int ret;
13301653
13311654 sge->addr = qe->dma;
1332
- sge->length = sizeof(struct nvme_command),
1655
+ sge->length = sizeof(struct nvme_command);
13331656 sge->lkey = queue->device->pd->local_dma_lkey;
13341657
13351658 wr.next = NULL;
....@@ -1420,20 +1743,19 @@
14201743 WARN_ON_ONCE(ret);
14211744 }
14221745
1423
-static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
1424
- struct nvme_completion *cqe, struct ib_wc *wc, int tag)
1746
+static void nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
1747
+ struct nvme_completion *cqe, struct ib_wc *wc)
14251748 {
14261749 struct request *rq;
14271750 struct nvme_rdma_request *req;
1428
- int ret = 0;
14291751
1430
- rq = blk_mq_tag_to_rq(nvme_rdma_tagset(queue), cqe->command_id);
1752
+ rq = nvme_find_rq(nvme_rdma_tagset(queue), cqe->command_id);
14311753 if (!rq) {
14321754 dev_err(queue->ctrl->ctrl.device,
1433
- "tag 0x%x on QP %#x not found\n",
1755
+ "got bad command_id %#x on QP %#x\n",
14341756 cqe->command_id, queue->qp->qp_num);
14351757 nvme_rdma_error_recovery(queue->ctrl);
1436
- return ret;
1758
+ return;
14371759 }
14381760 req = blk_mq_rq_to_pdu(rq);
14391761
....@@ -1441,13 +1763,16 @@
14411763 req->result = cqe->result;
14421764
14431765 if (wc->wc_flags & IB_WC_WITH_INVALIDATE) {
1444
- if (unlikely(wc->ex.invalidate_rkey != req->mr->rkey)) {
1766
+ if (unlikely(!req->mr ||
1767
+ wc->ex.invalidate_rkey != req->mr->rkey)) {
14451768 dev_err(queue->ctrl->ctrl.device,
14461769 "Bogus remote invalidation for rkey %#x\n",
1447
- req->mr->rkey);
1770
+ req->mr ? req->mr->rkey : 0);
14481771 nvme_rdma_error_recovery(queue->ctrl);
14491772 }
14501773 } else if (req->mr) {
1774
+ int ret;
1775
+
14511776 ret = nvme_rdma_inv_rkey(queue, req);
14521777 if (unlikely(ret < 0)) {
14531778 dev_err(queue->ctrl->ctrl.device,
....@@ -1456,31 +1781,32 @@
14561781 nvme_rdma_error_recovery(queue->ctrl);
14571782 }
14581783 /* the local invalidation completion will end the request */
1459
- return 0;
1784
+ return;
14601785 }
14611786
1462
- if (refcount_dec_and_test(&req->ref)) {
1463
- if (rq->tag == tag)
1464
- ret = 1;
1465
- nvme_end_request(rq, req->status, req->result);
1466
- }
1467
-
1468
- return ret;
1787
+ nvme_rdma_end_request(req);
14691788 }
14701789
1471
-static int __nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc, int tag)
1790
+static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
14721791 {
14731792 struct nvme_rdma_qe *qe =
14741793 container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe);
1475
- struct nvme_rdma_queue *queue = cq->cq_context;
1794
+ struct nvme_rdma_queue *queue = wc->qp->qp_context;
14761795 struct ib_device *ibdev = queue->device->dev;
14771796 struct nvme_completion *cqe = qe->data;
14781797 const size_t len = sizeof(struct nvme_completion);
1479
- int ret = 0;
14801798
14811799 if (unlikely(wc->status != IB_WC_SUCCESS)) {
14821800 nvme_rdma_wr_error(cq, wc, "RECV");
1483
- return 0;
1801
+ return;
1802
+ }
1803
+
1804
+ /* sanity checking for received data length */
1805
+ if (unlikely(wc->byte_len < len)) {
1806
+ dev_err(queue->ctrl->ctrl.device,
1807
+ "Unexpected nvme completion length(%d)\n", wc->byte_len);
1808
+ nvme_rdma_error_recovery(queue->ctrl);
1809
+ return;
14841810 }
14851811
14861812 ib_dma_sync_single_for_cpu(ibdev, qe->dma, len, DMA_FROM_DEVICE);
....@@ -1490,21 +1816,15 @@
14901816 * aborts. We don't even bother to allocate a struct request
14911817 * for them but rather special case them here.
14921818 */
1493
- if (unlikely(nvme_rdma_queue_idx(queue) == 0 &&
1494
- cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH))
1819
+ if (unlikely(nvme_is_aen_req(nvme_rdma_queue_idx(queue),
1820
+ cqe->command_id)))
14951821 nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
14961822 &cqe->result);
14971823 else
1498
- ret = nvme_rdma_process_nvme_rsp(queue, cqe, wc, tag);
1824
+ nvme_rdma_process_nvme_rsp(queue, cqe, wc);
14991825 ib_dma_sync_single_for_device(ibdev, qe->dma, len, DMA_FROM_DEVICE);
15001826
15011827 nvme_rdma_post_recv(queue, qe);
1502
- return ret;
1503
-}
1504
-
1505
-static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
1506
-{
1507
- __nvme_rdma_recv_done(cq, wc, -1);
15081828 }
15091829
15101830 static int nvme_rdma_conn_established(struct nvme_rdma_queue *queue)
....@@ -1514,14 +1834,10 @@
15141834 for (i = 0; i < queue->queue_size; i++) {
15151835 ret = nvme_rdma_post_recv(queue, &queue->rsp_ring[i]);
15161836 if (ret)
1517
- goto out_destroy_queue_ib;
1837
+ return ret;
15181838 }
15191839
15201840 return 0;
1521
-
1522
-out_destroy_queue_ib:
1523
- nvme_rdma_destroy_queue_ib(queue);
1524
- return ret;
15251841 }
15261842
15271843 static int nvme_rdma_conn_rejected(struct nvme_rdma_queue *queue,
....@@ -1552,16 +1868,18 @@
15521868
15531869 static int nvme_rdma_addr_resolved(struct nvme_rdma_queue *queue)
15541870 {
1871
+ struct nvme_ctrl *ctrl = &queue->ctrl->ctrl;
15551872 int ret;
15561873
15571874 ret = nvme_rdma_create_queue_ib(queue);
15581875 if (ret)
15591876 return ret;
15601877
1878
+ if (ctrl->opts->tos >= 0)
1879
+ rdma_set_service_type(queue->cm_id, ctrl->opts->tos);
15611880 ret = rdma_resolve_route(queue->cm_id, NVME_RDMA_CONNECT_TIMEOUT_MS);
15621881 if (ret) {
1563
- dev_err(queue->ctrl->ctrl.device,
1564
- "rdma_resolve_route failed (%d).\n",
1882
+ dev_err(ctrl->device, "rdma_resolve_route failed (%d).\n",
15651883 queue->cm_error);
15661884 goto out_destroy_queue;
15671885 }
....@@ -1609,18 +1927,14 @@
16091927 priv.hsqsize = cpu_to_le16(queue->ctrl->ctrl.sqsize);
16101928 }
16111929
1612
- ret = rdma_connect(queue->cm_id, &param);
1930
+ ret = rdma_connect_locked(queue->cm_id, &param);
16131931 if (ret) {
16141932 dev_err(ctrl->ctrl.device,
1615
- "rdma_connect failed (%d).\n", ret);
1616
- goto out_destroy_queue_ib;
1933
+ "rdma_connect_locked failed (%d).\n", ret);
1934
+ return ret;
16171935 }
16181936
16191937 return 0;
1620
-
1621
-out_destroy_queue_ib:
1622
- nvme_rdma_destroy_queue_ib(queue);
1623
- return ret;
16241938 }
16251939
16261940 static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
....@@ -1651,8 +1965,6 @@
16511965 case RDMA_CM_EVENT_ROUTE_ERROR:
16521966 case RDMA_CM_EVENT_CONNECT_ERROR:
16531967 case RDMA_CM_EVENT_UNREACHABLE:
1654
- nvme_rdma_destroy_queue_ib(queue);
1655
- /* fall through */
16561968 case RDMA_CM_EVENT_ADDR_ERROR:
16571969 dev_dbg(queue->ctrl->ctrl.device,
16581970 "CM error event %d\n", ev->event);
....@@ -1683,6 +1995,18 @@
16831995 return 0;
16841996 }
16851997
1998
+static void nvme_rdma_complete_timed_out(struct request *rq)
1999
+{
2000
+ struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
2001
+ struct nvme_rdma_queue *queue = req->queue;
2002
+
2003
+ nvme_rdma_stop_queue(queue);
2004
+ if (blk_mq_request_started(rq) && !blk_mq_request_completed(rq)) {
2005
+ nvme_req(rq)->status = NVME_SC_HOST_ABORTED_CMD;
2006
+ blk_mq_complete_request(rq);
2007
+ }
2008
+}
2009
+
16862010 static enum blk_eh_timer_return
16872011 nvme_rdma_timeout(struct request *rq, bool reserved)
16882012 {
....@@ -1695,19 +2019,27 @@
16952019
16962020 if (ctrl->ctrl.state != NVME_CTRL_LIVE) {
16972021 /*
1698
- * Teardown immediately if controller times out while starting
1699
- * or we are already started error recovery. all outstanding
1700
- * requests are completed on shutdown, so we return BLK_EH_DONE.
2022
+ * If we are resetting, connecting or deleting we should
2023
+ * complete immediately because we may block controller
2024
+ * teardown or setup sequence
2025
+ * - ctrl disable/shutdown fabrics requests
2026
+ * - connect requests
2027
+ * - initialization admin requests
2028
+ * - I/O requests that entered after unquiescing and
2029
+ * the controller stopped responding
2030
+ *
2031
+ * All other requests should be cancelled by the error
2032
+ * recovery work, so it's fine that we fail it here.
17012033 */
1702
- flush_work(&ctrl->err_work);
1703
- nvme_rdma_teardown_io_queues(ctrl, false);
1704
- nvme_rdma_teardown_admin_queue(ctrl, false);
2034
+ nvme_rdma_complete_timed_out(rq);
17052035 return BLK_EH_DONE;
17062036 }
17072037
1708
- dev_warn(ctrl->ctrl.device, "starting error recovery\n");
2038
+ /*
2039
+ * LIVE state should trigger the normal error recovery which will
2040
+ * handle completing this request.
2041
+ */
17092042 nvme_rdma_error_recovery(ctrl);
1710
-
17112043 return BLK_EH_RESET_TIMER;
17122044 }
17132045
....@@ -1731,20 +2063,36 @@
17312063 return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq);
17322064
17332065 dev = queue->device->dev;
2066
+
2067
+ req->sqe.dma = ib_dma_map_single(dev, req->sqe.data,
2068
+ sizeof(struct nvme_command),
2069
+ DMA_TO_DEVICE);
2070
+ err = ib_dma_mapping_error(dev, req->sqe.dma);
2071
+ if (unlikely(err))
2072
+ return BLK_STS_RESOURCE;
2073
+
17342074 ib_dma_sync_single_for_cpu(dev, sqe->dma,
17352075 sizeof(struct nvme_command), DMA_TO_DEVICE);
17362076
17372077 ret = nvme_setup_cmd(ns, rq, c);
17382078 if (ret)
1739
- return ret;
2079
+ goto unmap_qe;
17402080
17412081 blk_mq_start_request(rq);
2082
+
2083
+ if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) &&
2084
+ queue->pi_support &&
2085
+ (c->common.opcode == nvme_cmd_write ||
2086
+ c->common.opcode == nvme_cmd_read) &&
2087
+ nvme_ns_has_pi(ns))
2088
+ req->use_sig_mr = true;
2089
+ else
2090
+ req->use_sig_mr = false;
17422091
17432092 err = nvme_rdma_map_data(queue, rq, c);
17442093 if (unlikely(err < 0)) {
17452094 dev_err(queue->ctrl->ctrl.device,
17462095 "Failed to map data (%d)\n", err);
1747
- nvme_cleanup_cmd(rq);
17482096 goto err;
17492097 }
17502098
....@@ -1755,52 +2103,123 @@
17552103
17562104 err = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge,
17572105 req->mr ? &req->reg_wr.wr : NULL);
1758
- if (unlikely(err)) {
1759
- nvme_rdma_unmap_data(queue, rq);
1760
- goto err;
1761
- }
2106
+ if (unlikely(err))
2107
+ goto err_unmap;
17622108
17632109 return BLK_STS_OK;
2110
+
2111
+err_unmap:
2112
+ nvme_rdma_unmap_data(queue, rq);
17642113 err:
17652114 if (err == -ENOMEM || err == -EAGAIN)
1766
- return BLK_STS_RESOURCE;
1767
- return BLK_STS_IOERR;
2115
+ ret = BLK_STS_RESOURCE;
2116
+ else
2117
+ ret = BLK_STS_IOERR;
2118
+ nvme_cleanup_cmd(rq);
2119
+unmap_qe:
2120
+ ib_dma_unmap_single(dev, req->sqe.dma, sizeof(struct nvme_command),
2121
+ DMA_TO_DEVICE);
2122
+ return ret;
17682123 }
17692124
1770
-static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
2125
+static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx)
17712126 {
17722127 struct nvme_rdma_queue *queue = hctx->driver_data;
1773
- struct ib_cq *cq = queue->ib_cq;
1774
- struct ib_wc wc;
1775
- int found = 0;
17762128
1777
- while (ib_poll_cq(cq, 1, &wc) > 0) {
1778
- struct ib_cqe *cqe = wc.wr_cqe;
2129
+ return ib_process_cq_direct(queue->ib_cq, -1);
2130
+}
17792131
1780
- if (cqe) {
1781
- if (cqe->done == nvme_rdma_recv_done)
1782
- found |= __nvme_rdma_recv_done(cq, &wc, tag);
1783
- else
1784
- cqe->done(cq, &wc);
1785
- }
2132
+static void nvme_rdma_check_pi_status(struct nvme_rdma_request *req)
2133
+{
2134
+ struct request *rq = blk_mq_rq_from_pdu(req);
2135
+ struct ib_mr_status mr_status;
2136
+ int ret;
2137
+
2138
+ ret = ib_check_mr_status(req->mr, IB_MR_CHECK_SIG_STATUS, &mr_status);
2139
+ if (ret) {
2140
+ pr_err("ib_check_mr_status failed, ret %d\n", ret);
2141
+ nvme_req(rq)->status = NVME_SC_INVALID_PI;
2142
+ return;
17862143 }
17872144
1788
- return found;
2145
+ if (mr_status.fail_status & IB_MR_CHECK_SIG_STATUS) {
2146
+ switch (mr_status.sig_err.err_type) {
2147
+ case IB_SIG_BAD_GUARD:
2148
+ nvme_req(rq)->status = NVME_SC_GUARD_CHECK;
2149
+ break;
2150
+ case IB_SIG_BAD_REFTAG:
2151
+ nvme_req(rq)->status = NVME_SC_REFTAG_CHECK;
2152
+ break;
2153
+ case IB_SIG_BAD_APPTAG:
2154
+ nvme_req(rq)->status = NVME_SC_APPTAG_CHECK;
2155
+ break;
2156
+ }
2157
+ pr_err("PI error found type %d expected 0x%x vs actual 0x%x\n",
2158
+ mr_status.sig_err.err_type, mr_status.sig_err.expected,
2159
+ mr_status.sig_err.actual);
2160
+ }
17892161 }
17902162
17912163 static void nvme_rdma_complete_rq(struct request *rq)
17922164 {
17932165 struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
2166
+ struct nvme_rdma_queue *queue = req->queue;
2167
+ struct ib_device *ibdev = queue->device->dev;
17942168
1795
- nvme_rdma_unmap_data(req->queue, rq);
2169
+ if (req->use_sig_mr)
2170
+ nvme_rdma_check_pi_status(req);
2171
+
2172
+ nvme_rdma_unmap_data(queue, rq);
2173
+ ib_dma_unmap_single(ibdev, req->sqe.dma, sizeof(struct nvme_command),
2174
+ DMA_TO_DEVICE);
17962175 nvme_complete_rq(rq);
17972176 }
17982177
17992178 static int nvme_rdma_map_queues(struct blk_mq_tag_set *set)
18002179 {
18012180 struct nvme_rdma_ctrl *ctrl = set->driver_data;
2181
+ struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
18022182
1803
- return blk_mq_rdma_map_queues(set, ctrl->device->dev, 0);
2183
+ if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) {
2184
+ /* separate read/write queues */
2185
+ set->map[HCTX_TYPE_DEFAULT].nr_queues =
2186
+ ctrl->io_queues[HCTX_TYPE_DEFAULT];
2187
+ set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
2188
+ set->map[HCTX_TYPE_READ].nr_queues =
2189
+ ctrl->io_queues[HCTX_TYPE_READ];
2190
+ set->map[HCTX_TYPE_READ].queue_offset =
2191
+ ctrl->io_queues[HCTX_TYPE_DEFAULT];
2192
+ } else {
2193
+ /* shared read/write queues */
2194
+ set->map[HCTX_TYPE_DEFAULT].nr_queues =
2195
+ ctrl->io_queues[HCTX_TYPE_DEFAULT];
2196
+ set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
2197
+ set->map[HCTX_TYPE_READ].nr_queues =
2198
+ ctrl->io_queues[HCTX_TYPE_DEFAULT];
2199
+ set->map[HCTX_TYPE_READ].queue_offset = 0;
2200
+ }
2201
+ blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_DEFAULT],
2202
+ ctrl->device->dev, 0);
2203
+ blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_READ],
2204
+ ctrl->device->dev, 0);
2205
+
2206
+ if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) {
2207
+ /* map dedicated poll queues only if we have queues left */
2208
+ set->map[HCTX_TYPE_POLL].nr_queues =
2209
+ ctrl->io_queues[HCTX_TYPE_POLL];
2210
+ set->map[HCTX_TYPE_POLL].queue_offset =
2211
+ ctrl->io_queues[HCTX_TYPE_DEFAULT] +
2212
+ ctrl->io_queues[HCTX_TYPE_READ];
2213
+ blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]);
2214
+ }
2215
+
2216
+ dev_info(ctrl->ctrl.device,
2217
+ "mapped %d/%d/%d default/read/poll queues.\n",
2218
+ ctrl->io_queues[HCTX_TYPE_DEFAULT],
2219
+ ctrl->io_queues[HCTX_TYPE_READ],
2220
+ ctrl->io_queues[HCTX_TYPE_POLL]);
2221
+
2222
+ return 0;
18042223 }
18052224
18062225 static const struct blk_mq_ops nvme_rdma_mq_ops = {
....@@ -1809,9 +2228,9 @@
18092228 .init_request = nvme_rdma_init_request,
18102229 .exit_request = nvme_rdma_exit_request,
18112230 .init_hctx = nvme_rdma_init_hctx,
1812
- .poll = nvme_rdma_poll,
18132231 .timeout = nvme_rdma_timeout,
18142232 .map_queues = nvme_rdma_map_queues,
2233
+ .poll = nvme_rdma_poll,
18152234 };
18162235
18172236 static const struct blk_mq_ops nvme_rdma_admin_mq_ops = {
....@@ -1826,10 +2245,11 @@
18262245 static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
18272246 {
18282247 nvme_rdma_teardown_io_queues(ctrl, shutdown);
2248
+ blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
18292249 if (shutdown)
18302250 nvme_shutdown_ctrl(&ctrl->ctrl);
18312251 else
1832
- nvme_disable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap);
2252
+ nvme_disable_ctrl(&ctrl->ctrl);
18332253 nvme_rdma_teardown_admin_queue(ctrl, shutdown);
18342254 }
18352255
....@@ -1865,7 +2285,7 @@
18652285 static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = {
18662286 .name = "rdma",
18672287 .module = THIS_MODULE,
1868
- .flags = NVME_F_FABRICS,
2288
+ .flags = NVME_F_FABRICS | NVME_F_METADATA_SUPPORTED,
18692289 .reg_read32 = nvmf_reg_read32,
18702290 .reg_read64 = nvmf_reg_read64,
18712291 .reg_write32 = nvmf_reg_write32,
....@@ -1875,54 +2295,6 @@
18752295 .get_address = nvmf_get_address,
18762296 .stop_ctrl = nvme_rdma_stop_ctrl,
18772297 };
1878
-
1879
-static inline bool
1880
-__nvme_rdma_options_match(struct nvme_rdma_ctrl *ctrl,
1881
- struct nvmf_ctrl_options *opts)
1882
-{
1883
- char *stdport = __stringify(NVME_RDMA_IP_PORT);
1884
-
1885
-
1886
- if (!nvmf_ctlr_matches_baseopts(&ctrl->ctrl, opts) ||
1887
- strcmp(opts->traddr, ctrl->ctrl.opts->traddr))
1888
- return false;
1889
-
1890
- if (opts->mask & NVMF_OPT_TRSVCID &&
1891
- ctrl->ctrl.opts->mask & NVMF_OPT_TRSVCID) {
1892
- if (strcmp(opts->trsvcid, ctrl->ctrl.opts->trsvcid))
1893
- return false;
1894
- } else if (opts->mask & NVMF_OPT_TRSVCID) {
1895
- if (strcmp(opts->trsvcid, stdport))
1896
- return false;
1897
- } else if (ctrl->ctrl.opts->mask & NVMF_OPT_TRSVCID) {
1898
- if (strcmp(stdport, ctrl->ctrl.opts->trsvcid))
1899
- return false;
1900
- }
1901
- /* else, it's a match as both have stdport. Fall to next checks */
1902
-
1903
- /*
1904
- * checking the local address is rough. In most cases, one
1905
- * is not specified and the host port is selected by the stack.
1906
- *
1907
- * Assume no match if:
1908
- * local address is specified and address is not the same
1909
- * local address is not specified but remote is, or vice versa
1910
- * (admin using specific host_traddr when it matters).
1911
- */
1912
- if (opts->mask & NVMF_OPT_HOST_TRADDR &&
1913
- ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR) {
1914
- if (strcmp(opts->host_traddr, ctrl->ctrl.opts->host_traddr))
1915
- return false;
1916
- } else if (opts->mask & NVMF_OPT_HOST_TRADDR ||
1917
- ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR)
1918
- return false;
1919
- /*
1920
- * if neither controller had an host port specified, assume it's
1921
- * a match as everything else matched.
1922
- */
1923
-
1924
- return true;
1925
-}
19262298
19272299 /*
19282300 * Fails a connection request if it matches an existing controller
....@@ -1944,7 +2316,7 @@
19442316
19452317 mutex_lock(&nvme_rdma_ctrl_mutex);
19462318 list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list) {
1947
- found = __nvme_rdma_options_match(ctrl, opts);
2319
+ found = nvmf_ip_options_match(&ctrl->ctrl, opts);
19482320 if (found)
19492321 break;
19502322 }
....@@ -1959,24 +2331,28 @@
19592331 struct nvme_rdma_ctrl *ctrl;
19602332 int ret;
19612333 bool changed;
1962
- char *port;
19632334
19642335 ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
19652336 if (!ctrl)
19662337 return ERR_PTR(-ENOMEM);
19672338 ctrl->ctrl.opts = opts;
19682339 INIT_LIST_HEAD(&ctrl->list);
1969
- mutex_init(&ctrl->teardown_lock);
19702340
1971
- if (opts->mask & NVMF_OPT_TRSVCID)
1972
- port = opts->trsvcid;
1973
- else
1974
- port = __stringify(NVME_RDMA_IP_PORT);
2341
+ if (!(opts->mask & NVMF_OPT_TRSVCID)) {
2342
+ opts->trsvcid =
2343
+ kstrdup(__stringify(NVME_RDMA_IP_PORT), GFP_KERNEL);
2344
+ if (!opts->trsvcid) {
2345
+ ret = -ENOMEM;
2346
+ goto out_free_ctrl;
2347
+ }
2348
+ opts->mask |= NVMF_OPT_TRSVCID;
2349
+ }
19752350
19762351 ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
1977
- opts->traddr, port, &ctrl->addr);
2352
+ opts->traddr, opts->trsvcid, &ctrl->addr);
19782353 if (ret) {
1979
- pr_err("malformed address passed: %s:%s\n", opts->traddr, port);
2354
+ pr_err("malformed address passed: %s:%s\n",
2355
+ opts->traddr, opts->trsvcid);
19802356 goto out_free_ctrl;
19812357 }
19822358
....@@ -2000,7 +2376,8 @@
20002376 INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work);
20012377 INIT_WORK(&ctrl->ctrl.reset_work, nvme_rdma_reset_ctrl_work);
20022378
2003
- ctrl->ctrl.queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */
2379
+ ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues +
2380
+ opts->nr_poll_queues + 1;
20042381 ctrl->ctrl.sqsize = opts->queue_size - 1;
20052382 ctrl->ctrl.kato = opts->kato;
20062383
....@@ -2024,8 +2401,6 @@
20242401
20252402 dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs\n",
20262403 ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
2027
-
2028
- nvme_get_ctrl(&ctrl->ctrl);
20292404
20302405 mutex_lock(&nvme_rdma_ctrl_mutex);
20312406 list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list);
....@@ -2051,7 +2426,9 @@
20512426 .module = THIS_MODULE,
20522427 .required_opts = NVMF_OPT_TRADDR,
20532428 .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
2054
- NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO,
2429
+ NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
2430
+ NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
2431
+ NVMF_OPT_TOS,
20552432 .create_ctrl = nvme_rdma_create_ctrl,
20562433 };
20572434
....@@ -2111,8 +2488,16 @@
21112488
21122489 static void __exit nvme_rdma_cleanup_module(void)
21132490 {
2491
+ struct nvme_rdma_ctrl *ctrl;
2492
+
21142493 nvmf_unregister_transport(&nvme_rdma_transport);
21152494 ib_unregister_client(&nvme_rdma_ib_client);
2495
+
2496
+ mutex_lock(&nvme_rdma_ctrl_mutex);
2497
+ list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list)
2498
+ nvme_delete_ctrl(&ctrl->ctrl);
2499
+ mutex_unlock(&nvme_rdma_ctrl_mutex);
2500
+ flush_workqueue(nvme_delete_wq);
21162501 }
21172502
21182503 module_init(nvme_rdma_init_module);