| .. | .. |
|---|
| 1 | +// SPDX-License-Identifier: GPL-2.0 |
|---|
| 1 | 2 | /* |
|---|
| 2 | 3 | * NVM Express device driver |
|---|
| 3 | 4 | * Copyright (c) 2011-2014, Intel Corporation. |
|---|
| 4 | | - * |
|---|
| 5 | | - * This program is free software; you can redistribute it and/or modify it |
|---|
| 6 | | - * under the terms and conditions of the GNU General Public License, |
|---|
| 7 | | - * version 2, as published by the Free Software Foundation. |
|---|
| 8 | | - * |
|---|
| 9 | | - * This program is distributed in the hope it will be useful, but WITHOUT |
|---|
| 10 | | - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
|---|
| 11 | | - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for |
|---|
| 12 | | - * more details. |
|---|
| 13 | 5 | */ |
|---|
| 14 | 6 | |
|---|
| 7 | +#include <linux/acpi.h> |
|---|
| 15 | 8 | #include <linux/aer.h> |
|---|
| 16 | 9 | #include <linux/async.h> |
|---|
| 17 | 10 | #include <linux/blkdev.h> |
|---|
| .. | .. |
|---|
| 26 | 19 | #include <linux/mutex.h> |
|---|
| 27 | 20 | #include <linux/once.h> |
|---|
| 28 | 21 | #include <linux/pci.h> |
|---|
| 22 | +#include <linux/suspend.h> |
|---|
| 29 | 23 | #include <linux/t10-pi.h> |
|---|
| 30 | 24 | #include <linux/types.h> |
|---|
| 31 | 25 | #include <linux/io-64-nonatomic-lo-hi.h> |
|---|
| 26 | +#include <linux/io-64-nonatomic-hi-lo.h> |
|---|
| 32 | 27 | #include <linux/sed-opal.h> |
|---|
| 28 | +#include <linux/pci-p2pdma.h> |
|---|
| 33 | 29 | |
|---|
| 30 | +#include "trace.h" |
|---|
| 34 | 31 | #include "nvme.h" |
|---|
| 35 | 32 | |
|---|
| 36 | | -#define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) |
|---|
| 37 | | -#define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) |
|---|
| 33 | +#define SQ_SIZE(q) ((q)->q_depth << (q)->sqes) |
|---|
| 34 | +#define CQ_SIZE(q) ((q)->q_depth * sizeof(struct nvme_completion)) |
|---|
| 38 | 35 | |
|---|
| 39 | 36 | #define SGES_PER_PAGE (PAGE_SIZE / sizeof(struct nvme_sgl_desc)) |
|---|
| 40 | 37 | |
|---|
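
The reworked SQ_SIZE()/CQ_SIZE() macros size a queue from its own fields: the submission side scales by the per-queue entry-size shift (`sqes`) instead of assuming 64-byte commands. A standalone sketch of the arithmetic, not taken from the patch; the depth of 1024, the `sqes` value of 6 (64-byte commands), and the 16-byte completion entry are illustrative assumptions:

```c
#include <stdio.h>

/* Minimal stand-in for the fields the macros read. */
struct q {
	unsigned int q_depth;	/* number of entries */
	unsigned char sqes;	/* log2 of the SQ entry size */
};

#define SQ_SIZE(q) ((q)->q_depth << (q)->sqes)
#define CQ_SIZE(q) ((q)->q_depth * 16)	/* assume 16-byte completion entries */

int main(void)
{
	struct q ioq = { .q_depth = 1024, .sqes = 6 };	/* 6 -> 64-byte commands */

	/* 1024 << 6 = 64 KiB of SQ space, 1024 * 16 = 16 KiB of CQ space. */
	printf("SQ bytes: %u, CQ bytes: %u\n", SQ_SIZE(&ioq), CQ_SIZE(&ioq));
	return 0;
}
```
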
| .. | .. |
|---|
| 66 | 63 | static int io_queue_depth_set(const char *val, const struct kernel_param *kp); |
|---|
| 67 | 64 | static const struct kernel_param_ops io_queue_depth_ops = { |
|---|
| 68 | 65 | .set = io_queue_depth_set, |
|---|
| 69 | | - .get = param_get_int, |
|---|
| 66 | + .get = param_get_uint, |
|---|
| 70 | 67 | }; |
|---|
| 71 | 68 | |
|---|
| 72 | | -static int io_queue_depth = 1024; |
|---|
| 69 | +static unsigned int io_queue_depth = 1024; |
|---|
| 73 | 70 | module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644); |
|---|
| 74 | 71 | MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2"); |
|---|
| 72 | + |
|---|
| 73 | +static int io_queue_count_set(const char *val, const struct kernel_param *kp) |
|---|
| 74 | +{ |
|---|
| 75 | + unsigned int n; |
|---|
| 76 | + int ret; |
|---|
| 77 | + |
|---|
| 78 | + ret = kstrtouint(val, 10, &n); |
|---|
| 79 | + if (ret != 0 || n > num_possible_cpus()) |
|---|
| 80 | + return -EINVAL; |
|---|
| 81 | + return param_set_uint(val, kp); |
|---|
| 82 | +} |
|---|
| 83 | + |
|---|
| 84 | +static const struct kernel_param_ops io_queue_count_ops = { |
|---|
| 85 | + .set = io_queue_count_set, |
|---|
| 86 | + .get = param_get_uint, |
|---|
| 87 | +}; |
|---|
| 88 | + |
|---|
| 89 | +static unsigned int write_queues; |
|---|
| 90 | +module_param_cb(write_queues, &io_queue_count_ops, &write_queues, 0644); |
|---|
| 91 | +MODULE_PARM_DESC(write_queues, |
|---|
| 92 | + "Number of queues to use for writes. If not set, reads and writes " |
|---|
| 93 | + "will share a queue set."); |
|---|
| 94 | + |
|---|
| 95 | +static unsigned int poll_queues; |
|---|
| 96 | +module_param_cb(poll_queues, &io_queue_count_ops, &poll_queues, 0644); |
|---|
| 97 | +MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO."); |
|---|
| 98 | + |
|---|
| 99 | +static bool noacpi; |
|---|
| 100 | +module_param(noacpi, bool, 0444); |
|---|
| 101 | +MODULE_PARM_DESC(noacpi, "disable acpi bios quirks"); |
|---|
| 75 | 102 | |
|---|
| 76 | 103 | struct nvme_dev; |
|---|
| 77 | 104 | struct nvme_queue; |
|---|
| 78 | 105 | |
|---|
| 79 | 106 | static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown); |
|---|
| 107 | +static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode); |
|---|
| 80 | 108 | |
|---|
| 81 | 109 | /* |
|---|
| 82 | 110 | * Represents an NVM Express device. Each nvme_dev is a PCI function. |
|---|
| .. | .. |
|---|
| 91 | 119 | struct dma_pool *prp_small_pool; |
|---|
| 92 | 120 | unsigned online_queues; |
|---|
| 93 | 121 | unsigned max_qid; |
|---|
| 122 | + unsigned io_queues[HCTX_MAX_TYPES]; |
|---|
| 94 | 123 | unsigned int num_vecs; |
|---|
| 95 | | - int q_depth; |
|---|
| 124 | + u32 q_depth; |
|---|
| 125 | + int io_sqes; |
|---|
| 96 | 126 | u32 db_stride; |
|---|
| 97 | 127 | void __iomem *bar; |
|---|
| 98 | 128 | unsigned long bar_mapped_size; |
|---|
| 99 | 129 | struct work_struct remove_work; |
|---|
| 100 | 130 | struct mutex shutdown_lock; |
|---|
| 101 | 131 | bool subsystem; |
|---|
| 102 | | - void __iomem *cmb; |
|---|
| 103 | | - pci_bus_addr_t cmb_bus_addr; |
|---|
| 104 | 132 | u64 cmb_size; |
|---|
| 133 | + bool cmb_use_sqes; |
|---|
| 105 | 134 | u32 cmbsz; |
|---|
| 106 | 135 | u32 cmbloc; |
|---|
| 107 | 136 | struct nvme_ctrl ctrl; |
|---|
| 108 | | - struct completion ioq_wait; |
|---|
| 137 | + u32 last_ps; |
|---|
| 109 | 138 | |
|---|
| 110 | 139 | mempool_t *iod_mempool; |
|---|
| 111 | 140 | |
|---|
| .. | .. |
|---|
| 121 | 150 | dma_addr_t host_mem_descs_dma; |
|---|
| 122 | 151 | struct nvme_host_mem_buf_desc *host_mem_descs; |
|---|
| 123 | 152 | void **host_mem_desc_bufs; |
|---|
| 153 | + unsigned int nr_allocated_queues; |
|---|
| 154 | + unsigned int nr_write_queues; |
|---|
| 155 | + unsigned int nr_poll_queues; |
|---|
| 124 | 156 | }; |
|---|
| 125 | 157 | |
|---|
| 126 | 158 | static int io_queue_depth_set(const char *val, const struct kernel_param *kp) |
|---|
| 127 | 159 | { |
|---|
| 128 | | - int n = 0, ret; |
|---|
| 160 | + int ret; |
|---|
| 161 | + u32 n; |
|---|
| 129 | 162 | |
|---|
| 130 | | - ret = kstrtoint(val, 10, &n); |
|---|
| 163 | + ret = kstrtou32(val, 10, &n); |
|---|
| 131 | 164 | if (ret != 0 || n < 2) |
|---|
| 132 | 165 | return -EINVAL; |
|---|
| 133 | 166 | |
|---|
| 134 | | - return param_set_int(val, kp); |
|---|
| 167 | + return param_set_uint(val, kp); |
|---|
| 135 | 168 | } |
|---|
| 136 | 169 | |
|---|
| 137 | 170 | static inline unsigned int sq_idx(unsigned int qid, u32 stride) |
|---|
| .. | .. |
|---|
| 154 | 187 | * commands and one for I/O commands). |
|---|
| 155 | 188 | */ |
|---|
| 156 | 189 | struct nvme_queue { |
|---|
| 157 | | - struct device *q_dmadev; |
|---|
| 158 | 190 | struct nvme_dev *dev; |
|---|
| 159 | 191 | spinlock_t sq_lock; |
|---|
| 160 | | - struct nvme_command *sq_cmds; |
|---|
| 161 | | - struct nvme_command __iomem *sq_cmds_io; |
|---|
| 162 | | - spinlock_t cq_lock ____cacheline_aligned_in_smp; |
|---|
| 163 | | - volatile struct nvme_completion *cqes; |
|---|
| 164 | | - struct blk_mq_tags **tags; |
|---|
| 192 | + void *sq_cmds; |
|---|
| 193 | + /* only used for poll queues: */ |
|---|
| 194 | + spinlock_t cq_poll_lock ____cacheline_aligned_in_smp; |
|---|
| 195 | + struct nvme_completion *cqes; |
|---|
| 165 | 196 | dma_addr_t sq_dma_addr; |
|---|
| 166 | 197 | dma_addr_t cq_dma_addr; |
|---|
| 167 | 198 | u32 __iomem *q_db; |
|---|
| 168 | | - u16 q_depth; |
|---|
| 169 | | - s16 cq_vector; |
|---|
| 199 | + u32 q_depth; |
|---|
| 200 | + u16 cq_vector; |
|---|
| 170 | 201 | u16 sq_tail; |
|---|
| 202 | + u16 last_sq_tail; |
|---|
| 171 | 203 | u16 cq_head; |
|---|
| 172 | | - u16 last_cq_head; |
|---|
| 173 | 204 | u16 qid; |
|---|
| 174 | 205 | u8 cq_phase; |
|---|
| 206 | + u8 sqes; |
|---|
| 207 | + unsigned long flags; |
|---|
| 208 | +#define NVMEQ_ENABLED 0 |
|---|
| 209 | +#define NVMEQ_SQ_CMB 1 |
|---|
| 210 | +#define NVMEQ_DELETE_ERROR 2 |
|---|
| 211 | +#define NVMEQ_POLLED 3 |
|---|
| 175 | 212 | u32 *dbbuf_sq_db; |
|---|
| 176 | 213 | u32 *dbbuf_cq_db; |
|---|
| 177 | 214 | u32 *dbbuf_sq_ei; |
|---|
| 178 | 215 | u32 *dbbuf_cq_ei; |
|---|
| 216 | + struct completion delete_done; |
|---|
| 179 | 217 | }; |
|---|
| 180 | 218 | |
|---|
| 181 | 219 | /* |
|---|
| 182 | | - * The nvme_iod describes the data in an I/O, including the list of PRP |
|---|
| 183 | | - * entries. You can't see it in this data structure because C doesn't let |
|---|
| 184 | | - * me express that. Use nvme_init_iod to ensure there's enough space |
|---|
| 185 | | - * allocated to store the PRP list. |
|---|
| 220 | + * The nvme_iod describes the data in an I/O. |
|---|
| 221 | + * |
|---|
| 222 | + * The sg pointer contains the list of PRP/SGL chunk allocations in addition |
|---|
| 223 | + * to the actual struct scatterlist. |
|---|
| 186 | 224 | */ |
|---|
| 187 | 225 | struct nvme_iod { |
|---|
| 188 | 226 | struct nvme_request req; |
|---|
| 227 | + struct nvme_command cmd; |
|---|
| 189 | 228 | struct nvme_queue *nvmeq; |
|---|
| 190 | 229 | bool use_sgl; |
|---|
| 191 | 230 | int aborted; |
|---|
| 192 | 231 | int npages; /* In the PRP list. 0 means small pool in use */ |
|---|
| 193 | 232 | int nents; /* Used in scatterlist */ |
|---|
| 194 | | - int length; /* Of data, in bytes */ |
|---|
| 195 | 233 | dma_addr_t first_dma; |
|---|
| 196 | | - struct scatterlist meta_sg; /* metadata requires single contiguous buffer */ |
|---|
| 234 | + unsigned int dma_len; /* length of single DMA segment mapping */ |
|---|
| 235 | + dma_addr_t meta_dma; |
|---|
| 197 | 236 | struct scatterlist *sg; |
|---|
| 198 | | - struct scatterlist inline_sg[0]; |
|---|
| 199 | 237 | }; |
|---|
| 200 | 238 | |
|---|
| 201 | | -/* |
|---|
| 202 | | - * Check we didin't inadvertently grow the command struct |
|---|
| 203 | | - */ |
|---|
| 204 | | -static inline void _nvme_check_size(void) |
|---|
| 239 | +static inline unsigned int nvme_dbbuf_size(struct nvme_dev *dev) |
|---|
| 205 | 240 | { |
|---|
| 206 | | - BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64); |
|---|
| 207 | | - BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64); |
|---|
| 208 | | - BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64); |
|---|
| 209 | | - BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64); |
|---|
| 210 | | - BUILD_BUG_ON(sizeof(struct nvme_features) != 64); |
|---|
| 211 | | - BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64); |
|---|
| 212 | | - BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64); |
|---|
| 213 | | - BUILD_BUG_ON(sizeof(struct nvme_command) != 64); |
|---|
| 214 | | - BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE); |
|---|
| 215 | | - BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE); |
|---|
| 216 | | - BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64); |
|---|
| 217 | | - BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512); |
|---|
| 218 | | - BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64); |
|---|
| 219 | | -} |
|---|
| 220 | | - |
|---|
| 221 | | -static inline unsigned int nvme_dbbuf_size(u32 stride) |
|---|
| 222 | | -{ |
|---|
| 223 | | - return ((num_possible_cpus() + 1) * 8 * stride); |
|---|
| 241 | + return dev->nr_allocated_queues * 8 * dev->db_stride; |
|---|
| 224 | 242 | } |
|---|
| 225 | 243 | |
|---|
| 226 | 244 | static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev) |
|---|
| 227 | 245 | { |
|---|
| 228 | | - unsigned int mem_size = nvme_dbbuf_size(dev->db_stride); |
|---|
| 246 | + unsigned int mem_size = nvme_dbbuf_size(dev); |
|---|
| 229 | 247 | |
|---|
| 230 | 248 | if (dev->dbbuf_dbs) |
|---|
| 231 | 249 | return 0; |
|---|
| .. | .. |
|---|
| 250 | 268 | |
|---|
| 251 | 269 | static void nvme_dbbuf_dma_free(struct nvme_dev *dev) |
|---|
| 252 | 270 | { |
|---|
| 253 | | - unsigned int mem_size = nvme_dbbuf_size(dev->db_stride); |
|---|
| 271 | + unsigned int mem_size = nvme_dbbuf_size(dev); |
|---|
| 254 | 272 | |
|---|
| 255 | 273 | if (dev->dbbuf_dbs) { |
|---|
| 256 | 274 | dma_free_coherent(dev->dev, mem_size, |
|---|
| .. | .. |
|---|
| 347 | 365 | } |
|---|
| 348 | 366 | |
|---|
| 349 | 367 | /* |
|---|
| 350 | | - * Max size of iod being embedded in the request payload |
|---|
| 351 | | - */ |
|---|
| 352 | | -#define NVME_INT_PAGES 2 |
|---|
| 353 | | -#define NVME_INT_BYTES(dev) (NVME_INT_PAGES * (dev)->ctrl.page_size) |
|---|
| 354 | | - |
|---|
| 355 | | -/* |
|---|
| 356 | 368 | * Will slightly overestimate the number of pages needed. This is OK |
|---|
| 357 | 369 | * as it only leads to a small amount of wasted memory for the lifetime of |
|---|
| 358 | 370 | * the I/O. |
|---|
| 359 | 371 | */ |
|---|
| 360 | | -static int nvme_npages(unsigned size, struct nvme_dev *dev) |
|---|
| 372 | +static int nvme_pci_npages_prp(void) |
|---|
| 361 | 373 | { |
|---|
| 362 | | - unsigned nprps = DIV_ROUND_UP(size + dev->ctrl.page_size, |
|---|
| 363 | | - dev->ctrl.page_size); |
|---|
| 374 | + unsigned nprps = DIV_ROUND_UP(NVME_MAX_KB_SZ + NVME_CTRL_PAGE_SIZE, |
|---|
| 375 | + NVME_CTRL_PAGE_SIZE); |
|---|
| 364 | 376 | return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); |
|---|
| 365 | 377 | } |
|---|
| 366 | 378 | |
|---|
| .. | .. |
|---|
| 368 | 380 | * Calculates the number of pages needed for the SGL segments. For example a 4k |
|---|
| 369 | 381 | * page can accommodate 256 SGL descriptors. |
|---|
| 370 | 382 | */ |
|---|
| 371 | | -static int nvme_pci_npages_sgl(unsigned int num_seg) |
|---|
| 383 | +static int nvme_pci_npages_sgl(void) |
|---|
| 372 | 384 | { |
|---|
| 373 | | - return DIV_ROUND_UP(num_seg * sizeof(struct nvme_sgl_desc), PAGE_SIZE); |
|---|
| 385 | + return DIV_ROUND_UP(NVME_MAX_SEGS * sizeof(struct nvme_sgl_desc), |
|---|
| 386 | + PAGE_SIZE); |
|---|
| 374 | 387 | } |
|---|
| 375 | 388 | |
|---|
| 376 | | -static unsigned int nvme_pci_iod_alloc_size(struct nvme_dev *dev, |
|---|
| 377 | | - unsigned int size, unsigned int nseg, bool use_sgl) |
|---|
| 389 | +static size_t nvme_pci_iod_alloc_size(void) |
|---|
| 378 | 390 | { |
|---|
| 379 | | - size_t alloc_size; |
|---|
| 391 | + size_t npages = max(nvme_pci_npages_prp(), nvme_pci_npages_sgl()); |
|---|
| 380 | 392 | |
|---|
| 381 | | - if (use_sgl) |
|---|
| 382 | | - alloc_size = sizeof(__le64 *) * nvme_pci_npages_sgl(nseg); |
|---|
| 383 | | - else |
|---|
| 384 | | - alloc_size = sizeof(__le64 *) * nvme_npages(size, dev); |
|---|
| 385 | | - |
|---|
| 386 | | - return alloc_size + sizeof(struct scatterlist) * nseg; |
|---|
| 387 | | -} |
|---|
| 388 | | - |
|---|
| 389 | | -static unsigned int nvme_pci_cmd_size(struct nvme_dev *dev, bool use_sgl) |
|---|
| 390 | | -{ |
|---|
| 391 | | - unsigned int alloc_size = nvme_pci_iod_alloc_size(dev, |
|---|
| 392 | | - NVME_INT_BYTES(dev), NVME_INT_PAGES, |
|---|
| 393 | | - use_sgl); |
|---|
| 394 | | - |
|---|
| 395 | | - return sizeof(struct nvme_iod) + alloc_size; |
|---|
| 393 | + return sizeof(__le64 *) * npages + |
|---|
| 394 | + sizeof(struct scatterlist) * NVME_MAX_SEGS; |
|---|
| 396 | 395 | } |
|---|
| 397 | 396 | |
|---|
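
nvme_pci_npages_prp(), nvme_pci_npages_sgl() and nvme_pci_iod_alloc_size() above now size the per-request bookkeeping once, for the worst case, rather than per I/O. A standalone sketch of the same arithmetic with every constant passed in explicitly; the 4 MiB transfer limit, 127 segments, 4 KiB pages and the descriptor/scatterlist sizes are illustrative assumptions, not the driver's real limits:

```c
#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

/*
 * Worst-case bookkeeping for one request: enough page-pointer slots for
 * whichever of the PRP or SGL representation needs more chained pages,
 * plus the scatterlist array itself.
 */
static size_t iod_alloc_size(size_t max_transfer, size_t ctrl_page_size,
			     size_t max_segs, size_t page_size,
			     size_t sgl_desc_size, size_t scatterlist_size)
{
	size_t nprps = DIV_ROUND_UP(max_transfer + ctrl_page_size, ctrl_page_size);
	size_t prp_pages = DIV_ROUND_UP(8 * nprps, page_size - 8);
	size_t sgl_pages = DIV_ROUND_UP(max_segs * sgl_desc_size, page_size);
	size_t npages = prp_pages > sgl_pages ? prp_pages : sgl_pages;

	return sizeof(void *) * npages + scatterlist_size * max_segs;
}

int main(void)
{
	/* e.g. 4 MiB max transfer, 4 KiB pages, 127 segments, 16-byte SGL
	 * descriptors, 32-byte scatterlist entries -- all example values. */
	printf("%zu bytes per request\n",
	       iod_alloc_size(4 << 20, 4096, 127, 4096, 16, 32));
	return 0;
}
```
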
| 398 | 397 | static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, |
|---|
| .. | .. |
|---|
| 403 | 402 | |
|---|
| 404 | 403 | WARN_ON(hctx_idx != 0); |
|---|
| 405 | 404 | WARN_ON(dev->admin_tagset.tags[0] != hctx->tags); |
|---|
| 406 | | - WARN_ON(nvmeq->tags); |
|---|
| 407 | 405 | |
|---|
| 408 | 406 | hctx->driver_data = nvmeq; |
|---|
| 409 | | - nvmeq->tags = &dev->admin_tagset.tags[0]; |
|---|
| 410 | 407 | return 0; |
|---|
| 411 | | -} |
|---|
| 412 | | - |
|---|
| 413 | | -static void nvme_admin_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) |
|---|
| 414 | | -{ |
|---|
| 415 | | - struct nvme_queue *nvmeq = hctx->driver_data; |
|---|
| 416 | | - |
|---|
| 417 | | - nvmeq->tags = NULL; |
|---|
| 418 | 408 | } |
|---|
| 419 | 409 | |
|---|
| 420 | 410 | static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, |
|---|
| .. | .. |
|---|
| 422 | 412 | { |
|---|
| 423 | 413 | struct nvme_dev *dev = data; |
|---|
| 424 | 414 | struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1]; |
|---|
| 425 | | - |
|---|
| 426 | | - if (!nvmeq->tags) |
|---|
| 427 | | - nvmeq->tags = &dev->tagset.tags[hctx_idx]; |
|---|
| 428 | 415 | |
|---|
| 429 | 416 | WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags); |
|---|
| 430 | 417 | hctx->driver_data = nvmeq; |
|---|
| .. | .. |
|---|
| 446 | 433 | return 0; |
|---|
| 447 | 434 | } |
|---|
| 448 | 435 | |
|---|
| 436 | +static int queue_irq_offset(struct nvme_dev *dev) |
|---|
| 437 | +{ |
|---|
| 438 | + /* if we have more than 1 vec, admin queue offsets us by 1 */ |
|---|
| 439 | + if (dev->num_vecs > 1) |
|---|
| 440 | + return 1; |
|---|
| 441 | + |
|---|
| 442 | + return 0; |
|---|
| 443 | +} |
|---|
| 444 | + |
|---|
| 449 | 445 | static int nvme_pci_map_queues(struct blk_mq_tag_set *set) |
|---|
| 450 | 446 | { |
|---|
| 451 | 447 | struct nvme_dev *dev = set->driver_data; |
|---|
| 448 | + int i, qoff, offset; |
|---|
| 452 | 449 | |
|---|
| 453 | | - return blk_mq_pci_map_queues(set, to_pci_dev(dev->dev), |
|---|
| 454 | | - dev->num_vecs > 1 ? 1 /* admin queue */ : 0); |
|---|
| 450 | + offset = queue_irq_offset(dev); |
|---|
| 451 | + for (i = 0, qoff = 0; i < set->nr_maps; i++) { |
|---|
| 452 | + struct blk_mq_queue_map *map = &set->map[i]; |
|---|
| 453 | + |
|---|
| 454 | + map->nr_queues = dev->io_queues[i]; |
|---|
| 455 | + if (!map->nr_queues) { |
|---|
| 456 | + BUG_ON(i == HCTX_TYPE_DEFAULT); |
|---|
| 457 | + continue; |
|---|
| 458 | + } |
|---|
| 459 | + |
|---|
| 460 | + /* |
|---|
| 461 | + * The poll queue(s) doesn't have an IRQ (and hence IRQ |
|---|
| 462 | + * affinity), so use the regular blk-mq cpu mapping |
|---|
| 463 | + */ |
|---|
| 464 | + map->queue_offset = qoff; |
|---|
| 465 | + if (i != HCTX_TYPE_POLL && offset) |
|---|
| 466 | + blk_mq_pci_map_queues(map, to_pci_dev(dev->dev), offset); |
|---|
| 467 | + else |
|---|
| 468 | + blk_mq_map_queues(map); |
|---|
| 469 | + qoff += map->nr_queues; |
|---|
| 470 | + offset += map->nr_queues; |
|---|
| 471 | + } |
|---|
| 472 | + |
|---|
| 473 | + return 0; |
|---|
| 474 | +} |
|---|
| 475 | + |
|---|
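
nvme_pci_map_queues() above splits the hardware queues into per-type ranges (default/read/poll) and offsets the IRQ-driven ones past the admin vector, while poll queues get a plain CPU mapping. A rough standalone model of that bookkeeping; the queue counts and vector count are made-up example values, and the real driver derives io_queues[] elsewhere:

```c
#include <stdio.h>

enum { TYPE_DEFAULT, TYPE_READ, TYPE_POLL, TYPE_MAX };

int main(void)
{
	static const char *const names[TYPE_MAX] = { "default", "read", "poll" };
	unsigned int io_queues[TYPE_MAX] = { 4, 0, 2 };	/* example split */
	unsigned int num_vecs = 5;		/* admin vector + 4 I/O vectors */
	unsigned int offset = num_vecs > 1 ? 1 : 0;	/* admin queue shifts IRQ vectors by one */
	unsigned int qoff = 0;
	int i;

	for (i = 0; i < TYPE_MAX; i++) {
		if (!io_queues[i])
			continue;
		if (i == TYPE_POLL)
			printf("%-7s: hw queues %u..%u, plain cpu mapping (no IRQ)\n",
			       names[i], qoff, qoff + io_queues[i] - 1);
		else
			printf("%-7s: hw queues %u..%u, IRQ vectors %u..%u\n",
			       names[i], qoff, qoff + io_queues[i] - 1,
			       offset, offset + io_queues[i] - 1);
		qoff += io_queues[i];
		offset += io_queues[i];
	}
	return 0;
}
```
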
| 476 | +/* |
|---|
| 477 | + * Write sq tail if we are asked to, or if the next command would wrap. |
|---|
| 478 | + */ |
|---|
| 479 | +static inline void nvme_write_sq_db(struct nvme_queue *nvmeq, bool write_sq) |
|---|
| 480 | +{ |
|---|
| 481 | + if (!write_sq) { |
|---|
| 482 | + u16 next_tail = nvmeq->sq_tail + 1; |
|---|
| 483 | + |
|---|
| 484 | + if (next_tail == nvmeq->q_depth) |
|---|
| 485 | + next_tail = 0; |
|---|
| 486 | + if (next_tail != nvmeq->last_sq_tail) |
|---|
| 487 | + return; |
|---|
| 488 | + } |
|---|
| 489 | + |
|---|
| 490 | + if (nvme_dbbuf_update_and_check_event(nvmeq->sq_tail, |
|---|
| 491 | + nvmeq->dbbuf_sq_db, nvmeq->dbbuf_sq_ei)) |
|---|
| 492 | + writel(nvmeq->sq_tail, nvmeq->q_db); |
|---|
| 493 | + nvmeq->last_sq_tail = nvmeq->sq_tail; |
|---|
| 455 | 494 | } |
|---|
| 456 | 495 | |
|---|
| 457 | 496 | /** |
|---|
| 458 | 497 | * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell |
|---|
| 459 | 498 | * @nvmeq: The queue to use |
|---|
| 460 | 499 | * @cmd: The command to send |
|---|
| 500 | + * @write_sq: whether to write to the SQ doorbell |
|---|
| 461 | 501 | */ |
|---|
| 462 | | -static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) |
|---|
| 502 | +static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd, |
|---|
| 503 | + bool write_sq) |
|---|
| 463 | 504 | { |
|---|
| 464 | 505 | spin_lock(&nvmeq->sq_lock); |
|---|
| 465 | | - if (nvmeq->sq_cmds_io) |
|---|
| 466 | | - memcpy_toio(&nvmeq->sq_cmds_io[nvmeq->sq_tail], cmd, |
|---|
| 467 | | - sizeof(*cmd)); |
|---|
| 468 | | - else |
|---|
| 469 | | - memcpy(&nvmeq->sq_cmds[nvmeq->sq_tail], cmd, sizeof(*cmd)); |
|---|
| 470 | | - |
|---|
| 506 | + memcpy(nvmeq->sq_cmds + (nvmeq->sq_tail << nvmeq->sqes), |
|---|
| 507 | + cmd, sizeof(*cmd)); |
|---|
| 471 | 508 | if (++nvmeq->sq_tail == nvmeq->q_depth) |
|---|
| 472 | 509 | nvmeq->sq_tail = 0; |
|---|
| 473 | | - if (nvme_dbbuf_update_and_check_event(nvmeq->sq_tail, |
|---|
| 474 | | - nvmeq->dbbuf_sq_db, nvmeq->dbbuf_sq_ei)) |
|---|
| 475 | | - writel(nvmeq->sq_tail, nvmeq->q_db); |
|---|
| 510 | + nvme_write_sq_db(nvmeq, write_sq); |
|---|
| 511 | + spin_unlock(&nvmeq->sq_lock); |
|---|
| 512 | +} |
|---|
| 513 | + |
|---|
| 514 | +static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx) |
|---|
| 515 | +{ |
|---|
| 516 | + struct nvme_queue *nvmeq = hctx->driver_data; |
|---|
| 517 | + |
|---|
| 518 | + spin_lock(&nvmeq->sq_lock); |
|---|
| 519 | + if (nvmeq->sq_tail != nvmeq->last_sq_tail) |
|---|
| 520 | + nvme_write_sq_db(nvmeq, true); |
|---|
| 476 | 521 | spin_unlock(&nvmeq->sq_lock); |
|---|
| 477 | 522 | } |
|---|
| 478 | 523 | |
|---|
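
nvme_write_sq_db() and nvme_commit_rqs() above defer the submission-queue doorbell until the end of a batch (or until the tail would wrap past the last value the device saw). A toy model of that batching, with an arbitrary queue depth and batch size:

```c
#include <stdbool.h>
#include <stdio.h>

struct sq {
	unsigned int q_depth;
	unsigned int sq_tail;
	unsigned int last_sq_tail;	/* last value written to the doorbell */
	unsigned int doorbell_writes;
};

static void write_sq_db(struct sq *q, bool write_sq)
{
	if (!write_sq) {
		unsigned int next_tail = q->sq_tail + 1;

		if (next_tail == q->q_depth)
			next_tail = 0;
		if (next_tail != q->last_sq_tail)
			return;		/* keep batching */
	}
	q->doorbell_writes++;		/* stands in for writel(sq_tail, q_db) */
	q->last_sq_tail = q->sq_tail;
}

static void submit(struct sq *q, bool last)
{
	if (++q->sq_tail == q->q_depth)
		q->sq_tail = 0;
	write_sq_db(q, last);
}

int main(void)
{
	struct sq q = { .q_depth = 64 };
	int i;

	for (i = 0; i < 8; i++)
		submit(&q, i == 7);	/* only the final request is marked "last" */

	/* Expect a single doorbell write for the whole batch of 8. */
	printf("doorbell writes: %u\n", q.doorbell_writes);
	return 0;
}
```
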
| .. | .. |
|---|
| 488 | 533 | int nseg = blk_rq_nr_phys_segments(req); |
|---|
| 489 | 534 | unsigned int avg_seg_size; |
|---|
| 490 | 535 | |
|---|
| 491 | | - if (nseg == 0) |
|---|
| 492 | | - return false; |
|---|
| 493 | | - |
|---|
| 494 | 536 | avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg); |
|---|
| 495 | 537 | |
|---|
| 496 | 538 | if (!(dev->ctrl.sgls & ((1 << 0) | (1 << 1)))) |
|---|
| .. | .. |
|---|
| 502 | 544 | return true; |
|---|
| 503 | 545 | } |
|---|
| 504 | 546 | |
|---|
| 505 | | -static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev) |
|---|
| 547 | +static void nvme_free_prps(struct nvme_dev *dev, struct request *req) |
|---|
| 506 | 548 | { |
|---|
| 507 | | - struct nvme_iod *iod = blk_mq_rq_to_pdu(rq); |
|---|
| 508 | | - int nseg = blk_rq_nr_phys_segments(rq); |
|---|
| 509 | | - unsigned int size = blk_rq_payload_bytes(rq); |
|---|
| 510 | | - |
|---|
| 511 | | - iod->use_sgl = nvme_pci_use_sgls(dev, rq); |
|---|
| 512 | | - |
|---|
| 513 | | - if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) { |
|---|
| 514 | | - iod->sg = mempool_alloc(dev->iod_mempool, GFP_ATOMIC); |
|---|
| 515 | | - if (!iod->sg) |
|---|
| 516 | | - return BLK_STS_RESOURCE; |
|---|
| 517 | | - } else { |
|---|
| 518 | | - iod->sg = iod->inline_sg; |
|---|
| 519 | | - } |
|---|
| 520 | | - |
|---|
| 521 | | - iod->aborted = 0; |
|---|
| 522 | | - iod->npages = -1; |
|---|
| 523 | | - iod->nents = 0; |
|---|
| 524 | | - iod->length = size; |
|---|
| 525 | | - |
|---|
| 526 | | - return BLK_STS_OK; |
|---|
| 527 | | -} |
|---|
| 528 | | - |
|---|
| 529 | | -static void nvme_free_iod(struct nvme_dev *dev, struct request *req) |
|---|
| 530 | | -{ |
|---|
| 549 | + const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1; |
|---|
| 531 | 550 | struct nvme_iod *iod = blk_mq_rq_to_pdu(req); |
|---|
| 532 | | - const int last_prp = dev->ctrl.page_size / sizeof(__le64) - 1; |
|---|
| 533 | | - dma_addr_t dma_addr = iod->first_dma, next_dma_addr; |
|---|
| 534 | | - |
|---|
| 551 | + dma_addr_t dma_addr = iod->first_dma; |
|---|
| 535 | 552 | int i; |
|---|
| 536 | 553 | |
|---|
| 537 | | - if (iod->npages == 0) |
|---|
| 538 | | - dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0], |
|---|
| 539 | | - dma_addr); |
|---|
| 540 | | - |
|---|
| 541 | 554 | for (i = 0; i < iod->npages; i++) { |
|---|
| 542 | | - void *addr = nvme_pci_iod_list(req)[i]; |
|---|
| 555 | + __le64 *prp_list = nvme_pci_iod_list(req)[i]; |
|---|
| 556 | + dma_addr_t next_dma_addr = le64_to_cpu(prp_list[last_prp]); |
|---|
| 543 | 557 | |
|---|
| 544 | | - if (iod->use_sgl) { |
|---|
| 545 | | - struct nvme_sgl_desc *sg_list = addr; |
|---|
| 546 | | - |
|---|
| 547 | | - next_dma_addr = |
|---|
| 548 | | - le64_to_cpu((sg_list[SGES_PER_PAGE - 1]).addr); |
|---|
| 549 | | - } else { |
|---|
| 550 | | - __le64 *prp_list = addr; |
|---|
| 551 | | - |
|---|
| 552 | | - next_dma_addr = le64_to_cpu(prp_list[last_prp]); |
|---|
| 553 | | - } |
|---|
| 554 | | - |
|---|
| 555 | | - dma_pool_free(dev->prp_page_pool, addr, dma_addr); |
|---|
| 558 | + dma_pool_free(dev->prp_page_pool, prp_list, dma_addr); |
|---|
| 556 | 559 | dma_addr = next_dma_addr; |
|---|
| 557 | 560 | } |
|---|
| 558 | 561 | |
|---|
| 559 | | - if (iod->sg != iod->inline_sg) |
|---|
| 560 | | - mempool_free(iod->sg, dev->iod_mempool); |
|---|
| 562 | +} |
|---|
| 563 | + |
|---|
| 564 | +static void nvme_free_sgls(struct nvme_dev *dev, struct request *req) |
|---|
| 565 | +{ |
|---|
| 566 | + const int last_sg = SGES_PER_PAGE - 1; |
|---|
| 567 | + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); |
|---|
| 568 | + dma_addr_t dma_addr = iod->first_dma; |
|---|
| 569 | + int i; |
|---|
| 570 | + |
|---|
| 571 | + for (i = 0; i < iod->npages; i++) { |
|---|
| 572 | + struct nvme_sgl_desc *sg_list = nvme_pci_iod_list(req)[i]; |
|---|
| 573 | + dma_addr_t next_dma_addr = le64_to_cpu((sg_list[last_sg]).addr); |
|---|
| 574 | + |
|---|
| 575 | + dma_pool_free(dev->prp_page_pool, sg_list, dma_addr); |
|---|
| 576 | + dma_addr = next_dma_addr; |
|---|
| 577 | + } |
|---|
| 578 | + |
|---|
| 579 | +} |
|---|
| 580 | + |
|---|
| 581 | +static void nvme_unmap_sg(struct nvme_dev *dev, struct request *req) |
|---|
| 582 | +{ |
|---|
| 583 | + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); |
|---|
| 584 | + |
|---|
| 585 | + if (is_pci_p2pdma_page(sg_page(iod->sg))) |
|---|
| 586 | + pci_p2pdma_unmap_sg(dev->dev, iod->sg, iod->nents, |
|---|
| 587 | + rq_dma_dir(req)); |
|---|
| 588 | + else |
|---|
| 589 | + dma_unmap_sg(dev->dev, iod->sg, iod->nents, rq_dma_dir(req)); |
|---|
| 590 | +} |
|---|
| 591 | + |
|---|
| 592 | +static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) |
|---|
| 593 | +{ |
|---|
| 594 | + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); |
|---|
| 595 | + |
|---|
| 596 | + if (iod->dma_len) { |
|---|
| 597 | + dma_unmap_page(dev->dev, iod->first_dma, iod->dma_len, |
|---|
| 598 | + rq_dma_dir(req)); |
|---|
| 599 | + return; |
|---|
| 600 | + } |
|---|
| 601 | + |
|---|
| 602 | + WARN_ON_ONCE(!iod->nents); |
|---|
| 603 | + |
|---|
| 604 | + nvme_unmap_sg(dev, req); |
|---|
| 605 | + if (iod->npages == 0) |
|---|
| 606 | + dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0], |
|---|
| 607 | + iod->first_dma); |
|---|
| 608 | + else if (iod->use_sgl) |
|---|
| 609 | + nvme_free_sgls(dev, req); |
|---|
| 610 | + else |
|---|
| 611 | + nvme_free_prps(dev, req); |
|---|
| 612 | + mempool_free(iod->sg, dev->iod_mempool); |
|---|
| 561 | 613 | } |
|---|
| 562 | 614 | |
|---|
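
nvme_free_prps() above walks the chained PRP pages by reading the last 8-byte slot of each page, which holds the bus address of the next page. A standalone illustration of that chain walk; ordinary heap pointers stand in for DMA addresses, and the page/slot sizes are assumptions:

```c
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define SLOTS_PER_PAGE	512			/* 4 KiB page / 8-byte PRP entry */
#define LAST_PRP	(SLOTS_PER_PAGE - 1)

int main(void)
{
	uint64_t *pages[3];
	uint64_t addr;
	int i, npages = 3;

	/* Build the chain: page[i]'s last slot points at page[i + 1]. */
	for (i = 0; i < npages; i++)
		pages[i] = calloc(SLOTS_PER_PAGE, sizeof(uint64_t));
	for (i = 0; i < npages - 1; i++)
		pages[i][LAST_PRP] = (uintptr_t)pages[i + 1];

	/* Walk and free it the same way the driver does. */
	addr = (uintptr_t)pages[0];
	for (i = 0; i < npages; i++) {
		uint64_t *prp_list = (uint64_t *)(uintptr_t)addr;
		uint64_t next = prp_list[LAST_PRP];

		printf("freeing PRP page %d at %p\n", i, (void *)prp_list);
		free(prp_list);
		addr = next;
	}
	return 0;
}
```
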
| 563 | 615 | static void nvme_print_sgl(struct scatterlist *sgl, int nents) |
|---|
| .. | .. |
|---|
| 583 | 635 | struct scatterlist *sg = iod->sg; |
|---|
| 584 | 636 | int dma_len = sg_dma_len(sg); |
|---|
| 585 | 637 | u64 dma_addr = sg_dma_address(sg); |
|---|
| 586 | | - u32 page_size = dev->ctrl.page_size; |
|---|
| 587 | | - int offset = dma_addr & (page_size - 1); |
|---|
| 638 | + int offset = dma_addr & (NVME_CTRL_PAGE_SIZE - 1); |
|---|
| 588 | 639 | __le64 *prp_list; |
|---|
| 589 | 640 | void **list = nvme_pci_iod_list(req); |
|---|
| 590 | 641 | dma_addr_t prp_dma; |
|---|
| 591 | 642 | int nprps, i; |
|---|
| 592 | 643 | |
|---|
| 593 | | - length -= (page_size - offset); |
|---|
| 644 | + length -= (NVME_CTRL_PAGE_SIZE - offset); |
|---|
| 594 | 645 | if (length <= 0) { |
|---|
| 595 | 646 | iod->first_dma = 0; |
|---|
| 596 | 647 | goto done; |
|---|
| 597 | 648 | } |
|---|
| 598 | 649 | |
|---|
| 599 | | - dma_len -= (page_size - offset); |
|---|
| 650 | + dma_len -= (NVME_CTRL_PAGE_SIZE - offset); |
|---|
| 600 | 651 | if (dma_len) { |
|---|
| 601 | | - dma_addr += (page_size - offset); |
|---|
| 652 | + dma_addr += (NVME_CTRL_PAGE_SIZE - offset); |
|---|
| 602 | 653 | } else { |
|---|
| 603 | 654 | sg = sg_next(sg); |
|---|
| 604 | 655 | dma_addr = sg_dma_address(sg); |
|---|
| 605 | 656 | dma_len = sg_dma_len(sg); |
|---|
| 606 | 657 | } |
|---|
| 607 | 658 | |
|---|
| 608 | | - if (length <= page_size) { |
|---|
| 659 | + if (length <= NVME_CTRL_PAGE_SIZE) { |
|---|
| 609 | 660 | iod->first_dma = dma_addr; |
|---|
| 610 | 661 | goto done; |
|---|
| 611 | 662 | } |
|---|
| 612 | 663 | |
|---|
| 613 | | - nprps = DIV_ROUND_UP(length, page_size); |
|---|
| 664 | + nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE); |
|---|
| 614 | 665 | if (nprps <= (256 / 8)) { |
|---|
| 615 | 666 | pool = dev->prp_small_pool; |
|---|
| 616 | 667 | iod->npages = 0; |
|---|
| .. | .. |
|---|
| 629 | 680 | iod->first_dma = prp_dma; |
|---|
| 630 | 681 | i = 0; |
|---|
| 631 | 682 | for (;;) { |
|---|
| 632 | | - if (i == page_size >> 3) { |
|---|
| 683 | + if (i == NVME_CTRL_PAGE_SIZE >> 3) { |
|---|
| 633 | 684 | __le64 *old_prp_list = prp_list; |
|---|
| 634 | 685 | prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); |
|---|
| 635 | 686 | if (!prp_list) |
|---|
| 636 | | - return BLK_STS_RESOURCE; |
|---|
| 687 | + goto free_prps; |
|---|
| 637 | 688 | list[iod->npages++] = prp_list; |
|---|
| 638 | 689 | prp_list[0] = old_prp_list[i - 1]; |
|---|
| 639 | 690 | old_prp_list[i - 1] = cpu_to_le64(prp_dma); |
|---|
| 640 | 691 | i = 1; |
|---|
| 641 | 692 | } |
|---|
| 642 | 693 | prp_list[i++] = cpu_to_le64(dma_addr); |
|---|
| 643 | | - dma_len -= page_size; |
|---|
| 644 | | - dma_addr += page_size; |
|---|
| 645 | | - length -= page_size; |
|---|
| 694 | + dma_len -= NVME_CTRL_PAGE_SIZE; |
|---|
| 695 | + dma_addr += NVME_CTRL_PAGE_SIZE; |
|---|
| 696 | + length -= NVME_CTRL_PAGE_SIZE; |
|---|
| 646 | 697 | if (length <= 0) |
|---|
| 647 | 698 | break; |
|---|
| 648 | 699 | if (dma_len > 0) |
|---|
| .. | .. |
|---|
| 653 | 704 | dma_addr = sg_dma_address(sg); |
|---|
| 654 | 705 | dma_len = sg_dma_len(sg); |
|---|
| 655 | 706 | } |
|---|
| 656 | | - |
|---|
| 657 | 707 | done: |
|---|
| 658 | 708 | cmnd->dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); |
|---|
| 659 | 709 | cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma); |
|---|
| 660 | | - |
|---|
| 661 | 710 | return BLK_STS_OK; |
|---|
| 662 | | - |
|---|
| 663 | | - bad_sgl: |
|---|
| 711 | +free_prps: |
|---|
| 712 | + nvme_free_prps(dev, req); |
|---|
| 713 | + return BLK_STS_RESOURCE; |
|---|
| 714 | +bad_sgl: |
|---|
| 664 | 715 | WARN(DO_ONCE(nvme_print_sgl, iod->sg, iod->nents), |
|---|
| 665 | 716 | "Invalid SGL for payload:%d nents:%d\n", |
|---|
| 666 | 717 | blk_rq_payload_bytes(req), iod->nents); |
|---|
| .. | .. |
|---|
| 732 | 783 | |
|---|
| 733 | 784 | sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma); |
|---|
| 734 | 785 | if (!sg_list) |
|---|
| 735 | | - return BLK_STS_RESOURCE; |
|---|
| 786 | + goto free_sgls; |
|---|
| 736 | 787 | |
|---|
| 737 | 788 | i = 0; |
|---|
| 738 | 789 | nvme_pci_iod_list(req)[iod->npages++] = sg_list; |
|---|
| .. | .. |
|---|
| 745 | 796 | } while (--entries > 0); |
|---|
| 746 | 797 | |
|---|
| 747 | 798 | return BLK_STS_OK; |
|---|
| 799 | +free_sgls: |
|---|
| 800 | + nvme_free_sgls(dev, req); |
|---|
| 801 | + return BLK_STS_RESOURCE; |
|---|
| 802 | +} |
|---|
| 803 | + |
|---|
| 804 | +static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev, |
|---|
| 805 | + struct request *req, struct nvme_rw_command *cmnd, |
|---|
| 806 | + struct bio_vec *bv) |
|---|
| 807 | +{ |
|---|
| 808 | + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); |
|---|
| 809 | + unsigned int offset = bv->bv_offset & (NVME_CTRL_PAGE_SIZE - 1); |
|---|
| 810 | + unsigned int first_prp_len = NVME_CTRL_PAGE_SIZE - offset; |
|---|
| 811 | + |
|---|
| 812 | + iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0); |
|---|
| 813 | + if (dma_mapping_error(dev->dev, iod->first_dma)) |
|---|
| 814 | + return BLK_STS_RESOURCE; |
|---|
| 815 | + iod->dma_len = bv->bv_len; |
|---|
| 816 | + |
|---|
| 817 | + cmnd->dptr.prp1 = cpu_to_le64(iod->first_dma); |
|---|
| 818 | + if (bv->bv_len > first_prp_len) |
|---|
| 819 | + cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma + first_prp_len); |
|---|
| 820 | + else |
|---|
| 821 | + cmnd->dptr.prp2 = 0; |
|---|
| 822 | + return BLK_STS_OK; |
|---|
| 823 | +} |
|---|
| 824 | + |
|---|
| 825 | +static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev, |
|---|
| 826 | + struct request *req, struct nvme_rw_command *cmnd, |
|---|
| 827 | + struct bio_vec *bv) |
|---|
| 828 | +{ |
|---|
| 829 | + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); |
|---|
| 830 | + |
|---|
| 831 | + iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0); |
|---|
| 832 | + if (dma_mapping_error(dev->dev, iod->first_dma)) |
|---|
| 833 | + return BLK_STS_RESOURCE; |
|---|
| 834 | + iod->dma_len = bv->bv_len; |
|---|
| 835 | + |
|---|
| 836 | + cmnd->flags = NVME_CMD_SGL_METABUF; |
|---|
| 837 | + cmnd->dptr.sgl.addr = cpu_to_le64(iod->first_dma); |
|---|
| 838 | + cmnd->dptr.sgl.length = cpu_to_le32(iod->dma_len); |
|---|
| 839 | + cmnd->dptr.sgl.type = NVME_SGL_FMT_DATA_DESC << 4; |
|---|
| 840 | + return BLK_STS_OK; |
|---|
| 748 | 841 | } |
|---|
| 749 | 842 | |
|---|
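
nvme_setup_prp_simple() above handles the single-segment fast path: a buffer spanning at most two controller pages needs only prp1 and, if it crosses the first page boundary, prp2. A small model of that calculation; the 4 KiB controller page size and the addresses/lengths are example values:

```c
#include <stdint.h>
#include <stdio.h>

#define CTRL_PAGE_SIZE	4096u	/* illustrative NVME_CTRL_PAGE_SIZE */

static void setup_prp_simple(uint64_t dma_addr, unsigned int offset,
			     unsigned int len)
{
	unsigned int first_prp_len = CTRL_PAGE_SIZE - (offset & (CTRL_PAGE_SIZE - 1));
	uint64_t prp1 = dma_addr;
	uint64_t prp2 = len > first_prp_len ? dma_addr + first_prp_len : 0;

	printf("len %5u @ offset %4u -> prp1 0x%llx, prp2 0x%llx\n",
	       len, offset, (unsigned long long)prp1, (unsigned long long)prp2);
}

int main(void)
{
	/* Fits in the first page: prp2 stays zero. */
	setup_prp_simple(0x100000ull + 512, 512, 2048);
	/* Crosses into a second page: prp2 points at the page boundary. */
	setup_prp_simple(0x100000ull + 512, 512, 4096);
	return 0;
}
```
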
| 750 | 843 | static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, |
|---|
| 751 | 844 | struct nvme_command *cmnd) |
|---|
| 752 | 845 | { |
|---|
| 753 | 846 | struct nvme_iod *iod = blk_mq_rq_to_pdu(req); |
|---|
| 754 | | - struct request_queue *q = req->q; |
|---|
| 755 | | - enum dma_data_direction dma_dir = rq_data_dir(req) ? |
|---|
| 756 | | - DMA_TO_DEVICE : DMA_FROM_DEVICE; |
|---|
| 757 | | - blk_status_t ret = BLK_STS_IOERR; |
|---|
| 847 | + blk_status_t ret = BLK_STS_RESOURCE; |
|---|
| 758 | 848 | int nr_mapped; |
|---|
| 759 | 849 | |
|---|
| 850 | + if (blk_rq_nr_phys_segments(req) == 1) { |
|---|
| 851 | + struct bio_vec bv = req_bvec(req); |
|---|
| 852 | + |
|---|
| 853 | + if (!is_pci_p2pdma_page(bv.bv_page)) { |
|---|
| 854 | + if (bv.bv_offset + bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2) |
|---|
| 855 | + return nvme_setup_prp_simple(dev, req, |
|---|
| 856 | + &cmnd->rw, &bv); |
|---|
| 857 | + |
|---|
| 858 | + if (iod->nvmeq->qid && sgl_threshold && |
|---|
| 859 | + dev->ctrl.sgls & ((1 << 0) | (1 << 1))) |
|---|
| 860 | + return nvme_setup_sgl_simple(dev, req, |
|---|
| 861 | + &cmnd->rw, &bv); |
|---|
| 862 | + } |
|---|
| 863 | + } |
|---|
| 864 | + |
|---|
| 865 | + iod->dma_len = 0; |
|---|
| 866 | + iod->sg = mempool_alloc(dev->iod_mempool, GFP_ATOMIC); |
|---|
| 867 | + if (!iod->sg) |
|---|
| 868 | + return BLK_STS_RESOURCE; |
|---|
| 760 | 869 | sg_init_table(iod->sg, blk_rq_nr_phys_segments(req)); |
|---|
| 761 | | - iod->nents = blk_rq_map_sg(q, req, iod->sg); |
|---|
| 870 | + iod->nents = blk_rq_map_sg(req->q, req, iod->sg); |
|---|
| 762 | 871 | if (!iod->nents) |
|---|
| 763 | | - goto out; |
|---|
| 872 | + goto out_free_sg; |
|---|
| 764 | 873 | |
|---|
| 765 | | - ret = BLK_STS_RESOURCE; |
|---|
| 766 | | - nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents, dma_dir, |
|---|
| 767 | | - DMA_ATTR_NO_WARN); |
|---|
| 874 | + if (is_pci_p2pdma_page(sg_page(iod->sg))) |
|---|
| 875 | + nr_mapped = pci_p2pdma_map_sg_attrs(dev->dev, iod->sg, |
|---|
| 876 | + iod->nents, rq_dma_dir(req), DMA_ATTR_NO_WARN); |
|---|
| 877 | + else |
|---|
| 878 | + nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents, |
|---|
| 879 | + rq_dma_dir(req), DMA_ATTR_NO_WARN); |
|---|
| 768 | 880 | if (!nr_mapped) |
|---|
| 769 | | - goto out; |
|---|
| 881 | + goto out_free_sg; |
|---|
| 770 | 882 | |
|---|
| 883 | + iod->use_sgl = nvme_pci_use_sgls(dev, req); |
|---|
| 771 | 884 | if (iod->use_sgl) |
|---|
| 772 | 885 | ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw, nr_mapped); |
|---|
| 773 | 886 | else |
|---|
| 774 | 887 | ret = nvme_pci_setup_prps(dev, req, &cmnd->rw); |
|---|
| 775 | | - |
|---|
| 776 | 888 | if (ret != BLK_STS_OK) |
|---|
| 777 | | - goto out_unmap; |
|---|
| 778 | | - |
|---|
| 779 | | - ret = BLK_STS_IOERR; |
|---|
| 780 | | - if (blk_integrity_rq(req)) { |
|---|
| 781 | | - if (blk_rq_count_integrity_sg(q, req->bio) != 1) |
|---|
| 782 | | - goto out_unmap; |
|---|
| 783 | | - |
|---|
| 784 | | - sg_init_table(&iod->meta_sg, 1); |
|---|
| 785 | | - if (blk_rq_map_integrity_sg(q, req->bio, &iod->meta_sg) != 1) |
|---|
| 786 | | - goto out_unmap; |
|---|
| 787 | | - |
|---|
| 788 | | - if (!dma_map_sg(dev->dev, &iod->meta_sg, 1, dma_dir)) |
|---|
| 789 | | - goto out_unmap; |
|---|
| 790 | | - } |
|---|
| 791 | | - |
|---|
| 792 | | - if (blk_integrity_rq(req)) |
|---|
| 793 | | - cmnd->rw.metadata = cpu_to_le64(sg_dma_address(&iod->meta_sg)); |
|---|
| 889 | + goto out_unmap_sg; |
|---|
| 794 | 890 | return BLK_STS_OK; |
|---|
| 795 | 891 | |
|---|
| 796 | | -out_unmap: |
|---|
| 797 | | - dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir); |
|---|
| 798 | | -out: |
|---|
| 892 | +out_unmap_sg: |
|---|
| 893 | + nvme_unmap_sg(dev, req); |
|---|
| 894 | +out_free_sg: |
|---|
| 895 | + mempool_free(iod->sg, dev->iod_mempool); |
|---|
| 799 | 896 | return ret; |
|---|
| 800 | 897 | } |
|---|
| 801 | 898 | |
|---|
| 802 | | -static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) |
|---|
| 899 | +static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req, |
|---|
| 900 | + struct nvme_command *cmnd) |
|---|
| 803 | 901 | { |
|---|
| 804 | 902 | struct nvme_iod *iod = blk_mq_rq_to_pdu(req); |
|---|
| 805 | | - enum dma_data_direction dma_dir = rq_data_dir(req) ? |
|---|
| 806 | | - DMA_TO_DEVICE : DMA_FROM_DEVICE; |
|---|
| 807 | 903 | |
|---|
| 808 | | - if (iod->nents) { |
|---|
| 809 | | - dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir); |
|---|
| 810 | | - if (blk_integrity_rq(req)) |
|---|
| 811 | | - dma_unmap_sg(dev->dev, &iod->meta_sg, 1, dma_dir); |
|---|
| 812 | | - } |
|---|
| 813 | | - |
|---|
| 814 | | - nvme_cleanup_cmd(req); |
|---|
| 815 | | - nvme_free_iod(dev, req); |
|---|
| 904 | + iod->meta_dma = dma_map_bvec(dev->dev, rq_integrity_vec(req), |
|---|
| 905 | + rq_dma_dir(req), 0); |
|---|
| 906 | + if (dma_mapping_error(dev->dev, iod->meta_dma)) |
|---|
| 907 | + return BLK_STS_IOERR; |
|---|
| 908 | + cmnd->rw.metadata = cpu_to_le64(iod->meta_dma); |
|---|
| 909 | + return BLK_STS_OK; |
|---|
| 816 | 910 | } |
|---|
| 817 | 911 | |
|---|
| 818 | 912 | /* |
|---|
| .. | .. |
|---|
| 825 | 919 | struct nvme_queue *nvmeq = hctx->driver_data; |
|---|
| 826 | 920 | struct nvme_dev *dev = nvmeq->dev; |
|---|
| 827 | 921 | struct request *req = bd->rq; |
|---|
| 828 | | - struct nvme_command cmnd; |
|---|
| 922 | + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); |
|---|
| 923 | + struct nvme_command *cmnd = &iod->cmd; |
|---|
| 829 | 924 | blk_status_t ret; |
|---|
| 925 | + |
|---|
| 926 | + iod->aborted = 0; |
|---|
| 927 | + iod->npages = -1; |
|---|
| 928 | + iod->nents = 0; |
|---|
| 830 | 929 | |
|---|
| 831 | 930 | /* |
|---|
| 832 | 931 | * We should not need to do this, but we're still using this to |
|---|
| 833 | 932 | * ensure we can drain requests on a dying queue. |
|---|
| 834 | 933 | */ |
|---|
| 835 | | - if (unlikely(nvmeq->cq_vector < 0)) |
|---|
| 934 | + if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags))) |
|---|
| 836 | 935 | return BLK_STS_IOERR; |
|---|
| 837 | 936 | |
|---|
| 838 | | - ret = nvme_setup_cmd(ns, req, &cmnd); |
|---|
| 937 | + ret = nvme_setup_cmd(ns, req, cmnd); |
|---|
| 839 | 938 | if (ret) |
|---|
| 840 | 939 | return ret; |
|---|
| 841 | 940 | |
|---|
| 842 | | - ret = nvme_init_iod(req, dev); |
|---|
| 843 | | - if (ret) |
|---|
| 844 | | - goto out_free_cmd; |
|---|
| 845 | | - |
|---|
| 846 | 941 | if (blk_rq_nr_phys_segments(req)) { |
|---|
| 847 | | - ret = nvme_map_data(dev, req, &cmnd); |
|---|
| 942 | + ret = nvme_map_data(dev, req, cmnd); |
|---|
| 848 | 943 | if (ret) |
|---|
| 849 | | - goto out_cleanup_iod; |
|---|
| 944 | + goto out_free_cmd; |
|---|
| 945 | + } |
|---|
| 946 | + |
|---|
| 947 | + if (blk_integrity_rq(req)) { |
|---|
| 948 | + ret = nvme_map_metadata(dev, req, cmnd); |
|---|
| 949 | + if (ret) |
|---|
| 950 | + goto out_unmap_data; |
|---|
| 850 | 951 | } |
|---|
| 851 | 952 | |
|---|
| 852 | 953 | blk_mq_start_request(req); |
|---|
| 853 | | - nvme_submit_cmd(nvmeq, &cmnd); |
|---|
| 954 | + nvme_submit_cmd(nvmeq, cmnd, bd->last); |
|---|
| 854 | 955 | return BLK_STS_OK; |
|---|
| 855 | | -out_cleanup_iod: |
|---|
| 856 | | - nvme_free_iod(dev, req); |
|---|
| 956 | +out_unmap_data: |
|---|
| 957 | + nvme_unmap_data(dev, req); |
|---|
| 857 | 958 | out_free_cmd: |
|---|
| 858 | 959 | nvme_cleanup_cmd(req); |
|---|
| 859 | 960 | return ret; |
|---|
| .. | .. |
|---|
| 862 | 963 | static void nvme_pci_complete_rq(struct request *req) |
|---|
| 863 | 964 | { |
|---|
| 864 | 965 | struct nvme_iod *iod = blk_mq_rq_to_pdu(req); |
|---|
| 966 | + struct nvme_dev *dev = iod->nvmeq->dev; |
|---|
| 865 | 967 | |
|---|
| 866 | | - nvme_unmap_data(iod->nvmeq->dev, req); |
|---|
| 968 | + if (blk_integrity_rq(req)) |
|---|
| 969 | + dma_unmap_page(dev->dev, iod->meta_dma, |
|---|
| 970 | + rq_integrity_vec(req)->bv_len, rq_data_dir(req)); |
|---|
| 971 | + if (blk_rq_nr_phys_segments(req)) |
|---|
| 972 | + nvme_unmap_data(dev, req); |
|---|
| 867 | 973 | nvme_complete_rq(req); |
|---|
| 868 | 974 | } |
|---|
| 869 | 975 | |
|---|
| 870 | 976 | /* We read the CQE phase first to check if the rest of the entry is valid */ |
|---|
| 871 | 977 | static inline bool nvme_cqe_pending(struct nvme_queue *nvmeq) |
|---|
| 872 | 978 | { |
|---|
| 873 | | - return (le16_to_cpu(nvmeq->cqes[nvmeq->cq_head].status) & 1) == |
|---|
| 874 | | - nvmeq->cq_phase; |
|---|
| 979 | + struct nvme_completion *hcqe = &nvmeq->cqes[nvmeq->cq_head]; |
|---|
| 980 | + |
|---|
| 981 | + return (le16_to_cpu(READ_ONCE(hcqe->status)) & 1) == nvmeq->cq_phase; |
|---|
| 875 | 982 | } |
|---|
| 876 | 983 | |
|---|
| 877 | 984 | static inline void nvme_ring_cq_doorbell(struct nvme_queue *nvmeq) |
|---|
| .. | .. |
|---|
| 883 | 990 | writel(head, nvmeq->q_db + nvmeq->dev->db_stride); |
|---|
| 884 | 991 | } |
|---|
| 885 | 992 | |
|---|
| 993 | +static inline struct blk_mq_tags *nvme_queue_tagset(struct nvme_queue *nvmeq) |
|---|
| 994 | +{ |
|---|
| 995 | + if (!nvmeq->qid) |
|---|
| 996 | + return nvmeq->dev->admin_tagset.tags[0]; |
|---|
| 997 | + return nvmeq->dev->tagset.tags[nvmeq->qid - 1]; |
|---|
| 998 | +} |
|---|
| 999 | + |
|---|
| 886 | 1000 | static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx) |
|---|
| 887 | 1001 | { |
|---|
| 888 | | - volatile struct nvme_completion *cqe = &nvmeq->cqes[idx]; |
|---|
| 1002 | + struct nvme_completion *cqe = &nvmeq->cqes[idx]; |
|---|
| 1003 | + __u16 command_id = READ_ONCE(cqe->command_id); |
|---|
| 889 | 1004 | struct request *req; |
|---|
| 890 | | - |
|---|
| 891 | | - if (unlikely(cqe->command_id >= nvmeq->q_depth)) { |
|---|
| 892 | | - dev_warn(nvmeq->dev->ctrl.device, |
|---|
| 893 | | - "invalid id %d completed on queue %d\n", |
|---|
| 894 | | - cqe->command_id, le16_to_cpu(cqe->sq_id)); |
|---|
| 895 | | - return; |
|---|
| 896 | | - } |
|---|
| 897 | 1005 | |
|---|
| 898 | 1006 | /* |
|---|
| 899 | 1007 | * AEN requests are special as they don't time out and can |
|---|
| .. | .. |
|---|
| 901 | 1009 | * aborts. We don't even bother to allocate a struct request |
|---|
| 902 | 1010 | * for them but rather special case them here. |
|---|
| 903 | 1011 | */ |
|---|
| 904 | | - if (unlikely(nvmeq->qid == 0 && |
|---|
| 905 | | - cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) { |
|---|
| 1012 | + if (unlikely(nvme_is_aen_req(nvmeq->qid, command_id))) { |
|---|
| 906 | 1013 | nvme_complete_async_event(&nvmeq->dev->ctrl, |
|---|
| 907 | 1014 | cqe->status, &cqe->result); |
|---|
| 908 | 1015 | return; |
|---|
| 909 | 1016 | } |
|---|
| 910 | 1017 | |
|---|
| 911 | | - req = blk_mq_tag_to_rq(*nvmeq->tags, cqe->command_id); |
|---|
| 912 | | - nvme_end_request(req, cqe->status, cqe->result); |
|---|
| 913 | | -} |
|---|
| 914 | | - |
|---|
| 915 | | -static void nvme_complete_cqes(struct nvme_queue *nvmeq, u16 start, u16 end) |
|---|
| 916 | | -{ |
|---|
| 917 | | - while (start != end) { |
|---|
| 918 | | - nvme_handle_cqe(nvmeq, start); |
|---|
| 919 | | - if (++start == nvmeq->q_depth) |
|---|
| 920 | | - start = 0; |
|---|
| 1018 | + req = nvme_find_rq(nvme_queue_tagset(nvmeq), command_id); |
|---|
| 1019 | + if (unlikely(!req)) { |
|---|
| 1020 | + dev_warn(nvmeq->dev->ctrl.device, |
|---|
| 1021 | + "invalid id %d completed on queue %d\n", |
|---|
| 1022 | + command_id, le16_to_cpu(cqe->sq_id)); |
|---|
| 1023 | + return; |
|---|
| 921 | 1024 | } |
|---|
| 1025 | + |
|---|
| 1026 | + trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail); |
|---|
| 1027 | + if (!nvme_try_complete_req(req, cqe->status, cqe->result)) |
|---|
| 1028 | + nvme_pci_complete_rq(req); |
|---|
| 922 | 1029 | } |
|---|
| 923 | 1030 | |
|---|
| 924 | 1031 | static inline void nvme_update_cq_head(struct nvme_queue *nvmeq) |
|---|
| 925 | 1032 | { |
|---|
| 926 | | - if (nvmeq->cq_head == nvmeq->q_depth - 1) { |
|---|
| 1033 | + u32 tmp = nvmeq->cq_head + 1; |
|---|
| 1034 | + |
|---|
| 1035 | + if (tmp == nvmeq->q_depth) { |
|---|
| 927 | 1036 | nvmeq->cq_head = 0; |
|---|
| 928 | | - nvmeq->cq_phase = !nvmeq->cq_phase; |
|---|
| 1037 | + nvmeq->cq_phase ^= 1; |
|---|
| 929 | 1038 | } else { |
|---|
| 930 | | - nvmeq->cq_head++; |
|---|
| 1039 | + nvmeq->cq_head = tmp; |
|---|
| 931 | 1040 | } |
|---|
| 932 | 1041 | } |
|---|
| 933 | 1042 | |
|---|
| 934 | | -static inline bool nvme_process_cq(struct nvme_queue *nvmeq, u16 *start, |
|---|
| 935 | | - u16 *end, int tag) |
|---|
| 1043 | +static inline int nvme_process_cq(struct nvme_queue *nvmeq) |
|---|
| 936 | 1044 | { |
|---|
| 937 | | - bool found = false; |
|---|
| 1045 | + int found = 0; |
|---|
| 938 | 1046 | |
|---|
| 939 | | - *start = nvmeq->cq_head; |
|---|
| 940 | | - while (!found && nvme_cqe_pending(nvmeq)) { |
|---|
| 941 | | - if (nvmeq->cqes[nvmeq->cq_head].command_id == tag) |
|---|
| 942 | | - found = true; |
|---|
| 1047 | + while (nvme_cqe_pending(nvmeq)) { |
|---|
| 1048 | + found++; |
|---|
| 1049 | + /* |
|---|
| 1050 | + * load-load control dependency between phase and the rest of |
|---|
| 1051 | + * the cqe requires a full read memory barrier |
|---|
| 1052 | + */ |
|---|
| 1053 | + dma_rmb(); |
|---|
| 1054 | + nvme_handle_cqe(nvmeq, nvmeq->cq_head); |
|---|
| 943 | 1055 | nvme_update_cq_head(nvmeq); |
|---|
| 944 | 1056 | } |
|---|
| 945 | | - *end = nvmeq->cq_head; |
|---|
| 946 | 1057 | |
|---|
| 947 | | - if (*start != *end) |
|---|
| 1058 | + if (found) |
|---|
| 948 | 1059 | nvme_ring_cq_doorbell(nvmeq); |
|---|
| 949 | 1060 | return found; |
|---|
| 950 | 1061 | } |
|---|
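
nvme_cqe_pending(), nvme_update_cq_head() and nvme_process_cq() above rely on the completion queue's phase bit: the device inverts the bit it posts on every pass around the ring, and the driver flips its expected phase whenever the head wraps. A toy producer/consumer model of that mechanism; the queue depth and batching are arbitrary:

```c
#include <stdint.h>
#include <stdio.h>

#define Q_DEPTH	4

struct cqe { uint16_t status; };		/* bit 0 is the phase bit */

static struct cqe cq[Q_DEPTH];			/* starts zeroed: phase 0 everywhere */
static unsigned int dev_tail;			/* device-side producer index */
static unsigned int dev_phase = 1;
static unsigned int cq_head;			/* driver-side consumer index */
static unsigned int cq_phase = 1;

static void device_post(void)
{
	cq[dev_tail].status = dev_phase;	/* post with the current phase */
	if (++dev_tail == Q_DEPTH) {
		dev_tail = 0;
		dev_phase ^= 1;
	}
}

static int process_cq(void)
{
	int found = 0;

	while ((cq[cq_head].status & 1) == cq_phase) {
		found++;
		if (++cq_head == Q_DEPTH) {
			cq_head = 0;
			cq_phase ^= 1;		/* expect the opposite phase next lap */
		}
	}
	return found;
}

int main(void)
{
	int total = 0, i;

	for (i = 0; i < 3; i++)
		device_post();
	total += process_cq();			/* 3 entries seen with phase 1 */

	for (i = 0; i < 3; i++)			/* wraps the ring; posted phase flips to 0 */
		device_post();
	total += process_cq();			/* 3 more, detected via the flipped phase */

	printf("completions reaped: %d\n", total);	/* prints 6 */
	return 0;
}
```
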
| .. | .. |
|---|
| 953 | 1064 | { |
|---|
| 954 | 1065 | struct nvme_queue *nvmeq = data; |
|---|
| 955 | 1066 | irqreturn_t ret = IRQ_NONE; |
|---|
| 956 | | - u16 start, end; |
|---|
| 957 | 1067 | |
|---|
| 958 | | - spin_lock(&nvmeq->cq_lock); |
|---|
| 959 | | - if (nvmeq->cq_head != nvmeq->last_cq_head) |
|---|
| 1068 | + /* |
|---|
| 1069 | + * The rmb/wmb pair ensures we see all updates from a previous run of |
|---|
| 1070 | + * the irq handler, even if that was on another CPU. |
|---|
| 1071 | + */ |
|---|
| 1072 | + rmb(); |
|---|
| 1073 | + if (nvme_process_cq(nvmeq)) |
|---|
| 960 | 1074 | ret = IRQ_HANDLED; |
|---|
| 961 | | - nvme_process_cq(nvmeq, &start, &end, -1); |
|---|
| 962 | | - nvmeq->last_cq_head = nvmeq->cq_head; |
|---|
| 963 | | - spin_unlock(&nvmeq->cq_lock); |
|---|
| 964 | | - |
|---|
| 965 | | - if (start != end) { |
|---|
| 966 | | - nvme_complete_cqes(nvmeq, start, end); |
|---|
| 967 | | - return IRQ_HANDLED; |
|---|
| 968 | | - } |
|---|
| 1075 | + wmb(); |
|---|
| 969 | 1076 | |
|---|
| 970 | 1077 | return ret; |
|---|
| 971 | 1078 | } |
|---|
| .. | .. |
|---|
| 973 | 1080 | static irqreturn_t nvme_irq_check(int irq, void *data) |
|---|
| 974 | 1081 | { |
|---|
| 975 | 1082 | struct nvme_queue *nvmeq = data; |
|---|
| 1083 | + |
|---|
| 976 | 1084 | if (nvme_cqe_pending(nvmeq)) |
|---|
| 977 | 1085 | return IRQ_WAKE_THREAD; |
|---|
| 978 | 1086 | return IRQ_NONE; |
|---|
| 979 | 1087 | } |
|---|
| 980 | 1088 | |
|---|
| 981 | | -static int __nvme_poll(struct nvme_queue *nvmeq, unsigned int tag) |
|---|
| 1089 | +/* |
|---|
| 1090 | + * Poll for completions for any interrupt driven queue |
|---|
| 1091 | + * Can be called from any context. |
|---|
| 1092 | + */ |
|---|
| 1093 | +static void nvme_poll_irqdisable(struct nvme_queue *nvmeq) |
|---|
| 982 | 1094 | { |
|---|
| 983 | | - u16 start, end; |
|---|
| 1095 | + struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev); |
|---|
| 1096 | + |
|---|
| 1097 | + WARN_ON_ONCE(test_bit(NVMEQ_POLLED, &nvmeq->flags)); |
|---|
| 1098 | + |
|---|
| 1099 | + disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector)); |
|---|
| 1100 | + nvme_process_cq(nvmeq); |
|---|
| 1101 | + enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector)); |
|---|
| 1102 | +} |
|---|
| 1103 | + |
|---|
| 1104 | +static int nvme_poll(struct blk_mq_hw_ctx *hctx) |
|---|
| 1105 | +{ |
|---|
| 1106 | + struct nvme_queue *nvmeq = hctx->driver_data; |
|---|
| 984 | 1107 | bool found; |
|---|
| 985 | 1108 | |
|---|
| 986 | 1109 | if (!nvme_cqe_pending(nvmeq)) |
|---|
| 987 | 1110 | return 0; |
|---|
| 988 | 1111 | |
|---|
| 989 | | - spin_lock_irq(&nvmeq->cq_lock); |
|---|
| 990 | | - found = nvme_process_cq(nvmeq, &start, &end, tag); |
|---|
| 991 | | - spin_unlock_irq(&nvmeq->cq_lock); |
|---|
| 1112 | + spin_lock(&nvmeq->cq_poll_lock); |
|---|
| 1113 | + found = nvme_process_cq(nvmeq); |
|---|
| 1114 | + spin_unlock(&nvmeq->cq_poll_lock); |
|---|
| 992 | 1115 | |
|---|
| 993 | | - nvme_complete_cqes(nvmeq, start, end); |
|---|
| 994 | 1116 | return found; |
|---|
| 995 | | -} |
|---|
| 996 | | - |
|---|
| 997 | | -static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) |
|---|
| 998 | | -{ |
|---|
| 999 | | - struct nvme_queue *nvmeq = hctx->driver_data; |
|---|
| 1000 | | - |
|---|
| 1001 | | - return __nvme_poll(nvmeq, tag); |
|---|
| 1002 | 1117 | } |
|---|
| 1003 | 1118 | |
|---|
| 1004 | 1119 | static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl) |
|---|
| .. | .. |
|---|
| 1010 | 1125 | memset(&c, 0, sizeof(c)); |
|---|
| 1011 | 1126 | c.common.opcode = nvme_admin_async_event; |
|---|
| 1012 | 1127 | c.common.command_id = NVME_AQ_BLK_MQ_DEPTH; |
|---|
| 1013 | | - nvme_submit_cmd(nvmeq, &c); |
|---|
| 1128 | + nvme_submit_cmd(nvmeq, &c, true); |
|---|
| 1014 | 1129 | } |
|---|
| 1015 | 1130 | |
|---|
| 1016 | 1131 | static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) |
|---|
| .. | .. |
|---|
| 1028 | 1143 | struct nvme_queue *nvmeq, s16 vector) |
|---|
| 1029 | 1144 | { |
|---|
| 1030 | 1145 | struct nvme_command c; |
|---|
| 1031 | | - int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED; |
|---|
| 1146 | + int flags = NVME_QUEUE_PHYS_CONTIG; |
|---|
| 1147 | + |
|---|
| 1148 | + if (!test_bit(NVMEQ_POLLED, &nvmeq->flags)) |
|---|
| 1149 | + flags |= NVME_CQ_IRQ_ENABLED; |
|---|
| 1032 | 1150 | |
|---|
| 1033 | 1151 | /* |
|---|
| 1034 | 1152 | * Note: we (ab)use the fact that the prp fields survive if no data |
|---|
| .. | .. |
|---|
| 1098 | 1216 | |
|---|
| 1099 | 1217 | static bool nvme_should_reset(struct nvme_dev *dev, u32 csts) |
|---|
| 1100 | 1218 | { |
|---|
| 1101 | | - |
|---|
| 1102 | 1219 | /* If true, indicates loss of adapter communication, possibly by a |
|---|
| 1103 | 1220 | * NVMe Subsystem reset. |
|---|
| 1104 | 1221 | */ |
|---|
| .. | .. |
|---|
| 1147 | 1264 | struct nvme_dev *dev = nvmeq->dev; |
|---|
| 1148 | 1265 | struct request *abort_req; |
|---|
| 1149 | 1266 | struct nvme_command cmd; |
|---|
| 1150 | | - bool shutdown = false; |
|---|
| 1151 | 1267 | u32 csts = readl(dev->bar + NVME_REG_CSTS); |
|---|
| 1152 | 1268 | |
|---|
| 1153 | 1269 | /* If PCI error recovery process is happening, we cannot reset or |
|---|
| .. | .. |
|---|
| 1170 | 1286 | /* |
|---|
| 1171 | 1287 | * Did we miss an interrupt? |
|---|
| 1172 | 1288 | */ |
|---|
| 1173 | | - if (__nvme_poll(nvmeq, req->tag)) { |
|---|
| 1289 | + if (test_bit(NVMEQ_POLLED, &nvmeq->flags)) |
|---|
| 1290 | + nvme_poll(req->mq_hctx); |
|---|
| 1291 | + else |
|---|
| 1292 | + nvme_poll_irqdisable(nvmeq); |
|---|
| 1293 | + |
|---|
| 1294 | + if (blk_mq_request_completed(req)) { |
|---|
| 1174 | 1295 | dev_warn(dev->ctrl.device, |
|---|
| 1175 | 1296 | "I/O %d QID %d timeout, completion polled\n", |
|---|
| 1176 | 1297 | req->tag, nvmeq->qid); |
|---|
| .. | .. |
|---|
| 1184 | 1305 | * shutdown, so we return BLK_EH_DONE. |
|---|
| 1185 | 1306 | */ |
|---|
| 1186 | 1307 | switch (dev->ctrl.state) { |
|---|
| 1187 | | - case NVME_CTRL_DELETING: |
|---|
| 1188 | | - shutdown = true; |
|---|
| 1189 | 1308 | case NVME_CTRL_CONNECTING: |
|---|
| 1190 | | - case NVME_CTRL_RESETTING: |
|---|
| 1309 | + nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); |
|---|
| 1310 | + fallthrough; |
|---|
| 1311 | + case NVME_CTRL_DELETING: |
|---|
| 1191 | 1312 | dev_warn_ratelimited(dev->ctrl.device, |
|---|
| 1192 | 1313 | "I/O %d QID %d timeout, disable controller\n", |
|---|
| 1193 | 1314 | req->tag, nvmeq->qid); |
|---|
| 1194 | | - nvme_dev_disable(dev, shutdown); |
|---|
| 1195 | 1315 | nvme_req(req)->flags |= NVME_REQ_CANCELLED; |
|---|
| 1316 | + nvme_dev_disable(dev, true); |
|---|
| 1196 | 1317 | return BLK_EH_DONE; |
|---|
| 1318 | + case NVME_CTRL_RESETTING: |
|---|
| 1319 | + return BLK_EH_RESET_TIMER; |
|---|
| 1197 | 1320 | default: |
|---|
| 1198 | 1321 | break; |
|---|
| 1199 | 1322 | } |
|---|
| 1200 | 1323 | |
|---|
| 1201 | 1324 | /* |
|---|
| 1202 | | - * Shutdown the controller immediately and schedule a reset if the |
|---|
| 1203 | | - * command was already aborted once before and still hasn't been |
|---|
| 1204 | | - * returned to the driver, or if this is the admin queue. |
|---|
| 1325 | + * Shutdown the controller immediately and schedule a reset if the |
|---|
| 1326 | + * command was already aborted once before and still hasn't been |
|---|
| 1327 | + * returned to the driver, or if this is the admin queue. |
|---|
| 1205 | 1328 | */ |
|---|
| 1206 | 1329 | if (!nvmeq->qid || iod->aborted) { |
|---|
| 1207 | 1330 | dev_warn(dev->ctrl.device, |
|---|
| 1208 | 1331 | "I/O %d QID %d timeout, reset controller\n", |
|---|
| 1209 | 1332 | req->tag, nvmeq->qid); |
|---|
| 1333 | + nvme_req(req)->flags |= NVME_REQ_CANCELLED; |
|---|
| 1210 | 1334 | nvme_dev_disable(dev, false); |
|---|
| 1211 | 1335 | nvme_reset_ctrl(&dev->ctrl); |
|---|
| 1212 | 1336 | |
|---|
| 1213 | | - nvme_req(req)->flags |= NVME_REQ_CANCELLED; |
|---|
| 1214 | 1337 | return BLK_EH_DONE; |
|---|
| 1215 | 1338 | } |
|---|
| 1216 | 1339 | |
|---|
| .. | .. |
|---|
| 1222 | 1345 | |
|---|
| 1223 | 1346 | memset(&cmd, 0, sizeof(cmd)); |
|---|
| 1224 | 1347 | cmd.abort.opcode = nvme_admin_abort_cmd; |
|---|
| 1225 | | - cmd.abort.cid = req->tag; |
|---|
| 1348 | + cmd.abort.cid = nvme_cid(req); |
|---|
| 1226 | 1349 | cmd.abort.sqid = cpu_to_le16(nvmeq->qid); |
|---|
| 1227 | 1350 | |
|---|
| 1228 | 1351 | dev_warn(nvmeq->dev->ctrl.device, |
|---|
| .. | .. |
|---|
| 1230 | 1353 | req->tag, nvmeq->qid); |
|---|
| 1231 | 1354 | |
|---|
| 1232 | 1355 | abort_req = nvme_alloc_request(dev->ctrl.admin_q, &cmd, |
|---|
| 1233 | | - BLK_MQ_REQ_NOWAIT, NVME_QID_ANY); |
|---|
| 1356 | + BLK_MQ_REQ_NOWAIT); |
|---|
| 1234 | 1357 | if (IS_ERR(abort_req)) { |
|---|
| 1235 | 1358 | atomic_inc(&dev->ctrl.abort_limit); |
|---|
| 1236 | 1359 | return BLK_EH_RESET_TIMER; |
|---|
| 1237 | 1360 | } |
|---|
| 1238 | 1361 | |
|---|
| 1239 | | - abort_req->timeout = ADMIN_TIMEOUT; |
|---|
| 1240 | 1362 | abort_req->end_io_data = NULL; |
|---|
| 1241 | 1363 | blk_execute_rq_nowait(abort_req->q, NULL, abort_req, 0, abort_endio); |
|---|
| 1242 | 1364 | |
|---|
| .. | .. |
|---|
| 1250 | 1372 | |
|---|
| 1251 | 1373 | static void nvme_free_queue(struct nvme_queue *nvmeq) |
|---|
| 1252 | 1374 | { |
|---|
| 1253 | | - dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), |
|---|
| 1375 | + dma_free_coherent(nvmeq->dev->dev, CQ_SIZE(nvmeq), |
|---|
| 1254 | 1376 | (void *)nvmeq->cqes, nvmeq->cq_dma_addr); |
|---|
| 1255 | | - if (nvmeq->sq_cmds) |
|---|
| 1256 | | - dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), |
|---|
| 1257 | | - nvmeq->sq_cmds, nvmeq->sq_dma_addr); |
|---|
| 1377 | + if (!nvmeq->sq_cmds) |
|---|
| 1378 | + return; |
|---|
| 1379 | + |
|---|
| 1380 | + if (test_and_clear_bit(NVMEQ_SQ_CMB, &nvmeq->flags)) { |
|---|
| 1381 | + pci_free_p2pmem(to_pci_dev(nvmeq->dev->dev), |
|---|
| 1382 | + nvmeq->sq_cmds, SQ_SIZE(nvmeq)); |
|---|
| 1383 | + } else { |
|---|
| 1384 | + dma_free_coherent(nvmeq->dev->dev, SQ_SIZE(nvmeq), |
|---|
| 1385 | + nvmeq->sq_cmds, nvmeq->sq_dma_addr); |
|---|
| 1386 | + } |
|---|
| 1258 | 1387 | } |
|---|
| 1259 | 1388 | |
|---|
| 1260 | 1389 | static void nvme_free_queues(struct nvme_dev *dev, int lowest) |
|---|
| .. | .. |
|---|
| 1269 | 1398 | |
|---|
| 1270 | 1399 | /** |
|---|
| 1271 | 1400 | * nvme_suspend_queue - put queue into suspended state |
|---|
| 1272 | | - * @nvmeq - queue to suspend |
|---|
| 1401 | + * @nvmeq: queue to suspend |
|---|
| 1273 | 1402 | */ |
|---|
| 1274 | 1403 | static int nvme_suspend_queue(struct nvme_queue *nvmeq) |
|---|
| 1275 | 1404 | { |
|---|
| 1276 | | - int vector; |
|---|
| 1277 | | - |
|---|
| 1278 | | - spin_lock_irq(&nvmeq->cq_lock); |
|---|
| 1279 | | - if (nvmeq->cq_vector == -1) { |
|---|
| 1280 | | - spin_unlock_irq(&nvmeq->cq_lock); |
|---|
| 1405 | + if (!test_and_clear_bit(NVMEQ_ENABLED, &nvmeq->flags)) |
|---|
| 1281 | 1406 | return 1; |
|---|
| 1282 | | - } |
|---|
| 1283 | | - vector = nvmeq->cq_vector; |
|---|
| 1284 | | - nvmeq->dev->online_queues--; |
|---|
| 1285 | | - nvmeq->cq_vector = -1; |
|---|
| 1286 | | - spin_unlock_irq(&nvmeq->cq_lock); |
|---|
| 1287 | 1407 | |
|---|
| 1288 | | - /* |
|---|
| 1289 | | - * Ensure that nvme_queue_rq() sees it ->cq_vector == -1 without |
|---|
| 1290 | | - * having to grab the lock. |
|---|
| 1291 | | - */ |
|---|
| 1408 | + /* ensure that nvme_queue_rq() sees NVMEQ_ENABLED cleared */ |
|---|
| 1292 | 1409 | mb(); |
|---|
| 1293 | 1410 | |
|---|
| 1411 | + nvmeq->dev->online_queues--; |
|---|
| 1294 | 1412 | if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q) |
|---|
| 1295 | 1413 | blk_mq_quiesce_queue(nvmeq->dev->ctrl.admin_q); |
|---|
| 1296 | | - |
|---|
| 1297 | | - pci_free_irq(to_pci_dev(nvmeq->dev->dev), vector, nvmeq); |
|---|
| 1298 | | - |
|---|
| 1414 | + if (!test_and_clear_bit(NVMEQ_POLLED, &nvmeq->flags)) |
|---|
| 1415 | + pci_free_irq(to_pci_dev(nvmeq->dev->dev), nvmeq->cq_vector, nvmeq); |
|---|
| 1299 | 1416 | return 0; |
|---|
| 1417 | +} |
|---|
| 1418 | + |
|---|
| 1419 | +static void nvme_suspend_io_queues(struct nvme_dev *dev) |
|---|
| 1420 | +{ |
|---|
| 1421 | + int i; |
|---|
| 1422 | + |
|---|
| 1423 | + for (i = dev->ctrl.queue_count - 1; i > 0; i--) |
|---|
| 1424 | + nvme_suspend_queue(&dev->queues[i]); |
|---|
| 1300 | 1425 | } |
|---|
| 1301 | 1426 | |
|---|
| 1302 | 1427 | static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown) |
|---|
| 1303 | 1428 | { |
|---|
| 1304 | 1429 | struct nvme_queue *nvmeq = &dev->queues[0]; |
|---|
| 1305 | | - u16 start, end; |
|---|
| 1306 | 1430 | |
|---|
| 1307 | 1431 | if (shutdown) |
|---|
| 1308 | 1432 | nvme_shutdown_ctrl(&dev->ctrl); |
|---|
| 1309 | 1433 | else |
|---|
| 1310 | | - nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap); |
|---|
| 1434 | + nvme_disable_ctrl(&dev->ctrl); |
|---|
| 1311 | 1435 | |
|---|
| 1312 | | - spin_lock_irq(&nvmeq->cq_lock); |
|---|
| 1313 | | - nvme_process_cq(nvmeq, &start, &end, -1); |
|---|
| 1314 | | - spin_unlock_irq(&nvmeq->cq_lock); |
|---|
| 1436 | + nvme_poll_irqdisable(nvmeq); |
|---|
| 1437 | +} |
|---|
| 1315 | 1438 | |
|---|
| 1316 | | - nvme_complete_cqes(nvmeq, start, end); |
|---|
| 1439 | +/* |
|---|
| 1440 | + * Called only on a device that has been disabled and after all other threads |
|---|
| 1441 | + * that can check this device's completion queues have synced, except |
|---|
| 1442 | + * nvme_poll(). This is the last chance for the driver to see a natural |
|---|
| 1443 | + * completion before nvme_cancel_request() terminates all incomplete requests. |
|---|
| 1444 | + */ |
|---|
| 1445 | +static void nvme_reap_pending_cqes(struct nvme_dev *dev) |
|---|
| 1446 | +{ |
|---|
| 1447 | + int i; |
|---|
| 1448 | + |
|---|
| 1449 | + for (i = dev->ctrl.queue_count - 1; i > 0; i--) { |
|---|
| 1450 | + spin_lock(&dev->queues[i].cq_poll_lock); |
|---|
| 1451 | + nvme_process_cq(&dev->queues[i]); |
|---|
| 1452 | + spin_unlock(&dev->queues[i].cq_poll_lock); |
|---|
| 1453 | + } |
|---|
| 1317 | 1454 | } |
|---|
| 1318 | 1455 | |
|---|
| 1319 | 1456 | static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues, |
|---|
| .. | .. |
|---|
| 1321 | 1458 | { |
|---|
| 1322 | 1459 | int q_depth = dev->q_depth; |
|---|
| 1323 | 1460 | unsigned q_size_aligned = roundup(q_depth * entry_size, |
|---|
| 1324 | | - dev->ctrl.page_size); |
|---|
| 1461 | + NVME_CTRL_PAGE_SIZE); |
|---|
| 1325 | 1462 | |
|---|
| 1326 | 1463 | if (q_size_aligned * nr_io_queues > dev->cmb_size) { |
|---|
| 1327 | 1464 | u64 mem_per_q = div_u64(dev->cmb_size, nr_io_queues); |
|---|
| 1328 | | - mem_per_q = round_down(mem_per_q, dev->ctrl.page_size); |
|---|
| 1465 | + |
|---|
| 1466 | + mem_per_q = round_down(mem_per_q, NVME_CTRL_PAGE_SIZE); |
|---|
| 1329 | 1467 | q_depth = div_u64(mem_per_q, entry_size); |
|---|
| 1330 | 1468 | |
|---|
| 1331 | 1469 | /* |
|---|
| .. | .. |
|---|
| 1341 | 1479 | } |
|---|
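The depth clamp in nvme_cmb_qdepth() is plain integer arithmetic, so it can be sanity-checked outside the driver. Below is a minimal userspace sketch with made-up inputs (a hypothetical 256 KiB CMB, 8 I/O queues and 64-byte submission entries; CTRL_PAGE_SIZE stands in for NVME_CTRL_PAGE_SIZE and none of these values come from real hardware):

```c
/* Standalone sketch of the nvme_cmb_qdepth() arithmetic (illustrative values only). */
#include <stdint.h>
#include <stdio.h>

#define CTRL_PAGE_SIZE 4096ULL          /* stand-in for NVME_CTRL_PAGE_SIZE */

static uint64_t round_up(uint64_t v, uint64_t a)   { return ((v + a - 1) / a) * a; }
static uint64_t round_down(uint64_t v, uint64_t a) { return (v / a) * a; }

int main(void)
{
        uint64_t q_depth = 1024;        /* requested entries per queue */
        uint64_t entry_size = 64;       /* bytes per submission queue entry */
        uint64_t nr_io_queues = 8;
        uint64_t cmb_size = 256 * 1024; /* hypothetical 256 KiB CMB */

        uint64_t q_size_aligned = round_up(q_depth * entry_size, CTRL_PAGE_SIZE);

        if (q_size_aligned * nr_io_queues > cmb_size) {
                /* Give every queue an equal, page-aligned share of the CMB. */
                uint64_t mem_per_q = round_down(cmb_size / nr_io_queues,
                                                CTRL_PAGE_SIZE);

                q_depth = mem_per_q / entry_size;
        }

        printf("effective queue depth: %llu\n", (unsigned long long)q_depth);
        return 0;
}
```

With these inputs every queue gets a page-aligned 32 KiB slice of the CMB, so the effective depth drops from 1024 to 512.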
| 1342 | 1480 | |
|---|
| 1343 | 1481 | static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, |
|---|
| 1344 | | - int qid, int depth) |
|---|
| 1482 | + int qid) |
|---|
| 1345 | 1483 | { |
|---|
| 1346 | | - /* CMB SQEs will be mapped before creation */ |
|---|
| 1347 | | - if (qid && dev->cmb && use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) |
|---|
| 1348 | | - return 0; |
|---|
| 1484 | + struct pci_dev *pdev = to_pci_dev(dev->dev); |
|---|
| 1349 | 1485 | |
|---|
| 1350 | | - nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth), |
|---|
| 1351 | | - &nvmeq->sq_dma_addr, GFP_KERNEL); |
|---|
| 1486 | + if (qid && dev->cmb_use_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) { |
|---|
| 1487 | + nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(nvmeq)); |
|---|
| 1488 | + if (nvmeq->sq_cmds) { |
|---|
| 1489 | + nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev, |
|---|
| 1490 | + nvmeq->sq_cmds); |
|---|
| 1491 | + if (nvmeq->sq_dma_addr) { |
|---|
| 1492 | + set_bit(NVMEQ_SQ_CMB, &nvmeq->flags); |
|---|
| 1493 | + return 0; |
|---|
| 1494 | + } |
|---|
| 1495 | + |
|---|
| 1496 | + pci_free_p2pmem(pdev, nvmeq->sq_cmds, SQ_SIZE(nvmeq)); |
|---|
| 1497 | + } |
|---|
| 1498 | + } |
|---|
| 1499 | + |
|---|
| 1500 | + nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(nvmeq), |
|---|
| 1501 | + &nvmeq->sq_dma_addr, GFP_KERNEL); |
|---|
| 1352 | 1502 | if (!nvmeq->sq_cmds) |
|---|
| 1353 | 1503 | return -ENOMEM; |
|---|
| 1354 | 1504 | return 0; |
|---|
| .. | .. |
|---|
| 1361 | 1511 | if (dev->ctrl.queue_count > qid) |
|---|
| 1362 | 1512 | return 0; |
|---|
| 1363 | 1513 | |
|---|
| 1364 | | - nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth), |
|---|
| 1365 | | - &nvmeq->cq_dma_addr, GFP_KERNEL); |
|---|
| 1514 | + nvmeq->sqes = qid ? dev->io_sqes : NVME_ADM_SQES; |
|---|
| 1515 | + nvmeq->q_depth = depth; |
|---|
| 1516 | + nvmeq->cqes = dma_alloc_coherent(dev->dev, CQ_SIZE(nvmeq), |
|---|
| 1517 | + &nvmeq->cq_dma_addr, GFP_KERNEL); |
|---|
| 1366 | 1518 | if (!nvmeq->cqes) |
|---|
| 1367 | 1519 | goto free_nvmeq; |
|---|
| 1368 | 1520 | |
|---|
| 1369 | | - if (nvme_alloc_sq_cmds(dev, nvmeq, qid, depth)) |
|---|
| 1521 | + if (nvme_alloc_sq_cmds(dev, nvmeq, qid)) |
|---|
| 1370 | 1522 | goto free_cqdma; |
|---|
| 1371 | 1523 | |
|---|
| 1372 | | - nvmeq->q_dmadev = dev->dev; |
|---|
| 1373 | 1524 | nvmeq->dev = dev; |
|---|
| 1374 | 1525 | spin_lock_init(&nvmeq->sq_lock); |
|---|
| 1375 | | - spin_lock_init(&nvmeq->cq_lock); |
|---|
| 1526 | + spin_lock_init(&nvmeq->cq_poll_lock); |
|---|
| 1376 | 1527 | nvmeq->cq_head = 0; |
|---|
| 1377 | 1528 | nvmeq->cq_phase = 1; |
|---|
| 1378 | 1529 | nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; |
|---|
| 1379 | | - nvmeq->q_depth = depth; |
|---|
| 1380 | 1530 | nvmeq->qid = qid; |
|---|
| 1381 | | - nvmeq->cq_vector = -1; |
|---|
| 1382 | 1531 | dev->ctrl.queue_count++; |
|---|
| 1383 | 1532 | |
|---|
| 1384 | 1533 | return 0; |
|---|
| 1385 | 1534 | |
|---|
| 1386 | 1535 | free_cqdma: |
|---|
| 1387 | | - dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes, |
|---|
| 1388 | | - nvmeq->cq_dma_addr); |
|---|
| 1536 | + dma_free_coherent(dev->dev, CQ_SIZE(nvmeq), (void *)nvmeq->cqes, |
|---|
| 1537 | + nvmeq->cq_dma_addr); |
|---|
| 1389 | 1538 | free_nvmeq: |
|---|
| 1390 | 1539 | return -ENOMEM; |
|---|
| 1391 | 1540 | } |
|---|
| .. | .. |
|---|
| 1408 | 1557 | { |
|---|
| 1409 | 1558 | struct nvme_dev *dev = nvmeq->dev; |
|---|
| 1410 | 1559 | |
|---|
| 1411 | | - spin_lock_irq(&nvmeq->cq_lock); |
|---|
| 1412 | 1560 | nvmeq->sq_tail = 0; |
|---|
| 1561 | + nvmeq->last_sq_tail = 0; |
|---|
| 1413 | 1562 | nvmeq->cq_head = 0; |
|---|
| 1414 | 1563 | nvmeq->cq_phase = 1; |
|---|
| 1415 | 1564 | nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; |
|---|
| 1416 | | - memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth)); |
|---|
| 1565 | + memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq)); |
|---|
| 1417 | 1566 | nvme_dbbuf_init(dev, nvmeq, qid); |
|---|
| 1418 | 1567 | dev->online_queues++; |
|---|
| 1419 | | - spin_unlock_irq(&nvmeq->cq_lock); |
|---|
| 1568 | + wmb(); /* ensure the first interrupt sees the initialization */ |
|---|
| 1420 | 1569 | } |
|---|
| 1421 | 1570 | |
|---|
| 1422 | | -static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) |
|---|
| 1571 | +static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled) |
|---|
| 1423 | 1572 | { |
|---|
| 1424 | 1573 | struct nvme_dev *dev = nvmeq->dev; |
|---|
| 1425 | 1574 | int result; |
|---|
| 1426 | | - s16 vector; |
|---|
| 1575 | + u16 vector = 0; |
|---|
| 1427 | 1576 | |
|---|
| 1428 | | - if (dev->cmb && use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) { |
|---|
| 1429 | | - unsigned offset = (qid - 1) * roundup(SQ_SIZE(nvmeq->q_depth), |
|---|
| 1430 | | - dev->ctrl.page_size); |
|---|
| 1431 | | - nvmeq->sq_dma_addr = dev->cmb_bus_addr + offset; |
|---|
| 1432 | | - nvmeq->sq_cmds_io = dev->cmb + offset; |
|---|
| 1433 | | - } |
|---|
| 1577 | + clear_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags); |
|---|
| 1434 | 1578 | |
|---|
| 1435 | 1579 | /* |
|---|
| 1436 | 1580 | * A queue's vector matches the queue identifier unless the controller |
|---|
| 1437 | 1581 | * has only one vector available. |
|---|
| 1438 | 1582 | */ |
|---|
| 1439 | | - vector = dev->num_vecs == 1 ? 0 : qid; |
|---|
| 1583 | + if (!polled) |
|---|
| 1584 | + vector = dev->num_vecs == 1 ? 0 : qid; |
|---|
| 1585 | + else |
|---|
| 1586 | + set_bit(NVMEQ_POLLED, &nvmeq->flags); |
|---|
| 1587 | + |
|---|
| 1440 | 1588 | result = adapter_alloc_cq(dev, qid, nvmeq, vector); |
|---|
| 1441 | 1589 | if (result) |
|---|
| 1442 | 1590 | return result; |
|---|
| .. | .. |
|---|
| 1444 | 1592 | result = adapter_alloc_sq(dev, qid, nvmeq); |
|---|
| 1445 | 1593 | if (result < 0) |
|---|
| 1446 | 1594 | return result; |
|---|
| 1447 | | - else if (result) |
|---|
| 1595 | + if (result) |
|---|
| 1448 | 1596 | goto release_cq; |
|---|
| 1449 | 1597 | |
|---|
| 1450 | | - /* |
|---|
| 1451 | | - * Set cq_vector after alloc cq/sq, otherwise nvme_suspend_queue will |
|---|
| 1452 | | - * invoke free_irq for it and cause a 'Trying to free already-free IRQ |
|---|
| 1453 | | - * xxx' warning if the create CQ/SQ command times out. |
|---|
| 1454 | | - */ |
|---|
| 1455 | 1598 | nvmeq->cq_vector = vector; |
|---|
| 1456 | 1599 | nvme_init_queue(nvmeq, qid); |
|---|
| 1457 | | - result = queue_request_irq(nvmeq); |
|---|
| 1458 | | - if (result < 0) |
|---|
| 1459 | | - goto release_sq; |
|---|
| 1460 | 1600 | |
|---|
| 1601 | + if (!polled) { |
|---|
| 1602 | + result = queue_request_irq(nvmeq); |
|---|
| 1603 | + if (result < 0) |
|---|
| 1604 | + goto release_sq; |
|---|
| 1605 | + } |
|---|
| 1606 | + |
|---|
| 1607 | + set_bit(NVMEQ_ENABLED, &nvmeq->flags); |
|---|
| 1461 | 1608 | return result; |
|---|
| 1462 | 1609 | |
|---|
| 1463 | 1610 | release_sq: |
|---|
| 1464 | | - nvmeq->cq_vector = -1; |
|---|
| 1465 | 1611 | dev->online_queues--; |
|---|
| 1466 | 1612 | adapter_delete_sq(dev, qid); |
|---|
| 1467 | 1613 | release_cq: |
|---|
| .. | .. |
|---|
| 1473 | 1619 | .queue_rq = nvme_queue_rq, |
|---|
| 1474 | 1620 | .complete = nvme_pci_complete_rq, |
|---|
| 1475 | 1621 | .init_hctx = nvme_admin_init_hctx, |
|---|
| 1476 | | - .exit_hctx = nvme_admin_exit_hctx, |
|---|
| 1477 | 1622 | .init_request = nvme_init_request, |
|---|
| 1478 | 1623 | .timeout = nvme_timeout, |
|---|
| 1479 | 1624 | }; |
|---|
| .. | .. |
|---|
| 1481 | 1626 | static const struct blk_mq_ops nvme_mq_ops = { |
|---|
| 1482 | 1627 | .queue_rq = nvme_queue_rq, |
|---|
| 1483 | 1628 | .complete = nvme_pci_complete_rq, |
|---|
| 1629 | + .commit_rqs = nvme_commit_rqs, |
|---|
| 1484 | 1630 | .init_hctx = nvme_init_hctx, |
|---|
| 1485 | 1631 | .init_request = nvme_init_request, |
|---|
| 1486 | 1632 | .map_queues = nvme_pci_map_queues, |
|---|
| .. | .. |
|---|
| 1510 | 1656 | |
|---|
| 1511 | 1657 | dev->admin_tagset.queue_depth = NVME_AQ_MQ_TAG_DEPTH; |
|---|
| 1512 | 1658 | dev->admin_tagset.timeout = ADMIN_TIMEOUT; |
|---|
| 1513 | | - dev->admin_tagset.numa_node = dev_to_node(dev->dev); |
|---|
| 1514 | | - dev->admin_tagset.cmd_size = nvme_pci_cmd_size(dev, false); |
|---|
| 1659 | + dev->admin_tagset.numa_node = dev->ctrl.numa_node; |
|---|
| 1660 | + dev->admin_tagset.cmd_size = sizeof(struct nvme_iod); |
|---|
| 1515 | 1661 | dev->admin_tagset.flags = BLK_MQ_F_NO_SCHED; |
|---|
| 1516 | 1662 | dev->admin_tagset.driver_data = dev; |
|---|
| 1517 | 1663 | |
|---|
| .. | .. |
|---|
| 1522 | 1668 | dev->ctrl.admin_q = blk_mq_init_queue(&dev->admin_tagset); |
|---|
| 1523 | 1669 | if (IS_ERR(dev->ctrl.admin_q)) { |
|---|
| 1524 | 1670 | blk_mq_free_tag_set(&dev->admin_tagset); |
|---|
| 1671 | + dev->ctrl.admin_q = NULL; |
|---|
| 1525 | 1672 | return -ENOMEM; |
|---|
| 1526 | 1673 | } |
|---|
| 1527 | 1674 | if (!blk_get_queue(dev->ctrl.admin_q)) { |
|---|
| .. | .. |
|---|
| 1578 | 1725 | (readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO)) |
|---|
| 1579 | 1726 | writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS); |
|---|
| 1580 | 1727 | |
|---|
| 1581 | | - result = nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap); |
|---|
| 1728 | + result = nvme_disable_ctrl(&dev->ctrl); |
|---|
| 1582 | 1729 | if (result < 0) |
|---|
| 1583 | 1730 | return result; |
|---|
| 1584 | 1731 | |
|---|
| 1585 | 1732 | result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH); |
|---|
| 1586 | 1733 | if (result) |
|---|
| 1587 | 1734 | return result; |
|---|
| 1735 | + |
|---|
| 1736 | + dev->ctrl.numa_node = dev_to_node(dev->dev); |
|---|
| 1588 | 1737 | |
|---|
| 1589 | 1738 | nvmeq = &dev->queues[0]; |
|---|
| 1590 | 1739 | aqa = nvmeq->q_depth - 1; |
|---|
| .. | .. |
|---|
| 1594 | 1743 | lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ); |
|---|
| 1595 | 1744 | lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ); |
|---|
| 1596 | 1745 | |
|---|
| 1597 | | - result = nvme_enable_ctrl(&dev->ctrl, dev->ctrl.cap); |
|---|
| 1746 | + result = nvme_enable_ctrl(&dev->ctrl); |
|---|
| 1598 | 1747 | if (result) |
|---|
| 1599 | 1748 | return result; |
|---|
| 1600 | 1749 | |
|---|
| .. | .. |
|---|
| 1602 | 1751 | nvme_init_queue(nvmeq, 0); |
|---|
| 1603 | 1752 | result = queue_request_irq(nvmeq); |
|---|
| 1604 | 1753 | if (result) { |
|---|
| 1605 | | - nvmeq->cq_vector = -1; |
|---|
| 1754 | + dev->online_queues--; |
|---|
| 1606 | 1755 | return result; |
|---|
| 1607 | 1756 | } |
|---|
| 1608 | 1757 | |
|---|
| 1758 | + set_bit(NVMEQ_ENABLED, &nvmeq->flags); |
|---|
| 1609 | 1759 | return result; |
|---|
| 1610 | 1760 | } |
|---|
| 1611 | 1761 | |
|---|
| 1612 | 1762 | static int nvme_create_io_queues(struct nvme_dev *dev) |
|---|
| 1613 | 1763 | { |
|---|
| 1614 | | - unsigned i, max; |
|---|
| 1764 | + unsigned i, max, rw_queues; |
|---|
| 1615 | 1765 | int ret = 0; |
|---|
| 1616 | 1766 | |
|---|
| 1617 | 1767 | for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) { |
|---|
| .. | .. |
|---|
| 1622 | 1772 | } |
|---|
| 1623 | 1773 | |
|---|
| 1624 | 1774 | max = min(dev->max_qid, dev->ctrl.queue_count - 1); |
|---|
| 1775 | + if (max != 1 && dev->io_queues[HCTX_TYPE_POLL]) { |
|---|
| 1776 | + rw_queues = dev->io_queues[HCTX_TYPE_DEFAULT] + |
|---|
| 1777 | + dev->io_queues[HCTX_TYPE_READ]; |
|---|
| 1778 | + } else { |
|---|
| 1779 | + rw_queues = max; |
|---|
| 1780 | + } |
|---|
| 1781 | + |
|---|
| 1625 | 1782 | for (i = dev->online_queues; i <= max; i++) { |
|---|
| 1626 | | - ret = nvme_create_queue(&dev->queues[i], i); |
|---|
| 1783 | + bool polled = i > rw_queues; |
|---|
| 1784 | + |
|---|
| 1785 | + ret = nvme_create_queue(&dev->queues[i], i, polled); |
|---|
| 1627 | 1786 | if (ret) |
|---|
| 1628 | 1787 | break; |
|---|
| 1629 | 1788 | } |
|---|
| .. | .. |
|---|
| 1670 | 1829 | if (dev->cmb_size) |
|---|
| 1671 | 1830 | return; |
|---|
| 1672 | 1831 | |
|---|
| 1832 | + if (NVME_CAP_CMBS(dev->ctrl.cap)) |
|---|
| 1833 | + writel(NVME_CMBMSC_CRE, dev->bar + NVME_REG_CMBMSC); |
|---|
| 1834 | + |
|---|
| 1673 | 1835 | dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ); |
|---|
| 1674 | 1836 | if (!dev->cmbsz) |
|---|
| 1675 | 1837 | return; |
|---|
| 1676 | 1838 | dev->cmbloc = readl(dev->bar + NVME_REG_CMBLOC); |
|---|
| 1677 | | - |
|---|
| 1678 | | - if (!use_cmb_sqes) |
|---|
| 1679 | | - return; |
|---|
| 1680 | 1839 | |
|---|
| 1681 | 1840 | size = nvme_cmb_size_unit(dev) * nvme_cmb_size(dev); |
|---|
| 1682 | 1841 | offset = nvme_cmb_size_unit(dev) * NVME_CMB_OFST(dev->cmbloc); |
|---|
| .. | .. |
|---|
| 1687 | 1846 | return; |
|---|
| 1688 | 1847 | |
|---|
| 1689 | 1848 | /* |
|---|
| 1849 | + * Tell the controller about the host-side address mapping the CMB, |
|---|
| 1850 | + * and enable CMB decoding for the NVMe 1.4+ scheme: |
|---|
| 1851 | + */ |
|---|
| 1852 | + if (NVME_CAP_CMBS(dev->ctrl.cap)) { |
|---|
| 1853 | + hi_lo_writeq(NVME_CMBMSC_CRE | NVME_CMBMSC_CMSE | |
|---|
| 1854 | + (pci_bus_address(pdev, bar) + offset), |
|---|
| 1855 | + dev->bar + NVME_REG_CMBMSC); |
|---|
| 1856 | + } |
|---|
| 1857 | + |
|---|
| 1858 | + /* |
|---|
| 1690 | 1859 | * Controllers may support a CMB size larger than their BAR, |
|---|
| 1691 | 1860 | * for example, due to being behind a bridge. Reduce the CMB to |
|---|
| 1692 | 1861 | * the reported size of the BAR |
|---|
| .. | .. |
|---|
| 1694 | 1863 | if (size > bar_size - offset) |
|---|
| 1695 | 1864 | size = bar_size - offset; |
|---|
| 1696 | 1865 | |
|---|
| 1697 | | - dev->cmb = ioremap_wc(pci_resource_start(pdev, bar) + offset, size); |
|---|
| 1698 | | - if (!dev->cmb) |
|---|
| 1866 | + if (pci_p2pdma_add_resource(pdev, bar, size, offset)) { |
|---|
| 1867 | + dev_warn(dev->ctrl.device, |
|---|
| 1868 | + "failed to register the CMB\n"); |
|---|
| 1699 | 1869 | return; |
|---|
| 1700 | | - dev->cmb_bus_addr = pci_bus_address(pdev, bar) + offset; |
|---|
| 1870 | + } |
|---|
| 1871 | + |
|---|
| 1701 | 1872 | dev->cmb_size = size; |
|---|
| 1873 | + dev->cmb_use_sqes = use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS); |
|---|
| 1874 | + |
|---|
| 1875 | + if ((dev->cmbsz & (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) == |
|---|
| 1876 | + (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) |
|---|
| 1877 | + pci_p2pmem_publish(pdev, true); |
|---|
| 1702 | 1878 | |
|---|
| 1703 | 1879 | if (sysfs_add_file_to_group(&dev->ctrl.device->kobj, |
|---|
| 1704 | 1880 | &dev_attr_cmb.attr, NULL)) |
|---|
| .. | .. |
|---|
| 1708 | 1884 | |
|---|
| 1709 | 1885 | static inline void nvme_release_cmb(struct nvme_dev *dev) |
|---|
| 1710 | 1886 | { |
|---|
| 1711 | | - if (dev->cmb) { |
|---|
| 1712 | | - iounmap(dev->cmb); |
|---|
| 1713 | | - dev->cmb = NULL; |
|---|
| 1887 | + if (dev->cmb_size) { |
|---|
| 1714 | 1888 | sysfs_remove_file_from_group(&dev->ctrl.device->kobj, |
|---|
| 1715 | 1889 | &dev_attr_cmb.attr, NULL); |
|---|
| 1716 | | - dev->cmbsz = 0; |
|---|
| 1890 | + dev->cmb_size = 0; |
|---|
| 1717 | 1891 | } |
|---|
| 1718 | 1892 | } |
|---|
| 1719 | 1893 | |
|---|
| 1720 | 1894 | static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits) |
|---|
| 1721 | 1895 | { |
|---|
| 1896 | + u32 host_mem_size = dev->host_mem_size >> NVME_CTRL_PAGE_SHIFT; |
|---|
| 1722 | 1897 | u64 dma_addr = dev->host_mem_descs_dma; |
|---|
| 1723 | 1898 | struct nvme_command c; |
|---|
| 1724 | 1899 | int ret; |
|---|
| .. | .. |
|---|
| 1727 | 1902 | c.features.opcode = nvme_admin_set_features; |
|---|
| 1728 | 1903 | c.features.fid = cpu_to_le32(NVME_FEAT_HOST_MEM_BUF); |
|---|
| 1729 | 1904 | c.features.dword11 = cpu_to_le32(bits); |
|---|
| 1730 | | - c.features.dword12 = cpu_to_le32(dev->host_mem_size >> |
|---|
| 1731 | | - ilog2(dev->ctrl.page_size)); |
|---|
| 1905 | + c.features.dword12 = cpu_to_le32(host_mem_size); |
|---|
| 1732 | 1906 | c.features.dword13 = cpu_to_le32(lower_32_bits(dma_addr)); |
|---|
| 1733 | 1907 | c.features.dword14 = cpu_to_le32(upper_32_bits(dma_addr)); |
|---|
| 1734 | 1908 | c.features.dword15 = cpu_to_le32(dev->nr_host_mem_descs); |
|---|
| .. | .. |
|---|
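Both the Set Features payload above (dword12) and the per-chunk descriptors below express sizes in controller pages rather than bytes. A tiny standalone illustration of that conversion, assuming the usual 4 KiB controller page, i.e. a page shift of 12 (treat the constant as an assumption of this sketch rather than something read from the header):

```c
/* Byte sizes vs. controller-page units for the host memory buffer (sketch). */
#include <stdint.h>
#include <stdio.h>

#define CTRL_PAGE_SHIFT 12u                     /* assumed 4 KiB controller page */
#define CTRL_PAGE_SIZE  (1u << CTRL_PAGE_SHIFT)

int main(void)
{
        uint64_t host_mem_size = 32ull << 20;   /* hypothetical 32 MiB buffer */
        uint64_t chunk_len = 2ull << 20;        /* hypothetical 2 MiB chunk */

        /* dword12 of Set Features: total buffer size in controller pages. */
        uint32_t hsize_pages = (uint32_t)(host_mem_size >> CTRL_PAGE_SHIFT);

        /* Descriptor size field: chunk length in controller pages. */
        uint32_t desc_pages = (uint32_t)(chunk_len / CTRL_PAGE_SIZE);

        printf("HSIZE = %u pages, descriptor size = %u pages\n",
               hsize_pages, desc_pages);        /* prints 8192 and 512 */
        return 0;
}
```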
| 1748 | 1922 | |
|---|
| 1749 | 1923 | for (i = 0; i < dev->nr_host_mem_descs; i++) { |
|---|
| 1750 | 1924 | struct nvme_host_mem_buf_desc *desc = &dev->host_mem_descs[i]; |
|---|
| 1751 | | - size_t size = le32_to_cpu(desc->size) * dev->ctrl.page_size; |
|---|
| 1925 | + size_t size = le32_to_cpu(desc->size) * NVME_CTRL_PAGE_SIZE; |
|---|
| 1752 | 1926 | |
|---|
| 1753 | 1927 | dma_free_attrs(dev->dev, size, dev->host_mem_desc_bufs[i], |
|---|
| 1754 | 1928 | le64_to_cpu(desc->addr), |
|---|
| .. | .. |
|---|
| 1781 | 1955 | if (dev->ctrl.hmmaxd && dev->ctrl.hmmaxd < max_entries) |
|---|
| 1782 | 1956 | max_entries = dev->ctrl.hmmaxd; |
|---|
| 1783 | 1957 | |
|---|
| 1784 | | - descs = dma_zalloc_coherent(dev->dev, max_entries * sizeof(*descs), |
|---|
| 1785 | | - &descs_dma, GFP_KERNEL); |
|---|
| 1958 | + descs = dma_alloc_coherent(dev->dev, max_entries * sizeof(*descs), |
|---|
| 1959 | + &descs_dma, GFP_KERNEL); |
|---|
| 1786 | 1960 | if (!descs) |
|---|
| 1787 | 1961 | goto out; |
|---|
| 1788 | 1962 | |
|---|
| .. | .. |
|---|
| 1800 | 1974 | break; |
|---|
| 1801 | 1975 | |
|---|
| 1802 | 1976 | descs[i].addr = cpu_to_le64(dma_addr); |
|---|
| 1803 | | - descs[i].size = cpu_to_le32(len / dev->ctrl.page_size); |
|---|
| 1977 | + descs[i].size = cpu_to_le32(len / NVME_CTRL_PAGE_SIZE); |
|---|
| 1804 | 1978 | i++; |
|---|
| 1805 | 1979 | } |
|---|
| 1806 | 1980 | |
|---|
| .. | .. |
|---|
| 1816 | 1990 | |
|---|
| 1817 | 1991 | out_free_bufs: |
|---|
| 1818 | 1992 | while (--i >= 0) { |
|---|
| 1819 | | - size_t size = le32_to_cpu(descs[i].size) * dev->ctrl.page_size; |
|---|
| 1993 | + size_t size = le32_to_cpu(descs[i].size) * NVME_CTRL_PAGE_SIZE; |
|---|
| 1820 | 1994 | |
|---|
| 1821 | 1995 | dma_free_attrs(dev->dev, size, bufs[i], |
|---|
| 1822 | 1996 | le64_to_cpu(descs[i].addr), |
|---|
| .. | .. |
|---|
| 1834 | 2008 | |
|---|
| 1835 | 2009 | static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred) |
|---|
| 1836 | 2010 | { |
|---|
| 1837 | | - u32 chunk_size; |
|---|
| 2011 | + u64 min_chunk = min_t(u64, preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES); |
|---|
| 2012 | + u64 hmminds = max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2); |
|---|
| 2013 | + u64 chunk_size; |
|---|
| 1838 | 2014 | |
|---|
| 1839 | 2015 | /* start big and work our way down */ |
|---|
| 1840 | | - for (chunk_size = min_t(u64, preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES); |
|---|
| 1841 | | - chunk_size >= max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2); |
|---|
| 1842 | | - chunk_size /= 2) { |
|---|
| 2016 | + for (chunk_size = min_chunk; chunk_size >= hmminds; chunk_size /= 2) { |
|---|
| 1843 | 2017 | if (!__nvme_alloc_host_mem(dev, preferred, chunk_size)) { |
|---|
| 1844 | 2018 | if (!min || dev->host_mem_size >= min) |
|---|
| 1845 | 2019 | return 0; |
|---|
| .. | .. |
|---|
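The loop above starts with the largest chunk the page allocator could plausibly return and halves it until either the buffer can be built or the controller's minimum descriptor size would be crossed. A self-contained sketch of that descent, with fake_alloc_hmb() standing in for the real allocation helper and every constant chosen purely for illustration:

```c
/* Sketch of the "start big, halve on failure" host memory buffer descent. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Pretend the system cannot hand out contiguous chunks larger than 512 KiB. */
static bool fake_alloc_hmb(uint64_t preferred, uint64_t chunk_size,
                           uint64_t *allocated)
{
        if (chunk_size > 512 * 1024)
                return false;           /* this chunk size cannot be allocated */
        *allocated = preferred;         /* pretend the whole buffer was built */
        return true;
}

int main(void)
{
        uint64_t preferred = 32ull << 20;       /* 32 MiB preferred buffer */
        uint64_t min = 4ull << 20;              /* 4 MiB minimum acceptable */
        uint64_t min_chunk = 4ull << 20;        /* stand-in for the MAX_ORDER cap */
        uint64_t hmminds = 8 * 4096;            /* stand-in for hmminds * 4096 */
        uint64_t got = 0;

        for (uint64_t chunk = min_chunk; chunk >= hmminds; chunk /= 2) {
                if (fake_alloc_hmb(preferred, chunk, &got) && got >= min) {
                        printf("buffer built with %llu-byte chunks\n",
                               (unsigned long long)chunk);
                        return 0;
                }
        }
        printf("could not satisfy the minimum buffer size\n");
        return 1;
}
```

With the fake allocator refusing anything above 512 KiB, the descent fails at 4 MiB, 2 MiB and 1 MiB before succeeding at 512 KiB.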
| 1895 | 2069 | return ret; |
|---|
| 1896 | 2070 | } |
|---|
| 1897 | 2071 | |
|---|
| 2072 | +/* |
|---|
| 2073 | + * nirqs is the number of interrupts available for write and read |
|---|
| 2074 | + * queues. The core already reserved an interrupt for the admin queue. |
|---|
| 2075 | + */ |
|---|
| 2076 | +static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int nrirqs) |
|---|
| 2077 | +{ |
|---|
| 2078 | + struct nvme_dev *dev = affd->priv; |
|---|
| 2079 | + unsigned int nr_read_queues, nr_write_queues = dev->nr_write_queues; |
|---|
| 2080 | + |
|---|
| 2081 | + /* |
|---|
| 2082 | + * If there is no interrupt available for queues, ensure that |
|---|
| 2083 | + * the default queue is set to 1. The affinity set size is |
|---|
| 2084 | + * also set to one, but the irq core ignores it for this case. |
|---|
| 2085 | + * |
|---|
| 2086 | + * If only one interrupt is available or 'write_queues' == 0, combine |
|---|
| 2087 | + * write and read queues. |
|---|
| 2088 | + * |
|---|
| 2089 | + * If 'write_queues' > 0, ensure it leaves room for at least one read |
|---|
| 2090 | + * queue. |
|---|
| 2091 | + */ |
|---|
| 2092 | + if (!nrirqs) { |
|---|
| 2093 | + nrirqs = 1; |
|---|
| 2094 | + nr_read_queues = 0; |
|---|
| 2095 | + } else if (nrirqs == 1 || !nr_write_queues) { |
|---|
| 2096 | + nr_read_queues = 0; |
|---|
| 2097 | + } else if (nr_write_queues >= nrirqs) { |
|---|
| 2098 | + nr_read_queues = 1; |
|---|
| 2099 | + } else { |
|---|
| 2100 | + nr_read_queues = nrirqs - nr_write_queues; |
|---|
| 2101 | + } |
|---|
| 2102 | + |
|---|
| 2103 | + dev->io_queues[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues; |
|---|
| 2104 | + affd->set_size[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues; |
|---|
| 2105 | + dev->io_queues[HCTX_TYPE_READ] = nr_read_queues; |
|---|
| 2106 | + affd->set_size[HCTX_TYPE_READ] = nr_read_queues; |
|---|
| 2107 | + affd->nr_sets = nr_read_queues ? 2 : 1; |
|---|
| 2108 | +} |
|---|
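The split computed by nvme_calc_irq_sets() depends only on the number of vectors actually granted and on the write_queues parameter, so the branches above can be tabulated in a standalone sketch (the sample inputs below are arbitrary):

```c
/* Standalone sketch of the default/read interrupt split in nvme_calc_irq_sets(). */
#include <stdio.h>

static void calc_sets(unsigned int nrirqs, unsigned int nr_write_queues,
                      unsigned int *nr_default, unsigned int *nr_read)
{
        unsigned int nr_read_queues;

        if (!nrirqs) {
                nrirqs = 1;
                nr_read_queues = 0;     /* single shared queue, no irq for it */
        } else if (nrirqs == 1 || !nr_write_queues) {
                nr_read_queues = 0;     /* combine reads and writes */
        } else if (nr_write_queues >= nrirqs) {
                nr_read_queues = 1;     /* keep at least one read queue */
        } else {
                nr_read_queues = nrirqs - nr_write_queues;
        }

        *nr_default = nrirqs - nr_read_queues;
        *nr_read = nr_read_queues;
}

int main(void)
{
        unsigned int cases[][2] = { {0, 0}, {1, 4}, {8, 0}, {8, 2}, {8, 16} };

        for (unsigned int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
                unsigned int def, rd;

                calc_sets(cases[i][0], cases[i][1], &def, &rd);
                printf("nrirqs=%u write_queues=%u -> default=%u read=%u\n",
                       cases[i][0], cases[i][1], def, rd);
        }
        return 0;
}
```

For example, eight vectors with write_queues=2 yields two default (write) queues and six read queues, while a write_queues value larger than the vector count collapses to a single read queue.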
| 2109 | + |
|---|
| 2110 | +static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues) |
|---|
| 2111 | +{ |
|---|
| 2112 | + struct pci_dev *pdev = to_pci_dev(dev->dev); |
|---|
| 2113 | + struct irq_affinity affd = { |
|---|
| 2114 | + .pre_vectors = 1, |
|---|
| 2115 | + .calc_sets = nvme_calc_irq_sets, |
|---|
| 2116 | + .priv = dev, |
|---|
| 2117 | + }; |
|---|
| 2118 | + unsigned int irq_queues, poll_queues; |
|---|
| 2119 | + |
|---|
| 2120 | + /* |
|---|
| 2121 | + * Poll queues don't need interrupts, but we need at least one I/O queue |
|---|
| 2122 | + * left over for non-polled I/O. |
|---|
| 2123 | + */ |
|---|
| 2124 | + poll_queues = min(dev->nr_poll_queues, nr_io_queues - 1); |
|---|
| 2125 | + dev->io_queues[HCTX_TYPE_POLL] = poll_queues; |
|---|
| 2126 | + |
|---|
| 2127 | + /* |
|---|
| 2128 | + * Initialize for the single interrupt case, will be updated in |
|---|
| 2129 | + * nvme_calc_irq_sets(). |
|---|
| 2130 | + */ |
|---|
| 2131 | + dev->io_queues[HCTX_TYPE_DEFAULT] = 1; |
|---|
| 2132 | + dev->io_queues[HCTX_TYPE_READ] = 0; |
|---|
| 2133 | + |
|---|
| 2134 | + /* |
|---|
| 2135 | + * We need interrupts for the admin queue and each non-polled I/O queue, |
|---|
| 2136 | + * but some Apple controllers require all queues to use the first |
|---|
| 2137 | + * vector. |
|---|
| 2138 | + */ |
|---|
| 2139 | + irq_queues = 1; |
|---|
| 2140 | + if (!(dev->ctrl.quirks & NVME_QUIRK_SINGLE_VECTOR)) |
|---|
| 2141 | + irq_queues += (nr_io_queues - poll_queues); |
|---|
| 2142 | + return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues, |
|---|
| 2143 | + PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd); |
|---|
| 2144 | +} |
|---|
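Before that callback runs, nvme_setup_irqs() decides how many vectors to request at all: poll queues are clamped so at least one interrupt-driven I/O queue remains, then one vector is added for the admin queue unless the single-vector quirk forces everything onto vector 0. A quick numeric sketch with assumed inputs:

```c
/* Sketch of the vector budget computed in nvme_setup_irqs() (assumed inputs). */
#include <stdbool.h>
#include <stdio.h>

int main(void)
{
        unsigned int nr_io_queues = 8;          /* queues the controller granted */
        unsigned int nr_poll_queues = 3;        /* poll_queues module parameter */
        bool single_vector_quirk = false;       /* NVME_QUIRK_SINGLE_VECTOR */

        /* Leave at least one non-polled I/O queue. */
        unsigned int poll_queues = nr_poll_queues < nr_io_queues - 1 ?
                                   nr_poll_queues : nr_io_queues - 1;

        /* One vector for the admin queue plus one per interrupt-driven queue. */
        unsigned int irq_queues = 1;
        if (!single_vector_quirk)
                irq_queues += nr_io_queues - poll_queues;

        printf("poll queues: %u, vectors requested: %u\n",
               poll_queues, irq_queues);        /* prints 3 and 6 */
        return 0;
}
```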
| 2145 | + |
|---|
| 2146 | +static void nvme_disable_io_queues(struct nvme_dev *dev) |
|---|
| 2147 | +{ |
|---|
| 2148 | + if (__nvme_disable_io_queues(dev, nvme_admin_delete_sq)) |
|---|
| 2149 | + __nvme_disable_io_queues(dev, nvme_admin_delete_cq); |
|---|
| 2150 | +} |
|---|
| 2151 | + |
|---|
| 2152 | +static unsigned int nvme_max_io_queues(struct nvme_dev *dev) |
|---|
| 2153 | +{ |
|---|
| 2154 | + return num_possible_cpus() + dev->nr_write_queues + dev->nr_poll_queues; |
|---|
| 2155 | +} |
|---|
| 2156 | + |
|---|
| 1898 | 2157 | static int nvme_setup_io_queues(struct nvme_dev *dev) |
|---|
| 1899 | 2158 | { |
|---|
| 1900 | 2159 | struct nvme_queue *adminq = &dev->queues[0]; |
|---|
| 1901 | 2160 | struct pci_dev *pdev = to_pci_dev(dev->dev); |
|---|
| 1902 | | - int result, nr_io_queues; |
|---|
| 2161 | + unsigned int nr_io_queues; |
|---|
| 1903 | 2162 | unsigned long size; |
|---|
| 2163 | + int result; |
|---|
| 1904 | 2164 | |
|---|
| 1905 | | - struct irq_affinity affd = { |
|---|
| 1906 | | - .pre_vectors = 1 |
|---|
| 1907 | | - }; |
|---|
| 2165 | + /* |
|---|
| 2166 | + * Sample the module parameters once at reset time so that we have |
|---|
| 2167 | + * stable values to work with. |
|---|
| 2168 | + */ |
|---|
| 2169 | + dev->nr_write_queues = write_queues; |
|---|
| 2170 | + dev->nr_poll_queues = poll_queues; |
|---|
| 1908 | 2171 | |
|---|
| 1909 | | - nr_io_queues = num_possible_cpus(); |
|---|
| 2172 | + /* |
|---|
| 2173 | + * If tags are shared with admin queue (Apple bug), then |
|---|
| 2174 | + * make sure we only use one IO queue. |
|---|
| 2175 | + */ |
|---|
| 2176 | + if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) |
|---|
| 2177 | + nr_io_queues = 1; |
|---|
| 2178 | + else |
|---|
| 2179 | + nr_io_queues = min(nvme_max_io_queues(dev), |
|---|
| 2180 | + dev->nr_allocated_queues - 1); |
|---|
| 2181 | + |
|---|
| 1910 | 2182 | result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues); |
|---|
| 1911 | 2183 | if (result < 0) |
|---|
| 1912 | 2184 | return result; |
|---|
| 1913 | 2185 | |
|---|
| 1914 | 2186 | if (nr_io_queues == 0) |
|---|
| 1915 | 2187 | return 0; |
|---|
| 2188 | + |
|---|
| 2189 | + clear_bit(NVMEQ_ENABLED, &adminq->flags); |
|---|
| 1916 | 2190 | |
|---|
| 1917 | | - if (dev->cmb && (dev->cmbsz & NVME_CMBSZ_SQS)) { |
|---|
| 2191 | + if (dev->cmb_use_sqes) { |
|---|
| 1918 | 2192 | result = nvme_cmb_qdepth(dev, nr_io_queues, |
|---|
| 1919 | 2193 | sizeof(struct nvme_command)); |
|---|
| 1920 | 2194 | if (result > 0) |
|---|
| 1921 | 2195 | dev->q_depth = result; |
|---|
| 1922 | 2196 | else |
|---|
| 1923 | | - nvme_release_cmb(dev); |
|---|
| 2197 | + dev->cmb_use_sqes = false; |
|---|
| 1924 | 2198 | } |
|---|
| 1925 | 2199 | |
|---|
| 1926 | 2200 | do { |
|---|
| .. | .. |
|---|
| 1933 | 2207 | } while (1); |
|---|
| 1934 | 2208 | adminq->q_db = dev->dbs; |
|---|
| 1935 | 2209 | |
|---|
| 2210 | + retry: |
|---|
| 1936 | 2211 | /* Deregister the admin queue's interrupt */ |
|---|
| 1937 | 2212 | pci_free_irq(pdev, 0, adminq); |
|---|
| 1938 | 2213 | |
|---|
| .. | .. |
|---|
| 1941 | 2216 | * setting up the full range we need. |
|---|
| 1942 | 2217 | */ |
|---|
| 1943 | 2218 | pci_free_irq_vectors(pdev); |
|---|
| 1944 | | - result = pci_alloc_irq_vectors_affinity(pdev, 1, nr_io_queues + 1, |
|---|
| 1945 | | - PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd); |
|---|
| 2219 | + |
|---|
| 2220 | + result = nvme_setup_irqs(dev, nr_io_queues); |
|---|
| 1946 | 2221 | if (result <= 0) |
|---|
| 1947 | 2222 | return -EIO; |
|---|
| 2223 | + |
|---|
| 1948 | 2224 | dev->num_vecs = result; |
|---|
| 1949 | | - dev->max_qid = max(result - 1, 1); |
|---|
| 2225 | + result = max(result - 1, 1); |
|---|
| 2226 | + dev->max_qid = result + dev->io_queues[HCTX_TYPE_POLL]; |
|---|
| 1950 | 2227 | |
|---|
| 1951 | 2228 | /* |
|---|
| 1952 | 2229 | * Should investigate if there's a performance win from allocating |
|---|
| .. | .. |
|---|
| 1954 | 2231 | * path to scale better, even if the receive path is limited by the |
|---|
| 1955 | 2232 | * number of interrupts. |
|---|
| 1956 | 2233 | */ |
|---|
| 1957 | | - |
|---|
| 1958 | 2234 | result = queue_request_irq(adminq); |
|---|
| 1959 | | - if (result) { |
|---|
| 1960 | | - adminq->cq_vector = -1; |
|---|
| 2235 | + if (result) |
|---|
| 1961 | 2236 | return result; |
|---|
| 2237 | + set_bit(NVMEQ_ENABLED, &adminq->flags); |
|---|
| 2238 | + |
|---|
| 2239 | + result = nvme_create_io_queues(dev); |
|---|
| 2240 | + if (result || dev->online_queues < 2) |
|---|
| 2241 | + return result; |
|---|
| 2242 | + |
|---|
| 2243 | + if (dev->online_queues - 1 < dev->max_qid) { |
|---|
| 2244 | + nr_io_queues = dev->online_queues - 1; |
|---|
| 2245 | + nvme_disable_io_queues(dev); |
|---|
| 2246 | + nvme_suspend_io_queues(dev); |
|---|
| 2247 | + goto retry; |
|---|
| 1962 | 2248 | } |
|---|
| 1963 | | - return nvme_create_io_queues(dev); |
|---|
| 2249 | + dev_info(dev->ctrl.device, "%d/%d/%d default/read/poll queues\n", |
|---|
| 2250 | + dev->io_queues[HCTX_TYPE_DEFAULT], |
|---|
| 2251 | + dev->io_queues[HCTX_TYPE_READ], |
|---|
| 2252 | + dev->io_queues[HCTX_TYPE_POLL]); |
|---|
| 2253 | + return 0; |
|---|
| 1964 | 2254 | } |
|---|
| 1965 | 2255 | |
|---|
| 1966 | 2256 | static void nvme_del_queue_end(struct request *req, blk_status_t error) |
|---|
| .. | .. |
|---|
| 1968 | 2258 | struct nvme_queue *nvmeq = req->end_io_data; |
|---|
| 1969 | 2259 | |
|---|
| 1970 | 2260 | blk_mq_free_request(req); |
|---|
| 1971 | | - complete(&nvmeq->dev->ioq_wait); |
|---|
| 2261 | + complete(&nvmeq->delete_done); |
|---|
| 1972 | 2262 | } |
|---|
| 1973 | 2263 | |
|---|
| 1974 | 2264 | static void nvme_del_cq_end(struct request *req, blk_status_t error) |
|---|
| 1975 | 2265 | { |
|---|
| 1976 | 2266 | struct nvme_queue *nvmeq = req->end_io_data; |
|---|
| 1977 | | - u16 start, end; |
|---|
| 1978 | 2267 | |
|---|
| 1979 | | - if (!error) { |
|---|
| 1980 | | - unsigned long flags; |
|---|
| 1981 | | - |
|---|
| 1982 | | - spin_lock_irqsave(&nvmeq->cq_lock, flags); |
|---|
| 1983 | | - nvme_process_cq(nvmeq, &start, &end, -1); |
|---|
| 1984 | | - spin_unlock_irqrestore(&nvmeq->cq_lock, flags); |
|---|
| 1985 | | - |
|---|
| 1986 | | - nvme_complete_cqes(nvmeq, start, end); |
|---|
| 1987 | | - } |
|---|
| 2268 | + if (error) |
|---|
| 2269 | + set_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags); |
|---|
| 1988 | 2270 | |
|---|
| 1989 | 2271 | nvme_del_queue_end(req, error); |
|---|
| 1990 | 2272 | } |
|---|
| .. | .. |
|---|
| 1999 | 2281 | cmd.delete_queue.opcode = opcode; |
|---|
| 2000 | 2282 | cmd.delete_queue.qid = cpu_to_le16(nvmeq->qid); |
|---|
| 2001 | 2283 | |
|---|
| 2002 | | - req = nvme_alloc_request(q, &cmd, BLK_MQ_REQ_NOWAIT, NVME_QID_ANY); |
|---|
| 2284 | + req = nvme_alloc_request(q, &cmd, BLK_MQ_REQ_NOWAIT); |
|---|
| 2003 | 2285 | if (IS_ERR(req)) |
|---|
| 2004 | 2286 | return PTR_ERR(req); |
|---|
| 2005 | 2287 | |
|---|
| 2006 | | - req->timeout = ADMIN_TIMEOUT; |
|---|
| 2007 | 2288 | req->end_io_data = nvmeq; |
|---|
| 2008 | 2289 | |
|---|
| 2290 | + init_completion(&nvmeq->delete_done); |
|---|
| 2009 | 2291 | blk_execute_rq_nowait(q, NULL, req, false, |
|---|
| 2010 | 2292 | opcode == nvme_admin_delete_cq ? |
|---|
| 2011 | 2293 | nvme_del_cq_end : nvme_del_queue_end); |
|---|
| 2012 | 2294 | return 0; |
|---|
| 2013 | 2295 | } |
|---|
| 2014 | 2296 | |
|---|
| 2015 | | -static void nvme_disable_io_queues(struct nvme_dev *dev) |
|---|
| 2297 | +static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode) |
|---|
| 2016 | 2298 | { |
|---|
| 2017 | | - int pass, queues = dev->online_queues - 1; |
|---|
| 2299 | + int nr_queues = dev->online_queues - 1, sent = 0; |
|---|
| 2018 | 2300 | unsigned long timeout; |
|---|
| 2019 | | - u8 opcode = nvme_admin_delete_sq; |
|---|
| 2020 | 2301 | |
|---|
| 2021 | | - for (pass = 0; pass < 2; pass++) { |
|---|
| 2022 | | - int sent = 0, i = queues; |
|---|
| 2023 | | - |
|---|
| 2024 | | - reinit_completion(&dev->ioq_wait); |
|---|
| 2025 | 2302 | retry: |
|---|
| 2026 | | - timeout = ADMIN_TIMEOUT; |
|---|
| 2027 | | - for (; i > 0; i--, sent++) |
|---|
| 2028 | | - if (nvme_delete_queue(&dev->queues[i], opcode)) |
|---|
| 2029 | | - break; |
|---|
| 2030 | | - |
|---|
| 2031 | | - while (sent--) { |
|---|
| 2032 | | - timeout = wait_for_completion_io_timeout(&dev->ioq_wait, timeout); |
|---|
| 2033 | | - if (timeout == 0) |
|---|
| 2034 | | - return; |
|---|
| 2035 | | - if (i) |
|---|
| 2036 | | - goto retry; |
|---|
| 2037 | | - } |
|---|
| 2038 | | - opcode = nvme_admin_delete_cq; |
|---|
| 2303 | + timeout = ADMIN_TIMEOUT; |
|---|
| 2304 | + while (nr_queues > 0) { |
|---|
| 2305 | + if (nvme_delete_queue(&dev->queues[nr_queues], opcode)) |
|---|
| 2306 | + break; |
|---|
| 2307 | + nr_queues--; |
|---|
| 2308 | + sent++; |
|---|
| 2039 | 2309 | } |
|---|
| 2310 | + while (sent) { |
|---|
| 2311 | + struct nvme_queue *nvmeq = &dev->queues[nr_queues + sent]; |
|---|
| 2312 | + |
|---|
| 2313 | + timeout = wait_for_completion_io_timeout(&nvmeq->delete_done, |
|---|
| 2314 | + timeout); |
|---|
| 2315 | + if (timeout == 0) |
|---|
| 2316 | + return false; |
|---|
| 2317 | + |
|---|
| 2318 | + sent--; |
|---|
| 2319 | + if (nr_queues) |
|---|
| 2320 | + goto retry; |
|---|
| 2321 | + } |
|---|
| 2322 | + return true; |
|---|
| 2040 | 2323 | } |
|---|
| 2041 | 2324 | |
|---|
| 2042 | | -/* |
|---|
| 2043 | | - * return error value only when tagset allocation failed |
|---|
| 2044 | | - */ |
|---|
| 2045 | | -static int nvme_dev_add(struct nvme_dev *dev) |
|---|
| 2325 | +static void nvme_dev_add(struct nvme_dev *dev) |
|---|
| 2046 | 2326 | { |
|---|
| 2047 | 2327 | int ret; |
|---|
| 2048 | 2328 | |
|---|
| 2049 | 2329 | if (!dev->ctrl.tagset) { |
|---|
| 2050 | 2330 | dev->tagset.ops = &nvme_mq_ops; |
|---|
| 2051 | 2331 | dev->tagset.nr_hw_queues = dev->online_queues - 1; |
|---|
| 2332 | + dev->tagset.nr_maps = 2; /* default + read */ |
|---|
| 2333 | + if (dev->io_queues[HCTX_TYPE_POLL]) |
|---|
| 2334 | + dev->tagset.nr_maps++; |
|---|
| 2052 | 2335 | dev->tagset.timeout = NVME_IO_TIMEOUT; |
|---|
| 2053 | | - dev->tagset.numa_node = dev_to_node(dev->dev); |
|---|
| 2054 | | - dev->tagset.queue_depth = |
|---|
| 2055 | | - min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1; |
|---|
| 2056 | | - dev->tagset.cmd_size = nvme_pci_cmd_size(dev, false); |
|---|
| 2057 | | - if ((dev->ctrl.sgls & ((1 << 0) | (1 << 1))) && sgl_threshold) { |
|---|
| 2058 | | - dev->tagset.cmd_size = max(dev->tagset.cmd_size, |
|---|
| 2059 | | - nvme_pci_cmd_size(dev, true)); |
|---|
| 2060 | | - } |
|---|
| 2336 | + dev->tagset.numa_node = dev->ctrl.numa_node; |
|---|
| 2337 | + dev->tagset.queue_depth = min_t(unsigned int, dev->q_depth, |
|---|
| 2338 | + BLK_MQ_MAX_DEPTH) - 1; |
|---|
| 2339 | + dev->tagset.cmd_size = sizeof(struct nvme_iod); |
|---|
| 2061 | 2340 | dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE; |
|---|
| 2062 | 2341 | dev->tagset.driver_data = dev; |
|---|
| 2342 | + |
|---|
| 2343 | + /* |
|---|
| 2344 | + * Some Apple controllers require tags to be unique |
|---|
| 2345 | + * across the admin and IO queues, so reserve the first 32 |
|---|
| 2346 | + * tags of the IO queue. |
|---|
| 2347 | + */ |
|---|
| 2348 | + if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) |
|---|
| 2349 | + dev->tagset.reserved_tags = NVME_AQ_DEPTH; |
|---|
| 2063 | 2350 | |
|---|
| 2064 | 2351 | ret = blk_mq_alloc_tag_set(&dev->tagset); |
|---|
| 2065 | 2352 | if (ret) { |
|---|
| 2066 | 2353 | dev_warn(dev->ctrl.device, |
|---|
| 2067 | 2354 | "IO queues tagset allocation failed %d\n", ret); |
|---|
| 2068 | | - return ret; |
|---|
| 2355 | + return; |
|---|
| 2069 | 2356 | } |
|---|
| 2070 | 2357 | dev->ctrl.tagset = &dev->tagset; |
|---|
| 2071 | | - |
|---|
| 2072 | | - nvme_dbbuf_set(dev); |
|---|
| 2073 | 2358 | } else { |
|---|
| 2074 | 2359 | blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1); |
|---|
| 2075 | 2360 | |
|---|
| .. | .. |
|---|
| 2077 | 2362 | nvme_free_queues(dev, dev->online_queues); |
|---|
| 2078 | 2363 | } |
|---|
| 2079 | 2364 | |
|---|
| 2080 | | - return 0; |
|---|
| 2365 | + nvme_dbbuf_set(dev); |
|---|
| 2081 | 2366 | } |
|---|
| 2082 | 2367 | |
|---|
| 2083 | 2368 | static int nvme_pci_enable(struct nvme_dev *dev) |
|---|
| .. | .. |
|---|
| 2090 | 2375 | |
|---|
| 2091 | 2376 | pci_set_master(pdev); |
|---|
| 2092 | 2377 | |
|---|
| 2093 | | - if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)) && |
|---|
| 2094 | | - dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(32))) |
|---|
| 2378 | + if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64))) |
|---|
| 2095 | 2379 | goto disable; |
|---|
| 2096 | 2380 | |
|---|
| 2097 | 2381 | if (readl(dev->bar + NVME_REG_CSTS) == -1) { |
|---|
| .. | .. |
|---|
| 2110 | 2394 | |
|---|
| 2111 | 2395 | dev->ctrl.cap = lo_hi_readq(dev->bar + NVME_REG_CAP); |
|---|
| 2112 | 2396 | |
|---|
| 2113 | | - dev->q_depth = min_t(int, NVME_CAP_MQES(dev->ctrl.cap) + 1, |
|---|
| 2397 | + dev->q_depth = min_t(u32, NVME_CAP_MQES(dev->ctrl.cap) + 1, |
|---|
| 2114 | 2398 | io_queue_depth); |
|---|
| 2399 | + dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */ |
|---|
| 2115 | 2400 | dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap); |
|---|
| 2116 | 2401 | dev->dbs = dev->bar + 4096; |
|---|
| 2402 | + |
|---|
| 2403 | + /* |
|---|
| 2404 | + * Some Apple controllers require a non-standard SQE size. |
|---|
| 2405 | + * Interestingly they also seem to ignore the CC:IOSQES register |
|---|
| 2406 | + * so we don't bother updating it here. |
|---|
| 2407 | + */ |
|---|
| 2408 | + if (dev->ctrl.quirks & NVME_QUIRK_128_BYTES_SQES) |
|---|
| 2409 | + dev->io_sqes = 7; |
|---|
| 2410 | + else |
|---|
| 2411 | + dev->io_sqes = NVME_NVM_IOSQES; |
|---|
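The io_sqes value picked above is the log2 of the submission queue entry size, so the Apple quirk doubles the per-entry footprint from the spec-defined 64 bytes (IOSQES of 6) to 128 bytes. A short check, with the queue depth chosen only for illustration:

```c
/* SQ entry size is 1 << sqes; the Apple quirk bumps sqes from 6 to 7. */
#include <stdio.h>

int main(void)
{
        unsigned int q_depth = 1024;            /* example queue depth */

        for (unsigned int sqes = 6; sqes <= 7; sqes++)
                printf("sqes=%u -> %u-byte entries, %u-byte SQ ring\n",
                       sqes, 1u << sqes, q_depth * (1u << sqes));
        return 0;
}
```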
| 2117 | 2412 | |
|---|
| 2118 | 2413 | /* |
|---|
| 2119 | 2414 | * Temporary fix for the Apple controller found in the MacBook8,1 and |
|---|
| .. | .. |
|---|
| 2131 | 2426 | dev_err(dev->ctrl.device, "detected PM1725 NVMe controller, " |
|---|
| 2132 | 2427 | "set queue depth=%u\n", dev->q_depth); |
|---|
| 2133 | 2428 | } |
|---|
| 2429 | + |
|---|
| 2430 | + /* |
|---|
| 2431 | + * Controllers with the shared tags quirk need the IO queue to be |
|---|
| 2432 | + * big enough so that we get 32 tags for the admin queue |
|---|
| 2433 | + */ |
|---|
| 2434 | + if ((dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) && |
|---|
| 2435 | + (dev->q_depth < (NVME_AQ_DEPTH + 2))) { |
|---|
| 2436 | + dev->q_depth = NVME_AQ_DEPTH + 2; |
|---|
| 2437 | + dev_warn(dev->ctrl.device, "IO queue depth clamped to %d\n", |
|---|
| 2438 | + dev->q_depth); |
|---|
| 2439 | + } |
|---|
| 2440 | + |
|---|
| 2134 | 2441 | |
|---|
| 2135 | 2442 | nvme_map_cmb(dev); |
|---|
| 2136 | 2443 | |
|---|
| .. | .. |
|---|
| 2164 | 2471 | |
|---|
| 2165 | 2472 | static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) |
|---|
| 2166 | 2473 | { |
|---|
| 2167 | | - int i; |
|---|
| 2168 | | - bool dead = true; |
|---|
| 2474 | + bool dead = true, freeze = false; |
|---|
| 2169 | 2475 | struct pci_dev *pdev = to_pci_dev(dev->dev); |
|---|
| 2170 | 2476 | |
|---|
| 2171 | 2477 | mutex_lock(&dev->shutdown_lock); |
|---|
| .. | .. |
|---|
| 2173 | 2479 | u32 csts = readl(dev->bar + NVME_REG_CSTS); |
|---|
| 2174 | 2480 | |
|---|
| 2175 | 2481 | if (dev->ctrl.state == NVME_CTRL_LIVE || |
|---|
| 2176 | | - dev->ctrl.state == NVME_CTRL_RESETTING) |
|---|
| 2482 | + dev->ctrl.state == NVME_CTRL_RESETTING) { |
|---|
| 2483 | + freeze = true; |
|---|
| 2177 | 2484 | nvme_start_freeze(&dev->ctrl); |
|---|
| 2485 | + } |
|---|
| 2178 | 2486 | dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) || |
|---|
| 2179 | 2487 | pdev->error_state != pci_channel_io_normal); |
|---|
| 2180 | 2488 | } |
|---|
| .. | .. |
|---|
| 2183 | 2491 | * Give the controller a chance to complete all entered requests if |
|---|
| 2184 | 2492 | * doing a safe shutdown. |
|---|
| 2185 | 2493 | */ |
|---|
| 2186 | | - if (!dead) { |
|---|
| 2187 | | - if (shutdown) |
|---|
| 2188 | | - nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT); |
|---|
| 2189 | | - } |
|---|
| 2494 | + if (!dead && shutdown && freeze) |
|---|
| 2495 | + nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT); |
|---|
| 2190 | 2496 | |
|---|
| 2191 | 2497 | nvme_stop_queues(&dev->ctrl); |
|---|
| 2192 | 2498 | |
|---|
| .. | .. |
|---|
| 2194 | 2500 | nvme_disable_io_queues(dev); |
|---|
| 2195 | 2501 | nvme_disable_admin_queue(dev, shutdown); |
|---|
| 2196 | 2502 | } |
|---|
| 2197 | | - for (i = dev->ctrl.queue_count - 1; i >= 0; i--) |
|---|
| 2198 | | - nvme_suspend_queue(&dev->queues[i]); |
|---|
| 2199 | | - |
|---|
| 2503 | + nvme_suspend_io_queues(dev); |
|---|
| 2504 | + nvme_suspend_queue(&dev->queues[0]); |
|---|
| 2200 | 2505 | nvme_pci_disable(dev); |
|---|
| 2506 | + nvme_reap_pending_cqes(dev); |
|---|
| 2201 | 2507 | |
|---|
| 2202 | 2508 | blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl); |
|---|
| 2203 | 2509 | blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl); |
|---|
| 2510 | + blk_mq_tagset_wait_completed_request(&dev->tagset); |
|---|
| 2511 | + blk_mq_tagset_wait_completed_request(&dev->admin_tagset); |
|---|
| 2204 | 2512 | |
|---|
| 2205 | 2513 | /* |
|---|
| 2206 | 2514 | * The driver will not be starting up queues again if shutting down so |
|---|
| .. | .. |
|---|
| 2215 | 2523 | mutex_unlock(&dev->shutdown_lock); |
|---|
| 2216 | 2524 | } |
|---|
| 2217 | 2525 | |
|---|
| 2526 | +static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown) |
|---|
| 2527 | +{ |
|---|
| 2528 | + if (!nvme_wait_reset(&dev->ctrl)) |
|---|
| 2529 | + return -EBUSY; |
|---|
| 2530 | + nvme_dev_disable(dev, shutdown); |
|---|
| 2531 | + return 0; |
|---|
| 2532 | +} |
|---|
| 2533 | + |
|---|
| 2218 | 2534 | static int nvme_setup_prp_pools(struct nvme_dev *dev) |
|---|
| 2219 | 2535 | { |
|---|
| 2220 | 2536 | dev->prp_page_pool = dma_pool_create("prp list page", dev->dev, |
|---|
| 2221 | | - PAGE_SIZE, PAGE_SIZE, 0); |
|---|
| 2537 | + NVME_CTRL_PAGE_SIZE, |
|---|
| 2538 | + NVME_CTRL_PAGE_SIZE, 0); |
|---|
| 2222 | 2539 | if (!dev->prp_page_pool) |
|---|
| 2223 | 2540 | return -ENOMEM; |
|---|
| 2224 | 2541 | |
|---|
| .. | .. |
|---|
| 2238 | 2555 | dma_pool_destroy(dev->prp_small_pool); |
|---|
| 2239 | 2556 | } |
|---|
| 2240 | 2557 | |
|---|
| 2558 | +static void nvme_free_tagset(struct nvme_dev *dev) |
|---|
| 2559 | +{ |
|---|
| 2560 | + if (dev->tagset.tags) |
|---|
| 2561 | + blk_mq_free_tag_set(&dev->tagset); |
|---|
| 2562 | + dev->ctrl.tagset = NULL; |
|---|
| 2563 | +} |
|---|
| 2564 | + |
|---|
| 2241 | 2565 | static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl) |
|---|
| 2242 | 2566 | { |
|---|
| 2243 | 2567 | struct nvme_dev *dev = to_nvme_dev(ctrl); |
|---|
| 2244 | 2568 | |
|---|
| 2245 | 2569 | nvme_dbbuf_dma_free(dev); |
|---|
| 2246 | | - put_device(dev->dev); |
|---|
| 2247 | | - if (dev->tagset.tags) |
|---|
| 2248 | | - blk_mq_free_tag_set(&dev->tagset); |
|---|
| 2570 | + nvme_free_tagset(dev); |
|---|
| 2249 | 2571 | if (dev->ctrl.admin_q) |
|---|
| 2250 | 2572 | blk_put_queue(dev->ctrl.admin_q); |
|---|
| 2251 | | - kfree(dev->queues); |
|---|
| 2252 | 2573 | free_opal_dev(dev->ctrl.opal_dev); |
|---|
| 2253 | 2574 | mempool_destroy(dev->iod_mempool); |
|---|
| 2575 | + put_device(dev->dev); |
|---|
| 2576 | + kfree(dev->queues); |
|---|
| 2254 | 2577 | kfree(dev); |
|---|
| 2255 | 2578 | } |
|---|
| 2256 | 2579 | |
|---|
| 2257 | | -static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status) |
|---|
| 2580 | +static void nvme_remove_dead_ctrl(struct nvme_dev *dev) |
|---|
| 2258 | 2581 | { |
|---|
| 2259 | | - dev_warn(dev->ctrl.device, "Removing after probe failure status: %d\n", status); |
|---|
| 2260 | | - |
|---|
| 2582 | + /* |
|---|
| 2583 | + * Set state to deleting now to avoid blocking nvme_wait_reset(), which |
|---|
| 2584 | + * may be holding this pci_dev's device lock. |
|---|
| 2585 | + */ |
|---|
| 2586 | + nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); |
|---|
| 2261 | 2587 | nvme_get_ctrl(&dev->ctrl); |
|---|
| 2262 | 2588 | nvme_dev_disable(dev, false); |
|---|
| 2263 | 2589 | nvme_kill_queues(&dev->ctrl); |
|---|
| .. | .. |
|---|
| 2271 | 2597 | container_of(work, struct nvme_dev, ctrl.reset_work); |
|---|
| 2272 | 2598 | bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL); |
|---|
| 2273 | 2599 | int result; |
|---|
| 2274 | | - enum nvme_ctrl_state new_state = NVME_CTRL_LIVE; |
|---|
| 2275 | 2600 | |
|---|
| 2276 | 2601 | if (dev->ctrl.state != NVME_CTRL_RESETTING) { |
|---|
| 2277 | 2602 | dev_warn(dev->ctrl.device, "ctrl state %d is not RESETTING\n", |
|---|
| .. | .. |
|---|
| 2286 | 2611 | */ |
|---|
| 2287 | 2612 | if (dev->ctrl.ctrl_config & NVME_CC_ENABLE) |
|---|
| 2288 | 2613 | nvme_dev_disable(dev, false); |
|---|
| 2614 | + nvme_sync_queues(&dev->ctrl); |
|---|
| 2289 | 2615 | |
|---|
| 2290 | 2616 | mutex_lock(&dev->shutdown_lock); |
|---|
| 2291 | 2617 | result = nvme_pci_enable(dev); |
|---|
| .. | .. |
|---|
| 2300 | 2626 | if (result) |
|---|
| 2301 | 2627 | goto out_unlock; |
|---|
| 2302 | 2628 | |
|---|
| 2629 | + dma_set_min_align_mask(dev->dev, NVME_CTRL_PAGE_SIZE - 1); |
|---|
| 2630 | + |
|---|
| 2303 | 2631 | /* |
|---|
| 2304 | 2632 | * Limit the max command size to prevent iod->sg allocations going |
|---|
| 2305 | 2633 | * over a single page. |
|---|
| 2306 | 2634 | */ |
|---|
| 2307 | | - dev->ctrl.max_hw_sectors = NVME_MAX_KB_SZ << 1; |
|---|
| 2635 | + dev->ctrl.max_hw_sectors = min_t(u32, |
|---|
| 2636 | + NVME_MAX_KB_SZ << 1, dma_max_mapping_size(dev->dev) >> 9); |
|---|
| 2308 | 2637 | dev->ctrl.max_segments = NVME_MAX_SEGS; |
|---|
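max_hw_sectors is counted in 512-byte sectors, so both operands of the min above are unit conversions: a limit in KiB shifted left by one gives sectors, and a DMA mapping limit in bytes shifted right by nine gives sectors. A quick check with assumed values (4096 for NVME_MAX_KB_SZ and a hypothetical 2 MiB DMA mapping limit; both are assumptions of this sketch):

```c
/* Unit conversions behind the max_hw_sectors clamp (values are assumptions). */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t max_kb = 4096;                 /* assumed NVME_MAX_KB_SZ */
        uint64_t dma_max_bytes = 2ull << 20;    /* hypothetical 2 MiB DMA limit */

        uint32_t sectors_from_kb = max_kb << 1;         /* KiB -> 512 B sectors */
        uint32_t sectors_from_dma = (uint32_t)(dma_max_bytes >> 9);

        uint32_t max_hw_sectors = sectors_from_kb < sectors_from_dma ?
                                  sectors_from_kb : sectors_from_dma;

        printf("max_hw_sectors = %u (%u KiB per command)\n",
               max_hw_sectors, max_hw_sectors / 2);     /* 4096 -> 2048 KiB */
        return 0;
}
```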
| 2638 | + |
|---|
| 2639 | + /* |
|---|
| 2640 | + * Don't limit the IOMMU merged segment size. |
|---|
| 2641 | + */ |
|---|
| 2642 | + dma_set_max_seg_size(dev->dev, 0xffffffff); |
|---|
| 2643 | + |
|---|
| 2309 | 2644 | mutex_unlock(&dev->shutdown_lock); |
|---|
| 2310 | 2645 | |
|---|
| 2311 | 2646 | /* |
|---|
| .. | .. |
|---|
| 2318 | 2653 | result = -EBUSY; |
|---|
| 2319 | 2654 | goto out; |
|---|
| 2320 | 2655 | } |
|---|
| 2656 | + |
|---|
| 2657 | + /* |
|---|
| 2658 | + * We do not support an SGL for metadata (yet), so we are limited to a |
|---|
| 2659 | + * single integrity segment for the separate metadata pointer. |
|---|
| 2660 | + */ |
|---|
| 2661 | + dev->ctrl.max_integrity_segments = 1; |
|---|
| 2321 | 2662 | |
|---|
| 2322 | 2663 | result = nvme_init_identify(&dev->ctrl); |
|---|
| 2323 | 2664 | if (result) |
|---|
| .. | .. |
|---|
| 2359 | 2700 | dev_warn(dev->ctrl.device, "IO queues not created\n"); |
|---|
| 2360 | 2701 | nvme_kill_queues(&dev->ctrl); |
|---|
| 2361 | 2702 | nvme_remove_namespaces(&dev->ctrl); |
|---|
| 2362 | | - new_state = NVME_CTRL_ADMIN_ONLY; |
|---|
| 2703 | + nvme_free_tagset(dev); |
|---|
| 2363 | 2704 | } else { |
|---|
| 2364 | 2705 | nvme_start_queues(&dev->ctrl); |
|---|
| 2365 | 2706 | nvme_wait_freeze(&dev->ctrl); |
|---|
| 2366 | | - /* hit this only when allocate tagset fails */ |
|---|
| 2367 | | - if (nvme_dev_add(dev)) |
|---|
| 2368 | | - new_state = NVME_CTRL_ADMIN_ONLY; |
|---|
| 2707 | + nvme_dev_add(dev); |
|---|
| 2369 | 2708 | nvme_unfreeze(&dev->ctrl); |
|---|
| 2370 | 2709 | } |
|---|
| 2371 | 2710 | |
|---|
| .. | .. |
|---|
| 2373 | 2712 | * If only admin queue live, keep it to do further investigation or |
|---|
| 2374 | 2713 | * recovery. |
|---|
| 2375 | 2714 | */ |
|---|
| 2376 | | - if (!nvme_change_ctrl_state(&dev->ctrl, new_state)) { |
|---|
| 2715 | + if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) { |
|---|
| 2377 | 2716 | dev_warn(dev->ctrl.device, |
|---|
| 2378 | | - "failed to mark controller state %d\n", new_state); |
|---|
| 2717 | + "failed to mark controller live state\n"); |
|---|
| 2379 | 2718 | result = -ENODEV; |
|---|
| 2380 | 2719 | goto out; |
|---|
| 2381 | 2720 | } |
|---|
| .. | .. |
|---|
| 2386 | 2725 | out_unlock: |
|---|
| 2387 | 2726 | mutex_unlock(&dev->shutdown_lock); |
|---|
| 2388 | 2727 | out: |
|---|
| 2389 | | - nvme_remove_dead_ctrl(dev, result); |
|---|
| 2728 | + if (result) |
|---|
| 2729 | + dev_warn(dev->ctrl.device, |
|---|
| 2730 | + "Removing after probe failure status: %d\n", result); |
|---|
| 2731 | + nvme_remove_dead_ctrl(dev); |
|---|
| 2390 | 2732 | } |
|---|
| 2391 | 2733 | |
|---|
| 2392 | 2734 | static void nvme_remove_dead_ctrl_work(struct work_struct *work) |
|---|
| .. | .. |
|---|
| 2421 | 2763 | { |
|---|
| 2422 | 2764 | struct pci_dev *pdev = to_pci_dev(to_nvme_dev(ctrl)->dev); |
|---|
| 2423 | 2765 | |
|---|
| 2424 | | - return snprintf(buf, size, "%s", dev_name(&pdev->dev)); |
|---|
| 2766 | + return snprintf(buf, size, "%s\n", dev_name(&pdev->dev)); |
|---|
| 2425 | 2767 | } |
|---|
| 2426 | 2768 | |
|---|
| 2427 | 2769 | static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { |
|---|
| 2428 | 2770 | .name = "pcie", |
|---|
| 2429 | 2771 | .module = THIS_MODULE, |
|---|
| 2430 | | - .flags = NVME_F_METADATA_SUPPORTED, |
|---|
| 2772 | + .flags = NVME_F_METADATA_SUPPORTED | |
|---|
| 2773 | + NVME_F_PCI_P2PDMA, |
|---|
| 2431 | 2774 | .reg_read32 = nvme_pci_reg_read32, |
|---|
| 2432 | 2775 | .reg_write32 = nvme_pci_reg_write32, |
|---|
| 2433 | 2776 | .reg_read64 = nvme_pci_reg_read64, |
|---|
| .. | .. |
|---|
| 2478 | 2821 | (dmi_match(DMI_BOARD_NAME, "PRIME B350M-A") || |
|---|
| 2479 | 2822 | dmi_match(DMI_BOARD_NAME, "PRIME Z370-A"))) |
|---|
| 2480 | 2823 | return NVME_QUIRK_NO_APST; |
|---|
| 2824 | + } else if ((pdev->vendor == 0x144d && (pdev->device == 0xa801 || |
|---|
| 2825 | + pdev->device == 0xa808 || pdev->device == 0xa809)) || |
|---|
| 2826 | + (pdev->vendor == 0x1e0f && pdev->device == 0x0001)) { |
|---|
| 2827 | + /* |
|---|
| 2828 | + * Force host-managed NVMe power settings for lowest |
|---|
| 2829 | + * idle power with quick resume latency on Samsung and |
|---|
| 2830 | + * Toshiba SSDs, based on suspend behavior observed on |
|---|
| 2831 | + * a Coffee Lake board in the LENOVO C640. |
|---|
| 2832 | + */ |
|---|
| 2833 | + if ((dmi_match(DMI_BOARD_VENDOR, "LENOVO")) && |
|---|
| 2834 | + dmi_match(DMI_BOARD_NAME, "LNVNB161216")) |
|---|
| 2835 | + return NVME_QUIRK_SIMPLE_SUSPEND; |
|---|
| 2481 | 2836 | } |
|---|
| 2482 | 2837 | |
|---|
| 2483 | 2838 | return 0; |
|---|
| 2484 | 2839 | } |
|---|
| 2840 | + |
|---|
| 2841 | +#ifdef CONFIG_ACPI |
|---|
| 2842 | +static bool nvme_acpi_storage_d3(struct pci_dev *dev) |
|---|
| 2843 | +{ |
|---|
| 2844 | + struct acpi_device *adev = ACPI_COMPANION(&dev->dev); |
|---|
| 2845 | + u8 val; |
|---|
| 2846 | + |
|---|
| 2847 | + /* |
|---|
| 2848 | + * Look for _DSD property specifying that the storage device on the port |
|---|
| 2849 | + * must use D3 to support deep platform power savings during |
|---|
| 2850 | + * suspend-to-idle. |
|---|
| 2851 | + */ |
|---|
| 2852 | + |
|---|
| 2853 | + if (!adev) |
|---|
| 2854 | + return false; |
|---|
| 2855 | + if (fwnode_property_read_u8(acpi_fwnode_handle(adev), "StorageD3Enable", |
|---|
| 2856 | + &val)) |
|---|
| 2857 | + return false; |
|---|
| 2858 | + return val == 1; |
|---|
| 2859 | +} |
|---|
| 2860 | +#else |
|---|
| 2861 | +static inline bool nvme_acpi_storage_d3(struct pci_dev *dev) |
|---|
| 2862 | +{ |
|---|
| 2863 | + return false; |
|---|
| 2864 | +} |
|---|
| 2865 | +#endif /* CONFIG_ACPI */ |
|---|
| 2485 | 2866 | |
|---|
| 2486 | 2867 | static void nvme_async_probe(void *data, async_cookie_t cookie) |
|---|
| 2487 | 2868 | { |
|---|
| .. | .. |
|---|
| 2507 | 2888 | if (!dev) |
|---|
| 2508 | 2889 | return -ENOMEM; |
|---|
| 2509 | 2890 | |
|---|
| 2510 | | - dev->queues = kcalloc_node(num_possible_cpus() + 1, |
|---|
| 2891 | + dev->nr_write_queues = write_queues; |
|---|
| 2892 | + dev->nr_poll_queues = poll_queues; |
|---|
| 2893 | + dev->nr_allocated_queues = nvme_max_io_queues(dev) + 1; |
|---|
| 2894 | + dev->queues = kcalloc_node(dev->nr_allocated_queues, |
|---|
| 2511 | 2895 | sizeof(struct nvme_queue), GFP_KERNEL, node); |
|---|
| 2512 | 2896 | if (!dev->queues) |
|---|
| 2513 | 2897 | goto free; |
|---|
| .. | .. |
|---|
| 2522 | 2906 | INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work); |
|---|
| 2523 | 2907 | INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work); |
|---|
| 2524 | 2908 | mutex_init(&dev->shutdown_lock); |
|---|
| 2525 | | - init_completion(&dev->ioq_wait); |
|---|
| 2526 | 2909 | |
|---|
| 2527 | 2910 | result = nvme_setup_prp_pools(dev); |
|---|
| 2528 | 2911 | if (result) |
|---|
| .. | .. |
|---|
| 2530 | 2913 | |
|---|
| 2531 | 2914 | quirks |= check_vendor_combination_bug(pdev); |
|---|
| 2532 | 2915 | |
|---|
| 2916 | + if (!noacpi && nvme_acpi_storage_d3(pdev)) { |
|---|
| 2917 | + /* |
|---|
| 2918 | + * Some systems use a BIOS workaround to ask for D3 on |
|---|
| 2919 | + * platforms that support kernel-managed suspend. |
|---|
| 2920 | + */ |
|---|
| 2921 | + dev_info(&pdev->dev, |
|---|
| 2922 | + "platform quirk: setting simple suspend\n"); |
|---|
| 2923 | + quirks |= NVME_QUIRK_SIMPLE_SUSPEND; |
|---|
| 2924 | + } |
|---|
| 2925 | + |
|---|
| 2533 | 2926 | /* |
|---|
| 2534 | 2927 | * Double check that our mempool alloc size will cover the biggest |
|---|
| 2535 | 2928 | * command we support. |
|---|
| 2536 | 2929 | */ |
|---|
| 2537 | | - alloc_size = nvme_pci_iod_alloc_size(dev, NVME_MAX_KB_SZ, |
|---|
| 2538 | | - NVME_MAX_SEGS, true); |
|---|
| 2930 | + alloc_size = nvme_pci_iod_alloc_size(); |
|---|
| 2539 | 2931 | WARN_ON_ONCE(alloc_size > PAGE_SIZE); |
|---|
| 2540 | 2932 | |
|---|
| 2541 | 2933 | dev->iod_mempool = mempool_create_node(1, mempool_kmalloc, |
|---|
| .. | .. |
|---|
| 2555 | 2947 | dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev)); |
|---|
| 2556 | 2948 | |
|---|
| 2557 | 2949 | nvme_reset_ctrl(&dev->ctrl); |
|---|
| 2558 | | - nvme_get_ctrl(&dev->ctrl); |
|---|
| 2559 | 2950 | async_schedule(nvme_async_probe, dev); |
|---|
| 2560 | 2951 | |
|---|
| 2561 | 2952 | return 0; |
|---|
| .. | .. |
|---|
| 2577 | 2968 | static void nvme_reset_prepare(struct pci_dev *pdev) |
|---|
| 2578 | 2969 | { |
|---|
| 2579 | 2970 | struct nvme_dev *dev = pci_get_drvdata(pdev); |
|---|
| 2580 | | - nvme_dev_disable(dev, false); |
|---|
| 2971 | + |
|---|
| 2972 | + /* |
|---|
| 2973 | + * We don't need to check the return value from waiting for the reset |
|---|
| 2974 | + * state as pci_dev device lock is held, making it impossible to race |
|---|
| 2975 | + * with ->remove(). |
|---|
| 2976 | + */ |
|---|
| 2977 | + nvme_disable_prepare_reset(dev, false); |
|---|
| 2978 | + nvme_sync_queues(&dev->ctrl); |
|---|
| 2581 | 2979 | } |
|---|
| 2582 | 2980 | |
|---|
| 2583 | 2981 | static void nvme_reset_done(struct pci_dev *pdev) |
|---|
| 2584 | 2982 | { |
|---|
| 2585 | 2983 | struct nvme_dev *dev = pci_get_drvdata(pdev); |
|---|
| 2586 | | - nvme_reset_ctrl_sync(&dev->ctrl); |
|---|
| 2984 | + |
|---|
| 2985 | + if (!nvme_try_sched_reset(&dev->ctrl)) |
|---|
| 2986 | + flush_work(&dev->ctrl.reset_work); |
|---|
| 2587 | 2987 | } |
|---|
| 2588 | 2988 | |
|---|
| 2589 | 2989 | static void nvme_shutdown(struct pci_dev *pdev) |
|---|
| 2590 | 2990 | { |
|---|
| 2591 | 2991 | struct nvme_dev *dev = pci_get_drvdata(pdev); |
|---|
| 2592 | | - nvme_dev_disable(dev, true); |
|---|
| 2992 | + |
|---|
| 2993 | + nvme_disable_prepare_reset(dev, true); |
|---|
| 2593 | 2994 | } |
|---|
| 2594 | 2995 | |
|---|
| 2595 | 2996 | /* |
|---|
| .. | .. |
|---|
| 2617 | 3018 | nvme_free_host_mem(dev); |
|---|
| 2618 | 3019 | nvme_dev_remove_admin(dev); |
|---|
| 2619 | 3020 | nvme_free_queues(dev, 0); |
|---|
| 2620 | | - nvme_uninit_ctrl(&dev->ctrl); |
|---|
| 2621 | 3021 | nvme_release_prp_pools(dev); |
|---|
| 2622 | 3022 | nvme_dev_unmap(dev); |
|---|
| 2623 | | - nvme_put_ctrl(&dev->ctrl); |
|---|
| 3023 | + nvme_uninit_ctrl(&dev->ctrl); |
|---|
| 2624 | 3024 | } |
|---|
| 2625 | 3025 | |
|---|
| 2626 | 3026 | #ifdef CONFIG_PM_SLEEP |
|---|
| 2627 | | -static int nvme_suspend(struct device *dev) |
|---|
| 3027 | +static int nvme_get_power_state(struct nvme_ctrl *ctrl, u32 *ps) |
|---|
| 2628 | 3028 | { |
|---|
| 2629 | | - struct pci_dev *pdev = to_pci_dev(dev); |
|---|
| 2630 | | - struct nvme_dev *ndev = pci_get_drvdata(pdev); |
|---|
| 3029 | + return nvme_get_features(ctrl, NVME_FEAT_POWER_MGMT, 0, NULL, 0, ps); |
|---|
| 3030 | +} |
|---|
| 2631 | 3031 | |
|---|
| 2632 | | - nvme_dev_disable(ndev, true); |
|---|
| 2633 | | - return 0; |
|---|
| 3032 | +static int nvme_set_power_state(struct nvme_ctrl *ctrl, u32 ps) |
|---|
| 3033 | +{ |
|---|
| 3034 | + return nvme_set_features(ctrl, NVME_FEAT_POWER_MGMT, ps, NULL, 0, NULL); |
|---|
| 2634 | 3035 | } |
|---|
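The two wrappers above drive the NVMe Power Management feature (Feature Identifier 02h): the Set Features form carries the target power state in bits 4:0 of Command Dword 11, and the Get Features form returns the current power state in the completion's result dword, which is how nvme_get_power_state() fills *ps. As a rough illustration of what such a Set Features command looks like at the struct nvme_command level (the helper name is invented; the actual command construction lives in nvme_set_features() in core.c):

```c
#include <linux/nvme.h>		/* struct nvme_command, NVME_FEAT_POWER_MGMT */
#include <linux/string.h>	/* memset() */
#include <asm/byteorder.h>	/* cpu_to_le32() */

/* Illustration only: fill in a Set Features (Power Management) command. */
static void foo_build_set_power_state(struct nvme_command *c, u32 ps)
{
	memset(c, 0, sizeof(*c));
	c->features.opcode  = nvme_admin_set_features;
	c->features.fid     = cpu_to_le32(NVME_FEAT_POWER_MGMT);
	c->features.dword11 = cpu_to_le32(ps);	/* power state in bits 4:0 */
}
```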
| 2635 | 3036 | |
|---|
| 2636 | 3037 | static int nvme_resume(struct device *dev) |
|---|
| 2637 | 3038 | { |
|---|
| 3039 | + struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev)); |
|---|
| 3040 | + struct nvme_ctrl *ctrl = &ndev->ctrl; |
|---|
| 3041 | + |
|---|
| 3042 | + if (ndev->last_ps == U32_MAX || |
|---|
| 3043 | + nvme_set_power_state(ctrl, ndev->last_ps) != 0) |
|---|
| 3044 | + return nvme_try_sched_reset(&ndev->ctrl); |
|---|
| 3045 | + return 0; |
|---|
| 3046 | +} |
|---|
| 3047 | + |
|---|
| 3048 | +static int nvme_suspend(struct device *dev) |
|---|
| 3049 | +{ |
|---|
| 3050 | + struct pci_dev *pdev = to_pci_dev(dev); |
|---|
| 3051 | + struct nvme_dev *ndev = pci_get_drvdata(pdev); |
|---|
| 3052 | + struct nvme_ctrl *ctrl = &ndev->ctrl; |
|---|
| 3053 | + int ret = -EBUSY; |
|---|
| 3054 | + |
|---|
| 3055 | + ndev->last_ps = U32_MAX; |
|---|
| 3056 | + |
|---|
| 3057 | + /* |
|---|
| 3058 | + * The platform does not remove power during a kernel-managed suspend, so |
|---|
| 3059 | + * use host-managed NVMe power settings for the lowest idle power where |
|---|
| 3060 | + * possible. This should give quicker resume latency than a full device |
|---|
| 3061 | + * shutdown. But if the firmware is involved after the suspend, or the |
|---|
| 3062 | + * device does not support any non-default power states, shut the |
|---|
| 3063 | + * device down fully. |
|---|
| 3064 | + * |
|---|
| 3065 | + * If ASPM is not enabled for the device, shut down the device and allow |
|---|
| 3066 | + * the PCI bus layer to put it into D3 in order to take the PCIe link |
|---|
| 3067 | + * down, so as to allow the platform to achieve its minimum low-power |
|---|
| 3068 | + * state (which may not be possible if the link is up). |
|---|
| 3069 | + * |
|---|
| 3070 | + * If a host memory buffer is enabled, shut down the device: the NVMe |
|---|
| 3071 | + * specification allows the device to access the host memory buffer in |
|---|
| 3072 | + * host DRAM from any power state, but host DRAM is not accessible |
|---|
| 3073 | + * during S3, so such accesses would fail. |
|---|
| 3074 | + */ |
|---|
| 3075 | + if (pm_suspend_via_firmware() || !ctrl->npss || |
|---|
| 3076 | + !pcie_aspm_enabled(pdev) || |
|---|
| 3077 | + ndev->nr_host_mem_descs || |
|---|
| 3078 | + (ndev->ctrl.quirks & NVME_QUIRK_SIMPLE_SUSPEND)) |
|---|
| 3079 | + return nvme_disable_prepare_reset(ndev, true); |
|---|
| 3080 | + |
|---|
| 3081 | + nvme_start_freeze(ctrl); |
|---|
| 3082 | + nvme_wait_freeze(ctrl); |
|---|
| 3083 | + nvme_sync_queues(ctrl); |
|---|
| 3084 | + |
|---|
| 3085 | + if (ctrl->state != NVME_CTRL_LIVE) |
|---|
| 3086 | + goto unfreeze; |
|---|
| 3087 | + |
|---|
| 3088 | + ret = nvme_get_power_state(ctrl, &ndev->last_ps); |
|---|
| 3089 | + if (ret < 0) |
|---|
| 3090 | + goto unfreeze; |
|---|
| 3091 | + |
|---|
| 3092 | + /* |
|---|
| 3093 | + * A saved state prevents PCI PM from generically controlling the |
|---|
| 3094 | + * device's power. If we're using protocol-specific settings, we don't |
|---|
| 3095 | + * want the PCI core interfering. |
|---|
| 3096 | + */ |
|---|
| 3097 | + pci_save_state(pdev); |
|---|
| 3098 | + |
|---|
| 3099 | + ret = nvme_set_power_state(ctrl, ctrl->npss); |
|---|
| 3100 | + if (ret < 0) |
|---|
| 3101 | + goto unfreeze; |
|---|
| 3102 | + |
|---|
| 3103 | + if (ret) { |
|---|
| 3104 | + /* discard the saved state */ |
|---|
| 3105 | + pci_load_saved_state(pdev, NULL); |
|---|
| 3106 | + |
|---|
| 3107 | + /* |
|---|
| 3108 | + * Clearing npss forces a controller reset on resume. The |
|---|
| 3109 | + * correct value will be rediscovered then. |
|---|
| 3110 | + */ |
|---|
| 3111 | + ret = nvme_disable_prepare_reset(ndev, true); |
|---|
| 3112 | + ctrl->npss = 0; |
|---|
| 3113 | + } |
|---|
| 3114 | +unfreeze: |
|---|
| 3115 | + nvme_unfreeze(ctrl); |
|---|
| 3116 | + return ret; |
|---|
| 3117 | +} |
|---|
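The block comment at the top of nvme_suspend() encodes a gating decision: host-managed NVMe power states are used only when the transition is not firmware-mediated, the controller reports non-default power states, ASPM keeps the PCIe link manageable, no host memory buffer is active, and no quirk forces the simple path; otherwise the driver falls back to a full shutdown. A condensed restatement of that gate follows, with the helper name and the fields of struct foo_dev invented purely for illustration:

```c
#include <linux/pci.h>		/* pcie_aspm_enabled() */
#include <linux/suspend.h>	/* pm_suspend_via_firmware() */

/* Hypothetical per-device state, named only for this sketch. */
struct foo_dev {
	struct pci_dev *pdev;
	unsigned int npss;		/* non-zero: device reports non-default power states */
	unsigned int nr_host_mem_descs;	/* non-zero: host memory buffer in use */
	bool simple_suspend_quirk;	/* platform asked for the full-shutdown path */
};

/* Illustration of the gate at the top of nvme_suspend() above. */
static bool foo_can_use_host_managed_ps(struct foo_dev *fd)
{
	if (pm_suspend_via_firmware())		/* firmware-mediated (e.g. S3), not s2idle */
		return false;
	if (!fd->npss)				/* only the default power state exists */
		return false;
	if (!pcie_aspm_enabled(fd->pdev))	/* link would have to come down anyway */
		return false;
	if (fd->nr_host_mem_descs)		/* HMB accesses could hit unavailable DRAM */
		return false;
	if (fd->simple_suspend_quirk)
		return false;
	return true;
}
```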
| 3118 | + |
|---|
| 3119 | +static int nvme_simple_suspend(struct device *dev) |
|---|
| 3120 | +{ |
|---|
| 3121 | + struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev)); |
|---|
| 3122 | + |
|---|
| 3123 | + return nvme_disable_prepare_reset(ndev, true); |
|---|
| 3124 | +} |
|---|
| 3125 | + |
|---|
| 3126 | +static int nvme_simple_resume(struct device *dev) |
|---|
| 3127 | +{ |
|---|
| 2638 | 3128 | struct pci_dev *pdev = to_pci_dev(dev); |
|---|
| 2639 | 3129 | struct nvme_dev *ndev = pci_get_drvdata(pdev); |
|---|
| 2640 | 3130 | |
|---|
| 2641 | | - nvme_reset_ctrl(&ndev->ctrl); |
|---|
| 2642 | | - return 0; |
|---|
| 3131 | + return nvme_try_sched_reset(&ndev->ctrl); |
|---|
| 2643 | 3132 | } |
|---|
| 2644 | | -#endif |
|---|
| 2645 | 3133 | |
|---|
| 2646 | | -static SIMPLE_DEV_PM_OPS(nvme_dev_pm_ops, nvme_suspend, nvme_resume); |
|---|
| 3134 | +static const struct dev_pm_ops nvme_dev_pm_ops = { |
|---|
| 3135 | + .suspend = nvme_suspend, |
|---|
| 3136 | + .resume = nvme_resume, |
|---|
| 3137 | + .freeze = nvme_simple_suspend, |
|---|
| 3138 | + .thaw = nvme_simple_resume, |
|---|
| 3139 | + .poweroff = nvme_simple_suspend, |
|---|
| 3140 | + .restore = nvme_simple_resume, |
|---|
| 3141 | +}; |
|---|
| 3142 | +#endif /* CONFIG_PM_SLEEP */ |
|---|
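The switch from SIMPLE_DEV_PM_OPS() to an explicit dev_pm_ops table matters because SIMPLE_DEV_PM_OPS() points all six system-sleep callbacks at a single suspend/resume pair, whereas here suspend-to-idle/S3 (.suspend/.resume) takes the power-state path above while the hibernation callbacks (.freeze/.thaw/.poweroff/.restore) always go through the simple shutdown-and-reset helpers. A minimal sketch of the same split for a hypothetical driver (all foo_* names are invented):

```c
#include <linux/pm.h>

/* Hypothetical callbacks: a light path for system sleep, a full path for hibernation. */
static int foo_light_suspend(struct device *dev) { return 0; }
static int foo_light_resume(struct device *dev)  { return 0; }
static int foo_full_suspend(struct device *dev)  { return 0; }
static int foo_full_resume(struct device *dev)   { return 0; }

static const struct dev_pm_ops foo_pm_ops = {
	.suspend  = foo_light_suspend,	/* suspend-to-idle / S3 entry */
	.resume   = foo_light_resume,
	.freeze   = foo_full_suspend,	/* quiesce before the hibernation image is created */
	.thaw     = foo_full_resume,
	.poweroff = foo_full_suspend,	/* final power-off after the image is written */
	.restore  = foo_full_resume,	/* control returns after the image is loaded */
};
```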
| 2647 | 3143 | |
|---|
| 2648 | 3144 | static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev, |
|---|
| 2649 | 3145 | pci_channel_state_t state) |
|---|
| .. | .. |
|---|
| 2686 | 3182 | struct nvme_dev *dev = pci_get_drvdata(pdev); |
|---|
| 2687 | 3183 | |
|---|
| 2688 | 3184 | flush_work(&dev->ctrl.reset_work); |
|---|
| 2689 | | - pci_cleanup_aer_uncorrect_error_status(pdev); |
|---|
| 2690 | 3185 | } |
|---|
| 2691 | 3186 | |
|---|
| 2692 | 3187 | static const struct pci_error_handlers nvme_err_handler = { |
|---|
| .. | .. |
|---|
| 2698 | 3193 | }; |
|---|
| 2699 | 3194 | |
|---|
| 2700 | 3195 | static const struct pci_device_id nvme_id_table[] = { |
|---|
| 2701 | | - { PCI_VDEVICE(INTEL, 0x0953), |
|---|
| 3196 | + { PCI_VDEVICE(INTEL, 0x0953), /* Intel 750/P3500/P3600/P3700 */ |
|---|
| 2702 | 3197 | .driver_data = NVME_QUIRK_STRIPE_SIZE | |
|---|
| 2703 | 3198 | NVME_QUIRK_DEALLOCATE_ZEROES, }, |
|---|
| 2704 | | - { PCI_VDEVICE(INTEL, 0x0a53), |
|---|
| 3199 | + { PCI_VDEVICE(INTEL, 0x0a53), /* Intel P3520 */ |
|---|
| 2705 | 3200 | .driver_data = NVME_QUIRK_STRIPE_SIZE | |
|---|
| 2706 | 3201 | NVME_QUIRK_DEALLOCATE_ZEROES, }, |
|---|
| 2707 | | - { PCI_VDEVICE(INTEL, 0x0a54), |
|---|
| 3202 | + { PCI_VDEVICE(INTEL, 0x0a54), /* Intel P4500/P4600 */ |
|---|
| 2708 | 3203 | .driver_data = NVME_QUIRK_STRIPE_SIZE | |
|---|
| 2709 | | - NVME_QUIRK_DEALLOCATE_ZEROES, }, |
|---|
| 2710 | | - { PCI_VDEVICE(INTEL, 0x0a55), |
|---|
| 3204 | + NVME_QUIRK_DEALLOCATE_ZEROES | |
|---|
| 3205 | + NVME_QUIRK_IGNORE_DEV_SUBNQN, }, |
|---|
| 3206 | + { PCI_VDEVICE(INTEL, 0x0a55), /* Dell Express Flash P4600 */ |
|---|
| 2711 | 3207 | .driver_data = NVME_QUIRK_STRIPE_SIZE | |
|---|
| 2712 | 3208 | NVME_QUIRK_DEALLOCATE_ZEROES, }, |
|---|
| 2713 | 3209 | { PCI_VDEVICE(INTEL, 0xf1a5), /* Intel 600P/P3100 */ |
|---|
| 2714 | 3210 | .driver_data = NVME_QUIRK_NO_DEEPEST_PS | |
|---|
| 2715 | | - NVME_QUIRK_MEDIUM_PRIO_SQ }, |
|---|
| 3211 | + NVME_QUIRK_MEDIUM_PRIO_SQ | |
|---|
| 3212 | + NVME_QUIRK_NO_TEMP_THRESH_CHANGE | |
|---|
| 3213 | + NVME_QUIRK_DISABLE_WRITE_ZEROES, }, |
|---|
| 3214 | + { PCI_VDEVICE(INTEL, 0xf1a6), /* Intel 760p/Pro 7600p */ |
|---|
| 3215 | + .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, }, |
|---|
| 2716 | 3216 | { PCI_VDEVICE(INTEL, 0x5845), /* Qemu emulated controller */ |
|---|
| 2717 | | - .driver_data = NVME_QUIRK_IDENTIFY_CNS, }, |
|---|
| 3217 | + .driver_data = NVME_QUIRK_IDENTIFY_CNS | |
|---|
| 3218 | + NVME_QUIRK_DISABLE_WRITE_ZEROES | |
|---|
| 3219 | + NVME_QUIRK_BOGUS_NID, }, |
|---|
| 3220 | + { PCI_VDEVICE(REDHAT, 0x0010), /* Qemu emulated controller */ |
|---|
| 3221 | + .driver_data = NVME_QUIRK_BOGUS_NID, }, |
|---|
| 3222 | + { PCI_DEVICE(0x126f, 0x2263), /* Silicon Motion unidentified */ |
|---|
| 3223 | + .driver_data = NVME_QUIRK_NO_NS_DESC_LIST, }, |
|---|
| 2718 | 3224 | { PCI_DEVICE(0x1bb1, 0x0100), /* Seagate Nytro Flash Storage */ |
|---|
| 2719 | | - .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, |
|---|
| 3225 | + .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY | |
|---|
| 3226 | + NVME_QUIRK_NO_NS_DESC_LIST, }, |
|---|
| 2720 | 3227 | { PCI_DEVICE(0x1c58, 0x0003), /* HGST adapter */ |
|---|
| 2721 | 3228 | .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, |
|---|
| 2722 | 3229 | { PCI_DEVICE(0x1c58, 0x0023), /* WDC SN200 adapter */ |
|---|
| .. | .. |
|---|
| 2726 | 3233 | { PCI_DEVICE(0x144d, 0xa821), /* Samsung PM1725 */ |
|---|
| 2727 | 3234 | .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, |
|---|
| 2728 | 3235 | { PCI_DEVICE(0x144d, 0xa822), /* Samsung PM1725a */ |
|---|
| 2729 | | - .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, |
|---|
| 3236 | + .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY | |
|---|
| 3237 | + NVME_QUIRK_DISABLE_WRITE_ZEROES | |
|---|
| 3238 | + NVME_QUIRK_IGNORE_DEV_SUBNQN, }, |
|---|
| 3239 | + { PCI_DEVICE(0x1987, 0x5016), /* Phison E16 */ |
|---|
| 3240 | + .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN | |
|---|
| 3241 | + NVME_QUIRK_BOGUS_NID, }, |
|---|
| 3242 | + { PCI_DEVICE(0x1b4b, 0x1092), /* Lexar 256 GB SSD */ |
|---|
| 3243 | + .driver_data = NVME_QUIRK_NO_NS_DESC_LIST | |
|---|
| 3244 | + NVME_QUIRK_IGNORE_DEV_SUBNQN, }, |
|---|
| 2730 | 3245 | { PCI_DEVICE(0x1d1d, 0x1f1f), /* LightNVM qemu device */ |
|---|
| 2731 | 3246 | .driver_data = NVME_QUIRK_LIGHTNVM, }, |
|---|
| 2732 | 3247 | { PCI_DEVICE(0x1d1d, 0x2807), /* CNEX WL */ |
|---|
| 2733 | 3248 | .driver_data = NVME_QUIRK_LIGHTNVM, }, |
|---|
| 2734 | 3249 | { PCI_DEVICE(0x1d1d, 0x2601), /* CNEX Granby */ |
|---|
| 2735 | 3250 | .driver_data = NVME_QUIRK_LIGHTNVM, }, |
|---|
| 2736 | | - { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, |
|---|
| 3251 | + { PCI_DEVICE(0x10ec, 0x5762), /* ADATA SX6000LNP */ |
|---|
| 3252 | + .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN | |
|---|
| 3253 | + NVME_QUIRK_BOGUS_NID, }, |
|---|
| 3254 | + { PCI_DEVICE(0x1cc1, 0x8201), /* ADATA SX8200PNP 512GB */ |
|---|
| 3255 | + .driver_data = NVME_QUIRK_NO_DEEPEST_PS | |
|---|
| 3256 | + NVME_QUIRK_IGNORE_DEV_SUBNQN, }, |
|---|
| 3257 | + { PCI_DEVICE(0x1344, 0x5407), /* Micron Technology Inc NVMe SSD */ |
|---|
| 3258 | + .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN }, |
|---|
| 3259 | + { PCI_DEVICE(0x1344, 0x6001), /* Micron Nitro NVMe */ |
|---|
| 3260 | + .driver_data = NVME_QUIRK_BOGUS_NID, }, |
|---|
| 3261 | + { PCI_DEVICE(0x1c5c, 0x1504), /* SK Hynix PC400 */ |
|---|
| 3262 | + .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, |
|---|
| 3263 | + { PCI_DEVICE(0x15b7, 0x2001), /* Sandisk Skyhawk */ |
|---|
| 3264 | + .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, |
|---|
| 3265 | + { PCI_DEVICE(0x2646, 0x2262), /* KINGSTON SKC2000 NVMe SSD */ |
|---|
| 3266 | + .driver_data = NVME_QUIRK_NO_DEEPEST_PS, }, |
|---|
| 2737 | 3267 | { PCI_DEVICE(0x2646, 0x2263), /* KINGSTON A2000 NVMe SSD */ |
|---|
| 2738 | 3268 | .driver_data = NVME_QUIRK_NO_DEEPEST_PS, }, |
|---|
| 2739 | | - { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) }, |
|---|
| 3269 | + { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001), |
|---|
| 3270 | + .driver_data = NVME_QUIRK_SINGLE_VECTOR }, |
|---|
| 2740 | 3271 | { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) }, |
|---|
| 3272 | + { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2005), |
|---|
| 3273 | + .driver_data = NVME_QUIRK_SINGLE_VECTOR | |
|---|
| 3274 | + NVME_QUIRK_128_BYTES_SQES | |
|---|
| 3275 | + NVME_QUIRK_SHARED_TAGS | |
|---|
| 3276 | + NVME_QUIRK_SKIP_CID_GEN }, |
|---|
| 3277 | + { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, |
|---|
| 2741 | 3278 | { 0, } |
|---|
| 2742 | 3279 | }; |
|---|
| 2743 | 3280 | MODULE_DEVICE_TABLE(pci, nvme_id_table); |
|---|
| .. | .. |
|---|
| 2748 | 3285 | .probe = nvme_probe, |
|---|
| 2749 | 3286 | .remove = nvme_remove, |
|---|
| 2750 | 3287 | .shutdown = nvme_shutdown, |
|---|
| 3288 | +#ifdef CONFIG_PM_SLEEP |
|---|
| 2751 | 3289 | .driver = { |
|---|
| 2752 | 3290 | .pm = &nvme_dev_pm_ops, |
|---|
| 2753 | 3291 | }, |
|---|
| 3292 | +#endif |
|---|
| 2754 | 3293 | .sriov_configure = pci_sriov_configure_simple, |
|---|
| 2755 | 3294 | .err_handler = &nvme_err_handler, |
|---|
| 2756 | 3295 | }; |
|---|
| 2757 | 3296 | |
|---|
| 2758 | 3297 | static int __init nvme_init(void) |
|---|
| 2759 | 3298 | { |
|---|
| 3299 | + BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64); |
|---|
| 3300 | + BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64); |
|---|
| 3301 | + BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64); |
|---|
| 3302 | + BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2); |
|---|
| 3303 | + |
|---|
| 2760 | 3304 | return pci_register_driver(&nvme_driver); |
|---|
| 2761 | 3305 | } |
|---|
| 2762 | 3306 | |
|---|
| .. | .. |
|---|
| 2764 | 3308 | { |
|---|
| 2765 | 3309 | pci_unregister_driver(&nvme_driver); |
|---|
| 2766 | 3310 | flush_workqueue(nvme_wq); |
|---|
| 2767 | | - _nvme_check_size(); |
|---|
| 2768 | 3311 | } |
|---|
| 2769 | 3312 | |
|---|
| 2770 | 3313 | MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>"); |
|---|