+// SPDX-License-Identifier: GPL-2.0
 /*
  * Copyright (c) 2017-2018 Christoph Hellwig.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
  */
 
 #include <linux/backing-dev.h>
@@ ... @@
 		sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance);
 	} else if (ns->head->disk) {
 		sprintf(disk_name, "nvme%dc%dn%d", ctrl->subsys->instance,
-			ctrl->cntlid, ns->head->instance);
+			ctrl->instance, ns->head->instance);
 		*flags = GENHD_FL_HIDDEN;
 	} else {
 		sprintf(disk_name, "nvme%dn%d", ctrl->subsys->instance,
@@ ... @@
 	}
 }
 
-bool nvme_failover_req(struct request *req)
+void nvme_failover_req(struct request *req)
 {
 	struct nvme_ns *ns = req->q->queuedata;
-	u16 status = nvme_req(req)->status;
+	u16 status = nvme_req(req)->status & 0x7ff;
 	unsigned long flags;
 
-	switch (status & 0x7ff) {
-	case NVME_SC_ANA_TRANSITION:
-	case NVME_SC_ANA_INACCESSIBLE:
-	case NVME_SC_ANA_PERSISTENT_LOSS:
-		/*
-		 * If we got back an ANA error we know the controller is alive,
-		 * but not ready to serve this namespaces. The spec suggests
-		 * we should update our general state here, but due to the fact
-		 * that the admin and I/O queues are not serialized that is
-		 * fundamentally racy. So instead just clear the current path,
-		 * mark the the path as pending and kick of a re-read of the ANA
-		 * log page ASAP.
-		 */
-		nvme_mpath_clear_current_path(ns);
-		if (ns->ctrl->ana_log_buf) {
-			set_bit(NVME_NS_ANA_PENDING, &ns->flags);
-			queue_work(nvme_wq, &ns->ctrl->ana_work);
-		}
-		break;
-	case NVME_SC_HOST_PATH_ERROR:
-		/*
-		 * Temporary transport disruption in talking to the controller.
-		 * Try to send on a new path.
-		 */
-		nvme_mpath_clear_current_path(ns);
-		break;
-	default:
-		/* This was a non-ANA error so follow the normal error path. */
-		return false;
+	nvme_mpath_clear_current_path(ns);
+
+	/*
+	 * If we got back an ANA error, we know the controller is alive but not
+	 * ready to serve this namespace. Kick off a re-read of the ANA
+	 * information page, and just try any other available path for now.
+	 */
+	if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) {
+		set_bit(NVME_NS_ANA_PENDING, &ns->flags);
+		queue_work(nvme_wq, &ns->ctrl->ana_work);
 	}
 
 	spin_lock_irqsave(&ns->head->requeue_lock, flags);
 	blk_steal_bios(&ns->head->requeue_list, req);
 	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
-	blk_mq_end_request(req, 0);
 
+	blk_mq_end_request(req, 0);
 	kblockd_schedule_work(&ns->head->requeue_work);
-	return true;
 }
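nvme_is_ana_error() itself is not part of this excerpt (it lives in the driver's nvme.h). A minimal sketch of what it plausibly checks, assuming it simply folds together the three ANA status codes the removed switch handled case by case:

static inline bool nvme_is_ana_error(u16 status)
{
	switch (status & 0x7ff) {
	case NVME_SC_ANA_TRANSITION:
	case NVME_SC_ANA_INACCESSIBLE:
	case NVME_SC_ANA_PERSISTENT_LOSS:
		return true;	/* controller alive, namespace unreachable on this path */
	default:
		return false;
	}
}

NVME_SC_HOST_PATH_ERROR no longer needs a case of its own: clearing the current path and requeueing now happen unconditionally, and the bool return is gone, which suggests the decision whether a failed request should fail over at all has moved to the caller.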
 
 void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
@@ ... @@
 	[NVME_ANA_CHANGE]		= "change",
 };
 
-static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head)
+bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
 {
-	struct nvme_ns *ns, *fallback = NULL;
+	struct nvme_ns_head *head = ns->head;
+	bool changed = false;
+	int node;
+
+	if (!head)
+		goto out;
+
+	for_each_node(node) {
+		if (ns == rcu_access_pointer(head->current_path[node])) {
+			rcu_assign_pointer(head->current_path[node], NULL);
+			changed = true;
+		}
+	}
+out:
+	return changed;
+}
+
+void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
+{
+	struct nvme_ns *ns;
+
+	down_read(&ctrl->namespaces_rwsem);
+	list_for_each_entry(ns, &ctrl->namespaces, list) {
+		nvme_mpath_clear_current_path(ns);
+		kblockd_schedule_work(&ns->head->requeue_work);
+	}
+	up_read(&ctrl->namespaces_rwsem);
+}
+
+static bool nvme_path_is_disabled(struct nvme_ns *ns)
+{
+	/*
+	 * We don't treat NVME_CTRL_DELETING as a disabled path as I/O should
+	 * still be able to complete assuming that the controller is connected.
+	 * Otherwise it will fail immediately and return to the requeue list.
+	 */
+	if (ns->ctrl->state != NVME_CTRL_LIVE &&
+	    ns->ctrl->state != NVME_CTRL_DELETING)
+		return true;
+	if (test_bit(NVME_NS_ANA_PENDING, &ns->flags) ||
+	    test_bit(NVME_NS_REMOVING, &ns->flags))
+		return true;
+	return false;
+}
+
+static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
+{
+	int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
+	struct nvme_ns *found = NULL, *fallback = NULL, *ns;
 
 	list_for_each_entry_rcu(ns, &head->list, siblings) {
-		if (ns->ctrl->state != NVME_CTRL_LIVE ||
-		    test_bit(NVME_NS_ANA_PENDING, &ns->flags))
+		if (nvme_path_is_disabled(ns))
 			continue;
+
+		if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
+			distance = node_distance(node, ns->ctrl->numa_node);
+		else
+			distance = LOCAL_DISTANCE;
+
 		switch (ns->ana_state) {
 		case NVME_ANA_OPTIMIZED:
-			rcu_assign_pointer(head->current_path, ns);
-			return ns;
+			if (distance < found_distance) {
+				found_distance = distance;
+				found = ns;
+			}
+			break;
 		case NVME_ANA_NONOPTIMIZED:
-			fallback = ns;
+			if (distance < fallback_distance) {
+				fallback_distance = distance;
+				fallback = ns;
+			}
 			break;
 		default:
 			break;
 		}
 	}
 
-	if (fallback)
-		rcu_assign_pointer(head->current_path, fallback);
-	return fallback;
+	if (!found)
+		found = fallback;
+	if (found)
+		rcu_assign_pointer(head->current_path[node], found);
+	return found;
+}
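For intuition about the numa policy above, a small userspace analog (illustrative only; the names, distance values, and optimized flags are made up, with smaller distances meaning closer, as with LOCAL_DISTANCE): it picks the nearest optimized path and falls back to the nearest non-optimized one only when no optimized path is usable.

#include <limits.h>
#include <stdio.h>

struct path { const char *name; int distance; int optimized; };

static const char *pick(const struct path *p, int n)
{
	int found_distance = INT_MAX, fallback_distance = INT_MAX;
	const char *found = NULL, *fallback = NULL;

	for (int i = 0; i < n; i++) {
		if (p[i].optimized && p[i].distance < found_distance) {
			found_distance = p[i].distance;
			found = p[i].name;
		} else if (!p[i].optimized && p[i].distance < fallback_distance) {
			fallback_distance = p[i].distance;
			fallback = p[i].name;
		}
	}
	return found ? found : fallback;	/* nearest optimized, else nearest usable */
}

int main(void)
{
	const struct path paths[] = {
		{ "pathA (remote, optimized)",    20, 1 },
		{ "pathB (local, optimized)",     10, 1 },
		{ "pathC (local, non-optimized)", 10, 0 },
	};

	printf("%s\n", pick(paths, 3));	/* prints the local optimized path, pathB */
	return 0;
}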
+
+static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head,
+		struct nvme_ns *ns)
+{
+	ns = list_next_or_null_rcu(&head->list, &ns->siblings, struct nvme_ns,
+			siblings);
+	if (ns)
+		return ns;
+	return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings);
+}
+
+static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
+		int node, struct nvme_ns *old)
+{
+	struct nvme_ns *ns, *found = NULL;
+
+	if (list_is_singular(&head->list)) {
+		if (nvme_path_is_disabled(old))
+			return NULL;
+		return old;
+	}
+
+	for (ns = nvme_next_ns(head, old);
+	     ns && ns != old;
+	     ns = nvme_next_ns(head, ns)) {
+		if (nvme_path_is_disabled(ns))
+			continue;
+
+		if (ns->ana_state == NVME_ANA_OPTIMIZED) {
+			found = ns;
+			goto out;
+		}
+		if (ns->ana_state == NVME_ANA_NONOPTIMIZED)
+			found = ns;
+	}
+
+	/*
+	 * The loop above skips the current path for round-robin semantics.
+	 * Fall back to the current path if either:
+	 *  - no other optimized path found and current is optimized,
+	 *  - no other usable path found and current is usable.
+	 */
+	if (!nvme_path_is_disabled(old) &&
+	    (old->ana_state == NVME_ANA_OPTIMIZED ||
+	     (!found && old->ana_state == NVME_ANA_NONOPTIMIZED)))
+		return old;
+
+	if (!found)
+		return NULL;
+out:
+	rcu_assign_pointer(head->current_path[node], found);
+	return found;
 }
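The same kind of toy model for the round-robin scan: the point of nvme_next_ns() is the wraparound, so starting just after the current path visits every sibling exactly once before the fall-back-to-old checks run. Again illustrative only; a plain array stands in for the RCU-protected siblings list.

#include <stdio.h>

static int next_idx(int i, int n)
{
	return (i + 1) % n;	/* wrap to the list head, like nvme_next_ns() */
}

int main(void)
{
	int n = 4, old = 2;

	/* starts after 'old', wraps around, stops before revisiting 'old' */
	for (int i = next_idx(old, n); i != old; i = next_idx(i, n))
		printf("try path %d\n", i);	/* prints 3, 0, 1 */
	return 0;
}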
 
 static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
@@ ... @@
 
 inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
 {
-	struct nvme_ns *ns = srcu_dereference(head->current_path, &head->srcu);
+	int node = numa_node_id();
+	struct nvme_ns *ns;
 
-	if (unlikely(!ns || !nvme_path_is_optimized(ns)))
-		ns = __nvme_find_path(head);
+	ns = srcu_dereference(head->current_path[node], &head->srcu);
+	if (unlikely(!ns))
+		return __nvme_find_path(head, node);
+
+	if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR)
+		return nvme_round_robin_path(head, node, ns);
+	if (unlikely(!nvme_path_is_optimized(ns)))
+		return __nvme_find_path(head, node);
 	return ns;
 }
 
-static blk_qc_t nvme_ns_head_make_request(struct request_queue *q,
-		struct bio *bio)
+static bool nvme_available_path(struct nvme_ns_head *head)
 {
-	struct nvme_ns_head *head = q->queuedata;
+	struct nvme_ns *ns;
+
+	list_for_each_entry_rcu(ns, &head->list, siblings) {
+		switch (ns->ctrl->state) {
+		case NVME_CTRL_LIVE:
+		case NVME_CTRL_RESETTING:
+		case NVME_CTRL_CONNECTING:
+			/* fallthru */
+			return true;
+		default:
+			break;
+		}
+	}
+	return false;
+}
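A terminology note on the two warnings in the hunk that follows: nvme_find_path() returning a path means the head is usable right now, while nvme_available_path() above only asks whether some controller is still LIVE, RESETTING, or CONNECTING, i.e. whether a path could plausibly come back. That is why a transient reconnect gets "no usable path - requeuing I/O" instead of an immediate failure, and only when every controller has left those three states is the bio failed with BLK_STS_IOERR.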
+
+blk_qc_t nvme_ns_head_submit_bio(struct bio *bio)
+{
+	struct nvme_ns_head *head = bio->bi_disk->private_data;
 	struct device *dev = disk_to_dev(head->disk);
 	struct nvme_ns *ns;
 	blk_qc_t ret = BLK_QC_T_NONE;
 	int srcu_idx;
+
+	/*
+	 * The namespace might be going away and the bio might be moved to a
+	 * different queue via blk_steal_bios(), so we need to use the bio_split
+	 * pool from the original queue to allocate the bvecs from.
+	 */
+	blk_queue_split(&bio);
 
 	srcu_idx = srcu_read_lock(&head->srcu);
 	ns = nvme_find_path(head);
@@ ... @@
 		trace_block_bio_remap(bio->bi_disk->queue, bio,
 				      disk_devt(ns->head->disk),
 				      bio->bi_iter.bi_sector);
-		ret = direct_make_request(bio);
-	} else if (!list_empty_careful(&head->list)) {
-		dev_warn_ratelimited(dev, "no path available - requeuing I/O\n");
+		ret = submit_bio_noacct(bio);
+	} else if (nvme_available_path(head)) {
+		dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");
 
 		spin_lock_irq(&head->requeue_lock);
 		bio_list_add(&head->requeue_list, bio);
 		spin_unlock_irq(&head->requeue_lock);
 	} else {
-		dev_warn_ratelimited(dev, "no path - failing I/O\n");
+		dev_warn_ratelimited(dev, "no available path - failing I/O\n");
 
 		bio->bi_status = BLK_STS_IOERR;
 		bio_endio(bio);
@@ ... @@
 
 	srcu_read_unlock(&head->srcu, srcu_idx);
 	return ret;
-}
-
-static bool nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc)
-{
-	struct nvme_ns_head *head = q->queuedata;
-	struct nvme_ns *ns;
-	bool found = false;
-	int srcu_idx;
-
-	srcu_idx = srcu_read_lock(&head->srcu);
-	ns = srcu_dereference(head->current_path, &head->srcu);
-	if (likely(ns && nvme_path_is_optimized(ns)))
-		found = ns->queue->poll_fn(q, qc);
-	srcu_read_unlock(&head->srcu, srcu_idx);
-	return found;
 }
 
 static void nvme_requeue_work(struct work_struct *work)
@@ ... @@
 		 * path.
 		 */
 		bio->bi_disk = head->disk;
-		generic_make_request(bio);
+		submit_bio_noacct(bio);
 	}
 }
 
@@ ... @@
 	 * We also do this for private namespaces as the namespace sharing data could
 	 * change after a rescan.
 	 */
-	if (!(ctrl->subsys->cmic & (1 << 1)) || !multipath)
+	if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) || !multipath)
 		return 0;
 
-	q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE, NULL);
+	q = blk_alloc_queue(ctrl->numa_node);
 	if (!q)
 		goto out;
-	q->queuedata = head;
-	blk_queue_make_request(q, nvme_ns_head_make_request);
-	q->poll_fn = nvme_ns_head_poll;
 	blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
 	/* set to a default value for 512 until disk is validated */
 	blk_queue_logical_block_size(q, 512);
@@ ... @@
 {
 	struct nvme_ns_head *head = ns->head;
 
-	lockdep_assert_held(&ns->head->lock);
-
 	if (!head->disk)
 		return;
 
-	if (!(head->disk->flags & GENHD_FL_UP)) {
-		device_add_disk(&head->subsys->dev, head->disk);
-		if (sysfs_create_group(&disk_to_dev(head->disk)->kobj,
-				&nvme_ns_id_attr_group))
-			dev_warn(&head->subsys->dev,
-				 "failed to create id group.\n");
-	}
+	if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags))
+		device_add_disk(&head->subsys->dev, head->disk,
+				nvme_ns_id_attr_groups);
 
-	synchronize_srcu(&ns->head->srcu);
-	kblockd_schedule_work(&ns->head->requeue_work);
+	mutex_lock(&head->lock);
+	if (nvme_path_is_optimized(ns)) {
+		int node, srcu_idx;
+
+		srcu_idx = srcu_read_lock(&head->srcu);
+		for_each_node(node)
+			__nvme_find_path(head, node);
+		srcu_read_unlock(&head->srcu, srcu_idx);
+	}
+	mutex_unlock(&head->lock);
+
+	synchronize_srcu(&head->srcu);
+	kblockd_schedule_work(&head->requeue_work);
 }
 
 static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data,
@@ ... @@
 
 	for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) {
 		struct nvme_ana_group_desc *desc = base + offset;
-		u32 nr_nsids = le32_to_cpu(desc->nnsids);
-		size_t nsid_buf_size = nr_nsids * sizeof(__le32);
+		u32 nr_nsids;
+		size_t nsid_buf_size;
+
+		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc)))
+			return -EINVAL;
+
+		nr_nsids = le32_to_cpu(desc->nnsids);
+		nsid_buf_size = nr_nsids * sizeof(__le32);
 
 		if (WARN_ON_ONCE(desc->grpid == 0))
 			return -EINVAL;
@@ ... @@
 			return error;
 
 		offset += nsid_buf_size;
-		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc)))
-			return -EINVAL;
 	}
 
 	return 0;
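The relocated bounds check in nvme_parse_ana_log() is more than cosmetic: the old code loaded desc->nnsids from the descriptor first and only validated offset at the bottom of the loop, so a truncated or malformed ANA log could be read past the end of ana_log_buf before the WARN_ON_ONCE ever fired. Validating offset before dereferencing the descriptor closes that window.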
@@ ... @@
 static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
 		struct nvme_ns *ns)
 {
-	mutex_lock(&ns->head->lock);
 	ns->ana_grpid = le32_to_cpu(desc->grpid);
 	ns->ana_state = desc->state;
 	clear_bit(NVME_NS_ANA_PENDING, &ns->flags);
-
-	if (nvme_state_is_live(ns->ana_state))
+	/*
+	 * nvme_mpath_set_live() will trigger I/O to the multipath path device
+	 * and in turn to this path device. However we cannot accept this I/O
+	 * if the controller is not live. This may deadlock if called from
+	 * nvme_mpath_init_identify() and the ctrl will never complete
+	 * initialization, preventing I/O from completing. For this case we
+	 * will reprocess the ANA log page in nvme_mpath_update() once the
+	 * controller is ready.
+	 */
+	if (nvme_state_is_live(ns->ana_state) &&
+	    ns->ctrl->state == NVME_CTRL_LIVE)
 		nvme_mpath_set_live(ns);
-	mutex_unlock(&ns->head->lock);
 }
 
 static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
@@ ... @@
 	unsigned *nr_change_groups = data;
 	struct nvme_ns *ns;
 
-	dev_info(ctrl->device, "ANA group %d: %s.\n",
+	dev_dbg(ctrl->device, "ANA group %d: %s.\n",
 			le32_to_cpu(desc->grpid),
 			nvme_ana_state_names[desc->state]);
 
@@ ... @@
 	return 0;
 }
 
-static int nvme_read_ana_log(struct nvme_ctrl *ctrl, bool groups_only)
+static int nvme_read_ana_log(struct nvme_ctrl *ctrl)
 {
 	u32 nr_change_groups = 0;
 	int error;
 
 	mutex_lock(&ctrl->ana_lock);
-	error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA,
-			groups_only ? NVME_ANA_LOG_RGO : 0,
+	error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0, NVME_CSI_NVM,
 			ctrl->ana_log_buf, ctrl->ana_log_size, 0);
 	if (error) {
 		dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error);
@@ ... @@
 {
 	struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work);
 
-	nvme_read_ana_log(ctrl, false);
+	if (ctrl->state != NVME_CTRL_LIVE)
+		return;
+
+	nvme_read_ana_log(ctrl);
+}
+
+void nvme_mpath_update(struct nvme_ctrl *ctrl)
+{
+	u32 nr_change_groups = 0;
+
+	if (!ctrl->ana_log_buf)
+		return;
+
+	mutex_lock(&ctrl->ana_lock);
+	nvme_parse_ana_log(ctrl, &nr_change_groups, nvme_update_ana_state);
+	mutex_unlock(&ctrl->ana_lock);
 }
 
 static void nvme_anatt_timeout(struct timer_list *t)
@@ ... @@
 	cancel_work_sync(&ctrl->ana_work);
 }
 
+#define SUBSYS_ATTR_RW(_name, _mode, _show, _store) \
+	struct device_attribute subsys_attr_##_name =	\
+		__ATTR(_name, _mode, _show, _store)
+
+static const char *nvme_iopolicy_names[] = {
+	[NVME_IOPOLICY_NUMA]	= "numa",
+	[NVME_IOPOLICY_RR]	= "round-robin",
+};
+
+static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nvme_subsystem *subsys =
+		container_of(dev, struct nvme_subsystem, dev);
+
+	return sysfs_emit(buf, "%s\n",
+			  nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);
+}
+
+static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct nvme_subsystem *subsys =
+		container_of(dev, struct nvme_subsystem, dev);
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
+		if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
+			WRITE_ONCE(subsys->iopolicy, i);
+			return count;
+		}
+	}
+
+	return -EINVAL;
+}
+SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR,
+	       nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store);
+
 static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr,
 		char *buf)
 {
-	return sprintf(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid);
+	return sysfs_emit(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid);
 }
 DEVICE_ATTR_RO(ana_grpid);
 
@@ ... @@
 {
 	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
 
-	return sprintf(buf, "%s\n", nvme_ana_state_names[ns->ana_state]);
+	return sysfs_emit(buf, "%s\n", nvme_ana_state_names[ns->ana_state]);
 }
 DEVICE_ATTR_RO(ana_state);
 
@@ ... @@
 			queue_work(nvme_wq, &ns->ctrl->ana_work);
 		}
 	} else {
-		mutex_lock(&ns->head->lock);
 		ns->ana_state = NVME_ANA_OPTIMIZED;
 		nvme_mpath_set_live(ns);
-		mutex_unlock(&ns->head->lock);
 	}
 
-	if (bdi_cap_stable_pages_required(ns->queue->backing_dev_info)) {
-		struct gendisk *disk = ns->head->disk;
-
-		if (disk)
-			disk->queue->backing_dev_info->capabilities |=
-					BDI_CAP_STABLE_WRITES;
-	}
+	if (blk_queue_stable_writes(ns->queue) && ns->head->disk)
+		blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES,
+				   ns->head->disk->queue);
+#ifdef CONFIG_BLK_DEV_ZONED
+	if (blk_queue_is_zoned(ns->queue) && ns->head->disk)
+		ns->head->disk->queue->nr_zones = ns->queue->nr_zones;
+#endif
 }
 
 void nvme_mpath_remove_disk(struct nvme_ns_head *head)
 {
 	if (!head->disk)
 		return;
-	if (head->disk->flags & GENHD_FL_UP) {
-		sysfs_remove_group(&disk_to_dev(head->disk)->kobj,
-				   &nvme_ns_id_attr_group);
+	if (head->disk->flags & GENHD_FL_UP)
 		del_gendisk(head->disk);
-	}
 	blk_set_queue_dying(head->disk->queue);
 	/* make sure all pending bios are cleaned up */
 	kblockd_schedule_work(&head->requeue_work);
 	flush_work(&head->requeue_work);
 	blk_cleanup_queue(head->disk->queue);
+	if (!test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
+		/*
+		 * if device_add_disk wasn't called, prevent
+		 * disk release to put a bogus reference on the
+		 * request queue
+		 */
+		head->disk->queue = NULL;
+	}
 	put_disk(head->disk);
 }
 
-int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
+void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl)
 {
-	int error;
+	mutex_init(&ctrl->ana_lock);
+	timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0);
+	INIT_WORK(&ctrl->ana_work, nvme_ana_work);
+}
+
+int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
+{
+	size_t max_transfer_size = ctrl->max_hw_sectors << SECTOR_SHIFT;
+	size_t ana_log_size;
+	int error = 0;
 
 	/* check if multipath is enabled and we have the capability */
-	if (!multipath || !ctrl->subsys || !(ctrl->subsys->cmic & (1 << 3)))
+	if (!multipath || !ctrl->subsys ||
+	    !(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA))
 		return 0;
 
 	ctrl->anacap = id->anacap;
@@ ... @@
 	ctrl->nanagrpid = le32_to_cpu(id->nanagrpid);
 	ctrl->anagrpmax = le32_to_cpu(id->anagrpmax);
 
-	mutex_init(&ctrl->ana_lock);
-	timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0);
-	ctrl->ana_log_size = sizeof(struct nvme_ana_rsp_hdr) +
-		ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc);
-	ctrl->ana_log_size += ctrl->max_namespaces * sizeof(__le32);
-
-	if (ctrl->ana_log_size > ctrl->max_hw_sectors << SECTOR_SHIFT) {
+	ana_log_size = sizeof(struct nvme_ana_rsp_hdr) +
+		ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc) +
+		ctrl->max_namespaces * sizeof(__le32);
+	if (ana_log_size > max_transfer_size) {
 		dev_err(ctrl->device,
-			"ANA log page size (%zd) larger than MDTS (%d).\n",
-			ctrl->ana_log_size,
-			ctrl->max_hw_sectors << SECTOR_SHIFT);
+			"ANA log page size (%zd) larger than MDTS (%zd).\n",
+			ana_log_size, max_transfer_size);
 		dev_err(ctrl->device, "disabling ANA support.\n");
-		return 0;
+		goto out_uninit;
 	}
-
-	INIT_WORK(&ctrl->ana_work, nvme_ana_work);
-	kfree(ctrl->ana_log_buf);
-	ctrl->ana_log_buf = kmalloc(ctrl->ana_log_size, GFP_KERNEL);
-	if (!ctrl->ana_log_buf) {
-		error = -ENOMEM;
-		goto out;
+	if (ana_log_size > ctrl->ana_log_size) {
+		nvme_mpath_stop(ctrl);
+		kfree(ctrl->ana_log_buf);
+		ctrl->ana_log_buf = kmalloc(ana_log_size, GFP_KERNEL);
+		if (!ctrl->ana_log_buf)
+			return -ENOMEM;
 	}
-
-	error = nvme_read_ana_log(ctrl, false);
+	ctrl->ana_log_size = ana_log_size;
+	error = nvme_read_ana_log(ctrl);
 	if (error)
-		goto out_free_ana_log_buf;
+		goto out_uninit;
 	return 0;
-out_free_ana_log_buf:
-	kfree(ctrl->ana_log_buf);
-	ctrl->ana_log_buf = NULL;
-out:
+
+out_uninit:
+	nvme_mpath_uninit(ctrl);
 	return error;
 }
 
@@ ... @@