@@ -1,14 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * Copyright (c) 2017-2018 Christoph Hellwig.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
  */
 
 #include <linux/backing-dev.h>
---|
@@ -65,7 +57,7 @@
 		sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance);
 	} else if (ns->head->disk) {
 		sprintf(disk_name, "nvme%dc%dn%d", ctrl->subsys->instance,
-			ctrl->cntlid, ns->head->instance);
+			ctrl->instance, ns->head->instance);
 		*flags = GENHD_FL_HIDDEN;
 	} else {
 		sprintf(disk_name, "nvme%dn%d", ctrl->subsys->instance,
---|
@@ -73,50 +65,30 @@
 	}
 }
 
-bool nvme_failover_req(struct request *req)
+void nvme_failover_req(struct request *req)
 {
 	struct nvme_ns *ns = req->q->queuedata;
-	u16 status = nvme_req(req)->status;
+	u16 status = nvme_req(req)->status & 0x7ff;
 	unsigned long flags;
 
-	switch (status & 0x7ff) {
-	case NVME_SC_ANA_TRANSITION:
-	case NVME_SC_ANA_INACCESSIBLE:
-	case NVME_SC_ANA_PERSISTENT_LOSS:
-		/*
-		 * If we got back an ANA error we know the controller is alive,
-		 * but not ready to serve this namespaces. The spec suggests
-		 * we should update our general state here, but due to the fact
-		 * that the admin and I/O queues are not serialized that is
-		 * fundamentally racy. So instead just clear the current path,
-		 * mark the the path as pending and kick of a re-read of the ANA
-		 * log page ASAP.
-		 */
-		nvme_mpath_clear_current_path(ns);
-		if (ns->ctrl->ana_log_buf) {
-			set_bit(NVME_NS_ANA_PENDING, &ns->flags);
-			queue_work(nvme_wq, &ns->ctrl->ana_work);
-		}
-		break;
-	case NVME_SC_HOST_PATH_ERROR:
-		/*
-		 * Temporary transport disruption in talking to the controller.
-		 * Try to send on a new path.
-		 */
-		nvme_mpath_clear_current_path(ns);
-		break;
-	default:
-		/* This was a non-ANA error so follow the normal error path. */
-		return false;
+	nvme_mpath_clear_current_path(ns);
+
+	/*
+	 * If we got back an ANA error, we know the controller is alive but not
+	 * ready to serve this namespace. Kick off a re-read of the ANA
+	 * information page, and just try any other available path for now.
+	 */
+	if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) {
+		set_bit(NVME_NS_ANA_PENDING, &ns->flags);
+		queue_work(nvme_wq, &ns->ctrl->ana_work);
 	}
 
 	spin_lock_irqsave(&ns->head->requeue_lock, flags);
 	blk_steal_bios(&ns->head->requeue_list, req);
 	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
-	blk_mq_end_request(req, 0);
 
+	blk_mq_end_request(req, 0);
 	kblockd_schedule_work(&ns->head->requeue_work);
-	return true;
 }
 
 void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
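
The rewritten nvme_failover_req() relies on an nvme_is_ana_error() helper that is not part of this hunk. A minimal sketch, assuming the helper simply mirrors the three status codes the deleted switch handled (in the real tree it lives in the driver's nvme.h header):

/* Sketch: assumes the helper matches the deleted switch statement. */
static inline bool nvme_is_ana_error(u16 status)
{
	switch (status & 0x7ff) {
	case NVME_SC_ANA_TRANSITION:
	case NVME_SC_ANA_INACCESSIBLE:
	case NVME_SC_ANA_PERSISTENT_LOSS:
		return true;
	default:
		return false;
	}
}

Note that NVME_SC_HOST_PATH_ERROR no longer gets special treatment: every failed-over request now clears the current path, and only ANA errors additionally trigger a re-read of the log page.
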
---|
@@ -140,29 +112,143 @@
 	[NVME_ANA_CHANGE]	= "change",
 };
 
-static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head)
+bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
 {
-	struct nvme_ns *ns, *fallback = NULL;
+	struct nvme_ns_head *head = ns->head;
+	bool changed = false;
+	int node;
+
+	if (!head)
+		goto out;
+
+	for_each_node(node) {
+		if (ns == rcu_access_pointer(head->current_path[node])) {
+			rcu_assign_pointer(head->current_path[node], NULL);
+			changed = true;
+		}
+	}
+out:
+	return changed;
+}
+
+void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
+{
+	struct nvme_ns *ns;
+
+	down_read(&ctrl->namespaces_rwsem);
+	list_for_each_entry(ns, &ctrl->namespaces, list) {
+		nvme_mpath_clear_current_path(ns);
+		kblockd_schedule_work(&ns->head->requeue_work);
+	}
+	up_read(&ctrl->namespaces_rwsem);
+}
+
+static bool nvme_path_is_disabled(struct nvme_ns *ns)
+{
+	/*
+	 * We don't treat NVME_CTRL_DELETING as a disabled path as I/O should
+	 * still be able to complete assuming that the controller is connected.
+	 * Otherwise it will fail immediately and return to the requeue list.
+	 */
+	if (ns->ctrl->state != NVME_CTRL_LIVE &&
+	    ns->ctrl->state != NVME_CTRL_DELETING)
+		return true;
+	if (test_bit(NVME_NS_ANA_PENDING, &ns->flags) ||
+	    test_bit(NVME_NS_REMOVING, &ns->flags))
+		return true;
+	return false;
+}
+
+static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
+{
+	int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
+	struct nvme_ns *found = NULL, *fallback = NULL, *ns;
 
 	list_for_each_entry_rcu(ns, &head->list, siblings) {
-		if (ns->ctrl->state != NVME_CTRL_LIVE ||
-		    test_bit(NVME_NS_ANA_PENDING, &ns->flags))
+		if (nvme_path_is_disabled(ns))
 			continue;
+
+		if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
+			distance = node_distance(node, ns->ctrl->numa_node);
+		else
+			distance = LOCAL_DISTANCE;
+
 		switch (ns->ana_state) {
 		case NVME_ANA_OPTIMIZED:
-			rcu_assign_pointer(head->current_path, ns);
-			return ns;
+			if (distance < found_distance) {
+				found_distance = distance;
+				found = ns;
+			}
+			break;
 		case NVME_ANA_NONOPTIMIZED:
-			fallback = ns;
+			if (distance < fallback_distance) {
+				fallback_distance = distance;
+				fallback = ns;
+			}
 			break;
 		default:
 			break;
 		}
 	}
 
-	if (fallback)
-		rcu_assign_pointer(head->current_path, fallback);
-	return fallback;
+	if (!found)
+		found = fallback;
+	if (found)
+		rcu_assign_pointer(head->current_path[node], found);
+	return found;
+}
+
+static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head,
+		struct nvme_ns *ns)
+{
+	ns = list_next_or_null_rcu(&head->list, &ns->siblings, struct nvme_ns,
+			siblings);
+	if (ns)
+		return ns;
+	return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings);
+}
+
+static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
+		int node, struct nvme_ns *old)
+{
+	struct nvme_ns *ns, *found = NULL;
+
+	if (list_is_singular(&head->list)) {
+		if (nvme_path_is_disabled(old))
+			return NULL;
+		return old;
+	}
+
+	for (ns = nvme_next_ns(head, old);
+	     ns && ns != old;
+	     ns = nvme_next_ns(head, ns)) {
+		if (nvme_path_is_disabled(ns))
+			continue;
+
+		if (ns->ana_state == NVME_ANA_OPTIMIZED) {
+			found = ns;
+			goto out;
+		}
+		if (ns->ana_state == NVME_ANA_NONOPTIMIZED)
+			found = ns;
+	}
+
+	/*
+	 * The loop above skips the current path for round-robin semantics.
+	 * Fall back to the current path if either:
+	 *  - no other optimized path found and current is optimized,
+	 *  - no other usable path found and current is usable.
+	 */
+	if (!nvme_path_is_disabled(old) &&
+	    (old->ana_state == NVME_ANA_OPTIMIZED ||
+	     (!found && old->ana_state == NVME_ANA_NONOPTIMIZED)))
+		return old;
+
+	if (!found)
+		return NULL;
+out:
+	rcu_assign_pointer(head->current_path[node], found);
+	return found;
 }
 
 static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
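
The path-selection rework above leans on declarations that live in the driver's nvme.h rather than in this file: an iopolicy field on the subsystem, the two NVME_IOPOLICY_* constants, and a current_path cache that grows from a single pointer into a per-NUMA-node array. A sketch of the implied shapes, with the surrounding fields elided and the exact layout an assumption:

/* Sketch of the companion nvme.h changes implied by the code above. */
enum nvme_iopolicy {
	NVME_IOPOLICY_NUMA,
	NVME_IOPOLICY_RR,
};

struct nvme_subsystem {
	/* ... existing fields elided ... */
	int			iopolicy;	/* accessed with READ_ONCE()/WRITE_ONCE() */
};

struct nvme_ns_head {
	/* ... existing fields elided ... */
	struct srcu_struct	srcu;
	struct nvme_ns __rcu	*current_path[];	/* one cached path per NUMA node */
};

Under the "numa" policy, node_distance() biases selection toward the path whose controller sits closest to the submitting CPU's node; under "round-robin" every candidate reports LOCAL_DISTANCE, so the distance test degenerates and nvme_round_robin_path() rotates through the siblings list instead.
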
---|
@@ -173,21 +259,52 @@
 
 inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
 {
-	struct nvme_ns *ns = srcu_dereference(head->current_path, &head->srcu);
+	int node = numa_node_id();
+	struct nvme_ns *ns;
 
-	if (unlikely(!ns || !nvme_path_is_optimized(ns)))
-		ns = __nvme_find_path(head);
+	ns = srcu_dereference(head->current_path[node], &head->srcu);
+	if (unlikely(!ns))
+		return __nvme_find_path(head, node);
+
+	if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR)
+		return nvme_round_robin_path(head, node, ns);
+	if (unlikely(!nvme_path_is_optimized(ns)))
+		return __nvme_find_path(head, node);
 	return ns;
 }
 
-static blk_qc_t nvme_ns_head_make_request(struct request_queue *q,
-		struct bio *bio)
+static bool nvme_available_path(struct nvme_ns_head *head)
 {
-	struct nvme_ns_head *head = q->queuedata;
+	struct nvme_ns *ns;
+
+	list_for_each_entry_rcu(ns, &head->list, siblings) {
+		switch (ns->ctrl->state) {
+		case NVME_CTRL_LIVE:
+		case NVME_CTRL_RESETTING:
+		case NVME_CTRL_CONNECTING:
+			/* fallthru */
+			return true;
+		default:
+			break;
+		}
+	}
+	return false;
+}
+
+blk_qc_t nvme_ns_head_submit_bio(struct bio *bio)
+{
+	struct nvme_ns_head *head = bio->bi_disk->private_data;
 	struct device *dev = disk_to_dev(head->disk);
 	struct nvme_ns *ns;
 	blk_qc_t ret = BLK_QC_T_NONE;
 	int srcu_idx;
+
+	/*
+	 * The namespace might be going away and the bio might be moved to a
+	 * different queue via blk_steal_bios(), so we need to use the bio_split
+	 * pool from the original queue to allocate the bvecs from.
+	 */
+	blk_queue_split(&bio);
 
 	srcu_idx = srcu_read_lock(&head->srcu);
 	ns = nvme_find_path(head);
---|
@@ -197,15 +314,15 @@
 		trace_block_bio_remap(bio->bi_disk->queue, bio,
 				      disk_devt(ns->head->disk),
 				      bio->bi_iter.bi_sector);
-		ret = direct_make_request(bio);
-	} else if (!list_empty_careful(&head->list)) {
-		dev_warn_ratelimited(dev, "no path available - requeuing I/O\n");
+		ret = submit_bio_noacct(bio);
+	} else if (nvme_available_path(head)) {
+		dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");
 
 		spin_lock_irq(&head->requeue_lock);
 		bio_list_add(&head->requeue_list, bio);
 		spin_unlock_irq(&head->requeue_lock);
 	} else {
-		dev_warn_ratelimited(dev, "no path - failing I/O\n");
+		dev_warn_ratelimited(dev, "no available path - failing I/O\n");
 
 		bio->bi_status = BLK_STS_IOERR;
 		bio_endio(bio);
---|
@@ -213,21 +330,6 @@
 
 	srcu_read_unlock(&head->srcu, srcu_idx);
 	return ret;
-}
-
-static bool nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc)
-{
-	struct nvme_ns_head *head = q->queuedata;
-	struct nvme_ns *ns;
-	bool found = false;
-	int srcu_idx;
-
-	srcu_idx = srcu_read_lock(&head->srcu);
-	ns = srcu_dereference(head->current_path, &head->srcu);
-	if (likely(ns && nvme_path_is_optimized(ns)))
-		found = ns->queue->poll_fn(q, qc);
-	srcu_read_unlock(&head->srcu, srcu_idx);
-	return found;
 }
 
 static void nvme_requeue_work(struct work_struct *work)
---|
@@ -249,7 +351,7 @@
 		 * path.
 		 */
 		bio->bi_disk = head->disk;
-		generic_make_request(bio);
+		submit_bio_noacct(bio);
 	}
 }
 
---|
@@ -268,15 +370,12 @@
 	 * We also do this for private namespaces as the namespace sharing data could
 	 * change after a rescan.
 	 */
-	if (!(ctrl->subsys->cmic & (1 << 1)) || !multipath)
+	if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) || !multipath)
 		return 0;
 
-	q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE, NULL);
+	q = blk_alloc_queue(ctrl->numa_node);
 	if (!q)
 		goto out;
-	q->queuedata = head;
-	blk_queue_make_request(q, nvme_ns_head_make_request);
-	q->poll_fn = nvme_ns_head_poll;
 	blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
 	/* set to a default value for 512 until disk is validated */
 	blk_queue_logical_block_size(q, 512);
---|
@@ -308,21 +407,26 @@
 {
 	struct nvme_ns_head *head = ns->head;
 
-	lockdep_assert_held(&ns->head->lock);
-
 	if (!head->disk)
 		return;
 
-	if (!(head->disk->flags & GENHD_FL_UP)) {
-		device_add_disk(&head->subsys->dev, head->disk);
-		if (sysfs_create_group(&disk_to_dev(head->disk)->kobj,
-				&nvme_ns_id_attr_group))
-			dev_warn(&head->subsys->dev,
-				 "failed to create id group.\n");
-	}
+	if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags))
+		device_add_disk(&head->subsys->dev, head->disk,
+				nvme_ns_id_attr_groups);
 
-	synchronize_srcu(&ns->head->srcu);
-	kblockd_schedule_work(&ns->head->requeue_work);
+	mutex_lock(&head->lock);
+	if (nvme_path_is_optimized(ns)) {
+		int node, srcu_idx;
+
+		srcu_idx = srcu_read_lock(&head->srcu);
+		for_each_node(node)
+			__nvme_find_path(head, node);
+		srcu_read_unlock(&head->srcu, srcu_idx);
+	}
+	mutex_unlock(&head->lock);
+
+	synchronize_srcu(&head->srcu);
+	kblockd_schedule_work(&head->requeue_work);
 }
 
 static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data,
---|
@@ -337,8 +441,14 @@
 
 	for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) {
 		struct nvme_ana_group_desc *desc = base + offset;
-		u32 nr_nsids = le32_to_cpu(desc->nnsids);
-		size_t nsid_buf_size = nr_nsids * sizeof(__le32);
+		u32 nr_nsids;
+		size_t nsid_buf_size;
+
+		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc)))
+			return -EINVAL;
+
+		nr_nsids = le32_to_cpu(desc->nnsids);
+		nsid_buf_size = nr_nsids * sizeof(__le32);
 
 		if (WARN_ON_ONCE(desc->grpid == 0))
 			return -EINVAL;
---|
@@ -358,8 +468,6 @@
 			return error;
 
 		offset += nsid_buf_size;
-		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc)))
-			return -EINVAL;
 	}
 
 	return 0;
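
Taken together, the two hunks above move the bounds check so that a group descriptor header is validated against the remaining log buffer before its nnsids field is dereferenced, instead of after the NSID list has already been consumed. Restated as a hypothetical standalone helper (names are illustrative, not from the tree):

/*
 * Hypothetical restatement of the moved check: a descriptor header must
 * fit in the remaining buffer before it is read.  Mirrors the in-tree
 * test and assumes ana_log_size >= sizeof(struct nvme_ana_group_desc).
 */
static bool nvme_ana_desc_fits(size_t offset, size_t ana_log_size)
{
	return offset <= ana_log_size - sizeof(struct nvme_ana_group_desc);
}

With the old placement, a truncated or corrupt log page could be over-read once before the check fired; the new placement rejects it on entry to each loop iteration.
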
---|
@@ -373,14 +481,21 @@
 static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
 		struct nvme_ns *ns)
 {
-	mutex_lock(&ns->head->lock);
 	ns->ana_grpid = le32_to_cpu(desc->grpid);
 	ns->ana_state = desc->state;
 	clear_bit(NVME_NS_ANA_PENDING, &ns->flags);
-
-	if (nvme_state_is_live(ns->ana_state))
+	/*
+	 * nvme_mpath_set_live() will trigger I/O to the multipath path device
+	 * and in turn to this path device.  However we cannot accept this I/O
+	 * if the controller is not live.  This may deadlock if called from
+	 * nvme_mpath_init_identify() and the ctrl will never complete
+	 * initialization, preventing I/O from completing.  For this case we
+	 * will reprocess the ANA log page in nvme_mpath_update() once the
+	 * controller is ready.
+	 */
+	if (nvme_state_is_live(ns->ana_state) &&
+	    ns->ctrl->state == NVME_CTRL_LIVE)
 		nvme_mpath_set_live(ns);
-	mutex_unlock(&ns->head->lock);
 }
 
 static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
---|
@@ -390,7 +505,7 @@
 	unsigned *nr_change_groups = data;
 	struct nvme_ns *ns;
 
-	dev_info(ctrl->device, "ANA group %d: %s.\n",
+	dev_dbg(ctrl->device, "ANA group %d: %s.\n",
 		le32_to_cpu(desc->grpid),
 		nvme_ana_state_names[desc->state]);
 
---|
418 | 533 | return 0; |
---|
419 | 534 | } |
---|
420 | 535 | |
---|
421 | | -static int nvme_read_ana_log(struct nvme_ctrl *ctrl, bool groups_only) |
---|
| 536 | +static int nvme_read_ana_log(struct nvme_ctrl *ctrl) |
---|
422 | 537 | { |
---|
423 | 538 | u32 nr_change_groups = 0; |
---|
424 | 539 | int error; |
---|
425 | 540 | |
---|
426 | 541 | mutex_lock(&ctrl->ana_lock); |
---|
427 | | - error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, |
---|
428 | | - groups_only ? NVME_ANA_LOG_RGO : 0, |
---|
| 542 | + error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0, NVME_CSI_NVM, |
---|
429 | 543 | ctrl->ana_log_buf, ctrl->ana_log_size, 0); |
---|
430 | 544 | if (error) { |
---|
431 | 545 | dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error); |
---|
@@ -461,7 +575,22 @@
 {
 	struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work);
 
-	nvme_read_ana_log(ctrl, false);
+	if (ctrl->state != NVME_CTRL_LIVE)
+		return;
+
+	nvme_read_ana_log(ctrl);
+}
+
+void nvme_mpath_update(struct nvme_ctrl *ctrl)
+{
+	u32 nr_change_groups = 0;
+
+	if (!ctrl->ana_log_buf)
+		return;
+
+	mutex_lock(&ctrl->ana_lock);
+	nvme_parse_ana_log(ctrl, &nr_change_groups, nvme_update_ana_state);
+	mutex_unlock(&ctrl->ana_lock);
 }
 
 static void nvme_anatt_timeout(struct timer_list *t)
---|
@@ -480,10 +609,48 @@
 	cancel_work_sync(&ctrl->ana_work);
 }
 
+#define SUBSYS_ATTR_RW(_name, _mode, _show, _store)		\
+	struct device_attribute subsys_attr_##_name =		\
+		__ATTR(_name, _mode, _show, _store)
+
+static const char *nvme_iopolicy_names[] = {
+	[NVME_IOPOLICY_NUMA]	= "numa",
+	[NVME_IOPOLICY_RR]	= "round-robin",
+};
+
+static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nvme_subsystem *subsys =
+		container_of(dev, struct nvme_subsystem, dev);
+
+	return sysfs_emit(buf, "%s\n",
+			  nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);
+}
+
+static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct nvme_subsystem *subsys =
+		container_of(dev, struct nvme_subsystem, dev);
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
+		if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
+			WRITE_ONCE(subsys->iopolicy, i);
+			return count;
+		}
+	}
+
+	return -EINVAL;
+}
+SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR,
+	       nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store);
+
 static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr,
 		char *buf)
 {
-	return sprintf(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid);
+	return sysfs_emit(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid);
 }
 DEVICE_ATTR_RO(ana_grpid);
 
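
The iopolicy attribute added above is a per-subsystem sysfs file. A hypothetical userspace example; the path follows the usual nvme-subsystem class layout and the subsystem instance number is an assumption:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Assumed path; substitute the actual subsystem instance. */
	const char *attr = "/sys/class/nvme-subsystem/nvme-subsys0/iopolicy";
	int fd = open(attr, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Accepts "numa" or "round-robin", per nvme_iopolicy_names[]. */
	if (write(fd, "round-robin", strlen("round-robin")) < 0) {
		perror("write");
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}

Because the store routine compares with sysfs_streq(), a trailing newline from a shell echo is accepted as well.
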
---|
@@ -492,7 +659,7 @@
 {
 	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
 
-	return sprintf(buf, "%s\n", nvme_ana_state_names[ns->ana_state]);
+	return sysfs_emit(buf, "%s\n", nvme_ana_state_names[ns->ana_state]);
 }
 DEVICE_ATTR_RO(ana_state);
 
---|
@@ -529,44 +696,57 @@
 			queue_work(nvme_wq, &ns->ctrl->ana_work);
 		}
 	} else {
-		mutex_lock(&ns->head->lock);
 		ns->ana_state = NVME_ANA_OPTIMIZED;
 		nvme_mpath_set_live(ns);
-		mutex_unlock(&ns->head->lock);
 	}
 
-	if (bdi_cap_stable_pages_required(ns->queue->backing_dev_info)) {
-		struct gendisk *disk = ns->head->disk;
-
-		if (disk)
-			disk->queue->backing_dev_info->capabilities |=
-					BDI_CAP_STABLE_WRITES;
-	}
+	if (blk_queue_stable_writes(ns->queue) && ns->head->disk)
+		blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES,
+				   ns->head->disk->queue);
+#ifdef CONFIG_BLK_DEV_ZONED
+	if (blk_queue_is_zoned(ns->queue) && ns->head->disk)
+		ns->head->disk->queue->nr_zones = ns->queue->nr_zones;
+#endif
 }
 
 void nvme_mpath_remove_disk(struct nvme_ns_head *head)
 {
 	if (!head->disk)
 		return;
-	if (head->disk->flags & GENHD_FL_UP) {
-		sysfs_remove_group(&disk_to_dev(head->disk)->kobj,
-				   &nvme_ns_id_attr_group);
+	if (head->disk->flags & GENHD_FL_UP)
 		del_gendisk(head->disk);
-	}
 	blk_set_queue_dying(head->disk->queue);
 	/* make sure all pending bios are cleaned up */
 	kblockd_schedule_work(&head->requeue_work);
 	flush_work(&head->requeue_work);
 	blk_cleanup_queue(head->disk->queue);
+	if (!test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
+		/*
+		 * if device_add_disk wasn't called, prevent
+		 * disk release to put a bogus reference on the
+		 * request queue
+		 */
+		head->disk->queue = NULL;
+	}
 	put_disk(head->disk);
 }
 
-int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
+void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl)
 {
-	int error;
+	mutex_init(&ctrl->ana_lock);
+	timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0);
+	INIT_WORK(&ctrl->ana_work, nvme_ana_work);
+}
+
+int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
+{
+	size_t max_transfer_size = ctrl->max_hw_sectors << SECTOR_SHIFT;
+	size_t ana_log_size;
+	int error = 0;
 
 	/* check if multipath is enabled and we have the capability */
-	if (!multipath || !ctrl->subsys || !(ctrl->subsys->cmic & (1 << 3)))
+	if (!multipath || !ctrl->subsys ||
+	    !(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA))
 		return 0;
 
 	ctrl->anacap = id->anacap;
---|
@@ -574,37 +754,31 @@
 	ctrl->nanagrpid = le32_to_cpu(id->nanagrpid);
 	ctrl->anagrpmax = le32_to_cpu(id->anagrpmax);
 
-	mutex_init(&ctrl->ana_lock);
-	timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0);
-	ctrl->ana_log_size = sizeof(struct nvme_ana_rsp_hdr) +
-		ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc);
-	ctrl->ana_log_size += ctrl->max_namespaces * sizeof(__le32);
-
-	if (ctrl->ana_log_size > ctrl->max_hw_sectors << SECTOR_SHIFT) {
+	ana_log_size = sizeof(struct nvme_ana_rsp_hdr) +
+		ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc) +
+		ctrl->max_namespaces * sizeof(__le32);
+	if (ana_log_size > max_transfer_size) {
 		dev_err(ctrl->device,
-			"ANA log page size (%zd) larger than MDTS (%d).\n",
-			ctrl->ana_log_size,
-			ctrl->max_hw_sectors << SECTOR_SHIFT);
+			"ANA log page size (%zd) larger than MDTS (%zd).\n",
+			ana_log_size, max_transfer_size);
 		dev_err(ctrl->device, "disabling ANA support.\n");
-		return 0;
+		goto out_uninit;
 	}
-
-	INIT_WORK(&ctrl->ana_work, nvme_ana_work);
-	kfree(ctrl->ana_log_buf);
-	ctrl->ana_log_buf = kmalloc(ctrl->ana_log_size, GFP_KERNEL);
-	if (!ctrl->ana_log_buf) {
-		error = -ENOMEM;
-		goto out;
+	if (ana_log_size > ctrl->ana_log_size) {
+		nvme_mpath_stop(ctrl);
+		kfree(ctrl->ana_log_buf);
+		ctrl->ana_log_buf = kmalloc(ana_log_size, GFP_KERNEL);
+		if (!ctrl->ana_log_buf)
+			return -ENOMEM;
 	}
-
-	error = nvme_read_ana_log(ctrl, false);
+	ctrl->ana_log_size = ana_log_size;
+	error = nvme_read_ana_log(ctrl);
 	if (error)
-		goto out_free_ana_log_buf;
+		goto out_uninit;
 	return 0;
-out_free_ana_log_buf:
-	kfree(ctrl->ana_log_buf);
-	ctrl->ana_log_buf = NULL;
-out:
+
+out_uninit:
+	nvme_mpath_uninit(ctrl);
 	return error;
 }
 
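
The old nvme_mpath_init() mixed one-time setup (mutex, timer, work item) with per-Identify sizing and allocation; the split above separates the two so a controller reset can safely re-run the identify half. A hedged sketch of the implied call order; the actual call sites live in core.c and are not part of this diff:

/* Sketch, assuming core.c wires the two halves up in this order. */
static int example_mpath_bringup(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
{
	nvme_mpath_init_ctrl(ctrl);		/* once, when the ctrl is created */
	return nvme_mpath_init_identify(ctrl, id);	/* after every Identify Controller */
}

Note that the identify half now keeps an existing ana_log_buf when the required size has not grown, and calls nvme_mpath_stop() before freeing it otherwise, so ana_work is never left reading a buffer that was reallocated under it during a reset.
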
---|