| .. | .. |
|---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-or-later |
|---|
| 1 | 2 | /* |
|---|
| 2 | 3 | * PCI Error Recovery Driver for RPA-compliant PPC64 platform. |
|---|
| 3 | 4 | * Copyright IBM Corp. 2004 2005 |
|---|
| 4 | 5 | * Copyright Linas Vepstas <linas@linas.org> 2004, 2005 |
|---|
| 5 | | - * |
|---|
| 6 | | - * All rights reserved. |
|---|
| 7 | | - * |
|---|
| 8 | | - * This program is free software; you can redistribute it and/or modify |
|---|
| 9 | | - * it under the terms of the GNU General Public License as published by |
|---|
| 10 | | - * the Free Software Foundation; either version 2 of the License, or (at |
|---|
| 11 | | - * your option) any later version. |
|---|
| 12 | | - * |
|---|
| 13 | | - * This program is distributed in the hope that it will be useful, but |
|---|
| 14 | | - * WITHOUT ANY WARRANTY; without even the implied warranty of |
|---|
| 15 | | - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or |
|---|
| 16 | | - * NON INFRINGEMENT. See the GNU General Public License for more |
|---|
| 17 | | - * details. |
|---|
| 18 | | - * |
|---|
| 19 | | - * You should have received a copy of the GNU General Public License |
|---|
| 20 | | - * along with this program; if not, write to the Free Software |
|---|
| 21 | | - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
|---|
| 22 | 6 | * |
|---|
| 23 | 7 | * Send comments and feedback to Linas Vepstas <linas@austin.ibm.com> |
|---|
| 24 | 8 | */ |
|---|
| .. | .. |
|---|
| 27 | 11 | #include <linux/irq.h> |
|---|
| 28 | 12 | #include <linux/module.h> |
|---|
| 29 | 13 | #include <linux/pci.h> |
|---|
| 14 | +#include <linux/pci_hotplug.h> |
|---|
| 30 | 15 | #include <asm/eeh.h> |
|---|
| 31 | 16 | #include <asm/eeh_event.h> |
|---|
| 32 | 17 | #include <asm/ppc-pci.h> |
|---|
| .. | .. |
|---|
| 35 | 20 | #include <asm/rtas.h> |
|---|
| 36 | 21 | |
|---|
| 37 | 22 | struct eeh_rmv_data { |
|---|
| 38 | | - struct list_head edev_list; |
|---|
| 39 | | - int removed; |
|---|
| 23 | + struct list_head removed_vf_list; |
|---|
| 24 | + int removed_dev_count; |
|---|
| 40 | 25 | }; |
|---|
| 41 | 26 | |
|---|
| 42 | 27 | static int eeh_result_priority(enum pci_ers_result result) |
|---|
| .. | .. |
|---|
| 60 | 45 | } |
|---|
| 61 | 46 | }; |
|---|
| 62 | 47 | |
|---|
| 63 | | -const char *pci_ers_result_name(enum pci_ers_result result) |
|---|
| 48 | +static const char *pci_ers_result_name(enum pci_ers_result result) |
|---|
| 64 | 49 | { |
|---|
| 65 | 50 | switch (result) { |
|---|
| 66 | 51 | case PCI_ERS_RESULT_NONE: |
|---|
| .. | .. |
|---|
| 81 | 66 | } |
|---|
| 82 | 67 | }; |
|---|
| 83 | 68 | |
|---|
| 84 | | -static __printf(2, 3) void eeh_edev_info(const struct eeh_dev *edev, |
|---|
| 85 | | - const char *fmt, ...) |
|---|
| 86 | | -{ |
|---|
| 87 | | - struct va_format vaf; |
|---|
| 88 | | - va_list args; |
|---|
| 89 | | - |
|---|
| 90 | | - va_start(args, fmt); |
|---|
| 91 | | - |
|---|
| 92 | | - vaf.fmt = fmt; |
|---|
| 93 | | - vaf.va = &args; |
|---|
| 94 | | - |
|---|
| 95 | | - printk(KERN_INFO "EEH: PE#%x (PCI %s): %pV\n", edev->pe_config_addr, |
|---|
| 96 | | - edev->pdev ? dev_name(&edev->pdev->dev) : "none", &vaf); |
|---|
| 97 | | - |
|---|
| 98 | | - va_end(args); |
|---|
| 99 | | -} |
|---|
| 100 | | - |
|---|
| 101 | 69 | static enum pci_ers_result pci_ers_merge_result(enum pci_ers_result old, |
|---|
| 102 | 70 | enum pci_ers_result new) |
|---|
| 103 | 71 | { |
|---|
| .. | .. |
|---|
| 113 | 81 | |
|---|
| 114 | 82 | static bool eeh_edev_actionable(struct eeh_dev *edev) |
|---|
| 115 | 83 | { |
|---|
| 116 | | - return (edev->pdev && !eeh_dev_removed(edev) && |
|---|
| 117 | | - !eeh_pe_passed(edev->pe)); |
|---|
| 84 | + if (!edev->pdev) |
|---|
| 85 | + return false; |
|---|
| 86 | + if (edev->pdev->error_state == pci_channel_io_perm_failure) |
|---|
| 87 | + return false; |
|---|
| 88 | + if (eeh_dev_removed(edev)) |
|---|
| 89 | + return false; |
|---|
| 90 | + if (eeh_pe_passed(edev->pe)) |
|---|
| 91 | + return false; |
|---|
| 92 | + |
|---|
| 93 | + return true; |
|---|
| 118 | 94 | } |
|---|
| 119 | 95 | |
|---|
| 120 | 96 | /** |
|---|
| .. | .. |
|---|
| 214 | 190 | } |
|---|
| 215 | 191 | } |
|---|
| 216 | 192 | |
|---|
| 217 | | -static void *eeh_dev_save_state(struct eeh_dev *edev, void *userdata) |
|---|
| 193 | +static void eeh_dev_save_state(struct eeh_dev *edev, void *userdata) |
|---|
| 218 | 194 | { |
|---|
| 219 | 195 | struct pci_dev *pdev; |
|---|
| 220 | 196 | |
|---|
| 221 | 197 | if (!edev) |
|---|
| 222 | | - return NULL; |
|---|
| 198 | + return; |
|---|
| 223 | 199 | |
|---|
| 224 | 200 | /* |
|---|
| 225 | 201 | * We cannot access the config space on some adapters. |
|---|
| .. | .. |
|---|
| 229 | 205 | * device is created. |
|---|
| 230 | 206 | */ |
|---|
| 231 | 207 | if (edev->pe && (edev->pe->state & EEH_PE_CFG_RESTRICTED)) |
|---|
| 232 | | - return NULL; |
|---|
| 208 | + return; |
|---|
| 233 | 209 | |
|---|
| 234 | 210 | pdev = eeh_dev_to_pci_dev(edev); |
|---|
| 235 | 211 | if (!pdev) |
|---|
| 236 | | - return NULL; |
|---|
| 212 | + return; |
|---|
| 237 | 213 | |
|---|
| 238 | 214 | pci_save_state(pdev); |
|---|
| 239 | | - return NULL; |
|---|
| 240 | 215 | } |
|---|
| 241 | 216 | |
|---|
| 242 | | -static void eeh_set_channel_state(struct eeh_pe *root, enum pci_channel_state s) |
|---|
| 217 | +static void eeh_set_channel_state(struct eeh_pe *root, pci_channel_state_t s) |
|---|
| 243 | 218 | { |
|---|
| 244 | 219 | struct eeh_pe *pe; |
|---|
| 245 | 220 | struct eeh_dev *edev, *tmp; |
|---|
| .. | .. |
|---|
| 274 | 249 | } |
|---|
| 275 | 250 | |
|---|
| 276 | 251 | typedef enum pci_ers_result (*eeh_report_fn)(struct eeh_dev *, |
|---|
| 252 | + struct pci_dev *, |
|---|
| 277 | 253 | struct pci_driver *); |
|---|
| 278 | 254 | static void eeh_pe_report_edev(struct eeh_dev *edev, eeh_report_fn fn, |
|---|
| 279 | 255 | enum pci_ers_result *result) |
|---|
| 280 | 256 | { |
|---|
| 257 | + struct pci_dev *pdev; |
|---|
| 281 | 258 | struct pci_driver *driver; |
|---|
| 282 | 259 | enum pci_ers_result new_result; |
|---|
| 283 | 260 | |
|---|
| 284 | | - if (!edev->pdev) { |
|---|
| 261 | + pci_lock_rescan_remove(); |
|---|
| 262 | + pdev = edev->pdev; |
|---|
| 263 | + if (pdev) |
|---|
| 264 | + get_device(&pdev->dev); |
|---|
| 265 | + pci_unlock_rescan_remove(); |
|---|
| 266 | + if (!pdev) { |
|---|
| 285 | 267 | eeh_edev_info(edev, "no device"); |
|---|
| 286 | 268 | return; |
|---|
| 287 | 269 | } |
|---|
| 288 | | - device_lock(&edev->pdev->dev); |
|---|
| 270 | + device_lock(&pdev->dev); |
|---|
| 289 | 271 | if (eeh_edev_actionable(edev)) { |
|---|
| 290 | | - driver = eeh_pcid_get(edev->pdev); |
|---|
| 272 | + driver = eeh_pcid_get(pdev); |
|---|
| 291 | 273 | |
|---|
| 292 | 274 | if (!driver) |
|---|
| 293 | 275 | eeh_edev_info(edev, "no driver"); |
|---|
| .. | .. |
|---|
| 296 | 278 | else if (edev->mode & EEH_DEV_NO_HANDLER) |
|---|
| 297 | 279 | eeh_edev_info(edev, "driver bound too late"); |
|---|
| 298 | 280 | else { |
|---|
| 299 | | - new_result = fn(edev, driver); |
|---|
| 281 | + new_result = fn(edev, pdev, driver); |
|---|
| 300 | 282 | eeh_edev_info(edev, "%s driver reports: '%s'", |
|---|
| 301 | 283 | driver->name, |
|---|
| 302 | 284 | pci_ers_result_name(new_result)); |
|---|
| .. | .. |
|---|
| 305 | 287 | new_result); |
|---|
| 306 | 288 | } |
|---|
| 307 | 289 | if (driver) |
|---|
| 308 | | - eeh_pcid_put(edev->pdev); |
|---|
| 290 | + eeh_pcid_put(pdev); |
|---|
| 309 | 291 | } else { |
|---|
| 310 | | - eeh_edev_info(edev, "not actionable (%d,%d,%d)", !!edev->pdev, |
|---|
| 292 | + eeh_edev_info(edev, "not actionable (%d,%d,%d)", !!pdev, |
|---|
| 311 | 293 | !eeh_dev_removed(edev), !eeh_pe_passed(edev->pe)); |
|---|
| 312 | 294 | } |
|---|
| 313 | | - device_unlock(&edev->pdev->dev); |
|---|
| 295 | + device_unlock(&pdev->dev); |
|---|
| 296 | + if (edev->pdev != pdev) |
|---|
| 297 | + eeh_edev_warn(edev, "Device changed during processing!\n"); |
|---|
| 298 | + put_device(&pdev->dev); |
|---|
| 314 | 299 | } |
|---|
| 315 | 300 | |
|---|
| 316 | 301 | static void eeh_pe_report(const char *name, struct eeh_pe *root, |
|---|
| .. | .. |
|---|
| 337 | 322 | * Report an EEH error to each device driver. |
|---|
| 338 | 323 | */ |
|---|
| 339 | 324 | static enum pci_ers_result eeh_report_error(struct eeh_dev *edev, |
|---|
| 325 | + struct pci_dev *pdev, |
|---|
| 340 | 326 | struct pci_driver *driver) |
|---|
| 341 | 327 | { |
|---|
| 342 | 328 | enum pci_ers_result rc; |
|---|
| 343 | | - struct pci_dev *dev = edev->pdev; |
|---|
| 344 | 329 | |
|---|
| 345 | 330 | if (!driver->err_handler->error_detected) |
|---|
| 346 | 331 | return PCI_ERS_RESULT_NONE; |
|---|
| 347 | 332 | |
|---|
| 348 | 333 | eeh_edev_info(edev, "Invoking %s->error_detected(IO frozen)", |
|---|
| 349 | 334 | driver->name); |
|---|
| 350 | | - rc = driver->err_handler->error_detected(dev, pci_channel_io_frozen); |
|---|
| 335 | + rc = driver->err_handler->error_detected(pdev, pci_channel_io_frozen); |
|---|
| 351 | 336 | |
|---|
| 352 | 337 | edev->in_error = true; |
|---|
| 353 | | - pci_uevent_ers(dev, PCI_ERS_RESULT_NONE); |
|---|
| 338 | + pci_uevent_ers(pdev, PCI_ERS_RESULT_NONE); |
|---|
| 354 | 339 | return rc; |
|---|
| 355 | 340 | } |
|---|
| 356 | 341 | |
|---|
| .. | .. |
|---|
| 363 | 348 | * are now enabled. |
|---|
| 364 | 349 | */ |
|---|
| 365 | 350 | static enum pci_ers_result eeh_report_mmio_enabled(struct eeh_dev *edev, |
|---|
| 351 | + struct pci_dev *pdev, |
|---|
| 366 | 352 | struct pci_driver *driver) |
|---|
| 367 | 353 | { |
|---|
| 368 | 354 | if (!driver->err_handler->mmio_enabled) |
|---|
| 369 | 355 | return PCI_ERS_RESULT_NONE; |
|---|
| 370 | 356 | eeh_edev_info(edev, "Invoking %s->mmio_enabled()", driver->name); |
|---|
| 371 | | - return driver->err_handler->mmio_enabled(edev->pdev); |
|---|
| 357 | + return driver->err_handler->mmio_enabled(pdev); |
|---|
| 372 | 358 | } |
|---|
| 373 | 359 | |
|---|
| 374 | 360 | /** |
|---|
| .. | .. |
|---|
| 382 | 368 | * driver can work again while the device is recovered. |
|---|
| 383 | 369 | */ |
|---|
| 384 | 370 | static enum pci_ers_result eeh_report_reset(struct eeh_dev *edev, |
|---|
| 371 | + struct pci_dev *pdev, |
|---|
| 385 | 372 | struct pci_driver *driver) |
|---|
| 386 | 373 | { |
|---|
| 387 | 374 | if (!driver->err_handler->slot_reset || !edev->in_error) |
|---|
| 388 | 375 | return PCI_ERS_RESULT_NONE; |
|---|
| 389 | 376 | eeh_edev_info(edev, "Invoking %s->slot_reset()", driver->name); |
|---|
| 390 | | - return driver->err_handler->slot_reset(edev->pdev); |
|---|
| 377 | + return driver->err_handler->slot_reset(pdev); |
|---|
| 391 | 378 | } |
|---|
| 392 | 379 | |
|---|
| 393 | | -static void *eeh_dev_restore_state(struct eeh_dev *edev, void *userdata) |
|---|
| 380 | +static void eeh_dev_restore_state(struct eeh_dev *edev, void *userdata) |
|---|
| 394 | 381 | { |
|---|
| 395 | 382 | struct pci_dev *pdev; |
|---|
| 396 | 383 | |
|---|
| 397 | 384 | if (!edev) |
|---|
| 398 | | - return NULL; |
|---|
| 385 | + return; |
|---|
| 399 | 386 | |
|---|
| 400 | 387 | /* |
|---|
| 401 | 388 | * The content in the config space isn't saved because |
|---|
| .. | .. |
|---|
| 404 | 391 | * EEH device is created. |
|---|
| 405 | 392 | */ |
|---|
| 406 | 393 | if (edev->pe && (edev->pe->state & EEH_PE_CFG_RESTRICTED)) { |
|---|
| 407 | | - if (list_is_last(&edev->list, &edev->pe->edevs)) |
|---|
| 394 | + if (list_is_last(&edev->entry, &edev->pe->edevs)) |
|---|
| 408 | 395 | eeh_pe_restore_bars(edev->pe); |
|---|
| 409 | 396 | |
|---|
| 410 | | - return NULL; |
|---|
| 397 | + return; |
|---|
| 411 | 398 | } |
|---|
| 412 | 399 | |
|---|
| 413 | 400 | pdev = eeh_dev_to_pci_dev(edev); |
|---|
| 414 | 401 | if (!pdev) |
|---|
| 415 | | - return NULL; |
|---|
| 402 | + return; |
|---|
| 416 | 403 | |
|---|
| 417 | 404 | pci_restore_state(pdev); |
|---|
| 418 | | - return NULL; |
|---|
| 419 | 405 | } |
|---|
| 420 | 406 | |
|---|
| 421 | 407 | /** |
|---|
| .. | .. |
|---|
| 428 | 414 | * to make the recovered device work again. |
|---|
| 429 | 415 | */ |
|---|
| 430 | 416 | static enum pci_ers_result eeh_report_resume(struct eeh_dev *edev, |
|---|
| 417 | + struct pci_dev *pdev, |
|---|
| 431 | 418 | struct pci_driver *driver) |
|---|
| 432 | 419 | { |
|---|
| 433 | 420 | if (!driver->err_handler->resume || !edev->in_error) |
|---|
| 434 | 421 | return PCI_ERS_RESULT_NONE; |
|---|
| 435 | 422 | |
|---|
| 436 | 423 | eeh_edev_info(edev, "Invoking %s->resume()", driver->name); |
|---|
| 437 | | - driver->err_handler->resume(edev->pdev); |
|---|
| 424 | + driver->err_handler->resume(pdev); |
|---|
| 438 | 425 | |
|---|
| 439 | 426 | pci_uevent_ers(edev->pdev, PCI_ERS_RESULT_RECOVERED); |
|---|
| 440 | 427 | #ifdef CONFIG_PCI_IOV |
|---|
| 441 | | - if (eeh_ops->notify_resume && eeh_dev_to_pdn(edev)) |
|---|
| 442 | | - eeh_ops->notify_resume(eeh_dev_to_pdn(edev)); |
|---|
| 428 | + if (eeh_ops->notify_resume) |
|---|
| 429 | + eeh_ops->notify_resume(edev); |
|---|
| 443 | 430 | #endif |
|---|
| 444 | 431 | return PCI_ERS_RESULT_NONE; |
|---|
| 445 | 432 | } |
|---|
| .. | .. |
|---|
| 453 | 440 | * dead, and that no further recovery attempts will be made on it. |
|---|
| 454 | 441 | */ |
|---|
| 455 | 442 | static enum pci_ers_result eeh_report_failure(struct eeh_dev *edev, |
|---|
| 443 | + struct pci_dev *pdev, |
|---|
| 456 | 444 | struct pci_driver *driver) |
|---|
| 457 | 445 | { |
|---|
| 458 | 446 | enum pci_ers_result rc; |
|---|
| .. | .. |
|---|
| 462 | 450 | |
|---|
| 463 | 451 | eeh_edev_info(edev, "Invoking %s->error_detected(permanent failure)", |
|---|
| 464 | 452 | driver->name); |
|---|
| 465 | | - rc = driver->err_handler->error_detected(edev->pdev, |
|---|
| 453 | + rc = driver->err_handler->error_detected(pdev, |
|---|
| 466 | 454 | pci_channel_io_perm_failure); |
|---|
| 467 | 455 | |
|---|
| 468 | | - pci_uevent_ers(edev->pdev, PCI_ERS_RESULT_DISCONNECT); |
|---|
| 456 | + pci_uevent_ers(pdev, PCI_ERS_RESULT_DISCONNECT); |
|---|
| 469 | 457 | return rc; |
|---|
| 470 | 458 | } |
|---|
| 471 | 459 | |
|---|
| 472 | | -static void *eeh_add_virt_device(void *data, void *userdata) |
|---|
| 460 | +static void *eeh_add_virt_device(struct eeh_dev *edev) |
|---|
| 473 | 461 | { |
|---|
| 474 | 462 | struct pci_driver *driver; |
|---|
| 475 | | - struct eeh_dev *edev = (struct eeh_dev *)data; |
|---|
| 476 | 463 | struct pci_dev *dev = eeh_dev_to_pci_dev(edev); |
|---|
| 477 | | - struct pci_dn *pdn = eeh_dev_to_pdn(edev); |
|---|
| 478 | 464 | |
|---|
| 479 | 465 | if (!(edev->physfn)) { |
|---|
| 480 | | - pr_warn("%s: EEH dev %04x:%02x:%02x.%01x not for VF\n", |
|---|
| 481 | | - __func__, pdn->phb->global_number, pdn->busno, |
|---|
| 482 | | - PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn)); |
|---|
| 466 | + eeh_edev_warn(edev, "Not for VF\n"); |
|---|
| 483 | 467 | return NULL; |
|---|
| 484 | 468 | } |
|---|
| 485 | 469 | |
|---|
| .. | .. |
|---|
| 493 | 477 | } |
|---|
| 494 | 478 | |
|---|
| 495 | 479 | #ifdef CONFIG_PCI_IOV |
|---|
| 496 | | - pci_iov_add_virtfn(edev->physfn, pdn->vf_index); |
|---|
| 480 | + pci_iov_add_virtfn(edev->physfn, edev->vf_index); |
|---|
| 497 | 481 | #endif |
|---|
| 498 | 482 | return NULL; |
|---|
| 499 | 483 | } |
|---|
| 500 | 484 | |
|---|
| 501 | | -static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata) |
|---|
| 485 | +static void eeh_rmv_device(struct eeh_dev *edev, void *userdata) |
|---|
| 502 | 486 | { |
|---|
| 503 | 487 | struct pci_driver *driver; |
|---|
| 504 | 488 | struct pci_dev *dev = eeh_dev_to_pci_dev(edev); |
|---|
| 505 | 489 | struct eeh_rmv_data *rmv_data = (struct eeh_rmv_data *)userdata; |
|---|
| 506 | | - int *removed = rmv_data ? &rmv_data->removed : NULL; |
|---|
| 507 | 490 | |
|---|
| 508 | 491 | /* |
|---|
| 509 | 492 | * Actually, we should remove the PCI bridges as well. |
|---|
| .. | .. |
|---|
| 512 | 495 | * support EEH. So we just care about PCI devices for |
|---|
| 513 | 496 | * simplicity here. |
|---|
| 514 | 497 | */ |
|---|
| 515 | | - if (!dev || (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)) |
|---|
| 516 | | - return NULL; |
|---|
| 498 | + if (!eeh_edev_actionable(edev) || |
|---|
| 499 | + (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)) |
|---|
| 500 | + return; |
|---|
| 517 | 501 | |
|---|
| 518 | | - /* |
|---|
| 519 | | - * We rely on count-based pcibios_release_device() to |
|---|
| 520 | | - * detach permanently offlined PEs. Unfortunately, that's |
|---|
| 521 | | - * not reliable enough. We might have the permanently |
|---|
| 522 | | - * offlined PEs attached, but we needn't take care of |
|---|
| 523 | | - * them and their child devices. |
|---|
| 524 | | - */ |
|---|
| 525 | | - if (eeh_dev_removed(edev)) |
|---|
| 526 | | - return NULL; |
|---|
| 527 | | - |
|---|
| 528 | | - if (removed) { |
|---|
| 529 | | - if (eeh_pe_passed(edev->pe)) |
|---|
| 530 | | - return NULL; |
|---|
| 502 | + if (rmv_data) { |
|---|
| 531 | 503 | driver = eeh_pcid_get(dev); |
|---|
| 532 | 504 | if (driver) { |
|---|
| 533 | 505 | if (driver->err_handler && |
|---|
| 534 | 506 | driver->err_handler->error_detected && |
|---|
| 535 | 507 | driver->err_handler->slot_reset) { |
|---|
| 536 | 508 | eeh_pcid_put(dev); |
|---|
| 537 | | - return NULL; |
|---|
| 509 | + return; |
|---|
| 538 | 510 | } |
|---|
| 539 | 511 | eeh_pcid_put(dev); |
|---|
| 540 | 512 | } |
|---|
| 541 | 513 | } |
|---|
| 542 | 514 | |
|---|
| 543 | 515 | /* Remove it from PCI subsystem */ |
|---|
| 544 | | - pr_debug("EEH: Removing %s without EEH sensitive driver\n", |
|---|
| 545 | | - pci_name(dev)); |
|---|
| 546 | | - edev->bus = dev->bus; |
|---|
| 516 | + pr_info("EEH: Removing %s without EEH sensitive driver\n", |
|---|
| 517 | + pci_name(dev)); |
|---|
| 547 | 518 | edev->mode |= EEH_DEV_DISCONNECTED; |
|---|
| 548 | | - if (removed) |
|---|
| 549 | | - (*removed)++; |
|---|
| 519 | + if (rmv_data) |
|---|
| 520 | + rmv_data->removed_dev_count++; |
|---|
| 550 | 521 | |
|---|
| 551 | 522 | if (edev->physfn) { |
|---|
| 552 | 523 | #ifdef CONFIG_PCI_IOV |
|---|
| 553 | | - struct pci_dn *pdn = eeh_dev_to_pdn(edev); |
|---|
| 554 | | - |
|---|
| 555 | | - pci_iov_remove_virtfn(edev->physfn, pdn->vf_index); |
|---|
| 524 | + pci_iov_remove_virtfn(edev->physfn, edev->vf_index); |
|---|
| 556 | 525 | edev->pdev = NULL; |
|---|
| 557 | 526 | #endif |
|---|
| 558 | 527 | if (rmv_data) |
|---|
| 559 | | - list_add(&edev->rmv_list, &rmv_data->edev_list); |
|---|
| 528 | + list_add(&edev->rmv_entry, &rmv_data->removed_vf_list); |
|---|
| 560 | 529 | } else { |
|---|
| 561 | 530 | pci_lock_rescan_remove(); |
|---|
| 562 | 531 | pci_stop_and_remove_bus_device(dev); |
|---|
| 563 | 532 | pci_unlock_rescan_remove(); |
|---|
| 564 | 533 | } |
|---|
| 565 | | - |
|---|
| 566 | | - return NULL; |
|---|
| 567 | 534 | } |
|---|
| 568 | 535 | |
|---|
| 569 | 536 | static void *eeh_pe_detach_dev(struct eeh_pe *pe, void *userdata) |
|---|
| .. | .. |
|---|
| 575 | 542 | continue; |
|---|
| 576 | 543 | |
|---|
| 577 | 544 | edev->mode &= ~(EEH_DEV_DISCONNECTED | EEH_DEV_IRQ_DISABLED); |
|---|
| 578 | | - eeh_rmv_from_parent_pe(edev); |
|---|
| 545 | + eeh_pe_tree_remove(edev); |
|---|
| 579 | 546 | } |
|---|
| 580 | 547 | |
|---|
| 581 | 548 | return NULL; |
|---|
| .. | .. |
|---|
| 588 | 555 | * PE reset (for 3 times), we try to clear the frozen state |
|---|
| 589 | 556 | * for 3 times as well. |
|---|
| 590 | 557 | */ |
|---|
| 591 | | -static void *__eeh_clear_pe_frozen_state(struct eeh_pe *pe, void *flag) |
|---|
| 558 | +static int eeh_clear_pe_frozen_state(struct eeh_pe *root, bool include_passed) |
|---|
| 592 | 559 | { |
|---|
| 593 | | - bool clear_sw_state = *(bool *)flag; |
|---|
| 594 | | - int i, rc = 1; |
|---|
| 560 | + struct eeh_pe *pe; |
|---|
| 561 | + int i; |
|---|
| 595 | 562 | |
|---|
| 596 | | - for (i = 0; rc && i < 3; i++) |
|---|
| 597 | | - rc = eeh_unfreeze_pe(pe, clear_sw_state); |
|---|
| 598 | | - |
|---|
| 599 | | - /* Stop immediately on any errors */ |
|---|
| 600 | | - if (rc) { |
|---|
| 601 | | - pr_warn("%s: Failure %d unfreezing PHB#%x-PE#%x\n", |
|---|
| 602 | | - __func__, rc, pe->phb->global_number, pe->addr); |
|---|
| 603 | | - return (void *)pe; |
|---|
| 563 | + eeh_for_each_pe(root, pe) { |
|---|
| 564 | + if (include_passed || !eeh_pe_passed(pe)) { |
|---|
| 565 | + for (i = 0; i < 3; i++) |
|---|
| 566 | + if (!eeh_unfreeze_pe(pe)) |
|---|
| 567 | + break; |
|---|
| 568 | + if (i >= 3) |
|---|
| 569 | + return -EIO; |
|---|
| 570 | + } |
|---|
| 604 | 571 | } |
|---|
| 605 | | - |
|---|
| 606 | | - return NULL; |
|---|
| 607 | | -} |
|---|
| 608 | | - |
|---|
| 609 | | -static int eeh_clear_pe_frozen_state(struct eeh_pe *pe, |
|---|
| 610 | | - bool clear_sw_state) |
|---|
| 611 | | -{ |
|---|
| 612 | | - void *rc; |
|---|
| 613 | | - |
|---|
| 614 | | - rc = eeh_pe_traverse(pe, __eeh_clear_pe_frozen_state, &clear_sw_state); |
|---|
| 615 | | - if (!rc) |
|---|
| 616 | | - eeh_pe_state_clear(pe, EEH_PE_ISOLATED); |
|---|
| 617 | | - |
|---|
| 618 | | - return rc ? -EIO : 0; |
|---|
| 572 | + eeh_pe_state_clear(root, EEH_PE_ISOLATED, include_passed); |
|---|
| 573 | + return 0; |
|---|
| 619 | 574 | } |
|---|
| 620 | 575 | |
|---|
| 621 | 576 | int eeh_pe_reset_and_recover(struct eeh_pe *pe) |
|---|
| .. | .. |
|---|
| 633 | 588 | eeh_pe_dev_traverse(pe, eeh_dev_save_state, NULL); |
|---|
| 634 | 589 | |
|---|
| 635 | 590 | /* Issue reset */ |
|---|
| 636 | | - ret = eeh_pe_reset_full(pe); |
|---|
| 591 | + ret = eeh_pe_reset_full(pe, true); |
|---|
| 637 | 592 | if (ret) { |
|---|
| 638 | | - eeh_pe_state_clear(pe, EEH_PE_RECOVERING); |
|---|
| 593 | + eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true); |
|---|
| 639 | 594 | return ret; |
|---|
| 640 | 595 | } |
|---|
| 641 | 596 | |
|---|
| 642 | 597 | /* Unfreeze the PE */ |
|---|
| 643 | 598 | ret = eeh_clear_pe_frozen_state(pe, true); |
|---|
| 644 | 599 | if (ret) { |
|---|
| 645 | | - eeh_pe_state_clear(pe, EEH_PE_RECOVERING); |
|---|
| 600 | + eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true); |
|---|
| 646 | 601 | return ret; |
|---|
| 647 | 602 | } |
|---|
| 648 | 603 | |
|---|
| .. | .. |
|---|
| 650 | 605 | eeh_pe_dev_traverse(pe, eeh_dev_restore_state, NULL); |
|---|
| 651 | 606 | |
|---|
| 652 | 607 | /* Clear recovery mode */ |
|---|
| 653 | | - eeh_pe_state_clear(pe, EEH_PE_RECOVERING); |
|---|
| 608 | + eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true); |
|---|
| 654 | 609 | |
|---|
| 655 | 610 | return 0; |
|---|
| 656 | 611 | } |
|---|
| .. | .. |
|---|
| 673 | 628 | time64_t tstamp; |
|---|
| 674 | 629 | int cnt, rc; |
|---|
| 675 | 630 | struct eeh_dev *edev; |
|---|
| 631 | + struct eeh_pe *tmp_pe; |
|---|
| 632 | + bool any_passed = false; |
|---|
| 633 | + |
|---|
| 634 | + eeh_for_each_pe(pe, tmp_pe) |
|---|
| 635 | + any_passed |= eeh_pe_passed(tmp_pe); |
|---|
| 676 | 636 | |
|---|
| 677 | 637 | /* pcibios will clear the counter; save the value */ |
|---|
| 678 | 638 | cnt = pe->freeze_count; |
|---|
| .. | .. |
|---|
| 685 | 645 | * into pci_hp_add_devices(). |
|---|
| 686 | 646 | */ |
|---|
| 687 | 647 | eeh_pe_state_mark(pe, EEH_PE_KEEP); |
|---|
| 688 | | - if (driver_eeh_aware || (pe->type & EEH_PE_VF)) { |
|---|
| 648 | + if (any_passed || driver_eeh_aware || (pe->type & EEH_PE_VF)) { |
|---|
| 689 | 649 | eeh_pe_dev_traverse(pe, eeh_rmv_device, rmv_data); |
|---|
| 690 | 650 | } else { |
|---|
| 691 | 651 | pci_lock_rescan_remove(); |
|---|
| .. | .. |
|---|
| 702 | 662 | * config accesses. So we prefer to block them. However, controlled |
|---|
| 703 | 663 | * PCI config accesses initiated from EEH itself are allowed. |
|---|
| 704 | 664 | */ |
|---|
| 705 | | - rc = eeh_pe_reset_full(pe); |
|---|
| 665 | + rc = eeh_pe_reset_full(pe, false); |
|---|
| 706 | 666 | if (rc) |
|---|
| 707 | 667 | return rc; |
|---|
| 708 | 668 | |
|---|
| .. | .. |
|---|
| 725 | 685 | * the device up before the scripts have taken it down, |
|---|
| 726 | 686 | * potentially weird things happen. |
|---|
| 727 | 687 | */ |
|---|
| 728 | | - if (!driver_eeh_aware || rmv_data->removed) { |
|---|
| 688 | + if (!driver_eeh_aware || rmv_data->removed_dev_count) { |
|---|
| 729 | 689 | pr_info("EEH: Sleep 5s ahead of %s hotplug\n", |
|---|
| 730 | 690 | (driver_eeh_aware ? "partial" : "complete")); |
|---|
| 731 | 691 | ssleep(5); |
|---|
| .. | .. |
|---|
| 735 | 695 | * PE. We should disconnect it so the binding can be |
|---|
| 736 | 696 | * rebuilt when adding PCI devices. |
|---|
| 737 | 697 | */ |
|---|
| 738 | | - edev = list_first_entry(&pe->edevs, struct eeh_dev, list); |
|---|
| 698 | + edev = list_first_entry(&pe->edevs, struct eeh_dev, entry); |
|---|
| 739 | 699 | eeh_pe_traverse(pe, eeh_pe_detach_dev, NULL); |
|---|
| 740 | 700 | if (pe->type & EEH_PE_VF) { |
|---|
| 741 | | - eeh_add_virt_device(edev, NULL); |
|---|
| 701 | + eeh_add_virt_device(edev); |
|---|
| 742 | 702 | } else { |
|---|
| 743 | 703 | if (!driver_eeh_aware) |
|---|
| 744 | | - eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); |
|---|
| 704 | + eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true); |
|---|
| 745 | 705 | pci_hp_add_devices(bus); |
|---|
| 746 | 706 | } |
|---|
| 747 | 707 | } |
|---|
| 748 | | - eeh_pe_state_clear(pe, EEH_PE_KEEP); |
|---|
| 708 | + eeh_pe_state_clear(pe, EEH_PE_KEEP, true); |
|---|
| 749 | 709 | |
|---|
| 750 | 710 | pe->tstamp = tstamp; |
|---|
| 751 | 711 | pe->freeze_count = cnt; |
|---|
| .. | .. |
|---|
| 758 | 718 | * to come back on line, in seconds. |
|---|
| 759 | 719 | */ |
|---|
| 760 | 720 | #define MAX_WAIT_FOR_RECOVERY 300 |
|---|
| 721 | + |
|---|
| 722 | + |
|---|
| 723 | +/* Walks the PE tree after processing an event to remove any stale PEs. |
|---|
| 724 | + * |
|---|
| 725 | + * NB: This needs to be recursive to ensure the leaf PEs get removed |
|---|
| 726 | + * before their parents do. Although this is possible to do iteratively |
|---|
| 727 | + * we don't since this is easier to read and we need to guarantee |
|---|
| 728 | + * the leaf nodes will be handled first. |
|---|
| 729 | + */ |
|---|
| 730 | +static void eeh_pe_cleanup(struct eeh_pe *pe) |
|---|
| 731 | +{ |
|---|
| 732 | + struct eeh_pe *child_pe, *tmp; |
|---|
| 733 | + |
|---|
| 734 | + list_for_each_entry_safe(child_pe, tmp, &pe->child_list, child) |
|---|
| 735 | + eeh_pe_cleanup(child_pe); |
|---|
| 736 | + |
|---|
| 737 | + if (pe->state & EEH_PE_KEEP) |
|---|
| 738 | + return; |
|---|
| 739 | + |
|---|
| 740 | + if (!(pe->state & EEH_PE_INVALID)) |
|---|
| 741 | + return; |
|---|
| 742 | + |
|---|
| 743 | + if (list_empty(&pe->edevs) && list_empty(&pe->child_list)) { |
|---|
| 744 | + list_del(&pe->child); |
|---|
| 745 | + kfree(pe); |
|---|
| 746 | + } |
|---|
| 747 | +} |
|---|
| 748 | + |
|---|
| 749 | +/** |
|---|
| 750 | + * eeh_slot_presence_check - Check if a device is still present in a slot |
|---|
| 751 | + * @pdev: pci_dev to check |
|---|
| 752 | + * |
|---|
| 753 | + * This function may return a false positive if we can't determine the slot's |
|---|
| 754 | + * presence state. This might happen for PCIe slots if the PE containing |
|---|
| 755 | + * the upstream bridge is also frozen, or the bridge is part of the same PE |
|---|
| 756 | + * as the device. |
|---|
| 757 | + * |
|---|
| 758 | + * This shouldn't happen often, but you might see it if you hotplug a PCIe |
|---|
| 759 | + * switch. |
|---|
| 760 | + */ |
|---|
| 761 | +static bool eeh_slot_presence_check(struct pci_dev *pdev) |
|---|
| 762 | +{ |
|---|
| 763 | + const struct hotplug_slot_ops *ops; |
|---|
| 764 | + struct pci_slot *slot; |
|---|
| 765 | + u8 state; |
|---|
| 766 | + int rc; |
|---|
| 767 | + |
|---|
| 768 | + if (!pdev) |
|---|
| 769 | + return false; |
|---|
| 770 | + |
|---|
| 771 | + if (pdev->error_state == pci_channel_io_perm_failure) |
|---|
| 772 | + return false; |
|---|
| 773 | + |
|---|
| 774 | + slot = pdev->slot; |
|---|
| 775 | + if (!slot || !slot->hotplug) |
|---|
| 776 | + return true; |
|---|
| 777 | + |
|---|
| 778 | + ops = slot->hotplug->ops; |
|---|
| 779 | + if (!ops || !ops->get_adapter_status) |
|---|
| 780 | + return true; |
|---|
| 781 | + |
|---|
| 782 | + /* set the attention indicator while we've got the slot ops */ |
|---|
| 783 | + if (ops->set_attention_status) |
|---|
| 784 | + ops->set_attention_status(slot->hotplug, 1); |
|---|
| 785 | + |
|---|
| 786 | + rc = ops->get_adapter_status(slot->hotplug, &state); |
|---|
| 787 | + if (rc) |
|---|
| 788 | + return true; |
|---|
| 789 | + |
|---|
| 790 | + return !!state; |
|---|
| 791 | +} |
|---|
| 792 | + |
|---|
| 793 | +static void eeh_clear_slot_attention(struct pci_dev *pdev) |
|---|
| 794 | +{ |
|---|
| 795 | + const struct hotplug_slot_ops *ops; |
|---|
| 796 | + struct pci_slot *slot; |
|---|
| 797 | + |
|---|
| 798 | + if (!pdev) |
|---|
| 799 | + return; |
|---|
| 800 | + |
|---|
| 801 | + if (pdev->error_state == pci_channel_io_perm_failure) |
|---|
| 802 | + return; |
|---|
| 803 | + |
|---|
| 804 | + slot = pdev->slot; |
|---|
| 805 | + if (!slot || !slot->hotplug) |
|---|
| 806 | + return; |
|---|
| 807 | + |
|---|
| 808 | + ops = slot->hotplug->ops; |
|---|
| 809 | + if (!ops || !ops->set_attention_status) |
|---|
| 810 | + return; |
|---|
| 811 | + |
|---|
| 812 | + ops->set_attention_status(slot->hotplug, 0); |
|---|
| 813 | +} |
|---|
| 761 | 814 | |
|---|
| 762 | 815 | /** |
|---|
| 763 | 816 | * eeh_handle_normal_event - Handle EEH events on a specific PE |
|---|
| .. | .. |
|---|
| 787 | 840 | struct eeh_pe *tmp_pe; |
|---|
| 788 | 841 | int rc = 0; |
|---|
| 789 | 842 | enum pci_ers_result result = PCI_ERS_RESULT_NONE; |
|---|
| 790 | | - struct eeh_rmv_data rmv_data = {LIST_HEAD_INIT(rmv_data.edev_list), 0}; |
|---|
| 843 | + struct eeh_rmv_data rmv_data = |
|---|
| 844 | + {LIST_HEAD_INIT(rmv_data.removed_vf_list), 0}; |
|---|
| 845 | + int devices = 0; |
|---|
| 791 | 846 | |
|---|
| 792 | 847 | bus = eeh_pe_bus_get(pe); |
|---|
| 793 | 848 | if (!bus) { |
|---|
| .. | .. |
|---|
| 796 | 851 | return; |
|---|
| 797 | 852 | } |
|---|
| 798 | 853 | |
|---|
| 799 | | - eeh_pe_state_mark(pe, EEH_PE_RECOVERING); |
|---|
| 854 | + /* |
|---|
| 855 | + * When devices are hot-removed we might get an EEH due to |
|---|
| 856 | + * a driver attempting to touch the MMIO space of a removed |
|---|
| 857 | + * device. In this case we don't have a device to recover |
|---|
| 858 | + * so suppress the event if we can't find any present devices. |
|---|
| 859 | + * |
|---|
| 860 | + * The hotplug driver should take care of tearing down the |
|---|
| 861 | + * device itself. |
|---|
| 862 | + */ |
|---|
| 863 | + eeh_for_each_pe(pe, tmp_pe) |
|---|
| 864 | + eeh_pe_for_each_dev(tmp_pe, edev, tmp) |
|---|
| 865 | + if (eeh_slot_presence_check(edev->pdev)) |
|---|
| 866 | + devices++; |
|---|
| 867 | + |
|---|
| 868 | + if (!devices) { |
|---|
| 869 | + pr_debug("EEH: Frozen PHB#%x-PE#%x is empty!\n", |
|---|
| 870 | + pe->phb->global_number, pe->addr); |
|---|
| 871 | + goto out; /* nothing to recover */ |
|---|
| 872 | + } |
|---|
| 873 | + |
|---|
| 874 | + /* Log the event */ |
|---|
| 875 | + if (pe->type & EEH_PE_PHB) { |
|---|
| 876 | + pr_err("EEH: Recovering PHB#%x, location: %s\n", |
|---|
| 877 | + pe->phb->global_number, eeh_pe_loc_get(pe)); |
|---|
| 878 | + } else { |
|---|
| 879 | + struct eeh_pe *phb_pe = eeh_phb_pe_get(pe->phb); |
|---|
| 880 | + |
|---|
| 881 | + pr_err("EEH: Recovering PHB#%x-PE#%x\n", |
|---|
| 882 | + pe->phb->global_number, pe->addr); |
|---|
| 883 | + pr_err("EEH: PE location: %s, PHB location: %s\n", |
|---|
| 884 | + eeh_pe_loc_get(pe), eeh_pe_loc_get(phb_pe)); |
|---|
| 885 | + } |
|---|
| 886 | + |
|---|
| 887 | +#ifdef CONFIG_STACKTRACE |
|---|
| 888 | + /* |
|---|
| 889 | + * Print the saved stack trace now that we've verified there's |
|---|
| 890 | + * something to recover. |
|---|
| 891 | + */ |
|---|
| 892 | + if (pe->trace_entries) { |
|---|
| 893 | + void **ptrs = (void **) pe->stack_trace; |
|---|
| 894 | + int i; |
|---|
| 895 | + |
|---|
| 896 | + pr_err("EEH: Frozen PHB#%x-PE#%x detected\n", |
|---|
| 897 | + pe->phb->global_number, pe->addr); |
|---|
| 898 | + |
|---|
| 899 | + /* FIXME: Use the same format as dump_stack() */ |
|---|
| 900 | + pr_err("EEH: Call Trace:\n"); |
|---|
| 901 | + for (i = 0; i < pe->trace_entries; i++) |
|---|
| 902 | + pr_err("EEH: [%pK] %pS\n", ptrs[i], ptrs[i]); |
|---|
| 903 | + |
|---|
| 904 | + pe->trace_entries = 0; |
|---|
| 905 | + } |
|---|
| 906 | +#endif /* CONFIG_STACKTRACE */ |
|---|
| 800 | 907 | |
|---|
| 801 | 908 | eeh_pe_update_time_stamp(pe); |
|---|
| 802 | 909 | pe->freeze_count++; |
|---|
| .. | .. |
|---|
| 804 | 911 | pr_err("EEH: PHB#%x-PE#%x has failed %d times in the last hour and has been permanently disabled.\n", |
|---|
| 805 | 912 | pe->phb->global_number, pe->addr, |
|---|
| 806 | 913 | pe->freeze_count); |
|---|
| 807 | | - goto hard_fail; |
|---|
| 914 | + result = PCI_ERS_RESULT_DISCONNECT; |
|---|
| 808 | 915 | } |
|---|
| 809 | | - pr_warn("EEH: This PCI device has failed %d times in the last hour and will be permanently disabled after %d failures.\n", |
|---|
| 810 | | - pe->freeze_count, eeh_max_freezes); |
|---|
| 811 | 916 | |
|---|
| 812 | 917 | eeh_for_each_pe(pe, tmp_pe) |
|---|
| 813 | 918 | eeh_pe_for_each_dev(tmp_pe, edev, tmp) |
|---|
| .. | .. |
|---|
| 823 | 928 | * the error. Override the result if necessary to have partially |
|---|
| 824 | 929 | * hotplug for this case. |
|---|
| 825 | 930 | */ |
|---|
| 826 | | - pr_info("EEH: Notify device drivers to shutdown\n"); |
|---|
| 827 | | - eeh_set_channel_state(pe, pci_channel_io_frozen); |
|---|
| 828 | | - eeh_set_irq_state(pe, false); |
|---|
| 829 | | - eeh_pe_report("error_detected(IO frozen)", pe, eeh_report_error, |
|---|
| 830 | | - &result); |
|---|
| 831 | | - if ((pe->type & EEH_PE_PHB) && |
|---|
| 832 | | - result != PCI_ERS_RESULT_NONE && |
|---|
| 833 | | - result != PCI_ERS_RESULT_NEED_RESET) |
|---|
| 834 | | - result = PCI_ERS_RESULT_NEED_RESET; |
|---|
| 931 | + if (result != PCI_ERS_RESULT_DISCONNECT) { |
|---|
| 932 | + pr_warn("EEH: This PCI device has failed %d times in the last hour and will be permanently disabled after %d failures.\n", |
|---|
| 933 | + pe->freeze_count, eeh_max_freezes); |
|---|
| 934 | + pr_info("EEH: Notify device drivers to shutdown\n"); |
|---|
| 935 | + eeh_set_channel_state(pe, pci_channel_io_frozen); |
|---|
| 936 | + eeh_set_irq_state(pe, false); |
|---|
| 937 | + eeh_pe_report("error_detected(IO frozen)", pe, |
|---|
| 938 | + eeh_report_error, &result); |
|---|
| 939 | + if ((pe->type & EEH_PE_PHB) && |
|---|
| 940 | + result != PCI_ERS_RESULT_NONE && |
|---|
| 941 | + result != PCI_ERS_RESULT_NEED_RESET) |
|---|
| 942 | + result = PCI_ERS_RESULT_NEED_RESET; |
|---|
| 943 | + } |
|---|
| 835 | 944 | |
|---|
| 836 | 945 | /* Get the current PCI slot state. This can take a long time, |
|---|
| 837 | 946 | * sometimes over 300 seconds for certain systems. |
|---|
| 838 | 947 | */ |
|---|
| 839 | | - rc = eeh_ops->wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000); |
|---|
| 840 | | - if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) { |
|---|
| 841 | | - pr_warn("EEH: Permanent failure\n"); |
|---|
| 842 | | - goto hard_fail; |
|---|
| 948 | + if (result != PCI_ERS_RESULT_DISCONNECT) { |
|---|
| 949 | + rc = eeh_wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000); |
|---|
| 950 | + if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) { |
|---|
| 951 | + pr_warn("EEH: Permanent failure\n"); |
|---|
| 952 | + result = PCI_ERS_RESULT_DISCONNECT; |
|---|
| 953 | + } |
|---|
| 843 | 954 | } |
|---|
| 844 | 955 | |
|---|
| 845 | 956 | /* Since rtas may enable MMIO when posting the error log, |
|---|
| 846 | 957 | * don't post the error log until after all dev drivers |
|---|
| 847 | 958 | * have been informed. |
|---|
| 848 | 959 | */ |
|---|
| 849 | | - pr_info("EEH: Collect temporary log\n"); |
|---|
| 850 | | - eeh_slot_error_detail(pe, EEH_LOG_TEMP); |
|---|
| 960 | + if (result != PCI_ERS_RESULT_DISCONNECT) { |
|---|
| 961 | + pr_info("EEH: Collect temporary log\n"); |
|---|
| 962 | + eeh_slot_error_detail(pe, EEH_LOG_TEMP); |
|---|
| 963 | + } |
|---|
| 851 | 964 | |
|---|
| 852 | 965 | /* If all device drivers were EEH-unaware, then shut |
|---|
| 853 | 966 | * down all of the device drivers, and hope they |
|---|
| .. | .. |
|---|
| 859 | 972 | if (rc) { |
|---|
| 860 | 973 | pr_warn("%s: Unable to reset, err=%d\n", |
|---|
| 861 | 974 | __func__, rc); |
|---|
| 862 | | - goto hard_fail; |
|---|
| 975 | + result = PCI_ERS_RESULT_DISCONNECT; |
|---|
| 863 | 976 | } |
|---|
| 864 | 977 | } |
|---|
| 865 | 978 | |
|---|
| .. | .. |
|---|
| 868 | 981 | pr_info("EEH: Enable I/O for affected devices\n"); |
|---|
| 869 | 982 | rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO); |
|---|
| 870 | 983 | |
|---|
| 871 | | - if (rc < 0) |
|---|
| 872 | | - goto hard_fail; |
|---|
| 873 | | - if (rc) { |
|---|
| 984 | + if (rc < 0) { |
|---|
| 985 | + result = PCI_ERS_RESULT_DISCONNECT; |
|---|
| 986 | + } else if (rc) { |
|---|
| 874 | 987 | result = PCI_ERS_RESULT_NEED_RESET; |
|---|
| 875 | 988 | } else { |
|---|
| 876 | 989 | pr_info("EEH: Notify device drivers to resume I/O\n"); |
|---|
| .. | .. |
|---|
| 884 | 997 | pr_info("EEH: Enabled DMA for affected devices\n"); |
|---|
| 885 | 998 | rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA); |
|---|
| 886 | 999 | |
|---|
| 887 | | - if (rc < 0) |
|---|
| 888 | | - goto hard_fail; |
|---|
| 889 | | - if (rc) { |
|---|
| 1000 | + if (rc < 0) { |
|---|
| 1001 | + result = PCI_ERS_RESULT_DISCONNECT; |
|---|
| 1002 | + } else if (rc) { |
|---|
| 890 | 1003 | result = PCI_ERS_RESULT_NEED_RESET; |
|---|
| 891 | 1004 | } else { |
|---|
| 892 | 1005 | /* |
|---|
| .. | .. |
|---|
| 894 | 1007 | * is still in frozen state. Clear it before |
|---|
| 895 | 1008 | * resuming the PE. |
|---|
| 896 | 1009 | */ |
|---|
| 897 | | - eeh_pe_state_clear(pe, EEH_PE_ISOLATED); |
|---|
| 1010 | + eeh_pe_state_clear(pe, EEH_PE_ISOLATED, true); |
|---|
| 898 | 1011 | result = PCI_ERS_RESULT_RECOVERED; |
|---|
| 899 | 1012 | } |
|---|
| 900 | | - } |
|---|
| 901 | | - |
|---|
| 902 | | - /* If any device has a hard failure, then shut off everything. */ |
|---|
| 903 | | - if (result == PCI_ERS_RESULT_DISCONNECT) { |
|---|
| 904 | | - pr_warn("EEH: Device driver gave up\n"); |
|---|
| 905 | | - goto hard_fail; |
|---|
| 906 | 1013 | } |
|---|
| 907 | 1014 | |
|---|
| 908 | 1015 | /* If any device called out for a reset, then reset the slot */ |
|---|
| .. | .. |
|---|
| 912 | 1019 | if (rc) { |
|---|
| 913 | 1020 | pr_warn("%s: Cannot reset, err=%d\n", |
|---|
| 914 | 1021 | __func__, rc); |
|---|
| 915 | | - goto hard_fail; |
|---|
| 1022 | + result = PCI_ERS_RESULT_DISCONNECT; |
|---|
| 1023 | + } else { |
|---|
| 1024 | + result = PCI_ERS_RESULT_NONE; |
|---|
| 1025 | + eeh_set_channel_state(pe, pci_channel_io_normal); |
|---|
| 1026 | + eeh_set_irq_state(pe, true); |
|---|
| 1027 | + eeh_pe_report("slot_reset", pe, eeh_report_reset, |
|---|
| 1028 | + &result); |
|---|
| 1029 | + } |
|---|
| 1030 | + } |
|---|
| 1031 | + |
|---|
| 1032 | + if ((result == PCI_ERS_RESULT_RECOVERED) || |
|---|
| 1033 | + (result == PCI_ERS_RESULT_NONE)) { |
|---|
| 1034 | + /* |
|---|
| 1035 | + * For those hot removed VFs, we should add them back after the PF |
|---|
| 1036 | + * has recovered properly. |
|---|
| 1037 | + */ |
|---|
| 1038 | + list_for_each_entry_safe(edev, tmp, &rmv_data.removed_vf_list, |
|---|
| 1039 | + rmv_entry) { |
|---|
| 1040 | + eeh_add_virt_device(edev); |
|---|
| 1041 | + list_del(&edev->rmv_entry); |
|---|
| 916 | 1042 | } |
|---|
| 917 | 1043 | |
|---|
| 918 | | - pr_info("EEH: Notify device drivers " |
|---|
| 919 | | - "the completion of reset\n"); |
|---|
| 920 | | - result = PCI_ERS_RESULT_NONE; |
|---|
| 1044 | + /* Tell all device drivers that they can resume operations */ |
|---|
| 1045 | + pr_info("EEH: Notify device driver to resume\n"); |
|---|
| 921 | 1046 | eeh_set_channel_state(pe, pci_channel_io_normal); |
|---|
| 922 | 1047 | eeh_set_irq_state(pe, true); |
|---|
| 923 | | - eeh_pe_report("slot_reset", pe, eeh_report_reset, &result); |
|---|
| 924 | | - } |
|---|
| 925 | | - |
|---|
| 926 | | - /* All devices should claim they have recovered by now. */ |
|---|
| 927 | | - if ((result != PCI_ERS_RESULT_RECOVERED) && |
|---|
| 928 | | - (result != PCI_ERS_RESULT_NONE)) { |
|---|
| 929 | | - pr_warn("EEH: Not recovered\n"); |
|---|
| 930 | | - goto hard_fail; |
|---|
| 931 | | - } |
|---|
| 932 | | - |
|---|
| 933 | | - /* |
|---|
| 934 | | - * For those hot removed VFs, we should add back them after PF get |
|---|
| 935 | | - * recovered properly. |
|---|
| 936 | | - */ |
|---|
| 937 | | - list_for_each_entry_safe(edev, tmp, &rmv_data.edev_list, rmv_list) { |
|---|
| 938 | | - eeh_add_virt_device(edev, NULL); |
|---|
| 939 | | - list_del(&edev->rmv_list); |
|---|
| 940 | | - } |
|---|
| 941 | | - |
|---|
| 942 | | - /* Tell all device drivers that they can resume operations */ |
|---|
| 943 | | - pr_info("EEH: Notify device driver to resume\n"); |
|---|
| 944 | | - eeh_set_channel_state(pe, pci_channel_io_normal); |
|---|
| 945 | | - eeh_set_irq_state(pe, true); |
|---|
| 946 | | - eeh_pe_report("resume", pe, eeh_report_resume, NULL); |
|---|
| 947 | | - eeh_for_each_pe(pe, tmp_pe) { |
|---|
| 948 | | - eeh_pe_for_each_dev(tmp_pe, edev, tmp) { |
|---|
| 949 | | - edev->mode &= ~EEH_DEV_NO_HANDLER; |
|---|
| 950 | | - edev->in_error = false; |
|---|
| 1048 | + eeh_pe_report("resume", pe, eeh_report_resume, NULL); |
|---|
| 1049 | + eeh_for_each_pe(pe, tmp_pe) { |
|---|
| 1050 | + eeh_pe_for_each_dev(tmp_pe, edev, tmp) { |
|---|
| 1051 | + edev->mode &= ~EEH_DEV_NO_HANDLER; |
|---|
| 1052 | + edev->in_error = false; |
|---|
| 1053 | + } |
|---|
| 951 | 1054 | } |
|---|
| 1055 | + |
|---|
| 1056 | + pr_info("EEH: Recovery successful.\n"); |
|---|
| 1057 | + goto out; |
|---|
| 952 | 1058 | } |
|---|
| 953 | 1059 | |
|---|
| 954 | | - pr_info("EEH: Recovery successful.\n"); |
|---|
| 955 | | - goto final; |
|---|
| 956 | | - |
|---|
| 957 | | -hard_fail: |
|---|
| 958 | 1060 | /* |
|---|
| 959 | 1061 | * About 90% of all real-life EEH failures in the field |
|---|
| 960 | 1062 | * are due to poorly seated PCI cards. Only 10% or so are |
|---|
| 961 | 1063 | * due to actual, failed cards. |
|---|
| 962 | 1064 | */ |
|---|
| 963 | 1065 | pr_err("EEH: Unable to recover from failure from PHB#%x-PE#%x.\n" |
|---|
| 964 | | - "Please try reseating or replacing it\n", |
|---|
| 1066 | + "Please try reseating or replacing it\n", |
|---|
| 965 | 1067 | pe->phb->global_number, pe->addr); |
|---|
| 966 | 1068 | |
|---|
| 967 | 1069 | eeh_slot_error_detail(pe, EEH_LOG_PERM); |
|---|
| 968 | 1070 | |
|---|
| 969 | 1071 | /* Notify all devices that they're about to go down. */ |
|---|
| 970 | | - eeh_set_channel_state(pe, pci_channel_io_perm_failure); |
|---|
| 971 | 1072 | eeh_set_irq_state(pe, false); |
|---|
| 972 | 1073 | eeh_pe_report("error_detected(permanent failure)", pe, |
|---|
| 973 | 1074 | eeh_report_failure, NULL); |
|---|
| 1075 | + eeh_set_channel_state(pe, pci_channel_io_perm_failure); |
|---|
| 974 | 1076 | |
|---|
| 975 | 1077 | /* Mark the PE to be removed permanently */ |
|---|
| 976 | 1078 | eeh_pe_state_mark(pe, EEH_PE_REMOVED); |
|---|
| .. | .. |
|---|
| 984 | 1086 | eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL); |
|---|
| 985 | 1087 | eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); |
|---|
| 986 | 1088 | } else { |
|---|
| 987 | | - eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); |
|---|
| 1089 | + eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true); |
|---|
| 988 | 1090 | eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); |
|---|
| 989 | 1091 | |
|---|
| 990 | 1092 | pci_lock_rescan_remove(); |
|---|
| .. | .. |
|---|
| 993 | 1095 | /* The passed PE should no longer be used */ |
|---|
| 994 | 1096 | return; |
|---|
| 995 | 1097 | } |
|---|
| 996 | | -final: |
|---|
| 997 | | - eeh_pe_state_clear(pe, EEH_PE_RECOVERING); |
|---|
| 1098 | + |
|---|
| 1099 | +out: |
|---|
| 1100 | + /* |
|---|
| 1101 | + * Clean up any PEs without devices. While marked as EEH_PE_RECOVERING |
|---|
| 1102 | + * we don't want to modify the PE tree structure so we do it here. |
|---|
| 1103 | + */ |
|---|
| 1104 | + eeh_pe_cleanup(pe); |
|---|
| 1105 | + |
|---|
| 1106 | + /* clear the slot attention LED for all recovered devices */ |
|---|
| 1107 | + eeh_for_each_pe(pe, tmp_pe) |
|---|
| 1108 | + eeh_pe_for_each_dev(tmp_pe, edev, tmp) |
|---|
| 1109 | + eeh_clear_slot_attention(edev->pdev); |
|---|
| 1110 | + |
|---|
| 1111 | + eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true); |
|---|
| 998 | 1112 | } |
|---|
| 999 | 1113 | |
|---|
| 1000 | 1114 | /** |
|---|
| .. | .. |
|---|
| 1029 | 1143 | phb_pe = eeh_phb_pe_get(hose); |
|---|
| 1030 | 1144 | if (!phb_pe) continue; |
|---|
| 1031 | 1145 | |
|---|
| 1032 | | - eeh_pe_state_mark(phb_pe, EEH_PE_ISOLATED); |
|---|
| 1146 | + eeh_pe_mark_isolated(phb_pe); |
|---|
| 1033 | 1147 | } |
|---|
| 1034 | 1148 | |
|---|
| 1035 | 1149 | eeh_serialize_unlock(flags); |
|---|
| .. | .. |
|---|
| 1044 | 1158 | /* Purge all events of the PHB */ |
|---|
| 1045 | 1159 | eeh_remove_event(pe, true); |
|---|
| 1046 | 1160 | |
|---|
| 1047 | | - if (rc == EEH_NEXT_ERR_DEAD_PHB) |
|---|
| 1048 | | - eeh_pe_state_mark(pe, EEH_PE_ISOLATED); |
|---|
| 1049 | | - else |
|---|
| 1050 | | - eeh_pe_state_mark(pe, |
|---|
| 1051 | | - EEH_PE_ISOLATED | EEH_PE_RECOVERING); |
|---|
| 1161 | + if (rc != EEH_NEXT_ERR_DEAD_PHB) |
|---|
| 1162 | + eeh_pe_state_mark(pe, EEH_PE_RECOVERING); |
|---|
| 1163 | + eeh_pe_mark_isolated(pe); |
|---|
| 1052 | 1164 | |
|---|
| 1053 | 1165 | eeh_serialize_unlock(flags); |
|---|
| 1054 | 1166 | |
|---|
| .. | .. |
|---|
| 1068 | 1180 | */ |
|---|
| 1069 | 1181 | if (rc == EEH_NEXT_ERR_FROZEN_PE || |
|---|
| 1070 | 1182 | rc == EEH_NEXT_ERR_FENCED_PHB) { |
|---|
| 1183 | + eeh_pe_state_mark(pe, EEH_PE_RECOVERING); |
|---|
| 1071 | 1184 | eeh_handle_normal_event(pe); |
|---|
| 1072 | 1185 | } else { |
|---|
| 1186 | + eeh_for_each_pe(pe, tmp_pe) |
|---|
| 1187 | + eeh_pe_for_each_dev(tmp_pe, edev, tmp_edev) |
|---|
| 1188 | + edev->mode &= ~EEH_DEV_NO_HANDLER; |
|---|
| 1189 | + |
|---|
| 1190 | + /* Notify all devices to be down */ |
|---|
| 1191 | + eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true); |
|---|
| 1192 | + eeh_pe_report( |
|---|
| 1193 | + "error_detected(permanent failure)", pe, |
|---|
| 1194 | + eeh_report_failure, NULL); |
|---|
| 1195 | + eeh_set_channel_state(pe, pci_channel_io_perm_failure); |
|---|
| 1196 | + |
|---|
| 1073 | 1197 | pci_lock_rescan_remove(); |
|---|
| 1074 | 1198 | list_for_each_entry(hose, &hose_list, list_node) { |
|---|
| 1075 | 1199 | phb_pe = eeh_phb_pe_get(hose); |
|---|
| .. | .. |
|---|
| 1078 | 1202 | (phb_pe->state & EEH_PE_RECOVERING)) |
|---|
| 1079 | 1203 | continue; |
|---|
| 1080 | 1204 | |
|---|
| 1081 | | - eeh_for_each_pe(pe, tmp_pe) |
|---|
| 1082 | | - eeh_pe_for_each_dev(tmp_pe, edev, tmp_edev) |
|---|
| 1083 | | - edev->mode &= ~EEH_DEV_NO_HANDLER; |
|---|
| 1084 | | - |
|---|
| 1085 | | - /* Notify all devices to be down */ |
|---|
| 1086 | | - eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); |
|---|
| 1087 | | - eeh_set_channel_state(pe, pci_channel_io_perm_failure); |
|---|
| 1088 | | - eeh_pe_report( |
|---|
| 1089 | | - "error_detected(permanent failure)", pe, |
|---|
| 1090 | | - eeh_report_failure, NULL); |
|---|
| 1091 | 1205 | bus = eeh_pe_bus_get(phb_pe); |
|---|
| 1092 | 1206 | if (!bus) { |
|---|
| 1093 | 1207 | pr_err("%s: Cannot find PCI bus for " |
|---|