.. | .. |
---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-or-later |
---|
1 | 2 | /* |
---|
2 | 3 | * PCI Error Recovery Driver for RPA-compliant PPC64 platform. |
---|
3 | 4 | * Copyright IBM Corp. 2004 2005 |
---|
4 | 5 | * Copyright Linas Vepstas <linas@linas.org> 2004, 2005 |
---|
5 | | - * |
---|
6 | | - * All rights reserved. |
---|
7 | | - * |
---|
8 | | - * This program is free software; you can redistribute it and/or modify |
---|
9 | | - * it under the terms of the GNU General Public License as published by |
---|
10 | | - * the Free Software Foundation; either version 2 of the License, or (at |
---|
11 | | - * your option) any later version. |
---|
12 | | - * |
---|
13 | | - * This program is distributed in the hope that it will be useful, but |
---|
14 | | - * WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
15 | | - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or |
---|
16 | | - * NON INFRINGEMENT. See the GNU General Public License for more |
---|
17 | | - * details. |
---|
18 | | - * |
---|
19 | | - * You should have received a copy of the GNU General Public License |
---|
20 | | - * along with this program; if not, write to the Free Software |
---|
21 | | - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
---|
22 | 6 | * |
---|
23 | 7 | * Send comments and feedback to Linas Vepstas <linas@austin.ibm.com> |
---|
24 | 8 | */ |
---|
.. | .. |
---|
27 | 11 | #include <linux/irq.h> |
---|
28 | 12 | #include <linux/module.h> |
---|
29 | 13 | #include <linux/pci.h> |
---|
| 14 | +#include <linux/pci_hotplug.h> |
---|
30 | 15 | #include <asm/eeh.h> |
---|
31 | 16 | #include <asm/eeh_event.h> |
---|
32 | 17 | #include <asm/ppc-pci.h> |
---|
.. | .. |
---|
35 | 20 | #include <asm/rtas.h> |
---|
36 | 21 | |
---|
37 | 22 | struct eeh_rmv_data { |
---|
38 | | - struct list_head edev_list; |
---|
39 | | - int removed; |
---|
| 23 | + struct list_head removed_vf_list; |
---|
| 24 | + int removed_dev_count; |
---|
40 | 25 | }; |
---|
41 | 26 | |
---|
42 | 27 | static int eeh_result_priority(enum pci_ers_result result) |
---|
.. | .. |
---|
60 | 45 | } |
---|
61 | 46 | }; |
---|
62 | 47 | |
---|
63 | | -const char *pci_ers_result_name(enum pci_ers_result result) |
---|
| 48 | +static const char *pci_ers_result_name(enum pci_ers_result result) |
---|
64 | 49 | { |
---|
65 | 50 | switch (result) { |
---|
66 | 51 | case PCI_ERS_RESULT_NONE: |
---|
.. | .. |
---|
81 | 66 | } |
---|
82 | 67 | }; |
---|
83 | 68 | |
---|
84 | | -static __printf(2, 3) void eeh_edev_info(const struct eeh_dev *edev, |
---|
85 | | - const char *fmt, ...) |
---|
86 | | -{ |
---|
87 | | - struct va_format vaf; |
---|
88 | | - va_list args; |
---|
89 | | - |
---|
90 | | - va_start(args, fmt); |
---|
91 | | - |
---|
92 | | - vaf.fmt = fmt; |
---|
93 | | - vaf.va = &args; |
---|
94 | | - |
---|
95 | | - printk(KERN_INFO "EEH: PE#%x (PCI %s): %pV\n", edev->pe_config_addr, |
---|
96 | | - edev->pdev ? dev_name(&edev->pdev->dev) : "none", &vaf); |
---|
97 | | - |
---|
98 | | - va_end(args); |
---|
99 | | -} |
---|
100 | | - |
---|
101 | 69 | static enum pci_ers_result pci_ers_merge_result(enum pci_ers_result old, |
---|
102 | 70 | enum pci_ers_result new) |
---|
103 | 71 | { |
---|
.. | .. |
---|
113 | 81 | |
---|
114 | 82 | static bool eeh_edev_actionable(struct eeh_dev *edev) |
---|
115 | 83 | { |
---|
116 | | - return (edev->pdev && !eeh_dev_removed(edev) && |
---|
117 | | - !eeh_pe_passed(edev->pe)); |
---|
| 84 | + if (!edev->pdev) |
---|
| 85 | + return false; |
---|
| 86 | + if (edev->pdev->error_state == pci_channel_io_perm_failure) |
---|
| 87 | + return false; |
---|
| 88 | + if (eeh_dev_removed(edev)) |
---|
| 89 | + return false; |
---|
| 90 | + if (eeh_pe_passed(edev->pe)) |
---|
| 91 | + return false; |
---|
| 92 | + |
---|
| 93 | + return true; |
---|
118 | 94 | } |
---|
119 | 95 | |
---|
120 | 96 | /** |
---|
.. | .. |
---|
214 | 190 | } |
---|
215 | 191 | } |
---|
216 | 192 | |
---|
217 | | -static void *eeh_dev_save_state(struct eeh_dev *edev, void *userdata) |
---|
| 193 | +static void eeh_dev_save_state(struct eeh_dev *edev, void *userdata) |
---|
218 | 194 | { |
---|
219 | 195 | struct pci_dev *pdev; |
---|
220 | 196 | |
---|
221 | 197 | if (!edev) |
---|
222 | | - return NULL; |
---|
| 198 | + return; |
---|
223 | 199 | |
---|
224 | 200 | /* |
---|
225 | 201 | * We cannot access the config space on some adapters. |
---|
.. | .. |
---|
229 | 205 | * device is created. |
---|
230 | 206 | */ |
---|
231 | 207 | if (edev->pe && (edev->pe->state & EEH_PE_CFG_RESTRICTED)) |
---|
232 | | - return NULL; |
---|
| 208 | + return; |
---|
233 | 209 | |
---|
234 | 210 | pdev = eeh_dev_to_pci_dev(edev); |
---|
235 | 211 | if (!pdev) |
---|
236 | | - return NULL; |
---|
| 212 | + return; |
---|
237 | 213 | |
---|
238 | 214 | pci_save_state(pdev); |
---|
239 | | - return NULL; |
---|
240 | 215 | } |
---|
241 | 216 | |
---|
242 | | -static void eeh_set_channel_state(struct eeh_pe *root, enum pci_channel_state s) |
---|
| 217 | +static void eeh_set_channel_state(struct eeh_pe *root, pci_channel_state_t s) |
---|
243 | 218 | { |
---|
244 | 219 | struct eeh_pe *pe; |
---|
245 | 220 | struct eeh_dev *edev, *tmp; |
---|
.. | .. |
---|
274 | 249 | } |
---|
275 | 250 | |
---|
276 | 251 | typedef enum pci_ers_result (*eeh_report_fn)(struct eeh_dev *, |
---|
| 252 | + struct pci_dev *, |
---|
277 | 253 | struct pci_driver *); |
---|
278 | 254 | static void eeh_pe_report_edev(struct eeh_dev *edev, eeh_report_fn fn, |
---|
279 | 255 | enum pci_ers_result *result) |
---|
280 | 256 | { |
---|
| 257 | + struct pci_dev *pdev; |
---|
281 | 258 | struct pci_driver *driver; |
---|
282 | 259 | enum pci_ers_result new_result; |
---|
283 | 260 | |
---|
284 | | - if (!edev->pdev) { |
---|
| 261 | + pci_lock_rescan_remove(); |
---|
| 262 | + pdev = edev->pdev; |
---|
| 263 | + if (pdev) |
---|
| 264 | + get_device(&pdev->dev); |
---|
| 265 | + pci_unlock_rescan_remove(); |
---|
| 266 | + if (!pdev) { |
---|
285 | 267 | eeh_edev_info(edev, "no device"); |
---|
286 | 268 | return; |
---|
287 | 269 | } |
---|
288 | | - device_lock(&edev->pdev->dev); |
---|
| 270 | + device_lock(&pdev->dev); |
---|
289 | 271 | if (eeh_edev_actionable(edev)) { |
---|
290 | | - driver = eeh_pcid_get(edev->pdev); |
---|
| 272 | + driver = eeh_pcid_get(pdev); |
---|
291 | 273 | |
---|
292 | 274 | if (!driver) |
---|
293 | 275 | eeh_edev_info(edev, "no driver"); |
---|
.. | .. |
---|
296 | 278 | else if (edev->mode & EEH_DEV_NO_HANDLER) |
---|
297 | 279 | eeh_edev_info(edev, "driver bound too late"); |
---|
298 | 280 | else { |
---|
299 | | - new_result = fn(edev, driver); |
---|
| 281 | + new_result = fn(edev, pdev, driver); |
---|
300 | 282 | eeh_edev_info(edev, "%s driver reports: '%s'", |
---|
301 | 283 | driver->name, |
---|
302 | 284 | pci_ers_result_name(new_result)); |
---|
.. | .. |
---|
305 | 287 | new_result); |
---|
306 | 288 | } |
---|
307 | 289 | if (driver) |
---|
308 | | - eeh_pcid_put(edev->pdev); |
---|
| 290 | + eeh_pcid_put(pdev); |
---|
309 | 291 | } else { |
---|
310 | | - eeh_edev_info(edev, "not actionable (%d,%d,%d)", !!edev->pdev, |
---|
| 292 | + eeh_edev_info(edev, "not actionable (%d,%d,%d)", !!pdev, |
---|
311 | 293 | !eeh_dev_removed(edev), !eeh_pe_passed(edev->pe)); |
---|
312 | 294 | } |
---|
313 | | - device_unlock(&edev->pdev->dev); |
---|
| 295 | + device_unlock(&pdev->dev); |
---|
| 296 | + if (edev->pdev != pdev) |
---|
| 297 | + eeh_edev_warn(edev, "Device changed during processing!\n"); |
---|
| 298 | + put_device(&pdev->dev); |
---|
314 | 299 | } |
---|
315 | 300 | |
---|
316 | 301 | static void eeh_pe_report(const char *name, struct eeh_pe *root, |
---|
.. | .. |
---|
337 | 322 | * Report an EEH error to each device driver. |
---|
338 | 323 | */ |
---|
339 | 324 | static enum pci_ers_result eeh_report_error(struct eeh_dev *edev, |
---|
| 325 | + struct pci_dev *pdev, |
---|
340 | 326 | struct pci_driver *driver) |
---|
341 | 327 | { |
---|
342 | 328 | enum pci_ers_result rc; |
---|
343 | | - struct pci_dev *dev = edev->pdev; |
---|
344 | 329 | |
---|
345 | 330 | if (!driver->err_handler->error_detected) |
---|
346 | 331 | return PCI_ERS_RESULT_NONE; |
---|
347 | 332 | |
---|
348 | 333 | eeh_edev_info(edev, "Invoking %s->error_detected(IO frozen)", |
---|
349 | 334 | driver->name); |
---|
350 | | - rc = driver->err_handler->error_detected(dev, pci_channel_io_frozen); |
---|
| 335 | + rc = driver->err_handler->error_detected(pdev, pci_channel_io_frozen); |
---|
351 | 336 | |
---|
352 | 337 | edev->in_error = true; |
---|
353 | | - pci_uevent_ers(dev, PCI_ERS_RESULT_NONE); |
---|
| 338 | + pci_uevent_ers(pdev, PCI_ERS_RESULT_NONE); |
---|
354 | 339 | return rc; |
---|
355 | 340 | } |
---|
356 | 341 | |
---|
.. | .. |
---|
363 | 348 | * are now enabled. |
---|
364 | 349 | */ |
---|
365 | 350 | static enum pci_ers_result eeh_report_mmio_enabled(struct eeh_dev *edev, |
---|
| 351 | + struct pci_dev *pdev, |
---|
366 | 352 | struct pci_driver *driver) |
---|
367 | 353 | { |
---|
368 | 354 | if (!driver->err_handler->mmio_enabled) |
---|
369 | 355 | return PCI_ERS_RESULT_NONE; |
---|
370 | 356 | eeh_edev_info(edev, "Invoking %s->mmio_enabled()", driver->name); |
---|
371 | | - return driver->err_handler->mmio_enabled(edev->pdev); |
---|
| 357 | + return driver->err_handler->mmio_enabled(pdev); |
---|
372 | 358 | } |
---|
373 | 359 | |
---|
374 | 360 | /** |
---|
.. | .. |
---|
382 | 368 | * driver can work again while the device is recovered. |
---|
383 | 369 | */ |
---|
384 | 370 | static enum pci_ers_result eeh_report_reset(struct eeh_dev *edev, |
---|
| 371 | + struct pci_dev *pdev, |
---|
385 | 372 | struct pci_driver *driver) |
---|
386 | 373 | { |
---|
387 | 374 | if (!driver->err_handler->slot_reset || !edev->in_error) |
---|
388 | 375 | return PCI_ERS_RESULT_NONE; |
---|
389 | 376 | eeh_edev_info(edev, "Invoking %s->slot_reset()", driver->name); |
---|
390 | | - return driver->err_handler->slot_reset(edev->pdev); |
---|
| 377 | + return driver->err_handler->slot_reset(pdev); |
---|
391 | 378 | } |
---|
392 | 379 | |
---|
393 | | -static void *eeh_dev_restore_state(struct eeh_dev *edev, void *userdata) |
---|
| 380 | +static void eeh_dev_restore_state(struct eeh_dev *edev, void *userdata) |
---|
394 | 381 | { |
---|
395 | 382 | struct pci_dev *pdev; |
---|
396 | 383 | |
---|
397 | 384 | if (!edev) |
---|
398 | | - return NULL; |
---|
| 385 | + return; |
---|
399 | 386 | |
---|
400 | 387 | /* |
---|
401 | 388 | * The content in the config space isn't saved because |
---|
.. | .. |
---|
404 | 391 | * EEH device is created. |
---|
405 | 392 | */ |
---|
406 | 393 | if (edev->pe && (edev->pe->state & EEH_PE_CFG_RESTRICTED)) { |
---|
407 | | - if (list_is_last(&edev->list, &edev->pe->edevs)) |
---|
| 394 | + if (list_is_last(&edev->entry, &edev->pe->edevs)) |
---|
408 | 395 | eeh_pe_restore_bars(edev->pe); |
---|
409 | 396 | |
---|
410 | | - return NULL; |
---|
| 397 | + return; |
---|
411 | 398 | } |
---|
412 | 399 | |
---|
413 | 400 | pdev = eeh_dev_to_pci_dev(edev); |
---|
414 | 401 | if (!pdev) |
---|
415 | | - return NULL; |
---|
| 402 | + return; |
---|
416 | 403 | |
---|
417 | 404 | pci_restore_state(pdev); |
---|
418 | | - return NULL; |
---|
419 | 405 | } |
---|
420 | 406 | |
---|
421 | 407 | /** |
---|
.. | .. |
---|
428 | 414 | * to make the recovered device work again. |
---|
429 | 415 | */ |
---|
430 | 416 | static enum pci_ers_result eeh_report_resume(struct eeh_dev *edev, |
---|
| 417 | + struct pci_dev *pdev, |
---|
431 | 418 | struct pci_driver *driver) |
---|
432 | 419 | { |
---|
433 | 420 | if (!driver->err_handler->resume || !edev->in_error) |
---|
434 | 421 | return PCI_ERS_RESULT_NONE; |
---|
435 | 422 | |
---|
436 | 423 | eeh_edev_info(edev, "Invoking %s->resume()", driver->name); |
---|
437 | | - driver->err_handler->resume(edev->pdev); |
---|
| 424 | + driver->err_handler->resume(pdev); |
---|
438 | 425 | |
---|
439 | 426 | pci_uevent_ers(edev->pdev, PCI_ERS_RESULT_RECOVERED); |
---|
440 | 427 | #ifdef CONFIG_PCI_IOV |
---|
441 | | - if (eeh_ops->notify_resume && eeh_dev_to_pdn(edev)) |
---|
442 | | - eeh_ops->notify_resume(eeh_dev_to_pdn(edev)); |
---|
| 428 | + if (eeh_ops->notify_resume) |
---|
| 429 | + eeh_ops->notify_resume(edev); |
---|
443 | 430 | #endif |
---|
444 | 431 | return PCI_ERS_RESULT_NONE; |
---|
445 | 432 | } |
---|
.. | .. |
---|
453 | 440 | * dead, and that no further recovery attempts will be made on it. |
---|
454 | 441 | */ |
---|
455 | 442 | static enum pci_ers_result eeh_report_failure(struct eeh_dev *edev, |
---|
| 443 | + struct pci_dev *pdev, |
---|
456 | 444 | struct pci_driver *driver) |
---|
457 | 445 | { |
---|
458 | 446 | enum pci_ers_result rc; |
---|
.. | .. |
---|
462 | 450 | |
---|
463 | 451 | eeh_edev_info(edev, "Invoking %s->error_detected(permanent failure)", |
---|
464 | 452 | driver->name); |
---|
465 | | - rc = driver->err_handler->error_detected(edev->pdev, |
---|
| 453 | + rc = driver->err_handler->error_detected(pdev, |
---|
466 | 454 | pci_channel_io_perm_failure); |
---|
467 | 455 | |
---|
468 | | - pci_uevent_ers(edev->pdev, PCI_ERS_RESULT_DISCONNECT); |
---|
| 456 | + pci_uevent_ers(pdev, PCI_ERS_RESULT_DISCONNECT); |
---|
469 | 457 | return rc; |
---|
470 | 458 | } |
---|
471 | 459 | |
---|
472 | | -static void *eeh_add_virt_device(void *data, void *userdata) |
---|
| 460 | +static void *eeh_add_virt_device(struct eeh_dev *edev) |
---|
473 | 461 | { |
---|
474 | 462 | struct pci_driver *driver; |
---|
475 | | - struct eeh_dev *edev = (struct eeh_dev *)data; |
---|
476 | 463 | struct pci_dev *dev = eeh_dev_to_pci_dev(edev); |
---|
477 | | - struct pci_dn *pdn = eeh_dev_to_pdn(edev); |
---|
478 | 464 | |
---|
479 | 465 | if (!(edev->physfn)) { |
---|
480 | | - pr_warn("%s: EEH dev %04x:%02x:%02x.%01x not for VF\n", |
---|
481 | | - __func__, pdn->phb->global_number, pdn->busno, |
---|
482 | | - PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn)); |
---|
| 466 | + eeh_edev_warn(edev, "Not for VF\n"); |
---|
483 | 467 | return NULL; |
---|
484 | 468 | } |
---|
485 | 469 | |
---|
.. | .. |
---|
493 | 477 | } |
---|
494 | 478 | |
---|
495 | 479 | #ifdef CONFIG_PCI_IOV |
---|
496 | | - pci_iov_add_virtfn(edev->physfn, pdn->vf_index); |
---|
| 480 | + pci_iov_add_virtfn(edev->physfn, edev->vf_index); |
---|
497 | 481 | #endif |
---|
498 | 482 | return NULL; |
---|
499 | 483 | } |
---|
500 | 484 | |
---|
501 | | -static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata) |
---|
| 485 | +static void eeh_rmv_device(struct eeh_dev *edev, void *userdata) |
---|
502 | 486 | { |
---|
503 | 487 | struct pci_driver *driver; |
---|
504 | 488 | struct pci_dev *dev = eeh_dev_to_pci_dev(edev); |
---|
505 | 489 | struct eeh_rmv_data *rmv_data = (struct eeh_rmv_data *)userdata; |
---|
506 | | - int *removed = rmv_data ? &rmv_data->removed : NULL; |
---|
507 | 490 | |
---|
508 | 491 | /* |
---|
509 | 492 | * Actually, we should remove the PCI bridges as well. |
---|
.. | .. |
---|
512 | 495 | * support EEH. So we just care about PCI devices for |
---|
513 | 496 | * simplicity here. |
---|
514 | 497 | */ |
---|
515 | | - if (!dev || (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)) |
---|
516 | | - return NULL; |
---|
| 498 | + if (!eeh_edev_actionable(edev) || |
---|
| 499 | + (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)) |
---|
| 500 | + return; |
---|
517 | 501 | |
---|
518 | | - /* |
---|
519 | | - * We rely on count-based pcibios_release_device() to |
---|
520 | | - * detach permanently offlined PEs. Unfortunately, that's |
---|
521 | | - * not reliable enough. We might have the permanently |
---|
522 | | - * offlined PEs attached, but we needn't take care of |
---|
523 | | - * them and their child devices. |
---|
524 | | - */ |
---|
525 | | - if (eeh_dev_removed(edev)) |
---|
526 | | - return NULL; |
---|
527 | | - |
---|
528 | | - if (removed) { |
---|
529 | | - if (eeh_pe_passed(edev->pe)) |
---|
530 | | - return NULL; |
---|
| 502 | + if (rmv_data) { |
---|
531 | 503 | driver = eeh_pcid_get(dev); |
---|
532 | 504 | if (driver) { |
---|
533 | 505 | if (driver->err_handler && |
---|
534 | 506 | driver->err_handler->error_detected && |
---|
535 | 507 | driver->err_handler->slot_reset) { |
---|
536 | 508 | eeh_pcid_put(dev); |
---|
537 | | - return NULL; |
---|
| 509 | + return; |
---|
538 | 510 | } |
---|
539 | 511 | eeh_pcid_put(dev); |
---|
540 | 512 | } |
---|
541 | 513 | } |
---|
542 | 514 | |
---|
543 | 515 | /* Remove it from PCI subsystem */ |
---|
544 | | - pr_debug("EEH: Removing %s without EEH sensitive driver\n", |
---|
545 | | - pci_name(dev)); |
---|
546 | | - edev->bus = dev->bus; |
---|
| 516 | + pr_info("EEH: Removing %s without EEH sensitive driver\n", |
---|
| 517 | + pci_name(dev)); |
---|
547 | 518 | edev->mode |= EEH_DEV_DISCONNECTED; |
---|
548 | | - if (removed) |
---|
549 | | - (*removed)++; |
---|
| 519 | + if (rmv_data) |
---|
| 520 | + rmv_data->removed_dev_count++; |
---|
550 | 521 | |
---|
551 | 522 | if (edev->physfn) { |
---|
552 | 523 | #ifdef CONFIG_PCI_IOV |
---|
553 | | - struct pci_dn *pdn = eeh_dev_to_pdn(edev); |
---|
554 | | - |
---|
555 | | - pci_iov_remove_virtfn(edev->physfn, pdn->vf_index); |
---|
| 524 | + pci_iov_remove_virtfn(edev->physfn, edev->vf_index); |
---|
556 | 525 | edev->pdev = NULL; |
---|
557 | 526 | #endif |
---|
558 | 527 | if (rmv_data) |
---|
559 | | - list_add(&edev->rmv_list, &rmv_data->edev_list); |
---|
| 528 | + list_add(&edev->rmv_entry, &rmv_data->removed_vf_list); |
---|
560 | 529 | } else { |
---|
561 | 530 | pci_lock_rescan_remove(); |
---|
562 | 531 | pci_stop_and_remove_bus_device(dev); |
---|
563 | 532 | pci_unlock_rescan_remove(); |
---|
564 | 533 | } |
---|
565 | | - |
---|
566 | | - return NULL; |
---|
567 | 534 | } |
---|
568 | 535 | |
---|
569 | 536 | static void *eeh_pe_detach_dev(struct eeh_pe *pe, void *userdata) |
---|
.. | .. |
---|
575 | 542 | continue; |
---|
576 | 543 | |
---|
577 | 544 | edev->mode &= ~(EEH_DEV_DISCONNECTED | EEH_DEV_IRQ_DISABLED); |
---|
578 | | - eeh_rmv_from_parent_pe(edev); |
---|
| 545 | + eeh_pe_tree_remove(edev); |
---|
579 | 546 | } |
---|
580 | 547 | |
---|
581 | 548 | return NULL; |
---|
.. | .. |
---|
588 | 555 | * PE reset (for 3 times), we try to clear the frozen state |
---|
589 | 556 | * for 3 times as well. |
---|
590 | 557 | */ |
---|
591 | | -static void *__eeh_clear_pe_frozen_state(struct eeh_pe *pe, void *flag) |
---|
| 558 | +static int eeh_clear_pe_frozen_state(struct eeh_pe *root, bool include_passed) |
---|
592 | 559 | { |
---|
593 | | - bool clear_sw_state = *(bool *)flag; |
---|
594 | | - int i, rc = 1; |
---|
| 560 | + struct eeh_pe *pe; |
---|
| 561 | + int i; |
---|
595 | 562 | |
---|
596 | | - for (i = 0; rc && i < 3; i++) |
---|
597 | | - rc = eeh_unfreeze_pe(pe, clear_sw_state); |
---|
598 | | - |
---|
599 | | - /* Stop immediately on any errors */ |
---|
600 | | - if (rc) { |
---|
601 | | - pr_warn("%s: Failure %d unfreezing PHB#%x-PE#%x\n", |
---|
602 | | - __func__, rc, pe->phb->global_number, pe->addr); |
---|
603 | | - return (void *)pe; |
---|
| 563 | + eeh_for_each_pe(root, pe) { |
---|
| 564 | + if (include_passed || !eeh_pe_passed(pe)) { |
---|
| 565 | + for (i = 0; i < 3; i++) |
---|
| 566 | + if (!eeh_unfreeze_pe(pe)) |
---|
| 567 | + break; |
---|
| 568 | + if (i >= 3) |
---|
| 569 | + return -EIO; |
---|
| 570 | + } |
---|
604 | 571 | } |
---|
605 | | - |
---|
606 | | - return NULL; |
---|
607 | | -} |
---|
608 | | - |
---|
609 | | -static int eeh_clear_pe_frozen_state(struct eeh_pe *pe, |
---|
610 | | - bool clear_sw_state) |
---|
611 | | -{ |
---|
612 | | - void *rc; |
---|
613 | | - |
---|
614 | | - rc = eeh_pe_traverse(pe, __eeh_clear_pe_frozen_state, &clear_sw_state); |
---|
615 | | - if (!rc) |
---|
616 | | - eeh_pe_state_clear(pe, EEH_PE_ISOLATED); |
---|
617 | | - |
---|
618 | | - return rc ? -EIO : 0; |
---|
| 572 | + eeh_pe_state_clear(root, EEH_PE_ISOLATED, include_passed); |
---|
| 573 | + return 0; |
---|
619 | 574 | } |
---|
620 | 575 | |
---|
621 | 576 | int eeh_pe_reset_and_recover(struct eeh_pe *pe) |
---|
.. | .. |
---|
633 | 588 | eeh_pe_dev_traverse(pe, eeh_dev_save_state, NULL); |
---|
634 | 589 | |
---|
635 | 590 | /* Issue reset */ |
---|
636 | | - ret = eeh_pe_reset_full(pe); |
---|
| 591 | + ret = eeh_pe_reset_full(pe, true); |
---|
637 | 592 | if (ret) { |
---|
638 | | - eeh_pe_state_clear(pe, EEH_PE_RECOVERING); |
---|
| 593 | + eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true); |
---|
639 | 594 | return ret; |
---|
640 | 595 | } |
---|
641 | 596 | |
---|
642 | 597 | /* Unfreeze the PE */ |
---|
643 | 598 | ret = eeh_clear_pe_frozen_state(pe, true); |
---|
644 | 599 | if (ret) { |
---|
645 | | - eeh_pe_state_clear(pe, EEH_PE_RECOVERING); |
---|
| 600 | + eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true); |
---|
646 | 601 | return ret; |
---|
647 | 602 | } |
---|
648 | 603 | |
---|
.. | .. |
---|
650 | 605 | eeh_pe_dev_traverse(pe, eeh_dev_restore_state, NULL); |
---|
651 | 606 | |
---|
652 | 607 | /* Clear recovery mode */ |
---|
653 | | - eeh_pe_state_clear(pe, EEH_PE_RECOVERING); |
---|
| 608 | + eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true); |
---|
654 | 609 | |
---|
655 | 610 | return 0; |
---|
656 | 611 | } |
---|
.. | .. |
---|
673 | 628 | time64_t tstamp; |
---|
674 | 629 | int cnt, rc; |
---|
675 | 630 | struct eeh_dev *edev; |
---|
| 631 | + struct eeh_pe *tmp_pe; |
---|
| 632 | + bool any_passed = false; |
---|
| 633 | + |
---|
| 634 | + eeh_for_each_pe(pe, tmp_pe) |
---|
| 635 | + any_passed |= eeh_pe_passed(tmp_pe); |
---|
676 | 636 | |
---|
677 | 637 | /* pcibios will clear the counter; save the value */ |
---|
678 | 638 | cnt = pe->freeze_count; |
---|
.. | .. |
---|
685 | 645 | * into pci_hp_add_devices(). |
---|
686 | 646 | */ |
---|
687 | 647 | eeh_pe_state_mark(pe, EEH_PE_KEEP); |
---|
688 | | - if (driver_eeh_aware || (pe->type & EEH_PE_VF)) { |
---|
| 648 | + if (any_passed || driver_eeh_aware || (pe->type & EEH_PE_VF)) { |
---|
689 | 649 | eeh_pe_dev_traverse(pe, eeh_rmv_device, rmv_data); |
---|
690 | 650 | } else { |
---|
691 | 651 | pci_lock_rescan_remove(); |
---|
.. | .. |
---|
702 | 662 | * config accesses. So we prefer to block them. However, controlled |
---|
703 | 663 | * PCI config accesses initiated from EEH itself are allowed. |
---|
704 | 664 | */ |
---|
705 | | - rc = eeh_pe_reset_full(pe); |
---|
| 665 | + rc = eeh_pe_reset_full(pe, false); |
---|
706 | 666 | if (rc) |
---|
707 | 667 | return rc; |
---|
708 | 668 | |
---|
.. | .. |
---|
725 | 685 | * the device up before the scripts have taken it down, |
---|
726 | 686 | * potentially weird things happen. |
---|
727 | 687 | */ |
---|
728 | | - if (!driver_eeh_aware || rmv_data->removed) { |
---|
| 688 | + if (!driver_eeh_aware || rmv_data->removed_dev_count) { |
---|
729 | 689 | pr_info("EEH: Sleep 5s ahead of %s hotplug\n", |
---|
730 | 690 | (driver_eeh_aware ? "partial" : "complete")); |
---|
731 | 691 | ssleep(5); |
---|
.. | .. |
---|
735 | 695 | * PE. We should disconnect it so the binding can be |
---|
736 | 696 | * rebuilt when adding PCI devices. |
---|
737 | 697 | */ |
---|
738 | | - edev = list_first_entry(&pe->edevs, struct eeh_dev, list); |
---|
| 698 | + edev = list_first_entry(&pe->edevs, struct eeh_dev, entry); |
---|
739 | 699 | eeh_pe_traverse(pe, eeh_pe_detach_dev, NULL); |
---|
740 | 700 | if (pe->type & EEH_PE_VF) { |
---|
741 | | - eeh_add_virt_device(edev, NULL); |
---|
| 701 | + eeh_add_virt_device(edev); |
---|
742 | 702 | } else { |
---|
743 | 703 | if (!driver_eeh_aware) |
---|
744 | | - eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); |
---|
| 704 | + eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true); |
---|
745 | 705 | pci_hp_add_devices(bus); |
---|
746 | 706 | } |
---|
747 | 707 | } |
---|
748 | | - eeh_pe_state_clear(pe, EEH_PE_KEEP); |
---|
| 708 | + eeh_pe_state_clear(pe, EEH_PE_KEEP, true); |
---|
749 | 709 | |
---|
750 | 710 | pe->tstamp = tstamp; |
---|
751 | 711 | pe->freeze_count = cnt; |
---|
.. | .. |
---|
758 | 718 | * to come back on line, in seconds. |
---|
759 | 719 | */ |
---|
760 | 720 | #define MAX_WAIT_FOR_RECOVERY 300 |
---|
| 721 | + |
---|
| 722 | + |
---|
| 723 | +/* Walks the PE tree after processing an event to remove any stale PEs. |
---|
| 724 | + * |
---|
| 725 | + * NB: This needs to be recursive to ensure the leaf PEs get removed |
---|
| 726 | + * before their parents do. Although this is possible to do iteratively |
---|
| 727 | + * we don't since this is easier to read and we need to guarantee |
---|
| 728 | + * the leaf nodes will be handled first. |
---|
| 729 | + */ |
---|
| 730 | +static void eeh_pe_cleanup(struct eeh_pe *pe) |
---|
| 731 | +{ |
---|
| 732 | + struct eeh_pe *child_pe, *tmp; |
---|
| 733 | + |
---|
| 734 | + list_for_each_entry_safe(child_pe, tmp, &pe->child_list, child) |
---|
| 735 | + eeh_pe_cleanup(child_pe); |
---|
| 736 | + |
---|
| 737 | + if (pe->state & EEH_PE_KEEP) |
---|
| 738 | + return; |
---|
| 739 | + |
---|
| 740 | + if (!(pe->state & EEH_PE_INVALID)) |
---|
| 741 | + return; |
---|
| 742 | + |
---|
| 743 | + if (list_empty(&pe->edevs) && list_empty(&pe->child_list)) { |
---|
| 744 | + list_del(&pe->child); |
---|
| 745 | + kfree(pe); |
---|
| 746 | + } |
---|
| 747 | +} |
---|
| 748 | + |
---|
| 749 | +/** |
---|
| 750 | + * eeh_slot_presence_check - Check if a device is still present in a slot |
---|
| 751 | + * @pdev: pci_dev to check |
---|
| 752 | + * |
---|
| 753 | + * This function may return a false positive if we can't determine the slot's |
---|
| 754 | + * presence state. This might happen for PCIe slots if the PE containing |
---|
| 755 | + * the upstream bridge is also frozen, or the bridge is part of the same PE |
---|
| 756 | + * as the device. |
---|
| 757 | + * |
---|
| 758 | + * This shouldn't happen often, but you might see it if you hotplug a PCIe |
---|
| 759 | + * switch. |
---|
| 760 | + */ |
---|
| 761 | +static bool eeh_slot_presence_check(struct pci_dev *pdev) |
---|
| 762 | +{ |
---|
| 763 | + const struct hotplug_slot_ops *ops; |
---|
| 764 | + struct pci_slot *slot; |
---|
| 765 | + u8 state; |
---|
| 766 | + int rc; |
---|
| 767 | + |
---|
| 768 | + if (!pdev) |
---|
| 769 | + return false; |
---|
| 770 | + |
---|
| 771 | + if (pdev->error_state == pci_channel_io_perm_failure) |
---|
| 772 | + return false; |
---|
| 773 | + |
---|
| 774 | + slot = pdev->slot; |
---|
| 775 | + if (!slot || !slot->hotplug) |
---|
| 776 | + return true; |
---|
| 777 | + |
---|
| 778 | + ops = slot->hotplug->ops; |
---|
| 779 | + if (!ops || !ops->get_adapter_status) |
---|
| 780 | + return true; |
---|
| 781 | + |
---|
| 782 | + /* set the attention indicator while we've got the slot ops */ |
---|
| 783 | + if (ops->set_attention_status) |
---|
| 784 | + ops->set_attention_status(slot->hotplug, 1); |
---|
| 785 | + |
---|
| 786 | + rc = ops->get_adapter_status(slot->hotplug, &state); |
---|
| 787 | + if (rc) |
---|
| 788 | + return true; |
---|
| 789 | + |
---|
| 790 | + return !!state; |
---|
| 791 | +} |
---|
| 792 | + |
---|
| 793 | +static void eeh_clear_slot_attention(struct pci_dev *pdev) |
---|
| 794 | +{ |
---|
| 795 | + const struct hotplug_slot_ops *ops; |
---|
| 796 | + struct pci_slot *slot; |
---|
| 797 | + |
---|
| 798 | + if (!pdev) |
---|
| 799 | + return; |
---|
| 800 | + |
---|
| 801 | + if (pdev->error_state == pci_channel_io_perm_failure) |
---|
| 802 | + return; |
---|
| 803 | + |
---|
| 804 | + slot = pdev->slot; |
---|
| 805 | + if (!slot || !slot->hotplug) |
---|
| 806 | + return; |
---|
| 807 | + |
---|
| 808 | + ops = slot->hotplug->ops; |
---|
| 809 | + if (!ops || !ops->set_attention_status) |
---|
| 810 | + return; |
---|
| 811 | + |
---|
| 812 | + ops->set_attention_status(slot->hotplug, 0); |
---|
| 813 | +} |
---|
761 | 814 | |
---|
762 | 815 | /** |
---|
763 | 816 | * eeh_handle_normal_event - Handle EEH events on a specific PE |
---|
.. | .. |
---|
787 | 840 | struct eeh_pe *tmp_pe; |
---|
788 | 841 | int rc = 0; |
---|
789 | 842 | enum pci_ers_result result = PCI_ERS_RESULT_NONE; |
---|
790 | | - struct eeh_rmv_data rmv_data = {LIST_HEAD_INIT(rmv_data.edev_list), 0}; |
---|
| 843 | + struct eeh_rmv_data rmv_data = |
---|
| 844 | + {LIST_HEAD_INIT(rmv_data.removed_vf_list), 0}; |
---|
| 845 | + int devices = 0; |
---|
791 | 846 | |
---|
792 | 847 | bus = eeh_pe_bus_get(pe); |
---|
793 | 848 | if (!bus) { |
---|
.. | .. |
---|
796 | 851 | return; |
---|
797 | 852 | } |
---|
798 | 853 | |
---|
799 | | - eeh_pe_state_mark(pe, EEH_PE_RECOVERING); |
---|
| 854 | + /* |
---|
| 855 | + * When devices are hot-removed we might get an EEH due to |
---|
| 856 | + * a driver attempting to touch the MMIO space of a removed |
---|
| 857 | + * device. In this case we don't have a device to recover |
---|
| 858 | + * so suppress the event if we can't find any present devices. |
---|
| 859 | + * |
---|
| 860 | + * The hotplug driver should take care of tearing down the |
---|
| 861 | + * device itself. |
---|
| 862 | + */ |
---|
| 863 | + eeh_for_each_pe(pe, tmp_pe) |
---|
| 864 | + eeh_pe_for_each_dev(tmp_pe, edev, tmp) |
---|
| 865 | + if (eeh_slot_presence_check(edev->pdev)) |
---|
| 866 | + devices++; |
---|
| 867 | + |
---|
| 868 | + if (!devices) { |
---|
| 869 | + pr_debug("EEH: Frozen PHB#%x-PE#%x is empty!\n", |
---|
| 870 | + pe->phb->global_number, pe->addr); |
---|
| 871 | + goto out; /* nothing to recover */ |
---|
| 872 | + } |
---|
| 873 | + |
---|
| 874 | + /* Log the event */ |
---|
| 875 | + if (pe->type & EEH_PE_PHB) { |
---|
| 876 | + pr_err("EEH: Recovering PHB#%x, location: %s\n", |
---|
| 877 | + pe->phb->global_number, eeh_pe_loc_get(pe)); |
---|
| 878 | + } else { |
---|
| 879 | + struct eeh_pe *phb_pe = eeh_phb_pe_get(pe->phb); |
---|
| 880 | + |
---|
| 881 | + pr_err("EEH: Recovering PHB#%x-PE#%x\n", |
---|
| 882 | + pe->phb->global_number, pe->addr); |
---|
| 883 | + pr_err("EEH: PE location: %s, PHB location: %s\n", |
---|
| 884 | + eeh_pe_loc_get(pe), eeh_pe_loc_get(phb_pe)); |
---|
| 885 | + } |
---|
| 886 | + |
---|
| 887 | +#ifdef CONFIG_STACKTRACE |
---|
| 888 | + /* |
---|
| 889 | + * Print the saved stack trace now that we've verified there's |
---|
| 890 | + * something to recover. |
---|
| 891 | + */ |
---|
| 892 | + if (pe->trace_entries) { |
---|
| 893 | + void **ptrs = (void **) pe->stack_trace; |
---|
| 894 | + int i; |
---|
| 895 | + |
---|
| 896 | + pr_err("EEH: Frozen PHB#%x-PE#%x detected\n", |
---|
| 897 | + pe->phb->global_number, pe->addr); |
---|
| 898 | + |
---|
| 899 | + /* FIXME: Use the same format as dump_stack() */ |
---|
| 900 | + pr_err("EEH: Call Trace:\n"); |
---|
| 901 | + for (i = 0; i < pe->trace_entries; i++) |
---|
| 902 | + pr_err("EEH: [%pK] %pS\n", ptrs[i], ptrs[i]); |
---|
| 903 | + |
---|
| 904 | + pe->trace_entries = 0; |
---|
| 905 | + } |
---|
| 906 | +#endif /* CONFIG_STACKTRACE */ |
---|
800 | 907 | |
---|
801 | 908 | eeh_pe_update_time_stamp(pe); |
---|
802 | 909 | pe->freeze_count++; |
---|
.. | .. |
---|
804 | 911 | pr_err("EEH: PHB#%x-PE#%x has failed %d times in the last hour and has been permanently disabled.\n", |
---|
805 | 912 | pe->phb->global_number, pe->addr, |
---|
806 | 913 | pe->freeze_count); |
---|
807 | | - goto hard_fail; |
---|
| 914 | + result = PCI_ERS_RESULT_DISCONNECT; |
---|
808 | 915 | } |
---|
809 | | - pr_warn("EEH: This PCI device has failed %d times in the last hour and will be permanently disabled after %d failures.\n", |
---|
810 | | - pe->freeze_count, eeh_max_freezes); |
---|
811 | 916 | |
---|
812 | 917 | eeh_for_each_pe(pe, tmp_pe) |
---|
813 | 918 | eeh_pe_for_each_dev(tmp_pe, edev, tmp) |
---|
.. | .. |
---|
823 | 928 | * the error. Override the result if necessary to have partially |
---|
824 | 929 | * hotplug for this case. |
---|
825 | 930 | */ |
---|
826 | | - pr_info("EEH: Notify device drivers to shutdown\n"); |
---|
827 | | - eeh_set_channel_state(pe, pci_channel_io_frozen); |
---|
828 | | - eeh_set_irq_state(pe, false); |
---|
829 | | - eeh_pe_report("error_detected(IO frozen)", pe, eeh_report_error, |
---|
830 | | - &result); |
---|
831 | | - if ((pe->type & EEH_PE_PHB) && |
---|
832 | | - result != PCI_ERS_RESULT_NONE && |
---|
833 | | - result != PCI_ERS_RESULT_NEED_RESET) |
---|
834 | | - result = PCI_ERS_RESULT_NEED_RESET; |
---|
| 931 | + if (result != PCI_ERS_RESULT_DISCONNECT) { |
---|
| 932 | + pr_warn("EEH: This PCI device has failed %d times in the last hour and will be permanently disabled after %d failures.\n", |
---|
| 933 | + pe->freeze_count, eeh_max_freezes); |
---|
| 934 | + pr_info("EEH: Notify device drivers to shutdown\n"); |
---|
| 935 | + eeh_set_channel_state(pe, pci_channel_io_frozen); |
---|
| 936 | + eeh_set_irq_state(pe, false); |
---|
| 937 | + eeh_pe_report("error_detected(IO frozen)", pe, |
---|
| 938 | + eeh_report_error, &result); |
---|
| 939 | + if ((pe->type & EEH_PE_PHB) && |
---|
| 940 | + result != PCI_ERS_RESULT_NONE && |
---|
| 941 | + result != PCI_ERS_RESULT_NEED_RESET) |
---|
| 942 | + result = PCI_ERS_RESULT_NEED_RESET; |
---|
| 943 | + } |
---|
835 | 944 | |
---|
836 | 945 | /* Get the current PCI slot state. This can take a long time, |
---|
837 | 946 | * sometimes over 300 seconds for certain systems. |
---|
838 | 947 | */ |
---|
839 | | - rc = eeh_ops->wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000); |
---|
840 | | - if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) { |
---|
841 | | - pr_warn("EEH: Permanent failure\n"); |
---|
842 | | - goto hard_fail; |
---|
| 948 | + if (result != PCI_ERS_RESULT_DISCONNECT) { |
---|
| 949 | + rc = eeh_wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000); |
---|
| 950 | + if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) { |
---|
| 951 | + pr_warn("EEH: Permanent failure\n"); |
---|
| 952 | + result = PCI_ERS_RESULT_DISCONNECT; |
---|
| 953 | + } |
---|
843 | 954 | } |
---|
844 | 955 | |
---|
845 | 956 | /* Since rtas may enable MMIO when posting the error log, |
---|
846 | 957 | * don't post the error log until after all dev drivers |
---|
847 | 958 | * have been informed. |
---|
848 | 959 | */ |
---|
849 | | - pr_info("EEH: Collect temporary log\n"); |
---|
850 | | - eeh_slot_error_detail(pe, EEH_LOG_TEMP); |
---|
| 960 | + if (result != PCI_ERS_RESULT_DISCONNECT) { |
---|
| 961 | + pr_info("EEH: Collect temporary log\n"); |
---|
| 962 | + eeh_slot_error_detail(pe, EEH_LOG_TEMP); |
---|
| 963 | + } |
---|
851 | 964 | |
---|
852 | 965 | /* If all device drivers were EEH-unaware, then shut |
---|
853 | 966 | * down all of the device drivers, and hope they |
---|
.. | .. |
---|
859 | 972 | if (rc) { |
---|
860 | 973 | pr_warn("%s: Unable to reset, err=%d\n", |
---|
861 | 974 | __func__, rc); |
---|
862 | | - goto hard_fail; |
---|
| 975 | + result = PCI_ERS_RESULT_DISCONNECT; |
---|
863 | 976 | } |
---|
864 | 977 | } |
---|
865 | 978 | |
---|
.. | .. |
---|
868 | 981 | pr_info("EEH: Enable I/O for affected devices\n"); |
---|
869 | 982 | rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO); |
---|
870 | 983 | |
---|
871 | | - if (rc < 0) |
---|
872 | | - goto hard_fail; |
---|
873 | | - if (rc) { |
---|
| 984 | + if (rc < 0) { |
---|
| 985 | + result = PCI_ERS_RESULT_DISCONNECT; |
---|
| 986 | + } else if (rc) { |
---|
874 | 987 | result = PCI_ERS_RESULT_NEED_RESET; |
---|
875 | 988 | } else { |
---|
876 | 989 | pr_info("EEH: Notify device drivers to resume I/O\n"); |
---|
.. | .. |
---|
884 | 997 | pr_info("EEH: Enabled DMA for affected devices\n"); |
---|
885 | 998 | rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA); |
---|
886 | 999 | |
---|
887 | | - if (rc < 0) |
---|
888 | | - goto hard_fail; |
---|
889 | | - if (rc) { |
---|
| 1000 | + if (rc < 0) { |
---|
| 1001 | + result = PCI_ERS_RESULT_DISCONNECT; |
---|
| 1002 | + } else if (rc) { |
---|
890 | 1003 | result = PCI_ERS_RESULT_NEED_RESET; |
---|
891 | 1004 | } else { |
---|
892 | 1005 | /* |
---|
.. | .. |
---|
894 | 1007 | * is still in frozen state. Clear it before |
---|
895 | 1008 | * resuming the PE. |
---|
896 | 1009 | */ |
---|
897 | | - eeh_pe_state_clear(pe, EEH_PE_ISOLATED); |
---|
| 1010 | + eeh_pe_state_clear(pe, EEH_PE_ISOLATED, true); |
---|
898 | 1011 | result = PCI_ERS_RESULT_RECOVERED; |
---|
899 | 1012 | } |
---|
900 | | - } |
---|
901 | | - |
---|
902 | | - /* If any device has a hard failure, then shut off everything. */ |
---|
903 | | - if (result == PCI_ERS_RESULT_DISCONNECT) { |
---|
904 | | - pr_warn("EEH: Device driver gave up\n"); |
---|
905 | | - goto hard_fail; |
---|
906 | 1013 | } |
---|
907 | 1014 | |
---|
908 | 1015 | /* If any device called out for a reset, then reset the slot */ |
---|
.. | .. |
---|
912 | 1019 | if (rc) { |
---|
913 | 1020 | pr_warn("%s: Cannot reset, err=%d\n", |
---|
914 | 1021 | __func__, rc); |
---|
915 | | - goto hard_fail; |
---|
| 1022 | + result = PCI_ERS_RESULT_DISCONNECT; |
---|
| 1023 | + } else { |
---|
| 1024 | + result = PCI_ERS_RESULT_NONE; |
---|
| 1025 | + eeh_set_channel_state(pe, pci_channel_io_normal); |
---|
| 1026 | + eeh_set_irq_state(pe, true); |
---|
| 1027 | + eeh_pe_report("slot_reset", pe, eeh_report_reset, |
---|
| 1028 | + &result); |
---|
| 1029 | + } |
---|
| 1030 | + } |
---|
| 1031 | + |
---|
| 1032 | + if ((result == PCI_ERS_RESULT_RECOVERED) || |
---|
| 1033 | + (result == PCI_ERS_RESULT_NONE)) { |
---|
| 1034 | + /* |
---|
| 1035 | + * For those hot-removed VFs, we should add them back after the PF |
---|
| 1036 | + * has recovered properly. |
---|
| 1037 | + */ |
---|
| 1038 | + list_for_each_entry_safe(edev, tmp, &rmv_data.removed_vf_list, |
---|
| 1039 | + rmv_entry) { |
---|
| 1040 | + eeh_add_virt_device(edev); |
---|
| 1041 | + list_del(&edev->rmv_entry); |
---|
916 | 1042 | } |
---|
917 | 1043 | |
---|
918 | | - pr_info("EEH: Notify device drivers " |
---|
919 | | - "the completion of reset\n"); |
---|
920 | | - result = PCI_ERS_RESULT_NONE; |
---|
| 1044 | + /* Tell all device drivers that they can resume operations */ |
---|
| 1045 | + pr_info("EEH: Notify device driver to resume\n"); |
---|
921 | 1046 | eeh_set_channel_state(pe, pci_channel_io_normal); |
---|
922 | 1047 | eeh_set_irq_state(pe, true); |
---|
923 | | - eeh_pe_report("slot_reset", pe, eeh_report_reset, &result); |
---|
924 | | - } |
---|
925 | | - |
---|
926 | | - /* All devices should claim they have recovered by now. */ |
---|
927 | | - if ((result != PCI_ERS_RESULT_RECOVERED) && |
---|
928 | | - (result != PCI_ERS_RESULT_NONE)) { |
---|
929 | | - pr_warn("EEH: Not recovered\n"); |
---|
930 | | - goto hard_fail; |
---|
931 | | - } |
---|
932 | | - |
---|
933 | | - /* |
---|
934 | | - * For those hot removed VFs, we should add back them after PF get |
---|
935 | | - * recovered properly. |
---|
936 | | - */ |
---|
937 | | - list_for_each_entry_safe(edev, tmp, &rmv_data.edev_list, rmv_list) { |
---|
938 | | - eeh_add_virt_device(edev, NULL); |
---|
939 | | - list_del(&edev->rmv_list); |
---|
940 | | - } |
---|
941 | | - |
---|
942 | | - /* Tell all device drivers that they can resume operations */ |
---|
943 | | - pr_info("EEH: Notify device driver to resume\n"); |
---|
944 | | - eeh_set_channel_state(pe, pci_channel_io_normal); |
---|
945 | | - eeh_set_irq_state(pe, true); |
---|
946 | | - eeh_pe_report("resume", pe, eeh_report_resume, NULL); |
---|
947 | | - eeh_for_each_pe(pe, tmp_pe) { |
---|
948 | | - eeh_pe_for_each_dev(tmp_pe, edev, tmp) { |
---|
949 | | - edev->mode &= ~EEH_DEV_NO_HANDLER; |
---|
950 | | - edev->in_error = false; |
---|
| 1048 | + eeh_pe_report("resume", pe, eeh_report_resume, NULL); |
---|
| 1049 | + eeh_for_each_pe(pe, tmp_pe) { |
---|
| 1050 | + eeh_pe_for_each_dev(tmp_pe, edev, tmp) { |
---|
| 1051 | + edev->mode &= ~EEH_DEV_NO_HANDLER; |
---|
| 1052 | + edev->in_error = false; |
---|
| 1053 | + } |
---|
951 | 1054 | } |
---|
| 1055 | + |
---|
| 1056 | + pr_info("EEH: Recovery successful.\n"); |
---|
| 1057 | + goto out; |
---|
952 | 1058 | } |
---|
953 | 1059 | |
---|
954 | | - pr_info("EEH: Recovery successful.\n"); |
---|
955 | | - goto final; |
---|
956 | | - |
---|
957 | | -hard_fail: |
---|
958 | 1060 | /* |
---|
959 | 1061 | * About 90% of all real-life EEH failures in the field |
---|
960 | 1062 | * are due to poorly seated PCI cards. Only 10% or so are |
---|
961 | 1063 | * due to actual, failed cards. |
---|
962 | 1064 | */ |
---|
963 | 1065 | pr_err("EEH: Unable to recover from failure from PHB#%x-PE#%x.\n" |
---|
964 | | - "Please try reseating or replacing it\n", |
---|
| 1066 | + "Please try reseating or replacing it\n", |
---|
965 | 1067 | pe->phb->global_number, pe->addr); |
---|
966 | 1068 | |
---|
967 | 1069 | eeh_slot_error_detail(pe, EEH_LOG_PERM); |
---|
968 | 1070 | |
---|
969 | 1071 | /* Notify all devices that they're about to go down. */ |
---|
970 | | - eeh_set_channel_state(pe, pci_channel_io_perm_failure); |
---|
971 | 1072 | eeh_set_irq_state(pe, false); |
---|
972 | 1073 | eeh_pe_report("error_detected(permanent failure)", pe, |
---|
973 | 1074 | eeh_report_failure, NULL); |
---|
| 1075 | + eeh_set_channel_state(pe, pci_channel_io_perm_failure); |
---|
974 | 1076 | |
---|
975 | 1077 | /* Mark the PE to be removed permanently */ |
---|
976 | 1078 | eeh_pe_state_mark(pe, EEH_PE_REMOVED); |
---|
.. | .. |
---|
984 | 1086 | eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL); |
---|
985 | 1087 | eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); |
---|
986 | 1088 | } else { |
---|
987 | | - eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); |
---|
| 1089 | + eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true); |
---|
988 | 1090 | eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); |
---|
989 | 1091 | |
---|
990 | 1092 | pci_lock_rescan_remove(); |
---|
.. | .. |
---|
993 | 1095 | /* The passed PE should no longer be used */ |
---|
994 | 1096 | return; |
---|
995 | 1097 | } |
---|
996 | | -final: |
---|
997 | | - eeh_pe_state_clear(pe, EEH_PE_RECOVERING); |
---|
| 1098 | + |
---|
| 1099 | +out: |
---|
| 1100 | + /* |
---|
| 1101 | + * Clean up any PEs without devices. While marked as EEH_PE_RECOVERING |
---|
| 1102 | + * we don't want to modify the PE tree structure so we do it here. |
---|
| 1103 | + */ |
---|
| 1104 | + eeh_pe_cleanup(pe); |
---|
| 1105 | + |
---|
| 1106 | + /* clear the slot attention LED for all recovered devices */ |
---|
| 1107 | + eeh_for_each_pe(pe, tmp_pe) |
---|
| 1108 | + eeh_pe_for_each_dev(tmp_pe, edev, tmp) |
---|
| 1109 | + eeh_clear_slot_attention(edev->pdev); |
---|
| 1110 | + |
---|
| 1111 | + eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true); |
---|
998 | 1112 | } |
---|
999 | 1113 | |
---|
1000 | 1114 | /** |
---|
.. | .. |
---|
1029 | 1143 | phb_pe = eeh_phb_pe_get(hose); |
---|
1030 | 1144 | if (!phb_pe) continue; |
---|
1031 | 1145 | |
---|
1032 | | - eeh_pe_state_mark(phb_pe, EEH_PE_ISOLATED); |
---|
| 1146 | + eeh_pe_mark_isolated(phb_pe); |
---|
1033 | 1147 | } |
---|
1034 | 1148 | |
---|
1035 | 1149 | eeh_serialize_unlock(flags); |
---|
.. | .. |
---|
1044 | 1158 | /* Purge all events of the PHB */ |
---|
1045 | 1159 | eeh_remove_event(pe, true); |
---|
1046 | 1160 | |
---|
1047 | | - if (rc == EEH_NEXT_ERR_DEAD_PHB) |
---|
1048 | | - eeh_pe_state_mark(pe, EEH_PE_ISOLATED); |
---|
1049 | | - else |
---|
1050 | | - eeh_pe_state_mark(pe, |
---|
1051 | | - EEH_PE_ISOLATED | EEH_PE_RECOVERING); |
---|
| 1161 | + if (rc != EEH_NEXT_ERR_DEAD_PHB) |
---|
| 1162 | + eeh_pe_state_mark(pe, EEH_PE_RECOVERING); |
---|
| 1163 | + eeh_pe_mark_isolated(pe); |
---|
1052 | 1164 | |
---|
1053 | 1165 | eeh_serialize_unlock(flags); |
---|
1054 | 1166 | |
---|
.. | .. |
---|
1068 | 1180 | */ |
---|
1069 | 1181 | if (rc == EEH_NEXT_ERR_FROZEN_PE || |
---|
1070 | 1182 | rc == EEH_NEXT_ERR_FENCED_PHB) { |
---|
| 1183 | + eeh_pe_state_mark(pe, EEH_PE_RECOVERING); |
---|
1071 | 1184 | eeh_handle_normal_event(pe); |
---|
1072 | 1185 | } else { |
---|
| 1186 | + eeh_for_each_pe(pe, tmp_pe) |
---|
| 1187 | + eeh_pe_for_each_dev(tmp_pe, edev, tmp_edev) |
---|
| 1188 | + edev->mode &= ~EEH_DEV_NO_HANDLER; |
---|
| 1189 | + |
---|
| 1190 | + /* Notify all devices to be down */ |
---|
| 1191 | + eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true); |
---|
| 1192 | + eeh_pe_report( |
---|
| 1193 | + "error_detected(permanent failure)", pe, |
---|
| 1194 | + eeh_report_failure, NULL); |
---|
| 1195 | + eeh_set_channel_state(pe, pci_channel_io_perm_failure); |
---|
| 1196 | + |
---|
1073 | 1197 | pci_lock_rescan_remove(); |
---|
1074 | 1198 | list_for_each_entry(hose, &hose_list, list_node) { |
---|
1075 | 1199 | phb_pe = eeh_phb_pe_get(hose); |
---|
.. | .. |
---|
1078 | 1202 | (phb_pe->state & EEH_PE_RECOVERING)) |
---|
1079 | 1203 | continue; |
---|
1080 | 1204 | |
---|
1081 | | - eeh_for_each_pe(pe, tmp_pe) |
---|
1082 | | - eeh_pe_for_each_dev(tmp_pe, edev, tmp_edev) |
---|
1083 | | - edev->mode &= ~EEH_DEV_NO_HANDLER; |
---|
1084 | | - |
---|
1085 | | - /* Notify all devices to be down */ |
---|
1086 | | - eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); |
---|
1087 | | - eeh_set_channel_state(pe, pci_channel_io_perm_failure); |
---|
1088 | | - eeh_pe_report( |
---|
1089 | | - "error_detected(permanent failure)", pe, |
---|
1090 | | - eeh_report_failure, NULL); |
---|
1091 | 1205 | bus = eeh_pe_bus_get(phb_pe); |
---|
1092 | 1206 | if (!bus) { |
---|
1093 | 1207 | pr_err("%s: Cannot find PCI bus for " |
---|