hc
2023-12-11 d2ccde1c8e90d38cee87a1b0309ad2827f3fd30d
kernel/drivers/pci/pcie/err.c
....@@ -10,19 +10,15 @@
1010 * Zhang Yanmin (yanmin.zhang@intel.com)
1111 */
1212
13
+#define dev_fmt(fmt) "AER: " fmt
14
+
1315 #include <linux/pci.h>
1416 #include <linux/module.h>
15
-#include <linux/pci.h>
1617 #include <linux/kernel.h>
1718 #include <linux/errno.h>
1819 #include <linux/aer.h>
1920 #include "portdrv.h"
2021 #include "../pci.h"
21
-
22
-struct aer_broadcast_data {
23
- enum pci_channel_state state;
24
- enum pci_ers_result result;
25
-};
2622
2723 static pci_ers_result_t merge_result(enum pci_ers_result orig,
2824 enum pci_ers_result new)
....@@ -49,18 +45,16 @@
4945 return orig;
5046 }
5147
52
-static int report_error_detected(struct pci_dev *dev, void *data)
48
+static int report_error_detected(struct pci_dev *dev,
49
+ pci_channel_state_t state,
50
+ enum pci_ers_result *result)
5351 {
5452 pci_ers_result_t vote;
5553 const struct pci_error_handlers *err_handler;
56
- struct aer_broadcast_data *result_data;
57
-
58
- result_data = (struct aer_broadcast_data *) data;
5954
6055 device_lock(&dev->dev);
61
- dev->error_state = result_data->state;
62
-
63
- if (!dev->driver ||
56
+ if (!pci_dev_set_io_state(dev, state) ||
57
+ !dev->driver ||
6458 !dev->driver->err_handler ||
6559 !dev->driver->err_handler->error_detected) {
6660 /*
....@@ -69,28 +63,36 @@
6963 * error callbacks of "any" device in the subtree, and will
7064 * exit in the disconnected error state.
7165 */
72
- if (dev->hdr_type != PCI_HEADER_TYPE_BRIDGE)
66
+ if (dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) {
7367 vote = PCI_ERS_RESULT_NO_AER_DRIVER;
74
- else
68
+ pci_info(dev, "can't recover (no error_detected callback)\n");
69
+ } else {
7570 vote = PCI_ERS_RESULT_NONE;
71
+ }
7672 } else {
7773 err_handler = dev->driver->err_handler;
78
- vote = err_handler->error_detected(dev, result_data->state);
79
- pci_uevent_ers(dev, PCI_ERS_RESULT_NONE);
74
+ vote = err_handler->error_detected(dev, state);
8075 }
81
-
82
- result_data->result = merge_result(result_data->result, vote);
76
+ pci_uevent_ers(dev, vote);
77
+ *result = merge_result(*result, vote);
8378 device_unlock(&dev->dev);
8479 return 0;
8580 }
8681
82
+static int report_frozen_detected(struct pci_dev *dev, void *data)
83
+{
84
+ return report_error_detected(dev, pci_channel_io_frozen, data);
85
+}
86
+
87
+static int report_normal_detected(struct pci_dev *dev, void *data)
88
+{
89
+ return report_error_detected(dev, pci_channel_io_normal, data);
90
+}
91
+
8792 static int report_mmio_enabled(struct pci_dev *dev, void *data)
8893 {
89
- pci_ers_result_t vote;
94
+ pci_ers_result_t vote, *result = data;
9095 const struct pci_error_handlers *err_handler;
91
- struct aer_broadcast_data *result_data;
92
-
93
- result_data = (struct aer_broadcast_data *) data;
9496
9597 device_lock(&dev->dev);
9698 if (!dev->driver ||
....@@ -100,7 +102,7 @@
100102
101103 err_handler = dev->driver->err_handler;
102104 vote = err_handler->mmio_enabled(dev);
103
- result_data->result = merge_result(result_data->result, vote);
105
+ *result = merge_result(*result, vote);
104106 out:
105107 device_unlock(&dev->dev);
106108 return 0;
....@@ -108,11 +110,8 @@
108110
109111 static int report_slot_reset(struct pci_dev *dev, void *data)
110112 {
111
- pci_ers_result_t vote;
113
+ pci_ers_result_t vote, *result = data;
112114 const struct pci_error_handlers *err_handler;
113
- struct aer_broadcast_data *result_data;
114
-
115
- result_data = (struct aer_broadcast_data *) data;
116115
117116 device_lock(&dev->dev);
118117 if (!dev->driver ||
....@@ -122,7 +121,7 @@
122121
123122 err_handler = dev->driver->err_handler;
124123 vote = err_handler->slot_reset(dev);
125
- result_data->result = merge_result(result_data->result, vote);
124
+ *result = merge_result(*result, vote);
126125 out:
127126 device_unlock(&dev->dev);
128127 return 0;
....@@ -133,185 +132,83 @@
133132 const struct pci_error_handlers *err_handler;
134133
135134 device_lock(&dev->dev);
136
- dev->error_state = pci_channel_io_normal;
137
-
138
- if (!dev->driver ||
135
+ if (!pci_dev_set_io_state(dev, pci_channel_io_normal) ||
136
+ !dev->driver ||
139137 !dev->driver->err_handler ||
140138 !dev->driver->err_handler->resume)
141139 goto out;
142140
143141 err_handler = dev->driver->err_handler;
144142 err_handler->resume(dev);
145
- pci_uevent_ers(dev, PCI_ERS_RESULT_RECOVERED);
146143 out:
144
+ pci_uevent_ers(dev, PCI_ERS_RESULT_RECOVERED);
147145 device_unlock(&dev->dev);
148146 return 0;
149147 }
150148
151149 /**
152
- * default_reset_link - default reset function
153
- * @dev: pointer to pci_dev data structure
150
+ * pci_walk_bridge - walk bridges potentially AER affected
151
+ * @bridge: bridge which may be a Port or an RCEC
152
+ * @cb: callback to be called for each device found
153
+ * @userdata: arbitrary pointer to be passed to callback
154154 *
155
- * Invoked when performing link reset on a Downstream Port or a
156
- * Root Port with no aer driver.
157
- */
158
-static pci_ers_result_t default_reset_link(struct pci_dev *dev)
159
-{
160
- int rc;
161
-
162
- rc = pci_bus_error_reset(dev);
163
- pci_printk(KERN_DEBUG, dev, "downstream link has been reset\n");
164
- return rc ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
165
-}
166
-
167
-static pci_ers_result_t reset_link(struct pci_dev *dev, u32 service)
168
-{
169
- pci_ers_result_t status;
170
- struct pcie_port_service_driver *driver = NULL;
171
-
172
- driver = pcie_port_find_service(dev, service);
173
- if (driver && driver->reset_link) {
174
- status = driver->reset_link(dev);
175
- } else if (dev->has_secondary_link) {
176
- status = default_reset_link(dev);
177
- } else {
178
- pci_printk(KERN_DEBUG, dev, "no link-reset support at upstream device %s\n",
179
- pci_name(dev));
180
- return PCI_ERS_RESULT_DISCONNECT;
181
- }
182
-
183
- if (status != PCI_ERS_RESULT_RECOVERED) {
184
- pci_printk(KERN_DEBUG, dev, "link reset at upstream device %s failed\n",
185
- pci_name(dev));
186
- return PCI_ERS_RESULT_DISCONNECT;
187
- }
188
-
189
- return status;
190
-}
191
-
192
-/**
193
- * broadcast_error_message - handle message broadcast to downstream drivers
194
- * @dev: pointer to from where in a hierarchy message is broadcasted down
195
- * @state: error state
196
- * @error_mesg: message to print
197
- * @cb: callback to be broadcasted
155
+ * If the device provided is a bridge, walk the subordinate bus, including
156
+ * any bridged devices on buses under this bus. Call the provided callback
157
+ * on each device found.
198158 *
199
- * Invoked during error recovery process. Once being invoked, the content
200
- * of error severity will be broadcasted to all downstream drivers in a
201
- * hierarchy in question.
159
+ * If the device provided has no subordinate bus, e.g., an RCEC, call the
160
+ * callback on the device itself.
202161 */
203
-static pci_ers_result_t broadcast_error_message(struct pci_dev *dev,
204
- enum pci_channel_state state,
205
- char *error_mesg,
206
- int (*cb)(struct pci_dev *, void *))
162
+static void pci_walk_bridge(struct pci_dev *bridge,
163
+ int (*cb)(struct pci_dev *, void *),
164
+ void *userdata)
207165 {
208
- struct aer_broadcast_data result_data;
209
-
210
- pci_printk(KERN_DEBUG, dev, "broadcast %s message\n", error_mesg);
211
- result_data.state = state;
212
- if (cb == report_error_detected)
213
- result_data.result = PCI_ERS_RESULT_CAN_RECOVER;
166
+ if (bridge->subordinate)
167
+ pci_walk_bus(bridge->subordinate, cb, userdata);
214168 else
215
- result_data.result = PCI_ERS_RESULT_RECOVERED;
216
-
217
- pci_walk_bus(dev->subordinate, cb, &result_data);
218
- return result_data.result;
169
+ cb(bridge, userdata);
219170 }
220171
221
-/**
222
- * pcie_do_fatal_recovery - handle fatal error recovery process
223
- * @dev: pointer to a pci_dev data structure of agent detecting an error
224
- *
225
- * Invoked when an error is fatal. Once being invoked, removes the devices
226
- * beneath this AER agent, followed by reset link e.g. secondary bus reset
227
- * followed by re-enumeration of devices.
228
- */
229
-void pcie_do_fatal_recovery(struct pci_dev *dev, u32 service)
172
+pci_ers_result_t pcie_do_recovery(struct pci_dev *dev,
173
+ pci_channel_state_t state,
174
+ pci_ers_result_t (*reset_subordinates)(struct pci_dev *pdev))
230175 {
231
- struct pci_dev *udev;
232
- struct pci_bus *parent;
233
- struct pci_dev *pdev, *temp;
234
- pci_ers_result_t result;
235
-
236
- if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)
237
- udev = dev;
238
- else
239
- udev = dev->bus->self;
240
-
241
- parent = udev->subordinate;
242
- pci_lock_rescan_remove();
243
- pci_dev_get(dev);
244
- list_for_each_entry_safe_reverse(pdev, temp, &parent->devices,
245
- bus_list) {
246
- pci_dev_get(pdev);
247
- pci_dev_set_disconnected(pdev, NULL);
248
- if (pci_has_subordinate(pdev))
249
- pci_walk_bus(pdev->subordinate,
250
- pci_dev_set_disconnected, NULL);
251
- pci_stop_and_remove_bus_device(pdev);
252
- pci_dev_put(pdev);
253
- }
254
-
255
- result = reset_link(udev, service);
256
-
257
- if ((service == PCIE_PORT_SERVICE_AER) &&
258
- (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)) {
259
- /*
260
- * If the error is reported by a bridge, we think this error
261
- * is related to the downstream link of the bridge, so we
262
- * do error recovery on all subordinates of the bridge instead
263
- * of the bridge and clear the error status of the bridge.
264
- */
265
- pci_aer_clear_fatal_status(dev);
266
- pci_aer_clear_device_status(dev);
267
- }
268
-
269
- if (result == PCI_ERS_RESULT_RECOVERED) {
270
- if (pcie_wait_for_link(udev, true))
271
- pci_rescan_bus(udev->bus);
272
- pci_info(dev, "Device recovery from fatal error successful\n");
273
- } else {
274
- pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
275
- pci_info(dev, "Device recovery from fatal error failed\n");
276
- }
277
-
278
- pci_dev_put(dev);
279
- pci_unlock_rescan_remove();
280
-}
281
-
282
-/**
283
- * pcie_do_nonfatal_recovery - handle nonfatal error recovery process
284
- * @dev: pointer to a pci_dev data structure of agent detecting an error
285
- *
286
- * Invoked when an error is nonfatal/fatal. Once being invoked, broadcast
287
- * error detected message to all downstream drivers within a hierarchy in
288
- * question and return the returned code.
289
- */
290
-void pcie_do_nonfatal_recovery(struct pci_dev *dev)
291
-{
292
- pci_ers_result_t status;
293
- enum pci_channel_state state;
294
-
295
- state = pci_channel_io_normal;
176
+ int type = pci_pcie_type(dev);
177
+ struct pci_dev *bridge;
178
+ pci_ers_result_t status = PCI_ERS_RESULT_CAN_RECOVER;
296179
297180 /*
298
- * Error recovery runs on all subordinates of the first downstream port.
299
- * If the downstream port detected the error, it is cleared at the end.
181
+ * If the error was detected by a Root Port, Downstream Port, or
182
+ * RCEC, recovery runs on the device itself. For Ports, that also
183
+ * includes any subordinate devices.
184
+ *
185
+ * If it was detected by another device (Endpoint, etc), recovery
186
+ * runs on the device and anything else under the same Port, i.e.,
187
+ * everything under "bridge".
300188 */
301
- if (!(pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT ||
302
- pci_pcie_type(dev) == PCI_EXP_TYPE_DOWNSTREAM))
303
- dev = dev->bus->self;
189
+ if (type == PCI_EXP_TYPE_ROOT_PORT ||
190
+ type == PCI_EXP_TYPE_DOWNSTREAM ||
191
+ type == PCI_EXP_TYPE_RC_EC)
192
+ bridge = dev;
193
+ else
194
+ bridge = pci_upstream_bridge(dev);
304195
305
- status = broadcast_error_message(dev,
306
- state,
307
- "error_detected",
308
- report_error_detected);
196
+ pci_dbg(bridge, "broadcast error_detected message\n");
197
+ if (state == pci_channel_io_frozen) {
198
+ pci_walk_bridge(bridge, report_frozen_detected, &status);
199
+ if (reset_subordinates(bridge) != PCI_ERS_RESULT_RECOVERED) {
200
+ pci_warn(bridge, "subordinate device reset failed\n");
201
+ goto failed;
202
+ }
203
+ } else {
204
+ pci_walk_bridge(bridge, report_normal_detected, &status);
205
+ }
309206
310
- if (status == PCI_ERS_RESULT_CAN_RECOVER)
311
- status = broadcast_error_message(dev,
312
- state,
313
- "mmio_enabled",
314
- report_mmio_enabled);
207
+ if (status == PCI_ERS_RESULT_CAN_RECOVER) {
208
+ status = PCI_ERS_RESULT_RECOVERED;
209
+ pci_dbg(bridge, "broadcast mmio_enabled message\n");
210
+ pci_walk_bridge(bridge, report_mmio_enabled, &status);
211
+ }
315212
316213 if (status == PCI_ERS_RESULT_NEED_RESET) {
317214 /*
....@@ -319,28 +216,28 @@
319216 * functions to reset slot before calling
320217 * drivers' slot_reset callbacks?
321218 */
322
- status = broadcast_error_message(dev,
323
- state,
324
- "slot_reset",
325
- report_slot_reset);
219
+ status = PCI_ERS_RESULT_RECOVERED;
220
+ pci_dbg(bridge, "broadcast slot_reset message\n");
221
+ pci_walk_bridge(bridge, report_slot_reset, &status);
326222 }
327223
328224 if (status != PCI_ERS_RESULT_RECOVERED)
329225 goto failed;
330226
331
- broadcast_error_message(dev,
332
- state,
333
- "resume",
334
- report_resume);
227
+ pci_dbg(bridge, "broadcast resume message\n");
228
+ pci_walk_bridge(bridge, report_resume, &status);
335229
336
- pci_aer_clear_device_status(dev);
337
- pci_cleanup_aer_uncorrect_error_status(dev);
338
- pci_info(dev, "AER: Device recovery successful\n");
339
- return;
230
+ if (pcie_aer_is_native(bridge))
231
+ pcie_clear_device_status(bridge);
232
+ pci_aer_clear_nonfatal_status(bridge);
233
+ pci_info(bridge, "device recovery successful\n");
234
+ return status;
340235
341236 failed:
342
- pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
237
+ pci_uevent_ers(bridge, PCI_ERS_RESULT_DISCONNECT);
343238
344239 /* TODO: Should kernel panic here? */
345
- pci_info(dev, "AER: Device recovery failed\n");
240
+ pci_info(bridge, "device recovery failed\n");
241
+
242
+ return status;
346243 }