.. | .. |
---|
36 | 36 | #include <linux/vmalloc.h> |
---|
37 | 37 | #include <linux/hardirq.h> |
---|
38 | 38 | #include <linux/mlx5/driver.h> |
---|
39 | | -#include <linux/mlx5/cmd.h> |
---|
40 | 39 | #include "mlx5_core.h" |
---|
| 40 | +#include "lib/eq.h" |
---|
| 41 | +#include "lib/mlx5.h" |
---|
| 42 | +#include "lib/pci_vsc.h" |
---|
| 43 | +#include "diag/fw_tracer.h" |
---|
41 | 44 | |
---|
42 | 45 | enum { |
---|
43 | 46 | MLX5_HEALTH_POLL_INTERVAL = 2 * HZ, |
---|
.. | .. |
---|
59 | 62 | }; |
---|
60 | 63 | |
---|
61 | 64 | enum { |
---|
62 | | - MLX5_NIC_IFC_FULL = 0, |
---|
63 | | - MLX5_NIC_IFC_DISABLED = 1, |
---|
64 | | - MLX5_NIC_IFC_NO_DRAM_NIC = 2, |
---|
65 | | - MLX5_NIC_IFC_INVALID = 3 |
---|
66 | | -}; |
---|
67 | | - |
---|
68 | | -enum { |
---|
69 | 65 | MLX5_DROP_NEW_HEALTH_WORK, |
---|
70 | | - MLX5_DROP_NEW_RECOVERY_WORK, |
---|
71 | 66 | }; |
---|
72 | 67 | |
---|
73 | | -static u8 get_nic_state(struct mlx5_core_dev *dev) |
---|
| 68 | +enum { |
---|
| 69 | + MLX5_SENSOR_NO_ERR = 0, |
---|
| 70 | + MLX5_SENSOR_PCI_COMM_ERR = 1, |
---|
| 71 | + MLX5_SENSOR_PCI_ERR = 2, |
---|
| 72 | + MLX5_SENSOR_NIC_DISABLED = 3, |
---|
| 73 | + MLX5_SENSOR_NIC_SW_RESET = 4, |
---|
| 74 | + MLX5_SENSOR_FW_SYND_RFR = 5, |
---|
| 75 | +}; |
---|
| 76 | + |
---|
| 77 | +u8 mlx5_get_nic_state(struct mlx5_core_dev *dev) |
---|
74 | 78 | { |
---|
75 | | - return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 3; |
---|
| 79 | + return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 7; |
---|
76 | 80 | } |
---|
77 | 81 | |
---|
78 | | -static void trigger_cmd_completions(struct mlx5_core_dev *dev) |
---|
| 82 | +void mlx5_set_nic_state(struct mlx5_core_dev *dev, u8 state) |
---|
79 | 83 | { |
---|
80 | | - unsigned long flags; |
---|
81 | | - u64 vector; |
---|
| 84 | + u32 cur_cmdq_addr_l_sz; |
---|
82 | 85 | |
---|
83 | | - /* wait for pending handlers to complete */ |
---|
84 | | - synchronize_irq(pci_irq_vector(dev->pdev, MLX5_EQ_VEC_CMD)); |
---|
85 | | - spin_lock_irqsave(&dev->cmd.alloc_lock, flags); |
---|
86 | | - vector = ~dev->cmd.bitmask & ((1ul << (1 << dev->cmd.log_sz)) - 1); |
---|
87 | | - if (!vector) |
---|
88 | | - goto no_trig; |
---|
89 | | - |
---|
90 | | - vector |= MLX5_TRIGGERED_CMD_COMP; |
---|
91 | | - spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags); |
---|
92 | | - |
---|
93 | | - mlx5_core_dbg(dev, "vector 0x%llx\n", vector); |
---|
94 | | - mlx5_cmd_comp_handler(dev, vector, true); |
---|
95 | | - return; |
---|
96 | | - |
---|
97 | | -no_trig: |
---|
98 | | - spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags); |
---|
| 86 | + cur_cmdq_addr_l_sz = ioread32be(&dev->iseg->cmdq_addr_l_sz); |
---|
| 87 | + iowrite32be((cur_cmdq_addr_l_sz & 0xFFFFF000) | |
---|
| 88 | + state << MLX5_NIC_IFC_OFFSET, |
---|
| 89 | + &dev->iseg->cmdq_addr_l_sz); |
---|
99 | 90 | } |
---|
100 | 91 | |
---|
101 | | -static int in_fatal(struct mlx5_core_dev *dev) |
---|
| 92 | +static bool sensor_pci_not_working(struct mlx5_core_dev *dev) |
---|
102 | 93 | { |
---|
103 | 94 | struct mlx5_core_health *health = &dev->priv.health; |
---|
104 | 95 | struct health_buffer __iomem *h = health->health; |
---|
105 | 96 | |
---|
106 | | - if (get_nic_state(dev) == MLX5_NIC_IFC_DISABLED) |
---|
107 | | - return 1; |
---|
| 97 | + /* Offline PCI reads return 0xffffffff */ |
---|
| 98 | + return (ioread32be(&h->fw_ver) == 0xffffffff); |
---|
| 99 | +} |
---|
108 | 100 | |
---|
109 | | - if (ioread32be(&h->fw_ver) == 0xffffffff) |
---|
110 | | - return 1; |
---|
| 101 | +static bool sensor_fw_synd_rfr(struct mlx5_core_dev *dev) |
---|
| 102 | +{ |
---|
| 103 | + struct mlx5_core_health *health = &dev->priv.health; |
---|
| 104 | + struct health_buffer __iomem *h = health->health; |
---|
| 105 | + u32 rfr = ioread32be(&h->rfr) >> MLX5_RFR_OFFSET; |
---|
| 106 | + u8 synd = ioread8(&h->synd); |
---|
111 | 107 | |
---|
112 | | - return 0; |
---|
| 108 | + if (rfr && synd) |
---|
| 109 | + mlx5_core_dbg(dev, "FW requests reset, synd: %d\n", synd); |
---|
| 110 | + return rfr && synd; |
---|
| 111 | +} |
---|
| 112 | + |
---|
| 113 | +u32 mlx5_health_check_fatal_sensors(struct mlx5_core_dev *dev) |
---|
| 114 | +{ |
---|
| 115 | + if (sensor_pci_not_working(dev)) |
---|
| 116 | + return MLX5_SENSOR_PCI_COMM_ERR; |
---|
| 117 | + if (pci_channel_offline(dev->pdev)) |
---|
| 118 | + return MLX5_SENSOR_PCI_ERR; |
---|
| 119 | + if (mlx5_get_nic_state(dev) == MLX5_NIC_IFC_DISABLED) |
---|
| 120 | + return MLX5_SENSOR_NIC_DISABLED; |
---|
| 121 | + if (mlx5_get_nic_state(dev) == MLX5_NIC_IFC_SW_RESET) |
---|
| 122 | + return MLX5_SENSOR_NIC_SW_RESET; |
---|
| 123 | + if (sensor_fw_synd_rfr(dev)) |
---|
| 124 | + return MLX5_SENSOR_FW_SYND_RFR; |
---|
| 125 | + |
---|
| 126 | + return MLX5_SENSOR_NO_ERR; |
---|
| 127 | +} |
---|
| 128 | + |
---|
| 129 | +static int lock_sem_sw_reset(struct mlx5_core_dev *dev, bool lock) |
---|
| 130 | +{ |
---|
| 131 | + enum mlx5_vsc_state state; |
---|
| 132 | + int ret; |
---|
| 133 | + |
---|
| 134 | + if (!mlx5_core_is_pf(dev)) |
---|
| 135 | + return -EBUSY; |
---|
| 136 | + |
---|
| 137 | + /* Try to lock GW access, this stage doesn't return |
---|
| 138 | + * EBUSY because locked GW does not mean that other PF |
---|
| 139 | + * already started the reset. |
---|
| 140 | + */ |
---|
| 141 | + ret = mlx5_vsc_gw_lock(dev); |
---|
| 142 | + if (ret == -EBUSY) |
---|
| 143 | + return -EINVAL; |
---|
| 144 | + if (ret) |
---|
| 145 | + return ret; |
---|
| 146 | + |
---|
| 147 | + state = lock ? MLX5_VSC_LOCK : MLX5_VSC_UNLOCK; |
---|
| 148 | + /* At this stage, if the return status == EBUSY, then we know |
---|
| 149 | + * for sure that another PF started the reset, so don't allow |
---|
| 150 | + * another reset. |
---|
| 151 | + */ |
---|
| 152 | + ret = mlx5_vsc_sem_set_space(dev, MLX5_SEMAPHORE_SW_RESET, state); |
---|
| 153 | + if (ret) |
---|
| 154 | + mlx5_core_warn(dev, "Failed to lock SW reset semaphore\n"); |
---|
| 155 | + |
---|
| 156 | + /* Unlock GW access */ |
---|
| 157 | + mlx5_vsc_gw_unlock(dev); |
---|
| 158 | + |
---|
| 159 | + return ret; |
---|
| 160 | +} |
---|
| 161 | + |
---|
| 162 | +static bool reset_fw_if_needed(struct mlx5_core_dev *dev) |
---|
| 163 | +{ |
---|
| 164 | + bool supported = (ioread32be(&dev->iseg->initializing) >> |
---|
| 165 | + MLX5_FW_RESET_SUPPORTED_OFFSET) & 1; |
---|
| 166 | + u32 fatal_error; |
---|
| 167 | + |
---|
| 168 | + if (!supported) |
---|
| 169 | + return false; |
---|
| 170 | + |
---|
| 171 | + /* The reset only needs to be issued by one PF. The health buffer is |
---|
| 172 | + * shared between all functions, and will be cleared during a reset. |
---|
| 173 | + * Check again to avoid a redundant 2nd reset. If the fatal erros was |
---|
| 174 | + * PCI related a reset won't help. |
---|
| 175 | + */ |
---|
| 176 | + fatal_error = mlx5_health_check_fatal_sensors(dev); |
---|
| 177 | + if (fatal_error == MLX5_SENSOR_PCI_COMM_ERR || |
---|
| 178 | + fatal_error == MLX5_SENSOR_NIC_DISABLED || |
---|
| 179 | + fatal_error == MLX5_SENSOR_NIC_SW_RESET) { |
---|
| 180 | + mlx5_core_warn(dev, "Not issuing FW reset. Either it's already done or won't help."); |
---|
| 181 | + return false; |
---|
| 182 | + } |
---|
| 183 | + |
---|
| 184 | + mlx5_core_warn(dev, "Issuing FW Reset\n"); |
---|
| 185 | + /* Write the NIC interface field to initiate the reset, the command |
---|
| 186 | + * interface address also resides here, don't overwrite it. |
---|
| 187 | + */ |
---|
| 188 | + mlx5_set_nic_state(dev, MLX5_NIC_IFC_SW_RESET); |
---|
| 189 | + |
---|
| 190 | + return true; |
---|
| 191 | +} |
---|
| 192 | + |
---|
| 193 | +static void enter_error_state(struct mlx5_core_dev *dev, bool force) |
---|
| 194 | +{ |
---|
| 195 | + if (mlx5_health_check_fatal_sensors(dev) || force) { /* protected state setting */ |
---|
| 196 | + dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR; |
---|
| 197 | + mlx5_cmd_flush(dev); |
---|
| 198 | + } |
---|
| 199 | + |
---|
| 200 | + mlx5_notifier_call_chain(dev->priv.events, MLX5_DEV_EVENT_SYS_ERROR, (void *)1); |
---|
113 | 201 | } |
---|
114 | 202 | |
---|
115 | 203 | void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force) |
---|
116 | 204 | { |
---|
| 205 | + bool err_detected = false; |
---|
| 206 | + |
---|
| 207 | + /* Mark the device as fatal in order to abort FW commands */ |
---|
| 208 | + if ((mlx5_health_check_fatal_sensors(dev) || force) && |
---|
| 209 | + dev->state == MLX5_DEVICE_STATE_UP) { |
---|
| 210 | + dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR; |
---|
| 211 | + err_detected = true; |
---|
| 212 | + } |
---|
117 | 213 | mutex_lock(&dev->intf_state_mutex); |
---|
118 | | - if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) |
---|
| 214 | + if (!err_detected && dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) |
---|
| 215 | + goto unlock;/* a previous error is still being handled */ |
---|
| 216 | + if (dev->state == MLX5_DEVICE_STATE_UNINITIALIZED) { |
---|
| 217 | + dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR; |
---|
| 218 | + goto unlock; |
---|
| 219 | + } |
---|
| 220 | + |
---|
| 221 | + enter_error_state(dev, force); |
---|
| 222 | +unlock: |
---|
| 223 | + mutex_unlock(&dev->intf_state_mutex); |
---|
| 224 | +} |
---|
| 225 | + |
---|
| 226 | +#define MLX5_CRDUMP_WAIT_MS 60000 |
---|
| 227 | +#define MLX5_FW_RESET_WAIT_MS 1000 |
---|
| 228 | +void mlx5_error_sw_reset(struct mlx5_core_dev *dev) |
---|
| 229 | +{ |
---|
| 230 | + unsigned long end, delay_ms = MLX5_FW_RESET_WAIT_MS; |
---|
| 231 | + int lock = -EBUSY; |
---|
| 232 | + |
---|
| 233 | + mutex_lock(&dev->intf_state_mutex); |
---|
| 234 | + if (dev->state != MLX5_DEVICE_STATE_INTERNAL_ERROR) |
---|
119 | 235 | goto unlock; |
---|
120 | 236 | |
---|
121 | 237 | mlx5_core_err(dev, "start\n"); |
---|
122 | | - if (pci_channel_offline(dev->pdev) || in_fatal(dev) || force) { |
---|
123 | | - dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR; |
---|
124 | | - trigger_cmd_completions(dev); |
---|
| 238 | + |
---|
| 239 | + if (mlx5_health_check_fatal_sensors(dev) == MLX5_SENSOR_FW_SYND_RFR) { |
---|
| 240 | + /* Get cr-dump and reset FW semaphore */ |
---|
| 241 | + lock = lock_sem_sw_reset(dev, true); |
---|
| 242 | + |
---|
| 243 | + if (lock == -EBUSY) { |
---|
| 244 | + delay_ms = MLX5_CRDUMP_WAIT_MS; |
---|
| 245 | + goto recover_from_sw_reset; |
---|
| 246 | + } |
---|
| 247 | + /* Execute SW reset */ |
---|
| 248 | + reset_fw_if_needed(dev); |
---|
125 | 249 | } |
---|
126 | 250 | |
---|
127 | | - mlx5_core_event(dev, MLX5_DEV_EVENT_SYS_ERROR, 1); |
---|
| 251 | +recover_from_sw_reset: |
---|
| 252 | + /* Recover from SW reset */ |
---|
| 253 | + end = jiffies + msecs_to_jiffies(delay_ms); |
---|
| 254 | + do { |
---|
| 255 | + if (mlx5_get_nic_state(dev) == MLX5_NIC_IFC_DISABLED) |
---|
| 256 | + break; |
---|
| 257 | + |
---|
| 258 | + msleep(20); |
---|
| 259 | + } while (!time_after(jiffies, end)); |
---|
| 260 | + |
---|
| 261 | + if (mlx5_get_nic_state(dev) != MLX5_NIC_IFC_DISABLED) { |
---|
| 262 | + dev_err(&dev->pdev->dev, "NIC IFC still %d after %lums.\n", |
---|
| 263 | + mlx5_get_nic_state(dev), delay_ms); |
---|
| 264 | + } |
---|
| 265 | + |
---|
| 266 | + /* Release FW semaphore if you are the lock owner */ |
---|
| 267 | + if (!lock) |
---|
| 268 | + lock_sem_sw_reset(dev, false); |
---|
| 269 | + |
---|
128 | 270 | mlx5_core_err(dev, "end\n"); |
---|
129 | 271 | |
---|
130 | 272 | unlock: |
---|
.. | .. |
---|
133 | 275 | |
---|
134 | 276 | static void mlx5_handle_bad_state(struct mlx5_core_dev *dev) |
---|
135 | 277 | { |
---|
136 | | - u8 nic_interface = get_nic_state(dev); |
---|
| 278 | + u8 nic_interface = mlx5_get_nic_state(dev); |
---|
137 | 279 | |
---|
138 | 280 | switch (nic_interface) { |
---|
139 | 281 | case MLX5_NIC_IFC_FULL: |
---|
.. | .. |
---|
147 | 289 | case MLX5_NIC_IFC_NO_DRAM_NIC: |
---|
148 | 290 | mlx5_core_warn(dev, "Expected to see disabled NIC but it is no dram nic\n"); |
---|
149 | 291 | break; |
---|
| 292 | + |
---|
| 293 | + case MLX5_NIC_IFC_SW_RESET: |
---|
| 294 | + /* The IFC mode field is 3 bits, so it will read 0x7 in 2 cases: |
---|
| 295 | + * 1. PCI has been disabled (ie. PCI-AER, PF driver unloaded |
---|
| 296 | + * and this is a VF), this is not recoverable by SW reset. |
---|
| 297 | + * Logging of this is handled elsewhere. |
---|
| 298 | + * 2. FW reset has been issued by another function, driver can |
---|
| 299 | + * be reloaded to recover after the mode switches to |
---|
| 300 | + * MLX5_NIC_IFC_DISABLED. |
---|
| 301 | + */ |
---|
| 302 | + if (dev->priv.health.fatal_error != MLX5_SENSOR_PCI_COMM_ERR) |
---|
| 303 | + mlx5_core_warn(dev, "NIC SW reset in progress\n"); |
---|
| 304 | + break; |
---|
| 305 | + |
---|
150 | 306 | default: |
---|
151 | 307 | mlx5_core_warn(dev, "Expected to see disabled NIC but it is has invalid value %d\n", |
---|
152 | 308 | nic_interface); |
---|
.. | .. |
---|
155 | 311 | mlx5_disable_device(dev); |
---|
156 | 312 | } |
---|
157 | 313 | |
---|
158 | | -static void health_recover(struct work_struct *work) |
---|
| 314 | +/* How much time to wait until health resetting the driver (in msecs) */ |
---|
| 315 | +#define MLX5_RECOVERY_WAIT_MSECS 60000 |
---|
| 316 | +int mlx5_health_wait_pci_up(struct mlx5_core_dev *dev) |
---|
159 | 317 | { |
---|
160 | | - struct mlx5_core_health *health; |
---|
161 | | - struct delayed_work *dwork; |
---|
162 | | - struct mlx5_core_dev *dev; |
---|
163 | | - struct mlx5_priv *priv; |
---|
164 | | - u8 nic_state; |
---|
| 318 | + unsigned long end; |
---|
165 | 319 | |
---|
166 | | - dwork = container_of(work, struct delayed_work, work); |
---|
167 | | - health = container_of(dwork, struct mlx5_core_health, recover_work); |
---|
168 | | - priv = container_of(health, struct mlx5_priv, health); |
---|
169 | | - dev = container_of(priv, struct mlx5_core_dev, priv); |
---|
170 | | - |
---|
171 | | - nic_state = get_nic_state(dev); |
---|
172 | | - if (nic_state == MLX5_NIC_IFC_INVALID) { |
---|
173 | | - dev_err(&dev->pdev->dev, "health recovery flow aborted since the nic state is invalid\n"); |
---|
174 | | - return; |
---|
| 320 | + end = jiffies + msecs_to_jiffies(MLX5_RECOVERY_WAIT_MSECS); |
---|
| 321 | + while (sensor_pci_not_working(dev)) { |
---|
| 322 | + if (time_after(jiffies, end)) |
---|
| 323 | + return -ETIMEDOUT; |
---|
| 324 | + msleep(100); |
---|
175 | 325 | } |
---|
176 | | - |
---|
177 | | - dev_err(&dev->pdev->dev, "starting health recovery flow\n"); |
---|
178 | | - mlx5_recover_device(dev); |
---|
| 326 | + return 0; |
---|
179 | 327 | } |
---|
180 | 328 | |
---|
181 | | -/* How much time to wait until health resetting the driver (in msecs) */ |
---|
182 | | -#define MLX5_RECOVERY_DELAY_MSECS 60000 |
---|
183 | | -static void health_care(struct work_struct *work) |
---|
| 329 | +static int mlx5_health_try_recover(struct mlx5_core_dev *dev) |
---|
184 | 330 | { |
---|
185 | | - unsigned long recover_delay = msecs_to_jiffies(MLX5_RECOVERY_DELAY_MSECS); |
---|
186 | | - struct mlx5_core_health *health; |
---|
187 | | - struct mlx5_core_dev *dev; |
---|
188 | | - struct mlx5_priv *priv; |
---|
189 | | - unsigned long flags; |
---|
190 | | - |
---|
191 | | - health = container_of(work, struct mlx5_core_health, work); |
---|
192 | | - priv = container_of(health, struct mlx5_priv, health); |
---|
193 | | - dev = container_of(priv, struct mlx5_core_dev, priv); |
---|
194 | 331 | mlx5_core_warn(dev, "handling bad device here\n"); |
---|
195 | 332 | mlx5_handle_bad_state(dev); |
---|
196 | | - |
---|
197 | | - spin_lock_irqsave(&health->wq_lock, flags); |
---|
198 | | - if (!test_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags)) |
---|
199 | | - schedule_delayed_work(&health->recover_work, recover_delay); |
---|
200 | | - else |
---|
201 | | - dev_err(&dev->pdev->dev, |
---|
202 | | - "new health works are not permitted at this stage\n"); |
---|
203 | | - spin_unlock_irqrestore(&health->wq_lock, flags); |
---|
| 333 | + if (mlx5_health_wait_pci_up(dev)) { |
---|
| 334 | + mlx5_core_err(dev, "health recovery flow aborted, PCI reads still not working\n"); |
---|
| 335 | + return -EIO; |
---|
| 336 | + } |
---|
| 337 | + mlx5_core_err(dev, "starting health recovery flow\n"); |
---|
| 338 | + mlx5_recover_device(dev); |
---|
| 339 | + if (!test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state) || |
---|
| 340 | + mlx5_health_check_fatal_sensors(dev)) { |
---|
| 341 | + mlx5_core_err(dev, "health recovery failed\n"); |
---|
| 342 | + return -EIO; |
---|
| 343 | + } |
---|
| 344 | + return 0; |
---|
204 | 345 | } |
---|
205 | 346 | |
---|
206 | 347 | static const char *hsynd_str(u8 synd) |
---|
.. | .. |
---|
246 | 387 | return; |
---|
247 | 388 | |
---|
248 | 389 | for (i = 0; i < ARRAY_SIZE(h->assert_var); i++) |
---|
249 | | - dev_err(&dev->pdev->dev, "assert_var[%d] 0x%08x\n", i, ioread32be(h->assert_var + i)); |
---|
| 390 | + mlx5_core_err(dev, "assert_var[%d] 0x%08x\n", i, |
---|
| 391 | + ioread32be(h->assert_var + i)); |
---|
250 | 392 | |
---|
251 | | - dev_err(&dev->pdev->dev, "assert_exit_ptr 0x%08x\n", ioread32be(&h->assert_exit_ptr)); |
---|
252 | | - dev_err(&dev->pdev->dev, "assert_callra 0x%08x\n", ioread32be(&h->assert_callra)); |
---|
| 393 | + mlx5_core_err(dev, "assert_exit_ptr 0x%08x\n", |
---|
| 394 | + ioread32be(&h->assert_exit_ptr)); |
---|
| 395 | + mlx5_core_err(dev, "assert_callra 0x%08x\n", |
---|
| 396 | + ioread32be(&h->assert_callra)); |
---|
253 | 397 | sprintf(fw_str, "%d.%d.%d", fw_rev_maj(dev), fw_rev_min(dev), fw_rev_sub(dev)); |
---|
254 | | - dev_err(&dev->pdev->dev, "fw_ver %s\n", fw_str); |
---|
255 | | - dev_err(&dev->pdev->dev, "hw_id 0x%08x\n", ioread32be(&h->hw_id)); |
---|
256 | | - dev_err(&dev->pdev->dev, "irisc_index %d\n", ioread8(&h->irisc_index)); |
---|
257 | | - dev_err(&dev->pdev->dev, "synd 0x%x: %s\n", ioread8(&h->synd), hsynd_str(ioread8(&h->synd))); |
---|
258 | | - dev_err(&dev->pdev->dev, "ext_synd 0x%04x\n", ioread16be(&h->ext_synd)); |
---|
| 398 | + mlx5_core_err(dev, "fw_ver %s\n", fw_str); |
---|
| 399 | + mlx5_core_err(dev, "hw_id 0x%08x\n", ioread32be(&h->hw_id)); |
---|
| 400 | + mlx5_core_err(dev, "irisc_index %d\n", ioread8(&h->irisc_index)); |
---|
| 401 | + mlx5_core_err(dev, "synd 0x%x: %s\n", ioread8(&h->synd), |
---|
| 402 | + hsynd_str(ioread8(&h->synd))); |
---|
| 403 | + mlx5_core_err(dev, "ext_synd 0x%04x\n", ioread16be(&h->ext_synd)); |
---|
259 | 404 | fw = ioread32be(&h->fw_ver); |
---|
260 | | - dev_err(&dev->pdev->dev, "raw fw_ver 0x%08x\n", fw); |
---|
| 405 | + mlx5_core_err(dev, "raw fw_ver 0x%08x\n", fw); |
---|
| 406 | +} |
---|
| 407 | + |
---|
| 408 | +static int |
---|
| 409 | +mlx5_fw_reporter_diagnose(struct devlink_health_reporter *reporter, |
---|
| 410 | + struct devlink_fmsg *fmsg, |
---|
| 411 | + struct netlink_ext_ack *extack) |
---|
| 412 | +{ |
---|
| 413 | + struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter); |
---|
| 414 | + struct mlx5_core_health *health = &dev->priv.health; |
---|
| 415 | + struct health_buffer __iomem *h = health->health; |
---|
| 416 | + u8 synd; |
---|
| 417 | + int err; |
---|
| 418 | + |
---|
| 419 | + synd = ioread8(&h->synd); |
---|
| 420 | + err = devlink_fmsg_u8_pair_put(fmsg, "Syndrome", synd); |
---|
| 421 | + if (err || !synd) |
---|
| 422 | + return err; |
---|
| 423 | + return devlink_fmsg_string_pair_put(fmsg, "Description", hsynd_str(synd)); |
---|
| 424 | +} |
---|
| 425 | + |
---|
| 426 | +struct mlx5_fw_reporter_ctx { |
---|
| 427 | + u8 err_synd; |
---|
| 428 | + int miss_counter; |
---|
| 429 | +}; |
---|
| 430 | + |
---|
| 431 | +static int |
---|
| 432 | +mlx5_fw_reporter_ctx_pairs_put(struct devlink_fmsg *fmsg, |
---|
| 433 | + struct mlx5_fw_reporter_ctx *fw_reporter_ctx) |
---|
| 434 | +{ |
---|
| 435 | + int err; |
---|
| 436 | + |
---|
| 437 | + err = devlink_fmsg_u8_pair_put(fmsg, "syndrome", |
---|
| 438 | + fw_reporter_ctx->err_synd); |
---|
| 439 | + if (err) |
---|
| 440 | + return err; |
---|
| 441 | + err = devlink_fmsg_u32_pair_put(fmsg, "fw_miss_counter", |
---|
| 442 | + fw_reporter_ctx->miss_counter); |
---|
| 443 | + if (err) |
---|
| 444 | + return err; |
---|
| 445 | + return 0; |
---|
| 446 | +} |
---|
| 447 | + |
---|
| 448 | +static int |
---|
| 449 | +mlx5_fw_reporter_heath_buffer_data_put(struct mlx5_core_dev *dev, |
---|
| 450 | + struct devlink_fmsg *fmsg) |
---|
| 451 | +{ |
---|
| 452 | + struct mlx5_core_health *health = &dev->priv.health; |
---|
| 453 | + struct health_buffer __iomem *h = health->health; |
---|
| 454 | + int err; |
---|
| 455 | + int i; |
---|
| 456 | + |
---|
| 457 | + if (!ioread8(&h->synd)) |
---|
| 458 | + return 0; |
---|
| 459 | + |
---|
| 460 | + err = devlink_fmsg_pair_nest_start(fmsg, "health buffer"); |
---|
| 461 | + if (err) |
---|
| 462 | + return err; |
---|
| 463 | + err = devlink_fmsg_obj_nest_start(fmsg); |
---|
| 464 | + if (err) |
---|
| 465 | + return err; |
---|
| 466 | + err = devlink_fmsg_arr_pair_nest_start(fmsg, "assert_var"); |
---|
| 467 | + if (err) |
---|
| 468 | + return err; |
---|
| 469 | + |
---|
| 470 | + for (i = 0; i < ARRAY_SIZE(h->assert_var); i++) { |
---|
| 471 | + err = devlink_fmsg_u32_put(fmsg, ioread32be(h->assert_var + i)); |
---|
| 472 | + if (err) |
---|
| 473 | + return err; |
---|
| 474 | + } |
---|
| 475 | + err = devlink_fmsg_arr_pair_nest_end(fmsg); |
---|
| 476 | + if (err) |
---|
| 477 | + return err; |
---|
| 478 | + err = devlink_fmsg_u32_pair_put(fmsg, "assert_exit_ptr", |
---|
| 479 | + ioread32be(&h->assert_exit_ptr)); |
---|
| 480 | + if (err) |
---|
| 481 | + return err; |
---|
| 482 | + err = devlink_fmsg_u32_pair_put(fmsg, "assert_callra", |
---|
| 483 | + ioread32be(&h->assert_callra)); |
---|
| 484 | + if (err) |
---|
| 485 | + return err; |
---|
| 486 | + err = devlink_fmsg_u32_pair_put(fmsg, "hw_id", ioread32be(&h->hw_id)); |
---|
| 487 | + if (err) |
---|
| 488 | + return err; |
---|
| 489 | + err = devlink_fmsg_u8_pair_put(fmsg, "irisc_index", |
---|
| 490 | + ioread8(&h->irisc_index)); |
---|
| 491 | + if (err) |
---|
| 492 | + return err; |
---|
| 493 | + err = devlink_fmsg_u8_pair_put(fmsg, "synd", ioread8(&h->synd)); |
---|
| 494 | + if (err) |
---|
| 495 | + return err; |
---|
| 496 | + err = devlink_fmsg_u32_pair_put(fmsg, "ext_synd", |
---|
| 497 | + ioread16be(&h->ext_synd)); |
---|
| 498 | + if (err) |
---|
| 499 | + return err; |
---|
| 500 | + err = devlink_fmsg_u32_pair_put(fmsg, "raw_fw_ver", |
---|
| 501 | + ioread32be(&h->fw_ver)); |
---|
| 502 | + if (err) |
---|
| 503 | + return err; |
---|
| 504 | + err = devlink_fmsg_obj_nest_end(fmsg); |
---|
| 505 | + if (err) |
---|
| 506 | + return err; |
---|
| 507 | + return devlink_fmsg_pair_nest_end(fmsg); |
---|
| 508 | +} |
---|
| 509 | + |
---|
| 510 | +static int |
---|
| 511 | +mlx5_fw_reporter_dump(struct devlink_health_reporter *reporter, |
---|
| 512 | + struct devlink_fmsg *fmsg, void *priv_ctx, |
---|
| 513 | + struct netlink_ext_ack *extack) |
---|
| 514 | +{ |
---|
| 515 | + struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter); |
---|
| 516 | + int err; |
---|
| 517 | + |
---|
| 518 | + err = mlx5_fw_tracer_trigger_core_dump_general(dev); |
---|
| 519 | + if (err) |
---|
| 520 | + return err; |
---|
| 521 | + |
---|
| 522 | + if (priv_ctx) { |
---|
| 523 | + struct mlx5_fw_reporter_ctx *fw_reporter_ctx = priv_ctx; |
---|
| 524 | + |
---|
| 525 | + err = mlx5_fw_reporter_ctx_pairs_put(fmsg, fw_reporter_ctx); |
---|
| 526 | + if (err) |
---|
| 527 | + return err; |
---|
| 528 | + } |
---|
| 529 | + |
---|
| 530 | + err = mlx5_fw_reporter_heath_buffer_data_put(dev, fmsg); |
---|
| 531 | + if (err) |
---|
| 532 | + return err; |
---|
| 533 | + return mlx5_fw_tracer_get_saved_traces_objects(dev->tracer, fmsg); |
---|
| 534 | +} |
---|
| 535 | + |
---|
| 536 | +static void mlx5_fw_reporter_err_work(struct work_struct *work) |
---|
| 537 | +{ |
---|
| 538 | + struct mlx5_fw_reporter_ctx fw_reporter_ctx; |
---|
| 539 | + struct mlx5_core_health *health; |
---|
| 540 | + |
---|
| 541 | + health = container_of(work, struct mlx5_core_health, report_work); |
---|
| 542 | + |
---|
| 543 | + if (IS_ERR_OR_NULL(health->fw_reporter)) |
---|
| 544 | + return; |
---|
| 545 | + |
---|
| 546 | + fw_reporter_ctx.err_synd = health->synd; |
---|
| 547 | + fw_reporter_ctx.miss_counter = health->miss_counter; |
---|
| 548 | + if (fw_reporter_ctx.err_synd) { |
---|
| 549 | + devlink_health_report(health->fw_reporter, |
---|
| 550 | + "FW syndrom reported", &fw_reporter_ctx); |
---|
| 551 | + return; |
---|
| 552 | + } |
---|
| 553 | + if (fw_reporter_ctx.miss_counter) |
---|
| 554 | + devlink_health_report(health->fw_reporter, |
---|
| 555 | + "FW miss counter reported", |
---|
| 556 | + &fw_reporter_ctx); |
---|
| 557 | +} |
---|
| 558 | + |
---|
| 559 | +static const struct devlink_health_reporter_ops mlx5_fw_reporter_ops = { |
---|
| 560 | + .name = "fw", |
---|
| 561 | + .diagnose = mlx5_fw_reporter_diagnose, |
---|
| 562 | + .dump = mlx5_fw_reporter_dump, |
---|
| 563 | +}; |
---|
| 564 | + |
---|
| 565 | +static int |
---|
| 566 | +mlx5_fw_fatal_reporter_recover(struct devlink_health_reporter *reporter, |
---|
| 567 | + void *priv_ctx, |
---|
| 568 | + struct netlink_ext_ack *extack) |
---|
| 569 | +{ |
---|
| 570 | + struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter); |
---|
| 571 | + |
---|
| 572 | + return mlx5_health_try_recover(dev); |
---|
| 573 | +} |
---|
| 574 | + |
---|
| 575 | +static int |
---|
| 576 | +mlx5_fw_fatal_reporter_dump(struct devlink_health_reporter *reporter, |
---|
| 577 | + struct devlink_fmsg *fmsg, void *priv_ctx, |
---|
| 578 | + struct netlink_ext_ack *extack) |
---|
| 579 | +{ |
---|
| 580 | + struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter); |
---|
| 581 | + u32 crdump_size = dev->priv.health.crdump_size; |
---|
| 582 | + u32 *cr_data; |
---|
| 583 | + int err; |
---|
| 584 | + |
---|
| 585 | + if (!mlx5_core_is_pf(dev)) |
---|
| 586 | + return -EPERM; |
---|
| 587 | + |
---|
| 588 | + cr_data = kvmalloc(crdump_size, GFP_KERNEL); |
---|
| 589 | + if (!cr_data) |
---|
| 590 | + return -ENOMEM; |
---|
| 591 | + err = mlx5_crdump_collect(dev, cr_data); |
---|
| 592 | + if (err) |
---|
| 593 | + goto free_data; |
---|
| 594 | + |
---|
| 595 | + if (priv_ctx) { |
---|
| 596 | + struct mlx5_fw_reporter_ctx *fw_reporter_ctx = priv_ctx; |
---|
| 597 | + |
---|
| 598 | + err = mlx5_fw_reporter_ctx_pairs_put(fmsg, fw_reporter_ctx); |
---|
| 599 | + if (err) |
---|
| 600 | + goto free_data; |
---|
| 601 | + } |
---|
| 602 | + |
---|
| 603 | + err = devlink_fmsg_binary_pair_put(fmsg, "crdump_data", cr_data, crdump_size); |
---|
| 604 | + |
---|
| 605 | +free_data: |
---|
| 606 | + kvfree(cr_data); |
---|
| 607 | + return err; |
---|
| 608 | +} |
---|
| 609 | + |
---|
| 610 | +static void mlx5_fw_fatal_reporter_err_work(struct work_struct *work) |
---|
| 611 | +{ |
---|
| 612 | + struct mlx5_fw_reporter_ctx fw_reporter_ctx; |
---|
| 613 | + struct mlx5_core_health *health; |
---|
| 614 | + struct mlx5_core_dev *dev; |
---|
| 615 | + struct mlx5_priv *priv; |
---|
| 616 | + |
---|
| 617 | + health = container_of(work, struct mlx5_core_health, fatal_report_work); |
---|
| 618 | + priv = container_of(health, struct mlx5_priv, health); |
---|
| 619 | + dev = container_of(priv, struct mlx5_core_dev, priv); |
---|
| 620 | + |
---|
| 621 | + enter_error_state(dev, false); |
---|
| 622 | + if (IS_ERR_OR_NULL(health->fw_fatal_reporter)) { |
---|
| 623 | + if (mlx5_health_try_recover(dev)) |
---|
| 624 | + mlx5_core_err(dev, "health recovery failed\n"); |
---|
| 625 | + return; |
---|
| 626 | + } |
---|
| 627 | + fw_reporter_ctx.err_synd = health->synd; |
---|
| 628 | + fw_reporter_ctx.miss_counter = health->miss_counter; |
---|
| 629 | + devlink_health_report(health->fw_fatal_reporter, |
---|
| 630 | + "FW fatal error reported", &fw_reporter_ctx); |
---|
| 631 | +} |
---|
| 632 | + |
---|
| 633 | +static const struct devlink_health_reporter_ops mlx5_fw_fatal_reporter_ops = { |
---|
| 634 | + .name = "fw_fatal", |
---|
| 635 | + .recover = mlx5_fw_fatal_reporter_recover, |
---|
| 636 | + .dump = mlx5_fw_fatal_reporter_dump, |
---|
| 637 | +}; |
---|
| 638 | + |
---|
| 639 | +#define MLX5_REPORTER_FW_GRACEFUL_PERIOD 1200000 |
---|
| 640 | +static void mlx5_fw_reporters_create(struct mlx5_core_dev *dev) |
---|
| 641 | +{ |
---|
| 642 | + struct mlx5_core_health *health = &dev->priv.health; |
---|
| 643 | + struct devlink *devlink = priv_to_devlink(dev); |
---|
| 644 | + |
---|
| 645 | + health->fw_reporter = |
---|
| 646 | + devlink_health_reporter_create(devlink, &mlx5_fw_reporter_ops, |
---|
| 647 | + 0, dev); |
---|
| 648 | + if (IS_ERR(health->fw_reporter)) |
---|
| 649 | + mlx5_core_warn(dev, "Failed to create fw reporter, err = %ld\n", |
---|
| 650 | + PTR_ERR(health->fw_reporter)); |
---|
| 651 | + |
---|
| 652 | + health->fw_fatal_reporter = |
---|
| 653 | + devlink_health_reporter_create(devlink, |
---|
| 654 | + &mlx5_fw_fatal_reporter_ops, |
---|
| 655 | + MLX5_REPORTER_FW_GRACEFUL_PERIOD, |
---|
| 656 | + dev); |
---|
| 657 | + if (IS_ERR(health->fw_fatal_reporter)) |
---|
| 658 | + mlx5_core_warn(dev, "Failed to create fw fatal reporter, err = %ld\n", |
---|
| 659 | + PTR_ERR(health->fw_fatal_reporter)); |
---|
| 660 | +} |
---|
| 661 | + |
---|
| 662 | +static void mlx5_fw_reporters_destroy(struct mlx5_core_dev *dev) |
---|
| 663 | +{ |
---|
| 664 | + struct mlx5_core_health *health = &dev->priv.health; |
---|
| 665 | + |
---|
| 666 | + if (!IS_ERR_OR_NULL(health->fw_reporter)) |
---|
| 667 | + devlink_health_reporter_destroy(health->fw_reporter); |
---|
| 668 | + |
---|
| 669 | + if (!IS_ERR_OR_NULL(health->fw_fatal_reporter)) |
---|
| 670 | + devlink_health_reporter_destroy(health->fw_fatal_reporter); |
---|
261 | 671 | } |
---|
262 | 672 | |
---|
263 | 673 | static unsigned long get_next_poll_jiffies(void) |
---|
.. | .. |
---|
278 | 688 | |
---|
279 | 689 | spin_lock_irqsave(&health->wq_lock, flags); |
---|
280 | 690 | if (!test_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags)) |
---|
281 | | - queue_work(health->wq, &health->work); |
---|
| 691 | + queue_work(health->wq, &health->fatal_report_work); |
---|
282 | 692 | else |
---|
283 | | - dev_err(&dev->pdev->dev, |
---|
284 | | - "new health works are not permitted at this stage\n"); |
---|
| 693 | + mlx5_core_err(dev, "new health works are not permitted at this stage\n"); |
---|
285 | 694 | spin_unlock_irqrestore(&health->wq_lock, flags); |
---|
286 | 695 | } |
---|
287 | 696 | |
---|
.. | .. |
---|
289 | 698 | { |
---|
290 | 699 | struct mlx5_core_dev *dev = from_timer(dev, t, priv.health.timer); |
---|
291 | 700 | struct mlx5_core_health *health = &dev->priv.health; |
---|
| 701 | + struct health_buffer __iomem *h = health->health; |
---|
| 702 | + u32 fatal_error; |
---|
| 703 | + u8 prev_synd; |
---|
292 | 704 | u32 count; |
---|
293 | 705 | |
---|
294 | 706 | if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) |
---|
295 | 707 | goto out; |
---|
| 708 | + |
---|
| 709 | + fatal_error = mlx5_health_check_fatal_sensors(dev); |
---|
| 710 | + |
---|
| 711 | + if (fatal_error && !health->fatal_error) { |
---|
| 712 | + mlx5_core_err(dev, "Fatal error %u detected\n", fatal_error); |
---|
| 713 | + dev->priv.health.fatal_error = fatal_error; |
---|
| 714 | + print_health_info(dev); |
---|
| 715 | + dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR; |
---|
| 716 | + mlx5_trigger_health_work(dev); |
---|
| 717 | + return; |
---|
| 718 | + } |
---|
296 | 719 | |
---|
297 | 720 | count = ioread32be(health->health_counter); |
---|
298 | 721 | if (count == health->prev) |
---|
.. | .. |
---|
302 | 725 | |
---|
303 | 726 | health->prev = count; |
---|
304 | 727 | if (health->miss_counter == MAX_MISSES) { |
---|
305 | | - dev_err(&dev->pdev->dev, "device's health compromised - reached miss count\n"); |
---|
| 728 | + mlx5_core_err(dev, "device's health compromised - reached miss count\n"); |
---|
306 | 729 | print_health_info(dev); |
---|
| 730 | + queue_work(health->wq, &health->report_work); |
---|
307 | 731 | } |
---|
308 | 732 | |
---|
309 | | - if (in_fatal(dev) && !health->sick) { |
---|
310 | | - health->sick = true; |
---|
311 | | - print_health_info(dev); |
---|
312 | | - mlx5_trigger_health_work(dev); |
---|
313 | | - } |
---|
| 733 | + prev_synd = health->synd; |
---|
| 734 | + health->synd = ioread8(&h->synd); |
---|
| 735 | + if (health->synd && health->synd != prev_synd) |
---|
| 736 | + queue_work(health->wq, &health->report_work); |
---|
314 | 737 | |
---|
315 | 738 | out: |
---|
316 | 739 | mod_timer(&health->timer, get_next_poll_jiffies()); |
---|
.. | .. |
---|
321 | 744 | struct mlx5_core_health *health = &dev->priv.health; |
---|
322 | 745 | |
---|
323 | 746 | timer_setup(&health->timer, poll_health, 0); |
---|
324 | | - health->sick = 0; |
---|
| 747 | + health->fatal_error = MLX5_SENSOR_NO_ERR; |
---|
325 | 748 | clear_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags); |
---|
326 | | - clear_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags); |
---|
327 | 749 | health->health = &dev->iseg->health; |
---|
328 | 750 | health->health_counter = &dev->iseg->health_counter; |
---|
329 | 751 | |
---|
.. | .. |
---|
339 | 761 | if (disable_health) { |
---|
340 | 762 | spin_lock_irqsave(&health->wq_lock, flags); |
---|
341 | 763 | set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags); |
---|
342 | | - set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags); |
---|
343 | 764 | spin_unlock_irqrestore(&health->wq_lock, flags); |
---|
344 | 765 | } |
---|
345 | 766 | |
---|
.. | .. |
---|
353 | 774 | |
---|
354 | 775 | spin_lock_irqsave(&health->wq_lock, flags); |
---|
355 | 776 | set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags); |
---|
356 | | - set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags); |
---|
357 | 777 | spin_unlock_irqrestore(&health->wq_lock, flags); |
---|
358 | | - cancel_delayed_work_sync(&health->recover_work); |
---|
359 | | - cancel_work_sync(&health->work); |
---|
| 778 | + cancel_work_sync(&health->report_work); |
---|
| 779 | + cancel_work_sync(&health->fatal_report_work); |
---|
360 | 780 | } |
---|
361 | 781 | |
---|
362 | | -void mlx5_drain_health_recovery(struct mlx5_core_dev *dev) |
---|
| 782 | +void mlx5_health_flush(struct mlx5_core_dev *dev) |
---|
363 | 783 | { |
---|
364 | 784 | struct mlx5_core_health *health = &dev->priv.health; |
---|
365 | | - unsigned long flags; |
---|
366 | 785 | |
---|
367 | | - spin_lock_irqsave(&health->wq_lock, flags); |
---|
368 | | - set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags); |
---|
369 | | - spin_unlock_irqrestore(&health->wq_lock, flags); |
---|
370 | | - cancel_delayed_work_sync(&dev->priv.health.recover_work); |
---|
| 786 | + flush_workqueue(health->wq); |
---|
371 | 787 | } |
---|
372 | 788 | |
---|
373 | 789 | void mlx5_health_cleanup(struct mlx5_core_dev *dev) |
---|
.. | .. |
---|
375 | 791 | struct mlx5_core_health *health = &dev->priv.health; |
---|
376 | 792 | |
---|
377 | 793 | destroy_workqueue(health->wq); |
---|
| 794 | + mlx5_fw_reporters_destroy(dev); |
---|
378 | 795 | } |
---|
379 | 796 | |
---|
380 | 797 | int mlx5_health_init(struct mlx5_core_dev *dev) |
---|
.. | .. |
---|
382 | 799 | struct mlx5_core_health *health; |
---|
383 | 800 | char *name; |
---|
384 | 801 | |
---|
| 802 | + mlx5_fw_reporters_create(dev); |
---|
| 803 | + |
---|
385 | 804 | health = &dev->priv.health; |
---|
386 | 805 | name = kmalloc(64, GFP_KERNEL); |
---|
387 | 806 | if (!name) |
---|
388 | | - return -ENOMEM; |
---|
| 807 | + goto out_err; |
---|
389 | 808 | |
---|
390 | 809 | strcpy(name, "mlx5_health"); |
---|
391 | | - strcat(name, dev_name(&dev->pdev->dev)); |
---|
| 810 | + strcat(name, dev_name(dev->device)); |
---|
392 | 811 | health->wq = create_singlethread_workqueue(name); |
---|
393 | 812 | kfree(name); |
---|
394 | 813 | if (!health->wq) |
---|
395 | | - return -ENOMEM; |
---|
| 814 | + goto out_err; |
---|
396 | 815 | spin_lock_init(&health->wq_lock); |
---|
397 | | - INIT_WORK(&health->work, health_care); |
---|
398 | | - INIT_DELAYED_WORK(&health->recover_work, health_recover); |
---|
| 816 | + INIT_WORK(&health->fatal_report_work, mlx5_fw_fatal_reporter_err_work); |
---|
| 817 | + INIT_WORK(&health->report_work, mlx5_fw_reporter_err_work); |
---|
399 | 818 | |
---|
400 | 819 | return 0; |
---|
| 820 | + |
---|
| 821 | +out_err: |
---|
| 822 | + mlx5_fw_reporters_destroy(dev); |
---|
| 823 | + return -ENOMEM; |
---|
401 | 824 | } |
---|