| .. | .. |
|---|
| 36 | 36 | #include <linux/vmalloc.h> |
|---|
| 37 | 37 | #include <linux/hardirq.h> |
|---|
| 38 | 38 | #include <linux/mlx5/driver.h> |
|---|
| 39 | | -#include <linux/mlx5/cmd.h> |
|---|
| 40 | 39 | #include "mlx5_core.h" |
|---|
| 40 | +#include "lib/eq.h" |
|---|
| 41 | +#include "lib/mlx5.h" |
|---|
| 42 | +#include "lib/pci_vsc.h" |
|---|
| 43 | +#include "diag/fw_tracer.h" |
|---|
| 41 | 44 | |
|---|
| 42 | 45 | enum { |
|---|
| 43 | 46 | MLX5_HEALTH_POLL_INTERVAL = 2 * HZ, |
|---|
| .. | .. |
|---|
| 59 | 62 | }; |
|---|
| 60 | 63 | |
|---|
| 61 | 64 | enum { |
|---|
| 62 | | - MLX5_NIC_IFC_FULL = 0, |
|---|
| 63 | | - MLX5_NIC_IFC_DISABLED = 1, |
|---|
| 64 | | - MLX5_NIC_IFC_NO_DRAM_NIC = 2, |
|---|
| 65 | | - MLX5_NIC_IFC_INVALID = 3 |
|---|
| 66 | | -}; |
|---|
| 67 | | - |
|---|
| 68 | | -enum { |
|---|
| 69 | 65 | MLX5_DROP_NEW_HEALTH_WORK, |
|---|
| 70 | | - MLX5_DROP_NEW_RECOVERY_WORK, |
|---|
| 71 | 66 | }; |
|---|
| 72 | 67 | |
|---|
| 73 | | -static u8 get_nic_state(struct mlx5_core_dev *dev) |
|---|
| 68 | +enum { |
|---|
| 69 | + MLX5_SENSOR_NO_ERR = 0, |
|---|
| 70 | + MLX5_SENSOR_PCI_COMM_ERR = 1, |
|---|
| 71 | + MLX5_SENSOR_PCI_ERR = 2, |
|---|
| 72 | + MLX5_SENSOR_NIC_DISABLED = 3, |
|---|
| 73 | + MLX5_SENSOR_NIC_SW_RESET = 4, |
|---|
| 74 | + MLX5_SENSOR_FW_SYND_RFR = 5, |
|---|
| 75 | +}; |
|---|
| 76 | + |
|---|
| 77 | +u8 mlx5_get_nic_state(struct mlx5_core_dev *dev) |
|---|
| 74 | 78 | { |
|---|
| 75 | | - return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 3; |
|---|
| 79 | + return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 7; |
|---|
| 76 | 80 | } |
|---|
| 77 | 81 | |
|---|
| 78 | | -static void trigger_cmd_completions(struct mlx5_core_dev *dev) |
|---|
| 82 | +void mlx5_set_nic_state(struct mlx5_core_dev *dev, u8 state) |
|---|
| 79 | 83 | { |
|---|
| 80 | | - unsigned long flags; |
|---|
| 81 | | - u64 vector; |
|---|
| 84 | + u32 cur_cmdq_addr_l_sz; |
|---|
| 82 | 85 | |
|---|
| 83 | | - /* wait for pending handlers to complete */ |
|---|
| 84 | | - synchronize_irq(pci_irq_vector(dev->pdev, MLX5_EQ_VEC_CMD)); |
|---|
| 85 | | - spin_lock_irqsave(&dev->cmd.alloc_lock, flags); |
|---|
| 86 | | - vector = ~dev->cmd.bitmask & ((1ul << (1 << dev->cmd.log_sz)) - 1); |
|---|
| 87 | | - if (!vector) |
|---|
| 88 | | - goto no_trig; |
|---|
| 89 | | - |
|---|
| 90 | | - vector |= MLX5_TRIGGERED_CMD_COMP; |
|---|
| 91 | | - spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags); |
|---|
| 92 | | - |
|---|
| 93 | | - mlx5_core_dbg(dev, "vector 0x%llx\n", vector); |
|---|
| 94 | | - mlx5_cmd_comp_handler(dev, vector, true); |
|---|
| 95 | | - return; |
|---|
| 96 | | - |
|---|
| 97 | | -no_trig: |
|---|
| 98 | | - spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags); |
|---|
| 86 | + cur_cmdq_addr_l_sz = ioread32be(&dev->iseg->cmdq_addr_l_sz); |
|---|
| 87 | + iowrite32be((cur_cmdq_addr_l_sz & 0xFFFFF000) | |
|---|
| 88 | + state << MLX5_NIC_IFC_OFFSET, |
|---|
| 89 | + &dev->iseg->cmdq_addr_l_sz); |
|---|
| 99 | 90 | } |
|---|
| 100 | 91 | |
|---|
| 101 | | -static int in_fatal(struct mlx5_core_dev *dev) |
|---|
| 92 | +static bool sensor_pci_not_working(struct mlx5_core_dev *dev) |
|---|
| 102 | 93 | { |
|---|
| 103 | 94 | struct mlx5_core_health *health = &dev->priv.health; |
|---|
| 104 | 95 | struct health_buffer __iomem *h = health->health; |
|---|
| 105 | 96 | |
|---|
| 106 | | - if (get_nic_state(dev) == MLX5_NIC_IFC_DISABLED) |
|---|
| 107 | | - return 1; |
|---|
| 97 | + /* Offline PCI reads return 0xffffffff */ |
|---|
| 98 | + return (ioread32be(&h->fw_ver) == 0xffffffff); |
|---|
| 99 | +} |
|---|
| 108 | 100 | |
|---|
| 109 | | - if (ioread32be(&h->fw_ver) == 0xffffffff) |
|---|
| 110 | | - return 1; |
|---|
| 101 | +static bool sensor_fw_synd_rfr(struct mlx5_core_dev *dev) |
|---|
| 102 | +{ |
|---|
| 103 | + struct mlx5_core_health *health = &dev->priv.health; |
|---|
| 104 | + struct health_buffer __iomem *h = health->health; |
|---|
| 105 | + u32 rfr = ioread32be(&h->rfr) >> MLX5_RFR_OFFSET; |
|---|
| 106 | + u8 synd = ioread8(&h->synd); |
|---|
| 111 | 107 | |
|---|
| 112 | | - return 0; |
|---|
| 108 | + if (rfr && synd) |
|---|
| 109 | + mlx5_core_dbg(dev, "FW requests reset, synd: %d\n", synd); |
|---|
| 110 | + return rfr && synd; |
|---|
| 111 | +} |
|---|
| 112 | + |
|---|
| 113 | +u32 mlx5_health_check_fatal_sensors(struct mlx5_core_dev *dev) |
|---|
| 114 | +{ |
|---|
| 115 | + if (sensor_pci_not_working(dev)) |
|---|
| 116 | + return MLX5_SENSOR_PCI_COMM_ERR; |
|---|
| 117 | + if (pci_channel_offline(dev->pdev)) |
|---|
| 118 | + return MLX5_SENSOR_PCI_ERR; |
|---|
| 119 | + if (mlx5_get_nic_state(dev) == MLX5_NIC_IFC_DISABLED) |
|---|
| 120 | + return MLX5_SENSOR_NIC_DISABLED; |
|---|
| 121 | + if (mlx5_get_nic_state(dev) == MLX5_NIC_IFC_SW_RESET) |
|---|
| 122 | + return MLX5_SENSOR_NIC_SW_RESET; |
|---|
| 123 | + if (sensor_fw_synd_rfr(dev)) |
|---|
| 124 | + return MLX5_SENSOR_FW_SYND_RFR; |
|---|
| 125 | + |
|---|
| 126 | + return MLX5_SENSOR_NO_ERR; |
|---|
| 127 | +} |
|---|
| 128 | + |
|---|
| 129 | +static int lock_sem_sw_reset(struct mlx5_core_dev *dev, bool lock) |
|---|
| 130 | +{ |
|---|
| 131 | + enum mlx5_vsc_state state; |
|---|
| 132 | + int ret; |
|---|
| 133 | + |
|---|
| 134 | + if (!mlx5_core_is_pf(dev)) |
|---|
| 135 | + return -EBUSY; |
|---|
| 136 | + |
|---|
| 137 | + /* Try to lock GW access, this stage doesn't return |
|---|
| 138 | + * EBUSY because locked GW does not mean that other PF |
|---|
| 139 | + * already started the reset. |
|---|
| 140 | + */ |
|---|
| 141 | + ret = mlx5_vsc_gw_lock(dev); |
|---|
| 142 | + if (ret == -EBUSY) |
|---|
| 143 | + return -EINVAL; |
|---|
| 144 | + if (ret) |
|---|
| 145 | + return ret; |
|---|
| 146 | + |
|---|
| 147 | + state = lock ? MLX5_VSC_LOCK : MLX5_VSC_UNLOCK; |
|---|
| 148 | + /* At this stage, if the return status == EBUSY, then we know |
|---|
| 149 | + * for sure that another PF started the reset, so don't allow |
|---|
| 150 | + * another reset. |
|---|
| 151 | + */ |
|---|
| 152 | + ret = mlx5_vsc_sem_set_space(dev, MLX5_SEMAPHORE_SW_RESET, state); |
|---|
| 153 | + if (ret) |
|---|
| 154 | + mlx5_core_warn(dev, "Failed to lock SW reset semaphore\n"); |
|---|
| 155 | + |
|---|
| 156 | + /* Unlock GW access */ |
|---|
| 157 | + mlx5_vsc_gw_unlock(dev); |
|---|
| 158 | + |
|---|
| 159 | + return ret; |
|---|
| 160 | +} |
|---|
| 161 | + |
|---|
| 162 | +static bool reset_fw_if_needed(struct mlx5_core_dev *dev) |
|---|
| 163 | +{ |
|---|
| 164 | + bool supported = (ioread32be(&dev->iseg->initializing) >> |
|---|
| 165 | + MLX5_FW_RESET_SUPPORTED_OFFSET) & 1; |
|---|
| 166 | + u32 fatal_error; |
|---|
| 167 | + |
|---|
| 168 | + if (!supported) |
|---|
| 169 | + return false; |
|---|
| 170 | + |
|---|
| 171 | + /* The reset only needs to be issued by one PF. The health buffer is |
|---|
| 172 | + * shared between all functions, and will be cleared during a reset. |
|---|
| 173 | + * Check again to avoid a redundant 2nd reset. If the fatal error was |
|---|
| 174 | + * PCI related a reset won't help. |
|---|
| 175 | + */ |
|---|
| 176 | + fatal_error = mlx5_health_check_fatal_sensors(dev); |
|---|
| 177 | + if (fatal_error == MLX5_SENSOR_PCI_COMM_ERR || |
|---|
| 178 | + fatal_error == MLX5_SENSOR_NIC_DISABLED || |
|---|
| 179 | + fatal_error == MLX5_SENSOR_NIC_SW_RESET) { |
|---|
| 180 | + mlx5_core_warn(dev, "Not issuing FW reset. Either it's already done or won't help."); |
|---|
| 181 | + return false; |
|---|
| 182 | + } |
|---|
| 183 | + |
|---|
| 184 | + mlx5_core_warn(dev, "Issuing FW Reset\n"); |
|---|
| 185 | + /* Write the NIC interface field to initiate the reset, the command |
|---|
| 186 | + * interface address also resides here, don't overwrite it. |
|---|
| 187 | + */ |
|---|
| 188 | + mlx5_set_nic_state(dev, MLX5_NIC_IFC_SW_RESET); |
|---|
| 189 | + |
|---|
| 190 | + return true; |
|---|
| 191 | +} |
|---|
| 192 | + |
|---|
| 193 | +static void enter_error_state(struct mlx5_core_dev *dev, bool force) |
|---|
| 194 | +{ |
|---|
| 195 | + if (mlx5_health_check_fatal_sensors(dev) || force) { /* protected state setting */ |
|---|
| 196 | + dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR; |
|---|
| 197 | + mlx5_cmd_flush(dev); |
|---|
| 198 | + } |
|---|
| 199 | + |
|---|
| 200 | + mlx5_notifier_call_chain(dev->priv.events, MLX5_DEV_EVENT_SYS_ERROR, (void *)1); |
|---|
| 113 | 201 | } |
|---|
| 114 | 202 | |
|---|
| 115 | 203 | void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force) |
|---|
| 116 | 204 | { |
|---|
| 205 | + bool err_detected = false; |
|---|
| 206 | + |
|---|
| 207 | + /* Mark the device as fatal in order to abort FW commands */ |
|---|
| 208 | + if ((mlx5_health_check_fatal_sensors(dev) || force) && |
|---|
| 209 | + dev->state == MLX5_DEVICE_STATE_UP) { |
|---|
| 210 | + dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR; |
|---|
| 211 | + err_detected = true; |
|---|
| 212 | + } |
|---|
| 117 | 213 | mutex_lock(&dev->intf_state_mutex); |
|---|
| 118 | | - if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) |
|---|
| 214 | + if (!err_detected && dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) |
|---|
| 215 | + goto unlock; /* a previous error is still being handled */ |
|---|
| 216 | + if (dev->state == MLX5_DEVICE_STATE_UNINITIALIZED) { |
|---|
| 217 | + dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR; |
|---|
| 218 | + goto unlock; |
|---|
| 219 | + } |
|---|
| 220 | + |
|---|
| 221 | + enter_error_state(dev, force); |
|---|
| 222 | +unlock: |
|---|
| 223 | + mutex_unlock(&dev->intf_state_mutex); |
|---|
| 224 | +} |
|---|
| 225 | + |
|---|
| 226 | +#define MLX5_CRDUMP_WAIT_MS 60000 |
|---|
| 227 | +#define MLX5_FW_RESET_WAIT_MS 1000 |
|---|
| 228 | +void mlx5_error_sw_reset(struct mlx5_core_dev *dev) |
|---|
| 229 | +{ |
|---|
| 230 | + unsigned long end, delay_ms = MLX5_FW_RESET_WAIT_MS; |
|---|
| 231 | + int lock = -EBUSY; |
|---|
| 232 | + |
|---|
| 233 | + mutex_lock(&dev->intf_state_mutex); |
|---|
| 234 | + if (dev->state != MLX5_DEVICE_STATE_INTERNAL_ERROR) |
|---|
| 119 | 235 | goto unlock; |
|---|
| 120 | 236 | |
|---|
| 121 | 237 | mlx5_core_err(dev, "start\n"); |
|---|
| 122 | | - if (pci_channel_offline(dev->pdev) || in_fatal(dev) || force) { |
|---|
| 123 | | - dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR; |
|---|
| 124 | | - trigger_cmd_completions(dev); |
|---|
| 238 | + |
|---|
| 239 | + if (mlx5_health_check_fatal_sensors(dev) == MLX5_SENSOR_FW_SYND_RFR) { |
|---|
| 240 | + /* Get cr-dump and reset FW semaphore */ |
|---|
| 241 | + lock = lock_sem_sw_reset(dev, true); |
|---|
| 242 | + |
|---|
| 243 | + if (lock == -EBUSY) { |
|---|
| 244 | + delay_ms = MLX5_CRDUMP_WAIT_MS; |
|---|
| 245 | + goto recover_from_sw_reset; |
|---|
| 246 | + } |
|---|
| 247 | + /* Execute SW reset */ |
|---|
| 248 | + reset_fw_if_needed(dev); |
|---|
| 125 | 249 | } |
|---|
| 126 | 250 | |
|---|
| 127 | | - mlx5_core_event(dev, MLX5_DEV_EVENT_SYS_ERROR, 1); |
|---|
| 251 | +recover_from_sw_reset: |
|---|
| 252 | + /* Recover from SW reset */ |
|---|
| 253 | + end = jiffies + msecs_to_jiffies(delay_ms); |
|---|
| 254 | + do { |
|---|
| 255 | + if (mlx5_get_nic_state(dev) == MLX5_NIC_IFC_DISABLED) |
|---|
| 256 | + break; |
|---|
| 257 | + |
|---|
| 258 | + msleep(20); |
|---|
| 259 | + } while (!time_after(jiffies, end)); |
|---|
| 260 | + |
|---|
| 261 | + if (mlx5_get_nic_state(dev) != MLX5_NIC_IFC_DISABLED) { |
|---|
| 262 | + dev_err(&dev->pdev->dev, "NIC IFC still %d after %lums.\n", |
|---|
| 263 | + mlx5_get_nic_state(dev), delay_ms); |
|---|
| 264 | + } |
|---|
| 265 | + |
|---|
| 266 | + /* Release FW semaphore if you are the lock owner */ |
|---|
| 267 | + if (!lock) |
|---|
| 268 | + lock_sem_sw_reset(dev, false); |
|---|
| 269 | + |
|---|
| 128 | 270 | mlx5_core_err(dev, "end\n"); |
|---|
| 129 | 271 | |
|---|
| 130 | 272 | unlock: |
|---|
| .. | .. |
|---|
| 133 | 275 | |
|---|
| 134 | 276 | static void mlx5_handle_bad_state(struct mlx5_core_dev *dev) |
|---|
| 135 | 277 | { |
|---|
| 136 | | - u8 nic_interface = get_nic_state(dev); |
|---|
| 278 | + u8 nic_interface = mlx5_get_nic_state(dev); |
|---|
| 137 | 279 | |
|---|
| 138 | 280 | switch (nic_interface) { |
|---|
| 139 | 281 | case MLX5_NIC_IFC_FULL: |
|---|
| .. | .. |
|---|
| 147 | 289 | case MLX5_NIC_IFC_NO_DRAM_NIC: |
|---|
| 148 | 290 | mlx5_core_warn(dev, "Expected to see disabled NIC but it is no dram nic\n"); |
|---|
| 149 | 291 | break; |
|---|
| 292 | + |
|---|
| 293 | + case MLX5_NIC_IFC_SW_RESET: |
|---|
| 294 | + /* The IFC mode field is 3 bits, so it will read 0x7 in 2 cases: |
|---|
| 295 | + * 1. PCI has been disabled (ie. PCI-AER, PF driver unloaded |
|---|
| 296 | + * and this is a VF), this is not recoverable by SW reset. |
|---|
| 297 | + * Logging of this is handled elsewhere. |
|---|
| 298 | + * 2. FW reset has been issued by another function, driver can |
|---|
| 299 | + * be reloaded to recover after the mode switches to |
|---|
| 300 | + * MLX5_NIC_IFC_DISABLED. |
|---|
| 301 | + */ |
|---|
| 302 | + if (dev->priv.health.fatal_error != MLX5_SENSOR_PCI_COMM_ERR) |
|---|
| 303 | + mlx5_core_warn(dev, "NIC SW reset in progress\n"); |
|---|
| 304 | + break; |
|---|
| 305 | + |
|---|
| 150 | 306 | default: |
|---|
| 151 | 307 | mlx5_core_warn(dev, "Expected to see disabled NIC but it is has invalid value %d\n", |
|---|
| 152 | 308 | nic_interface); |
|---|
| .. | .. |
|---|
| 155 | 311 | mlx5_disable_device(dev); |
|---|
| 156 | 312 | } |
|---|
| 157 | 313 | |
|---|
| 158 | | -static void health_recover(struct work_struct *work) |
|---|
| 314 | +/* How much time to wait until health resetting the driver (in msecs) */ |
|---|
| 315 | +#define MLX5_RECOVERY_WAIT_MSECS 60000 |
|---|
| 316 | +int mlx5_health_wait_pci_up(struct mlx5_core_dev *dev) |
|---|
| 159 | 317 | { |
|---|
| 160 | | - struct mlx5_core_health *health; |
|---|
| 161 | | - struct delayed_work *dwork; |
|---|
| 162 | | - struct mlx5_core_dev *dev; |
|---|
| 163 | | - struct mlx5_priv *priv; |
|---|
| 164 | | - u8 nic_state; |
|---|
| 318 | + unsigned long end; |
|---|
| 165 | 319 | |
|---|
| 166 | | - dwork = container_of(work, struct delayed_work, work); |
|---|
| 167 | | - health = container_of(dwork, struct mlx5_core_health, recover_work); |
|---|
| 168 | | - priv = container_of(health, struct mlx5_priv, health); |
|---|
| 169 | | - dev = container_of(priv, struct mlx5_core_dev, priv); |
|---|
| 170 | | - |
|---|
| 171 | | - nic_state = get_nic_state(dev); |
|---|
| 172 | | - if (nic_state == MLX5_NIC_IFC_INVALID) { |
|---|
| 173 | | - dev_err(&dev->pdev->dev, "health recovery flow aborted since the nic state is invalid\n"); |
|---|
| 174 | | - return; |
|---|
| 320 | + end = jiffies + msecs_to_jiffies(MLX5_RECOVERY_WAIT_MSECS); |
|---|
| 321 | + while (sensor_pci_not_working(dev)) { |
|---|
| 322 | + if (time_after(jiffies, end)) |
|---|
| 323 | + return -ETIMEDOUT; |
|---|
| 324 | + msleep(100); |
|---|
| 175 | 325 | } |
|---|
| 176 | | - |
|---|
| 177 | | - dev_err(&dev->pdev->dev, "starting health recovery flow\n"); |
|---|
| 178 | | - mlx5_recover_device(dev); |
|---|
| 326 | + return 0; |
|---|
| 179 | 327 | } |
|---|
| 180 | 328 | |
|---|
| 181 | | -/* How much time to wait until health resetting the driver (in msecs) */ |
|---|
| 182 | | -#define MLX5_RECOVERY_DELAY_MSECS 60000 |
|---|
| 183 | | -static void health_care(struct work_struct *work) |
|---|
| 329 | +static int mlx5_health_try_recover(struct mlx5_core_dev *dev) |
|---|
| 184 | 330 | { |
|---|
| 185 | | - unsigned long recover_delay = msecs_to_jiffies(MLX5_RECOVERY_DELAY_MSECS); |
|---|
| 186 | | - struct mlx5_core_health *health; |
|---|
| 187 | | - struct mlx5_core_dev *dev; |
|---|
| 188 | | - struct mlx5_priv *priv; |
|---|
| 189 | | - unsigned long flags; |
|---|
| 190 | | - |
|---|
| 191 | | - health = container_of(work, struct mlx5_core_health, work); |
|---|
| 192 | | - priv = container_of(health, struct mlx5_priv, health); |
|---|
| 193 | | - dev = container_of(priv, struct mlx5_core_dev, priv); |
|---|
| 194 | 331 | mlx5_core_warn(dev, "handling bad device here\n"); |
|---|
| 195 | 332 | mlx5_handle_bad_state(dev); |
|---|
| 196 | | - |
|---|
| 197 | | - spin_lock_irqsave(&health->wq_lock, flags); |
|---|
| 198 | | - if (!test_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags)) |
|---|
| 199 | | - schedule_delayed_work(&health->recover_work, recover_delay); |
|---|
| 200 | | - else |
|---|
| 201 | | - dev_err(&dev->pdev->dev, |
|---|
| 202 | | - "new health works are not permitted at this stage\n"); |
|---|
| 203 | | - spin_unlock_irqrestore(&health->wq_lock, flags); |
|---|
| 333 | + if (mlx5_health_wait_pci_up(dev)) { |
|---|
| 334 | + mlx5_core_err(dev, "health recovery flow aborted, PCI reads still not working\n"); |
|---|
| 335 | + return -EIO; |
|---|
| 336 | + } |
|---|
| 337 | + mlx5_core_err(dev, "starting health recovery flow\n"); |
|---|
| 338 | + mlx5_recover_device(dev); |
|---|
| 339 | + if (!test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state) || |
|---|
| 340 | + mlx5_health_check_fatal_sensors(dev)) { |
|---|
| 341 | + mlx5_core_err(dev, "health recovery failed\n"); |
|---|
| 342 | + return -EIO; |
|---|
| 343 | + } |
|---|
| 344 | + return 0; |
|---|
| 204 | 345 | } |
|---|
| 205 | 346 | |
|---|
| 206 | 347 | static const char *hsynd_str(u8 synd) |
|---|
| .. | .. |
|---|
| 246 | 387 | return; |
|---|
| 247 | 388 | |
|---|
| 248 | 389 | for (i = 0; i < ARRAY_SIZE(h->assert_var); i++) |
|---|
| 249 | | - dev_err(&dev->pdev->dev, "assert_var[%d] 0x%08x\n", i, ioread32be(h->assert_var + i)); |
|---|
| 390 | + mlx5_core_err(dev, "assert_var[%d] 0x%08x\n", i, |
|---|
| 391 | + ioread32be(h->assert_var + i)); |
|---|
| 250 | 392 | |
|---|
| 251 | | - dev_err(&dev->pdev->dev, "assert_exit_ptr 0x%08x\n", ioread32be(&h->assert_exit_ptr)); |
|---|
| 252 | | - dev_err(&dev->pdev->dev, "assert_callra 0x%08x\n", ioread32be(&h->assert_callra)); |
|---|
| 393 | + mlx5_core_err(dev, "assert_exit_ptr 0x%08x\n", |
|---|
| 394 | + ioread32be(&h->assert_exit_ptr)); |
|---|
| 395 | + mlx5_core_err(dev, "assert_callra 0x%08x\n", |
|---|
| 396 | + ioread32be(&h->assert_callra)); |
|---|
| 253 | 397 | sprintf(fw_str, "%d.%d.%d", fw_rev_maj(dev), fw_rev_min(dev), fw_rev_sub(dev)); |
|---|
| 254 | | - dev_err(&dev->pdev->dev, "fw_ver %s\n", fw_str); |
|---|
| 255 | | - dev_err(&dev->pdev->dev, "hw_id 0x%08x\n", ioread32be(&h->hw_id)); |
|---|
| 256 | | - dev_err(&dev->pdev->dev, "irisc_index %d\n", ioread8(&h->irisc_index)); |
|---|
| 257 | | - dev_err(&dev->pdev->dev, "synd 0x%x: %s\n", ioread8(&h->synd), hsynd_str(ioread8(&h->synd))); |
|---|
| 258 | | - dev_err(&dev->pdev->dev, "ext_synd 0x%04x\n", ioread16be(&h->ext_synd)); |
|---|
| 398 | + mlx5_core_err(dev, "fw_ver %s\n", fw_str); |
|---|
| 399 | + mlx5_core_err(dev, "hw_id 0x%08x\n", ioread32be(&h->hw_id)); |
|---|
| 400 | + mlx5_core_err(dev, "irisc_index %d\n", ioread8(&h->irisc_index)); |
|---|
| 401 | + mlx5_core_err(dev, "synd 0x%x: %s\n", ioread8(&h->synd), |
|---|
| 402 | + hsynd_str(ioread8(&h->synd))); |
|---|
| 403 | + mlx5_core_err(dev, "ext_synd 0x%04x\n", ioread16be(&h->ext_synd)); |
|---|
| 259 | 404 | fw = ioread32be(&h->fw_ver); |
|---|
| 260 | | - dev_err(&dev->pdev->dev, "raw fw_ver 0x%08x\n", fw); |
|---|
| 405 | + mlx5_core_err(dev, "raw fw_ver 0x%08x\n", fw); |
|---|
| 406 | +} |
|---|
| 407 | + |
|---|
| 408 | +static int |
|---|
| 409 | +mlx5_fw_reporter_diagnose(struct devlink_health_reporter *reporter, |
|---|
| 410 | + struct devlink_fmsg *fmsg, |
|---|
| 411 | + struct netlink_ext_ack *extack) |
|---|
| 412 | +{ |
|---|
| 413 | + struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter); |
|---|
| 414 | + struct mlx5_core_health *health = &dev->priv.health; |
|---|
| 415 | + struct health_buffer __iomem *h = health->health; |
|---|
| 416 | + u8 synd; |
|---|
| 417 | + int err; |
|---|
| 418 | + |
|---|
| 419 | + synd = ioread8(&h->synd); |
|---|
| 420 | + err = devlink_fmsg_u8_pair_put(fmsg, "Syndrome", synd); |
|---|
| 421 | + if (err || !synd) |
|---|
| 422 | + return err; |
|---|
| 423 | + return devlink_fmsg_string_pair_put(fmsg, "Description", hsynd_str(synd)); |
|---|
| 424 | +} |
|---|
| 425 | + |
|---|
| 426 | +struct mlx5_fw_reporter_ctx { |
|---|
| 427 | + u8 err_synd; |
|---|
| 428 | + int miss_counter; |
|---|
| 429 | +}; |
|---|
| 430 | + |
|---|
| 431 | +static int |
|---|
| 432 | +mlx5_fw_reporter_ctx_pairs_put(struct devlink_fmsg *fmsg, |
|---|
| 433 | + struct mlx5_fw_reporter_ctx *fw_reporter_ctx) |
|---|
| 434 | +{ |
|---|
| 435 | + int err; |
|---|
| 436 | + |
|---|
| 437 | + err = devlink_fmsg_u8_pair_put(fmsg, "syndrome", |
|---|
| 438 | + fw_reporter_ctx->err_synd); |
|---|
| 439 | + if (err) |
|---|
| 440 | + return err; |
|---|
| 441 | + err = devlink_fmsg_u32_pair_put(fmsg, "fw_miss_counter", |
|---|
| 442 | + fw_reporter_ctx->miss_counter); |
|---|
| 443 | + if (err) |
|---|
| 444 | + return err; |
|---|
| 445 | + return 0; |
|---|
| 446 | +} |
|---|
| 447 | + |
|---|
| 448 | +static int |
|---|
| 449 | +mlx5_fw_reporter_heath_buffer_data_put(struct mlx5_core_dev *dev, |
|---|
| 450 | + struct devlink_fmsg *fmsg) |
|---|
| 451 | +{ |
|---|
| 452 | + struct mlx5_core_health *health = &dev->priv.health; |
|---|
| 453 | + struct health_buffer __iomem *h = health->health; |
|---|
| 454 | + int err; |
|---|
| 455 | + int i; |
|---|
| 456 | + |
|---|
| 457 | + if (!ioread8(&h->synd)) |
|---|
| 458 | + return 0; |
|---|
| 459 | + |
|---|
| 460 | + err = devlink_fmsg_pair_nest_start(fmsg, "health buffer"); |
|---|
| 461 | + if (err) |
|---|
| 462 | + return err; |
|---|
| 463 | + err = devlink_fmsg_obj_nest_start(fmsg); |
|---|
| 464 | + if (err) |
|---|
| 465 | + return err; |
|---|
| 466 | + err = devlink_fmsg_arr_pair_nest_start(fmsg, "assert_var"); |
|---|
| 467 | + if (err) |
|---|
| 468 | + return err; |
|---|
| 469 | + |
|---|
| 470 | + for (i = 0; i < ARRAY_SIZE(h->assert_var); i++) { |
|---|
| 471 | + err = devlink_fmsg_u32_put(fmsg, ioread32be(h->assert_var + i)); |
|---|
| 472 | + if (err) |
|---|
| 473 | + return err; |
|---|
| 474 | + } |
|---|
| 475 | + err = devlink_fmsg_arr_pair_nest_end(fmsg); |
|---|
| 476 | + if (err) |
|---|
| 477 | + return err; |
|---|
| 478 | + err = devlink_fmsg_u32_pair_put(fmsg, "assert_exit_ptr", |
|---|
| 479 | + ioread32be(&h->assert_exit_ptr)); |
|---|
| 480 | + if (err) |
|---|
| 481 | + return err; |
|---|
| 482 | + err = devlink_fmsg_u32_pair_put(fmsg, "assert_callra", |
|---|
| 483 | + ioread32be(&h->assert_callra)); |
|---|
| 484 | + if (err) |
|---|
| 485 | + return err; |
|---|
| 486 | + err = devlink_fmsg_u32_pair_put(fmsg, "hw_id", ioread32be(&h->hw_id)); |
|---|
| 487 | + if (err) |
|---|
| 488 | + return err; |
|---|
| 489 | + err = devlink_fmsg_u8_pair_put(fmsg, "irisc_index", |
|---|
| 490 | + ioread8(&h->irisc_index)); |
|---|
| 491 | + if (err) |
|---|
| 492 | + return err; |
|---|
| 493 | + err = devlink_fmsg_u8_pair_put(fmsg, "synd", ioread8(&h->synd)); |
|---|
| 494 | + if (err) |
|---|
| 495 | + return err; |
|---|
| 496 | + err = devlink_fmsg_u32_pair_put(fmsg, "ext_synd", |
|---|
| 497 | + ioread16be(&h->ext_synd)); |
|---|
| 498 | + if (err) |
|---|
| 499 | + return err; |
|---|
| 500 | + err = devlink_fmsg_u32_pair_put(fmsg, "raw_fw_ver", |
|---|
| 501 | + ioread32be(&h->fw_ver)); |
|---|
| 502 | + if (err) |
|---|
| 503 | + return err; |
|---|
| 504 | + err = devlink_fmsg_obj_nest_end(fmsg); |
|---|
| 505 | + if (err) |
|---|
| 506 | + return err; |
|---|
| 507 | + return devlink_fmsg_pair_nest_end(fmsg); |
|---|
| 508 | +} |
|---|
| 509 | + |
|---|
| 510 | +static int |
|---|
| 511 | +mlx5_fw_reporter_dump(struct devlink_health_reporter *reporter, |
|---|
| 512 | + struct devlink_fmsg *fmsg, void *priv_ctx, |
|---|
| 513 | + struct netlink_ext_ack *extack) |
|---|
| 514 | +{ |
|---|
| 515 | + struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter); |
|---|
| 516 | + int err; |
|---|
| 517 | + |
|---|
| 518 | + err = mlx5_fw_tracer_trigger_core_dump_general(dev); |
|---|
| 519 | + if (err) |
|---|
| 520 | + return err; |
|---|
| 521 | + |
|---|
| 522 | + if (priv_ctx) { |
|---|
| 523 | + struct mlx5_fw_reporter_ctx *fw_reporter_ctx = priv_ctx; |
|---|
| 524 | + |
|---|
| 525 | + err = mlx5_fw_reporter_ctx_pairs_put(fmsg, fw_reporter_ctx); |
|---|
| 526 | + if (err) |
|---|
| 527 | + return err; |
|---|
| 528 | + } |
|---|
| 529 | + |
|---|
| 530 | + err = mlx5_fw_reporter_heath_buffer_data_put(dev, fmsg); |
|---|
| 531 | + if (err) |
|---|
| 532 | + return err; |
|---|
| 533 | + return mlx5_fw_tracer_get_saved_traces_objects(dev->tracer, fmsg); |
|---|
| 534 | +} |
|---|
| 535 | + |
|---|
| 536 | +static void mlx5_fw_reporter_err_work(struct work_struct *work) |
|---|
| 537 | +{ |
|---|
| 538 | + struct mlx5_fw_reporter_ctx fw_reporter_ctx; |
|---|
| 539 | + struct mlx5_core_health *health; |
|---|
| 540 | + |
|---|
| 541 | + health = container_of(work, struct mlx5_core_health, report_work); |
|---|
| 542 | + |
|---|
| 543 | + if (IS_ERR_OR_NULL(health->fw_reporter)) |
|---|
| 544 | + return; |
|---|
| 545 | + |
|---|
| 546 | + fw_reporter_ctx.err_synd = health->synd; |
|---|
| 547 | + fw_reporter_ctx.miss_counter = health->miss_counter; |
|---|
| 548 | + if (fw_reporter_ctx.err_synd) { |
|---|
| 549 | + devlink_health_report(health->fw_reporter, |
|---|
| 550 | + "FW syndrom reported", &fw_reporter_ctx); |
|---|
| 551 | + return; |
|---|
| 552 | + } |
|---|
| 553 | + if (fw_reporter_ctx.miss_counter) |
|---|
| 554 | + devlink_health_report(health->fw_reporter, |
|---|
| 555 | + "FW miss counter reported", |
|---|
| 556 | + &fw_reporter_ctx); |
|---|
| 557 | +} |
|---|
| 558 | + |
|---|
| 559 | +static const struct devlink_health_reporter_ops mlx5_fw_reporter_ops = { |
|---|
| 560 | + .name = "fw", |
|---|
| 561 | + .diagnose = mlx5_fw_reporter_diagnose, |
|---|
| 562 | + .dump = mlx5_fw_reporter_dump, |
|---|
| 563 | +}; |
|---|
| 564 | + |
|---|
| 565 | +static int |
|---|
| 566 | +mlx5_fw_fatal_reporter_recover(struct devlink_health_reporter *reporter, |
|---|
| 567 | + void *priv_ctx, |
|---|
| 568 | + struct netlink_ext_ack *extack) |
|---|
| 569 | +{ |
|---|
| 570 | + struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter); |
|---|
| 571 | + |
|---|
| 572 | + return mlx5_health_try_recover(dev); |
|---|
| 573 | +} |
|---|
| 574 | + |
|---|
| 575 | +static int |
|---|
| 576 | +mlx5_fw_fatal_reporter_dump(struct devlink_health_reporter *reporter, |
|---|
| 577 | + struct devlink_fmsg *fmsg, void *priv_ctx, |
|---|
| 578 | + struct netlink_ext_ack *extack) |
|---|
| 579 | +{ |
|---|
| 580 | + struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter); |
|---|
| 581 | + u32 crdump_size = dev->priv.health.crdump_size; |
|---|
| 582 | + u32 *cr_data; |
|---|
| 583 | + int err; |
|---|
| 584 | + |
|---|
| 585 | + if (!mlx5_core_is_pf(dev)) |
|---|
| 586 | + return -EPERM; |
|---|
| 587 | + |
|---|
| 588 | + cr_data = kvmalloc(crdump_size, GFP_KERNEL); |
|---|
| 589 | + if (!cr_data) |
|---|
| 590 | + return -ENOMEM; |
|---|
| 591 | + err = mlx5_crdump_collect(dev, cr_data); |
|---|
| 592 | + if (err) |
|---|
| 593 | + goto free_data; |
|---|
| 594 | + |
|---|
| 595 | + if (priv_ctx) { |
|---|
| 596 | + struct mlx5_fw_reporter_ctx *fw_reporter_ctx = priv_ctx; |
|---|
| 597 | + |
|---|
| 598 | + err = mlx5_fw_reporter_ctx_pairs_put(fmsg, fw_reporter_ctx); |
|---|
| 599 | + if (err) |
|---|
| 600 | + goto free_data; |
|---|
| 601 | + } |
|---|
| 602 | + |
|---|
| 603 | + err = devlink_fmsg_binary_pair_put(fmsg, "crdump_data", cr_data, crdump_size); |
|---|
| 604 | + |
|---|
| 605 | +free_data: |
|---|
| 606 | + kvfree(cr_data); |
|---|
| 607 | + return err; |
|---|
| 608 | +} |
|---|
| 609 | + |
|---|
| 610 | +static void mlx5_fw_fatal_reporter_err_work(struct work_struct *work) |
|---|
| 611 | +{ |
|---|
| 612 | + struct mlx5_fw_reporter_ctx fw_reporter_ctx; |
|---|
| 613 | + struct mlx5_core_health *health; |
|---|
| 614 | + struct mlx5_core_dev *dev; |
|---|
| 615 | + struct mlx5_priv *priv; |
|---|
| 616 | + |
|---|
| 617 | + health = container_of(work, struct mlx5_core_health, fatal_report_work); |
|---|
| 618 | + priv = container_of(health, struct mlx5_priv, health); |
|---|
| 619 | + dev = container_of(priv, struct mlx5_core_dev, priv); |
|---|
| 620 | + |
|---|
| 621 | + mutex_lock(&dev->intf_state_mutex); |
|---|
| 622 | + if (test_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags)) { |
|---|
| 623 | + mlx5_core_err(dev, "health works are not permitted at this stage\n"); |
|---|
| 624 | + mutex_unlock(&dev->intf_state_mutex); |
|---|
| 625 | + return; |
|---|
| 626 | + } |
|---|
| 627 | + mutex_unlock(&dev->intf_state_mutex); |
|---|
| 628 | + enter_error_state(dev, false); |
|---|
| 629 | + if (IS_ERR_OR_NULL(health->fw_fatal_reporter)) { |
|---|
| 630 | + if (mlx5_health_try_recover(dev)) |
|---|
| 631 | + mlx5_core_err(dev, "health recovery failed\n"); |
|---|
| 632 | + return; |
|---|
| 633 | + } |
|---|
| 634 | + fw_reporter_ctx.err_synd = health->synd; |
|---|
| 635 | + fw_reporter_ctx.miss_counter = health->miss_counter; |
|---|
| 636 | + devlink_health_report(health->fw_fatal_reporter, |
|---|
| 637 | + "FW fatal error reported", &fw_reporter_ctx); |
|---|
| 638 | +} |
|---|
| 639 | + |
|---|
| 640 | +static const struct devlink_health_reporter_ops mlx5_fw_fatal_reporter_ops = { |
|---|
| 641 | + .name = "fw_fatal", |
|---|
| 642 | + .recover = mlx5_fw_fatal_reporter_recover, |
|---|
| 643 | + .dump = mlx5_fw_fatal_reporter_dump, |
|---|
| 644 | +}; |
|---|
| 645 | + |
|---|
| 646 | +#define MLX5_REPORTER_FW_GRACEFUL_PERIOD 1200000 |
|---|
| 647 | +static void mlx5_fw_reporters_create(struct mlx5_core_dev *dev) |
|---|
| 648 | +{ |
|---|
| 649 | + struct mlx5_core_health *health = &dev->priv.health; |
|---|
| 650 | + struct devlink *devlink = priv_to_devlink(dev); |
|---|
| 651 | + |
|---|
| 652 | + health->fw_reporter = |
|---|
| 653 | + devlink_health_reporter_create(devlink, &mlx5_fw_reporter_ops, |
|---|
| 654 | + 0, dev); |
|---|
| 655 | + if (IS_ERR(health->fw_reporter)) |
|---|
| 656 | + mlx5_core_warn(dev, "Failed to create fw reporter, err = %ld\n", |
|---|
| 657 | + PTR_ERR(health->fw_reporter)); |
|---|
| 658 | + |
|---|
| 659 | + health->fw_fatal_reporter = |
|---|
| 660 | + devlink_health_reporter_create(devlink, |
|---|
| 661 | + &mlx5_fw_fatal_reporter_ops, |
|---|
| 662 | + MLX5_REPORTER_FW_GRACEFUL_PERIOD, |
|---|
| 663 | + dev); |
|---|
| 664 | + if (IS_ERR(health->fw_fatal_reporter)) |
|---|
| 665 | + mlx5_core_warn(dev, "Failed to create fw fatal reporter, err = %ld\n", |
|---|
| 666 | + PTR_ERR(health->fw_fatal_reporter)); |
|---|
| 667 | +} |
|---|
| 668 | + |
|---|
| 669 | +static void mlx5_fw_reporters_destroy(struct mlx5_core_dev *dev) |
|---|
| 670 | +{ |
|---|
| 671 | + struct mlx5_core_health *health = &dev->priv.health; |
|---|
| 672 | + |
|---|
| 673 | + if (!IS_ERR_OR_NULL(health->fw_reporter)) |
|---|
| 674 | + devlink_health_reporter_destroy(health->fw_reporter); |
|---|
| 675 | + |
|---|
| 676 | + if (!IS_ERR_OR_NULL(health->fw_fatal_reporter)) |
|---|
| 677 | + devlink_health_reporter_destroy(health->fw_fatal_reporter); |
|---|
| 261 | 678 | } |
|---|
| 262 | 679 | |
|---|
| 263 | 680 | static unsigned long get_next_poll_jiffies(void) |
|---|
| .. | .. |
|---|
| 278 | 695 | |
|---|
| 279 | 696 | spin_lock_irqsave(&health->wq_lock, flags); |
|---|
| 280 | 697 | if (!test_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags)) |
|---|
| 281 | | - queue_work(health->wq, &health->work); |
|---|
| 698 | + queue_work(health->wq, &health->fatal_report_work); |
|---|
| 282 | 699 | else |
|---|
| 283 | | - dev_err(&dev->pdev->dev, |
|---|
| 284 | | - "new health works are not permitted at this stage\n"); |
|---|
| 700 | + mlx5_core_err(dev, "new health works are not permitted at this stage\n"); |
|---|
| 285 | 701 | spin_unlock_irqrestore(&health->wq_lock, flags); |
|---|
| 286 | 702 | } |
|---|
| 287 | 703 | |
|---|
| .. | .. |
|---|
| 289 | 705 | { |
|---|
| 290 | 706 | struct mlx5_core_dev *dev = from_timer(dev, t, priv.health.timer); |
|---|
| 291 | 707 | struct mlx5_core_health *health = &dev->priv.health; |
|---|
| 708 | + struct health_buffer __iomem *h = health->health; |
|---|
| 709 | + u32 fatal_error; |
|---|
| 710 | + u8 prev_synd; |
|---|
| 292 | 711 | u32 count; |
|---|
| 293 | 712 | |
|---|
| 294 | 713 | if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) |
|---|
| 295 | 714 | goto out; |
|---|
| 715 | + |
|---|
| 716 | + fatal_error = mlx5_health_check_fatal_sensors(dev); |
|---|
| 717 | + |
|---|
| 718 | + if (fatal_error && !health->fatal_error) { |
|---|
| 719 | + mlx5_core_err(dev, "Fatal error %u detected\n", fatal_error); |
|---|
| 720 | + dev->priv.health.fatal_error = fatal_error; |
|---|
| 721 | + print_health_info(dev); |
|---|
| 722 | + dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR; |
|---|
| 723 | + mlx5_trigger_health_work(dev); |
|---|
| 724 | + return; |
|---|
| 725 | + } |
|---|
| 296 | 726 | |
|---|
| 297 | 727 | count = ioread32be(health->health_counter); |
|---|
| 298 | 728 | if (count == health->prev) |
|---|
| .. | .. |
|---|
| 302 | 732 | |
|---|
| 303 | 733 | health->prev = count; |
|---|
| 304 | 734 | if (health->miss_counter == MAX_MISSES) { |
|---|
| 305 | | - dev_err(&dev->pdev->dev, "device's health compromised - reached miss count\n"); |
|---|
| 735 | + mlx5_core_err(dev, "device's health compromised - reached miss count\n"); |
|---|
| 306 | 736 | print_health_info(dev); |
|---|
| 737 | + queue_work(health->wq, &health->report_work); |
|---|
| 307 | 738 | } |
|---|
| 308 | 739 | |
|---|
| 309 | | - if (in_fatal(dev) && !health->sick) { |
|---|
| 310 | | - health->sick = true; |
|---|
| 311 | | - print_health_info(dev); |
|---|
| 312 | | - mlx5_trigger_health_work(dev); |
|---|
| 313 | | - } |
|---|
| 740 | + prev_synd = health->synd; |
|---|
| 741 | + health->synd = ioread8(&h->synd); |
|---|
| 742 | + if (health->synd && health->synd != prev_synd) |
|---|
| 743 | + queue_work(health->wq, &health->report_work); |
|---|
| 314 | 744 | |
|---|
| 315 | 745 | out: |
|---|
| 316 | 746 | mod_timer(&health->timer, get_next_poll_jiffies()); |
|---|
| .. | .. |
|---|
| 321 | 751 | struct mlx5_core_health *health = &dev->priv.health; |
|---|
| 322 | 752 | |
|---|
| 323 | 753 | timer_setup(&health->timer, poll_health, 0); |
|---|
| 324 | | - health->sick = 0; |
|---|
| 754 | + health->fatal_error = MLX5_SENSOR_NO_ERR; |
|---|
| 325 | 755 | clear_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags); |
|---|
| 326 | | - clear_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags); |
|---|
| 327 | 756 | health->health = &dev->iseg->health; |
|---|
| 328 | 757 | health->health_counter = &dev->iseg->health_counter; |
|---|
| 329 | 758 | |
|---|
| .. | .. |
|---|
| 339 | 768 | if (disable_health) { |
|---|
| 340 | 769 | spin_lock_irqsave(&health->wq_lock, flags); |
|---|
| 341 | 770 | set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags); |
|---|
| 342 | | - set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags); |
|---|
| 343 | 771 | spin_unlock_irqrestore(&health->wq_lock, flags); |
|---|
| 344 | 772 | } |
|---|
| 345 | 773 | |
|---|
| .. | .. |
|---|
| 353 | 781 | |
|---|
| 354 | 782 | spin_lock_irqsave(&health->wq_lock, flags); |
|---|
| 355 | 783 | set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags); |
|---|
| 356 | | - set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags); |
|---|
| 357 | 784 | spin_unlock_irqrestore(&health->wq_lock, flags); |
|---|
| 358 | | - cancel_delayed_work_sync(&health->recover_work); |
|---|
| 359 | | - cancel_work_sync(&health->work); |
|---|
| 785 | + cancel_work_sync(&health->report_work); |
|---|
| 786 | + cancel_work_sync(&health->fatal_report_work); |
|---|
| 360 | 787 | } |
|---|
| 361 | 788 | |
|---|
| 362 | | -void mlx5_drain_health_recovery(struct mlx5_core_dev *dev) |
|---|
| 789 | +void mlx5_health_flush(struct mlx5_core_dev *dev) |
|---|
| 363 | 790 | { |
|---|
| 364 | 791 | struct mlx5_core_health *health = &dev->priv.health; |
|---|
| 365 | | - unsigned long flags; |
|---|
| 366 | 792 | |
|---|
| 367 | | - spin_lock_irqsave(&health->wq_lock, flags); |
|---|
| 368 | | - set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags); |
|---|
| 369 | | - spin_unlock_irqrestore(&health->wq_lock, flags); |
|---|
| 370 | | - cancel_delayed_work_sync(&dev->priv.health.recover_work); |
|---|
| 793 | + flush_workqueue(health->wq); |
|---|
| 371 | 794 | } |
|---|
| 372 | 795 | |
|---|
| 373 | 796 | void mlx5_health_cleanup(struct mlx5_core_dev *dev) |
|---|
| .. | .. |
|---|
| 375 | 798 | struct mlx5_core_health *health = &dev->priv.health; |
|---|
| 376 | 799 | |
|---|
| 377 | 800 | destroy_workqueue(health->wq); |
|---|
| 801 | + mlx5_fw_reporters_destroy(dev); |
|---|
| 378 | 802 | } |
|---|
| 379 | 803 | |
|---|
| 380 | 804 | int mlx5_health_init(struct mlx5_core_dev *dev) |
|---|
| .. | .. |
|---|
| 382 | 806 | struct mlx5_core_health *health; |
|---|
| 383 | 807 | char *name; |
|---|
| 384 | 808 | |
|---|
| 809 | + mlx5_fw_reporters_create(dev); |
|---|
| 810 | + |
|---|
| 385 | 811 | health = &dev->priv.health; |
|---|
| 386 | 812 | name = kmalloc(64, GFP_KERNEL); |
|---|
| 387 | 813 | if (!name) |
|---|
| 388 | | - return -ENOMEM; |
|---|
| 814 | + goto out_err; |
|---|
| 389 | 815 | |
|---|
| 390 | 816 | strcpy(name, "mlx5_health"); |
|---|
| 391 | | - strcat(name, dev_name(&dev->pdev->dev)); |
|---|
| 817 | + strcat(name, dev_name(dev->device)); |
|---|
| 392 | 818 | health->wq = create_singlethread_workqueue(name); |
|---|
| 393 | 819 | kfree(name); |
|---|
| 394 | 820 | if (!health->wq) |
|---|
| 395 | | - return -ENOMEM; |
|---|
| 821 | + goto out_err; |
|---|
| 396 | 822 | spin_lock_init(&health->wq_lock); |
|---|
| 397 | | - INIT_WORK(&health->work, health_care); |
|---|
| 398 | | - INIT_DELAYED_WORK(&health->recover_work, health_recover); |
|---|
| 823 | + INIT_WORK(&health->fatal_report_work, mlx5_fw_fatal_reporter_err_work); |
|---|
| 824 | + INIT_WORK(&health->report_work, mlx5_fw_reporter_err_work); |
|---|
| 399 | 825 | |
|---|
| 400 | 826 | return 0; |
|---|
| 827 | + |
|---|
| 828 | +out_err: |
|---|
| 829 | + mlx5_fw_reporters_destroy(dev); |
|---|
| 830 | + return -ENOMEM; |
|---|
| 401 | 831 | } |
|---|