| .. | .. |
|---|
| 27 | 27 | */ |
|---|
| 28 | 28 | #include <linux/power_supply.h> |
|---|
| 29 | 29 | #include <linux/kthread.h> |
|---|
| 30 | +#include <linux/module.h> |
|---|
| 30 | 31 | #include <linux/console.h> |
|---|
| 31 | 32 | #include <linux/slab.h> |
|---|
| 32 | | -#include <drm/drmP.h> |
|---|
| 33 | | -#include <drm/drm_crtc_helper.h> |
|---|
| 33 | + |
|---|
| 34 | 34 | #include <drm/drm_atomic_helper.h> |
|---|
| 35 | +#include <drm/drm_probe_helper.h> |
|---|
| 35 | 36 | #include <drm/amdgpu_drm.h> |
|---|
| 36 | 37 | #include <linux/vgaarb.h> |
|---|
| 37 | 38 | #include <linux/vga_switcheroo.h> |
|---|
| .. | .. |
|---|
| 51 | 52 | #endif |
|---|
| 52 | 53 | #include "vi.h" |
|---|
| 53 | 54 | #include "soc15.h" |
|---|
| 55 | +#include "nv.h" |
|---|
| 54 | 56 | #include "bif/bif_4_1_d.h" |
|---|
| 55 | 57 | #include <linux/pci.h> |
|---|
| 56 | 58 | #include <linux/firmware.h> |
|---|
| .. | .. |
|---|
| 59 | 61 | #include "amdgpu_amdkfd.h" |
|---|
| 60 | 62 | #include "amdgpu_pm.h" |
|---|
| 61 | 63 | |
|---|
| 64 | +#include "amdgpu_xgmi.h" |
|---|
| 65 | +#include "amdgpu_ras.h" |
|---|
| 66 | +#include "amdgpu_pmu.h" |
|---|
| 67 | +#include "amdgpu_fru_eeprom.h" |
|---|
| 68 | + |
|---|
| 69 | +#include <linux/suspend.h> |
|---|
| 70 | +#include <drm/task_barrier.h> |
|---|
| 71 | +#include <linux/pm_runtime.h> |
|---|
| 72 | + |
|---|
| 62 | 73 | MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); |
|---|
| 63 | 74 | MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); |
|---|
| 64 | 75 | MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); |
|---|
| 76 | +MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); |
|---|
| 77 | +MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); |
|---|
| 78 | +MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); |
|---|
| 79 | +MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin"); |
|---|
| 80 | +MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin"); |
|---|
| 81 | +MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin"); |
|---|
| 82 | +MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); |
|---|
| 65 | 83 | |
|---|
| 66 | 84 | #define AMDGPU_RESUME_MS 2000 |
|---|
| 67 | 85 | |
|---|
| 68 | | -static const char *amdgpu_asic_name[] = { |
|---|
| 86 | +const char *amdgpu_asic_name[] = { |
|---|
| 69 | 87 | "TAHITI", |
|---|
| 70 | 88 | "PITCAIRN", |
|---|
| 71 | 89 | "VERDE", |
|---|
| .. | .. |
|---|
| 89 | 107 | "VEGA12", |
|---|
| 90 | 108 | "VEGA20", |
|---|
| 91 | 109 | "RAVEN", |
|---|
| 110 | + "ARCTURUS", |
|---|
| 111 | + "RENOIR", |
|---|
| 112 | + "NAVI10", |
|---|
| 113 | + "NAVI14", |
|---|
| 114 | + "NAVI12", |
|---|
| 115 | + "SIENNA_CICHLID", |
|---|
| 116 | + "NAVY_FLOUNDER", |
|---|
| 92 | 117 | "LAST", |
|---|
| 93 | 118 | }; |
|---|
| 119 | + |
|---|
| 120 | +/** |
|---|
| 121 | + * DOC: pcie_replay_count |
|---|
| 122 | + * |
|---|
| 123 | + * The amdgpu driver provides a sysfs API for reporting the total number |
|---|
| 124 | + * of PCIe replays (NAKs). |
|---|
| 125 | + * The file pcie_replay_count is used for this and returns the total |
|---|
| 126 | + * number of replays as a sum of the NAKs generated and NAKs received. |
|---|
| 127 | + */ |
|---|
| 128 | + |
|---|
| 129 | +static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, |
|---|
| 130 | + struct device_attribute *attr, char *buf) |
|---|
| 131 | +{ |
|---|
| 132 | + struct drm_device *ddev = dev_get_drvdata(dev); |
|---|
| 133 | + struct amdgpu_device *adev = drm_to_adev(ddev); |
|---|
| 134 | + uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); |
|---|
| 135 | + |
|---|
| 136 | + return snprintf(buf, PAGE_SIZE, "%llu\n", cnt); |
|---|
| 137 | +} |
|---|
| 138 | + |
|---|
| 139 | +static DEVICE_ATTR(pcie_replay_count, S_IRUGO, |
|---|
| 140 | + amdgpu_device_get_pcie_replay_count, NULL); |
|---|
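
For context, a minimal userspace sketch of how this attribute might be read. The sysfs path below is an assumption; the card index varies by system.

```c
/* Hedged sketch: reads the pcie_replay_count attribute exposed above.
 * The card0 path is an assumption -- adjust for the actual device. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/class/drm/card0/device/pcie_replay_count", "r");
	unsigned long long count;

	if (!f) {
		perror("pcie_replay_count");
		return 1;
	}
	if (fscanf(f, "%llu", &count) != 1) {
		fclose(f);
		return 1;
	}
	fclose(f);
	printf("PCIe replays: %llu\n", count);
	return 0;
}
```
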
| 94 | 141 | |
|---|
| 95 | 142 | static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); |
|---|
| 96 | 143 | |
|---|
| 97 | 144 | /** |
|---|
| 98 | | - * amdgpu_device_is_px - Is the device is a dGPU with HG/PX power control |
|---|
| 145 | + * DOC: product_name |
|---|
| 146 | + * |
|---|
| 147 | + * The amdgpu driver provides a sysfs API for reporting the product name |
|---|
| 148 | + * for the device. |
|---|
| 149 | + * The file product_name is used for this and returns the product name |
|---|
| 150 | + * as returned from the FRU. |
|---|
| 151 | + * NOTE: This is only available for certain server cards |
|---|
| 152 | + */ |
|---|
| 153 | + |
|---|
| 154 | +static ssize_t amdgpu_device_get_product_name(struct device *dev, |
|---|
| 155 | + struct device_attribute *attr, char *buf) |
|---|
| 156 | +{ |
|---|
| 157 | + struct drm_device *ddev = dev_get_drvdata(dev); |
|---|
| 158 | + struct amdgpu_device *adev = drm_to_adev(ddev); |
|---|
| 159 | + |
|---|
| 160 | + return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name); |
|---|
| 161 | +} |
|---|
| 162 | + |
|---|
| 163 | +static DEVICE_ATTR(product_name, S_IRUGO, |
|---|
| 164 | + amdgpu_device_get_product_name, NULL); |
|---|
| 165 | + |
|---|
| 166 | +/** |
|---|
| 167 | + * DOC: product_number |
|---|
| 168 | + * |
|---|
| 169 | + * The amdgpu driver provides a sysfs API for reporting the part number |
|---|
| 170 | + * for the device. |
|---|
| 171 | + * The file product_number is used for this and returns the part number |
|---|
| 172 | + * as returned from the FRU. |
|---|
| 173 | + * NOTE: This is only available for certain server cards |
|---|
| 174 | + */ |
|---|
| 175 | + |
|---|
| 176 | +static ssize_t amdgpu_device_get_product_number(struct device *dev, |
|---|
| 177 | + struct device_attribute *attr, char *buf) |
|---|
| 178 | +{ |
|---|
| 179 | + struct drm_device *ddev = dev_get_drvdata(dev); |
|---|
| 180 | + struct amdgpu_device *adev = drm_to_adev(ddev); |
|---|
| 181 | + |
|---|
| 182 | + return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number); |
|---|
| 183 | +} |
|---|
| 184 | + |
|---|
| 185 | +static DEVICE_ATTR(product_number, S_IRUGO, |
|---|
| 186 | + amdgpu_device_get_product_number, NULL); |
|---|
| 187 | + |
|---|
| 188 | +/** |
|---|
| 189 | + * DOC: serial_number |
|---|
| 190 | + * |
|---|
| 191 | + * The amdgpu driver provides a sysfs API for reporting the serial number |
|---|
| 192 | + * for the device. |
|---|
| 193 | + * The file serial_number is used for this and returns the serial number |
|---|
| 194 | + * as returned from the FRU. |
|---|
| 195 | + * NOTE: This is only available for certain server cards |
|---|
| 196 | + */ |
|---|
| 197 | + |
|---|
| 198 | +static ssize_t amdgpu_device_get_serial_number(struct device *dev, |
|---|
| 199 | + struct device_attribute *attr, char *buf) |
|---|
| 200 | +{ |
|---|
| 201 | + struct drm_device *ddev = dev_get_drvdata(dev); |
|---|
| 202 | + struct amdgpu_device *adev = drm_to_adev(ddev); |
|---|
| 203 | + |
|---|
| 204 | + return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial); |
|---|
| 205 | +} |
|---|
| 206 | + |
|---|
| 207 | +static DEVICE_ATTR(serial_number, S_IRUGO, |
|---|
| 208 | + amdgpu_device_get_serial_number, NULL); |
|---|
| 209 | + |
|---|
| 210 | +/** |
|---|
| 211 | + * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control |
|---|
| 99 | 212 | * |
|---|
| 100 | 213 | * @dev: drm_device pointer |
|---|
| 101 | 214 | * |
|---|
| 102 | 215 | * Returns true if the device is a dGPU with HG/PX power control, |
|---|
| 103 | 216 | * otherwise return false. |
|---|
| 104 | 217 | */ |
|---|
| 105 | | -bool amdgpu_device_is_px(struct drm_device *dev) |
|---|
| 218 | +bool amdgpu_device_supports_boco(struct drm_device *dev) |
|---|
| 106 | 219 | { |
|---|
| 107 | | - struct amdgpu_device *adev = dev->dev_private; |
|---|
| 220 | + struct amdgpu_device *adev = drm_to_adev(dev); |
|---|
| 108 | 221 | |
|---|
| 109 | 222 | if (adev->flags & AMD_IS_PX) |
|---|
| 110 | 223 | return true; |
|---|
| 111 | 224 | return false; |
|---|
| 112 | 225 | } |
|---|
| 113 | 226 | |
|---|
| 227 | +/** |
|---|
| 228 | + * amdgpu_device_supports_baco - Does the device support BACO |
|---|
| 229 | + * |
|---|
| 230 | + * @dev: drm_device pointer |
|---|
| 231 | + * |
|---|
| 232 | + * Returns true if the device supports BACO, |
|---|
| 233 | + * otherwise return false. |
|---|
| 234 | + */ |
|---|
| 235 | +bool amdgpu_device_supports_baco(struct drm_device *dev) |
|---|
| 236 | +{ |
|---|
| 237 | + struct amdgpu_device *adev = drm_to_adev(dev); |
|---|
| 238 | + |
|---|
| 239 | + return amdgpu_asic_supports_baco(adev); |
|---|
| 240 | +} |
|---|
| 241 | + |
|---|
| 114 | 242 | /* |
|---|
| 115 | | - * MMIO register access helper functions. |
|---|
| 243 | + * VRAM access helper functions |
|---|
| 244 | + */ |
|---|
| 245 | + |
|---|
| 246 | +/** |
|---|
| 247 | + * amdgpu_device_vram_access - read/write a buffer in vram |
|---|
| 248 | + * |
|---|
| 249 | + * @adev: amdgpu_device pointer |
|---|
| 250 | + * @pos: offset of the buffer in vram |
|---|
| 251 | + * @buf: virtual address of the buffer in system memory |
|---|
| 252 | + * @size: read/write size, sizeof(@buf) must be > @size |
|---|
| 253 | + * @write: true - write to vram, otherwise - read from vram |
|---|
| 254 | + */ |
|---|
| 255 | +void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos, |
|---|
| 256 | + uint32_t *buf, size_t size, bool write) |
|---|
| 257 | +{ |
|---|
| 258 | + unsigned long flags; |
|---|
| 259 | + uint32_t hi = ~0; |
|---|
| 260 | + uint64_t last; |
|---|
| 261 | + |
|---|
| 262 | + |
|---|
| 263 | +#ifdef CONFIG_64BIT |
|---|
| 264 | + last = min(pos + size, adev->gmc.visible_vram_size); |
|---|
| 265 | + if (last > pos) { |
|---|
| 266 | + void __iomem *addr = adev->mman.aper_base_kaddr + pos; |
|---|
| 267 | + size_t count = last - pos; |
|---|
| 268 | + |
|---|
| 269 | + if (write) { |
|---|
| 270 | + memcpy_toio(addr, buf, count); |
|---|
| 271 | + mb(); |
|---|
| 272 | + amdgpu_asic_flush_hdp(adev, NULL); |
|---|
| 273 | + } else { |
|---|
| 274 | + amdgpu_asic_invalidate_hdp(adev, NULL); |
|---|
| 275 | + mb(); |
|---|
| 276 | + memcpy_fromio(buf, addr, count); |
|---|
| 277 | + } |
|---|
| 278 | + |
|---|
| 279 | + if (count == size) |
|---|
| 280 | + return; |
|---|
| 281 | + |
|---|
| 282 | + pos += count; |
|---|
| 283 | + buf += count / 4; |
|---|
| 284 | + size -= count; |
|---|
| 285 | + } |
|---|
| 286 | +#endif |
|---|
| 287 | + |
|---|
| 288 | + spin_lock_irqsave(&adev->mmio_idx_lock, flags); |
|---|
| 289 | + for (last = pos + size; pos < last; pos += 4) { |
|---|
| 290 | + uint32_t tmp = pos >> 31; |
|---|
| 291 | + |
|---|
| 292 | + WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000); |
|---|
| 293 | + if (tmp != hi) { |
|---|
| 294 | + WREG32_NO_KIQ(mmMM_INDEX_HI, tmp); |
|---|
| 295 | + hi = tmp; |
|---|
| 296 | + } |
|---|
| 297 | + if (write) |
|---|
| 298 | + WREG32_NO_KIQ(mmMM_DATA, *buf++); |
|---|
| 299 | + else |
|---|
| 300 | + *buf++ = RREG32_NO_KIQ(mmMM_DATA); |
|---|
| 301 | + } |
|---|
| 302 | + spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); |
|---|
| 303 | +} |
|---|
| 304 | + |
|---|
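
A side note on the fallback path above: when the offset lies beyond the visible aperture (or on 32-bit builds), the access goes through the MM_INDEX/MM_INDEX_HI window. Here is a standalone model of the address split, assuming only what the code shows: bit 31 of MM_INDEX selects the VRAM aperture, so pos[30:0] lands there and pos[63:31] goes to MM_INDEX_HI.

```c
/* Illustrative, standalone model of the MM_INDEX split used above --
 * not driver code. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t pos = 0x100000004ULL;  /* example VRAM offset past 4 GB */
	uint32_t mm_index    = (uint32_t)pos | 0x80000000; /* pos[30:0] + aperture bit */
	uint32_t mm_index_hi = pos >> 31;                  /* pos[63:31] */

	/* prints MM_INDEX=0x80000004 MM_INDEX_HI=0x00000002 */
	printf("MM_INDEX=0x%08x MM_INDEX_HI=0x%08x\n", mm_index, mm_index_hi);
	return 0;
}
```
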
| 305 | +/* |
|---|
| 306 | + * register access helper functions. |
|---|
| 116 | 307 | */ |
|---|
| 117 | 308 | /** |
|---|
| 118 | | - * amdgpu_mm_rreg - read a memory mapped IO register |
|---|
| 309 | + * amdgpu_device_rreg - read a memory mapped IO or indirect register |
|---|
| 119 | 310 | * |
|---|
| 120 | 311 | * @adev: amdgpu_device pointer |
|---|
| 121 | 312 | * @reg: dword aligned register offset |
|---|
| .. | .. |
|---|
| 123 | 314 | * |
|---|
| 124 | 315 | * Returns the 32 bit value from the offset specified. |
|---|
| 125 | 316 | */ |
|---|
| 126 | | -uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg, |
|---|
| 127 | | - uint32_t acc_flags) |
|---|
| 317 | +uint32_t amdgpu_device_rreg(struct amdgpu_device *adev, |
|---|
| 318 | + uint32_t reg, uint32_t acc_flags) |
|---|
| 128 | 319 | { |
|---|
| 129 | 320 | uint32_t ret; |
|---|
| 130 | 321 | |
|---|
| 131 | | - if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev)) |
|---|
| 132 | | - return amdgpu_virt_kiq_rreg(adev, reg); |
|---|
| 322 | + if (adev->in_pci_err_recovery) |
|---|
| 323 | + return 0; |
|---|
| 133 | 324 | |
|---|
| 134 | | - if ((reg * 4) < adev->rmmio_size && !(acc_flags & AMDGPU_REGS_IDX)) |
|---|
| 135 | | - ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); |
|---|
| 136 | | - else { |
|---|
| 137 | | - unsigned long flags; |
|---|
| 138 | | - |
|---|
| 139 | | - spin_lock_irqsave(&adev->mmio_idx_lock, flags); |
|---|
| 140 | | - writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4)); |
|---|
| 141 | | - ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4)); |
|---|
| 142 | | - spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); |
|---|
| 325 | + if ((reg * 4) < adev->rmmio_size) { |
|---|
| 326 | + if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && |
|---|
| 327 | + amdgpu_sriov_runtime(adev) && |
|---|
| 328 | + down_read_trylock(&adev->reset_sem)) { |
|---|
| 329 | + ret = amdgpu_kiq_rreg(adev, reg); |
|---|
| 330 | + up_read(&adev->reset_sem); |
|---|
| 331 | + } else { |
|---|
| 332 | + ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); |
|---|
| 333 | + } |
|---|
| 334 | + } else { |
|---|
| 335 | + ret = adev->pcie_rreg(adev, reg * 4); |
|---|
| 143 | 336 | } |
|---|
| 144 | | - trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret); |
|---|
| 337 | + |
|---|
| 338 | + trace_amdgpu_device_rreg(adev->pdev->device, reg, ret); |
|---|
| 339 | + |
|---|
| 145 | 340 | return ret; |
|---|
| 146 | 341 | } |
|---|
| 147 | 342 | |
|---|
| .. | .. |
|---|
| 159 | 354 | * |
|---|
| 160 | 355 | * Returns the 8 bit value from the offset specified. |
|---|
| 161 | 356 | */ |
|---|
| 162 | | -uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) { |
|---|
| 357 | +uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) |
|---|
| 358 | +{ |
|---|
| 359 | + if (adev->in_pci_err_recovery) |
|---|
| 360 | + return 0; |
|---|
| 361 | + |
|---|
| 163 | 362 | if (offset < adev->rmmio_size) |
|---|
| 164 | 363 | return (readb(adev->rmmio + offset)); |
|---|
| 165 | 364 | BUG(); |
|---|
| .. | .. |
|---|
| 180 | 379 | * |
|---|
| 181 | 380 | * Writes the value specified to the offset specified. |
|---|
| 182 | 381 | */ |
|---|
| 183 | | -void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) { |
|---|
| 382 | +void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) |
|---|
| 383 | +{ |
|---|
| 384 | + if (adev->in_pci_err_recovery) |
|---|
| 385 | + return; |
|---|
| 386 | + |
|---|
| 184 | 387 | if (offset < adev->rmmio_size) |
|---|
| 185 | 388 | writeb(value, adev->rmmio + offset); |
|---|
| 186 | 389 | else |
|---|
| .. | .. |
|---|
| 188 | 391 | } |
|---|
| 189 | 392 | |
|---|
| 190 | 393 | /** |
|---|
| 191 | | - * amdgpu_mm_wreg - write to a memory mapped IO register |
|---|
| 394 | + * amdgpu_device_wreg - write to a memory mapped IO or indirect register |
|---|
| 192 | 395 | * |
|---|
| 193 | 396 | * @adev: amdgpu_device pointer |
|---|
| 194 | 397 | * @reg: dword aligned register offset |
|---|
| .. | .. |
|---|
| 197 | 400 | * |
|---|
| 198 | 401 | * Writes the value specified to the offset specified. |
|---|
| 199 | 402 | */ |
|---|
| 200 | | -void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v, |
|---|
| 201 | | - uint32_t acc_flags) |
|---|
| 403 | +void amdgpu_device_wreg(struct amdgpu_device *adev, |
|---|
| 404 | + uint32_t reg, uint32_t v, |
|---|
| 405 | + uint32_t acc_flags) |
|---|
| 202 | 406 | { |
|---|
| 203 | | - trace_amdgpu_mm_wreg(adev->pdev->device, reg, v); |
|---|
| 407 | + if (adev->in_pci_err_recovery) |
|---|
| 408 | + return; |
|---|
| 204 | 409 | |
|---|
| 205 | | - if (adev->asic_type >= CHIP_VEGA10 && reg == 0) { |
|---|
| 206 | | - adev->last_mm_index = v; |
|---|
| 410 | + if ((reg * 4) < adev->rmmio_size) { |
|---|
| 411 | + if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && |
|---|
| 412 | + amdgpu_sriov_runtime(adev) && |
|---|
| 413 | + down_read_trylock(&adev->reset_sem)) { |
|---|
| 414 | + amdgpu_kiq_wreg(adev, reg, v); |
|---|
| 415 | + up_read(&adev->reset_sem); |
|---|
| 416 | + } else { |
|---|
| 417 | + writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); |
|---|
| 418 | + } |
|---|
| 419 | + } else { |
|---|
| 420 | + adev->pcie_wreg(adev, reg * 4, v); |
|---|
| 207 | 421 | } |
|---|
| 208 | 422 | |
|---|
| 209 | | - if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev)) |
|---|
| 210 | | - return amdgpu_virt_kiq_wreg(adev, reg, v); |
|---|
| 423 | + trace_amdgpu_device_wreg(adev->pdev->device, reg, v); |
|---|
| 424 | +} |
|---|
| 211 | 425 | |
|---|
| 212 | | - if ((reg * 4) < adev->rmmio_size && !(acc_flags & AMDGPU_REGS_IDX)) |
|---|
| 426 | +/* |
|---|
| 427 | + * amdgpu_mm_wreg_mmio_rlc - write a register through MMIO or via the RLC path if in range |
|---|
| 428 | + * |
|---|
| 429 | + * this function is invoked only for debugfs register access |
|---|
| 430 | + */ |
|---|
| 431 | +void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, |
|---|
| 432 | + uint32_t reg, uint32_t v) |
|---|
| 433 | +{ |
|---|
| 434 | + if (adev->in_pci_err_recovery) |
|---|
| 435 | + return; |
|---|
| 436 | + |
|---|
| 437 | + if (amdgpu_sriov_fullaccess(adev) && |
|---|
| 438 | + adev->gfx.rlc.funcs && |
|---|
| 439 | + adev->gfx.rlc.funcs->is_rlcg_access_range) { |
|---|
| 440 | + if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) |
|---|
| 441 | + return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v); |
|---|
| 442 | + } else { |
|---|
| 213 | 443 | writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); |
|---|
| 214 | | - else { |
|---|
| 215 | | - unsigned long flags; |
|---|
| 216 | | - |
|---|
| 217 | | - spin_lock_irqsave(&adev->mmio_idx_lock, flags); |
|---|
| 218 | | - writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4)); |
|---|
| 219 | | - writel(v, ((void __iomem *)adev->rmmio) + (mmMM_DATA * 4)); |
|---|
| 220 | | - spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); |
|---|
| 221 | | - } |
|---|
| 222 | | - |
|---|
| 223 | | - if (adev->asic_type >= CHIP_VEGA10 && reg == 1 && adev->last_mm_index == 0x5702C) { |
|---|
| 224 | | - udelay(500); |
|---|
| 225 | 444 | } |
|---|
| 226 | 445 | } |
|---|
| 227 | 446 | |
|---|
| .. | .. |
|---|
| 235 | 454 | */ |
|---|
| 236 | 455 | u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg) |
|---|
| 237 | 456 | { |
|---|
| 457 | + if (adev->in_pci_err_recovery) |
|---|
| 458 | + return 0; |
|---|
| 459 | + |
|---|
| 238 | 460 | if ((reg * 4) < adev->rio_mem_size) |
|---|
| 239 | 461 | return ioread32(adev->rio_mem + (reg * 4)); |
|---|
| 240 | 462 | else { |
|---|
| .. | .. |
|---|
| 254 | 476 | */ |
|---|
| 255 | 477 | void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v) |
|---|
| 256 | 478 | { |
|---|
| 257 | | - if (adev->asic_type >= CHIP_VEGA10 && reg == 0) { |
|---|
| 258 | | - adev->last_mm_index = v; |
|---|
| 259 | | - } |
|---|
| 479 | + if (adev->in_pci_err_recovery) |
|---|
| 480 | + return; |
|---|
| 260 | 481 | |
|---|
| 261 | 482 | if ((reg * 4) < adev->rio_mem_size) |
|---|
| 262 | 483 | iowrite32(v, adev->rio_mem + (reg * 4)); |
|---|
| 263 | 484 | else { |
|---|
| 264 | 485 | iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4)); |
|---|
| 265 | 486 | iowrite32(v, adev->rio_mem + (mmMM_DATA * 4)); |
|---|
| 266 | | - } |
|---|
| 267 | | - |
|---|
| 268 | | - if (adev->asic_type >= CHIP_VEGA10 && reg == 1 && adev->last_mm_index == 0x5702C) { |
|---|
| 269 | | - udelay(500); |
|---|
| 270 | 487 | } |
|---|
| 271 | 488 | } |
|---|
| 272 | 489 | |
|---|
| .. | .. |
|---|
| 281 | 498 | */ |
|---|
| 282 | 499 | u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) |
|---|
| 283 | 500 | { |
|---|
| 501 | + if (adev->in_pci_err_recovery) |
|---|
| 502 | + return 0; |
|---|
| 503 | + |
|---|
| 284 | 504 | if (index < adev->doorbell.num_doorbells) { |
|---|
| 285 | 505 | return readl(adev->doorbell.ptr + index); |
|---|
| 286 | 506 | } else { |
|---|
| .. | .. |
|---|
| 301 | 521 | */ |
|---|
| 302 | 522 | void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) |
|---|
| 303 | 523 | { |
|---|
| 524 | + if (adev->in_pci_err_recovery) |
|---|
| 525 | + return; |
|---|
| 526 | + |
|---|
| 304 | 527 | if (index < adev->doorbell.num_doorbells) { |
|---|
| 305 | 528 | writel(v, adev->doorbell.ptr + index); |
|---|
| 306 | 529 | } else { |
|---|
| .. | .. |
|---|
| 319 | 542 | */ |
|---|
| 320 | 543 | u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) |
|---|
| 321 | 544 | { |
|---|
| 545 | + if (adev->in_pci_err_recovery) |
|---|
| 546 | + return 0; |
|---|
| 547 | + |
|---|
| 322 | 548 | if (index < adev->doorbell.num_doorbells) { |
|---|
| 323 | 549 | return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index)); |
|---|
| 324 | 550 | } else { |
|---|
| .. | .. |
|---|
| 339 | 565 | */ |
|---|
| 340 | 566 | void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v) |
|---|
| 341 | 567 | { |
|---|
| 568 | + if (adev->in_pci_err_recovery) |
|---|
| 569 | + return; |
|---|
| 570 | + |
|---|
| 342 | 571 | if (index < adev->doorbell.num_doorbells) { |
|---|
| 343 | 572 | atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v); |
|---|
| 344 | 573 | } else { |
|---|
| .. | .. |
|---|
| 347 | 576 | } |
|---|
| 348 | 577 | |
|---|
| 349 | 578 | /** |
|---|
| 579 | + * amdgpu_device_indirect_rreg - read an indirect register |
|---|
| 580 | + * |
|---|
| 581 | + * @adev: amdgpu_device pointer |
|---|
| 582 | + * @pcie_index: mmio offset of the index register |
|---|
| 583 | + * @pcie_data: mmio offset of the data register |
|---|
| 584 | + * |
|---|
| 585 | + * Returns the value of indirect register @reg_addr |
|---|
| 586 | + */ |
|---|
| 587 | +u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, |
|---|
| 588 | + u32 pcie_index, u32 pcie_data, |
|---|
| 589 | + u32 reg_addr) |
|---|
| 590 | +{ |
|---|
| 591 | + unsigned long flags; |
|---|
| 592 | + u32 r; |
|---|
| 593 | + void __iomem *pcie_index_offset; |
|---|
| 594 | + void __iomem *pcie_data_offset; |
|---|
| 595 | + |
|---|
| 596 | + spin_lock_irqsave(&adev->pcie_idx_lock, flags); |
|---|
| 597 | + pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; |
|---|
| 598 | + pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; |
|---|
| 599 | + |
|---|
| 600 | + writel(reg_addr, pcie_index_offset); |
|---|
| 601 | + readl(pcie_index_offset); |
|---|
| 602 | + r = readl(pcie_data_offset); |
|---|
| 603 | + spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); |
|---|
| 604 | + |
|---|
| 605 | + return r; |
|---|
| 606 | +} |
|---|
| 607 | + |
|---|
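
The index/data pattern used by these helpers is worth spelling out: writing the target offset to the index register makes the data register a window onto that offset (the extra readl after each writel posts the write to the bus). Below is a toy userspace model of the protocol, with plain variables standing in for the real MMIO slots.

```c
/* Toy model of index/data register indirection -- not driver code. */
#include <stdint.h>
#include <stdio.h>

static uint32_t regs[256];   /* pretend indirect register file */
static uint32_t index_reg;   /* stand-in for the PCIE_INDEX slot */

static void wr_index(uint32_t addr) { index_reg = addr; }
static uint32_t rd_data(void) { return regs[index_reg / 4]; }

int main(void)
{
	regs[0x10 / 4] = 0xDEADBEEF;   /* seed a register at offset 0x10 */

	wr_index(0x10);                /* select the register... */
	printf("0x%08x\n", rd_data()); /* ...read it through DATA: 0xdeadbeef */
	return 0;
}
```
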
| 608 | +/** |
|---|
| 609 | + * amdgpu_device_indirect_rreg64 - read a 64 bit indirect register |
|---|
| 610 | + * |
|---|
| 611 | + * @adev: amdgpu_device pointer |
|---|
| 612 | + * @pcie_index: mmio offset of the index register |
|---|
| 613 | + * @pcie_data: mmio offset of the data register |
|---|
| 614 | + * |
|---|
| 615 | + * Returns the value of indirect register @reg_addr |
|---|
| 616 | + */ |
|---|
| 617 | +u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, |
|---|
| 618 | + u32 pcie_index, u32 pcie_data, |
|---|
| 619 | + u32 reg_addr) |
|---|
| 620 | +{ |
|---|
| 621 | + unsigned long flags; |
|---|
| 622 | + u64 r; |
|---|
| 623 | + void __iomem *pcie_index_offset; |
|---|
| 624 | + void __iomem *pcie_data_offset; |
|---|
| 625 | + |
|---|
| 626 | + spin_lock_irqsave(&adev->pcie_idx_lock, flags); |
|---|
| 627 | + pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; |
|---|
| 628 | + pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; |
|---|
| 629 | + |
|---|
| 630 | + /* read low 32 bits */ |
|---|
| 631 | + writel(reg_addr, pcie_index_offset); |
|---|
| 632 | + readl(pcie_index_offset); |
|---|
| 633 | + r = readl(pcie_data_offset); |
|---|
| 634 | + /* read high 32 bits */ |
|---|
| 635 | + writel(reg_addr + 4, pcie_index_offset); |
|---|
| 636 | + readl(pcie_index_offset); |
|---|
| 637 | + r |= ((u64)readl(pcie_data_offset) << 32); |
|---|
| 638 | + spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); |
|---|
| 639 | + |
|---|
| 640 | + return r; |
|---|
| 641 | +} |
|---|
| 642 | + |
|---|
| 643 | +/** |
|---|
| 644 | + * amdgpu_device_indirect_wreg - write an indirect register |
|---|
| 645 | + * |
|---|
| 646 | + * @adev: amdgpu_device pointer |
|---|
| 647 | + * @pcie_index: mmio offset of the index register |
|---|
| 648 | + * @pcie_data: mmio offset of the data register |
|---|
| 649 | + * @reg_addr: indirect register offset |
|---|
| 650 | + * @reg_data: indirect register data |
|---|
| 651 | + * |
|---|
| 652 | + */ |
|---|
| 653 | +void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, |
|---|
| 654 | + u32 pcie_index, u32 pcie_data, |
|---|
| 655 | + u32 reg_addr, u32 reg_data) |
|---|
| 656 | +{ |
|---|
| 657 | + unsigned long flags; |
|---|
| 658 | + void __iomem *pcie_index_offset; |
|---|
| 659 | + void __iomem *pcie_data_offset; |
|---|
| 660 | + |
|---|
| 661 | + spin_lock_irqsave(&adev->pcie_idx_lock, flags); |
|---|
| 662 | + pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; |
|---|
| 663 | + pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; |
|---|
| 664 | + |
|---|
| 665 | + writel(reg_addr, pcie_index_offset); |
|---|
| 666 | + readl(pcie_index_offset); |
|---|
| 667 | + writel(reg_data, pcie_data_offset); |
|---|
| 668 | + readl(pcie_data_offset); |
|---|
| 669 | + spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); |
|---|
| 670 | +} |
|---|
| 671 | + |
|---|
| 672 | +/** |
|---|
| 673 | + * amdgpu_device_indirect_wreg64 - write a 64 bit indirect register |
|---|
| 674 | + * |
|---|
| 675 | + * @adev: amdgpu_device pointer |
|---|
| 676 | + * @pcie_index: mmio offset of the index register |
|---|
| 677 | + * @pcie_data: mmio offset of the data register |
|---|
| 678 | + * @reg_addr: indirect register offset |
|---|
| 679 | + * @reg_data: indirect register data |
|---|
| 680 | + * |
|---|
| 681 | + */ |
|---|
| 682 | +void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, |
|---|
| 683 | + u32 pcie_index, u32 pcie_data, |
|---|
| 684 | + u32 reg_addr, u64 reg_data) |
|---|
| 685 | +{ |
|---|
| 686 | + unsigned long flags; |
|---|
| 687 | + void __iomem *pcie_index_offset; |
|---|
| 688 | + void __iomem *pcie_data_offset; |
|---|
| 689 | + |
|---|
| 690 | + spin_lock_irqsave(&adev->pcie_idx_lock, flags); |
|---|
| 691 | + pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; |
|---|
| 692 | + pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; |
|---|
| 693 | + |
|---|
| 694 | + /* write low 32 bits */ |
|---|
| 695 | + writel(reg_addr, pcie_index_offset); |
|---|
| 696 | + readl(pcie_index_offset); |
|---|
| 697 | + writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); |
|---|
| 698 | + readl(pcie_data_offset); |
|---|
| 699 | + /* write high 32 bits */ |
|---|
| 700 | + writel(reg_addr + 4, pcie_index_offset); |
|---|
| 701 | + readl(pcie_index_offset); |
|---|
| 702 | + writel((u32)(reg_data >> 32), pcie_data_offset); |
|---|
| 703 | + readl(pcie_data_offset); |
|---|
| 704 | + spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); |
|---|
| 705 | +} |
|---|
| 706 | + |
|---|
| 707 | +/** |
|---|
| 350 | 708 | * amdgpu_invalid_rreg - dummy reg read function |
|---|
| 351 | 709 | * |
|---|
| 352 | | - * @adev: amdgpu device pointer |
|---|
| 710 | + * @adev: amdgpu_device pointer |
|---|
| 353 | 711 | * @reg: offset of register |
|---|
| 354 | 712 | * |
|---|
| 355 | 713 | * Dummy register read function. Used for register blocks |
|---|
| .. | .. |
|---|
| 366 | 724 | /** |
|---|
| 367 | 725 | * amdgpu_invalid_wreg - dummy reg write function |
|---|
| 368 | 726 | * |
|---|
| 369 | | - * @adev: amdgpu device pointer |
|---|
| 727 | + * @adev: amdgpu_device pointer |
|---|
| 370 | 728 | * @reg: offset of register |
|---|
| 371 | 729 | * @v: value to write to the register |
|---|
| 372 | 730 | * |
|---|
| .. | .. |
|---|
| 381 | 739 | } |
|---|
| 382 | 740 | |
|---|
| 383 | 741 | /** |
|---|
| 742 | + * amdgpu_invalid_rreg64 - dummy 64 bit reg read function |
|---|
| 743 | + * |
|---|
| 744 | + * @adev: amdgpu_device pointer |
|---|
| 745 | + * @reg: offset of register |
|---|
| 746 | + * |
|---|
| 747 | + * Dummy register read function. Used for register blocks |
|---|
| 748 | + * that certain asics don't have (all asics). |
|---|
| 749 | + * Returns the value in the register. |
|---|
| 750 | + */ |
|---|
| 751 | +static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) |
|---|
| 752 | +{ |
|---|
| 753 | + DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); |
|---|
| 754 | + BUG(); |
|---|
| 755 | + return 0; |
|---|
| 756 | +} |
|---|
| 757 | + |
|---|
| 758 | +/** |
|---|
| 759 | + * amdgpu_invalid_wreg64 - dummy reg write function |
|---|
| 760 | + * |
|---|
| 761 | + * @adev: amdgpu_device pointer |
|---|
| 762 | + * @reg: offset of register |
|---|
| 763 | + * @v: value to write to the register |
|---|
| 764 | + * |
|---|
| 765 | + * Dummy register write function. Used for register blocks |
|---|
| 766 | + * that certain asics don't have (all asics). |
|---|
| 767 | + */ |
|---|
| 768 | +static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) |
|---|
| 769 | +{ |
|---|
| 770 | + DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", |
|---|
| 771 | + reg, v); |
|---|
| 772 | + BUG(); |
|---|
| 773 | +} |
|---|
| 774 | + |
|---|
| 775 | +/** |
|---|
| 384 | 776 | * amdgpu_block_invalid_rreg - dummy reg read function |
|---|
| 385 | 777 | * |
|---|
| 386 | | - * @adev: amdgpu device pointer |
|---|
| 778 | + * @adev: amdgpu_device pointer |
|---|
| 387 | 779 | * @block: offset of instance |
|---|
| 388 | 780 | * @reg: offset of register |
|---|
| 389 | 781 | * |
|---|
| .. | .. |
|---|
| 403 | 795 | /** |
|---|
| 404 | 796 | * amdgpu_block_invalid_wreg - dummy reg write function |
|---|
| 405 | 797 | * |
|---|
| 406 | | - * @adev: amdgpu device pointer |
|---|
| 798 | + * @adev: amdgpu_device pointer |
|---|
| 407 | 799 | * @block: offset of instance |
|---|
| 408 | 800 | * @reg: offset of register |
|---|
| 409 | 801 | * @v: value to write to the register |
|---|
| .. | .. |
|---|
| 421 | 813 | } |
|---|
| 422 | 814 | |
|---|
| 423 | 815 | /** |
|---|
| 816 | + * amdgpu_device_asic_init - Wrapper for atom asic_init |
|---|
| 817 | + * |
|---|
| 818 | + * @adev: amdgpu_device pointer |
|---|
| 819 | + * |
|---|
| 820 | + * Does any asic specific work and then calls atom asic init. |
|---|
| 821 | + */ |
|---|
| 822 | +static int amdgpu_device_asic_init(struct amdgpu_device *adev) |
|---|
| 823 | +{ |
|---|
| 824 | + amdgpu_asic_pre_asic_init(adev); |
|---|
| 825 | + |
|---|
| 826 | + return amdgpu_atom_asic_init(adev->mode_info.atom_context); |
|---|
| 827 | +} |
|---|
| 828 | + |
|---|
| 829 | +/** |
|---|
| 424 | 830 | * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page |
|---|
| 425 | 831 | * |
|---|
| 426 | | - * @adev: amdgpu device pointer |
|---|
| 832 | + * @adev: amdgpu_device pointer |
|---|
| 427 | 833 | * |
|---|
| 428 | 834 | * Allocates a scratch page of VRAM for use by various things in the |
|---|
| 429 | 835 | * driver. |
|---|
| .. | .. |
|---|
| 440 | 846 | /** |
|---|
| 441 | 847 | * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page |
|---|
| 442 | 848 | * |
|---|
| 443 | | - * @adev: amdgpu device pointer |
|---|
| 849 | + * @adev: amdgpu_device pointer |
|---|
| 444 | 850 | * |
|---|
| 445 | 851 | * Frees the VRAM scratch page. |
|---|
| 446 | 852 | */ |
|---|
| .. | .. |
|---|
| 479 | 885 | } else { |
|---|
| 480 | 886 | tmp = RREG32(reg); |
|---|
| 481 | 887 | tmp &= ~and_mask; |
|---|
| 482 | | - tmp |= or_mask; |
|---|
| 888 | + if (adev->family >= AMDGPU_FAMILY_AI) |
|---|
| 889 | + tmp |= (or_mask & and_mask); |
|---|
| 890 | + else |
|---|
| 891 | + tmp |= or_mask; |
|---|
| 483 | 892 | } |
|---|
| 484 | 893 | WREG32(reg, tmp); |
|---|
| 485 | 894 | } |
|---|
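
The masked OR above is the behavioral change for AMDGPU_FAMILY_AI and newer: or_mask bits outside and_mask can no longer leak into the register. A small standalone demonstration with made-up values:

```c
/* Demonstrates the AI+ masked-OR semantics from the hunk above. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t tmp = 0xAAAA5555, and_mask = 0x0000FF00, or_mask = 0x00011200;
	uint32_t cleared = tmp & ~and_mask;

	uint32_t legacy = cleared | or_mask;              /* pre-AI: 0xAAAB1255 */
	uint32_t ai     = cleared | (or_mask & and_mask); /* AI+:    0xAAAA1255 */

	/* bit 16 of or_mask lies outside and_mask; only the legacy
	 * path lets it through */
	printf("legacy=0x%08x ai=0x%08x\n", legacy, ai);
	return 0;
}
```
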
| .. | .. |
|---|
| 511 | 920 | */ |
|---|
| 512 | 921 | static int amdgpu_device_doorbell_init(struct amdgpu_device *adev) |
|---|
| 513 | 922 | { |
|---|
| 923 | + |
|---|
| 514 | 924 | /* No doorbell on SI hardware generation */ |
|---|
| 515 | 925 | if (adev->asic_type < CHIP_BONAIRE) { |
|---|
| 516 | 926 | adev->doorbell.base = 0; |
|---|
| .. | .. |
|---|
| 523 | 933 | if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET) |
|---|
| 524 | 934 | return -EINVAL; |
|---|
| 525 | 935 | |
|---|
| 936 | + amdgpu_asic_init_doorbell_index(adev); |
|---|
| 937 | + |
|---|
| 526 | 938 | /* doorbell bar mapping */ |
|---|
| 527 | 939 | adev->doorbell.base = pci_resource_start(adev->pdev, 2); |
|---|
| 528 | 940 | adev->doorbell.size = pci_resource_len(adev->pdev, 2); |
|---|
| 529 | 941 | |
|---|
| 530 | 942 | adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32), |
|---|
| 531 | | - AMDGPU_DOORBELL_MAX_ASSIGNMENT+1); |
|---|
| 943 | + adev->doorbell_index.max_assignment+1); |
|---|
| 532 | 944 | if (adev->doorbell.num_doorbells == 0) |
|---|
| 533 | 945 | return -EINVAL; |
|---|
| 946 | + |
|---|
| 947 | + /* For Vega, reserve and map two pages on doorbell BAR since SDMA |
|---|
| 948 | +  * paging queue doorbells use the second page. The |
|---|
| 949 | + * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the |
|---|
| 950 | + * doorbells are in the first page. So with paging queue enabled, |
|---|
| 951 | +  * the max num_doorbells should be incremented by 1 page (0x400 dwords) |
|---|
| 952 | + */ |
|---|
| 953 | + if (adev->asic_type >= CHIP_VEGA10) |
|---|
| 954 | + adev->doorbell.num_doorbells += 0x400; |
|---|
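
Where 0x400 comes from: doorbells are dword-sized, so one extra 4 KB page is exactly 0x400 slots. A compile-time sanity check of that arithmetic, for illustration only:

```c
#include <assert.h>
#include <stdint.h>

/* One 4 KB doorbell page holds 4096 / sizeof(u32) = 0x400 dword slots,
 * matching the reservation made above for the SDMA paging queue. */
static_assert(4096 / sizeof(uint32_t) == 0x400,
	      "one page == 0x400 doorbells");
```
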
| 534 | 955 | |
|---|
| 535 | 956 | adev->doorbell.ptr = ioremap(adev->doorbell.base, |
|---|
| 536 | 957 | adev->doorbell.num_doorbells * |
|---|
| .. | .. |
|---|
| 652 | 1073 | } |
|---|
| 653 | 1074 | |
|---|
| 654 | 1075 | /** |
|---|
| 655 | | - * amdgpu_device_vram_location - try to find VRAM location |
|---|
| 656 | | - * |
|---|
| 657 | | - * @adev: amdgpu device structure holding all necessary informations |
|---|
| 658 | | - * @mc: memory controller structure holding memory informations |
|---|
| 659 | | - * @base: base address at which to put VRAM |
|---|
| 660 | | - * |
|---|
| 661 | | - * Function will try to place VRAM at base address provided |
|---|
| 662 | | - * as parameter. |
|---|
| 663 | | - */ |
|---|
| 664 | | -void amdgpu_device_vram_location(struct amdgpu_device *adev, |
|---|
| 665 | | - struct amdgpu_gmc *mc, u64 base) |
|---|
| 666 | | -{ |
|---|
| 667 | | - uint64_t limit = (uint64_t)amdgpu_vram_limit << 20; |
|---|
| 668 | | - |
|---|
| 669 | | - mc->vram_start = base; |
|---|
| 670 | | - mc->vram_end = mc->vram_start + mc->mc_vram_size - 1; |
|---|
| 671 | | - if (limit && limit < mc->real_vram_size) |
|---|
| 672 | | - mc->real_vram_size = limit; |
|---|
| 673 | | - dev_info(adev->dev, "VRAM: %lluM 0x%016llX - 0x%016llX (%lluM used)\n", |
|---|
| 674 | | - mc->mc_vram_size >> 20, mc->vram_start, |
|---|
| 675 | | - mc->vram_end, mc->real_vram_size >> 20); |
|---|
| 676 | | -} |
|---|
| 677 | | - |
|---|
| 678 | | -/** |
|---|
| 679 | | - * amdgpu_device_gart_location - try to find GART location |
|---|
| 680 | | - * |
|---|
| 681 | | - * @adev: amdgpu device structure holding all necessary informations |
|---|
| 682 | | - * @mc: memory controller structure holding memory informations |
|---|
| 683 | | - * |
|---|
| 684 | | - * Function will place try to place GART before or after VRAM. |
|---|
| 685 | | - * |
|---|
| 686 | | - * If GART size is bigger than space left then we ajust GART size. |
|---|
| 687 | | - * Thus function will never fails. |
|---|
| 688 | | - */ |
|---|
| 689 | | -void amdgpu_device_gart_location(struct amdgpu_device *adev, |
|---|
| 690 | | - struct amdgpu_gmc *mc) |
|---|
| 691 | | -{ |
|---|
| 692 | | - u64 size_af, size_bf; |
|---|
| 693 | | - |
|---|
| 694 | | - mc->gart_size += adev->pm.smu_prv_buffer_size; |
|---|
| 695 | | - |
|---|
| 696 | | - size_af = adev->gmc.mc_mask - mc->vram_end; |
|---|
| 697 | | - size_bf = mc->vram_start; |
|---|
| 698 | | - if (size_bf > size_af) { |
|---|
| 699 | | - if (mc->gart_size > size_bf) { |
|---|
| 700 | | - dev_warn(adev->dev, "limiting GART\n"); |
|---|
| 701 | | - mc->gart_size = size_bf; |
|---|
| 702 | | - } |
|---|
| 703 | | - mc->gart_start = 0; |
|---|
| 704 | | - } else { |
|---|
| 705 | | - if (mc->gart_size > size_af) { |
|---|
| 706 | | - dev_warn(adev->dev, "limiting GART\n"); |
|---|
| 707 | | - mc->gart_size = size_af; |
|---|
| 708 | | - } |
|---|
| 709 | | - /* VCE doesn't like it when BOs cross a 4GB segment, so align |
|---|
| 710 | | - * the GART base on a 4GB boundary as well. |
|---|
| 711 | | - */ |
|---|
| 712 | | - mc->gart_start = ALIGN(mc->vram_end + 1, 0x100000000ULL); |
|---|
| 713 | | - } |
|---|
| 714 | | - mc->gart_end = mc->gart_start + mc->gart_size - 1; |
|---|
| 715 | | - dev_info(adev->dev, "GART: %lluM 0x%016llX - 0x%016llX\n", |
|---|
| 716 | | - mc->gart_size >> 20, mc->gart_start, mc->gart_end); |
|---|
| 717 | | -} |
|---|
| 718 | | - |
|---|
| 719 | | -/** |
|---|
| 720 | 1076 | * amdgpu_device_resize_fb_bar - try to resize FB BAR |
|---|
| 721 | 1077 | * |
|---|
| 722 | 1078 | * @adev: amdgpu_device pointer |
|---|
| .. | .. |
|---|
| 735 | 1091 | u16 cmd; |
|---|
| 736 | 1092 | int r; |
|---|
| 737 | 1093 | |
|---|
| 1094 | + if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT)) |
|---|
| 1095 | + return 0; |
|---|
| 1096 | + |
|---|
| 738 | 1097 | /* Bypass for VF */ |
|---|
| 739 | 1098 | if (amdgpu_sriov_vf(adev)) |
|---|
| 1099 | + return 0; |
|---|
| 1100 | + |
|---|
| 1101 | + /* skip if the bios has already enabled large BAR */ |
|---|
| 1102 | + if (adev->gmc.real_vram_size && |
|---|
| 1103 | + (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) |
|---|
| 740 | 1104 | return 0; |
|---|
| 741 | 1105 | |
|---|
| 742 | 1106 | /* Check if the root BUS has 64bit memory resources */ |
|---|
| .. | .. |
|---|
| 913 | 1277 | static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) |
|---|
| 914 | 1278 | { |
|---|
| 915 | 1279 | struct sysinfo si; |
|---|
| 916 | | - bool is_os_64 = (sizeof(void *) == 8) ? true : false; |
|---|
| 1280 | + bool is_os_64 = (sizeof(void *) == 8); |
|---|
| 917 | 1281 | uint64_t total_memory; |
|---|
| 918 | 1282 | uint64_t dram_size_seven_GB = 0x1B8000000; |
|---|
| 919 | 1283 | uint64_t dram_size_three_GB = 0xB8000000; |
|---|
| .. | .. |
|---|
| 958 | 1322 | * Validates certain module parameters and updates |
|---|
| 959 | 1323 | * the associated values used by the driver (all asics). |
|---|
| 960 | 1324 | */ |
|---|
| 961 | | -static void amdgpu_device_check_arguments(struct amdgpu_device *adev) |
|---|
| 1325 | +static int amdgpu_device_check_arguments(struct amdgpu_device *adev) |
|---|
| 962 | 1326 | { |
|---|
| 963 | 1327 | if (amdgpu_sched_jobs < 4) { |
|---|
| 964 | 1328 | dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", |
|---|
| .. | .. |
|---|
| 991 | 1355 | amdgpu_vm_fragment_size = -1; |
|---|
| 992 | 1356 | } |
|---|
| 993 | 1357 | |
|---|
| 1358 | + if (amdgpu_sched_hw_submission < 2) { |
|---|
| 1359 | + dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", |
|---|
| 1360 | + amdgpu_sched_hw_submission); |
|---|
| 1361 | + amdgpu_sched_hw_submission = 2; |
|---|
| 1362 | + } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { |
|---|
| 1363 | + dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", |
|---|
| 1364 | + amdgpu_sched_hw_submission); |
|---|
| 1365 | + amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); |
|---|
| 1366 | + } |
|---|
| 1367 | + |
|---|
| 994 | 1368 | amdgpu_device_check_smu_prv_buffer_size(adev); |
|---|
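
The clamp above enforces a floor of 2 and rounds other values up to the next power of two. A quick standalone model of the same policy (roundup_pow_of_two is reimplemented here only for the demo):

```c
/* Models the amdgpu_sched_hw_submission validation above. */
#include <stdint.h>
#include <stdio.h>

static uint32_t roundup_pow_of_two_demo(uint32_t v)
{
	uint32_t p = 1;

	while (p < v)
		p <<= 1;
	return p;
}

int main(void)
{
	uint32_t vals[] = { 1, 3, 6, 8 };

	for (int i = 0; i < 4; i++) {
		uint32_t v = vals[i] < 2 ? 2 : roundup_pow_of_two_demo(vals[i]);
		printf("%u -> %u\n", vals[i], v); /* 1->2, 3->4, 6->8, 8->8 */
	}
	return 0;
}
```
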
| 995 | 1369 | |
|---|
| 996 | 1370 | amdgpu_device_check_vm_size(adev); |
|---|
| 997 | 1371 | |
|---|
| 998 | 1372 | amdgpu_device_check_block_size(adev); |
|---|
| 999 | 1373 | |
|---|
| 1000 | | - if (amdgpu_vram_page_split != -1 && (amdgpu_vram_page_split < 16 || |
|---|
| 1001 | | - !is_power_of_2(amdgpu_vram_page_split))) { |
|---|
| 1002 | | - dev_warn(adev->dev, "invalid VRAM page split (%d)\n", |
|---|
| 1003 | | - amdgpu_vram_page_split); |
|---|
| 1004 | | - amdgpu_vram_page_split = 1024; |
|---|
| 1005 | | - } |
|---|
| 1006 | | - |
|---|
| 1007 | | - if (amdgpu_lockup_timeout == 0) { |
|---|
| 1008 | | - dev_warn(adev->dev, "lockup_timeout msut be > 0, adjusting to 10000\n"); |
|---|
| 1009 | | - amdgpu_lockup_timeout = 10000; |
|---|
| 1010 | | - } |
|---|
| 1011 | | - |
|---|
| 1012 | 1374 | adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); |
|---|
| 1375 | + |
|---|
| 1376 | + amdgpu_gmc_tmz_set(adev); |
|---|
| 1377 | + |
|---|
| 1378 | + if (amdgpu_num_kcq == -1) { |
|---|
| 1379 | + amdgpu_num_kcq = 8; |
|---|
| 1380 | + } else if (amdgpu_num_kcq > 8 || amdgpu_num_kcq < 0) { |
|---|
| 1381 | + amdgpu_num_kcq = 8; |
|---|
| 1382 | + dev_warn(adev->dev, "set kernel compute queue number to 8 due to invalid parameter provided by user\n"); |
|---|
| 1383 | + } |
|---|
| 1384 | + |
|---|
| 1385 | + amdgpu_gmc_noretry_set(adev); |
|---|
| 1386 | + |
|---|
| 1387 | + return 0; |
|---|
| 1013 | 1388 | } |
|---|
| 1014 | 1389 | |
|---|
| 1015 | 1390 | /** |
|---|
| .. | .. |
|---|
| 1021 | 1396 | * Callback for the switcheroo driver. Suspends or resumes the |
|---|
| 1022 | 1397 | * asics before or after it is powered up using ACPI methods. |
|---|
| 1023 | 1398 | */ |
|---|
| 1024 | | -static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, enum vga_switcheroo_state state) |
|---|
| 1399 | +static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, |
|---|
| 1400 | + enum vga_switcheroo_state state) |
|---|
| 1025 | 1401 | { |
|---|
| 1026 | 1402 | struct drm_device *dev = pci_get_drvdata(pdev); |
|---|
| 1403 | + int r; |
|---|
| 1027 | 1404 | |
|---|
| 1028 | | - if (amdgpu_device_is_px(dev) && state == VGA_SWITCHEROO_OFF) |
|---|
| 1405 | + if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF) |
|---|
| 1029 | 1406 | return; |
|---|
| 1030 | 1407 | |
|---|
| 1031 | 1408 | if (state == VGA_SWITCHEROO_ON) { |
|---|
| 1032 | | - pr_info("amdgpu: switched on\n"); |
|---|
| 1409 | + pr_info("switched on\n"); |
|---|
| 1033 | 1410 | /* don't suspend or resume card normally */ |
|---|
| 1034 | 1411 | dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; |
|---|
| 1035 | 1412 | |
|---|
| 1036 | | - amdgpu_device_resume(dev, true, true); |
|---|
| 1413 | + pci_set_power_state(dev->pdev, PCI_D0); |
|---|
| 1414 | + amdgpu_device_load_pci_state(dev->pdev); |
|---|
| 1415 | + r = pci_enable_device(dev->pdev); |
|---|
| 1416 | + if (r) |
|---|
| 1417 | + DRM_WARN("pci_enable_device failed (%d)\n", r); |
|---|
| 1418 | + amdgpu_device_resume(dev, true); |
|---|
| 1037 | 1419 | |
|---|
| 1038 | 1420 | dev->switch_power_state = DRM_SWITCH_POWER_ON; |
|---|
| 1039 | 1421 | drm_kms_helper_poll_enable(dev); |
|---|
| 1040 | 1422 | } else { |
|---|
| 1041 | | - pr_info("amdgpu: switched off\n"); |
|---|
| 1423 | + pr_info("switched off\n"); |
|---|
| 1042 | 1424 | drm_kms_helper_poll_disable(dev); |
|---|
| 1043 | 1425 | dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; |
|---|
| 1044 | | - amdgpu_device_suspend(dev, true, true); |
|---|
| 1426 | + amdgpu_device_suspend(dev, true); |
|---|
| 1427 | + amdgpu_device_cache_pci_state(dev->pdev); |
|---|
| 1428 | + /* Shut down the device */ |
|---|
| 1429 | + pci_disable_device(dev->pdev); |
|---|
| 1430 | + pci_set_power_state(dev->pdev, PCI_D3cold); |
|---|
| 1045 | 1431 | dev->switch_power_state = DRM_SWITCH_POWER_OFF; |
|---|
| 1046 | 1432 | } |
|---|
| 1047 | 1433 | } |
|---|
| .. | .. |
|---|
| 1064 | 1450 | * locking inversion with the driver load path. And the access here is |
|---|
| 1065 | 1451 | * completely racy anyway. So don't bother with locking for now. |
|---|
| 1066 | 1452 | */ |
|---|
| 1067 | | - return dev->open_count == 0; |
|---|
| 1453 | + return atomic_read(&dev->open_count) == 0; |
|---|
| 1068 | 1454 | } |
|---|
| 1069 | 1455 | |
|---|
| 1070 | 1456 | static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { |
|---|
| .. | .. |
|---|
| 1304 | 1690 | adev->enable_virtual_display = false; |
|---|
| 1305 | 1691 | |
|---|
| 1306 | 1692 | if (amdgpu_virtual_display) { |
|---|
| 1307 | | - struct drm_device *ddev = adev->ddev; |
|---|
| 1693 | + struct drm_device *ddev = adev_to_drm(adev); |
|---|
| 1308 | 1694 | const char *pci_address_name = pci_name(ddev->pdev); |
|---|
| 1309 | 1695 | char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; |
|---|
| 1310 | 1696 | |
|---|
| .. | .. |
|---|
| 1357 | 1743 | static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) |
|---|
| 1358 | 1744 | { |
|---|
| 1359 | 1745 | const char *chip_name; |
|---|
| 1360 | | - char fw_name[30]; |
|---|
| 1746 | + char fw_name[40]; |
|---|
| 1361 | 1747 | int err; |
|---|
| 1362 | 1748 | const struct gpu_info_firmware_header_v1_0 *hdr; |
|---|
| 1363 | 1749 | |
|---|
| 1364 | 1750 | adev->firmware.gpu_info_fw = NULL; |
|---|
| 1365 | 1751 | |
|---|
| 1752 | + if (adev->mman.discovery_bin) { |
|---|
| 1753 | + amdgpu_discovery_get_gfx_info(adev); |
|---|
| 1754 | + |
|---|
| 1755 | + /* |
|---|
| 1756 | + * FIXME: The bounding box is still needed by Navi12, so |
|---|
| 1757 | +  * temporarily read it from gpu_info firmware. Should be dropped |
|---|
| 1758 | + * when DAL no longer needs it. |
|---|
| 1759 | + */ |
|---|
| 1760 | + if (adev->asic_type != CHIP_NAVI12) |
|---|
| 1761 | + return 0; |
|---|
| 1762 | + } |
|---|
| 1763 | + |
|---|
| 1366 | 1764 | switch (adev->asic_type) { |
|---|
| 1367 | | - case CHIP_TOPAZ: |
|---|
| 1368 | | - case CHIP_TONGA: |
|---|
| 1369 | | - case CHIP_FIJI: |
|---|
| 1370 | | - case CHIP_POLARIS10: |
|---|
| 1371 | | - case CHIP_POLARIS11: |
|---|
| 1372 | | - case CHIP_POLARIS12: |
|---|
| 1373 | | - case CHIP_VEGAM: |
|---|
| 1374 | | - case CHIP_CARRIZO: |
|---|
| 1375 | | - case CHIP_STONEY: |
|---|
| 1376 | 1765 | #ifdef CONFIG_DRM_AMDGPU_SI |
|---|
| 1377 | 1766 | case CHIP_VERDE: |
|---|
| 1378 | 1767 | case CHIP_TAHITI: |
|---|
| .. | .. |
|---|
| 1387 | 1776 | case CHIP_KABINI: |
|---|
| 1388 | 1777 | case CHIP_MULLINS: |
|---|
| 1389 | 1778 | #endif |
|---|
| 1779 | + case CHIP_TOPAZ: |
|---|
| 1780 | + case CHIP_TONGA: |
|---|
| 1781 | + case CHIP_FIJI: |
|---|
| 1782 | + case CHIP_POLARIS10: |
|---|
| 1783 | + case CHIP_POLARIS11: |
|---|
| 1784 | + case CHIP_POLARIS12: |
|---|
| 1785 | + case CHIP_VEGAM: |
|---|
| 1786 | + case CHIP_CARRIZO: |
|---|
| 1787 | + case CHIP_STONEY: |
|---|
| 1390 | 1788 | case CHIP_VEGA20: |
|---|
| 1789 | + case CHIP_SIENNA_CICHLID: |
|---|
| 1790 | + case CHIP_NAVY_FLOUNDER: |
|---|
| 1391 | 1791 | default: |
|---|
| 1392 | 1792 | return 0; |
|---|
| 1393 | 1793 | case CHIP_VEGA10: |
|---|
| .. | .. |
|---|
| 1397 | 1797 | chip_name = "vega12"; |
|---|
| 1398 | 1798 | break; |
|---|
| 1399 | 1799 | case CHIP_RAVEN: |
|---|
| 1400 | | - chip_name = "raven"; |
|---|
| 1800 | + if (adev->apu_flags & AMD_APU_IS_RAVEN2) |
|---|
| 1801 | + chip_name = "raven2"; |
|---|
| 1802 | + else if (adev->apu_flags & AMD_APU_IS_PICASSO) |
|---|
| 1803 | + chip_name = "picasso"; |
|---|
| 1804 | + else |
|---|
| 1805 | + chip_name = "raven"; |
|---|
| 1806 | + break; |
|---|
| 1807 | + case CHIP_ARCTURUS: |
|---|
| 1808 | + chip_name = "arcturus"; |
|---|
| 1809 | + break; |
|---|
| 1810 | + case CHIP_RENOIR: |
|---|
| 1811 | + if (adev->apu_flags & AMD_APU_IS_RENOIR) |
|---|
| 1812 | + chip_name = "renoir"; |
|---|
| 1813 | + else |
|---|
| 1814 | + chip_name = "green_sardine"; |
|---|
| 1815 | + break; |
|---|
| 1816 | + case CHIP_NAVI10: |
|---|
| 1817 | + chip_name = "navi10"; |
|---|
| 1818 | + break; |
|---|
| 1819 | + case CHIP_NAVI14: |
|---|
| 1820 | + chip_name = "navi14"; |
|---|
| 1821 | + break; |
|---|
| 1822 | + case CHIP_NAVI12: |
|---|
| 1823 | + chip_name = "navi12"; |
|---|
| 1401 | 1824 | break; |
|---|
| 1402 | 1825 | } |
|---|
| 1403 | 1826 | |
|---|
| .. | .. |
|---|
| 1427 | 1850 | (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + |
|---|
| 1428 | 1851 | le32_to_cpu(hdr->header.ucode_array_offset_bytes)); |
|---|
| 1429 | 1852 | |
|---|
| 1853 | + /* |
|---|
| 1854 | +  * Should be dropped when DAL no longer needs it. |
|---|
| 1855 | + */ |
|---|
| 1856 | + if (adev->asic_type == CHIP_NAVI12) |
|---|
| 1857 | + goto parse_soc_bounding_box; |
|---|
| 1858 | + |
|---|
| 1430 | 1859 | adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); |
|---|
| 1431 | 1860 | adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); |
|---|
| 1432 | 1861 | adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); |
|---|
| .. | .. |
|---|
| 1445 | 1874 | adev->gfx.cu_info.max_scratch_slots_per_cu = |
|---|
| 1446 | 1875 | le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); |
|---|
| 1447 | 1876 | adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); |
|---|
| 1877 | + if (hdr->version_minor >= 1) { |
|---|
| 1878 | + const struct gpu_info_firmware_v1_1 *gpu_info_fw = |
|---|
| 1879 | + (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + |
|---|
| 1880 | + le32_to_cpu(hdr->header.ucode_array_offset_bytes)); |
|---|
| 1881 | + adev->gfx.config.num_sc_per_sh = |
|---|
| 1882 | + le32_to_cpu(gpu_info_fw->num_sc_per_sh); |
|---|
| 1883 | + adev->gfx.config.num_packer_per_sc = |
|---|
| 1884 | + le32_to_cpu(gpu_info_fw->num_packer_per_sc); |
|---|
| 1885 | + } |
|---|
| 1886 | + |
|---|
| 1887 | +parse_soc_bounding_box: |
|---|
| 1888 | + /* |
|---|
| 1889 | +  * soc bounding box info is not integrated into the discovery table, |
|---|
| 1890 | +  * so we still need to parse it from the gpu info firmware when needed. |
|---|
| 1891 | + */ |
|---|
| 1892 | + if (hdr->version_minor == 2) { |
|---|
| 1893 | + const struct gpu_info_firmware_v1_2 *gpu_info_fw = |
|---|
| 1894 | + (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + |
|---|
| 1895 | + le32_to_cpu(hdr->header.ucode_array_offset_bytes)); |
|---|
| 1896 | + adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; |
|---|
| 1897 | + } |
|---|
| 1448 | 1898 | break; |
|---|
| 1449 | 1899 | } |
|---|
| 1450 | 1900 | default: |
|---|
| .. | .. |
|---|
| 1473 | 1923 | |
|---|
| 1474 | 1924 | amdgpu_device_enable_virtual_display(adev); |
|---|
| 1475 | 1925 | |
|---|
| 1476 | | - switch (adev->asic_type) { |
|---|
| 1477 | | - case CHIP_TOPAZ: |
|---|
| 1478 | | - case CHIP_TONGA: |
|---|
| 1479 | | - case CHIP_FIJI: |
|---|
| 1480 | | - case CHIP_POLARIS10: |
|---|
| 1481 | | - case CHIP_POLARIS11: |
|---|
| 1482 | | - case CHIP_POLARIS12: |
|---|
| 1483 | | - case CHIP_VEGAM: |
|---|
| 1484 | | - case CHIP_CARRIZO: |
|---|
| 1485 | | - case CHIP_STONEY: |
|---|
| 1486 | | - if (adev->asic_type == CHIP_CARRIZO || adev->asic_type == CHIP_STONEY) |
|---|
| 1487 | | - adev->family = AMDGPU_FAMILY_CZ; |
|---|
| 1488 | | - else |
|---|
| 1489 | | - adev->family = AMDGPU_FAMILY_VI; |
|---|
| 1490 | | - |
|---|
| 1491 | | - r = vi_set_ip_blocks(adev); |
|---|
| 1926 | + if (amdgpu_sriov_vf(adev)) { |
|---|
| 1927 | + r = amdgpu_virt_request_full_gpu(adev, true); |
|---|
| 1492 | 1928 | if (r) |
|---|
| 1493 | 1929 | return r; |
|---|
| 1494 | | - break; |
|---|
| 1930 | + } |
|---|
| 1931 | + |
|---|
| 1932 | + switch (adev->asic_type) { |
|---|
| 1495 | 1933 | #ifdef CONFIG_DRM_AMDGPU_SI |
|---|
| 1496 | 1934 | case CHIP_VERDE: |
|---|
| 1497 | 1935 | case CHIP_TAHITI: |
|---|
| .. | .. |
|---|
| 1510 | 1948 | case CHIP_KAVERI: |
|---|
| 1511 | 1949 | case CHIP_KABINI: |
|---|
| 1512 | 1950 | case CHIP_MULLINS: |
|---|
| 1513 | | - if ((adev->asic_type == CHIP_BONAIRE) || (adev->asic_type == CHIP_HAWAII)) |
|---|
| 1514 | | - adev->family = AMDGPU_FAMILY_CI; |
|---|
| 1515 | | - else |
|---|
| 1951 | + if (adev->flags & AMD_IS_APU) |
|---|
| 1516 | 1952 | adev->family = AMDGPU_FAMILY_KV; |
|---|
| 1953 | + else |
|---|
| 1954 | + adev->family = AMDGPU_FAMILY_CI; |
|---|
| 1517 | 1955 | |
|---|
| 1518 | 1956 | r = cik_set_ip_blocks(adev); |
|---|
| 1519 | 1957 | if (r) |
|---|
| 1520 | 1958 | return r; |
|---|
| 1521 | 1959 | break; |
|---|
| 1522 | 1960 | #endif |
|---|
| 1961 | + case CHIP_TOPAZ: |
|---|
| 1962 | + case CHIP_TONGA: |
|---|
| 1963 | + case CHIP_FIJI: |
|---|
| 1964 | + case CHIP_POLARIS10: |
|---|
| 1965 | + case CHIP_POLARIS11: |
|---|
| 1966 | + case CHIP_POLARIS12: |
|---|
| 1967 | + case CHIP_VEGAM: |
|---|
| 1968 | + case CHIP_CARRIZO: |
|---|
| 1969 | + case CHIP_STONEY: |
|---|
| 1970 | + if (adev->flags & AMD_IS_APU) |
|---|
| 1971 | + adev->family = AMDGPU_FAMILY_CZ; |
|---|
| 1972 | + else |
|---|
| 1973 | + adev->family = AMDGPU_FAMILY_VI; |
|---|
| 1974 | + |
|---|
| 1975 | + r = vi_set_ip_blocks(adev); |
|---|
| 1976 | + if (r) |
|---|
| 1977 | + return r; |
|---|
| 1978 | + break; |
|---|
| 1523 | 1979 | case CHIP_VEGA10: |
|---|
| 1524 | 1980 | case CHIP_VEGA12: |
|---|
| 1525 | 1981 | case CHIP_VEGA20: |
|---|
| 1526 | 1982 | case CHIP_RAVEN: |
|---|
| 1527 | | - if (adev->asic_type == CHIP_RAVEN) |
|---|
| 1983 | + case CHIP_ARCTURUS: |
|---|
| 1984 | + case CHIP_RENOIR: |
|---|
| 1985 | + if (adev->flags & AMD_IS_APU) |
|---|
| 1528 | 1986 | adev->family = AMDGPU_FAMILY_RV; |
|---|
| 1529 | 1987 | else |
|---|
| 1530 | 1988 | adev->family = AMDGPU_FAMILY_AI; |
|---|
| .. | .. |
|---|
| 1533 | 1991 | if (r) |
|---|
| 1534 | 1992 | return r; |
|---|
| 1535 | 1993 | break; |
|---|
| 1994 | + case CHIP_NAVI10: |
|---|
| 1995 | + case CHIP_NAVI14: |
|---|
| 1996 | + case CHIP_NAVI12: |
|---|
| 1997 | + case CHIP_SIENNA_CICHLID: |
|---|
| 1998 | + case CHIP_NAVY_FLOUNDER: |
|---|
| 1999 | + adev->family = AMDGPU_FAMILY_NV; |
|---|
| 2000 | + |
|---|
| 2001 | + r = nv_set_ip_blocks(adev); |
|---|
| 2002 | + if (r) |
|---|
| 2003 | + return r; |
|---|
| 2004 | + break; |
|---|
| 1536 | 2005 | default: |
|---|
| 1537 | 2006 | /* FIXME: not supported yet */ |
|---|
| 1538 | 2007 | return -EINVAL; |
|---|
| 1539 | 2008 | } |
|---|
| 1540 | 2009 | |
|---|
| 1541 | | - r = amdgpu_device_parse_gpu_info_fw(adev); |
|---|
| 1542 | | - if (r) |
|---|
| 1543 | | - return r; |
|---|
| 1544 | | - |
|---|
| 1545 | 2010 | amdgpu_amdkfd_device_probe(adev); |
|---|
| 1546 | 2011 | |
|---|
| 1547 | | - if (amdgpu_sriov_vf(adev)) { |
|---|
| 1548 | | - r = amdgpu_virt_request_full_gpu(adev, true); |
|---|
| 1549 | | - if (r) |
|---|
| 1550 | | - return -EAGAIN; |
|---|
| 1551 | | - } |
|---|
| 1552 | | - |
|---|
| 1553 | | - adev->powerplay.pp_feature = amdgpu_pp_feature_mask; |
|---|
| 2012 | + adev->pm.pp_feature = amdgpu_pp_feature_mask; |
|---|
| 2013 | + if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) |
|---|
| 2014 | + adev->pm.pp_feature &= ~PP_GFXOFF_MASK; |
|---|
| 1554 | 2015 | |
|---|
| 1555 | 2016 | for (i = 0; i < adev->num_ip_blocks; i++) { |
|---|
| 1556 | 2017 | if ((amdgpu_ip_block_mask & (1 << i)) == 0) { |
|---|
| .. | .. |
|---|
| 1573 | 2034 | adev->ip_blocks[i].status.valid = true; |
|---|
| 1574 | 2035 | } |
|---|
| 1575 | 2036 | } |
|---|
| 2037 | + /* get the vbios after the asic_funcs are set up */ |
|---|
| 2038 | + if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { |
|---|
| 2039 | + r = amdgpu_device_parse_gpu_info_fw(adev); |
|---|
| 2040 | + if (r) |
|---|
| 2041 | + return r; |
|---|
| 2042 | + |
|---|
| 2043 | + /* Read BIOS */ |
|---|
| 2044 | + if (!amdgpu_get_bios(adev)) |
|---|
| 2045 | + return -EINVAL; |
|---|
| 2046 | + |
|---|
| 2047 | + r = amdgpu_atombios_init(adev); |
|---|
| 2048 | + if (r) { |
|---|
| 2049 | + dev_err(adev->dev, "amdgpu_atombios_init failed\n"); |
|---|
| 2050 | + amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); |
|---|
| 2051 | + return r; |
|---|
| 2052 | + } |
|---|
| 2053 | + |
|---|
| 2054 | + /*get pf2vf msg info at it's earliest time*/ |
|---|
| 2055 | + if (amdgpu_sriov_vf(adev)) |
|---|
| 2056 | + amdgpu_virt_init_data_exchange(adev); |
|---|
| 2057 | + |
|---|
| 2058 | + } |
|---|
| 1576 | 2059 | } |
|---|
| 1577 | 2060 | |
|---|
| 1578 | 2061 | adev->cg_flags &= amdgpu_cg_mask; |
|---|
| 1579 | 2062 | adev->pg_flags &= amdgpu_pg_mask; |
|---|
| 1580 | 2063 | |
|---|
| 1581 | 2064 | return 0; |
|---|
| 2065 | +} |
|---|
| 2066 | + |
|---|
| 2067 | +static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) |
|---|
| 2068 | +{ |
|---|
| 2069 | + int i, r; |
|---|
| 2070 | + |
|---|
| 2071 | + for (i = 0; i < adev->num_ip_blocks; i++) { |
|---|
| 2072 | + if (!adev->ip_blocks[i].status.sw) |
|---|
| 2073 | + continue; |
|---|
| 2074 | + if (adev->ip_blocks[i].status.hw) |
|---|
| 2075 | + continue; |
|---|
| 2076 | + if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || |
|---|
| 2077 | + (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || |
|---|
| 2078 | + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { |
|---|
| 2079 | + r = adev->ip_blocks[i].version->funcs->hw_init(adev); |
|---|
| 2080 | + if (r) { |
|---|
| 2081 | + DRM_ERROR("hw_init of IP block <%s> failed %d\n", |
|---|
| 2082 | + adev->ip_blocks[i].version->funcs->name, r); |
|---|
| 2083 | + return r; |
|---|
| 2084 | + } |
|---|
| 2085 | + adev->ip_blocks[i].status.hw = true; |
|---|
| 2086 | + } |
|---|
| 2087 | + } |
|---|
| 2088 | + |
|---|
| 2089 | + return 0; |
|---|
| 2090 | +} |
|---|
| 2091 | + |
|---|
| 2092 | +static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) |
|---|
| 2093 | +{ |
|---|
| 2094 | + int i, r; |
|---|
| 2095 | + |
|---|
| 2096 | + for (i = 0; i < adev->num_ip_blocks; i++) { |
|---|
| 2097 | + if (!adev->ip_blocks[i].status.sw) |
|---|
| 2098 | + continue; |
|---|
| 2099 | + if (adev->ip_blocks[i].status.hw) |
|---|
| 2100 | + continue; |
|---|
| 2101 | + r = adev->ip_blocks[i].version->funcs->hw_init(adev); |
|---|
| 2102 | + if (r) { |
|---|
| 2103 | + DRM_ERROR("hw_init of IP block <%s> failed %d\n", |
|---|
| 2104 | + adev->ip_blocks[i].version->funcs->name, r); |
|---|
| 2105 | + return r; |
|---|
| 2106 | + } |
|---|
| 2107 | + adev->ip_blocks[i].status.hw = true; |
|---|
| 2108 | + } |
|---|
| 2109 | + |
|---|
| 2110 | + return 0; |
|---|
| 2111 | +} |
|---|
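The two init passes above differ only in their filter: phase 1 brings up COMMON, IH, and (under SR-IOV) PSP so that firmware loading can run in between, and phase 2 then initializes every remaining block; the status.hw flag is what lets the second pass skip work the first already did. A minimal userspace model of the two-pass walk, with made-up types and names (nothing below is driver API):

	/* Userspace sketch of the two-phase hw_init walk; the phase-1
	 * predicate mirrors the diff, everything else is illustrative. */
	#include <stdbool.h>
	#include <stdio.h>

	enum blk_type { BLK_COMMON, BLK_IH, BLK_PSP, BLK_GMC, BLK_GFX };

	struct blk {
		enum blk_type type;
		const char *name;
		bool sw, hw;
	};

	static bool is_phase1(const struct blk *b, bool sriov)
	{
		return b->type == BLK_COMMON || b->type == BLK_IH ||
		       (sriov && b->type == BLK_PSP);
	}

	static void hw_init_pass(struct blk *b, int n, bool sriov, bool phase1)
	{
		for (int i = 0; i < n; i++) {
			if (!b[i].sw || b[i].hw)
				continue;	/* not ready, or already up */
			if (phase1 && !is_phase1(&b[i], sriov))
				continue;	/* left for phase 2 */
			printf("hw_init %s\n", b[i].name);
			b[i].hw = true;		/* phase 2 skips this block */
		}
	}

	int main(void)
	{
		struct blk b[] = {
			{ BLK_COMMON, "common", true, false },
			{ BLK_GMC,    "gmc",    true, true  },	/* done earlier */
			{ BLK_IH,     "ih",     true, false },
			{ BLK_GFX,    "gfx",    true, false },
		};

		hw_init_pass(b, 4, false, true);	/* phase 1: common, ih */
		hw_init_pass(b, 4, false, false);	/* phase 2: gfx */
		return 0;
	}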
| 2112 | + |
|---|
| 2113 | +static int amdgpu_device_fw_loading(struct amdgpu_device *adev) |
|---|
| 2114 | +{ |
|---|
| 2115 | + int r = 0; |
|---|
| 2116 | + int i; |
|---|
| 2117 | + uint32_t smu_version; |
|---|
| 2118 | + |
|---|
| 2119 | + if (adev->asic_type >= CHIP_VEGA10) { |
|---|
| 2120 | + for (i = 0; i < adev->num_ip_blocks; i++) { |
|---|
| 2121 | + if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) |
|---|
| 2122 | + continue; |
|---|
| 2123 | + |
|---|
| 2124 | + /* no need to do the fw loading again if already done */
|---|
| 2125 | + if (adev->ip_blocks[i].status.hw == true) |
|---|
| 2126 | + break; |
|---|
| 2127 | + |
|---|
| 2128 | + if (amdgpu_in_reset(adev) || adev->in_suspend) { |
|---|
| 2129 | + r = adev->ip_blocks[i].version->funcs->resume(adev); |
|---|
| 2130 | + if (r) { |
|---|
| 2131 | + DRM_ERROR("resume of IP block <%s> failed %d\n", |
|---|
| 2132 | + adev->ip_blocks[i].version->funcs->name, r); |
|---|
| 2133 | + return r; |
|---|
| 2134 | + } |
|---|
| 2135 | + } else { |
|---|
| 2136 | + r = adev->ip_blocks[i].version->funcs->hw_init(adev); |
|---|
| 2137 | + if (r) { |
|---|
| 2138 | + DRM_ERROR("hw_init of IP block <%s> failed %d\n", |
|---|
| 2139 | + adev->ip_blocks[i].version->funcs->name, r); |
|---|
| 2140 | + return r; |
|---|
| 2141 | + } |
|---|
| 2142 | + } |
|---|
| 2143 | + |
|---|
| 2144 | + adev->ip_blocks[i].status.hw = true; |
|---|
| 2145 | + break; |
|---|
| 2146 | + } |
|---|
| 2147 | + } |
|---|
| 2148 | + |
|---|
| 2149 | + if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) |
|---|
| 2150 | + r = amdgpu_pm_load_smu_firmware(adev, &smu_version); |
|---|
| 2151 | + |
|---|
| 2152 | + return r; |
|---|
| 1582 | 2153 | } |
|---|
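For ASICs with a PSP block (VEGA10 and newer) the loop above either resumes PSP on warm paths (GPU reset, S3 resume) or cold-initializes it, before the SMU firmware is loaded. A sketch of that decision in isolation; struct psp_blk, psp_resume() and psp_hw_init() are hypothetical stand-ins, not driver functions:

	#include <stdbool.h>

	struct psp_blk { bool hw; };

	static int psp_resume(struct psp_blk *p)  { p->hw = true; return 0; }
	static int psp_hw_init(struct psp_blk *p) { p->hw = true; return 0; }

	static int bring_up_psp(struct psp_blk *psp, bool in_reset, bool in_suspend)
	{
		if (psp->hw)
			return 0;		/* firmware already loaded */
		if (in_reset || in_suspend)
			return psp_resume(psp);	/* warm path: restore state */
		return psp_hw_init(psp);	/* cold path: full bring-up */
	}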
| 1583 | 2154 | |
|---|
| 1584 | 2155 | /** |
|---|
| .. | .. |
|---|
| 1596 | 2167 | { |
|---|
| 1597 | 2168 | int i, r; |
|---|
| 1598 | 2169 | |
|---|
| 2170 | + r = amdgpu_ras_init(adev); |
|---|
| 2171 | + if (r) |
|---|
| 2172 | + return r; |
|---|
| 2173 | + |
|---|
| 1599 | 2174 | for (i = 0; i < adev->num_ip_blocks; i++) { |
|---|
| 1600 | 2175 | if (!adev->ip_blocks[i].status.valid) |
|---|
| 1601 | 2176 | continue; |
|---|
| .. | .. |
|---|
| 1603 | 2178 | if (r) { |
|---|
| 1604 | 2179 | DRM_ERROR("sw_init of IP block <%s> failed %d\n", |
|---|
| 1605 | 2180 | adev->ip_blocks[i].version->funcs->name, r); |
|---|
| 1606 | | - return r; |
|---|
| 2181 | + goto init_failed; |
|---|
| 1607 | 2182 | } |
|---|
| 1608 | 2183 | adev->ip_blocks[i].status.sw = true; |
|---|
| 1609 | 2184 | |
|---|
| 1610 | 2185 | /* need to do gmc hw init early so we can allocate gpu mem */ |
|---|
| 1611 | 2186 | if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { |
|---|
| 2187 | + /* Try to reserve bad pages early */ |
|---|
| 2188 | + if (amdgpu_sriov_vf(adev)) |
|---|
| 2189 | + amdgpu_virt_exchange_data(adev); |
|---|
| 2190 | + |
|---|
| 1612 | 2191 | r = amdgpu_device_vram_scratch_init(adev); |
|---|
| 1613 | 2192 | if (r) { |
|---|
| 1614 | 2193 | DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); |
|---|
| 1615 | | - return r; |
|---|
| 2194 | + goto init_failed; |
|---|
| 1616 | 2195 | } |
|---|
| 1617 | 2196 | r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); |
|---|
| 1618 | 2197 | if (r) { |
|---|
| 1619 | 2198 | DRM_ERROR("hw_init %d failed %d\n", i, r); |
|---|
| 1620 | | - return r; |
|---|
| 2199 | + goto init_failed; |
|---|
| 1621 | 2200 | } |
|---|
| 1622 | 2201 | r = amdgpu_device_wb_init(adev); |
|---|
| 1623 | 2202 | if (r) { |
|---|
| 1624 | 2203 | DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); |
|---|
| 1625 | | - return r; |
|---|
| 2204 | + goto init_failed; |
|---|
| 1626 | 2205 | } |
|---|
| 1627 | 2206 | adev->ip_blocks[i].status.hw = true; |
|---|
| 1628 | 2207 | |
|---|
| 1629 | 2208 | /* right after GMC hw init, we create CSA */ |
|---|
| 1630 | | - if (amdgpu_sriov_vf(adev)) { |
|---|
| 1631 | | - r = amdgpu_allocate_static_csa(adev); |
|---|
| 2209 | + if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) { |
|---|
| 2210 | + r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, |
|---|
| 2211 | + AMDGPU_GEM_DOMAIN_VRAM, |
|---|
| 2212 | + AMDGPU_CSA_SIZE); |
|---|
| 1632 | 2213 | if (r) { |
|---|
| 1633 | 2214 | DRM_ERROR("allocate CSA failed %d\n", r); |
|---|
| 1634 | | - return r; |
|---|
| 2215 | + goto init_failed; |
|---|
| 1635 | 2216 | } |
|---|
| 1636 | 2217 | } |
|---|
| 1637 | 2218 | } |
|---|
| 1638 | 2219 | } |
|---|
| 1639 | 2220 | |
|---|
| 1640 | | - for (i = 0; i < adev->num_ip_blocks; i++) { |
|---|
| 1641 | | - if (!adev->ip_blocks[i].status.sw) |
|---|
| 1642 | | - continue; |
|---|
| 1643 | | - if (adev->ip_blocks[i].status.hw) |
|---|
| 1644 | | - continue; |
|---|
| 1645 | | - r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); |
|---|
| 1646 | | - if (r) { |
|---|
| 1647 | | - DRM_ERROR("hw_init of IP block <%s> failed %d\n", |
|---|
| 1648 | | - adev->ip_blocks[i].version->funcs->name, r); |
|---|
| 1649 | | - return r; |
|---|
| 1650 | | - } |
|---|
| 1651 | | - adev->ip_blocks[i].status.hw = true; |
|---|
| 2221 | + if (amdgpu_sriov_vf(adev)) |
|---|
| 2222 | + amdgpu_virt_init_data_exchange(adev); |
|---|
| 2223 | + |
|---|
| 2224 | + r = amdgpu_ib_pool_init(adev); |
|---|
| 2225 | + if (r) { |
|---|
| 2226 | + dev_err(adev->dev, "IB initialization failed (%d).\n", r); |
|---|
| 2227 | + amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); |
|---|
| 2228 | + goto init_failed; |
|---|
| 1652 | 2229 | } |
|---|
| 1653 | 2230 | |
|---|
| 2231 | + r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init is complete */
|---|
| 2232 | + if (r) |
|---|
| 2233 | + goto init_failed; |
|---|
| 2234 | + |
|---|
| 2235 | + r = amdgpu_device_ip_hw_init_phase1(adev); |
|---|
| 2236 | + if (r) |
|---|
| 2237 | + goto init_failed; |
|---|
| 2238 | + |
|---|
| 2239 | + r = amdgpu_device_fw_loading(adev); |
|---|
| 2240 | + if (r) |
|---|
| 2241 | + goto init_failed; |
|---|
| 2242 | + |
|---|
| 2243 | + r = amdgpu_device_ip_hw_init_phase2(adev); |
|---|
| 2244 | + if (r) |
|---|
| 2245 | + goto init_failed; |
|---|
| 2246 | + |
|---|
| 2247 | + /* |
|---|
| 2248 | + * retired pages will be loaded from eeprom and reserved here, |
|---|
| 2249 | + * it should be called after amdgpu_device_ip_hw_init_phase2 since |
|---|
| 2250 | + * for some ASICs the RAS EEPROM code relies on the SMU being fully
|---|
| 2251 | + * functional for I2C communication, which is only true at this point.
|---|
| 2252 | + *
|---|
| 2253 | + * amdgpu_ras_recovery_init may fail, but the upper layer only cares
|---|
| 2254 | + * about the failure from a bad gpu situation and stops the amdgpu
|---|
| 2255 | + * init process accordingly. For other failure cases, it still releases
|---|
| 2256 | + * all the resources and prints an error message, rather than returning
|---|
| 2257 | + * a negative value to the upper level.
|---|
| 2258 | + *
|---|
| 2259 | + * Note: theoretically, this should be called before all vram allocations
|---|
| 2260 | + * to protect retired pages from being reused.
|---|
| 2261 | + */ |
|---|
| 2262 | + r = amdgpu_ras_recovery_init(adev); |
|---|
| 2263 | + if (r) |
|---|
| 2264 | + goto init_failed; |
|---|
| 2265 | + |
|---|
| 2266 | + if (adev->gmc.xgmi.num_physical_nodes > 1) |
|---|
| 2267 | + amdgpu_xgmi_add_device(adev); |
|---|
| 1654 | 2268 | amdgpu_amdkfd_device_init(adev); |
|---|
| 1655 | 2269 | |
|---|
| 1656 | | - if (amdgpu_sriov_vf(adev)) { |
|---|
| 1657 | | - amdgpu_virt_init_data_exchange(adev); |
|---|
| 1658 | | - amdgpu_virt_release_full_gpu(adev, true); |
|---|
| 1659 | | - } |
|---|
| 2270 | + amdgpu_fru_get_product_info(adev); |
|---|
| 1660 | 2271 | |
|---|
| 1661 | | - return 0; |
|---|
| 2272 | +init_failed: |
|---|
| 2273 | + if (amdgpu_sriov_vf(adev)) |
|---|
| 2274 | + amdgpu_virt_release_full_gpu(adev, true); |
|---|
| 2275 | + |
|---|
| 2276 | + return r; |
|---|
| 1662 | 2277 | } |
|---|
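Note that init_failed: is reached on success as well: r is still 0 when execution falls through, so the label doubles as a shared epilogue that releases full-GPU access under SR-IOV on both outcomes. The bare shape of the pattern, with made-up helpers:

	/* Shared-epilogue pattern, illustrative helpers only. */
	static int do_step(void)   { return 0; }
	static void epilogue(void) { }

	static int init_with_epilogue(void)
	{
		int r;

		r = do_step();
		if (r)
			goto init_failed;
		r = do_step();		/* further init steps ... */
		if (r)
			goto init_failed;

	init_failed:
		epilogue();		/* runs on success and failure alike */
		return r;
	}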
| 1663 | 2278 | |
|---|
| 1664 | 2279 | /** |
|---|
| .. | .. |
|---|
| 1687 | 2302 | */ |
|---|
| 1688 | 2303 | static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) |
|---|
| 1689 | 2304 | { |
|---|
| 1690 | | - return !!memcmp(adev->gart.ptr, adev->reset_magic, |
|---|
| 1691 | | - AMDGPU_RESET_MAGIC_NUM); |
|---|
| 2305 | + if (memcmp(adev->gart.ptr, adev->reset_magic, |
|---|
| 2306 | + AMDGPU_RESET_MAGIC_NUM)) |
|---|
| 2307 | + return true; |
|---|
| 2308 | + |
|---|
| 2309 | + if (!amdgpu_in_reset(adev)) |
|---|
| 2310 | + return false; |
|---|
| 2311 | + |
|---|
| 2312 | + /* |
|---|
| 2313 | + * For all ASICs with baco/mode1 reset, the VRAM is |
|---|
| 2314 | + * always assumed to be lost. |
|---|
| 2315 | + */ |
|---|
| 2316 | + switch (amdgpu_asic_reset_method(adev)) { |
|---|
| 2317 | + case AMD_RESET_METHOD_BACO: |
|---|
| 2318 | + case AMD_RESET_METHOD_MODE1: |
|---|
| 2319 | + return true; |
|---|
| 2320 | + default: |
|---|
| 2321 | + return false; |
|---|
| 2322 | + } |
|---|
| 1692 | 2323 | } |
|---|
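The check above is two-tier: a magic pattern written into GART-visible memory at init time is compared after the fact, and even a matching pattern is distrusted when the reset method was BACO or mode-1, which always lose VRAM. A userspace sketch; the 64-byte size mirrors AMDGPU_RESET_MAGIC_NUM, the rest is illustrative:

	#include <stdbool.h>
	#include <string.h>

	#define RESET_MAGIC_NUM 64

	static unsigned char reset_magic[RESET_MAGIC_NUM];

	static bool vram_lost(const void *gart_ptr, bool in_reset,
			      bool baco_or_mode1)
	{
		if (memcmp(gart_ptr, reset_magic, RESET_MAGIC_NUM))
			return true;	/* magic pattern gone: VRAM lost */
		if (!in_reset)
			return false;
		return baco_or_mode1;	/* these resets always lose VRAM */
	}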
| 1693 | 2324 | |
|---|
| 1694 | 2325 | /** |
|---|
| 1695 | | - * amdgpu_device_ip_late_set_cg_state - late init for clockgating |
|---|
| 2326 | + * amdgpu_device_set_cg_state - set clockgating for amdgpu device |
|---|
| 1696 | 2327 | * |
|---|
| 1697 | 2328 | * @adev: amdgpu_device pointer |
|---|
| 2329 | + * @state: clockgating state (gate or ungate) |
|---|
| 1698 | 2330 | * |
|---|
| 1699 | | - * Late initialization pass enabling clockgating for hardware IPs. |
|---|
| 1700 | 2331 | * The list of all the hardware IPs that make up the asic is walked and the |
|---|
| 1701 | | - * set_clockgating_state callbacks are run. This stage is run late |
|---|
| 1702 | | - * in the init process. |
|---|
| 2332 | + * set_clockgating_state callbacks are run. |
|---|
| 2333 | + * During late init the pass enables clockgating for hardware IPs;
|---|
| 2334 | + * during fini or suspend it disables clockgating for hardware IPs.
|---|
| 1703 | 2335 | * Returns 0 on success, negative error code on failure. |
|---|
| 1704 | 2336 | */ |
|---|
| 1705 | | -static int amdgpu_device_ip_late_set_cg_state(struct amdgpu_device *adev) |
|---|
| 2337 | + |
|---|
| 2338 | +static int amdgpu_device_set_cg_state(struct amdgpu_device *adev, |
|---|
| 2339 | + enum amd_clockgating_state state) |
|---|
| 1706 | 2340 | { |
|---|
| 1707 | | - int i = 0, r; |
|---|
| 2341 | + int i, j, r; |
|---|
| 1708 | 2342 | |
|---|
| 1709 | 2343 | if (amdgpu_emu_mode == 1) |
|---|
| 1710 | 2344 | return 0; |
|---|
| 1711 | 2345 | |
|---|
| 1712 | | - for (i = 0; i < adev->num_ip_blocks; i++) { |
|---|
| 1713 | | - if (!adev->ip_blocks[i].status.valid) |
|---|
| 2346 | + for (j = 0; j < adev->num_ip_blocks; j++) { |
|---|
| 2347 | + i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; |
|---|
| 2348 | + if (!adev->ip_blocks[i].status.late_initialized) |
|---|
| 1714 | 2349 | continue; |
|---|
| 1715 | 2350 | /* skip CG for VCE/UVD, it's handled specially */ |
|---|
| 1716 | 2351 | if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && |
|---|
| 1717 | 2352 | adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && |
|---|
| 1718 | 2353 | adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && |
|---|
| 2354 | + adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && |
|---|
| 1719 | 2355 | adev->ip_blocks[i].version->funcs->set_clockgating_state) { |
|---|
| 1720 | 2356 | /* enable clockgating to save power */ |
|---|
| 1721 | 2357 | r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, |
|---|
| 1722 | | - AMD_CG_STATE_GATE); |
|---|
| 2358 | + state); |
|---|
| 1723 | 2359 | if (r) { |
|---|
| 1724 | 2360 | DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", |
|---|
| 1725 | 2361 | adev->ip_blocks[i].version->funcs->name, r); |
|---|
| .. | .. |
|---|
| 1731 | 2367 | return 0; |
|---|
| 1732 | 2368 | } |
|---|
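Gating and ungating now share one loop: the index runs forward when gating and backward when ungating, so blocks are ungated in reverse init order; UVD/VCE/VCN/JPEG are excluded because their gating is handled separately. The direction trick in isolation (illustrative sketch):

	#include <stdio.h>

	enum cg_state { CG_GATE, CG_UNGATE };

	static void walk_ip_blocks(int num_blocks, enum cg_state state)
	{
		for (int j = 0; j < num_blocks; j++) {
			/* forward when gating, reverse when ungating */
			int i = (state == CG_GATE) ? j : num_blocks - j - 1;

			printf("set state on block %d\n", i);
		}
	}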
| 1733 | 2369 | |
|---|
| 1734 | | -static int amdgpu_device_ip_late_set_pg_state(struct amdgpu_device *adev) |
|---|
| 2370 | +static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state) |
|---|
| 1735 | 2371 | { |
|---|
| 1736 | | - int i = 0, r; |
|---|
| 2372 | + int i, j, r; |
|---|
| 1737 | 2373 | |
|---|
| 1738 | 2374 | if (amdgpu_emu_mode == 1) |
|---|
| 1739 | 2375 | return 0; |
|---|
| 1740 | 2376 | |
|---|
| 1741 | | - for (i = 0; i < adev->num_ip_blocks; i++) { |
|---|
| 1742 | | - if (!adev->ip_blocks[i].status.valid) |
|---|
| 2377 | + for (j = 0; j < adev->num_ip_blocks; j++) { |
|---|
| 2378 | + i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; |
|---|
| 2379 | + if (!adev->ip_blocks[i].status.late_initialized) |
|---|
| 1743 | 2380 | continue; |
|---|
| 1744 | 2381 | /* skip CG for VCE/UVD, it's handled specially */ |
|---|
| 1745 | 2382 | if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && |
|---|
| 1746 | 2383 | adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && |
|---|
| 1747 | 2384 | adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && |
|---|
| 2385 | + adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && |
|---|
| 1748 | 2386 | adev->ip_blocks[i].version->funcs->set_powergating_state) { |
|---|
| 1749 | 2387 | /* enable powergating to save power */ |
|---|
| 1750 | 2388 | r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, |
|---|
| 1751 | | - AMD_PG_STATE_GATE); |
|---|
| 2389 | + state); |
|---|
| 1752 | 2390 | if (r) { |
|---|
| 1753 | 2391 | DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", |
|---|
| 1754 | 2392 | adev->ip_blocks[i].version->funcs->name, r); |
|---|
| .. | .. |
|---|
| 1757 | 2395 | } |
|---|
| 1758 | 2396 | } |
|---|
| 1759 | 2397 | return 0; |
|---|
| 2398 | +} |
|---|
| 2399 | + |
|---|
| 2400 | +static int amdgpu_device_enable_mgpu_fan_boost(void) |
|---|
| 2401 | +{ |
|---|
| 2402 | + struct amdgpu_gpu_instance *gpu_ins; |
|---|
| 2403 | + struct amdgpu_device *adev; |
|---|
| 2404 | + int i, ret = 0; |
|---|
| 2405 | + |
|---|
| 2406 | + mutex_lock(&mgpu_info.mutex); |
|---|
| 2407 | + |
|---|
| 2408 | + /* |
|---|
| 2409 | + * MGPU fan boost feature should be enabled |
|---|
| 2410 | + * only when there are two or more dGPUs in |
|---|
| 2411 | + * the system |
|---|
| 2412 | + */ |
|---|
| 2413 | + if (mgpu_info.num_dgpu < 2) |
|---|
| 2414 | + goto out; |
|---|
| 2415 | + |
|---|
| 2416 | + for (i = 0; i < mgpu_info.num_dgpu; i++) { |
|---|
| 2417 | + gpu_ins = &(mgpu_info.gpu_ins[i]); |
|---|
| 2418 | + adev = gpu_ins->adev; |
|---|
| 2419 | + if (!(adev->flags & AMD_IS_APU) && |
|---|
| 2420 | + !gpu_ins->mgpu_fan_enabled) { |
|---|
| 2421 | + ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); |
|---|
| 2422 | + if (ret) |
|---|
| 2423 | + break; |
|---|
| 2424 | + |
|---|
| 2425 | + gpu_ins->mgpu_fan_enabled = 1; |
|---|
| 2426 | + } |
|---|
| 2427 | + } |
|---|
| 2428 | + |
|---|
| 2429 | +out: |
|---|
| 2430 | + mutex_unlock(&mgpu_info.mutex); |
|---|
| 2431 | + |
|---|
| 2432 | + return ret; |
|---|
| 1760 | 2433 | } |
|---|
| 1761 | 2434 | |
|---|
| 1762 | 2435 | /** |
|---|
| .. | .. |
|---|
| 1773 | 2446 | */ |
|---|
| 1774 | 2447 | static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) |
|---|
| 1775 | 2448 | { |
|---|
| 2449 | + struct amdgpu_gpu_instance *gpu_instance; |
|---|
| 1776 | 2450 | int i = 0, r; |
|---|
| 1777 | 2451 | |
|---|
| 1778 | 2452 | for (i = 0; i < adev->num_ip_blocks; i++) { |
|---|
| 1779 | | - if (!adev->ip_blocks[i].status.valid) |
|---|
| 2453 | + if (!adev->ip_blocks[i].status.hw) |
|---|
| 1780 | 2454 | continue; |
|---|
| 1781 | 2455 | if (adev->ip_blocks[i].version->funcs->late_init) { |
|---|
| 1782 | 2456 | r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); |
|---|
| .. | .. |
|---|
| 1785 | 2459 | adev->ip_blocks[i].version->funcs->name, r); |
|---|
| 1786 | 2460 | return r; |
|---|
| 1787 | 2461 | } |
|---|
| 1788 | | - adev->ip_blocks[i].status.late_initialized = true; |
|---|
| 1789 | 2462 | } |
|---|
| 2463 | + adev->ip_blocks[i].status.late_initialized = true; |
|---|
| 1790 | 2464 | } |
|---|
| 1791 | 2465 | |
|---|
| 1792 | | - amdgpu_device_ip_late_set_cg_state(adev); |
|---|
| 1793 | | - amdgpu_device_ip_late_set_pg_state(adev); |
|---|
| 2466 | + amdgpu_ras_set_error_query_ready(adev, true); |
|---|
| 1794 | 2467 | |
|---|
| 1795 | | - queue_delayed_work(system_wq, &adev->late_init_work, |
|---|
| 1796 | | - msecs_to_jiffies(AMDGPU_RESUME_MS)); |
|---|
| 2468 | + amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); |
|---|
| 2469 | + amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); |
|---|
| 1797 | 2470 | |
|---|
| 1798 | 2471 | amdgpu_device_fill_reset_magic(adev); |
|---|
| 2472 | + |
|---|
| 2473 | + r = amdgpu_device_enable_mgpu_fan_boost(); |
|---|
| 2474 | + if (r) |
|---|
| 2475 | + DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); |
|---|
| 2476 | + |
|---|
| 2477 | + |
|---|
| 2478 | + if (adev->gmc.xgmi.num_physical_nodes > 1) { |
|---|
| 2479 | + mutex_lock(&mgpu_info.mutex); |
|---|
| 2480 | + |
|---|
| 2481 | + /* |
|---|
| 2482 | + * Reset device p-state to low as this was booted with high. |
|---|
| 2483 | + * |
|---|
| 2484 | + * This should be performed only after all devices from the same |
|---|
| 2485 | + * hive get initialized. |
|---|
| 2486 | + * |
|---|
| 2487 | + * However, the number of devices in a hive is not known in advance;
|---|
| 2488 | + * it is counted one by one as each device initializes.
|---|
| 2489 | + * |
|---|
| 2490 | + * So, we wait for all XGMI interlinked devices initialized. |
|---|
| 2491 | + * This may bring some delays as those devices may come from |
|---|
| 2492 | + * different hives. But that should be OK. |
|---|
| 2493 | + */ |
|---|
| 2494 | + if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { |
|---|
| 2495 | + for (i = 0; i < mgpu_info.num_gpu; i++) { |
|---|
| 2496 | + gpu_instance = &(mgpu_info.gpu_ins[i]); |
|---|
| 2497 | + if (gpu_instance->adev->flags & AMD_IS_APU) |
|---|
| 2498 | + continue; |
|---|
| 2499 | + |
|---|
| 2500 | + r = amdgpu_xgmi_set_pstate(gpu_instance->adev, |
|---|
| 2501 | + AMDGPU_XGMI_PSTATE_MIN); |
|---|
| 2502 | + if (r) { |
|---|
| 2503 | + DRM_ERROR("pstate setting failed (%d).\n", r); |
|---|
| 2504 | + break; |
|---|
| 2505 | + } |
|---|
| 2506 | + } |
|---|
| 2507 | + } |
|---|
| 2508 | + |
|---|
| 2509 | + mutex_unlock(&mgpu_info.mutex); |
|---|
| 2510 | + } |
|---|
| 1799 | 2511 | |
|---|
| 1800 | 2512 | return 0; |
|---|
| 1801 | 2513 | } |
|---|
| .. | .. |
|---|
| 1815 | 2527 | { |
|---|
| 1816 | 2528 | int i, r; |
|---|
| 1817 | 2529 | |
|---|
| 2530 | + if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) |
|---|
| 2531 | + amdgpu_virt_release_ras_err_handler_data(adev); |
|---|
| 2532 | + |
|---|
| 2533 | + amdgpu_ras_pre_fini(adev); |
|---|
| 2534 | + |
|---|
| 2535 | + if (adev->gmc.xgmi.num_physical_nodes > 1) |
|---|
| 2536 | + amdgpu_xgmi_remove_device(adev); |
|---|
| 2537 | + |
|---|
| 2538 | + amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); |
|---|
| 2539 | + amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); |
|---|
| 2540 | + |
|---|
| 1818 | 2541 | amdgpu_amdkfd_device_fini(adev); |
|---|
| 2542 | + |
|---|
| 1819 | 2543 | /* need to disable SMC first */ |
|---|
| 1820 | 2544 | for (i = 0; i < adev->num_ip_blocks; i++) { |
|---|
| 1821 | 2545 | if (!adev->ip_blocks[i].status.hw) |
|---|
| 1822 | 2546 | continue; |
|---|
| 1823 | | - if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC && |
|---|
| 1824 | | - adev->ip_blocks[i].version->funcs->set_clockgating_state) { |
|---|
| 1825 | | - /* ungate blocks before hw fini so that we can shutdown the blocks safely */ |
|---|
| 1826 | | - r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, |
|---|
| 1827 | | - AMD_CG_STATE_UNGATE); |
|---|
| 1828 | | - if (r) { |
|---|
| 1829 | | - DRM_ERROR("set_clockgating_state(ungate) of IP block <%s> failed %d\n", |
|---|
| 1830 | | - adev->ip_blocks[i].version->funcs->name, r); |
|---|
| 1831 | | - return r; |
|---|
| 1832 | | - } |
|---|
| 1833 | | - if (adev->powerplay.pp_funcs->set_powergating_by_smu) |
|---|
| 1834 | | - amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, false); |
|---|
| 2547 | + if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { |
|---|
| 1835 | 2548 | r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); |
|---|
| 1836 | 2549 | /* XXX handle errors */ |
|---|
| 1837 | 2550 | if (r) { |
|---|
| .. | .. |
|---|
| 1846 | 2559 | for (i = adev->num_ip_blocks - 1; i >= 0; i--) { |
|---|
| 1847 | 2560 | if (!adev->ip_blocks[i].status.hw) |
|---|
| 1848 | 2561 | continue; |
|---|
| 1849 | | - |
|---|
| 1850 | | - if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && |
|---|
| 1851 | | - adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && |
|---|
| 1852 | | - adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && |
|---|
| 1853 | | - adev->ip_blocks[i].version->funcs->set_clockgating_state) { |
|---|
| 1854 | | - /* ungate blocks before hw fini so that we can shutdown the blocks safely */ |
|---|
| 1855 | | - r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, |
|---|
| 1856 | | - AMD_CG_STATE_UNGATE); |
|---|
| 1857 | | - if (r) { |
|---|
| 1858 | | - DRM_ERROR("set_clockgating_state(ungate) of IP block <%s> failed %d\n", |
|---|
| 1859 | | - adev->ip_blocks[i].version->funcs->name, r); |
|---|
| 1860 | | - return r; |
|---|
| 1861 | | - } |
|---|
| 1862 | | - } |
|---|
| 1863 | 2562 | |
|---|
| 1864 | 2563 | r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); |
|---|
| 1865 | 2564 | /* XXX handle errors */ |
|---|
| .. | .. |
|---|
| 1877 | 2576 | continue; |
|---|
| 1878 | 2577 | |
|---|
| 1879 | 2578 | if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { |
|---|
| 1880 | | - amdgpu_free_static_csa(adev); |
|---|
| 2579 | + amdgpu_ucode_free_bo(adev); |
|---|
| 2580 | + amdgpu_free_static_csa(&adev->virt.csa_obj); |
|---|
| 1881 | 2581 | amdgpu_device_wb_fini(adev); |
|---|
| 1882 | 2582 | amdgpu_device_vram_scratch_fini(adev); |
|---|
| 2583 | + amdgpu_ib_pool_fini(adev); |
|---|
| 1883 | 2584 | } |
|---|
| 1884 | 2585 | |
|---|
| 1885 | 2586 | r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); |
|---|
| .. | .. |
|---|
| 1900 | 2601 | adev->ip_blocks[i].status.late_initialized = false; |
|---|
| 1901 | 2602 | } |
|---|
| 1902 | 2603 | |
|---|
| 2604 | + amdgpu_ras_fini(adev); |
|---|
| 2605 | + |
|---|
| 1903 | 2606 | if (amdgpu_sriov_vf(adev)) |
|---|
| 1904 | 2607 | if (amdgpu_virt_release_full_gpu(adev, false)) |
|---|
| 1905 | 2608 | DRM_ERROR("failed to release exclusive mode on fini\n"); |
|---|
| .. | .. |
|---|
| 1908 | 2611 | } |
|---|
| 1909 | 2612 | |
|---|
| 1910 | 2613 | /** |
|---|
| 1911 | | - * amdgpu_device_ip_late_init_func_handler - work handler for clockgating |
|---|
| 2614 | + * amdgpu_device_delayed_init_work_handler - work handler for IB tests |
|---|
| 1912 | 2615 | * |
|---|
| 1913 | | - * @work: work_struct |
|---|
| 1914 | | - * |
|---|
| 1915 | | - * Work handler for amdgpu_device_ip_late_set_cg_state. We put the |
|---|
| 1916 | | - * clockgating setup into a worker thread to speed up driver init and |
|---|
| 1917 | | - * resume from suspend. |
|---|
| 2616 | + * @work: work_struct. |
|---|
| 1918 | 2617 | */ |
|---|
| 1919 | | -static void amdgpu_device_ip_late_init_func_handler(struct work_struct *work) |
|---|
| 2618 | +static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) |
|---|
| 1920 | 2619 | { |
|---|
| 1921 | 2620 | struct amdgpu_device *adev = |
|---|
| 1922 | | - container_of(work, struct amdgpu_device, late_init_work.work); |
|---|
| 2621 | + container_of(work, struct amdgpu_device, delayed_init_work.work); |
|---|
| 1923 | 2622 | int r; |
|---|
| 1924 | 2623 | |
|---|
| 1925 | 2624 | r = amdgpu_ib_ring_tests(adev); |
|---|
| 1926 | 2625 | if (r) |
|---|
| 1927 | 2626 | DRM_ERROR("ib ring test failed (%d).\n", r); |
|---|
| 2627 | +} |
|---|
| 2628 | + |
|---|
| 2629 | +static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) |
|---|
| 2630 | +{ |
|---|
| 2631 | + struct amdgpu_device *adev = |
|---|
| 2632 | + container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); |
|---|
| 2633 | + |
|---|
| 2634 | + WARN_ON_ONCE(adev->gfx.gfx_off_state); |
|---|
| 2635 | + WARN_ON_ONCE(adev->gfx.gfx_off_req_count); |
|---|
| 2636 | + |
|---|
| 2637 | + if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) |
|---|
| 2638 | + adev->gfx.gfx_off_state = true; |
|---|
| 1928 | 2639 | } |
|---|
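This handler is the tail end of a request-count scheme: callers elsewhere (amdgpu_gfx_off_ctrl() in amdgpu_gfx.c, not part of this hunk) take and drop GFXOFF requests, and only when the count reaches zero is the delayed work scheduled, so short bursts of submissions never thrash GFXOFF. A userspace model of the counting side, offered as a sketch only:

	#include <stdbool.h>

	struct gfx_off {
		int req_count;		/* >0: someone needs GFX powered on */
		bool armed;		/* GFXOFF actually enabled */
	};

	static void gfx_off_ctrl(struct gfx_off *g, bool enable)
	{
		if (!enable) {
			g->req_count++;
			g->armed = false;	/* driver: cancel work, ungate GFX */
		} else if (--g->req_count == 0) {
			/* driver: schedule gfx_off_delay_work instead of
			 * arming immediately, giving back-to-back submissions
			 * a grace period before GFXOFF kicks in */
			g->armed = true;
		}
	}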
| 1929 | 2640 | |
|---|
| 1930 | 2641 | /** |
|---|
| .. | .. |
|---|
| 1942 | 2653 | { |
|---|
| 1943 | 2654 | int i, r; |
|---|
| 1944 | 2655 | |
|---|
| 1945 | | - if (amdgpu_sriov_vf(adev)) |
|---|
| 1946 | | - amdgpu_virt_request_full_gpu(adev, false); |
|---|
| 2656 | + amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); |
|---|
| 2657 | + amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); |
|---|
| 1947 | 2658 | |
|---|
| 1948 | 2659 | for (i = adev->num_ip_blocks - 1; i >= 0; i--) { |
|---|
| 1949 | 2660 | if (!adev->ip_blocks[i].status.valid) |
|---|
| 1950 | 2661 | continue; |
|---|
| 1951 | | - /* displays are handled separately */ |
|---|
| 1952 | | - if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) { |
|---|
| 1953 | | - /* ungate blocks so that suspend can properly shut them down */ |
|---|
| 1954 | | - if (adev->ip_blocks[i].version->funcs->set_clockgating_state) { |
|---|
| 1955 | | - r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, |
|---|
| 1956 | | - AMD_CG_STATE_UNGATE); |
|---|
| 1957 | | - if (r) { |
|---|
| 1958 | | - DRM_ERROR("set_clockgating_state(ungate) of IP block <%s> failed %d\n", |
|---|
| 1959 | | - adev->ip_blocks[i].version->funcs->name, r); |
|---|
| 1960 | | - } |
|---|
| 1961 | | - } |
|---|
| 1962 | | - /* XXX handle errors */ |
|---|
| 1963 | | - r = adev->ip_blocks[i].version->funcs->suspend(adev); |
|---|
| 1964 | | - /* XXX handle errors */ |
|---|
| 1965 | | - if (r) { |
|---|
| 1966 | | - DRM_ERROR("suspend of IP block <%s> failed %d\n", |
|---|
| 1967 | | - adev->ip_blocks[i].version->funcs->name, r); |
|---|
| 1968 | | - } |
|---|
| 1969 | | - } |
|---|
| 1970 | | - } |
|---|
| 1971 | 2662 | |
|---|
| 1972 | | - if (amdgpu_sriov_vf(adev)) |
|---|
| 1973 | | - amdgpu_virt_release_full_gpu(adev, false); |
|---|
| 2663 | + /* displays are handled separately */ |
|---|
| 2664 | + if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) |
|---|
| 2665 | + continue; |
|---|
| 2666 | + |
|---|
| 2667 | + /* XXX handle errors */ |
|---|
| 2668 | + r = adev->ip_blocks[i].version->funcs->suspend(adev); |
|---|
| 2669 | + /* XXX handle errors */ |
|---|
| 2670 | + if (r) { |
|---|
| 2671 | + DRM_ERROR("suspend of IP block <%s> failed %d\n", |
|---|
| 2672 | + adev->ip_blocks[i].version->funcs->name, r); |
|---|
| 2673 | + return r; |
|---|
| 2674 | + } |
|---|
| 2675 | + |
|---|
| 2676 | + adev->ip_blocks[i].status.hw = false; |
|---|
| 2677 | + } |
|---|
| 1974 | 2678 | |
|---|
| 1975 | 2679 | return 0; |
|---|
| 1976 | 2680 | } |
|---|
| .. | .. |
|---|
| 1990 | 2694 | { |
|---|
| 1991 | 2695 | int i, r; |
|---|
| 1992 | 2696 | |
|---|
| 1993 | | - if (amdgpu_sriov_vf(adev)) |
|---|
| 1994 | | - amdgpu_virt_request_full_gpu(adev, false); |
|---|
| 1995 | | - |
|---|
| 1996 | | - /* ungate SMC block first */ |
|---|
| 1997 | | - r = amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_SMC, |
|---|
| 1998 | | - AMD_CG_STATE_UNGATE); |
|---|
| 1999 | | - if (r) { |
|---|
| 2000 | | - DRM_ERROR("set_clockgating_state(ungate) SMC failed %d\n", r); |
|---|
| 2001 | | - } |
|---|
| 2002 | | - |
|---|
| 2003 | | - /* call smu to disable gfx off feature first when suspend */ |
|---|
| 2004 | | - if (adev->powerplay.pp_funcs->set_powergating_by_smu) |
|---|
| 2005 | | - amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, false); |
|---|
| 2006 | | - |
|---|
| 2007 | 2697 | for (i = adev->num_ip_blocks - 1; i >= 0; i--) { |
|---|
| 2008 | 2698 | if (!adev->ip_blocks[i].status.valid) |
|---|
| 2009 | 2699 | continue; |
|---|
| 2010 | 2700 | /* displays are handled in phase1 */ |
|---|
| 2011 | 2701 | if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) |
|---|
| 2012 | 2702 | continue; |
|---|
| 2013 | | - /* ungate blocks so that suspend can properly shut them down */ |
|---|
| 2014 | | - if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_SMC && |
|---|
| 2015 | | - adev->ip_blocks[i].version->funcs->set_clockgating_state) { |
|---|
| 2016 | | - r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, |
|---|
| 2017 | | - AMD_CG_STATE_UNGATE); |
|---|
| 2018 | | - if (r) { |
|---|
| 2019 | | - DRM_ERROR("set_clockgating_state(ungate) of IP block <%s> failed %d\n", |
|---|
| 2020 | | - adev->ip_blocks[i].version->funcs->name, r); |
|---|
| 2021 | | - } |
|---|
| 2703 | + /* PSP lost connection when err_event_athub occurs */ |
|---|
| 2704 | + if (amdgpu_ras_intr_triggered() && |
|---|
| 2705 | + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { |
|---|
| 2706 | + adev->ip_blocks[i].status.hw = false; |
|---|
| 2707 | + continue; |
|---|
| 2022 | 2708 | } |
|---|
| 2023 | 2709 | /* XXX handle errors */ |
|---|
| 2024 | 2710 | r = adev->ip_blocks[i].version->funcs->suspend(adev); |
|---|
| .. | .. |
|---|
| 2027 | 2713 | DRM_ERROR("suspend of IP block <%s> failed %d\n", |
|---|
| 2028 | 2714 | adev->ip_blocks[i].version->funcs->name, r); |
|---|
| 2029 | 2715 | } |
|---|
| 2716 | + adev->ip_blocks[i].status.hw = false; |
|---|
| 2717 | + /* handle putting the SMC in the appropriate state */ |
|---|
| 2718 | + if (!amdgpu_sriov_vf(adev)) {
|---|
| 2719 | + if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { |
|---|
| 2720 | + r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); |
|---|
| 2721 | + if (r) { |
|---|
| 2722 | + DRM_ERROR("SMC failed to set mp1 state %d, %d\n", |
|---|
| 2723 | + adev->mp1_state, r); |
|---|
| 2724 | + return r; |
|---|
| 2725 | + } |
|---|
| 2726 | + } |
|---|
| 2727 | + } |
|---|
| 2728 | + adev->ip_blocks[i].status.hw = false; |
|---|
| 2030 | 2729 | } |
|---|
| 2031 | | - |
|---|
| 2032 | | - if (amdgpu_sriov_vf(adev)) |
|---|
| 2033 | | - amdgpu_virt_release_full_gpu(adev, false); |
|---|
| 2034 | 2730 | |
|---|
| 2035 | 2731 | return 0; |
|---|
| 2036 | 2732 | } |
|---|
| .. | .. |
|---|
| 2050 | 2746 | { |
|---|
| 2051 | 2747 | int r; |
|---|
| 2052 | 2748 | |
|---|
| 2749 | + if (amdgpu_sriov_vf(adev)) |
|---|
| 2750 | + amdgpu_virt_request_full_gpu(adev, false); |
|---|
| 2751 | + |
|---|
| 2053 | 2752 | r = amdgpu_device_ip_suspend_phase1(adev); |
|---|
| 2054 | 2753 | if (r) |
|---|
| 2055 | 2754 | return r; |
|---|
| 2056 | 2755 | r = amdgpu_device_ip_suspend_phase2(adev); |
|---|
| 2756 | + |
|---|
| 2757 | + if (amdgpu_sriov_vf(adev)) |
|---|
| 2758 | + amdgpu_virt_release_full_gpu(adev, false); |
|---|
| 2057 | 2759 | |
|---|
| 2058 | 2760 | return r; |
|---|
| 2059 | 2761 | } |
|---|
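Suspend is thus split the same way resume is phased: displays go down first, then everything else in reverse init order, with PSP skipped when a RAS ATHUB interrupt already severed the PSP connection. A compact userspace model of the split (types and helpers are illustrative, not driver API):

	#include <stdbool.h>

	enum blk_type { BLK_DCE, BLK_PSP, BLK_GFX, BLK_SDMA };

	struct blk {
		enum blk_type type;
		bool valid, hw;
	};

	static void suspend_blk(struct blk *b)
	{
		b->hw = false;		/* block is powered down */
	}

	static void ip_suspend(struct blk *blks, int n, bool ras_intr)
	{
		/* phase 1: displays only, in reverse init order */
		for (int i = n - 1; i >= 0; i--)
			if (blks[i].valid && blks[i].type == BLK_DCE)
				suspend_blk(&blks[i]);

		/* phase 2: everything else, also in reverse init order */
		for (int i = n - 1; i >= 0; i--) {
			if (!blks[i].valid || blks[i].type == BLK_DCE)
				continue;
			if (ras_intr && blks[i].type == BLK_PSP) {
				blks[i].hw = false;	/* PSP link already lost */
				continue;
			}
			suspend_blk(&blks[i]);
		}
	}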
| .. | .. |
|---|
| 2073 | 2775 | int j; |
|---|
| 2074 | 2776 | struct amdgpu_ip_block *block; |
|---|
| 2075 | 2777 | |
|---|
| 2076 | | - for (j = 0; j < adev->num_ip_blocks; j++) { |
|---|
| 2077 | | - block = &adev->ip_blocks[j]; |
|---|
| 2778 | + block = &adev->ip_blocks[i]; |
|---|
| 2779 | + block->status.hw = false; |
|---|
| 2078 | 2780 | |
|---|
| 2079 | | - if (block->version->type != ip_order[i] || |
|---|
| 2781 | + for (j = 0; j < ARRAY_SIZE(ip_order); j++) { |
|---|
| 2782 | + |
|---|
| 2783 | + if (block->version->type != ip_order[j] || |
|---|
| 2080 | 2784 | !block->status.valid) |
|---|
| 2081 | 2785 | continue; |
|---|
| 2082 | 2786 | |
|---|
| 2083 | 2787 | r = block->version->funcs->hw_init(adev); |
|---|
| 2084 | | - DRM_INFO("RE-INIT: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); |
|---|
| 2788 | + DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); |
|---|
| 2085 | 2789 | if (r) |
|---|
| 2086 | 2790 | return r; |
|---|
| 2791 | + block->status.hw = true; |
|---|
| 2087 | 2792 | } |
|---|
| 2088 | 2793 | } |
|---|
| 2089 | 2794 | |
|---|
| .. | .. |
|---|
| 2100 | 2805 | AMD_IP_BLOCK_TYPE_GFX, |
|---|
| 2101 | 2806 | AMD_IP_BLOCK_TYPE_SDMA, |
|---|
| 2102 | 2807 | AMD_IP_BLOCK_TYPE_UVD, |
|---|
| 2103 | | - AMD_IP_BLOCK_TYPE_VCE |
|---|
| 2808 | + AMD_IP_BLOCK_TYPE_VCE, |
|---|
| 2809 | + AMD_IP_BLOCK_TYPE_VCN |
|---|
| 2104 | 2810 | }; |
|---|
| 2105 | 2811 | |
|---|
| 2106 | 2812 | for (i = 0; i < ARRAY_SIZE(ip_order); i++) { |
|---|
| .. | .. |
|---|
| 2111 | 2817 | block = &adev->ip_blocks[j]; |
|---|
| 2112 | 2818 | |
|---|
| 2113 | 2819 | if (block->version->type != ip_order[i] || |
|---|
| 2114 | | - !block->status.valid) |
|---|
| 2820 | + !block->status.valid || |
|---|
| 2821 | + block->status.hw) |
|---|
| 2115 | 2822 | continue; |
|---|
| 2116 | 2823 | |
|---|
| 2117 | | - r = block->version->funcs->hw_init(adev); |
|---|
| 2118 | | - DRM_INFO("RE-INIT: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); |
|---|
| 2824 | + if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) |
|---|
| 2825 | + r = block->version->funcs->resume(adev); |
|---|
| 2826 | + else |
|---|
| 2827 | + r = block->version->funcs->hw_init(adev); |
|---|
| 2828 | + |
|---|
| 2829 | + DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); |
|---|
| 2119 | 2830 | if (r) |
|---|
| 2120 | 2831 | return r; |
|---|
| 2832 | + block->status.hw = true; |
|---|
| 2121 | 2833 | } |
|---|
| 2122 | 2834 | } |
|---|
| 2123 | 2835 | |
|---|
| .. | .. |
|---|
| 2141 | 2853 | int i, r; |
|---|
| 2142 | 2854 | |
|---|
| 2143 | 2855 | for (i = 0; i < adev->num_ip_blocks; i++) { |
|---|
| 2144 | | - if (!adev->ip_blocks[i].status.valid) |
|---|
| 2856 | + if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) |
|---|
| 2145 | 2857 | continue; |
|---|
| 2146 | 2858 | if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || |
|---|
| 2147 | 2859 | adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || |
|---|
| 2148 | 2860 | adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { |
|---|
| 2861 | + |
|---|
| 2149 | 2862 | r = adev->ip_blocks[i].version->funcs->resume(adev); |
|---|
| 2150 | 2863 | if (r) { |
|---|
| 2151 | 2864 | DRM_ERROR("resume of IP block <%s> failed %d\n", |
|---|
| 2152 | 2865 | adev->ip_blocks[i].version->funcs->name, r); |
|---|
| 2153 | 2866 | return r; |
|---|
| 2154 | 2867 | } |
|---|
| 2868 | + adev->ip_blocks[i].status.hw = true; |
|---|
| 2155 | 2869 | } |
|---|
| 2156 | 2870 | } |
|---|
| 2157 | 2871 | |
|---|
| .. | .. |
|---|
| 2176 | 2890 | int i, r; |
|---|
| 2177 | 2891 | |
|---|
| 2178 | 2892 | for (i = 0; i < adev->num_ip_blocks; i++) { |
|---|
| 2179 | | - if (!adev->ip_blocks[i].status.valid) |
|---|
| 2893 | + if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) |
|---|
| 2180 | 2894 | continue; |
|---|
| 2181 | 2895 | if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || |
|---|
| 2182 | 2896 | adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || |
|---|
| 2183 | | - adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) |
|---|
| 2897 | + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || |
|---|
| 2898 | + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) |
|---|
| 2184 | 2899 | continue; |
|---|
| 2185 | 2900 | r = adev->ip_blocks[i].version->funcs->resume(adev); |
|---|
| 2186 | 2901 | if (r) { |
|---|
| .. | .. |
|---|
| 2188 | 2903 | adev->ip_blocks[i].version->funcs->name, r); |
|---|
| 2189 | 2904 | return r; |
|---|
| 2190 | 2905 | } |
|---|
| 2906 | + adev->ip_blocks[i].status.hw = true; |
|---|
| 2191 | 2907 | } |
|---|
| 2192 | 2908 | |
|---|
| 2193 | 2909 | return 0; |
|---|
| .. | .. |
|---|
| 2209 | 2925 | { |
|---|
| 2210 | 2926 | int r; |
|---|
| 2211 | 2927 | |
|---|
| 2928 | + r = amdgpu_amdkfd_resume_iommu(adev); |
|---|
| 2929 | + if (r) |
|---|
| 2930 | + return r; |
|---|
| 2931 | + |
|---|
| 2212 | 2932 | r = amdgpu_device_ip_resume_phase1(adev); |
|---|
| 2213 | 2933 | if (r) |
|---|
| 2214 | 2934 | return r; |
|---|
| 2935 | + |
|---|
| 2936 | + r = amdgpu_device_fw_loading(adev); |
|---|
| 2937 | + if (r) |
|---|
| 2938 | + return r; |
|---|
| 2939 | + |
|---|
| 2215 | 2940 | r = amdgpu_device_ip_resume_phase2(adev); |
|---|
| 2216 | 2941 | |
|---|
| 2217 | 2942 | return r; |
|---|
| .. | .. |
|---|
| 2252 | 2977 | { |
|---|
| 2253 | 2978 | switch (asic_type) { |
|---|
| 2254 | 2979 | #if defined(CONFIG_DRM_AMD_DC) |
|---|
| 2980 | +#if defined(CONFIG_DRM_AMD_DC_SI) |
|---|
| 2981 | + case CHIP_TAHITI: |
|---|
| 2982 | + case CHIP_PITCAIRN: |
|---|
| 2983 | + case CHIP_VERDE: |
|---|
| 2984 | + case CHIP_OLAND: |
|---|
| 2985 | +#endif |
|---|
| 2255 | 2986 | case CHIP_BONAIRE: |
|---|
| 2256 | 2987 | case CHIP_KAVERI: |
|---|
| 2257 | 2988 | case CHIP_KABINI: |
|---|
| .. | .. |
|---|
| 2276 | 3007 | case CHIP_VEGA10: |
|---|
| 2277 | 3008 | case CHIP_VEGA12: |
|---|
| 2278 | 3009 | case CHIP_VEGA20: |
|---|
| 2279 | | -#if defined(CONFIG_DRM_AMD_DC_DCN1_0) |
|---|
| 3010 | +#if defined(CONFIG_DRM_AMD_DC_DCN) |
|---|
| 2280 | 3011 | case CHIP_RAVEN: |
|---|
| 3012 | + case CHIP_NAVI10: |
|---|
| 3013 | + case CHIP_NAVI14: |
|---|
| 3014 | + case CHIP_NAVI12: |
|---|
| 3015 | + case CHIP_RENOIR: |
|---|
| 3016 | +#endif |
|---|
| 3017 | +#if defined(CONFIG_DRM_AMD_DC_DCN3_0) |
|---|
| 3018 | + case CHIP_SIENNA_CICHLID: |
|---|
| 3019 | + case CHIP_NAVY_FLOUNDER: |
|---|
| 2281 | 3020 | #endif |
|---|
| 2282 | 3021 | return amdgpu_dc != 0; |
|---|
| 2283 | 3022 | #endif |
|---|
| 2284 | 3023 | default: |
|---|
| 3024 | + if (amdgpu_dc > 0) |
|---|
| 3025 | + DRM_INFO_ONCE("Display Core has been requested via kernel parameter " |
|---|
| 3026 | + "but isn't supported by ASIC, ignoring\n"); |
|---|
| 2285 | 3027 | return false; |
|---|
| 2286 | 3028 | } |
|---|
| 2287 | 3029 | } |
|---|
| .. | .. |
|---|
| 2289 | 3031 | /** |
|---|
| 2290 | 3032 | * amdgpu_device_has_dc_support - check if dc is supported |
|---|
| 2291 | 3033 | * |
|---|
| 2292 | | - * @adev: amdgpu_device_pointer |
|---|
| 3034 | + * @adev: amdgpu_device pointer |
|---|
| 2293 | 3035 | * |
|---|
| 2294 | 3036 | * Returns true for supported, false for not supported |
|---|
| 2295 | 3037 | */ |
|---|
| 2296 | 3038 | bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) |
|---|
| 2297 | 3039 | { |
|---|
| 2298 | | - if (amdgpu_sriov_vf(adev)) |
|---|
| 3040 | + if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display) |
|---|
| 2299 | 3041 | return false; |
|---|
| 2300 | 3042 | |
|---|
| 2301 | 3043 | return amdgpu_device_asic_has_dc_support(adev->asic_type); |
|---|
| 2302 | 3044 | } |
|---|
| 2303 | 3045 | |
|---|
| 3046 | + |
|---|
| 3047 | +static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) |
|---|
| 3048 | +{ |
|---|
| 3049 | + struct amdgpu_device *adev = |
|---|
| 3050 | + container_of(__work, struct amdgpu_device, xgmi_reset_work); |
|---|
| 3051 | + struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); |
|---|
| 3052 | + |
|---|
| 3053 | + /* It's a bug to not have a hive within this function */ |
|---|
| 3054 | + if (WARN_ON(!hive)) |
|---|
| 3055 | + return; |
|---|
| 3056 | + |
|---|
| 3057 | + /* |
|---|
| 3058 | + * Use task barrier to synchronize all xgmi reset works across the |
|---|
| 3059 | + * hive. task_barrier_enter and task_barrier_exit will block |
|---|
| 3060 | + * until all the threads running the xgmi reset works reach |
|---|
| 3061 | + * those points. task_barrier_full will do both blocks. |
|---|
| 3062 | + */ |
|---|
| 3063 | + if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { |
|---|
| 3064 | + |
|---|
| 3065 | + task_barrier_enter(&hive->tb); |
|---|
| 3066 | + adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); |
|---|
| 3067 | + |
|---|
| 3068 | + if (adev->asic_reset_res) |
|---|
| 3069 | + goto fail; |
|---|
| 3070 | + |
|---|
| 3071 | + task_barrier_exit(&hive->tb); |
|---|
| 3072 | + adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); |
|---|
| 3073 | + |
|---|
| 3074 | + if (adev->asic_reset_res) |
|---|
| 3075 | + goto fail; |
|---|
| 3076 | + |
|---|
| 3077 | + if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count) |
|---|
| 3078 | + adev->mmhub.funcs->reset_ras_error_count(adev); |
|---|
| 3079 | + } else { |
|---|
| 3080 | + |
|---|
| 3081 | + task_barrier_full(&hive->tb); |
|---|
| 3082 | + adev->asic_reset_res = amdgpu_asic_reset(adev); |
|---|
| 3083 | + } |
|---|
| 3084 | + |
|---|
| 3085 | +fail: |
|---|
| 3086 | + if (adev->asic_reset_res) |
|---|
| 3087 | + DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", |
|---|
| 3088 | + adev->asic_reset_res, adev_to_drm(adev)->unique); |
|---|
| 3089 | + amdgpu_put_xgmi_hive(hive); |
|---|
| 3090 | +} |
|---|
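The task barrier gives the hive-wide ordering the comment describes: no device may leave BACO until every device has entered it. A userspace model using a pthread barrier in place of task_barrier_enter/exit (the device count and names are made up):

	#include <pthread.h>
	#include <stdio.h>

	#define HIVE_SIZE 2

	static pthread_barrier_t tb;

	static void *xgmi_reset_thread(void *arg)
	{
		long dev = (long)arg;

		pthread_barrier_wait(&tb);	/* ~ task_barrier_enter */
		printf("dev %ld: baco enter\n", dev);	/* all start together */
		pthread_barrier_wait(&tb);	/* ~ task_barrier_exit: all are
						 * in BACO before any exits */
		printf("dev %ld: baco exit\n", dev);
		return NULL;
	}

	int main(void)
	{
		pthread_t t[HIVE_SIZE];

		pthread_barrier_init(&tb, NULL, HIVE_SIZE);
		for (long i = 0; i < HIVE_SIZE; i++)
			pthread_create(&t[i], NULL, xgmi_reset_thread, (void *)i);
		for (long i = 0; i < HIVE_SIZE; i++)
			pthread_join(t[i], NULL);
		pthread_barrier_destroy(&tb);
		return 0;
	}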
| 3091 | + |
|---|
| 3092 | +static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) |
|---|
| 3093 | +{ |
|---|
| 3094 | + char *input = amdgpu_lockup_timeout; |
|---|
| 3095 | + char *timeout_setting = NULL; |
|---|
| 3096 | + int index = 0; |
|---|
| 3097 | + long timeout; |
|---|
| 3098 | + int ret = 0; |
|---|
| 3099 | + |
|---|
| 3100 | + /* |
|---|
| 3101 | + * By default the timeout for non-compute jobs is 10000,
|---|
| 3102 | + * and there is no timeout enforced on compute jobs.
|---|
| 3103 | + * In SR-IOV or passthrough mode, the timeout for compute
|---|
| 3104 | + * jobs is 60000 by default.
|---|
| 3105 | + */ |
|---|
| 3106 | + adev->gfx_timeout = msecs_to_jiffies(10000); |
|---|
| 3107 | + adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; |
|---|
| 3108 | + if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) |
|---|
| 3109 | + adev->compute_timeout = msecs_to_jiffies(60000); |
|---|
| 3110 | + else |
|---|
| 3111 | + adev->compute_timeout = MAX_SCHEDULE_TIMEOUT; |
|---|
| 3112 | + |
|---|
| 3113 | + if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { |
|---|
| 3114 | + while ((timeout_setting = strsep(&input, ",")) && |
|---|
| 3115 | + strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { |
|---|
| 3116 | + ret = kstrtol(timeout_setting, 0, &timeout); |
|---|
| 3117 | + if (ret) |
|---|
| 3118 | + return ret; |
|---|
| 3119 | + |
|---|
| 3120 | + if (timeout == 0) { |
|---|
| 3121 | + index++; |
|---|
| 3122 | + continue; |
|---|
| 3123 | + } else if (timeout < 0) { |
|---|
| 3124 | + timeout = MAX_SCHEDULE_TIMEOUT; |
|---|
| 3125 | + } else { |
|---|
| 3126 | + timeout = msecs_to_jiffies(timeout); |
|---|
| 3127 | + } |
|---|
| 3128 | + |
|---|
| 3129 | + switch (index++) { |
|---|
| 3130 | + case 0: |
|---|
| 3131 | + adev->gfx_timeout = timeout; |
|---|
| 3132 | + break; |
|---|
| 3133 | + case 1: |
|---|
| 3134 | + adev->compute_timeout = timeout; |
|---|
| 3135 | + break; |
|---|
| 3136 | + case 2: |
|---|
| 3137 | + adev->sdma_timeout = timeout; |
|---|
| 3138 | + break; |
|---|
| 3139 | + case 3: |
|---|
| 3140 | + adev->video_timeout = timeout; |
|---|
| 3141 | + break; |
|---|
| 3142 | + default: |
|---|
| 3143 | + break; |
|---|
| 3144 | + } |
|---|
| 3145 | + } |
|---|
| 3146 | + /* |
|---|
| 3147 | + * There is only one value specified and |
|---|
| 3148 | + * it should apply to all non-compute jobs. |
|---|
| 3149 | + */ |
|---|
| 3150 | + if (index == 1) { |
|---|
| 3151 | + adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; |
|---|
| 3152 | + if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) |
|---|
| 3153 | + adev->compute_timeout = adev->gfx_timeout; |
|---|
| 3154 | + } |
|---|
| 3155 | + } |
|---|
| 3156 | + |
|---|
| 3157 | + return ret; |
|---|
| 3158 | +} |
|---|
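The parser above maps up to four comma-separated values positionally onto gfx, compute, sdma and video; 0 keeps the per-queue default, a negative value means no timeout, and a single value fans out to all non-compute queues. A standalone model of the same parse (millisecond values, jiffies conversion omitted):

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	int main(void)
	{
		char buf[] = "10000,0,-1,5000";	/* example parameter string */
		char *input = buf, *tok;
		long t[4] = { 10000, -1, 10000, 10000 };	/* defaults */
		int index = 0;

		while ((tok = strsep(&input, ",")) && *tok && index < 4) {
			long v = strtol(tok, NULL, 0);

			if (v != 0)		/* 0 keeps the default */
				t[index] = v;	/* <0 is treated as unlimited */
			index++;
		}
		if (index == 1)		/* one value covers gfx/sdma/video */
			t[2] = t[3] = t[0];

		printf("gfx=%ld compute=%ld sdma=%ld video=%ld\n",
		       t[0], t[1], t[2], t[3]);
		return 0;
	}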
| 3159 | + |
|---|
| 3160 | +static const struct attribute *amdgpu_dev_attributes[] = { |
|---|
| 3161 | + &dev_attr_product_name.attr, |
|---|
| 3162 | + &dev_attr_product_number.attr, |
|---|
| 3163 | + &dev_attr_serial_number.attr, |
|---|
| 3164 | + &dev_attr_pcie_replay_count.attr, |
|---|
| 3165 | + NULL |
|---|
| 3166 | +}; |
|---|
| 3167 | + |
|---|
| 3168 | + |
|---|
| 2304 | 3169 | /** |
|---|
| 2305 | 3170 | * amdgpu_device_init - initialize the driver |
|---|
| 2306 | 3171 | * |
|---|
| 2307 | 3172 | * @adev: amdgpu_device pointer |
|---|
| 2308 | | - * @ddev: drm dev pointer |
|---|
| 2309 | | - * @pdev: pci dev pointer |
|---|
| 2310 | 3173 | * @flags: driver flags |
|---|
| 2311 | 3174 | * |
|---|
| 2312 | 3175 | * Initializes the driver info and hw (all asics). |
|---|
| .. | .. |
|---|
| 2314 | 3177 | * Called at driver startup. |
|---|
| 2315 | 3178 | */ |
|---|
| 2316 | 3179 | int amdgpu_device_init(struct amdgpu_device *adev, |
|---|
| 2317 | | - struct drm_device *ddev, |
|---|
| 2318 | | - struct pci_dev *pdev, |
|---|
| 2319 | 3180 | uint32_t flags) |
|---|
| 2320 | 3181 | { |
|---|
| 3182 | + struct drm_device *ddev = adev_to_drm(adev); |
|---|
| 3183 | + struct pci_dev *pdev = adev->pdev; |
|---|
| 2321 | 3184 | int r, i; |
|---|
| 2322 | | - bool runtime = false; |
|---|
| 3185 | + bool boco = false; |
|---|
| 2323 | 3186 | u32 max_MBps; |
|---|
| 2324 | 3187 | |
|---|
| 2325 | 3188 | adev->shutdown = false; |
|---|
| 2326 | | - adev->dev = &pdev->dev; |
|---|
| 2327 | | - adev->ddev = ddev; |
|---|
| 2328 | | - adev->pdev = pdev; |
|---|
| 2329 | 3189 | adev->flags = flags; |
|---|
| 2330 | | - adev->asic_type = flags & AMD_ASIC_MASK; |
|---|
| 3190 | + |
|---|
| 3191 | + if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) |
|---|
| 3192 | + adev->asic_type = amdgpu_force_asic_type; |
|---|
| 3193 | + else |
|---|
| 3194 | + adev->asic_type = flags & AMD_ASIC_MASK; |
|---|
| 3195 | + |
|---|
| 2331 | 3196 | adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; |
|---|
| 2332 | 3197 | if (amdgpu_emu_mode == 1) |
|---|
| 2333 | | - adev->usec_timeout *= 2; |
|---|
| 3198 | + adev->usec_timeout *= 10; |
|---|
| 2334 | 3199 | adev->gmc.gart_size = 512 * 1024 * 1024; |
|---|
| 2335 | 3200 | adev->accel_working = false; |
|---|
| 2336 | 3201 | adev->num_rings = 0; |
|---|
| 2337 | 3202 | adev->mman.buffer_funcs = NULL; |
|---|
| 2338 | 3203 | adev->mman.buffer_funcs_ring = NULL; |
|---|
| 2339 | 3204 | adev->vm_manager.vm_pte_funcs = NULL; |
|---|
| 2340 | | - adev->vm_manager.vm_pte_num_rings = 0; |
|---|
| 3205 | + adev->vm_manager.vm_pte_num_scheds = 0; |
|---|
| 2341 | 3206 | adev->gmc.gmc_funcs = NULL; |
|---|
| 2342 | 3207 | adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); |
|---|
| 2343 | 3208 | bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); |
|---|
| .. | .. |
|---|
| 2348 | 3213 | adev->pcie_wreg = &amdgpu_invalid_wreg; |
|---|
| 2349 | 3214 | adev->pciep_rreg = &amdgpu_invalid_rreg; |
|---|
| 2350 | 3215 | adev->pciep_wreg = &amdgpu_invalid_wreg; |
|---|
| 3216 | + adev->pcie_rreg64 = &amdgpu_invalid_rreg64; |
|---|
| 3217 | + adev->pcie_wreg64 = &amdgpu_invalid_wreg64; |
|---|
| 2351 | 3218 | adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; |
|---|
| 2352 | 3219 | adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; |
|---|
| 2353 | 3220 | adev->didt_rreg = &amdgpu_invalid_rreg; |
|---|
| .. | .. |
|---|
| 2369 | 3236 | mutex_init(&adev->gfx.gpu_clock_mutex); |
|---|
| 2370 | 3237 | mutex_init(&adev->srbm_mutex); |
|---|
| 2371 | 3238 | mutex_init(&adev->gfx.pipe_reserve_mutex); |
|---|
| 3239 | + mutex_init(&adev->gfx.gfx_off_mutex); |
|---|
| 2372 | 3240 | mutex_init(&adev->grbm_idx_mutex); |
|---|
| 2373 | 3241 | mutex_init(&adev->mn_lock); |
|---|
| 2374 | 3242 | mutex_init(&adev->virt.vf_errors.lock); |
|---|
| 2375 | 3243 | hash_init(adev->mn_hash); |
|---|
| 2376 | | - mutex_init(&adev->lock_reset); |
|---|
| 3244 | + atomic_set(&adev->in_gpu_reset, 0); |
|---|
| 3245 | + init_rwsem(&adev->reset_sem); |
|---|
| 3246 | + mutex_init(&adev->psp.mutex); |
|---|
| 3247 | + mutex_init(&adev->notifier_lock); |
|---|
| 2377 | 3248 | |
|---|
| 2378 | | - amdgpu_device_check_arguments(adev); |
|---|
| 3249 | + r = amdgpu_device_check_arguments(adev); |
|---|
| 3250 | + if (r) |
|---|
| 3251 | + return r; |
|---|
| 2379 | 3252 | |
|---|
| 2380 | 3253 | spin_lock_init(&adev->mmio_idx_lock); |
|---|
| 2381 | 3254 | spin_lock_init(&adev->smc_idx_lock); |
|---|
| .. | .. |
|---|
| 2390 | 3263 | INIT_LIST_HEAD(&adev->shadow_list); |
|---|
| 2391 | 3264 | mutex_init(&adev->shadow_list_lock); |
|---|
| 2392 | 3265 | |
|---|
| 2393 | | - INIT_LIST_HEAD(&adev->ring_lru_list); |
|---|
| 2394 | | - spin_lock_init(&adev->ring_lru_list_lock); |
|---|
| 3266 | + INIT_DELAYED_WORK(&adev->delayed_init_work, |
|---|
| 3267 | + amdgpu_device_delayed_init_work_handler); |
|---|
| 3268 | + INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, |
|---|
| 3269 | + amdgpu_device_delay_enable_gfx_off); |
|---|
| 2395 | 3270 | |
|---|
| 2396 | | - INIT_DELAYED_WORK(&adev->late_init_work, |
|---|
| 2397 | | - amdgpu_device_ip_late_init_func_handler); |
|---|
| 3271 | + INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); |
|---|
| 2398 | 3272 | |
|---|
| 2399 | | - adev->pm.ac_power = power_supply_is_system_supplied() > 0 ? true : false; |
|---|
| 3273 | + adev->gfx.gfx_off_req_count = 1; |
|---|
| 3274 | + adev->pm.ac_power = power_supply_is_system_supplied() > 0; |
|---|
| 3275 | + |
|---|
| 3276 | + atomic_set(&adev->throttling_logging_enabled, 1); |
|---|
| 3277 | + /* |
|---|
| 3278 | + * If throttling continues, logging will be performed every minute |
|---|
| 3279 | + * to avoid log flooding. "-1" is subtracted since the thermal |
|---|
| 3280 | + * throttling interrupt comes every second. Thus, the total logging |
|---|
| 3281 | + * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
|---|
| 3282 | + * for the throttling interrupt) = 60 seconds.
|---|
| 3283 | + */ |
|---|
| 3284 | + ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); |
|---|
| 3285 | + ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); |
|---|
| 2400 | 3286 | |
|---|
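The consumer of this state sits in the thermal-interrupt path rather than in this hunk; the sketch below shows the intended shape, where __ratelimit() admits at most one message per 59-second window (the exact call site and message text are assumptions):

	/* Hypothetical consumer of throttling_logging_rs; one line per
	 * window reaches the log, the rest are dropped and summarized. */
	if (__ratelimit(&adev->throttling_logging_rs))
		dev_warn(adev->dev, "GPU thermal throttling detected\n");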
| 2401 | 3287 | /* Registers mapping */ |
|---|
| 2402 | 3288 | /* TODO: block userspace mapping of io register */ |
|---|
| .. | .. |
|---|
| 2415 | 3301 | DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); |
|---|
| 2416 | 3302 | DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); |
|---|
| 2417 | 3303 | |
|---|
| 2418 | | - /* doorbell bar mapping */ |
|---|
| 2419 | | - amdgpu_device_doorbell_init(adev); |
|---|
| 2420 | | - |
|---|
| 2421 | 3304 | /* io port mapping */ |
|---|
| 2422 | 3305 | for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) { |
|---|
| 2423 | 3306 | if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) { |
|---|
| .. | .. |
|---|
| 2429 | 3312 | if (adev->rio_mem == NULL) |
|---|
| 2430 | 3313 | DRM_INFO("PCI I/O BAR is not found.\n"); |
|---|
| 2431 | 3314 | |
|---|
| 3315 | + /* enable PCIE atomic ops */ |
|---|
| 3316 | + r = pci_enable_atomic_ops_to_root(adev->pdev, |
|---|
| 3317 | + PCI_EXP_DEVCAP2_ATOMIC_COMP32 | |
|---|
| 3318 | + PCI_EXP_DEVCAP2_ATOMIC_COMP64); |
|---|
| 3319 | + if (r) { |
|---|
| 3320 | + adev->have_atomics_support = false; |
|---|
| 3321 | + DRM_INFO("PCIE atomic ops is not supported\n"); |
|---|
| 3322 | + } else { |
|---|
| 3323 | + adev->have_atomics_support = true; |
|---|
| 3324 | + } |
|---|
| 3325 | + |
|---|
| 2432 | 3326 | amdgpu_device_get_pcie_info(adev); |
|---|
| 3327 | + |
|---|
| 3328 | + if (amdgpu_mcbp) |
|---|
| 3329 | + DRM_INFO("MCBP is enabled\n"); |
|---|
| 3330 | + |
|---|
| 3331 | + if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10) |
|---|
| 3332 | + adev->enable_mes = true; |
|---|
| 3333 | + |
|---|
| 3334 | + /* detect hw virtualization here */ |
|---|
| 3335 | + amdgpu_detect_virtualization(adev); |
|---|
| 3336 | + |
|---|
| 3337 | + r = amdgpu_device_get_job_timeout_settings(adev); |
|---|
| 3338 | + if (r) { |
|---|
| 3339 | + dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); |
|---|
| 3340 | + return r; |
|---|
| 3341 | + } |
|---|
| 2433 | 3342 | |
|---|
| 2434 | 3343 | /* early init functions */ |
|---|
| 2435 | 3344 | r = amdgpu_device_ip_early_init(adev); |
|---|
| 2436 | 3345 | if (r) |
|---|
| 2437 | 3346 | return r; |
|---|
| 2438 | 3347 | |
|---|
| 3348 | + /* doorbell bar mapping and doorbell index init*/ |
|---|
| 3349 | + amdgpu_device_doorbell_init(adev); |
|---|
| 3350 | + |
|---|
| 2439 | 3351 | /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ |
|---|
| 2440 | 3352 | /* this will fail for cards that aren't VGA class devices, just |
|---|
| 2441 | 3353 | * ignore it */ |
|---|
| 2442 | 3354 | vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode); |
|---|
| 2443 | 3355 | |
|---|
| 2444 | | - if (amdgpu_device_is_px(ddev)) |
|---|
| 2445 | | - runtime = true; |
|---|
| 2446 | | - if (!pci_is_thunderbolt_attached(adev->pdev)) |
|---|
| 3356 | + if (amdgpu_device_supports_boco(ddev)) |
|---|
| 3357 | + boco = true; |
|---|
| 3358 | + if (amdgpu_has_atpx() && |
|---|
| 3359 | + (amdgpu_is_atpx_hybrid() || |
|---|
| 3360 | + amdgpu_has_atpx_dgpu_power_cntl()) && |
|---|
| 3361 | + !pci_is_thunderbolt_attached(adev->pdev)) |
|---|
| 2447 | 3362 | vga_switcheroo_register_client(adev->pdev, |
|---|
| 2448 | | - &amdgpu_switcheroo_ops, runtime); |
|---|
| 2449 | | - if (runtime) |
|---|
| 3363 | + &amdgpu_switcheroo_ops, boco); |
|---|
| 3364 | + if (boco) |
|---|
| 2450 | 3365 | vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); |
|---|
| 2451 | 3366 | |
|---|
| 2452 | 3367 | if (amdgpu_emu_mode == 1) { |
|---|
| .. | .. |
|---|
| 2455 | 3370 | goto fence_driver_init; |
|---|
| 2456 | 3371 | } |
|---|
| 2457 | 3372 | |
|---|
| 2458 | | - /* Read BIOS */ |
|---|
| 2459 | | - if (!amdgpu_get_bios(adev)) { |
|---|
| 2460 | | - r = -EINVAL; |
|---|
| 2461 | | - goto failed; |
|---|
| 2462 | | - } |
|---|
| 2463 | | - |
|---|
| 2464 | | - r = amdgpu_atombios_init(adev); |
|---|
| 2465 | | - if (r) { |
|---|
| 2466 | | - dev_err(adev->dev, "amdgpu_atombios_init failed\n"); |
|---|
| 2467 | | - amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); |
|---|
| 2468 | | - goto failed; |
|---|
| 2469 | | - } |
|---|
| 2470 | | - |
|---|
| 2471 | 3373 | /* detect if we are with an SRIOV vbios */ |
|---|
| 2472 | 3374 | amdgpu_device_detect_sriov_bios(adev); |
|---|
| 3375 | + |
|---|
| 3376 | + /* check if we need to reset the asic |
|---|
| 3377 | + * E.g., driver was not cleanly unloaded previously, etc. |
|---|
| 3378 | + */ |
|---|
| 3379 | + if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { |
|---|
| 3380 | + r = amdgpu_asic_reset(adev); |
|---|
| 3381 | + if (r) { |
|---|
| 3382 | + dev_err(adev->dev, "asic reset on init failed\n"); |
|---|
| 3383 | + goto failed; |
|---|
| 3384 | + } |
|---|
| 3385 | + } |
|---|
| 3386 | + |
|---|
| 3387 | + pci_enable_pcie_error_reporting(adev->ddev.pdev); |
|---|
| 2473 | 3388 | |
|---|
| 2474 | 3389 | /* Post card if necessary */ |
|---|
| 2475 | 3390 | if (amdgpu_device_need_post(adev)) { |
|---|
| .. | .. |
|---|
| 2479 | 3394 | goto failed; |
|---|
| 2480 | 3395 | } |
|---|
| 2481 | 3396 | DRM_INFO("GPU posting now...\n"); |
|---|
| 2482 | | - r = amdgpu_atom_asic_init(adev->mode_info.atom_context); |
|---|
| 3397 | + r = amdgpu_device_asic_init(adev); |
|---|
| 2483 | 3398 | if (r) { |
|---|
| 2484 | 3399 | dev_err(adev->dev, "gpu post error!\n"); |
|---|
| 2485 | 3400 | goto failed; |
|---|
| .. | .. |
|---|
| 2517 | 3432 | } |
|---|
| 2518 | 3433 | |
|---|
| 2519 | 3434 | /* init the mode config */ |
|---|
| 2520 | | - drm_mode_config_init(adev->ddev); |
|---|
| 3435 | + drm_mode_config_init(adev_to_drm(adev)); |
|---|
| 2521 | 3436 | |
|---|
| 2522 | 3437 | r = amdgpu_device_ip_init(adev); |
|---|
| 2523 | 3438 | if (r) { |
|---|
| .. | .. |
|---|
| 2538 | 3453 | goto failed; |
|---|
| 2539 | 3454 | } |
|---|
| 2540 | 3455 | |
|---|
| 3456 | + dev_info(adev->dev, |
|---|
| 3457 | + "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", |
|---|
| 3458 | + adev->gfx.config.max_shader_engines, |
|---|
| 3459 | + adev->gfx.config.max_sh_per_se, |
|---|
| 3460 | + adev->gfx.config.max_cu_per_sh, |
|---|
| 3461 | + adev->gfx.cu_info.number); |
|---|
| 3462 | + |
|---|
| 2541 | 3463 | adev->accel_working = true; |
|---|
| 2542 | 3464 | |
|---|
| 2543 | 3465 | amdgpu_vm_check_compute_bug(adev); |
|---|
| .. | .. |
|---|
| 2550 | 3472 | /* Get a log2 for easy divisions. */ |
|---|
| 2551 | 3473 | adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); |
|---|
| 2552 | 3474 | |
|---|
| 2553 | | - r = amdgpu_ib_pool_init(adev); |
|---|
| 2554 | | - if (r) { |
|---|
| 2555 | | - dev_err(adev->dev, "IB initialization failed (%d).\n", r); |
|---|
| 2556 | | - amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); |
|---|
| 2557 | | - goto failed; |
|---|
| 2558 | | - } |
|---|
| 2559 | | - |
|---|
| 2560 | 3475 | amdgpu_fbdev_init(adev); |
|---|
| 2561 | 3476 | |
|---|
| 2562 | 3477 | r = amdgpu_pm_sysfs_init(adev); |
|---|
| 2563 | | - if (r) |
|---|
| 3478 | + if (r) { |
|---|
| 3479 | + adev->pm_sysfs_en = false; |
|---|
| 2564 | 3480 | DRM_ERROR("registering pm debugfs failed (%d).\n", r); |
|---|
| 3481 | + } else |
|---|
| 3482 | + adev->pm_sysfs_en = true; |
|---|
| 2565 | 3483 | |
|---|
| 2566 | | - r = amdgpu_debugfs_gem_init(adev); |
|---|
| 2567 | | - if (r) |
|---|
| 2568 | | - DRM_ERROR("registering gem debugfs failed (%d).\n", r); |
|---|
| 2569 | | - |
|---|
| 2570 | | - r = amdgpu_debugfs_regs_init(adev); |
|---|
| 2571 | | - if (r) |
|---|
| 2572 | | - DRM_ERROR("registering register debugfs failed (%d).\n", r); |
|---|
| 2573 | | - |
|---|
| 2574 | | - r = amdgpu_debugfs_firmware_init(adev); |
|---|
| 2575 | | - if (r) |
|---|
| 2576 | | - DRM_ERROR("registering firmware debugfs failed (%d).\n", r); |
|---|
| 2577 | | - |
|---|
| 2578 | | - r = amdgpu_debugfs_init(adev); |
|---|
| 2579 | | - if (r) |
|---|
| 2580 | | - DRM_ERROR("Creating debugfs files failed (%d).\n", r); |
|---|
| 3484 | + r = amdgpu_ucode_sysfs_init(adev); |
|---|
| 3485 | + if (r) { |
|---|
| 3486 | + adev->ucode_sysfs_en = false; |
|---|
| 3487 | + DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); |
|---|
| 3488 | + } else |
|---|
| 3489 | + adev->ucode_sysfs_en = true; |
|---|
| 2581 | 3490 | |
|---|
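The hunk above records whether each optional sysfs interface actually came up (`pm_sysfs_en`, `ucode_sysfs_en`), so that teardown later only undoes what init managed to create. Below is a minimal userspace sketch of that guarded-teardown pattern; all names are illustrative stand-ins, not the driver's API.

```c
#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for the optional init calls; one pretends to fail. */
static int pm_sysfs_init(void)    { return 0;  }
static int ucode_sysfs_init(void) { return -1; }

struct dev_state {
	bool pm_sysfs_en;
	bool ucode_sysfs_en;
};

static void dev_init(struct dev_state *s)
{
	/* Failures are logged but not fatal; the flag gates teardown. */
	s->pm_sysfs_en = (pm_sysfs_init() == 0);
	if (!s->pm_sysfs_en)
		fprintf(stderr, "registering pm sysfs failed\n");

	s->ucode_sysfs_en = (ucode_sysfs_init() == 0);
	if (!s->ucode_sysfs_en)
		fprintf(stderr, "creating firmware sysfs failed\n");
}

static void dev_fini(struct dev_state *s)
{
	/* Only undo what init actually set up. */
	if (s->ucode_sysfs_en)
		puts("ucode sysfs removed");
	if (s->pm_sysfs_en)
		puts("pm sysfs removed");
}

int main(void)
{
	struct dev_state s = { false, false };

	dev_init(&s);
	dev_fini(&s);
	return 0;
}
```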
| 2582 | 3491 | if ((amdgpu_testing & 1)) { |
|---|
| 2583 | 3492 | if (adev->accel_working) |
|---|
| .. | .. |
|---|
| 2592 | 3501 | DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n"); |
|---|
| 2593 | 3502 | } |
|---|
| 2594 | 3503 | |
|---|
| 3504 | + /* |
|---|
| 3505 | + * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. |
|---|
| 3506 | + * Otherwise the mgpu fan boost feature will be skipped, since the |
|---|
| 3507 | + * gpu instance count would still be too low. |
|---|
| 3508 | + */ |
|---|
| 3509 | + amdgpu_register_gpu_instance(adev); |
|---|
| 3510 | + |
|---|
| 2595 | 3511 | /* enable clockgating, etc. after ib tests, etc. since some blocks require |
|---|
| 2596 | 3512 | * explicit gating rather than handling it automatically. |
|---|
| 2597 | 3513 | */ |
|---|
| .. | .. |
|---|
| 2602 | 3518 | goto failed; |
|---|
| 2603 | 3519 | } |
|---|
| 2604 | 3520 | |
|---|
| 3521 | + /* must succeed. */ |
|---|
| 3522 | + amdgpu_ras_resume(adev); |
|---|
| 3523 | + |
|---|
| 3524 | + queue_delayed_work(system_wq, &adev->delayed_init_work, |
|---|
| 3525 | + msecs_to_jiffies(AMDGPU_RESUME_MS)); |
|---|
| 3526 | + |
|---|
| 3527 | + if (amdgpu_sriov_vf(adev)) |
|---|
| 3528 | + flush_delayed_work(&adev->delayed_init_work); |
|---|
| 3529 | + |
|---|
| 3530 | + r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); |
|---|
| 3531 | + if (r) |
|---|
| 3532 | + dev_err(adev->dev, "Could not create amdgpu device attr\n"); |
|---|
| 3533 | + |
|---|
| 3534 | + if (IS_ENABLED(CONFIG_PERF_EVENTS)) |
|---|
| 3535 | + r = amdgpu_pmu_init(adev); |
|---|
| 3536 | + if (r) |
|---|
| 3537 | + dev_err(adev->dev, "amdgpu_pmu_init failed\n"); |
|---|
| 3538 | + |
|---|
| 3539 | + /* Keep the stored PCI config space at hand for restore on a sudden PCI error */ |
|---|
| 3540 | + if (amdgpu_device_cache_pci_state(adev->pdev)) |
|---|
| 3541 | + pci_restore_state(pdev); |
|---|
| 3542 | + |
|---|
| 2605 | 3543 | return 0; |
|---|
| 2606 | 3544 | |
|---|
| 2607 | 3545 | failed: |
|---|
| 2608 | 3546 | amdgpu_vf_error_trans_all(adev); |
|---|
| 2609 | | - if (runtime) |
|---|
| 3547 | + if (boco) |
|---|
| 2610 | 3548 | vga_switcheroo_fini_domain_pm_ops(adev->dev); |
|---|
| 2611 | 3549 | |
|---|
| 2612 | 3550 | return r; |
|---|
| .. | .. |
|---|
| 2622 | 3560 | */ |
|---|
| 2623 | 3561 | void amdgpu_device_fini(struct amdgpu_device *adev) |
|---|
| 2624 | 3562 | { |
|---|
| 2625 | | - int r; |
|---|
| 2626 | | - |
|---|
| 2627 | | - DRM_INFO("amdgpu: finishing device.\n"); |
|---|
| 3563 | + dev_info(adev->dev, "amdgpu: finishing device.\n"); |
|---|
| 3564 | + flush_delayed_work(&adev->delayed_init_work); |
|---|
| 3565 | + ttm_bo_lock_delayed_workqueue(&adev->mman.bdev); |
|---|
| 2628 | 3566 | adev->shutdown = true; |
|---|
| 3567 | + |
|---|
| 3568 | + kfree(adev->pci_state); |
|---|
| 3569 | + |
|---|
| 3570 | + /* make sure IB tests have finished before entering exclusive mode |
|---|
| 3571 | + * to avoid preemption on IB tests |
|---|
| 3572 | + */ |
|---|
| 3573 | + if (amdgpu_sriov_vf(adev)) { |
|---|
| 3574 | + amdgpu_virt_request_full_gpu(adev, false); |
|---|
| 3575 | + amdgpu_virt_fini_data_exchange(adev); |
|---|
| 3576 | + } |
|---|
| 3577 | + |
|---|
| 2629 | 3578 | /* disable all interrupts */ |
|---|
| 2630 | 3579 | amdgpu_irq_disable_all(adev); |
|---|
| 2631 | 3580 | if (adev->mode_info.mode_config_initialized){ |
|---|
| 2632 | 3581 | if (!amdgpu_device_has_dc_support(adev)) |
|---|
| 2633 | | - drm_crtc_force_disable_all(adev->ddev); |
|---|
| 3582 | + drm_helper_force_disable_all(adev_to_drm(adev)); |
|---|
| 2634 | 3583 | else |
|---|
| 2635 | | - drm_atomic_helper_shutdown(adev->ddev); |
|---|
| 3584 | + drm_atomic_helper_shutdown(adev_to_drm(adev)); |
|---|
| 2636 | 3585 | } |
|---|
| 2637 | | - amdgpu_ib_pool_fini(adev); |
|---|
| 2638 | 3586 | amdgpu_fence_driver_fini(adev); |
|---|
| 2639 | | - amdgpu_pm_sysfs_fini(adev); |
|---|
| 3587 | + if (adev->pm_sysfs_en) |
|---|
| 3588 | + amdgpu_pm_sysfs_fini(adev); |
|---|
| 2640 | 3589 | amdgpu_fbdev_fini(adev); |
|---|
| 2641 | | - r = amdgpu_device_ip_fini(adev); |
|---|
| 2642 | | - if (adev->firmware.gpu_info_fw) { |
|---|
| 2643 | | - release_firmware(adev->firmware.gpu_info_fw); |
|---|
| 2644 | | - adev->firmware.gpu_info_fw = NULL; |
|---|
| 2645 | | - } |
|---|
| 3590 | + amdgpu_device_ip_fini(adev); |
|---|
| 3591 | + release_firmware(adev->firmware.gpu_info_fw); |
|---|
| 3592 | + adev->firmware.gpu_info_fw = NULL; |
|---|
| 2646 | 3593 | adev->accel_working = false; |
|---|
| 2647 | | - cancel_delayed_work_sync(&adev->late_init_work); |
|---|
| 2648 | 3594 | /* free i2c buses */ |
|---|
| 2649 | 3595 | if (!amdgpu_device_has_dc_support(adev)) |
|---|
| 2650 | 3596 | amdgpu_i2c_fini(adev); |
|---|
| .. | .. |
|---|
| 2654 | 3600 | |
|---|
| 2655 | 3601 | kfree(adev->bios); |
|---|
| 2656 | 3602 | adev->bios = NULL; |
|---|
| 2657 | | - if (!pci_is_thunderbolt_attached(adev->pdev)) |
|---|
| 3603 | + if (amdgpu_has_atpx() && |
|---|
| 3604 | + (amdgpu_is_atpx_hybrid() || |
|---|
| 3605 | + amdgpu_has_atpx_dgpu_power_cntl()) && |
|---|
| 3606 | + !pci_is_thunderbolt_attached(adev->pdev)) |
|---|
| 2658 | 3607 | vga_switcheroo_unregister_client(adev->pdev); |
|---|
| 2659 | | - if (adev->flags & AMD_IS_PX) |
|---|
| 3608 | + if (amdgpu_device_supports_boco(adev_to_drm(adev))) |
|---|
| 2660 | 3609 | vga_switcheroo_fini_domain_pm_ops(adev->dev); |
|---|
| 2661 | 3610 | vga_client_register(adev->pdev, NULL, NULL, NULL); |
|---|
| 2662 | 3611 | if (adev->rio_mem) |
|---|
| .. | .. |
|---|
| 2665 | 3614 | iounmap(adev->rmmio); |
|---|
| 2666 | 3615 | adev->rmmio = NULL; |
|---|
| 2667 | 3616 | amdgpu_device_doorbell_fini(adev); |
|---|
| 2668 | | - amdgpu_debugfs_regs_cleanup(adev); |
|---|
| 3617 | + |
|---|
| 3618 | + if (adev->ucode_sysfs_en) |
|---|
| 3619 | + amdgpu_ucode_sysfs_fini(adev); |
|---|
| 3620 | + |
|---|
| 3621 | + sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); |
|---|
| 3622 | + if (IS_ENABLED(CONFIG_PERF_EVENTS)) |
|---|
| 3623 | + amdgpu_pmu_fini(adev); |
|---|
| 3624 | + if (adev->mman.discovery_bin) |
|---|
| 3625 | + amdgpu_discovery_fini(adev); |
|---|
| 2669 | 3626 | } |
|---|
| 2670 | 3627 | |
|---|
| 2671 | 3628 | |
|---|
| .. | .. |
|---|
| 2676 | 3633 | * amdgpu_device_suspend - initiate device suspend |
|---|
| 2677 | 3634 | * |
|---|
| 2678 | 3635 | * @dev: drm dev pointer |
|---|
| 2679 | | - * @suspend: suspend state |
|---|
| 2680 | 3636 | * @fbcon : notify the fbdev of suspend |
|---|
| 2681 | 3637 | * |
|---|
| 2682 | 3638 | * Puts the hw in the suspend state (all asics). |
|---|
| 2683 | 3639 | * Returns 0 for success or an error on failure. |
|---|
| 2684 | 3640 | * Called at driver suspend. |
|---|
| 2685 | 3641 | */ |
|---|
| 2686 | | -int amdgpu_device_suspend(struct drm_device *dev, bool suspend, bool fbcon) |
|---|
| 3642 | +int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) |
|---|
| 2687 | 3643 | { |
|---|
| 2688 | 3644 | struct amdgpu_device *adev; |
|---|
| 2689 | 3645 | struct drm_crtc *crtc; |
|---|
| 2690 | 3646 | struct drm_connector *connector; |
|---|
| 3647 | + struct drm_connector_list_iter iter; |
|---|
| 2691 | 3648 | int r; |
|---|
| 2692 | 3649 | |
|---|
| 2693 | | - if (dev == NULL || dev->dev_private == NULL) { |
|---|
| 2694 | | - return -ENODEV; |
|---|
| 2695 | | - } |
|---|
| 2696 | | - |
|---|
| 2697 | | - adev = dev->dev_private; |
|---|
| 3650 | + adev = drm_to_adev(dev); |
|---|
| 2698 | 3651 | |
|---|
| 2699 | 3652 | if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) |
|---|
| 2700 | 3653 | return 0; |
|---|
| 2701 | 3654 | |
|---|
| 3655 | + adev->in_suspend = true; |
|---|
| 2702 | 3656 | drm_kms_helper_poll_disable(dev); |
|---|
| 2703 | 3657 | |
|---|
| 2704 | 3658 | if (fbcon) |
|---|
| 2705 | 3659 | amdgpu_fbdev_set_suspend(adev, 1); |
|---|
| 2706 | 3660 | |
|---|
| 3661 | + cancel_delayed_work_sync(&adev->delayed_init_work); |
|---|
| 3662 | + |
|---|
| 2707 | 3663 | if (!amdgpu_device_has_dc_support(adev)) { |
|---|
| 2708 | 3664 | /* turn off display hw */ |
|---|
| 2709 | 3665 | drm_modeset_lock_all(dev); |
|---|
| 2710 | | - list_for_each_entry(connector, &dev->mode_config.connector_list, head) { |
|---|
| 2711 | | - drm_helper_connector_dpms(connector, DRM_MODE_DPMS_OFF); |
|---|
| 2712 | | - } |
|---|
| 3666 | + drm_connector_list_iter_begin(dev, &iter); |
|---|
| 3667 | + drm_for_each_connector_iter(connector, &iter) |
|---|
| 3668 | + drm_helper_connector_dpms(connector, |
|---|
| 3669 | + DRM_MODE_DPMS_OFF); |
|---|
| 3670 | + drm_connector_list_iter_end(&iter); |
|---|
| 2713 | 3671 | drm_modeset_unlock_all(dev); |
|---|
| 2714 | 3672 | /* unpin the front buffers and cursors */ |
|---|
| 2715 | 3673 | list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { |
|---|
| .. | .. |
|---|
| 2717 | 3675 | struct drm_framebuffer *fb = crtc->primary->fb; |
|---|
| 2718 | 3676 | struct amdgpu_bo *robj; |
|---|
| 2719 | 3677 | |
|---|
| 2720 | | - if (amdgpu_crtc->cursor_bo) { |
|---|
| 3678 | + if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) { |
|---|
| 2721 | 3679 | struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo); |
|---|
| 2722 | 3680 | r = amdgpu_bo_reserve(aobj, true); |
|---|
| 2723 | 3681 | if (r == 0) { |
|---|
| .. | .. |
|---|
| 2741 | 3699 | } |
|---|
| 2742 | 3700 | } |
|---|
| 2743 | 3701 | |
|---|
| 2744 | | - amdgpu_amdkfd_suspend(adev); |
|---|
| 3702 | + amdgpu_ras_suspend(adev); |
|---|
| 2745 | 3703 | |
|---|
| 2746 | 3704 | r = amdgpu_device_ip_suspend_phase1(adev); |
|---|
| 3705 | + |
|---|
| 3706 | + amdgpu_amdkfd_suspend(adev, !fbcon); |
|---|
| 2747 | 3707 | |
|---|
| 2748 | 3708 | /* evict vram memory */ |
|---|
| 2749 | 3709 | amdgpu_bo_evict_vram(adev); |
|---|
| .. | .. |
|---|
| 2758 | 3718 | */ |
|---|
| 2759 | 3719 | amdgpu_bo_evict_vram(adev); |
|---|
| 2760 | 3720 | |
|---|
| 2761 | | - pci_save_state(dev->pdev); |
|---|
| 2762 | | - if (suspend) { |
|---|
| 2763 | | - /* Shut down the device */ |
|---|
| 2764 | | - pci_disable_device(dev->pdev); |
|---|
| 2765 | | - pci_set_power_state(dev->pdev, PCI_D3hot); |
|---|
| 2766 | | - } else { |
|---|
| 2767 | | - r = amdgpu_asic_reset(adev); |
|---|
| 2768 | | - if (r) |
|---|
| 2769 | | - DRM_ERROR("amdgpu asic reset failed\n"); |
|---|
| 2770 | | - } |
|---|
| 2771 | | - |
|---|
| 2772 | 3721 | return 0; |
|---|
| 2773 | 3722 | } |
|---|
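The suspend path above walks connectors with the drm_connector_list_iter_begin() / drm_for_each_connector_iter() / drm_connector_list_iter_end() triple rather than a bare list walk, because the iterator object pins the connector list against concurrent hotplug while it is being traversed. A generic sketch of that begin/iterate/end bracket follows, using hypothetical types rather than the DRM API.

```c
#include <stdio.h>

struct item { int id; struct item *next; };

struct list_iter {
	struct item *cur;
	int *refcount;		/* what begin() pins and end() releases */
};

static void iter_begin(struct list_iter *it, struct item *head, int *ref)
{
	(*ref)++;		/* keep the list alive while iterating */
	it->cur = head;
	it->refcount = ref;
}

static struct item *iter_next(struct list_iter *it)
{
	struct item *i = it->cur;

	if (i)
		it->cur = i->next;
	return i;
}

static void iter_end(struct list_iter *it)
{
	(*it->refcount)--;	/* must pair with every iter_begin() */
}

int main(void)
{
	struct item c = {3, NULL}, b = {2, &c}, a = {1, &b};
	struct list_iter it;
	struct item *i;
	int ref = 0;

	iter_begin(&it, &a, &ref);
	while ((i = iter_next(&it)))
		printf("connector %d\n", i->id);
	iter_end(&it);
	printf("refcount back to %d\n", ref);
	return 0;
}
```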
| 2774 | 3723 | |
|---|
| .. | .. |
|---|
| 2776 | 3725 | * amdgpu_device_resume - initiate device resume |
|---|
| 2777 | 3726 | * |
|---|
| 2778 | 3727 | * @dev: drm dev pointer |
|---|
| 2779 | | - * @resume: resume state |
|---|
| 2780 | 3728 | * @fbcon : notify the fbdev of resume |
|---|
| 2781 | 3729 | * |
|---|
| 2782 | 3730 | * Bring the hw back to operating state (all asics). |
|---|
| 2783 | 3731 | * Returns 0 for success or an error on failure. |
|---|
| 2784 | 3732 | * Called at driver resume. |
|---|
| 2785 | 3733 | */ |
|---|
| 2786 | | -int amdgpu_device_resume(struct drm_device *dev, bool resume, bool fbcon) |
|---|
| 3734 | +int amdgpu_device_resume(struct drm_device *dev, bool fbcon) |
|---|
| 2787 | 3735 | { |
|---|
| 2788 | 3736 | struct drm_connector *connector; |
|---|
| 2789 | | - struct amdgpu_device *adev = dev->dev_private; |
|---|
| 3737 | + struct drm_connector_list_iter iter; |
|---|
| 3738 | + struct amdgpu_device *adev = drm_to_adev(dev); |
|---|
| 2790 | 3739 | struct drm_crtc *crtc; |
|---|
| 2791 | 3740 | int r = 0; |
|---|
| 2792 | 3741 | |
|---|
| 2793 | 3742 | if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) |
|---|
| 2794 | 3743 | return 0; |
|---|
| 2795 | 3744 | |
|---|
| 2796 | | - if (resume) { |
|---|
| 2797 | | - pci_set_power_state(dev->pdev, PCI_D0); |
|---|
| 2798 | | - pci_restore_state(dev->pdev); |
|---|
| 2799 | | - r = pci_enable_device(dev->pdev); |
|---|
| 2800 | | - if (r) |
|---|
| 2801 | | - return r; |
|---|
| 2802 | | - } |
|---|
| 2803 | | - |
|---|
| 2804 | 3745 | /* post card */ |
|---|
| 2805 | 3746 | if (amdgpu_device_need_post(adev)) { |
|---|
| 2806 | | - r = amdgpu_atom_asic_init(adev->mode_info.atom_context); |
|---|
| 3747 | + r = amdgpu_device_asic_init(adev); |
|---|
| 2807 | 3748 | if (r) |
|---|
| 2808 | | - DRM_ERROR("amdgpu asic init failed\n"); |
|---|
| 3749 | + dev_err(adev->dev, "amdgpu asic init failed\n"); |
|---|
| 2809 | 3750 | } |
|---|
| 2810 | 3751 | |
|---|
| 2811 | 3752 | r = amdgpu_device_ip_resume(adev); |
|---|
| 2812 | 3753 | if (r) { |
|---|
| 2813 | | - DRM_ERROR("amdgpu_device_ip_resume failed (%d).\n", r); |
|---|
| 3754 | + dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); |
|---|
| 2814 | 3755 | return r; |
|---|
| 2815 | 3756 | } |
|---|
| 2816 | 3757 | amdgpu_fence_driver_resume(adev); |
|---|
| .. | .. |
|---|
| 2820 | 3761 | if (r) |
|---|
| 2821 | 3762 | return r; |
|---|
| 2822 | 3763 | |
|---|
| 3764 | + queue_delayed_work(system_wq, &adev->delayed_init_work, |
|---|
| 3765 | + msecs_to_jiffies(AMDGPU_RESUME_MS)); |
|---|
| 3766 | + |
|---|
| 2823 | 3767 | if (!amdgpu_device_has_dc_support(adev)) { |
|---|
| 2824 | 3768 | /* pin cursors */ |
|---|
| 2825 | 3769 | list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { |
|---|
| 2826 | 3770 | struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc); |
|---|
| 2827 | 3771 | |
|---|
| 2828 | | - if (amdgpu_crtc->cursor_bo) { |
|---|
| 3772 | + if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) { |
|---|
| 2829 | 3773 | struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo); |
|---|
| 2830 | 3774 | r = amdgpu_bo_reserve(aobj, true); |
|---|
| 2831 | 3775 | if (r == 0) { |
|---|
| 2832 | 3776 | r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM); |
|---|
| 2833 | 3777 | if (r != 0) |
|---|
| 2834 | | - DRM_ERROR("Failed to pin cursor BO (%d)\n", r); |
|---|
| 3778 | + dev_err(adev->dev, "Failed to pin cursor BO (%d)\n", r); |
|---|
| 2835 | 3779 | amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj); |
|---|
| 2836 | 3780 | amdgpu_bo_unreserve(aobj); |
|---|
| 2837 | 3781 | } |
|---|
| 2838 | 3782 | } |
|---|
| 2839 | 3783 | } |
|---|
| 2840 | 3784 | } |
|---|
| 2841 | | - r = amdgpu_amdkfd_resume(adev); |
|---|
| 3785 | + r = amdgpu_amdkfd_resume(adev, !fbcon); |
|---|
| 2842 | 3786 | if (r) |
|---|
| 2843 | 3787 | return r; |
|---|
| 2844 | 3788 | |
|---|
| 2845 | 3789 | /* Make sure IB tests flushed */ |
|---|
| 2846 | | - flush_delayed_work(&adev->late_init_work); |
|---|
| 3790 | + flush_delayed_work(&adev->delayed_init_work); |
|---|
| 2847 | 3791 | |
|---|
| 2848 | 3792 | /* blat the mode back in */ |
|---|
| 2849 | 3793 | if (fbcon) { |
|---|
| .. | .. |
|---|
| 2853 | 3797 | |
|---|
| 2854 | 3798 | /* turn on display hw */ |
|---|
| 2855 | 3799 | drm_modeset_lock_all(dev); |
|---|
| 2856 | | - list_for_each_entry(connector, &dev->mode_config.connector_list, head) { |
|---|
| 2857 | | - drm_helper_connector_dpms(connector, DRM_MODE_DPMS_ON); |
|---|
| 2858 | | - } |
|---|
| 3800 | + |
|---|
| 3801 | + drm_connector_list_iter_begin(dev, &iter); |
|---|
| 3802 | + drm_for_each_connector_iter(connector, &iter) |
|---|
| 3803 | + drm_helper_connector_dpms(connector, |
|---|
| 3804 | + DRM_MODE_DPMS_ON); |
|---|
| 3805 | + drm_connector_list_iter_end(&iter); |
|---|
| 3806 | + |
|---|
| 2859 | 3807 | drm_modeset_unlock_all(dev); |
|---|
| 2860 | 3808 | } |
|---|
| 2861 | 3809 | amdgpu_fbdev_set_suspend(adev, 0); |
|---|
| 2862 | 3810 | } |
|---|
| 2863 | 3811 | |
|---|
| 2864 | 3812 | drm_kms_helper_poll_enable(dev); |
|---|
| 3813 | + |
|---|
| 3814 | + amdgpu_ras_resume(adev); |
|---|
| 2865 | 3815 | |
|---|
| 2866 | 3816 | /* |
|---|
| 2867 | 3817 | * Most of the connector probing functions try to acquire runtime pm |
|---|
| .. | .. |
|---|
| 2882 | 3832 | #ifdef CONFIG_PM |
|---|
| 2883 | 3833 | dev->dev->power.disable_depth--; |
|---|
| 2884 | 3834 | #endif |
|---|
| 3835 | + adev->in_suspend = false; |
|---|
| 3836 | + |
|---|
| 2885 | 3837 | return 0; |
|---|
| 2886 | 3838 | } |
|---|
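Resume queues the heavier late init onto `delayed_init_work` and then flushes that work before anything depends on the IB tests it runs. Below is a userspace sketch of that queue-then-flush discipline, with a condition variable standing in for flush_delayed_work(); all names are illustrative.

```c
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t done_cv = PTHREAD_COND_INITIALIZER;
static bool work_done;

static void *delayed_init(void *arg)
{
	(void)arg;
	usleep(10000);			/* the deferred, slow part of init */
	pthread_mutex_lock(&lock);
	work_done = true;
	pthread_cond_signal(&done_cv);
	pthread_mutex_unlock(&lock);
	return NULL;
}

static void flush_delayed_init(void)
{
	pthread_mutex_lock(&lock);
	while (!work_done)		/* block until the work has run */
		pthread_cond_wait(&done_cv, &lock);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	pthread_t worker;

	pthread_create(&worker, NULL, delayed_init, NULL); /* queue the work */
	flush_delayed_init();		/* "make sure IB tests flushed" */
	puts("safe to rely on late init now");
	pthread_join(worker, NULL);
	return 0;
}
```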
| 2887 | 3839 | |
|---|
| .. | .. |
|---|
| 2913 | 3865 | adev->ip_blocks[i].status.hang = |
|---|
| 2914 | 3866 | adev->ip_blocks[i].version->funcs->check_soft_reset(adev); |
|---|
| 2915 | 3867 | if (adev->ip_blocks[i].status.hang) { |
|---|
| 2916 | | - DRM_INFO("IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); |
|---|
| 3868 | + dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); |
|---|
| 2917 | 3869 | asic_hang = true; |
|---|
| 2918 | 3870 | } |
|---|
| 2919 | 3871 | } |
|---|
| .. | .. |
|---|
| 2974 | 3926 | (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || |
|---|
| 2975 | 3927 | adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { |
|---|
| 2976 | 3928 | if (adev->ip_blocks[i].status.hang) { |
|---|
| 2977 | | - DRM_INFO("Some block need full reset!\n"); |
|---|
| 3929 | + dev_info(adev->dev, "Some blocks need full reset!\n"); |
|---|
| 2978 | 3930 | return true; |
|---|
| 2979 | 3931 | } |
|---|
| 2980 | 3932 | } |
|---|
| .. | .. |
|---|
| 3040 | 3992 | } |
|---|
| 3041 | 3993 | |
|---|
| 3042 | 3994 | /** |
|---|
| 3043 | | - * amdgpu_device_recover_vram_from_shadow - restore shadowed VRAM buffers |
|---|
| 3044 | | - * |
|---|
| 3045 | | - * @adev: amdgpu_device pointer |
|---|
| 3046 | | - * @ring: amdgpu_ring for the engine handling the buffer operations |
|---|
| 3047 | | - * @bo: amdgpu_bo buffer whose shadow is being restored |
|---|
| 3048 | | - * @fence: dma_fence associated with the operation |
|---|
| 3049 | | - * |
|---|
| 3050 | | - * Restores the VRAM buffer contents from the shadow in GTT. Used to |
|---|
| 3051 | | - * restore things like GPUVM page tables after a GPU reset where |
|---|
| 3052 | | - * the contents of VRAM might be lost. |
|---|
| 3053 | | - * Returns 0 on success, negative error code on failure. |
|---|
| 3054 | | - */ |
|---|
| 3055 | | -static int amdgpu_device_recover_vram_from_shadow(struct amdgpu_device *adev, |
|---|
| 3056 | | - struct amdgpu_ring *ring, |
|---|
| 3057 | | - struct amdgpu_bo *bo, |
|---|
| 3058 | | - struct dma_fence **fence) |
|---|
| 3059 | | -{ |
|---|
| 3060 | | - uint32_t domain; |
|---|
| 3061 | | - int r; |
|---|
| 3062 | | - |
|---|
| 3063 | | - if (!bo->shadow) |
|---|
| 3064 | | - return 0; |
|---|
| 3065 | | - |
|---|
| 3066 | | - r = amdgpu_bo_reserve(bo, true); |
|---|
| 3067 | | - if (r) |
|---|
| 3068 | | - return r; |
|---|
| 3069 | | - domain = amdgpu_mem_type_to_domain(bo->tbo.mem.mem_type); |
|---|
| 3070 | | - /* if bo has been evicted, then no need to recover */ |
|---|
| 3071 | | - if (domain == AMDGPU_GEM_DOMAIN_VRAM) { |
|---|
| 3072 | | - r = amdgpu_bo_validate(bo->shadow); |
|---|
| 3073 | | - if (r) { |
|---|
| 3074 | | - DRM_ERROR("bo validate failed!\n"); |
|---|
| 3075 | | - goto err; |
|---|
| 3076 | | - } |
|---|
| 3077 | | - |
|---|
| 3078 | | - r = amdgpu_bo_restore_from_shadow(adev, ring, bo, |
|---|
| 3079 | | - NULL, fence, true); |
|---|
| 3080 | | - if (r) { |
|---|
| 3081 | | - DRM_ERROR("recover page table failed!\n"); |
|---|
| 3082 | | - goto err; |
|---|
| 3083 | | - } |
|---|
| 3084 | | - } |
|---|
| 3085 | | -err: |
|---|
| 3086 | | - amdgpu_bo_unreserve(bo); |
|---|
| 3087 | | - return r; |
|---|
| 3088 | | -} |
|---|
| 3089 | | - |
|---|
| 3090 | | -/** |
|---|
| 3091 | | - * amdgpu_device_handle_vram_lost - Handle the loss of VRAM contents |
|---|
| 3995 | + * amdgpu_device_recover_vram - Recover some VRAM contents |
|---|
| 3092 | 3996 | * |
|---|
| 3093 | 3997 | * @adev: amdgpu_device pointer |
|---|
| 3094 | 3998 | * |
|---|
| 3095 | 3999 | * Restores the contents of VRAM buffers from the shadows in GTT. Used to |
|---|
| 3096 | 4000 | * restore things like GPUVM page tables after a GPU reset where |
|---|
| 3097 | 4001 | * the contents of VRAM might be lost. |
|---|
| 3098 | | - * Returns 0 on success, 1 on failure. |
|---|
| 4002 | + * |
|---|
| 4003 | + * Returns: |
|---|
| 4004 | + * 0 on success, negative error code on failure. |
|---|
| 3099 | 4005 | */ |
|---|
| 3100 | | -static int amdgpu_device_handle_vram_lost(struct amdgpu_device *adev) |
|---|
| 4006 | +static int amdgpu_device_recover_vram(struct amdgpu_device *adev) |
|---|
| 3101 | 4007 | { |
|---|
| 3102 | | - struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring; |
|---|
| 3103 | | - struct amdgpu_bo *bo, *tmp; |
|---|
| 3104 | 4008 | struct dma_fence *fence = NULL, *next = NULL; |
|---|
| 3105 | | - long r = 1; |
|---|
| 3106 | | - int i = 0; |
|---|
| 3107 | | - long tmo; |
|---|
| 4009 | + struct amdgpu_bo *shadow; |
|---|
| 4010 | + long r = 1, tmo; |
|---|
| 3108 | 4011 | |
|---|
| 3109 | 4012 | if (amdgpu_sriov_runtime(adev)) |
|---|
| 3110 | 4013 | tmo = msecs_to_jiffies(8000); |
|---|
| 3111 | 4014 | else |
|---|
| 3112 | 4015 | tmo = msecs_to_jiffies(100); |
|---|
| 3113 | 4016 | |
|---|
| 3114 | | - DRM_INFO("recover vram bo from shadow start\n"); |
|---|
| 4017 | + dev_info(adev->dev, "recover vram bo from shadow start\n"); |
|---|
| 3115 | 4018 | mutex_lock(&adev->shadow_list_lock); |
|---|
| 3116 | | - list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list) { |
|---|
| 3117 | | - next = NULL; |
|---|
| 3118 | | - amdgpu_device_recover_vram_from_shadow(adev, ring, bo, &next); |
|---|
| 4019 | + list_for_each_entry(shadow, &adev->shadow_list, shadow_list) { |
|---|
| 4020 | + |
|---|
| 4021 | + /* No need to recover an evicted BO */ |
|---|
| 4022 | + if (shadow->tbo.mem.mem_type != TTM_PL_TT || |
|---|
| 4023 | + shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET || |
|---|
| 4024 | + shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM) |
|---|
| 4025 | + continue; |
|---|
| 4026 | + |
|---|
| 4027 | + r = amdgpu_bo_restore_shadow(shadow, &next); |
|---|
| 4028 | + if (r) |
|---|
| 4029 | + break; |
|---|
| 4030 | + |
|---|
| 3119 | 4031 | if (fence) { |
|---|
| 3120 | | - r = dma_fence_wait_timeout(fence, false, tmo); |
|---|
| 3121 | | - if (r == 0) |
|---|
| 3122 | | - pr_err("wait fence %p[%d] timeout\n", fence, i); |
|---|
| 3123 | | - else if (r < 0) |
|---|
| 3124 | | - pr_err("wait fence %p[%d] interrupted\n", fence, i); |
|---|
| 3125 | | - if (r < 1) { |
|---|
| 3126 | | - dma_fence_put(fence); |
|---|
| 3127 | | - fence = next; |
|---|
| 4032 | + tmo = dma_fence_wait_timeout(fence, false, tmo); |
|---|
| 4033 | + dma_fence_put(fence); |
|---|
| 4034 | + fence = next; |
|---|
| 4035 | + if (tmo == 0) { |
|---|
| 4036 | + r = -ETIMEDOUT; |
|---|
| 4037 | + break; |
|---|
| 4038 | + } else if (tmo < 0) { |
|---|
| 4039 | + r = tmo; |
|---|
| 3128 | 4040 | break; |
|---|
| 3129 | 4041 | } |
|---|
| 3130 | | - i++; |
|---|
| 4042 | + } else { |
|---|
| 4043 | + fence = next; |
|---|
| 3131 | 4044 | } |
|---|
| 3132 | | - |
|---|
| 3133 | | - dma_fence_put(fence); |
|---|
| 3134 | | - fence = next; |
|---|
| 3135 | 4045 | } |
|---|
| 3136 | 4046 | mutex_unlock(&adev->shadow_list_lock); |
|---|
| 3137 | 4047 | |
|---|
| 3138 | | - if (fence) { |
|---|
| 3139 | | - r = dma_fence_wait_timeout(fence, false, tmo); |
|---|
| 3140 | | - if (r == 0) |
|---|
| 3141 | | - pr_err("wait fence %p[%d] timeout\n", fence, i); |
|---|
| 3142 | | - else if (r < 0) |
|---|
| 3143 | | - pr_err("wait fence %p[%d] interrupted\n", fence, i); |
|---|
| 3144 | | - |
|---|
| 3145 | | - } |
|---|
| 4048 | + if (fence) |
|---|
| 4049 | + tmo = dma_fence_wait_timeout(fence, false, tmo); |
|---|
| 3146 | 4050 | dma_fence_put(fence); |
|---|
| 3147 | 4051 | |
|---|
| 3148 | | - if (r > 0) |
|---|
| 3149 | | - DRM_INFO("recover vram bo from shadow done\n"); |
|---|
| 3150 | | - else |
|---|
| 3151 | | - DRM_ERROR("recover vram bo from shadow failed\n"); |
|---|
| 4052 | + if (r < 0 || tmo <= 0) { |
|---|
| 4053 | + dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); |
|---|
| 4054 | + return -EIO; |
|---|
| 4055 | + } |
|---|
| 3152 | 4056 | |
|---|
| 3153 | | - return (r > 0) ? 0 : 1; |
|---|
| 4057 | + dev_info(adev->dev, "recover vram bo from shadow done\n"); |
|---|
| 4058 | + return 0; |
|---|
| 3154 | 4059 | } |
|---|
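The recovery loop above leans on the dma_fence_wait_timeout() convention: a negative return is an error, 0 means the timeout elapsed, and a positive value is the budget left over, which is fed into the next wait so that one overall timeout covers the whole fence chain. A self-contained sketch of that shrinking-budget wait is below; fake_wait() is a stand-in, not a kernel API.

```c
#include <stdio.h>

/* Returns <0 on error, 0 on timeout, else the budget left over. */
static long fake_wait(long cost, long budget)
{
	if (cost < 0)
		return -1;			/* interrupted */
	return (cost > budget) ? 0 : budget - cost;
}

int main(void)
{
	long costs[] = { 30, 25, 40 };	/* per-fence wait cost, in "jiffies" */
	long tmo = 100;			/* total budget for the whole chain */
	int i;

	for (i = 0; i < 3; i++) {
		/* Feed the remaining budget into the next wait. */
		tmo = fake_wait(costs[i], tmo);
		if (tmo == 0) {
			fprintf(stderr, "fence %d: timeout\n", i);
			return 1;
		} else if (tmo < 0) {
			fprintf(stderr, "fence %d: error %ld\n", i, tmo);
			return 1;
		}
	}
	printf("all fences signaled, %ld jiffies to spare\n", tmo);
	return 0;
}
```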
| 3155 | 4060 | |
|---|
| 3156 | | -/** |
|---|
| 3157 | | - * amdgpu_device_reset - reset ASIC/GPU for bare-metal or passthrough |
|---|
| 3158 | | - * |
|---|
| 3159 | | - * @adev: amdgpu device pointer |
|---|
| 3160 | | - * |
|---|
| 3161 | | - * attempt to do soft-reset or full-reset and reinitialize Asic |
|---|
| 3162 | | - * return 0 means succeeded otherwise failed |
|---|
| 3163 | | - */ |
|---|
| 3164 | | -static int amdgpu_device_reset(struct amdgpu_device *adev) |
|---|
| 3165 | | -{ |
|---|
| 3166 | | - bool need_full_reset, vram_lost = 0; |
|---|
| 3167 | | - int r; |
|---|
| 3168 | | - |
|---|
| 3169 | | - need_full_reset = amdgpu_device_ip_need_full_reset(adev); |
|---|
| 3170 | | - |
|---|
| 3171 | | - if (!need_full_reset) { |
|---|
| 3172 | | - amdgpu_device_ip_pre_soft_reset(adev); |
|---|
| 3173 | | - r = amdgpu_device_ip_soft_reset(adev); |
|---|
| 3174 | | - amdgpu_device_ip_post_soft_reset(adev); |
|---|
| 3175 | | - if (r || amdgpu_device_ip_check_soft_reset(adev)) { |
|---|
| 3176 | | - DRM_INFO("soft reset failed, will fallback to full reset!\n"); |
|---|
| 3177 | | - need_full_reset = true; |
|---|
| 3178 | | - } |
|---|
| 3179 | | - } |
|---|
| 3180 | | - |
|---|
| 3181 | | - if (need_full_reset) { |
|---|
| 3182 | | - r = amdgpu_device_ip_suspend(adev); |
|---|
| 3183 | | - |
|---|
| 3184 | | -retry: |
|---|
| 3185 | | - r = amdgpu_asic_reset(adev); |
|---|
| 3186 | | - /* post card */ |
|---|
| 3187 | | - amdgpu_atom_asic_init(adev->mode_info.atom_context); |
|---|
| 3188 | | - |
|---|
| 3189 | | - if (!r) { |
|---|
| 3190 | | - dev_info(adev->dev, "GPU reset succeeded, trying to resume\n"); |
|---|
| 3191 | | - r = amdgpu_device_ip_resume_phase1(adev); |
|---|
| 3192 | | - if (r) |
|---|
| 3193 | | - goto out; |
|---|
| 3194 | | - |
|---|
| 3195 | | - vram_lost = amdgpu_device_check_vram_lost(adev); |
|---|
| 3196 | | - if (vram_lost) { |
|---|
| 3197 | | - DRM_ERROR("VRAM is lost!\n"); |
|---|
| 3198 | | - atomic_inc(&adev->vram_lost_counter); |
|---|
| 3199 | | - } |
|---|
| 3200 | | - |
|---|
| 3201 | | - r = amdgpu_gtt_mgr_recover( |
|---|
| 3202 | | - &adev->mman.bdev.man[TTM_PL_TT]); |
|---|
| 3203 | | - if (r) |
|---|
| 3204 | | - goto out; |
|---|
| 3205 | | - |
|---|
| 3206 | | - r = amdgpu_device_ip_resume_phase2(adev); |
|---|
| 3207 | | - if (r) |
|---|
| 3208 | | - goto out; |
|---|
| 3209 | | - |
|---|
| 3210 | | - if (vram_lost) |
|---|
| 3211 | | - amdgpu_device_fill_reset_magic(adev); |
|---|
| 3212 | | - } |
|---|
| 3213 | | - } |
|---|
| 3214 | | - |
|---|
| 3215 | | -out: |
|---|
| 3216 | | - if (!r) { |
|---|
| 3217 | | - amdgpu_irq_gpu_reset_resume_helper(adev); |
|---|
| 3218 | | - r = amdgpu_ib_ring_tests(adev); |
|---|
| 3219 | | - if (r) { |
|---|
| 3220 | | - dev_err(adev->dev, "ib ring test failed (%d).\n", r); |
|---|
| 3221 | | - r = amdgpu_device_ip_suspend(adev); |
|---|
| 3222 | | - need_full_reset = true; |
|---|
| 3223 | | - goto retry; |
|---|
| 3224 | | - } |
|---|
| 3225 | | - } |
|---|
| 3226 | | - |
|---|
| 3227 | | - if (!r && ((need_full_reset && !(adev->flags & AMD_IS_APU)) || vram_lost)) |
|---|
| 3228 | | - r = amdgpu_device_handle_vram_lost(adev); |
|---|
| 3229 | | - |
|---|
| 3230 | | - return r; |
|---|
| 3231 | | -} |
|---|
| 3232 | 4061 | |
|---|
| 3233 | 4062 | /** |
|---|
| 3234 | 4063 | * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf |
|---|
| 3235 | 4064 | * |
|---|
| 3236 | | - * @adev: amdgpu device pointer |
|---|
| 4065 | + * @adev: amdgpu_device pointer |
|---|
| 3237 | 4066 | * @from_hypervisor: request from hypervisor |
|---|
| 3238 | 4067 | * |
|---|
| 3239 | 4068 | * do VF FLR and reinitialize Asic |
|---|
| .. | .. |
|---|
| 3251 | 4080 | if (r) |
|---|
| 3252 | 4081 | return r; |
|---|
| 3253 | 4082 | |
|---|
| 4083 | + amdgpu_amdkfd_pre_reset(adev); |
|---|
| 4084 | + |
|---|
| 3254 | 4085 | /* Resume IP prior to SMC */ |
|---|
| 3255 | 4086 | r = amdgpu_device_ip_reinit_early_sriov(adev); |
|---|
| 3256 | 4087 | if (r) |
|---|
| 3257 | 4088 | goto error; |
|---|
| 3258 | 4089 | |
|---|
| 4090 | + amdgpu_virt_init_data_exchange(adev); |
|---|
| 3259 | 4091 | /* we need recover gart prior to run SMC/CP/SDMA resume */ |
|---|
| 3260 | | - amdgpu_gtt_mgr_recover(&adev->mman.bdev.man[TTM_PL_TT]); |
|---|
| 4092 | + amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT)); |
|---|
| 4093 | + |
|---|
| 4094 | + r = amdgpu_device_fw_loading(adev); |
|---|
| 4095 | + if (r) |
|---|
| 4096 | + return r; |
|---|
| 3261 | 4097 | |
|---|
| 3262 | 4098 | /* now we are okay to resume SMC/CP/SDMA */ |
|---|
| 3263 | 4099 | r = amdgpu_device_ip_reinit_late_sriov(adev); |
|---|
| .. | .. |
|---|
| 3266 | 4102 | |
|---|
| 3267 | 4103 | amdgpu_irq_gpu_reset_resume_helper(adev); |
|---|
| 3268 | 4104 | r = amdgpu_ib_ring_tests(adev); |
|---|
| 4105 | + amdgpu_amdkfd_post_reset(adev); |
|---|
| 3269 | 4106 | |
|---|
| 3270 | 4107 | error: |
|---|
| 3271 | | - amdgpu_virt_init_data_exchange(adev); |
|---|
| 3272 | 4108 | amdgpu_virt_release_full_gpu(adev, true); |
|---|
| 3273 | 4109 | if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { |
|---|
| 3274 | | - atomic_inc(&adev->vram_lost_counter); |
|---|
| 3275 | | - r = amdgpu_device_handle_vram_lost(adev); |
|---|
| 4110 | + amdgpu_inc_vram_lost(adev); |
|---|
| 4111 | + r = amdgpu_device_recover_vram(adev); |
|---|
| 3276 | 4112 | } |
|---|
| 3277 | 4113 | |
|---|
| 3278 | 4114 | return r; |
|---|
| 3279 | 4115 | } |
|---|
| 3280 | 4116 | |
|---|
| 3281 | 4117 | /** |
|---|
| 3282 | | - * amdgpu_device_gpu_recover - reset the asic and recover scheduler |
|---|
| 4118 | + * amdgpu_device_has_job_running - check if there is any job in mirror list |
|---|
| 3283 | 4119 | * |
|---|
| 3284 | | - * @adev: amdgpu device pointer |
|---|
| 3285 | | - * @job: which job trigger hang |
|---|
| 3286 | | - * @force: forces reset regardless of amdgpu_gpu_recovery |
|---|
| 4120 | + * @adev: amdgpu_device pointer |
|---|
| 3287 | 4121 | * |
|---|
| 3288 | | - * Attempt to reset the GPU if it has hung (all asics). |
|---|
| 3289 | | - * Returns 0 for success or an error on failure. |
|---|
| 4122 | + * Check if there is any job in the mirror list. |
|---|
| 3290 | 4123 | */ |
|---|
| 3291 | | -int amdgpu_device_gpu_recover(struct amdgpu_device *adev, |
|---|
| 3292 | | - struct amdgpu_job *job, bool force) |
|---|
| 4124 | +bool amdgpu_device_has_job_running(struct amdgpu_device *adev) |
|---|
| 3293 | 4125 | { |
|---|
| 3294 | | - int i, r, resched; |
|---|
| 4126 | + int i; |
|---|
| 4127 | + struct drm_sched_job *job; |
|---|
| 3295 | 4128 | |
|---|
| 3296 | | - if (!force && !amdgpu_device_ip_check_soft_reset(adev)) { |
|---|
| 3297 | | - DRM_INFO("No hardware hang detected. Did some blocks stall?\n"); |
|---|
| 3298 | | - return 0; |
|---|
| 4129 | + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { |
|---|
| 4130 | + struct amdgpu_ring *ring = adev->rings[i]; |
|---|
| 4131 | + |
|---|
| 4132 | + if (!ring || !ring->sched.thread) |
|---|
| 4133 | + continue; |
|---|
| 4134 | + |
|---|
| 4135 | + spin_lock(&ring->sched.job_list_lock); |
|---|
| 4136 | + job = list_first_entry_or_null(&ring->sched.ring_mirror_list, |
|---|
| 4137 | + struct drm_sched_job, node); |
|---|
| 4138 | + spin_unlock(&ring->sched.job_list_lock); |
|---|
| 4139 | + if (job) |
|---|
| 4140 | + return true; |
|---|
| 4141 | + } |
|---|
| 4142 | + return false; |
|---|
| 4143 | +} |
|---|
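amdgpu_device_has_job_running() above only peeks: it takes each scheduler's job-list lock just long enough to test the head with list_first_entry_or_null() and drops it again. A userspace analogy of that locked peek, using a pthread mutex and illustrative names:

```c
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct node { struct node *next; };

static struct node *head;		/* mirror-list head */
static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static bool has_job_running(void)
{
	bool busy;

	pthread_mutex_lock(&list_lock);
	busy = (head != NULL);		/* list_first_entry_or_null() analog */
	pthread_mutex_unlock(&list_lock);
	return busy;
}

int main(void)
{
	struct node n = { NULL };

	printf("before queuing: busy=%s\n", has_job_running() ? "yes" : "no");
	head = &n;
	printf("after queuing:  busy=%s\n", has_job_running() ? "yes" : "no");
	return 0;
}
```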
| 4144 | + |
|---|
| 4145 | +/** |
|---|
| 4146 | + * amdgpu_device_should_recover_gpu - check if we should try GPU recovery |
|---|
| 4147 | + * |
|---|
| 4148 | + * @adev: amdgpu_device pointer |
|---|
| 4149 | + * |
|---|
| 4150 | + * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover |
|---|
| 4151 | + * a hung GPU. |
|---|
| 4152 | + */ |
|---|
| 4153 | +bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) |
|---|
| 4154 | +{ |
|---|
| 4155 | + if (!amdgpu_device_ip_check_soft_reset(adev)) { |
|---|
| 4156 | + dev_info(adev->dev, "Timeout, but no hardware hang detected.\n"); |
|---|
| 4157 | + return false; |
|---|
| 3299 | 4158 | } |
|---|
| 3300 | 4159 | |
|---|
| 3301 | | - if (!force && (amdgpu_gpu_recovery == 0 || |
|---|
| 3302 | | - (amdgpu_gpu_recovery == -1 && !amdgpu_sriov_vf(adev)))) { |
|---|
| 3303 | | - DRM_INFO("GPU recovery disabled.\n"); |
|---|
| 3304 | | - return 0; |
|---|
| 4160 | + if (amdgpu_gpu_recovery == 0) |
|---|
| 4161 | + goto disabled; |
|---|
| 4162 | + |
|---|
| 4163 | + if (amdgpu_sriov_vf(adev)) |
|---|
| 4164 | + return true; |
|---|
| 4165 | + |
|---|
| 4166 | + if (amdgpu_gpu_recovery == -1) { |
|---|
| 4167 | + switch (adev->asic_type) { |
|---|
| 4168 | + case CHIP_BONAIRE: |
|---|
| 4169 | + case CHIP_HAWAII: |
|---|
| 4170 | + case CHIP_TOPAZ: |
|---|
| 4171 | + case CHIP_TONGA: |
|---|
| 4172 | + case CHIP_FIJI: |
|---|
| 4173 | + case CHIP_POLARIS10: |
|---|
| 4174 | + case CHIP_POLARIS11: |
|---|
| 4175 | + case CHIP_POLARIS12: |
|---|
| 4176 | + case CHIP_VEGAM: |
|---|
| 4177 | + case CHIP_VEGA20: |
|---|
| 4178 | + case CHIP_VEGA10: |
|---|
| 4179 | + case CHIP_VEGA12: |
|---|
| 4180 | + case CHIP_RAVEN: |
|---|
| 4181 | + case CHIP_ARCTURUS: |
|---|
| 4182 | + case CHIP_RENOIR: |
|---|
| 4183 | + case CHIP_NAVI10: |
|---|
| 4184 | + case CHIP_NAVI14: |
|---|
| 4185 | + case CHIP_NAVI12: |
|---|
| 4186 | + case CHIP_SIENNA_CICHLID: |
|---|
| 4187 | + break; |
|---|
| 4188 | + default: |
|---|
| 4189 | + goto disabled; |
|---|
| 4190 | + } |
|---|
| 3305 | 4191 | } |
|---|
| 3306 | 4192 | |
|---|
| 3307 | | - dev_info(adev->dev, "GPU reset begin!\n"); |
|---|
| 4193 | + return true; |
|---|
| 3308 | 4194 | |
|---|
| 3309 | | - mutex_lock(&adev->lock_reset); |
|---|
| 3310 | | - atomic_inc(&adev->gpu_reset_counter); |
|---|
| 3311 | | - adev->in_gpu_reset = 1; |
|---|
| 4195 | +disabled: |
|---|
| 4196 | + dev_info(adev->dev, "GPU recovery disabled.\n"); |
|---|
| 4197 | + return false; |
|---|
| 4198 | +} |
|---|
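The function above implements a tri-state policy for the amdgpu_gpu_recovery module parameter: 0 forces recovery off, a positive value forces it on, and -1 defers to a per-ASIC allowlist (with SR-IOV VFs always opting in). A compact sketch of the same knob; the chip names here are placeholders:

```c
#include <stdbool.h>
#include <stdio.h>

enum chip { CHIP_OLD, CHIP_VEGA10, CHIP_NAVI10 };

static bool chip_supports_recovery(enum chip c)
{
	switch (c) {
	case CHIP_VEGA10:
	case CHIP_NAVI10:
		return true;			/* on the allowlist */
	default:
		return false;
	}
}

static bool should_recover(int knob, enum chip c)
{
	if (knob == 0)
		return false;			/* explicitly disabled */
	if (knob == 1)
		return true;			/* explicitly enabled */
	return chip_supports_recovery(c);	/* -1: auto, per chip */
}

int main(void)
{
	printf("auto on old chip: %d\n", should_recover(-1, CHIP_OLD));
	printf("auto on navi10:   %d\n", should_recover(-1, CHIP_NAVI10));
	printf("forced on old:    %d\n", should_recover(1, CHIP_OLD));
	return 0;
}
```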
| 3312 | 4199 | |
|---|
| 3313 | | - /* Block kfd */ |
|---|
| 3314 | | - amdgpu_amdkfd_pre_reset(adev); |
|---|
| 3315 | 4200 | |
|---|
| 3316 | | - /* block TTM */ |
|---|
| 3317 | | - resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev); |
|---|
| 4201 | +static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, |
|---|
| 4202 | + struct amdgpu_job *job, |
|---|
| 4203 | + bool *need_full_reset_arg) |
|---|
| 4204 | +{ |
|---|
| 4205 | + int i, r = 0; |
|---|
| 4206 | + bool need_full_reset = *need_full_reset_arg; |
|---|
| 4207 | + |
|---|
| 4208 | + amdgpu_debugfs_wait_dump(adev); |
|---|
| 4209 | + |
|---|
| 4210 | + if (amdgpu_sriov_vf(adev)) { |
|---|
| 4211 | + /* stop the data exchange thread */ |
|---|
| 4212 | + amdgpu_virt_fini_data_exchange(adev); |
|---|
| 4213 | + } |
|---|
| 3318 | 4214 | |
|---|
| 3319 | 4215 | /* block all schedulers and reset given job's ring */ |
|---|
| 3320 | 4216 | for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { |
|---|
| .. | .. |
|---|
| 3323 | 4219 | if (!ring || !ring->sched.thread) |
|---|
| 3324 | 4220 | continue; |
|---|
| 3325 | 4221 | |
|---|
| 3326 | | - kthread_park(ring->sched.thread); |
|---|
| 3327 | | - |
|---|
| 3328 | | - if (job && job->base.sched == &ring->sched) |
|---|
| 3329 | | - continue; |
|---|
| 3330 | | - |
|---|
| 3331 | | - drm_sched_hw_job_reset(&ring->sched, job ? &job->base : NULL); |
|---|
| 3332 | | - |
|---|
| 3333 | 4222 | /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ |
|---|
| 3334 | 4223 | amdgpu_fence_driver_force_completion(ring); |
|---|
| 3335 | 4224 | } |
|---|
| 3336 | 4225 | |
|---|
| 3337 | | - if (amdgpu_sriov_vf(adev)) |
|---|
| 3338 | | - r = amdgpu_device_reset_sriov(adev, job ? false : true); |
|---|
| 3339 | | - else |
|---|
| 3340 | | - r = amdgpu_device_reset(adev); |
|---|
| 4226 | + if(job) |
|---|
| 4227 | + drm_sched_increase_karma(&job->base); |
|---|
| 3341 | 4228 | |
|---|
| 3342 | | - for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { |
|---|
| 3343 | | - struct amdgpu_ring *ring = adev->rings[i]; |
|---|
| 4229 | + /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ |
|---|
| 4230 | + if (!amdgpu_sriov_vf(adev)) { |
|---|
| 3344 | 4231 | |
|---|
| 3345 | | - if (!ring || !ring->sched.thread) |
|---|
| 3346 | | - continue; |
|---|
| 4232 | + if (!need_full_reset) |
|---|
| 4233 | + need_full_reset = amdgpu_device_ip_need_full_reset(adev); |
|---|
| 3347 | 4234 | |
|---|
| 3348 | | - /* only need recovery sched of the given job's ring |
|---|
| 3349 | | - * or all rings (in the case @job is NULL) |
|---|
| 3350 | | - * after above amdgpu_reset accomplished |
|---|
| 3351 | | - */ |
|---|
| 3352 | | - if ((!job || job->base.sched == &ring->sched) && !r) |
|---|
| 3353 | | - drm_sched_job_recovery(&ring->sched); |
|---|
| 4235 | + if (!need_full_reset) { |
|---|
| 4236 | + amdgpu_device_ip_pre_soft_reset(adev); |
|---|
| 4237 | + r = amdgpu_device_ip_soft_reset(adev); |
|---|
| 4238 | + amdgpu_device_ip_post_soft_reset(adev); |
|---|
| 4239 | + if (r || amdgpu_device_ip_check_soft_reset(adev)) { |
|---|
| 4240 | + dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); |
|---|
| 4241 | + need_full_reset = true; |
|---|
| 4242 | + } |
|---|
| 4243 | + } |
|---|
| 3354 | 4244 | |
|---|
| 3355 | | - kthread_unpark(ring->sched.thread); |
|---|
| 4245 | + if (need_full_reset) |
|---|
| 4246 | + r = amdgpu_device_ip_suspend(adev); |
|---|
| 4247 | + |
|---|
| 4248 | + *need_full_reset_arg = need_full_reset; |
|---|
| 3356 | 4249 | } |
|---|
| 3357 | 4250 | |
|---|
| 3358 | | - if (!amdgpu_device_has_dc_support(adev)) { |
|---|
| 3359 | | - drm_helper_resume_force_mode(adev->ddev); |
|---|
| 4251 | + return r; |
|---|
| 4252 | +} |
|---|
| 4253 | + |
|---|
| 4254 | +static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, |
|---|
| 4255 | + struct list_head *device_list_handle, |
|---|
| 4256 | + bool *need_full_reset_arg, |
|---|
| 4257 | + bool skip_hw_reset) |
|---|
| 4258 | +{ |
|---|
| 4259 | + struct amdgpu_device *tmp_adev = NULL; |
|---|
| 4260 | + bool need_full_reset = *need_full_reset_arg, vram_lost = false; |
|---|
| 4261 | + int r = 0; |
|---|
| 4262 | + |
|---|
| 4263 | + /* |
|---|
| 4264 | + * ASIC reset has to be done on all XGMI hive nodes ASAP |
|---|
| 4265 | + * to allow proper link negotiation in FW (within 1 sec) |
|---|
| 4266 | + */ |
|---|
| 4267 | + if (!skip_hw_reset && need_full_reset) { |
|---|
| 4268 | + list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { |
|---|
| 4269 | + /* For XGMI run all resets in parallel to speed up the process */ |
|---|
| 4270 | + if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { |
|---|
| 4271 | + if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) |
|---|
| 4272 | + r = -EALREADY; |
|---|
| 4273 | + } else |
|---|
| 4274 | + r = amdgpu_asic_reset(tmp_adev); |
|---|
| 4275 | + |
|---|
| 4276 | + if (r) { |
|---|
| 4277 | + dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", |
|---|
| 4278 | + r, adev_to_drm(tmp_adev)->unique); |
|---|
| 4279 | + break; |
|---|
| 4280 | + } |
|---|
| 4281 | + } |
|---|
| 4282 | + |
|---|
| 4283 | + /* For XGMI wait for all resets to complete before proceed */ |
|---|
| 4284 | + if (!r) { |
|---|
| 4285 | + list_for_each_entry(tmp_adev, device_list_handle, |
|---|
| 4286 | + gmc.xgmi.head) { |
|---|
| 4287 | + if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { |
|---|
| 4288 | + flush_work(&tmp_adev->xgmi_reset_work); |
|---|
| 4289 | + r = tmp_adev->asic_reset_res; |
|---|
| 4290 | + if (r) |
|---|
| 4291 | + break; |
|---|
| 4292 | + } |
|---|
| 4293 | + } |
|---|
| 4294 | + } |
|---|
| 3360 | 4295 | } |
|---|
| 3361 | 4296 | |
|---|
| 3362 | | - ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched); |
|---|
| 4297 | + if (!r && amdgpu_ras_intr_triggered()) { |
|---|
| 4298 | + list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { |
|---|
| 4299 | + if (tmp_adev->mmhub.funcs && |
|---|
| 4300 | + tmp_adev->mmhub.funcs->reset_ras_error_count) |
|---|
| 4301 | + tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev); |
|---|
| 4302 | + } |
|---|
| 3363 | 4303 | |
|---|
| 3364 | | - if (r) { |
|---|
| 3365 | | - /* bad news, how to tell it to userspace ? */ |
|---|
| 3366 | | - dev_info(adev->dev, "GPU reset(%d) failed\n", atomic_read(&adev->gpu_reset_counter)); |
|---|
| 3367 | | - amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); |
|---|
| 4304 | + amdgpu_ras_intr_cleared(); |
|---|
| 4305 | + } |
|---|
| 4306 | + |
|---|
| 4307 | + list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { |
|---|
| 4308 | + if (need_full_reset) { |
|---|
| 4309 | + /* post card */ |
|---|
| 4310 | + if (amdgpu_device_asic_init(tmp_adev)) |
|---|
| 4311 | + dev_warn(tmp_adev->dev, "asic atom init failed!"); |
|---|
| 4312 | + |
|---|
| 4313 | + if (!r) { |
|---|
| 4314 | + dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); |
|---|
| 4315 | + r = amdgpu_amdkfd_resume_iommu(tmp_adev); |
|---|
| 4316 | + if (r) |
|---|
| 4317 | + goto out; |
|---|
| 4318 | + |
|---|
| 4319 | + r = amdgpu_device_ip_resume_phase1(tmp_adev); |
|---|
| 4320 | + if (r) |
|---|
| 4321 | + goto out; |
|---|
| 4322 | + |
|---|
| 4323 | + vram_lost = amdgpu_device_check_vram_lost(tmp_adev); |
|---|
| 4324 | + if (vram_lost) { |
|---|
| 4325 | + DRM_INFO("VRAM is lost due to GPU reset!\n"); |
|---|
| 4326 | + amdgpu_inc_vram_lost(tmp_adev); |
|---|
| 4327 | + } |
|---|
| 4328 | + |
|---|
| 4329 | + r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT)); |
|---|
| 4330 | + if (r) |
|---|
| 4331 | + goto out; |
|---|
| 4332 | + |
|---|
| 4333 | + r = amdgpu_device_fw_loading(tmp_adev); |
|---|
| 4334 | + if (r) |
|---|
| 4335 | + return r; |
|---|
| 4336 | + |
|---|
| 4337 | + r = amdgpu_device_ip_resume_phase2(tmp_adev); |
|---|
| 4338 | + if (r) |
|---|
| 4339 | + goto out; |
|---|
| 4340 | + |
|---|
| 4341 | + if (vram_lost) |
|---|
| 4342 | + amdgpu_device_fill_reset_magic(tmp_adev); |
|---|
| 4343 | + |
|---|
| 4344 | + /* |
|---|
| 4345 | + * Add this ASIC back as tracked, since the reset has |
|---|
| 4346 | + * completed successfully. |
|---|
| 4347 | + */ |
|---|
| 4348 | + amdgpu_register_gpu_instance(tmp_adev); |
|---|
| 4349 | + |
|---|
| 4350 | + r = amdgpu_device_ip_late_init(tmp_adev); |
|---|
| 4351 | + if (r) |
|---|
| 4352 | + goto out; |
|---|
| 4353 | + |
|---|
| 4354 | + amdgpu_fbdev_set_suspend(tmp_adev, 0); |
|---|
| 4355 | + |
|---|
| 4356 | + /* |
|---|
| 4357 | + * The GPU enters a bad state once the number of faulty |
|---|
| 4358 | + * pages flagged by ECC reaches the threshold, at which |
|---|
| 4359 | + * point RAS recovery is scheduled. So add one check here |
|---|
| 4360 | + * to break recovery if the bad page threshold has indeed |
|---|
| 4361 | + * been exceeded, and remind the user to either retire |
|---|
| 4362 | + * this GPU or set a bigger bad_page_threshold value to |
|---|
| 4363 | + * work around this the next time the driver is |
|---|
| 4364 | + * probed. |
|---|
| 4365 | + */ |
|---|
| 4366 | + if (!amdgpu_ras_check_err_threshold(tmp_adev)) { |
|---|
| 4367 | + /* must succeed. */ |
|---|
| 4368 | + amdgpu_ras_resume(tmp_adev); |
|---|
| 4369 | + } else { |
|---|
| 4370 | + r = -EINVAL; |
|---|
| 4371 | + goto out; |
|---|
| 4372 | + } |
|---|
| 4373 | + |
|---|
| 4374 | + /* Update PSP FW topology after reset */ |
|---|
| 4375 | + if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1) |
|---|
| 4376 | + r = amdgpu_xgmi_update_topology(hive, tmp_adev); |
|---|
| 4377 | + } |
|---|
| 4378 | + } |
|---|
| 4379 | + |
|---|
| 4380 | +out: |
|---|
| 4381 | + if (!r) { |
|---|
| 4382 | + amdgpu_irq_gpu_reset_resume_helper(tmp_adev); |
|---|
| 4383 | + r = amdgpu_ib_ring_tests(tmp_adev); |
|---|
| 4384 | + if (r) { |
|---|
| 4385 | + dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); |
|---|
| 4386 | + need_full_reset = true; |
|---|
| 4387 | + r = -EAGAIN; |
|---|
| 4388 | + goto end; |
|---|
| 4389 | + } |
|---|
| 4390 | + } |
|---|
| 4391 | + |
|---|
| 4392 | + if (!r) |
|---|
| 4393 | + r = amdgpu_device_recover_vram(tmp_adev); |
|---|
| 4394 | + else |
|---|
| 4395 | + tmp_adev->asic_reset_res = r; |
|---|
| 4396 | + } |
|---|
| 4397 | + |
|---|
| 4398 | +end: |
|---|
| 4399 | + *need_full_reset_arg = need_full_reset; |
|---|
| 4400 | + return r; |
|---|
| 4401 | +} |
|---|
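For XGMI hives, amdgpu_do_asic_reset() above fans the per-node resets out onto the unbound workqueue and then flushes each work item, collecting every node's asic_reset_res before proceeding. Below is a pthreads analogy of that fan-out/fan-in step; it is illustrative only, not the driver's mechanism.

```c
#include <pthread.h>
#include <stdio.h>

#define NODES 4

struct node {
	pthread_t thread;
	int reset_res;
};

static void *do_reset(void *arg)
{
	struct node *n = arg;

	n->reset_res = 0;	/* pretend the per-node reset succeeded */
	return NULL;
}

int main(void)
{
	struct node nodes[NODES] = {0};
	int i, r = 0;

	/* Fan out: all node resets run concurrently, like queue_work(). */
	for (i = 0; i < NODES; i++)
		pthread_create(&nodes[i].thread, NULL, do_reset, &nodes[i]);

	/* Fan in: join each one and keep the first failure, like flush_work(). */
	for (i = 0; i < NODES; i++) {
		pthread_join(nodes[i].thread, NULL);
		if (!r)
			r = nodes[i].reset_res;
	}
	printf("hive reset result: %d\n", r);
	return 0;
}
```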
| 4402 | + |
|---|
| 4403 | +static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, |
|---|
| 4404 | + struct amdgpu_hive_info *hive) |
|---|
| 4405 | +{ |
|---|
| 4406 | + if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0) |
|---|
| 4407 | + return false; |
|---|
| 4408 | + |
|---|
| 4409 | + if (hive) { |
|---|
| 4410 | + down_write_nest_lock(&adev->reset_sem, &hive->hive_lock); |
|---|
| 3368 | 4411 | } else { |
|---|
| 3369 | | - dev_info(adev->dev, "GPU reset(%d) succeeded!\n",atomic_read(&adev->gpu_reset_counter)); |
|---|
| 4412 | + down_write(&adev->reset_sem); |
|---|
| 3370 | 4413 | } |
|---|
| 3371 | 4414 | |
|---|
| 3372 | | - /*unlock kfd */ |
|---|
| 3373 | | - amdgpu_amdkfd_post_reset(adev); |
|---|
| 4415 | + atomic_inc(&adev->gpu_reset_counter); |
|---|
| 4416 | + switch (amdgpu_asic_reset_method(adev)) { |
|---|
| 4417 | + case AMD_RESET_METHOD_MODE1: |
|---|
| 4418 | + adev->mp1_state = PP_MP1_STATE_SHUTDOWN; |
|---|
| 4419 | + break; |
|---|
| 4420 | + case AMD_RESET_METHOD_MODE2: |
|---|
| 4421 | + adev->mp1_state = PP_MP1_STATE_RESET; |
|---|
| 4422 | + break; |
|---|
| 4423 | + default: |
|---|
| 4424 | + adev->mp1_state = PP_MP1_STATE_NONE; |
|---|
| 4425 | + break; |
|---|
| 4426 | + } |
|---|
| 4427 | + |
|---|
| 4428 | + return true; |
|---|
| 4429 | +} |
|---|
| 4430 | + |
|---|
| 4431 | +static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) |
|---|
| 4432 | +{ |
|---|
| 3374 | 4433 | amdgpu_vf_error_trans_all(adev); |
|---|
| 3375 | | - adev->in_gpu_reset = 0; |
|---|
| 3376 | | - mutex_unlock(&adev->lock_reset); |
|---|
| 4434 | + adev->mp1_state = PP_MP1_STATE_NONE; |
|---|
| 4435 | + atomic_set(&adev->in_gpu_reset, 0); |
|---|
| 4436 | + up_write(&adev->reset_sem); |
|---|
| 4437 | +} |
|---|
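amdgpu_device_lock_adev() above gates reset entry with an atomic 0 to 1 compare-exchange, so exactly one caller wins and everyone else bails out instead of blocking. A C11-atomics sketch of that trylock gate, with userspace stand-ins for the reset state:

```c
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int in_gpu_reset;		/* 0 = idle, 1 = reset in flight */

static bool reset_trylock(void)
{
	int expected = 0;

	/* Succeeds only for the caller that swaps 0 -> 1, mirroring the
	 * atomic_cmpxchg() gate above; losers return without blocking. */
	return atomic_compare_exchange_strong(&in_gpu_reset, &expected, 1);
}

static void reset_unlock(void)
{
	atomic_store(&in_gpu_reset, 0);
}

int main(void)
{
	printf("first taker:  %s\n", reset_trylock() ? "got it" : "busy");
	printf("second taker: %s\n", reset_trylock() ? "got it" : "busy");
	reset_unlock();
	printf("after unlock: %s\n", reset_trylock() ? "got it" : "busy");
	return 0;
}
```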
| 4438 | + |
|---|
| 4439 | +static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) |
|---|
| 4440 | +{ |
|---|
| 4441 | + struct pci_dev *p = NULL; |
|---|
| 4442 | + |
|---|
| 4443 | + p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), |
|---|
| 4444 | + adev->pdev->bus->number, 1); |
|---|
| 4445 | + if (p) { |
|---|
| 4446 | + pm_runtime_enable(&(p->dev)); |
|---|
| 4447 | + pm_runtime_resume(&(p->dev)); |
|---|
| 4448 | + } |
|---|
| 4449 | + |
|---|
| 4450 | + pci_dev_put(p); |
|---|
| 4451 | +} |
|---|
| 4452 | + |
|---|
| 4453 | +static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) |
|---|
| 4454 | +{ |
|---|
| 4455 | + enum amd_reset_method reset_method; |
|---|
| 4456 | + struct pci_dev *p = NULL; |
|---|
| 4457 | + u64 expires; |
|---|
| 4458 | + |
|---|
| 4459 | + /* |
|---|
| 4460 | + * For now, only BACO and mode1 reset are confirmed |
|---|
| 4461 | + * to suffer from the audio issue when not properly suspended. |
|---|
| 4462 | + */ |
|---|
| 4463 | + reset_method = amdgpu_asic_reset_method(adev); |
|---|
| 4464 | + if ((reset_method != AMD_RESET_METHOD_BACO) && |
|---|
| 4465 | + (reset_method != AMD_RESET_METHOD_MODE1)) |
|---|
| 4466 | + return -EINVAL; |
|---|
| 4467 | + |
|---|
| 4468 | + p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), |
|---|
| 4469 | + adev->pdev->bus->number, 1); |
|---|
| 4470 | + if (!p) |
|---|
| 4471 | + return -ENODEV; |
|---|
| 4472 | + |
|---|
| 4473 | + expires = pm_runtime_autosuspend_expiration(&(p->dev)); |
|---|
| 4474 | + if (!expires) |
|---|
| 4475 | + /* |
|---|
| 4476 | + * If we cannot get the audio device autosuspend delay, |
|---|
| 4477 | + * a fixed 4s interval will be used. Since 3s is the |
|---|
| 4478 | + * audio controller's default autosuspend delay setting, |
|---|
| 4479 | + * the 4s used here is guaranteed to cover it. |
|---|
| 4480 | + */ |
|---|
| 4481 | + expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; |
|---|
| 4482 | + |
|---|
| 4483 | + while (!pm_runtime_status_suspended(&(p->dev))) { |
|---|
| 4484 | + if (!pm_runtime_suspend(&(p->dev))) |
|---|
| 4485 | + break; |
|---|
| 4486 | + |
|---|
| 4487 | + if (expires < ktime_get_mono_fast_ns()) { |
|---|
| 4488 | + dev_warn(adev->dev, "failed to suspend display audio\n"); |
|---|
| 4489 | + pci_dev_put(p); |
|---|
| 4490 | + /* TODO: abort the succeeding gpu reset? */ |
|---|
| 4491 | + return -ETIMEDOUT; |
|---|
| 4492 | + } |
|---|
| 4493 | + } |
|---|
| 4494 | + |
|---|
| 4495 | + pm_runtime_disable(&(p->dev)); |
|---|
| 4496 | + |
|---|
| 4497 | + pci_dev_put(p); |
|---|
| 4498 | + return 0; |
|---|
| 4499 | +} |
|---|
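amdgpu_device_suspend_display_audio() above computes an absolute expiry up front and then retries the runtime suspend until it either sticks or the deadline passes. A minimal userspace sketch of that deadline-bounded retry loop; try_suspend() is a hypothetical stand-in for the real pm_runtime calls.

```c
#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define NSEC_PER_SEC 1000000000ULL

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec;
}

static int try_suspend(void)
{
	static int attempts;

	return (++attempts >= 3) ? 0 : -1;	/* succeed on the 3rd try */
}

int main(void)
{
	/* Fixed 4s budget, like the fallback in the code above. */
	uint64_t expires = now_ns() + 4 * NSEC_PER_SEC;

	while (try_suspend() != 0) {
		if (now_ns() > expires) {
			fprintf(stderr, "timed out\n");
			return 1;
		}
	}
	puts("suspended within the deadline");
	return 0;
}
```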
| 4500 | + |
|---|
| 4501 | +/** |
|---|
| 4502 | + * amdgpu_device_gpu_recover - reset the asic and recover scheduler |
|---|
| 4503 | + * |
|---|
| 4504 | + * @adev: amdgpu_device pointer |
|---|
| 4505 | + * @job: which job trigger hang |
|---|
| 4506 | + * |
|---|
| 4507 | + * Attempt to reset the GPU if it has hung (all asics). |
|---|
| 4508 | + * Attempt to do soft-reset or full-reset and reinitialize Asic |
|---|
| 4509 | + * Returns 0 for success or an error on failure. |
|---|
| 4510 | + */ |
|---|
| 4511 | + |
|---|
| 4512 | +int amdgpu_device_gpu_recover(struct amdgpu_device *adev, |
|---|
| 4513 | + struct amdgpu_job *job) |
|---|
| 4514 | +{ |
|---|
| 4515 | + struct list_head device_list, *device_list_handle = NULL; |
|---|
| 4516 | + bool need_full_reset = false; |
|---|
| 4517 | + bool job_signaled = false; |
|---|
| 4518 | + struct amdgpu_hive_info *hive = NULL; |
|---|
| 4519 | + struct amdgpu_device *tmp_adev = NULL; |
|---|
| 4520 | + int i, r = 0; |
|---|
| 4521 | + bool need_emergency_restart = false; |
|---|
| 4522 | + bool audio_suspended = false; |
|---|
| 4523 | + |
|---|
| 4524 | + /* |
|---|
| 4525 | + * Special case: RAS triggered and full reset isn't supported |
|---|
| 4526 | + */ |
|---|
| 4527 | + need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); |
|---|
| 4528 | + |
|---|
| 4529 | + /* |
|---|
| 4530 | + * Flush RAM to disk so that after reboot |
|---|
| 4531 | + * the user can read log and see why the system rebooted. |
|---|
| 4532 | + */ |
|---|
| 4533 | + if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { |
|---|
| 4534 | + DRM_WARN("Emergency reboot."); |
|---|
| 4535 | + |
|---|
| 4536 | + ksys_sync_helper(); |
|---|
| 4537 | + emergency_restart(); |
|---|
| 4538 | + } |
|---|
| 4539 | + |
|---|
| 4540 | + dev_info(adev->dev, "GPU %s begin!\n", |
|---|
| 4541 | + need_emergency_restart ? "jobs stop":"reset"); |
|---|
| 4542 | + |
|---|
| 4543 | + /* |
|---|
| 4544 | + * Here we trylock to avoid chain of resets executing from |
|---|
| 4545 | + * either trigger by jobs on different adevs in XGMI hive or jobs on |
|---|
| 4546 | + * different schedulers for same device while this TO handler is running. |
|---|
| 4547 | + * We always reset all schedulers for device and all devices for XGMI |
|---|
| 4548 | + * hive so that should take care of them too. |
|---|
| 4549 | + */ |
|---|
| 4550 | + hive = amdgpu_get_xgmi_hive(adev); |
|---|
| 4551 | + if (hive) { |
|---|
| 4552 | + if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) { |
|---|
| 4553 | + DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress", |
|---|
| 4554 | + job ? job->base.id : -1, hive->hive_id); |
|---|
| 4555 | + amdgpu_put_xgmi_hive(hive); |
|---|
| 4556 | + return 0; |
|---|
| 4557 | + } |
|---|
| 4558 | + mutex_lock(&hive->hive_lock); |
|---|
| 4559 | + } |
|---|
| 4560 | + |
|---|
| 4561 | + /* |
|---|
| 4562 | + * Build list of devices to reset. |
|---|
| 4563 | + * In case we are in XGMI hive mode, resort the device list |
|---|
| 4564 | + * to put adev in the 1st position. |
|---|
| 4565 | + */ |
|---|
| 4566 | + INIT_LIST_HEAD(&device_list); |
|---|
| 4567 | + if (adev->gmc.xgmi.num_physical_nodes > 1) { |
|---|
| 4568 | + if (!hive) |
|---|
| 4569 | + return -ENODEV; |
|---|
| 4570 | + if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list)) |
|---|
| 4571 | + list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list); |
|---|
| 4572 | + device_list_handle = &hive->device_list; |
|---|
| 4573 | + } else { |
|---|
| 4574 | + list_add_tail(&adev->gmc.xgmi.head, &device_list); |
|---|
| 4575 | + device_list_handle = &device_list; |
|---|
| 4576 | + } |
|---|
| 4577 | + |
|---|
| 4578 | + /* block all schedulers and reset given job's ring */ |
|---|
| 4579 | + list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { |
|---|
| 4580 | + if (!amdgpu_device_lock_adev(tmp_adev, hive)) { |
|---|
| 4581 | + dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress", |
|---|
| 4582 | + job ? job->base.id : -1); |
|---|
| 4583 | + r = 0; |
|---|
| 4584 | + goto skip_recovery; |
|---|
| 4585 | + } |
|---|
| 4586 | + |
|---|
| 4587 | + /* |
|---|
| 4588 | + * Try to put the audio codec into suspend state |
|---|
| 4589 | + * before gpu reset started. |
|---|
| 4590 | + * |
|---|
| 4591 | + * This is because the power domain of the graphics device |
|---|
| 4592 | + * is shared with the AZ power domain. Without this, |
|---|
| 4593 | + * we may change the audio hardware from behind |
|---|
| 4594 | + * the audio driver's back. That will trigger |
|---|
| 4595 | + * some audio codec errors. |
|---|
| 4596 | + */ |
|---|
| 4597 | + if (!amdgpu_device_suspend_display_audio(tmp_adev)) |
|---|
| 4598 | + audio_suspended = true; |
|---|
| 4599 | + |
|---|
| 4600 | + amdgpu_ras_set_error_query_ready(tmp_adev, false); |
|---|
| 4601 | + |
|---|
| 4602 | + cancel_delayed_work_sync(&tmp_adev->delayed_init_work); |
|---|
| 4603 | + |
|---|
| 4604 | + if (!amdgpu_sriov_vf(tmp_adev)) |
|---|
| 4605 | + amdgpu_amdkfd_pre_reset(tmp_adev); |
|---|
| 4606 | + |
|---|
| 4607 | + /* |
|---|
| 4608 | + * Mark these ASICs to be reset as untracked first,
|---|
| 4609 | + * and add them back after the reset completes.
|---|
| 4610 | + */ |
|---|
| 4611 | + amdgpu_unregister_gpu_instance(tmp_adev); |
|---|
| 4612 | + |
|---|
| 4613 | + amdgpu_fbdev_set_suspend(tmp_adev, 1); |
|---|
| 4614 | + |
|---|
| 4615 | + /* disable RAS on ALL IPs */
|---|
| 4616 | + if (!need_emergency_restart && |
|---|
| 4617 | + amdgpu_device_ip_need_full_reset(tmp_adev)) |
|---|
| 4618 | + amdgpu_ras_suspend(tmp_adev); |
|---|
| 4619 | + |
|---|
| 4620 | + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { |
|---|
| 4621 | + struct amdgpu_ring *ring = tmp_adev->rings[i]; |
|---|
| 4622 | + |
|---|
| 4623 | + if (!ring || !ring->sched.thread) |
|---|
| 4624 | + continue; |
|---|
| 4625 | + |
|---|
| 4626 | + drm_sched_stop(&ring->sched, job ? &job->base : NULL); |
|---|
| 4627 | + |
|---|
| 4628 | + if (need_emergency_restart) |
|---|
| 4629 | + amdgpu_job_stop_all_jobs_on_sched(&ring->sched); |
|---|
| 4630 | + } |
|---|
| 4631 | + } |
|---|
| 4632 | + |
|---|
| 4633 | + if (need_emergency_restart) |
|---|
| 4634 | + goto skip_sched_resume; |
|---|
| 4635 | + |
|---|
| 4636 | + /* |
|---|
| 4637 | + * Must check the guilty signal here, since after this point all old
|---|
| 4638 | + * HW fences are force signaled. |
|---|
| 4639 | + * |
|---|
| 4640 | + * job->base holds a reference to the parent fence.
|---|
| 4641 | + */ |
|---|
| 4642 | + if (job && job->base.s_fence->parent && |
|---|
| 4643 | + dma_fence_is_signaled(job->base.s_fence->parent)) { |
|---|
| 4644 | + job_signaled = true; |
|---|
| 4645 | + dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); |
|---|
| 4646 | + goto skip_hw_reset; |
|---|
| 4647 | + } |
|---|
| 4648 | + |
|---|
| 4649 | +retry: /* Pre-ASIC-reset the rest of the adevs in the XGMI hive. */
|---|
| 4650 | + list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { |
|---|
| 4651 | + r = amdgpu_device_pre_asic_reset(tmp_adev, |
|---|
| 4652 | + (tmp_adev == adev) ? job : NULL, |
|---|
| 4653 | + &need_full_reset); |
|---|
| 4654 | + /* TODO: should we stop here? */
|---|
| 4655 | + if (r) { |
|---|
| 4656 | + dev_err(tmp_adev->dev, "GPU pre-ASIC reset failed with err %d for drm dev %s",
|---|
| 4657 | + r, adev_to_drm(tmp_adev)->unique); |
|---|
| 4658 | + tmp_adev->asic_reset_res = r; |
|---|
| 4659 | + } |
|---|
| 4660 | + } |
|---|
| 4661 | + |
|---|
| 4662 | + /* Actual ASIC resets if needed. */
|---|
| 4663 | + /* TODO Implement XGMI hive reset logic for SRIOV */ |
|---|
| 4664 | + if (amdgpu_sriov_vf(adev)) { |
|---|
| 4665 | + r = amdgpu_device_reset_sriov(adev, job ? false : true); |
|---|
| 4666 | + if (r) |
|---|
| 4667 | + adev->asic_reset_res = r; |
|---|
| 4668 | + } else { |
|---|
| 4669 | + r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false); |
|---|
| 4670 | + if (r == -EAGAIN)
|---|
| 4671 | + goto retry; |
|---|
| 4672 | + } |
|---|
| 4673 | + |
|---|
| 4674 | +skip_hw_reset: |
|---|
| 4675 | + |
|---|
| 4676 | + /* Post ASIC reset for all devs. */
|---|
| 4677 | + list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { |
|---|
| 4678 | + |
|---|
| 4679 | + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { |
|---|
| 4680 | + struct amdgpu_ring *ring = tmp_adev->rings[i]; |
|---|
| 4681 | + |
|---|
| 4682 | + if (!ring || !ring->sched.thread) |
|---|
| 4683 | + continue; |
|---|
| 4684 | + |
|---|
| 4685 | + /* No point in resubmitting jobs if we didn't do a HW reset */
|---|
| 4686 | + if (!tmp_adev->asic_reset_res && !job_signaled) |
|---|
| 4687 | + drm_sched_resubmit_jobs(&ring->sched); |
|---|
| 4688 | + |
|---|
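| | + /* restart the scheduler; request full recovery only if the ASIC reset succeeded */
|---|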
| 4689 | + drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res); |
|---|
| 4690 | + } |
|---|
| 4691 | + |
|---|
| 4692 | + if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) { |
|---|
| 4693 | + drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); |
|---|
| 4694 | + } |
|---|
| 4695 | + |
|---|
| 4696 | + tmp_adev->asic_reset_res = 0; |
|---|
| 4697 | + |
|---|
| 4698 | + if (r) { |
|---|
| 4699 | + /* bad news, how do we tell it to userspace? */
|---|
| 4700 | + dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); |
|---|
| 4701 | + amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); |
|---|
| 4702 | + } else { |
|---|
| 4703 | + dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); |
|---|
| 4704 | + } |
|---|
| 4705 | + } |
|---|
| 4706 | + |
|---|
| 4707 | +skip_sched_resume: |
|---|
| 4708 | + list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { |
|---|
| 4709 | + /* unlock kfd: SRIOV would do it separately */
|---|
| 4710 | + if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) |
|---|
| 4711 | + amdgpu_amdkfd_post_reset(tmp_adev); |
|---|
| 4712 | + if (audio_suspended) |
|---|
| 4713 | + amdgpu_device_resume_display_audio(tmp_adev); |
|---|
| 4714 | + amdgpu_device_unlock_adev(tmp_adev); |
|---|
| 4715 | + } |
|---|
| 4716 | + |
|---|
| 4717 | +skip_recovery: |
|---|
| 4718 | + if (hive) { |
|---|
| 4719 | + atomic_set(&hive->in_reset, 0); |
|---|
| 4720 | + mutex_unlock(&hive->hive_lock); |
|---|
| 4721 | + amdgpu_put_xgmi_hive(hive); |
|---|
| 4722 | + } |
|---|
| 4723 | + |
|---|
| 4724 | + if (r) |
|---|
| 4725 | + dev_info(adev->dev, "GPU reset end with ret = %d\n", r); |
|---|
| 3377 | 4726 | return r; |
|---|
| 3378 | 4727 | } |
|---|
| 3379 | 4728 | |
|---|
| .. | .. |
|---|
| 3389 | 4738 | static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) |
|---|
| 3390 | 4739 | { |
|---|
| 3391 | 4740 | struct pci_dev *pdev; |
|---|
| 3392 | | - enum pci_bus_speed speed_cap; |
|---|
| 3393 | | - enum pcie_link_width link_width; |
|---|
| 4741 | + enum pci_bus_speed speed_cap, platform_speed_cap; |
|---|
| 4742 | + enum pcie_link_width platform_link_width; |
|---|
| 3394 | 4743 | |
|---|
| 3395 | 4744 | if (amdgpu_pcie_gen_cap) |
|---|
| 3396 | 4745 | adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; |
|---|
| .. | .. |
|---|
| 3406 | 4755 | adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; |
|---|
| 3407 | 4756 | return; |
|---|
| 3408 | 4757 | } |
|---|
| 4758 | + |
|---|
| 4759 | + if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) |
|---|
| 4760 | + return; |
|---|
| 4761 | + |
|---|
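| | + /* the link speed and width actually available on the path to the
|---|
| | + * root port bound the platform caps derived below */
|---|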
| 4762 | + pcie_bandwidth_available(adev->pdev, NULL, |
|---|
| 4763 | + &platform_speed_cap, &platform_link_width); |
|---|
| 3409 | 4764 | |
|---|
| 3410 | 4765 | if (adev->pm.pcie_gen_mask == 0) { |
|---|
| 3411 | 4766 | /* asic caps */ |
|---|
| .. | .. |
|---|
| 3432 | 4787 | adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; |
|---|
| 3433 | 4788 | } |
|---|
| 3434 | 4789 | /* platform caps */ |
|---|
| 3435 | | - pdev = adev->ddev->pdev->bus->self; |
|---|
| 3436 | | - speed_cap = pcie_get_speed_cap(pdev); |
|---|
| 3437 | | - if (speed_cap == PCI_SPEED_UNKNOWN) { |
|---|
| 4790 | + if (platform_speed_cap == PCI_SPEED_UNKNOWN) { |
|---|
| 3438 | 4791 | adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | |
|---|
| 3439 | 4792 | CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); |
|---|
| 3440 | 4793 | } else { |
|---|
| 3441 | | - if (speed_cap == PCIE_SPEED_16_0GT) |
|---|
| 4794 | + if (platform_speed_cap == PCIE_SPEED_16_0GT) |
|---|
| 3442 | 4795 | adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | |
|---|
| 3443 | 4796 | CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | |
|---|
| 3444 | 4797 | CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | |
|---|
| 3445 | 4798 | CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); |
|---|
| 3446 | | - else if (speed_cap == PCIE_SPEED_8_0GT) |
|---|
| 4799 | + else if (platform_speed_cap == PCIE_SPEED_8_0GT) |
|---|
| 3447 | 4800 | adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | |
|---|
| 3448 | 4801 | CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | |
|---|
| 3449 | 4802 | CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); |
|---|
| 3450 | | - else if (speed_cap == PCIE_SPEED_5_0GT) |
|---|
| 4803 | + else if (platform_speed_cap == PCIE_SPEED_5_0GT) |
|---|
| 3451 | 4804 | adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | |
|---|
| 3452 | 4805 | CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); |
|---|
| 3453 | 4806 | else |
|---|
| .. | .. |
|---|
| 3456 | 4809 | } |
|---|
| 3457 | 4810 | } |
|---|
| 3458 | 4811 | if (adev->pm.pcie_mlw_mask == 0) { |
|---|
| 3459 | | - pdev = adev->ddev->pdev->bus->self; |
|---|
| 3460 | | - link_width = pcie_get_width_cap(pdev); |
|---|
| 3461 | | - if (link_width == PCIE_LNK_WIDTH_UNKNOWN) { |
|---|
| 4812 | + if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { |
|---|
| 3462 | 4813 | adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; |
|---|
| 3463 | 4814 | } else { |
|---|
| 3464 | | - switch (link_width) { |
|---|
| 4815 | + switch (platform_link_width) { |
|---|
| 3465 | 4816 | case PCIE_LNK_X32: |
|---|
| 3466 | 4817 | adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | |
|---|
| 3467 | 4818 | CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | |
|---|
| .. | .. |
|---|
| 3511 | 4862 | } |
|---|
| 3512 | 4863 | } |
|---|
| 3513 | 4864 | |
|---|
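| | +/**
|---|
| | + * amdgpu_device_baco_enter - enter BACO (Bus Active, Chip Off)
|---|
| | + *
|---|
| | + * @dev: drm_device pointer
|---|
| | + *
|---|
| | + * Puts the chip into the BACO low-power state via the DPM interface.
|---|
| | + * If RAS is supported, the NBIO doorbell interrupt is disabled first.
|---|
| | + *
|---|
| | + * Returns -ENOTSUPP if the device does not support BACO, otherwise
|---|
| | + * the result of amdgpu_dpm_baco_enter().
|---|
| | + */
|---|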
| 4865 | +int amdgpu_device_baco_enter(struct drm_device *dev) |
|---|
| 4866 | +{ |
|---|
| 4867 | + struct amdgpu_device *adev = drm_to_adev(dev); |
|---|
| 4868 | + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); |
|---|
| 4869 | + |
|---|
| 4870 | + if (!amdgpu_device_supports_baco(adev_to_drm(adev))) |
|---|
| 4871 | + return -ENOTSUPP; |
|---|
| 4872 | + |
|---|
| 4873 | + if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt) |
|---|
| 4874 | + adev->nbio.funcs->enable_doorbell_interrupt(adev, false); |
|---|
| 4875 | + |
|---|
| 4876 | + return amdgpu_dpm_baco_enter(adev); |
|---|
| 4877 | +} |
|---|
| 4878 | + |
|---|
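| | +/**
|---|
| | + * amdgpu_device_baco_exit - exit BACO (Bus Active, Chip Off)
|---|
| | + *
|---|
| | + * @dev: drm_device pointer
|---|
| | + *
|---|
| | + * Brings the chip back out of BACO via the DPM interface and, if RAS
|---|
| | + * is supported, re-enables the NBIO doorbell interrupt.
|---|
| | + *
|---|
| | + * Returns 0 on success, -ENOTSUPP if the device does not support
|---|
| | + * BACO, or the error returned by amdgpu_dpm_baco_exit().
|---|
| | + */
|---|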
| 4879 | +int amdgpu_device_baco_exit(struct drm_device *dev) |
|---|
| 4880 | +{ |
|---|
| 4881 | + struct amdgpu_device *adev = drm_to_adev(dev); |
|---|
| 4882 | + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); |
|---|
| 4883 | + int ret = 0; |
|---|
| 4884 | + |
|---|
| 4885 | + if (!amdgpu_device_supports_baco(adev_to_drm(adev))) |
|---|
| 4886 | + return -ENOTSUPP; |
|---|
| 4887 | + |
|---|
| 4888 | + ret = amdgpu_dpm_baco_exit(adev); |
|---|
| 4889 | + if (ret) |
|---|
| 4890 | + return ret; |
|---|
| 4891 | + |
|---|
| 4892 | + if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt) |
|---|
| 4893 | + adev->nbio.funcs->enable_doorbell_interrupt(adev, true); |
|---|
| 4894 | + |
|---|
| 4895 | + return 0; |
|---|
| 4896 | +} |
|---|
| 4897 | + |
|---|
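| | +/**
|---|
| | + * amdgpu_cancel_all_tdr - cancel all pending timeout handlers
|---|
| | + *
|---|
| | + * @adev: amdgpu_device pointer
|---|
| | + *
|---|
| | + * Cancels, and waits for, the delayed timeout (TDR) work of every
|---|
| | + * active scheduler ring on the device.
|---|
| | + */
|---|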
| 4898 | +static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev) |
|---|
| 4899 | +{ |
|---|
| 4900 | + int i; |
|---|
| 4901 | + |
|---|
| 4902 | + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { |
|---|
| 4903 | + struct amdgpu_ring *ring = adev->rings[i]; |
|---|
| 4904 | + |
|---|
| 4905 | + if (!ring || !ring->sched.thread) |
|---|
| 4906 | + continue; |
|---|
| 4907 | + |
|---|
| 4908 | + cancel_delayed_work_sync(&ring->sched.work_tdr); |
|---|
| 4909 | + } |
|---|
| 4910 | +} |
|---|
| 4911 | + |
|---|
| 4912 | +/** |
|---|
| 4913 | + * amdgpu_pci_error_detected - Called when a PCI error is detected. |
|---|
| 4914 | + * @pdev: PCI device struct |
|---|
| 4915 | + * @state: PCI channel state |
|---|
| 4916 | + * |
|---|
| 4917 | + * Description: Called when a PCI error is detected. |
|---|
| 4918 | + * |
|---|
| 4919 | + * Return: PCI_ERS_RESULT_CAN_RECOVER, PCI_ERS_RESULT_NEED_RESET or
|---|
| | + * PCI_ERS_RESULT_DISCONNECT.
|---|
| 4920 | + */ |
|---|
| 4921 | +pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) |
|---|
| 4922 | +{ |
|---|
| 4923 | + struct drm_device *dev = pci_get_drvdata(pdev); |
|---|
| 4924 | + struct amdgpu_device *adev = drm_to_adev(dev); |
|---|
| 4925 | + int i; |
|---|
| 4926 | + |
|---|
| 4927 | + DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); |
|---|
| 4928 | + |
|---|
| 4929 | + if (adev->gmc.xgmi.num_physical_nodes > 1) { |
|---|
| 4930 | + DRM_WARN("No support for XGMI hive yet..."); |
|---|
| 4931 | + return PCI_ERS_RESULT_DISCONNECT; |
|---|
| 4932 | + } |
|---|
| 4933 | + |
|---|
| 4934 | + switch (state) { |
|---|
| 4935 | + case pci_channel_io_normal: |
|---|
| 4936 | + return PCI_ERS_RESULT_CAN_RECOVER; |
|---|
| 4937 | + /* Fatal error, prepare for slot reset */ |
|---|
| 4938 | + case pci_channel_io_frozen: |
|---|
| 4939 | + /* |
|---|
| 4940 | + * Cancel and wait for all TDRs in progress if we fail to
|---|
| 4941 | + * set adev->in_gpu_reset in amdgpu_device_lock_adev.
|---|
| 4942 | + *
|---|
| 4943 | + * Locking adev->reset_sem prevents any external access
|---|
| 4944 | + * to the GPU during PCI error recovery.
|---|
| 4945 | + */ |
|---|
| 4946 | + while (!amdgpu_device_lock_adev(adev, NULL)) |
|---|
| 4947 | + amdgpu_cancel_all_tdr(adev); |
|---|
| 4948 | + |
|---|
| 4949 | + /* |
|---|
| 4950 | + * Block any work scheduling as we do for regular GPU reset |
|---|
| 4951 | + * for the duration of the recovery |
|---|
| 4952 | + */ |
|---|
| 4953 | + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { |
|---|
| 4954 | + struct amdgpu_ring *ring = adev->rings[i]; |
|---|
| 4955 | + |
|---|
| 4956 | + if (!ring || !ring->sched.thread) |
|---|
| 4957 | + continue; |
|---|
| 4958 | + |
|---|
| 4959 | + drm_sched_stop(&ring->sched, NULL); |
|---|
| 4960 | + } |
|---|
| 4961 | + return PCI_ERS_RESULT_NEED_RESET; |
|---|
| 4962 | + case pci_channel_io_perm_failure: |
|---|
| 4963 | + /* Permanent error, prepare for device removal */ |
|---|
| 4964 | + return PCI_ERS_RESULT_DISCONNECT; |
|---|
| 4965 | + } |
|---|
| 4966 | + |
|---|
| 4967 | + return PCI_ERS_RESULT_NEED_RESET; |
|---|
| 4968 | +} |
|---|
| 4969 | + |
|---|
| 4970 | +/** |
|---|
| 4971 | + * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers |
|---|
| 4972 | + * @pdev: pointer to PCI device |
|---|
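| | + *
|---|
| | + * Return: PCI_ERS_RESULT_RECOVERED, since MMIO access still works and
|---|
| | + * no slot reset is required.
|---|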
| 4973 | + */ |
|---|
| 4974 | +pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) |
|---|
| 4975 | +{ |
|---|
| 4976 | + |
|---|
| 4977 | + DRM_INFO("PCI error: mmio enabled callback!!\n"); |
|---|
| 4978 | + |
|---|
| 4979 | + /* TODO - dump whatever for debugging purposes */ |
|---|
| 4980 | + |
|---|
| 4981 | + /* This is called only if amdgpu_pci_error_detected returns
|---|
| 4982 | + * PCI_ERS_RESULT_CAN_RECOVER. Reads/writes to the device still
|---|
| 4983 | + * work, so there is no need to reset the slot.
|---|
| 4984 | + */ |
|---|
| 4985 | + |
|---|
| 4986 | + return PCI_ERS_RESULT_RECOVERED; |
|---|
| 4987 | +} |
|---|
| 4988 | + |
|---|
| 4989 | +/** |
|---|
| 4990 | + * amdgpu_pci_slot_reset - Called when PCI slot has been reset. |
|---|
| 4991 | + * @pdev: PCI device struct |
|---|
| 4992 | + * |
|---|
| 4993 | + * Description: This routine is called by the PCI error recovery
|---|
| 4994 | + * code after the PCI slot has been reset, just before we |
|---|
| 4995 | + * should resume normal operations. |
|---|
| 4996 | + */ |
|---|
| 4997 | +pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) |
|---|
| 4998 | +{ |
|---|
| 4999 | + struct drm_device *dev = pci_get_drvdata(pdev); |
|---|
| 5000 | + struct amdgpu_device *adev = drm_to_adev(dev); |
|---|
| 5001 | + int r, i; |
|---|
| 5002 | + bool need_full_reset = true; |
|---|
| 5003 | + u32 memsize; |
|---|
| 5004 | + struct list_head device_list; |
|---|
| 5005 | + |
|---|
| 5006 | + DRM_INFO("PCI error: slot reset callback!!\n"); |
|---|
| 5007 | + |
|---|
| 5008 | + INIT_LIST_HEAD(&device_list); |
|---|
| 5009 | + list_add_tail(&adev->gmc.xgmi.head, &device_list); |
|---|
| 5010 | + |
|---|
| 5011 | + /* wait for asic to come out of reset */ |
|---|
| 5012 | + msleep(500); |
|---|
| 5013 | + |
|---|
| 5014 | + /* Restore PCI config space */
|---|
| 5015 | + amdgpu_device_load_pci_state(pdev); |
|---|
| 5016 | + |
|---|
| 5017 | + /* confirm ASIC came out of reset */ |
|---|
| 5018 | + for (i = 0; i < adev->usec_timeout; i++) { |
|---|
| 5019 | + memsize = amdgpu_asic_get_config_memsize(adev); |
|---|
| 5020 | + |
|---|
| 5021 | + if (memsize != 0xffffffff) |
|---|
| 5022 | + break; |
|---|
| 5023 | + udelay(1); |
|---|
| 5024 | + } |
|---|
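| | + /* an all-ones config read means the ASIC never came back on the bus */
|---|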
| 5025 | + if (memsize == 0xffffffff) { |
|---|
| 5026 | + r = -ETIME; |
|---|
| 5027 | + goto out; |
|---|
| 5028 | + } |
|---|
| 5029 | + |
|---|
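| | + /* in_pci_err_recovery presumably makes the register accessors back
|---|
| | + * off while the device may still be unreachable */
|---|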
| 5030 | + adev->in_pci_err_recovery = true; |
|---|
| 5031 | + r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset); |
|---|
| 5032 | + adev->in_pci_err_recovery = false; |
|---|
| 5033 | + if (r) |
|---|
| 5034 | + goto out; |
|---|
| 5035 | + |
|---|
| 5036 | + r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true); |
|---|
| 5037 | + |
|---|
| 5038 | +out: |
|---|
| 5039 | + if (!r) { |
|---|
| 5040 | + if (amdgpu_device_cache_pci_state(adev->pdev)) |
|---|
| 5041 | + pci_restore_state(adev->pdev); |
|---|
| 5042 | + |
|---|
| 5043 | + DRM_INFO("PCIe error recovery succeeded\n"); |
|---|
| 5044 | + } else { |
|---|
| 5045 | + DRM_ERROR("PCIe error recovery failed, err:%d\n", r);
|---|
| 5046 | + amdgpu_device_unlock_adev(adev); |
|---|
| 5047 | + } |
|---|
| 5048 | + |
|---|
| 5049 | + return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; |
|---|
| 5050 | +} |
|---|
| 5051 | + |
|---|
| 5052 | +/** |
|---|
| 5053 | + * amdgpu_pci_resume() - resume normal ops after PCI reset |
|---|
| 5054 | + * @pdev: pointer to PCI device |
|---|
| 5055 | + * |
|---|
| 5056 | + * Called when the error recovery driver tells us that it's
|---|
| 5057 | + * OK to resume normal operation. Restart the halted scheduler
|---|
| 5058 | + * rings and unlock the device.
|---|
| 5059 | + */ |
|---|
| 5060 | +void amdgpu_pci_resume(struct pci_dev *pdev) |
|---|
| 5061 | +{ |
|---|
| 5062 | + struct drm_device *dev = pci_get_drvdata(pdev); |
|---|
| 5063 | + struct amdgpu_device *adev = drm_to_adev(dev); |
|---|
| 5064 | + int i; |
|---|
| 5065 | + |
|---|
| 5066 | + |
|---|
| 5067 | + DRM_INFO("PCI error: resume callback!!\n"); |
|---|
| 5068 | + |
|---|
| 5069 | + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { |
|---|
| 5070 | + struct amdgpu_ring *ring = adev->rings[i]; |
|---|
| 5071 | + |
|---|
| 5072 | + if (!ring || !ring->sched.thread) |
|---|
| 5073 | + continue; |
|---|
| 5074 | + |
|---|
| 5075 | + |
|---|
| 5076 | + drm_sched_resubmit_jobs(&ring->sched); |
|---|
| 5077 | + drm_sched_start(&ring->sched, true); |
|---|
| 5078 | + } |
|---|
| 5079 | + |
|---|
| 5080 | + amdgpu_device_unlock_adev(adev); |
|---|
| 5081 | +} |
|---|
| 5082 | + |
|---|
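| | +/**
|---|
| | + * amdgpu_device_cache_pci_state - save and cache the PCI config space
|---|
| | + *
|---|
| | + * @pdev: PCI device struct
|---|
| | + *
|---|
| | + * Saves the current PCI config space, frees any previously cached
|---|
| | + * copy, and stashes the new saved state in adev->pci_state so it can
|---|
| | + * be restored after a reset.
|---|
| | + *
|---|
| | + * Returns true on success, false otherwise.
|---|
| | + */
|---|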
| 5083 | +bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) |
|---|
| 5084 | +{ |
|---|
| 5085 | + struct drm_device *dev = pci_get_drvdata(pdev); |
|---|
| 5086 | + struct amdgpu_device *adev = drm_to_adev(dev); |
|---|
| 5087 | + int r; |
|---|
| 5088 | + |
|---|
| 5089 | + r = pci_save_state(pdev); |
|---|
| 5090 | + if (!r) { |
|---|
| 5091 | + kfree(adev->pci_state); |
|---|
| 5092 | + |
|---|
| 5093 | + adev->pci_state = pci_store_saved_state(pdev); |
|---|
| 5094 | + |
|---|
| 5095 | + if (!adev->pci_state) { |
|---|
| 5096 | + DRM_ERROR("Failed to store PCI saved state"); |
|---|
| 5097 | + return false; |
|---|
| 5098 | + } |
|---|
| 5099 | + } else { |
|---|
| 5100 | + DRM_WARN("Failed to save PCI state, err:%d\n", r); |
|---|
| 5101 | + return false; |
|---|
| 5102 | + } |
|---|
| 5103 | + |
|---|
| 5104 | + return true; |
|---|
| 5105 | +} |
|---|
| 5106 | + |
|---|
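| | +/**
|---|
| | + * amdgpu_device_load_pci_state - restore the cached PCI config space
|---|
| | + *
|---|
| | + * @pdev: PCI device struct
|---|
| | + *
|---|
| | + * Loads the state previously cached by amdgpu_device_cache_pci_state()
|---|
| | + * and writes it back to the device.
|---|
| | + *
|---|
| | + * Returns true on success, false if nothing was cached or the load
|---|
| | + * failed.
|---|
| | + */
|---|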
| 5107 | +bool amdgpu_device_load_pci_state(struct pci_dev *pdev) |
|---|
| 5108 | +{ |
|---|
| 5109 | + struct drm_device *dev = pci_get_drvdata(pdev); |
|---|
| 5110 | + struct amdgpu_device *adev = drm_to_adev(dev); |
|---|
| 5111 | + int r; |
|---|
| 5112 | + |
|---|
| 5113 | + if (!adev->pci_state) |
|---|
| 5114 | + return false; |
|---|
| 5115 | + |
|---|
| 5116 | + r = pci_load_saved_state(pdev, adev->pci_state); |
|---|
| 5117 | + |
|---|
| 5118 | + if (!r) { |
|---|
| 5119 | + pci_restore_state(pdev); |
|---|
| 5120 | + } else { |
|---|
| 5121 | + DRM_WARN("Failed to load PCI state, err:%d\n", r); |
|---|
| 5122 | + return false; |
|---|
| 5123 | + } |
|---|
| 5124 | + |
|---|
| 5125 | + return true; |
|---|
| 5126 | +} |
|---|
| 5127 | + |
|---|
| 5128 | + |
|---|