@@ -27,11 +27,12 @@
  */
 #include <linux/power_supply.h>
 #include <linux/kthread.h>
+#include <linux/module.h>
 #include <linux/console.h>
 #include <linux/slab.h>
-#include <drm/drmP.h>
-#include <drm/drm_crtc_helper.h>
+
 #include <drm/drm_atomic_helper.h>
+#include <drm/drm_probe_helper.h>
 #include <drm/amdgpu_drm.h>
 #include <linux/vgaarb.h>
 #include <linux/vga_switcheroo.h>
@@ -51,6 +52,7 @@
 #endif
 #include "vi.h"
 #include "soc15.h"
+#include "nv.h"
 #include "bif/bif_4_1_d.h"
 #include <linux/pci.h>
 #include <linux/firmware.h>
@@ -59,13 +61,29 @@
 #include "amdgpu_amdkfd.h"
 #include "amdgpu_pm.h"
 
+#include "amdgpu_xgmi.h"
+#include "amdgpu_ras.h"
+#include "amdgpu_pmu.h"
+#include "amdgpu_fru_eeprom.h"
+
+#include <linux/suspend.h>
+#include <drm/task_barrier.h>
+#include <linux/pm_runtime.h>
+
 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
+MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
+MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
+MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
+MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
+MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
+MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
+MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
 
 #define AMDGPU_RESUME_MS		2000
 
-static const char *amdgpu_asic_name[] = {
+const char *amdgpu_asic_name[] = {
 	"TAHITI",
 	"PITCAIRN",
 	"VERDE",
@@ -89,33 +107,206 @@
 	"VEGA12",
 	"VEGA20",
 	"RAVEN",
+	"ARCTURUS",
+	"RENOIR",
+	"NAVI10",
+	"NAVI14",
+	"NAVI12",
+	"SIENNA_CICHLID",
+	"NAVY_FLOUNDER",
 	"LAST",
 };
+
+/**
+ * DOC: pcie_replay_count
+ *
+ * The amdgpu driver provides a sysfs API for reporting the total number
+ * of PCIe replays (NAKs).
+ * The file pcie_replay_count is used for this and returns the total
+ * number of replays as a sum of the NAKs generated and NAKs received.
+ */
+
+static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct drm_device *ddev = dev_get_drvdata(dev);
+	struct amdgpu_device *adev = drm_to_adev(ddev);
+	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
+
+	return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
+}
+
+static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
+		amdgpu_device_get_pcie_replay_count, NULL);
 
 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
 
 /**
- * amdgpu_device_is_px - Is the device is a dGPU with HG/PX power control
+ * DOC: product_name
+ *
+ * The amdgpu driver provides a sysfs API for reporting the product name
+ * for the device.
+ * The file product_name is used for this and returns the product name
+ * as returned from the FRU.
+ * NOTE: This is only available for certain server cards
+ */
+
+static ssize_t amdgpu_device_get_product_name(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct drm_device *ddev = dev_get_drvdata(dev);
+	struct amdgpu_device *adev = drm_to_adev(ddev);
+
+	return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name);
+}
+
+static DEVICE_ATTR(product_name, S_IRUGO,
+		amdgpu_device_get_product_name, NULL);
+
+/**
+ * DOC: product_number
+ *
+ * The amdgpu driver provides a sysfs API for reporting the part number
+ * for the device.
+ * The file product_number is used for this and returns the part number
+ * as returned from the FRU.
+ * NOTE: This is only available for certain server cards
+ */
+
+static ssize_t amdgpu_device_get_product_number(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct drm_device *ddev = dev_get_drvdata(dev);
+	struct amdgpu_device *adev = drm_to_adev(ddev);
+
+	return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number);
+}
+
+static DEVICE_ATTR(product_number, S_IRUGO,
+		amdgpu_device_get_product_number, NULL);
+
+/**
+ * DOC: serial_number
+ *
+ * The amdgpu driver provides a sysfs API for reporting the serial number
+ * for the device.
+ * The file serial_number is used for this and returns the serial number
+ * as returned from the FRU.
+ * NOTE: This is only available for certain server cards
+ */
+
+static ssize_t amdgpu_device_get_serial_number(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct drm_device *ddev = dev_get_drvdata(dev);
+	struct amdgpu_device *adev = drm_to_adev(ddev);
+
+	return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial);
+}
+
+static DEVICE_ATTR(serial_number, S_IRUGO,
+		amdgpu_device_get_serial_number, NULL);
+
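All four attributes above surface as read-only text files under the device's
sysfs directory. A minimal userspace sketch for reading one of them (the
card0 path is an assumption and varies per system):

	/* Hypothetical userspace reader for the sysfs attributes above.
	 * The card0 path is an assumption; the index varies per system. */
	#include <stdio.h>

	int main(void)
	{
		const char *path = "/sys/class/drm/card0/device/pcie_replay_count";
		char buf[64];
		FILE *f = fopen(path, "r");

		if (!f)
			return 1;
		if (fgets(buf, sizeof(buf), f))
			printf("pcie_replay_count: %s", buf);
		fclose(f);
		return 0;
	}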
+/**
+ * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control
  *
  * @dev: drm_device pointer
  *
  * Returns true if the device is a dGPU with HG/PX power control,
  * otherwise return false.
  */
-bool amdgpu_device_is_px(struct drm_device *dev)
+bool amdgpu_device_supports_boco(struct drm_device *dev)
 {
-	struct amdgpu_device *adev = dev->dev_private;
+	struct amdgpu_device *adev = drm_to_adev(dev);
 
 	if (adev->flags & AMD_IS_PX)
 		return true;
 	return false;
 }
 
+/**
+ * amdgpu_device_supports_baco - Does the device support BACO
+ *
+ * @dev: drm_device pointer
+ *
+ * Returns true if the device supports BACO,
+ * otherwise return false.
+ */
+bool amdgpu_device_supports_baco(struct drm_device *dev)
+{
+	struct amdgpu_device *adev = drm_to_adev(dev);
+
+	return amdgpu_asic_supports_baco(adev);
+}
+
 /*
- * MMIO register access helper functions.
+ * VRAM access helper functions
+ */
+
+/**
+ * amdgpu_device_vram_access - read/write a buffer in vram
+ *
+ * @adev: amdgpu_device pointer
+ * @pos: offset of the buffer in vram
+ * @buf: virtual address of the buffer in system memory
+ * @size: read/write size, sizeof(@buf) must be > @size
+ * @write: true - write to vram, otherwise - read from vram
+ */
+void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
+			       uint32_t *buf, size_t size, bool write)
+{
+	unsigned long flags;
+	uint32_t hi = ~0;
+	uint64_t last;
+
+
+#ifdef CONFIG_64BIT
+	last = min(pos + size, adev->gmc.visible_vram_size);
+	if (last > pos) {
+		void __iomem *addr = adev->mman.aper_base_kaddr + pos;
+		size_t count = last - pos;
+
+		if (write) {
+			memcpy_toio(addr, buf, count);
+			mb();
+			amdgpu_asic_flush_hdp(adev, NULL);
+		} else {
+			amdgpu_asic_invalidate_hdp(adev, NULL);
+			mb();
+			memcpy_fromio(buf, addr, count);
+		}
+
+		if (count == size)
+			return;
+
+		pos += count;
+		buf += count / 4;
+		size -= count;
+	}
+#endif
+
+	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
+	for (last = pos + size; pos < last; pos += 4) {
+		uint32_t tmp = pos >> 31;
+
+		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
+		if (tmp != hi) {
+			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
+			hi = tmp;
+		}
+		if (write)
+			WREG32_NO_KIQ(mmMM_DATA, *buf++);
+		else
+			*buf++ = RREG32_NO_KIQ(mmMM_DATA);
+	}
+	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
+}
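Past the CPU-visible aperture, the function above falls back to the
MM_INDEX/MM_DATA window: bit 31 of MM_INDEX selects the aperture path and
MM_INDEX_HI carries the offset bits above bit 31. A standalone sketch of
that address split (illustrative values only, not part of the patch):

	#include <stdint.h>
	#include <stdio.h>

	/* Illustration of how amdgpu_device_vram_access() splits a VRAM
	 * byte offset across the MM_INDEX/MM_INDEX_HI register pair. */
	int main(void)
	{
		uint64_t pos = 0x123456789ull;			/* example VRAM offset */
		uint32_t mm_index = (uint32_t)pos | 0x80000000;	/* low 31 bits + aper bit */
		uint32_t mm_index_hi = (uint32_t)(pos >> 31);	/* remaining high bits */

		/* prints MM_INDEX=0xa3456789 MM_INDEX_HI=0x00000002 */
		printf("MM_INDEX=0x%08x MM_INDEX_HI=0x%08x\n", mm_index, mm_index_hi);
		return 0;
	}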
+
+/*
+ * register access helper functions.
 */
 /**
- * amdgpu_mm_rreg - read a memory mapped IO register
+ * amdgpu_device_rreg - read a memory mapped IO or indirect register
  *
  * @adev: amdgpu_device pointer
  * @reg: dword aligned register offset
@@ -123,25 +314,29 @@
  *
  * Returns the 32 bit value from the offset specified.
  */
-uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
-			uint32_t acc_flags)
+uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
+			    uint32_t reg, uint32_t acc_flags)
 {
 	uint32_t ret;
 
-	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
-		return amdgpu_virt_kiq_rreg(adev, reg);
+	if (adev->in_pci_err_recovery)
+		return 0;
 
-	if ((reg * 4) < adev->rmmio_size && !(acc_flags & AMDGPU_REGS_IDX))
-		ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
-	else {
-		unsigned long flags;
-
-		spin_lock_irqsave(&adev->mmio_idx_lock, flags);
-		writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4));
-		ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
-		spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
+	if ((reg * 4) < adev->rmmio_size) {
+		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
+		    amdgpu_sriov_runtime(adev) &&
+		    down_read_trylock(&adev->reset_sem)) {
+			ret = amdgpu_kiq_rreg(adev, reg);
+			up_read(&adev->reset_sem);
+		} else {
+			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
+		}
+	} else {
+		ret = adev->pcie_rreg(adev, reg * 4);
 	}
-	trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret);
+
+	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
+
 	return ret;
 }
 
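For context, the acc_flags parameter is what separates the plain and NO_KIQ
register macros used throughout the driver; paraphrased from amdgpu.h for
illustration (the exact definitions may differ by kernel version):

	/* Paraphrased assumption; check amdgpu.h for the real macros. */
	#define RREG32(reg)		amdgpu_device_rreg(adev, (reg), 0)
	#define RREG32_NO_KIQ(reg)	amdgpu_device_rreg(adev, (reg), AMDGPU_REGS_NO_KIQ)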
@@ -159,7 +354,11 @@
  *
  * Returns the 8 bit value from the offset specified.
  */
-uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) {
+uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
+{
+	if (adev->in_pci_err_recovery)
+		return 0;
+
 	if (offset < adev->rmmio_size)
 		return (readb(adev->rmmio + offset));
 	BUG();
@@ -180,7 +379,11 @@
  *
  * Writes the value specified to the offset specified.
  */
-void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) {
+void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
+{
+	if (adev->in_pci_err_recovery)
+		return;
+
 	if (offset < adev->rmmio_size)
 		writeb(value, adev->rmmio + offset);
 	else
@@ -188,7 +391,7 @@
 }
 
 /**
- * amdgpu_mm_wreg - write to a memory mapped IO register
+ * amdgpu_device_wreg - write to a memory mapped IO or indirect register
  *
  * @adev: amdgpu_device pointer
  * @reg: dword aligned register offset
@@ -197,31 +400,47 @@
  *
 * Writes the value specified to the offset specified.
 */
-void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
-		    uint32_t acc_flags)
+void amdgpu_device_wreg(struct amdgpu_device *adev,
+			uint32_t reg, uint32_t v,
+			uint32_t acc_flags)
 {
-	trace_amdgpu_mm_wreg(adev->pdev->device, reg, v);
+	if (adev->in_pci_err_recovery)
+		return;
 
-	if (adev->asic_type >= CHIP_VEGA10 && reg == 0) {
-		adev->last_mm_index = v;
+	if ((reg * 4) < adev->rmmio_size) {
+		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
+		    amdgpu_sriov_runtime(adev) &&
+		    down_read_trylock(&adev->reset_sem)) {
+			amdgpu_kiq_wreg(adev, reg, v);
+			up_read(&adev->reset_sem);
+		} else {
+			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
+		}
+	} else {
+		adev->pcie_wreg(adev, reg * 4, v);
 	}
 
-	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
-		return amdgpu_virt_kiq_wreg(adev, reg, v);
+	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
+}
 
-	if ((reg * 4) < adev->rmmio_size && !(acc_flags & AMDGPU_REGS_IDX))
+/*
+ * amdgpu_mm_wreg_mmio_rlc - write register either with mmio or with RLC path if in range
+ *
+ * This function is invoked only for debugfs register access.
+ */
+void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
+			     uint32_t reg, uint32_t v)
+{
+	if (adev->in_pci_err_recovery)
+		return;
+
+	if (amdgpu_sriov_fullaccess(adev) &&
+	    adev->gfx.rlc.funcs &&
+	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
+		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
+			return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v);
+	} else {
 		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
-	else {
-		unsigned long flags;
-
-		spin_lock_irqsave(&adev->mmio_idx_lock, flags);
-		writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4));
-		writel(v, ((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
-		spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
-	}
-
-	if (adev->asic_type >= CHIP_VEGA10 && reg == 1 && adev->last_mm_index == 0x5702C) {
-		udelay(500);
 	}
 }
 
@@ -235,6 +454,9 @@
  */
 u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
 {
+	if (adev->in_pci_err_recovery)
+		return 0;
+
 	if ((reg * 4) < adev->rio_mem_size)
 		return ioread32(adev->rio_mem + (reg * 4));
 	else {
@@ -254,19 +476,14 @@
  */
 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
 {
-	if (adev->asic_type >= CHIP_VEGA10 && reg == 0) {
-		adev->last_mm_index = v;
-	}
+	if (adev->in_pci_err_recovery)
+		return;
 
 	if ((reg * 4) < adev->rio_mem_size)
 		iowrite32(v, adev->rio_mem + (reg * 4));
 	else {
 		iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
 		iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
-	}
-
-	if (adev->asic_type >= CHIP_VEGA10 && reg == 1 && adev->last_mm_index == 0x5702C) {
-		udelay(500);
 	}
 }
 
@@ -281,6 +498,9 @@
  */
 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
 {
+	if (adev->in_pci_err_recovery)
+		return 0;
+
 	if (index < adev->doorbell.num_doorbells) {
 		return readl(adev->doorbell.ptr + index);
 	} else {
@@ -301,6 +521,9 @@
  */
 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
 {
+	if (adev->in_pci_err_recovery)
+		return;
+
 	if (index < adev->doorbell.num_doorbells) {
 		writel(v, adev->doorbell.ptr + index);
 	} else {
@@ -319,6 +542,9 @@
  */
 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
 {
+	if (adev->in_pci_err_recovery)
+		return 0;
+
 	if (index < adev->doorbell.num_doorbells) {
 		return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
 	} else {
@@ -339,6 +565,9 @@
  */
 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
 {
+	if (adev->in_pci_err_recovery)
+		return;
+
 	if (index < adev->doorbell.num_doorbells) {
 		atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
 	} else {
@@ -347,9 +576,138 @@
 }
 
 /**
+ * amdgpu_device_indirect_rreg - read an indirect register
+ *
+ * @adev: amdgpu_device pointer
+ * @pcie_index: mmio register offset
+ * @pcie_data: mmio register offset
+ *
+ * Returns the value of indirect register @reg_addr
+ */
+u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
+				u32 pcie_index, u32 pcie_data,
+				u32 reg_addr)
+{
+	unsigned long flags;
+	u32 r;
+	void __iomem *pcie_index_offset;
+	void __iomem *pcie_data_offset;
+
+	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
+	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
+	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
+
+	writel(reg_addr, pcie_index_offset);
+	readl(pcie_index_offset);
+	r = readl(pcie_data_offset);
+	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
+
+	return r;
+}
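An ASIC backend can point its pcie_rreg/pcie_wreg callbacks at these helpers.
A hedged sketch of such a hookup (the mmPCIE_INDEX2/mmPCIE_DATA2 pair is an
assumption; the real index/data registers are ASIC-specific):

	/* Sketch only: routing the generic helper through an ASIC's
	 * assumed PCIE index/data register pair. */
	static u32 example_pcie_rreg(struct amdgpu_device *adev, u32 reg)
	{
		return amdgpu_device_indirect_rreg(adev, mmPCIE_INDEX2,
						   mmPCIE_DATA2, reg);
	}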
+
+/**
+ * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
+ *
+ * @adev: amdgpu_device pointer
+ * @pcie_index: mmio register offset
+ * @pcie_data: mmio register offset
+ *
+ * Returns the value of indirect register @reg_addr
+ */
+u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
+				  u32 pcie_index, u32 pcie_data,
+				  u32 reg_addr)
+{
+	unsigned long flags;
+	u64 r;
+	void __iomem *pcie_index_offset;
+	void __iomem *pcie_data_offset;
+
+	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
+	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
+	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
+
+	/* read low 32 bits */
+	writel(reg_addr, pcie_index_offset);
+	readl(pcie_index_offset);
+	r = readl(pcie_data_offset);
+	/* read high 32 bits */
+	writel(reg_addr + 4, pcie_index_offset);
+	readl(pcie_index_offset);
+	r |= ((u64)readl(pcie_data_offset) << 32);
+	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
+
+	return r;
+}
+
+/**
+ * amdgpu_device_indirect_wreg - write an indirect register address
+ *
+ * @adev: amdgpu_device pointer
+ * @pcie_index: mmio register offset
+ * @pcie_data: mmio register offset
+ * @reg_addr: indirect register offset
+ * @reg_data: indirect register data
+ *
+ */
+void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
+				 u32 pcie_index, u32 pcie_data,
+				 u32 reg_addr, u32 reg_data)
+{
+	unsigned long flags;
+	void __iomem *pcie_index_offset;
+	void __iomem *pcie_data_offset;
+
+	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
+	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
+	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
+
+	writel(reg_addr, pcie_index_offset);
+	readl(pcie_index_offset);
+	writel(reg_data, pcie_data_offset);
+	readl(pcie_data_offset);
+	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
+}
+
+/**
+ * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
+ *
+ * @adev: amdgpu_device pointer
+ * @pcie_index: mmio register offset
+ * @pcie_data: mmio register offset
+ * @reg_addr: indirect register offset
+ * @reg_data: indirect register data
+ *
+ */
+void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
+				   u32 pcie_index, u32 pcie_data,
+				   u32 reg_addr, u64 reg_data)
+{
+	unsigned long flags;
+	void __iomem *pcie_index_offset;
+	void __iomem *pcie_data_offset;
+
+	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
+	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
+	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
+
+	/* write low 32 bits */
+	writel(reg_addr, pcie_index_offset);
+	readl(pcie_index_offset);
+	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
+	readl(pcie_data_offset);
+	/* write high 32 bits */
+	writel(reg_addr + 4, pcie_index_offset);
+	readl(pcie_index_offset);
+	writel((u32)(reg_data >> 32), pcie_data_offset);
+	readl(pcie_data_offset);
+	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
+}
+
+/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
- * @adev: amdgpu device pointer
+ * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
@@ -366,7 +724,7 @@
 /**
  * amdgpu_invalid_wreg - dummy reg write function
  *
- * @adev: amdgpu device pointer
+ * @adev: amdgpu_device pointer
  * @reg: offset of register
  * @v: value to write to the register
  *
@@ -381,9 +739,43 @@
 }
 
 /**
+ * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
+ *
+ * @adev: amdgpu_device pointer
+ * @reg: offset of register
+ *
+ * Dummy register read function. Used for register blocks
+ * that certain asics don't have (all asics).
+ * Returns the value in the register.
+ */
+static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
+{
+	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
+	BUG();
+	return 0;
+}
+
+/**
+ * amdgpu_invalid_wreg64 - dummy reg write function
+ *
+ * @adev: amdgpu_device pointer
+ * @reg: offset of register
+ * @v: value to write to the register
+ *
+ * Dummy register write function. Used for register blocks
+ * that certain asics don't have (all asics).
+ */
+static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
+{
+	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
+		  reg, v);
+	BUG();
+}
+
+/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
- * @adev: amdgpu device pointer
+ * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
@@ -403,7 +795,7 @@
 /**
  * amdgpu_block_invalid_wreg - dummy reg write function
  *
- * @adev: amdgpu device pointer
+ * @adev: amdgpu_device pointer
  * @block: offset of instance
  * @reg: offset of register
  * @v: value to write to the register
@@ -421,9 +813,23 @@
 }
 
 /**
+ * amdgpu_device_asic_init - Wrapper for atom asic_init
+ *
+ * @adev: amdgpu_device pointer
+ *
+ * Does any asic specific work and then calls atom asic init.
+ */
+static int amdgpu_device_asic_init(struct amdgpu_device *adev)
+{
+	amdgpu_asic_pre_asic_init(adev);
+
+	return amdgpu_atom_asic_init(adev->mode_info.atom_context);
+}
+
+/**
 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
 *
- * @adev: amdgpu device pointer
+ * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
@@ -440,7 +846,7 @@
 /**
  * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
  *
- * @adev: amdgpu device pointer
+ * @adev: amdgpu_device pointer
  *
 * Frees the VRAM scratch page.
 */
@@ -479,7 +885,10 @@
 	} else {
 		tmp = RREG32(reg);
 		tmp &= ~and_mask;
-		tmp |= or_mask;
+		if (adev->family >= AMDGPU_FAMILY_AI)
+			tmp |= (or_mask & and_mask);
+		else
+			tmp |= or_mask;
 	}
 	WREG32(reg, tmp);
 }
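On AMDGPU_FAMILY_AI and newer the OR value is confined to the field selected
by and_mask. A standalone illustration with made-up values:

	#include <stdint.h>
	#include <stdio.h>

	/* With and_mask = 0x0000ff00 and or_mask = 0x00001234, the legacy
	 * path leaks the low 0x34 bits outside the masked field; the AI+
	 * path keeps the write inside it. */
	int main(void)
	{
		uint32_t reg = 0xdeadbeef, and_mask = 0x0000ff00, or_mask = 0x00001234;
		uint32_t tmp = reg & ~and_mask;

		printf("legacy: 0x%08x\n", tmp | or_mask);		/* 0xdead12ff */
		printf("ai+:    0x%08x\n", tmp | (or_mask & and_mask));	/* 0xdead12ef */
		return 0;
	}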
@@ -511,6 +920,7 @@
  */
 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
 {
+
 	/* No doorbell on SI hardware generation */
 	if (adev->asic_type < CHIP_BONAIRE) {
 		adev->doorbell.base = 0;
@@ -523,14 +933,25 @@
 	if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
 		return -EINVAL;
 
+	amdgpu_asic_init_doorbell_index(adev);
+
 	/* doorbell bar mapping */
 	adev->doorbell.base = pci_resource_start(adev->pdev, 2);
 	adev->doorbell.size = pci_resource_len(adev->pdev, 2);
 
 	adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
-					     AMDGPU_DOORBELL_MAX_ASSIGNMENT+1);
+					     adev->doorbell_index.max_assignment+1);
 	if (adev->doorbell.num_doorbells == 0)
 		return -EINVAL;
+
+	/* For Vega, reserve and map two pages on doorbell BAR since SDMA
+	 * paging queue doorbell use the second page. The
+	 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
+	 * doorbells are in the first page. So with paging queue enabled,
+	 * the max num_doorbells should + 1 page (0x400 in dword)
+	 */
+	if (adev->asic_type >= CHIP_VEGA10)
+		adev->doorbell.num_doorbells += 0x400;
 
 	adev->doorbell.ptr = ioremap(adev->doorbell.base,
 				     adev->doorbell.num_doorbells *
@@ -652,71 +1073,6 @@
 }
 
 /**
- * amdgpu_device_vram_location - try to find VRAM location
- *
- * @adev: amdgpu device structure holding all necessary informations
- * @mc: memory controller structure holding memory informations
- * @base: base address at which to put VRAM
- *
- * Function will try to place VRAM at base address provided
- * as parameter.
- */
-void amdgpu_device_vram_location(struct amdgpu_device *adev,
-				 struct amdgpu_gmc *mc, u64 base)
-{
-	uint64_t limit = (uint64_t)amdgpu_vram_limit << 20;
-
-	mc->vram_start = base;
-	mc->vram_end = mc->vram_start + mc->mc_vram_size - 1;
-	if (limit && limit < mc->real_vram_size)
-		mc->real_vram_size = limit;
-	dev_info(adev->dev, "VRAM: %lluM 0x%016llX - 0x%016llX (%lluM used)\n",
-			mc->mc_vram_size >> 20, mc->vram_start,
-			mc->vram_end, mc->real_vram_size >> 20);
-}
-
-/**
- * amdgpu_device_gart_location - try to find GART location
- *
- * @adev: amdgpu device structure holding all necessary informations
- * @mc: memory controller structure holding memory informations
- *
- * Function will place try to place GART before or after VRAM.
- *
- * If GART size is bigger than space left then we ajust GART size.
- * Thus function will never fails.
- */
-void amdgpu_device_gart_location(struct amdgpu_device *adev,
-				 struct amdgpu_gmc *mc)
-{
-	u64 size_af, size_bf;
-
-	mc->gart_size += adev->pm.smu_prv_buffer_size;
-
-	size_af = adev->gmc.mc_mask - mc->vram_end;
-	size_bf = mc->vram_start;
-	if (size_bf > size_af) {
-		if (mc->gart_size > size_bf) {
-			dev_warn(adev->dev, "limiting GART\n");
-			mc->gart_size = size_bf;
-		}
-		mc->gart_start = 0;
-	} else {
-		if (mc->gart_size > size_af) {
-			dev_warn(adev->dev, "limiting GART\n");
-			mc->gart_size = size_af;
-		}
-		/* VCE doesn't like it when BOs cross a 4GB segment, so align
-		 * the GART base on a 4GB boundary as well.
-		 */
-		mc->gart_start = ALIGN(mc->vram_end + 1, 0x100000000ULL);
-	}
-	mc->gart_end = mc->gart_start + mc->gart_size - 1;
-	dev_info(adev->dev, "GART: %lluM 0x%016llX - 0x%016llX\n",
-			mc->gart_size >> 20, mc->gart_start, mc->gart_end);
-}
-
-/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
@@ -737,6 +1093,11 @@
 
 	/* Bypass for VF */
 	if (amdgpu_sriov_vf(adev))
+		return 0;
+
+	/* skip if the bios has already enabled large BAR */
+	if (adev->gmc.real_vram_size &&
+	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
 		return 0;
 
 	/* Check if the root BUS has 64bit memory resources */
@@ -913,7 +1274,7 @@
 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
 {
 	struct sysinfo si;
-	bool is_os_64 = (sizeof(void *) == 8) ? true : false;
+	bool is_os_64 = (sizeof(void *) == 8);
 	uint64_t total_memory;
 	uint64_t dram_size_seven_GB = 0x1B8000000;
 	uint64_t dram_size_three_GB = 0xB8000000;
@@ -958,7 +1319,7 @@
  * Validates certain module parameters and updates
  * the associated values used by the driver (all asics).
  */
-static void amdgpu_device_check_arguments(struct amdgpu_device *adev)
+static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
 {
 	if (amdgpu_sched_jobs < 4) {
 		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
@@ -991,25 +1352,36 @@
 		amdgpu_vm_fragment_size = -1;
 	}
 
+	if (amdgpu_sched_hw_submission < 2) {
+		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
+			 amdgpu_sched_hw_submission);
+		amdgpu_sched_hw_submission = 2;
+	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
+		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
+			 amdgpu_sched_hw_submission);
+		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
+	}
+
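is_power_of_2() and roundup_pow_of_two() come from linux/log2.h, so a
submission count of 6 warns and becomes 8. A userspace sketch of the same
clamping logic (illustrative only, not driver code):

	#include <stdio.h>

	/* Mirrors the amdgpu_sched_hw_submission clamping above:
	 * values below 2 become 2, non-powers-of-two round up. */
	static unsigned int clamp_hw_submission(unsigned int n)
	{
		unsigned int p = 1;

		if (n < 2)
			return 2;
		while (p < n)	/* roundup_pow_of_two() equivalent */
			p <<= 1;
		return p;
	}

	int main(void)
	{
		/* prints: 2 8 8 */
		printf("%u %u %u\n", clamp_hw_submission(1),
		       clamp_hw_submission(6), clamp_hw_submission(8));
		return 0;
	}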
 	amdgpu_device_check_smu_prv_buffer_size(adev);
 
 	amdgpu_device_check_vm_size(adev);
 
 	amdgpu_device_check_block_size(adev);
 
-	if (amdgpu_vram_page_split != -1 && (amdgpu_vram_page_split < 16 ||
-	    !is_power_of_2(amdgpu_vram_page_split))) {
-		dev_warn(adev->dev, "invalid VRAM page split (%d)\n",
-			 amdgpu_vram_page_split);
-		amdgpu_vram_page_split = 1024;
-	}
-
-	if (amdgpu_lockup_timeout == 0) {
-		dev_warn(adev->dev, "lockup_timeout msut be > 0, adjusting to 10000\n");
-		amdgpu_lockup_timeout = 10000;
-	}
-
 	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
+
+	amdgpu_gmc_tmz_set(adev);
+
+	if (amdgpu_num_kcq == -1) {
+		amdgpu_num_kcq = 8;
+	} else if (amdgpu_num_kcq > 8 || amdgpu_num_kcq < 0) {
+		amdgpu_num_kcq = 8;
+		dev_warn(adev->dev, "set kernel compute queue number to 8 due to invalid parameter provided by user\n");
+	}
+
+	amdgpu_gmc_noretry_set(adev);
+
+	return 0;
 }
 
 /**
@@ -1021,27 +1393,38 @@
  * Callback for the switcheroo driver.  Suspends or resumes the
  * the asics before or after it is powered up using ACPI methods.
  */
-static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, enum vga_switcheroo_state state)
+static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
+					enum vga_switcheroo_state state)
 {
 	struct drm_device *dev = pci_get_drvdata(pdev);
+	int r;
 
-	if (amdgpu_device_is_px(dev) && state == VGA_SWITCHEROO_OFF)
+	if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF)
 		return;
 
 	if (state == VGA_SWITCHEROO_ON) {
-		pr_info("amdgpu: switched on\n");
+		pr_info("switched on\n");
 		/* don't suspend or resume card normally */
 		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
 
-		amdgpu_device_resume(dev, true, true);
+		pci_set_power_state(dev->pdev, PCI_D0);
+		amdgpu_device_load_pci_state(dev->pdev);
+		r = pci_enable_device(dev->pdev);
+		if (r)
+			DRM_WARN("pci_enable_device failed (%d)\n", r);
+		amdgpu_device_resume(dev, true);
 
 		dev->switch_power_state = DRM_SWITCH_POWER_ON;
 		drm_kms_helper_poll_enable(dev);
 	} else {
-		pr_info("amdgpu: switched off\n");
+		pr_info("switched off\n");
 		drm_kms_helper_poll_disable(dev);
 		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
-		amdgpu_device_suspend(dev, true, true);
+		amdgpu_device_suspend(dev, true);
+		amdgpu_device_cache_pci_state(dev->pdev);
+		/* Shut down the device */
+		pci_disable_device(dev->pdev);
+		pci_set_power_state(dev->pdev, PCI_D3cold);
 		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
 	}
 }
@@ -1064,7 +1447,7 @@
 	 * locking inversion with the driver load path. And the access here is
 	 * completely racy anyway. So don't bother with locking for now.
 	 */
-	return dev->open_count == 0;
+	return atomic_read(&dev->open_count) == 0;
 }
 
 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
@@ -1304,7 +1687,7 @@
 	adev->enable_virtual_display = false;
 
 	if (amdgpu_virtual_display) {
-		struct drm_device *ddev = adev->ddev;
+		struct drm_device *ddev = adev_to_drm(adev);
 		const char *pci_address_name = pci_name(ddev->pdev);
 		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
 
@@ -1357,22 +1740,25 @@
 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
 {
 	const char *chip_name;
-	char fw_name[30];
+	char fw_name[40];
 	int err;
 	const struct gpu_info_firmware_header_v1_0 *hdr;
 
 	adev->firmware.gpu_info_fw = NULL;
 
+	if (adev->mman.discovery_bin) {
+		amdgpu_discovery_get_gfx_info(adev);
+
+		/*
+		 * FIXME: The bounding box is still needed by Navi12, so
+		 * temporarily read it from gpu_info firmware. Should be dropped
+		 * when DAL no longer needs it.
+		 */
+		if (adev->asic_type != CHIP_NAVI12)
+			return 0;
+	}
+
 	switch (adev->asic_type) {
-	case CHIP_TOPAZ:
-	case CHIP_TONGA:
-	case CHIP_FIJI:
-	case CHIP_POLARIS10:
-	case CHIP_POLARIS11:
-	case CHIP_POLARIS12:
-	case CHIP_VEGAM:
-	case CHIP_CARRIZO:
-	case CHIP_STONEY:
 #ifdef CONFIG_DRM_AMDGPU_SI
 	case CHIP_VERDE:
 	case CHIP_TAHITI:
@@ -1387,7 +1773,18 @@
 	case CHIP_KABINI:
 	case CHIP_MULLINS:
 #endif
+	case CHIP_TOPAZ:
+	case CHIP_TONGA:
+	case CHIP_FIJI:
+	case CHIP_POLARIS10:
+	case CHIP_POLARIS11:
+	case CHIP_POLARIS12:
+	case CHIP_VEGAM:
+	case CHIP_CARRIZO:
+	case CHIP_STONEY:
 	case CHIP_VEGA20:
+	case CHIP_SIENNA_CICHLID:
+	case CHIP_NAVY_FLOUNDER:
 	default:
 		return 0;
 	case CHIP_VEGA10:
@@ -1397,7 +1794,30 @@
 		chip_name = "vega12";
 		break;
 	case CHIP_RAVEN:
-		chip_name = "raven";
+		if (adev->apu_flags & AMD_APU_IS_RAVEN2)
+			chip_name = "raven2";
+		else if (adev->apu_flags & AMD_APU_IS_PICASSO)
+			chip_name = "picasso";
+		else
+			chip_name = "raven";
+		break;
+	case CHIP_ARCTURUS:
+		chip_name = "arcturus";
+		break;
+	case CHIP_RENOIR:
+		if (adev->apu_flags & AMD_APU_IS_RENOIR)
+			chip_name = "renoir";
+		else
+			chip_name = "green_sardine";
+		break;
+	case CHIP_NAVI10:
+		chip_name = "navi10";
+		break;
+	case CHIP_NAVI14:
+		chip_name = "navi14";
+		break;
+	case CHIP_NAVI12:
+		chip_name = "navi12";
 		break;
 	}
 
@@ -1427,6 +1847,12 @@
 		(const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
 			le32_to_cpu(hdr->header.ucode_array_offset_bytes));
 
+		/*
+		 * Should be dropped when DAL no longer needs it.
+		 */
+		if (adev->asic_type == CHIP_NAVI12)
+			goto parse_soc_bounding_box;
+
 		adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
 		adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
 		adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
@@ -1445,6 +1871,27 @@
 		adev->gfx.cu_info.max_scratch_slots_per_cu =
 			le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
 		adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
+		if (hdr->version_minor >= 1) {
+			const struct gpu_info_firmware_v1_1 *gpu_info_fw =
+				(const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
+					le32_to_cpu(hdr->header.ucode_array_offset_bytes));
+			adev->gfx.config.num_sc_per_sh =
+				le32_to_cpu(gpu_info_fw->num_sc_per_sh);
+			adev->gfx.config.num_packer_per_sc =
+				le32_to_cpu(gpu_info_fw->num_packer_per_sc);
+		}
+
+parse_soc_bounding_box:
+		/*
+		 * soc bounding box info is not integrated in discovery table,
+		 * we always need to parse it from gpu info firmware if needed.
+		 */
+		if (hdr->version_minor == 2) {
+			const struct gpu_info_firmware_v1_2 *gpu_info_fw =
+				(const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
+					le32_to_cpu(hdr->header.ucode_array_offset_bytes));
+			adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
+		}
 		break;
 	}
 	default:
@@ -1473,25 +1920,13 @@
 
 	amdgpu_device_enable_virtual_display(adev);
 
-	switch (adev->asic_type) {
-	case CHIP_TOPAZ:
-	case CHIP_TONGA:
-	case CHIP_FIJI:
-	case CHIP_POLARIS10:
-	case CHIP_POLARIS11:
-	case CHIP_POLARIS12:
-	case CHIP_VEGAM:
-	case CHIP_CARRIZO:
-	case CHIP_STONEY:
-		if (adev->asic_type == CHIP_CARRIZO || adev->asic_type == CHIP_STONEY)
-			adev->family = AMDGPU_FAMILY_CZ;
-		else
-			adev->family = AMDGPU_FAMILY_VI;
-
-		r = vi_set_ip_blocks(adev);
+	if (amdgpu_sriov_vf(adev)) {
+		r = amdgpu_virt_request_full_gpu(adev, true);
 		if (r)
 			return r;
-		break;
+	}
+
+	switch (adev->asic_type) {
 #ifdef CONFIG_DRM_AMDGPU_SI
 	case CHIP_VERDE:
 	case CHIP_TAHITI:
@@ -1510,21 +1945,41 @@
 	case CHIP_KAVERI:
 	case CHIP_KABINI:
 	case CHIP_MULLINS:
-		if ((adev->asic_type == CHIP_BONAIRE) || (adev->asic_type == CHIP_HAWAII))
-			adev->family = AMDGPU_FAMILY_CI;
-		else
+		if (adev->flags & AMD_IS_APU)
 			adev->family = AMDGPU_FAMILY_KV;
+		else
+			adev->family = AMDGPU_FAMILY_CI;
 
 		r = cik_set_ip_blocks(adev);
 		if (r)
 			return r;
 		break;
 #endif
+	case CHIP_TOPAZ:
+	case CHIP_TONGA:
+	case CHIP_FIJI:
+	case CHIP_POLARIS10:
+	case CHIP_POLARIS11:
+	case CHIP_POLARIS12:
+	case CHIP_VEGAM:
+	case CHIP_CARRIZO:
+	case CHIP_STONEY:
+		if (adev->flags & AMD_IS_APU)
+			adev->family = AMDGPU_FAMILY_CZ;
+		else
+			adev->family = AMDGPU_FAMILY_VI;
+
+		r = vi_set_ip_blocks(adev);
+		if (r)
+			return r;
+		break;
 	case CHIP_VEGA10:
 	case CHIP_VEGA12:
 	case CHIP_VEGA20:
 	case CHIP_RAVEN:
-		if (adev->asic_type == CHIP_RAVEN)
+	case CHIP_ARCTURUS:
+	case CHIP_RENOIR:
+		if (adev->flags & AMD_IS_APU)
 			adev->family = AMDGPU_FAMILY_RV;
 		else
 			adev->family = AMDGPU_FAMILY_AI;
@@ -1533,24 +1988,27 @@
 		if (r)
 			return r;
 		break;
+	case CHIP_NAVI10:
+	case CHIP_NAVI14:
+	case CHIP_NAVI12:
+	case CHIP_SIENNA_CICHLID:
+	case CHIP_NAVY_FLOUNDER:
+		adev->family = AMDGPU_FAMILY_NV;
+
+		r = nv_set_ip_blocks(adev);
+		if (r)
+			return r;
+		break;
 	default:
 		/* FIXME: not supported yet */
 		return -EINVAL;
 	}
 
-	r = amdgpu_device_parse_gpu_info_fw(adev);
-	if (r)
-		return r;
-
 	amdgpu_amdkfd_device_probe(adev);
 
-	if (amdgpu_sriov_vf(adev)) {
-		r = amdgpu_virt_request_full_gpu(adev, true);
-		if (r)
-			return -EAGAIN;
-	}
-
-	adev->powerplay.pp_feature = amdgpu_pp_feature_mask;
+	adev->pm.pp_feature = amdgpu_pp_feature_mask;
+	if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
+		adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
 
 	for (i = 0; i < adev->num_ip_blocks; i++) {
 		if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
@@ -1573,12 +2031,122 @@
 				adev->ip_blocks[i].status.valid = true;
 			}
 		}
+		/* get the vbios after the asic_funcs are set up */
+		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
+			r = amdgpu_device_parse_gpu_info_fw(adev);
+			if (r)
+				return r;
+
+			/* Read BIOS */
+			if (!amdgpu_get_bios(adev))
+				return -EINVAL;
+
+			r = amdgpu_atombios_init(adev);
+			if (r) {
+				dev_err(adev->dev, "amdgpu_atombios_init failed\n");
+				amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
+				return r;
+			}
+
+			/* get pf2vf msg info at its earliest time */
+			if (amdgpu_sriov_vf(adev))
+				amdgpu_virt_init_data_exchange(adev);
+
+		}
 	}
 
 	adev->cg_flags &= amdgpu_cg_mask;
 	adev->pg_flags &= amdgpu_pg_mask;
 
 	return 0;
+}
| 2063 | + |
---|
| 2064 | +static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) |
---|
| 2065 | +{ |
---|
| 2066 | + int i, r; |
---|
| 2067 | + |
---|
| 2068 | + for (i = 0; i < adev->num_ip_blocks; i++) { |
---|
| 2069 | + if (!adev->ip_blocks[i].status.sw) |
---|
| 2070 | + continue; |
---|
| 2071 | + if (adev->ip_blocks[i].status.hw) |
---|
| 2072 | + continue; |
---|
| 2073 | + if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || |
---|
| 2074 | + (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || |
---|
| 2075 | + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { |
---|
| 2076 | + r = adev->ip_blocks[i].version->funcs->hw_init(adev); |
---|
| 2077 | + if (r) { |
---|
| 2078 | + DRM_ERROR("hw_init of IP block <%s> failed %d\n", |
---|
| 2079 | + adev->ip_blocks[i].version->funcs->name, r); |
---|
| 2080 | + return r; |
---|
| 2081 | + } |
---|
| 2082 | + adev->ip_blocks[i].status.hw = true; |
---|
| 2083 | + } |
---|
| 2084 | + } |
---|
| 2085 | + |
---|
| 2086 | + return 0; |
---|
| 2087 | +} |
---|
| 2088 | + |
---|
| 2089 | +static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) |
---|
| 2090 | +{ |
---|
| 2091 | + int i, r; |
---|
| 2092 | + |
---|
| 2093 | + for (i = 0; i < adev->num_ip_blocks; i++) { |
---|
| 2094 | + if (!adev->ip_blocks[i].status.sw) |
---|
| 2095 | + continue; |
---|
| 2096 | + if (adev->ip_blocks[i].status.hw) |
---|
| 2097 | + continue; |
---|
| 2098 | + r = adev->ip_blocks[i].version->funcs->hw_init(adev); |
---|
| 2099 | + if (r) { |
---|
| 2100 | + DRM_ERROR("hw_init of IP block <%s> failed %d\n", |
---|
| 2101 | + adev->ip_blocks[i].version->funcs->name, r); |
---|
| 2102 | + return r; |
---|
| 2103 | + } |
---|
| 2104 | + adev->ip_blocks[i].status.hw = true; |
---|
| 2105 | + } |
---|
| 2106 | + |
---|
| 2107 | + return 0; |
---|
| 2108 | +} |
---|
| 2109 | + |
---|
| 2110 | +static int amdgpu_device_fw_loading(struct amdgpu_device *adev) |
---|
| 2111 | +{ |
---|
| 2112 | + int r = 0; |
---|
| 2113 | + int i; |
---|
| 2114 | + uint32_t smu_version; |
---|
| 2115 | + |
---|
| 2116 | + if (adev->asic_type >= CHIP_VEGA10) { |
---|
| 2117 | + for (i = 0; i < adev->num_ip_blocks; i++) { |
---|
| 2118 | + if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) |
---|
| 2119 | + continue; |
---|
| 2120 | + |
---|
| 2121 | + /* no need to do the fw loading again if already done*/ |
---|
| 2122 | + if (adev->ip_blocks[i].status.hw == true) |
---|
| 2123 | + break; |
---|
| 2124 | + |
---|
| 2125 | + if (amdgpu_in_reset(adev) || adev->in_suspend) { |
---|
| 2126 | + r = adev->ip_blocks[i].version->funcs->resume(adev); |
---|
| 2127 | + if (r) { |
---|
| 2128 | + DRM_ERROR("resume of IP block <%s> failed %d\n", |
---|
| 2129 | + adev->ip_blocks[i].version->funcs->name, r); |
---|
| 2130 | + return r; |
---|
| 2131 | + } |
---|
| 2132 | + } else { |
---|
| 2133 | + r = adev->ip_blocks[i].version->funcs->hw_init(adev); |
---|
| 2134 | + if (r) { |
---|
| 2135 | + DRM_ERROR("hw_init of IP block <%s> failed %d\n", |
---|
| 2136 | + adev->ip_blocks[i].version->funcs->name, r); |
---|
| 2137 | + return r; |
---|
| 2138 | + } |
---|
| 2139 | + } |
---|
| 2140 | + |
---|
| 2141 | + adev->ip_blocks[i].status.hw = true; |
---|
| 2142 | + break; |
---|
| 2143 | + } |
---|
| 2144 | + } |
---|
| 2145 | + |
---|
| 2146 | + if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) |
---|
| 2147 | + r = amdgpu_pm_load_smu_firmware(adev, &smu_version); |
---|
| 2148 | + |
---|
| 2149 | + return r; |
---|
1582 | 2150 | } |
---|
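The two hw-init phases and the firmware-loading step above encode a fixed bring-up order, and amdgpu_device_ip_init() below invokes them in exactly that order. A minimal sketch of the sequence (the wrapper function is illustrative only, not part of the driver):

```c
/* Illustrative wrapper, not driver code: the order amdgpu_device_ip_init()
 * uses below. COMMON and IH (plus PSP under SR-IOV) come up first, then
 * firmware is loaded, and only then are the remaining blocks initialized.
 */
static int example_ip_hw_bringup(struct amdgpu_device *adev)
{
	int r;

	r = amdgpu_device_ip_hw_init_phase1(adev); /* COMMON, IH, PSP (SR-IOV) */
	if (r)
		return r;

	r = amdgpu_device_fw_loading(adev);        /* PSP/SMU firmware */
	if (r)
		return r;

	return amdgpu_device_ip_hw_init_phase2(adev); /* all remaining blocks */
}
```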
1583 | 2151 | |
---|
1584 | 2152 | /** |
---|
.. | .. |
---|
1596 | 2164 | { |
---|
1597 | 2165 | int i, r; |
---|
1598 | 2166 | |
---|
| 2167 | + r = amdgpu_ras_init(adev); |
---|
| 2168 | + if (r) |
---|
| 2169 | + return r; |
---|
| 2170 | + |
---|
1599 | 2171 | for (i = 0; i < adev->num_ip_blocks; i++) { |
---|
1600 | 2172 | if (!adev->ip_blocks[i].status.valid) |
---|
1601 | 2173 | continue; |
---|
.. | .. |
---|
1603 | 2175 | if (r) { |
---|
1604 | 2176 | DRM_ERROR("sw_init of IP block <%s> failed %d\n", |
---|
1605 | 2177 | adev->ip_blocks[i].version->funcs->name, r); |
---|
1606 | | - return r; |
---|
| 2178 | + goto init_failed; |
---|
1607 | 2179 | } |
---|
1608 | 2180 | adev->ip_blocks[i].status.sw = true; |
---|
1609 | 2181 | |
---|
1610 | 2182 | /* need to do gmc hw init early so we can allocate gpu mem */ |
---|
1611 | 2183 | if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { |
---|
| 2184 | + /* Try to reserve bad pages early */ |
---|
| 2185 | + if (amdgpu_sriov_vf(adev)) |
---|
| 2186 | + amdgpu_virt_exchange_data(adev); |
---|
| 2187 | + |
---|
1612 | 2188 | r = amdgpu_device_vram_scratch_init(adev); |
---|
1613 | 2189 | if (r) { |
---|
1614 | 2190 | DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); |
---|
1615 | | - return r; |
---|
| 2191 | + goto init_failed; |
---|
1616 | 2192 | } |
---|
1617 | 2193 | r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); |
---|
1618 | 2194 | if (r) { |
---|
1619 | 2195 | DRM_ERROR("hw_init %d failed %d\n", i, r); |
---|
1620 | | - return r; |
---|
| 2196 | + goto init_failed; |
---|
1621 | 2197 | } |
---|
1622 | 2198 | r = amdgpu_device_wb_init(adev); |
---|
1623 | 2199 | if (r) { |
---|
1624 | 2200 | DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); |
---|
1625 | | - return r; |
---|
| 2201 | + goto init_failed; |
---|
1626 | 2202 | } |
---|
1627 | 2203 | adev->ip_blocks[i].status.hw = true; |
---|
1628 | 2204 | |
---|
1629 | 2205 | /* right after GMC hw init, we create CSA */ |
---|
1630 | | - if (amdgpu_sriov_vf(adev)) { |
---|
1631 | | - r = amdgpu_allocate_static_csa(adev); |
---|
| 2206 | + if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) { |
---|
| 2207 | + r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, |
---|
| 2208 | + AMDGPU_GEM_DOMAIN_VRAM, |
---|
| 2209 | + AMDGPU_CSA_SIZE); |
---|
1632 | 2210 | if (r) { |
---|
1633 | 2211 | DRM_ERROR("allocate CSA failed %d\n", r); |
---|
1634 | | - return r; |
---|
| 2212 | + goto init_failed; |
---|
1635 | 2213 | } |
---|
1636 | 2214 | } |
---|
1637 | 2215 | } |
---|
1638 | 2216 | } |
---|
1639 | 2217 | |
---|
1640 | | - for (i = 0; i < adev->num_ip_blocks; i++) { |
---|
1641 | | - if (!adev->ip_blocks[i].status.sw) |
---|
1642 | | - continue; |
---|
1643 | | - if (adev->ip_blocks[i].status.hw) |
---|
1644 | | - continue; |
---|
1645 | | - r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); |
---|
1646 | | - if (r) { |
---|
1647 | | - DRM_ERROR("hw_init of IP block <%s> failed %d\n", |
---|
1648 | | - adev->ip_blocks[i].version->funcs->name, r); |
---|
1649 | | - return r; |
---|
1650 | | - } |
---|
1651 | | - adev->ip_blocks[i].status.hw = true; |
---|
| 2218 | + if (amdgpu_sriov_vf(adev)) |
---|
| 2219 | + amdgpu_virt_init_data_exchange(adev); |
---|
| 2220 | + |
---|
| 2221 | + r = amdgpu_ib_pool_init(adev); |
---|
| 2222 | + if (r) { |
---|
| 2223 | + dev_err(adev->dev, "IB initialization failed (%d).\n", r); |
---|
| 2224 | + amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); |
---|
| 2225 | + goto init_failed; |
---|
1652 | 2226 | } |
---|
1653 | 2227 | |
---|
| 2228 | + r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete */
---|
| 2229 | + if (r) |
---|
| 2230 | + goto init_failed; |
---|
| 2231 | + |
---|
| 2232 | + r = amdgpu_device_ip_hw_init_phase1(adev); |
---|
| 2233 | + if (r) |
---|
| 2234 | + goto init_failed; |
---|
| 2235 | + |
---|
| 2236 | + r = amdgpu_device_fw_loading(adev); |
---|
| 2237 | + if (r) |
---|
| 2238 | + goto init_failed; |
---|
| 2239 | + |
---|
| 2240 | + r = amdgpu_device_ip_hw_init_phase2(adev); |
---|
| 2241 | + if (r) |
---|
| 2242 | + goto init_failed; |
---|
| 2243 | + |
---|
| 2244 | + /*
---|
| 2245 | + * retired pages will be loaded from eeprom and reserved here;
---|
| 2246 | + * this must be called after amdgpu_device_ip_hw_init_phase2, since
---|
| 2247 | + * on some ASICs the RAS EEPROM code relies on the SMU being fully
---|
| 2248 | + * functional for I2C communication, which is only true at this point.
---|
| 2249 | + *
---|
| 2250 | + * amdgpu_ras_recovery_init may fail, but the caller only cares about
---|
| 2251 | + * failures caused by a bad gpu, and stops the amdgpu init process
---|
| 2252 | + * accordingly. For any other failure it still releases all the
---|
| 2253 | + * resources and prints an error message, rather than returning a
---|
| 2254 | + * negative value to the upper level.
---|
| 2255 | + *
---|
| 2256 | + * Note: theoretically, this should be called before all vram allocations
---|
| 2257 | + * to protect retired pages from being reused.
---|
| 2258 | + */
---|
| 2259 | + r = amdgpu_ras_recovery_init(adev); |
---|
| 2260 | + if (r) |
---|
| 2261 | + goto init_failed; |
---|
| 2262 | + |
---|
| 2263 | + if (adev->gmc.xgmi.num_physical_nodes > 1) |
---|
| 2264 | + amdgpu_xgmi_add_device(adev); |
---|
1654 | 2265 | amdgpu_amdkfd_device_init(adev); |
---|
1655 | 2266 | |
---|
1656 | | - if (amdgpu_sriov_vf(adev)) { |
---|
1657 | | - amdgpu_virt_init_data_exchange(adev); |
---|
1658 | | - amdgpu_virt_release_full_gpu(adev, true); |
---|
1659 | | - } |
---|
| 2267 | + amdgpu_fru_get_product_info(adev); |
---|
1660 | 2268 | |
---|
1661 | | - return 0; |
---|
| 2269 | +init_failed: |
---|
| 2270 | + if (amdgpu_sriov_vf(adev)) |
---|
| 2271 | + amdgpu_virt_release_full_gpu(adev, true); |
---|
| 2272 | + |
---|
| 2273 | + return r; |
---|
1662 | 2274 | } |
---|
1663 | 2275 | |
---|
1664 | 2276 | /** |
---|
.. | .. |
---|
1687 | 2299 | */ |
---|
1688 | 2300 | static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) |
---|
1689 | 2301 | { |
---|
1690 | | - return !!memcmp(adev->gart.ptr, adev->reset_magic, |
---|
1691 | | - AMDGPU_RESET_MAGIC_NUM); |
---|
| 2302 | + if (memcmp(adev->gart.ptr, adev->reset_magic, |
---|
| 2303 | + AMDGPU_RESET_MAGIC_NUM)) |
---|
| 2304 | + return true; |
---|
| 2305 | + |
---|
| 2306 | + if (!amdgpu_in_reset(adev)) |
---|
| 2307 | + return false; |
---|
| 2308 | + |
---|
| 2309 | + /* |
---|
| 2310 | + * For all ASICs with baco/mode1 reset, the VRAM is |
---|
| 2311 | + * always assumed to be lost. |
---|
| 2312 | + */ |
---|
| 2313 | + switch (amdgpu_asic_reset_method(adev)) { |
---|
| 2314 | + case AMD_RESET_METHOD_BACO: |
---|
| 2315 | + case AMD_RESET_METHOD_MODE1: |
---|
| 2316 | + return true; |
---|
| 2317 | + default: |
---|
| 2318 | + return false; |
---|
| 2319 | + } |
---|
1692 | 2320 | } |
---|
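The memcmp above pairs with amdgpu_device_fill_reset_magic() (called from late init below), which snapshots the first AMDGPU_RESET_MAGIC_NUM bytes behind adev->gart.ptr into adev->reset_magic; if the live bytes no longer match the snapshot after a reset, VRAM contents were lost. A self-contained userspace sketch of the same snapshot-and-compare idea, with all names hypothetical:

```c
/* Userspace sketch of the snapshot-and-compare idea used above.
 * vram_window and reset_magic are hypothetical stand-ins for
 * adev->gart.ptr and adev->reset_magic.
 */
#include <stdbool.h>
#include <string.h>

#define RESET_MAGIC_NUM 64

static unsigned char vram_window[RESET_MAGIC_NUM]; /* VRAM-backed bytes */
static unsigned char reset_magic[RESET_MAGIC_NUM]; /* CPU-side snapshot */

static void fill_reset_magic(void)
{
	/* snapshot the VRAM-backed bytes while they are known-good */
	memcpy(reset_magic, vram_window, RESET_MAGIC_NUM);
}

static bool check_vram_lost(void)
{
	/* after a reset: any mismatch means VRAM contents were lost */
	return memcmp(vram_window, reset_magic, RESET_MAGIC_NUM) != 0;
}
```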
1693 | 2321 | |
---|
1694 | 2322 | /** |
---|
1695 | | - * amdgpu_device_ip_late_set_cg_state - late init for clockgating |
---|
| 2323 | + * amdgpu_device_set_cg_state - set clockgating for amdgpu device |
---|
1696 | 2324 | * |
---|
1697 | 2325 | * @adev: amdgpu_device pointer |
---|
| 2326 | + * @state: clockgating state (gate or ungate) |
---|
1698 | 2327 | * |
---|
1699 | | - * Late initialization pass enabling clockgating for hardware IPs. |
---|
1700 | 2328 | * The list of all the hardware IPs that make up the asic is walked and the |
---|
1701 | | - * set_clockgating_state callbacks are run. This stage is run late |
---|
1702 | | - * in the init process. |
---|
| 2329 | + * set_clockgating_state callbacks are run.
---|
| 2330 | + * During late init this pass enables clockgating for hardware IPs;
---|
| 2331 | + * during fini or suspend it disables clockgating for hardware IPs.
---|
1703 | 2332 | * Returns 0 on success, negative error code on failure. |
---|
1704 | 2333 | */ |
---|
1705 | | -static int amdgpu_device_ip_late_set_cg_state(struct amdgpu_device *adev) |
---|
| 2334 | + |
---|
| 2335 | +static int amdgpu_device_set_cg_state(struct amdgpu_device *adev, |
---|
| 2336 | + enum amd_clockgating_state state) |
---|
1706 | 2337 | { |
---|
1707 | | - int i = 0, r; |
---|
| 2338 | + int i, j, r; |
---|
1708 | 2339 | |
---|
1709 | 2340 | if (amdgpu_emu_mode == 1) |
---|
1710 | 2341 | return 0; |
---|
1711 | 2342 | |
---|
1712 | | - for (i = 0; i < adev->num_ip_blocks; i++) { |
---|
1713 | | - if (!adev->ip_blocks[i].status.valid) |
---|
| 2343 | + for (j = 0; j < adev->num_ip_blocks; j++) { |
---|
| 2344 | + i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; |
---|
| 2345 | + if (!adev->ip_blocks[i].status.late_initialized) |
---|
1714 | 2346 | continue; |
---|
1715 | 2347 | /* skip CG for VCE/UVD, it's handled specially */ |
---|
1716 | 2348 | if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && |
---|
1717 | 2349 | adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && |
---|
1718 | 2350 | adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && |
---|
| 2351 | + adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && |
---|
1719 | 2352 | adev->ip_blocks[i].version->funcs->set_clockgating_state) { |
---|
1720 | 2353 | /* enable clockgating to save power */ |
---|
1721 | 2354 | r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, |
---|
1722 | | - AMD_CG_STATE_GATE); |
---|
| 2355 | + state); |
---|
1723 | 2356 | if (r) { |
---|
1724 | 2357 | DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", |
---|
1725 | 2358 | adev->ip_blocks[i].version->funcs->name, r); |
---|
.. | .. |
---|
1731 | 2364 | return 0; |
---|
1732 | 2365 | } |
---|
1733 | 2366 | |
---|
1734 | | -static int amdgpu_device_ip_late_set_pg_state(struct amdgpu_device *adev) |
---|
| 2367 | +static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state) |
---|
1735 | 2368 | { |
---|
1736 | | - int i = 0, r; |
---|
| 2369 | + int i, j, r; |
---|
1737 | 2370 | |
---|
1738 | 2371 | if (amdgpu_emu_mode == 1) |
---|
1739 | 2372 | return 0; |
---|
1740 | 2373 | |
---|
1741 | | - for (i = 0; i < adev->num_ip_blocks; i++) { |
---|
1742 | | - if (!adev->ip_blocks[i].status.valid) |
---|
| 2374 | + for (j = 0; j < adev->num_ip_blocks; j++) { |
---|
| 2375 | + i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; |
---|
| 2376 | + if (!adev->ip_blocks[i].status.late_initialized) |
---|
1743 | 2377 | continue; |
---|
1744 | 2378 | /* skip PG for VCE/UVD, it's handled specially */
---|
1745 | 2379 | if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && |
---|
1746 | 2380 | adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && |
---|
1747 | 2381 | adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && |
---|
| 2382 | + adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && |
---|
1748 | 2383 | adev->ip_blocks[i].version->funcs->set_powergating_state) { |
---|
1749 | 2384 | /* enable powergating to save power */ |
---|
1750 | 2385 | r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, |
---|
1751 | | - AMD_PG_STATE_GATE); |
---|
| 2386 | + state); |
---|
1752 | 2387 | if (r) { |
---|
1753 | 2388 | DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", |
---|
1754 | 2389 | adev->ip_blocks[i].version->funcs->name, r); |
---|
.. | .. |
---|
1757 | 2392 | } |
---|
1758 | 2393 | } |
---|
1759 | 2394 | return 0; |
---|
| 2395 | +} |
---|
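Both helpers above derive the visit order from the target state: gating walks the IP blocks front to back (init order), ungating back to front (teardown order), so a block is always gated after, and ungated before, the blocks it depends on. A standalone trace of the index mapping (illustrative only):

```c
/* Illustrative only: the index mapping used by the two helpers above.
 * With num_ip_blocks = 4, gating visits 0,1,2,3 and ungating 3,2,1,0.
 */
#include <stdio.h>

int main(void)
{
	const int num_ip_blocks = 4;
	const int gate = 1;	/* stand-in for AMD_CG_STATE_GATE */
	int state, j, i;

	for (state = gate; state >= 0; state--) {
		printf("%s:", state == gate ? "gate  " : "ungate");
		for (j = 0; j < num_ip_blocks; j++) {
			i = state == gate ? j : num_ip_blocks - j - 1;
			printf(" %d", i);
		}
		printf("\n");
	}
	return 0;
}
```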
| 2396 | + |
---|
| 2397 | +static int amdgpu_device_enable_mgpu_fan_boost(void) |
---|
| 2398 | +{ |
---|
| 2399 | + struct amdgpu_gpu_instance *gpu_ins; |
---|
| 2400 | + struct amdgpu_device *adev; |
---|
| 2401 | + int i, ret = 0; |
---|
| 2402 | + |
---|
| 2403 | + mutex_lock(&mgpu_info.mutex); |
---|
| 2404 | + |
---|
| 2405 | + /* |
---|
| 2406 | + * MGPU fan boost feature should be enabled |
---|
| 2407 | + * only when there are two or more dGPUs in |
---|
| 2408 | + * the system |
---|
| 2409 | + */ |
---|
| 2410 | + if (mgpu_info.num_dgpu < 2) |
---|
| 2411 | + goto out; |
---|
| 2412 | + |
---|
| 2413 | + for (i = 0; i < mgpu_info.num_dgpu; i++) { |
---|
| 2414 | + gpu_ins = &(mgpu_info.gpu_ins[i]); |
---|
| 2415 | + adev = gpu_ins->adev; |
---|
| 2416 | + if (!(adev->flags & AMD_IS_APU) && |
---|
| 2417 | + !gpu_ins->mgpu_fan_enabled) { |
---|
| 2418 | + ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); |
---|
| 2419 | + if (ret) |
---|
| 2420 | + break; |
---|
| 2421 | + |
---|
| 2422 | + gpu_ins->mgpu_fan_enabled = 1; |
---|
| 2423 | + } |
---|
| 2424 | + } |
---|
| 2425 | + |
---|
| 2426 | +out: |
---|
| 2427 | + mutex_unlock(&mgpu_info.mutex); |
---|
| 2428 | + |
---|
| 2429 | + return ret; |
---|
1760 | 2430 | } |
---|
1761 | 2431 | |
---|
1762 | 2432 | /** |
---|
.. | .. |
---|
1773 | 2443 | */ |
---|
1774 | 2444 | static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) |
---|
1775 | 2445 | { |
---|
| 2446 | + struct amdgpu_gpu_instance *gpu_instance; |
---|
1776 | 2447 | int i = 0, r; |
---|
1777 | 2448 | |
---|
1778 | 2449 | for (i = 0; i < adev->num_ip_blocks; i++) { |
---|
1779 | | - if (!adev->ip_blocks[i].status.valid) |
---|
| 2450 | + if (!adev->ip_blocks[i].status.hw) |
---|
1780 | 2451 | continue; |
---|
1781 | 2452 | if (adev->ip_blocks[i].version->funcs->late_init) { |
---|
1782 | 2453 | r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); |
---|
.. | .. |
---|
1785 | 2456 | adev->ip_blocks[i].version->funcs->name, r); |
---|
1786 | 2457 | return r; |
---|
1787 | 2458 | } |
---|
1788 | | - adev->ip_blocks[i].status.late_initialized = true; |
---|
1789 | 2459 | } |
---|
| 2460 | + adev->ip_blocks[i].status.late_initialized = true; |
---|
1790 | 2461 | } |
---|
1791 | 2462 | |
---|
1792 | | - amdgpu_device_ip_late_set_cg_state(adev); |
---|
1793 | | - amdgpu_device_ip_late_set_pg_state(adev); |
---|
| 2463 | + amdgpu_ras_set_error_query_ready(adev, true); |
---|
1794 | 2464 | |
---|
1795 | | - queue_delayed_work(system_wq, &adev->late_init_work, |
---|
1796 | | - msecs_to_jiffies(AMDGPU_RESUME_MS)); |
---|
| 2465 | + amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); |
---|
| 2466 | + amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); |
---|
1797 | 2467 | |
---|
1798 | 2468 | amdgpu_device_fill_reset_magic(adev); |
---|
| 2469 | + |
---|
| 2470 | + r = amdgpu_device_enable_mgpu_fan_boost(); |
---|
| 2471 | + if (r) |
---|
| 2472 | + DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); |
---|
| 2473 | + |
---|
| 2474 | + |
---|
| 2475 | + if (adev->gmc.xgmi.num_physical_nodes > 1) { |
---|
| 2476 | + mutex_lock(&mgpu_info.mutex); |
---|
| 2477 | + |
---|
| 2478 | + /* |
---|
| 2479 | + * Reset device p-state to low as this was booted with high. |
---|
| 2480 | + * |
---|
| 2481 | + * This should be performed only after all devices from the same |
---|
| 2482 | + * hive get initialized. |
---|
| 2483 | + * |
---|
| 2484 | + * However, it's not known in advance how many devices are in the
---|
| 2485 | + * hive, as they are counted one by one during device initialization.
---|
| 2486 | + *
---|
| 2487 | + * So we wait until all XGMI-interlinked devices have initialized.
---|
| 2488 | + * This may add some delay, as those devices may come from
---|
| 2489 | + * different hives. But that should be OK.
---|
| 2490 | + */ |
---|
| 2491 | + if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { |
---|
| 2492 | + for (i = 0; i < mgpu_info.num_gpu; i++) { |
---|
| 2493 | + gpu_instance = &(mgpu_info.gpu_ins[i]); |
---|
| 2494 | + if (gpu_instance->adev->flags & AMD_IS_APU) |
---|
| 2495 | + continue; |
---|
| 2496 | + |
---|
| 2497 | + r = amdgpu_xgmi_set_pstate(gpu_instance->adev, |
---|
| 2498 | + AMDGPU_XGMI_PSTATE_MIN); |
---|
| 2499 | + if (r) { |
---|
| 2500 | + DRM_ERROR("pstate setting failed (%d).\n", r); |
---|
| 2501 | + break; |
---|
| 2502 | + } |
---|
| 2503 | + } |
---|
| 2504 | + } |
---|
| 2505 | + |
---|
| 2506 | + mutex_unlock(&mgpu_info.mutex); |
---|
| 2507 | + } |
---|
1799 | 2508 | |
---|
1800 | 2509 | return 0; |
---|
1801 | 2510 | } |
---|
.. | .. |
---|
1815 | 2524 | { |
---|
1816 | 2525 | int i, r; |
---|
1817 | 2526 | |
---|
| 2527 | + if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) |
---|
| 2528 | + amdgpu_virt_release_ras_err_handler_data(adev); |
---|
| 2529 | + |
---|
| 2530 | + amdgpu_ras_pre_fini(adev); |
---|
| 2531 | + |
---|
| 2532 | + if (adev->gmc.xgmi.num_physical_nodes > 1) |
---|
| 2533 | + amdgpu_xgmi_remove_device(adev); |
---|
| 2534 | + |
---|
| 2535 | + amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); |
---|
| 2536 | + amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); |
---|
| 2537 | + |
---|
1818 | 2538 | amdgpu_amdkfd_device_fini(adev); |
---|
| 2539 | + |
---|
1819 | 2540 | /* need to disable SMC first */ |
---|
1820 | 2541 | for (i = 0; i < adev->num_ip_blocks; i++) { |
---|
1821 | 2542 | if (!adev->ip_blocks[i].status.hw) |
---|
1822 | 2543 | continue; |
---|
1823 | | - if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC && |
---|
1824 | | - adev->ip_blocks[i].version->funcs->set_clockgating_state) { |
---|
1825 | | - /* ungate blocks before hw fini so that we can shutdown the blocks safely */ |
---|
1826 | | - r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, |
---|
1827 | | - AMD_CG_STATE_UNGATE); |
---|
1828 | | - if (r) { |
---|
1829 | | - DRM_ERROR("set_clockgating_state(ungate) of IP block <%s> failed %d\n", |
---|
1830 | | - adev->ip_blocks[i].version->funcs->name, r); |
---|
1831 | | - return r; |
---|
1832 | | - } |
---|
1833 | | - if (adev->powerplay.pp_funcs->set_powergating_by_smu) |
---|
1834 | | - amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, false); |
---|
| 2544 | + if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { |
---|
1835 | 2545 | r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); |
---|
1836 | 2546 | /* XXX handle errors */ |
---|
1837 | 2547 | if (r) { |
---|
.. | .. |
---|
1846 | 2556 | for (i = adev->num_ip_blocks - 1; i >= 0; i--) { |
---|
1847 | 2557 | if (!adev->ip_blocks[i].status.hw) |
---|
1848 | 2558 | continue; |
---|
1849 | | - |
---|
1850 | | - if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && |
---|
1851 | | - adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && |
---|
1852 | | - adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && |
---|
1853 | | - adev->ip_blocks[i].version->funcs->set_clockgating_state) { |
---|
1854 | | - /* ungate blocks before hw fini so that we can shutdown the blocks safely */ |
---|
1855 | | - r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, |
---|
1856 | | - AMD_CG_STATE_UNGATE); |
---|
1857 | | - if (r) { |
---|
1858 | | - DRM_ERROR("set_clockgating_state(ungate) of IP block <%s> failed %d\n", |
---|
1859 | | - adev->ip_blocks[i].version->funcs->name, r); |
---|
1860 | | - return r; |
---|
1861 | | - } |
---|
1862 | | - } |
---|
1863 | 2559 | |
---|
1864 | 2560 | r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); |
---|
1865 | 2561 | /* XXX handle errors */ |
---|
.. | .. |
---|
1877 | 2573 | continue; |
---|
1878 | 2574 | |
---|
1879 | 2575 | if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { |
---|
1880 | | - amdgpu_free_static_csa(adev); |
---|
| 2576 | + amdgpu_ucode_free_bo(adev); |
---|
| 2577 | + amdgpu_free_static_csa(&adev->virt.csa_obj); |
---|
1881 | 2578 | amdgpu_device_wb_fini(adev); |
---|
1882 | 2579 | amdgpu_device_vram_scratch_fini(adev); |
---|
| 2580 | + amdgpu_ib_pool_fini(adev); |
---|
1883 | 2581 | } |
---|
1884 | 2582 | |
---|
1885 | 2583 | r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); |
---|
.. | .. |
---|
1900 | 2598 | adev->ip_blocks[i].status.late_initialized = false; |
---|
1901 | 2599 | } |
---|
1902 | 2600 | |
---|
| 2601 | + amdgpu_ras_fini(adev); |
---|
| 2602 | + |
---|
1903 | 2603 | if (amdgpu_sriov_vf(adev)) |
---|
1904 | 2604 | if (amdgpu_virt_release_full_gpu(adev, false)) |
---|
1905 | 2605 | DRM_ERROR("failed to release exclusive mode on fini\n"); |
---|
.. | .. |
---|
1908 | 2608 | } |
---|
1909 | 2609 | |
---|
1910 | 2610 | /** |
---|
1911 | | - * amdgpu_device_ip_late_init_func_handler - work handler for clockgating |
---|
| 2611 | + * amdgpu_device_delayed_init_work_handler - work handler for IB tests |
---|
1912 | 2612 | * |
---|
1913 | | - * @work: work_struct |
---|
1914 | | - * |
---|
1915 | | - * Work handler for amdgpu_device_ip_late_set_cg_state. We put the |
---|
1916 | | - * clockgating setup into a worker thread to speed up driver init and |
---|
1917 | | - * resume from suspend. |
---|
| 2613 | + * @work: work_struct. |
---|
1918 | 2614 | */ |
---|
1919 | | -static void amdgpu_device_ip_late_init_func_handler(struct work_struct *work) |
---|
| 2615 | +static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) |
---|
1920 | 2616 | { |
---|
1921 | 2617 | struct amdgpu_device *adev = |
---|
1922 | | - container_of(work, struct amdgpu_device, late_init_work.work); |
---|
| 2618 | + container_of(work, struct amdgpu_device, delayed_init_work.work); |
---|
1923 | 2619 | int r; |
---|
1924 | 2620 | |
---|
1925 | 2621 | r = amdgpu_ib_ring_tests(adev); |
---|
1926 | 2622 | if (r) |
---|
1927 | 2623 | DRM_ERROR("ib ring test failed (%d).\n", r); |
---|
| 2624 | +} |
---|
| 2625 | + |
---|
| 2626 | +static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) |
---|
| 2627 | +{ |
---|
| 2628 | + struct amdgpu_device *adev = |
---|
| 2629 | + container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); |
---|
| 2630 | + |
---|
| 2631 | + WARN_ON_ONCE(adev->gfx.gfx_off_state); |
---|
| 2632 | + WARN_ON_ONCE(adev->gfx.gfx_off_req_count); |
---|
| 2633 | + |
---|
| 2634 | + if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) |
---|
| 2635 | + adev->gfx.gfx_off_state = true; |
---|
1928 | 2636 | } |
---|
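The handler above is the deferred half of a reference-counted GFXOFF scheme: gfx_off_req_count starts at 1 (see amdgpu_device_init below), callers that need the GFX core keep it non-zero, and the delayed work is only armed once the count reaches zero. A sketch of the caller side under that assumed contract; example_gfx_off_ctrl is illustrative, not the driver's actual interface:

```c
/* Illustrative sketch of the request-count pattern assumed above;
 * not the driver's actual gfx_off control function.
 */
static void example_gfx_off_ctrl(struct amdgpu_device *adev, bool enable)
{
	mutex_lock(&adev->gfx.gfx_off_mutex);

	if (!enable)
		adev->gfx.gfx_off_req_count++;
	else if (adev->gfx.gfx_off_req_count > 0)
		adev->gfx.gfx_off_req_count--;

	if (enable && !adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
		/* last user gone: arm the delayed power-down above */
		schedule_delayed_work(&adev->gfx.gfx_off_delay_work,
				      msecs_to_jiffies(100));
	} else if (!enable && adev->gfx.gfx_off_state) {
		/* new user: bring the GFX core back up immediately */
		if (!amdgpu_dpm_set_powergating_by_smu(adev,
						       AMD_IP_BLOCK_TYPE_GFX,
						       false))
			adev->gfx.gfx_off_state = false;
	}

	mutex_unlock(&adev->gfx.gfx_off_mutex);
}
```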
1929 | 2637 | |
---|
1930 | 2638 | /** |
---|
.. | .. |
---|
1942 | 2650 | { |
---|
1943 | 2651 | int i, r; |
---|
1944 | 2652 | |
---|
1945 | | - if (amdgpu_sriov_vf(adev)) |
---|
1946 | | - amdgpu_virt_request_full_gpu(adev, false); |
---|
| 2653 | + amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); |
---|
| 2654 | + amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); |
---|
1947 | 2655 | |
---|
1948 | 2656 | for (i = adev->num_ip_blocks - 1; i >= 0; i--) { |
---|
1949 | 2657 | if (!adev->ip_blocks[i].status.valid) |
---|
1950 | 2658 | continue; |
---|
1951 | | - /* displays are handled separately */ |
---|
1952 | | - if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) { |
---|
1953 | | - /* ungate blocks so that suspend can properly shut them down */ |
---|
1954 | | - if (adev->ip_blocks[i].version->funcs->set_clockgating_state) { |
---|
1955 | | - r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, |
---|
1956 | | - AMD_CG_STATE_UNGATE); |
---|
1957 | | - if (r) { |
---|
1958 | | - DRM_ERROR("set_clockgating_state(ungate) of IP block <%s> failed %d\n", |
---|
1959 | | - adev->ip_blocks[i].version->funcs->name, r); |
---|
1960 | | - } |
---|
1961 | | - } |
---|
1962 | | - /* XXX handle errors */ |
---|
1963 | | - r = adev->ip_blocks[i].version->funcs->suspend(adev); |
---|
1964 | | - /* XXX handle errors */ |
---|
1965 | | - if (r) { |
---|
1966 | | - DRM_ERROR("suspend of IP block <%s> failed %d\n", |
---|
1967 | | - adev->ip_blocks[i].version->funcs->name, r); |
---|
1968 | | - } |
---|
1969 | | - } |
---|
1970 | | - } |
---|
1971 | 2659 | |
---|
1972 | | - if (amdgpu_sriov_vf(adev)) |
---|
1973 | | - amdgpu_virt_release_full_gpu(adev, false); |
---|
| 2660 | + /* displays are handled separately */ |
---|
| 2661 | + if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) |
---|
| 2662 | + continue; |
---|
| 2663 | + |
---|
| 2664 | + /* XXX handle errors */ |
---|
| 2665 | + r = adev->ip_blocks[i].version->funcs->suspend(adev); |
---|
| 2666 | + /* XXX handle errors */ |
---|
| 2667 | + if (r) { |
---|
| 2668 | + DRM_ERROR("suspend of IP block <%s> failed %d\n", |
---|
| 2669 | + adev->ip_blocks[i].version->funcs->name, r); |
---|
| 2670 | + return r; |
---|
| 2671 | + } |
---|
| 2672 | + |
---|
| 2673 | + adev->ip_blocks[i].status.hw = false; |
---|
| 2674 | + } |
---|
1974 | 2675 | |
---|
1975 | 2676 | return 0; |
---|
1976 | 2677 | } |
---|
.. | .. |
---|
1990 | 2691 | { |
---|
1991 | 2692 | int i, r; |
---|
1992 | 2693 | |
---|
1993 | | - if (amdgpu_sriov_vf(adev)) |
---|
1994 | | - amdgpu_virt_request_full_gpu(adev, false); |
---|
1995 | | - |
---|
1996 | | - /* ungate SMC block first */ |
---|
1997 | | - r = amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_SMC, |
---|
1998 | | - AMD_CG_STATE_UNGATE); |
---|
1999 | | - if (r) { |
---|
2000 | | - DRM_ERROR("set_clockgating_state(ungate) SMC failed %d\n", r); |
---|
2001 | | - } |
---|
2002 | | - |
---|
2003 | | - /* call smu to disable gfx off feature first when suspend */ |
---|
2004 | | - if (adev->powerplay.pp_funcs->set_powergating_by_smu) |
---|
2005 | | - amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, false); |
---|
2006 | | - |
---|
2007 | 2694 | for (i = adev->num_ip_blocks - 1; i >= 0; i--) { |
---|
2008 | 2695 | if (!adev->ip_blocks[i].status.valid) |
---|
2009 | 2696 | continue; |
---|
2010 | 2697 | /* displays are handled in phase1 */ |
---|
2011 | 2698 | if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) |
---|
2012 | 2699 | continue; |
---|
2013 | | - /* ungate blocks so that suspend can properly shut them down */ |
---|
2014 | | - if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_SMC && |
---|
2015 | | - adev->ip_blocks[i].version->funcs->set_clockgating_state) { |
---|
2016 | | - r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, |
---|
2017 | | - AMD_CG_STATE_UNGATE); |
---|
2018 | | - if (r) { |
---|
2019 | | - DRM_ERROR("set_clockgating_state(ungate) of IP block <%s> failed %d\n", |
---|
2020 | | - adev->ip_blocks[i].version->funcs->name, r); |
---|
2021 | | - } |
---|
| 2700 | + /* PSP lost connection when err_event_athub occurs */ |
---|
| 2701 | + if (amdgpu_ras_intr_triggered() && |
---|
| 2702 | + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { |
---|
| 2703 | + adev->ip_blocks[i].status.hw = false; |
---|
| 2704 | + continue; |
---|
2022 | 2705 | } |
---|
2023 | 2706 | /* XXX handle errors */ |
---|
2024 | 2707 | r = adev->ip_blocks[i].version->funcs->suspend(adev); |
---|
.. | .. |
---|
2027 | 2710 | DRM_ERROR("suspend of IP block <%s> failed %d\n", |
---|
2028 | 2711 | adev->ip_blocks[i].version->funcs->name, r); |
---|
2029 | 2712 | } |
---|
| 2713 | + adev->ip_blocks[i].status.hw = false; |
---|
| 2714 | + /* handle putting the SMC in the appropriate state */ |
---|
| 2715 | + if (!amdgpu_sriov_vf(adev)) {
---|
| 2716 | + if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { |
---|
| 2717 | + r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); |
---|
| 2718 | + if (r) { |
---|
| 2719 | + DRM_ERROR("SMC failed to set mp1 state %d, %d\n", |
---|
| 2720 | + adev->mp1_state, r); |
---|
| 2721 | + return r; |
---|
| 2722 | + } |
---|
| 2723 | + } |
---|
| 2724 | + } |
---|
| 2725 | + adev->ip_blocks[i].status.hw = false; |
---|
2030 | 2726 | } |
---|
2031 | | - |
---|
2032 | | - if (amdgpu_sriov_vf(adev)) |
---|
2033 | | - amdgpu_virt_release_full_gpu(adev, false); |
---|
2034 | 2727 | |
---|
2035 | 2728 | return 0; |
---|
2036 | 2729 | } |
---|
.. | .. |
---|
2050 | 2743 | { |
---|
2051 | 2744 | int r; |
---|
2052 | 2745 | |
---|
| 2746 | + if (amdgpu_sriov_vf(adev)) |
---|
| 2747 | + amdgpu_virt_request_full_gpu(adev, false); |
---|
| 2748 | + |
---|
2053 | 2749 | r = amdgpu_device_ip_suspend_phase1(adev); |
---|
2054 | 2750 | if (r) |
---|
2055 | 2751 | return r; |
---|
2056 | 2752 | r = amdgpu_device_ip_suspend_phase2(adev); |
---|
| 2753 | + |
---|
| 2754 | + if (amdgpu_sriov_vf(adev)) |
---|
| 2755 | + amdgpu_virt_release_full_gpu(adev, false); |
---|
2057 | 2756 | |
---|
2058 | 2757 | return r; |
---|
2059 | 2758 | } |
---|
.. | .. |
---|
2073 | 2772 | int j; |
---|
2074 | 2773 | struct amdgpu_ip_block *block; |
---|
2075 | 2774 | |
---|
2076 | | - for (j = 0; j < adev->num_ip_blocks; j++) { |
---|
2077 | | - block = &adev->ip_blocks[j]; |
---|
| 2775 | + block = &adev->ip_blocks[i]; |
---|
| 2776 | + block->status.hw = false; |
---|
2078 | 2777 | |
---|
2079 | | - if (block->version->type != ip_order[i] || |
---|
| 2778 | + for (j = 0; j < ARRAY_SIZE(ip_order); j++) { |
---|
| 2779 | + |
---|
| 2780 | + if (block->version->type != ip_order[j] || |
---|
2080 | 2781 | !block->status.valid) |
---|
2081 | 2782 | continue; |
---|
2082 | 2783 | |
---|
2083 | 2784 | r = block->version->funcs->hw_init(adev); |
---|
2084 | | - DRM_INFO("RE-INIT: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); |
---|
| 2785 | + DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); |
---|
2085 | 2786 | if (r) |
---|
2086 | 2787 | return r; |
---|
| 2788 | + block->status.hw = true; |
---|
2087 | 2789 | } |
---|
2088 | 2790 | } |
---|
2089 | 2791 | |
---|
.. | .. |
---|
2100 | 2802 | AMD_IP_BLOCK_TYPE_GFX, |
---|
2101 | 2803 | AMD_IP_BLOCK_TYPE_SDMA, |
---|
2102 | 2804 | AMD_IP_BLOCK_TYPE_UVD, |
---|
2103 | | - AMD_IP_BLOCK_TYPE_VCE |
---|
| 2805 | + AMD_IP_BLOCK_TYPE_VCE, |
---|
| 2806 | + AMD_IP_BLOCK_TYPE_VCN |
---|
2104 | 2807 | }; |
---|
2105 | 2808 | |
---|
2106 | 2809 | for (i = 0; i < ARRAY_SIZE(ip_order); i++) { |
---|
.. | .. |
---|
2111 | 2814 | block = &adev->ip_blocks[j]; |
---|
2112 | 2815 | |
---|
2113 | 2816 | if (block->version->type != ip_order[i] || |
---|
2114 | | - !block->status.valid) |
---|
| 2817 | + !block->status.valid || |
---|
| 2818 | + block->status.hw) |
---|
2115 | 2819 | continue; |
---|
2116 | 2820 | |
---|
2117 | | - r = block->version->funcs->hw_init(adev); |
---|
2118 | | - DRM_INFO("RE-INIT: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); |
---|
| 2821 | + if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) |
---|
| 2822 | + r = block->version->funcs->resume(adev); |
---|
| 2823 | + else |
---|
| 2824 | + r = block->version->funcs->hw_init(adev); |
---|
| 2825 | + |
---|
| 2826 | + DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); |
---|
2119 | 2827 | if (r) |
---|
2120 | 2828 | return r; |
---|
| 2829 | + block->status.hw = true; |
---|
2121 | 2830 | } |
---|
2122 | 2831 | } |
---|
2123 | 2832 | |
---|
.. | .. |
---|
2141 | 2850 | int i, r; |
---|
2142 | 2851 | |
---|
2143 | 2852 | for (i = 0; i < adev->num_ip_blocks; i++) { |
---|
2144 | | - if (!adev->ip_blocks[i].status.valid) |
---|
| 2853 | + if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) |
---|
2145 | 2854 | continue; |
---|
2146 | 2855 | if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || |
---|
2147 | 2856 | adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || |
---|
2148 | 2857 | adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { |
---|
| 2858 | + |
---|
2149 | 2859 | r = adev->ip_blocks[i].version->funcs->resume(adev); |
---|
2150 | 2860 | if (r) { |
---|
2151 | 2861 | DRM_ERROR("resume of IP block <%s> failed %d\n", |
---|
2152 | 2862 | adev->ip_blocks[i].version->funcs->name, r); |
---|
2153 | 2863 | return r; |
---|
2154 | 2864 | } |
---|
| 2865 | + adev->ip_blocks[i].status.hw = true; |
---|
2155 | 2866 | } |
---|
2156 | 2867 | } |
---|
2157 | 2868 | |
---|
.. | .. |
---|
2176 | 2887 | int i, r; |
---|
2177 | 2888 | |
---|
2178 | 2889 | for (i = 0; i < adev->num_ip_blocks; i++) { |
---|
2179 | | - if (!adev->ip_blocks[i].status.valid) |
---|
| 2890 | + if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) |
---|
2180 | 2891 | continue; |
---|
2181 | 2892 | if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || |
---|
2182 | 2893 | adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || |
---|
2183 | | - adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) |
---|
| 2894 | + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || |
---|
| 2895 | + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) |
---|
2184 | 2896 | continue; |
---|
2185 | 2897 | r = adev->ip_blocks[i].version->funcs->resume(adev); |
---|
2186 | 2898 | if (r) { |
---|
.. | .. |
---|
2188 | 2900 | adev->ip_blocks[i].version->funcs->name, r); |
---|
2189 | 2901 | return r; |
---|
2190 | 2902 | } |
---|
| 2903 | + adev->ip_blocks[i].status.hw = true; |
---|
2191 | 2904 | } |
---|
2192 | 2905 | |
---|
2193 | 2906 | return 0; |
---|
.. | .. |
---|
2209 | 2922 | { |
---|
2210 | 2923 | int r; |
---|
2211 | 2924 | |
---|
| 2925 | + r = amdgpu_amdkfd_resume_iommu(adev); |
---|
| 2926 | + if (r) |
---|
| 2927 | + return r; |
---|
| 2928 | + |
---|
2212 | 2929 | r = amdgpu_device_ip_resume_phase1(adev); |
---|
2213 | 2930 | if (r) |
---|
2214 | 2931 | return r; |
---|
| 2932 | + |
---|
| 2933 | + r = amdgpu_device_fw_loading(adev); |
---|
| 2934 | + if (r) |
---|
| 2935 | + return r; |
---|
| 2936 | + |
---|
2215 | 2937 | r = amdgpu_device_ip_resume_phase2(adev); |
---|
2216 | 2938 | |
---|
2217 | 2939 | return r; |
---|
.. | .. |
---|
2252 | 2974 | { |
---|
2253 | 2975 | switch (asic_type) { |
---|
2254 | 2976 | #if defined(CONFIG_DRM_AMD_DC) |
---|
| 2977 | +#if defined(CONFIG_DRM_AMD_DC_SI) |
---|
| 2978 | + case CHIP_TAHITI: |
---|
| 2979 | + case CHIP_PITCAIRN: |
---|
| 2980 | + case CHIP_VERDE: |
---|
| 2981 | + case CHIP_OLAND: |
---|
| 2982 | +#endif |
---|
2255 | 2983 | case CHIP_BONAIRE: |
---|
2256 | 2984 | case CHIP_KAVERI: |
---|
2257 | 2985 | case CHIP_KABINI: |
---|
.. | .. |
---|
2276 | 3004 | case CHIP_VEGA10: |
---|
2277 | 3005 | case CHIP_VEGA12: |
---|
2278 | 3006 | case CHIP_VEGA20: |
---|
2279 | | -#if defined(CONFIG_DRM_AMD_DC_DCN1_0) |
---|
| 3007 | +#if defined(CONFIG_DRM_AMD_DC_DCN) |
---|
2280 | 3008 | case CHIP_RAVEN: |
---|
| 3009 | + case CHIP_NAVI10: |
---|
| 3010 | + case CHIP_NAVI14: |
---|
| 3011 | + case CHIP_NAVI12: |
---|
| 3012 | + case CHIP_RENOIR: |
---|
| 3013 | +#endif |
---|
| 3014 | +#if defined(CONFIG_DRM_AMD_DC_DCN3_0) |
---|
| 3015 | + case CHIP_SIENNA_CICHLID: |
---|
| 3016 | + case CHIP_NAVY_FLOUNDER: |
---|
2281 | 3017 | #endif |
---|
2282 | 3018 | return amdgpu_dc != 0; |
---|
2283 | 3019 | #endif |
---|
2284 | 3020 | default: |
---|
| 3021 | + if (amdgpu_dc > 0) |
---|
| 3022 | + DRM_INFO_ONCE("Display Core has been requested via kernel parameter " |
---|
| 3023 | + "but isn't supported by ASIC, ignoring\n"); |
---|
2285 | 3024 | return false; |
---|
2286 | 3025 | } |
---|
2287 | 3026 | } |
---|
.. | .. |
---|
2289 | 3028 | /** |
---|
2290 | 3029 | * amdgpu_device_has_dc_support - check if dc is supported |
---|
2291 | 3030 | * |
---|
2292 | | - * @adev: amdgpu_device_pointer |
---|
| 3031 | + * @adev: amdgpu_device pointer |
---|
2293 | 3032 | * |
---|
2294 | 3033 | * Returns true for supported, false for not supported |
---|
2295 | 3034 | */ |
---|
2296 | 3035 | bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) |
---|
2297 | 3036 | { |
---|
2298 | | - if (amdgpu_sriov_vf(adev)) |
---|
| 3037 | + if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display) |
---|
2299 | 3038 | return false; |
---|
2300 | 3039 | |
---|
2301 | 3040 | return amdgpu_device_asic_has_dc_support(adev->asic_type); |
---|
2302 | 3041 | } |
---|
2303 | 3042 | |
---|
| 3043 | + |
---|
| 3044 | +static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) |
---|
| 3045 | +{ |
---|
| 3046 | + struct amdgpu_device *adev = |
---|
| 3047 | + container_of(__work, struct amdgpu_device, xgmi_reset_work); |
---|
| 3048 | + struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); |
---|
| 3049 | + |
---|
| 3050 | + /* It's a bug to not have a hive within this function */ |
---|
| 3051 | + if (WARN_ON(!hive)) |
---|
| 3052 | + return; |
---|
| 3053 | + |
---|
| 3054 | + /* |
---|
| 3055 | + * Use task barrier to synchronize all xgmi reset works across the |
---|
| 3056 | + * hive. task_barrier_enter and task_barrier_exit will block |
---|
| 3057 | + * until all the threads running the xgmi reset works reach |
---|
| 3058 | + * those points. task_barrier_full will do both blocks. |
---|
| 3059 | + */ |
---|
| 3060 | + if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { |
---|
| 3061 | + |
---|
| 3062 | + task_barrier_enter(&hive->tb); |
---|
| 3063 | + adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); |
---|
| 3064 | + |
---|
| 3065 | + if (adev->asic_reset_res) |
---|
| 3066 | + goto fail; |
---|
| 3067 | + |
---|
| 3068 | + task_barrier_exit(&hive->tb); |
---|
| 3069 | + adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); |
---|
| 3070 | + |
---|
| 3071 | + if (adev->asic_reset_res) |
---|
| 3072 | + goto fail; |
---|
| 3073 | + |
---|
| 3074 | + if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count) |
---|
| 3075 | + adev->mmhub.funcs->reset_ras_error_count(adev); |
---|
| 3076 | + } else { |
---|
| 3077 | + |
---|
| 3078 | + task_barrier_full(&hive->tb); |
---|
| 3079 | + adev->asic_reset_res = amdgpu_asic_reset(adev); |
---|
| 3080 | + } |
---|
| 3081 | + |
---|
| 3082 | +fail: |
---|
| 3083 | + if (adev->asic_reset_res) |
---|
| 3084 | + DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", |
---|
| 3085 | + adev->asic_reset_res, adev_to_drm(adev)->unique); |
---|
| 3086 | + amdgpu_put_xgmi_hive(hive); |
---|
| 3087 | +} |
---|
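The comment inside the function above describes the contract of <drm/task_barrier.h>: threads registered on the hive's barrier block at task_barrier_enter()/task_barrier_exit() until all of them arrive, and task_barrier_full() does both in one call. A minimal sketch of a worker built on that contract (illustrative; it assumes each device's thread was registered with task_barrier_add_task()):

```c
#include <drm/task_barrier.h>

/* Illustrative worker: one per device in the hive. No thread passes
 * either barrier point until every registered thread has reached it,
 * which keeps the hive-wide enter/exit steps in lockstep.
 */
static void example_hive_reset_worker(struct task_barrier *tb)
{
	task_barrier_enter(tb);	/* wait for the whole hive */
	/* ... per-device reset-entry step (e.g. BACO enter) ... */

	task_barrier_exit(tb);	/* wait for the whole hive again */
	/* ... per-device reset-exit step (e.g. BACO exit) ... */
}
```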
| 3088 | + |
---|
| 3089 | +static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) |
---|
| 3090 | +{ |
---|
| 3091 | + char *input = amdgpu_lockup_timeout; |
---|
| 3092 | + char *timeout_setting = NULL; |
---|
| 3093 | + int index = 0; |
---|
| 3094 | + long timeout; |
---|
| 3095 | + int ret = 0; |
---|
| 3096 | + |
---|
| 3097 | + /* |
---|
| 3098 | + * By default the timeout for non-compute jobs is 10000 ms,
---|
| 3099 | + * and there is no timeout enforced on compute jobs.
---|
| 3100 | + * In SR-IOV or passthrough mode, the default timeout for
---|
| 3101 | + * compute jobs is 60000 ms.
---|
| 3102 | + */ |
---|
| 3103 | + adev->gfx_timeout = msecs_to_jiffies(10000); |
---|
| 3104 | + adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; |
---|
| 3105 | + if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) |
---|
| 3106 | + adev->compute_timeout = msecs_to_jiffies(60000); |
---|
| 3107 | + else |
---|
| 3108 | + adev->compute_timeout = MAX_SCHEDULE_TIMEOUT; |
---|
| 3109 | + |
---|
| 3110 | + if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { |
---|
| 3111 | + while ((timeout_setting = strsep(&input, ",")) && |
---|
| 3112 | + strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { |
---|
| 3113 | + ret = kstrtol(timeout_setting, 0, &timeout); |
---|
| 3114 | + if (ret) |
---|
| 3115 | + return ret; |
---|
| 3116 | + |
---|
| 3117 | + if (timeout == 0) { |
---|
| 3118 | + index++; |
---|
| 3119 | + continue; |
---|
| 3120 | + } else if (timeout < 0) { |
---|
| 3121 | + timeout = MAX_SCHEDULE_TIMEOUT; |
---|
| 3122 | + } else { |
---|
| 3123 | + timeout = msecs_to_jiffies(timeout); |
---|
| 3124 | + } |
---|
| 3125 | + |
---|
| 3126 | + switch (index++) { |
---|
| 3127 | + case 0: |
---|
| 3128 | + adev->gfx_timeout = timeout; |
---|
| 3129 | + break; |
---|
| 3130 | + case 1: |
---|
| 3131 | + adev->compute_timeout = timeout; |
---|
| 3132 | + break; |
---|
| 3133 | + case 2: |
---|
| 3134 | + adev->sdma_timeout = timeout; |
---|
| 3135 | + break; |
---|
| 3136 | + case 3: |
---|
| 3137 | + adev->video_timeout = timeout; |
---|
| 3138 | + break; |
---|
| 3139 | + default: |
---|
| 3140 | + break; |
---|
| 3141 | + } |
---|
| 3142 | + } |
---|
| 3143 | + /* |
---|
| 3144 | + * There is only one value specified and |
---|
| 3145 | + * it should apply to all non-compute jobs. |
---|
| 3146 | + */ |
---|
| 3147 | + if (index == 1) { |
---|
| 3148 | + adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; |
---|
| 3149 | + if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) |
---|
| 3150 | + adev->compute_timeout = adev->gfx_timeout; |
---|
| 3151 | + } |
---|
| 3152 | + } |
---|
| 3153 | + |
---|
| 3154 | + return ret; |
---|
| 3155 | +} |
---|
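The parser above maps up to four comma-separated values, in order, onto the gfx, compute, sdma and video timeouts; 0 keeps a slot's default, a negative value means no timeout, and a single value is applied to all non-compute jobs (and to compute as well under SR-IOV or passthrough). For example, amdgpu.lockup_timeout=10000,60000,0,-1 sets gfx to 10 s, compute to 60 s, keeps the sdma default and disables the video timeout. A small userspace re-implementation of the loop for experimenting with such strings (hypothetical demo, not driver code):

```c
/* Hypothetical userspace demo of the lockup_timeout parsing above. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	char buf[] = "10000,60000,0,-1";
	char *input = buf, *tok;
	/* defaults: gfx/sdma/video 10000 ms, compute unlimited (-1) */
	long slot[4] = { 10000, -1, 10000, 10000 };
	int index = 0;

	while ((tok = strsep(&input, ",")) && *tok && index < 4) {
		long timeout = strtol(tok, NULL, 0);

		if (timeout != 0)		/* 0 keeps the default */
			slot[index] = timeout;	/* negative: no timeout */
		index++;
	}

	printf("gfx=%ld compute=%ld sdma=%ld video=%ld\n",
	       slot[0], slot[1], slot[2], slot[3]);
	return 0;
}
```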
| 3156 | + |
---|
| 3157 | +static const struct attribute *amdgpu_dev_attributes[] = { |
---|
| 3158 | + &dev_attr_product_name.attr, |
---|
| 3159 | + &dev_attr_product_number.attr, |
---|
| 3160 | + &dev_attr_serial_number.attr, |
---|
| 3161 | + &dev_attr_pcie_replay_count.attr, |
---|
| 3162 | + NULL |
---|
| 3163 | +}; |
---|
| 3164 | + |
---|
| 3165 | + |
---|
2304 | 3166 | /** |
---|
2305 | 3167 | * amdgpu_device_init - initialize the driver |
---|
2306 | 3168 | * |
---|
2307 | 3169 | * @adev: amdgpu_device pointer |
---|
2308 | | - * @ddev: drm dev pointer |
---|
2309 | | - * @pdev: pci dev pointer |
---|
2310 | 3170 | * @flags: driver flags |
---|
2311 | 3171 | * |
---|
2312 | 3172 | * Initializes the driver info and hw (all asics). |
---|
.. | .. |
---|
2314 | 3174 | * Called at driver startup. |
---|
2315 | 3175 | */ |
---|
2316 | 3176 | int amdgpu_device_init(struct amdgpu_device *adev, |
---|
2317 | | - struct drm_device *ddev, |
---|
2318 | | - struct pci_dev *pdev, |
---|
2319 | 3177 | uint32_t flags) |
---|
2320 | 3178 | { |
---|
| 3179 | + struct drm_device *ddev = adev_to_drm(adev); |
---|
| 3180 | + struct pci_dev *pdev = adev->pdev; |
---|
2321 | 3181 | int r, i; |
---|
2322 | | - bool runtime = false; |
---|
| 3182 | + bool boco = false; |
---|
2323 | 3183 | u32 max_MBps; |
---|
2324 | 3184 | |
---|
2325 | 3185 | adev->shutdown = false; |
---|
2326 | | - adev->dev = &pdev->dev; |
---|
2327 | | - adev->ddev = ddev; |
---|
2328 | | - adev->pdev = pdev; |
---|
2329 | 3186 | adev->flags = flags; |
---|
2330 | | - adev->asic_type = flags & AMD_ASIC_MASK; |
---|
| 3187 | + |
---|
| 3188 | + if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) |
---|
| 3189 | + adev->asic_type = amdgpu_force_asic_type; |
---|
| 3190 | + else |
---|
| 3191 | + adev->asic_type = flags & AMD_ASIC_MASK; |
---|
| 3192 | + |
---|
2331 | 3193 | adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; |
---|
2332 | 3194 | if (amdgpu_emu_mode == 1) |
---|
2333 | | - adev->usec_timeout *= 2; |
---|
| 3195 | + adev->usec_timeout *= 10; |
---|
2334 | 3196 | adev->gmc.gart_size = 512 * 1024 * 1024; |
---|
2335 | 3197 | adev->accel_working = false; |
---|
2336 | 3198 | adev->num_rings = 0; |
---|
2337 | 3199 | adev->mman.buffer_funcs = NULL; |
---|
2338 | 3200 | adev->mman.buffer_funcs_ring = NULL; |
---|
2339 | 3201 | adev->vm_manager.vm_pte_funcs = NULL; |
---|
2340 | | - adev->vm_manager.vm_pte_num_rings = 0; |
---|
| 3202 | + adev->vm_manager.vm_pte_num_scheds = 0; |
---|
2341 | 3203 | adev->gmc.gmc_funcs = NULL; |
---|
2342 | 3204 | adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); |
---|
2343 | 3205 | bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); |
---|
.. | .. |
---|
2348 | 3210 | adev->pcie_wreg = &amdgpu_invalid_wreg; |
---|
2349 | 3211 | adev->pciep_rreg = &amdgpu_invalid_rreg; |
---|
2350 | 3212 | adev->pciep_wreg = &amdgpu_invalid_wreg; |
---|
| 3213 | + adev->pcie_rreg64 = &amdgpu_invalid_rreg64; |
---|
| 3214 | + adev->pcie_wreg64 = &amdgpu_invalid_wreg64; |
---|
2351 | 3215 | adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; |
---|
2352 | 3216 | adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; |
---|
2353 | 3217 | adev->didt_rreg = &amdgpu_invalid_rreg; |
---|
.. | .. |
---|
2369 | 3233 | mutex_init(&adev->gfx.gpu_clock_mutex); |
---|
2370 | 3234 | mutex_init(&adev->srbm_mutex); |
---|
2371 | 3235 | mutex_init(&adev->gfx.pipe_reserve_mutex); |
---|
| 3236 | + mutex_init(&adev->gfx.gfx_off_mutex); |
---|
2372 | 3237 | mutex_init(&adev->grbm_idx_mutex); |
---|
2373 | 3238 | mutex_init(&adev->mn_lock); |
---|
2374 | 3239 | mutex_init(&adev->virt.vf_errors.lock); |
---|
2375 | 3240 | hash_init(adev->mn_hash); |
---|
2376 | | - mutex_init(&adev->lock_reset); |
---|
| 3241 | + atomic_set(&adev->in_gpu_reset, 0); |
---|
| 3242 | + init_rwsem(&adev->reset_sem); |
---|
| 3243 | + mutex_init(&adev->psp.mutex); |
---|
| 3244 | + mutex_init(&adev->notifier_lock); |
---|
2377 | 3245 | |
---|
2378 | | - amdgpu_device_check_arguments(adev); |
---|
| 3246 | + r = amdgpu_device_check_arguments(adev); |
---|
| 3247 | + if (r) |
---|
| 3248 | + return r; |
---|
2379 | 3249 | |
---|
2380 | 3250 | spin_lock_init(&adev->mmio_idx_lock); |
---|
2381 | 3251 | spin_lock_init(&adev->smc_idx_lock); |
---|
.. | .. |
---|
2390 | 3260 | INIT_LIST_HEAD(&adev->shadow_list); |
---|
2391 | 3261 | mutex_init(&adev->shadow_list_lock); |
---|
2392 | 3262 | |
---|
2393 | | - INIT_LIST_HEAD(&adev->ring_lru_list); |
---|
2394 | | - spin_lock_init(&adev->ring_lru_list_lock); |
---|
| 3263 | + INIT_DELAYED_WORK(&adev->delayed_init_work, |
---|
| 3264 | + amdgpu_device_delayed_init_work_handler); |
---|
| 3265 | + INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, |
---|
| 3266 | + amdgpu_device_delay_enable_gfx_off); |
---|
2395 | 3267 | |
---|
2396 | | - INIT_DELAYED_WORK(&adev->late_init_work, |
---|
2397 | | - amdgpu_device_ip_late_init_func_handler); |
---|
| 3268 | + INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); |
---|
2398 | 3269 | |
---|
2399 | | - adev->pm.ac_power = power_supply_is_system_supplied() > 0 ? true : false; |
---|
| 3270 | + adev->gfx.gfx_off_req_count = 1; |
---|
| 3271 | + adev->pm.ac_power = power_supply_is_system_supplied() > 0; |
---|
| 3272 | + |
---|
| 3273 | + atomic_set(&adev->throttling_logging_enabled, 1); |
---|
| 3274 | + /* |
---|
| 3275 | + * If throttling continues, logging will be performed every minute |
---|
| 3276 | + * to avoid log flooding. "-1" is subtracted since the thermal |
---|
| 3277 | + * throttling interrupt comes every second. Thus, the total logging |
---|
| 3278 | + * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
---|
| 3279 | + * for throttling interrupt) = 60 seconds. |
---|
| 3280 | + */ |
---|
| 3281 | + ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); |
---|
| 3282 | + ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); |
---|
2400 | 3283 | |
---|
2401 | 3284 | /* Registers mapping */ |
---|
2402 | 3285 | /* TODO: block userspace mapping of io register */ |
---|
.. | .. |
---|
2415 | 3298 | DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); |
---|
2416 | 3299 | DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); |
---|
2417 | 3300 | |
---|
2418 | | - /* doorbell bar mapping */ |
---|
2419 | | - amdgpu_device_doorbell_init(adev); |
---|
2420 | | - |
---|
2421 | 3301 | /* io port mapping */ |
---|
2422 | 3302 | for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) { |
---|
2423 | 3303 | if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) { |
---|
.. | .. |
---|
2429 | 3309 | if (adev->rio_mem == NULL) |
---|
2430 | 3310 | DRM_INFO("PCI I/O BAR is not found.\n"); |
---|
2431 | 3311 | |
---|
| 3312 | + /* enable PCIE atomic ops */ |
---|
| 3313 | + r = pci_enable_atomic_ops_to_root(adev->pdev, |
---|
| 3314 | + PCI_EXP_DEVCAP2_ATOMIC_COMP32 | |
---|
| 3315 | + PCI_EXP_DEVCAP2_ATOMIC_COMP64); |
---|
| 3316 | + if (r) { |
---|
| 3317 | + adev->have_atomics_support = false; |
---|
| 3318 | + DRM_INFO("PCIE atomic ops is not supported\n"); |
---|
| 3319 | + } else { |
---|
| 3320 | + adev->have_atomics_support = true; |
---|
| 3321 | + } |
---|
| 3322 | + |
---|
2432 | 3323 | amdgpu_device_get_pcie_info(adev); |
---|
| 3324 | + |
---|
| 3325 | + if (amdgpu_mcbp) |
---|
| 3326 | + DRM_INFO("MCBP is enabled\n"); |
---|
| 3327 | + |
---|
| 3328 | + if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10) |
---|
| 3329 | + adev->enable_mes = true; |
---|
| 3330 | + |
---|
| 3331 | + /* detect hw virtualization here */ |
---|
| 3332 | + amdgpu_detect_virtualization(adev); |
---|
| 3333 | + |
---|
| 3334 | + r = amdgpu_device_get_job_timeout_settings(adev); |
---|
| 3335 | + if (r) { |
---|
| 3336 | + dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); |
---|
| 3337 | + return r; |
---|
| 3338 | + } |
---|
2433 | 3339 | |
---|
2434 | 3340 | /* early init functions */ |
---|
2435 | 3341 | r = amdgpu_device_ip_early_init(adev); |
---|
2436 | 3342 | if (r) |
---|
2437 | 3343 | return r; |
---|
2438 | 3344 | |
---|
| 3345 | + /* doorbell bar mapping and doorbell index init*/ |
---|
| 3346 | + amdgpu_device_doorbell_init(adev); |
---|
| 3347 | + |
---|
2439 | 3348 | /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ |
---|
2440 | 3349 | /* this will fail for cards that aren't VGA class devices, just |
---|
2441 | 3350 | * ignore it */ |
---|
2442 | 3351 | vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode); |
---|
2443 | 3352 | |
---|
2444 | | - if (amdgpu_device_is_px(ddev)) |
---|
2445 | | - runtime = true; |
---|
2446 | | - if (!pci_is_thunderbolt_attached(adev->pdev)) |
---|
| 3353 | + if (amdgpu_device_supports_boco(ddev)) |
---|
| 3354 | + boco = true; |
---|
| 3355 | + if (amdgpu_has_atpx() && |
---|
| 3356 | + (amdgpu_is_atpx_hybrid() || |
---|
| 3357 | + amdgpu_has_atpx_dgpu_power_cntl()) && |
---|
| 3358 | + !pci_is_thunderbolt_attached(adev->pdev)) |
---|
2447 | 3359 | vga_switcheroo_register_client(adev->pdev, |
---|
2448 | | - &amdgpu_switcheroo_ops, runtime); |
---|
2449 | | - if (runtime) |
---|
| 3360 | + &amdgpu_switcheroo_ops, boco); |
---|
| 3361 | + if (boco) |
---|
2450 | 3362 | vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); |
---|
2451 | 3363 | |
---|
2452 | 3364 | if (amdgpu_emu_mode == 1) { |
---|
.. | .. |
---|
2455 | 3367 | goto fence_driver_init; |
---|
2456 | 3368 | } |
---|
2457 | 3369 | |
---|
2458 | | - /* Read BIOS */ |
---|
2459 | | - if (!amdgpu_get_bios(adev)) { |
---|
2460 | | - r = -EINVAL; |
---|
2461 | | - goto failed; |
---|
2462 | | - } |
---|
2463 | | - |
---|
2464 | | - r = amdgpu_atombios_init(adev); |
---|
2465 | | - if (r) { |
---|
2466 | | - dev_err(adev->dev, "amdgpu_atombios_init failed\n"); |
---|
2467 | | - amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); |
---|
2468 | | - goto failed; |
---|
2469 | | - } |
---|
2470 | | - |
---|
2471 | 3370 | /* detect if we are with an SRIOV vbios */ |
---|
2472 | 3371 | amdgpu_device_detect_sriov_bios(adev); |
---|
| 3372 | + |
---|
| 3373 | + /* check if we need to reset the asic |
---|
| 3374 | + * E.g., driver was not cleanly unloaded previously, etc. |
---|
| 3375 | + */ |
---|
| 3376 | + if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { |
---|
| 3377 | + r = amdgpu_asic_reset(adev); |
---|
| 3378 | + if (r) { |
---|
| 3379 | + dev_err(adev->dev, "asic reset on init failed\n"); |
---|
| 3380 | + goto failed; |
---|
| 3381 | + } |
---|
| 3382 | + } |
---|
| 3383 | + |
---|
| 3384 | + pci_enable_pcie_error_reporting(adev->ddev.pdev); |
---|
2473 | 3385 | |
---|
2474 | 3386 | /* Post card if necessary */ |
---|
2475 | 3387 | if (amdgpu_device_need_post(adev)) { |
---|
.. | .. |
---|
2479 | 3391 | goto failed; |
---|
2480 | 3392 | } |
---|
2481 | 3393 | DRM_INFO("GPU posting now...\n"); |
---|
2482 | | - r = amdgpu_atom_asic_init(adev->mode_info.atom_context); |
---|
| 3394 | + r = amdgpu_device_asic_init(adev); |
---|
2483 | 3395 | if (r) { |
---|
2484 | 3396 | dev_err(adev->dev, "gpu post error!\n"); |
---|
2485 | 3397 | goto failed; |
---|
.. | .. |
---|
2517 | 3429 | } |
---|
2518 | 3430 | |
---|
2519 | 3431 | /* init the mode config */ |
---|
2520 | | - drm_mode_config_init(adev->ddev); |
---|
| 3432 | + drm_mode_config_init(adev_to_drm(adev)); |
---|
2521 | 3433 | |
---|
2522 | 3434 | r = amdgpu_device_ip_init(adev); |
---|
2523 | 3435 | if (r) { |
---|
.. | .. |
---|
2538 | 3450 | goto failed; |
---|
2539 | 3451 | } |
---|
2540 | 3452 | |
---|
| 3453 | + dev_info(adev->dev, |
---|
| 3454 | + "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", |
---|
| 3455 | + adev->gfx.config.max_shader_engines, |
---|
| 3456 | + adev->gfx.config.max_sh_per_se, |
---|
| 3457 | + adev->gfx.config.max_cu_per_sh, |
---|
| 3458 | + adev->gfx.cu_info.number); |
---|
| 3459 | + |
---|
2541 | 3460 | adev->accel_working = true; |
---|
2542 | 3461 | |
---|
2543 | 3462 | amdgpu_vm_check_compute_bug(adev); |
---|
.. | .. |
---|
2550 | 3469 | /* Get a log2 for easy divisions. */ |
---|
2551 | 3470 | adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); |
---|
2552 | 3471 | |
---|
2553 | | - r = amdgpu_ib_pool_init(adev); |
---|
2554 | | - if (r) { |
---|
2555 | | - dev_err(adev->dev, "IB initialization failed (%d).\n", r); |
---|
2556 | | - amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); |
---|
2557 | | - goto failed; |
---|
2558 | | - } |
---|
2559 | | - |
---|
2560 | 3472 | amdgpu_fbdev_init(adev); |
---|
2561 | 3473 | |
---|
2562 | 3474 | r = amdgpu_pm_sysfs_init(adev); |
---|
2563 | | - if (r) |
---|
| 3475 | + if (r) { |
---|
| 3476 | + adev->pm_sysfs_en = false; |
---|
2564 | 3477 | DRM_ERROR("registering pm debugfs failed (%d).\n", r); |
---|
| 3478 | + } else |
---|
| 3479 | + adev->pm_sysfs_en = true; |
---|
2565 | 3480 | |
---|
2566 | | - r = amdgpu_debugfs_gem_init(adev); |
---|
2567 | | - if (r) |
---|
2568 | | - DRM_ERROR("registering gem debugfs failed (%d).\n", r); |
---|
2569 | | - |
---|
2570 | | - r = amdgpu_debugfs_regs_init(adev); |
---|
2571 | | - if (r) |
---|
2572 | | - DRM_ERROR("registering register debugfs failed (%d).\n", r); |
---|
2573 | | - |
---|
2574 | | - r = amdgpu_debugfs_firmware_init(adev); |
---|
2575 | | - if (r) |
---|
2576 | | - DRM_ERROR("registering firmware debugfs failed (%d).\n", r); |
---|
2577 | | - |
---|
2578 | | - r = amdgpu_debugfs_init(adev); |
---|
2579 | | - if (r) |
---|
2580 | | - DRM_ERROR("Creating debugfs files failed (%d).\n", r); |
---|
| 3481 | + r = amdgpu_ucode_sysfs_init(adev); |
---|
| 3482 | + if (r) { |
---|
| 3483 | + adev->ucode_sysfs_en = false; |
---|
| 3484 | + DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); |
---|
| 3485 | + } else |
---|
| 3486 | + adev->ucode_sysfs_en = true; |
---|
2581 | 3487 | |
---|
2582 | 3488 | if ((amdgpu_testing & 1)) { |
---|
2583 | 3489 | if (adev->accel_working) |
---|
.. | .. |
---|
2592 | 3498 | DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n"); |
---|
2593 | 3499 | } |
---|
2594 | 3500 | |
---|
| 3501 | + /* |
---|
| 3502 | + * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. |
---|
| 3503 | + * Otherwise the mgpu fan boost feature will be skipped because the
---|
| 3504 | + * gpu instance count would still be too low.
---|
| 3505 | + */ |
---|
| 3506 | + amdgpu_register_gpu_instance(adev); |
---|
| 3507 | + |
---|
2595 | 3508 | /* enable clockgating, etc. after ib tests, etc. since some blocks require |
---|
2596 | 3509 | * explicit gating rather than handling it automatically. |
---|
2597 | 3510 | */ |
---|
.. | .. |
---|
2602 | 3515 | goto failed; |
---|
2603 | 3516 | } |
---|
2604 | 3517 | |
---|
| 3518 | + /* must succeed. */ |
---|
| 3519 | + amdgpu_ras_resume(adev); |
---|
| 3520 | + |
---|
| 3521 | + queue_delayed_work(system_wq, &adev->delayed_init_work, |
---|
| 3522 | + msecs_to_jiffies(AMDGPU_RESUME_MS)); |
---|
| 3523 | + |
---|
| 3524 | + if (amdgpu_sriov_vf(adev)) |
---|
| 3525 | + flush_delayed_work(&adev->delayed_init_work); |
---|
| 3526 | + |
---|
| 3527 | + r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); |
---|
| 3528 | + if (r) |
---|
| 3529 | + dev_err(adev->dev, "Could not create amdgpu device attr\n"); |
---|
| 3530 | + |
---|
| 3531 | + if (IS_ENABLED(CONFIG_PERF_EVENTS)) |
---|
| 3532 | + r = amdgpu_pmu_init(adev); |
---|
| 3533 | + if (r) |
---|
| 3534 | + dev_err(adev->dev, "amdgpu_pmu_init failed\n"); |
---|
| 3535 | + |
---|
| 3536 | + /* Have stored pci confspace at hand for restore in sudden PCI error */ |
---|
| 3537 | + if (amdgpu_device_cache_pci_state(adev->pdev)) |
---|
| 3538 | + pci_restore_state(pdev); |
---|
| 3539 | + |
---|
2605 | 3540 | return 0; |
---|
2606 | 3541 | |
---|
2607 | 3542 | failed: |
---|
2608 | 3543 | amdgpu_vf_error_trans_all(adev); |
---|
2609 | | - if (runtime) |
---|
| 3544 | + if (boco) |
---|
2610 | 3545 | vga_switcheroo_fini_domain_pm_ops(adev->dev); |
---|
2611 | 3546 | |
---|
2612 | 3547 | return r; |
---|
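amdgpu_device_cache_pci_state() presumably builds on the core PCI helpers for snapshotting config space so a sudden PCI error can replay it later (the freed adev->pci_state in the fini path below points the same way). A minimal sketch of that underlying pattern; the foo_* wrappers are assumptions, the pci_* calls are the real kernel API:

    #include <linux/pci.h>

    /* Snapshot config space once; 'saved' would live in the driver's
     * private struct, mirroring adev->pci_state.
     */
    static int foo_cache_pci_state(struct pci_dev *pdev,
                                   struct pci_saved_state **saved)
    {
            int r = pci_save_state(pdev);   /* latch current config space */

            if (r)
                    return r;
            kfree(*saved);
            *saved = pci_store_saved_state(pdev);   /* heap copy we own */
            return *saved ? 0 : -ENOMEM;
    }

    static void foo_replay_pci_state(struct pci_dev *pdev,
                                     struct pci_saved_state *saved)
    {
            if (!pci_load_saved_state(pdev, saved)) /* stage the snapshot */
                    pci_restore_state(pdev);        /* write it back to HW */
    }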
.. | .. |
---|
2622 | 3557 | */ |
---|
2623 | 3558 | void amdgpu_device_fini(struct amdgpu_device *adev) |
---|
2624 | 3559 | { |
---|
2625 | | - int r; |
---|
2626 | | - |
---|
2627 | | - DRM_INFO("amdgpu: finishing device.\n"); |
---|
| 3560 | + dev_info(adev->dev, "amdgpu: finishing device.\n"); |
---|
| 3561 | + flush_delayed_work(&adev->delayed_init_work); |
---|
| 3562 | + ttm_bo_lock_delayed_workqueue(&adev->mman.bdev); |
---|
2628 | 3563 | adev->shutdown = true; |
---|
| 3564 | + |
---|
| 3565 | + kfree(adev->pci_state); |
---|
| 3566 | + |
---|
| 3567 | + /* make sure the IB tests have finished before entering exclusive |
---|
| 3568 | + * mode, to avoid preemption during the IB tests |
---|
| 3569 | + */ |
---|
| 3570 | + if (amdgpu_sriov_vf(adev)) { |
---|
| 3571 | + amdgpu_virt_request_full_gpu(adev, false); |
---|
| 3572 | + amdgpu_virt_fini_data_exchange(adev); |
---|
| 3573 | + } |
---|
| 3574 | + |
---|
2629 | 3575 | /* disable all interrupts */ |
---|
2630 | 3576 | amdgpu_irq_disable_all(adev); |
---|
2631 | 3577 | if (adev->mode_info.mode_config_initialized){ |
---|
2632 | 3578 | if (!amdgpu_device_has_dc_support(adev)) |
---|
2633 | | - drm_crtc_force_disable_all(adev->ddev); |
---|
| 3579 | + drm_helper_force_disable_all(adev_to_drm(adev)); |
---|
2634 | 3580 | else |
---|
2635 | | - drm_atomic_helper_shutdown(adev->ddev); |
---|
| 3581 | + drm_atomic_helper_shutdown(adev_to_drm(adev)); |
---|
2636 | 3582 | } |
---|
2637 | | - amdgpu_ib_pool_fini(adev); |
---|
2638 | 3583 | amdgpu_fence_driver_fini(adev); |
---|
2639 | | - amdgpu_pm_sysfs_fini(adev); |
---|
| 3584 | + if (adev->pm_sysfs_en) |
---|
| 3585 | + amdgpu_pm_sysfs_fini(adev); |
---|
2640 | 3586 | amdgpu_fbdev_fini(adev); |
---|
2641 | | - r = amdgpu_device_ip_fini(adev); |
---|
2642 | | - if (adev->firmware.gpu_info_fw) { |
---|
2643 | | - release_firmware(adev->firmware.gpu_info_fw); |
---|
2644 | | - adev->firmware.gpu_info_fw = NULL; |
---|
2645 | | - } |
---|
| 3587 | + amdgpu_device_ip_fini(adev); |
---|
| 3588 | + release_firmware(adev->firmware.gpu_info_fw); |
---|
| 3589 | + adev->firmware.gpu_info_fw = NULL; |
---|
2646 | 3590 | adev->accel_working = false; |
---|
2647 | | - cancel_delayed_work_sync(&adev->late_init_work); |
---|
2648 | 3591 | /* free i2c buses */ |
---|
2649 | 3592 | if (!amdgpu_device_has_dc_support(adev)) |
---|
2650 | 3593 | amdgpu_i2c_fini(adev); |
---|
.. | .. |
---|
2654 | 3597 | |
---|
2655 | 3598 | kfree(adev->bios); |
---|
2656 | 3599 | adev->bios = NULL; |
---|
2657 | | - if (!pci_is_thunderbolt_attached(adev->pdev)) |
---|
| 3600 | + if (amdgpu_has_atpx() && |
---|
| 3601 | + (amdgpu_is_atpx_hybrid() || |
---|
| 3602 | + amdgpu_has_atpx_dgpu_power_cntl()) && |
---|
| 3603 | + !pci_is_thunderbolt_attached(adev->pdev)) |
---|
2658 | 3604 | vga_switcheroo_unregister_client(adev->pdev); |
---|
2659 | | - if (adev->flags & AMD_IS_PX) |
---|
| 3605 | + if (amdgpu_device_supports_boco(adev_to_drm(adev))) |
---|
2660 | 3606 | vga_switcheroo_fini_domain_pm_ops(adev->dev); |
---|
2661 | 3607 | vga_client_register(adev->pdev, NULL, NULL, NULL); |
---|
2662 | 3608 | if (adev->rio_mem) |
---|
.. | .. |
---|
2665 | 3611 | iounmap(adev->rmmio); |
---|
2666 | 3612 | adev->rmmio = NULL; |
---|
2667 | 3613 | amdgpu_device_doorbell_fini(adev); |
---|
2668 | | - amdgpu_debugfs_regs_cleanup(adev); |
---|
| 3614 | + |
---|
| 3615 | + if (adev->ucode_sysfs_en) |
---|
| 3616 | + amdgpu_ucode_sysfs_fini(adev); |
---|
| 3617 | + |
---|
| 3618 | + sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); |
---|
| 3619 | + if (IS_ENABLED(CONFIG_PERF_EVENTS)) |
---|
| 3620 | + amdgpu_pmu_fini(adev); |
---|
| 3621 | + if (adev->mman.discovery_bin) |
---|
| 3622 | + amdgpu_discovery_fini(adev); |
---|
2669 | 3623 | } |
---|
2670 | 3624 | |
---|
2671 | 3625 | |
---|
.. | .. |
---|
2676 | 3630 | * amdgpu_device_suspend - initiate device suspend |
---|
2677 | 3631 | * |
---|
2678 | 3632 | * @dev: drm dev pointer |
---|
2679 | | - * @suspend: suspend state |
---|
2680 | 3633 | * @fbcon : notify the fbdev of suspend |
---|
2681 | 3634 | * |
---|
2682 | 3635 | * Puts the hw in the suspend state (all asics). |
---|
2683 | 3636 | * Returns 0 for success or an error on failure. |
---|
2684 | 3637 | * Called at driver suspend. |
---|
2685 | 3638 | */ |
---|
2686 | | -int amdgpu_device_suspend(struct drm_device *dev, bool suspend, bool fbcon) |
---|
| 3639 | +int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) |
---|
2687 | 3640 | { |
---|
2688 | 3641 | struct amdgpu_device *adev; |
---|
2689 | 3642 | struct drm_crtc *crtc; |
---|
2690 | 3643 | struct drm_connector *connector; |
---|
| 3644 | + struct drm_connector_list_iter iter; |
---|
2691 | 3645 | int r; |
---|
2692 | 3646 | |
---|
2693 | | - if (dev == NULL || dev->dev_private == NULL) { |
---|
2694 | | - return -ENODEV; |
---|
2695 | | - } |
---|
2696 | | - |
---|
2697 | | - adev = dev->dev_private; |
---|
| 3647 | + adev = drm_to_adev(dev); |
---|
2698 | 3648 | |
---|
2699 | 3649 | if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) |
---|
2700 | 3650 | return 0; |
---|
2701 | 3651 | |
---|
| 3652 | + adev->in_suspend = true; |
---|
2702 | 3653 | drm_kms_helper_poll_disable(dev); |
---|
2703 | 3654 | |
---|
2704 | 3655 | if (fbcon) |
---|
2705 | 3656 | amdgpu_fbdev_set_suspend(adev, 1); |
---|
2706 | 3657 | |
---|
| 3658 | + cancel_delayed_work_sync(&adev->delayed_init_work); |
---|
| 3659 | + |
---|
2707 | 3660 | if (!amdgpu_device_has_dc_support(adev)) { |
---|
2708 | 3661 | /* turn off display hw */ |
---|
2709 | 3662 | drm_modeset_lock_all(dev); |
---|
2710 | | - list_for_each_entry(connector, &dev->mode_config.connector_list, head) { |
---|
2711 | | - drm_helper_connector_dpms(connector, DRM_MODE_DPMS_OFF); |
---|
2712 | | - } |
---|
| 3663 | + drm_connector_list_iter_begin(dev, &iter); |
---|
| 3664 | + drm_for_each_connector_iter(connector, &iter) |
---|
| 3665 | + drm_helper_connector_dpms(connector, |
---|
| 3666 | + DRM_MODE_DPMS_OFF); |
---|
| 3667 | + drm_connector_list_iter_end(&iter); |
---|
2713 | 3668 | drm_modeset_unlock_all(dev); |
---|
2714 | 3669 | /* unpin the front buffers and cursors */ |
---|
2715 | 3670 | list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { |
---|
.. | .. |
---|
2717 | 3672 | struct drm_framebuffer *fb = crtc->primary->fb; |
---|
2718 | 3673 | struct amdgpu_bo *robj; |
---|
2719 | 3674 | |
---|
2720 | | - if (amdgpu_crtc->cursor_bo) { |
---|
| 3675 | + if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) { |
---|
2721 | 3676 | struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo); |
---|
2722 | 3677 | r = amdgpu_bo_reserve(aobj, true); |
---|
2723 | 3678 | if (r == 0) { |
---|
.. | .. |
---|
2741 | 3696 | } |
---|
2742 | 3697 | } |
---|
2743 | 3698 | |
---|
2744 | | - amdgpu_amdkfd_suspend(adev); |
---|
| 3699 | + amdgpu_ras_suspend(adev); |
---|
2745 | 3700 | |
---|
2746 | 3701 | r = amdgpu_device_ip_suspend_phase1(adev); |
---|
| 3702 | + |
---|
| 3703 | + amdgpu_amdkfd_suspend(adev, !fbcon); |
---|
2747 | 3704 | |
---|
2748 | 3705 | /* evict vram memory */ |
---|
2749 | 3706 | amdgpu_bo_evict_vram(adev); |
---|
.. | .. |
---|
2758 | 3715 | */ |
---|
2759 | 3716 | amdgpu_bo_evict_vram(adev); |
---|
2760 | 3717 | |
---|
2761 | | - pci_save_state(dev->pdev); |
---|
2762 | | - if (suspend) { |
---|
2763 | | - /* Shut down the device */ |
---|
2764 | | - pci_disable_device(dev->pdev); |
---|
2765 | | - pci_set_power_state(dev->pdev, PCI_D3hot); |
---|
2766 | | - } else { |
---|
2767 | | - r = amdgpu_asic_reset(adev); |
---|
2768 | | - if (r) |
---|
2769 | | - DRM_ERROR("amdgpu asic reset failed\n"); |
---|
2770 | | - } |
---|
2771 | | - |
---|
2772 | 3718 | return 0; |
---|
2773 | 3719 | } |
---|
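The connector walk above switches from iterating mode_config.connector_list directly to the drm_connector_list_iter API, which takes its own references during the walk and so stays safe against connector hotplug. The pattern in isolation (foo_dpms_all is an illustrative name; the suspend path calls it with DRM_MODE_DPMS_OFF under drm_modeset_lock_all()):

    #include <drm/drm_connector.h>
    #include <drm/drm_crtc_helper.h>

    /* Apply a DPMS state to every connector, hotplug-safely. */
    static void foo_dpms_all(struct drm_device *dev, int mode)
    {
            struct drm_connector *connector;
            struct drm_connector_list_iter iter;

            drm_connector_list_iter_begin(dev, &iter);
            drm_for_each_connector_iter(connector, &iter)
                    drm_helper_connector_dpms(connector, mode);
            drm_connector_list_iter_end(&iter);
    }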
2774 | 3720 | |
---|
.. | .. |
---|
2776 | 3722 | * amdgpu_device_resume - initiate device resume |
---|
2777 | 3723 | * |
---|
2778 | 3724 | * @dev: drm dev pointer |
---|
2779 | | - * @resume: resume state |
---|
2780 | 3725 | * @fbcon : notify the fbdev of resume |
---|
2781 | 3726 | * |
---|
2782 | 3727 | * Bring the hw back to operating state (all asics). |
---|
2783 | 3728 | * Returns 0 for success or an error on failure. |
---|
2784 | 3729 | * Called at driver resume. |
---|
2785 | 3730 | */ |
---|
2786 | | -int amdgpu_device_resume(struct drm_device *dev, bool resume, bool fbcon) |
---|
| 3731 | +int amdgpu_device_resume(struct drm_device *dev, bool fbcon) |
---|
2787 | 3732 | { |
---|
2788 | 3733 | struct drm_connector *connector; |
---|
2789 | | - struct amdgpu_device *adev = dev->dev_private; |
---|
| 3734 | + struct drm_connector_list_iter iter; |
---|
| 3735 | + struct amdgpu_device *adev = drm_to_adev(dev); |
---|
2790 | 3736 | struct drm_crtc *crtc; |
---|
2791 | 3737 | int r = 0; |
---|
2792 | 3738 | |
---|
2793 | 3739 | if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) |
---|
2794 | 3740 | return 0; |
---|
2795 | 3741 | |
---|
2796 | | - if (resume) { |
---|
2797 | | - pci_set_power_state(dev->pdev, PCI_D0); |
---|
2798 | | - pci_restore_state(dev->pdev); |
---|
2799 | | - r = pci_enable_device(dev->pdev); |
---|
2800 | | - if (r) |
---|
2801 | | - return r; |
---|
2802 | | - } |
---|
2803 | | - |
---|
2804 | 3742 | /* post card */ |
---|
2805 | 3743 | if (amdgpu_device_need_post(adev)) { |
---|
2806 | | - r = amdgpu_atom_asic_init(adev->mode_info.atom_context); |
---|
| 3744 | + r = amdgpu_device_asic_init(adev); |
---|
2807 | 3745 | if (r) |
---|
2808 | | - DRM_ERROR("amdgpu asic init failed\n"); |
---|
| 3746 | + dev_err(adev->dev, "amdgpu asic init failed\n"); |
---|
2809 | 3747 | } |
---|
2810 | 3748 | |
---|
2811 | 3749 | r = amdgpu_device_ip_resume(adev); |
---|
2812 | 3750 | if (r) { |
---|
2813 | | - DRM_ERROR("amdgpu_device_ip_resume failed (%d).\n", r); |
---|
| 3751 | + dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); |
---|
2814 | 3752 | return r; |
---|
2815 | 3753 | } |
---|
2816 | 3754 | amdgpu_fence_driver_resume(adev); |
---|
.. | .. |
---|
2820 | 3758 | if (r) |
---|
2821 | 3759 | return r; |
---|
2822 | 3760 | |
---|
| 3761 | + queue_delayed_work(system_wq, &adev->delayed_init_work, |
---|
| 3762 | + msecs_to_jiffies(AMDGPU_RESUME_MS)); |
---|
| 3763 | + |
---|
2823 | 3764 | if (!amdgpu_device_has_dc_support(adev)) { |
---|
2824 | 3765 | /* pin cursors */ |
---|
2825 | 3766 | list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { |
---|
2826 | 3767 | struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc); |
---|
2827 | 3768 | |
---|
2828 | | - if (amdgpu_crtc->cursor_bo) { |
---|
| 3769 | + if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) { |
---|
2829 | 3770 | struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo); |
---|
2830 | 3771 | r = amdgpu_bo_reserve(aobj, true); |
---|
2831 | 3772 | if (r == 0) { |
---|
2832 | 3773 | r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM); |
---|
2833 | 3774 | if (r != 0) |
---|
2834 | | - DRM_ERROR("Failed to pin cursor BO (%d)\n", r); |
---|
| 3775 | + dev_err(adev->dev, "Failed to pin cursor BO (%d)\n", r); |
---|
2835 | 3776 | amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj); |
---|
2836 | 3777 | amdgpu_bo_unreserve(aobj); |
---|
2837 | 3778 | } |
---|
2838 | 3779 | } |
---|
2839 | 3780 | } |
---|
2840 | 3781 | } |
---|
2841 | | - r = amdgpu_amdkfd_resume(adev); |
---|
| 3782 | + r = amdgpu_amdkfd_resume(adev, !fbcon); |
---|
2842 | 3783 | if (r) |
---|
2843 | 3784 | return r; |
---|
2844 | 3785 | |
---|
2845 | 3786 | /* Make sure IB tests flushed */ |
---|
2846 | | - flush_delayed_work(&adev->late_init_work); |
---|
| 3787 | + flush_delayed_work(&adev->delayed_init_work); |
---|
2847 | 3788 | |
---|
2848 | 3789 | /* blat the mode back in */ |
---|
2849 | 3790 | if (fbcon) { |
---|
.. | .. |
---|
2853 | 3794 | |
---|
2854 | 3795 | /* turn on display hw */ |
---|
2855 | 3796 | drm_modeset_lock_all(dev); |
---|
2856 | | - list_for_each_entry(connector, &dev->mode_config.connector_list, head) { |
---|
2857 | | - drm_helper_connector_dpms(connector, DRM_MODE_DPMS_ON); |
---|
2858 | | - } |
---|
| 3797 | + |
---|
| 3798 | + drm_connector_list_iter_begin(dev, &iter); |
---|
| 3799 | + drm_for_each_connector_iter(connector, &iter) |
---|
| 3800 | + drm_helper_connector_dpms(connector, |
---|
| 3801 | + DRM_MODE_DPMS_ON); |
---|
| 3802 | + drm_connector_list_iter_end(&iter); |
---|
| 3803 | + |
---|
2859 | 3804 | drm_modeset_unlock_all(dev); |
---|
2860 | 3805 | } |
---|
2861 | 3806 | amdgpu_fbdev_set_suspend(adev, 0); |
---|
2862 | 3807 | } |
---|
2863 | 3808 | |
---|
2864 | 3809 | drm_kms_helper_poll_enable(dev); |
---|
| 3810 | + |
---|
| 3811 | + amdgpu_ras_resume(adev); |
---|
2865 | 3812 | |
---|
2866 | 3813 | /* |
---|
2867 | 3814 | * Most of the connector probing functions try to acquire runtime pm |
---|
.. | .. |
---|
2882 | 3829 | #ifdef CONFIG_PM |
---|
2883 | 3830 | dev->dev->power.disable_depth--; |
---|
2884 | 3831 | #endif |
---|
| 3832 | + adev->in_suspend = false; |
---|
| 3833 | + |
---|
2885 | 3834 | return 0; |
---|
2886 | 3835 | } |
---|
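Both the suspend and resume paths handle the cursor BO with the usual buffer-object discipline: reserve, pin or unpin, read anything you need while still reserved, then unreserve. Reduced to a sketch (foo_pin_cursor is illustrative; the amdgpu_bo_* calls are the ones used above):

    /* Pin a cursor BO and fetch its GPU address while reserved. */
    static int foo_pin_cursor(struct amdgpu_bo *aobj, u64 *gpu_addr)
    {
            int r = amdgpu_bo_reserve(aobj, true);

            if (r)
                    return r;
            r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM);
            if (!r)         /* the offset is only stable while pinned */
                    *gpu_addr = amdgpu_bo_gpu_offset(aobj);
            amdgpu_bo_unreserve(aobj);
            return r;
    }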
2887 | 3836 | |
---|
.. | .. |
---|
2913 | 3862 | adev->ip_blocks[i].status.hang = |
---|
2914 | 3863 | adev->ip_blocks[i].version->funcs->check_soft_reset(adev); |
---|
2915 | 3864 | if (adev->ip_blocks[i].status.hang) { |
---|
2916 | | - DRM_INFO("IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); |
---|
| 3865 | + dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); |
---|
2917 | 3866 | asic_hang = true; |
---|
2918 | 3867 | } |
---|
2919 | 3868 | } |
---|
.. | .. |
---|
2974 | 3923 | (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || |
---|
2975 | 3924 | adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { |
---|
2976 | 3925 | if (adev->ip_blocks[i].status.hang) { |
---|
2977 | | - DRM_INFO("Some block need full reset!\n"); |
---|
| 3926 | + dev_info(adev->dev, "Some block need full reset!\n"); |
---|
2978 | 3927 | return true; |
---|
2979 | 3928 | } |
---|
2980 | 3929 | } |
---|
.. | .. |
---|
3040 | 3989 | } |
---|
3041 | 3990 | |
---|
3042 | 3991 | /** |
---|
3043 | | - * amdgpu_device_recover_vram_from_shadow - restore shadowed VRAM buffers |
---|
3044 | | - * |
---|
3045 | | - * @adev: amdgpu_device pointer |
---|
3046 | | - * @ring: amdgpu_ring for the engine handling the buffer operations |
---|
3047 | | - * @bo: amdgpu_bo buffer whose shadow is being restored |
---|
3048 | | - * @fence: dma_fence associated with the operation |
---|
3049 | | - * |
---|
3050 | | - * Restores the VRAM buffer contents from the shadow in GTT. Used to |
---|
3051 | | - * restore things like GPUVM page tables after a GPU reset where |
---|
3052 | | - * the contents of VRAM might be lost. |
---|
3053 | | - * Returns 0 on success, negative error code on failure. |
---|
3054 | | - */ |
---|
3055 | | -static int amdgpu_device_recover_vram_from_shadow(struct amdgpu_device *adev, |
---|
3056 | | - struct amdgpu_ring *ring, |
---|
3057 | | - struct amdgpu_bo *bo, |
---|
3058 | | - struct dma_fence **fence) |
---|
3059 | | -{ |
---|
3060 | | - uint32_t domain; |
---|
3061 | | - int r; |
---|
3062 | | - |
---|
3063 | | - if (!bo->shadow) |
---|
3064 | | - return 0; |
---|
3065 | | - |
---|
3066 | | - r = amdgpu_bo_reserve(bo, true); |
---|
3067 | | - if (r) |
---|
3068 | | - return r; |
---|
3069 | | - domain = amdgpu_mem_type_to_domain(bo->tbo.mem.mem_type); |
---|
3070 | | - /* if bo has been evicted, then no need to recover */ |
---|
3071 | | - if (domain == AMDGPU_GEM_DOMAIN_VRAM) { |
---|
3072 | | - r = amdgpu_bo_validate(bo->shadow); |
---|
3073 | | - if (r) { |
---|
3074 | | - DRM_ERROR("bo validate failed!\n"); |
---|
3075 | | - goto err; |
---|
3076 | | - } |
---|
3077 | | - |
---|
3078 | | - r = amdgpu_bo_restore_from_shadow(adev, ring, bo, |
---|
3079 | | - NULL, fence, true); |
---|
3080 | | - if (r) { |
---|
3081 | | - DRM_ERROR("recover page table failed!\n"); |
---|
3082 | | - goto err; |
---|
3083 | | - } |
---|
3084 | | - } |
---|
3085 | | -err: |
---|
3086 | | - amdgpu_bo_unreserve(bo); |
---|
3087 | | - return r; |
---|
3088 | | -} |
---|
3089 | | - |
---|
3090 | | -/** |
---|
3091 | | - * amdgpu_device_handle_vram_lost - Handle the loss of VRAM contents |
---|
| 3992 | + * amdgpu_device_recover_vram - Recover some VRAM contents |
---|
3092 | 3993 | * |
---|
3093 | 3994 | * @adev: amdgpu_device pointer |
---|
3094 | 3995 | * |
---|
3095 | 3996 | * Restores the contents of VRAM buffers from the shadows in GTT. Used to |
---|
3096 | 3997 | * restore things like GPUVM page tables after a GPU reset where |
---|
3097 | 3998 | * the contents of VRAM might be lost. |
---|
3098 | | - * Returns 0 on success, 1 on failure. |
---|
| 3999 | + * |
---|
| 4000 | + * Returns: |
---|
| 4001 | + * 0 on success, negative error code on failure. |
---|
3099 | 4002 | */ |
---|
3100 | | -static int amdgpu_device_handle_vram_lost(struct amdgpu_device *adev) |
---|
| 4003 | +static int amdgpu_device_recover_vram(struct amdgpu_device *adev) |
---|
3101 | 4004 | { |
---|
3102 | | - struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring; |
---|
3103 | | - struct amdgpu_bo *bo, *tmp; |
---|
3104 | 4005 | struct dma_fence *fence = NULL, *next = NULL; |
---|
3105 | | - long r = 1; |
---|
3106 | | - int i = 0; |
---|
3107 | | - long tmo; |
---|
| 4006 | + struct amdgpu_bo *shadow; |
---|
| 4007 | + long r = 1, tmo; |
---|
3108 | 4008 | |
---|
3109 | 4009 | if (amdgpu_sriov_runtime(adev)) |
---|
3110 | 4010 | tmo = msecs_to_jiffies(8000); |
---|
3111 | 4011 | else |
---|
3112 | 4012 | tmo = msecs_to_jiffies(100); |
---|
3113 | 4013 | |
---|
3114 | | - DRM_INFO("recover vram bo from shadow start\n"); |
---|
| 4014 | + dev_info(adev->dev, "recover vram bo from shadow start\n"); |
---|
3115 | 4015 | mutex_lock(&adev->shadow_list_lock); |
---|
3116 | | - list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list) { |
---|
3117 | | - next = NULL; |
---|
3118 | | - amdgpu_device_recover_vram_from_shadow(adev, ring, bo, &next); |
---|
| 4016 | + list_for_each_entry(shadow, &adev->shadow_list, shadow_list) { |
---|
| 4017 | + |
---|
| 4018 | + /* No need to recover an evicted BO */ |
---|
| 4019 | + if (shadow->tbo.mem.mem_type != TTM_PL_TT || |
---|
| 4020 | + shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET || |
---|
| 4021 | + shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM) |
---|
| 4022 | + continue; |
---|
| 4023 | + |
---|
| 4024 | + r = amdgpu_bo_restore_shadow(shadow, &next); |
---|
| 4025 | + if (r) |
---|
| 4026 | + break; |
---|
| 4027 | + |
---|
3119 | 4028 | if (fence) { |
---|
3120 | | - r = dma_fence_wait_timeout(fence, false, tmo); |
---|
3121 | | - if (r == 0) |
---|
3122 | | - pr_err("wait fence %p[%d] timeout\n", fence, i); |
---|
3123 | | - else if (r < 0) |
---|
3124 | | - pr_err("wait fence %p[%d] interrupted\n", fence, i); |
---|
3125 | | - if (r < 1) { |
---|
3126 | | - dma_fence_put(fence); |
---|
3127 | | - fence = next; |
---|
| 4029 | + tmo = dma_fence_wait_timeout(fence, false, tmo); |
---|
| 4030 | + dma_fence_put(fence); |
---|
| 4031 | + fence = next; |
---|
| 4032 | + if (tmo == 0) { |
---|
| 4033 | + r = -ETIMEDOUT; |
---|
| 4034 | + break; |
---|
| 4035 | + } else if (tmo < 0) { |
---|
| 4036 | + r = tmo; |
---|
3128 | 4037 | break; |
---|
3129 | 4038 | } |
---|
3130 | | - i++; |
---|
| 4039 | + } else { |
---|
| 4040 | + fence = next; |
---|
3131 | 4041 | } |
---|
3132 | | - |
---|
3133 | | - dma_fence_put(fence); |
---|
3134 | | - fence = next; |
---|
3135 | 4042 | } |
---|
3136 | 4043 | mutex_unlock(&adev->shadow_list_lock); |
---|
3137 | 4044 | |
---|
3138 | | - if (fence) { |
---|
3139 | | - r = dma_fence_wait_timeout(fence, false, tmo); |
---|
3140 | | - if (r == 0) |
---|
3141 | | - pr_err("wait fence %p[%d] timeout\n", fence, i); |
---|
3142 | | - else if (r < 0) |
---|
3143 | | - pr_err("wait fence %p[%d] interrupted\n", fence, i); |
---|
3144 | | - |
---|
3145 | | - } |
---|
| 4045 | + if (fence) |
---|
| 4046 | + tmo = dma_fence_wait_timeout(fence, false, tmo); |
---|
3146 | 4047 | dma_fence_put(fence); |
---|
3147 | 4048 | |
---|
3148 | | - if (r > 0) |
---|
3149 | | - DRM_INFO("recover vram bo from shadow done\n"); |
---|
3150 | | - else |
---|
3151 | | - DRM_ERROR("recover vram bo from shadow failed\n"); |
---|
| 4049 | + if (r < 0 || tmo <= 0) { |
---|
| 4050 | + dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); |
---|
| 4051 | + return -EIO; |
---|
| 4052 | + } |
---|
3152 | 4053 | |
---|
3153 | | - return (r > 0) ? 0 : 1; |
---|
| 4054 | + dev_info(adev->dev, "recover vram bo from shadow done\n"); |
---|
| 4055 | + return 0; |
---|
3154 | 4056 | } |
---|
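The rewritten loop leans on the return convention of dma_fence_wait_timeout(): a positive value is the remaining timeout in jiffies, 0 means timed out, negative is an error. Feeding the return value back into tmo makes the whole shadow-list walk share a single time budget. The core of that idiom, as a sketch (foo_wait_chain is illustrative):

    #include <linux/dma-fence.h>

    /* Wait on a sequence of fences under one shared jiffies budget. */
    static long foo_wait_chain(struct dma_fence **fences, int n, long tmo)
    {
            int i;

            for (i = 0; i < n; i++) {
                    tmo = dma_fence_wait_timeout(fences[i], false, tmo);
                    if (tmo == 0)
                            return -ETIMEDOUT;      /* budget exhausted */
                    if (tmo < 0)
                            return tmo;             /* wait failed */
            }
            return 0;
    }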
3155 | 4057 | |
---|
3156 | | -/** |
---|
3157 | | - * amdgpu_device_reset - reset ASIC/GPU for bare-metal or passthrough |
---|
3158 | | - * |
---|
3159 | | - * @adev: amdgpu device pointer |
---|
3160 | | - * |
---|
3161 | | - * attempt to do soft-reset or full-reset and reinitialize Asic |
---|
3162 | | - * return 0 means succeeded otherwise failed |
---|
3163 | | - */ |
---|
3164 | | -static int amdgpu_device_reset(struct amdgpu_device *adev) |
---|
3165 | | -{ |
---|
3166 | | - bool need_full_reset, vram_lost = 0; |
---|
3167 | | - int r; |
---|
3168 | | - |
---|
3169 | | - need_full_reset = amdgpu_device_ip_need_full_reset(adev); |
---|
3170 | | - |
---|
3171 | | - if (!need_full_reset) { |
---|
3172 | | - amdgpu_device_ip_pre_soft_reset(adev); |
---|
3173 | | - r = amdgpu_device_ip_soft_reset(adev); |
---|
3174 | | - amdgpu_device_ip_post_soft_reset(adev); |
---|
3175 | | - if (r || amdgpu_device_ip_check_soft_reset(adev)) { |
---|
3176 | | - DRM_INFO("soft reset failed, will fallback to full reset!\n"); |
---|
3177 | | - need_full_reset = true; |
---|
3178 | | - } |
---|
3179 | | - } |
---|
3180 | | - |
---|
3181 | | - if (need_full_reset) { |
---|
3182 | | - r = amdgpu_device_ip_suspend(adev); |
---|
3183 | | - |
---|
3184 | | -retry: |
---|
3185 | | - r = amdgpu_asic_reset(adev); |
---|
3186 | | - /* post card */ |
---|
3187 | | - amdgpu_atom_asic_init(adev->mode_info.atom_context); |
---|
3188 | | - |
---|
3189 | | - if (!r) { |
---|
3190 | | - dev_info(adev->dev, "GPU reset succeeded, trying to resume\n"); |
---|
3191 | | - r = amdgpu_device_ip_resume_phase1(adev); |
---|
3192 | | - if (r) |
---|
3193 | | - goto out; |
---|
3194 | | - |
---|
3195 | | - vram_lost = amdgpu_device_check_vram_lost(adev); |
---|
3196 | | - if (vram_lost) { |
---|
3197 | | - DRM_ERROR("VRAM is lost!\n"); |
---|
3198 | | - atomic_inc(&adev->vram_lost_counter); |
---|
3199 | | - } |
---|
3200 | | - |
---|
3201 | | - r = amdgpu_gtt_mgr_recover( |
---|
3202 | | - &adev->mman.bdev.man[TTM_PL_TT]); |
---|
3203 | | - if (r) |
---|
3204 | | - goto out; |
---|
3205 | | - |
---|
3206 | | - r = amdgpu_device_ip_resume_phase2(adev); |
---|
3207 | | - if (r) |
---|
3208 | | - goto out; |
---|
3209 | | - |
---|
3210 | | - if (vram_lost) |
---|
3211 | | - amdgpu_device_fill_reset_magic(adev); |
---|
3212 | | - } |
---|
3213 | | - } |
---|
3214 | | - |
---|
3215 | | -out: |
---|
3216 | | - if (!r) { |
---|
3217 | | - amdgpu_irq_gpu_reset_resume_helper(adev); |
---|
3218 | | - r = amdgpu_ib_ring_tests(adev); |
---|
3219 | | - if (r) { |
---|
3220 | | - dev_err(adev->dev, "ib ring test failed (%d).\n", r); |
---|
3221 | | - r = amdgpu_device_ip_suspend(adev); |
---|
3222 | | - need_full_reset = true; |
---|
3223 | | - goto retry; |
---|
3224 | | - } |
---|
3225 | | - } |
---|
3226 | | - |
---|
3227 | | - if (!r && ((need_full_reset && !(adev->flags & AMD_IS_APU)) || vram_lost)) |
---|
3228 | | - r = amdgpu_device_handle_vram_lost(adev); |
---|
3229 | | - |
---|
3230 | | - return r; |
---|
3231 | | -} |
---|
3232 | 4058 | |
---|
3233 | 4059 | /** |
---|
3234 | 4060 | * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf |
---|
3235 | 4061 | * |
---|
3236 | | - * @adev: amdgpu device pointer |
---|
| 4062 | + * @adev: amdgpu_device pointer |
---|
3237 | 4063 | * @from_hypervisor: request from hypervisor |
---|
3238 | 4064 | * |
---|
3239 | 4065 | * do VF FLR and reinitialize Asic |
---|
.. | .. |
---|
3251 | 4077 | if (r) |
---|
3252 | 4078 | return r; |
---|
3253 | 4079 | |
---|
| 4080 | + amdgpu_amdkfd_pre_reset(adev); |
---|
| 4081 | + |
---|
3254 | 4082 | /* Resume IP prior to SMC */ |
---|
3255 | 4083 | r = amdgpu_device_ip_reinit_early_sriov(adev); |
---|
3256 | 4084 | if (r) |
---|
3257 | 4085 | goto error; |
---|
3258 | 4086 | |
---|
| 4087 | + amdgpu_virt_init_data_exchange(adev); |
---|
3259 | 4088 | /* we need recover gart prior to run SMC/CP/SDMA resume */ |
---|
3260 | | - amdgpu_gtt_mgr_recover(&adev->mman.bdev.man[TTM_PL_TT]); |
---|
| 4089 | + amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT)); |
---|
| 4090 | + |
---|
| 4091 | + r = amdgpu_device_fw_loading(adev); |
---|
| 4092 | + if (r) |
---|
| 4093 | + return r; |
---|
3261 | 4094 | |
---|
3262 | 4095 | /* now we are okay to resume SMC/CP/SDMA */ |
---|
3263 | 4096 | r = amdgpu_device_ip_reinit_late_sriov(adev); |
---|
.. | .. |
---|
3266 | 4099 | |
---|
3267 | 4100 | amdgpu_irq_gpu_reset_resume_helper(adev); |
---|
3268 | 4101 | r = amdgpu_ib_ring_tests(adev); |
---|
| 4102 | + amdgpu_amdkfd_post_reset(adev); |
---|
3269 | 4103 | |
---|
3270 | 4104 | error: |
---|
3271 | | - amdgpu_virt_init_data_exchange(adev); |
---|
3272 | 4105 | amdgpu_virt_release_full_gpu(adev, true); |
---|
3273 | 4106 | if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { |
---|
3274 | | - atomic_inc(&adev->vram_lost_counter); |
---|
3275 | | - r = amdgpu_device_handle_vram_lost(adev); |
---|
| 4107 | + amdgpu_inc_vram_lost(adev); |
---|
| 4108 | + r = amdgpu_device_recover_vram(adev); |
---|
3276 | 4109 | } |
---|
3277 | 4110 | |
---|
3278 | 4111 | return r; |
---|
3279 | 4112 | } |
---|
3280 | 4113 | |
---|
3281 | 4114 | /** |
---|
3282 | | - * amdgpu_device_gpu_recover - reset the asic and recover scheduler |
---|
| 4115 | + * amdgpu_device_has_job_running - check if there is any job in mirror list |
---|
3283 | 4116 | * |
---|
3284 | | - * @adev: amdgpu device pointer |
---|
3285 | | - * @job: which job trigger hang |
---|
3286 | | - * @force: forces reset regardless of amdgpu_gpu_recovery |
---|
| 4117 | + * @adev: amdgpu_device pointer |
---|
3287 | 4118 | * |
---|
3288 | | - * Attempt to reset the GPU if it has hung (all asics). |
---|
3289 | | - * Returns 0 for success or an error on failure. |
---|
| 4119 | + * Check if there is any job in the mirror list. |
---|
3290 | 4120 | */ |
---|
3291 | | -int amdgpu_device_gpu_recover(struct amdgpu_device *adev, |
---|
3292 | | - struct amdgpu_job *job, bool force) |
---|
| 4121 | +bool amdgpu_device_has_job_running(struct amdgpu_device *adev) |
---|
3293 | 4122 | { |
---|
3294 | | - int i, r, resched; |
---|
| 4123 | + int i; |
---|
| 4124 | + struct drm_sched_job *job; |
---|
3295 | 4125 | |
---|
3296 | | - if (!force && !amdgpu_device_ip_check_soft_reset(adev)) { |
---|
3297 | | - DRM_INFO("No hardware hang detected. Did some blocks stall?\n"); |
---|
3298 | | - return 0; |
---|
| 4126 | + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { |
---|
| 4127 | + struct amdgpu_ring *ring = adev->rings[i]; |
---|
| 4128 | + |
---|
| 4129 | + if (!ring || !ring->sched.thread) |
---|
| 4130 | + continue; |
---|
| 4131 | + |
---|
| 4132 | + spin_lock(&ring->sched.job_list_lock); |
---|
| 4133 | + job = list_first_entry_or_null(&ring->sched.ring_mirror_list, |
---|
| 4134 | + struct drm_sched_job, node); |
---|
| 4135 | + spin_unlock(&ring->sched.job_list_lock); |
---|
| 4136 | + if (job) |
---|
| 4137 | + return true; |
---|
| 4138 | + } |
---|
| 4139 | + return false; |
---|
| 4140 | +} |
---|
| 4141 | + |
---|
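amdgpu_device_has_job_running() only needs to know whether the mirror list is non-empty, so it peeks at the head under the scheduler's job_list_lock instead of traversing. The locking shape in isolation (foo_has_work is illustrative; note the answer can go stale the moment the lock drops):

    #include <linux/list.h>
    #include <linux/spinlock.h>

    /* Peek a lock-protected list: hold the lock only for the read. */
    static bool foo_has_work(struct list_head *head, spinlock_t *lock)
    {
            bool busy;

            spin_lock(lock);
            busy = !list_empty(head);   /* same as first_entry_or_null != NULL */
            spin_unlock(lock);
            return busy;                /* a snapshot, not a guarantee */
    }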
| 4142 | +/** |
---|
| 4143 | + * amdgpu_device_should_recover_gpu - check if we should try GPU recovery |
---|
| 4144 | + * |
---|
| 4145 | + * @adev: amdgpu_device pointer |
---|
| 4146 | + * |
---|
| 4147 | + * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover |
---|
| 4148 | + * a hung GPU. |
---|
| 4149 | + */ |
---|
| 4150 | +bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) |
---|
| 4151 | +{ |
---|
| 4152 | + if (!amdgpu_device_ip_check_soft_reset(adev)) { |
---|
| 4153 | + dev_info(adev->dev, "Timeout, but no hardware hang detected.\n"); |
---|
| 4154 | + return false; |
---|
3299 | 4155 | } |
---|
3300 | 4156 | |
---|
3301 | | - if (!force && (amdgpu_gpu_recovery == 0 || |
---|
3302 | | - (amdgpu_gpu_recovery == -1 && !amdgpu_sriov_vf(adev)))) { |
---|
3303 | | - DRM_INFO("GPU recovery disabled.\n"); |
---|
3304 | | - return 0; |
---|
| 4157 | + if (amdgpu_gpu_recovery == 0) |
---|
| 4158 | + goto disabled; |
---|
| 4159 | + |
---|
| 4160 | + if (amdgpu_sriov_vf(adev)) |
---|
| 4161 | + return true; |
---|
| 4162 | + |
---|
| 4163 | + if (amdgpu_gpu_recovery == -1) { |
---|
| 4164 | + switch (adev->asic_type) { |
---|
| 4165 | + case CHIP_BONAIRE: |
---|
| 4166 | + case CHIP_HAWAII: |
---|
| 4167 | + case CHIP_TOPAZ: |
---|
| 4168 | + case CHIP_TONGA: |
---|
| 4169 | + case CHIP_FIJI: |
---|
| 4170 | + case CHIP_POLARIS10: |
---|
| 4171 | + case CHIP_POLARIS11: |
---|
| 4172 | + case CHIP_POLARIS12: |
---|
| 4173 | + case CHIP_VEGAM: |
---|
| 4174 | + case CHIP_VEGA20: |
---|
| 4175 | + case CHIP_VEGA10: |
---|
| 4176 | + case CHIP_VEGA12: |
---|
| 4177 | + case CHIP_RAVEN: |
---|
| 4178 | + case CHIP_ARCTURUS: |
---|
| 4179 | + case CHIP_RENOIR: |
---|
| 4180 | + case CHIP_NAVI10: |
---|
| 4181 | + case CHIP_NAVI14: |
---|
| 4182 | + case CHIP_NAVI12: |
---|
| 4183 | + case CHIP_SIENNA_CICHLID: |
---|
| 4184 | + break; |
---|
| 4185 | + default: |
---|
| 4186 | + goto disabled; |
---|
| 4187 | + } |
---|
3305 | 4188 | } |
---|
3306 | 4189 | |
---|
3307 | | - dev_info(adev->dev, "GPU reset begin!\n"); |
---|
| 4190 | + return true; |
---|
3308 | 4191 | |
---|
3309 | | - mutex_lock(&adev->lock_reset); |
---|
3310 | | - atomic_inc(&adev->gpu_reset_counter); |
---|
3311 | | - adev->in_gpu_reset = 1; |
---|
| 4192 | +disabled: |
---|
| 4193 | + dev_info(adev->dev, "GPU recovery disabled.\n"); |
---|
| 4194 | + return false; |
---|
| 4195 | +} |
---|
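amdgpu_gpu_recovery is the usual tri-state module parameter: 0 forces recovery off, a positive value forces it on, and -1 (auto) defers to the per-ASIC allowlist in the switch above, with SR-IOV always opting in. The decision tree compressed into a sketch (foo_should_recover and its flags are illustrative):

    /* Tri-state policy knob: -1 auto, 0 off, >0 on. */
    static bool foo_should_recover(int knob, bool is_vf, bool asic_listed)
    {
            if (knob == 0)
                    return false;       /* explicitly disabled */
            if (is_vf || knob > 0)
                    return true;        /* SR-IOV or forced on */
            return asic_listed;         /* auto: per-ASIC allowlist */
    }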
3312 | 4196 | |
---|
3313 | | - /* Block kfd */ |
---|
3314 | | - amdgpu_amdkfd_pre_reset(adev); |
---|
3315 | 4197 | |
---|
3316 | | - /* block TTM */ |
---|
3317 | | - resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev); |
---|
| 4198 | +static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, |
---|
| 4199 | + struct amdgpu_job *job, |
---|
| 4200 | + bool *need_full_reset_arg) |
---|
| 4201 | +{ |
---|
| 4202 | + int i, r = 0; |
---|
| 4203 | + bool need_full_reset = *need_full_reset_arg; |
---|
| 4204 | + |
---|
| 4205 | + amdgpu_debugfs_wait_dump(adev); |
---|
| 4206 | + |
---|
| 4207 | + if (amdgpu_sriov_vf(adev)) { |
---|
| 4208 | + /* stop the data exchange thread */ |
---|
| 4209 | + amdgpu_virt_fini_data_exchange(adev); |
---|
| 4210 | + } |
---|
3318 | 4211 | |
---|
3319 | 4212 | /* block all schedulers and reset given job's ring */ |
---|
3320 | 4213 | for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { |
---|
.. | .. |
---|
3323 | 4216 | if (!ring || !ring->sched.thread) |
---|
3324 | 4217 | continue; |
---|
3325 | 4218 | |
---|
3326 | | - kthread_park(ring->sched.thread); |
---|
3327 | | - |
---|
3328 | | - if (job && job->base.sched == &ring->sched) |
---|
3329 | | - continue; |
---|
3330 | | - |
---|
3331 | | - drm_sched_hw_job_reset(&ring->sched, job ? &job->base : NULL); |
---|
3332 | | - |
---|
3333 | 4219 | /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ |
---|
3334 | 4220 | amdgpu_fence_driver_force_completion(ring); |
---|
3335 | 4221 | } |
---|
3336 | 4222 | |
---|
3337 | | - if (amdgpu_sriov_vf(adev)) |
---|
3338 | | - r = amdgpu_device_reset_sriov(adev, job ? false : true); |
---|
3339 | | - else |
---|
3340 | | - r = amdgpu_device_reset(adev); |
---|
| 4223 | + if (job) |
---|
| 4224 | + drm_sched_increase_karma(&job->base); |
---|
3341 | 4225 | |
---|
3342 | | - for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { |
---|
3343 | | - struct amdgpu_ring *ring = adev->rings[i]; |
---|
| 4226 | + /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ |
---|
| 4227 | + if (!amdgpu_sriov_vf(adev)) { |
---|
3344 | 4228 | |
---|
3345 | | - if (!ring || !ring->sched.thread) |
---|
3346 | | - continue; |
---|
| 4229 | + if (!need_full_reset) |
---|
| 4230 | + need_full_reset = amdgpu_device_ip_need_full_reset(adev); |
---|
3347 | 4231 | |
---|
3348 | | - /* only need recovery sched of the given job's ring |
---|
3349 | | - * or all rings (in the case @job is NULL) |
---|
3350 | | - * after above amdgpu_reset accomplished |
---|
3351 | | - */ |
---|
3352 | | - if ((!job || job->base.sched == &ring->sched) && !r) |
---|
3353 | | - drm_sched_job_recovery(&ring->sched); |
---|
| 4232 | + if (!need_full_reset) { |
---|
| 4233 | + amdgpu_device_ip_pre_soft_reset(adev); |
---|
| 4234 | + r = amdgpu_device_ip_soft_reset(adev); |
---|
| 4235 | + amdgpu_device_ip_post_soft_reset(adev); |
---|
| 4236 | + if (r || amdgpu_device_ip_check_soft_reset(adev)) { |
---|
| 4237 | + dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); |
---|
| 4238 | + need_full_reset = true; |
---|
| 4239 | + } |
---|
| 4240 | + } |
---|
3354 | 4241 | |
---|
3355 | | - kthread_unpark(ring->sched.thread); |
---|
| 4242 | + if (need_full_reset) |
---|
| 4243 | + r = amdgpu_device_ip_suspend(adev); |
---|
| 4244 | + |
---|
| 4245 | + *need_full_reset_arg = need_full_reset; |
---|
3356 | 4246 | } |
---|
3357 | 4247 | |
---|
3358 | | - if (!amdgpu_device_has_dc_support(adev)) { |
---|
3359 | | - drm_helper_resume_force_mode(adev->ddev); |
---|
| 4248 | + return r; |
---|
| 4249 | +} |
---|
| 4250 | + |
---|
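amdgpu_device_pre_asic_reset() keeps the escalation policy from the deleted amdgpu_device_reset(): attempt the cheap per-IP soft reset first and only escalate to a full ASIC reset when it fails or a block still reports a hang. Schematically, with hypothetical foo_* hooks standing in for the IP callbacks:

    /* Escalating reset: returns true when the soft reset sufficed. */
    static bool foo_try_soft_reset(struct foo_dev *fdev)
    {
            int r;

            foo_pre_soft_reset(fdev);       /* hypothetical IP hooks */
            r = foo_soft_reset(fdev);
            foo_post_soft_reset(fdev);

            /* any residual hang forces the full-reset fallback */
            return !r && !foo_check_soft_reset(fdev);
    }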
| 4251 | +static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, |
---|
| 4252 | + struct list_head *device_list_handle, |
---|
| 4253 | + bool *need_full_reset_arg, |
---|
| 4254 | + bool skip_hw_reset) |
---|
| 4255 | +{ |
---|
| 4256 | + struct amdgpu_device *tmp_adev = NULL; |
---|
| 4257 | + bool need_full_reset = *need_full_reset_arg, vram_lost = false; |
---|
| 4258 | + int r = 0; |
---|
| 4259 | + |
---|
| 4260 | + /* |
---|
| 4261 | + * ASIC reset has to be done on all XGMI hive nodes ASAP |
---|
| 4262 | + * to allow proper link negotiation in FW (within 1 sec) |
---|
| 4263 | + */ |
---|
| 4264 | + if (!skip_hw_reset && need_full_reset) { |
---|
| 4265 | + list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { |
---|
| 4266 | + /* For XGMI run all resets in parallel to speed up the process */ |
---|
| 4267 | + if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { |
---|
| 4268 | + if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) |
---|
| 4269 | + r = -EALREADY; |
---|
| 4270 | + } else |
---|
| 4271 | + r = amdgpu_asic_reset(tmp_adev); |
---|
| 4272 | + |
---|
| 4273 | + if (r) { |
---|
| 4274 | + dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", |
---|
| 4275 | + r, adev_to_drm(tmp_adev)->unique); |
---|
| 4276 | + break; |
---|
| 4277 | + } |
---|
| 4278 | + } |
---|
| 4279 | + |
---|
| 4280 | + /* For XGMI wait for all resets to complete before proceed */ |
---|
| 4281 | + if (!r) { |
---|
| 4282 | + list_for_each_entry(tmp_adev, device_list_handle, |
---|
| 4283 | + gmc.xgmi.head) { |
---|
| 4284 | + if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { |
---|
| 4285 | + flush_work(&tmp_adev->xgmi_reset_work); |
---|
| 4286 | + r = tmp_adev->asic_reset_res; |
---|
| 4287 | + if (r) |
---|
| 4288 | + break; |
---|
| 4289 | + } |
---|
| 4290 | + } |
---|
| 4291 | + } |
---|
3360 | 4292 | } |
---|
3361 | 4293 | |
---|
3362 | | - ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched); |
---|
| 4294 | + if (!r && amdgpu_ras_intr_triggered()) { |
---|
| 4295 | + list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { |
---|
| 4296 | + if (tmp_adev->mmhub.funcs && |
---|
| 4297 | + tmp_adev->mmhub.funcs->reset_ras_error_count) |
---|
| 4298 | + tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev); |
---|
| 4299 | + } |
---|
3363 | 4300 | |
---|
3364 | | - if (r) { |
---|
3365 | | - /* bad news, how to tell it to userspace ? */ |
---|
3366 | | - dev_info(adev->dev, "GPU reset(%d) failed\n", atomic_read(&adev->gpu_reset_counter)); |
---|
3367 | | - amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); |
---|
| 4301 | + amdgpu_ras_intr_cleared(); |
---|
| 4302 | + } |
---|
| 4303 | + |
---|
| 4304 | + list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { |
---|
| 4305 | + if (need_full_reset) { |
---|
| 4306 | + /* post card */ |
---|
| 4307 | + if (amdgpu_device_asic_init(tmp_adev)) |
---|
| 4308 | + dev_warn(tmp_adev->dev, "asic atom init failed!"); |
---|
| 4309 | + |
---|
| 4310 | + if (!r) { |
---|
| 4311 | + dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); |
---|
| 4312 | + r = amdgpu_amdkfd_resume_iommu(tmp_adev); |
---|
| 4313 | + if (r) |
---|
| 4314 | + goto out; |
---|
| 4315 | + |
---|
| 4316 | + r = amdgpu_device_ip_resume_phase1(tmp_adev); |
---|
| 4317 | + if (r) |
---|
| 4318 | + goto out; |
---|
| 4319 | + |
---|
| 4320 | + vram_lost = amdgpu_device_check_vram_lost(tmp_adev); |
---|
| 4321 | + if (vram_lost) { |
---|
| 4322 | + DRM_INFO("VRAM is lost due to GPU reset!\n"); |
---|
| 4323 | + amdgpu_inc_vram_lost(tmp_adev); |
---|
| 4324 | + } |
---|
| 4325 | + |
---|
| 4326 | + r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT)); |
---|
| 4327 | + if (r) |
---|
| 4328 | + goto out; |
---|
| 4329 | + |
---|
| 4330 | + r = amdgpu_device_fw_loading(tmp_adev); |
---|
| 4331 | + if (r) |
---|
| 4332 | + return r; |
---|
| 4333 | + |
---|
| 4334 | + r = amdgpu_device_ip_resume_phase2(tmp_adev); |
---|
| 4335 | + if (r) |
---|
| 4336 | + goto out; |
---|
| 4337 | + |
---|
| 4338 | + if (vram_lost) |
---|
| 4339 | + amdgpu_device_fill_reset_magic(tmp_adev); |
---|
| 4340 | + |
---|
| 4341 | + /* |
---|
| 4343 | + * Add this ASIC back as tracked, since its reset already |
---|
| 4344 | + * completed successfully. |
---|
| 4344 | + */ |
---|
| 4345 | + amdgpu_register_gpu_instance(tmp_adev); |
---|
| 4346 | + |
---|
| 4347 | + r = amdgpu_device_ip_late_init(tmp_adev); |
---|
| 4348 | + if (r) |
---|
| 4349 | + goto out; |
---|
| 4350 | + |
---|
| 4351 | + amdgpu_fbdev_set_suspend(tmp_adev, 0); |
---|
| 4352 | + |
---|
| 4353 | + /* |
---|
| 4354 | + * The GPU enters a bad state once the number of |
---|
| 4355 | + * faulty pages reported by ECC reaches the threshold, |
---|
| 4356 | + * and RAS recovery is scheduled next. So check here |
---|
| 4357 | + * whether the bad page threshold has indeed been |
---|
| 4358 | + * exceeded; if so, break off recovery and remind the |
---|
| 4359 | + * user to retire this GPU or to set a bigger |
---|
| 4360 | + * bad_page_threshold value before probing the |
---|
| 4361 | + * driver again. |
---|
| 4362 | + */ |
---|
| 4363 | + if (!amdgpu_ras_check_err_threshold(tmp_adev)) { |
---|
| 4364 | + /* must succeed. */ |
---|
| 4365 | + amdgpu_ras_resume(tmp_adev); |
---|
| 4366 | + } else { |
---|
| 4367 | + r = -EINVAL; |
---|
| 4368 | + goto out; |
---|
| 4369 | + } |
---|
| 4370 | + |
---|
| 4371 | + /* Update PSP FW topology after reset */ |
---|
| 4372 | + if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1) |
---|
| 4373 | + r = amdgpu_xgmi_update_topology(hive, tmp_adev); |
---|
| 4374 | + } |
---|
| 4375 | + } |
---|
| 4376 | + |
---|
| 4377 | +out: |
---|
| 4378 | + if (!r) { |
---|
| 4379 | + amdgpu_irq_gpu_reset_resume_helper(tmp_adev); |
---|
| 4380 | + r = amdgpu_ib_ring_tests(tmp_adev); |
---|
| 4381 | + if (r) { |
---|
| 4382 | + dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); |
---|
| 4383 | + need_full_reset = true; |
---|
| 4384 | + r = -EAGAIN; |
---|
| 4385 | + goto end; |
---|
| 4386 | + } |
---|
| 4387 | + } |
---|
| 4388 | + |
---|
| 4389 | + if (!r) |
---|
| 4390 | + r = amdgpu_device_recover_vram(tmp_adev); |
---|
| 4391 | + else |
---|
| 4392 | + tmp_adev->asic_reset_res = r; |
---|
| 4393 | + } |
---|
| 4394 | + |
---|
| 4395 | +end: |
---|
| 4396 | + *need_full_reset_arg = need_full_reset; |
---|
| 4397 | + return r; |
---|
| 4398 | +} |
---|
| 4399 | + |
---|
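The XGMI branch above fans the resets out with one pre-armed work item per node (xgmi_reset_work), then joins with flush_work() and collects per-device status. The fan-out/join shape in general form, as a sketch (foo_* names and fields are illustrative):

    #include <linux/workqueue.h>

    /* Fan out one pre-initialized work item per device, then join. */
    static int foo_reset_all(struct foo_dev **devs, int n)
    {
            int i, r = 0;

            for (i = 0; i < n; i++)
                    if (!queue_work(system_unbound_wq, &devs[i]->reset_work))
                            return -EALREADY;       /* already queued */

            for (i = 0; i < n; i++) {
                    flush_work(&devs[i]->reset_work);
                    r = devs[i]->reset_res;         /* set by the handler */
                    if (r)
                            break;
            }
            return r;
    }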
| 4400 | +static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, |
---|
| 4401 | + struct amdgpu_hive_info *hive) |
---|
| 4402 | +{ |
---|
| 4403 | + if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0) |
---|
| 4404 | + return false; |
---|
| 4405 | + |
---|
| 4406 | + if (hive) { |
---|
| 4407 | + down_write_nest_lock(&adev->reset_sem, &hive->hive_lock); |
---|
3368 | 4408 | } else { |
---|
3369 | | - dev_info(adev->dev, "GPU reset(%d) succeeded!\n",atomic_read(&adev->gpu_reset_counter)); |
---|
| 4409 | + down_write(&adev->reset_sem); |
---|
3370 | 4410 | } |
---|
3371 | 4411 | |
---|
3372 | | - /*unlock kfd */ |
---|
3373 | | - amdgpu_amdkfd_post_reset(adev); |
---|
| 4412 | + atomic_inc(&adev->gpu_reset_counter); |
---|
| 4413 | + switch (amdgpu_asic_reset_method(adev)) { |
---|
| 4414 | + case AMD_RESET_METHOD_MODE1: |
---|
| 4415 | + adev->mp1_state = PP_MP1_STATE_SHUTDOWN; |
---|
| 4416 | + break; |
---|
| 4417 | + case AMD_RESET_METHOD_MODE2: |
---|
| 4418 | + adev->mp1_state = PP_MP1_STATE_RESET; |
---|
| 4419 | + break; |
---|
| 4420 | + default: |
---|
| 4421 | + adev->mp1_state = PP_MP1_STATE_NONE; |
---|
| 4422 | + break; |
---|
| 4423 | + } |
---|
| 4424 | + |
---|
| 4425 | + return true; |
---|
| 4426 | +} |
---|
| 4427 | + |
---|
| 4428 | +static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) |
---|
| 4429 | +{ |
---|
3374 | 4430 | amdgpu_vf_error_trans_all(adev); |
---|
3375 | | - adev->in_gpu_reset = 0; |
---|
3376 | | - mutex_unlock(&adev->lock_reset); |
---|
| 4431 | + adev->mp1_state = PP_MP1_STATE_NONE; |
---|
| 4432 | + atomic_set(&adev->in_gpu_reset, 0); |
---|
| 4433 | + up_write(&adev->reset_sem); |
---|
| 4434 | +} |
---|
| 4435 | + |
---|
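Reset ownership moves from a mutex plus a plain int to an atomic flag claimed with cmpxchg, paired with the reset_sem rw-semaphore that readers elsewhere can hold across hardware access; a second would-be resetter now bails out immediately instead of queueing. The guard reduced to its shape (foo_* names are illustrative; the hive case uses down_write_nest_lock() instead):

    #include <linux/atomic.h>
    #include <linux/rwsem.h>

    /* Claim single-owner reset rights without blocking. */
    static bool foo_lock_reset(atomic_t *in_reset, struct rw_semaphore *sem)
    {
            if (atomic_cmpxchg(in_reset, 0, 1) != 0)
                    return false;   /* someone else is resetting */
            down_write(sem);        /* fence off readers of HW state */
            return true;
    }

    static void foo_unlock_reset(atomic_t *in_reset, struct rw_semaphore *sem)
    {
            atomic_set(in_reset, 0);
            up_write(sem);
    }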
| 4436 | +static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) |
---|
| 4437 | +{ |
---|
| 4438 | + struct pci_dev *p = NULL; |
---|
| 4439 | + |
---|
| 4440 | + p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), |
---|
| 4441 | + adev->pdev->bus->number, 1); |
---|
| 4442 | + if (p) { |
---|
| 4443 | + pm_runtime_enable(&(p->dev)); |
---|
| 4444 | + pm_runtime_resume(&(p->dev)); |
---|
| 4445 | + } |
---|
| 4446 | +} |
---|
| 4447 | + |
---|
| 4448 | +static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) |
---|
| 4449 | +{ |
---|
| 4450 | + enum amd_reset_method reset_method; |
---|
| 4451 | + struct pci_dev *p = NULL; |
---|
| 4452 | + u64 expires; |
---|
| 4453 | + |
---|
| 4454 | + /* |
---|
| 4455 | + * For now, only BACO and mode1 reset are confirmed |
---|
| 4456 | + * to suffer from the audio issue if not properly suspended. |
---|
| 4457 | + */ |
---|
| 4458 | + reset_method = amdgpu_asic_reset_method(adev); |
---|
| 4459 | + if ((reset_method != AMD_RESET_METHOD_BACO) && |
---|
| 4460 | + (reset_method != AMD_RESET_METHOD_MODE1)) |
---|
| 4461 | + return -EINVAL; |
---|
| 4462 | + |
---|
| 4463 | + p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), |
---|
| 4464 | + adev->pdev->bus->number, 1); |
---|
| 4465 | + if (!p) |
---|
| 4466 | + return -ENODEV; |
---|
| 4467 | + |
---|
| 4468 | + expires = pm_runtime_autosuspend_expiration(&(p->dev)); |
---|
| 4469 | + if (!expires) |
---|
| 4470 | + /* |
---|
| 4471 | + * If we cannot get the audio device autosuspend delay, |
---|
| 4472 | + * a fixed 4S interval will be used. Considering 3S is |
---|
| 4473 | + * the audio controller default autosuspend delay setting. |
---|
| 4474 | + * 4S used here is guaranteed to cover that. |
---|
| 4475 | + */ |
---|
| 4476 | + expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; |
---|
| 4477 | + |
---|
| 4478 | + while (!pm_runtime_status_suspended(&(p->dev))) { |
---|
| 4479 | + if (!pm_runtime_suspend(&(p->dev))) |
---|
| 4480 | + break; |
---|
| 4481 | + |
---|
| 4482 | + if (expires < ktime_get_mono_fast_ns()) { |
---|
| 4483 | + dev_warn(adev->dev, "failed to suspend display audio\n"); |
---|
| 4484 | + /* TODO: abort the succeeding gpu reset? */ |
---|
| 4485 | + return -ETIMEDOUT; |
---|
| 4486 | + } |
---|
| 4487 | + } |
---|
| 4488 | + |
---|
| 4489 | + pm_runtime_disable(&(p->dev)); |
---|
| 4490 | + |
---|
| 4491 | + return 0; |
---|
| 4492 | +} |
---|
| 4493 | + |
---|
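The suspend helper above is essentially a poll-until-suspended loop bounded by a deadline on the monotonic fast clock, derived from the codec's autosuspend expiration or the 4 s fallback. The loop on its own, using the same runtime-PM calls (foo_force_suspend is an illustrative name):

    #include <linux/ktime.h>
    #include <linux/pm_runtime.h>

    /* Push a device into runtime suspend before 'deadline' (ns). */
    static int foo_force_suspend(struct device *dev, u64 deadline)
    {
            while (!pm_runtime_status_suspended(dev)) {
                    if (!pm_runtime_suspend(dev))
                            break;          /* 0 means it went down */
                    if (ktime_get_mono_fast_ns() > deadline)
                            return -ETIMEDOUT;
            }
            pm_runtime_disable(dev);        /* hold it down for the reset */
            return 0;
    }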
| 4494 | +/** |
---|
| 4495 | + * amdgpu_device_gpu_recover - reset the asic and recover scheduler |
---|
| 4496 | + * |
---|
| 4497 | + * @adev: amdgpu_device pointer |
---|
| 4498 | + * @job: which job trigger hang |
---|
| 4499 | + * |
---|
| 4500 | + * Attempt to reset the GPU if it has hung (all asics). |
---|
| 4501 | + * Attempt to do soft-reset or full-reset and reinitialize Asic |
---|
| 4502 | + * Returns 0 for success or an error on failure. |
---|
| 4503 | + */ |
---|
| 4504 | + |
---|
| 4505 | +int amdgpu_device_gpu_recover(struct amdgpu_device *adev, |
---|
| 4506 | + struct amdgpu_job *job) |
---|
| 4507 | +{ |
---|
| 4508 | + struct list_head device_list, *device_list_handle = NULL; |
---|
| 4509 | + bool need_full_reset = false; |
---|
| 4510 | + bool job_signaled = false; |
---|
| 4511 | + struct amdgpu_hive_info *hive = NULL; |
---|
| 4512 | + struct amdgpu_device *tmp_adev = NULL; |
---|
| 4513 | + int i, r = 0; |
---|
| 4514 | + bool need_emergency_restart = false; |
---|
| 4515 | + bool audio_suspended = false; |
---|
| 4516 | + |
---|
| 4517 | + /* |
---|
| 4518 | + * Special case: RAS triggered and full reset isn't supported |
---|
| 4519 | + */ |
---|
| 4520 | + need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); |
---|
| 4521 | + |
---|
| 4522 | + /* |
---|
| 4523 | + * Flush RAM to disk so that after reboot |
---|
| 4524 | + * the user can read the log and see why the system rebooted. |
---|
| 4525 | + */ |
---|
| 4526 | + if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { |
---|
| 4527 | + DRM_WARN("Emergency reboot."); |
---|
| 4528 | + |
---|
| 4529 | + ksys_sync_helper(); |
---|
| 4530 | + emergency_restart(); |
---|
| 4531 | + } |
---|
| 4532 | + |
---|
| 4533 | + dev_info(adev->dev, "GPU %s begin!\n", |
---|
| 4534 | + need_emergency_restart ? "jobs stop" : "reset"); |
---|
| 4535 | + |
---|
| 4536 | + /* |
---|
| 4537 | + * Here we trylock to avoid a chain of resets executing, triggered |
---|
| 4538 | + * either by jobs on different adevs in an XGMI hive or by jobs on |
---|
| 4539 | + * different schedulers for the same device, while this TO handler runs. |
---|
| 4540 | + * We always reset all schedulers for a device and all devices in an |
---|
| 4541 | + * XGMI hive, so that should take care of them too. |
---|
| 4542 | + */ |
---|
| 4543 | + hive = amdgpu_get_xgmi_hive(adev); |
---|
| 4544 | + if (hive) { |
---|
| 4545 | + if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) { |
---|
| 4546 | + DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress", |
---|
| 4547 | + job ? job->base.id : -1, hive->hive_id); |
---|
| 4548 | + amdgpu_put_xgmi_hive(hive); |
---|
| 4549 | + return 0; |
---|
| 4550 | + } |
---|
| 4551 | + mutex_lock(&hive->hive_lock); |
---|
| 4552 | + } |
---|
| 4553 | + |
---|
| 4554 | + /* |
---|
| 4555 | + * Build list of devices to reset. |
---|
| 4556 | + * In case we are in XGMI hive mode, resort the device list |
---|
| 4557 | + * to put adev in the 1st position. |
---|
| 4558 | + */ |
---|
| 4559 | + INIT_LIST_HEAD(&device_list); |
---|
| 4560 | + if (adev->gmc.xgmi.num_physical_nodes > 1) { |
---|
| 4561 | + if (!hive) |
---|
| 4562 | + return -ENODEV; |
---|
| 4563 | + if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list)) |
---|
| 4564 | + list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list); |
---|
| 4565 | + device_list_handle = &hive->device_list; |
---|
| 4566 | + } else { |
---|
| 4567 | + list_add_tail(&adev->gmc.xgmi.head, &device_list); |
---|
| 4568 | + device_list_handle = &device_list; |
---|
| 4569 | + } |
---|
| 4570 | + |
---|
| 4571 | + /* block all schedulers and reset given job's ring */ |
---|
| 4572 | + list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { |
---|
| 4573 | + if (!amdgpu_device_lock_adev(tmp_adev, hive)) { |
---|
| 4574 | + dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress", |
---|
| 4575 | + job ? job->base.id : -1); |
---|
| 4576 | + r = 0; |
---|
| 4577 | + goto skip_recovery; |
---|
| 4578 | + } |
---|
| 4579 | + |
---|
| 4580 | + /* |
---|
| 4581 | + * Try to put the audio codec into suspend state |
---|
| 4582 | + * before gpu reset started. |
---|
| 4583 | + * |
---|
| 4584 | + * Due to the power domain of the graphics device |
---|
| 4585 | + * is shared with AZ power domain. Without this, |
---|
| 4586 | + * we may change the audio hardware from behind |
---|
| 4587 | + * the audio driver's back. That will trigger |
---|
| 4588 | + * some audio codec errors. |
---|
| 4589 | + */ |
---|
| 4590 | + if (!amdgpu_device_suspend_display_audio(tmp_adev)) |
---|
| 4591 | + audio_suspended = true; |
---|
| 4592 | + |
---|
| 4593 | + amdgpu_ras_set_error_query_ready(tmp_adev, false); |
---|
| 4594 | + |
---|
| 4595 | + cancel_delayed_work_sync(&tmp_adev->delayed_init_work); |
---|
| 4596 | + |
---|
| 4597 | + if (!amdgpu_sriov_vf(tmp_adev)) |
---|
| 4598 | + amdgpu_amdkfd_pre_reset(tmp_adev); |
---|
| 4599 | + |
---|
| 4600 | + /* |
---|
| 4601 | + * Mark these ASICs to be reset as untracked first, |
---|
| 4602 | + * and add them back after the reset completes |
---|
| 4603 | + */ |
---|
| 4604 | + amdgpu_unregister_gpu_instance(tmp_adev); |
---|
| 4605 | + |
---|
| 4606 | + amdgpu_fbdev_set_suspend(tmp_adev, 1); |
---|
| 4607 | + |
---|
| 4608 | + /* disable ras on ALL IPs */ |
---|
| 4609 | + if (!need_emergency_restart && |
---|
| 4610 | + amdgpu_device_ip_need_full_reset(tmp_adev)) |
---|
| 4611 | + amdgpu_ras_suspend(tmp_adev); |
---|
| 4612 | + |
---|
| 4613 | + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { |
---|
| 4614 | + struct amdgpu_ring *ring = tmp_adev->rings[i]; |
---|
| 4615 | + |
---|
| 4616 | + if (!ring || !ring->sched.thread) |
---|
| 4617 | + continue; |
---|
| 4618 | + |
---|
| 4619 | + drm_sched_stop(&ring->sched, job ? &job->base : NULL); |
---|
| 4620 | + |
---|
| 4621 | + if (need_emergency_restart) |
---|
| 4622 | + amdgpu_job_stop_all_jobs_on_sched(&ring->sched); |
---|
| 4623 | + } |
---|
| 4624 | + } |
---|
| 4625 | + |
---|
| 4626 | + if (need_emergency_restart) |
---|
| 4627 | + goto skip_sched_resume; |
---|
| 4628 | + |
---|
| 4629 | + /* |
---|
| 4630 | + * Must check guilty signal here since after this point all old |
---|
| 4631 | + * HW fences are force signaled. |
---|
| 4632 | + * |
---|
| 4633 | + * job->base holds a reference to parent fence |
---|
| 4634 | + */ |
---|
| 4635 | + if (job && job->base.s_fence->parent && |
---|
| 4636 | + dma_fence_is_signaled(job->base.s_fence->parent)) { |
---|
| 4637 | + job_signaled = true; |
---|
| 4638 | + dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); |
---|
| 4639 | + goto skip_hw_reset; |
---|
| 4640 | + } |
---|
| 4641 | + |
---|
| 4642 | +retry: /* Rest of adevs pre asic reset from XGMI hive. */ |
---|
| 4643 | + list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { |
---|
| 4644 | + r = amdgpu_device_pre_asic_reset(tmp_adev, |
---|
| 4645 | + (tmp_adev == adev) ? job : NULL, |
---|
| 4646 | + &need_full_reset); |
---|
| 4647 | + /* TODO: should we stop? */ |
---|
| 4648 | + if (r) { |
---|
| 4649 | + dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", |
---|
| 4650 | + r, adev_to_drm(tmp_adev)->unique); |
---|
| 4651 | + tmp_adev->asic_reset_res = r; |
---|
| 4652 | + } |
---|
| 4653 | + } |
---|
| 4654 | + |
---|
| 4655 | + /* Actual ASIC resets if needed.*/ |
---|
| 4656 | + /* TODO Implement XGMI hive reset logic for SRIOV */ |
---|
| 4657 | + if (amdgpu_sriov_vf(adev)) { |
---|
| 4658 | + r = amdgpu_device_reset_sriov(adev, job ? false : true); |
---|
| 4659 | + if (r) |
---|
| 4660 | + adev->asic_reset_res = r; |
---|
| 4661 | + } else { |
---|
| 4662 | + r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false); |
---|
| 4663 | + if (r && r == -EAGAIN) |
---|
| 4664 | + goto retry; |
---|
| 4665 | + } |
---|
| 4666 | + |
---|
| 4667 | +skip_hw_reset: |
---|
| 4668 | + |
---|
| 4669 | + /* Post ASIC reset for all devs. */ |
---|
| 4670 | + list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { |
---|
| 4671 | + |
---|
| 4672 | + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { |
---|
| 4673 | + struct amdgpu_ring *ring = tmp_adev->rings[i]; |
---|
| 4674 | + |
---|
| 4675 | + if (!ring || !ring->sched.thread) |
---|
| 4676 | + continue; |
---|
| 4677 | + |
---|
| 4678 | + /* No point in resubmitting jobs if we didn't HW reset */ |
---|
| 4679 | + if (!tmp_adev->asic_reset_res && !job_signaled) |
---|
| 4680 | + drm_sched_resubmit_jobs(&ring->sched); |
---|
| 4681 | + |
---|
| 4682 | + drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res); |
---|
| 4683 | + } |
---|
| 4684 | + |
---|
| 4685 | + if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) { |
---|
| 4686 | + drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); |
---|
| 4687 | + } |
---|
| 4688 | + |
---|
| 4689 | + tmp_adev->asic_reset_res = 0; |
---|
| 4690 | + |
---|
| 4691 | + if (r) { |
---|
| 4692 | + /* bad news, how to tell it to userspace ? */ |
---|
| 4693 | + dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); |
---|
| 4694 | + amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); |
---|
| 4695 | + } else { |
---|
| 4696 | + dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); |
---|
| 4697 | + } |
---|
| 4698 | + } |
---|
| 4699 | + |
---|
| 4700 | +skip_sched_resume: |
---|
| 4701 | + list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { |
---|
| 4702 | + /* unlock kfd: SRIOV would do it separately */ |
---|
| 4703 | + if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) |
---|
| 4704 | + amdgpu_amdkfd_post_reset(tmp_adev); |
---|
| 4705 | + if (audio_suspended) |
---|
| 4706 | + amdgpu_device_resume_display_audio(tmp_adev); |
---|
| 4707 | + amdgpu_device_unlock_adev(tmp_adev); |
---|
| 4708 | + } |
---|
| 4709 | + |
---|
| 4710 | +skip_recovery: |
---|
| 4711 | + if (hive) { |
---|
| 4712 | + atomic_set(&hive->in_reset, 0); |
---|
| 4713 | + mutex_unlock(&hive->hive_lock); |
---|
| 4714 | + amdgpu_put_xgmi_hive(hive); |
---|
| 4715 | + } |
---|
| 4716 | + |
---|
| 4717 | + if (r) |
---|
| 4718 | + dev_info(adev->dev, "GPU reset end with ret = %d\n", r); |
---|
3377 | 4719 | return r; |
---|
3378 | 4720 | } |
---|
3379 | 4721 | |
---|
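On bare metal the recovery path loops: a failed IB ring test inside amdgpu_do_asic_reset() sets need_full_reset and returns -EAGAIN, which sends control back through the pre-reset stage with the escalated flag. The skeleton of that control flow, as a sketch (the foo_* stages are illustrative):

    /* Retry skeleton: a failed post-reset test escalates and reruns. */
    static int foo_recover(struct foo_dev *fdev)
    {
            bool full_reset = false;
            int r;

    retry:
            r = foo_pre_reset(fdev, &full_reset);
            if (!r)
                    r = foo_do_reset(fdev, &full_reset);
            if (r == -EAGAIN)       /* escalated; run another pass */
                    goto retry;
            return r;
    }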
.. | .. |
---|
3389 | 4731 | static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) |
---|
3390 | 4732 | { |
---|
3391 | 4733 | struct pci_dev *pdev; |
---|
3392 | | - enum pci_bus_speed speed_cap; |
---|
3393 | | - enum pcie_link_width link_width; |
---|
| 4734 | + enum pci_bus_speed speed_cap, platform_speed_cap; |
---|
| 4735 | + enum pcie_link_width platform_link_width; |
---|
3394 | 4736 | |
---|
3395 | 4737 | if (amdgpu_pcie_gen_cap) |
---|
3396 | 4738 | adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; |
---|
.. | .. |
---|
3406 | 4748 | adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; |
---|
3407 | 4749 | return; |
---|
3408 | 4750 | } |
---|
| 4751 | + |
---|
| 4752 | + if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) |
---|
| 4753 | + return; |
---|
| 4754 | + |
---|
| 4755 | + pcie_bandwidth_available(adev->pdev, NULL, |
---|
| 4756 | + &platform_speed_cap, &platform_link_width); |
---|
3409 | 4757 | |
---|
3410 | 4758 | if (adev->pm.pcie_gen_mask == 0) { |
---|
3411 | 4759 | /* asic caps */ |
---|
.. | .. |
---|
3432 | 4780 | adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; |
---|
3433 | 4781 | } |
---|
3434 | 4782 | /* platform caps */ |
---|
3435 | | - pdev = adev->ddev->pdev->bus->self; |
---|
3436 | | - speed_cap = pcie_get_speed_cap(pdev); |
---|
3437 | | - if (speed_cap == PCI_SPEED_UNKNOWN) { |
---|
| 4783 | + if (platform_speed_cap == PCI_SPEED_UNKNOWN) { |
---|
3438 | 4784 | adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | |
---|
3439 | 4785 | CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); |
---|
3440 | 4786 | } else { |
---|
3441 | | - if (speed_cap == PCIE_SPEED_16_0GT) |
---|
| 4787 | + if (platform_speed_cap == PCIE_SPEED_16_0GT) |
---|
3442 | 4788 | adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | |
---|
3443 | 4789 | CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | |
---|
3444 | 4790 | CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | |
---|
3445 | 4791 | CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); |
---|
3446 | | - else if (speed_cap == PCIE_SPEED_8_0GT) |
---|
| 4792 | + else if (platform_speed_cap == PCIE_SPEED_8_0GT) |
---|
3447 | 4793 | adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | |
---|
3448 | 4794 | CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | |
---|
3449 | 4795 | CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); |
---|
3450 | | - else if (speed_cap == PCIE_SPEED_5_0GT) |
---|
| 4796 | + else if (platform_speed_cap == PCIE_SPEED_5_0GT) |
---|
3451 | 4797 | adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | |
---|
3452 | 4798 | CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); |
---|
3453 | 4799 | else |
---|
.. | .. |
---|
3456 | 4802 | } |
---|
3457 | 4803 | } |
---|
3458 | 4804 | if (adev->pm.pcie_mlw_mask == 0) { |
---|
3459 | | - pdev = adev->ddev->pdev->bus->self; |
---|
3460 | | - link_width = pcie_get_width_cap(pdev); |
---|
3461 | | - if (link_width == PCIE_LNK_WIDTH_UNKNOWN) { |
---|
| 4805 | + if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { |
---|
3462 | 4806 | adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; |
---|
3463 | 4807 | } else { |
---|
3464 | | - switch (link_width) { |
---|
| 4808 | + switch (platform_link_width) { |
---|
3465 | 4809 | case PCIE_LNK_X32: |
---|
3466 | 4810 | adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | |
---|
3467 | 4811 | CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | |
---|
.. | .. |
---|
3511 | 4855 | } |
---|
3512 | 4856 | } |
---|
3513 | 4857 | |
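The CAIL_* bitmasks assembled above are consumed elsewhere by the power-management code when choosing a link configuration. A minimal sketch of such a consumer, assuming only the mask bits populated by this function (the helper name amdgpu_max_pcie_gen() is hypothetical):

	/* Sketch: highest PCIe generation advertised in pcie_gen_mask. */
	static unsigned int amdgpu_max_pcie_gen(struct amdgpu_device *adev)
	{
		if (adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4)
			return 4;
		if (adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3)
			return 3;
		if (adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2)
			return 2;
		return 1;
	}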
+int amdgpu_device_baco_enter(struct drm_device *dev)
+{
+	struct amdgpu_device *adev = drm_to_adev(dev);
+	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+	if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
+		return -ENOTSUPP;
+
+	if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt)
+		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
+
+	return amdgpu_dpm_baco_enter(adev);
+}
+
+int amdgpu_device_baco_exit(struct drm_device *dev)
+{
+	struct amdgpu_device *adev = drm_to_adev(dev);
+	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+	int ret = 0;
+
+	if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
+		return -ENOTSUPP;
+
+	ret = amdgpu_dpm_baco_exit(adev);
+	if (ret)
+		return ret;
+
+	if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt)
+		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
+
+	return 0;
+}
+
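BACO ("Bus Active, Chip Off") powers the chip down while the PCIe link stays up, which is why the doorbell interrupt is gated around it when RAS is active. A hedged sketch of a caller pairing the two entry points defined above (in practice the runtime-PM path does this with more plumbing; toy_baco_cycle() is illustrative only):

	/* Sketch: take the GPU through one BACO power cycle. */
	static int toy_baco_cycle(struct drm_device *dev)
	{
		int r;

		r = amdgpu_device_baco_enter(dev);	/* chip off, bus active */
		if (r)
			return r;	/* e.g. -ENOTSUPP when BACO is absent */

		/* ... GPU is quiesced here; the PCIe link remains trained ... */

		return amdgpu_device_baco_exit(dev);	/* chip back on */
	}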
+/* Cancel and flush any pending timeout (TDR) handler on every ring. */
+static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev)
+{
+	int i;
+
+	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+		struct amdgpu_ring *ring = adev->rings[i];
+
+		if (!ring || !ring->sched.thread)
+			continue;
+
+		cancel_delayed_work_sync(&ring->sched.work_tdr);
+	}
+}
+
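The helper above leans on a workqueue guarantee: cancel_delayed_work_sync() both removes a pending handler and waits out one that is already running, so no ring's timeout handler can be in flight after the loop. A self-contained sketch of that guarantee using the generic workqueue API (the my_* names are illustrative, not from this patch):

	#include <linux/workqueue.h>

	static void my_timeout_fn(struct work_struct *work)
	{
		/* stand-in for a ring's TDR handler */
	}

	static DECLARE_DELAYED_WORK(my_tdr, my_timeout_fn);

	static void arm_and_quiesce(void)
	{
		schedule_delayed_work(&my_tdr, HZ / 2);	/* arm the watchdog */

		/*
		 * Returns only once my_timeout_fn is neither pending nor
		 * executing -- the property amdgpu_cancel_all_tdr() relies
		 * on for each ring's sched.work_tdr.
		 */
		cancel_delayed_work_sync(&my_tdr);
	}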
+/**
+ * amdgpu_pci_error_detected - Called when a PCI error is detected.
+ * @pdev: PCI device struct
+ * @state: PCI channel state
+ *
+ * Description: Called when a PCI error is detected.
+ *
+ * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
+ */
+pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
+{
+	struct drm_device *dev = pci_get_drvdata(pdev);
+	struct amdgpu_device *adev = drm_to_adev(dev);
+	int i;
+
+	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
+
+	if (adev->gmc.xgmi.num_physical_nodes > 1) {
+		DRM_WARN("No support for XGMI hive yet...");
+		return PCI_ERS_RESULT_DISCONNECT;
+	}
+
+	switch (state) {
+	case pci_channel_io_normal:
+		return PCI_ERS_RESULT_CAN_RECOVER;
+	/* Fatal error, prepare for slot reset */
+	case pci_channel_io_frozen:
+		/*
+		 * Cancel and wait for all TDRs in progress if we fail to
+		 * set adev->in_gpu_reset in amdgpu_device_lock_adev.
+		 *
+		 * Locking adev->reset_sem will prevent any external access
+		 * to the GPU during PCI error recovery.
+		 */
+		while (!amdgpu_device_lock_adev(adev, NULL))
+			amdgpu_cancel_all_tdr(adev);
+
+		/*
+		 * Block any work scheduling, as we do for a regular GPU
+		 * reset, for the duration of the recovery.
+		 */
+		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+			struct amdgpu_ring *ring = adev->rings[i];
+
+			if (!ring || !ring->sched.thread)
+				continue;
+
+			drm_sched_stop(&ring->sched, NULL);
+		}
+		return PCI_ERS_RESULT_NEED_RESET;
+	case pci_channel_io_perm_failure:
+		/* Permanent error, prepare for device removal */
+		return PCI_ERS_RESULT_DISCONNECT;
+	}
+
+	return PCI_ERS_RESULT_NEED_RESET;
+}
+
+/**
+ * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
+ * @pdev: pointer to PCI device
+ */
+pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
+{
+	DRM_INFO("PCI error: mmio enabled callback!!\n");
+
+	/* TODO - dump whatever for debugging purposes */
+
+	/* This is called only if amdgpu_pci_error_detected returns
+	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
+	 * works, so there is no need to reset the slot.
+	 */
+
+	return PCI_ERS_RESULT_RECOVERED;
+}
+
+/**
+ * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
+ * @pdev: PCI device struct
+ *
+ * Description: This routine is called by the PCI error recovery
+ * code after the PCI slot has been reset, just before we
+ * should resume normal operations.
+ */
+pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
+{
+	struct drm_device *dev = pci_get_drvdata(pdev);
+	struct amdgpu_device *adev = drm_to_adev(dev);
+	int r, i;
+	bool need_full_reset = true;
+	u32 memsize;
+	struct list_head device_list;
+
+	DRM_INFO("PCI error: slot reset callback!!\n");
+
+	INIT_LIST_HEAD(&device_list);
+	list_add_tail(&adev->gmc.xgmi.head, &device_list);
+
+	/* wait for the ASIC to come out of reset */
+	msleep(500);
+
+	/* Restore PCI config space */
+	amdgpu_device_load_pci_state(pdev);
+
+	/* confirm the ASIC came out of reset */
+	for (i = 0; i < adev->usec_timeout; i++) {
+		memsize = amdgpu_asic_get_config_memsize(adev);
+
+		if (memsize != 0xffffffff)
+			break;
+		udelay(1);
+	}
+	if (memsize == 0xffffffff) {
+		r = -ETIME;
+		goto out;
+	}
+
+	adev->in_pci_err_recovery = true;
+	r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset);
+	adev->in_pci_err_recovery = false;
+	if (r)
+		goto out;
+
+	r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true);
+
+out:
+	if (!r) {
+		if (amdgpu_device_cache_pci_state(adev->pdev))
+			pci_restore_state(adev->pdev);
+
+		DRM_INFO("PCIe error recovery succeeded\n");
+	} else {
+		DRM_ERROR("PCIe error recovery failed, err:%d\n", r);
+		amdgpu_device_unlock_adev(adev);
+	}
+
+	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
+}
+
+/**
+ * amdgpu_pci_resume() - resume normal ops after PCI reset
+ * @pdev: pointer to PCI device
+ *
+ * Called when the error recovery driver tells us that it's OK to
+ * resume normal operation. Restart the ring schedulers so halted
+ * jobs can resume.
+ */
+void amdgpu_pci_resume(struct pci_dev *pdev)
+{
+	struct drm_device *dev = pci_get_drvdata(pdev);
+	struct amdgpu_device *adev = drm_to_adev(dev);
+	int i;
+
+	DRM_INFO("PCI error: resume callback!!\n");
+
+	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+		struct amdgpu_ring *ring = adev->rings[i];
+
+		if (!ring || !ring->sched.thread)
+			continue;
+
+		drm_sched_resubmit_jobs(&ring->sched);
+		drm_sched_start(&ring->sched, true);
+	}
+
+	amdgpu_device_unlock_adev(adev);
+}
+
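These four callbacks only take effect once they are plugged into the driver's struct pci_driver. A sketch of that wiring via the standard struct pci_error_handlers (upstream amdgpu does this in amdgpu_drv.c; the elided fields are omitted here, not invented):

	static const struct pci_error_handlers amdgpu_pci_err_handler = {
		.error_detected	= amdgpu_pci_error_detected,
		.mmio_enabled	= amdgpu_pci_mmio_enabled,
		.slot_reset	= amdgpu_pci_slot_reset,
		.resume		= amdgpu_pci_resume,
	};

	static struct pci_driver amdgpu_kms_pci_driver = {
		.name		= "amdgpu",
		/* ... id_table, probe, remove, pm ops ... */
		.err_handler	= &amdgpu_pci_err_handler,
	};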
+bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
+{
+	struct drm_device *dev = pci_get_drvdata(pdev);
+	struct amdgpu_device *adev = drm_to_adev(dev);
+	int r;
+
+	r = pci_save_state(pdev);
+	if (!r) {
+		kfree(adev->pci_state);
+
+		adev->pci_state = pci_store_saved_state(pdev);
+
+		if (!adev->pci_state) {
+			DRM_ERROR("Failed to store PCI saved state\n");
+			return false;
+		}
+	} else {
+		DRM_WARN("Failed to save PCI state, err:%d\n", r);
+		return false;
+	}
+
+	return true;
+}
+
+bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
+{
+	struct drm_device *dev = pci_get_drvdata(pdev);
+	struct amdgpu_device *adev = drm_to_adev(dev);
+	int r;
+
+	if (!adev->pci_state)
+		return false;
+
+	r = pci_load_saved_state(pdev, adev->pci_state);
+	if (!r) {
+		pci_restore_state(pdev);
+	} else {
+		DRM_WARN("Failed to load PCI state, err:%d\n", r);
+		return false;
+	}
+
+	return true;
+}
+
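Cache and load form a pair: the config space captured after a successful bring-up is exactly what amdgpu_pci_slot_reset() replays before re-initializing the ASIC. A hedged sketch of how the pair is used (call sites are illustrative; the first half mirrors the out: path of amdgpu_pci_slot_reset() above):

	/* Once the device is fully initialized and config space is sane: */
	if (amdgpu_device_cache_pci_state(adev->pdev))
		pci_restore_state(adev->pdev);

	/* Later, on the PCI error recovery path, replay the cached state: */
	if (!amdgpu_device_load_pci_state(adev->pdev))
		DRM_WARN("no cached PCI state to replay\n");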