2024-05-10 cde9070d9970eef1f7ec2360586c802a16230ad8
kernel/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -27,11 +27,12 @@
  */
 #include <linux/power_supply.h>
 #include <linux/kthread.h>
+#include <linux/module.h>
 #include <linux/console.h>
 #include <linux/slab.h>
-#include <drm/drmP.h>
-#include <drm/drm_crtc_helper.h>
+
 #include <drm/drm_atomic_helper.h>
+#include <drm/drm_probe_helper.h>
 #include <drm/amdgpu_drm.h>
 #include <linux/vgaarb.h>
 #include <linux/vga_switcheroo.h>
@@ -51,6 +52,7 @@
 #endif
 #include "vi.h"
 #include "soc15.h"
+#include "nv.h"
 #include "bif/bif_4_1_d.h"
 #include <linux/pci.h>
 #include <linux/firmware.h>
@@ -59,13 +61,29 @@
 #include "amdgpu_amdkfd.h"
 #include "amdgpu_pm.h"
 
+#include "amdgpu_xgmi.h"
+#include "amdgpu_ras.h"
+#include "amdgpu_pmu.h"
+#include "amdgpu_fru_eeprom.h"
+
+#include <linux/suspend.h>
+#include <drm/task_barrier.h>
+#include <linux/pm_runtime.h>
+
 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
+MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
+MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
+MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
+MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
+MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
+MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
+MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
 
 #define AMDGPU_RESUME_MS 2000
 
-static const char *amdgpu_asic_name[] = {
+const char *amdgpu_asic_name[] = {
 	"TAHITI",
 	"PITCAIRN",
 	"VERDE",
@@ -89,33 +107,206 @@
 	"VEGA12",
 	"VEGA20",
 	"RAVEN",
+	"ARCTURUS",
+	"RENOIR",
+	"NAVI10",
+	"NAVI14",
+	"NAVI12",
+	"SIENNA_CICHLID",
+	"NAVY_FLOUNDER",
 	"LAST",
 };
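
Editor's note, not part of the patch: the table above is made non-static by this change so other files can reference it. A minimal sketch of such a use, assuming the matching extern declaration is provided by amdgpu.h as part of this series:

    /* Hypothetical consumer elsewhere in the driver. */
    extern const char *amdgpu_asic_name[];

    static void example_log_asic_name(struct amdgpu_device *adev)
    {
            dev_info(adev->dev, "ASIC: %s\n", amdgpu_asic_name[adev->asic_type]);
    }
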
+
+/**
+ * DOC: pcie_replay_count
+ *
+ * The amdgpu driver provides a sysfs API for reporting the total number
+ * of PCIe replays (NAKs)
+ * The file pcie_replay_count is used for this and returns the total
+ * number of replays as a sum of the NAKs generated and NAKs received
+ */
+
+static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct drm_device *ddev = dev_get_drvdata(dev);
+	struct amdgpu_device *adev = drm_to_adev(ddev);
+	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
+
+	return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
+}
+
+static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
+		amdgpu_device_get_pcie_replay_count, NULL);
 
 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
 
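Editor's note, not part of the patch: a small userspace sketch of reading the new attribute. The sysfs path is an assumption based on where DRM device attributes normally appear; adjust the card index for your system.

    #include <stdio.h>

    int main(void)
    {
            /* assumed location of the attribute defined above, for card0 */
            const char *path = "/sys/class/drm/card0/device/pcie_replay_count";
            unsigned long long replays;
            FILE *f = fopen(path, "r");

            if (!f) {
                    perror(path);
                    return 1;
            }
            if (fscanf(f, "%llu", &replays) != 1) {
                    fclose(f);
                    fprintf(stderr, "could not parse %s\n", path);
                    return 1;
            }
            fclose(f);
            printf("total PCIe replays (NAKs): %llu\n", replays);
            return 0;
    }
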
 /**
- * amdgpu_device_is_px - Is the device is a dGPU with HG/PX power control
+ * DOC: product_name
+ *
+ * The amdgpu driver provides a sysfs API for reporting the product name
+ * for the device.
+ * The file product_name is used for this and returns the product name
+ * as returned from the FRU.
+ * NOTE: This is only available for certain server cards
+ */
+
+static ssize_t amdgpu_device_get_product_name(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct drm_device *ddev = dev_get_drvdata(dev);
+	struct amdgpu_device *adev = drm_to_adev(ddev);
+
+	return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name);
+}
+
+static DEVICE_ATTR(product_name, S_IRUGO,
+		amdgpu_device_get_product_name, NULL);
+
+/**
+ * DOC: product_number
+ *
+ * The amdgpu driver provides a sysfs API for reporting the part number
+ * for the device.
+ * The file product_number is used for this and returns the part number
+ * as returned from the FRU.
+ * NOTE: This is only available for certain server cards
+ */
+
+static ssize_t amdgpu_device_get_product_number(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct drm_device *ddev = dev_get_drvdata(dev);
+	struct amdgpu_device *adev = drm_to_adev(ddev);
+
+	return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number);
+}
+
+static DEVICE_ATTR(product_number, S_IRUGO,
+		amdgpu_device_get_product_number, NULL);
+
+/**
+ * DOC: serial_number
+ *
+ * The amdgpu driver provides a sysfs API for reporting the serial number
+ * for the device.
+ * The file serial_number is used for this and returns the serial number
+ * as returned from the FRU.
+ * NOTE: This is only available for certain server cards
+ */
+
+static ssize_t amdgpu_device_get_serial_number(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct drm_device *ddev = dev_get_drvdata(dev);
+	struct amdgpu_device *adev = drm_to_adev(ddev);
+
+	return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial);
+}
+
+static DEVICE_ATTR(serial_number, S_IRUGO,
+		amdgpu_device_get_serial_number, NULL);
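
Editor's note, not part of the patch: the hunks above only define the three FRU attributes; their registration happens elsewhere in the file. A hedged sketch of how they could be wired up with the standard driver-model calls (device_create_file/device_remove_file are existing kernel APIs; the helper name below is hypothetical, and real code may use an attribute group instead):

    static int example_register_fru_attrs(struct amdgpu_device *adev)
    {
            struct device *dev = adev->dev;
            int r;

            r = device_create_file(dev, &dev_attr_product_name);
            if (r)
                    return r;
            r = device_create_file(dev, &dev_attr_product_number);
            if (r)
                    goto err_name;
            r = device_create_file(dev, &dev_attr_serial_number);
            if (r)
                    goto err_number;
            return 0;

    err_number:
            device_remove_file(dev, &dev_attr_product_number);
    err_name:
            device_remove_file(dev, &dev_attr_product_name);
            return r;
    }
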
+
+/**
+ * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control
  *
  * @dev: drm_device pointer
  *
  * Returns true if the device is a dGPU with HG/PX power control,
  * otherwise return false.
  */
-bool amdgpu_device_is_px(struct drm_device *dev)
+bool amdgpu_device_supports_boco(struct drm_device *dev)
 {
-	struct amdgpu_device *adev = dev->dev_private;
+	struct amdgpu_device *adev = drm_to_adev(dev);
 
 	if (adev->flags & AMD_IS_PX)
 		return true;
 	return false;
 }
 
+/**
+ * amdgpu_device_supports_baco - Does the device support BACO
+ *
+ * @dev: drm_device pointer
+ *
+ * Returns true if the device supports BACO,
+ * otherwise return false.
+ */
+bool amdgpu_device_supports_baco(struct drm_device *dev)
+{
+	struct amdgpu_device *adev = drm_to_adev(dev);
+
+	return amdgpu_asic_supports_baco(adev);
+}
+
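Editor's note, not part of the patch: a hedged sketch of how the two helpers above are typically consumed when deciding how a dGPU may be powered down at runtime; the log messages are illustrative only and not taken from this file.

    static void example_report_runtime_pm_mode(struct amdgpu_device *adev)
    {
            struct drm_device *dev = adev_to_drm(adev);

            if (amdgpu_device_supports_boco(dev))
                    dev_info(adev->dev, "runtime PM via BOCO/HG-PX platform control\n");
            else if (amdgpu_device_supports_baco(dev))
                    dev_info(adev->dev, "runtime PM via BACO\n");
            else
                    dev_info(adev->dev, "no runtime power-down support\n");
    }
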
114242 /*
115
- * MMIO register access helper functions.
243
+ * VRAM access helper functions
244
+ */
245
+
246
+/**
247
+ * amdgpu_device_vram_access - read/write a buffer in vram
248
+ *
249
+ * @adev: amdgpu_device pointer
250
+ * @pos: offset of the buffer in vram
251
+ * @buf: virtual address of the buffer in system memory
252
+ * @size: read/write size in bytes; the buffer at @buf must be at least @size bytes
253
+ * @write: true - write to vram, otherwise - read from vram
254
+ */
255
+void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
256
+ uint32_t *buf, size_t size, bool write)
257
+{
258
+ unsigned long flags;
259
+ uint32_t hi = ~0;
260
+ uint64_t last;
261
+
262
+
263
+#ifdef CONFIG_64BIT
264
+ last = min(pos + size, adev->gmc.visible_vram_size);
265
+ if (last > pos) {
266
+ void __iomem *addr = adev->mman.aper_base_kaddr + pos;
267
+ size_t count = last - pos;
268
+
269
+ if (write) {
270
+ memcpy_toio(addr, buf, count);
271
+ mb();
272
+ amdgpu_asic_flush_hdp(adev, NULL);
273
+ } else {
274
+ amdgpu_asic_invalidate_hdp(adev, NULL);
275
+ mb();
276
+ memcpy_fromio(buf, addr, count);
277
+ }
278
+
279
+ if (count == size)
280
+ return;
281
+
282
+ pos += count;
283
+ buf += count / 4;
284
+ size -= count;
285
+ }
286
+#endif
287
+
288
+ spin_lock_irqsave(&adev->mmio_idx_lock, flags);
289
+ for (last = pos + size; pos < last; pos += 4) {
290
+ uint32_t tmp = pos >> 31;
291
+
292
+ WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
293
+ if (tmp != hi) {
294
+ WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
295
+ hi = tmp;
296
+ }
297
+ if (write)
298
+ WREG32_NO_KIQ(mmMM_DATA, *buf++);
299
+ else
300
+ *buf++ = RREG32_NO_KIQ(mmMM_DATA);
301
+ }
302
+ spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
303
+}
304
+
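Editor's note, not part of the patch: a hedged sketch of calling the helper above from elsewhere in the driver; the buffer size and offset handling are illustrative only.

    /* Illustrative only: copy 256 bytes of VRAM at byte offset `pos` into a
     * word-aligned temporary buffer using the helper added above. */
    static void example_dump_vram_window(struct amdgpu_device *adev, loff_t pos)
    {
            uint32_t buf[64];       /* 256 bytes; the helper works in 4-byte words */

            amdgpu_device_vram_access(adev, pos, buf, sizeof(buf), false);
            /* buf[] now holds the VRAM contents and can be logged or copied out */
    }
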
305
+/*
306
+ * register access helper functions.
116307 */
117308 /**
118
- * amdgpu_mm_rreg - read a memory mapped IO register
309
+ * amdgpu_device_rreg - read a memory mapped IO or indirect register
119310 *
120311 * @adev: amdgpu_device pointer
121312 * @reg: dword aligned register offset
....@@ -123,25 +314,29 @@
123314 *
124315 * Returns the 32 bit value from the offset specified.
125316 */
126
-uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
127
- uint32_t acc_flags)
317
+uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
318
+ uint32_t reg, uint32_t acc_flags)
128319 {
129320 uint32_t ret;
130321
131
- if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
132
- return amdgpu_virt_kiq_rreg(adev, reg);
322
+ if (adev->in_pci_err_recovery)
323
+ return 0;
133324
134
- if ((reg * 4) < adev->rmmio_size && !(acc_flags & AMDGPU_REGS_IDX))
135
- ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
136
- else {
137
- unsigned long flags;
138
-
139
- spin_lock_irqsave(&adev->mmio_idx_lock, flags);
140
- writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4));
141
- ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
142
- spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
325
+ if ((reg * 4) < adev->rmmio_size) {
326
+ if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
327
+ amdgpu_sriov_runtime(adev) &&
328
+ down_read_trylock(&adev->reset_sem)) {
329
+ ret = amdgpu_kiq_rreg(adev, reg);
330
+ up_read(&adev->reset_sem);
331
+ } else {
332
+ ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
333
+ }
334
+ } else {
335
+ ret = adev->pcie_rreg(adev, reg * 4);
143336 }
144
- trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret);
337
+
338
+ trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
339
+
145340 return ret;
146341 }
147342
....@@ -159,7 +354,11 @@
159354 *
160355 * Returns the 8 bit value from the offset specified.
161356 */
162
-uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) {
357
+uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
358
+{
359
+ if (adev->in_pci_err_recovery)
360
+ return 0;
361
+
163362 if (offset < adev->rmmio_size)
164363 return (readb(adev->rmmio + offset));
165364 BUG();
....@@ -180,7 +379,11 @@
180379 *
181380 * Writes the value specified to the offset specified.
182381 */
183
-void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) {
382
+void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
383
+{
384
+ if (adev->in_pci_err_recovery)
385
+ return;
386
+
184387 if (offset < adev->rmmio_size)
185388 writeb(value, adev->rmmio + offset);
186389 else
....@@ -188,7 +391,7 @@
188391 }
189392
190393 /**
191
- * amdgpu_mm_wreg - write to a memory mapped IO register
394
+ * amdgpu_device_wreg - write to a memory mapped IO or indirect register
192395 *
193396 * @adev: amdgpu_device pointer
194397 * @reg: dword aligned register offset
....@@ -197,31 +400,47 @@
197400 *
198401 * Writes the value specified to the offset specified.
199402 */
200
-void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
201
- uint32_t acc_flags)
403
+void amdgpu_device_wreg(struct amdgpu_device *adev,
404
+ uint32_t reg, uint32_t v,
405
+ uint32_t acc_flags)
202406 {
203
- trace_amdgpu_mm_wreg(adev->pdev->device, reg, v);
407
+ if (adev->in_pci_err_recovery)
408
+ return;
204409
205
- if (adev->asic_type >= CHIP_VEGA10 && reg == 0) {
206
- adev->last_mm_index = v;
410
+ if ((reg * 4) < adev->rmmio_size) {
411
+ if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
412
+ amdgpu_sriov_runtime(adev) &&
413
+ down_read_trylock(&adev->reset_sem)) {
414
+ amdgpu_kiq_wreg(adev, reg, v);
415
+ up_read(&adev->reset_sem);
416
+ } else {
417
+ writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
418
+ }
419
+ } else {
420
+ adev->pcie_wreg(adev, reg * 4, v);
207421 }
208422
209
- if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
210
- return amdgpu_virt_kiq_wreg(adev, reg, v);
423
+ trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
424
+}
211425
212
- if ((reg * 4) < adev->rmmio_size && !(acc_flags & AMDGPU_REGS_IDX))
426
+/*
427
+ * amdgpu_mm_wreg_mmio_rlc - write register either with mmio or with RLC path if in range
428
+ *
429
+ * this function is invoked only for debugfs register access
430
+ * */
431
+void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
432
+ uint32_t reg, uint32_t v)
433
+{
434
+ if (adev->in_pci_err_recovery)
435
+ return;
436
+
437
+ if (amdgpu_sriov_fullaccess(adev) &&
438
+ adev->gfx.rlc.funcs &&
439
+ adev->gfx.rlc.funcs->is_rlcg_access_range) {
440
+ if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
441
+ return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v);
442
+ } else {
213443 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
214
- else {
215
- unsigned long flags;
216
-
217
- spin_lock_irqsave(&adev->mmio_idx_lock, flags);
218
- writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4));
219
- writel(v, ((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
220
- spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
221
- }
222
-
223
- if (adev->asic_type >= CHIP_VEGA10 && reg == 1 && adev->last_mm_index == 0x5702C) {
224
- udelay(500);
225444 }
226445 }
227446
....@@ -235,6 +454,9 @@
235454 */
236455 u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
237456 {
457
+ if (adev->in_pci_err_recovery)
458
+ return 0;
459
+
238460 if ((reg * 4) < adev->rio_mem_size)
239461 return ioread32(adev->rio_mem + (reg * 4));
240462 else {
....@@ -254,19 +476,14 @@
254476 */
255477 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
256478 {
257
- if (adev->asic_type >= CHIP_VEGA10 && reg == 0) {
258
- adev->last_mm_index = v;
259
- }
479
+ if (adev->in_pci_err_recovery)
480
+ return;
260481
261482 if ((reg * 4) < adev->rio_mem_size)
262483 iowrite32(v, adev->rio_mem + (reg * 4));
263484 else {
264485 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
265486 iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
266
- }
267
-
268
- if (adev->asic_type >= CHIP_VEGA10 && reg == 1 && adev->last_mm_index == 0x5702C) {
269
- udelay(500);
270487 }
271488 }
272489
....@@ -281,6 +498,9 @@
281498 */
282499 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
283500 {
501
+ if (adev->in_pci_err_recovery)
502
+ return 0;
503
+
284504 if (index < adev->doorbell.num_doorbells) {
285505 return readl(adev->doorbell.ptr + index);
286506 } else {
....@@ -301,6 +521,9 @@
301521 */
302522 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
303523 {
524
+ if (adev->in_pci_err_recovery)
525
+ return;
526
+
304527 if (index < adev->doorbell.num_doorbells) {
305528 writel(v, adev->doorbell.ptr + index);
306529 } else {
....@@ -319,6 +542,9 @@
319542 */
320543 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
321544 {
545
+ if (adev->in_pci_err_recovery)
546
+ return 0;
547
+
322548 if (index < adev->doorbell.num_doorbells) {
323549 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
324550 } else {
....@@ -339,6 +565,9 @@
339565 */
340566 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
341567 {
568
+ if (adev->in_pci_err_recovery)
569
+ return;
570
+
342571 if (index < adev->doorbell.num_doorbells) {
343572 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
344573 } else {
....@@ -347,9 +576,138 @@
347576 }
348577
349578 /**
579
+ * amdgpu_device_indirect_rreg - read an indirect register
580
+ *
581
+ * @adev: amdgpu_device pointer
582
+ * @pcie_index: mmio register offset
583
+ * @pcie_data: mmio register offset
584
+ *
585
+ * Returns the value of indirect register @reg_addr
586
+ */
587
+u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
588
+ u32 pcie_index, u32 pcie_data,
589
+ u32 reg_addr)
590
+{
591
+ unsigned long flags;
592
+ u32 r;
593
+ void __iomem *pcie_index_offset;
594
+ void __iomem *pcie_data_offset;
595
+
596
+ spin_lock_irqsave(&adev->pcie_idx_lock, flags);
597
+ pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
598
+ pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
599
+
600
+ writel(reg_addr, pcie_index_offset);
601
+ readl(pcie_index_offset);
602
+ r = readl(pcie_data_offset);
603
+ spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
604
+
605
+ return r;
606
+}
607
+
608
+/**
609
+ * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
610
+ *
611
+ * @adev: amdgpu_device pointer
612
+ * @pcie_index: mmio register offset
613
+ * @pcie_data: mmio register offset
614
+ *
615
+ * Returns the value of indirect register @reg_addr
616
+ */
617
+u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
618
+ u32 pcie_index, u32 pcie_data,
619
+ u32 reg_addr)
620
+{
621
+ unsigned long flags;
622
+ u64 r;
623
+ void __iomem *pcie_index_offset;
624
+ void __iomem *pcie_data_offset;
625
+
626
+ spin_lock_irqsave(&adev->pcie_idx_lock, flags);
627
+ pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
628
+ pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
629
+
630
+ /* read low 32 bits */
631
+ writel(reg_addr, pcie_index_offset);
632
+ readl(pcie_index_offset);
633
+ r = readl(pcie_data_offset);
634
+ /* read high 32 bits */
635
+ writel(reg_addr + 4, pcie_index_offset);
636
+ readl(pcie_index_offset);
637
+ r |= ((u64)readl(pcie_data_offset) << 32);
638
+ spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
639
+
640
+ return r;
641
+}
642
+
643
+/**
644
+ * amdgpu_device_indirect_wreg - write an indirect register address
645
+ *
646
+ * @adev: amdgpu_device pointer
647
+ * @pcie_index: mmio register offset
648
+ * @pcie_data: mmio register offset
649
+ * @reg_addr: indirect register offset
650
+ * @reg_data: indirect register data
651
+ *
652
+ */
653
+void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
654
+ u32 pcie_index, u32 pcie_data,
655
+ u32 reg_addr, u32 reg_data)
656
+{
657
+ unsigned long flags;
658
+ void __iomem *pcie_index_offset;
659
+ void __iomem *pcie_data_offset;
660
+
661
+ spin_lock_irqsave(&adev->pcie_idx_lock, flags);
662
+ pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
663
+ pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
664
+
665
+ writel(reg_addr, pcie_index_offset);
666
+ readl(pcie_index_offset);
667
+ writel(reg_data, pcie_data_offset);
668
+ readl(pcie_data_offset);
669
+ spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
670
+}
671
+
672
+/**
673
+ * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
674
+ *
675
+ * @adev: amdgpu_device pointer
676
+ * @pcie_index: mmio register offset
677
+ * @pcie_data: mmio register offset
678
+ * @reg_addr: indirect register offset
679
+ * @reg_data: indirect register data
680
+ *
681
+ */
682
+void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
683
+ u32 pcie_index, u32 pcie_data,
684
+ u32 reg_addr, u64 reg_data)
685
+{
686
+ unsigned long flags;
687
+ void __iomem *pcie_index_offset;
688
+ void __iomem *pcie_data_offset;
689
+
690
+ spin_lock_irqsave(&adev->pcie_idx_lock, flags);
691
+ pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
692
+ pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
693
+
694
+ /* write low 32 bits */
695
+ writel(reg_addr, pcie_index_offset);
696
+ readl(pcie_index_offset);
697
+ writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
698
+ readl(pcie_data_offset);
699
+ /* write high 32 bits */
700
+ writel(reg_addr + 4, pcie_index_offset);
701
+ readl(pcie_index_offset);
702
+ writel((u32)(reg_data >> 32), pcie_data_offset);
703
+ readl(pcie_data_offset);
704
+ spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
705
+}
706
+
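Editor's note, not part of the patch: the indirect helpers above are intended to be plugged into the per-ASIC pcie_rreg/pcie_wreg callbacks. A hedged sketch of that wiring; the 0x38/0x3c offsets stand in for a given ASIC's real PCIE index/data register offsets and are not taken from this file.

    /* Hypothetical glue in an ASIC-specific file. */
    static u32 example_pcie_rreg(struct amdgpu_device *adev, u32 reg_addr)
    {
            return amdgpu_device_indirect_rreg(adev, 0x38, 0x3c, reg_addr);
    }

    static void example_pcie_wreg(struct amdgpu_device *adev, u32 reg_addr, u32 reg_data)
    {
            amdgpu_device_indirect_wreg(adev, 0x38, 0x3c, reg_addr, reg_data);
    }

    /* ...and during early init of that ASIC:
     *         adev->pcie_rreg = example_pcie_rreg;
     *         adev->pcie_wreg = example_pcie_wreg;
     */
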
707
+/**
350708 * amdgpu_invalid_rreg - dummy reg read function
351709 *
352
- * @adev: amdgpu device pointer
710
+ * @adev: amdgpu_device pointer
353711 * @reg: offset of register
354712 *
355713 * Dummy register read function. Used for register blocks
....@@ -366,7 +724,7 @@
366724 /**
367725 * amdgpu_invalid_wreg - dummy reg write function
368726 *
369
- * @adev: amdgpu device pointer
727
+ * @adev: amdgpu_device pointer
370728 * @reg: offset of register
371729 * @v: value to write to the register
372730 *
....@@ -381,9 +739,43 @@
381739 }
382740
383741 /**
742
+ * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
743
+ *
744
+ * @adev: amdgpu_device pointer
745
+ * @reg: offset of register
746
+ *
747
+ * Dummy register read function. Used for register blocks
748
+ * that certain asics don't have (all asics).
749
+ * Returns the value in the register.
750
+ */
751
+static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
752
+{
753
+ DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
754
+ BUG();
755
+ return 0;
756
+}
757
+
758
+/**
759
+ * amdgpu_invalid_wreg64 - dummy reg write function
760
+ *
761
+ * @adev: amdgpu_device pointer
762
+ * @reg: offset of register
763
+ * @v: value to write to the register
764
+ *
765
+ * Dummy register read function. Used for register blocks
766
+ * that certain asics don't have (all asics).
767
+ */
768
+static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
769
+{
770
+ DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
771
+ reg, v);
772
+ BUG();
773
+}
774
+
775
+/**
384776 * amdgpu_block_invalid_rreg - dummy reg read function
385777 *
386
- * @adev: amdgpu device pointer
778
+ * @adev: amdgpu_device pointer
387779 * @block: offset of instance
388780 * @reg: offset of register
389781 *
....@@ -403,7 +795,7 @@
403795 /**
404796 * amdgpu_block_invalid_wreg - dummy reg write function
405797 *
406
- * @adev: amdgpu device pointer
798
+ * @adev: amdgpu_device pointer
407799 * @block: offset of instance
408800 * @reg: offset of register
409801 * @v: value to write to the register
....@@ -421,9 +813,23 @@
421813 }
422814
423815 /**
816
+ * amdgpu_device_asic_init - Wrapper for atom asic_init
817
+ *
818
+ * @adev: amdgpu_device pointer
819
+ *
820
+ * Does any asic specific work and then calls atom asic init.
821
+ */
822
+static int amdgpu_device_asic_init(struct amdgpu_device *adev)
823
+{
824
+ amdgpu_asic_pre_asic_init(adev);
825
+
826
+ return amdgpu_atom_asic_init(adev->mode_info.atom_context);
827
+}
828
+
829
+/**
424830 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
425831 *
426
- * @adev: amdgpu device pointer
832
+ * @adev: amdgpu_device pointer
427833 *
428834 * Allocates a scratch page of VRAM for use by various things in the
429835 * driver.
....@@ -440,7 +846,7 @@
440846 /**
441847 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
442848 *
443
- * @adev: amdgpu device pointer
849
+ * @adev: amdgpu_device pointer
444850 *
445851 * Frees the VRAM scratch page.
446852 */
....@@ -479,7 +885,10 @@
479885 } else {
480886 tmp = RREG32(reg);
481887 tmp &= ~and_mask;
482
- tmp |= or_mask;
888
+ if (adev->family >= AMDGPU_FAMILY_AI)
889
+ tmp |= (or_mask & and_mask);
890
+ else
891
+ tmp |= or_mask;
483892 }
484893 WREG32(reg, tmp);
485894 }
....@@ -511,6 +920,7 @@
511920 */
512921 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
513922 {
923
+
514924 /* No doorbell on SI hardware generation */
515925 if (adev->asic_type < CHIP_BONAIRE) {
516926 adev->doorbell.base = 0;
....@@ -523,14 +933,25 @@
523933 if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
524934 return -EINVAL;
525935
936
+ amdgpu_asic_init_doorbell_index(adev);
937
+
526938 /* doorbell bar mapping */
527939 adev->doorbell.base = pci_resource_start(adev->pdev, 2);
528940 adev->doorbell.size = pci_resource_len(adev->pdev, 2);
529941
530942 adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
531
- AMDGPU_DOORBELL_MAX_ASSIGNMENT+1);
943
+ adev->doorbell_index.max_assignment+1);
532944 if (adev->doorbell.num_doorbells == 0)
533945 return -EINVAL;
946
+
947
+ /* For Vega, reserve and map two pages on doorbell BAR since SDMA
948
+ * paging queue doorbell use the second page. The
949
+ * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
950
+ * doorbells are in the first page. So with paging queue enabled,
951
+ * the max num_doorbells should be increased by one page (0x400 in dwords).
952
+ */
953
+ if (adev->asic_type >= CHIP_VEGA10)
954
+ adev->doorbell.num_doorbells += 0x400;
534955
535956 adev->doorbell.ptr = ioremap(adev->doorbell.base,
536957 adev->doorbell.num_doorbells *
....@@ -652,71 +1073,6 @@
6521073 }
6531074
6541075 /**
655
- * amdgpu_device_vram_location - try to find VRAM location
656
- *
657
- * @adev: amdgpu device structure holding all necessary informations
658
- * @mc: memory controller structure holding memory informations
659
- * @base: base address at which to put VRAM
660
- *
661
- * Function will try to place VRAM at base address provided
662
- * as parameter.
663
- */
664
-void amdgpu_device_vram_location(struct amdgpu_device *adev,
665
- struct amdgpu_gmc *mc, u64 base)
666
-{
667
- uint64_t limit = (uint64_t)amdgpu_vram_limit << 20;
668
-
669
- mc->vram_start = base;
670
- mc->vram_end = mc->vram_start + mc->mc_vram_size - 1;
671
- if (limit && limit < mc->real_vram_size)
672
- mc->real_vram_size = limit;
673
- dev_info(adev->dev, "VRAM: %lluM 0x%016llX - 0x%016llX (%lluM used)\n",
674
- mc->mc_vram_size >> 20, mc->vram_start,
675
- mc->vram_end, mc->real_vram_size >> 20);
676
-}
677
-
678
-/**
679
- * amdgpu_device_gart_location - try to find GART location
680
- *
681
- * @adev: amdgpu device structure holding all necessary informations
682
- * @mc: memory controller structure holding memory informations
683
- *
684
- * Function will place try to place GART before or after VRAM.
685
- *
686
- * If GART size is bigger than space left then we ajust GART size.
687
- * Thus function will never fails.
688
- */
689
-void amdgpu_device_gart_location(struct amdgpu_device *adev,
690
- struct amdgpu_gmc *mc)
691
-{
692
- u64 size_af, size_bf;
693
-
694
- mc->gart_size += adev->pm.smu_prv_buffer_size;
695
-
696
- size_af = adev->gmc.mc_mask - mc->vram_end;
697
- size_bf = mc->vram_start;
698
- if (size_bf > size_af) {
699
- if (mc->gart_size > size_bf) {
700
- dev_warn(adev->dev, "limiting GART\n");
701
- mc->gart_size = size_bf;
702
- }
703
- mc->gart_start = 0;
704
- } else {
705
- if (mc->gart_size > size_af) {
706
- dev_warn(adev->dev, "limiting GART\n");
707
- mc->gart_size = size_af;
708
- }
709
- /* VCE doesn't like it when BOs cross a 4GB segment, so align
710
- * the GART base on a 4GB boundary as well.
711
- */
712
- mc->gart_start = ALIGN(mc->vram_end + 1, 0x100000000ULL);
713
- }
714
- mc->gart_end = mc->gart_start + mc->gart_size - 1;
715
- dev_info(adev->dev, "GART: %lluM 0x%016llX - 0x%016llX\n",
716
- mc->gart_size >> 20, mc->gart_start, mc->gart_end);
717
-}
718
-
719
-/**
7201076 * amdgpu_device_resize_fb_bar - try to resize FB BAR
7211077 *
7221078 * @adev: amdgpu_device pointer
....@@ -735,8 +1091,16 @@
7351091 u16 cmd;
7361092 int r;
7371093
1094
+ if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
1095
+ return 0;
1096
+
7381097 /* Bypass for VF */
7391098 if (amdgpu_sriov_vf(adev))
1099
+ return 0;
1100
+
1101
+ /* skip if the bios has already enabled large BAR */
1102
+ if (adev->gmc.real_vram_size &&
1103
+ (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
7401104 return 0;
7411105
7421106 /* Check if the root BUS has 64bit memory resources */
....@@ -913,7 +1277,7 @@
9131277 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
9141278 {
9151279 struct sysinfo si;
916
- bool is_os_64 = (sizeof(void *) == 8) ? true : false;
1280
+ bool is_os_64 = (sizeof(void *) == 8);
9171281 uint64_t total_memory;
9181282 uint64_t dram_size_seven_GB = 0x1B8000000;
9191283 uint64_t dram_size_three_GB = 0xB8000000;
....@@ -958,7 +1322,7 @@
9581322 * Validates certain module parameters and updates
9591323 * the associated values used by the driver (all asics).
9601324 */
961
-static void amdgpu_device_check_arguments(struct amdgpu_device *adev)
1325
+static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
9621326 {
9631327 if (amdgpu_sched_jobs < 4) {
9641328 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
....@@ -991,25 +1355,36 @@
9911355 amdgpu_vm_fragment_size = -1;
9921356 }
9931357
1358
+ if (amdgpu_sched_hw_submission < 2) {
1359
+ dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1360
+ amdgpu_sched_hw_submission);
1361
+ amdgpu_sched_hw_submission = 2;
1362
+ } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1363
+ dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1364
+ amdgpu_sched_hw_submission);
1365
+ amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1366
+ }
1367
+
9941368 amdgpu_device_check_smu_prv_buffer_size(adev);
9951369
9961370 amdgpu_device_check_vm_size(adev);
9971371
9981372 amdgpu_device_check_block_size(adev);
9991373
1000
- if (amdgpu_vram_page_split != -1 && (amdgpu_vram_page_split < 16 ||
1001
- !is_power_of_2(amdgpu_vram_page_split))) {
1002
- dev_warn(adev->dev, "invalid VRAM page split (%d)\n",
1003
- amdgpu_vram_page_split);
1004
- amdgpu_vram_page_split = 1024;
1005
- }
1006
-
1007
- if (amdgpu_lockup_timeout == 0) {
1008
- dev_warn(adev->dev, "lockup_timeout msut be > 0, adjusting to 10000\n");
1009
- amdgpu_lockup_timeout = 10000;
1010
- }
1011
-
10121374 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1375
+
1376
+ amdgpu_gmc_tmz_set(adev);
1377
+
1378
+ if (amdgpu_num_kcq == -1) {
1379
+ amdgpu_num_kcq = 8;
1380
+ } else if (amdgpu_num_kcq > 8 || amdgpu_num_kcq < 0) {
1381
+ amdgpu_num_kcq = 8;
1382
+ dev_warn(adev->dev, "set kernel compute queue number to 8 due to invalid parameter provided by user\n");
1383
+ }
1384
+
1385
+ amdgpu_gmc_noretry_set(adev);
1386
+
1387
+ return 0;
10131388 }
10141389
10151390 /**
....@@ -1021,27 +1396,38 @@
10211396 * Callback for the switcheroo driver. Suspends or resumes the
10221397 * the asics before or after it is powered up using ACPI methods.
10231398 */
1024
-static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, enum vga_switcheroo_state state)
1399
+static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1400
+ enum vga_switcheroo_state state)
10251401 {
10261402 struct drm_device *dev = pci_get_drvdata(pdev);
1403
+ int r;
10271404
1028
- if (amdgpu_device_is_px(dev) && state == VGA_SWITCHEROO_OFF)
1405
+ if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF)
10291406 return;
10301407
10311408 if (state == VGA_SWITCHEROO_ON) {
1032
- pr_info("amdgpu: switched on\n");
1409
+ pr_info("switched on\n");
10331410 /* don't suspend or resume card normally */
10341411 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
10351412
1036
- amdgpu_device_resume(dev, true, true);
1413
+ pci_set_power_state(dev->pdev, PCI_D0);
1414
+ amdgpu_device_load_pci_state(dev->pdev);
1415
+ r = pci_enable_device(dev->pdev);
1416
+ if (r)
1417
+ DRM_WARN("pci_enable_device failed (%d)\n", r);
1418
+ amdgpu_device_resume(dev, true);
10371419
10381420 dev->switch_power_state = DRM_SWITCH_POWER_ON;
10391421 drm_kms_helper_poll_enable(dev);
10401422 } else {
1041
- pr_info("amdgpu: switched off\n");
1423
+ pr_info("switched off\n");
10421424 drm_kms_helper_poll_disable(dev);
10431425 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1044
- amdgpu_device_suspend(dev, true, true);
1426
+ amdgpu_device_suspend(dev, true);
1427
+ amdgpu_device_cache_pci_state(dev->pdev);
1428
+ /* Shut down the device */
1429
+ pci_disable_device(dev->pdev);
1430
+ pci_set_power_state(dev->pdev, PCI_D3cold);
10451431 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
10461432 }
10471433 }
....@@ -1064,7 +1450,7 @@
10641450 * locking inversion with the driver load path. And the access here is
10651451 * completely racy anyway. So don't bother with locking for now.
10661452 */
1067
- return dev->open_count == 0;
1453
+ return atomic_read(&dev->open_count) == 0;
10681454 }
10691455
10701456 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
....@@ -1304,7 +1690,7 @@
13041690 adev->enable_virtual_display = false;
13051691
13061692 if (amdgpu_virtual_display) {
1307
- struct drm_device *ddev = adev->ddev;
1693
+ struct drm_device *ddev = adev_to_drm(adev);
13081694 const char *pci_address_name = pci_name(ddev->pdev);
13091695 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
13101696
....@@ -1357,22 +1743,25 @@
13571743 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
13581744 {
13591745 const char *chip_name;
1360
- char fw_name[30];
1746
+ char fw_name[40];
13611747 int err;
13621748 const struct gpu_info_firmware_header_v1_0 *hdr;
13631749
13641750 adev->firmware.gpu_info_fw = NULL;
13651751
1752
+ if (adev->mman.discovery_bin) {
1753
+ amdgpu_discovery_get_gfx_info(adev);
1754
+
1755
+ /*
1756
+ * FIXME: The bounding box is still needed by Navi12, so
1757
+ * temporarily read it from gpu_info firmware. Should be dropped
1758
+ * when DAL no longer needs it.
1759
+ */
1760
+ if (adev->asic_type != CHIP_NAVI12)
1761
+ return 0;
1762
+ }
1763
+
13661764 switch (adev->asic_type) {
1367
- case CHIP_TOPAZ:
1368
- case CHIP_TONGA:
1369
- case CHIP_FIJI:
1370
- case CHIP_POLARIS10:
1371
- case CHIP_POLARIS11:
1372
- case CHIP_POLARIS12:
1373
- case CHIP_VEGAM:
1374
- case CHIP_CARRIZO:
1375
- case CHIP_STONEY:
13761765 #ifdef CONFIG_DRM_AMDGPU_SI
13771766 case CHIP_VERDE:
13781767 case CHIP_TAHITI:
....@@ -1387,7 +1776,18 @@
13871776 case CHIP_KABINI:
13881777 case CHIP_MULLINS:
13891778 #endif
1779
+ case CHIP_TOPAZ:
1780
+ case CHIP_TONGA:
1781
+ case CHIP_FIJI:
1782
+ case CHIP_POLARIS10:
1783
+ case CHIP_POLARIS11:
1784
+ case CHIP_POLARIS12:
1785
+ case CHIP_VEGAM:
1786
+ case CHIP_CARRIZO:
1787
+ case CHIP_STONEY:
13901788 case CHIP_VEGA20:
1789
+ case CHIP_SIENNA_CICHLID:
1790
+ case CHIP_NAVY_FLOUNDER:
13911791 default:
13921792 return 0;
13931793 case CHIP_VEGA10:
....@@ -1397,7 +1797,30 @@
13971797 chip_name = "vega12";
13981798 break;
13991799 case CHIP_RAVEN:
1400
- chip_name = "raven";
1800
+ if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1801
+ chip_name = "raven2";
1802
+ else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1803
+ chip_name = "picasso";
1804
+ else
1805
+ chip_name = "raven";
1806
+ break;
1807
+ case CHIP_ARCTURUS:
1808
+ chip_name = "arcturus";
1809
+ break;
1810
+ case CHIP_RENOIR:
1811
+ if (adev->apu_flags & AMD_APU_IS_RENOIR)
1812
+ chip_name = "renoir";
1813
+ else
1814
+ chip_name = "green_sardine";
1815
+ break;
1816
+ case CHIP_NAVI10:
1817
+ chip_name = "navi10";
1818
+ break;
1819
+ case CHIP_NAVI14:
1820
+ chip_name = "navi14";
1821
+ break;
1822
+ case CHIP_NAVI12:
1823
+ chip_name = "navi12";
14011824 break;
14021825 }
14031826
....@@ -1427,6 +1850,12 @@
14271850 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
14281851 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
14291852
1853
+ /*
1854
+ * Should be droped when DAL no longer needs it.
1855
+ */
1856
+ if (adev->asic_type == CHIP_NAVI12)
1857
+ goto parse_soc_bounding_box;
1858
+
14301859 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
14311860 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
14321861 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
....@@ -1445,6 +1874,27 @@
14451874 adev->gfx.cu_info.max_scratch_slots_per_cu =
14461875 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
14471876 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
1877
+ if (hdr->version_minor >= 1) {
1878
+ const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1879
+ (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1880
+ le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1881
+ adev->gfx.config.num_sc_per_sh =
1882
+ le32_to_cpu(gpu_info_fw->num_sc_per_sh);
1883
+ adev->gfx.config.num_packer_per_sc =
1884
+ le32_to_cpu(gpu_info_fw->num_packer_per_sc);
1885
+ }
1886
+
1887
+parse_soc_bounding_box:
1888
+ /*
1889
+ * soc bounding box info is not integrated in the discovery table,
+ * so we always need to parse it from the gpu info firmware when it is needed.
1891
+ */
1892
+ if (hdr->version_minor == 2) {
1893
+ const struct gpu_info_firmware_v1_2 *gpu_info_fw =
1894
+ (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
1895
+ le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1896
+ adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
1897
+ }
14481898 break;
14491899 }
14501900 default:
....@@ -1473,25 +1923,13 @@
14731923
14741924 amdgpu_device_enable_virtual_display(adev);
14751925
1476
- switch (adev->asic_type) {
1477
- case CHIP_TOPAZ:
1478
- case CHIP_TONGA:
1479
- case CHIP_FIJI:
1480
- case CHIP_POLARIS10:
1481
- case CHIP_POLARIS11:
1482
- case CHIP_POLARIS12:
1483
- case CHIP_VEGAM:
1484
- case CHIP_CARRIZO:
1485
- case CHIP_STONEY:
1486
- if (adev->asic_type == CHIP_CARRIZO || adev->asic_type == CHIP_STONEY)
1487
- adev->family = AMDGPU_FAMILY_CZ;
1488
- else
1489
- adev->family = AMDGPU_FAMILY_VI;
1490
-
1491
- r = vi_set_ip_blocks(adev);
1926
+ if (amdgpu_sriov_vf(adev)) {
1927
+ r = amdgpu_virt_request_full_gpu(adev, true);
14921928 if (r)
14931929 return r;
1494
- break;
1930
+ }
1931
+
1932
+ switch (adev->asic_type) {
14951933 #ifdef CONFIG_DRM_AMDGPU_SI
14961934 case CHIP_VERDE:
14971935 case CHIP_TAHITI:
....@@ -1510,21 +1948,41 @@
15101948 case CHIP_KAVERI:
15111949 case CHIP_KABINI:
15121950 case CHIP_MULLINS:
1513
- if ((adev->asic_type == CHIP_BONAIRE) || (adev->asic_type == CHIP_HAWAII))
1514
- adev->family = AMDGPU_FAMILY_CI;
1515
- else
1951
+ if (adev->flags & AMD_IS_APU)
15161952 adev->family = AMDGPU_FAMILY_KV;
1953
+ else
1954
+ adev->family = AMDGPU_FAMILY_CI;
15171955
15181956 r = cik_set_ip_blocks(adev);
15191957 if (r)
15201958 return r;
15211959 break;
15221960 #endif
1961
+ case CHIP_TOPAZ:
1962
+ case CHIP_TONGA:
1963
+ case CHIP_FIJI:
1964
+ case CHIP_POLARIS10:
1965
+ case CHIP_POLARIS11:
1966
+ case CHIP_POLARIS12:
1967
+ case CHIP_VEGAM:
1968
+ case CHIP_CARRIZO:
1969
+ case CHIP_STONEY:
1970
+ if (adev->flags & AMD_IS_APU)
1971
+ adev->family = AMDGPU_FAMILY_CZ;
1972
+ else
1973
+ adev->family = AMDGPU_FAMILY_VI;
1974
+
1975
+ r = vi_set_ip_blocks(adev);
1976
+ if (r)
1977
+ return r;
1978
+ break;
15231979 case CHIP_VEGA10:
15241980 case CHIP_VEGA12:
15251981 case CHIP_VEGA20:
15261982 case CHIP_RAVEN:
1527
- if (adev->asic_type == CHIP_RAVEN)
1983
+ case CHIP_ARCTURUS:
1984
+ case CHIP_RENOIR:
1985
+ if (adev->flags & AMD_IS_APU)
15281986 adev->family = AMDGPU_FAMILY_RV;
15291987 else
15301988 adev->family = AMDGPU_FAMILY_AI;
....@@ -1533,24 +1991,27 @@
15331991 if (r)
15341992 return r;
15351993 break;
1994
+ case CHIP_NAVI10:
1995
+ case CHIP_NAVI14:
1996
+ case CHIP_NAVI12:
1997
+ case CHIP_SIENNA_CICHLID:
1998
+ case CHIP_NAVY_FLOUNDER:
1999
+ adev->family = AMDGPU_FAMILY_NV;
2000
+
2001
+ r = nv_set_ip_blocks(adev);
2002
+ if (r)
2003
+ return r;
2004
+ break;
15362005 default:
15372006 /* FIXME: not supported yet */
15382007 return -EINVAL;
15392008 }
15402009
1541
- r = amdgpu_device_parse_gpu_info_fw(adev);
1542
- if (r)
1543
- return r;
1544
-
15452010 amdgpu_amdkfd_device_probe(adev);
15462011
1547
- if (amdgpu_sriov_vf(adev)) {
1548
- r = amdgpu_virt_request_full_gpu(adev, true);
1549
- if (r)
1550
- return -EAGAIN;
1551
- }
1552
-
1553
- adev->powerplay.pp_feature = amdgpu_pp_feature_mask;
2012
+ adev->pm.pp_feature = amdgpu_pp_feature_mask;
2013
+ if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2014
+ adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
15542015
15552016 for (i = 0; i < adev->num_ip_blocks; i++) {
15562017 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
....@@ -1573,12 +2034,122 @@
15732034 adev->ip_blocks[i].status.valid = true;
15742035 }
15752036 }
2037
+ /* get the vbios after the asic_funcs are set up */
2038
+ if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2039
+ r = amdgpu_device_parse_gpu_info_fw(adev);
2040
+ if (r)
2041
+ return r;
2042
+
2043
+ /* Read BIOS */
2044
+ if (!amdgpu_get_bios(adev))
2045
+ return -EINVAL;
2046
+
2047
+ r = amdgpu_atombios_init(adev);
2048
+ if (r) {
2049
+ dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2050
+ amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2051
+ return r;
2052
+ }
2053
+
2054
+ /*get pf2vf msg info at it's earliest time*/
2055
+ if (amdgpu_sriov_vf(adev))
2056
+ amdgpu_virt_init_data_exchange(adev);
2057
+
2058
+ }
15762059 }
15772060
15782061 adev->cg_flags &= amdgpu_cg_mask;
15792062 adev->pg_flags &= amdgpu_pg_mask;
15802063
15812064 return 0;
2065
+}
2066
+
2067
+static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2068
+{
2069
+ int i, r;
2070
+
2071
+ for (i = 0; i < adev->num_ip_blocks; i++) {
2072
+ if (!adev->ip_blocks[i].status.sw)
2073
+ continue;
2074
+ if (adev->ip_blocks[i].status.hw)
2075
+ continue;
2076
+ if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2077
+ (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2078
+ adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2079
+ r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2080
+ if (r) {
2081
+ DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2082
+ adev->ip_blocks[i].version->funcs->name, r);
2083
+ return r;
2084
+ }
2085
+ adev->ip_blocks[i].status.hw = true;
2086
+ }
2087
+ }
2088
+
2089
+ return 0;
2090
+}
2091
+
2092
+static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2093
+{
2094
+ int i, r;
2095
+
2096
+ for (i = 0; i < adev->num_ip_blocks; i++) {
2097
+ if (!adev->ip_blocks[i].status.sw)
2098
+ continue;
2099
+ if (adev->ip_blocks[i].status.hw)
2100
+ continue;
2101
+ r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2102
+ if (r) {
2103
+ DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2104
+ adev->ip_blocks[i].version->funcs->name, r);
2105
+ return r;
2106
+ }
2107
+ adev->ip_blocks[i].status.hw = true;
2108
+ }
2109
+
2110
+ return 0;
2111
+}
2112
+
2113
+static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2114
+{
2115
+ int r = 0;
2116
+ int i;
2117
+ uint32_t smu_version;
2118
+
2119
+ if (adev->asic_type >= CHIP_VEGA10) {
2120
+ for (i = 0; i < adev->num_ip_blocks; i++) {
2121
+ if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2122
+ continue;
2123
+
2124
+ /* no need to do the fw loading again if already done*/
2125
+ if (adev->ip_blocks[i].status.hw == true)
2126
+ break;
2127
+
2128
+ if (amdgpu_in_reset(adev) || adev->in_suspend) {
2129
+ r = adev->ip_blocks[i].version->funcs->resume(adev);
2130
+ if (r) {
2131
+ DRM_ERROR("resume of IP block <%s> failed %d\n",
2132
+ adev->ip_blocks[i].version->funcs->name, r);
2133
+ return r;
2134
+ }
2135
+ } else {
2136
+ r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2137
+ if (r) {
2138
+ DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2139
+ adev->ip_blocks[i].version->funcs->name, r);
2140
+ return r;
2141
+ }
2142
+ }
2143
+
2144
+ adev->ip_blocks[i].status.hw = true;
2145
+ break;
2146
+ }
2147
+ }
2148
+
2149
+ if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2150
+ r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2151
+
2152
+ return r;
15822153 }
15832154
15842155 /**
....@@ -1596,6 +2167,10 @@
15962167 {
15972168 int i, r;
15982169
2170
+ r = amdgpu_ras_init(adev);
2171
+ if (r)
2172
+ return r;
2173
+
15992174 for (i = 0; i < adev->num_ip_blocks; i++) {
16002175 if (!adev->ip_blocks[i].status.valid)
16012176 continue;
....@@ -1603,62 +2178,102 @@
16032178 if (r) {
16042179 DRM_ERROR("sw_init of IP block <%s> failed %d\n",
16052180 adev->ip_blocks[i].version->funcs->name, r);
1606
- return r;
2181
+ goto init_failed;
16072182 }
16082183 adev->ip_blocks[i].status.sw = true;
16092184
16102185 /* need to do gmc hw init early so we can allocate gpu mem */
16112186 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2187
+ /* Try to reserve bad pages early */
2188
+ if (amdgpu_sriov_vf(adev))
2189
+ amdgpu_virt_exchange_data(adev);
2190
+
16122191 r = amdgpu_device_vram_scratch_init(adev);
16132192 if (r) {
16142193 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
1615
- return r;
2194
+ goto init_failed;
16162195 }
16172196 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
16182197 if (r) {
16192198 DRM_ERROR("hw_init %d failed %d\n", i, r);
1620
- return r;
2199
+ goto init_failed;
16212200 }
16222201 r = amdgpu_device_wb_init(adev);
16232202 if (r) {
16242203 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
1625
- return r;
2204
+ goto init_failed;
16262205 }
16272206 adev->ip_blocks[i].status.hw = true;
16282207
16292208 /* right after GMC hw init, we create CSA */
1630
- if (amdgpu_sriov_vf(adev)) {
1631
- r = amdgpu_allocate_static_csa(adev);
2209
+ if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
2210
+ r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2211
+ AMDGPU_GEM_DOMAIN_VRAM,
2212
+ AMDGPU_CSA_SIZE);
16322213 if (r) {
16332214 DRM_ERROR("allocate CSA failed %d\n", r);
1634
- return r;
2215
+ goto init_failed;
16352216 }
16362217 }
16372218 }
16382219 }
16392220
1640
- for (i = 0; i < adev->num_ip_blocks; i++) {
1641
- if (!adev->ip_blocks[i].status.sw)
1642
- continue;
1643
- if (adev->ip_blocks[i].status.hw)
1644
- continue;
1645
- r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
1646
- if (r) {
1647
- DRM_ERROR("hw_init of IP block <%s> failed %d\n",
1648
- adev->ip_blocks[i].version->funcs->name, r);
1649
- return r;
1650
- }
1651
- adev->ip_blocks[i].status.hw = true;
2221
+ if (amdgpu_sriov_vf(adev))
2222
+ amdgpu_virt_init_data_exchange(adev);
2223
+
2224
+ r = amdgpu_ib_pool_init(adev);
2225
+ if (r) {
2226
+ dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2227
+ amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2228
+ goto init_failed;
16522229 }
16532230
2231
+ r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2232
+ if (r)
2233
+ goto init_failed;
2234
+
2235
+ r = amdgpu_device_ip_hw_init_phase1(adev);
2236
+ if (r)
2237
+ goto init_failed;
2238
+
2239
+ r = amdgpu_device_fw_loading(adev);
2240
+ if (r)
2241
+ goto init_failed;
2242
+
2243
+ r = amdgpu_device_ip_hw_init_phase2(adev);
2244
+ if (r)
2245
+ goto init_failed;
2246
+
2247
+ /*
2248
+ * retired pages will be loaded from eeprom and reserved here,
2249
+ * it should be called after amdgpu_device_ip_hw_init_phase2 since
2250
+ * for some ASICs the RAS EEPROM code relies on SMU fully functioning
2251
+ * for I2C communication, which is only true at this point.
2252
+ *
2253
+ * amdgpu_ras_recovery_init may fail, but the upper layers only care about
+ * failures caused by a bad GPU state and stop the amdgpu init process
+ * accordingly. For other failures it still releases all the resources and
+ * prints an error message, rather than returning a negative value to the
+ * upper level.
2258
+ *
2259
+ * Note: theoretically, this should be called before all vram allocations
2260
+ * to protect retired page from abusing
2261
+ */
2262
+ r = amdgpu_ras_recovery_init(adev);
2263
+ if (r)
2264
+ goto init_failed;
2265
+
2266
+ if (adev->gmc.xgmi.num_physical_nodes > 1)
2267
+ amdgpu_xgmi_add_device(adev);
16542268 amdgpu_amdkfd_device_init(adev);
16552269
1656
- if (amdgpu_sriov_vf(adev)) {
1657
- amdgpu_virt_init_data_exchange(adev);
1658
- amdgpu_virt_release_full_gpu(adev, true);
1659
- }
2270
+ amdgpu_fru_get_product_info(adev);
16602271
1661
- return 0;
2272
+init_failed:
2273
+ if (amdgpu_sriov_vf(adev))
2274
+ amdgpu_virt_release_full_gpu(adev, true);
2275
+
2276
+ return r;
16622277 }
16632278
16642279 /**
....@@ -1687,39 +2302,60 @@
16872302 */
16882303 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
16892304 {
1690
- return !!memcmp(adev->gart.ptr, adev->reset_magic,
1691
- AMDGPU_RESET_MAGIC_NUM);
2305
+ if (memcmp(adev->gart.ptr, adev->reset_magic,
2306
+ AMDGPU_RESET_MAGIC_NUM))
2307
+ return true;
2308
+
2309
+ if (!amdgpu_in_reset(adev))
2310
+ return false;
2311
+
2312
+ /*
2313
+ * For all ASICs with baco/mode1 reset, the VRAM is
2314
+ * always assumed to be lost.
2315
+ */
2316
+ switch (amdgpu_asic_reset_method(adev)) {
2317
+ case AMD_RESET_METHOD_BACO:
2318
+ case AMD_RESET_METHOD_MODE1:
2319
+ return true;
2320
+ default:
2321
+ return false;
2322
+ }
16922323 }
16932324
16942325 /**
1695
- * amdgpu_device_ip_late_set_cg_state - late init for clockgating
2326
+ * amdgpu_device_set_cg_state - set clockgating for amdgpu device
16962327 *
16972328 * @adev: amdgpu_device pointer
2329
+ * @state: clockgating state (gate or ungate)
16982330 *
1699
- * Late initialization pass enabling clockgating for hardware IPs.
17002331 * The list of all the hardware IPs that make up the asic is walked and the
1701
- * set_clockgating_state callbacks are run. This stage is run late
1702
- * in the init process.
2332
+ * set_clockgating_state callbacks are run.
2333
+ * Late initialization pass enabling clockgating for hardware IPs.
2334
+ * Fini or suspend, pass disabling clockgating for hardware IPs.
17032335 * Returns 0 on success, negative error code on failure.
17042336 */
1705
-static int amdgpu_device_ip_late_set_cg_state(struct amdgpu_device *adev)
2337
+
2338
+static int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2339
+ enum amd_clockgating_state state)
17062340 {
1707
- int i = 0, r;
2341
+ int i, j, r;
17082342
17092343 if (amdgpu_emu_mode == 1)
17102344 return 0;
17112345
1712
- for (i = 0; i < adev->num_ip_blocks; i++) {
1713
- if (!adev->ip_blocks[i].status.valid)
2346
+ for (j = 0; j < adev->num_ip_blocks; j++) {
2347
+ i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2348
+ if (!adev->ip_blocks[i].status.late_initialized)
17142349 continue;
17152350 /* skip CG for VCE/UVD, it's handled specially */
17162351 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
17172352 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
17182353 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2354
+ adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
17192355 adev->ip_blocks[i].version->funcs->set_clockgating_state) {
17202356 /* enable clockgating to save power */
17212357 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
1722
- AMD_CG_STATE_GATE);
2358
+ state);
17232359 if (r) {
17242360 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
17252361 adev->ip_blocks[i].version->funcs->name, r);
....@@ -1731,24 +2367,26 @@
17312367 return 0;
17322368 }
17332369
1734
-static int amdgpu_device_ip_late_set_pg_state(struct amdgpu_device *adev)
2370
+static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state)
17352371 {
1736
- int i = 0, r;
2372
+ int i, j, r;
17372373
17382374 if (amdgpu_emu_mode == 1)
17392375 return 0;
17402376
1741
- for (i = 0; i < adev->num_ip_blocks; i++) {
1742
- if (!adev->ip_blocks[i].status.valid)
2377
+ for (j = 0; j < adev->num_ip_blocks; j++) {
2378
+ i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2379
+ if (!adev->ip_blocks[i].status.late_initialized)
17432380 continue;
17442381 /* skip CG for VCE/UVD, it's handled specially */
17452382 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
17462383 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
17472384 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2385
+ adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
17482386 adev->ip_blocks[i].version->funcs->set_powergating_state) {
17492387 /* enable powergating to save power */
17502388 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
1751
- AMD_PG_STATE_GATE);
2389
+ state);
17522390 if (r) {
17532391 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
17542392 adev->ip_blocks[i].version->funcs->name, r);
....@@ -1757,6 +2395,41 @@
17572395 }
17582396 }
17592397 return 0;
2398
+}
2399
+
2400
+static int amdgpu_device_enable_mgpu_fan_boost(void)
2401
+{
2402
+ struct amdgpu_gpu_instance *gpu_ins;
2403
+ struct amdgpu_device *adev;
2404
+ int i, ret = 0;
2405
+
2406
+ mutex_lock(&mgpu_info.mutex);
2407
+
2408
+ /*
2409
+ * MGPU fan boost feature should be enabled
2410
+ * only when there are two or more dGPUs in
2411
+ * the system
2412
+ */
2413
+ if (mgpu_info.num_dgpu < 2)
2414
+ goto out;
2415
+
2416
+ for (i = 0; i < mgpu_info.num_dgpu; i++) {
2417
+ gpu_ins = &(mgpu_info.gpu_ins[i]);
2418
+ adev = gpu_ins->adev;
2419
+ if (!(adev->flags & AMD_IS_APU) &&
2420
+ !gpu_ins->mgpu_fan_enabled) {
2421
+ ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2422
+ if (ret)
2423
+ break;
2424
+
2425
+ gpu_ins->mgpu_fan_enabled = 1;
2426
+ }
2427
+ }
2428
+
2429
+out:
2430
+ mutex_unlock(&mgpu_info.mutex);
2431
+
2432
+ return ret;
17602433 }
17612434
17622435 /**
....@@ -1773,10 +2446,11 @@
17732446 */
17742447 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
17752448 {
2449
+ struct amdgpu_gpu_instance *gpu_instance;
17762450 int i = 0, r;
17772451
17782452 for (i = 0; i < adev->num_ip_blocks; i++) {
1779
- if (!adev->ip_blocks[i].status.valid)
2453
+ if (!adev->ip_blocks[i].status.hw)
17802454 continue;
17812455 if (adev->ip_blocks[i].version->funcs->late_init) {
17822456 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
....@@ -1785,17 +2459,55 @@
17852459 adev->ip_blocks[i].version->funcs->name, r);
17862460 return r;
17872461 }
1788
- adev->ip_blocks[i].status.late_initialized = true;
17892462 }
2463
+ adev->ip_blocks[i].status.late_initialized = true;
17902464 }
17912465
1792
- amdgpu_device_ip_late_set_cg_state(adev);
1793
- amdgpu_device_ip_late_set_pg_state(adev);
2466
+ amdgpu_ras_set_error_query_ready(adev, true);
17942467
1795
- queue_delayed_work(system_wq, &adev->late_init_work,
1796
- msecs_to_jiffies(AMDGPU_RESUME_MS));
2468
+ amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2469
+ amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
17972470
17982471 amdgpu_device_fill_reset_magic(adev);
2472
+
2473
+ r = amdgpu_device_enable_mgpu_fan_boost();
2474
+ if (r)
2475
+ DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2476
+
2477
+
2478
+ if (adev->gmc.xgmi.num_physical_nodes > 1) {
2479
+ mutex_lock(&mgpu_info.mutex);
2480
+
2481
+ /*
2482
+ * Reset device p-state to low as this was booted with high.
2483
+ *
2484
+ * This should be performed only after all devices from the same
2485
+ * hive get initialized.
2486
+ *
2487
+ * However, it's unknown how many device in the hive in advance.
2488
+ * As this is counted one by one during devices initializations.
2489
+ *
2490
+ * So, we wait for all XGMI interlinked devices initialized.
2491
+ * This may bring some delays as those devices may come from
2492
+ * different hives. But that should be OK.
2493
+ */
2494
+ if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2495
+ for (i = 0; i < mgpu_info.num_gpu; i++) {
2496
+ gpu_instance = &(mgpu_info.gpu_ins[i]);
2497
+ if (gpu_instance->adev->flags & AMD_IS_APU)
2498
+ continue;
2499
+
2500
+ r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2501
+ AMDGPU_XGMI_PSTATE_MIN);
2502
+ if (r) {
2503
+ DRM_ERROR("pstate setting failed (%d).\n", r);
2504
+ break;
2505
+ }
2506
+ }
2507
+ }
2508
+
2509
+ mutex_unlock(&mgpu_info.mutex);
2510
+ }
17992511
18002512 return 0;
18012513 }
....@@ -1815,23 +2527,24 @@
18152527 {
18162528 int i, r;
18172529
2530
+ if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2531
+ amdgpu_virt_release_ras_err_handler_data(adev);
2532
+
2533
+ amdgpu_ras_pre_fini(adev);
2534
+
2535
+ if (adev->gmc.xgmi.num_physical_nodes > 1)
2536
+ amdgpu_xgmi_remove_device(adev);
2537
+
2538
+ amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2539
+ amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2540
+
18182541 amdgpu_amdkfd_device_fini(adev);
2542
+
18192543 /* need to disable SMC first */
18202544 for (i = 0; i < adev->num_ip_blocks; i++) {
18212545 if (!adev->ip_blocks[i].status.hw)
18222546 continue;
1823
- if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC &&
1824
- adev->ip_blocks[i].version->funcs->set_clockgating_state) {
1825
- /* ungate blocks before hw fini so that we can shutdown the blocks safely */
1826
- r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
1827
- AMD_CG_STATE_UNGATE);
1828
- if (r) {
1829
- DRM_ERROR("set_clockgating_state(ungate) of IP block <%s> failed %d\n",
1830
- adev->ip_blocks[i].version->funcs->name, r);
1831
- return r;
1832
- }
1833
- if (adev->powerplay.pp_funcs->set_powergating_by_smu)
1834
- amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, false);
2547
+ if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
18352548 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
18362549 /* XXX handle errors */
18372550 if (r) {
....@@ -1846,20 +2559,6 @@
18462559 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
18472560 if (!adev->ip_blocks[i].status.hw)
18482561 continue;
1849
-
1850
- if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
1851
- adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
1852
- adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
1853
- adev->ip_blocks[i].version->funcs->set_clockgating_state) {
1854
- /* ungate blocks before hw fini so that we can shutdown the blocks safely */
1855
- r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
1856
- AMD_CG_STATE_UNGATE);
1857
- if (r) {
1858
- DRM_ERROR("set_clockgating_state(ungate) of IP block <%s> failed %d\n",
1859
- adev->ip_blocks[i].version->funcs->name, r);
1860
- return r;
1861
- }
1862
- }
18632562
18642563 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
18652564 /* XXX handle errors */
....@@ -1877,9 +2576,11 @@
18772576 continue;
18782577
18792578 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
1880
- amdgpu_free_static_csa(adev);
2579
+ amdgpu_ucode_free_bo(adev);
2580
+ amdgpu_free_static_csa(&adev->virt.csa_obj);
18812581 amdgpu_device_wb_fini(adev);
18822582 amdgpu_device_vram_scratch_fini(adev);
2583
+ amdgpu_ib_pool_fini(adev);
18832584 }
18842585
18852586 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
....@@ -1900,6 +2601,8 @@
19002601 adev->ip_blocks[i].status.late_initialized = false;
19012602 }
19022603
2604
+ amdgpu_ras_fini(adev);
2605
+
19032606 if (amdgpu_sriov_vf(adev))
19042607 if (amdgpu_virt_release_full_gpu(adev, false))
19052608 DRM_ERROR("failed to release exclusive mode on fini\n");
....@@ -1908,23 +2611,31 @@
19082611 }
19092612
19102613 /**
1911
- * amdgpu_device_ip_late_init_func_handler - work handler for clockgating
2614
+ * amdgpu_device_delayed_init_work_handler - work handler for IB tests
19122615 *
1913
- * @work: work_struct
1914
- *
1915
- * Work handler for amdgpu_device_ip_late_set_cg_state. We put the
1916
- * clockgating setup into a worker thread to speed up driver init and
1917
- * resume from suspend.
2616
+ * @work: work_struct.
19182617 */
1919
-static void amdgpu_device_ip_late_init_func_handler(struct work_struct *work)
2618
+static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
19202619 {
19212620 struct amdgpu_device *adev =
1922
- container_of(work, struct amdgpu_device, late_init_work.work);
2621
+ container_of(work, struct amdgpu_device, delayed_init_work.work);
19232622 int r;
19242623
19252624 r = amdgpu_ib_ring_tests(adev);
19262625 if (r)
19272626 DRM_ERROR("ib ring test failed (%d).\n", r);
2627
+}
2628
+
2629
+static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2630
+{
2631
+ struct amdgpu_device *adev =
2632
+ container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2633
+
2634
+ WARN_ON_ONCE(adev->gfx.gfx_off_state);
2635
+ WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
2636
+
2637
+ if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2638
+ adev->gfx.gfx_off_state = true;
19282639 }
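/*
 * Note on the GFXOFF hand-off (a sketch; most of the flow lives outside this
 * function, typically in amdgpu_gfx_off_ctrl() in amdgpu_gfx.c):
 * gfx_off_req_count starts at 1 (see the initialization in amdgpu_device_init
 * below), so GFXOFF stays disabled until every "keep GFX on" request has been
 * dropped. This delayed work is only expected to run once that count has
 * reached zero, which is exactly what the two WARN_ON_ONCE checks encode, and
 * gfx_off_state records whether the SMU actually accepted the powergating
 * request.
 */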
19292640
19302641 /**
....@@ -1942,35 +2653,28 @@
19422653 {
19432654 int i, r;
19442655
1945
- if (amdgpu_sriov_vf(adev))
1946
- amdgpu_virt_request_full_gpu(adev, false);
2656
+ amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2657
+ amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
19472658
19482659 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
19492660 if (!adev->ip_blocks[i].status.valid)
19502661 continue;
1951
- /* displays are handled separately */
1952
- if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) {
1953
- /* ungate blocks so that suspend can properly shut them down */
1954
- if (adev->ip_blocks[i].version->funcs->set_clockgating_state) {
1955
- r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
1956
- AMD_CG_STATE_UNGATE);
1957
- if (r) {
1958
- DRM_ERROR("set_clockgating_state(ungate) of IP block <%s> failed %d\n",
1959
- adev->ip_blocks[i].version->funcs->name, r);
1960
- }
1961
- }
1962
- /* XXX handle errors */
1963
- r = adev->ip_blocks[i].version->funcs->suspend(adev);
1964
- /* XXX handle errors */
1965
- if (r) {
1966
- DRM_ERROR("suspend of IP block <%s> failed %d\n",
1967
- adev->ip_blocks[i].version->funcs->name, r);
1968
- }
1969
- }
1970
- }
19712662
1972
- if (amdgpu_sriov_vf(adev))
1973
- amdgpu_virt_release_full_gpu(adev, false);
2663
+ /* displays are handled separately */
2664
+ if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2665
+ continue;
2666
+
2667
+ /* XXX handle errors */
2668
+ r = adev->ip_blocks[i].version->funcs->suspend(adev);
2669
+ /* XXX handle errors */
2670
+ if (r) {
2671
+ DRM_ERROR("suspend of IP block <%s> failed %d\n",
2672
+ adev->ip_blocks[i].version->funcs->name, r);
2673
+ return r;
2674
+ }
2675
+
2676
+ adev->ip_blocks[i].status.hw = false;
2677
+ }
19742678
19752679 return 0;
19762680 }
....@@ -1990,35 +2694,17 @@
19902694 {
19912695 int i, r;
19922696
1993
- if (amdgpu_sriov_vf(adev))
1994
- amdgpu_virt_request_full_gpu(adev, false);
1995
-
1996
- /* ungate SMC block first */
1997
- r = amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_SMC,
1998
- AMD_CG_STATE_UNGATE);
1999
- if (r) {
2000
- DRM_ERROR("set_clockgating_state(ungate) SMC failed %d\n", r);
2001
- }
2002
-
2003
- /* call smu to disable gfx off feature first when suspend */
2004
- if (adev->powerplay.pp_funcs->set_powergating_by_smu)
2005
- amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, false);
2006
-
20072697 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
20082698 if (!adev->ip_blocks[i].status.valid)
20092699 continue;
20102700 /* displays are handled in phase1 */
20112701 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
20122702 continue;
2013
- /* ungate blocks so that suspend can properly shut them down */
2014
- if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_SMC &&
2015
- adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2016
- r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2017
- AMD_CG_STATE_UNGATE);
2018
- if (r) {
2019
- DRM_ERROR("set_clockgating_state(ungate) of IP block <%s> failed %d\n",
2020
- adev->ip_blocks[i].version->funcs->name, r);
2021
- }
2703
+ /* PSP lost connection when err_event_athub occurs */
2704
+ if (amdgpu_ras_intr_triggered() &&
2705
+ adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2706
+ adev->ip_blocks[i].status.hw = false;
2707
+ continue;
20222708 }
20232709 /* XXX handle errors */
20242710 r = adev->ip_blocks[i].version->funcs->suspend(adev);
....@@ -2027,10 +2713,20 @@
20272713 DRM_ERROR("suspend of IP block <%s> failed %d\n",
20282714 adev->ip_blocks[i].version->funcs->name, r);
20292715 }
2716
+ adev->ip_blocks[i].status.hw = false;
2717
+ /* handle putting the SMC in the appropriate state */
2718
+ if (!amdgpu_sriov_vf(adev)) {
2719
+ if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2720
+ r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
2721
+ if (r) {
2722
+ DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
2723
+ adev->mp1_state, r);
2724
+ return r;
2725
+ }
2726
+ }
2727
+ }
2728
+ adev->ip_blocks[i].status.hw = false;
20302729 }
2031
-
2032
- if (amdgpu_sriov_vf(adev))
2033
- amdgpu_virt_release_full_gpu(adev, false);
20342730
20352731 return 0;
20362732 }
....@@ -2050,10 +2746,16 @@
20502746 {
20512747 int r;
20522748
2749
+ if (amdgpu_sriov_vf(adev))
2750
+ amdgpu_virt_request_full_gpu(adev, false);
2751
+
20532752 r = amdgpu_device_ip_suspend_phase1(adev);
20542753 if (r)
20552754 return r;
20562755 r = amdgpu_device_ip_suspend_phase2(adev);
2756
+
2757
+ if (amdgpu_sriov_vf(adev))
2758
+ amdgpu_virt_release_full_gpu(adev, false);
20572759
20582760 return r;
20592761 }
....@@ -2073,17 +2775,20 @@
20732775 int j;
20742776 struct amdgpu_ip_block *block;
20752777
2076
- for (j = 0; j < adev->num_ip_blocks; j++) {
2077
- block = &adev->ip_blocks[j];
2778
+ block = &adev->ip_blocks[i];
2779
+ block->status.hw = false;
20782780
2079
- if (block->version->type != ip_order[i] ||
2781
+ for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
2782
+
2783
+ if (block->version->type != ip_order[j] ||
20802784 !block->status.valid)
20812785 continue;
20822786
20832787 r = block->version->funcs->hw_init(adev);
2084
- DRM_INFO("RE-INIT: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2788
+ DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
20852789 if (r)
20862790 return r;
2791
+ block->status.hw = true;
20872792 }
20882793 }
20892794
....@@ -2100,7 +2805,8 @@
21002805 AMD_IP_BLOCK_TYPE_GFX,
21012806 AMD_IP_BLOCK_TYPE_SDMA,
21022807 AMD_IP_BLOCK_TYPE_UVD,
2103
- AMD_IP_BLOCK_TYPE_VCE
2808
+ AMD_IP_BLOCK_TYPE_VCE,
2809
+ AMD_IP_BLOCK_TYPE_VCN
21042810 };
21052811
21062812 for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
....@@ -2111,13 +2817,19 @@
21112817 block = &adev->ip_blocks[j];
21122818
21132819 if (block->version->type != ip_order[i] ||
2114
- !block->status.valid)
2820
+ !block->status.valid ||
2821
+ block->status.hw)
21152822 continue;
21162823
2117
- r = block->version->funcs->hw_init(adev);
2118
- DRM_INFO("RE-INIT: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2824
+ if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
2825
+ r = block->version->funcs->resume(adev);
2826
+ else
2827
+ r = block->version->funcs->hw_init(adev);
2828
+
2829
+ DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
21192830 if (r)
21202831 return r;
2832
+ block->status.hw = true;
21212833 }
21222834 }
21232835
....@@ -2141,17 +2853,19 @@
21412853 int i, r;
21422854
21432855 for (i = 0; i < adev->num_ip_blocks; i++) {
2144
- if (!adev->ip_blocks[i].status.valid)
2856
+ if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
21452857 continue;
21462858 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
21472859 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
21482860 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2861
+
21492862 r = adev->ip_blocks[i].version->funcs->resume(adev);
21502863 if (r) {
21512864 DRM_ERROR("resume of IP block <%s> failed %d\n",
21522865 adev->ip_blocks[i].version->funcs->name, r);
21532866 return r;
21542867 }
2868
+ adev->ip_blocks[i].status.hw = true;
21552869 }
21562870 }
21572871
....@@ -2176,11 +2890,12 @@
21762890 int i, r;
21772891
21782892 for (i = 0; i < adev->num_ip_blocks; i++) {
2179
- if (!adev->ip_blocks[i].status.valid)
2893
+ if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
21802894 continue;
21812895 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
21822896 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2183
- adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)
2897
+ adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
2898
+ adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
21842899 continue;
21852900 r = adev->ip_blocks[i].version->funcs->resume(adev);
21862901 if (r) {
....@@ -2188,6 +2903,7 @@
21882903 adev->ip_blocks[i].version->funcs->name, r);
21892904 return r;
21902905 }
2906
+ adev->ip_blocks[i].status.hw = true;
21912907 }
21922908
21932909 return 0;
....@@ -2209,9 +2925,18 @@
22092925 {
22102926 int r;
22112927
2928
+ r = amdgpu_amdkfd_resume_iommu(adev);
2929
+ if (r)
2930
+ return r;
2931
+
22122932 r = amdgpu_device_ip_resume_phase1(adev);
22132933 if (r)
22142934 return r;
2935
+
2936
+ r = amdgpu_device_fw_loading(adev);
2937
+ if (r)
2938
+ return r;
2939
+
22152940 r = amdgpu_device_ip_resume_phase2(adev);
22162941
22172942 return r;
....@@ -2252,6 +2977,12 @@
22522977 {
22532978 switch (asic_type) {
22542979 #if defined(CONFIG_DRM_AMD_DC)
2980
+#if defined(CONFIG_DRM_AMD_DC_SI)
2981
+ case CHIP_TAHITI:
2982
+ case CHIP_PITCAIRN:
2983
+ case CHIP_VERDE:
2984
+ case CHIP_OLAND:
2985
+#endif
22552986 case CHIP_BONAIRE:
22562987 case CHIP_KAVERI:
22572988 case CHIP_KABINI:
....@@ -2276,12 +3007,23 @@
22763007 case CHIP_VEGA10:
22773008 case CHIP_VEGA12:
22783009 case CHIP_VEGA20:
2279
-#if defined(CONFIG_DRM_AMD_DC_DCN1_0)
3010
+#if defined(CONFIG_DRM_AMD_DC_DCN)
22803011 case CHIP_RAVEN:
3012
+ case CHIP_NAVI10:
3013
+ case CHIP_NAVI14:
3014
+ case CHIP_NAVI12:
3015
+ case CHIP_RENOIR:
3016
+#endif
3017
+#if defined(CONFIG_DRM_AMD_DC_DCN3_0)
3018
+ case CHIP_SIENNA_CICHLID:
3019
+ case CHIP_NAVY_FLOUNDER:
22813020 #endif
22823021 return amdgpu_dc != 0;
22833022 #endif
22843023 default:
3024
+ if (amdgpu_dc > 0)
3025
+ DRM_INFO_ONCE("Display Core has been requested via kernel parameter "
3026
+ "but isn't supported by ASIC, ignoring\n");
22853027 return false;
22863028 }
22873029 }
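/*
 * In terms of the amdgpu.dc module parameter (default -1, "auto", in
 * amdgpu_drv.c): on the ASICs listed above, "amdgpu_dc != 0" means DC is used
 * unless dc=0 explicitly forces the legacy display path; on any other ASIC
 * the function returns false, and dc=1 only earns the one-time "requested but
 * not supported" message.
 */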
....@@ -2289,24 +3031,145 @@
22893031 /**
22903032 * amdgpu_device_has_dc_support - check if dc is supported
22913033 *
2292
- * @adev: amdgpu_device_pointer
3034
+ * @adev: amdgpu_device pointer
22933035 *
22943036 * Returns true for supported, false for not supported
22953037 */
22963038 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
22973039 {
2298
- if (amdgpu_sriov_vf(adev))
3040
+ if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display)
22993041 return false;
23003042
23013043 return amdgpu_device_asic_has_dc_support(adev->asic_type);
23023044 }
23033045
3046
+
3047
+static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3048
+{
3049
+ struct amdgpu_device *adev =
3050
+ container_of(__work, struct amdgpu_device, xgmi_reset_work);
3051
+ struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3052
+
3053
+ /* It's a bug to not have a hive within this function */
3054
+ if (WARN_ON(!hive))
3055
+ return;
3056
+
3057
+ /*
3058
+ * Use task barrier to synchronize all xgmi reset works across the
3059
+ * hive. task_barrier_enter and task_barrier_exit will block
3060
+ * until all the threads running the xgmi reset works reach
3061
+ * those points. task_barrier_full will do both blocks.
3062
+ */
3063
+ if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3064
+
3065
+ task_barrier_enter(&hive->tb);
3066
+ adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3067
+
3068
+ if (adev->asic_reset_res)
3069
+ goto fail;
3070
+
3071
+ task_barrier_exit(&hive->tb);
3072
+ adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3073
+
3074
+ if (adev->asic_reset_res)
3075
+ goto fail;
3076
+
3077
+ if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count)
3078
+ adev->mmhub.funcs->reset_ras_error_count(adev);
3079
+ } else {
3080
+
3081
+ task_barrier_full(&hive->tb);
3082
+ adev->asic_reset_res = amdgpu_asic_reset(adev);
3083
+ }
3084
+
3085
+fail:
3086
+ if (adev->asic_reset_res)
3087
+ DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
3088
+ adev->asic_reset_res, adev_to_drm(adev)->unique);
3089
+ amdgpu_put_xgmi_hive(hive);
3090
+}
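/*
 * Synchronization sketch for the BACO path above: with an N-device hive,
 * every device queues its own xgmi_reset_work, and each worker blocks in
 * task_barrier_enter() until all N workers have arrived, so no node asserts
 * BACO entry before the whole hive is ready; task_barrier_exit() provides the
 * same rendezvous before BACO exit. The non-BACO branch simply synchronizes
 * all workers once with task_barrier_full() and then lets each device perform
 * amdgpu_asic_reset() on its own.
 */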
3091
+
3092
+static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3093
+{
3094
+ char *input = amdgpu_lockup_timeout;
3095
+ char *timeout_setting = NULL;
3096
+ int index = 0;
3097
+ long timeout;
3098
+ int ret = 0;
3099
+
3100
+ /*
3101
+ * By default the timeout for non-compute jobs is 10000 ms,
3102
+ * and there is no timeout enforced on compute jobs.
3103
+ * In SR-IOV or passthrough mode, the timeout for compute
3104
+ * jobs is 60000 ms by default.
3105
+ */
3106
+ adev->gfx_timeout = msecs_to_jiffies(10000);
3107
+ adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3108
+ if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3109
+ adev->compute_timeout = msecs_to_jiffies(60000);
3110
+ else
3111
+ adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
3112
+
3113
+ if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3114
+ while ((timeout_setting = strsep(&input, ",")) &&
3115
+ strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3116
+ ret = kstrtol(timeout_setting, 0, &timeout);
3117
+ if (ret)
3118
+ return ret;
3119
+
3120
+ if (timeout == 0) {
3121
+ index++;
3122
+ continue;
3123
+ } else if (timeout < 0) {
3124
+ timeout = MAX_SCHEDULE_TIMEOUT;
3125
+ } else {
3126
+ timeout = msecs_to_jiffies(timeout);
3127
+ }
3128
+
3129
+ switch (index++) {
3130
+ case 0:
3131
+ adev->gfx_timeout = timeout;
3132
+ break;
3133
+ case 1:
3134
+ adev->compute_timeout = timeout;
3135
+ break;
3136
+ case 2:
3137
+ adev->sdma_timeout = timeout;
3138
+ break;
3139
+ case 3:
3140
+ adev->video_timeout = timeout;
3141
+ break;
3142
+ default:
3143
+ break;
3144
+ }
3145
+ }
3146
+ /*
3147
+ * There is only one value specified and
3148
+ * it should apply to all non-compute jobs.
3149
+ */
3150
+ if (index == 1) {
3151
+ adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3152
+ if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3153
+ adev->compute_timeout = adev->gfx_timeout;
3154
+ }
3155
+ }
3156
+
3157
+ return ret;
3158
+}
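/*
 * Usage examples for the amdgpu.lockup_timeout parsing above (values in ms,
 * position order: gfx, compute, sdma, video):
 *
 *   amdgpu.lockup_timeout=10000,60000,10000,10000
 *       gfx/sdma/video jobs time out after 10 s, compute after 60 s.
 *   amdgpu.lockup_timeout=5000
 *       a single value applies to all non-compute queues (and also to
 *       compute when running SR-IOV or passthrough).
 *
 * A value of 0 keeps the built-in default for that position, and a negative
 * value means no timeout (MAX_SCHEDULE_TIMEOUT).
 */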
3159
+
3160
+static const struct attribute *amdgpu_dev_attributes[] = {
3161
+ &dev_attr_product_name.attr,
3162
+ &dev_attr_product_number.attr,
3163
+ &dev_attr_serial_number.attr,
3164
+ &dev_attr_pcie_replay_count.attr,
3165
+ NULL
3166
+};
3167
+
3168
+
23043169 /**
23053170 * amdgpu_device_init - initialize the driver
23063171 *
23073172 * @adev: amdgpu_device pointer
2308
- * @ddev: drm dev pointer
2309
- * @pdev: pci dev pointer
23103173 * @flags: driver flags
23113174 *
23123175 * Initializes the driver info and hw (all asics).
....@@ -2314,30 +3177,32 @@
23143177 * Called at driver startup.
23153178 */
23163179 int amdgpu_device_init(struct amdgpu_device *adev,
2317
- struct drm_device *ddev,
2318
- struct pci_dev *pdev,
23193180 uint32_t flags)
23203181 {
3182
+ struct drm_device *ddev = adev_to_drm(adev);
3183
+ struct pci_dev *pdev = adev->pdev;
23213184 int r, i;
2322
- bool runtime = false;
3185
+ bool boco = false;
23233186 u32 max_MBps;
23243187
23253188 adev->shutdown = false;
2326
- adev->dev = &pdev->dev;
2327
- adev->ddev = ddev;
2328
- adev->pdev = pdev;
23293189 adev->flags = flags;
2330
- adev->asic_type = flags & AMD_ASIC_MASK;
3190
+
3191
+ if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3192
+ adev->asic_type = amdgpu_force_asic_type;
3193
+ else
3194
+ adev->asic_type = flags & AMD_ASIC_MASK;
3195
+
23313196 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
23323197 if (amdgpu_emu_mode == 1)
2333
- adev->usec_timeout *= 2;
3198
+ adev->usec_timeout *= 10;
23343199 adev->gmc.gart_size = 512 * 1024 * 1024;
23353200 adev->accel_working = false;
23363201 adev->num_rings = 0;
23373202 adev->mman.buffer_funcs = NULL;
23383203 adev->mman.buffer_funcs_ring = NULL;
23393204 adev->vm_manager.vm_pte_funcs = NULL;
2340
- adev->vm_manager.vm_pte_num_rings = 0;
3205
+ adev->vm_manager.vm_pte_num_scheds = 0;
23413206 adev->gmc.gmc_funcs = NULL;
23423207 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
23433208 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
....@@ -2348,6 +3213,8 @@
23483213 adev->pcie_wreg = &amdgpu_invalid_wreg;
23493214 adev->pciep_rreg = &amdgpu_invalid_rreg;
23503215 adev->pciep_wreg = &amdgpu_invalid_wreg;
3216
+ adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3217
+ adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
23513218 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
23523219 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
23533220 adev->didt_rreg = &amdgpu_invalid_rreg;
....@@ -2369,13 +3236,19 @@
23693236 mutex_init(&adev->gfx.gpu_clock_mutex);
23703237 mutex_init(&adev->srbm_mutex);
23713238 mutex_init(&adev->gfx.pipe_reserve_mutex);
3239
+ mutex_init(&adev->gfx.gfx_off_mutex);
23723240 mutex_init(&adev->grbm_idx_mutex);
23733241 mutex_init(&adev->mn_lock);
23743242 mutex_init(&adev->virt.vf_errors.lock);
23753243 hash_init(adev->mn_hash);
2376
- mutex_init(&adev->lock_reset);
3244
+ atomic_set(&adev->in_gpu_reset, 0);
3245
+ init_rwsem(&adev->reset_sem);
3246
+ mutex_init(&adev->psp.mutex);
3247
+ mutex_init(&adev->notifier_lock);
23773248
2378
- amdgpu_device_check_arguments(adev);
3249
+ r = amdgpu_device_check_arguments(adev);
3250
+ if (r)
3251
+ return r;
23793252
23803253 spin_lock_init(&adev->mmio_idx_lock);
23813254 spin_lock_init(&adev->smc_idx_lock);
....@@ -2390,13 +3263,26 @@
23903263 INIT_LIST_HEAD(&adev->shadow_list);
23913264 mutex_init(&adev->shadow_list_lock);
23923265
2393
- INIT_LIST_HEAD(&adev->ring_lru_list);
2394
- spin_lock_init(&adev->ring_lru_list_lock);
3266
+ INIT_DELAYED_WORK(&adev->delayed_init_work,
3267
+ amdgpu_device_delayed_init_work_handler);
3268
+ INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3269
+ amdgpu_device_delay_enable_gfx_off);
23953270
2396
- INIT_DELAYED_WORK(&adev->late_init_work,
2397
- amdgpu_device_ip_late_init_func_handler);
3271
+ INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
23983272
2399
- adev->pm.ac_power = power_supply_is_system_supplied() > 0 ? true : false;
3273
+ adev->gfx.gfx_off_req_count = 1;
3274
+ adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3275
+
3276
+ atomic_set(&adev->throttling_logging_enabled, 1);
3277
+ /*
3278
+ * If throttling continues, logging will be performed every minute
3279
+ * to avoid log flooding. "-1" is subtracted since the thermal
3280
+ * throttling interrupt comes every second. Thus, the total logging
3281
+ * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3282
+ * for the throttling interrupt) = 60 seconds.
3283
+ */
3284
+ ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3285
+ ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
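/*
 * With an interval of (60 - 1) * HZ and a burst of 1, __ratelimit() lets at
 * most one throttling message through per 59-second window, and
 * RATELIMIT_MSG_ON_RELEASE defers the "callbacks suppressed" summary until
 * the ratelimit state is released. A consumer of this state (the thermal
 * throttling interrupt handler lives in the SMU code, not in this file) is
 * expected to look roughly like:
 *
 *	if (atomic_read(&adev->throttling_logging_enabled) &&
 *	    __ratelimit(&adev->throttling_logging_rs))
 *		dev_warn(adev->dev, "GPU is throttled by thermal limits\n");
 */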
24003286
24013287 /* Registers mapping */
24023288 /* TODO: block userspace mapping of io register */
....@@ -2415,9 +3301,6 @@
24153301 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
24163302 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
24173303
2418
- /* doorbell bar mapping */
2419
- amdgpu_device_doorbell_init(adev);
2420
-
24213304 /* io port mapping */
24223305 for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
24233306 if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) {
....@@ -2429,24 +3312,56 @@
24293312 if (adev->rio_mem == NULL)
24303313 DRM_INFO("PCI I/O BAR is not found.\n");
24313314
3315
+ /* enable PCIE atomic ops */
3316
+ r = pci_enable_atomic_ops_to_root(adev->pdev,
3317
+ PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3318
+ PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3319
+ if (r) {
3320
+ adev->have_atomics_support = false;
3321
+ DRM_INFO("PCIE atomic ops is not supported\n");
3322
+ } else {
3323
+ adev->have_atomics_support = true;
3324
+ }
3325
+
24323326 amdgpu_device_get_pcie_info(adev);
3327
+
3328
+ if (amdgpu_mcbp)
3329
+ DRM_INFO("MCBP is enabled\n");
3330
+
3331
+ if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
3332
+ adev->enable_mes = true;
3333
+
3334
+ /* detect hw virtualization here */
3335
+ amdgpu_detect_virtualization(adev);
3336
+
3337
+ r = amdgpu_device_get_job_timeout_settings(adev);
3338
+ if (r) {
3339
+ dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3340
+ return r;
3341
+ }
24333342
24343343 /* early init functions */
24353344 r = amdgpu_device_ip_early_init(adev);
24363345 if (r)
24373346 return r;
24383347
3348
+ /* doorbell bar mapping and doorbell index init*/
3349
+ amdgpu_device_doorbell_init(adev);
3350
+
24393351 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
24403352 /* this will fail for cards that aren't VGA class devices, just
24413353 * ignore it */
24423354 vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode);
24433355
2444
- if (amdgpu_device_is_px(ddev))
2445
- runtime = true;
2446
- if (!pci_is_thunderbolt_attached(adev->pdev))
3356
+ if (amdgpu_device_supports_boco(ddev))
3357
+ boco = true;
3358
+ if (amdgpu_has_atpx() &&
3359
+ (amdgpu_is_atpx_hybrid() ||
3360
+ amdgpu_has_atpx_dgpu_power_cntl()) &&
3361
+ !pci_is_thunderbolt_attached(adev->pdev))
24473362 vga_switcheroo_register_client(adev->pdev,
2448
- &amdgpu_switcheroo_ops, runtime);
2449
- if (runtime)
3363
+ &amdgpu_switcheroo_ops, boco);
3364
+ if (boco)
24503365 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
24513366
24523367 if (amdgpu_emu_mode == 1) {
....@@ -2455,21 +3370,21 @@
24553370 goto fence_driver_init;
24563371 }
24573372
2458
- /* Read BIOS */
2459
- if (!amdgpu_get_bios(adev)) {
2460
- r = -EINVAL;
2461
- goto failed;
2462
- }
2463
-
2464
- r = amdgpu_atombios_init(adev);
2465
- if (r) {
2466
- dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2467
- amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2468
- goto failed;
2469
- }
2470
-
24713373 /* detect if we are with an SRIOV vbios */
24723374 amdgpu_device_detect_sriov_bios(adev);
3375
+
3376
+ /* check if we need to reset the asic
3377
+ * E.g., driver was not cleanly unloaded previously, etc.
3378
+ */
3379
+ if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3380
+ r = amdgpu_asic_reset(adev);
3381
+ if (r) {
3382
+ dev_err(adev->dev, "asic reset on init failed\n");
3383
+ goto failed;
3384
+ }
3385
+ }
3386
+
3387
+ pci_enable_pcie_error_reporting(adev->ddev.pdev);
24733388
24743389 /* Post card if necessary */
24753390 if (amdgpu_device_need_post(adev)) {
....@@ -2479,7 +3394,7 @@
24793394 goto failed;
24803395 }
24813396 DRM_INFO("GPU posting now...\n");
2482
- r = amdgpu_atom_asic_init(adev->mode_info.atom_context);
3397
+ r = amdgpu_device_asic_init(adev);
24833398 if (r) {
24843399 dev_err(adev->dev, "gpu post error!\n");
24853400 goto failed;
....@@ -2517,7 +3432,7 @@
25173432 }
25183433
25193434 /* init the mode config */
2520
- drm_mode_config_init(adev->ddev);
3435
+ drm_mode_config_init(adev_to_drm(adev));
25213436
25223437 r = amdgpu_device_ip_init(adev);
25233438 if (r) {
....@@ -2538,6 +3453,13 @@
25383453 goto failed;
25393454 }
25403455
3456
+ dev_info(adev->dev,
3457
+ "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3458
+ adev->gfx.config.max_shader_engines,
3459
+ adev->gfx.config.max_sh_per_se,
3460
+ adev->gfx.config.max_cu_per_sh,
3461
+ adev->gfx.cu_info.number);
3462
+
25413463 adev->accel_working = true;
25423464
25433465 amdgpu_vm_check_compute_bug(adev);
....@@ -2550,34 +3472,21 @@
25503472 /* Get a log2 for easy divisions. */
25513473 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
25523474
2553
- r = amdgpu_ib_pool_init(adev);
2554
- if (r) {
2555
- dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2556
- amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2557
- goto failed;
2558
- }
2559
-
25603475 amdgpu_fbdev_init(adev);
25613476
25623477 r = amdgpu_pm_sysfs_init(adev);
2563
- if (r)
3478
+ if (r) {
3479
+ adev->pm_sysfs_en = false;
25643480 DRM_ERROR("registering pm debugfs failed (%d).\n", r);
3481
+ } else
3482
+ adev->pm_sysfs_en = true;
25653483
2566
- r = amdgpu_debugfs_gem_init(adev);
2567
- if (r)
2568
- DRM_ERROR("registering gem debugfs failed (%d).\n", r);
2569
-
2570
- r = amdgpu_debugfs_regs_init(adev);
2571
- if (r)
2572
- DRM_ERROR("registering register debugfs failed (%d).\n", r);
2573
-
2574
- r = amdgpu_debugfs_firmware_init(adev);
2575
- if (r)
2576
- DRM_ERROR("registering firmware debugfs failed (%d).\n", r);
2577
-
2578
- r = amdgpu_debugfs_init(adev);
2579
- if (r)
2580
- DRM_ERROR("Creating debugfs files failed (%d).\n", r);
3484
+ r = amdgpu_ucode_sysfs_init(adev);
3485
+ if (r) {
3486
+ adev->ucode_sysfs_en = false;
3487
+ DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3488
+ } else
3489
+ adev->ucode_sysfs_en = true;
25813490
25823491 if ((amdgpu_testing & 1)) {
25833492 if (adev->accel_working)
....@@ -2592,6 +3501,13 @@
25923501 DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
25933502 }
25943503
3504
+ /*
3505
+ * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3506
+ * Otherwise the mgpu fan boost feature will be skipped because the
3507
+ * gpu instance count would be too low.
3508
+ */
3509
+ amdgpu_register_gpu_instance(adev);
3510
+
25953511 /* enable clockgating, etc. after ib tests, etc. since some blocks require
25963512 * explicit gating rather than handling it automatically.
25973513 */
....@@ -2602,11 +3518,33 @@
26023518 goto failed;
26033519 }
26043520
3521
+ /* must succeed. */
3522
+ amdgpu_ras_resume(adev);
3523
+
3524
+ queue_delayed_work(system_wq, &adev->delayed_init_work,
3525
+ msecs_to_jiffies(AMDGPU_RESUME_MS));
3526
+
3527
+ if (amdgpu_sriov_vf(adev))
3528
+ flush_delayed_work(&adev->delayed_init_work);
3529
+
3530
+ r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3531
+ if (r)
3532
+ dev_err(adev->dev, "Could not create amdgpu device attr\n");
3533
+
3534
+ if (IS_ENABLED(CONFIG_PERF_EVENTS))
3535
+ r = amdgpu_pmu_init(adev);
3536
+ if (r)
3537
+ dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3538
+
3539
+ /* Have stored pci confspace at hand for restore in sudden PCI error */
3540
+ if (amdgpu_device_cache_pci_state(adev->pdev))
3541
+ pci_restore_state(pdev);
3542
+
26053543 return 0;
26063544
26073545 failed:
26083546 amdgpu_vf_error_trans_all(adev);
2609
- if (runtime)
3547
+ if (boco)
26103548 vga_switcheroo_fini_domain_pm_ops(adev->dev);
26113549
26123550 return r;
....@@ -2622,29 +3560,37 @@
26223560 */
26233561 void amdgpu_device_fini(struct amdgpu_device *adev)
26243562 {
2625
- int r;
2626
-
2627
- DRM_INFO("amdgpu: finishing device.\n");
3563
+ dev_info(adev->dev, "amdgpu: finishing device.\n");
3564
+ flush_delayed_work(&adev->delayed_init_work);
3565
+ ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
26283566 adev->shutdown = true;
3567
+
3568
+ kfree(adev->pci_state);
3569
+
3570
+ /* make sure the IB tests have finished before entering exclusive mode
3571
+ * to avoid preemption during the IB tests
3572
+ */
3573
+ if (amdgpu_sriov_vf(adev)) {
3574
+ amdgpu_virt_request_full_gpu(adev, false);
3575
+ amdgpu_virt_fini_data_exchange(adev);
3576
+ }
3577
+
26293578 /* disable all interrupts */
26303579 amdgpu_irq_disable_all(adev);
26313580 if (adev->mode_info.mode_config_initialized){
26323581 if (!amdgpu_device_has_dc_support(adev))
2633
- drm_crtc_force_disable_all(adev->ddev);
3582
+ drm_helper_force_disable_all(adev_to_drm(adev));
26343583 else
2635
- drm_atomic_helper_shutdown(adev->ddev);
3584
+ drm_atomic_helper_shutdown(adev_to_drm(adev));
26363585 }
2637
- amdgpu_ib_pool_fini(adev);
26383586 amdgpu_fence_driver_fini(adev);
2639
- amdgpu_pm_sysfs_fini(adev);
3587
+ if (adev->pm_sysfs_en)
3588
+ amdgpu_pm_sysfs_fini(adev);
26403589 amdgpu_fbdev_fini(adev);
2641
- r = amdgpu_device_ip_fini(adev);
2642
- if (adev->firmware.gpu_info_fw) {
2643
- release_firmware(adev->firmware.gpu_info_fw);
2644
- adev->firmware.gpu_info_fw = NULL;
2645
- }
3590
+ amdgpu_device_ip_fini(adev);
3591
+ release_firmware(adev->firmware.gpu_info_fw);
3592
+ adev->firmware.gpu_info_fw = NULL;
26463593 adev->accel_working = false;
2647
- cancel_delayed_work_sync(&adev->late_init_work);
26483594 /* free i2c buses */
26493595 if (!amdgpu_device_has_dc_support(adev))
26503596 amdgpu_i2c_fini(adev);
....@@ -2654,9 +3600,12 @@
26543600
26553601 kfree(adev->bios);
26563602 adev->bios = NULL;
2657
- if (!pci_is_thunderbolt_attached(adev->pdev))
3603
+ if (amdgpu_has_atpx() &&
3604
+ (amdgpu_is_atpx_hybrid() ||
3605
+ amdgpu_has_atpx_dgpu_power_cntl()) &&
3606
+ !pci_is_thunderbolt_attached(adev->pdev))
26583607 vga_switcheroo_unregister_client(adev->pdev);
2659
- if (adev->flags & AMD_IS_PX)
3608
+ if (amdgpu_device_supports_boco(adev_to_drm(adev)))
26603609 vga_switcheroo_fini_domain_pm_ops(adev->dev);
26613610 vga_client_register(adev->pdev, NULL, NULL, NULL);
26623611 if (adev->rio_mem)
....@@ -2665,7 +3614,15 @@
26653614 iounmap(adev->rmmio);
26663615 adev->rmmio = NULL;
26673616 amdgpu_device_doorbell_fini(adev);
2668
- amdgpu_debugfs_regs_cleanup(adev);
3617
+
3618
+ if (adev->ucode_sysfs_en)
3619
+ amdgpu_ucode_sysfs_fini(adev);
3620
+
3621
+ sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
3622
+ if (IS_ENABLED(CONFIG_PERF_EVENTS))
3623
+ amdgpu_pmu_fini(adev);
3624
+ if (adev->mman.discovery_bin)
3625
+ amdgpu_discovery_fini(adev);
26693626 }
26703627
26713628
....@@ -2676,40 +3633,41 @@
26763633 * amdgpu_device_suspend - initiate device suspend
26773634 *
26783635 * @dev: drm dev pointer
2679
- * @suspend: suspend state
26803636 * @fbcon : notify the fbdev of suspend
26813637 *
26823638 * Puts the hw in the suspend state (all asics).
26833639 * Returns 0 for success or an error on failure.
26843640 * Called at driver suspend.
26853641 */
2686
-int amdgpu_device_suspend(struct drm_device *dev, bool suspend, bool fbcon)
3642
+int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
26873643 {
26883644 struct amdgpu_device *adev;
26893645 struct drm_crtc *crtc;
26903646 struct drm_connector *connector;
3647
+ struct drm_connector_list_iter iter;
26913648 int r;
26923649
2693
- if (dev == NULL || dev->dev_private == NULL) {
2694
- return -ENODEV;
2695
- }
2696
-
2697
- adev = dev->dev_private;
3650
+ adev = drm_to_adev(dev);
26983651
26993652 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
27003653 return 0;
27013654
3655
+ adev->in_suspend = true;
27023656 drm_kms_helper_poll_disable(dev);
27033657
27043658 if (fbcon)
27053659 amdgpu_fbdev_set_suspend(adev, 1);
27063660
3661
+ cancel_delayed_work_sync(&adev->delayed_init_work);
3662
+
27073663 if (!amdgpu_device_has_dc_support(adev)) {
27083664 /* turn off display hw */
27093665 drm_modeset_lock_all(dev);
2710
- list_for_each_entry(connector, &dev->mode_config.connector_list, head) {
2711
- drm_helper_connector_dpms(connector, DRM_MODE_DPMS_OFF);
2712
- }
3666
+ drm_connector_list_iter_begin(dev, &iter);
3667
+ drm_for_each_connector_iter(connector, &iter)
3668
+ drm_helper_connector_dpms(connector,
3669
+ DRM_MODE_DPMS_OFF);
3670
+ drm_connector_list_iter_end(&iter);
27133671 drm_modeset_unlock_all(dev);
27143672 /* unpin the front buffers and cursors */
27153673 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
....@@ -2717,7 +3675,7 @@
27173675 struct drm_framebuffer *fb = crtc->primary->fb;
27183676 struct amdgpu_bo *robj;
27193677
2720
- if (amdgpu_crtc->cursor_bo) {
3678
+ if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
27213679 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
27223680 r = amdgpu_bo_reserve(aobj, true);
27233681 if (r == 0) {
....@@ -2741,9 +3699,11 @@
27413699 }
27423700 }
27433701
2744
- amdgpu_amdkfd_suspend(adev);
3702
+ amdgpu_ras_suspend(adev);
27453703
27463704 r = amdgpu_device_ip_suspend_phase1(adev);
3705
+
3706
+ amdgpu_amdkfd_suspend(adev, !fbcon);
27473707
27483708 /* evict vram memory */
27493709 amdgpu_bo_evict_vram(adev);
....@@ -2758,17 +3718,6 @@
27583718 */
27593719 amdgpu_bo_evict_vram(adev);
27603720
2761
- pci_save_state(dev->pdev);
2762
- if (suspend) {
2763
- /* Shut down the device */
2764
- pci_disable_device(dev->pdev);
2765
- pci_set_power_state(dev->pdev, PCI_D3hot);
2766
- } else {
2767
- r = amdgpu_asic_reset(adev);
2768
- if (r)
2769
- DRM_ERROR("amdgpu asic reset failed\n");
2770
- }
2771
-
27723721 return 0;
27733722 }
27743723
....@@ -2776,41 +3725,33 @@
27763725 * amdgpu_device_resume - initiate device resume
27773726 *
27783727 * @dev: drm dev pointer
2779
- * @resume: resume state
27803728 * @fbcon : notify the fbdev of resume
27813729 *
27823730 * Bring the hw back to operating state (all asics).
27833731 * Returns 0 for success or an error on failure.
27843732 * Called at driver resume.
27853733 */
2786
-int amdgpu_device_resume(struct drm_device *dev, bool resume, bool fbcon)
3734
+int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
27873735 {
27883736 struct drm_connector *connector;
2789
- struct amdgpu_device *adev = dev->dev_private;
3737
+ struct drm_connector_list_iter iter;
3738
+ struct amdgpu_device *adev = drm_to_adev(dev);
27903739 struct drm_crtc *crtc;
27913740 int r = 0;
27923741
27933742 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
27943743 return 0;
27953744
2796
- if (resume) {
2797
- pci_set_power_state(dev->pdev, PCI_D0);
2798
- pci_restore_state(dev->pdev);
2799
- r = pci_enable_device(dev->pdev);
2800
- if (r)
2801
- return r;
2802
- }
2803
-
28043745 /* post card */
28053746 if (amdgpu_device_need_post(adev)) {
2806
- r = amdgpu_atom_asic_init(adev->mode_info.atom_context);
3747
+ r = amdgpu_device_asic_init(adev);
28073748 if (r)
2808
- DRM_ERROR("amdgpu asic init failed\n");
3749
+ dev_err(adev->dev, "amdgpu asic init failed\n");
28093750 }
28103751
28113752 r = amdgpu_device_ip_resume(adev);
28123753 if (r) {
2813
- DRM_ERROR("amdgpu_device_ip_resume failed (%d).\n", r);
3754
+ dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
28143755 return r;
28153756 }
28163757 amdgpu_fence_driver_resume(adev);
....@@ -2820,30 +3761,33 @@
28203761 if (r)
28213762 return r;
28223763
3764
+ queue_delayed_work(system_wq, &adev->delayed_init_work,
3765
+ msecs_to_jiffies(AMDGPU_RESUME_MS));
3766
+
28233767 if (!amdgpu_device_has_dc_support(adev)) {
28243768 /* pin cursors */
28253769 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
28263770 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
28273771
2828
- if (amdgpu_crtc->cursor_bo) {
3772
+ if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
28293773 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
28303774 r = amdgpu_bo_reserve(aobj, true);
28313775 if (r == 0) {
28323776 r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM);
28333777 if (r != 0)
2834
- DRM_ERROR("Failed to pin cursor BO (%d)\n", r);
3778
+ dev_err(adev->dev, "Failed to pin cursor BO (%d)\n", r);
28353779 amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj);
28363780 amdgpu_bo_unreserve(aobj);
28373781 }
28383782 }
28393783 }
28403784 }
2841
- r = amdgpu_amdkfd_resume(adev);
3785
+ r = amdgpu_amdkfd_resume(adev, !fbcon);
28423786 if (r)
28433787 return r;
28443788
28453789 /* Make sure IB tests flushed */
2846
- flush_delayed_work(&adev->late_init_work);
3790
+ flush_delayed_work(&adev->delayed_init_work);
28473791
28483792 /* blat the mode back in */
28493793 if (fbcon) {
....@@ -2853,15 +3797,21 @@
28533797
28543798 /* turn on display hw */
28553799 drm_modeset_lock_all(dev);
2856
- list_for_each_entry(connector, &dev->mode_config.connector_list, head) {
2857
- drm_helper_connector_dpms(connector, DRM_MODE_DPMS_ON);
2858
- }
3800
+
3801
+ drm_connector_list_iter_begin(dev, &iter);
3802
+ drm_for_each_connector_iter(connector, &iter)
3803
+ drm_helper_connector_dpms(connector,
3804
+ DRM_MODE_DPMS_ON);
3805
+ drm_connector_list_iter_end(&iter);
3806
+
28593807 drm_modeset_unlock_all(dev);
28603808 }
28613809 amdgpu_fbdev_set_suspend(adev, 0);
28623810 }
28633811
28643812 drm_kms_helper_poll_enable(dev);
3813
+
3814
+ amdgpu_ras_resume(adev);
28653815
28663816 /*
28673817 * Most of the connector probing functions try to acquire runtime pm
....@@ -2882,6 +3832,8 @@
28823832 #ifdef CONFIG_PM
28833833 dev->dev->power.disable_depth--;
28843834 #endif
3835
+ adev->in_suspend = false;
3836
+
28853837 return 0;
28863838 }
28873839
....@@ -2913,7 +3865,7 @@
29133865 adev->ip_blocks[i].status.hang =
29143866 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
29153867 if (adev->ip_blocks[i].status.hang) {
2916
- DRM_INFO("IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
3868
+ dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
29173869 asic_hang = true;
29183870 }
29193871 }
....@@ -2974,7 +3926,7 @@
29743926 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
29753927 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
29763928 if (adev->ip_blocks[i].status.hang) {
2977
- DRM_INFO("Some block need full reset!\n");
3929
+ dev_info(adev->dev, "Some block need full reset!\n");
29783930 return true;
29793931 }
29803932 }
....@@ -3040,200 +3992,77 @@
30403992 }
30413993
30423994 /**
3043
- * amdgpu_device_recover_vram_from_shadow - restore shadowed VRAM buffers
3044
- *
3045
- * @adev: amdgpu_device pointer
3046
- * @ring: amdgpu_ring for the engine handling the buffer operations
3047
- * @bo: amdgpu_bo buffer whose shadow is being restored
3048
- * @fence: dma_fence associated with the operation
3049
- *
3050
- * Restores the VRAM buffer contents from the shadow in GTT. Used to
3051
- * restore things like GPUVM page tables after a GPU reset where
3052
- * the contents of VRAM might be lost.
3053
- * Returns 0 on success, negative error code on failure.
3054
- */
3055
-static int amdgpu_device_recover_vram_from_shadow(struct amdgpu_device *adev,
3056
- struct amdgpu_ring *ring,
3057
- struct amdgpu_bo *bo,
3058
- struct dma_fence **fence)
3059
-{
3060
- uint32_t domain;
3061
- int r;
3062
-
3063
- if (!bo->shadow)
3064
- return 0;
3065
-
3066
- r = amdgpu_bo_reserve(bo, true);
3067
- if (r)
3068
- return r;
3069
- domain = amdgpu_mem_type_to_domain(bo->tbo.mem.mem_type);
3070
- /* if bo has been evicted, then no need to recover */
3071
- if (domain == AMDGPU_GEM_DOMAIN_VRAM) {
3072
- r = amdgpu_bo_validate(bo->shadow);
3073
- if (r) {
3074
- DRM_ERROR("bo validate failed!\n");
3075
- goto err;
3076
- }
3077
-
3078
- r = amdgpu_bo_restore_from_shadow(adev, ring, bo,
3079
- NULL, fence, true);
3080
- if (r) {
3081
- DRM_ERROR("recover page table failed!\n");
3082
- goto err;
3083
- }
3084
- }
3085
-err:
3086
- amdgpu_bo_unreserve(bo);
3087
- return r;
3088
-}
3089
-
3090
-/**
3091
- * amdgpu_device_handle_vram_lost - Handle the loss of VRAM contents
3995
+ * amdgpu_device_recover_vram - Recover some VRAM contents
30923996 *
30933997 * @adev: amdgpu_device pointer
30943998 *
30953999 * Restores the contents of VRAM buffers from the shadows in GTT. Used to
30964000 * restore things like GPUVM page tables after a GPU reset where
30974001 * the contents of VRAM might be lost.
3098
- * Returns 0 on success, 1 on failure.
4002
+ *
4003
+ * Returns:
4004
+ * 0 on success, negative error code on failure.
30994005 */
3100
-static int amdgpu_device_handle_vram_lost(struct amdgpu_device *adev)
4006
+static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
31014007 {
3102
- struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
3103
- struct amdgpu_bo *bo, *tmp;
31044008 struct dma_fence *fence = NULL, *next = NULL;
3105
- long r = 1;
3106
- int i = 0;
3107
- long tmo;
4009
+ struct amdgpu_bo *shadow;
4010
+ long r = 1, tmo;
31084011
31094012 if (amdgpu_sriov_runtime(adev))
31104013 tmo = msecs_to_jiffies(8000);
31114014 else
31124015 tmo = msecs_to_jiffies(100);
31134016
3114
- DRM_INFO("recover vram bo from shadow start\n");
4017
+ dev_info(adev->dev, "recover vram bo from shadow start\n");
31154018 mutex_lock(&adev->shadow_list_lock);
3116
- list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list) {
3117
- next = NULL;
3118
- amdgpu_device_recover_vram_from_shadow(adev, ring, bo, &next);
4019
+ list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {
4020
+
4021
+ /* No need to recover an evicted BO */
4022
+ if (shadow->tbo.mem.mem_type != TTM_PL_TT ||
4023
+ shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET ||
4024
+ shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM)
4025
+ continue;
4026
+
4027
+ r = amdgpu_bo_restore_shadow(shadow, &next);
4028
+ if (r)
4029
+ break;
4030
+
31194031 if (fence) {
3120
- r = dma_fence_wait_timeout(fence, false, tmo);
3121
- if (r == 0)
3122
- pr_err("wait fence %p[%d] timeout\n", fence, i);
3123
- else if (r < 0)
3124
- pr_err("wait fence %p[%d] interrupted\n", fence, i);
3125
- if (r < 1) {
3126
- dma_fence_put(fence);
3127
- fence = next;
4032
+ tmo = dma_fence_wait_timeout(fence, false, tmo);
4033
+ dma_fence_put(fence);
4034
+ fence = next;
4035
+ if (tmo == 0) {
4036
+ r = -ETIMEDOUT;
4037
+ break;
4038
+ } else if (tmo < 0) {
4039
+ r = tmo;
31284040 break;
31294041 }
3130
- i++;
4042
+ } else {
4043
+ fence = next;
31314044 }
3132
-
3133
- dma_fence_put(fence);
3134
- fence = next;
31354045 }
31364046 mutex_unlock(&adev->shadow_list_lock);
31374047
3138
- if (fence) {
3139
- r = dma_fence_wait_timeout(fence, false, tmo);
3140
- if (r == 0)
3141
- pr_err("wait fence %p[%d] timeout\n", fence, i);
3142
- else if (r < 0)
3143
- pr_err("wait fence %p[%d] interrupted\n", fence, i);
3144
-
3145
- }
4048
+ if (fence)
4049
+ tmo = dma_fence_wait_timeout(fence, false, tmo);
31464050 dma_fence_put(fence);
31474051
3148
- if (r > 0)
3149
- DRM_INFO("recover vram bo from shadow done\n");
3150
- else
3151
- DRM_ERROR("recover vram bo from shadow failed\n");
4052
+ if (r < 0 || tmo <= 0) {
4053
+ dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
4054
+ return -EIO;
4055
+ }
31524056
3153
- return (r > 0) ? 0 : 1;
4057
+ dev_info(adev->dev, "recover vram bo from shadow done\n");
4058
+ return 0;
31544059 }
31554060
3156
-/**
3157
- * amdgpu_device_reset - reset ASIC/GPU for bare-metal or passthrough
3158
- *
3159
- * @adev: amdgpu device pointer
3160
- *
3161
- * attempt to do soft-reset or full-reset and reinitialize Asic
3162
- * return 0 means succeeded otherwise failed
3163
- */
3164
-static int amdgpu_device_reset(struct amdgpu_device *adev)
3165
-{
3166
- bool need_full_reset, vram_lost = 0;
3167
- int r;
3168
-
3169
- need_full_reset = amdgpu_device_ip_need_full_reset(adev);
3170
-
3171
- if (!need_full_reset) {
3172
- amdgpu_device_ip_pre_soft_reset(adev);
3173
- r = amdgpu_device_ip_soft_reset(adev);
3174
- amdgpu_device_ip_post_soft_reset(adev);
3175
- if (r || amdgpu_device_ip_check_soft_reset(adev)) {
3176
- DRM_INFO("soft reset failed, will fallback to full reset!\n");
3177
- need_full_reset = true;
3178
- }
3179
- }
3180
-
3181
- if (need_full_reset) {
3182
- r = amdgpu_device_ip_suspend(adev);
3183
-
3184
-retry:
3185
- r = amdgpu_asic_reset(adev);
3186
- /* post card */
3187
- amdgpu_atom_asic_init(adev->mode_info.atom_context);
3188
-
3189
- if (!r) {
3190
- dev_info(adev->dev, "GPU reset succeeded, trying to resume\n");
3191
- r = amdgpu_device_ip_resume_phase1(adev);
3192
- if (r)
3193
- goto out;
3194
-
3195
- vram_lost = amdgpu_device_check_vram_lost(adev);
3196
- if (vram_lost) {
3197
- DRM_ERROR("VRAM is lost!\n");
3198
- atomic_inc(&adev->vram_lost_counter);
3199
- }
3200
-
3201
- r = amdgpu_gtt_mgr_recover(
3202
- &adev->mman.bdev.man[TTM_PL_TT]);
3203
- if (r)
3204
- goto out;
3205
-
3206
- r = amdgpu_device_ip_resume_phase2(adev);
3207
- if (r)
3208
- goto out;
3209
-
3210
- if (vram_lost)
3211
- amdgpu_device_fill_reset_magic(adev);
3212
- }
3213
- }
3214
-
3215
-out:
3216
- if (!r) {
3217
- amdgpu_irq_gpu_reset_resume_helper(adev);
3218
- r = amdgpu_ib_ring_tests(adev);
3219
- if (r) {
3220
- dev_err(adev->dev, "ib ring test failed (%d).\n", r);
3221
- r = amdgpu_device_ip_suspend(adev);
3222
- need_full_reset = true;
3223
- goto retry;
3224
- }
3225
- }
3226
-
3227
- if (!r && ((need_full_reset && !(adev->flags & AMD_IS_APU)) || vram_lost))
3228
- r = amdgpu_device_handle_vram_lost(adev);
3229
-
3230
- return r;
3231
-}
32324061
32334062 /**
32344063 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
32354064 *
3236
- * @adev: amdgpu device pointer
4065
+ * @adev: amdgpu_device pointer
32374066 * @from_hypervisor: request from hypervisor
32384067 *
32394068 * do VF FLR and reinitialize Asic
....@@ -3251,13 +4080,20 @@
32514080 if (r)
32524081 return r;
32534082
4083
+ amdgpu_amdkfd_pre_reset(adev);
4084
+
32544085 /* Resume IP prior to SMC */
32554086 r = amdgpu_device_ip_reinit_early_sriov(adev);
32564087 if (r)
32574088 goto error;
32584089
4090
+ amdgpu_virt_init_data_exchange(adev);
32594091 /* we need recover gart prior to run SMC/CP/SDMA resume */
3260
- amdgpu_gtt_mgr_recover(&adev->mman.bdev.man[TTM_PL_TT]);
4092
+ amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
4093
+
4094
+ r = amdgpu_device_fw_loading(adev);
4095
+ if (r)
4096
+ return r;
32614097
32624098 /* now we are okay to resume SMC/CP/SDMA */
32634099 r = amdgpu_device_ip_reinit_late_sriov(adev);
....@@ -3266,55 +4102,115 @@
32664102
32674103 amdgpu_irq_gpu_reset_resume_helper(adev);
32684104 r = amdgpu_ib_ring_tests(adev);
4105
+ amdgpu_amdkfd_post_reset(adev);
32694106
32704107 error:
3271
- amdgpu_virt_init_data_exchange(adev);
32724108 amdgpu_virt_release_full_gpu(adev, true);
32734109 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
3274
- atomic_inc(&adev->vram_lost_counter);
3275
- r = amdgpu_device_handle_vram_lost(adev);
4110
+ amdgpu_inc_vram_lost(adev);
4111
+ r = amdgpu_device_recover_vram(adev);
32764112 }
32774113
32784114 return r;
32794115 }
32804116
32814117 /**
3282
- * amdgpu_device_gpu_recover - reset the asic and recover scheduler
4118
+ * amdgpu_device_has_job_running - check if there is any job in mirror list
32834119 *
3284
- * @adev: amdgpu device pointer
3285
- * @job: which job trigger hang
3286
- * @force: forces reset regardless of amdgpu_gpu_recovery
4120
+ * @adev: amdgpu_device pointer
32874121 *
3288
- * Attempt to reset the GPU if it has hung (all asics).
3289
- * Returns 0 for success or an error on failure.
4122
+ * check if there is any job in mirror list
32904123 */
3291
-int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
3292
- struct amdgpu_job *job, bool force)
4124
+bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
32934125 {
3294
- int i, r, resched;
4126
+ int i;
4127
+ struct drm_sched_job *job;
32954128
3296
- if (!force && !amdgpu_device_ip_check_soft_reset(adev)) {
3297
- DRM_INFO("No hardware hang detected. Did some blocks stall?\n");
3298
- return 0;
4129
+ for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4130
+ struct amdgpu_ring *ring = adev->rings[i];
4131
+
4132
+ if (!ring || !ring->sched.thread)
4133
+ continue;
4134
+
4135
+ spin_lock(&ring->sched.job_list_lock);
4136
+ job = list_first_entry_or_null(&ring->sched.ring_mirror_list,
4137
+ struct drm_sched_job, node);
4138
+ spin_unlock(&ring->sched.job_list_lock);
4139
+ if (job)
4140
+ return true;
4141
+ }
4142
+ return false;
4143
+}
4144
+
4145
+/**
4146
+ * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4147
+ *
4148
+ * @adev: amdgpu_device pointer
4149
+ *
4150
+ * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4151
+ * a hung GPU.
4152
+ */
4153
+bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4154
+{
4155
+ if (!amdgpu_device_ip_check_soft_reset(adev)) {
4156
+ dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
4157
+ return false;
32994158 }
33004159
3301
- if (!force && (amdgpu_gpu_recovery == 0 ||
3302
- (amdgpu_gpu_recovery == -1 && !amdgpu_sriov_vf(adev)))) {
3303
- DRM_INFO("GPU recovery disabled.\n");
3304
- return 0;
4160
+ if (amdgpu_gpu_recovery == 0)
4161
+ goto disabled;
4162
+
4163
+ if (amdgpu_sriov_vf(adev))
4164
+ return true;
4165
+
4166
+ if (amdgpu_gpu_recovery == -1) {
4167
+ switch (adev->asic_type) {
4168
+ case CHIP_BONAIRE:
4169
+ case CHIP_HAWAII:
4170
+ case CHIP_TOPAZ:
4171
+ case CHIP_TONGA:
4172
+ case CHIP_FIJI:
4173
+ case CHIP_POLARIS10:
4174
+ case CHIP_POLARIS11:
4175
+ case CHIP_POLARIS12:
4176
+ case CHIP_VEGAM:
4177
+ case CHIP_VEGA20:
4178
+ case CHIP_VEGA10:
4179
+ case CHIP_VEGA12:
4180
+ case CHIP_RAVEN:
4181
+ case CHIP_ARCTURUS:
4182
+ case CHIP_RENOIR:
4183
+ case CHIP_NAVI10:
4184
+ case CHIP_NAVI14:
4185
+ case CHIP_NAVI12:
4186
+ case CHIP_SIENNA_CICHLID:
4187
+ break;
4188
+ default:
4189
+ goto disabled;
4190
+ }
33054191 }
33064192
3307
- dev_info(adev->dev, "GPU reset begin!\n");
4193
+ return true;
33084194
3309
- mutex_lock(&adev->lock_reset);
3310
- atomic_inc(&adev->gpu_reset_counter);
3311
- adev->in_gpu_reset = 1;
4195
+disabled:
4196
+ dev_info(adev->dev, "GPU recovery disabled.\n");
4197
+ return false;
4198
+}
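/*
 * In terms of the amdgpu.gpu_recovery module parameter: 0 disables recovery
 * outright, the "auto" value -1 (the default in amdgpu_drv.c) enables it only
 * for the ASICs listed in the switch above (and unconditionally for SR-IOV
 * VFs), and any other value, e.g. 1, enables recovery on every ASIC.
 * Independently of the parameter, the soft-reset probe at the top turns a
 * scheduler timeout into "no recovery" when no IP block actually reports a
 * hang.
 */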
33124199
3313
- /* Block kfd */
3314
- amdgpu_amdkfd_pre_reset(adev);
33154200
3316
- /* block TTM */
3317
- resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
4201
+static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4202
+ struct amdgpu_job *job,
4203
+ bool *need_full_reset_arg)
4204
+{
4205
+ int i, r = 0;
4206
+ bool need_full_reset = *need_full_reset_arg;
4207
+
4208
+ amdgpu_debugfs_wait_dump(adev);
4209
+
4210
+ if (amdgpu_sriov_vf(adev)) {
4211
+ /* stop the data exchange thread */
4212
+ amdgpu_virt_fini_data_exchange(adev);
4213
+ }
33184214
33194215 /* block all schedulers and reset given job's ring */
33204216 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
....@@ -3323,57 +4219,510 @@
33234219 if (!ring || !ring->sched.thread)
33244220 continue;
33254221
3326
- kthread_park(ring->sched.thread);
3327
-
3328
- if (job && job->base.sched == &ring->sched)
3329
- continue;
3330
-
3331
- drm_sched_hw_job_reset(&ring->sched, job ? &job->base : NULL);
3332
-
33334222 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
33344223 amdgpu_fence_driver_force_completion(ring);
33354224 }
33364225
3337
- if (amdgpu_sriov_vf(adev))
3338
- r = amdgpu_device_reset_sriov(adev, job ? false : true);
3339
- else
3340
- r = amdgpu_device_reset(adev);
4226
+ if(job)
4227
+ drm_sched_increase_karma(&job->base);
33414228
3342
- for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
3343
- struct amdgpu_ring *ring = adev->rings[i];
4229
+ /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4230
+ if (!amdgpu_sriov_vf(adev)) {
33444231
3345
- if (!ring || !ring->sched.thread)
3346
- continue;
4232
+ if (!need_full_reset)
4233
+ need_full_reset = amdgpu_device_ip_need_full_reset(adev);
33474234
3348
- /* only need recovery sched of the given job's ring
3349
- * or all rings (in the case @job is NULL)
3350
- * after above amdgpu_reset accomplished
3351
- */
3352
- if ((!job || job->base.sched == &ring->sched) && !r)
3353
- drm_sched_job_recovery(&ring->sched);
4235
+ if (!need_full_reset) {
4236
+ amdgpu_device_ip_pre_soft_reset(adev);
4237
+ r = amdgpu_device_ip_soft_reset(adev);
4238
+ amdgpu_device_ip_post_soft_reset(adev);
4239
+ if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4240
+ dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
4241
+ need_full_reset = true;
4242
+ }
4243
+ }
33544244
3355
- kthread_unpark(ring->sched.thread);
4245
+ if (need_full_reset)
4246
+ r = amdgpu_device_ip_suspend(adev);
4247
+
4248
+ *need_full_reset_arg = need_full_reset;
33564249 }
33574250
3358
- if (!amdgpu_device_has_dc_support(adev)) {
3359
- drm_helper_resume_force_mode(adev->ddev);
4251
+ return r;
4252
+}
4253
+
4254
+static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
4255
+ struct list_head *device_list_handle,
4256
+ bool *need_full_reset_arg,
4257
+ bool skip_hw_reset)
4258
+{
4259
+ struct amdgpu_device *tmp_adev = NULL;
4260
+ bool need_full_reset = *need_full_reset_arg, vram_lost = false;
4261
+ int r = 0;
4262
+
4263
+ /*
4264
+ * ASIC reset has to be done on all XGMI hive nodes ASAP
4265
+ * to allow proper link negotiation in FW (within 1 sec)
4266
+ */
4267
+ if (!skip_hw_reset && need_full_reset) {
4268
+ list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4269
+ /* For XGMI run all resets in parallel to speed up the process */
4270
+ if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4271
+ if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
4272
+ r = -EALREADY;
4273
+ } else
4274
+ r = amdgpu_asic_reset(tmp_adev);
4275
+
4276
+ if (r) {
4277
+ dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
4278
+ r, adev_to_drm(tmp_adev)->unique);
4279
+ break;
4280
+ }
4281
+ }
4282
+
4283
+ /* For XGMI wait for all resets to complete before proceed */
4284
+ if (!r) {
4285
+ list_for_each_entry(tmp_adev, device_list_handle,
4286
+ gmc.xgmi.head) {
4287
+ if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4288
+ flush_work(&tmp_adev->xgmi_reset_work);
4289
+ r = tmp_adev->asic_reset_res;
4290
+ if (r)
4291
+ break;
4292
+ }
4293
+ }
4294
+ }
33604295 }
33614296
3362
- ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);
4297
+ if (!r && amdgpu_ras_intr_triggered()) {
4298
+ list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4299
+ if (tmp_adev->mmhub.funcs &&
4300
+ tmp_adev->mmhub.funcs->reset_ras_error_count)
4301
+ tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev);
4302
+ }
33634303
3364
- if (r) {
3365
- /* bad news, how to tell it to userspace ? */
3366
- dev_info(adev->dev, "GPU reset(%d) failed\n", atomic_read(&adev->gpu_reset_counter));
3367
- amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
4304
+ amdgpu_ras_intr_cleared();
4305
+ }
4306
+
4307
+ list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4308
+ if (need_full_reset) {
4309
+ /* post card */
4310
+ if (amdgpu_device_asic_init(tmp_adev))
4311
+ dev_warn(tmp_adev->dev, "asic atom init failed!");
4312
+
4313
+ if (!r) {
4314
+ dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4315
+ r = amdgpu_amdkfd_resume_iommu(tmp_adev);
4316
+ if (r)
4317
+ goto out;
4318
+
4319
+ r = amdgpu_device_ip_resume_phase1(tmp_adev);
4320
+ if (r)
4321
+ goto out;
4322
+
4323
+ vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4324
+ if (vram_lost) {
4325
+ DRM_INFO("VRAM is lost due to GPU reset!\n");
4326
+ amdgpu_inc_vram_lost(tmp_adev);
4327
+ }
4328
+
4329
+ r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
4330
+ if (r)
4331
+ goto out;
4332
+
4333
+ r = amdgpu_device_fw_loading(tmp_adev);
4334
+ if (r)
4335
+ return r;
4336
+
4337
+ r = amdgpu_device_ip_resume_phase2(tmp_adev);
4338
+ if (r)
4339
+ goto out;
4340
+
4341
+ if (vram_lost)
4342
+ amdgpu_device_fill_reset_magic(tmp_adev);
4343
+
4344
+ /*
4345
+ * Add this ASIC back as tracked, since the reset
4346
+ * completed successfully.
4347
+ */
4348
+ amdgpu_register_gpu_instance(tmp_adev);
4349
+
4350
+ r = amdgpu_device_ip_late_init(tmp_adev);
4351
+ if (r)
4352
+ goto out;
4353
+
4354
+ amdgpu_fbdev_set_suspend(tmp_adev, 0);
4355
+
4356
+ /*
4357
+ * The GPU enters a bad state once the number of faulty
4358
+ * pages detected by ECC reaches the threshold, and RAS
4359
+ * recovery is scheduled next. So check here and break
4360
+ * out of recovery if the bad page threshold has indeed
4361
+ * been exceeded, and remind the user to either retire
4362
+ * this GPU or set a bigger bad_page_threshold value so
4363
+ * the issue can be fixed the next time the driver is
4364
+ * probed.
4365
+ */
4366
+ if (!amdgpu_ras_check_err_threshold(tmp_adev)) {
4367
+ /* must succeed. */
4368
+ amdgpu_ras_resume(tmp_adev);
4369
+ } else {
4370
+ r = -EINVAL;
4371
+ goto out;
4372
+ }
4373
+
4374
+ /* Update PSP FW topology after reset */
4375
+ if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4376
+ r = amdgpu_xgmi_update_topology(hive, tmp_adev);
4377
+ }
4378
+ }
4379
+
4380
+out:
4381
+ if (!r) {
4382
+ amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4383
+ r = amdgpu_ib_ring_tests(tmp_adev);
4384
+ if (r) {
4385
+ dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
4386
+ need_full_reset = true;
4387
+ r = -EAGAIN;
4388
+ goto end;
4389
+ }
4390
+ }
4391
+
4392
+ if (!r)
4393
+ r = amdgpu_device_recover_vram(tmp_adev);
4394
+ else
4395
+ tmp_adev->asic_reset_res = r;
4396
+ }
4397
+
4398
+end:
4399
+ *need_full_reset_arg = need_full_reset;
4400
+ return r;
4401
+}
4402
+
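For reference, the XGMI branch above fans each node's reset out to system_unbound_wq and then joins with flush_work(), so every hive node resets inside the firmware's link-negotiation window. A minimal sketch of the work-handler side, assuming a work item embedded in the device structure as in the code above (the handler name is illustrative, not the driver's actual symbol):

/* Illustrative work handler for the per-device xgmi_reset_work item
 * queued above; it records the reset result so the joining loop can
 * read it back from tmp_adev->asic_reset_res after flush_work().
 */
static void example_xgmi_reset_func(struct work_struct *work)
{
	struct amdgpu_device *adev =
		container_of(work, struct amdgpu_device, xgmi_reset_work);

	adev->asic_reset_res = amdgpu_asic_reset(adev);
}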
4403
+static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
4404
+ struct amdgpu_hive_info *hive)
4405
+{
4406
+ if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
4407
+ return false;
4408
+
4409
+ if (hive) {
4410
+ down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
33684411 } else {
3369
- dev_info(adev->dev, "GPU reset(%d) succeeded!\n",atomic_read(&adev->gpu_reset_counter));
4412
+ down_write(&adev->reset_sem);
33704413 }
33714414
3372
- /*unlock kfd */
3373
- amdgpu_amdkfd_post_reset(adev);
4415
+ atomic_inc(&adev->gpu_reset_counter);
4416
+ switch (amdgpu_asic_reset_method(adev)) {
4417
+ case AMD_RESET_METHOD_MODE1:
4418
+ adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
4419
+ break;
4420
+ case AMD_RESET_METHOD_MODE2:
4421
+ adev->mp1_state = PP_MP1_STATE_RESET;
4422
+ break;
4423
+ default:
4424
+ adev->mp1_state = PP_MP1_STATE_NONE;
4425
+ break;
4426
+ }
4427
+
4428
+ return true;
4429
+}
4430
+
4431
+static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
4432
+{
33744433 amdgpu_vf_error_trans_all(adev);
3375
- adev->in_gpu_reset = 0;
3376
- mutex_unlock(&adev->lock_reset);
4434
+ adev->mp1_state = PP_MP1_STATE_NONE;
4435
+ atomic_set(&adev->in_gpu_reset, 0);
4436
+ up_write(&adev->reset_sem);
4437
+}
4438
+
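Taken together, the two helpers above bracket the whole recovery critical section: the trylock-style lock bails out when another reset already owns the device, and the unlock restores mp1_state and releases reset_sem. A condensed sketch of the intended usage, with the surrounding device-list iteration and error handling omitted (the function name is illustrative):

/* Illustrative usage of the lock/unlock helpers defined above. */
static void example_recover_one_device(struct amdgpu_device *adev,
				       struct amdgpu_hive_info *hive)
{
	if (!amdgpu_device_lock_adev(adev, hive))
		return;		/* another TDR is already in progress */

	/* ... suspend audio, stop schedulers, reset, resume ... */

	amdgpu_device_unlock_adev(adev);
}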
4439
+static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
4440
+{
4441
+ struct pci_dev *p = NULL;
4442
+
4443
+ p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4444
+ adev->pdev->bus->number, 1);
4445
+ if (p) {
4446
+ pm_runtime_enable(&(p->dev));
4447
+ pm_runtime_resume(&(p->dev));
4448
+ }
4449
+
4450
+ pci_dev_put(p);
4451
+}
4452
+
4453
+static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
4454
+{
4455
+ enum amd_reset_method reset_method;
4456
+ struct pci_dev *p = NULL;
4457
+ u64 expires;
4458
+
4459
+ /*
4460
+ * For now, only BACO and mode1 reset are confirmed
4461
+ * to suffer from the audio issue if not properly suspended.
4462
+ */
4463
+ reset_method = amdgpu_asic_reset_method(adev);
4464
+ if ((reset_method != AMD_RESET_METHOD_BACO) &&
4465
+ (reset_method != AMD_RESET_METHOD_MODE1))
4466
+ return -EINVAL;
4467
+
4468
+ p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4469
+ adev->pdev->bus->number, 1);
4470
+ if (!p)
4471
+ return -ENODEV;
4472
+
4473
+ expires = pm_runtime_autosuspend_expiration(&(p->dev));
4474
+ if (!expires)
4475
+ /*
4476
+ * If we cannot get the audio device autosuspend delay,
4477
+ * fall back to a fixed 4s interval. Since 3s is the
4478
+ * audio controller's default autosuspend delay setting,
4479
+ * the 4s used here is guaranteed to cover it.
4480
+ */
4481
+ expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
4482
+
4483
+ while (!pm_runtime_status_suspended(&(p->dev))) {
4484
+ if (!pm_runtime_suspend(&(p->dev)))
4485
+ break;
4486
+
4487
+ if (expires < ktime_get_mono_fast_ns()) {
4488
+ dev_warn(adev->dev, "failed to suspend display audio\n");
4489
+ pci_dev_put(p);
4490
+ /* TODO: abort the succeeding gpu reset? */
4491
+ return -ETIMEDOUT;
4492
+ }
4493
+ }
4494
+
4495
+ pm_runtime_disable(&(p->dev));
4496
+
4497
+ pci_dev_put(p);
4498
+ return 0;
4499
+}
4500
+
4501
+/**
4502
+ * amdgpu_device_gpu_recover - reset the asic and recover scheduler
4503
+ *
4504
+ * @adev: amdgpu_device pointer
4505
+ * @job: the job which triggered the hang
4506
+ *
4507
+ * Attempt to reset the GPU if it has hung (all asics).
4508
+ * Attempt to do a soft reset or a full reset and reinitialize the ASIC.
4509
+ * Returns 0 for success or an error on failure.
4510
+ */
4511
+
4512
+int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
4513
+ struct amdgpu_job *job)
4514
+{
4515
+ struct list_head device_list, *device_list_handle = NULL;
4516
+ bool need_full_reset = false;
4517
+ bool job_signaled = false;
4518
+ struct amdgpu_hive_info *hive = NULL;
4519
+ struct amdgpu_device *tmp_adev = NULL;
4520
+ int i, r = 0;
4521
+ bool need_emergency_restart = false;
4522
+ bool audio_suspended = false;
4523
+
4524
+ /*
4525
+ * Special case: RAS triggered and full reset isn't supported
4526
+ */
4527
+ need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
4528
+
4529
+ /*
4530
+ * Flush RAM to disk so that after reboot
4531
+ * the user can read the log and see why the system rebooted.
4532
+ */
4533
+ if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
4534
+ DRM_WARN("Emergency reboot.");
4535
+
4536
+ ksys_sync_helper();
4537
+ emergency_restart();
4538
+ }
4539
+
4540
+ dev_info(adev->dev, "GPU %s begin!\n",
4541
+ need_emergency_restart ? "jobs stop":"reset");
4542
+
4543
+ /*
4544
+ * Here we trylock to avoid a chain of resets executing, triggered
4545
+ * either by jobs on different adevs in an XGMI hive or by jobs on
4546
+ * different schedulers of the same device while this TO handler is running.
4547
+ * We always reset all schedulers for a device and all devices in an XGMI
4548
+ * hive, so that should take care of them too.
4549
+ */
4550
+ hive = amdgpu_get_xgmi_hive(adev);
4551
+ if (hive) {
4552
+ if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
4553
+ DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
4554
+ job ? job->base.id : -1, hive->hive_id);
4555
+ amdgpu_put_xgmi_hive(hive);
4556
+ return 0;
4557
+ }
4558
+ mutex_lock(&hive->hive_lock);
4559
+ }
4560
+
4561
+ /*
4562
+ * Build list of devices to reset.
4563
+ * In case we are in XGMI hive mode, re-sort the device list
4564
+ * to put adev in the first position.
4565
+ */
4566
+ INIT_LIST_HEAD(&device_list);
4567
+ if (adev->gmc.xgmi.num_physical_nodes > 1) {
4568
+ if (!hive)
4569
+ return -ENODEV;
4570
+ if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
4571
+ list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
4572
+ device_list_handle = &hive->device_list;
4573
+ } else {
4574
+ list_add_tail(&adev->gmc.xgmi.head, &device_list);
4575
+ device_list_handle = &device_list;
4576
+ }
4577
+
4578
+ /* block all schedulers and reset given job's ring */
4579
+ list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4580
+ if (!amdgpu_device_lock_adev(tmp_adev, hive)) {
4581
+ dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
4582
+ job ? job->base.id : -1);
4583
+ r = 0;
4584
+ goto skip_recovery;
4585
+ }
4586
+
4587
+ /*
4588
+ * Try to put the audio codec into suspend state
4589
+ * before the GPU reset starts.
4590
+ *
4591
+ * The power domain of the graphics device is shared
4592
+ * with the AZ (audio) power domain, so without this
4593
+ * we may change the audio hardware behind the audio
4594
+ * driver's back, which will trigger audio codec
4595
+ * errors.
4596
+ */
4597
+ if (!amdgpu_device_suspend_display_audio(tmp_adev))
4598
+ audio_suspended = true;
4599
+
4600
+ amdgpu_ras_set_error_query_ready(tmp_adev, false);
4601
+
4602
+ cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
4603
+
4604
+ if (!amdgpu_sriov_vf(tmp_adev))
4605
+ amdgpu_amdkfd_pre_reset(tmp_adev);
4606
+
4607
+ /*
4608
+ * Mark the ASICs to be reset as untracked first,
4609
+ * and add them back after the reset completes.
4610
+ */
4611
+ amdgpu_unregister_gpu_instance(tmp_adev);
4612
+
4613
+ amdgpu_fbdev_set_suspend(tmp_adev, 1);
4614
+
4615
+ /* disable ras on ALL IPs */
4616
+ if (!need_emergency_restart &&
4617
+ amdgpu_device_ip_need_full_reset(tmp_adev))
4618
+ amdgpu_ras_suspend(tmp_adev);
4619
+
4620
+ for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4621
+ struct amdgpu_ring *ring = tmp_adev->rings[i];
4622
+
4623
+ if (!ring || !ring->sched.thread)
4624
+ continue;
4625
+
4626
+ drm_sched_stop(&ring->sched, job ? &job->base : NULL);
4627
+
4628
+ if (need_emergency_restart)
4629
+ amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
4630
+ }
4631
+ }
4632
+
4633
+ if (need_emergency_restart)
4634
+ goto skip_sched_resume;
4635
+
4636
+ /*
4637
+ * Must check guilty signal here since after this point all old
4638
+ * HW fences are force signaled.
4639
+ *
4640
+ * job->base holds a reference to parent fence
4641
+ */
4642
+ if (job && job->base.s_fence->parent &&
4643
+ dma_fence_is_signaled(job->base.s_fence->parent)) {
4644
+ job_signaled = true;
4645
+ dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
4646
+ goto skip_hw_reset;
4647
+ }
4648
+
4649
+retry: /* Rest of adevs pre asic reset from XGMI hive. */
4650
+ list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4651
+ r = amdgpu_device_pre_asic_reset(tmp_adev,
4652
+ (tmp_adev == adev) ? job : NULL,
4653
+ &need_full_reset);
4654
+ /* TODO: Should we stop? */
4655
+ if (r) {
4656
+ dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
4657
+ r, adev_to_drm(tmp_adev)->unique);
4658
+ tmp_adev->asic_reset_res = r;
4659
+ }
4660
+ }
4661
+
4662
+ /* Actual ASIC resets if needed.*/
4663
+ /* TODO Implement XGMI hive reset logic for SRIOV */
4664
+ if (amdgpu_sriov_vf(adev)) {
4665
+ r = amdgpu_device_reset_sriov(adev, job ? false : true);
4666
+ if (r)
4667
+ adev->asic_reset_res = r;
4668
+ } else {
4669
+ r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false);
4670
+ if (r && r == -EAGAIN)
4671
+ goto retry;
4672
+ }
4673
+
4674
+skip_hw_reset:
4675
+
4676
+ /* Post ASIC reset for all devs. */
4677
+ list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4678
+
4679
+ for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4680
+ struct amdgpu_ring *ring = tmp_adev->rings[i];
4681
+
4682
+ if (!ring || !ring->sched.thread)
4683
+ continue;
4684
+
4685
+ /* No point in resubmitting jobs if we didn't do a HW reset */
4686
+ if (!tmp_adev->asic_reset_res && !job_signaled)
4687
+ drm_sched_resubmit_jobs(&ring->sched);
4688
+
4689
+ drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
4690
+ }
4691
+
4692
+ if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
4693
+ drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
4694
+ }
4695
+
4696
+ tmp_adev->asic_reset_res = 0;
4697
+
4698
+ if (r) {
4699
+ /* bad news, how to tell it to userspace ? */
4700
+ dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
4701
+ amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
4702
+ } else {
4703
+ dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
4704
+ }
4705
+ }
4706
+
4707
+skip_sched_resume:
4708
+ list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4709
+ /* Unlock KFD: SRIOV handles it separately */
4710
+ if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
4711
+ amdgpu_amdkfd_post_reset(tmp_adev);
4712
+ if (audio_suspended)
4713
+ amdgpu_device_resume_display_audio(tmp_adev);
4714
+ amdgpu_device_unlock_adev(tmp_adev);
4715
+ }
4716
+
4717
+skip_recovery:
4718
+ if (hive) {
4719
+ atomic_set(&hive->in_reset, 0);
4720
+ mutex_unlock(&hive->hive_lock);
4721
+ amdgpu_put_xgmi_hive(hive);
4722
+ }
4723
+
4724
+ if (r)
4725
+ dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
33774726 return r;
33784727 }
33794728
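For context, this entry point is normally reached from the DRM scheduler's job-timeout callback. A minimal sketch of such a caller, assuming the usual to_amdgpu_ring()/to_amdgpu_job() container_of helpers and this kernel's void-returning timedout_job hook (the function name is illustrative, and the real handler also attempts lighter per-ring soft recovery before escalating):

/* Illustrative timeout callback handing the hung job to the full
 * recovery path above.
 */
static void example_job_timedout(struct drm_sched_job *s_job)
{
	struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
	struct amdgpu_job *job = to_amdgpu_job(s_job);

	/* Stops all schedulers, resets the ASIC (or the whole XGMI hive)
	 * and resubmits the surviving jobs.
	 */
	amdgpu_device_gpu_recover(ring->adev, job);
}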
....@@ -3389,8 +4738,8 @@
33894738 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
33904739 {
33914740 struct pci_dev *pdev;
3392
- enum pci_bus_speed speed_cap;
3393
- enum pcie_link_width link_width;
4741
+ enum pci_bus_speed speed_cap, platform_speed_cap;
4742
+ enum pcie_link_width platform_link_width;
33944743
33954744 if (amdgpu_pcie_gen_cap)
33964745 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
....@@ -3406,6 +4755,12 @@
34064755 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
34074756 return;
34084757 }
4758
+
4759
+ if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
4760
+ return;
4761
+
4762
+ pcie_bandwidth_available(adev->pdev, NULL,
4763
+ &platform_speed_cap, &platform_link_width);
34094764
34104765 if (adev->pm.pcie_gen_mask == 0) {
34114766 /* asic caps */
....@@ -3432,22 +4787,20 @@
34324787 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
34334788 }
34344789 /* platform caps */
3435
- pdev = adev->ddev->pdev->bus->self;
3436
- speed_cap = pcie_get_speed_cap(pdev);
3437
- if (speed_cap == PCI_SPEED_UNKNOWN) {
4790
+ if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
34384791 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
34394792 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
34404793 } else {
3441
- if (speed_cap == PCIE_SPEED_16_0GT)
4794
+ if (platform_speed_cap == PCIE_SPEED_16_0GT)
34424795 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
34434796 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
34444797 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
34454798 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
3446
- else if (speed_cap == PCIE_SPEED_8_0GT)
4799
+ else if (platform_speed_cap == PCIE_SPEED_8_0GT)
34474800 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
34484801 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
34494802 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
3450
- else if (speed_cap == PCIE_SPEED_5_0GT)
4803
+ else if (platform_speed_cap == PCIE_SPEED_5_0GT)
34514804 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
34524805 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
34534806 else
....@@ -3456,12 +4809,10 @@
34564809 }
34574810 }
34584811 if (adev->pm.pcie_mlw_mask == 0) {
3459
- pdev = adev->ddev->pdev->bus->self;
3460
- link_width = pcie_get_width_cap(pdev);
3461
- if (link_width == PCIE_LNK_WIDTH_UNKNOWN) {
4812
+ if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
34624813 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
34634814 } else {
3464
- switch (link_width) {
4815
+ switch (platform_link_width) {
34654816 case PCIE_LNK_X32:
34664817 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
34674818 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
....@@ -3511,3 +4862,267 @@
35114862 }
35124863 }
35134864
4865
+int amdgpu_device_baco_enter(struct drm_device *dev)
4866
+{
4867
+ struct amdgpu_device *adev = drm_to_adev(dev);
4868
+ struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4869
+
4870
+ if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
4871
+ return -ENOTSUPP;
4872
+
4873
+ if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt)
4874
+ adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
4875
+
4876
+ return amdgpu_dpm_baco_enter(adev);
4877
+}
4878
+
4879
+int amdgpu_device_baco_exit(struct drm_device *dev)
4880
+{
4881
+ struct amdgpu_device *adev = drm_to_adev(dev);
4882
+ struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4883
+ int ret = 0;
4884
+
4885
+ if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
4886
+ return -ENOTSUPP;
4887
+
4888
+ ret = amdgpu_dpm_baco_exit(adev);
4889
+ if (ret)
4890
+ return ret;
4891
+
4892
+ if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt)
4893
+ adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
4894
+
4895
+ return 0;
4896
+}
4897
+
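These BACO helpers are intended to be driven from the driver's runtime power-management hooks: enter BACO when the GPU goes idle, leave it again on the next access. A rough sketch under that assumption, with the wrapper names being illustrative and error handling trimmed:

/* Illustrative runtime-PM wrappers around the BACO helpers above. */
static int example_runtime_suspend(struct drm_device *dev)
{
	/* Only meaningful on ASICs that support BACO. */
	if (!amdgpu_device_supports_baco(dev))
		return -ENOTSUPP;

	/* Keeps the power rails up but gates the GPU core. */
	return amdgpu_device_baco_enter(dev);
}

static int example_runtime_resume(struct drm_device *dev)
{
	/* Also re-enables doorbell interrupts when RAS is supported. */
	return amdgpu_device_baco_exit(dev);
}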
4898
+static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev)
4899
+{
4900
+ int i;
4901
+
4902
+ for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4903
+ struct amdgpu_ring *ring = adev->rings[i];
4904
+
4905
+ if (!ring || !ring->sched.thread)
4906
+ continue;
4907
+
4908
+ cancel_delayed_work_sync(&ring->sched.work_tdr);
4909
+ }
4910
+}
4911
+
4912
+/**
4913
+ * amdgpu_pci_error_detected - Called when a PCI error is detected.
4914
+ * @pdev: PCI device struct
4915
+ * @state: PCI channel state
4916
+ *
4917
+ * Description: Called when a PCI error is detected.
4918
+ *
4919
+ * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
4920
+ */
4921
+pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
4922
+{
4923
+ struct drm_device *dev = pci_get_drvdata(pdev);
4924
+ struct amdgpu_device *adev = drm_to_adev(dev);
4925
+ int i;
4926
+
4927
+ DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
4928
+
4929
+ if (adev->gmc.xgmi.num_physical_nodes > 1) {
4930
+ DRM_WARN("No support for XGMI hive yet...");
4931
+ return PCI_ERS_RESULT_DISCONNECT;
4932
+ }
4933
+
4934
+ switch (state) {
4935
+ case pci_channel_io_normal:
4936
+ return PCI_ERS_RESULT_CAN_RECOVER;
4937
+ /* Fatal error, prepare for slot reset */
4938
+ case pci_channel_io_frozen:
4939
+ /*
4940
+ * Cancel and wait for all TDRs in progress if we fail to
4941
+ * set adev->in_gpu_reset in amdgpu_device_lock_adev
4942
+ *
4943
+ * Locking adev->reset_sem will prevent any external access
4944
+ * to the GPU during PCI error recovery
4945
+ */
4946
+ while (!amdgpu_device_lock_adev(adev, NULL))
4947
+ amdgpu_cancel_all_tdr(adev);
4948
+
4949
+ /*
4950
+ * Block any work scheduling as we do for regular GPU reset
4951
+ * for the duration of the recovery
4952
+ */
4953
+ for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4954
+ struct amdgpu_ring *ring = adev->rings[i];
4955
+
4956
+ if (!ring || !ring->sched.thread)
4957
+ continue;
4958
+
4959
+ drm_sched_stop(&ring->sched, NULL);
4960
+ }
4961
+ return PCI_ERS_RESULT_NEED_RESET;
4962
+ case pci_channel_io_perm_failure:
4963
+ /* Permanent error, prepare for device removal */
4964
+ return PCI_ERS_RESULT_DISCONNECT;
4965
+ }
4966
+
4967
+ return PCI_ERS_RESULT_NEED_RESET;
4968
+}
4969
+
4970
+/**
4971
+ * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
4972
+ * @pdev: pointer to PCI device
4973
+ */
4974
+pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
4975
+{
4976
+
4977
+ DRM_INFO("PCI error: mmio enabled callback!!\n");
4978
+
4979
+ /* TODO - dump whatever for debugging purposes */
4980
+
4981
+ /* This is called only if amdgpu_pci_error_detected returns
4982
+ * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
4983
+ * works, no need to reset slot.
4984
+ */
4985
+
4986
+ return PCI_ERS_RESULT_RECOVERED;
4987
+}
4988
+
4989
+/**
4990
+ * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
4991
+ * @pdev: PCI device struct
4992
+ *
4993
+ * Description: This routine is called by the pci error recovery
4994
+ * code after the PCI slot has been reset, just before we
4995
+ * should resume normal operations.
4996
+ */
4997
+pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
4998
+{
4999
+ struct drm_device *dev = pci_get_drvdata(pdev);
5000
+ struct amdgpu_device *adev = drm_to_adev(dev);
5001
+ int r, i;
5002
+ bool need_full_reset = true;
5003
+ u32 memsize;
5004
+ struct list_head device_list;
5005
+
5006
+ DRM_INFO("PCI error: slot reset callback!!\n");
5007
+
5008
+ INIT_LIST_HEAD(&device_list);
5009
+ list_add_tail(&adev->gmc.xgmi.head, &device_list);
5010
+
5011
+ /* wait for asic to come out of reset */
5012
+ msleep(500);
5013
+
5014
+ /* Restore PCI confspace */
5015
+ amdgpu_device_load_pci_state(pdev);
5016
+
5017
+ /* confirm ASIC came out of reset */
5018
+ for (i = 0; i < adev->usec_timeout; i++) {
5019
+ memsize = amdgpu_asic_get_config_memsize(adev);
5020
+
5021
+ if (memsize != 0xffffffff)
5022
+ break;
5023
+ udelay(1);
5024
+ }
5025
+ if (memsize == 0xffffffff) {
5026
+ r = -ETIME;
5027
+ goto out;
5028
+ }
5029
+
5030
+ adev->in_pci_err_recovery = true;
5031
+ r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset);
5032
+ adev->in_pci_err_recovery = false;
5033
+ if (r)
5034
+ goto out;
5035
+
5036
+ r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true);
5037
+
5038
+out:
5039
+ if (!r) {
5040
+ if (amdgpu_device_cache_pci_state(adev->pdev))
5041
+ pci_restore_state(adev->pdev);
5042
+
5043
+ DRM_INFO("PCIe error recovery succeeded\n");
5044
+ } else {
5045
+ DRM_ERROR("PCIe error recovery failed, err:%d", r);
5046
+ amdgpu_device_unlock_adev(adev);
5047
+ }
5048
+
5049
+ return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5050
+}
5051
+
5052
+/**
5053
+ * amdgpu_pci_resume() - resume normal ops after PCI reset
5054
+ * @pdev: pointer to PCI device
5055
+ *
5056
+ * Called when the error recovery driver tells us that it is
5057
+ * OK to resume normal operation. Restart the scheduler rings
5058
+ * that were stopped during error detection.
5059
+ */
5060
+void amdgpu_pci_resume(struct pci_dev *pdev)
5061
+{
5062
+ struct drm_device *dev = pci_get_drvdata(pdev);
5063
+ struct amdgpu_device *adev = drm_to_adev(dev);
5064
+ int i;
5065
+
5066
+
5067
+ DRM_INFO("PCI error: resume callback!!\n");
5068
+
5069
+ for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5070
+ struct amdgpu_ring *ring = adev->rings[i];
5071
+
5072
+ if (!ring || !ring->sched.thread)
5073
+ continue;
5074
+
5075
+
5076
+ drm_sched_resubmit_jobs(&ring->sched);
5077
+ drm_sched_start(&ring->sched, true);
5078
+ }
5079
+
5080
+ amdgpu_device_unlock_adev(adev);
5081
+}
5082
+
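The four callbacks above only take effect once they are wired into a struct pci_error_handlers referenced from the driver's pci_driver. Roughly, under the assumption that the table lives in the driver registration code (the struct name below is illustrative):

/* Illustrative wiring of the PCI error recovery callbacks defined above. */
static const struct pci_error_handlers example_pci_err_handlers = {
	.error_detected	= amdgpu_pci_error_detected,
	.mmio_enabled	= amdgpu_pci_mmio_enabled,
	.slot_reset	= amdgpu_pci_slot_reset,
	.resume		= amdgpu_pci_resume,
};

/* ...hooked up via the pci_driver: .err_handler = &example_pci_err_handlers */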
5083
+bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5084
+{
5085
+ struct drm_device *dev = pci_get_drvdata(pdev);
5086
+ struct amdgpu_device *adev = drm_to_adev(dev);
5087
+ int r;
5088
+
5089
+ r = pci_save_state(pdev);
5090
+ if (!r) {
5091
+ kfree(adev->pci_state);
5092
+
5093
+ adev->pci_state = pci_store_saved_state(pdev);
5094
+
5095
+ if (!adev->pci_state) {
5096
+ DRM_ERROR("Failed to store PCI saved state");
5097
+ return false;
5098
+ }
5099
+ } else {
5100
+ DRM_WARN("Failed to save PCI state, err:%d\n", r);
5101
+ return false;
5102
+ }
5103
+
5104
+ return true;
5105
+}
5106
+
5107
+bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
5108
+{
5109
+ struct drm_device *dev = pci_get_drvdata(pdev);
5110
+ struct amdgpu_device *adev = drm_to_adev(dev);
5111
+ int r;
5112
+
5113
+ if (!adev->pci_state)
5114
+ return false;
5115
+
5116
+ r = pci_load_saved_state(pdev, adev->pci_state);
5117
+
5118
+ if (!r) {
5119
+ pci_restore_state(pdev);
5120
+ } else {
5121
+ DRM_WARN("Failed to load PCI state, err:%d\n", r);
5122
+ return false;
5123
+ }
5124
+
5125
+ return true;
5126
+}
5127
+
5128
+