drm/amdgpu: Block all job scheduling activity during DPC recovery
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>

#include <drm/drm_atomic_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/pci.h>
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/sienna_cichlid_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navy_flounder_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"NAVI10",
	"NAVI14",
	"NAVI12",
	"SIENNA_CICHLID",
	"NAVY_FLOUNDER",
	"LAST",
};

/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as the sum of the NAKs generated and the NAKs received.
 */

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
		amdgpu_device_get_pcie_replay_count, NULL);

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);

/**
 * DOC: product_name
 *
 * The amdgpu driver provides a sysfs API for reporting the product name
 * for the device.
 * The file product_name is used for this and returns the product name
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_product_name(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name);
}

static DEVICE_ATTR(product_name, S_IRUGO,
		amdgpu_device_get_product_name, NULL);

/**
 * DOC: product_number
 *
 * The amdgpu driver provides a sysfs API for reporting the part number
 * for the device.
 * The file product_number is used for this and returns the part number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_product_number(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number);
}

static DEVICE_ATTR(product_number, S_IRUGO,
		amdgpu_device_get_product_number, NULL);

/**
 * DOC: serial_number
 *
 * The amdgpu driver provides a sysfs API for reporting the serial number
 * for the device.
 * The file serial_number is used for this and returns the serial number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_serial_number(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial);
}

static DEVICE_ATTR(serial_number, S_IRUGO,
		amdgpu_device_get_serial_number, NULL);

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with HG/PX power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->flags & AMD_IS_PX)
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise return false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

/**
 * VRAM access helper functions.
 *
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size in bytes, @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       uint32_t *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0;
	uint64_t last;


#ifdef CONFIG_64BIT
	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		void __iomem *addr = adev->mman.aper_base_kaddr + pos;
		size_t count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			mb();
			amdgpu_asic_flush_hdp(adev, NULL);
		} else {
			amdgpu_asic_invalidate_hdp(adev, NULL);
			mb();
			memcpy_fromio(buf, addr, count);
		}

		if (count == size)
			return;

		pos += count;
		buf += count / 4;
		size -= count;
	}
#endif

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		uint32_t tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *buf++);
		else
			*buf++ = RREG32_NO_KIQ(mmMM_DATA);
	}
	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
}

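/*
 * Minimal usage sketch (not part of the driver): reading a single dword from
 * VRAM with the helper above. The wrapper name is hypothetical and only
 * illustrates the calling convention (false = read into the system buffer).
 */
static inline uint32_t amdgpu_device_vram_read_dword_example(struct amdgpu_device *adev,
							      loff_t pos)
{
	uint32_t val = 0;

	amdgpu_device_vram_access(adev, pos, &val, sizeof(val), false);
	return val;
}
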
/*
 * MMIO register access helper functions.
 */
/**
 * amdgpu_mm_rreg - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
			uint32_t acc_flags)
{
	uint32_t ret;

	if (adev->in_pci_err_recovery)
		return 0;

	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev) &&
	    down_read_trylock(&adev->reset_sem)) {
		ret = amdgpu_kiq_rreg(adev, reg);
		up_read(&adev->reset_sem);
		return ret;
	}

	if ((reg * 4) < adev->rmmio_size)
		ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
	else {
		unsigned long flags;

		spin_lock_irqsave(&adev->mmio_idx_lock, flags);
		writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4));
		ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
		spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	}

	trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret);
	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (adev->in_pci_err_recovery)
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (adev->in_pci_err_recovery)
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

static inline void amdgpu_mm_wreg_mmio(struct amdgpu_device *adev,
				       uint32_t reg, uint32_t v,
				       uint32_t acc_flags)
{
	if (adev->in_pci_err_recovery)
		return;

	trace_amdgpu_mm_wreg(adev->pdev->device, reg, v);

	if ((reg * 4) < adev->rmmio_size)
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	else {
		unsigned long flags;

		spin_lock_irqsave(&adev->mmio_idx_lock, flags);
		writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4));
		writel(v, ((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
		spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	}
}

/**
 * amdgpu_mm_wreg - write to a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
		    uint32_t acc_flags)
{
	if (adev->in_pci_err_recovery)
		return;

	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev) &&
	    down_read_trylock(&adev->reset_sem)) {
		amdgpu_kiq_wreg(adev, reg, v);
		up_read(&adev->reset_sem);
		return;
	}

	amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags);
}

/*
 * amdgpu_mm_wreg_mmio_rlc - write a register either with mmio or with the RLC path if in range
 *
 * This function is invoked only for the debugfs register access.
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
			     uint32_t acc_flags)
{
	if (adev->in_pci_err_recovery)
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v);
	}

	amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags);
}

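/*
 * Minimal usage sketch (not part of the driver): a read-modify-write of a
 * dword register through the MMIO helpers above. The register offset and bit
 * are hypothetical; real callers normally go through the RREG32/WREG32
 * convenience macros rather than calling these helpers directly.
 */
static inline void amdgpu_mm_rmw_example(struct amdgpu_device *adev)
{
	uint32_t v;

	v = amdgpu_mm_rreg(adev, 0x1234, 0);	/* dword-aligned offset */
	v |= 0x1;				/* set a hypothetical enable bit */
	amdgpu_mm_wreg(adev, 0x1234, v, 0);
}
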
/**
 * amdgpu_io_rreg - read an IO register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 *
 * Returns the 32 bit value from the offset specified.
 */
u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
{
	if (adev->in_pci_err_recovery)
		return 0;

	if ((reg * 4) < adev->rio_mem_size)
		return ioread32(adev->rio_mem + (reg * 4));
	else {
		iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
		return ioread32(adev->rio_mem + (mmMM_DATA * 4));
	}
}

/**
 * amdgpu_io_wreg - write to an IO register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
{
	if (adev->in_pci_err_recovery)
		return;

	if ((reg * 4) < adev->rio_mem_size)
		iowrite32(v, adev->rio_mem + (reg * 4));
	else {
		iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
		iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
	}
}

/**
 * amdgpu_mm_rdoorbell - read a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (CIK).
 */
u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
{
	if (adev->in_pci_err_recovery)
		return 0;

	if (index < adev->doorbell.num_doorbells) {
		return readl(adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}

/**
 * amdgpu_mm_wdoorbell - write a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (CIK).
 */
void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
{
	if (adev->in_pci_err_recovery)
		return;

	if (index < adev->doorbell.num_doorbells) {
		writel(v, adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

/**
 * amdgpu_mm_rdoorbell64 - read a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
{
	if (adev->in_pci_err_recovery)
		return 0;

	if (index < adev->doorbell.num_doorbells) {
		return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}

/**
 * amdgpu_mm_wdoorbell64 - write a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
{
	if (adev->in_pci_err_recovery)
		return;

	if (index < adev->doorbell.num_doorbells) {
		atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy 64 bit reg write function
 *
 * @adev: amdgpu device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	amdgpu_asic_pre_asic_init(adev);

	return amdgpu_atom_asic_init(adev->mode_info.atom_context);
}

/**
 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
				       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
				       &adev->vram_scratch.robj,
				       &adev->vram_scratch.gpu_addr,
				       (void **)&adev->vram_scratch.ptr);
}

/**
 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

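/*
 * Illustrative sketch (not part of the driver): a golden-register list is a
 * flat array of {register, and_mask, or_mask} triplets. The offsets and masks
 * below are hypothetical, not a real ASIC table. With an and_mask of
 * 0xffffffff the register is simply overwritten with or_mask; otherwise the
 * masked bits are cleared and or_mask is OR-ed in.
 */
static const u32 example_golden_settings[] = {
	0x000098f8, 0xffffffff, 0x00000000,	/* full overwrite */
	0x0000309c, 0x0000000f, 0x00000004,	/* read-modify-write of low nibble */
};

static void amdgpu_device_program_example_golden_regs(struct amdgpu_device *adev)
{
	amdgpu_device_program_register_sequence(adev, example_golden_settings,
						ARRAY_SIZE(example_golden_settings));
}
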
/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/*
 * GPU doorbell aperture helper functions.
 */
/**
 * amdgpu_device_doorbell_init - Init doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Init doorbell driver information (CIK)
 * Returns 0 on success, error on failure.
 */
static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
{

	/* No doorbell on SI hardware generation */
	if (adev->asic_type < CHIP_BONAIRE) {
		adev->doorbell.base = 0;
		adev->doorbell.size = 0;
		adev->doorbell.num_doorbells = 0;
		adev->doorbell.ptr = NULL;
		return 0;
	}

	if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
		return -EINVAL;

	amdgpu_asic_init_doorbell_index(adev);

	/* doorbell bar mapping */
	adev->doorbell.base = pci_resource_start(adev->pdev, 2);
	adev->doorbell.size = pci_resource_len(adev->pdev, 2);

	adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
					     adev->doorbell_index.max_assignment+1);
	if (adev->doorbell.num_doorbells == 0)
		return -EINVAL;

	/* For Vega, reserve and map two pages on doorbell BAR since SDMA
	 * paging queue doorbell use the second page. The
	 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
	 * doorbells are in the first page. So with paging queue enabled,
	 * the max num_doorbells should be increased by one page (0x400 in dword).
	 */
	if (adev->asic_type >= CHIP_VEGA10)
		adev->doorbell.num_doorbells += 0x400;

	adev->doorbell.ptr = ioremap(adev->doorbell.base,
				     adev->doorbell.num_doorbells *
				     sizeof(u32));
	if (adev->doorbell.ptr == NULL)
		return -ENOMEM;

	return 0;
}

/**
 * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Tear down doorbell driver information (CIK)
 */
static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
{
	iounmap(adev->doorbell.ptr);
	adev->doorbell.ptr = NULL;
}


/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or a negative error code on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);

	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	wb >>= 3;
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
}

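/*
 * Minimal usage sketch (not part of the driver): a ring or IP block grabs a
 * writeback slot, derives the CPU view and GPU address of that dword, and
 * releases the slot at teardown. The helper name is hypothetical.
 */
static int amdgpu_device_wb_usage_example(struct amdgpu_device *adev)
{
	u32 wb;
	u64 gpu_addr;
	int r;

	r = amdgpu_device_wb_get(adev, &wb);	/* wb is a dword offset */
	if (r)
		return r;

	adev->wb.wb[wb] = 0;			/* CPU-side view of the slot */
	gpu_addr = adev->wb.gpu_addr + wb * 4;	/* address the GPU writes to */
	(void)gpu_addr;

	amdgpu_device_wb_free(adev, wb);
	return 0;
}
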
/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the size we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size);
	u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1;
	struct pci_bus *root;
	struct resource *res;
	unsigned i;
	u16 cmd;
	int r;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_device_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_device_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

/*
 * GPU helpers function.
 */
/**
 * amdgpu_device_need_post - check if the hw need post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if need or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
		 * some old smc fw still need driver do vPost otherwise gpu hang, while
		 * those smc fw version above 22.15 doesn't have this flaw, so we force
		 * vpost executed for smc version below 22.15
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @cookie: amdgpu_device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
{
	struct amdgpu_device *adev = cookie;

	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines number of bits in page table versus page directory,
 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
 * page table and the remaining bits are in the page directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}

/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
	struct sysinfo si;
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;

	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("Smu memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("Not enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}

/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
 */
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
{
	if (amdgpu_sched_jobs < 4) {
		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = 4;
	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
	}

	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
		/* gart size must be greater or equal to 32M */
		dev_warn(adev->dev, "gart size (%d) too small\n",
			 amdgpu_gart_size);
		amdgpu_gart_size = -1;
	}

	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
		/* gtt size must be greater or equal to 32M */
		dev_warn(adev->dev, "gtt size (%d) too small\n",
			 amdgpu_gtt_size);
		amdgpu_gtt_size = -1;
	}

	/* valid range is between 4 and 9 inclusive */
	if (amdgpu_vm_fragment_size != -1 &&
	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
		dev_warn(adev->dev, "valid range is between 4 and 9\n");
		amdgpu_vm_fragment_size = -1;
	}

	if (amdgpu_sched_hw_submission < 2) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = 2;
	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
	}

	amdgpu_device_check_smu_prv_buffer_size(adev);

	amdgpu_device_check_vm_size(adev);

	amdgpu_device_check_block_size(adev);

	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

	amdgpu_gmc_tmz_set(adev);

	if (amdgpu_num_kcq > 8 || amdgpu_num_kcq < 0) {
		amdgpu_num_kcq = 8;
		dev_warn(adev->dev, "set kernel compute queue number to 8 due to invalid parameter provided by user\n");
	}

	return 0;
}

/**
 * amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver. Suspends or resumes the
 * asics before or after it is powered up using ACPI methods.
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
					enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	int r;

	if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF)
		return;

	if (state == VGA_SWITCHEROO_ON) {
		pr_info("switched on\n");
		/* don't suspend or resume card normally */
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

		pci_set_power_state(dev->pdev, PCI_D0);
		pci_restore_state(dev->pdev);
		r = pci_enable_device(dev->pdev);
		if (r)
			DRM_WARN("pci_enable_device failed (%d)\n", r);
		amdgpu_device_resume(dev, true);

		dev->switch_power_state = DRM_SWITCH_POWER_ON;
		drm_kms_helper_poll_enable(dev);
	} else {
		pr_info("switched off\n");
		drm_kms_helper_poll_disable(dev);
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
		amdgpu_device_suspend(dev, true);
		pci_save_state(dev->pdev);
		/* Shut down the device */
		pci_disable_device(dev->pdev);
		pci_set_power_state(dev->pdev, PCI_D3cold);
		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
	}
}

/**
 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
 *
 * @pdev: pci dev pointer
 *
 * Callback for the switcheroo driver. Check if the switcheroo
 * state can be changed.
 * Returns true if the state can be changed, false if not.
 */
static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

	/*
	 * FIXME: open_count is protected by drm_global_mutex but that would lead to
	 * locking inversion with the driver load path. And the access here is
	 * completely racy anyway. So don't bother with locking for now.
	 */
	return atomic_read(&dev->open_count) == 0;
}

static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
};

/**
 * amdgpu_device_ip_set_clockgating_state - set the CG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: clockgating state (gate or ungate)
 *
 * Sets the requested clockgating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_clockgating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_clockgating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

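/*
 * Minimal usage sketch (not part of the driver): gating the clocks of every
 * GFX IP instance on the device. AMD_IP_BLOCK_TYPE_GFX and AMD_CG_STATE_GATE
 * are existing enum values; the call site itself is illustrative only.
 */
static int amdgpu_device_gate_gfx_clocks_example(struct amdgpu_device *adev)
{
	return amdgpu_device_ip_set_clockgating_state(adev,
						      AMD_IP_BLOCK_TYPE_GFX,
						      AMD_CG_STATE_GATE);
}
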
/**
 * amdgpu_device_ip_set_powergating_state - set the PG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: powergating state (gate or ungate)
 *
 * Sets the requested powergating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u32 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
			if (r)
				return r;
			break;
		}
	}
	return 0;
}

/**
 * amdgpu_device_ip_is_idle - is the hardware IP idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is idle or not.
 * Returns true if the IP is idle, false if not.
 */
bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
			      enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
	}
	return true;
}

/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * return 0 if equal or greater
 * return 1 if smaller or the ip_block doesn't exist
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			((ip_block->version->major == major) &&
			(ip_block->version->minor >= minor))))
		return 0;

	return 1;
}

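/*
 * Minimal usage sketch (not part of the driver): checking whether the GMC IP
 * block on this asic is at least version 8.1 before taking a code path that
 * depends on it. The version numbers here are purely illustrative.
 */
static bool amdgpu_device_gmc_is_at_least_8_1_example(struct amdgpu_device *adev)
{
	return amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_GMC,
						  8, 1) == 0;
}
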
/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		 ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}

/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display. This feature provides a virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
 */
static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
{
	adev->enable_virtual_display = false;

	if (amdgpu_virtual_display) {
		struct drm_device *ddev = adev_to_drm(adev);
		const char *pci_address_name = pci_name(ddev->pdev);
		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;

		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
		pciaddstr_tmp = pciaddstr;
		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
			pciaddname = strsep(&pciaddname_tmp, ",");
			if (!strcmp("all", pciaddname)
			    || !strcmp(pci_address_name, pciaddname)) {
				long num_crtc;
				int res = -1;

				adev->enable_virtual_display = true;

				if (pciaddname_tmp)
					res = kstrtol(pciaddname_tmp, 10,
						      &num_crtc);

				if (!res) {
					if (num_crtc < 1)
						num_crtc = 1;
					if (num_crtc > 6)
						num_crtc = 6;
					adev->mode_info.num_crtc = num_crtc;
				} else {
					adev->mode_info.num_crtc = 1;
				}
				break;
			}
		}

		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
			 amdgpu_virtual_display, pci_address_name,
			 adev->enable_virtual_display, adev->mode_info.num_crtc);

		kfree(pciaddstr);
	}
}

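/*
 * Example (illustrative, hypothetical PCI addresses): booting with
 *   amdgpu.virtual_display=0000:03:00.0,2;0000:04:00.0,1
 * enables the virtual display path on the two listed devices with two and one
 * virtual crtcs respectively, matching the "address,num_crtc;" format parsed
 * above; "all" matches every device.
 */
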
e3ecdffa
AD
1614/**
1615 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1616 *
1617 * @adev: amdgpu_device pointer
1618 *
1619 * Parses the asic configuration parameters specified in the gpu info
1620 * firmware and makes them availale to the driver for use in configuring
1621 * the asic.
1622 * Returns 0 on success, -EINVAL on failure.
1623 */
e2a75f88
AD
1624static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1625{
e2a75f88 1626 const char *chip_name;
c0a43457 1627 char fw_name[40];
e2a75f88
AD
1628 int err;
1629 const struct gpu_info_firmware_header_v1_0 *hdr;
1630
ab4fe3e1
HR
1631 adev->firmware.gpu_info_fw = NULL;
1632
72de33f8 1633 if (adev->mman.discovery_bin) {
258620d0 1634 amdgpu_discovery_get_gfx_info(adev);
cc375d8c
TY
1635
1636 /*
1637 * FIXME: The bounding box is still needed by Navi12, so
1638 * temporarily read it from gpu_info firmware. Should be droped
1639 * when DAL no longer needs it.
1640 */
1641 if (adev->asic_type != CHIP_NAVI12)
1642 return 0;
258620d0
AD
1643 }
1644
e2a75f88 1645 switch (adev->asic_type) {
e2a75f88
AD
1646#ifdef CONFIG_DRM_AMDGPU_SI
1647 case CHIP_VERDE:
1648 case CHIP_TAHITI:
1649 case CHIP_PITCAIRN:
1650 case CHIP_OLAND:
1651 case CHIP_HAINAN:
1652#endif
1653#ifdef CONFIG_DRM_AMDGPU_CIK
1654 case CHIP_BONAIRE:
1655 case CHIP_HAWAII:
1656 case CHIP_KAVERI:
1657 case CHIP_KABINI:
1658 case CHIP_MULLINS:
1659#endif
da87c30b
AD
1660 case CHIP_TOPAZ:
1661 case CHIP_TONGA:
1662 case CHIP_FIJI:
1663 case CHIP_POLARIS10:
1664 case CHIP_POLARIS11:
1665 case CHIP_POLARIS12:
1666 case CHIP_VEGAM:
1667 case CHIP_CARRIZO:
1668 case CHIP_STONEY:
27c0bc71 1669 case CHIP_VEGA20:
e2a75f88
AD
1670 default:
1671 return 0;
1672 case CHIP_VEGA10:
1673 chip_name = "vega10";
1674 break;
3f76dced
AD
1675 case CHIP_VEGA12:
1676 chip_name = "vega12";
1677 break;
2d2e5e7e 1678 case CHIP_RAVEN:
54f78a76 1679 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
54c4d17e 1680 chip_name = "raven2";
54f78a76 1681 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
741deade 1682 chip_name = "picasso";
54c4d17e
FX
1683 else
1684 chip_name = "raven";
2d2e5e7e 1685 break;
65e60f6e
LM
1686 case CHIP_ARCTURUS:
1687 chip_name = "arcturus";
1688 break;
b51a26a0
HR
1689 case CHIP_RENOIR:
1690 chip_name = "renoir";
1691 break;
23c6268e
HR
1692 case CHIP_NAVI10:
1693 chip_name = "navi10";
1694 break;
ed42cfe1
XY
1695 case CHIP_NAVI14:
1696 chip_name = "navi14";
1697 break;
42b325e5
XY
1698 case CHIP_NAVI12:
1699 chip_name = "navi12";
1700 break;
c0a43457
LG
1701 case CHIP_SIENNA_CICHLID:
1702 chip_name = "sienna_cichlid";
1703 break;
120eb833
JC
1704 case CHIP_NAVY_FLOUNDER:
1705 chip_name = "navy_flounder";
1706 break;
e2a75f88
AD
1707 }
1708
1709 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
ab4fe3e1 1710 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
e2a75f88
AD
1711 if (err) {
1712 dev_err(adev->dev,
1713 "Failed to load gpu_info firmware \"%s\"\n",
1714 fw_name);
1715 goto out;
1716 }
ab4fe3e1 1717 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
e2a75f88
AD
1718 if (err) {
1719 dev_err(adev->dev,
1720 "Failed to validate gpu_info firmware \"%s\"\n",
1721 fw_name);
1722 goto out;
1723 }
1724
ab4fe3e1 1725 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
e2a75f88
AD
1726 amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1727
1728 switch (hdr->version_major) {
1729 case 1:
1730 {
1731 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
ab4fe3e1 1732 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
e2a75f88
AD
1733 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1734
cc375d8c
TY
1735 /*
1736 * Should be droped when DAL no longer needs it.
1737 */
1738 if (adev->asic_type == CHIP_NAVI12)
ec51d3fa
XY
1739 goto parse_soc_bounding_box;
1740
b5ab16bf
AD
1741 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1742 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1743 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1744 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
e2a75f88 1745 adev->gfx.config.max_texture_channel_caches =
b5ab16bf
AD
1746 le32_to_cpu(gpu_info_fw->gc_num_tccs);
1747 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1748 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1749 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1750 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
e2a75f88 1751 adev->gfx.config.double_offchip_lds_buf =
b5ab16bf
AD
1752 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1753 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
51fd0370
HZ
1754 adev->gfx.cu_info.max_waves_per_simd =
1755 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1756 adev->gfx.cu_info.max_scratch_slots_per_cu =
1757 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
1758 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
48321c3d 1759 if (hdr->version_minor >= 1) {
35c2e910
HZ
1760 const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1761 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1762 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1763 adev->gfx.config.num_sc_per_sh =
1764 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
1765 adev->gfx.config.num_packer_per_sc =
1766 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
1767 }
ec51d3fa
XY
1768
1769parse_soc_bounding_box:
ec51d3fa
XY
1770 /*
1771 * soc bounding box info is not integrated in the discovery table,
258620d0 1772 * so we always need to parse it from the gpu info firmware when needed.
ec51d3fa 1773 */
48321c3d
HW
1774 if (hdr->version_minor == 2) {
1775 const struct gpu_info_firmware_v1_2 *gpu_info_fw =
1776 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
1777 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1778 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
1779 }
e2a75f88
AD
1780 break;
1781 }
1782 default:
1783 dev_err(adev->dev,
1784 "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
1785 err = -EINVAL;
1786 goto out;
1787 }
1788out:
e2a75f88
AD
1789 return err;
1790}
1791
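/*
 * Reader's note (illustrative, not part of the driver): the gpu_info image
 * parsed above is a common header followed by a versioned payload located at
 * hdr->header.ucode_array_offset_bytes.  Minor revisions only append fields,
 * which is why the same payload can be cast to the v1_0, v1_1 and v1_2
 * structs depending on hdr->version_minor, roughly:
 *
 *	const u8 *payload = adev->firmware.gpu_info_fw->data +
 *			    le32_to_cpu(hdr->header.ucode_array_offset_bytes);
 *	const struct gpu_info_firmware_v1_0 *v1_0 = (const void *)payload;
 *	const struct gpu_info_firmware_v1_1 *v1_1 = (const void *)payload;
 *
 * Only the fields covered by the reported minor version may be read.
 */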
e3ecdffa
AD
1792/**
1793 * amdgpu_device_ip_early_init - run early init for hardware IPs
1794 *
1795 * @adev: amdgpu_device pointer
1796 *
1797 * Early initialization pass for hardware IPs. The hardware IPs that make
1798 * up each asic are discovered and each IP's early_init callback is run. This
1799 * is the first stage in initializing the asic.
1800 * Returns 0 on success, negative error code on failure.
1801 */
06ec9070 1802static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
d38ceaf9 1803{
aaa36a97 1804 int i, r;
d38ceaf9 1805
483ef985 1806 amdgpu_device_enable_virtual_display(adev);
a6be7570 1807
00a979f3 1808 if (amdgpu_sriov_vf(adev)) {
00a979f3 1809 r = amdgpu_virt_request_full_gpu(adev, true);
aaa36a97
AD
1810 if (r)
1811 return r;
00a979f3
WS
1812 }
1813
d38ceaf9 1814 switch (adev->asic_type) {
33f34802
KW
1815#ifdef CONFIG_DRM_AMDGPU_SI
1816 case CHIP_VERDE:
1817 case CHIP_TAHITI:
1818 case CHIP_PITCAIRN:
1819 case CHIP_OLAND:
1820 case CHIP_HAINAN:
295d0daf 1821 adev->family = AMDGPU_FAMILY_SI;
33f34802
KW
1822 r = si_set_ip_blocks(adev);
1823 if (r)
1824 return r;
1825 break;
1826#endif
a2e73f56
AD
1827#ifdef CONFIG_DRM_AMDGPU_CIK
1828 case CHIP_BONAIRE:
1829 case CHIP_HAWAII:
1830 case CHIP_KAVERI:
1831 case CHIP_KABINI:
1832 case CHIP_MULLINS:
e1ad2d53 1833 if (adev->flags & AMD_IS_APU)
a2e73f56 1834 adev->family = AMDGPU_FAMILY_KV;
e1ad2d53
AD
1835 else
1836 adev->family = AMDGPU_FAMILY_CI;
a2e73f56
AD
1837
1838 r = cik_set_ip_blocks(adev);
1839 if (r)
1840 return r;
1841 break;
1842#endif
da87c30b
AD
1843 case CHIP_TOPAZ:
1844 case CHIP_TONGA:
1845 case CHIP_FIJI:
1846 case CHIP_POLARIS10:
1847 case CHIP_POLARIS11:
1848 case CHIP_POLARIS12:
1849 case CHIP_VEGAM:
1850 case CHIP_CARRIZO:
1851 case CHIP_STONEY:
1852 if (adev->flags & AMD_IS_APU)
1853 adev->family = AMDGPU_FAMILY_CZ;
1854 else
1855 adev->family = AMDGPU_FAMILY_VI;
1856
1857 r = vi_set_ip_blocks(adev);
1858 if (r)
1859 return r;
1860 break;
e48a3cd9
AD
1861 case CHIP_VEGA10:
1862 case CHIP_VEGA12:
e4bd8170 1863 case CHIP_VEGA20:
e48a3cd9 1864 case CHIP_RAVEN:
61cf44c1 1865 case CHIP_ARCTURUS:
b51a26a0 1866 case CHIP_RENOIR:
70534d1e 1867 if (adev->flags & AMD_IS_APU)
2ca8a5d2
CZ
1868 adev->family = AMDGPU_FAMILY_RV;
1869 else
1870 adev->family = AMDGPU_FAMILY_AI;
460826e6
KW
1871
1872 r = soc15_set_ip_blocks(adev);
1873 if (r)
1874 return r;
1875 break;
0a5b8c7b 1876 case CHIP_NAVI10:
7ecb5cd4 1877 case CHIP_NAVI14:
4808cf9c 1878 case CHIP_NAVI12:
11e8aef5 1879 case CHIP_SIENNA_CICHLID:
41f446bf 1880 case CHIP_NAVY_FLOUNDER:
0a5b8c7b
HR
1881 adev->family = AMDGPU_FAMILY_NV;
1882
1883 r = nv_set_ip_blocks(adev);
1884 if (r)
1885 return r;
1886 break;
d38ceaf9
AD
1887 default:
1888 /* FIXME: not supported yet */
1889 return -EINVAL;
1890 }
1891
1884734a 1892 amdgpu_amdkfd_device_probe(adev);
1893
3b94fb10 1894 adev->pm.pp_feature = amdgpu_pp_feature_mask;
a35ad98b 1895 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
00544006 1896 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
00f54b97 1897
d38ceaf9
AD
1898 for (i = 0; i < adev->num_ip_blocks; i++) {
1899 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
ed8cf00c
HR
1900 DRM_ERROR("disabled ip block: %d <%s>\n",
1901 i, adev->ip_blocks[i].version->funcs->name);
a1255107 1902 adev->ip_blocks[i].status.valid = false;
d38ceaf9 1903 } else {
a1255107
AD
1904 if (adev->ip_blocks[i].version->funcs->early_init) {
1905 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2c1a2784 1906 if (r == -ENOENT) {
a1255107 1907 adev->ip_blocks[i].status.valid = false;
2c1a2784 1908 } else if (r) {
a1255107
AD
1909 DRM_ERROR("early_init of IP block <%s> failed %d\n",
1910 adev->ip_blocks[i].version->funcs->name, r);
d38ceaf9 1911 return r;
2c1a2784 1912 } else {
a1255107 1913 adev->ip_blocks[i].status.valid = true;
2c1a2784 1914 }
974e6b64 1915 } else {
a1255107 1916 adev->ip_blocks[i].status.valid = true;
d38ceaf9 1917 }
d38ceaf9 1918 }
21a249ca
AD
1919 /* get the vbios after the asic_funcs are set up */
1920 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
6e29c227
AD
1921 r = amdgpu_device_parse_gpu_info_fw(adev);
1922 if (r)
1923 return r;
1924
21a249ca
AD
1925 /* Read BIOS */
1926 if (!amdgpu_get_bios(adev))
1927 return -EINVAL;
1928
1929 r = amdgpu_atombios_init(adev);
1930 if (r) {
1931 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
1932 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
1933 return r;
1934 }
1935 }
d38ceaf9
AD
1936 }
1937
395d1fb9
NH
1938 adev->cg_flags &= amdgpu_cg_mask;
1939 adev->pg_flags &= amdgpu_pg_mask;
1940
d38ceaf9
AD
1941 return 0;
1942}
1943
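/*
 * Reader's note (illustrative): the amdgpu_ip_block_mask check above treats
 * the module parameter as a bitmask indexed by IP block position, so bit i
 * controls adev->ip_blocks[i].  For example (hypothetical block layout),
 * booting with
 *
 *	amdgpu.ip_block_mask=0xfffffffd
 *
 * clears bit 1 and therefore marks ip_blocks[1] as invalid; its early_init
 * and every later init/fini callback is skipped for that block.
 */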
0a4f2520
RZ
1944static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
1945{
1946 int i, r;
1947
1948 for (i = 0; i < adev->num_ip_blocks; i++) {
1949 if (!adev->ip_blocks[i].status.sw)
1950 continue;
1951 if (adev->ip_blocks[i].status.hw)
1952 continue;
1953 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2d11fd3f 1954 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
0a4f2520
RZ
1955 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
1956 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
1957 if (r) {
1958 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
1959 adev->ip_blocks[i].version->funcs->name, r);
1960 return r;
1961 }
1962 adev->ip_blocks[i].status.hw = true;
1963 }
1964 }
1965
1966 return 0;
1967}
1968
1969static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
1970{
1971 int i, r;
1972
1973 for (i = 0; i < adev->num_ip_blocks; i++) {
1974 if (!adev->ip_blocks[i].status.sw)
1975 continue;
1976 if (adev->ip_blocks[i].status.hw)
1977 continue;
1978 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
1979 if (r) {
1980 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
1981 adev->ip_blocks[i].version->funcs->name, r);
1982 return r;
1983 }
1984 adev->ip_blocks[i].status.hw = true;
1985 }
1986
1987 return 0;
1988}
1989
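/*
 * Reader's note (illustrative): the two phases above exist so that blocks
 * other IPs depend on come up first.  Phase 1 initializes COMMON, IH and,
 * under SR-IOV, PSP; phase 2 initializes everything still powered down.
 * amdgpu_device_ip_init() sequences them with the firmware loading step in
 * between, roughly:
 *
 *	r = amdgpu_device_ip_hw_init_phase1(adev);	(soc/interrupts ready)
 *	r = amdgpu_device_fw_loading(adev);		(PSP/SMU microcode)
 *	r = amdgpu_device_ip_hw_init_phase2(adev);	(remaining engines)
 */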
7a3e0bb2
RZ
1990static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
1991{
1992 int r = 0;
1993 int i;
80f41f84 1994 uint32_t smu_version;
7a3e0bb2
RZ
1995
1996 if (adev->asic_type >= CHIP_VEGA10) {
1997 for (i = 0; i < adev->num_ip_blocks; i++) {
482f0e53
ML
1998 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
1999 continue;
2000
2001 /* no need to do the fw loading again if already done */
2002 if (adev->ip_blocks[i].status.hw == true)
2003 break;
2004
53b3f8f4 2005 if (amdgpu_in_reset(adev) || adev->in_suspend) {
482f0e53
ML
2006 r = adev->ip_blocks[i].version->funcs->resume(adev);
2007 if (r) {
2008 DRM_ERROR("resume of IP block <%s> failed %d\n",
7a3e0bb2 2009 adev->ip_blocks[i].version->funcs->name, r);
482f0e53
ML
2010 return r;
2011 }
2012 } else {
2013 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2014 if (r) {
2015 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2016 adev->ip_blocks[i].version->funcs->name, r);
2017 return r;
7a3e0bb2 2018 }
7a3e0bb2 2019 }
482f0e53
ML
2020
2021 adev->ip_blocks[i].status.hw = true;
2022 break;
7a3e0bb2
RZ
2023 }
2024 }
482f0e53 2025
8973d9ec
ED
2026 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2027 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
7a3e0bb2 2028
80f41f84 2029 return r;
7a3e0bb2
RZ
2030}
2031
e3ecdffa
AD
2032/**
2033 * amdgpu_device_ip_init - run init for hardware IPs
2034 *
2035 * @adev: amdgpu_device pointer
2036 *
2037 * Main initialization pass for hardware IPs. The list of all the hardware
2038 * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2039 * are run. sw_init initializes the software state associated with each IP
2040 * and hw_init initializes the hardware associated with each IP.
2041 * Returns 0 on success, negative error code on failure.
2042 */
06ec9070 2043static int amdgpu_device_ip_init(struct amdgpu_device *adev)
d38ceaf9
AD
2044{
2045 int i, r;
2046
c030f2e4 2047 r = amdgpu_ras_init(adev);
2048 if (r)
2049 return r;
2050
d38ceaf9 2051 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 2052 if (!adev->ip_blocks[i].status.valid)
d38ceaf9 2053 continue;
a1255107 2054 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2c1a2784 2055 if (r) {
a1255107
AD
2056 DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2057 adev->ip_blocks[i].version->funcs->name, r);
72d3f592 2058 goto init_failed;
2c1a2784 2059 }
a1255107 2060 adev->ip_blocks[i].status.sw = true;
bfca0289 2061
d38ceaf9 2062 /* need to do gmc hw init early so we can allocate gpu mem */
a1255107 2063 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
06ec9070 2064 r = amdgpu_device_vram_scratch_init(adev);
2c1a2784
AD
2065 if (r) {
2066 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
72d3f592 2067 goto init_failed;
2c1a2784 2068 }
a1255107 2069 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2c1a2784
AD
2070 if (r) {
2071 DRM_ERROR("hw_init %d failed %d\n", i, r);
72d3f592 2072 goto init_failed;
2c1a2784 2073 }
06ec9070 2074 r = amdgpu_device_wb_init(adev);
2c1a2784 2075 if (r) {
06ec9070 2076 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
72d3f592 2077 goto init_failed;
2c1a2784 2078 }
a1255107 2079 adev->ip_blocks[i].status.hw = true;
2493664f
ML
2080
2081 /* right after GMC hw init, we create CSA */
f92d5c61 2082 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
1e256e27
RZ
2083 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2084 AMDGPU_GEM_DOMAIN_VRAM,
2085 AMDGPU_CSA_SIZE);
2493664f
ML
2086 if (r) {
2087 DRM_ERROR("allocate CSA failed %d\n", r);
72d3f592 2088 goto init_failed;
2493664f
ML
2089 }
2090 }
d38ceaf9
AD
2091 }
2092 }
2093
c9ffa427
YT
2094 if (amdgpu_sriov_vf(adev))
2095 amdgpu_virt_init_data_exchange(adev);
2096
533aed27
AG
2097 r = amdgpu_ib_pool_init(adev);
2098 if (r) {
2099 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2100 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2101 goto init_failed;
2102 }
2103
c8963ea4
RZ
2104 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2105 if (r)
72d3f592 2106 goto init_failed;
0a4f2520
RZ
2107
2108 r = amdgpu_device_ip_hw_init_phase1(adev);
2109 if (r)
72d3f592 2110 goto init_failed;
0a4f2520 2111
7a3e0bb2
RZ
2112 r = amdgpu_device_fw_loading(adev);
2113 if (r)
72d3f592 2114 goto init_failed;
7a3e0bb2 2115
0a4f2520
RZ
2116 r = amdgpu_device_ip_hw_init_phase2(adev);
2117 if (r)
72d3f592 2118 goto init_failed;
d38ceaf9 2119
121a2bc6
AG
2120 /*
2121 * retired pages will be loaded from eeprom and reserved here,
2122 * it should be called after amdgpu_device_ip_hw_init_phase2 since
2123 * for some ASICs the RAS EEPROM code relies on SMU fully functioning
2124 * for I2C communication, which is only true at this point.
b82e65a9
GC
2125 *
2126 * amdgpu_ras_recovery_init may fail, but the upper layer only cares about
2127 * the failure caused by a bad gpu situation and stops the amdgpu init
2128 * process accordingly. For other failure cases, it still releases all
2129 * the resources and prints an error message, rather than returning a
2130 * negative value to the upper level.
121a2bc6
AG
2131 *
2132 * Note: theoretically, this should be called before all vram allocations
2133 * to protect retired pages from being reused
2134 */
b82e65a9
GC
2135 r = amdgpu_ras_recovery_init(adev);
2136 if (r)
2137 goto init_failed;
121a2bc6 2138
3e2e2ab5
HZ
2139 if (adev->gmc.xgmi.num_physical_nodes > 1)
2140 amdgpu_xgmi_add_device(adev);
1884734a 2141 amdgpu_amdkfd_device_init(adev);
c6332b97 2142
bd607166
KR
2143 amdgpu_fru_get_product_info(adev);
2144
72d3f592 2145init_failed:
c9ffa427 2146 if (amdgpu_sriov_vf(adev))
c6332b97 2147 amdgpu_virt_release_full_gpu(adev, true);
2148
72d3f592 2149 return r;
d38ceaf9
AD
2150}
2151
e3ecdffa
AD
2152/**
2153 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2154 *
2155 * @adev: amdgpu_device pointer
2156 *
2157 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
2158 * this function before a GPU reset. If the value is retained after a
2159 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
2160 */
06ec9070 2161static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
0c49e0b8
CZ
2162{
2163 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2164}
2165
e3ecdffa
AD
2166/**
2167 * amdgpu_device_check_vram_lost - check if vram is valid
2168 *
2169 * @adev: amdgpu_device pointer
2170 *
2171 * Checks the reset magic value written to the gart pointer in VRAM.
2172 * The driver calls this after a GPU reset to see if the contents of
2173 * VRAM have been lost or not.
2174 * Returns true if vram is lost, false if not.
2175 */
06ec9070 2176static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
0c49e0b8 2177{
dadce777
EQ
2178 if (memcmp(adev->gart.ptr, adev->reset_magic,
2179 AMDGPU_RESET_MAGIC_NUM))
2180 return true;
2181
53b3f8f4 2182 if (!amdgpu_in_reset(adev))
dadce777
EQ
2183 return false;
2184
2185 /*
2186 * For all ASICs with baco/mode1 reset, the VRAM is
2187 * always assumed to be lost.
2188 */
2189 switch (amdgpu_asic_reset_method(adev)) {
2190 case AMD_RESET_METHOD_BACO:
2191 case AMD_RESET_METHOD_MODE1:
2192 return true;
2193 default:
2194 return false;
2195 }
0c49e0b8
CZ
2196}
2197
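/*
 * Reader's note (illustrative pairing, hypothetical reset path): the two
 * helpers above are meant to be used around a GPU reset,
 *
 *	amdgpu_device_fill_reset_magic(adev);		before the reset
 *	...ASIC reset...
 *	vram_lost = amdgpu_device_check_vram_lost(adev);
 *
 * so a changed magic value (or a BACO/mode1 reset, where VRAM is always
 * assumed lost) tells the recovery code that buffer contents must be
 * restored.
 */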
e3ecdffa 2198/**
1112a46b 2199 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
e3ecdffa
AD
2200 *
2201 * @adev: amdgpu_device pointer
b8b72130 2202 * @state: clockgating state (gate or ungate)
e3ecdffa 2203 *
e3ecdffa 2204 * The list of all the hardware IPs that make up the asic is walked and the
1112a46b
RZ
2205 * set_clockgating_state callbacks are run.
2206 * During late initialization, this pass enables clockgating for hardware IPs;
2207 * during fini or suspend, it disables clockgating for hardware IPs.
e3ecdffa
AD
2208 * Returns 0 on success, negative error code on failure.
2209 */
fdd34271 2210
1112a46b
RZ
2211static int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2212 enum amd_clockgating_state state)
d38ceaf9 2213{
1112a46b 2214 int i, j, r;
d38ceaf9 2215
4a2ba394
SL
2216 if (amdgpu_emu_mode == 1)
2217 return 0;
2218
1112a46b
RZ
2219 for (j = 0; j < adev->num_ip_blocks; j++) {
2220 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
a2d31dc3 2221 if (!adev->ip_blocks[i].status.late_initialized)
d38ceaf9 2222 continue;
4a446d55 2223 /* skip CG for VCE/UVD, it's handled specially */
a1255107 2224 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
57716327 2225 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
34319b32 2226 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
52f2e779 2227 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
57716327 2228 adev->ip_blocks[i].version->funcs->set_clockgating_state) {
4a446d55 2229 /* enable clockgating to save power */
a1255107 2230 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
1112a46b 2231 state);
4a446d55
AD
2232 if (r) {
2233 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
a1255107 2234 adev->ip_blocks[i].version->funcs->name, r);
4a446d55
AD
2235 return r;
2236 }
b0b00ff1 2237 }
d38ceaf9 2238 }
06b18f61 2239
c9f96fd5
RZ
2240 return 0;
2241}
2242
1112a46b 2243static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state)
c9f96fd5 2244{
1112a46b 2245 int i, j, r;
06b18f61 2246
c9f96fd5
RZ
2247 if (amdgpu_emu_mode == 1)
2248 return 0;
2249
1112a46b
RZ
2250 for (j = 0; j < adev->num_ip_blocks; j++) {
2251 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
a2d31dc3 2252 if (!adev->ip_blocks[i].status.late_initialized)
c9f96fd5
RZ
2253 continue;
2254 /* skip CG for VCE/UVD, it's handled specially */
2255 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2256 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2257 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
52f2e779 2258 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
c9f96fd5
RZ
2259 adev->ip_blocks[i].version->funcs->set_powergating_state) {
2260 /* enable powergating to save power */
2261 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
1112a46b 2262 state);
c9f96fd5
RZ
2263 if (r) {
2264 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2265 adev->ip_blocks[i].version->funcs->name, r);
2266 return r;
2267 }
2268 }
2269 }
2dc80b00
S
2270 return 0;
2271}
2272
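/*
 * Reader's note (illustrative): in both helpers above the index expression
 * walks the IP list forward when gating and backward when ungating.  With
 * num_ip_blocks == 3, for example:
 *
 *	gating   (state == GATE):	i = j		-> 0, 1, 2
 *	ungating (state != GATE):	i = 2 - j	-> 2, 1, 0
 *
 * so clock/power gating is removed in the reverse order it was applied.
 */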
beff74bc
AD
2273static int amdgpu_device_enable_mgpu_fan_boost(void)
2274{
2275 struct amdgpu_gpu_instance *gpu_ins;
2276 struct amdgpu_device *adev;
2277 int i, ret = 0;
2278
2279 mutex_lock(&mgpu_info.mutex);
2280
2281 /*
2282 * MGPU fan boost feature should be enabled
2283 * only when there are two or more dGPUs in
2284 * the system
2285 */
2286 if (mgpu_info.num_dgpu < 2)
2287 goto out;
2288
2289 for (i = 0; i < mgpu_info.num_dgpu; i++) {
2290 gpu_ins = &(mgpu_info.gpu_ins[i]);
2291 adev = gpu_ins->adev;
2292 if (!(adev->flags & AMD_IS_APU) &&
f10bb940 2293 !gpu_ins->mgpu_fan_enabled) {
beff74bc
AD
2294 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2295 if (ret)
2296 break;
2297
2298 gpu_ins->mgpu_fan_enabled = 1;
2299 }
2300 }
2301
2302out:
2303 mutex_unlock(&mgpu_info.mutex);
2304
2305 return ret;
2306}
2307
e3ecdffa
AD
2308/**
2309 * amdgpu_device_ip_late_init - run late init for hardware IPs
2310 *
2311 * @adev: amdgpu_device pointer
2312 *
2313 * Late initialization pass for hardware IPs. The list of all the hardware
2314 * IPs that make up the asic is walked and the late_init callbacks are run.
2315 * late_init covers any special initialization that an IP requires
2316 * after all of them have been initialized or something that needs to happen
2317 * late in the init process.
2318 * Returns 0 on success, negative error code on failure.
2319 */
06ec9070 2320static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2dc80b00 2321{
60599a03 2322 struct amdgpu_gpu_instance *gpu_instance;
2dc80b00
S
2323 int i = 0, r;
2324
2325 for (i = 0; i < adev->num_ip_blocks; i++) {
73f847db 2326 if (!adev->ip_blocks[i].status.hw)
2dc80b00
S
2327 continue;
2328 if (adev->ip_blocks[i].version->funcs->late_init) {
2329 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2330 if (r) {
2331 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2332 adev->ip_blocks[i].version->funcs->name, r);
2333 return r;
2334 }
2dc80b00 2335 }
73f847db 2336 adev->ip_blocks[i].status.late_initialized = true;
2dc80b00
S
2337 }
2338
a891d239
DL
2339 amdgpu_ras_set_error_query_ready(adev, true);
2340
1112a46b
RZ
2341 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2342 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
916ac57f 2343
06ec9070 2344 amdgpu_device_fill_reset_magic(adev);
d38ceaf9 2345
beff74bc
AD
2346 r = amdgpu_device_enable_mgpu_fan_boost();
2347 if (r)
2348 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2349
60599a03
EQ
2350
2351 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2352 mutex_lock(&mgpu_info.mutex);
2353
2354 /*
2355 * Reset device p-state to low as this was booted with high.
2356 *
2357 * This should be performed only after all devices from the same
2358 * hive get initialized.
2359 *
2360 * However, it's not known in advance how many devices are in the hive,
2361 * as they are counted one by one during device initialization.
2362 *
2363 * So, we wait for all XGMI interlinked devices initialized.
2364 * This may bring some delays as those devices may come from
2365 * different hives. But that should be OK.
2366 */
2367 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2368 for (i = 0; i < mgpu_info.num_gpu; i++) {
2369 gpu_instance = &(mgpu_info.gpu_ins[i]);
2370 if (gpu_instance->adev->flags & AMD_IS_APU)
2371 continue;
2372
d84a430d
JK
2373 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2374 AMDGPU_XGMI_PSTATE_MIN);
60599a03
EQ
2375 if (r) {
2376 DRM_ERROR("pstate setting failed (%d).\n", r);
2377 break;
2378 }
2379 }
2380 }
2381
2382 mutex_unlock(&mgpu_info.mutex);
2383 }
2384
d38ceaf9
AD
2385 return 0;
2386}
2387
e3ecdffa
AD
2388/**
2389 * amdgpu_device_ip_fini - run fini for hardware IPs
2390 *
2391 * @adev: amdgpu_device pointer
2392 *
2393 * Main teardown pass for hardware IPs. The list of all the hardware
2394 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2395 * are run. hw_fini tears down the hardware associated with each IP
2396 * and sw_fini tears down any software state associated with each IP.
2397 * Returns 0 on success, negative error code on failure.
2398 */
06ec9070 2399static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
d38ceaf9
AD
2400{
2401 int i, r;
2402
5278a159
SY
2403 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2404 amdgpu_virt_release_ras_err_handler_data(adev);
2405
c030f2e4 2406 amdgpu_ras_pre_fini(adev);
2407
a82400b5
AG
2408 if (adev->gmc.xgmi.num_physical_nodes > 1)
2409 amdgpu_xgmi_remove_device(adev);
2410
1884734a 2411 amdgpu_amdkfd_device_fini(adev);
05df1f01
RZ
2412
2413 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
fdd34271
RZ
2414 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2415
3e96dbfd
AD
2416 /* need to disable SMC first */
2417 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 2418 if (!adev->ip_blocks[i].status.hw)
3e96dbfd 2419 continue;
fdd34271 2420 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
a1255107 2421 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
3e96dbfd
AD
2422 /* XXX handle errors */
2423 if (r) {
2424 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
a1255107 2425 adev->ip_blocks[i].version->funcs->name, r);
3e96dbfd 2426 }
a1255107 2427 adev->ip_blocks[i].status.hw = false;
3e96dbfd
AD
2428 break;
2429 }
2430 }
2431
d38ceaf9 2432 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
a1255107 2433 if (!adev->ip_blocks[i].status.hw)
d38ceaf9 2434 continue;
8201a67a 2435
a1255107 2436 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
d38ceaf9 2437 /* XXX handle errors */
2c1a2784 2438 if (r) {
a1255107
AD
2439 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2440 adev->ip_blocks[i].version->funcs->name, r);
2c1a2784 2441 }
8201a67a 2442
a1255107 2443 adev->ip_blocks[i].status.hw = false;
d38ceaf9
AD
2444 }
2445
9950cda2 2446
d38ceaf9 2447 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
a1255107 2448 if (!adev->ip_blocks[i].status.sw)
d38ceaf9 2449 continue;
c12aba3a
ML
2450
2451 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
c8963ea4 2452 amdgpu_ucode_free_bo(adev);
1e256e27 2453 amdgpu_free_static_csa(&adev->virt.csa_obj);
c12aba3a
ML
2454 amdgpu_device_wb_fini(adev);
2455 amdgpu_device_vram_scratch_fini(adev);
533aed27 2456 amdgpu_ib_pool_fini(adev);
c12aba3a
ML
2457 }
2458
a1255107 2459 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
d38ceaf9 2460 /* XXX handle errors */
2c1a2784 2461 if (r) {
a1255107
AD
2462 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2463 adev->ip_blocks[i].version->funcs->name, r);
2c1a2784 2464 }
a1255107
AD
2465 adev->ip_blocks[i].status.sw = false;
2466 adev->ip_blocks[i].status.valid = false;
d38ceaf9
AD
2467 }
2468
a6dcfd9c 2469 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
a1255107 2470 if (!adev->ip_blocks[i].status.late_initialized)
8a2eef1d 2471 continue;
a1255107
AD
2472 if (adev->ip_blocks[i].version->funcs->late_fini)
2473 adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2474 adev->ip_blocks[i].status.late_initialized = false;
a6dcfd9c
ML
2475 }
2476
c030f2e4 2477 amdgpu_ras_fini(adev);
2478
030308fc 2479 if (amdgpu_sriov_vf(adev))
24136135
ML
2480 if (amdgpu_virt_release_full_gpu(adev, false))
2481 DRM_ERROR("failed to release exclusive mode on fini\n");
2493664f 2482
d38ceaf9
AD
2483 return 0;
2484}
2485
e3ecdffa 2486/**
beff74bc 2487 * amdgpu_device_delayed_init_work_handler - work handler for IB tests
e3ecdffa 2488 *
1112a46b 2489 * @work: work_struct.
e3ecdffa 2490 */
beff74bc 2491static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2dc80b00
S
2492{
2493 struct amdgpu_device *adev =
beff74bc 2494 container_of(work, struct amdgpu_device, delayed_init_work.work);
916ac57f
RZ
2495 int r;
2496
2497 r = amdgpu_ib_ring_tests(adev);
2498 if (r)
2499 DRM_ERROR("ib ring test failed (%d).\n", r);
2dc80b00
S
2500}
2501
1e317b99
RZ
2502static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2503{
2504 struct amdgpu_device *adev =
2505 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2506
2507 mutex_lock(&adev->gfx.gfx_off_mutex);
2508 if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
2509 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2510 adev->gfx.gfx_off_state = true;
2511 }
2512 mutex_unlock(&adev->gfx.gfx_off_mutex);
2513}
2514
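/*
 * Reader's note (illustrative): GFXOFF entry is reference counted.
 * adev->gfx.gfx_off_req_count starts at 1 in amdgpu_device_init(), and the
 * delayed work above only asks the SMU to enter GFXOFF once that count has
 * dropped to 0 and GFXOFF is not already active.  A hypothetical caller that
 * temporarily needs the GFX block powered would bump the count (typically via
 * amdgpu_gfx_off_ctrl() elsewhere in the driver), do its work, then drop the
 * count again so this work can re-enable GFXOFF after the delay.
 */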
e3ecdffa 2515/**
e7854a03 2516 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
e3ecdffa
AD
2517 *
2518 * @adev: amdgpu_device pointer
2519 *
2520 * Main suspend function for hardware IPs. The list of all the hardware
2521 * IPs that make up the asic is walked, clockgating is disabled and the
2522 * suspend callbacks are run. suspend puts the hardware and software state
2523 * in each IP into a state suitable for suspend.
2524 * Returns 0 on success, negative error code on failure.
2525 */
e7854a03
AD
2526static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2527{
2528 int i, r;
2529
ced1ba97
PL
2530 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2531 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
05df1f01 2532
e7854a03
AD
2533 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2534 if (!adev->ip_blocks[i].status.valid)
2535 continue;
2b9f7848 2536
e7854a03 2537 /* displays are handled separately */
2b9f7848
ND
2538 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2539 continue;
2540
2541 /* XXX handle errors */
2542 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2543 /* XXX handle errors */
2544 if (r) {
2545 DRM_ERROR("suspend of IP block <%s> failed %d\n",
2546 adev->ip_blocks[i].version->funcs->name, r);
2547 return r;
e7854a03 2548 }
2b9f7848
ND
2549
2550 adev->ip_blocks[i].status.hw = false;
e7854a03
AD
2551 }
2552
e7854a03
AD
2553 return 0;
2554}
2555
2556/**
2557 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2558 *
2559 * @adev: amdgpu_device pointer
2560 *
2561 * Main suspend function for hardware IPs. The list of all the hardware
2562 * IPs that make up the asic is walked, clockgating is disabled and the
2563 * suspend callbacks are run. suspend puts the hardware and software state
2564 * in each IP into a state suitable for suspend.
2565 * Returns 0 on success, negative error code on failure.
2566 */
2567static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
d38ceaf9
AD
2568{
2569 int i, r;
2570
2571 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
a1255107 2572 if (!adev->ip_blocks[i].status.valid)
d38ceaf9 2573 continue;
e7854a03
AD
2574 /* displays are handled in phase1 */
2575 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2576 continue;
bff77e86
LM
2577 /* PSP lost connection when err_event_athub occurs */
2578 if (amdgpu_ras_intr_triggered() &&
2579 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2580 adev->ip_blocks[i].status.hw = false;
2581 continue;
2582 }
d38ceaf9 2583 /* XXX handle errors */
a1255107 2584 r = adev->ip_blocks[i].version->funcs->suspend(adev);
d38ceaf9 2585 /* XXX handle errors */
2c1a2784 2586 if (r) {
a1255107
AD
2587 DRM_ERROR("suspend of IP block <%s> failed %d\n",
2588 adev->ip_blocks[i].version->funcs->name, r);
2c1a2784 2589 }
876923fb 2590 adev->ip_blocks[i].status.hw = false;
a3a09142 2591 /* handle putting the SMC in the appropriate state */
86b93fd6
JZ
2592 if (!amdgpu_sriov_vf(adev)) {
2593 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2594 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
2595 if (r) {
2596 DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
2597 adev->mp1_state, r);
2598 return r;
2599 }
a3a09142
AD
2600 }
2601 }
b5507c7e 2602 adev->ip_blocks[i].status.hw = false;
d38ceaf9
AD
2603 }
2604
2605 return 0;
2606}
2607
e7854a03
AD
2608/**
2609 * amdgpu_device_ip_suspend - run suspend for hardware IPs
2610 *
2611 * @adev: amdgpu_device pointer
2612 *
2613 * Main suspend function for hardware IPs. The list of all the hardware
2614 * IPs that make up the asic is walked, clockgating is disabled and the
2615 * suspend callbacks are run. suspend puts the hardware and software state
2616 * in each IP into a state suitable for suspend.
2617 * Returns 0 on success, negative error code on failure.
2618 */
2619int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
2620{
2621 int r;
2622
e7819644
YT
2623 if (amdgpu_sriov_vf(adev))
2624 amdgpu_virt_request_full_gpu(adev, false);
2625
e7854a03
AD
2626 r = amdgpu_device_ip_suspend_phase1(adev);
2627 if (r)
2628 return r;
2629 r = amdgpu_device_ip_suspend_phase2(adev);
2630
e7819644
YT
2631 if (amdgpu_sriov_vf(adev))
2632 amdgpu_virt_release_full_gpu(adev, false);
2633
e7854a03
AD
2634 return r;
2635}
2636
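/*
 * Reader's note (illustrative): suspend is intentionally split.  Phase 1
 * only suspends the display (DCE) blocks; phase 2 then walks the remaining
 * blocks in reverse init order, skips PSP when an err_event_athub RAS
 * interrupt has fired, and moves the SMC into the requested mp1 state on
 * bare metal.  Callers therefore always see the ordering:
 *
 *	amdgpu_device_ip_suspend_phase1(adev);	(displays off first)
 *	amdgpu_device_ip_suspend_phase2(adev);	(everything else)
 */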
06ec9070 2637static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
a90ad3c2
ML
2638{
2639 int i, r;
2640
2cb681b6
ML
2641 static enum amd_ip_block_type ip_order[] = {
2642 AMD_IP_BLOCK_TYPE_GMC,
2643 AMD_IP_BLOCK_TYPE_COMMON,
39186aef 2644 AMD_IP_BLOCK_TYPE_PSP,
2cb681b6
ML
2645 AMD_IP_BLOCK_TYPE_IH,
2646 };
a90ad3c2 2647
2cb681b6
ML
2648 for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2649 int j;
2650 struct amdgpu_ip_block *block;
a90ad3c2 2651
4cd2a96d
J
2652 block = &adev->ip_blocks[i];
2653 block->status.hw = false;
2cb681b6 2654
4cd2a96d 2655 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
2cb681b6 2656
4cd2a96d 2657 if (block->version->type != ip_order[j] ||
2cb681b6
ML
2658 !block->status.valid)
2659 continue;
2660
2661 r = block->version->funcs->hw_init(adev);
0aaeefcc 2662 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
c41d1cf6
ML
2663 if (r)
2664 return r;
482f0e53 2665 block->status.hw = true;
a90ad3c2
ML
2666 }
2667 }
2668
2669 return 0;
2670}
2671
06ec9070 2672static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
a90ad3c2
ML
2673{
2674 int i, r;
2675
2cb681b6
ML
2676 static enum amd_ip_block_type ip_order[] = {
2677 AMD_IP_BLOCK_TYPE_SMC,
2678 AMD_IP_BLOCK_TYPE_DCE,
2679 AMD_IP_BLOCK_TYPE_GFX,
2680 AMD_IP_BLOCK_TYPE_SDMA,
257deb8c 2681 AMD_IP_BLOCK_TYPE_UVD,
d83c7a07
JJ
2682 AMD_IP_BLOCK_TYPE_VCE,
2683 AMD_IP_BLOCK_TYPE_VCN
2cb681b6 2684 };
a90ad3c2 2685
2cb681b6
ML
2686 for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2687 int j;
2688 struct amdgpu_ip_block *block;
a90ad3c2 2689
2cb681b6
ML
2690 for (j = 0; j < adev->num_ip_blocks; j++) {
2691 block = &adev->ip_blocks[j];
2692
2693 if (block->version->type != ip_order[i] ||
482f0e53
ML
2694 !block->status.valid ||
2695 block->status.hw)
2cb681b6
ML
2696 continue;
2697
895bd048
JZ
2698 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
2699 r = block->version->funcs->resume(adev);
2700 else
2701 r = block->version->funcs->hw_init(adev);
2702
0aaeefcc 2703 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
c41d1cf6
ML
2704 if (r)
2705 return r;
482f0e53 2706 block->status.hw = true;
a90ad3c2
ML
2707 }
2708 }
2709
2710 return 0;
2711}
2712
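/*
 * Reader's note (illustrative): after a virtual function reset the IP blocks
 * cannot simply be resumed in array order.  The "early" pass above restores
 * GMC, COMMON, PSP and IH first so that memory access, register access and
 * firmware loading work again, and the "late" pass then brings up SMC, DCE,
 * GFX, SDMA and UVD/VCE/VCN in that fixed order, using resume() for the SMC
 * block and hw_init() for the rest.
 */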
e3ecdffa
AD
2713/**
2714 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
2715 *
2716 * @adev: amdgpu_device pointer
2717 *
2718 * First resume function for hardware IPs. The list of all the hardware
2719 * IPs that make up the asic is walked and the resume callbacks are run for
2720 * COMMON, GMC, and IH. resume puts the hardware into a functional state
2721 * after a suspend and updates the software state as necessary. This
2722 * function is also used for restoring the GPU after a GPU reset.
2723 * Returns 0 on success, negative error code on failure.
2724 */
06ec9070 2725static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
d38ceaf9
AD
2726{
2727 int i, r;
2728
a90ad3c2 2729 for (i = 0; i < adev->num_ip_blocks; i++) {
482f0e53 2730 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
a90ad3c2 2731 continue;
a90ad3c2 2732 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
e3ecdffa
AD
2733 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2734 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
482f0e53 2735
fcf0649f
CZ
2736 r = adev->ip_blocks[i].version->funcs->resume(adev);
2737 if (r) {
2738 DRM_ERROR("resume of IP block <%s> failed %d\n",
2739 adev->ip_blocks[i].version->funcs->name, r);
2740 return r;
2741 }
482f0e53 2742 adev->ip_blocks[i].status.hw = true;
a90ad3c2
ML
2743 }
2744 }
2745
2746 return 0;
2747}
2748
e3ecdffa
AD
2749/**
2750 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
2751 *
2752 * @adev: amdgpu_device pointer
2753 *
2754 * Second resume function for hardware IPs. The list of all the hardware
2755 * IPs that make up the asic is walked and the resume callbacks are run for
2756 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a
2757 * functional state after a suspend and updates the software state as
2758 * necessary. This function is also used for restoring the GPU after a GPU
2759 * reset.
2760 * Returns 0 on success, negative error code on failure.
2761 */
06ec9070 2762static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
d38ceaf9
AD
2763{
2764 int i, r;
2765
2766 for (i = 0; i < adev->num_ip_blocks; i++) {
482f0e53 2767 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
d38ceaf9 2768 continue;
fcf0649f 2769 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
e3ecdffa 2770 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
7a3e0bb2
RZ
2771 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
2772 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
fcf0649f 2773 continue;
a1255107 2774 r = adev->ip_blocks[i].version->funcs->resume(adev);
2c1a2784 2775 if (r) {
a1255107
AD
2776 DRM_ERROR("resume of IP block <%s> failed %d\n",
2777 adev->ip_blocks[i].version->funcs->name, r);
d38ceaf9 2778 return r;
2c1a2784 2779 }
482f0e53 2780 adev->ip_blocks[i].status.hw = true;
d38ceaf9
AD
2781 }
2782
2783 return 0;
2784}
2785
e3ecdffa
AD
2786/**
2787 * amdgpu_device_ip_resume - run resume for hardware IPs
2788 *
2789 * @adev: amdgpu_device pointer
2790 *
2791 * Main resume function for hardware IPs. The hardware IPs
2792 * are split into two resume functions because they are
2793 * also used in recovering from a GPU reset and some additional
2794 * steps need to be taken between them. In this case (S3/S4) they are
2795 * run sequentially.
2796 * Returns 0 on success, negative error code on failure.
2797 */
06ec9070 2798static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
fcf0649f
CZ
2799{
2800 int r;
2801
06ec9070 2802 r = amdgpu_device_ip_resume_phase1(adev);
fcf0649f
CZ
2803 if (r)
2804 return r;
7a3e0bb2
RZ
2805
2806 r = amdgpu_device_fw_loading(adev);
2807 if (r)
2808 return r;
2809
06ec9070 2810 r = amdgpu_device_ip_resume_phase2(adev);
fcf0649f
CZ
2811
2812 return r;
2813}
2814
e3ecdffa
AD
2815/**
2816 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
2817 *
2818 * @adev: amdgpu_device pointer
2819 *
2820 * Query the VBIOS data tables to determine if the board supports SR-IOV.
2821 */
4e99a44e 2822static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
048765ad 2823{
6867e1b5
ML
2824 if (amdgpu_sriov_vf(adev)) {
2825 if (adev->is_atom_fw) {
2826 if (amdgpu_atomfirmware_gpu_supports_virtualization(adev))
2827 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2828 } else {
2829 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
2830 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2831 }
2832
2833 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
2834 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
a5bde2f9 2835 }
048765ad
AR
2836}
2837
e3ecdffa
AD
2838/**
2839 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
2840 *
2841 * @asic_type: AMD asic type
2842 *
2843 * Check if there is DC (new modesetting infrastructure) support for an asic.
2844 * returns true if DC has support, false if not.
2845 */
4562236b
HW
2846bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
2847{
2848 switch (asic_type) {
2849#if defined(CONFIG_DRM_AMD_DC)
64200c46
MR
2850#if defined(CONFIG_DRM_AMD_DC_SI)
2851 case CHIP_TAHITI:
2852 case CHIP_PITCAIRN:
2853 case CHIP_VERDE:
2854 case CHIP_OLAND:
2855#endif
4562236b 2856 case CHIP_BONAIRE:
0d6fbccb 2857 case CHIP_KAVERI:
367e6687
AD
2858 case CHIP_KABINI:
2859 case CHIP_MULLINS:
d9fda248
HW
2860 /*
2861 * We have systems in the wild with these ASICs that require
2862 * LVDS and VGA support which is not supported with DC.
2863 *
2864 * Fallback to the non-DC driver here by default so as not to
2865 * cause regressions.
2866 */
2867 return amdgpu_dc > 0;
2868 case CHIP_HAWAII:
4562236b
HW
2869 case CHIP_CARRIZO:
2870 case CHIP_STONEY:
4562236b 2871 case CHIP_POLARIS10:
675fd32b 2872 case CHIP_POLARIS11:
2c8ad2d5 2873 case CHIP_POLARIS12:
675fd32b 2874 case CHIP_VEGAM:
4562236b
HW
2875 case CHIP_TONGA:
2876 case CHIP_FIJI:
42f8ffa1 2877 case CHIP_VEGA10:
dca7b401 2878 case CHIP_VEGA12:
c6034aa2 2879 case CHIP_VEGA20:
b86a1aa3 2880#if defined(CONFIG_DRM_AMD_DC_DCN)
fd187853 2881 case CHIP_RAVEN:
b4f199c7 2882 case CHIP_NAVI10:
8fceceb6 2883 case CHIP_NAVI14:
078655d9 2884 case CHIP_NAVI12:
e1c14c43 2885 case CHIP_RENOIR:
81d9bfb8
JFZ
2886#endif
2887#if defined(CONFIG_DRM_AMD_DC_DCN3_0)
2888 case CHIP_SIENNA_CICHLID:
a6c5308f 2889 case CHIP_NAVY_FLOUNDER:
42f8ffa1 2890#endif
fd187853 2891 return amdgpu_dc != 0;
4562236b
HW
2892#endif
2893 default:
93b09a9a
SS
2894 if (amdgpu_dc > 0)
2895 DRM_INFO("Display Core has been requested via kernel parameter "
2896 "but isn't supported by ASIC, ignoring\n");
4562236b
HW
2897 return false;
2898 }
2899}
2900
2901/**
2902 * amdgpu_device_has_dc_support - check if dc is supported
2903 *
2904 * @adev: amdgpu_device pointer
2905 *
2906 * Returns true for supported, false for not supported
2907 */
2908bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
2909{
c997e8e2 2910 if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display)
2555039d
XY
2911 return false;
2912
4562236b
HW
2913 return amdgpu_device_asic_has_dc_support(adev->asic_type);
2914}
2915
d4535e2c
AG
2916
2917static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
2918{
2919 struct amdgpu_device *adev =
2920 container_of(__work, struct amdgpu_device, xgmi_reset_work);
d95e8e97 2921 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
d4535e2c 2922
c6a6e2db
AG
2923 /* It's a bug to not have a hive within this function */
2924 if (WARN_ON(!hive))
2925 return;
2926
2927 /*
2928 * Use task barrier to synchronize all xgmi reset works across the
2929 * hive. task_barrier_enter and task_barrier_exit will block
2930 * until all the threads running the xgmi reset works reach
2931 * those points. task_barrier_full will do both blocks.
2932 */
2933 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
2934
2935 task_barrier_enter(&hive->tb);
4a580877 2936 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
c6a6e2db
AG
2937
2938 if (adev->asic_reset_res)
2939 goto fail;
2940
2941 task_barrier_exit(&hive->tb);
4a580877 2942 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
c6a6e2db
AG
2943
2944 if (adev->asic_reset_res)
2945 goto fail;
43c4d576
JC
2946
2947 if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count)
2948 adev->mmhub.funcs->reset_ras_error_count(adev);
c6a6e2db
AG
2949 } else {
2950
2951 task_barrier_full(&hive->tb);
2952 adev->asic_reset_res = amdgpu_asic_reset(adev);
2953 }
ce316fa5 2954
c6a6e2db 2955fail:
d4535e2c 2956 if (adev->asic_reset_res)
fed184e9 2957 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
4a580877 2958 adev->asic_reset_res, adev_to_drm(adev)->unique);
d95e8e97 2959 amdgpu_put_xgmi_hive(hive);
d4535e2c
AG
2960}
2961
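/*
 * Reader's note (illustrative): with several devices in an XGMI hive, one
 * instance of the work above runs per device and the task barrier keeps
 * them in lock step.  For the BACO method the effect, per device, is:
 *
 *	task_barrier_enter(&hive->tb);	wait until all devices arrive
 *	amdgpu_device_baco_enter(...);	whole hive enters BACO together
 *	task_barrier_exit(&hive->tb);	wait again
 *	amdgpu_device_baco_exit(...);	whole hive leaves BACO together
 *
 * while other reset methods use a single task_barrier_full() before
 * amdgpu_asic_reset().
 */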
71f98027
AD
2962static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
2963{
2964 char *input = amdgpu_lockup_timeout;
2965 char *timeout_setting = NULL;
2966 int index = 0;
2967 long timeout;
2968 int ret = 0;
2969
2970 /*
2971 * By default the timeout for non-compute jobs is 10000 ms
2972 * and there is no timeout enforced on compute jobs.
2973 * In SR-IOV or passthrough mode, the timeout for compute
b7b2a316 2974 * jobs is 60000 ms by default.
71f98027
AD
2975 */
2976 adev->gfx_timeout = msecs_to_jiffies(10000);
2977 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
2978 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
b7b2a316 2979 adev->compute_timeout = msecs_to_jiffies(60000);
71f98027
AD
2980 else
2981 adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
2982
f440ff44 2983 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
71f98027 2984 while ((timeout_setting = strsep(&input, ",")) &&
f440ff44 2985 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
71f98027
AD
2986 ret = kstrtol(timeout_setting, 0, &timeout);
2987 if (ret)
2988 return ret;
2989
2990 if (timeout == 0) {
2991 index++;
2992 continue;
2993 } else if (timeout < 0) {
2994 timeout = MAX_SCHEDULE_TIMEOUT;
2995 } else {
2996 timeout = msecs_to_jiffies(timeout);
2997 }
2998
2999 switch (index++) {
3000 case 0:
3001 adev->gfx_timeout = timeout;
3002 break;
3003 case 1:
3004 adev->compute_timeout = timeout;
3005 break;
3006 case 2:
3007 adev->sdma_timeout = timeout;
3008 break;
3009 case 3:
3010 adev->video_timeout = timeout;
3011 break;
3012 default:
3013 break;
3014 }
3015 }
3016 /*
3017 * There is only one value specified and
3018 * it should apply to all non-compute jobs.
3019 */
bcccee89 3020 if (index == 1) {
71f98027 3021 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
bcccee89
ED
3022 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3023 adev->compute_timeout = adev->gfx_timeout;
3024 }
71f98027
AD
3025 }
3026
3027 return ret;
3028}
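/*
 * Reader's note (illustrative): the parser above consumes the
 * amdgpu.lockup_timeout module parameter as up to four comma separated
 * millisecond values in the order gfx, compute, sdma, video.  For example
 * (hypothetical values):
 *
 *	amdgpu.lockup_timeout=10000,60000,10000,10000
 *
 * A value of 0 keeps the default for that queue type, a negative value
 * selects MAX_SCHEDULE_TIMEOUT, and a single value is applied to all
 * non-compute queues (and also to compute under SR-IOV/passthrough).
 */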
d4535e2c 3029
77f3a5cd
ND
3030static const struct attribute *amdgpu_dev_attributes[] = {
3031 &dev_attr_product_name.attr,
3032 &dev_attr_product_number.attr,
3033 &dev_attr_serial_number.attr,
3034 &dev_attr_pcie_replay_count.attr,
3035 NULL
3036};
3037
c9a6b82f 3038
d38ceaf9
AD
3039/**
3040 * amdgpu_device_init - initialize the driver
3041 *
3042 * @adev: amdgpu_device pointer
d38ceaf9
AD
3043 * @flags: driver flags
3044 *
3045 * Initializes the driver info and hw (all asics).
3046 * Returns 0 for success or an error on failure.
3047 * Called at driver startup.
3048 */
3049int amdgpu_device_init(struct amdgpu_device *adev,
d38ceaf9
AD
3050 uint32_t flags)
3051{
8aba21b7
LT
3052 struct drm_device *ddev = adev_to_drm(adev);
3053 struct pci_dev *pdev = adev->pdev;
d38ceaf9 3054 int r, i;
3840c5bc 3055 bool boco = false;
95844d20 3056 u32 max_MBps;
d38ceaf9
AD
3057
3058 adev->shutdown = false;
d38ceaf9 3059 adev->flags = flags;
4e66d7d2
YZ
3060
3061 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3062 adev->asic_type = amdgpu_force_asic_type;
3063 else
3064 adev->asic_type = flags & AMD_ASIC_MASK;
3065
d38ceaf9 3066 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
593aa2d2 3067 if (amdgpu_emu_mode == 1)
8bdab6bb 3068 adev->usec_timeout *= 10;
770d13b1 3069 adev->gmc.gart_size = 512 * 1024 * 1024;
d38ceaf9
AD
3070 adev->accel_working = false;
3071 adev->num_rings = 0;
3072 adev->mman.buffer_funcs = NULL;
3073 adev->mman.buffer_funcs_ring = NULL;
3074 adev->vm_manager.vm_pte_funcs = NULL;
0c88b430 3075 adev->vm_manager.vm_pte_num_scheds = 0;
132f34e4 3076 adev->gmc.gmc_funcs = NULL;
f54d1867 3077 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
b8866c26 3078 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
d38ceaf9
AD
3079
3080 adev->smc_rreg = &amdgpu_invalid_rreg;
3081 adev->smc_wreg = &amdgpu_invalid_wreg;
3082 adev->pcie_rreg = &amdgpu_invalid_rreg;
3083 adev->pcie_wreg = &amdgpu_invalid_wreg;
36b9a952
HR
3084 adev->pciep_rreg = &amdgpu_invalid_rreg;
3085 adev->pciep_wreg = &amdgpu_invalid_wreg;
4fa1c6a6
TZ
3086 adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3087 adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
d38ceaf9
AD
3088 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3089 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3090 adev->didt_rreg = &amdgpu_invalid_rreg;
3091 adev->didt_wreg = &amdgpu_invalid_wreg;
ccdbb20a
RZ
3092 adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3093 adev->gc_cac_wreg = &amdgpu_invalid_wreg;
d38ceaf9
AD
3094 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3095 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3096
3e39ab90
AD
3097 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3098 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3099 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
d38ceaf9
AD
3100
3101 /* mutex initializations are all done here so we
3102 * can recall functions without having locking issues */
d38ceaf9 3103 atomic_set(&adev->irq.ih.lock, 0);
0e5ca0d1 3104 mutex_init(&adev->firmware.mutex);
d38ceaf9
AD
3105 mutex_init(&adev->pm.mutex);
3106 mutex_init(&adev->gfx.gpu_clock_mutex);
3107 mutex_init(&adev->srbm_mutex);
b8866c26 3108 mutex_init(&adev->gfx.pipe_reserve_mutex);
d23ee13f 3109 mutex_init(&adev->gfx.gfx_off_mutex);
d38ceaf9 3110 mutex_init(&adev->grbm_idx_mutex);
d38ceaf9 3111 mutex_init(&adev->mn_lock);
e23b74aa 3112 mutex_init(&adev->virt.vf_errors.lock);
d38ceaf9 3113 hash_init(adev->mn_hash);
53b3f8f4 3114 atomic_set(&adev->in_gpu_reset, 0);
6049db43 3115 init_rwsem(&adev->reset_sem);
32eaeae0 3116 mutex_init(&adev->psp.mutex);
bd052211 3117 mutex_init(&adev->notifier_lock);
d38ceaf9 3118
912dfc84
EQ
3119 r = amdgpu_device_check_arguments(adev);
3120 if (r)
3121 return r;
d38ceaf9 3122
d38ceaf9
AD
3123 spin_lock_init(&adev->mmio_idx_lock);
3124 spin_lock_init(&adev->smc_idx_lock);
3125 spin_lock_init(&adev->pcie_idx_lock);
3126 spin_lock_init(&adev->uvd_ctx_idx_lock);
3127 spin_lock_init(&adev->didt_idx_lock);
ccdbb20a 3128 spin_lock_init(&adev->gc_cac_idx_lock);
16abb5d2 3129 spin_lock_init(&adev->se_cac_idx_lock);
d38ceaf9 3130 spin_lock_init(&adev->audio_endpt_idx_lock);
95844d20 3131 spin_lock_init(&adev->mm_stats.lock);
d38ceaf9 3132
0c4e7fa5
CZ
3133 INIT_LIST_HEAD(&adev->shadow_list);
3134 mutex_init(&adev->shadow_list_lock);
3135
beff74bc
AD
3136 INIT_DELAYED_WORK(&adev->delayed_init_work,
3137 amdgpu_device_delayed_init_work_handler);
1e317b99
RZ
3138 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3139 amdgpu_device_delay_enable_gfx_off);
2dc80b00 3140
d4535e2c
AG
3141 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3142
d23ee13f 3143 adev->gfx.gfx_off_req_count = 1;
b6e79d9a 3144 adev->pm.ac_power = power_supply_is_system_supplied() > 0;
b1ddf548 3145
b265bdbd
EQ
3146 atomic_set(&adev->throttling_logging_enabled, 1);
3147 /*
3148 * If throttling continues, logging will be performed every minute
3149 * to avoid log flooding. "-1" is subtracted since the thermal
3150 * throttling interrupt comes every second. Thus, the total logging
3151 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3152 * for throttling interrupt) = 60 seconds.
3153 */
3154 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3155 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3156
0fa49558
AX
3157 /* Registers mapping */
3158 /* TODO: block userspace mapping of io register */
da69c161
KW
3159 if (adev->asic_type >= CHIP_BONAIRE) {
3160 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3161 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3162 } else {
3163 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3164 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3165 }
d38ceaf9 3166
d38ceaf9
AD
3167 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3168 if (adev->rmmio == NULL) {
3169 return -ENOMEM;
3170 }
3171 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3172 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3173
d38ceaf9
AD
3174 /* io port mapping */
3175 for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
3176 if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) {
3177 adev->rio_mem_size = pci_resource_len(adev->pdev, i);
3178 adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size);
3179 break;
3180 }
3181 }
3182 if (adev->rio_mem == NULL)
b64a18c5 3183 DRM_INFO("PCI I/O BAR is not found.\n");
d38ceaf9 3184
b2109d8e
JX
3185 /* enable PCIE atomic ops */
3186 r = pci_enable_atomic_ops_to_root(adev->pdev,
3187 PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3188 PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3189 if (r) {
3190 adev->have_atomics_support = false;
3191 DRM_INFO("PCIE atomic ops is not supported\n");
3192 } else {
3193 adev->have_atomics_support = true;
3194 }
3195
5494d864
AD
3196 amdgpu_device_get_pcie_info(adev);
3197
b239c017
JX
3198 if (amdgpu_mcbp)
3199 DRM_INFO("MCBP is enabled\n");
3200
5f84cc63
JX
3201 if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
3202 adev->enable_mes = true;
3203
3aa0115d
ML
3204 /* detect hw virtualization here */
3205 amdgpu_detect_virtualization(adev);
3206
dffa11b4
ML
3207 r = amdgpu_device_get_job_timeout_settings(adev);
3208 if (r) {
3209 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3210 return r;
a190d1c7
XY
3211 }
3212
d38ceaf9 3213 /* early init functions */
06ec9070 3214 r = amdgpu_device_ip_early_init(adev);
d38ceaf9
AD
3215 if (r)
3216 return r;
3217
6585661d
OZ
3218 /* doorbell bar mapping and doorbell index init */
3219 amdgpu_device_doorbell_init(adev);
3220
d38ceaf9
AD
3221 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
3222 /* this will fail for cards that aren't VGA class devices, just
3223 * ignore it */
06ec9070 3224 vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode);
d38ceaf9 3225
31af062a 3226 if (amdgpu_device_supports_boco(ddev))
3840c5bc
AD
3227 boco = true;
3228 if (amdgpu_has_atpx() &&
3229 (amdgpu_is_atpx_hybrid() ||
3230 amdgpu_has_atpx_dgpu_power_cntl()) &&
3231 !pci_is_thunderbolt_attached(adev->pdev))
84c8b22e 3232 vga_switcheroo_register_client(adev->pdev,
3840c5bc
AD
3233 &amdgpu_switcheroo_ops, boco);
3234 if (boco)
d38ceaf9
AD
3235 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3236
9475a943
SL
3237 if (amdgpu_emu_mode == 1) {
3238 /* post the asic on emulation mode */
3239 emu_soc_asic_init(adev);
bfca0289 3240 goto fence_driver_init;
9475a943 3241 }
bfca0289 3242
4e99a44e
ML
3243 /* detect if we are with an SRIOV vbios */
3244 amdgpu_device_detect_sriov_bios(adev);
048765ad 3245
95e8e59e
AD
3246 /* check if we need to reset the asic
3247 * E.g., driver was not cleanly unloaded previously, etc.
3248 */
f14899fd 3249 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
95e8e59e
AD
3250 r = amdgpu_asic_reset(adev);
3251 if (r) {
3252 dev_err(adev->dev, "asic reset on init failed\n");
3253 goto failed;
3254 }
3255 }
3256
c9a6b82f
AG
3257 pci_enable_pcie_error_reporting(adev->ddev.pdev);
3258
d38ceaf9 3259 /* Post card if necessary */
39c640c0 3260 if (amdgpu_device_need_post(adev)) {
d38ceaf9 3261 if (!adev->bios) {
bec86378 3262 dev_err(adev->dev, "no vBIOS found\n");
83ba126a
AD
3263 r = -EINVAL;
3264 goto failed;
d38ceaf9 3265 }
bec86378 3266 DRM_INFO("GPU posting now...\n");
4d2997ab 3267 r = amdgpu_device_asic_init(adev);
4e99a44e
ML
3268 if (r) {
3269 dev_err(adev->dev, "gpu post error!\n");
3270 goto failed;
3271 }
d38ceaf9
AD
3272 }
3273
88b64e95
AD
3274 if (adev->is_atom_fw) {
3275 /* Initialize clocks */
3276 r = amdgpu_atomfirmware_get_clock_info(adev);
3277 if (r) {
3278 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
e23b74aa 3279 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
88b64e95
AD
3280 goto failed;
3281 }
3282 } else {
a5bde2f9
AD
3283 /* Initialize clocks */
3284 r = amdgpu_atombios_get_clock_info(adev);
3285 if (r) {
3286 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
e23b74aa 3287 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
89041940 3288 goto failed;
a5bde2f9
AD
3289 }
3290 /* init i2c buses */
4562236b
HW
3291 if (!amdgpu_device_has_dc_support(adev))
3292 amdgpu_atombios_i2c_init(adev);
2c1a2784 3293 }
d38ceaf9 3294
bfca0289 3295fence_driver_init:
d38ceaf9
AD
3296 /* Fence driver */
3297 r = amdgpu_fence_driver_init(adev);
2c1a2784
AD
3298 if (r) {
3299 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
e23b74aa 3300 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
83ba126a 3301 goto failed;
2c1a2784 3302 }
d38ceaf9
AD
3303
3304 /* init the mode config */
4a580877 3305 drm_mode_config_init(adev_to_drm(adev));
d38ceaf9 3306
06ec9070 3307 r = amdgpu_device_ip_init(adev);
d38ceaf9 3308 if (r) {
8840a387 3309 /* failed in exclusive mode due to timeout */
3310 if (amdgpu_sriov_vf(adev) &&
3311 !amdgpu_sriov_runtime(adev) &&
3312 amdgpu_virt_mmio_blocked(adev) &&
3313 !amdgpu_virt_wait_reset(adev)) {
3314 dev_err(adev->dev, "VF exclusive mode timeout\n");
1daee8b4
PD
3315 /* Don't send request since VF is inactive. */
3316 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3317 adev->virt.ops = NULL;
8840a387 3318 r = -EAGAIN;
3319 goto failed;
3320 }
06ec9070 3321 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
e23b74aa 3322 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
83ba126a 3323 goto failed;
d38ceaf9
AD
3324 }
3325
d69b8971
YZ
3326 dev_info(adev->dev,
3327 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
d7f72fe4
YZ
3328 adev->gfx.config.max_shader_engines,
3329 adev->gfx.config.max_sh_per_se,
3330 adev->gfx.config.max_cu_per_sh,
3331 adev->gfx.cu_info.number);
3332
d38ceaf9
AD
3333 adev->accel_working = true;
3334
e59c0205
AX
3335 amdgpu_vm_check_compute_bug(adev);
3336
95844d20
MO
3337 /* Initialize the buffer migration limit. */
3338 if (amdgpu_moverate >= 0)
3339 max_MBps = amdgpu_moverate;
3340 else
3341 max_MBps = 8; /* Allow 8 MB/s. */
3342 /* Get a log2 for easy divisions. */
3343 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
3344
9bc92b9c
ML
3345 amdgpu_fbdev_init(adev);
3346
d2f52ac8 3347 r = amdgpu_pm_sysfs_init(adev);
7c868b59
YT
3348 if (r) {
3349 adev->pm_sysfs_en = false;
d2f52ac8 3350 DRM_ERROR("registering pm debugfs failed (%d).\n", r);
7c868b59
YT
3351 } else
3352 adev->pm_sysfs_en = true;
d2f52ac8 3353
5bb23532 3354 r = amdgpu_ucode_sysfs_init(adev);
7c868b59
YT
3355 if (r) {
3356 adev->ucode_sysfs_en = false;
5bb23532 3357 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
7c868b59
YT
3358 } else
3359 adev->ucode_sysfs_en = true;
5bb23532 3360
d38ceaf9
AD
3361 if ((amdgpu_testing & 1)) {
3362 if (adev->accel_working)
3363 amdgpu_test_moves(adev);
3364 else
3365 DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");
3366 }
d38ceaf9
AD
3367 if (amdgpu_benchmarking) {
3368 if (adev->accel_working)
3369 amdgpu_benchmark(adev, amdgpu_benchmarking);
3370 else
3371 DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
3372 }
3373
b0adca4d
EQ
3374 /*
3375 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
 3376	 * Otherwise the mgpu fan boost feature will be skipped because the
 3377	 * gpu instance count would come up too low.
3378 */
3379 amdgpu_register_gpu_instance(adev);
3380
d38ceaf9
AD
3381 /* enable clockgating, etc. after ib tests, etc. since some blocks require
3382 * explicit gating rather than handling it automatically.
3383 */
06ec9070 3384 r = amdgpu_device_ip_late_init(adev);
2c1a2784 3385 if (r) {
06ec9070 3386 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
e23b74aa 3387 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
83ba126a 3388 goto failed;
2c1a2784 3389 }
d38ceaf9 3390
108c6a63 3391 /* must succeed. */
511fdbc3 3392 amdgpu_ras_resume(adev);
108c6a63 3393
beff74bc
AD
3394 queue_delayed_work(system_wq, &adev->delayed_init_work,
3395 msecs_to_jiffies(AMDGPU_RESUME_MS));
3396
2c738637
ML
3397 if (amdgpu_sriov_vf(adev))
3398 flush_delayed_work(&adev->delayed_init_work);
3399
77f3a5cd 3400 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
bd607166 3401 if (r) {
77f3a5cd 3402 dev_err(adev->dev, "Could not create amdgpu device attr\n");
bd607166
KR
3403 return r;
3404 }
3405
d155bef0
AB
3406 if (IS_ENABLED(CONFIG_PERF_EVENTS))
3407 r = amdgpu_pmu_init(adev);
9c7c85f7
JK
3408 if (r)
3409 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3410
d38ceaf9 3411 return 0;
83ba126a
AD
3412
3413failed:
89041940 3414 amdgpu_vf_error_trans_all(adev);
3840c5bc 3415 if (boco)
83ba126a 3416 vga_switcheroo_fini_domain_pm_ops(adev->dev);
8840a387 3417
83ba126a 3418 return r;
d38ceaf9
AD
3419}
3420
d38ceaf9
AD
3421/**
3422 * amdgpu_device_fini - tear down the driver
3423 *
3424 * @adev: amdgpu_device pointer
3425 *
3426 * Tear down the driver info (all asics).
3427 * Called at driver shutdown.
3428 */
3429void amdgpu_device_fini(struct amdgpu_device *adev)
3430{
aac89168 3431 dev_info(adev->dev, "amdgpu: finishing device.\n");
9f875167 3432 flush_delayed_work(&adev->delayed_init_work);
d0d13fe8 3433 adev->shutdown = true;
9f875167 3434
752c683d
ML
 3435	/* make sure the IB test has finished before entering exclusive mode
 3436	 * to avoid preemption during the IB test
 3437	 */
3438 if (amdgpu_sriov_vf(adev))
3439 amdgpu_virt_request_full_gpu(adev, false);
3440
e5b03032
ML
3441 /* disable all interrupts */
3442 amdgpu_irq_disable_all(adev);
ff97cba8
ML
 3443	if (adev->mode_info.mode_config_initialized) {
3444 if (!amdgpu_device_has_dc_support(adev))
4a580877 3445 drm_helper_force_disable_all(adev_to_drm(adev));
ff97cba8 3446 else
4a580877 3447 drm_atomic_helper_shutdown(adev_to_drm(adev));
ff97cba8 3448 }
d38ceaf9 3449 amdgpu_fence_driver_fini(adev);
7c868b59
YT
3450 if (adev->pm_sysfs_en)
3451 amdgpu_pm_sysfs_fini(adev);
d38ceaf9 3452 amdgpu_fbdev_fini(adev);
e230ac11 3453 amdgpu_device_ip_fini(adev);
75e1658e
ND
3454 release_firmware(adev->firmware.gpu_info_fw);
3455 adev->firmware.gpu_info_fw = NULL;
d38ceaf9
AD
3456 adev->accel_working = false;
3457 /* free i2c buses */
4562236b
HW
3458 if (!amdgpu_device_has_dc_support(adev))
3459 amdgpu_i2c_fini(adev);
bfca0289
SL
3460
3461 if (amdgpu_emu_mode != 1)
3462 amdgpu_atombios_fini(adev);
3463
d38ceaf9
AD
3464 kfree(adev->bios);
3465 adev->bios = NULL;
3840c5bc
AD
3466 if (amdgpu_has_atpx() &&
3467 (amdgpu_is_atpx_hybrid() ||
3468 amdgpu_has_atpx_dgpu_power_cntl()) &&
3469 !pci_is_thunderbolt_attached(adev->pdev))
84c8b22e 3470 vga_switcheroo_unregister_client(adev->pdev);
4a580877 3471 if (amdgpu_device_supports_boco(adev_to_drm(adev)))
83ba126a 3472 vga_switcheroo_fini_domain_pm_ops(adev->dev);
d38ceaf9
AD
3473 vga_client_register(adev->pdev, NULL, NULL, NULL);
3474 if (adev->rio_mem)
3475 pci_iounmap(adev->pdev, adev->rio_mem);
3476 adev->rio_mem = NULL;
3477 iounmap(adev->rmmio);
3478 adev->rmmio = NULL;
06ec9070 3479 amdgpu_device_doorbell_fini(adev);
e9bc1bf7 3480
7c868b59
YT
3481 if (adev->ucode_sysfs_en)
3482 amdgpu_ucode_sysfs_fini(adev);
77f3a5cd
ND
3483
3484 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
d155bef0
AB
3485 if (IS_ENABLED(CONFIG_PERF_EVENTS))
3486 amdgpu_pmu_fini(adev);
72de33f8 3487 if (adev->mman.discovery_bin)
a190d1c7 3488 amdgpu_discovery_fini(adev);
d38ceaf9
AD
3489}
3490
3491
3492/*
3493 * Suspend & resume.
3494 */
3495/**
810ddc3a 3496 * amdgpu_device_suspend - initiate device suspend
d38ceaf9 3497 *
87e3f136 3498 * @dev: drm dev pointer
87e3f136 3499 * @fbcon: notify the fbdev of suspend
d38ceaf9
AD
3500 *
3501 * Puts the hw in the suspend state (all asics).
3502 * Returns 0 for success or an error on failure.
3503 * Called at driver suspend.
3504 */
de185019 3505int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
d38ceaf9
AD
3506{
3507 struct amdgpu_device *adev;
3508 struct drm_crtc *crtc;
3509 struct drm_connector *connector;
f8d2d39e 3510 struct drm_connector_list_iter iter;
5ceb54c6 3511 int r;
d38ceaf9 3512
1348969a 3513 adev = drm_to_adev(dev);
d38ceaf9
AD
3514
3515 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3516 return 0;
3517
44779b43 3518 adev->in_suspend = true;
d38ceaf9
AD
3519 drm_kms_helper_poll_disable(dev);
3520
5f818173
S
3521 if (fbcon)
3522 amdgpu_fbdev_set_suspend(adev, 1);
3523
beff74bc 3524 cancel_delayed_work_sync(&adev->delayed_init_work);
a5459475 3525
4562236b
HW
3526 if (!amdgpu_device_has_dc_support(adev)) {
3527 /* turn off display hw */
3528 drm_modeset_lock_all(dev);
f8d2d39e
LP
3529 drm_connector_list_iter_begin(dev, &iter);
3530 drm_for_each_connector_iter(connector, &iter)
3531 drm_helper_connector_dpms(connector,
3532 DRM_MODE_DPMS_OFF);
3533 drm_connector_list_iter_end(&iter);
4562236b 3534 drm_modeset_unlock_all(dev);
fe1053b7
AD
3535 /* unpin the front buffers and cursors */
3536 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3537 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3538 struct drm_framebuffer *fb = crtc->primary->fb;
3539 struct amdgpu_bo *robj;
3540
91334223 3541 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
fe1053b7
AD
3542 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3543 r = amdgpu_bo_reserve(aobj, true);
3544 if (r == 0) {
3545 amdgpu_bo_unpin(aobj);
3546 amdgpu_bo_unreserve(aobj);
3547 }
756e6880 3548 }
756e6880 3549
fe1053b7
AD
3550 if (fb == NULL || fb->obj[0] == NULL) {
3551 continue;
3552 }
3553 robj = gem_to_amdgpu_bo(fb->obj[0]);
3554 /* don't unpin kernel fb objects */
3555 if (!amdgpu_fbdev_robj_is_fb(adev, robj)) {
3556 r = amdgpu_bo_reserve(robj, true);
3557 if (r == 0) {
3558 amdgpu_bo_unpin(robj);
3559 amdgpu_bo_unreserve(robj);
3560 }
d38ceaf9
AD
3561 }
3562 }
3563 }
fe1053b7 3564
5e6932fe 3565 amdgpu_ras_suspend(adev);
3566
fe1053b7
AD
3567 r = amdgpu_device_ip_suspend_phase1(adev);
3568
94fa5660
EQ
3569 amdgpu_amdkfd_suspend(adev, !fbcon);
3570
d38ceaf9
AD
3571 /* evict vram memory */
3572 amdgpu_bo_evict_vram(adev);
3573
5ceb54c6 3574 amdgpu_fence_driver_suspend(adev);
d38ceaf9 3575
fe1053b7 3576 r = amdgpu_device_ip_suspend_phase2(adev);
d38ceaf9 3577
a0a71e49
AD
3578 /* evict remaining vram memory
3579 * This second call to evict vram is to evict the gart page table
3580 * using the CPU.
3581 */
d38ceaf9
AD
3582 amdgpu_bo_evict_vram(adev);
3583
d38ceaf9
AD
3584 return 0;
3585}
3586
3587/**
810ddc3a 3588 * amdgpu_device_resume - initiate device resume
d38ceaf9 3589 *
87e3f136 3590 * @dev: drm dev pointer
87e3f136 3591 * @fbcon: notify the fbdev of resume
d38ceaf9
AD
3592 *
3593 * Bring the hw back to operating state (all asics).
3594 * Returns 0 for success or an error on failure.
3595 * Called at driver resume.
3596 */
de185019 3597int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
d38ceaf9
AD
3598{
3599 struct drm_connector *connector;
f8d2d39e 3600 struct drm_connector_list_iter iter;
1348969a 3601 struct amdgpu_device *adev = drm_to_adev(dev);
756e6880 3602 struct drm_crtc *crtc;
03161a6e 3603 int r = 0;
d38ceaf9
AD
3604
3605 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3606 return 0;
3607
d38ceaf9 3608 /* post card */
39c640c0 3609 if (amdgpu_device_need_post(adev)) {
4d2997ab 3610 r = amdgpu_device_asic_init(adev);
74b0b157 3611 if (r)
aac89168 3612 dev_err(adev->dev, "amdgpu asic init failed\n");
74b0b157 3613 }
d38ceaf9 3614
06ec9070 3615 r = amdgpu_device_ip_resume(adev);
e6707218 3616 if (r) {
aac89168 3617 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
4d3b9ae5 3618 return r;
e6707218 3619 }
5ceb54c6
AD
3620 amdgpu_fence_driver_resume(adev);
3621
d38ceaf9 3622
06ec9070 3623 r = amdgpu_device_ip_late_init(adev);
03161a6e 3624 if (r)
4d3b9ae5 3625 return r;
d38ceaf9 3626
beff74bc
AD
3627 queue_delayed_work(system_wq, &adev->delayed_init_work,
3628 msecs_to_jiffies(AMDGPU_RESUME_MS));
3629
fe1053b7
AD
3630 if (!amdgpu_device_has_dc_support(adev)) {
3631 /* pin cursors */
3632 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3633 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3634
91334223 3635 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
fe1053b7
AD
3636 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3637 r = amdgpu_bo_reserve(aobj, true);
3638 if (r == 0) {
3639 r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM);
3640 if (r != 0)
aac89168 3641 dev_err(adev->dev, "Failed to pin cursor BO (%d)\n", r);
fe1053b7
AD
3642 amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj);
3643 amdgpu_bo_unreserve(aobj);
3644 }
756e6880
AD
3645 }
3646 }
3647 }
9593f4d6 3648 r = amdgpu_amdkfd_resume(adev, !fbcon);
ba997709
YZ
3649 if (r)
3650 return r;
756e6880 3651
96a5d8d4 3652 /* Make sure IB tests flushed */
beff74bc 3653 flush_delayed_work(&adev->delayed_init_work);
96a5d8d4 3654
d38ceaf9
AD
3655 /* blat the mode back in */
3656 if (fbcon) {
4562236b
HW
3657 if (!amdgpu_device_has_dc_support(adev)) {
3658 /* pre DCE11 */
3659 drm_helper_resume_force_mode(dev);
3660
3661 /* turn on display hw */
3662 drm_modeset_lock_all(dev);
f8d2d39e
LP
3663
3664 drm_connector_list_iter_begin(dev, &iter);
3665 drm_for_each_connector_iter(connector, &iter)
3666 drm_helper_connector_dpms(connector,
3667 DRM_MODE_DPMS_ON);
3668 drm_connector_list_iter_end(&iter);
3669
4562236b 3670 drm_modeset_unlock_all(dev);
d38ceaf9 3671 }
4d3b9ae5 3672 amdgpu_fbdev_set_suspend(adev, 0);
d38ceaf9
AD
3673 }
3674
3675 drm_kms_helper_poll_enable(dev);
23a1a9e5 3676
5e6932fe 3677 amdgpu_ras_resume(adev);
3678
23a1a9e5
L
3679 /*
3680 * Most of the connector probing functions try to acquire runtime pm
3681 * refs to ensure that the GPU is powered on when connector polling is
3682 * performed. Since we're calling this from a runtime PM callback,
3683 * trying to acquire rpm refs will cause us to deadlock.
3684 *
3685 * Since we're guaranteed to be holding the rpm lock, it's safe to
3686 * temporarily disable the rpm helpers so this doesn't deadlock us.
3687 */
3688#ifdef CONFIG_PM
3689 dev->dev->power.disable_depth++;
3690#endif
4562236b
HW
3691 if (!amdgpu_device_has_dc_support(adev))
3692 drm_helper_hpd_irq_event(dev);
3693 else
3694 drm_kms_helper_hotplug_event(dev);
23a1a9e5
L
3695#ifdef CONFIG_PM
3696 dev->dev->power.disable_depth--;
3697#endif
44779b43
RZ
3698 adev->in_suspend = false;
3699
4d3b9ae5 3700 return 0;
d38ceaf9
AD
3701}
3702
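The two entry points above are normally driven from the driver's power-management callbacks. Below is a minimal, illustrative sketch of how a hypothetical caller might pair them; the callback names and the dev_get_drvdata() wiring are assumptions for the example, not the driver's actual code.

/* Illustrative sketch only: hypothetical PM callbacks pairing suspend/resume. */
static int example_pmops_suspend(struct device *dev)
{
	struct drm_device *drm_dev = dev_get_drvdata(dev);

	/* fbcon = true: also tell the fbdev console about the transition */
	return amdgpu_device_suspend(drm_dev, true);
}

static int example_pmops_resume(struct device *dev)
{
	struct drm_device *drm_dev = dev_get_drvdata(dev);

	return amdgpu_device_resume(drm_dev, true);
}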
e3ecdffa
AD
3703/**
3704 * amdgpu_device_ip_check_soft_reset - did soft reset succeed
3705 *
3706 * @adev: amdgpu_device pointer
3707 *
3708 * The list of all the hardware IPs that make up the asic is walked and
3709 * the check_soft_reset callbacks are run. check_soft_reset determines
3710 * if the asic is still hung or not.
3711 * Returns true if any of the IPs are still in a hung state, false if not.
3712 */
06ec9070 3713static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
63fbf42f
CZ
3714{
3715 int i;
3716 bool asic_hang = false;
3717
f993d628
ML
3718 if (amdgpu_sriov_vf(adev))
3719 return true;
3720
8bc04c29
AD
3721 if (amdgpu_asic_need_full_reset(adev))
3722 return true;
3723
63fbf42f 3724 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 3725 if (!adev->ip_blocks[i].status.valid)
63fbf42f 3726 continue;
a1255107
AD
3727 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
3728 adev->ip_blocks[i].status.hang =
3729 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
3730 if (adev->ip_blocks[i].status.hang) {
aac89168 3731 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
63fbf42f
CZ
3732 asic_hang = true;
3733 }
3734 }
3735 return asic_hang;
3736}
3737
e3ecdffa
AD
3738/**
3739 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
3740 *
3741 * @adev: amdgpu_device pointer
3742 *
3743 * The list of all the hardware IPs that make up the asic is walked and the
3744 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset
3745 * handles any IP specific hardware or software state changes that are
3746 * necessary for a soft reset to succeed.
3747 * Returns 0 on success, negative error code on failure.
3748 */
06ec9070 3749static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
d31a501e
CZ
3750{
3751 int i, r = 0;
3752
3753 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 3754 if (!adev->ip_blocks[i].status.valid)
d31a501e 3755 continue;
a1255107
AD
3756 if (adev->ip_blocks[i].status.hang &&
3757 adev->ip_blocks[i].version->funcs->pre_soft_reset) {
3758 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
d31a501e
CZ
3759 if (r)
3760 return r;
3761 }
3762 }
3763
3764 return 0;
3765}
3766
e3ecdffa
AD
3767/**
3768 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
3769 *
3770 * @adev: amdgpu_device pointer
3771 *
3772 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu
3773 * reset is necessary to recover.
3774 * Returns true if a full asic reset is required, false if not.
3775 */
06ec9070 3776static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
35d782fe 3777{
da146d3b
AD
3778 int i;
3779
8bc04c29
AD
3780 if (amdgpu_asic_need_full_reset(adev))
3781 return true;
3782
da146d3b 3783 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 3784 if (!adev->ip_blocks[i].status.valid)
da146d3b 3785 continue;
a1255107
AD
3786 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
3787 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
3788 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
98512bb8
KW
3789 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
3790 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
a1255107 3791 if (adev->ip_blocks[i].status.hang) {
aac89168 3792 dev_info(adev->dev, "Some block need full reset!\n");
da146d3b
AD
3793 return true;
3794 }
3795 }
35d782fe
CZ
3796 }
3797 return false;
3798}
3799
e3ecdffa
AD
3800/**
3801 * amdgpu_device_ip_soft_reset - do a soft reset
3802 *
3803 * @adev: amdgpu_device pointer
3804 *
3805 * The list of all the hardware IPs that make up the asic is walked and the
3806 * soft_reset callbacks are run if the block is hung. soft_reset handles any
3807 * IP specific hardware or software state changes that are necessary to soft
3808 * reset the IP.
3809 * Returns 0 on success, negative error code on failure.
3810 */
06ec9070 3811static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
35d782fe
CZ
3812{
3813 int i, r = 0;
3814
3815 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 3816 if (!adev->ip_blocks[i].status.valid)
35d782fe 3817 continue;
a1255107
AD
3818 if (adev->ip_blocks[i].status.hang &&
3819 adev->ip_blocks[i].version->funcs->soft_reset) {
3820 r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
35d782fe
CZ
3821 if (r)
3822 return r;
3823 }
3824 }
3825
3826 return 0;
3827}
3828
e3ecdffa
AD
3829/**
3830 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
3831 *
3832 * @adev: amdgpu_device pointer
3833 *
3834 * The list of all the hardware IPs that make up the asic is walked and the
3835 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
3836 * handles any IP specific hardware or software state changes that are
3837 * necessary after the IP has been soft reset.
3838 * Returns 0 on success, negative error code on failure.
3839 */
06ec9070 3840static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
35d782fe
CZ
3841{
3842 int i, r = 0;
3843
3844 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 3845 if (!adev->ip_blocks[i].status.valid)
35d782fe 3846 continue;
a1255107
AD
3847 if (adev->ip_blocks[i].status.hang &&
3848 adev->ip_blocks[i].version->funcs->post_soft_reset)
3849 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
35d782fe
CZ
3850 if (r)
3851 return r;
3852 }
3853
3854 return 0;
3855}
3856
e3ecdffa 3857/**
c33adbc7 3858 * amdgpu_device_recover_vram - Recover some VRAM contents
e3ecdffa
AD
3859 *
3860 * @adev: amdgpu_device pointer
3861 *
3862 * Restores the contents of VRAM buffers from the shadows in GTT. Used to
3863 * restore things like GPUVM page tables after a GPU reset where
3864 * the contents of VRAM might be lost.
403009bf
CK
3865 *
3866 * Returns:
3867 * 0 on success, negative error code on failure.
e3ecdffa 3868 */
c33adbc7 3869static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
c41d1cf6 3870{
c41d1cf6 3871 struct dma_fence *fence = NULL, *next = NULL;
403009bf
CK
3872 struct amdgpu_bo *shadow;
3873 long r = 1, tmo;
c41d1cf6
ML
3874
3875 if (amdgpu_sriov_runtime(adev))
b045d3af 3876 tmo = msecs_to_jiffies(8000);
c41d1cf6
ML
3877 else
3878 tmo = msecs_to_jiffies(100);
3879
aac89168 3880 dev_info(adev->dev, "recover vram bo from shadow start\n");
c41d1cf6 3881 mutex_lock(&adev->shadow_list_lock);
403009bf
CK
3882 list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {
3883
3884 /* No need to recover an evicted BO */
3885 if (shadow->tbo.mem.mem_type != TTM_PL_TT ||
b575f10d 3886 shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET ||
403009bf
CK
3887 shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM)
3888 continue;
3889
3890 r = amdgpu_bo_restore_shadow(shadow, &next);
3891 if (r)
3892 break;
3893
c41d1cf6 3894 if (fence) {
1712fb1a 3895 tmo = dma_fence_wait_timeout(fence, false, tmo);
403009bf
CK
3896 dma_fence_put(fence);
3897 fence = next;
1712fb1a 3898 if (tmo == 0) {
3899 r = -ETIMEDOUT;
c41d1cf6 3900 break;
1712fb1a 3901 } else if (tmo < 0) {
3902 r = tmo;
3903 break;
3904 }
403009bf
CK
3905 } else {
3906 fence = next;
c41d1cf6 3907 }
c41d1cf6
ML
3908 }
3909 mutex_unlock(&adev->shadow_list_lock);
3910
403009bf
CK
3911 if (fence)
3912 tmo = dma_fence_wait_timeout(fence, false, tmo);
c41d1cf6
ML
3913 dma_fence_put(fence);
3914
1712fb1a 3915 if (r < 0 || tmo <= 0) {
aac89168 3916 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
403009bf
CK
3917 return -EIO;
3918 }
c41d1cf6 3919
aac89168 3920 dev_info(adev->dev, "recover vram bo from shadow done\n");
403009bf 3921 return 0;
c41d1cf6
ML
3922}
3923
a90ad3c2 3924
e3ecdffa 3925/**
06ec9070 3926 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
5740682e
ML
3927 *
3928 * @adev: amdgpu device pointer
87e3f136 3929 * @from_hypervisor: request from hypervisor
5740682e
ML
3930 *
 3931 * Do a VF FLR and reinitialize the ASIC.
3f48c681 3932 * Returns 0 on success, an error code otherwise.
e3ecdffa
AD
3933 */
3934static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
3935 bool from_hypervisor)
5740682e
ML
3936{
3937 int r;
3938
3939 if (from_hypervisor)
3940 r = amdgpu_virt_request_full_gpu(adev, true);
3941 else
3942 r = amdgpu_virt_reset_gpu(adev);
3943 if (r)
3944 return r;
a90ad3c2 3945
b639c22c
JZ
3946 amdgpu_amdkfd_pre_reset(adev);
3947
a90ad3c2 3948 /* Resume IP prior to SMC */
06ec9070 3949 r = amdgpu_device_ip_reinit_early_sriov(adev);
5740682e
ML
3950 if (r)
3951 goto error;
a90ad3c2 3952
c9ffa427 3953 amdgpu_virt_init_data_exchange(adev);
a90ad3c2 3954 /* we need recover gart prior to run SMC/CP/SDMA resume */
6c28aed6 3955 amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
a90ad3c2 3956
7a3e0bb2
RZ
3957 r = amdgpu_device_fw_loading(adev);
3958 if (r)
3959 return r;
3960
a90ad3c2 3961 /* now we are okay to resume SMC/CP/SDMA */
06ec9070 3962 r = amdgpu_device_ip_reinit_late_sriov(adev);
5740682e
ML
3963 if (r)
3964 goto error;
a90ad3c2
ML
3965
3966 amdgpu_irq_gpu_reset_resume_helper(adev);
5740682e 3967 r = amdgpu_ib_ring_tests(adev);
f81e8d53 3968 amdgpu_amdkfd_post_reset(adev);
a90ad3c2 3969
abc34253
ED
3970error:
3971 amdgpu_virt_release_full_gpu(adev, true);
c41d1cf6 3972 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
e3526257 3973 amdgpu_inc_vram_lost(adev);
c33adbc7 3974 r = amdgpu_device_recover_vram(adev);
a90ad3c2
ML
3975 }
3976
3977 return r;
3978}
3979
9a1cddd6 3980/**
3981 * amdgpu_device_has_job_running - check if there is any job in mirror list
3982 *
3983 * @adev: amdgpu device pointer
3984 *
3985 * check if there is any job in mirror list
3986 */
3987bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
3988{
3989 int i;
3990 struct drm_sched_job *job;
3991
3992 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
3993 struct amdgpu_ring *ring = adev->rings[i];
3994
3995 if (!ring || !ring->sched.thread)
3996 continue;
3997
3998 spin_lock(&ring->sched.job_list_lock);
3999 job = list_first_entry_or_null(&ring->sched.ring_mirror_list,
4000 struct drm_sched_job, node);
4001 spin_unlock(&ring->sched.job_list_lock);
4002 if (job)
4003 return true;
4004 }
4005 return false;
4006}
4007
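As a hedged usage sketch, a caller could consult amdgpu_device_has_job_running() before committing to an expensive recovery attempt; the wrapper below is invented purely for illustration.

/* Illustrative sketch only: skip recovery work when no job is in flight. */
static bool example_recovery_worthwhile(struct amdgpu_device *adev)
{
	if (!amdgpu_device_has_job_running(adev)) {
		dev_info(adev->dev, "no job in the mirror lists, nothing to recover\n");
		return false;
	}
	return true;
}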
12938fad
CK
4008/**
4009 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4010 *
4011 * @adev: amdgpu device pointer
4012 *
4013 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4014 * a hung GPU.
4015 */
4016bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4017{
4018 if (!amdgpu_device_ip_check_soft_reset(adev)) {
aac89168 4019 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
12938fad
CK
4020 return false;
4021 }
4022
3ba7b418
AG
4023 if (amdgpu_gpu_recovery == 0)
4024 goto disabled;
4025
4026 if (amdgpu_sriov_vf(adev))
4027 return true;
4028
4029 if (amdgpu_gpu_recovery == -1) {
4030 switch (adev->asic_type) {
fc42d47c
AG
4031 case CHIP_BONAIRE:
4032 case CHIP_HAWAII:
3ba7b418
AG
4033 case CHIP_TOPAZ:
4034 case CHIP_TONGA:
4035 case CHIP_FIJI:
4036 case CHIP_POLARIS10:
4037 case CHIP_POLARIS11:
4038 case CHIP_POLARIS12:
4039 case CHIP_VEGAM:
4040 case CHIP_VEGA20:
4041 case CHIP_VEGA10:
4042 case CHIP_VEGA12:
c43b849f 4043 case CHIP_RAVEN:
e9d4cf91 4044 case CHIP_ARCTURUS:
2cb44fb0 4045 case CHIP_RENOIR:
658c6639
AD
4046 case CHIP_NAVI10:
4047 case CHIP_NAVI14:
4048 case CHIP_NAVI12:
131a3c74 4049 case CHIP_SIENNA_CICHLID:
3ba7b418
AG
4050 break;
4051 default:
4052 goto disabled;
4053 }
12938fad
CK
4054 }
4055
4056 return true;
3ba7b418
AG
4057
4058disabled:
aac89168 4059 dev_info(adev->dev, "GPU recovery disabled.\n");
3ba7b418 4060 return false;
12938fad
CK
4061}
4062
5c6dd71e 4063
26bc5340
AG
4064static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4065 struct amdgpu_job *job,
4066 bool *need_full_reset_arg)
4067{
4068 int i, r = 0;
4069 bool need_full_reset = *need_full_reset_arg;
71182665 4070
728e7e0c
JZ
4071 amdgpu_debugfs_wait_dump(adev);
4072
71182665 4073 /* block all schedulers and reset given job's ring */
0875dc9e
CZ
4074 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4075 struct amdgpu_ring *ring = adev->rings[i];
4076
51687759 4077 if (!ring || !ring->sched.thread)
0875dc9e 4078 continue;
5740682e 4079
2f9d4084
ML
4080 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4081 amdgpu_fence_driver_force_completion(ring);
0875dc9e 4082 }
d38ceaf9 4083
222b5f04
AG
 4084	if (job)
4085 drm_sched_increase_karma(&job->base);
4086
1d721ed6 4087 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
26bc5340
AG
4088 if (!amdgpu_sriov_vf(adev)) {
4089
4090 if (!need_full_reset)
4091 need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4092
4093 if (!need_full_reset) {
4094 amdgpu_device_ip_pre_soft_reset(adev);
4095 r = amdgpu_device_ip_soft_reset(adev);
4096 amdgpu_device_ip_post_soft_reset(adev);
4097 if (r || amdgpu_device_ip_check_soft_reset(adev)) {
aac89168 4098 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
26bc5340
AG
4099 need_full_reset = true;
4100 }
4101 }
4102
4103 if (need_full_reset)
4104 r = amdgpu_device_ip_suspend(adev);
4105
4106 *need_full_reset_arg = need_full_reset;
4107 }
4108
4109 return r;
4110}
4111
041a62bc 4112static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
26bc5340
AG
4113 struct list_head *device_list_handle,
4114 bool *need_full_reset_arg)
4115{
4116 struct amdgpu_device *tmp_adev = NULL;
4117 bool need_full_reset = *need_full_reset_arg, vram_lost = false;
4118 int r = 0;
4119
4120 /*
 4121	 * ASIC reset has to be done on all XGMI hive nodes ASAP
 4122	 * to allow proper link negotiation in FW (within 1 sec)
4123 */
4124 if (need_full_reset) {
4125 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
041a62bc 4126 /* For XGMI run all resets in parallel to speed up the process */
d4535e2c 4127 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
c96cf282 4128 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
d4535e2c
AG
4129 r = -EALREADY;
4130 } else
4131 r = amdgpu_asic_reset(tmp_adev);
d4535e2c 4132
041a62bc 4133 if (r) {
aac89168 4134 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
4a580877 4135 r, adev_to_drm(tmp_adev)->unique);
041a62bc 4136 break;
ce316fa5
LM
4137 }
4138 }
4139
041a62bc
AG
4140 /* For XGMI wait for all resets to complete before proceed */
4141 if (!r) {
ce316fa5
LM
4142 list_for_each_entry(tmp_adev, device_list_handle,
4143 gmc.xgmi.head) {
4144 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4145 flush_work(&tmp_adev->xgmi_reset_work);
4146 r = tmp_adev->asic_reset_res;
4147 if (r)
4148 break;
ce316fa5
LM
4149 }
4150 }
4151 }
ce316fa5 4152 }
26bc5340 4153
43c4d576
JC
4154 if (!r && amdgpu_ras_intr_triggered()) {
4155 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4156 if (tmp_adev->mmhub.funcs &&
4157 tmp_adev->mmhub.funcs->reset_ras_error_count)
4158 tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev);
4159 }
4160
00eaa571 4161 amdgpu_ras_intr_cleared();
43c4d576 4162 }
00eaa571 4163
26bc5340
AG
4164 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4165 if (need_full_reset) {
4166 /* post card */
4d2997ab 4167 if (amdgpu_device_asic_init(tmp_adev))
aac89168 4168 dev_warn(tmp_adev->dev, "asic atom init failed!");
26bc5340
AG
4169
4170 if (!r) {
4171 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4172 r = amdgpu_device_ip_resume_phase1(tmp_adev);
4173 if (r)
4174 goto out;
4175
4176 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4177 if (vram_lost) {
77e7f829 4178 DRM_INFO("VRAM is lost due to GPU reset!\n");
e3526257 4179 amdgpu_inc_vram_lost(tmp_adev);
26bc5340
AG
4180 }
4181
6c28aed6 4182 r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
26bc5340
AG
4183 if (r)
4184 goto out;
4185
4186 r = amdgpu_device_fw_loading(tmp_adev);
4187 if (r)
4188 return r;
4189
4190 r = amdgpu_device_ip_resume_phase2(tmp_adev);
4191 if (r)
4192 goto out;
4193
4194 if (vram_lost)
4195 amdgpu_device_fill_reset_magic(tmp_adev);
4196
fdafb359
EQ
4197 /*
 4198			 * Add this ASIC back as tracked, since the reset
 4199			 * already completed successfully.
4200 */
4201 amdgpu_register_gpu_instance(tmp_adev);
4202
7c04ca50 4203 r = amdgpu_device_ip_late_init(tmp_adev);
4204 if (r)
4205 goto out;
4206
565d1941
EQ
4207 amdgpu_fbdev_set_suspend(tmp_adev, 0);
4208
e8fbaf03
GC
4209 /*
 4210			 * The GPU enters a bad state once the number of faulty
 4211			 * pages detected by ECC has reached the threshold, and
 4212			 * RAS recovery is scheduled next. So add a check here
 4213			 * to break recovery if the bad page threshold has
 4214			 * indeed been exceeded, and remind the user to retire
 4215			 * this GPU or set a bigger bad_page_threshold value
 4216			 * to fix this the next time the driver is
 4217			 * probed again.
4218 */
4219 if (!amdgpu_ras_check_err_threshold(tmp_adev)) {
4220 /* must succeed. */
4221 amdgpu_ras_resume(tmp_adev);
4222 } else {
4223 r = -EINVAL;
4224 goto out;
4225 }
e79a04d5 4226
26bc5340
AG
4227 /* Update PSP FW topology after reset */
4228 if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4229 r = amdgpu_xgmi_update_topology(hive, tmp_adev);
4230 }
4231 }
4232
26bc5340
AG
4233out:
4234 if (!r) {
4235 amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4236 r = amdgpu_ib_ring_tests(tmp_adev);
4237 if (r) {
4238 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
4239 r = amdgpu_device_ip_suspend(tmp_adev);
4240 need_full_reset = true;
4241 r = -EAGAIN;
4242 goto end;
4243 }
4244 }
4245
4246 if (!r)
4247 r = amdgpu_device_recover_vram(tmp_adev);
4248 else
4249 tmp_adev->asic_reset_res = r;
4250 }
4251
4252end:
4253 *need_full_reset_arg = need_full_reset;
4254 return r;
4255}
4256
08ebb485
DL
4257static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
4258 struct amdgpu_hive_info *hive)
26bc5340 4259{
53b3f8f4
DL
4260 if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
4261 return false;
4262
08ebb485
DL
4263 if (hive) {
4264 down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
4265 } else {
4266 down_write(&adev->reset_sem);
4267 }
5740682e 4268
26bc5340 4269 atomic_inc(&adev->gpu_reset_counter);
a3a09142
AD
4270 switch (amdgpu_asic_reset_method(adev)) {
4271 case AMD_RESET_METHOD_MODE1:
4272 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
4273 break;
4274 case AMD_RESET_METHOD_MODE2:
4275 adev->mp1_state = PP_MP1_STATE_RESET;
4276 break;
4277 default:
4278 adev->mp1_state = PP_MP1_STATE_NONE;
4279 break;
4280 }
1d721ed6
AG
4281
4282 return true;
26bc5340 4283}
d38ceaf9 4284
26bc5340
AG
4285static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
4286{
89041940 4287 amdgpu_vf_error_trans_all(adev);
a3a09142 4288 adev->mp1_state = PP_MP1_STATE_NONE;
53b3f8f4 4289 atomic_set(&adev->in_gpu_reset, 0);
6049db43 4290 up_write(&adev->reset_sem);
26bc5340
AG
4291}
4292
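amdgpu_device_lock_adev() and amdgpu_device_unlock_adev() are meant to bracket any code that owns the reset state. A small illustrative sketch of the intended pairing, with an invented helper name:

/* Illustrative sketch only: bracket reset-sensitive work with the helpers above. */
static int example_with_reset_lock(struct amdgpu_device *adev,
				   struct amdgpu_hive_info *hive)
{
	if (!amdgpu_device_lock_adev(adev, hive))
		return -EBUSY;	/* another reset already owns the device */

	/* ... reset/recovery work protected by adev->reset_sem goes here ... */

	amdgpu_device_unlock_adev(adev);
	return 0;
}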
3f12acc8
EQ
4293static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
4294{
4295 struct pci_dev *p = NULL;
4296
4297 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4298 adev->pdev->bus->number, 1);
4299 if (p) {
4300 pm_runtime_enable(&(p->dev));
4301 pm_runtime_resume(&(p->dev));
4302 }
4303}
4304
4305static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
4306{
4307 enum amd_reset_method reset_method;
4308 struct pci_dev *p = NULL;
4309 u64 expires;
4310
4311 /*
4312 * For now, only BACO and mode1 reset are confirmed
4313 * to suffer the audio issue without proper suspended.
4314 */
4315 reset_method = amdgpu_asic_reset_method(adev);
4316 if ((reset_method != AMD_RESET_METHOD_BACO) &&
4317 (reset_method != AMD_RESET_METHOD_MODE1))
4318 return -EINVAL;
4319
4320 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4321 adev->pdev->bus->number, 1);
4322 if (!p)
4323 return -ENODEV;
4324
4325 expires = pm_runtime_autosuspend_expiration(&(p->dev));
4326 if (!expires)
4327 /*
 4328		 * If we cannot get the audio device autosuspend delay,
 4329		 * a fixed 4s interval will be used. Considering that 3s
 4330		 * is the audio controller's default autosuspend delay,
 4331		 * the 4s used here is guaranteed to cover it.
4332 */
54b7feb9 4333 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
3f12acc8
EQ
4334
4335 while (!pm_runtime_status_suspended(&(p->dev))) {
4336 if (!pm_runtime_suspend(&(p->dev)))
4337 break;
4338
4339 if (expires < ktime_get_mono_fast_ns()) {
4340 dev_warn(adev->dev, "failed to suspend display audio\n");
4341 /* TODO: abort the succeeding gpu reset? */
4342 return -ETIMEDOUT;
4343 }
4344 }
4345
4346 pm_runtime_disable(&(p->dev));
4347
4348 return 0;
4349}
4350
26bc5340
AG
4351/**
4352 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
4353 *
4354 * @adev: amdgpu device pointer
 4355 * @job: which job triggered the hang
4356 *
4357 * Attempt to reset the GPU if it has hung (all asics).
 4358 * Attempt to do a soft reset or a full reset and reinitialize the ASIC.
4359 * Returns 0 for success or an error on failure.
4360 */
4361
4362int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
4363 struct amdgpu_job *job)
4364{
1d721ed6 4365 struct list_head device_list, *device_list_handle = NULL;
7dd8c205
EQ
4366 bool need_full_reset = false;
4367 bool job_signaled = false;
26bc5340 4368 struct amdgpu_hive_info *hive = NULL;
26bc5340 4369 struct amdgpu_device *tmp_adev = NULL;
1d721ed6 4370 int i, r = 0;
bb5c7235 4371 bool need_emergency_restart = false;
3f12acc8 4372 bool audio_suspended = false;
26bc5340 4373
bb5c7235
WS
4374 /**
4375 * Special case: RAS triggered and full reset isn't supported
4376 */
4377 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
4378
d5ea093e
AG
4379 /*
4380 * Flush RAM to disk so that after reboot
4381 * the user can read log and see why the system rebooted.
4382 */
bb5c7235 4383 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
d5ea093e
AG
4384 DRM_WARN("Emergency reboot.");
4385
4386 ksys_sync_helper();
4387 emergency_restart();
4388 }
4389
b823821f 4390 dev_info(adev->dev, "GPU %s begin!\n",
bb5c7235 4391 need_emergency_restart ? "jobs stop":"reset");
26bc5340
AG
4392
4393 /*
1d721ed6
AG
 4394	 * Here we trylock to avoid a chain of resets executing from
 4395	 * either jobs triggered on different adevs in an XGMI hive or jobs on
 4396	 * different schedulers for the same device while this TO handler is running.
 4397	 * We always reset all schedulers for a device and all devices in an XGMI
 4398	 * hive, so that should take care of them too.
26bc5340 4399 */
d95e8e97 4400 hive = amdgpu_get_xgmi_hive(adev);
53b3f8f4
DL
4401 if (hive) {
4402 if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
4403 DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
4404 job ? job->base.id : -1, hive->hive_id);
d95e8e97 4405 amdgpu_put_xgmi_hive(hive);
53b3f8f4
DL
4406 return 0;
4407 }
4408 mutex_lock(&hive->hive_lock);
1d721ed6 4409 }
26bc5340 4410
9e94d22c
EQ
4411 /*
4412 * Build list of devices to reset.
4413 * In case we are in XGMI hive mode, resort the device list
4414 * to put adev in the 1st position.
4415 */
4416 INIT_LIST_HEAD(&device_list);
4417 if (adev->gmc.xgmi.num_physical_nodes > 1) {
4418 if (!hive)
26bc5340 4419 return -ENODEV;
9e94d22c
EQ
4420 if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
4421 list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
26bc5340
AG
4422 device_list_handle = &hive->device_list;
4423 } else {
4424 list_add_tail(&adev->gmc.xgmi.head, &device_list);
4425 device_list_handle = &device_list;
4426 }
4427
1d721ed6
AG
4428 /* block all schedulers and reset given job's ring */
4429 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
08ebb485 4430 if (!amdgpu_device_lock_adev(tmp_adev, hive)) {
aac89168 4431 dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
9e94d22c 4432 job ? job->base.id : -1);
cbfd17f7
DL
4433 r = 0;
4434 goto skip_recovery;
7c6e68c7
AG
4435 }
4436
3f12acc8
EQ
4437 /*
4438 * Try to put the audio codec into suspend state
 4439		 * before the gpu reset is started.
 4440		 *
 4441		 * The power domain of the graphics device is
 4442		 * shared with the AZ power domain. Without this,
 4443		 * we may change the audio hardware from behind
 4444		 * the audio driver's back. That would trigger
 4445		 * some audio codec errors.
4446 */
4447 if (!amdgpu_device_suspend_display_audio(tmp_adev))
4448 audio_suspended = true;
4449
9e94d22c
EQ
4450 amdgpu_ras_set_error_query_ready(tmp_adev, false);
4451
52fb44cf
EQ
4452 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
4453
9e94d22c
EQ
4454 if (!amdgpu_sriov_vf(tmp_adev))
4455 amdgpu_amdkfd_pre_reset(tmp_adev);
4456
12ffa55d
AG
4457 /*
 4458		 * Mark these ASICs to be reset as untracked first,
 4459		 * and add them back after the reset has completed.
4460 */
4461 amdgpu_unregister_gpu_instance(tmp_adev);
4462
a2f63ee8 4463 amdgpu_fbdev_set_suspend(tmp_adev, 1);
565d1941 4464
f1c1314b 4465 /* disable ras on ALL IPs */
bb5c7235 4466 if (!need_emergency_restart &&
b823821f 4467 amdgpu_device_ip_need_full_reset(tmp_adev))
f1c1314b 4468 amdgpu_ras_suspend(tmp_adev);
4469
1d721ed6
AG
4470 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4471 struct amdgpu_ring *ring = tmp_adev->rings[i];
4472
4473 if (!ring || !ring->sched.thread)
4474 continue;
4475
0b2d2c2e 4476 drm_sched_stop(&ring->sched, job ? &job->base : NULL);
7c6e68c7 4477
bb5c7235 4478 if (need_emergency_restart)
7c6e68c7 4479 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
1d721ed6
AG
4480 }
4481 }
4482
bb5c7235 4483 if (need_emergency_restart)
7c6e68c7
AG
4484 goto skip_sched_resume;
4485
1d721ed6
AG
4486 /*
4487 * Must check guilty signal here since after this point all old
4488 * HW fences are force signaled.
4489 *
4490 * job->base holds a reference to parent fence
4491 */
4492 if (job && job->base.s_fence->parent &&
7dd8c205 4493 dma_fence_is_signaled(job->base.s_fence->parent)) {
1d721ed6 4494 job_signaled = true;
1d721ed6
AG
4495 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
4496 goto skip_hw_reset;
4497 }
4498
26bc5340
AG
4499retry: /* Rest of adevs pre asic reset from XGMI hive. */
4500 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
26bc5340
AG
4501 r = amdgpu_device_pre_asic_reset(tmp_adev,
4502 NULL,
4503 &need_full_reset);
 4504		/* TODO: Should we stop? */
4505 if (r) {
aac89168 4506 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
4a580877 4507 r, adev_to_drm(tmp_adev)->unique);
26bc5340
AG
4508 tmp_adev->asic_reset_res = r;
4509 }
4510 }
4511
4512 /* Actual ASIC resets if needed.*/
4513 /* TODO Implement XGMI hive reset logic for SRIOV */
4514 if (amdgpu_sriov_vf(adev)) {
4515 r = amdgpu_device_reset_sriov(adev, job ? false : true);
4516 if (r)
4517 adev->asic_reset_res = r;
4518 } else {
041a62bc 4519 r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset);
26bc5340
AG
4520 if (r && r == -EAGAIN)
4521 goto retry;
4522 }
4523
1d721ed6
AG
4524skip_hw_reset:
4525
26bc5340
AG
4526 /* Post ASIC reset for all devs .*/
4527 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
7c6e68c7 4528
1d721ed6
AG
4529 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4530 struct amdgpu_ring *ring = tmp_adev->rings[i];
4531
4532 if (!ring || !ring->sched.thread)
4533 continue;
4534
 4535			/* No point in resubmitting jobs if we didn't do a HW reset */
4536 if (!tmp_adev->asic_reset_res && !job_signaled)
4537 drm_sched_resubmit_jobs(&ring->sched);
4538
4539 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
4540 }
4541
4542 if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
4a580877 4543 drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
1d721ed6
AG
4544 }
4545
4546 tmp_adev->asic_reset_res = 0;
26bc5340
AG
4547
4548 if (r) {
4549 /* bad news, how to tell it to userspace ? */
12ffa55d 4550 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
26bc5340
AG
4551 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
4552 } else {
12ffa55d 4553 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
26bc5340 4554 }
7c6e68c7 4555 }
26bc5340 4556
7c6e68c7
AG
4557skip_sched_resume:
4558 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4559 /*unlock kfd: SRIOV would do it separately */
bb5c7235 4560 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
7c6e68c7 4561 amdgpu_amdkfd_post_reset(tmp_adev);
3f12acc8
EQ
4562 if (audio_suspended)
4563 amdgpu_device_resume_display_audio(tmp_adev);
26bc5340
AG
4564 amdgpu_device_unlock_adev(tmp_adev);
4565 }
4566
cbfd17f7 4567skip_recovery:
9e94d22c 4568 if (hive) {
53b3f8f4 4569 atomic_set(&hive->in_reset, 0);
9e94d22c 4570 mutex_unlock(&hive->hive_lock);
d95e8e97 4571 amdgpu_put_xgmi_hive(hive);
9e94d22c 4572 }
26bc5340
AG
4573
4574 if (r)
4575 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
d38ceaf9
AD
4576 return r;
4577}
4578
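amdgpu_device_gpu_recover() is the entry point reached from job timeout handling. Below is a hedged sketch of how a hypothetical timeout handler might feed it, gated on amdgpu_device_should_recover_gpu(); the handler name is an assumption and the real wiring lives in the job/scheduler code.

/* Illustrative sketch only: a hypothetical timeout handler driving recovery. */
static void example_job_timedout(struct amdgpu_ring *ring, struct amdgpu_job *job)
{
	struct amdgpu_device *adev = ring->adev;

	if (!amdgpu_device_should_recover_gpu(adev))
		return;	/* recovery disabled, or no real hardware hang detected */

	/* blocks until the ASIC reset and scheduler restart have completed */
	amdgpu_device_gpu_recover(adev, job);
}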
e3ecdffa
AD
4579/**
 4580 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
4581 *
4582 * @adev: amdgpu_device pointer
4583 *
 4584 * Fetches and stores in the driver the PCIE capabilities (gen speed
4585 * and lanes) of the slot the device is in. Handles APUs and
4586 * virtualized environments where PCIE config space may not be available.
4587 */
5494d864 4588static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
d0dd7f0c 4589{
5d9a6330 4590 struct pci_dev *pdev;
c5313457
HK
4591 enum pci_bus_speed speed_cap, platform_speed_cap;
4592 enum pcie_link_width platform_link_width;
d0dd7f0c 4593
cd474ba0
AD
4594 if (amdgpu_pcie_gen_cap)
4595 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
d0dd7f0c 4596
cd474ba0
AD
4597 if (amdgpu_pcie_lane_cap)
4598 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
d0dd7f0c 4599
cd474ba0
AD
4600 /* covers APUs as well */
4601 if (pci_is_root_bus(adev->pdev->bus)) {
4602 if (adev->pm.pcie_gen_mask == 0)
4603 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
4604 if (adev->pm.pcie_mlw_mask == 0)
4605 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
d0dd7f0c 4606 return;
cd474ba0 4607 }
d0dd7f0c 4608
c5313457
HK
4609 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
4610 return;
4611
dbaa922b
AD
4612 pcie_bandwidth_available(adev->pdev, NULL,
4613 &platform_speed_cap, &platform_link_width);
c5313457 4614
cd474ba0 4615 if (adev->pm.pcie_gen_mask == 0) {
5d9a6330
AD
4616 /* asic caps */
4617 pdev = adev->pdev;
4618 speed_cap = pcie_get_speed_cap(pdev);
4619 if (speed_cap == PCI_SPEED_UNKNOWN) {
4620 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
cd474ba0
AD
4621 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4622 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
cd474ba0 4623 } else {
5d9a6330
AD
4624 if (speed_cap == PCIE_SPEED_16_0GT)
4625 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4626 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4627 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4628 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
4629 else if (speed_cap == PCIE_SPEED_8_0GT)
4630 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4631 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4632 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4633 else if (speed_cap == PCIE_SPEED_5_0GT)
4634 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4635 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
4636 else
4637 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
4638 }
4639 /* platform caps */
c5313457 4640 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
5d9a6330
AD
4641 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4642 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4643 } else {
c5313457 4644 if (platform_speed_cap == PCIE_SPEED_16_0GT)
5d9a6330
AD
4645 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4646 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4647 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4648 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
c5313457 4649 else if (platform_speed_cap == PCIE_SPEED_8_0GT)
5d9a6330
AD
4650 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4651 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4652 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
c5313457 4653 else if (platform_speed_cap == PCIE_SPEED_5_0GT)
5d9a6330
AD
4654 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4655 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4656 else
4657 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
4658
cd474ba0
AD
4659 }
4660 }
4661 if (adev->pm.pcie_mlw_mask == 0) {
c5313457 4662 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
5d9a6330
AD
4663 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
4664 } else {
c5313457 4665 switch (platform_link_width) {
5d9a6330 4666 case PCIE_LNK_X32:
cd474ba0
AD
4667 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
4668 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4669 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4670 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4671 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4672 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4673 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4674 break;
5d9a6330 4675 case PCIE_LNK_X16:
cd474ba0
AD
4676 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4677 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4678 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4679 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4680 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4681 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4682 break;
5d9a6330 4683 case PCIE_LNK_X12:
cd474ba0
AD
4684 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4685 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4686 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4687 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4688 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4689 break;
5d9a6330 4690 case PCIE_LNK_X8:
cd474ba0
AD
4691 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4692 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4693 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4694 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4695 break;
5d9a6330 4696 case PCIE_LNK_X4:
cd474ba0
AD
4697 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4698 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4699 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4700 break;
5d9a6330 4701 case PCIE_LNK_X2:
cd474ba0
AD
4702 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4703 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4704 break;
5d9a6330 4705 case PCIE_LNK_X1:
cd474ba0
AD
4706 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
4707 break;
4708 default:
4709 break;
4710 }
d0dd7f0c
AD
4711 }
4712 }
4713}
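Once amdgpu_device_get_pcie_info() has filled in pcie_gen_mask and pcie_mlw_mask, other code can test the cached capabilities. A tiny illustrative check follows; the helper name is invented for the example.

/* Illustrative sketch only: query the cached PCIe caps filled in above. */
static bool example_supports_pcie_gen3(struct amdgpu_device *adev)
{
	return !!(adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
}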
d38ceaf9 4714
361dbd01
AD
4715int amdgpu_device_baco_enter(struct drm_device *dev)
4716{
1348969a 4717 struct amdgpu_device *adev = drm_to_adev(dev);
7a22677b 4718 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
361dbd01 4719
4a580877 4720 if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
361dbd01
AD
4721 return -ENOTSUPP;
4722
7a22677b
LM
4723 if (ras && ras->supported)
4724 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
4725
9530273e 4726 return amdgpu_dpm_baco_enter(adev);
361dbd01
AD
4727}
4728
4729int amdgpu_device_baco_exit(struct drm_device *dev)
4730{
1348969a 4731 struct amdgpu_device *adev = drm_to_adev(dev);
7a22677b 4732 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
9530273e 4733 int ret = 0;
361dbd01 4734
4a580877 4735 if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
361dbd01
AD
4736 return -ENOTSUPP;
4737
9530273e
EQ
4738 ret = amdgpu_dpm_baco_exit(adev);
4739 if (ret)
4740 return ret;
7a22677b
LM
4741
4742 if (ras && ras->supported)
4743 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
4744
4745 return 0;
361dbd01 4746}
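amdgpu_device_baco_enter() and amdgpu_device_baco_exit() are intended to be used as a pair. A hedged sketch of a caller bracketing a low-power window with them, with the helper name invented for the example:

/* Illustrative sketch only: bracket a low-power window with the BACO helpers. */
static int example_baco_cycle(struct drm_device *dev)
{
	int r = amdgpu_device_baco_enter(dev);

	if (r)		/* e.g. -ENOTSUPP when the board has no BACO support */
		return r;

	/* ... the device sits in BACO here ... */

	return amdgpu_device_baco_exit(dev);
}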
c9a6b82f 4747
acd89fca
AG
4748static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev)
4749{
4750 int i;
4751
4752 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4753 struct amdgpu_ring *ring = adev->rings[i];
4754
4755 if (!ring || !ring->sched.thread)
4756 continue;
4757
4758 cancel_delayed_work_sync(&ring->sched.work_tdr);
4759 }
4760}
4761
c9a6b82f
AG
4762/**
4763 * amdgpu_pci_error_detected - Called when a PCI error is detected.
4764 * @pdev: PCI device struct
4765 * @state: PCI channel state
4766 *
4767 * Description: Called when a PCI error is detected.
4768 *
4769 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
4770 */
4771pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
4772{
4773 struct drm_device *dev = pci_get_drvdata(pdev);
4774 struct amdgpu_device *adev = drm_to_adev(dev);
acd89fca 4775 int i;
c9a6b82f
AG
4776
4777 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
4778
4779 switch (state) {
4780 case pci_channel_io_normal:
4781 return PCI_ERS_RESULT_CAN_RECOVER;
acd89fca
AG
4782 /* Fatal error, prepare for slot reset */
4783 case pci_channel_io_frozen:
4784 /*
4785 * Cancel and wait for all TDRs in progress if failing to
4786 * set adev->in_gpu_reset in amdgpu_device_lock_adev
4787 *
4788 * Locking adev->reset_sem will prevent any external access
4789 * to GPU during PCI error recovery
4790 */
4791 while (!amdgpu_device_lock_adev(adev, NULL))
4792 amdgpu_cancel_all_tdr(adev);
4793
4794 /*
4795 * Block any work scheduling as we do for regular GPU reset
4796 * for the duration of the recovery
4797 */
4798 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4799 struct amdgpu_ring *ring = adev->rings[i];
4800
4801 if (!ring || !ring->sched.thread)
4802 continue;
4803
4804 drm_sched_stop(&ring->sched, NULL);
4805 }
c9a6b82f
AG
4806 return PCI_ERS_RESULT_NEED_RESET;
4807 case pci_channel_io_perm_failure:
4808 /* Permanent error, prepare for device removal */
4809 return PCI_ERS_RESULT_DISCONNECT;
4810 }
4811
4812 return PCI_ERS_RESULT_NEED_RESET;
4813}
4814
4815/**
4816 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
4817 * @pdev: pointer to PCI device
4818 */
4819pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
4820{
4821
4822 DRM_INFO("PCI error: mmio enabled callback!!\n");
4823
4824 /* TODO - dump whatever for debugging purposes */
4825
 4826	/* This is called only if amdgpu_pci_error_detected returns
 4827	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
 4828	 * works, so there is no need to reset the slot.
4829 */
4830
4831 return PCI_ERS_RESULT_RECOVERED;
4832}
4833
4834/**
4835 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
4836 * @pdev: PCI device struct
4837 *
4838 * Description: This routine is called by the pci error recovery
4839 * code after the PCI slot has been reset, just before we
4840 * should resume normal operations.
4841 */
4842pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
4843{
4844 struct drm_device *dev = pci_get_drvdata(pdev);
4845 struct amdgpu_device *adev = drm_to_adev(dev);
4846 int r;
4847 bool vram_lost;
4848
4849 DRM_INFO("PCI error: slot reset callback!!\n");
4850
4851 pci_restore_state(pdev);
4852
bf36b52e 4853 adev->in_pci_err_recovery = true;
c9a6b82f 4854 r = amdgpu_device_ip_suspend(adev);
bf36b52e 4855 adev->in_pci_err_recovery = false;
c9a6b82f
AG
4856 if (r)
4857 goto out;
4858
4859
4860 /* post card */
4861 r = amdgpu_atom_asic_init(adev->mode_info.atom_context);
4862 if (r)
4863 goto out;
4864
4865 r = amdgpu_device_ip_resume_phase1(adev);
4866 if (r)
4867 goto out;
4868
4869 vram_lost = amdgpu_device_check_vram_lost(adev);
4870 if (vram_lost) {
4871 DRM_INFO("VRAM is lost due to GPU reset!\n");
4872 amdgpu_inc_vram_lost(adev);
4873 }
4874
4875 r = amdgpu_gtt_mgr_recover(
4876 &adev->mman.bdev.man[TTM_PL_TT]);
4877 if (r)
4878 goto out;
4879
4880 r = amdgpu_device_fw_loading(adev);
4881 if (r)
4882 return r;
4883
4884 r = amdgpu_device_ip_resume_phase2(adev);
4885 if (r)
4886 goto out;
4887
4888 if (vram_lost)
4889 amdgpu_device_fill_reset_magic(adev);
4890
4891 /*
 4892	 * Add this ASIC back as tracked, since the reset
 4893	 * already completed successfully.
4894 */
4895 amdgpu_register_gpu_instance(adev);
4896
4897 r = amdgpu_device_ip_late_init(adev);
4898 if (r)
4899 goto out;
4900
4901 amdgpu_fbdev_set_suspend(adev, 0);
4902
4903 /* must succeed. */
4904 amdgpu_ras_resume(adev);
4905
4906
4907 amdgpu_irq_gpu_reset_resume_helper(adev);
4908 r = amdgpu_ib_ring_tests(adev);
4909 if (r)
4910 goto out;
4911
4912 r = amdgpu_device_recover_vram(adev);
4913
4914out:
4915
4916 if (!r) {
4917 DRM_INFO("PCIe error recovery succeeded\n");
4918 } else {
4919 DRM_ERROR("PCIe error recovery failed, err:%d", r);
4920 amdgpu_device_unlock_adev(adev);
4921 }
4922
4923 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
4924}
4925
4926/**
4927 * amdgpu_pci_resume() - resume normal ops after PCI reset
4928 * @pdev: pointer to PCI device
4929 *
4930 * Called when the error recovery driver tells us that its
4931 * OK to resume normal operation. Use completion to allow
4932 * halted scsi ops to resume.
4933 */
4934void amdgpu_pci_resume(struct pci_dev *pdev)
4935{
4936 struct drm_device *dev = pci_get_drvdata(pdev);
4937 struct amdgpu_device *adev = drm_to_adev(dev);
acd89fca 4938 int i;
c9a6b82f 4939
c9a6b82f
AG
4940
4941 DRM_INFO("PCI error: resume callback!!\n");
acd89fca
AG
4942
4943 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4944 struct amdgpu_ring *ring = adev->rings[i];
4945
4946 if (!ring || !ring->sched.thread)
4947 continue;
4948
4949
4950 drm_sched_resubmit_jobs(&ring->sched);
4951 drm_sched_start(&ring->sched, true);
4952 }
4953
4954 amdgpu_device_unlock_adev(adev);
c9a6b82f 4955}
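The four callbacks above only take effect once they are hooked into the PCI core. Below is a sketch of the expected wiring through a struct pci_error_handlers registered by the driver's pci_driver; the structure name used here and its exact placement in the driver are assumptions for illustration.

/* Illustrative sketch only: plugging the callbacks above into the PCI error framework. */
static const struct pci_error_handlers example_pci_err_handler = {
	.error_detected	= amdgpu_pci_error_detected,
	.mmio_enabled	= amdgpu_pci_mmio_enabled,
	.slot_reset	= amdgpu_pci_slot_reset,
	.resume		= amdgpu_pci_resume,
};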