/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>

#include <drm/drm_atomic_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/pci.h>
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/sienna_cichlid_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navy_flounder_gpu_info.bin");

#define AMDGPU_RESUME_MS        2000

const char *amdgpu_asic_name[] = {
        "TAHITI",
        "PITCAIRN",
        "VERDE",
        "OLAND",
        "HAINAN",
        "BONAIRE",
        "KAVERI",
        "KABINI",
        "HAWAII",
        "MULLINS",
        "TOPAZ",
        "TONGA",
        "FIJI",
        "CARRIZO",
        "STONEY",
        "POLARIS10",
        "POLARIS11",
        "POLARIS12",
        "VEGAM",
        "VEGA10",
        "VEGA12",
        "VEGA20",
        "RAVEN",
        "ARCTURUS",
        "RENOIR",
        "NAVI10",
        "NAVI14",
        "NAVI12",
        "SIENNA_CICHLID",
        "NAVY_FLOUNDER",
        "LAST",
};

/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
 */

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
                struct device_attribute *attr, char *buf)
{
        struct drm_device *ddev = dev_get_drvdata(dev);
        struct amdgpu_device *adev = drm_to_adev(ddev);
        uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

        return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
                amdgpu_device_get_pcie_replay_count, NULL);

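/*
 * Example usage (a hedged sketch, not part of the driver): the attribute
 * above is exposed through sysfs, so with a typical device node layout it
 * can be read from user space as:
 *
 *   cat /sys/class/drm/card0/device/pcie_replay_count
 *
 * The exact card index depends on the system configuration.
 */
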
static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);

/**
 * DOC: product_name
 *
 * The amdgpu driver provides a sysfs API for reporting the product name
 * for the device.
 * The file product_name is used for this and returns the product name
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_product_name(struct device *dev,
                struct device_attribute *attr, char *buf)
{
        struct drm_device *ddev = dev_get_drvdata(dev);
        struct amdgpu_device *adev = drm_to_adev(ddev);

        return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name);
}

static DEVICE_ATTR(product_name, S_IRUGO,
                amdgpu_device_get_product_name, NULL);

/**
 * DOC: product_number
 *
 * The amdgpu driver provides a sysfs API for reporting the part number
 * for the device.
 * The file product_number is used for this and returns the part number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_product_number(struct device *dev,
                struct device_attribute *attr, char *buf)
{
        struct drm_device *ddev = dev_get_drvdata(dev);
        struct amdgpu_device *adev = drm_to_adev(ddev);

        return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number);
}

static DEVICE_ATTR(product_number, S_IRUGO,
                amdgpu_device_get_product_number, NULL);

/**
 * DOC: serial_number
 *
 * The amdgpu driver provides a sysfs API for reporting the serial number
 * for the device.
 * The file serial_number is used for this and returns the serial number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_serial_number(struct device *dev,
                struct device_attribute *attr, char *buf)
{
        struct drm_device *ddev = dev_get_drvdata(dev);
        struct amdgpu_device *adev = drm_to_adev(ddev);

        return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial);
}

static DEVICE_ATTR(serial_number, S_IRUGO,
                amdgpu_device_get_serial_number, NULL);

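/*
 * As with pcie_replay_count above, these FRU-backed attributes show up under
 * the device's sysfs directory (a hedged sketch, the path may vary):
 *
 *   cat /sys/class/drm/card0/device/product_name
 *   cat /sys/class/drm/card0/device/serial_number
 *
 * On cards without FRU data the values may read back empty.
 */
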
/**
 * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with HG/PX power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
        struct amdgpu_device *adev = drm_to_adev(dev);

        if (adev->flags & AMD_IS_PX)
                return true;
        return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise return false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
        struct amdgpu_device *adev = drm_to_adev(dev);

        return amdgpu_asic_supports_baco(adev);
}

/**
 * VRAM access helper functions.
 *
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
                               uint32_t *buf, size_t size, bool write)
{
        unsigned long flags;
        uint32_t hi = ~0;
        uint64_t last;

#ifdef CONFIG_64BIT
        last = min(pos + size, adev->gmc.visible_vram_size);
        if (last > pos) {
                void __iomem *addr = adev->mman.aper_base_kaddr + pos;
                size_t count = last - pos;

                if (write) {
                        memcpy_toio(addr, buf, count);
                        mb();
                        amdgpu_asic_flush_hdp(adev, NULL);
                } else {
                        amdgpu_asic_invalidate_hdp(adev, NULL);
                        mb();
                        memcpy_fromio(buf, addr, count);
                }

                if (count == size)
                        return;

                pos += count;
                buf += count / 4;
                size -= count;
        }
#endif

        spin_lock_irqsave(&adev->mmio_idx_lock, flags);
        for (last = pos + size; pos < last; pos += 4) {
                uint32_t tmp = pos >> 31;

                WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
                if (tmp != hi) {
                        WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
                        hi = tmp;
                }
                if (write)
                        WREG32_NO_KIQ(mmMM_DATA, *buf++);
                else
                        *buf++ = RREG32_NO_KIQ(mmMM_DATA);
        }
        spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
}

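/*
 * A hedged usage sketch for the helper above (not an actual call site in
 * this file): reading the first 16 bytes of VRAM into a local buffer.
 *
 *   uint32_t data[4];
 *
 *   amdgpu_device_vram_access(adev, 0, data, sizeof(data), false);
 *
 * The MM_INDEX/MM_DATA fallback path means this also works for VRAM that is
 * not CPU visible, at the cost of going through MMIO one dword at a time.
 */
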
/*
 * MMIO register access helper functions.
 */
/**
 * amdgpu_mm_rreg - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
                        uint32_t acc_flags)
{
        uint32_t ret;

        if (adev->in_pci_err_recovery)
                return 0;

        if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev) &&
            down_read_trylock(&adev->reset_sem)) {
                ret = amdgpu_kiq_rreg(adev, reg);
                up_read(&adev->reset_sem);
                return ret;
        }

        if ((reg * 4) < adev->rmmio_size)
                ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
        else {
                unsigned long flags;

                spin_lock_irqsave(&adev->mmio_idx_lock, flags);
                writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4));
                ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
                spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
        }

        trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret);
        return ret;
}

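/*
 * In the rest of the driver these helpers are normally reached through the
 * RREG32()/WREG32() style macros rather than called directly; a hedged
 * sketch of a raw read with no special access flags:
 *
 *   uint32_t val = amdgpu_mm_rreg(adev, mmMM_INDEX, 0);
 *
 * Passing AMDGPU_REGS_NO_KIQ in @acc_flags skips the KIQ path that is
 * otherwise used at runtime under SR-IOV.
 */
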
/*
 * MMIO register read with bytes helper functions
 * @offset: bytes offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
        if (adev->in_pci_err_recovery)
                return 0;

        if (offset < adev->rmmio_size)
                return (readb(adev->rmmio + offset));
        BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset: bytes offset from MMIO start
 * @value: the value to be written to the register
 */
/**
 * amdgpu_mm_wreg8 - write to a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
        if (adev->in_pci_err_recovery)
                return;

        if (offset < adev->rmmio_size)
                writeb(value, adev->rmmio + offset);
        else
                BUG();
}

static inline void amdgpu_mm_wreg_mmio(struct amdgpu_device *adev,
                                       uint32_t reg, uint32_t v,
                                       uint32_t acc_flags)
{
        if (adev->in_pci_err_recovery)
                return;

        trace_amdgpu_mm_wreg(adev->pdev->device, reg, v);

        if ((reg * 4) < adev->rmmio_size)
                writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
        else {
                unsigned long flags;

                spin_lock_irqsave(&adev->mmio_idx_lock, flags);
                writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4));
                writel(v, ((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
                spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
        }
}

/**
 * amdgpu_mm_wreg - write to a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
                    uint32_t acc_flags)
{
        if (adev->in_pci_err_recovery)
                return;

        if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev) &&
            down_read_trylock(&adev->reset_sem)) {
                amdgpu_kiq_wreg(adev, reg, v);
                up_read(&adev->reset_sem);
                return;
        }

        amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags);
}

/*
 * amdgpu_mm_wreg_mmio_rlc - write register either with mmio or with RLC path if in range
 *
 * This function is invoked only for debugfs register access.
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
                             uint32_t acc_flags)
{
        if (adev->in_pci_err_recovery)
                return;

        if (amdgpu_sriov_fullaccess(adev) &&
            adev->gfx.rlc.funcs &&
            adev->gfx.rlc.funcs->is_rlcg_access_range) {

                if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
                        return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v);
        }

        amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags);
}

/**
 * amdgpu_io_rreg - read an IO register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 *
 * Returns the 32 bit value from the offset specified.
 */
u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
{
        if (adev->in_pci_err_recovery)
                return 0;

        if ((reg * 4) < adev->rio_mem_size)
                return ioread32(adev->rio_mem + (reg * 4));
        else {
                iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
                return ioread32(adev->rio_mem + (mmMM_DATA * 4));
        }
}

/**
 * amdgpu_io_wreg - write to an IO register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
{
        if (adev->in_pci_err_recovery)
                return;

        if ((reg * 4) < adev->rio_mem_size)
                iowrite32(v, adev->rio_mem + (reg * 4));
        else {
                iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
                iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
        }
}

/**
 * amdgpu_mm_rdoorbell - read a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (CIK).
 */
u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
{
        if (adev->in_pci_err_recovery)
                return 0;

        if (index < adev->doorbell.num_doorbells) {
                return readl(adev->doorbell.ptr + index);
        } else {
                DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
                return 0;
        }
}

/**
 * amdgpu_mm_wdoorbell - write a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (CIK).
 */
void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
{
        if (adev->in_pci_err_recovery)
                return;

        if (index < adev->doorbell.num_doorbells) {
                writel(v, adev->doorbell.ptr + index);
        } else {
                DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
        }
}

/**
 * amdgpu_mm_rdoorbell64 - read a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
{
        if (adev->in_pci_err_recovery)
                return 0;

        if (index < adev->doorbell.num_doorbells) {
                return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
        } else {
                DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
                return 0;
        }
}

/**
 * amdgpu_mm_wdoorbell64 - write a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
{
        if (adev->in_pci_err_recovery)
                return;

        if (index < adev->doorbell.num_doorbells) {
                atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
        } else {
                DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
        }
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu device pointer
 * @reg: offset of register
 *
 * Dummy register read function.  Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
        DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
        BUG();
        return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function.  Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
        DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
                  reg, v);
        BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu device pointer
 * @reg: offset of register
 *
 * Dummy register read function.  Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
        DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
        BUG();
        return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function.  Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
        DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
                  reg, v);
        BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function.  Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
                                          uint32_t block, uint32_t reg)
{
        DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
                  reg, block);
        BUG();
        return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function.  Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
                                      uint32_t block,
                                      uint32_t reg, uint32_t v)
{
        DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
                  reg, block, v);
        BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
        amdgpu_asic_pre_asic_init(adev);

        return amdgpu_atom_asic_init(adev->mode_info.atom_context);
}

/**
 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
{
        return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
                                       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
                                       &adev->vram_scratch.robj,
                                       &adev->vram_scratch.gpu_addr,
                                       (void **)&adev->vram_scratch.ptr);
}

/**
 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
{
        amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
                                             const u32 *registers,
                                             const u32 array_size)
{
        u32 tmp, reg, and_mask, or_mask;
        int i;

        if (array_size % 3)
                return;

        for (i = 0; i < array_size; i += 3) {
                reg = registers[i + 0];
                and_mask = registers[i + 1];
                or_mask = registers[i + 2];

                if (and_mask == 0xffffffff) {
                        tmp = or_mask;
                } else {
                        tmp = RREG32(reg);
                        tmp &= ~and_mask;
                        if (adev->family >= AMDGPU_FAMILY_AI)
                                tmp |= (or_mask & and_mask);
                        else
                                tmp |= or_mask;
                }
                WREG32(reg, tmp);
        }
}

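/*
 * A hedged illustration of the triplet layout consumed above (register
 * offset, AND mask, OR mask); the values here are made up, not real golden
 * settings:
 *
 *   static const u32 example_golden_regs[] = {
 *           mmMM_INDEX_HI, 0xffffffff, 0x00000000,
 *   };
 *
 *   amdgpu_device_program_register_sequence(adev, example_golden_regs,
 *                                           ARRAY_SIZE(example_golden_regs));
 */
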
/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
        pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/*
 * GPU doorbell aperture helper functions.
 */
/**
 * amdgpu_device_doorbell_init - Init doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Init doorbell driver information (CIK)
 * Returns 0 on success, error on failure.
 */
static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
{

        /* No doorbell on SI hardware generation */
        if (adev->asic_type < CHIP_BONAIRE) {
                adev->doorbell.base = 0;
                adev->doorbell.size = 0;
                adev->doorbell.num_doorbells = 0;
                adev->doorbell.ptr = NULL;
                return 0;
        }

        if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
                return -EINVAL;

        amdgpu_asic_init_doorbell_index(adev);

        /* doorbell bar mapping */
        adev->doorbell.base = pci_resource_start(adev->pdev, 2);
        adev->doorbell.size = pci_resource_len(adev->pdev, 2);

        adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
                                             adev->doorbell_index.max_assignment+1);
        if (adev->doorbell.num_doorbells == 0)
                return -EINVAL;

        /* For Vega, reserve and map two pages on doorbell BAR since SDMA
         * paging queue doorbells use the second page. The
         * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
         * doorbells are in the first page. So with paging queue enabled,
         * num_doorbells needs to grow by one page (0x400 in dwords).
         */
        if (adev->asic_type >= CHIP_VEGA10)
                adev->doorbell.num_doorbells += 0x400;

        adev->doorbell.ptr = ioremap(adev->doorbell.base,
                                     adev->doorbell.num_doorbells *
                                     sizeof(u32));
        if (adev->doorbell.ptr == NULL)
                return -ENOMEM;

        return 0;
}

/**
 * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Tear down doorbell driver information (CIK)
 */
static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
{
        iounmap(adev->doorbell.ptr);
        adev->doorbell.ptr = NULL;
}


/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
        if (adev->wb.wb_obj) {
                amdgpu_bo_free_kernel(&adev->wb.wb_obj,
                                      &adev->wb.gpu_addr,
                                      (void **)&adev->wb.wb);
                adev->wb.wb_obj = NULL;
        }
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or a negative error code on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
        int r;

        if (adev->wb.wb_obj == NULL) {
                /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
                r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
                                            PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
                                            &adev->wb.wb_obj, &adev->wb.gpu_addr,
                                            (void **)&adev->wb.wb);
                if (r) {
                        dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
                        return r;
                }

                adev->wb.num_wb = AMDGPU_MAX_WB;
                memset(&adev->wb.used, 0, sizeof(adev->wb.used));

                /* clear wb memory */
                memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
        }

        return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
        unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);

        if (offset < adev->wb.num_wb) {
                __set_bit(offset, adev->wb.used);
                *wb = offset << 3; /* convert to dw offset */
                return 0;
        } else {
                return -EINVAL;
        }
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
        wb >>= 3;
        if (wb < adev->wb.num_wb)
                __clear_bit(wb, adev->wb.used);
}

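/*
 * A hedged sketch of the typical writeback slot lifecycle (error handling
 * omitted); the index returned by amdgpu_device_wb_get() is already a dword
 * offset into adev->wb.wb:
 *
 *   u32 wb;
 *
 *   if (!amdgpu_device_wb_get(adev, &wb)) {
 *           u32 value = adev->wb.wb[wb];
 *
 *           amdgpu_device_wb_free(adev, wb);
 *   }
 */
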
/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
        u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size);
        u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1;
        struct pci_bus *root;
        struct resource *res;
        unsigned i;
        u16 cmd;
        int r;

        /* Bypass for VF */
        if (amdgpu_sriov_vf(adev))
                return 0;

        /* skip if the bios has already enabled large BAR */
        if (adev->gmc.real_vram_size &&
            (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
                return 0;

        /* Check if the root BUS has 64bit memory resources */
        root = adev->pdev->bus;
        while (root->parent)
                root = root->parent;

        pci_bus_for_each_resource(root, res, i) {
                if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
                    res->start > 0x100000000ull)
                        break;
        }

        /* Trying to resize is pointless without a root hub window above 4GB */
        if (!res)
                return 0;

        /* Disable memory decoding while we change the BAR addresses and size */
        pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
        pci_write_config_word(adev->pdev, PCI_COMMAND,
                              cmd & ~PCI_COMMAND_MEMORY);

        /* Free the VRAM and doorbell BAR, we most likely need to move both. */
        amdgpu_device_doorbell_fini(adev);
        if (adev->asic_type >= CHIP_BONAIRE)
                pci_release_resource(adev->pdev, 2);

        pci_release_resource(adev->pdev, 0);

        r = pci_resize_resource(adev->pdev, 0, rbar_size);
        if (r == -ENOSPC)
                DRM_INFO("Not enough PCI address space for a large BAR.");
        else if (r && r != -ENOTSUPP)
                DRM_ERROR("Problem resizing BAR0 (%d).", r);

        pci_assign_unassigned_bus_resources(adev->pdev->bus);

        /* When the doorbell or fb BAR isn't available we have no chance of
         * using the device.
         */
        r = amdgpu_device_doorbell_init(adev);
        if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
                return -ENODEV;

        pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

        return 0;
}

/*
 * GPU helper functions.
 */
/**
 * amdgpu_device_need_post - check if the hw needs post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup,
 * or if post is needed because a hw reset was performed.
 * Returns true if post is needed, false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
        uint32_t reg;

        if (amdgpu_sriov_vf(adev))
                return false;

        if (amdgpu_passthrough(adev)) {
                /* for FIJI: In the whole-GPU pass-through virtualization case,
                 * after VM reboot some old SMC firmware still needs the driver
                 * to do a vPost, otherwise the GPU hangs. SMC firmware versions
                 * above 22.15 don't have this flaw, so force vPost for SMC
                 * versions below 22.15.
                 */
                if (adev->asic_type == CHIP_FIJI) {
                        int err;
                        uint32_t fw_ver;

                        err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
                        /* force vPost if error occurred */
                        if (err)
                                return true;

                        fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
                        if (fw_ver < 0x00160e00)
                                return true;
                }
        }

        if (adev->has_hw_reset) {
                adev->has_hw_reset = false;
                return true;
        }

        /* bios scratch used on CIK+ */
        if (adev->asic_type >= CHIP_BONAIRE)
                return amdgpu_atombios_scratch_need_asic_init(adev);

        /* check MEM_SIZE for older asics */
        reg = amdgpu_asic_get_config_memsize(adev);

        if ((reg != 0) && (reg != 0xffffffff))
                return false;

        return true;
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @cookie: amdgpu_device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
{
        struct amdgpu_device *adev = cookie;

        amdgpu_asic_set_vga_state(adev, state);
        if (state)
                return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
                       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
        else
                return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines number of bits in page table versus page directory,
 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
 * page table and the remaining bits are in the page directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
        /* defines number of bits in page table versus page directory,
         * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
         * page table and the remaining bits are in the page directory */
        if (amdgpu_vm_block_size == -1)
                return;

        if (amdgpu_vm_block_size < 9) {
                dev_warn(adev->dev, "VM page table size (%d) too small\n",
                         amdgpu_vm_block_size);
                amdgpu_vm_block_size = -1;
        }
}

/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
        /* no need to check the default value */
        if (amdgpu_vm_size == -1)
                return;

        if (amdgpu_vm_size < 1) {
                dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
                         amdgpu_vm_size);
                amdgpu_vm_size = -1;
        }
}

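/*
 * Both checks above guard module parameters; a hedged example of how they
 * might be set when loading the driver (the values are illustrative):
 *
 *   modprobe amdgpu vm_size=256 vm_block_size=9
 *
 * Invalid values are pushed back to -1, which the driver later replaces
 * with a per-ASIC default.
 */
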
static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
        struct sysinfo si;
        bool is_os_64 = (sizeof(void *) == 8);
        uint64_t total_memory;
        uint64_t dram_size_seven_GB = 0x1B8000000;
        uint64_t dram_size_three_GB = 0xB8000000;

        if (amdgpu_smu_memory_pool_size == 0)
                return;

        if (!is_os_64) {
                DRM_WARN("Not 64-bit OS, feature not supported\n");
                goto def_value;
        }
        si_meminfo(&si);
        total_memory = (uint64_t)si.totalram * si.mem_unit;

        if ((amdgpu_smu_memory_pool_size == 1) ||
            (amdgpu_smu_memory_pool_size == 2)) {
                if (total_memory < dram_size_three_GB)
                        goto def_value1;
        } else if ((amdgpu_smu_memory_pool_size == 4) ||
                   (amdgpu_smu_memory_pool_size == 8)) {
                if (total_memory < dram_size_seven_GB)
                        goto def_value1;
        } else {
                DRM_WARN("Smu memory pool size not supported\n");
                goto def_value;
        }
        adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

        return;

def_value1:
        DRM_WARN("Not enough system memory\n");
def_value:
        adev->pm.smu_prv_buffer_size = 0;
}

/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
 */
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
{
        if (amdgpu_sched_jobs < 4) {
                dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
                         amdgpu_sched_jobs);
                amdgpu_sched_jobs = 4;
        } else if (!is_power_of_2(amdgpu_sched_jobs)) {
                dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
                         amdgpu_sched_jobs);
                amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
        }

        if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
                /* gart size must be greater or equal to 32M */
                dev_warn(adev->dev, "gart size (%d) too small\n",
                         amdgpu_gart_size);
                amdgpu_gart_size = -1;
        }

        if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
                /* gtt size must be greater or equal to 32M */
                dev_warn(adev->dev, "gtt size (%d) too small\n",
                         amdgpu_gtt_size);
                amdgpu_gtt_size = -1;
        }

        /* valid range is between 4 and 9 inclusive */
        if (amdgpu_vm_fragment_size != -1 &&
            (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
                dev_warn(adev->dev, "valid range is between 4 and 9\n");
                amdgpu_vm_fragment_size = -1;
        }

        if (amdgpu_sched_hw_submission < 2) {
                dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
                         amdgpu_sched_hw_submission);
                amdgpu_sched_hw_submission = 2;
        } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
                dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
                         amdgpu_sched_hw_submission);
                amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
        }

        amdgpu_device_check_smu_prv_buffer_size(adev);

        amdgpu_device_check_vm_size(adev);

        amdgpu_device_check_block_size(adev);

        adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

        amdgpu_gmc_tmz_set(adev);

        if (amdgpu_num_kcq > 8 || amdgpu_num_kcq < 0) {
                amdgpu_num_kcq = 8;
                dev_warn(adev->dev, "set kernel compute queue number to 8 due to invalid parameter provided by user\n");
        }

        return 0;
}

/**
 * amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver. Suspends or resumes
 * the asic before or after it is powered up using ACPI methods.
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
                                        enum vga_switcheroo_state state)
{
        struct drm_device *dev = pci_get_drvdata(pdev);
        int r;

        if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF)
                return;

        if (state == VGA_SWITCHEROO_ON) {
                pr_info("switched on\n");
                /* don't suspend or resume card normally */
                dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

                pci_set_power_state(dev->pdev, PCI_D0);
                amdgpu_device_load_pci_state(dev->pdev);
                r = pci_enable_device(dev->pdev);
                if (r)
                        DRM_WARN("pci_enable_device failed (%d)\n", r);
                amdgpu_device_resume(dev, true);

                dev->switch_power_state = DRM_SWITCH_POWER_ON;
                drm_kms_helper_poll_enable(dev);
        } else {
                pr_info("switched off\n");
                drm_kms_helper_poll_disable(dev);
                dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
                amdgpu_device_suspend(dev, true);
                amdgpu_device_cache_pci_state(dev->pdev);
                /* Shut down the device */
                pci_disable_device(dev->pdev);
                pci_set_power_state(dev->pdev, PCI_D3cold);
                dev->switch_power_state = DRM_SWITCH_POWER_OFF;
        }
}

/**
 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
 *
 * @pdev: pci dev pointer
 *
 * Callback for the switcheroo driver. Check if the switcheroo
 * state can be changed.
 * Returns true if the state can be changed, false if not.
 */
static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
{
        struct drm_device *dev = pci_get_drvdata(pdev);

        /*
         * FIXME: open_count is protected by drm_global_mutex but that would lead to
         * locking inversion with the driver load path. And the access here is
         * completely racy anyway. So don't bother with locking for now.
         */
        return atomic_read(&dev->open_count) == 0;
}

static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
        .set_gpu_state = amdgpu_switcheroo_set_state,
        .reprobe = NULL,
        .can_switch = amdgpu_switcheroo_can_switch,
};

/**
 * amdgpu_device_ip_set_clockgating_state - set the CG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: clockgating state (gate or ungate)
 *
 * Sets the requested clockgating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_clockgating_state(void *dev,
                                           enum amd_ip_block_type block_type,
                                           enum amd_clockgating_state state)
{
        struct amdgpu_device *adev = dev;
        int i, r = 0;

        for (i = 0; i < adev->num_ip_blocks; i++) {
                if (!adev->ip_blocks[i].status.valid)
                        continue;
                if (adev->ip_blocks[i].version->type != block_type)
                        continue;
                if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
                        continue;
                r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
                        (void *)adev, state);
                if (r)
                        DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
                                  adev->ip_blocks[i].version->funcs->name, r);
        }
        return r;
}

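/*
 * A hedged example of how code elsewhere in amdgpu might use the helper
 * above to ungate GFX clockgating (illustrative, not a call site in this
 * file):
 *
 *   amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_GFX,
 *                                          AMD_CG_STATE_UNGATE);
 */
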
/**
 * amdgpu_device_ip_set_powergating_state - set the PG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: powergating state (gate or ungate)
 *
 * Sets the requested powergating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
                                           enum amd_ip_block_type block_type,
                                           enum amd_powergating_state state)
{
        struct amdgpu_device *adev = dev;
        int i, r = 0;

        for (i = 0; i < adev->num_ip_blocks; i++) {
                if (!adev->ip_blocks[i].status.valid)
                        continue;
                if (adev->ip_blocks[i].version->type != block_type)
                        continue;
                if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
                        continue;
                r = adev->ip_blocks[i].version->funcs->set_powergating_state(
                        (void *)adev, state);
                if (r)
                        DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
                                  adev->ip_blocks[i].version->funcs->name, r);
        }
        return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
                                            u32 *flags)
{
        int i;

        for (i = 0; i < adev->num_ip_blocks; i++) {
                if (!adev->ip_blocks[i].status.valid)
                        continue;
                if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
                        adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
        }
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
                                   enum amd_ip_block_type block_type)
{
        int i, r;

        for (i = 0; i < adev->num_ip_blocks; i++) {
                if (!adev->ip_blocks[i].status.valid)
                        continue;
                if (adev->ip_blocks[i].version->type == block_type) {
                        r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
                        if (r)
                                return r;
                        break;
                }
        }
        return 0;

}

/**
 * amdgpu_device_ip_is_idle - is the hardware IP idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is idle or not.
 * Returns true if the IP is idle, false if not.
 */
bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
                              enum amd_ip_block_type block_type)
{
        int i;

        for (i = 0; i < adev->num_ip_blocks; i++) {
                if (!adev->ip_blocks[i].status.valid)
                        continue;
                if (adev->ip_blocks[i].version->type == block_type)
                        return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
        }
        return true;

}

/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
                              enum amd_ip_block_type type)
{
        int i;

        for (i = 0; i < adev->num_ip_blocks; i++)
                if (adev->ip_blocks[i].version->type == type)
                        return &adev->ip_blocks[i];

        return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * return 0 if equal or greater
 * return 1 if smaller or the ip_block doesn't exist
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
                                       enum amd_ip_block_type type,
                                       u32 major, u32 minor)
{
        struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

        if (ip_block && ((ip_block->version->major > major) ||
                        ((ip_block->version->major == major) &&
                        (ip_block->version->minor >= minor))))
                return 0;

        return 1;
}

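/*
 * A hedged usage sketch: checking whether the ASIC carries at least GFX IP
 * version 8.1 before taking a code path that relies on it; the block type,
 * version, and helper name are illustrative only:
 *
 *   if (!amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_GFX, 8, 1))
 *           do_something_for_gfx_8_1_or_newer(adev);
 */
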
/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
                               const struct amdgpu_ip_block_version *ip_block_version)
{
        if (!ip_block_version)
                return -EINVAL;

        DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
                  ip_block_version->funcs->name);

        adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

        return 0;
}

/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display.  This feature provides a virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
 */
static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
{
        adev->enable_virtual_display = false;

        if (amdgpu_virtual_display) {
                struct drm_device *ddev = adev_to_drm(adev);
                const char *pci_address_name = pci_name(ddev->pdev);
                char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;

                pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
                pciaddstr_tmp = pciaddstr;
                while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
                        pciaddname = strsep(&pciaddname_tmp, ",");
                        if (!strcmp("all", pciaddname)
                            || !strcmp(pci_address_name, pciaddname)) {
                                long num_crtc;
                                int res = -1;

                                adev->enable_virtual_display = true;

                                if (pciaddname_tmp)
                                        res = kstrtol(pciaddname_tmp, 10,
                                                      &num_crtc);

                                if (!res) {
                                        if (num_crtc < 1)
                                                num_crtc = 1;
                                        if (num_crtc > 6)
                                                num_crtc = 6;
                                        adev->mode_info.num_crtc = num_crtc;
                                } else {
                                        adev->mode_info.num_crtc = 1;
                                }
                                break;
                        }
                }

                DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
                         amdgpu_virtual_display, pci_address_name,
                         adev->enable_virtual_display, adev->mode_info.num_crtc);

                kfree(pciaddstr);
        }
}

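/*
 * A hedged example of the configuration string parsed above: enable two
 * virtual CRTCs on the device at PCI address 0000:03:00.0 (the address is
 * illustrative):
 *
 *   modprobe amdgpu virtual_display=0000:03:00.0,2
 *
 * "all" can be used in place of a PCI address to match every device.
 */
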
e3ecdffa
AD
1616/**
1617 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1618 *
1619 * @adev: amdgpu_device pointer
1620 *
1621 * Parses the asic configuration parameters specified in the gpu info
1622 * firmware and makes them availale to the driver for use in configuring
1623 * the asic.
1624 * Returns 0 on success, -EINVAL on failure.
1625 */
e2a75f88
AD
1626static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1627{
e2a75f88 1628 const char *chip_name;
c0a43457 1629 char fw_name[40];
e2a75f88
AD
1630 int err;
1631 const struct gpu_info_firmware_header_v1_0 *hdr;
1632
ab4fe3e1
HR
1633 adev->firmware.gpu_info_fw = NULL;
1634
72de33f8 1635 if (adev->mman.discovery_bin) {
258620d0 1636 amdgpu_discovery_get_gfx_info(adev);
cc375d8c
TY
1637
1638 /*
1639 * FIXME: The bounding box is still needed by Navi12, so
1640 * temporarily read it from gpu_info firmware. Should be droped
1641 * when DAL no longer needs it.
1642 */
1643 if (adev->asic_type != CHIP_NAVI12)
1644 return 0;
258620d0
AD
1645 }
1646
e2a75f88 1647 switch (adev->asic_type) {
e2a75f88
AD
1648#ifdef CONFIG_DRM_AMDGPU_SI
1649 case CHIP_VERDE:
1650 case CHIP_TAHITI:
1651 case CHIP_PITCAIRN:
1652 case CHIP_OLAND:
1653 case CHIP_HAINAN:
1654#endif
1655#ifdef CONFIG_DRM_AMDGPU_CIK
1656 case CHIP_BONAIRE:
1657 case CHIP_HAWAII:
1658 case CHIP_KAVERI:
1659 case CHIP_KABINI:
1660 case CHIP_MULLINS:
1661#endif
da87c30b
AD
1662 case CHIP_TOPAZ:
1663 case CHIP_TONGA:
1664 case CHIP_FIJI:
1665 case CHIP_POLARIS10:
1666 case CHIP_POLARIS11:
1667 case CHIP_POLARIS12:
1668 case CHIP_VEGAM:
1669 case CHIP_CARRIZO:
1670 case CHIP_STONEY:
27c0bc71 1671 case CHIP_VEGA20:
e2a75f88
AD
1672 default:
1673 return 0;
1674 case CHIP_VEGA10:
1675 chip_name = "vega10";
1676 break;
3f76dced
AD
1677 case CHIP_VEGA12:
1678 chip_name = "vega12";
1679 break;
2d2e5e7e 1680 case CHIP_RAVEN:
54f78a76 1681 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
54c4d17e 1682 chip_name = "raven2";
54f78a76 1683 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
741deade 1684 chip_name = "picasso";
54c4d17e
FX
1685 else
1686 chip_name = "raven";
2d2e5e7e 1687 break;
65e60f6e
LM
1688 case CHIP_ARCTURUS:
1689 chip_name = "arcturus";
1690 break;
b51a26a0
HR
1691 case CHIP_RENOIR:
1692 chip_name = "renoir";
1693 break;
23c6268e
HR
1694 case CHIP_NAVI10:
1695 chip_name = "navi10";
1696 break;
ed42cfe1
XY
1697 case CHIP_NAVI14:
1698 chip_name = "navi14";
1699 break;
42b325e5
XY
1700 case CHIP_NAVI12:
1701 chip_name = "navi12";
1702 break;
c0a43457
LG
1703 case CHIP_SIENNA_CICHLID:
1704 chip_name = "sienna_cichlid";
1705 break;
120eb833
JC
1706 case CHIP_NAVY_FLOUNDER:
1707 chip_name = "navy_flounder";
1708 break;
e2a75f88
AD
1709 }
1710
1711 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
ab4fe3e1 1712 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
e2a75f88
AD
1713 if (err) {
1714 dev_err(adev->dev,
1715 "Failed to load gpu_info firmware \"%s\"\n",
1716 fw_name);
1717 goto out;
1718 }
ab4fe3e1 1719 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
e2a75f88
AD
1720 if (err) {
1721 dev_err(adev->dev,
1722 "Failed to validate gpu_info firmware \"%s\"\n",
1723 fw_name);
1724 goto out;
1725 }
1726
ab4fe3e1 1727 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
e2a75f88
AD
1728 amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1729
1730 switch (hdr->version_major) {
1731 case 1:
1732 {
1733 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
ab4fe3e1 1734 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
e2a75f88
AD
1735 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1736
cc375d8c
TY
1737 /*
1738 * Should be droped when DAL no longer needs it.
1739 */
1740 if (adev->asic_type == CHIP_NAVI12)
ec51d3fa
XY
1741 goto parse_soc_bounding_box;
1742
b5ab16bf
AD
1743 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1744 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1745 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1746 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
e2a75f88 1747 adev->gfx.config.max_texture_channel_caches =
b5ab16bf
AD
1748 le32_to_cpu(gpu_info_fw->gc_num_tccs);
1749 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1750 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1751 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1752 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
e2a75f88 1753 adev->gfx.config.double_offchip_lds_buf =
b5ab16bf
AD
1754 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1755 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
51fd0370
HZ
1756 adev->gfx.cu_info.max_waves_per_simd =
1757 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1758 adev->gfx.cu_info.max_scratch_slots_per_cu =
1759 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
1760 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
48321c3d 1761 if (hdr->version_minor >= 1) {
35c2e910
HZ
1762 const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1763 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1764 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1765 adev->gfx.config.num_sc_per_sh =
1766 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
1767 adev->gfx.config.num_packer_per_sc =
1768 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
1769 }
ec51d3fa
XY
1770
1771parse_soc_bounding_box:
ec51d3fa
XY
1772 /*
 1773	 * soc bounding box info is not integrated into the discovery table,
258620d0 1774	 * so it always has to be parsed from the gpu_info firmware when needed.
ec51d3fa 1775 */
48321c3d
HW
1776 if (hdr->version_minor == 2) {
1777 const struct gpu_info_firmware_v1_2 *gpu_info_fw =
1778 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
1779 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1780 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
1781 }
e2a75f88
AD
1782 break;
1783 }
1784 default:
1785 dev_err(adev->dev,
1786 "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
1787 err = -EINVAL;
1788 goto out;
1789 }
1790out:
e2a75f88
AD
1791 return err;
1792}
1793
e3ecdffa
AD
1794/**
1795 * amdgpu_device_ip_early_init - run early init for hardware IPs
1796 *
1797 * @adev: amdgpu_device pointer
1798 *
1799 * Early initialization pass for hardware IPs. The hardware IPs that make
 1800	 * up each asic are discovered and each IP's early_init callback is run. This
1801 * is the first stage in initializing the asic.
1802 * Returns 0 on success, negative error code on failure.
1803 */
06ec9070 1804static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
d38ceaf9 1805{
aaa36a97 1806 int i, r;
d38ceaf9 1807
483ef985 1808 amdgpu_device_enable_virtual_display(adev);
a6be7570 1809
00a979f3 1810 if (amdgpu_sriov_vf(adev)) {
00a979f3 1811 r = amdgpu_virt_request_full_gpu(adev, true);
aaa36a97
AD
1812 if (r)
1813 return r;
00a979f3
WS
1814 }
1815
d38ceaf9 1816 switch (adev->asic_type) {
33f34802
KW
1817#ifdef CONFIG_DRM_AMDGPU_SI
1818 case CHIP_VERDE:
1819 case CHIP_TAHITI:
1820 case CHIP_PITCAIRN:
1821 case CHIP_OLAND:
1822 case CHIP_HAINAN:
295d0daf 1823 adev->family = AMDGPU_FAMILY_SI;
33f34802
KW
1824 r = si_set_ip_blocks(adev);
1825 if (r)
1826 return r;
1827 break;
1828#endif
a2e73f56
AD
1829#ifdef CONFIG_DRM_AMDGPU_CIK
1830 case CHIP_BONAIRE:
1831 case CHIP_HAWAII:
1832 case CHIP_KAVERI:
1833 case CHIP_KABINI:
1834 case CHIP_MULLINS:
e1ad2d53 1835 if (adev->flags & AMD_IS_APU)
a2e73f56 1836 adev->family = AMDGPU_FAMILY_KV;
e1ad2d53
AD
1837 else
1838 adev->family = AMDGPU_FAMILY_CI;
a2e73f56
AD
1839
1840 r = cik_set_ip_blocks(adev);
1841 if (r)
1842 return r;
1843 break;
1844#endif
da87c30b
AD
1845 case CHIP_TOPAZ:
1846 case CHIP_TONGA:
1847 case CHIP_FIJI:
1848 case CHIP_POLARIS10:
1849 case CHIP_POLARIS11:
1850 case CHIP_POLARIS12:
1851 case CHIP_VEGAM:
1852 case CHIP_CARRIZO:
1853 case CHIP_STONEY:
1854 if (adev->flags & AMD_IS_APU)
1855 adev->family = AMDGPU_FAMILY_CZ;
1856 else
1857 adev->family = AMDGPU_FAMILY_VI;
1858
1859 r = vi_set_ip_blocks(adev);
1860 if (r)
1861 return r;
1862 break;
e48a3cd9
AD
1863 case CHIP_VEGA10:
1864 case CHIP_VEGA12:
e4bd8170 1865 case CHIP_VEGA20:
e48a3cd9 1866 case CHIP_RAVEN:
61cf44c1 1867 case CHIP_ARCTURUS:
b51a26a0 1868 case CHIP_RENOIR:
70534d1e 1869 if (adev->flags & AMD_IS_APU)
2ca8a5d2
CZ
1870 adev->family = AMDGPU_FAMILY_RV;
1871 else
1872 adev->family = AMDGPU_FAMILY_AI;
460826e6
KW
1873
1874 r = soc15_set_ip_blocks(adev);
1875 if (r)
1876 return r;
1877 break;
0a5b8c7b 1878 case CHIP_NAVI10:
7ecb5cd4 1879 case CHIP_NAVI14:
4808cf9c 1880 case CHIP_NAVI12:
11e8aef5 1881 case CHIP_SIENNA_CICHLID:
41f446bf 1882 case CHIP_NAVY_FLOUNDER:
0a5b8c7b
HR
1883 adev->family = AMDGPU_FAMILY_NV;
1884
1885 r = nv_set_ip_blocks(adev);
1886 if (r)
1887 return r;
1888 break;
d38ceaf9
AD
1889 default:
1890 /* FIXME: not supported yet */
1891 return -EINVAL;
1892 }
1893
1884734a 1894 amdgpu_amdkfd_device_probe(adev);
1895
3b94fb10 1896 adev->pm.pp_feature = amdgpu_pp_feature_mask;
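	/* GFXOFF is not usable under SR-IOV or when KFD runs without HWS, so mask it out */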
a35ad98b 1897 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
00544006 1898 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
00f54b97 1899
d38ceaf9
AD
1900 for (i = 0; i < adev->num_ip_blocks; i++) {
1901 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
ed8cf00c
HR
1902 DRM_ERROR("disabled ip block: %d <%s>\n",
1903 i, adev->ip_blocks[i].version->funcs->name);
a1255107 1904 adev->ip_blocks[i].status.valid = false;
d38ceaf9 1905 } else {
a1255107
AD
1906 if (adev->ip_blocks[i].version->funcs->early_init) {
1907 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2c1a2784 1908 if (r == -ENOENT) {
a1255107 1909 adev->ip_blocks[i].status.valid = false;
2c1a2784 1910 } else if (r) {
a1255107
AD
1911 DRM_ERROR("early_init of IP block <%s> failed %d\n",
1912 adev->ip_blocks[i].version->funcs->name, r);
d38ceaf9 1913 return r;
2c1a2784 1914 } else {
a1255107 1915 adev->ip_blocks[i].status.valid = true;
2c1a2784 1916 }
974e6b64 1917 } else {
a1255107 1918 adev->ip_blocks[i].status.valid = true;
d38ceaf9 1919 }
d38ceaf9 1920 }
21a249ca
AD
1921 /* get the vbios after the asic_funcs are set up */
1922 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
6e29c227
AD
1923 r = amdgpu_device_parse_gpu_info_fw(adev);
1924 if (r)
1925 return r;
1926
21a249ca
AD
1927 /* Read BIOS */
1928 if (!amdgpu_get_bios(adev))
1929 return -EINVAL;
1930
1931 r = amdgpu_atombios_init(adev);
1932 if (r) {
1933 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
1934 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
1935 return r;
1936 }
1937 }
d38ceaf9
AD
1938 }
1939
395d1fb9
NH
1940 adev->cg_flags &= amdgpu_cg_mask;
1941 adev->pg_flags &= amdgpu_pg_mask;
1942
d38ceaf9
AD
1943 return 0;
1944}
1945
0a4f2520
RZ
1946static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
1947{
1948 int i, r;
1949
1950 for (i = 0; i < adev->num_ip_blocks; i++) {
1951 if (!adev->ip_blocks[i].status.sw)
1952 continue;
1953 if (adev->ip_blocks[i].status.hw)
1954 continue;
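		/* phase 1 only brings up COMMON, IH and, under SR-IOV, PSP; the rest follows in phase 2 */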
1955 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2d11fd3f 1956 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
0a4f2520
RZ
1957 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
1958 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
1959 if (r) {
1960 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
1961 adev->ip_blocks[i].version->funcs->name, r);
1962 return r;
1963 }
1964 adev->ip_blocks[i].status.hw = true;
1965 }
1966 }
1967
1968 return 0;
1969}
1970
1971static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
1972{
1973 int i, r;
1974
1975 for (i = 0; i < adev->num_ip_blocks; i++) {
1976 if (!adev->ip_blocks[i].status.sw)
1977 continue;
1978 if (adev->ip_blocks[i].status.hw)
1979 continue;
1980 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
1981 if (r) {
1982 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
1983 adev->ip_blocks[i].version->funcs->name, r);
1984 return r;
1985 }
1986 adev->ip_blocks[i].status.hw = true;
1987 }
1988
1989 return 0;
1990}
1991
7a3e0bb2
RZ
1992static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
1993{
1994 int r = 0;
1995 int i;
80f41f84 1996 uint32_t smu_version;
7a3e0bb2
RZ
1997
1998 if (adev->asic_type >= CHIP_VEGA10) {
1999 for (i = 0; i < adev->num_ip_blocks; i++) {
482f0e53
ML
2000 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2001 continue;
2002
 2003			/* no need to do the fw loading again if already done */
2004 if (adev->ip_blocks[i].status.hw == true)
2005 break;
2006
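			/* on reset or resume the PSP block is resumed, otherwise it gets a full hw_init */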
53b3f8f4 2007 if (amdgpu_in_reset(adev) || adev->in_suspend) {
482f0e53
ML
2008 r = adev->ip_blocks[i].version->funcs->resume(adev);
2009 if (r) {
2010 DRM_ERROR("resume of IP block <%s> failed %d\n",
7a3e0bb2 2011 adev->ip_blocks[i].version->funcs->name, r);
482f0e53
ML
2012 return r;
2013 }
2014 } else {
2015 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2016 if (r) {
2017 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2018 adev->ip_blocks[i].version->funcs->name, r);
2019 return r;
7a3e0bb2 2020 }
7a3e0bb2 2021 }
482f0e53
ML
2022
2023 adev->ip_blocks[i].status.hw = true;
2024 break;
7a3e0bb2
RZ
2025 }
2026 }
482f0e53 2027
8973d9ec
ED
2028 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2029 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
7a3e0bb2 2030
80f41f84 2031 return r;
7a3e0bb2
RZ
2032}
2033
e3ecdffa
AD
2034/**
2035 * amdgpu_device_ip_init - run init for hardware IPs
2036 *
2037 * @adev: amdgpu_device pointer
2038 *
2039 * Main initialization pass for hardware IPs. The list of all the hardware
2040 * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2041 * are run. sw_init initializes the software state associated with each IP
2042 * and hw_init initializes the hardware associated with each IP.
2043 * Returns 0 on success, negative error code on failure.
2044 */
06ec9070 2045static int amdgpu_device_ip_init(struct amdgpu_device *adev)
d38ceaf9
AD
2046{
2047 int i, r;
2048
c030f2e4 2049 r = amdgpu_ras_init(adev);
2050 if (r)
2051 return r;
2052
d38ceaf9 2053 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 2054 if (!adev->ip_blocks[i].status.valid)
d38ceaf9 2055 continue;
a1255107 2056 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2c1a2784 2057 if (r) {
a1255107
AD
2058 DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2059 adev->ip_blocks[i].version->funcs->name, r);
72d3f592 2060 goto init_failed;
2c1a2784 2061 }
a1255107 2062 adev->ip_blocks[i].status.sw = true;
bfca0289 2063
d38ceaf9 2064 /* need to do gmc hw init early so we can allocate gpu mem */
a1255107 2065 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
06ec9070 2066 r = amdgpu_device_vram_scratch_init(adev);
2c1a2784
AD
2067 if (r) {
2068 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
72d3f592 2069 goto init_failed;
2c1a2784 2070 }
a1255107 2071 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2c1a2784
AD
2072 if (r) {
2073 DRM_ERROR("hw_init %d failed %d\n", i, r);
72d3f592 2074 goto init_failed;
2c1a2784 2075 }
06ec9070 2076 r = amdgpu_device_wb_init(adev);
2c1a2784 2077 if (r) {
06ec9070 2078 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
72d3f592 2079 goto init_failed;
2c1a2784 2080 }
a1255107 2081 adev->ip_blocks[i].status.hw = true;
2493664f
ML
2082
2083 /* right after GMC hw init, we create CSA */
f92d5c61 2084 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
1e256e27
RZ
2085 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2086 AMDGPU_GEM_DOMAIN_VRAM,
2087 AMDGPU_CSA_SIZE);
2493664f
ML
2088 if (r) {
2089 DRM_ERROR("allocate CSA failed %d\n", r);
72d3f592 2090 goto init_failed;
2493664f
ML
2091 }
2092 }
d38ceaf9
AD
2093 }
2094 }
2095
c9ffa427
YT
2096 if (amdgpu_sriov_vf(adev))
2097 amdgpu_virt_init_data_exchange(adev);
2098
533aed27
AG
2099 r = amdgpu_ib_pool_init(adev);
2100 if (r) {
2101 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2102 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2103 goto init_failed;
2104 }
2105
c8963ea4
RZ
2106 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2107 if (r)
72d3f592 2108 goto init_failed;
0a4f2520
RZ
2109
2110 r = amdgpu_device_ip_hw_init_phase1(adev);
2111 if (r)
72d3f592 2112 goto init_failed;
0a4f2520 2113
7a3e0bb2
RZ
2114 r = amdgpu_device_fw_loading(adev);
2115 if (r)
72d3f592 2116 goto init_failed;
7a3e0bb2 2117
0a4f2520
RZ
2118 r = amdgpu_device_ip_hw_init_phase2(adev);
2119 if (r)
72d3f592 2120 goto init_failed;
d38ceaf9 2121
121a2bc6
AG
2122 /*
2123 * retired pages will be loaded from eeprom and reserved here,
2124 * it should be called after amdgpu_device_ip_hw_init_phase2 since
2125 * for some ASICs the RAS EEPROM code relies on SMU fully functioning
 2126	 * for I2C communication, which is only true at this point.
b82e65a9
GC
2127 *
 2128	 * amdgpu_ras_recovery_init may fail, but the upper layers only care about
 2129	 * failures caused by a bad gpu state and stop the amdgpu init process
 2130	 * accordingly. For other failure cases it still releases all
 2131	 * the resources and prints an error message, rather than returning a
 2132	 * negative value to the upper level.
121a2bc6
AG
2133 *
2134 * Note: theoretically, this should be called before all vram allocations
 2135	 * to protect retired pages from being allocated again
2136 */
b82e65a9
GC
2137 r = amdgpu_ras_recovery_init(adev);
2138 if (r)
2139 goto init_failed;
121a2bc6 2140
3e2e2ab5
HZ
2141 if (adev->gmc.xgmi.num_physical_nodes > 1)
2142 amdgpu_xgmi_add_device(adev);
1884734a 2143 amdgpu_amdkfd_device_init(adev);
c6332b97 2144
bd607166
KR
2145 amdgpu_fru_get_product_info(adev);
2146
72d3f592 2147init_failed:
c9ffa427 2148 if (amdgpu_sriov_vf(adev))
c6332b97 2149 amdgpu_virt_release_full_gpu(adev, true);
2150
72d3f592 2151 return r;
d38ceaf9
AD
2152}
2153
e3ecdffa
AD
2154/**
2155 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2156 *
2157 * @adev: amdgpu_device pointer
2158 *
2159 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
2160 * this function before a GPU reset. If the value is retained after a
 2161	 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
2162 */
06ec9070 2163static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
0c49e0b8
CZ
2164{
2165 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2166}
2167
e3ecdffa
AD
2168/**
2169 * amdgpu_device_check_vram_lost - check if vram is valid
2170 *
2171 * @adev: amdgpu_device pointer
2172 *
2173 * Checks the reset magic value written to the gart pointer in VRAM.
2174 * The driver calls this after a GPU reset to see if the contents of
 2175	 * VRAM have been lost or not.
2176 * returns true if vram is lost, false if not.
2177 */
06ec9070 2178static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
0c49e0b8 2179{
dadce777
EQ
2180 if (memcmp(adev->gart.ptr, adev->reset_magic,
2181 AMDGPU_RESET_MAGIC_NUM))
2182 return true;
2183
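	/* the magic matches; if we are not in a GPU reset, VRAM was certainly preserved */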
53b3f8f4 2184 if (!amdgpu_in_reset(adev))
dadce777
EQ
2185 return false;
2186
2187 /*
2188 * For all ASICs with baco/mode1 reset, the VRAM is
2189 * always assumed to be lost.
2190 */
2191 switch (amdgpu_asic_reset_method(adev)) {
2192 case AMD_RESET_METHOD_BACO:
2193 case AMD_RESET_METHOD_MODE1:
2194 return true;
2195 default:
2196 return false;
2197 }
0c49e0b8
CZ
2198}
2199
e3ecdffa 2200/**
1112a46b 2201 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
e3ecdffa
AD
2202 *
2203 * @adev: amdgpu_device pointer
b8b72130 2204 * @state: clockgating state (gate or ungate)
e3ecdffa 2205 *
e3ecdffa 2206 * The list of all the hardware IPs that make up the asic is walked and the
1112a46b
RZ
2207 * set_clockgating_state callbacks are run.
 2208	 * During late init this is called with the gate state to enable clockgating;
 2209	 * during fini or suspend it is called with the ungate state to disable it.
e3ecdffa
AD
2210 * Returns 0 on success, negative error code on failure.
2211 */
fdd34271 2212
1112a46b
RZ
2213static int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2214 enum amd_clockgating_state state)
d38ceaf9 2215{
1112a46b 2216 int i, j, r;
d38ceaf9 2217
4a2ba394
SL
2218 if (amdgpu_emu_mode == 1)
2219 return 0;
2220
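	/* gating walks the IP list in order, ungating walks it in reverse */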
1112a46b
RZ
2221 for (j = 0; j < adev->num_ip_blocks; j++) {
2222 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
a2d31dc3 2223 if (!adev->ip_blocks[i].status.late_initialized)
d38ceaf9 2224 continue;
4a446d55 2225 /* skip CG for VCE/UVD, it's handled specially */
a1255107 2226 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
57716327 2227 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
34319b32 2228 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
52f2e779 2229 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
57716327 2230 adev->ip_blocks[i].version->funcs->set_clockgating_state) {
4a446d55 2231 /* enable clockgating to save power */
a1255107 2232 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
1112a46b 2233 state);
4a446d55
AD
2234 if (r) {
2235 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
a1255107 2236 adev->ip_blocks[i].version->funcs->name, r);
4a446d55
AD
2237 return r;
2238 }
b0b00ff1 2239 }
d38ceaf9 2240 }
06b18f61 2241
c9f96fd5
RZ
2242 return 0;
2243}
2244
1112a46b 2245static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state)
c9f96fd5 2246{
1112a46b 2247 int i, j, r;
06b18f61 2248
c9f96fd5
RZ
2249 if (amdgpu_emu_mode == 1)
2250 return 0;
2251
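	/* powergating uses the same ordering: forward when gating, reverse when ungating */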
1112a46b
RZ
2252 for (j = 0; j < adev->num_ip_blocks; j++) {
2253 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
a2d31dc3 2254 if (!adev->ip_blocks[i].status.late_initialized)
c9f96fd5
RZ
2255 continue;
2256 /* skip CG for VCE/UVD, it's handled specially */
2257 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2258 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2259 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
52f2e779 2260 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
c9f96fd5
RZ
2261 adev->ip_blocks[i].version->funcs->set_powergating_state) {
2262 /* enable powergating to save power */
2263 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
1112a46b 2264 state);
c9f96fd5
RZ
2265 if (r) {
2266 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2267 adev->ip_blocks[i].version->funcs->name, r);
2268 return r;
2269 }
2270 }
2271 }
2dc80b00
S
2272 return 0;
2273}
2274
beff74bc
AD
2275static int amdgpu_device_enable_mgpu_fan_boost(void)
2276{
2277 struct amdgpu_gpu_instance *gpu_ins;
2278 struct amdgpu_device *adev;
2279 int i, ret = 0;
2280
2281 mutex_lock(&mgpu_info.mutex);
2282
2283 /*
2284 * MGPU fan boost feature should be enabled
2285 * only when there are two or more dGPUs in
2286 * the system
2287 */
2288 if (mgpu_info.num_dgpu < 2)
2289 goto out;
2290
2291 for (i = 0; i < mgpu_info.num_dgpu; i++) {
2292 gpu_ins = &(mgpu_info.gpu_ins[i]);
2293 adev = gpu_ins->adev;
2294 if (!(adev->flags & AMD_IS_APU) &&
f10bb940 2295 !gpu_ins->mgpu_fan_enabled) {
beff74bc
AD
2296 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2297 if (ret)
2298 break;
2299
2300 gpu_ins->mgpu_fan_enabled = 1;
2301 }
2302 }
2303
2304out:
2305 mutex_unlock(&mgpu_info.mutex);
2306
2307 return ret;
2308}
2309
e3ecdffa
AD
2310/**
2311 * amdgpu_device_ip_late_init - run late init for hardware IPs
2312 *
2313 * @adev: amdgpu_device pointer
2314 *
2315 * Late initialization pass for hardware IPs. The list of all the hardware
2316 * IPs that make up the asic is walked and the late_init callbacks are run.
2317 * late_init covers any special initialization that an IP requires
 2318	 * after all of the IP blocks have been initialized or something that needs to happen
2319 * late in the init process.
2320 * Returns 0 on success, negative error code on failure.
2321 */
06ec9070 2322static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2dc80b00 2323{
60599a03 2324 struct amdgpu_gpu_instance *gpu_instance;
2dc80b00
S
2325 int i = 0, r;
2326
2327 for (i = 0; i < adev->num_ip_blocks; i++) {
73f847db 2328 if (!adev->ip_blocks[i].status.hw)
2dc80b00
S
2329 continue;
2330 if (adev->ip_blocks[i].version->funcs->late_init) {
2331 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2332 if (r) {
2333 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2334 adev->ip_blocks[i].version->funcs->name, r);
2335 return r;
2336 }
2dc80b00 2337 }
73f847db 2338 adev->ip_blocks[i].status.late_initialized = true;
2dc80b00
S
2339 }
2340
a891d239
DL
2341 amdgpu_ras_set_error_query_ready(adev, true);
2342
1112a46b
RZ
2343 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2344 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
916ac57f 2345
06ec9070 2346 amdgpu_device_fill_reset_magic(adev);
d38ceaf9 2347
beff74bc
AD
2348 r = amdgpu_device_enable_mgpu_fan_boost();
2349 if (r)
2350 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2351
60599a03
EQ
2352
2353 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2354 mutex_lock(&mgpu_info.mutex);
2355
2356 /*
2357 * Reset device p-state to low as this was booted with high.
2358 *
2359 * This should be performed only after all devices from the same
2360 * hive get initialized.
2361 *
 2362		 * However, the number of devices in the hive is not known in advance;
 2363		 * it is counted one by one as the devices are initialized.
2364 *
2365 * So, we wait for all XGMI interlinked devices initialized.
2366 * This may bring some delays as those devices may come from
2367 * different hives. But that should be OK.
2368 */
2369 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2370 for (i = 0; i < mgpu_info.num_gpu; i++) {
2371 gpu_instance = &(mgpu_info.gpu_ins[i]);
2372 if (gpu_instance->adev->flags & AMD_IS_APU)
2373 continue;
2374
d84a430d
JK
2375 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2376 AMDGPU_XGMI_PSTATE_MIN);
60599a03
EQ
2377 if (r) {
2378 DRM_ERROR("pstate setting failed (%d).\n", r);
2379 break;
2380 }
2381 }
2382 }
2383
2384 mutex_unlock(&mgpu_info.mutex);
2385 }
2386
d38ceaf9
AD
2387 return 0;
2388}
2389
e3ecdffa
AD
2390/**
2391 * amdgpu_device_ip_fini - run fini for hardware IPs
2392 *
2393 * @adev: amdgpu_device pointer
2394 *
2395 * Main teardown pass for hardware IPs. The list of all the hardware
2396 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2397 * are run. hw_fini tears down the hardware associated with each IP
2398 * and sw_fini tears down any software state associated with each IP.
2399 * Returns 0 on success, negative error code on failure.
2400 */
06ec9070 2401static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
d38ceaf9
AD
2402{
2403 int i, r;
2404
5278a159
SY
2405 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2406 amdgpu_virt_release_ras_err_handler_data(adev);
2407
c030f2e4 2408 amdgpu_ras_pre_fini(adev);
2409
a82400b5
AG
2410 if (adev->gmc.xgmi.num_physical_nodes > 1)
2411 amdgpu_xgmi_remove_device(adev);
2412
1884734a 2413 amdgpu_amdkfd_device_fini(adev);
05df1f01
RZ
2414
2415 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
fdd34271
RZ
2416 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2417
3e96dbfd
AD
2418 /* need to disable SMC first */
2419 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 2420 if (!adev->ip_blocks[i].status.hw)
3e96dbfd 2421 continue;
fdd34271 2422 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
a1255107 2423 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
3e96dbfd
AD
2424 /* XXX handle errors */
2425 if (r) {
2426 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
a1255107 2427 adev->ip_blocks[i].version->funcs->name, r);
3e96dbfd 2428 }
a1255107 2429 adev->ip_blocks[i].status.hw = false;
3e96dbfd
AD
2430 break;
2431 }
2432 }
2433
d38ceaf9 2434 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
a1255107 2435 if (!adev->ip_blocks[i].status.hw)
d38ceaf9 2436 continue;
8201a67a 2437
a1255107 2438 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
d38ceaf9 2439 /* XXX handle errors */
2c1a2784 2440 if (r) {
a1255107
AD
2441 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2442 adev->ip_blocks[i].version->funcs->name, r);
2c1a2784 2443 }
8201a67a 2444
a1255107 2445 adev->ip_blocks[i].status.hw = false;
d38ceaf9
AD
2446 }
2447
9950cda2 2448
d38ceaf9 2449 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
a1255107 2450 if (!adev->ip_blocks[i].status.sw)
d38ceaf9 2451 continue;
c12aba3a
ML
2452
2453 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
c8963ea4 2454 amdgpu_ucode_free_bo(adev);
1e256e27 2455 amdgpu_free_static_csa(&adev->virt.csa_obj);
c12aba3a
ML
2456 amdgpu_device_wb_fini(adev);
2457 amdgpu_device_vram_scratch_fini(adev);
533aed27 2458 amdgpu_ib_pool_fini(adev);
c12aba3a
ML
2459 }
2460
a1255107 2461 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
d38ceaf9 2462 /* XXX handle errors */
2c1a2784 2463 if (r) {
a1255107
AD
2464 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2465 adev->ip_blocks[i].version->funcs->name, r);
2c1a2784 2466 }
a1255107
AD
2467 adev->ip_blocks[i].status.sw = false;
2468 adev->ip_blocks[i].status.valid = false;
d38ceaf9
AD
2469 }
2470
a6dcfd9c 2471 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
a1255107 2472 if (!adev->ip_blocks[i].status.late_initialized)
8a2eef1d 2473 continue;
a1255107
AD
2474 if (adev->ip_blocks[i].version->funcs->late_fini)
2475 adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2476 adev->ip_blocks[i].status.late_initialized = false;
a6dcfd9c
ML
2477 }
2478
c030f2e4 2479 amdgpu_ras_fini(adev);
2480
030308fc 2481 if (amdgpu_sriov_vf(adev))
24136135
ML
2482 if (amdgpu_virt_release_full_gpu(adev, false))
2483 DRM_ERROR("failed to release exclusive mode on fini\n");
2493664f 2484
d38ceaf9
AD
2485 return 0;
2486}
2487
e3ecdffa 2488/**
beff74bc 2489 * amdgpu_device_delayed_init_work_handler - work handler for IB tests
e3ecdffa 2490 *
1112a46b 2491 * @work: work_struct.
e3ecdffa 2492 */
beff74bc 2493static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2dc80b00
S
2494{
2495 struct amdgpu_device *adev =
beff74bc 2496 container_of(work, struct amdgpu_device, delayed_init_work.work);
916ac57f
RZ
2497 int r;
2498
2499 r = amdgpu_ib_ring_tests(adev);
2500 if (r)
2501 DRM_ERROR("ib ring test failed (%d).\n", r);
2dc80b00
S
2502}
2503
1e317b99
RZ
2504static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2505{
2506 struct amdgpu_device *adev =
2507 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2508
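	/* GFXOFF may only be entered when it is not already on and nothing holds an active request against it */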
2509 mutex_lock(&adev->gfx.gfx_off_mutex);
2510 if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
2511 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2512 adev->gfx.gfx_off_state = true;
2513 }
2514 mutex_unlock(&adev->gfx.gfx_off_mutex);
2515}
2516
e3ecdffa 2517/**
e7854a03 2518 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
e3ecdffa
AD
2519 *
2520 * @adev: amdgpu_device pointer
2521 *
2522 * Main suspend function for hardware IPs. The list of all the hardware
2523 * IPs that make up the asic is walked, clockgating is disabled and the
2524 * suspend callbacks are run. suspend puts the hardware and software state
2525 * in each IP into a state suitable for suspend.
2526 * Returns 0 on success, negative error code on failure.
2527 */
e7854a03
AD
2528static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2529{
2530 int i, r;
2531
ced1ba97
PL
2532 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2533 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
05df1f01 2534
e7854a03
AD
2535 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2536 if (!adev->ip_blocks[i].status.valid)
2537 continue;
2b9f7848 2538
e7854a03 2539 /* displays are handled separately */
2b9f7848
ND
2540 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2541 continue;
2542
2543 /* XXX handle errors */
2544 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2545 /* XXX handle errors */
2546 if (r) {
2547 DRM_ERROR("suspend of IP block <%s> failed %d\n",
2548 adev->ip_blocks[i].version->funcs->name, r);
2549 return r;
e7854a03 2550 }
2b9f7848
ND
2551
2552 adev->ip_blocks[i].status.hw = false;
e7854a03
AD
2553 }
2554
e7854a03
AD
2555 return 0;
2556}
2557
2558/**
2559 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2560 *
2561 * @adev: amdgpu_device pointer
2562 *
2563 * Main suspend function for hardware IPs. The list of all the hardware
2564 * IPs that make up the asic is walked, clockgating is disabled and the
2565 * suspend callbacks are run. suspend puts the hardware and software state
2566 * in each IP into a state suitable for suspend.
2567 * Returns 0 on success, negative error code on failure.
2568 */
2569static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
d38ceaf9
AD
2570{
2571 int i, r;
2572
2573 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
a1255107 2574 if (!adev->ip_blocks[i].status.valid)
d38ceaf9 2575 continue;
e7854a03
AD
2576 /* displays are handled in phase1 */
2577 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2578 continue;
bff77e86
LM
2579 /* PSP lost connection when err_event_athub occurs */
2580 if (amdgpu_ras_intr_triggered() &&
2581 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2582 adev->ip_blocks[i].status.hw = false;
2583 continue;
2584 }
d38ceaf9 2585 /* XXX handle errors */
a1255107 2586 r = adev->ip_blocks[i].version->funcs->suspend(adev);
d38ceaf9 2587 /* XXX handle errors */
2c1a2784 2588 if (r) {
a1255107
AD
2589 DRM_ERROR("suspend of IP block <%s> failed %d\n",
2590 adev->ip_blocks[i].version->funcs->name, r);
2c1a2784 2591 }
876923fb 2592 adev->ip_blocks[i].status.hw = false;
a3a09142 2593 /* handle putting the SMC in the appropriate state */
86b93fd6
JZ
 2594		if (!amdgpu_sriov_vf(adev)) {
2595 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2596 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
2597 if (r) {
2598 DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
2599 adev->mp1_state, r);
2600 return r;
2601 }
a3a09142
AD
2602 }
2603 }
b5507c7e 2604 adev->ip_blocks[i].status.hw = false;
d38ceaf9
AD
2605 }
2606
2607 return 0;
2608}
2609
e7854a03
AD
2610/**
2611 * amdgpu_device_ip_suspend - run suspend for hardware IPs
2612 *
2613 * @adev: amdgpu_device pointer
2614 *
2615 * Main suspend function for hardware IPs. The list of all the hardware
2616 * IPs that make up the asic is walked, clockgating is disabled and the
2617 * suspend callbacks are run. suspend puts the hardware and software state
2618 * in each IP into a state suitable for suspend.
2619 * Returns 0 on success, negative error code on failure.
2620 */
2621int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
2622{
2623 int r;
2624
e7819644
YT
2625 if (amdgpu_sriov_vf(adev))
2626 amdgpu_virt_request_full_gpu(adev, false);
2627
e7854a03
AD
2628 r = amdgpu_device_ip_suspend_phase1(adev);
2629 if (r)
2630 return r;
2631 r = amdgpu_device_ip_suspend_phase2(adev);
2632
e7819644
YT
2633 if (amdgpu_sriov_vf(adev))
2634 amdgpu_virt_release_full_gpu(adev, false);
2635
e7854a03
AD
2636 return r;
2637}
2638
06ec9070 2639static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
a90ad3c2
ML
2640{
2641 int i, r;
2642
2cb681b6
ML
2643 static enum amd_ip_block_type ip_order[] = {
2644 AMD_IP_BLOCK_TYPE_GMC,
2645 AMD_IP_BLOCK_TYPE_COMMON,
39186aef 2646 AMD_IP_BLOCK_TYPE_PSP,
2cb681b6
ML
2647 AMD_IP_BLOCK_TYPE_IH,
2648 };
a90ad3c2 2649
2cb681b6
ML
2650 for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2651 int j;
2652 struct amdgpu_ip_block *block;
a90ad3c2 2653
4cd2a96d
J
2654 block = &adev->ip_blocks[i];
2655 block->status.hw = false;
2cb681b6 2656
4cd2a96d 2657 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
2cb681b6 2658
4cd2a96d 2659 if (block->version->type != ip_order[j] ||
2cb681b6
ML
2660 !block->status.valid)
2661 continue;
2662
2663 r = block->version->funcs->hw_init(adev);
0aaeefcc 2664			DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r ? "failed" : "succeeded");
c41d1cf6
ML
2665 if (r)
2666 return r;
482f0e53 2667 block->status.hw = true;
a90ad3c2
ML
2668 }
2669 }
2670
2671 return 0;
2672}
2673
06ec9070 2674static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
a90ad3c2
ML
2675{
2676 int i, r;
2677
2cb681b6
ML
2678 static enum amd_ip_block_type ip_order[] = {
2679 AMD_IP_BLOCK_TYPE_SMC,
2680 AMD_IP_BLOCK_TYPE_DCE,
2681 AMD_IP_BLOCK_TYPE_GFX,
2682 AMD_IP_BLOCK_TYPE_SDMA,
257deb8c 2683 AMD_IP_BLOCK_TYPE_UVD,
d83c7a07
JJ
2684 AMD_IP_BLOCK_TYPE_VCE,
2685 AMD_IP_BLOCK_TYPE_VCN
2cb681b6 2686 };
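	/* under SR-IOV the remaining blocks are brought back up in this fixed order */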
a90ad3c2 2687
2cb681b6
ML
2688 for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2689 int j;
2690 struct amdgpu_ip_block *block;
a90ad3c2 2691
2cb681b6
ML
2692 for (j = 0; j < adev->num_ip_blocks; j++) {
2693 block = &adev->ip_blocks[j];
2694
2695 if (block->version->type != ip_order[i] ||
482f0e53
ML
2696 !block->status.valid ||
2697 block->status.hw)
2cb681b6
ML
2698 continue;
2699
895bd048
JZ
2700 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
2701 r = block->version->funcs->resume(adev);
2702 else
2703 r = block->version->funcs->hw_init(adev);
2704
0aaeefcc 2705			DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r ? "failed" : "succeeded");
c41d1cf6
ML
2706 if (r)
2707 return r;
482f0e53 2708 block->status.hw = true;
a90ad3c2
ML
2709 }
2710 }
2711
2712 return 0;
2713}
2714
e3ecdffa
AD
2715/**
2716 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
2717 *
2718 * @adev: amdgpu_device pointer
2719 *
2720 * First resume function for hardware IPs. The list of all the hardware
2721 * IPs that make up the asic is walked and the resume callbacks are run for
2722 * COMMON, GMC, and IH. resume puts the hardware into a functional state
2723 * after a suspend and updates the software state as necessary. This
2724 * function is also used for restoring the GPU after a GPU reset.
2725 * Returns 0 on success, negative error code on failure.
2726 */
06ec9070 2727static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
d38ceaf9
AD
2728{
2729 int i, r;
2730
a90ad3c2 2731 for (i = 0; i < adev->num_ip_blocks; i++) {
482f0e53 2732 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
a90ad3c2 2733 continue;
a90ad3c2 2734 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
e3ecdffa
AD
2735 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2736 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
482f0e53 2737
fcf0649f
CZ
2738 r = adev->ip_blocks[i].version->funcs->resume(adev);
2739 if (r) {
2740 DRM_ERROR("resume of IP block <%s> failed %d\n",
2741 adev->ip_blocks[i].version->funcs->name, r);
2742 return r;
2743 }
482f0e53 2744 adev->ip_blocks[i].status.hw = true;
a90ad3c2
ML
2745 }
2746 }
2747
2748 return 0;
2749}
2750
e3ecdffa
AD
2751/**
2752 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
2753 *
2754 * @adev: amdgpu_device pointer
2755 *
2756 * First resume function for hardware IPs. The list of all the hardware
2757 * IPs that make up the asic is walked and the resume callbacks are run for
2758 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a
2759 * functional state after a suspend and updates the software state as
2760 * necessary. This function is also used for restoring the GPU after a GPU
2761 * reset.
2762 * Returns 0 on success, negative error code on failure.
2763 */
06ec9070 2764static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
d38ceaf9
AD
2765{
2766 int i, r;
2767
2768 for (i = 0; i < adev->num_ip_blocks; i++) {
482f0e53 2769 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
d38ceaf9 2770 continue;
fcf0649f 2771 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
e3ecdffa 2772 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
7a3e0bb2
RZ
2773 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
2774 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
fcf0649f 2775 continue;
a1255107 2776 r = adev->ip_blocks[i].version->funcs->resume(adev);
2c1a2784 2777 if (r) {
a1255107
AD
2778 DRM_ERROR("resume of IP block <%s> failed %d\n",
2779 adev->ip_blocks[i].version->funcs->name, r);
d38ceaf9 2780 return r;
2c1a2784 2781 }
482f0e53 2782 adev->ip_blocks[i].status.hw = true;
d38ceaf9
AD
2783 }
2784
2785 return 0;
2786}
2787
e3ecdffa
AD
2788/**
2789 * amdgpu_device_ip_resume - run resume for hardware IPs
2790 *
2791 * @adev: amdgpu_device pointer
2792 *
2793 * Main resume function for hardware IPs. The hardware IPs
 2794 * are split into two resume functions because they are
 2795 * also used in recovering from a GPU reset and some additional
 2796 * steps need to be taken between them. In this case (S3/S4) they are
2797 * run sequentially.
2798 * Returns 0 on success, negative error code on failure.
2799 */
06ec9070 2800static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
fcf0649f
CZ
2801{
2802 int r;
2803
06ec9070 2804 r = amdgpu_device_ip_resume_phase1(adev);
fcf0649f
CZ
2805 if (r)
2806 return r;
7a3e0bb2
RZ
2807
2808 r = amdgpu_device_fw_loading(adev);
2809 if (r)
2810 return r;
2811
06ec9070 2812 r = amdgpu_device_ip_resume_phase2(adev);
fcf0649f
CZ
2813
2814 return r;
2815}
2816
e3ecdffa
AD
2817/**
2818 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
2819 *
2820 * @adev: amdgpu_device pointer
2821 *
2822 * Query the VBIOS data tables to determine if the board supports SR-IOV.
2823 */
4e99a44e 2824static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
048765ad 2825{
6867e1b5
ML
2826 if (amdgpu_sriov_vf(adev)) {
2827 if (adev->is_atom_fw) {
2828 if (amdgpu_atomfirmware_gpu_supports_virtualization(adev))
2829 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2830 } else {
2831 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
2832 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2833 }
2834
2835 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
2836 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
a5bde2f9 2837 }
048765ad
AR
2838}
2839
e3ecdffa
AD
2840/**
2841 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
2842 *
2843 * @asic_type: AMD asic type
2844 *
 2845 * Check if there is DC (new modesetting infrastructure) support for an asic.
2846 * returns true if DC has support, false if not.
2847 */
4562236b
HW
2848bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
2849{
2850 switch (asic_type) {
2851#if defined(CONFIG_DRM_AMD_DC)
64200c46
MR
2852#if defined(CONFIG_DRM_AMD_DC_SI)
2853 case CHIP_TAHITI:
2854 case CHIP_PITCAIRN:
2855 case CHIP_VERDE:
2856 case CHIP_OLAND:
2857#endif
4562236b 2858 case CHIP_BONAIRE:
0d6fbccb 2859 case CHIP_KAVERI:
367e6687
AD
2860 case CHIP_KABINI:
2861 case CHIP_MULLINS:
d9fda248
HW
2862 /*
2863 * We have systems in the wild with these ASICs that require
2864 * LVDS and VGA support which is not supported with DC.
2865 *
2866 * Fallback to the non-DC driver here by default so as not to
2867 * cause regressions.
2868 */
2869 return amdgpu_dc > 0;
2870 case CHIP_HAWAII:
4562236b
HW
2871 case CHIP_CARRIZO:
2872 case CHIP_STONEY:
4562236b 2873 case CHIP_POLARIS10:
675fd32b 2874 case CHIP_POLARIS11:
2c8ad2d5 2875 case CHIP_POLARIS12:
675fd32b 2876 case CHIP_VEGAM:
4562236b
HW
2877 case CHIP_TONGA:
2878 case CHIP_FIJI:
42f8ffa1 2879 case CHIP_VEGA10:
dca7b401 2880 case CHIP_VEGA12:
c6034aa2 2881 case CHIP_VEGA20:
b86a1aa3 2882#if defined(CONFIG_DRM_AMD_DC_DCN)
fd187853 2883 case CHIP_RAVEN:
b4f199c7 2884 case CHIP_NAVI10:
8fceceb6 2885 case CHIP_NAVI14:
078655d9 2886 case CHIP_NAVI12:
e1c14c43 2887 case CHIP_RENOIR:
81d9bfb8
JFZ
2888#endif
2889#if defined(CONFIG_DRM_AMD_DC_DCN3_0)
2890 case CHIP_SIENNA_CICHLID:
a6c5308f 2891 case CHIP_NAVY_FLOUNDER:
42f8ffa1 2892#endif
fd187853 2893 return amdgpu_dc != 0;
4562236b
HW
2894#endif
2895 default:
93b09a9a
SS
2896 if (amdgpu_dc > 0)
2897 DRM_INFO("Display Core has been requested via kernel parameter "
2898 "but isn't supported by ASIC, ignoring\n");
4562236b
HW
2899 return false;
2900 }
2901}
2902
2903/**
2904 * amdgpu_device_has_dc_support - check if dc is supported
2905 *
2906 * @adev: amdgpu_device_pointer
2907 *
2908 * Returns true for supported, false for not supported
2909 */
2910bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
2911{
c997e8e2 2912 if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display)
2555039d
XY
2913 return false;
2914
4562236b
HW
2915 return amdgpu_device_asic_has_dc_support(adev->asic_type);
2916}
2917
d4535e2c
AG
2918
2919static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
2920{
2921 struct amdgpu_device *adev =
2922 container_of(__work, struct amdgpu_device, xgmi_reset_work);
d95e8e97 2923 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
d4535e2c 2924
c6a6e2db
AG
2925 /* It's a bug to not have a hive within this function */
2926 if (WARN_ON(!hive))
2927 return;
2928
2929 /*
2930 * Use task barrier to synchronize all xgmi reset works across the
2931 * hive. task_barrier_enter and task_barrier_exit will block
2932 * until all the threads running the xgmi reset works reach
2933 * those points. task_barrier_full will do both blocks.
2934 */
2935 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
2936
2937 task_barrier_enter(&hive->tb);
4a580877 2938 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
c6a6e2db
AG
2939
2940 if (adev->asic_reset_res)
2941 goto fail;
2942
2943 task_barrier_exit(&hive->tb);
4a580877 2944 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
c6a6e2db
AG
2945
2946 if (adev->asic_reset_res)
2947 goto fail;
43c4d576
JC
2948
2949 if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count)
2950 adev->mmhub.funcs->reset_ras_error_count(adev);
c6a6e2db
AG
2951 } else {
2952
2953 task_barrier_full(&hive->tb);
2954 adev->asic_reset_res = amdgpu_asic_reset(adev);
2955 }
ce316fa5 2956
c6a6e2db 2957fail:
d4535e2c 2958 if (adev->asic_reset_res)
fed184e9 2959 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
4a580877 2960 adev->asic_reset_res, adev_to_drm(adev)->unique);
d95e8e97 2961 amdgpu_put_xgmi_hive(hive);
d4535e2c
AG
2962}
2963
71f98027
AD
2964static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
2965{
2966 char *input = amdgpu_lockup_timeout;
2967 char *timeout_setting = NULL;
2968 int index = 0;
2969 long timeout;
2970 int ret = 0;
2971
2972 /*
 2973	 * By default the timeout for non-compute jobs is 10000 ms
 2974	 * and there is no timeout enforced on compute jobs.
 2975	 * In SR-IOV or passthrough mode, the timeout for compute
b7b2a316 2976	 * jobs is 60000 ms by default.
71f98027
AD
2977 */
2978 adev->gfx_timeout = msecs_to_jiffies(10000);
2979 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
2980 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
b7b2a316 2981 adev->compute_timeout = msecs_to_jiffies(60000);
71f98027
AD
2982 else
2983 adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
2984
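	/*
	 * amdgpu.lockup_timeout takes a comma separated list of values in ms;
	 * they are applied in the order gfx, compute, sdma, video (see the
	 * switch below), and a single value applies to all non-compute queues.
	 */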
f440ff44 2985 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
71f98027 2986 while ((timeout_setting = strsep(&input, ",")) &&
f440ff44 2987 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
71f98027
AD
2988 ret = kstrtol(timeout_setting, 0, &timeout);
2989 if (ret)
2990 return ret;
2991
2992 if (timeout == 0) {
2993 index++;
2994 continue;
2995 } else if (timeout < 0) {
2996 timeout = MAX_SCHEDULE_TIMEOUT;
2997 } else {
2998 timeout = msecs_to_jiffies(timeout);
2999 }
3000
3001 switch (index++) {
3002 case 0:
3003 adev->gfx_timeout = timeout;
3004 break;
3005 case 1:
3006 adev->compute_timeout = timeout;
3007 break;
3008 case 2:
3009 adev->sdma_timeout = timeout;
3010 break;
3011 case 3:
3012 adev->video_timeout = timeout;
3013 break;
3014 default:
3015 break;
3016 }
3017 }
3018 /*
3019 * There is only one value specified and
3020 * it should apply to all non-compute jobs.
3021 */
bcccee89 3022 if (index == 1) {
71f98027 3023 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
bcccee89
ED
3024 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3025 adev->compute_timeout = adev->gfx_timeout;
3026 }
71f98027
AD
3027 }
3028
3029 return ret;
3030}
d4535e2c 3031
77f3a5cd
ND
3032static const struct attribute *amdgpu_dev_attributes[] = {
3033 &dev_attr_product_name.attr,
3034 &dev_attr_product_number.attr,
3035 &dev_attr_serial_number.attr,
3036 &dev_attr_pcie_replay_count.attr,
3037 NULL
3038};
3039
c9a6b82f 3040
d38ceaf9
AD
3041/**
3042 * amdgpu_device_init - initialize the driver
3043 *
3044 * @adev: amdgpu_device pointer
d38ceaf9
AD
3045 * @flags: driver flags
3046 *
3047 * Initializes the driver info and hw (all asics).
3048 * Returns 0 for success or an error on failure.
3049 * Called at driver startup.
3050 */
3051int amdgpu_device_init(struct amdgpu_device *adev,
d38ceaf9
AD
3052 uint32_t flags)
3053{
8aba21b7
LT
3054 struct drm_device *ddev = adev_to_drm(adev);
3055 struct pci_dev *pdev = adev->pdev;
d38ceaf9 3056 int r, i;
3840c5bc 3057 bool boco = false;
95844d20 3058 u32 max_MBps;
d38ceaf9
AD
3059
3060 adev->shutdown = false;
d38ceaf9 3061 adev->flags = flags;
4e66d7d2
YZ
3062
3063 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3064 adev->asic_type = amdgpu_force_asic_type;
3065 else
3066 adev->asic_type = flags & AMD_ASIC_MASK;
3067
d38ceaf9 3068 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
593aa2d2 3069 if (amdgpu_emu_mode == 1)
8bdab6bb 3070 adev->usec_timeout *= 10;
770d13b1 3071 adev->gmc.gart_size = 512 * 1024 * 1024;
d38ceaf9
AD
3072 adev->accel_working = false;
3073 adev->num_rings = 0;
3074 adev->mman.buffer_funcs = NULL;
3075 adev->mman.buffer_funcs_ring = NULL;
3076 adev->vm_manager.vm_pte_funcs = NULL;
0c88b430 3077 adev->vm_manager.vm_pte_num_scheds = 0;
132f34e4 3078 adev->gmc.gmc_funcs = NULL;
f54d1867 3079 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
b8866c26 3080 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
d38ceaf9
AD
3081
3082 adev->smc_rreg = &amdgpu_invalid_rreg;
3083 adev->smc_wreg = &amdgpu_invalid_wreg;
3084 adev->pcie_rreg = &amdgpu_invalid_rreg;
3085 adev->pcie_wreg = &amdgpu_invalid_wreg;
36b9a952
HR
3086 adev->pciep_rreg = &amdgpu_invalid_rreg;
3087 adev->pciep_wreg = &amdgpu_invalid_wreg;
4fa1c6a6
TZ
3088 adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3089 adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
d38ceaf9
AD
3090 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3091 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3092 adev->didt_rreg = &amdgpu_invalid_rreg;
3093 adev->didt_wreg = &amdgpu_invalid_wreg;
ccdbb20a
RZ
3094 adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3095 adev->gc_cac_wreg = &amdgpu_invalid_wreg;
d38ceaf9
AD
3096 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3097 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3098
3e39ab90
AD
3099 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3100 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3101 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
d38ceaf9
AD
3102
 3103	/* mutex initializations are all done here so we
 3104	 * can call functions later without locking issues */
d38ceaf9 3105 atomic_set(&adev->irq.ih.lock, 0);
0e5ca0d1 3106 mutex_init(&adev->firmware.mutex);
d38ceaf9
AD
3107 mutex_init(&adev->pm.mutex);
3108 mutex_init(&adev->gfx.gpu_clock_mutex);
3109 mutex_init(&adev->srbm_mutex);
b8866c26 3110 mutex_init(&adev->gfx.pipe_reserve_mutex);
d23ee13f 3111 mutex_init(&adev->gfx.gfx_off_mutex);
d38ceaf9 3112 mutex_init(&adev->grbm_idx_mutex);
d38ceaf9 3113 mutex_init(&adev->mn_lock);
e23b74aa 3114 mutex_init(&adev->virt.vf_errors.lock);
d38ceaf9 3115 hash_init(adev->mn_hash);
53b3f8f4 3116 atomic_set(&adev->in_gpu_reset, 0);
6049db43 3117 init_rwsem(&adev->reset_sem);
32eaeae0 3118 mutex_init(&adev->psp.mutex);
bd052211 3119 mutex_init(&adev->notifier_lock);
d38ceaf9 3120
912dfc84
EQ
3121 r = amdgpu_device_check_arguments(adev);
3122 if (r)
3123 return r;
d38ceaf9 3124
d38ceaf9
AD
3125 spin_lock_init(&adev->mmio_idx_lock);
3126 spin_lock_init(&adev->smc_idx_lock);
3127 spin_lock_init(&adev->pcie_idx_lock);
3128 spin_lock_init(&adev->uvd_ctx_idx_lock);
3129 spin_lock_init(&adev->didt_idx_lock);
ccdbb20a 3130 spin_lock_init(&adev->gc_cac_idx_lock);
16abb5d2 3131 spin_lock_init(&adev->se_cac_idx_lock);
d38ceaf9 3132 spin_lock_init(&adev->audio_endpt_idx_lock);
95844d20 3133 spin_lock_init(&adev->mm_stats.lock);
d38ceaf9 3134
0c4e7fa5
CZ
3135 INIT_LIST_HEAD(&adev->shadow_list);
3136 mutex_init(&adev->shadow_list_lock);
3137
beff74bc
AD
3138 INIT_DELAYED_WORK(&adev->delayed_init_work,
3139 amdgpu_device_delayed_init_work_handler);
1e317b99
RZ
3140 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3141 amdgpu_device_delay_enable_gfx_off);
2dc80b00 3142
d4535e2c
AG
3143 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3144
d23ee13f 3145 adev->gfx.gfx_off_req_count = 1;
b6e79d9a 3146 adev->pm.ac_power = power_supply_is_system_supplied() > 0;
b1ddf548 3147
b265bdbd
EQ
3148 atomic_set(&adev->throttling_logging_enabled, 1);
3149 /*
3150 * If throttling continues, logging will be performed every minute
3151 * to avoid log flooding. "-1" is subtracted since the thermal
3152 * throttling interrupt comes every second. Thus, the total logging
3153 * interval is 59 seconds(retelimited printk interval) + 1(waiting
3154 * for throttling interrupt) = 60 seconds.
3155 */
3156 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3157 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3158
0fa49558
AX
3159 /* Registers mapping */
3160 /* TODO: block userspace mapping of io register */
da69c161
KW
3161 if (adev->asic_type >= CHIP_BONAIRE) {
3162 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3163 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3164 } else {
3165 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3166 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3167 }
d38ceaf9 3168
d38ceaf9
AD
3169 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3170 if (adev->rmmio == NULL) {
3171 return -ENOMEM;
3172 }
3173 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3174 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3175
d38ceaf9
AD
3176 /* io port mapping */
3177 for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
3178 if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) {
3179 adev->rio_mem_size = pci_resource_len(adev->pdev, i);
3180 adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size);
3181 break;
3182 }
3183 }
3184 if (adev->rio_mem == NULL)
b64a18c5 3185 DRM_INFO("PCI I/O BAR is not found.\n");
d38ceaf9 3186
b2109d8e
JX
3187 /* enable PCIE atomic ops */
3188 r = pci_enable_atomic_ops_to_root(adev->pdev,
3189 PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3190 PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3191 if (r) {
3192 adev->have_atomics_support = false;
3193 DRM_INFO("PCIE atomic ops is not supported\n");
3194 } else {
3195 adev->have_atomics_support = true;
3196 }
3197
5494d864
AD
3198 amdgpu_device_get_pcie_info(adev);
3199
b239c017
JX
3200 if (amdgpu_mcbp)
3201 DRM_INFO("MCBP is enabled\n");
3202
5f84cc63
JX
3203 if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
3204 adev->enable_mes = true;
3205
3aa0115d
ML
3206 /* detect hw virtualization here */
3207 amdgpu_detect_virtualization(adev);
3208
dffa11b4
ML
3209 r = amdgpu_device_get_job_timeout_settings(adev);
3210 if (r) {
3211 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3212 return r;
a190d1c7
XY
3213 }
3214
d38ceaf9 3215 /* early init functions */
06ec9070 3216 r = amdgpu_device_ip_early_init(adev);
d38ceaf9
AD
3217 if (r)
3218 return r;
3219
6585661d
OZ
3220 /* doorbell bar mapping and doorbell index init*/
3221 amdgpu_device_doorbell_init(adev);
3222
d38ceaf9
AD
3223 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
3224 /* this will fail for cards that aren't VGA class devices, just
3225 * ignore it */
06ec9070 3226 vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode);
d38ceaf9 3227
31af062a 3228 if (amdgpu_device_supports_boco(ddev))
3840c5bc
AD
3229 boco = true;
3230 if (amdgpu_has_atpx() &&
3231 (amdgpu_is_atpx_hybrid() ||
3232 amdgpu_has_atpx_dgpu_power_cntl()) &&
3233 !pci_is_thunderbolt_attached(adev->pdev))
84c8b22e 3234 vga_switcheroo_register_client(adev->pdev,
3840c5bc
AD
3235 &amdgpu_switcheroo_ops, boco);
3236 if (boco)
d38ceaf9
AD
3237 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3238
9475a943
SL
3239 if (amdgpu_emu_mode == 1) {
3240 /* post the asic on emulation mode */
3241 emu_soc_asic_init(adev);
bfca0289 3242 goto fence_driver_init;
9475a943 3243 }
bfca0289 3244
4e99a44e
ML
3245 /* detect if we are with an SRIOV vbios */
3246 amdgpu_device_detect_sriov_bios(adev);
048765ad 3247
95e8e59e
AD
3248 /* check if we need to reset the asic
3249 * E.g., driver was not cleanly unloaded previously, etc.
3250 */
f14899fd 3251 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
95e8e59e
AD
3252 r = amdgpu_asic_reset(adev);
3253 if (r) {
3254 dev_err(adev->dev, "asic reset on init failed\n");
3255 goto failed;
3256 }
3257 }
3258
c9a6b82f
AG
3259 pci_enable_pcie_error_reporting(adev->ddev.pdev);
3260
d38ceaf9 3261 /* Post card if necessary */
39c640c0 3262 if (amdgpu_device_need_post(adev)) {
d38ceaf9 3263 if (!adev->bios) {
bec86378 3264 dev_err(adev->dev, "no vBIOS found\n");
83ba126a
AD
3265 r = -EINVAL;
3266 goto failed;
d38ceaf9 3267 }
bec86378 3268 DRM_INFO("GPU posting now...\n");
4d2997ab 3269 r = amdgpu_device_asic_init(adev);
4e99a44e
ML
3270 if (r) {
3271 dev_err(adev->dev, "gpu post error!\n");
3272 goto failed;
3273 }
d38ceaf9
AD
3274 }
3275
88b64e95
AD
3276 if (adev->is_atom_fw) {
3277 /* Initialize clocks */
3278 r = amdgpu_atomfirmware_get_clock_info(adev);
3279 if (r) {
3280 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
e23b74aa 3281 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
88b64e95
AD
3282 goto failed;
3283 }
3284 } else {
a5bde2f9
AD
3285 /* Initialize clocks */
3286 r = amdgpu_atombios_get_clock_info(adev);
3287 if (r) {
3288 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
e23b74aa 3289 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
89041940 3290 goto failed;
a5bde2f9
AD
3291 }
3292 /* init i2c buses */
4562236b
HW
3293 if (!amdgpu_device_has_dc_support(adev))
3294 amdgpu_atombios_i2c_init(adev);
2c1a2784 3295 }
d38ceaf9 3296
bfca0289 3297fence_driver_init:
d38ceaf9
AD
3298 /* Fence driver */
3299 r = amdgpu_fence_driver_init(adev);
2c1a2784
AD
3300 if (r) {
3301 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
e23b74aa 3302 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
83ba126a 3303 goto failed;
2c1a2784 3304 }
d38ceaf9
AD
3305
3306 /* init the mode config */
4a580877 3307 drm_mode_config_init(adev_to_drm(adev));
d38ceaf9 3308
06ec9070 3309 r = amdgpu_device_ip_init(adev);
d38ceaf9 3310 if (r) {
8840a387 3311 /* failed in exclusive mode due to timeout */
3312 if (amdgpu_sriov_vf(adev) &&
3313 !amdgpu_sriov_runtime(adev) &&
3314 amdgpu_virt_mmio_blocked(adev) &&
3315 !amdgpu_virt_wait_reset(adev)) {
3316 dev_err(adev->dev, "VF exclusive mode timeout\n");
1daee8b4
PD
3317 /* Don't send request since VF is inactive. */
3318 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3319 adev->virt.ops = NULL;
8840a387 3320 r = -EAGAIN;
3321 goto failed;
3322 }
06ec9070 3323 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
e23b74aa 3324 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
83ba126a 3325 goto failed;
d38ceaf9
AD
3326 }
3327
d69b8971
YZ
3328 dev_info(adev->dev,
3329 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
d7f72fe4
YZ
3330 adev->gfx.config.max_shader_engines,
3331 adev->gfx.config.max_sh_per_se,
3332 adev->gfx.config.max_cu_per_sh,
3333 adev->gfx.cu_info.number);
3334
d38ceaf9
AD
3335 adev->accel_working = true;
3336
e59c0205
AX
3337 amdgpu_vm_check_compute_bug(adev);
3338
95844d20
MO
3339 /* Initialize the buffer migration limit. */
3340 if (amdgpu_moverate >= 0)
3341 max_MBps = amdgpu_moverate;
3342 else
3343 max_MBps = 8; /* Allow 8 MB/s. */
3344 /* Get a log2 for easy divisions. */
3345 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
3346
9bc92b9c
ML
3347 amdgpu_fbdev_init(adev);
3348
d2f52ac8 3349 r = amdgpu_pm_sysfs_init(adev);
7c868b59
YT
3350 if (r) {
3351 adev->pm_sysfs_en = false;
d2f52ac8 3352 DRM_ERROR("registering pm debugfs failed (%d).\n", r);
7c868b59
YT
3353 } else
3354 adev->pm_sysfs_en = true;
d2f52ac8 3355
5bb23532 3356 r = amdgpu_ucode_sysfs_init(adev);
7c868b59
YT
3357 if (r) {
3358 adev->ucode_sysfs_en = false;
5bb23532 3359 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
7c868b59
YT
3360 } else
3361 adev->ucode_sysfs_en = true;
5bb23532 3362
d38ceaf9
AD
3363 if ((amdgpu_testing & 1)) {
3364 if (adev->accel_working)
3365 amdgpu_test_moves(adev);
3366 else
3367 DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");
3368 }
d38ceaf9
AD
3369 if (amdgpu_benchmarking) {
3370 if (adev->accel_working)
3371 amdgpu_benchmark(adev, amdgpu_benchmarking);
3372 else
3373 DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
3374 }
3375
b0adca4d
EQ
3376 /*
3377 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3378	 * Otherwise the mgpu fan boost feature will be skipped because the
3379	 * gpu instance count would be too low.
3380 */
3381 amdgpu_register_gpu_instance(adev);
3382
d38ceaf9
AD
3383 /* enable clockgating, etc. after ib tests, etc. since some blocks require
3384 * explicit gating rather than handling it automatically.
3385 */
06ec9070 3386 r = amdgpu_device_ip_late_init(adev);
2c1a2784 3387 if (r) {
06ec9070 3388 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
e23b74aa 3389 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
83ba126a 3390 goto failed;
2c1a2784 3391 }
d38ceaf9 3392
108c6a63 3393 /* must succeed. */
511fdbc3 3394 amdgpu_ras_resume(adev);
108c6a63 3395
beff74bc
AD
3396 queue_delayed_work(system_wq, &adev->delayed_init_work,
3397 msecs_to_jiffies(AMDGPU_RESUME_MS));
3398
2c738637
ML
3399 if (amdgpu_sriov_vf(adev))
3400 flush_delayed_work(&adev->delayed_init_work);
3401
77f3a5cd 3402 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
bd607166 3403 if (r) {
77f3a5cd 3404 dev_err(adev->dev, "Could not create amdgpu device attr\n");
bd607166
KR
3405 return r;
3406 }
3407
d155bef0
AB
3408 if (IS_ENABLED(CONFIG_PERF_EVENTS))
3409 r = amdgpu_pmu_init(adev);
9c7c85f7
JK
3410 if (r)
3411 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3412
c1dd4aa6
AG
3413	/* Keep a stored copy of the PCI config space at hand for restore on a sudden PCI error */
3414 if (amdgpu_device_cache_pci_state(adev->pdev))
3415 pci_restore_state(pdev);
3416
d38ceaf9 3417 return 0;
83ba126a
AD
3418
3419failed:
89041940 3420 amdgpu_vf_error_trans_all(adev);
3840c5bc 3421 if (boco)
83ba126a 3422 vga_switcheroo_fini_domain_pm_ops(adev->dev);
8840a387 3423
83ba126a 3424 return r;
d38ceaf9
AD
3425}
3426
d38ceaf9
AD
3427/**
3428 * amdgpu_device_fini - tear down the driver
3429 *
3430 * @adev: amdgpu_device pointer
3431 *
3432 * Tear down the driver info (all asics).
3433 * Called at driver shutdown.
3434 */
3435void amdgpu_device_fini(struct amdgpu_device *adev)
3436{
aac89168 3437 dev_info(adev->dev, "amdgpu: finishing device.\n");
9f875167 3438 flush_delayed_work(&adev->delayed_init_work);
d0d13fe8 3439 adev->shutdown = true;
9f875167 3440
c1dd4aa6
AG
3441 kfree(adev->pci_state);
3442
752c683d
ML
3443	/* make sure the IB test has finished before entering exclusive mode
3444	 * to avoid preemption during the IB test
3445	 */
3446 if (amdgpu_sriov_vf(adev))
3447 amdgpu_virt_request_full_gpu(adev, false);
3448
e5b03032
ML
3449 /* disable all interrupts */
3450 amdgpu_irq_disable_all(adev);
ff97cba8
ML
3451 if (adev->mode_info.mode_config_initialized){
3452 if (!amdgpu_device_has_dc_support(adev))
4a580877 3453 drm_helper_force_disable_all(adev_to_drm(adev));
ff97cba8 3454 else
4a580877 3455 drm_atomic_helper_shutdown(adev_to_drm(adev));
ff97cba8 3456 }
d38ceaf9 3457 amdgpu_fence_driver_fini(adev);
7c868b59
YT
3458 if (adev->pm_sysfs_en)
3459 amdgpu_pm_sysfs_fini(adev);
d38ceaf9 3460 amdgpu_fbdev_fini(adev);
e230ac11 3461 amdgpu_device_ip_fini(adev);
75e1658e
ND
3462 release_firmware(adev->firmware.gpu_info_fw);
3463 adev->firmware.gpu_info_fw = NULL;
d38ceaf9
AD
3464 adev->accel_working = false;
3465 /* free i2c buses */
4562236b
HW
3466 if (!amdgpu_device_has_dc_support(adev))
3467 amdgpu_i2c_fini(adev);
bfca0289
SL
3468
3469 if (amdgpu_emu_mode != 1)
3470 amdgpu_atombios_fini(adev);
3471
d38ceaf9
AD
3472 kfree(adev->bios);
3473 adev->bios = NULL;
3840c5bc
AD
3474 if (amdgpu_has_atpx() &&
3475 (amdgpu_is_atpx_hybrid() ||
3476 amdgpu_has_atpx_dgpu_power_cntl()) &&
3477 !pci_is_thunderbolt_attached(adev->pdev))
84c8b22e 3478 vga_switcheroo_unregister_client(adev->pdev);
4a580877 3479 if (amdgpu_device_supports_boco(adev_to_drm(adev)))
83ba126a 3480 vga_switcheroo_fini_domain_pm_ops(adev->dev);
d38ceaf9
AD
3481 vga_client_register(adev->pdev, NULL, NULL, NULL);
3482 if (adev->rio_mem)
3483 pci_iounmap(adev->pdev, adev->rio_mem);
3484 adev->rio_mem = NULL;
3485 iounmap(adev->rmmio);
3486 adev->rmmio = NULL;
06ec9070 3487 amdgpu_device_doorbell_fini(adev);
e9bc1bf7 3488
7c868b59
YT
3489 if (adev->ucode_sysfs_en)
3490 amdgpu_ucode_sysfs_fini(adev);
77f3a5cd
ND
3491
3492 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
d155bef0
AB
3493 if (IS_ENABLED(CONFIG_PERF_EVENTS))
3494 amdgpu_pmu_fini(adev);
72de33f8 3495 if (adev->mman.discovery_bin)
a190d1c7 3496 amdgpu_discovery_fini(adev);
d38ceaf9
AD
3497}
3498
3499
3500/*
3501 * Suspend & resume.
3502 */
3503/**
810ddc3a 3504 * amdgpu_device_suspend - initiate device suspend
d38ceaf9 3505 *
87e3f136 3506 * @dev: drm dev pointer
87e3f136 3507 * @fbcon: notify the fbdev of suspend
d38ceaf9
AD
3508 *
3509 * Puts the hw in the suspend state (all asics).
3510 * Returns 0 for success or an error on failure.
3511 * Called at driver suspend.
3512 */
de185019 3513int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
d38ceaf9
AD
3514{
3515 struct amdgpu_device *adev;
3516 struct drm_crtc *crtc;
3517 struct drm_connector *connector;
f8d2d39e 3518 struct drm_connector_list_iter iter;
5ceb54c6 3519 int r;
d38ceaf9 3520
1348969a 3521 adev = drm_to_adev(dev);
d38ceaf9
AD
3522
3523 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3524 return 0;
3525
44779b43 3526 adev->in_suspend = true;
d38ceaf9
AD
3527 drm_kms_helper_poll_disable(dev);
3528
5f818173
S
3529 if (fbcon)
3530 amdgpu_fbdev_set_suspend(adev, 1);
3531
beff74bc 3532 cancel_delayed_work_sync(&adev->delayed_init_work);
a5459475 3533
4562236b
HW
3534 if (!amdgpu_device_has_dc_support(adev)) {
3535 /* turn off display hw */
3536 drm_modeset_lock_all(dev);
f8d2d39e
LP
3537 drm_connector_list_iter_begin(dev, &iter);
3538 drm_for_each_connector_iter(connector, &iter)
3539 drm_helper_connector_dpms(connector,
3540 DRM_MODE_DPMS_OFF);
3541 drm_connector_list_iter_end(&iter);
4562236b 3542 drm_modeset_unlock_all(dev);
fe1053b7
AD
3543 /* unpin the front buffers and cursors */
3544 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3545 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3546 struct drm_framebuffer *fb = crtc->primary->fb;
3547 struct amdgpu_bo *robj;
3548
91334223 3549 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
fe1053b7
AD
3550 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3551 r = amdgpu_bo_reserve(aobj, true);
3552 if (r == 0) {
3553 amdgpu_bo_unpin(aobj);
3554 amdgpu_bo_unreserve(aobj);
3555 }
756e6880 3556 }
756e6880 3557
fe1053b7
AD
3558 if (fb == NULL || fb->obj[0] == NULL) {
3559 continue;
3560 }
3561 robj = gem_to_amdgpu_bo(fb->obj[0]);
3562 /* don't unpin kernel fb objects */
3563 if (!amdgpu_fbdev_robj_is_fb(adev, robj)) {
3564 r = amdgpu_bo_reserve(robj, true);
3565 if (r == 0) {
3566 amdgpu_bo_unpin(robj);
3567 amdgpu_bo_unreserve(robj);
3568 }
d38ceaf9
AD
3569 }
3570 }
3571 }
fe1053b7 3572
5e6932fe 3573 amdgpu_ras_suspend(adev);
3574
fe1053b7
AD
3575 r = amdgpu_device_ip_suspend_phase1(adev);
3576
94fa5660
EQ
3577 amdgpu_amdkfd_suspend(adev, !fbcon);
3578
d38ceaf9
AD
3579 /* evict vram memory */
3580 amdgpu_bo_evict_vram(adev);
3581
5ceb54c6 3582 amdgpu_fence_driver_suspend(adev);
d38ceaf9 3583
fe1053b7 3584 r = amdgpu_device_ip_suspend_phase2(adev);
d38ceaf9 3585
a0a71e49
AD
3586 /* evict remaining vram memory
3587 * This second call to evict vram is to evict the gart page table
3588 * using the CPU.
3589 */
d38ceaf9
AD
3590 amdgpu_bo_evict_vram(adev);
3591
d38ceaf9
AD
3592 return 0;
3593}
3594
3595/**
810ddc3a 3596 * amdgpu_device_resume - initiate device resume
d38ceaf9 3597 *
87e3f136 3598 * @dev: drm dev pointer
87e3f136 3599 * @fbcon: notify the fbdev of resume
d38ceaf9
AD
3600 *
3601 * Bring the hw back to operating state (all asics).
3602 * Returns 0 for success or an error on failure.
3603 * Called at driver resume.
3604 */
de185019 3605int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
d38ceaf9
AD
3606{
3607 struct drm_connector *connector;
f8d2d39e 3608 struct drm_connector_list_iter iter;
1348969a 3609 struct amdgpu_device *adev = drm_to_adev(dev);
756e6880 3610 struct drm_crtc *crtc;
03161a6e 3611 int r = 0;
d38ceaf9
AD
3612
3613 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3614 return 0;
3615
d38ceaf9 3616 /* post card */
39c640c0 3617 if (amdgpu_device_need_post(adev)) {
4d2997ab 3618 r = amdgpu_device_asic_init(adev);
74b0b157 3619 if (r)
aac89168 3620 dev_err(adev->dev, "amdgpu asic init failed\n");
74b0b157 3621 }
d38ceaf9 3622
06ec9070 3623 r = amdgpu_device_ip_resume(adev);
e6707218 3624 if (r) {
aac89168 3625 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
4d3b9ae5 3626 return r;
e6707218 3627 }
5ceb54c6
AD
3628 amdgpu_fence_driver_resume(adev);
3629
d38ceaf9 3630
06ec9070 3631 r = amdgpu_device_ip_late_init(adev);
03161a6e 3632 if (r)
4d3b9ae5 3633 return r;
d38ceaf9 3634
beff74bc
AD
3635 queue_delayed_work(system_wq, &adev->delayed_init_work,
3636 msecs_to_jiffies(AMDGPU_RESUME_MS));
3637
fe1053b7
AD
3638 if (!amdgpu_device_has_dc_support(adev)) {
3639 /* pin cursors */
3640 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3641 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3642
91334223 3643 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
fe1053b7
AD
3644 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3645 r = amdgpu_bo_reserve(aobj, true);
3646 if (r == 0) {
3647 r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM);
3648 if (r != 0)
aac89168 3649 dev_err(adev->dev, "Failed to pin cursor BO (%d)\n", r);
fe1053b7
AD
3650 amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj);
3651 amdgpu_bo_unreserve(aobj);
3652 }
756e6880
AD
3653 }
3654 }
3655 }
9593f4d6 3656 r = amdgpu_amdkfd_resume(adev, !fbcon);
ba997709
YZ
3657 if (r)
3658 return r;
756e6880 3659
96a5d8d4 3660 /* Make sure IB tests flushed */
beff74bc 3661 flush_delayed_work(&adev->delayed_init_work);
96a5d8d4 3662
d38ceaf9
AD
3663 /* blat the mode back in */
3664 if (fbcon) {
4562236b
HW
3665 if (!amdgpu_device_has_dc_support(adev)) {
3666 /* pre DCE11 */
3667 drm_helper_resume_force_mode(dev);
3668
3669 /* turn on display hw */
3670 drm_modeset_lock_all(dev);
f8d2d39e
LP
3671
3672 drm_connector_list_iter_begin(dev, &iter);
3673 drm_for_each_connector_iter(connector, &iter)
3674 drm_helper_connector_dpms(connector,
3675 DRM_MODE_DPMS_ON);
3676 drm_connector_list_iter_end(&iter);
3677
4562236b 3678 drm_modeset_unlock_all(dev);
d38ceaf9 3679 }
4d3b9ae5 3680 amdgpu_fbdev_set_suspend(adev, 0);
d38ceaf9
AD
3681 }
3682
3683 drm_kms_helper_poll_enable(dev);
23a1a9e5 3684
5e6932fe 3685 amdgpu_ras_resume(adev);
3686
23a1a9e5
L
3687 /*
3688 * Most of the connector probing functions try to acquire runtime pm
3689 * refs to ensure that the GPU is powered on when connector polling is
3690 * performed. Since we're calling this from a runtime PM callback,
3691 * trying to acquire rpm refs will cause us to deadlock.
3692 *
3693 * Since we're guaranteed to be holding the rpm lock, it's safe to
3694 * temporarily disable the rpm helpers so this doesn't deadlock us.
3695 */
3696#ifdef CONFIG_PM
3697 dev->dev->power.disable_depth++;
3698#endif
4562236b
HW
3699 if (!amdgpu_device_has_dc_support(adev))
3700 drm_helper_hpd_irq_event(dev);
3701 else
3702 drm_kms_helper_hotplug_event(dev);
23a1a9e5
L
3703#ifdef CONFIG_PM
3704 dev->dev->power.disable_depth--;
3705#endif
44779b43
RZ
3706 adev->in_suspend = false;
3707
4d3b9ae5 3708 return 0;
d38ceaf9
AD
3709}
3710
e3ecdffa
AD
3711/**
3712 * amdgpu_device_ip_check_soft_reset - did soft reset succeed
3713 *
3714 * @adev: amdgpu_device pointer
3715 *
3716 * The list of all the hardware IPs that make up the asic is walked and
3717 * the check_soft_reset callbacks are run. check_soft_reset determines
3718 * if the asic is still hung or not.
3719 * Returns true if any of the IPs are still in a hung state, false if not.
3720 */
06ec9070 3721static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
63fbf42f
CZ
3722{
3723 int i;
3724 bool asic_hang = false;
3725
f993d628
ML
3726 if (amdgpu_sriov_vf(adev))
3727 return true;
3728
8bc04c29
AD
3729 if (amdgpu_asic_need_full_reset(adev))
3730 return true;
3731
63fbf42f 3732 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 3733 if (!adev->ip_blocks[i].status.valid)
63fbf42f 3734 continue;
a1255107
AD
3735 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
3736 adev->ip_blocks[i].status.hang =
3737 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
3738 if (adev->ip_blocks[i].status.hang) {
aac89168 3739 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
63fbf42f
CZ
3740 asic_hang = true;
3741 }
3742 }
3743 return asic_hang;
3744}
3745
e3ecdffa
AD
3746/**
3747 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
3748 *
3749 * @adev: amdgpu_device pointer
3750 *
3751 * The list of all the hardware IPs that make up the asic is walked and the
3752 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset
3753 * handles any IP specific hardware or software state changes that are
3754 * necessary for a soft reset to succeed.
3755 * Returns 0 on success, negative error code on failure.
3756 */
06ec9070 3757static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
d31a501e
CZ
3758{
3759 int i, r = 0;
3760
3761 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 3762 if (!adev->ip_blocks[i].status.valid)
d31a501e 3763 continue;
a1255107
AD
3764 if (adev->ip_blocks[i].status.hang &&
3765 adev->ip_blocks[i].version->funcs->pre_soft_reset) {
3766 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
d31a501e
CZ
3767 if (r)
3768 return r;
3769 }
3770 }
3771
3772 return 0;
3773}
3774
e3ecdffa
AD
3775/**
3776 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
3777 *
3778 * @adev: amdgpu_device pointer
3779 *
3780 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu
3781 * reset is necessary to recover.
3782 * Returns true if a full asic reset is required, false if not.
3783 */
06ec9070 3784static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
35d782fe 3785{
da146d3b
AD
3786 int i;
3787
8bc04c29
AD
3788 if (amdgpu_asic_need_full_reset(adev))
3789 return true;
3790
da146d3b 3791 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 3792 if (!adev->ip_blocks[i].status.valid)
da146d3b 3793 continue;
a1255107
AD
3794 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
3795 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
3796 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
98512bb8
KW
3797 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
3798 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
a1255107 3799 if (adev->ip_blocks[i].status.hang) {
aac89168 3800 dev_info(adev->dev, "Some block need full reset!\n");
da146d3b
AD
3801 return true;
3802 }
3803 }
35d782fe
CZ
3804 }
3805 return false;
3806}
3807
e3ecdffa
AD
3808/**
3809 * amdgpu_device_ip_soft_reset - do a soft reset
3810 *
3811 * @adev: amdgpu_device pointer
3812 *
3813 * The list of all the hardware IPs that make up the asic is walked and the
3814 * soft_reset callbacks are run if the block is hung. soft_reset handles any
3815 * IP specific hardware or software state changes that are necessary to soft
3816 * reset the IP.
3817 * Returns 0 on success, negative error code on failure.
3818 */
06ec9070 3819static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
35d782fe
CZ
3820{
3821 int i, r = 0;
3822
3823 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 3824 if (!adev->ip_blocks[i].status.valid)
35d782fe 3825 continue;
a1255107
AD
3826 if (adev->ip_blocks[i].status.hang &&
3827 adev->ip_blocks[i].version->funcs->soft_reset) {
3828 r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
35d782fe
CZ
3829 if (r)
3830 return r;
3831 }
3832 }
3833
3834 return 0;
3835}
3836
e3ecdffa
AD
3837/**
3838 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
3839 *
3840 * @adev: amdgpu_device pointer
3841 *
3842 * The list of all the hardware IPs that make up the asic is walked and the
3843 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
3844 * handles any IP specific hardware or software state changes that are
3845 * necessary after the IP has been soft reset.
3846 * Returns 0 on success, negative error code on failure.
3847 */
06ec9070 3848static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
35d782fe
CZ
3849{
3850 int i, r = 0;
3851
3852 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 3853 if (!adev->ip_blocks[i].status.valid)
35d782fe 3854 continue;
a1255107
AD
3855 if (adev->ip_blocks[i].status.hang &&
3856 adev->ip_blocks[i].version->funcs->post_soft_reset)
3857 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
35d782fe
CZ
3858 if (r)
3859 return r;
3860 }
3861
3862 return 0;
3863}
3864
e3ecdffa 3865/**
c33adbc7 3866 * amdgpu_device_recover_vram - Recover some VRAM contents
e3ecdffa
AD
3867 *
3868 * @adev: amdgpu_device pointer
3869 *
3870 * Restores the contents of VRAM buffers from the shadows in GTT. Used to
3871 * restore things like GPUVM page tables after a GPU reset where
3872 * the contents of VRAM might be lost.
403009bf
CK
3873 *
3874 * Returns:
3875 * 0 on success, negative error code on failure.
e3ecdffa 3876 */
c33adbc7 3877static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
c41d1cf6 3878{
c41d1cf6 3879 struct dma_fence *fence = NULL, *next = NULL;
403009bf
CK
3880 struct amdgpu_bo *shadow;
3881 long r = 1, tmo;
c41d1cf6
ML
3882
3883 if (amdgpu_sriov_runtime(adev))
b045d3af 3884 tmo = msecs_to_jiffies(8000);
c41d1cf6
ML
3885 else
3886 tmo = msecs_to_jiffies(100);
3887
aac89168 3888 dev_info(adev->dev, "recover vram bo from shadow start\n");
c41d1cf6 3889 mutex_lock(&adev->shadow_list_lock);
403009bf
CK
3890 list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {
3891
3892 /* No need to recover an evicted BO */
3893 if (shadow->tbo.mem.mem_type != TTM_PL_TT ||
b575f10d 3894 shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET ||
403009bf
CK
3895 shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM)
3896 continue;
3897
3898 r = amdgpu_bo_restore_shadow(shadow, &next);
3899 if (r)
3900 break;
3901
c41d1cf6 3902 if (fence) {
1712fb1a 3903 tmo = dma_fence_wait_timeout(fence, false, tmo);
403009bf
CK
3904 dma_fence_put(fence);
3905 fence = next;
1712fb1a 3906 if (tmo == 0) {
3907 r = -ETIMEDOUT;
c41d1cf6 3908 break;
1712fb1a 3909 } else if (tmo < 0) {
3910 r = tmo;
3911 break;
3912 }
403009bf
CK
3913 } else {
3914 fence = next;
c41d1cf6 3915 }
c41d1cf6
ML
3916 }
3917 mutex_unlock(&adev->shadow_list_lock);
3918
403009bf
CK
3919 if (fence)
3920 tmo = dma_fence_wait_timeout(fence, false, tmo);
c41d1cf6
ML
3921 dma_fence_put(fence);
3922
1712fb1a 3923 if (r < 0 || tmo <= 0) {
aac89168 3924 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
403009bf
CK
3925 return -EIO;
3926 }
c41d1cf6 3927
aac89168 3928 dev_info(adev->dev, "recover vram bo from shadow done\n");
403009bf 3929 return 0;
c41d1cf6
ML
3930}
3931
a90ad3c2 3932
e3ecdffa 3933/**
06ec9070 3934 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
5740682e
ML
3935 *
3936 * @adev: amdgpu device pointer
87e3f136 3937 * @from_hypervisor: request from hypervisor
5740682e
ML
3938 *
3939 * Do a VF FLR and reinitialize the ASIC.
3f48c681 3940 * Returns 0 on success, negative error code on failure.
e3ecdffa
AD
3941 */
3942static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
3943 bool from_hypervisor)
5740682e
ML
3944{
3945 int r;
3946
3947 if (from_hypervisor)
3948 r = amdgpu_virt_request_full_gpu(adev, true);
3949 else
3950 r = amdgpu_virt_reset_gpu(adev);
3951 if (r)
3952 return r;
a90ad3c2 3953
b639c22c
JZ
3954 amdgpu_amdkfd_pre_reset(adev);
3955
a90ad3c2 3956 /* Resume IP prior to SMC */
06ec9070 3957 r = amdgpu_device_ip_reinit_early_sriov(adev);
5740682e
ML
3958 if (r)
3959 goto error;
a90ad3c2 3960
c9ffa427 3961 amdgpu_virt_init_data_exchange(adev);
a90ad3c2 3962	/* we need to recover the gart prior to running SMC/CP/SDMA resume */
6c28aed6 3963 amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
a90ad3c2 3964
7a3e0bb2
RZ
3965 r = amdgpu_device_fw_loading(adev);
3966 if (r)
3967 return r;
3968
a90ad3c2 3969 /* now we are okay to resume SMC/CP/SDMA */
06ec9070 3970 r = amdgpu_device_ip_reinit_late_sriov(adev);
5740682e
ML
3971 if (r)
3972 goto error;
a90ad3c2
ML
3973
3974 amdgpu_irq_gpu_reset_resume_helper(adev);
5740682e 3975 r = amdgpu_ib_ring_tests(adev);
f81e8d53 3976 amdgpu_amdkfd_post_reset(adev);
a90ad3c2 3977
abc34253
ED
3978error:
3979 amdgpu_virt_release_full_gpu(adev, true);
c41d1cf6 3980 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
e3526257 3981 amdgpu_inc_vram_lost(adev);
c33adbc7 3982 r = amdgpu_device_recover_vram(adev);
a90ad3c2
ML
3983 }
3984
3985 return r;
3986}
3987
9a1cddd6 3988/**
3989 * amdgpu_device_has_job_running - check if there is any job in mirror list
3990 *
3991 * @adev: amdgpu device pointer
3992 *
3993 * Check if any job is still pending on any ring's scheduler mirror list.
3994 */
3995bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
3996{
3997 int i;
3998 struct drm_sched_job *job;
3999
4000 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4001 struct amdgpu_ring *ring = adev->rings[i];
4002
4003 if (!ring || !ring->sched.thread)
4004 continue;
4005
4006 spin_lock(&ring->sched.job_list_lock);
4007 job = list_first_entry_or_null(&ring->sched.ring_mirror_list,
4008 struct drm_sched_job, node);
4009 spin_unlock(&ring->sched.job_list_lock);
4010 if (job)
4011 return true;
4012 }
4013 return false;
4014}
4015
12938fad
CK
4016/**
4017 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4018 *
4019 * @adev: amdgpu device pointer
4020 *
4021 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4022 * a hung GPU.
4023 */
4024bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4025{
4026 if (!amdgpu_device_ip_check_soft_reset(adev)) {
aac89168 4027 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
12938fad
CK
4028 return false;
4029 }
4030
3ba7b418
AG
4031 if (amdgpu_gpu_recovery == 0)
4032 goto disabled;
4033
4034 if (amdgpu_sriov_vf(adev))
4035 return true;
4036
4037 if (amdgpu_gpu_recovery == -1) {
4038 switch (adev->asic_type) {
fc42d47c
AG
4039 case CHIP_BONAIRE:
4040 case CHIP_HAWAII:
3ba7b418
AG
4041 case CHIP_TOPAZ:
4042 case CHIP_TONGA:
4043 case CHIP_FIJI:
4044 case CHIP_POLARIS10:
4045 case CHIP_POLARIS11:
4046 case CHIP_POLARIS12:
4047 case CHIP_VEGAM:
4048 case CHIP_VEGA20:
4049 case CHIP_VEGA10:
4050 case CHIP_VEGA12:
c43b849f 4051 case CHIP_RAVEN:
e9d4cf91 4052 case CHIP_ARCTURUS:
2cb44fb0 4053 case CHIP_RENOIR:
658c6639
AD
4054 case CHIP_NAVI10:
4055 case CHIP_NAVI14:
4056 case CHIP_NAVI12:
131a3c74 4057 case CHIP_SIENNA_CICHLID:
3ba7b418
AG
4058 break;
4059 default:
4060 goto disabled;
4061 }
12938fad
CK
4062 }
4063
4064 return true;
3ba7b418
AG
4065
4066disabled:
aac89168 4067 dev_info(adev->dev, "GPU recovery disabled.\n");
3ba7b418 4068 return false;
12938fad
CK
4069}
4070
5c6dd71e 4071
26bc5340
AG
4072static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4073 struct amdgpu_job *job,
4074 bool *need_full_reset_arg)
4075{
4076 int i, r = 0;
4077 bool need_full_reset = *need_full_reset_arg;
71182665 4078
728e7e0c
JZ
4079 amdgpu_debugfs_wait_dump(adev);
4080
71182665 4081 /* block all schedulers and reset given job's ring */
0875dc9e
CZ
4082 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4083 struct amdgpu_ring *ring = adev->rings[i];
4084
51687759 4085 if (!ring || !ring->sched.thread)
0875dc9e 4086 continue;
5740682e 4087
2f9d4084
ML
4088 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4089 amdgpu_fence_driver_force_completion(ring);
0875dc9e 4090 }
d38ceaf9 4091
222b5f04
AG
4092	if (job)
4093 drm_sched_increase_karma(&job->base);
4094
1d721ed6 4095 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
26bc5340
AG
4096 if (!amdgpu_sriov_vf(adev)) {
4097
4098 if (!need_full_reset)
4099 need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4100
4101 if (!need_full_reset) {
4102 amdgpu_device_ip_pre_soft_reset(adev);
4103 r = amdgpu_device_ip_soft_reset(adev);
4104 amdgpu_device_ip_post_soft_reset(adev);
4105 if (r || amdgpu_device_ip_check_soft_reset(adev)) {
aac89168 4106 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
26bc5340
AG
4107 need_full_reset = true;
4108 }
4109 }
4110
4111 if (need_full_reset)
4112 r = amdgpu_device_ip_suspend(adev);
4113
4114 *need_full_reset_arg = need_full_reset;
4115 }
4116
4117 return r;
4118}
4119
041a62bc 4120static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
26bc5340 4121 struct list_head *device_list_handle,
7ac71382
AG
4122 bool *need_full_reset_arg,
4123 bool skip_hw_reset)
26bc5340
AG
4124{
4125 struct amdgpu_device *tmp_adev = NULL;
4126 bool need_full_reset = *need_full_reset_arg, vram_lost = false;
4127 int r = 0;
4128
4129 /*
4130	 * ASIC reset has to be done on all XGMI hive nodes ASAP
4131 * to allow proper links negotiation in FW (within 1 sec)
4132 */
7ac71382 4133 if (!skip_hw_reset && need_full_reset) {
26bc5340 4134 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
041a62bc 4135 /* For XGMI run all resets in parallel to speed up the process */
d4535e2c 4136 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
c96cf282 4137 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
d4535e2c
AG
4138 r = -EALREADY;
4139 } else
4140 r = amdgpu_asic_reset(tmp_adev);
d4535e2c 4141
041a62bc 4142 if (r) {
aac89168 4143 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
4a580877 4144 r, adev_to_drm(tmp_adev)->unique);
041a62bc 4145 break;
ce316fa5
LM
4146 }
4147 }
4148
041a62bc
AG
4149	/* For XGMI wait for all resets to complete before proceeding */
4150 if (!r) {
ce316fa5
LM
4151 list_for_each_entry(tmp_adev, device_list_handle,
4152 gmc.xgmi.head) {
4153 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4154 flush_work(&tmp_adev->xgmi_reset_work);
4155 r = tmp_adev->asic_reset_res;
4156 if (r)
4157 break;
ce316fa5
LM
4158 }
4159 }
4160 }
ce316fa5 4161 }
26bc5340 4162
43c4d576
JC
4163 if (!r && amdgpu_ras_intr_triggered()) {
4164 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4165 if (tmp_adev->mmhub.funcs &&
4166 tmp_adev->mmhub.funcs->reset_ras_error_count)
4167 tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev);
4168 }
4169
00eaa571 4170 amdgpu_ras_intr_cleared();
43c4d576 4171 }
00eaa571 4172
26bc5340
AG
4173 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4174 if (need_full_reset) {
4175 /* post card */
4d2997ab 4176 if (amdgpu_device_asic_init(tmp_adev))
aac89168 4177 dev_warn(tmp_adev->dev, "asic atom init failed!");
26bc5340
AG
4178
4179 if (!r) {
4180 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4181 r = amdgpu_device_ip_resume_phase1(tmp_adev);
4182 if (r)
4183 goto out;
4184
4185 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4186 if (vram_lost) {
77e7f829 4187 DRM_INFO("VRAM is lost due to GPU reset!\n");
e3526257 4188 amdgpu_inc_vram_lost(tmp_adev);
26bc5340
AG
4189 }
4190
6c28aed6 4191 r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
26bc5340
AG
4192 if (r)
4193 goto out;
4194
4195 r = amdgpu_device_fw_loading(tmp_adev);
4196 if (r)
4197 return r;
4198
4199 r = amdgpu_device_ip_resume_phase2(tmp_adev);
4200 if (r)
4201 goto out;
4202
4203 if (vram_lost)
4204 amdgpu_device_fill_reset_magic(tmp_adev);
4205
fdafb359
EQ
4206 /*
4207			 * Add this ASIC back as tracked since the reset
4208			 * already completed successfully.
4209 */
4210 amdgpu_register_gpu_instance(tmp_adev);
4211
7c04ca50 4212 r = amdgpu_device_ip_late_init(tmp_adev);
4213 if (r)
4214 goto out;
4215
565d1941
EQ
4216 amdgpu_fbdev_set_suspend(tmp_adev, 0);
4217
e8fbaf03
GC
4218 /*
4219			 * The GPU enters a bad state once the number of faulty
4220			 * pages recorded by ECC reaches the threshold, and RAS
4221			 * recovery is scheduled next. So add a check here to
4222			 * abort the recovery if the bad page threshold has
4223			 * indeed been exceeded, and remind the user to either
4224			 * retire this GPU or set a bigger bad_page_threshold
4225			 * value to get past this check the next time the
4226			 * driver is probed.
4227 */
4228 if (!amdgpu_ras_check_err_threshold(tmp_adev)) {
4229 /* must succeed. */
4230 amdgpu_ras_resume(tmp_adev);
4231 } else {
4232 r = -EINVAL;
4233 goto out;
4234 }
e79a04d5 4235
26bc5340
AG
4236 /* Update PSP FW topology after reset */
4237 if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4238 r = amdgpu_xgmi_update_topology(hive, tmp_adev);
4239 }
4240 }
4241
26bc5340
AG
4242out:
4243 if (!r) {
4244 amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4245 r = amdgpu_ib_ring_tests(tmp_adev);
4246 if (r) {
4247 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
4248 r = amdgpu_device_ip_suspend(tmp_adev);
4249 need_full_reset = true;
4250 r = -EAGAIN;
4251 goto end;
4252 }
4253 }
4254
4255 if (!r)
4256 r = amdgpu_device_recover_vram(tmp_adev);
4257 else
4258 tmp_adev->asic_reset_res = r;
4259 }
4260
4261end:
4262 *need_full_reset_arg = need_full_reset;
4263 return r;
4264}
4265
08ebb485
DL
4266static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
4267 struct amdgpu_hive_info *hive)
26bc5340 4268{
53b3f8f4
DL
4269 if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
4270 return false;
4271
08ebb485
DL
4272 if (hive) {
4273 down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
4274 } else {
4275 down_write(&adev->reset_sem);
4276 }
5740682e 4277
26bc5340 4278 atomic_inc(&adev->gpu_reset_counter);
a3a09142
AD
4279 switch (amdgpu_asic_reset_method(adev)) {
4280 case AMD_RESET_METHOD_MODE1:
4281 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
4282 break;
4283 case AMD_RESET_METHOD_MODE2:
4284 adev->mp1_state = PP_MP1_STATE_RESET;
4285 break;
4286 default:
4287 adev->mp1_state = PP_MP1_STATE_NONE;
4288 break;
4289 }
1d721ed6
AG
4290
4291 return true;
26bc5340 4292}
d38ceaf9 4293
26bc5340
AG
4294static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
4295{
89041940 4296 amdgpu_vf_error_trans_all(adev);
a3a09142 4297 adev->mp1_state = PP_MP1_STATE_NONE;
53b3f8f4 4298 atomic_set(&adev->in_gpu_reset, 0);
6049db43 4299 up_write(&adev->reset_sem);
26bc5340
AG
4300}
4301
3f12acc8
EQ
4302static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
4303{
4304 struct pci_dev *p = NULL;
4305
4306 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4307 adev->pdev->bus->number, 1);
4308 if (p) {
4309 pm_runtime_enable(&(p->dev));
4310 pm_runtime_resume(&(p->dev));
4311 }
4312}
4313
4314static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
4315{
4316 enum amd_reset_method reset_method;
4317 struct pci_dev *p = NULL;
4318 u64 expires;
4319
4320 /*
4321 * For now, only BACO and mode1 reset are confirmed
4322	 * to suffer the audio issue when not properly suspended.
4323 */
4324 reset_method = amdgpu_asic_reset_method(adev);
4325 if ((reset_method != AMD_RESET_METHOD_BACO) &&
4326 (reset_method != AMD_RESET_METHOD_MODE1))
4327 return -EINVAL;
4328
4329 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4330 adev->pdev->bus->number, 1);
4331 if (!p)
4332 return -ENODEV;
4333
4334 expires = pm_runtime_autosuspend_expiration(&(p->dev));
4335 if (!expires)
4336 /*
4337 * If we cannot get the audio device autosuspend delay,
4338		 * a fixed 4s interval will be used. Since 3s is
4339		 * the audio controller's default autosuspend delay setting,
4340		 * the 4s used here is guaranteed to cover it.
4341 */
54b7feb9 4342 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
3f12acc8
EQ
4343
4344 while (!pm_runtime_status_suspended(&(p->dev))) {
4345 if (!pm_runtime_suspend(&(p->dev)))
4346 break;
4347
4348 if (expires < ktime_get_mono_fast_ns()) {
4349 dev_warn(adev->dev, "failed to suspend display audio\n");
4350 /* TODO: abort the succeeding gpu reset? */
4351 return -ETIMEDOUT;
4352 }
4353 }
4354
4355 pm_runtime_disable(&(p->dev));
4356
4357 return 0;
4358}
4359
26bc5340
AG
4360/**
4361 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
4362 *
4363 * @adev: amdgpu device pointer
4364 * @job: which job triggered the hang
4365 *
4366 * Attempt to reset the GPU if it has hung (all asics).
4367 * Attempt to do a soft-reset or full-reset and reinitialize the ASIC.
4368 * Returns 0 for success or an error on failure.
4369 */
4370
4371int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
4372 struct amdgpu_job *job)
4373{
1d721ed6 4374 struct list_head device_list, *device_list_handle = NULL;
7dd8c205
EQ
4375 bool need_full_reset = false;
4376 bool job_signaled = false;
26bc5340 4377 struct amdgpu_hive_info *hive = NULL;
26bc5340 4378 struct amdgpu_device *tmp_adev = NULL;
1d721ed6 4379 int i, r = 0;
bb5c7235 4380 bool need_emergency_restart = false;
3f12acc8 4381 bool audio_suspended = false;
26bc5340 4382
bb5c7235
WS
4383 /**
4384 * Special case: RAS triggered and full reset isn't supported
4385 */
4386 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
4387
d5ea093e
AG
4388 /*
4389 * Flush RAM to disk so that after reboot
4390	 * the user can read the log and see why the system rebooted.
4391 */
bb5c7235 4392 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
d5ea093e
AG
4393 DRM_WARN("Emergency reboot.");
4394
4395 ksys_sync_helper();
4396 emergency_restart();
4397 }
4398
b823821f 4399 dev_info(adev->dev, "GPU %s begin!\n",
bb5c7235 4400 need_emergency_restart ? "jobs stop":"reset");
26bc5340
AG
4401
4402 /*
1d721ed6
AG
4403	 * Here we trylock to avoid a chain of resets executing,
4404	 * either triggered by jobs on different adevs in the XGMI hive or by jobs on
4405	 * different schedulers for the same device, while this TO handler is running.
4406	 * We always reset all schedulers for a device and all devices in an XGMI
4407	 * hive, so that should take care of them too.
26bc5340 4408 */
d95e8e97 4409 hive = amdgpu_get_xgmi_hive(adev);
53b3f8f4
DL
4410 if (hive) {
4411 if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
4412 DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
4413 job ? job->base.id : -1, hive->hive_id);
d95e8e97 4414 amdgpu_put_xgmi_hive(hive);
53b3f8f4
DL
4415 return 0;
4416 }
4417 mutex_lock(&hive->hive_lock);
1d721ed6 4418 }
26bc5340 4419
9e94d22c
EQ
4420 /*
4421 * Build list of devices to reset.
4422 * In case we are in XGMI hive mode, resort the device list
4423 * to put adev in the 1st position.
4424 */
4425 INIT_LIST_HEAD(&device_list);
4426 if (adev->gmc.xgmi.num_physical_nodes > 1) {
4427 if (!hive)
26bc5340 4428 return -ENODEV;
9e94d22c
EQ
4429 if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
4430 list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
26bc5340
AG
4431 device_list_handle = &hive->device_list;
4432 } else {
4433 list_add_tail(&adev->gmc.xgmi.head, &device_list);
4434 device_list_handle = &device_list;
4435 }
4436
1d721ed6
AG
4437 /* block all schedulers and reset given job's ring */
4438 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
08ebb485 4439 if (!amdgpu_device_lock_adev(tmp_adev, hive)) {
aac89168 4440 dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
9e94d22c 4441 job ? job->base.id : -1);
cbfd17f7
DL
4442 r = 0;
4443 goto skip_recovery;
7c6e68c7
AG
4444 }
4445
3f12acc8
EQ
4446 /*
4447 * Try to put the audio codec into suspend state
4448		 * before the gpu reset starts.
4449		 *
4450		 * The power domain of the graphics device is
4451		 * shared with the AZ power domain. Without this,
4452 * we may change the audio hardware from behind
4453 * the audio driver's back. That will trigger
4454 * some audio codec errors.
4455 */
4456 if (!amdgpu_device_suspend_display_audio(tmp_adev))
4457 audio_suspended = true;
4458
9e94d22c
EQ
4459 amdgpu_ras_set_error_query_ready(tmp_adev, false);
4460
52fb44cf
EQ
4461 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
4462
9e94d22c
EQ
4463 if (!amdgpu_sriov_vf(tmp_adev))
4464 amdgpu_amdkfd_pre_reset(tmp_adev);
4465
12ffa55d
AG
4466 /*
4467		 * Mark these ASICs to be reset as untracked first,
4468		 * and add them back after the reset completes.
4469 */
4470 amdgpu_unregister_gpu_instance(tmp_adev);
4471
a2f63ee8 4472 amdgpu_fbdev_set_suspend(tmp_adev, 1);
565d1941 4473
f1c1314b 4474 /* disable ras on ALL IPs */
bb5c7235 4475 if (!need_emergency_restart &&
b823821f 4476 amdgpu_device_ip_need_full_reset(tmp_adev))
f1c1314b 4477 amdgpu_ras_suspend(tmp_adev);
4478
1d721ed6
AG
4479 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4480 struct amdgpu_ring *ring = tmp_adev->rings[i];
4481
4482 if (!ring || !ring->sched.thread)
4483 continue;
4484
0b2d2c2e 4485 drm_sched_stop(&ring->sched, job ? &job->base : NULL);
7c6e68c7 4486
bb5c7235 4487 if (need_emergency_restart)
7c6e68c7 4488 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
1d721ed6
AG
4489 }
4490 }
4491
bb5c7235 4492 if (need_emergency_restart)
7c6e68c7
AG
4493 goto skip_sched_resume;
4494
1d721ed6
AG
4495 /*
4496 * Must check guilty signal here since after this point all old
4497 * HW fences are force signaled.
4498 *
4499 * job->base holds a reference to parent fence
4500 */
4501 if (job && job->base.s_fence->parent &&
7dd8c205 4502 dma_fence_is_signaled(job->base.s_fence->parent)) {
1d721ed6 4503 job_signaled = true;
1d721ed6
AG
4504 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
4505 goto skip_hw_reset;
4506 }
4507
26bc5340
AG
4508 retry:	/* Pre asic reset for the rest of the adevs in the XGMI hive. */
4509 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
26bc5340
AG
4510 r = amdgpu_device_pre_asic_reset(tmp_adev,
4511 NULL,
4512 &need_full_reset);
4513 /*TODO Should we stop ?*/
4514 if (r) {
aac89168 4515 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
4a580877 4516 r, adev_to_drm(tmp_adev)->unique);
26bc5340
AG
4517 tmp_adev->asic_reset_res = r;
4518 }
4519 }
4520
4521 /* Actual ASIC resets if needed.*/
4522 /* TODO Implement XGMI hive reset logic for SRIOV */
4523 if (amdgpu_sriov_vf(adev)) {
4524 r = amdgpu_device_reset_sriov(adev, job ? false : true);
4525 if (r)
4526 adev->asic_reset_res = r;
4527 } else {
7ac71382 4528 r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false);
26bc5340
AG
4529 if (r && r == -EAGAIN)
4530 goto retry;
4531 }
4532
1d721ed6
AG
4533skip_hw_reset:
4534
26bc5340
AG
4535	/* Post ASIC reset for all devs. */
4536 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
7c6e68c7 4537
1d721ed6
AG
4538 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4539 struct amdgpu_ring *ring = tmp_adev->rings[i];
4540
4541 if (!ring || !ring->sched.thread)
4542 continue;
4543
4544			/* No point in resubmitting jobs if we didn't do a HW reset */
4545 if (!tmp_adev->asic_reset_res && !job_signaled)
4546 drm_sched_resubmit_jobs(&ring->sched);
4547
4548 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
4549 }
4550
4551 if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
4a580877 4552 drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
1d721ed6
AG
4553 }
4554
4555 tmp_adev->asic_reset_res = 0;
26bc5340
AG
4556
4557 if (r) {
4558 /* bad news, how to tell it to userspace ? */
12ffa55d 4559 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
26bc5340
AG
4560 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
4561 } else {
12ffa55d 4562 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
26bc5340 4563 }
7c6e68c7 4564 }
26bc5340 4565
7c6e68c7
AG
4566skip_sched_resume:
4567 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4568 /*unlock kfd: SRIOV would do it separately */
bb5c7235 4569 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
7c6e68c7 4570 amdgpu_amdkfd_post_reset(tmp_adev);
3f12acc8
EQ
4571 if (audio_suspended)
4572 amdgpu_device_resume_display_audio(tmp_adev);
26bc5340
AG
4573 amdgpu_device_unlock_adev(tmp_adev);
4574 }
4575
cbfd17f7 4576skip_recovery:
9e94d22c 4577 if (hive) {
53b3f8f4 4578 atomic_set(&hive->in_reset, 0);
9e94d22c 4579 mutex_unlock(&hive->hive_lock);
d95e8e97 4580 amdgpu_put_xgmi_hive(hive);
9e94d22c 4581 }
26bc5340
AG
4582
4583 if (r)
4584 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
d38ceaf9
AD
4585 return r;
4586}
4587
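/*
 * Editor's sketch (not part of the original file): the intended call pattern
 * for the recovery entry points above, as seen from a scheduler timeout
 * handler.  The function name is hypothetical; the real handler is
 * amdgpu_job_timedout() in amdgpu_job.c and its exact signature differs.
 */
static void example_job_timedout(struct amdgpu_ring *ring, struct amdgpu_job *job)
{
	/* Only attempt a reset when recovery is enabled for this ASIC/config */
	if (amdgpu_device_should_recover_gpu(ring->adev))
		amdgpu_device_gpu_recover(ring->adev, job);
	else
		dev_err(ring->adev->dev, "GPU hang detected, recovery disabled\n");
}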
e3ecdffa
AD
4588/**
4589 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
4590 *
4591 * @adev: amdgpu_device pointer
4592 *
4593 * Fetches and stores in the driver the PCIE capabilities (gen speed
4594 * and lanes) of the slot the device is in. Handles APUs and
4595 * virtualized environments where PCIE config space may not be available.
4596 */
5494d864 4597static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
d0dd7f0c 4598{
5d9a6330 4599 struct pci_dev *pdev;
c5313457
HK
4600 enum pci_bus_speed speed_cap, platform_speed_cap;
4601 enum pcie_link_width platform_link_width;
d0dd7f0c 4602
cd474ba0
AD
4603 if (amdgpu_pcie_gen_cap)
4604 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
d0dd7f0c 4605
cd474ba0
AD
4606 if (amdgpu_pcie_lane_cap)
4607 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
d0dd7f0c 4608
cd474ba0
AD
4609 /* covers APUs as well */
4610 if (pci_is_root_bus(adev->pdev->bus)) {
4611 if (adev->pm.pcie_gen_mask == 0)
4612 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
4613 if (adev->pm.pcie_mlw_mask == 0)
4614 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
d0dd7f0c 4615 return;
cd474ba0 4616 }
d0dd7f0c 4617
c5313457
HK
4618 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
4619 return;
4620
dbaa922b
AD
4621 pcie_bandwidth_available(adev->pdev, NULL,
4622 &platform_speed_cap, &platform_link_width);
c5313457 4623
cd474ba0 4624 if (adev->pm.pcie_gen_mask == 0) {
5d9a6330
AD
4625 /* asic caps */
4626 pdev = adev->pdev;
4627 speed_cap = pcie_get_speed_cap(pdev);
4628 if (speed_cap == PCI_SPEED_UNKNOWN) {
4629 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
cd474ba0
AD
4630 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4631 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
cd474ba0 4632 } else {
5d9a6330
AD
4633 if (speed_cap == PCIE_SPEED_16_0GT)
4634 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4635 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4636 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4637 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
4638 else if (speed_cap == PCIE_SPEED_8_0GT)
4639 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4640 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4641 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4642 else if (speed_cap == PCIE_SPEED_5_0GT)
4643 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4644 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
4645 else
4646 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
4647 }
4648 /* platform caps */
c5313457 4649 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
5d9a6330
AD
4650 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4651 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4652 } else {
c5313457 4653 if (platform_speed_cap == PCIE_SPEED_16_0GT)
5d9a6330
AD
4654 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4655 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4656 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4657 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
c5313457 4658 else if (platform_speed_cap == PCIE_SPEED_8_0GT)
5d9a6330
AD
4659 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4660 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4661 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
c5313457 4662 else if (platform_speed_cap == PCIE_SPEED_5_0GT)
5d9a6330
AD
4663 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4664 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4665 else
4666 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
4667
cd474ba0
AD
4668 }
4669 }
4670 if (adev->pm.pcie_mlw_mask == 0) {
c5313457 4671 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
5d9a6330
AD
4672 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
4673 } else {
c5313457 4674 switch (platform_link_width) {
5d9a6330 4675 case PCIE_LNK_X32:
cd474ba0
AD
4676 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
4677 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4678 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4679 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4680 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4681 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4682 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4683 break;
5d9a6330 4684 case PCIE_LNK_X16:
cd474ba0
AD
4685 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4686 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4687 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4688 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4689 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4690 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4691 break;
5d9a6330 4692 case PCIE_LNK_X12:
cd474ba0
AD
4693 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4694 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4695 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4696 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4697 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4698 break;
5d9a6330 4699 case PCIE_LNK_X8:
cd474ba0
AD
4700 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4701 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4702 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4703 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4704 break;
5d9a6330 4705 case PCIE_LNK_X4:
cd474ba0
AD
4706 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4707 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4708 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4709 break;
5d9a6330 4710 case PCIE_LNK_X2:
cd474ba0
AD
4711 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4712 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4713 break;
5d9a6330 4714 case PCIE_LNK_X1:
cd474ba0
AD
4715 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
4716 break;
4717 default:
4718 break;
4719 }
d0dd7f0c
AD
4720 }
4721 }
4722}
d38ceaf9 4723
361dbd01
AD
4724int amdgpu_device_baco_enter(struct drm_device *dev)
4725{
1348969a 4726 struct amdgpu_device *adev = drm_to_adev(dev);
7a22677b 4727 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
361dbd01 4728
4a580877 4729 if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
361dbd01
AD
4730 return -ENOTSUPP;
4731
7a22677b
LM
4732 if (ras && ras->supported)
4733 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
4734
9530273e 4735 return amdgpu_dpm_baco_enter(adev);
361dbd01
AD
4736}
4737
4738int amdgpu_device_baco_exit(struct drm_device *dev)
4739{
1348969a 4740 struct amdgpu_device *adev = drm_to_adev(dev);
7a22677b 4741 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
9530273e 4742 int ret = 0;
361dbd01 4743
4a580877 4744 if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
361dbd01
AD
4745 return -ENOTSUPP;
4746
9530273e
EQ
4747 ret = amdgpu_dpm_baco_exit(adev);
4748 if (ret)
4749 return ret;
7a22677b
LM
4750
4751 if (ras && ras->supported)
4752 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
4753
4754 return 0;
361dbd01 4755}
c9a6b82f 4756
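/*
 * Editor's sketch (not part of the original file): the BACO helpers above
 * are meant to be used in pairs from the runtime-PM callbacks in
 * amdgpu_drv.c, roughly as below.  The function name is hypothetical and
 * error handling is omitted.
 */
static int example_runtime_idle_power_down(struct drm_device *dev)
{
	/* Enter BACO instead of cutting power when the board supports it */
	if (amdgpu_device_supports_baco(dev))
		return amdgpu_device_baco_enter(dev);

	return -ENOTSUPP;
}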
acd89fca
AG
4757static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev)
4758{
4759 int i;
4760
4761 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4762 struct amdgpu_ring *ring = adev->rings[i];
4763
4764 if (!ring || !ring->sched.thread)
4765 continue;
4766
4767 cancel_delayed_work_sync(&ring->sched.work_tdr);
4768 }
4769}
4770
c9a6b82f
AG
4771/**
4772 * amdgpu_pci_error_detected - Called when a PCI error is detected.
4773 * @pdev: PCI device struct
4774 * @state: PCI channel state
4775 *
4776 * Description: Called when a PCI error is detected.
4777 *
4778 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
4779 */
4780pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
4781{
4782 struct drm_device *dev = pci_get_drvdata(pdev);
4783 struct amdgpu_device *adev = drm_to_adev(dev);
acd89fca 4784 int i;
c9a6b82f
AG
4785
4786 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
4787
6894305c
AG
4788 if (adev->gmc.xgmi.num_physical_nodes > 1) {
4789 DRM_WARN("No support for XGMI hive yet...");
4790 return PCI_ERS_RESULT_DISCONNECT;
4791 }
4792
c9a6b82f
AG
4793 switch (state) {
4794 case pci_channel_io_normal:
4795 return PCI_ERS_RESULT_CAN_RECOVER;
acd89fca
AG
4796 /* Fatal error, prepare for slot reset */
4797 case pci_channel_io_frozen:
4798 /*
4799 * Cancel and wait for all TDRs in progress if failing to
4800 * set adev->in_gpu_reset in amdgpu_device_lock_adev
4801 *
4802 * Locking adev->reset_sem will prevent any external access
4803 * to GPU during PCI error recovery
4804 */
4805 while (!amdgpu_device_lock_adev(adev, NULL))
4806 amdgpu_cancel_all_tdr(adev);
4807
4808 /*
4809 * Block any work scheduling as we do for regular GPU reset
4810 * for the duration of the recovery
4811 */
4812 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4813 struct amdgpu_ring *ring = adev->rings[i];
4814
4815 if (!ring || !ring->sched.thread)
4816 continue;
4817
4818 drm_sched_stop(&ring->sched, NULL);
4819 }
c9a6b82f
AG
4820 return PCI_ERS_RESULT_NEED_RESET;
4821 case pci_channel_io_perm_failure:
4822 /* Permanent error, prepare for device removal */
4823 return PCI_ERS_RESULT_DISCONNECT;
4824 }
4825
4826 return PCI_ERS_RESULT_NEED_RESET;
4827}
4828
4829/**
4830 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
4831 * @pdev: pointer to PCI device
4832 */
4833pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
4834{
4835
4836 DRM_INFO("PCI error: mmio enabled callback!!\n");
4837
4838 /* TODO - dump whatever for debugging purposes */
4839
4840	/* This is called only if amdgpu_pci_error_detected returns
4841 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
4842 * works, no need to reset slot.
4843 */
4844
4845 return PCI_ERS_RESULT_RECOVERED;
4846}
4847
4848/**
4849 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
4850 * @pdev: PCI device struct
4851 *
4852 * Description: This routine is called by the pci error recovery
4853 * code after the PCI slot has been reset, just before we
4854 * should resume normal operations.
4855 */
4856pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
4857{
4858 struct drm_device *dev = pci_get_drvdata(pdev);
4859 struct amdgpu_device *adev = drm_to_adev(dev);
362c7b91 4860 int r, i;
7ac71382 4861 bool need_full_reset = true;
362c7b91 4862 u32 memsize;
7ac71382 4863 struct list_head device_list;
c9a6b82f
AG
4864
4865 DRM_INFO("PCI error: slot reset callback!!\n");
4866
7ac71382
AG
4867 INIT_LIST_HEAD(&device_list);
4868 list_add_tail(&adev->gmc.xgmi.head, &device_list);
4869
362c7b91
AG
4870 /* wait for asic to come out of reset */
4871 msleep(500);
4872
7ac71382 4873 /* Restore PCI confspace */
c1dd4aa6 4874 amdgpu_device_load_pci_state(pdev);
c9a6b82f 4875
362c7b91
AG
4876 /* confirm ASIC came out of reset */
4877 for (i = 0; i < adev->usec_timeout; i++) {
4878 memsize = amdgpu_asic_get_config_memsize(adev);
4879
4880 if (memsize != 0xffffffff)
4881 break;
4882 udelay(1);
4883 }
4884 if (memsize == 0xffffffff) {
4885 r = -ETIME;
4886 goto out;
4887 }
4888
362c7b91 4889 adev->in_pci_err_recovery = true;
7ac71382 4890 r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset);
bf36b52e 4891 adev->in_pci_err_recovery = false;
c9a6b82f
AG
4892 if (r)
4893 goto out;
4894
7ac71382 4895 r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true);
c9a6b82f
AG
4896
4897out:
c9a6b82f 4898 if (!r) {
c1dd4aa6
AG
4899 if (amdgpu_device_cache_pci_state(adev->pdev))
4900 pci_restore_state(adev->pdev);
4901
c9a6b82f
AG
4902 DRM_INFO("PCIe error recovery succeeded\n");
4903 } else {
4904 DRM_ERROR("PCIe error recovery failed, err:%d", r);
4905 amdgpu_device_unlock_adev(adev);
4906 }
4907
4908 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
4909}
4910
4911/**
4912 * amdgpu_pci_resume() - resume normal ops after PCI reset
4913 * @pdev: pointer to PCI device
4914 *
4915 * Called when the error recovery driver tells us that it's
4916 * OK to resume normal operation. Use completion to allow
4917 * halted operations to resume.
4918 */
4919void amdgpu_pci_resume(struct pci_dev *pdev)
4920{
4921 struct drm_device *dev = pci_get_drvdata(pdev);
4922 struct amdgpu_device *adev = drm_to_adev(dev);
acd89fca 4923 int i;
c9a6b82f 4924
c9a6b82f
AG
4925
4926 DRM_INFO("PCI error: resume callback!!\n");
acd89fca
AG
4927
4928 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4929 struct amdgpu_ring *ring = adev->rings[i];
4930
4931 if (!ring || !ring->sched.thread)
4932 continue;
4933
4934
4935 drm_sched_resubmit_jobs(&ring->sched);
4936 drm_sched_start(&ring->sched, true);
4937 }
4938
4939 amdgpu_device_unlock_adev(adev);
c9a6b82f 4940}
c1dd4aa6
AG
4941
4942bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
4943{
4944 struct drm_device *dev = pci_get_drvdata(pdev);
4945 struct amdgpu_device *adev = drm_to_adev(dev);
4946 int r;
4947
4948 r = pci_save_state(pdev);
4949 if (!r) {
4950 kfree(adev->pci_state);
4951
4952 adev->pci_state = pci_store_saved_state(pdev);
4953
4954 if (!adev->pci_state) {
4955 DRM_ERROR("Failed to store PCI saved state");
4956 return false;
4957 }
4958 } else {
4959 DRM_WARN("Failed to save PCI state, err:%d\n", r);
4960 return false;
4961 }
4962
4963 return true;
4964}
4965
4966bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
4967{
4968 struct drm_device *dev = pci_get_drvdata(pdev);
4969 struct amdgpu_device *adev = drm_to_adev(dev);
4970 int r;
4971
4972 if (!adev->pci_state)
4973 return false;
4974
4975 r = pci_load_saved_state(pdev, adev->pci_state);
4976
4977 if (!r) {
4978 pci_restore_state(pdev);
4979 } else {
4980 DRM_WARN("Failed to load PCI state, err:%d\n", r);
4981 return false;
4982 }
4983
4984 return true;
4985}
4986
4987
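/*
 * Editor's sketch (not part of the original file): how the PCI error
 * callbacks defined above are typically wired into the driver's
 * struct pci_driver through a struct pci_error_handlers.  For amdgpu the
 * real hookup lives in amdgpu_drv.c; the instance name below is
 * hypothetical.
 */
static const struct pci_error_handlers example_amdgpu_pci_err_handler = {
	.error_detected	= amdgpu_pci_error_detected,
	.mmio_enabled	= amdgpu_pci_mmio_enabled,
	.slot_reset	= amdgpu_pci_slot_reset,
	.resume		= amdgpu_pci_resume,
};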