drm/amdgpu: Trim amdgpu_pci_slot_reset by reusing code.
[linux-2.6-block.git] / drivers / gpu / drm / amd / amdgpu / amdgpu_device.c
CommitLineData
d38ceaf9
AD
1/*
2 * Copyright 2008 Advanced Micro Devices, Inc.
3 * Copyright 2008 Red Hat Inc.
4 * Copyright 2009 Jerome Glisse.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 * OTHER DEALINGS IN THE SOFTWARE.
23 *
24 * Authors: Dave Airlie
25 * Alex Deucher
26 * Jerome Glisse
27 */
b1ddf548 28#include <linux/power_supply.h>
0875dc9e 29#include <linux/kthread.h>
fdf2f6c5 30#include <linux/module.h>
d38ceaf9
AD
31#include <linux/console.h>
32#include <linux/slab.h>
fdf2f6c5 33
4562236b 34#include <drm/drm_atomic_helper.h>
fcd70cd3 35#include <drm/drm_probe_helper.h>
d38ceaf9
AD
36#include <drm/amdgpu_drm.h>
37#include <linux/vgaarb.h>
38#include <linux/vga_switcheroo.h>
39#include <linux/efi.h>
40#include "amdgpu.h"
f4b373f4 41#include "amdgpu_trace.h"
d38ceaf9
AD
42#include "amdgpu_i2c.h"
43#include "atom.h"
44#include "amdgpu_atombios.h"
a5bde2f9 45#include "amdgpu_atomfirmware.h"
d0dd7f0c 46#include "amd_pcie.h"
33f34802
KW
47#ifdef CONFIG_DRM_AMDGPU_SI
48#include "si.h"
49#endif
a2e73f56
AD
50#ifdef CONFIG_DRM_AMDGPU_CIK
51#include "cik.h"
52#endif
aaa36a97 53#include "vi.h"
460826e6 54#include "soc15.h"
0a5b8c7b 55#include "nv.h"
d38ceaf9 56#include "bif/bif_4_1_d.h"
9accf2fd 57#include <linux/pci.h>
bec86378 58#include <linux/firmware.h>
89041940 59#include "amdgpu_vf_error.h"
d38ceaf9 60
ba997709 61#include "amdgpu_amdkfd.h"
d2f52ac8 62#include "amdgpu_pm.h"
d38ceaf9 63
5183411b 64#include "amdgpu_xgmi.h"
c030f2e4 65#include "amdgpu_ras.h"
9c7c85f7 66#include "amdgpu_pmu.h"
bd607166 67#include "amdgpu_fru_eeprom.h"
5183411b 68
d5ea093e 69#include <linux/suspend.h>
c6a6e2db 70#include <drm/task_barrier.h>
3f12acc8 71#include <linux/pm_runtime.h>
d5ea093e 72
e2a75f88 73MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
3f76dced 74MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
2d2e5e7e 75MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
ad5a67a7 76MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
54c4d17e 77MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
65e60f6e 78MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
b51a26a0 79MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
23c6268e 80MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
ed42cfe1 81MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
42b325e5 82MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
c0a43457 83MODULE_FIRMWARE("amdgpu/sienna_cichlid_gpu_info.bin");
120eb833 84MODULE_FIRMWARE("amdgpu/navy_flounder_gpu_info.bin");
e2a75f88 85
2dc80b00
S
86#define AMDGPU_RESUME_MS 2000
87
050091ab 88const char *amdgpu_asic_name[] = {
da69c161
KW
89 "TAHITI",
90 "PITCAIRN",
91 "VERDE",
92 "OLAND",
93 "HAINAN",
d38ceaf9
AD
94 "BONAIRE",
95 "KAVERI",
96 "KABINI",
97 "HAWAII",
98 "MULLINS",
99 "TOPAZ",
100 "TONGA",
48299f95 101 "FIJI",
d38ceaf9 102 "CARRIZO",
139f4917 103 "STONEY",
2cc0c0b5
FC
104 "POLARIS10",
105 "POLARIS11",
c4642a47 106 "POLARIS12",
48ff108d 107 "VEGAM",
d4196f01 108 "VEGA10",
8fab806a 109 "VEGA12",
956fcddc 110 "VEGA20",
2ca8a5d2 111 "RAVEN",
d6c3b24e 112 "ARCTURUS",
1eee4228 113 "RENOIR",
852a6626 114 "NAVI10",
87dbad02 115 "NAVI14",
9802f5d7 116 "NAVI12",
ccaf72d3 117 "SIENNA_CICHLID",
ddd8fbe7 118 "NAVY_FLOUNDER",
d38ceaf9
AD
119 "LAST",
120};
121
dcea6e65
KR
122/**
123 * DOC: pcie_replay_count
124 *
125 * The amdgpu driver provides a sysfs API for reporting the total number
126 * of PCIe replays (NAKs)
127 * The file pcie_replay_count is used for this and returns the total
128 * number of replays as a sum of the NAKs generated and NAKs received
129 */
130
131static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
132 struct device_attribute *attr, char *buf)
133{
134 struct drm_device *ddev = dev_get_drvdata(dev);
1348969a 135 struct amdgpu_device *adev = drm_to_adev(ddev);
dcea6e65
KR
136 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
137
138 return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
139}
140
141static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
142 amdgpu_device_get_pcie_replay_count, NULL);
143
5494d864
AD
144static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
145
bd607166
KR
146/**
147 * DOC: product_name
148 *
149 * The amdgpu driver provides a sysfs API for reporting the product name
150 * for the device
151 * The file serial_number is used for this and returns the product name
152 * as returned from the FRU.
153 * NOTE: This is only available for certain server cards
154 */
155
156static ssize_t amdgpu_device_get_product_name(struct device *dev,
157 struct device_attribute *attr, char *buf)
158{
159 struct drm_device *ddev = dev_get_drvdata(dev);
1348969a 160 struct amdgpu_device *adev = drm_to_adev(ddev);
bd607166
KR
161
162 return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name);
163}
164
165static DEVICE_ATTR(product_name, S_IRUGO,
166 amdgpu_device_get_product_name, NULL);
167
168/**
169 * DOC: product_number
170 *
171 * The amdgpu driver provides a sysfs API for reporting the part number
172 * for the device
173 * The file serial_number is used for this and returns the part number
174 * as returned from the FRU.
175 * NOTE: This is only available for certain server cards
176 */
177
178static ssize_t amdgpu_device_get_product_number(struct device *dev,
179 struct device_attribute *attr, char *buf)
180{
181 struct drm_device *ddev = dev_get_drvdata(dev);
1348969a 182 struct amdgpu_device *adev = drm_to_adev(ddev);
bd607166
KR
183
184 return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number);
185}
186
187static DEVICE_ATTR(product_number, S_IRUGO,
188 amdgpu_device_get_product_number, NULL);
189
190/**
191 * DOC: serial_number
192 *
193 * The amdgpu driver provides a sysfs API for reporting the serial number
194 * for the device
195 * The file serial_number is used for this and returns the serial number
196 * as returned from the FRU.
197 * NOTE: This is only available for certain server cards
198 */
199
200static ssize_t amdgpu_device_get_serial_number(struct device *dev,
201 struct device_attribute *attr, char *buf)
202{
203 struct drm_device *ddev = dev_get_drvdata(dev);
1348969a 204 struct amdgpu_device *adev = drm_to_adev(ddev);
bd607166
KR
205
206 return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial);
207}
208
209static DEVICE_ATTR(serial_number, S_IRUGO,
210 amdgpu_device_get_serial_number, NULL);
211
e3ecdffa 212/**
31af062a 213 * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control
e3ecdffa
AD
214 *
215 * @dev: drm_device pointer
216 *
217 * Returns true if the device is a dGPU with HG/PX power control,
218 * otherwise return false.
219 */
31af062a 220bool amdgpu_device_supports_boco(struct drm_device *dev)
d38ceaf9 221{
1348969a 222 struct amdgpu_device *adev = drm_to_adev(dev);
d38ceaf9 223
2f7d10b3 224 if (adev->flags & AMD_IS_PX)
d38ceaf9
AD
225 return true;
226 return false;
227}
228
a69cba42
AD
229/**
230 * amdgpu_device_supports_baco - Does the device support BACO
231 *
232 * @dev: drm_device pointer
233 *
234 * Returns true if the device supporte BACO,
235 * otherwise return false.
236 */
237bool amdgpu_device_supports_baco(struct drm_device *dev)
238{
1348969a 239 struct amdgpu_device *adev = drm_to_adev(dev);
a69cba42
AD
240
241 return amdgpu_asic_supports_baco(adev);
242}
243
e35e2b11
TY
244/**
245 * VRAM access helper functions.
246 *
247 * amdgpu_device_vram_access - read/write a buffer in vram
248 *
249 * @adev: amdgpu_device pointer
250 * @pos: offset of the buffer in vram
251 * @buf: virtual address of the buffer in system memory
252 * @size: read/write size, sizeof(@buf) must > @size
253 * @write: true - write to vram, otherwise - read from vram
254 */
255void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
256 uint32_t *buf, size_t size, bool write)
257{
e35e2b11 258 unsigned long flags;
ce05ac56
CK
259 uint32_t hi = ~0;
260 uint64_t last;
261
9d11eb0d
CK
262
263#ifdef CONFIG_64BIT
264 last = min(pos + size, adev->gmc.visible_vram_size);
265 if (last > pos) {
266 void __iomem *addr = adev->mman.aper_base_kaddr + pos;
267 size_t count = last - pos;
268
269 if (write) {
270 memcpy_toio(addr, buf, count);
271 mb();
272 amdgpu_asic_flush_hdp(adev, NULL);
273 } else {
274 amdgpu_asic_invalidate_hdp(adev, NULL);
275 mb();
276 memcpy_fromio(buf, addr, count);
277 }
278
279 if (count == size)
280 return;
281
282 pos += count;
283 buf += count / 4;
284 size -= count;
285 }
286#endif
287
ce05ac56
CK
288 spin_lock_irqsave(&adev->mmio_idx_lock, flags);
289 for (last = pos + size; pos < last; pos += 4) {
290 uint32_t tmp = pos >> 31;
e35e2b11 291
e35e2b11 292 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
ce05ac56
CK
293 if (tmp != hi) {
294 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
295 hi = tmp;
296 }
e35e2b11
TY
297 if (write)
298 WREG32_NO_KIQ(mmMM_DATA, *buf++);
299 else
300 *buf++ = RREG32_NO_KIQ(mmMM_DATA);
e35e2b11 301 }
ce05ac56 302 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
e35e2b11
TY
303}
304
d38ceaf9 305/*
e78b579d 306 * MMIO register access helper functions.
d38ceaf9 307 */
e3ecdffa 308/**
e78b579d 309 * amdgpu_mm_rreg - read a memory mapped IO register
e3ecdffa
AD
310 *
311 * @adev: amdgpu_device pointer
312 * @reg: dword aligned register offset
313 * @acc_flags: access flags which require special behavior
314 *
315 * Returns the 32 bit value from the offset specified.
316 */
e78b579d
HZ
317uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
318 uint32_t acc_flags)
d38ceaf9 319{
f4b373f4
TSD
320 uint32_t ret;
321
bf36b52e
AG
322 if (adev->in_pci_err_recovery)
323 return 0;
324
81202807
DL
325 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev) &&
326 down_read_trylock(&adev->reset_sem)) {
327 ret = amdgpu_kiq_rreg(adev, reg);
328 up_read(&adev->reset_sem);
329 return ret;
330 }
bc992ba5 331
ec59847e 332 if ((reg * 4) < adev->rmmio_size)
f4b373f4 333 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
e78b579d
HZ
334 else {
335 unsigned long flags;
336
337 spin_lock_irqsave(&adev->mmio_idx_lock, flags);
338 writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4));
339 ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
340 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
341 }
81202807 342
e78b579d 343 trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret);
f4b373f4 344 return ret;
d38ceaf9
AD
345}
346
421a2a30
ML
347/*
348 * MMIO register read with bytes helper functions
349 * @offset:bytes offset from MMIO start
350 *
351*/
352
e3ecdffa
AD
353/**
354 * amdgpu_mm_rreg8 - read a memory mapped IO register
355 *
356 * @adev: amdgpu_device pointer
357 * @offset: byte aligned register offset
358 *
359 * Returns the 8 bit value from the offset specified.
360 */
421a2a30 361uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) {
bf36b52e
AG
362 if (adev->in_pci_err_recovery)
363 return 0;
364
421a2a30
ML
365 if (offset < adev->rmmio_size)
366 return (readb(adev->rmmio + offset));
367 BUG();
368}
369
370/*
371 * MMIO register write with bytes helper functions
372 * @offset:bytes offset from MMIO start
373 * @value: the value want to be written to the register
374 *
375*/
e3ecdffa
AD
376/**
377 * amdgpu_mm_wreg8 - read a memory mapped IO register
378 *
379 * @adev: amdgpu_device pointer
380 * @offset: byte aligned register offset
381 * @value: 8 bit value to write
382 *
383 * Writes the value specified to the offset specified.
384 */
421a2a30 385void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) {
bf36b52e
AG
386 if (adev->in_pci_err_recovery)
387 return;
388
421a2a30
ML
389 if (offset < adev->rmmio_size)
390 writeb(value, adev->rmmio + offset);
391 else
392 BUG();
393}
394
e230ac11
ND
395static inline void amdgpu_mm_wreg_mmio(struct amdgpu_device *adev,
396 uint32_t reg, uint32_t v,
397 uint32_t acc_flags)
2e0cc4d4 398{
bf36b52e
AG
399 if (adev->in_pci_err_recovery)
400 return;
401
e78b579d 402 trace_amdgpu_mm_wreg(adev->pdev->device, reg, v);
2e0cc4d4 403
ec59847e 404 if ((reg * 4) < adev->rmmio_size)
2e0cc4d4 405 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
e78b579d
HZ
406 else {
407 unsigned long flags;
408
409 spin_lock_irqsave(&adev->mmio_idx_lock, flags);
410 writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4));
411 writel(v, ((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
412 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
413 }
2e0cc4d4
ML
414}
415
e3ecdffa 416/**
e78b579d 417 * amdgpu_mm_wreg - write to a memory mapped IO register
e3ecdffa
AD
418 *
419 * @adev: amdgpu_device pointer
420 * @reg: dword aligned register offset
421 * @v: 32 bit value to write to the register
422 * @acc_flags: access flags which require special behavior
423 *
424 * Writes the value specified to the offset specified.
425 */
e78b579d
HZ
426void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
427 uint32_t acc_flags)
d38ceaf9 428{
bf36b52e
AG
429 if (adev->in_pci_err_recovery)
430 return;
431
81202807
DL
432 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev) &&
433 down_read_trylock(&adev->reset_sem)) {
434 amdgpu_kiq_wreg(adev, reg, v);
435 up_read(&adev->reset_sem);
436 return;
437 }
bc992ba5 438
e78b579d 439 amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags);
2e0cc4d4 440}
d38ceaf9 441
2e0cc4d4
ML
442/*
443 * amdgpu_mm_wreg_mmio_rlc - write register either with mmio or with RLC path if in range
444 *
445 * this function is invoked only the debugfs register access
446 * */
447void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
448 uint32_t acc_flags)
449{
bf36b52e
AG
450 if (adev->in_pci_err_recovery)
451 return;
452
2e0cc4d4
ML
453 if (amdgpu_sriov_fullaccess(adev) &&
454 adev->gfx.rlc.funcs &&
455 adev->gfx.rlc.funcs->is_rlcg_access_range) {
47ed4e1c 456
2e0cc4d4
ML
457 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
458 return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v);
47ed4e1c 459 }
2e0cc4d4 460
e78b579d 461 amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags);
d38ceaf9
AD
462}
463
e3ecdffa
AD
464/**
465 * amdgpu_io_rreg - read an IO register
466 *
467 * @adev: amdgpu_device pointer
468 * @reg: dword aligned register offset
469 *
470 * Returns the 32 bit value from the offset specified.
471 */
d38ceaf9
AD
472u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
473{
bf36b52e
AG
474 if (adev->in_pci_err_recovery)
475 return 0;
476
d38ceaf9
AD
477 if ((reg * 4) < adev->rio_mem_size)
478 return ioread32(adev->rio_mem + (reg * 4));
479 else {
480 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
481 return ioread32(adev->rio_mem + (mmMM_DATA * 4));
482 }
483}
484
e3ecdffa
AD
485/**
486 * amdgpu_io_wreg - write to an IO register
487 *
488 * @adev: amdgpu_device pointer
489 * @reg: dword aligned register offset
490 * @v: 32 bit value to write to the register
491 *
492 * Writes the value specified to the offset specified.
493 */
d38ceaf9
AD
494void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
495{
bf36b52e
AG
496 if (adev->in_pci_err_recovery)
497 return;
498
d38ceaf9
AD
499 if ((reg * 4) < adev->rio_mem_size)
500 iowrite32(v, adev->rio_mem + (reg * 4));
501 else {
502 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
503 iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
504 }
505}
506
507/**
508 * amdgpu_mm_rdoorbell - read a doorbell dword
509 *
510 * @adev: amdgpu_device pointer
511 * @index: doorbell index
512 *
513 * Returns the value in the doorbell aperture at the
514 * requested doorbell index (CIK).
515 */
516u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
517{
bf36b52e
AG
518 if (adev->in_pci_err_recovery)
519 return 0;
520
d38ceaf9
AD
521 if (index < adev->doorbell.num_doorbells) {
522 return readl(adev->doorbell.ptr + index);
523 } else {
524 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
525 return 0;
526 }
527}
528
529/**
530 * amdgpu_mm_wdoorbell - write a doorbell dword
531 *
532 * @adev: amdgpu_device pointer
533 * @index: doorbell index
534 * @v: value to write
535 *
536 * Writes @v to the doorbell aperture at the
537 * requested doorbell index (CIK).
538 */
539void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
540{
bf36b52e
AG
541 if (adev->in_pci_err_recovery)
542 return;
543
d38ceaf9
AD
544 if (index < adev->doorbell.num_doorbells) {
545 writel(v, adev->doorbell.ptr + index);
546 } else {
547 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
548 }
549}
550
832be404
KW
551/**
552 * amdgpu_mm_rdoorbell64 - read a doorbell Qword
553 *
554 * @adev: amdgpu_device pointer
555 * @index: doorbell index
556 *
557 * Returns the value in the doorbell aperture at the
558 * requested doorbell index (VEGA10+).
559 */
560u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
561{
bf36b52e
AG
562 if (adev->in_pci_err_recovery)
563 return 0;
564
832be404
KW
565 if (index < adev->doorbell.num_doorbells) {
566 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
567 } else {
568 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
569 return 0;
570 }
571}
572
573/**
574 * amdgpu_mm_wdoorbell64 - write a doorbell Qword
575 *
576 * @adev: amdgpu_device pointer
577 * @index: doorbell index
578 * @v: value to write
579 *
580 * Writes @v to the doorbell aperture at the
581 * requested doorbell index (VEGA10+).
582 */
583void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
584{
bf36b52e
AG
585 if (adev->in_pci_err_recovery)
586 return;
587
832be404
KW
588 if (index < adev->doorbell.num_doorbells) {
589 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
590 } else {
591 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
592 }
593}
594
d38ceaf9
AD
595/**
596 * amdgpu_invalid_rreg - dummy reg read function
597 *
598 * @adev: amdgpu device pointer
599 * @reg: offset of register
600 *
601 * Dummy register read function. Used for register blocks
602 * that certain asics don't have (all asics).
603 * Returns the value in the register.
604 */
605static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
606{
607 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
608 BUG();
609 return 0;
610}
611
612/**
613 * amdgpu_invalid_wreg - dummy reg write function
614 *
615 * @adev: amdgpu device pointer
616 * @reg: offset of register
617 * @v: value to write to the register
618 *
619 * Dummy register read function. Used for register blocks
620 * that certain asics don't have (all asics).
621 */
622static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
623{
624 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
625 reg, v);
626 BUG();
627}
628
4fa1c6a6
TZ
629/**
630 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
631 *
632 * @adev: amdgpu device pointer
633 * @reg: offset of register
634 *
635 * Dummy register read function. Used for register blocks
636 * that certain asics don't have (all asics).
637 * Returns the value in the register.
638 */
639static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
640{
641 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
642 BUG();
643 return 0;
644}
645
646/**
647 * amdgpu_invalid_wreg64 - dummy reg write function
648 *
649 * @adev: amdgpu device pointer
650 * @reg: offset of register
651 * @v: value to write to the register
652 *
653 * Dummy register read function. Used for register blocks
654 * that certain asics don't have (all asics).
655 */
656static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
657{
658 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
659 reg, v);
660 BUG();
661}
662
d38ceaf9
AD
663/**
664 * amdgpu_block_invalid_rreg - dummy reg read function
665 *
666 * @adev: amdgpu device pointer
667 * @block: offset of instance
668 * @reg: offset of register
669 *
670 * Dummy register read function. Used for register blocks
671 * that certain asics don't have (all asics).
672 * Returns the value in the register.
673 */
674static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
675 uint32_t block, uint32_t reg)
676{
677 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
678 reg, block);
679 BUG();
680 return 0;
681}
682
683/**
684 * amdgpu_block_invalid_wreg - dummy reg write function
685 *
686 * @adev: amdgpu device pointer
687 * @block: offset of instance
688 * @reg: offset of register
689 * @v: value to write to the register
690 *
691 * Dummy register read function. Used for register blocks
692 * that certain asics don't have (all asics).
693 */
694static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
695 uint32_t block,
696 uint32_t reg, uint32_t v)
697{
698 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
699 reg, block, v);
700 BUG();
701}
702
4d2997ab
AD
703/**
704 * amdgpu_device_asic_init - Wrapper for atom asic_init
705 *
706 * @dev: drm_device pointer
707 *
708 * Does any asic specific work and then calls atom asic init.
709 */
710static int amdgpu_device_asic_init(struct amdgpu_device *adev)
711{
712 amdgpu_asic_pre_asic_init(adev);
713
714 return amdgpu_atom_asic_init(adev->mode_info.atom_context);
715}
716
e3ecdffa
AD
717/**
718 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
719 *
720 * @adev: amdgpu device pointer
721 *
722 * Allocates a scratch page of VRAM for use by various things in the
723 * driver.
724 */
06ec9070 725static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
d38ceaf9 726{
a4a02777
CK
727 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
728 PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
729 &adev->vram_scratch.robj,
730 &adev->vram_scratch.gpu_addr,
731 (void **)&adev->vram_scratch.ptr);
d38ceaf9
AD
732}
733
e3ecdffa
AD
734/**
735 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
736 *
737 * @adev: amdgpu device pointer
738 *
739 * Frees the VRAM scratch page.
740 */
06ec9070 741static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
d38ceaf9 742{
078af1a3 743 amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
d38ceaf9
AD
744}
745
746/**
9c3f2b54 747 * amdgpu_device_program_register_sequence - program an array of registers.
d38ceaf9
AD
748 *
749 * @adev: amdgpu_device pointer
750 * @registers: pointer to the register array
751 * @array_size: size of the register array
752 *
753 * Programs an array or registers with and and or masks.
754 * This is a helper for setting golden registers.
755 */
9c3f2b54
AD
756void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
757 const u32 *registers,
758 const u32 array_size)
d38ceaf9
AD
759{
760 u32 tmp, reg, and_mask, or_mask;
761 int i;
762
763 if (array_size % 3)
764 return;
765
766 for (i = 0; i < array_size; i +=3) {
767 reg = registers[i + 0];
768 and_mask = registers[i + 1];
769 or_mask = registers[i + 2];
770
771 if (and_mask == 0xffffffff) {
772 tmp = or_mask;
773 } else {
774 tmp = RREG32(reg);
775 tmp &= ~and_mask;
e0d07657
HZ
776 if (adev->family >= AMDGPU_FAMILY_AI)
777 tmp |= (or_mask & and_mask);
778 else
779 tmp |= or_mask;
d38ceaf9
AD
780 }
781 WREG32(reg, tmp);
782 }
783}
784
e3ecdffa
AD
785/**
786 * amdgpu_device_pci_config_reset - reset the GPU
787 *
788 * @adev: amdgpu_device pointer
789 *
790 * Resets the GPU using the pci config reset sequence.
791 * Only applicable to asics prior to vega10.
792 */
8111c387 793void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
d38ceaf9
AD
794{
795 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
796}
797
798/*
799 * GPU doorbell aperture helpers function.
800 */
801/**
06ec9070 802 * amdgpu_device_doorbell_init - Init doorbell driver information.
d38ceaf9
AD
803 *
804 * @adev: amdgpu_device pointer
805 *
806 * Init doorbell driver information (CIK)
807 * Returns 0 on success, error on failure.
808 */
06ec9070 809static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
d38ceaf9 810{
6585661d 811
705e519e
CK
812 /* No doorbell on SI hardware generation */
813 if (adev->asic_type < CHIP_BONAIRE) {
814 adev->doorbell.base = 0;
815 adev->doorbell.size = 0;
816 adev->doorbell.num_doorbells = 0;
817 adev->doorbell.ptr = NULL;
818 return 0;
819 }
820
d6895ad3
CK
821 if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
822 return -EINVAL;
823
22357775
AD
824 amdgpu_asic_init_doorbell_index(adev);
825
d38ceaf9
AD
826 /* doorbell bar mapping */
827 adev->doorbell.base = pci_resource_start(adev->pdev, 2);
828 adev->doorbell.size = pci_resource_len(adev->pdev, 2);
829
edf600da 830 adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
9564f192 831 adev->doorbell_index.max_assignment+1);
d38ceaf9
AD
832 if (adev->doorbell.num_doorbells == 0)
833 return -EINVAL;
834
ec3db8a6 835 /* For Vega, reserve and map two pages on doorbell BAR since SDMA
88dc26e4
OZ
836 * paging queue doorbell use the second page. The
837 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
838 * doorbells are in the first page. So with paging queue enabled,
839 * the max num_doorbells should + 1 page (0x400 in dword)
ec3db8a6
PY
840 */
841 if (adev->asic_type >= CHIP_VEGA10)
88dc26e4 842 adev->doorbell.num_doorbells += 0x400;
ec3db8a6 843
8972e5d2
CK
844 adev->doorbell.ptr = ioremap(adev->doorbell.base,
845 adev->doorbell.num_doorbells *
846 sizeof(u32));
847 if (adev->doorbell.ptr == NULL)
d38ceaf9 848 return -ENOMEM;
d38ceaf9
AD
849
850 return 0;
851}
852
853/**
06ec9070 854 * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
d38ceaf9
AD
855 *
856 * @adev: amdgpu_device pointer
857 *
858 * Tear down doorbell driver information (CIK)
859 */
06ec9070 860static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
d38ceaf9
AD
861{
862 iounmap(adev->doorbell.ptr);
863 adev->doorbell.ptr = NULL;
864}
865
22cb0164 866
d38ceaf9
AD
867
868/*
06ec9070 869 * amdgpu_device_wb_*()
455a7bc2 870 * Writeback is the method by which the GPU updates special pages in memory
ea81a173 871 * with the status of certain GPU events (fences, ring pointers,etc.).
d38ceaf9
AD
872 */
873
874/**
06ec9070 875 * amdgpu_device_wb_fini - Disable Writeback and free memory
d38ceaf9
AD
876 *
877 * @adev: amdgpu_device pointer
878 *
879 * Disables Writeback and frees the Writeback memory (all asics).
880 * Used at driver shutdown.
881 */
06ec9070 882static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
d38ceaf9
AD
883{
884 if (adev->wb.wb_obj) {
a76ed485
AD
885 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
886 &adev->wb.gpu_addr,
887 (void **)&adev->wb.wb);
d38ceaf9
AD
888 adev->wb.wb_obj = NULL;
889 }
890}
891
892/**
06ec9070 893 * amdgpu_device_wb_init- Init Writeback driver info and allocate memory
d38ceaf9
AD
894 *
895 * @adev: amdgpu_device pointer
896 *
455a7bc2 897 * Initializes writeback and allocates writeback memory (all asics).
d38ceaf9
AD
898 * Used at driver startup.
899 * Returns 0 on success or an -error on failure.
900 */
06ec9070 901static int amdgpu_device_wb_init(struct amdgpu_device *adev)
d38ceaf9
AD
902{
903 int r;
904
905 if (adev->wb.wb_obj == NULL) {
97407b63
AD
906 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
907 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
a76ed485
AD
908 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
909 &adev->wb.wb_obj, &adev->wb.gpu_addr,
910 (void **)&adev->wb.wb);
d38ceaf9
AD
911 if (r) {
912 dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
913 return r;
914 }
d38ceaf9
AD
915
916 adev->wb.num_wb = AMDGPU_MAX_WB;
917 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
918
919 /* clear wb memory */
73469585 920 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
d38ceaf9
AD
921 }
922
923 return 0;
924}
925
926/**
131b4b36 927 * amdgpu_device_wb_get - Allocate a wb entry
d38ceaf9
AD
928 *
929 * @adev: amdgpu_device pointer
930 * @wb: wb index
931 *
932 * Allocate a wb slot for use by the driver (all asics).
933 * Returns 0 on success or -EINVAL on failure.
934 */
131b4b36 935int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
d38ceaf9
AD
936{
937 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
d38ceaf9 938
97407b63 939 if (offset < adev->wb.num_wb) {
7014285a 940 __set_bit(offset, adev->wb.used);
63ae07ca 941 *wb = offset << 3; /* convert to dw offset */
0915fdbc
ML
942 return 0;
943 } else {
944 return -EINVAL;
945 }
946}
947
d38ceaf9 948/**
131b4b36 949 * amdgpu_device_wb_free - Free a wb entry
d38ceaf9
AD
950 *
951 * @adev: amdgpu_device pointer
952 * @wb: wb index
953 *
954 * Free a wb slot allocated for use by the driver (all asics)
955 */
131b4b36 956void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
d38ceaf9 957{
73469585 958 wb >>= 3;
d38ceaf9 959 if (wb < adev->wb.num_wb)
73469585 960 __clear_bit(wb, adev->wb.used);
d38ceaf9
AD
961}
962
d6895ad3
CK
963/**
964 * amdgpu_device_resize_fb_bar - try to resize FB BAR
965 *
966 * @adev: amdgpu_device pointer
967 *
968 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
969 * to fail, but if any of the BARs is not accessible after the size we abort
970 * driver loading by returning -ENODEV.
971 */
972int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
973{
770d13b1 974 u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size);
d6895ad3 975 u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1;
31b8adab
CK
976 struct pci_bus *root;
977 struct resource *res;
978 unsigned i;
d6895ad3
CK
979 u16 cmd;
980 int r;
981
0c03b912 982 /* Bypass for VF */
983 if (amdgpu_sriov_vf(adev))
984 return 0;
985
b7221f2b
AD
986 /* skip if the bios has already enabled large BAR */
987 if (adev->gmc.real_vram_size &&
988 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
989 return 0;
990
31b8adab
CK
991 /* Check if the root BUS has 64bit memory resources */
992 root = adev->pdev->bus;
993 while (root->parent)
994 root = root->parent;
995
996 pci_bus_for_each_resource(root, res, i) {
0ebb7c54 997 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
31b8adab
CK
998 res->start > 0x100000000ull)
999 break;
1000 }
1001
1002 /* Trying to resize is pointless without a root hub window above 4GB */
1003 if (!res)
1004 return 0;
1005
d6895ad3
CK
1006 /* Disable memory decoding while we change the BAR addresses and size */
1007 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1008 pci_write_config_word(adev->pdev, PCI_COMMAND,
1009 cmd & ~PCI_COMMAND_MEMORY);
1010
1011 /* Free the VRAM and doorbell BAR, we most likely need to move both. */
06ec9070 1012 amdgpu_device_doorbell_fini(adev);
d6895ad3
CK
1013 if (adev->asic_type >= CHIP_BONAIRE)
1014 pci_release_resource(adev->pdev, 2);
1015
1016 pci_release_resource(adev->pdev, 0);
1017
1018 r = pci_resize_resource(adev->pdev, 0, rbar_size);
1019 if (r == -ENOSPC)
1020 DRM_INFO("Not enough PCI address space for a large BAR.");
1021 else if (r && r != -ENOTSUPP)
1022 DRM_ERROR("Problem resizing BAR0 (%d).", r);
1023
1024 pci_assign_unassigned_bus_resources(adev->pdev->bus);
1025
1026 /* When the doorbell or fb BAR isn't available we have no chance of
1027 * using the device.
1028 */
06ec9070 1029 r = amdgpu_device_doorbell_init(adev);
d6895ad3
CK
1030 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1031 return -ENODEV;
1032
1033 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1034
1035 return 0;
1036}
a05502e5 1037
d38ceaf9
AD
1038/*
1039 * GPU helpers function.
1040 */
1041/**
39c640c0 1042 * amdgpu_device_need_post - check if the hw need post or not
d38ceaf9
AD
1043 *
1044 * @adev: amdgpu_device pointer
1045 *
c836fec5
JQ
1046 * Check if the asic has been initialized (all asics) at driver startup
1047 * or post is needed if hw reset is performed.
1048 * Returns true if need or false if not.
d38ceaf9 1049 */
39c640c0 1050bool amdgpu_device_need_post(struct amdgpu_device *adev)
d38ceaf9
AD
1051{
1052 uint32_t reg;
1053
bec86378
ML
1054 if (amdgpu_sriov_vf(adev))
1055 return false;
1056
1057 if (amdgpu_passthrough(adev)) {
1da2c326
ML
1058 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
1059 * some old smc fw still need driver do vPost otherwise gpu hang, while
1060 * those smc fw version above 22.15 doesn't have this flaw, so we force
1061 * vpost executed for smc version below 22.15
bec86378
ML
1062 */
1063 if (adev->asic_type == CHIP_FIJI) {
1064 int err;
1065 uint32_t fw_ver;
1066 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1067 /* force vPost if error occured */
1068 if (err)
1069 return true;
1070
1071 fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1da2c326
ML
1072 if (fw_ver < 0x00160e00)
1073 return true;
bec86378 1074 }
bec86378 1075 }
91fe77eb 1076
1077 if (adev->has_hw_reset) {
1078 adev->has_hw_reset = false;
1079 return true;
1080 }
1081
1082 /* bios scratch used on CIK+ */
1083 if (adev->asic_type >= CHIP_BONAIRE)
1084 return amdgpu_atombios_scratch_need_asic_init(adev);
1085
1086 /* check MEM_SIZE for older asics */
1087 reg = amdgpu_asic_get_config_memsize(adev);
1088
1089 if ((reg != 0) && (reg != 0xffffffff))
1090 return false;
1091
1092 return true;
bec86378
ML
1093}
1094
d38ceaf9
AD
1095/* if we get transitioned to only one device, take VGA back */
1096/**
06ec9070 1097 * amdgpu_device_vga_set_decode - enable/disable vga decode
d38ceaf9
AD
1098 *
1099 * @cookie: amdgpu_device pointer
1100 * @state: enable/disable vga decode
1101 *
1102 * Enable/disable vga decode (all asics).
1103 * Returns VGA resource flags.
1104 */
06ec9070 1105static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
d38ceaf9
AD
1106{
1107 struct amdgpu_device *adev = cookie;
1108 amdgpu_asic_set_vga_state(adev, state);
1109 if (state)
1110 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1111 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1112 else
1113 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1114}
1115
e3ecdffa
AD
1116/**
1117 * amdgpu_device_check_block_size - validate the vm block size
1118 *
1119 * @adev: amdgpu_device pointer
1120 *
1121 * Validates the vm block size specified via module parameter.
1122 * The vm block size defines number of bits in page table versus page directory,
1123 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1124 * page table and the remaining bits are in the page directory.
1125 */
06ec9070 1126static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
a1adf8be
CZ
1127{
1128 /* defines number of bits in page table versus page directory,
1129 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1130 * page table and the remaining bits are in the page directory */
bab4fee7
JZ
1131 if (amdgpu_vm_block_size == -1)
1132 return;
a1adf8be 1133
bab4fee7 1134 if (amdgpu_vm_block_size < 9) {
a1adf8be
CZ
1135 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1136 amdgpu_vm_block_size);
97489129 1137 amdgpu_vm_block_size = -1;
a1adf8be 1138 }
a1adf8be
CZ
1139}
1140
e3ecdffa
AD
1141/**
1142 * amdgpu_device_check_vm_size - validate the vm size
1143 *
1144 * @adev: amdgpu_device pointer
1145 *
1146 * Validates the vm size in GB specified via module parameter.
1147 * The VM size is the size of the GPU virtual memory space in GB.
1148 */
06ec9070 1149static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
83ca145d 1150{
64dab074
AD
1151 /* no need to check the default value */
1152 if (amdgpu_vm_size == -1)
1153 return;
1154
83ca145d
ZJ
1155 if (amdgpu_vm_size < 1) {
1156 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1157 amdgpu_vm_size);
f3368128 1158 amdgpu_vm_size = -1;
83ca145d 1159 }
83ca145d
ZJ
1160}
1161
7951e376
RZ
1162static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1163{
1164 struct sysinfo si;
a9d4fe2f 1165 bool is_os_64 = (sizeof(void *) == 8);
7951e376
RZ
1166 uint64_t total_memory;
1167 uint64_t dram_size_seven_GB = 0x1B8000000;
1168 uint64_t dram_size_three_GB = 0xB8000000;
1169
1170 if (amdgpu_smu_memory_pool_size == 0)
1171 return;
1172
1173 if (!is_os_64) {
1174 DRM_WARN("Not 64-bit OS, feature not supported\n");
1175 goto def_value;
1176 }
1177 si_meminfo(&si);
1178 total_memory = (uint64_t)si.totalram * si.mem_unit;
1179
1180 if ((amdgpu_smu_memory_pool_size == 1) ||
1181 (amdgpu_smu_memory_pool_size == 2)) {
1182 if (total_memory < dram_size_three_GB)
1183 goto def_value1;
1184 } else if ((amdgpu_smu_memory_pool_size == 4) ||
1185 (amdgpu_smu_memory_pool_size == 8)) {
1186 if (total_memory < dram_size_seven_GB)
1187 goto def_value1;
1188 } else {
1189 DRM_WARN("Smu memory pool size not supported\n");
1190 goto def_value;
1191 }
1192 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1193
1194 return;
1195
1196def_value1:
1197 DRM_WARN("No enough system memory\n");
1198def_value:
1199 adev->pm.smu_prv_buffer_size = 0;
1200}
1201
d38ceaf9 1202/**
06ec9070 1203 * amdgpu_device_check_arguments - validate module params
d38ceaf9
AD
1204 *
1205 * @adev: amdgpu_device pointer
1206 *
1207 * Validates certain module parameters and updates
1208 * the associated values used by the driver (all asics).
1209 */
912dfc84 1210static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
d38ceaf9 1211{
5b011235
CZ
1212 if (amdgpu_sched_jobs < 4) {
1213 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1214 amdgpu_sched_jobs);
1215 amdgpu_sched_jobs = 4;
76117507 1216 } else if (!is_power_of_2(amdgpu_sched_jobs)){
5b011235
CZ
1217 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1218 amdgpu_sched_jobs);
1219 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1220 }
d38ceaf9 1221
83e74db6 1222 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
f9321cc4
CK
1223 /* gart size must be greater or equal to 32M */
1224 dev_warn(adev->dev, "gart size (%d) too small\n",
1225 amdgpu_gart_size);
83e74db6 1226 amdgpu_gart_size = -1;
d38ceaf9
AD
1227 }
1228
36d38372 1229 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
c4e1a13a 1230 /* gtt size must be greater or equal to 32M */
36d38372
CK
1231 dev_warn(adev->dev, "gtt size (%d) too small\n",
1232 amdgpu_gtt_size);
1233 amdgpu_gtt_size = -1;
d38ceaf9
AD
1234 }
1235
d07f14be
RH
1236 /* valid range is between 4 and 9 inclusive */
1237 if (amdgpu_vm_fragment_size != -1 &&
1238 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1239 dev_warn(adev->dev, "valid range is between 4 and 9\n");
1240 amdgpu_vm_fragment_size = -1;
1241 }
1242
5d5bd5e3
KW
1243 if (amdgpu_sched_hw_submission < 2) {
1244 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1245 amdgpu_sched_hw_submission);
1246 amdgpu_sched_hw_submission = 2;
1247 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1248 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1249 amdgpu_sched_hw_submission);
1250 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1251 }
1252
7951e376
RZ
1253 amdgpu_device_check_smu_prv_buffer_size(adev);
1254
06ec9070 1255 amdgpu_device_check_vm_size(adev);
d38ceaf9 1256
06ec9070 1257 amdgpu_device_check_block_size(adev);
6a7f76e7 1258
19aede77 1259 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
912dfc84 1260
c6252390 1261 amdgpu_gmc_tmz_set(adev);
01a8dcec 1262
a300de40
ML
1263 if (amdgpu_num_kcq > 8 || amdgpu_num_kcq < 0) {
1264 amdgpu_num_kcq = 8;
c16ce562 1265 dev_warn(adev->dev, "set kernel compute queue number to 8 due to invalid parameter provided by user\n");
a300de40
ML
1266 }
1267
e3c00faa 1268 return 0;
d38ceaf9
AD
1269}
1270
1271/**
1272 * amdgpu_switcheroo_set_state - set switcheroo state
1273 *
1274 * @pdev: pci dev pointer
1694467b 1275 * @state: vga_switcheroo state
d38ceaf9
AD
1276 *
1277 * Callback for the switcheroo driver. Suspends or resumes the
1278 * the asics before or after it is powered up using ACPI methods.
1279 */
8aba21b7
LT
1280static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1281 enum vga_switcheroo_state state)
d38ceaf9
AD
1282{
1283 struct drm_device *dev = pci_get_drvdata(pdev);
de185019 1284 int r;
d38ceaf9 1285
31af062a 1286 if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF)
d38ceaf9
AD
1287 return;
1288
1289 if (state == VGA_SWITCHEROO_ON) {
dd4fa6c1 1290 pr_info("switched on\n");
d38ceaf9
AD
1291 /* don't suspend or resume card normally */
1292 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1293
de185019 1294 pci_set_power_state(dev->pdev, PCI_D0);
c1dd4aa6 1295 amdgpu_device_load_pci_state(dev->pdev);
de185019
AD
1296 r = pci_enable_device(dev->pdev);
1297 if (r)
1298 DRM_WARN("pci_enable_device failed (%d)\n", r);
1299 amdgpu_device_resume(dev, true);
d38ceaf9 1300
d38ceaf9
AD
1301 dev->switch_power_state = DRM_SWITCH_POWER_ON;
1302 drm_kms_helper_poll_enable(dev);
1303 } else {
dd4fa6c1 1304 pr_info("switched off\n");
d38ceaf9
AD
1305 drm_kms_helper_poll_disable(dev);
1306 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
de185019 1307 amdgpu_device_suspend(dev, true);
c1dd4aa6 1308 amdgpu_device_cache_pci_state(dev->pdev);
de185019
AD
1309 /* Shut down the device */
1310 pci_disable_device(dev->pdev);
1311 pci_set_power_state(dev->pdev, PCI_D3cold);
d38ceaf9
AD
1312 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1313 }
1314}
1315
1316/**
1317 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1318 *
1319 * @pdev: pci dev pointer
1320 *
1321 * Callback for the switcheroo driver. Check of the switcheroo
1322 * state can be changed.
1323 * Returns true if the state can be changed, false if not.
1324 */
1325static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1326{
1327 struct drm_device *dev = pci_get_drvdata(pdev);
1328
1329 /*
1330 * FIXME: open_count is protected by drm_global_mutex but that would lead to
1331 * locking inversion with the driver load path. And the access here is
1332 * completely racy anyway. So don't bother with locking for now.
1333 */
7e13ad89 1334 return atomic_read(&dev->open_count) == 0;
d38ceaf9
AD
1335}
1336
1337static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1338 .set_gpu_state = amdgpu_switcheroo_set_state,
1339 .reprobe = NULL,
1340 .can_switch = amdgpu_switcheroo_can_switch,
1341};
1342
e3ecdffa
AD
1343/**
1344 * amdgpu_device_ip_set_clockgating_state - set the CG state
1345 *
87e3f136 1346 * @dev: amdgpu_device pointer
e3ecdffa
AD
1347 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1348 * @state: clockgating state (gate or ungate)
1349 *
1350 * Sets the requested clockgating state for all instances of
1351 * the hardware IP specified.
1352 * Returns the error code from the last instance.
1353 */
43fa561f 1354int amdgpu_device_ip_set_clockgating_state(void *dev,
2990a1fc
AD
1355 enum amd_ip_block_type block_type,
1356 enum amd_clockgating_state state)
d38ceaf9 1357{
43fa561f 1358 struct amdgpu_device *adev = dev;
d38ceaf9
AD
1359 int i, r = 0;
1360
1361 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 1362 if (!adev->ip_blocks[i].status.valid)
9ecbe7f5 1363 continue;
c722865a
RZ
1364 if (adev->ip_blocks[i].version->type != block_type)
1365 continue;
1366 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1367 continue;
1368 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1369 (void *)adev, state);
1370 if (r)
1371 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1372 adev->ip_blocks[i].version->funcs->name, r);
d38ceaf9
AD
1373 }
1374 return r;
1375}
1376
e3ecdffa
AD
1377/**
1378 * amdgpu_device_ip_set_powergating_state - set the PG state
1379 *
87e3f136 1380 * @dev: amdgpu_device pointer
e3ecdffa
AD
1381 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1382 * @state: powergating state (gate or ungate)
1383 *
1384 * Sets the requested powergating state for all instances of
1385 * the hardware IP specified.
1386 * Returns the error code from the last instance.
1387 */
43fa561f 1388int amdgpu_device_ip_set_powergating_state(void *dev,
2990a1fc
AD
1389 enum amd_ip_block_type block_type,
1390 enum amd_powergating_state state)
d38ceaf9 1391{
43fa561f 1392 struct amdgpu_device *adev = dev;
d38ceaf9
AD
1393 int i, r = 0;
1394
1395 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 1396 if (!adev->ip_blocks[i].status.valid)
9ecbe7f5 1397 continue;
c722865a
RZ
1398 if (adev->ip_blocks[i].version->type != block_type)
1399 continue;
1400 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1401 continue;
1402 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1403 (void *)adev, state);
1404 if (r)
1405 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1406 adev->ip_blocks[i].version->funcs->name, r);
d38ceaf9
AD
1407 }
1408 return r;
1409}
1410
e3ecdffa
AD
1411/**
1412 * amdgpu_device_ip_get_clockgating_state - get the CG state
1413 *
1414 * @adev: amdgpu_device pointer
1415 * @flags: clockgating feature flags
1416 *
1417 * Walks the list of IPs on the device and updates the clockgating
1418 * flags for each IP.
1419 * Updates @flags with the feature flags for each hardware IP where
1420 * clockgating is enabled.
1421 */
2990a1fc
AD
1422void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1423 u32 *flags)
6cb2d4e4
HR
1424{
1425 int i;
1426
1427 for (i = 0; i < adev->num_ip_blocks; i++) {
1428 if (!adev->ip_blocks[i].status.valid)
1429 continue;
1430 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1431 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1432 }
1433}
1434
e3ecdffa
AD
1435/**
1436 * amdgpu_device_ip_wait_for_idle - wait for idle
1437 *
1438 * @adev: amdgpu_device pointer
1439 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1440 *
1441 * Waits for the request hardware IP to be idle.
1442 * Returns 0 for success or a negative error code on failure.
1443 */
2990a1fc
AD
1444int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1445 enum amd_ip_block_type block_type)
5dbbb60b
AD
1446{
1447 int i, r;
1448
1449 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 1450 if (!adev->ip_blocks[i].status.valid)
9ecbe7f5 1451 continue;
a1255107
AD
1452 if (adev->ip_blocks[i].version->type == block_type) {
1453 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
5dbbb60b
AD
1454 if (r)
1455 return r;
1456 break;
1457 }
1458 }
1459 return 0;
1460
1461}
1462
e3ecdffa
AD
1463/**
1464 * amdgpu_device_ip_is_idle - is the hardware IP idle
1465 *
1466 * @adev: amdgpu_device pointer
1467 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1468 *
1469 * Check if the hardware IP is idle or not.
1470 * Returns true if it the IP is idle, false if not.
1471 */
2990a1fc
AD
1472bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1473 enum amd_ip_block_type block_type)
5dbbb60b
AD
1474{
1475 int i;
1476
1477 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 1478 if (!adev->ip_blocks[i].status.valid)
9ecbe7f5 1479 continue;
a1255107
AD
1480 if (adev->ip_blocks[i].version->type == block_type)
1481 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
5dbbb60b
AD
1482 }
1483 return true;
1484
1485}
1486
e3ecdffa
AD
1487/**
1488 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1489 *
1490 * @adev: amdgpu_device pointer
87e3f136 1491 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
e3ecdffa
AD
1492 *
1493 * Returns a pointer to the hardware IP block structure
1494 * if it exists for the asic, otherwise NULL.
1495 */
2990a1fc
AD
1496struct amdgpu_ip_block *
1497amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1498 enum amd_ip_block_type type)
d38ceaf9
AD
1499{
1500 int i;
1501
1502 for (i = 0; i < adev->num_ip_blocks; i++)
a1255107 1503 if (adev->ip_blocks[i].version->type == type)
d38ceaf9
AD
1504 return &adev->ip_blocks[i];
1505
1506 return NULL;
1507}
1508
1509/**
2990a1fc 1510 * amdgpu_device_ip_block_version_cmp
d38ceaf9
AD
1511 *
1512 * @adev: amdgpu_device pointer
5fc3aeeb 1513 * @type: enum amd_ip_block_type
d38ceaf9
AD
1514 * @major: major version
1515 * @minor: minor version
1516 *
1517 * return 0 if equal or greater
1518 * return 1 if smaller or the ip_block doesn't exist
1519 */
2990a1fc
AD
1520int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1521 enum amd_ip_block_type type,
1522 u32 major, u32 minor)
d38ceaf9 1523{
2990a1fc 1524 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
d38ceaf9 1525
a1255107
AD
1526 if (ip_block && ((ip_block->version->major > major) ||
1527 ((ip_block->version->major == major) &&
1528 (ip_block->version->minor >= minor))))
d38ceaf9
AD
1529 return 0;
1530
1531 return 1;
1532}
1533
a1255107 1534/**
2990a1fc 1535 * amdgpu_device_ip_block_add
a1255107
AD
1536 *
1537 * @adev: amdgpu_device pointer
1538 * @ip_block_version: pointer to the IP to add
1539 *
1540 * Adds the IP block driver information to the collection of IPs
1541 * on the asic.
1542 */
2990a1fc
AD
1543int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1544 const struct amdgpu_ip_block_version *ip_block_version)
a1255107
AD
1545{
1546 if (!ip_block_version)
1547 return -EINVAL;
1548
e966a725 1549 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
a0bae357
HR
1550 ip_block_version->funcs->name);
1551
a1255107
AD
1552 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1553
1554 return 0;
1555}
1556
e3ecdffa
AD
1557/**
1558 * amdgpu_device_enable_virtual_display - enable virtual display feature
1559 *
1560 * @adev: amdgpu_device pointer
1561 *
1562 * Enabled the virtual display feature if the user has enabled it via
1563 * the module parameter virtual_display. This feature provides a virtual
1564 * display hardware on headless boards or in virtualized environments.
1565 * This function parses and validates the configuration string specified by
1566 * the user and configues the virtual display configuration (number of
1567 * virtual connectors, crtcs, etc.) specified.
1568 */
483ef985 1569static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
9accf2fd
ED
1570{
1571 adev->enable_virtual_display = false;
1572
1573 if (amdgpu_virtual_display) {
4a580877 1574 struct drm_device *ddev = adev_to_drm(adev);
9accf2fd 1575 const char *pci_address_name = pci_name(ddev->pdev);
0f66356d 1576 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
9accf2fd
ED
1577
1578 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1579 pciaddstr_tmp = pciaddstr;
0f66356d
ED
1580 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1581 pciaddname = strsep(&pciaddname_tmp, ",");
967de2a9
YT
1582 if (!strcmp("all", pciaddname)
1583 || !strcmp(pci_address_name, pciaddname)) {
0f66356d
ED
1584 long num_crtc;
1585 int res = -1;
1586
9accf2fd 1587 adev->enable_virtual_display = true;
0f66356d
ED
1588
1589 if (pciaddname_tmp)
1590 res = kstrtol(pciaddname_tmp, 10,
1591 &num_crtc);
1592
1593 if (!res) {
1594 if (num_crtc < 1)
1595 num_crtc = 1;
1596 if (num_crtc > 6)
1597 num_crtc = 6;
1598 adev->mode_info.num_crtc = num_crtc;
1599 } else {
1600 adev->mode_info.num_crtc = 1;
1601 }
9accf2fd
ED
1602 break;
1603 }
1604 }
1605
0f66356d
ED
1606 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1607 amdgpu_virtual_display, pci_address_name,
1608 adev->enable_virtual_display, adev->mode_info.num_crtc);
9accf2fd
ED
1609
1610 kfree(pciaddstr);
1611 }
1612}
1613
e3ecdffa
AD
1614/**
1615 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1616 *
1617 * @adev: amdgpu_device pointer
1618 *
1619 * Parses the asic configuration parameters specified in the gpu info
1620 * firmware and makes them availale to the driver for use in configuring
1621 * the asic.
1622 * Returns 0 on success, -EINVAL on failure.
1623 */
e2a75f88
AD
1624static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1625{
e2a75f88 1626 const char *chip_name;
c0a43457 1627 char fw_name[40];
e2a75f88
AD
1628 int err;
1629 const struct gpu_info_firmware_header_v1_0 *hdr;
1630
ab4fe3e1
HR
1631 adev->firmware.gpu_info_fw = NULL;
1632
72de33f8 1633 if (adev->mman.discovery_bin) {
258620d0 1634 amdgpu_discovery_get_gfx_info(adev);
cc375d8c
TY
1635
1636 /*
1637 * FIXME: The bounding box is still needed by Navi12, so
1638 * temporarily read it from gpu_info firmware. Should be droped
1639 * when DAL no longer needs it.
1640 */
1641 if (adev->asic_type != CHIP_NAVI12)
1642 return 0;
258620d0
AD
1643 }
1644
e2a75f88 1645 switch (adev->asic_type) {
e2a75f88
AD
1646#ifdef CONFIG_DRM_AMDGPU_SI
1647 case CHIP_VERDE:
1648 case CHIP_TAHITI:
1649 case CHIP_PITCAIRN:
1650 case CHIP_OLAND:
1651 case CHIP_HAINAN:
1652#endif
1653#ifdef CONFIG_DRM_AMDGPU_CIK
1654 case CHIP_BONAIRE:
1655 case CHIP_HAWAII:
1656 case CHIP_KAVERI:
1657 case CHIP_KABINI:
1658 case CHIP_MULLINS:
1659#endif
da87c30b
AD
1660 case CHIP_TOPAZ:
1661 case CHIP_TONGA:
1662 case CHIP_FIJI:
1663 case CHIP_POLARIS10:
1664 case CHIP_POLARIS11:
1665 case CHIP_POLARIS12:
1666 case CHIP_VEGAM:
1667 case CHIP_CARRIZO:
1668 case CHIP_STONEY:
27c0bc71 1669 case CHIP_VEGA20:
e2a75f88
AD
1670 default:
1671 return 0;
1672 case CHIP_VEGA10:
1673 chip_name = "vega10";
1674 break;
3f76dced
AD
1675 case CHIP_VEGA12:
1676 chip_name = "vega12";
1677 break;
2d2e5e7e 1678 case CHIP_RAVEN:
54f78a76 1679 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
54c4d17e 1680 chip_name = "raven2";
54f78a76 1681 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
741deade 1682 chip_name = "picasso";
54c4d17e
FX
1683 else
1684 chip_name = "raven";
2d2e5e7e 1685 break;
65e60f6e
LM
1686 case CHIP_ARCTURUS:
1687 chip_name = "arcturus";
1688 break;
b51a26a0
HR
1689 case CHIP_RENOIR:
1690 chip_name = "renoir";
1691 break;
23c6268e
HR
1692 case CHIP_NAVI10:
1693 chip_name = "navi10";
1694 break;
ed42cfe1
XY
1695 case CHIP_NAVI14:
1696 chip_name = "navi14";
1697 break;
42b325e5
XY
1698 case CHIP_NAVI12:
1699 chip_name = "navi12";
1700 break;
c0a43457
LG
1701 case CHIP_SIENNA_CICHLID:
1702 chip_name = "sienna_cichlid";
1703 break;
120eb833
JC
1704 case CHIP_NAVY_FLOUNDER:
1705 chip_name = "navy_flounder";
1706 break;
e2a75f88
AD
1707 }
1708
1709 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
ab4fe3e1 1710 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
e2a75f88
AD
1711 if (err) {
1712 dev_err(adev->dev,
1713 "Failed to load gpu_info firmware \"%s\"\n",
1714 fw_name);
1715 goto out;
1716 }
ab4fe3e1 1717 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
e2a75f88
AD
1718 if (err) {
1719 dev_err(adev->dev,
1720 "Failed to validate gpu_info firmware \"%s\"\n",
1721 fw_name);
1722 goto out;
1723 }
1724
ab4fe3e1 1725 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
e2a75f88
AD
1726 amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1727
1728 switch (hdr->version_major) {
1729 case 1:
1730 {
1731 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
ab4fe3e1 1732 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
e2a75f88
AD
1733 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1734
cc375d8c
TY
1735 /*
1736 * Should be droped when DAL no longer needs it.
1737 */
1738 if (adev->asic_type == CHIP_NAVI12)
ec51d3fa
XY
1739 goto parse_soc_bounding_box;
1740
b5ab16bf
AD
1741 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1742 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1743 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1744 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
e2a75f88 1745 adev->gfx.config.max_texture_channel_caches =
b5ab16bf
AD
1746 le32_to_cpu(gpu_info_fw->gc_num_tccs);
1747 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1748 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1749 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1750 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
e2a75f88 1751 adev->gfx.config.double_offchip_lds_buf =
b5ab16bf
AD
1752 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1753 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
51fd0370
HZ
1754 adev->gfx.cu_info.max_waves_per_simd =
1755 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1756 adev->gfx.cu_info.max_scratch_slots_per_cu =
1757 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
1758 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
48321c3d 1759 if (hdr->version_minor >= 1) {
35c2e910
HZ
1760 const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1761 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1762 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1763 adev->gfx.config.num_sc_per_sh =
1764 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
1765 adev->gfx.config.num_packer_per_sc =
1766 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
1767 }
ec51d3fa
XY
1768
1769parse_soc_bounding_box:
ec51d3fa
XY
1770 /*
1771 * soc bounding box info is not integrated into the discovery table,
258620d0 1772 * so we still need to parse it from the gpu info firmware when needed.
ec51d3fa 1773 */
48321c3d
HW
1774 if (hdr->version_minor == 2) {
1775 const struct gpu_info_firmware_v1_2 *gpu_info_fw =
1776 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
1777 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1778 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
1779 }
e2a75f88
AD
1780 break;
1781 }
1782 default:
1783 dev_err(adev->dev,
1784 "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
1785 err = -EINVAL;
1786 goto out;
1787 }
1788out:
e2a75f88
AD
1789 return err;
1790}
1791
e3ecdffa
AD
1792/**
1793 * amdgpu_device_ip_early_init - run early init for hardware IPs
1794 *
1795 * @adev: amdgpu_device pointer
1796 *
1797 * Early initialization pass for hardware IPs. The hardware IPs that make
1798 * up each asic are discovered and each IP's early_init callback is run. This
1799 * is the first stage in initializing the asic.
1800 * Returns 0 on success, negative error code on failure.
1801 */
06ec9070 1802static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
d38ceaf9 1803{
aaa36a97 1804 int i, r;
d38ceaf9 1805
483ef985 1806 amdgpu_device_enable_virtual_display(adev);
a6be7570 1807
00a979f3 1808 if (amdgpu_sriov_vf(adev)) {
00a979f3 1809 r = amdgpu_virt_request_full_gpu(adev, true);
aaa36a97
AD
1810 if (r)
1811 return r;
00a979f3
WS
1812 }
1813
d38ceaf9 1814 switch (adev->asic_type) {
33f34802
KW
1815#ifdef CONFIG_DRM_AMDGPU_SI
1816 case CHIP_VERDE:
1817 case CHIP_TAHITI:
1818 case CHIP_PITCAIRN:
1819 case CHIP_OLAND:
1820 case CHIP_HAINAN:
295d0daf 1821 adev->family = AMDGPU_FAMILY_SI;
33f34802
KW
1822 r = si_set_ip_blocks(adev);
1823 if (r)
1824 return r;
1825 break;
1826#endif
a2e73f56
AD
1827#ifdef CONFIG_DRM_AMDGPU_CIK
1828 case CHIP_BONAIRE:
1829 case CHIP_HAWAII:
1830 case CHIP_KAVERI:
1831 case CHIP_KABINI:
1832 case CHIP_MULLINS:
e1ad2d53 1833 if (adev->flags & AMD_IS_APU)
a2e73f56 1834 adev->family = AMDGPU_FAMILY_KV;
e1ad2d53
AD
1835 else
1836 adev->family = AMDGPU_FAMILY_CI;
a2e73f56
AD
1837
1838 r = cik_set_ip_blocks(adev);
1839 if (r)
1840 return r;
1841 break;
1842#endif
da87c30b
AD
1843 case CHIP_TOPAZ:
1844 case CHIP_TONGA:
1845 case CHIP_FIJI:
1846 case CHIP_POLARIS10:
1847 case CHIP_POLARIS11:
1848 case CHIP_POLARIS12:
1849 case CHIP_VEGAM:
1850 case CHIP_CARRIZO:
1851 case CHIP_STONEY:
1852 if (adev->flags & AMD_IS_APU)
1853 adev->family = AMDGPU_FAMILY_CZ;
1854 else
1855 adev->family = AMDGPU_FAMILY_VI;
1856
1857 r = vi_set_ip_blocks(adev);
1858 if (r)
1859 return r;
1860 break;
e48a3cd9
AD
1861 case CHIP_VEGA10:
1862 case CHIP_VEGA12:
e4bd8170 1863 case CHIP_VEGA20:
e48a3cd9 1864 case CHIP_RAVEN:
61cf44c1 1865 case CHIP_ARCTURUS:
b51a26a0 1866 case CHIP_RENOIR:
70534d1e 1867 if (adev->flags & AMD_IS_APU)
2ca8a5d2
CZ
1868 adev->family = AMDGPU_FAMILY_RV;
1869 else
1870 adev->family = AMDGPU_FAMILY_AI;
460826e6
KW
1871
1872 r = soc15_set_ip_blocks(adev);
1873 if (r)
1874 return r;
1875 break;
0a5b8c7b 1876 case CHIP_NAVI10:
7ecb5cd4 1877 case CHIP_NAVI14:
4808cf9c 1878 case CHIP_NAVI12:
11e8aef5 1879 case CHIP_SIENNA_CICHLID:
41f446bf 1880 case CHIP_NAVY_FLOUNDER:
0a5b8c7b
HR
1881 adev->family = AMDGPU_FAMILY_NV;
1882
1883 r = nv_set_ip_blocks(adev);
1884 if (r)
1885 return r;
1886 break;
d38ceaf9
AD
1887 default:
1888 /* FIXME: not supported yet */
1889 return -EINVAL;
1890 }
1891
1884734a 1892 amdgpu_amdkfd_device_probe(adev);
1893
3b94fb10 1894 adev->pm.pp_feature = amdgpu_pp_feature_mask;
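	/* GFXOFF is force-disabled for SR-IOV VFs and when KFD runs without HWS */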
a35ad98b 1895 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
00544006 1896 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
00f54b97 1897
d38ceaf9
AD
1898 for (i = 0; i < adev->num_ip_blocks; i++) {
1899 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
ed8cf00c
HR
1900 DRM_ERROR("disabled ip block: %d <%s>\n",
1901 i, adev->ip_blocks[i].version->funcs->name);
a1255107 1902 adev->ip_blocks[i].status.valid = false;
d38ceaf9 1903 } else {
a1255107
AD
1904 if (adev->ip_blocks[i].version->funcs->early_init) {
1905 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2c1a2784 1906 if (r == -ENOENT) {
a1255107 1907 adev->ip_blocks[i].status.valid = false;
2c1a2784 1908 } else if (r) {
a1255107
AD
1909 DRM_ERROR("early_init of IP block <%s> failed %d\n",
1910 adev->ip_blocks[i].version->funcs->name, r);
d38ceaf9 1911 return r;
2c1a2784 1912 } else {
a1255107 1913 adev->ip_blocks[i].status.valid = true;
2c1a2784 1914 }
974e6b64 1915 } else {
a1255107 1916 adev->ip_blocks[i].status.valid = true;
d38ceaf9 1917 }
d38ceaf9 1918 }
21a249ca
AD
1919 /* get the vbios after the asic_funcs are set up */
1920 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
6e29c227
AD
1921 r = amdgpu_device_parse_gpu_info_fw(adev);
1922 if (r)
1923 return r;
1924
21a249ca
AD
1925 /* Read BIOS */
1926 if (!amdgpu_get_bios(adev))
1927 return -EINVAL;
1928
1929 r = amdgpu_atombios_init(adev);
1930 if (r) {
1931 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
1932 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
1933 return r;
1934 }
1935 }
d38ceaf9
AD
1936 }
1937
395d1fb9
NH
1938 adev->cg_flags &= amdgpu_cg_mask;
1939 adev->pg_flags &= amdgpu_pg_mask;
1940
d38ceaf9
AD
1941 return 0;
1942}
1943
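/*
 * hw_init below is split into two phases: phase 1 brings up only the blocks
 * that must be ready before firmware loading (COMMON, IH and, under SR-IOV,
 * PSP); phase 2 initializes every remaining block once
 * amdgpu_device_fw_loading() has run. See amdgpu_device_ip_init() for the
 * calling sequence.
 */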
0a4f2520
RZ
1944static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
1945{
1946 int i, r;
1947
1948 for (i = 0; i < adev->num_ip_blocks; i++) {
1949 if (!adev->ip_blocks[i].status.sw)
1950 continue;
1951 if (adev->ip_blocks[i].status.hw)
1952 continue;
1953 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2d11fd3f 1954 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
0a4f2520
RZ
1955 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
1956 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
1957 if (r) {
1958 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
1959 adev->ip_blocks[i].version->funcs->name, r);
1960 return r;
1961 }
1962 adev->ip_blocks[i].status.hw = true;
1963 }
1964 }
1965
1966 return 0;
1967}
1968
1969static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
1970{
1971 int i, r;
1972
1973 for (i = 0; i < adev->num_ip_blocks; i++) {
1974 if (!adev->ip_blocks[i].status.sw)
1975 continue;
1976 if (adev->ip_blocks[i].status.hw)
1977 continue;
1978 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
1979 if (r) {
1980 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
1981 adev->ip_blocks[i].version->funcs->name, r);
1982 return r;
1983 }
1984 adev->ip_blocks[i].status.hw = true;
1985 }
1986
1987 return 0;
1988}
1989
7a3e0bb2
RZ
1990static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
1991{
1992 int r = 0;
1993 int i;
80f41f84 1994 uint32_t smu_version;
7a3e0bb2
RZ
1995
1996 if (adev->asic_type >= CHIP_VEGA10) {
1997 for (i = 0; i < adev->num_ip_blocks; i++) {
482f0e53
ML
1998 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
1999 continue;
2000
2001 /* no need to do the fw loading again if already done*/
2002 if (adev->ip_blocks[i].status.hw == true)
2003 break;
2004
53b3f8f4 2005 if (amdgpu_in_reset(adev) || adev->in_suspend) {
482f0e53
ML
2006 r = adev->ip_blocks[i].version->funcs->resume(adev);
2007 if (r) {
2008 DRM_ERROR("resume of IP block <%s> failed %d\n",
7a3e0bb2 2009 adev->ip_blocks[i].version->funcs->name, r);
482f0e53
ML
2010 return r;
2011 }
2012 } else {
2013 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2014 if (r) {
2015 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2016 adev->ip_blocks[i].version->funcs->name, r);
2017 return r;
7a3e0bb2 2018 }
7a3e0bb2 2019 }
482f0e53
ML
2020
2021 adev->ip_blocks[i].status.hw = true;
2022 break;
7a3e0bb2
RZ
2023 }
2024 }
482f0e53 2025
8973d9ec
ED
2026 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2027 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
7a3e0bb2 2028
80f41f84 2029 return r;
7a3e0bb2
RZ
2030}
2031
e3ecdffa
AD
2032/**
2033 * amdgpu_device_ip_init - run init for hardware IPs
2034 *
2035 * @adev: amdgpu_device pointer
2036 *
2037 * Main initialization pass for hardware IPs. The list of all the hardware
2038 * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2039 * are run. sw_init initializes the software state associated with each IP
2040 * and hw_init initializes the hardware associated with each IP.
2041 * Returns 0 on success, negative error code on failure.
2042 */
06ec9070 2043static int amdgpu_device_ip_init(struct amdgpu_device *adev)
d38ceaf9
AD
2044{
2045 int i, r;
2046
c030f2e4 2047 r = amdgpu_ras_init(adev);
2048 if (r)
2049 return r;
2050
d38ceaf9 2051 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 2052 if (!adev->ip_blocks[i].status.valid)
d38ceaf9 2053 continue;
a1255107 2054 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2c1a2784 2055 if (r) {
a1255107
AD
2056 DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2057 adev->ip_blocks[i].version->funcs->name, r);
72d3f592 2058 goto init_failed;
2c1a2784 2059 }
a1255107 2060 adev->ip_blocks[i].status.sw = true;
bfca0289 2061
d38ceaf9 2062 /* need to do gmc hw init early so we can allocate gpu mem */
a1255107 2063 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
06ec9070 2064 r = amdgpu_device_vram_scratch_init(adev);
2c1a2784
AD
2065 if (r) {
2066 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
72d3f592 2067 goto init_failed;
2c1a2784 2068 }
a1255107 2069 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2c1a2784
AD
2070 if (r) {
2071 DRM_ERROR("hw_init %d failed %d\n", i, r);
72d3f592 2072 goto init_failed;
2c1a2784 2073 }
06ec9070 2074 r = amdgpu_device_wb_init(adev);
2c1a2784 2075 if (r) {
06ec9070 2076 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
72d3f592 2077 goto init_failed;
2c1a2784 2078 }
a1255107 2079 adev->ip_blocks[i].status.hw = true;
2493664f
ML
2080
2081 /* right after GMC hw init, we create CSA */
f92d5c61 2082 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
1e256e27
RZ
2083 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2084 AMDGPU_GEM_DOMAIN_VRAM,
2085 AMDGPU_CSA_SIZE);
2493664f
ML
2086 if (r) {
2087 DRM_ERROR("allocate CSA failed %d\n", r);
72d3f592 2088 goto init_failed;
2493664f
ML
2089 }
2090 }
d38ceaf9
AD
2091 }
2092 }
2093
c9ffa427
YT
2094 if (amdgpu_sriov_vf(adev))
2095 amdgpu_virt_init_data_exchange(adev);
2096
533aed27
AG
2097 r = amdgpu_ib_pool_init(adev);
2098 if (r) {
2099 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2100 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2101 goto init_failed;
2102 }
2103
c8963ea4
RZ
2104 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2105 if (r)
72d3f592 2106 goto init_failed;
0a4f2520
RZ
2107
2108 r = amdgpu_device_ip_hw_init_phase1(adev);
2109 if (r)
72d3f592 2110 goto init_failed;
0a4f2520 2111
7a3e0bb2
RZ
2112 r = amdgpu_device_fw_loading(adev);
2113 if (r)
72d3f592 2114 goto init_failed;
7a3e0bb2 2115
0a4f2520
RZ
2116 r = amdgpu_device_ip_hw_init_phase2(adev);
2117 if (r)
72d3f592 2118 goto init_failed;
d38ceaf9 2119
121a2bc6
AG
2120 /*
2121 * retired pages will be loaded from eeprom and reserved here,
2122 * it should be called after amdgpu_device_ip_hw_init_phase2 since
2123 * for some ASICs the RAS EEPROM code relies on SMU fully functioning
2124 * for I2C communication, which is only true at this point.
b82e65a9
GC
2125 *
2126 * amdgpu_ras_recovery_init may fail, but the upper layers only care
2127 * about a failure caused by a bad gpu situation and stop the amdgpu
2128 * init process accordingly. For other failures, it still releases all
2129 * the resources and prints an error message, rather than returning a
2130 * negative value to the upper level.
121a2bc6
AG
2131 *
2132 * Note: theoretically, this should be called before all vram allocations
2133 * to protect retired page from abusing
2134 */
b82e65a9
GC
2135 r = amdgpu_ras_recovery_init(adev);
2136 if (r)
2137 goto init_failed;
121a2bc6 2138
3e2e2ab5
HZ
2139 if (adev->gmc.xgmi.num_physical_nodes > 1)
2140 amdgpu_xgmi_add_device(adev);
1884734a 2141 amdgpu_amdkfd_device_init(adev);
c6332b97 2142
bd607166
KR
2143 amdgpu_fru_get_product_info(adev);
2144
72d3f592 2145init_failed:
c9ffa427 2146 if (amdgpu_sriov_vf(adev))
c6332b97 2147 amdgpu_virt_release_full_gpu(adev, true);
2148
72d3f592 2149 return r;
d38ceaf9
AD
2150}
2151
e3ecdffa
AD
2152/**
2153 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2154 *
2155 * @adev: amdgpu_device pointer
2156 *
2157 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
2158 * this function before a GPU reset. If the value is retained after a
2159 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
2160 */
06ec9070 2161static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
0c49e0b8
CZ
2162{
2163 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2164}
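/*
 * Illustrative usage sketch (not part of the original file): the magic
 * written here is compared again after a reset, roughly:
 *
 *	amdgpu_device_fill_reset_magic(adev);
 *	... GPU reset happens ...
 *	if (amdgpu_device_check_vram_lost(adev))
 *		... restore VRAM contents ...
 */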
2165
e3ecdffa
AD
2166/**
2167 * amdgpu_device_check_vram_lost - check if vram is valid
2168 *
2169 * @adev: amdgpu_device pointer
2170 *
2171 * Checks the reset magic value written to the gart pointer in VRAM.
2172 * The driver calls this after a GPU reset to see if the contents of
2173 * VRAM are lost or not.
2174 * Returns true if vram is lost, false if not.
2175 */
06ec9070 2176static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
0c49e0b8 2177{
dadce777
EQ
2178 if (memcmp(adev->gart.ptr, adev->reset_magic,
2179 AMDGPU_RESET_MAGIC_NUM))
2180 return true;
2181
53b3f8f4 2182 if (!amdgpu_in_reset(adev))
dadce777
EQ
2183 return false;
2184
2185 /*
2186 * For all ASICs with baco/mode1 reset, the VRAM is
2187 * always assumed to be lost.
2188 */
2189 switch (amdgpu_asic_reset_method(adev)) {
2190 case AMD_RESET_METHOD_BACO:
2191 case AMD_RESET_METHOD_MODE1:
2192 return true;
2193 default:
2194 return false;
2195 }
0c49e0b8
CZ
2196}
2197
e3ecdffa 2198/**
1112a46b 2199 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
e3ecdffa
AD
2200 *
2201 * @adev: amdgpu_device pointer
b8b72130 2202 * @state: clockgating state (gate or ungate)
e3ecdffa 2203 *
e3ecdffa 2204 * The list of all the hardware IPs that make up the asic is walked and the
1112a46b
RZ
2205 * set_clockgating_state callbacks are run.
2206 * Late initialization pass enabling clockgating for hardware IPs.
2207 * Fini or suspend, pass disabling clockgating for hardware IPs.
e3ecdffa
AD
2208 * Returns 0 on success, negative error code on failure.
2209 */
fdd34271 2210
1112a46b
RZ
2211static int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2212 enum amd_clockgating_state state)
d38ceaf9 2213{
1112a46b 2214 int i, j, r;
d38ceaf9 2215
4a2ba394
SL
2216 if (amdgpu_emu_mode == 1)
2217 return 0;
2218
1112a46b
RZ
2219 for (j = 0; j < adev->num_ip_blocks; j++) {
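		/* gate in IP block order, ungate in the reverse order */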
2220 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
a2d31dc3 2221 if (!adev->ip_blocks[i].status.late_initialized)
d38ceaf9 2222 continue;
4a446d55 2223 /* skip CG for VCE/UVD, it's handled specially */
a1255107 2224 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
57716327 2225 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
34319b32 2226 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
52f2e779 2227 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
57716327 2228 adev->ip_blocks[i].version->funcs->set_clockgating_state) {
4a446d55 2229 /* enable clockgating to save power */
a1255107 2230 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
1112a46b 2231 state);
4a446d55
AD
2232 if (r) {
2233 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
a1255107 2234 adev->ip_blocks[i].version->funcs->name, r);
4a446d55
AD
2235 return r;
2236 }
b0b00ff1 2237 }
d38ceaf9 2238 }
06b18f61 2239
c9f96fd5
RZ
2240 return 0;
2241}
2242
1112a46b 2243static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state)
c9f96fd5 2244{
1112a46b 2245 int i, j, r;
06b18f61 2246
c9f96fd5
RZ
2247 if (amdgpu_emu_mode == 1)
2248 return 0;
2249
1112a46b
RZ
2250 for (j = 0; j < adev->num_ip_blocks; j++) {
2251 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
a2d31dc3 2252 if (!adev->ip_blocks[i].status.late_initialized)
c9f96fd5
RZ
2253 continue;
2254 /* skip CG for VCE/UVD, it's handled specially */
2255 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2256 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2257 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
52f2e779 2258 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
c9f96fd5
RZ
2259 adev->ip_blocks[i].version->funcs->set_powergating_state) {
2260 /* enable powergating to save power */
2261 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
1112a46b 2262 state);
c9f96fd5
RZ
2263 if (r) {
2264 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2265 adev->ip_blocks[i].version->funcs->name, r);
2266 return r;
2267 }
2268 }
2269 }
2dc80b00
S
2270 return 0;
2271}
2272
beff74bc
AD
2273static int amdgpu_device_enable_mgpu_fan_boost(void)
2274{
2275 struct amdgpu_gpu_instance *gpu_ins;
2276 struct amdgpu_device *adev;
2277 int i, ret = 0;
2278
2279 mutex_lock(&mgpu_info.mutex);
2280
2281 /*
2282 * MGPU fan boost feature should be enabled
2283 * only when there are two or more dGPUs in
2284 * the system
2285 */
2286 if (mgpu_info.num_dgpu < 2)
2287 goto out;
2288
2289 for (i = 0; i < mgpu_info.num_dgpu; i++) {
2290 gpu_ins = &(mgpu_info.gpu_ins[i]);
2291 adev = gpu_ins->adev;
2292 if (!(adev->flags & AMD_IS_APU) &&
f10bb940 2293 !gpu_ins->mgpu_fan_enabled) {
beff74bc
AD
2294 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2295 if (ret)
2296 break;
2297
2298 gpu_ins->mgpu_fan_enabled = 1;
2299 }
2300 }
2301
2302out:
2303 mutex_unlock(&mgpu_info.mutex);
2304
2305 return ret;
2306}
2307
e3ecdffa
AD
2308/**
2309 * amdgpu_device_ip_late_init - run late init for hardware IPs
2310 *
2311 * @adev: amdgpu_device pointer
2312 *
2313 * Late initialization pass for hardware IPs. The list of all the hardware
2314 * IPs that make up the asic is walked and the late_init callbacks are run.
2315 * late_init covers any special initialization that an IP requires
2316 * after all of them have been initialized or something that needs to happen
2317 * late in the init process.
2318 * Returns 0 on success, negative error code on failure.
2319 */
06ec9070 2320static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2dc80b00 2321{
60599a03 2322 struct amdgpu_gpu_instance *gpu_instance;
2dc80b00
S
2323 int i = 0, r;
2324
2325 for (i = 0; i < adev->num_ip_blocks; i++) {
73f847db 2326 if (!adev->ip_blocks[i].status.hw)
2dc80b00
S
2327 continue;
2328 if (adev->ip_blocks[i].version->funcs->late_init) {
2329 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2330 if (r) {
2331 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2332 adev->ip_blocks[i].version->funcs->name, r);
2333 return r;
2334 }
2dc80b00 2335 }
73f847db 2336 adev->ip_blocks[i].status.late_initialized = true;
2dc80b00
S
2337 }
2338
a891d239
DL
2339 amdgpu_ras_set_error_query_ready(adev, true);
2340
1112a46b
RZ
2341 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2342 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
916ac57f 2343
06ec9070 2344 amdgpu_device_fill_reset_magic(adev);
d38ceaf9 2345
beff74bc
AD
2346 r = amdgpu_device_enable_mgpu_fan_boost();
2347 if (r)
2348 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2349
60599a03
EQ
2350
2351 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2352 mutex_lock(&mgpu_info.mutex);
2353
2354 /*
2355 * Reset device p-state to low as this was booted with high.
2356 *
2357 * This should be performed only after all devices from the same
2358 * hive get initialized.
2359 *
2360 * However, the number of devices in the hive is not known in advance,
2361 * as devices are counted one by one during their initialization.
2362 *
2363 * So, we wait for all XGMI interlinked devices to be initialized.
2364 * This may bring some delays as those devices may come from
2365 * different hives. But that should be OK.
2366 */
2367 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2368 for (i = 0; i < mgpu_info.num_gpu; i++) {
2369 gpu_instance = &(mgpu_info.gpu_ins[i]);
2370 if (gpu_instance->adev->flags & AMD_IS_APU)
2371 continue;
2372
d84a430d
JK
2373 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2374 AMDGPU_XGMI_PSTATE_MIN);
60599a03
EQ
2375 if (r) {
2376 DRM_ERROR("pstate setting failed (%d).\n", r);
2377 break;
2378 }
2379 }
2380 }
2381
2382 mutex_unlock(&mgpu_info.mutex);
2383 }
2384
d38ceaf9
AD
2385 return 0;
2386}
2387
e3ecdffa
AD
2388/**
2389 * amdgpu_device_ip_fini - run fini for hardware IPs
2390 *
2391 * @adev: amdgpu_device pointer
2392 *
2393 * Main teardown pass for hardware IPs. The list of all the hardware
2394 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2395 * are run. hw_fini tears down the hardware associated with each IP
2396 * and sw_fini tears down any software state associated with each IP.
2397 * Returns 0 on success, negative error code on failure.
2398 */
06ec9070 2399static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
d38ceaf9
AD
2400{
2401 int i, r;
2402
5278a159
SY
2403 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2404 amdgpu_virt_release_ras_err_handler_data(adev);
2405
c030f2e4 2406 amdgpu_ras_pre_fini(adev);
2407
a82400b5
AG
2408 if (adev->gmc.xgmi.num_physical_nodes > 1)
2409 amdgpu_xgmi_remove_device(adev);
2410
1884734a 2411 amdgpu_amdkfd_device_fini(adev);
05df1f01
RZ
2412
2413 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
fdd34271
RZ
2414 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2415
3e96dbfd
AD
2416 /* need to disable SMC first */
2417 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 2418 if (!adev->ip_blocks[i].status.hw)
3e96dbfd 2419 continue;
fdd34271 2420 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
a1255107 2421 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
3e96dbfd
AD
2422 /* XXX handle errors */
2423 if (r) {
2424 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
a1255107 2425 adev->ip_blocks[i].version->funcs->name, r);
3e96dbfd 2426 }
a1255107 2427 adev->ip_blocks[i].status.hw = false;
3e96dbfd
AD
2428 break;
2429 }
2430 }
2431
d38ceaf9 2432 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
a1255107 2433 if (!adev->ip_blocks[i].status.hw)
d38ceaf9 2434 continue;
8201a67a 2435
a1255107 2436 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
d38ceaf9 2437 /* XXX handle errors */
2c1a2784 2438 if (r) {
a1255107
AD
2439 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2440 adev->ip_blocks[i].version->funcs->name, r);
2c1a2784 2441 }
8201a67a 2442
a1255107 2443 adev->ip_blocks[i].status.hw = false;
d38ceaf9
AD
2444 }
2445
9950cda2 2446
d38ceaf9 2447 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
a1255107 2448 if (!adev->ip_blocks[i].status.sw)
d38ceaf9 2449 continue;
c12aba3a
ML
2450
2451 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
c8963ea4 2452 amdgpu_ucode_free_bo(adev);
1e256e27 2453 amdgpu_free_static_csa(&adev->virt.csa_obj);
c12aba3a
ML
2454 amdgpu_device_wb_fini(adev);
2455 amdgpu_device_vram_scratch_fini(adev);
533aed27 2456 amdgpu_ib_pool_fini(adev);
c12aba3a
ML
2457 }
2458
a1255107 2459 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
d38ceaf9 2460 /* XXX handle errors */
2c1a2784 2461 if (r) {
a1255107
AD
2462 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2463 adev->ip_blocks[i].version->funcs->name, r);
2c1a2784 2464 }
a1255107
AD
2465 adev->ip_blocks[i].status.sw = false;
2466 adev->ip_blocks[i].status.valid = false;
d38ceaf9
AD
2467 }
2468
a6dcfd9c 2469 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
a1255107 2470 if (!adev->ip_blocks[i].status.late_initialized)
8a2eef1d 2471 continue;
a1255107
AD
2472 if (adev->ip_blocks[i].version->funcs->late_fini)
2473 adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2474 adev->ip_blocks[i].status.late_initialized = false;
a6dcfd9c
ML
2475 }
2476
c030f2e4 2477 amdgpu_ras_fini(adev);
2478
030308fc 2479 if (amdgpu_sriov_vf(adev))
24136135
ML
2480 if (amdgpu_virt_release_full_gpu(adev, false))
2481 DRM_ERROR("failed to release exclusive mode on fini\n");
2493664f 2482
d38ceaf9
AD
2483 return 0;
2484}
2485
e3ecdffa 2486/**
beff74bc 2487 * amdgpu_device_delayed_init_work_handler - work handler for IB tests
e3ecdffa 2488 *
1112a46b 2489 * @work: work_struct.
e3ecdffa 2490 */
beff74bc 2491static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2dc80b00
S
2492{
2493 struct amdgpu_device *adev =
beff74bc 2494 container_of(work, struct amdgpu_device, delayed_init_work.work);
916ac57f
RZ
2495 int r;
2496
2497 r = amdgpu_ib_ring_tests(adev);
2498 if (r)
2499 DRM_ERROR("ib ring test failed (%d).\n", r);
2dc80b00
S
2500}
2501
1e317b99
RZ
2502static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2503{
2504 struct amdgpu_device *adev =
2505 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2506
2507 mutex_lock(&adev->gfx.gfx_off_mutex);
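	/* only enter GFXOFF when no client still holds a disable request */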
2508 if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
2509 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2510 adev->gfx.gfx_off_state = true;
2511 }
2512 mutex_unlock(&adev->gfx.gfx_off_mutex);
2513}
2514
e3ecdffa 2515/**
e7854a03 2516 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
e3ecdffa
AD
2517 *
2518 * @adev: amdgpu_device pointer
2519 *
2520 * Main suspend function for hardware IPs. The list of all the hardware
2521 * IPs that make up the asic is walked, clockgating is disabled and the
2522 * suspend callbacks are run. suspend puts the hardware and software state
2523 * in each IP into a state suitable for suspend.
2524 * Returns 0 on success, negative error code on failure.
2525 */
e7854a03
AD
2526static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2527{
2528 int i, r;
2529
ced1ba97
PL
2530 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2531 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
05df1f01 2532
e7854a03
AD
2533 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2534 if (!adev->ip_blocks[i].status.valid)
2535 continue;
2b9f7848 2536
e7854a03 2537 /* displays are handled separately */
2b9f7848
ND
2538 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2539 continue;
2540
2541 /* XXX handle errors */
2542 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2543 /* XXX handle errors */
2544 if (r) {
2545 DRM_ERROR("suspend of IP block <%s> failed %d\n",
2546 adev->ip_blocks[i].version->funcs->name, r);
2547 return r;
e7854a03 2548 }
2b9f7848
ND
2549
2550 adev->ip_blocks[i].status.hw = false;
e7854a03
AD
2551 }
2552
e7854a03
AD
2553 return 0;
2554}
2555
2556/**
2557 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2558 *
2559 * @adev: amdgpu_device pointer
2560 *
2561 * Main suspend function for hardware IPs. The list of all the hardware
2562 * IPs that make up the asic is walked, clockgating is disabled and the
2563 * suspend callbacks are run. suspend puts the hardware and software state
2564 * in each IP into a state suitable for suspend.
2565 * Returns 0 on success, negative error code on failure.
2566 */
2567static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
d38ceaf9
AD
2568{
2569 int i, r;
2570
2571 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
a1255107 2572 if (!adev->ip_blocks[i].status.valid)
d38ceaf9 2573 continue;
e7854a03
AD
2574 /* displays are handled in phase1 */
2575 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2576 continue;
bff77e86
LM
2577 /* PSP lost connection when err_event_athub occurs */
2578 if (amdgpu_ras_intr_triggered() &&
2579 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2580 adev->ip_blocks[i].status.hw = false;
2581 continue;
2582 }
d38ceaf9 2583 /* XXX handle errors */
a1255107 2584 r = adev->ip_blocks[i].version->funcs->suspend(adev);
d38ceaf9 2585 /* XXX handle errors */
2c1a2784 2586 if (r) {
a1255107
AD
2587 DRM_ERROR("suspend of IP block <%s> failed %d\n",
2588 adev->ip_blocks[i].version->funcs->name, r);
2c1a2784 2589 }
876923fb 2590 adev->ip_blocks[i].status.hw = false;
a3a09142 2591 /* handle putting the SMC in the appropriate state */
86b93fd6
JZ
2592 if(!amdgpu_sriov_vf(adev)){
2593 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2594 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
2595 if (r) {
2596 DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
2597 adev->mp1_state, r);
2598 return r;
2599 }
a3a09142
AD
2600 }
2601 }
b5507c7e 2602 adev->ip_blocks[i].status.hw = false;
d38ceaf9
AD
2603 }
2604
2605 return 0;
2606}
2607
e7854a03
AD
2608/**
2609 * amdgpu_device_ip_suspend - run suspend for hardware IPs
2610 *
2611 * @adev: amdgpu_device pointer
2612 *
2613 * Main suspend function for hardware IPs. The list of all the hardware
2614 * IPs that make up the asic is walked, clockgating is disabled and the
2615 * suspend callbacks are run. suspend puts the hardware and software state
2616 * in each IP into a state suitable for suspend.
2617 * Returns 0 on success, negative error code on failure.
2618 */
2619int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
2620{
2621 int r;
2622
e7819644
YT
2623 if (amdgpu_sriov_vf(adev))
2624 amdgpu_virt_request_full_gpu(adev, false);
2625
e7854a03
AD
2626 r = amdgpu_device_ip_suspend_phase1(adev);
2627 if (r)
2628 return r;
2629 r = amdgpu_device_ip_suspend_phase2(adev);
2630
e7819644
YT
2631 if (amdgpu_sriov_vf(adev))
2632 amdgpu_virt_release_full_gpu(adev, false);
2633
e7854a03
AD
2634 return r;
2635}
2636
06ec9070 2637static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
a90ad3c2
ML
2638{
2639 int i, r;
2640
2cb681b6
ML
2641 static enum amd_ip_block_type ip_order[] = {
2642 AMD_IP_BLOCK_TYPE_GMC,
2643 AMD_IP_BLOCK_TYPE_COMMON,
39186aef 2644 AMD_IP_BLOCK_TYPE_PSP,
2cb681b6
ML
2645 AMD_IP_BLOCK_TYPE_IH,
2646 };
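	/*
	 * These blocks are re-initialized first and in this fixed order;
	 * everything else is brought back later by
	 * amdgpu_device_ip_reinit_late_sriov().
	 */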
a90ad3c2 2647
2cb681b6
ML
2648 for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2649 int j;
2650 struct amdgpu_ip_block *block;
a90ad3c2 2651
4cd2a96d
J
2652 block = &adev->ip_blocks[i];
2653 block->status.hw = false;
2cb681b6 2654
4cd2a96d 2655 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
2cb681b6 2656
4cd2a96d 2657 if (block->version->type != ip_order[j] ||
2cb681b6
ML
2658 !block->status.valid)
2659 continue;
2660
2661 r = block->version->funcs->hw_init(adev);
0aaeefcc 2662 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
c41d1cf6
ML
2663 if (r)
2664 return r;
482f0e53 2665 block->status.hw = true;
a90ad3c2
ML
2666 }
2667 }
2668
2669 return 0;
2670}
2671
06ec9070 2672static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
a90ad3c2
ML
2673{
2674 int i, r;
2675
2cb681b6
ML
2676 static enum amd_ip_block_type ip_order[] = {
2677 AMD_IP_BLOCK_TYPE_SMC,
2678 AMD_IP_BLOCK_TYPE_DCE,
2679 AMD_IP_BLOCK_TYPE_GFX,
2680 AMD_IP_BLOCK_TYPE_SDMA,
257deb8c 2681 AMD_IP_BLOCK_TYPE_UVD,
d83c7a07
JJ
2682 AMD_IP_BLOCK_TYPE_VCE,
2683 AMD_IP_BLOCK_TYPE_VCN
2cb681b6 2684 };
a90ad3c2 2685
2cb681b6
ML
2686 for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2687 int j;
2688 struct amdgpu_ip_block *block;
a90ad3c2 2689
2cb681b6
ML
2690 for (j = 0; j < adev->num_ip_blocks; j++) {
2691 block = &adev->ip_blocks[j];
2692
2693 if (block->version->type != ip_order[i] ||
482f0e53
ML
2694 !block->status.valid ||
2695 block->status.hw)
2cb681b6
ML
2696 continue;
2697
895bd048
JZ
2698 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
2699 r = block->version->funcs->resume(adev);
2700 else
2701 r = block->version->funcs->hw_init(adev);
2702
0aaeefcc 2703 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
c41d1cf6
ML
2704 if (r)
2705 return r;
482f0e53 2706 block->status.hw = true;
a90ad3c2
ML
2707 }
2708 }
2709
2710 return 0;
2711}
2712
e3ecdffa
AD
2713/**
2714 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
2715 *
2716 * @adev: amdgpu_device pointer
2717 *
2718 * First resume function for hardware IPs. The list of all the hardware
2719 * IPs that make up the asic is walked and the resume callbacks are run for
2720 * COMMON, GMC, and IH. resume puts the hardware into a functional state
2721 * after a suspend and updates the software state as necessary. This
2722 * function is also used for restoring the GPU after a GPU reset.
2723 * Returns 0 on success, negative error code on failure.
2724 */
06ec9070 2725static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
d38ceaf9
AD
2726{
2727 int i, r;
2728
a90ad3c2 2729 for (i = 0; i < adev->num_ip_blocks; i++) {
482f0e53 2730 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
a90ad3c2 2731 continue;
a90ad3c2 2732 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
e3ecdffa
AD
2733 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2734 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
482f0e53 2735
fcf0649f
CZ
2736 r = adev->ip_blocks[i].version->funcs->resume(adev);
2737 if (r) {
2738 DRM_ERROR("resume of IP block <%s> failed %d\n",
2739 adev->ip_blocks[i].version->funcs->name, r);
2740 return r;
2741 }
482f0e53 2742 adev->ip_blocks[i].status.hw = true;
a90ad3c2
ML
2743 }
2744 }
2745
2746 return 0;
2747}
2748
e3ecdffa
AD
2749/**
2750 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
2751 *
2752 * @adev: amdgpu_device pointer
2753 *
2754 * Second resume function for hardware IPs. The list of all the hardware
2755 * IPs that make up the asic is walked and the resume callbacks are run for
2756 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a
2757 * functional state after a suspend and updates the software state as
2758 * necessary. This function is also used for restoring the GPU after a GPU
2759 * reset.
2760 * Returns 0 on success, negative error code on failure.
2761 */
06ec9070 2762static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
d38ceaf9
AD
2763{
2764 int i, r;
2765
2766 for (i = 0; i < adev->num_ip_blocks; i++) {
482f0e53 2767 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
d38ceaf9 2768 continue;
fcf0649f 2769 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
e3ecdffa 2770 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
7a3e0bb2
RZ
2771 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
2772 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
fcf0649f 2773 continue;
a1255107 2774 r = adev->ip_blocks[i].version->funcs->resume(adev);
2c1a2784 2775 if (r) {
a1255107
AD
2776 DRM_ERROR("resume of IP block <%s> failed %d\n",
2777 adev->ip_blocks[i].version->funcs->name, r);
d38ceaf9 2778 return r;
2c1a2784 2779 }
482f0e53 2780 adev->ip_blocks[i].status.hw = true;
d38ceaf9
AD
2781 }
2782
2783 return 0;
2784}
2785
e3ecdffa
AD
2786/**
2787 * amdgpu_device_ip_resume - run resume for hardware IPs
2788 *
2789 * @adev: amdgpu_device pointer
2790 *
2791 * Main resume function for hardware IPs. The hardware IPs
2792 * are split into two resume functions because they are
2793 * also used in recovering from a GPU reset and some additional
2794 * steps need to be taken between them. In this case (S3/S4) they are
2795 * run sequentially.
2796 * Returns 0 on success, negative error code on failure.
2797 */
06ec9070 2798static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
fcf0649f
CZ
2799{
2800 int r;
2801
06ec9070 2802 r = amdgpu_device_ip_resume_phase1(adev);
fcf0649f
CZ
2803 if (r)
2804 return r;
7a3e0bb2
RZ
2805
2806 r = amdgpu_device_fw_loading(adev);
2807 if (r)
2808 return r;
2809
06ec9070 2810 r = amdgpu_device_ip_resume_phase2(adev);
fcf0649f
CZ
2811
2812 return r;
2813}
2814
e3ecdffa
AD
2815/**
2816 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
2817 *
2818 * @adev: amdgpu_device pointer
2819 *
2820 * Query the VBIOS data tables to determine if the board supports SR-IOV.
2821 */
4e99a44e 2822static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
048765ad 2823{
6867e1b5
ML
2824 if (amdgpu_sriov_vf(adev)) {
2825 if (adev->is_atom_fw) {
2826 if (amdgpu_atomfirmware_gpu_supports_virtualization(adev))
2827 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2828 } else {
2829 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
2830 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2831 }
2832
2833 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
2834 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
a5bde2f9 2835 }
048765ad
AR
2836}
2837
e3ecdffa
AD
2838/**
2839 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
2840 *
2841 * @asic_type: AMD asic type
2842 *
2843 * Check if there is DC (new modesetting infrastructure) support for an asic.
2844 * Returns true if DC has support, false if not.
2845 */
4562236b
HW
2846bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
2847{
2848 switch (asic_type) {
2849#if defined(CONFIG_DRM_AMD_DC)
64200c46
MR
2850#if defined(CONFIG_DRM_AMD_DC_SI)
2851 case CHIP_TAHITI:
2852 case CHIP_PITCAIRN:
2853 case CHIP_VERDE:
2854 case CHIP_OLAND:
2855#endif
4562236b 2856 case CHIP_BONAIRE:
0d6fbccb 2857 case CHIP_KAVERI:
367e6687
AD
2858 case CHIP_KABINI:
2859 case CHIP_MULLINS:
d9fda248
HW
2860 /*
2861 * We have systems in the wild with these ASICs that require
2862 * LVDS and VGA support which is not supported with DC.
2863 *
2864 * Fallback to the non-DC driver here by default so as not to
2865 * cause regressions.
2866 */
2867 return amdgpu_dc > 0;
2868 case CHIP_HAWAII:
4562236b
HW
2869 case CHIP_CARRIZO:
2870 case CHIP_STONEY:
4562236b 2871 case CHIP_POLARIS10:
675fd32b 2872 case CHIP_POLARIS11:
2c8ad2d5 2873 case CHIP_POLARIS12:
675fd32b 2874 case CHIP_VEGAM:
4562236b
HW
2875 case CHIP_TONGA:
2876 case CHIP_FIJI:
42f8ffa1 2877 case CHIP_VEGA10:
dca7b401 2878 case CHIP_VEGA12:
c6034aa2 2879 case CHIP_VEGA20:
b86a1aa3 2880#if defined(CONFIG_DRM_AMD_DC_DCN)
fd187853 2881 case CHIP_RAVEN:
b4f199c7 2882 case CHIP_NAVI10:
8fceceb6 2883 case CHIP_NAVI14:
078655d9 2884 case CHIP_NAVI12:
e1c14c43 2885 case CHIP_RENOIR:
81d9bfb8
JFZ
2886#endif
2887#if defined(CONFIG_DRM_AMD_DC_DCN3_0)
2888 case CHIP_SIENNA_CICHLID:
a6c5308f 2889 case CHIP_NAVY_FLOUNDER:
42f8ffa1 2890#endif
fd187853 2891 return amdgpu_dc != 0;
4562236b
HW
2892#endif
2893 default:
93b09a9a
SS
2894 if (amdgpu_dc > 0)
2895 DRM_INFO("Display Core has been requested via kernel parameter "
2896 "but isn't supported by ASIC, ignoring\n");
4562236b
HW
2897 return false;
2898 }
2899}
2900
2901/**
2902 * amdgpu_device_has_dc_support - check if dc is supported
2903 *
2904 * @adev: amdgpu_device_pointer
2905 *
2906 * Returns true for supported, false for not supported
2907 */
2908bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
2909{
c997e8e2 2910 if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display)
2555039d
XY
2911 return false;
2912
4562236b
HW
2913 return amdgpu_device_asic_has_dc_support(adev->asic_type);
2914}
2915
d4535e2c
AG
2916
2917static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
2918{
2919 struct amdgpu_device *adev =
2920 container_of(__work, struct amdgpu_device, xgmi_reset_work);
d95e8e97 2921 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
d4535e2c 2922
c6a6e2db
AG
2923 /* It's a bug to not have a hive within this function */
2924 if (WARN_ON(!hive))
2925 return;
2926
2927 /*
2928 * Use task barrier to synchronize all xgmi reset works across the
2929 * hive. task_barrier_enter and task_barrier_exit will block
2930 * until all the threads running the xgmi reset works reach
2931 * those points. task_barrier_full will do both blocks.
2932 */
2933 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
2934
2935 task_barrier_enter(&hive->tb);
4a580877 2936 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
c6a6e2db
AG
2937
2938 if (adev->asic_reset_res)
2939 goto fail;
2940
2941 task_barrier_exit(&hive->tb);
4a580877 2942 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
c6a6e2db
AG
2943
2944 if (adev->asic_reset_res)
2945 goto fail;
43c4d576
JC
2946
2947 if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count)
2948 adev->mmhub.funcs->reset_ras_error_count(adev);
c6a6e2db
AG
2949 } else {
2950
2951 task_barrier_full(&hive->tb);
2952 adev->asic_reset_res = amdgpu_asic_reset(adev);
2953 }
ce316fa5 2954
c6a6e2db 2955fail:
d4535e2c 2956 if (adev->asic_reset_res)
fed184e9 2957 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
4a580877 2958 adev->asic_reset_res, adev_to_drm(adev)->unique);
d95e8e97 2959 amdgpu_put_xgmi_hive(hive);
d4535e2c
AG
2960}
2961
71f98027
AD
2962static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
2963{
2964 char *input = amdgpu_lockup_timeout;
2965 char *timeout_setting = NULL;
2966 int index = 0;
2967 long timeout;
2968 int ret = 0;
2969
2970 /*
2971 * By default the timeout for non-compute jobs is 10000 ms,
2972 * and there is no timeout enforced on compute jobs.
2973 * In SR-IOV or passthrough mode, the timeout for compute
b7b2a316 2974 * jobs is 60000 ms by default.
71f98027
AD
2975 */
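	/*
	 * For example (illustrative): amdgpu.lockup_timeout=10000,60000,10000,10000
	 * is parsed below as gfx, compute, sdma and video timeouts in
	 * milliseconds; a single value applies to all non-compute jobs, and a
	 * value of 0 keeps the default for that position.
	 */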
2976 adev->gfx_timeout = msecs_to_jiffies(10000);
2977 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
2978 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
b7b2a316 2979 adev->compute_timeout = msecs_to_jiffies(60000);
71f98027
AD
2980 else
2981 adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
2982
f440ff44 2983 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
71f98027 2984 while ((timeout_setting = strsep(&input, ",")) &&
f440ff44 2985 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
71f98027
AD
2986 ret = kstrtol(timeout_setting, 0, &timeout);
2987 if (ret)
2988 return ret;
2989
2990 if (timeout == 0) {
2991 index++;
2992 continue;
2993 } else if (timeout < 0) {
2994 timeout = MAX_SCHEDULE_TIMEOUT;
2995 } else {
2996 timeout = msecs_to_jiffies(timeout);
2997 }
2998
2999 switch (index++) {
3000 case 0:
3001 adev->gfx_timeout = timeout;
3002 break;
3003 case 1:
3004 adev->compute_timeout = timeout;
3005 break;
3006 case 2:
3007 adev->sdma_timeout = timeout;
3008 break;
3009 case 3:
3010 adev->video_timeout = timeout;
3011 break;
3012 default:
3013 break;
3014 }
3015 }
3016 /*
3017 * There is only one value specified and
3018 * it should apply to all non-compute jobs.
3019 */
bcccee89 3020 if (index == 1) {
71f98027 3021 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
bcccee89
ED
3022 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3023 adev->compute_timeout = adev->gfx_timeout;
3024 }
71f98027
AD
3025 }
3026
3027 return ret;
3028}
d4535e2c 3029
77f3a5cd
ND
3030static const struct attribute *amdgpu_dev_attributes[] = {
3031 &dev_attr_product_name.attr,
3032 &dev_attr_product_number.attr,
3033 &dev_attr_serial_number.attr,
3034 &dev_attr_pcie_replay_count.attr,
3035 NULL
3036};
3037
c9a6b82f 3038
d38ceaf9
AD
3039/**
3040 * amdgpu_device_init - initialize the driver
3041 *
3042 * @adev: amdgpu_device pointer
d38ceaf9
AD
3043 * @flags: driver flags
3044 *
3045 * Initializes the driver info and hw (all asics).
3046 * Returns 0 for success or an error on failure.
3047 * Called at driver startup.
3048 */
3049int amdgpu_device_init(struct amdgpu_device *adev,
d38ceaf9
AD
3050 uint32_t flags)
3051{
8aba21b7
LT
3052 struct drm_device *ddev = adev_to_drm(adev);
3053 struct pci_dev *pdev = adev->pdev;
d38ceaf9 3054 int r, i;
3840c5bc 3055 bool boco = false;
95844d20 3056 u32 max_MBps;
d38ceaf9
AD
3057
3058 adev->shutdown = false;
d38ceaf9 3059 adev->flags = flags;
4e66d7d2
YZ
3060
3061 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3062 adev->asic_type = amdgpu_force_asic_type;
3063 else
3064 adev->asic_type = flags & AMD_ASIC_MASK;
3065
d38ceaf9 3066 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
593aa2d2 3067 if (amdgpu_emu_mode == 1)
8bdab6bb 3068 adev->usec_timeout *= 10;
770d13b1 3069 adev->gmc.gart_size = 512 * 1024 * 1024;
d38ceaf9
AD
3070 adev->accel_working = false;
3071 adev->num_rings = 0;
3072 adev->mman.buffer_funcs = NULL;
3073 adev->mman.buffer_funcs_ring = NULL;
3074 adev->vm_manager.vm_pte_funcs = NULL;
0c88b430 3075 adev->vm_manager.vm_pte_num_scheds = 0;
132f34e4 3076 adev->gmc.gmc_funcs = NULL;
f54d1867 3077 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
b8866c26 3078 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
d38ceaf9
AD
3079
3080 adev->smc_rreg = &amdgpu_invalid_rreg;
3081 adev->smc_wreg = &amdgpu_invalid_wreg;
3082 adev->pcie_rreg = &amdgpu_invalid_rreg;
3083 adev->pcie_wreg = &amdgpu_invalid_wreg;
36b9a952
HR
3084 adev->pciep_rreg = &amdgpu_invalid_rreg;
3085 adev->pciep_wreg = &amdgpu_invalid_wreg;
4fa1c6a6
TZ
3086 adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3087 adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
d38ceaf9
AD
3088 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3089 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3090 adev->didt_rreg = &amdgpu_invalid_rreg;
3091 adev->didt_wreg = &amdgpu_invalid_wreg;
ccdbb20a
RZ
3092 adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3093 adev->gc_cac_wreg = &amdgpu_invalid_wreg;
d38ceaf9
AD
3094 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3095 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3096
3e39ab90
AD
3097 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3098 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3099 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
d38ceaf9
AD
3100
3101 /* mutex initializations are all done here so we
3102 * can call these functions later without locking issues */
d38ceaf9 3103 atomic_set(&adev->irq.ih.lock, 0);
0e5ca0d1 3104 mutex_init(&adev->firmware.mutex);
d38ceaf9
AD
3105 mutex_init(&adev->pm.mutex);
3106 mutex_init(&adev->gfx.gpu_clock_mutex);
3107 mutex_init(&adev->srbm_mutex);
b8866c26 3108 mutex_init(&adev->gfx.pipe_reserve_mutex);
d23ee13f 3109 mutex_init(&adev->gfx.gfx_off_mutex);
d38ceaf9 3110 mutex_init(&adev->grbm_idx_mutex);
d38ceaf9 3111 mutex_init(&adev->mn_lock);
e23b74aa 3112 mutex_init(&adev->virt.vf_errors.lock);
d38ceaf9 3113 hash_init(adev->mn_hash);
53b3f8f4 3114 atomic_set(&adev->in_gpu_reset, 0);
6049db43 3115 init_rwsem(&adev->reset_sem);
32eaeae0 3116 mutex_init(&adev->psp.mutex);
bd052211 3117 mutex_init(&adev->notifier_lock);
d38ceaf9 3118
912dfc84
EQ
3119 r = amdgpu_device_check_arguments(adev);
3120 if (r)
3121 return r;
d38ceaf9 3122
d38ceaf9
AD
3123 spin_lock_init(&adev->mmio_idx_lock);
3124 spin_lock_init(&adev->smc_idx_lock);
3125 spin_lock_init(&adev->pcie_idx_lock);
3126 spin_lock_init(&adev->uvd_ctx_idx_lock);
3127 spin_lock_init(&adev->didt_idx_lock);
ccdbb20a 3128 spin_lock_init(&adev->gc_cac_idx_lock);
16abb5d2 3129 spin_lock_init(&adev->se_cac_idx_lock);
d38ceaf9 3130 spin_lock_init(&adev->audio_endpt_idx_lock);
95844d20 3131 spin_lock_init(&adev->mm_stats.lock);
d38ceaf9 3132
0c4e7fa5
CZ
3133 INIT_LIST_HEAD(&adev->shadow_list);
3134 mutex_init(&adev->shadow_list_lock);
3135
beff74bc
AD
3136 INIT_DELAYED_WORK(&adev->delayed_init_work,
3137 amdgpu_device_delayed_init_work_handler);
1e317b99
RZ
3138 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3139 amdgpu_device_delay_enable_gfx_off);
2dc80b00 3140
d4535e2c
AG
3141 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3142
d23ee13f 3143 adev->gfx.gfx_off_req_count = 1;
b6e79d9a 3144 adev->pm.ac_power = power_supply_is_system_supplied() > 0;
b1ddf548 3145
b265bdbd
EQ
3146 atomic_set(&adev->throttling_logging_enabled, 1);
3147 /*
3148 * If throttling continues, logging will be performed every minute
3149 * to avoid log flooding. "-1" is subtracted since the thermal
3150 * throttling interrupt comes every second. Thus, the total logging
3151 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3152 * for throttling interrupt) = 60 seconds.
3153 */
3154 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3155 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3156
0fa49558
AX
3157 /* Registers mapping */
3158 /* TODO: block userspace mapping of io register */
da69c161
KW
3159 if (adev->asic_type >= CHIP_BONAIRE) {
3160 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3161 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3162 } else {
3163 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3164 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3165 }
d38ceaf9 3166
d38ceaf9
AD
3167 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3168 if (adev->rmmio == NULL) {
3169 return -ENOMEM;
3170 }
3171 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3172 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3173
d38ceaf9
AD
3174 /* io port mapping */
3175 for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
3176 if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) {
3177 adev->rio_mem_size = pci_resource_len(adev->pdev, i);
3178 adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size);
3179 break;
3180 }
3181 }
3182 if (adev->rio_mem == NULL)
b64a18c5 3183 DRM_INFO("PCI I/O BAR is not found.\n");
d38ceaf9 3184
b2109d8e
JX
3185 /* enable PCIE atomic ops */
3186 r = pci_enable_atomic_ops_to_root(adev->pdev,
3187 PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3188 PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3189 if (r) {
3190 adev->have_atomics_support = false;
3191 DRM_INFO("PCIE atomic ops is not supported\n");
3192 } else {
3193 adev->have_atomics_support = true;
3194 }
3195
5494d864
AD
3196 amdgpu_device_get_pcie_info(adev);
3197
b239c017
JX
3198 if (amdgpu_mcbp)
3199 DRM_INFO("MCBP is enabled\n");
3200
5f84cc63
JX
3201 if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
3202 adev->enable_mes = true;
3203
3aa0115d
ML
3204 /* detect hw virtualization here */
3205 amdgpu_detect_virtualization(adev);
3206
dffa11b4
ML
3207 r = amdgpu_device_get_job_timeout_settings(adev);
3208 if (r) {
3209 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3210 return r;
a190d1c7
XY
3211 }
3212
d38ceaf9 3213 /* early init functions */
06ec9070 3214 r = amdgpu_device_ip_early_init(adev);
d38ceaf9
AD
3215 if (r)
3216 return r;
3217
6585661d
OZ
3218 /* doorbell bar mapping and doorbell index init*/
3219 amdgpu_device_doorbell_init(adev);
3220
d38ceaf9
AD
3221 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
3222 /* this will fail for cards that aren't VGA class devices, just
3223 * ignore it */
06ec9070 3224 vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode);
d38ceaf9 3225
31af062a 3226 if (amdgpu_device_supports_boco(ddev))
3840c5bc
AD
3227 boco = true;
3228 if (amdgpu_has_atpx() &&
3229 (amdgpu_is_atpx_hybrid() ||
3230 amdgpu_has_atpx_dgpu_power_cntl()) &&
3231 !pci_is_thunderbolt_attached(adev->pdev))
84c8b22e 3232 vga_switcheroo_register_client(adev->pdev,
3840c5bc
AD
3233 &amdgpu_switcheroo_ops, boco);
3234 if (boco)
d38ceaf9
AD
3235 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3236
9475a943
SL
3237 if (amdgpu_emu_mode == 1) {
3238 /* post the asic on emulation mode */
3239 emu_soc_asic_init(adev);
bfca0289 3240 goto fence_driver_init;
9475a943 3241 }
bfca0289 3242
4e99a44e
ML
3243 /* detect if we are with an SRIOV vbios */
3244 amdgpu_device_detect_sriov_bios(adev);
048765ad 3245
95e8e59e
AD
3246 /* check if we need to reset the asic
3247 * E.g., driver was not cleanly unloaded previously, etc.
3248 */
f14899fd 3249 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
95e8e59e
AD
3250 r = amdgpu_asic_reset(adev);
3251 if (r) {
3252 dev_err(adev->dev, "asic reset on init failed\n");
3253 goto failed;
3254 }
3255 }
3256
c9a6b82f
AG
3257 pci_enable_pcie_error_reporting(adev->ddev.pdev);
3258
d38ceaf9 3259 /* Post card if necessary */
39c640c0 3260 if (amdgpu_device_need_post(adev)) {
d38ceaf9 3261 if (!adev->bios) {
bec86378 3262 dev_err(adev->dev, "no vBIOS found\n");
83ba126a
AD
3263 r = -EINVAL;
3264 goto failed;
d38ceaf9 3265 }
bec86378 3266 DRM_INFO("GPU posting now...\n");
4d2997ab 3267 r = amdgpu_device_asic_init(adev);
4e99a44e
ML
3268 if (r) {
3269 dev_err(adev->dev, "gpu post error!\n");
3270 goto failed;
3271 }
d38ceaf9
AD
3272 }
3273
88b64e95
AD
3274 if (adev->is_atom_fw) {
3275 /* Initialize clocks */
3276 r = amdgpu_atomfirmware_get_clock_info(adev);
3277 if (r) {
3278 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
e23b74aa 3279 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
88b64e95
AD
3280 goto failed;
3281 }
3282 } else {
a5bde2f9
AD
3283 /* Initialize clocks */
3284 r = amdgpu_atombios_get_clock_info(adev);
3285 if (r) {
3286 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
e23b74aa 3287 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
89041940 3288 goto failed;
a5bde2f9
AD
3289 }
3290 /* init i2c buses */
4562236b
HW
3291 if (!amdgpu_device_has_dc_support(adev))
3292 amdgpu_atombios_i2c_init(adev);
2c1a2784 3293 }
d38ceaf9 3294
bfca0289 3295fence_driver_init:
d38ceaf9
AD
3296 /* Fence driver */
3297 r = amdgpu_fence_driver_init(adev);
2c1a2784
AD
3298 if (r) {
3299 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
e23b74aa 3300 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
83ba126a 3301 goto failed;
2c1a2784 3302 }
d38ceaf9
AD
3303
3304 /* init the mode config */
4a580877 3305 drm_mode_config_init(adev_to_drm(adev));
d38ceaf9 3306
06ec9070 3307 r = amdgpu_device_ip_init(adev);
d38ceaf9 3308 if (r) {
8840a387 3309 /* failed in exclusive mode due to timeout */
3310 if (amdgpu_sriov_vf(adev) &&
3311 !amdgpu_sriov_runtime(adev) &&
3312 amdgpu_virt_mmio_blocked(adev) &&
3313 !amdgpu_virt_wait_reset(adev)) {
3314 dev_err(adev->dev, "VF exclusive mode timeout\n");
1daee8b4
PD
3315 /* Don't send request since VF is inactive. */
3316 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3317 adev->virt.ops = NULL;
8840a387 3318 r = -EAGAIN;
3319 goto failed;
3320 }
06ec9070 3321 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
e23b74aa 3322 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
83ba126a 3323 goto failed;
d38ceaf9
AD
3324 }
3325
d69b8971
YZ
3326 dev_info(adev->dev,
3327 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
d7f72fe4
YZ
3328 adev->gfx.config.max_shader_engines,
3329 adev->gfx.config.max_sh_per_se,
3330 adev->gfx.config.max_cu_per_sh,
3331 adev->gfx.cu_info.number);
3332
d38ceaf9
AD
3333 adev->accel_working = true;
3334
e59c0205
AX
3335 amdgpu_vm_check_compute_bug(adev);
3336
95844d20
MO
3337 /* Initialize the buffer migration limit. */
3338 if (amdgpu_moverate >= 0)
3339 max_MBps = amdgpu_moverate;
3340 else
3341 max_MBps = 8; /* Allow 8 MB/s. */
3342 /* Get a log2 for easy divisions. */
3343 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
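	/* e.g. the default 8 MB/s yields log2_max_MBps = 3 */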
3344
9bc92b9c
ML
3345 amdgpu_fbdev_init(adev);
3346
	r = amdgpu_pm_sysfs_init(adev);
	if (r) {
		adev->pm_sysfs_en = false;
		DRM_ERROR("registering pm debugfs failed (%d).\n", r);
	} else
		adev->pm_sysfs_en = true;

	r = amdgpu_ucode_sysfs_init(adev);
	if (r) {
		adev->ucode_sysfs_en = false;
		DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
	} else
		adev->ucode_sysfs_en = true;
5bb23532 3360
d38ceaf9
AD
3361 if ((amdgpu_testing & 1)) {
3362 if (adev->accel_working)
3363 amdgpu_test_moves(adev);
3364 else
3365 DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");
3366 }
d38ceaf9
AD
3367 if (amdgpu_benchmarking) {
3368 if (adev->accel_working)
3369 amdgpu_benchmark(adev, amdgpu_benchmarking);
3370 else
3371 DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
3372 }
3373
	/*
	 * Register the gpu instance before amdgpu_device_enable_mgpu_fan_boost.
	 * Otherwise the mgpu fan boost feature will be skipped because the
	 * gpu instance count would be too low.
	 */
	amdgpu_register_gpu_instance(adev);
3380
d38ceaf9
AD
3381 /* enable clockgating, etc. after ib tests, etc. since some blocks require
3382 * explicit gating rather than handling it automatically.
3383 */
06ec9070 3384 r = amdgpu_device_ip_late_init(adev);
2c1a2784 3385 if (r) {
06ec9070 3386 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
e23b74aa 3387 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
83ba126a 3388 goto failed;
2c1a2784 3389 }
d38ceaf9 3390
108c6a63 3391 /* must succeed. */
511fdbc3 3392 amdgpu_ras_resume(adev);
108c6a63 3393
beff74bc
AD
3394 queue_delayed_work(system_wq, &adev->delayed_init_work,
3395 msecs_to_jiffies(AMDGPU_RESUME_MS));
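	/* The delayed init work runs the deferred IB tests; paths that need
	 * them finished flush this work (see the flush_delayed_work() calls
	 * below). */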
3396
2c738637
ML
3397 if (amdgpu_sriov_vf(adev))
3398 flush_delayed_work(&adev->delayed_init_work);
3399
77f3a5cd 3400 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
bd607166 3401 if (r) {
77f3a5cd 3402 dev_err(adev->dev, "Could not create amdgpu device attr\n");
bd607166
KR
3403 return r;
3404 }
3405
d155bef0
AB
3406 if (IS_ENABLED(CONFIG_PERF_EVENTS))
3407 r = amdgpu_pmu_init(adev);
9c7c85f7
JK
3408 if (r)
3409 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3410
	/* Keep a saved copy of the PCI config space at hand so it can be
	 * restored after a sudden PCI error. */
	if (amdgpu_device_cache_pci_state(adev->pdev))
		pci_restore_state(pdev);
3414
d38ceaf9 3415 return 0;
83ba126a
AD
3416
3417failed:
89041940 3418 amdgpu_vf_error_trans_all(adev);
3840c5bc 3419 if (boco)
83ba126a 3420 vga_switcheroo_fini_domain_pm_ops(adev->dev);
8840a387 3421
83ba126a 3422 return r;
d38ceaf9
AD
3423}
3424
d38ceaf9
AD
3425/**
3426 * amdgpu_device_fini - tear down the driver
3427 *
3428 * @adev: amdgpu_device pointer
3429 *
3430 * Tear down the driver info (all asics).
3431 * Called at driver shutdown.
3432 */
3433void amdgpu_device_fini(struct amdgpu_device *adev)
3434{
aac89168 3435 dev_info(adev->dev, "amdgpu: finishing device.\n");
9f875167 3436 flush_delayed_work(&adev->delayed_init_work);
d0d13fe8 3437 adev->shutdown = true;
9f875167 3438
c1dd4aa6
AG
3439 kfree(adev->pci_state);
3440
	/* Make sure the IB tests have finished before entering exclusive mode,
	 * to avoid preempting the IB tests.
	 */
	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_request_full_gpu(adev, false);
3446
e5b03032
ML
3447 /* disable all interrupts */
3448 amdgpu_irq_disable_all(adev);
ff97cba8
ML
3449 if (adev->mode_info.mode_config_initialized){
3450 if (!amdgpu_device_has_dc_support(adev))
4a580877 3451 drm_helper_force_disable_all(adev_to_drm(adev));
ff97cba8 3452 else
4a580877 3453 drm_atomic_helper_shutdown(adev_to_drm(adev));
ff97cba8 3454 }
d38ceaf9 3455 amdgpu_fence_driver_fini(adev);
7c868b59
YT
3456 if (adev->pm_sysfs_en)
3457 amdgpu_pm_sysfs_fini(adev);
d38ceaf9 3458 amdgpu_fbdev_fini(adev);
e230ac11 3459 amdgpu_device_ip_fini(adev);
75e1658e
ND
3460 release_firmware(adev->firmware.gpu_info_fw);
3461 adev->firmware.gpu_info_fw = NULL;
d38ceaf9
AD
3462 adev->accel_working = false;
3463 /* free i2c buses */
4562236b
HW
3464 if (!amdgpu_device_has_dc_support(adev))
3465 amdgpu_i2c_fini(adev);
bfca0289
SL
3466
3467 if (amdgpu_emu_mode != 1)
3468 amdgpu_atombios_fini(adev);
3469
d38ceaf9
AD
3470 kfree(adev->bios);
3471 adev->bios = NULL;
3840c5bc
AD
3472 if (amdgpu_has_atpx() &&
3473 (amdgpu_is_atpx_hybrid() ||
3474 amdgpu_has_atpx_dgpu_power_cntl()) &&
3475 !pci_is_thunderbolt_attached(adev->pdev))
84c8b22e 3476 vga_switcheroo_unregister_client(adev->pdev);
4a580877 3477 if (amdgpu_device_supports_boco(adev_to_drm(adev)))
83ba126a 3478 vga_switcheroo_fini_domain_pm_ops(adev->dev);
d38ceaf9
AD
3479 vga_client_register(adev->pdev, NULL, NULL, NULL);
3480 if (adev->rio_mem)
3481 pci_iounmap(adev->pdev, adev->rio_mem);
3482 adev->rio_mem = NULL;
3483 iounmap(adev->rmmio);
3484 adev->rmmio = NULL;
06ec9070 3485 amdgpu_device_doorbell_fini(adev);
e9bc1bf7 3486
7c868b59
YT
3487 if (adev->ucode_sysfs_en)
3488 amdgpu_ucode_sysfs_fini(adev);
77f3a5cd
ND
3489
3490 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
d155bef0
AB
3491 if (IS_ENABLED(CONFIG_PERF_EVENTS))
3492 amdgpu_pmu_fini(adev);
72de33f8 3493 if (adev->mman.discovery_bin)
a190d1c7 3494 amdgpu_discovery_fini(adev);
d38ceaf9
AD
3495}
3496
3497
3498/*
3499 * Suspend & resume.
3500 */
/**
 * amdgpu_device_suspend - initiate device suspend
 *
 * @dev: drm dev pointer
 * @fbcon: notify the fbdev of suspend
 *
 * Puts the hw in the suspend state (all asics).
 * Returns 0 for success or an error on failure.
 * Called at driver suspend.
 */
de185019 3511int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
d38ceaf9
AD
3512{
3513 struct amdgpu_device *adev;
3514 struct drm_crtc *crtc;
3515 struct drm_connector *connector;
f8d2d39e 3516 struct drm_connector_list_iter iter;
5ceb54c6 3517 int r;
d38ceaf9 3518
1348969a 3519 adev = drm_to_adev(dev);
d38ceaf9
AD
3520
3521 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3522 return 0;
3523
44779b43 3524 adev->in_suspend = true;
d38ceaf9
AD
3525 drm_kms_helper_poll_disable(dev);
3526
5f818173
S
3527 if (fbcon)
3528 amdgpu_fbdev_set_suspend(adev, 1);
3529
beff74bc 3530 cancel_delayed_work_sync(&adev->delayed_init_work);
a5459475 3531
4562236b
HW
3532 if (!amdgpu_device_has_dc_support(adev)) {
3533 /* turn off display hw */
3534 drm_modeset_lock_all(dev);
f8d2d39e
LP
3535 drm_connector_list_iter_begin(dev, &iter);
3536 drm_for_each_connector_iter(connector, &iter)
3537 drm_helper_connector_dpms(connector,
3538 DRM_MODE_DPMS_OFF);
3539 drm_connector_list_iter_end(&iter);
4562236b 3540 drm_modeset_unlock_all(dev);
fe1053b7
AD
3541 /* unpin the front buffers and cursors */
3542 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3543 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3544 struct drm_framebuffer *fb = crtc->primary->fb;
3545 struct amdgpu_bo *robj;
3546
91334223 3547 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
fe1053b7
AD
3548 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3549 r = amdgpu_bo_reserve(aobj, true);
3550 if (r == 0) {
3551 amdgpu_bo_unpin(aobj);
3552 amdgpu_bo_unreserve(aobj);
3553 }
756e6880 3554 }
756e6880 3555
fe1053b7
AD
3556 if (fb == NULL || fb->obj[0] == NULL) {
3557 continue;
3558 }
3559 robj = gem_to_amdgpu_bo(fb->obj[0]);
3560 /* don't unpin kernel fb objects */
3561 if (!amdgpu_fbdev_robj_is_fb(adev, robj)) {
3562 r = amdgpu_bo_reserve(robj, true);
3563 if (r == 0) {
3564 amdgpu_bo_unpin(robj);
3565 amdgpu_bo_unreserve(robj);
3566 }
d38ceaf9
AD
3567 }
3568 }
3569 }
fe1053b7 3570
5e6932fe 3571 amdgpu_ras_suspend(adev);
3572
fe1053b7
AD
3573 r = amdgpu_device_ip_suspend_phase1(adev);
3574
94fa5660
EQ
3575 amdgpu_amdkfd_suspend(adev, !fbcon);
3576
d38ceaf9
AD
3577 /* evict vram memory */
3578 amdgpu_bo_evict_vram(adev);
3579
5ceb54c6 3580 amdgpu_fence_driver_suspend(adev);
d38ceaf9 3581
fe1053b7 3582 r = amdgpu_device_ip_suspend_phase2(adev);
d38ceaf9 3583
a0a71e49
AD
3584 /* evict remaining vram memory
3585 * This second call to evict vram is to evict the gart page table
3586 * using the CPU.
3587 */
d38ceaf9
AD
3588 amdgpu_bo_evict_vram(adev);
3589
d38ceaf9
AD
3590 return 0;
3591}
3592
/**
 * amdgpu_device_resume - initiate device resume
 *
 * @dev: drm dev pointer
 * @fbcon: notify the fbdev of resume
 *
 * Bring the hw back to operating state (all asics).
 * Returns 0 for success or an error on failure.
 * Called at driver resume.
 */
de185019 3603int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
d38ceaf9
AD
3604{
3605 struct drm_connector *connector;
f8d2d39e 3606 struct drm_connector_list_iter iter;
1348969a 3607 struct amdgpu_device *adev = drm_to_adev(dev);
756e6880 3608 struct drm_crtc *crtc;
03161a6e 3609 int r = 0;
d38ceaf9
AD
3610
3611 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3612 return 0;
3613
d38ceaf9 3614 /* post card */
39c640c0 3615 if (amdgpu_device_need_post(adev)) {
4d2997ab 3616 r = amdgpu_device_asic_init(adev);
74b0b157 3617 if (r)
aac89168 3618 dev_err(adev->dev, "amdgpu asic init failed\n");
74b0b157 3619 }
d38ceaf9 3620
06ec9070 3621 r = amdgpu_device_ip_resume(adev);
e6707218 3622 if (r) {
aac89168 3623 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
4d3b9ae5 3624 return r;
e6707218 3625 }
5ceb54c6
AD
3626 amdgpu_fence_driver_resume(adev);
3627
d38ceaf9 3628
06ec9070 3629 r = amdgpu_device_ip_late_init(adev);
03161a6e 3630 if (r)
4d3b9ae5 3631 return r;
d38ceaf9 3632
beff74bc
AD
3633 queue_delayed_work(system_wq, &adev->delayed_init_work,
3634 msecs_to_jiffies(AMDGPU_RESUME_MS));
3635
fe1053b7
AD
3636 if (!amdgpu_device_has_dc_support(adev)) {
3637 /* pin cursors */
3638 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3639 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3640
91334223 3641 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
fe1053b7
AD
3642 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3643 r = amdgpu_bo_reserve(aobj, true);
3644 if (r == 0) {
3645 r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM);
3646 if (r != 0)
aac89168 3647 dev_err(adev->dev, "Failed to pin cursor BO (%d)\n", r);
fe1053b7
AD
3648 amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj);
3649 amdgpu_bo_unreserve(aobj);
3650 }
756e6880
AD
3651 }
3652 }
3653 }
9593f4d6 3654 r = amdgpu_amdkfd_resume(adev, !fbcon);
ba997709
YZ
3655 if (r)
3656 return r;
756e6880 3657
96a5d8d4 3658 /* Make sure IB tests flushed */
beff74bc 3659 flush_delayed_work(&adev->delayed_init_work);
96a5d8d4 3660
d38ceaf9
AD
3661 /* blat the mode back in */
3662 if (fbcon) {
4562236b
HW
3663 if (!amdgpu_device_has_dc_support(adev)) {
3664 /* pre DCE11 */
3665 drm_helper_resume_force_mode(dev);
3666
3667 /* turn on display hw */
3668 drm_modeset_lock_all(dev);
f8d2d39e
LP
3669
3670 drm_connector_list_iter_begin(dev, &iter);
3671 drm_for_each_connector_iter(connector, &iter)
3672 drm_helper_connector_dpms(connector,
3673 DRM_MODE_DPMS_ON);
3674 drm_connector_list_iter_end(&iter);
3675
4562236b 3676 drm_modeset_unlock_all(dev);
d38ceaf9 3677 }
4d3b9ae5 3678 amdgpu_fbdev_set_suspend(adev, 0);
d38ceaf9
AD
3679 }
3680
3681 drm_kms_helper_poll_enable(dev);
23a1a9e5 3682
5e6932fe 3683 amdgpu_ras_resume(adev);
3684
23a1a9e5
L
3685 /*
3686 * Most of the connector probing functions try to acquire runtime pm
3687 * refs to ensure that the GPU is powered on when connector polling is
3688 * performed. Since we're calling this from a runtime PM callback,
3689 * trying to acquire rpm refs will cause us to deadlock.
3690 *
3691 * Since we're guaranteed to be holding the rpm lock, it's safe to
3692 * temporarily disable the rpm helpers so this doesn't deadlock us.
3693 */
3694#ifdef CONFIG_PM
3695 dev->dev->power.disable_depth++;
3696#endif
4562236b
HW
3697 if (!amdgpu_device_has_dc_support(adev))
3698 drm_helper_hpd_irq_event(dev);
3699 else
3700 drm_kms_helper_hotplug_event(dev);
23a1a9e5
L
3701#ifdef CONFIG_PM
3702 dev->dev->power.disable_depth--;
3703#endif
44779b43
RZ
3704 adev->in_suspend = false;
3705
4d3b9ae5 3706 return 0;
d38ceaf9
AD
3707}
3708
e3ecdffa
AD
3709/**
3710 * amdgpu_device_ip_check_soft_reset - did soft reset succeed
3711 *
3712 * @adev: amdgpu_device pointer
3713 *
3714 * The list of all the hardware IPs that make up the asic is walked and
3715 * the check_soft_reset callbacks are run. check_soft_reset determines
3716 * if the asic is still hung or not.
3717 * Returns true if any of the IPs are still in a hung state, false if not.
3718 */
06ec9070 3719static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
63fbf42f
CZ
3720{
3721 int i;
3722 bool asic_hang = false;
3723
f993d628
ML
3724 if (amdgpu_sriov_vf(adev))
3725 return true;
3726
8bc04c29
AD
3727 if (amdgpu_asic_need_full_reset(adev))
3728 return true;
3729
63fbf42f 3730 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 3731 if (!adev->ip_blocks[i].status.valid)
63fbf42f 3732 continue;
a1255107
AD
3733 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
3734 adev->ip_blocks[i].status.hang =
3735 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
3736 if (adev->ip_blocks[i].status.hang) {
aac89168 3737 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
63fbf42f
CZ
3738 asic_hang = true;
3739 }
3740 }
3741 return asic_hang;
3742}
3743
e3ecdffa
AD
3744/**
3745 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
3746 *
3747 * @adev: amdgpu_device pointer
3748 *
3749 * The list of all the hardware IPs that make up the asic is walked and the
3750 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset
3751 * handles any IP specific hardware or software state changes that are
3752 * necessary for a soft reset to succeed.
3753 * Returns 0 on success, negative error code on failure.
3754 */
06ec9070 3755static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
d31a501e
CZ
3756{
3757 int i, r = 0;
3758
3759 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 3760 if (!adev->ip_blocks[i].status.valid)
d31a501e 3761 continue;
a1255107
AD
3762 if (adev->ip_blocks[i].status.hang &&
3763 adev->ip_blocks[i].version->funcs->pre_soft_reset) {
3764 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
d31a501e
CZ
3765 if (r)
3766 return r;
3767 }
3768 }
3769
3770 return 0;
3771}
3772
e3ecdffa
AD
3773/**
3774 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
3775 *
3776 * @adev: amdgpu_device pointer
3777 *
3778 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu
3779 * reset is necessary to recover.
3780 * Returns true if a full asic reset is required, false if not.
3781 */
06ec9070 3782static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
35d782fe 3783{
da146d3b
AD
3784 int i;
3785
8bc04c29
AD
3786 if (amdgpu_asic_need_full_reset(adev))
3787 return true;
3788
da146d3b 3789 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 3790 if (!adev->ip_blocks[i].status.valid)
da146d3b 3791 continue;
a1255107
AD
3792 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
3793 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
3794 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
98512bb8
KW
3795 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
3796 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
a1255107 3797 if (adev->ip_blocks[i].status.hang) {
aac89168 3798 dev_info(adev->dev, "Some block need full reset!\n");
da146d3b
AD
3799 return true;
3800 }
3801 }
35d782fe
CZ
3802 }
3803 return false;
3804}
3805
e3ecdffa
AD
3806/**
3807 * amdgpu_device_ip_soft_reset - do a soft reset
3808 *
3809 * @adev: amdgpu_device pointer
3810 *
3811 * The list of all the hardware IPs that make up the asic is walked and the
3812 * soft_reset callbacks are run if the block is hung. soft_reset handles any
3813 * IP specific hardware or software state changes that are necessary to soft
3814 * reset the IP.
3815 * Returns 0 on success, negative error code on failure.
3816 */
06ec9070 3817static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
35d782fe
CZ
3818{
3819 int i, r = 0;
3820
3821 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 3822 if (!adev->ip_blocks[i].status.valid)
35d782fe 3823 continue;
a1255107
AD
3824 if (adev->ip_blocks[i].status.hang &&
3825 adev->ip_blocks[i].version->funcs->soft_reset) {
3826 r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
35d782fe
CZ
3827 if (r)
3828 return r;
3829 }
3830 }
3831
3832 return 0;
3833}
3834
e3ecdffa
AD
3835/**
3836 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
3837 *
3838 * @adev: amdgpu_device pointer
3839 *
3840 * The list of all the hardware IPs that make up the asic is walked and the
3841 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
3842 * handles any IP specific hardware or software state changes that are
3843 * necessary after the IP has been soft reset.
3844 * Returns 0 on success, negative error code on failure.
3845 */
06ec9070 3846static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
35d782fe
CZ
3847{
3848 int i, r = 0;
3849
3850 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 3851 if (!adev->ip_blocks[i].status.valid)
35d782fe 3852 continue;
a1255107
AD
3853 if (adev->ip_blocks[i].status.hang &&
3854 adev->ip_blocks[i].version->funcs->post_soft_reset)
3855 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
35d782fe
CZ
3856 if (r)
3857 return r;
3858 }
3859
3860 return 0;
3861}
3862
e3ecdffa 3863/**
c33adbc7 3864 * amdgpu_device_recover_vram - Recover some VRAM contents
e3ecdffa
AD
3865 *
3866 * @adev: amdgpu_device pointer
3867 *
3868 * Restores the contents of VRAM buffers from the shadows in GTT. Used to
3869 * restore things like GPUVM page tables after a GPU reset where
3870 * the contents of VRAM might be lost.
403009bf
CK
3871 *
3872 * Returns:
3873 * 0 on success, negative error code on failure.
e3ecdffa 3874 */
c33adbc7 3875static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
c41d1cf6 3876{
c41d1cf6 3877 struct dma_fence *fence = NULL, *next = NULL;
403009bf
CK
3878 struct amdgpu_bo *shadow;
3879 long r = 1, tmo;
c41d1cf6
ML
3880
3881 if (amdgpu_sriov_runtime(adev))
b045d3af 3882 tmo = msecs_to_jiffies(8000);
c41d1cf6
ML
3883 else
3884 tmo = msecs_to_jiffies(100);
3885
aac89168 3886 dev_info(adev->dev, "recover vram bo from shadow start\n");
c41d1cf6 3887 mutex_lock(&adev->shadow_list_lock);
403009bf
CK
3888 list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {
3889
3890 /* No need to recover an evicted BO */
3891 if (shadow->tbo.mem.mem_type != TTM_PL_TT ||
b575f10d 3892 shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET ||
403009bf
CK
3893 shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM)
3894 continue;
3895
3896 r = amdgpu_bo_restore_shadow(shadow, &next);
3897 if (r)
3898 break;
3899
c41d1cf6 3900 if (fence) {
1712fb1a 3901 tmo = dma_fence_wait_timeout(fence, false, tmo);
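			/* dma_fence_wait_timeout() returns the remaining
			 * jiffies (> 0), 0 on timeout, or a negative error. */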
403009bf
CK
3902 dma_fence_put(fence);
3903 fence = next;
1712fb1a 3904 if (tmo == 0) {
3905 r = -ETIMEDOUT;
c41d1cf6 3906 break;
1712fb1a 3907 } else if (tmo < 0) {
3908 r = tmo;
3909 break;
3910 }
403009bf
CK
3911 } else {
3912 fence = next;
c41d1cf6 3913 }
c41d1cf6
ML
3914 }
3915 mutex_unlock(&adev->shadow_list_lock);
3916
403009bf
CK
3917 if (fence)
3918 tmo = dma_fence_wait_timeout(fence, false, tmo);
c41d1cf6
ML
3919 dma_fence_put(fence);
3920
1712fb1a 3921 if (r < 0 || tmo <= 0) {
aac89168 3922 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
403009bf
CK
3923 return -EIO;
3924 }
c41d1cf6 3925
aac89168 3926 dev_info(adev->dev, "recover vram bo from shadow done\n");
403009bf 3927 return 0;
c41d1cf6
ML
3928}
3929
a90ad3c2 3930
/**
 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
 *
 * @adev: amdgpu device pointer
 * @from_hypervisor: request from hypervisor
 *
 * Performs a VF FLR and reinitializes the ASIC.
 * Returns 0 on success, negative error code on failure.
 */
3940static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
3941 bool from_hypervisor)
5740682e
ML
3942{
3943 int r;
3944
3945 if (from_hypervisor)
3946 r = amdgpu_virt_request_full_gpu(adev, true);
3947 else
3948 r = amdgpu_virt_reset_gpu(adev);
3949 if (r)
3950 return r;
a90ad3c2 3951
b639c22c
JZ
3952 amdgpu_amdkfd_pre_reset(adev);
3953
a90ad3c2 3954 /* Resume IP prior to SMC */
06ec9070 3955 r = amdgpu_device_ip_reinit_early_sriov(adev);
5740682e
ML
3956 if (r)
3957 goto error;
a90ad3c2 3958
c9ffa427 3959 amdgpu_virt_init_data_exchange(adev);
a90ad3c2 3960 /* we need recover gart prior to run SMC/CP/SDMA resume */
6c28aed6 3961 amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
a90ad3c2 3962
7a3e0bb2
RZ
3963 r = amdgpu_device_fw_loading(adev);
3964 if (r)
3965 return r;
3966
a90ad3c2 3967 /* now we are okay to resume SMC/CP/SDMA */
06ec9070 3968 r = amdgpu_device_ip_reinit_late_sriov(adev);
5740682e
ML
3969 if (r)
3970 goto error;
a90ad3c2
ML
3971
3972 amdgpu_irq_gpu_reset_resume_helper(adev);
5740682e 3973 r = amdgpu_ib_ring_tests(adev);
f81e8d53 3974 amdgpu_amdkfd_post_reset(adev);
a90ad3c2 3975
abc34253
ED
3976error:
3977 amdgpu_virt_release_full_gpu(adev, true);
c41d1cf6 3978 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
e3526257 3979 amdgpu_inc_vram_lost(adev);
c33adbc7 3980 r = amdgpu_device_recover_vram(adev);
a90ad3c2
ML
3981 }
3982
3983 return r;
3984}
3985
/**
 * amdgpu_device_has_job_running - check if any job is still running
 *
 * @adev: amdgpu device pointer
 *
 * Walks all scheduler rings and returns true if any of them still has
 * a job in its mirror list.
 */
3993bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
3994{
3995 int i;
3996 struct drm_sched_job *job;
3997
3998 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
3999 struct amdgpu_ring *ring = adev->rings[i];
4000
4001 if (!ring || !ring->sched.thread)
4002 continue;
4003
4004 spin_lock(&ring->sched.job_list_lock);
4005 job = list_first_entry_or_null(&ring->sched.ring_mirror_list,
4006 struct drm_sched_job, node);
4007 spin_unlock(&ring->sched.job_list_lock);
4008 if (job)
4009 return true;
4010 }
4011 return false;
4012}
4013
12938fad
CK
4014/**
4015 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4016 *
4017 * @adev: amdgpu device pointer
4018 *
4019 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4020 * a hung GPU.
4021 */
4022bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4023{
4024 if (!amdgpu_device_ip_check_soft_reset(adev)) {
aac89168 4025 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
12938fad
CK
4026 return false;
4027 }
4028
3ba7b418
AG
4029 if (amdgpu_gpu_recovery == 0)
4030 goto disabled;
4031
4032 if (amdgpu_sriov_vf(adev))
4033 return true;
4034
4035 if (amdgpu_gpu_recovery == -1) {
4036 switch (adev->asic_type) {
fc42d47c
AG
4037 case CHIP_BONAIRE:
4038 case CHIP_HAWAII:
3ba7b418
AG
4039 case CHIP_TOPAZ:
4040 case CHIP_TONGA:
4041 case CHIP_FIJI:
4042 case CHIP_POLARIS10:
4043 case CHIP_POLARIS11:
4044 case CHIP_POLARIS12:
4045 case CHIP_VEGAM:
4046 case CHIP_VEGA20:
4047 case CHIP_VEGA10:
4048 case CHIP_VEGA12:
c43b849f 4049 case CHIP_RAVEN:
e9d4cf91 4050 case CHIP_ARCTURUS:
2cb44fb0 4051 case CHIP_RENOIR:
658c6639
AD
4052 case CHIP_NAVI10:
4053 case CHIP_NAVI14:
4054 case CHIP_NAVI12:
131a3c74 4055 case CHIP_SIENNA_CICHLID:
3ba7b418
AG
4056 break;
4057 default:
4058 goto disabled;
4059 }
12938fad
CK
4060 }
4061
4062 return true;
3ba7b418
AG
4063
4064disabled:
aac89168 4065 dev_info(adev->dev, "GPU recovery disabled.\n");
3ba7b418 4066 return false;
12938fad
CK
4067}
4068
5c6dd71e 4069
26bc5340
AG
4070static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4071 struct amdgpu_job *job,
4072 bool *need_full_reset_arg)
4073{
4074 int i, r = 0;
4075 bool need_full_reset = *need_full_reset_arg;
71182665 4076
728e7e0c
JZ
4077 amdgpu_debugfs_wait_dump(adev);
4078
71182665 4079 /* block all schedulers and reset given job's ring */
0875dc9e
CZ
4080 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4081 struct amdgpu_ring *ring = adev->rings[i];
4082
51687759 4083 if (!ring || !ring->sched.thread)
0875dc9e 4084 continue;
5740682e 4085
2f9d4084
ML
4086 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4087 amdgpu_fence_driver_force_completion(ring);
0875dc9e 4088 }
d38ceaf9 4089
222b5f04
AG
4090 if(job)
4091 drm_sched_increase_karma(&job->base);
4092
1d721ed6 4093 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
26bc5340
AG
4094 if (!amdgpu_sriov_vf(adev)) {
4095
4096 if (!need_full_reset)
4097 need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4098
4099 if (!need_full_reset) {
4100 amdgpu_device_ip_pre_soft_reset(adev);
4101 r = amdgpu_device_ip_soft_reset(adev);
4102 amdgpu_device_ip_post_soft_reset(adev);
4103 if (r || amdgpu_device_ip_check_soft_reset(adev)) {
aac89168 4104 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
26bc5340
AG
4105 need_full_reset = true;
4106 }
4107 }
4108
4109 if (need_full_reset)
4110 r = amdgpu_device_ip_suspend(adev);
4111
4112 *need_full_reset_arg = need_full_reset;
4113 }
4114
4115 return r;
4116}
4117
041a62bc 4118static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
26bc5340 4119 struct list_head *device_list_handle,
7ac71382
AG
4120 bool *need_full_reset_arg,
4121 bool skip_hw_reset)
26bc5340
AG
4122{
4123 struct amdgpu_device *tmp_adev = NULL;
4124 bool need_full_reset = *need_full_reset_arg, vram_lost = false;
4125 int r = 0;
4126
4127 /*
	 * ASIC reset has to be done on all XGMI hive nodes ASAP
4129 * to allow proper links negotiation in FW (within 1 sec)
4130 */
7ac71382 4131 if (!skip_hw_reset && need_full_reset) {
26bc5340 4132 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
041a62bc 4133 /* For XGMI run all resets in parallel to speed up the process */
d4535e2c 4134 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
c96cf282 4135 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
d4535e2c
AG
4136 r = -EALREADY;
4137 } else
4138 r = amdgpu_asic_reset(tmp_adev);
d4535e2c 4139
041a62bc 4140 if (r) {
aac89168 4141 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
4a580877 4142 r, adev_to_drm(tmp_adev)->unique);
041a62bc 4143 break;
ce316fa5
LM
4144 }
4145 }
4146
041a62bc
AG
4147 /* For XGMI wait for all resets to complete before proceed */
4148 if (!r) {
ce316fa5
LM
4149 list_for_each_entry(tmp_adev, device_list_handle,
4150 gmc.xgmi.head) {
4151 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4152 flush_work(&tmp_adev->xgmi_reset_work);
4153 r = tmp_adev->asic_reset_res;
4154 if (r)
4155 break;
ce316fa5
LM
4156 }
4157 }
4158 }
ce316fa5 4159 }
26bc5340 4160
43c4d576
JC
4161 if (!r && amdgpu_ras_intr_triggered()) {
4162 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4163 if (tmp_adev->mmhub.funcs &&
4164 tmp_adev->mmhub.funcs->reset_ras_error_count)
4165 tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev);
4166 }
4167
00eaa571 4168 amdgpu_ras_intr_cleared();
43c4d576 4169 }
00eaa571 4170
26bc5340
AG
4171 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4172 if (need_full_reset) {
4173 /* post card */
4d2997ab 4174 if (amdgpu_device_asic_init(tmp_adev))
aac89168 4175 dev_warn(tmp_adev->dev, "asic atom init failed!");
26bc5340
AG
4176
4177 if (!r) {
4178 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4179 r = amdgpu_device_ip_resume_phase1(tmp_adev);
4180 if (r)
4181 goto out;
4182
4183 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4184 if (vram_lost) {
77e7f829 4185 DRM_INFO("VRAM is lost due to GPU reset!\n");
e3526257 4186 amdgpu_inc_vram_lost(tmp_adev);
26bc5340
AG
4187 }
4188
6c28aed6 4189 r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
26bc5340
AG
4190 if (r)
4191 goto out;
4192
4193 r = amdgpu_device_fw_loading(tmp_adev);
4194 if (r)
4195 return r;
4196
4197 r = amdgpu_device_ip_resume_phase2(tmp_adev);
4198 if (r)
4199 goto out;
4200
4201 if (vram_lost)
4202 amdgpu_device_fill_reset_magic(tmp_adev);
4203
fdafb359
EQ
				/*
				 * Add this ASIC back as tracked since the
				 * reset has already completed successfully.
				 */
4208 amdgpu_register_gpu_instance(tmp_adev);
4209
7c04ca50 4210 r = amdgpu_device_ip_late_init(tmp_adev);
4211 if (r)
4212 goto out;
4213
565d1941
EQ
4214 amdgpu_fbdev_set_suspend(tmp_adev, 0);
4215
e8fbaf03
GC
				/*
				 * The GPU enters a bad state once the number of
				 * faulty pages detected by ECC reaches the
				 * threshold, and RAS recovery is scheduled next.
				 * So check here and break out of recovery if the
				 * bad page threshold has indeed been exceeded,
				 * reminding the user to either retire this GPU or
				 * set a bigger bad_page_threshold so the next
				 * driver probe can succeed.
				 */
4226 if (!amdgpu_ras_check_err_threshold(tmp_adev)) {
4227 /* must succeed. */
4228 amdgpu_ras_resume(tmp_adev);
4229 } else {
4230 r = -EINVAL;
4231 goto out;
4232 }
e79a04d5 4233
26bc5340
AG
4234 /* Update PSP FW topology after reset */
4235 if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4236 r = amdgpu_xgmi_update_topology(hive, tmp_adev);
4237 }
4238 }
4239
26bc5340
AG
4240out:
4241 if (!r) {
4242 amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4243 r = amdgpu_ib_ring_tests(tmp_adev);
4244 if (r) {
4245 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
4246 r = amdgpu_device_ip_suspend(tmp_adev);
4247 need_full_reset = true;
4248 r = -EAGAIN;
4249 goto end;
4250 }
4251 }
4252
4253 if (!r)
4254 r = amdgpu_device_recover_vram(tmp_adev);
4255 else
4256 tmp_adev->asic_reset_res = r;
4257 }
4258
4259end:
4260 *need_full_reset_arg = need_full_reset;
4261 return r;
4262}
4263
08ebb485
DL
4264static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
4265 struct amdgpu_hive_info *hive)
26bc5340 4266{
53b3f8f4
DL
4267 if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
4268 return false;
4269
08ebb485
DL
4270 if (hive) {
4271 down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
4272 } else {
4273 down_write(&adev->reset_sem);
4274 }
5740682e 4275
26bc5340 4276 atomic_inc(&adev->gpu_reset_counter);
a3a09142
AD
4277 switch (amdgpu_asic_reset_method(adev)) {
4278 case AMD_RESET_METHOD_MODE1:
4279 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
4280 break;
4281 case AMD_RESET_METHOD_MODE2:
4282 adev->mp1_state = PP_MP1_STATE_RESET;
4283 break;
4284 default:
4285 adev->mp1_state = PP_MP1_STATE_NONE;
4286 break;
4287 }
1d721ed6
AG
4288
4289 return true;
26bc5340 4290}
d38ceaf9 4291
26bc5340
AG
4292static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
4293{
89041940 4294 amdgpu_vf_error_trans_all(adev);
a3a09142 4295 adev->mp1_state = PP_MP1_STATE_NONE;
53b3f8f4 4296 atomic_set(&adev->in_gpu_reset, 0);
6049db43 4297 up_write(&adev->reset_sem);
26bc5340
AG
4298}
4299
3f12acc8
EQ
4300static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
4301{
4302 struct pci_dev *p = NULL;
4303
4304 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4305 adev->pdev->bus->number, 1);
4306 if (p) {
4307 pm_runtime_enable(&(p->dev));
4308 pm_runtime_resume(&(p->dev));
4309 }
4310}
4311
4312static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
4313{
4314 enum amd_reset_method reset_method;
4315 struct pci_dev *p = NULL;
4316 u64 expires;
4317
4318 /*
4319 * For now, only BACO and mode1 reset are confirmed
4320 * to suffer the audio issue without proper suspended.
4321 */
4322 reset_method = amdgpu_asic_reset_method(adev);
4323 if ((reset_method != AMD_RESET_METHOD_BACO) &&
4324 (reset_method != AMD_RESET_METHOD_MODE1))
4325 return -EINVAL;
4326
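	/*
	 * The GPU's HDMI/DP audio codec sits on the same PCI device as the
	 * graphics function; function 1 here is assumed to be that audio
	 * function.
	 */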
4327 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4328 adev->pdev->bus->number, 1);
4329 if (!p)
4330 return -ENODEV;
4331
4332 expires = pm_runtime_autosuspend_expiration(&(p->dev));
4333 if (!expires)
4334 /*
4335 * If we cannot get the audio device autosuspend delay,
4336 * a fixed 4S interval will be used. Considering 3S is
4337 * the audio controller default autosuspend delay setting.
4338 * 4S used here is guaranteed to cover that.
4339 */
54b7feb9 4340 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
3f12acc8
EQ
4341
4342 while (!pm_runtime_status_suspended(&(p->dev))) {
4343 if (!pm_runtime_suspend(&(p->dev)))
4344 break;
4345
4346 if (expires < ktime_get_mono_fast_ns()) {
4347 dev_warn(adev->dev, "failed to suspend display audio\n");
4348 /* TODO: abort the succeeding gpu reset? */
4349 return -ETIMEDOUT;
4350 }
4351 }
4352
4353 pm_runtime_disable(&(p->dev));
4354
4355 return 0;
4356}
4357
26bc5340
AG
/**
 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
 *
 * @adev: amdgpu device pointer
 * @job: the job that triggered the hang, or NULL
 *
 * Attempt to reset the GPU if it has hung (all asics).
 * Attempt a soft reset or a full reset and reinitialize the ASIC.
 * Returns 0 for success or an error on failure.
 */
4368
4369int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
4370 struct amdgpu_job *job)
4371{
1d721ed6 4372 struct list_head device_list, *device_list_handle = NULL;
7dd8c205
EQ
4373 bool need_full_reset = false;
4374 bool job_signaled = false;
26bc5340 4375 struct amdgpu_hive_info *hive = NULL;
26bc5340 4376 struct amdgpu_device *tmp_adev = NULL;
1d721ed6 4377 int i, r = 0;
bb5c7235 4378 bool need_emergency_restart = false;
3f12acc8 4379 bool audio_suspended = false;
26bc5340 4380
	/*
	 * Special case: RAS triggered and full reset isn't supported
	 */
4384 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
4385
d5ea093e
AG
4386 /*
4387 * Flush RAM to disk so that after reboot
4388 * the user can read log and see why the system rebooted.
4389 */
bb5c7235 4390 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
d5ea093e
AG
4391 DRM_WARN("Emergency reboot.");
4392
4393 ksys_sync_helper();
4394 emergency_restart();
4395 }
4396
b823821f 4397 dev_info(adev->dev, "GPU %s begin!\n",
bb5c7235 4398 need_emergency_restart ? "jobs stop":"reset");
26bc5340
AG
4399
	/*
	 * Here we trylock to avoid a chain of resets executing while this
	 * timeout handler is running, triggered either by jobs on different
	 * adevs in an XGMI hive or by jobs on different schedulers of the
	 * same device. We always reset all schedulers of a device and all
	 * devices of an XGMI hive, so that should take care of them too.
	 */
d95e8e97 4407 hive = amdgpu_get_xgmi_hive(adev);
53b3f8f4
DL
4408 if (hive) {
4409 if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
4410 DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
4411 job ? job->base.id : -1, hive->hive_id);
d95e8e97 4412 amdgpu_put_xgmi_hive(hive);
53b3f8f4
DL
4413 return 0;
4414 }
4415 mutex_lock(&hive->hive_lock);
1d721ed6 4416 }
26bc5340 4417
9e94d22c
EQ
4418 /*
4419 * Build list of devices to reset.
4420 * In case we are in XGMI hive mode, resort the device list
4421 * to put adev in the 1st position.
4422 */
4423 INIT_LIST_HEAD(&device_list);
4424 if (adev->gmc.xgmi.num_physical_nodes > 1) {
4425 if (!hive)
26bc5340 4426 return -ENODEV;
9e94d22c
EQ
4427 if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
4428 list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
26bc5340
AG
4429 device_list_handle = &hive->device_list;
4430 } else {
4431 list_add_tail(&adev->gmc.xgmi.head, &device_list);
4432 device_list_handle = &device_list;
4433 }
4434
1d721ed6
AG
4435 /* block all schedulers and reset given job's ring */
4436 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
08ebb485 4437 if (!amdgpu_device_lock_adev(tmp_adev, hive)) {
aac89168 4438 dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
9e94d22c 4439 job ? job->base.id : -1);
cbfd17f7
DL
4440 r = 0;
4441 goto skip_recovery;
7c6e68c7
AG
4442 }
4443
3f12acc8
EQ
		/*
		 * Try to put the audio codec into suspend state
		 * before the gpu reset starts.
		 *
		 * The power domain of the graphics device is shared with
		 * the AZ (audio) power domain. Without this, we may change
		 * the audio hardware behind the audio driver's back, which
		 * triggers audio codec errors.
		 */
4454 if (!amdgpu_device_suspend_display_audio(tmp_adev))
4455 audio_suspended = true;
4456
9e94d22c
EQ
4457 amdgpu_ras_set_error_query_ready(tmp_adev, false);
4458
52fb44cf
EQ
4459 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
4460
9e94d22c
EQ
4461 if (!amdgpu_sriov_vf(tmp_adev))
4462 amdgpu_amdkfd_pre_reset(tmp_adev);
4463
12ffa55d
AG
		/*
		 * Mark these ASICs to be reset as untracked first,
		 * and add them back after the reset completes.
		 */
4468 amdgpu_unregister_gpu_instance(tmp_adev);
4469
a2f63ee8 4470 amdgpu_fbdev_set_suspend(tmp_adev, 1);
565d1941 4471
f1c1314b 4472 /* disable ras on ALL IPs */
bb5c7235 4473 if (!need_emergency_restart &&
b823821f 4474 amdgpu_device_ip_need_full_reset(tmp_adev))
f1c1314b 4475 amdgpu_ras_suspend(tmp_adev);
4476
1d721ed6
AG
4477 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4478 struct amdgpu_ring *ring = tmp_adev->rings[i];
4479
4480 if (!ring || !ring->sched.thread)
4481 continue;
4482
0b2d2c2e 4483 drm_sched_stop(&ring->sched, job ? &job->base : NULL);
7c6e68c7 4484
bb5c7235 4485 if (need_emergency_restart)
7c6e68c7 4486 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
1d721ed6
AG
4487 }
4488 }
4489
bb5c7235 4490 if (need_emergency_restart)
7c6e68c7
AG
4491 goto skip_sched_resume;
4492
1d721ed6
AG
4493 /*
4494 * Must check guilty signal here since after this point all old
4495 * HW fences are force signaled.
4496 *
4497 * job->base holds a reference to parent fence
4498 */
4499 if (job && job->base.s_fence->parent &&
7dd8c205 4500 dma_fence_is_signaled(job->base.s_fence->parent)) {
1d721ed6 4501 job_signaled = true;
1d721ed6
AG
4502 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
4503 goto skip_hw_reset;
4504 }
4505
26bc5340
AG
4506retry: /* Rest of adevs pre asic reset from XGMI hive. */
4507 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
26bc5340
AG
4508 r = amdgpu_device_pre_asic_reset(tmp_adev,
4509 NULL,
4510 &need_full_reset);
4511 /*TODO Should we stop ?*/
4512 if (r) {
aac89168 4513 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
4a580877 4514 r, adev_to_drm(tmp_adev)->unique);
26bc5340
AG
4515 tmp_adev->asic_reset_res = r;
4516 }
4517 }
4518
4519 /* Actual ASIC resets if needed.*/
4520 /* TODO Implement XGMI hive reset logic for SRIOV */
4521 if (amdgpu_sriov_vf(adev)) {
4522 r = amdgpu_device_reset_sriov(adev, job ? false : true);
4523 if (r)
4524 adev->asic_reset_res = r;
4525 } else {
7ac71382 4526 r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false);
26bc5340
AG
4527 if (r && r == -EAGAIN)
4528 goto retry;
4529 }
4530
1d721ed6
AG
4531skip_hw_reset:
4532
26bc5340
AG
4533 /* Post ASIC reset for all devs .*/
4534 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
7c6e68c7 4535
1d721ed6
AG
4536 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4537 struct amdgpu_ring *ring = tmp_adev->rings[i];
4538
4539 if (!ring || !ring->sched.thread)
4540 continue;
4541
			/* No point in resubmitting jobs if we didn't HW reset */
4543 if (!tmp_adev->asic_reset_res && !job_signaled)
4544 drm_sched_resubmit_jobs(&ring->sched);
4545
4546 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
4547 }
4548
4549 if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
4a580877 4550 drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
1d721ed6
AG
4551 }
4552
4553 tmp_adev->asic_reset_res = 0;
26bc5340
AG
4554
4555 if (r) {
4556 /* bad news, how to tell it to userspace ? */
12ffa55d 4557 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
26bc5340
AG
4558 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
4559 } else {
12ffa55d 4560 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
26bc5340 4561 }
7c6e68c7 4562 }
26bc5340 4563
7c6e68c7
AG
4564skip_sched_resume:
4565 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4566 /*unlock kfd: SRIOV would do it separately */
bb5c7235 4567 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
7c6e68c7 4568 amdgpu_amdkfd_post_reset(tmp_adev);
3f12acc8
EQ
4569 if (audio_suspended)
4570 amdgpu_device_resume_display_audio(tmp_adev);
26bc5340
AG
4571 amdgpu_device_unlock_adev(tmp_adev);
4572 }
4573
cbfd17f7 4574skip_recovery:
9e94d22c 4575 if (hive) {
53b3f8f4 4576 atomic_set(&hive->in_reset, 0);
9e94d22c 4577 mutex_unlock(&hive->hive_lock);
d95e8e97 4578 amdgpu_put_xgmi_hive(hive);
9e94d22c 4579 }
26bc5340
AG
4580
4581 if (r)
4582 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
d38ceaf9
AD
4583 return r;
4584}
4585
e3ecdffa
AD
/**
 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIE capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIE config space may not be available.
 */
5494d864 4595static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
d0dd7f0c 4596{
5d9a6330 4597 struct pci_dev *pdev;
c5313457
HK
4598 enum pci_bus_speed speed_cap, platform_speed_cap;
4599 enum pcie_link_width platform_link_width;
d0dd7f0c 4600
cd474ba0
AD
4601 if (amdgpu_pcie_gen_cap)
4602 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
d0dd7f0c 4603
cd474ba0
AD
4604 if (amdgpu_pcie_lane_cap)
4605 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
d0dd7f0c 4606
cd474ba0
AD
4607 /* covers APUs as well */
4608 if (pci_is_root_bus(adev->pdev->bus)) {
4609 if (adev->pm.pcie_gen_mask == 0)
4610 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
4611 if (adev->pm.pcie_mlw_mask == 0)
4612 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
d0dd7f0c 4613 return;
cd474ba0 4614 }
d0dd7f0c 4615
c5313457
HK
4616 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
4617 return;
4618
dbaa922b
AD
4619 pcie_bandwidth_available(adev->pdev, NULL,
4620 &platform_speed_cap, &platform_link_width);
c5313457 4621
cd474ba0 4622 if (adev->pm.pcie_gen_mask == 0) {
5d9a6330
AD
4623 /* asic caps */
4624 pdev = adev->pdev;
4625 speed_cap = pcie_get_speed_cap(pdev);
4626 if (speed_cap == PCI_SPEED_UNKNOWN) {
4627 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
cd474ba0
AD
4628 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4629 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
cd474ba0 4630 } else {
5d9a6330
AD
4631 if (speed_cap == PCIE_SPEED_16_0GT)
4632 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4633 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4634 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4635 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
4636 else if (speed_cap == PCIE_SPEED_8_0GT)
4637 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4638 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4639 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4640 else if (speed_cap == PCIE_SPEED_5_0GT)
4641 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4642 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
4643 else
4644 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
4645 }
4646 /* platform caps */
c5313457 4647 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
5d9a6330
AD
4648 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4649 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4650 } else {
c5313457 4651 if (platform_speed_cap == PCIE_SPEED_16_0GT)
5d9a6330
AD
4652 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4653 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4654 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4655 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
c5313457 4656 else if (platform_speed_cap == PCIE_SPEED_8_0GT)
5d9a6330
AD
4657 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4658 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4659 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
c5313457 4660 else if (platform_speed_cap == PCIE_SPEED_5_0GT)
5d9a6330
AD
4661 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4662 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4663 else
4664 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
4665
cd474ba0
AD
4666 }
4667 }
4668 if (adev->pm.pcie_mlw_mask == 0) {
c5313457 4669 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
5d9a6330
AD
4670 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
4671 } else {
c5313457 4672 switch (platform_link_width) {
5d9a6330 4673 case PCIE_LNK_X32:
cd474ba0
AD
4674 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
4675 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4676 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4677 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4678 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4679 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4680 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4681 break;
5d9a6330 4682 case PCIE_LNK_X16:
cd474ba0
AD
4683 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4684 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4685 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4686 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4687 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4688 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4689 break;
5d9a6330 4690 case PCIE_LNK_X12:
cd474ba0
AD
4691 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4692 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4693 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4694 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4695 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4696 break;
5d9a6330 4697 case PCIE_LNK_X8:
cd474ba0
AD
4698 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4699 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4700 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4701 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4702 break;
5d9a6330 4703 case PCIE_LNK_X4:
cd474ba0
AD
4704 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4705 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4706 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4707 break;
5d9a6330 4708 case PCIE_LNK_X2:
cd474ba0
AD
4709 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4710 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4711 break;
5d9a6330 4712 case PCIE_LNK_X1:
cd474ba0
AD
4713 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
4714 break;
4715 default:
4716 break;
4717 }
d0dd7f0c
AD
4718 }
4719 }
4720}
d38ceaf9 4721
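/*
 * BACO ("Bus Active, Chip Off") keeps the PCIe bus alive while the GPU core
 * is powered down; doorbell interrupts are masked around it when RAS is
 * supported.
 */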
361dbd01
AD
4722int amdgpu_device_baco_enter(struct drm_device *dev)
4723{
1348969a 4724 struct amdgpu_device *adev = drm_to_adev(dev);
7a22677b 4725 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
361dbd01 4726
4a580877 4727 if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
361dbd01
AD
4728 return -ENOTSUPP;
4729
7a22677b
LM
4730 if (ras && ras->supported)
4731 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
4732
9530273e 4733 return amdgpu_dpm_baco_enter(adev);
361dbd01
AD
4734}
4735
4736int amdgpu_device_baco_exit(struct drm_device *dev)
4737{
1348969a 4738 struct amdgpu_device *adev = drm_to_adev(dev);
7a22677b 4739 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
9530273e 4740 int ret = 0;
361dbd01 4741
4a580877 4742 if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
361dbd01
AD
4743 return -ENOTSUPP;
4744
9530273e
EQ
4745 ret = amdgpu_dpm_baco_exit(adev);
4746 if (ret)
4747 return ret;
7a22677b
LM
4748
4749 if (ras && ras->supported)
4750 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
4751
4752 return 0;
361dbd01 4753}
c9a6b82f 4754
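/* Cancel the pending timeout (TDR) work on every scheduler ring and wait
 * for any handler that is already running to finish. */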
acd89fca
AG
4755static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev)
4756{
4757 int i;
4758
4759 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4760 struct amdgpu_ring *ring = adev->rings[i];
4761
4762 if (!ring || !ring->sched.thread)
4763 continue;
4764
4765 cancel_delayed_work_sync(&ring->sched.work_tdr);
4766 }
4767}
4768
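/*
 * PCI error recovery callbacks below are invoked by the PCI core in this
 * order: error_detected -> (mmio_enabled) -> slot_reset -> resume.
 */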
c9a6b82f
AG
4769/**
4770 * amdgpu_pci_error_detected - Called when a PCI error is detected.
4771 * @pdev: PCI device struct
4772 * @state: PCI channel state
4773 *
4774 * Description: Called when a PCI error is detected.
4775 *
4776 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
4777 */
4778pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
4779{
4780 struct drm_device *dev = pci_get_drvdata(pdev);
4781 struct amdgpu_device *adev = drm_to_adev(dev);
acd89fca 4782 int i;
c9a6b82f
AG
4783
4784 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
4785
4786 switch (state) {
4787 case pci_channel_io_normal:
4788 return PCI_ERS_RESULT_CAN_RECOVER;
acd89fca
AG
4789 /* Fatal error, prepare for slot reset */
4790 case pci_channel_io_frozen:
		/*
		 * If we fail to set adev->in_gpu_reset in
		 * amdgpu_device_lock_adev, cancel and wait for all TDRs
		 * in progress.
		 *
		 * Holding adev->reset_sem will prevent any external access
		 * to the GPU during PCI error recovery.
		 */
4798 while (!amdgpu_device_lock_adev(adev, NULL))
4799 amdgpu_cancel_all_tdr(adev);
4800
4801 /*
4802 * Block any work scheduling as we do for regular GPU reset
4803 * for the duration of the recovery
4804 */
4805 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4806 struct amdgpu_ring *ring = adev->rings[i];
4807
4808 if (!ring || !ring->sched.thread)
4809 continue;
4810
4811 drm_sched_stop(&ring->sched, NULL);
4812 }
c9a6b82f
AG
4813 return PCI_ERS_RESULT_NEED_RESET;
4814 case pci_channel_io_perm_failure:
4815 /* Permanent error, prepare for device removal */
4816 return PCI_ERS_RESULT_DISCONNECT;
4817 }
4818
4819 return PCI_ERS_RESULT_NEED_RESET;
4820}
4821
4822/**
4823 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
4824 * @pdev: pointer to PCI device
4825 */
4826pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
4827{
4828
4829 DRM_INFO("PCI error: mmio enabled callback!!\n");
4830
4831 /* TODO - dump whatever for debugging purposes */
4832
	/* This is called only if amdgpu_pci_error_detected returns
	 * PCI_ERS_RESULT_CAN_RECOVER. Reads/writes to the device still
	 * work, so there is no need to reset the slot.
	 */
4837
4838 return PCI_ERS_RESULT_RECOVERED;
4839}
4840
4841/**
4842 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
4843 * @pdev: PCI device struct
4844 *
4845 * Description: This routine is called by the pci error recovery
4846 * code after the PCI slot has been reset, just before we
4847 * should resume normal operations.
4848 */
4849pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
4850{
4851 struct drm_device *dev = pci_get_drvdata(pdev);
4852 struct amdgpu_device *adev = drm_to_adev(dev);
362c7b91 4853 int r, i;
7ac71382 4854 bool need_full_reset = true;
362c7b91 4855 u32 memsize;
7ac71382 4856 struct list_head device_list;
c9a6b82f
AG
4857
4858 DRM_INFO("PCI error: slot reset callback!!\n");
4859
7ac71382
AG
4860 INIT_LIST_HEAD(&device_list);
4861 list_add_tail(&adev->gmc.xgmi.head, &device_list);
4862
362c7b91
AG
4863 /* wait for asic to come out of reset */
4864 msleep(500);
4865
7ac71382 4866 /* Restore PCI confspace */
c1dd4aa6 4867 amdgpu_device_load_pci_state(pdev);
c9a6b82f 4868
362c7b91
AG
4869 /* confirm ASIC came out of reset */
4870 for (i = 0; i < adev->usec_timeout; i++) {
4871 memsize = amdgpu_asic_get_config_memsize(adev);
4872
4873 if (memsize != 0xffffffff)
4874 break;
4875 udelay(1);
4876 }
4877 if (memsize == 0xffffffff) {
4878 r = -ETIME;
4879 goto out;
4880 }
4881
362c7b91 4882 adev->in_pci_err_recovery = true;
7ac71382 4883 r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset);
bf36b52e 4884 adev->in_pci_err_recovery = false;
c9a6b82f
AG
4885 if (r)
4886 goto out;
4887
7ac71382 4888 r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true);
c9a6b82f
AG
4889
4890out:
c9a6b82f 4891 if (!r) {
c1dd4aa6
AG
4892 if (amdgpu_device_cache_pci_state(adev->pdev))
4893 pci_restore_state(adev->pdev);
4894
c9a6b82f
AG
4895 DRM_INFO("PCIe error recovery succeeded\n");
4896 } else {
4897 DRM_ERROR("PCIe error recovery failed, err:%d", r);
4898 amdgpu_device_unlock_adev(adev);
4899 }
4900
4901 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
4902}
4903
/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that it is OK to
 * resume normal operation. Restart the schedulers so halted GPU
 * work can resume.
 */
4912void amdgpu_pci_resume(struct pci_dev *pdev)
4913{
4914 struct drm_device *dev = pci_get_drvdata(pdev);
4915 struct amdgpu_device *adev = drm_to_adev(dev);
acd89fca 4916 int i;
c9a6b82f 4917
c9a6b82f
AG
4918
4919 DRM_INFO("PCI error: resume callback!!\n");
acd89fca
AG
4920
4921 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4922 struct amdgpu_ring *ring = adev->rings[i];
4923
4924 if (!ring || !ring->sched.thread)
4925 continue;
4926
4927
4928 drm_sched_resubmit_jobs(&ring->sched);
4929 drm_sched_start(&ring->sched, true);
4930 }
4931
4932 amdgpu_device_unlock_adev(adev);
c9a6b82f 4933}
c1dd4aa6
AG
4934
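/* Save the current PCI config space so it can be restored after a sudden
 * PCI error (see amdgpu_device_load_pci_state() and amdgpu_pci_slot_reset()). */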
4935bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
4936{
4937 struct drm_device *dev = pci_get_drvdata(pdev);
4938 struct amdgpu_device *adev = drm_to_adev(dev);
4939 int r;
4940
4941 r = pci_save_state(pdev);
4942 if (!r) {
4943 kfree(adev->pci_state);
4944
4945 adev->pci_state = pci_store_saved_state(pdev);
4946
4947 if (!adev->pci_state) {
4948 DRM_ERROR("Failed to store PCI saved state");
4949 return false;
4950 }
4951 } else {
4952 DRM_WARN("Failed to save PCI state, err:%d\n", r);
4953 return false;
4954 }
4955
4956 return true;
4957}
4958
4959bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
4960{
4961 struct drm_device *dev = pci_get_drvdata(pdev);
4962 struct amdgpu_device *adev = drm_to_adev(dev);
4963 int r;
4964
4965 if (!adev->pci_state)
4966 return false;
4967
4968 r = pci_load_saved_state(pdev, adev->pci_state);
4969
4970 if (!r) {
4971 pci_restore_state(pdev);
4972 } else {
4973 DRM_WARN("Failed to load PCI state, err:%d\n", r);
4974 return false;
4975 }
4976
4977 return true;
4978}
4979
4980