drivers/gpu/drm/amd/amdgpu/amdgpu_device.c [linux-2.6-block.git]
d38ceaf9
AD
1/*
2 * Copyright 2008 Advanced Micro Devices, Inc.
3 * Copyright 2008 Red Hat Inc.
4 * Copyright 2009 Jerome Glisse.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 * OTHER DEALINGS IN THE SOFTWARE.
23 *
24 * Authors: Dave Airlie
25 * Alex Deucher
26 * Jerome Glisse
27 */
b1ddf548 28#include <linux/power_supply.h>
0875dc9e 29#include <linux/kthread.h>
fdf2f6c5 30#include <linux/module.h>
d38ceaf9
AD
31#include <linux/console.h>
32#include <linux/slab.h>
fdf2f6c5 33
4562236b 34#include <drm/drm_atomic_helper.h>
fcd70cd3 35#include <drm/drm_probe_helper.h>
d38ceaf9
AD
36#include <drm/amdgpu_drm.h>
37#include <linux/vgaarb.h>
38#include <linux/vga_switcheroo.h>
39#include <linux/efi.h>
40#include "amdgpu.h"
f4b373f4 41#include "amdgpu_trace.h"
d38ceaf9
AD
42#include "amdgpu_i2c.h"
43#include "atom.h"
44#include "amdgpu_atombios.h"
a5bde2f9 45#include "amdgpu_atomfirmware.h"
d0dd7f0c 46#include "amd_pcie.h"
33f34802
KW
47#ifdef CONFIG_DRM_AMDGPU_SI
48#include "si.h"
49#endif
a2e73f56
AD
50#ifdef CONFIG_DRM_AMDGPU_CIK
51#include "cik.h"
52#endif
aaa36a97 53#include "vi.h"
460826e6 54#include "soc15.h"
0a5b8c7b 55#include "nv.h"
d38ceaf9 56#include "bif/bif_4_1_d.h"
9accf2fd 57#include <linux/pci.h>
bec86378 58#include <linux/firmware.h>
89041940 59#include "amdgpu_vf_error.h"
d38ceaf9 60
ba997709 61#include "amdgpu_amdkfd.h"
d2f52ac8 62#include "amdgpu_pm.h"
d38ceaf9 63
5183411b 64#include "amdgpu_xgmi.h"
c030f2e4 65#include "amdgpu_ras.h"
9c7c85f7 66#include "amdgpu_pmu.h"
bd607166 67#include "amdgpu_fru_eeprom.h"
04442bf7 68#include "amdgpu_reset.h"
5183411b 69
d5ea093e 70#include <linux/suspend.h>
c6a6e2db 71#include <drm/task_barrier.h>
3f12acc8 72#include <linux/pm_runtime.h>
d5ea093e 73
f89f8c6b
AG
74#include <drm/drm_drv.h>
75
e2a75f88 76MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
3f76dced 77MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
2d2e5e7e 78MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
ad5a67a7 79MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
54c4d17e 80MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
65e60f6e 81MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
b51a26a0 82MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
23c6268e 83MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
ed42cfe1 84MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
42b325e5 85MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
4e52a9f8 86MODULE_FIRMWARE("amdgpu/vangogh_gpu_info.bin");
e2a75f88 87
2dc80b00
S
88#define AMDGPU_RESUME_MS 2000
89
050091ab 90const char *amdgpu_asic_name[] = {
da69c161
KW
91 "TAHITI",
92 "PITCAIRN",
93 "VERDE",
94 "OLAND",
95 "HAINAN",
d38ceaf9
AD
96 "BONAIRE",
97 "KAVERI",
98 "KABINI",
99 "HAWAII",
100 "MULLINS",
101 "TOPAZ",
102 "TONGA",
48299f95 103 "FIJI",
d38ceaf9 104 "CARRIZO",
139f4917 105 "STONEY",
2cc0c0b5
FC
106 "POLARIS10",
107 "POLARIS11",
c4642a47 108 "POLARIS12",
48ff108d 109 "VEGAM",
d4196f01 110 "VEGA10",
8fab806a 111 "VEGA12",
956fcddc 112 "VEGA20",
2ca8a5d2 113 "RAVEN",
d6c3b24e 114 "ARCTURUS",
1eee4228 115 "RENOIR",
d46b417a 116 "ALDEBARAN",
852a6626 117 "NAVI10",
87dbad02 118 "NAVI14",
9802f5d7 119 "NAVI12",
ccaf72d3 120 "SIENNA_CICHLID",
ddd8fbe7 121 "NAVY_FLOUNDER",
4f1e9a76 122 "VANGOGH",
a2468e04 123 "DIMGREY_CAVEFISH",
6f169591 124 "BEIGE_GOBY",
d38ceaf9
AD
125 "LAST",
126};
127
dcea6e65
KR
128/**
129 * DOC: pcie_replay_count
130 *
131 * The amdgpu driver provides a sysfs API for reporting the total number
132 * of PCIe replays (NAKs)
133 * The file pcie_replay_count is used for this and returns the total
134 * number of replays as a sum of the NAKs generated and NAKs received
135 */
136
137static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
138 struct device_attribute *attr, char *buf)
139{
140 struct drm_device *ddev = dev_get_drvdata(dev);
1348969a 141 struct amdgpu_device *adev = drm_to_adev(ddev);
dcea6e65
KR
142 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
143
36000c7a 144 return sysfs_emit(buf, "%llu\n", cnt);
dcea6e65
KR
145}
146
147static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
148 amdgpu_device_get_pcie_replay_count, NULL);
149
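/*
 * Usage sketch (not part of the driver): the attribute is exposed under the
 * PCI device in sysfs, so assuming the GPU is enumerated as card0 it can be
 * read as:
 *
 *	/sys/class/drm/card0/device/pcie_replay_count
 *
 * The read returns the accumulated replay count as a decimal string.
 */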
5494d864
AD
150static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
151
bd607166
KR
152/**
153 * DOC: product_name
154 *
155 * The amdgpu driver provides a sysfs API for reporting the product name
156 * for the device
157 * The file product_name is used for this and returns the product name
158 * as returned from the FRU.
159 * NOTE: This is only available for certain server cards
160 */
161
162static ssize_t amdgpu_device_get_product_name(struct device *dev,
163 struct device_attribute *attr, char *buf)
164{
165 struct drm_device *ddev = dev_get_drvdata(dev);
1348969a 166 struct amdgpu_device *adev = drm_to_adev(ddev);
bd607166 167
36000c7a 168 return sysfs_emit(buf, "%s\n", adev->product_name);
bd607166
KR
169}
170
171static DEVICE_ATTR(product_name, S_IRUGO,
172 amdgpu_device_get_product_name, NULL);
173
174/**
175 * DOC: product_number
176 *
177 * The amdgpu driver provides a sysfs API for reporting the part number
178 * for the device
179 * The file product_number is used for this and returns the part number
180 * as returned from the FRU.
181 * NOTE: This is only available for certain server cards
182 */
183
184static ssize_t amdgpu_device_get_product_number(struct device *dev,
185 struct device_attribute *attr, char *buf)
186{
187 struct drm_device *ddev = dev_get_drvdata(dev);
1348969a 188 struct amdgpu_device *adev = drm_to_adev(ddev);
bd607166 189
36000c7a 190 return sysfs_emit(buf, "%s\n", adev->product_number);
bd607166
KR
191}
192
193static DEVICE_ATTR(product_number, S_IRUGO,
194 amdgpu_device_get_product_number, NULL);
195
196/**
197 * DOC: serial_number
198 *
199 * The amdgpu driver provides a sysfs API for reporting the serial number
200 * for the device
201 * The file serial_number is used for this and returns the serial number
202 * as returned from the FRU.
203 * NOTE: This is only available for certain server cards
204 */
205
206static ssize_t amdgpu_device_get_serial_number(struct device *dev,
207 struct device_attribute *attr, char *buf)
208{
209 struct drm_device *ddev = dev_get_drvdata(dev);
1348969a 210 struct amdgpu_device *adev = drm_to_adev(ddev);
bd607166 211
36000c7a 212 return sysfs_emit(buf, "%s\n", adev->serial);
bd607166
KR
213}
214
215static DEVICE_ATTR(serial_number, S_IRUGO,
216 amdgpu_device_get_serial_number, NULL);
217
fd496ca8 218/**
b98c6299 219 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
fd496ca8
AD
220 *
221 * @dev: drm_device pointer
222 *
b98c6299 223 * Returns true if the device is a dGPU with ATPX power control,
fd496ca8
AD
224 * otherwise returns false.
225 */
b98c6299 226bool amdgpu_device_supports_px(struct drm_device *dev)
fd496ca8
AD
227{
228 struct amdgpu_device *adev = drm_to_adev(dev);
229
b98c6299 230 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
fd496ca8
AD
231 return true;
232 return false;
233}
234
e3ecdffa 235/**
0330b848 236 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
e3ecdffa
AD
237 *
238 * @dev: drm_device pointer
239 *
b98c6299 240 * Returns true if the device is a dGPU with ACPI power control,
e3ecdffa
AD
241 * otherwise returns false.
242 */
31af062a 243bool amdgpu_device_supports_boco(struct drm_device *dev)
d38ceaf9 244{
1348969a 245 struct amdgpu_device *adev = drm_to_adev(dev);
d38ceaf9 246
b98c6299
AD
247 if (adev->has_pr3 ||
248 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
d38ceaf9
AD
249 return true;
250 return false;
251}
252
a69cba42
AD
253/**
254 * amdgpu_device_supports_baco - Does the device support BACO
255 *
256 * @dev: drm_device pointer
257 *
258 * Returns true if the device supports BACO,
259 * otherwise returns false.
260 */
261bool amdgpu_device_supports_baco(struct drm_device *dev)
262{
1348969a 263 struct amdgpu_device *adev = drm_to_adev(dev);
a69cba42
AD
264
265 return amdgpu_asic_supports_baco(adev);
266}
267
6e3cd2a9
MCC
268/*
269 * VRAM access helper functions
270 */
271
e35e2b11 272/**
e35e2b11
TY
273 * amdgpu_device_vram_access - read/write a buffer in vram
274 *
275 * @adev: amdgpu_device pointer
276 * @pos: offset of the buffer in vram
277 * @buf: virtual address of the buffer in system memory
278 * @size: read/write size, sizeof(@buf) must be >= @size
279 * @write: true - write to vram, otherwise - read from vram
280 */
281void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
282 uint32_t *buf, size_t size, bool write)
283{
e35e2b11 284 unsigned long flags;
ce05ac56
CK
285 uint32_t hi = ~0;
286 uint64_t last;
f89f8c6b 287 int idx;
ce05ac56 288
f89f8c6b
AG
289 if (!drm_dev_enter(&adev->ddev, &idx))
290 return;
9d11eb0d
CK
291
292#ifdef CONFIG_64BIT
293 last = min(pos + size, adev->gmc.visible_vram_size);
294 if (last > pos) {
295 void __iomem *addr = adev->mman.aper_base_kaddr + pos;
296 size_t count = last - pos;
297
298 if (write) {
299 memcpy_toio(addr, buf, count);
300 mb();
301 amdgpu_asic_flush_hdp(adev, NULL);
302 } else {
303 amdgpu_asic_invalidate_hdp(adev, NULL);
304 mb();
305 memcpy_fromio(buf, addr, count);
306 }
307
308 if (count == size)
f89f8c6b 309 goto exit;
9d11eb0d
CK
310
311 pos += count;
312 buf += count / 4;
313 size -= count;
314 }
315#endif
316
ce05ac56
CK
317 spin_lock_irqsave(&adev->mmio_idx_lock, flags);
318 for (last = pos + size; pos < last; pos += 4) {
319 uint32_t tmp = pos >> 31;
e35e2b11 320
e35e2b11 321 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
ce05ac56
CK
322 if (tmp != hi) {
323 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
324 hi = tmp;
325 }
e35e2b11
TY
326 if (write)
327 WREG32_NO_KIQ(mmMM_DATA, *buf++);
328 else
329 *buf++ = RREG32_NO_KIQ(mmMM_DATA);
e35e2b11 330 }
ce05ac56 331 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
f89f8c6b
AG
332
333exit:
334 drm_dev_exit(idx);
e35e2b11
TY
335}
336
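/*
 * Usage sketch (illustrative only): copy the first 16 bytes of VRAM into a
 * local buffer. @buf is always a dword pointer and @size is in bytes.
 *
 *	uint32_t data[4];
 *
 *	amdgpu_device_vram_access(adev, 0, data, sizeof(data), false);
 */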
d38ceaf9 337/*
f7ee1874 338 * register access helper functions.
d38ceaf9 339 */
56b53c0b
DL
340
341/* Check if hw access should be skipped because of hotplug or device error */
342bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
343{
344 if (adev->in_pci_err_recovery)
345 return true;
346
347#ifdef CONFIG_LOCKDEP
348 /*
349 * This is a bit complicated to understand, so worth a comment. What we assert
350 * here is that the GPU reset is not running on another thread in parallel.
351 *
352 * For this we trylock the read side of the reset semaphore, if that succeeds
353 * we know that the reset is not running in parallel.
354 *
355 * If the trylock fails we assert that we are either already holding the read
356 * side of the lock or are the reset thread itself and hold the write side of
357 * the lock.
358 */
359 if (in_task()) {
360 if (down_read_trylock(&adev->reset_sem))
361 up_read(&adev->reset_sem);
362 else
363 lockdep_assert_held(&adev->reset_sem);
364 }
365#endif
366 return false;
367}
368
e3ecdffa 369/**
f7ee1874 370 * amdgpu_device_rreg - read a memory mapped IO or indirect register
e3ecdffa
AD
371 *
372 * @adev: amdgpu_device pointer
373 * @reg: dword aligned register offset
374 * @acc_flags: access flags which require special behavior
375 *
376 * Returns the 32 bit value from the offset specified.
377 */
f7ee1874
HZ
378uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
379 uint32_t reg, uint32_t acc_flags)
d38ceaf9 380{
f4b373f4
TSD
381 uint32_t ret;
382
56b53c0b 383 if (amdgpu_device_skip_hw_access(adev))
bf36b52e
AG
384 return 0;
385
f7ee1874
HZ
386 if ((reg * 4) < adev->rmmio_size) {
387 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
388 amdgpu_sriov_runtime(adev) &&
389 down_read_trylock(&adev->reset_sem)) {
390 ret = amdgpu_kiq_rreg(adev, reg);
391 up_read(&adev->reset_sem);
392 } else {
393 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
394 }
395 } else {
396 ret = adev->pcie_rreg(adev, reg * 4);
81202807 397 }
bc992ba5 398
f7ee1874 399 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
e78b579d 400
f4b373f4 401 return ret;
d38ceaf9
AD
402}
403
421a2a30
ML
404/*
405 * MMIO register read helper function using a byte offset
406 * @offset: byte offset from MMIO start
407 *
408 */
409
e3ecdffa
AD
410/**
411 * amdgpu_mm_rreg8 - read a memory mapped IO register
412 *
413 * @adev: amdgpu_device pointer
414 * @offset: byte aligned register offset
415 *
416 * Returns the 8 bit value from the offset specified.
417 */
7cbbc745
AG
418uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
419{
56b53c0b 420 if (amdgpu_device_skip_hw_access(adev))
bf36b52e
AG
421 return 0;
422
421a2a30
ML
423 if (offset < adev->rmmio_size)
424 return (readb(adev->rmmio + offset));
425 BUG();
426}
427
428/*
429 * MMIO register write helper function using a byte offset
430 * @offset: byte offset from MMIO start
431 * @value: the value to be written to the register
432 *
433 */
e3ecdffa
AD
434/**
435 * amdgpu_mm_wreg8 - write a memory mapped IO register
436 *
437 * @adev: amdgpu_device pointer
438 * @offset: byte aligned register offset
439 * @value: 8 bit value to write
440 *
441 * Writes the value specified to the offset specified.
442 */
7cbbc745
AG
443void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
444{
56b53c0b 445 if (amdgpu_device_skip_hw_access(adev))
bf36b52e
AG
446 return;
447
421a2a30
ML
448 if (offset < adev->rmmio_size)
449 writeb(value, adev->rmmio + offset);
450 else
451 BUG();
452}
453
e3ecdffa 454/**
f7ee1874 455 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
e3ecdffa
AD
456 *
457 * @adev: amdgpu_device pointer
458 * @reg: dword aligned register offset
459 * @v: 32 bit value to write to the register
460 * @acc_flags: access flags which require special behavior
461 *
462 * Writes the value specified to the offset specified.
463 */
f7ee1874
HZ
464void amdgpu_device_wreg(struct amdgpu_device *adev,
465 uint32_t reg, uint32_t v,
466 uint32_t acc_flags)
d38ceaf9 467{
56b53c0b 468 if (amdgpu_device_skip_hw_access(adev))
bf36b52e
AG
469 return;
470
f7ee1874
HZ
471 if ((reg * 4) < adev->rmmio_size) {
472 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
473 amdgpu_sriov_runtime(adev) &&
474 down_read_trylock(&adev->reset_sem)) {
475 amdgpu_kiq_wreg(adev, reg, v);
476 up_read(&adev->reset_sem);
477 } else {
478 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
479 }
480 } else {
481 adev->pcie_wreg(adev, reg * 4, v);
81202807 482 }
bc992ba5 483
f7ee1874 484 trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
2e0cc4d4 485}
d38ceaf9 486
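/*
 * Note: most callers do not use these helpers directly; the RREG32()/WREG32()
 * family of macros in amdgpu.h expands to amdgpu_device_rreg()/
 * amdgpu_device_wreg() with the appropriate access flags, e.g.
 * (illustrative only):
 *
 *	tmp = RREG32(reg);
 *	tmp |= mask;
 *	WREG32(reg, tmp);
 */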
2e0cc4d4
ML
487/*
488 * amdgpu_mm_wreg_mmio_rlc - write register either with mmio or with RLC path if in range
489 *
490 * this function is invoked only for debugfs register access
491 */
f7ee1874
HZ
492void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
493 uint32_t reg, uint32_t v)
2e0cc4d4 494{
56b53c0b 495 if (amdgpu_device_skip_hw_access(adev))
bf36b52e
AG
496 return;
497
2e0cc4d4 498 if (amdgpu_sriov_fullaccess(adev) &&
f7ee1874
HZ
499 adev->gfx.rlc.funcs &&
500 adev->gfx.rlc.funcs->is_rlcg_access_range) {
2e0cc4d4 501 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
5e025531 502 return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v, 0);
f7ee1874
HZ
503 } else {
504 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
47ed4e1c 505 }
d38ceaf9
AD
506}
507
d38ceaf9
AD
508/**
509 * amdgpu_mm_rdoorbell - read a doorbell dword
510 *
511 * @adev: amdgpu_device pointer
512 * @index: doorbell index
513 *
514 * Returns the value in the doorbell aperture at the
515 * requested doorbell index (CIK).
516 */
517u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
518{
56b53c0b 519 if (amdgpu_device_skip_hw_access(adev))
bf36b52e
AG
520 return 0;
521
d38ceaf9
AD
522 if (index < adev->doorbell.num_doorbells) {
523 return readl(adev->doorbell.ptr + index);
524 } else {
525 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
526 return 0;
527 }
528}
529
530/**
531 * amdgpu_mm_wdoorbell - write a doorbell dword
532 *
533 * @adev: amdgpu_device pointer
534 * @index: doorbell index
535 * @v: value to write
536 *
537 * Writes @v to the doorbell aperture at the
538 * requested doorbell index (CIK).
539 */
540void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
541{
56b53c0b 542 if (amdgpu_device_skip_hw_access(adev))
bf36b52e
AG
543 return;
544
d38ceaf9
AD
545 if (index < adev->doorbell.num_doorbells) {
546 writel(v, adev->doorbell.ptr + index);
547 } else {
548 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
549 }
550}
551
832be404
KW
552/**
553 * amdgpu_mm_rdoorbell64 - read a doorbell Qword
554 *
555 * @adev: amdgpu_device pointer
556 * @index: doorbell index
557 *
558 * Returns the value in the doorbell aperture at the
559 * requested doorbell index (VEGA10+).
560 */
561u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
562{
56b53c0b 563 if (amdgpu_device_skip_hw_access(adev))
bf36b52e
AG
564 return 0;
565
832be404
KW
566 if (index < adev->doorbell.num_doorbells) {
567 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
568 } else {
569 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
570 return 0;
571 }
572}
573
574/**
575 * amdgpu_mm_wdoorbell64 - write a doorbell Qword
576 *
577 * @adev: amdgpu_device pointer
578 * @index: doorbell index
579 * @v: value to write
580 *
581 * Writes @v to the doorbell aperture at the
582 * requested doorbell index (VEGA10+).
583 */
584void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
585{
56b53c0b 586 if (amdgpu_device_skip_hw_access(adev))
bf36b52e
AG
587 return;
588
832be404
KW
589 if (index < adev->doorbell.num_doorbells) {
590 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
591 } else {
592 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
593 }
594}
595
1bba3683
HZ
596/**
597 * amdgpu_device_indirect_rreg - read an indirect register
598 *
599 * @adev: amdgpu_device pointer
600 * @pcie_index: mmio register offset
601 * @pcie_data: mmio register offset
22f453fb 602 * @reg_addr: indirect register address to read from
1bba3683
HZ
603 *
604 * Returns the value of indirect register @reg_addr
605 */
606u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
607 u32 pcie_index, u32 pcie_data,
608 u32 reg_addr)
609{
610 unsigned long flags;
611 u32 r;
612 void __iomem *pcie_index_offset;
613 void __iomem *pcie_data_offset;
614
615 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
616 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
617 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
618
619 writel(reg_addr, pcie_index_offset);
620 readl(pcie_index_offset);
621 r = readl(pcie_data_offset);
622 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
623
624 return r;
625}
626
627/**
628 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
629 *
630 * @adev: amdgpu_device pointer
631 * @pcie_index: mmio register offset
632 * @pcie_data: mmio register offset
22f453fb 633 * @reg_addr: indirect register address to read from
1bba3683
HZ
634 *
635 * Returns the value of indirect register @reg_addr
636 */
637u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
638 u32 pcie_index, u32 pcie_data,
639 u32 reg_addr)
640{
641 unsigned long flags;
642 u64 r;
643 void __iomem *pcie_index_offset;
644 void __iomem *pcie_data_offset;
645
646 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
647 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
648 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
649
650 /* read low 32 bits */
651 writel(reg_addr, pcie_index_offset);
652 readl(pcie_index_offset);
653 r = readl(pcie_data_offset);
654 /* read high 32 bits */
655 writel(reg_addr + 4, pcie_index_offset);
656 readl(pcie_index_offset);
657 r |= ((u64)readl(pcie_data_offset) << 32);
658 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
659
660 return r;
661}
662
663/**
664 * amdgpu_device_indirect_wreg - write an indirect register address
665 *
666 * @adev: amdgpu_device pointer
667 * @pcie_index: mmio register offset
668 * @pcie_data: mmio register offset
669 * @reg_addr: indirect register offset
670 * @reg_data: indirect register data
671 *
672 */
673void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
674 u32 pcie_index, u32 pcie_data,
675 u32 reg_addr, u32 reg_data)
676{
677 unsigned long flags;
678 void __iomem *pcie_index_offset;
679 void __iomem *pcie_data_offset;
680
681 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
682 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
683 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
684
685 writel(reg_addr, pcie_index_offset);
686 readl(pcie_index_offset);
687 writel(reg_data, pcie_data_offset);
688 readl(pcie_data_offset);
689 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
690}
691
692/**
693 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
694 *
695 * @adev: amdgpu_device pointer
696 * @pcie_index: mmio register offset
697 * @pcie_data: mmio register offset
698 * @reg_addr: indirect register offset
699 * @reg_data: indirect register data
700 *
701 */
702void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
703 u32 pcie_index, u32 pcie_data,
704 u32 reg_addr, u64 reg_data)
705{
706 unsigned long flags;
707 void __iomem *pcie_index_offset;
708 void __iomem *pcie_data_offset;
709
710 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
711 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
712 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
713
714 /* write low 32 bits */
715 writel(reg_addr, pcie_index_offset);
716 readl(pcie_index_offset);
717 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
718 readl(pcie_data_offset);
719 /* write high 32 bits */
720 writel(reg_addr + 4, pcie_index_offset);
721 readl(pcie_index_offset);
722 writel((u32)(reg_data >> 32), pcie_data_offset);
723 readl(pcie_data_offset);
724 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
725}
726
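/*
 * Usage sketch: an ASIC's pcie_rreg/pcie_wreg callbacks typically wrap these
 * helpers with the chip's index/data register pair. The register names below
 * are purely illustrative, not a real ASIC definition:
 *
 *	static u32 example_pcie_rreg(struct amdgpu_device *adev, u32 reg)
 *	{
 *		return amdgpu_device_indirect_rreg(adev, mmEXAMPLE_INDEX,
 *						   mmEXAMPLE_DATA, reg);
 *	}
 */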
d38ceaf9
AD
727/**
728 * amdgpu_invalid_rreg - dummy reg read function
729 *
982a820b 730 * @adev: amdgpu_device pointer
d38ceaf9
AD
731 * @reg: offset of register
732 *
733 * Dummy register read function. Used for register blocks
734 * that certain asics don't have (all asics).
735 * Returns the value in the register.
736 */
737static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
738{
739 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
740 BUG();
741 return 0;
742}
743
744/**
745 * amdgpu_invalid_wreg - dummy reg write function
746 *
982a820b 747 * @adev: amdgpu_device pointer
d38ceaf9
AD
748 * @reg: offset of register
749 * @v: value to write to the register
750 *
751 * Dummy register write function. Used for register blocks
752 * that certain asics don't have (all asics).
753 */
754static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
755{
756 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
757 reg, v);
758 BUG();
759}
760
4fa1c6a6
TZ
761/**
762 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
763 *
982a820b 764 * @adev: amdgpu_device pointer
4fa1c6a6
TZ
765 * @reg: offset of register
766 *
767 * Dummy register read function. Used for register blocks
768 * that certain asics don't have (all asics).
769 * Returns the value in the register.
770 */
771static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
772{
773 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
774 BUG();
775 return 0;
776}
777
778/**
779 * amdgpu_invalid_wreg64 - dummy reg write function
780 *
982a820b 781 * @adev: amdgpu_device pointer
4fa1c6a6
TZ
782 * @reg: offset of register
783 * @v: value to write to the register
784 *
785 * Dummy register write function. Used for register blocks
786 * that certain asics don't have (all asics).
787 */
788static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
789{
790 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
791 reg, v);
792 BUG();
793}
794
d38ceaf9
AD
795/**
796 * amdgpu_block_invalid_rreg - dummy reg read function
797 *
982a820b 798 * @adev: amdgpu_device pointer
d38ceaf9
AD
799 * @block: offset of instance
800 * @reg: offset of register
801 *
802 * Dummy register read function. Used for register blocks
803 * that certain asics don't have (all asics).
804 * Returns the value in the register.
805 */
806static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
807 uint32_t block, uint32_t reg)
808{
809 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
810 reg, block);
811 BUG();
812 return 0;
813}
814
815/**
816 * amdgpu_block_invalid_wreg - dummy reg write function
817 *
982a820b 818 * @adev: amdgpu_device pointer
d38ceaf9
AD
819 * @block: offset of instance
820 * @reg: offset of register
821 * @v: value to write to the register
822 *
823 * Dummy register write function. Used for register blocks
824 * that certain asics don't have (all asics).
825 */
826static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
827 uint32_t block,
828 uint32_t reg, uint32_t v)
829{
830 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
831 reg, block, v);
832 BUG();
833}
834
4d2997ab
AD
835/**
836 * amdgpu_device_asic_init - Wrapper for atom asic_init
837 *
982a820b 838 * @adev: amdgpu_device pointer
4d2997ab
AD
839 *
840 * Does any asic specific work and then calls atom asic init.
841 */
842static int amdgpu_device_asic_init(struct amdgpu_device *adev)
843{
844 amdgpu_asic_pre_asic_init(adev);
845
846 return amdgpu_atom_asic_init(adev->mode_info.atom_context);
847}
848
e3ecdffa
AD
849/**
850 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
851 *
982a820b 852 * @adev: amdgpu_device pointer
e3ecdffa
AD
853 *
854 * Allocates a scratch page of VRAM for use by various things in the
855 * driver.
856 */
06ec9070 857static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
d38ceaf9 858{
a4a02777
CK
859 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
860 PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
861 &adev->vram_scratch.robj,
862 &adev->vram_scratch.gpu_addr,
863 (void **)&adev->vram_scratch.ptr);
d38ceaf9
AD
864}
865
e3ecdffa
AD
866/**
867 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
868 *
982a820b 869 * @adev: amdgpu_device pointer
e3ecdffa
AD
870 *
871 * Frees the VRAM scratch page.
872 */
06ec9070 873static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
d38ceaf9 874{
078af1a3 875 amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
d38ceaf9
AD
876}
877
878/**
9c3f2b54 879 * amdgpu_device_program_register_sequence - program an array of registers.
d38ceaf9
AD
880 *
881 * @adev: amdgpu_device pointer
882 * @registers: pointer to the register array
883 * @array_size: size of the register array
884 *
885 * Programs an array of registers with AND and OR masks.
886 * This is a helper for setting golden registers.
887 */
9c3f2b54
AD
888void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
889 const u32 *registers,
890 const u32 array_size)
d38ceaf9
AD
891{
892 u32 tmp, reg, and_mask, or_mask;
893 int i;
894
895 if (array_size % 3)
896 return;
897
898 for (i = 0; i < array_size; i +=3) {
899 reg = registers[i + 0];
900 and_mask = registers[i + 1];
901 or_mask = registers[i + 2];
902
903 if (and_mask == 0xffffffff) {
904 tmp = or_mask;
905 } else {
906 tmp = RREG32(reg);
907 tmp &= ~and_mask;
e0d07657
HZ
908 if (adev->family >= AMDGPU_FAMILY_AI)
909 tmp |= (or_mask & and_mask);
910 else
911 tmp |= or_mask;
d38ceaf9
AD
912 }
913 WREG32(reg, tmp);
914 }
915}
916
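/*
 * Usage sketch (register name is illustrative, not a real golden table): the
 * array is consumed as (reg, and_mask, or_mask) triples, so
 *
 *	static const u32 golden_settings_example[] = {
 *		mmEXAMPLE_REG, 0xffffffff, 0x00000001,
 *	};
 *
 *	amdgpu_device_program_register_sequence(adev, golden_settings_example,
 *						ARRAY_SIZE(golden_settings_example));
 *
 * programs mmEXAMPLE_REG to 0x1 unconditionally (an and_mask of all ones
 * means the or_mask is written as-is).
 */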
e3ecdffa
AD
917/**
918 * amdgpu_device_pci_config_reset - reset the GPU
919 *
920 * @adev: amdgpu_device pointer
921 *
922 * Resets the GPU using the pci config reset sequence.
923 * Only applicable to asics prior to vega10.
924 */
8111c387 925void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
d38ceaf9
AD
926{
927 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
928}
929
af484df8
AD
930/**
931 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
932 *
933 * @adev: amdgpu_device pointer
934 *
935 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
936 */
937int amdgpu_device_pci_reset(struct amdgpu_device *adev)
938{
939 return pci_reset_function(adev->pdev);
940}
941
d38ceaf9
AD
942/*
943 * GPU doorbell aperture helpers function.
944 */
945/**
06ec9070 946 * amdgpu_device_doorbell_init - Init doorbell driver information.
d38ceaf9
AD
947 *
948 * @adev: amdgpu_device pointer
949 *
950 * Init doorbell driver information (CIK)
951 * Returns 0 on success, error on failure.
952 */
06ec9070 953static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
d38ceaf9 954{
6585661d 955
705e519e
CK
956 /* No doorbell on SI hardware generation */
957 if (adev->asic_type < CHIP_BONAIRE) {
958 adev->doorbell.base = 0;
959 adev->doorbell.size = 0;
960 adev->doorbell.num_doorbells = 0;
961 adev->doorbell.ptr = NULL;
962 return 0;
963 }
964
d6895ad3
CK
965 if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
966 return -EINVAL;
967
22357775
AD
968 amdgpu_asic_init_doorbell_index(adev);
969
d38ceaf9
AD
970 /* doorbell bar mapping */
971 adev->doorbell.base = pci_resource_start(adev->pdev, 2);
972 adev->doorbell.size = pci_resource_len(adev->pdev, 2);
973
edf600da 974 adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
9564f192 975 adev->doorbell_index.max_assignment+1);
d38ceaf9
AD
976 if (adev->doorbell.num_doorbells == 0)
977 return -EINVAL;
978
ec3db8a6 979 /* For Vega, reserve and map two pages on doorbell BAR since SDMA
88dc26e4
OZ
980 * paging queue doorbells use the second page. The
981 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
982 * doorbells are in the first page. So with paging queue enabled,
983 * the max num_doorbells should be increased by one page (0x400 in dwords)
ec3db8a6
PY
984 */
985 if (adev->asic_type >= CHIP_VEGA10)
88dc26e4 986 adev->doorbell.num_doorbells += 0x400;
ec3db8a6 987
8972e5d2
CK
988 adev->doorbell.ptr = ioremap(adev->doorbell.base,
989 adev->doorbell.num_doorbells *
990 sizeof(u32));
991 if (adev->doorbell.ptr == NULL)
d38ceaf9 992 return -ENOMEM;
d38ceaf9
AD
993
994 return 0;
995}
996
997/**
06ec9070 998 * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
d38ceaf9
AD
999 *
1000 * @adev: amdgpu_device pointer
1001 *
1002 * Tear down doorbell driver information (CIK)
1003 */
06ec9070 1004static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
d38ceaf9
AD
1005{
1006 iounmap(adev->doorbell.ptr);
1007 adev->doorbell.ptr = NULL;
1008}
1009
22cb0164 1010
d38ceaf9
AD
1011
1012/*
06ec9070 1013 * amdgpu_device_wb_*()
455a7bc2 1014 * Writeback is the method by which the GPU updates special pages in memory
ea81a173 1015 * with the status of certain GPU events (fences, ring pointers, etc.).
d38ceaf9
AD
1016 */
1017
1018/**
06ec9070 1019 * amdgpu_device_wb_fini - Disable Writeback and free memory
d38ceaf9
AD
1020 *
1021 * @adev: amdgpu_device pointer
1022 *
1023 * Disables Writeback and frees the Writeback memory (all asics).
1024 * Used at driver shutdown.
1025 */
06ec9070 1026static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
d38ceaf9
AD
1027{
1028 if (adev->wb.wb_obj) {
a76ed485
AD
1029 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
1030 &adev->wb.gpu_addr,
1031 (void **)&adev->wb.wb);
d38ceaf9
AD
1032 adev->wb.wb_obj = NULL;
1033 }
1034}
1035
1036/**
06ec9070 1037 * amdgpu_device_wb_init- Init Writeback driver info and allocate memory
d38ceaf9
AD
1038 *
1039 * @adev: amdgpu_device pointer
1040 *
455a7bc2 1041 * Initializes writeback and allocates writeback memory (all asics).
d38ceaf9
AD
1042 * Used at driver startup.
1043 * Returns 0 on success or a negative error code on failure.
1044 */
06ec9070 1045static int amdgpu_device_wb_init(struct amdgpu_device *adev)
d38ceaf9
AD
1046{
1047 int r;
1048
1049 if (adev->wb.wb_obj == NULL) {
97407b63
AD
1050 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1051 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
a76ed485
AD
1052 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1053 &adev->wb.wb_obj, &adev->wb.gpu_addr,
1054 (void **)&adev->wb.wb);
d38ceaf9
AD
1055 if (r) {
1056 dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1057 return r;
1058 }
d38ceaf9
AD
1059
1060 adev->wb.num_wb = AMDGPU_MAX_WB;
1061 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1062
1063 /* clear wb memory */
73469585 1064 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
d38ceaf9
AD
1065 }
1066
1067 return 0;
1068}
1069
1070/**
131b4b36 1071 * amdgpu_device_wb_get - Allocate a wb entry
d38ceaf9
AD
1072 *
1073 * @adev: amdgpu_device pointer
1074 * @wb: wb index
1075 *
1076 * Allocate a wb slot for use by the driver (all asics).
1077 * Returns 0 on success or -EINVAL on failure.
1078 */
131b4b36 1079int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
d38ceaf9
AD
1080{
1081 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
d38ceaf9 1082
97407b63 1083 if (offset < adev->wb.num_wb) {
7014285a 1084 __set_bit(offset, adev->wb.used);
63ae07ca 1085 *wb = offset << 3; /* convert to dw offset */
0915fdbc
ML
1086 return 0;
1087 } else {
1088 return -EINVAL;
1089 }
1090}
1091
d38ceaf9 1092/**
131b4b36 1093 * amdgpu_device_wb_free - Free a wb entry
d38ceaf9
AD
1094 *
1095 * @adev: amdgpu_device pointer
1096 * @wb: wb index
1097 *
1098 * Free a wb slot allocated for use by the driver (all asics)
1099 */
131b4b36 1100void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
d38ceaf9 1101{
73469585 1102 wb >>= 3;
d38ceaf9 1103 if (wb < adev->wb.num_wb)
73469585 1104 __clear_bit(wb, adev->wb.used);
d38ceaf9
AD
1105}
1106
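/*
 * Usage sketch: the value returned by amdgpu_device_wb_get() is a dword
 * offset into the writeback page, usable both for CPU access and for
 * programming the GPU address:
 *
 *	u32 wb;
 *
 *	if (!amdgpu_device_wb_get(adev, &wb)) {
 *		volatile u32 *cpu_addr = &adev->wb.wb[wb];
 *		u64 gpu_addr = adev->wb.gpu_addr + (wb * 4);
 *
 *		... let the GPU write status to gpu_addr, poll *cpu_addr ...
 *
 *		amdgpu_device_wb_free(adev, wb);
 *	}
 */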
d6895ad3
CK
1107/**
1108 * amdgpu_device_resize_fb_bar - try to resize FB BAR
1109 *
1110 * @adev: amdgpu_device pointer
1111 *
1112 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1113 * to fail, but if any of the BARs is not accessible after the resize we abort
1114 * driver loading by returning -ENODEV.
1115 */
1116int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1117{
453f617a 1118 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
31b8adab
CK
1119 struct pci_bus *root;
1120 struct resource *res;
1121 unsigned i;
d6895ad3
CK
1122 u16 cmd;
1123 int r;
1124
0c03b912 1125 /* Bypass for VF */
1126 if (amdgpu_sriov_vf(adev))
1127 return 0;
1128
b7221f2b
AD
1129 /* skip if the bios has already enabled large BAR */
1130 if (adev->gmc.real_vram_size &&
1131 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1132 return 0;
1133
31b8adab
CK
1134 /* Check if the root BUS has 64bit memory resources */
1135 root = adev->pdev->bus;
1136 while (root->parent)
1137 root = root->parent;
1138
1139 pci_bus_for_each_resource(root, res, i) {
0ebb7c54 1140 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
31b8adab
CK
1141 res->start > 0x100000000ull)
1142 break;
1143 }
1144
1145 /* Trying to resize is pointless without a root hub window above 4GB */
1146 if (!res)
1147 return 0;
1148
453f617a
ND
1149 /* Limit the BAR size to what is available */
1150 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
1151 rbar_size);
1152
d6895ad3
CK
1153 /* Disable memory decoding while we change the BAR addresses and size */
1154 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1155 pci_write_config_word(adev->pdev, PCI_COMMAND,
1156 cmd & ~PCI_COMMAND_MEMORY);
1157
1158 /* Free the VRAM and doorbell BAR, we most likely need to move both. */
06ec9070 1159 amdgpu_device_doorbell_fini(adev);
d6895ad3
CK
1160 if (adev->asic_type >= CHIP_BONAIRE)
1161 pci_release_resource(adev->pdev, 2);
1162
1163 pci_release_resource(adev->pdev, 0);
1164
1165 r = pci_resize_resource(adev->pdev, 0, rbar_size);
1166 if (r == -ENOSPC)
1167 DRM_INFO("Not enough PCI address space for a large BAR.");
1168 else if (r && r != -ENOTSUPP)
1169 DRM_ERROR("Problem resizing BAR0 (%d).", r);
1170
1171 pci_assign_unassigned_bus_resources(adev->pdev->bus);
1172
1173 /* When the doorbell or fb BAR isn't available we have no chance of
1174 * using the device.
1175 */
06ec9070 1176 r = amdgpu_device_doorbell_init(adev);
d6895ad3
CK
1177 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1178 return -ENODEV;
1179
1180 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1181
1182 return 0;
1183}
a05502e5 1184
d38ceaf9
AD
1185/*
1186 * GPU helpers function.
1187 */
1188/**
39c640c0 1189 * amdgpu_device_need_post - check if the hw need post or not
d38ceaf9
AD
1190 *
1191 * @adev: amdgpu_device pointer
1192 *
c836fec5
JQ
1193 * Check if the asic has been initialized (all asics) at driver startup
1194 * or if post is needed because a hw reset was performed.
1195 * Returns true if need or false if not.
d38ceaf9 1196 */
39c640c0 1197bool amdgpu_device_need_post(struct amdgpu_device *adev)
d38ceaf9
AD
1198{
1199 uint32_t reg;
1200
bec86378
ML
1201 if (amdgpu_sriov_vf(adev))
1202 return false;
1203
1204 if (amdgpu_passthrough(adev)) {
1da2c326
ML
1205 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
1206 * some old smc fw still need the driver to do vPost, otherwise the gpu hangs, while
1207 * those smc fw version above 22.15 doesn't have this flaw, so we force
1208 * vpost executed for smc version below 22.15
bec86378
ML
1209 */
1210 if (adev->asic_type == CHIP_FIJI) {
1211 int err;
1212 uint32_t fw_ver;
1213 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1214 /* force vPost if error occurred */
1215 if (err)
1216 return true;
1217
1218 fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1da2c326
ML
1219 if (fw_ver < 0x00160e00)
1220 return true;
bec86378 1221 }
bec86378 1222 }
91fe77eb 1223
e3c1b071 1224 /* Don't post if we need to reset whole hive on init */
1225 if (adev->gmc.xgmi.pending_reset)
1226 return false;
1227
91fe77eb 1228 if (adev->has_hw_reset) {
1229 adev->has_hw_reset = false;
1230 return true;
1231 }
1232
1233 /* bios scratch used on CIK+ */
1234 if (adev->asic_type >= CHIP_BONAIRE)
1235 return amdgpu_atombios_scratch_need_asic_init(adev);
1236
1237 /* check MEM_SIZE for older asics */
1238 reg = amdgpu_asic_get_config_memsize(adev);
1239
1240 if ((reg != 0) && (reg != 0xffffffff))
1241 return false;
1242
1243 return true;
bec86378
ML
1244}
1245
d38ceaf9
AD
1246/* if we get transitioned to only one device, take VGA back */
1247/**
06ec9070 1248 * amdgpu_device_vga_set_decode - enable/disable vga decode
d38ceaf9
AD
1249 *
1250 * @cookie: amdgpu_device pointer
1251 * @state: enable/disable vga decode
1252 *
1253 * Enable/disable vga decode (all asics).
1254 * Returns VGA resource flags.
1255 */
06ec9070 1256static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
d38ceaf9
AD
1257{
1258 struct amdgpu_device *adev = cookie;
1259 amdgpu_asic_set_vga_state(adev, state);
1260 if (state)
1261 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1262 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1263 else
1264 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1265}
1266
e3ecdffa
AD
1267/**
1268 * amdgpu_device_check_block_size - validate the vm block size
1269 *
1270 * @adev: amdgpu_device pointer
1271 *
1272 * Validates the vm block size specified via module parameter.
1273 * The vm block size defines number of bits in page table versus page directory,
1274 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1275 * page table and the remaining bits are in the page directory.
1276 */
06ec9070 1277static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
a1adf8be
CZ
1278{
1279 /* defines number of bits in page table versus page directory,
1280 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1281 * page table and the remaining bits are in the page directory */
bab4fee7
JZ
1282 if (amdgpu_vm_block_size == -1)
1283 return;
a1adf8be 1284
bab4fee7 1285 if (amdgpu_vm_block_size < 9) {
a1adf8be
CZ
1286 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1287 amdgpu_vm_block_size);
97489129 1288 amdgpu_vm_block_size = -1;
a1adf8be 1289 }
a1adf8be
CZ
1290}
1291
e3ecdffa
AD
1292/**
1293 * amdgpu_device_check_vm_size - validate the vm size
1294 *
1295 * @adev: amdgpu_device pointer
1296 *
1297 * Validates the vm size in GB specified via module parameter.
1298 * The VM size is the size of the GPU virtual memory space in GB.
1299 */
06ec9070 1300static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
83ca145d 1301{
64dab074
AD
1302 /* no need to check the default value */
1303 if (amdgpu_vm_size == -1)
1304 return;
1305
83ca145d
ZJ
1306 if (amdgpu_vm_size < 1) {
1307 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1308 amdgpu_vm_size);
f3368128 1309 amdgpu_vm_size = -1;
83ca145d 1310 }
83ca145d
ZJ
1311}
1312
7951e376
RZ
1313static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1314{
1315 struct sysinfo si;
a9d4fe2f 1316 bool is_os_64 = (sizeof(void *) == 8);
7951e376
RZ
1317 uint64_t total_memory;
1318 uint64_t dram_size_seven_GB = 0x1B8000000;
1319 uint64_t dram_size_three_GB = 0xB8000000;
1320
1321 if (amdgpu_smu_memory_pool_size == 0)
1322 return;
1323
1324 if (!is_os_64) {
1325 DRM_WARN("Not 64-bit OS, feature not supported\n");
1326 goto def_value;
1327 }
1328 si_meminfo(&si);
1329 total_memory = (uint64_t)si.totalram * si.mem_unit;
1330
1331 if ((amdgpu_smu_memory_pool_size == 1) ||
1332 (amdgpu_smu_memory_pool_size == 2)) {
1333 if (total_memory < dram_size_three_GB)
1334 goto def_value1;
1335 } else if ((amdgpu_smu_memory_pool_size == 4) ||
1336 (amdgpu_smu_memory_pool_size == 8)) {
1337 if (total_memory < dram_size_seven_GB)
1338 goto def_value1;
1339 } else {
1340 DRM_WARN("Smu memory pool size not supported\n");
1341 goto def_value;
1342 }
1343 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1344
1345 return;
1346
1347def_value1:
1348 DRM_WARN("No enough system memory\n");
1349def_value:
1350 adev->pm.smu_prv_buffer_size = 0;
1351}
1352
d38ceaf9 1353/**
06ec9070 1354 * amdgpu_device_check_arguments - validate module params
d38ceaf9
AD
1355 *
1356 * @adev: amdgpu_device pointer
1357 *
1358 * Validates certain module parameters and updates
1359 * the associated values used by the driver (all asics).
1360 */
912dfc84 1361static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
d38ceaf9 1362{
5b011235
CZ
1363 if (amdgpu_sched_jobs < 4) {
1364 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1365 amdgpu_sched_jobs);
1366 amdgpu_sched_jobs = 4;
76117507 1367 } else if (!is_power_of_2(amdgpu_sched_jobs)){
5b011235
CZ
1368 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1369 amdgpu_sched_jobs);
1370 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1371 }
d38ceaf9 1372
83e74db6 1373 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
f9321cc4
CK
1374 /* gart size must be greater or equal to 32M */
1375 dev_warn(adev->dev, "gart size (%d) too small\n",
1376 amdgpu_gart_size);
83e74db6 1377 amdgpu_gart_size = -1;
d38ceaf9
AD
1378 }
1379
36d38372 1380 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
c4e1a13a 1381 /* gtt size must be greater or equal to 32M */
36d38372
CK
1382 dev_warn(adev->dev, "gtt size (%d) too small\n",
1383 amdgpu_gtt_size);
1384 amdgpu_gtt_size = -1;
d38ceaf9
AD
1385 }
1386
d07f14be
RH
1387 /* valid range is between 4 and 9 inclusive */
1388 if (amdgpu_vm_fragment_size != -1 &&
1389 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1390 dev_warn(adev->dev, "valid range is between 4 and 9\n");
1391 amdgpu_vm_fragment_size = -1;
1392 }
1393
5d5bd5e3
KW
1394 if (amdgpu_sched_hw_submission < 2) {
1395 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1396 amdgpu_sched_hw_submission);
1397 amdgpu_sched_hw_submission = 2;
1398 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1399 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1400 amdgpu_sched_hw_submission);
1401 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1402 }
1403
7951e376
RZ
1404 amdgpu_device_check_smu_prv_buffer_size(adev);
1405
06ec9070 1406 amdgpu_device_check_vm_size(adev);
d38ceaf9 1407
06ec9070 1408 amdgpu_device_check_block_size(adev);
6a7f76e7 1409
19aede77 1410 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
912dfc84 1411
c6252390 1412 amdgpu_gmc_tmz_set(adev);
01a8dcec 1413
9b498efa
AD
1414 amdgpu_gmc_noretry_set(adev);
1415
e3c00faa 1416 return 0;
d38ceaf9
AD
1417}
1418
1419/**
1420 * amdgpu_switcheroo_set_state - set switcheroo state
1421 *
1422 * @pdev: pci dev pointer
1694467b 1423 * @state: vga_switcheroo state
d38ceaf9
AD
1424 *
1425 * Callback for the switcheroo driver. Suspends or resumes
1426 * the asics before or after it is powered up using ACPI methods.
1427 */
8aba21b7
LT
1428static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1429 enum vga_switcheroo_state state)
d38ceaf9
AD
1430{
1431 struct drm_device *dev = pci_get_drvdata(pdev);
de185019 1432 int r;
d38ceaf9 1433
b98c6299 1434 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
d38ceaf9
AD
1435 return;
1436
1437 if (state == VGA_SWITCHEROO_ON) {
dd4fa6c1 1438 pr_info("switched on\n");
d38ceaf9
AD
1439 /* don't suspend or resume card normally */
1440 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1441
8f66090b
TZ
1442 pci_set_power_state(pdev, PCI_D0);
1443 amdgpu_device_load_pci_state(pdev);
1444 r = pci_enable_device(pdev);
de185019
AD
1445 if (r)
1446 DRM_WARN("pci_enable_device failed (%d)\n", r);
1447 amdgpu_device_resume(dev, true);
d38ceaf9 1448
d38ceaf9 1449 dev->switch_power_state = DRM_SWITCH_POWER_ON;
d38ceaf9 1450 } else {
dd4fa6c1 1451 pr_info("switched off\n");
d38ceaf9 1452 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
de185019 1453 amdgpu_device_suspend(dev, true);
8f66090b 1454 amdgpu_device_cache_pci_state(pdev);
de185019 1455 /* Shut down the device */
8f66090b
TZ
1456 pci_disable_device(pdev);
1457 pci_set_power_state(pdev, PCI_D3cold);
d38ceaf9
AD
1458 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1459 }
1460}
1461
1462/**
1463 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1464 *
1465 * @pdev: pci dev pointer
1466 *
1467 * Callback for the switcheroo driver. Check if the switcheroo
1468 * state can be changed.
1469 * Returns true if the state can be changed, false if not.
1470 */
1471static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1472{
1473 struct drm_device *dev = pci_get_drvdata(pdev);
1474
1475 /*
1476 * FIXME: open_count is protected by drm_global_mutex but that would lead to
1477 * locking inversion with the driver load path. And the access here is
1478 * completely racy anyway. So don't bother with locking for now.
1479 */
7e13ad89 1480 return atomic_read(&dev->open_count) == 0;
d38ceaf9
AD
1481}
1482
1483static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1484 .set_gpu_state = amdgpu_switcheroo_set_state,
1485 .reprobe = NULL,
1486 .can_switch = amdgpu_switcheroo_can_switch,
1487};
1488
e3ecdffa
AD
1489/**
1490 * amdgpu_device_ip_set_clockgating_state - set the CG state
1491 *
87e3f136 1492 * @dev: amdgpu_device pointer
e3ecdffa
AD
1493 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1494 * @state: clockgating state (gate or ungate)
1495 *
1496 * Sets the requested clockgating state for all instances of
1497 * the hardware IP specified.
1498 * Returns the error code from the last instance.
1499 */
43fa561f 1500int amdgpu_device_ip_set_clockgating_state(void *dev,
2990a1fc
AD
1501 enum amd_ip_block_type block_type,
1502 enum amd_clockgating_state state)
d38ceaf9 1503{
43fa561f 1504 struct amdgpu_device *adev = dev;
d38ceaf9
AD
1505 int i, r = 0;
1506
1507 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 1508 if (!adev->ip_blocks[i].status.valid)
9ecbe7f5 1509 continue;
c722865a
RZ
1510 if (adev->ip_blocks[i].version->type != block_type)
1511 continue;
1512 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1513 continue;
1514 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1515 (void *)adev, state);
1516 if (r)
1517 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1518 adev->ip_blocks[i].version->funcs->name, r);
d38ceaf9
AD
1519 }
1520 return r;
1521}
1522
e3ecdffa
AD
1523/**
1524 * amdgpu_device_ip_set_powergating_state - set the PG state
1525 *
87e3f136 1526 * @dev: amdgpu_device pointer
e3ecdffa
AD
1527 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1528 * @state: powergating state (gate or ungate)
1529 *
1530 * Sets the requested powergating state for all instances of
1531 * the hardware IP specified.
1532 * Returns the error code from the last instance.
1533 */
43fa561f 1534int amdgpu_device_ip_set_powergating_state(void *dev,
2990a1fc
AD
1535 enum amd_ip_block_type block_type,
1536 enum amd_powergating_state state)
d38ceaf9 1537{
43fa561f 1538 struct amdgpu_device *adev = dev;
d38ceaf9
AD
1539 int i, r = 0;
1540
1541 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 1542 if (!adev->ip_blocks[i].status.valid)
9ecbe7f5 1543 continue;
c722865a
RZ
1544 if (adev->ip_blocks[i].version->type != block_type)
1545 continue;
1546 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1547 continue;
1548 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1549 (void *)adev, state);
1550 if (r)
1551 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1552 adev->ip_blocks[i].version->funcs->name, r);
d38ceaf9
AD
1553 }
1554 return r;
1555}
1556
e3ecdffa
AD
1557/**
1558 * amdgpu_device_ip_get_clockgating_state - get the CG state
1559 *
1560 * @adev: amdgpu_device pointer
1561 * @flags: clockgating feature flags
1562 *
1563 * Walks the list of IPs on the device and updates the clockgating
1564 * flags for each IP.
1565 * Updates @flags with the feature flags for each hardware IP where
1566 * clockgating is enabled.
1567 */
2990a1fc
AD
1568void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1569 u32 *flags)
6cb2d4e4
HR
1570{
1571 int i;
1572
1573 for (i = 0; i < adev->num_ip_blocks; i++) {
1574 if (!adev->ip_blocks[i].status.valid)
1575 continue;
1576 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1577 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1578 }
1579}
1580
e3ecdffa
AD
1581/**
1582 * amdgpu_device_ip_wait_for_idle - wait for idle
1583 *
1584 * @adev: amdgpu_device pointer
1585 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1586 *
1587 * Waits for the requested hardware IP to be idle.
1588 * Returns 0 for success or a negative error code on failure.
1589 */
2990a1fc
AD
1590int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1591 enum amd_ip_block_type block_type)
5dbbb60b
AD
1592{
1593 int i, r;
1594
1595 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 1596 if (!adev->ip_blocks[i].status.valid)
9ecbe7f5 1597 continue;
a1255107
AD
1598 if (adev->ip_blocks[i].version->type == block_type) {
1599 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
5dbbb60b
AD
1600 if (r)
1601 return r;
1602 break;
1603 }
1604 }
1605 return 0;
1606
1607}
1608
e3ecdffa
AD
1609/**
1610 * amdgpu_device_ip_is_idle - is the hardware IP idle
1611 *
1612 * @adev: amdgpu_device pointer
1613 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1614 *
1615 * Check if the hardware IP is idle or not.
1616 * Returns true if the IP is idle, false if not.
1617 */
2990a1fc
AD
1618bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1619 enum amd_ip_block_type block_type)
5dbbb60b
AD
1620{
1621 int i;
1622
1623 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 1624 if (!adev->ip_blocks[i].status.valid)
9ecbe7f5 1625 continue;
a1255107
AD
1626 if (adev->ip_blocks[i].version->type == block_type)
1627 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
5dbbb60b
AD
1628 }
1629 return true;
1630
1631}
1632
e3ecdffa
AD
1633/**
1634 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1635 *
1636 * @adev: amdgpu_device pointer
87e3f136 1637 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
e3ecdffa
AD
1638 *
1639 * Returns a pointer to the hardware IP block structure
1640 * if it exists for the asic, otherwise NULL.
1641 */
2990a1fc
AD
1642struct amdgpu_ip_block *
1643amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1644 enum amd_ip_block_type type)
d38ceaf9
AD
1645{
1646 int i;
1647
1648 for (i = 0; i < adev->num_ip_blocks; i++)
a1255107 1649 if (adev->ip_blocks[i].version->type == type)
d38ceaf9
AD
1650 return &adev->ip_blocks[i];
1651
1652 return NULL;
1653}
1654
1655/**
2990a1fc 1656 * amdgpu_device_ip_block_version_cmp
d38ceaf9
AD
1657 *
1658 * @adev: amdgpu_device pointer
5fc3aeeb 1659 * @type: enum amd_ip_block_type
d38ceaf9
AD
1660 * @major: major version
1661 * @minor: minor version
1662 *
1663 * return 0 if equal or greater
1664 * return 1 if smaller or the ip_block doesn't exist
1665 */
2990a1fc
AD
1666int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1667 enum amd_ip_block_type type,
1668 u32 major, u32 minor)
d38ceaf9 1669{
2990a1fc 1670 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
d38ceaf9 1671
a1255107
AD
1672 if (ip_block && ((ip_block->version->major > major) ||
1673 ((ip_block->version->major == major) &&
1674 (ip_block->version->minor >= minor))))
d38ceaf9
AD
1675 return 0;
1676
1677 return 1;
1678}
1679
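/*
 * Usage sketch: a return value of 0 means the installed IP block is at least
 * the requested version, so a caller can gate newer programming paths like:
 *
 *	if (!amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_SMC, 7, 0))
 *		... use the version 7.0+ SMC code path ...
 */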
a1255107 1680/**
2990a1fc 1681 * amdgpu_device_ip_block_add
a1255107
AD
1682 *
1683 * @adev: amdgpu_device pointer
1684 * @ip_block_version: pointer to the IP to add
1685 *
1686 * Adds the IP block driver information to the collection of IPs
1687 * on the asic.
1688 */
2990a1fc
AD
1689int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1690 const struct amdgpu_ip_block_version *ip_block_version)
a1255107
AD
1691{
1692 if (!ip_block_version)
1693 return -EINVAL;
1694
7bd939d0
LG
1695 switch (ip_block_version->type) {
1696 case AMD_IP_BLOCK_TYPE_VCN:
1697 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
1698 return 0;
1699 break;
1700 case AMD_IP_BLOCK_TYPE_JPEG:
1701 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
1702 return 0;
1703 break;
1704 default:
1705 break;
1706 }
1707
e966a725 1708 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
a0bae357
HR
1709 ip_block_version->funcs->name);
1710
a1255107
AD
1711 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1712
1713 return 0;
1714}
1715
e3ecdffa
AD
1716/**
1717 * amdgpu_device_enable_virtual_display - enable virtual display feature
1718 *
1719 * @adev: amdgpu_device pointer
1720 *
1721 * Enables the virtual display feature if the user has enabled it via
1722 * the module parameter virtual_display. This feature provides a virtual
1723 * display hardware on headless boards or in virtualized environments.
1724 * This function parses and validates the configuration string specified by
1725 * the user and configures the virtual display configuration (number of
1726 * virtual connectors, crtcs, etc.) specified.
1727 */
483ef985 1728static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
9accf2fd
ED
1729{
1730 adev->enable_virtual_display = false;
1731
1732 if (amdgpu_virtual_display) {
8f66090b 1733 const char *pci_address_name = pci_name(adev->pdev);
0f66356d 1734 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
9accf2fd
ED
1735
1736 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1737 pciaddstr_tmp = pciaddstr;
0f66356d
ED
1738 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1739 pciaddname = strsep(&pciaddname_tmp, ",");
967de2a9
YT
1740 if (!strcmp("all", pciaddname)
1741 || !strcmp(pci_address_name, pciaddname)) {
0f66356d
ED
1742 long num_crtc;
1743 int res = -1;
1744
9accf2fd 1745 adev->enable_virtual_display = true;
0f66356d
ED
1746
1747 if (pciaddname_tmp)
1748 res = kstrtol(pciaddname_tmp, 10,
1749 &num_crtc);
1750
1751 if (!res) {
1752 if (num_crtc < 1)
1753 num_crtc = 1;
1754 if (num_crtc > 6)
1755 num_crtc = 6;
1756 adev->mode_info.num_crtc = num_crtc;
1757 } else {
1758 adev->mode_info.num_crtc = 1;
1759 }
9accf2fd
ED
1760 break;
1761 }
1762 }
1763
0f66356d
ED
1764 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1765 amdgpu_virtual_display, pci_address_name,
1766 adev->enable_virtual_display, adev->mode_info.num_crtc);
9accf2fd
ED
1767
1768 kfree(pciaddstr);
1769 }
1770}
1771
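/*
 * Example usage of the virtual_display module parameter, as inferred from
 * the parser above (the PCI address is of course board specific):
 *
 *   modprobe amdgpu virtual_display=0000:26:00.0,2
 *       enable a virtual display with 2 CRTCs on that device
 *   modprobe amdgpu virtual_display=all,1
 *       enable a single CRTC virtual display on every device
 *
 * Entries are separated by ';' and the optional CRTC count after ',' is
 * clamped to the 1..6 range by the code above.
 */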
e3ecdffa
AD
1772/**
1773 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1774 *
1775 * @adev: amdgpu_device pointer
1776 *
1777 * Parses the asic configuration parameters specified in the gpu info
1778 * firmware and makes them available to the driver for use in configuring
1779 * the asic.
1780 * Returns 0 on success, -EINVAL on failure.
1781 */
e2a75f88
AD
1782static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1783{
e2a75f88 1784 const char *chip_name;
c0a43457 1785 char fw_name[40];
e2a75f88
AD
1786 int err;
1787 const struct gpu_info_firmware_header_v1_0 *hdr;
1788
ab4fe3e1
HR
1789 adev->firmware.gpu_info_fw = NULL;
1790
72de33f8 1791 if (adev->mman.discovery_bin) {
258620d0 1792 amdgpu_discovery_get_gfx_info(adev);
cc375d8c
TY
1793
1794 /*
1795 * FIXME: The bounding box is still needed by Navi12, so
1796 * temporarily read it from gpu_info firmware. Should be dropped
1797 * when DAL no longer needs it.
1798 */
1799 if (adev->asic_type != CHIP_NAVI12)
1800 return 0;
258620d0
AD
1801 }
1802
e2a75f88 1803 switch (adev->asic_type) {
e2a75f88
AD
1804#ifdef CONFIG_DRM_AMDGPU_SI
1805 case CHIP_VERDE:
1806 case CHIP_TAHITI:
1807 case CHIP_PITCAIRN:
1808 case CHIP_OLAND:
1809 case CHIP_HAINAN:
1810#endif
1811#ifdef CONFIG_DRM_AMDGPU_CIK
1812 case CHIP_BONAIRE:
1813 case CHIP_HAWAII:
1814 case CHIP_KAVERI:
1815 case CHIP_KABINI:
1816 case CHIP_MULLINS:
1817#endif
da87c30b
AD
1818 case CHIP_TOPAZ:
1819 case CHIP_TONGA:
1820 case CHIP_FIJI:
1821 case CHIP_POLARIS10:
1822 case CHIP_POLARIS11:
1823 case CHIP_POLARIS12:
1824 case CHIP_VEGAM:
1825 case CHIP_CARRIZO:
1826 case CHIP_STONEY:
27c0bc71 1827 case CHIP_VEGA20:
44b3253a 1828 case CHIP_ALDEBARAN:
84d244a3
JC
1829 case CHIP_SIENNA_CICHLID:
1830 case CHIP_NAVY_FLOUNDER:
eac88a5f 1831 case CHIP_DIMGREY_CAVEFISH:
0e5f4b09 1832 case CHIP_BEIGE_GOBY:
e2a75f88
AD
1833 default:
1834 return 0;
1835 case CHIP_VEGA10:
1836 chip_name = "vega10";
1837 break;
3f76dced
AD
1838 case CHIP_VEGA12:
1839 chip_name = "vega12";
1840 break;
2d2e5e7e 1841 case CHIP_RAVEN:
54f78a76 1842 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
54c4d17e 1843 chip_name = "raven2";
54f78a76 1844 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
741deade 1845 chip_name = "picasso";
54c4d17e
FX
1846 else
1847 chip_name = "raven";
2d2e5e7e 1848 break;
65e60f6e
LM
1849 case CHIP_ARCTURUS:
1850 chip_name = "arcturus";
1851 break;
b51a26a0 1852 case CHIP_RENOIR:
2e62f0b5
PL
1853 if (adev->apu_flags & AMD_APU_IS_RENOIR)
1854 chip_name = "renoir";
1855 else
1856 chip_name = "green_sardine";
b51a26a0 1857 break;
23c6268e
HR
1858 case CHIP_NAVI10:
1859 chip_name = "navi10";
1860 break;
ed42cfe1
XY
1861 case CHIP_NAVI14:
1862 chip_name = "navi14";
1863 break;
42b325e5
XY
1864 case CHIP_NAVI12:
1865 chip_name = "navi12";
1866 break;
4e52a9f8
HR
1867 case CHIP_VANGOGH:
1868 chip_name = "vangogh";
1869 break;
e2a75f88
AD
1870 }
1871
1872 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
ab4fe3e1 1873 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
e2a75f88
AD
1874 if (err) {
1875 dev_err(adev->dev,
1876 "Failed to load gpu_info firmware \"%s\"\n",
1877 fw_name);
1878 goto out;
1879 }
ab4fe3e1 1880 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
e2a75f88
AD
1881 if (err) {
1882 dev_err(adev->dev,
1883 "Failed to validate gpu_info firmware \"%s\"\n",
1884 fw_name);
1885 goto out;
1886 }
1887
ab4fe3e1 1888 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
e2a75f88
AD
1889 amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1890
1891 switch (hdr->version_major) {
1892 case 1:
1893 {
1894 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
ab4fe3e1 1895 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
e2a75f88
AD
1896 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1897
cc375d8c
TY
1898 /*
1899 * Should be dropped when DAL no longer needs it.
1900 */
1901 if (adev->asic_type == CHIP_NAVI12)
ec51d3fa
XY
1902 goto parse_soc_bounding_box;
1903
b5ab16bf
AD
1904 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1905 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1906 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1907 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
e2a75f88 1908 adev->gfx.config.max_texture_channel_caches =
b5ab16bf
AD
1909 le32_to_cpu(gpu_info_fw->gc_num_tccs);
1910 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1911 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1912 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1913 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
e2a75f88 1914 adev->gfx.config.double_offchip_lds_buf =
b5ab16bf
AD
1915 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1916 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
51fd0370
HZ
1917 adev->gfx.cu_info.max_waves_per_simd =
1918 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1919 adev->gfx.cu_info.max_scratch_slots_per_cu =
1920 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
1921 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
48321c3d 1922 if (hdr->version_minor >= 1) {
35c2e910
HZ
1923 const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1924 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1925 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1926 adev->gfx.config.num_sc_per_sh =
1927 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
1928 adev->gfx.config.num_packer_per_sc =
1929 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
1930 }
ec51d3fa
XY
1931
1932parse_soc_bounding_box:
ec51d3fa
XY
1933 /*
1934 * soc bounding box info is not integrated in the discovery table,
258620d0 1935 * so we always need to parse it from the gpu info firmware if needed.
ec51d3fa 1936 */
48321c3d
HW
1937 if (hdr->version_minor == 2) {
1938 const struct gpu_info_firmware_v1_2 *gpu_info_fw =
1939 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
1940 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1941 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
1942 }
e2a75f88
AD
1943 break;
1944 }
1945 default:
1946 dev_err(adev->dev,
1947 "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
1948 err = -EINVAL;
1949 goto out;
1950 }
1951out:
e2a75f88
AD
1952 return err;
1953}
1954
e3ecdffa
AD
1955/**
1956 * amdgpu_device_ip_early_init - run early init for hardware IPs
1957 *
1958 * @adev: amdgpu_device pointer
1959 *
1960 * Early initialization pass for hardware IPs. The hardware IPs that make
1961 * up each asic are discovered and each IP's early_init callback is run. This
1962 * is the first stage in initializing the asic.
1963 * Returns 0 on success, negative error code on failure.
1964 */
06ec9070 1965static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
d38ceaf9 1966{
aaa36a97 1967 int i, r;
d38ceaf9 1968
483ef985 1969 amdgpu_device_enable_virtual_display(adev);
a6be7570 1970
00a979f3 1971 if (amdgpu_sriov_vf(adev)) {
00a979f3 1972 r = amdgpu_virt_request_full_gpu(adev, true);
aaa36a97
AD
1973 if (r)
1974 return r;
00a979f3
WS
1975 }
1976
d38ceaf9 1977 switch (adev->asic_type) {
33f34802
KW
1978#ifdef CONFIG_DRM_AMDGPU_SI
1979 case CHIP_VERDE:
1980 case CHIP_TAHITI:
1981 case CHIP_PITCAIRN:
1982 case CHIP_OLAND:
1983 case CHIP_HAINAN:
295d0daf 1984 adev->family = AMDGPU_FAMILY_SI;
33f34802
KW
1985 r = si_set_ip_blocks(adev);
1986 if (r)
1987 return r;
1988 break;
1989#endif
a2e73f56
AD
1990#ifdef CONFIG_DRM_AMDGPU_CIK
1991 case CHIP_BONAIRE:
1992 case CHIP_HAWAII:
1993 case CHIP_KAVERI:
1994 case CHIP_KABINI:
1995 case CHIP_MULLINS:
e1ad2d53 1996 if (adev->flags & AMD_IS_APU)
a2e73f56 1997 adev->family = AMDGPU_FAMILY_KV;
e1ad2d53
AD
1998 else
1999 adev->family = AMDGPU_FAMILY_CI;
a2e73f56
AD
2000
2001 r = cik_set_ip_blocks(adev);
2002 if (r)
2003 return r;
2004 break;
2005#endif
da87c30b
AD
2006 case CHIP_TOPAZ:
2007 case CHIP_TONGA:
2008 case CHIP_FIJI:
2009 case CHIP_POLARIS10:
2010 case CHIP_POLARIS11:
2011 case CHIP_POLARIS12:
2012 case CHIP_VEGAM:
2013 case CHIP_CARRIZO:
2014 case CHIP_STONEY:
2015 if (adev->flags & AMD_IS_APU)
2016 adev->family = AMDGPU_FAMILY_CZ;
2017 else
2018 adev->family = AMDGPU_FAMILY_VI;
2019
2020 r = vi_set_ip_blocks(adev);
2021 if (r)
2022 return r;
2023 break;
e48a3cd9
AD
2024 case CHIP_VEGA10:
2025 case CHIP_VEGA12:
e4bd8170 2026 case CHIP_VEGA20:
e48a3cd9 2027 case CHIP_RAVEN:
61cf44c1 2028 case CHIP_ARCTURUS:
b51a26a0 2029 case CHIP_RENOIR:
c00a18ec 2030 case CHIP_ALDEBARAN:
70534d1e 2031 if (adev->flags & AMD_IS_APU)
2ca8a5d2
CZ
2032 adev->family = AMDGPU_FAMILY_RV;
2033 else
2034 adev->family = AMDGPU_FAMILY_AI;
460826e6
KW
2035
2036 r = soc15_set_ip_blocks(adev);
2037 if (r)
2038 return r;
2039 break;
0a5b8c7b 2040 case CHIP_NAVI10:
7ecb5cd4 2041 case CHIP_NAVI14:
4808cf9c 2042 case CHIP_NAVI12:
11e8aef5 2043 case CHIP_SIENNA_CICHLID:
41f446bf 2044 case CHIP_NAVY_FLOUNDER:
144722fa 2045 case CHIP_DIMGREY_CAVEFISH:
b41f5b7a 2046 case CHIP_BEIGE_GOBY:
4e52a9f8
HR
2047 case CHIP_VANGOGH:
2048 if (adev->asic_type == CHIP_VANGOGH)
2049 adev->family = AMDGPU_FAMILY_VGH;
2050 else
2051 adev->family = AMDGPU_FAMILY_NV;
0a5b8c7b
HR
2052
2053 r = nv_set_ip_blocks(adev);
2054 if (r)
2055 return r;
2056 break;
d38ceaf9
AD
2057 default:
2058 /* FIXME: not supported yet */
2059 return -EINVAL;
2060 }
2061
1884734a 2062 amdgpu_amdkfd_device_probe(adev);
2063
3b94fb10 2064 adev->pm.pp_feature = amdgpu_pp_feature_mask;
a35ad98b 2065 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
00544006 2066 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
4215a119
HC
2067 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2068 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
00f54b97 2069
d38ceaf9
AD
2070 for (i = 0; i < adev->num_ip_blocks; i++) {
2071 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
ed8cf00c
HR
2072 DRM_ERROR("disabled ip block: %d <%s>\n",
2073 i, adev->ip_blocks[i].version->funcs->name);
a1255107 2074 adev->ip_blocks[i].status.valid = false;
d38ceaf9 2075 } else {
a1255107
AD
2076 if (adev->ip_blocks[i].version->funcs->early_init) {
2077 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2c1a2784 2078 if (r == -ENOENT) {
a1255107 2079 adev->ip_blocks[i].status.valid = false;
2c1a2784 2080 } else if (r) {
a1255107
AD
2081 DRM_ERROR("early_init of IP block <%s> failed %d\n",
2082 adev->ip_blocks[i].version->funcs->name, r);
d38ceaf9 2083 return r;
2c1a2784 2084 } else {
a1255107 2085 adev->ip_blocks[i].status.valid = true;
2c1a2784 2086 }
974e6b64 2087 } else {
a1255107 2088 adev->ip_blocks[i].status.valid = true;
d38ceaf9 2089 }
d38ceaf9 2090 }
21a249ca
AD
2091 /* get the vbios after the asic_funcs are set up */
2092 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
6e29c227
AD
2093 r = amdgpu_device_parse_gpu_info_fw(adev);
2094 if (r)
2095 return r;
2096
21a249ca
AD
2097 /* Read BIOS */
2098 if (!amdgpu_get_bios(adev))
2099 return -EINVAL;
2100
2101 r = amdgpu_atombios_init(adev);
2102 if (r) {
2103 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2104 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2105 return r;
2106 }
77eabc6f
PJZ
2107
2108 /* get pf2vf msg info at its earliest time */
2109 if (amdgpu_sriov_vf(adev))
2110 amdgpu_virt_init_data_exchange(adev);
2111
21a249ca 2112 }
d38ceaf9
AD
2113 }
2114
395d1fb9
NH
2115 adev->cg_flags &= amdgpu_cg_mask;
2116 adev->pg_flags &= amdgpu_pg_mask;
2117
d38ceaf9
AD
2118 return 0;
2119}
2120
0a4f2520
RZ
2121static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2122{
2123 int i, r;
2124
2125 for (i = 0; i < adev->num_ip_blocks; i++) {
2126 if (!adev->ip_blocks[i].status.sw)
2127 continue;
2128 if (adev->ip_blocks[i].status.hw)
2129 continue;
2130 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2d11fd3f 2131 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
0a4f2520
RZ
2132 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2133 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2134 if (r) {
2135 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2136 adev->ip_blocks[i].version->funcs->name, r);
2137 return r;
2138 }
2139 adev->ip_blocks[i].status.hw = true;
2140 }
2141 }
2142
2143 return 0;
2144}
2145
2146static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2147{
2148 int i, r;
2149
2150 for (i = 0; i < adev->num_ip_blocks; i++) {
2151 if (!adev->ip_blocks[i].status.sw)
2152 continue;
2153 if (adev->ip_blocks[i].status.hw)
2154 continue;
2155 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2156 if (r) {
2157 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2158 adev->ip_blocks[i].version->funcs->name, r);
2159 return r;
2160 }
2161 adev->ip_blocks[i].status.hw = true;
2162 }
2163
2164 return 0;
2165}
2166
7a3e0bb2
RZ
2167static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2168{
2169 int r = 0;
2170 int i;
80f41f84 2171 uint32_t smu_version;
7a3e0bb2
RZ
2172
2173 if (adev->asic_type >= CHIP_VEGA10) {
2174 for (i = 0; i < adev->num_ip_blocks; i++) {
482f0e53
ML
2175 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2176 continue;
2177
e3c1b071 2178 if (!adev->ip_blocks[i].status.sw)
2179 continue;
2180
482f0e53
ML
2181 /* no need to do the fw loading again if already done*/
2182 if (adev->ip_blocks[i].status.hw == true)
2183 break;
2184
53b3f8f4 2185 if (amdgpu_in_reset(adev) || adev->in_suspend) {
482f0e53
ML
2186 r = adev->ip_blocks[i].version->funcs->resume(adev);
2187 if (r) {
2188 DRM_ERROR("resume of IP block <%s> failed %d\n",
7a3e0bb2 2189 adev->ip_blocks[i].version->funcs->name, r);
482f0e53
ML
2190 return r;
2191 }
2192 } else {
2193 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2194 if (r) {
2195 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2196 adev->ip_blocks[i].version->funcs->name, r);
2197 return r;
7a3e0bb2 2198 }
7a3e0bb2 2199 }
482f0e53
ML
2200
2201 adev->ip_blocks[i].status.hw = true;
2202 break;
7a3e0bb2
RZ
2203 }
2204 }
482f0e53 2205
8973d9ec
ED
2206 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2207 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
7a3e0bb2 2208
80f41f84 2209 return r;
7a3e0bb2
RZ
2210}
2211
e3ecdffa
AD
2212/**
2213 * amdgpu_device_ip_init - run init for hardware IPs
2214 *
2215 * @adev: amdgpu_device pointer
2216 *
2217 * Main initialization pass for hardware IPs. The list of all the hardware
2218 * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2219 * are run. sw_init initializes the software state associated with each IP
2220 * and hw_init initializes the hardware associated with each IP.
2221 * Returns 0 on success, negative error code on failure.
2222 */
06ec9070 2223static int amdgpu_device_ip_init(struct amdgpu_device *adev)
d38ceaf9
AD
2224{
2225 int i, r;
2226
c030f2e4 2227 r = amdgpu_ras_init(adev);
2228 if (r)
2229 return r;
2230
d38ceaf9 2231 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 2232 if (!adev->ip_blocks[i].status.valid)
d38ceaf9 2233 continue;
a1255107 2234 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2c1a2784 2235 if (r) {
a1255107
AD
2236 DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2237 adev->ip_blocks[i].version->funcs->name, r);
72d3f592 2238 goto init_failed;
2c1a2784 2239 }
a1255107 2240 adev->ip_blocks[i].status.sw = true;
bfca0289 2241
d38ceaf9 2242 /* need to do gmc hw init early so we can allocate gpu mem */
a1255107 2243 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
06ec9070 2244 r = amdgpu_device_vram_scratch_init(adev);
2c1a2784
AD
2245 if (r) {
2246 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
72d3f592 2247 goto init_failed;
2c1a2784 2248 }
a1255107 2249 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2c1a2784
AD
2250 if (r) {
2251 DRM_ERROR("hw_init %d failed %d\n", i, r);
72d3f592 2252 goto init_failed;
2c1a2784 2253 }
06ec9070 2254 r = amdgpu_device_wb_init(adev);
2c1a2784 2255 if (r) {
06ec9070 2256 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
72d3f592 2257 goto init_failed;
2c1a2784 2258 }
a1255107 2259 adev->ip_blocks[i].status.hw = true;
2493664f
ML
2260
2261 /* right after GMC hw init, we create CSA */
f92d5c61 2262 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
1e256e27
RZ
2263 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2264 AMDGPU_GEM_DOMAIN_VRAM,
2265 AMDGPU_CSA_SIZE);
2493664f
ML
2266 if (r) {
2267 DRM_ERROR("allocate CSA failed %d\n", r);
72d3f592 2268 goto init_failed;
2493664f
ML
2269 }
2270 }
d38ceaf9
AD
2271 }
2272 }
2273
c9ffa427
YT
2274 if (amdgpu_sriov_vf(adev))
2275 amdgpu_virt_init_data_exchange(adev);
2276
533aed27
AG
2277 r = amdgpu_ib_pool_init(adev);
2278 if (r) {
2279 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2280 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2281 goto init_failed;
2282 }
2283
c8963ea4
RZ
2284 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2285 if (r)
72d3f592 2286 goto init_failed;
0a4f2520
RZ
2287
2288 r = amdgpu_device_ip_hw_init_phase1(adev);
2289 if (r)
72d3f592 2290 goto init_failed;
0a4f2520 2291
7a3e0bb2
RZ
2292 r = amdgpu_device_fw_loading(adev);
2293 if (r)
72d3f592 2294 goto init_failed;
7a3e0bb2 2295
0a4f2520
RZ
2296 r = amdgpu_device_ip_hw_init_phase2(adev);
2297 if (r)
72d3f592 2298 goto init_failed;
d38ceaf9 2299
121a2bc6
AG
2300 /*
2301 * retired pages will be loaded from eeprom and reserved here,
2302 * it should be called after amdgpu_device_ip_hw_init_phase2 since
2303 * for some ASICs the RAS EEPROM code relies on SMU fully functioning
2304 * for I2C communication, which is only true at this point.
b82e65a9
GC
2305 *
2306 * amdgpu_ras_recovery_init may fail, but the upper layer only cares
2307 * about failures caused by a bad gpu situation and stops the amdgpu init
2308 * process accordingly. For other failure cases, it still releases all
2309 * the resources and prints an error message, rather than returning a
2310 * negative value to the upper level.
121a2bc6
AG
2311 *
2312 * Note: theoretically, this should be called before all vram allocations
2313 * to protect retired pages from being abused
2314 */
b82e65a9
GC
2315 r = amdgpu_ras_recovery_init(adev);
2316 if (r)
2317 goto init_failed;
121a2bc6 2318
3e2e2ab5
HZ
2319 if (adev->gmc.xgmi.num_physical_nodes > 1)
2320 amdgpu_xgmi_add_device(adev);
e3c1b071 2321
2322 /* Don't init kfd if whole hive need to be reset during init */
2323 if (!adev->gmc.xgmi.pending_reset)
2324 amdgpu_amdkfd_device_init(adev);
c6332b97 2325
bd607166
KR
2326 amdgpu_fru_get_product_info(adev);
2327
72d3f592 2328init_failed:
c9ffa427 2329 if (amdgpu_sriov_vf(adev))
c6332b97 2330 amdgpu_virt_release_full_gpu(adev, true);
2331
72d3f592 2332 return r;
d38ceaf9
AD
2333}
2334
e3ecdffa
AD
2335/**
2336 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2337 *
2338 * @adev: amdgpu_device pointer
2339 *
2340 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
2341 * this function before a GPU reset. If the value is retained after a
2342 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
2343 */
06ec9070 2344static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
0c49e0b8
CZ
2345{
2346 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2347}
2348
e3ecdffa
AD
2349/**
2350 * amdgpu_device_check_vram_lost - check if vram is valid
2351 *
2352 * @adev: amdgpu_device pointer
2353 *
2354 * Checks the reset magic value written to the gart pointer in VRAM.
2355 * The driver calls this after a GPU reset to see if the contents of
2356 * VRAM are lost or not.
2357 * Returns true if vram is lost, false if not.
2358 */
06ec9070 2359static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
0c49e0b8 2360{
dadce777
EQ
2361 if (memcmp(adev->gart.ptr, adev->reset_magic,
2362 AMDGPU_RESET_MAGIC_NUM))
2363 return true;
2364
53b3f8f4 2365 if (!amdgpu_in_reset(adev))
dadce777
EQ
2366 return false;
2367
2368 /*
2369 * For all ASICs with baco/mode1 reset, the VRAM is
2370 * always assumed to be lost.
2371 */
2372 switch (amdgpu_asic_reset_method(adev)) {
2373 case AMD_RESET_METHOD_BACO:
2374 case AMD_RESET_METHOD_MODE1:
2375 return true;
2376 default:
2377 return false;
2378 }
0c49e0b8
CZ
2379}
2380
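/*
 * Illustrative sketch only: the intended usage pattern of the two helpers
 * above around a GPU reset. The wrapper below is hypothetical; the real
 * reset paths live elsewhere in this file.
 */
static bool example_reset_with_vram_check(struct amdgpu_device *adev)
{
	/* record the magic value in VRAM before resetting the ASIC */
	amdgpu_device_fill_reset_magic(adev);

	/* ... perform the actual ASIC reset here ... */

	/* after the reset, report whether VRAM contents survived */
	return amdgpu_device_check_vram_lost(adev);
}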
e3ecdffa 2381/**
1112a46b 2382 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
e3ecdffa
AD
2383 *
2384 * @adev: amdgpu_device pointer
b8b72130 2385 * @state: clockgating state (gate or ungate)
e3ecdffa 2386 *
e3ecdffa 2387 * The list of all the hardware IPs that make up the asic is walked and the
1112a46b
RZ
2388 * set_clockgating_state callbacks are run.
2389 * The late initialization pass enables clockgating for hardware IPs;
2390 * the fini or suspend pass disables it.
e3ecdffa
AD
2391 * Returns 0 on success, negative error code on failure.
2392 */
fdd34271 2393
5d89bb2d
LL
2394int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2395 enum amd_clockgating_state state)
d38ceaf9 2396{
1112a46b 2397 int i, j, r;
d38ceaf9 2398
4a2ba394
SL
2399 if (amdgpu_emu_mode == 1)
2400 return 0;
2401
1112a46b
RZ
2402 for (j = 0; j < adev->num_ip_blocks; j++) {
2403 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
a2d31dc3 2404 if (!adev->ip_blocks[i].status.late_initialized)
d38ceaf9 2405 continue;
5d70a549
PV
2406 /* skip CG for GFX on S0ix */
2407 if (adev->in_s0ix &&
2408 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)
2409 continue;
4a446d55 2410 /* skip CG for VCE/UVD, it's handled specially */
a1255107 2411 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
57716327 2412 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
34319b32 2413 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
52f2e779 2414 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
57716327 2415 adev->ip_blocks[i].version->funcs->set_clockgating_state) {
4a446d55 2416 /* enable clockgating to save power */
a1255107 2417 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
1112a46b 2418 state);
4a446d55
AD
2419 if (r) {
2420 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
a1255107 2421 adev->ip_blocks[i].version->funcs->name, r);
4a446d55
AD
2422 return r;
2423 }
b0b00ff1 2424 }
d38ceaf9 2425 }
06b18f61 2426
c9f96fd5
RZ
2427 return 0;
2428}
2429
5d89bb2d
LL
2430int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
2431 enum amd_powergating_state state)
c9f96fd5 2432{
1112a46b 2433 int i, j, r;
06b18f61 2434
c9f96fd5
RZ
2435 if (amdgpu_emu_mode == 1)
2436 return 0;
2437
1112a46b
RZ
2438 for (j = 0; j < adev->num_ip_blocks; j++) {
2439 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
a2d31dc3 2440 if (!adev->ip_blocks[i].status.late_initialized)
c9f96fd5 2441 continue;
5d70a549
PV
2442 /* skip PG for GFX on S0ix */
2443 if (adev->in_s0ix &&
2444 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)
2445 continue;
c9f96fd5
RZ
2446 /* skip PG for VCE/UVD, it's handled specially */
2447 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2448 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2449 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
52f2e779 2450 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
c9f96fd5
RZ
2451 adev->ip_blocks[i].version->funcs->set_powergating_state) {
2452 /* enable powergating to save power */
2453 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
1112a46b 2454 state);
c9f96fd5
RZ
2455 if (r) {
2456 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2457 adev->ip_blocks[i].version->funcs->name, r);
2458 return r;
2459 }
2460 }
2461 }
2dc80b00
S
2462 return 0;
2463}
2464
beff74bc
AD
2465static int amdgpu_device_enable_mgpu_fan_boost(void)
2466{
2467 struct amdgpu_gpu_instance *gpu_ins;
2468 struct amdgpu_device *adev;
2469 int i, ret = 0;
2470
2471 mutex_lock(&mgpu_info.mutex);
2472
2473 /*
2474 * MGPU fan boost feature should be enabled
2475 * only when there are two or more dGPUs in
2476 * the system
2477 */
2478 if (mgpu_info.num_dgpu < 2)
2479 goto out;
2480
2481 for (i = 0; i < mgpu_info.num_dgpu; i++) {
2482 gpu_ins = &(mgpu_info.gpu_ins[i]);
2483 adev = gpu_ins->adev;
2484 if (!(adev->flags & AMD_IS_APU) &&
f10bb940 2485 !gpu_ins->mgpu_fan_enabled) {
beff74bc
AD
2486 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2487 if (ret)
2488 break;
2489
2490 gpu_ins->mgpu_fan_enabled = 1;
2491 }
2492 }
2493
2494out:
2495 mutex_unlock(&mgpu_info.mutex);
2496
2497 return ret;
2498}
2499
e3ecdffa
AD
2500/**
2501 * amdgpu_device_ip_late_init - run late init for hardware IPs
2502 *
2503 * @adev: amdgpu_device pointer
2504 *
2505 * Late initialization pass for hardware IPs. The list of all the hardware
2506 * IPs that make up the asic is walked and the late_init callbacks are run.
2507 * late_init covers any special initialization that an IP requires
2508 * after all of them have been initialized or something that needs to happen
2509 * late in the init process.
2510 * Returns 0 on success, negative error code on failure.
2511 */
06ec9070 2512static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2dc80b00 2513{
60599a03 2514 struct amdgpu_gpu_instance *gpu_instance;
2dc80b00
S
2515 int i = 0, r;
2516
2517 for (i = 0; i < adev->num_ip_blocks; i++) {
73f847db 2518 if (!adev->ip_blocks[i].status.hw)
2dc80b00
S
2519 continue;
2520 if (adev->ip_blocks[i].version->funcs->late_init) {
2521 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2522 if (r) {
2523 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2524 adev->ip_blocks[i].version->funcs->name, r);
2525 return r;
2526 }
2dc80b00 2527 }
73f847db 2528 adev->ip_blocks[i].status.late_initialized = true;
2dc80b00
S
2529 }
2530
a891d239
DL
2531 amdgpu_ras_set_error_query_ready(adev, true);
2532
1112a46b
RZ
2533 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2534 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
916ac57f 2535
06ec9070 2536 amdgpu_device_fill_reset_magic(adev);
d38ceaf9 2537
beff74bc
AD
2538 r = amdgpu_device_enable_mgpu_fan_boost();
2539 if (r)
2540 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2541
2d02893f 2542 /* For XGMI + passthrough configuration on arcturus, enable light SBR */
2543 if (adev->asic_type == CHIP_ARCTURUS &&
2544 amdgpu_passthrough(adev) &&
2545 adev->gmc.xgmi.num_physical_nodes > 1)
2546 smu_set_light_sbr(&adev->smu, true);
60599a03
EQ
2547
2548 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2549 mutex_lock(&mgpu_info.mutex);
2550
2551 /*
2552 * Reset device p-state to low as this was booted with high.
2553 *
2554 * This should be performed only after all devices from the same
2555 * hive get initialized.
2556 *
2557 * However, the number of devices in a hive is not known in advance,
2558 * as it is counted one by one during device initialization.
2559 *
2560 * So, we wait for all XGMI interlinked devices to be initialized.
2561 * This may bring some delays as those devices may come from
2562 * different hives. But that should be OK.
2563 */
2564 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2565 for (i = 0; i < mgpu_info.num_gpu; i++) {
2566 gpu_instance = &(mgpu_info.gpu_ins[i]);
2567 if (gpu_instance->adev->flags & AMD_IS_APU)
2568 continue;
2569
d84a430d
JK
2570 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2571 AMDGPU_XGMI_PSTATE_MIN);
60599a03
EQ
2572 if (r) {
2573 DRM_ERROR("pstate setting failed (%d).\n", r);
2574 break;
2575 }
2576 }
2577 }
2578
2579 mutex_unlock(&mgpu_info.mutex);
2580 }
2581
d38ceaf9
AD
2582 return 0;
2583}
2584
e9669fb7 2585static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
d38ceaf9
AD
2586{
2587 int i, r;
2588
e9669fb7
AG
2589 for (i = 0; i < adev->num_ip_blocks; i++) {
2590 if (!adev->ip_blocks[i].version->funcs->early_fini)
2591 continue;
5278a159 2592
e9669fb7
AG
2593 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev);
2594 if (r) {
2595 DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
2596 adev->ip_blocks[i].version->funcs->name, r);
2597 }
2598 }
c030f2e4 2599
e9669fb7 2600 amdgpu_amdkfd_suspend(adev, false);
a82400b5 2601
05df1f01 2602 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
fdd34271
RZ
2603 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2604
3e96dbfd
AD
2605 /* need to disable SMC first */
2606 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 2607 if (!adev->ip_blocks[i].status.hw)
3e96dbfd 2608 continue;
fdd34271 2609 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
a1255107 2610 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
3e96dbfd
AD
2611 /* XXX handle errors */
2612 if (r) {
2613 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
a1255107 2614 adev->ip_blocks[i].version->funcs->name, r);
3e96dbfd 2615 }
a1255107 2616 adev->ip_blocks[i].status.hw = false;
3e96dbfd
AD
2617 break;
2618 }
2619 }
2620
d38ceaf9 2621 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
a1255107 2622 if (!adev->ip_blocks[i].status.hw)
d38ceaf9 2623 continue;
8201a67a 2624
a1255107 2625 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
d38ceaf9 2626 /* XXX handle errors */
2c1a2784 2627 if (r) {
a1255107
AD
2628 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2629 adev->ip_blocks[i].version->funcs->name, r);
2c1a2784 2630 }
8201a67a 2631
a1255107 2632 adev->ip_blocks[i].status.hw = false;
d38ceaf9
AD
2633 }
2634
e9669fb7
AG
2635 return 0;
2636}
2637
2638/**
2639 * amdgpu_device_ip_fini - run fini for hardware IPs
2640 *
2641 * @adev: amdgpu_device pointer
2642 *
2643 * Main teardown pass for hardware IPs. The list of all the hardware
2644 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2645 * are run. hw_fini tears down the hardware associated with each IP
2646 * and sw_fini tears down any software state associated with each IP.
2647 * Returns 0 on success, negative error code on failure.
2648 */
2649static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2650{
2651 int i, r;
2652
2653 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2654 amdgpu_virt_release_ras_err_handler_data(adev);
2655
2656 amdgpu_ras_pre_fini(adev);
2657
2658 if (adev->gmc.xgmi.num_physical_nodes > 1)
2659 amdgpu_xgmi_remove_device(adev);
2660
2661 amdgpu_amdkfd_device_fini_sw(adev);
9950cda2 2662
d38ceaf9 2663 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
a1255107 2664 if (!adev->ip_blocks[i].status.sw)
d38ceaf9 2665 continue;
c12aba3a
ML
2666
2667 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
c8963ea4 2668 amdgpu_ucode_free_bo(adev);
1e256e27 2669 amdgpu_free_static_csa(&adev->virt.csa_obj);
c12aba3a
ML
2670 amdgpu_device_wb_fini(adev);
2671 amdgpu_device_vram_scratch_fini(adev);
533aed27 2672 amdgpu_ib_pool_fini(adev);
c12aba3a
ML
2673 }
2674
a1255107 2675 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
d38ceaf9 2676 /* XXX handle errors */
2c1a2784 2677 if (r) {
a1255107
AD
2678 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2679 adev->ip_blocks[i].version->funcs->name, r);
2c1a2784 2680 }
a1255107
AD
2681 adev->ip_blocks[i].status.sw = false;
2682 adev->ip_blocks[i].status.valid = false;
d38ceaf9
AD
2683 }
2684
a6dcfd9c 2685 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
a1255107 2686 if (!adev->ip_blocks[i].status.late_initialized)
8a2eef1d 2687 continue;
a1255107
AD
2688 if (adev->ip_blocks[i].version->funcs->late_fini)
2689 adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2690 adev->ip_blocks[i].status.late_initialized = false;
a6dcfd9c
ML
2691 }
2692
c030f2e4 2693 amdgpu_ras_fini(adev);
2694
030308fc 2695 if (amdgpu_sriov_vf(adev))
24136135
ML
2696 if (amdgpu_virt_release_full_gpu(adev, false))
2697 DRM_ERROR("failed to release exclusive mode on fini\n");
2493664f 2698
d38ceaf9
AD
2699 return 0;
2700}
2701
e3ecdffa 2702/**
beff74bc 2703 * amdgpu_device_delayed_init_work_handler - work handler for IB tests
e3ecdffa 2704 *
1112a46b 2705 * @work: work_struct.
e3ecdffa 2706 */
beff74bc 2707static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2dc80b00
S
2708{
2709 struct amdgpu_device *adev =
beff74bc 2710 container_of(work, struct amdgpu_device, delayed_init_work.work);
916ac57f
RZ
2711 int r;
2712
2713 r = amdgpu_ib_ring_tests(adev);
2714 if (r)
2715 DRM_ERROR("ib ring test failed (%d).\n", r);
2dc80b00
S
2716}
2717
1e317b99
RZ
2718static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2719{
2720 struct amdgpu_device *adev =
2721 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2722
2723 mutex_lock(&adev->gfx.gfx_off_mutex);
2724 if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
2725 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2726 adev->gfx.gfx_off_state = true;
2727 }
2728 mutex_unlock(&adev->gfx.gfx_off_mutex);
2729}
2730
e3ecdffa 2731/**
e7854a03 2732 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
e3ecdffa
AD
2733 *
2734 * @adev: amdgpu_device pointer
2735 *
2736 * Main suspend function for hardware IPs. The list of all the hardware
2737 * IPs that make up the asic is walked, clockgating is disabled and the
2738 * suspend callbacks are run. suspend puts the hardware and software state
2739 * in each IP into a state suitable for suspend.
2740 * Returns 0 on success, negative error code on failure.
2741 */
e7854a03
AD
2742static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2743{
2744 int i, r;
2745
50ec83f0
AD
2746 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2747 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
05df1f01 2748
e7854a03
AD
2749 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2750 if (!adev->ip_blocks[i].status.valid)
2751 continue;
2b9f7848 2752
e7854a03 2753 /* displays are handled separately */
2b9f7848
ND
2754 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2755 continue;
2756
2757 /* XXX handle errors */
2758 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2759 /* XXX handle errors */
2760 if (r) {
2761 DRM_ERROR("suspend of IP block <%s> failed %d\n",
2762 adev->ip_blocks[i].version->funcs->name, r);
2763 return r;
e7854a03 2764 }
2b9f7848
ND
2765
2766 adev->ip_blocks[i].status.hw = false;
e7854a03
AD
2767 }
2768
e7854a03
AD
2769 return 0;
2770}
2771
2772/**
2773 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2774 *
2775 * @adev: amdgpu_device pointer
2776 *
2777 * Main suspend function for hardware IPs. The list of all the hardware
2778 * IPs that make up the asic is walked, clockgating is disabled and the
2779 * suspend callbacks are run. suspend puts the hardware and software state
2780 * in each IP into a state suitable for suspend.
2781 * Returns 0 on success, negative error code on failure.
2782 */
2783static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
d38ceaf9
AD
2784{
2785 int i, r;
2786
557f42a2 2787 if (adev->in_s0ix)
34416931 2788 amdgpu_gfx_state_change_set(adev, sGpuChangeState_D3Entry);
34416931 2789
d38ceaf9 2790 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
a1255107 2791 if (!adev->ip_blocks[i].status.valid)
d38ceaf9 2792 continue;
e7854a03
AD
2793 /* displays are handled in phase1 */
2794 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2795 continue;
bff77e86
LM
2796 /* PSP lost connection when err_event_athub occurs */
2797 if (amdgpu_ras_intr_triggered() &&
2798 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2799 adev->ip_blocks[i].status.hw = false;
2800 continue;
2801 }
e3c1b071 2802
2803 /* skip unnecessary suspend if we have not initialized them yet */
2804 if (adev->gmc.xgmi.pending_reset &&
2805 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2806 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
2807 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2808 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
2809 adev->ip_blocks[i].status.hw = false;
2810 continue;
2811 }
557f42a2 2812
32ff160d
AD
2813 /* skip suspend of gfx and psp for S0ix
2814 * gfx is in gfxoff state, so on resume it will exit gfxoff just
2815 * like at runtime. PSP is also part of the always on hardware
2816 * so no need to suspend it.
2817 */
557f42a2 2818 if (adev->in_s0ix &&
32ff160d
AD
2819 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
2820 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX))
557f42a2
AD
2821 continue;
2822
d38ceaf9 2823 /* XXX handle errors */
a1255107 2824 r = adev->ip_blocks[i].version->funcs->suspend(adev);
d38ceaf9 2825 /* XXX handle errors */
2c1a2784 2826 if (r) {
a1255107
AD
2827 DRM_ERROR("suspend of IP block <%s> failed %d\n",
2828 adev->ip_blocks[i].version->funcs->name, r);
2c1a2784 2829 }
876923fb 2830 adev->ip_blocks[i].status.hw = false;
a3a09142 2831 /* handle putting the SMC in the appropriate state */
86b93fd6
JZ
2832 if (!amdgpu_sriov_vf(adev)) {
2833 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2834 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
2835 if (r) {
2836 DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
2837 adev->mp1_state, r);
2838 return r;
2839 }
a3a09142
AD
2840 }
2841 }
d38ceaf9
AD
2842 }
2843
2844 return 0;
2845}
2846
e7854a03
AD
2847/**
2848 * amdgpu_device_ip_suspend - run suspend for hardware IPs
2849 *
2850 * @adev: amdgpu_device pointer
2851 *
2852 * Main suspend function for hardware IPs. The list of all the hardware
2853 * IPs that make up the asic is walked, clockgating is disabled and the
2854 * suspend callbacks are run. suspend puts the hardware and software state
2855 * in each IP into a state suitable for suspend.
2856 * Returns 0 on success, negative error code on failure.
2857 */
2858int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
2859{
2860 int r;
2861
3c73683c
JC
2862 if (amdgpu_sriov_vf(adev)) {
2863 amdgpu_virt_fini_data_exchange(adev);
e7819644 2864 amdgpu_virt_request_full_gpu(adev, false);
3c73683c 2865 }
e7819644 2866
e7854a03
AD
2867 r = amdgpu_device_ip_suspend_phase1(adev);
2868 if (r)
2869 return r;
2870 r = amdgpu_device_ip_suspend_phase2(adev);
2871
e7819644
YT
2872 if (amdgpu_sriov_vf(adev))
2873 amdgpu_virt_release_full_gpu(adev, false);
2874
e7854a03
AD
2875 return r;
2876}
2877
06ec9070 2878static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
a90ad3c2
ML
2879{
2880 int i, r;
2881
2cb681b6
ML
2882 static enum amd_ip_block_type ip_order[] = {
2883 AMD_IP_BLOCK_TYPE_GMC,
2884 AMD_IP_BLOCK_TYPE_COMMON,
39186aef 2885 AMD_IP_BLOCK_TYPE_PSP,
2cb681b6
ML
2886 AMD_IP_BLOCK_TYPE_IH,
2887 };
a90ad3c2 2888
95ea3dbc 2889 for (i = 0; i < adev->num_ip_blocks; i++) {
2cb681b6
ML
2890 int j;
2891 struct amdgpu_ip_block *block;
a90ad3c2 2892
4cd2a96d
J
2893 block = &adev->ip_blocks[i];
2894 block->status.hw = false;
2cb681b6 2895
4cd2a96d 2896 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
2cb681b6 2897
4cd2a96d 2898 if (block->version->type != ip_order[j] ||
2cb681b6
ML
2899 !block->status.valid)
2900 continue;
2901
2902 r = block->version->funcs->hw_init(adev);
0aaeefcc 2903 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
c41d1cf6
ML
2904 if (r)
2905 return r;
482f0e53 2906 block->status.hw = true;
a90ad3c2
ML
2907 }
2908 }
2909
2910 return 0;
2911}
2912
06ec9070 2913static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
a90ad3c2
ML
2914{
2915 int i, r;
2916
2cb681b6
ML
2917 static enum amd_ip_block_type ip_order[] = {
2918 AMD_IP_BLOCK_TYPE_SMC,
2919 AMD_IP_BLOCK_TYPE_DCE,
2920 AMD_IP_BLOCK_TYPE_GFX,
2921 AMD_IP_BLOCK_TYPE_SDMA,
257deb8c 2922 AMD_IP_BLOCK_TYPE_UVD,
d83c7a07
JJ
2923 AMD_IP_BLOCK_TYPE_VCE,
2924 AMD_IP_BLOCK_TYPE_VCN
2cb681b6 2925 };
a90ad3c2 2926
2cb681b6
ML
2927 for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2928 int j;
2929 struct amdgpu_ip_block *block;
a90ad3c2 2930
2cb681b6
ML
2931 for (j = 0; j < adev->num_ip_blocks; j++) {
2932 block = &adev->ip_blocks[j];
2933
2934 if (block->version->type != ip_order[i] ||
482f0e53
ML
2935 !block->status.valid ||
2936 block->status.hw)
2cb681b6
ML
2937 continue;
2938
895bd048
JZ
2939 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
2940 r = block->version->funcs->resume(adev);
2941 else
2942 r = block->version->funcs->hw_init(adev);
2943
0aaeefcc 2944 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
c41d1cf6
ML
2945 if (r)
2946 return r;
482f0e53 2947 block->status.hw = true;
a90ad3c2
ML
2948 }
2949 }
2950
2951 return 0;
2952}
2953
e3ecdffa
AD
2954/**
2955 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
2956 *
2957 * @adev: amdgpu_device pointer
2958 *
2959 * First resume function for hardware IPs. The list of all the hardware
2960 * IPs that make up the asic is walked and the resume callbacks are run for
2961 * COMMON, GMC, and IH. resume puts the hardware into a functional state
2962 * after a suspend and updates the software state as necessary. This
2963 * function is also used for restoring the GPU after a GPU reset.
2964 * Returns 0 on success, negative error code on failure.
2965 */
06ec9070 2966static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
d38ceaf9
AD
2967{
2968 int i, r;
2969
a90ad3c2 2970 for (i = 0; i < adev->num_ip_blocks; i++) {
482f0e53 2971 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
a90ad3c2 2972 continue;
a90ad3c2 2973 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
e3ecdffa
AD
2974 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2975 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
482f0e53 2976
fcf0649f
CZ
2977 r = adev->ip_blocks[i].version->funcs->resume(adev);
2978 if (r) {
2979 DRM_ERROR("resume of IP block <%s> failed %d\n",
2980 adev->ip_blocks[i].version->funcs->name, r);
2981 return r;
2982 }
482f0e53 2983 adev->ip_blocks[i].status.hw = true;
a90ad3c2
ML
2984 }
2985 }
2986
2987 return 0;
2988}
2989
e3ecdffa
AD
2990/**
2991 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
2992 *
2993 * @adev: amdgpu_device pointer
2994 *
2995 * Second resume function for hardware IPs. The list of all the hardware
2996 * IPs that make up the asic is walked and the resume callbacks are run for
2997 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a
2998 * functional state after a suspend and updates the software state as
2999 * necessary. This function is also used for restoring the GPU after a GPU
3000 * reset.
3001 * Returns 0 on success, negative error code on failure.
3002 */
06ec9070 3003static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
d38ceaf9
AD
3004{
3005 int i, r;
3006
3007 for (i = 0; i < adev->num_ip_blocks; i++) {
482f0e53 3008 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
d38ceaf9 3009 continue;
fcf0649f 3010 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
e3ecdffa 3011 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
7a3e0bb2
RZ
3012 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3013 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
fcf0649f 3014 continue;
a1255107 3015 r = adev->ip_blocks[i].version->funcs->resume(adev);
2c1a2784 3016 if (r) {
a1255107
AD
3017 DRM_ERROR("resume of IP block <%s> failed %d\n",
3018 adev->ip_blocks[i].version->funcs->name, r);
d38ceaf9 3019 return r;
2c1a2784 3020 }
482f0e53 3021 adev->ip_blocks[i].status.hw = true;
d38ceaf9
AD
3022 }
3023
3024 return 0;
3025}
3026
e3ecdffa
AD
3027/**
3028 * amdgpu_device_ip_resume - run resume for hardware IPs
3029 *
3030 * @adev: amdgpu_device pointer
3031 *
3032 * Main resume function for hardware IPs. The hardware IPs
3033 * are split into two resume functions because they are
3034 * also used in recovering from a GPU reset and some additional
3035 * steps need to be taken between them. In this case (S3/S4) they are
3036 * run sequentially.
3037 * Returns 0 on success, negative error code on failure.
3038 */
06ec9070 3039static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
fcf0649f
CZ
3040{
3041 int r;
3042
06ec9070 3043 r = amdgpu_device_ip_resume_phase1(adev);
fcf0649f
CZ
3044 if (r)
3045 return r;
7a3e0bb2
RZ
3046
3047 r = amdgpu_device_fw_loading(adev);
3048 if (r)
3049 return r;
3050
06ec9070 3051 r = amdgpu_device_ip_resume_phase2(adev);
fcf0649f
CZ
3052
3053 return r;
3054}
3055
e3ecdffa
AD
3056/**
3057 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
3058 *
3059 * @adev: amdgpu_device pointer
3060 *
3061 * Query the VBIOS data tables to determine if the board supports SR-IOV.
3062 */
4e99a44e 3063static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
048765ad 3064{
6867e1b5
ML
3065 if (amdgpu_sriov_vf(adev)) {
3066 if (adev->is_atom_fw) {
58ff791a 3067 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
6867e1b5
ML
3068 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3069 } else {
3070 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3071 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3072 }
3073
3074 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3075 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
a5bde2f9 3076 }
048765ad
AR
3077}
3078
e3ecdffa
AD
3079/**
3080 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3081 *
3082 * @asic_type: AMD asic type
3083 *
3084 * Check if there is DC (new modesetting infrastructure) support for an asic.
3085 * Returns true if DC has support, false if not.
3086 */
4562236b
HW
3087bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3088{
3089 switch (asic_type) {
3090#if defined(CONFIG_DRM_AMD_DC)
64200c46
MR
3091#if defined(CONFIG_DRM_AMD_DC_SI)
3092 case CHIP_TAHITI:
3093 case CHIP_PITCAIRN:
3094 case CHIP_VERDE:
3095 case CHIP_OLAND:
3096#endif
4562236b 3097 case CHIP_BONAIRE:
0d6fbccb 3098 case CHIP_KAVERI:
367e6687
AD
3099 case CHIP_KABINI:
3100 case CHIP_MULLINS:
d9fda248
HW
3101 /*
3102 * We have systems in the wild with these ASICs that require
3103 * LVDS and VGA support, which DC does not provide.
3104 *
3105 * Fallback to the non-DC driver here by default so as not to
3106 * cause regressions.
3107 */
3108 return amdgpu_dc > 0;
3109 case CHIP_HAWAII:
4562236b
HW
3110 case CHIP_CARRIZO:
3111 case CHIP_STONEY:
4562236b 3112 case CHIP_POLARIS10:
675fd32b 3113 case CHIP_POLARIS11:
2c8ad2d5 3114 case CHIP_POLARIS12:
675fd32b 3115 case CHIP_VEGAM:
4562236b
HW
3116 case CHIP_TONGA:
3117 case CHIP_FIJI:
42f8ffa1 3118 case CHIP_VEGA10:
dca7b401 3119 case CHIP_VEGA12:
c6034aa2 3120 case CHIP_VEGA20:
b86a1aa3 3121#if defined(CONFIG_DRM_AMD_DC_DCN)
fd187853 3122 case CHIP_RAVEN:
b4f199c7 3123 case CHIP_NAVI10:
8fceceb6 3124 case CHIP_NAVI14:
078655d9 3125 case CHIP_NAVI12:
e1c14c43 3126 case CHIP_RENOIR:
81d9bfb8 3127 case CHIP_SIENNA_CICHLID:
a6c5308f 3128 case CHIP_NAVY_FLOUNDER:
7cc656e2 3129 case CHIP_DIMGREY_CAVEFISH:
ddaed58b 3130 case CHIP_BEIGE_GOBY:
84b934bc 3131 case CHIP_VANGOGH:
42f8ffa1 3132#endif
fd187853 3133 return amdgpu_dc != 0;
4562236b
HW
3134#endif
3135 default:
93b09a9a 3136 if (amdgpu_dc > 0)
044a48f4 3137 DRM_INFO_ONCE("Display Core has been requested via kernel parameter "
93b09a9a 3138 "but isn't supported by ASIC, ignoring\n");
4562236b
HW
3139 return false;
3140 }
3141}
3142
3143/**
3144 * amdgpu_device_has_dc_support - check if dc is supported
3145 *
982a820b 3146 * @adev: amdgpu_device pointer
4562236b
HW
3147 *
3148 * Returns true for supported, false for not supported
3149 */
3150bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3151{
c997e8e2 3152 if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display)
2555039d
XY
3153 return false;
3154
4562236b
HW
3155 return amdgpu_device_asic_has_dc_support(adev->asic_type);
3156}
3157
d4535e2c
AG
3158static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3159{
3160 struct amdgpu_device *adev =
3161 container_of(__work, struct amdgpu_device, xgmi_reset_work);
d95e8e97 3162 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
d4535e2c 3163
c6a6e2db
AG
3164 /* It's a bug to not have a hive within this function */
3165 if (WARN_ON(!hive))
3166 return;
3167
3168 /*
3169 * Use task barrier to synchronize all xgmi reset works across the
3170 * hive. task_barrier_enter and task_barrier_exit will block
3171 * until all the threads running the xgmi reset works reach
3172 * those points. task_barrier_full will do both blocks.
3173 */
3174 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3175
3176 task_barrier_enter(&hive->tb);
4a580877 3177 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
c6a6e2db
AG
3178
3179 if (adev->asic_reset_res)
3180 goto fail;
3181
3182 task_barrier_exit(&hive->tb);
4a580877 3183 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
c6a6e2db
AG
3184
3185 if (adev->asic_reset_res)
3186 goto fail;
43c4d576 3187
8bc7b360
HZ
3188 if (adev->mmhub.ras_funcs &&
3189 adev->mmhub.ras_funcs->reset_ras_error_count)
3190 adev->mmhub.ras_funcs->reset_ras_error_count(adev);
c6a6e2db
AG
3191 } else {
3192
3193 task_barrier_full(&hive->tb);
3194 adev->asic_reset_res = amdgpu_asic_reset(adev);
3195 }
ce316fa5 3196
c6a6e2db 3197fail:
d4535e2c 3198 if (adev->asic_reset_res)
fed184e9 3199 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
4a580877 3200 adev->asic_reset_res, adev_to_drm(adev)->unique);
d95e8e97 3201 amdgpu_put_xgmi_hive(hive);
d4535e2c
AG
3202}
3203
71f98027
AD
3204static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3205{
3206 char *input = amdgpu_lockup_timeout;
3207 char *timeout_setting = NULL;
3208 int index = 0;
3209 long timeout;
3210 int ret = 0;
3211
3212 /*
67387dfe
AD
3213 * By default the timeout for non-compute jobs is 10000
3214 * and 60000 for compute jobs.
71f98027 3215 * In SR-IOV or passthrough mode, the timeout for compute
b7b2a316 3216 * jobs is 60000 by default.
71f98027
AD
3217 */
3218 adev->gfx_timeout = msecs_to_jiffies(10000);
3219 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
9882e278
ED
3220 if (amdgpu_sriov_vf(adev))
3221 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
3222 msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
71f98027 3223 else
67387dfe 3224 adev->compute_timeout = msecs_to_jiffies(60000);
71f98027 3225
f440ff44 3226 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
71f98027 3227 while ((timeout_setting = strsep(&input, ",")) &&
f440ff44 3228 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
71f98027
AD
3229 ret = kstrtol(timeout_setting, 0, &timeout);
3230 if (ret)
3231 return ret;
3232
3233 if (timeout == 0) {
3234 index++;
3235 continue;
3236 } else if (timeout < 0) {
3237 timeout = MAX_SCHEDULE_TIMEOUT;
3238 } else {
3239 timeout = msecs_to_jiffies(timeout);
3240 }
3241
3242 switch (index++) {
3243 case 0:
3244 adev->gfx_timeout = timeout;
3245 break;
3246 case 1:
3247 adev->compute_timeout = timeout;
3248 break;
3249 case 2:
3250 adev->sdma_timeout = timeout;
3251 break;
3252 case 3:
3253 adev->video_timeout = timeout;
3254 break;
3255 default:
3256 break;
3257 }
3258 }
3259 /*
3260 * There is only one value specified and
3261 * it should apply to all non-compute jobs.
3262 */
bcccee89 3263 if (index == 1) {
71f98027 3264 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
bcccee89
ED
3265 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3266 adev->compute_timeout = adev->gfx_timeout;
3267 }
71f98027
AD
3268 }
3269
3270 return ret;
3271}
d4535e2c 3272
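/*
 * Example values for the lockup_timeout module parameter, as inferred from
 * the parser above. Up to four comma separated values are read in the order
 * gfx,compute,sdma,video (in milliseconds); 0 keeps the default and a
 * negative value selects MAX_SCHEDULE_TIMEOUT:
 *
 *   modprobe amdgpu lockup_timeout=10000,60000,10000,10000
 *   modprobe amdgpu lockup_timeout=5000
 *       (a single value applies to all non-compute job types)
 */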
77f3a5cd
ND
3273static const struct attribute *amdgpu_dev_attributes[] = {
3274 &dev_attr_product_name.attr,
3275 &dev_attr_product_number.attr,
3276 &dev_attr_serial_number.attr,
3277 &dev_attr_pcie_replay_count.attr,
3278 NULL
3279};
3280
d38ceaf9
AD
3281/**
3282 * amdgpu_device_init - initialize the driver
3283 *
3284 * @adev: amdgpu_device pointer
d38ceaf9
AD
3285 * @flags: driver flags
3286 *
3287 * Initializes the driver info and hw (all asics).
3288 * Returns 0 for success or an error on failure.
3289 * Called at driver startup.
3290 */
3291int amdgpu_device_init(struct amdgpu_device *adev,
d38ceaf9
AD
3292 uint32_t flags)
3293{
8aba21b7
LT
3294 struct drm_device *ddev = adev_to_drm(adev);
3295 struct pci_dev *pdev = adev->pdev;
d38ceaf9 3296 int r, i;
b98c6299 3297 bool px = false;
95844d20 3298 u32 max_MBps;
d38ceaf9
AD
3299
3300 adev->shutdown = false;
d38ceaf9 3301 adev->flags = flags;
4e66d7d2
YZ
3302
3303 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3304 adev->asic_type = amdgpu_force_asic_type;
3305 else
3306 adev->asic_type = flags & AMD_ASIC_MASK;
3307
d38ceaf9 3308 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
593aa2d2 3309 if (amdgpu_emu_mode == 1)
8bdab6bb 3310 adev->usec_timeout *= 10;
770d13b1 3311 adev->gmc.gart_size = 512 * 1024 * 1024;
d38ceaf9
AD
3312 adev->accel_working = false;
3313 adev->num_rings = 0;
3314 adev->mman.buffer_funcs = NULL;
3315 adev->mman.buffer_funcs_ring = NULL;
3316 adev->vm_manager.vm_pte_funcs = NULL;
0c88b430 3317 adev->vm_manager.vm_pte_num_scheds = 0;
132f34e4 3318 adev->gmc.gmc_funcs = NULL;
7bd939d0 3319 adev->harvest_ip_mask = 0x0;
f54d1867 3320 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
b8866c26 3321 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
d38ceaf9
AD
3322
3323 adev->smc_rreg = &amdgpu_invalid_rreg;
3324 adev->smc_wreg = &amdgpu_invalid_wreg;
3325 adev->pcie_rreg = &amdgpu_invalid_rreg;
3326 adev->pcie_wreg = &amdgpu_invalid_wreg;
36b9a952
HR
3327 adev->pciep_rreg = &amdgpu_invalid_rreg;
3328 adev->pciep_wreg = &amdgpu_invalid_wreg;
4fa1c6a6
TZ
3329 adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3330 adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
d38ceaf9
AD
3331 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3332 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3333 adev->didt_rreg = &amdgpu_invalid_rreg;
3334 adev->didt_wreg = &amdgpu_invalid_wreg;
ccdbb20a
RZ
3335 adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3336 adev->gc_cac_wreg = &amdgpu_invalid_wreg;
d38ceaf9
AD
3337 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3338 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3339
3e39ab90
AD
3340 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3341 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3342 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
d38ceaf9
AD
3343
3344 /* mutex initializations are all done here so we
3345 * can call these functions again without locking issues */
0e5ca0d1 3346 mutex_init(&adev->firmware.mutex);
d38ceaf9
AD
3347 mutex_init(&adev->pm.mutex);
3348 mutex_init(&adev->gfx.gpu_clock_mutex);
3349 mutex_init(&adev->srbm_mutex);
b8866c26 3350 mutex_init(&adev->gfx.pipe_reserve_mutex);
d23ee13f 3351 mutex_init(&adev->gfx.gfx_off_mutex);
d38ceaf9 3352 mutex_init(&adev->grbm_idx_mutex);
d38ceaf9 3353 mutex_init(&adev->mn_lock);
e23b74aa 3354 mutex_init(&adev->virt.vf_errors.lock);
d38ceaf9 3355 hash_init(adev->mn_hash);
53b3f8f4 3356 atomic_set(&adev->in_gpu_reset, 0);
6049db43 3357 init_rwsem(&adev->reset_sem);
32eaeae0 3358 mutex_init(&adev->psp.mutex);
bd052211 3359 mutex_init(&adev->notifier_lock);
d38ceaf9 3360
912dfc84
EQ
3361 r = amdgpu_device_check_arguments(adev);
3362 if (r)
3363 return r;
d38ceaf9 3364
d38ceaf9
AD
3365 spin_lock_init(&adev->mmio_idx_lock);
3366 spin_lock_init(&adev->smc_idx_lock);
3367 spin_lock_init(&adev->pcie_idx_lock);
3368 spin_lock_init(&adev->uvd_ctx_idx_lock);
3369 spin_lock_init(&adev->didt_idx_lock);
ccdbb20a 3370 spin_lock_init(&adev->gc_cac_idx_lock);
16abb5d2 3371 spin_lock_init(&adev->se_cac_idx_lock);
d38ceaf9 3372 spin_lock_init(&adev->audio_endpt_idx_lock);
95844d20 3373 spin_lock_init(&adev->mm_stats.lock);
d38ceaf9 3374
0c4e7fa5
CZ
3375 INIT_LIST_HEAD(&adev->shadow_list);
3376 mutex_init(&adev->shadow_list_lock);
3377
655ce9cb 3378 INIT_LIST_HEAD(&adev->reset_list);
3379
beff74bc
AD
3380 INIT_DELAYED_WORK(&adev->delayed_init_work,
3381 amdgpu_device_delayed_init_work_handler);
1e317b99
RZ
3382 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3383 amdgpu_device_delay_enable_gfx_off);
2dc80b00 3384
d4535e2c
AG
3385 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3386
d23ee13f 3387 adev->gfx.gfx_off_req_count = 1;
b6e79d9a 3388 adev->pm.ac_power = power_supply_is_system_supplied() > 0;
b1ddf548 3389
b265bdbd
EQ
3390 atomic_set(&adev->throttling_logging_enabled, 1);
3391 /*
3392 * If throttling continues, logging will be performed every minute
3393 * to avoid log flooding. "-1" is subtracted since the thermal
3394 * throttling interrupt comes every second. Thus, the total logging
3395	 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3396 * for throttling interrupt) = 60 seconds.
3397 */
3398 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3399 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3400
0fa49558
AX
3401 /* Registers mapping */
3402 /* TODO: block userspace mapping of io register */
da69c161
KW
3403 if (adev->asic_type >= CHIP_BONAIRE) {
3404 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3405 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3406 } else {
3407 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3408 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3409 }
d38ceaf9 3410
d38ceaf9
AD
3411 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3412 if (adev->rmmio == NULL) {
3413 return -ENOMEM;
3414 }
3415 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3416 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3417
b2109d8e
JX
3418 /* enable PCIE atomic ops */
3419 r = pci_enable_atomic_ops_to_root(adev->pdev,
3420 PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3421 PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3422 if (r) {
3423 adev->have_atomics_support = false;
3424 DRM_INFO("PCIE atomic ops is not supported\n");
3425 } else {
3426 adev->have_atomics_support = true;
3427 }
3428
5494d864
AD
3429 amdgpu_device_get_pcie_info(adev);
3430
b239c017
JX
3431 if (amdgpu_mcbp)
3432 DRM_INFO("MCBP is enabled\n");
3433
5f84cc63
JX
3434 if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
3435 adev->enable_mes = true;
3436
3aa0115d
ML
3437 /* detect hw virtualization here */
3438 amdgpu_detect_virtualization(adev);
3439
dffa11b4
ML
3440 r = amdgpu_device_get_job_timeout_settings(adev);
3441 if (r) {
3442 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
4192f7b5 3443 goto failed_unmap;
a190d1c7
XY
3444 }
3445
d38ceaf9 3446 /* early init functions */
06ec9070 3447 r = amdgpu_device_ip_early_init(adev);
d38ceaf9 3448 if (r)
4192f7b5 3449 goto failed_unmap;
d38ceaf9 3450
6585661d
OZ
3451 /* doorbell bar mapping and doorbell index init*/
3452 amdgpu_device_doorbell_init(adev);
3453
9475a943
SL
3454 if (amdgpu_emu_mode == 1) {
3455 /* post the asic on emulation mode */
3456 emu_soc_asic_init(adev);
bfca0289 3457 goto fence_driver_init;
9475a943 3458 }
bfca0289 3459
04442bf7
LL
3460 amdgpu_reset_init(adev);
3461
4e99a44e
ML
3462 /* detect if we are with an SRIOV vbios */
3463 amdgpu_device_detect_sriov_bios(adev);
048765ad 3464
95e8e59e
AD
3465 /* check if we need to reset the asic
3466 * E.g., driver was not cleanly unloaded previously, etc.
3467 */
f14899fd 3468 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
e3c1b071 3469 if (adev->gmc.xgmi.num_physical_nodes) {
3470 dev_info(adev->dev, "Pending hive reset.\n");
3471 adev->gmc.xgmi.pending_reset = true;
3472 /* Only need to init necessary block for SMU to handle the reset */
3473 for (i = 0; i < adev->num_ip_blocks; i++) {
3474 if (!adev->ip_blocks[i].status.valid)
3475 continue;
3476 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3477 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3478 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3479 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
751f43e7 3480 DRM_DEBUG("IP %s disabled for hw_init.\n",
e3c1b071 3481 adev->ip_blocks[i].version->funcs->name);
3482 adev->ip_blocks[i].status.hw = true;
3483 }
3484 }
3485 } else {
3486 r = amdgpu_asic_reset(adev);
3487 if (r) {
3488 dev_err(adev->dev, "asic reset on init failed\n");
3489 goto failed;
3490 }
95e8e59e
AD
3491 }
3492 }
3493
8f66090b 3494 pci_enable_pcie_error_reporting(adev->pdev);
c9a6b82f 3495
d38ceaf9 3496 /* Post card if necessary */
39c640c0 3497 if (amdgpu_device_need_post(adev)) {
d38ceaf9 3498 if (!adev->bios) {
bec86378 3499 dev_err(adev->dev, "no vBIOS found\n");
83ba126a
AD
3500 r = -EINVAL;
3501 goto failed;
d38ceaf9 3502 }
bec86378 3503 DRM_INFO("GPU posting now...\n");
4d2997ab 3504 r = amdgpu_device_asic_init(adev);
4e99a44e
ML
3505 if (r) {
3506 dev_err(adev->dev, "gpu post error!\n");
3507 goto failed;
3508 }
d38ceaf9
AD
3509 }
3510
88b64e95
AD
3511 if (adev->is_atom_fw) {
3512 /* Initialize clocks */
3513 r = amdgpu_atomfirmware_get_clock_info(adev);
3514 if (r) {
3515 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
e23b74aa 3516 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
88b64e95
AD
3517 goto failed;
3518 }
3519 } else {
a5bde2f9
AD
3520 /* Initialize clocks */
3521 r = amdgpu_atombios_get_clock_info(adev);
3522 if (r) {
3523 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
e23b74aa 3524 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
89041940 3525 goto failed;
a5bde2f9
AD
3526 }
3527 /* init i2c buses */
4562236b
HW
3528 if (!amdgpu_device_has_dc_support(adev))
3529 amdgpu_atombios_i2c_init(adev);
2c1a2784 3530 }
d38ceaf9 3531
bfca0289 3532fence_driver_init:
d38ceaf9
AD
3533 /* Fence driver */
3534 r = amdgpu_fence_driver_init(adev);
2c1a2784
AD
3535 if (r) {
3536 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
e23b74aa 3537 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
83ba126a 3538 goto failed;
2c1a2784 3539 }
d38ceaf9
AD
3540
3541 /* init the mode config */
4a580877 3542 drm_mode_config_init(adev_to_drm(adev));
d38ceaf9 3543
06ec9070 3544 r = amdgpu_device_ip_init(adev);
d38ceaf9 3545 if (r) {
8840a387 3546 /* failed in exclusive mode due to timeout */
3547 if (amdgpu_sriov_vf(adev) &&
3548 !amdgpu_sriov_runtime(adev) &&
3549 amdgpu_virt_mmio_blocked(adev) &&
3550 !amdgpu_virt_wait_reset(adev)) {
3551 dev_err(adev->dev, "VF exclusive mode timeout\n");
1daee8b4
PD
3552 /* Don't send request since VF is inactive. */
3553 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3554 adev->virt.ops = NULL;
8840a387 3555 r = -EAGAIN;
970fd197 3556 goto release_ras_con;
8840a387 3557 }
06ec9070 3558 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
e23b74aa 3559 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
970fd197 3560 goto release_ras_con;
d38ceaf9
AD
3561 }
3562
d69b8971
YZ
3563 dev_info(adev->dev,
3564 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
d7f72fe4
YZ
3565 adev->gfx.config.max_shader_engines,
3566 adev->gfx.config.max_sh_per_se,
3567 adev->gfx.config.max_cu_per_sh,
3568 adev->gfx.cu_info.number);
3569
d38ceaf9
AD
3570 adev->accel_working = true;
3571
e59c0205
AX
3572 amdgpu_vm_check_compute_bug(adev);
3573
95844d20
MO
3574 /* Initialize the buffer migration limit. */
3575 if (amdgpu_moverate >= 0)
3576 max_MBps = amdgpu_moverate;
3577 else
3578 max_MBps = 8; /* Allow 8 MB/s. */
3579 /* Get a log2 for easy divisions. */
3580 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
3581
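	/*
	 * Illustrative note: keeping the limit as a log2 lets throttling math
	 * use a shift instead of a divide.  Because 1 MB/s is roughly 1 byte
	 * per microsecond, a consumer of mm_stats could approximate a byte
	 * budget over elapsed_us microseconds (hypothetical variable) as:
	 *
	 *	u64 budget = (u64)elapsed_us << adev->mm_stats.log2_max_MBps;
	 */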
9bc92b9c
ML
3582 amdgpu_fbdev_init(adev);
3583
d2f52ac8 3584 r = amdgpu_pm_sysfs_init(adev);
7c868b59
YT
3585 if (r) {
3586 adev->pm_sysfs_en = false;
d2f52ac8 3587 DRM_ERROR("registering pm debugfs failed (%d).\n", r);
7c868b59
YT
3588 } else
3589 adev->pm_sysfs_en = true;
d2f52ac8 3590
5bb23532 3591 r = amdgpu_ucode_sysfs_init(adev);
7c868b59
YT
3592 if (r) {
3593 adev->ucode_sysfs_en = false;
5bb23532 3594 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
7c868b59
YT
3595 } else
3596 adev->ucode_sysfs_en = true;
5bb23532 3597
d38ceaf9
AD
3598 if ((amdgpu_testing & 1)) {
3599 if (adev->accel_working)
3600 amdgpu_test_moves(adev);
3601 else
3602 DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");
3603 }
d38ceaf9
AD
3604 if (amdgpu_benchmarking) {
3605 if (adev->accel_working)
3606 amdgpu_benchmark(adev, amdgpu_benchmarking);
3607 else
3608 DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
3609 }
3610
b0adca4d
EQ
3611 /*
3612 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3613	 * Otherwise the mgpu fan boost feature will be skipped because the
3614	 * gpu instance count would be too low.
3615 */
3616 amdgpu_register_gpu_instance(adev);
3617
d38ceaf9
AD
3618 /* enable clockgating, etc. after ib tests, etc. since some blocks require
3619 * explicit gating rather than handling it automatically.
3620 */
e3c1b071 3621 if (!adev->gmc.xgmi.pending_reset) {
3622 r = amdgpu_device_ip_late_init(adev);
3623 if (r) {
3624 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3625 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
970fd197 3626 goto release_ras_con;
e3c1b071 3627 }
3628 /* must succeed. */
3629 amdgpu_ras_resume(adev);
3630 queue_delayed_work(system_wq, &adev->delayed_init_work,
3631 msecs_to_jiffies(AMDGPU_RESUME_MS));
2c1a2784 3632 }
d38ceaf9 3633
2c738637
ML
3634 if (amdgpu_sriov_vf(adev))
3635 flush_delayed_work(&adev->delayed_init_work);
3636
77f3a5cd 3637 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
5aea5327 3638 if (r)
77f3a5cd 3639 dev_err(adev->dev, "Could not create amdgpu device attr\n");
bd607166 3640
d155bef0
AB
3641 if (IS_ENABLED(CONFIG_PERF_EVENTS))
3642 r = amdgpu_pmu_init(adev);
9c7c85f7
JK
3643 if (r)
3644 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3645
c1dd4aa6
AG
3646	/* Keep the stored PCI config space at hand to restore after a sudden PCI error */
3647 if (amdgpu_device_cache_pci_state(adev->pdev))
3648 pci_restore_state(pdev);
3649
8c3dd61c
KHF
3650 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
3651 /* this will fail for cards that aren't VGA class devices, just
3652 * ignore it */
3653 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
3654 vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode);
3655
3656 if (amdgpu_device_supports_px(ddev)) {
3657 px = true;
3658 vga_switcheroo_register_client(adev->pdev,
3659 &amdgpu_switcheroo_ops, px);
3660 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3661 }
3662
e3c1b071 3663 if (adev->gmc.xgmi.pending_reset)
3664 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
3665 msecs_to_jiffies(AMDGPU_RESUME_MS));
3666
d38ceaf9 3667 return 0;
83ba126a 3668
970fd197
SY
3669release_ras_con:
3670 amdgpu_release_ras_context(adev);
3671
83ba126a 3672failed:
89041940 3673 amdgpu_vf_error_trans_all(adev);
8840a387 3674
4192f7b5
AD
3675failed_unmap:
3676 iounmap(adev->rmmio);
3677 adev->rmmio = NULL;
3678
83ba126a 3679 return r;
d38ceaf9
AD
3680}
3681
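/*
 * Illustrative sketch of how the entry points in this file pair up from the
 * probe/remove path (the exact callers live in amdgpu_drv.c/amdgpu_kms.c and
 * may differ in detail):
 *
 *	r = amdgpu_device_init(adev, flags);
 *	if (r)
 *		return r;
 *	...
 *	amdgpu_device_fini_hw(adev);
 *	amdgpu_device_fini_sw(adev);
 *
 * amdgpu_device_init() may return -EAGAIN for an SR-IOV VF whose exclusive
 * mode timed out (see the release_ras_con path above).
 */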
07775fc1
AG
3682static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
3683{
3684 /* Clear all CPU mappings pointing to this device */
3685 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
3686
3687 /* Unmap all mapped bars - Doorbell, registers and VRAM */
3688 amdgpu_device_doorbell_fini(adev);
3689
3690 iounmap(adev->rmmio);
3691 adev->rmmio = NULL;
3692 if (adev->mman.aper_base_kaddr)
3693 iounmap(adev->mman.aper_base_kaddr);
3694 adev->mman.aper_base_kaddr = NULL;
3695
3696 /* Memory manager related */
3697 if (!adev->gmc.xgmi.connected_to_cpu) {
3698 arch_phys_wc_del(adev->gmc.vram_mtrr);
3699 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
3700 }
3701}
3702
d38ceaf9
AD
3703/**
3704 * amdgpu_device_fini - tear down the driver
3705 *
3706 * @adev: amdgpu_device pointer
3707 *
3708 * Tear down the driver info (all asics).
3709 * Called at driver shutdown.
3710 */
72c8c97b 3711void amdgpu_device_fini_hw(struct amdgpu_device *adev)
d38ceaf9 3712{
aac89168 3713 dev_info(adev->dev, "amdgpu: finishing device.\n");
9f875167 3714 flush_delayed_work(&adev->delayed_init_work);
bb0cd09b 3715 ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
d0d13fe8 3716 adev->shutdown = true;
9f875167 3717
752c683d
ML
3718	/* make sure IB tests are finished before entering exclusive mode
3719	 * to avoid preemption on IB tests
3720	 */
519b8b76 3721 if (amdgpu_sriov_vf(adev)) {
752c683d 3722 amdgpu_virt_request_full_gpu(adev, false);
519b8b76
BZ
3723 amdgpu_virt_fini_data_exchange(adev);
3724 }
752c683d 3725
e5b03032
ML
3726 /* disable all interrupts */
3727 amdgpu_irq_disable_all(adev);
ff97cba8
ML
3728 if (adev->mode_info.mode_config_initialized){
3729 if (!amdgpu_device_has_dc_support(adev))
4a580877 3730 drm_helper_force_disable_all(adev_to_drm(adev));
ff97cba8 3731 else
4a580877 3732 drm_atomic_helper_shutdown(adev_to_drm(adev));
ff97cba8 3733 }
72c8c97b
AG
3734 amdgpu_fence_driver_fini_hw(adev);
3735
7c868b59
YT
3736 if (adev->pm_sysfs_en)
3737 amdgpu_pm_sysfs_fini(adev);
72c8c97b
AG
3738 if (adev->ucode_sysfs_en)
3739 amdgpu_ucode_sysfs_fini(adev);
3740 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
3741
d38ceaf9 3742 amdgpu_fbdev_fini(adev);
72c8c97b
AG
3743
3744 amdgpu_irq_fini_hw(adev);
e9669fb7
AG
3745
3746 amdgpu_device_ip_fini_early(adev);
d10d0daa
AG
3747
3748 amdgpu_gart_dummy_page_fini(adev);
07775fc1
AG
3749
3750 amdgpu_device_unmap_mmio(adev);
72c8c97b
AG
3751}
3752
3753void amdgpu_device_fini_sw(struct amdgpu_device *adev)
3754{
e230ac11 3755 amdgpu_device_ip_fini(adev);
72c8c97b 3756 amdgpu_fence_driver_fini_sw(adev);
75e1658e
ND
3757 release_firmware(adev->firmware.gpu_info_fw);
3758 adev->firmware.gpu_info_fw = NULL;
d38ceaf9 3759 adev->accel_working = false;
04442bf7
LL
3760
3761 amdgpu_reset_fini(adev);
3762
d38ceaf9 3763 /* free i2c buses */
4562236b
HW
3764 if (!amdgpu_device_has_dc_support(adev))
3765 amdgpu_i2c_fini(adev);
bfca0289
SL
3766
3767 if (amdgpu_emu_mode != 1)
3768 amdgpu_atombios_fini(adev);
3769
d38ceaf9
AD
3770 kfree(adev->bios);
3771 adev->bios = NULL;
b98c6299 3772 if (amdgpu_device_supports_px(adev_to_drm(adev))) {
84c8b22e 3773 vga_switcheroo_unregister_client(adev->pdev);
83ba126a 3774 vga_switcheroo_fini_domain_pm_ops(adev->dev);
b98c6299 3775 }
38d6be81
AD
3776 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
3777 vga_client_register(adev->pdev, NULL, NULL, NULL);
e9bc1bf7 3778
d155bef0
AB
3779 if (IS_ENABLED(CONFIG_PERF_EVENTS))
3780 amdgpu_pmu_fini(adev);
72de33f8 3781 if (adev->mman.discovery_bin)
a190d1c7 3782 amdgpu_discovery_fini(adev);
72c8c97b
AG
3783
3784 kfree(adev->pci_state);
3785
d38ceaf9
AD
3786}
3787
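/*
 * The teardown is split so that amdgpu_device_fini_hw() can run while the
 * PCI device is still reachable: it disables interrupts and has
 * amdgpu_device_unmap_mmio() revoke userspace CPU mappings through
 * unmap_mapping_range().  amdgpu_device_fini_sw() only frees software state
 * and can therefore run later, e.g. once the last drm_device reference is
 * dropped (the exact call sites live in amdgpu_drv.c/amdgpu_kms.c).
 */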
3788
3789/*
3790 * Suspend & resume.
3791 */
3792/**
810ddc3a 3793 * amdgpu_device_suspend - initiate device suspend
d38ceaf9 3794 *
87e3f136 3795 * @dev: drm dev pointer
87e3f136 3796 * @fbcon : notify the fbdev of suspend
d38ceaf9
AD
3797 *
3798 * Puts the hw in the suspend state (all asics).
3799 * Returns 0 for success or an error on failure.
3800 * Called at driver suspend.
3801 */
de185019 3802int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
d38ceaf9 3803{
a2e15b0e 3804 struct amdgpu_device *adev = drm_to_adev(dev);
d38ceaf9 3805
d38ceaf9
AD
3806 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3807 return 0;
3808
44779b43 3809 adev->in_suspend = true;
d38ceaf9
AD
3810 drm_kms_helper_poll_disable(dev);
3811
5f818173
S
3812 if (fbcon)
3813 amdgpu_fbdev_set_suspend(adev, 1);
3814
beff74bc 3815 cancel_delayed_work_sync(&adev->delayed_init_work);
a5459475 3816
5e6932fe 3817 amdgpu_ras_suspend(adev);
3818
2196927b 3819 amdgpu_device_ip_suspend_phase1(adev);
fe1053b7 3820
5d3a2d95
AD
3821 if (!adev->in_s0ix)
3822 amdgpu_amdkfd_suspend(adev, adev->in_runpm);
94fa5660 3823
d38ceaf9
AD
3824 /* evict vram memory */
3825 amdgpu_bo_evict_vram(adev);
3826
5ceb54c6 3827 amdgpu_fence_driver_suspend(adev);
d38ceaf9 3828
2196927b 3829 amdgpu_device_ip_suspend_phase2(adev);
a0a71e49
AD
3830 /* evict remaining vram memory
3831 * This second call to evict vram is to evict the gart page table
3832 * using the CPU.
3833 */
d38ceaf9
AD
3834 amdgpu_bo_evict_vram(adev);
3835
d38ceaf9
AD
3836 return 0;
3837}
3838
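/*
 * Illustrative sketch of a PM callback driving the suspend/resume pair in
 * this file (the real handlers live in amdgpu_drv.c and may differ in
 * detail; the function name below is hypothetical):
 *
 *	static int amdgpu_pmops_suspend_example(struct device *dev)
 *	{
 *		struct drm_device *drm_dev = dev_get_drvdata(dev);
 *
 *		return amdgpu_device_suspend(drm_dev, true);
 *	}
 *
 * with the matching resume callback calling amdgpu_device_resume(drm_dev, true).
 */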
3839/**
810ddc3a 3840 * amdgpu_device_resume - initiate device resume
d38ceaf9 3841 *
87e3f136 3842 * @dev: drm dev pointer
87e3f136 3843 * @fbcon : notify the fbdev of resume
d38ceaf9
AD
3844 *
3845 * Bring the hw back to operating state (all asics).
3846 * Returns 0 for success or an error on failure.
3847 * Called at driver resume.
3848 */
de185019 3849int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
d38ceaf9 3850{
1348969a 3851 struct amdgpu_device *adev = drm_to_adev(dev);
03161a6e 3852 int r = 0;
d38ceaf9
AD
3853
3854 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3855 return 0;
3856
62498733 3857 if (adev->in_s0ix)
628c36d7
PL
3858 amdgpu_gfx_state_change_set(adev, sGpuChangeState_D0Entry);
3859
d38ceaf9 3860 /* post card */
39c640c0 3861 if (amdgpu_device_need_post(adev)) {
4d2997ab 3862 r = amdgpu_device_asic_init(adev);
74b0b157 3863 if (r)
aac89168 3864 dev_err(adev->dev, "amdgpu asic init failed\n");
74b0b157 3865 }
d38ceaf9 3866
06ec9070 3867 r = amdgpu_device_ip_resume(adev);
e6707218 3868 if (r) {
aac89168 3869 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
4d3b9ae5 3870 return r;
e6707218 3871 }
5ceb54c6
AD
3872 amdgpu_fence_driver_resume(adev);
3873
d38ceaf9 3874
06ec9070 3875 r = amdgpu_device_ip_late_init(adev);
03161a6e 3876 if (r)
4d3b9ae5 3877 return r;
d38ceaf9 3878
beff74bc
AD
3879 queue_delayed_work(system_wq, &adev->delayed_init_work,
3880 msecs_to_jiffies(AMDGPU_RESUME_MS));
3881
5d3a2d95
AD
3882 if (!adev->in_s0ix) {
3883 r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
3884 if (r)
3885 return r;
3886 }
756e6880 3887
96a5d8d4 3888 /* Make sure IB tests flushed */
beff74bc 3889 flush_delayed_work(&adev->delayed_init_work);
96a5d8d4 3890
a2e15b0e 3891 if (fbcon)
4d3b9ae5 3892 amdgpu_fbdev_set_suspend(adev, 0);
d38ceaf9
AD
3893
3894 drm_kms_helper_poll_enable(dev);
23a1a9e5 3895
5e6932fe 3896 amdgpu_ras_resume(adev);
3897
23a1a9e5
L
3898 /*
3899 * Most of the connector probing functions try to acquire runtime pm
3900 * refs to ensure that the GPU is powered on when connector polling is
3901 * performed. Since we're calling this from a runtime PM callback,
3902 * trying to acquire rpm refs will cause us to deadlock.
3903 *
3904 * Since we're guaranteed to be holding the rpm lock, it's safe to
3905 * temporarily disable the rpm helpers so this doesn't deadlock us.
3906 */
3907#ifdef CONFIG_PM
3908 dev->dev->power.disable_depth++;
3909#endif
4562236b
HW
3910 if (!amdgpu_device_has_dc_support(adev))
3911 drm_helper_hpd_irq_event(dev);
3912 else
3913 drm_kms_helper_hotplug_event(dev);
23a1a9e5
L
3914#ifdef CONFIG_PM
3915 dev->dev->power.disable_depth--;
3916#endif
44779b43
RZ
3917 adev->in_suspend = false;
3918
4d3b9ae5 3919 return 0;
d38ceaf9
AD
3920}
3921
e3ecdffa
AD
3922/**
3923 * amdgpu_device_ip_check_soft_reset - did soft reset succeed
3924 *
3925 * @adev: amdgpu_device pointer
3926 *
3927 * The list of all the hardware IPs that make up the asic is walked and
3928 * the check_soft_reset callbacks are run. check_soft_reset determines
3929 * if the asic is still hung or not.
3930 * Returns true if any of the IPs are still in a hung state, false if not.
3931 */
06ec9070 3932static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
63fbf42f
CZ
3933{
3934 int i;
3935 bool asic_hang = false;
3936
f993d628
ML
3937 if (amdgpu_sriov_vf(adev))
3938 return true;
3939
8bc04c29
AD
3940 if (amdgpu_asic_need_full_reset(adev))
3941 return true;
3942
63fbf42f 3943 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 3944 if (!adev->ip_blocks[i].status.valid)
63fbf42f 3945 continue;
a1255107
AD
3946 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
3947 adev->ip_blocks[i].status.hang =
3948 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
3949 if (adev->ip_blocks[i].status.hang) {
aac89168 3950 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
63fbf42f
CZ
3951 asic_hang = true;
3952 }
3953 }
3954 return asic_hang;
3955}
3956
e3ecdffa
AD
3957/**
3958 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
3959 *
3960 * @adev: amdgpu_device pointer
3961 *
3962 * The list of all the hardware IPs that make up the asic is walked and the
3963 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset
3964 * handles any IP specific hardware or software state changes that are
3965 * necessary for a soft reset to succeed.
3966 * Returns 0 on success, negative error code on failure.
3967 */
06ec9070 3968static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
d31a501e
CZ
3969{
3970 int i, r = 0;
3971
3972 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 3973 if (!adev->ip_blocks[i].status.valid)
d31a501e 3974 continue;
a1255107
AD
3975 if (adev->ip_blocks[i].status.hang &&
3976 adev->ip_blocks[i].version->funcs->pre_soft_reset) {
3977 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
d31a501e
CZ
3978 if (r)
3979 return r;
3980 }
3981 }
3982
3983 return 0;
3984}
3985
e3ecdffa
AD
3986/**
3987 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
3988 *
3989 * @adev: amdgpu_device pointer
3990 *
3991 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu
3992 * reset is necessary to recover.
3993 * Returns true if a full asic reset is required, false if not.
3994 */
06ec9070 3995static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
35d782fe 3996{
da146d3b
AD
3997 int i;
3998
8bc04c29
AD
3999 if (amdgpu_asic_need_full_reset(adev))
4000 return true;
4001
da146d3b 4002 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 4003 if (!adev->ip_blocks[i].status.valid)
da146d3b 4004 continue;
a1255107
AD
4005 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
4006 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
4007 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
98512bb8
KW
4008 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
4009 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
a1255107 4010 if (adev->ip_blocks[i].status.hang) {
aac89168 4011 dev_info(adev->dev, "Some block need full reset!\n");
da146d3b
AD
4012 return true;
4013 }
4014 }
35d782fe
CZ
4015 }
4016 return false;
4017}
4018
e3ecdffa
AD
4019/**
4020 * amdgpu_device_ip_soft_reset - do a soft reset
4021 *
4022 * @adev: amdgpu_device pointer
4023 *
4024 * The list of all the hardware IPs that make up the asic is walked and the
4025 * soft_reset callbacks are run if the block is hung. soft_reset handles any
4026 * IP specific hardware or software state changes that are necessary to soft
4027 * reset the IP.
4028 * Returns 0 on success, negative error code on failure.
4029 */
06ec9070 4030static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
35d782fe
CZ
4031{
4032 int i, r = 0;
4033
4034 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 4035 if (!adev->ip_blocks[i].status.valid)
35d782fe 4036 continue;
a1255107
AD
4037 if (adev->ip_blocks[i].status.hang &&
4038 adev->ip_blocks[i].version->funcs->soft_reset) {
4039 r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
35d782fe
CZ
4040 if (r)
4041 return r;
4042 }
4043 }
4044
4045 return 0;
4046}
4047
e3ecdffa
AD
4048/**
4049 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
4050 *
4051 * @adev: amdgpu_device pointer
4052 *
4053 * The list of all the hardware IPs that make up the asic is walked and the
4054 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
4055 * handles any IP specific hardware or software state changes that are
4056 * necessary after the IP has been soft reset.
4057 * Returns 0 on success, negative error code on failure.
4058 */
06ec9070 4059static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
35d782fe
CZ
4060{
4061 int i, r = 0;
4062
4063 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 4064 if (!adev->ip_blocks[i].status.valid)
35d782fe 4065 continue;
a1255107
AD
4066 if (adev->ip_blocks[i].status.hang &&
4067 adev->ip_blocks[i].version->funcs->post_soft_reset)
4068 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
35d782fe
CZ
4069 if (r)
4070 return r;
4071 }
4072
4073 return 0;
4074}
4075
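/*
 * The four helpers above are used together further down in
 * amdgpu_device_pre_asic_reset(), roughly as:
 *
 *	if (!amdgpu_device_ip_need_full_reset(adev)) {
 *		amdgpu_device_ip_pre_soft_reset(adev);
 *		r = amdgpu_device_ip_soft_reset(adev);
 *		amdgpu_device_ip_post_soft_reset(adev);
 *		if (r || amdgpu_device_ip_check_soft_reset(adev))
 *			need_full_reset = true;
 *	}
 *
 * i.e. a per-IP soft reset is attempted first, and the handler only
 * escalates to a full ASIC reset if it fails or an IP block still reports a
 * hang.
 */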
e3ecdffa 4076/**
c33adbc7 4077 * amdgpu_device_recover_vram - Recover some VRAM contents
e3ecdffa
AD
4078 *
4079 * @adev: amdgpu_device pointer
4080 *
4081 * Restores the contents of VRAM buffers from the shadows in GTT. Used to
4082 * restore things like GPUVM page tables after a GPU reset where
4083 * the contents of VRAM might be lost.
403009bf
CK
4084 *
4085 * Returns:
4086 * 0 on success, negative error code on failure.
e3ecdffa 4087 */
c33adbc7 4088static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
c41d1cf6 4089{
c41d1cf6 4090 struct dma_fence *fence = NULL, *next = NULL;
403009bf
CK
4091 struct amdgpu_bo *shadow;
4092 long r = 1, tmo;
c41d1cf6
ML
4093
4094 if (amdgpu_sriov_runtime(adev))
b045d3af 4095 tmo = msecs_to_jiffies(8000);
c41d1cf6
ML
4096 else
4097 tmo = msecs_to_jiffies(100);
4098
aac89168 4099 dev_info(adev->dev, "recover vram bo from shadow start\n");
c41d1cf6 4100 mutex_lock(&adev->shadow_list_lock);
403009bf
CK
4101 list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {
4102
4103 /* No need to recover an evicted BO */
4104 if (shadow->tbo.mem.mem_type != TTM_PL_TT ||
b575f10d 4105 shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET ||
403009bf
CK
4106 shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM)
4107 continue;
4108
4109 r = amdgpu_bo_restore_shadow(shadow, &next);
4110 if (r)
4111 break;
4112
c41d1cf6 4113 if (fence) {
1712fb1a 4114 tmo = dma_fence_wait_timeout(fence, false, tmo);
403009bf
CK
4115 dma_fence_put(fence);
4116 fence = next;
1712fb1a 4117 if (tmo == 0) {
4118 r = -ETIMEDOUT;
c41d1cf6 4119 break;
1712fb1a 4120 } else if (tmo < 0) {
4121 r = tmo;
4122 break;
4123 }
403009bf
CK
4124 } else {
4125 fence = next;
c41d1cf6 4126 }
c41d1cf6
ML
4127 }
4128 mutex_unlock(&adev->shadow_list_lock);
4129
403009bf
CK
4130 if (fence)
4131 tmo = dma_fence_wait_timeout(fence, false, tmo);
c41d1cf6
ML
4132 dma_fence_put(fence);
4133
1712fb1a 4134 if (r < 0 || tmo <= 0) {
aac89168 4135 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
403009bf
CK
4136 return -EIO;
4137 }
c41d1cf6 4138
aac89168 4139 dev_info(adev->dev, "recover vram bo from shadow done\n");
403009bf 4140 return 0;
c41d1cf6
ML
4141}
4142
a90ad3c2 4143
e3ecdffa 4144/**
06ec9070 4145 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
5740682e 4146 *
982a820b 4147 * @adev: amdgpu_device pointer
87e3f136 4148 * @from_hypervisor: request from hypervisor
5740682e
ML
4149 *
4150 * Do a VF FLR and reinitialize the ASIC.
3f48c681 4151 * Returns 0 if it succeeded, otherwise an error.
e3ecdffa
AD
4152 */
4153static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4154 bool from_hypervisor)
5740682e
ML
4155{
4156 int r;
4157
4158 if (from_hypervisor)
4159 r = amdgpu_virt_request_full_gpu(adev, true);
4160 else
4161 r = amdgpu_virt_reset_gpu(adev);
4162 if (r)
4163 return r;
a90ad3c2 4164
b639c22c
JZ
4165 amdgpu_amdkfd_pre_reset(adev);
4166
a90ad3c2 4167 /* Resume IP prior to SMC */
06ec9070 4168 r = amdgpu_device_ip_reinit_early_sriov(adev);
5740682e
ML
4169 if (r)
4170 goto error;
a90ad3c2 4171
c9ffa427 4172 amdgpu_virt_init_data_exchange(adev);
a90ad3c2 4173 /* we need recover gart prior to run SMC/CP/SDMA resume */
6c28aed6 4174 amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
a90ad3c2 4175
7a3e0bb2
RZ
4176 r = amdgpu_device_fw_loading(adev);
4177 if (r)
4178 return r;
4179
a90ad3c2 4180 /* now we are okay to resume SMC/CP/SDMA */
06ec9070 4181 r = amdgpu_device_ip_reinit_late_sriov(adev);
5740682e
ML
4182 if (r)
4183 goto error;
a90ad3c2
ML
4184
4185 amdgpu_irq_gpu_reset_resume_helper(adev);
5740682e 4186 r = amdgpu_ib_ring_tests(adev);
f81e8d53 4187 amdgpu_amdkfd_post_reset(adev);
a90ad3c2 4188
abc34253 4189error:
c41d1cf6 4190 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
e3526257 4191 amdgpu_inc_vram_lost(adev);
c33adbc7 4192 r = amdgpu_device_recover_vram(adev);
a90ad3c2 4193 }
437f3e0b 4194 amdgpu_virt_release_full_gpu(adev, true);
a90ad3c2
ML
4195
4196 return r;
4197}
4198
9a1cddd6 4199/**
4200 * amdgpu_device_has_job_running - check if there is any job in mirror list
4201 *
982a820b 4202 * @adev: amdgpu_device pointer
9a1cddd6 4203 *
4204 * check if there is any job in mirror list
4205 */
4206bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4207{
4208 int i;
4209 struct drm_sched_job *job;
4210
4211 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4212 struct amdgpu_ring *ring = adev->rings[i];
4213
4214 if (!ring || !ring->sched.thread)
4215 continue;
4216
4217 spin_lock(&ring->sched.job_list_lock);
6efa4b46
LT
4218 job = list_first_entry_or_null(&ring->sched.pending_list,
4219 struct drm_sched_job, list);
9a1cddd6 4220 spin_unlock(&ring->sched.job_list_lock);
4221 if (job)
4222 return true;
4223 }
4224 return false;
4225}
4226
12938fad
CK
4227/**
4228 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4229 *
982a820b 4230 * @adev: amdgpu_device pointer
12938fad
CK
4231 *
4232 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4233 * a hung GPU.
4234 */
4235bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4236{
4237 if (!amdgpu_device_ip_check_soft_reset(adev)) {
aac89168 4238 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
12938fad
CK
4239 return false;
4240 }
4241
3ba7b418
AG
4242 if (amdgpu_gpu_recovery == 0)
4243 goto disabled;
4244
4245 if (amdgpu_sriov_vf(adev))
4246 return true;
4247
4248 if (amdgpu_gpu_recovery == -1) {
4249 switch (adev->asic_type) {
fc42d47c
AG
4250 case CHIP_BONAIRE:
4251 case CHIP_HAWAII:
3ba7b418
AG
4252 case CHIP_TOPAZ:
4253 case CHIP_TONGA:
4254 case CHIP_FIJI:
4255 case CHIP_POLARIS10:
4256 case CHIP_POLARIS11:
4257 case CHIP_POLARIS12:
4258 case CHIP_VEGAM:
4259 case CHIP_VEGA20:
4260 case CHIP_VEGA10:
4261 case CHIP_VEGA12:
c43b849f 4262 case CHIP_RAVEN:
e9d4cf91 4263 case CHIP_ARCTURUS:
2cb44fb0 4264 case CHIP_RENOIR:
658c6639
AD
4265 case CHIP_NAVI10:
4266 case CHIP_NAVI14:
4267 case CHIP_NAVI12:
131a3c74 4268 case CHIP_SIENNA_CICHLID:
665fe4dc 4269 case CHIP_NAVY_FLOUNDER:
27859ee3 4270 case CHIP_DIMGREY_CAVEFISH:
fe68ceef 4271 case CHIP_VANGOGH:
ea4e96a7 4272 case CHIP_ALDEBARAN:
3ba7b418
AG
4273 break;
4274 default:
4275 goto disabled;
4276 }
12938fad
CK
4277 }
4278
4279 return true;
3ba7b418
AG
4280
4281disabled:
aac89168 4282 dev_info(adev->dev, "GPU recovery disabled.\n");
3ba7b418 4283 return false;
12938fad
CK
4284}
4285
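/*
 * Summary of the amdgpu_gpu_recovery module parameter as handled above and
 * in the recovery path below: 0 disables GPU recovery entirely, -1 enables
 * it automatically for SR-IOV VFs and for the ASICs listed in the switch
 * above, any other non-zero value enables it unconditionally, and the value
 * 2 additionally makes amdgpu_device_gpu_recover() re-check guilty jobs one
 * by one via amdgpu_device_recheck_guilty_jobs().
 */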
5c03e584
FX
4286int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
4287{
4288 u32 i;
4289 int ret = 0;
4290
4291 amdgpu_atombios_scratch_regs_engine_hung(adev, true);
4292
4293 dev_info(adev->dev, "GPU mode1 reset\n");
4294
4295 /* disable BM */
4296 pci_clear_master(adev->pdev);
4297
4298 amdgpu_device_cache_pci_state(adev->pdev);
4299
4300 if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
4301 dev_info(adev->dev, "GPU smu mode1 reset\n");
4302 ret = amdgpu_dpm_mode1_reset(adev);
4303 } else {
4304 dev_info(adev->dev, "GPU psp mode1 reset\n");
4305 ret = psp_gpu_reset(adev);
4306 }
4307
4308 if (ret)
4309 dev_err(adev->dev, "GPU mode1 reset failed\n");
4310
4311 amdgpu_device_load_pci_state(adev->pdev);
4312
4313 /* wait for asic to come out of reset */
4314 for (i = 0; i < adev->usec_timeout; i++) {
4315 u32 memsize = adev->nbio.funcs->get_memsize(adev);
4316
4317 if (memsize != 0xffffffff)
4318 break;
4319 udelay(1);
4320 }
4321
4322 amdgpu_atombios_scratch_regs_engine_hung(adev, false);
4323 return ret;
4324}
5c6dd71e 4325
e3c1b071 4326int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
04442bf7 4327 struct amdgpu_reset_context *reset_context)
26bc5340
AG
4328{
4329 int i, r = 0;
04442bf7
LL
4330 struct amdgpu_job *job = NULL;
4331 bool need_full_reset =
4332 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4333
4334 if (reset_context->reset_req_dev == adev)
4335 job = reset_context->job;
71182665 4336
e3c1b071 4337 /* no need to dump if device is not in good state during probe period */
4338 if (!adev->gmc.xgmi.pending_reset)
4339 amdgpu_debugfs_wait_dump(adev);
728e7e0c 4340
b602ca5f
TZ
4341 if (amdgpu_sriov_vf(adev)) {
4342 /* stop the data exchange thread */
4343 amdgpu_virt_fini_data_exchange(adev);
4344 }
4345
71182665 4346 /* block all schedulers and reset given job's ring */
0875dc9e
CZ
4347 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4348 struct amdgpu_ring *ring = adev->rings[i];
4349
51687759 4350 if (!ring || !ring->sched.thread)
0875dc9e 4351 continue;
5740682e 4352
2f9d4084
ML
4353 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4354 amdgpu_fence_driver_force_completion(ring);
0875dc9e 4355 }
d38ceaf9 4356
222b5f04
AG
4357 if(job)
4358 drm_sched_increase_karma(&job->base);
4359
04442bf7 4360 r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
404b277b
LL
4361 /* If reset handler not implemented, continue; otherwise return */
4362 if (r == -ENOSYS)
4363 r = 0;
4364 else
04442bf7
LL
4365 return r;
4366
1d721ed6 4367 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
26bc5340
AG
4368 if (!amdgpu_sriov_vf(adev)) {
4369
4370 if (!need_full_reset)
4371 need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4372
4373 if (!need_full_reset) {
4374 amdgpu_device_ip_pre_soft_reset(adev);
4375 r = amdgpu_device_ip_soft_reset(adev);
4376 amdgpu_device_ip_post_soft_reset(adev);
4377 if (r || amdgpu_device_ip_check_soft_reset(adev)) {
aac89168 4378 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
26bc5340
AG
4379 need_full_reset = true;
4380 }
4381 }
4382
4383 if (need_full_reset)
4384 r = amdgpu_device_ip_suspend(adev);
04442bf7
LL
4385 if (need_full_reset)
4386 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4387 else
4388 clear_bit(AMDGPU_NEED_FULL_RESET,
4389 &reset_context->flags);
26bc5340
AG
4390 }
4391
4392 return r;
4393}
4394
04442bf7
LL
4395int amdgpu_do_asic_reset(struct list_head *device_list_handle,
4396 struct amdgpu_reset_context *reset_context)
26bc5340
AG
4397{
4398 struct amdgpu_device *tmp_adev = NULL;
04442bf7 4399 bool need_full_reset, skip_hw_reset, vram_lost = false;
26bc5340
AG
4400 int r = 0;
4401
04442bf7
LL
4402 /* Try reset handler method first */
4403 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
4404 reset_list);
4405 r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
404b277b
LL
4406 /* If reset handler not implemented, continue; otherwise return */
4407 if (r == -ENOSYS)
4408 r = 0;
4409 else
04442bf7
LL
4410 return r;
4411
4412 /* Reset handler not implemented, use the default method */
4413 need_full_reset =
4414 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4415 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
4416
26bc5340 4417 /*
655ce9cb 4418 * ASIC reset has to be done on all XGMI hive nodes ASAP
26bc5340
AG
4419 * to allow proper links negotiation in FW (within 1 sec)
4420 */
7ac71382 4421 if (!skip_hw_reset && need_full_reset) {
655ce9cb 4422 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
041a62bc 4423 /* For XGMI run all resets in parallel to speed up the process */
d4535e2c 4424 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
e3c1b071 4425 tmp_adev->gmc.xgmi.pending_reset = false;
c96cf282 4426 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
d4535e2c
AG
4427 r = -EALREADY;
4428 } else
4429 r = amdgpu_asic_reset(tmp_adev);
d4535e2c 4430
041a62bc 4431 if (r) {
aac89168 4432 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
4a580877 4433 r, adev_to_drm(tmp_adev)->unique);
041a62bc 4434 break;
ce316fa5
LM
4435 }
4436 }
4437
041a62bc
AG
4438 /* For XGMI wait for all resets to complete before proceed */
4439 if (!r) {
655ce9cb 4440 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
ce316fa5
LM
4441 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4442 flush_work(&tmp_adev->xgmi_reset_work);
4443 r = tmp_adev->asic_reset_res;
4444 if (r)
4445 break;
ce316fa5
LM
4446 }
4447 }
4448 }
ce316fa5 4449 }
26bc5340 4450
43c4d576 4451 if (!r && amdgpu_ras_intr_triggered()) {
655ce9cb 4452 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
8bc7b360
HZ
4453 if (tmp_adev->mmhub.ras_funcs &&
4454 tmp_adev->mmhub.ras_funcs->reset_ras_error_count)
4455 tmp_adev->mmhub.ras_funcs->reset_ras_error_count(tmp_adev);
43c4d576
JC
4456 }
4457
00eaa571 4458 amdgpu_ras_intr_cleared();
43c4d576 4459 }
00eaa571 4460
655ce9cb 4461 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
26bc5340
AG
4462 if (need_full_reset) {
4463 /* post card */
e3c1b071 4464 r = amdgpu_device_asic_init(tmp_adev);
4465 if (r) {
aac89168 4466 dev_warn(tmp_adev->dev, "asic atom init failed!");
e3c1b071 4467 } else {
26bc5340
AG
4468 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4469 r = amdgpu_device_ip_resume_phase1(tmp_adev);
4470 if (r)
4471 goto out;
4472
4473 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4474 if (vram_lost) {
77e7f829 4475 DRM_INFO("VRAM is lost due to GPU reset!\n");
e3526257 4476 amdgpu_inc_vram_lost(tmp_adev);
26bc5340
AG
4477 }
4478
6c28aed6 4479 r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
26bc5340
AG
4480 if (r)
4481 goto out;
4482
4483 r = amdgpu_device_fw_loading(tmp_adev);
4484 if (r)
4485 return r;
4486
4487 r = amdgpu_device_ip_resume_phase2(tmp_adev);
4488 if (r)
4489 goto out;
4490
4491 if (vram_lost)
4492 amdgpu_device_fill_reset_magic(tmp_adev);
4493
fdafb359
EQ
4494 /*
4495				 * Add this ASIC as tracked, since the reset already
4496				 * completed successfully.
4497 */
4498 amdgpu_register_gpu_instance(tmp_adev);
4499
04442bf7
LL
4500 if (!reset_context->hive &&
4501 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
e3c1b071 4502 amdgpu_xgmi_add_device(tmp_adev);
4503
7c04ca50 4504 r = amdgpu_device_ip_late_init(tmp_adev);
4505 if (r)
4506 goto out;
4507
565d1941
EQ
4508 amdgpu_fbdev_set_suspend(tmp_adev, 0);
4509
e8fbaf03
GC
4510 /*
4511				 * The GPU enters a bad state once the number of faulty pages
4512				 * flagged by ECC reaches the threshold, and ras
4513				 * recovery is scheduled next. So add one check
4514				 * here to break recovery if it indeed exceeds the
4515				 * bad page threshold, and remind the user to
4516				 * retire this GPU or set a bigger
4517				 * bad_page_threshold value to fix this when
4518				 * probing the driver again.
4519 */
11003c68 4520 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
e8fbaf03
GC
4521 /* must succeed. */
4522 amdgpu_ras_resume(tmp_adev);
4523 } else {
4524 r = -EINVAL;
4525 goto out;
4526 }
e79a04d5 4527
26bc5340 4528 /* Update PSP FW topology after reset */
04442bf7
LL
4529 if (reset_context->hive &&
4530 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4531 r = amdgpu_xgmi_update_topology(
4532 reset_context->hive, tmp_adev);
26bc5340
AG
4533 }
4534 }
4535
26bc5340
AG
4536out:
4537 if (!r) {
4538 amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4539 r = amdgpu_ib_ring_tests(tmp_adev);
4540 if (r) {
4541 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
26bc5340
AG
4542 need_full_reset = true;
4543 r = -EAGAIN;
4544 goto end;
4545 }
4546 }
4547
4548 if (!r)
4549 r = amdgpu_device_recover_vram(tmp_adev);
4550 else
4551 tmp_adev->asic_reset_res = r;
4552 }
4553
4554end:
04442bf7
LL
4555 if (need_full_reset)
4556 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4557 else
4558 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
26bc5340
AG
4559 return r;
4560}
4561
08ebb485
DL
4562static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
4563 struct amdgpu_hive_info *hive)
26bc5340 4564{
53b3f8f4
DL
4565 if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
4566 return false;
4567
08ebb485
DL
4568 if (hive) {
4569 down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
4570 } else {
4571 down_write(&adev->reset_sem);
4572 }
5740682e 4573
a3a09142
AD
4574 switch (amdgpu_asic_reset_method(adev)) {
4575 case AMD_RESET_METHOD_MODE1:
4576 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
4577 break;
4578 case AMD_RESET_METHOD_MODE2:
4579 adev->mp1_state = PP_MP1_STATE_RESET;
4580 break;
4581 default:
4582 adev->mp1_state = PP_MP1_STATE_NONE;
4583 break;
4584 }
1d721ed6
AG
4585
4586 return true;
26bc5340 4587}
d38ceaf9 4588
26bc5340
AG
4589static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
4590{
89041940 4591 amdgpu_vf_error_trans_all(adev);
a3a09142 4592 adev->mp1_state = PP_MP1_STATE_NONE;
53b3f8f4 4593 atomic_set(&adev->in_gpu_reset, 0);
6049db43 4594 up_write(&adev->reset_sem);
26bc5340
AG
4595}
4596
91fb309d
HC
4597/*
4598 * to lock a list of amdgpu devices in a hive safely; if it is not a hive
4599 * with multiple nodes, this behaves like amdgpu_device_lock_adev.
4600 *
4601 * unlock won't require roll back.
4602 */
4603static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
4604{
4605 struct amdgpu_device *tmp_adev = NULL;
4606
4607 if (adev->gmc.xgmi.num_physical_nodes > 1) {
4608 if (!hive) {
4609 dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
4610 return -ENODEV;
4611 }
4612 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
4613 if (!amdgpu_device_lock_adev(tmp_adev, hive))
4614 goto roll_back;
4615 }
4616 } else if (!amdgpu_device_lock_adev(adev, hive))
4617 return -EAGAIN;
4618
4619 return 0;
4620roll_back:
4621 if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) {
4622 /*
4623		 * if the locking iteration breaks in the middle of a hive,
4624		 * it may mean there is a race issue,
4625		 * or that a hive device locked up independently.
4626		 * we may or may not be in trouble, so try to roll back
4627		 * the lock and give out a warning.
4628 */
4629 dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. Rolling back to unlock");
4630 list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) {
4631 amdgpu_device_unlock_adev(tmp_adev);
4632 }
4633 }
4634 return -EAGAIN;
4635}
4636
3f12acc8
EQ
4637static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
4638{
4639 struct pci_dev *p = NULL;
4640
4641 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4642 adev->pdev->bus->number, 1);
4643 if (p) {
4644 pm_runtime_enable(&(p->dev));
4645 pm_runtime_resume(&(p->dev));
4646 }
4647}
4648
4649static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
4650{
4651 enum amd_reset_method reset_method;
4652 struct pci_dev *p = NULL;
4653 u64 expires;
4654
4655 /*
4656 * For now, only BACO and mode1 reset are confirmed
4657 * to suffer the audio issue without proper suspended.
4658 */
4659 reset_method = amdgpu_asic_reset_method(adev);
4660 if ((reset_method != AMD_RESET_METHOD_BACO) &&
4661 (reset_method != AMD_RESET_METHOD_MODE1))
4662 return -EINVAL;
4663
4664 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4665 adev->pdev->bus->number, 1);
4666 if (!p)
4667 return -ENODEV;
4668
4669 expires = pm_runtime_autosuspend_expiration(&(p->dev));
4670 if (!expires)
4671 /*
4672 * If we cannot get the audio device autosuspend delay,
4673		 * a fixed 4S interval will be used. Since 3S is
4674		 * the audio controller's default autosuspend delay setting,
4675		 * the 4S used here is guaranteed to cover that.
4676 */
54b7feb9 4677 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
3f12acc8
EQ
4678
4679 while (!pm_runtime_status_suspended(&(p->dev))) {
4680 if (!pm_runtime_suspend(&(p->dev)))
4681 break;
4682
4683 if (expires < ktime_get_mono_fast_ns()) {
4684 dev_warn(adev->dev, "failed to suspend display audio\n");
4685 /* TODO: abort the succeeding gpu reset? */
4686 return -ETIMEDOUT;
4687 }
4688 }
4689
4690 pm_runtime_disable(&(p->dev));
4691
4692 return 0;
4693}
4694
04442bf7
LL
4695void amdgpu_device_recheck_guilty_jobs(
4696 struct amdgpu_device *adev, struct list_head *device_list_handle,
4697 struct amdgpu_reset_context *reset_context)
e6c6338f
JZ
4698{
4699 int i, r = 0;
4700
4701 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4702 struct amdgpu_ring *ring = adev->rings[i];
4703 int ret = 0;
4704 struct drm_sched_job *s_job;
4705
4706 if (!ring || !ring->sched.thread)
4707 continue;
4708
4709 s_job = list_first_entry_or_null(&ring->sched.pending_list,
4710 struct drm_sched_job, list);
4711 if (s_job == NULL)
4712 continue;
4713
4714		/* clear the job's guilty flag and depend on the following step to decide the real one */
4715 drm_sched_reset_karma(s_job);
4716 drm_sched_resubmit_jobs_ext(&ring->sched, 1);
4717
4718 ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout);
4719 if (ret == 0) { /* timeout */
4720 DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n",
4721 ring->sched.name, s_job->id);
4722
4723 /* set guilty */
4724 drm_sched_increase_karma(s_job);
4725retry:
4726 /* do hw reset */
4727 if (amdgpu_sriov_vf(adev)) {
4728 amdgpu_virt_fini_data_exchange(adev);
4729 r = amdgpu_device_reset_sriov(adev, false);
4730 if (r)
4731 adev->asic_reset_res = r;
4732 } else {
04442bf7
LL
4733 clear_bit(AMDGPU_SKIP_HW_RESET,
4734 &reset_context->flags);
4735 r = amdgpu_do_asic_reset(device_list_handle,
4736 reset_context);
e6c6338f
JZ
4737 if (r && r == -EAGAIN)
4738 goto retry;
4739 }
4740
4741 /*
4742 * add reset counter so that the following
4743 * resubmitted job could flush vmid
4744 */
4745 atomic_inc(&adev->gpu_reset_counter);
4746 continue;
4747 }
4748
4749 /* got the hw fence, signal finished fence */
4750 atomic_dec(ring->sched.score);
4751 dma_fence_get(&s_job->s_fence->finished);
4752 dma_fence_signal(&s_job->s_fence->finished);
4753 dma_fence_put(&s_job->s_fence->finished);
4754
4755 /* remove node from list and free the job */
4756 spin_lock(&ring->sched.job_list_lock);
4757 list_del_init(&s_job->list);
4758 spin_unlock(&ring->sched.job_list_lock);
4759 ring->sched.ops->free_job(s_job);
4760 }
4761}
4762
26bc5340
AG
4763/**
4764 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
4765 *
982a820b 4766 * @adev: amdgpu_device pointer
26bc5340
AG
4767 * @job: which job trigger hang
4768 *
4769 * Attempt to reset the GPU if it has hung (all asics).
4770 * Attempt to do soft-reset or full-reset and reinitialize the ASIC.
4771 * Returns 0 for success or an error on failure.
4772 */
4773
4774int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
4775 struct amdgpu_job *job)
4776{
1d721ed6 4777 struct list_head device_list, *device_list_handle = NULL;
7dd8c205 4778 bool job_signaled = false;
26bc5340 4779 struct amdgpu_hive_info *hive = NULL;
26bc5340 4780 struct amdgpu_device *tmp_adev = NULL;
1d721ed6 4781 int i, r = 0;
bb5c7235 4782 bool need_emergency_restart = false;
3f12acc8 4783 bool audio_suspended = false;
e6c6338f 4784 int tmp_vram_lost_counter;
04442bf7
LL
4785 struct amdgpu_reset_context reset_context;
4786
4787 memset(&reset_context, 0, sizeof(reset_context));
26bc5340 4788
6e3cd2a9 4789 /*
bb5c7235
WS
4790 * Special case: RAS triggered and full reset isn't supported
4791 */
4792 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
4793
d5ea093e
AG
4794 /*
4795 * Flush RAM to disk so that after reboot
4796 * the user can read log and see why the system rebooted.
4797 */
bb5c7235 4798 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
d5ea093e
AG
4799 DRM_WARN("Emergency reboot.");
4800
4801 ksys_sync_helper();
4802 emergency_restart();
4803 }
4804
b823821f 4805 dev_info(adev->dev, "GPU %s begin!\n",
bb5c7235 4806 need_emergency_restart ? "jobs stop":"reset");
26bc5340
AG
4807
4808 /*
1d721ed6
AG
4809	 * Here we trylock to avoid a chain of resets executing from
4810	 * either a trigger by jobs on different adevs in an XGMI hive or jobs on
4811	 * different schedulers for the same device while this TO handler is running.
4812	 * We always reset all schedulers for a device and all devices for an XGMI
4813	 * hive, so that should take care of them too.
26bc5340 4814 */
d95e8e97 4815 hive = amdgpu_get_xgmi_hive(adev);
53b3f8f4
DL
4816 if (hive) {
4817 if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
4818 DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
4819 job ? job->base.id : -1, hive->hive_id);
d95e8e97 4820 amdgpu_put_xgmi_hive(hive);
91fb309d
HC
4821 if (job)
4822 drm_sched_increase_karma(&job->base);
53b3f8f4
DL
4823 return 0;
4824 }
4825 mutex_lock(&hive->hive_lock);
1d721ed6 4826 }
26bc5340 4827
04442bf7
LL
4828 reset_context.method = AMD_RESET_METHOD_NONE;
4829 reset_context.reset_req_dev = adev;
4830 reset_context.job = job;
4831 reset_context.hive = hive;
4832 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
4833
91fb309d
HC
4834 /*
4835	 * lock the device before we try to operate on the linked list;
4836	 * if we didn't get the device lock, don't touch the linked list since
4837	 * others may be iterating over it.
4838 */
4839 r = amdgpu_device_lock_hive_adev(adev, hive);
4840 if (r) {
4841 dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
4842 job ? job->base.id : -1);
4843
4844 /* even we skipped this reset, still need to set the job to guilty */
4845 if (job)
4846 drm_sched_increase_karma(&job->base);
4847 goto skip_recovery;
4848 }
4849
9e94d22c
EQ
4850 /*
4851 * Build list of devices to reset.
4852 * In case we are in XGMI hive mode, resort the device list
4853 * to put adev in the 1st position.
4854 */
4855 INIT_LIST_HEAD(&device_list);
4856 if (adev->gmc.xgmi.num_physical_nodes > 1) {
655ce9cb 4857 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
4858 list_add_tail(&tmp_adev->reset_list, &device_list);
4859 if (!list_is_first(&adev->reset_list, &device_list))
4860 list_rotate_to_front(&adev->reset_list, &device_list);
4861 device_list_handle = &device_list;
26bc5340 4862 } else {
655ce9cb 4863 list_add_tail(&adev->reset_list, &device_list);
26bc5340
AG
4864 device_list_handle = &device_list;
4865 }
4866
1d721ed6 4867 /* block all schedulers and reset given job's ring */
655ce9cb 4868 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
3f12acc8
EQ
4869 /*
4870 * Try to put the audio codec into suspend state
4871 * before gpu reset started.
4872 *
4873		 * The power domain of the graphics device
4874		 * is shared with the AZ power domain. Without this,
4875 * we may change the audio hardware from behind
4876 * the audio driver's back. That will trigger
4877 * some audio codec errors.
4878 */
4879 if (!amdgpu_device_suspend_display_audio(tmp_adev))
4880 audio_suspended = true;
4881
9e94d22c
EQ
4882 amdgpu_ras_set_error_query_ready(tmp_adev, false);
4883
52fb44cf
EQ
4884 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
4885
9e94d22c
EQ
4886 if (!amdgpu_sriov_vf(tmp_adev))
4887 amdgpu_amdkfd_pre_reset(tmp_adev);
4888
12ffa55d
AG
4889 /*
4890		 * Mark these ASICs to be reset as untracked first,
4891		 * and add them back after the reset completes.
4892 */
4893 amdgpu_unregister_gpu_instance(tmp_adev);
4894
a2f63ee8 4895 amdgpu_fbdev_set_suspend(tmp_adev, 1);
565d1941 4896
f1c1314b 4897 /* disable ras on ALL IPs */
bb5c7235 4898 if (!need_emergency_restart &&
b823821f 4899 amdgpu_device_ip_need_full_reset(tmp_adev))
f1c1314b 4900 amdgpu_ras_suspend(tmp_adev);
4901
1d721ed6
AG
4902 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4903 struct amdgpu_ring *ring = tmp_adev->rings[i];
4904
4905 if (!ring || !ring->sched.thread)
4906 continue;
4907
0b2d2c2e 4908 drm_sched_stop(&ring->sched, job ? &job->base : NULL);
7c6e68c7 4909
bb5c7235 4910 if (need_emergency_restart)
7c6e68c7 4911 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
1d721ed6 4912 }
8f8c80f4 4913 atomic_inc(&tmp_adev->gpu_reset_counter);
1d721ed6
AG
4914 }
4915
bb5c7235 4916 if (need_emergency_restart)
7c6e68c7
AG
4917 goto skip_sched_resume;
4918
1d721ed6
AG
4919 /*
4920 * Must check guilty signal here since after this point all old
4921 * HW fences are force signaled.
4922 *
4923 * job->base holds a reference to parent fence
4924 */
4925 if (job && job->base.s_fence->parent &&
7dd8c205 4926 dma_fence_is_signaled(job->base.s_fence->parent)) {
1d721ed6 4927 job_signaled = true;
1d721ed6
AG
4928 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
4929 goto skip_hw_reset;
4930 }
4931
26bc5340 4932retry: /* Rest of adevs pre asic reset from XGMI hive. */
655ce9cb 4933 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
04442bf7 4934 r = amdgpu_device_pre_asic_reset(tmp_adev, &reset_context);
26bc5340
AG
4935 /*TODO Should we stop ?*/
4936 if (r) {
aac89168 4937 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
4a580877 4938 r, adev_to_drm(tmp_adev)->unique);
26bc5340
AG
4939 tmp_adev->asic_reset_res = r;
4940 }
4941 }
4942
e6c6338f 4943 tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
26bc5340
AG
4944 /* Actual ASIC resets if needed.*/
4945 /* TODO Implement XGMI hive reset logic for SRIOV */
4946 if (amdgpu_sriov_vf(adev)) {
4947 r = amdgpu_device_reset_sriov(adev, job ? false : true);
4948 if (r)
4949 adev->asic_reset_res = r;
4950 } else {
04442bf7 4951 r = amdgpu_do_asic_reset(device_list_handle, &reset_context);
26bc5340
AG
4952 if (r && r == -EAGAIN)
4953 goto retry;
4954 }
4955
1d721ed6
AG
4956skip_hw_reset:
4957
26bc5340 4958 /* Post ASIC reset for all devs .*/
655ce9cb 4959 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
7c6e68c7 4960
e6c6338f
JZ
4961 /*
4962		 * Sometimes a later bad compute job can block a good gfx job, as the gfx
4963		 * and compute rings share internal GC HW mutually. We add an additional
4964		 * guilty-job recheck step to find the real guilty job: it synchronously
4965		 * submits and waits for the first job to be signaled. If that times out,
4966		 * we identify it as the real guilty job.
4967 */
4968 if (amdgpu_gpu_recovery == 2 &&
4969 !(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter)))
04442bf7
LL
4970 amdgpu_device_recheck_guilty_jobs(
4971 tmp_adev, device_list_handle, &reset_context);
e6c6338f 4972
1d721ed6
AG
4973 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4974 struct amdgpu_ring *ring = tmp_adev->rings[i];
4975
4976 if (!ring || !ring->sched.thread)
4977 continue;
4978
4979 /* No point to resubmit jobs if we didn't HW reset*/
4980 if (!tmp_adev->asic_reset_res && !job_signaled)
4981 drm_sched_resubmit_jobs(&ring->sched);
4982
4983 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
4984 }
4985
4986 if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
4a580877 4987 drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
1d721ed6
AG
4988 }
4989
4990 tmp_adev->asic_reset_res = 0;
26bc5340
AG
4991
4992 if (r) {
4993 /* bad news, how to tell it to userspace ? */
12ffa55d 4994 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
26bc5340
AG
4995 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
4996 } else {
12ffa55d 4997 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
26bc5340 4998 }
7c6e68c7 4999 }
26bc5340 5000
7c6e68c7 5001skip_sched_resume:
655ce9cb 5002 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
8e2712e7 5003 /* unlock kfd: SRIOV would do it separately */
bb5c7235 5004 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
7c6e68c7 5005 amdgpu_amdkfd_post_reset(tmp_adev);
8e2712e7 5006
5007 /* kfd_post_reset will do nothing if kfd device is not initialized,
5008 * need to bring up kfd here if it's not be initialized before
5009 */
5010 if (!adev->kfd.init_complete)
5011 amdgpu_amdkfd_device_init(adev);
5012
3f12acc8
EQ
5013 if (audio_suspended)
5014 amdgpu_device_resume_display_audio(tmp_adev);
26bc5340
AG
5015 amdgpu_device_unlock_adev(tmp_adev);
5016 }
5017
cbfd17f7 5018skip_recovery:
9e94d22c 5019 if (hive) {
53b3f8f4 5020 atomic_set(&hive->in_reset, 0);
9e94d22c 5021 mutex_unlock(&hive->hive_lock);
d95e8e97 5022 amdgpu_put_xgmi_hive(hive);
9e94d22c 5023 }
26bc5340 5024
91fb309d 5025 if (r && r != -EAGAIN)
26bc5340 5026 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
d38ceaf9
AD
5027 return r;
5028}
5029
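/*
 * High-level flow of amdgpu_device_gpu_recover() above: optionally perform
 * an emergency restart for fatal RAS errors, lock the device (or the whole
 * XGMI hive), suspend display audio, stop the schedulers and KFD, run
 * amdgpu_device_pre_asic_reset() on every device in the list, perform the
 * actual reset through amdgpu_do_asic_reset() (or the SR-IOV path), then
 * resubmit jobs, restart the schedulers, resume audio and KFD, and unlock
 * the devices again.
 */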
e3ecdffa
AD
5030/**
5031 * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot
5032 *
5033 * @adev: amdgpu_device pointer
5034 *
 5035 * Fetches and stores in the driver the PCIe capabilities (gen speed
 5036 * and lanes) of the slot the device is in. Handles APUs and
 5037 * virtualized environments where PCIe config space may not be available.
5038 */
5494d864 5039static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
d0dd7f0c 5040{
5d9a6330 5041 struct pci_dev *pdev;
5042 enum pci_bus_speed speed_cap, platform_speed_cap;
5043 enum pcie_link_width platform_link_width;
d0dd7f0c 5044
5045 if (amdgpu_pcie_gen_cap)
5046 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
d0dd7f0c 5047
5048 if (amdgpu_pcie_lane_cap)
5049 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
d0dd7f0c 5050
5051 /* covers APUs as well */
5052 if (pci_is_root_bus(adev->pdev->bus)) {
5053 if (adev->pm.pcie_gen_mask == 0)
5054 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
5055 if (adev->pm.pcie_mlw_mask == 0)
5056 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
d0dd7f0c 5057 return;
cd474ba0 5058 }
d0dd7f0c 5059
5060 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
5061 return;
5062
5063 pcie_bandwidth_available(adev->pdev, NULL,
5064 &platform_speed_cap, &platform_link_width);
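	/*
	 * pcie_bandwidth_available() walks the PCIe hierarchy from the device up
	 * to the root port and reports the speed/width of the most limiting link;
	 * that is what is treated as the "platform" capability below, as opposed
	 * to the ASIC's own link capability read via pcie_get_speed_cap().
	 */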
c5313457 5065
cd474ba0 5066 if (adev->pm.pcie_gen_mask == 0) {
5067 /* asic caps */
5068 pdev = adev->pdev;
5069 speed_cap = pcie_get_speed_cap(pdev);
5070 if (speed_cap == PCI_SPEED_UNKNOWN) {
5071 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5072 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5073 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
cd474ba0 5074 } else {
5075 if (speed_cap == PCIE_SPEED_32_0GT)
5076 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5077 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5078 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5079 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5080 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
5081 else if (speed_cap == PCIE_SPEED_16_0GT)
5082 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5083 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5084 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5085 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
5086 else if (speed_cap == PCIE_SPEED_8_0GT)
5087 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5088 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5089 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5090 else if (speed_cap == PCIE_SPEED_5_0GT)
5091 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5092 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
5093 else
5094 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
5095 }
5096 /* platform caps */
c5313457 5097 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
5098 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5099 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5100 } else {
5101 if (platform_speed_cap == PCIE_SPEED_32_0GT)
5102 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5103 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5104 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5105 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5106 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
5107 else if (platform_speed_cap == PCIE_SPEED_16_0GT)
5108 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5109 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5110 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5111 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
c5313457 5112 else if (platform_speed_cap == PCIE_SPEED_8_0GT)
5113 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5114 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5115 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
c5313457 5116 else if (platform_speed_cap == PCIE_SPEED_5_0GT)
5117 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5118 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5119 else
5120 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
5121
5122 }
5123 }
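	/*
	 * Worked example (illustrative): an ASIC whose link caps out at
	 * PCIE_SPEED_8_0GT sitting in a platform that reports PCIE_SPEED_16_0GT
	 * ends up with the GEN1|GEN2|GEN3 CAIL_ASIC_* bits and the
	 * GEN1|GEN2|GEN3|GEN4 CAIL_PCIE_* bits set in pcie_gen_mask, so later
	 * consumers can pick a link speed supported by both the ASIC and the
	 * platform.
	 */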
5124 if (adev->pm.pcie_mlw_mask == 0) {
c5313457 5125 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
5126 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
5127 } else {
c5313457 5128 switch (platform_link_width) {
5d9a6330 5129 case PCIE_LNK_X32:
5130 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
5131 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5132 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5133 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5134 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5135 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5136 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5137 break;
5d9a6330 5138 case PCIE_LNK_X16:
5139 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5140 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5141 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5142 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5143 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5144 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5145 break;
5d9a6330 5146 case PCIE_LNK_X12:
5147 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5148 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5149 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5150 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5151 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5152 break;
5d9a6330 5153 case PCIE_LNK_X8:
5154 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5155 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5156 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5157 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5158 break;
5d9a6330 5159 case PCIE_LNK_X4:
5160 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5161 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5162 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5163 break;
5d9a6330 5164 case PCIE_LNK_X2:
5165 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5166 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5167 break;
5d9a6330 5168 case PCIE_LNK_X1:
5169 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
5170 break;
5171 default:
5172 break;
5173 }
5174 }
5175 }
5176}
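/*
 * Usage note: both masks can be forced from userspace via the pcie_gen_cap and
 * pcie_lane_cap module parameters (the amdgpu_pcie_gen_cap/amdgpu_pcie_lane_cap
 * variables checked at the top of this function); when a parameter is non-zero
 * the corresponding autodetection above is skipped entirely.
 */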
d38ceaf9 5177
5178int amdgpu_device_baco_enter(struct drm_device *dev)
5179{
1348969a 5180 struct amdgpu_device *adev = drm_to_adev(dev);
7a22677b 5181 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
361dbd01 5182
4a580877 5183 if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
5184 return -ENOTSUPP;
5185
8ab0d6f0 5186 if (ras && adev->ras_enabled &&
acdae216 5187 adev->nbio.funcs->enable_doorbell_interrupt)
5188 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
5189
9530273e 5190 return amdgpu_dpm_baco_enter(adev);
5191}
5192
5193int amdgpu_device_baco_exit(struct drm_device *dev)
5194{
1348969a 5195 struct amdgpu_device *adev = drm_to_adev(dev);
7a22677b 5196 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
9530273e 5197 int ret = 0;
361dbd01 5198
4a580877 5199 if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
361dbd01
AD
5200 return -ENOTSUPP;
5201
5202 ret = amdgpu_dpm_baco_exit(adev);
5203 if (ret)
5204 return ret;
7a22677b 5205
8ab0d6f0 5206 if (ras && adev->ras_enabled &&
acdae216 5207 adev->nbio.funcs->enable_doorbell_interrupt)
5208 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
5209
5210 return 0;
361dbd01 5211}
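/*
 * A minimal sketch of how callers are expected to pair these helpers, e.g.
 * around runtime suspend/resume (illustrative only; the real call sites live
 * in the driver's power management code):
 *
 *	r = amdgpu_device_baco_enter(drm_dev);
 *	if (r)
 *		return r;
 *	... device sits in BACO (bus active, chip off) while idle ...
 *	r = amdgpu_device_baco_exit(drm_dev);
 */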
c9a6b82f 5212
5213static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev)
5214{
5215 int i;
5216
5217 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5218 struct amdgpu_ring *ring = adev->rings[i];
5219
5220 if (!ring || !ring->sched.thread)
5221 continue;
5222
5223 cancel_delayed_work_sync(&ring->sched.work_tdr);
5224 }
5225}
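/*
 * TDR is short for "timeout detection and recovery": ring->sched.work_tdr is
 * the delayed work the DRM scheduler uses to detect timed-out jobs, so
 * cancelling it on every ring keeps an in-flight job-timeout handler from
 * racing with the PCI error recovery path below.
 */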
5226
5227/**
5228 * amdgpu_pci_error_detected - Called when a PCI error is detected.
5229 * @pdev: PCI device struct
5230 * @state: PCI channel state
5231 *
5232 * Description: Called when a PCI error is detected.
5233 *
5234 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
5235 */
5236pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
5237{
5238 struct drm_device *dev = pci_get_drvdata(pdev);
5239 struct amdgpu_device *adev = drm_to_adev(dev);
acd89fca 5240 int i;
5241
5242 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
5243
5244 if (adev->gmc.xgmi.num_physical_nodes > 1) {
5245 DRM_WARN("No support for XGMI hive yet...");
5246 return PCI_ERS_RESULT_DISCONNECT;
5247 }
5248
5249 switch (state) {
5250 case pci_channel_io_normal:
5251 return PCI_ERS_RESULT_CAN_RECOVER;
acd89fca 5252 /* Fatal error, prepare for slot reset */
5253 case pci_channel_io_frozen:
5254 /*
 5255 * Cancel and wait for all TDRs in progress if we fail to
5256 * set adev->in_gpu_reset in amdgpu_device_lock_adev
5257 *
5258 * Locking adev->reset_sem will prevent any external access
5259 * to GPU during PCI error recovery
5260 */
5261 while (!amdgpu_device_lock_adev(adev, NULL))
5262 amdgpu_cancel_all_tdr(adev);
5263
5264 /*
5265 * Block any work scheduling as we do for regular GPU reset
5266 * for the duration of the recovery
5267 */
5268 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5269 struct amdgpu_ring *ring = adev->rings[i];
5270
5271 if (!ring || !ring->sched.thread)
5272 continue;
5273
5274 drm_sched_stop(&ring->sched, NULL);
5275 }
8f8c80f4 5276 atomic_inc(&adev->gpu_reset_counter);
5277 return PCI_ERS_RESULT_NEED_RESET;
5278 case pci_channel_io_perm_failure:
5279 /* Permanent error, prepare for device removal */
5280 return PCI_ERS_RESULT_DISCONNECT;
5281 }
5282
5283 return PCI_ERS_RESULT_NEED_RESET;
5284}
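/*
 * These callbacks only take effect once they are plugged into a
 * struct pci_error_handlers referenced from the driver's struct pci_driver.
 * A minimal sketch of that hookup (the handler name below is illustrative;
 * the actual registration lives in amdgpu_drv.c):
 *
 *	static const struct pci_error_handlers amdgpu_pci_err_handler = {
 *		.error_detected	= amdgpu_pci_error_detected,
 *		.mmio_enabled	= amdgpu_pci_mmio_enabled,
 *		.slot_reset	= amdgpu_pci_slot_reset,
 *		.resume		= amdgpu_pci_resume,
 *	};
 *
 * with the pci_driver pointing at it via .err_handler = &amdgpu_pci_err_handler.
 */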
5285
5286/**
5287 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
5288 * @pdev: pointer to PCI device
5289 */
5290pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
5291{
5292
5293 DRM_INFO("PCI error: mmio enabled callback!!\n");
5294
5295 /* TODO - dump whatever for debugging purposes */
5296
 5297 /* This is called only if amdgpu_pci_error_detected returns
 5298 * PCI_ERS_RESULT_CAN_RECOVER. Reads/writes to the device still
 5299 * work, so there is no need to reset the slot.
 5300 */
5301
5302 return PCI_ERS_RESULT_RECOVERED;
5303}
5304
5305/**
5306 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
5307 * @pdev: PCI device struct
5308 *
5309 * Description: This routine is called by the pci error recovery
5310 * code after the PCI slot has been reset, just before we
5311 * should resume normal operations.
5312 */
5313pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
5314{
5315 struct drm_device *dev = pci_get_drvdata(pdev);
5316 struct amdgpu_device *adev = drm_to_adev(dev);
362c7b91 5317 int r, i;
04442bf7 5318 struct amdgpu_reset_context reset_context;
362c7b91 5319 u32 memsize;
7ac71382 5320 struct list_head device_list;
5321
5322 DRM_INFO("PCI error: slot reset callback!!\n");
5323
5324 memset(&reset_context, 0, sizeof(reset_context));
5325
7ac71382 5326 INIT_LIST_HEAD(&device_list);
655ce9cb 5327 list_add_tail(&adev->reset_list, &device_list);
7ac71382 5328
5329 /* wait for asic to come out of reset */
5330 msleep(500);
5331
7ac71382 5332 /* Restore PCI confspace */
c1dd4aa6 5333 amdgpu_device_load_pci_state(pdev);
c9a6b82f 5334
5335 /* confirm ASIC came out of reset */
5336 for (i = 0; i < adev->usec_timeout; i++) {
5337 memsize = amdgpu_asic_get_config_memsize(adev);
5338
5339 if (memsize != 0xffffffff)
5340 break;
5341 udelay(1);
5342 }
5343 if (memsize == 0xffffffff) {
5344 r = -ETIME;
5345 goto out;
5346 }
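	/*
	 * amdgpu_asic_get_config_memsize() reads a register that typically comes
	 * back as all 1s (0xffffffff) while the chip is still held in reset, so
	 * polling it for up to adev->usec_timeout iterations is a cheap
	 * "is the ASIC responding again" check before attempting recovery.
	 */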
5347
5348 reset_context.method = AMD_RESET_METHOD_NONE;
5349 reset_context.reset_req_dev = adev;
5350 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
5351 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
5352
8a11d283 5353 adev->in_pci_err_recovery = true;
04442bf7 5354 r = amdgpu_device_pre_asic_reset(adev, &reset_context);
bf36b52e 5355 adev->in_pci_err_recovery = false;
5356 if (r)
5357 goto out;
5358
04442bf7 5359 r = amdgpu_do_asic_reset(&device_list, &reset_context);
5360
5361out:
c9a6b82f 5362 if (!r) {
5363 if (amdgpu_device_cache_pci_state(adev->pdev))
5364 pci_restore_state(adev->pdev);
5365
5366 DRM_INFO("PCIe error recovery succeeded\n");
5367 } else {
5368 DRM_ERROR("PCIe error recovery failed, err:%d", r);
5369 amdgpu_device_unlock_adev(adev);
5370 }
5371
5372 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5373}
5374
5375/**
5376 * amdgpu_pci_resume() - resume normal ops after PCI reset
5377 * @pdev: pointer to PCI device
5378 *
 5379 * Called when the error recovery driver tells us that it's
505199a3 5380 * OK to resume normal operation.
5381 */
5382void amdgpu_pci_resume(struct pci_dev *pdev)
5383{
5384 struct drm_device *dev = pci_get_drvdata(pdev);
5385 struct amdgpu_device *adev = drm_to_adev(dev);
acd89fca 5386 int i;
c9a6b82f 5387
5388
5389 DRM_INFO("PCI error: resume callback!!\n");
5390
5391 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5392 struct amdgpu_ring *ring = adev->rings[i];
5393
5394 if (!ring || !ring->sched.thread)
5395 continue;
5396
5397
5398 drm_sched_resubmit_jobs(&ring->sched);
5399 drm_sched_start(&ring->sched, true);
5400 }
5401
5402 amdgpu_device_unlock_adev(adev);
c9a6b82f 5403}
5404
5405bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5406{
5407 struct drm_device *dev = pci_get_drvdata(pdev);
5408 struct amdgpu_device *adev = drm_to_adev(dev);
5409 int r;
5410
5411 r = pci_save_state(pdev);
5412 if (!r) {
5413 kfree(adev->pci_state);
5414
5415 adev->pci_state = pci_store_saved_state(pdev);
5416
5417 if (!adev->pci_state) {
5418 DRM_ERROR("Failed to store PCI saved state");
5419 return false;
5420 }
5421 } else {
5422 DRM_WARN("Failed to save PCI state, err:%d\n", r);
5423 return false;
5424 }
5425
5426 return true;
5427}
5428
5429bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
5430{
5431 struct drm_device *dev = pci_get_drvdata(pdev);
5432 struct amdgpu_device *adev = drm_to_adev(dev);
5433 int r;
5434
5435 if (!adev->pci_state)
5436 return false;
5437
5438 r = pci_load_saved_state(pdev, adev->pci_state);
5439
5440 if (!r) {
5441 pci_restore_state(pdev);
5442 } else {
5443 DRM_WARN("Failed to load PCI state, err:%d\n", r);
5444 return false;
5445 }
5446
5447 return true;
5448}
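/*
 * Usage note: amdgpu_device_cache_pci_state() snapshots the PCI config space
 * with pci_save_state()/pci_store_saved_state() (the stored copy is allocated
 * by the PCI core and any previous copy is kfree'd in that function), while
 * amdgpu_device_load_pci_state() replays the snapshot with
 * pci_load_saved_state()/pci_restore_state(). The reset and PCI error recovery
 * paths earlier in this file use the pair to bring config space back after the
 * ASIC has been reset.
 */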
5449
5450