[linux-2.6-block.git] / drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
1/*
2 * Copyright 2008 Advanced Micro Devices, Inc.
3 * Copyright 2008 Red Hat Inc.
4 * Copyright 2009 Jerome Glisse.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 * OTHER DEALINGS IN THE SOFTWARE.
23 *
24 * Authors: Dave Airlie
25 * Alex Deucher
26 * Jerome Glisse
27 */
b1ddf548 28#include <linux/power_supply.h>
0875dc9e 29#include <linux/kthread.h>
fdf2f6c5 30#include <linux/module.h>
31#include <linux/console.h>
32#include <linux/slab.h>
4a74c38c 33#include <linux/iommu.h>
901e2be2 34#include <linux/pci.h>
35#include <linux/devcoredump.h>
36#include <generated/utsrelease.h>
08a2fd23 37#include <linux/pci-p2pdma.h>
d37a3929 38#include <linux/apple-gmux.h>
fdf2f6c5 39
b7cdb41e 40#include <drm/drm_aperture.h>
4562236b 41#include <drm/drm_atomic_helper.h>
973ad627 42#include <drm/drm_crtc_helper.h>
45b64fd9 43#include <drm/drm_fb_helper.h>
fcd70cd3 44#include <drm/drm_probe_helper.h>
45#include <drm/amdgpu_drm.h>
46#include <linux/vgaarb.h>
47#include <linux/vga_switcheroo.h>
48#include <linux/efi.h>
49#include "amdgpu.h"
f4b373f4 50#include "amdgpu_trace.h"
51#include "amdgpu_i2c.h"
52#include "atom.h"
53#include "amdgpu_atombios.h"
a5bde2f9 54#include "amdgpu_atomfirmware.h"
d0dd7f0c 55#include "amd_pcie.h"
56#ifdef CONFIG_DRM_AMDGPU_SI
57#include "si.h"
58#endif
59#ifdef CONFIG_DRM_AMDGPU_CIK
60#include "cik.h"
61#endif
aaa36a97 62#include "vi.h"
460826e6 63#include "soc15.h"
0a5b8c7b 64#include "nv.h"
d38ceaf9 65#include "bif/bif_4_1_d.h"
bec86378 66#include <linux/firmware.h>
89041940 67#include "amdgpu_vf_error.h"
d38ceaf9 68
ba997709 69#include "amdgpu_amdkfd.h"
d2f52ac8 70#include "amdgpu_pm.h"
d38ceaf9 71
5183411b 72#include "amdgpu_xgmi.h"
c030f2e4 73#include "amdgpu_ras.h"
9c7c85f7 74#include "amdgpu_pmu.h"
bd607166 75#include "amdgpu_fru_eeprom.h"
04442bf7 76#include "amdgpu_reset.h"
5183411b 77
d5ea093e 78#include <linux/suspend.h>
c6a6e2db 79#include <drm/task_barrier.h>
3f12acc8 80#include <linux/pm_runtime.h>
d5ea093e 81
82#include <drm/drm_drv.h>
83
84#if IS_ENABLED(CONFIG_X86)
85#include <asm/intel-family.h>
86#endif
87
e2a75f88 88MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
3f76dced 89MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
2d2e5e7e 90MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
ad5a67a7 91MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
54c4d17e 92MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
65e60f6e 93MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
42b325e5 94MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
e2a75f88 95
2dc80b00 96#define AMDGPU_RESUME_MS 2000
97#define AMDGPU_MAX_RETRY_LIMIT 2
98#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
2dc80b00 99
100static const struct drm_driver amdgpu_kms_driver;
101
050091ab 102const char *amdgpu_asic_name[] = {
103 "TAHITI",
104 "PITCAIRN",
105 "VERDE",
106 "OLAND",
107 "HAINAN",
108 "BONAIRE",
109 "KAVERI",
110 "KABINI",
111 "HAWAII",
112 "MULLINS",
113 "TOPAZ",
114 "TONGA",
48299f95 115 "FIJI",
d38ceaf9 116 "CARRIZO",
139f4917 117 "STONEY",
118 "POLARIS10",
119 "POLARIS11",
c4642a47 120 "POLARIS12",
48ff108d 121 "VEGAM",
d4196f01 122 "VEGA10",
8fab806a 123 "VEGA12",
956fcddc 124 "VEGA20",
2ca8a5d2 125 "RAVEN",
d6c3b24e 126 "ARCTURUS",
1eee4228 127 "RENOIR",
d46b417a 128 "ALDEBARAN",
852a6626 129 "NAVI10",
d0f56dc2 130 "CYAN_SKILLFISH",
87dbad02 131 "NAVI14",
9802f5d7 132 "NAVI12",
ccaf72d3 133 "SIENNA_CICHLID",
ddd8fbe7 134 "NAVY_FLOUNDER",
4f1e9a76 135 "VANGOGH",
a2468e04 136 "DIMGREY_CAVEFISH",
6f169591 137 "BEIGE_GOBY",
ee9236b7 138 "YELLOW_CARP",
3ae695d6 139 "IP DISCOVERY",
140 "LAST",
141};
142
143/**
144 * DOC: pcie_replay_count
145 *
146 * The amdgpu driver provides a sysfs API for reporting the total number
147 * of PCIe replays (NAKs).
148 * The file pcie_replay_count is used for this and returns the total
149 * number of replays as a sum of the NAKs generated and NAKs received
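 *
 * An illustrative read of this attribute from userspace (the PCI address
 * below is hypothetical; the file sits under the GPU's PCI device sysfs
 * directory):
 *   cat /sys/bus/pci/devices/0000:03:00.0/pcie_replay_count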
150 */
151
152static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
153 struct device_attribute *attr, char *buf)
154{
155 struct drm_device *ddev = dev_get_drvdata(dev);
1348969a 156 struct amdgpu_device *adev = drm_to_adev(ddev);
157 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
158
36000c7a 159 return sysfs_emit(buf, "%llu\n", cnt);
160}
161
b8920e1e 162static DEVICE_ATTR(pcie_replay_count, 0444,
163 amdgpu_device_get_pcie_replay_count, NULL);
164
165static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
166
bd607166 167
fd496ca8 168/**
b98c6299 169 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
170 *
171 * @dev: drm_device pointer
172 *
b98c6299 173 * Returns true if the device is a dGPU with ATPX power control,
174 * otherwise return false.
175 */
b98c6299 176bool amdgpu_device_supports_px(struct drm_device *dev)
177{
178 struct amdgpu_device *adev = drm_to_adev(dev);
179
b98c6299 180 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
181 return true;
182 return false;
183}
184
e3ecdffa 185/**
0330b848 186 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
187 *
188 * @dev: drm_device pointer
189 *
b98c6299 190 * Returns true if the device is a dGPU with ACPI power control,
191 * otherwise return false.
192 */
31af062a 193bool amdgpu_device_supports_boco(struct drm_device *dev)
d38ceaf9 194{
1348969a 195 struct amdgpu_device *adev = drm_to_adev(dev);
d38ceaf9 196
197 if (adev->has_pr3 ||
198 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
199 return true;
200 return false;
201}
202
203/**
204 * amdgpu_device_supports_baco - Does the device support BACO
205 *
206 * @dev: drm_device pointer
207 *
208 * Returns true if the device supports BACO,
209 * otherwise return false.
210 */
211bool amdgpu_device_supports_baco(struct drm_device *dev)
212{
1348969a 213 struct amdgpu_device *adev = drm_to_adev(dev);
214
215 return amdgpu_asic_supports_baco(adev);
216}
217
218/**
219 * amdgpu_device_supports_smart_shift - Is the device dGPU with
220 * smart shift support
221 *
222 * @dev: drm_device pointer
223 *
224 * Returns true if the device is a dGPU with Smart Shift support,
225 * otherwise returns false.
226 */
227bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
228{
229 return (amdgpu_device_supports_boco(dev) &&
230 amdgpu_acpi_is_power_shift_control_supported());
231}
232
233/*
234 * VRAM access helper functions
235 */
236
e35e2b11 237/**
048af66b 238 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
239 *
240 * @adev: amdgpu_device pointer
241 * @pos: offset of the buffer in vram
242 * @buf: virtual address of the buffer in system memory
243 * @size: read/write size, sizeof(@buf) must be > @size
244 * @write: true - write to vram, otherwise - read from vram
245 */
246void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
247 void *buf, size_t size, bool write)
e35e2b11 248{
e35e2b11 249 unsigned long flags;
250 uint32_t hi = ~0, tmp = 0;
251 uint32_t *data = buf;
ce05ac56 252 uint64_t last;
f89f8c6b 253 int idx;
ce05ac56 254
c58a863b 255 if (!drm_dev_enter(adev_to_drm(adev), &idx))
f89f8c6b 256 return;
9d11eb0d 257
258 BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));
259
260 spin_lock_irqsave(&adev->mmio_idx_lock, flags);
261 for (last = pos + size; pos < last; pos += 4) {
262 tmp = pos >> 31;
263
264 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
265 if (tmp != hi) {
266 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
267 hi = tmp;
268 }
269 if (write)
270 WREG32_NO_KIQ(mmMM_DATA, *data++);
271 else
272 *data++ = RREG32_NO_KIQ(mmMM_DATA);
273 }
274
275 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
276 drm_dev_exit(idx);
277}
278
279/**
280 * amdgpu_device_aper_access - access vram by the vram aperture
281 *
282 * @adev: amdgpu_device pointer
283 * @pos: offset of the buffer in vram
284 * @buf: virtual address of the buffer in system memory
285 * @size: read/write size, sizeof(@buf) must be > @size
286 * @write: true - write to vram, otherwise - read from vram
287 *
288 * The return value means how many bytes have been transferred.
289 */
290size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
291 void *buf, size_t size, bool write)
292{
9d11eb0d 293#ifdef CONFIG_64BIT
294 void __iomem *addr;
295 size_t count = 0;
296 uint64_t last;
297
298 if (!adev->mman.aper_base_kaddr)
299 return 0;
300
301 last = min(pos + size, adev->gmc.visible_vram_size);
302 if (last > pos) {
303 addr = adev->mman.aper_base_kaddr + pos;
304 count = last - pos;
305
306 if (write) {
307 memcpy_toio(addr, buf, count);
308 /* Make sure HDP write cache flush happens without any reordering
309 * after the system memory contents are sent over PCIe device
310 */
9d11eb0d 311 mb();
810085dd 312 amdgpu_device_flush_hdp(adev, NULL);
9d11eb0d 313 } else {
810085dd 314 amdgpu_device_invalidate_hdp(adev, NULL);
315 /* Make sure HDP read cache is invalidated before issuing a read
316 * to the PCIe device
317 */
318 mb();
319 memcpy_fromio(buf, addr, count);
320 }
321
9d11eb0d 322 }
323
324 return count;
325#else
326 return 0;
9d11eb0d 327#endif
048af66b 328}
9d11eb0d 329
330/**
331 * amdgpu_device_vram_access - read/write a buffer in vram
332 *
333 * @adev: amdgpu_device pointer
334 * @pos: offset of the buffer in vram
335 * @buf: virtual address of the buffer in system memory
336 * @size: read/write size, sizeof(@buf) must be > @size
337 * @write: true - write to vram, otherwise - read from vram
338 */
339void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
340 void *buf, size_t size, bool write)
341{
342 size_t count;
e35e2b11 343
344 /* try using the vram aperture to access vram first */
345 count = amdgpu_device_aper_access(adev, pos, buf, size, write);
346 size -= count;
347 if (size) {
348 /* use MM to access the rest of vram */
349 pos += count;
350 buf += count;
351 amdgpu_device_mm_access(adev, pos, buf, size, write);
352 }
353}
354
d38ceaf9 355/*
f7ee1874 356 * register access helper functions.
d38ceaf9 357 */
358
359/* Check if hw access should be skipped because of hotplug or device error */
360bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
361{
7afefb81 362 if (adev->no_hw_access)
363 return true;
364
365#ifdef CONFIG_LOCKDEP
366 /*
367 * This is a bit complicated to understand, so worth a comment. What we assert
368 * here is that the GPU reset is not running on another thread in parallel.
369 *
370 * For this we trylock the read side of the reset semaphore, if that succeeds
371 * we know that the reset is not running in parallel.
372 *
373 * If the trylock fails we assert that we are either already holding the read
374 * side of the lock or are the reset thread itself and hold the write side of
375 * the lock.
376 */
377 if (in_task()) {
378 if (down_read_trylock(&adev->reset_domain->sem))
379 up_read(&adev->reset_domain->sem);
56b53c0b 380 else
d0fb18b5 381 lockdep_assert_held(&adev->reset_domain->sem);
382 }
383#endif
384 return false;
385}
386
e3ecdffa 387/**
f7ee1874 388 * amdgpu_device_rreg - read a memory mapped IO or indirect register
389 *
390 * @adev: amdgpu_device pointer
391 * @reg: dword aligned register offset
392 * @acc_flags: access flags which require special behavior
393 *
394 * Returns the 32 bit value from the offset specified.
395 */
396uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
397 uint32_t reg, uint32_t acc_flags)
d38ceaf9 398{
399 uint32_t ret;
400
56b53c0b 401 if (amdgpu_device_skip_hw_access(adev))
402 return 0;
403
404 if ((reg * 4) < adev->rmmio_size) {
405 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
406 amdgpu_sriov_runtime(adev) &&
d0fb18b5 407 down_read_trylock(&adev->reset_domain->sem)) {
f7ee1874 408 ret = amdgpu_kiq_rreg(adev, reg);
d0fb18b5 409 up_read(&adev->reset_domain->sem);
410 } else {
411 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
412 }
413 } else {
414 ret = adev->pcie_rreg(adev, reg * 4);
81202807 415 }
bc992ba5 416
f7ee1874 417 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
e78b579d 418
f4b373f4 419 return ret;
420}
421
422/*
423 * MMIO register read with bytes helper functions
424 * @offset: byte offset from MMIO start
b8920e1e 425 */
421a2a30 426
427/**
428 * amdgpu_mm_rreg8 - read a memory mapped IO register
429 *
430 * @adev: amdgpu_device pointer
431 * @offset: byte aligned register offset
432 *
433 * Returns the 8 bit value from the offset specified.
434 */
435uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
436{
56b53c0b 437 if (amdgpu_device_skip_hw_access(adev))
438 return 0;
439
440 if (offset < adev->rmmio_size)
441 return (readb(adev->rmmio + offset));
442 BUG();
443}
444
445/*
446 * MMIO register write with bytes helper functions
447 * @offset: byte offset from MMIO start
448 * @value: the value to be written to the register
449 */
450
451/**
452 * amdgpu_mm_wreg8 - write a memory mapped IO register
453 *
454 * @adev: amdgpu_device pointer
455 * @offset: byte aligned register offset
456 * @value: 8 bit value to write
457 *
458 * Writes the value specified to the offset specified.
459 */
460void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
461{
56b53c0b 462 if (amdgpu_device_skip_hw_access(adev))
463 return;
464
465 if (offset < adev->rmmio_size)
466 writeb(value, adev->rmmio + offset);
467 else
468 BUG();
469}
470
e3ecdffa 471/**
f7ee1874 472 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
473 *
474 * @adev: amdgpu_device pointer
475 * @reg: dword aligned register offset
476 * @v: 32 bit value to write to the register
477 * @acc_flags: access flags which require special behavior
478 *
479 * Writes the value specified to the offset specified.
480 */
481void amdgpu_device_wreg(struct amdgpu_device *adev,
482 uint32_t reg, uint32_t v,
483 uint32_t acc_flags)
d38ceaf9 484{
56b53c0b 485 if (amdgpu_device_skip_hw_access(adev))
486 return;
487
488 if ((reg * 4) < adev->rmmio_size) {
489 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
490 amdgpu_sriov_runtime(adev) &&
d0fb18b5 491 down_read_trylock(&adev->reset_domain->sem)) {
f7ee1874 492 amdgpu_kiq_wreg(adev, reg, v);
d0fb18b5 493 up_read(&adev->reset_domain->sem);
494 } else {
495 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
496 }
497 } else {
498 adev->pcie_wreg(adev, reg * 4, v);
81202807 499 }
bc992ba5 500
f7ee1874 501 trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
2e0cc4d4 502}
d38ceaf9 503
03f2abb0 504/**
4cc9f86f 505 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
2e0cc4d4 506 *
507 * @adev: amdgpu_device pointer
508 * @reg: mmio/rlc register
509 * @v: value to write
510 *
511 * this function is invoked only for the debugfs register access
03f2abb0 512 */
f7ee1874 513void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
514 uint32_t reg, uint32_t v,
515 uint32_t xcc_id)
2e0cc4d4 516{
56b53c0b 517 if (amdgpu_device_skip_hw_access(adev))
518 return;
519
2e0cc4d4 520 if (amdgpu_sriov_fullaccess(adev) &&
521 adev->gfx.rlc.funcs &&
522 adev->gfx.rlc.funcs->is_rlcg_access_range) {
2e0cc4d4 523 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
8ed49dd1 524 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
525 } else if ((reg * 4) >= adev->rmmio_size) {
526 adev->pcie_wreg(adev, reg * 4, v);
527 } else {
528 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
47ed4e1c 529 }
530}
531
532/**
533 * amdgpu_device_indirect_rreg - read an indirect register
534 *
535 * @adev: amdgpu_device pointer
22f453fb 536 * @reg_addr: indirect register address to read from
537 *
538 * Returns the value of indirect register @reg_addr
539 */
540u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
541 u32 reg_addr)
542{
65ba96e9 543 unsigned long flags, pcie_index, pcie_data;
544 void __iomem *pcie_index_offset;
545 void __iomem *pcie_data_offset;
546 u32 r;
547
548 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
549 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
550
551 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
552 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
553 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
554
555 writel(reg_addr, pcie_index_offset);
556 readl(pcie_index_offset);
557 r = readl(pcie_data_offset);
558 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
559
560 return r;
561}
562
563u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
564 u64 reg_addr)
565{
566 unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
567 u32 r;
568 void __iomem *pcie_index_offset;
569 void __iomem *pcie_index_hi_offset;
570 void __iomem *pcie_data_offset;
571
572 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
573 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
574 if (adev->nbio.funcs->get_pcie_index_hi_offset)
575 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
576 else
577 pcie_index_hi = 0;
578
579 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
580 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
581 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
582 if (pcie_index_hi != 0)
583 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
584 pcie_index_hi * 4;
585
586 writel(reg_addr, pcie_index_offset);
587 readl(pcie_index_offset);
588 if (pcie_index_hi != 0) {
589 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
590 readl(pcie_index_hi_offset);
591 }
592 r = readl(pcie_data_offset);
593
594 /* clear the high bits */
595 if (pcie_index_hi != 0) {
596 writel(0, pcie_index_hi_offset);
597 readl(pcie_index_hi_offset);
598 }
599
600 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
601
602 return r;
603}
604
605/**
606 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
607 *
608 * @adev: amdgpu_device pointer
22f453fb 609 * @reg_addr: indirect register address to read from
610 *
611 * Returns the value of indirect register @reg_addr
612 */
613u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
614 u32 reg_addr)
615{
65ba96e9 616 unsigned long flags, pcie_index, pcie_data;
617 void __iomem *pcie_index_offset;
618 void __iomem *pcie_data_offset;
619 u64 r;
620
621 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
622 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
623
624 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
625 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
626 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
627
628 /* read low 32 bits */
629 writel(reg_addr, pcie_index_offset);
630 readl(pcie_index_offset);
631 r = readl(pcie_data_offset);
632 /* read high 32 bits */
633 writel(reg_addr + 4, pcie_index_offset);
634 readl(pcie_index_offset);
635 r |= ((u64)readl(pcie_data_offset) << 32);
636 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
637
638 return r;
639}
640
641/**
642 * amdgpu_device_indirect_wreg - write an indirect register address
643 *
644 * @adev: amdgpu_device pointer
645 * @reg_addr: indirect register offset
646 * @reg_data: indirect register data
647 *
648 */
649void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
650 u32 reg_addr, u32 reg_data)
651{
65ba96e9 652 unsigned long flags, pcie_index, pcie_data;
653 void __iomem *pcie_index_offset;
654 void __iomem *pcie_data_offset;
655
656 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
657 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
658
659 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
660 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
661 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
662
663 writel(reg_addr, pcie_index_offset);
664 readl(pcie_index_offset);
665 writel(reg_data, pcie_data_offset);
666 readl(pcie_data_offset);
667 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
668}
669
670void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
671 u64 reg_addr, u32 reg_data)
672{
673 unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
674 void __iomem *pcie_index_offset;
675 void __iomem *pcie_index_hi_offset;
676 void __iomem *pcie_data_offset;
677
678 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
679 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
680 if (adev->nbio.funcs->get_pcie_index_hi_offset)
681 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
682 else
683 pcie_index_hi = 0;
684
685 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
686 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
687 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
688 if (pcie_index_hi != 0)
689 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
690 pcie_index_hi * 4;
691
692 writel(reg_addr, pcie_index_offset);
693 readl(pcie_index_offset);
694 if (pcie_index_hi != 0) {
695 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
696 readl(pcie_index_hi_offset);
697 }
698 writel(reg_data, pcie_data_offset);
699 readl(pcie_data_offset);
700
701 /* clear the high bits */
702 if (pcie_index_hi != 0) {
703 writel(0, pcie_index_hi_offset);
704 readl(pcie_index_hi_offset);
705 }
706
707 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
708}
709
710/**
711 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
712 *
713 * @adev: amdgpu_device pointer
714 * @reg_addr: indirect register offset
715 * @reg_data: indirect register data
716 *
717 */
718void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
719 u32 reg_addr, u64 reg_data)
720{
65ba96e9 721 unsigned long flags, pcie_index, pcie_data;
722 void __iomem *pcie_index_offset;
723 void __iomem *pcie_data_offset;
724
725 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
726 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
727
728 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
729 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
730 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
731
732 /* write low 32 bits */
733 writel(reg_addr, pcie_index_offset);
734 readl(pcie_index_offset);
735 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
736 readl(pcie_data_offset);
737 /* write high 32 bits */
738 writel(reg_addr + 4, pcie_index_offset);
739 readl(pcie_index_offset);
740 writel((u32)(reg_data >> 32), pcie_data_offset);
741 readl(pcie_data_offset);
742 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
743}
744
745/**
746 * amdgpu_device_get_rev_id - query device rev_id
747 *
748 * @adev: amdgpu_device pointer
749 *
750 * Return device rev_id
751 */
752u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
753{
754 return adev->nbio.funcs->get_rev_id(adev);
755}
756
757/**
758 * amdgpu_invalid_rreg - dummy reg read function
759 *
982a820b 760 * @adev: amdgpu_device pointer
d38ceaf9
AD
761 * @reg: offset of register
762 *
763 * Dummy register read function. Used for register blocks
764 * that certain asics don't have (all asics).
765 * Returns the value in the register.
766 */
767static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
768{
769 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
770 BUG();
771 return 0;
772}
773
774static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
775{
776 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
777 BUG();
778 return 0;
779}
780
781/**
782 * amdgpu_invalid_wreg - dummy reg write function
783 *
982a820b 784 * @adev: amdgpu_device pointer
785 * @reg: offset of register
786 * @v: value to write to the register
787 *
788 * Dummy register write function. Used for register blocks
789 * that certain asics don't have (all asics).
790 */
791static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
792{
793 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
794 reg, v);
795 BUG();
796}
797
798static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
799{
800 DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
801 reg, v);
802 BUG();
803}
804
805/**
806 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
807 *
982a820b 808 * @adev: amdgpu_device pointer
809 * @reg: offset of register
810 *
811 * Dummy register read function. Used for register blocks
812 * that certain asics don't have (all asics).
813 * Returns the value in the register.
814 */
815static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
816{
817 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
818 BUG();
819 return 0;
820}
821
822/**
823 * amdgpu_invalid_wreg64 - dummy reg write function
824 *
982a820b 825 * @adev: amdgpu_device pointer
826 * @reg: offset of register
827 * @v: value to write to the register
828 *
829 * Dummy register write function. Used for register blocks
830 * that certain asics don't have (all asics).
831 */
832static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
833{
834 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
835 reg, v);
836 BUG();
837}
838
839/**
840 * amdgpu_block_invalid_rreg - dummy reg read function
841 *
982a820b 842 * @adev: amdgpu_device pointer
843 * @block: offset of instance
844 * @reg: offset of register
845 *
846 * Dummy register read function. Used for register blocks
847 * that certain asics don't have (all asics).
848 * Returns the value in the register.
849 */
850static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
851 uint32_t block, uint32_t reg)
852{
853 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
854 reg, block);
855 BUG();
856 return 0;
857}
858
859/**
860 * amdgpu_block_invalid_wreg - dummy reg write function
861 *
982a820b 862 * @adev: amdgpu_device pointer
863 * @block: offset of instance
864 * @reg: offset of register
865 * @v: value to write to the register
866 *
867 * Dummy register write function. Used for register blocks
868 * that certain asics don't have (all asics).
869 */
870static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
871 uint32_t block,
872 uint32_t reg, uint32_t v)
873{
874 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
875 reg, block, v);
876 BUG();
877}
878
879/**
880 * amdgpu_device_asic_init - Wrapper for atom asic_init
881 *
982a820b 882 * @adev: amdgpu_device pointer
883 *
884 * Does any asic specific work and then calls atom asic init.
885 */
886static int amdgpu_device_asic_init(struct amdgpu_device *adev)
887{
888 int ret;
889
890 amdgpu_asic_pre_asic_init(adev);
891
5db392a0 892 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 3) ||
893 adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0)) {
894 amdgpu_psp_wait_for_bootloader(adev);
895 ret = amdgpu_atomfirmware_asic_init(adev, true);
896 return ret;
897 } else {
85d1bcc6 898 return amdgpu_atom_asic_init(adev->mode_info.atom_context);
899 }
900
901 return 0;
902}
903
e3ecdffa 904/**
7ccfd79f 905 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
e3ecdffa 906 *
982a820b 907 * @adev: amdgpu_device pointer
908 *
909 * Allocates a scratch page of VRAM for use by various things in the
910 * driver.
911 */
7ccfd79f 912static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
d38ceaf9 913{
914 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
915 AMDGPU_GEM_DOMAIN_VRAM |
916 AMDGPU_GEM_DOMAIN_GTT,
917 &adev->mem_scratch.robj,
918 &adev->mem_scratch.gpu_addr,
919 (void **)&adev->mem_scratch.ptr);
920}
921
e3ecdffa 922/**
7ccfd79f 923 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
e3ecdffa 924 *
982a820b 925 * @adev: amdgpu_device pointer
926 *
927 * Frees the VRAM scratch page.
928 */
7ccfd79f 929static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
d38ceaf9 930{
7ccfd79f 931 amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
932}
933
934/**
9c3f2b54 935 * amdgpu_device_program_register_sequence - program an array of registers.
936 *
937 * @adev: amdgpu_device pointer
938 * @registers: pointer to the register array
939 * @array_size: size of the register array
940 *
941 * Programs an array of registers with and/or masks.
942 * This is a helper for setting golden registers.
943 */
944void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
945 const u32 *registers,
946 const u32 array_size)
947{
948 u32 tmp, reg, and_mask, or_mask;
949 int i;
950
951 if (array_size % 3)
952 return;
953
47fc644f 954 for (i = 0; i < array_size; i += 3) {
955 reg = registers[i + 0];
956 and_mask = registers[i + 1];
957 or_mask = registers[i + 2];
958
959 if (and_mask == 0xffffffff) {
960 tmp = or_mask;
961 } else {
962 tmp = RREG32(reg);
963 tmp &= ~and_mask;
964 if (adev->family >= AMDGPU_FAMILY_AI)
965 tmp |= (or_mask & and_mask);
966 else
967 tmp |= or_mask;
968 }
969 WREG32(reg, tmp);
970 }
971}
972
973/**
974 * amdgpu_device_pci_config_reset - reset the GPU
975 *
976 * @adev: amdgpu_device pointer
977 *
978 * Resets the GPU using the pci config reset sequence.
979 * Only applicable to asics prior to vega10.
980 */
8111c387 981void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
982{
983 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
984}
985
986/**
987 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
988 *
989 * @adev: amdgpu_device pointer
990 *
991 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
992 */
993int amdgpu_device_pci_reset(struct amdgpu_device *adev)
994{
995 return pci_reset_function(adev->pdev);
996}
997
d38ceaf9 998/*
06ec9070 999 * amdgpu_device_wb_*()
455a7bc2 1000 * Writeback is the method by which the GPU updates special pages in memory
1001 * with the status of certain GPU events (fences, ring pointers, etc.).
1002 */
1003
1004/**
06ec9070 1005 * amdgpu_device_wb_fini - Disable Writeback and free memory
1006 *
1007 * @adev: amdgpu_device pointer
1008 *
1009 * Disables Writeback and frees the Writeback memory (all asics).
1010 * Used at driver shutdown.
1011 */
06ec9070 1012static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
1013{
1014 if (adev->wb.wb_obj) {
1015 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
1016 &adev->wb.gpu_addr,
1017 (void **)&adev->wb.wb);
1018 adev->wb.wb_obj = NULL;
1019 }
1020}
1021
1022/**
03f2abb0 1023 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
1024 *
1025 * @adev: amdgpu_device pointer
1026 *
455a7bc2 1027 * Initializes writeback and allocates writeback memory (all asics).
1028 * Used at driver startup.
1029 * Returns 0 on success or a negative error code on failure.
1030 */
06ec9070 1031static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1032{
1033 int r;
1034
1035 if (adev->wb.wb_obj == NULL) {
1036 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1037 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1038 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1039 &adev->wb.wb_obj, &adev->wb.gpu_addr,
1040 (void **)&adev->wb.wb);
1041 if (r) {
1042 dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1043 return r;
1044 }
1045
1046 adev->wb.num_wb = AMDGPU_MAX_WB;
1047 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1048
1049 /* clear wb memory */
73469585 1050 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1051 }
1052
1053 return 0;
1054}
1055
1056/**
131b4b36 1057 * amdgpu_device_wb_get - Allocate a wb entry
1058 *
1059 * @adev: amdgpu_device pointer
1060 * @wb: wb index
1061 *
1062 * Allocate a wb slot for use by the driver (all asics).
1063 * Returns 0 on success or -EINVAL on failure.
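 *
 * A minimal usage sketch (illustrative only, not taken from this file); the
 * returned index is a dword offset into the writeback page:
 *   u32 wb;
 *   if (!amdgpu_device_wb_get(adev, &wb)) {
 *           u32 val = adev->wb.wb[wb];
 *           amdgpu_device_wb_free(adev, wb);
 *   }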
1064 */
131b4b36 1065int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1066{
1067 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
d38ceaf9 1068
97407b63 1069 if (offset < adev->wb.num_wb) {
7014285a 1070 __set_bit(offset, adev->wb.used);
63ae07ca 1071 *wb = offset << 3; /* convert to dw offset */
1072 return 0;
1073 } else {
1074 return -EINVAL;
1075 }
1076}
1077
d38ceaf9 1078/**
131b4b36 1079 * amdgpu_device_wb_free - Free a wb entry
1080 *
1081 * @adev: amdgpu_device pointer
1082 * @wb: wb index
1083 *
1084 * Free a wb slot allocated for use by the driver (all asics)
1085 */
131b4b36 1086void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
d38ceaf9 1087{
73469585 1088 wb >>= 3;
d38ceaf9 1089 if (wb < adev->wb.num_wb)
73469585 1090 __clear_bit(wb, adev->wb.used);
1091}
1092
1093/**
1094 * amdgpu_device_resize_fb_bar - try to resize FB BAR
1095 *
1096 * @adev: amdgpu_device pointer
1097 *
1098 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1099 * to fail, but if any of the BARs is not accessible after the resize we abort
1100 * driver loading by returning -ENODEV.
1101 */
1102int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1103{
453f617a 1104 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
1105 struct pci_bus *root;
1106 struct resource *res;
b8920e1e 1107 unsigned int i;
1108 u16 cmd;
1109 int r;
1110
1111 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
1112 return 0;
1113
0c03b912 1114 /* Bypass for VF */
1115 if (amdgpu_sriov_vf(adev))
1116 return 0;
1117
1118 /* skip if the bios has already enabled large BAR */
1119 if (adev->gmc.real_vram_size &&
1120 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1121 return 0;
1122
1123 /* Check if the root BUS has 64bit memory resources */
1124 root = adev->pdev->bus;
1125 while (root->parent)
1126 root = root->parent;
1127
1128 pci_bus_for_each_resource(root, res, i) {
0ebb7c54 1129 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
31b8adab
CK
1130 res->start > 0x100000000ull)
1131 break;
1132 }
1133
1134 /* Trying to resize is pointless without a root hub window above 4GB */
1135 if (!res)
1136 return 0;
1137
1138 /* Limit the BAR size to what is available */
1139 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
1140 rbar_size);
1141
1142 /* Disable memory decoding while we change the BAR addresses and size */
1143 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1144 pci_write_config_word(adev->pdev, PCI_COMMAND,
1145 cmd & ~PCI_COMMAND_MEMORY);
1146
1147 /* Free the VRAM and doorbell BAR, we most likely need to move both. */
43c064db 1148 amdgpu_doorbell_fini(adev);
1149 if (adev->asic_type >= CHIP_BONAIRE)
1150 pci_release_resource(adev->pdev, 2);
1151
1152 pci_release_resource(adev->pdev, 0);
1153
1154 r = pci_resize_resource(adev->pdev, 0, rbar_size);
1155 if (r == -ENOSPC)
1156 DRM_INFO("Not enough PCI address space for a large BAR.");
1157 else if (r && r != -ENOTSUPP)
1158 DRM_ERROR("Problem resizing BAR0 (%d).", r);
1159
1160 pci_assign_unassigned_bus_resources(adev->pdev->bus);
1161
1162 /* When the doorbell or fb BAR isn't available we have no chance of
1163 * using the device.
1164 */
43c064db 1165 r = amdgpu_doorbell_init(adev);
1166 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1167 return -ENODEV;
1168
1169 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1170
1171 return 0;
1172}
a05502e5 1173
1174static bool amdgpu_device_read_bios(struct amdgpu_device *adev)
1175{
b8920e1e 1176 if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
9535a86a 1177 return false;
1178
1179 return true;
1180}
1181
1182/*
1183 * GPU helpers function.
1184 */
1185/**
1186 * amdgpu_device_need_post - check if the hw needs post or not
1187 *
1188 * @adev: amdgpu_device pointer
1189 *
1190 * Check if the asic has been initialized (all asics) at driver startup
1191 * or post is needed if hw reset is performed.
1192 * Returns true if post is needed or false if not.
d38ceaf9 1193 */
39c640c0 1194bool amdgpu_device_need_post(struct amdgpu_device *adev)
1195{
1196 uint32_t reg;
1197
1198 if (amdgpu_sriov_vf(adev))
1199 return false;
1200
1201 if (!amdgpu_device_read_bios(adev))
1202 return false;
1203
bec86378 1204 if (amdgpu_passthrough(adev)) {
1205 /* for FIJI: In the whole GPU pass-through virtualization case, after VM reboot
1206 * some old SMC firmware still needs the driver to do vPost, otherwise the GPU hangs, while
1207 * SMC firmware versions above 22.15 don't have this flaw, so we force
1208 * vPost to be executed for SMC versions below 22.15
1209 */
1210 if (adev->asic_type == CHIP_FIJI) {
1211 int err;
1212 uint32_t fw_ver;
b8920e1e 1213
1214 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1215 /* force vPost if error occurred */
1216 if (err)
1217 return true;
1218
1219 fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1220 if (fw_ver < 0x00160e00)
1221 return true;
bec86378 1222 }
bec86378 1223 }
91fe77eb 1224
e3c1b071 1225 /* Don't post if we need to reset whole hive on init */
1226 if (adev->gmc.xgmi.pending_reset)
1227 return false;
1228
91fe77eb 1229 if (adev->has_hw_reset) {
1230 adev->has_hw_reset = false;
1231 return true;
1232 }
1233
1234 /* bios scratch used on CIK+ */
1235 if (adev->asic_type >= CHIP_BONAIRE)
1236 return amdgpu_atombios_scratch_need_asic_init(adev);
1237
1238 /* check MEM_SIZE for older asics */
1239 reg = amdgpu_asic_get_config_memsize(adev);
1240
1241 if ((reg != 0) && (reg != 0xffffffff))
1242 return false;
1243
1244 return true;
1245}
1246
1247/*
1248 * On APUs with >= 64GB white flickering has been observed w/ SG enabled.
1249 * Disable S/G on such systems until we have a proper fix.
1250 * https://gitlab.freedesktop.org/drm/amd/-/issues/2354
1251 * https://gitlab.freedesktop.org/drm/amd/-/issues/2735
1252 */
1253bool amdgpu_sg_display_supported(struct amdgpu_device *adev)
1254{
1255 switch (amdgpu_sg_display) {
1256 case -1:
1257 break;
1258 case 0:
1259 return false;
1260 case 1:
1261 return true;
1262 default:
1263 return false;
1264 }
1265 if ((totalram_pages() << (PAGE_SHIFT - 10)) +
1266 (adev->gmc.real_vram_size / 1024) >= 64000000) {
1267 DRM_WARN("Disabling S/G due to >=64GB RAM\n");
1268 return false;
1269 }
1270 return true;
1271}
1272
1273/*
1274 * Intel hosts such as Raptor Lake and Sapphire Rapids don't support dynamic
1275 * speed switching. Until we have confirmation from Intel that a specific host
1276 * supports it, it's safer that we keep it disabled for all.
1277 *
1278 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
1279 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
1280 */
1281bool amdgpu_device_pcie_dynamic_switching_supported(void)
1282{
1283#if IS_ENABLED(CONFIG_X86)
1284 struct cpuinfo_x86 *c = &cpu_data(0);
1285
1286 if (c->x86_vendor == X86_VENDOR_INTEL)
1287 return false;
1288#endif
1289 return true;
1290}
1291
1292/**
1293 * amdgpu_device_should_use_aspm - check if the device should program ASPM
1294 *
1295 * @adev: amdgpu_device pointer
1296 *
1297 * Confirm whether the module parameter and pcie bridge agree that ASPM should
1298 * be set for this device.
1299 *
1300 * Returns true if it should be used or false if not.
1301 */
1302bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
1303{
1304 switch (amdgpu_aspm) {
1305 case -1:
1306 break;
1307 case 0:
1308 return false;
1309 case 1:
1310 return true;
1311 default:
1312 return false;
1313 }
1314 return pcie_aspm_enabled(adev->pdev);
1315}
1316
1317bool amdgpu_device_aspm_support_quirk(void)
1318{
1319#if IS_ENABLED(CONFIG_X86)
1320 struct cpuinfo_x86 *c = &cpu_data(0);
1321
1322 return !(c->x86 == 6 && c->x86_model == INTEL_FAM6_ALDERLAKE);
1323#else
1324 return true;
1325#endif
1326}
1327
1328/* if we get transitioned to only one device, take VGA back */
1329/**
06ec9070 1330 * amdgpu_device_vga_set_decode - enable/disable vga decode
d38ceaf9 1331 *
bf44e8ce 1332 * @pdev: PCI device pointer
1333 * @state: enable/disable vga decode
1334 *
1335 * Enable/disable vga decode (all asics).
1336 * Returns VGA resource flags.
1337 */
1338static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
1339 bool state)
d38ceaf9 1340{
bf44e8ce 1341 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));
b8920e1e 1342
1343 amdgpu_asic_set_vga_state(adev, state);
1344 if (state)
1345 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1346 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1347 else
1348 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1349}
1350
1351/**
1352 * amdgpu_device_check_block_size - validate the vm block size
1353 *
1354 * @adev: amdgpu_device pointer
1355 *
1356 * Validates the vm block size specified via module parameter.
1357 * The vm block size defines number of bits in page table versus page directory,
1358 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1359 * page table and the remaining bits are in the page directory.
1360 */
06ec9070 1361static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1362{
1363 /* defines number of bits in page table versus page directory,
1364 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1365 * page table and the remaining bits are in the page directory
1366 */
1367 if (amdgpu_vm_block_size == -1)
1368 return;
a1adf8be 1369
bab4fee7 1370 if (amdgpu_vm_block_size < 9) {
1371 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1372 amdgpu_vm_block_size);
97489129 1373 amdgpu_vm_block_size = -1;
a1adf8be 1374 }
1375}
1376
1377/**
1378 * amdgpu_device_check_vm_size - validate the vm size
1379 *
1380 * @adev: amdgpu_device pointer
1381 *
1382 * Validates the vm size in GB specified via module parameter.
1383 * The VM size is the size of the GPU virtual memory space in GB.
1384 */
06ec9070 1385static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
83ca145d 1386{
1387 /* no need to check the default value */
1388 if (amdgpu_vm_size == -1)
1389 return;
1390
1391 if (amdgpu_vm_size < 1) {
1392 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1393 amdgpu_vm_size);
f3368128 1394 amdgpu_vm_size = -1;
83ca145d 1395 }
1396}
1397
1398static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1399{
1400 struct sysinfo si;
a9d4fe2f 1401 bool is_os_64 = (sizeof(void *) == 8);
1402 uint64_t total_memory;
1403 uint64_t dram_size_seven_GB = 0x1B8000000;
1404 uint64_t dram_size_three_GB = 0xB8000000;
1405
1406 if (amdgpu_smu_memory_pool_size == 0)
1407 return;
1408
1409 if (!is_os_64) {
1410 DRM_WARN("Not 64-bit OS, feature not supported\n");
1411 goto def_value;
1412 }
1413 si_meminfo(&si);
1414 total_memory = (uint64_t)si.totalram * si.mem_unit;
1415
1416 if ((amdgpu_smu_memory_pool_size == 1) ||
1417 (amdgpu_smu_memory_pool_size == 2)) {
1418 if (total_memory < dram_size_three_GB)
1419 goto def_value1;
1420 } else if ((amdgpu_smu_memory_pool_size == 4) ||
1421 (amdgpu_smu_memory_pool_size == 8)) {
1422 if (total_memory < dram_size_seven_GB)
1423 goto def_value1;
1424 } else {
1425 DRM_WARN("Smu memory pool size not supported\n");
1426 goto def_value;
1427 }
1428 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1429
1430 return;
1431
1432def_value1:
1433 DRM_WARN("Not enough system memory\n");
1434def_value:
1435 adev->pm.smu_prv_buffer_size = 0;
1436}
1437
1438static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
1439{
1440 if (!(adev->flags & AMD_IS_APU) ||
1441 adev->asic_type < CHIP_RAVEN)
1442 return 0;
1443
1444 switch (adev->asic_type) {
1445 case CHIP_RAVEN:
1446 if (adev->pdev->device == 0x15dd)
1447 adev->apu_flags |= AMD_APU_IS_RAVEN;
1448 if (adev->pdev->device == 0x15d8)
1449 adev->apu_flags |= AMD_APU_IS_PICASSO;
1450 break;
1451 case CHIP_RENOIR:
1452 if ((adev->pdev->device == 0x1636) ||
1453 (adev->pdev->device == 0x164c))
1454 adev->apu_flags |= AMD_APU_IS_RENOIR;
1455 else
1456 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
1457 break;
1458 case CHIP_VANGOGH:
1459 adev->apu_flags |= AMD_APU_IS_VANGOGH;
1460 break;
1461 case CHIP_YELLOW_CARP:
1462 break;
d0f56dc2 1463 case CHIP_CYAN_SKILLFISH:
1464 if ((adev->pdev->device == 0x13FE) ||
1465 (adev->pdev->device == 0x143F))
1466 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
1467 break;
9f6a7857 1468 default:
4eaf21b7 1469 break;
1470 }
1471
1472 return 0;
1473}
1474
d38ceaf9 1475/**
06ec9070 1476 * amdgpu_device_check_arguments - validate module params
1477 *
1478 * @adev: amdgpu_device pointer
1479 *
1480 * Validates certain module parameters and updates
1481 * the associated values used by the driver (all asics).
1482 */
912dfc84 1483static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
d38ceaf9 1484{
1485 if (amdgpu_sched_jobs < 4) {
1486 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1487 amdgpu_sched_jobs);
1488 amdgpu_sched_jobs = 4;
47fc644f 1489 } else if (!is_power_of_2(amdgpu_sched_jobs)) {
1490 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1491 amdgpu_sched_jobs);
1492 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1493 }
d38ceaf9 1494
83e74db6 1495 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1496 /* gart size must be greater or equal to 32M */
1497 dev_warn(adev->dev, "gart size (%d) too small\n",
1498 amdgpu_gart_size);
83e74db6 1499 amdgpu_gart_size = -1;
1500 }
1501
36d38372 1502 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
c4e1a13a 1503 /* gtt size must be greater or equal to 32M */
1504 dev_warn(adev->dev, "gtt size (%d) too small\n",
1505 amdgpu_gtt_size);
1506 amdgpu_gtt_size = -1;
1507 }
1508
1509 /* valid range is between 4 and 9 inclusive */
1510 if (amdgpu_vm_fragment_size != -1 &&
1511 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1512 dev_warn(adev->dev, "valid range is between 4 and 9\n");
1513 amdgpu_vm_fragment_size = -1;
1514 }
1515
1516 if (amdgpu_sched_hw_submission < 2) {
1517 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1518 amdgpu_sched_hw_submission);
1519 amdgpu_sched_hw_submission = 2;
1520 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1521 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1522 amdgpu_sched_hw_submission);
1523 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1524 }
1525
1526 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
1527 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
1528 amdgpu_reset_method = -1;
1529 }
1530
1531 amdgpu_device_check_smu_prv_buffer_size(adev);
1532
06ec9070 1533 amdgpu_device_check_vm_size(adev);
d38ceaf9 1534
06ec9070 1535 amdgpu_device_check_block_size(adev);
6a7f76e7 1536
19aede77 1537 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
912dfc84 1538
e3c00faa 1539 return 0;
1540}
1541
1542/**
1543 * amdgpu_switcheroo_set_state - set switcheroo state
1544 *
1545 * @pdev: pci dev pointer
1694467b 1546 * @state: vga_switcheroo state
d38ceaf9 1547 *
12024b17 1548 * Callback for the switcheroo driver. Suspends or resumes
d38ceaf9
AD
1549 * the asics before or after it is powered up using ACPI methods.
1550 */
1551static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1552 enum vga_switcheroo_state state)
1553{
1554 struct drm_device *dev = pci_get_drvdata(pdev);
de185019 1555 int r;
d38ceaf9 1556
b98c6299 1557 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
1558 return;
1559
1560 if (state == VGA_SWITCHEROO_ON) {
dd4fa6c1 1561 pr_info("switched on\n");
1562 /* don't suspend or resume card normally */
1563 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1564
1565 pci_set_power_state(pdev, PCI_D0);
1566 amdgpu_device_load_pci_state(pdev);
1567 r = pci_enable_device(pdev);
1568 if (r)
1569 DRM_WARN("pci_enable_device failed (%d)\n", r);
1570 amdgpu_device_resume(dev, true);
d38ceaf9 1571
d38ceaf9 1572 dev->switch_power_state = DRM_SWITCH_POWER_ON;
d38ceaf9 1573 } else {
dd4fa6c1 1574 pr_info("switched off\n");
d38ceaf9 1575 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
de185019 1576 amdgpu_device_suspend(dev, true);
8f66090b 1577 amdgpu_device_cache_pci_state(pdev);
de185019 1578 /* Shut down the device */
1579 pci_disable_device(pdev);
1580 pci_set_power_state(pdev, PCI_D3cold);
1581 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1582 }
1583}
1584
1585/**
1586 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1587 *
1588 * @pdev: pci dev pointer
1589 *
1590 * Callback for the switcheroo driver. Check if the switcheroo
1591 * state can be changed.
1592 * Returns true if the state can be changed, false if not.
1593 */
1594static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1595{
1596 struct drm_device *dev = pci_get_drvdata(pdev);
1597
b8920e1e 1598 /*
1599 * FIXME: open_count is protected by drm_global_mutex but that would lead to
1600 * locking inversion with the driver load path. And the access here is
1601 * completely racy anyway. So don't bother with locking for now.
1602 */
7e13ad89 1603 return atomic_read(&dev->open_count) == 0;
1604}
1605
1606static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1607 .set_gpu_state = amdgpu_switcheroo_set_state,
1608 .reprobe = NULL,
1609 .can_switch = amdgpu_switcheroo_can_switch,
1610};
1611
1612/**
1613 * amdgpu_device_ip_set_clockgating_state - set the CG state
1614 *
87e3f136 1615 * @dev: amdgpu_device pointer
1616 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1617 * @state: clockgating state (gate or ungate)
1618 *
1619 * Sets the requested clockgating state for all instances of
1620 * the hardware IP specified.
1621 * Returns the error code from the last instance.
1622 */
43fa561f 1623int amdgpu_device_ip_set_clockgating_state(void *dev,
1624 enum amd_ip_block_type block_type,
1625 enum amd_clockgating_state state)
d38ceaf9 1626{
43fa561f 1627 struct amdgpu_device *adev = dev;
1628 int i, r = 0;
1629
1630 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 1631 if (!adev->ip_blocks[i].status.valid)
9ecbe7f5 1632 continue;
1633 if (adev->ip_blocks[i].version->type != block_type)
1634 continue;
1635 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1636 continue;
1637 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1638 (void *)adev, state);
1639 if (r)
1640 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1641 adev->ip_blocks[i].version->funcs->name, r);
1642 }
1643 return r;
1644}
1645
1646/**
1647 * amdgpu_device_ip_set_powergating_state - set the PG state
1648 *
87e3f136 1649 * @dev: amdgpu_device pointer
1650 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1651 * @state: powergating state (gate or ungate)
1652 *
1653 * Sets the requested powergating state for all instances of
1654 * the hardware IP specified.
1655 * Returns the error code from the last instance.
1656 */
43fa561f 1657int amdgpu_device_ip_set_powergating_state(void *dev,
1658 enum amd_ip_block_type block_type,
1659 enum amd_powergating_state state)
d38ceaf9 1660{
43fa561f 1661 struct amdgpu_device *adev = dev;
1662 int i, r = 0;
1663
1664 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 1665 if (!adev->ip_blocks[i].status.valid)
9ecbe7f5 1666 continue;
1667 if (adev->ip_blocks[i].version->type != block_type)
1668 continue;
1669 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1670 continue;
1671 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1672 (void *)adev, state);
1673 if (r)
1674 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1675 adev->ip_blocks[i].version->funcs->name, r);
1676 }
1677 return r;
1678}
1679
1680/**
1681 * amdgpu_device_ip_get_clockgating_state - get the CG state
1682 *
1683 * @adev: amdgpu_device pointer
1684 * @flags: clockgating feature flags
1685 *
1686 * Walks the list of IPs on the device and updates the clockgating
1687 * flags for each IP.
1688 * Updates @flags with the feature flags for each hardware IP where
1689 * clockgating is enabled.
1690 */
2990a1fc 1691void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
25faeddc 1692 u64 *flags)
1693{
1694 int i;
1695
1696 for (i = 0; i < adev->num_ip_blocks; i++) {
1697 if (!adev->ip_blocks[i].status.valid)
1698 continue;
1699 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1700 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1701 }
1702}
1703
1704/**
1705 * amdgpu_device_ip_wait_for_idle - wait for idle
1706 *
1707 * @adev: amdgpu_device pointer
1708 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1709 *
1710 * Waits for the request hardware IP to be idle.
1711 * Returns 0 for success or a negative error code on failure.
1712 */
1713int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1714 enum amd_ip_block_type block_type)
1715{
1716 int i, r;
1717
1718 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 1719 if (!adev->ip_blocks[i].status.valid)
9ecbe7f5 1720 continue;
a1255107
AD
1721 if (adev->ip_blocks[i].version->type == block_type) {
1722 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
5dbbb60b
AD
1723 if (r)
1724 return r;
1725 break;
1726 }
1727 }
1728 return 0;
1729
1730}
1731
e3ecdffa
AD
1732/**
1733 * amdgpu_device_ip_is_idle - is the hardware IP idle
1734 *
1735 * @adev: amdgpu_device pointer
1736 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1737 *
1738 * Check if the hardware IP is idle or not.
 1740 * Returns true if the IP is idle, false if not.
1740 */
2990a1fc
AD
1741bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1742 enum amd_ip_block_type block_type)
5dbbb60b
AD
1743{
1744 int i;
1745
1746 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 1747 if (!adev->ip_blocks[i].status.valid)
9ecbe7f5 1748 continue;
a1255107
AD
1749 if (adev->ip_blocks[i].version->type == block_type)
1750 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
5dbbb60b
AD
1751 }
1752 return true;
1753
1754}
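/*
 * Illustrative sketch combining the two helpers above: check whether the
 * GFX block is already idle and, if not, wait for it. Purely an example of
 * the calling pattern, not code taken from this driver.
 *
 *	if (!amdgpu_device_ip_is_idle(adev, AMD_IP_BLOCK_TYPE_GFX))
 *		r = amdgpu_device_ip_wait_for_idle(adev, AMD_IP_BLOCK_TYPE_GFX);
 */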
1755
e3ecdffa
AD
1756/**
1757 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1758 *
1759 * @adev: amdgpu_device pointer
87e3f136 1760 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
e3ecdffa
AD
1761 *
1762 * Returns a pointer to the hardware IP block structure
1763 * if it exists for the asic, otherwise NULL.
1764 */
2990a1fc
AD
1765struct amdgpu_ip_block *
1766amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1767 enum amd_ip_block_type type)
d38ceaf9
AD
1768{
1769 int i;
1770
1771 for (i = 0; i < adev->num_ip_blocks; i++)
a1255107 1772 if (adev->ip_blocks[i].version->type == type)
d38ceaf9
AD
1773 return &adev->ip_blocks[i];
1774
1775 return NULL;
1776}
1777
1778/**
2990a1fc 1779 * amdgpu_device_ip_block_version_cmp
d38ceaf9
AD
1780 *
1781 * @adev: amdgpu_device pointer
5fc3aeeb 1782 * @type: enum amd_ip_block_type
d38ceaf9
AD
1783 * @major: major version
1784 * @minor: minor version
1785 *
 1786 * return 0 if the IP block version is equal to or greater than the given version
 1787 * return 1 if it is smaller or the ip_block doesn't exist
1788 */
2990a1fc
AD
1789int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1790 enum amd_ip_block_type type,
1791 u32 major, u32 minor)
d38ceaf9 1792{
2990a1fc 1793 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
d38ceaf9 1794
a1255107
AD
1795 if (ip_block && ((ip_block->version->major > major) ||
1796 ((ip_block->version->major == major) &&
1797 (ip_block->version->minor >= minor))))
d38ceaf9
AD
1798 return 0;
1799
1800 return 1;
1801}
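/*
 * Illustrative sketch: gating a code path on a minimum IP version. A return
 * value of 0 means the installed block is at least the requested version;
 * the SMC 7.1 check below is just an example value.
 *
 *	if (amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_SMC,
 *					       7, 1) == 0) {
 *		... use features introduced with SMC 7.1 ...
 *	}
 */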
1802
a1255107 1803/**
2990a1fc 1804 * amdgpu_device_ip_block_add
a1255107
AD
1805 *
1806 * @adev: amdgpu_device pointer
1807 * @ip_block_version: pointer to the IP to add
1808 *
1809 * Adds the IP block driver information to the collection of IPs
1810 * on the asic.
1811 */
2990a1fc
AD
1812int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1813 const struct amdgpu_ip_block_version *ip_block_version)
a1255107
AD
1814{
1815 if (!ip_block_version)
1816 return -EINVAL;
1817
7bd939d0
LG
1818 switch (ip_block_version->type) {
1819 case AMD_IP_BLOCK_TYPE_VCN:
1820 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
1821 return 0;
1822 break;
1823 case AMD_IP_BLOCK_TYPE_JPEG:
1824 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
1825 return 0;
1826 break;
1827 default:
1828 break;
1829 }
1830
e966a725 1831 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
a0bae357
HR
1832 ip_block_version->funcs->name);
1833
a1255107
AD
1834 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1835
1836 return 0;
1837}
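/*
 * Illustrative sketch: how an ASIC setup routine might register one of its
 * IP blocks. "my_common_ip_block" is a hypothetical amdgpu_ip_block_version
 * used only for the example; real block tables are defined per ASIC
 * elsewhere in the driver.
 *
 *	r = amdgpu_device_ip_block_add(adev, &my_common_ip_block);
 *	if (r)
 *		return r;
 */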
1838
e3ecdffa
AD
1839/**
1840 * amdgpu_device_enable_virtual_display - enable virtual display feature
1841 *
1842 * @adev: amdgpu_device pointer
1843 *
 1844 * Enables the virtual display feature if the user has enabled it via
 1845 * the module parameter virtual_display. This feature provides virtual
 1846 * display hardware on headless boards or in virtualized environments.
1847 * This function parses and validates the configuration string specified by
 1848 * the user and configures the virtual display configuration (number of
1849 * virtual connectors, crtcs, etc.) specified.
1850 */
483ef985 1851static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
9accf2fd
ED
1852{
1853 adev->enable_virtual_display = false;
1854
1855 if (amdgpu_virtual_display) {
8f66090b 1856 const char *pci_address_name = pci_name(adev->pdev);
0f66356d 1857 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
9accf2fd
ED
1858
1859 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1860 pciaddstr_tmp = pciaddstr;
0f66356d
ED
1861 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1862 pciaddname = strsep(&pciaddname_tmp, ",");
967de2a9
YT
1863 if (!strcmp("all", pciaddname)
1864 || !strcmp(pci_address_name, pciaddname)) {
0f66356d
ED
1865 long num_crtc;
1866 int res = -1;
1867
9accf2fd 1868 adev->enable_virtual_display = true;
0f66356d
ED
1869
1870 if (pciaddname_tmp)
1871 res = kstrtol(pciaddname_tmp, 10,
1872 &num_crtc);
1873
1874 if (!res) {
1875 if (num_crtc < 1)
1876 num_crtc = 1;
1877 if (num_crtc > 6)
1878 num_crtc = 6;
1879 adev->mode_info.num_crtc = num_crtc;
1880 } else {
1881 adev->mode_info.num_crtc = 1;
1882 }
9accf2fd
ED
1883 break;
1884 }
1885 }
1886
0f66356d
ED
1887 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1888 amdgpu_virtual_display, pci_address_name,
1889 adev->enable_virtual_display, adev->mode_info.num_crtc);
9accf2fd
ED
1890
1891 kfree(pciaddstr);
1892 }
1893}
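/*
 * Example of the module parameter format parsed above (values are
 * illustrative): entries are separated by ';', each entry is a PCI address
 * (or "all") optionally followed by ',' and a CRTC count clamped to 1..6.
 *
 *	amdgpu.virtual_display=0000:04:00.0,2
 *	amdgpu.virtual_display=all
 */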
1894
25263da3
AD
1895void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
1896{
1897 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
1898 adev->mode_info.num_crtc = 1;
1899 adev->enable_virtual_display = true;
1900 DRM_INFO("virtual_display:%d, num_crtc:%d\n",
1901 adev->enable_virtual_display, adev->mode_info.num_crtc);
1902 }
1903}
1904
e3ecdffa
AD
1905/**
1906 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1907 *
1908 * @adev: amdgpu_device pointer
1909 *
1910 * Parses the asic configuration parameters specified in the gpu info
 1911 * firmware and makes them available to the driver for use in configuring
1912 * the asic.
1913 * Returns 0 on success, -EINVAL on failure.
1914 */
e2a75f88
AD
1915static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1916{
e2a75f88 1917 const char *chip_name;
c0a43457 1918 char fw_name[40];
e2a75f88
AD
1919 int err;
1920 const struct gpu_info_firmware_header_v1_0 *hdr;
1921
ab4fe3e1
HR
1922 adev->firmware.gpu_info_fw = NULL;
1923
72de33f8 1924 if (adev->mman.discovery_bin) {
cc375d8c
TY
1925 /*
1926 * FIXME: The bounding box is still needed by Navi12, so
e24d0e91 1927 * temporarily read it from gpu_info firmware. Should be dropped
cc375d8c
TY
1928 * when DAL no longer needs it.
1929 */
1930 if (adev->asic_type != CHIP_NAVI12)
1931 return 0;
258620d0
AD
1932 }
1933
e2a75f88 1934 switch (adev->asic_type) {
e2a75f88
AD
1935 default:
1936 return 0;
1937 case CHIP_VEGA10:
1938 chip_name = "vega10";
1939 break;
3f76dced
AD
1940 case CHIP_VEGA12:
1941 chip_name = "vega12";
1942 break;
2d2e5e7e 1943 case CHIP_RAVEN:
54f78a76 1944 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
54c4d17e 1945 chip_name = "raven2";
54f78a76 1946 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
741deade 1947 chip_name = "picasso";
54c4d17e
FX
1948 else
1949 chip_name = "raven";
2d2e5e7e 1950 break;
65e60f6e
LM
1951 case CHIP_ARCTURUS:
1952 chip_name = "arcturus";
1953 break;
42b325e5
XY
1954 case CHIP_NAVI12:
1955 chip_name = "navi12";
1956 break;
e2a75f88
AD
1957 }
1958
1959 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
b31d3063 1960 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name);
e2a75f88
AD
1961 if (err) {
1962 dev_err(adev->dev,
b31d3063 1963 "Failed to get gpu_info firmware \"%s\"\n",
e2a75f88
AD
1964 fw_name);
1965 goto out;
1966 }
1967
ab4fe3e1 1968 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
e2a75f88
AD
1969 amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1970
1971 switch (hdr->version_major) {
1972 case 1:
1973 {
1974 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
ab4fe3e1 1975 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
e2a75f88
AD
1976 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1977
cc375d8c
TY
1978 /*
 1979 * Should be dropped when DAL no longer needs it.
1980 */
1981 if (adev->asic_type == CHIP_NAVI12)
ec51d3fa
XY
1982 goto parse_soc_bounding_box;
1983
b5ab16bf
AD
1984 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1985 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1986 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1987 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
e2a75f88 1988 adev->gfx.config.max_texture_channel_caches =
b5ab16bf
AD
1989 le32_to_cpu(gpu_info_fw->gc_num_tccs);
1990 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1991 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1992 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1993 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
e2a75f88 1994 adev->gfx.config.double_offchip_lds_buf =
b5ab16bf
AD
1995 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1996 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
51fd0370
HZ
1997 adev->gfx.cu_info.max_waves_per_simd =
1998 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1999 adev->gfx.cu_info.max_scratch_slots_per_cu =
2000 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
2001 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
48321c3d 2002 if (hdr->version_minor >= 1) {
35c2e910
HZ
2003 const struct gpu_info_firmware_v1_1 *gpu_info_fw =
2004 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
2005 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2006 adev->gfx.config.num_sc_per_sh =
2007 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
2008 adev->gfx.config.num_packer_per_sc =
2009 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
2010 }
ec51d3fa
XY
2011
2012parse_soc_bounding_box:
ec51d3fa
XY
2013 /*
 2014 * soc bounding box info is not integrated in the discovery table,
258620d0 2015 * we always need to parse it from gpu info firmware if needed.
ec51d3fa 2016 */
48321c3d
HW
2017 if (hdr->version_minor == 2) {
2018 const struct gpu_info_firmware_v1_2 *gpu_info_fw =
2019 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
2020 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2021 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
2022 }
e2a75f88
AD
2023 break;
2024 }
2025 default:
2026 dev_err(adev->dev,
2027 "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
2028 err = -EINVAL;
2029 goto out;
2030 }
2031out:
e2a75f88
AD
2032 return err;
2033}
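/*
 * For reference, the firmware file name built above follows the pattern
 * "amdgpu/<chip>_gpu_info.bin", e.g. "amdgpu/vega10_gpu_info.bin" for
 * CHIP_VEGA10 or "amdgpu/raven2_gpu_info.bin" for a Raven2 APU.
 */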
2034
e3ecdffa
AD
2035/**
2036 * amdgpu_device_ip_early_init - run early init for hardware IPs
2037 *
2038 * @adev: amdgpu_device pointer
2039 *
2040 * Early initialization pass for hardware IPs. The hardware IPs that make
 2041 * up each asic are discovered and each IP's early_init callback is run. This
2042 * is the first stage in initializing the asic.
2043 * Returns 0 on success, negative error code on failure.
2044 */
06ec9070 2045static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
d38ceaf9 2046{
901e2be2
AD
2047 struct drm_device *dev = adev_to_drm(adev);
2048 struct pci_dev *parent;
aaa36a97 2049 int i, r;
ced69502 2050 bool total;
d38ceaf9 2051
483ef985 2052 amdgpu_device_enable_virtual_display(adev);
a6be7570 2053
00a979f3 2054 if (amdgpu_sriov_vf(adev)) {
00a979f3 2055 r = amdgpu_virt_request_full_gpu(adev, true);
aaa36a97
AD
2056 if (r)
2057 return r;
00a979f3
WS
2058 }
2059
d38ceaf9 2060 switch (adev->asic_type) {
33f34802
KW
2061#ifdef CONFIG_DRM_AMDGPU_SI
2062 case CHIP_VERDE:
2063 case CHIP_TAHITI:
2064 case CHIP_PITCAIRN:
2065 case CHIP_OLAND:
2066 case CHIP_HAINAN:
295d0daf 2067 adev->family = AMDGPU_FAMILY_SI;
33f34802
KW
2068 r = si_set_ip_blocks(adev);
2069 if (r)
2070 return r;
2071 break;
2072#endif
a2e73f56
AD
2073#ifdef CONFIG_DRM_AMDGPU_CIK
2074 case CHIP_BONAIRE:
2075 case CHIP_HAWAII:
2076 case CHIP_KAVERI:
2077 case CHIP_KABINI:
2078 case CHIP_MULLINS:
e1ad2d53 2079 if (adev->flags & AMD_IS_APU)
a2e73f56 2080 adev->family = AMDGPU_FAMILY_KV;
e1ad2d53
AD
2081 else
2082 adev->family = AMDGPU_FAMILY_CI;
a2e73f56
AD
2083
2084 r = cik_set_ip_blocks(adev);
2085 if (r)
2086 return r;
2087 break;
2088#endif
da87c30b
AD
2089 case CHIP_TOPAZ:
2090 case CHIP_TONGA:
2091 case CHIP_FIJI:
2092 case CHIP_POLARIS10:
2093 case CHIP_POLARIS11:
2094 case CHIP_POLARIS12:
2095 case CHIP_VEGAM:
2096 case CHIP_CARRIZO:
2097 case CHIP_STONEY:
2098 if (adev->flags & AMD_IS_APU)
2099 adev->family = AMDGPU_FAMILY_CZ;
2100 else
2101 adev->family = AMDGPU_FAMILY_VI;
2102
2103 r = vi_set_ip_blocks(adev);
2104 if (r)
2105 return r;
2106 break;
d38ceaf9 2107 default:
63352b7f
AD
2108 r = amdgpu_discovery_set_ip_blocks(adev);
2109 if (r)
2110 return r;
2111 break;
d38ceaf9
AD
2112 }
2113
901e2be2
AD
2114 if (amdgpu_has_atpx() &&
2115 (amdgpu_is_atpx_hybrid() ||
2116 amdgpu_has_atpx_dgpu_power_cntl()) &&
2117 ((adev->flags & AMD_IS_APU) == 0) &&
2118 !pci_is_thunderbolt_attached(to_pci_dev(dev->dev)))
2119 adev->flags |= AMD_IS_PX;
2120
85ac2021
AD
2121 if (!(adev->flags & AMD_IS_APU)) {
2122 parent = pci_upstream_bridge(adev->pdev);
2123 adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
2124 }
901e2be2 2125
1884734a 2126
3b94fb10 2127 adev->pm.pp_feature = amdgpu_pp_feature_mask;
a35ad98b 2128 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
00544006 2129 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
4215a119
HC
2130 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2131 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
00f54b97 2132
ced69502 2133 total = true;
d38ceaf9
AD
2134 for (i = 0; i < adev->num_ip_blocks; i++) {
2135 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
0c451baf 2136 DRM_WARN("disabled ip block: %d <%s>\n",
ed8cf00c 2137 i, adev->ip_blocks[i].version->funcs->name);
a1255107 2138 adev->ip_blocks[i].status.valid = false;
d38ceaf9 2139 } else {
a1255107
AD
2140 if (adev->ip_blocks[i].version->funcs->early_init) {
2141 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2c1a2784 2142 if (r == -ENOENT) {
a1255107 2143 adev->ip_blocks[i].status.valid = false;
2c1a2784 2144 } else if (r) {
a1255107
AD
2145 DRM_ERROR("early_init of IP block <%s> failed %d\n",
2146 adev->ip_blocks[i].version->funcs->name, r);
ced69502 2147 total = false;
2c1a2784 2148 } else {
a1255107 2149 adev->ip_blocks[i].status.valid = true;
2c1a2784 2150 }
974e6b64 2151 } else {
a1255107 2152 adev->ip_blocks[i].status.valid = true;
d38ceaf9 2153 }
d38ceaf9 2154 }
21a249ca
AD
2155 /* get the vbios after the asic_funcs are set up */
2156 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
6e29c227
AD
2157 r = amdgpu_device_parse_gpu_info_fw(adev);
2158 if (r)
2159 return r;
2160
21a249ca 2161 /* Read BIOS */
9535a86a
SZ
2162 if (amdgpu_device_read_bios(adev)) {
2163 if (!amdgpu_get_bios(adev))
2164 return -EINVAL;
21a249ca 2165
9535a86a
SZ
2166 r = amdgpu_atombios_init(adev);
2167 if (r) {
2168 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2169 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2170 return r;
2171 }
21a249ca 2172 }
77eabc6f
PJZ
2173
 2174 /* get pf2vf msg info at its earliest time */
2175 if (amdgpu_sriov_vf(adev))
2176 amdgpu_virt_init_data_exchange(adev);
2177
21a249ca 2178 }
d38ceaf9 2179 }
ced69502
ML
2180 if (!total)
2181 return -ENODEV;
d38ceaf9 2182
00fa4035 2183 amdgpu_amdkfd_device_probe(adev);
395d1fb9
NH
2184 adev->cg_flags &= amdgpu_cg_mask;
2185 adev->pg_flags &= amdgpu_pg_mask;
2186
d38ceaf9
AD
2187 return 0;
2188}
2189
0a4f2520
RZ
2190static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2191{
2192 int i, r;
2193
2194 for (i = 0; i < adev->num_ip_blocks; i++) {
2195 if (!adev->ip_blocks[i].status.sw)
2196 continue;
2197 if (adev->ip_blocks[i].status.hw)
2198 continue;
2199 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2d11fd3f 2200 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
0a4f2520
RZ
2201 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2202 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2203 if (r) {
2204 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2205 adev->ip_blocks[i].version->funcs->name, r);
2206 return r;
2207 }
2208 adev->ip_blocks[i].status.hw = true;
2209 }
2210 }
2211
2212 return 0;
2213}
2214
2215static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2216{
2217 int i, r;
2218
2219 for (i = 0; i < adev->num_ip_blocks; i++) {
2220 if (!adev->ip_blocks[i].status.sw)
2221 continue;
2222 if (adev->ip_blocks[i].status.hw)
2223 continue;
2224 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2225 if (r) {
2226 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2227 adev->ip_blocks[i].version->funcs->name, r);
2228 return r;
2229 }
2230 adev->ip_blocks[i].status.hw = true;
2231 }
2232
2233 return 0;
2234}
2235
7a3e0bb2
RZ
2236static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2237{
2238 int r = 0;
2239 int i;
80f41f84 2240 uint32_t smu_version;
7a3e0bb2
RZ
2241
2242 if (adev->asic_type >= CHIP_VEGA10) {
2243 for (i = 0; i < adev->num_ip_blocks; i++) {
482f0e53
ML
2244 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2245 continue;
2246
e3c1b071 2247 if (!adev->ip_blocks[i].status.sw)
2248 continue;
2249
482f0e53
ML
2250 /* no need to do the fw loading again if already done*/
2251 if (adev->ip_blocks[i].status.hw == true)
2252 break;
2253
53b3f8f4 2254 if (amdgpu_in_reset(adev) || adev->in_suspend) {
482f0e53
ML
2255 r = adev->ip_blocks[i].version->funcs->resume(adev);
2256 if (r) {
2257 DRM_ERROR("resume of IP block <%s> failed %d\n",
7a3e0bb2 2258 adev->ip_blocks[i].version->funcs->name, r);
482f0e53
ML
2259 return r;
2260 }
2261 } else {
2262 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2263 if (r) {
2264 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2265 adev->ip_blocks[i].version->funcs->name, r);
2266 return r;
7a3e0bb2 2267 }
7a3e0bb2 2268 }
482f0e53
ML
2269
2270 adev->ip_blocks[i].status.hw = true;
2271 break;
7a3e0bb2
RZ
2272 }
2273 }
482f0e53 2274
8973d9ec
ED
2275 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2276 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
7a3e0bb2 2277
80f41f84 2278 return r;
7a3e0bb2
RZ
2279}
2280
5fd8518d
AG
2281static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
2282{
2283 long timeout;
2284 int r, i;
2285
2286 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
2287 struct amdgpu_ring *ring = adev->rings[i];
2288
2289 /* No need to setup the GPU scheduler for rings that don't need it */
2290 if (!ring || ring->no_scheduler)
2291 continue;
2292
2293 switch (ring->funcs->type) {
2294 case AMDGPU_RING_TYPE_GFX:
2295 timeout = adev->gfx_timeout;
2296 break;
2297 case AMDGPU_RING_TYPE_COMPUTE:
2298 timeout = adev->compute_timeout;
2299 break;
2300 case AMDGPU_RING_TYPE_SDMA:
2301 timeout = adev->sdma_timeout;
2302 break;
2303 default:
2304 timeout = adev->video_timeout;
2305 break;
2306 }
2307
2308 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
11f25c84 2309 ring->num_hw_submission, 0,
8ab62eda
JG
2310 timeout, adev->reset_domain->wq,
2311 ring->sched_score, ring->name,
2312 adev->dev);
5fd8518d
AG
2313 if (r) {
2314 DRM_ERROR("Failed to create scheduler on ring %s.\n",
2315 ring->name);
2316 return r;
2317 }
2318 }
2319
d425c6f4
JZ
2320 amdgpu_xcp_update_partition_sched_list(adev);
2321
5fd8518d
AG
2322 return 0;
2323}
2324
2325
e3ecdffa
AD
2326/**
2327 * amdgpu_device_ip_init - run init for hardware IPs
2328 *
2329 * @adev: amdgpu_device pointer
2330 *
2331 * Main initialization pass for hardware IPs. The list of all the hardware
2332 * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2333 * are run. sw_init initializes the software state associated with each IP
2334 * and hw_init initializes the hardware associated with each IP.
2335 * Returns 0 on success, negative error code on failure.
2336 */
06ec9070 2337static int amdgpu_device_ip_init(struct amdgpu_device *adev)
d38ceaf9
AD
2338{
2339 int i, r;
2340
c030f2e4 2341 r = amdgpu_ras_init(adev);
2342 if (r)
2343 return r;
2344
d38ceaf9 2345 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 2346 if (!adev->ip_blocks[i].status.valid)
d38ceaf9 2347 continue;
a1255107 2348 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2c1a2784 2349 if (r) {
a1255107
AD
2350 DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2351 adev->ip_blocks[i].version->funcs->name, r);
72d3f592 2352 goto init_failed;
2c1a2784 2353 }
a1255107 2354 adev->ip_blocks[i].status.sw = true;
bfca0289 2355
c1c39032
AD
2356 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2357 /* need to do common hw init early so everything is set up for gmc */
2358 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2359 if (r) {
2360 DRM_ERROR("hw_init %d failed %d\n", i, r);
2361 goto init_failed;
2362 }
2363 adev->ip_blocks[i].status.hw = true;
2364 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2365 /* need to do gmc hw init early so we can allocate gpu mem */
892deb48
VS
2366 /* Try to reserve bad pages early */
2367 if (amdgpu_sriov_vf(adev))
2368 amdgpu_virt_exchange_data(adev);
2369
7ccfd79f 2370 r = amdgpu_device_mem_scratch_init(adev);
2c1a2784 2371 if (r) {
7ccfd79f 2372 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r);
72d3f592 2373 goto init_failed;
2c1a2784 2374 }
a1255107 2375 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2c1a2784
AD
2376 if (r) {
2377 DRM_ERROR("hw_init %d failed %d\n", i, r);
72d3f592 2378 goto init_failed;
2c1a2784 2379 }
06ec9070 2380 r = amdgpu_device_wb_init(adev);
2c1a2784 2381 if (r) {
06ec9070 2382 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
72d3f592 2383 goto init_failed;
2c1a2784 2384 }
a1255107 2385 adev->ip_blocks[i].status.hw = true;
2493664f
ML
2386
2387 /* right after GMC hw init, we create CSA */
02ff519e 2388 if (adev->gfx.mcbp) {
1e256e27 2389 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
58ab2c08
CK
2390 AMDGPU_GEM_DOMAIN_VRAM |
2391 AMDGPU_GEM_DOMAIN_GTT,
2392 AMDGPU_CSA_SIZE);
2493664f
ML
2393 if (r) {
2394 DRM_ERROR("allocate CSA failed %d\n", r);
72d3f592 2395 goto init_failed;
2493664f
ML
2396 }
2397 }
d38ceaf9
AD
2398 }
2399 }
2400
c9ffa427 2401 if (amdgpu_sriov_vf(adev))
22c16d25 2402 amdgpu_virt_init_data_exchange(adev);
c9ffa427 2403
533aed27
AG
2404 r = amdgpu_ib_pool_init(adev);
2405 if (r) {
2406 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2407 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2408 goto init_failed;
2409 }
2410
c8963ea4
RZ
2411 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2412 if (r)
72d3f592 2413 goto init_failed;
0a4f2520
RZ
2414
2415 r = amdgpu_device_ip_hw_init_phase1(adev);
2416 if (r)
72d3f592 2417 goto init_failed;
0a4f2520 2418
7a3e0bb2
RZ
2419 r = amdgpu_device_fw_loading(adev);
2420 if (r)
72d3f592 2421 goto init_failed;
7a3e0bb2 2422
0a4f2520
RZ
2423 r = amdgpu_device_ip_hw_init_phase2(adev);
2424 if (r)
72d3f592 2425 goto init_failed;
d38ceaf9 2426
121a2bc6
AG
2427 /*
2428 * retired pages will be loaded from eeprom and reserved here,
2429 * it should be called after amdgpu_device_ip_hw_init_phase2 since
2430 * for some ASICs the RAS EEPROM code relies on SMU fully functioning
 2431 * for I2C communication, which is only true at this point.
b82e65a9
GC
2432 *
 2433 * amdgpu_ras_recovery_init may fail, but the upper level only cares about
 2434 * failures caused by a bad gpu situation and stops the amdgpu init process
 2435 * accordingly. For other failure cases, it will still release all
 2436 * the resources and print an error message, rather than returning a
 2437 * negative value to the upper level.
121a2bc6
AG
2438 *
2439 * Note: theoretically, this should be called before all vram allocations
 2440 * to protect retired pages from being abused
2441 */
b82e65a9
GC
2442 r = amdgpu_ras_recovery_init(adev);
2443 if (r)
2444 goto init_failed;
121a2bc6 2445
cfbb6b00
AG
2446 /**
2447 * In case of XGMI grab extra reference for reset domain for this device
2448 */
a4c63caf 2449 if (adev->gmc.xgmi.num_physical_nodes > 1) {
cfbb6b00 2450 if (amdgpu_xgmi_add_device(adev) == 0) {
46c67660 2451 if (!amdgpu_sriov_vf(adev)) {
2efc30f0
VC
2452 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
2453
dfd0287b
LH
2454 if (WARN_ON(!hive)) {
2455 r = -ENOENT;
2456 goto init_failed;
2457 }
2458
46c67660 2459 if (!hive->reset_domain ||
2460 !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
2461 r = -ENOENT;
2462 amdgpu_put_xgmi_hive(hive);
2463 goto init_failed;
2464 }
2465
2466 /* Drop the early temporary reset domain we created for device */
2467 amdgpu_reset_put_reset_domain(adev->reset_domain);
2468 adev->reset_domain = hive->reset_domain;
9dfa4860 2469 amdgpu_put_xgmi_hive(hive);
cfbb6b00 2470 }
a4c63caf
AG
2471 }
2472 }
2473
5fd8518d
AG
2474 r = amdgpu_device_init_schedulers(adev);
2475 if (r)
2476 goto init_failed;
e3c1b071 2477
2478 /* Don't init kfd if whole hive need to be reset during init */
84b4dd3f
PY
2479 if (!adev->gmc.xgmi.pending_reset) {
2480 kgd2kfd_init_zone_device(adev);
e3c1b071 2481 amdgpu_amdkfd_device_init(adev);
84b4dd3f 2482 }
c6332b97 2483
bd607166
KR
2484 amdgpu_fru_get_product_info(adev);
2485
72d3f592 2486init_failed:
c6332b97 2487
72d3f592 2488 return r;
d38ceaf9
AD
2489}
2490
e3ecdffa
AD
2491/**
2492 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2493 *
2494 * @adev: amdgpu_device pointer
2495 *
2496 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
2497 * this function before a GPU reset. If the value is retained after a
 2498 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
2499 */
06ec9070 2500static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
0c49e0b8
CZ
2501{
2502 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2503}
2504
e3ecdffa
AD
2505/**
2506 * amdgpu_device_check_vram_lost - check if vram is valid
2507 *
2508 * @adev: amdgpu_device pointer
2509 *
2510 * Checks the reset magic value written to the gart pointer in VRAM.
2511 * The driver calls this after a GPU reset to see if the contents of
 2512 * VRAM are lost or not.
2513 * returns true if vram is lost, false if not.
2514 */
06ec9070 2515static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
0c49e0b8 2516{
dadce777
EQ
2517 if (memcmp(adev->gart.ptr, adev->reset_magic,
2518 AMDGPU_RESET_MAGIC_NUM))
2519 return true;
2520
53b3f8f4 2521 if (!amdgpu_in_reset(adev))
dadce777
EQ
2522 return false;
2523
2524 /*
2525 * For all ASICs with baco/mode1 reset, the VRAM is
2526 * always assumed to be lost.
2527 */
2528 switch (amdgpu_asic_reset_method(adev)) {
2529 case AMD_RESET_METHOD_BACO:
2530 case AMD_RESET_METHOD_MODE1:
2531 return true;
2532 default:
2533 return false;
2534 }
0c49e0b8
CZ
2535}
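/*
 * Illustrative pairing of the two helpers above (sketch only): the reset
 * path records the magic before the ASIC reset and checks it afterwards.
 *
 *	amdgpu_device_fill_reset_magic(adev);
 *	... perform the ASIC reset ...
 *	vram_lost = amdgpu_device_check_vram_lost(adev);
 */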
2536
e3ecdffa 2537/**
1112a46b 2538 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
e3ecdffa
AD
2539 *
2540 * @adev: amdgpu_device pointer
b8b72130 2541 * @state: clockgating state (gate or ungate)
e3ecdffa 2542 *
e3ecdffa 2543 * The list of all the hardware IPs that make up the asic is walked and the
1112a46b
RZ
2544 * set_clockgating_state callbacks are run.
 2545 * The late initialization pass enables clockgating for hardware IPs.
 2546 * The fini or suspend pass disables clockgating for hardware IPs.
e3ecdffa
AD
2547 * Returns 0 on success, negative error code on failure.
2548 */
fdd34271 2549
5d89bb2d
LL
2550int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2551 enum amd_clockgating_state state)
d38ceaf9 2552{
1112a46b 2553 int i, j, r;
d38ceaf9 2554
4a2ba394
SL
2555 if (amdgpu_emu_mode == 1)
2556 return 0;
2557
1112a46b
RZ
2558 for (j = 0; j < adev->num_ip_blocks; j++) {
2559 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
a2d31dc3 2560 if (!adev->ip_blocks[i].status.late_initialized)
d38ceaf9 2561 continue;
47198eb7 2562 /* skip CG for GFX, SDMA on S0ix */
5d70a549 2563 if (adev->in_s0ix &&
47198eb7
AD
2564 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
2565 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
5d70a549 2566 continue;
4a446d55 2567 /* skip CG for VCE/UVD, it's handled specially */
a1255107 2568 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
57716327 2569 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
34319b32 2570 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
52f2e779 2571 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
57716327 2572 adev->ip_blocks[i].version->funcs->set_clockgating_state) {
4a446d55 2573 /* enable clockgating to save power */
a1255107 2574 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
1112a46b 2575 state);
4a446d55
AD
2576 if (r) {
2577 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
a1255107 2578 adev->ip_blocks[i].version->funcs->name, r);
4a446d55
AD
2579 return r;
2580 }
b0b00ff1 2581 }
d38ceaf9 2582 }
06b18f61 2583
c9f96fd5
RZ
2584 return 0;
2585}
2586
5d89bb2d
LL
2587int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
2588 enum amd_powergating_state state)
c9f96fd5 2589{
1112a46b 2590 int i, j, r;
06b18f61 2591
c9f96fd5
RZ
2592 if (amdgpu_emu_mode == 1)
2593 return 0;
2594
1112a46b
RZ
2595 for (j = 0; j < adev->num_ip_blocks; j++) {
2596 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
a2d31dc3 2597 if (!adev->ip_blocks[i].status.late_initialized)
c9f96fd5 2598 continue;
47198eb7 2599 /* skip PG for GFX, SDMA on S0ix */
5d70a549 2600 if (adev->in_s0ix &&
47198eb7
AD
2601 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
2602 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
5d70a549 2603 continue;
c9f96fd5
RZ
2604 /* skip CG for VCE/UVD, it's handled specially */
2605 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2606 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2607 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
52f2e779 2608 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
c9f96fd5
RZ
2609 adev->ip_blocks[i].version->funcs->set_powergating_state) {
2610 /* enable powergating to save power */
2611 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
1112a46b 2612 state);
c9f96fd5
RZ
2613 if (r) {
2614 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2615 adev->ip_blocks[i].version->funcs->name, r);
2616 return r;
2617 }
2618 }
2619 }
2dc80b00
S
2620 return 0;
2621}
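/*
 * Illustrative sketch of how the two state helpers above are paired during
 * teardown and suspend elsewhere in this file: powergating is ungated
 * before clockgating, while the gating direction runs CG first, then PG.
 *
 *	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
 *	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
 */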
2622
beff74bc
AD
2623static int amdgpu_device_enable_mgpu_fan_boost(void)
2624{
2625 struct amdgpu_gpu_instance *gpu_ins;
2626 struct amdgpu_device *adev;
2627 int i, ret = 0;
2628
2629 mutex_lock(&mgpu_info.mutex);
2630
2631 /*
2632 * MGPU fan boost feature should be enabled
2633 * only when there are two or more dGPUs in
2634 * the system
2635 */
2636 if (mgpu_info.num_dgpu < 2)
2637 goto out;
2638
2639 for (i = 0; i < mgpu_info.num_dgpu; i++) {
2640 gpu_ins = &(mgpu_info.gpu_ins[i]);
2641 adev = gpu_ins->adev;
2642 if (!(adev->flags & AMD_IS_APU) &&
f10bb940 2643 !gpu_ins->mgpu_fan_enabled) {
beff74bc
AD
2644 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2645 if (ret)
2646 break;
2647
2648 gpu_ins->mgpu_fan_enabled = 1;
2649 }
2650 }
2651
2652out:
2653 mutex_unlock(&mgpu_info.mutex);
2654
2655 return ret;
2656}
2657
e3ecdffa
AD
2658/**
2659 * amdgpu_device_ip_late_init - run late init for hardware IPs
2660 *
2661 * @adev: amdgpu_device pointer
2662 *
2663 * Late initialization pass for hardware IPs. The list of all the hardware
2664 * IPs that make up the asic is walked and the late_init callbacks are run.
2665 * late_init covers any special initialization that an IP requires
 2666 * after all of them have been initialized or something that needs to happen
2667 * late in the init process.
2668 * Returns 0 on success, negative error code on failure.
2669 */
06ec9070 2670static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2dc80b00 2671{
60599a03 2672 struct amdgpu_gpu_instance *gpu_instance;
2dc80b00
S
2673 int i = 0, r;
2674
2675 for (i = 0; i < adev->num_ip_blocks; i++) {
73f847db 2676 if (!adev->ip_blocks[i].status.hw)
2dc80b00
S
2677 continue;
2678 if (adev->ip_blocks[i].version->funcs->late_init) {
2679 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2680 if (r) {
2681 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2682 adev->ip_blocks[i].version->funcs->name, r);
2683 return r;
2684 }
2dc80b00 2685 }
73f847db 2686 adev->ip_blocks[i].status.late_initialized = true;
2dc80b00
S
2687 }
2688
867e24ca 2689 r = amdgpu_ras_late_init(adev);
2690 if (r) {
2691 DRM_ERROR("amdgpu_ras_late_init failed %d", r);
2692 return r;
2693 }
2694
a891d239
DL
2695 amdgpu_ras_set_error_query_ready(adev, true);
2696
1112a46b
RZ
2697 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2698 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
916ac57f 2699
06ec9070 2700 amdgpu_device_fill_reset_magic(adev);
d38ceaf9 2701
beff74bc
AD
2702 r = amdgpu_device_enable_mgpu_fan_boost();
2703 if (r)
2704 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2705
4da8b639 2706 /* For passthrough configuration on arcturus and aldebaran, enable special handling for SBR */
47fc644f
SS
2707 if (amdgpu_passthrough(adev) &&
2708 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
2709 adev->asic_type == CHIP_ALDEBARAN))
bc143d8b 2710 amdgpu_dpm_handle_passthrough_sbr(adev, true);
60599a03
EQ
2711
2712 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2713 mutex_lock(&mgpu_info.mutex);
2714
2715 /*
2716 * Reset device p-state to low as this was booted with high.
2717 *
2718 * This should be performed only after all devices from the same
2719 * hive get initialized.
2720 *
 2721 * However, it's unknown in advance how many devices are in the hive,
 2722 * as this is counted one by one during device initialization.
2723 *
2724 * So, we wait for all XGMI interlinked devices initialized.
2725 * This may bring some delays as those devices may come from
2726 * different hives. But that should be OK.
2727 */
2728 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2729 for (i = 0; i < mgpu_info.num_gpu; i++) {
2730 gpu_instance = &(mgpu_info.gpu_ins[i]);
2731 if (gpu_instance->adev->flags & AMD_IS_APU)
2732 continue;
2733
d84a430d
JK
2734 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2735 AMDGPU_XGMI_PSTATE_MIN);
60599a03
EQ
2736 if (r) {
2737 DRM_ERROR("pstate setting failed (%d).\n", r);
2738 break;
2739 }
2740 }
2741 }
2742
2743 mutex_unlock(&mgpu_info.mutex);
2744 }
2745
d38ceaf9
AD
2746 return 0;
2747}
2748
613aa3ea
LY
2749/**
2750 * amdgpu_device_smu_fini_early - smu hw_fini wrapper
2751 *
2752 * @adev: amdgpu_device pointer
2753 *
 2754 * For ASICs that need to disable SMC first
2755 */
2756static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
2757{
2758 int i, r;
2759
2760 if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))
2761 return;
2762
2763 for (i = 0; i < adev->num_ip_blocks; i++) {
2764 if (!adev->ip_blocks[i].status.hw)
2765 continue;
2766 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2767 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2768 /* XXX handle errors */
2769 if (r) {
2770 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2771 adev->ip_blocks[i].version->funcs->name, r);
2772 }
2773 adev->ip_blocks[i].status.hw = false;
2774 break;
2775 }
2776 }
2777}
2778
e9669fb7 2779static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
d38ceaf9
AD
2780{
2781 int i, r;
2782
e9669fb7
AG
2783 for (i = 0; i < adev->num_ip_blocks; i++) {
2784 if (!adev->ip_blocks[i].version->funcs->early_fini)
2785 continue;
5278a159 2786
e9669fb7
AG
2787 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev);
2788 if (r) {
2789 DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
2790 adev->ip_blocks[i].version->funcs->name, r);
2791 }
2792 }
c030f2e4 2793
05df1f01 2794 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
fdd34271
RZ
2795 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2796
7270e895
TY
2797 amdgpu_amdkfd_suspend(adev, false);
2798
613aa3ea
LY
 2799 /* Workaround for ASICs that need to disable SMC first */
2800 amdgpu_device_smu_fini_early(adev);
3e96dbfd 2801
d38ceaf9 2802 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
a1255107 2803 if (!adev->ip_blocks[i].status.hw)
d38ceaf9 2804 continue;
8201a67a 2805
a1255107 2806 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
d38ceaf9 2807 /* XXX handle errors */
2c1a2784 2808 if (r) {
a1255107
AD
2809 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2810 adev->ip_blocks[i].version->funcs->name, r);
2c1a2784 2811 }
8201a67a 2812
a1255107 2813 adev->ip_blocks[i].status.hw = false;
d38ceaf9
AD
2814 }
2815
6effad8a
GC
2816 if (amdgpu_sriov_vf(adev)) {
2817 if (amdgpu_virt_release_full_gpu(adev, false))
2818 DRM_ERROR("failed to release exclusive mode on fini\n");
2819 }
2820
e9669fb7
AG
2821 return 0;
2822}
2823
2824/**
2825 * amdgpu_device_ip_fini - run fini for hardware IPs
2826 *
2827 * @adev: amdgpu_device pointer
2828 *
2829 * Main teardown pass for hardware IPs. The list of all the hardware
2830 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2831 * are run. hw_fini tears down the hardware associated with each IP
2832 * and sw_fini tears down any software state associated with each IP.
2833 * Returns 0 on success, negative error code on failure.
2834 */
2835static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2836{
2837 int i, r;
2838
2839 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2840 amdgpu_virt_release_ras_err_handler_data(adev);
2841
e9669fb7
AG
2842 if (adev->gmc.xgmi.num_physical_nodes > 1)
2843 amdgpu_xgmi_remove_device(adev);
2844
c004d44e 2845 amdgpu_amdkfd_device_fini_sw(adev);
9950cda2 2846
d38ceaf9 2847 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
a1255107 2848 if (!adev->ip_blocks[i].status.sw)
d38ceaf9 2849 continue;
c12aba3a
ML
2850
2851 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
c8963ea4 2852 amdgpu_ucode_free_bo(adev);
1e256e27 2853 amdgpu_free_static_csa(&adev->virt.csa_obj);
c12aba3a 2854 amdgpu_device_wb_fini(adev);
7ccfd79f 2855 amdgpu_device_mem_scratch_fini(adev);
533aed27 2856 amdgpu_ib_pool_fini(adev);
c12aba3a
ML
2857 }
2858
a1255107 2859 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
d38ceaf9 2860 /* XXX handle errors */
2c1a2784 2861 if (r) {
a1255107
AD
2862 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2863 adev->ip_blocks[i].version->funcs->name, r);
2c1a2784 2864 }
a1255107
AD
2865 adev->ip_blocks[i].status.sw = false;
2866 adev->ip_blocks[i].status.valid = false;
d38ceaf9
AD
2867 }
2868
a6dcfd9c 2869 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
a1255107 2870 if (!adev->ip_blocks[i].status.late_initialized)
8a2eef1d 2871 continue;
a1255107
AD
2872 if (adev->ip_blocks[i].version->funcs->late_fini)
2873 adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2874 adev->ip_blocks[i].status.late_initialized = false;
a6dcfd9c
ML
2875 }
2876
c030f2e4 2877 amdgpu_ras_fini(adev);
2878
d38ceaf9
AD
2879 return 0;
2880}
2881
e3ecdffa 2882/**
beff74bc 2883 * amdgpu_device_delayed_init_work_handler - work handler for IB tests
e3ecdffa 2884 *
1112a46b 2885 * @work: work_struct.
e3ecdffa 2886 */
beff74bc 2887static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2dc80b00
S
2888{
2889 struct amdgpu_device *adev =
beff74bc 2890 container_of(work, struct amdgpu_device, delayed_init_work.work);
916ac57f
RZ
2891 int r;
2892
2893 r = amdgpu_ib_ring_tests(adev);
2894 if (r)
2895 DRM_ERROR("ib ring test failed (%d).\n", r);
2dc80b00
S
2896}
2897
1e317b99
RZ
2898static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2899{
2900 struct amdgpu_device *adev =
2901 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2902
90a92662
MD
2903 WARN_ON_ONCE(adev->gfx.gfx_off_state);
2904 WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
2905
2906 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2907 adev->gfx.gfx_off_state = true;
1e317b99
RZ
2908}
2909
e3ecdffa 2910/**
e7854a03 2911 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
e3ecdffa
AD
2912 *
2913 * @adev: amdgpu_device pointer
2914 *
2915 * Main suspend function for hardware IPs. The list of all the hardware
2916 * IPs that make up the asic is walked, clockgating is disabled and the
2917 * suspend callbacks are run. suspend puts the hardware and software state
2918 * in each IP into a state suitable for suspend.
2919 * Returns 0 on success, negative error code on failure.
2920 */
e7854a03
AD
2921static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2922{
2923 int i, r;
2924
50ec83f0
AD
2925 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2926 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
05df1f01 2927
b31d6ada
EQ
2928 /*
2929 * Per PMFW team's suggestion, driver needs to handle gfxoff
2930 * and df cstate features disablement for gpu reset(e.g. Mode1Reset)
2931 * scenario. Add the missing df cstate disablement here.
2932 */
2933 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
2934 dev_warn(adev->dev, "Failed to disallow df cstate");
2935
e7854a03
AD
2936 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2937 if (!adev->ip_blocks[i].status.valid)
2938 continue;
2b9f7848 2939
e7854a03 2940 /* displays are handled separately */
2b9f7848
ND
2941 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2942 continue;
2943
2944 /* XXX handle errors */
2945 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2946 /* XXX handle errors */
2947 if (r) {
2948 DRM_ERROR("suspend of IP block <%s> failed %d\n",
2949 adev->ip_blocks[i].version->funcs->name, r);
2950 return r;
e7854a03 2951 }
2b9f7848
ND
2952
2953 adev->ip_blocks[i].status.hw = false;
e7854a03
AD
2954 }
2955
e7854a03
AD
2956 return 0;
2957}
2958
2959/**
2960 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2961 *
2962 * @adev: amdgpu_device pointer
2963 *
2964 * Main suspend function for hardware IPs. The list of all the hardware
2965 * IPs that make up the asic is walked, clockgating is disabled and the
2966 * suspend callbacks are run. suspend puts the hardware and software state
2967 * in each IP into a state suitable for suspend.
2968 * Returns 0 on success, negative error code on failure.
2969 */
2970static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
d38ceaf9
AD
2971{
2972 int i, r;
2973
557f42a2 2974 if (adev->in_s0ix)
bc143d8b 2975 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry);
34416931 2976
d38ceaf9 2977 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
a1255107 2978 if (!adev->ip_blocks[i].status.valid)
d38ceaf9 2979 continue;
e7854a03
AD
2980 /* displays are handled in phase1 */
2981 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2982 continue;
bff77e86
LM
2983 /* PSP lost connection when err_event_athub occurs */
2984 if (amdgpu_ras_intr_triggered() &&
2985 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2986 adev->ip_blocks[i].status.hw = false;
2987 continue;
2988 }
e3c1b071 2989
2990 /* skip unnecessary suspend if we do not initialize them yet */
2991 if (adev->gmc.xgmi.pending_reset &&
2992 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2993 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
2994 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2995 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
2996 adev->ip_blocks[i].status.hw = false;
2997 continue;
2998 }
557f42a2 2999
afa6646b 3000 /* skip suspend of gfx/mes and psp for S0ix
32ff160d
AD
3001 * gfx is in gfxoff state, so on resume it will exit gfxoff just
3002 * like at runtime. PSP is also part of the always on hardware
3003 * so no need to suspend it.
3004 */
557f42a2 3005 if (adev->in_s0ix &&
32ff160d 3006 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
afa6646b
AD
3007 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3008 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES))
557f42a2
AD
3009 continue;
3010
2a7798ea
AD
3011 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */
3012 if (adev->in_s0ix &&
3013 (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) &&
3014 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3015 continue;
3016
e11c7750
TH
3017 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot.
3018 * These are in TMR, hence are expected to be reused by PSP-TOS to reload
3019 * from this location and RLC Autoload automatically also gets loaded
3020 * from here based on PMFW -> PSP message during re-init sequence.
3021 * Therefore, the psp suspend & resume should be skipped to avoid destroy
3022 * the TMR and reload FWs again for IMU enabled APU ASICs.
3023 */
3024 if (amdgpu_in_reset(adev) &&
3025 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs &&
3026 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3027 continue;
3028
d38ceaf9 3029 /* XXX handle errors */
a1255107 3030 r = adev->ip_blocks[i].version->funcs->suspend(adev);
d38ceaf9 3031 /* XXX handle errors */
2c1a2784 3032 if (r) {
a1255107
AD
3033 DRM_ERROR("suspend of IP block <%s> failed %d\n",
3034 adev->ip_blocks[i].version->funcs->name, r);
2c1a2784 3035 }
876923fb 3036 adev->ip_blocks[i].status.hw = false;
a3a09142 3037 /* handle putting the SMC in the appropriate state */
47fc644f 3038 if (!amdgpu_sriov_vf(adev)) {
86b93fd6
JZ
3039 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3040 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
3041 if (r) {
3042 DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
3043 adev->mp1_state, r);
3044 return r;
3045 }
a3a09142
AD
3046 }
3047 }
d38ceaf9
AD
3048 }
3049
3050 return 0;
3051}
3052
e7854a03
AD
3053/**
3054 * amdgpu_device_ip_suspend - run suspend for hardware IPs
3055 *
3056 * @adev: amdgpu_device pointer
3057 *
3058 * Main suspend function for hardware IPs. The list of all the hardware
3059 * IPs that make up the asic is walked, clockgating is disabled and the
3060 * suspend callbacks are run. suspend puts the hardware and software state
3061 * in each IP into a state suitable for suspend.
3062 * Returns 0 on success, negative error code on failure.
3063 */
3064int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
3065{
3066 int r;
3067
3c73683c
JC
3068 if (amdgpu_sriov_vf(adev)) {
3069 amdgpu_virt_fini_data_exchange(adev);
e7819644 3070 amdgpu_virt_request_full_gpu(adev, false);
3c73683c 3071 }
e7819644 3072
e7854a03
AD
3073 r = amdgpu_device_ip_suspend_phase1(adev);
3074 if (r)
3075 return r;
3076 r = amdgpu_device_ip_suspend_phase2(adev);
3077
e7819644
YT
3078 if (amdgpu_sriov_vf(adev))
3079 amdgpu_virt_release_full_gpu(adev, false);
3080
e7854a03
AD
3081 return r;
3082}
3083
06ec9070 3084static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
a90ad3c2
ML
3085{
3086 int i, r;
3087
2cb681b6 3088 static enum amd_ip_block_type ip_order[] = {
2cb681b6 3089 AMD_IP_BLOCK_TYPE_COMMON,
c1c39032 3090 AMD_IP_BLOCK_TYPE_GMC,
39186aef 3091 AMD_IP_BLOCK_TYPE_PSP,
2cb681b6
ML
3092 AMD_IP_BLOCK_TYPE_IH,
3093 };
a90ad3c2 3094
95ea3dbc 3095 for (i = 0; i < adev->num_ip_blocks; i++) {
2cb681b6
ML
3096 int j;
3097 struct amdgpu_ip_block *block;
a90ad3c2 3098
4cd2a96d
J
3099 block = &adev->ip_blocks[i];
3100 block->status.hw = false;
2cb681b6 3101
4cd2a96d 3102 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
2cb681b6 3103
4cd2a96d 3104 if (block->version->type != ip_order[j] ||
2cb681b6
ML
3105 !block->status.valid)
3106 continue;
3107
3108 r = block->version->funcs->hw_init(adev);
0aaeefcc 3109 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
c41d1cf6
ML
3110 if (r)
3111 return r;
482f0e53 3112 block->status.hw = true;
a90ad3c2
ML
3113 }
3114 }
3115
3116 return 0;
3117}
3118
06ec9070 3119static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
a90ad3c2
ML
3120{
3121 int i, r;
3122
2cb681b6
ML
3123 static enum amd_ip_block_type ip_order[] = {
3124 AMD_IP_BLOCK_TYPE_SMC,
3125 AMD_IP_BLOCK_TYPE_DCE,
3126 AMD_IP_BLOCK_TYPE_GFX,
3127 AMD_IP_BLOCK_TYPE_SDMA,
ec64350d 3128 AMD_IP_BLOCK_TYPE_MES,
257deb8c 3129 AMD_IP_BLOCK_TYPE_UVD,
d83c7a07 3130 AMD_IP_BLOCK_TYPE_VCE,
d2cdc014
YZ
3131 AMD_IP_BLOCK_TYPE_VCN,
3132 AMD_IP_BLOCK_TYPE_JPEG
2cb681b6 3133 };
a90ad3c2 3134
2cb681b6
ML
3135 for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
3136 int j;
3137 struct amdgpu_ip_block *block;
a90ad3c2 3138
2cb681b6
ML
3139 for (j = 0; j < adev->num_ip_blocks; j++) {
3140 block = &adev->ip_blocks[j];
3141
3142 if (block->version->type != ip_order[i] ||
482f0e53
ML
3143 !block->status.valid ||
3144 block->status.hw)
2cb681b6
ML
3145 continue;
3146
895bd048
JZ
3147 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
3148 r = block->version->funcs->resume(adev);
3149 else
3150 r = block->version->funcs->hw_init(adev);
3151
0aaeefcc 3152 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
c41d1cf6
ML
3153 if (r)
3154 return r;
482f0e53 3155 block->status.hw = true;
a90ad3c2
ML
3156 }
3157 }
3158
3159 return 0;
3160}
3161
e3ecdffa
AD
3162/**
3163 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
3164 *
3165 * @adev: amdgpu_device pointer
3166 *
3167 * First resume function for hardware IPs. The list of all the hardware
3168 * IPs that make up the asic is walked and the resume callbacks are run for
3169 * COMMON, GMC, and IH. resume puts the hardware into a functional state
3170 * after a suspend and updates the software state as necessary. This
3171 * function is also used for restoring the GPU after a GPU reset.
3172 * Returns 0 on success, negative error code on failure.
3173 */
06ec9070 3174static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
d38ceaf9
AD
3175{
3176 int i, r;
3177
a90ad3c2 3178 for (i = 0; i < adev->num_ip_blocks; i++) {
482f0e53 3179 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
a90ad3c2 3180 continue;
a90ad3c2 3181 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
e3ecdffa 3182 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
d7274ec7
BZ
3183 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3184 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {
482f0e53 3185
fcf0649f
CZ
3186 r = adev->ip_blocks[i].version->funcs->resume(adev);
3187 if (r) {
3188 DRM_ERROR("resume of IP block <%s> failed %d\n",
3189 adev->ip_blocks[i].version->funcs->name, r);
3190 return r;
3191 }
482f0e53 3192 adev->ip_blocks[i].status.hw = true;
a90ad3c2
ML
3193 }
3194 }
3195
3196 return 0;
3197}
3198
e3ecdffa
AD
3199/**
3200 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3201 *
3202 * @adev: amdgpu_device pointer
3203 *
 3204 * Second resume function for hardware IPs. The list of all the hardware
3205 * IPs that make up the asic is walked and the resume callbacks are run for
3206 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a
3207 * functional state after a suspend and updates the software state as
3208 * necessary. This function is also used for restoring the GPU after a GPU
3209 * reset.
3210 * Returns 0 on success, negative error code on failure.
3211 */
06ec9070 3212static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
d38ceaf9
AD
3213{
3214 int i, r;
3215
3216 for (i = 0; i < adev->num_ip_blocks; i++) {
482f0e53 3217 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
d38ceaf9 3218 continue;
fcf0649f 3219 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
e3ecdffa 3220 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
7a3e0bb2
RZ
3221 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3222 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
fcf0649f 3223 continue;
a1255107 3224 r = adev->ip_blocks[i].version->funcs->resume(adev);
2c1a2784 3225 if (r) {
a1255107
AD
3226 DRM_ERROR("resume of IP block <%s> failed %d\n",
3227 adev->ip_blocks[i].version->funcs->name, r);
d38ceaf9 3228 return r;
2c1a2784 3229 }
482f0e53 3230 adev->ip_blocks[i].status.hw = true;
d38ceaf9
AD
3231 }
3232
3233 return 0;
3234}
3235
e3ecdffa
AD
3236/**
3237 * amdgpu_device_ip_resume - run resume for hardware IPs
3238 *
3239 * @adev: amdgpu_device pointer
3240 *
3241 * Main resume function for hardware IPs. The hardware IPs
3242 * are split into two resume functions because they are
b8920e1e 3243 * also used in recovering from a GPU reset and some additional
e3ecdffa
AD
 3244 * steps need to be taken between them. In this case (S3/S4) they are
3245 * run sequentially.
3246 * Returns 0 on success, negative error code on failure.
3247 */
06ec9070 3248static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
fcf0649f
CZ
3249{
3250 int r;
3251
06ec9070 3252 r = amdgpu_device_ip_resume_phase1(adev);
fcf0649f
CZ
3253 if (r)
3254 return r;
7a3e0bb2
RZ
3255
3256 r = amdgpu_device_fw_loading(adev);
3257 if (r)
3258 return r;
3259
06ec9070 3260 r = amdgpu_device_ip_resume_phase2(adev);
fcf0649f
CZ
3261
3262 return r;
3263}
3264
e3ecdffa
AD
3265/**
3266 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
3267 *
3268 * @adev: amdgpu_device pointer
3269 *
3270 * Query the VBIOS data tables to determine if the board supports SR-IOV.
3271 */
4e99a44e 3272static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
048765ad 3273{
6867e1b5
ML
3274 if (amdgpu_sriov_vf(adev)) {
3275 if (adev->is_atom_fw) {
58ff791a 3276 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
6867e1b5
ML
3277 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3278 } else {
3279 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3280 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3281 }
3282
3283 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3284 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
a5bde2f9 3285 }
048765ad
AR
3286}
3287
e3ecdffa
AD
3288/**
3289 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3290 *
3291 * @asic_type: AMD asic type
3292 *
 3293 * Check if there is DC (new modesetting infrastructure) support for an asic.
3294 * returns true if DC has support, false if not.
3295 */
4562236b
HW
3296bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3297{
3298 switch (asic_type) {
0637d417
AD
3299#ifdef CONFIG_DRM_AMDGPU_SI
3300 case CHIP_HAINAN:
3301#endif
3302 case CHIP_TOPAZ:
3303 /* chips with no display hardware */
3304 return false;
4562236b 3305#if defined(CONFIG_DRM_AMD_DC)
64200c46
MR
3306 case CHIP_TAHITI:
3307 case CHIP_PITCAIRN:
3308 case CHIP_VERDE:
3309 case CHIP_OLAND:
2d32ffd6
AD
3310 /*
3311 * We have systems in the wild with these ASICs that require
3312 * LVDS and VGA support which is not supported with DC.
3313 *
3314 * Fallback to the non-DC driver here by default so as not to
3315 * cause regressions.
3316 */
3317#if defined(CONFIG_DRM_AMD_DC_SI)
3318 return amdgpu_dc > 0;
3319#else
3320 return false;
64200c46 3321#endif
4562236b 3322 case CHIP_BONAIRE:
0d6fbccb 3323 case CHIP_KAVERI:
367e6687
AD
3324 case CHIP_KABINI:
3325 case CHIP_MULLINS:
d9fda248
HW
3326 /*
3327 * We have systems in the wild with these ASICs that require
b5a0168e 3328 * VGA support which is not supported with DC.
d9fda248
HW
3329 *
3330 * Fallback to the non-DC driver here by default so as not to
3331 * cause regressions.
3332 */
3333 return amdgpu_dc > 0;
f7f12b25 3334 default:
fd187853 3335 return amdgpu_dc != 0;
f7f12b25 3336#else
4562236b 3337 default:
93b09a9a 3338 if (amdgpu_dc > 0)
b8920e1e 3339 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n");
4562236b 3340 return false;
f7f12b25 3341#endif
4562236b
HW
3342 }
3343}
3344
3345/**
3346 * amdgpu_device_has_dc_support - check if dc is supported
3347 *
982a820b 3348 * @adev: amdgpu_device pointer
4562236b
HW
3349 *
3350 * Returns true for supported, false for not supported
3351 */
3352bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3353{
25263da3 3354 if (adev->enable_virtual_display ||
abaf210c 3355 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
2555039d
XY
3356 return false;
3357
4562236b
HW
3358 return amdgpu_device_asic_has_dc_support(adev->asic_type);
3359}
3360
d4535e2c
AG
3361static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3362{
3363 struct amdgpu_device *adev =
3364 container_of(__work, struct amdgpu_device, xgmi_reset_work);
d95e8e97 3365 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
d4535e2c 3366
c6a6e2db
AG
3367 /* It's a bug to not have a hive within this function */
3368 if (WARN_ON(!hive))
3369 return;
3370
3371 /*
3372 * Use task barrier to synchronize all xgmi reset works across the
3373 * hive. task_barrier_enter and task_barrier_exit will block
3374 * until all the threads running the xgmi reset works reach
3375 * those points. task_barrier_full will do both blocks.
3376 */
3377 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3378
3379 task_barrier_enter(&hive->tb);
4a580877 3380 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
c6a6e2db
AG
3381
3382 if (adev->asic_reset_res)
3383 goto fail;
3384
3385 task_barrier_exit(&hive->tb);
4a580877 3386 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
c6a6e2db
AG
3387
3388 if (adev->asic_reset_res)
3389 goto fail;
43c4d576 3390
5e67bba3 3391 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops &&
3392 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
3393 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev);
c6a6e2db
AG
3394 } else {
3395
3396 task_barrier_full(&hive->tb);
3397 adev->asic_reset_res = amdgpu_asic_reset(adev);
3398 }
ce316fa5 3399
c6a6e2db 3400fail:
d4535e2c 3401 if (adev->asic_reset_res)
fed184e9 3402 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
4a580877 3403 adev->asic_reset_res, adev_to_drm(adev)->unique);
d95e8e97 3404 amdgpu_put_xgmi_hive(hive);
d4535e2c
AG
3405}
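/*
 * Minimal sketch of the task barrier pattern used above (API assumed from
 * <drm/task_barrier.h>): each device in the hive is assumed to register with
 * task_barrier_add_task() when it joins, and every per-node reset work then
 * keeps the BACO enter/exit steps in lock step across the hive:
 *
 *	task_barrier_enter(&hive->tb);        // wait until every node is ready
 *	r = amdgpu_device_baco_enter(ddev);   // per-node BACO enter
 *	task_barrier_exit(&hive->tb);         // wait until every node finished
 *	r = amdgpu_device_baco_exit(ddev);    // per-node BACO exit
 *
 * task_barrier_full() is the shorthand for an enter immediately followed by
 * an exit, used above when a full per-node amdgpu_asic_reset() is performed.
 */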
3406
71f98027
AD
3407static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3408{
3409 char *input = amdgpu_lockup_timeout;
3410 char *timeout_setting = NULL;
3411 int index = 0;
3412 long timeout;
3413 int ret = 0;
3414
3415 /*
67387dfe
AD
3416 * By default the timeout for non-compute jobs is 10000
3417 * and 60000 for compute jobs.
71f98027 3418 * In SR-IOV or passthrough mode, the timeout for compute
b7b2a316 3419 * jobs is 60000 by default.
71f98027
AD
3420 */
3421 adev->gfx_timeout = msecs_to_jiffies(10000);
3422 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
9882e278
ED
3423 if (amdgpu_sriov_vf(adev))
3424 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
3425 msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
71f98027 3426 else
67387dfe 3427 adev->compute_timeout = msecs_to_jiffies(60000);
71f98027 3428
f440ff44 3429 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
71f98027 3430 while ((timeout_setting = strsep(&input, ",")) &&
f440ff44 3431 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
71f98027
AD
3432 ret = kstrtol(timeout_setting, 0, &timeout);
3433 if (ret)
3434 return ret;
3435
3436 if (timeout == 0) {
3437 index++;
3438 continue;
3439 } else if (timeout < 0) {
3440 timeout = MAX_SCHEDULE_TIMEOUT;
127aedf9
CK
3441 dev_warn(adev->dev, "lockup timeout disabled");
3442 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
71f98027
AD
3443 } else {
3444 timeout = msecs_to_jiffies(timeout);
3445 }
3446
3447 switch (index++) {
3448 case 0:
3449 adev->gfx_timeout = timeout;
3450 break;
3451 case 1:
3452 adev->compute_timeout = timeout;
3453 break;
3454 case 2:
3455 adev->sdma_timeout = timeout;
3456 break;
3457 case 3:
3458 adev->video_timeout = timeout;
3459 break;
3460 default:
3461 break;
3462 }
3463 }
3464 /*
3465 * There is only one value specified and
3466 * it should apply to all non-compute jobs.
3467 */
bcccee89 3468 if (index == 1) {
71f98027 3469 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
bcccee89
ED
3470 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3471 adev->compute_timeout = adev->gfx_timeout;
3472 }
71f98027
AD
3473 }
3474
3475 return ret;
3476}
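/*
 * Worked example (hypothetical values) of the amdgpu.lockup_timeout parsing
 * above; the comma separated fields are consumed in the order gfx, compute,
 * sdma, video, a value of 0 keeps the default and a negative value disables
 * the timeout entirely:
 *
 *	lockup_timeout=5000              gfx/sdma/video = 5s; compute keeps its
 *	                                 default (or follows gfx on SR-IOV /
 *	                                 passthrough, as handled above)
 *	lockup_timeout=10000,60000,0,-1  gfx = 10s, compute = 60s,
 *	                                 sdma = default, video = disabled
 */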
d4535e2c 3477
4a74c38c
PY
3478/**
3479 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU
3480 *
3481 * @adev: amdgpu_device pointer
3482 *
3483 * RAM is direct mapped to the GPU if the IOMMU is not enabled or is in pass-through mode
3484 */
3485static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
3486{
3487 struct iommu_domain *domain;
3488
3489 domain = iommu_get_domain_for_dev(adev->dev);
3490 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
3491 adev->ram_is_direct_mapped = true;
3492}
3493
77f3a5cd 3494static const struct attribute *amdgpu_dev_attributes[] = {
77f3a5cd
ND
3495 &dev_attr_pcie_replay_count.attr,
3496 NULL
3497};
3498
02ff519e
AD
3499static void amdgpu_device_set_mcbp(struct amdgpu_device *adev)
3500{
3501 if (amdgpu_mcbp == 1)
3502 adev->gfx.mcbp = true;
1e9e15dc
JZ
3503 else if (amdgpu_mcbp == 0)
3504 adev->gfx.mcbp = false;
3505 else if ((adev->ip_versions[GC_HWIP][0] >= IP_VERSION(9, 0, 0)) &&
3506 (adev->ip_versions[GC_HWIP][0] < IP_VERSION(10, 0, 0)) &&
3507 adev->gfx.num_gfx_rings)
50a7c876
AD
3508 adev->gfx.mcbp = true;
3509
02ff519e
AD
3510 if (amdgpu_sriov_vf(adev))
3511 adev->gfx.mcbp = true;
3512
3513 if (adev->gfx.mcbp)
3514 DRM_INFO("MCBP is enabled\n");
3515}
3516
d38ceaf9
AD
3517/**
3518 * amdgpu_device_init - initialize the driver
3519 *
3520 * @adev: amdgpu_device pointer
d38ceaf9
AD
3521 * @flags: driver flags
3522 *
3523 * Initializes the driver info and hw (all asics).
3524 * Returns 0 for success or an error on failure.
3525 * Called at driver startup.
3526 */
3527int amdgpu_device_init(struct amdgpu_device *adev,
d38ceaf9
AD
3528 uint32_t flags)
3529{
8aba21b7
LT
3530 struct drm_device *ddev = adev_to_drm(adev);
3531 struct pci_dev *pdev = adev->pdev;
d38ceaf9 3532 int r, i;
b98c6299 3533 bool px = false;
95844d20 3534 u32 max_MBps;
59e9fff1 3535 int tmp;
d38ceaf9
AD
3536
3537 adev->shutdown = false;
d38ceaf9 3538 adev->flags = flags;
4e66d7d2
YZ
3539
3540 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3541 adev->asic_type = amdgpu_force_asic_type;
3542 else
3543 adev->asic_type = flags & AMD_ASIC_MASK;
3544
d38ceaf9 3545 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
593aa2d2 3546 if (amdgpu_emu_mode == 1)
8bdab6bb 3547 adev->usec_timeout *= 10;
770d13b1 3548 adev->gmc.gart_size = 512 * 1024 * 1024;
d38ceaf9
AD
3549 adev->accel_working = false;
3550 adev->num_rings = 0;
68ce8b24 3551 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub());
d38ceaf9
AD
3552 adev->mman.buffer_funcs = NULL;
3553 adev->mman.buffer_funcs_ring = NULL;
3554 adev->vm_manager.vm_pte_funcs = NULL;
0c88b430 3555 adev->vm_manager.vm_pte_num_scheds = 0;
132f34e4 3556 adev->gmc.gmc_funcs = NULL;
7bd939d0 3557 adev->harvest_ip_mask = 0x0;
f54d1867 3558 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
b8866c26 3559 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
d38ceaf9
AD
3560
3561 adev->smc_rreg = &amdgpu_invalid_rreg;
3562 adev->smc_wreg = &amdgpu_invalid_wreg;
3563 adev->pcie_rreg = &amdgpu_invalid_rreg;
3564 adev->pcie_wreg = &amdgpu_invalid_wreg;
0c552ed3
LM
3565 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext;
3566 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext;
36b9a952
HR
3567 adev->pciep_rreg = &amdgpu_invalid_rreg;
3568 adev->pciep_wreg = &amdgpu_invalid_wreg;
4fa1c6a6
TZ
3569 adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3570 adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
d38ceaf9
AD
3571 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3572 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3573 adev->didt_rreg = &amdgpu_invalid_rreg;
3574 adev->didt_wreg = &amdgpu_invalid_wreg;
ccdbb20a
RZ
3575 adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3576 adev->gc_cac_wreg = &amdgpu_invalid_wreg;
d38ceaf9
AD
3577 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3578 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3579
3e39ab90
AD
3580 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3581 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3582 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
d38ceaf9
AD
3583
3584 /* mutex initializations are all done here so we
b8920e1e
SS
3585 * can recall functions without having locking issues
3586 */
0e5ca0d1 3587 mutex_init(&adev->firmware.mutex);
d38ceaf9
AD
3588 mutex_init(&adev->pm.mutex);
3589 mutex_init(&adev->gfx.gpu_clock_mutex);
3590 mutex_init(&adev->srbm_mutex);
b8866c26 3591 mutex_init(&adev->gfx.pipe_reserve_mutex);
d23ee13f 3592 mutex_init(&adev->gfx.gfx_off_mutex);
98a54e88 3593 mutex_init(&adev->gfx.partition_mutex);
d38ceaf9 3594 mutex_init(&adev->grbm_idx_mutex);
d38ceaf9 3595 mutex_init(&adev->mn_lock);
e23b74aa 3596 mutex_init(&adev->virt.vf_errors.lock);
d38ceaf9 3597 hash_init(adev->mn_hash);
32eaeae0 3598 mutex_init(&adev->psp.mutex);
bd052211 3599 mutex_init(&adev->notifier_lock);
8cda7a4f 3600 mutex_init(&adev->pm.stable_pstate_ctx_lock);
f113cc32 3601 mutex_init(&adev->benchmark_mutex);
d38ceaf9 3602
ab3b9de6 3603 amdgpu_device_init_apu_flags(adev);
9f6a7857 3604
912dfc84
EQ
3605 r = amdgpu_device_check_arguments(adev);
3606 if (r)
3607 return r;
d38ceaf9 3608
d38ceaf9
AD
3609 spin_lock_init(&adev->mmio_idx_lock);
3610 spin_lock_init(&adev->smc_idx_lock);
3611 spin_lock_init(&adev->pcie_idx_lock);
3612 spin_lock_init(&adev->uvd_ctx_idx_lock);
3613 spin_lock_init(&adev->didt_idx_lock);
ccdbb20a 3614 spin_lock_init(&adev->gc_cac_idx_lock);
16abb5d2 3615 spin_lock_init(&adev->se_cac_idx_lock);
d38ceaf9 3616 spin_lock_init(&adev->audio_endpt_idx_lock);
95844d20 3617 spin_lock_init(&adev->mm_stats.lock);
d38ceaf9 3618
0c4e7fa5
CZ
3619 INIT_LIST_HEAD(&adev->shadow_list);
3620 mutex_init(&adev->shadow_list_lock);
3621
655ce9cb 3622 INIT_LIST_HEAD(&adev->reset_list);
3623
6492e1b0 3624 INIT_LIST_HEAD(&adev->ras_list);
3625
beff74bc
AD
3626 INIT_DELAYED_WORK(&adev->delayed_init_work,
3627 amdgpu_device_delayed_init_work_handler);
1e317b99
RZ
3628 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3629 amdgpu_device_delay_enable_gfx_off);
2dc80b00 3630
d4535e2c
AG
3631 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3632
d23ee13f 3633 adev->gfx.gfx_off_req_count = 1;
0ad7347a
AA
3634 adev->gfx.gfx_off_residency = 0;
3635 adev->gfx.gfx_off_entrycount = 0;
b6e79d9a 3636 adev->pm.ac_power = power_supply_is_system_supplied() > 0;
b1ddf548 3637
b265bdbd
EQ
3638 atomic_set(&adev->throttling_logging_enabled, 1);
3639 /*
3640 * If throttling continues, logging will be performed every minute
3641 * to avoid log flooding. "-1" is subtracted since the thermal
3642 * throttling interrupt comes every second. Thus, the total logging
3643 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3644 * for throttling interrupt) = 60 seconds.
3645 */
3646 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3647 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
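	/*
	 * Consumer-side sketch (the real user lives in the thermal interrupt
	 * path; shown here only to illustrate the state set up above):
	 *
	 *	if (__ratelimit(&adev->throttling_logging_rs))
	 *		dev_warn(adev->dev, "GPU throttled ...\n");
	 *
	 * With a (60 - 1) second interval and a burst of 1, at most one
	 * message per minute gets through while the throttling interrupt
	 * keeps firing every second.
	 */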
3648
0fa49558
AX
3649 /* Registers mapping */
3650 /* TODO: block userspace mapping of io register */
da69c161
KW
3651 if (adev->asic_type >= CHIP_BONAIRE) {
3652 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3653 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3654 } else {
3655 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3656 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3657 }
d38ceaf9 3658
6c08e0ef
EQ
3659 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
3660 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
3661
d38ceaf9 3662 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
b8920e1e 3663 if (!adev->rmmio)
d38ceaf9 3664 return -ENOMEM;
b8920e1e 3665
d38ceaf9 3666 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
b8920e1e 3667 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size);
d38ceaf9 3668
436afdfa
PY
3669 /*
3670 * Reset domain needs to be present early, before the XGMI hive is
3671 * discovered (if any) and initialized, so the reset sem and in_gpu reset
3672 * flag can be used early during init and before calling RREG32.
3673 */
3674 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
3675 if (!adev->reset_domain)
3676 return -ENOMEM;
3677
3aa0115d
ML
3678 /* detect hw virtualization here */
3679 amdgpu_detect_virtualization(adev);
3680
04e85958
TL
3681 amdgpu_device_get_pcie_info(adev);
3682
dffa11b4
ML
3683 r = amdgpu_device_get_job_timeout_settings(adev);
3684 if (r) {
3685 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
4ef87d8f 3686 return r;
a190d1c7
XY
3687 }
3688
d38ceaf9 3689 /* early init functions */
06ec9070 3690 r = amdgpu_device_ip_early_init(adev);
d38ceaf9 3691 if (r)
4ef87d8f 3692 return r;
d38ceaf9 3693
02ff519e
AD
3694 amdgpu_device_set_mcbp(adev);
3695
b7cdb41e
ML
3696 /* Get rid of things like offb */
3697 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver);
3698 if (r)
3699 return r;
3700
4d33e704
SK
3701 /* Enable TMZ based on IP_VERSION */
3702 amdgpu_gmc_tmz_set(adev);
3703
957b0787 3704 amdgpu_gmc_noretry_set(adev);
4a0165f0
VS
3705 /* Need to get xgmi info early to decide the reset behavior */
3706 if (adev->gmc.xgmi.supported) {
3707 r = adev->gfxhub.funcs->get_xgmi_info(adev);
3708 if (r)
3709 return r;
3710 }
3711
8e6d0b69 3712 /* enable PCIE atomic ops */
b4520bfd
GW
3713 if (amdgpu_sriov_vf(adev)) {
3714 if (adev->virt.fw_reserve.p_pf2vf)
3715 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
3716 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
3717 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
0e768043
YZ
3718 /* APUs with gfx9 onwards don't rely on PCIe atomics; rather, their
3719 * internal path natively supports atomics, so set have_atomics_support to true.
3720 */
b4520bfd
GW
3721 } else if ((adev->flags & AMD_IS_APU) &&
3722 (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))) {
0e768043 3723 adev->have_atomics_support = true;
b4520bfd 3724 } else {
8e6d0b69 3725 adev->have_atomics_support =
3726 !pci_enable_atomic_ops_to_root(adev->pdev,
3727 PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3728 PCI_EXP_DEVCAP2_ATOMIC_COMP64);
b4520bfd
GW
3729 }
3730
8e6d0b69 3731 if (!adev->have_atomics_support)
3732 dev_info(adev->dev, "PCIE atomic ops is not supported\n");
3733
6585661d 3734 /* doorbell bar mapping and doorbell index init */
43c064db 3735 amdgpu_doorbell_init(adev);
6585661d 3736
9475a943
SL
3737 if (amdgpu_emu_mode == 1) {
3738 /* post the asic on emulation mode */
3739 emu_soc_asic_init(adev);
bfca0289 3740 goto fence_driver_init;
9475a943 3741 }
bfca0289 3742
04442bf7
LL
3743 amdgpu_reset_init(adev);
3744
4e99a44e 3745 /* detect if we are with an SRIOV vbios */
b4520bfd
GW
3746 if (adev->bios)
3747 amdgpu_device_detect_sriov_bios(adev);
048765ad 3748
95e8e59e
AD
3749 /* check if we need to reset the asic
3750 * E.g., driver was not cleanly unloaded previously, etc.
3751 */
f14899fd 3752 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
e3c1b071 3753 if (adev->gmc.xgmi.num_physical_nodes) {
3754 dev_info(adev->dev, "Pending hive reset.\n");
3755 adev->gmc.xgmi.pending_reset = true;
3756 /* Only need to init the blocks necessary for SMU to handle the reset */
3757 for (i = 0; i < adev->num_ip_blocks; i++) {
3758 if (!adev->ip_blocks[i].status.valid)
3759 continue;
3760 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3761 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3762 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3763 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
751f43e7 3764 DRM_DEBUG("IP %s disabled for hw_init.\n",
e3c1b071 3765 adev->ip_blocks[i].version->funcs->name);
3766 adev->ip_blocks[i].status.hw = true;
3767 }
3768 }
3769 } else {
59e9fff1 3770 tmp = amdgpu_reset_method;
3771 /* It should do a default reset when loading or reloading the driver,
3772 * regardless of the module parameter reset_method.
3773 */
3774 amdgpu_reset_method = AMD_RESET_METHOD_NONE;
e3c1b071 3775 r = amdgpu_asic_reset(adev);
59e9fff1 3776 amdgpu_reset_method = tmp;
e3c1b071 3777 if (r) {
3778 dev_err(adev->dev, "asic reset on init failed\n");
3779 goto failed;
3780 }
95e8e59e
AD
3781 }
3782 }
3783
d38ceaf9 3784 /* Post card if necessary */
39c640c0 3785 if (amdgpu_device_need_post(adev)) {
d38ceaf9 3786 if (!adev->bios) {
bec86378 3787 dev_err(adev->dev, "no vBIOS found\n");
83ba126a
AD
3788 r = -EINVAL;
3789 goto failed;
d38ceaf9 3790 }
bec86378 3791 DRM_INFO("GPU posting now...\n");
4d2997ab 3792 r = amdgpu_device_asic_init(adev);
4e99a44e
ML
3793 if (r) {
3794 dev_err(adev->dev, "gpu post error!\n");
3795 goto failed;
3796 }
d38ceaf9
AD
3797 }
3798
9535a86a
SZ
3799 if (adev->bios) {
3800 if (adev->is_atom_fw) {
3801 /* Initialize clocks */
3802 r = amdgpu_atomfirmware_get_clock_info(adev);
3803 if (r) {
3804 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3805 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3806 goto failed;
3807 }
3808 } else {
3809 /* Initialize clocks */
3810 r = amdgpu_atombios_get_clock_info(adev);
3811 if (r) {
3812 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3813 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3814 goto failed;
3815 }
3816 /* init i2c buses */
3817 if (!amdgpu_device_has_dc_support(adev))
3818 amdgpu_atombios_i2c_init(adev);
a5bde2f9 3819 }
2c1a2784 3820 }
d38ceaf9 3821
bfca0289 3822fence_driver_init:
d38ceaf9 3823 /* Fence driver */
067f44c8 3824 r = amdgpu_fence_driver_sw_init(adev);
2c1a2784 3825 if (r) {
067f44c8 3826 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
e23b74aa 3827 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
83ba126a 3828 goto failed;
2c1a2784 3829 }
d38ceaf9
AD
3830
3831 /* init the mode config */
4a580877 3832 drm_mode_config_init(adev_to_drm(adev));
d38ceaf9 3833
06ec9070 3834 r = amdgpu_device_ip_init(adev);
d38ceaf9 3835 if (r) {
06ec9070 3836 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
e23b74aa 3837 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
970fd197 3838 goto release_ras_con;
d38ceaf9
AD
3839 }
3840
8d35a259
LG
3841 amdgpu_fence_driver_hw_init(adev);
3842
d69b8971
YZ
3843 dev_info(adev->dev,
3844 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
d7f72fe4
YZ
3845 adev->gfx.config.max_shader_engines,
3846 adev->gfx.config.max_sh_per_se,
3847 adev->gfx.config.max_cu_per_sh,
3848 adev->gfx.cu_info.number);
3849
d38ceaf9
AD
3850 adev->accel_working = true;
3851
e59c0205
AX
3852 amdgpu_vm_check_compute_bug(adev);
3853
95844d20
MO
3854 /* Initialize the buffer migration limit. */
3855 if (amdgpu_moverate >= 0)
3856 max_MBps = amdgpu_moverate;
3857 else
3858 max_MBps = 8; /* Allow 8 MB/s. */
3859 /* Get a log2 for easy divisions. */
3860 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
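	/*
	 * Example of the "log2 for easy divisions" above: with max_MBps = 8,
	 * log2_max_MBps = 3, so a consumer can approximate
	 * "bytes / (max_MBps MB/s)" with a shift, roughly
	 * bytes >> (20 + adev->mm_stats.log2_max_MBps), instead of a divide.
	 * Illustration only; the actual accounting lives in the CS code.
	 */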
3861
184d8384
LL
3862 r = amdgpu_atombios_sysfs_init(adev);
3863 if (r)
3864 drm_err(&adev->ddev,
3865 "registering atombios sysfs failed (%d).\n", r);
3866
d2f52ac8 3867 r = amdgpu_pm_sysfs_init(adev);
53e9d836
GC
3868 if (r)
3869 DRM_ERROR("registering pm sysfs failed (%d).\n", r);
d2f52ac8 3870
5bb23532 3871 r = amdgpu_ucode_sysfs_init(adev);
7c868b59
YT
3872 if (r) {
3873 adev->ucode_sysfs_en = false;
5bb23532 3874 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
7c868b59
YT
3875 } else
3876 adev->ucode_sysfs_en = true;
5bb23532 3877
b0adca4d
EQ
3878 /*
3879 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3880 * Otherwise the mgpu fan boost feature will be skipped because the
3881 * gpu instance count would be too low.
3882 */
3883 amdgpu_register_gpu_instance(adev);
3884
d38ceaf9
AD
3885 /* enable clockgating, etc. after ib tests, etc. since some blocks require
3886 * explicit gating rather than handling it automatically.
3887 */
e3c1b071 3888 if (!adev->gmc.xgmi.pending_reset) {
3889 r = amdgpu_device_ip_late_init(adev);
3890 if (r) {
3891 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3892 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
970fd197 3893 goto release_ras_con;
e3c1b071 3894 }
3895 /* must succeed. */
3896 amdgpu_ras_resume(adev);
3897 queue_delayed_work(system_wq, &adev->delayed_init_work,
3898 msecs_to_jiffies(AMDGPU_RESUME_MS));
2c1a2784 3899 }
d38ceaf9 3900
38eecbe0
CL
3901 if (amdgpu_sriov_vf(adev)) {
3902 amdgpu_virt_release_full_gpu(adev, true);
2c738637 3903 flush_delayed_work(&adev->delayed_init_work);
38eecbe0 3904 }
2c738637 3905
77f3a5cd 3906 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
5aea5327 3907 if (r)
77f3a5cd 3908 dev_err(adev->dev, "Could not create amdgpu device attr\n");
bd607166 3909
7957ec80
LL
3910 amdgpu_fru_sysfs_init(adev);
3911
d155bef0
AB
3912 if (IS_ENABLED(CONFIG_PERF_EVENTS))
3913 r = amdgpu_pmu_init(adev);
9c7c85f7
JK
3914 if (r)
3915 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3916
c1dd4aa6
AG
3917 /* Have stored pci confspace at hand for restore in sudden PCI error */
3918 if (amdgpu_device_cache_pci_state(adev->pdev))
3919 pci_restore_state(pdev);
3920
8c3dd61c
KHF
3921 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
3922 /* this will fail for cards that aren't VGA class devices, just
b8920e1e
SS
3923 * ignore it
3924 */
8c3dd61c 3925 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
bf44e8ce 3926 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
8c3dd61c 3927
d37a3929
OC
3928 px = amdgpu_device_supports_px(ddev);
3929
3930 if (px || (!pci_is_thunderbolt_attached(adev->pdev) &&
3931 apple_gmux_detect(NULL, NULL)))
8c3dd61c
KHF
3932 vga_switcheroo_register_client(adev->pdev,
3933 &amdgpu_switcheroo_ops, px);
d37a3929
OC
3934
3935 if (px)
8c3dd61c 3936 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
8c3dd61c 3937
e3c1b071 3938 if (adev->gmc.xgmi.pending_reset)
3939 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
3940 msecs_to_jiffies(AMDGPU_RESUME_MS));
3941
4a74c38c
PY
3942 amdgpu_device_check_iommu_direct_map(adev);
3943
d38ceaf9 3944 return 0;
83ba126a 3945
970fd197 3946release_ras_con:
38eecbe0
CL
3947 if (amdgpu_sriov_vf(adev))
3948 amdgpu_virt_release_full_gpu(adev, true);
3949
3950 /* failed in exclusive mode due to timeout */
3951 if (amdgpu_sriov_vf(adev) &&
3952 !amdgpu_sriov_runtime(adev) &&
3953 amdgpu_virt_mmio_blocked(adev) &&
3954 !amdgpu_virt_wait_reset(adev)) {
3955 dev_err(adev->dev, "VF exclusive mode timeout\n");
3956 /* Don't send request since VF is inactive. */
3957 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3958 adev->virt.ops = NULL;
3959 r = -EAGAIN;
3960 }
970fd197
SY
3961 amdgpu_release_ras_context(adev);
3962
83ba126a 3963failed:
89041940 3964 amdgpu_vf_error_trans_all(adev);
8840a387 3965
83ba126a 3966 return r;
d38ceaf9
AD
3967}
3968
07775fc1
AG
3969static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
3970{
62d5f9f7 3971
07775fc1
AG
3972 /* Clear all CPU mappings pointing to this device */
3973 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
3974
3975 /* Unmap all mapped bars - Doorbell, registers and VRAM */
43c064db 3976 amdgpu_doorbell_fini(adev);
07775fc1
AG
3977
3978 iounmap(adev->rmmio);
3979 adev->rmmio = NULL;
3980 if (adev->mman.aper_base_kaddr)
3981 iounmap(adev->mman.aper_base_kaddr);
3982 adev->mman.aper_base_kaddr = NULL;
3983
3984 /* Memory manager related */
a0ba1279 3985 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
07775fc1
AG
3986 arch_phys_wc_del(adev->gmc.vram_mtrr);
3987 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
3988 }
3989}
3990
d38ceaf9 3991/**
bbe04dec 3992 * amdgpu_device_fini_hw - tear down the driver
d38ceaf9
AD
3993 *
3994 * @adev: amdgpu_device pointer
3995 *
3996 * Tear down the driver info (all asics).
3997 * Called at driver shutdown.
3998 */
72c8c97b 3999void amdgpu_device_fini_hw(struct amdgpu_device *adev)
d38ceaf9 4000{
aac89168 4001 dev_info(adev->dev, "amdgpu: finishing device.\n");
9f875167 4002 flush_delayed_work(&adev->delayed_init_work);
d0d13fe8 4003 adev->shutdown = true;
9f875167 4004
752c683d
ML
4005 /* make sure the IB test has finished before entering exclusive mode
4006 * to avoid preemption during the IB test
b8920e1e 4007 */
519b8b76 4008 if (amdgpu_sriov_vf(adev)) {
752c683d 4009 amdgpu_virt_request_full_gpu(adev, false);
519b8b76
BZ
4010 amdgpu_virt_fini_data_exchange(adev);
4011 }
752c683d 4012
e5b03032
ML
4013 /* disable all interrupts */
4014 amdgpu_irq_disable_all(adev);
47fc644f 4015 if (adev->mode_info.mode_config_initialized) {
1053b9c9 4016 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
4a580877 4017 drm_helper_force_disable_all(adev_to_drm(adev));
ff97cba8 4018 else
4a580877 4019 drm_atomic_helper_shutdown(adev_to_drm(adev));
ff97cba8 4020 }
8d35a259 4021 amdgpu_fence_driver_hw_fini(adev);
72c8c97b 4022
cd3a8a59 4023 if (adev->mman.initialized)
9bff18d1 4024 drain_workqueue(adev->mman.bdev.wq);
98f56188 4025
53e9d836 4026 if (adev->pm.sysfs_initialized)
7c868b59 4027 amdgpu_pm_sysfs_fini(adev);
72c8c97b
AG
4028 if (adev->ucode_sysfs_en)
4029 amdgpu_ucode_sysfs_fini(adev);
4030 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
7957ec80 4031 amdgpu_fru_sysfs_fini(adev);
72c8c97b 4032
232d1d43
SY
4033 /* disable ras feature must before hw fini */
4034 amdgpu_ras_pre_fini(adev);
4035
e9669fb7 4036 amdgpu_device_ip_fini_early(adev);
d10d0daa 4037
a3848df6
YW
4038 amdgpu_irq_fini_hw(adev);
4039
b6fd6e0f
SK
4040 if (adev->mman.initialized)
4041 ttm_device_clear_dma_mappings(&adev->mman.bdev);
894c6890 4042
d10d0daa 4043 amdgpu_gart_dummy_page_fini(adev);
07775fc1 4044
39934d3e
VP
4045 if (drm_dev_is_unplugged(adev_to_drm(adev)))
4046 amdgpu_device_unmap_mmio(adev);
87172e89 4047
72c8c97b
AG
4048}
4049
4050void amdgpu_device_fini_sw(struct amdgpu_device *adev)
4051{
62d5f9f7 4052 int idx;
d37a3929 4053 bool px;
62d5f9f7 4054
8d35a259 4055 amdgpu_fence_driver_sw_fini(adev);
a5c5d8d5 4056 amdgpu_device_ip_fini(adev);
b31d3063 4057 amdgpu_ucode_release(&adev->firmware.gpu_info_fw);
d38ceaf9 4058 adev->accel_working = false;
68ce8b24 4059 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
04442bf7
LL
4060
4061 amdgpu_reset_fini(adev);
4062
d38ceaf9 4063 /* free i2c buses */
4562236b
HW
4064 if (!amdgpu_device_has_dc_support(adev))
4065 amdgpu_i2c_fini(adev);
bfca0289
SL
4066
4067 if (amdgpu_emu_mode != 1)
4068 amdgpu_atombios_fini(adev);
4069
d38ceaf9
AD
4070 kfree(adev->bios);
4071 adev->bios = NULL;
d37a3929
OC
4072
4073 px = amdgpu_device_supports_px(adev_to_drm(adev));
4074
4075 if (px || (!pci_is_thunderbolt_attached(adev->pdev) &&
4076 apple_gmux_detect(NULL, NULL)))
84c8b22e 4077 vga_switcheroo_unregister_client(adev->pdev);
d37a3929
OC
4078
4079 if (px)
83ba126a 4080 vga_switcheroo_fini_domain_pm_ops(adev->dev);
d37a3929 4081
38d6be81 4082 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
b8779475 4083 vga_client_unregister(adev->pdev);
e9bc1bf7 4084
62d5f9f7
LS
4085 if (drm_dev_enter(adev_to_drm(adev), &idx)) {
4086
4087 iounmap(adev->rmmio);
4088 adev->rmmio = NULL;
43c064db 4089 amdgpu_doorbell_fini(adev);
62d5f9f7
LS
4090 drm_dev_exit(idx);
4091 }
4092
d155bef0
AB
4093 if (IS_ENABLED(CONFIG_PERF_EVENTS))
4094 amdgpu_pmu_fini(adev);
72de33f8 4095 if (adev->mman.discovery_bin)
a190d1c7 4096 amdgpu_discovery_fini(adev);
72c8c97b 4097
cfbb6b00
AG
4098 amdgpu_reset_put_reset_domain(adev->reset_domain);
4099 adev->reset_domain = NULL;
4100
72c8c97b
AG
4101 kfree(adev->pci_state);
4102
d38ceaf9
AD
4103}
4104
58144d28
ND
4105/**
4106 * amdgpu_device_evict_resources - evict device resources
4107 * @adev: amdgpu device object
4108 *
4109 * Evicts all ttm device resources (vram BOs, gart table) from the lru list
4110 * of the vram memory type. Mainly used for evicting device resources
4111 * at suspend time.
4112 *
4113 */
7863c155 4114static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
58144d28 4115{
7863c155
ML
4116 int ret;
4117
e53d9665
ML
4118 /* No need to evict vram on APUs for suspend to ram or s2idle */
4119 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU))
7863c155 4120 return 0;
58144d28 4121
7863c155
ML
4122 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM);
4123 if (ret)
58144d28 4124 DRM_WARN("evicting device resources failed\n");
7863c155 4125 return ret;
58144d28 4126}
d38ceaf9
AD
4127
4128/*
4129 * Suspend & resume.
4130 */
4131/**
810ddc3a 4132 * amdgpu_device_suspend - initiate device suspend
d38ceaf9 4133 *
87e3f136 4134 * @dev: drm dev pointer
87e3f136 4135 * @fbcon: notify the fbdev of suspend
d38ceaf9
AD
4136 *
4137 * Puts the hw in the suspend state (all asics).
4138 * Returns 0 for success or an error on failure.
4139 * Called at driver suspend.
4140 */
de185019 4141int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
d38ceaf9 4142{
a2e15b0e 4143 struct amdgpu_device *adev = drm_to_adev(dev);
d7274ec7 4144 int r = 0;
d38ceaf9 4145
d38ceaf9
AD
4146 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4147 return 0;
4148
44779b43 4149 adev->in_suspend = true;
3fa8f89d 4150
47ea2076
SF
4151 /* Evict the majority of BOs before grabbing the full access */
4152 r = amdgpu_device_evict_resources(adev);
4153 if (r)
4154 return r;
4155
d7274ec7
BZ
4156 if (amdgpu_sriov_vf(adev)) {
4157 amdgpu_virt_fini_data_exchange(adev);
4158 r = amdgpu_virt_request_full_gpu(adev, false);
4159 if (r)
4160 return r;
4161 }
4162
3fa8f89d
S
4163 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
4164 DRM_WARN("smart shift update failed\n");
4165
5f818173 4166 if (fbcon)
087451f3 4167 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true);
5f818173 4168
beff74bc 4169 cancel_delayed_work_sync(&adev->delayed_init_work);
0dee7263 4170 flush_delayed_work(&adev->gfx.gfx_off_delay_work);
a5459475 4171
5e6932fe 4172 amdgpu_ras_suspend(adev);
4173
2196927b 4174 amdgpu_device_ip_suspend_phase1(adev);
fe1053b7 4175
c004d44e 4176 if (!adev->in_s0ix)
5d3a2d95 4177 amdgpu_amdkfd_suspend(adev, adev->in_runpm);
94fa5660 4178
7863c155
ML
4179 r = amdgpu_device_evict_resources(adev);
4180 if (r)
4181 return r;
d38ceaf9 4182
8d35a259 4183 amdgpu_fence_driver_hw_fini(adev);
d38ceaf9 4184
2196927b 4185 amdgpu_device_ip_suspend_phase2(adev);
d38ceaf9 4186
d7274ec7
BZ
4187 if (amdgpu_sriov_vf(adev))
4188 amdgpu_virt_release_full_gpu(adev, false);
4189
d38ceaf9
AD
4190 return 0;
4191}
4192
4193/**
810ddc3a 4194 * amdgpu_device_resume - initiate device resume
d38ceaf9 4195 *
87e3f136 4196 * @dev: drm dev pointer
87e3f136 4197 * @fbcon: notify the fbdev of resume
d38ceaf9
AD
4198 *
4199 * Bring the hw back to operating state (all asics).
4200 * Returns 0 for success or an error on failure.
4201 * Called at driver resume.
4202 */
de185019 4203int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
d38ceaf9 4204{
1348969a 4205 struct amdgpu_device *adev = drm_to_adev(dev);
03161a6e 4206 int r = 0;
d38ceaf9 4207
d7274ec7
BZ
4208 if (amdgpu_sriov_vf(adev)) {
4209 r = amdgpu_virt_request_full_gpu(adev, true);
4210 if (r)
4211 return r;
4212 }
4213
d38ceaf9
AD
4214 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4215 return 0;
4216
62498733 4217 if (adev->in_s0ix)
bc143d8b 4218 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry);
628c36d7 4219
d38ceaf9 4220 /* post card */
39c640c0 4221 if (amdgpu_device_need_post(adev)) {
4d2997ab 4222 r = amdgpu_device_asic_init(adev);
74b0b157 4223 if (r)
aac89168 4224 dev_err(adev->dev, "amdgpu asic init failed\n");
74b0b157 4225 }
d38ceaf9 4226
06ec9070 4227 r = amdgpu_device_ip_resume(adev);
d7274ec7 4228
e6707218 4229 if (r) {
aac89168 4230 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
3c22c1ea 4231 goto exit;
e6707218 4232 }
8d35a259 4233 amdgpu_fence_driver_hw_init(adev);
5ceb54c6 4234
06ec9070 4235 r = amdgpu_device_ip_late_init(adev);
03161a6e 4236 if (r)
3c22c1ea 4237 goto exit;
d38ceaf9 4238
beff74bc
AD
4239 queue_delayed_work(system_wq, &adev->delayed_init_work,
4240 msecs_to_jiffies(AMDGPU_RESUME_MS));
4241
c004d44e 4242 if (!adev->in_s0ix) {
5d3a2d95
AD
4243 r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
4244 if (r)
3c22c1ea 4245 goto exit;
5d3a2d95 4246 }
756e6880 4247
3c22c1ea
SF
4248exit:
4249 if (amdgpu_sriov_vf(adev)) {
4250 amdgpu_virt_init_data_exchange(adev);
4251 amdgpu_virt_release_full_gpu(adev, true);
4252 }
4253
4254 if (r)
4255 return r;
4256
96a5d8d4 4257 /* Make sure IB tests flushed */
beff74bc 4258 flush_delayed_work(&adev->delayed_init_work);
96a5d8d4 4259
a2e15b0e 4260 if (fbcon)
087451f3 4261 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false);
d38ceaf9 4262
5e6932fe 4263 amdgpu_ras_resume(adev);
4264
d09ef243
AD
4265 if (adev->mode_info.num_crtc) {
4266 /*
4267 * Most of the connector probing functions try to acquire runtime pm
4268 * refs to ensure that the GPU is powered on when connector polling is
4269 * performed. Since we're calling this from a runtime PM callback,
4270 * trying to acquire rpm refs will cause us to deadlock.
4271 *
4272 * Since we're guaranteed to be holding the rpm lock, it's safe to
4273 * temporarily disable the rpm helpers so this doesn't deadlock us.
4274 */
23a1a9e5 4275#ifdef CONFIG_PM
d09ef243 4276 dev->dev->power.disable_depth++;
23a1a9e5 4277#endif
d09ef243
AD
4278 if (!adev->dc_enabled)
4279 drm_helper_hpd_irq_event(dev);
4280 else
4281 drm_kms_helper_hotplug_event(dev);
23a1a9e5 4282#ifdef CONFIG_PM
d09ef243 4283 dev->dev->power.disable_depth--;
23a1a9e5 4284#endif
d09ef243 4285 }
44779b43
RZ
4286 adev->in_suspend = false;
4287
dc907c9d
JX
4288 if (adev->enable_mes)
4289 amdgpu_mes_self_test(adev);
4290
3fa8f89d
S
4291 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
4292 DRM_WARN("smart shift update failed\n");
4293
4d3b9ae5 4294 return 0;
d38ceaf9
AD
4295}
4296
e3ecdffa
AD
4297/**
4298 * amdgpu_device_ip_check_soft_reset - did soft reset succeed
4299 *
4300 * @adev: amdgpu_device pointer
4301 *
4302 * The list of all the hardware IPs that make up the asic is walked and
4303 * the check_soft_reset callbacks are run. check_soft_reset determines
4304 * if the asic is still hung or not.
4305 * Returns true if any of the IPs are still in a hung state, false if not.
4306 */
06ec9070 4307static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
63fbf42f
CZ
4308{
4309 int i;
4310 bool asic_hang = false;
4311
f993d628
ML
4312 if (amdgpu_sriov_vf(adev))
4313 return true;
4314
8bc04c29
AD
4315 if (amdgpu_asic_need_full_reset(adev))
4316 return true;
4317
63fbf42f 4318 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 4319 if (!adev->ip_blocks[i].status.valid)
63fbf42f 4320 continue;
a1255107
AD
4321 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
4322 adev->ip_blocks[i].status.hang =
4323 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
4324 if (adev->ip_blocks[i].status.hang) {
aac89168 4325 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
63fbf42f
CZ
4326 asic_hang = true;
4327 }
4328 }
4329 return asic_hang;
4330}
4331
e3ecdffa
AD
4332/**
4333 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
4334 *
4335 * @adev: amdgpu_device pointer
4336 *
4337 * The list of all the hardware IPs that make up the asic is walked and the
4338 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset
4339 * handles any IP specific hardware or software state changes that are
4340 * necessary for a soft reset to succeed.
4341 * Returns 0 on success, negative error code on failure.
4342 */
06ec9070 4343static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
d31a501e
CZ
4344{
4345 int i, r = 0;
4346
4347 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 4348 if (!adev->ip_blocks[i].status.valid)
d31a501e 4349 continue;
a1255107
AD
4350 if (adev->ip_blocks[i].status.hang &&
4351 adev->ip_blocks[i].version->funcs->pre_soft_reset) {
4352 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
d31a501e
CZ
4353 if (r)
4354 return r;
4355 }
4356 }
4357
4358 return 0;
4359}
4360
e3ecdffa
AD
4361/**
4362 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
4363 *
4364 * @adev: amdgpu_device pointer
4365 *
4366 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu
4367 * reset is necessary to recover.
4368 * Returns true if a full asic reset is required, false if not.
4369 */
06ec9070 4370static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
35d782fe 4371{
da146d3b
AD
4372 int i;
4373
8bc04c29
AD
4374 if (amdgpu_asic_need_full_reset(adev))
4375 return true;
4376
da146d3b 4377 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 4378 if (!adev->ip_blocks[i].status.valid)
da146d3b 4379 continue;
a1255107
AD
4380 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
4381 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
4382 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
98512bb8
KW
4383 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
4384 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
a1255107 4385 if (adev->ip_blocks[i].status.hang) {
aac89168 4386 dev_info(adev->dev, "Some block need full reset!\n");
da146d3b
AD
4387 return true;
4388 }
4389 }
35d782fe
CZ
4390 }
4391 return false;
4392}
4393
e3ecdffa
AD
4394/**
4395 * amdgpu_device_ip_soft_reset - do a soft reset
4396 *
4397 * @adev: amdgpu_device pointer
4398 *
4399 * The list of all the hardware IPs that make up the asic is walked and the
4400 * soft_reset callbacks are run if the block is hung. soft_reset handles any
4401 * IP specific hardware or software state changes that are necessary to soft
4402 * reset the IP.
4403 * Returns 0 on success, negative error code on failure.
4404 */
06ec9070 4405static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
35d782fe
CZ
4406{
4407 int i, r = 0;
4408
4409 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 4410 if (!adev->ip_blocks[i].status.valid)
35d782fe 4411 continue;
a1255107
AD
4412 if (adev->ip_blocks[i].status.hang &&
4413 adev->ip_blocks[i].version->funcs->soft_reset) {
4414 r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
35d782fe
CZ
4415 if (r)
4416 return r;
4417 }
4418 }
4419
4420 return 0;
4421}
4422
e3ecdffa
AD
4423/**
4424 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
4425 *
4426 * @adev: amdgpu_device pointer
4427 *
4428 * The list of all the hardware IPs that make up the asic is walked and the
4429 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
4430 * handles any IP specific hardware or software state changes that are
4431 * necessary after the IP has been soft reset.
4432 * Returns 0 on success, negative error code on failure.
4433 */
06ec9070 4434static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
35d782fe
CZ
4435{
4436 int i, r = 0;
4437
4438 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 4439 if (!adev->ip_blocks[i].status.valid)
35d782fe 4440 continue;
a1255107
AD
4441 if (adev->ip_blocks[i].status.hang &&
4442 adev->ip_blocks[i].version->funcs->post_soft_reset)
4443 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
35d782fe
CZ
4444 if (r)
4445 return r;
4446 }
4447
4448 return 0;
4449}
4450
e3ecdffa 4451/**
c33adbc7 4452 * amdgpu_device_recover_vram - Recover some VRAM contents
e3ecdffa
AD
4453 *
4454 * @adev: amdgpu_device pointer
4455 *
4456 * Restores the contents of VRAM buffers from the shadows in GTT. Used to
4457 * restore things like GPUVM page tables after a GPU reset where
4458 * the contents of VRAM might be lost.
403009bf
CK
4459 *
4460 * Returns:
4461 * 0 on success, negative error code on failure.
e3ecdffa 4462 */
c33adbc7 4463static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
c41d1cf6 4464{
c41d1cf6 4465 struct dma_fence *fence = NULL, *next = NULL;
403009bf 4466 struct amdgpu_bo *shadow;
e18aaea7 4467 struct amdgpu_bo_vm *vmbo;
403009bf 4468 long r = 1, tmo;
c41d1cf6
ML
4469
4470 if (amdgpu_sriov_runtime(adev))
b045d3af 4471 tmo = msecs_to_jiffies(8000);
c41d1cf6
ML
4472 else
4473 tmo = msecs_to_jiffies(100);
4474
aac89168 4475 dev_info(adev->dev, "recover vram bo from shadow start\n");
c41d1cf6 4476 mutex_lock(&adev->shadow_list_lock);
e18aaea7 4477 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) {
4994d1f0
LC
4478 /* If vm is compute context or adev is APU, shadow will be NULL */
4479 if (!vmbo->shadow)
4480 continue;
4481 shadow = vmbo->shadow;
4482
403009bf 4483 /* No need to recover an evicted BO */
d3116756
CK
4484 if (shadow->tbo.resource->mem_type != TTM_PL_TT ||
4485 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET ||
4486 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM)
403009bf
CK
4487 continue;
4488
4489 r = amdgpu_bo_restore_shadow(shadow, &next);
4490 if (r)
4491 break;
4492
c41d1cf6 4493 if (fence) {
1712fb1a 4494 tmo = dma_fence_wait_timeout(fence, false, tmo);
403009bf
CK
4495 dma_fence_put(fence);
4496 fence = next;
1712fb1a 4497 if (tmo == 0) {
4498 r = -ETIMEDOUT;
c41d1cf6 4499 break;
1712fb1a 4500 } else if (tmo < 0) {
4501 r = tmo;
4502 break;
4503 }
403009bf
CK
4504 } else {
4505 fence = next;
c41d1cf6 4506 }
c41d1cf6
ML
4507 }
4508 mutex_unlock(&adev->shadow_list_lock);
4509
403009bf
CK
4510 if (fence)
4511 tmo = dma_fence_wait_timeout(fence, false, tmo);
c41d1cf6
ML
4512 dma_fence_put(fence);
4513
1712fb1a 4514 if (r < 0 || tmo <= 0) {
aac89168 4515 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
403009bf
CK
4516 return -EIO;
4517 }
c41d1cf6 4518
aac89168 4519 dev_info(adev->dev, "recover vram bo from shadow done\n");
403009bf 4520 return 0;
c41d1cf6
ML
4521}
4522
a90ad3c2 4523
e3ecdffa 4524/**
06ec9070 4525 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
5740682e 4526 *
982a820b 4527 * @adev: amdgpu_device pointer
87e3f136 4528 * @from_hypervisor: request from hypervisor
5740682e
ML
4529 *
4530 * do VF FLR and reinitialize Asic
3f48c681 4531 * return 0 means succeeded otherwise failed
e3ecdffa
AD
4532 */
4533static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4534 bool from_hypervisor)
5740682e
ML
4535{
4536 int r;
a5f67c93 4537 struct amdgpu_hive_info *hive = NULL;
7258fa31 4538 int retry_limit = 0;
5740682e 4539
7258fa31 4540retry:
c004d44e 4541 amdgpu_amdkfd_pre_reset(adev);
428890a3 4542
5740682e
ML
4543 if (from_hypervisor)
4544 r = amdgpu_virt_request_full_gpu(adev, true);
4545 else
4546 r = amdgpu_virt_reset_gpu(adev);
4547 if (r)
4548 return r;
f734b213 4549 amdgpu_irq_gpu_reset_resume_helper(adev);
a90ad3c2 4550
83f24a8f
HC
4551 /* some sw clean up VF needs to do before recover */
4552 amdgpu_virt_post_reset(adev);
4553
a90ad3c2 4554 /* Resume IP prior to SMC */
06ec9070 4555 r = amdgpu_device_ip_reinit_early_sriov(adev);
5740682e
ML
4556 if (r)
4557 goto error;
a90ad3c2 4558
c9ffa427 4559 amdgpu_virt_init_data_exchange(adev);
a90ad3c2 4560
7a3e0bb2
RZ
4561 r = amdgpu_device_fw_loading(adev);
4562 if (r)
4563 return r;
4564
a90ad3c2 4565 /* now we are okay to resume SMC/CP/SDMA */
06ec9070 4566 r = amdgpu_device_ip_reinit_late_sriov(adev);
5740682e
ML
4567 if (r)
4568 goto error;
a90ad3c2 4569
a5f67c93
ZL
4570 hive = amdgpu_get_xgmi_hive(adev);
4571 /* Update PSP FW topology after reset */
4572 if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
4573 r = amdgpu_xgmi_update_topology(hive, adev);
4574
4575 if (hive)
4576 amdgpu_put_xgmi_hive(hive);
4577
4578 if (!r) {
a5f67c93 4579 r = amdgpu_ib_ring_tests(adev);
9c12f5cd 4580
c004d44e 4581 amdgpu_amdkfd_post_reset(adev);
a5f67c93 4582 }
a90ad3c2 4583
abc34253 4584error:
c41d1cf6 4585 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
e3526257 4586 amdgpu_inc_vram_lost(adev);
c33adbc7 4587 r = amdgpu_device_recover_vram(adev);
a90ad3c2 4588 }
437f3e0b 4589 amdgpu_virt_release_full_gpu(adev, true);
a90ad3c2 4590
7258fa31
SK
4591 if (AMDGPU_RETRY_SRIOV_RESET(r)) {
4592 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) {
4593 retry_limit++;
4594 goto retry;
4595 } else
4596 DRM_ERROR("GPU reset retry is beyond the retry limit\n");
4597 }
4598
a90ad3c2
ML
4599 return r;
4600}
4601
9a1cddd6 4602/**
4603 * amdgpu_device_has_job_running - check if there is any job in the pending list
4604 *
982a820b 4605 * @adev: amdgpu_device pointer
9a1cddd6 4606 *
4607 * check if there is any job in the pending list
4608 */
4609bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4610{
4611 int i;
4612 struct drm_sched_job *job;
4613
4614 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4615 struct amdgpu_ring *ring = adev->rings[i];
4616
4617 if (!ring || !ring->sched.thread)
4618 continue;
4619
4620 spin_lock(&ring->sched.job_list_lock);
6efa4b46
LT
4621 job = list_first_entry_or_null(&ring->sched.pending_list,
4622 struct drm_sched_job, list);
9a1cddd6 4623 spin_unlock(&ring->sched.job_list_lock);
4624 if (job)
4625 return true;
4626 }
4627 return false;
4628}
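/*
 * Hypothetical caller sketch (the real checks live in the runtime-PM and
 * reset paths): a suspend or unload decision can simply bail out while any
 * scheduler still has a pending job:
 *
 *	if (amdgpu_device_has_job_running(adev))
 *		return -EBUSY;	// defer, jobs are still queued
 */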
4629
12938fad
CK
4630/**
4631 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4632 *
982a820b 4633 * @adev: amdgpu_device pointer
12938fad
CK
4634 *
4635 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4636 * a hung GPU.
4637 */
4638bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4639{
12938fad 4640
3ba7b418
AG
4641 if (amdgpu_gpu_recovery == 0)
4642 goto disabled;
4643
1a11a65d
YC
4644 /* Skip soft reset check in fatal error mode */
4645 if (!amdgpu_ras_is_poison_mode_supported(adev))
4646 return true;
4647
3ba7b418
AG
4648 if (amdgpu_sriov_vf(adev))
4649 return true;
4650
4651 if (amdgpu_gpu_recovery == -1) {
4652 switch (adev->asic_type) {
b3523c45
AD
4653#ifdef CONFIG_DRM_AMDGPU_SI
4654 case CHIP_VERDE:
4655 case CHIP_TAHITI:
4656 case CHIP_PITCAIRN:
4657 case CHIP_OLAND:
4658 case CHIP_HAINAN:
4659#endif
4660#ifdef CONFIG_DRM_AMDGPU_CIK
4661 case CHIP_KAVERI:
4662 case CHIP_KABINI:
4663 case CHIP_MULLINS:
4664#endif
4665 case CHIP_CARRIZO:
4666 case CHIP_STONEY:
4667 case CHIP_CYAN_SKILLFISH:
3ba7b418 4668 goto disabled;
b3523c45
AD
4669 default:
4670 break;
3ba7b418 4671 }
12938fad
CK
4672 }
4673
4674 return true;
3ba7b418
AG
4675
4676disabled:
aac89168 4677 dev_info(adev->dev, "GPU recovery disabled.\n");
3ba7b418 4678 return false;
12938fad
CK
4679}
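/*
 * For reference, the amdgpu.gpu_recovery module parameter consumed above
 * (semantics as used in the checks; treat this as an informal summary):
 *   gpu_recovery =  0  -> recovery disabled
 *   gpu_recovery =  1  -> recovery always attempted
 *   gpu_recovery = -1  -> auto: attempted except on the pre-GFX9 parts
 *                         listed above
 * Unless explicitly disabled, SR-IOV VFs and RAS fatal-error (non-poison)
 * cases always attempt recovery.
 */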
4680
5c03e584
FX
4681int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
4682{
47fc644f
SS
4683 u32 i;
4684 int ret = 0;
5c03e584 4685
47fc644f 4686 amdgpu_atombios_scratch_regs_engine_hung(adev, true);
5c03e584 4687
47fc644f 4688 dev_info(adev->dev, "GPU mode1 reset\n");
5c03e584 4689
47fc644f
SS
4690 /* disable BM */
4691 pci_clear_master(adev->pdev);
5c03e584 4692
47fc644f 4693 amdgpu_device_cache_pci_state(adev->pdev);
5c03e584 4694
47fc644f
SS
4695 if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
4696 dev_info(adev->dev, "GPU smu mode1 reset\n");
4697 ret = amdgpu_dpm_mode1_reset(adev);
4698 } else {
4699 dev_info(adev->dev, "GPU psp mode1 reset\n");
4700 ret = psp_gpu_reset(adev);
4701 }
5c03e584 4702
47fc644f 4703 if (ret)
2c0f880a 4704 goto mode1_reset_failed;
5c03e584 4705
47fc644f 4706 amdgpu_device_load_pci_state(adev->pdev);
15c5c5f5
LL
4707 ret = amdgpu_psp_wait_for_bootloader(adev);
4708 if (ret)
2c0f880a 4709 goto mode1_reset_failed;
5c03e584 4710
47fc644f
SS
4711 /* wait for asic to come out of reset */
4712 for (i = 0; i < adev->usec_timeout; i++) {
4713 u32 memsize = adev->nbio.funcs->get_memsize(adev);
5c03e584 4714
47fc644f
SS
4715 if (memsize != 0xffffffff)
4716 break;
4717 udelay(1);
4718 }
5c03e584 4719
2c0f880a
HZ
4720 if (i >= adev->usec_timeout) {
4721 ret = -ETIMEDOUT;
4722 goto mode1_reset_failed;
4723 }
4724
47fc644f 4725 amdgpu_atombios_scratch_regs_engine_hung(adev, false);
15c5c5f5 4726
2c0f880a
HZ
4727 return 0;
4728
4729mode1_reset_failed:
4730 dev_err(adev->dev, "GPU mode1 reset failed\n");
47fc644f 4731 return ret;
5c03e584 4732}
5c6dd71e 4733
e3c1b071 4734int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
04442bf7 4735 struct amdgpu_reset_context *reset_context)
26bc5340 4736{
5c1e6fa4 4737 int i, r = 0;
04442bf7
LL
4738 struct amdgpu_job *job = NULL;
4739 bool need_full_reset =
4740 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4741
4742 if (reset_context->reset_req_dev == adev)
4743 job = reset_context->job;
71182665 4744
b602ca5f
TZ
4745 if (amdgpu_sriov_vf(adev)) {
4746 /* stop the data exchange thread */
4747 amdgpu_virt_fini_data_exchange(adev);
4748 }
4749
9e225fb9
AG
4750 amdgpu_fence_driver_isr_toggle(adev, true);
4751
71182665 4752 /* block all schedulers and reset given job's ring */
0875dc9e
CZ
4753 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4754 struct amdgpu_ring *ring = adev->rings[i];
4755
51687759 4756 if (!ring || !ring->sched.thread)
0875dc9e 4757 continue;
5740682e 4758
b8920e1e
SS
4759 /* Clear job fences from the fence driver to avoid force_completion
4760 * leaving NULL and vm flush fences in the fence driver
4761 */
5c1e6fa4 4762 amdgpu_fence_driver_clear_job_fences(ring);
c530b02f 4763
2f9d4084
ML
4764 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4765 amdgpu_fence_driver_force_completion(ring);
0875dc9e 4766 }
d38ceaf9 4767
9e225fb9
AG
4768 amdgpu_fence_driver_isr_toggle(adev, false);
4769
ff99849b 4770 if (job && job->vm)
222b5f04
AG
4771 drm_sched_increase_karma(&job->base);
4772
04442bf7 4773 r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
404b277b 4774 /* If reset handler not implemented, continue; otherwise return */
b8920e1e 4775 if (r == -EOPNOTSUPP)
404b277b
LL
4776 r = 0;
4777 else
04442bf7
LL
4778 return r;
4779
1d721ed6 4780 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
26bc5340
AG
4781 if (!amdgpu_sriov_vf(adev)) {
4782
4783 if (!need_full_reset)
4784 need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4785
360cd081
LG
4786 if (!need_full_reset && amdgpu_gpu_recovery &&
4787 amdgpu_device_ip_check_soft_reset(adev)) {
26bc5340
AG
4788 amdgpu_device_ip_pre_soft_reset(adev);
4789 r = amdgpu_device_ip_soft_reset(adev);
4790 amdgpu_device_ip_post_soft_reset(adev);
4791 if (r || amdgpu_device_ip_check_soft_reset(adev)) {
aac89168 4792 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
26bc5340
AG
4793 need_full_reset = true;
4794 }
4795 }
4796
4797 if (need_full_reset)
4798 r = amdgpu_device_ip_suspend(adev);
04442bf7
LL
4799 if (need_full_reset)
4800 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4801 else
4802 clear_bit(AMDGPU_NEED_FULL_RESET,
4803 &reset_context->flags);
26bc5340
AG
4804 }
4805
4806 return r;
4807}
4808
15fd09a0
SA
4809static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
4810{
15fd09a0
SA
4811 int i;
4812
38a15ad9 4813 lockdep_assert_held(&adev->reset_domain->sem);
15fd09a0
SA
4814
4815 for (i = 0; i < adev->num_regs; i++) {
651d7ee6
SA
4816 adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]);
4817 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i],
4818 adev->reset_dump_reg_value[i]);
15fd09a0
SA
4819 }
4820
4821 return 0;
4822}
4823
3d8785f6
SA
4824#ifdef CONFIG_DEV_COREDUMP
4825static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
4826 size_t count, void *data, size_t datalen)
4827{
4828 struct drm_printer p;
4829 struct amdgpu_device *adev = data;
4830 struct drm_print_iterator iter;
4831 int i;
4832
4833 iter.data = buffer;
4834 iter.offset = 0;
4835 iter.start = offset;
4836 iter.remain = count;
4837
4838 p = drm_coredump_printer(&iter);
4839
4840 drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
4841 drm_printf(&p, "kernel: " UTS_RELEASE "\n");
4842 drm_printf(&p, "module: " KBUILD_MODNAME "\n");
4843 drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec);
4844 if (adev->reset_task_info.pid)
4845 drm_printf(&p, "process_name: %s PID: %d\n",
4846 adev->reset_task_info.process_name,
4847 adev->reset_task_info.pid);
4848
4849 if (adev->reset_vram_lost)
4850 drm_printf(&p, "VRAM is lost due to GPU reset!\n");
4851 if (adev->num_regs) {
4852 drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n");
4853
4854 for (i = 0; i < adev->num_regs; i++)
4855 drm_printf(&p, "0x%08x: 0x%08x\n",
4856 adev->reset_dump_reg_list[i],
4857 adev->reset_dump_reg_value[i]);
4858 }
4859
4860 return count - iter.remain;
4861}
4862
4863static void amdgpu_devcoredump_free(void *data)
4864{
4865}
4866
4867static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev)
4868{
4869 struct drm_device *dev = adev_to_drm(adev);
4870
4871 ktime_get_ts64(&adev->reset_time);
4872 dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL,
4873 amdgpu_devcoredump_read, amdgpu_devcoredump_free);
4874}
4875#endif
4876
04442bf7
LL
4877int amdgpu_do_asic_reset(struct list_head *device_list_handle,
4878 struct amdgpu_reset_context *reset_context)
26bc5340
AG
4879{
4880 struct amdgpu_device *tmp_adev = NULL;
04442bf7 4881 bool need_full_reset, skip_hw_reset, vram_lost = false;
26bc5340 4882 int r = 0;
f5c7e779 4883 bool gpu_reset_for_dev_remove = 0;
26bc5340 4884
04442bf7
LL
4885 /* Try reset handler method first */
4886 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
4887 reset_list);
15fd09a0 4888 amdgpu_reset_reg_dumps(tmp_adev);
0a83bb35
LL
4889
4890 reset_context->reset_device_list = device_list_handle;
04442bf7 4891 r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
404b277b 4892 /* If reset handler not implemented, continue; otherwise return */
b8920e1e 4893 if (r == -EOPNOTSUPP)
404b277b
LL
4894 r = 0;
4895 else
04442bf7
LL
4896 return r;
4897
4898 /* Reset handler not implemented, use the default method */
4899 need_full_reset =
4900 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4901 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
4902
f5c7e779
YC
4903 gpu_reset_for_dev_remove =
4904 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
4905 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4906
26bc5340 4907 /*
655ce9cb 4908 * ASIC reset has to be done on all XGMI hive nodes ASAP
26bc5340
AG
4909 * to allow proper links negotiation in FW (within 1 sec)
4910 */
7ac71382 4911 if (!skip_hw_reset && need_full_reset) {
655ce9cb 4912 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
041a62bc 4913 /* For XGMI run all resets in parallel to speed up the process */
d4535e2c 4914 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
e3c1b071 4915 tmp_adev->gmc.xgmi.pending_reset = false;
c96cf282 4916 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
d4535e2c
AG
4917 r = -EALREADY;
4918 } else
4919 r = amdgpu_asic_reset(tmp_adev);
d4535e2c 4920
041a62bc 4921 if (r) {
aac89168 4922 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
4a580877 4923 r, adev_to_drm(tmp_adev)->unique);
041a62bc 4924 break;
ce316fa5
LM
4925 }
4926 }
4927
041a62bc
AG
4928 /* For XGMI wait for all resets to complete before proceed */
4929 if (!r) {
655ce9cb 4930 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
ce316fa5
LM
4931 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4932 flush_work(&tmp_adev->xgmi_reset_work);
4933 r = tmp_adev->asic_reset_res;
4934 if (r)
4935 break;
ce316fa5
LM
4936 }
4937 }
4938 }
ce316fa5 4939 }
26bc5340 4940
43c4d576 4941 if (!r && amdgpu_ras_intr_triggered()) {
655ce9cb 4942 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5e67bba3 4943 if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops &&
4944 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
4945 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev);
43c4d576
JC
4946 }
4947
00eaa571 4948 amdgpu_ras_intr_cleared();
43c4d576 4949 }
00eaa571 4950
f5c7e779
YC
4951 /* Since the mode1 reset affects base ip blocks, the
4952 * phase1 ip blocks need to be resumed. Otherwise there
4953 * will be a BIOS signature error and the psp bootloader
4954 * can't load kdb on the next amdgpu install.
4955 */
4956 if (gpu_reset_for_dev_remove) {
4957 list_for_each_entry(tmp_adev, device_list_handle, reset_list)
4958 amdgpu_device_ip_resume_phase1(tmp_adev);
4959
4960 goto end;
4961 }
4962
655ce9cb 4963 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
26bc5340
AG
4964 if (need_full_reset) {
4965 /* post card */
e3c1b071 4966 r = amdgpu_device_asic_init(tmp_adev);
4967 if (r) {
aac89168 4968 dev_warn(tmp_adev->dev, "asic atom init failed!");
e3c1b071 4969 } else {
26bc5340 4970 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
9cec53c1 4971
26bc5340
AG
4972 r = amdgpu_device_ip_resume_phase1(tmp_adev);
4973 if (r)
4974 goto out;
4975
4976 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
3d8785f6
SA
4977#ifdef CONFIG_DEV_COREDUMP
4978 tmp_adev->reset_vram_lost = vram_lost;
4979 memset(&tmp_adev->reset_task_info, 0,
4980 sizeof(tmp_adev->reset_task_info));
4981 if (reset_context->job && reset_context->job->vm)
4982 tmp_adev->reset_task_info =
4983 reset_context->job->vm->task_info;
4984 amdgpu_reset_capture_coredumpm(tmp_adev);
4985#endif
26bc5340 4986 if (vram_lost) {
77e7f829 4987 DRM_INFO("VRAM is lost due to GPU reset!\n");
e3526257 4988 amdgpu_inc_vram_lost(tmp_adev);
26bc5340
AG
4989 }
4990
26bc5340
AG
4991 r = amdgpu_device_fw_loading(tmp_adev);
4992 if (r)
4993 return r;
4994
4995 r = amdgpu_device_ip_resume_phase2(tmp_adev);
4996 if (r)
4997 goto out;
4998
4999 if (vram_lost)
5000 amdgpu_device_fill_reset_magic(tmp_adev);
5001
fdafb359
EQ
5002 /*
5003			 * Add this ASIC back as tracked since the reset already
5004			 * completed successfully.
5005 */
5006 amdgpu_register_gpu_instance(tmp_adev);
5007
04442bf7
LL
5008 if (!reset_context->hive &&
5009 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
e3c1b071 5010 amdgpu_xgmi_add_device(tmp_adev);
5011
7c04ca50 5012 r = amdgpu_device_ip_late_init(tmp_adev);
5013 if (r)
5014 goto out;
5015
087451f3 5016 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false);
565d1941 5017
e8fbaf03
GC
5018 /*
5019			 * The GPU enters a bad state once the number of
5020			 * faulty pages flagged by ECC reaches the threshold,
5021			 * and RAS recovery is scheduled next. So add one
5022			 * check here to break recovery if it indeed exceeds
5023			 * the bad page threshold, and remind the user to
5024			 * retire this GPU or set a bigger bad_page_threshold
5025			 * value to fix this when probing the driver
5026			 * again.
5027 */
11003c68 5028 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
e8fbaf03
GC
5029 /* must succeed. */
5030 amdgpu_ras_resume(tmp_adev);
5031 } else {
5032 r = -EINVAL;
5033 goto out;
5034 }
e79a04d5 5035
26bc5340 5036 /* Update PSP FW topology after reset */
04442bf7
LL
5037 if (reset_context->hive &&
5038 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5039 r = amdgpu_xgmi_update_topology(
5040 reset_context->hive, tmp_adev);
26bc5340
AG
5041 }
5042 }
5043
26bc5340
AG
5044out:
5045 if (!r) {
5046 amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
5047 r = amdgpu_ib_ring_tests(tmp_adev);
5048 if (r) {
5049 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
26bc5340
AG
5050 need_full_reset = true;
5051 r = -EAGAIN;
5052 goto end;
5053 }
5054 }
5055
5056 if (!r)
5057 r = amdgpu_device_recover_vram(tmp_adev);
5058 else
5059 tmp_adev->asic_reset_res = r;
5060 }
5061
5062end:
04442bf7
LL
5063 if (need_full_reset)
5064 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5065 else
5066 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
26bc5340
AG
5067 return r;
5068}
5069
e923be99 5070static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
26bc5340 5071{
5740682e 5072
a3a09142
AD
5073 switch (amdgpu_asic_reset_method(adev)) {
5074 case AMD_RESET_METHOD_MODE1:
5075 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
5076 break;
5077 case AMD_RESET_METHOD_MODE2:
5078 adev->mp1_state = PP_MP1_STATE_RESET;
5079 break;
5080 default:
5081 adev->mp1_state = PP_MP1_STATE_NONE;
5082 break;
5083 }
26bc5340 5084}
d38ceaf9 5085
e923be99 5086static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
26bc5340 5087{
89041940 5088 amdgpu_vf_error_trans_all(adev);
a3a09142 5089 adev->mp1_state = PP_MP1_STATE_NONE;
91fb309d
HC
5090}
5091
3f12acc8
EQ
5092static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
5093{
5094 struct pci_dev *p = NULL;
5095
5096 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5097 adev->pdev->bus->number, 1);
5098 if (p) {
5099 pm_runtime_enable(&(p->dev));
5100 pm_runtime_resume(&(p->dev));
5101 }
b85e285e
YY
5102
5103 pci_dev_put(p);
3f12acc8
EQ
5104}
5105
5106static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
5107{
5108 enum amd_reset_method reset_method;
5109 struct pci_dev *p = NULL;
5110 u64 expires;
5111
5112 /*
5113	 * For now, only BACO and mode1 reset are confirmed
5114	 * to suffer the audio issue without being properly suspended.
5115 */
5116 reset_method = amdgpu_asic_reset_method(adev);
5117 if ((reset_method != AMD_RESET_METHOD_BACO) &&
5118 (reset_method != AMD_RESET_METHOD_MODE1))
5119 return -EINVAL;
5120
5121 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5122 adev->pdev->bus->number, 1);
5123 if (!p)
5124 return -ENODEV;
5125
5126 expires = pm_runtime_autosuspend_expiration(&(p->dev));
5127 if (!expires)
5128 /*
5129		 * If we cannot get the audio device autosuspend delay,
5130		 * a fixed 4S interval will be used. Considering that 3S is
5131		 * the audio controller's default autosuspend delay setting,
5132		 * the 4S used here is guaranteed to cover it.
5133 */
54b7feb9 5134 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
3f12acc8
EQ
5135
5136 while (!pm_runtime_status_suspended(&(p->dev))) {
5137 if (!pm_runtime_suspend(&(p->dev)))
5138 break;
5139
5140 if (expires < ktime_get_mono_fast_ns()) {
5141 dev_warn(adev->dev, "failed to suspend display audio\n");
b85e285e 5142 pci_dev_put(p);
3f12acc8
EQ
5143 /* TODO: abort the succeeding gpu reset? */
5144 return -ETIMEDOUT;
5145 }
5146 }
5147
5148 pm_runtime_disable(&(p->dev));
5149
b85e285e 5150 pci_dev_put(p);
3f12acc8
EQ
5151 return 0;
5152}
5153
d193b12b 5154static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
247c7b0d
AG
5155{
5156 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
5157
5158#if defined(CONFIG_DEBUG_FS)
5159 if (!amdgpu_sriov_vf(adev))
5160 cancel_work(&adev->reset_work);
5161#endif
5162
5163 if (adev->kfd.dev)
5164 cancel_work(&adev->kfd.reset_work);
5165
5166 if (amdgpu_sriov_vf(adev))
5167 cancel_work(&adev->virt.flr_work);
5168
5169 if (con && adev->ras_enabled)
5170 cancel_work(&con->recovery_work);
5171
5172}
5173
26bc5340 5174/**
6e9c65f7 5175 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
26bc5340 5176 *
982a820b 5177 * @adev: amdgpu_device pointer
26bc5340 5178 * @job: which job trigger hang
80bd2de1 5179 * @reset_context: amdgpu reset context pointer
26bc5340
AG
5180 *
5181 * Attempt to reset the GPU if it has hung (all asics).
5182 * Attempt to do soft-reset or full-reset and reinitialize the ASIC.
5183 * Returns 0 for success or an error on failure.
5184 */
5185
cf727044 5186int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
f1549c09
LG
5187 struct amdgpu_job *job,
5188 struct amdgpu_reset_context *reset_context)
26bc5340 5189{
1d721ed6 5190 struct list_head device_list, *device_list_handle = NULL;
7dd8c205 5191 bool job_signaled = false;
26bc5340 5192 struct amdgpu_hive_info *hive = NULL;
26bc5340 5193 struct amdgpu_device *tmp_adev = NULL;
1d721ed6 5194 int i, r = 0;
bb5c7235 5195 bool need_emergency_restart = false;
3f12acc8 5196 bool audio_suspended = false;
f5c7e779
YC
5197 bool gpu_reset_for_dev_remove = false;
5198
5199 gpu_reset_for_dev_remove =
5200 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
5201 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
26bc5340 5202
6e3cd2a9 5203 /*
bb5c7235
WS
5204 * Special case: RAS triggered and full reset isn't supported
5205 */
5206 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
5207
d5ea093e
AG
5208 /*
5209 * Flush RAM to disk so that after reboot
5210 * the user can read log and see why the system rebooted.
5211 */
bb5c7235 5212 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
d5ea093e
AG
5213 DRM_WARN("Emergency reboot.");
5214
5215 ksys_sync_helper();
5216 emergency_restart();
5217 }
5218
b823821f 5219 dev_info(adev->dev, "GPU %s begin!\n",
bb5c7235 5220 need_emergency_restart ? "jobs stop":"reset");
26bc5340 5221
175ac6ec
ZL
5222 if (!amdgpu_sriov_vf(adev))
5223 hive = amdgpu_get_xgmi_hive(adev);
681260df 5224 if (hive)
53b3f8f4 5225 mutex_lock(&hive->hive_lock);
26bc5340 5226
f1549c09
LG
5227 reset_context->job = job;
5228 reset_context->hive = hive;
9e94d22c
EQ
5229 /*
5230 * Build list of devices to reset.
5231 * In case we are in XGMI hive mode, resort the device list
5232 * to put adev in the 1st position.
5233 */
5234 INIT_LIST_HEAD(&device_list);
175ac6ec 5235 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
83d29a5f 5236 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
655ce9cb 5237 list_add_tail(&tmp_adev->reset_list, &device_list);
83d29a5f
YC
5238 if (gpu_reset_for_dev_remove && adev->shutdown)
5239 tmp_adev->shutdown = true;
5240 }
655ce9cb 5241 if (!list_is_first(&adev->reset_list, &device_list))
5242 list_rotate_to_front(&adev->reset_list, &device_list);
5243 device_list_handle = &device_list;
26bc5340 5244 } else {
655ce9cb 5245 list_add_tail(&adev->reset_list, &device_list);
26bc5340
AG
5246 device_list_handle = &device_list;
5247 }
5248
e923be99
AG
5249 /* We need to lock reset domain only once both for XGMI and single device */
5250 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5251 reset_list);
3675c2f2 5252 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
e923be99 5253
1d721ed6 5254 /* block all schedulers and reset given job's ring */
655ce9cb 5255 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
f287a3c5 5256
e923be99 5257 amdgpu_device_set_mp1_state(tmp_adev);
f287a3c5 5258
3f12acc8
EQ
5259 /*
5260		 * Try to put the audio codec into suspend state
5261		 * before the GPU reset starts.
5262		 *
5263		 * The power domain of the graphics device is
5264		 * shared with the AZ power domain. Without this,
5265		 * we may change the audio hardware from behind
5266		 * the audio driver's back. That will trigger
5267		 * some audio codec errors.
5268 */
5269 if (!amdgpu_device_suspend_display_audio(tmp_adev))
5270 audio_suspended = true;
5271
9e94d22c
EQ
5272 amdgpu_ras_set_error_query_ready(tmp_adev, false);
5273
52fb44cf
EQ
5274 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
5275
c004d44e 5276 if (!amdgpu_sriov_vf(tmp_adev))
428890a3 5277 amdgpu_amdkfd_pre_reset(tmp_adev);
9e94d22c 5278
12ffa55d
AG
5279 /*
5280		 * Mark these ASICs to be reset as untracked first,
5281		 * and add them back after the reset completes
5282 */
5283 amdgpu_unregister_gpu_instance(tmp_adev);
5284
163d4cd2 5285 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true);
565d1941 5286
f1c1314b 5287 /* disable ras on ALL IPs */
bb5c7235 5288 if (!need_emergency_restart &&
b823821f 5289 amdgpu_device_ip_need_full_reset(tmp_adev))
f1c1314b 5290 amdgpu_ras_suspend(tmp_adev);
5291
1d721ed6
AG
5292 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5293 struct amdgpu_ring *ring = tmp_adev->rings[i];
5294
5295 if (!ring || !ring->sched.thread)
5296 continue;
5297
0b2d2c2e 5298 drm_sched_stop(&ring->sched, job ? &job->base : NULL);
7c6e68c7 5299
bb5c7235 5300 if (need_emergency_restart)
7c6e68c7 5301 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
1d721ed6 5302 }
8f8c80f4 5303 atomic_inc(&tmp_adev->gpu_reset_counter);
1d721ed6
AG
5304 }
5305
bb5c7235 5306 if (need_emergency_restart)
7c6e68c7
AG
5307 goto skip_sched_resume;
5308
1d721ed6
AG
5309 /*
5310	 * Must check whether the guilty job has already signaled here, since
5311	 * after this point all old HW fences are force-signaled.
5312	 *
5313	 * job->base holds a reference to the parent fence
5314 */
f6a3f660 5315 if (job && dma_fence_is_signaled(&job->hw_fence)) {
1d721ed6 5316 job_signaled = true;
1d721ed6
AG
5317 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
5318 goto skip_hw_reset;
5319 }
5320
26bc5340 5321retry: /* Rest of adevs pre asic reset from XGMI hive. */
655ce9cb 5322 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
f5c7e779
YC
5323 if (gpu_reset_for_dev_remove) {
5324			/* Workaround for ASICs that need to disable SMC first */
5325 amdgpu_device_smu_fini_early(tmp_adev);
5326 }
f1549c09 5327 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
26bc5340
AG
5328		/* TODO: Should we stop? */
5329 if (r) {
aac89168 5330 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
4a580877 5331 r, adev_to_drm(tmp_adev)->unique);
26bc5340
AG
5332 tmp_adev->asic_reset_res = r;
5333 }
247c7b0d
AG
5334
5335 /*
5336 * Drop all pending non scheduler resets. Scheduler resets
5337 * were already dropped during drm_sched_stop
5338 */
d193b12b 5339 amdgpu_device_stop_pending_resets(tmp_adev);
26bc5340
AG
5340 }
5341
5342 /* Actual ASIC resets if needed.*/
4f30d920 5343 /* Host driver will handle XGMI hive reset for SRIOV */
26bc5340
AG
5344 if (amdgpu_sriov_vf(adev)) {
5345 r = amdgpu_device_reset_sriov(adev, job ? false : true);
5346 if (r)
5347 adev->asic_reset_res = r;
950d6425 5348
28606c4e
YC
5349		/* Aldebaran and gfx_11_0_3 support RAS in SRIOV, so RAS needs to be resumed during reset */
5350 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2) ||
5351 adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3))
950d6425 5352 amdgpu_ras_resume(adev);
26bc5340 5353 } else {
f1549c09 5354 r = amdgpu_do_asic_reset(device_list_handle, reset_context);
b98a1648 5355 if (r && r == -EAGAIN)
26bc5340 5356 goto retry;
f5c7e779
YC
5357
5358 if (!r && gpu_reset_for_dev_remove)
5359 goto recover_end;
26bc5340
AG
5360 }
5361
1d721ed6
AG
5362skip_hw_reset:
5363
26bc5340 5364	/* Post ASIC reset for all devs. */
655ce9cb 5365 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
7c6e68c7 5366
1d721ed6
AG
5367 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5368 struct amdgpu_ring *ring = tmp_adev->rings[i];
5369
5370 if (!ring || !ring->sched.thread)
5371 continue;
5372
6868a2c4 5373 drm_sched_start(&ring->sched, true);
1d721ed6
AG
5374 }
5375
693073a0 5376 if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3))
ed67f729
JX
5377 amdgpu_mes_self_test(tmp_adev);
5378
b8920e1e 5379 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
4a580877 5380 drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
1d721ed6 5381
7258fa31
SK
5382 if (tmp_adev->asic_reset_res)
5383 r = tmp_adev->asic_reset_res;
5384
1d721ed6 5385 tmp_adev->asic_reset_res = 0;
26bc5340
AG
5386
5387 if (r) {
5388 /* bad news, how to tell it to userspace ? */
12ffa55d 5389 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
26bc5340
AG
5390 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
5391 } else {
12ffa55d 5392 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
3fa8f89d
S
5393 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
5394 DRM_WARN("smart shift update failed\n");
26bc5340 5395 }
7c6e68c7 5396 }
26bc5340 5397
7c6e68c7 5398skip_sched_resume:
655ce9cb 5399 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
428890a3 5400 /* unlock kfd: SRIOV would do it separately */
c004d44e 5401 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
428890a3 5402 amdgpu_amdkfd_post_reset(tmp_adev);
8e2712e7 5403
5404		/* kfd_post_reset will do nothing if the kfd device is not initialized,
5405		 * so bring up kfd here if it was not initialized before
5406 */
5407 if (!adev->kfd.init_complete)
5408 amdgpu_amdkfd_device_init(adev);
5409
3f12acc8
EQ
5410 if (audio_suspended)
5411 amdgpu_device_resume_display_audio(tmp_adev);
e923be99
AG
5412
5413 amdgpu_device_unset_mp1_state(tmp_adev);
d293470e
YC
5414
5415 amdgpu_ras_set_error_query_ready(tmp_adev, true);
26bc5340
AG
5416 }
5417
f5c7e779 5418recover_end:
e923be99
AG
5419 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5420 reset_list);
5421 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
5422
9e94d22c 5423 if (hive) {
9e94d22c 5424 mutex_unlock(&hive->hive_lock);
d95e8e97 5425 amdgpu_put_xgmi_hive(hive);
9e94d22c 5426 }
26bc5340 5427
f287a3c5 5428 if (r)
26bc5340 5429 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
ab9a0b1f
AG
5430
5431 atomic_set(&adev->reset_domain->reset_res, r);
d38ceaf9
AD
5432 return r;
5433}
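
A minimal sketch of how a caller might drive this entry point, assuming a hypothetical example_handle_hang() wrapper; the reset-context fields mirror the ones filled in by amdgpu_pci_slot_reset() later in this file.

/* Illustrative only: example_handle_hang() is not part of the driver. */
static int example_handle_hang(struct amdgpu_device *adev,
			       struct amdgpu_job *job)
{
	struct amdgpu_reset_context reset_context;

	memset(&reset_context, 0, sizeof(reset_context));
	reset_context.method = AMD_RESET_METHOD_NONE;	/* let the ASIC choose */
	reset_context.reset_req_dev = adev;
	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);

	return amdgpu_device_gpu_recover(adev, job, &reset_context);
}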
5434
e3ecdffa
AD
5435/**
5436 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
5437 *
5438 * @adev: amdgpu_device pointer
5439 *
5440 * Fetches and stores in the driver the PCIE capabilities (gen speed
5441 * and lanes) of the slot the device is in. Handles APUs and
5442 * virtualized environments where PCIE config space may not be available.
5443 */
5494d864 5444static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
d0dd7f0c 5445{
5d9a6330 5446 struct pci_dev *pdev;
c5313457
HK
5447 enum pci_bus_speed speed_cap, platform_speed_cap;
5448 enum pcie_link_width platform_link_width;
d0dd7f0c 5449
cd474ba0
AD
5450 if (amdgpu_pcie_gen_cap)
5451 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
d0dd7f0c 5452
cd474ba0
AD
5453 if (amdgpu_pcie_lane_cap)
5454 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
d0dd7f0c 5455
cd474ba0 5456 /* covers APUs as well */
04e85958 5457 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) {
cd474ba0
AD
5458 if (adev->pm.pcie_gen_mask == 0)
5459 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
5460 if (adev->pm.pcie_mlw_mask == 0)
5461 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
d0dd7f0c 5462 return;
cd474ba0 5463 }
d0dd7f0c 5464
c5313457
HK
5465 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
5466 return;
5467
dbaa922b
AD
5468 pcie_bandwidth_available(adev->pdev, NULL,
5469 &platform_speed_cap, &platform_link_width);
c5313457 5470
cd474ba0 5471 if (adev->pm.pcie_gen_mask == 0) {
5d9a6330
AD
5472 /* asic caps */
5473 pdev = adev->pdev;
5474 speed_cap = pcie_get_speed_cap(pdev);
5475 if (speed_cap == PCI_SPEED_UNKNOWN) {
5476 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
cd474ba0
AD
5477 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5478 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
cd474ba0 5479 } else {
2b3a1f51
FX
5480 if (speed_cap == PCIE_SPEED_32_0GT)
5481 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5482 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5483 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5484 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5485 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
5486 else if (speed_cap == PCIE_SPEED_16_0GT)
5d9a6330
AD
5487 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5488 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5489 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5490 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
5491 else if (speed_cap == PCIE_SPEED_8_0GT)
5492 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5493 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5494 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5495 else if (speed_cap == PCIE_SPEED_5_0GT)
5496 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5497 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
5498 else
5499 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
5500 }
5501 /* platform caps */
c5313457 5502 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
5d9a6330
AD
5503 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5504 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5505 } else {
2b3a1f51
FX
5506 if (platform_speed_cap == PCIE_SPEED_32_0GT)
5507 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5508 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5509 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5510 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5511 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
5512 else if (platform_speed_cap == PCIE_SPEED_16_0GT)
5d9a6330
AD
5513 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5514 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5515 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5516 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
c5313457 5517 else if (platform_speed_cap == PCIE_SPEED_8_0GT)
5d9a6330
AD
5518 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5519 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5520 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
c5313457 5521 else if (platform_speed_cap == PCIE_SPEED_5_0GT)
5d9a6330
AD
5522 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5523 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5524 else
5525 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
5526
cd474ba0
AD
5527 }
5528 }
5529 if (adev->pm.pcie_mlw_mask == 0) {
c5313457 5530 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
5d9a6330
AD
5531 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
5532 } else {
c5313457 5533 switch (platform_link_width) {
5d9a6330 5534 case PCIE_LNK_X32:
cd474ba0
AD
5535 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
5536 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5537 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5538 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5539 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5540 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5541 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5542 break;
5d9a6330 5543 case PCIE_LNK_X16:
cd474ba0
AD
5544 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5545 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5546 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5547 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5548 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5549 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5550 break;
5d9a6330 5551 case PCIE_LNK_X12:
cd474ba0
AD
5552 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5553 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5554 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5555 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5556 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5557 break;
5d9a6330 5558 case PCIE_LNK_X8:
cd474ba0
AD
5559 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5560 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5561 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5562 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5563 break;
5d9a6330 5564 case PCIE_LNK_X4:
cd474ba0
AD
5565 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5566 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5567 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5568 break;
5d9a6330 5569 case PCIE_LNK_X2:
cd474ba0
AD
5570 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5571 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5572 break;
5d9a6330 5573 case PCIE_LNK_X1:
cd474ba0
AD
5574 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
5575 break;
5576 default:
5577 break;
5578 }
d0dd7f0c
AD
5579 }
5580 }
5581}
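
As a hedged illustration of how the masks built above could be consumed, the helper below walks the CAIL_* speed bits from fastest to slowest and reports the highest generation supported by both the ASIC and the platform; example_max_common_pcie_gen() is a sketch, not existing driver code.

/* Illustrative only: not part of the driver. */
static int example_max_common_pcie_gen(struct amdgpu_device *adev)
{
	u32 mask = adev->pm.pcie_gen_mask;

	/* Require both the ASIC cap and the platform cap for a given gen. */
	if ((mask & CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5) &&
	    (mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5))
		return 5;
	if ((mask & CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4) &&
	    (mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4))
		return 4;
	if ((mask & CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3) &&
	    (mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3))
		return 3;
	if ((mask & CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2) &&
	    (mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2))
		return 2;
	return 1;
}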
d38ceaf9 5582
08a2fd23
RE
5583/**
5584 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
5585 *
5586 * @adev: amdgpu_device pointer
5587 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
5588 *
5589 * Return true if @peer_adev can access (DMA) @adev through the PCIe
5590 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
5591 * @peer_adev.
5592 */
5593bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
5594 struct amdgpu_device *peer_adev)
5595{
5596#ifdef CONFIG_HSA_AMD_P2P
5597 uint64_t address_mask = peer_adev->dev->dma_mask ?
5598 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
5599 resource_size_t aper_limit =
5600 adev->gmc.aper_base + adev->gmc.aper_size - 1;
bb66ecbf
LL
5601 bool p2p_access =
5602 !adev->gmc.xgmi.connected_to_cpu &&
5603 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);
08a2fd23
RE
5604
5605 return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
5606 adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
5607 !(adev->gmc.aper_base & address_mask ||
5608 aper_limit & address_mask));
5609#else
5610 return false;
5611#endif
5612}
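
A small usage sketch, assuming a hypothetical example_can_enable_p2p() helper: since BAR visibility is asymmetric, a caller would typically require the check to pass in both directions before setting up peer mappings.

/* Illustrative only: not part of the driver. */
static bool example_can_enable_p2p(struct amdgpu_device *a,
				   struct amdgpu_device *b)
{
	/* Each device must be able to reach the other's visible VRAM. */
	return amdgpu_device_is_peer_accessible(a, b) &&
	       amdgpu_device_is_peer_accessible(b, a);
}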
5613
361dbd01
AD
5614int amdgpu_device_baco_enter(struct drm_device *dev)
5615{
1348969a 5616 struct amdgpu_device *adev = drm_to_adev(dev);
7a22677b 5617 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
361dbd01 5618
6ab68650 5619 if (!amdgpu_device_supports_baco(dev))
361dbd01
AD
5620 return -ENOTSUPP;
5621
8ab0d6f0 5622 if (ras && adev->ras_enabled &&
acdae216 5623 adev->nbio.funcs->enable_doorbell_interrupt)
7a22677b
LM
5624 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
5625
9530273e 5626 return amdgpu_dpm_baco_enter(adev);
361dbd01
AD
5627}
5628
5629int amdgpu_device_baco_exit(struct drm_device *dev)
5630{
1348969a 5631 struct amdgpu_device *adev = drm_to_adev(dev);
7a22677b 5632 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
9530273e 5633 int ret = 0;
361dbd01 5634
6ab68650 5635 if (!amdgpu_device_supports_baco(dev))
361dbd01
AD
5636 return -ENOTSUPP;
5637
9530273e
EQ
5638 ret = amdgpu_dpm_baco_exit(adev);
5639 if (ret)
5640 return ret;
7a22677b 5641
8ab0d6f0 5642 if (ras && adev->ras_enabled &&
acdae216 5643 adev->nbio.funcs->enable_doorbell_interrupt)
7a22677b
LM
5644 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
5645
1bece222
CL
5646 if (amdgpu_passthrough(adev) &&
5647 adev->nbio.funcs->clear_doorbell_interrupt)
5648 adev->nbio.funcs->clear_doorbell_interrupt(adev);
5649
7a22677b 5650 return 0;
361dbd01 5651}
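
The two BACO helpers are meant to be used as a matched pair; the sketch below shows the enter/exit pairing a runtime-PM style caller might use (example_baco_cycle() is a hypothetical name, not driver code).

/* Illustrative only: not part of the driver. */
static int example_baco_cycle(struct drm_device *dev)
{
	int r;

	r = amdgpu_device_baco_enter(dev);
	if (r)
		return r;

	/* ... device sits in BACO until it needs to be used again ... */

	return amdgpu_device_baco_exit(dev);
}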
c9a6b82f
AG
5652
5653/**
5654 * amdgpu_pci_error_detected - Called when a PCI error is detected.
5655 * @pdev: PCI device struct
5656 * @state: PCI channel state
5657 *
5658 * Description: Called when a PCI error is detected.
5659 *
5660 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
5661 */
5662pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
5663{
5664 struct drm_device *dev = pci_get_drvdata(pdev);
5665 struct amdgpu_device *adev = drm_to_adev(dev);
acd89fca 5666 int i;
c9a6b82f
AG
5667
5668 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
5669
6894305c
AG
5670 if (adev->gmc.xgmi.num_physical_nodes > 1) {
5671 DRM_WARN("No support for XGMI hive yet...");
5672 return PCI_ERS_RESULT_DISCONNECT;
5673 }
5674
e17e27f9
GC
5675 adev->pci_channel_state = state;
5676
c9a6b82f
AG
5677 switch (state) {
5678 case pci_channel_io_normal:
5679 return PCI_ERS_RESULT_CAN_RECOVER;
acd89fca 5680 /* Fatal error, prepare for slot reset */
8a11d283
TZ
5681 case pci_channel_io_frozen:
5682 /*
d0fb18b5 5683 * Locking adev->reset_domain->sem will prevent any external access
acd89fca
AG
5684 * to GPU during PCI error recovery
5685 */
3675c2f2 5686 amdgpu_device_lock_reset_domain(adev->reset_domain);
e923be99 5687 amdgpu_device_set_mp1_state(adev);
acd89fca
AG
5688
5689 /*
5690 * Block any work scheduling as we do for regular GPU reset
5691 * for the duration of the recovery
5692 */
5693 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5694 struct amdgpu_ring *ring = adev->rings[i];
5695
5696 if (!ring || !ring->sched.thread)
5697 continue;
5698
5699 drm_sched_stop(&ring->sched, NULL);
5700 }
8f8c80f4 5701 atomic_inc(&adev->gpu_reset_counter);
c9a6b82f
AG
5702 return PCI_ERS_RESULT_NEED_RESET;
5703 case pci_channel_io_perm_failure:
5704 /* Permanent error, prepare for device removal */
5705 return PCI_ERS_RESULT_DISCONNECT;
5706 }
5707
5708 return PCI_ERS_RESULT_NEED_RESET;
5709}
5710
5711/**
5712 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
5713 * @pdev: pointer to PCI device
5714 */
5715pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
5716{
5717
5718 DRM_INFO("PCI error: mmio enabled callback!!\n");
5719
5720 /* TODO - dump whatever for debugging purposes */
5721
5722	/* This is called only if amdgpu_pci_error_detected returns
5723 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
5724 * works, no need to reset slot.
5725 */
5726
5727 return PCI_ERS_RESULT_RECOVERED;
5728}
5729
5730/**
5731 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
5732 * @pdev: PCI device struct
5733 *
5734 * Description: This routine is called by the pci error recovery
5735 * code after the PCI slot has been reset, just before we
5736 * should resume normal operations.
5737 */
5738pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
5739{
5740 struct drm_device *dev = pci_get_drvdata(pdev);
5741 struct amdgpu_device *adev = drm_to_adev(dev);
362c7b91 5742 int r, i;
04442bf7 5743 struct amdgpu_reset_context reset_context;
362c7b91 5744 u32 memsize;
7ac71382 5745 struct list_head device_list;
c9a6b82f
AG
5746
5747 DRM_INFO("PCI error: slot reset callback!!\n");
5748
04442bf7
LL
5749 memset(&reset_context, 0, sizeof(reset_context));
5750
7ac71382 5751 INIT_LIST_HEAD(&device_list);
655ce9cb 5752 list_add_tail(&adev->reset_list, &device_list);
7ac71382 5753
362c7b91
AG
5754 /* wait for asic to come out of reset */
5755 msleep(500);
5756
7ac71382 5757 /* Restore PCI confspace */
c1dd4aa6 5758 amdgpu_device_load_pci_state(pdev);
c9a6b82f 5759
362c7b91
AG
5760 /* confirm ASIC came out of reset */
5761 for (i = 0; i < adev->usec_timeout; i++) {
5762 memsize = amdgpu_asic_get_config_memsize(adev);
5763
5764 if (memsize != 0xffffffff)
5765 break;
5766 udelay(1);
5767 }
5768 if (memsize == 0xffffffff) {
5769 r = -ETIME;
5770 goto out;
5771 }
5772
04442bf7
LL
5773 reset_context.method = AMD_RESET_METHOD_NONE;
5774 reset_context.reset_req_dev = adev;
5775 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
5776 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
5777
7afefb81 5778 adev->no_hw_access = true;
04442bf7 5779 r = amdgpu_device_pre_asic_reset(adev, &reset_context);
7afefb81 5780 adev->no_hw_access = false;
c9a6b82f
AG
5781 if (r)
5782 goto out;
5783
04442bf7 5784 r = amdgpu_do_asic_reset(&device_list, &reset_context);
c9a6b82f
AG
5785
5786out:
c9a6b82f 5787 if (!r) {
c1dd4aa6
AG
5788 if (amdgpu_device_cache_pci_state(adev->pdev))
5789 pci_restore_state(adev->pdev);
5790
c9a6b82f
AG
5791 DRM_INFO("PCIe error recovery succeeded\n");
5792 } else {
5793 DRM_ERROR("PCIe error recovery failed, err:%d", r);
e923be99
AG
5794 amdgpu_device_unset_mp1_state(adev);
5795 amdgpu_device_unlock_reset_domain(adev->reset_domain);
c9a6b82f
AG
5796 }
5797
5798 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5799}
5800
5801/**
5802 * amdgpu_pci_resume() - resume normal ops after PCI reset
5803 * @pdev: pointer to PCI device
5804 *
5805 * Called when the error recovery driver tells us that it's
505199a3 5806 * OK to resume normal operation.
c9a6b82f
AG
5807 */
5808void amdgpu_pci_resume(struct pci_dev *pdev)
5809{
5810 struct drm_device *dev = pci_get_drvdata(pdev);
5811 struct amdgpu_device *adev = drm_to_adev(dev);
acd89fca 5812 int i;
c9a6b82f 5813
c9a6b82f
AG
5814
5815 DRM_INFO("PCI error: resume callback!!\n");
acd89fca 5816
e17e27f9
GC
5817 /* Only continue execution for the case of pci_channel_io_frozen */
5818 if (adev->pci_channel_state != pci_channel_io_frozen)
5819 return;
5820
acd89fca
AG
5821 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5822 struct amdgpu_ring *ring = adev->rings[i];
5823
5824 if (!ring || !ring->sched.thread)
5825 continue;
5826
acd89fca
AG
5827 drm_sched_start(&ring->sched, true);
5828 }
5829
e923be99
AG
5830 amdgpu_device_unset_mp1_state(adev);
5831 amdgpu_device_unlock_reset_domain(adev->reset_domain);
c9a6b82f 5832}
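
For context, the four PCI error callbacks above are consumed by the PCI core through a struct pci_error_handlers table; the sketch below shows plausible wiring. The real table lives elsewhere in the driver, and example_amdgpu_pci_err_handler is a hypothetical name.

/* Illustrative only: the actual table is defined in the driver's pci_driver. */
static const struct pci_error_handlers example_amdgpu_pci_err_handler = {
	.error_detected	= amdgpu_pci_error_detected,
	.mmio_enabled	= amdgpu_pci_mmio_enabled,
	.slot_reset	= amdgpu_pci_slot_reset,
	.resume		= amdgpu_pci_resume,
};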
c1dd4aa6
AG
5833
5834bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5835{
5836 struct drm_device *dev = pci_get_drvdata(pdev);
5837 struct amdgpu_device *adev = drm_to_adev(dev);
5838 int r;
5839
5840 r = pci_save_state(pdev);
5841 if (!r) {
5842 kfree(adev->pci_state);
5843
5844 adev->pci_state = pci_store_saved_state(pdev);
5845
5846 if (!adev->pci_state) {
5847 DRM_ERROR("Failed to store PCI saved state");
5848 return false;
5849 }
5850 } else {
5851 DRM_WARN("Failed to save PCI state, err:%d\n", r);
5852 return false;
5853 }
5854
5855 return true;
5856}
5857
5858bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
5859{
5860 struct drm_device *dev = pci_get_drvdata(pdev);
5861 struct amdgpu_device *adev = drm_to_adev(dev);
5862 int r;
5863
5864 if (!adev->pci_state)
5865 return false;
5866
5867 r = pci_load_saved_state(pdev, adev->pci_state);
5868
5869 if (!r) {
5870 pci_restore_state(pdev);
5871 } else {
5872 DRM_WARN("Failed to load PCI state, err:%d\n", r);
5873 return false;
5874 }
5875
5876 return true;
5877}
5878
810085dd
EH
5879void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
5880 struct amdgpu_ring *ring)
5881{
5882#ifdef CONFIG_X86_64
b818a5d3 5883 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
810085dd
EH
5884 return;
5885#endif
5886 if (adev->gmc.xgmi.connected_to_cpu)
5887 return;
5888
5889 if (ring && ring->funcs->emit_hdp_flush)
5890 amdgpu_ring_emit_hdp_flush(ring);
5891 else
5892 amdgpu_asic_flush_hdp(adev, ring);
5893}
c1dd4aa6 5894
810085dd
EH
5895void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
5896 struct amdgpu_ring *ring)
5897{
5898#ifdef CONFIG_X86_64
b818a5d3 5899 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
810085dd
EH
5900 return;
5901#endif
5902 if (adev->gmc.xgmi.connected_to_cpu)
5903 return;
c1dd4aa6 5904
810085dd
EH
5905 amdgpu_asic_invalidate_hdp(adev, ring);
5906}
34f3a4a9 5907
89a7a870
AG
5908int amdgpu_in_reset(struct amdgpu_device *adev)
5909{
5910 return atomic_read(&adev->reset_domain->in_gpu_reset);
53a17b6b
TZ
5911}
5912
34f3a4a9
LY
5913/**
5914 * amdgpu_device_halt() - bring hardware to some kind of halt state
5915 *
5916 * @adev: amdgpu_device pointer
5917 *
5918 * Bring hardware to some kind of halt state so that no one can touch it
5919 * any more. It helps to maintain the error context when an error occurs.
5920 * Compared to a simple hang, the system will stay stable at least for SSH
5921 * access. Then it should be trivial to inspect the hardware state and
5922 * see what's going on. Implemented as follows:
5923 *
5924 * 1. drm_dev_unplug() makes device inaccessible to user space(IOCTLs, etc),
5925 * clears all CPU mappings to device, disallows remappings through page faults
5926 * 2. amdgpu_irq_disable_all() disables all interrupts
5927 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
5928 * 4. set adev->no_hw_access to avoid potential crashes after step 5
5929 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
5930 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
5931 * flush any in flight DMA operations
5932 */
5933void amdgpu_device_halt(struct amdgpu_device *adev)
5934{
5935 struct pci_dev *pdev = adev->pdev;
e0f943b4 5936 struct drm_device *ddev = adev_to_drm(adev);
34f3a4a9 5937
2c1c7ba4 5938 amdgpu_xcp_dev_unplug(adev);
34f3a4a9
LY
5939 drm_dev_unplug(ddev);
5940
5941 amdgpu_irq_disable_all(adev);
5942
5943 amdgpu_fence_driver_hw_fini(adev);
5944
5945 adev->no_hw_access = true;
5946
5947 amdgpu_device_unmap_mmio(adev);
5948
5949 pci_disable_device(pdev);
5950 pci_wait_for_pending_transaction(pdev);
5951}
86700a40
XD
5952
5953u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
5954 u32 reg)
5955{
5956 unsigned long flags, address, data;
5957 u32 r;
5958
5959 address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
5960 data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
5961
5962 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
5963 WREG32(address, reg * 4);
5964 (void)RREG32(address);
5965 r = RREG32(data);
5966 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
5967 return r;
5968}
5969
5970void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
5971 u32 reg, u32 v)
5972{
5973 unsigned long flags, address, data;
5974
5975 address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
5976 data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
5977
5978 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
5979 WREG32(address, reg * 4);
5980 (void)RREG32(address);
5981 WREG32(data, v);
5982 (void)RREG32(data);
5983 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
5984}
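
A hedged sketch of the read-modify-write pattern these index/data helpers enable; example_pcie_port_rmw() and its parameters are illustrative, not existing driver code.

/* Illustrative only: not part of the driver. */
static void example_pcie_port_rmw(struct amdgpu_device *adev,
				  u32 reg, u32 clear_mask, u32 set_bits)
{
	u32 v = amdgpu_device_pcie_port_rreg(adev, reg);

	v &= ~clear_mask;
	v |= set_bits;
	amdgpu_device_pcie_port_wreg(adev, reg, v);
}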
68ce8b24
CK
5985
5986/**
5987 * amdgpu_device_switch_gang - switch to a new gang
5988 * @adev: amdgpu_device pointer
5989 * @gang: the gang to switch to
5990 *
5991 * Try to switch to a new gang.
5992 * Returns: NULL if we switched to the new gang or a reference to the current
5993 * gang leader.
5994 */
5995struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
5996 struct dma_fence *gang)
5997{
5998 struct dma_fence *old = NULL;
5999
6000 do {
6001 dma_fence_put(old);
6002 rcu_read_lock();
6003 old = dma_fence_get_rcu_safe(&adev->gang_submit);
6004 rcu_read_unlock();
6005
6006 if (old == gang)
6007 break;
6008
6009 if (!dma_fence_is_signaled(old))
6010 return old;
6011
6012 } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
6013 old, gang) != old);
6014
6015 dma_fence_put(old);
6016 return NULL;
6017}
220c8cc8
AD
6018
6019bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
6020{
6021 switch (adev->asic_type) {
6022#ifdef CONFIG_DRM_AMDGPU_SI
6023 case CHIP_HAINAN:
6024#endif
6025 case CHIP_TOPAZ:
6026 /* chips with no display hardware */
6027 return false;
6028#ifdef CONFIG_DRM_AMDGPU_SI
6029 case CHIP_TAHITI:
6030 case CHIP_PITCAIRN:
6031 case CHIP_VERDE:
6032 case CHIP_OLAND:
6033#endif
6034#ifdef CONFIG_DRM_AMDGPU_CIK
6035 case CHIP_BONAIRE:
6036 case CHIP_HAWAII:
6037 case CHIP_KAVERI:
6038 case CHIP_KABINI:
6039 case CHIP_MULLINS:
6040#endif
6041 case CHIP_TONGA:
6042 case CHIP_FIJI:
6043 case CHIP_POLARIS10:
6044 case CHIP_POLARIS11:
6045 case CHIP_POLARIS12:
6046 case CHIP_VEGAM:
6047 case CHIP_CARRIZO:
6048 case CHIP_STONEY:
6049 /* chips with display hardware */
6050 return true;
6051 default:
6052 /* IP discovery */
6053 if (!adev->ip_versions[DCE_HWIP][0] ||
6054 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
6055 return false;
6056 return true;
6057 }
6058}
81283fee
JZ
6059
6060uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
6061 uint32_t inst, uint32_t reg_addr, char reg_name[],
6062 uint32_t expected_value, uint32_t mask)
6063{
6064 uint32_t ret = 0;
6065 uint32_t old_ = 0;
6066 uint32_t tmp_ = RREG32(reg_addr);
6067 uint32_t loop = adev->usec_timeout;
6068
6069 while ((tmp_ & (mask)) != (expected_value)) {
6070 if (old_ != tmp_) {
6071 loop = adev->usec_timeout;
6072 old_ = tmp_;
6073 } else
6074 udelay(1);
6075 tmp_ = RREG32(reg_addr);
6076 loop--;
6077 if (!loop) {
6078			DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
6079 inst, reg_name, (uint32_t)expected_value,
6080 (uint32_t)(tmp_ & (mask)));
6081 ret = -ETIMEDOUT;
6082 break;
6083 }
6084 }
6085 return ret;
6086}
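
Finally, a hedged example of calling the polling helper above; the register offset, bit, and example_wait_for_block_ready() name are placeholders, not real hardware definitions.

/* Placeholders for illustration only - not real register definitions. */
#define EXAMPLE_STATUS_REG	0x0000
#define EXAMPLE_READY_BIT	0x00000001

/* Illustrative only: not part of the driver. */
static int example_wait_for_block_ready(struct amdgpu_device *adev,
					uint32_t inst)
{
	/* Poll until the ready bit is set, or warn and time out. */
	return amdgpu_device_wait_on_rreg(adev, inst, EXAMPLE_STATUS_REG,
					  "EXAMPLE_STATUS", EXAMPLE_READY_BIT,
					  EXAMPLE_READY_BIT);
}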