drm/amd/virt: For SRIOV send GPU reset directly to TDR queue.
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/iommu.h>

#include <drm/drm_atomic_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/pci.h>
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vangogh_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/yellow_carp_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"ALDEBARAN",
	"NAVI10",
	"CYAN_SKILLFISH",
	"NAVI14",
	"NAVI12",
	"SIENNA_CICHLID",
	"NAVY_FLOUNDER",
	"VANGOGH",
	"DIMGREY_CAVEFISH",
	"BEIGE_GOBY",
	"YELLOW_CARP",
	"IP DISCOVERY",
	"LAST",
};

/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
 */

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return sysfs_emit(buf, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
		amdgpu_device_get_pcie_replay_count, NULL);

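/*
 * Illustrative only (not part of the driver): the attribute above shows up
 * as a sysfs file on the PCI device and can be read from userspace. A
 * minimal sketch, assuming the GPU is card0:
 *
 *	FILE *f = fopen("/sys/class/drm/card0/device/pcie_replay_count", "r");
 *	unsigned long long replays;
 *
 *	if (f && fscanf(f, "%llu", &replays) == 1)
 *		printf("PCIe replays: %llu\n", replays);
 */
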
static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);

/**
 * DOC: product_name
 *
 * The amdgpu driver provides a sysfs API for reporting the product name
 * for the device.
 * The file product_name is used for this and returns the product name
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_product_name(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return sysfs_emit(buf, "%s\n", adev->product_name);
}

static DEVICE_ATTR(product_name, S_IRUGO,
		amdgpu_device_get_product_name, NULL);

/**
 * DOC: product_number
 *
 * The amdgpu driver provides a sysfs API for reporting the part number
 * for the device.
 * The file product_number is used for this and returns the part number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_product_number(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return sysfs_emit(buf, "%s\n", adev->product_number);
}

static DEVICE_ATTR(product_number, S_IRUGO,
		amdgpu_device_get_product_number, NULL);

/**
 * DOC: serial_number
 *
 * The amdgpu driver provides a sysfs API for reporting the serial number
 * for the device.
 * The file serial_number is used for this and returns the serial number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_serial_number(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return sysfs_emit(buf, "%s\n", adev->serial);
}

static DEVICE_ATTR(serial_number, S_IRUGO,
		amdgpu_device_get_serial_number, NULL);

/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise return false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

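/*
 * Illustrative sketch (not taken from this file): callers that set up
 * runtime power management typically probe the helpers above in order,
 * roughly like:
 *
 *	if (amdgpu_device_supports_px(ddev))
 *		... use ATPX dGPU power control ...
 *	else if (amdgpu_device_supports_boco(ddev))
 *		... use ACPI power resources (BOCO/_PR3) ...
 *	else if (amdgpu_device_supports_baco(ddev))
 *		... fall back to BACO ...
 */
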
/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must be > @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

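/*
 * A minimal sketch of the MM_INDEX/MM_DATA window used above (illustrative):
 * each dword access programs MM_INDEX with the low bits of the VRAM offset
 * (bit 31 selects the extended aperture), MM_INDEX_HI with the remaining
 * high bits, and then reads or writes MM_DATA:
 *
 *	WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
 *	WREG32_NO_KIQ(mmMM_INDEX_HI, pos >> 31);
 *	val = RREG32_NO_KIQ(mmMM_DATA);
 */
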
/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must be > @size
 * @write: true - write to vram, otherwise - read from vram
 *
 * The return value means how many bytes have been transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must be > @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try to use the vram aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM to access the rest of vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

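/*
 * Illustrative use of the helper above (a sketch, not taken from this file):
 * dump the first 256 bytes of VRAM into a local buffer. The helper prefers
 * the CPU-visible aperture and transparently falls back to MM_INDEX/MM_DATA
 * for anything outside of it.
 *
 *	u32 data[64];
 *
 *	amdgpu_device_vram_access(adev, 0, data, sizeof(data), false);
 */
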
/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_sem))
			up_read(&adev->reset_sem);
		else
			lockdep_assert_held(&adev->reset_sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_sem)) {
			ret = amdgpu_kiq_rreg(adev, reg);
			up_read(&adev->reset_sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: bytes offset from MMIO start
 *
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset: bytes offset from MMIO start
 * @value: the value to be written to the register
 *
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_sem)) {
			amdgpu_kiq_wreg(adev, reg, v);
			up_read(&adev->reset_sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

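/*
 * Note (illustrative, register names are hypothetical): most callers do not
 * use amdgpu_device_rreg()/amdgpu_device_wreg() directly but go through the
 * RREG32()/WREG32() style macros that wrap them, e.g. a read-modify-write:
 *
 *	tmp = RREG32(mmSOME_REG);
 *	tmp |= SOME_ENABLE_BIT;
 *	WREG32(mmSOME_REG, tmp);
 */
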
/**
 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
 *
 * this function is invoked only for debugfs register access
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return adev->gfx.rlc.funcs->sriov_wreg(adev, reg, v, 0, 0);
	} else if ((reg * 4) >= adev->rmmio_size) {
		adev->pcie_wreg(adev, reg * 4, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_mm_rdoorbell - read a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (CIK).
 */
u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (index < adev->doorbell.num_doorbells) {
		return readl(adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}

/**
 * amdgpu_mm_wdoorbell - write a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (CIK).
 */
void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (index < adev->doorbell.num_doorbells) {
		writel(v, adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

/**
 * amdgpu_mm_rdoorbell64 - read a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (index < adev->doorbell.num_doorbells) {
		return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}

/**
 * amdgpu_mm_wdoorbell64 - write a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (index < adev->doorbell.num_doorbells) {
		atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

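/*
 * Illustrative sketch (not from this file): ring code usually rings its
 * doorbell through the WDOORBELL64()/WDOORBELL() wrappers around the
 * helpers above, e.g. to publish a new write pointer:
 *
 *	if (ring->use_doorbell)
 *		WDOORBELL64(ring->doorbell_index, ring->wptr);
 */
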
/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 pcie_index, u32 pcie_data,
				u32 reg_addr)
{
	unsigned long flags;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 pcie_index, u32 pcie_data,
				  u32 reg_addr)
{
	unsigned long flags;
	u64 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register address
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 pcie_index, u32 pcie_data,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 pcie_index, u32 pcie_data,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

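/*
 * Illustrative sketch (names are hypothetical, not from this file): an ASIC
 * implementation typically wires these helpers up as its adev->pcie_rreg/
 * adev->pcie_wreg callbacks, passing its own PCIE index/data register pair:
 *
 *	static u32 soc_pcie_rreg(struct amdgpu_device *adev, u32 reg)
 *	{
 *		u32 idx = adev->nbio.funcs->get_pcie_index_offset(adev);
 *		u32 data = adev->nbio.funcs->get_pcie_data_offset(adev);
 *
 *		return amdgpu_device_indirect_rreg(adev, idx, data, reg);
 *	}
 */
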
/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	amdgpu_asic_pre_asic_init(adev);

	return amdgpu_atom_asic_init(adev->mode_info.atom_context);
}

/**
 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
				       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
				       &adev->vram_scratch.robj,
				       &adev->vram_scratch.gpu_addr,
				       (void **)&adev->vram_scratch.ptr);
}

/**
 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with and/or masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

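/*
 * Example (illustrative, the register name is hypothetical): the
 * golden-settings tables consumed by the helper above are flat arrays of
 * {reg, and_mask, or_mask} triplets:
 *
 *	static const u32 golden_settings_example[] = {
 *		mmSOME_REG, 0xffffff0f, 0x00000010,
 *	};
 *
 *	amdgpu_device_program_register_sequence(adev, golden_settings_example,
 *						ARRAY_SIZE(golden_settings_example));
 */
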
/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * GPU doorbell aperture helper functions.
 */
/**
 * amdgpu_device_doorbell_init - Init doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Init doorbell driver information (CIK)
 * Returns 0 on success, error on failure.
 */
static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
{

	/* No doorbell on SI hardware generation */
	if (adev->asic_type < CHIP_BONAIRE) {
		adev->doorbell.base = 0;
		adev->doorbell.size = 0;
		adev->doorbell.num_doorbells = 0;
		adev->doorbell.ptr = NULL;
		return 0;
	}

	if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
		return -EINVAL;

	amdgpu_asic_init_doorbell_index(adev);

	/* doorbell bar mapping */
	adev->doorbell.base = pci_resource_start(adev->pdev, 2);
	adev->doorbell.size = pci_resource_len(adev->pdev, 2);

	adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
					     adev->doorbell_index.max_assignment+1);
	if (adev->doorbell.num_doorbells == 0)
		return -EINVAL;

	/* For Vega, reserve and map two pages on doorbell BAR since SDMA
	 * paging queue doorbell use the second page. The
	 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
	 * doorbells are in the first page. So with paging queue enabled,
	 * the max num_doorbells should be incremented by one page (0x400 in dwords).
	 */
	if (adev->asic_type >= CHIP_VEGA10)
		adev->doorbell.num_doorbells += 0x400;

	adev->doorbell.ptr = ioremap(adev->doorbell.base,
				     adev->doorbell.num_doorbells *
				     sizeof(u32));
	if (adev->doorbell.ptr == NULL)
		return -ENOMEM;

	return 0;
}

/**
 * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Tear down doorbell driver information (CIK)
 */
static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
{
	iounmap(adev->doorbell.ptr);
	adev->doorbell.ptr = NULL;
}


/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or an -error on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);

	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	wb >>= 3;
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
}

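/*
 * Illustrative use of the writeback helpers above (a sketch, not from this
 * file): the returned value is a dword offset into the writeback page, so
 * the CPU and GPU addresses of the slot are derived like this:
 *
 *	u32 wb;
 *	int r;
 *
 *	r = amdgpu_device_wb_get(adev, &wb);
 *	if (r)
 *		return r;
 *	cpu_ptr  = &adev->wb.wb[wb];
 *	gpu_addr = adev->wb.gpu_addr + (wb * 4);
 *	...
 *	amdgpu_device_wb_free(adev, wb);
 */
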
/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the size we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned i;
	u16 cmd;
	int r;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_device_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_device_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

/*
 * GPU helpers function.
 */
/**
 * amdgpu_device_need_post - check if the hw need post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if need or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In the whole-GPU pass-through virtualization case, after
		 * a VM reboot some old SMC firmware still needs the driver to do a
		 * vPost, otherwise the GPU hangs. SMC firmware versions above 22.15
		 * don't have this flaw, so we force vPost to be executed for SMC
		 * versions below 22.15.
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;
			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if an error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->gmc.xgmi.pending_reset)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @pdev: PCI device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
		bool state)
{
	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));
	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines the number of bits in page table versus page directory,
 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
 * page table and the remaining bits are in the page directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}

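/*
 * Worked example (illustrative): with the minimum amdgpu_vm_block_size of 9,
 * one page table covers 2^9 entries * 4KB pages = 2MB of GPU virtual address
 * space per page-directory entry; the address bits above those 12 + 9 bits
 * select page-directory entries.
 */
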
/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
	struct sysinfo si;
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;

	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("Smu memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("Not enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}

static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
{
	if (!(adev->flags & AMD_IS_APU) ||
	    adev->asic_type < CHIP_RAVEN)
		return 0;

	switch (adev->asic_type) {
	case CHIP_RAVEN:
		if (adev->pdev->device == 0x15dd)
			adev->apu_flags |= AMD_APU_IS_RAVEN;
		if (adev->pdev->device == 0x15d8)
			adev->apu_flags |= AMD_APU_IS_PICASSO;
		break;
	case CHIP_RENOIR:
		if ((adev->pdev->device == 0x1636) ||
		    (adev->pdev->device == 0x164c))
			adev->apu_flags |= AMD_APU_IS_RENOIR;
		else
			adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
		break;
	case CHIP_VANGOGH:
		adev->apu_flags |= AMD_APU_IS_VANGOGH;
		break;
	case CHIP_YELLOW_CARP:
		break;
	case CHIP_CYAN_SKILLFISH:
		if (adev->pdev->device == 0x13FE)
			adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
		break;
	default:
		break;
	}

	return 0;
}

/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
 */
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
{
	if (amdgpu_sched_jobs < 4) {
		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = 4;
	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
	}

	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
		/* gart size must be greater than or equal to 32M */
		dev_warn(adev->dev, "gart size (%d) too small\n",
			 amdgpu_gart_size);
		amdgpu_gart_size = -1;
	}

	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
		/* gtt size must be greater than or equal to 32M */
		dev_warn(adev->dev, "gtt size (%d) too small\n",
			 amdgpu_gtt_size);
		amdgpu_gtt_size = -1;
	}

	/* valid range is between 4 and 9 inclusive */
	if (amdgpu_vm_fragment_size != -1 &&
	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
		dev_warn(adev->dev, "valid range is between 4 and 9\n");
		amdgpu_vm_fragment_size = -1;
	}

	if (amdgpu_sched_hw_submission < 2) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = 2;
	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
	}

	amdgpu_device_check_smu_prv_buffer_size(adev);

	amdgpu_device_check_vm_size(adev);

	amdgpu_device_check_block_size(adev);

	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

	amdgpu_gmc_tmz_set(adev);

	amdgpu_gmc_noretry_set(adev);

	return 0;
}

/**
 * amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver. Suspends or resumes
 * the asics before or after it is powered up using ACPI methods.
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
					enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	int r;

	if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
		return;

	if (state == VGA_SWITCHEROO_ON) {
		pr_info("switched on\n");
		/* don't suspend or resume card normally */
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

		pci_set_power_state(pdev, PCI_D0);
		amdgpu_device_load_pci_state(pdev);
		r = pci_enable_device(pdev);
		if (r)
			DRM_WARN("pci_enable_device failed (%d)\n", r);
		amdgpu_device_resume(dev, true);

		dev->switch_power_state = DRM_SWITCH_POWER_ON;
	} else {
		pr_info("switched off\n");
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
		amdgpu_device_suspend(dev, true);
		amdgpu_device_cache_pci_state(pdev);
		/* Shut down the device */
		pci_disable_device(pdev);
		pci_set_power_state(pdev, PCI_D3cold);
		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
	}
}

/**
 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
 *
 * @pdev: pci dev pointer
 *
 * Callback for the switcheroo driver. Check if the switcheroo
 * state can be changed.
 * Returns true if the state can be changed, false if not.
 */
static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

	/*
	 * FIXME: open_count is protected by drm_global_mutex but that would lead to
	 * locking inversion with the driver load path. And the access here is
	 * completely racy anyway. So don't bother with locking for now.
	 */
	return atomic_read(&dev->open_count) == 0;
}

static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
};

/**
 * amdgpu_device_ip_set_clockgating_state - set the CG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: clockgating state (gate or ungate)
 *
 * Sets the requested clockgating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_clockgating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_clockgating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

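/*
 * Illustrative call (a sketch, not from this file): power-management code
 * can gate the clocks of a single IP type with, for example:
 *
 *	amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_GFX,
 *					       AMD_CG_STATE_GATE);
 */
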
e3ecdffa
AD
1629/**
1630 * amdgpu_device_ip_set_powergating_state - set the PG state
1631 *
87e3f136 1632 * @dev: amdgpu_device pointer
e3ecdffa
AD
1633 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1634 * @state: powergating state (gate or ungate)
1635 *
1636 * Sets the requested powergating state for all instances of
1637 * the hardware IP specified.
1638 * Returns the error code from the last instance.
1639 */
43fa561f 1640int amdgpu_device_ip_set_powergating_state(void *dev,
2990a1fc
AD
1641 enum amd_ip_block_type block_type,
1642 enum amd_powergating_state state)
d38ceaf9 1643{
43fa561f 1644 struct amdgpu_device *adev = dev;
d38ceaf9
AD
1645 int i, r = 0;
1646
1647 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 1648 if (!adev->ip_blocks[i].status.valid)
9ecbe7f5 1649 continue;
c722865a
RZ
1650 if (adev->ip_blocks[i].version->type != block_type)
1651 continue;
1652 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1653 continue;
1654 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1655 (void *)adev, state);
1656 if (r)
1657 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1658 adev->ip_blocks[i].version->funcs->name, r);
d38ceaf9
AD
1659 }
1660 return r;
1661}
1662
e3ecdffa
AD
1663/**
1664 * amdgpu_device_ip_get_clockgating_state - get the CG state
1665 *
1666 * @adev: amdgpu_device pointer
1667 * @flags: clockgating feature flags
1668 *
1669 * Walks the list of IPs on the device and updates the clockgating
1670 * flags for each IP.
1671 * Updates @flags with the feature flags for each hardware IP where
1672 * clockgating is enabled.
1673 */
2990a1fc
AD
1674void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1675 u32 *flags)
6cb2d4e4
HR
1676{
1677 int i;
1678
1679 for (i = 0; i < adev->num_ip_blocks; i++) {
1680 if (!adev->ip_blocks[i].status.valid)
1681 continue;
1682 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1683 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1684 }
1685}
1686
e3ecdffa
AD
1687/**
1688 * amdgpu_device_ip_wait_for_idle - wait for idle
1689 *
1690 * @adev: amdgpu_device pointer
1691 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1692 *
1693 * Waits for the request hardware IP to be idle.
1694 * Returns 0 for success or a negative error code on failure.
1695 */
2990a1fc
AD
1696int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1697 enum amd_ip_block_type block_type)
5dbbb60b
AD
1698{
1699 int i, r;
1700
1701 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 1702 if (!adev->ip_blocks[i].status.valid)
9ecbe7f5 1703 continue;
a1255107
AD
1704 if (adev->ip_blocks[i].version->type == block_type) {
1705 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
5dbbb60b
AD
1706 if (r)
1707 return r;
1708 break;
1709 }
1710 }
1711 return 0;
1712
1713}
1714
e3ecdffa
AD
1715/**
1716 * amdgpu_device_ip_is_idle - is the hardware IP idle
1717 *
1718 * @adev: amdgpu_device pointer
1719 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1720 *
1721 * Check if the hardware IP is idle or not.
1722 * Returns true if it the IP is idle, false if not.
1723 */
2990a1fc
AD
1724bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1725 enum amd_ip_block_type block_type)
5dbbb60b
AD
1726{
1727 int i;
1728
1729 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 1730 if (!adev->ip_blocks[i].status.valid)
9ecbe7f5 1731 continue;
a1255107
AD
1732 if (adev->ip_blocks[i].version->type == block_type)
1733 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
5dbbb60b
AD
1734 }
1735 return true;
1736
1737}
1738
e3ecdffa
AD
1739/**
1740 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1741 *
1742 * @adev: amdgpu_device pointer
87e3f136 1743 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
e3ecdffa
AD
1744 *
1745 * Returns a pointer to the hardware IP block structure
1746 * if it exists for the asic, otherwise NULL.
1747 */
2990a1fc
AD
1748struct amdgpu_ip_block *
1749amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1750 enum amd_ip_block_type type)
d38ceaf9
AD
1751{
1752 int i;
1753
1754 for (i = 0; i < adev->num_ip_blocks; i++)
a1255107 1755 if (adev->ip_blocks[i].version->type == type)
d38ceaf9
AD
1756 return &adev->ip_blocks[i];
1757
1758 return NULL;
1759}
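/*
 * Usage sketch (hypothetical): look up the GMC block and report which IP
 * driver version was registered for it.
 */
static void example_dump_gmc_version(struct amdgpu_device *adev)
{
	struct amdgpu_ip_block *block =
		amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GMC);

	if (block)
		DRM_INFO("GMC handled by %s v%u.%u\n",
			 block->version->funcs->name,
			 block->version->major, block->version->minor);
}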
1760
1761/**
2990a1fc 1762 * amdgpu_device_ip_block_version_cmp
d38ceaf9
AD
1763 *
1764 * @adev: amdgpu_device pointer
5fc3aeeb 1765 * @type: enum amd_ip_block_type
d38ceaf9
AD
1766 * @major: major version
1767 * @minor: minor version
1768 *
1769 * Returns 0 if the installed IP block version is equal to or greater than
1770 * the one specified, 1 if it is smaller or the ip_block doesn't exist.
1771 */
2990a1fc
AD
1772int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1773 enum amd_ip_block_type type,
1774 u32 major, u32 minor)
d38ceaf9 1775{
2990a1fc 1776 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
d38ceaf9 1777
a1255107
AD
1778 if (ip_block && ((ip_block->version->major > major) ||
1779 ((ip_block->version->major == major) &&
1780 (ip_block->version->minor >= minor))))
d38ceaf9
AD
1781 return 0;
1782
1783 return 1;
1784}
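/*
 * Usage sketch (hypothetical): gate a feature on the SMU IP being at least
 * version 11.0. Note the inverted sense: 0 means "equal or greater".
 */
static bool example_smu_is_at_least_v11(struct amdgpu_device *adev)
{
	return amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_SMC,
						  11, 0) == 0;
}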
1785
a1255107 1786/**
2990a1fc 1787 * amdgpu_device_ip_block_add
a1255107
AD
1788 *
1789 * @adev: amdgpu_device pointer
1790 * @ip_block_version: pointer to the IP to add
1791 *
1792 * Adds the IP block driver information to the collection of IPs
1793 * on the asic.
1794 */
2990a1fc
AD
1795int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1796 const struct amdgpu_ip_block_version *ip_block_version)
a1255107
AD
1797{
1798 if (!ip_block_version)
1799 return -EINVAL;
1800
7bd939d0
LG
1801 switch (ip_block_version->type) {
1802 case AMD_IP_BLOCK_TYPE_VCN:
1803 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
1804 return 0;
1805 break;
1806 case AMD_IP_BLOCK_TYPE_JPEG:
1807 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
1808 return 0;
1809 break;
1810 default:
1811 break;
1812 }
1813
e966a725 1814 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
a0bae357
HR
1815 ip_block_version->funcs->name);
1816
a1255107
AD
1817 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1818
1819 return 0;
1820}
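/*
 * Usage sketch (hypothetical helper): an ASIC-specific *_set_ip_blocks()
 * routine calls the helper above once per IP driver, in dependency order
 * (COMMON and GMC first, display and media last). The array-of-pointers
 * shape below is only an illustration of that pattern.
 */
static int example_register_ip_blocks(struct amdgpu_device *adev,
				      const struct amdgpu_ip_block_version * const *blocks,
				      int count)
{
	int i, r;

	for (i = 0; i < count; i++) {
		r = amdgpu_device_ip_block_add(adev, blocks[i]);
		if (r)
			return r;
	}
	return 0;
}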
1821
e3ecdffa
AD
1822/**
1823 * amdgpu_device_enable_virtual_display - enable virtual display feature
1824 *
1825 * @adev: amdgpu_device pointer
1826 *
1827 * Enables the virtual display feature if the user has enabled it via
1828 * the module parameter virtual_display. This feature provides a virtual
1829 * display hardware on headless boards or in virtualized environments.
1830 * This function parses and validates the configuration string specified by
1831 * the user and configures the virtual display configuration (number of
1832 * virtual connectors, crtcs, etc.) specified.
1833 */
483ef985 1834static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
9accf2fd
ED
1835{
1836 adev->enable_virtual_display = false;
1837
1838 if (amdgpu_virtual_display) {
8f66090b 1839 const char *pci_address_name = pci_name(adev->pdev);
0f66356d 1840 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
9accf2fd
ED
1841
1842 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1843 pciaddstr_tmp = pciaddstr;
0f66356d
ED
1844 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1845 pciaddname = strsep(&pciaddname_tmp, ",");
967de2a9
YT
1846 if (!strcmp("all", pciaddname)
1847 || !strcmp(pci_address_name, pciaddname)) {
0f66356d
ED
1848 long num_crtc;
1849 int res = -1;
1850
9accf2fd 1851 adev->enable_virtual_display = true;
0f66356d
ED
1852
1853 if (pciaddname_tmp)
1854 res = kstrtol(pciaddname_tmp, 10,
1855 &num_crtc);
1856
1857 if (!res) {
1858 if (num_crtc < 1)
1859 num_crtc = 1;
1860 if (num_crtc > 6)
1861 num_crtc = 6;
1862 adev->mode_info.num_crtc = num_crtc;
1863 } else {
1864 adev->mode_info.num_crtc = 1;
1865 }
9accf2fd
ED
1866 break;
1867 }
1868 }
1869
0f66356d
ED
1870 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1871 amdgpu_virtual_display, pci_address_name,
1872 adev->enable_virtual_display, adev->mode_info.num_crtc);
9accf2fd
ED
1873
1874 kfree(pciaddstr);
1875 }
1876}
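/*
 * Example of the configuration string parsed above (the PCI addresses and
 * CRTC counts are illustrative):
 *
 *   amdgpu.virtual_display=0000:03:00.0,2;0000:04:00.0,1
 *
 * enables two virtual CRTCs on the first device and one on the second, while
 *
 *   amdgpu.virtual_display=all,1
 *
 * enables a single virtual CRTC on every amdgpu device. The CRTC count is
 * clamped to the 1..6 range by the code above.
 */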
1877
e3ecdffa
AD
1878/**
1879 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1880 *
1881 * @adev: amdgpu_device pointer
1882 *
1883 * Parses the asic configuration parameters specified in the gpu info
1884 * firmware and makes them available to the driver for use in configuring
1885 * the asic.
1886 * Returns 0 on success, -EINVAL on failure.
1887 */
e2a75f88
AD
1888static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1889{
e2a75f88 1890 const char *chip_name;
c0a43457 1891 char fw_name[40];
e2a75f88
AD
1892 int err;
1893 const struct gpu_info_firmware_header_v1_0 *hdr;
1894
ab4fe3e1
HR
1895 adev->firmware.gpu_info_fw = NULL;
1896
72de33f8 1897 if (adev->mman.discovery_bin) {
258620d0 1898 amdgpu_discovery_get_gfx_info(adev);
cc375d8c
TY
1899
1900 /*
1901 * FIXME: The bounding box is still needed by Navi12, so
1902 * temporarily read it from gpu_info firmware. Should be dropped
1903 * when DAL no longer needs it.
1904 */
1905 if (adev->asic_type != CHIP_NAVI12)
1906 return 0;
258620d0
AD
1907 }
1908
e2a75f88 1909 switch (adev->asic_type) {
e2a75f88
AD
1910#ifdef CONFIG_DRM_AMDGPU_SI
1911 case CHIP_VERDE:
1912 case CHIP_TAHITI:
1913 case CHIP_PITCAIRN:
1914 case CHIP_OLAND:
1915 case CHIP_HAINAN:
1916#endif
1917#ifdef CONFIG_DRM_AMDGPU_CIK
1918 case CHIP_BONAIRE:
1919 case CHIP_HAWAII:
1920 case CHIP_KAVERI:
1921 case CHIP_KABINI:
1922 case CHIP_MULLINS:
1923#endif
da87c30b
AD
1924 case CHIP_TOPAZ:
1925 case CHIP_TONGA:
1926 case CHIP_FIJI:
1927 case CHIP_POLARIS10:
1928 case CHIP_POLARIS11:
1929 case CHIP_POLARIS12:
1930 case CHIP_VEGAM:
1931 case CHIP_CARRIZO:
1932 case CHIP_STONEY:
27c0bc71 1933 case CHIP_VEGA20:
44b3253a 1934 case CHIP_ALDEBARAN:
84d244a3
JC
1935 case CHIP_SIENNA_CICHLID:
1936 case CHIP_NAVY_FLOUNDER:
eac88a5f 1937 case CHIP_DIMGREY_CAVEFISH:
0e5f4b09 1938 case CHIP_BEIGE_GOBY:
e2a75f88
AD
1939 default:
1940 return 0;
1941 case CHIP_VEGA10:
1942 chip_name = "vega10";
1943 break;
3f76dced
AD
1944 case CHIP_VEGA12:
1945 chip_name = "vega12";
1946 break;
2d2e5e7e 1947 case CHIP_RAVEN:
54f78a76 1948 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
54c4d17e 1949 chip_name = "raven2";
54f78a76 1950 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
741deade 1951 chip_name = "picasso";
54c4d17e
FX
1952 else
1953 chip_name = "raven";
2d2e5e7e 1954 break;
65e60f6e
LM
1955 case CHIP_ARCTURUS:
1956 chip_name = "arcturus";
1957 break;
b51a26a0 1958 case CHIP_RENOIR:
2e62f0b5
PL
1959 if (adev->apu_flags & AMD_APU_IS_RENOIR)
1960 chip_name = "renoir";
1961 else
1962 chip_name = "green_sardine";
b51a26a0 1963 break;
23c6268e
HR
1964 case CHIP_NAVI10:
1965 chip_name = "navi10";
1966 break;
ed42cfe1
XY
1967 case CHIP_NAVI14:
1968 chip_name = "navi14";
1969 break;
42b325e5
XY
1970 case CHIP_NAVI12:
1971 chip_name = "navi12";
1972 break;
4e52a9f8
HR
1973 case CHIP_VANGOGH:
1974 chip_name = "vangogh";
1975 break;
8bf84f60
AL
1976 case CHIP_YELLOW_CARP:
1977 chip_name = "yellow_carp";
1978 break;
e2a75f88
AD
1979 }
1980
1981 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
ab4fe3e1 1982 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
e2a75f88
AD
1983 if (err) {
1984 dev_err(adev->dev,
1985 "Failed to load gpu_info firmware \"%s\"\n",
1986 fw_name);
1987 goto out;
1988 }
ab4fe3e1 1989 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
e2a75f88
AD
1990 if (err) {
1991 dev_err(adev->dev,
1992 "Failed to validate gpu_info firmware \"%s\"\n",
1993 fw_name);
1994 goto out;
1995 }
1996
ab4fe3e1 1997 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
e2a75f88
AD
1998 amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1999
2000 switch (hdr->version_major) {
2001 case 1:
2002 {
2003 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
ab4fe3e1 2004 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
e2a75f88
AD
2005 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2006
cc375d8c
TY
2007 /*
2008 * Should be dropped when DAL no longer needs it.
2009 */
2010 if (adev->asic_type == CHIP_NAVI12)
ec51d3fa
XY
2011 goto parse_soc_bounding_box;
2012
b5ab16bf
AD
2013 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
2014 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
2015 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
2016 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
e2a75f88 2017 adev->gfx.config.max_texture_channel_caches =
b5ab16bf
AD
2018 le32_to_cpu(gpu_info_fw->gc_num_tccs);
2019 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
2020 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
2021 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
2022 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
e2a75f88 2023 adev->gfx.config.double_offchip_lds_buf =
b5ab16bf
AD
2024 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
2025 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
51fd0370
HZ
2026 adev->gfx.cu_info.max_waves_per_simd =
2027 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
2028 adev->gfx.cu_info.max_scratch_slots_per_cu =
2029 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
2030 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
48321c3d 2031 if (hdr->version_minor >= 1) {
35c2e910
HZ
2032 const struct gpu_info_firmware_v1_1 *gpu_info_fw =
2033 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
2034 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2035 adev->gfx.config.num_sc_per_sh =
2036 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
2037 adev->gfx.config.num_packer_per_sc =
2038 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
2039 }
ec51d3fa
XY
2040
2041parse_soc_bounding_box:
ec51d3fa
XY
2042 /*
2043 * soc bounding box info is not integrated in discovery table,
258620d0 2044 * so we still need to parse it from the gpu info firmware when needed.
ec51d3fa 2045 */
48321c3d
HW
2046 if (hdr->version_minor == 2) {
2047 const struct gpu_info_firmware_v1_2 *gpu_info_fw =
2048 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
2049 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2050 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
2051 }
e2a75f88
AD
2052 break;
2053 }
2054 default:
2055 dev_err(adev->dev,
2056 "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
2057 err = -EINVAL;
2058 goto out;
2059 }
2060out:
e2a75f88
AD
2061 return err;
2062}
2063
e3ecdffa
AD
2064/**
2065 * amdgpu_device_ip_early_init - run early init for hardware IPs
2066 *
2067 * @adev: amdgpu_device pointer
2068 *
2069 * Early initialization pass for hardware IPs. The hardware IPs that make
2070 * up each asic are discovered and each IP's early_init callback is run. This
2071 * is the first stage in initializing the asic.
2072 * Returns 0 on success, negative error code on failure.
2073 */
06ec9070 2074static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
d38ceaf9 2075{
aaa36a97 2076 int i, r;
d38ceaf9 2077
483ef985 2078 amdgpu_device_enable_virtual_display(adev);
a6be7570 2079
00a979f3 2080 if (amdgpu_sriov_vf(adev)) {
00a979f3 2081 r = amdgpu_virt_request_full_gpu(adev, true);
aaa36a97
AD
2082 if (r)
2083 return r;
00a979f3
WS
2084 }
2085
d38ceaf9 2086 switch (adev->asic_type) {
33f34802
KW
2087#ifdef CONFIG_DRM_AMDGPU_SI
2088 case CHIP_VERDE:
2089 case CHIP_TAHITI:
2090 case CHIP_PITCAIRN:
2091 case CHIP_OLAND:
2092 case CHIP_HAINAN:
295d0daf 2093 adev->family = AMDGPU_FAMILY_SI;
33f34802
KW
2094 r = si_set_ip_blocks(adev);
2095 if (r)
2096 return r;
2097 break;
2098#endif
a2e73f56
AD
2099#ifdef CONFIG_DRM_AMDGPU_CIK
2100 case CHIP_BONAIRE:
2101 case CHIP_HAWAII:
2102 case CHIP_KAVERI:
2103 case CHIP_KABINI:
2104 case CHIP_MULLINS:
e1ad2d53 2105 if (adev->flags & AMD_IS_APU)
a2e73f56 2106 adev->family = AMDGPU_FAMILY_KV;
e1ad2d53
AD
2107 else
2108 adev->family = AMDGPU_FAMILY_CI;
a2e73f56
AD
2109
2110 r = cik_set_ip_blocks(adev);
2111 if (r)
2112 return r;
2113 break;
2114#endif
da87c30b
AD
2115 case CHIP_TOPAZ:
2116 case CHIP_TONGA:
2117 case CHIP_FIJI:
2118 case CHIP_POLARIS10:
2119 case CHIP_POLARIS11:
2120 case CHIP_POLARIS12:
2121 case CHIP_VEGAM:
2122 case CHIP_CARRIZO:
2123 case CHIP_STONEY:
2124 if (adev->flags & AMD_IS_APU)
2125 adev->family = AMDGPU_FAMILY_CZ;
2126 else
2127 adev->family = AMDGPU_FAMILY_VI;
2128
2129 r = vi_set_ip_blocks(adev);
2130 if (r)
2131 return r;
2132 break;
d38ceaf9 2133 default:
63352b7f
AD
2134 r = amdgpu_discovery_set_ip_blocks(adev);
2135 if (r)
2136 return r;
2137 break;
d38ceaf9
AD
2138 }
2139
1884734a 2140 amdgpu_amdkfd_device_probe(adev);
2141
3b94fb10 2142 adev->pm.pp_feature = amdgpu_pp_feature_mask;
a35ad98b 2143 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
00544006 2144 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
4215a119
HC
2145 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2146 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
00f54b97 2147
d38ceaf9
AD
2148 for (i = 0; i < adev->num_ip_blocks; i++) {
2149 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
ed8cf00c
HR
2150 DRM_ERROR("disabled ip block: %d <%s>\n",
2151 i, adev->ip_blocks[i].version->funcs->name);
a1255107 2152 adev->ip_blocks[i].status.valid = false;
d38ceaf9 2153 } else {
a1255107
AD
2154 if (adev->ip_blocks[i].version->funcs->early_init) {
2155 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2c1a2784 2156 if (r == -ENOENT) {
a1255107 2157 adev->ip_blocks[i].status.valid = false;
2c1a2784 2158 } else if (r) {
a1255107
AD
2159 DRM_ERROR("early_init of IP block <%s> failed %d\n",
2160 adev->ip_blocks[i].version->funcs->name, r);
d38ceaf9 2161 return r;
2c1a2784 2162 } else {
a1255107 2163 adev->ip_blocks[i].status.valid = true;
2c1a2784 2164 }
974e6b64 2165 } else {
a1255107 2166 adev->ip_blocks[i].status.valid = true;
d38ceaf9 2167 }
d38ceaf9 2168 }
21a249ca
AD
2169 /* get the vbios after the asic_funcs are set up */
2170 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
6e29c227
AD
2171 r = amdgpu_device_parse_gpu_info_fw(adev);
2172 if (r)
2173 return r;
2174
21a249ca
AD
2175 /* Read BIOS */
2176 if (!amdgpu_get_bios(adev))
2177 return -EINVAL;
2178
2179 r = amdgpu_atombios_init(adev);
2180 if (r) {
2181 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2182 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2183 return r;
2184 }
77eabc6f
PJZ
2185
2186 /* get pf2vf msg info at its earliest time */
2187 if (amdgpu_sriov_vf(adev))
2188 amdgpu_virt_init_data_exchange(adev);
2189
21a249ca 2190 }
d38ceaf9
AD
2191 }
2192
395d1fb9
NH
2193 adev->cg_flags &= amdgpu_cg_mask;
2194 adev->pg_flags &= amdgpu_pg_mask;
2195
d38ceaf9
AD
2196 return 0;
2197}
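/*
 * Note on the ip_block_mask test above: bit i of amdgpu.ip_block_mask maps to
 * the i-th IP block in registration order, so (illustrative value)
 * amdgpu.ip_block_mask=0xfffffffd marks the second registered block invalid
 * and its early_init callback is never run.
 */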
2198
0a4f2520
RZ
2199static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2200{
2201 int i, r;
2202
2203 for (i = 0; i < adev->num_ip_blocks; i++) {
2204 if (!adev->ip_blocks[i].status.sw)
2205 continue;
2206 if (adev->ip_blocks[i].status.hw)
2207 continue;
2208 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2d11fd3f 2209 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
0a4f2520
RZ
2210 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2211 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2212 if (r) {
2213 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2214 adev->ip_blocks[i].version->funcs->name, r);
2215 return r;
2216 }
2217 adev->ip_blocks[i].status.hw = true;
2218 }
2219 }
2220
2221 return 0;
2222}
2223
2224static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2225{
2226 int i, r;
2227
2228 for (i = 0; i < adev->num_ip_blocks; i++) {
2229 if (!adev->ip_blocks[i].status.sw)
2230 continue;
2231 if (adev->ip_blocks[i].status.hw)
2232 continue;
2233 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2234 if (r) {
2235 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2236 adev->ip_blocks[i].version->funcs->name, r);
2237 return r;
2238 }
2239 adev->ip_blocks[i].status.hw = true;
2240 }
2241
2242 return 0;
2243}
2244
7a3e0bb2
RZ
2245static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2246{
2247 int r = 0;
2248 int i;
80f41f84 2249 uint32_t smu_version;
7a3e0bb2
RZ
2250
2251 if (adev->asic_type >= CHIP_VEGA10) {
2252 for (i = 0; i < adev->num_ip_blocks; i++) {
482f0e53
ML
2253 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2254 continue;
2255
e3c1b071 2256 if (!adev->ip_blocks[i].status.sw)
2257 continue;
2258
482f0e53
ML
2259 /* no need to do the fw loading again if already done*/
2260 if (adev->ip_blocks[i].status.hw == true)
2261 break;
2262
53b3f8f4 2263 if (amdgpu_in_reset(adev) || adev->in_suspend) {
482f0e53
ML
2264 r = adev->ip_blocks[i].version->funcs->resume(adev);
2265 if (r) {
2266 DRM_ERROR("resume of IP block <%s> failed %d\n",
7a3e0bb2 2267 adev->ip_blocks[i].version->funcs->name, r);
482f0e53
ML
2268 return r;
2269 }
2270 } else {
2271 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2272 if (r) {
2273 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2274 adev->ip_blocks[i].version->funcs->name, r);
2275 return r;
7a3e0bb2 2276 }
7a3e0bb2 2277 }
482f0e53
ML
2278
2279 adev->ip_blocks[i].status.hw = true;
2280 break;
7a3e0bb2
RZ
2281 }
2282 }
482f0e53 2283
8973d9ec
ED
2284 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2285 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
7a3e0bb2 2286
80f41f84 2287 return r;
7a3e0bb2
RZ
2288}
2289
5fd8518d
AG
2290static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
2291{
2292 long timeout;
2293 int r, i;
2294
2295 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
2296 struct amdgpu_ring *ring = adev->rings[i];
2297
2298 /* No need to setup the GPU scheduler for rings that don't need it */
2299 if (!ring || ring->no_scheduler)
2300 continue;
2301
2302 switch (ring->funcs->type) {
2303 case AMDGPU_RING_TYPE_GFX:
2304 timeout = adev->gfx_timeout;
2305 break;
2306 case AMDGPU_RING_TYPE_COMPUTE:
2307 timeout = adev->compute_timeout;
2308 break;
2309 case AMDGPU_RING_TYPE_SDMA:
2310 timeout = adev->sdma_timeout;
2311 break;
2312 default:
2313 timeout = adev->video_timeout;
2314 break;
2315 }
2316
2317 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
2318 ring->num_hw_submission, amdgpu_job_hang_limit,
2319 timeout, adev->reset_domain.wq, ring->sched_score, ring->name);
2320 if (r) {
2321 DRM_ERROR("Failed to create scheduler on ring %s.\n",
2322 ring->name);
2323 return r;
2324 }
2325 }
2326
2327 return 0;
2328}
2329
2330
e3ecdffa
AD
2331/**
2332 * amdgpu_device_ip_init - run init for hardware IPs
2333 *
2334 * @adev: amdgpu_device pointer
2335 *
2336 * Main initialization pass for hardware IPs. The list of all the hardware
2337 * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2338 * are run. sw_init initializes the software state associated with each IP
2339 * and hw_init initializes the hardware associated with each IP.
2340 * Returns 0 on success, negative error code on failure.
2341 */
06ec9070 2342static int amdgpu_device_ip_init(struct amdgpu_device *adev)
d38ceaf9
AD
2343{
2344 int i, r;
2345
c030f2e4 2346 r = amdgpu_ras_init(adev);
2347 if (r)
2348 return r;
2349
d38ceaf9 2350 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 2351 if (!adev->ip_blocks[i].status.valid)
d38ceaf9 2352 continue;
a1255107 2353 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2c1a2784 2354 if (r) {
a1255107
AD
2355 DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2356 adev->ip_blocks[i].version->funcs->name, r);
72d3f592 2357 goto init_failed;
2c1a2784 2358 }
a1255107 2359 adev->ip_blocks[i].status.sw = true;
bfca0289 2360
d38ceaf9 2361 /* need to do gmc hw init early so we can allocate gpu mem */
a1255107 2362 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
892deb48
VS
2363 /* Try to reserve bad pages early */
2364 if (amdgpu_sriov_vf(adev))
2365 amdgpu_virt_exchange_data(adev);
2366
06ec9070 2367 r = amdgpu_device_vram_scratch_init(adev);
2c1a2784
AD
2368 if (r) {
2369 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
72d3f592 2370 goto init_failed;
2c1a2784 2371 }
a1255107 2372 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2c1a2784
AD
2373 if (r) {
2374 DRM_ERROR("hw_init %d failed %d\n", i, r);
72d3f592 2375 goto init_failed;
2c1a2784 2376 }
06ec9070 2377 r = amdgpu_device_wb_init(adev);
2c1a2784 2378 if (r) {
06ec9070 2379 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
72d3f592 2380 goto init_failed;
2c1a2784 2381 }
a1255107 2382 adev->ip_blocks[i].status.hw = true;
2493664f
ML
2383
2384 /* right after GMC hw init, we create CSA */
f92d5c61 2385 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
1e256e27
RZ
2386 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2387 AMDGPU_GEM_DOMAIN_VRAM,
2388 AMDGPU_CSA_SIZE);
2493664f
ML
2389 if (r) {
2390 DRM_ERROR("allocate CSA failed %d\n", r);
72d3f592 2391 goto init_failed;
2493664f
ML
2392 }
2393 }
d38ceaf9
AD
2394 }
2395 }
2396
c9ffa427 2397 if (amdgpu_sriov_vf(adev))
9a458402 2398 amdgpu_virt_init_data_exchange(adev);
c9ffa427 2399
533aed27
AG
2400 r = amdgpu_ib_pool_init(adev);
2401 if (r) {
2402 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2403 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2404 goto init_failed;
2405 }
2406
c8963ea4
RZ
2407 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2408 if (r)
72d3f592 2409 goto init_failed;
0a4f2520
RZ
2410
2411 r = amdgpu_device_ip_hw_init_phase1(adev);
2412 if (r)
72d3f592 2413 goto init_failed;
0a4f2520 2414
7a3e0bb2
RZ
2415 r = amdgpu_device_fw_loading(adev);
2416 if (r)
72d3f592 2417 goto init_failed;
7a3e0bb2 2418
0a4f2520
RZ
2419 r = amdgpu_device_ip_hw_init_phase2(adev);
2420 if (r)
72d3f592 2421 goto init_failed;
d38ceaf9 2422
121a2bc6
AG
2423 /*
2424 * retired pages will be loaded from eeprom and reserved here,
2425 * it should be called after amdgpu_device_ip_hw_init_phase2 since
2426 * for some ASICs the RAS EEPROM code relies on SMU fully functioning
2427 * for I2C communication, which is only true at this point.
b82e65a9
GC
2428 *
2429 * amdgpu_ras_recovery_init may fail, but the upper layers only care about
2430 * the failure from a bad gpu situation and stop the amdgpu init process
2431 * accordingly. For other failed cases, it will still release all
2432 * the resources and print an error message, rather than returning a
2433 * negative value to the upper level.
121a2bc6
AG
2434 *
2435 * Note: theoretically, this should be called before all vram allocations
2436 * to protect retired pages from being abused
2437 */
b82e65a9
GC
2438 r = amdgpu_ras_recovery_init(adev);
2439 if (r)
2440 goto init_failed;
121a2bc6 2441
a4c63caf
AG
2442 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2443 struct amdgpu_hive_info *hive;
2444
3e2e2ab5 2445 amdgpu_xgmi_add_device(adev);
e3c1b071 2446
a4c63caf
AG
2447 hive = amdgpu_get_xgmi_hive(adev);
2448 if (!hive || !hive->reset_domain.wq) {
2449 DRM_ERROR("Failed to obtain reset domain info for XGMI hive:%llx", hive->hive_id);
2450 r = -EINVAL;
2451 goto init_failed;
2452 }
2453
2454 adev->reset_domain.wq = hive->reset_domain.wq;
2455 } else {
2456 adev->reset_domain.wq = alloc_ordered_workqueue("amdgpu-reset-dev", 0);
2457 if (!adev->reset_domain.wq) {
2458 r = -ENOMEM;
2459 goto init_failed;
2460 }
2461 }
2462
5fd8518d
AG
2463 r = amdgpu_device_init_schedulers(adev);
2464 if (r)
2465 goto init_failed;
2466
e3c1b071 2467 /* Don't init kfd if whole hive need to be reset during init */
2468 if (!adev->gmc.xgmi.pending_reset)
2469 amdgpu_amdkfd_device_init(adev);
c6332b97 2470
bd607166
KR
2471 amdgpu_fru_get_product_info(adev);
2472
72d3f592 2473init_failed:
c9ffa427 2474 if (amdgpu_sriov_vf(adev))
c6332b97 2475 amdgpu_virt_release_full_gpu(adev, true);
2476
72d3f592 2477 return r;
d38ceaf9
AD
2478}
2479
e3ecdffa
AD
2480/**
2481 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2482 *
2483 * @adev: amdgpu_device pointer
2484 *
2485 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
2486 * this function before a GPU reset. If the value is retained after a
2487 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
2488 */
06ec9070 2489static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
0c49e0b8
CZ
2490{
2491 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2492}
2493
e3ecdffa
AD
2494/**
2495 * amdgpu_device_check_vram_lost - check if vram is valid
2496 *
2497 * @adev: amdgpu_device pointer
2498 *
2499 * Checks the reset magic value written to the gart pointer in VRAM.
2500 * The driver calls this after a GPU reset to see if the contents of
2501 * VRAM is lost or not.
2502 * Returns true if vram is lost, false if not.
2503 */
06ec9070 2504static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
0c49e0b8 2505{
dadce777
EQ
2506 if (memcmp(adev->gart.ptr, adev->reset_magic,
2507 AMDGPU_RESET_MAGIC_NUM))
2508 return true;
2509
53b3f8f4 2510 if (!amdgpu_in_reset(adev))
dadce777
EQ
2511 return false;
2512
2513 /*
2514 * For all ASICs with baco/mode1 reset, the VRAM is
2515 * always assumed to be lost.
2516 */
2517 switch (amdgpu_asic_reset_method(adev)) {
2518 case AMD_RESET_METHOD_BACO:
2519 case AMD_RESET_METHOD_MODE1:
2520 return true;
2521 default:
2522 return false;
2523 }
0c49e0b8
CZ
2524}
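/*
 * Usage sketch (hypothetical, simplified from the real reset path): the magic
 * written by amdgpu_device_fill_reset_magic() before a reset is compared by
 * amdgpu_device_check_vram_lost() afterwards to decide whether buffer
 * contents must be restored from their shadows.
 */
static bool example_reset_and_check_vram(struct amdgpu_device *adev)
{
	amdgpu_device_fill_reset_magic(adev);
	/* ... the actual ASIC reset would happen here ... */
	return amdgpu_device_check_vram_lost(adev);
}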
2525
e3ecdffa 2526/**
1112a46b 2527 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
e3ecdffa
AD
2528 *
2529 * @adev: amdgpu_device pointer
b8b72130 2530 * @state: clockgating state (gate or ungate)
e3ecdffa 2531 *
e3ecdffa 2532 * The list of all the hardware IPs that make up the asic is walked and the
1112a46b
RZ
2533 * set_clockgating_state callbacks are run.
2534 * During late init this pass enables clockgating for the hardware IPs.
2535 * During fini or suspend it disables clockgating for the hardware IPs.
e3ecdffa
AD
2536 * Returns 0 on success, negative error code on failure.
2537 */
fdd34271 2538
5d89bb2d
LL
2539int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2540 enum amd_clockgating_state state)
d38ceaf9 2541{
1112a46b 2542 int i, j, r;
d38ceaf9 2543
4a2ba394
SL
2544 if (amdgpu_emu_mode == 1)
2545 return 0;
2546
1112a46b
RZ
2547 for (j = 0; j < adev->num_ip_blocks; j++) {
2548 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
a2d31dc3 2549 if (!adev->ip_blocks[i].status.late_initialized)
d38ceaf9 2550 continue;
5d70a549
PV
2551 /* skip CG for GFX on S0ix */
2552 if (adev->in_s0ix &&
2553 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)
2554 continue;
4a446d55 2555 /* skip CG for VCE/UVD, it's handled specially */
a1255107 2556 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
57716327 2557 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
34319b32 2558 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
52f2e779 2559 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
57716327 2560 adev->ip_blocks[i].version->funcs->set_clockgating_state) {
4a446d55 2561 /* enable clockgating to save power */
a1255107 2562 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
1112a46b 2563 state);
4a446d55
AD
2564 if (r) {
2565 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
a1255107 2566 adev->ip_blocks[i].version->funcs->name, r);
4a446d55
AD
2567 return r;
2568 }
b0b00ff1 2569 }
d38ceaf9 2570 }
06b18f61 2571
c9f96fd5
RZ
2572 return 0;
2573}
2574
5d89bb2d
LL
2575int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
2576 enum amd_powergating_state state)
c9f96fd5 2577{
1112a46b 2578 int i, j, r;
06b18f61 2579
c9f96fd5
RZ
2580 if (amdgpu_emu_mode == 1)
2581 return 0;
2582
1112a46b
RZ
2583 for (j = 0; j < adev->num_ip_blocks; j++) {
2584 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
a2d31dc3 2585 if (!adev->ip_blocks[i].status.late_initialized)
c9f96fd5 2586 continue;
5d70a549
PV
2587 /* skip PG for GFX on S0ix */
2588 if (adev->in_s0ix &&
2589 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)
2590 continue;
c9f96fd5
RZ
2591 /* skip CG for VCE/UVD, it's handled specially */
2592 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2593 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2594 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
52f2e779 2595 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
c9f96fd5
RZ
2596 adev->ip_blocks[i].version->funcs->set_powergating_state) {
2597 /* enable powergating to save power */
2598 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
1112a46b 2599 state);
c9f96fd5
RZ
2600 if (r) {
2601 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2602 adev->ip_blocks[i].version->funcs->name, r);
2603 return r;
2604 }
2605 }
2606 }
2dc80b00
S
2607 return 0;
2608}
2609
beff74bc
AD
2610static int amdgpu_device_enable_mgpu_fan_boost(void)
2611{
2612 struct amdgpu_gpu_instance *gpu_ins;
2613 struct amdgpu_device *adev;
2614 int i, ret = 0;
2615
2616 mutex_lock(&mgpu_info.mutex);
2617
2618 /*
2619 * MGPU fan boost feature should be enabled
2620 * only when there are two or more dGPUs in
2621 * the system
2622 */
2623 if (mgpu_info.num_dgpu < 2)
2624 goto out;
2625
2626 for (i = 0; i < mgpu_info.num_dgpu; i++) {
2627 gpu_ins = &(mgpu_info.gpu_ins[i]);
2628 adev = gpu_ins->adev;
2629 if (!(adev->flags & AMD_IS_APU) &&
f10bb940 2630 !gpu_ins->mgpu_fan_enabled) {
beff74bc
AD
2631 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2632 if (ret)
2633 break;
2634
2635 gpu_ins->mgpu_fan_enabled = 1;
2636 }
2637 }
2638
2639out:
2640 mutex_unlock(&mgpu_info.mutex);
2641
2642 return ret;
2643}
2644
e3ecdffa
AD
2645/**
2646 * amdgpu_device_ip_late_init - run late init for hardware IPs
2647 *
2648 * @adev: amdgpu_device pointer
2649 *
2650 * Late initialization pass for hardware IPs. The list of all the hardware
2651 * IPs that make up the asic is walked and the late_init callbacks are run.
2652 * late_init covers any special initialization that an IP requires
2653 * after all of them have been initialized or something that needs to happen
2654 * late in the init process.
2655 * Returns 0 on success, negative error code on failure.
2656 */
06ec9070 2657static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2dc80b00 2658{
60599a03 2659 struct amdgpu_gpu_instance *gpu_instance;
2dc80b00
S
2660 int i = 0, r;
2661
2662 for (i = 0; i < adev->num_ip_blocks; i++) {
73f847db 2663 if (!adev->ip_blocks[i].status.hw)
2dc80b00
S
2664 continue;
2665 if (adev->ip_blocks[i].version->funcs->late_init) {
2666 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2667 if (r) {
2668 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2669 adev->ip_blocks[i].version->funcs->name, r);
2670 return r;
2671 }
2dc80b00 2672 }
73f847db 2673 adev->ip_blocks[i].status.late_initialized = true;
2dc80b00
S
2674 }
2675
a891d239
DL
2676 amdgpu_ras_set_error_query_ready(adev, true);
2677
1112a46b
RZ
2678 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2679 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
916ac57f 2680
06ec9070 2681 amdgpu_device_fill_reset_magic(adev);
d38ceaf9 2682
beff74bc
AD
2683 r = amdgpu_device_enable_mgpu_fan_boost();
2684 if (r)
2685 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2686
4da8b639 2687 /* For passthrough configuration on arcturus and aldebaran, enable special handling of SBR */
2688 if (amdgpu_passthrough(adev) && ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
2689 adev->asic_type == CHIP_ALDEBARAN))
2690 smu_handle_passthrough_sbr(&adev->smu, true);
60599a03
EQ
2691
2692 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2693 mutex_lock(&mgpu_info.mutex);
2694
2695 /*
2696 * Reset device p-state to low as this was booted with high.
2697 *
2698 * This should be performed only after all devices from the same
2699 * hive get initialized.
2700 *
2701 * However, the number of devices in a hive is not known in advance;
2702 * it is counted one by one as the devices are initialized.
2703 *
2704 * So, we wait for all XGMI interlinked devices initialized.
2705 * This may bring some delays as those devices may come from
2706 * different hives. But that should be OK.
2707 */
2708 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2709 for (i = 0; i < mgpu_info.num_gpu; i++) {
2710 gpu_instance = &(mgpu_info.gpu_ins[i]);
2711 if (gpu_instance->adev->flags & AMD_IS_APU)
2712 continue;
2713
d84a430d
JK
2714 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2715 AMDGPU_XGMI_PSTATE_MIN);
60599a03
EQ
2716 if (r) {
2717 DRM_ERROR("pstate setting failed (%d).\n", r);
2718 break;
2719 }
2720 }
2721 }
2722
2723 mutex_unlock(&mgpu_info.mutex);
2724 }
2725
d38ceaf9
AD
2726 return 0;
2727}
2728
613aa3ea
LY
2729/**
2730 * amdgpu_device_smu_fini_early - smu hw_fini wrapper
2731 *
2732 * @adev: amdgpu_device pointer
2733 *
2734 * For ASICs that need to disable the SMC first
2735 */
2736static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
2737{
2738 int i, r;
2739
2740 if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))
2741 return;
2742
2743 for (i = 0; i < adev->num_ip_blocks; i++) {
2744 if (!adev->ip_blocks[i].status.hw)
2745 continue;
2746 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2747 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2748 /* XXX handle errors */
2749 if (r) {
2750 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2751 adev->ip_blocks[i].version->funcs->name, r);
2752 }
2753 adev->ip_blocks[i].status.hw = false;
2754 break;
2755 }
2756 }
2757}
2758
e9669fb7 2759static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
d38ceaf9
AD
2760{
2761 int i, r;
2762
e9669fb7
AG
2763 for (i = 0; i < adev->num_ip_blocks; i++) {
2764 if (!adev->ip_blocks[i].version->funcs->early_fini)
2765 continue;
5278a159 2766
e9669fb7
AG
2767 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev);
2768 if (r) {
2769 DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
2770 adev->ip_blocks[i].version->funcs->name, r);
2771 }
2772 }
c030f2e4 2773
e9669fb7 2774 amdgpu_amdkfd_suspend(adev, false);
a82400b5 2775
05df1f01 2776 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
fdd34271
RZ
2777 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2778
613aa3ea
LY
2779 /* Workaround for ASICs that need to disable SMC first */
2780 amdgpu_device_smu_fini_early(adev);
3e96dbfd 2781
d38ceaf9 2782 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
a1255107 2783 if (!adev->ip_blocks[i].status.hw)
d38ceaf9 2784 continue;
8201a67a 2785
a1255107 2786 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
d38ceaf9 2787 /* XXX handle errors */
2c1a2784 2788 if (r) {
a1255107
AD
2789 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2790 adev->ip_blocks[i].version->funcs->name, r);
2c1a2784 2791 }
8201a67a 2792
a1255107 2793 adev->ip_blocks[i].status.hw = false;
d38ceaf9
AD
2794 }
2795
6effad8a
GC
2796 if (amdgpu_sriov_vf(adev)) {
2797 if (amdgpu_virt_release_full_gpu(adev, false))
2798 DRM_ERROR("failed to release exclusive mode on fini\n");
2799 }
2800
e9669fb7
AG
2801 return 0;
2802}
2803
2804/**
2805 * amdgpu_device_ip_fini - run fini for hardware IPs
2806 *
2807 * @adev: amdgpu_device pointer
2808 *
2809 * Main teardown pass for hardware IPs. The list of all the hardware
2810 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2811 * are run. hw_fini tears down the hardware associated with each IP
2812 * and sw_fini tears down any software state associated with each IP.
2813 * Returns 0 on success, negative error code on failure.
2814 */
2815static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2816{
2817 int i, r;
2818
2819 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2820 amdgpu_virt_release_ras_err_handler_data(adev);
2821
e9669fb7
AG
2822 if (adev->gmc.xgmi.num_physical_nodes > 1)
2823 amdgpu_xgmi_remove_device(adev);
2824
2825 amdgpu_amdkfd_device_fini_sw(adev);
9950cda2 2826
d38ceaf9 2827 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
a1255107 2828 if (!adev->ip_blocks[i].status.sw)
d38ceaf9 2829 continue;
c12aba3a
ML
2830
2831 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
c8963ea4 2832 amdgpu_ucode_free_bo(adev);
1e256e27 2833 amdgpu_free_static_csa(&adev->virt.csa_obj);
c12aba3a
ML
2834 amdgpu_device_wb_fini(adev);
2835 amdgpu_device_vram_scratch_fini(adev);
533aed27 2836 amdgpu_ib_pool_fini(adev);
c12aba3a
ML
2837 }
2838
a1255107 2839 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
d38ceaf9 2840 /* XXX handle errors */
2c1a2784 2841 if (r) {
a1255107
AD
2842 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2843 adev->ip_blocks[i].version->funcs->name, r);
2c1a2784 2844 }
a1255107
AD
2845 adev->ip_blocks[i].status.sw = false;
2846 adev->ip_blocks[i].status.valid = false;
d38ceaf9
AD
2847 }
2848
a6dcfd9c 2849 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
a1255107 2850 if (!adev->ip_blocks[i].status.late_initialized)
8a2eef1d 2851 continue;
a1255107
AD
2852 if (adev->ip_blocks[i].version->funcs->late_fini)
2853 adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2854 adev->ip_blocks[i].status.late_initialized = false;
a6dcfd9c
ML
2855 }
2856
c030f2e4 2857 amdgpu_ras_fini(adev);
2858
d38ceaf9
AD
2859 return 0;
2860}
2861
e3ecdffa 2862/**
beff74bc 2863 * amdgpu_device_delayed_init_work_handler - work handler for IB tests
e3ecdffa 2864 *
1112a46b 2865 * @work: work_struct.
e3ecdffa 2866 */
beff74bc 2867static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2dc80b00
S
2868{
2869 struct amdgpu_device *adev =
beff74bc 2870 container_of(work, struct amdgpu_device, delayed_init_work.work);
916ac57f
RZ
2871 int r;
2872
2873 r = amdgpu_ib_ring_tests(adev);
2874 if (r)
2875 DRM_ERROR("ib ring test failed (%d).\n", r);
2dc80b00
S
2876}
2877
1e317b99
RZ
2878static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2879{
2880 struct amdgpu_device *adev =
2881 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2882
90a92662
MD
2883 WARN_ON_ONCE(adev->gfx.gfx_off_state);
2884 WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
2885
2886 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2887 adev->gfx.gfx_off_state = true;
1e317b99
RZ
2888}
2889
e3ecdffa 2890/**
e7854a03 2891 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
e3ecdffa
AD
2892 *
2893 * @adev: amdgpu_device pointer
2894 *
2895 * Main suspend function for hardware IPs. The list of all the hardware
2896 * IPs that make up the asic is walked, clockgating is disabled and the
2897 * suspend callbacks are run. suspend puts the hardware and software state
2898 * in each IP into a state suitable for suspend.
2899 * Returns 0 on success, negative error code on failure.
2900 */
e7854a03
AD
2901static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2902{
2903 int i, r;
2904
50ec83f0
AD
2905 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2906 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
05df1f01 2907
e7854a03
AD
2908 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2909 if (!adev->ip_blocks[i].status.valid)
2910 continue;
2b9f7848 2911
e7854a03 2912 /* displays are handled separately */
2b9f7848
ND
2913 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2914 continue;
2915
2916 /* XXX handle errors */
2917 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2918 /* XXX handle errors */
2919 if (r) {
2920 DRM_ERROR("suspend of IP block <%s> failed %d\n",
2921 adev->ip_blocks[i].version->funcs->name, r);
2922 return r;
e7854a03 2923 }
2b9f7848
ND
2924
2925 adev->ip_blocks[i].status.hw = false;
e7854a03
AD
2926 }
2927
e7854a03
AD
2928 return 0;
2929}
2930
2931/**
2932 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2933 *
2934 * @adev: amdgpu_device pointer
2935 *
2936 * Main suspend function for hardware IPs. The list of all the hardware
2937 * IPs that make up the asic is walked, clockgating is disabled and the
2938 * suspend callbacks are run. suspend puts the hardware and software state
2939 * in each IP into a state suitable for suspend.
2940 * Returns 0 on success, negative error code on failure.
2941 */
2942static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
d38ceaf9
AD
2943{
2944 int i, r;
2945
557f42a2 2946 if (adev->in_s0ix)
34416931 2947 amdgpu_gfx_state_change_set(adev, sGpuChangeState_D3Entry);
34416931 2948
d38ceaf9 2949 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
a1255107 2950 if (!adev->ip_blocks[i].status.valid)
d38ceaf9 2951 continue;
e7854a03
AD
2952 /* displays are handled in phase1 */
2953 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2954 continue;
bff77e86
LM
2955 /* PSP lost connection when err_event_athub occurs */
2956 if (amdgpu_ras_intr_triggered() &&
2957 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2958 adev->ip_blocks[i].status.hw = false;
2959 continue;
2960 }
e3c1b071 2961
2962 /* skip unnecessary suspend if we do not initialize them yet */
2963 if (adev->gmc.xgmi.pending_reset &&
2964 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2965 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
2966 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2967 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
2968 adev->ip_blocks[i].status.hw = false;
2969 continue;
2970 }
557f42a2 2971
32ff160d
AD
2972 /* skip suspend of gfx and psp for S0ix
2973 * gfx is in gfxoff state, so on resume it will exit gfxoff just
2974 * like at runtime. PSP is also part of the always on hardware
2975 * so no need to suspend it.
2976 */
557f42a2 2977 if (adev->in_s0ix &&
32ff160d
AD
2978 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
2979 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX))
557f42a2
AD
2980 continue;
2981
d38ceaf9 2982 /* XXX handle errors */
a1255107 2983 r = adev->ip_blocks[i].version->funcs->suspend(adev);
d38ceaf9 2984 /* XXX handle errors */
2c1a2784 2985 if (r) {
a1255107
AD
2986 DRM_ERROR("suspend of IP block <%s> failed %d\n",
2987 adev->ip_blocks[i].version->funcs->name, r);
2c1a2784 2988 }
876923fb 2989 adev->ip_blocks[i].status.hw = false;
a3a09142 2990 /* handle putting the SMC in the appropriate state */
86b93fd6
JZ
2991 if (!amdgpu_sriov_vf(adev)) {
2992 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2993 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
2994 if (r) {
2995 DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
2996 adev->mp1_state, r);
2997 return r;
2998 }
a3a09142
AD
2999 }
3000 }
d38ceaf9
AD
3001 }
3002
3003 return 0;
3004}
3005
e7854a03
AD
3006/**
3007 * amdgpu_device_ip_suspend - run suspend for hardware IPs
3008 *
3009 * @adev: amdgpu_device pointer
3010 *
3011 * Main suspend function for hardware IPs. The list of all the hardware
3012 * IPs that make up the asic is walked, clockgating is disabled and the
3013 * suspend callbacks are run. suspend puts the hardware and software state
3014 * in each IP into a state suitable for suspend.
3015 * Returns 0 on success, negative error code on failure.
3016 */
3017int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
3018{
3019 int r;
3020
3c73683c
JC
3021 if (amdgpu_sriov_vf(adev)) {
3022 amdgpu_virt_fini_data_exchange(adev);
e7819644 3023 amdgpu_virt_request_full_gpu(adev, false);
3c73683c 3024 }
e7819644 3025
e7854a03
AD
3026 r = amdgpu_device_ip_suspend_phase1(adev);
3027 if (r)
3028 return r;
3029 r = amdgpu_device_ip_suspend_phase2(adev);
3030
e7819644
YT
3031 if (amdgpu_sriov_vf(adev))
3032 amdgpu_virt_release_full_gpu(adev, false);
3033
e7854a03
AD
3034 return r;
3035}
3036
06ec9070 3037static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
a90ad3c2
ML
3038{
3039 int i, r;
3040
2cb681b6
ML
3041 static enum amd_ip_block_type ip_order[] = {
3042 AMD_IP_BLOCK_TYPE_GMC,
3043 AMD_IP_BLOCK_TYPE_COMMON,
39186aef 3044 AMD_IP_BLOCK_TYPE_PSP,
2cb681b6
ML
3045 AMD_IP_BLOCK_TYPE_IH,
3046 };
a90ad3c2 3047
95ea3dbc 3048 for (i = 0; i < adev->num_ip_blocks; i++) {
2cb681b6
ML
3049 int j;
3050 struct amdgpu_ip_block *block;
a90ad3c2 3051
4cd2a96d
J
3052 block = &adev->ip_blocks[i];
3053 block->status.hw = false;
2cb681b6 3054
4cd2a96d 3055 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
2cb681b6 3056
4cd2a96d 3057 if (block->version->type != ip_order[j] ||
2cb681b6
ML
3058 !block->status.valid)
3059 continue;
3060
3061 r = block->version->funcs->hw_init(adev);
0aaeefcc 3062 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
c41d1cf6
ML
3063 if (r)
3064 return r;
482f0e53 3065 block->status.hw = true;
a90ad3c2
ML
3066 }
3067 }
3068
3069 return 0;
3070}
3071
06ec9070 3072static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
a90ad3c2
ML
3073{
3074 int i, r;
3075
2cb681b6
ML
3076 static enum amd_ip_block_type ip_order[] = {
3077 AMD_IP_BLOCK_TYPE_SMC,
3078 AMD_IP_BLOCK_TYPE_DCE,
3079 AMD_IP_BLOCK_TYPE_GFX,
3080 AMD_IP_BLOCK_TYPE_SDMA,
257deb8c 3081 AMD_IP_BLOCK_TYPE_UVD,
d83c7a07
JJ
3082 AMD_IP_BLOCK_TYPE_VCE,
3083 AMD_IP_BLOCK_TYPE_VCN
2cb681b6 3084 };
a90ad3c2 3085
2cb681b6
ML
3086 for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
3087 int j;
3088 struct amdgpu_ip_block *block;
a90ad3c2 3089
2cb681b6
ML
3090 for (j = 0; j < adev->num_ip_blocks; j++) {
3091 block = &adev->ip_blocks[j];
3092
3093 if (block->version->type != ip_order[i] ||
482f0e53
ML
3094 !block->status.valid ||
3095 block->status.hw)
2cb681b6
ML
3096 continue;
3097
895bd048
JZ
3098 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
3099 r = block->version->funcs->resume(adev);
3100 else
3101 r = block->version->funcs->hw_init(adev);
3102
0aaeefcc 3103 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
c41d1cf6
ML
3104 if (r)
3105 return r;
482f0e53 3106 block->status.hw = true;
a90ad3c2
ML
3107 }
3108 }
3109
3110 return 0;
3111}
3112
e3ecdffa
AD
3113/**
3114 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
3115 *
3116 * @adev: amdgpu_device pointer
3117 *
3118 * First resume function for hardware IPs. The list of all the hardware
3119 * IPs that make up the asic is walked and the resume callbacks are run for
3120 * COMMON, GMC, and IH. resume puts the hardware into a functional state
3121 * after a suspend and updates the software state as necessary. This
3122 * function is also used for restoring the GPU after a GPU reset.
3123 * Returns 0 on success, negative error code on failure.
3124 */
06ec9070 3125static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
d38ceaf9
AD
3126{
3127 int i, r;
3128
a90ad3c2 3129 for (i = 0; i < adev->num_ip_blocks; i++) {
482f0e53 3130 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
a90ad3c2 3131 continue;
a90ad3c2 3132 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
e3ecdffa
AD
3133 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3134 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
482f0e53 3135
fcf0649f
CZ
3136 r = adev->ip_blocks[i].version->funcs->resume(adev);
3137 if (r) {
3138 DRM_ERROR("resume of IP block <%s> failed %d\n",
3139 adev->ip_blocks[i].version->funcs->name, r);
3140 return r;
3141 }
482f0e53 3142 adev->ip_blocks[i].status.hw = true;
a90ad3c2
ML
3143 }
3144 }
3145
3146 return 0;
3147}
3148
e3ecdffa
AD
3149/**
3150 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3151 *
3152 * @adev: amdgpu_device pointer
3153 *
3154 * Second resume function for hardware IPs. The list of all the hardware
3155 * IPs that make up the asic is walked and the resume callbacks are run for
3156 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a
3157 * functional state after a suspend and updates the software state as
3158 * necessary. This function is also used for restoring the GPU after a GPU
3159 * reset.
3160 * Returns 0 on success, negative error code on failure.
3161 */
06ec9070 3162static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
d38ceaf9
AD
3163{
3164 int i, r;
3165
3166 for (i = 0; i < adev->num_ip_blocks; i++) {
482f0e53 3167 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
d38ceaf9 3168 continue;
fcf0649f 3169 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
e3ecdffa 3170 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
7a3e0bb2
RZ
3171 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3172 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
fcf0649f 3173 continue;
a1255107 3174 r = adev->ip_blocks[i].version->funcs->resume(adev);
2c1a2784 3175 if (r) {
a1255107
AD
3176 DRM_ERROR("resume of IP block <%s> failed %d\n",
3177 adev->ip_blocks[i].version->funcs->name, r);
d38ceaf9 3178 return r;
2c1a2784 3179 }
482f0e53 3180 adev->ip_blocks[i].status.hw = true;
d38ceaf9
AD
3181 }
3182
3183 return 0;
3184}
3185
e3ecdffa
AD
3186/**
3187 * amdgpu_device_ip_resume - run resume for hardware IPs
3188 *
3189 * @adev: amdgpu_device pointer
3190 *
3191 * Main resume function for hardware IPs. The hardware IPs
3192 * are split into two resume functions because they are
3193 * also used in recovering from a GPU reset and some additional
3194 * steps need to be taken between them. In this case (S3/S4) they are
3195 * run sequentially.
3196 * Returns 0 on success, negative error code on failure.
3197 */
06ec9070 3198static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
fcf0649f
CZ
3199{
3200 int r;
3201
9cec53c1
JZ
3202 r = amdgpu_amdkfd_resume_iommu(adev);
3203 if (r)
3204 return r;
3205
06ec9070 3206 r = amdgpu_device_ip_resume_phase1(adev);
fcf0649f
CZ
3207 if (r)
3208 return r;
7a3e0bb2
RZ
3209
3210 r = amdgpu_device_fw_loading(adev);
3211 if (r)
3212 return r;
3213
06ec9070 3214 r = amdgpu_device_ip_resume_phase2(adev);
fcf0649f
CZ
3215
3216 return r;
3217}
3218
e3ecdffa
AD
3219/**
3220 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
3221 *
3222 * @adev: amdgpu_device pointer
3223 *
3224 * Query the VBIOS data tables to determine if the board supports SR-IOV.
3225 */
4e99a44e 3226static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
048765ad 3227{
6867e1b5
ML
3228 if (amdgpu_sriov_vf(adev)) {
3229 if (adev->is_atom_fw) {
58ff791a 3230 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
6867e1b5
ML
3231 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3232 } else {
3233 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3234 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3235 }
3236
3237 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3238 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
a5bde2f9 3239 }
048765ad
AR
3240}
3241
e3ecdffa
AD
3242/**
3243 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3244 *
3245 * @asic_type: AMD asic type
3246 *
3247 * Check if there is DC (new modesetting infrastructure) support for an asic.
3248 * Returns true if DC has support, false if not.
3249 */
4562236b
HW
3250bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3251{
3252 switch (asic_type) {
0637d417
AD
3253#ifdef CONFIG_DRM_AMDGPU_SI
3254 case CHIP_HAINAN:
3255#endif
3256 case CHIP_TOPAZ:
3257 /* chips with no display hardware */
3258 return false;
4562236b 3259#if defined(CONFIG_DRM_AMD_DC)
64200c46
MR
3260 case CHIP_TAHITI:
3261 case CHIP_PITCAIRN:
3262 case CHIP_VERDE:
3263 case CHIP_OLAND:
2d32ffd6
AD
3264 /*
3265 * We have systems in the wild with these ASICs that require
3266 * LVDS and VGA support which is not supported with DC.
3267 *
3268 * Fallback to the non-DC driver here by default so as not to
3269 * cause regressions.
3270 */
3271#if defined(CONFIG_DRM_AMD_DC_SI)
3272 return amdgpu_dc > 0;
3273#else
3274 return false;
64200c46 3275#endif
4562236b 3276 case CHIP_BONAIRE:
0d6fbccb 3277 case CHIP_KAVERI:
367e6687
AD
3278 case CHIP_KABINI:
3279 case CHIP_MULLINS:
d9fda248
HW
3280 /*
3281 * We have systems in the wild with these ASICs that require
3282 * LVDS and VGA support which is not supported with DC.
3283 *
3284 * Fallback to the non-DC driver here by default so as not to
3285 * cause regressions.
3286 */
3287 return amdgpu_dc > 0;
3288 case CHIP_HAWAII:
4562236b
HW
3289 case CHIP_CARRIZO:
3290 case CHIP_STONEY:
4562236b 3291 case CHIP_POLARIS10:
675fd32b 3292 case CHIP_POLARIS11:
2c8ad2d5 3293 case CHIP_POLARIS12:
675fd32b 3294 case CHIP_VEGAM:
4562236b
HW
3295 case CHIP_TONGA:
3296 case CHIP_FIJI:
42f8ffa1 3297 case CHIP_VEGA10:
dca7b401 3298 case CHIP_VEGA12:
c6034aa2 3299 case CHIP_VEGA20:
b86a1aa3 3300#if defined(CONFIG_DRM_AMD_DC_DCN)
fd187853 3301 case CHIP_RAVEN:
b4f199c7 3302 case CHIP_NAVI10:
8fceceb6 3303 case CHIP_NAVI14:
078655d9 3304 case CHIP_NAVI12:
e1c14c43 3305 case CHIP_RENOIR:
3f68c01b 3306 case CHIP_CYAN_SKILLFISH:
81d9bfb8 3307 case CHIP_SIENNA_CICHLID:
a6c5308f 3308 case CHIP_NAVY_FLOUNDER:
7cc656e2 3309 case CHIP_DIMGREY_CAVEFISH:
ddaed58b 3310 case CHIP_BEIGE_GOBY:
84b934bc 3311 case CHIP_VANGOGH:
c8b73f7f 3312 case CHIP_YELLOW_CARP:
42f8ffa1 3313#endif
f7f12b25 3314 default:
fd187853 3315 return amdgpu_dc != 0;
f7f12b25 3316#else
4562236b 3317 default:
93b09a9a 3318 if (amdgpu_dc > 0)
044a48f4 3319 DRM_INFO_ONCE("Display Core has been requested via kernel parameter "
93b09a9a 3320 "but isn't supported by ASIC, ignoring\n");
4562236b 3321 return false;
f7f12b25 3322#endif
4562236b
HW
3323 }
3324}
3325
3326/**
3327 * amdgpu_device_has_dc_support - check if dc is supported
3328 *
982a820b 3329 * @adev: amdgpu_device pointer
4562236b
HW
3330 *
3331 * Returns true for supported, false for not supported
3332 */
3333bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3334{
abaf210c
AS
3335 if (amdgpu_sriov_vf(adev) ||
3336 adev->enable_virtual_display ||
3337 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
2555039d
XY
3338 return false;
3339
4562236b
HW
3340 return amdgpu_device_asic_has_dc_support(adev->asic_type);
3341}
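/*
 * Usage sketch: the per-ASIC IP-list code picks the display stack based on
 * this helper. Both block pointers below are parameters because the real
 * block names (dm_ip_block for DC, the dce_v* blocks for the legacy path)
 * may not be visible from this file; the selection shown is illustrative.
 */
static int example_add_display_block(struct amdgpu_device *adev,
				     const struct amdgpu_ip_block_version *dc_block,
				     const struct amdgpu_ip_block_version *legacy_block)
{
	if (amdgpu_device_has_dc_support(adev))
		return amdgpu_device_ip_block_add(adev, dc_block);

	return amdgpu_device_ip_block_add(adev, legacy_block);
}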
3342
d4535e2c
AG
3343static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3344{
3345 struct amdgpu_device *adev =
3346 container_of(__work, struct amdgpu_device, xgmi_reset_work);
d95e8e97 3347 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
d4535e2c 3348
c6a6e2db
AG
3349 /* It's a bug to not have a hive within this function */
3350 if (WARN_ON(!hive))
3351 return;
3352
3353 /*
3354 * Use task barrier to synchronize all xgmi reset works across the
3355 * hive. task_barrier_enter and task_barrier_exit will block
3356 * until all the threads running the xgmi reset works reach
3357 * those points. task_barrier_full will do both blocks.
3358 */
3359 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3360
3361 task_barrier_enter(&hive->tb);
4a580877 3362 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
c6a6e2db
AG
3363
3364 if (adev->asic_reset_res)
3365 goto fail;
3366
3367 task_barrier_exit(&hive->tb);
4a580877 3368 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
c6a6e2db
AG
3369
3370 if (adev->asic_reset_res)
3371 goto fail;
43c4d576 3372
8bc7b360
HZ
3373 if (adev->mmhub.ras_funcs &&
3374 adev->mmhub.ras_funcs->reset_ras_error_count)
3375 adev->mmhub.ras_funcs->reset_ras_error_count(adev);
c6a6e2db
AG
3376 } else {
3377
3378 task_barrier_full(&hive->tb);
3379 adev->asic_reset_res = amdgpu_asic_reset(adev);
3380 }
ce316fa5 3381
c6a6e2db 3382fail:
d4535e2c 3383 if (adev->asic_reset_res)
fed184e9 3384 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
4a580877 3385 adev->asic_reset_res, adev_to_drm(adev)->unique);
d95e8e97 3386 amdgpu_put_xgmi_hive(hive);
d4535e2c
AG
3387}
3388
71f98027
AD
3389static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3390{
3391 char *input = amdgpu_lockup_timeout;
3392 char *timeout_setting = NULL;
3393 int index = 0;
3394 long timeout;
3395 int ret = 0;
3396
3397 /*
67387dfe
AD
3398 * By default the timeout for non-compute jobs is 10000 ms
3399 * and 60000 ms for compute jobs.
71f98027 3400 * In SR-IOV or passthrough mode, the timeout for compute
b7b2a316 3401 * jobs is 60000 ms by default.
71f98027
AD
3402 */
3403 adev->gfx_timeout = msecs_to_jiffies(10000);
3404 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
9882e278
ED
3405 if (amdgpu_sriov_vf(adev))
3406 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
3407 msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
71f98027 3408 else
67387dfe 3409 adev->compute_timeout = msecs_to_jiffies(60000);
71f98027 3410
f440ff44 3411 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
71f98027 3412 while ((timeout_setting = strsep(&input, ",")) &&
f440ff44 3413 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
71f98027
AD
3414 ret = kstrtol(timeout_setting, 0, &timeout);
3415 if (ret)
3416 return ret;
3417
3418 if (timeout == 0) {
3419 index++;
3420 continue;
3421 } else if (timeout < 0) {
3422 timeout = MAX_SCHEDULE_TIMEOUT;
127aedf9
CK
3423 dev_warn(adev->dev, "lockup timeout disabled");
3424 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
71f98027
AD
3425 } else {
3426 timeout = msecs_to_jiffies(timeout);
3427 }
3428
3429 switch (index++) {
3430 case 0:
3431 adev->gfx_timeout = timeout;
3432 break;
3433 case 1:
3434 adev->compute_timeout = timeout;
3435 break;
3436 case 2:
3437 adev->sdma_timeout = timeout;
3438 break;
3439 case 3:
3440 adev->video_timeout = timeout;
3441 break;
3442 default:
3443 break;
3444 }
3445 }
3446 /*
3447 * There is only one value specified and
3448 * it should apply to all non-compute jobs.
3449 */
bcccee89 3450 if (index == 1) {
71f98027 3451 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
bcccee89
ED
3452 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3453 adev->compute_timeout = adev->gfx_timeout;
3454 }
71f98027
AD
3455 }
3456
3457 return ret;
3458}
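/*
 * Usage sketch for the amdgpu.lockup_timeout parameter parsed above; the
 * comma separated values are consumed in the order gfx, compute, sdma,
 * video and are given in milliseconds, e.g.:
 *
 *   amdgpu.lockup_timeout=10000,60000,10000,10000
 *
 * A single value applies to all non-compute queues (and to compute as
 * well under SR-IOV or passthrough), 0 keeps the built-in default for
 * that position, and a negative value disables the timeout entirely
 * (MAX_SCHEDULE_TIMEOUT) while tainting the kernel with TAINT_SOFTLOCKUP.
 */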
d4535e2c 3459
4a74c38c
PY
3460/**
3461 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU
3462 *
3463 * @adev: amdgpu_device pointer
3464 *
3465 * RAM is direct mapped to the GPU if the IOMMU is not enabled or is in pass-through mode
3466 */
3467static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
3468{
3469 struct iommu_domain *domain;
3470
3471 domain = iommu_get_domain_for_dev(adev->dev);
3472 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
3473 adev->ram_is_direct_mapped = true;
3474}
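/*
 * IOMMU_DOMAIN_IDENTITY is the pass-through case: DMA addresses equal
 * physical addresses, so system RAM is effectively direct mapped for the
 * device; having no domain at all (no IOMMU translation) is equivalent
 * for this purpose, hence both cases set ram_is_direct_mapped above.
 */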
3475
77f3a5cd
ND
3476static const struct attribute *amdgpu_dev_attributes[] = {
3477 &dev_attr_product_name.attr,
3478 &dev_attr_product_number.attr,
3479 &dev_attr_serial_number.attr,
3480 &dev_attr_pcie_replay_count.attr,
3481 NULL
3482};
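/*
 * These attributes are registered on the PCI device's kobject further
 * down in amdgpu_device_init(), so with a typical sysfs layout they are
 * expected to appear as, for example:
 *
 *   /sys/bus/pci/devices/<domain:bus:dev.fn>/product_name
 *   /sys/bus/pci/devices/<domain:bus:dev.fn>/serial_number
 *   /sys/bus/pci/devices/<domain:bus:dev.fn>/pcie_replay_count
 */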
3483
d38ceaf9
AD
3484/**
3485 * amdgpu_device_init - initialize the driver
3486 *
3487 * @adev: amdgpu_device pointer
d38ceaf9
AD
3488 * @flags: driver flags
3489 *
3490 * Initializes the driver info and hw (all asics).
3491 * Returns 0 for success or an error on failure.
3492 * Called at driver startup.
3493 */
3494int amdgpu_device_init(struct amdgpu_device *adev,
d38ceaf9
AD
3495 uint32_t flags)
3496{
8aba21b7
LT
3497 struct drm_device *ddev = adev_to_drm(adev);
3498 struct pci_dev *pdev = adev->pdev;
d38ceaf9 3499 int r, i;
b98c6299 3500 bool px = false;
95844d20 3501 u32 max_MBps;
d38ceaf9
AD
3502
3503 adev->shutdown = false;
d38ceaf9 3504 adev->flags = flags;
4e66d7d2
YZ
3505
3506 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3507 adev->asic_type = amdgpu_force_asic_type;
3508 else
3509 adev->asic_type = flags & AMD_ASIC_MASK;
3510
d38ceaf9 3511 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
593aa2d2 3512 if (amdgpu_emu_mode == 1)
8bdab6bb 3513 adev->usec_timeout *= 10;
770d13b1 3514 adev->gmc.gart_size = 512 * 1024 * 1024;
d38ceaf9
AD
3515 adev->accel_working = false;
3516 adev->num_rings = 0;
3517 adev->mman.buffer_funcs = NULL;
3518 adev->mman.buffer_funcs_ring = NULL;
3519 adev->vm_manager.vm_pte_funcs = NULL;
0c88b430 3520 adev->vm_manager.vm_pte_num_scheds = 0;
132f34e4 3521 adev->gmc.gmc_funcs = NULL;
7bd939d0 3522 adev->harvest_ip_mask = 0x0;
f54d1867 3523 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
b8866c26 3524 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
d38ceaf9
AD
3525
3526 adev->smc_rreg = &amdgpu_invalid_rreg;
3527 adev->smc_wreg = &amdgpu_invalid_wreg;
3528 adev->pcie_rreg = &amdgpu_invalid_rreg;
3529 adev->pcie_wreg = &amdgpu_invalid_wreg;
36b9a952
HR
3530 adev->pciep_rreg = &amdgpu_invalid_rreg;
3531 adev->pciep_wreg = &amdgpu_invalid_wreg;
4fa1c6a6
TZ
3532 adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3533 adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
d38ceaf9
AD
3534 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3535 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3536 adev->didt_rreg = &amdgpu_invalid_rreg;
3537 adev->didt_wreg = &amdgpu_invalid_wreg;
ccdbb20a
RZ
3538 adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3539 adev->gc_cac_wreg = &amdgpu_invalid_wreg;
d38ceaf9
AD
3540 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3541 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3542
3e39ab90
AD
3543 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3544 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3545 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
d38ceaf9
AD
3546
3547 /* mutex initializations are all done here so we
3548 * can recall functions without having locking issues */
0e5ca0d1 3549 mutex_init(&adev->firmware.mutex);
d38ceaf9
AD
3550 mutex_init(&adev->pm.mutex);
3551 mutex_init(&adev->gfx.gpu_clock_mutex);
3552 mutex_init(&adev->srbm_mutex);
b8866c26 3553 mutex_init(&adev->gfx.pipe_reserve_mutex);
d23ee13f 3554 mutex_init(&adev->gfx.gfx_off_mutex);
d38ceaf9 3555 mutex_init(&adev->grbm_idx_mutex);
d38ceaf9 3556 mutex_init(&adev->mn_lock);
e23b74aa 3557 mutex_init(&adev->virt.vf_errors.lock);
d38ceaf9 3558 hash_init(adev->mn_hash);
53b3f8f4 3559 atomic_set(&adev->in_gpu_reset, 0);
6049db43 3560 init_rwsem(&adev->reset_sem);
32eaeae0 3561 mutex_init(&adev->psp.mutex);
bd052211 3562 mutex_init(&adev->notifier_lock);
d38ceaf9 3563
4eaf21b7 3564 amdgpu_device_init_apu_flags(adev);
9f6a7857 3565
912dfc84
EQ
3566 r = amdgpu_device_check_arguments(adev);
3567 if (r)
3568 return r;
d38ceaf9 3569
d38ceaf9
AD
3570 spin_lock_init(&adev->mmio_idx_lock);
3571 spin_lock_init(&adev->smc_idx_lock);
3572 spin_lock_init(&adev->pcie_idx_lock);
3573 spin_lock_init(&adev->uvd_ctx_idx_lock);
3574 spin_lock_init(&adev->didt_idx_lock);
ccdbb20a 3575 spin_lock_init(&adev->gc_cac_idx_lock);
16abb5d2 3576 spin_lock_init(&adev->se_cac_idx_lock);
d38ceaf9 3577 spin_lock_init(&adev->audio_endpt_idx_lock);
95844d20 3578 spin_lock_init(&adev->mm_stats.lock);
d38ceaf9 3579
0c4e7fa5
CZ
3580 INIT_LIST_HEAD(&adev->shadow_list);
3581 mutex_init(&adev->shadow_list_lock);
3582
655ce9cb 3583 INIT_LIST_HEAD(&adev->reset_list);
3584
beff74bc
AD
3585 INIT_DELAYED_WORK(&adev->delayed_init_work,
3586 amdgpu_device_delayed_init_work_handler);
1e317b99
RZ
3587 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3588 amdgpu_device_delay_enable_gfx_off);
2dc80b00 3589
d4535e2c
AG
3590 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3591
d23ee13f 3592 adev->gfx.gfx_off_req_count = 1;
b6e79d9a 3593 adev->pm.ac_power = power_supply_is_system_supplied() > 0;
b1ddf548 3594
b265bdbd
EQ
3595 atomic_set(&adev->throttling_logging_enabled, 1);
3596 /*
3597 * If throttling continues, logging will be performed every minute
3598 * to avoid log flooding. "-1" is subtracted since the thermal
3599 * throttling interrupt comes every second. Thus, the total logging
3600 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3601 * for the throttling interrupt) = 60 seconds.
3602 */
3603 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3604 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3605
0fa49558
AX
3606 /* Registers mapping */
3607 /* TODO: block userspace mapping of io register */
da69c161
KW
3608 if (adev->asic_type >= CHIP_BONAIRE) {
3609 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3610 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3611 } else {
3612 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3613 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3614 }
d38ceaf9 3615
6c08e0ef
EQ
3616 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
3617 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
3618
d38ceaf9
AD
3619 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3620 if (adev->rmmio == NULL) {
3621 return -ENOMEM;
3622 }
3623 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3624 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3625
5494d864
AD
3626 amdgpu_device_get_pcie_info(adev);
3627
b239c017
JX
3628 if (amdgpu_mcbp)
3629 DRM_INFO("MCBP is enabled\n");
3630
5f84cc63
JX
3631 if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
3632 adev->enable_mes = true;
3633
3aa0115d
ML
3634 /* detect hw virtualization here */
3635 amdgpu_detect_virtualization(adev);
3636
dffa11b4
ML
3637 r = amdgpu_device_get_job_timeout_settings(adev);
3638 if (r) {
3639 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
4ef87d8f 3640 return r;
a190d1c7
XY
3641 }
3642
d38ceaf9 3643 /* early init functions */
06ec9070 3644 r = amdgpu_device_ip_early_init(adev);
d38ceaf9 3645 if (r)
4ef87d8f 3646 return r;
d38ceaf9 3647
4a0165f0
VS
3648 /* Need to get xgmi info early to decide the reset behavior */
3649 if (adev->gmc.xgmi.supported) {
3650 r = adev->gfxhub.funcs->get_xgmi_info(adev);
3651 if (r)
3652 return r;
3653 }
3654
8e6d0b69 3655 /* enable PCIE atomic ops */
3656 if (amdgpu_sriov_vf(adev))
3657 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
3658 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_enabled_flags ==
3659 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3660 else
3661 adev->have_atomics_support =
3662 !pci_enable_atomic_ops_to_root(adev->pdev,
3663 PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3664 PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3665 if (!adev->have_atomics_support)
3666 dev_info(adev->dev, "PCIE atomic ops are not supported\n");
3667
6585661d
OZ
3668 /* doorbell bar mapping and doorbell index init*/
3669 amdgpu_device_doorbell_init(adev);
3670
9475a943
SL
3671 if (amdgpu_emu_mode == 1) {
3672 /* post the asic on emulation mode */
3673 emu_soc_asic_init(adev);
bfca0289 3674 goto fence_driver_init;
9475a943 3675 }
bfca0289 3676
04442bf7
LL
3677 amdgpu_reset_init(adev);
3678
4e99a44e
ML
3679 /* detect if we are with an SRIOV vbios */
3680 amdgpu_device_detect_sriov_bios(adev);
048765ad 3681
95e8e59e
AD
3682 /* check if we need to reset the asic
3683 * E.g., driver was not cleanly unloaded previously, etc.
3684 */
f14899fd 3685 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
e3c1b071 3686 if (adev->gmc.xgmi.num_physical_nodes) {
3687 dev_info(adev->dev, "Pending hive reset.\n");
3688 adev->gmc.xgmi.pending_reset = true;
3689 /* Only need to init necessary blocks for SMU to handle the reset */
3690 for (i = 0; i < adev->num_ip_blocks; i++) {
3691 if (!adev->ip_blocks[i].status.valid)
3692 continue;
3693 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3694 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3695 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3696 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
751f43e7 3697 DRM_DEBUG("IP %s disabled for hw_init.\n",
e3c1b071 3698 adev->ip_blocks[i].version->funcs->name);
3699 adev->ip_blocks[i].status.hw = true;
3700 }
3701 }
3702 } else {
3703 r = amdgpu_asic_reset(adev);
3704 if (r) {
3705 dev_err(adev->dev, "asic reset on init failed\n");
3706 goto failed;
3707 }
95e8e59e
AD
3708 }
3709 }
3710
8f66090b 3711 pci_enable_pcie_error_reporting(adev->pdev);
c9a6b82f 3712
d38ceaf9 3713 /* Post card if necessary */
39c640c0 3714 if (amdgpu_device_need_post(adev)) {
d38ceaf9 3715 if (!adev->bios) {
bec86378 3716 dev_err(adev->dev, "no vBIOS found\n");
83ba126a
AD
3717 r = -EINVAL;
3718 goto failed;
d38ceaf9 3719 }
bec86378 3720 DRM_INFO("GPU posting now...\n");
4d2997ab 3721 r = amdgpu_device_asic_init(adev);
4e99a44e
ML
3722 if (r) {
3723 dev_err(adev->dev, "gpu post error!\n");
3724 goto failed;
3725 }
d38ceaf9
AD
3726 }
3727
88b64e95
AD
3728 if (adev->is_atom_fw) {
3729 /* Initialize clocks */
3730 r = amdgpu_atomfirmware_get_clock_info(adev);
3731 if (r) {
3732 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
e23b74aa 3733 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
88b64e95
AD
3734 goto failed;
3735 }
3736 } else {
a5bde2f9
AD
3737 /* Initialize clocks */
3738 r = amdgpu_atombios_get_clock_info(adev);
3739 if (r) {
3740 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
e23b74aa 3741 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
89041940 3742 goto failed;
a5bde2f9
AD
3743 }
3744 /* init i2c buses */
4562236b
HW
3745 if (!amdgpu_device_has_dc_support(adev))
3746 amdgpu_atombios_i2c_init(adev);
2c1a2784 3747 }
d38ceaf9 3748
bfca0289 3749fence_driver_init:
d38ceaf9 3750 /* Fence driver */
067f44c8 3751 r = amdgpu_fence_driver_sw_init(adev);
2c1a2784 3752 if (r) {
067f44c8 3753 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
e23b74aa 3754 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
83ba126a 3755 goto failed;
2c1a2784 3756 }
d38ceaf9
AD
3757
3758 /* init the mode config */
4a580877 3759 drm_mode_config_init(adev_to_drm(adev));
d38ceaf9 3760
06ec9070 3761 r = amdgpu_device_ip_init(adev);
d38ceaf9 3762 if (r) {
8840a387 3763 /* failed in exclusive mode due to timeout */
3764 if (amdgpu_sriov_vf(adev) &&
3765 !amdgpu_sriov_runtime(adev) &&
3766 amdgpu_virt_mmio_blocked(adev) &&
3767 !amdgpu_virt_wait_reset(adev)) {
3768 dev_err(adev->dev, "VF exclusive mode timeout\n");
1daee8b4
PD
3769 /* Don't send request since VF is inactive. */
3770 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3771 adev->virt.ops = NULL;
8840a387 3772 r = -EAGAIN;
970fd197 3773 goto release_ras_con;
8840a387 3774 }
06ec9070 3775 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
e23b74aa 3776 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
970fd197 3777 goto release_ras_con;
d38ceaf9
AD
3778 }
3779
8d35a259
LG
3780 amdgpu_fence_driver_hw_init(adev);
3781
d69b8971
YZ
3782 dev_info(adev->dev,
3783 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
d7f72fe4
YZ
3784 adev->gfx.config.max_shader_engines,
3785 adev->gfx.config.max_sh_per_se,
3786 adev->gfx.config.max_cu_per_sh,
3787 adev->gfx.cu_info.number);
3788
d38ceaf9
AD
3789 adev->accel_working = true;
3790
e59c0205
AX
3791 amdgpu_vm_check_compute_bug(adev);
3792
95844d20
MO
3793 /* Initialize the buffer migration limit. */
3794 if (amdgpu_moverate >= 0)
3795 max_MBps = amdgpu_moverate;
3796 else
3797 max_MBps = 8; /* Allow 8 MB/s. */
3798 /* Get a log2 for easy divisions. */
3799 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
3800
d2f52ac8 3801 r = amdgpu_pm_sysfs_init(adev);
7c868b59
YT
3802 if (r) {
3803 adev->pm_sysfs_en = false;
d2f52ac8 3804 DRM_ERROR("registering pm debugfs failed (%d).\n", r);
7c868b59
YT
3805 } else
3806 adev->pm_sysfs_en = true;
d2f52ac8 3807
5bb23532 3808 r = amdgpu_ucode_sysfs_init(adev);
7c868b59
YT
3809 if (r) {
3810 adev->ucode_sysfs_en = false;
5bb23532 3811 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
7c868b59
YT
3812 } else
3813 adev->ucode_sysfs_en = true;
5bb23532 3814
d38ceaf9
AD
3815 if ((amdgpu_testing & 1)) {
3816 if (adev->accel_working)
3817 amdgpu_test_moves(adev);
3818 else
3819 DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");
3820 }
d38ceaf9
AD
3821 if (amdgpu_benchmarking) {
3822 if (adev->accel_working)
3823 amdgpu_benchmark(adev, amdgpu_benchmarking);
3824 else
3825 DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
3826 }
3827
b0adca4d
EQ
3828 /*
3829 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3830 * Otherwise the mgpu fan boost feature will be skipped because the
3831 * gpu instance count would be too low.
3832 */
3833 amdgpu_register_gpu_instance(adev);
3834
d38ceaf9
AD
3835 /* enable clockgating, etc. after ib tests, etc. since some blocks require
3836 * explicit gating rather than handling it automatically.
3837 */
e3c1b071 3838 if (!adev->gmc.xgmi.pending_reset) {
3839 r = amdgpu_device_ip_late_init(adev);
3840 if (r) {
3841 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3842 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
970fd197 3843 goto release_ras_con;
e3c1b071 3844 }
3845 /* must succeed. */
3846 amdgpu_ras_resume(adev);
3847 queue_delayed_work(system_wq, &adev->delayed_init_work,
3848 msecs_to_jiffies(AMDGPU_RESUME_MS));
2c1a2784 3849 }
d38ceaf9 3850
2c738637
ML
3851 if (amdgpu_sriov_vf(adev))
3852 flush_delayed_work(&adev->delayed_init_work);
3853
77f3a5cd 3854 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
5aea5327 3855 if (r)
77f3a5cd 3856 dev_err(adev->dev, "Could not create amdgpu device attr\n");
bd607166 3857
d155bef0
AB
3858 if (IS_ENABLED(CONFIG_PERF_EVENTS))
3859 r = amdgpu_pmu_init(adev);
9c7c85f7
JK
3860 if (r)
3861 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3862
c1dd4aa6
AG
3863 /* Have stored pci confspace at hand for restore in sudden PCI error */
3864 if (amdgpu_device_cache_pci_state(adev->pdev))
3865 pci_restore_state(pdev);
3866
8c3dd61c
KHF
3867 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
3868 /* this will fail for cards that aren't VGA class devices, just
3869 * ignore it */
3870 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
bf44e8ce 3871 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
8c3dd61c
KHF
3872
3873 if (amdgpu_device_supports_px(ddev)) {
3874 px = true;
3875 vga_switcheroo_register_client(adev->pdev,
3876 &amdgpu_switcheroo_ops, px);
3877 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3878 }
3879
e3c1b071 3880 if (adev->gmc.xgmi.pending_reset)
3881 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
3882 msecs_to_jiffies(AMDGPU_RESUME_MS));
3883
4a74c38c
PY
3884 amdgpu_device_check_iommu_direct_map(adev);
3885
d38ceaf9 3886 return 0;
83ba126a 3887
970fd197
SY
3888release_ras_con:
3889 amdgpu_release_ras_context(adev);
3890
83ba126a 3891failed:
89041940 3892 amdgpu_vf_error_trans_all(adev);
8840a387 3893
83ba126a 3894 return r;
d38ceaf9
AD
3895}
3896
07775fc1
AG
3897static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
3898{
62d5f9f7 3899
07775fc1
AG
3900 /* Clear all CPU mappings pointing to this device */
3901 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
3902
3903 /* Unmap all mapped bars - Doorbell, registers and VRAM */
3904 amdgpu_device_doorbell_fini(adev);
3905
3906 iounmap(adev->rmmio);
3907 adev->rmmio = NULL;
3908 if (adev->mman.aper_base_kaddr)
3909 iounmap(adev->mman.aper_base_kaddr);
3910 adev->mman.aper_base_kaddr = NULL;
3911
3912 /* Memory manager related */
3913 if (!adev->gmc.xgmi.connected_to_cpu) {
3914 arch_phys_wc_del(adev->gmc.vram_mtrr);
3915 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
3916 }
3917}
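/*
 * A note on unmap_mapping_range() above: with the (0, 0, 1) arguments it
 * zaps every userspace CPU mapping backed by this device's address space,
 * so any later access faults back into the driver instead of touching
 * BARs that may already be gone (e.g. after hot-unplug).
 */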
3918
d38ceaf9 3919/**
bbe04dec 3920 * amdgpu_device_fini_hw - tear down the driver
d38ceaf9
AD
3921 *
3922 * @adev: amdgpu_device pointer
3923 *
3924 * Tear down the driver info (all asics).
3925 * Called at driver shutdown.
3926 */
72c8c97b 3927void amdgpu_device_fini_hw(struct amdgpu_device *adev)
d38ceaf9 3928{
aac89168 3929 dev_info(adev->dev, "amdgpu: finishing device.\n");
9f875167 3930 flush_delayed_work(&adev->delayed_init_work);
691191a2
YW
3931 if (adev->mman.initialized) {
3932 flush_delayed_work(&adev->mman.bdev.wq);
e78b3197 3933 ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
691191a2 3934 }
d0d13fe8 3935 adev->shutdown = true;
9f875167 3936
752c683d
ML
3937 /* make sure IB tests have finished before entering exclusive mode
3938 * to avoid preemption on an IB test
3939 */
519b8b76 3940 if (amdgpu_sriov_vf(adev)) {
752c683d 3941 amdgpu_virt_request_full_gpu(adev, false);
519b8b76
BZ
3942 amdgpu_virt_fini_data_exchange(adev);
3943 }
752c683d 3944
e5b03032
ML
3945 /* disable all interrupts */
3946 amdgpu_irq_disable_all(adev);
ff97cba8 3947 if (adev->mode_info.mode_config_initialized){
1053b9c9 3948 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
4a580877 3949 drm_helper_force_disable_all(adev_to_drm(adev));
ff97cba8 3950 else
4a580877 3951 drm_atomic_helper_shutdown(adev_to_drm(adev));
ff97cba8 3952 }
8d35a259 3953 amdgpu_fence_driver_hw_fini(adev);
72c8c97b 3954
7c868b59
YT
3955 if (adev->pm_sysfs_en)
3956 amdgpu_pm_sysfs_fini(adev);
72c8c97b
AG
3957 if (adev->ucode_sysfs_en)
3958 amdgpu_ucode_sysfs_fini(adev);
3959 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
3960
232d1d43
SY
3961 /* disable ras feature must before hw fini */
3962 amdgpu_ras_pre_fini(adev);
3963
e9669fb7 3964 amdgpu_device_ip_fini_early(adev);
d10d0daa 3965
a3848df6
YW
3966 amdgpu_irq_fini_hw(adev);
3967
b6fd6e0f
SK
3968 if (adev->mman.initialized)
3969 ttm_device_clear_dma_mappings(&adev->mman.bdev);
894c6890 3970
d10d0daa 3971 amdgpu_gart_dummy_page_fini(adev);
07775fc1 3972
87172e89
LS
3973 if (drm_dev_is_unplugged(adev_to_drm(adev)))
3974 amdgpu_device_unmap_mmio(adev);
3975
72c8c97b
AG
3976}
3977
3978void amdgpu_device_fini_sw(struct amdgpu_device *adev)
3979{
62d5f9f7
LS
3980 int idx;
3981
8d35a259 3982 amdgpu_fence_driver_sw_fini(adev);
a5c5d8d5 3983 amdgpu_device_ip_fini(adev);
75e1658e
ND
3984 release_firmware(adev->firmware.gpu_info_fw);
3985 adev->firmware.gpu_info_fw = NULL;
d38ceaf9 3986 adev->accel_working = false;
04442bf7
LL
3987
3988 amdgpu_reset_fini(adev);
3989
d38ceaf9 3990 /* free i2c buses */
4562236b
HW
3991 if (!amdgpu_device_has_dc_support(adev))
3992 amdgpu_i2c_fini(adev);
bfca0289
SL
3993
3994 if (amdgpu_emu_mode != 1)
3995 amdgpu_atombios_fini(adev);
3996
d38ceaf9
AD
3997 kfree(adev->bios);
3998 adev->bios = NULL;
b98c6299 3999 if (amdgpu_device_supports_px(adev_to_drm(adev))) {
84c8b22e 4000 vga_switcheroo_unregister_client(adev->pdev);
83ba126a 4001 vga_switcheroo_fini_domain_pm_ops(adev->dev);
b98c6299 4002 }
38d6be81 4003 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
b8779475 4004 vga_client_unregister(adev->pdev);
e9bc1bf7 4005
62d5f9f7
LS
4006 if (drm_dev_enter(adev_to_drm(adev), &idx)) {
4007
4008 iounmap(adev->rmmio);
4009 adev->rmmio = NULL;
4010 amdgpu_device_doorbell_fini(adev);
4011 drm_dev_exit(idx);
4012 }
4013
d155bef0
AB
4014 if (IS_ENABLED(CONFIG_PERF_EVENTS))
4015 amdgpu_pmu_fini(adev);
72de33f8 4016 if (adev->mman.discovery_bin)
a190d1c7 4017 amdgpu_discovery_fini(adev);
72c8c97b
AG
4018
4019 kfree(adev->pci_state);
4020
d38ceaf9
AD
4021}
4022
58144d28
ND
4023/**
4024 * amdgpu_device_evict_resources - evict device resources
4025 * @adev: amdgpu device object
4026 *
4027 * Evicts all ttm device resources (vram BOs, gart table) from the lru list
4028 * of the vram memory type. Mainly used for evicting device resources
4029 * at suspend time.
4030 *
4031 */
4032static void amdgpu_device_evict_resources(struct amdgpu_device *adev)
4033{
e53d9665
ML
4034 /* No need to evict vram on APUs for suspend to ram or s2idle */
4035 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU))
58144d28
ND
4036 return;
4037
4038 if (amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM))
4039 DRM_WARN("evicting device resources failed\n");
4040
4041}
d38ceaf9
AD
4042
4043/*
4044 * Suspend & resume.
4045 */
4046/**
810ddc3a 4047 * amdgpu_device_suspend - initiate device suspend
d38ceaf9 4048 *
87e3f136 4049 * @dev: drm dev pointer
87e3f136 4050 * @fbcon : notify the fbdev of suspend
d38ceaf9
AD
4051 *
4052 * Puts the hw in the suspend state (all asics).
4053 * Returns 0 for success or an error on failure.
4054 * Called at driver suspend.
4055 */
de185019 4056int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
d38ceaf9 4057{
a2e15b0e 4058 struct amdgpu_device *adev = drm_to_adev(dev);
d38ceaf9 4059
d38ceaf9
AD
4060 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4061 return 0;
4062
44779b43 4063 adev->in_suspend = true;
3fa8f89d
S
4064
4065 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
4066 DRM_WARN("smart shift update failed\n");
4067
d38ceaf9
AD
4068 drm_kms_helper_poll_disable(dev);
4069
5f818173 4070 if (fbcon)
087451f3 4071 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true);
5f818173 4072
beff74bc 4073 cancel_delayed_work_sync(&adev->delayed_init_work);
a5459475 4074
5e6932fe 4075 amdgpu_ras_suspend(adev);
4076
2196927b 4077 amdgpu_device_ip_suspend_phase1(adev);
fe1053b7 4078
5d3a2d95
AD
4079 if (!adev->in_s0ix)
4080 amdgpu_amdkfd_suspend(adev, adev->in_runpm);
94fa5660 4081
58144d28 4082 amdgpu_device_evict_resources(adev);
d38ceaf9 4083
8d35a259 4084 amdgpu_fence_driver_hw_fini(adev);
d38ceaf9 4085
2196927b 4086 amdgpu_device_ip_suspend_phase2(adev);
d38ceaf9 4087
d38ceaf9
AD
4088 return 0;
4089}
4090
4091/**
810ddc3a 4092 * amdgpu_device_resume - initiate device resume
d38ceaf9 4093 *
87e3f136 4094 * @dev: drm dev pointer
87e3f136 4095 * @fbcon : notify the fbdev of resume
d38ceaf9
AD
4096 *
4097 * Bring the hw back to operating state (all asics).
4098 * Returns 0 for success or an error on failure.
4099 * Called at driver resume.
4100 */
de185019 4101int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
d38ceaf9 4102{
1348969a 4103 struct amdgpu_device *adev = drm_to_adev(dev);
03161a6e 4104 int r = 0;
d38ceaf9
AD
4105
4106 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4107 return 0;
4108
62498733 4109 if (adev->in_s0ix)
628c36d7
PL
4110 amdgpu_gfx_state_change_set(adev, sGpuChangeState_D0Entry);
4111
d38ceaf9 4112 /* post card */
39c640c0 4113 if (amdgpu_device_need_post(adev)) {
4d2997ab 4114 r = amdgpu_device_asic_init(adev);
74b0b157 4115 if (r)
aac89168 4116 dev_err(adev->dev, "amdgpu asic init failed\n");
74b0b157 4117 }
d38ceaf9 4118
06ec9070 4119 r = amdgpu_device_ip_resume(adev);
e6707218 4120 if (r) {
aac89168 4121 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
4d3b9ae5 4122 return r;
e6707218 4123 }
8d35a259 4124 amdgpu_fence_driver_hw_init(adev);
5ceb54c6 4125
06ec9070 4126 r = amdgpu_device_ip_late_init(adev);
03161a6e 4127 if (r)
4d3b9ae5 4128 return r;
d38ceaf9 4129
beff74bc
AD
4130 queue_delayed_work(system_wq, &adev->delayed_init_work,
4131 msecs_to_jiffies(AMDGPU_RESUME_MS));
4132
5d3a2d95
AD
4133 if (!adev->in_s0ix) {
4134 r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
4135 if (r)
4136 return r;
4137 }
756e6880 4138
96a5d8d4 4139 /* Make sure IB tests flushed */
beff74bc 4140 flush_delayed_work(&adev->delayed_init_work);
96a5d8d4 4141
a2e15b0e 4142 if (fbcon)
087451f3 4143 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false);
d38ceaf9
AD
4144
4145 drm_kms_helper_poll_enable(dev);
23a1a9e5 4146
5e6932fe 4147 amdgpu_ras_resume(adev);
4148
23a1a9e5
L
4149 /*
4150 * Most of the connector probing functions try to acquire runtime pm
4151 * refs to ensure that the GPU is powered on when connector polling is
4152 * performed. Since we're calling this from a runtime PM callback,
4153 * trying to acquire rpm refs will cause us to deadlock.
4154 *
4155 * Since we're guaranteed to be holding the rpm lock, it's safe to
4156 * temporarily disable the rpm helpers so this doesn't deadlock us.
4157 */
4158#ifdef CONFIG_PM
4159 dev->dev->power.disable_depth++;
4160#endif
4562236b
HW
4161 if (!amdgpu_device_has_dc_support(adev))
4162 drm_helper_hpd_irq_event(dev);
4163 else
4164 drm_kms_helper_hotplug_event(dev);
23a1a9e5
L
4165#ifdef CONFIG_PM
4166 dev->dev->power.disable_depth--;
4167#endif
44779b43
RZ
4168 adev->in_suspend = false;
4169
3fa8f89d
S
4170 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
4171 DRM_WARN("smart shift update failed\n");
4172
4d3b9ae5 4173 return 0;
d38ceaf9
AD
4174}
4175
e3ecdffa
AD
4176/**
4177 * amdgpu_device_ip_check_soft_reset - did soft reset succeed
4178 *
4179 * @adev: amdgpu_device pointer
4180 *
4181 * The list of all the hardware IPs that make up the asic is walked and
4182 * the check_soft_reset callbacks are run. check_soft_reset determines
4183 * if the asic is still hung or not.
4184 * Returns true if any of the IPs are still in a hung state, false if not.
4185 */
06ec9070 4186static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
63fbf42f
CZ
4187{
4188 int i;
4189 bool asic_hang = false;
4190
f993d628
ML
4191 if (amdgpu_sriov_vf(adev))
4192 return true;
4193
8bc04c29
AD
4194 if (amdgpu_asic_need_full_reset(adev))
4195 return true;
4196
63fbf42f 4197 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 4198 if (!adev->ip_blocks[i].status.valid)
63fbf42f 4199 continue;
a1255107
AD
4200 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
4201 adev->ip_blocks[i].status.hang =
4202 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
4203 if (adev->ip_blocks[i].status.hang) {
aac89168 4204 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
63fbf42f
CZ
4205 asic_hang = true;
4206 }
4207 }
4208 return asic_hang;
4209}
4210
e3ecdffa
AD
4211/**
4212 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
4213 *
4214 * @adev: amdgpu_device pointer
4215 *
4216 * The list of all the hardware IPs that make up the asic is walked and the
4217 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset
4218 * handles any IP specific hardware or software state changes that are
4219 * necessary for a soft reset to succeed.
4220 * Returns 0 on success, negative error code on failure.
4221 */
06ec9070 4222static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
d31a501e
CZ
4223{
4224 int i, r = 0;
4225
4226 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 4227 if (!adev->ip_blocks[i].status.valid)
d31a501e 4228 continue;
a1255107
AD
4229 if (adev->ip_blocks[i].status.hang &&
4230 adev->ip_blocks[i].version->funcs->pre_soft_reset) {
4231 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
d31a501e
CZ
4232 if (r)
4233 return r;
4234 }
4235 }
4236
4237 return 0;
4238}
4239
e3ecdffa
AD
4240/**
4241 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
4242 *
4243 * @adev: amdgpu_device pointer
4244 *
4245 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu
4246 * reset is necessary to recover.
4247 * Returns true if a full asic reset is required, false if not.
4248 */
06ec9070 4249static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
35d782fe 4250{
da146d3b
AD
4251 int i;
4252
8bc04c29
AD
4253 if (amdgpu_asic_need_full_reset(adev))
4254 return true;
4255
da146d3b 4256 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 4257 if (!adev->ip_blocks[i].status.valid)
da146d3b 4258 continue;
a1255107
AD
4259 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
4260 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
4261 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
98512bb8
KW
4262 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
4263 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
a1255107 4264 if (adev->ip_blocks[i].status.hang) {
aac89168 4265 dev_info(adev->dev, "Some block need full reset!\n");
da146d3b
AD
4266 return true;
4267 }
4268 }
35d782fe
CZ
4269 }
4270 return false;
4271}
4272
e3ecdffa
AD
4273/**
4274 * amdgpu_device_ip_soft_reset - do a soft reset
4275 *
4276 * @adev: amdgpu_device pointer
4277 *
4278 * The list of all the hardware IPs that make up the asic is walked and the
4279 * soft_reset callbacks are run if the block is hung. soft_reset handles any
4280 * IP specific hardware or software state changes that are necessary to soft
4281 * reset the IP.
4282 * Returns 0 on success, negative error code on failure.
4283 */
06ec9070 4284static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
35d782fe
CZ
4285{
4286 int i, r = 0;
4287
4288 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 4289 if (!adev->ip_blocks[i].status.valid)
35d782fe 4290 continue;
a1255107
AD
4291 if (adev->ip_blocks[i].status.hang &&
4292 adev->ip_blocks[i].version->funcs->soft_reset) {
4293 r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
35d782fe
CZ
4294 if (r)
4295 return r;
4296 }
4297 }
4298
4299 return 0;
4300}
4301
e3ecdffa
AD
4302/**
4303 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
4304 *
4305 * @adev: amdgpu_device pointer
4306 *
4307 * The list of all the hardware IPs that make up the asic is walked and the
4308 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
4309 * handles any IP specific hardware or software state changes that are
4310 * necessary after the IP has been soft reset.
4311 * Returns 0 on success, negative error code on failure.
4312 */
06ec9070 4313static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
35d782fe
CZ
4314{
4315 int i, r = 0;
4316
4317 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 4318 if (!adev->ip_blocks[i].status.valid)
35d782fe 4319 continue;
a1255107
AD
4320 if (adev->ip_blocks[i].status.hang &&
4321 adev->ip_blocks[i].version->funcs->post_soft_reset)
4322 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
35d782fe
CZ
4323 if (r)
4324 return r;
4325 }
4326
4327 return 0;
4328}
4329
e3ecdffa 4330/**
c33adbc7 4331 * amdgpu_device_recover_vram - Recover some VRAM contents
e3ecdffa
AD
4332 *
4333 * @adev: amdgpu_device pointer
4334 *
4335 * Restores the contents of VRAM buffers from the shadows in GTT. Used to
4336 * restore things like GPUVM page tables after a GPU reset where
4337 * the contents of VRAM might be lost.
403009bf
CK
4338 *
4339 * Returns:
4340 * 0 on success, negative error code on failure.
e3ecdffa 4341 */
c33adbc7 4342static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
c41d1cf6 4343{
c41d1cf6 4344 struct dma_fence *fence = NULL, *next = NULL;
403009bf 4345 struct amdgpu_bo *shadow;
e18aaea7 4346 struct amdgpu_bo_vm *vmbo;
403009bf 4347 long r = 1, tmo;
c41d1cf6
ML
4348
4349 if (amdgpu_sriov_runtime(adev))
b045d3af 4350 tmo = msecs_to_jiffies(8000);
c41d1cf6
ML
4351 else
4352 tmo = msecs_to_jiffies(100);
4353
aac89168 4354 dev_info(adev->dev, "recover vram bo from shadow start\n");
c41d1cf6 4355 mutex_lock(&adev->shadow_list_lock);
e18aaea7
ND
4356 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) {
4357 shadow = &vmbo->bo;
403009bf 4358 /* No need to recover an evicted BO */
d3116756
CK
4359 if (shadow->tbo.resource->mem_type != TTM_PL_TT ||
4360 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET ||
4361 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM)
403009bf
CK
4362 continue;
4363
4364 r = amdgpu_bo_restore_shadow(shadow, &next);
4365 if (r)
4366 break;
4367
c41d1cf6 4368 if (fence) {
1712fb1a 4369 tmo = dma_fence_wait_timeout(fence, false, tmo);
403009bf
CK
4370 dma_fence_put(fence);
4371 fence = next;
1712fb1a 4372 if (tmo == 0) {
4373 r = -ETIMEDOUT;
c41d1cf6 4374 break;
1712fb1a 4375 } else if (tmo < 0) {
4376 r = tmo;
4377 break;
4378 }
403009bf
CK
4379 } else {
4380 fence = next;
c41d1cf6 4381 }
c41d1cf6
ML
4382 }
4383 mutex_unlock(&adev->shadow_list_lock);
4384
403009bf
CK
4385 if (fence)
4386 tmo = dma_fence_wait_timeout(fence, false, tmo);
c41d1cf6
ML
4387 dma_fence_put(fence);
4388
1712fb1a 4389 if (r < 0 || tmo <= 0) {
aac89168 4390 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
403009bf
CK
4391 return -EIO;
4392 }
c41d1cf6 4393
aac89168 4394 dev_info(adev->dev, "recover vram bo from shadow done\n");
403009bf 4395 return 0;
c41d1cf6
ML
4396}
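/*
 * Note on the restore loop above: shadow copies are pipelined (the copy
 * for the next BO is issued before the previous fence is waited on), and
 * because tmo is reassigned with the remaining jiffies returned by
 * dma_fence_wait_timeout(), the whole chain shares a single overall
 * budget: roughly 8 seconds under SR-IOV runtime, 100 ms otherwise.
 */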
4397
a90ad3c2 4398
e3ecdffa 4399/**
06ec9070 4400 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
5740682e 4401 *
982a820b 4402 * @adev: amdgpu_device pointer
87e3f136 4403 * @from_hypervisor: request from hypervisor
5740682e
ML
4404 *
4405 * do VF FLR and reinitialize the ASIC
3f48c681 4406 * return 0 on success, otherwise an error code
e3ecdffa
AD
4407 */
4408static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4409 bool from_hypervisor)
5740682e
ML
4410{
4411 int r;
a5f67c93 4412 struct amdgpu_hive_info *hive = NULL;
5740682e 4413
992110d7 4414 amdgpu_amdkfd_pre_reset(adev);
5740682e 4415
4417
5740682e
ML
4418 if (from_hypervisor)
4419 r = amdgpu_virt_request_full_gpu(adev, true);
4420 else
4421 r = amdgpu_virt_reset_gpu(adev);
4422 if (r)
4423 return r;
a90ad3c2
ML
4424
4425 /* Resume IP prior to SMC */
06ec9070 4426 r = amdgpu_device_ip_reinit_early_sriov(adev);
5740682e
ML
4427 if (r)
4428 goto error;
a90ad3c2 4429
c9ffa427 4430 amdgpu_virt_init_data_exchange(adev);
a90ad3c2 4431
7a3e0bb2
RZ
4432 r = amdgpu_device_fw_loading(adev);
4433 if (r)
4434 return r;
4435
a90ad3c2 4436 /* now we are okay to resume SMC/CP/SDMA */
06ec9070 4437 r = amdgpu_device_ip_reinit_late_sriov(adev);
5740682e
ML
4438 if (r)
4439 goto error;
a90ad3c2 4440
a5f67c93
ZL
4441 hive = amdgpu_get_xgmi_hive(adev);
4442 /* Update PSP FW topology after reset */
4443 if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
4444 r = amdgpu_xgmi_update_topology(hive, adev);
4445
4446 if (hive)
4447 amdgpu_put_xgmi_hive(hive);
4448
4449 if (!r) {
4450 amdgpu_irq_gpu_reset_resume_helper(adev);
4451 r = amdgpu_ib_ring_tests(adev);
4452 amdgpu_amdkfd_post_reset(adev);
4453 }
a90ad3c2 4454
abc34253 4455error:
c41d1cf6 4456 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
e3526257 4457 amdgpu_inc_vram_lost(adev);
c33adbc7 4458 r = amdgpu_device_recover_vram(adev);
a90ad3c2 4459 }
437f3e0b 4460 amdgpu_virt_release_full_gpu(adev, true);
a90ad3c2
ML
4461
4462 return r;
4463}
4464
9a1cddd6 4465/**
4466 * amdgpu_device_has_job_running - check if there is any job in the pending list
4467 *
982a820b 4468 * @adev: amdgpu_device pointer
9a1cddd6 4469 *
4470 * check if there is any job in the pending list
4471 */
4472bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4473{
4474 int i;
4475 struct drm_sched_job *job;
4476
4477 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4478 struct amdgpu_ring *ring = adev->rings[i];
4479
4480 if (!ring || !ring->sched.thread)
4481 continue;
4482
4483 spin_lock(&ring->sched.job_list_lock);
6efa4b46
LT
4484 job = list_first_entry_or_null(&ring->sched.pending_list,
4485 struct drm_sched_job, list);
9a1cddd6 4486 spin_unlock(&ring->sched.job_list_lock);
4487 if (job)
4488 return true;
4489 }
4490 return false;
4491}
4492
12938fad
CK
4493/**
4494 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4495 *
982a820b 4496 * @adev: amdgpu_device pointer
12938fad
CK
4497 *
4498 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4499 * a hung GPU.
4500 */
4501bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4502{
4503 if (!amdgpu_device_ip_check_soft_reset(adev)) {
aac89168 4504 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
12938fad
CK
4505 return false;
4506 }
4507
3ba7b418
AG
4508 if (amdgpu_gpu_recovery == 0)
4509 goto disabled;
4510
4511 if (amdgpu_sriov_vf(adev))
4512 return true;
4513
4514 if (amdgpu_gpu_recovery == -1) {
4515 switch (adev->asic_type) {
0ffb1fd1
AD
4516#ifdef CONFIG_DRM_AMDGPU_SI
4517 case CHIP_VERDE:
4518 case CHIP_TAHITI:
4519 case CHIP_PITCAIRN:
4520 case CHIP_OLAND:
4521 case CHIP_HAINAN:
4522#endif
4523#ifdef CONFIG_DRM_AMDGPU_CIK
4524 case CHIP_KAVERI:
4525 case CHIP_KABINI:
4526 case CHIP_MULLINS:
4527#endif
4528 case CHIP_CARRIZO:
4529 case CHIP_STONEY:
4530 case CHIP_CYAN_SKILLFISH:
3ba7b418 4531 goto disabled;
0ffb1fd1
AD
4532 default:
4533 break;
3ba7b418 4534 }
12938fad
CK
4535 }
4536
4537 return true;
3ba7b418
AG
4538
4539disabled:
aac89168 4540 dev_info(adev->dev, "GPU recovery disabled.\n");
3ba7b418 4541 return false;
12938fad
CK
4542}
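/*
 * Summary of the amdgpu_gpu_recovery module parameter as handled above:
 *    0 - recovery disabled
 *    1 - recovery enabled
 *   -1 - auto: enabled, except on the ASICs listed in the switch above
 *        (SI/CIK parts, Carrizo, Stoney, Cyan Skillfish) where it stays
 *        disabled; SR-IOV VFs always attempt recovery.
 */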
4543
5c03e584
FX
4544int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
4545{
4546 u32 i;
4547 int ret = 0;
4548
4549 amdgpu_atombios_scratch_regs_engine_hung(adev, true);
4550
4551 dev_info(adev->dev, "GPU mode1 reset\n");
4552
4553 /* disable BM */
4554 pci_clear_master(adev->pdev);
4555
4556 amdgpu_device_cache_pci_state(adev->pdev);
4557
4558 if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
4559 dev_info(adev->dev, "GPU smu mode1 reset\n");
4560 ret = amdgpu_dpm_mode1_reset(adev);
4561 } else {
4562 dev_info(adev->dev, "GPU psp mode1 reset\n");
4563 ret = psp_gpu_reset(adev);
4564 }
4565
4566 if (ret)
4567 dev_err(adev->dev, "GPU mode1 reset failed\n");
4568
4569 amdgpu_device_load_pci_state(adev->pdev);
4570
4571 /* wait for asic to come out of reset */
4572 for (i = 0; i < adev->usec_timeout; i++) {
4573 u32 memsize = adev->nbio.funcs->get_memsize(adev);
4574
4575 if (memsize != 0xffffffff)
4576 break;
4577 udelay(1);
4578 }
4579
4580 amdgpu_atombios_scratch_regs_engine_hung(adev, false);
4581 return ret;
4582}
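/*
 * The wait loop above uses the NBIO memsize register as a liveness probe:
 * while the ASIC is still in reset it reads back as 0xffffffff, so the
 * poll is essentially (sketch)
 *
 *   while (get_memsize(adev) == 0xffffffff)
 *       udelay(1);
 *
 * bounded by adev->usec_timeout iterations.
 */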
5c6dd71e 4583
e3c1b071 4584int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
04442bf7 4585 struct amdgpu_reset_context *reset_context)
26bc5340 4586{
5c1e6fa4 4587 int i, r = 0;
04442bf7
LL
4588 struct amdgpu_job *job = NULL;
4589 bool need_full_reset =
4590 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4591
4592 if (reset_context->reset_req_dev == adev)
4593 job = reset_context->job;
71182665 4594
b602ca5f
TZ
4595 if (amdgpu_sriov_vf(adev)) {
4596 /* stop the data exchange thread */
4597 amdgpu_virt_fini_data_exchange(adev);
4598 }
4599
71182665 4600 /* block all schedulers and reset given job's ring */
0875dc9e
CZ
4601 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4602 struct amdgpu_ring *ring = adev->rings[i];
4603
51687759 4604 if (!ring || !ring->sched.thread)
0875dc9e 4605 continue;
5740682e 4606
c530b02f
JZ
4607 /* clear job fences from the fence driver to avoid force_completion;
4608 * leave NULL and vm flush fences in the fence driver */
5c1e6fa4 4609 amdgpu_fence_driver_clear_job_fences(ring);
c530b02f 4610
2f9d4084
ML
4611 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4612 amdgpu_fence_driver_force_completion(ring);
0875dc9e 4613 }
d38ceaf9 4614
ff99849b 4615 if (job && job->vm)
222b5f04
AG
4616 drm_sched_increase_karma(&job->base);
4617
04442bf7 4618 r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
404b277b
LL
4619 /* If reset handler not implemented, continue; otherwise return */
4620 if (r == -ENOSYS)
4621 r = 0;
4622 else
04442bf7
LL
4623 return r;
4624
1d721ed6 4625 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
26bc5340
AG
4626 if (!amdgpu_sriov_vf(adev)) {
4627
4628 if (!need_full_reset)
4629 need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4630
4631 if (!need_full_reset) {
4632 amdgpu_device_ip_pre_soft_reset(adev);
4633 r = amdgpu_device_ip_soft_reset(adev);
4634 amdgpu_device_ip_post_soft_reset(adev);
4635 if (r || amdgpu_device_ip_check_soft_reset(adev)) {
aac89168 4636 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
26bc5340
AG
4637 need_full_reset = true;
4638 }
4639 }
4640
4641 if (need_full_reset)
4642 r = amdgpu_device_ip_suspend(adev);
04442bf7
LL
4643 if (need_full_reset)
4644 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4645 else
4646 clear_bit(AMDGPU_NEED_FULL_RESET,
4647 &reset_context->flags);
26bc5340
AG
4648 }
4649
4650 return r;
4651}
4652
04442bf7
LL
4653int amdgpu_do_asic_reset(struct list_head *device_list_handle,
4654 struct amdgpu_reset_context *reset_context)
26bc5340
AG
4655{
4656 struct amdgpu_device *tmp_adev = NULL;
04442bf7 4657 bool need_full_reset, skip_hw_reset, vram_lost = false;
26bc5340
AG
4658 int r = 0;
4659
04442bf7
LL
4660 /* Try reset handler method first */
4661 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
4662 reset_list);
4663 r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
404b277b
LL
4664 /* If reset handler not implemented, continue; otherwise return */
4665 if (r == -ENOSYS)
4666 r = 0;
4667 else
04442bf7
LL
4668 return r;
4669
4670 /* Reset handler not implemented, use the default method */
4671 need_full_reset =
4672 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4673 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
4674
26bc5340 4675 /*
655ce9cb 4676 * ASIC reset has to be done on all XGMI hive nodes ASAP
26bc5340
AG
4677 * to allow proper links negotiation in FW (within 1 sec)
4678 */
7ac71382 4679 if (!skip_hw_reset && need_full_reset) {
655ce9cb 4680 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
041a62bc 4681 /* For XGMI run all resets in parallel to speed up the process */
d4535e2c 4682 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
e3c1b071 4683 tmp_adev->gmc.xgmi.pending_reset = false;
c96cf282 4684 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
d4535e2c
AG
4685 r = -EALREADY;
4686 } else
4687 r = amdgpu_asic_reset(tmp_adev);
d4535e2c 4688
041a62bc 4689 if (r) {
aac89168 4690 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
4a580877 4691 r, adev_to_drm(tmp_adev)->unique);
041a62bc 4692 break;
ce316fa5
LM
4693 }
4694 }
4695
041a62bc
AG
4696 /* For XGMI wait for all resets to complete before proceed */
4697 if (!r) {
655ce9cb 4698 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
ce316fa5
LM
4699 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4700 flush_work(&tmp_adev->xgmi_reset_work);
4701 r = tmp_adev->asic_reset_res;
4702 if (r)
4703 break;
ce316fa5
LM
4704 }
4705 }
4706 }
ce316fa5 4707 }
26bc5340 4708
43c4d576 4709 if (!r && amdgpu_ras_intr_triggered()) {
655ce9cb 4710 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
8bc7b360
HZ
4711 if (tmp_adev->mmhub.ras_funcs &&
4712 tmp_adev->mmhub.ras_funcs->reset_ras_error_count)
4713 tmp_adev->mmhub.ras_funcs->reset_ras_error_count(tmp_adev);
43c4d576
JC
4714 }
4715
00eaa571 4716 amdgpu_ras_intr_cleared();
43c4d576 4717 }
00eaa571 4718
655ce9cb 4719 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
26bc5340
AG
4720 if (need_full_reset) {
4721 /* post card */
e3c1b071 4722 r = amdgpu_device_asic_init(tmp_adev);
4723 if (r) {
aac89168 4724 dev_warn(tmp_adev->dev, "asic atom init failed!");
e3c1b071 4725 } else {
26bc5340 4726 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
9cec53c1
JZ
4727 r = amdgpu_amdkfd_resume_iommu(tmp_adev);
4728 if (r)
4729 goto out;
4730
26bc5340
AG
4731 r = amdgpu_device_ip_resume_phase1(tmp_adev);
4732 if (r)
4733 goto out;
4734
4735 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4736 if (vram_lost) {
77e7f829 4737 DRM_INFO("VRAM is lost due to GPU reset!\n");
e3526257 4738 amdgpu_inc_vram_lost(tmp_adev);
26bc5340
AG
4739 }
4740
26bc5340
AG
4741 r = amdgpu_device_fw_loading(tmp_adev);
4742 if (r)
4743 return r;
4744
4745 r = amdgpu_device_ip_resume_phase2(tmp_adev);
4746 if (r)
4747 goto out;
4748
4749 if (vram_lost)
4750 amdgpu_device_fill_reset_magic(tmp_adev);
4751
fdafb359
EQ
4752 /*
4753 * Add this ASIC as tracked since the reset has already
4754 * completed successfully.
4755 */
4756 amdgpu_register_gpu_instance(tmp_adev);
4757
04442bf7
LL
4758 if (!reset_context->hive &&
4759 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
e3c1b071 4760 amdgpu_xgmi_add_device(tmp_adev);
4761
7c04ca50 4762 r = amdgpu_device_ip_late_init(tmp_adev);
4763 if (r)
4764 goto out;
4765
087451f3 4766 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false);
565d1941 4767
e8fbaf03
GC
4768 /*
4769 * The GPU enters a bad state once the number of faulty
4770 * pages detected by ECC has reached the threshold, and RAS
4771 * recovery is scheduled next. So add one check here to
4772 * break recovery if it indeed exceeds the bad page
4773 * threshold, and remind the user to retire this GPU or to
4774 * set a bigger bad_page_threshold value, so that this is
4775 * fixed the next time the driver is probed.
4777 */
11003c68 4778 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
e8fbaf03
GC
4779 /* must succeed. */
4780 amdgpu_ras_resume(tmp_adev);
4781 } else {
4782 r = -EINVAL;
4783 goto out;
4784 }
e79a04d5 4785
26bc5340 4786 /* Update PSP FW topology after reset */
04442bf7
LL
4787 if (reset_context->hive &&
4788 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4789 r = amdgpu_xgmi_update_topology(
4790 reset_context->hive, tmp_adev);
26bc5340
AG
4791 }
4792 }
4793
26bc5340
AG
4794out:
4795 if (!r) {
4796 amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4797 r = amdgpu_ib_ring_tests(tmp_adev);
4798 if (r) {
4799 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
26bc5340
AG
4800 need_full_reset = true;
4801 r = -EAGAIN;
4802 goto end;
4803 }
4804 }
4805
4806 if (!r)
4807 r = amdgpu_device_recover_vram(tmp_adev);
4808 else
4809 tmp_adev->asic_reset_res = r;
4810 }
4811
4812end:
04442bf7
LL
4813 if (need_full_reset)
4814 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4815 else
4816 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
26bc5340
AG
4817 return r;
4818}
4819
08ebb485
DL
4820static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
4821 struct amdgpu_hive_info *hive)
26bc5340 4822{
53b3f8f4
DL
4823 if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
4824 return false;
4825
08ebb485
DL
4826 if (hive) {
4827 down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
4828 } else {
4829 down_write(&adev->reset_sem);
4830 }
5740682e 4831
a3a09142
AD
4832 switch (amdgpu_asic_reset_method(adev)) {
4833 case AMD_RESET_METHOD_MODE1:
4834 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
4835 break;
4836 case AMD_RESET_METHOD_MODE2:
4837 adev->mp1_state = PP_MP1_STATE_RESET;
4838 break;
4839 default:
4840 adev->mp1_state = PP_MP1_STATE_NONE;
4841 break;
4842 }
1d721ed6
AG
4843
4844 return true;
26bc5340 4845}
d38ceaf9 4846
26bc5340
AG
4847static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
4848{
89041940 4849 amdgpu_vf_error_trans_all(adev);
a3a09142 4850 adev->mp1_state = PP_MP1_STATE_NONE;
53b3f8f4 4851 atomic_set(&adev->in_gpu_reset, 0);
6049db43 4852 up_write(&adev->reset_sem);
26bc5340
AG
4853}
4854
91fb309d
HC
4855/*
4856 * to lock a list of amdgpu devices in a hive safely; if it is not a hive
4857 * with multiple nodes, this behaves like amdgpu_device_lock_adev.
4858 *
4859 * unlock won't require roll back.
4860 */
4861static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
4862{
4863 struct amdgpu_device *tmp_adev = NULL;
4864
175ac6ec 4865 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
91fb309d
HC
4866 if (!hive) {
4867 dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
4868 return -ENODEV;
4869 }
4870 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
4871 if (!amdgpu_device_lock_adev(tmp_adev, hive))
4872 goto roll_back;
4873 }
4874 } else if (!amdgpu_device_lock_adev(adev, hive))
4875 return -EAGAIN;
4876
4877 return 0;
4878roll_back:
4879 if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) {
4880 /*
4881 * if the locking iteration breaks in the middle of a hive,
4882 * it may mean there is a race issue,
4883 * or that a hive device locked up independently.
4884 * we may or may not be in trouble, so try to roll back
4885 * the locks and give out a warning.
4886 */
4887 dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. Rolling back to unlock");
4888 list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) {
4889 amdgpu_device_unlock_adev(tmp_adev);
4890 }
4891 }
4892 return -EAGAIN;
4893}
4894
3f12acc8
EQ
4895static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
4896{
4897 struct pci_dev *p = NULL;
4898
4899 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4900 adev->pdev->bus->number, 1);
4901 if (p) {
4902 pm_runtime_enable(&(p->dev));
4903 pm_runtime_resume(&(p->dev));
4904 }
4905}
4906
4907static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
4908{
4909 enum amd_reset_method reset_method;
4910 struct pci_dev *p = NULL;
4911 u64 expires;
4912
4913 /*
4914 * For now, only BACO and mode1 reset are confirmed
4915 * to suffer the audio issue if the codec is not properly suspended.
4916 */
4917 reset_method = amdgpu_asic_reset_method(adev);
4918 if ((reset_method != AMD_RESET_METHOD_BACO) &&
4919 (reset_method != AMD_RESET_METHOD_MODE1))
4920 return -EINVAL;
4921
4922 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4923 adev->pdev->bus->number, 1);
4924 if (!p)
4925 return -ENODEV;
4926
4927 expires = pm_runtime_autosuspend_expiration(&(p->dev));
4928 if (!expires)
4929 /*
4930 * If we cannot get the audio device autosuspend delay,
4931 * a fixed 4S interval will be used. Since 3S is the audio
4932 * controller's default autosuspend delay setting, the 4S
4933 * used here is guaranteed to cover it.
4934 */
54b7feb9 4935 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
3f12acc8
EQ
4936
4937 while (!pm_runtime_status_suspended(&(p->dev))) {
4938 if (!pm_runtime_suspend(&(p->dev)))
4939 break;
4940
4941 if (expires < ktime_get_mono_fast_ns()) {
4942 dev_warn(adev->dev, "failed to suspend display audio\n");
4943 /* TODO: abort the succeeding gpu reset? */
4944 return -ETIMEDOUT;
4945 }
4946 }
4947
4948 pm_runtime_disable(&(p->dev));
4949
4950 return 0;
4951}
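/*
 * Both display-audio helpers above look the codec up with
 * pci_get_domain_bus_and_slot(domain, bus, 1), i.e. they assume the HDMI/DP
 * audio controller is exposed as devfn 1 (device 0, function 1) on the same
 * bus as the GPU, which is the usual layout for AMD dGPUs.
 */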
4952
9d8d96be 4953static void amdgpu_device_recheck_guilty_jobs(
04442bf7
LL
4954 struct amdgpu_device *adev, struct list_head *device_list_handle,
4955 struct amdgpu_reset_context *reset_context)
e6c6338f
JZ
4956{
4957 int i, r = 0;
4958
4959 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4960 struct amdgpu_ring *ring = adev->rings[i];
4961 int ret = 0;
4962 struct drm_sched_job *s_job;
4963
4964 if (!ring || !ring->sched.thread)
4965 continue;
4966
4967 s_job = list_first_entry_or_null(&ring->sched.pending_list,
4968 struct drm_sched_job, list);
4969 if (s_job == NULL)
4970 continue;
4971
4972 /* clear the job's guilty flag and let the following step decide the real one */
4973 drm_sched_reset_karma(s_job);
38d4e463
JC
4974 /* the real bad job will be resubmitted twice, so take a dma_fence_get
4975 * here to keep the fence refcount balanced */
4976 dma_fence_get(s_job->s_fence->parent);
e6c6338f
JZ
4977 drm_sched_resubmit_jobs_ext(&ring->sched, 1);
4978
4979 ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout);
4980 if (ret == 0) { /* timeout */
4981 DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n",
4982 ring->sched.name, s_job->id);
4983
4984 /* set guilty */
4985 drm_sched_increase_karma(s_job);
4986retry:
4987 /* do hw reset */
4988 if (amdgpu_sriov_vf(adev)) {
4989 amdgpu_virt_fini_data_exchange(adev);
4990 r = amdgpu_device_reset_sriov(adev, false);
4991 if (r)
4992 adev->asic_reset_res = r;
4993 } else {
04442bf7
LL
4994 clear_bit(AMDGPU_SKIP_HW_RESET,
4995 &reset_context->flags);
4996 r = amdgpu_do_asic_reset(device_list_handle,
4997 reset_context);
e6c6338f
JZ
4998 if (r && r == -EAGAIN)
4999 goto retry;
5000 }
5001
5002 /*
5003 * add reset counter so that the following
5004 * resubmitted job could flush vmid
5005 */
5006 atomic_inc(&adev->gpu_reset_counter);
5007 continue;
5008 }
5009
5010 /* got the hw fence, signal finished fence */
5011 atomic_dec(ring->sched.score);
38d4e463 5012 dma_fence_put(s_job->s_fence->parent);
e6c6338f
JZ
5013 dma_fence_get(&s_job->s_fence->finished);
5014 dma_fence_signal(&s_job->s_fence->finished);
5015 dma_fence_put(&s_job->s_fence->finished);
5016
5017 /* remove node from list and free the job */
5018 spin_lock(&ring->sched.job_list_lock);
5019 list_del_init(&s_job->list);
5020 spin_unlock(&ring->sched.job_list_lock);
5021 ring->sched.ops->free_job(s_job);
5022 }
5023}
5024
26bc5340
AG
5025/**
5026 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
5027 *
982a820b 5028 * @adev: amdgpu_device pointer
26bc5340
AG
5029 * @job: which job trigger hang
5030 *
5031 * Attempt to reset the GPU if it has hung (all asics).
5032 * Attempt to do soft-reset or full-reset and reinitialize the ASIC.
5033 * Returns 0 for success or an error on failure.
5034 */
5035
54f329cc 5036int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
26bc5340
AG
5037 struct amdgpu_job *job)
5038{
1d721ed6 5039 struct list_head device_list, *device_list_handle = NULL;
7dd8c205 5040 bool job_signaled = false;
26bc5340 5041 struct amdgpu_hive_info *hive = NULL;
26bc5340 5042 struct amdgpu_device *tmp_adev = NULL;
1d721ed6 5043 int i, r = 0;
bb5c7235 5044 bool need_emergency_restart = false;
3f12acc8 5045 bool audio_suspended = false;
e6c6338f 5046 int tmp_vram_lost_counter;
04442bf7
LL
5047 struct amdgpu_reset_context reset_context;
5048
5049 memset(&reset_context, 0, sizeof(reset_context));
26bc5340 5050
6e3cd2a9 5051 /*
bb5c7235
WS
5052 * Special case: RAS triggered and full reset isn't supported
5053 */
5054 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
5055
d5ea093e
AG
5056 /*
5057 * Flush RAM to disk so that after reboot
5058 * the user can read the log and see why the system rebooted.
5059 */
bb5c7235 5060 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
5061 DRM_WARN("Emergency reboot.");
5062
5063 ksys_sync_helper();
5064 emergency_restart();
5065 }
5066
b823821f 5067 dev_info(adev->dev, "GPU %s begin!\n",
bb5c7235 5068 need_emergency_restart ? "jobs stop":"reset");
5069
5070 /*
5071	 * Here we trylock to avoid a chain of resets executing, whether
5072	 * triggered by jobs on different adevs in an XGMI hive or by jobs on
5073	 * different schedulers for the same device while this TO handler is running.
5074	 * We always reset all schedulers for the device and all devices in the XGMI
5075	 * hive, so that should take care of them too.
26bc5340 5076 */
5077 if (!amdgpu_sriov_vf(adev))
5078 hive = amdgpu_get_xgmi_hive(adev);
5079 if (hive) {
5080 if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
5081 DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
5082 job ? job->base.id : -1, hive->hive_id);
d95e8e97 5083 amdgpu_put_xgmi_hive(hive);
ff99849b 5084 if (job && job->vm)
91fb309d 5085 drm_sched_increase_karma(&job->base);
5086 return 0;
5087 }
5088 mutex_lock(&hive->hive_lock);
1d721ed6 5089 }
26bc5340 5090
5091 reset_context.method = AMD_RESET_METHOD_NONE;
5092 reset_context.reset_req_dev = adev;
5093 reset_context.job = job;
5094 reset_context.hive = hive;
5095 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
5096
5097 /*
5098	 * lock the device before we try to operate on the linked list;
5099	 * if we didn't get the device lock, don't touch the linked list since
5100	 * others may be iterating over it.
5101 */
5102 r = amdgpu_device_lock_hive_adev(adev, hive);
5103 if (r) {
5104 dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
5105 job ? job->base.id : -1);
5106
5107		/* even though we skipped this reset, we still need to mark the job guilty */
ff99849b 5108 if (job && job->vm)
5109 drm_sched_increase_karma(&job->base);
5110 goto skip_recovery;
5111 }
5112
5113 /*
5114 * Build list of devices to reset.
5115	 * In case we are in XGMI hive mode, re-sort the device list
5116	 * to put adev in the 1st position.
5117 */
5118 INIT_LIST_HEAD(&device_list);
175ac6ec 5119 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
655ce9cb 5120 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
5121 list_add_tail(&tmp_adev->reset_list, &device_list);
5122 if (!list_is_first(&adev->reset_list, &device_list))
5123 list_rotate_to_front(&adev->reset_list, &device_list);
5124 device_list_handle = &device_list;
26bc5340 5125 } else {
655ce9cb 5126 list_add_tail(&adev->reset_list, &device_list);
5127 device_list_handle = &device_list;
5128 }
5129
1d721ed6 5130 /* block all schedulers and reset given job's ring */
655ce9cb 5131 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5132 /*
5133		 * Try to put the audio codec into the suspend state
5134		 * before the GPU reset starts.
5135		 *
5136		 * The power domain of the graphics device is shared
5137		 * with the AZ (audio) power domain. Without this step,
5138		 * we may change the audio hardware from behind
5139		 * the audio driver's back, which will trigger
5140		 * audio codec errors.
5141 */
5142 if (!amdgpu_device_suspend_display_audio(tmp_adev))
5143 audio_suspended = true;
5144
5145 amdgpu_ras_set_error_query_ready(tmp_adev, false);
5146
5147 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
5148
428890a3 5149 if (!amdgpu_sriov_vf(tmp_adev))
5150 amdgpu_amdkfd_pre_reset(tmp_adev);
9e94d22c 5151
5152 /*
5153		 * Mark the ASICs to be reset as untracked first,
5154		 * and add them back after the reset completes.
5155 */
5156 amdgpu_unregister_gpu_instance(tmp_adev);
5157
087451f3 5158		drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true);
565d1941 5159
f1c1314b 5160 /* disable ras on ALL IPs */
bb5c7235 5161 if (!need_emergency_restart &&
b823821f 5162 amdgpu_device_ip_need_full_reset(tmp_adev))
f1c1314b 5163 amdgpu_ras_suspend(tmp_adev);
5164
5165 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5166 struct amdgpu_ring *ring = tmp_adev->rings[i];
5167
5168 if (!ring || !ring->sched.thread)
5169 continue;
5170
0b2d2c2e 5171 drm_sched_stop(&ring->sched, job ? &job->base : NULL);
7c6e68c7 5172
bb5c7235 5173 if (need_emergency_restart)
7c6e68c7 5174 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
1d721ed6 5175 }
8f8c80f4 5176 atomic_inc(&tmp_adev->gpu_reset_counter);
5177 }
5178
bb5c7235 5179 if (need_emergency_restart)
5180 goto skip_sched_resume;
5181
5182 /*
5183 * Must check guilty signal here since after this point all old
5184 * HW fences are force signaled.
5185 *
5186 * job->base holds a reference to parent fence
5187 */
5188 if (job && job->base.s_fence->parent &&
7dd8c205 5189 dma_fence_is_signaled(job->base.s_fence->parent)) {
1d721ed6 5190 job_signaled = true;
5191 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
5192 goto skip_hw_reset;
5193 }
5194
26bc5340 5195retry: /* Rest of adevs pre asic reset from XGMI hive. */
655ce9cb 5196 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
04442bf7 5197 r = amdgpu_device_pre_asic_reset(tmp_adev, &reset_context);
5198		/* TODO: Should we stop? */
5199 if (r) {
aac89168 5200 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
4a580877 5201 r, adev_to_drm(tmp_adev)->unique);
5202 tmp_adev->asic_reset_res = r;
5203 }
5204 }
5205
e6c6338f 5206 tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
26bc5340 5207 /* Actual ASIC resets if needed.*/
4f30d920 5208 /* Host driver will handle XGMI hive reset for SRIOV */
5209 if (amdgpu_sriov_vf(adev)) {
5210 r = amdgpu_device_reset_sriov(adev, job ? false : true);
5211 if (r)
5212 adev->asic_reset_res = r;
5213 } else {
04442bf7 5214 r = amdgpu_do_asic_reset(device_list_handle, &reset_context);
5215 if (r && r == -EAGAIN)
5216 goto retry;
5217 }
5218
5219skip_hw_reset:
5220
26bc5340 5221 /* Post ASIC reset for all devs .*/
655ce9cb 5222 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
7c6e68c7 5223
5224 /*
5225		 * Sometimes a later bad compute job can block a good gfx job, since the
5226		 * gfx and compute rings share internal GC hardware. Add an additional
5227		 * guilty-job recheck step to find the real guilty job: it synchronously
5228		 * resubmits and waits for the first pending job to signal. If that wait
5229		 * times out, the job is identified as the real guilty one.
5230 */
5231 if (amdgpu_gpu_recovery == 2 &&
5232 !(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter)))
5233 amdgpu_device_recheck_guilty_jobs(
5234 tmp_adev, device_list_handle, &reset_context);
e6c6338f 5235
5236 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5237 struct amdgpu_ring *ring = tmp_adev->rings[i];
5238
5239 if (!ring || !ring->sched.thread)
5240 continue;
5241
5242 /* No point to resubmit jobs if we didn't HW reset*/
5243 if (!tmp_adev->asic_reset_res && !job_signaled)
5244 drm_sched_resubmit_jobs(&ring->sched);
5245
5246 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
5247 }
5248
1053b9c9 5249 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) {
4a580877 5250 drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
5251 }
5252
5253 tmp_adev->asic_reset_res = 0;
5254
5255 if (r) {
5256 /* bad news, how to tell it to userspace ? */
12ffa55d 5257 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
5258 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
5259 } else {
12ffa55d 5260 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
5261 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
5262 DRM_WARN("smart shift update failed\n");
26bc5340 5263 }
7c6e68c7 5264 }
26bc5340 5265
7c6e68c7 5266skip_sched_resume:
655ce9cb 5267 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
428890a3 5268 /* unlock kfd: SRIOV would do it separately */
5269 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
5270 amdgpu_amdkfd_post_reset(tmp_adev);
8e2712e7 5271
5272		/* kfd_post_reset will do nothing if the kfd device is not initialized,
5273		 * so bring up kfd here if it was not initialized before
5274 */
5275 if (!adev->kfd.init_complete)
5276 amdgpu_amdkfd_device_init(adev);
5277
5278 if (audio_suspended)
5279 amdgpu_device_resume_display_audio(tmp_adev);
5280 amdgpu_device_unlock_adev(tmp_adev);
5281 }
5282
cbfd17f7 5283skip_recovery:
9e94d22c 5284 if (hive) {
53b3f8f4 5285 atomic_set(&hive->in_reset, 0);
9e94d22c 5286 mutex_unlock(&hive->hive_lock);
d95e8e97 5287 amdgpu_put_xgmi_hive(hive);
9e94d22c 5288 }
26bc5340 5289
91fb309d 5290 if (r && r != -EAGAIN)
26bc5340 5291 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
5292 return r;
5293}
5294
5295struct amdgpu_recover_work_struct {
5296 struct work_struct base;
5297 struct amdgpu_device *adev;
5298 struct amdgpu_job *job;
5299 int ret;
5300};
5301
5302static void amdgpu_device_queue_gpu_recover_work(struct work_struct *work)
5303{
5304 struct amdgpu_recover_work_struct *recover_work = container_of(work, struct amdgpu_recover_work_struct, base);
5305
5306 recover_work->ret = amdgpu_device_gpu_recover_imp(recover_work->adev, recover_work->job);
5307}
5308/*
5309 * Serialize gpu recover into reset domain single threaded wq
5310 */
5311int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
5312 struct amdgpu_job *job)
5313{
5314 struct amdgpu_recover_work_struct work = {.adev = adev, .job = job};
5315
5316 INIT_WORK(&work.base, amdgpu_device_queue_gpu_recover_work);
5317
5318 if (!queue_work(adev->reset_domain.wq, &work.base))
5319 return -EAGAIN;
5320
5321 flush_work(&work.base);
5322
5323 return work.ret;
5324}
5325
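/*
 * Illustrative sketch (not part of the original file): callers such as a job
 * timeout handler are expected to go through amdgpu_device_gpu_recover(),
 * which queues the work on the reset domain's single-threaded workqueue and
 * blocks until it completes, so concurrent callers are serialized.  The
 * handler below is hypothetical and only shows the calling convention.
 */
static void example_job_timedout(struct amdgpu_device *adev,
				 struct amdgpu_job *job)
{
	int r;

	/* returns the result of the recovery work, or -EAGAIN if the
	 * work could not be queued on the reset domain workqueue */
	r = amdgpu_device_gpu_recover(adev, job);
	if (r)
		dev_err(adev->dev, "GPU recovery returned %d\n", r);
}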
5326/**
5327 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
5328 *
5329 * @adev: amdgpu_device pointer
5330 *
5331 * Fetches and stores in the driver the PCIE capabilities (gen speed
5332 * and lanes) of the slot the device is in. Handles APUs and
5333 * virtualized environments where PCIE config space may not be available.
5334 */
5494d864 5335static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
d0dd7f0c 5336{
5d9a6330 5337 struct pci_dev *pdev;
5338 enum pci_bus_speed speed_cap, platform_speed_cap;
5339 enum pcie_link_width platform_link_width;
d0dd7f0c 5340
5341 if (amdgpu_pcie_gen_cap)
5342 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
d0dd7f0c 5343
5344 if (amdgpu_pcie_lane_cap)
5345 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
d0dd7f0c 5346
5347 /* covers APUs as well */
5348 if (pci_is_root_bus(adev->pdev->bus)) {
5349 if (adev->pm.pcie_gen_mask == 0)
5350 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
5351 if (adev->pm.pcie_mlw_mask == 0)
5352 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
d0dd7f0c 5353 return;
cd474ba0 5354 }
d0dd7f0c 5355
5356 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
5357 return;
5358
5359 pcie_bandwidth_available(adev->pdev, NULL,
5360 &platform_speed_cap, &platform_link_width);
c5313457 5361
cd474ba0 5362 if (adev->pm.pcie_gen_mask == 0) {
5363 /* asic caps */
5364 pdev = adev->pdev;
5365 speed_cap = pcie_get_speed_cap(pdev);
5366 if (speed_cap == PCI_SPEED_UNKNOWN) {
5367 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5368 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5369 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
cd474ba0 5370 } else {
5371 if (speed_cap == PCIE_SPEED_32_0GT)
5372 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5373 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5374 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5375 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5376 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
5377 else if (speed_cap == PCIE_SPEED_16_0GT)
5378 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5379 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5380 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5381 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
5382 else if (speed_cap == PCIE_SPEED_8_0GT)
5383 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5384 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5385 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5386 else if (speed_cap == PCIE_SPEED_5_0GT)
5387 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5388 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
5389 else
5390 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
5391 }
5392 /* platform caps */
c5313457 5393 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
5394 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5395 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5396 } else {
5397 if (platform_speed_cap == PCIE_SPEED_32_0GT)
5398 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5399 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5400 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5401 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5402 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
5403 else if (platform_speed_cap == PCIE_SPEED_16_0GT)
5404 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5405 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5406 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5407 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
c5313457 5408 else if (platform_speed_cap == PCIE_SPEED_8_0GT)
5409 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5410 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5411 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
c5313457 5412 else if (platform_speed_cap == PCIE_SPEED_5_0GT)
5413 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5414 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5415 else
5416 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
5417
5418 }
5419 }
5420 if (adev->pm.pcie_mlw_mask == 0) {
c5313457 5421 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
5422 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
5423 } else {
c5313457 5424 switch (platform_link_width) {
5d9a6330 5425 case PCIE_LNK_X32:
5426 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
5427 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5428 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5429 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5430 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5431 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5432 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5433 break;
5d9a6330 5434 case PCIE_LNK_X16:
5435 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5436 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5437 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5438 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5439 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5440 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5441 break;
5d9a6330 5442 case PCIE_LNK_X12:
5443 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5444 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5445 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5446 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5447 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5448 break;
5d9a6330 5449 case PCIE_LNK_X8:
5450 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5451 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5452 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5453 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5454 break;
5d9a6330 5455 case PCIE_LNK_X4:
5456 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5457 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5458 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5459 break;
5d9a6330 5460 case PCIE_LNK_X2:
5461 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5462 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5463 break;
5d9a6330 5464 case PCIE_LNK_X1:
5465 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
5466 break;
5467 default:
5468 break;
5469 }
5470 }
5471 }
5472}
d38ceaf9 5473
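/*
 * Illustrative sketch (not part of the original file): the masks built above
 * simply OR together CAIL_*_LINK_SPEED_SUPPORT_* flags, so a caller that
 * wants to know whether both the ASIC and the platform can run a given PCIE
 * gen only has to test the corresponding bits.  The helper below is
 * hypothetical.
 */
static bool example_pcie_gen3_supported(struct amdgpu_device *adev)
{
	/* ASIC capability and platform (slot) capability use separate flags */
	return (adev->pm.pcie_gen_mask & CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3) &&
	       (adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
}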
5474int amdgpu_device_baco_enter(struct drm_device *dev)
5475{
1348969a 5476 struct amdgpu_device *adev = drm_to_adev(dev);
7a22677b 5477 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
361dbd01 5478
4a580877 5479 if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
5480 return -ENOTSUPP;
5481
8ab0d6f0 5482 if (ras && adev->ras_enabled &&
acdae216 5483 adev->nbio.funcs->enable_doorbell_interrupt)
5484 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
5485
9530273e 5486 return amdgpu_dpm_baco_enter(adev);
5487}
5488
5489int amdgpu_device_baco_exit(struct drm_device *dev)
5490{
1348969a 5491 struct amdgpu_device *adev = drm_to_adev(dev);
7a22677b 5492 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
9530273e 5493 int ret = 0;
361dbd01 5494
4a580877 5495 if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
5496 return -ENOTSUPP;
5497
5498 ret = amdgpu_dpm_baco_exit(adev);
5499 if (ret)
5500 return ret;
7a22677b 5501
8ab0d6f0 5502 if (ras && adev->ras_enabled &&
acdae216 5503 adev->nbio.funcs->enable_doorbell_interrupt)
5504 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
5505
5506 if (amdgpu_passthrough(adev) &&
5507 adev->nbio.funcs->clear_doorbell_interrupt)
5508 adev->nbio.funcs->clear_doorbell_interrupt(adev);
5509
7a22677b 5510 return 0;
361dbd01 5511}
c9a6b82f 5512
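/*
 * Illustrative sketch (not part of the original file): BACO entry and exit
 * are meant to be used as a pair, e.g. around a runtime power management
 * cycle.  The function below is hypothetical and only shows the expected
 * pairing and error handling.
 */
static int example_baco_power_cycle(struct drm_device *dev)
{
	int r;

	r = amdgpu_device_baco_enter(dev);
	if (r)
		return r;	/* e.g. -ENOTSUPP when BACO is not available */

	/* ... device sits in BACO (bus active, chip off) ... */

	return amdgpu_device_baco_exit(dev);
}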
5513static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev)
5514{
5515 int i;
5516
5517 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5518 struct amdgpu_ring *ring = adev->rings[i];
5519
5520 if (!ring || !ring->sched.thread)
5521 continue;
5522
5523 cancel_delayed_work_sync(&ring->sched.work_tdr);
5524 }
5525}
5526
5527/**
5528 * amdgpu_pci_error_detected - Called when a PCI error is detected.
5529 * @pdev: PCI device struct
5530 * @state: PCI channel state
5531 *
5532 * Description: Called when a PCI error is detected.
5533 *
5534 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
5535 */
5536pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
5537{
5538 struct drm_device *dev = pci_get_drvdata(pdev);
5539 struct amdgpu_device *adev = drm_to_adev(dev);
acd89fca 5540 int i;
5541
5542 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
5543
5544 if (adev->gmc.xgmi.num_physical_nodes > 1) {
5545 DRM_WARN("No support for XGMI hive yet...");
5546 return PCI_ERS_RESULT_DISCONNECT;
5547 }
5548
5549 adev->pci_channel_state = state;
5550
5551 switch (state) {
5552 case pci_channel_io_normal:
5553 return PCI_ERS_RESULT_CAN_RECOVER;
acd89fca 5554 /* Fatal error, prepare for slot reset */
5555 case pci_channel_io_frozen:
5556 /*
5557		 * Cancel and wait for all TDRs in progress if we fail to
5558		 * set adev->in_gpu_reset in amdgpu_device_lock_adev
5559 *
5560 * Locking adev->reset_sem will prevent any external access
5561 * to GPU during PCI error recovery
5562 */
5563 while (!amdgpu_device_lock_adev(adev, NULL))
5564 amdgpu_cancel_all_tdr(adev);
5565
5566 /*
5567 * Block any work scheduling as we do for regular GPU reset
5568 * for the duration of the recovery
5569 */
5570 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5571 struct amdgpu_ring *ring = adev->rings[i];
5572
5573 if (!ring || !ring->sched.thread)
5574 continue;
5575
5576 drm_sched_stop(&ring->sched, NULL);
5577 }
8f8c80f4 5578 atomic_inc(&adev->gpu_reset_counter);
5579 return PCI_ERS_RESULT_NEED_RESET;
5580 case pci_channel_io_perm_failure:
5581 /* Permanent error, prepare for device removal */
5582 return PCI_ERS_RESULT_DISCONNECT;
5583 }
5584
5585 return PCI_ERS_RESULT_NEED_RESET;
5586}
5587
5588/**
5589 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
5590 * @pdev: pointer to PCI device
5591 */
5592pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
5593{
5594
5595 DRM_INFO("PCI error: mmio enabled callback!!\n");
5596
5597 /* TODO - dump whatever for debugging purposes */
5598
5599	/* This is called only if amdgpu_pci_error_detected returns
5600 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
5601 * works, no need to reset slot.
5602 */
5603
5604 return PCI_ERS_RESULT_RECOVERED;
5605}
5606
5607/**
5608 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
5609 * @pdev: PCI device struct
5610 *
5611 * Description: This routine is called by the pci error recovery
5612 * code after the PCI slot has been reset, just before we
5613 * should resume normal operations.
5614 */
5615pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
5616{
5617 struct drm_device *dev = pci_get_drvdata(pdev);
5618 struct amdgpu_device *adev = drm_to_adev(dev);
362c7b91 5619 int r, i;
04442bf7 5620 struct amdgpu_reset_context reset_context;
362c7b91 5621 u32 memsize;
7ac71382 5622 struct list_head device_list;
5623
5624 DRM_INFO("PCI error: slot reset callback!!\n");
5625
5626 memset(&reset_context, 0, sizeof(reset_context));
5627
7ac71382 5628 INIT_LIST_HEAD(&device_list);
655ce9cb 5629 list_add_tail(&adev->reset_list, &device_list);
7ac71382 5630
5631 /* wait for asic to come out of reset */
5632 msleep(500);
5633
7ac71382 5634 /* Restore PCI confspace */
c1dd4aa6 5635 amdgpu_device_load_pci_state(pdev);
c9a6b82f 5636
5637 /* confirm ASIC came out of reset */
5638 for (i = 0; i < adev->usec_timeout; i++) {
5639 memsize = amdgpu_asic_get_config_memsize(adev);
5640
5641 if (memsize != 0xffffffff)
5642 break;
5643 udelay(1);
5644 }
5645 if (memsize == 0xffffffff) {
5646 r = -ETIME;
5647 goto out;
5648 }
5649
5650 reset_context.method = AMD_RESET_METHOD_NONE;
5651 reset_context.reset_req_dev = adev;
5652 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
5653 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
5654
7afefb81 5655 adev->no_hw_access = true;
04442bf7 5656 r = amdgpu_device_pre_asic_reset(adev, &reset_context);
7afefb81 5657 adev->no_hw_access = false;
5658 if (r)
5659 goto out;
5660
04442bf7 5661 r = amdgpu_do_asic_reset(&device_list, &reset_context);
5662
5663out:
c9a6b82f 5664 if (!r) {
5665 if (amdgpu_device_cache_pci_state(adev->pdev))
5666 pci_restore_state(adev->pdev);
5667
5668 DRM_INFO("PCIe error recovery succeeded\n");
5669 } else {
5670 DRM_ERROR("PCIe error recovery failed, err:%d", r);
5671 amdgpu_device_unlock_adev(adev);
5672 }
5673
5674 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5675}
5676
5677/**
5678 * amdgpu_pci_resume() - resume normal ops after PCI reset
5679 * @pdev: pointer to PCI device
5680 *
5681 * Called when the error recovery driver tells us that it's
505199a3 5682 * OK to resume normal operation.
5683 */
5684void amdgpu_pci_resume(struct pci_dev *pdev)
5685{
5686 struct drm_device *dev = pci_get_drvdata(pdev);
5687 struct amdgpu_device *adev = drm_to_adev(dev);
acd89fca 5688 int i;
c9a6b82f 5689
5690
5691 DRM_INFO("PCI error: resume callback!!\n");
acd89fca 5692
5693 /* Only continue execution for the case of pci_channel_io_frozen */
5694 if (adev->pci_channel_state != pci_channel_io_frozen)
5695 return;
5696
5697 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5698 struct amdgpu_ring *ring = adev->rings[i];
5699
5700 if (!ring || !ring->sched.thread)
5701 continue;
5702
5703
5704 drm_sched_resubmit_jobs(&ring->sched);
5705 drm_sched_start(&ring->sched, true);
5706 }
5707
5708 amdgpu_device_unlock_adev(adev);
c9a6b82f 5709}
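/*
 * Illustrative sketch (not part of the original file): the four callbacks
 * above are wired into the PCI core through a struct pci_error_handlers
 * referenced from the driver's struct pci_driver (the hookup itself lives in
 * amdgpu_drv.c; the name below is only a placeholder showing its shape).
 */
static const struct pci_error_handlers example_amdgpu_pci_err_handler = {
	.error_detected	= amdgpu_pci_error_detected,
	.mmio_enabled	= amdgpu_pci_mmio_enabled,
	.slot_reset	= amdgpu_pci_slot_reset,
	.resume		= amdgpu_pci_resume,
};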
5710
5711bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5712{
5713 struct drm_device *dev = pci_get_drvdata(pdev);
5714 struct amdgpu_device *adev = drm_to_adev(dev);
5715 int r;
5716
5717 r = pci_save_state(pdev);
5718 if (!r) {
5719 kfree(adev->pci_state);
5720
5721 adev->pci_state = pci_store_saved_state(pdev);
5722
5723 if (!adev->pci_state) {
5724 DRM_ERROR("Failed to store PCI saved state");
5725 return false;
5726 }
5727 } else {
5728 DRM_WARN("Failed to save PCI state, err:%d\n", r);
5729 return false;
5730 }
5731
5732 return true;
5733}
5734
5735bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
5736{
5737 struct drm_device *dev = pci_get_drvdata(pdev);
5738 struct amdgpu_device *adev = drm_to_adev(dev);
5739 int r;
5740
5741 if (!adev->pci_state)
5742 return false;
5743
5744 r = pci_load_saved_state(pdev, adev->pci_state);
5745
5746 if (!r) {
5747 pci_restore_state(pdev);
5748 } else {
5749 DRM_WARN("Failed to load PCI state, err:%d\n", r);
5750 return false;
5751 }
5752
5753 return true;
5754}
5755
5756void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
5757 struct amdgpu_ring *ring)
5758{
5759#ifdef CONFIG_X86_64
5760 if (adev->flags & AMD_IS_APU)
5761 return;
5762#endif
5763 if (adev->gmc.xgmi.connected_to_cpu)
5764 return;
5765
5766 if (ring && ring->funcs->emit_hdp_flush)
5767 amdgpu_ring_emit_hdp_flush(ring);
5768 else
5769 amdgpu_asic_flush_hdp(adev, ring);
5770}
c1dd4aa6 5771
5772void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
5773 struct amdgpu_ring *ring)
5774{
5775#ifdef CONFIG_X86_64
5776 if (adev->flags & AMD_IS_APU)
5777 return;
5778#endif
5779 if (adev->gmc.xgmi.connected_to_cpu)
5780 return;
c1dd4aa6 5781
810085dd
EH
5782 amdgpu_asic_invalidate_hdp(adev, ring);
5783}
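/*
 * Illustrative sketch (not part of the original file): a typical caller
 * flushes the HDP cache after the CPU has written to a VRAM buffer through
 * the BAR so that the GPU observes the new contents.  The buffer and write
 * below are hypothetical; with no ring available the ASIC flush path is used.
 */
static void example_cpu_write_then_flush(struct amdgpu_device *adev,
					 void __iomem *vram_cpu_addr,
					 u32 value)
{
	/* CPU write goes through the host data path (HDP) */
	writel(value, vram_cpu_addr);

	/* make the write visible to the GPU; NULL ring -> asic flush */
	amdgpu_device_flush_hdp(adev, NULL);
}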
5784
5785/**
5786 * amdgpu_device_halt() - bring hardware to some kind of halt state
5787 *
5788 * @adev: amdgpu_device pointer
5789 *
5790 * Bring hardware to some kind of halt state so that no one can touch it
5792 * any more. It helps preserve the error context when an error occurs.
5793 * Compared to a simple hang, the system stays stable at least for SSH
5794 * access. Then it should be trivial to inspect the hardware state and
5795 * see what's going on. Implemented as follows:
5795 *
5796 * 1. drm_dev_unplug() makes device inaccessible to user space(IOCTLs, etc),
5797 * clears all CPU mappings to device, disallows remappings through page faults
5798 * 2. amdgpu_irq_disable_all() disables all interrupts
5799 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
5801 * 4. set adev->no_hw_access to avoid potential crashes after step 5
5801 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
5802 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
5803 * flush any in flight DMA operations
5804 */
5805void amdgpu_device_halt(struct amdgpu_device *adev)
5806{
5807 struct pci_dev *pdev = adev->pdev;
e0f943b4 5808 struct drm_device *ddev = adev_to_drm(adev);
5809
5810 drm_dev_unplug(ddev);
5811
5812 amdgpu_irq_disable_all(adev);
5813
5814 amdgpu_fence_driver_hw_fini(adev);
5815
5816 adev->no_hw_access = true;
5817
5818 amdgpu_device_unmap_mmio(adev);
5819
5820 pci_disable_device(pdev);
5821 pci_wait_for_pending_transaction(pdev);
5822}
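/*
 * Illustrative sketch (not part of the original file): amdgpu_device_halt()
 * is intended for fatal, unrecoverable error paths.  A hypothetical caller
 * simply halts the device and gives up rather than attempting a reset:
 */
static void example_handle_fatal_error(struct amdgpu_device *adev)
{
	dev_err(adev->dev, "unrecoverable error, halting device\n");

	/* after this the device is unreachable; only logs/SSH inspection remain */
	amdgpu_device_halt(adev);
}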