/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>

#include <drm/drm_atomic_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/pci.h>
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vangogh_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/yellow_carp_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"ALDEBARAN",
	"NAVI10",
	"NAVI14",
	"NAVI12",
	"SIENNA_CICHLID",
	"NAVY_FLOUNDER",
	"VANGOGH",
	"DIMGREY_CAVEFISH",
	"BEIGE_GOBY",
	"YELLOW_CARP",
	"LAST",
};

/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
 */

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return sysfs_emit(buf, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
		amdgpu_device_get_pcie_replay_count, NULL);

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);

/**
 * DOC: product_name
 *
 * The amdgpu driver provides a sysfs API for reporting the product name
 * for the device.
 * The file product_name is used for this and returns the product name
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_product_name(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return sysfs_emit(buf, "%s\n", adev->product_name);
}

static DEVICE_ATTR(product_name, S_IRUGO,
		amdgpu_device_get_product_name, NULL);

/**
 * DOC: product_number
 *
 * The amdgpu driver provides a sysfs API for reporting the part number
 * for the device.
 * The file product_number is used for this and returns the part number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_product_number(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return sysfs_emit(buf, "%s\n", adev->product_number);
}

static DEVICE_ATTR(product_number, S_IRUGO,
		amdgpu_device_get_product_number, NULL);

/**
 * DOC: serial_number
 *
 * The amdgpu driver provides a sysfs API for reporting the serial number
 * for the device.
 * The file serial_number is used for this and returns the serial number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_serial_number(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return sysfs_emit(buf, "%s\n", adev->serial);
}

static DEVICE_ATTR(serial_number, S_IRUGO,
		amdgpu_device_get_serial_number, NULL);

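/*
 * Illustrative sketch (assumption, not shown in this excerpt): attributes
 * declared with DEVICE_ATTR() above still have to be registered against the
 * PCI device before they show up under
 * /sys/bus/pci/devices/<domain:bus:dev.fn>/.  A typical registration, using
 * the generic driver-core helper rather than whatever this driver actually
 * does elsewhere, would look like:
 *
 *	r = device_create_file(adev->dev, &dev_attr_serial_number);
 *	if (r)
 *		dev_err(adev->dev, "Could not create serial_number sysfs file\n");
 */
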
/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise returns false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

/**
 * amdgpu_device_supports_smart_shift - Is the device dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size in bytes, sizeof(@buf) must be larger than @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       uint32_t *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(&adev->ddev, &idx))
		return;

#ifdef CONFIG_64BIT
	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		void __iomem *addr = adev->mman.aper_base_kaddr + pos;
		size_t count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			mb();
			memcpy_fromio(buf, addr, count);
		}

		if (count == size)
			goto exit;

		pos += count;
		buf += count / 4;
		size -= count;
	}
#endif

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		uint32_t tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *buf++);
		else
			*buf++ = RREG32_NO_KIQ(mmMM_DATA);
	}
	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);

#ifdef CONFIG_64BIT
exit:
#endif
	drm_dev_exit(idx);
}

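/*
 * Illustrative sketch (not part of the driver): a hypothetical caller that
 * copies the first 256 bytes of VRAM into a stack buffer using the helper
 * above.  The buffer is dword-typed because the helper works in 32-bit
 * units.
 *
 *	uint32_t data[64];
 *
 *	amdgpu_device_vram_access(adev, 0, data, sizeof(data), false);
 */
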
/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_sem))
			up_read(&adev->reset_sem);
		else
			lockdep_assert_held(&adev->reset_sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_sem)) {
			ret = amdgpu_kiq_rreg(adev, reg);
			up_read(&adev->reset_sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: bytes offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset: bytes offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_sem)) {
			amdgpu_kiq_wreg(adev, reg, v);
			up_read(&adev->reset_sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

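/*
 * Illustrative sketch (not part of the driver): read-modify-write of a
 * register through the helpers above.  The register offset and bit are
 * made-up placeholders; real code uses the generated register headers and
 * usually goes through the RREG32()/WREG32() macros that wrap these
 * functions.
 *
 *	uint32_t v = amdgpu_device_rreg(adev, example_reg_offset, 0);
 *
 *	v |= example_enable_bit;
 *	amdgpu_device_wreg(adev, example_reg_offset, v, 0);
 */
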
/*
 * amdgpu_mm_wreg_mmio_rlc - write register either with mmio or with RLC path if in range
 *
 * This function is invoked only for the debugfs register access.
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v, 0, 0);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_mm_rdoorbell - read a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (CIK).
 */
u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (index < adev->doorbell.num_doorbells) {
		return readl(adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}

/**
 * amdgpu_mm_wdoorbell - write a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (CIK).
 */
void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (index < adev->doorbell.num_doorbells) {
		writel(v, adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

/**
 * amdgpu_mm_rdoorbell64 - read a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (index < adev->doorbell.num_doorbells) {
		return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}

/**
 * amdgpu_mm_wdoorbell64 - write a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (index < adev->doorbell.num_doorbells) {
		atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

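/*
 * Illustrative sketch (not part of the driver): ring code elsewhere in
 * amdgpu typically publishes its write pointer through these helpers,
 * roughly:
 *
 *	if (ring->use_doorbell)
 *		amdgpu_mm_wdoorbell64(adev, ring->doorbell_index, ring->wptr);
 *
 * The ring field names are assumptions for illustration; the real ring
 * structures live in other files.
 */
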
/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 pcie_index, u32 pcie_data,
				u32 reg_addr)
{
	unsigned long flags;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 pcie_index, u32 pcie_data,
				  u32 reg_addr)
{
	unsigned long flags;
	u64 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register address
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 pcie_index, u32 pcie_data,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 pcie_index, u32 pcie_data,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

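/*
 * Illustrative sketch (assumption, not shown in this excerpt): asic code
 * usually wires its PCIe indirect access callbacks to these helpers with
 * the asic-specific index/data register offsets, roughly:
 *
 *	static u32 example_pcie_rreg(struct amdgpu_device *adev, u32 reg)
 *	{
 *		return amdgpu_device_indirect_rreg(adev, example_pcie_index,
 *						   example_pcie_data, reg);
 *	}
 *
 * where example_pcie_index/example_pcie_data are placeholders for the real
 * mmPCIE_INDEX2/mmPCIE_DATA2 style offsets defined per asic.
 */
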
/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	amdgpu_asic_pre_asic_init(adev);

	return amdgpu_atom_asic_init(adev->mode_info.atom_context);
}

AD
868/**
869 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
870 *
982a820b 871 * @adev: amdgpu_device pointer
e3ecdffa
AD
872 *
873 * Allocates a scratch page of VRAM for use by various things in the
874 * driver.
875 */
06ec9070 876static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
d38ceaf9 877{
a4a02777
CK
878 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
879 PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
880 &adev->vram_scratch.robj,
881 &adev->vram_scratch.gpu_addr,
882 (void **)&adev->vram_scratch.ptr);
d38ceaf9
AD
883}
884
e3ecdffa
AD
885/**
886 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
887 *
982a820b 888 * @adev: amdgpu_device pointer
e3ecdffa
AD
889 *
890 * Frees the VRAM scratch page.
891 */
06ec9070 892static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
d38ceaf9 893{
078af1a3 894 amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
d38ceaf9
AD
895}
896
/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

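/*
 * Illustrative sketch (not part of the driver): a golden register list is a
 * flat array of {offset, and_mask, or_mask} triplets.  The register names
 * below are placeholders, not real registers.
 *
 *	static const u32 example_golden_settings[] = {
 *		mmEXAMPLE_REG_A, 0xffffffff, 0x00000100,
 *		mmEXAMPLE_REG_B, 0x0000000f, 0x00000002,
 *	};
 *
 *	amdgpu_device_program_register_sequence(adev, example_golden_settings,
 *			ARRAY_SIZE(example_golden_settings));
 */
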
/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * GPU doorbell aperture helper functions.
 */
/**
 * amdgpu_device_doorbell_init - Init doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Init doorbell driver information (CIK)
 * Returns 0 on success, error on failure.
 */
static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
{

	/* No doorbell on SI hardware generation */
	if (adev->asic_type < CHIP_BONAIRE) {
		adev->doorbell.base = 0;
		adev->doorbell.size = 0;
		adev->doorbell.num_doorbells = 0;
		adev->doorbell.ptr = NULL;
		return 0;
	}

	if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
		return -EINVAL;

	amdgpu_asic_init_doorbell_index(adev);

	/* doorbell bar mapping */
	adev->doorbell.base = pci_resource_start(adev->pdev, 2);
	adev->doorbell.size = pci_resource_len(adev->pdev, 2);

	adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
					     adev->doorbell_index.max_assignment + 1);
	if (adev->doorbell.num_doorbells == 0)
		return -EINVAL;

	/* For Vega, reserve and map two pages on doorbell BAR since SDMA
	 * paging queue doorbell use the second page. The
	 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
	 * doorbells are in the first page. So with paging queue enabled,
	 * num_doorbells needs one extra page (0x400 dwords).
	 */
	if (adev->asic_type >= CHIP_VEGA10)
		adev->doorbell.num_doorbells += 0x400;

	adev->doorbell.ptr = ioremap(adev->doorbell.base,
				     adev->doorbell.num_doorbells *
				     sizeof(u32));
	if (adev->doorbell.ptr == NULL)
		return -ENOMEM;

	return 0;
}

/**
 * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Tear down doorbell driver information (CIK)
 */
static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
{
	iounmap(adev->doorbell.ptr);
	adev->doorbell.ptr = NULL;
}

/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or a negative error code on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);

	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	wb >>= 3;
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
}

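/*
 * Illustrative sketch (not part of the driver): a typical writeback slot
 * lifetime as used by ring/fence code elsewhere in amdgpu.
 *
 *	u32 wb;
 *
 *	if (!amdgpu_device_wb_get(adev, &wb)) {
 *		u64 gpu_addr = adev->wb.gpu_addr + wb * 4;
 *		volatile u32 *cpu_ptr = &adev->wb.wb[wb];
 *
 *		// point the engine at gpu_addr, poll *cpu_ptr from the CPU
 *
 *		amdgpu_device_wb_free(adev, wb);
 *	}
 */
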
/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the size we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned i;
	u16 cmd;
	int r;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_device_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_device_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

/*
 * GPU helper functions.
 */
/**
 * amdgpu_device_need_post - check if the hw need post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if posting is needed, false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
		 * some old smc fw still need driver do vPost otherwise gpu hang, while
		 * those smc fw version above 22.15 doesn't have this flaw, so we force
		 * vpost executed for smc version below 22.15
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->gmc.xgmi.pending_reset)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @cookie: amdgpu_device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
{
	struct amdgpu_device *adev = cookie;

	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines number of bits in page table versus page directory,
 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
 * page table and the remaining bits are in the page directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}

/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
	struct sysinfo si;
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;

	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("Smu memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("Not enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}

/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
 */
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
{
	if (amdgpu_sched_jobs < 4) {
		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = 4;
	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
	}

	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
		/* gart size must be greater or equal to 32M */
		dev_warn(adev->dev, "gart size (%d) too small\n",
			 amdgpu_gart_size);
		amdgpu_gart_size = -1;
	}

	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
		/* gtt size must be greater or equal to 32M */
		dev_warn(adev->dev, "gtt size (%d) too small\n",
			 amdgpu_gtt_size);
		amdgpu_gtt_size = -1;
	}

	/* valid range is between 4 and 9 inclusive */
	if (amdgpu_vm_fragment_size != -1 &&
	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
		dev_warn(adev->dev, "valid range is between 4 and 9\n");
		amdgpu_vm_fragment_size = -1;
	}

	if (amdgpu_sched_hw_submission < 2) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = 2;
	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
	}

	amdgpu_device_check_smu_prv_buffer_size(adev);

	amdgpu_device_check_vm_size(adev);

	amdgpu_device_check_block_size(adev);

	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

	amdgpu_gmc_tmz_set(adev);

	amdgpu_gmc_noretry_set(adev);

	return 0;
}

/**
 * amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver. Suspends or resumes
 * the asics before or after it is powered up using ACPI methods.
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
					enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	int r;

	if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
		return;

	if (state == VGA_SWITCHEROO_ON) {
		pr_info("switched on\n");
		/* don't suspend or resume card normally */
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

		pci_set_power_state(pdev, PCI_D0);
		amdgpu_device_load_pci_state(pdev);
		r = pci_enable_device(pdev);
		if (r)
			DRM_WARN("pci_enable_device failed (%d)\n", r);
		amdgpu_device_resume(dev, true);

		dev->switch_power_state = DRM_SWITCH_POWER_ON;
	} else {
		pr_info("switched off\n");
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
		amdgpu_device_suspend(dev, true);
		amdgpu_device_cache_pci_state(pdev);
		/* Shut down the device */
		pci_disable_device(pdev);
		pci_set_power_state(pdev, PCI_D3cold);
		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
	}
}

/**
 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
 *
 * @pdev: pci dev pointer
 *
 * Callback for the switcheroo driver. Check if the switcheroo
 * state can be changed.
 * Returns true if the state can be changed, false if not.
 */
static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

	/*
	 * FIXME: open_count is protected by drm_global_mutex but that would lead to
	 * locking inversion with the driver load path. And the access here is
	 * completely racy anyway. So don't bother with locking for now.
	 */
	return atomic_read(&dev->open_count) == 0;
}

static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
};

/**
 * amdgpu_device_ip_set_clockgating_state - set the CG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: clockgating state (gate or ungate)
 *
 * Sets the requested clockgating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_clockgating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_clockgating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

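/*
 * Illustrative sketch (not part of the driver): other IP code uses this
 * helper to gate or ungate clocks for a whole block, e.g.
 *
 *	amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_GFX,
 *					       AMD_CG_STATE_GATE);
 */
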
/**
 * amdgpu_device_ip_set_powergating_state - set the PG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: powergating state (gate or ungate)
 *
 * Sets the requested powergating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u32 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
			if (r)
				return r;
			break;
		}
	}
	return 0;

}

/**
 * amdgpu_device_ip_is_idle - is the hardware IP idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is idle or not.
 * Returns true if the IP is idle, false if not.
 */
bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
			      enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
	}
	return true;

}

/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * return 0 if equal or greater
 * return 1 if smaller or the ip_block doesn't exist
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			((ip_block->version->major == major) &&
			(ip_block->version->minor >= minor))))
		return 0;

	return 1;
}

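/*
 * Illustrative sketch (not part of the driver): callers use the helper above
 * to guard features on a minimum IP version, e.g. "GFX 8.1 or newer":
 *
 *	if (!amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_GFX, 8, 1)) {
 *		// the asic has a GFX IP block of version >= 8.1
 *	}
 */
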
/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	switch (ip_block_version->type) {
	case AMD_IP_BLOCK_TYPE_VCN:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
			return 0;
		break;
	case AMD_IP_BLOCK_TYPE_JPEG:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
			return 0;
		break;
	default:
		break;
	}

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		 ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}

e3ecdffa
AD
1735/**
1736 * amdgpu_device_enable_virtual_display - enable virtual display feature
1737 *
1738 * @adev: amdgpu_device pointer
1739 *
1740 * Enabled the virtual display feature if the user has enabled it via
1741 * the module parameter virtual_display. This feature provides a virtual
1742 * display hardware on headless boards or in virtualized environments.
1743 * This function parses and validates the configuration string specified by
1744 * the user and configures the virtual display configuration (number of
1745 * virtual connectors, crtcs, etc.) specified.
1746 */
483ef985 1747static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
9accf2fd
ED
1748{
1749 adev->enable_virtual_display = false;
1750
1751 if (amdgpu_virtual_display) {
8f66090b 1752 const char *pci_address_name = pci_name(adev->pdev);
0f66356d 1753 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
9accf2fd
ED
1754
1755 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1756 pciaddstr_tmp = pciaddstr;
0f66356d
ED
1757 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1758 pciaddname = strsep(&pciaddname_tmp, ",");
967de2a9
YT
1759 if (!strcmp("all", pciaddname)
1760 || !strcmp(pci_address_name, pciaddname)) {
0f66356d
ED
1761 long num_crtc;
1762 int res = -1;
1763
9accf2fd 1764 adev->enable_virtual_display = true;
0f66356d
ED
1765
1766 if (pciaddname_tmp)
1767 res = kstrtol(pciaddname_tmp, 10,
1768 &num_crtc);
1769
1770 if (!res) {
1771 if (num_crtc < 1)
1772 num_crtc = 1;
1773 if (num_crtc > 6)
1774 num_crtc = 6;
1775 adev->mode_info.num_crtc = num_crtc;
1776 } else {
1777 adev->mode_info.num_crtc = 1;
1778 }
9accf2fd
ED
1779 break;
1780 }
1781 }
1782
0f66356d
ED
1783 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1784 amdgpu_virtual_display, pci_address_name,
1785 adev->enable_virtual_display, adev->mode_info.num_crtc);
9accf2fd
ED
1786
1787 kfree(pciaddstr);
1788 }
1789}
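
For illustration, a small userspace sketch of the string format parsed above, "<pci address>[,<num_crtc>];..." (or "all"), using the same strsep()-based splitting and the 1..6 crtc clamp. The buffer contents are an arbitrary example, not a recommended setting, and the program is a standalone sketch rather than driver code:

#define _DEFAULT_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	char buf[] = "0000:04:00.0,2;0000:0c:00.0";
	char *cursor = buf, *entry, *addr;

	while ((entry = strsep(&cursor, ";")) != NULL) {
		long num_crtc = 1;	/* default when no count is given */

		addr = strsep(&entry, ",");	/* entry now points at the crtc count, or NULL */
		if (entry) {
			num_crtc = strtol(entry, NULL, 10);
			if (num_crtc < 1)
				num_crtc = 1;
			if (num_crtc > 6)
				num_crtc = 6;
		}
		printf("device %s -> %ld virtual crtc(s)\n", addr, num_crtc);
	}
	return 0;
}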
1790
e3ecdffa
AD
1791/**
1792 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1793 *
1794 * @adev: amdgpu_device pointer
1795 *
1796 * Parses the asic configuration parameters specified in the gpu info
1797 * firmware and makes them available to the driver for use in configuring
1798 * the asic.
1799 * Returns 0 on success, -EINVAL on failure.
1800 */
e2a75f88
AD
1801static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1802{
e2a75f88 1803 const char *chip_name;
c0a43457 1804 char fw_name[40];
e2a75f88
AD
1805 int err;
1806 const struct gpu_info_firmware_header_v1_0 *hdr;
1807
ab4fe3e1
HR
1808 adev->firmware.gpu_info_fw = NULL;
1809
72de33f8 1810 if (adev->mman.discovery_bin) {
258620d0 1811 amdgpu_discovery_get_gfx_info(adev);
cc375d8c
TY
1812
1813 /*
1814 * FIXME: The bounding box is still needed by Navi12, so
1815 * temporarily read it from gpu_info firmware. Should be dropped
1816 * when DAL no longer needs it.
1817 */
1818 if (adev->asic_type != CHIP_NAVI12)
1819 return 0;
258620d0
AD
1820 }
1821
e2a75f88 1822 switch (adev->asic_type) {
e2a75f88
AD
1823#ifdef CONFIG_DRM_AMDGPU_SI
1824 case CHIP_VERDE:
1825 case CHIP_TAHITI:
1826 case CHIP_PITCAIRN:
1827 case CHIP_OLAND:
1828 case CHIP_HAINAN:
1829#endif
1830#ifdef CONFIG_DRM_AMDGPU_CIK
1831 case CHIP_BONAIRE:
1832 case CHIP_HAWAII:
1833 case CHIP_KAVERI:
1834 case CHIP_KABINI:
1835 case CHIP_MULLINS:
1836#endif
da87c30b
AD
1837 case CHIP_TOPAZ:
1838 case CHIP_TONGA:
1839 case CHIP_FIJI:
1840 case CHIP_POLARIS10:
1841 case CHIP_POLARIS11:
1842 case CHIP_POLARIS12:
1843 case CHIP_VEGAM:
1844 case CHIP_CARRIZO:
1845 case CHIP_STONEY:
27c0bc71 1846 case CHIP_VEGA20:
44b3253a 1847 case CHIP_ALDEBARAN:
84d244a3
JC
1848 case CHIP_SIENNA_CICHLID:
1849 case CHIP_NAVY_FLOUNDER:
eac88a5f 1850 case CHIP_DIMGREY_CAVEFISH:
0e5f4b09 1851 case CHIP_BEIGE_GOBY:
e2a75f88
AD
1852 default:
1853 return 0;
1854 case CHIP_VEGA10:
1855 chip_name = "vega10";
1856 break;
3f76dced
AD
1857 case CHIP_VEGA12:
1858 chip_name = "vega12";
1859 break;
2d2e5e7e 1860 case CHIP_RAVEN:
54f78a76 1861 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
54c4d17e 1862 chip_name = "raven2";
54f78a76 1863 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
741deade 1864 chip_name = "picasso";
54c4d17e
FX
1865 else
1866 chip_name = "raven";
2d2e5e7e 1867 break;
65e60f6e
LM
1868 case CHIP_ARCTURUS:
1869 chip_name = "arcturus";
1870 break;
b51a26a0 1871 case CHIP_RENOIR:
2e62f0b5
PL
1872 if (adev->apu_flags & AMD_APU_IS_RENOIR)
1873 chip_name = "renoir";
1874 else
1875 chip_name = "green_sardine";
b51a26a0 1876 break;
23c6268e
HR
1877 case CHIP_NAVI10:
1878 chip_name = "navi10";
1879 break;
ed42cfe1
XY
1880 case CHIP_NAVI14:
1881 chip_name = "navi14";
1882 break;
42b325e5
XY
1883 case CHIP_NAVI12:
1884 chip_name = "navi12";
1885 break;
4e52a9f8
HR
1886 case CHIP_VANGOGH:
1887 chip_name = "vangogh";
1888 break;
8bf84f60
AL
1889 case CHIP_YELLOW_CARP:
1890 chip_name = "yellow_carp";
1891 break;
e2a75f88
AD
1892 }
1893
1894 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
ab4fe3e1 1895 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
e2a75f88
AD
1896 if (err) {
1897 dev_err(adev->dev,
1898 "Failed to load gpu_info firmware \"%s\"\n",
1899 fw_name);
1900 goto out;
1901 }
ab4fe3e1 1902 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
e2a75f88
AD
1903 if (err) {
1904 dev_err(adev->dev,
1905 "Failed to validate gpu_info firmware \"%s\"\n",
1906 fw_name);
1907 goto out;
1908 }
1909
ab4fe3e1 1910 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
e2a75f88
AD
1911 amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1912
1913 switch (hdr->version_major) {
1914 case 1:
1915 {
1916 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
ab4fe3e1 1917 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
e2a75f88
AD
1918 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1919
cc375d8c
TY
1920 /*
1921 * Should be dropped when DAL no longer needs it.
1922 */
1923 if (adev->asic_type == CHIP_NAVI12)
ec51d3fa
XY
1924 goto parse_soc_bounding_box;
1925
b5ab16bf
AD
1926 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1927 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1928 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1929 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
e2a75f88 1930 adev->gfx.config.max_texture_channel_caches =
b5ab16bf
AD
1931 le32_to_cpu(gpu_info_fw->gc_num_tccs);
1932 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1933 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1934 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1935 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
e2a75f88 1936 adev->gfx.config.double_offchip_lds_buf =
b5ab16bf
AD
1937 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1938 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
51fd0370
HZ
1939 adev->gfx.cu_info.max_waves_per_simd =
1940 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1941 adev->gfx.cu_info.max_scratch_slots_per_cu =
1942 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
1943 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
48321c3d 1944 if (hdr->version_minor >= 1) {
35c2e910
HZ
1945 const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1946 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1947 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1948 adev->gfx.config.num_sc_per_sh =
1949 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
1950 adev->gfx.config.num_packer_per_sc =
1951 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
1952 }
ec51d3fa
XY
1953
1954parse_soc_bounding_box:
ec51d3fa
XY
1955 /*
1956 * soc bounding box info is not integrated in the discovery table,
258620d0 1957 * we always need to parse it from gpu info firmware if needed.
ec51d3fa 1958 */
48321c3d
HW
1959 if (hdr->version_minor == 2) {
1960 const struct gpu_info_firmware_v1_2 *gpu_info_fw =
1961 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
1962 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1963 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
1964 }
e2a75f88
AD
1965 break;
1966 }
1967 default:
1968 dev_err(adev->dev,
1969 "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
1970 err = -EINVAL;
1971 goto out;
1972 }
1973out:
e2a75f88
AD
1974 return err;
1975}
1976
e3ecdffa
AD
1977/**
1978 * amdgpu_device_ip_early_init - run early init for hardware IPs
1979 *
1980 * @adev: amdgpu_device pointer
1981 *
1982 * Early initialization pass for hardware IPs. The hardware IPs that make
1983 * up each asic are discovered and each IP's early_init callback is run. This
1984 * is the first stage in initializing the asic.
1985 * Returns 0 on success, negative error code on failure.
1986 */
06ec9070 1987static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
d38ceaf9 1988{
aaa36a97 1989 int i, r;
d38ceaf9 1990
483ef985 1991 amdgpu_device_enable_virtual_display(adev);
a6be7570 1992
00a979f3 1993 if (amdgpu_sriov_vf(adev)) {
00a979f3 1994 r = amdgpu_virt_request_full_gpu(adev, true);
aaa36a97
AD
1995 if (r)
1996 return r;
00a979f3
WS
1997 }
1998
d38ceaf9 1999 switch (adev->asic_type) {
33f34802
KW
2000#ifdef CONFIG_DRM_AMDGPU_SI
2001 case CHIP_VERDE:
2002 case CHIP_TAHITI:
2003 case CHIP_PITCAIRN:
2004 case CHIP_OLAND:
2005 case CHIP_HAINAN:
295d0daf 2006 adev->family = AMDGPU_FAMILY_SI;
33f34802
KW
2007 r = si_set_ip_blocks(adev);
2008 if (r)
2009 return r;
2010 break;
2011#endif
a2e73f56
AD
2012#ifdef CONFIG_DRM_AMDGPU_CIK
2013 case CHIP_BONAIRE:
2014 case CHIP_HAWAII:
2015 case CHIP_KAVERI:
2016 case CHIP_KABINI:
2017 case CHIP_MULLINS:
e1ad2d53 2018 if (adev->flags & AMD_IS_APU)
a2e73f56 2019 adev->family = AMDGPU_FAMILY_KV;
e1ad2d53
AD
2020 else
2021 adev->family = AMDGPU_FAMILY_CI;
a2e73f56
AD
2022
2023 r = cik_set_ip_blocks(adev);
2024 if (r)
2025 return r;
2026 break;
2027#endif
da87c30b
AD
2028 case CHIP_TOPAZ:
2029 case CHIP_TONGA:
2030 case CHIP_FIJI:
2031 case CHIP_POLARIS10:
2032 case CHIP_POLARIS11:
2033 case CHIP_POLARIS12:
2034 case CHIP_VEGAM:
2035 case CHIP_CARRIZO:
2036 case CHIP_STONEY:
2037 if (adev->flags & AMD_IS_APU)
2038 adev->family = AMDGPU_FAMILY_CZ;
2039 else
2040 adev->family = AMDGPU_FAMILY_VI;
2041
2042 r = vi_set_ip_blocks(adev);
2043 if (r)
2044 return r;
2045 break;
e48a3cd9
AD
2046 case CHIP_VEGA10:
2047 case CHIP_VEGA12:
e4bd8170 2048 case CHIP_VEGA20:
e48a3cd9 2049 case CHIP_RAVEN:
61cf44c1 2050 case CHIP_ARCTURUS:
b51a26a0 2051 case CHIP_RENOIR:
c00a18ec 2052 case CHIP_ALDEBARAN:
70534d1e 2053 if (adev->flags & AMD_IS_APU)
2ca8a5d2
CZ
2054 adev->family = AMDGPU_FAMILY_RV;
2055 else
2056 adev->family = AMDGPU_FAMILY_AI;
460826e6
KW
2057
2058 r = soc15_set_ip_blocks(adev);
2059 if (r)
2060 return r;
2061 break;
0a5b8c7b 2062 case CHIP_NAVI10:
7ecb5cd4 2063 case CHIP_NAVI14:
4808cf9c 2064 case CHIP_NAVI12:
11e8aef5 2065 case CHIP_SIENNA_CICHLID:
41f446bf 2066 case CHIP_NAVY_FLOUNDER:
144722fa 2067 case CHIP_DIMGREY_CAVEFISH:
b41f5b7a 2068 case CHIP_BEIGE_GOBY:
4e52a9f8 2069 case CHIP_VANGOGH:
8bf84f60 2070 case CHIP_YELLOW_CARP:
4e52a9f8
HR
2071 if (adev->asic_type == CHIP_VANGOGH)
2072 adev->family = AMDGPU_FAMILY_VGH;
8bf84f60
AL
2073 else if (adev->asic_type == CHIP_YELLOW_CARP)
2074 adev->family = AMDGPU_FAMILY_YC;
4e52a9f8
HR
2075 else
2076 adev->family = AMDGPU_FAMILY_NV;
0a5b8c7b
HR
2077
2078 r = nv_set_ip_blocks(adev);
2079 if (r)
2080 return r;
2081 break;
d38ceaf9
AD
2082 default:
2083 /* FIXME: not supported yet */
2084 return -EINVAL;
2085 }
2086
1884734a 2087 amdgpu_amdkfd_device_probe(adev);
2088
3b94fb10 2089 adev->pm.pp_feature = amdgpu_pp_feature_mask;
a35ad98b 2090 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
00544006 2091 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
4215a119
HC
2092 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2093 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
00f54b97 2094
d38ceaf9
AD
2095 for (i = 0; i < adev->num_ip_blocks; i++) {
2096 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
ed8cf00c
HR
2097 DRM_ERROR("disabled ip block: %d <%s>\n",
2098 i, adev->ip_blocks[i].version->funcs->name);
a1255107 2099 adev->ip_blocks[i].status.valid = false;
d38ceaf9 2100 } else {
a1255107
AD
2101 if (adev->ip_blocks[i].version->funcs->early_init) {
2102 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2c1a2784 2103 if (r == -ENOENT) {
a1255107 2104 adev->ip_blocks[i].status.valid = false;
2c1a2784 2105 } else if (r) {
a1255107
AD
2106 DRM_ERROR("early_init of IP block <%s> failed %d\n",
2107 adev->ip_blocks[i].version->funcs->name, r);
d38ceaf9 2108 return r;
2c1a2784 2109 } else {
a1255107 2110 adev->ip_blocks[i].status.valid = true;
2c1a2784 2111 }
974e6b64 2112 } else {
a1255107 2113 adev->ip_blocks[i].status.valid = true;
d38ceaf9 2114 }
d38ceaf9 2115 }
21a249ca
AD
2116 /* get the vbios after the asic_funcs are set up */
2117 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
6e29c227
AD
2118 r = amdgpu_device_parse_gpu_info_fw(adev);
2119 if (r)
2120 return r;
2121
21a249ca
AD
2122 /* Read BIOS */
2123 if (!amdgpu_get_bios(adev))
2124 return -EINVAL;
2125
2126 r = amdgpu_atombios_init(adev);
2127 if (r) {
2128 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2129 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2130 return r;
2131 }
77eabc6f
PJZ
2132
2133 /* get pf2vf msg info at its earliest time */
2134 if (amdgpu_sriov_vf(adev))
2135 amdgpu_virt_init_data_exchange(adev);
2136
21a249ca 2137 }
d38ceaf9
AD
2138 }
2139
395d1fb9
NH
2140 adev->cg_flags &= amdgpu_cg_mask;
2141 adev->pg_flags &= amdgpu_pg_mask;
2142
d38ceaf9
AD
2143 return 0;
2144}
2145
0a4f2520
RZ
2146static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2147{
2148 int i, r;
2149
2150 for (i = 0; i < adev->num_ip_blocks; i++) {
2151 if (!adev->ip_blocks[i].status.sw)
2152 continue;
2153 if (adev->ip_blocks[i].status.hw)
2154 continue;
2155 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2d11fd3f 2156 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
0a4f2520
RZ
2157 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2158 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2159 if (r) {
2160 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2161 adev->ip_blocks[i].version->funcs->name, r);
2162 return r;
2163 }
2164 adev->ip_blocks[i].status.hw = true;
2165 }
2166 }
2167
2168 return 0;
2169}
2170
2171static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2172{
2173 int i, r;
2174
2175 for (i = 0; i < adev->num_ip_blocks; i++) {
2176 if (!adev->ip_blocks[i].status.sw)
2177 continue;
2178 if (adev->ip_blocks[i].status.hw)
2179 continue;
2180 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2181 if (r) {
2182 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2183 adev->ip_blocks[i].version->funcs->name, r);
2184 return r;
2185 }
2186 adev->ip_blocks[i].status.hw = true;
2187 }
2188
2189 return 0;
2190}
2191
7a3e0bb2
RZ
2192static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2193{
2194 int r = 0;
2195 int i;
80f41f84 2196 uint32_t smu_version;
7a3e0bb2
RZ
2197
2198 if (adev->asic_type >= CHIP_VEGA10) {
2199 for (i = 0; i < adev->num_ip_blocks; i++) {
482f0e53
ML
2200 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2201 continue;
2202
e3c1b071 2203 if (!adev->ip_blocks[i].status.sw)
2204 continue;
2205
482f0e53
ML
2206 /* no need to do the fw loading again if already done*/
2207 if (adev->ip_blocks[i].status.hw == true)
2208 break;
2209
53b3f8f4 2210 if (amdgpu_in_reset(adev) || adev->in_suspend) {
482f0e53
ML
2211 r = adev->ip_blocks[i].version->funcs->resume(adev);
2212 if (r) {
2213 DRM_ERROR("resume of IP block <%s> failed %d\n",
7a3e0bb2 2214 adev->ip_blocks[i].version->funcs->name, r);
482f0e53
ML
2215 return r;
2216 }
2217 } else {
2218 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2219 if (r) {
2220 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2221 adev->ip_blocks[i].version->funcs->name, r);
2222 return r;
7a3e0bb2 2223 }
7a3e0bb2 2224 }
482f0e53
ML
2225
2226 adev->ip_blocks[i].status.hw = true;
2227 break;
7a3e0bb2
RZ
2228 }
2229 }
482f0e53 2230
8973d9ec
ED
2231 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2232 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
7a3e0bb2 2233
80f41f84 2234 return r;
7a3e0bb2
RZ
2235}
2236
e3ecdffa
AD
2237/**
2238 * amdgpu_device_ip_init - run init for hardware IPs
2239 *
2240 * @adev: amdgpu_device pointer
2241 *
2242 * Main initialization pass for hardware IPs. The list of all the hardware
2243 * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2244 * are run. sw_init initializes the software state associated with each IP
2245 * and hw_init initializes the hardware associated with each IP.
2246 * Returns 0 on success, negative error code on failure.
2247 */
06ec9070 2248static int amdgpu_device_ip_init(struct amdgpu_device *adev)
d38ceaf9
AD
2249{
2250 int i, r;
2251
c030f2e4 2252 r = amdgpu_ras_init(adev);
2253 if (r)
2254 return r;
2255
d38ceaf9 2256 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 2257 if (!adev->ip_blocks[i].status.valid)
d38ceaf9 2258 continue;
a1255107 2259 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2c1a2784 2260 if (r) {
a1255107
AD
2261 DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2262 adev->ip_blocks[i].version->funcs->name, r);
72d3f592 2263 goto init_failed;
2c1a2784 2264 }
a1255107 2265 adev->ip_blocks[i].status.sw = true;
bfca0289 2266
d38ceaf9 2267 /* need to do gmc hw init early so we can allocate gpu mem */
a1255107 2268 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
06ec9070 2269 r = amdgpu_device_vram_scratch_init(adev);
2c1a2784
AD
2270 if (r) {
2271 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
72d3f592 2272 goto init_failed;
2c1a2784 2273 }
a1255107 2274 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2c1a2784
AD
2275 if (r) {
2276 DRM_ERROR("hw_init %d failed %d\n", i, r);
72d3f592 2277 goto init_failed;
2c1a2784 2278 }
06ec9070 2279 r = amdgpu_device_wb_init(adev);
2c1a2784 2280 if (r) {
06ec9070 2281 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
72d3f592 2282 goto init_failed;
2c1a2784 2283 }
a1255107 2284 adev->ip_blocks[i].status.hw = true;
2493664f
ML
2285
2286 /* right after GMC hw init, we create CSA */
f92d5c61 2287 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
1e256e27
RZ
2288 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2289 AMDGPU_GEM_DOMAIN_VRAM,
2290 AMDGPU_CSA_SIZE);
2493664f
ML
2291 if (r) {
2292 DRM_ERROR("allocate CSA failed %d\n", r);
72d3f592 2293 goto init_failed;
2493664f
ML
2294 }
2295 }
d38ceaf9
AD
2296 }
2297 }
2298
c9ffa427
YT
2299 if (amdgpu_sriov_vf(adev))
2300 amdgpu_virt_init_data_exchange(adev);
2301
533aed27
AG
2302 r = amdgpu_ib_pool_init(adev);
2303 if (r) {
2304 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2305 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2306 goto init_failed;
2307 }
2308
c8963ea4
RZ
2309 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2310 if (r)
72d3f592 2311 goto init_failed;
0a4f2520
RZ
2312
2313 r = amdgpu_device_ip_hw_init_phase1(adev);
2314 if (r)
72d3f592 2315 goto init_failed;
0a4f2520 2316
7a3e0bb2
RZ
2317 r = amdgpu_device_fw_loading(adev);
2318 if (r)
72d3f592 2319 goto init_failed;
7a3e0bb2 2320
0a4f2520
RZ
2321 r = amdgpu_device_ip_hw_init_phase2(adev);
2322 if (r)
72d3f592 2323 goto init_failed;
d38ceaf9 2324
121a2bc6
AG
2325 /*
2326 * retired pages will be loaded from eeprom and reserved here,
2327 * it should be called after amdgpu_device_ip_hw_init_phase2 since
2328 * for some ASICs the RAS EEPROM code relies on SMU fully functioning
2329 * for I2C communication, which is only true at this point.
b82e65a9
GC
2330 *
2331 * amdgpu_ras_recovery_init may fail, but the caller only cares about
2332 * failures caused by a bad gpu situation and stops the amdgpu init
2333 * process accordingly. For other failure cases, it will still release
2334 * all the resources and print an error message, rather than returning
2335 * a negative value to the upper level.
121a2bc6
AG
2336 *
2337 * Note: theoretically, this should be called before all vram allocations
2338 * to protect retired pages from being abused
2339 */
b82e65a9
GC
2340 r = amdgpu_ras_recovery_init(adev);
2341 if (r)
2342 goto init_failed;
121a2bc6 2343
3e2e2ab5
HZ
2344 if (adev->gmc.xgmi.num_physical_nodes > 1)
2345 amdgpu_xgmi_add_device(adev);
e3c1b071 2346
2347 /* Don't init kfd if whole hive need to be reset during init */
2348 if (!adev->gmc.xgmi.pending_reset)
2349 amdgpu_amdkfd_device_init(adev);
c6332b97 2350
bd607166
KR
2351 amdgpu_fru_get_product_info(adev);
2352
72d3f592 2353init_failed:
c9ffa427 2354 if (amdgpu_sriov_vf(adev))
c6332b97 2355 amdgpu_virt_release_full_gpu(adev, true);
2356
72d3f592 2357 return r;
d38ceaf9
AD
2358}
2359
e3ecdffa
AD
2360/**
2361 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2362 *
2363 * @adev: amdgpu_device pointer
2364 *
2365 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
2366 * this function before a GPU reset. If the value is retained after a
2367 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
2368 */
06ec9070 2369static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
0c49e0b8
CZ
2370{
2371 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2372}
2373
e3ecdffa
AD
2374/**
2375 * amdgpu_device_check_vram_lost - check if vram is valid
2376 *
2377 * @adev: amdgpu_device pointer
2378 *
2379 * Checks the reset magic value written to the gart pointer in VRAM.
2380 * The driver calls this after a GPU reset to see if the contents of
2381 * VRAM is lost or not.
2382 * returns true if vram is lost, false if not.
2383 */
06ec9070 2384static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
0c49e0b8 2385{
dadce777
EQ
2386 if (memcmp(adev->gart.ptr, adev->reset_magic,
2387 AMDGPU_RESET_MAGIC_NUM))
2388 return true;
2389
53b3f8f4 2390 if (!amdgpu_in_reset(adev))
dadce777
EQ
2391 return false;
2392
2393 /*
2394 * For all ASICs with baco/mode1 reset, the VRAM is
2395 * always assumed to be lost.
2396 */
2397 switch (amdgpu_asic_reset_method(adev)) {
2398 case AMD_RESET_METHOD_BACO:
2399 case AMD_RESET_METHOD_MODE1:
2400 return true;
2401 default:
2402 return false;
2403 }
0c49e0b8
CZ
2404}
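
The reset-magic technique used by the two helpers above can be sketched in plain C: keep a private copy of a known pattern written to VRAM, then memcmp() after the reset; any mismatch means the contents were lost. This is only an illustration with hypothetical names and a simulated buffer, not driver code:

#include <stdio.h>
#include <string.h>

#define RESET_MAGIC_NUM 64

int main(void)
{
	unsigned char vram[RESET_MAGIC_NUM];	/* stands in for the GART-mapped VRAM page */
	unsigned char magic[RESET_MAGIC_NUM];

	memset(vram, 0xA5, sizeof(vram));	/* driver fills VRAM with a pattern... */
	memcpy(magic, vram, sizeof(magic));	/* ...and keeps a private copy (fill_reset_magic) */

	memset(vram, 0x00, sizeof(vram));	/* simulate a reset that wipes VRAM */

	/* check_vram_lost(): any difference means VRAM did not survive */
	printf("vram lost: %s\n",
	       memcmp(vram, magic, RESET_MAGIC_NUM) ? "yes" : "no");
	return 0;
}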
2405
e3ecdffa 2406/**
1112a46b 2407 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
e3ecdffa
AD
2408 *
2409 * @adev: amdgpu_device pointer
b8b72130 2410 * @state: clockgating state (gate or ungate)
e3ecdffa 2411 *
e3ecdffa 2412 * The list of all the hardware IPs that make up the asic is walked and the
1112a46b
RZ
2413 * set_clockgating_state callbacks are run.
2414 * The late initialization pass enables clockgating for hardware IPs.
2415 * The fini or suspend pass disables clockgating for hardware IPs.
e3ecdffa
AD
2416 * Returns 0 on success, negative error code on failure.
2417 */
fdd34271 2418
5d89bb2d
LL
2419int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2420 enum amd_clockgating_state state)
d38ceaf9 2421{
1112a46b 2422 int i, j, r;
d38ceaf9 2423
4a2ba394
SL
2424 if (amdgpu_emu_mode == 1)
2425 return 0;
2426
1112a46b
RZ
2427 for (j = 0; j < adev->num_ip_blocks; j++) {
2428 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
a2d31dc3 2429 if (!adev->ip_blocks[i].status.late_initialized)
d38ceaf9 2430 continue;
5d70a549
PV
2431 /* skip CG for GFX on S0ix */
2432 if (adev->in_s0ix &&
2433 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)
2434 continue;
4a446d55 2435 /* skip CG for VCE/UVD, it's handled specially */
a1255107 2436 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
57716327 2437 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
34319b32 2438 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
52f2e779 2439 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
57716327 2440 adev->ip_blocks[i].version->funcs->set_clockgating_state) {
4a446d55 2441 /* enable clockgating to save power */
a1255107 2442 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
1112a46b 2443 state);
4a446d55
AD
2444 if (r) {
2445 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
a1255107 2446 adev->ip_blocks[i].version->funcs->name, r);
4a446d55
AD
2447 return r;
2448 }
b0b00ff1 2449 }
d38ceaf9 2450 }
06b18f61 2451
c9f96fd5
RZ
2452 return 0;
2453}
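
The index expression "i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1" walks the block list front to back when gating and back to front when ungating. A tiny userspace sketch of the resulting orders (the block names are arbitrary placeholders):

#include <stdio.h>

int main(void)
{
	const char *blocks[] = { "common", "gmc", "gfx", "sdma" };
	int n = 4, j, i, gate;

	for (gate = 1; gate >= 0; gate--) {
		printf("%s", gate ? "gate:   " : "ungate: ");
		for (j = 0; j < n; j++) {
			i = gate ? j : n - j - 1;	/* same expression as in the driver loop */
			printf("%s ", blocks[i]);
		}
		printf("\n");
	}
	return 0;
}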
2454
5d89bb2d
LL
2455int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
2456 enum amd_powergating_state state)
c9f96fd5 2457{
1112a46b 2458 int i, j, r;
06b18f61 2459
c9f96fd5
RZ
2460 if (amdgpu_emu_mode == 1)
2461 return 0;
2462
1112a46b
RZ
2463 for (j = 0; j < adev->num_ip_blocks; j++) {
2464 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
a2d31dc3 2465 if (!adev->ip_blocks[i].status.late_initialized)
c9f96fd5 2466 continue;
5d70a549
PV
2467 /* skip PG for GFX on S0ix */
2468 if (adev->in_s0ix &&
2469 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)
2470 continue;
c9f96fd5
RZ
2471 /* skip PG for VCE/UVD, it's handled specially */
2472 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2473 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2474 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
52f2e779 2475 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
c9f96fd5
RZ
2476 adev->ip_blocks[i].version->funcs->set_powergating_state) {
2477 /* enable powergating to save power */
2478 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
1112a46b 2479 state);
c9f96fd5
RZ
2480 if (r) {
2481 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2482 adev->ip_blocks[i].version->funcs->name, r);
2483 return r;
2484 }
2485 }
2486 }
2dc80b00
S
2487 return 0;
2488}
2489
beff74bc
AD
2490static int amdgpu_device_enable_mgpu_fan_boost(void)
2491{
2492 struct amdgpu_gpu_instance *gpu_ins;
2493 struct amdgpu_device *adev;
2494 int i, ret = 0;
2495
2496 mutex_lock(&mgpu_info.mutex);
2497
2498 /*
2499 * MGPU fan boost feature should be enabled
2500 * only when there are two or more dGPUs in
2501 * the system
2502 */
2503 if (mgpu_info.num_dgpu < 2)
2504 goto out;
2505
2506 for (i = 0; i < mgpu_info.num_dgpu; i++) {
2507 gpu_ins = &(mgpu_info.gpu_ins[i]);
2508 adev = gpu_ins->adev;
2509 if (!(adev->flags & AMD_IS_APU) &&
f10bb940 2510 !gpu_ins->mgpu_fan_enabled) {
beff74bc
AD
2511 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2512 if (ret)
2513 break;
2514
2515 gpu_ins->mgpu_fan_enabled = 1;
2516 }
2517 }
2518
2519out:
2520 mutex_unlock(&mgpu_info.mutex);
2521
2522 return ret;
2523}
2524
e3ecdffa
AD
2525/**
2526 * amdgpu_device_ip_late_init - run late init for hardware IPs
2527 *
2528 * @adev: amdgpu_device pointer
2529 *
2530 * Late initialization pass for hardware IPs. The list of all the hardware
2531 * IPs that make up the asic is walked and the late_init callbacks are run.
2532 * late_init covers any special initialization that an IP requires
2533 * after all of them have been initialized or something that needs to happen
2534 * late in the init process.
2535 * Returns 0 on success, negative error code on failure.
2536 */
06ec9070 2537static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2dc80b00 2538{
60599a03 2539 struct amdgpu_gpu_instance *gpu_instance;
2dc80b00
S
2540 int i = 0, r;
2541
2542 for (i = 0; i < adev->num_ip_blocks; i++) {
73f847db 2543 if (!adev->ip_blocks[i].status.hw)
2dc80b00
S
2544 continue;
2545 if (adev->ip_blocks[i].version->funcs->late_init) {
2546 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2547 if (r) {
2548 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2549 adev->ip_blocks[i].version->funcs->name, r);
2550 return r;
2551 }
2dc80b00 2552 }
73f847db 2553 adev->ip_blocks[i].status.late_initialized = true;
2dc80b00
S
2554 }
2555
a891d239
DL
2556 amdgpu_ras_set_error_query_ready(adev, true);
2557
1112a46b
RZ
2558 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2559 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
916ac57f 2560
06ec9070 2561 amdgpu_device_fill_reset_magic(adev);
d38ceaf9 2562
beff74bc
AD
2563 r = amdgpu_device_enable_mgpu_fan_boost();
2564 if (r)
2565 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2566
2d02893f 2567 /* For XGMI + passthrough configuration on arcturus, enable light SBR */
2568 if (adev->asic_type == CHIP_ARCTURUS &&
2569 amdgpu_passthrough(adev) &&
2570 adev->gmc.xgmi.num_physical_nodes > 1)
2571 smu_set_light_sbr(&adev->smu, true);
60599a03
EQ
2572
2573 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2574 mutex_lock(&mgpu_info.mutex);
2575
2576 /*
2577 * Reset device p-state to low as this was booted with high.
2578 *
2579 * This should be performed only after all devices from the same
2580 * hive get initialized.
2581 *
2582 * However, it's unknown how many devices are in the hive in advance,
2583 * as this is counted one by one during device initialization.
2584 *
2585 * So, we wait for all XGMI interlinked devices initialized.
2586 * This may bring some delays as those devices may come from
2587 * different hives. But that should be OK.
2588 */
2589 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2590 for (i = 0; i < mgpu_info.num_gpu; i++) {
2591 gpu_instance = &(mgpu_info.gpu_ins[i]);
2592 if (gpu_instance->adev->flags & AMD_IS_APU)
2593 continue;
2594
d84a430d
JK
2595 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2596 AMDGPU_XGMI_PSTATE_MIN);
60599a03
EQ
2597 if (r) {
2598 DRM_ERROR("pstate setting failed (%d).\n", r);
2599 break;
2600 }
2601 }
2602 }
2603
2604 mutex_unlock(&mgpu_info.mutex);
2605 }
2606
d38ceaf9
AD
2607 return 0;
2608}
2609
e9669fb7 2610static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
d38ceaf9
AD
2611{
2612 int i, r;
2613
e9669fb7
AG
2614 for (i = 0; i < adev->num_ip_blocks; i++) {
2615 if (!adev->ip_blocks[i].version->funcs->early_fini)
2616 continue;
5278a159 2617
e9669fb7
AG
2618 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev);
2619 if (r) {
2620 DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
2621 adev->ip_blocks[i].version->funcs->name, r);
2622 }
2623 }
c030f2e4 2624
e9669fb7 2625 amdgpu_amdkfd_suspend(adev, false);
a82400b5 2626
05df1f01 2627 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
fdd34271
RZ
2628 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2629
3e96dbfd
AD
2630 /* need to disable SMC first */
2631 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 2632 if (!adev->ip_blocks[i].status.hw)
3e96dbfd 2633 continue;
fdd34271 2634 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
a1255107 2635 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
3e96dbfd
AD
2636 /* XXX handle errors */
2637 if (r) {
2638 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
a1255107 2639 adev->ip_blocks[i].version->funcs->name, r);
3e96dbfd 2640 }
a1255107 2641 adev->ip_blocks[i].status.hw = false;
3e96dbfd
AD
2642 break;
2643 }
2644 }
2645
d38ceaf9 2646 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
a1255107 2647 if (!adev->ip_blocks[i].status.hw)
d38ceaf9 2648 continue;
8201a67a 2649
a1255107 2650 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
d38ceaf9 2651 /* XXX handle errors */
2c1a2784 2652 if (r) {
a1255107
AD
2653 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2654 adev->ip_blocks[i].version->funcs->name, r);
2c1a2784 2655 }
8201a67a 2656
a1255107 2657 adev->ip_blocks[i].status.hw = false;
d38ceaf9
AD
2658 }
2659
e9669fb7
AG
2660 return 0;
2661}
2662
2663/**
2664 * amdgpu_device_ip_fini - run fini for hardware IPs
2665 *
2666 * @adev: amdgpu_device pointer
2667 *
2668 * Main teardown pass for hardware IPs. The list of all the hardware
2669 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2670 * are run. hw_fini tears down the hardware associated with each IP
2671 * and sw_fini tears down any software state associated with each IP.
2672 * Returns 0 on success, negative error code on failure.
2673 */
2674static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2675{
2676 int i, r;
2677
2678 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2679 amdgpu_virt_release_ras_err_handler_data(adev);
2680
2681 amdgpu_ras_pre_fini(adev);
2682
2683 if (adev->gmc.xgmi.num_physical_nodes > 1)
2684 amdgpu_xgmi_remove_device(adev);
2685
2686 amdgpu_amdkfd_device_fini_sw(adev);
9950cda2 2687
d38ceaf9 2688 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
a1255107 2689 if (!adev->ip_blocks[i].status.sw)
d38ceaf9 2690 continue;
c12aba3a
ML
2691
2692 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
c8963ea4 2693 amdgpu_ucode_free_bo(adev);
1e256e27 2694 amdgpu_free_static_csa(&adev->virt.csa_obj);
c12aba3a
ML
2695 amdgpu_device_wb_fini(adev);
2696 amdgpu_device_vram_scratch_fini(adev);
533aed27 2697 amdgpu_ib_pool_fini(adev);
c12aba3a
ML
2698 }
2699
a1255107 2700 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
d38ceaf9 2701 /* XXX handle errors */
2c1a2784 2702 if (r) {
a1255107
AD
2703 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2704 adev->ip_blocks[i].version->funcs->name, r);
2c1a2784 2705 }
a1255107
AD
2706 adev->ip_blocks[i].status.sw = false;
2707 adev->ip_blocks[i].status.valid = false;
d38ceaf9
AD
2708 }
2709
a6dcfd9c 2710 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
a1255107 2711 if (!adev->ip_blocks[i].status.late_initialized)
8a2eef1d 2712 continue;
a1255107
AD
2713 if (adev->ip_blocks[i].version->funcs->late_fini)
2714 adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2715 adev->ip_blocks[i].status.late_initialized = false;
a6dcfd9c
ML
2716 }
2717
c030f2e4 2718 amdgpu_ras_fini(adev);
2719
030308fc 2720 if (amdgpu_sriov_vf(adev))
24136135
ML
2721 if (amdgpu_virt_release_full_gpu(adev, false))
2722 DRM_ERROR("failed to release exclusive mode on fini\n");
2493664f 2723
d38ceaf9
AD
2724 return 0;
2725}
2726
e3ecdffa 2727/**
beff74bc 2728 * amdgpu_device_delayed_init_work_handler - work handler for IB tests
e3ecdffa 2729 *
1112a46b 2730 * @work: work_struct.
e3ecdffa 2731 */
beff74bc 2732static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2dc80b00
S
2733{
2734 struct amdgpu_device *adev =
beff74bc 2735 container_of(work, struct amdgpu_device, delayed_init_work.work);
916ac57f
RZ
2736 int r;
2737
2738 r = amdgpu_ib_ring_tests(adev);
2739 if (r)
2740 DRM_ERROR("ib ring test failed (%d).\n", r);
2dc80b00
S
2741}
2742
1e317b99
RZ
2743static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2744{
2745 struct amdgpu_device *adev =
2746 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2747
2748 mutex_lock(&adev->gfx.gfx_off_mutex);
2749 if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
2750 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2751 adev->gfx.gfx_off_state = true;
2752 }
2753 mutex_unlock(&adev->gfx.gfx_off_mutex);
2754}
2755
e3ecdffa 2756/**
e7854a03 2757 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
e3ecdffa
AD
2758 *
2759 * @adev: amdgpu_device pointer
2760 *
2761 * Main suspend function for hardware IPs. The list of all the hardware
2762 * IPs that make up the asic is walked, clockgating is disabled and the
2763 * suspend callbacks are run. suspend puts the hardware and software state
2764 * in each IP into a state suitable for suspend.
2765 * Returns 0 on success, negative error code on failure.
2766 */
e7854a03
AD
2767static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2768{
2769 int i, r;
2770
50ec83f0
AD
2771 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2772 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
05df1f01 2773
e7854a03
AD
2774 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2775 if (!adev->ip_blocks[i].status.valid)
2776 continue;
2b9f7848 2777
e7854a03 2778 /* displays are handled separately */
2b9f7848
ND
2779 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2780 continue;
2781
2782 /* XXX handle errors */
2783 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2784 /* XXX handle errors */
2785 if (r) {
2786 DRM_ERROR("suspend of IP block <%s> failed %d\n",
2787 adev->ip_blocks[i].version->funcs->name, r);
2788 return r;
e7854a03 2789 }
2b9f7848
ND
2790
2791 adev->ip_blocks[i].status.hw = false;
e7854a03
AD
2792 }
2793
e7854a03
AD
2794 return 0;
2795}
2796
2797/**
2798 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2799 *
2800 * @adev: amdgpu_device pointer
2801 *
2802 * Main suspend function for hardware IPs. The list of all the hardware
2803 * IPs that make up the asic is walked, clockgating is disabled and the
2804 * suspend callbacks are run. suspend puts the hardware and software state
2805 * in each IP into a state suitable for suspend.
2806 * Returns 0 on success, negative error code on failure.
2807 */
2808static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
d38ceaf9
AD
2809{
2810 int i, r;
2811
557f42a2 2812 if (adev->in_s0ix)
34416931 2813 amdgpu_gfx_state_change_set(adev, sGpuChangeState_D3Entry);
34416931 2814
d38ceaf9 2815 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
a1255107 2816 if (!adev->ip_blocks[i].status.valid)
d38ceaf9 2817 continue;
e7854a03
AD
2818 /* displays are handled in phase1 */
2819 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2820 continue;
bff77e86
LM
2821 /* PSP lost connection when err_event_athub occurs */
2822 if (amdgpu_ras_intr_triggered() &&
2823 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2824 adev->ip_blocks[i].status.hw = false;
2825 continue;
2826 }
e3c1b071 2827
2828 /* skip unnecessary suspend if we do not initialize them yet */
2829 if (adev->gmc.xgmi.pending_reset &&
2830 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2831 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
2832 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2833 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
2834 adev->ip_blocks[i].status.hw = false;
2835 continue;
2836 }
557f42a2 2837
32ff160d
AD
2838 /* skip suspend of gfx and psp for S0ix
2839 * gfx is in gfxoff state, so on resume it will exit gfxoff just
2840 * like at runtime. PSP is also part of the always on hardware
2841 * so no need to suspend it.
2842 */
557f42a2 2843 if (adev->in_s0ix &&
32ff160d
AD
2844 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
2845 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX))
557f42a2
AD
2846 continue;
2847
d38ceaf9 2848 /* XXX handle errors */
a1255107 2849 r = adev->ip_blocks[i].version->funcs->suspend(adev);
d38ceaf9 2850 /* XXX handle errors */
2c1a2784 2851 if (r) {
a1255107
AD
2852 DRM_ERROR("suspend of IP block <%s> failed %d\n",
2853 adev->ip_blocks[i].version->funcs->name, r);
2c1a2784 2854 }
876923fb 2855 adev->ip_blocks[i].status.hw = false;
a3a09142 2856 /* handle putting the SMC in the appropriate state */
86b93fd6
JZ
2857 if (!amdgpu_sriov_vf(adev)) {
2858 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2859 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
2860 if (r) {
2861 DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
2862 adev->mp1_state, r);
2863 return r;
2864 }
a3a09142
AD
2865 }
2866 }
d38ceaf9
AD
2867 }
2868
2869 return 0;
2870}
2871
e7854a03
AD
2872/**
2873 * amdgpu_device_ip_suspend - run suspend for hardware IPs
2874 *
2875 * @adev: amdgpu_device pointer
2876 *
2877 * Main suspend function for hardware IPs. The list of all the hardware
2878 * IPs that make up the asic is walked, clockgating is disabled and the
2879 * suspend callbacks are run. suspend puts the hardware and software state
2880 * in each IP into a state suitable for suspend.
2881 * Returns 0 on success, negative error code on failure.
2882 */
2883int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
2884{
2885 int r;
2886
3c73683c
JC
2887 if (amdgpu_sriov_vf(adev)) {
2888 amdgpu_virt_fini_data_exchange(adev);
e7819644 2889 amdgpu_virt_request_full_gpu(adev, false);
3c73683c 2890 }
e7819644 2891
e7854a03
AD
2892 r = amdgpu_device_ip_suspend_phase1(adev);
2893 if (r)
2894 return r;
2895 r = amdgpu_device_ip_suspend_phase2(adev);
2896
e7819644
YT
2897 if (amdgpu_sriov_vf(adev))
2898 amdgpu_virt_release_full_gpu(adev, false);
2899
e7854a03
AD
2900 return r;
2901}
2902
06ec9070 2903static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
a90ad3c2
ML
2904{
2905 int i, r;
2906
2cb681b6
ML
2907 static enum amd_ip_block_type ip_order[] = {
2908 AMD_IP_BLOCK_TYPE_GMC,
2909 AMD_IP_BLOCK_TYPE_COMMON,
39186aef 2910 AMD_IP_BLOCK_TYPE_PSP,
2cb681b6
ML
2911 AMD_IP_BLOCK_TYPE_IH,
2912 };
a90ad3c2 2913
95ea3dbc 2914 for (i = 0; i < adev->num_ip_blocks; i++) {
2cb681b6
ML
2915 int j;
2916 struct amdgpu_ip_block *block;
a90ad3c2 2917
4cd2a96d
J
2918 block = &adev->ip_blocks[i];
2919 block->status.hw = false;
2cb681b6 2920
4cd2a96d 2921 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
2cb681b6 2922
4cd2a96d 2923 if (block->version->type != ip_order[j] ||
2cb681b6
ML
2924 !block->status.valid)
2925 continue;
2926
2927 r = block->version->funcs->hw_init(adev);
0aaeefcc 2928 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
c41d1cf6
ML
2929 if (r)
2930 return r;
482f0e53 2931 block->status.hw = true;
a90ad3c2
ML
2932 }
2933 }
2934
2935 return 0;
2936}
2937
06ec9070 2938static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
a90ad3c2
ML
2939{
2940 int i, r;
2941
2cb681b6
ML
2942 static enum amd_ip_block_type ip_order[] = {
2943 AMD_IP_BLOCK_TYPE_SMC,
2944 AMD_IP_BLOCK_TYPE_DCE,
2945 AMD_IP_BLOCK_TYPE_GFX,
2946 AMD_IP_BLOCK_TYPE_SDMA,
257deb8c 2947 AMD_IP_BLOCK_TYPE_UVD,
d83c7a07
JJ
2948 AMD_IP_BLOCK_TYPE_VCE,
2949 AMD_IP_BLOCK_TYPE_VCN
2cb681b6 2950 };
a90ad3c2 2951
2cb681b6
ML
2952 for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2953 int j;
2954 struct amdgpu_ip_block *block;
a90ad3c2 2955
2cb681b6
ML
2956 for (j = 0; j < adev->num_ip_blocks; j++) {
2957 block = &adev->ip_blocks[j];
2958
2959 if (block->version->type != ip_order[i] ||
482f0e53
ML
2960 !block->status.valid ||
2961 block->status.hw)
2cb681b6
ML
2962 continue;
2963
895bd048
JZ
2964 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
2965 r = block->version->funcs->resume(adev);
2966 else
2967 r = block->version->funcs->hw_init(adev);
2968
0aaeefcc 2969 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
c41d1cf6
ML
2970 if (r)
2971 return r;
482f0e53 2972 block->status.hw = true;
a90ad3c2
ML
2973 }
2974 }
2975
2976 return 0;
2977}
2978
e3ecdffa
AD
2979/**
2980 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
2981 *
2982 * @adev: amdgpu_device pointer
2983 *
2984 * First resume function for hardware IPs. The list of all the hardware
2985 * IPs that make up the asic is walked and the resume callbacks are run for
2986 * COMMON, GMC, and IH. resume puts the hardware into a functional state
2987 * after a suspend and updates the software state as necessary. This
2988 * function is also used for restoring the GPU after a GPU reset.
2989 * Returns 0 on success, negative error code on failure.
2990 */
06ec9070 2991static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
d38ceaf9
AD
2992{
2993 int i, r;
2994
a90ad3c2 2995 for (i = 0; i < adev->num_ip_blocks; i++) {
482f0e53 2996 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
a90ad3c2 2997 continue;
a90ad3c2 2998 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
e3ecdffa
AD
2999 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3000 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
482f0e53 3001
fcf0649f
CZ
3002 r = adev->ip_blocks[i].version->funcs->resume(adev);
3003 if (r) {
3004 DRM_ERROR("resume of IP block <%s> failed %d\n",
3005 adev->ip_blocks[i].version->funcs->name, r);
3006 return r;
3007 }
482f0e53 3008 adev->ip_blocks[i].status.hw = true;
a90ad3c2
ML
3009 }
3010 }
3011
3012 return 0;
3013}
3014
e3ecdffa
AD
3015/**
3016 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3017 *
3018 * @adev: amdgpu_device pointer
3019 *
3020 * Second resume function for hardware IPs. The list of all the hardware
3021 * IPs that make up the asic is walked and the resume callbacks are run for
3022 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a
3023 * functional state after a suspend and updates the software state as
3024 * necessary. This function is also used for restoring the GPU after a GPU
3025 * reset.
3026 * Returns 0 on success, negative error code on failure.
3027 */
06ec9070 3028static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
d38ceaf9
AD
3029{
3030 int i, r;
3031
3032 for (i = 0; i < adev->num_ip_blocks; i++) {
482f0e53 3033 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
d38ceaf9 3034 continue;
fcf0649f 3035 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
e3ecdffa 3036 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
7a3e0bb2
RZ
3037 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3038 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
fcf0649f 3039 continue;
a1255107 3040 r = adev->ip_blocks[i].version->funcs->resume(adev);
2c1a2784 3041 if (r) {
a1255107
AD
3042 DRM_ERROR("resume of IP block <%s> failed %d\n",
3043 adev->ip_blocks[i].version->funcs->name, r);
d38ceaf9 3044 return r;
2c1a2784 3045 }
482f0e53 3046 adev->ip_blocks[i].status.hw = true;
d38ceaf9
AD
3047 }
3048
3049 return 0;
3050}
3051
e3ecdffa
AD
3052/**
3053 * amdgpu_device_ip_resume - run resume for hardware IPs
3054 *
3055 * @adev: amdgpu_device pointer
3056 *
3057 * Main resume function for hardware IPs. The hardware IPs
3058 * are split into two resume functions because they are
3059 * also used in recovering from a GPU reset and some additional
3060 * steps need to be taken between them. In this case (S3/S4) they are
3061 * run sequentially.
3062 * Returns 0 on success, negative error code on failure.
3063 */
06ec9070 3064static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
fcf0649f
CZ
3065{
3066 int r;
3067
06ec9070 3068 r = amdgpu_device_ip_resume_phase1(adev);
fcf0649f
CZ
3069 if (r)
3070 return r;
7a3e0bb2
RZ
3071
3072 r = amdgpu_device_fw_loading(adev);
3073 if (r)
3074 return r;
3075
06ec9070 3076 r = amdgpu_device_ip_resume_phase2(adev);
fcf0649f
CZ
3077
3078 return r;
3079}
3080
e3ecdffa
AD
3081/**
3082 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
3083 *
3084 * @adev: amdgpu_device pointer
3085 *
3086 * Query the VBIOS data tables to determine if the board supports SR-IOV.
3087 */
4e99a44e 3088static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
048765ad 3089{
6867e1b5
ML
3090 if (amdgpu_sriov_vf(adev)) {
3091 if (adev->is_atom_fw) {
58ff791a 3092 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
6867e1b5
ML
3093 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3094 } else {
3095 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3096 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3097 }
3098
3099 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3100 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
a5bde2f9 3101 }
048765ad
AR
3102}
3103
e3ecdffa
AD
3104/**
3105 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3106 *
3107 * @asic_type: AMD asic type
3108 *
3109 * Check if there is DC (new modesetting infrastructure) support for an asic.
3110 * returns true if DC has support, false if not.
3111 */
4562236b
HW
3112bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3113{
3114 switch (asic_type) {
3115#if defined(CONFIG_DRM_AMD_DC)
64200c46
MR
3116#if defined(CONFIG_DRM_AMD_DC_SI)
3117 case CHIP_TAHITI:
3118 case CHIP_PITCAIRN:
3119 case CHIP_VERDE:
3120 case CHIP_OLAND:
3121#endif
4562236b 3122 case CHIP_BONAIRE:
0d6fbccb 3123 case CHIP_KAVERI:
367e6687
AD
3124 case CHIP_KABINI:
3125 case CHIP_MULLINS:
d9fda248
HW
3126 /*
3127 * We have systems in the wild with these ASICs that require
3128 * LVDS and VGA support which is not supported with DC.
3129 *
3130 * Fallback to the non-DC driver here by default so as not to
3131 * cause regressions.
3132 */
3133 return amdgpu_dc > 0;
3134 case CHIP_HAWAII:
4562236b
HW
3135 case CHIP_CARRIZO:
3136 case CHIP_STONEY:
4562236b 3137 case CHIP_POLARIS10:
675fd32b 3138 case CHIP_POLARIS11:
2c8ad2d5 3139 case CHIP_POLARIS12:
675fd32b 3140 case CHIP_VEGAM:
4562236b
HW
3141 case CHIP_TONGA:
3142 case CHIP_FIJI:
42f8ffa1 3143 case CHIP_VEGA10:
dca7b401 3144 case CHIP_VEGA12:
c6034aa2 3145 case CHIP_VEGA20:
b86a1aa3 3146#if defined(CONFIG_DRM_AMD_DC_DCN)
fd187853 3147 case CHIP_RAVEN:
b4f199c7 3148 case CHIP_NAVI10:
8fceceb6 3149 case CHIP_NAVI14:
078655d9 3150 case CHIP_NAVI12:
e1c14c43 3151 case CHIP_RENOIR:
81d9bfb8 3152 case CHIP_SIENNA_CICHLID:
a6c5308f 3153 case CHIP_NAVY_FLOUNDER:
7cc656e2 3154 case CHIP_DIMGREY_CAVEFISH:
ddaed58b 3155 case CHIP_BEIGE_GOBY:
84b934bc 3156 case CHIP_VANGOGH:
c8b73f7f 3157 case CHIP_YELLOW_CARP:
42f8ffa1 3158#endif
fd187853 3159 return amdgpu_dc != 0;
4562236b
HW
3160#endif
3161 default:
93b09a9a 3162 if (amdgpu_dc > 0)
044a48f4 3163 DRM_INFO_ONCE("Display Core has been requested via kernel parameter "
93b09a9a 3164 "but isn't supported by ASIC, ignoring\n");
4562236b
HW
3165 return false;
3166 }
3167}
3168
3169/**
3170 * amdgpu_device_has_dc_support - check if dc is supported
3171 *
982a820b 3172 * @adev: amdgpu_device pointer
4562236b
HW
3173 *
3174 * Returns true for supported, false for not supported
3175 */
3176bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3177{
abaf210c
AS
3178 if (amdgpu_sriov_vf(adev) ||
3179 adev->enable_virtual_display ||
3180 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
2555039d
XY
3181 return false;
3182
4562236b
HW
3183 return amdgpu_device_asic_has_dc_support(adev->asic_type);
3184}
3185
d4535e2c
AG
3186static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3187{
3188 struct amdgpu_device *adev =
3189 container_of(__work, struct amdgpu_device, xgmi_reset_work);
d95e8e97 3190 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
d4535e2c 3191
c6a6e2db
AG
3192 /* It's a bug to not have a hive within this function */
3193 if (WARN_ON(!hive))
3194 return;
3195
3196 /*
3197 * Use task barrier to synchronize all xgmi reset works across the
3198 * hive. task_barrier_enter and task_barrier_exit will block
3199 * until all the threads running the xgmi reset works reach
3200 * those points. task_barrier_full will do both blocks.
3201 */
3202 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3203
3204 task_barrier_enter(&hive->tb);
4a580877 3205 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
c6a6e2db
AG
3206
3207 if (adev->asic_reset_res)
3208 goto fail;
3209
3210 task_barrier_exit(&hive->tb);
4a580877 3211 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
c6a6e2db
AG
3212
3213 if (adev->asic_reset_res)
3214 goto fail;
43c4d576 3215
8bc7b360
HZ
3216 if (adev->mmhub.ras_funcs &&
3217 adev->mmhub.ras_funcs->reset_ras_error_count)
3218 adev->mmhub.ras_funcs->reset_ras_error_count(adev);
c6a6e2db
AG
3219 } else {
3220
3221 task_barrier_full(&hive->tb);
3222 adev->asic_reset_res = amdgpu_asic_reset(adev);
3223 }
ce316fa5 3224
c6a6e2db 3225fail:
d4535e2c 3226 if (adev->asic_reset_res)
fed184e9 3227 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
4a580877 3228 adev->asic_reset_res, adev_to_drm(adev)->unique);
d95e8e97 3229 amdgpu_put_xgmi_hive(hive);
d4535e2c
AG
3230}
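
The task-barrier synchronization described in the comment above can be approximated in userspace with a pthread barrier: one thread per device in the hive, with barriers lining up the enter and exit points of each reset work. This is only a sketch of the pattern with made-up names, not the driver's task_barrier implementation; build with "cc sketch.c -lpthread":

#include <pthread.h>
#include <stdio.h>

#define NUM_DEVICES 3

static pthread_barrier_t enter_b, exit_b;

static void *reset_work(void *arg)
{
	long dev = (long)arg;

	pthread_barrier_wait(&enter_b);		/* like task_barrier_enter() */
	printf("device %ld: enter baco\n", dev);

	pthread_barrier_wait(&exit_b);		/* like task_barrier_exit() */
	printf("device %ld: exit baco\n", dev);
	return NULL;
}

int main(void)
{
	pthread_t threads[NUM_DEVICES];
	long i;

	pthread_barrier_init(&enter_b, NULL, NUM_DEVICES);
	pthread_barrier_init(&exit_b, NULL, NUM_DEVICES);

	for (i = 0; i < NUM_DEVICES; i++)
		pthread_create(&threads[i], NULL, reset_work, (void *)i);
	for (i = 0; i < NUM_DEVICES; i++)
		pthread_join(threads[i], NULL);

	pthread_barrier_destroy(&enter_b);
	pthread_barrier_destroy(&exit_b);
	return 0;
}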
3231
71f98027
AD
3232static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3233{
3234 char *input = amdgpu_lockup_timeout;
3235 char *timeout_setting = NULL;
3236 int index = 0;
3237 long timeout;
3238 int ret = 0;
3239
3240 /*
67387dfe
AD
3241 * By default the timeout for non-compute jobs is 10000 ms
3242 * and 60000 ms for compute jobs.
71f98027 3243 * In SR-IOV or passthrough mode, the timeout for compute
b7b2a316 3244 * jobs is 60000 ms by default.
71f98027
AD
3245 */
3246 adev->gfx_timeout = msecs_to_jiffies(10000);
3247 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
9882e278
ED
3248 if (amdgpu_sriov_vf(adev))
3249 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
3250 msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
71f98027 3251 else
67387dfe 3252 adev->compute_timeout = msecs_to_jiffies(60000);
71f98027 3253
f440ff44 3254 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
71f98027 3255 while ((timeout_setting = strsep(&input, ",")) &&
f440ff44 3256 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
71f98027
AD
3257 ret = kstrtol(timeout_setting, 0, &timeout);
3258 if (ret)
3259 return ret;
3260
3261 if (timeout == 0) {
3262 index++;
3263 continue;
3264 } else if (timeout < 0) {
3265 timeout = MAX_SCHEDULE_TIMEOUT;
3266 } else {
3267 timeout = msecs_to_jiffies(timeout);
3268 }
3269
3270 switch (index++) {
3271 case 0:
3272 adev->gfx_timeout = timeout;
3273 break;
3274 case 1:
3275 adev->compute_timeout = timeout;
3276 break;
3277 case 2:
3278 adev->sdma_timeout = timeout;
3279 break;
3280 case 3:
3281 adev->video_timeout = timeout;
3282 break;
3283 default:
3284 break;
3285 }
3286 }
3287 /*
3288 * There is only one value specified and
3289 * it should apply to all non-compute jobs.
3290 */
bcccee89 3291 if (index == 1) {
71f98027 3292 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
bcccee89
ED
3293 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3294 adev->compute_timeout = adev->gfx_timeout;
3295 }
71f98027
AD
3296 }
3297
3298 return ret;
3299}
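
For reference, a userspace sketch of the lockup_timeout format handled above: a comma-separated list in the order gfx,compute,sdma,video where 0 keeps the default and a negative value means "no timeout". The single-value shortcut (one value applying to all non-compute queues) is omitted here, the input string is an arbitrary example, and all names are hypothetical:

#define _DEFAULT_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	long timeouts[4] = { 10000, 60000, 10000, 10000 };	/* gfx, compute, sdma, video defaults */
	const char *names[4] = { "gfx", "compute", "sdma", "video" };
	char buf[] = "5000,0,-1";
	char *cursor = buf, *tok;
	int index = 0;

	while ((tok = strsep(&cursor, ",")) != NULL && index < 4) {
		long t = strtol(tok, NULL, 0);

		if (t == 0) {			/* 0 keeps the default for this queue */
			index++;
			continue;
		}
		timeouts[index++] = t < 0 ? -1 : t;	/* -1 stands in for MAX_SCHEDULE_TIMEOUT */
	}

	for (index = 0; index < 4; index++)
		printf("%s timeout: %ld ms\n", names[index], timeouts[index]);
	return 0;
}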
d4535e2c 3300
77f3a5cd
ND
3301static const struct attribute *amdgpu_dev_attributes[] = {
3302 &dev_attr_product_name.attr,
3303 &dev_attr_product_number.attr,
3304 &dev_attr_serial_number.attr,
3305 &dev_attr_pcie_replay_count.attr,
3306 NULL
3307};
3308
d38ceaf9
AD
3309/**
3310 * amdgpu_device_init - initialize the driver
3311 *
3312 * @adev: amdgpu_device pointer
d38ceaf9
AD
3313 * @flags: driver flags
3314 *
3315 * Initializes the driver info and hw (all asics).
3316 * Returns 0 for success or an error on failure.
3317 * Called at driver startup.
3318 */
3319int amdgpu_device_init(struct amdgpu_device *adev,
d38ceaf9
AD
3320 uint32_t flags)
3321{
8aba21b7
LT
3322 struct drm_device *ddev = adev_to_drm(adev);
3323 struct pci_dev *pdev = adev->pdev;
d38ceaf9 3324 int r, i;
b98c6299 3325 bool px = false;
95844d20 3326 u32 max_MBps;
d38ceaf9
AD
3327
3328 adev->shutdown = false;
d38ceaf9 3329 adev->flags = flags;
4e66d7d2
YZ
3330
3331 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3332 adev->asic_type = amdgpu_force_asic_type;
3333 else
3334 adev->asic_type = flags & AMD_ASIC_MASK;
3335
d38ceaf9 3336 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
593aa2d2 3337 if (amdgpu_emu_mode == 1)
8bdab6bb 3338 adev->usec_timeout *= 10;
770d13b1 3339 adev->gmc.gart_size = 512 * 1024 * 1024;
d38ceaf9
AD
3340 adev->accel_working = false;
3341 adev->num_rings = 0;
3342 adev->mman.buffer_funcs = NULL;
3343 adev->mman.buffer_funcs_ring = NULL;
3344 adev->vm_manager.vm_pte_funcs = NULL;
0c88b430 3345 adev->vm_manager.vm_pte_num_scheds = 0;
132f34e4 3346 adev->gmc.gmc_funcs = NULL;
7bd939d0 3347 adev->harvest_ip_mask = 0x0;
f54d1867 3348 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
b8866c26 3349 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
d38ceaf9
AD
3350
3351 adev->smc_rreg = &amdgpu_invalid_rreg;
3352 adev->smc_wreg = &amdgpu_invalid_wreg;
3353 adev->pcie_rreg = &amdgpu_invalid_rreg;
3354 adev->pcie_wreg = &amdgpu_invalid_wreg;
36b9a952
HR
3355 adev->pciep_rreg = &amdgpu_invalid_rreg;
3356 adev->pciep_wreg = &amdgpu_invalid_wreg;
4fa1c6a6
TZ
3357 adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3358 adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
d38ceaf9
AD
3359 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3360 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3361 adev->didt_rreg = &amdgpu_invalid_rreg;
3362 adev->didt_wreg = &amdgpu_invalid_wreg;
ccdbb20a
RZ
3363 adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3364 adev->gc_cac_wreg = &amdgpu_invalid_wreg;
d38ceaf9
AD
3365 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3366 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3367
3e39ab90
AD
3368 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3369 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3370 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
d38ceaf9
AD
3371
3372 /* mutex initializations are all done here so we
3373 * can recall functions without having locking issues */
0e5ca0d1 3374 mutex_init(&adev->firmware.mutex);
d38ceaf9
AD
3375 mutex_init(&adev->pm.mutex);
3376 mutex_init(&adev->gfx.gpu_clock_mutex);
3377 mutex_init(&adev->srbm_mutex);
b8866c26 3378 mutex_init(&adev->gfx.pipe_reserve_mutex);
d23ee13f 3379 mutex_init(&adev->gfx.gfx_off_mutex);
d38ceaf9 3380 mutex_init(&adev->grbm_idx_mutex);
d38ceaf9 3381 mutex_init(&adev->mn_lock);
e23b74aa 3382 mutex_init(&adev->virt.vf_errors.lock);
d38ceaf9 3383 hash_init(adev->mn_hash);
53b3f8f4 3384 atomic_set(&adev->in_gpu_reset, 0);
6049db43 3385 init_rwsem(&adev->reset_sem);
32eaeae0 3386 mutex_init(&adev->psp.mutex);
bd052211 3387 mutex_init(&adev->notifier_lock);
d38ceaf9 3388
912dfc84
EQ
3389 r = amdgpu_device_check_arguments(adev);
3390 if (r)
3391 return r;
d38ceaf9 3392
d38ceaf9
AD
3393 spin_lock_init(&adev->mmio_idx_lock);
3394 spin_lock_init(&adev->smc_idx_lock);
3395 spin_lock_init(&adev->pcie_idx_lock);
3396 spin_lock_init(&adev->uvd_ctx_idx_lock);
3397 spin_lock_init(&adev->didt_idx_lock);
ccdbb20a 3398 spin_lock_init(&adev->gc_cac_idx_lock);
16abb5d2 3399 spin_lock_init(&adev->se_cac_idx_lock);
d38ceaf9 3400 spin_lock_init(&adev->audio_endpt_idx_lock);
95844d20 3401 spin_lock_init(&adev->mm_stats.lock);
d38ceaf9 3402
0c4e7fa5
CZ
3403 INIT_LIST_HEAD(&adev->shadow_list);
3404 mutex_init(&adev->shadow_list_lock);
3405
655ce9cb 3406 INIT_LIST_HEAD(&adev->reset_list);
3407
beff74bc
AD
3408 INIT_DELAYED_WORK(&adev->delayed_init_work,
3409 amdgpu_device_delayed_init_work_handler);
1e317b99
RZ
3410 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3411 amdgpu_device_delay_enable_gfx_off);
2dc80b00 3412
d4535e2c
AG
3413 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3414
d23ee13f 3415 adev->gfx.gfx_off_req_count = 1;
b6e79d9a 3416 adev->pm.ac_power = power_supply_is_system_supplied() > 0;
b1ddf548 3417
b265bdbd
EQ
3418 atomic_set(&adev->throttling_logging_enabled, 1);
3419 /*
3420 * If throttling continues, logging will be performed every minute
3421 * to avoid log flooding. "-1" is subtracted since the thermal
3422 * throttling interrupt comes every second. Thus, the total logging
3423 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3424 * for the throttling interrupt) = 60 seconds.
3425 */
3426 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3427 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
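/*
 * RATELIMIT_MSG_ON_RELEASE defers the "callbacks suppressed" report
 * until the ratelimit state is released, so suppressed throttling
 * messages are summarized once instead of warning inline.
 */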
3428
0fa49558
AX
3429 /* Registers mapping */
3430 /* TODO: block userspace mapping of io registers */
da69c161
KW
3431 if (adev->asic_type >= CHIP_BONAIRE) {
3432 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3433 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3434 } else {
3435 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3436 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3437 }
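/*
 * Pre-Bonaire ASICs expose the register aperture in BAR 2; Bonaire
 * and newer moved it to BAR 5 (BAR 2 is used for other apertures,
 * such as doorbells, on those parts).
 */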
d38ceaf9 3438
d38ceaf9
AD
3439 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3440 if (adev->rmmio == NULL) {
3441 return -ENOMEM;
3442 }
3443 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3444 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3445
b2109d8e
JX
3446 /* enable PCIE atomic ops */
3447 r = pci_enable_atomic_ops_to_root(adev->pdev,
3448 PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3449 PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3450 if (r) {
3451 adev->have_atomics_support = false;
3452 DRM_INFO("PCIE atomic ops is not supported\n");
3453 } else {
3454 adev->have_atomics_support = true;
3455 }
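/*
 * Note: only the capability is recorded here; whether missing PCIe
 * atomics is fatal is left to later consumers of have_atomics_support
 * (e.g. the compute/KFD init path).
 */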
3456
5494d864
AD
3457 amdgpu_device_get_pcie_info(adev);
3458
b239c017
JX
3459 if (amdgpu_mcbp)
3460 DRM_INFO("MCBP is enabled\n");
3461
5f84cc63
JX
3462 if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
3463 adev->enable_mes = true;
3464
3aa0115d
ML
3465 /* detect hw virtualization here */
3466 amdgpu_detect_virtualization(adev);
3467
dffa11b4
ML
3468 r = amdgpu_device_get_job_timeout_settings(adev);
3469 if (r) {
3470 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
4192f7b5 3471 goto failed_unmap;
a190d1c7
XY
3472 }
3473
d38ceaf9 3474 /* early init functions */
06ec9070 3475 r = amdgpu_device_ip_early_init(adev);
d38ceaf9 3476 if (r)
4192f7b5 3477 goto failed_unmap;
d38ceaf9 3478
6585661d
OZ
3479 /* doorbell bar mapping and doorbell index init*/
3480 amdgpu_device_doorbell_init(adev);
3481
9475a943
SL
3482 if (amdgpu_emu_mode == 1) {
3483 /* post the asic on emulation mode */
3484 emu_soc_asic_init(adev);
bfca0289 3485 goto fence_driver_init;
9475a943 3486 }
bfca0289 3487
04442bf7
LL
3488 amdgpu_reset_init(adev);
3489
4e99a44e
ML
3490 /* detect if we are with an SRIOV vbios */
3491 amdgpu_device_detect_sriov_bios(adev);
048765ad 3492
95e8e59e
AD
3493 /* check if we need to reset the asic
3494 * E.g., driver was not cleanly unloaded previously, etc.
3495 */
f14899fd 3496 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
e3c1b071 3497 if (adev->gmc.xgmi.num_physical_nodes) {
3498 dev_info(adev->dev, "Pending hive reset.\n");
3499 adev->gmc.xgmi.pending_reset = true;
3500 /* Only need to init the necessary blocks for SMU to handle the reset */
3501 for (i = 0; i < adev->num_ip_blocks; i++) {
3502 if (!adev->ip_blocks[i].status.valid)
3503 continue;
3504 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3505 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3506 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3507 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
751f43e7 3508 DRM_DEBUG("IP %s disabled for hw_init.\n",
e3c1b071 3509 adev->ip_blocks[i].version->funcs->name);
3510 adev->ip_blocks[i].status.hw = true;
3511 }
3512 }
3513 } else {
3514 r = amdgpu_asic_reset(adev);
3515 if (r) {
3516 dev_err(adev->dev, "asic reset on init failed\n");
3517 goto failed;
3518 }
95e8e59e
AD
3519 }
3520 }
3521
8f66090b 3522 pci_enable_pcie_error_reporting(adev->pdev);
c9a6b82f 3523
d38ceaf9 3524 /* Post card if necessary */
39c640c0 3525 if (amdgpu_device_need_post(adev)) {
d38ceaf9 3526 if (!adev->bios) {
bec86378 3527 dev_err(adev->dev, "no vBIOS found\n");
83ba126a
AD
3528 r = -EINVAL;
3529 goto failed;
d38ceaf9 3530 }
bec86378 3531 DRM_INFO("GPU posting now...\n");
4d2997ab 3532 r = amdgpu_device_asic_init(adev);
4e99a44e
ML
3533 if (r) {
3534 dev_err(adev->dev, "gpu post error!\n");
3535 goto failed;
3536 }
d38ceaf9
AD
3537 }
3538
88b64e95
AD
3539 if (adev->is_atom_fw) {
3540 /* Initialize clocks */
3541 r = amdgpu_atomfirmware_get_clock_info(adev);
3542 if (r) {
3543 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
e23b74aa 3544 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
88b64e95
AD
3545 goto failed;
3546 }
3547 } else {
a5bde2f9
AD
3548 /* Initialize clocks */
3549 r = amdgpu_atombios_get_clock_info(adev);
3550 if (r) {
3551 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
e23b74aa 3552 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
89041940 3553 goto failed;
a5bde2f9
AD
3554 }
3555 /* init i2c buses */
4562236b
HW
3556 if (!amdgpu_device_has_dc_support(adev))
3557 amdgpu_atombios_i2c_init(adev);
2c1a2784 3558 }
d38ceaf9 3559
bfca0289 3560fence_driver_init:
d38ceaf9
AD
3561 /* Fence driver */
3562 r = amdgpu_fence_driver_init(adev);
2c1a2784
AD
3563 if (r) {
3564 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
e23b74aa 3565 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
83ba126a 3566 goto failed;
2c1a2784 3567 }
d38ceaf9
AD
3568
3569 /* init the mode config */
4a580877 3570 drm_mode_config_init(adev_to_drm(adev));
d38ceaf9 3571
06ec9070 3572 r = amdgpu_device_ip_init(adev);
d38ceaf9 3573 if (r) {
8840a387 3574 /* failed in exclusive mode due to timeout */
3575 if (amdgpu_sriov_vf(adev) &&
3576 !amdgpu_sriov_runtime(adev) &&
3577 amdgpu_virt_mmio_blocked(adev) &&
3578 !amdgpu_virt_wait_reset(adev)) {
3579 dev_err(adev->dev, "VF exclusive mode timeout\n");
1daee8b4
PD
3580 /* Don't send request since VF is inactive. */
3581 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3582 adev->virt.ops = NULL;
8840a387 3583 r = -EAGAIN;
970fd197 3584 goto release_ras_con;
8840a387 3585 }
06ec9070 3586 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
e23b74aa 3587 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
970fd197 3588 goto release_ras_con;
d38ceaf9
AD
3589 }
3590
d69b8971
YZ
3591 dev_info(adev->dev,
3592 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
d7f72fe4
YZ
3593 adev->gfx.config.max_shader_engines,
3594 adev->gfx.config.max_sh_per_se,
3595 adev->gfx.config.max_cu_per_sh,
3596 adev->gfx.cu_info.number);
3597
d38ceaf9
AD
3598 adev->accel_working = true;
3599
e59c0205
AX
3600 amdgpu_vm_check_compute_bug(adev);
3601
95844d20
MO
3602 /* Initialize the buffer migration limit. */
3603 if (amdgpu_moverate >= 0)
3604 max_MBps = amdgpu_moverate;
3605 else
3606 max_MBps = 8; /* Allow 8 MB/s. */
3607 /* Get a log2 for easy divisions. */
3608 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
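/*
 * Illustration (the exact accounting lives in the CS throttling code):
 * storing the limit as a log2 lets byte budgets be computed with a
 * shift, roughly bytes_allowed = us_elapsed << log2_max_MBps, since
 * 1 MB/s is about 1 byte/us.
 */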
3609
9bc92b9c
ML
3610 amdgpu_fbdev_init(adev);
3611
d2f52ac8 3612 r = amdgpu_pm_sysfs_init(adev);
7c868b59
YT
3613 if (r) {
3614 adev->pm_sysfs_en = false;
d2f52ac8 3615 DRM_ERROR("registering pm debugfs failed (%d).\n", r);
7c868b59
YT
3616 } else
3617 adev->pm_sysfs_en = true;
d2f52ac8 3618
5bb23532 3619 r = amdgpu_ucode_sysfs_init(adev);
7c868b59
YT
3620 if (r) {
3621 adev->ucode_sysfs_en = false;
5bb23532 3622 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
7c868b59
YT
3623 } else
3624 adev->ucode_sysfs_en = true;
5bb23532 3625
d38ceaf9
AD
3626 if ((amdgpu_testing & 1)) {
3627 if (adev->accel_working)
3628 amdgpu_test_moves(adev);
3629 else
3630 DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");
3631 }
d38ceaf9
AD
3632 if (amdgpu_benchmarking) {
3633 if (adev->accel_working)
3634 amdgpu_benchmark(adev, amdgpu_benchmarking);
3635 else
3636 DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
3637 }
3638
b0adca4d
EQ
3639 /*
3640 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3641 * Otherwise the mgpu fan boost feature will be skipped because the
3642 * gpu instance count would be too low.
3643 */
3644 amdgpu_register_gpu_instance(adev);
3645
d38ceaf9
AD
3646 /* enable clockgating, etc. after ib tests, etc. since some blocks require
3647 * explicit gating rather than handling it automatically.
3648 */
e3c1b071 3649 if (!adev->gmc.xgmi.pending_reset) {
3650 r = amdgpu_device_ip_late_init(adev);
3651 if (r) {
3652 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3653 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
970fd197 3654 goto release_ras_con;
e3c1b071 3655 }
3656 /* must succeed. */
3657 amdgpu_ras_resume(adev);
3658 queue_delayed_work(system_wq, &adev->delayed_init_work,
3659 msecs_to_jiffies(AMDGPU_RESUME_MS));
2c1a2784 3660 }
d38ceaf9 3661
2c738637
ML
3662 if (amdgpu_sriov_vf(adev))
3663 flush_delayed_work(&adev->delayed_init_work);
3664
77f3a5cd 3665 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
5aea5327 3666 if (r)
77f3a5cd 3667 dev_err(adev->dev, "Could not create amdgpu device attr\n");
bd607166 3668
d155bef0
AB
3669 if (IS_ENABLED(CONFIG_PERF_EVENTS))
3670 r = amdgpu_pmu_init(adev);
9c7c85f7
JK
3671 if (r)
3672 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3673
c1dd4aa6
AG
3674 /* Have stored pci confspace at hand for restore in sudden PCI error */
3675 if (amdgpu_device_cache_pci_state(adev->pdev))
3676 pci_restore_state(pdev);
3677
8c3dd61c
KHF
3678 /* if we have more than one VGA card, then disable the amdgpu VGA resources */
3679 /* this will fail for cards that aren't VGA class devices, just
3680 * ignore it */
3681 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
3682 vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode);
3683
3684 if (amdgpu_device_supports_px(ddev)) {
3685 px = true;
3686 vga_switcheroo_register_client(adev->pdev,
3687 &amdgpu_switcheroo_ops, px);
3688 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3689 }
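/*
 * "px" (PowerXpress/hybrid graphics) laptops can power the dGPU off
 * behind a mux; vga_switcheroo and the vga_pm_domain registered above
 * provide that runtime power control.
 */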
3690
e3c1b071 3691 if (adev->gmc.xgmi.pending_reset)
3692 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
3693 msecs_to_jiffies(AMDGPU_RESUME_MS));
3694
d38ceaf9 3695 return 0;
83ba126a 3696
970fd197
SY
3697release_ras_con:
3698 amdgpu_release_ras_context(adev);
3699
83ba126a 3700failed:
89041940 3701 amdgpu_vf_error_trans_all(adev);
8840a387 3702
4192f7b5
AD
3703failed_unmap:
3704 iounmap(adev->rmmio);
3705 adev->rmmio = NULL;
3706
83ba126a 3707 return r;
d38ceaf9
AD
3708}
3709
07775fc1
AG
3710static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
3711{
3712 /* Clear all CPU mappings pointing to this device */
3713 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
3714
3715 /* Unmap all mapped bars - Doorbell, registers and VRAM */
3716 amdgpu_device_doorbell_fini(adev);
3717
3718 iounmap(adev->rmmio);
3719 adev->rmmio = NULL;
3720 if (adev->mman.aper_base_kaddr)
3721 iounmap(adev->mman.aper_base_kaddr);
3722 adev->mman.aper_base_kaddr = NULL;
3723
3724 /* Memory manager related */
3725 if (!adev->gmc.xgmi.connected_to_cpu) {
3726 arch_phys_wc_del(adev->gmc.vram_mtrr);
3727 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
3728 }
3729}
3730
d38ceaf9
AD
3731/**
3732 * amdgpu_device_fini - tear down the driver
3733 *
3734 * @adev: amdgpu_device pointer
3735 *
3736 * Tear down the driver info (all asics).
3737 * Called at driver shutdown.
3738 */
72c8c97b 3739void amdgpu_device_fini_hw(struct amdgpu_device *adev)
d38ceaf9 3740{
aac89168 3741 dev_info(adev->dev, "amdgpu: finishing device.\n");
9f875167 3742 flush_delayed_work(&adev->delayed_init_work);
bb0cd09b 3743 ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
d0d13fe8 3744 adev->shutdown = true;
9f875167 3745
752c683d
ML
3746 /* make sure IB tests have finished before entering exclusive mode
3747 * to avoid preemption on the IB tests
3748 */
519b8b76 3749 if (amdgpu_sriov_vf(adev)) {
752c683d 3750 amdgpu_virt_request_full_gpu(adev, false);
519b8b76
BZ
3751 amdgpu_virt_fini_data_exchange(adev);
3752 }
752c683d 3753
e5b03032
ML
3754 /* disable all interrupts */
3755 amdgpu_irq_disable_all(adev);
ff97cba8
ML
3756 if (adev->mode_info.mode_config_initialized){
3757 if (!amdgpu_device_has_dc_support(adev))
4a580877 3758 drm_helper_force_disable_all(adev_to_drm(adev));
ff97cba8 3759 else
4a580877 3760 drm_atomic_helper_shutdown(adev_to_drm(adev));
ff97cba8 3761 }
72c8c97b
AG
3762 amdgpu_fence_driver_fini_hw(adev);
3763
7c868b59
YT
3764 if (adev->pm_sysfs_en)
3765 amdgpu_pm_sysfs_fini(adev);
72c8c97b
AG
3766 if (adev->ucode_sysfs_en)
3767 amdgpu_ucode_sysfs_fini(adev);
3768 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
3769
d38ceaf9 3770 amdgpu_fbdev_fini(adev);
72c8c97b
AG
3771
3772 amdgpu_irq_fini_hw(adev);
e9669fb7
AG
3773
3774 amdgpu_device_ip_fini_early(adev);
d10d0daa
AG
3775
3776 amdgpu_gart_dummy_page_fini(adev);
07775fc1
AG
3777
3778 amdgpu_device_unmap_mmio(adev);
72c8c97b
AG
3779}
3780
3781void amdgpu_device_fini_sw(struct amdgpu_device *adev)
3782{
e230ac11 3783 amdgpu_device_ip_fini(adev);
72c8c97b 3784 amdgpu_fence_driver_fini_sw(adev);
75e1658e
ND
3785 release_firmware(adev->firmware.gpu_info_fw);
3786 adev->firmware.gpu_info_fw = NULL;
d38ceaf9 3787 adev->accel_working = false;
04442bf7
LL
3788
3789 amdgpu_reset_fini(adev);
3790
d38ceaf9 3791 /* free i2c buses */
4562236b
HW
3792 if (!amdgpu_device_has_dc_support(adev))
3793 amdgpu_i2c_fini(adev);
bfca0289
SL
3794
3795 if (amdgpu_emu_mode != 1)
3796 amdgpu_atombios_fini(adev);
3797
d38ceaf9
AD
3798 kfree(adev->bios);
3799 adev->bios = NULL;
b98c6299 3800 if (amdgpu_device_supports_px(adev_to_drm(adev))) {
84c8b22e 3801 vga_switcheroo_unregister_client(adev->pdev);
83ba126a 3802 vga_switcheroo_fini_domain_pm_ops(adev->dev);
b98c6299 3803 }
38d6be81
AD
3804 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
3805 vga_client_register(adev->pdev, NULL, NULL, NULL);
e9bc1bf7 3806
d155bef0
AB
3807 if (IS_ENABLED(CONFIG_PERF_EVENTS))
3808 amdgpu_pmu_fini(adev);
72de33f8 3809 if (adev->mman.discovery_bin)
a190d1c7 3810 amdgpu_discovery_fini(adev);
72c8c97b
AG
3811
3812 kfree(adev->pci_state);
3813
d38ceaf9
AD
3814}
3815
3816
3817/*
3818 * Suspend & resume.
3819 */
3820/**
810ddc3a 3821 * amdgpu_device_suspend - initiate device suspend
d38ceaf9 3822 *
87e3f136 3823 * @dev: drm dev pointer
87e3f136 3824 * @fbcon : notify the fbdev of suspend
d38ceaf9
AD
3825 *
3826 * Puts the hw in the suspend state (all asics).
3827 * Returns 0 for success or an error on failure.
3828 * Called at driver suspend.
3829 */
de185019 3830int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
d38ceaf9 3831{
a2e15b0e 3832 struct amdgpu_device *adev = drm_to_adev(dev);
d38ceaf9 3833
d38ceaf9
AD
3834 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3835 return 0;
3836
44779b43 3837 adev->in_suspend = true;
3fa8f89d
S
3838
3839 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
3840 DRM_WARN("smart shift update failed\n");
3841
d38ceaf9
AD
3842 drm_kms_helper_poll_disable(dev);
3843
5f818173
S
3844 if (fbcon)
3845 amdgpu_fbdev_set_suspend(adev, 1);
3846
beff74bc 3847 cancel_delayed_work_sync(&adev->delayed_init_work);
a5459475 3848
5e6932fe 3849 amdgpu_ras_suspend(adev);
3850
2196927b 3851 amdgpu_device_ip_suspend_phase1(adev);
fe1053b7 3852
5d3a2d95
AD
3853 if (!adev->in_s0ix)
3854 amdgpu_amdkfd_suspend(adev, adev->in_runpm);
94fa5660 3855
d38ceaf9
AD
3856 /* evict vram memory */
3857 amdgpu_bo_evict_vram(adev);
3858
5ceb54c6 3859 amdgpu_fence_driver_suspend(adev);
d38ceaf9 3860
2196927b 3861 amdgpu_device_ip_suspend_phase2(adev);
a0a71e49
AD
3862 /* evict remaining vram memory
3863 * This second call to evict vram is to evict the gart page table
3864 * using the CPU.
3865 */
d38ceaf9
AD
3866 amdgpu_bo_evict_vram(adev);
3867
d38ceaf9
AD
3868 return 0;
3869}
3870
3871/**
810ddc3a 3872 * amdgpu_device_resume - initiate device resume
d38ceaf9 3873 *
87e3f136 3874 * @dev: drm dev pointer
87e3f136 3875 * @fbcon : notify the fbdev of resume
d38ceaf9
AD
3876 *
3877 * Bring the hw back to operating state (all asics).
3878 * Returns 0 for success or an error on failure.
3879 * Called at driver resume.
3880 */
de185019 3881int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
d38ceaf9 3882{
1348969a 3883 struct amdgpu_device *adev = drm_to_adev(dev);
03161a6e 3884 int r = 0;
d38ceaf9
AD
3885
3886 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3887 return 0;
3888
62498733 3889 if (adev->in_s0ix)
628c36d7
PL
3890 amdgpu_gfx_state_change_set(adev, sGpuChangeState_D0Entry);
3891
d38ceaf9 3892 /* post card */
39c640c0 3893 if (amdgpu_device_need_post(adev)) {
4d2997ab 3894 r = amdgpu_device_asic_init(adev);
74b0b157 3895 if (r)
aac89168 3896 dev_err(adev->dev, "amdgpu asic init failed\n");
74b0b157 3897 }
d38ceaf9 3898
06ec9070 3899 r = amdgpu_device_ip_resume(adev);
e6707218 3900 if (r) {
aac89168 3901 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
4d3b9ae5 3902 return r;
e6707218 3903 }
5ceb54c6
AD
3904 amdgpu_fence_driver_resume(adev);
3905
d38ceaf9 3906
06ec9070 3907 r = amdgpu_device_ip_late_init(adev);
03161a6e 3908 if (r)
4d3b9ae5 3909 return r;
d38ceaf9 3910
beff74bc
AD
3911 queue_delayed_work(system_wq, &adev->delayed_init_work,
3912 msecs_to_jiffies(AMDGPU_RESUME_MS));
3913
5d3a2d95
AD
3914 if (!adev->in_s0ix) {
3915 r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
3916 if (r)
3917 return r;
3918 }
756e6880 3919
96a5d8d4 3920 /* Make sure IB tests flushed */
beff74bc 3921 flush_delayed_work(&adev->delayed_init_work);
96a5d8d4 3922
a2e15b0e 3923 if (fbcon)
4d3b9ae5 3924 amdgpu_fbdev_set_suspend(adev, 0);
d38ceaf9
AD
3925
3926 drm_kms_helper_poll_enable(dev);
23a1a9e5 3927
5e6932fe 3928 amdgpu_ras_resume(adev);
3929
23a1a9e5
L
3930 /*
3931 * Most of the connector probing functions try to acquire runtime pm
3932 * refs to ensure that the GPU is powered on when connector polling is
3933 * performed. Since we're calling this from a runtime PM callback,
3934 * trying to acquire rpm refs will cause us to deadlock.
3935 *
3936 * Since we're guaranteed to be holding the rpm lock, it's safe to
3937 * temporarily disable the rpm helpers so this doesn't deadlock us.
3938 */
3939#ifdef CONFIG_PM
3940 dev->dev->power.disable_depth++;
3941#endif
4562236b
HW
3942 if (!amdgpu_device_has_dc_support(adev))
3943 drm_helper_hpd_irq_event(dev);
3944 else
3945 drm_kms_helper_hotplug_event(dev);
23a1a9e5
L
3946#ifdef CONFIG_PM
3947 dev->dev->power.disable_depth--;
3948#endif
44779b43
RZ
3949 adev->in_suspend = false;
3950
3fa8f89d
S
3951 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
3952 DRM_WARN("smart shift update failed\n");
3953
4d3b9ae5 3954 return 0;
d38ceaf9
AD
3955}
3956
e3ecdffa
AD
3957/**
3958 * amdgpu_device_ip_check_soft_reset - did soft reset succeed
3959 *
3960 * @adev: amdgpu_device pointer
3961 *
3962 * The list of all the hardware IPs that make up the asic is walked and
3963 * the check_soft_reset callbacks are run. check_soft_reset determines
3964 * if the asic is still hung or not.
3965 * Returns true if any of the IPs are still in a hung state, false if not.
3966 */
06ec9070 3967static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
63fbf42f
CZ
3968{
3969 int i;
3970 bool asic_hang = false;
3971
f993d628
ML
3972 if (amdgpu_sriov_vf(adev))
3973 return true;
3974
8bc04c29
AD
3975 if (amdgpu_asic_need_full_reset(adev))
3976 return true;
3977
63fbf42f 3978 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 3979 if (!adev->ip_blocks[i].status.valid)
63fbf42f 3980 continue;
a1255107
AD
3981 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
3982 adev->ip_blocks[i].status.hang =
3983 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
3984 if (adev->ip_blocks[i].status.hang) {
aac89168 3985 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
63fbf42f
CZ
3986 asic_hang = true;
3987 }
3988 }
3989 return asic_hang;
3990}
3991
e3ecdffa
AD
3992/**
3993 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
3994 *
3995 * @adev: amdgpu_device pointer
3996 *
3997 * The list of all the hardware IPs that make up the asic is walked and the
3998 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset
3999 * handles any IP specific hardware or software state changes that are
4000 * necessary for a soft reset to succeed.
4001 * Returns 0 on success, negative error code on failure.
4002 */
06ec9070 4003static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
d31a501e
CZ
4004{
4005 int i, r = 0;
4006
4007 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 4008 if (!adev->ip_blocks[i].status.valid)
d31a501e 4009 continue;
a1255107
AD
4010 if (adev->ip_blocks[i].status.hang &&
4011 adev->ip_blocks[i].version->funcs->pre_soft_reset) {
4012 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
d31a501e
CZ
4013 if (r)
4014 return r;
4015 }
4016 }
4017
4018 return 0;
4019}
4020
e3ecdffa
AD
4021/**
4022 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
4023 *
4024 * @adev: amdgpu_device pointer
4025 *
4026 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu
4027 * reset is necessary to recover.
4028 * Returns true if a full asic reset is required, false if not.
4029 */
06ec9070 4030static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
35d782fe 4031{
da146d3b
AD
4032 int i;
4033
8bc04c29
AD
4034 if (amdgpu_asic_need_full_reset(adev))
4035 return true;
4036
da146d3b 4037 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 4038 if (!adev->ip_blocks[i].status.valid)
da146d3b 4039 continue;
a1255107
AD
4040 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
4041 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
4042 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
98512bb8
KW
4043 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
4044 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
a1255107 4045 if (adev->ip_blocks[i].status.hang) {
aac89168 4046 dev_info(adev->dev, "Some block need full reset!\n");
da146d3b
AD
4047 return true;
4048 }
4049 }
35d782fe
CZ
4050 }
4051 return false;
4052}
4053
e3ecdffa
AD
4054/**
4055 * amdgpu_device_ip_soft_reset - do a soft reset
4056 *
4057 * @adev: amdgpu_device pointer
4058 *
4059 * The list of all the hardware IPs that make up the asic is walked and the
4060 * soft_reset callbacks are run if the block is hung. soft_reset handles any
4061 * IP specific hardware or software state changes that are necessary to soft
4062 * reset the IP.
4063 * Returns 0 on success, negative error code on failure.
4064 */
06ec9070 4065static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
35d782fe
CZ
4066{
4067 int i, r = 0;
4068
4069 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 4070 if (!adev->ip_blocks[i].status.valid)
35d782fe 4071 continue;
a1255107
AD
4072 if (adev->ip_blocks[i].status.hang &&
4073 adev->ip_blocks[i].version->funcs->soft_reset) {
4074 r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
35d782fe
CZ
4075 if (r)
4076 return r;
4077 }
4078 }
4079
4080 return 0;
4081}
4082
e3ecdffa
AD
4083/**
4084 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
4085 *
4086 * @adev: amdgpu_device pointer
4087 *
4088 * The list of all the hardware IPs that make up the asic is walked and the
4089 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
4090 * handles any IP specific hardware or software state changes that are
4091 * necessary after the IP has been soft reset.
4092 * Returns 0 on success, negative error code on failure.
4093 */
06ec9070 4094static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
35d782fe
CZ
4095{
4096 int i, r = 0;
4097
4098 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 4099 if (!adev->ip_blocks[i].status.valid)
35d782fe 4100 continue;
a1255107
AD
4101 if (adev->ip_blocks[i].status.hang &&
4102 adev->ip_blocks[i].version->funcs->post_soft_reset)
4103 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
35d782fe
CZ
4104 if (r)
4105 return r;
4106 }
4107
4108 return 0;
4109}
4110
e3ecdffa 4111/**
c33adbc7 4112 * amdgpu_device_recover_vram - Recover some VRAM contents
e3ecdffa
AD
4113 *
4114 * @adev: amdgpu_device pointer
4115 *
4116 * Restores the contents of VRAM buffers from the shadows in GTT. Used to
4117 * restore things like GPUVM page tables after a GPU reset where
4118 * the contents of VRAM might be lost.
403009bf
CK
4119 *
4120 * Returns:
4121 * 0 on success, negative error code on failure.
e3ecdffa 4122 */
c33adbc7 4123static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
c41d1cf6 4124{
c41d1cf6 4125 struct dma_fence *fence = NULL, *next = NULL;
403009bf
CK
4126 struct amdgpu_bo *shadow;
4127 long r = 1, tmo;
c41d1cf6
ML
4128
4129 if (amdgpu_sriov_runtime(adev))
b045d3af 4130 tmo = msecs_to_jiffies(8000);
c41d1cf6
ML
4131 else
4132 tmo = msecs_to_jiffies(100);
4133
aac89168 4134 dev_info(adev->dev, "recover vram bo from shadow start\n");
c41d1cf6 4135 mutex_lock(&adev->shadow_list_lock);
403009bf
CK
4136 list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {
4137
4138 /* No need to recover an evicted BO */
d3116756
CK
4139 if (shadow->tbo.resource->mem_type != TTM_PL_TT ||
4140 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET ||
4141 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM)
403009bf
CK
4142 continue;
4143
4144 r = amdgpu_bo_restore_shadow(shadow, &next);
4145 if (r)
4146 break;
4147
c41d1cf6 4148 if (fence) {
1712fb1a 4149 tmo = dma_fence_wait_timeout(fence, false, tmo);
403009bf
CK
4150 dma_fence_put(fence);
4151 fence = next;
1712fb1a 4152 if (tmo == 0) {
4153 r = -ETIMEDOUT;
c41d1cf6 4154 break;
1712fb1a 4155 } else if (tmo < 0) {
4156 r = tmo;
4157 break;
4158 }
403009bf
CK
4159 } else {
4160 fence = next;
c41d1cf6 4161 }
c41d1cf6
ML
4162 }
4163 mutex_unlock(&adev->shadow_list_lock);
4164
403009bf
CK
4165 if (fence)
4166 tmo = dma_fence_wait_timeout(fence, false, tmo);
c41d1cf6
ML
4167 dma_fence_put(fence);
4168
1712fb1a 4169 if (r < 0 || tmo <= 0) {
aac89168 4170 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
403009bf
CK
4171 return -EIO;
4172 }
c41d1cf6 4173
aac89168 4174 dev_info(adev->dev, "recover vram bo from shadow done\n");
403009bf 4175 return 0;
c41d1cf6
ML
4176}
4177
a90ad3c2 4178
e3ecdffa 4179/**
06ec9070 4180 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
5740682e 4181 *
982a820b 4182 * @adev: amdgpu_device pointer
87e3f136 4183 * @from_hypervisor: request from hypervisor
5740682e
ML
4184 *
4185 * do VF FLR and reinitialize Asic
3f48c681 4186 * Returns 0 for success or an error on failure.
e3ecdffa
AD
4187 */
4188static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4189 bool from_hypervisor)
5740682e
ML
4190{
4191 int r;
4192
4193 if (from_hypervisor)
4194 r = amdgpu_virt_request_full_gpu(adev, true);
4195 else
4196 r = amdgpu_virt_reset_gpu(adev);
4197 if (r)
4198 return r;
a90ad3c2 4199
b639c22c
JZ
4200 amdgpu_amdkfd_pre_reset(adev);
4201
a90ad3c2 4202 /* Resume IP prior to SMC */
06ec9070 4203 r = amdgpu_device_ip_reinit_early_sriov(adev);
5740682e
ML
4204 if (r)
4205 goto error;
a90ad3c2 4206
c9ffa427 4207 amdgpu_virt_init_data_exchange(adev);
a90ad3c2 4208 /* we need recover gart prior to run SMC/CP/SDMA resume */
6c28aed6 4209 amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
a90ad3c2 4210
7a3e0bb2
RZ
4211 r = amdgpu_device_fw_loading(adev);
4212 if (r)
4213 return r;
4214
a90ad3c2 4215 /* now we are okay to resume SMC/CP/SDMA */
06ec9070 4216 r = amdgpu_device_ip_reinit_late_sriov(adev);
5740682e
ML
4217 if (r)
4218 goto error;
a90ad3c2
ML
4219
4220 amdgpu_irq_gpu_reset_resume_helper(adev);
5740682e 4221 r = amdgpu_ib_ring_tests(adev);
f81e8d53 4222 amdgpu_amdkfd_post_reset(adev);
a90ad3c2 4223
abc34253 4224error:
c41d1cf6 4225 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
e3526257 4226 amdgpu_inc_vram_lost(adev);
c33adbc7 4227 r = amdgpu_device_recover_vram(adev);
a90ad3c2 4228 }
437f3e0b 4229 amdgpu_virt_release_full_gpu(adev, true);
a90ad3c2
ML
4230
4231 return r;
4232}
4233
9a1cddd6 4234/**
4235 * amdgpu_device_has_job_running - check if there is any job in the pending list
4236 *
982a820b 4237 * @adev: amdgpu_device pointer
9a1cddd6 4238 *
4239 * check if there is any job in the pending list
4240 */
4241bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4242{
4243 int i;
4244 struct drm_sched_job *job;
4245
4246 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4247 struct amdgpu_ring *ring = adev->rings[i];
4248
4249 if (!ring || !ring->sched.thread)
4250 continue;
4251
4252 spin_lock(&ring->sched.job_list_lock);
6efa4b46
LT
4253 job = list_first_entry_or_null(&ring->sched.pending_list,
4254 struct drm_sched_job, list);
9a1cddd6 4255 spin_unlock(&ring->sched.job_list_lock);
4256 if (job)
4257 return true;
4258 }
4259 return false;
4260}
4261
12938fad
CK
4262/**
4263 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4264 *
982a820b 4265 * @adev: amdgpu_device pointer
12938fad
CK
4266 *
4267 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4268 * a hung GPU.
4269 */
4270bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4271{
4272 if (!amdgpu_device_ip_check_soft_reset(adev)) {
aac89168 4273 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
12938fad
CK
4274 return false;
4275 }
4276
3ba7b418
AG
4277 if (amdgpu_gpu_recovery == 0)
4278 goto disabled;
4279
4280 if (amdgpu_sriov_vf(adev))
4281 return true;
4282
4283 if (amdgpu_gpu_recovery == -1) {
4284 switch (adev->asic_type) {
fc42d47c
AG
4285 case CHIP_BONAIRE:
4286 case CHIP_HAWAII:
3ba7b418
AG
4287 case CHIP_TOPAZ:
4288 case CHIP_TONGA:
4289 case CHIP_FIJI:
4290 case CHIP_POLARIS10:
4291 case CHIP_POLARIS11:
4292 case CHIP_POLARIS12:
4293 case CHIP_VEGAM:
4294 case CHIP_VEGA20:
4295 case CHIP_VEGA10:
4296 case CHIP_VEGA12:
c43b849f 4297 case CHIP_RAVEN:
e9d4cf91 4298 case CHIP_ARCTURUS:
2cb44fb0 4299 case CHIP_RENOIR:
658c6639
AD
4300 case CHIP_NAVI10:
4301 case CHIP_NAVI14:
4302 case CHIP_NAVI12:
131a3c74 4303 case CHIP_SIENNA_CICHLID:
665fe4dc 4304 case CHIP_NAVY_FLOUNDER:
27859ee3 4305 case CHIP_DIMGREY_CAVEFISH:
fe68ceef 4306 case CHIP_VANGOGH:
ea4e96a7 4307 case CHIP_ALDEBARAN:
3ba7b418
AG
4308 break;
4309 default:
4310 goto disabled;
4311 }
12938fad
CK
4312 }
4313
4314 return true;
3ba7b418
AG
4315
4316disabled:
aac89168 4317 dev_info(adev->dev, "GPU recovery disabled.\n");
3ba7b418 4318 return false;
12938fad
CK
4319}
4320
5c03e584
FX
4321int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
4322{
4323 u32 i;
4324 int ret = 0;
4325
4326 amdgpu_atombios_scratch_regs_engine_hung(adev, true);
4327
4328 dev_info(adev->dev, "GPU mode1 reset\n");
4329
4330 /* disable BM */
4331 pci_clear_master(adev->pdev);
4332
4333 amdgpu_device_cache_pci_state(adev->pdev);
4334
4335 if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
4336 dev_info(adev->dev, "GPU smu mode1 reset\n");
4337 ret = amdgpu_dpm_mode1_reset(adev);
4338 } else {
4339 dev_info(adev->dev, "GPU psp mode1 reset\n");
4340 ret = psp_gpu_reset(adev);
4341 }
4342
4343 if (ret)
4344 dev_err(adev->dev, "GPU mode1 reset failed\n");
4345
4346 amdgpu_device_load_pci_state(adev->pdev);
4347
4348 /* wait for asic to come out of reset */
4349 for (i = 0; i < adev->usec_timeout; i++) {
4350 u32 memsize = adev->nbio.funcs->get_memsize(adev);
4351
4352 if (memsize != 0xffffffff)
4353 break;
4354 udelay(1);
4355 }
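/*
 * While the ASIC is still held in reset, register reads return all
 * 0xff; a sane memsize readback is treated as the chip being
 * accessible again.
 */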
4356
4357 amdgpu_atombios_scratch_regs_engine_hung(adev, false);
4358 return ret;
4359}
5c6dd71e 4360
e3c1b071 4361int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
04442bf7 4362 struct amdgpu_reset_context *reset_context)
26bc5340
AG
4363{
4364 int i, r = 0;
04442bf7
LL
4365 struct amdgpu_job *job = NULL;
4366 bool need_full_reset =
4367 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4368
4369 if (reset_context->reset_req_dev == adev)
4370 job = reset_context->job;
71182665 4371
e3c1b071 4372 /* no need to dump if device is not in good state during probe period */
4373 if (!adev->gmc.xgmi.pending_reset)
4374 amdgpu_debugfs_wait_dump(adev);
728e7e0c 4375
b602ca5f
TZ
4376 if (amdgpu_sriov_vf(adev)) {
4377 /* stop the data exchange thread */
4378 amdgpu_virt_fini_data_exchange(adev);
4379 }
4380
71182665 4381 /* block all schedulers and reset given job's ring */
0875dc9e
CZ
4382 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4383 struct amdgpu_ring *ring = adev->rings[i];
4384
51687759 4385 if (!ring || !ring->sched.thread)
0875dc9e 4386 continue;
5740682e 4387
2f9d4084
ML
4388 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4389 amdgpu_fence_driver_force_completion(ring);
0875dc9e 4390 }
d38ceaf9 4391
222b5f04
AG
4392 if(job)
4393 drm_sched_increase_karma(&job->base);
4394
04442bf7 4395 r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
404b277b
LL
4396 /* If reset handler not implemented, continue; otherwise return */
4397 if (r == -ENOSYS)
4398 r = 0;
4399 else
04442bf7
LL
4400 return r;
4401
1d721ed6 4402 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
26bc5340
AG
4403 if (!amdgpu_sriov_vf(adev)) {
4404
4405 if (!need_full_reset)
4406 need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4407
4408 if (!need_full_reset) {
4409 amdgpu_device_ip_pre_soft_reset(adev);
4410 r = amdgpu_device_ip_soft_reset(adev);
4411 amdgpu_device_ip_post_soft_reset(adev);
4412 if (r || amdgpu_device_ip_check_soft_reset(adev)) {
aac89168 4413 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
26bc5340
AG
4414 need_full_reset = true;
4415 }
4416 }
4417
4418 if (need_full_reset)
4419 r = amdgpu_device_ip_suspend(adev);
04442bf7
LL
4420 if (need_full_reset)
4421 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4422 else
4423 clear_bit(AMDGPU_NEED_FULL_RESET,
4424 &reset_context->flags);
26bc5340
AG
4425 }
4426
4427 return r;
4428}
4429
04442bf7
LL
4430int amdgpu_do_asic_reset(struct list_head *device_list_handle,
4431 struct amdgpu_reset_context *reset_context)
26bc5340
AG
4432{
4433 struct amdgpu_device *tmp_adev = NULL;
04442bf7 4434 bool need_full_reset, skip_hw_reset, vram_lost = false;
26bc5340
AG
4435 int r = 0;
4436
04442bf7
LL
4437 /* Try reset handler method first */
4438 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
4439 reset_list);
4440 r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
404b277b
LL
4441 /* If reset handler not implemented, continue; otherwise return */
4442 if (r == -ENOSYS)
4443 r = 0;
4444 else
04442bf7
LL
4445 return r;
4446
4447 /* Reset handler not implemented, use the default method */
4448 need_full_reset =
4449 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4450 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
4451
26bc5340 4452 /*
655ce9cb 4453 * ASIC reset has to be done on all XGMI hive nodes ASAP
26bc5340
AG
4454 * to allow proper links negotiation in FW (within 1 sec)
4455 */
7ac71382 4456 if (!skip_hw_reset && need_full_reset) {
655ce9cb 4457 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
041a62bc 4458 /* For XGMI run all resets in parallel to speed up the process */
d4535e2c 4459 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
e3c1b071 4460 tmp_adev->gmc.xgmi.pending_reset = false;
c96cf282 4461 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
d4535e2c
AG
4462 r = -EALREADY;
4463 } else
4464 r = amdgpu_asic_reset(tmp_adev);
d4535e2c 4465
041a62bc 4466 if (r) {
aac89168 4467 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
4a580877 4468 r, adev_to_drm(tmp_adev)->unique);
041a62bc 4469 break;
ce316fa5
LM
4470 }
4471 }
4472
041a62bc
AG
4473 /* For XGMI wait for all resets to complete before proceed */
4474 if (!r) {
655ce9cb 4475 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
ce316fa5
LM
4476 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4477 flush_work(&tmp_adev->xgmi_reset_work);
4478 r = tmp_adev->asic_reset_res;
4479 if (r)
4480 break;
ce316fa5
LM
4481 }
4482 }
4483 }
ce316fa5 4484 }
26bc5340 4485
43c4d576 4486 if (!r && amdgpu_ras_intr_triggered()) {
655ce9cb 4487 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
8bc7b360
HZ
4488 if (tmp_adev->mmhub.ras_funcs &&
4489 tmp_adev->mmhub.ras_funcs->reset_ras_error_count)
4490 tmp_adev->mmhub.ras_funcs->reset_ras_error_count(tmp_adev);
43c4d576
JC
4491 }
4492
00eaa571 4493 amdgpu_ras_intr_cleared();
43c4d576 4494 }
00eaa571 4495
655ce9cb 4496 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
26bc5340
AG
4497 if (need_full_reset) {
4498 /* post card */
e3c1b071 4499 r = amdgpu_device_asic_init(tmp_adev);
4500 if (r) {
aac89168 4501 dev_warn(tmp_adev->dev, "asic atom init failed!");
e3c1b071 4502 } else {
26bc5340
AG
4503 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4504 r = amdgpu_device_ip_resume_phase1(tmp_adev);
4505 if (r)
4506 goto out;
4507
4508 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4509 if (vram_lost) {
77e7f829 4510 DRM_INFO("VRAM is lost due to GPU reset!\n");
e3526257 4511 amdgpu_inc_vram_lost(tmp_adev);
26bc5340
AG
4512 }
4513
6c28aed6 4514 r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
26bc5340
AG
4515 if (r)
4516 goto out;
4517
4518 r = amdgpu_device_fw_loading(tmp_adev);
4519 if (r)
4520 return r;
4521
4522 r = amdgpu_device_ip_resume_phase2(tmp_adev);
4523 if (r)
4524 goto out;
4525
4526 if (vram_lost)
4527 amdgpu_device_fill_reset_magic(tmp_adev);
4528
fdafb359
EQ
4529 /*
4530 * Add this ASIC as tracked as reset was already
4531 * complete successfully.
4532 */
4533 amdgpu_register_gpu_instance(tmp_adev);
4534
04442bf7
LL
4535 if (!reset_context->hive &&
4536 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
e3c1b071 4537 amdgpu_xgmi_add_device(tmp_adev);
4538
7c04ca50 4539 r = amdgpu_device_ip_late_init(tmp_adev);
4540 if (r)
4541 goto out;
4542
565d1941
EQ
4543 amdgpu_fbdev_set_suspend(tmp_adev, 0);
4544
e8fbaf03
GC
4545 /*
4546 * The GPU enters bad state once faulty pages
4547 * by ECC has reached the threshold, and ras
4548 * recovery is scheduled next. So add one check
4549 * here to break recovery if it indeed exceeds
4550 * bad page threshold, and remind user to
4551 * retire this GPU or setting one bigger
4552 * bad_page_threshold value to fix this once
4553 * probing driver again.
4554 */
11003c68 4555 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
e8fbaf03
GC
4556 /* must succeed. */
4557 amdgpu_ras_resume(tmp_adev);
4558 } else {
4559 r = -EINVAL;
4560 goto out;
4561 }
e79a04d5 4562
26bc5340 4563 /* Update PSP FW topology after reset */
04442bf7
LL
4564 if (reset_context->hive &&
4565 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4566 r = amdgpu_xgmi_update_topology(
4567 reset_context->hive, tmp_adev);
26bc5340
AG
4568 }
4569 }
4570
26bc5340
AG
4571out:
4572 if (!r) {
4573 amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4574 r = amdgpu_ib_ring_tests(tmp_adev);
4575 if (r) {
4576 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
26bc5340
AG
4577 need_full_reset = true;
4578 r = -EAGAIN;
4579 goto end;
4580 }
4581 }
4582
4583 if (!r)
4584 r = amdgpu_device_recover_vram(tmp_adev);
4585 else
4586 tmp_adev->asic_reset_res = r;
4587 }
4588
4589end:
04442bf7
LL
4590 if (need_full_reset)
4591 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4592 else
4593 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
26bc5340
AG
4594 return r;
4595}
4596
08ebb485
DL
4597static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
4598 struct amdgpu_hive_info *hive)
26bc5340 4599{
53b3f8f4
DL
4600 if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
4601 return false;
4602
08ebb485
DL
4603 if (hive) {
4604 down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
4605 } else {
4606 down_write(&adev->reset_sem);
4607 }
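/*
 * The nest_lock annotation is presumably what keeps lockdep quiet when
 * every adev's reset_sem in a hive is taken while hive->hive_lock is
 * already held by the caller.
 */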
5740682e 4608
a3a09142
AD
4609 switch (amdgpu_asic_reset_method(adev)) {
4610 case AMD_RESET_METHOD_MODE1:
4611 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
4612 break;
4613 case AMD_RESET_METHOD_MODE2:
4614 adev->mp1_state = PP_MP1_STATE_RESET;
4615 break;
4616 default:
4617 adev->mp1_state = PP_MP1_STATE_NONE;
4618 break;
4619 }
1d721ed6
AG
4620
4621 return true;
26bc5340 4622}
d38ceaf9 4623
26bc5340
AG
4624static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
4625{
89041940 4626 amdgpu_vf_error_trans_all(adev);
a3a09142 4627 adev->mp1_state = PP_MP1_STATE_NONE;
53b3f8f4 4628 atomic_set(&adev->in_gpu_reset, 0);
6049db43 4629 up_write(&adev->reset_sem);
26bc5340
AG
4630}
4631
91fb309d
HC
4632/*
4633 * to lock a list of amdgpu devices in a hive safely; if it is not a hive
4634 * with multiple nodes, this behaves like amdgpu_device_lock_adev.
4635 *
4636 * unlock does not require a roll back.
4637 */
4638static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
4639{
4640 struct amdgpu_device *tmp_adev = NULL;
4641
4642 if (adev->gmc.xgmi.num_physical_nodes > 1) {
4643 if (!hive) {
4644 dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
4645 return -ENODEV;
4646 }
4647 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
4648 if (!amdgpu_device_lock_adev(tmp_adev, hive))
4649 goto roll_back;
4650 }
4651 } else if (!amdgpu_device_lock_adev(adev, hive))
4652 return -EAGAIN;
4653
4654 return 0;
4655roll_back:
4656 if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) {
4657 /*
4658 * if the locking iteration breaks in the middle of a hive,
4659 * it may mean there is a race issue,
4660 * or a hive device locked up independently.
4661 * we may or may not be in trouble, so try to roll back
4662 * the locks and give out a warning.
4663 */
4664 dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. Rolling back to unlock");
4665 list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) {
4666 amdgpu_device_unlock_adev(tmp_adev);
4667 }
4668 }
4669 return -EAGAIN;
4670}
4671
3f12acc8
EQ
4672static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
4673{
4674 struct pci_dev *p = NULL;
4675
4676 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4677 adev->pdev->bus->number, 1);
4678 if (p) {
4679 pm_runtime_enable(&(p->dev));
4680 pm_runtime_resume(&(p->dev));
4681 }
4682}
4683
4684static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
4685{
4686 enum amd_reset_method reset_method;
4687 struct pci_dev *p = NULL;
4688 u64 expires;
4689
4690 /*
4691 * For now, only BACO and mode1 reset are confirmed
4692 * to suffer the audio issue without proper suspended.
4693 */
4694 reset_method = amdgpu_asic_reset_method(adev);
4695 if ((reset_method != AMD_RESET_METHOD_BACO) &&
4696 (reset_method != AMD_RESET_METHOD_MODE1))
4697 return -EINVAL;
4698
4699 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4700 adev->pdev->bus->number, 1);
4701 if (!p)
4702 return -ENODEV;
4703
4704 expires = pm_runtime_autosuspend_expiration(&(p->dev));
4705 if (!expires)
4706 /*
4707 * If we cannot get the audio device autosuspend delay,
4708 * a fixed 4S interval will be used. Since 3S is
4709 * the audio controller's default autosuspend delay setting,
4710 * the 4S used here is guaranteed to cover it.
4711 */
54b7feb9 4712 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
3f12acc8
EQ
4713
4714 while (!pm_runtime_status_suspended(&(p->dev))) {
4715 if (!pm_runtime_suspend(&(p->dev)))
4716 break;
4717
4718 if (expires < ktime_get_mono_fast_ns()) {
4719 dev_warn(adev->dev, "failed to suspend display audio\n");
4720 /* TODO: abort the succeeding gpu reset? */
4721 return -ETIMEDOUT;
4722 }
4723 }
4724
4725 pm_runtime_disable(&(p->dev));
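/*
 * Runtime PM stays disabled for the audio function until
 * amdgpu_device_resume_display_audio() re-enables and resumes it
 * after the reset completes.
 */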
4726
4727 return 0;
4728}
4729
9d8d96be 4730static void amdgpu_device_recheck_guilty_jobs(
04442bf7
LL
4731 struct amdgpu_device *adev, struct list_head *device_list_handle,
4732 struct amdgpu_reset_context *reset_context)
e6c6338f
JZ
4733{
4734 int i, r = 0;
4735
4736 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4737 struct amdgpu_ring *ring = adev->rings[i];
4738 int ret = 0;
4739 struct drm_sched_job *s_job;
4740
4741 if (!ring || !ring->sched.thread)
4742 continue;
4743
4744 s_job = list_first_entry_or_null(&ring->sched.pending_list,
4745 struct drm_sched_job, list);
4746 if (s_job == NULL)
4747 continue;
4748
4749 /* clear the job's guilty flag and rely on the following step to decide the real one */
4750 drm_sched_reset_karma(s_job);
4751 drm_sched_resubmit_jobs_ext(&ring->sched, 1);
4752
4753 ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout);
4754 if (ret == 0) { /* timeout */
4755 DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n",
4756 ring->sched.name, s_job->id);
4757
4758 /* set guilty */
4759 drm_sched_increase_karma(s_job);
4760retry:
4761 /* do hw reset */
4762 if (amdgpu_sriov_vf(adev)) {
4763 amdgpu_virt_fini_data_exchange(adev);
4764 r = amdgpu_device_reset_sriov(adev, false);
4765 if (r)
4766 adev->asic_reset_res = r;
4767 } else {
04442bf7
LL
4768 clear_bit(AMDGPU_SKIP_HW_RESET,
4769 &reset_context->flags);
4770 r = amdgpu_do_asic_reset(device_list_handle,
4771 reset_context);
e6c6338f
JZ
4772 if (r && r == -EAGAIN)
4773 goto retry;
4774 }
4775
4776 /*
4777 * add reset counter so that the following
4778 * resubmitted job could flush vmid
4779 */
4780 atomic_inc(&adev->gpu_reset_counter);
4781 continue;
4782 }
4783
4784 /* got the hw fence, signal finished fence */
4785 atomic_dec(ring->sched.score);
4786 dma_fence_get(&s_job->s_fence->finished);
4787 dma_fence_signal(&s_job->s_fence->finished);
4788 dma_fence_put(&s_job->s_fence->finished);
4789
4790 /* remove node from list and free the job */
4791 spin_lock(&ring->sched.job_list_lock);
4792 list_del_init(&s_job->list);
4793 spin_unlock(&ring->sched.job_list_lock);
4794 ring->sched.ops->free_job(s_job);
4795 }
4796}
4797
26bc5340
AG
4798/**
4799 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
4800 *
982a820b 4801 * @adev: amdgpu_device pointer
26bc5340
AG
4802 * @job: which job trigger hang
4803 *
4804 * Attempt to reset the GPU if it has hung (all asics).
4805 * Attempt to do soft-reset or full-reset and reinitialize Asic
4806 * Returns 0 for success or an error on failure.
4807 */
4808
4809int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
4810 struct amdgpu_job *job)
4811{
1d721ed6 4812 struct list_head device_list, *device_list_handle = NULL;
7dd8c205 4813 bool job_signaled = false;
26bc5340 4814 struct amdgpu_hive_info *hive = NULL;
26bc5340 4815 struct amdgpu_device *tmp_adev = NULL;
1d721ed6 4816 int i, r = 0;
bb5c7235 4817 bool need_emergency_restart = false;
3f12acc8 4818 bool audio_suspended = false;
e6c6338f 4819 int tmp_vram_lost_counter;
04442bf7
LL
4820 struct amdgpu_reset_context reset_context;
4821
4822 memset(&reset_context, 0, sizeof(reset_context));
26bc5340 4823
6e3cd2a9 4824 /*
bb5c7235
WS
4825 * Special case: RAS triggered and full reset isn't supported
4826 */
4827 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
4828
d5ea093e
AG
4829 /*
4830 * Flush RAM to disk so that after reboot
4831 * the user can read log and see why the system rebooted.
4832 */
bb5c7235 4833 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
d5ea093e
AG
4834 DRM_WARN("Emergency reboot.");
4835
4836 ksys_sync_helper();
4837 emergency_restart();
4838 }
4839
b823821f 4840 dev_info(adev->dev, "GPU %s begin!\n",
bb5c7235 4841 need_emergency_restart ? "jobs stop":"reset");
26bc5340
AG
4842
4843 /*
1d721ed6
AG
4844 * Here we trylock to avoid a chain of resets executing, triggered
4845 * either by jobs on different adevs in an XGMI hive or by jobs on
4846 * different schedulers of the same device, while this TO handler is running.
4847 * We always reset all schedulers for a device and all devices in an XGMI
4848 * hive, so that should take care of them too.
26bc5340 4849 */
d95e8e97 4850 hive = amdgpu_get_xgmi_hive(adev);
53b3f8f4
DL
4851 if (hive) {
4852 if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
4853 DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
4854 job ? job->base.id : -1, hive->hive_id);
d95e8e97 4855 amdgpu_put_xgmi_hive(hive);
91fb309d
HC
4856 if (job)
4857 drm_sched_increase_karma(&job->base);
53b3f8f4
DL
4858 return 0;
4859 }
4860 mutex_lock(&hive->hive_lock);
1d721ed6 4861 }
26bc5340 4862
04442bf7
LL
4863 reset_context.method = AMD_RESET_METHOD_NONE;
4864 reset_context.reset_req_dev = adev;
4865 reset_context.job = job;
4866 reset_context.hive = hive;
4867 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
4868
91fb309d
HC
4869 /*
4870 * lock the device before we try to operate on the linked list;
4871 * if we didn't get the device lock, don't touch the linked list since
4872 * others may be iterating it.
4873 */
4874 r = amdgpu_device_lock_hive_adev(adev, hive);
4875 if (r) {
4876 dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
4877 job ? job->base.id : -1);
4878
4879 /* even we skipped this reset, still need to set the job to guilty */
4880 if (job)
4881 drm_sched_increase_karma(&job->base);
4882 goto skip_recovery;
4883 }
4884
9e94d22c
EQ
4885 /*
4886 * Build list of devices to reset.
4887 * In case we are in XGMI hive mode, resort the device list
4888 * to put adev in the 1st position.
4889 */
4890 INIT_LIST_HEAD(&device_list);
4891 if (adev->gmc.xgmi.num_physical_nodes > 1) {
655ce9cb 4892 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
4893 list_add_tail(&tmp_adev->reset_list, &device_list);
4894 if (!list_is_first(&adev->reset_list, &device_list))
4895 list_rotate_to_front(&adev->reset_list, &device_list);
4896 device_list_handle = &device_list;
26bc5340 4897 } else {
655ce9cb 4898 list_add_tail(&adev->reset_list, &device_list);
26bc5340
AG
4899 device_list_handle = &device_list;
4900 }
4901
1d721ed6 4902 /* block all schedulers and reset given job's ring */
655ce9cb 4903 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
3f12acc8
EQ
4904 /*
4905 * Try to put the audio codec into suspend state
4906 * before gpu reset starts.
4907 *
4908 * The power domain of the graphics device is shared
4909 * with the AZ (audio) power domain. Without this,
4910 * we may change the audio hardware from behind
4911 * the audio driver's back. That will trigger
4912 * some audio codec errors.
4913 */
4914 if (!amdgpu_device_suspend_display_audio(tmp_adev))
4915 audio_suspended = true;
4916
9e94d22c
EQ
4917 amdgpu_ras_set_error_query_ready(tmp_adev, false);
4918
52fb44cf
EQ
4919 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
4920
9e94d22c
EQ
4921 if (!amdgpu_sriov_vf(tmp_adev))
4922 amdgpu_amdkfd_pre_reset(tmp_adev);
4923
12ffa55d
AG
4924 /*
4925 * Mark these ASICs to be reset as untracked first
4926 * and add them back after reset completes
4927 */
4928 amdgpu_unregister_gpu_instance(tmp_adev);
4929
a2f63ee8 4930 amdgpu_fbdev_set_suspend(tmp_adev, 1);
565d1941 4931
f1c1314b 4932 /* disable ras on ALL IPs */
bb5c7235 4933 if (!need_emergency_restart &&
b823821f 4934 amdgpu_device_ip_need_full_reset(tmp_adev))
f1c1314b 4935 amdgpu_ras_suspend(tmp_adev);
4936
1d721ed6
AG
4937 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4938 struct amdgpu_ring *ring = tmp_adev->rings[i];
4939
4940 if (!ring || !ring->sched.thread)
4941 continue;
4942
0b2d2c2e 4943 drm_sched_stop(&ring->sched, job ? &job->base : NULL);
7c6e68c7 4944
bb5c7235 4945 if (need_emergency_restart)
7c6e68c7 4946 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
1d721ed6 4947 }
8f8c80f4 4948 atomic_inc(&tmp_adev->gpu_reset_counter);
1d721ed6
AG
4949 }
4950
bb5c7235 4951 if (need_emergency_restart)
7c6e68c7
AG
4952 goto skip_sched_resume;
4953
1d721ed6
AG
4954 /*
4955 * Must check guilty signal here since after this point all old
4956 * HW fences are force signaled.
4957 *
4958 * job->base holds a reference to parent fence
4959 */
4960 if (job && job->base.s_fence->parent &&
7dd8c205 4961 dma_fence_is_signaled(job->base.s_fence->parent)) {
1d721ed6 4962 job_signaled = true;
1d721ed6
AG
4963 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
4964 goto skip_hw_reset;
4965 }
4966
26bc5340 4967retry: /* Rest of adevs pre asic reset from XGMI hive. */
655ce9cb 4968 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
04442bf7 4969 r = amdgpu_device_pre_asic_reset(tmp_adev, &reset_context);
26bc5340
AG
4970 /*TODO Should we stop ?*/
4971 if (r) {
aac89168 4972 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
4a580877 4973 r, adev_to_drm(tmp_adev)->unique);
26bc5340
AG
4974 tmp_adev->asic_reset_res = r;
4975 }
4976 }
4977
e6c6338f 4978 tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
26bc5340
AG
4979 /* Actual ASIC resets if needed.*/
4980 /* TODO Implement XGMI hive reset logic for SRIOV */
4981 if (amdgpu_sriov_vf(adev)) {
4982 r = amdgpu_device_reset_sriov(adev, job ? false : true);
4983 if (r)
4984 adev->asic_reset_res = r;
4985 } else {
04442bf7 4986 r = amdgpu_do_asic_reset(device_list_handle, &reset_context);
26bc5340
AG
4987 if (r && r == -EAGAIN)
4988 goto retry;
4989 }
4990
1d721ed6
AG
4991skip_hw_reset:
4992
26bc5340 4993 /* Post ASIC reset for all devs .*/
655ce9cb 4994 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
7c6e68c7 4995
e6c6338f
JZ
4996 /*
4997 * Sometimes a later bad compute job can block a good gfx job because the
4998 * gfx and compute rings share internal GC hardware. We add an additional
4999 * guilty-job recheck step to find the real guilty job: it synchronously
5000 * submits and waits for the first job to be signaled. If that times out,
5001 * we identify it as the real guilty job.
5002 */
5003 if (amdgpu_gpu_recovery == 2 &&
5004 !(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter)))
04442bf7
LL
5005 amdgpu_device_recheck_guilty_jobs(
5006 tmp_adev, device_list_handle, &reset_context);
e6c6338f 5007
1d721ed6
AG
5008 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5009 struct amdgpu_ring *ring = tmp_adev->rings[i];
5010
5011 if (!ring || !ring->sched.thread)
5012 continue;
5013
5014 /* No point to resubmit jobs if we didn't HW reset*/
5015 if (!tmp_adev->asic_reset_res && !job_signaled)
5016 drm_sched_resubmit_jobs(&ring->sched);
5017
5018 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
5019 }
5020
5021 if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
4a580877 5022 drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
1d721ed6
AG
5023 }
5024
5025 tmp_adev->asic_reset_res = 0;
26bc5340
AG
5026
5027 if (r) {
5028 /* bad news, how to tell it to userspace ? */
12ffa55d 5029 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
26bc5340
AG
5030 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
5031 } else {
12ffa55d 5032 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
3fa8f89d
S
5033 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
5034 DRM_WARN("smart shift update failed\n");
26bc5340 5035 }
7c6e68c7 5036 }
26bc5340 5037
7c6e68c7 5038skip_sched_resume:
655ce9cb 5039 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
8e2712e7 5040 /* unlock kfd: SRIOV would do it separately */
bb5c7235 5041 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
7c6e68c7 5042 amdgpu_amdkfd_post_reset(tmp_adev);
8e2712e7 5043
 5044		/* kfd_post_reset will do nothing if the kfd device is not initialized;
 5045		 * we need to bring up kfd here if it was not initialized before.
 5046		 */
5047 if (!adev->kfd.init_complete)
5048 amdgpu_amdkfd_device_init(adev);
5049
3f12acc8
EQ
5050 if (audio_suspended)
5051 amdgpu_device_resume_display_audio(tmp_adev);
26bc5340
AG
5052 amdgpu_device_unlock_adev(tmp_adev);
5053 }
5054
cbfd17f7 5055skip_recovery:
9e94d22c 5056 if (hive) {
53b3f8f4 5057 atomic_set(&hive->in_reset, 0);
9e94d22c 5058 mutex_unlock(&hive->hive_lock);
d95e8e97 5059 amdgpu_put_xgmi_hive(hive);
9e94d22c 5060 }
26bc5340 5061
91fb309d 5062 if (r && r != -EAGAIN)
26bc5340 5063 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
d38ceaf9
AD
5064 return r;
5065}
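/*
 * Usage note (illustrative sketch, not part of the original source): the
 * guilty-job recheck performed above is only exercised when the driver is
 * loaded with gpu_recovery set to 2, e.g. via the "amdgpu.gpu_recovery=2"
 * kernel command line option, assuming the standard amdgpu module
 * parameter of that name.
 */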
5066
e3ecdffa
AD
5067/**
 5068 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
5069 *
5070 * @adev: amdgpu_device pointer
5071 *
 5072 * Fetches and stores in the driver the PCIE capabilities (gen speed
5073 * and lanes) of the slot the device is in. Handles APUs and
5074 * virtualized environments where PCIE config space may not be available.
5075 */
5494d864 5076static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
d0dd7f0c 5077{
5d9a6330 5078 struct pci_dev *pdev;
c5313457
HK
5079 enum pci_bus_speed speed_cap, platform_speed_cap;
5080 enum pcie_link_width platform_link_width;
d0dd7f0c 5081
cd474ba0
AD
5082 if (amdgpu_pcie_gen_cap)
5083 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
d0dd7f0c 5084
cd474ba0
AD
5085 if (amdgpu_pcie_lane_cap)
5086 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
d0dd7f0c 5087
cd474ba0
AD
5088 /* covers APUs as well */
5089 if (pci_is_root_bus(adev->pdev->bus)) {
5090 if (adev->pm.pcie_gen_mask == 0)
5091 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
5092 if (adev->pm.pcie_mlw_mask == 0)
5093 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
d0dd7f0c 5094 return;
cd474ba0 5095 }
d0dd7f0c 5096
c5313457
HK
5097 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
5098 return;
5099
dbaa922b
AD
5100 pcie_bandwidth_available(adev->pdev, NULL,
5101 &platform_speed_cap, &platform_link_width);
c5313457 5102
cd474ba0 5103 if (adev->pm.pcie_gen_mask == 0) {
5d9a6330
AD
5104 /* asic caps */
5105 pdev = adev->pdev;
5106 speed_cap = pcie_get_speed_cap(pdev);
5107 if (speed_cap == PCI_SPEED_UNKNOWN) {
5108 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
cd474ba0
AD
5109 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5110 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
cd474ba0 5111 } else {
2b3a1f51
FX
5112 if (speed_cap == PCIE_SPEED_32_0GT)
5113 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5114 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5115 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5116 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5117 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
5118 else if (speed_cap == PCIE_SPEED_16_0GT)
5d9a6330
AD
5119 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5120 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5121 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5122 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
5123 else if (speed_cap == PCIE_SPEED_8_0GT)
5124 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5125 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5126 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5127 else if (speed_cap == PCIE_SPEED_5_0GT)
5128 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5129 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
5130 else
5131 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
5132 }
5133 /* platform caps */
c5313457 5134 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
5d9a6330
AD
5135 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5136 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5137 } else {
2b3a1f51
FX
5138 if (platform_speed_cap == PCIE_SPEED_32_0GT)
5139 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5140 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5141 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5142 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5143 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
5144 else if (platform_speed_cap == PCIE_SPEED_16_0GT)
5d9a6330
AD
5145 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5146 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5147 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5148 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
c5313457 5149 else if (platform_speed_cap == PCIE_SPEED_8_0GT)
5d9a6330
AD
5150 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5151 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5152 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
c5313457 5153 else if (platform_speed_cap == PCIE_SPEED_5_0GT)
5d9a6330
AD
5154 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5155 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5156 else
5157 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
5158
cd474ba0
AD
5159 }
5160 }
5161 if (adev->pm.pcie_mlw_mask == 0) {
c5313457 5162 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
5d9a6330
AD
5163 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
5164 } else {
c5313457 5165 switch (platform_link_width) {
5d9a6330 5166 case PCIE_LNK_X32:
cd474ba0
AD
5167 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
5168 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5169 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5170 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5171 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5172 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5173 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5174 break;
5d9a6330 5175 case PCIE_LNK_X16:
cd474ba0
AD
5176 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5177 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5178 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5179 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5180 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5181 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5182 break;
5d9a6330 5183 case PCIE_LNK_X12:
cd474ba0
AD
5184 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5185 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5186 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5187 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5188 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5189 break;
5d9a6330 5190 case PCIE_LNK_X8:
cd474ba0
AD
5191 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5192 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5193 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5194 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5195 break;
5d9a6330 5196 case PCIE_LNK_X4:
cd474ba0
AD
5197 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5198 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5199 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5200 break;
5d9a6330 5201 case PCIE_LNK_X2:
cd474ba0
AD
5202 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5203 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5204 break;
5d9a6330 5205 case PCIE_LNK_X1:
cd474ba0
AD
5206 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
5207 break;
5208 default:
5209 break;
5210 }
d0dd7f0c
AD
5211 }
5212 }
5213}
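/*
 * Minimal usage sketch (illustrative, not taken from this file): power
 * management code consults the masks filled in above, e.g. to check that
 * both the ASIC and the platform support at least PCIe gen3:
 *
 *	if ((adev->pm.pcie_gen_mask & CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3) &&
 *	    (adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3))
 *		gen3_supported = true;
 *
 * where "gen3_supported" is just a hypothetical local flag.
 */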
d38ceaf9 5214
361dbd01
AD
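/**
 * amdgpu_device_baco_enter - enter the BACO (Bus Active, Chip Off) state
 *
 * @dev: drm_device pointer
 *
 * Puts the device into the BACO low power state, first disabling the NBIO
 * doorbell interrupt when RAS is enabled.  Returns 0 on success, -ENOTSUPP
 * if the device does not support BACO, or a negative error code from the
 * DPM layer.
 */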
5215int amdgpu_device_baco_enter(struct drm_device *dev)
5216{
1348969a 5217 struct amdgpu_device *adev = drm_to_adev(dev);
7a22677b 5218 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
361dbd01 5219
4a580877 5220 if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
361dbd01
AD
5221 return -ENOTSUPP;
5222
8ab0d6f0 5223 if (ras && adev->ras_enabled &&
acdae216 5224 adev->nbio.funcs->enable_doorbell_interrupt)
7a22677b
LM
5225 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
5226
9530273e 5227 return amdgpu_dpm_baco_enter(adev);
361dbd01
AD
5228}
5229
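/**
 * amdgpu_device_baco_exit - exit the BACO (Bus Active, Chip Off) state
 *
 * @dev: drm_device pointer
 *
 * Brings the device back out of the BACO low power state and re-enables
 * the NBIO doorbell interrupt when RAS is enabled.  Returns 0 on success,
 * -ENOTSUPP if the device does not support BACO, or a negative error code
 * from the DPM layer.
 */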
5230int amdgpu_device_baco_exit(struct drm_device *dev)
5231{
1348969a 5232 struct amdgpu_device *adev = drm_to_adev(dev);
7a22677b 5233 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
9530273e 5234 int ret = 0;
361dbd01 5235
4a580877 5236 if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
361dbd01
AD
5237 return -ENOTSUPP;
5238
9530273e
EQ
5239 ret = amdgpu_dpm_baco_exit(adev);
5240 if (ret)
5241 return ret;
7a22677b 5242
8ab0d6f0 5243 if (ras && adev->ras_enabled &&
acdae216 5244 adev->nbio.funcs->enable_doorbell_interrupt)
7a22677b
LM
5245 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
5246
5247 return 0;
361dbd01 5248}
c9a6b82f 5249
acd89fca
AG
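/**
 * amdgpu_cancel_all_tdr - cancel all pending timeout handlers
 *
 * @adev: amdgpu_device pointer
 *
 * Cancels and waits for the delayed TDR (timeout detection and recovery)
 * work of every initialized ring scheduler so that no timeout handler runs
 * concurrently with the caller.
 */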
5250static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev)
5251{
5252 int i;
5253
5254 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5255 struct amdgpu_ring *ring = adev->rings[i];
5256
5257 if (!ring || !ring->sched.thread)
5258 continue;
5259
5260 cancel_delayed_work_sync(&ring->sched.work_tdr);
5261 }
5262}
5263
c9a6b82f
AG
5264/**
5265 * amdgpu_pci_error_detected - Called when a PCI error is detected.
5266 * @pdev: PCI device struct
5267 * @state: PCI channel state
5268 *
5269 * Description: Called when a PCI error is detected.
5270 *
 5271 * Return: PCI_ERS_RESULT_CAN_RECOVER, PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
5272 */
5273pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
5274{
5275 struct drm_device *dev = pci_get_drvdata(pdev);
5276 struct amdgpu_device *adev = drm_to_adev(dev);
acd89fca 5277 int i;
c9a6b82f
AG
5278
5279 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
5280
6894305c
AG
5281 if (adev->gmc.xgmi.num_physical_nodes > 1) {
5282 DRM_WARN("No support for XGMI hive yet...");
5283 return PCI_ERS_RESULT_DISCONNECT;
5284 }
5285
c9a6b82f
AG
5286 switch (state) {
5287 case pci_channel_io_normal:
5288 return PCI_ERS_RESULT_CAN_RECOVER;
acd89fca 5289 /* Fatal error, prepare for slot reset */
8a11d283
TZ
5290 case pci_channel_io_frozen:
5291 /*
acd89fca
AG
 5292		 * Cancel and wait for all TDRs in progress if we fail to
 5293		 * set adev->in_gpu_reset in amdgpu_device_lock_adev
5294 *
5295 * Locking adev->reset_sem will prevent any external access
5296 * to GPU during PCI error recovery
5297 */
5298 while (!amdgpu_device_lock_adev(adev, NULL))
5299 amdgpu_cancel_all_tdr(adev);
5300
5301 /*
5302 * Block any work scheduling as we do for regular GPU reset
5303 * for the duration of the recovery
5304 */
5305 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5306 struct amdgpu_ring *ring = adev->rings[i];
5307
5308 if (!ring || !ring->sched.thread)
5309 continue;
5310
5311 drm_sched_stop(&ring->sched, NULL);
5312 }
8f8c80f4 5313 atomic_inc(&adev->gpu_reset_counter);
c9a6b82f
AG
5314 return PCI_ERS_RESULT_NEED_RESET;
5315 case pci_channel_io_perm_failure:
5316 /* Permanent error, prepare for device removal */
5317 return PCI_ERS_RESULT_DISCONNECT;
5318 }
5319
5320 return PCI_ERS_RESULT_NEED_RESET;
5321}
5322
5323/**
5324 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
5325 * @pdev: pointer to PCI device
5326 */
5327pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
5328{
5329
5330 DRM_INFO("PCI error: mmio enabled callback!!\n");
5331
5332 /* TODO - dump whatever for debugging purposes */
5333
 5334	/* This is called only if amdgpu_pci_error_detected returns
5335 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
5336 * works, no need to reset slot.
5337 */
5338
5339 return PCI_ERS_RESULT_RECOVERED;
5340}
5341
5342/**
5343 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
5344 * @pdev: PCI device struct
5345 *
5346 * Description: This routine is called by the pci error recovery
5347 * code after the PCI slot has been reset, just before we
5348 * should resume normal operations.
5349 */
5350pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
5351{
5352 struct drm_device *dev = pci_get_drvdata(pdev);
5353 struct amdgpu_device *adev = drm_to_adev(dev);
362c7b91 5354 int r, i;
04442bf7 5355 struct amdgpu_reset_context reset_context;
362c7b91 5356 u32 memsize;
7ac71382 5357 struct list_head device_list;
c9a6b82f
AG
5358
5359 DRM_INFO("PCI error: slot reset callback!!\n");
5360
04442bf7
LL
5361 memset(&reset_context, 0, sizeof(reset_context));
5362
7ac71382 5363 INIT_LIST_HEAD(&device_list);
655ce9cb 5364 list_add_tail(&adev->reset_list, &device_list);
7ac71382 5365
362c7b91
AG
5366 /* wait for asic to come out of reset */
5367 msleep(500);
5368
7ac71382 5369 /* Restore PCI confspace */
c1dd4aa6 5370 amdgpu_device_load_pci_state(pdev);
c9a6b82f 5371
362c7b91
AG
5372 /* confirm ASIC came out of reset */
5373 for (i = 0; i < adev->usec_timeout; i++) {
5374 memsize = amdgpu_asic_get_config_memsize(adev);
5375
5376 if (memsize != 0xffffffff)
5377 break;
5378 udelay(1);
5379 }
5380 if (memsize == 0xffffffff) {
5381 r = -ETIME;
5382 goto out;
5383 }
5384
04442bf7
LL
5385 reset_context.method = AMD_RESET_METHOD_NONE;
5386 reset_context.reset_req_dev = adev;
5387 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
5388 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
5389
7afefb81 5390 adev->no_hw_access = true;
04442bf7 5391 r = amdgpu_device_pre_asic_reset(adev, &reset_context);
7afefb81 5392 adev->no_hw_access = false;
c9a6b82f
AG
5393 if (r)
5394 goto out;
5395
04442bf7 5396 r = amdgpu_do_asic_reset(&device_list, &reset_context);
c9a6b82f
AG
5397
5398out:
c9a6b82f 5399 if (!r) {
c1dd4aa6
AG
5400 if (amdgpu_device_cache_pci_state(adev->pdev))
5401 pci_restore_state(adev->pdev);
5402
c9a6b82f
AG
5403 DRM_INFO("PCIe error recovery succeeded\n");
5404 } else {
5405 DRM_ERROR("PCIe error recovery failed, err:%d", r);
5406 amdgpu_device_unlock_adev(adev);
5407 }
5408
5409 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5410}
5411
5412/**
5413 * amdgpu_pci_resume() - resume normal ops after PCI reset
5414 * @pdev: pointer to PCI device
5415 *
 5416 * Called when the error recovery driver tells us that it's
505199a3 5417 * OK to resume normal operation.
c9a6b82f
AG
5418 */
5419void amdgpu_pci_resume(struct pci_dev *pdev)
5420{
5421 struct drm_device *dev = pci_get_drvdata(pdev);
5422 struct amdgpu_device *adev = drm_to_adev(dev);
acd89fca 5423 int i;
c9a6b82f 5424
c9a6b82f
AG
5425
5426 DRM_INFO("PCI error: resume callback!!\n");
acd89fca
AG
5427
5428 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5429 struct amdgpu_ring *ring = adev->rings[i];
5430
5431 if (!ring || !ring->sched.thread)
5432 continue;
5433
5434
5435 drm_sched_resubmit_jobs(&ring->sched);
5436 drm_sched_start(&ring->sched, true);
5437 }
5438
5439 amdgpu_device_unlock_adev(adev);
c9a6b82f 5440}
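/*
 * Registration sketch (illustrative; in the amdgpu driver this table is
 * defined outside of this file): the four callbacks above are hooked into
 * the PCI core through a struct pci_error_handlers referenced from the
 * driver's struct pci_driver, e.g.:
 *
 *	static const struct pci_error_handlers amdgpu_pci_err_handler = {
 *		.error_detected	= amdgpu_pci_error_detected,
 *		.mmio_enabled	= amdgpu_pci_mmio_enabled,
 *		.slot_reset	= amdgpu_pci_slot_reset,
 *		.resume		= amdgpu_pci_resume,
 *	};
 */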
c1dd4aa6
AG
5441
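/**
 * amdgpu_device_cache_pci_state - cache the PCI config space of the device
 *
 * @pdev: PCI device struct
 *
 * Saves the current PCI configuration space and keeps a kernel-allocated
 * copy in adev->pci_state so that it can be restored after a reset.
 * Returns true on success, false otherwise.
 */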
5442bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5443{
5444 struct drm_device *dev = pci_get_drvdata(pdev);
5445 struct amdgpu_device *adev = drm_to_adev(dev);
5446 int r;
5447
5448 r = pci_save_state(pdev);
5449 if (!r) {
5450 kfree(adev->pci_state);
5451
5452 adev->pci_state = pci_store_saved_state(pdev);
5453
5454 if (!adev->pci_state) {
5455 DRM_ERROR("Failed to store PCI saved state");
5456 return false;
5457 }
5458 } else {
5459 DRM_WARN("Failed to save PCI state, err:%d\n", r);
5460 return false;
5461 }
5462
5463 return true;
5464}
5465
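/**
 * amdgpu_device_load_pci_state - restore the cached PCI config space
 *
 * @pdev: PCI device struct
 *
 * Reloads the PCI configuration space previously cached by
 * amdgpu_device_cache_pci_state() and writes it back to the device.
 * Returns true on success, false if no state was cached or the load failed.
 */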
5466bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
5467{
5468 struct drm_device *dev = pci_get_drvdata(pdev);
5469 struct amdgpu_device *adev = drm_to_adev(dev);
5470 int r;
5471
5472 if (!adev->pci_state)
5473 return false;
5474
5475 r = pci_load_saved_state(pdev, adev->pci_state);
5476
5477 if (!r) {
5478 pci_restore_state(pdev);
5479 } else {
5480 DRM_WARN("Failed to load PCI state, err:%d\n", r);
5481 return false;
5482 }
5483
5484 return true;
5485}
5486
810085dd
EH
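/**
 * amdgpu_device_flush_hdp - flush the HDP (Host Data Path) write cache
 *
 * @adev: amdgpu_device pointer
 * @ring: optional ring used to emit the flush
 *
 * Flushes the HDP cache so that host writes to VRAM become visible to the
 * GPU.  The flush is skipped on APUs and on configurations where the GPU
 * is directly connected to the CPU; otherwise it is emitted on @ring when
 * the ring supports it, or performed through the ASIC callback.
 */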
5487void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
5488 struct amdgpu_ring *ring)
5489{
5490#ifdef CONFIG_X86_64
5491 if (adev->flags & AMD_IS_APU)
5492 return;
5493#endif
5494 if (adev->gmc.xgmi.connected_to_cpu)
5495 return;
5496
5497 if (ring && ring->funcs->emit_hdp_flush)
5498 amdgpu_ring_emit_hdp_flush(ring);
5499 else
5500 amdgpu_asic_flush_hdp(adev, ring);
5501}
c1dd4aa6 5502
810085dd
EH
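/**
 * amdgpu_device_invalidate_hdp - invalidate the HDP (Host Data Path) cache
 *
 * @adev: amdgpu_device pointer
 * @ring: optional ring associated with the invalidation
 *
 * Invalidates the HDP cache so that stale data is not read through the
 * host data path.  Skipped on APUs and on configurations where the GPU is
 * directly connected to the CPU.
 */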
5503void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
5504 struct amdgpu_ring *ring)
5505{
5506#ifdef CONFIG_X86_64
5507 if (adev->flags & AMD_IS_APU)
5508 return;
5509#endif
5510 if (adev->gmc.xgmi.connected_to_cpu)
5511 return;
c1dd4aa6 5512
810085dd
EH
5513 amdgpu_asic_invalidate_hdp(adev, ring);
5514}