drm/amdkfd: Fix a memory limit issue
[linux-2.6-block.git] / drivers / gpu / drm / amd / amdgpu / amdgpu_device.c
1/*
2 * Copyright 2008 Advanced Micro Devices, Inc.
3 * Copyright 2008 Red Hat Inc.
4 * Copyright 2009 Jerome Glisse.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 * OTHER DEALINGS IN THE SOFTWARE.
23 *
24 * Authors: Dave Airlie
25 * Alex Deucher
26 * Jerome Glisse
27 */
b1ddf548 28#include <linux/power_supply.h>
0875dc9e 29#include <linux/kthread.h>
fdf2f6c5 30#include <linux/module.h>
31#include <linux/console.h>
32#include <linux/slab.h>
4a74c38c 33#include <linux/iommu.h>
901e2be2 34#include <linux/pci.h>
35#include <linux/devcoredump.h>
36#include <generated/utsrelease.h>
08a2fd23 37#include <linux/pci-p2pdma.h>
fdf2f6c5 38
4562236b 39#include <drm/drm_atomic_helper.h>
fcd70cd3 40#include <drm/drm_probe_helper.h>
41#include <drm/amdgpu_drm.h>
42#include <linux/vgaarb.h>
43#include <linux/vga_switcheroo.h>
44#include <linux/efi.h>
45#include "amdgpu.h"
f4b373f4 46#include "amdgpu_trace.h"
47#include "amdgpu_i2c.h"
48#include "atom.h"
49#include "amdgpu_atombios.h"
a5bde2f9 50#include "amdgpu_atomfirmware.h"
d0dd7f0c 51#include "amd_pcie.h"
52#ifdef CONFIG_DRM_AMDGPU_SI
53#include "si.h"
54#endif
55#ifdef CONFIG_DRM_AMDGPU_CIK
56#include "cik.h"
57#endif
aaa36a97 58#include "vi.h"
460826e6 59#include "soc15.h"
0a5b8c7b 60#include "nv.h"
d38ceaf9 61#include "bif/bif_4_1_d.h"
bec86378 62#include <linux/firmware.h>
89041940 63#include "amdgpu_vf_error.h"
d38ceaf9 64
ba997709 65#include "amdgpu_amdkfd.h"
d2f52ac8 66#include "amdgpu_pm.h"
d38ceaf9 67
5183411b 68#include "amdgpu_xgmi.h"
c030f2e4 69#include "amdgpu_ras.h"
9c7c85f7 70#include "amdgpu_pmu.h"
bd607166 71#include "amdgpu_fru_eeprom.h"
04442bf7 72#include "amdgpu_reset.h"
5183411b 73
d5ea093e 74#include <linux/suspend.h>
c6a6e2db 75#include <drm/task_barrier.h>
3f12acc8 76#include <linux/pm_runtime.h>
d5ea093e 77
78#include <drm/drm_drv.h>
79
e2a75f88 80MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
3f76dced 81MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
2d2e5e7e 82MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
ad5a67a7 83MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
54c4d17e 84MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
65e60f6e 85MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
42b325e5 86MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
e2a75f88 87
2dc80b00 88#define AMDGPU_RESUME_MS 2000
89#define AMDGPU_MAX_RETRY_LIMIT 2
90#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
2dc80b00 91
050091ab 92const char *amdgpu_asic_name[] = {
93 "TAHITI",
94 "PITCAIRN",
95 "VERDE",
96 "OLAND",
97 "HAINAN",
98 "BONAIRE",
99 "KAVERI",
100 "KABINI",
101 "HAWAII",
102 "MULLINS",
103 "TOPAZ",
104 "TONGA",
48299f95 105 "FIJI",
d38ceaf9 106 "CARRIZO",
139f4917 107 "STONEY",
108 "POLARIS10",
109 "POLARIS11",
c4642a47 110 "POLARIS12",
48ff108d 111 "VEGAM",
d4196f01 112 "VEGA10",
8fab806a 113 "VEGA12",
956fcddc 114 "VEGA20",
2ca8a5d2 115 "RAVEN",
d6c3b24e 116 "ARCTURUS",
1eee4228 117 "RENOIR",
d46b417a 118 "ALDEBARAN",
852a6626 119 "NAVI10",
d0f56dc2 120 "CYAN_SKILLFISH",
87dbad02 121 "NAVI14",
9802f5d7 122 "NAVI12",
ccaf72d3 123 "SIENNA_CICHLID",
ddd8fbe7 124 "NAVY_FLOUNDER",
4f1e9a76 125 "VANGOGH",
a2468e04 126 "DIMGREY_CAVEFISH",
6f169591 127 "BEIGE_GOBY",
ee9236b7 128 "YELLOW_CARP",
3ae695d6 129 "IP DISCOVERY",
130 "LAST",
131};
132
133/**
134 * DOC: pcie_replay_count
135 *
136 * The amdgpu driver provides a sysfs API for reporting the total number
137 * of PCIe replays (NAKs)
138 * The file pcie_replay_count is used for this and returns the total
139 * number of replays as a sum of the NAKs generated and NAKs received
140 */
141
142static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
143 struct device_attribute *attr, char *buf)
144{
145 struct drm_device *ddev = dev_get_drvdata(dev);
1348969a 146 struct amdgpu_device *adev = drm_to_adev(ddev);
147 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
148
36000c7a 149 return sysfs_emit(buf, "%llu\n", cnt);
150}
151
152static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
153 amdgpu_device_get_pcie_replay_count, NULL);
154
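/*
 * Illustrative usage sketch (userspace path layout assumed, not taken from
 * this file): the attribute is read like any other sysfs file, e.g.
 *
 *	$ cat /sys/class/drm/card0/device/pcie_replay_count
 *	0
 */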
155static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
156
157/**
158 * DOC: product_name
159 *
160 * The amdgpu driver provides a sysfs API for reporting the product name
161 * for the device.
162 * The file product_name is used for this and returns the product name
163 * as returned from the FRU.
164 * NOTE: This is only available for certain server cards
165 */
166
167static ssize_t amdgpu_device_get_product_name(struct device *dev,
168 struct device_attribute *attr, char *buf)
169{
170 struct drm_device *ddev = dev_get_drvdata(dev);
1348969a 171 struct amdgpu_device *adev = drm_to_adev(ddev);
bd607166 172
36000c7a 173 return sysfs_emit(buf, "%s\n", adev->product_name);
174}
175
176static DEVICE_ATTR(product_name, S_IRUGO,
177 amdgpu_device_get_product_name, NULL);
178
179/**
180 * DOC: product_number
181 *
182 * The amdgpu driver provides a sysfs API for reporting the part number
183 * for the device.
184 * The file product_number is used for this and returns the part number
185 * as returned from the FRU.
186 * NOTE: This is only available for certain server cards
187 */
188
189static ssize_t amdgpu_device_get_product_number(struct device *dev,
190 struct device_attribute *attr, char *buf)
191{
192 struct drm_device *ddev = dev_get_drvdata(dev);
1348969a 193 struct amdgpu_device *adev = drm_to_adev(ddev);
bd607166 194
36000c7a 195 return sysfs_emit(buf, "%s\n", adev->product_number);
196}
197
198static DEVICE_ATTR(product_number, S_IRUGO,
199 amdgpu_device_get_product_number, NULL);
200
201/**
202 * DOC: serial_number
203 *
204 * The amdgpu driver provides a sysfs API for reporting the serial number
205 * for the device
206 * The file serial_number is used for this and returns the serial number
207 * as returned from the FRU.
208 * NOTE: This is only available for certain server cards
209 */
210
211static ssize_t amdgpu_device_get_serial_number(struct device *dev,
212 struct device_attribute *attr, char *buf)
213{
214 struct drm_device *ddev = dev_get_drvdata(dev);
1348969a 215 struct amdgpu_device *adev = drm_to_adev(ddev);
bd607166 216
36000c7a 217 return sysfs_emit(buf, "%s\n", adev->serial);
218}
219
220static DEVICE_ATTR(serial_number, S_IRUGO,
221 amdgpu_device_get_serial_number, NULL);
222
fd496ca8 223/**
b98c6299 224 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
225 *
226 * @dev: drm_device pointer
227 *
b98c6299 228 * Returns true if the device is a dGPU with ATPX power control,
229 * otherwise return false.
230 */
b98c6299 231bool amdgpu_device_supports_px(struct drm_device *dev)
232{
233 struct amdgpu_device *adev = drm_to_adev(dev);
234
b98c6299 235 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
236 return true;
237 return false;
238}
239
e3ecdffa 240/**
0330b848 241 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
242 *
243 * @dev: drm_device pointer
244 *
b98c6299 245 * Returns true if the device is a dGPU with ACPI power control,
246 * otherwise return false.
247 */
31af062a 248bool amdgpu_device_supports_boco(struct drm_device *dev)
d38ceaf9 249{
1348969a 250 struct amdgpu_device *adev = drm_to_adev(dev);
d38ceaf9 251
252 if (adev->has_pr3 ||
253 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
254 return true;
255 return false;
256}
257
258/**
259 * amdgpu_device_supports_baco - Does the device support BACO
260 *
261 * @dev: drm_device pointer
262 *
263 * Returns true if the device supports BACO,
264 * otherwise return false.
265 */
266bool amdgpu_device_supports_baco(struct drm_device *dev)
267{
1348969a 268 struct amdgpu_device *adev = drm_to_adev(dev);
269
270 return amdgpu_asic_supports_baco(adev);
271}
272
273/**
274 * amdgpu_device_supports_smart_shift - Is the device dGPU with
275 * smart shift support
276 *
277 * @dev: drm_device pointer
278 *
279 * Returns true if the device is a dGPU with Smart Shift support,
280 * otherwise returns false.
281 */
282bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
283{
284 return (amdgpu_device_supports_boco(dev) &&
285 amdgpu_acpi_is_power_shift_control_supported());
286}
287
288/*
289 * VRAM access helper functions
290 */
291
e35e2b11 292/**
048af66b 293 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
294 *
295 * @adev: amdgpu_device pointer
296 * @pos: offset of the buffer in vram
297 * @buf: virtual address of the buffer in system memory
298 * @size: read/write size, the buffer pointed to by @buf must be at least @size bytes
299 * @write: true - write to vram, otherwise - read from vram
300 */
301void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
302 void *buf, size_t size, bool write)
e35e2b11 303{
e35e2b11 304 unsigned long flags;
305 uint32_t hi = ~0, tmp = 0;
306 uint32_t *data = buf;
ce05ac56 307 uint64_t last;
f89f8c6b 308 int idx;
ce05ac56 309
c58a863b 310 if (!drm_dev_enter(adev_to_drm(adev), &idx))
f89f8c6b 311 return;
9d11eb0d 312
313 BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));
314
315 spin_lock_irqsave(&adev->mmio_idx_lock, flags);
316 for (last = pos + size; pos < last; pos += 4) {
317 tmp = pos >> 31;
318
319 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
320 if (tmp != hi) {
321 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
322 hi = tmp;
323 }
324 if (write)
325 WREG32_NO_KIQ(mmMM_DATA, *data++);
326 else
327 *data++ = RREG32_NO_KIQ(mmMM_DATA);
328 }
329
330 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
331 drm_dev_exit(idx);
332}
333
334/**
bbe04dec 335 * amdgpu_device_aper_access - access vram by the vram aperture
336 *
337 * @adev: amdgpu_device pointer
338 * @pos: offset of the buffer in vram
339 * @buf: virtual address of the buffer in system memory
340 * @size: read/write size, the buffer pointed to by @buf must be at least @size bytes
341 * @write: true - write to vram, otherwise - read from vram
342 *
343 * Returns the number of bytes that have been transferred.
344 */
345size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
346 void *buf, size_t size, bool write)
347{
9d11eb0d 348#ifdef CONFIG_64BIT
048af66b
KW
349 void __iomem *addr;
350 size_t count = 0;
351 uint64_t last;
352
353 if (!adev->mman.aper_base_kaddr)
354 return 0;
355
9d11eb0d
CK
356 last = min(pos + size, adev->gmc.visible_vram_size);
357 if (last > pos) {
048af66b
KW
358 addr = adev->mman.aper_base_kaddr + pos;
359 count = last - pos;
9d11eb0d
CK
360
361 if (write) {
362 memcpy_toio(addr, buf, count);
363 mb();
810085dd 364 amdgpu_device_flush_hdp(adev, NULL);
9d11eb0d 365 } else {
810085dd 366 amdgpu_device_invalidate_hdp(adev, NULL);
9d11eb0d
CK
367 mb();
368 memcpy_fromio(buf, addr, count);
369 }
370
9d11eb0d 371 }
048af66b
KW
372
373 return count;
374#else
375 return 0;
9d11eb0d 376#endif
048af66b 377}
9d11eb0d 378
048af66b
KW
379/**
380 * amdgpu_device_vram_access - read/write a buffer in vram
381 *
382 * @adev: amdgpu_device pointer
383 * @pos: offset of the buffer in vram
384 * @buf: virtual address of the buffer in system memory
385 * @size: read/write size, the buffer pointed to by @buf must be at least @size bytes
386 * @write: true - write to vram, otherwise - read from vram
387 */
388void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
389 void *buf, size_t size, bool write)
390{
391 size_t count;
e35e2b11 392
048af66b
KW
393 /* try using the vram aperture to access vram first */
394 count = amdgpu_device_aper_access(adev, pos, buf, size, write);
395 size -= count;
396 if (size) {
397 /* use MM to access the rest of vram */
398 pos += count;
399 buf += count;
400 amdgpu_device_mm_access(adev, pos, buf, size, write);
e35e2b11
TY
401 }
402}
403
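/*
 * Illustrative usage sketch (not part of this file): a caller that wants to
 * copy a small window of VRAM into a kernel buffer can rely on
 * amdgpu_device_vram_access() to prefer the CPU-visible aperture and fall
 * back to the MM_INDEX/MM_DATA window for whatever remains:
 *
 *	u32 sample[4];
 *
 *	amdgpu_device_vram_access(adev, 0, sample, sizeof(sample), false);
 */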
d38ceaf9 404/*
f7ee1874 405 * register access helper functions.
d38ceaf9 406 */
56b53c0b
DL
407
408/* Check if hw access should be skipped because of hotplug or device error */
409bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
410{
7afefb81 411 if (adev->no_hw_access)
56b53c0b
DL
412 return true;
413
414#ifdef CONFIG_LOCKDEP
415 /*
416 * This is a bit complicated to understand, so worth a comment. What we assert
417 * here is that the GPU reset is not running on another thread in parallel.
418 *
419 * For this we trylock the read side of the reset semaphore; if that succeeds
420 * we know that the reset is not running in parallel.
421 *
422 * If the trylock fails we assert that we are either already holding the read
423 * side of the lock or are the reset thread itself and hold the write side of
424 * the lock.
425 */
426 if (in_task()) {
d0fb18b5
AG
427 if (down_read_trylock(&adev->reset_domain->sem))
428 up_read(&adev->reset_domain->sem);
56b53c0b 429 else
d0fb18b5 430 lockdep_assert_held(&adev->reset_domain->sem);
56b53c0b
DL
431 }
432#endif
433 return false;
434}
435
e3ecdffa 436/**
f7ee1874 437 * amdgpu_device_rreg - read a memory mapped IO or indirect register
438 *
439 * @adev: amdgpu_device pointer
440 * @reg: dword aligned register offset
441 * @acc_flags: access flags which require special behavior
442 *
443 * Returns the 32 bit value from the offset specified.
444 */
445uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
446 uint32_t reg, uint32_t acc_flags)
d38ceaf9 447{
f4b373f4
TSD
448 uint32_t ret;
449
56b53c0b 450 if (amdgpu_device_skip_hw_access(adev))
bf36b52e
AG
451 return 0;
452
f7ee1874
HZ
453 if ((reg * 4) < adev->rmmio_size) {
454 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
455 amdgpu_sriov_runtime(adev) &&
d0fb18b5 456 down_read_trylock(&adev->reset_domain->sem)) {
f7ee1874 457 ret = amdgpu_kiq_rreg(adev, reg);
d0fb18b5 458 up_read(&adev->reset_domain->sem);
f7ee1874
HZ
459 } else {
460 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
461 }
462 } else {
463 ret = adev->pcie_rreg(adev, reg * 4);
81202807 464 }
bc992ba5 465
f7ee1874 466 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
e78b579d 467
f4b373f4 468 return ret;
d38ceaf9
AD
469}
470
421a2a30
ML
471/*
472 * MMIO register read with bytes helper functions
473 * @offset: bytes offset from MMIO start
474 *
475*/
476
e3ecdffa
AD
477/**
478 * amdgpu_mm_rreg8 - read a memory mapped IO register
479 *
480 * @adev: amdgpu_device pointer
481 * @offset: byte aligned register offset
482 *
483 * Returns the 8 bit value from the offset specified.
484 */
7cbbc745
AG
485uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
486{
56b53c0b 487 if (amdgpu_device_skip_hw_access(adev))
bf36b52e
AG
488 return 0;
489
421a2a30
ML
490 if (offset < adev->rmmio_size)
491 return (readb(adev->rmmio + offset));
492 BUG();
493}
494
495/*
496 * MMIO register write with bytes helper functions
497 * @offset: bytes offset from MMIO start
498 * @value: the value to be written to the register
499 *
500*/
501/**
502 * amdgpu_mm_wreg8 - write a memory mapped IO register
503 *
504 * @adev: amdgpu_device pointer
505 * @offset: byte aligned register offset
506 * @value: 8 bit value to write
507 *
508 * Writes the value specified to the offset specified.
509 */
510void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
511{
56b53c0b 512 if (amdgpu_device_skip_hw_access(adev))
bf36b52e
AG
513 return;
514
421a2a30
ML
515 if (offset < adev->rmmio_size)
516 writeb(value, adev->rmmio + offset);
517 else
518 BUG();
519}
520
e3ecdffa 521/**
f7ee1874 522 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
523 *
524 * @adev: amdgpu_device pointer
525 * @reg: dword aligned register offset
526 * @v: 32 bit value to write to the register
527 * @acc_flags: access flags which require special behavior
528 *
529 * Writes the value specified to the offset specified.
530 */
f7ee1874
HZ
531void amdgpu_device_wreg(struct amdgpu_device *adev,
532 uint32_t reg, uint32_t v,
533 uint32_t acc_flags)
d38ceaf9 534{
56b53c0b 535 if (amdgpu_device_skip_hw_access(adev))
bf36b52e
AG
536 return;
537
f7ee1874
HZ
538 if ((reg * 4) < adev->rmmio_size) {
539 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
540 amdgpu_sriov_runtime(adev) &&
d0fb18b5 541 down_read_trylock(&adev->reset_domain->sem)) {
f7ee1874 542 amdgpu_kiq_wreg(adev, reg, v);
d0fb18b5 543 up_read(&adev->reset_domain->sem);
f7ee1874
HZ
544 } else {
545 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
546 }
547 } else {
548 adev->pcie_wreg(adev, reg * 4, v);
81202807 549 }
bc992ba5 550
f7ee1874 551 trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
2e0cc4d4 552}
d38ceaf9 553
03f2abb0 554/**
4cc9f86f 555 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
2e0cc4d4 556 *
557 * @adev: amdgpu_device pointer
558 * @reg: mmio/rlc register
559 * @v: value to write
560 *
561 * this function is invoked only for the debugfs register access
03f2abb0 562 */
f7ee1874
HZ
563void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
564 uint32_t reg, uint32_t v)
2e0cc4d4 565{
56b53c0b 566 if (amdgpu_device_skip_hw_access(adev))
bf36b52e
AG
567 return;
568
2e0cc4d4 569 if (amdgpu_sriov_fullaccess(adev) &&
f7ee1874
HZ
570 adev->gfx.rlc.funcs &&
571 adev->gfx.rlc.funcs->is_rlcg_access_range) {
2e0cc4d4 572 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
1b2dc99e 573 return amdgpu_sriov_wreg(adev, reg, v, 0, 0);
4cc9f86f
TSD
574 } else if ((reg * 4) >= adev->rmmio_size) {
575 adev->pcie_wreg(adev, reg * 4, v);
f7ee1874
HZ
576 } else {
577 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
47ed4e1c 578 }
d38ceaf9
AD
579}
580
d38ceaf9
AD
581/**
582 * amdgpu_mm_rdoorbell - read a doorbell dword
583 *
584 * @adev: amdgpu_device pointer
585 * @index: doorbell index
586 *
587 * Returns the value in the doorbell aperture at the
588 * requested doorbell index (CIK).
589 */
590u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
591{
56b53c0b 592 if (amdgpu_device_skip_hw_access(adev))
bf36b52e
AG
593 return 0;
594
d38ceaf9
AD
595 if (index < adev->doorbell.num_doorbells) {
596 return readl(adev->doorbell.ptr + index);
597 } else {
598 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
599 return 0;
600 }
601}
602
603/**
604 * amdgpu_mm_wdoorbell - write a doorbell dword
605 *
606 * @adev: amdgpu_device pointer
607 * @index: doorbell index
608 * @v: value to write
609 *
610 * Writes @v to the doorbell aperture at the
611 * requested doorbell index (CIK).
612 */
613void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
614{
56b53c0b 615 if (amdgpu_device_skip_hw_access(adev))
bf36b52e
AG
616 return;
617
d38ceaf9
AD
618 if (index < adev->doorbell.num_doorbells) {
619 writel(v, adev->doorbell.ptr + index);
620 } else {
621 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
622 }
623}
624
832be404
KW
625/**
626 * amdgpu_mm_rdoorbell64 - read a doorbell Qword
627 *
628 * @adev: amdgpu_device pointer
629 * @index: doorbell index
630 *
631 * Returns the value in the doorbell aperture at the
632 * requested doorbell index (VEGA10+).
633 */
634u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
635{
56b53c0b 636 if (amdgpu_device_skip_hw_access(adev))
bf36b52e
AG
637 return 0;
638
832be404
KW
639 if (index < adev->doorbell.num_doorbells) {
640 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
641 } else {
642 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
643 return 0;
644 }
645}
646
647/**
648 * amdgpu_mm_wdoorbell64 - write a doorbell Qword
649 *
650 * @adev: amdgpu_device pointer
651 * @index: doorbell index
652 * @v: value to write
653 *
654 * Writes @v to the doorbell aperture at the
655 * requested doorbell index (VEGA10+).
656 */
657void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
658{
56b53c0b 659 if (amdgpu_device_skip_hw_access(adev))
bf36b52e
AG
660 return;
661
832be404
KW
662 if (index < adev->doorbell.num_doorbells) {
663 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
664 } else {
665 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
666 }
667}
668
1bba3683
HZ
669/**
670 * amdgpu_device_indirect_rreg - read an indirect register
671 *
672 * @adev: amdgpu_device pointer
673 * @pcie_index: mmio register offset
674 * @pcie_data: mmio register offset
22f453fb 675 * @reg_addr: indirect register address to read from
1bba3683
HZ
676 *
677 * Returns the value of indirect register @reg_addr
678 */
679u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
680 u32 pcie_index, u32 pcie_data,
681 u32 reg_addr)
682{
683 unsigned long flags;
684 u32 r;
685 void __iomem *pcie_index_offset;
686 void __iomem *pcie_data_offset;
687
688 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
689 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
690 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
691
692 writel(reg_addr, pcie_index_offset);
693 readl(pcie_index_offset);
694 r = readl(pcie_data_offset);
695 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
696
697 return r;
698}
699
700/**
701 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
702 *
703 * @adev: amdgpu_device pointer
704 * @pcie_index: mmio register offset
705 * @pcie_data: mmio register offset
22f453fb 706 * @reg_addr: indirect register address to read from
1bba3683
HZ
707 *
708 * Returns the value of indirect register @reg_addr
709 */
710u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
711 u32 pcie_index, u32 pcie_data,
712 u32 reg_addr)
713{
714 unsigned long flags;
715 u64 r;
716 void __iomem *pcie_index_offset;
717 void __iomem *pcie_data_offset;
718
719 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
720 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
721 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
722
723 /* read low 32 bits */
724 writel(reg_addr, pcie_index_offset);
725 readl(pcie_index_offset);
726 r = readl(pcie_data_offset);
727 /* read high 32 bits */
728 writel(reg_addr + 4, pcie_index_offset);
729 readl(pcie_index_offset);
730 r |= ((u64)readl(pcie_data_offset) << 32);
731 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
732
733 return r;
734}
735
736/**
737 * amdgpu_device_indirect_wreg - write an indirect register address
738 *
739 * @adev: amdgpu_device pointer
740 * @pcie_index: mmio register offset
741 * @pcie_data: mmio register offset
742 * @reg_addr: indirect register offset
743 * @reg_data: indirect register data
744 *
745 */
746void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
747 u32 pcie_index, u32 pcie_data,
748 u32 reg_addr, u32 reg_data)
749{
750 unsigned long flags;
751 void __iomem *pcie_index_offset;
752 void __iomem *pcie_data_offset;
753
754 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
755 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
756 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
757
758 writel(reg_addr, pcie_index_offset);
759 readl(pcie_index_offset);
760 writel(reg_data, pcie_data_offset);
761 readl(pcie_data_offset);
762 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
763}
764
765/**
766 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
767 *
768 * @adev: amdgpu_device pointer
769 * @pcie_index: mmio register offset
770 * @pcie_data: mmio register offset
771 * @reg_addr: indirect register offset
772 * @reg_data: indirect register data
773 *
774 */
775void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
776 u32 pcie_index, u32 pcie_data,
777 u32 reg_addr, u64 reg_data)
778{
779 unsigned long flags;
780 void __iomem *pcie_index_offset;
781 void __iomem *pcie_data_offset;
782
783 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
784 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
785 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
786
787 /* write low 32 bits */
788 writel(reg_addr, pcie_index_offset);
789 readl(pcie_index_offset);
790 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
791 readl(pcie_data_offset);
792 /* write high 32 bits */
793 writel(reg_addr + 4, pcie_index_offset);
794 readl(pcie_index_offset);
795 writel((u32)(reg_data >> 32), pcie_data_offset);
796 readl(pcie_data_offset);
797 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
798}
799
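/*
 * Illustrative sketch (callback wiring assumed, not taken from this file):
 * soc code typically wraps these helpers with the asic-specific index/data
 * register pair and installs the wrapper as the pcie_rreg/pcie_wreg callbacks
 * used by amdgpu_device_rreg()/amdgpu_device_wreg() above:
 *
 *	static u32 example_pcie_rreg(struct amdgpu_device *adev, u32 reg)
 *	{
 *		u32 index = adev->nbio.funcs->get_pcie_index_offset(adev);
 *		u32 data = adev->nbio.funcs->get_pcie_data_offset(adev);
 *
 *		return amdgpu_device_indirect_rreg(adev, index, data, reg);
 *	}
 */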
d38ceaf9
AD
800/**
801 * amdgpu_invalid_rreg - dummy reg read function
802 *
982a820b 803 * @adev: amdgpu_device pointer
d38ceaf9
AD
804 * @reg: offset of register
805 *
806 * Dummy register read function. Used for register blocks
807 * that certain asics don't have (all asics).
808 * Returns the value in the register.
809 */
810static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
811{
812 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
813 BUG();
814 return 0;
815}
816
817/**
818 * amdgpu_invalid_wreg - dummy reg write function
819 *
982a820b 820 * @adev: amdgpu_device pointer
d38ceaf9
AD
821 * @reg: offset of register
822 * @v: value to write to the register
823 *
824 * Dummy register write function. Used for register blocks
825 * that certain asics don't have (all asics).
826 */
827static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
828{
829 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
830 reg, v);
831 BUG();
832}
833
4fa1c6a6
TZ
834/**
835 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
836 *
982a820b 837 * @adev: amdgpu_device pointer
4fa1c6a6
TZ
838 * @reg: offset of register
839 *
840 * Dummy register read function. Used for register blocks
841 * that certain asics don't have (all asics).
842 * Returns the value in the register.
843 */
844static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
845{
846 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
847 BUG();
848 return 0;
849}
850
851/**
852 * amdgpu_invalid_wreg64 - dummy reg write function
853 *
982a820b 854 * @adev: amdgpu_device pointer
4fa1c6a6
TZ
855 * @reg: offset of register
856 * @v: value to write to the register
857 *
858 * Dummy register write function. Used for register blocks
859 * that certain asics don't have (all asics).
860 */
861static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
862{
863 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
864 reg, v);
865 BUG();
866}
867
d38ceaf9
AD
868/**
869 * amdgpu_block_invalid_rreg - dummy reg read function
870 *
982a820b 871 * @adev: amdgpu_device pointer
d38ceaf9
AD
872 * @block: offset of instance
873 * @reg: offset of register
874 *
875 * Dummy register read function. Used for register blocks
876 * that certain asics don't have (all asics).
877 * Returns the value in the register.
878 */
879static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
880 uint32_t block, uint32_t reg)
881{
882 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
883 reg, block);
884 BUG();
885 return 0;
886}
887
888/**
889 * amdgpu_block_invalid_wreg - dummy reg write function
890 *
982a820b 891 * @adev: amdgpu_device pointer
d38ceaf9
AD
892 * @block: offset of instance
893 * @reg: offset of register
894 * @v: value to write to the register
895 *
896 * Dummy register write function. Used for register blocks
897 * that certain asics don't have (all asics).
898 */
899static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
900 uint32_t block,
901 uint32_t reg, uint32_t v)
902{
903 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
904 reg, block, v);
905 BUG();
906}
907
4d2997ab
AD
908/**
909 * amdgpu_device_asic_init - Wrapper for atom asic_init
910 *
982a820b 911 * @adev: amdgpu_device pointer
4d2997ab
AD
912 *
913 * Does any asic specific work and then calls atom asic init.
914 */
915static int amdgpu_device_asic_init(struct amdgpu_device *adev)
916{
917 amdgpu_asic_pre_asic_init(adev);
918
85d1bcc6
HZ
919 if (adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0))
920 return amdgpu_atomfirmware_asic_init(adev, true);
921 else
922 return amdgpu_atom_asic_init(adev->mode_info.atom_context);
4d2997ab
AD
923}
924
e3ecdffa
AD
925/**
926 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
927 *
982a820b 928 * @adev: amdgpu_device pointer
e3ecdffa
AD
929 *
930 * Allocates a scratch page of VRAM for use by various things in the
931 * driver.
932 */
06ec9070 933static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
d38ceaf9 934{
a4a02777
CK
935 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
936 PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
937 &adev->vram_scratch.robj,
938 &adev->vram_scratch.gpu_addr,
939 (void **)&adev->vram_scratch.ptr);
d38ceaf9
AD
940}
941
e3ecdffa
AD
942/**
943 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
944 *
982a820b 945 * @adev: amdgpu_device pointer
e3ecdffa
AD
946 *
947 * Frees the VRAM scratch page.
948 */
06ec9070 949static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
d38ceaf9 950{
078af1a3 951 amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
d38ceaf9
AD
952}
953
954/**
9c3f2b54 955 * amdgpu_device_program_register_sequence - program an array of registers.
d38ceaf9
AD
956 *
957 * @adev: amdgpu_device pointer
958 * @registers: pointer to the register array
959 * @array_size: size of the register array
960 *
961 * Programs an array of registers with AND and OR masks.
962 * This is a helper for setting golden registers.
963 */
9c3f2b54
AD
964void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
965 const u32 *registers,
966 const u32 array_size)
d38ceaf9
AD
967{
968 u32 tmp, reg, and_mask, or_mask;
969 int i;
970
971 if (array_size % 3)
972 return;
973
974 for (i = 0; i < array_size; i +=3) {
975 reg = registers[i + 0];
976 and_mask = registers[i + 1];
977 or_mask = registers[i + 2];
978
979 if (and_mask == 0xffffffff) {
980 tmp = or_mask;
981 } else {
982 tmp = RREG32(reg);
983 tmp &= ~and_mask;
e0d07657
HZ
984 if (adev->family >= AMDGPU_FAMILY_AI)
985 tmp |= (or_mask & and_mask);
986 else
987 tmp |= or_mask;
d38ceaf9
AD
988 }
989 WREG32(reg, tmp);
990 }
991}
992
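/*
 * Illustrative sketch (register name hypothetical): golden register lists are
 * laid out as { offset, and_mask, or_mask } triples, so a single entry that
 * sets bit 0 of a register while preserving all other bits would be:
 *
 *	static const u32 example_golden_settings[] = {
 *		mmEXAMPLE_CNTL, 0x00000001, 0x00000001,
 *	};
 *
 *	amdgpu_device_program_register_sequence(adev, example_golden_settings,
 *						ARRAY_SIZE(example_golden_settings));
 */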
e3ecdffa
AD
993/**
994 * amdgpu_device_pci_config_reset - reset the GPU
995 *
996 * @adev: amdgpu_device pointer
997 *
998 * Resets the GPU using the pci config reset sequence.
999 * Only applicable to asics prior to vega10.
1000 */
8111c387 1001void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
d38ceaf9
AD
1002{
1003 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
1004}
1005
af484df8
AD
1006/**
1007 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
1008 *
1009 * @adev: amdgpu_device pointer
1010 *
1011 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
1012 */
1013int amdgpu_device_pci_reset(struct amdgpu_device *adev)
1014{
1015 return pci_reset_function(adev->pdev);
1016}
1017
d38ceaf9
AD
1018/*
1019 * GPU doorbell aperture helpers function.
1020 */
1021/**
06ec9070 1022 * amdgpu_device_doorbell_init - Init doorbell driver information.
d38ceaf9
AD
1023 *
1024 * @adev: amdgpu_device pointer
1025 *
1026 * Init doorbell driver information (CIK)
1027 * Returns 0 on success, error on failure.
1028 */
06ec9070 1029static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
d38ceaf9 1030{
6585661d 1031
705e519e
CK
1032 /* No doorbell on SI hardware generation */
1033 if (adev->asic_type < CHIP_BONAIRE) {
1034 adev->doorbell.base = 0;
1035 adev->doorbell.size = 0;
1036 adev->doorbell.num_doorbells = 0;
1037 adev->doorbell.ptr = NULL;
1038 return 0;
1039 }
1040
d6895ad3
CK
1041 if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
1042 return -EINVAL;
1043
22357775
AD
1044 amdgpu_asic_init_doorbell_index(adev);
1045
d38ceaf9
AD
1046 /* doorbell bar mapping */
1047 adev->doorbell.base = pci_resource_start(adev->pdev, 2);
1048 adev->doorbell.size = pci_resource_len(adev->pdev, 2);
1049
de33a329
JX
1050 if (adev->enable_mes) {
1051 adev->doorbell.num_doorbells =
1052 adev->doorbell.size / sizeof(u32);
1053 } else {
1054 adev->doorbell.num_doorbells =
1055 min_t(u32, adev->doorbell.size / sizeof(u32),
1056 adev->doorbell_index.max_assignment+1);
1057 if (adev->doorbell.num_doorbells == 0)
1058 return -EINVAL;
1059
1060 /* For Vega, reserve and map two pages on the doorbell BAR since the SDMA
1061 * paging queue doorbell uses the second page. The
1062 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
1063 * doorbells are in the first page. So with the paging queue enabled,
1064 * the max num_doorbells should be incremented by 1 page (0x400 in dwords).
1065 */
1066 if (adev->asic_type >= CHIP_VEGA10)
1067 adev->doorbell.num_doorbells += 0x400;
1068 }
ec3db8a6 1069
8972e5d2
CK
1070 adev->doorbell.ptr = ioremap(adev->doorbell.base,
1071 adev->doorbell.num_doorbells *
1072 sizeof(u32));
1073 if (adev->doorbell.ptr == NULL)
d38ceaf9 1074 return -ENOMEM;
d38ceaf9
AD
1075
1076 return 0;
1077}
1078
1079/**
06ec9070 1080 * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
d38ceaf9
AD
1081 *
1082 * @adev: amdgpu_device pointer
1083 *
1084 * Tear down doorbell driver information (CIK)
1085 */
06ec9070 1086static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
d38ceaf9
AD
1087{
1088 iounmap(adev->doorbell.ptr);
1089 adev->doorbell.ptr = NULL;
1090}
1091
22cb0164 1092
d38ceaf9
AD
1093
1094/*
06ec9070 1095 * amdgpu_device_wb_*()
455a7bc2 1096 * Writeback is the method by which the GPU updates special pages in memory
ea81a173 1097 * with the status of certain GPU events (fences, ring pointers, etc.).
d38ceaf9
AD
1098 */
1099
1100/**
06ec9070 1101 * amdgpu_device_wb_fini - Disable Writeback and free memory
1102 *
1103 * @adev: amdgpu_device pointer
1104 *
1105 * Disables Writeback and frees the Writeback memory (all asics).
1106 * Used at driver shutdown.
1107 */
06ec9070 1108static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
d38ceaf9
AD
1109{
1110 if (adev->wb.wb_obj) {
a76ed485
AD
1111 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
1112 &adev->wb.gpu_addr,
1113 (void **)&adev->wb.wb);
d38ceaf9
AD
1114 adev->wb.wb_obj = NULL;
1115 }
1116}
1117
1118/**
03f2abb0 1119 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
1120 *
1121 * @adev: amdgpu_device pointer
1122 *
455a7bc2 1123 * Initializes writeback and allocates writeback memory (all asics).
d38ceaf9
AD
1124 * Used at driver startup.
1126 * Returns 0 on success or a negative error code on failure.
1126 */
06ec9070 1127static int amdgpu_device_wb_init(struct amdgpu_device *adev)
d38ceaf9
AD
1128{
1129 int r;
1130
1131 if (adev->wb.wb_obj == NULL) {
97407b63
AD
1132 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1133 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
a76ed485
AD
1134 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1135 &adev->wb.wb_obj, &adev->wb.gpu_addr,
1136 (void **)&adev->wb.wb);
d38ceaf9
AD
1137 if (r) {
1138 dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1139 return r;
1140 }
d38ceaf9
AD
1141
1142 adev->wb.num_wb = AMDGPU_MAX_WB;
1143 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1144
1145 /* clear wb memory */
73469585 1146 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
d38ceaf9
AD
1147 }
1148
1149 return 0;
1150}
1151
1152/**
131b4b36 1153 * amdgpu_device_wb_get - Allocate a wb entry
d38ceaf9
AD
1154 *
1155 * @adev: amdgpu_device pointer
1156 * @wb: wb index
1157 *
1158 * Allocate a wb slot for use by the driver (all asics).
1159 * Returns 0 on success or -EINVAL on failure.
1160 */
131b4b36 1161int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
d38ceaf9
AD
1162{
1163 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
d38ceaf9 1164
97407b63 1165 if (offset < adev->wb.num_wb) {
7014285a 1166 __set_bit(offset, adev->wb.used);
63ae07ca 1167 *wb = offset << 3; /* convert to dw offset */
0915fdbc
ML
1168 return 0;
1169 } else {
1170 return -EINVAL;
1171 }
1172}
1173
d38ceaf9 1174/**
131b4b36 1175 * amdgpu_device_wb_free - Free a wb entry
d38ceaf9
AD
1176 *
1177 * @adev: amdgpu_device pointer
1178 * @wb: wb index
1179 *
1180 * Free a wb slot allocated for use by the driver (all asics)
1181 */
131b4b36 1182void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
d38ceaf9 1183{
73469585 1184 wb >>= 3;
d38ceaf9 1185 if (wb < adev->wb.num_wb)
73469585 1186 __clear_bit(wb, adev->wb.used);
d38ceaf9
AD
1187}
1188
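/*
 * Illustrative usage sketch (not part of this file): ring code typically
 * pairs amdgpu_device_wb_get() with amdgpu_device_wb_free(); the returned
 * index is already a dword offset into the writeback page:
 *
 *	u32 wb;
 *
 *	if (amdgpu_device_wb_get(adev, &wb))
 *		return -EINVAL;
 *
 *	cpu_ptr  = &adev->wb.wb[wb];
 *	gpu_addr = adev->wb.gpu_addr + (wb * 4);
 *	...
 *	amdgpu_device_wb_free(adev, wb);
 */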
d6895ad3
CK
1189/**
1190 * amdgpu_device_resize_fb_bar - try to resize FB BAR
1191 *
1192 * @adev: amdgpu_device pointer
1193 *
1194 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1195 * to fail, but if any of the BARs is not accessible after the resize we abort
1196 * driver loading by returning -ENODEV.
1197 */
1198int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1199{
453f617a 1200 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
31b8adab
CK
1201 struct pci_bus *root;
1202 struct resource *res;
1203 unsigned i;
d6895ad3
CK
1204 u16 cmd;
1205 int r;
1206
0c03b912 1207 /* Bypass for VF */
1208 if (amdgpu_sriov_vf(adev))
1209 return 0;
1210
b7221f2b
AD
1211 /* skip if the bios has already enabled large BAR */
1212 if (adev->gmc.real_vram_size &&
1213 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1214 return 0;
1215
31b8adab
CK
1216 /* Check if the root BUS has 64bit memory resources */
1217 root = adev->pdev->bus;
1218 while (root->parent)
1219 root = root->parent;
1220
1221 pci_bus_for_each_resource(root, res, i) {
0ebb7c54 1222 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
31b8adab
CK
1223 res->start > 0x100000000ull)
1224 break;
1225 }
1226
1227 /* Trying to resize is pointless without a root hub window above 4GB */
1228 if (!res)
1229 return 0;
1230
453f617a
ND
1231 /* Limit the BAR size to what is available */
1232 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
1233 rbar_size);
1234
d6895ad3
CK
1235 /* Disable memory decoding while we change the BAR addresses and size */
1236 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1237 pci_write_config_word(adev->pdev, PCI_COMMAND,
1238 cmd & ~PCI_COMMAND_MEMORY);
1239
1240 /* Free the VRAM and doorbell BAR, we most likely need to move both. */
06ec9070 1241 amdgpu_device_doorbell_fini(adev);
d6895ad3
CK
1242 if (adev->asic_type >= CHIP_BONAIRE)
1243 pci_release_resource(adev->pdev, 2);
1244
1245 pci_release_resource(adev->pdev, 0);
1246
1247 r = pci_resize_resource(adev->pdev, 0, rbar_size);
1248 if (r == -ENOSPC)
1249 DRM_INFO("Not enough PCI address space for a large BAR.");
1250 else if (r && r != -ENOTSUPP)
1251 DRM_ERROR("Problem resizing BAR0 (%d).", r);
1252
1253 pci_assign_unassigned_bus_resources(adev->pdev->bus);
1254
1255 /* When the doorbell or fb BAR isn't available we have no chance of
1256 * using the device.
1257 */
06ec9070 1258 r = amdgpu_device_doorbell_init(adev);
d6895ad3
CK
1259 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1260 return -ENODEV;
1261
1262 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1263
1264 return 0;
1265}
a05502e5 1266
d38ceaf9
AD
1267/*
1268 * GPU helpers function.
1269 */
1270/**
39c640c0 1271 * amdgpu_device_need_post - check if the hw need post or not
d38ceaf9
AD
1272 *
1273 * @adev: amdgpu_device pointer
1274 *
1275 * Check if the asic has been initialized (all asics) at driver startup,
1276 * or if post is needed because a hw reset was performed.
1277 * Returns true if post is needed, false if not.
d38ceaf9 1278 */
39c640c0 1279bool amdgpu_device_need_post(struct amdgpu_device *adev)
d38ceaf9
AD
1280{
1281 uint32_t reg;
1282
bec86378
ML
1283 if (amdgpu_sriov_vf(adev))
1284 return false;
1285
1286 if (amdgpu_passthrough(adev)) {
1da2c326
ML
1287 /* for FIJI: In the whole-GPU pass-through virtualization case, after a VM reboot
1288 * some old SMC firmware still needs the driver to do a vPost, otherwise the GPU hangs.
1289 * SMC firmware versions above 22.15 don't have this flaw, so we force
1290 * vPost to be executed for SMC versions below 22.15.
bec86378
ML
1291 */
1292 if (adev->asic_type == CHIP_FIJI) {
1293 int err;
1294 uint32_t fw_ver;
1295 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1296 /* force vPost if an error occurred */
1297 if (err)
1298 return true;
1299
1300 fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1da2c326
ML
1301 if (fw_ver < 0x00160e00)
1302 return true;
bec86378 1303 }
bec86378 1304 }
91fe77eb 1305
e3c1b071 1306 /* Don't post if we need to reset whole hive on init */
1307 if (adev->gmc.xgmi.pending_reset)
1308 return false;
1309
91fe77eb 1310 if (adev->has_hw_reset) {
1311 adev->has_hw_reset = false;
1312 return true;
1313 }
1314
1315 /* bios scratch used on CIK+ */
1316 if (adev->asic_type >= CHIP_BONAIRE)
1317 return amdgpu_atombios_scratch_need_asic_init(adev);
1318
1319 /* check MEM_SIZE for older asics */
1320 reg = amdgpu_asic_get_config_memsize(adev);
1321
1322 if ((reg != 0) && (reg != 0xffffffff))
1323 return false;
1324
1325 return true;
bec86378
ML
1326}
1327
0ab5d711
ML
1328/**
1329 * amdgpu_device_should_use_aspm - check if the device should program ASPM
1330 *
1331 * @adev: amdgpu_device pointer
1332 *
1333 * Confirm whether the module parameter and pcie bridge agree that ASPM should
1334 * be set for this device.
1335 *
1336 * Returns true if it should be used or false if not.
1337 */
1338bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
1339{
1340 switch (amdgpu_aspm) {
1341 case -1:
1342 break;
1343 case 0:
1344 return false;
1345 case 1:
1346 return true;
1347 default:
1348 return false;
1349 }
1350 return pcie_aspm_enabled(adev->pdev);
1351}
1352
d38ceaf9
AD
1353/* if we get transitioned to only one device, take VGA back */
1354/**
06ec9070 1355 * amdgpu_device_vga_set_decode - enable/disable vga decode
d38ceaf9 1356 *
bf44e8ce 1357 * @pdev: PCI device pointer
d38ceaf9
AD
1358 * @state: enable/disable vga decode
1359 *
1360 * Enable/disable vga decode (all asics).
1361 * Returns VGA resource flags.
1362 */
bf44e8ce
CH
1363static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
1364 bool state)
d38ceaf9 1365{
bf44e8ce 1366 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));
d38ceaf9
AD
1367 amdgpu_asic_set_vga_state(adev, state);
1368 if (state)
1369 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1370 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1371 else
1372 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1373}
1374
1375/**
1376 * amdgpu_device_check_block_size - validate the vm block size
1377 *
1378 * @adev: amdgpu_device pointer
1379 *
1380 * Validates the vm block size specified via module parameter.
1381 * The vm block size defines the number of bits in the page table versus the page directory,
1382 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1383 * page table and the remaining bits are in the page directory.
1384 */
06ec9070 1385static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
a1adf8be
CZ
1386{
1387 /* defines number of bits in page table versus page directory,
1388 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1389 * page table and the remaining bits are in the page directory */
bab4fee7
JZ
1390 if (amdgpu_vm_block_size == -1)
1391 return;
a1adf8be 1392
bab4fee7 1393 if (amdgpu_vm_block_size < 9) {
a1adf8be
CZ
1394 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1395 amdgpu_vm_block_size);
97489129 1396 amdgpu_vm_block_size = -1;
a1adf8be 1397 }
a1adf8be
CZ
1398}
1399
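/*
 * Worked example (assuming 4KB pages): the low 12 bits of an address are the
 * in-page offset, and amdgpu_vm_block_size = 9 leaves 9 bits per page-table
 * level, i.e. 512 entries per page table, with the remaining virtual address
 * bits handled by the page-directory levels.
 */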
e3ecdffa
AD
1400/**
1401 * amdgpu_device_check_vm_size - validate the vm size
1402 *
1403 * @adev: amdgpu_device pointer
1404 *
1405 * Validates the vm size in GB specified via module parameter.
1406 * The VM size is the size of the GPU virtual memory space in GB.
1407 */
06ec9070 1408static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
83ca145d 1409{
64dab074
AD
1410 /* no need to check the default value */
1411 if (amdgpu_vm_size == -1)
1412 return;
1413
83ca145d
ZJ
1414 if (amdgpu_vm_size < 1) {
1415 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1416 amdgpu_vm_size);
f3368128 1417 amdgpu_vm_size = -1;
83ca145d 1418 }
83ca145d
ZJ
1419}
1420
7951e376
RZ
1421static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1422{
1423 struct sysinfo si;
a9d4fe2f 1424 bool is_os_64 = (sizeof(void *) == 8);
7951e376
RZ
1425 uint64_t total_memory;
1426 uint64_t dram_size_seven_GB = 0x1B8000000;
1427 uint64_t dram_size_three_GB = 0xB8000000;
1428
1429 if (amdgpu_smu_memory_pool_size == 0)
1430 return;
1431
1432 if (!is_os_64) {
1433 DRM_WARN("Not 64-bit OS, feature not supported\n");
1434 goto def_value;
1435 }
1436 si_meminfo(&si);
1437 total_memory = (uint64_t)si.totalram * si.mem_unit;
1438
1439 if ((amdgpu_smu_memory_pool_size == 1) ||
1440 (amdgpu_smu_memory_pool_size == 2)) {
1441 if (total_memory < dram_size_three_GB)
1442 goto def_value1;
1443 } else if ((amdgpu_smu_memory_pool_size == 4) ||
1444 (amdgpu_smu_memory_pool_size == 8)) {
1445 if (total_memory < dram_size_seven_GB)
1446 goto def_value1;
1447 } else {
1448 DRM_WARN("Smu memory pool size not supported\n");
1449 goto def_value;
1450 }
1451 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1452
1453 return;
1454
1455def_value1:
1456 DRM_WARN("Not enough system memory\n");
1457def_value:
1458 adev->pm.smu_prv_buffer_size = 0;
1459}
1460
9f6a7857
HR
1461static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
1462{
1463 if (!(adev->flags & AMD_IS_APU) ||
1464 adev->asic_type < CHIP_RAVEN)
1465 return 0;
1466
1467 switch (adev->asic_type) {
1468 case CHIP_RAVEN:
1469 if (adev->pdev->device == 0x15dd)
1470 adev->apu_flags |= AMD_APU_IS_RAVEN;
1471 if (adev->pdev->device == 0x15d8)
1472 adev->apu_flags |= AMD_APU_IS_PICASSO;
1473 break;
1474 case CHIP_RENOIR:
1475 if ((adev->pdev->device == 0x1636) ||
1476 (adev->pdev->device == 0x164c))
1477 adev->apu_flags |= AMD_APU_IS_RENOIR;
1478 else
1479 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
1480 break;
1481 case CHIP_VANGOGH:
1482 adev->apu_flags |= AMD_APU_IS_VANGOGH;
1483 break;
1484 case CHIP_YELLOW_CARP:
1485 break;
d0f56dc2 1486 case CHIP_CYAN_SKILLFISH:
dfcc3e8c
AD
1487 if ((adev->pdev->device == 0x13FE) ||
1488 (adev->pdev->device == 0x143F))
d0f56dc2
TZ
1489 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
1490 break;
9f6a7857 1491 default:
4eaf21b7 1492 break;
9f6a7857
HR
1493 }
1494
1495 return 0;
1496}
1497
d38ceaf9 1498/**
06ec9070 1499 * amdgpu_device_check_arguments - validate module params
d38ceaf9
AD
1500 *
1501 * @adev: amdgpu_device pointer
1502 *
1503 * Validates certain module parameters and updates
1504 * the associated values used by the driver (all asics).
1505 */
912dfc84 1506static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
d38ceaf9 1507{
5b011235
CZ
1508 if (amdgpu_sched_jobs < 4) {
1509 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1510 amdgpu_sched_jobs);
1511 amdgpu_sched_jobs = 4;
76117507 1512 } else if (!is_power_of_2(amdgpu_sched_jobs)){
5b011235
CZ
1513 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1514 amdgpu_sched_jobs);
1515 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1516 }
d38ceaf9 1517
83e74db6 1518 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
f9321cc4
CK
1519 /* gart size must be greater or equal to 32M */
1520 dev_warn(adev->dev, "gart size (%d) too small\n",
1521 amdgpu_gart_size);
83e74db6 1522 amdgpu_gart_size = -1;
d38ceaf9
AD
1523 }
1524
36d38372 1525 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
c4e1a13a 1526 /* gtt size must be greater or equal to 32M */
36d38372
CK
1527 dev_warn(adev->dev, "gtt size (%d) too small\n",
1528 amdgpu_gtt_size);
1529 amdgpu_gtt_size = -1;
d38ceaf9
AD
1530 }
1531
d07f14be
RH
1532 /* valid range is between 4 and 9 inclusive */
1533 if (amdgpu_vm_fragment_size != -1 &&
1534 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1535 dev_warn(adev->dev, "valid range is between 4 and 9\n");
1536 amdgpu_vm_fragment_size = -1;
1537 }
1538
5d5bd5e3
KW
1539 if (amdgpu_sched_hw_submission < 2) {
1540 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1541 amdgpu_sched_hw_submission);
1542 amdgpu_sched_hw_submission = 2;
1543 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1544 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1545 amdgpu_sched_hw_submission);
1546 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1547 }
1548
2656fd23
AG
1549 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
1550 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
1551 amdgpu_reset_method = -1;
1552 }
1553
7951e376
RZ
1554 amdgpu_device_check_smu_prv_buffer_size(adev);
1555
06ec9070 1556 amdgpu_device_check_vm_size(adev);
d38ceaf9 1557
06ec9070 1558 amdgpu_device_check_block_size(adev);
6a7f76e7 1559
19aede77 1560 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
912dfc84 1561
e3c00faa 1562 return 0;
d38ceaf9
AD
1563}
1564
1565/**
1566 * amdgpu_switcheroo_set_state - set switcheroo state
1567 *
1568 * @pdev: pci dev pointer
1694467b 1569 * @state: vga_switcheroo state
d38ceaf9 1570 *
12024b17 1571 * Callback for the switcheroo driver. Suspends or resumes
1572 * the asic before or after it is powered up using ACPI methods.
1573 */
8aba21b7
LT
1574static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1575 enum vga_switcheroo_state state)
d38ceaf9
AD
1576{
1577 struct drm_device *dev = pci_get_drvdata(pdev);
de185019 1578 int r;
d38ceaf9 1579
b98c6299 1580 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
d38ceaf9
AD
1581 return;
1582
1583 if (state == VGA_SWITCHEROO_ON) {
dd4fa6c1 1584 pr_info("switched on\n");
d38ceaf9
AD
1585 /* don't suspend or resume card normally */
1586 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1587
8f66090b
TZ
1588 pci_set_power_state(pdev, PCI_D0);
1589 amdgpu_device_load_pci_state(pdev);
1590 r = pci_enable_device(pdev);
de185019
AD
1591 if (r)
1592 DRM_WARN("pci_enable_device failed (%d)\n", r);
1593 amdgpu_device_resume(dev, true);
d38ceaf9 1594
d38ceaf9 1595 dev->switch_power_state = DRM_SWITCH_POWER_ON;
d38ceaf9 1596 } else {
dd4fa6c1 1597 pr_info("switched off\n");
d38ceaf9 1598 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
de185019 1599 amdgpu_device_suspend(dev, true);
8f66090b 1600 amdgpu_device_cache_pci_state(pdev);
de185019 1601 /* Shut down the device */
8f66090b
TZ
1602 pci_disable_device(pdev);
1603 pci_set_power_state(pdev, PCI_D3cold);
d38ceaf9
AD
1604 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1605 }
1606}
1607
1608/**
1609 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1610 *
1611 * @pdev: pci dev pointer
1612 *
1613 * Callback for the switcheroo driver. Check if the switcheroo
1614 * state can be changed.
1615 * Returns true if the state can be changed, false if not.
1616 */
1617static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1618{
1619 struct drm_device *dev = pci_get_drvdata(pdev);
1620
1621 /*
1622 * FIXME: open_count is protected by drm_global_mutex but that would lead to
1623 * locking inversion with the driver load path. And the access here is
1624 * completely racy anyway. So don't bother with locking for now.
1625 */
7e13ad89 1626 return atomic_read(&dev->open_count) == 0;
d38ceaf9
AD
1627}
1628
1629static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1630 .set_gpu_state = amdgpu_switcheroo_set_state,
1631 .reprobe = NULL,
1632 .can_switch = amdgpu_switcheroo_can_switch,
1633};
1634
1635/**
1636 * amdgpu_device_ip_set_clockgating_state - set the CG state
1637 *
87e3f136 1638 * @dev: amdgpu_device pointer
1639 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1640 * @state: clockgating state (gate or ungate)
1641 *
1642 * Sets the requested clockgating state for all instances of
1643 * the hardware IP specified.
1644 * Returns the error code from the last instance.
1645 */
43fa561f 1646int amdgpu_device_ip_set_clockgating_state(void *dev,
2990a1fc
AD
1647 enum amd_ip_block_type block_type,
1648 enum amd_clockgating_state state)
d38ceaf9 1649{
43fa561f 1650 struct amdgpu_device *adev = dev;
d38ceaf9
AD
1651 int i, r = 0;
1652
1653 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 1654 if (!adev->ip_blocks[i].status.valid)
9ecbe7f5 1655 continue;
c722865a
RZ
1656 if (adev->ip_blocks[i].version->type != block_type)
1657 continue;
1658 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1659 continue;
1660 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1661 (void *)adev, state);
1662 if (r)
1663 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1664 adev->ip_blocks[i].version->funcs->name, r);
d38ceaf9
AD
1665 }
1666 return r;
1667}
1668
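/*
 * Illustrative usage sketch (block type chosen for the example): a caller
 * that wants to gate clocks for all GFX instances would use
 *
 *	amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_GFX,
 *					       AMD_CG_STATE_GATE);
 */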
e3ecdffa
AD
1669/**
1670 * amdgpu_device_ip_set_powergating_state - set the PG state
1671 *
87e3f136 1672 * @dev: amdgpu_device pointer
e3ecdffa
AD
1673 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1674 * @state: powergating state (gate or ungate)
1675 *
1676 * Sets the requested powergating state for all instances of
1677 * the hardware IP specified.
1678 * Returns the error code from the last instance.
1679 */
43fa561f 1680int amdgpu_device_ip_set_powergating_state(void *dev,
2990a1fc
AD
1681 enum amd_ip_block_type block_type,
1682 enum amd_powergating_state state)
d38ceaf9 1683{
43fa561f 1684 struct amdgpu_device *adev = dev;
d38ceaf9
AD
1685 int i, r = 0;
1686
1687 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 1688 if (!adev->ip_blocks[i].status.valid)
9ecbe7f5 1689 continue;
c722865a
RZ
1690 if (adev->ip_blocks[i].version->type != block_type)
1691 continue;
1692 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1693 continue;
1694 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1695 (void *)adev, state);
1696 if (r)
1697 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1698 adev->ip_blocks[i].version->funcs->name, r);
d38ceaf9
AD
1699 }
1700 return r;
1701}
1702
e3ecdffa
AD
1703/**
1704 * amdgpu_device_ip_get_clockgating_state - get the CG state
1705 *
1706 * @adev: amdgpu_device pointer
1707 * @flags: clockgating feature flags
1708 *
1709 * Walks the list of IPs on the device and updates the clockgating
1710 * flags for each IP.
1711 * Updates @flags with the feature flags for each hardware IP where
1712 * clockgating is enabled.
1713 */
2990a1fc 1714void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
25faeddc 1715 u64 *flags)
6cb2d4e4
HR
1716{
1717 int i;
1718
1719 for (i = 0; i < adev->num_ip_blocks; i++) {
1720 if (!adev->ip_blocks[i].status.valid)
1721 continue;
1722 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1723 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1724 }
1725}
1726
e3ecdffa
AD
1727/**
1728 * amdgpu_device_ip_wait_for_idle - wait for idle
1729 *
1730 * @adev: amdgpu_device pointer
1731 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1732 *
1733 * Waits for the requested hardware IP to be idle.
1734 * Returns 0 for success or a negative error code on failure.
1735 */
2990a1fc
AD
1736int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1737 enum amd_ip_block_type block_type)
5dbbb60b
AD
1738{
1739 int i, r;
1740
1741 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 1742 if (!adev->ip_blocks[i].status.valid)
9ecbe7f5 1743 continue;
a1255107
AD
1744 if (adev->ip_blocks[i].version->type == block_type) {
1745 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
5dbbb60b
AD
1746 if (r)
1747 return r;
1748 break;
1749 }
1750 }
1751 return 0;
1752
1753}
1754
e3ecdffa
AD
1755/**
1756 * amdgpu_device_ip_is_idle - is the hardware IP idle
1757 *
1758 * @adev: amdgpu_device pointer
1759 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1760 *
1761 * Check if the hardware IP is idle or not.
1762 * Returns true if the IP is idle, false if not.
1763 */
2990a1fc
AD
1764bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1765 enum amd_ip_block_type block_type)
5dbbb60b
AD
1766{
1767 int i;
1768
1769 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 1770 if (!adev->ip_blocks[i].status.valid)
9ecbe7f5 1771 continue;
a1255107
AD
1772 if (adev->ip_blocks[i].version->type == block_type)
1773 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
5dbbb60b
AD
1774 }
1775 return true;
1776
1777}
1778
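/*
 * Illustrative sketch (hypothetical caller): before reprogramming an
 * engine, a caller could combine the two helpers above, e.g. for SDMA:
 *
 *	if (!amdgpu_device_ip_is_idle(adev, AMD_IP_BLOCK_TYPE_SDMA)) {
 *		r = amdgpu_device_ip_wait_for_idle(adev,
 *						   AMD_IP_BLOCK_TYPE_SDMA);
 *		if (r)
 *			return r;
 *	}
 */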
e3ecdffa
AD
1779/**
1780 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1781 *
1782 * @adev: amdgpu_device pointer
87e3f136 1783 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
e3ecdffa
AD
1784 *
1785 * Returns a pointer to the hardware IP block structure
1786 * if it exists for the asic, otherwise NULL.
1787 */
2990a1fc
AD
1788struct amdgpu_ip_block *
1789amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1790 enum amd_ip_block_type type)
d38ceaf9
AD
1791{
1792 int i;
1793
1794 for (i = 0; i < adev->num_ip_blocks; i++)
a1255107 1795 if (adev->ip_blocks[i].version->type == type)
d38ceaf9
AD
1796 return &adev->ip_blocks[i];
1797
1798 return NULL;
1799}
1800
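/*
 * Illustrative sketch: a hypothetical caller that needs the GMC block's
 * version information could look it up with the helper above:
 *
 *	struct amdgpu_ip_block *ip =
 *		amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GMC);
 *
 *	if (ip)
 *		DRM_INFO("GMC v%d.%d\n", ip->version->major,
 *			 ip->version->minor);
 */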
1801/**
2990a1fc 1802 * amdgpu_device_ip_block_version_cmp
d38ceaf9
AD
1803 *
1804 * @adev: amdgpu_device pointer
5fc3aeeb 1805 * @type: enum amd_ip_block_type
d38ceaf9
AD
1806 * @major: major version
1807 * @minor: minor version
1808 *
1809 * Returns 0 if the installed IP block version is equal or greater,
1810 * 1 if it is smaller or the ip_block doesn't exist.
1811 */
2990a1fc
AD
1812int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1813 enum amd_ip_block_type type,
1814 u32 major, u32 minor)
d38ceaf9 1815{
2990a1fc 1816 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
d38ceaf9 1817
a1255107
AD
1818 if (ip_block && ((ip_block->version->major > major) ||
1819 ((ip_block->version->major == major) &&
1820 (ip_block->version->minor >= minor))))
d38ceaf9
AD
1821 return 0;
1822
1823 return 1;
1824}
1825
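/*
 * Illustrative sketch: since the helper above returns 0 when the installed
 * block is at least the requested version, a hypothetical caller could gate
 * a feature on, say, GFX 8.1 or newer:
 *
 *	if (!amdgpu_device_ip_block_version_cmp(adev,
 *						AMD_IP_BLOCK_TYPE_GFX, 8, 1))
 *		enable_new_feature(adev);	// hypothetical helper
 */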
a1255107 1826/**
2990a1fc 1827 * amdgpu_device_ip_block_add
a1255107
AD
1828 *
1829 * @adev: amdgpu_device pointer
1830 * @ip_block_version: pointer to the IP to add
1831 *
1832 * Adds the IP block driver information to the collection of IPs
1833 * on the asic.
1834 */
2990a1fc
AD
1835int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1836 const struct amdgpu_ip_block_version *ip_block_version)
a1255107
AD
1837{
1838 if (!ip_block_version)
1839 return -EINVAL;
1840
7bd939d0
LG
1841 switch (ip_block_version->type) {
1842 case AMD_IP_BLOCK_TYPE_VCN:
1843 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
1844 return 0;
1845 break;
1846 case AMD_IP_BLOCK_TYPE_JPEG:
1847 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
1848 return 0;
1849 break;
1850 default:
1851 break;
1852 }
1853
e966a725 1854 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
a0bae357
HR
1855 ip_block_version->funcs->name);
1856
a1255107
AD
1857 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1858
1859 return 0;
1860}
1861
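/*
 * Illustrative sketch: per-ASIC <asic>_set_ip_blocks() routines register
 * their IPs in initialization order with the helper above, roughly:
 *
 *	r = amdgpu_device_ip_block_add(adev, &vi_common_ip_block);
 *	if (r)
 *		return r;
 *	r = amdgpu_device_ip_block_add(adev, &gmc_v8_0_ip_block);
 *	if (r)
 *		return r;
 *
 * (the block version structs shown are examples of the pattern; see the
 * per-ASIC files such as vi.c for the authoritative lists)
 */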
e3ecdffa
AD
1862/**
1863 * amdgpu_device_enable_virtual_display - enable virtual display feature
1864 *
1865 * @adev: amdgpu_device pointer
1866 *
1867 * Enables the virtual display feature if the user has enabled it via
1868 * the module parameter virtual_display. This feature provides virtual
1869 * display hardware on headless boards or in virtualized environments.
1870 * This function parses and validates the configuration string specified by
1871 * the user and configures the virtual display configuration (number of
1872 * virtual connectors, crtcs, etc.) specified.
1873 */
483ef985 1874static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
9accf2fd
ED
1875{
1876 adev->enable_virtual_display = false;
1877
1878 if (amdgpu_virtual_display) {
8f66090b 1879 const char *pci_address_name = pci_name(adev->pdev);
0f66356d 1880 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
9accf2fd
ED
1881
1882 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1883 pciaddstr_tmp = pciaddstr;
0f66356d
ED
1884 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1885 pciaddname = strsep(&pciaddname_tmp, ",");
967de2a9
YT
1886 if (!strcmp("all", pciaddname)
1887 || !strcmp(pci_address_name, pciaddname)) {
0f66356d
ED
1888 long num_crtc;
1889 int res = -1;
1890
9accf2fd 1891 adev->enable_virtual_display = true;
0f66356d
ED
1892
1893 if (pciaddname_tmp)
1894 res = kstrtol(pciaddname_tmp, 10,
1895 &num_crtc);
1896
1897 if (!res) {
1898 if (num_crtc < 1)
1899 num_crtc = 1;
1900 if (num_crtc > 6)
1901 num_crtc = 6;
1902 adev->mode_info.num_crtc = num_crtc;
1903 } else {
1904 adev->mode_info.num_crtc = 1;
1905 }
9accf2fd
ED
1906 break;
1907 }
1908 }
1909
0f66356d
ED
1910 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1911 amdgpu_virtual_display, pci_address_name,
1912 adev->enable_virtual_display, adev->mode_info.num_crtc);
9accf2fd
ED
1913
1914 kfree(pciaddstr);
1915 }
1916}
1917
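/*
 * Example of the module parameter format parsed above (illustrative PCI
 * addresses): entries are separated by ';', each entry is a PCI address
 * (or "all") optionally followed by ',' and a CRTC count:
 *
 *	modprobe amdgpu virtual_display=0000:26:00.0,2
 *	modprobe amdgpu virtual_display=0000:26:00.0,2;0000:43:00.0,1
 *	modprobe amdgpu virtual_display=all,1
 */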
25263da3
AD
1918void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
1919{
1920 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
1921 adev->mode_info.num_crtc = 1;
1922 adev->enable_virtual_display = true;
1923 DRM_INFO("virtual_display:%d, num_crtc:%d\n",
1924 adev->enable_virtual_display, adev->mode_info.num_crtc);
1925 }
1926}
1927
e3ecdffa
AD
1928/**
1929 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1930 *
1931 * @adev: amdgpu_device pointer
1932 *
1933 * Parses the asic configuration parameters specified in the gpu info
1934 * firmware and makes them available to the driver for use in configuring
1935 * the asic.
1936 * Returns 0 on success, -EINVAL on failure.
1937 */
e2a75f88
AD
1938static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1939{
e2a75f88 1940 const char *chip_name;
c0a43457 1941 char fw_name[40];
e2a75f88
AD
1942 int err;
1943 const struct gpu_info_firmware_header_v1_0 *hdr;
1944
ab4fe3e1
HR
1945 adev->firmware.gpu_info_fw = NULL;
1946
72de33f8 1947 if (adev->mman.discovery_bin) {
cc375d8c
TY
1948 /*
1949 * FIXME: The bounding box is still needed by Navi12, so
e24d0e91 1950 * temporarily read it from gpu_info firmware. Should be dropped
cc375d8c
TY
1951 * when DAL no longer needs it.
1952 */
1953 if (adev->asic_type != CHIP_NAVI12)
1954 return 0;
258620d0
AD
1955 }
1956
e2a75f88 1957 switch (adev->asic_type) {
e2a75f88
AD
1958 default:
1959 return 0;
1960 case CHIP_VEGA10:
1961 chip_name = "vega10";
1962 break;
3f76dced
AD
1963 case CHIP_VEGA12:
1964 chip_name = "vega12";
1965 break;
2d2e5e7e 1966 case CHIP_RAVEN:
54f78a76 1967 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
54c4d17e 1968 chip_name = "raven2";
54f78a76 1969 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
741deade 1970 chip_name = "picasso";
54c4d17e
FX
1971 else
1972 chip_name = "raven";
2d2e5e7e 1973 break;
65e60f6e
LM
1974 case CHIP_ARCTURUS:
1975 chip_name = "arcturus";
1976 break;
42b325e5
XY
1977 case CHIP_NAVI12:
1978 chip_name = "navi12";
1979 break;
e2a75f88
AD
1980 }
1981
1982 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
ab4fe3e1 1983 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
e2a75f88
AD
1984 if (err) {
1985 dev_err(adev->dev,
1986 "Failed to load gpu_info firmware \"%s\"\n",
1987 fw_name);
1988 goto out;
1989 }
ab4fe3e1 1990 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
e2a75f88
AD
1991 if (err) {
1992 dev_err(adev->dev,
1993 "Failed to validate gpu_info firmware \"%s\"\n",
1994 fw_name);
1995 goto out;
1996 }
1997
ab4fe3e1 1998 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
e2a75f88
AD
1999 amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
2000
2001 switch (hdr->version_major) {
2002 case 1:
2003 {
2004 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
ab4fe3e1 2005 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
e2a75f88
AD
2006 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2007
cc375d8c
TY
2008 /*
2009 * Should be dropped when DAL no longer needs it.
2010 */
2011 if (adev->asic_type == CHIP_NAVI12)
ec51d3fa
XY
2012 goto parse_soc_bounding_box;
2013
b5ab16bf
AD
2014 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
2015 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
2016 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
2017 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
e2a75f88 2018 adev->gfx.config.max_texture_channel_caches =
b5ab16bf
AD
2019 le32_to_cpu(gpu_info_fw->gc_num_tccs);
2020 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
2021 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
2022 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
2023 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
e2a75f88 2024 adev->gfx.config.double_offchip_lds_buf =
b5ab16bf
AD
2025 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
2026 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
51fd0370
HZ
2027 adev->gfx.cu_info.max_waves_per_simd =
2028 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
2029 adev->gfx.cu_info.max_scratch_slots_per_cu =
2030 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
2031 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
48321c3d 2032 if (hdr->version_minor >= 1) {
35c2e910
HZ
2033 const struct gpu_info_firmware_v1_1 *gpu_info_fw =
2034 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
2035 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2036 adev->gfx.config.num_sc_per_sh =
2037 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
2038 adev->gfx.config.num_packer_per_sc =
2039 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
2040 }
ec51d3fa
XY
2041
2042parse_soc_bounding_box:
ec51d3fa
XY
2043 /*
2044 * soc bounding box info is not integrated in the discovery table,
258620d0 2045 * so we always need to parse it from the gpu_info firmware when needed.
ec51d3fa 2046 */
48321c3d
HW
2047 if (hdr->version_minor == 2) {
2048 const struct gpu_info_firmware_v1_2 *gpu_info_fw =
2049 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
2050 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2051 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
2052 }
e2a75f88
AD
2053 break;
2054 }
2055 default:
2056 dev_err(adev->dev,
2057 "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
2058 err = -EINVAL;
2059 goto out;
2060 }
2061out:
e2a75f88
AD
2062 return err;
2063}
2064
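/*
 * For example, with the snprintf above a Vega10 board requests the blob
 * "amdgpu/vega10_gpu_info.bin" from the firmware search path.
 */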
e3ecdffa
AD
2065/**
2066 * amdgpu_device_ip_early_init - run early init for hardware IPs
2067 *
2068 * @adev: amdgpu_device pointer
2069 *
2070 * Early initialization pass for hardware IPs. The hardware IPs that make
2071 * up each asic are discovered and each IP's early_init callback is run. This
2072 * is the first stage in initializing the asic.
2073 * Returns 0 on success, negative error code on failure.
2074 */
06ec9070 2075static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
d38ceaf9 2076{
901e2be2
AD
2077 struct drm_device *dev = adev_to_drm(adev);
2078 struct pci_dev *parent;
aaa36a97 2079 int i, r;
d38ceaf9 2080
483ef985 2081 amdgpu_device_enable_virtual_display(adev);
a6be7570 2082
00a979f3 2083 if (amdgpu_sriov_vf(adev)) {
00a979f3 2084 r = amdgpu_virt_request_full_gpu(adev, true);
aaa36a97
AD
2085 if (r)
2086 return r;
00a979f3
WS
2087 }
2088
d38ceaf9 2089 switch (adev->asic_type) {
33f34802
KW
2090#ifdef CONFIG_DRM_AMDGPU_SI
2091 case CHIP_VERDE:
2092 case CHIP_TAHITI:
2093 case CHIP_PITCAIRN:
2094 case CHIP_OLAND:
2095 case CHIP_HAINAN:
295d0daf 2096 adev->family = AMDGPU_FAMILY_SI;
33f34802
KW
2097 r = si_set_ip_blocks(adev);
2098 if (r)
2099 return r;
2100 break;
2101#endif
a2e73f56
AD
2102#ifdef CONFIG_DRM_AMDGPU_CIK
2103 case CHIP_BONAIRE:
2104 case CHIP_HAWAII:
2105 case CHIP_KAVERI:
2106 case CHIP_KABINI:
2107 case CHIP_MULLINS:
e1ad2d53 2108 if (adev->flags & AMD_IS_APU)
a2e73f56 2109 adev->family = AMDGPU_FAMILY_KV;
e1ad2d53
AD
2110 else
2111 adev->family = AMDGPU_FAMILY_CI;
a2e73f56
AD
2112
2113 r = cik_set_ip_blocks(adev);
2114 if (r)
2115 return r;
2116 break;
2117#endif
da87c30b
AD
2118 case CHIP_TOPAZ:
2119 case CHIP_TONGA:
2120 case CHIP_FIJI:
2121 case CHIP_POLARIS10:
2122 case CHIP_POLARIS11:
2123 case CHIP_POLARIS12:
2124 case CHIP_VEGAM:
2125 case CHIP_CARRIZO:
2126 case CHIP_STONEY:
2127 if (adev->flags & AMD_IS_APU)
2128 adev->family = AMDGPU_FAMILY_CZ;
2129 else
2130 adev->family = AMDGPU_FAMILY_VI;
2131
2132 r = vi_set_ip_blocks(adev);
2133 if (r)
2134 return r;
2135 break;
d38ceaf9 2136 default:
63352b7f
AD
2137 r = amdgpu_discovery_set_ip_blocks(adev);
2138 if (r)
2139 return r;
2140 break;
d38ceaf9
AD
2141 }
2142
901e2be2
AD
2143 if (amdgpu_has_atpx() &&
2144 (amdgpu_is_atpx_hybrid() ||
2145 amdgpu_has_atpx_dgpu_power_cntl()) &&
2146 ((adev->flags & AMD_IS_APU) == 0) &&
2147 !pci_is_thunderbolt_attached(to_pci_dev(dev->dev)))
2148 adev->flags |= AMD_IS_PX;
2149
85ac2021
AD
2150 if (!(adev->flags & AMD_IS_APU)) {
2151 parent = pci_upstream_bridge(adev->pdev);
2152 adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
2153 }
901e2be2 2154
c004d44e 2155 amdgpu_amdkfd_device_probe(adev);
1884734a 2156
3b94fb10 2157 adev->pm.pp_feature = amdgpu_pp_feature_mask;
a35ad98b 2158 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
00544006 2159 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
4215a119
HC
2160 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2161 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
00f54b97 2162
d38ceaf9
AD
2163 for (i = 0; i < adev->num_ip_blocks; i++) {
2164 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
ed8cf00c
HR
2165 DRM_ERROR("disabled ip block: %d <%s>\n",
2166 i, adev->ip_blocks[i].version->funcs->name);
a1255107 2167 adev->ip_blocks[i].status.valid = false;
d38ceaf9 2168 } else {
a1255107
AD
2169 if (adev->ip_blocks[i].version->funcs->early_init) {
2170 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2c1a2784 2171 if (r == -ENOENT) {
a1255107 2172 adev->ip_blocks[i].status.valid = false;
2c1a2784 2173 } else if (r) {
a1255107
AD
2174 DRM_ERROR("early_init of IP block <%s> failed %d\n",
2175 adev->ip_blocks[i].version->funcs->name, r);
d38ceaf9 2176 return r;
2c1a2784 2177 } else {
a1255107 2178 adev->ip_blocks[i].status.valid = true;
2c1a2784 2179 }
974e6b64 2180 } else {
a1255107 2181 adev->ip_blocks[i].status.valid = true;
d38ceaf9 2182 }
d38ceaf9 2183 }
21a249ca
AD
2184 /* get the vbios after the asic_funcs are set up */
2185 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
6e29c227
AD
2186 r = amdgpu_device_parse_gpu_info_fw(adev);
2187 if (r)
2188 return r;
2189
21a249ca
AD
2190 /* Read BIOS */
2191 if (!amdgpu_get_bios(adev))
2192 return -EINVAL;
2193
2194 r = amdgpu_atombios_init(adev);
2195 if (r) {
2196 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2197 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2198 return r;
2199 }
77eabc6f
PJZ
2200
2201 /* get pf2vf msg info at its earliest time */
2202 if (amdgpu_sriov_vf(adev))
2203 amdgpu_virt_init_data_exchange(adev);
2204
21a249ca 2205 }
d38ceaf9
AD
2206 }
2207
395d1fb9
NH
2208 adev->cg_flags &= amdgpu_cg_mask;
2209 adev->pg_flags &= amdgpu_pg_mask;
2210
d38ceaf9
AD
2211 return 0;
2212}
2213
0a4f2520
RZ
2214static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2215{
2216 int i, r;
2217
2218 for (i = 0; i < adev->num_ip_blocks; i++) {
2219 if (!adev->ip_blocks[i].status.sw)
2220 continue;
2221 if (adev->ip_blocks[i].status.hw)
2222 continue;
2223 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2d11fd3f 2224 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
0a4f2520
RZ
2225 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2226 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2227 if (r) {
2228 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2229 adev->ip_blocks[i].version->funcs->name, r);
2230 return r;
2231 }
2232 adev->ip_blocks[i].status.hw = true;
2233 }
2234 }
2235
2236 return 0;
2237}
2238
2239static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2240{
2241 int i, r;
2242
2243 for (i = 0; i < adev->num_ip_blocks; i++) {
2244 if (!adev->ip_blocks[i].status.sw)
2245 continue;
2246 if (adev->ip_blocks[i].status.hw)
2247 continue;
2248 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2249 if (r) {
2250 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2251 adev->ip_blocks[i].version->funcs->name, r);
2252 return r;
2253 }
2254 adev->ip_blocks[i].status.hw = true;
2255 }
2256
2257 return 0;
2258}
2259
7a3e0bb2
RZ
2260static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2261{
2262 int r = 0;
2263 int i;
80f41f84 2264 uint32_t smu_version;
7a3e0bb2
RZ
2265
2266 if (adev->asic_type >= CHIP_VEGA10) {
2267 for (i = 0; i < adev->num_ip_blocks; i++) {
482f0e53
ML
2268 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2269 continue;
2270
e3c1b071 2271 if (!adev->ip_blocks[i].status.sw)
2272 continue;
2273
482f0e53
ML
2274 /* no need to do the fw loading again if already done */
2275 if (adev->ip_blocks[i].status.hw == true)
2276 break;
2277
53b3f8f4 2278 if (amdgpu_in_reset(adev) || adev->in_suspend) {
482f0e53
ML
2279 r = adev->ip_blocks[i].version->funcs->resume(adev);
2280 if (r) {
2281 DRM_ERROR("resume of IP block <%s> failed %d\n",
7a3e0bb2 2282 adev->ip_blocks[i].version->funcs->name, r);
482f0e53
ML
2283 return r;
2284 }
2285 } else {
2286 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2287 if (r) {
2288 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2289 adev->ip_blocks[i].version->funcs->name, r);
2290 return r;
7a3e0bb2 2291 }
7a3e0bb2 2292 }
482f0e53
ML
2293
2294 adev->ip_blocks[i].status.hw = true;
2295 break;
7a3e0bb2
RZ
2296 }
2297 }
482f0e53 2298
8973d9ec
ED
2299 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2300 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
7a3e0bb2 2301
80f41f84 2302 return r;
7a3e0bb2
RZ
2303}
2304
5fd8518d
AG
2305static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
2306{
2307 long timeout;
2308 int r, i;
2309
2310 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
2311 struct amdgpu_ring *ring = adev->rings[i];
2312
2313 /* No need to setup the GPU scheduler for rings that don't need it */
2314 if (!ring || ring->no_scheduler)
2315 continue;
2316
2317 switch (ring->funcs->type) {
2318 case AMDGPU_RING_TYPE_GFX:
2319 timeout = adev->gfx_timeout;
2320 break;
2321 case AMDGPU_RING_TYPE_COMPUTE:
2322 timeout = adev->compute_timeout;
2323 break;
2324 case AMDGPU_RING_TYPE_SDMA:
2325 timeout = adev->sdma_timeout;
2326 break;
2327 default:
2328 timeout = adev->video_timeout;
2329 break;
2330 }
2331
2332 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
2333 ring->num_hw_submission, amdgpu_job_hang_limit,
8ab62eda
JG
2334 timeout, adev->reset_domain->wq,
2335 ring->sched_score, ring->name,
2336 adev->dev);
5fd8518d
AG
2337 if (r) {
2338 DRM_ERROR("Failed to create scheduler on ring %s.\n",
2339 ring->name);
2340 return r;
2341 }
2342 }
2343
2344 return 0;
2345}
2346
2347
e3ecdffa
AD
2348/**
2349 * amdgpu_device_ip_init - run init for hardware IPs
2350 *
2351 * @adev: amdgpu_device pointer
2352 *
2353 * Main initialization pass for hardware IPs. The list of all the hardware
2354 * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2355 * are run. sw_init initializes the software state associated with each IP
2356 * and hw_init initializes the hardware associated with each IP.
2357 * Returns 0 on success, negative error code on failure.
2358 */
06ec9070 2359static int amdgpu_device_ip_init(struct amdgpu_device *adev)
d38ceaf9
AD
2360{
2361 int i, r;
2362
c030f2e4 2363 r = amdgpu_ras_init(adev);
2364 if (r)
2365 return r;
2366
d38ceaf9 2367 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 2368 if (!adev->ip_blocks[i].status.valid)
d38ceaf9 2369 continue;
a1255107 2370 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2c1a2784 2371 if (r) {
a1255107
AD
2372 DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2373 adev->ip_blocks[i].version->funcs->name, r);
72d3f592 2374 goto init_failed;
2c1a2784 2375 }
a1255107 2376 adev->ip_blocks[i].status.sw = true;
bfca0289 2377
c1c39032
AD
2378 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2379 /* need to do common hw init early so everything is set up for gmc */
2380 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2381 if (r) {
2382 DRM_ERROR("hw_init %d failed %d\n", i, r);
2383 goto init_failed;
2384 }
2385 adev->ip_blocks[i].status.hw = true;
2386 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2387 /* need to do gmc hw init early so we can allocate gpu mem */
892deb48
VS
2388 /* Try to reserve bad pages early */
2389 if (amdgpu_sriov_vf(adev))
2390 amdgpu_virt_exchange_data(adev);
2391
06ec9070 2392 r = amdgpu_device_vram_scratch_init(adev);
2c1a2784
AD
2393 if (r) {
2394 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
72d3f592 2395 goto init_failed;
2c1a2784 2396 }
a1255107 2397 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2c1a2784
AD
2398 if (r) {
2399 DRM_ERROR("hw_init %d failed %d\n", i, r);
72d3f592 2400 goto init_failed;
2c1a2784 2401 }
06ec9070 2402 r = amdgpu_device_wb_init(adev);
2c1a2784 2403 if (r) {
06ec9070 2404 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
72d3f592 2405 goto init_failed;
2c1a2784 2406 }
a1255107 2407 adev->ip_blocks[i].status.hw = true;
2493664f
ML
2408
2409 /* right after GMC hw init, we create CSA */
8a1fbb4a 2410 if (amdgpu_mcbp) {
1e256e27
RZ
2411 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2412 AMDGPU_GEM_DOMAIN_VRAM,
2413 AMDGPU_CSA_SIZE);
2493664f
ML
2414 if (r) {
2415 DRM_ERROR("allocate CSA failed %d\n", r);
72d3f592 2416 goto init_failed;
2493664f
ML
2417 }
2418 }
d38ceaf9
AD
2419 }
2420 }
2421
c9ffa427 2422 if (amdgpu_sriov_vf(adev))
22c16d25 2423 amdgpu_virt_init_data_exchange(adev);
c9ffa427 2424
533aed27
AG
2425 r = amdgpu_ib_pool_init(adev);
2426 if (r) {
2427 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2428 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2429 goto init_failed;
2430 }
2431
c8963ea4
RZ
2432 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2433 if (r)
72d3f592 2434 goto init_failed;
0a4f2520
RZ
2435
2436 r = amdgpu_device_ip_hw_init_phase1(adev);
2437 if (r)
72d3f592 2438 goto init_failed;
0a4f2520 2439
7a3e0bb2
RZ
2440 r = amdgpu_device_fw_loading(adev);
2441 if (r)
72d3f592 2442 goto init_failed;
7a3e0bb2 2443
0a4f2520
RZ
2444 r = amdgpu_device_ip_hw_init_phase2(adev);
2445 if (r)
72d3f592 2446 goto init_failed;
d38ceaf9 2447
121a2bc6
AG
2448 /*
2449 * retired pages will be loaded from eeprom and reserved here,
2450 * it should be called after amdgpu_device_ip_hw_init_phase2 since
2451 * for some ASICs the RAS EEPROM code relies on SMU fully functioning
2452 * for I2C communication which only true at this point.
b82e65a9
GC
2453 *
2454 * amdgpu_ras_recovery_init may fail, but the upper layers only care about
2455 * failures caused by a bad gpu situation and stop the amdgpu init process
2456 * accordingly. For other failure cases, it will still release all
2457 * the resources and print an error message, rather than returning a
2458 * negative value to the upper level.
121a2bc6
AG
2459 *
2460 * Note: theoretically, this should be called before all vram allocations
2461 * to protect retired pages from being abused
2462 */
b82e65a9
GC
2463 r = amdgpu_ras_recovery_init(adev);
2464 if (r)
2465 goto init_failed;
121a2bc6 2466
cfbb6b00
AG
2467 /**
2468 * In case of XGMI grab extra reference for reset domain for this device
2469 */
a4c63caf 2470 if (adev->gmc.xgmi.num_physical_nodes > 1) {
cfbb6b00 2471 if (amdgpu_xgmi_add_device(adev) == 0) {
46c67660 2472 if (!amdgpu_sriov_vf(adev)) {
2efc30f0
VC
2473 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
2474
46c67660 2475 if (!hive->reset_domain ||
2476 !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
2477 r = -ENOENT;
2478 amdgpu_put_xgmi_hive(hive);
2479 goto init_failed;
2480 }
2481
2482 /* Drop the early temporary reset domain we created for device */
2483 amdgpu_reset_put_reset_domain(adev->reset_domain);
2484 adev->reset_domain = hive->reset_domain;
9dfa4860 2485 amdgpu_put_xgmi_hive(hive);
cfbb6b00 2486 }
a4c63caf
AG
2487 }
2488 }
2489
5fd8518d
AG
2490 r = amdgpu_device_init_schedulers(adev);
2491 if (r)
2492 goto init_failed;
e3c1b071 2493
2494 /* Don't init kfd if whole hive need to be reset during init */
c004d44e 2495 if (!adev->gmc.xgmi.pending_reset)
e3c1b071 2496 amdgpu_amdkfd_device_init(adev);
c6332b97 2497
bd607166
KR
2498 amdgpu_fru_get_product_info(adev);
2499
72d3f592 2500init_failed:
c9ffa427 2501 if (amdgpu_sriov_vf(adev))
c6332b97 2502 amdgpu_virt_release_full_gpu(adev, true);
2503
72d3f592 2504 return r;
d38ceaf9
AD
2505}
2506
e3ecdffa
AD
2507/**
2508 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2509 *
2510 * @adev: amdgpu_device pointer
2511 *
2512 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
2513 * this function before a GPU reset. If the value is retained after a
2514 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
2515 */
06ec9070 2516static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
0c49e0b8
CZ
2517{
2518 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2519}
2520
e3ecdffa
AD
2521/**
2522 * amdgpu_device_check_vram_lost - check if vram is valid
2523 *
2524 * @adev: amdgpu_device pointer
2525 *
2526 * Checks the reset magic value written to the gart pointer in VRAM.
2527 * The driver calls this after a GPU reset to see if the contents of
2528 * VRAM is lost or not.
2529 * returns true if vram is lost, false if not.
2530 */
06ec9070 2531static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
0c49e0b8 2532{
dadce777
EQ
2533 if (memcmp(adev->gart.ptr, adev->reset_magic,
2534 AMDGPU_RESET_MAGIC_NUM))
2535 return true;
2536
53b3f8f4 2537 if (!amdgpu_in_reset(adev))
dadce777
EQ
2538 return false;
2539
2540 /*
2541 * For all ASICs with baco/mode1 reset, the VRAM is
2542 * always assumed to be lost.
2543 */
2544 switch (amdgpu_asic_reset_method(adev)) {
2545 case AMD_RESET_METHOD_BACO:
2546 case AMD_RESET_METHOD_MODE1:
2547 return true;
2548 default:
2549 return false;
2550 }
0c49e0b8
CZ
2551}
2552
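/*
 * Illustrative sketch of how the two helpers above pair up around a GPU
 * reset (simplified; the real sequencing lives in the reset paths):
 *
 *	amdgpu_device_fill_reset_magic(adev);
 *	// ... ASIC reset happens ...
 *	if (amdgpu_device_check_vram_lost(adev))
 *		; // treat VRAM contents as lost and recover buffers
 */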
e3ecdffa 2553/**
1112a46b 2554 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
e3ecdffa
AD
2555 *
2556 * @adev: amdgpu_device pointer
b8b72130 2557 * @state: clockgating state (gate or ungate)
e3ecdffa 2558 *
e3ecdffa 2559 * The list of all the hardware IPs that make up the asic is walked and the
1112a46b
RZ
2560 * set_clockgating_state callbacks are run.
2561 * On late init, this pass enables clockgating for the hardware IPs.
2562 * On fini or suspend, it disables clockgating for the hardware IPs.
e3ecdffa
AD
2563 * Returns 0 on success, negative error code on failure.
2564 */
fdd34271 2565
5d89bb2d
LL
2566int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2567 enum amd_clockgating_state state)
d38ceaf9 2568{
1112a46b 2569 int i, j, r;
d38ceaf9 2570
4a2ba394
SL
2571 if (amdgpu_emu_mode == 1)
2572 return 0;
2573
1112a46b
RZ
2574 for (j = 0; j < adev->num_ip_blocks; j++) {
2575 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
a2d31dc3 2576 if (!adev->ip_blocks[i].status.late_initialized)
d38ceaf9 2577 continue;
5d70a549
PV
2578 /* skip CG for GFX on S0ix */
2579 if (adev->in_s0ix &&
2580 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)
2581 continue;
4a446d55 2582 /* skip CG for VCE/UVD, it's handled specially */
a1255107 2583 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
57716327 2584 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
34319b32 2585 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
52f2e779 2586 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
57716327 2587 adev->ip_blocks[i].version->funcs->set_clockgating_state) {
4a446d55 2588 /* enable clockgating to save power */
a1255107 2589 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
1112a46b 2590 state);
4a446d55
AD
2591 if (r) {
2592 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
a1255107 2593 adev->ip_blocks[i].version->funcs->name, r);
4a446d55
AD
2594 return r;
2595 }
b0b00ff1 2596 }
d38ceaf9 2597 }
06b18f61 2598
c9f96fd5
RZ
2599 return 0;
2600}
2601
5d89bb2d
LL
2602int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
2603 enum amd_powergating_state state)
c9f96fd5 2604{
1112a46b 2605 int i, j, r;
06b18f61 2606
c9f96fd5
RZ
2607 if (amdgpu_emu_mode == 1)
2608 return 0;
2609
1112a46b
RZ
2610 for (j = 0; j < adev->num_ip_blocks; j++) {
2611 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
a2d31dc3 2612 if (!adev->ip_blocks[i].status.late_initialized)
c9f96fd5 2613 continue;
5d70a549
PV
2614 /* skip PG for GFX on S0ix */
2615 if (adev->in_s0ix &&
2616 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)
2617 continue;
c9f96fd5
RZ
2618 /* skip PG for VCE/UVD, it's handled specially */
2619 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2620 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2621 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
52f2e779 2622 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
c9f96fd5
RZ
2623 adev->ip_blocks[i].version->funcs->set_powergating_state) {
2624 /* enable powergating to save power */
2625 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
1112a46b 2626 state);
c9f96fd5
RZ
2627 if (r) {
2628 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2629 adev->ip_blocks[i].version->funcs->name, r);
2630 return r;
2631 }
2632 }
2633 }
2dc80b00
S
2634 return 0;
2635}
2636
beff74bc
AD
2637static int amdgpu_device_enable_mgpu_fan_boost(void)
2638{
2639 struct amdgpu_gpu_instance *gpu_ins;
2640 struct amdgpu_device *adev;
2641 int i, ret = 0;
2642
2643 mutex_lock(&mgpu_info.mutex);
2644
2645 /*
2646 * MGPU fan boost feature should be enabled
2647 * only when there are two or more dGPUs in
2648 * the system
2649 */
2650 if (mgpu_info.num_dgpu < 2)
2651 goto out;
2652
2653 for (i = 0; i < mgpu_info.num_dgpu; i++) {
2654 gpu_ins = &(mgpu_info.gpu_ins[i]);
2655 adev = gpu_ins->adev;
2656 if (!(adev->flags & AMD_IS_APU) &&
f10bb940 2657 !gpu_ins->mgpu_fan_enabled) {
beff74bc
AD
2658 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2659 if (ret)
2660 break;
2661
2662 gpu_ins->mgpu_fan_enabled = 1;
2663 }
2664 }
2665
2666out:
2667 mutex_unlock(&mgpu_info.mutex);
2668
2669 return ret;
2670}
2671
e3ecdffa
AD
2672/**
2673 * amdgpu_device_ip_late_init - run late init for hardware IPs
2674 *
2675 * @adev: amdgpu_device pointer
2676 *
2677 * Late initialization pass for hardware IPs. The list of all the hardware
2678 * IPs that make up the asic is walked and the late_init callbacks are run.
2679 * late_init covers any special initialization that an IP requires
2680 * after all of them have been initialized or something that needs to happen
2681 * late in the init process.
2682 * Returns 0 on success, negative error code on failure.
2683 */
06ec9070 2684static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2dc80b00 2685{
60599a03 2686 struct amdgpu_gpu_instance *gpu_instance;
2dc80b00
S
2687 int i = 0, r;
2688
2689 for (i = 0; i < adev->num_ip_blocks; i++) {
73f847db 2690 if (!adev->ip_blocks[i].status.hw)
2dc80b00
S
2691 continue;
2692 if (adev->ip_blocks[i].version->funcs->late_init) {
2693 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2694 if (r) {
2695 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2696 adev->ip_blocks[i].version->funcs->name, r);
2697 return r;
2698 }
2dc80b00 2699 }
73f847db 2700 adev->ip_blocks[i].status.late_initialized = true;
2dc80b00
S
2701 }
2702
867e24ca 2703 r = amdgpu_ras_late_init(adev);
2704 if (r) {
2705 DRM_ERROR("amdgpu_ras_late_init failed %d", r);
2706 return r;
2707 }
2708
a891d239
DL
2709 amdgpu_ras_set_error_query_ready(adev, true);
2710
1112a46b
RZ
2711 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2712 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
916ac57f 2713
06ec9070 2714 amdgpu_device_fill_reset_magic(adev);
d38ceaf9 2715
beff74bc
AD
2716 r = amdgpu_device_enable_mgpu_fan_boost();
2717 if (r)
2718 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2719
4da8b639 2720 /* For passthrough configurations on arcturus and aldebaran, enable special SBR handling */
2721 if (amdgpu_passthrough(adev) && ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
2722 adev->asic_type == CHIP_ALDEBARAN))
bc143d8b 2723 amdgpu_dpm_handle_passthrough_sbr(adev, true);
60599a03
EQ
2724
2725 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2726 mutex_lock(&mgpu_info.mutex);
2727
2728 /*
2729 * Reset device p-state to low as this was booted with high.
2730 *
2731 * This should be performed only after all devices from the same
2732 * hive get initialized.
2733 *
2734 * However, the number of devices in the hive is not known in advance;
2735 * it is counted one by one as the devices are initialized.
2736 *
2737 * So we wait until all XGMI interlinked devices are initialized.
2738 * This may introduce some delay as those devices may come from
2739 * different hives. But that should be OK.
2740 */
2741 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2742 for (i = 0; i < mgpu_info.num_gpu; i++) {
2743 gpu_instance = &(mgpu_info.gpu_ins[i]);
2744 if (gpu_instance->adev->flags & AMD_IS_APU)
2745 continue;
2746
d84a430d
JK
2747 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2748 AMDGPU_XGMI_PSTATE_MIN);
60599a03
EQ
2749 if (r) {
2750 DRM_ERROR("pstate setting failed (%d).\n", r);
2751 break;
2752 }
2753 }
2754 }
2755
2756 mutex_unlock(&mgpu_info.mutex);
2757 }
2758
d38ceaf9
AD
2759 return 0;
2760}
2761
613aa3ea
LY
2762/**
2763 * amdgpu_device_smu_fini_early - smu hw_fini wrapper
2764 *
2765 * @adev: amdgpu_device pointer
2766 *
2767 * For ASICs that need to disable SMC first
2768 */
2769static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
2770{
2771 int i, r;
2772
2773 if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))
2774 return;
2775
2776 for (i = 0; i < adev->num_ip_blocks; i++) {
2777 if (!adev->ip_blocks[i].status.hw)
2778 continue;
2779 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2780 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2781 /* XXX handle errors */
2782 if (r) {
2783 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2784 adev->ip_blocks[i].version->funcs->name, r);
2785 }
2786 adev->ip_blocks[i].status.hw = false;
2787 break;
2788 }
2789 }
2790}
2791
e9669fb7 2792static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
d38ceaf9
AD
2793{
2794 int i, r;
2795
e9669fb7
AG
2796 for (i = 0; i < adev->num_ip_blocks; i++) {
2797 if (!adev->ip_blocks[i].version->funcs->early_fini)
2798 continue;
5278a159 2799
e9669fb7
AG
2800 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev);
2801 if (r) {
2802 DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
2803 adev->ip_blocks[i].version->funcs->name, r);
2804 }
2805 }
c030f2e4 2806
05df1f01 2807 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
fdd34271
RZ
2808 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2809
7270e895
TY
2810 amdgpu_amdkfd_suspend(adev, false);
2811
613aa3ea
LY
2812 /* Workaround for ASICs that need to disable SMC first */
2813 amdgpu_device_smu_fini_early(adev);
3e96dbfd 2814
d38ceaf9 2815 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
a1255107 2816 if (!adev->ip_blocks[i].status.hw)
d38ceaf9 2817 continue;
8201a67a 2818
a1255107 2819 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
d38ceaf9 2820 /* XXX handle errors */
2c1a2784 2821 if (r) {
a1255107
AD
2822 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2823 adev->ip_blocks[i].version->funcs->name, r);
2c1a2784 2824 }
8201a67a 2825
a1255107 2826 adev->ip_blocks[i].status.hw = false;
d38ceaf9
AD
2827 }
2828
6effad8a
GC
2829 if (amdgpu_sriov_vf(adev)) {
2830 if (amdgpu_virt_release_full_gpu(adev, false))
2831 DRM_ERROR("failed to release exclusive mode on fini\n");
2832 }
2833
e9669fb7
AG
2834 return 0;
2835}
2836
2837/**
2838 * amdgpu_device_ip_fini - run fini for hardware IPs
2839 *
2840 * @adev: amdgpu_device pointer
2841 *
2842 * Main teardown pass for hardware IPs. The list of all the hardware
2843 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2844 * are run. hw_fini tears down the hardware associated with each IP
2845 * and sw_fini tears down any software state associated with each IP.
2846 * Returns 0 on success, negative error code on failure.
2847 */
2848static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2849{
2850 int i, r;
2851
2852 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2853 amdgpu_virt_release_ras_err_handler_data(adev);
2854
e9669fb7
AG
2855 if (adev->gmc.xgmi.num_physical_nodes > 1)
2856 amdgpu_xgmi_remove_device(adev);
2857
c004d44e 2858 amdgpu_amdkfd_device_fini_sw(adev);
9950cda2 2859
d38ceaf9 2860 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
a1255107 2861 if (!adev->ip_blocks[i].status.sw)
d38ceaf9 2862 continue;
c12aba3a
ML
2863
2864 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
c8963ea4 2865 amdgpu_ucode_free_bo(adev);
1e256e27 2866 amdgpu_free_static_csa(&adev->virt.csa_obj);
c12aba3a
ML
2867 amdgpu_device_wb_fini(adev);
2868 amdgpu_device_vram_scratch_fini(adev);
533aed27 2869 amdgpu_ib_pool_fini(adev);
c12aba3a
ML
2870 }
2871
a1255107 2872 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
d38ceaf9 2873 /* XXX handle errors */
2c1a2784 2874 if (r) {
a1255107
AD
2875 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2876 adev->ip_blocks[i].version->funcs->name, r);
2c1a2784 2877 }
a1255107
AD
2878 adev->ip_blocks[i].status.sw = false;
2879 adev->ip_blocks[i].status.valid = false;
d38ceaf9
AD
2880 }
2881
a6dcfd9c 2882 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
a1255107 2883 if (!adev->ip_blocks[i].status.late_initialized)
8a2eef1d 2884 continue;
a1255107
AD
2885 if (adev->ip_blocks[i].version->funcs->late_fini)
2886 adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2887 adev->ip_blocks[i].status.late_initialized = false;
a6dcfd9c
ML
2888 }
2889
c030f2e4 2890 amdgpu_ras_fini(adev);
2891
d38ceaf9
AD
2892 return 0;
2893}
2894
e3ecdffa 2895/**
beff74bc 2896 * amdgpu_device_delayed_init_work_handler - work handler for IB tests
e3ecdffa 2897 *
1112a46b 2898 * @work: work_struct.
e3ecdffa 2899 */
beff74bc 2900static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2dc80b00
S
2901{
2902 struct amdgpu_device *adev =
beff74bc 2903 container_of(work, struct amdgpu_device, delayed_init_work.work);
916ac57f
RZ
2904 int r;
2905
2906 r = amdgpu_ib_ring_tests(adev);
2907 if (r)
2908 DRM_ERROR("ib ring test failed (%d).\n", r);
2dc80b00
S
2909}
2910
1e317b99
RZ
2911static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2912{
2913 struct amdgpu_device *adev =
2914 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2915
90a92662
MD
2916 WARN_ON_ONCE(adev->gfx.gfx_off_state);
2917 WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
2918
2919 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2920 adev->gfx.gfx_off_state = true;
1e317b99
RZ
2921}
2922
e3ecdffa 2923/**
e7854a03 2924 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
e3ecdffa
AD
2925 *
2926 * @adev: amdgpu_device pointer
2927 *
2928 * Main suspend function for hardware IPs. The list of all the hardware
2929 * IPs that make up the asic is walked, clockgating is disabled and the
2930 * suspend callbacks are run. suspend puts the hardware and software state
2931 * in each IP into a state suitable for suspend.
2932 * Returns 0 on success, negative error code on failure.
2933 */
e7854a03
AD
2934static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2935{
2936 int i, r;
2937
50ec83f0
AD
2938 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2939 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
05df1f01 2940
b31d6ada
EQ
2941 /*
2942 * Per PMFW team's suggestion, driver needs to handle gfxoff
2943 * and df cstate features disablement for gpu reset(e.g. Mode1Reset)
2944 * scenario. Add the missing df cstate disablement here.
2945 */
2946 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
2947 dev_warn(adev->dev, "Failed to disallow df cstate");
2948
e7854a03
AD
2949 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2950 if (!adev->ip_blocks[i].status.valid)
2951 continue;
2b9f7848 2952
e7854a03 2953 /* displays are handled separately */
2b9f7848
ND
2954 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2955 continue;
2956
2957 /* XXX handle errors */
2958 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2959 /* XXX handle errors */
2960 if (r) {
2961 DRM_ERROR("suspend of IP block <%s> failed %d\n",
2962 adev->ip_blocks[i].version->funcs->name, r);
2963 return r;
e7854a03 2964 }
2b9f7848
ND
2965
2966 adev->ip_blocks[i].status.hw = false;
e7854a03
AD
2967 }
2968
e7854a03
AD
2969 return 0;
2970}
2971
2972/**
2973 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2974 *
2975 * @adev: amdgpu_device pointer
2976 *
2977 * Main suspend function for hardware IPs. The list of all the hardware
2978 * IPs that make up the asic is walked, clockgating is disabled and the
2979 * suspend callbacks are run. suspend puts the hardware and software state
2980 * in each IP into a state suitable for suspend.
2981 * Returns 0 on success, negative error code on failure.
2982 */
2983static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
d38ceaf9
AD
2984{
2985 int i, r;
2986
557f42a2 2987 if (adev->in_s0ix)
bc143d8b 2988 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry);
34416931 2989
d38ceaf9 2990 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
a1255107 2991 if (!adev->ip_blocks[i].status.valid)
d38ceaf9 2992 continue;
e7854a03
AD
2993 /* displays are handled in phase1 */
2994 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2995 continue;
bff77e86
LM
2996 /* PSP lost connection when err_event_athub occurs */
2997 if (amdgpu_ras_intr_triggered() &&
2998 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2999 adev->ip_blocks[i].status.hw = false;
3000 continue;
3001 }
e3c1b071 3002
3003 /* skip unnecessary suspend if we have not initialized them yet */
3004 if (adev->gmc.xgmi.pending_reset &&
3005 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3006 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
3007 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3008 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
3009 adev->ip_blocks[i].status.hw = false;
3010 continue;
3011 }
557f42a2 3012
32ff160d
AD
3013 /* skip suspend of gfx and psp for S0ix
3014 * gfx is in gfxoff state, so on resume it will exit gfxoff just
3015 * like at runtime. PSP is also part of the always on hardware
3016 * so no need to suspend it.
3017 */
557f42a2 3018 if (adev->in_s0ix &&
32ff160d
AD
3019 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
3020 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX))
557f42a2
AD
3021 continue;
3022
d38ceaf9 3023 /* XXX handle errors */
a1255107 3024 r = adev->ip_blocks[i].version->funcs->suspend(adev);
d38ceaf9 3025 /* XXX handle errors */
2c1a2784 3026 if (r) {
a1255107
AD
3027 DRM_ERROR("suspend of IP block <%s> failed %d\n",
3028 adev->ip_blocks[i].version->funcs->name, r);
2c1a2784 3029 }
876923fb 3030 adev->ip_blocks[i].status.hw = false;
a3a09142 3031 /* handle putting the SMC in the appropriate state */
86b93fd6
JZ
3032 if (!amdgpu_sriov_vf(adev)) {
3033 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3034 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
3035 if (r) {
3036 DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
3037 adev->mp1_state, r);
3038 return r;
3039 }
a3a09142
AD
3040 }
3041 }
d38ceaf9
AD
3042 }
3043
3044 return 0;
3045}
3046
e7854a03
AD
3047/**
3048 * amdgpu_device_ip_suspend - run suspend for hardware IPs
3049 *
3050 * @adev: amdgpu_device pointer
3051 *
3052 * Main suspend function for hardware IPs. The list of all the hardware
3053 * IPs that make up the asic is walked, clockgating is disabled and the
3054 * suspend callbacks are run. suspend puts the hardware and software state
3055 * in each IP into a state suitable for suspend.
3056 * Returns 0 on success, negative error code on failure.
3057 */
3058int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
3059{
3060 int r;
3061
3c73683c
JC
3062 if (amdgpu_sriov_vf(adev)) {
3063 amdgpu_virt_fini_data_exchange(adev);
e7819644 3064 amdgpu_virt_request_full_gpu(adev, false);
3c73683c 3065 }
e7819644 3066
e7854a03
AD
3067 r = amdgpu_device_ip_suspend_phase1(adev);
3068 if (r)
3069 return r;
3070 r = amdgpu_device_ip_suspend_phase2(adev);
3071
e7819644
YT
3072 if (amdgpu_sriov_vf(adev))
3073 amdgpu_virt_release_full_gpu(adev, false);
3074
e7854a03
AD
3075 return r;
3076}
3077
06ec9070 3078static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
a90ad3c2
ML
3079{
3080 int i, r;
3081
2cb681b6 3082 static enum amd_ip_block_type ip_order[] = {
2cb681b6 3083 AMD_IP_BLOCK_TYPE_COMMON,
c1c39032 3084 AMD_IP_BLOCK_TYPE_GMC,
39186aef 3085 AMD_IP_BLOCK_TYPE_PSP,
2cb681b6
ML
3086 AMD_IP_BLOCK_TYPE_IH,
3087 };
a90ad3c2 3088
95ea3dbc 3089 for (i = 0; i < adev->num_ip_blocks; i++) {
2cb681b6
ML
3090 int j;
3091 struct amdgpu_ip_block *block;
a90ad3c2 3092
4cd2a96d
J
3093 block = &adev->ip_blocks[i];
3094 block->status.hw = false;
2cb681b6 3095
4cd2a96d 3096 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
2cb681b6 3097
4cd2a96d 3098 if (block->version->type != ip_order[j] ||
2cb681b6
ML
3099 !block->status.valid)
3100 continue;
3101
3102 r = block->version->funcs->hw_init(adev);
0aaeefcc 3103 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
c41d1cf6
ML
3104 if (r)
3105 return r;
482f0e53 3106 block->status.hw = true;
a90ad3c2
ML
3107 }
3108 }
3109
3110 return 0;
3111}
3112
06ec9070 3113static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
a90ad3c2
ML
3114{
3115 int i, r;
3116
2cb681b6
ML
3117 static enum amd_ip_block_type ip_order[] = {
3118 AMD_IP_BLOCK_TYPE_SMC,
3119 AMD_IP_BLOCK_TYPE_DCE,
3120 AMD_IP_BLOCK_TYPE_GFX,
3121 AMD_IP_BLOCK_TYPE_SDMA,
257deb8c 3122 AMD_IP_BLOCK_TYPE_UVD,
d83c7a07
JJ
3123 AMD_IP_BLOCK_TYPE_VCE,
3124 AMD_IP_BLOCK_TYPE_VCN
2cb681b6 3125 };
a90ad3c2 3126
2cb681b6
ML
3127 for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
3128 int j;
3129 struct amdgpu_ip_block *block;
a90ad3c2 3130
2cb681b6
ML
3131 for (j = 0; j < adev->num_ip_blocks; j++) {
3132 block = &adev->ip_blocks[j];
3133
3134 if (block->version->type != ip_order[i] ||
482f0e53
ML
3135 !block->status.valid ||
3136 block->status.hw)
2cb681b6
ML
3137 continue;
3138
895bd048
JZ
3139 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
3140 r = block->version->funcs->resume(adev);
3141 else
3142 r = block->version->funcs->hw_init(adev);
3143
0aaeefcc 3144 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
c41d1cf6
ML
3145 if (r)
3146 return r;
482f0e53 3147 block->status.hw = true;
a90ad3c2
ML
3148 }
3149 }
3150
3151 return 0;
3152}
3153
e3ecdffa
AD
3154/**
3155 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
3156 *
3157 * @adev: amdgpu_device pointer
3158 *
3159 * First resume function for hardware IPs. The list of all the hardware
3160 * IPs that make up the asic is walked and the resume callbacks are run for
3161 * COMMON, GMC, and IH. resume puts the hardware into a functional state
3162 * after a suspend and updates the software state as necessary. This
3163 * function is also used for restoring the GPU after a GPU reset.
3164 * Returns 0 on success, negative error code on failure.
3165 */
06ec9070 3166static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
d38ceaf9
AD
3167{
3168 int i, r;
3169
a90ad3c2 3170 for (i = 0; i < adev->num_ip_blocks; i++) {
482f0e53 3171 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
a90ad3c2 3172 continue;
a90ad3c2 3173 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
e3ecdffa 3174 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
d7274ec7
BZ
3175 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3176 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {
482f0e53 3177
fcf0649f
CZ
3178 r = adev->ip_blocks[i].version->funcs->resume(adev);
3179 if (r) {
3180 DRM_ERROR("resume of IP block <%s> failed %d\n",
3181 adev->ip_blocks[i].version->funcs->name, r);
3182 return r;
3183 }
482f0e53 3184 adev->ip_blocks[i].status.hw = true;
a90ad3c2
ML
3185 }
3186 }
3187
3188 return 0;
3189}
3190
e3ecdffa
AD
3191/**
3192 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3193 *
3194 * @adev: amdgpu_device pointer
3195 *
3196 * Second resume function for hardware IPs. The list of all the hardware
3197 * IPs that make up the asic is walked and the resume callbacks are run for
3198 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a
3199 * functional state after a suspend and updates the software state as
3200 * necessary. This function is also used for restoring the GPU after a GPU
3201 * reset.
3202 * Returns 0 on success, negative error code on failure.
3203 */
06ec9070 3204static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
d38ceaf9
AD
3205{
3206 int i, r;
3207
3208 for (i = 0; i < adev->num_ip_blocks; i++) {
482f0e53 3209 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
d38ceaf9 3210 continue;
fcf0649f 3211 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
e3ecdffa 3212 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
7a3e0bb2
RZ
3213 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3214 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
fcf0649f 3215 continue;
a1255107 3216 r = adev->ip_blocks[i].version->funcs->resume(adev);
2c1a2784 3217 if (r) {
a1255107
AD
3218 DRM_ERROR("resume of IP block <%s> failed %d\n",
3219 adev->ip_blocks[i].version->funcs->name, r);
d38ceaf9 3220 return r;
2c1a2784 3221 }
482f0e53 3222 adev->ip_blocks[i].status.hw = true;
f543d286
PL
3223
3224 if (adev->in_s0ix && adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3225 /* disable gfxoff for IP resume. The gfxoff will be re-enabled in
3226 * amdgpu_device_resume() after IP resume.
3227 */
3228 amdgpu_gfx_off_ctrl(adev, false);
3229 DRM_DEBUG("will disable gfxoff for re-initializing other blocks\n");
3230 }
3231
d38ceaf9
AD
3232 }
3233
3234 return 0;
3235}
3236
e3ecdffa
AD
3237/**
3238 * amdgpu_device_ip_resume - run resume for hardware IPs
3239 *
3240 * @adev: amdgpu_device pointer
3241 *
3242 * Main resume function for hardware IPs. The hardware IPs
3243 * are split into two resume functions because they are
3244 * also used in recovering from a GPU reset and some additional
3245 * steps need to be taken between them. In this case (S3/S4) they are
3246 * run sequentially.
3247 * Returns 0 on success, negative error code on failure.
3248 */
06ec9070 3249static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
fcf0649f
CZ
3250{
3251 int r;
3252
9cec53c1
JZ
3253 r = amdgpu_amdkfd_resume_iommu(adev);
3254 if (r)
3255 return r;
3256
06ec9070 3257 r = amdgpu_device_ip_resume_phase1(adev);
fcf0649f
CZ
3258 if (r)
3259 return r;
7a3e0bb2
RZ
3260
3261 r = amdgpu_device_fw_loading(adev);
3262 if (r)
3263 return r;
3264
06ec9070 3265 r = amdgpu_device_ip_resume_phase2(adev);
fcf0649f
CZ
3266
3267 return r;
3268}
3269
e3ecdffa
AD
3270/**
3271 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
3272 *
3273 * @adev: amdgpu_device pointer
3274 *
3275 * Query the VBIOS data tables to determine if the board supports SR-IOV.
3276 */
4e99a44e 3277static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
048765ad 3278{
6867e1b5
ML
3279 if (amdgpu_sriov_vf(adev)) {
3280 if (adev->is_atom_fw) {
58ff791a 3281 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
6867e1b5
ML
3282 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3283 } else {
3284 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3285 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3286 }
3287
3288 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3289 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
a5bde2f9 3290 }
048765ad
AR
3291}
3292
e3ecdffa
AD
3293/**
3294 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3295 *
3296 * @asic_type: AMD asic type
3297 *
3298 * Check if there is DC (new modesetting infrastructre) support for an asic.
3299 * returns true if DC has support, false if not.
3300 */
4562236b
HW
3301bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3302{
3303 switch (asic_type) {
0637d417
AD
3304#ifdef CONFIG_DRM_AMDGPU_SI
3305 case CHIP_HAINAN:
3306#endif
3307 case CHIP_TOPAZ:
3308 /* chips with no display hardware */
3309 return false;
4562236b 3310#if defined(CONFIG_DRM_AMD_DC)
64200c46
MR
3311 case CHIP_TAHITI:
3312 case CHIP_PITCAIRN:
3313 case CHIP_VERDE:
3314 case CHIP_OLAND:
2d32ffd6
AD
3315 /*
3316 * We have systems in the wild with these ASICs that require
3317 * LVDS and VGA support which is not supported with DC.
3318 *
3319 * Fallback to the non-DC driver here by default so as not to
3320 * cause regressions.
3321 */
3322#if defined(CONFIG_DRM_AMD_DC_SI)
3323 return amdgpu_dc > 0;
3324#else
3325 return false;
64200c46 3326#endif
4562236b 3327 case CHIP_BONAIRE:
0d6fbccb 3328 case CHIP_KAVERI:
367e6687
AD
3329 case CHIP_KABINI:
3330 case CHIP_MULLINS:
d9fda248
HW
3331 /*
3332 * We have systems in the wild with these ASICs that require
b5a0168e 3333 * VGA support which is not supported with DC.
d9fda248
HW
3334 *
3335 * Fallback to the non-DC driver here by default so as not to
3336 * cause regressions.
3337 */
3338 return amdgpu_dc > 0;
f7f12b25 3339 default:
fd187853 3340 return amdgpu_dc != 0;
f7f12b25 3341#else
4562236b 3342 default:
93b09a9a 3343 if (amdgpu_dc > 0)
044a48f4 3344 DRM_INFO_ONCE("Display Core has been requested via kernel parameter "
93b09a9a 3345 "but isn't supported by ASIC, ignoring\n");
4562236b 3346 return false;
f7f12b25 3347#endif
4562236b
HW
3348 }
3349}
3350
3351/**
3352 * amdgpu_device_has_dc_support - check if dc is supported
3353 *
982a820b 3354 * @adev: amdgpu_device pointer
4562236b
HW
3355 *
3356 * Returns true for supported, false for not supported
3357 */
3358bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3359{
25263da3 3360 if (adev->enable_virtual_display ||
abaf210c 3361 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
2555039d
XY
3362 return false;
3363
4562236b
HW
3364 return amdgpu_device_asic_has_dc_support(adev->asic_type);
3365}
3366
d4535e2c
AG
3367static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3368{
3369 struct amdgpu_device *adev =
3370 container_of(__work, struct amdgpu_device, xgmi_reset_work);
d95e8e97 3371 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
d4535e2c 3372
c6a6e2db
AG
3373 /* It's a bug to not have a hive within this function */
3374 if (WARN_ON(!hive))
3375 return;
3376
3377 /*
3378 * Use task barrier to synchronize all xgmi reset works across the
3379 * hive. task_barrier_enter and task_barrier_exit will block
3380 * until all the threads running the xgmi reset works reach
3381 * those points. task_barrier_full will do both blocks.
3382 */
3383 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3384
3385 task_barrier_enter(&hive->tb);
4a580877 3386 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
c6a6e2db
AG
3387
3388 if (adev->asic_reset_res)
3389 goto fail;
3390
3391 task_barrier_exit(&hive->tb);
4a580877 3392 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
c6a6e2db
AG
3393
3394 if (adev->asic_reset_res)
3395 goto fail;
43c4d576 3396
5e67bba3 3397 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops &&
3398 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
3399 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev);
c6a6e2db
AG
3400 } else {
3401
3402 task_barrier_full(&hive->tb);
3403 adev->asic_reset_res = amdgpu_asic_reset(adev);
3404 }
ce316fa5 3405
c6a6e2db 3406fail:
d4535e2c 3407 if (adev->asic_reset_res)
fed184e9 3408 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
4a580877 3409 adev->asic_reset_res, adev_to_drm(adev)->unique);
d95e8e97 3410 amdgpu_put_xgmi_hive(hive);
d4535e2c
AG
3411}
3412
71f98027
AD
3413static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3414{
3415 char *input = amdgpu_lockup_timeout;
3416 char *timeout_setting = NULL;
3417 int index = 0;
3418 long timeout;
3419 int ret = 0;
3420
3421 /*
67387dfe
AD
3422	 * By default, the timeout for non-compute jobs is 10000 ms
3423	 * and 60000 ms for compute jobs.
71f98027 3424	 * In SR-IOV or passthrough mode, the timeout for compute
b7b2a316 3425	 * jobs is 60000 ms by default.
71f98027
AD
3426 */
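	/*
	 * Illustrative example (added for clarity, not part of the original
	 * source): a module parameter such as
	 * amdgpu.lockup_timeout=10000,60000,10000,10000 is parsed below as the
	 * gfx, compute, sdma and video timeouts in ms, in that order.  A
	 * single value, e.g. lockup_timeout=5000, is applied to all
	 * non-compute queues (and to compute as well under SR-IOV or
	 * passthrough); 0 keeps the default and a negative value disables
	 * the timeout.
	 */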
3427 adev->gfx_timeout = msecs_to_jiffies(10000);
3428 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
9882e278
ED
3429 if (amdgpu_sriov_vf(adev))
3430 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
3431 msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
71f98027 3432 else
67387dfe 3433 adev->compute_timeout = msecs_to_jiffies(60000);
71f98027 3434
f440ff44 3435 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
71f98027 3436 while ((timeout_setting = strsep(&input, ",")) &&
f440ff44 3437 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
71f98027
AD
3438 ret = kstrtol(timeout_setting, 0, &timeout);
3439 if (ret)
3440 return ret;
3441
3442 if (timeout == 0) {
3443 index++;
3444 continue;
3445 } else if (timeout < 0) {
3446 timeout = MAX_SCHEDULE_TIMEOUT;
127aedf9
CK
3447 dev_warn(adev->dev, "lockup timeout disabled");
3448 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
71f98027
AD
3449 } else {
3450 timeout = msecs_to_jiffies(timeout);
3451 }
3452
3453 switch (index++) {
3454 case 0:
3455 adev->gfx_timeout = timeout;
3456 break;
3457 case 1:
3458 adev->compute_timeout = timeout;
3459 break;
3460 case 2:
3461 adev->sdma_timeout = timeout;
3462 break;
3463 case 3:
3464 adev->video_timeout = timeout;
3465 break;
3466 default:
3467 break;
3468 }
3469 }
3470 /*
3471 * There is only one value specified and
3472 * it should apply to all non-compute jobs.
3473 */
bcccee89 3474 if (index == 1) {
71f98027 3475 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
bcccee89
ED
3476 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3477 adev->compute_timeout = adev->gfx_timeout;
3478 }
71f98027
AD
3479 }
3480
3481 return ret;
3482}
d4535e2c 3483
4a74c38c
PY
3484/**
3485 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU
3486 *
3487 * @adev: amdgpu_device pointer
3488 *
3489 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode
3490 */
3491static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
3492{
3493 struct iommu_domain *domain;
3494
3495 domain = iommu_get_domain_for_dev(adev->dev);
3496 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
3497 adev->ram_is_direct_mapped = true;
3498}
3499
77f3a5cd
ND
3500static const struct attribute *amdgpu_dev_attributes[] = {
3501 &dev_attr_product_name.attr,
3502 &dev_attr_product_number.attr,
3503 &dev_attr_serial_number.attr,
3504 &dev_attr_pcie_replay_count.attr,
3505 NULL
3506};
3507
d38ceaf9
AD
3508/**
3509 * amdgpu_device_init - initialize the driver
3510 *
3511 * @adev: amdgpu_device pointer
d38ceaf9
AD
3512 * @flags: driver flags
3513 *
3514 * Initializes the driver info and hw (all asics).
3515 * Returns 0 for success or an error on failure.
3516 * Called at driver startup.
3517 */
3518int amdgpu_device_init(struct amdgpu_device *adev,
d38ceaf9
AD
3519 uint32_t flags)
3520{
8aba21b7
LT
3521 struct drm_device *ddev = adev_to_drm(adev);
3522 struct pci_dev *pdev = adev->pdev;
d38ceaf9 3523 int r, i;
b98c6299 3524 bool px = false;
95844d20 3525 u32 max_MBps;
d38ceaf9
AD
3526
3527 adev->shutdown = false;
d38ceaf9 3528 adev->flags = flags;
4e66d7d2
YZ
3529
3530 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3531 adev->asic_type = amdgpu_force_asic_type;
3532 else
3533 adev->asic_type = flags & AMD_ASIC_MASK;
3534
d38ceaf9 3535 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
593aa2d2 3536 if (amdgpu_emu_mode == 1)
8bdab6bb 3537 adev->usec_timeout *= 10;
770d13b1 3538 adev->gmc.gart_size = 512 * 1024 * 1024;
d38ceaf9
AD
3539 adev->accel_working = false;
3540 adev->num_rings = 0;
68ce8b24 3541 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub());
d38ceaf9
AD
3542 adev->mman.buffer_funcs = NULL;
3543 adev->mman.buffer_funcs_ring = NULL;
3544 adev->vm_manager.vm_pte_funcs = NULL;
0c88b430 3545 adev->vm_manager.vm_pte_num_scheds = 0;
132f34e4 3546 adev->gmc.gmc_funcs = NULL;
7bd939d0 3547 adev->harvest_ip_mask = 0x0;
f54d1867 3548 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
b8866c26 3549 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
d38ceaf9
AD
3550
3551 adev->smc_rreg = &amdgpu_invalid_rreg;
3552 adev->smc_wreg = &amdgpu_invalid_wreg;
3553 adev->pcie_rreg = &amdgpu_invalid_rreg;
3554 adev->pcie_wreg = &amdgpu_invalid_wreg;
36b9a952
HR
3555 adev->pciep_rreg = &amdgpu_invalid_rreg;
3556 adev->pciep_wreg = &amdgpu_invalid_wreg;
4fa1c6a6
TZ
3557 adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3558 adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
d38ceaf9
AD
3559 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3560 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3561 adev->didt_rreg = &amdgpu_invalid_rreg;
3562 adev->didt_wreg = &amdgpu_invalid_wreg;
ccdbb20a
RZ
3563 adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3564 adev->gc_cac_wreg = &amdgpu_invalid_wreg;
d38ceaf9
AD
3565 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3566 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3567
3e39ab90
AD
3568 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3569 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3570 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
d38ceaf9
AD
3571
3572	 /* mutex initializations are all done here so we
3573	 * can recall functions without running into locking issues */
0e5ca0d1 3574 mutex_init(&adev->firmware.mutex);
d38ceaf9
AD
3575 mutex_init(&adev->pm.mutex);
3576 mutex_init(&adev->gfx.gpu_clock_mutex);
3577 mutex_init(&adev->srbm_mutex);
b8866c26 3578 mutex_init(&adev->gfx.pipe_reserve_mutex);
d23ee13f 3579 mutex_init(&adev->gfx.gfx_off_mutex);
d38ceaf9 3580 mutex_init(&adev->grbm_idx_mutex);
d38ceaf9 3581 mutex_init(&adev->mn_lock);
e23b74aa 3582 mutex_init(&adev->virt.vf_errors.lock);
d38ceaf9 3583 hash_init(adev->mn_hash);
32eaeae0 3584 mutex_init(&adev->psp.mutex);
bd052211 3585 mutex_init(&adev->notifier_lock);
8cda7a4f 3586 mutex_init(&adev->pm.stable_pstate_ctx_lock);
f113cc32 3587 mutex_init(&adev->benchmark_mutex);
d38ceaf9 3588
ab3b9de6 3589 amdgpu_device_init_apu_flags(adev);
9f6a7857 3590
912dfc84
EQ
3591 r = amdgpu_device_check_arguments(adev);
3592 if (r)
3593 return r;
d38ceaf9 3594
d38ceaf9
AD
3595 spin_lock_init(&adev->mmio_idx_lock);
3596 spin_lock_init(&adev->smc_idx_lock);
3597 spin_lock_init(&adev->pcie_idx_lock);
3598 spin_lock_init(&adev->uvd_ctx_idx_lock);
3599 spin_lock_init(&adev->didt_idx_lock);
ccdbb20a 3600 spin_lock_init(&adev->gc_cac_idx_lock);
16abb5d2 3601 spin_lock_init(&adev->se_cac_idx_lock);
d38ceaf9 3602 spin_lock_init(&adev->audio_endpt_idx_lock);
95844d20 3603 spin_lock_init(&adev->mm_stats.lock);
d38ceaf9 3604
0c4e7fa5
CZ
3605 INIT_LIST_HEAD(&adev->shadow_list);
3606 mutex_init(&adev->shadow_list_lock);
3607
655ce9cb 3608 INIT_LIST_HEAD(&adev->reset_list);
3609
6492e1b0 3610 INIT_LIST_HEAD(&adev->ras_list);
3611
beff74bc
AD
3612 INIT_DELAYED_WORK(&adev->delayed_init_work,
3613 amdgpu_device_delayed_init_work_handler);
1e317b99
RZ
3614 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3615 amdgpu_device_delay_enable_gfx_off);
2dc80b00 3616
d4535e2c
AG
3617 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3618
d23ee13f 3619 adev->gfx.gfx_off_req_count = 1;
0ad7347a
AA
3620 adev->gfx.gfx_off_residency = 0;
3621 adev->gfx.gfx_off_entrycount = 0;
b6e79d9a 3622 adev->pm.ac_power = power_supply_is_system_supplied() > 0;
b1ddf548 3623
b265bdbd
EQ
3624 atomic_set(&adev->throttling_logging_enabled, 1);
3625 /*
3626 * If throttling continues, logging will be performed every minute
3627 * to avoid log flooding. "-1" is subtracted since the thermal
3628 * throttling interrupt comes every second. Thus, the total logging
3629	 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3630 * for throttling interrupt) = 60 seconds.
3631 */
3632 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3633 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3634
0fa49558
AX
3635 /* Registers mapping */
3636 /* TODO: block userspace mapping of io register */
da69c161
KW
3637 if (adev->asic_type >= CHIP_BONAIRE) {
3638 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3639 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3640 } else {
3641 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3642 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3643 }
d38ceaf9 3644
6c08e0ef
EQ
3645 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
3646 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
3647
d38ceaf9
AD
3648 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3649 if (adev->rmmio == NULL) {
3650 return -ENOMEM;
3651 }
3652 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3653 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3654
5494d864
AD
3655 amdgpu_device_get_pcie_info(adev);
3656
b239c017
JX
3657 if (amdgpu_mcbp)
3658 DRM_INFO("MCBP is enabled\n");
3659
436afdfa
PY
3660 /*
3661 * Reset domain needs to be present early, before XGMI hive discovered
3662	 * (if any) and initialized to use reset sem and in_gpu reset flag
3663 * early on during init and before calling to RREG32.
3664 */
3665 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
3666 if (!adev->reset_domain)
3667 return -ENOMEM;
3668
3aa0115d
ML
3669 /* detect hw virtualization here */
3670 amdgpu_detect_virtualization(adev);
3671
dffa11b4
ML
3672 r = amdgpu_device_get_job_timeout_settings(adev);
3673 if (r) {
3674 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
4ef87d8f 3675 return r;
a190d1c7
XY
3676 }
3677
d38ceaf9 3678 /* early init functions */
06ec9070 3679 r = amdgpu_device_ip_early_init(adev);
d38ceaf9 3680 if (r)
4ef87d8f 3681 return r;
d38ceaf9 3682
4d33e704
SK
3683 /* Enable TMZ based on IP_VERSION */
3684 amdgpu_gmc_tmz_set(adev);
3685
957b0787 3686 amdgpu_gmc_noretry_set(adev);
4a0165f0
VS
3687 /* Need to get xgmi info early to decide the reset behavior*/
3688 if (adev->gmc.xgmi.supported) {
3689 r = adev->gfxhub.funcs->get_xgmi_info(adev);
3690 if (r)
3691 return r;
3692 }
3693
8e6d0b69 3694 /* enable PCIE atomic ops */
3695 if (amdgpu_sriov_vf(adev))
3696 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
e15c9d06 3697 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
8e6d0b69 3698 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3699 else
3700 adev->have_atomics_support =
3701 !pci_enable_atomic_ops_to_root(adev->pdev,
3702 PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3703 PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3704 if (!adev->have_atomics_support)
3705 dev_info(adev->dev, "PCIE atomic ops is not supported\n");
3706
6585661d
OZ
3707 /* doorbell bar mapping and doorbell index init*/
3708 amdgpu_device_doorbell_init(adev);
3709
9475a943
SL
3710 if (amdgpu_emu_mode == 1) {
3711 /* post the asic on emulation mode */
3712 emu_soc_asic_init(adev);
bfca0289 3713 goto fence_driver_init;
9475a943 3714 }
bfca0289 3715
04442bf7
LL
3716 amdgpu_reset_init(adev);
3717
4e99a44e
ML
3718 /* detect if we are with an SRIOV vbios */
3719 amdgpu_device_detect_sriov_bios(adev);
048765ad 3720
95e8e59e
AD
3721 /* check if we need to reset the asic
3722 * E.g., driver was not cleanly unloaded previously, etc.
3723 */
f14899fd 3724 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
e3c1b071 3725 if (adev->gmc.xgmi.num_physical_nodes) {
3726 dev_info(adev->dev, "Pending hive reset.\n");
3727 adev->gmc.xgmi.pending_reset = true;
3728 /* Only need to init necessary block for SMU to handle the reset */
3729 for (i = 0; i < adev->num_ip_blocks; i++) {
3730 if (!adev->ip_blocks[i].status.valid)
3731 continue;
3732 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3733 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3734 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3735 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
751f43e7 3736 DRM_DEBUG("IP %s disabled for hw_init.\n",
e3c1b071 3737 adev->ip_blocks[i].version->funcs->name);
3738 adev->ip_blocks[i].status.hw = true;
3739 }
3740 }
3741 } else {
3742 r = amdgpu_asic_reset(adev);
3743 if (r) {
3744 dev_err(adev->dev, "asic reset on init failed\n");
3745 goto failed;
3746 }
95e8e59e
AD
3747 }
3748 }
3749
8f66090b 3750 pci_enable_pcie_error_reporting(adev->pdev);
c9a6b82f 3751
d38ceaf9 3752 /* Post card if necessary */
39c640c0 3753 if (amdgpu_device_need_post(adev)) {
d38ceaf9 3754 if (!adev->bios) {
bec86378 3755 dev_err(adev->dev, "no vBIOS found\n");
83ba126a
AD
3756 r = -EINVAL;
3757 goto failed;
d38ceaf9 3758 }
bec86378 3759 DRM_INFO("GPU posting now...\n");
4d2997ab 3760 r = amdgpu_device_asic_init(adev);
4e99a44e
ML
3761 if (r) {
3762 dev_err(adev->dev, "gpu post error!\n");
3763 goto failed;
3764 }
d38ceaf9
AD
3765 }
3766
88b64e95
AD
3767 if (adev->is_atom_fw) {
3768 /* Initialize clocks */
3769 r = amdgpu_atomfirmware_get_clock_info(adev);
3770 if (r) {
3771 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
e23b74aa 3772 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
88b64e95
AD
3773 goto failed;
3774 }
3775 } else {
a5bde2f9
AD
3776 /* Initialize clocks */
3777 r = amdgpu_atombios_get_clock_info(adev);
3778 if (r) {
3779 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
e23b74aa 3780 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
89041940 3781 goto failed;
a5bde2f9
AD
3782 }
3783 /* init i2c buses */
4562236b
HW
3784 if (!amdgpu_device_has_dc_support(adev))
3785 amdgpu_atombios_i2c_init(adev);
2c1a2784 3786 }
d38ceaf9 3787
bfca0289 3788fence_driver_init:
d38ceaf9 3789 /* Fence driver */
067f44c8 3790 r = amdgpu_fence_driver_sw_init(adev);
2c1a2784 3791 if (r) {
067f44c8 3792 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
e23b74aa 3793 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
83ba126a 3794 goto failed;
2c1a2784 3795 }
d38ceaf9
AD
3796
3797 /* init the mode config */
4a580877 3798 drm_mode_config_init(adev_to_drm(adev));
d38ceaf9 3799
06ec9070 3800 r = amdgpu_device_ip_init(adev);
d38ceaf9 3801 if (r) {
8840a387 3802 /* failed in exclusive mode due to timeout */
3803 if (amdgpu_sriov_vf(adev) &&
3804 !amdgpu_sriov_runtime(adev) &&
3805 amdgpu_virt_mmio_blocked(adev) &&
3806 !amdgpu_virt_wait_reset(adev)) {
3807 dev_err(adev->dev, "VF exclusive mode timeout\n");
1daee8b4
PD
3808 /* Don't send request since VF is inactive. */
3809 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3810 adev->virt.ops = NULL;
8840a387 3811 r = -EAGAIN;
970fd197 3812 goto release_ras_con;
8840a387 3813 }
06ec9070 3814 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
e23b74aa 3815 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
970fd197 3816 goto release_ras_con;
d38ceaf9
AD
3817 }
3818
8d35a259
LG
3819 amdgpu_fence_driver_hw_init(adev);
3820
d69b8971
YZ
3821 dev_info(adev->dev,
3822 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
d7f72fe4
YZ
3823 adev->gfx.config.max_shader_engines,
3824 adev->gfx.config.max_sh_per_se,
3825 adev->gfx.config.max_cu_per_sh,
3826 adev->gfx.cu_info.number);
3827
d38ceaf9
AD
3828 adev->accel_working = true;
3829
e59c0205
AX
3830 amdgpu_vm_check_compute_bug(adev);
3831
95844d20
MO
3832 /* Initialize the buffer migration limit. */
3833 if (amdgpu_moverate >= 0)
3834 max_MBps = amdgpu_moverate;
3835 else
3836 max_MBps = 8; /* Allow 8 MB/s. */
3837 /* Get a log2 for easy divisions. */
3838 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
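	/*
	 * Worked example (added for clarity, not part of the original source):
	 * with the default of 8 MB/s, ilog2(8) = 3, so later bandwidth
	 * accounting can shift by log2_max_MBps instead of dividing by the rate.
	 */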
3839
d2f52ac8 3840 r = amdgpu_pm_sysfs_init(adev);
7c868b59
YT
3841 if (r) {
3842 adev->pm_sysfs_en = false;
d2f52ac8 3843 DRM_ERROR("registering pm debugfs failed (%d).\n", r);
7c868b59
YT
3844 } else
3845 adev->pm_sysfs_en = true;
d2f52ac8 3846
5bb23532 3847 r = amdgpu_ucode_sysfs_init(adev);
7c868b59
YT
3848 if (r) {
3849 adev->ucode_sysfs_en = false;
5bb23532 3850 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
7c868b59
YT
3851 } else
3852 adev->ucode_sysfs_en = true;
5bb23532 3853
8424f2cc
LG
3854 r = amdgpu_psp_sysfs_init(adev);
3855 if (r) {
3856 adev->psp_sysfs_en = false;
3857 if (!amdgpu_sriov_vf(adev))
3858 DRM_ERROR("Creating psp sysfs failed\n");
3859 } else
3860 adev->psp_sysfs_en = true;
3861
b0adca4d
EQ
3862 /*
3863 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3864	 * Otherwise the mgpu fan boost feature will be skipped because this
3865	 * gpu instance would not have been counted yet.
3866 */
3867 amdgpu_register_gpu_instance(adev);
3868
d38ceaf9
AD
3869 /* enable clockgating, etc. after ib tests, etc. since some blocks require
3870 * explicit gating rather than handling it automatically.
3871 */
e3c1b071 3872 if (!adev->gmc.xgmi.pending_reset) {
3873 r = amdgpu_device_ip_late_init(adev);
3874 if (r) {
3875 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3876 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
970fd197 3877 goto release_ras_con;
e3c1b071 3878 }
3879 /* must succeed. */
3880 amdgpu_ras_resume(adev);
3881 queue_delayed_work(system_wq, &adev->delayed_init_work,
3882 msecs_to_jiffies(AMDGPU_RESUME_MS));
2c1a2784 3883 }
d38ceaf9 3884
2c738637
ML
3885 if (amdgpu_sriov_vf(adev))
3886 flush_delayed_work(&adev->delayed_init_work);
3887
77f3a5cd 3888 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
5aea5327 3889 if (r)
77f3a5cd 3890 dev_err(adev->dev, "Could not create amdgpu device attr\n");
bd607166 3891
d155bef0
AB
3892 if (IS_ENABLED(CONFIG_PERF_EVENTS))
3893 r = amdgpu_pmu_init(adev);
9c7c85f7
JK
3894 if (r)
3895 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3896
c1dd4aa6
AG
3897 /* Have stored pci confspace at hand for restore in sudden PCI error */
3898 if (amdgpu_device_cache_pci_state(adev->pdev))
3899 pci_restore_state(pdev);
3900
8c3dd61c
KHF
3901 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
3902 /* this will fail for cards that aren't VGA class devices, just
3903 * ignore it */
3904 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
bf44e8ce 3905 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
8c3dd61c
KHF
3906
3907 if (amdgpu_device_supports_px(ddev)) {
3908 px = true;
3909 vga_switcheroo_register_client(adev->pdev,
3910 &amdgpu_switcheroo_ops, px);
3911 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3912 }
3913
e3c1b071 3914 if (adev->gmc.xgmi.pending_reset)
3915 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
3916 msecs_to_jiffies(AMDGPU_RESUME_MS));
3917
4a74c38c
PY
3918 amdgpu_device_check_iommu_direct_map(adev);
3919
d38ceaf9 3920 return 0;
83ba126a 3921
970fd197
SY
3922release_ras_con:
3923 amdgpu_release_ras_context(adev);
3924
83ba126a 3925failed:
89041940 3926 amdgpu_vf_error_trans_all(adev);
8840a387 3927
83ba126a 3928 return r;
d38ceaf9
AD
3929}
3930
07775fc1
AG
3931static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
3932{
62d5f9f7 3933
07775fc1
AG
3934 /* Clear all CPU mappings pointing to this device */
3935 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
3936
3937 /* Unmap all mapped bars - Doorbell, registers and VRAM */
3938 amdgpu_device_doorbell_fini(adev);
3939
3940 iounmap(adev->rmmio);
3941 adev->rmmio = NULL;
3942 if (adev->mman.aper_base_kaddr)
3943 iounmap(adev->mman.aper_base_kaddr);
3944 adev->mman.aper_base_kaddr = NULL;
3945
3946 /* Memory manager related */
3947 if (!adev->gmc.xgmi.connected_to_cpu) {
3948 arch_phys_wc_del(adev->gmc.vram_mtrr);
3949 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
3950 }
3951}
3952
d38ceaf9 3953/**
bbe04dec 3954 * amdgpu_device_fini_hw - tear down the driver
d38ceaf9
AD
3955 *
3956 * @adev: amdgpu_device pointer
3957 *
3958 * Tear down the driver info (all asics).
3959 * Called at driver shutdown.
3960 */
72c8c97b 3961void amdgpu_device_fini_hw(struct amdgpu_device *adev)
d38ceaf9 3962{
aac89168 3963 dev_info(adev->dev, "amdgpu: finishing device.\n");
9f875167 3964 flush_delayed_work(&adev->delayed_init_work);
d0d13fe8 3965 adev->shutdown = true;
9f875167 3966
752c683d
ML
3967 /* make sure IB test finished before entering exclusive mode
3968 * to avoid preemption on IB test
3969 * */
519b8b76 3970 if (amdgpu_sriov_vf(adev)) {
752c683d 3971 amdgpu_virt_request_full_gpu(adev, false);
519b8b76
BZ
3972 amdgpu_virt_fini_data_exchange(adev);
3973 }
752c683d 3974
e5b03032
ML
3975 /* disable all interrupts */
3976 amdgpu_irq_disable_all(adev);
ff97cba8 3977 if (adev->mode_info.mode_config_initialized){
1053b9c9 3978 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
4a580877 3979 drm_helper_force_disable_all(adev_to_drm(adev));
ff97cba8 3980 else
4a580877 3981 drm_atomic_helper_shutdown(adev_to_drm(adev));
ff97cba8 3982 }
8d35a259 3983 amdgpu_fence_driver_hw_fini(adev);
72c8c97b 3984
98f56188
YY
3985 if (adev->mman.initialized) {
3986 flush_delayed_work(&adev->mman.bdev.wq);
3987 ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
3988 }
3989
7c868b59
YT
3990 if (adev->pm_sysfs_en)
3991 amdgpu_pm_sysfs_fini(adev);
72c8c97b
AG
3992 if (adev->ucode_sysfs_en)
3993 amdgpu_ucode_sysfs_fini(adev);
8424f2cc
LG
3994 if (adev->psp_sysfs_en)
3995 amdgpu_psp_sysfs_fini(adev);
72c8c97b
AG
3996 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
3997
232d1d43
SY
3998 /* disable ras feature must before hw fini */
3999 amdgpu_ras_pre_fini(adev);
4000
e9669fb7 4001 amdgpu_device_ip_fini_early(adev);
d10d0daa 4002
a3848df6
YW
4003 amdgpu_irq_fini_hw(adev);
4004
b6fd6e0f
SK
4005 if (adev->mman.initialized)
4006 ttm_device_clear_dma_mappings(&adev->mman.bdev);
894c6890 4007
d10d0daa 4008 amdgpu_gart_dummy_page_fini(adev);
07775fc1 4009
fac53471 4010 amdgpu_device_unmap_mmio(adev);
87172e89 4011
72c8c97b
AG
4012}
4013
4014void amdgpu_device_fini_sw(struct amdgpu_device *adev)
4015{
62d5f9f7
LS
4016 int idx;
4017
8d35a259 4018 amdgpu_fence_driver_sw_fini(adev);
a5c5d8d5 4019 amdgpu_device_ip_fini(adev);
75e1658e
ND
4020 release_firmware(adev->firmware.gpu_info_fw);
4021 adev->firmware.gpu_info_fw = NULL;
d38ceaf9 4022 adev->accel_working = false;
68ce8b24 4023 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
04442bf7
LL
4024
4025 amdgpu_reset_fini(adev);
4026
d38ceaf9 4027 /* free i2c buses */
4562236b
HW
4028 if (!amdgpu_device_has_dc_support(adev))
4029 amdgpu_i2c_fini(adev);
bfca0289
SL
4030
4031 if (amdgpu_emu_mode != 1)
4032 amdgpu_atombios_fini(adev);
4033
d38ceaf9
AD
4034 kfree(adev->bios);
4035 adev->bios = NULL;
b98c6299 4036 if (amdgpu_device_supports_px(adev_to_drm(adev))) {
84c8b22e 4037 vga_switcheroo_unregister_client(adev->pdev);
83ba126a 4038 vga_switcheroo_fini_domain_pm_ops(adev->dev);
b98c6299 4039 }
38d6be81 4040 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
b8779475 4041 vga_client_unregister(adev->pdev);
e9bc1bf7 4042
62d5f9f7
LS
4043 if (drm_dev_enter(adev_to_drm(adev), &idx)) {
4044
4045 iounmap(adev->rmmio);
4046 adev->rmmio = NULL;
4047 amdgpu_device_doorbell_fini(adev);
4048 drm_dev_exit(idx);
4049 }
4050
d155bef0
AB
4051 if (IS_ENABLED(CONFIG_PERF_EVENTS))
4052 amdgpu_pmu_fini(adev);
72de33f8 4053 if (adev->mman.discovery_bin)
a190d1c7 4054 amdgpu_discovery_fini(adev);
72c8c97b 4055
cfbb6b00
AG
4056 amdgpu_reset_put_reset_domain(adev->reset_domain);
4057 adev->reset_domain = NULL;
4058
72c8c97b
AG
4059 kfree(adev->pci_state);
4060
d38ceaf9
AD
4061}
4062
58144d28
ND
4063/**
4064 * amdgpu_device_evict_resources - evict device resources
4065 * @adev: amdgpu device object
4066 *
4067 * Evicts all ttm device resources (vram BOs, gart table) from the lru list
4068 * of the vram memory type. Mainly used for evicting device resources
4069 * at suspend time.
4070 *
4071 */
7863c155 4072static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
58144d28 4073{
7863c155
ML
4074 int ret;
4075
e53d9665
ML
4076 /* No need to evict vram on APUs for suspend to ram or s2idle */
4077 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU))
7863c155 4078 return 0;
58144d28 4079
7863c155
ML
4080 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM);
4081 if (ret)
58144d28 4082 DRM_WARN("evicting device resources failed\n");
7863c155 4083 return ret;
58144d28 4084}
d38ceaf9
AD
4085
4086/*
4087 * Suspend & resume.
4088 */
4089/**
810ddc3a 4090 * amdgpu_device_suspend - initiate device suspend
d38ceaf9 4091 *
87e3f136 4092 * @dev: drm dev pointer
87e3f136 4093 * @fbcon : notify the fbdev of suspend
d38ceaf9
AD
4094 *
4095 * Puts the hw in the suspend state (all asics).
4096 * Returns 0 for success or an error on failure.
4097 * Called at driver suspend.
4098 */
de185019 4099int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
d38ceaf9 4100{
a2e15b0e 4101 struct amdgpu_device *adev = drm_to_adev(dev);
d7274ec7 4102 int r = 0;
d38ceaf9 4103
d38ceaf9
AD
4104 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4105 return 0;
4106
44779b43 4107 adev->in_suspend = true;
3fa8f89d 4108
d7274ec7
BZ
4109 if (amdgpu_sriov_vf(adev)) {
4110 amdgpu_virt_fini_data_exchange(adev);
4111 r = amdgpu_virt_request_full_gpu(adev, false);
4112 if (r)
4113 return r;
4114 }
4115
3fa8f89d
S
4116 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
4117 DRM_WARN("smart shift update failed\n");
4118
d38ceaf9
AD
4119 drm_kms_helper_poll_disable(dev);
4120
5f818173 4121 if (fbcon)
087451f3 4122 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true);
5f818173 4123
beff74bc 4124 cancel_delayed_work_sync(&adev->delayed_init_work);
a5459475 4125
5e6932fe 4126 amdgpu_ras_suspend(adev);
4127
2196927b 4128 amdgpu_device_ip_suspend_phase1(adev);
fe1053b7 4129
c004d44e 4130 if (!adev->in_s0ix)
5d3a2d95 4131 amdgpu_amdkfd_suspend(adev, adev->in_runpm);
94fa5660 4132
7863c155
ML
4133 r = amdgpu_device_evict_resources(adev);
4134 if (r)
4135 return r;
d38ceaf9 4136
8d35a259 4137 amdgpu_fence_driver_hw_fini(adev);
d38ceaf9 4138
2196927b 4139 amdgpu_device_ip_suspend_phase2(adev);
d38ceaf9 4140
d7274ec7
BZ
4141 if (amdgpu_sriov_vf(adev))
4142 amdgpu_virt_release_full_gpu(adev, false);
4143
d38ceaf9
AD
4144 return 0;
4145}
4146
4147/**
810ddc3a 4148 * amdgpu_device_resume - initiate device resume
d38ceaf9 4149 *
87e3f136 4150 * @dev: drm dev pointer
87e3f136 4151 * @fbcon : notify the fbdev of resume
d38ceaf9
AD
4152 *
4153 * Bring the hw back to operating state (all asics).
4154 * Returns 0 for success or an error on failure.
4155 * Called at driver resume.
4156 */
de185019 4157int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
d38ceaf9 4158{
1348969a 4159 struct amdgpu_device *adev = drm_to_adev(dev);
03161a6e 4160 int r = 0;
d38ceaf9 4161
d7274ec7
BZ
4162 if (amdgpu_sriov_vf(adev)) {
4163 r = amdgpu_virt_request_full_gpu(adev, true);
4164 if (r)
4165 return r;
4166 }
4167
d38ceaf9
AD
4168 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4169 return 0;
4170
62498733 4171 if (adev->in_s0ix)
bc143d8b 4172 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry);
628c36d7 4173
d38ceaf9 4174 /* post card */
39c640c0 4175 if (amdgpu_device_need_post(adev)) {
4d2997ab 4176 r = amdgpu_device_asic_init(adev);
74b0b157 4177 if (r)
aac89168 4178 dev_err(adev->dev, "amdgpu asic init failed\n");
74b0b157 4179 }
d38ceaf9 4180
06ec9070 4181 r = amdgpu_device_ip_resume(adev);
d7274ec7
BZ
4182
4183 /* no matter what r is, always need to properly release full GPU */
4184 if (amdgpu_sriov_vf(adev)) {
4185 amdgpu_virt_init_data_exchange(adev);
4186 amdgpu_virt_release_full_gpu(adev, true);
4187 }
4188
e6707218 4189 if (r) {
aac89168 4190 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
4d3b9ae5 4191 return r;
e6707218 4192 }
8d35a259 4193 amdgpu_fence_driver_hw_init(adev);
5ceb54c6 4194
06ec9070 4195 r = amdgpu_device_ip_late_init(adev);
03161a6e 4196 if (r)
4d3b9ae5 4197 return r;
d38ceaf9 4198
beff74bc
AD
4199 queue_delayed_work(system_wq, &adev->delayed_init_work,
4200 msecs_to_jiffies(AMDGPU_RESUME_MS));
4201
c004d44e 4202 if (!adev->in_s0ix) {
5d3a2d95
AD
4203 r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
4204 if (r)
4205 return r;
4206 }
756e6880 4207
96a5d8d4 4208 /* Make sure IB tests flushed */
ec4927d4
VZ
4209 if (amdgpu_sriov_vf(adev))
4210 amdgpu_irq_gpu_reset_resume_helper(adev);
beff74bc 4211 flush_delayed_work(&adev->delayed_init_work);
96a5d8d4 4212
f543d286
PL
4213 if (adev->in_s0ix) {
4214 /* re-enable gfxoff after IP resume. This re-enables gfxoff after
4215 * it was disabled for IP resume in amdgpu_device_ip_resume_phase2().
4216 */
4217 amdgpu_gfx_off_ctrl(adev, true);
4218 DRM_DEBUG("will enable gfxoff for the mission mode\n");
4219 }
a2e15b0e 4220 if (fbcon)
087451f3 4221 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false);
d38ceaf9
AD
4222
4223 drm_kms_helper_poll_enable(dev);
23a1a9e5 4224
5e6932fe 4225 amdgpu_ras_resume(adev);
4226
d09ef243
AD
4227 if (adev->mode_info.num_crtc) {
4228 /*
4229 * Most of the connector probing functions try to acquire runtime pm
4230 * refs to ensure that the GPU is powered on when connector polling is
4231 * performed. Since we're calling this from a runtime PM callback,
4232 * trying to acquire rpm refs will cause us to deadlock.
4233 *
4234 * Since we're guaranteed to be holding the rpm lock, it's safe to
4235 * temporarily disable the rpm helpers so this doesn't deadlock us.
4236 */
23a1a9e5 4237#ifdef CONFIG_PM
d09ef243 4238 dev->dev->power.disable_depth++;
23a1a9e5 4239#endif
d09ef243
AD
4240 if (!adev->dc_enabled)
4241 drm_helper_hpd_irq_event(dev);
4242 else
4243 drm_kms_helper_hotplug_event(dev);
23a1a9e5 4244#ifdef CONFIG_PM
d09ef243 4245 dev->dev->power.disable_depth--;
23a1a9e5 4246#endif
d09ef243 4247 }
44779b43
RZ
4248 adev->in_suspend = false;
4249
3fa8f89d
S
4250 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
4251 DRM_WARN("smart shift update failed\n");
4252
4d3b9ae5 4253 return 0;
d38ceaf9
AD
4254}
4255
e3ecdffa
AD
4256/**
4257 * amdgpu_device_ip_check_soft_reset - did soft reset succeed
4258 *
4259 * @adev: amdgpu_device pointer
4260 *
4261 * The list of all the hardware IPs that make up the asic is walked and
4262 * the check_soft_reset callbacks are run. check_soft_reset determines
4263 * if the asic is still hung or not.
4264 * Returns true if any of the IPs are still in a hung state, false if not.
4265 */
06ec9070 4266static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
63fbf42f
CZ
4267{
4268 int i;
4269 bool asic_hang = false;
4270
f993d628
ML
4271 if (amdgpu_sriov_vf(adev))
4272 return true;
4273
8bc04c29
AD
4274 if (amdgpu_asic_need_full_reset(adev))
4275 return true;
4276
63fbf42f 4277 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 4278 if (!adev->ip_blocks[i].status.valid)
63fbf42f 4279 continue;
a1255107
AD
4280 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
4281 adev->ip_blocks[i].status.hang =
4282 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
4283 if (adev->ip_blocks[i].status.hang) {
aac89168 4284 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
63fbf42f
CZ
4285 asic_hang = true;
4286 }
4287 }
4288 return asic_hang;
4289}
4290
e3ecdffa
AD
4291/**
4292 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
4293 *
4294 * @adev: amdgpu_device pointer
4295 *
4296 * The list of all the hardware IPs that make up the asic is walked and the
4297 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset
4298 * handles any IP specific hardware or software state changes that are
4299 * necessary for a soft reset to succeed.
4300 * Returns 0 on success, negative error code on failure.
4301 */
06ec9070 4302static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
d31a501e
CZ
4303{
4304 int i, r = 0;
4305
4306 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 4307 if (!adev->ip_blocks[i].status.valid)
d31a501e 4308 continue;
a1255107
AD
4309 if (adev->ip_blocks[i].status.hang &&
4310 adev->ip_blocks[i].version->funcs->pre_soft_reset) {
4311 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
d31a501e
CZ
4312 if (r)
4313 return r;
4314 }
4315 }
4316
4317 return 0;
4318}
4319
e3ecdffa
AD
4320/**
4321 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
4322 *
4323 * @adev: amdgpu_device pointer
4324 *
4325 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu
4326 * reset is necessary to recover.
4327 * Returns true if a full asic reset is required, false if not.
4328 */
06ec9070 4329static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
35d782fe 4330{
da146d3b
AD
4331 int i;
4332
8bc04c29
AD
4333 if (amdgpu_asic_need_full_reset(adev))
4334 return true;
4335
da146d3b 4336 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 4337 if (!adev->ip_blocks[i].status.valid)
da146d3b 4338 continue;
a1255107
AD
4339 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
4340 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
4341 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
98512bb8
KW
4342 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
4343 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
a1255107 4344 if (adev->ip_blocks[i].status.hang) {
aac89168 4345 dev_info(adev->dev, "Some block need full reset!\n");
da146d3b
AD
4346 return true;
4347 }
4348 }
35d782fe
CZ
4349 }
4350 return false;
4351}
4352
e3ecdffa
AD
4353/**
4354 * amdgpu_device_ip_soft_reset - do a soft reset
4355 *
4356 * @adev: amdgpu_device pointer
4357 *
4358 * The list of all the hardware IPs that make up the asic is walked and the
4359 * soft_reset callbacks are run if the block is hung. soft_reset handles any
4360 * IP specific hardware or software state changes that are necessary to soft
4361 * reset the IP.
4362 * Returns 0 on success, negative error code on failure.
4363 */
06ec9070 4364static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
35d782fe
CZ
4365{
4366 int i, r = 0;
4367
4368 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 4369 if (!adev->ip_blocks[i].status.valid)
35d782fe 4370 continue;
a1255107
AD
4371 if (adev->ip_blocks[i].status.hang &&
4372 adev->ip_blocks[i].version->funcs->soft_reset) {
4373 r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
35d782fe
CZ
4374 if (r)
4375 return r;
4376 }
4377 }
4378
4379 return 0;
4380}
4381
e3ecdffa
AD
4382/**
4383 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
4384 *
4385 * @adev: amdgpu_device pointer
4386 *
4387 * The list of all the hardware IPs that make up the asic is walked and the
4388 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
4389 * handles any IP specific hardware or software state changes that are
4390 * necessary after the IP has been soft reset.
4391 * Returns 0 on success, negative error code on failure.
4392 */
06ec9070 4393static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
35d782fe
CZ
4394{
4395 int i, r = 0;
4396
4397 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 4398 if (!adev->ip_blocks[i].status.valid)
35d782fe 4399 continue;
a1255107
AD
4400 if (adev->ip_blocks[i].status.hang &&
4401 adev->ip_blocks[i].version->funcs->post_soft_reset)
4402 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
35d782fe
CZ
4403 if (r)
4404 return r;
4405 }
4406
4407 return 0;
4408}
4409
e3ecdffa 4410/**
c33adbc7 4411 * amdgpu_device_recover_vram - Recover some VRAM contents
e3ecdffa
AD
4412 *
4413 * @adev: amdgpu_device pointer
4414 *
4415 * Restores the contents of VRAM buffers from the shadows in GTT. Used to
4416 * restore things like GPUVM page tables after a GPU reset where
4417 * the contents of VRAM might be lost.
403009bf
CK
4418 *
4419 * Returns:
4420 * 0 on success, negative error code on failure.
e3ecdffa 4421 */
c33adbc7 4422static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
c41d1cf6 4423{
c41d1cf6 4424 struct dma_fence *fence = NULL, *next = NULL;
403009bf 4425 struct amdgpu_bo *shadow;
e18aaea7 4426 struct amdgpu_bo_vm *vmbo;
403009bf 4427 long r = 1, tmo;
c41d1cf6
ML
4428
4429 if (amdgpu_sriov_runtime(adev))
b045d3af 4430 tmo = msecs_to_jiffies(8000);
c41d1cf6
ML
4431 else
4432 tmo = msecs_to_jiffies(100);
4433
aac89168 4434 dev_info(adev->dev, "recover vram bo from shadow start\n");
c41d1cf6 4435 mutex_lock(&adev->shadow_list_lock);
e18aaea7
ND
4436 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) {
4437 shadow = &vmbo->bo;
403009bf 4438 /* No need to recover an evicted BO */
d3116756
CK
4439 if (shadow->tbo.resource->mem_type != TTM_PL_TT ||
4440 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET ||
4441 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM)
403009bf
CK
4442 continue;
4443
4444 r = amdgpu_bo_restore_shadow(shadow, &next);
4445 if (r)
4446 break;
4447
c41d1cf6 4448 if (fence) {
1712fb1a 4449 tmo = dma_fence_wait_timeout(fence, false, tmo);
403009bf
CK
4450 dma_fence_put(fence);
4451 fence = next;
1712fb1a 4452 if (tmo == 0) {
4453 r = -ETIMEDOUT;
c41d1cf6 4454 break;
1712fb1a 4455 } else if (tmo < 0) {
4456 r = tmo;
4457 break;
4458 }
403009bf
CK
4459 } else {
4460 fence = next;
c41d1cf6 4461 }
c41d1cf6
ML
4462 }
4463 mutex_unlock(&adev->shadow_list_lock);
4464
403009bf
CK
4465 if (fence)
4466 tmo = dma_fence_wait_timeout(fence, false, tmo);
c41d1cf6
ML
4467 dma_fence_put(fence);
4468
1712fb1a 4469 if (r < 0 || tmo <= 0) {
aac89168 4470 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
403009bf
CK
4471 return -EIO;
4472 }
c41d1cf6 4473
aac89168 4474 dev_info(adev->dev, "recover vram bo from shadow done\n");
403009bf 4475 return 0;
c41d1cf6
ML
4476}
4477
a90ad3c2 4478
e3ecdffa 4479/**
06ec9070 4480 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
5740682e 4481 *
982a820b 4482 * @adev: amdgpu_device pointer
87e3f136 4483 * @from_hypervisor: request from hypervisor
5740682e
ML
4484 *
4485 * Do VF FLR and reinitialize the ASIC.
3f48c681 4486 * Returns 0 if it succeeded, otherwise an error.
e3ecdffa
AD
4487 */
4488static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4489 bool from_hypervisor)
5740682e
ML
4490{
4491 int r;
a5f67c93 4492 struct amdgpu_hive_info *hive = NULL;
7258fa31 4493 int retry_limit = 0;
5740682e 4494
7258fa31 4495retry:
c004d44e 4496 amdgpu_amdkfd_pre_reset(adev);
428890a3 4497
5740682e
ML
4498 if (from_hypervisor)
4499 r = amdgpu_virt_request_full_gpu(adev, true);
4500 else
4501 r = amdgpu_virt_reset_gpu(adev);
4502 if (r)
4503 return r;
a90ad3c2
ML
4504
4505 /* Resume IP prior to SMC */
06ec9070 4506 r = amdgpu_device_ip_reinit_early_sriov(adev);
5740682e
ML
4507 if (r)
4508 goto error;
a90ad3c2 4509
c9ffa427 4510 amdgpu_virt_init_data_exchange(adev);
a90ad3c2 4511
7a3e0bb2
RZ
4512 r = amdgpu_device_fw_loading(adev);
4513 if (r)
4514 return r;
4515
a90ad3c2 4516 /* now we are okay to resume SMC/CP/SDMA */
06ec9070 4517 r = amdgpu_device_ip_reinit_late_sriov(adev);
5740682e
ML
4518 if (r)
4519 goto error;
a90ad3c2 4520
a5f67c93
ZL
4521 hive = amdgpu_get_xgmi_hive(adev);
4522 /* Update PSP FW topology after reset */
4523 if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
4524 r = amdgpu_xgmi_update_topology(hive, adev);
4525
4526 if (hive)
4527 amdgpu_put_xgmi_hive(hive);
4528
4529 if (!r) {
4530 amdgpu_irq_gpu_reset_resume_helper(adev);
4531 r = amdgpu_ib_ring_tests(adev);
9c12f5cd 4532
c004d44e 4533 amdgpu_amdkfd_post_reset(adev);
a5f67c93 4534 }
a90ad3c2 4535
abc34253 4536error:
c41d1cf6 4537 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
e3526257 4538 amdgpu_inc_vram_lost(adev);
c33adbc7 4539 r = amdgpu_device_recover_vram(adev);
a90ad3c2 4540 }
437f3e0b 4541 amdgpu_virt_release_full_gpu(adev, true);
a90ad3c2 4542
7258fa31
SK
4543 if (AMDGPU_RETRY_SRIOV_RESET(r)) {
4544 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) {
4545 retry_limit++;
4546 goto retry;
4547 } else
4548 DRM_ERROR("GPU reset retry is beyond the retry limit\n");
4549 }
4550
a90ad3c2
ML
4551 return r;
4552}
4553
9a1cddd6 4554/**
4555 * amdgpu_device_has_job_running - check if there is any job in mirror list
4556 *
982a820b 4557 * @adev: amdgpu_device pointer
9a1cddd6 4558 *
4559 * check if there is any job in mirror list
4560 */
4561bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4562{
4563 int i;
4564 struct drm_sched_job *job;
4565
4566 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4567 struct amdgpu_ring *ring = adev->rings[i];
4568
4569 if (!ring || !ring->sched.thread)
4570 continue;
4571
4572 spin_lock(&ring->sched.job_list_lock);
6efa4b46
LT
4573 job = list_first_entry_or_null(&ring->sched.pending_list,
4574 struct drm_sched_job, list);
9a1cddd6 4575 spin_unlock(&ring->sched.job_list_lock);
4576 if (job)
4577 return true;
4578 }
4579 return false;
4580}
4581
12938fad
CK
4582/**
4583 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4584 *
982a820b 4585 * @adev: amdgpu_device pointer
12938fad
CK
4586 *
4587 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4588 * a hung GPU.
4589 */
4590bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4591{
12938fad 4592
3ba7b418
AG
4593 if (amdgpu_gpu_recovery == 0)
4594 goto disabled;
4595
d3ef9d57
CG
4596 if (!amdgpu_device_ip_check_soft_reset(adev)) {
4597 dev_info(adev->dev,"Timeout, but no hardware hang detected.\n");
4598 return false;
4599 }
4600
3ba7b418
AG
4601 if (amdgpu_sriov_vf(adev))
4602 return true;
4603
4604 if (amdgpu_gpu_recovery == -1) {
4605 switch (adev->asic_type) {
b3523c45
AD
4606#ifdef CONFIG_DRM_AMDGPU_SI
4607 case CHIP_VERDE:
4608 case CHIP_TAHITI:
4609 case CHIP_PITCAIRN:
4610 case CHIP_OLAND:
4611 case CHIP_HAINAN:
4612#endif
4613#ifdef CONFIG_DRM_AMDGPU_CIK
4614 case CHIP_KAVERI:
4615 case CHIP_KABINI:
4616 case CHIP_MULLINS:
4617#endif
4618 case CHIP_CARRIZO:
4619 case CHIP_STONEY:
4620 case CHIP_CYAN_SKILLFISH:
3ba7b418 4621 goto disabled;
b3523c45
AD
4622 default:
4623 break;
3ba7b418 4624 }
12938fad
CK
4625 }
4626
4627 return true;
3ba7b418
AG
4628
4629disabled:
aac89168 4630 dev_info(adev->dev, "GPU recovery disabled.\n");
3ba7b418 4631 return false;
12938fad
CK
4632}
4633
5c03e584
FX
4634int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
4635{
4636 u32 i;
4637 int ret = 0;
4638
4639 amdgpu_atombios_scratch_regs_engine_hung(adev, true);
4640
4641 dev_info(adev->dev, "GPU mode1 reset\n");
4642
4643 /* disable BM */
4644 pci_clear_master(adev->pdev);
4645
4646 amdgpu_device_cache_pci_state(adev->pdev);
4647
4648 if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
4649 dev_info(adev->dev, "GPU smu mode1 reset\n");
4650 ret = amdgpu_dpm_mode1_reset(adev);
4651 } else {
4652 dev_info(adev->dev, "GPU psp mode1 reset\n");
4653 ret = psp_gpu_reset(adev);
4654 }
4655
4656 if (ret)
4657 dev_err(adev->dev, "GPU mode1 reset failed\n");
4658
4659 amdgpu_device_load_pci_state(adev->pdev);
4660
4661 /* wait for asic to come out of reset */
4662 for (i = 0; i < adev->usec_timeout; i++) {
4663 u32 memsize = adev->nbio.funcs->get_memsize(adev);
4664
4665 if (memsize != 0xffffffff)
4666 break;
4667 udelay(1);
4668 }
4669
4670 amdgpu_atombios_scratch_regs_engine_hung(adev, false);
4671 return ret;
4672}
5c6dd71e 4673
e3c1b071 4674int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
04442bf7 4675 struct amdgpu_reset_context *reset_context)
26bc5340 4676{
5c1e6fa4 4677 int i, r = 0;
04442bf7
LL
4678 struct amdgpu_job *job = NULL;
4679 bool need_full_reset =
4680 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4681
4682 if (reset_context->reset_req_dev == adev)
4683 job = reset_context->job;
71182665 4684
b602ca5f
TZ
4685 if (amdgpu_sriov_vf(adev)) {
4686 /* stop the data exchange thread */
4687 amdgpu_virt_fini_data_exchange(adev);
4688 }
4689
9e225fb9
AG
4690 amdgpu_fence_driver_isr_toggle(adev, true);
4691
71182665 4692 /* block all schedulers and reset given job's ring */
0875dc9e
CZ
4693 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4694 struct amdgpu_ring *ring = adev->rings[i];
4695
51687759 4696 if (!ring || !ring->sched.thread)
0875dc9e 4697 continue;
5740682e 4698
c530b02f
JZ
4699		/* clear job fences from the fence drv to avoid force_completion
4700		 * leaving only NULL and vm flush fences in the fence drv */
5c1e6fa4 4701 amdgpu_fence_driver_clear_job_fences(ring);
c530b02f 4702
2f9d4084
ML
4703 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4704 amdgpu_fence_driver_force_completion(ring);
0875dc9e 4705 }
d38ceaf9 4706
9e225fb9
AG
4707 amdgpu_fence_driver_isr_toggle(adev, false);
4708
ff99849b 4709 if (job && job->vm)
222b5f04
AG
4710 drm_sched_increase_karma(&job->base);
4711
04442bf7 4712 r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
404b277b
LL
4713 /* If reset handler not implemented, continue; otherwise return */
4714 if (r == -ENOSYS)
4715 r = 0;
4716 else
04442bf7
LL
4717 return r;
4718
1d721ed6 4719 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
26bc5340
AG
4720 if (!amdgpu_sriov_vf(adev)) {
4721
4722 if (!need_full_reset)
4723 need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4724
d3ef9d57 4725 if (!need_full_reset && amdgpu_gpu_recovery) {
26bc5340
AG
4726 amdgpu_device_ip_pre_soft_reset(adev);
4727 r = amdgpu_device_ip_soft_reset(adev);
4728 amdgpu_device_ip_post_soft_reset(adev);
4729 if (r || amdgpu_device_ip_check_soft_reset(adev)) {
aac89168 4730 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
26bc5340
AG
4731 need_full_reset = true;
4732 }
4733 }
4734
4735 if (need_full_reset)
4736 r = amdgpu_device_ip_suspend(adev);
04442bf7
LL
4737 if (need_full_reset)
4738 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4739 else
4740 clear_bit(AMDGPU_NEED_FULL_RESET,
4741 &reset_context->flags);
26bc5340
AG
4742 }
4743
4744 return r;
4745}
4746
15fd09a0
SA
4747static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
4748{
15fd09a0
SA
4749 int i;
4750
38a15ad9 4751 lockdep_assert_held(&adev->reset_domain->sem);
15fd09a0
SA
4752
4753 for (i = 0; i < adev->num_regs; i++) {
651d7ee6
SA
4754 adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]);
4755 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i],
4756 adev->reset_dump_reg_value[i]);
15fd09a0
SA
4757 }
4758
4759 return 0;
4760}
4761
3d8785f6
SA
4762#ifdef CONFIG_DEV_COREDUMP
4763static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
4764 size_t count, void *data, size_t datalen)
4765{
4766 struct drm_printer p;
4767 struct amdgpu_device *adev = data;
4768 struct drm_print_iterator iter;
4769 int i;
4770
4771 iter.data = buffer;
4772 iter.offset = 0;
4773 iter.start = offset;
4774 iter.remain = count;
4775
4776 p = drm_coredump_printer(&iter);
4777
4778 drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
4779 drm_printf(&p, "kernel: " UTS_RELEASE "\n");
4780 drm_printf(&p, "module: " KBUILD_MODNAME "\n");
4781 drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec);
4782 if (adev->reset_task_info.pid)
4783 drm_printf(&p, "process_name: %s PID: %d\n",
4784 adev->reset_task_info.process_name,
4785 adev->reset_task_info.pid);
4786
4787 if (adev->reset_vram_lost)
4788 drm_printf(&p, "VRAM is lost due to GPU reset!\n");
4789 if (adev->num_regs) {
4790 drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n");
4791
4792 for (i = 0; i < adev->num_regs; i++)
4793 drm_printf(&p, "0x%08x: 0x%08x\n",
4794 adev->reset_dump_reg_list[i],
4795 adev->reset_dump_reg_value[i]);
4796 }
4797
4798 return count - iter.remain;
4799}
4800
4801static void amdgpu_devcoredump_free(void *data)
4802{
4803}
4804
4805static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev)
4806{
4807 struct drm_device *dev = adev_to_drm(adev);
4808
4809 ktime_get_ts64(&adev->reset_time);
4810 dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL,
4811 amdgpu_devcoredump_read, amdgpu_devcoredump_free);
4812}
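/*
 * Note (added for clarity, not part of the original source): the dump
 * registered by dev_coredumpm() above is normally exposed by the
 * devcoredump framework under /sys/class/devcoredump/devcd<N>/data,
 * where it can be read out and then released by writing to that node.
 */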
4813#endif
4814
04442bf7
LL
4815int amdgpu_do_asic_reset(struct list_head *device_list_handle,
4816 struct amdgpu_reset_context *reset_context)
26bc5340
AG
4817{
4818 struct amdgpu_device *tmp_adev = NULL;
04442bf7 4819 bool need_full_reset, skip_hw_reset, vram_lost = false;
26bc5340 4820 int r = 0;
f5c7e779 4821 bool gpu_reset_for_dev_remove = 0;
26bc5340 4822
04442bf7
LL
4823 /* Try reset handler method first */
4824 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
4825 reset_list);
15fd09a0 4826 amdgpu_reset_reg_dumps(tmp_adev);
0a83bb35
LL
4827
4828 reset_context->reset_device_list = device_list_handle;
04442bf7 4829 r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
404b277b
LL
4830 /* If reset handler not implemented, continue; otherwise return */
4831 if (r == -ENOSYS)
4832 r = 0;
4833 else
04442bf7
LL
4834 return r;
4835
4836 /* Reset handler not implemented, use the default method */
4837 need_full_reset =
4838 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4839 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
4840
f5c7e779
YC
4841 gpu_reset_for_dev_remove =
4842 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
4843 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4844
26bc5340 4845 /*
655ce9cb 4846 * ASIC reset has to be done on all XGMI hive nodes ASAP
26bc5340
AG
4847 * to allow proper links negotiation in FW (within 1 sec)
4848 */
7ac71382 4849 if (!skip_hw_reset && need_full_reset) {
655ce9cb 4850 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
041a62bc 4851 /* For XGMI run all resets in parallel to speed up the process */
d4535e2c 4852 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
e3c1b071 4853 tmp_adev->gmc.xgmi.pending_reset = false;
c96cf282 4854 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
d4535e2c
AG
4855 r = -EALREADY;
4856 } else
4857 r = amdgpu_asic_reset(tmp_adev);
d4535e2c 4858
041a62bc 4859 if (r) {
aac89168 4860 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
4a580877 4861 r, adev_to_drm(tmp_adev)->unique);
041a62bc 4862 break;
ce316fa5
LM
4863 }
4864 }
4865
041a62bc
AG
4866 /* For XGMI wait for all resets to complete before proceed */
4867 if (!r) {
655ce9cb 4868 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
ce316fa5
LM
4869 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4870 flush_work(&tmp_adev->xgmi_reset_work);
4871 r = tmp_adev->asic_reset_res;
4872 if (r)
4873 break;
ce316fa5
LM
4874 }
4875 }
4876 }
ce316fa5 4877 }
26bc5340 4878
43c4d576 4879 if (!r && amdgpu_ras_intr_triggered()) {
655ce9cb 4880 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5e67bba3 4881 if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops &&
4882 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
4883 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev);
43c4d576
JC
4884 }
4885
00eaa571 4886 amdgpu_ras_intr_cleared();
43c4d576 4887 }
00eaa571 4888
f5c7e779
YC
4889 /* Since the mode1 reset affects base ip blocks, the
4890 * phase1 ip blocks need to be resumed. Otherwise there
4891 * will be a BIOS signature error and the psp bootloader
4892 * can't load kdb on the next amdgpu install.
4893 */
4894 if (gpu_reset_for_dev_remove) {
4895 list_for_each_entry(tmp_adev, device_list_handle, reset_list)
4896 amdgpu_device_ip_resume_phase1(tmp_adev);
4897
4898 goto end;
4899 }
4900
655ce9cb 4901 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
26bc5340
AG
4902 if (need_full_reset) {
4903 /* post card */
e3c1b071 4904 r = amdgpu_device_asic_init(tmp_adev);
4905 if (r) {
aac89168 4906 dev_warn(tmp_adev->dev, "asic atom init failed!");
e3c1b071 4907 } else {
26bc5340 4908 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
9cec53c1
JZ
4909 r = amdgpu_amdkfd_resume_iommu(tmp_adev);
4910 if (r)
4911 goto out;
4912
26bc5340
AG
4913 r = amdgpu_device_ip_resume_phase1(tmp_adev);
4914 if (r)
4915 goto out;
4916
4917 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
3d8785f6
SA
4918#ifdef CONFIG_DEV_COREDUMP
4919 tmp_adev->reset_vram_lost = vram_lost;
4920 memset(&tmp_adev->reset_task_info, 0,
4921 sizeof(tmp_adev->reset_task_info));
4922 if (reset_context->job && reset_context->job->vm)
4923 tmp_adev->reset_task_info =
4924 reset_context->job->vm->task_info;
4925 amdgpu_reset_capture_coredumpm(tmp_adev);
4926#endif
26bc5340 4927 if (vram_lost) {
77e7f829 4928 DRM_INFO("VRAM is lost due to GPU reset!\n");
e3526257 4929 amdgpu_inc_vram_lost(tmp_adev);
26bc5340
AG
4930 }
4931
26bc5340
AG
4932 r = amdgpu_device_fw_loading(tmp_adev);
4933 if (r)
4934 return r;
4935
4936 r = amdgpu_device_ip_resume_phase2(tmp_adev);
4937 if (r)
4938 goto out;
4939
4940 if (vram_lost)
4941 amdgpu_device_fill_reset_magic(tmp_adev);
4942
fdafb359
EQ
4943 /*
4944 * Add this ASIC as tracked as reset was already
4945 * complete successfully.
4946 */
4947 amdgpu_register_gpu_instance(tmp_adev);
4948
04442bf7
LL
4949 if (!reset_context->hive &&
4950 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
e3c1b071 4951 amdgpu_xgmi_add_device(tmp_adev);
4952
7c04ca50 4953 r = amdgpu_device_ip_late_init(tmp_adev);
4954 if (r)
4955 goto out;
4956
087451f3 4957 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false);
565d1941 4958
e8fbaf03
GC
4959 /*
4960			 * The GPU enters a bad state once the number of faulty pages
4961			 * detected by ECC reaches the threshold, and RAS
4962			 * recovery is scheduled next. So add one check
4963			 * here to break recovery if it indeed exceeds the
4964			 * bad page threshold, and remind the user to
4965			 * retire this GPU or set a bigger
4966			 * bad_page_threshold value to fix this once the
4967			 * driver is probed again.
4968 */
11003c68 4969 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
e8fbaf03
GC
4970 /* must succeed. */
4971 amdgpu_ras_resume(tmp_adev);
4972 } else {
4973 r = -EINVAL;
4974 goto out;
4975 }
e79a04d5 4976
26bc5340 4977 /* Update PSP FW topology after reset */
04442bf7
LL
4978 if (reset_context->hive &&
4979 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4980 r = amdgpu_xgmi_update_topology(
4981 reset_context->hive, tmp_adev);
26bc5340
AG
4982 }
4983 }
4984
26bc5340
AG
4985out:
4986 if (!r) {
4987 amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4988 r = amdgpu_ib_ring_tests(tmp_adev);
4989 if (r) {
4990 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
26bc5340
AG
4991 need_full_reset = true;
4992 r = -EAGAIN;
4993 goto end;
4994 }
4995 }
4996
4997 if (!r)
4998 r = amdgpu_device_recover_vram(tmp_adev);
4999 else
5000 tmp_adev->asic_reset_res = r;
5001 }
5002
5003end:
04442bf7
LL
5004 if (need_full_reset)
5005 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5006 else
5007 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
26bc5340
AG
5008 return r;
5009}
5010
e923be99 5011static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
26bc5340 5012{
5740682e 5013
a3a09142
AD
5014 switch (amdgpu_asic_reset_method(adev)) {
5015 case AMD_RESET_METHOD_MODE1:
5016 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
5017 break;
5018 case AMD_RESET_METHOD_MODE2:
5019 adev->mp1_state = PP_MP1_STATE_RESET;
5020 break;
5021 default:
5022 adev->mp1_state = PP_MP1_STATE_NONE;
5023 break;
5024 }
26bc5340 5025}
d38ceaf9 5026
e923be99 5027static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
26bc5340 5028{
89041940 5029 amdgpu_vf_error_trans_all(adev);
a3a09142 5030 adev->mp1_state = PP_MP1_STATE_NONE;
91fb309d
HC
5031}
5032
3f12acc8
EQ
5033static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
5034{
5035 struct pci_dev *p = NULL;
5036
5037 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5038 adev->pdev->bus->number, 1);
5039 if (p) {
5040 pm_runtime_enable(&(p->dev));
5041 pm_runtime_resume(&(p->dev));
5042 }
5043}
5044
5045static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
5046{
5047 enum amd_reset_method reset_method;
5048 struct pci_dev *p = NULL;
5049 u64 expires;
5050
5051 /*
 5052	 * For now, only BACO and mode1 reset are confirmed
 5053	 * to suffer the audio issue when not properly suspended.
5054 */
5055 reset_method = amdgpu_asic_reset_method(adev);
5056 if ((reset_method != AMD_RESET_METHOD_BACO) &&
5057 (reset_method != AMD_RESET_METHOD_MODE1))
5058 return -EINVAL;
5059
5060 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5061 adev->pdev->bus->number, 1);
5062 if (!p)
5063 return -ENODEV;
5064
5065 expires = pm_runtime_autosuspend_expiration(&(p->dev));
5066 if (!expires)
5067 /*
 5068		 * If we cannot get the audio device autosuspend delay,
 5069		 * a fixed 4s interval is used. Since 3s is the audio
 5070		 * controller's default autosuspend delay setting, the
 5071		 * 4s used here is guaranteed to cover it.
5072 */
54b7feb9 5073 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
3f12acc8
EQ
5074
5075 while (!pm_runtime_status_suspended(&(p->dev))) {
5076 if (!pm_runtime_suspend(&(p->dev)))
5077 break;
5078
5079 if (expires < ktime_get_mono_fast_ns()) {
5080 dev_warn(adev->dev, "failed to suspend display audio\n");
5081 /* TODO: abort the succeeding gpu reset? */
5082 return -ETIMEDOUT;
5083 }
5084 }
5085
5086 pm_runtime_disable(&(p->dev));
5087
5088 return 0;
5089}
5090
9d8d96be 5091static void amdgpu_device_recheck_guilty_jobs(
04442bf7
LL
5092 struct amdgpu_device *adev, struct list_head *device_list_handle,
5093 struct amdgpu_reset_context *reset_context)
e6c6338f
JZ
5094{
5095 int i, r = 0;
5096
5097 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5098 struct amdgpu_ring *ring = adev->rings[i];
5099 int ret = 0;
5100 struct drm_sched_job *s_job;
5101
5102 if (!ring || !ring->sched.thread)
5103 continue;
5104
5105 s_job = list_first_entry_or_null(&ring->sched.pending_list,
5106 struct drm_sched_job, list);
5107 if (s_job == NULL)
5108 continue;
5109
 5110		/* clear the job's guilty flag and rely on the following step to decide the real one */
5111 drm_sched_reset_karma(s_job);
5112 drm_sched_resubmit_jobs_ext(&ring->sched, 1);
5113
9ae55f03
AG
5114 if (!s_job->s_fence->parent) {
5115 DRM_WARN("Failed to get a HW fence for job!");
5116 continue;
5117 }
5118
e6c6338f
JZ
5119 ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout);
5120 if (ret == 0) { /* timeout */
5121 DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n",
5122 ring->sched.name, s_job->id);
5123
9ae55f03
AG
5124
5125 amdgpu_fence_driver_isr_toggle(adev, true);
5126
5127 /* Clear this failed job from fence array */
5128 amdgpu_fence_driver_clear_job_fences(ring);
5129
5130 amdgpu_fence_driver_isr_toggle(adev, false);
5131
5132 /* Since the job won't signal and we go for
5133 * another resubmit drop this parent pointer
5134 */
5135 dma_fence_put(s_job->s_fence->parent);
5136 s_job->s_fence->parent = NULL;
5137
e6c6338f
JZ
5138 /* set guilty */
5139 drm_sched_increase_karma(s_job);
72fadb13 5140 amdgpu_reset_prepare_hwcontext(adev, reset_context);
e6c6338f
JZ
5141retry:
5142 /* do hw reset */
5143 if (amdgpu_sriov_vf(adev)) {
5144 amdgpu_virt_fini_data_exchange(adev);
5145 r = amdgpu_device_reset_sriov(adev, false);
5146 if (r)
5147 adev->asic_reset_res = r;
5148 } else {
04442bf7
LL
5149 clear_bit(AMDGPU_SKIP_HW_RESET,
5150 &reset_context->flags);
5151 r = amdgpu_do_asic_reset(device_list_handle,
5152 reset_context);
e6c6338f
JZ
5153 if (r && r == -EAGAIN)
5154 goto retry;
5155 }
5156
5157 /*
 5158			 * bump the reset counter so that the following
 5159			 * resubmitted job can flush its VMID
5160 */
5161 atomic_inc(&adev->gpu_reset_counter);
5162 continue;
5163 }
5164
5165 /* got the hw fence, signal finished fence */
5166 atomic_dec(ring->sched.score);
5167 dma_fence_get(&s_job->s_fence->finished);
5168 dma_fence_signal(&s_job->s_fence->finished);
5169 dma_fence_put(&s_job->s_fence->finished);
5170
5171 /* remove node from list and free the job */
5172 spin_lock(&ring->sched.job_list_lock);
5173 list_del_init(&s_job->list);
5174 spin_unlock(&ring->sched.job_list_lock);
5175 ring->sched.ops->free_job(s_job);
5176 }
5177}
5178
d193b12b 5179static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
247c7b0d
AG
5180{
5181 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
5182
5183#if defined(CONFIG_DEBUG_FS)
5184 if (!amdgpu_sriov_vf(adev))
5185 cancel_work(&adev->reset_work);
5186#endif
5187
5188 if (adev->kfd.dev)
5189 cancel_work(&adev->kfd.reset_work);
5190
5191 if (amdgpu_sriov_vf(adev))
5192 cancel_work(&adev->virt.flr_work);
5193
5194 if (con && adev->ras_enabled)
5195 cancel_work(&con->recovery_work);
5196
5197}
5198
5199
26bc5340 5200/**
6e9c65f7 5201 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
26bc5340 5202 *
982a820b 5203 * @adev: amdgpu_device pointer
26bc5340
AG
 5204 * @job: the job which triggered the hang
 * @reset_context: the reset context describing how to perform the recovery
 5205 *
 5206 * Attempt to reset the GPU if it has hung (all ASICs).
 5207 * Attempt a soft reset or full reset and reinitialize the ASIC.
 5208 * Returns 0 for success or an error on failure.
5209 */
5210
cf727044 5211int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
f1549c09
LG
5212 struct amdgpu_job *job,
5213 struct amdgpu_reset_context *reset_context)
26bc5340 5214{
1d721ed6 5215 struct list_head device_list, *device_list_handle = NULL;
7dd8c205 5216 bool job_signaled = false;
26bc5340 5217 struct amdgpu_hive_info *hive = NULL;
26bc5340 5218 struct amdgpu_device *tmp_adev = NULL;
1d721ed6 5219 int i, r = 0;
bb5c7235 5220 bool need_emergency_restart = false;
3f12acc8 5221 bool audio_suspended = false;
e6c6338f 5222 int tmp_vram_lost_counter;
f5c7e779
YC
5223 bool gpu_reset_for_dev_remove = false;
5224
5225 gpu_reset_for_dev_remove =
5226 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
5227 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
26bc5340 5228
6e3cd2a9 5229 /*
bb5c7235
WS
5230 * Special case: RAS triggered and full reset isn't supported
5231 */
5232 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
5233
d5ea093e
AG
5234 /*
5235 * Flush RAM to disk so that after reboot
 5236	 * the user can read the log and see why the system rebooted.
5237 */
bb5c7235 5238 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
d5ea093e
AG
5239 DRM_WARN("Emergency reboot.");
5240
5241 ksys_sync_helper();
5242 emergency_restart();
5243 }
5244
b823821f 5245 dev_info(adev->dev, "GPU %s begin!\n",
bb5c7235 5246 need_emergency_restart ? "jobs stop":"reset");
26bc5340 5247
175ac6ec
ZL
5248 if (!amdgpu_sriov_vf(adev))
5249 hive = amdgpu_get_xgmi_hive(adev);
681260df 5250 if (hive)
53b3f8f4 5251 mutex_lock(&hive->hive_lock);
26bc5340 5252
f1549c09
LG
5253 reset_context->job = job;
5254 reset_context->hive = hive;
9e94d22c
EQ
5255 /*
5256 * Build list of devices to reset.
5257 * In case we are in XGMI hive mode, resort the device list
5258 * to put adev in the 1st position.
5259 */
5260 INIT_LIST_HEAD(&device_list);
175ac6ec 5261 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
83d29a5f 5262 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
655ce9cb 5263 list_add_tail(&tmp_adev->reset_list, &device_list);
83d29a5f
YC
5264 if (gpu_reset_for_dev_remove && adev->shutdown)
5265 tmp_adev->shutdown = true;
5266 }
655ce9cb 5267 if (!list_is_first(&adev->reset_list, &device_list))
5268 list_rotate_to_front(&adev->reset_list, &device_list);
5269 device_list_handle = &device_list;
26bc5340 5270 } else {
655ce9cb 5271 list_add_tail(&adev->reset_list, &device_list);
26bc5340
AG
5272 device_list_handle = &device_list;
5273 }
5274
e923be99
AG
5275 /* We need to lock reset domain only once both for XGMI and single device */
5276 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5277 reset_list);
3675c2f2 5278 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
e923be99 5279
1d721ed6 5280 /* block all schedulers and reset given job's ring */
655ce9cb 5281 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
f287a3c5 5282
e923be99 5283 amdgpu_device_set_mp1_state(tmp_adev);
f287a3c5 5284
3f12acc8
EQ
5285 /*
 5286		 * Try to put the audio codec into suspend state
 5287		 * before the gpu reset starts.
 5288		 *
 5289		 * The power domain of the graphics device is shared
 5290		 * with the AZ power domain. Without this step we
 5291		 * may change the audio hardware from behind the
 5292		 * audio driver's back, which will trigger some
 5293		 * audio codec errors.
5294 */
5295 if (!amdgpu_device_suspend_display_audio(tmp_adev))
5296 audio_suspended = true;
5297
9e94d22c
EQ
5298 amdgpu_ras_set_error_query_ready(tmp_adev, false);
5299
52fb44cf
EQ
5300 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
5301
c004d44e 5302 if (!amdgpu_sriov_vf(tmp_adev))
428890a3 5303 amdgpu_amdkfd_pre_reset(tmp_adev);
9e94d22c 5304
12ffa55d
AG
5305 /*
 5306		 * Mark these ASICs to be reset as untracked first,
 5307		 * and add them back after the reset completes
5308 */
5309 amdgpu_unregister_gpu_instance(tmp_adev);
5310
163d4cd2 5311 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true);
565d1941 5312
f1c1314b 5313 /* disable ras on ALL IPs */
bb5c7235 5314 if (!need_emergency_restart &&
b823821f 5315 amdgpu_device_ip_need_full_reset(tmp_adev))
f1c1314b 5316 amdgpu_ras_suspend(tmp_adev);
5317
1d721ed6
AG
5318 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5319 struct amdgpu_ring *ring = tmp_adev->rings[i];
5320
5321 if (!ring || !ring->sched.thread)
5322 continue;
5323
0b2d2c2e 5324 drm_sched_stop(&ring->sched, job ? &job->base : NULL);
7c6e68c7 5325
bb5c7235 5326 if (need_emergency_restart)
7c6e68c7 5327 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
1d721ed6 5328 }
8f8c80f4 5329 atomic_inc(&tmp_adev->gpu_reset_counter);
1d721ed6
AG
5330 }
5331
bb5c7235 5332 if (need_emergency_restart)
7c6e68c7
AG
5333 goto skip_sched_resume;
5334
1d721ed6
AG
5335 /*
5336 * Must check guilty signal here since after this point all old
5337 * HW fences are force signaled.
5338 *
5339 * job->base holds a reference to parent fence
5340 */
f6a3f660 5341 if (job && dma_fence_is_signaled(&job->hw_fence)) {
1d721ed6 5342 job_signaled = true;
1d721ed6
AG
5343 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
5344 goto skip_hw_reset;
5345 }
5346
26bc5340 5347retry: /* Rest of adevs pre asic reset from XGMI hive. */
655ce9cb 5348 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
f5c7e779
YC
5349 if (gpu_reset_for_dev_remove) {
 5350			/* Workaround for ASICs that need to disable SMC first */
5351 amdgpu_device_smu_fini_early(tmp_adev);
5352 }
f1549c09 5353 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
26bc5340
AG
 5354		/* TODO: Should we stop? */
5355 if (r) {
aac89168 5356 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
4a580877 5357 r, adev_to_drm(tmp_adev)->unique);
26bc5340
AG
5358 tmp_adev->asic_reset_res = r;
5359 }
247c7b0d
AG
5360
5361 /*
 5362		 * Drop all pending non-scheduler resets. Scheduler resets
 5363		 * were already dropped during drm_sched_stop.
5364 */
d193b12b 5365 amdgpu_device_stop_pending_resets(tmp_adev);
26bc5340
AG
5366 }
5367
e6c6338f 5368 tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
26bc5340 5369 /* Actual ASIC resets if needed.*/
4f30d920 5370 /* Host driver will handle XGMI hive reset for SRIOV */
26bc5340
AG
5371 if (amdgpu_sriov_vf(adev)) {
5372 r = amdgpu_device_reset_sriov(adev, job ? false : true);
5373 if (r)
5374 adev->asic_reset_res = r;
950d6425
SY
5375
5376 /* Aldebaran supports ras in SRIOV, so need resume ras during reset */
5377 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2))
5378 amdgpu_ras_resume(adev);
26bc5340 5379 } else {
f1549c09 5380 r = amdgpu_do_asic_reset(device_list_handle, reset_context);
b98a1648 5381 if (r && r == -EAGAIN)
26bc5340 5382 goto retry;
f5c7e779
YC
5383
5384 if (!r && gpu_reset_for_dev_remove)
5385 goto recover_end;
26bc5340
AG
5386 }
5387
1d721ed6
AG
5388skip_hw_reset:
5389
26bc5340 5390 /* Post ASIC reset for all devs .*/
655ce9cb 5391 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
7c6e68c7 5392
e6c6338f
JZ
5393 /*
 5394		 * Sometimes a later bad compute job can block a good gfx job, since the
 5395		 * gfx and compute rings share internal GC hardware. We add an additional
 5396		 * guilty-job recheck step to find the real guilty job: it synchronously
 5397		 * resubmits and waits for the first job to signal. If that wait times
 5398		 * out, the job is identified as the real guilty one.
5399 */
5400 if (amdgpu_gpu_recovery == 2 &&
5401 !(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter)))
04442bf7 5402 amdgpu_device_recheck_guilty_jobs(
f1549c09 5403 tmp_adev, device_list_handle, reset_context);
e6c6338f 5404
1d721ed6
AG
5405 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5406 struct amdgpu_ring *ring = tmp_adev->rings[i];
5407
5408 if (!ring || !ring->sched.thread)
5409 continue;
5410
 5411			/* No point in resubmitting jobs if we didn't do a HW reset */
5412 if (!tmp_adev->asic_reset_res && !job_signaled)
5413 drm_sched_resubmit_jobs(&ring->sched);
5414
5415 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
5416 }
5417
693073a0 5418 if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3))
ed67f729
JX
5419 amdgpu_mes_self_test(tmp_adev);
5420
1053b9c9 5421 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) {
4a580877 5422 drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
1d721ed6
AG
5423 }
5424
7258fa31
SK
5425 if (tmp_adev->asic_reset_res)
5426 r = tmp_adev->asic_reset_res;
5427
1d721ed6 5428 tmp_adev->asic_reset_res = 0;
26bc5340
AG
5429
5430 if (r) {
5431 /* bad news, how to tell it to userspace ? */
12ffa55d 5432 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
26bc5340
AG
5433 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
5434 } else {
12ffa55d 5435 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
3fa8f89d
S
5436 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
5437 DRM_WARN("smart shift update failed\n");
26bc5340 5438 }
7c6e68c7 5439 }
26bc5340 5440
7c6e68c7 5441skip_sched_resume:
655ce9cb 5442 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
428890a3 5443 /* unlock kfd: SRIOV would do it separately */
c004d44e 5444 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
428890a3 5445 amdgpu_amdkfd_post_reset(tmp_adev);
8e2712e7 5446
 5447		/* kfd_post_reset will do nothing if the kfd device is not initialized,
 5448		 * so bring up kfd here if it was not initialized before
5449 */
5450 if (!adev->kfd.init_complete)
5451 amdgpu_amdkfd_device_init(adev);
5452
3f12acc8
EQ
5453 if (audio_suspended)
5454 amdgpu_device_resume_display_audio(tmp_adev);
e923be99
AG
5455
5456 amdgpu_device_unset_mp1_state(tmp_adev);
26bc5340
AG
5457 }
5458
f5c7e779 5459recover_end:
e923be99
AG
5460 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5461 reset_list);
5462 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
5463
9e94d22c 5464 if (hive) {
9e94d22c 5465 mutex_unlock(&hive->hive_lock);
d95e8e97 5466 amdgpu_put_xgmi_hive(hive);
9e94d22c 5467 }
26bc5340 5468
f287a3c5 5469 if (r)
26bc5340 5470 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
ab9a0b1f
AG
5471
5472 atomic_set(&adev->reset_domain->reset_res, r);
d38ceaf9
AD
5473 return r;
5474}
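/*
 * Illustrative sketch (not part of the original file): how a caller such
 * as a job timeout handler might drive amdgpu_device_gpu_recover(). The
 * reset_context fields used here (method, reset_req_dev, flags) mirror
 * the usage visible elsewhere in this file; the wrapper function itself
 * is a hypothetical example, not the driver's actual timeout path.
 */
static void example_trigger_gpu_recover(struct amdgpu_device *adev,
					struct amdgpu_job *job)
{
	struct amdgpu_reset_context reset_context;

	memset(&reset_context, 0, sizeof(reset_context));
	reset_context.method = AMD_RESET_METHOD_NONE;	/* let the driver pick */
	reset_context.reset_req_dev = adev;
	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);

	amdgpu_device_gpu_recover(adev, job, &reset_context);
}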
5475
e3ecdffa
AD
5476/**
 5477 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
5478 *
5479 * @adev: amdgpu_device pointer
5480 *
 5481 * Fetches and stores in the driver the PCIE capabilities (gen speed
5482 * and lanes) of the slot the device is in. Handles APUs and
5483 * virtualized environments where PCIE config space may not be available.
5484 */
5494d864 5485static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
d0dd7f0c 5486{
5d9a6330 5487 struct pci_dev *pdev;
c5313457
HK
5488 enum pci_bus_speed speed_cap, platform_speed_cap;
5489 enum pcie_link_width platform_link_width;
d0dd7f0c 5490
cd474ba0
AD
5491 if (amdgpu_pcie_gen_cap)
5492 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
d0dd7f0c 5493
cd474ba0
AD
5494 if (amdgpu_pcie_lane_cap)
5495 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
d0dd7f0c 5496
cd474ba0
AD
5497 /* covers APUs as well */
5498 if (pci_is_root_bus(adev->pdev->bus)) {
5499 if (adev->pm.pcie_gen_mask == 0)
5500 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
5501 if (adev->pm.pcie_mlw_mask == 0)
5502 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
d0dd7f0c 5503 return;
cd474ba0 5504 }
d0dd7f0c 5505
c5313457
HK
5506 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
5507 return;
5508
dbaa922b
AD
5509 pcie_bandwidth_available(adev->pdev, NULL,
5510 &platform_speed_cap, &platform_link_width);
c5313457 5511
cd474ba0 5512 if (adev->pm.pcie_gen_mask == 0) {
5d9a6330
AD
5513 /* asic caps */
5514 pdev = adev->pdev;
5515 speed_cap = pcie_get_speed_cap(pdev);
5516 if (speed_cap == PCI_SPEED_UNKNOWN) {
5517 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
cd474ba0
AD
5518 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5519 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
cd474ba0 5520 } else {
2b3a1f51
FX
5521 if (speed_cap == PCIE_SPEED_32_0GT)
5522 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5523 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5524 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5525 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5526 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
5527 else if (speed_cap == PCIE_SPEED_16_0GT)
5d9a6330
AD
5528 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5529 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5530 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5531 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
5532 else if (speed_cap == PCIE_SPEED_8_0GT)
5533 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5534 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5535 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5536 else if (speed_cap == PCIE_SPEED_5_0GT)
5537 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5538 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
5539 else
5540 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
5541 }
5542 /* platform caps */
c5313457 5543 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
5d9a6330
AD
5544 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5545 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5546 } else {
2b3a1f51
FX
5547 if (platform_speed_cap == PCIE_SPEED_32_0GT)
5548 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5549 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5550 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5551 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5552 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
5553 else if (platform_speed_cap == PCIE_SPEED_16_0GT)
5d9a6330
AD
5554 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5555 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5556 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5557 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
c5313457 5558 else if (platform_speed_cap == PCIE_SPEED_8_0GT)
5d9a6330
AD
5559 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5560 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5561 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
c5313457 5562 else if (platform_speed_cap == PCIE_SPEED_5_0GT)
5d9a6330
AD
5563 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5564 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5565 else
5566 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
5567
cd474ba0
AD
5568 }
5569 }
5570 if (adev->pm.pcie_mlw_mask == 0) {
c5313457 5571 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
5d9a6330
AD
5572 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
5573 } else {
c5313457 5574 switch (platform_link_width) {
5d9a6330 5575 case PCIE_LNK_X32:
cd474ba0
AD
5576 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
5577 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5578 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5579 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5580 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5581 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5582 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5583 break;
5d9a6330 5584 case PCIE_LNK_X16:
cd474ba0
AD
5585 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5586 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5587 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5588 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5589 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5590 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5591 break;
5d9a6330 5592 case PCIE_LNK_X12:
cd474ba0
AD
5593 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5594 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5595 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5596 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5597 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5598 break;
5d9a6330 5599 case PCIE_LNK_X8:
cd474ba0
AD
5600 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5601 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5602 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5603 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5604 break;
5d9a6330 5605 case PCIE_LNK_X4:
cd474ba0
AD
5606 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5607 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5608 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5609 break;
5d9a6330 5610 case PCIE_LNK_X2:
cd474ba0
AD
5611 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5612 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5613 break;
5d9a6330 5614 case PCIE_LNK_X1:
cd474ba0
AD
5615 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
5616 break;
5617 default:
5618 break;
5619 }
d0dd7f0c
AD
5620 }
5621 }
5622}
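/*
 * Illustrative sketch (hypothetical helper, not an existing function):
 * pcie_gen_mask carries both ASIC (CAIL_ASIC_*) and platform (CAIL_*)
 * capability bits, so a given link speed is only usable when both halves
 * advertise it.
 */
static bool example_pcie_gen4_usable(struct amdgpu_device *adev)
{
	return (adev->pm.pcie_gen_mask & CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4) &&
	       (adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
}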
d38ceaf9 5623
08a2fd23
RE
5624/**
5625 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
5626 *
5627 * @adev: amdgpu_device pointer
5628 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
5629 *
5630 * Return true if @peer_adev can access (DMA) @adev through the PCIe
5631 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
5632 * @peer_adev.
5633 */
5634bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
5635 struct amdgpu_device *peer_adev)
5636{
5637#ifdef CONFIG_HSA_AMD_P2P
5638 uint64_t address_mask = peer_adev->dev->dma_mask ?
5639 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
5640 resource_size_t aper_limit =
5641 adev->gmc.aper_base + adev->gmc.aper_size - 1;
bb66ecbf
LL
5642 bool p2p_access =
5643 !adev->gmc.xgmi.connected_to_cpu &&
5644 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);
08a2fd23
RE
5645
5646 return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
5647 adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
5648 !(adev->gmc.aper_base & address_mask ||
5649 aper_limit & address_mask));
5650#else
5651 return false;
5652#endif
5653}
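/*
 * Illustrative sketch (hypothetical helper): peer access is directional,
 * so a caller that wants full P2P DMA between two devices would typically
 * check both directions before enabling it.
 */
static bool example_devices_p2p_capable(struct amdgpu_device *a,
					struct amdgpu_device *b)
{
	return amdgpu_device_is_peer_accessible(a, b) &&
	       amdgpu_device_is_peer_accessible(b, a);
}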
5654
361dbd01
AD
5655int amdgpu_device_baco_enter(struct drm_device *dev)
5656{
1348969a 5657 struct amdgpu_device *adev = drm_to_adev(dev);
7a22677b 5658 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
361dbd01 5659
4a580877 5660 if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
361dbd01
AD
5661 return -ENOTSUPP;
5662
8ab0d6f0 5663 if (ras && adev->ras_enabled &&
acdae216 5664 adev->nbio.funcs->enable_doorbell_interrupt)
7a22677b
LM
5665 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
5666
9530273e 5667 return amdgpu_dpm_baco_enter(adev);
361dbd01
AD
5668}
5669
5670int amdgpu_device_baco_exit(struct drm_device *dev)
5671{
1348969a 5672 struct amdgpu_device *adev = drm_to_adev(dev);
7a22677b 5673 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
9530273e 5674 int ret = 0;
361dbd01 5675
4a580877 5676 if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
361dbd01
AD
5677 return -ENOTSUPP;
5678
9530273e
EQ
5679 ret = amdgpu_dpm_baco_exit(adev);
5680 if (ret)
5681 return ret;
7a22677b 5682
8ab0d6f0 5683 if (ras && adev->ras_enabled &&
acdae216 5684 adev->nbio.funcs->enable_doorbell_interrupt)
7a22677b
LM
5685 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
5686
1bece222
CL
5687 if (amdgpu_passthrough(adev) &&
5688 adev->nbio.funcs->clear_doorbell_interrupt)
5689 adev->nbio.funcs->clear_doorbell_interrupt(adev);
5690
7a22677b 5691 return 0;
361dbd01 5692}
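/*
 * Illustrative sketch (hypothetical caller): BACO entry and exit are used
 * as a pair, e.g. around a runtime-suspend window; error handling is
 * reduced to the minimum here.
 */
static int example_baco_cycle(struct drm_device *dev)
{
	int r;

	r = amdgpu_device_baco_enter(dev);
	if (r)
		return r;

	/* ... device stays in BACO until a wake event ... */

	return amdgpu_device_baco_exit(dev);
}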
c9a6b82f
AG
5693
5694/**
5695 * amdgpu_pci_error_detected - Called when a PCI error is detected.
5696 * @pdev: PCI device struct
5697 * @state: PCI channel state
5698 *
5699 * Description: Called when a PCI error is detected.
5700 *
5701 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
5702 */
5703pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
5704{
5705 struct drm_device *dev = pci_get_drvdata(pdev);
5706 struct amdgpu_device *adev = drm_to_adev(dev);
acd89fca 5707 int i;
c9a6b82f
AG
5708
5709 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
5710
6894305c
AG
5711 if (adev->gmc.xgmi.num_physical_nodes > 1) {
5712 DRM_WARN("No support for XGMI hive yet...");
5713 return PCI_ERS_RESULT_DISCONNECT;
5714 }
5715
e17e27f9
GC
5716 adev->pci_channel_state = state;
5717
c9a6b82f
AG
5718 switch (state) {
5719 case pci_channel_io_normal:
5720 return PCI_ERS_RESULT_CAN_RECOVER;
acd89fca 5721 /* Fatal error, prepare for slot reset */
8a11d283
TZ
5722 case pci_channel_io_frozen:
5723 /*
d0fb18b5 5724 * Locking adev->reset_domain->sem will prevent any external access
acd89fca
AG
5725 * to GPU during PCI error recovery
5726 */
3675c2f2 5727 amdgpu_device_lock_reset_domain(adev->reset_domain);
e923be99 5728 amdgpu_device_set_mp1_state(adev);
acd89fca
AG
5729
5730 /*
5731 * Block any work scheduling as we do for regular GPU reset
5732 * for the duration of the recovery
5733 */
5734 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5735 struct amdgpu_ring *ring = adev->rings[i];
5736
5737 if (!ring || !ring->sched.thread)
5738 continue;
5739
5740 drm_sched_stop(&ring->sched, NULL);
5741 }
8f8c80f4 5742 atomic_inc(&adev->gpu_reset_counter);
c9a6b82f
AG
5743 return PCI_ERS_RESULT_NEED_RESET;
5744 case pci_channel_io_perm_failure:
5745 /* Permanent error, prepare for device removal */
5746 return PCI_ERS_RESULT_DISCONNECT;
5747 }
5748
5749 return PCI_ERS_RESULT_NEED_RESET;
5750}
5751
5752/**
5753 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
5754 * @pdev: pointer to PCI device
5755 */
5756pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
5757{
5758
5759 DRM_INFO("PCI error: mmio enabled callback!!\n");
5760
5761 /* TODO - dump whatever for debugging purposes */
5762
 5763	/* This is called only if amdgpu_pci_error_detected returns
5764 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
5765 * works, no need to reset slot.
5766 */
5767
5768 return PCI_ERS_RESULT_RECOVERED;
5769}
5770
5771/**
5772 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
5773 * @pdev: PCI device struct
5774 *
5775 * Description: This routine is called by the pci error recovery
5776 * code after the PCI slot has been reset, just before we
5777 * should resume normal operations.
5778 */
5779pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
5780{
5781 struct drm_device *dev = pci_get_drvdata(pdev);
5782 struct amdgpu_device *adev = drm_to_adev(dev);
362c7b91 5783 int r, i;
04442bf7 5784 struct amdgpu_reset_context reset_context;
362c7b91 5785 u32 memsize;
7ac71382 5786 struct list_head device_list;
c9a6b82f
AG
5787
5788 DRM_INFO("PCI error: slot reset callback!!\n");
5789
04442bf7
LL
5790 memset(&reset_context, 0, sizeof(reset_context));
5791
7ac71382 5792 INIT_LIST_HEAD(&device_list);
655ce9cb 5793 list_add_tail(&adev->reset_list, &device_list);
7ac71382 5794
362c7b91
AG
5795 /* wait for asic to come out of reset */
5796 msleep(500);
5797
7ac71382 5798 /* Restore PCI confspace */
c1dd4aa6 5799 amdgpu_device_load_pci_state(pdev);
c9a6b82f 5800
362c7b91
AG
5801 /* confirm ASIC came out of reset */
5802 for (i = 0; i < adev->usec_timeout; i++) {
5803 memsize = amdgpu_asic_get_config_memsize(adev);
5804
5805 if (memsize != 0xffffffff)
5806 break;
5807 udelay(1);
5808 }
5809 if (memsize == 0xffffffff) {
5810 r = -ETIME;
5811 goto out;
5812 }
5813
04442bf7
LL
5814 reset_context.method = AMD_RESET_METHOD_NONE;
5815 reset_context.reset_req_dev = adev;
5816 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
5817 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
5818
7afefb81 5819 adev->no_hw_access = true;
04442bf7 5820 r = amdgpu_device_pre_asic_reset(adev, &reset_context);
7afefb81 5821 adev->no_hw_access = false;
c9a6b82f
AG
5822 if (r)
5823 goto out;
5824
04442bf7 5825 r = amdgpu_do_asic_reset(&device_list, &reset_context);
c9a6b82f
AG
5826
5827out:
c9a6b82f 5828 if (!r) {
c1dd4aa6
AG
5829 if (amdgpu_device_cache_pci_state(adev->pdev))
5830 pci_restore_state(adev->pdev);
5831
c9a6b82f
AG
5832 DRM_INFO("PCIe error recovery succeeded\n");
5833 } else {
5834 DRM_ERROR("PCIe error recovery failed, err:%d", r);
e923be99
AG
5835 amdgpu_device_unset_mp1_state(adev);
5836 amdgpu_device_unlock_reset_domain(adev->reset_domain);
c9a6b82f
AG
5837 }
5838
5839 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5840}
5841
5842/**
5843 * amdgpu_pci_resume() - resume normal ops after PCI reset
5844 * @pdev: pointer to PCI device
5845 *
 5846 * Called when the error recovery driver tells us that it's
505199a3 5847 * OK to resume normal operation.
c9a6b82f
AG
5848 */
5849void amdgpu_pci_resume(struct pci_dev *pdev)
5850{
5851 struct drm_device *dev = pci_get_drvdata(pdev);
5852 struct amdgpu_device *adev = drm_to_adev(dev);
acd89fca 5853 int i;
c9a6b82f 5854
c9a6b82f
AG
5855
5856 DRM_INFO("PCI error: resume callback!!\n");
acd89fca 5857
e17e27f9
GC
5858 /* Only continue execution for the case of pci_channel_io_frozen */
5859 if (adev->pci_channel_state != pci_channel_io_frozen)
5860 return;
5861
acd89fca
AG
5862 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5863 struct amdgpu_ring *ring = adev->rings[i];
5864
5865 if (!ring || !ring->sched.thread)
5866 continue;
5867
5868
5869 drm_sched_resubmit_jobs(&ring->sched);
5870 drm_sched_start(&ring->sched, true);
5871 }
5872
e923be99
AG
5873 amdgpu_device_unset_mp1_state(adev);
5874 amdgpu_device_unlock_reset_domain(adev->reset_domain);
c9a6b82f 5875}
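/*
 * Illustrative sketch: the four PCI error callbacks above are hooked into
 * the PCI core through a struct pci_error_handlers attached to the
 * driver's struct pci_driver (the actual wiring lives in amdgpu_drv.c);
 * the instance below is for illustration only.
 */
static const struct pci_error_handlers example_pci_err_handlers = {
	.error_detected	= amdgpu_pci_error_detected,
	.mmio_enabled	= amdgpu_pci_mmio_enabled,
	.slot_reset	= amdgpu_pci_slot_reset,
	.resume		= amdgpu_pci_resume,
};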
c1dd4aa6
AG
5876
5877bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5878{
5879 struct drm_device *dev = pci_get_drvdata(pdev);
5880 struct amdgpu_device *adev = drm_to_adev(dev);
5881 int r;
5882
5883 r = pci_save_state(pdev);
5884 if (!r) {
5885 kfree(adev->pci_state);
5886
5887 adev->pci_state = pci_store_saved_state(pdev);
5888
5889 if (!adev->pci_state) {
5890 DRM_ERROR("Failed to store PCI saved state");
5891 return false;
5892 }
5893 } else {
5894 DRM_WARN("Failed to save PCI state, err:%d\n", r);
5895 return false;
5896 }
5897
5898 return true;
5899}
5900
5901bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
5902{
5903 struct drm_device *dev = pci_get_drvdata(pdev);
5904 struct amdgpu_device *adev = drm_to_adev(dev);
5905 int r;
5906
5907 if (!adev->pci_state)
5908 return false;
5909
5910 r = pci_load_saved_state(pdev, adev->pci_state);
5911
5912 if (!r) {
5913 pci_restore_state(pdev);
5914 } else {
5915 DRM_WARN("Failed to load PCI state, err:%d\n", r);
5916 return false;
5917 }
5918
5919 return true;
5920}
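/*
 * Illustrative sketch (hypothetical sequence): the cached PCI config
 * space is captured while the device is known good and replayed after an
 * event that may have clobbered config space, as amdgpu_pci_slot_reset()
 * does above.
 */
static void example_pci_state_roundtrip(struct pci_dev *pdev)
{
	if (!amdgpu_device_cache_pci_state(pdev))
		return;

	/* ... reset or other event that may clobber config space ... */

	amdgpu_device_load_pci_state(pdev);
}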
5921
810085dd
EH
5922void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
5923 struct amdgpu_ring *ring)
5924{
5925#ifdef CONFIG_X86_64
b818a5d3 5926 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
810085dd
EH
5927 return;
5928#endif
5929 if (adev->gmc.xgmi.connected_to_cpu)
5930 return;
5931
5932 if (ring && ring->funcs->emit_hdp_flush)
5933 amdgpu_ring_emit_hdp_flush(ring);
5934 else
5935 amdgpu_asic_flush_hdp(adev, ring);
5936}
c1dd4aa6 5937
810085dd
EH
5938void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
5939 struct amdgpu_ring *ring)
5940{
5941#ifdef CONFIG_X86_64
b818a5d3 5942 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
810085dd
EH
5943 return;
5944#endif
5945 if (adev->gmc.xgmi.connected_to_cpu)
5946 return;
c1dd4aa6 5947
810085dd
EH
5948 amdgpu_asic_invalidate_hdp(adev, ring);
5949}
34f3a4a9 5950
89a7a870
AG
5951int amdgpu_in_reset(struct amdgpu_device *adev)
5952{
5953 return atomic_read(&adev->reset_domain->in_gpu_reset);
5954 }
5955
34f3a4a9
LY
5956/**
5957 * amdgpu_device_halt() - bring hardware to some kind of halt state
5958 *
5959 * @adev: amdgpu_device pointer
5960 *
 5961 * Bring the hardware to some kind of halt state so that no one can touch it
 5962 * any more. This helps to preserve the error context when an error occurs.
 5963 * Compared to a simple hang, the system stays stable at least for SSH
 5964 * access, so it should be trivial to inspect the hardware state and
 5965 * see what's going on. Implemented as follows:
5966 *
 5967 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc),
5968 * clears all CPU mappings to device, disallows remappings through page faults
5969 * 2. amdgpu_irq_disable_all() disables all interrupts
5970 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 5971 * 4. set adev->no_hw_access to avoid potential crashes after step 5
5972 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
5973 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
5974 * flush any in flight DMA operations
5975 */
5976void amdgpu_device_halt(struct amdgpu_device *adev)
5977{
5978 struct pci_dev *pdev = adev->pdev;
e0f943b4 5979 struct drm_device *ddev = adev_to_drm(adev);
34f3a4a9
LY
5980
5981 drm_dev_unplug(ddev);
5982
5983 amdgpu_irq_disable_all(adev);
5984
5985 amdgpu_fence_driver_hw_fini(adev);
5986
5987 adev->no_hw_access = true;
5988
5989 amdgpu_device_unmap_mmio(adev);
5990
5991 pci_disable_device(pdev);
5992 pci_wait_for_pending_transaction(pdev);
5993}
86700a40
XD
5994
5995u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
5996 u32 reg)
5997{
5998 unsigned long flags, address, data;
5999 u32 r;
6000
6001 address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
6002 data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
6003
6004 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
6005 WREG32(address, reg * 4);
6006 (void)RREG32(address);
6007 r = RREG32(data);
6008 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
6009 return r;
6010}
6011
6012void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
6013 u32 reg, u32 v)
6014{
6015 unsigned long flags, address, data;
6016
6017 address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
6018 data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
6019
6020 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
6021 WREG32(address, reg * 4);
6022 (void)RREG32(address);
6023 WREG32(data, v);
6024 (void)RREG32(data);
6025 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
6026}
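/*
 * Illustrative sketch (hypothetical helper): a read-modify-write of a
 * PCIe port register through the indirect index/data accessors above;
 * the register offset and masks are placeholders, not real definitions.
 */
static void example_pcie_port_rmw(struct amdgpu_device *adev,
				  u32 reg, u32 clear, u32 set)
{
	u32 v = amdgpu_device_pcie_port_rreg(adev, reg);

	v &= ~clear;
	v |= set;
	amdgpu_device_pcie_port_wreg(adev, reg, v);
}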
68ce8b24
CK
6027
6028/**
6029 * amdgpu_device_switch_gang - switch to a new gang
6030 * @adev: amdgpu_device pointer
6031 * @gang: the gang to switch to
6032 *
6033 * Try to switch to a new gang.
6034 * Returns: NULL if we switched to the new gang or a reference to the current
6035 * gang leader.
6036 */
6037struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
6038 struct dma_fence *gang)
6039{
6040 struct dma_fence *old = NULL;
6041
6042 do {
6043 dma_fence_put(old);
6044 rcu_read_lock();
6045 old = dma_fence_get_rcu_safe(&adev->gang_submit);
6046 rcu_read_unlock();
6047
6048 if (old == gang)
6049 break;
6050
6051 if (!dma_fence_is_signaled(old))
6052 return old;
6053
6054 } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
6055 old, gang) != old);
6056
6057 dma_fence_put(old);
6058 return NULL;
6059}
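/*
 * Illustrative sketch (hypothetical caller): per the return contract
 * above, a non-NULL return means the previous gang leader has not
 * signaled yet, so the caller waits (or retries) before submitting work
 * for the new gang. The submission step itself is omitted.
 */
static int example_submit_in_new_gang(struct amdgpu_device *adev,
				      struct dma_fence *gang_leader)
{
	struct dma_fence *old;

	old = amdgpu_device_switch_gang(adev, gang_leader);
	if (old) {
		dma_fence_wait(old, false);
		dma_fence_put(old);
		return -EAGAIN;	/* caller should retry the switch */
	}

	/* ... push the jobs belonging to the new gang here ... */
	return 0;
}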