drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
d38ceaf9
AD
1/*
2 * Copyright 2008 Advanced Micro Devices, Inc.
3 * Copyright 2008 Red Hat Inc.
4 * Copyright 2009 Jerome Glisse.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 * OTHER DEALINGS IN THE SOFTWARE.
23 *
24 * Authors: Dave Airlie
25 * Alex Deucher
26 * Jerome Glisse
27 */
b1ddf548 28#include <linux/power_supply.h>
0875dc9e 29#include <linux/kthread.h>
fdf2f6c5 30#include <linux/module.h>
d38ceaf9
AD
31#include <linux/console.h>
32#include <linux/slab.h>
4a74c38c 33#include <linux/iommu.h>
901e2be2 34#include <linux/pci.h>
fdf2f6c5 35
4562236b 36#include <drm/drm_atomic_helper.h>
fcd70cd3 37#include <drm/drm_probe_helper.h>
d38ceaf9
AD
38#include <drm/amdgpu_drm.h>
39#include <linux/vgaarb.h>
40#include <linux/vga_switcheroo.h>
41#include <linux/efi.h>
42#include "amdgpu.h"
f4b373f4 43#include "amdgpu_trace.h"
d38ceaf9
AD
44#include "amdgpu_i2c.h"
45#include "atom.h"
46#include "amdgpu_atombios.h"
a5bde2f9 47#include "amdgpu_atomfirmware.h"
d0dd7f0c 48#include "amd_pcie.h"
33f34802
KW
49#ifdef CONFIG_DRM_AMDGPU_SI
50#include "si.h"
51#endif
a2e73f56
AD
52#ifdef CONFIG_DRM_AMDGPU_CIK
53#include "cik.h"
54#endif
aaa36a97 55#include "vi.h"
460826e6 56#include "soc15.h"
0a5b8c7b 57#include "nv.h"
d38ceaf9 58#include "bif/bif_4_1_d.h"
bec86378 59#include <linux/firmware.h>
89041940 60#include "amdgpu_vf_error.h"
d38ceaf9 61
ba997709 62#include "amdgpu_amdkfd.h"
d2f52ac8 63#include "amdgpu_pm.h"
d38ceaf9 64
5183411b 65#include "amdgpu_xgmi.h"
c030f2e4 66#include "amdgpu_ras.h"
9c7c85f7 67#include "amdgpu_pmu.h"
bd607166 68#include "amdgpu_fru_eeprom.h"
04442bf7 69#include "amdgpu_reset.h"
5183411b 70
d5ea093e 71#include <linux/suspend.h>
c6a6e2db 72#include <drm/task_barrier.h>
3f12acc8 73#include <linux/pm_runtime.h>
d5ea093e 74
f89f8c6b
AG
75#include <drm/drm_drv.h>
76
e2a75f88 77MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
3f76dced 78MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
2d2e5e7e 79MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
ad5a67a7 80MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
54c4d17e 81MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
65e60f6e 82MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
42b325e5 83MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
e2a75f88 84
2dc80b00 85#define AMDGPU_RESUME_MS 2000
7258fa31
SK
86#define AMDGPU_MAX_RETRY_LIMIT 2
87#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
2dc80b00 88
050091ab 89const char *amdgpu_asic_name[] = {
da69c161
KW
90 "TAHITI",
91 "PITCAIRN",
92 "VERDE",
93 "OLAND",
94 "HAINAN",
d38ceaf9
AD
95 "BONAIRE",
96 "KAVERI",
97 "KABINI",
98 "HAWAII",
99 "MULLINS",
100 "TOPAZ",
101 "TONGA",
48299f95 102 "FIJI",
d38ceaf9 103 "CARRIZO",
139f4917 104 "STONEY",
2cc0c0b5
FC
105 "POLARIS10",
106 "POLARIS11",
c4642a47 107 "POLARIS12",
48ff108d 108 "VEGAM",
d4196f01 109 "VEGA10",
8fab806a 110 "VEGA12",
956fcddc 111 "VEGA20",
2ca8a5d2 112 "RAVEN",
d6c3b24e 113 "ARCTURUS",
1eee4228 114 "RENOIR",
d46b417a 115 "ALDEBARAN",
852a6626 116 "NAVI10",
d0f56dc2 117 "CYAN_SKILLFISH",
87dbad02 118 "NAVI14",
9802f5d7 119 "NAVI12",
ccaf72d3 120 "SIENNA_CICHLID",
ddd8fbe7 121 "NAVY_FLOUNDER",
4f1e9a76 122 "VANGOGH",
a2468e04 123 "DIMGREY_CAVEFISH",
6f169591 124 "BEIGE_GOBY",
ee9236b7 125 "YELLOW_CARP",
3ae695d6 126 "IP DISCOVERY",
d38ceaf9
AD
127 "LAST",
128};
129
dcea6e65
KR
130/**
131 * DOC: pcie_replay_count
132 *
133 * The amdgpu driver provides a sysfs API for reporting the total number
134 * of PCIe replays (NAKs).
135 * The file pcie_replay_count is used for this and returns the total
136 * number of replays as a sum of the NAKs generated and NAKs received.
137 */
138
139static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
140 struct device_attribute *attr, char *buf)
141{
142 struct drm_device *ddev = dev_get_drvdata(dev);
1348969a 143 struct amdgpu_device *adev = drm_to_adev(ddev);
dcea6e65
KR
144 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
145
36000c7a 146 return sysfs_emit(buf, "%llu\n", cnt);
dcea6e65
KR
147}
148
149static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
150 amdgpu_device_get_pcie_replay_count, NULL);
151
5494d864
AD
152static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
153
bd607166
KR
154/**
155 * DOC: product_name
156 *
157 * The amdgpu driver provides a sysfs API for reporting the product name
158 * for the device
159 * The file product_name is used for this and returns the product name
160 * as returned from the FRU.
161 * NOTE: This is only available for certain server cards
162 */
163
164static ssize_t amdgpu_device_get_product_name(struct device *dev,
165 struct device_attribute *attr, char *buf)
166{
167 struct drm_device *ddev = dev_get_drvdata(dev);
1348969a 168 struct amdgpu_device *adev = drm_to_adev(ddev);
bd607166 169
36000c7a 170 return sysfs_emit(buf, "%s\n", adev->product_name);
bd607166
KR
171}
172
173static DEVICE_ATTR(product_name, S_IRUGO,
174 amdgpu_device_get_product_name, NULL);
175
176/**
177 * DOC: product_number
178 *
179 * The amdgpu driver provides a sysfs API for reporting the part number
180 * for the device
181 * The file product_number is used for this and returns the part number
182 * as returned from the FRU.
183 * NOTE: This is only available for certain server cards
184 */
185
186static ssize_t amdgpu_device_get_product_number(struct device *dev,
187 struct device_attribute *attr, char *buf)
188{
189 struct drm_device *ddev = dev_get_drvdata(dev);
1348969a 190 struct amdgpu_device *adev = drm_to_adev(ddev);
bd607166 191
36000c7a 192 return sysfs_emit(buf, "%s\n", adev->product_number);
bd607166
KR
193}
194
195static DEVICE_ATTR(product_number, S_IRUGO,
196 amdgpu_device_get_product_number, NULL);
197
198/**
199 * DOC: serial_number
200 *
201 * The amdgpu driver provides a sysfs API for reporting the serial number
202 * for the device
203 * The file serial_number is used for this and returns the serial number
204 * as returned from the FRU.
205 * NOTE: This is only available for certain server cards
206 */
207
208static ssize_t amdgpu_device_get_serial_number(struct device *dev,
209 struct device_attribute *attr, char *buf)
210{
211 struct drm_device *ddev = dev_get_drvdata(dev);
1348969a 212 struct amdgpu_device *adev = drm_to_adev(ddev);
bd607166 213
36000c7a 214 return sysfs_emit(buf, "%s\n", adev->serial);
bd607166
KR
215}
216
217static DEVICE_ATTR(serial_number, S_IRUGO,
218 amdgpu_device_get_serial_number, NULL);
219
fd496ca8 220/**
b98c6299 221 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
fd496ca8
AD
222 *
223 * @dev: drm_device pointer
224 *
b98c6299 225 * Returns true if the device is a dGPU with ATPX power control,
fd496ca8
AD
226 * otherwise return false.
227 */
b98c6299 228bool amdgpu_device_supports_px(struct drm_device *dev)
fd496ca8
AD
229{
230 struct amdgpu_device *adev = drm_to_adev(dev);
231
b98c6299 232 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
fd496ca8
AD
233 return true;
234 return false;
235}
236
e3ecdffa 237/**
0330b848 238 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
e3ecdffa
AD
239 *
240 * @dev: drm_device pointer
241 *
b98c6299 242 * Returns true if the device is a dGPU with ACPI power control,
e3ecdffa
AD
243 * otherwise return false.
244 */
31af062a 245bool amdgpu_device_supports_boco(struct drm_device *dev)
d38ceaf9 246{
1348969a 247 struct amdgpu_device *adev = drm_to_adev(dev);
d38ceaf9 248
b98c6299
AD
249 if (adev->has_pr3 ||
250 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
d38ceaf9
AD
251 return true;
252 return false;
253}
254
a69cba42
AD
255/**
256 * amdgpu_device_supports_baco - Does the device support BACO
257 *
258 * @dev: drm_device pointer
259 *
260 * Returns true if the device supports BACO,
261 * otherwise return false.
262 */
263bool amdgpu_device_supports_baco(struct drm_device *dev)
264{
1348969a 265 struct amdgpu_device *adev = drm_to_adev(dev);
a69cba42
AD
266
267 return amdgpu_asic_supports_baco(adev);
268}
269
3fa8f89d
S
270/**
271 * amdgpu_device_supports_smart_shift - Is the device dGPU with
272 * smart shift support
273 *
274 * @dev: drm_device pointer
275 *
276 * Returns true if the device is a dGPU with Smart Shift support,
277 * otherwise returns false.
278 */
279bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
280{
281 return (amdgpu_device_supports_boco(dev) &&
282 amdgpu_acpi_is_power_shift_control_supported());
283}
284
6e3cd2a9
MCC
285/*
286 * VRAM access helper functions
287 */
288
e35e2b11 289/**
048af66b 290 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
e35e2b11
TY
291 *
292 * @adev: amdgpu_device pointer
293 * @pos: offset of the buffer in vram
294 * @buf: virtual address of the buffer in system memory
295 * @size: read/write size, sizeof(@buf) must be >= @size
296 * @write: true - write to vram, otherwise - read from vram
297 */
048af66b
KW
298void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
299 void *buf, size_t size, bool write)
e35e2b11 300{
e35e2b11 301 unsigned long flags;
048af66b
KW
302 uint32_t hi = ~0, tmp = 0;
303 uint32_t *data = buf;
ce05ac56 304 uint64_t last;
f89f8c6b 305 int idx;
ce05ac56 306
c58a863b 307 if (!drm_dev_enter(adev_to_drm(adev), &idx))
f89f8c6b 308 return;
9d11eb0d 309
048af66b
KW
310 BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));
311
312 spin_lock_irqsave(&adev->mmio_idx_lock, flags);
313 for (last = pos + size; pos < last; pos += 4) {
314 tmp = pos >> 31;
315
316 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
317 if (tmp != hi) {
318 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
319 hi = tmp;
320 }
321 if (write)
322 WREG32_NO_KIQ(mmMM_DATA, *data++);
323 else
324 *data++ = RREG32_NO_KIQ(mmMM_DATA);
325 }
326
327 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
328 drm_dev_exit(idx);
329}
330
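/*
 * A minimal usage sketch (illustrative, not part of the original file):
 * read one dword back from VRAM offset 0 through the MM_INDEX/MM_DATA path.
 * Both @pos and @size must be dword aligned, as the BUG_ON above enforces.
 *
 *	uint32_t val;
 *
 *	amdgpu_device_mm_access(adev, 0, &val, sizeof(val), false);
 */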
331/**
bbe04dec 332 * amdgpu_device_aper_access - access vram by vram aperture
048af66b
KW
333 *
334 * @adev: amdgpu_device pointer
335 * @pos: offset of the buffer in vram
336 * @buf: virtual address of the buffer in system memory
337 * @size: read/write size, sizeof(@buf) must be >= @size
338 * @write: true - write to vram, otherwise - read from vram
339 *
340 * Returns the number of bytes that have been transferred.
341 */
342size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
343 void *buf, size_t size, bool write)
344{
9d11eb0d 345#ifdef CONFIG_64BIT
048af66b
KW
346 void __iomem *addr;
347 size_t count = 0;
348 uint64_t last;
349
350 if (!adev->mman.aper_base_kaddr)
351 return 0;
352
9d11eb0d
CK
353 last = min(pos + size, adev->gmc.visible_vram_size);
354 if (last > pos) {
048af66b
KW
355 addr = adev->mman.aper_base_kaddr + pos;
356 count = last - pos;
9d11eb0d
CK
357
358 if (write) {
359 memcpy_toio(addr, buf, count);
360 mb();
810085dd 361 amdgpu_device_flush_hdp(adev, NULL);
9d11eb0d 362 } else {
810085dd 363 amdgpu_device_invalidate_hdp(adev, NULL);
9d11eb0d
CK
364 mb();
365 memcpy_fromio(buf, addr, count);
366 }
367
9d11eb0d 368 }
048af66b
KW
369
370 return count;
371#else
372 return 0;
9d11eb0d 373#endif
048af66b 374}
9d11eb0d 375
048af66b
KW
376/**
377 * amdgpu_device_vram_access - read/write a buffer in vram
378 *
379 * @adev: amdgpu_device pointer
380 * @pos: offset of the buffer in vram
381 * @buf: virtual address of the buffer in system memory
382 * @size: read/write size, sizeof(@buf) must be >= @size
383 * @write: true - write to vram, otherwise - read from vram
384 */
385void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
386 void *buf, size_t size, bool write)
387{
388 size_t count;
e35e2b11 389
048af66b
KW
390 /* try to use the VRAM aperture to access VRAM first */
391 count = amdgpu_device_aper_access(adev, pos, buf, size, write);
392 size -= count;
393 if (size) {
394 /* use MM to access the rest of VRAM */
395 pos += count;
396 buf += count;
397 amdgpu_device_mm_access(adev, pos, buf, size, write);
e35e2b11
TY
398 }
399}
400
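/*
 * Illustrative usage sketch (not part of the original file): copy @len bytes
 * out of VRAM starting at byte offset @offset into a kernel buffer; the
 * helper prefers the CPU-visible aperture and falls back to MM_INDEX/MM_DATA
 * for whatever remains. The variable names here are placeholders.
 *
 *	amdgpu_device_vram_access(adev, offset, buf, len, false);
 */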
d38ceaf9 401/*
f7ee1874 402 * register access helper functions.
d38ceaf9 403 */
56b53c0b
DL
404
405/* Check if hw access should be skipped because of hotplug or device error */
406bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
407{
7afefb81 408 if (adev->no_hw_access)
56b53c0b
DL
409 return true;
410
411#ifdef CONFIG_LOCKDEP
412 /*
413 * This is a bit complicated to understand, so worth a comment. What we assert
414 * here is that the GPU reset is not running on another thread in parallel.
415 *
416 * For this we trylock the read side of the reset semaphore, if that succeeds
417 * we know that the reset is not running in parallel.
418 *
419 * If the trylock fails we assert that we are either already holding the read
420 * side of the lock or are the reset thread itself and hold the write side of
421 * the lock.
422 */
423 if (in_task()) {
d0fb18b5
AG
424 if (down_read_trylock(&adev->reset_domain->sem))
425 up_read(&adev->reset_domain->sem);
56b53c0b 426 else
d0fb18b5 427 lockdep_assert_held(&adev->reset_domain->sem);
56b53c0b
DL
428 }
429#endif
430 return false;
431}
432
e3ecdffa 433/**
f7ee1874 434 * amdgpu_device_rreg - read a memory mapped IO or indirect register
e3ecdffa
AD
435 *
436 * @adev: amdgpu_device pointer
437 * @reg: dword aligned register offset
438 * @acc_flags: access flags which require special behavior
439 *
440 * Returns the 32 bit value from the offset specified.
441 */
f7ee1874
HZ
442uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
443 uint32_t reg, uint32_t acc_flags)
d38ceaf9 444{
f4b373f4
TSD
445 uint32_t ret;
446
56b53c0b 447 if (amdgpu_device_skip_hw_access(adev))
bf36b52e
AG
448 return 0;
449
f7ee1874
HZ
450 if ((reg * 4) < adev->rmmio_size) {
451 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
452 amdgpu_sriov_runtime(adev) &&
d0fb18b5 453 down_read_trylock(&adev->reset_domain->sem)) {
f7ee1874 454 ret = amdgpu_kiq_rreg(adev, reg);
d0fb18b5 455 up_read(&adev->reset_domain->sem);
f7ee1874
HZ
456 } else {
457 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
458 }
459 } else {
460 ret = adev->pcie_rreg(adev, reg * 4);
81202807 461 }
bc992ba5 462
f7ee1874 463 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
e78b579d 464
f4b373f4 465 return ret;
d38ceaf9
AD
466}
467
421a2a30
ML
468/*
469 * MMIO register read with bytes helper functions
470 * @offset: byte offset from MMIO start
471 *
472*/
473
e3ecdffa
AD
474/**
475 * amdgpu_mm_rreg8 - read a memory mapped IO register
476 *
477 * @adev: amdgpu_device pointer
478 * @offset: byte aligned register offset
479 *
480 * Returns the 8 bit value from the offset specified.
481 */
7cbbc745
AG
482uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
483{
56b53c0b 484 if (amdgpu_device_skip_hw_access(adev))
bf36b52e
AG
485 return 0;
486
421a2a30
ML
487 if (offset < adev->rmmio_size)
488 return (readb(adev->rmmio + offset));
489 BUG();
490}
491
492/*
493 * MMIO register write with bytes helper functions
494 * @offset: byte offset from MMIO start
495 * @value: the value to be written to the register
496 *
497*/
e3ecdffa
AD
498/**
499 * amdgpu_mm_wreg8 - write a memory mapped IO register
500 *
501 * @adev: amdgpu_device pointer
502 * @offset: byte aligned register offset
503 * @value: 8 bit value to write
504 *
505 * Writes the value specified to the offset specified.
506 */
7cbbc745
AG
507void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
508{
56b53c0b 509 if (amdgpu_device_skip_hw_access(adev))
bf36b52e
AG
510 return;
511
421a2a30
ML
512 if (offset < adev->rmmio_size)
513 writeb(value, adev->rmmio + offset);
514 else
515 BUG();
516}
517
e3ecdffa 518/**
f7ee1874 519 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
e3ecdffa
AD
520 *
521 * @adev: amdgpu_device pointer
522 * @reg: dword aligned register offset
523 * @v: 32 bit value to write to the register
524 * @acc_flags: access flags which require special behavior
525 *
526 * Writes the value specified to the offset specified.
527 */
f7ee1874
HZ
528void amdgpu_device_wreg(struct amdgpu_device *adev,
529 uint32_t reg, uint32_t v,
530 uint32_t acc_flags)
d38ceaf9 531{
56b53c0b 532 if (amdgpu_device_skip_hw_access(adev))
bf36b52e
AG
533 return;
534
f7ee1874
HZ
535 if ((reg * 4) < adev->rmmio_size) {
536 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
537 amdgpu_sriov_runtime(adev) &&
d0fb18b5 538 down_read_trylock(&adev->reset_domain->sem)) {
f7ee1874 539 amdgpu_kiq_wreg(adev, reg, v);
d0fb18b5 540 up_read(&adev->reset_domain->sem);
f7ee1874
HZ
541 } else {
542 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
543 }
544 } else {
545 adev->pcie_wreg(adev, reg * 4, v);
81202807 546 }
bc992ba5 547
f7ee1874 548 trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
2e0cc4d4 549}
d38ceaf9 550
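/*
 * Illustrative note (not part of the original file): most callers do not use
 * amdgpu_device_rreg()/amdgpu_device_wreg() directly but go through the
 * RREG32()/WREG32() style macros, which expand to these helpers with the
 * appropriate access flags. A hedged read-modify-write sketch, where reg and
 * some_bit are placeholders:
 *
 *	u32 tmp = amdgpu_device_rreg(adev, reg, 0);
 *
 *	amdgpu_device_wreg(adev, reg, tmp | some_bit, 0);
 */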
03f2abb0 551/**
4cc9f86f 552 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
2e0cc4d4 553 *
71579346
RB
554 * @adev: amdgpu_device pointer
555 * @reg: mmio/rlc register
556 * @v: value to write
557 *
558 * this function is invoked only for the debugfs register access
03f2abb0 559 */
f7ee1874
HZ
560void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
561 uint32_t reg, uint32_t v)
2e0cc4d4 562{
56b53c0b 563 if (amdgpu_device_skip_hw_access(adev))
bf36b52e
AG
564 return;
565
2e0cc4d4 566 if (amdgpu_sriov_fullaccess(adev) &&
f7ee1874
HZ
567 adev->gfx.rlc.funcs &&
568 adev->gfx.rlc.funcs->is_rlcg_access_range) {
2e0cc4d4 569 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
1b2dc99e 570 return amdgpu_sriov_wreg(adev, reg, v, 0, 0);
4cc9f86f
TSD
571 } else if ((reg * 4) >= adev->rmmio_size) {
572 adev->pcie_wreg(adev, reg * 4, v);
f7ee1874
HZ
573 } else {
574 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
47ed4e1c 575 }
d38ceaf9
AD
576}
577
d38ceaf9
AD
578/**
579 * amdgpu_mm_rdoorbell - read a doorbell dword
580 *
581 * @adev: amdgpu_device pointer
582 * @index: doorbell index
583 *
584 * Returns the value in the doorbell aperture at the
585 * requested doorbell index (CIK).
586 */
587u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
588{
56b53c0b 589 if (amdgpu_device_skip_hw_access(adev))
bf36b52e
AG
590 return 0;
591
d38ceaf9
AD
592 if (index < adev->doorbell.num_doorbells) {
593 return readl(adev->doorbell.ptr + index);
594 } else {
595 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
596 return 0;
597 }
598}
599
600/**
601 * amdgpu_mm_wdoorbell - write a doorbell dword
602 *
603 * @adev: amdgpu_device pointer
604 * @index: doorbell index
605 * @v: value to write
606 *
607 * Writes @v to the doorbell aperture at the
608 * requested doorbell index (CIK).
609 */
610void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
611{
56b53c0b 612 if (amdgpu_device_skip_hw_access(adev))
bf36b52e
AG
613 return;
614
d38ceaf9
AD
615 if (index < adev->doorbell.num_doorbells) {
616 writel(v, adev->doorbell.ptr + index);
617 } else {
618 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
619 }
620}
621
832be404
KW
622/**
623 * amdgpu_mm_rdoorbell64 - read a doorbell Qword
624 *
625 * @adev: amdgpu_device pointer
626 * @index: doorbell index
627 *
628 * Returns the value in the doorbell aperture at the
629 * requested doorbell index (VEGA10+).
630 */
631u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
632{
56b53c0b 633 if (amdgpu_device_skip_hw_access(adev))
bf36b52e
AG
634 return 0;
635
832be404
KW
636 if (index < adev->doorbell.num_doorbells) {
637 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
638 } else {
639 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
640 return 0;
641 }
642}
643
644/**
645 * amdgpu_mm_wdoorbell64 - write a doorbell Qword
646 *
647 * @adev: amdgpu_device pointer
648 * @index: doorbell index
649 * @v: value to write
650 *
651 * Writes @v to the doorbell aperture at the
652 * requested doorbell index (VEGA10+).
653 */
654void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
655{
56b53c0b 656 if (amdgpu_device_skip_hw_access(adev))
bf36b52e
AG
657 return;
658
832be404
KW
659 if (index < adev->doorbell.num_doorbells) {
660 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
661 } else {
662 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
663 }
664}
665
1bba3683
HZ
666/**
667 * amdgpu_device_indirect_rreg - read an indirect register
668 *
669 * @adev: amdgpu_device pointer
670 * @pcie_index: mmio register offset
671 * @pcie_data: mmio register offset
22f453fb 672 * @reg_addr: indirect register address to read from
1bba3683
HZ
673 *
674 * Returns the value of indirect register @reg_addr
675 */
676u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
677 u32 pcie_index, u32 pcie_data,
678 u32 reg_addr)
679{
680 unsigned long flags;
681 u32 r;
682 void __iomem *pcie_index_offset;
683 void __iomem *pcie_data_offset;
684
685 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
686 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
687 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
688
689 writel(reg_addr, pcie_index_offset);
690 readl(pcie_index_offset);
691 r = readl(pcie_data_offset);
692 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
693
694 return r;
695}
696
697/**
698 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
699 *
700 * @adev: amdgpu_device pointer
701 * @pcie_index: mmio register offset
702 * @pcie_data: mmio register offset
22f453fb 703 * @reg_addr: indirect register address to read from
1bba3683
HZ
704 *
705 * Returns the value of indirect register @reg_addr
706 */
707u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
708 u32 pcie_index, u32 pcie_data,
709 u32 reg_addr)
710{
711 unsigned long flags;
712 u64 r;
713 void __iomem *pcie_index_offset;
714 void __iomem *pcie_data_offset;
715
716 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
717 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
718 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
719
720 /* read low 32 bits */
721 writel(reg_addr, pcie_index_offset);
722 readl(pcie_index_offset);
723 r = readl(pcie_data_offset);
724 /* read high 32 bits */
725 writel(reg_addr + 4, pcie_index_offset);
726 readl(pcie_index_offset);
727 r |= ((u64)readl(pcie_data_offset) << 32);
728 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
729
730 return r;
731}
732
733/**
734 * amdgpu_device_indirect_wreg - write an indirect register address
735 *
736 * @adev: amdgpu_device pointer
737 * @pcie_index: mmio register offset
738 * @pcie_data: mmio register offset
739 * @reg_addr: indirect register offset
740 * @reg_data: indirect register data
741 *
742 */
743void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
744 u32 pcie_index, u32 pcie_data,
745 u32 reg_addr, u32 reg_data)
746{
747 unsigned long flags;
748 void __iomem *pcie_index_offset;
749 void __iomem *pcie_data_offset;
750
751 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
752 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
753 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
754
755 writel(reg_addr, pcie_index_offset);
756 readl(pcie_index_offset);
757 writel(reg_data, pcie_data_offset);
758 readl(pcie_data_offset);
759 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
760}
761
762/**
763 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
764 *
765 * @adev: amdgpu_device pointer
766 * @pcie_index: mmio register offset
767 * @pcie_data: mmio register offset
768 * @reg_addr: indirect register offset
769 * @reg_data: indirect register data
770 *
771 */
772void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
773 u32 pcie_index, u32 pcie_data,
774 u32 reg_addr, u64 reg_data)
775{
776 unsigned long flags;
777 void __iomem *pcie_index_offset;
778 void __iomem *pcie_data_offset;
779
780 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
781 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
782 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
783
784 /* write low 32 bits */
785 writel(reg_addr, pcie_index_offset);
786 readl(pcie_index_offset);
787 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
788 readl(pcie_data_offset);
789 /* write high 32 bits */
790 writel(reg_addr + 4, pcie_index_offset);
791 readl(pcie_index_offset);
792 writel((u32)(reg_data >> 32), pcie_data_offset);
793 readl(pcie_data_offset);
794 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
795}
796
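/*
 * Illustrative sketch (not part of the original file): an ASIC-specific
 * pcie_rreg callback can be built on top of these helpers. The register
 * names below are placeholders for whatever index/data pair the ASIC uses.
 *
 *	static u32 example_pcie_rreg(struct amdgpu_device *adev, u32 reg)
 *	{
 *		return amdgpu_device_indirect_rreg(adev, mmEXAMPLE_PCIE_INDEX,
 *						   mmEXAMPLE_PCIE_DATA, reg);
 *	}
 */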
d38ceaf9
AD
797/**
798 * amdgpu_invalid_rreg - dummy reg read function
799 *
982a820b 800 * @adev: amdgpu_device pointer
d38ceaf9
AD
801 * @reg: offset of register
802 *
803 * Dummy register read function. Used for register blocks
804 * that certain asics don't have (all asics).
805 * Returns the value in the register.
806 */
807static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
808{
809 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
810 BUG();
811 return 0;
812}
813
814/**
815 * amdgpu_invalid_wreg - dummy reg write function
816 *
982a820b 817 * @adev: amdgpu_device pointer
d38ceaf9
AD
818 * @reg: offset of register
819 * @v: value to write to the register
820 *
821 * Dummy register write function. Used for register blocks
822 * that certain asics don't have (all asics).
823 */
824static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
825{
826 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
827 reg, v);
828 BUG();
829}
830
4fa1c6a6
TZ
831/**
832 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
833 *
982a820b 834 * @adev: amdgpu_device pointer
4fa1c6a6
TZ
835 * @reg: offset of register
836 *
837 * Dummy register read function. Used for register blocks
838 * that certain asics don't have (all asics).
839 * Returns the value in the register.
840 */
841static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
842{
843 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
844 BUG();
845 return 0;
846}
847
848/**
849 * amdgpu_invalid_wreg64 - dummy reg write function
850 *
982a820b 851 * @adev: amdgpu_device pointer
4fa1c6a6
TZ
852 * @reg: offset of register
853 * @v: value to write to the register
854 *
855 * Dummy register write function. Used for register blocks
856 * that certain asics don't have (all asics).
857 */
858static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
859{
860 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
861 reg, v);
862 BUG();
863}
864
d38ceaf9
AD
865/**
866 * amdgpu_block_invalid_rreg - dummy reg read function
867 *
982a820b 868 * @adev: amdgpu_device pointer
d38ceaf9
AD
869 * @block: offset of instance
870 * @reg: offset of register
871 *
872 * Dummy register read function. Used for register blocks
873 * that certain asics don't have (all asics).
874 * Returns the value in the register.
875 */
876static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
877 uint32_t block, uint32_t reg)
878{
879 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
880 reg, block);
881 BUG();
882 return 0;
883}
884
885/**
886 * amdgpu_block_invalid_wreg - dummy reg write function
887 *
982a820b 888 * @adev: amdgpu_device pointer
d38ceaf9
AD
889 * @block: offset of instance
890 * @reg: offset of register
891 * @v: value to write to the register
892 *
893 * Dummy register write function. Used for register blocks
894 * that certain asics don't have (all asics).
895 */
896static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
897 uint32_t block,
898 uint32_t reg, uint32_t v)
899{
900 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
901 reg, block, v);
902 BUG();
903}
904
4d2997ab
AD
905/**
906 * amdgpu_device_asic_init - Wrapper for atom asic_init
907 *
982a820b 908 * @adev: amdgpu_device pointer
4d2997ab
AD
909 *
910 * Does any asic specific work and then calls atom asic init.
911 */
912static int amdgpu_device_asic_init(struct amdgpu_device *adev)
913{
914 amdgpu_asic_pre_asic_init(adev);
915
85d1bcc6
HZ
916 if (adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0))
917 return amdgpu_atomfirmware_asic_init(adev, true);
918 else
919 return amdgpu_atom_asic_init(adev->mode_info.atom_context);
4d2997ab
AD
920}
921
e3ecdffa
AD
922/**
923 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
924 *
982a820b 925 * @adev: amdgpu_device pointer
e3ecdffa
AD
926 *
927 * Allocates a scratch page of VRAM for use by various things in the
928 * driver.
929 */
06ec9070 930static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
d38ceaf9 931{
a4a02777
CK
932 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
933 PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
934 &adev->vram_scratch.robj,
935 &adev->vram_scratch.gpu_addr,
936 (void **)&adev->vram_scratch.ptr);
d38ceaf9
AD
937}
938
e3ecdffa
AD
939/**
940 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
941 *
982a820b 942 * @adev: amdgpu_device pointer
e3ecdffa
AD
943 *
944 * Frees the VRAM scratch page.
945 */
06ec9070 946static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
d38ceaf9 947{
078af1a3 948 amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
d38ceaf9
AD
949}
950
951/**
9c3f2b54 952 * amdgpu_device_program_register_sequence - program an array of registers.
d38ceaf9
AD
953 *
954 * @adev: amdgpu_device pointer
955 * @registers: pointer to the register array
956 * @array_size: size of the register array
957 *
958 * Programs an array of registers with AND and OR masks.
959 * This is a helper for setting golden registers.
960 */
9c3f2b54
AD
961void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
962 const u32 *registers,
963 const u32 array_size)
d38ceaf9
AD
964{
965 u32 tmp, reg, and_mask, or_mask;
966 int i;
967
968 if (array_size % 3)
969 return;
970
971 for (i = 0; i < array_size; i += 3) {
972 reg = registers[i + 0];
973 and_mask = registers[i + 1];
974 or_mask = registers[i + 2];
975
976 if (and_mask == 0xffffffff) {
977 tmp = or_mask;
978 } else {
979 tmp = RREG32(reg);
980 tmp &= ~and_mask;
e0d07657
HZ
981 if (adev->family >= AMDGPU_FAMILY_AI)
982 tmp |= (or_mask & and_mask);
983 else
984 tmp |= or_mask;
d38ceaf9
AD
985 }
986 WREG32(reg, tmp);
987 }
988}
989
e3ecdffa
AD
990/**
991 * amdgpu_device_pci_config_reset - reset the GPU
992 *
993 * @adev: amdgpu_device pointer
994 *
995 * Resets the GPU using the pci config reset sequence.
996 * Only applicable to asics prior to vega10.
997 */
8111c387 998void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
d38ceaf9
AD
999{
1000 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
1001}
1002
af484df8
AD
1003/**
1004 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
1005 *
1006 * @adev: amdgpu_device pointer
1007 *
1008 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
1009 */
1010int amdgpu_device_pci_reset(struct amdgpu_device *adev)
1011{
1012 return pci_reset_function(adev->pdev);
1013}
1014
d38ceaf9
AD
1015/*
1016 * GPU doorbell aperture helpers function.
1017 */
1018/**
06ec9070 1019 * amdgpu_device_doorbell_init - Init doorbell driver information.
d38ceaf9
AD
1020 *
1021 * @adev: amdgpu_device pointer
1022 *
1023 * Init doorbell driver information (CIK)
1024 * Returns 0 on success, error on failure.
1025 */
06ec9070 1026static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
d38ceaf9 1027{
6585661d 1028
705e519e
CK
1029 /* No doorbell on SI hardware generation */
1030 if (adev->asic_type < CHIP_BONAIRE) {
1031 adev->doorbell.base = 0;
1032 adev->doorbell.size = 0;
1033 adev->doorbell.num_doorbells = 0;
1034 adev->doorbell.ptr = NULL;
1035 return 0;
1036 }
1037
d6895ad3
CK
1038 if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
1039 return -EINVAL;
1040
22357775
AD
1041 amdgpu_asic_init_doorbell_index(adev);
1042
d38ceaf9
AD
1043 /* doorbell bar mapping */
1044 adev->doorbell.base = pci_resource_start(adev->pdev, 2);
1045 adev->doorbell.size = pci_resource_len(adev->pdev, 2);
1046
de33a329
JX
1047 if (adev->enable_mes) {
1048 adev->doorbell.num_doorbells =
1049 adev->doorbell.size / sizeof(u32);
1050 } else {
1051 adev->doorbell.num_doorbells =
1052 min_t(u32, adev->doorbell.size / sizeof(u32),
1053 adev->doorbell_index.max_assignment+1);
1054 if (adev->doorbell.num_doorbells == 0)
1055 return -EINVAL;
1056
1057 /* For Vega, reserve and map two pages on doorbell BAR since SDMA
1058 * paging queue doorbell uses the second page. The
1059 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
1060 * doorbells are in the first page. So with the paging queue enabled,
1061 * the max num_doorbells should be increased by 1 page (0x400 dwords)
1062 */
1063 if (adev->asic_type >= CHIP_VEGA10)
1064 adev->doorbell.num_doorbells += 0x400;
1065 }
ec3db8a6 1066
8972e5d2
CK
1067 adev->doorbell.ptr = ioremap(adev->doorbell.base,
1068 adev->doorbell.num_doorbells *
1069 sizeof(u32));
1070 if (adev->doorbell.ptr == NULL)
d38ceaf9 1071 return -ENOMEM;
d38ceaf9
AD
1072
1073 return 0;
1074}
1075
1076/**
06ec9070 1077 * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
d38ceaf9
AD
1078 *
1079 * @adev: amdgpu_device pointer
1080 *
1081 * Tear down doorbell driver information (CIK)
1082 */
06ec9070 1083static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
d38ceaf9
AD
1084{
1085 iounmap(adev->doorbell.ptr);
1086 adev->doorbell.ptr = NULL;
1087}
1088
22cb0164 1089
d38ceaf9
AD
1090
1091/*
06ec9070 1092 * amdgpu_device_wb_*()
455a7bc2 1093 * Writeback is the method by which the GPU updates special pages in memory
ea81a173 1094 * with the status of certain GPU events (fences, ring pointers, etc.).
d38ceaf9
AD
1095 */
1096
1097/**
06ec9070 1098 * amdgpu_device_wb_fini - Disable Writeback and free memory
d38ceaf9
AD
1099 *
1100 * @adev: amdgpu_device pointer
1101 *
1102 * Disables Writeback and frees the Writeback memory (all asics).
1103 * Used at driver shutdown.
1104 */
06ec9070 1105static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
d38ceaf9
AD
1106{
1107 if (adev->wb.wb_obj) {
a76ed485
AD
1108 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
1109 &adev->wb.gpu_addr,
1110 (void **)&adev->wb.wb);
d38ceaf9
AD
1111 adev->wb.wb_obj = NULL;
1112 }
1113}
1114
1115/**
03f2abb0 1116 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
d38ceaf9
AD
1117 *
1118 * @adev: amdgpu_device pointer
1119 *
455a7bc2 1120 * Initializes writeback and allocates writeback memory (all asics).
d38ceaf9
AD
1121 * Used at driver startup.
1122 * Returns 0 on success or a negative error code on failure.
1123 */
06ec9070 1124static int amdgpu_device_wb_init(struct amdgpu_device *adev)
d38ceaf9
AD
1125{
1126 int r;
1127
1128 if (adev->wb.wb_obj == NULL) {
97407b63
AD
1129 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1130 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
a76ed485
AD
1131 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1132 &adev->wb.wb_obj, &adev->wb.gpu_addr,
1133 (void **)&adev->wb.wb);
d38ceaf9
AD
1134 if (r) {
1135 dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1136 return r;
1137 }
d38ceaf9
AD
1138
1139 adev->wb.num_wb = AMDGPU_MAX_WB;
1140 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1141
1142 /* clear wb memory */
73469585 1143 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
d38ceaf9
AD
1144 }
1145
1146 return 0;
1147}
1148
1149/**
131b4b36 1150 * amdgpu_device_wb_get - Allocate a wb entry
d38ceaf9
AD
1151 *
1152 * @adev: amdgpu_device pointer
1153 * @wb: wb index
1154 *
1155 * Allocate a wb slot for use by the driver (all asics).
1156 * Returns 0 on success or -EINVAL on failure.
1157 */
131b4b36 1158int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
d38ceaf9
AD
1159{
1160 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
d38ceaf9 1161
97407b63 1162 if (offset < adev->wb.num_wb) {
7014285a 1163 __set_bit(offset, adev->wb.used);
63ae07ca 1164 *wb = offset << 3; /* convert to dw offset */
0915fdbc
ML
1165 return 0;
1166 } else {
1167 return -EINVAL;
1168 }
1169}
1170
d38ceaf9 1171/**
131b4b36 1172 * amdgpu_device_wb_free - Free a wb entry
d38ceaf9
AD
1173 *
1174 * @adev: amdgpu_device pointer
1175 * @wb: wb index
1176 *
1177 * Free a wb slot allocated for use by the driver (all asics)
1178 */
131b4b36 1179void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
d38ceaf9 1180{
73469585 1181 wb >>= 3;
d38ceaf9 1182 if (wb < adev->wb.num_wb)
73469585 1183 __clear_bit(wb, adev->wb.used);
d38ceaf9
AD
1184}
1185
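/*
 * Illustrative sketch (not part of the original file): a ring or IP block
 * typically reserves a writeback slot, derives its GPU and CPU addresses,
 * and frees it again on teardown. wb_idx is a placeholder local variable.
 *
 *	u32 wb_idx;
 *
 *	if (amdgpu_device_wb_get(adev, &wb_idx))
 *		return -EINVAL;
 *	// GPU address of the slot: adev->wb.gpu_addr + wb_idx * 4
 *	// CPU view of the slot:    &adev->wb.wb[wb_idx]
 *	amdgpu_device_wb_free(adev, wb_idx);
 */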
d6895ad3
CK
1186/**
1187 * amdgpu_device_resize_fb_bar - try to resize FB BAR
1188 *
1189 * @adev: amdgpu_device pointer
1190 *
1191 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1192 * to fail, but if any of the BARs is not accessible after the resize we abort
1193 * driver loading by returning -ENODEV.
1194 */
1195int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1196{
453f617a 1197 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
31b8adab
CK
1198 struct pci_bus *root;
1199 struct resource *res;
1200 unsigned i;
d6895ad3
CK
1201 u16 cmd;
1202 int r;
1203
0c03b912 1204 /* Bypass for VF */
1205 if (amdgpu_sriov_vf(adev))
1206 return 0;
1207
b7221f2b
AD
1208 /* skip if the bios has already enabled large BAR */
1209 if (adev->gmc.real_vram_size &&
1210 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1211 return 0;
1212
31b8adab
CK
1213 /* Check if the root BUS has 64bit memory resources */
1214 root = adev->pdev->bus;
1215 while (root->parent)
1216 root = root->parent;
1217
1218 pci_bus_for_each_resource(root, res, i) {
0ebb7c54 1219 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
31b8adab
CK
1220 res->start > 0x100000000ull)
1221 break;
1222 }
1223
1224 /* Trying to resize is pointless without a root hub window above 4GB */
1225 if (!res)
1226 return 0;
1227
453f617a
ND
1228 /* Limit the BAR size to what is available */
1229 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
1230 rbar_size);
1231
d6895ad3
CK
1232 /* Disable memory decoding while we change the BAR addresses and size */
1233 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1234 pci_write_config_word(adev->pdev, PCI_COMMAND,
1235 cmd & ~PCI_COMMAND_MEMORY);
1236
1237 /* Free the VRAM and doorbell BAR, we most likely need to move both. */
06ec9070 1238 amdgpu_device_doorbell_fini(adev);
d6895ad3
CK
1239 if (adev->asic_type >= CHIP_BONAIRE)
1240 pci_release_resource(adev->pdev, 2);
1241
1242 pci_release_resource(adev->pdev, 0);
1243
1244 r = pci_resize_resource(adev->pdev, 0, rbar_size);
1245 if (r == -ENOSPC)
1246 DRM_INFO("Not enough PCI address space for a large BAR.");
1247 else if (r && r != -ENOTSUPP)
1248 DRM_ERROR("Problem resizing BAR0 (%d).", r);
1249
1250 pci_assign_unassigned_bus_resources(adev->pdev->bus);
1251
1252 /* When the doorbell or fb BAR isn't available we have no chance of
1253 * using the device.
1254 */
06ec9070 1255 r = amdgpu_device_doorbell_init(adev);
d6895ad3
CK
1256 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1257 return -ENODEV;
1258
1259 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1260
1261 return 0;
1262}
a05502e5 1263
d38ceaf9
AD
1264/*
1265 * GPU helper functions.
1266 */
1267/**
39c640c0 1268 * amdgpu_device_need_post - check if the hw needs to be posted
d38ceaf9
AD
1269 *
1270 * @adev: amdgpu_device pointer
1271 *
c836fec5
JQ
1272 * Check if the asic has been initialized (all asics) at driver startup
1273 * or post is needed if hw reset is performed.
1274 * Returns true if post is needed or false if not.
d38ceaf9 1275 */
39c640c0 1276bool amdgpu_device_need_post(struct amdgpu_device *adev)
d38ceaf9
AD
1277{
1278 uint32_t reg;
1279
bec86378
ML
1280 if (amdgpu_sriov_vf(adev))
1281 return false;
1282
1283 if (amdgpu_passthrough(adev)) {
1da2c326
ML
1284 /* for FIJI: In the whole GPU pass-through virtualization case, after VM reboot
1285 * some old SMC firmware still needs the driver to do a vPost, otherwise the GPU
1286 * hangs. SMC firmware versions above 22.15 don't have this flaw, so we force
1287 * vPost for SMC versions below 22.15
bec86378
ML
1288 */
1289 if (adev->asic_type == CHIP_FIJI) {
1290 int err;
1291 uint32_t fw_ver;
1292 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1293 /* force vPost if error occurred */
1294 if (err)
1295 return true;
1296
1297 fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1da2c326
ML
1298 if (fw_ver < 0x00160e00)
1299 return true;
bec86378 1300 }
bec86378 1301 }
91fe77eb 1302
e3c1b071 1303 /* Don't post if we need to reset whole hive on init */
1304 if (adev->gmc.xgmi.pending_reset)
1305 return false;
1306
91fe77eb 1307 if (adev->has_hw_reset) {
1308 adev->has_hw_reset = false;
1309 return true;
1310 }
1311
1312 /* bios scratch used on CIK+ */
1313 if (adev->asic_type >= CHIP_BONAIRE)
1314 return amdgpu_atombios_scratch_need_asic_init(adev);
1315
1316 /* check MEM_SIZE for older asics */
1317 reg = amdgpu_asic_get_config_memsize(adev);
1318
1319 if ((reg != 0) && (reg != 0xffffffff))
1320 return false;
1321
1322 return true;
bec86378
ML
1323}
1324
0ab5d711
ML
1325/**
1326 * amdgpu_device_should_use_aspm - check if the device should program ASPM
1327 *
1328 * @adev: amdgpu_device pointer
1329 *
1330 * Confirm whether the module parameter and pcie bridge agree that ASPM should
1331 * be set for this device.
1332 *
1333 * Returns true if it should be used or false if not.
1334 */
1335bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
1336{
1337 switch (amdgpu_aspm) {
1338 case -1:
1339 break;
1340 case 0:
1341 return false;
1342 case 1:
1343 return true;
1344 default:
1345 return false;
1346 }
1347 return pcie_aspm_enabled(adev->pdev);
1348}
1349
d38ceaf9
AD
1350/* if we get transitioned to only one device, take VGA back */
1351/**
06ec9070 1352 * amdgpu_device_vga_set_decode - enable/disable vga decode
d38ceaf9 1353 *
bf44e8ce 1354 * @pdev: PCI device pointer
d38ceaf9
AD
1355 * @state: enable/disable vga decode
1356 *
1357 * Enable/disable vga decode (all asics).
1358 * Returns VGA resource flags.
1359 */
bf44e8ce
CH
1360static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
1361 bool state)
d38ceaf9 1362{
bf44e8ce 1363 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));
d38ceaf9
AD
1364 amdgpu_asic_set_vga_state(adev, state);
1365 if (state)
1366 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1367 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1368 else
1369 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1370}
1371
e3ecdffa
AD
1372/**
1373 * amdgpu_device_check_block_size - validate the vm block size
1374 *
1375 * @adev: amdgpu_device pointer
1376 *
1377 * Validates the vm block size specified via module parameter.
1378 * The vm block size defines the number of bits in the page table versus the page directory,
1379 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1380 * page table and the remaining bits are in the page directory.
1381 */
06ec9070 1382static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
a1adf8be
CZ
1383{
1384 /* defines number of bits in page table versus page directory,
1385 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1386 * page table and the remaining bits are in the page directory */
bab4fee7
JZ
1387 if (amdgpu_vm_block_size == -1)
1388 return;
a1adf8be 1389
bab4fee7 1390 if (amdgpu_vm_block_size < 9) {
a1adf8be
CZ
1391 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1392 amdgpu_vm_block_size);
97489129 1393 amdgpu_vm_block_size = -1;
a1adf8be 1394 }
a1adf8be
CZ
1395}
1396
e3ecdffa
AD
1397/**
1398 * amdgpu_device_check_vm_size - validate the vm size
1399 *
1400 * @adev: amdgpu_device pointer
1401 *
1402 * Validates the vm size in GB specified via module parameter.
1403 * The VM size is the size of the GPU virtual memory space in GB.
1404 */
06ec9070 1405static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
83ca145d 1406{
64dab074
AD
1407 /* no need to check the default value */
1408 if (amdgpu_vm_size == -1)
1409 return;
1410
83ca145d
ZJ
1411 if (amdgpu_vm_size < 1) {
1412 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1413 amdgpu_vm_size);
f3368128 1414 amdgpu_vm_size = -1;
83ca145d 1415 }
83ca145d
ZJ
1416}
1417
7951e376
RZ
1418static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1419{
1420 struct sysinfo si;
a9d4fe2f 1421 bool is_os_64 = (sizeof(void *) == 8);
7951e376
RZ
1422 uint64_t total_memory;
1423 uint64_t dram_size_seven_GB = 0x1B8000000;
1424 uint64_t dram_size_three_GB = 0xB8000000;
1425
1426 if (amdgpu_smu_memory_pool_size == 0)
1427 return;
1428
1429 if (!is_os_64) {
1430 DRM_WARN("Not 64-bit OS, feature not supported\n");
1431 goto def_value;
1432 }
1433 si_meminfo(&si);
1434 total_memory = (uint64_t)si.totalram * si.mem_unit;
1435
1436 if ((amdgpu_smu_memory_pool_size == 1) ||
1437 (amdgpu_smu_memory_pool_size == 2)) {
1438 if (total_memory < dram_size_three_GB)
1439 goto def_value1;
1440 } else if ((amdgpu_smu_memory_pool_size == 4) ||
1441 (amdgpu_smu_memory_pool_size == 8)) {
1442 if (total_memory < dram_size_seven_GB)
1443 goto def_value1;
1444 } else {
1445 DRM_WARN("Smu memory pool size not supported\n");
1446 goto def_value;
1447 }
1448 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1449
1450 return;
1451
1452def_value1:
1453 DRM_WARN("No enough system memory\n");
1454def_value:
1455 adev->pm.smu_prv_buffer_size = 0;
1456}
1457
9f6a7857
HR
1458static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
1459{
1460 if (!(adev->flags & AMD_IS_APU) ||
1461 adev->asic_type < CHIP_RAVEN)
1462 return 0;
1463
1464 switch (adev->asic_type) {
1465 case CHIP_RAVEN:
1466 if (adev->pdev->device == 0x15dd)
1467 adev->apu_flags |= AMD_APU_IS_RAVEN;
1468 if (adev->pdev->device == 0x15d8)
1469 adev->apu_flags |= AMD_APU_IS_PICASSO;
1470 break;
1471 case CHIP_RENOIR:
1472 if ((adev->pdev->device == 0x1636) ||
1473 (adev->pdev->device == 0x164c))
1474 adev->apu_flags |= AMD_APU_IS_RENOIR;
1475 else
1476 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
1477 break;
1478 case CHIP_VANGOGH:
1479 adev->apu_flags |= AMD_APU_IS_VANGOGH;
1480 break;
1481 case CHIP_YELLOW_CARP:
1482 break;
d0f56dc2 1483 case CHIP_CYAN_SKILLFISH:
dfcc3e8c
AD
1484 if ((adev->pdev->device == 0x13FE) ||
1485 (adev->pdev->device == 0x143F))
d0f56dc2
TZ
1486 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
1487 break;
9f6a7857 1488 default:
4eaf21b7 1489 break;
9f6a7857
HR
1490 }
1491
1492 return 0;
1493}
1494
d38ceaf9 1495/**
06ec9070 1496 * amdgpu_device_check_arguments - validate module params
d38ceaf9
AD
1497 *
1498 * @adev: amdgpu_device pointer
1499 *
1500 * Validates certain module parameters and updates
1501 * the associated values used by the driver (all asics).
1502 */
912dfc84 1503static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
d38ceaf9 1504{
5b011235
CZ
1505 if (amdgpu_sched_jobs < 4) {
1506 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1507 amdgpu_sched_jobs);
1508 amdgpu_sched_jobs = 4;
76117507 1509 } else if (!is_power_of_2(amdgpu_sched_jobs)) {
5b011235
CZ
1510 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1511 amdgpu_sched_jobs);
1512 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1513 }
d38ceaf9 1514
83e74db6 1515 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
f9321cc4
CK
1516 /* gart size must be greater or equal to 32M */
1517 dev_warn(adev->dev, "gart size (%d) too small\n",
1518 amdgpu_gart_size);
83e74db6 1519 amdgpu_gart_size = -1;
d38ceaf9
AD
1520 }
1521
36d38372 1522 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
c4e1a13a 1523 /* gtt size must be greater or equal to 32M */
36d38372
CK
1524 dev_warn(adev->dev, "gtt size (%d) too small\n",
1525 amdgpu_gtt_size);
1526 amdgpu_gtt_size = -1;
d38ceaf9
AD
1527 }
1528
d07f14be
RH
1529 /* valid range is between 4 and 9 inclusive */
1530 if (amdgpu_vm_fragment_size != -1 &&
1531 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1532 dev_warn(adev->dev, "valid range is between 4 and 9\n");
1533 amdgpu_vm_fragment_size = -1;
1534 }
1535
5d5bd5e3
KW
1536 if (amdgpu_sched_hw_submission < 2) {
1537 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1538 amdgpu_sched_hw_submission);
1539 amdgpu_sched_hw_submission = 2;
1540 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1541 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1542 amdgpu_sched_hw_submission);
1543 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1544 }
1545
2656fd23
AG
1546 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
1547 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
1548 amdgpu_reset_method = -1;
1549 }
1550
7951e376
RZ
1551 amdgpu_device_check_smu_prv_buffer_size(adev);
1552
06ec9070 1553 amdgpu_device_check_vm_size(adev);
d38ceaf9 1554
06ec9070 1555 amdgpu_device_check_block_size(adev);
6a7f76e7 1556
19aede77 1557 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
912dfc84 1558
e3c00faa 1559 return 0;
d38ceaf9
AD
1560}
1561
1562/**
1563 * amdgpu_switcheroo_set_state - set switcheroo state
1564 *
1565 * @pdev: pci dev pointer
1694467b 1566 * @state: vga_switcheroo state
d38ceaf9
AD
1567 *
1568 * Callback for the switcheroo driver. Suspends or resumes
1569 * the asics before or after they are powered up using ACPI methods.
1570 */
8aba21b7
LT
1571static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1572 enum vga_switcheroo_state state)
d38ceaf9
AD
1573{
1574 struct drm_device *dev = pci_get_drvdata(pdev);
de185019 1575 int r;
d38ceaf9 1576
b98c6299 1577 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
d38ceaf9
AD
1578 return;
1579
1580 if (state == VGA_SWITCHEROO_ON) {
dd4fa6c1 1581 pr_info("switched on\n");
d38ceaf9
AD
1582 /* don't suspend or resume card normally */
1583 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1584
8f66090b
TZ
1585 pci_set_power_state(pdev, PCI_D0);
1586 amdgpu_device_load_pci_state(pdev);
1587 r = pci_enable_device(pdev);
de185019
AD
1588 if (r)
1589 DRM_WARN("pci_enable_device failed (%d)\n", r);
1590 amdgpu_device_resume(dev, true);
d38ceaf9 1591
d38ceaf9 1592 dev->switch_power_state = DRM_SWITCH_POWER_ON;
d38ceaf9 1593 } else {
dd4fa6c1 1594 pr_info("switched off\n");
d38ceaf9 1595 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
de185019 1596 amdgpu_device_suspend(dev, true);
8f66090b 1597 amdgpu_device_cache_pci_state(pdev);
de185019 1598 /* Shut down the device */
8f66090b
TZ
1599 pci_disable_device(pdev);
1600 pci_set_power_state(pdev, PCI_D3cold);
d38ceaf9
AD
1601 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1602 }
1603}
1604
1605/**
1606 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1607 *
1608 * @pdev: pci dev pointer
1609 *
1610 * Callback for the switcheroo driver. Check if the switcheroo
1611 * state can be changed.
1612 * Returns true if the state can be changed, false if not.
1613 */
1614static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1615{
1616 struct drm_device *dev = pci_get_drvdata(pdev);
1617
1618 /*
1619 * FIXME: open_count is protected by drm_global_mutex but that would lead to
1620 * locking inversion with the driver load path. And the access here is
1621 * completely racy anyway. So don't bother with locking for now.
1622 */
7e13ad89 1623 return atomic_read(&dev->open_count) == 0;
d38ceaf9
AD
1624}
1625
1626static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1627 .set_gpu_state = amdgpu_switcheroo_set_state,
1628 .reprobe = NULL,
1629 .can_switch = amdgpu_switcheroo_can_switch,
1630};
1631
e3ecdffa
AD
1632/**
1633 * amdgpu_device_ip_set_clockgating_state - set the CG state
1634 *
87e3f136 1635 * @dev: amdgpu_device pointer
e3ecdffa
AD
1636 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1637 * @state: clockgating state (gate or ungate)
1638 *
1639 * Sets the requested clockgating state for all instances of
1640 * the hardware IP specified.
1641 * Returns the error code from the last instance.
1642 */
43fa561f 1643int amdgpu_device_ip_set_clockgating_state(void *dev,
2990a1fc
AD
1644 enum amd_ip_block_type block_type,
1645 enum amd_clockgating_state state)
d38ceaf9 1646{
43fa561f 1647 struct amdgpu_device *adev = dev;
d38ceaf9
AD
1648 int i, r = 0;
1649
1650 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 1651 if (!adev->ip_blocks[i].status.valid)
9ecbe7f5 1652 continue;
c722865a
RZ
1653 if (adev->ip_blocks[i].version->type != block_type)
1654 continue;
1655 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1656 continue;
1657 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1658 (void *)adev, state);
1659 if (r)
1660 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1661 adev->ip_blocks[i].version->funcs->name, r);
d38ceaf9
AD
1662 }
1663 return r;
1664}
1665
e3ecdffa
AD
1666/**
1667 * amdgpu_device_ip_set_powergating_state - set the PG state
1668 *
87e3f136 1669 * @dev: amdgpu_device pointer
e3ecdffa
AD
1670 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1671 * @state: powergating state (gate or ungate)
1672 *
1673 * Sets the requested powergating state for all instances of
1674 * the hardware IP specified.
1675 * Returns the error code from the last instance.
1676 */
43fa561f 1677int amdgpu_device_ip_set_powergating_state(void *dev,
2990a1fc
AD
1678 enum amd_ip_block_type block_type,
1679 enum amd_powergating_state state)
d38ceaf9 1680{
43fa561f 1681 struct amdgpu_device *adev = dev;
d38ceaf9
AD
1682 int i, r = 0;
1683
1684 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 1685 if (!adev->ip_blocks[i].status.valid)
9ecbe7f5 1686 continue;
c722865a
RZ
1687 if (adev->ip_blocks[i].version->type != block_type)
1688 continue;
1689 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1690 continue;
1691 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1692 (void *)adev, state);
1693 if (r)
1694 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1695 adev->ip_blocks[i].version->funcs->name, r);
d38ceaf9
AD
1696 }
1697 return r;
1698}
1699
e3ecdffa
AD
1700/**
1701 * amdgpu_device_ip_get_clockgating_state - get the CG state
1702 *
1703 * @adev: amdgpu_device pointer
1704 * @flags: clockgating feature flags
1705 *
1706 * Walks the list of IPs on the device and updates the clockgating
1707 * flags for each IP.
1708 * Updates @flags with the feature flags for each hardware IP where
1709 * clockgating is enabled.
1710 */
2990a1fc 1711void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
25faeddc 1712 u64 *flags)
6cb2d4e4
HR
1713{
1714 int i;
1715
1716 for (i = 0; i < adev->num_ip_blocks; i++) {
1717 if (!adev->ip_blocks[i].status.valid)
1718 continue;
1719 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1720 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1721 }
1722}
1723
e3ecdffa
AD
1724/**
1725 * amdgpu_device_ip_wait_for_idle - wait for idle
1726 *
1727 * @adev: amdgpu_device pointer
1728 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1729 *
1730 * Waits for the requested hardware IP to be idle.
1731 * Returns 0 for success or a negative error code on failure.
1732 */
2990a1fc
AD
1733int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1734 enum amd_ip_block_type block_type)
5dbbb60b
AD
1735{
1736 int i, r;
1737
1738 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 1739 if (!adev->ip_blocks[i].status.valid)
9ecbe7f5 1740 continue;
a1255107
AD
1741 if (adev->ip_blocks[i].version->type == block_type) {
1742 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
5dbbb60b
AD
1743 if (r)
1744 return r;
1745 break;
1746 }
1747 }
1748 return 0;
1749
1750}
1751
e3ecdffa
AD
1752/**
1753 * amdgpu_device_ip_is_idle - is the hardware IP idle
1754 *
1755 * @adev: amdgpu_device pointer
1756 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1757 *
1758 * Check if the hardware IP is idle or not.
 1759	 * Returns true if the IP is idle, false if not.
1760 */
2990a1fc
AD
1761bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1762 enum amd_ip_block_type block_type)
5dbbb60b
AD
1763{
1764 int i;
1765
1766 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 1767 if (!adev->ip_blocks[i].status.valid)
9ecbe7f5 1768 continue;
a1255107
AD
1769 if (adev->ip_blocks[i].version->type == block_type)
1770 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
5dbbb60b
AD
1771 }
1772 return true;
1773
1774}
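/*
 * Illustrative sketch, not part of the original file: a typical caller first
 * checks whether an IP is already idle and only then blocks in wait_for_idle.
 * The wrapper name is hypothetical; the two helpers are the ones defined
 * above.
 */
static int amdgpu_example_quiesce_ip(struct amdgpu_device *adev,
				     enum amd_ip_block_type type)
{
	if (amdgpu_device_ip_is_idle(adev, type))
		return 0;

	/* Blocks until the IP's wait_for_idle hook reports idle or fails. */
	return amdgpu_device_ip_wait_for_idle(adev, type);
}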
1775
e3ecdffa
AD
1776/**
1777 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1778 *
1779 * @adev: amdgpu_device pointer
87e3f136 1780 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
e3ecdffa
AD
1781 *
1782 * Returns a pointer to the hardware IP block structure
1783 * if it exists for the asic, otherwise NULL.
1784 */
2990a1fc
AD
1785struct amdgpu_ip_block *
1786amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1787 enum amd_ip_block_type type)
d38ceaf9
AD
1788{
1789 int i;
1790
1791 for (i = 0; i < adev->num_ip_blocks; i++)
a1255107 1792 if (adev->ip_blocks[i].version->type == type)
d38ceaf9
AD
1793 return &adev->ip_blocks[i];
1794
1795 return NULL;
1796}
1797
1798/**
2990a1fc 1799 * amdgpu_device_ip_block_version_cmp
d38ceaf9
AD
1800 *
1801 * @adev: amdgpu_device pointer
5fc3aeeb 1802 * @type: enum amd_ip_block_type
d38ceaf9
AD
1803 * @major: major version
1804 * @minor: minor version
1805 *
 1806	 * Return 0 if the IP block's version is equal to or greater than the
 1807	 * requested version; return 1 if it is smaller or the ip_block doesn't exist.
1808 */
2990a1fc
AD
1809int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1810 enum amd_ip_block_type type,
1811 u32 major, u32 minor)
d38ceaf9 1812{
2990a1fc 1813 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
d38ceaf9 1814
a1255107
AD
1815 if (ip_block && ((ip_block->version->major > major) ||
1816 ((ip_block->version->major == major) &&
1817 (ip_block->version->minor >= minor))))
d38ceaf9
AD
1818 return 0;
1819
1820 return 1;
1821}
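/*
 * Illustrative sketch, not part of the original file: gating a code path on
 * a minimum IP block version. The 8.1 version numbers are made up for the
 * example; note the helper returns 0 for "equal or greater" and 1 otherwise.
 */
static bool amdgpu_example_has_min_gfx_version(struct amdgpu_device *adev)
{
	return amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_GFX,
						  8, 1) == 0;
}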
1822
a1255107 1823/**
2990a1fc 1824 * amdgpu_device_ip_block_add
a1255107
AD
1825 *
1826 * @adev: amdgpu_device pointer
1827 * @ip_block_version: pointer to the IP to add
1828 *
1829 * Adds the IP block driver information to the collection of IPs
1830 * on the asic.
1831 */
2990a1fc
AD
1832int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1833 const struct amdgpu_ip_block_version *ip_block_version)
a1255107
AD
1834{
1835 if (!ip_block_version)
1836 return -EINVAL;
1837
7bd939d0
LG
1838 switch (ip_block_version->type) {
1839 case AMD_IP_BLOCK_TYPE_VCN:
1840 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
1841 return 0;
1842 break;
1843 case AMD_IP_BLOCK_TYPE_JPEG:
1844 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
1845 return 0;
1846 break;
1847 default:
1848 break;
1849 }
1850
e966a725 1851 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
a0bae357
HR
1852 ip_block_version->funcs->name);
1853
a1255107
AD
1854 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1855
1856 return 0;
1857}
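/*
 * Illustrative sketch, not part of the original file: how an ASIC setup
 * routine typically registers an IP block with the helper above. The
 * example_* names are hypothetical placeholders; real tables also fill in
 * the early_init/sw_init/hw_init/... hooks of struct amd_ip_funcs.
 */
static const struct amd_ip_funcs example_ip_funcs = {
	.name = "example_common",
};

static const struct amdgpu_ip_block_version example_common_ip_block = {
	.type = AMD_IP_BLOCK_TYPE_COMMON,
	.major = 1,
	.minor = 0,
	.rev = 0,
	.funcs = &example_ip_funcs,
};

static int example_set_ip_blocks(struct amdgpu_device *adev)
{
	return amdgpu_device_ip_block_add(adev, &example_common_ip_block);
}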
1858
e3ecdffa
AD
1859/**
1860 * amdgpu_device_enable_virtual_display - enable virtual display feature
1861 *
1862 * @adev: amdgpu_device pointer
1863 *
 1864	 * Enables the virtual display feature if the user has enabled it via
 1865	 * the module parameter virtual_display. This feature provides virtual
 1866	 * display hardware on headless boards or in virtualized environments.
 1867	 * This function parses and validates the configuration string specified by
 1868	 * the user and configures the virtual display configuration (number of
 1869	 * virtual connectors, crtcs, etc.) specified.
1870 */
483ef985 1871static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
9accf2fd
ED
1872{
1873 adev->enable_virtual_display = false;
1874
1875 if (amdgpu_virtual_display) {
8f66090b 1876 const char *pci_address_name = pci_name(adev->pdev);
0f66356d 1877 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
9accf2fd
ED
1878
1879 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1880 pciaddstr_tmp = pciaddstr;
0f66356d
ED
1881 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1882 pciaddname = strsep(&pciaddname_tmp, ",");
967de2a9
YT
1883 if (!strcmp("all", pciaddname)
1884 || !strcmp(pci_address_name, pciaddname)) {
0f66356d
ED
1885 long num_crtc;
1886 int res = -1;
1887
9accf2fd 1888 adev->enable_virtual_display = true;
0f66356d
ED
1889
1890 if (pciaddname_tmp)
1891 res = kstrtol(pciaddname_tmp, 10,
1892 &num_crtc);
1893
1894 if (!res) {
1895 if (num_crtc < 1)
1896 num_crtc = 1;
1897 if (num_crtc > 6)
1898 num_crtc = 6;
1899 adev->mode_info.num_crtc = num_crtc;
1900 } else {
1901 adev->mode_info.num_crtc = 1;
1902 }
9accf2fd
ED
1903 break;
1904 }
1905 }
1906
0f66356d
ED
1907 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1908 amdgpu_virtual_display, pci_address_name,
1909 adev->enable_virtual_display, adev->mode_info.num_crtc);
9accf2fd
ED
1910
1911 kfree(pciaddstr);
1912 }
1913}
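/*
 * Illustrative examples, not part of the original file, of the
 * virtual_display module parameter format parsed above: semicolon-separated
 * entries, each a PCI address (or "all") optionally followed by ",<num_crtc>"
 * (clamped to 1..6). The PCI addresses are placeholders.
 *
 *   modprobe amdgpu virtual_display=0000:01:00.0,2
 *   modprobe amdgpu virtual_display="0000:01:00.0,1;0000:02:00.0,4"
 *   modprobe amdgpu virtual_display=all
 */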
1914
e3ecdffa
AD
1915/**
1916 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1917 *
1918 * @adev: amdgpu_device pointer
1919 *
1920 * Parses the asic configuration parameters specified in the gpu info
 1921	 * firmware and makes them available to the driver for use in configuring
1922 * the asic.
1923 * Returns 0 on success, -EINVAL on failure.
1924 */
e2a75f88
AD
1925static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1926{
e2a75f88 1927 const char *chip_name;
c0a43457 1928 char fw_name[40];
e2a75f88
AD
1929 int err;
1930 const struct gpu_info_firmware_header_v1_0 *hdr;
1931
ab4fe3e1
HR
1932 adev->firmware.gpu_info_fw = NULL;
1933
72de33f8 1934 if (adev->mman.discovery_bin) {
cc375d8c
TY
1935 /*
1936 * FIXME: The bounding box is still needed by Navi12, so
e24d0e91 1937 * temporarily read it from gpu_info firmware. Should be dropped
cc375d8c
TY
1938 * when DAL no longer needs it.
1939 */
1940 if (adev->asic_type != CHIP_NAVI12)
1941 return 0;
258620d0
AD
1942 }
1943
e2a75f88 1944 switch (adev->asic_type) {
e2a75f88
AD
1945 default:
1946 return 0;
1947 case CHIP_VEGA10:
1948 chip_name = "vega10";
1949 break;
3f76dced
AD
1950 case CHIP_VEGA12:
1951 chip_name = "vega12";
1952 break;
2d2e5e7e 1953 case CHIP_RAVEN:
54f78a76 1954 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
54c4d17e 1955 chip_name = "raven2";
54f78a76 1956 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
741deade 1957 chip_name = "picasso";
54c4d17e
FX
1958 else
1959 chip_name = "raven";
2d2e5e7e 1960 break;
65e60f6e
LM
1961 case CHIP_ARCTURUS:
1962 chip_name = "arcturus";
1963 break;
42b325e5
XY
1964 case CHIP_NAVI12:
1965 chip_name = "navi12";
1966 break;
e2a75f88
AD
1967 }
1968
1969 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
ab4fe3e1 1970 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
e2a75f88
AD
1971 if (err) {
1972 dev_err(adev->dev,
1973 "Failed to load gpu_info firmware \"%s\"\n",
1974 fw_name);
1975 goto out;
1976 }
ab4fe3e1 1977 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
e2a75f88
AD
1978 if (err) {
1979 dev_err(adev->dev,
1980 "Failed to validate gpu_info firmware \"%s\"\n",
1981 fw_name);
1982 goto out;
1983 }
1984
ab4fe3e1 1985 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
e2a75f88
AD
1986 amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1987
1988 switch (hdr->version_major) {
1989 case 1:
1990 {
1991 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
ab4fe3e1 1992 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
e2a75f88
AD
1993 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1994
cc375d8c
TY
1995 /*
 1996	 * Should be dropped when DAL no longer needs it.
1997 */
1998 if (adev->asic_type == CHIP_NAVI12)
ec51d3fa
XY
1999 goto parse_soc_bounding_box;
2000
b5ab16bf
AD
2001 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
2002 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
2003 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
2004 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
e2a75f88 2005 adev->gfx.config.max_texture_channel_caches =
b5ab16bf
AD
2006 le32_to_cpu(gpu_info_fw->gc_num_tccs);
2007 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
2008 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
2009 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
2010 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
e2a75f88 2011 adev->gfx.config.double_offchip_lds_buf =
b5ab16bf
AD
2012 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
2013 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
51fd0370
HZ
2014 adev->gfx.cu_info.max_waves_per_simd =
2015 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
2016 adev->gfx.cu_info.max_scratch_slots_per_cu =
2017 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
2018 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
48321c3d 2019 if (hdr->version_minor >= 1) {
35c2e910
HZ
2020 const struct gpu_info_firmware_v1_1 *gpu_info_fw =
2021 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
2022 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2023 adev->gfx.config.num_sc_per_sh =
2024 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
2025 adev->gfx.config.num_packer_per_sc =
2026 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
2027 }
ec51d3fa
XY
2028
2029parse_soc_bounding_box:
ec51d3fa
XY
2030 /*
 2031	 * soc bounding box info is not integrated into the discovery table,
258620d0 2032	 * so we always need to parse it from the gpu info firmware when needed.
ec51d3fa 2033 */
48321c3d
HW
2034 if (hdr->version_minor == 2) {
2035 const struct gpu_info_firmware_v1_2 *gpu_info_fw =
2036 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
2037 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2038 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
2039 }
e2a75f88
AD
2040 break;
2041 }
2042 default:
2043 dev_err(adev->dev,
2044 "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
2045 err = -EINVAL;
2046 goto out;
2047 }
2048out:
e2a75f88
AD
2049 return err;
2050}
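/*
 * Illustrative note, not part of the original file: the request_firmware()
 * call above uses the "amdgpu/<chip>_gpu_info.bin" naming convention, so a
 * Raven2 APU loads "amdgpu/raven2_gpu_info.bin" while Navi12 loads
 * "amdgpu/navi12_gpu_info.bin" (the latter only for the SoC bounding box,
 * per the FIXME above).
 */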
2051
e3ecdffa
AD
2052/**
2053 * amdgpu_device_ip_early_init - run early init for hardware IPs
2054 *
2055 * @adev: amdgpu_device pointer
2056 *
2057 * Early initialization pass for hardware IPs. The hardware IPs that make
 2058	 * up each asic are discovered and each IP's early_init callback is run. This
2059 * is the first stage in initializing the asic.
2060 * Returns 0 on success, negative error code on failure.
2061 */
06ec9070 2062static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
d38ceaf9 2063{
901e2be2
AD
2064 struct drm_device *dev = adev_to_drm(adev);
2065 struct pci_dev *parent;
aaa36a97 2066 int i, r;
d38ceaf9 2067
483ef985 2068 amdgpu_device_enable_virtual_display(adev);
a6be7570 2069
00a979f3 2070 if (amdgpu_sriov_vf(adev)) {
00a979f3 2071 r = amdgpu_virt_request_full_gpu(adev, true);
aaa36a97
AD
2072 if (r)
2073 return r;
00a979f3
WS
2074 }
2075
d38ceaf9 2076 switch (adev->asic_type) {
33f34802
KW
2077#ifdef CONFIG_DRM_AMDGPU_SI
2078 case CHIP_VERDE:
2079 case CHIP_TAHITI:
2080 case CHIP_PITCAIRN:
2081 case CHIP_OLAND:
2082 case CHIP_HAINAN:
295d0daf 2083 adev->family = AMDGPU_FAMILY_SI;
33f34802
KW
2084 r = si_set_ip_blocks(adev);
2085 if (r)
2086 return r;
2087 break;
2088#endif
a2e73f56
AD
2089#ifdef CONFIG_DRM_AMDGPU_CIK
2090 case CHIP_BONAIRE:
2091 case CHIP_HAWAII:
2092 case CHIP_KAVERI:
2093 case CHIP_KABINI:
2094 case CHIP_MULLINS:
e1ad2d53 2095 if (adev->flags & AMD_IS_APU)
a2e73f56 2096 adev->family = AMDGPU_FAMILY_KV;
e1ad2d53
AD
2097 else
2098 adev->family = AMDGPU_FAMILY_CI;
a2e73f56
AD
2099
2100 r = cik_set_ip_blocks(adev);
2101 if (r)
2102 return r;
2103 break;
2104#endif
da87c30b
AD
2105 case CHIP_TOPAZ:
2106 case CHIP_TONGA:
2107 case CHIP_FIJI:
2108 case CHIP_POLARIS10:
2109 case CHIP_POLARIS11:
2110 case CHIP_POLARIS12:
2111 case CHIP_VEGAM:
2112 case CHIP_CARRIZO:
2113 case CHIP_STONEY:
2114 if (adev->flags & AMD_IS_APU)
2115 adev->family = AMDGPU_FAMILY_CZ;
2116 else
2117 adev->family = AMDGPU_FAMILY_VI;
2118
2119 r = vi_set_ip_blocks(adev);
2120 if (r)
2121 return r;
2122 break;
d38ceaf9 2123 default:
63352b7f
AD
2124 r = amdgpu_discovery_set_ip_blocks(adev);
2125 if (r)
2126 return r;
2127 break;
d38ceaf9
AD
2128 }
2129
901e2be2
AD
2130 if (amdgpu_has_atpx() &&
2131 (amdgpu_is_atpx_hybrid() ||
2132 amdgpu_has_atpx_dgpu_power_cntl()) &&
2133 ((adev->flags & AMD_IS_APU) == 0) &&
2134 !pci_is_thunderbolt_attached(to_pci_dev(dev->dev)))
2135 adev->flags |= AMD_IS_PX;
2136
85ac2021
AD
2137 if (!(adev->flags & AMD_IS_APU)) {
2138 parent = pci_upstream_bridge(adev->pdev);
2139 adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
2140 }
901e2be2 2141
c004d44e 2142 amdgpu_amdkfd_device_probe(adev);
1884734a 2143
3b94fb10 2144 adev->pm.pp_feature = amdgpu_pp_feature_mask;
a35ad98b 2145 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
00544006 2146 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
4215a119
HC
2147 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2148 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
00f54b97 2149
d38ceaf9
AD
2150 for (i = 0; i < adev->num_ip_blocks; i++) {
2151 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
ed8cf00c
HR
2152 DRM_ERROR("disabled ip block: %d <%s>\n",
2153 i, adev->ip_blocks[i].version->funcs->name);
a1255107 2154 adev->ip_blocks[i].status.valid = false;
d38ceaf9 2155 } else {
a1255107
AD
2156 if (adev->ip_blocks[i].version->funcs->early_init) {
2157 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2c1a2784 2158 if (r == -ENOENT) {
a1255107 2159 adev->ip_blocks[i].status.valid = false;
2c1a2784 2160 } else if (r) {
a1255107
AD
2161 DRM_ERROR("early_init of IP block <%s> failed %d\n",
2162 adev->ip_blocks[i].version->funcs->name, r);
d38ceaf9 2163 return r;
2c1a2784 2164 } else {
a1255107 2165 adev->ip_blocks[i].status.valid = true;
2c1a2784 2166 }
974e6b64 2167 } else {
a1255107 2168 adev->ip_blocks[i].status.valid = true;
d38ceaf9 2169 }
d38ceaf9 2170 }
21a249ca
AD
2171 /* get the vbios after the asic_funcs are set up */
2172 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
6e29c227
AD
2173 r = amdgpu_device_parse_gpu_info_fw(adev);
2174 if (r)
2175 return r;
2176
21a249ca
AD
2177 /* Read BIOS */
2178 if (!amdgpu_get_bios(adev))
2179 return -EINVAL;
2180
2181 r = amdgpu_atombios_init(adev);
2182 if (r) {
2183 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2184 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2185 return r;
2186 }
77eabc6f
PJZ
2187
 2188	/* get pf2vf msg info at its earliest time */
2189 if (amdgpu_sriov_vf(adev))
2190 amdgpu_virt_init_data_exchange(adev);
2191
21a249ca 2192 }
d38ceaf9
AD
2193 }
2194
395d1fb9
NH
2195 adev->cg_flags &= amdgpu_cg_mask;
2196 adev->pg_flags &= amdgpu_pg_mask;
2197
d38ceaf9
AD
2198 return 0;
2199}
2200
0a4f2520
RZ
2201static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2202{
2203 int i, r;
2204
2205 for (i = 0; i < adev->num_ip_blocks; i++) {
2206 if (!adev->ip_blocks[i].status.sw)
2207 continue;
2208 if (adev->ip_blocks[i].status.hw)
2209 continue;
2210 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2d11fd3f 2211 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
0a4f2520
RZ
2212 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2213 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2214 if (r) {
2215 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2216 adev->ip_blocks[i].version->funcs->name, r);
2217 return r;
2218 }
2219 adev->ip_blocks[i].status.hw = true;
2220 }
2221 }
2222
2223 return 0;
2224}
2225
2226static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2227{
2228 int i, r;
2229
2230 for (i = 0; i < adev->num_ip_blocks; i++) {
2231 if (!adev->ip_blocks[i].status.sw)
2232 continue;
2233 if (adev->ip_blocks[i].status.hw)
2234 continue;
2235 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2236 if (r) {
2237 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2238 adev->ip_blocks[i].version->funcs->name, r);
2239 return r;
2240 }
2241 adev->ip_blocks[i].status.hw = true;
2242 }
2243
2244 return 0;
2245}
2246
7a3e0bb2
RZ
2247static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2248{
2249 int r = 0;
2250 int i;
80f41f84 2251 uint32_t smu_version;
7a3e0bb2
RZ
2252
2253 if (adev->asic_type >= CHIP_VEGA10) {
2254 for (i = 0; i < adev->num_ip_blocks; i++) {
482f0e53
ML
2255 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2256 continue;
2257
e3c1b071 2258 if (!adev->ip_blocks[i].status.sw)
2259 continue;
2260
482f0e53
ML
2261 /* no need to do the fw loading again if already done*/
2262 if (adev->ip_blocks[i].status.hw == true)
2263 break;
2264
53b3f8f4 2265 if (amdgpu_in_reset(adev) || adev->in_suspend) {
482f0e53
ML
2266 r = adev->ip_blocks[i].version->funcs->resume(adev);
2267 if (r) {
2268 DRM_ERROR("resume of IP block <%s> failed %d\n",
7a3e0bb2 2269 adev->ip_blocks[i].version->funcs->name, r);
482f0e53
ML
2270 return r;
2271 }
2272 } else {
2273 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2274 if (r) {
2275 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2276 adev->ip_blocks[i].version->funcs->name, r);
2277 return r;
7a3e0bb2 2278 }
7a3e0bb2 2279 }
482f0e53
ML
2280
2281 adev->ip_blocks[i].status.hw = true;
2282 break;
7a3e0bb2
RZ
2283 }
2284 }
482f0e53 2285
8973d9ec
ED
2286 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2287 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
7a3e0bb2 2288
80f41f84 2289 return r;
7a3e0bb2
RZ
2290}
2291
5fd8518d
AG
2292static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
2293{
2294 long timeout;
2295 int r, i;
2296
2297 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
2298 struct amdgpu_ring *ring = adev->rings[i];
2299
 2300	/* No need to set up the GPU scheduler for rings that don't need it */
2301 if (!ring || ring->no_scheduler)
2302 continue;
2303
2304 switch (ring->funcs->type) {
2305 case AMDGPU_RING_TYPE_GFX:
2306 timeout = adev->gfx_timeout;
2307 break;
2308 case AMDGPU_RING_TYPE_COMPUTE:
2309 timeout = adev->compute_timeout;
2310 break;
2311 case AMDGPU_RING_TYPE_SDMA:
2312 timeout = adev->sdma_timeout;
2313 break;
2314 default:
2315 timeout = adev->video_timeout;
2316 break;
2317 }
2318
2319 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
2320 ring->num_hw_submission, amdgpu_job_hang_limit,
8ab62eda
JG
2321 timeout, adev->reset_domain->wq,
2322 ring->sched_score, ring->name,
2323 adev->dev);
5fd8518d
AG
2324 if (r) {
2325 DRM_ERROR("Failed to create scheduler on ring %s.\n",
2326 ring->name);
2327 return r;
2328 }
2329 }
2330
2331 return 0;
2332}
2333
2334
e3ecdffa
AD
2335/**
2336 * amdgpu_device_ip_init - run init for hardware IPs
2337 *
2338 * @adev: amdgpu_device pointer
2339 *
2340 * Main initialization pass for hardware IPs. The list of all the hardware
2341 * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2342 * are run. sw_init initializes the software state associated with each IP
2343 * and hw_init initializes the hardware associated with each IP.
2344 * Returns 0 on success, negative error code on failure.
2345 */
06ec9070 2346static int amdgpu_device_ip_init(struct amdgpu_device *adev)
d38ceaf9
AD
2347{
2348 int i, r;
2349
c030f2e4 2350 r = amdgpu_ras_init(adev);
2351 if (r)
2352 return r;
2353
d38ceaf9 2354 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 2355 if (!adev->ip_blocks[i].status.valid)
d38ceaf9 2356 continue;
a1255107 2357 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2c1a2784 2358 if (r) {
a1255107
AD
2359 DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2360 adev->ip_blocks[i].version->funcs->name, r);
72d3f592 2361 goto init_failed;
2c1a2784 2362 }
a1255107 2363 adev->ip_blocks[i].status.sw = true;
bfca0289 2364
d38ceaf9 2365 /* need to do gmc hw init early so we can allocate gpu mem */
a1255107 2366 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
892deb48
VS
2367 /* Try to reserve bad pages early */
2368 if (amdgpu_sriov_vf(adev))
2369 amdgpu_virt_exchange_data(adev);
2370
06ec9070 2371 r = amdgpu_device_vram_scratch_init(adev);
2c1a2784
AD
2372 if (r) {
2373 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
72d3f592 2374 goto init_failed;
2c1a2784 2375 }
a1255107 2376 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2c1a2784
AD
2377 if (r) {
2378 DRM_ERROR("hw_init %d failed %d\n", i, r);
72d3f592 2379 goto init_failed;
2c1a2784 2380 }
06ec9070 2381 r = amdgpu_device_wb_init(adev);
2c1a2784 2382 if (r) {
06ec9070 2383 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
72d3f592 2384 goto init_failed;
2c1a2784 2385 }
a1255107 2386 adev->ip_blocks[i].status.hw = true;
2493664f
ML
2387
2388 /* right after GMC hw init, we create CSA */
f92d5c61 2389 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
1e256e27
RZ
2390 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2391 AMDGPU_GEM_DOMAIN_VRAM,
2392 AMDGPU_CSA_SIZE);
2493664f
ML
2393 if (r) {
2394 DRM_ERROR("allocate CSA failed %d\n", r);
72d3f592 2395 goto init_failed;
2493664f
ML
2396 }
2397 }
d38ceaf9
AD
2398 }
2399 }
2400
c9ffa427 2401 if (amdgpu_sriov_vf(adev))
22c16d25 2402 amdgpu_virt_init_data_exchange(adev);
c9ffa427 2403
533aed27
AG
2404 r = amdgpu_ib_pool_init(adev);
2405 if (r) {
2406 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2407 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2408 goto init_failed;
2409 }
2410
c8963ea4
RZ
2411 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2412 if (r)
72d3f592 2413 goto init_failed;
0a4f2520
RZ
2414
2415 r = amdgpu_device_ip_hw_init_phase1(adev);
2416 if (r)
72d3f592 2417 goto init_failed;
0a4f2520 2418
7a3e0bb2
RZ
2419 r = amdgpu_device_fw_loading(adev);
2420 if (r)
72d3f592 2421 goto init_failed;
7a3e0bb2 2422
0a4f2520
RZ
2423 r = amdgpu_device_ip_hw_init_phase2(adev);
2424 if (r)
72d3f592 2425 goto init_failed;
d38ceaf9 2426
121a2bc6
AG
2427 /*
2428 * retired pages will be loaded from eeprom and reserved here,
2429 * it should be called after amdgpu_device_ip_hw_init_phase2 since
2430 * for some ASICs the RAS EEPROM code relies on SMU fully functioning
 2431	 * for I2C communication, which is only true at this point.
b82e65a9
GC
2432 *
 2433	 * amdgpu_ras_recovery_init may fail, but the upper layers only care
 2434	 * about failures caused by a bad GPU state and stop the amdgpu init
 2435	 * process accordingly. For other failure cases it still releases all
 2436	 * the resources and prints an error message rather than returning a
 2437	 * negative value to the upper level.
121a2bc6
AG
2438 *
2439 * Note: theoretically, this should be called before all vram allocations
 2440	 * to protect retired pages from being reused.
2441 */
b82e65a9
GC
2442 r = amdgpu_ras_recovery_init(adev);
2443 if (r)
2444 goto init_failed;
121a2bc6 2445
cfbb6b00
AG
2446 /**
 2447	 * In the case of XGMI, grab an extra reference on the reset domain for this device
2448 */
a4c63caf 2449 if (adev->gmc.xgmi.num_physical_nodes > 1) {
cfbb6b00
AG
2450 if (amdgpu_xgmi_add_device(adev) == 0) {
2451 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
a4c63caf 2452
cfbb6b00
AG
2453 if (!hive->reset_domain ||
2454 !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
2455 r = -ENOENT;
2456 goto init_failed;
2457 }
e3c1b071 2458
cfbb6b00
AG
2459 /* Drop the early temporary reset domain we created for device */
2460 amdgpu_reset_put_reset_domain(adev->reset_domain);
2461 adev->reset_domain = hive->reset_domain;
a4c63caf
AG
2462 }
2463 }
2464
5fd8518d
AG
2465 r = amdgpu_device_init_schedulers(adev);
2466 if (r)
2467 goto init_failed;
e3c1b071 2468
2469 /* Don't init kfd if whole hive need to be reset during init */
c004d44e 2470 if (!adev->gmc.xgmi.pending_reset)
e3c1b071 2471 amdgpu_amdkfd_device_init(adev);
c6332b97 2472
bd607166
KR
2473 amdgpu_fru_get_product_info(adev);
2474
72d3f592 2475init_failed:
c9ffa427 2476 if (amdgpu_sriov_vf(adev))
c6332b97 2477 amdgpu_virt_release_full_gpu(adev, true);
2478
72d3f592 2479 return r;
d38ceaf9
AD
2480}
2481
e3ecdffa
AD
2482/**
2483 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2484 *
2485 * @adev: amdgpu_device pointer
2486 *
2487 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
2488 * this function before a GPU reset. If the value is retained after a
 2489	 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
2490 */
06ec9070 2491static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
0c49e0b8
CZ
2492{
2493 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2494}
2495
e3ecdffa
AD
2496/**
2497 * amdgpu_device_check_vram_lost - check if vram is valid
2498 *
2499 * @adev: amdgpu_device pointer
2500 *
2501 * Checks the reset magic value written to the gart pointer in VRAM.
2502 * The driver calls this after a GPU reset to see if the contents of
 2503	 * VRAM are lost or not.
 2504	 * Returns true if vram is lost, false if not.
2505 */
06ec9070 2506static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
0c49e0b8 2507{
dadce777
EQ
2508 if (memcmp(adev->gart.ptr, adev->reset_magic,
2509 AMDGPU_RESET_MAGIC_NUM))
2510 return true;
2511
53b3f8f4 2512 if (!amdgpu_in_reset(adev))
dadce777
EQ
2513 return false;
2514
2515 /*
2516 * For all ASICs with baco/mode1 reset, the VRAM is
2517 * always assumed to be lost.
2518 */
2519 switch (amdgpu_asic_reset_method(adev)) {
2520 case AMD_RESET_METHOD_BACO:
2521 case AMD_RESET_METHOD_MODE1:
2522 return true;
2523 default:
2524 return false;
2525 }
0c49e0b8
CZ
2526}
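/*
 * Minimal in-file sketch, illustrative only and not the driver's actual
 * reset path: how the two helpers above are meant to bracket a reset. The
 * real recovery code later in this file adds locking, SR-IOV handling and
 * more; the reset step itself is elided here.
 */
static bool amdgpu_example_vram_lost_after_reset(struct amdgpu_device *adev)
{
	/* Save the current GART contents as the reset magic. */
	amdgpu_device_fill_reset_magic(adev);

	/* ... the ASIC reset would happen here ... */

	/* Compare against the saved magic to decide whether VRAM survived. */
	return amdgpu_device_check_vram_lost(adev);
}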
2527
e3ecdffa 2528/**
1112a46b 2529 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
e3ecdffa
AD
2530 *
2531 * @adev: amdgpu_device pointer
b8b72130 2532 * @state: clockgating state (gate or ungate)
e3ecdffa 2533 *
e3ecdffa 2534 * The list of all the hardware IPs that make up the asic is walked and the
1112a46b
RZ
2535 * set_clockgating_state callbacks are run.
 2536	 * During late init, this pass enables clockgating for the hardware IPs.
 2537	 * During fini or suspend, it disables clockgating for the hardware IPs.
e3ecdffa
AD
2538 * Returns 0 on success, negative error code on failure.
2539 */
fdd34271 2540
5d89bb2d
LL
2541int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2542 enum amd_clockgating_state state)
d38ceaf9 2543{
1112a46b 2544 int i, j, r;
d38ceaf9 2545
4a2ba394
SL
2546 if (amdgpu_emu_mode == 1)
2547 return 0;
2548
1112a46b
RZ
2549 for (j = 0; j < adev->num_ip_blocks; j++) {
2550 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
a2d31dc3 2551 if (!adev->ip_blocks[i].status.late_initialized)
d38ceaf9 2552 continue;
5d70a549
PV
2553 /* skip CG for GFX on S0ix */
2554 if (adev->in_s0ix &&
2555 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)
2556 continue;
4a446d55 2557 /* skip CG for VCE/UVD, it's handled specially */
a1255107 2558 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
57716327 2559 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
34319b32 2560 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
52f2e779 2561 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
57716327 2562 adev->ip_blocks[i].version->funcs->set_clockgating_state) {
4a446d55 2563 /* enable clockgating to save power */
a1255107 2564 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
1112a46b 2565 state);
4a446d55
AD
2566 if (r) {
2567 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
a1255107 2568 adev->ip_blocks[i].version->funcs->name, r);
4a446d55
AD
2569 return r;
2570 }
b0b00ff1 2571 }
d38ceaf9 2572 }
06b18f61 2573
c9f96fd5
RZ
2574 return 0;
2575}
2576
5d89bb2d
LL
2577int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
2578 enum amd_powergating_state state)
c9f96fd5 2579{
1112a46b 2580 int i, j, r;
06b18f61 2581
c9f96fd5
RZ
2582 if (amdgpu_emu_mode == 1)
2583 return 0;
2584
1112a46b
RZ
2585 for (j = 0; j < adev->num_ip_blocks; j++) {
2586 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
a2d31dc3 2587 if (!adev->ip_blocks[i].status.late_initialized)
c9f96fd5 2588 continue;
5d70a549
PV
2589 /* skip PG for GFX on S0ix */
2590 if (adev->in_s0ix &&
2591 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)
2592 continue;
c9f96fd5
RZ
 2593	/* skip PG for VCE/UVD, it's handled specially */
2594 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2595 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2596 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
52f2e779 2597 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
c9f96fd5
RZ
2598 adev->ip_blocks[i].version->funcs->set_powergating_state) {
2599 /* enable powergating to save power */
2600 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
1112a46b 2601 state);
c9f96fd5
RZ
2602 if (r) {
2603 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2604 adev->ip_blocks[i].version->funcs->name, r);
2605 return r;
2606 }
2607 }
2608 }
2dc80b00
S
2609 return 0;
2610}
2611
beff74bc
AD
2612static int amdgpu_device_enable_mgpu_fan_boost(void)
2613{
2614 struct amdgpu_gpu_instance *gpu_ins;
2615 struct amdgpu_device *adev;
2616 int i, ret = 0;
2617
2618 mutex_lock(&mgpu_info.mutex);
2619
2620 /*
2621 * MGPU fan boost feature should be enabled
2622 * only when there are two or more dGPUs in
2623 * the system
2624 */
2625 if (mgpu_info.num_dgpu < 2)
2626 goto out;
2627
2628 for (i = 0; i < mgpu_info.num_dgpu; i++) {
2629 gpu_ins = &(mgpu_info.gpu_ins[i]);
2630 adev = gpu_ins->adev;
2631 if (!(adev->flags & AMD_IS_APU) &&
f10bb940 2632 !gpu_ins->mgpu_fan_enabled) {
beff74bc
AD
2633 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2634 if (ret)
2635 break;
2636
2637 gpu_ins->mgpu_fan_enabled = 1;
2638 }
2639 }
2640
2641out:
2642 mutex_unlock(&mgpu_info.mutex);
2643
2644 return ret;
2645}
2646
e3ecdffa
AD
2647/**
2648 * amdgpu_device_ip_late_init - run late init for hardware IPs
2649 *
2650 * @adev: amdgpu_device pointer
2651 *
2652 * Late initialization pass for hardware IPs. The list of all the hardware
2653 * IPs that make up the asic is walked and the late_init callbacks are run.
2654 * late_init covers any special initialization that an IP requires
 2655	 * after all of them have been initialized or something that needs to happen
2656 * late in the init process.
2657 * Returns 0 on success, negative error code on failure.
2658 */
06ec9070 2659static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2dc80b00 2660{
60599a03 2661 struct amdgpu_gpu_instance *gpu_instance;
2dc80b00
S
2662 int i = 0, r;
2663
2664 for (i = 0; i < adev->num_ip_blocks; i++) {
73f847db 2665 if (!adev->ip_blocks[i].status.hw)
2dc80b00
S
2666 continue;
2667 if (adev->ip_blocks[i].version->funcs->late_init) {
2668 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2669 if (r) {
2670 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2671 adev->ip_blocks[i].version->funcs->name, r);
2672 return r;
2673 }
2dc80b00 2674 }
73f847db 2675 adev->ip_blocks[i].status.late_initialized = true;
2dc80b00
S
2676 }
2677
867e24ca 2678 r = amdgpu_ras_late_init(adev);
2679 if (r) {
2680 DRM_ERROR("amdgpu_ras_late_init failed %d", r);
2681 return r;
2682 }
2683
a891d239
DL
2684 amdgpu_ras_set_error_query_ready(adev, true);
2685
1112a46b
RZ
2686 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2687 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
916ac57f 2688
06ec9070 2689 amdgpu_device_fill_reset_magic(adev);
d38ceaf9 2690
beff74bc
AD
2691 r = amdgpu_device_enable_mgpu_fan_boost();
2692 if (r)
2693 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2694
4da8b639 2695	/* For passthrough configuration on arcturus and aldebaran, enable special SBR handling */
 2696	if (amdgpu_passthrough(adev) && ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
 2697	    adev->asic_type == CHIP_ALDEBARAN))
bc143d8b 2698 amdgpu_dpm_handle_passthrough_sbr(adev, true);
60599a03
EQ
2699
2700 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2701 mutex_lock(&mgpu_info.mutex);
2702
2703 /*
2704 * Reset device p-state to low as this was booted with high.
2705 *
2706 * This should be performed only after all devices from the same
2707 * hive get initialized.
2708 *
 2709	 * However, the number of devices in the hive is not known in advance,
 2710	 * as it is counted one by one during device initialization.
 2711	 *
 2712	 * So, we wait for all XGMI interlinked devices to be initialized.
2713 * This may bring some delays as those devices may come from
2714 * different hives. But that should be OK.
2715 */
2716 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2717 for (i = 0; i < mgpu_info.num_gpu; i++) {
2718 gpu_instance = &(mgpu_info.gpu_ins[i]);
2719 if (gpu_instance->adev->flags & AMD_IS_APU)
2720 continue;
2721
d84a430d
JK
2722 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2723 AMDGPU_XGMI_PSTATE_MIN);
60599a03
EQ
2724 if (r) {
2725 DRM_ERROR("pstate setting failed (%d).\n", r);
2726 break;
2727 }
2728 }
2729 }
2730
2731 mutex_unlock(&mgpu_info.mutex);
2732 }
2733
d38ceaf9
AD
2734 return 0;
2735}
2736
613aa3ea
LY
2737/**
2738 * amdgpu_device_smu_fini_early - smu hw_fini wrapper
2739 *
2740 * @adev: amdgpu_device pointer
2741 *
2742 * For ASICs need to disable SMC first
2743 */
2744static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
2745{
2746 int i, r;
2747
2748 if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))
2749 return;
2750
2751 for (i = 0; i < adev->num_ip_blocks; i++) {
2752 if (!adev->ip_blocks[i].status.hw)
2753 continue;
2754 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2755 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2756 /* XXX handle errors */
2757 if (r) {
2758 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2759 adev->ip_blocks[i].version->funcs->name, r);
2760 }
2761 adev->ip_blocks[i].status.hw = false;
2762 break;
2763 }
2764 }
2765}
2766
e9669fb7 2767static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
d38ceaf9
AD
2768{
2769 int i, r;
2770
e9669fb7
AG
2771 for (i = 0; i < adev->num_ip_blocks; i++) {
2772 if (!adev->ip_blocks[i].version->funcs->early_fini)
2773 continue;
5278a159 2774
e9669fb7
AG
2775 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev);
2776 if (r) {
2777 DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
2778 adev->ip_blocks[i].version->funcs->name, r);
2779 }
2780 }
c030f2e4 2781
05df1f01 2782 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
fdd34271
RZ
2783 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2784
7270e895
TY
2785 amdgpu_amdkfd_suspend(adev, false);
2786
613aa3ea
LY
 2787	/* Workaround for ASICs that need to disable SMC first */
2788 amdgpu_device_smu_fini_early(adev);
3e96dbfd 2789
d38ceaf9 2790 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
a1255107 2791 if (!adev->ip_blocks[i].status.hw)
d38ceaf9 2792 continue;
8201a67a 2793
a1255107 2794 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
d38ceaf9 2795 /* XXX handle errors */
2c1a2784 2796 if (r) {
a1255107
AD
2797 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2798 adev->ip_blocks[i].version->funcs->name, r);
2c1a2784 2799 }
8201a67a 2800
a1255107 2801 adev->ip_blocks[i].status.hw = false;
d38ceaf9
AD
2802 }
2803
6effad8a
GC
2804 if (amdgpu_sriov_vf(adev)) {
2805 if (amdgpu_virt_release_full_gpu(adev, false))
2806 DRM_ERROR("failed to release exclusive mode on fini\n");
2807 }
2808
e9669fb7
AG
2809 return 0;
2810}
2811
2812/**
2813 * amdgpu_device_ip_fini - run fini for hardware IPs
2814 *
2815 * @adev: amdgpu_device pointer
2816 *
2817 * Main teardown pass for hardware IPs. The list of all the hardware
2818 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2819 * are run. hw_fini tears down the hardware associated with each IP
2820 * and sw_fini tears down any software state associated with each IP.
2821 * Returns 0 on success, negative error code on failure.
2822 */
2823static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2824{
2825 int i, r;
2826
2827 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2828 amdgpu_virt_release_ras_err_handler_data(adev);
2829
e9669fb7
AG
2830 if (adev->gmc.xgmi.num_physical_nodes > 1)
2831 amdgpu_xgmi_remove_device(adev);
2832
c004d44e 2833 amdgpu_amdkfd_device_fini_sw(adev);
9950cda2 2834
d38ceaf9 2835 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
a1255107 2836 if (!adev->ip_blocks[i].status.sw)
d38ceaf9 2837 continue;
c12aba3a
ML
2838
2839 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
c8963ea4 2840 amdgpu_ucode_free_bo(adev);
1e256e27 2841 amdgpu_free_static_csa(&adev->virt.csa_obj);
c12aba3a
ML
2842 amdgpu_device_wb_fini(adev);
2843 amdgpu_device_vram_scratch_fini(adev);
533aed27 2844 amdgpu_ib_pool_fini(adev);
c12aba3a
ML
2845 }
2846
a1255107 2847 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
d38ceaf9 2848 /* XXX handle errors */
2c1a2784 2849 if (r) {
a1255107
AD
2850 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2851 adev->ip_blocks[i].version->funcs->name, r);
2c1a2784 2852 }
a1255107
AD
2853 adev->ip_blocks[i].status.sw = false;
2854 adev->ip_blocks[i].status.valid = false;
d38ceaf9
AD
2855 }
2856
a6dcfd9c 2857 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
a1255107 2858 if (!adev->ip_blocks[i].status.late_initialized)
8a2eef1d 2859 continue;
a1255107
AD
2860 if (adev->ip_blocks[i].version->funcs->late_fini)
2861 adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2862 adev->ip_blocks[i].status.late_initialized = false;
a6dcfd9c
ML
2863 }
2864
c030f2e4 2865 amdgpu_ras_fini(adev);
2866
d38ceaf9
AD
2867 return 0;
2868}
2869
e3ecdffa 2870/**
beff74bc 2871 * amdgpu_device_delayed_init_work_handler - work handler for IB tests
e3ecdffa 2872 *
1112a46b 2873 * @work: work_struct.
e3ecdffa 2874 */
beff74bc 2875static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2dc80b00
S
2876{
2877 struct amdgpu_device *adev =
beff74bc 2878 container_of(work, struct amdgpu_device, delayed_init_work.work);
916ac57f
RZ
2879 int r;
2880
2881 r = amdgpu_ib_ring_tests(adev);
2882 if (r)
2883 DRM_ERROR("ib ring test failed (%d).\n", r);
2dc80b00
S
2884}
2885
1e317b99
RZ
2886static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2887{
2888 struct amdgpu_device *adev =
2889 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2890
90a92662
MD
2891 WARN_ON_ONCE(adev->gfx.gfx_off_state);
2892 WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
2893
2894 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2895 adev->gfx.gfx_off_state = true;
1e317b99
RZ
2896}
2897
e3ecdffa 2898/**
e7854a03 2899 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
e3ecdffa
AD
2900 *
2901 * @adev: amdgpu_device pointer
2902 *
2903 * Main suspend function for hardware IPs. The list of all the hardware
2904 * IPs that make up the asic is walked, clockgating is disabled and the
2905 * suspend callbacks are run. suspend puts the hardware and software state
2906 * in each IP into a state suitable for suspend.
2907 * Returns 0 on success, negative error code on failure.
2908 */
e7854a03
AD
2909static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2910{
2911 int i, r;
2912
50ec83f0
AD
2913 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2914 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
05df1f01 2915
e7854a03
AD
2916 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2917 if (!adev->ip_blocks[i].status.valid)
2918 continue;
2b9f7848 2919
e7854a03 2920 /* displays are handled separately */
2b9f7848
ND
2921 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2922 continue;
2923
2924 /* XXX handle errors */
2925 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2926 /* XXX handle errors */
2927 if (r) {
2928 DRM_ERROR("suspend of IP block <%s> failed %d\n",
2929 adev->ip_blocks[i].version->funcs->name, r);
2930 return r;
e7854a03 2931 }
2b9f7848
ND
2932
2933 adev->ip_blocks[i].status.hw = false;
e7854a03
AD
2934 }
2935
e7854a03
AD
2936 return 0;
2937}
2938
2939/**
2940 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2941 *
2942 * @adev: amdgpu_device pointer
2943 *
2944 * Main suspend function for hardware IPs. The list of all the hardware
2945 * IPs that make up the asic is walked, clockgating is disabled and the
2946 * suspend callbacks are run. suspend puts the hardware and software state
2947 * in each IP into a state suitable for suspend.
2948 * Returns 0 on success, negative error code on failure.
2949 */
2950static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
d38ceaf9
AD
2951{
2952 int i, r;
2953
557f42a2 2954 if (adev->in_s0ix)
bc143d8b 2955 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry);
34416931 2956
d38ceaf9 2957 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
a1255107 2958 if (!adev->ip_blocks[i].status.valid)
d38ceaf9 2959 continue;
e7854a03
AD
2960 /* displays are handled in phase1 */
2961 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2962 continue;
bff77e86
LM
2963 /* PSP lost connection when err_event_athub occurs */
2964 if (amdgpu_ras_intr_triggered() &&
2965 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2966 adev->ip_blocks[i].status.hw = false;
2967 continue;
2968 }
e3c1b071 2969
 2970	/* skip unnecessary suspend if we have not initialized them yet */
2971 if (adev->gmc.xgmi.pending_reset &&
2972 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2973 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
2974 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2975 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
2976 adev->ip_blocks[i].status.hw = false;
2977 continue;
2978 }
557f42a2 2979
32ff160d
AD
2980 /* skip suspend of gfx and psp for S0ix
2981 * gfx is in gfxoff state, so on resume it will exit gfxoff just
2982 * like at runtime. PSP is also part of the always on hardware
2983 * so no need to suspend it.
2984 */
557f42a2 2985 if (adev->in_s0ix &&
32ff160d
AD
2986 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
2987 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX))
557f42a2
AD
2988 continue;
2989
d38ceaf9 2990 /* XXX handle errors */
a1255107 2991 r = adev->ip_blocks[i].version->funcs->suspend(adev);
d38ceaf9 2992 /* XXX handle errors */
2c1a2784 2993 if (r) {
a1255107
AD
2994 DRM_ERROR("suspend of IP block <%s> failed %d\n",
2995 adev->ip_blocks[i].version->funcs->name, r);
2c1a2784 2996 }
876923fb 2997 adev->ip_blocks[i].status.hw = false;
a3a09142 2998 /* handle putting the SMC in the appropriate state */
86b93fd6
JZ
 2999	if (!amdgpu_sriov_vf(adev)) {
3000 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3001 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
3002 if (r) {
3003 DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
3004 adev->mp1_state, r);
3005 return r;
3006 }
a3a09142
AD
3007 }
3008 }
d38ceaf9
AD
3009 }
3010
3011 return 0;
3012}
3013
e7854a03
AD
3014/**
3015 * amdgpu_device_ip_suspend - run suspend for hardware IPs
3016 *
3017 * @adev: amdgpu_device pointer
3018 *
3019 * Main suspend function for hardware IPs. The list of all the hardware
3020 * IPs that make up the asic is walked, clockgating is disabled and the
3021 * suspend callbacks are run. suspend puts the hardware and software state
3022 * in each IP into a state suitable for suspend.
3023 * Returns 0 on success, negative error code on failure.
3024 */
3025int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
3026{
3027 int r;
3028
3c73683c
JC
3029 if (amdgpu_sriov_vf(adev)) {
3030 amdgpu_virt_fini_data_exchange(adev);
e7819644 3031 amdgpu_virt_request_full_gpu(adev, false);
3c73683c 3032 }
e7819644 3033
e7854a03
AD
3034 r = amdgpu_device_ip_suspend_phase1(adev);
3035 if (r)
3036 return r;
3037 r = amdgpu_device_ip_suspend_phase2(adev);
3038
e7819644
YT
3039 if (amdgpu_sriov_vf(adev))
3040 amdgpu_virt_release_full_gpu(adev, false);
3041
e7854a03
AD
3042 return r;
3043}
3044
06ec9070 3045static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
a90ad3c2
ML
3046{
3047 int i, r;
3048
2cb681b6
ML
3049 static enum amd_ip_block_type ip_order[] = {
3050 AMD_IP_BLOCK_TYPE_GMC,
3051 AMD_IP_BLOCK_TYPE_COMMON,
39186aef 3052 AMD_IP_BLOCK_TYPE_PSP,
2cb681b6
ML
3053 AMD_IP_BLOCK_TYPE_IH,
3054 };
a90ad3c2 3055
95ea3dbc 3056 for (i = 0; i < adev->num_ip_blocks; i++) {
2cb681b6
ML
3057 int j;
3058 struct amdgpu_ip_block *block;
a90ad3c2 3059
4cd2a96d
J
3060 block = &adev->ip_blocks[i];
3061 block->status.hw = false;
2cb681b6 3062
4cd2a96d 3063 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
2cb681b6 3064
4cd2a96d 3065 if (block->version->type != ip_order[j] ||
2cb681b6
ML
3066 !block->status.valid)
3067 continue;
3068
3069 r = block->version->funcs->hw_init(adev);
0aaeefcc 3070 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
c41d1cf6
ML
3071 if (r)
3072 return r;
482f0e53 3073 block->status.hw = true;
a90ad3c2
ML
3074 }
3075 }
3076
3077 return 0;
3078}
3079
06ec9070 3080static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
a90ad3c2
ML
3081{
3082 int i, r;
3083
2cb681b6
ML
3084 static enum amd_ip_block_type ip_order[] = {
3085 AMD_IP_BLOCK_TYPE_SMC,
3086 AMD_IP_BLOCK_TYPE_DCE,
3087 AMD_IP_BLOCK_TYPE_GFX,
3088 AMD_IP_BLOCK_TYPE_SDMA,
257deb8c 3089 AMD_IP_BLOCK_TYPE_UVD,
d83c7a07
JJ
3090 AMD_IP_BLOCK_TYPE_VCE,
3091 AMD_IP_BLOCK_TYPE_VCN
2cb681b6 3092 };
a90ad3c2 3093
2cb681b6
ML
3094 for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
3095 int j;
3096 struct amdgpu_ip_block *block;
a90ad3c2 3097
2cb681b6
ML
3098 for (j = 0; j < adev->num_ip_blocks; j++) {
3099 block = &adev->ip_blocks[j];
3100
3101 if (block->version->type != ip_order[i] ||
482f0e53
ML
3102 !block->status.valid ||
3103 block->status.hw)
2cb681b6
ML
3104 continue;
3105
895bd048
JZ
3106 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
3107 r = block->version->funcs->resume(adev);
3108 else
3109 r = block->version->funcs->hw_init(adev);
3110
0aaeefcc 3111 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
c41d1cf6
ML
3112 if (r)
3113 return r;
482f0e53 3114 block->status.hw = true;
a90ad3c2
ML
3115 }
3116 }
3117
3118 return 0;
3119}
3120
e3ecdffa
AD
3121/**
3122 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
3123 *
3124 * @adev: amdgpu_device pointer
3125 *
3126 * First resume function for hardware IPs. The list of all the hardware
3127 * IPs that make up the asic is walked and the resume callbacks are run for
3128 * COMMON, GMC, and IH. resume puts the hardware into a functional state
3129 * after a suspend and updates the software state as necessary. This
3130 * function is also used for restoring the GPU after a GPU reset.
3131 * Returns 0 on success, negative error code on failure.
3132 */
06ec9070 3133static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
d38ceaf9
AD
3134{
3135 int i, r;
3136
a90ad3c2 3137 for (i = 0; i < adev->num_ip_blocks; i++) {
482f0e53 3138 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
a90ad3c2 3139 continue;
a90ad3c2 3140 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
e3ecdffa
AD
3141 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3142 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
482f0e53 3143
fcf0649f
CZ
3144 r = adev->ip_blocks[i].version->funcs->resume(adev);
3145 if (r) {
3146 DRM_ERROR("resume of IP block <%s> failed %d\n",
3147 adev->ip_blocks[i].version->funcs->name, r);
3148 return r;
3149 }
482f0e53 3150 adev->ip_blocks[i].status.hw = true;
a90ad3c2
ML
3151 }
3152 }
3153
3154 return 0;
3155}
3156
e3ecdffa
AD
3157/**
3158 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3159 *
3160 * @adev: amdgpu_device pointer
3161 *
3162 * First resume function for hardware IPs. The list of all the hardware
3163 * IPs that make up the asic is walked and the resume callbacks are run for
3164 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a
3165 * functional state after a suspend and updates the software state as
3166 * necessary. This function is also used for restoring the GPU after a GPU
3167 * reset.
3168 * Returns 0 on success, negative error code on failure.
3169 */
06ec9070 3170static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
d38ceaf9
AD
3171{
3172 int i, r;
3173
3174 for (i = 0; i < adev->num_ip_blocks; i++) {
482f0e53 3175 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
d38ceaf9 3176 continue;
fcf0649f 3177 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
e3ecdffa 3178 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
7a3e0bb2
RZ
3179 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3180 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
fcf0649f 3181 continue;
a1255107 3182 r = adev->ip_blocks[i].version->funcs->resume(adev);
2c1a2784 3183 if (r) {
a1255107
AD
3184 DRM_ERROR("resume of IP block <%s> failed %d\n",
3185 adev->ip_blocks[i].version->funcs->name, r);
d38ceaf9 3186 return r;
2c1a2784 3187 }
482f0e53 3188 adev->ip_blocks[i].status.hw = true;
d38ceaf9
AD
3189 }
3190
3191 return 0;
3192}
3193
e3ecdffa
AD
3194/**
3195 * amdgpu_device_ip_resume - run resume for hardware IPs
3196 *
3197 * @adev: amdgpu_device pointer
3198 *
3199 * Main resume function for hardware IPs. The hardware IPs
 3200	 * are split into two resume functions because they are
 3201	 * also used in recovering from a GPU reset and some additional
 3202	 * steps need to be taken between them. In this case (S3/S4) they are
3203 * run sequentially.
3204 * Returns 0 on success, negative error code on failure.
3205 */
06ec9070 3206static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
fcf0649f
CZ
3207{
3208 int r;
3209
9cec53c1
JZ
3210 r = amdgpu_amdkfd_resume_iommu(adev);
3211 if (r)
3212 return r;
3213
06ec9070 3214 r = amdgpu_device_ip_resume_phase1(adev);
fcf0649f
CZ
3215 if (r)
3216 return r;
7a3e0bb2
RZ
3217
3218 r = amdgpu_device_fw_loading(adev);
3219 if (r)
3220 return r;
3221
06ec9070 3222 r = amdgpu_device_ip_resume_phase2(adev);
fcf0649f
CZ
3223
3224 return r;
3225}
3226
e3ecdffa
AD
3227/**
3228 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
3229 *
3230 * @adev: amdgpu_device pointer
3231 *
3232 * Query the VBIOS data tables to determine if the board supports SR-IOV.
3233 */
4e99a44e 3234static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
048765ad 3235{
6867e1b5
ML
3236 if (amdgpu_sriov_vf(adev)) {
3237 if (adev->is_atom_fw) {
58ff791a 3238 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
6867e1b5
ML
3239 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3240 } else {
3241 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3242 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3243 }
3244
3245 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3246 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
a5bde2f9 3247 }
048765ad
AR
3248}
3249
e3ecdffa
AD
3250/**
3251 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3252 *
3253 * @asic_type: AMD asic type
3254 *
 3255	 * Check if there is DC (new modesetting infrastructure) support for an asic.
3256 * returns true if DC has support, false if not.
3257 */
4562236b
HW
3258bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3259{
3260 switch (asic_type) {
0637d417
AD
3261#ifdef CONFIG_DRM_AMDGPU_SI
3262 case CHIP_HAINAN:
3263#endif
3264 case CHIP_TOPAZ:
3265 /* chips with no display hardware */
3266 return false;
4562236b 3267#if defined(CONFIG_DRM_AMD_DC)
64200c46
MR
3268 case CHIP_TAHITI:
3269 case CHIP_PITCAIRN:
3270 case CHIP_VERDE:
3271 case CHIP_OLAND:
2d32ffd6
AD
3272 /*
3273 * We have systems in the wild with these ASICs that require
3274 * LVDS and VGA support which is not supported with DC.
3275 *
3276 * Fallback to the non-DC driver here by default so as not to
3277 * cause regressions.
3278 */
3279#if defined(CONFIG_DRM_AMD_DC_SI)
3280 return amdgpu_dc > 0;
3281#else
3282 return false;
64200c46 3283#endif
4562236b 3284 case CHIP_BONAIRE:
0d6fbccb 3285 case CHIP_KAVERI:
367e6687
AD
3286 case CHIP_KABINI:
3287 case CHIP_MULLINS:
d9fda248
HW
3288 /*
3289 * We have systems in the wild with these ASICs that require
3290 * LVDS and VGA support which is not supported with DC.
3291 *
3292 * Fallback to the non-DC driver here by default so as not to
3293 * cause regressions.
3294 */
3295 return amdgpu_dc > 0;
f7f12b25 3296 default:
fd187853 3297 return amdgpu_dc != 0;
f7f12b25 3298#else
4562236b 3299 default:
93b09a9a 3300 if (amdgpu_dc > 0)
044a48f4 3301 DRM_INFO_ONCE("Display Core has been requested via kernel parameter "
93b09a9a 3302 "but isn't supported by ASIC, ignoring\n");
4562236b 3303 return false;
f7f12b25 3304#endif
4562236b
HW
3305 }
3306}
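/*
 * Illustrative note, not part of the original file: amdgpu_dc checked above
 * is backed by the "dc" module parameter (assumed to be defined in
 * amdgpu_drv.c), so users can override the default on the borderline ASICs
 * listed here, e.g.:
 *
 *   modprobe amdgpu dc=1    (force the DC display path)
 *   modprobe amdgpu dc=0    (force the legacy, non-DC display path)
 */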
3307
3308/**
3309 * amdgpu_device_has_dc_support - check if dc is supported
3310 *
982a820b 3311 * @adev: amdgpu_device pointer
4562236b
HW
3312 *
3313 * Returns true for supported, false for not supported
3314 */
3315bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3316{
f74e78ca 3317 if (amdgpu_sriov_vf(adev) ||
abaf210c
AS
3318 adev->enable_virtual_display ||
3319 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
2555039d
XY
3320 return false;
3321
4562236b
HW
3322 return amdgpu_device_asic_has_dc_support(adev->asic_type);
3323}
3324
d4535e2c
AG
3325static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3326{
3327 struct amdgpu_device *adev =
3328 container_of(__work, struct amdgpu_device, xgmi_reset_work);
d95e8e97 3329 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
d4535e2c 3330
c6a6e2db
AG
3331 /* It's a bug to not have a hive within this function */
3332 if (WARN_ON(!hive))
3333 return;
3334
3335 /*
3336 * Use task barrier to synchronize all xgmi reset works across the
3337 * hive. task_barrier_enter and task_barrier_exit will block
3338 * until all the threads running the xgmi reset works reach
3339 * those points. task_barrier_full will do both blocks.
3340 */
3341 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3342
3343 task_barrier_enter(&hive->tb);
4a580877 3344 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
c6a6e2db
AG
3345
3346 if (adev->asic_reset_res)
3347 goto fail;
3348
3349 task_barrier_exit(&hive->tb);
4a580877 3350 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
c6a6e2db
AG
3351
3352 if (adev->asic_reset_res)
3353 goto fail;
43c4d576 3354
5e67bba3 3355 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops &&
3356 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
3357 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev);
c6a6e2db
AG
3358 } else {
3359
3360 task_barrier_full(&hive->tb);
3361 adev->asic_reset_res = amdgpu_asic_reset(adev);
3362 }
ce316fa5 3363
c6a6e2db 3364fail:
d4535e2c 3365 if (adev->asic_reset_res)
fed184e9 3366 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
4a580877 3367 adev->asic_reset_res, adev_to_drm(adev)->unique);
d95e8e97 3368 amdgpu_put_xgmi_hive(hive);
d4535e2c
AG
3369}
3370
71f98027
AD
3371static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3372{
3373 char *input = amdgpu_lockup_timeout;
3374 char *timeout_setting = NULL;
3375 int index = 0;
3376 long timeout;
3377 int ret = 0;
3378
3379 /*
67387dfe
AD
 3380	 * By default the timeout for non-compute jobs is 10000 ms
 3381	 * and 60000 ms for compute jobs.
71f98027 3382	 * In SR-IOV or passthrough mode, the timeout for compute
b7b2a316 3383	 * jobs is 60000 ms by default.
71f98027
AD
3384 */
3385 adev->gfx_timeout = msecs_to_jiffies(10000);
3386 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
9882e278
ED
3387 if (amdgpu_sriov_vf(adev))
3388 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
3389 msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
71f98027 3390 else
67387dfe 3391 adev->compute_timeout = msecs_to_jiffies(60000);
71f98027 3392
f440ff44 3393 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
71f98027 3394 while ((timeout_setting = strsep(&input, ",")) &&
f440ff44 3395 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
71f98027
AD
3396 ret = kstrtol(timeout_setting, 0, &timeout);
3397 if (ret)
3398 return ret;
3399
3400 if (timeout == 0) {
3401 index++;
3402 continue;
3403 } else if (timeout < 0) {
3404 timeout = MAX_SCHEDULE_TIMEOUT;
127aedf9
CK
3405 dev_warn(adev->dev, "lockup timeout disabled");
3406 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
71f98027
AD
3407 } else {
3408 timeout = msecs_to_jiffies(timeout);
3409 }
3410
3411 switch (index++) {
3412 case 0:
3413 adev->gfx_timeout = timeout;
3414 break;
3415 case 1:
3416 adev->compute_timeout = timeout;
3417 break;
3418 case 2:
3419 adev->sdma_timeout = timeout;
3420 break;
3421 case 3:
3422 adev->video_timeout = timeout;
3423 break;
3424 default:
3425 break;
3426 }
3427 }
3428 /*
3429 * There is only one value specified and
3430 * it should apply to all non-compute jobs.
3431 */
bcccee89 3432 if (index == 1) {
71f98027 3433 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
bcccee89
ED
3434 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3435 adev->compute_timeout = adev->gfx_timeout;
3436 }
71f98027
AD
3437 }
3438
3439 return ret;
3440}
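/*
 * For example (illustrative values, not taken from the original source):
 *
 *   amdgpu.lockup_timeout=10000,60000,10000,10000
 *
 * maps positionally onto gfx (index 0), compute (1), sdma (2) and
 * video (3).  A single value applies to all non-compute queues, and
 * also to compute under SR-IOV or passthrough.  A sketch of how the
 * parsed values could be reported back, using only fields set above:
 */
#if 0	/* example only, never compiled */
	dev_info(adev->dev, "gfx %u ms, compute %u ms, sdma %u ms, video %u ms\n",
		 jiffies_to_msecs(adev->gfx_timeout),
		 jiffies_to_msecs(adev->compute_timeout),
		 jiffies_to_msecs(adev->sdma_timeout),
		 jiffies_to_msecs(adev->video_timeout));
#endif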
d4535e2c 3441
4a74c38c
PY
3442/**
3443 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU
3444 *
3445 * @adev: amdgpu_device pointer
3446 *
 3447 * RAM is direct mapped to the GPU if the IOMMU is not enabled or is in passthrough mode
3448 */
3449static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
3450{
3451 struct iommu_domain *domain;
3452
3453 domain = iommu_get_domain_for_dev(adev->dev);
3454 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
3455 adev->ram_is_direct_mapped = true;
3456}
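/*
 * Note (assumption, not stated in the code above): an IOMMU_DOMAIN_IDENTITY
 * domain is a 1:1 "passthrough" mapping, so DMA addresses equal physical
 * addresses for system RAM; ram_is_direct_mapped records that so other
 * parts of the driver can presumably skip remap-related work.
 */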
3457
77f3a5cd
ND
3458static const struct attribute *amdgpu_dev_attributes[] = {
3459 &dev_attr_product_name.attr,
3460 &dev_attr_product_number.attr,
3461 &dev_attr_serial_number.attr,
3462 &dev_attr_pcie_replay_count.attr,
3463 NULL
3464};
3465
d38ceaf9
AD
3466/**
3467 * amdgpu_device_init - initialize the driver
3468 *
3469 * @adev: amdgpu_device pointer
d38ceaf9
AD
3470 * @flags: driver flags
3471 *
3472 * Initializes the driver info and hw (all asics).
3473 * Returns 0 for success or an error on failure.
3474 * Called at driver startup.
3475 */
3476int amdgpu_device_init(struct amdgpu_device *adev,
d38ceaf9
AD
3477 uint32_t flags)
3478{
8aba21b7
LT
3479 struct drm_device *ddev = adev_to_drm(adev);
3480 struct pci_dev *pdev = adev->pdev;
d38ceaf9 3481 int r, i;
b98c6299 3482 bool px = false;
95844d20 3483 u32 max_MBps;
d38ceaf9
AD
3484
3485 adev->shutdown = false;
d38ceaf9 3486 adev->flags = flags;
4e66d7d2
YZ
3487
3488 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3489 adev->asic_type = amdgpu_force_asic_type;
3490 else
3491 adev->asic_type = flags & AMD_ASIC_MASK;
3492
d38ceaf9 3493 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
593aa2d2 3494 if (amdgpu_emu_mode == 1)
8bdab6bb 3495 adev->usec_timeout *= 10;
770d13b1 3496 adev->gmc.gart_size = 512 * 1024 * 1024;
d38ceaf9
AD
3497 adev->accel_working = false;
3498 adev->num_rings = 0;
3499 adev->mman.buffer_funcs = NULL;
3500 adev->mman.buffer_funcs_ring = NULL;
3501 adev->vm_manager.vm_pte_funcs = NULL;
0c88b430 3502 adev->vm_manager.vm_pte_num_scheds = 0;
132f34e4 3503 adev->gmc.gmc_funcs = NULL;
7bd939d0 3504 adev->harvest_ip_mask = 0x0;
f54d1867 3505 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
b8866c26 3506 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
d38ceaf9
AD
3507
3508 adev->smc_rreg = &amdgpu_invalid_rreg;
3509 adev->smc_wreg = &amdgpu_invalid_wreg;
3510 adev->pcie_rreg = &amdgpu_invalid_rreg;
3511 adev->pcie_wreg = &amdgpu_invalid_wreg;
36b9a952
HR
3512 adev->pciep_rreg = &amdgpu_invalid_rreg;
3513 adev->pciep_wreg = &amdgpu_invalid_wreg;
4fa1c6a6
TZ
3514 adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3515 adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
d38ceaf9
AD
3516 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3517 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3518 adev->didt_rreg = &amdgpu_invalid_rreg;
3519 adev->didt_wreg = &amdgpu_invalid_wreg;
ccdbb20a
RZ
3520 adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3521 adev->gc_cac_wreg = &amdgpu_invalid_wreg;
d38ceaf9
AD
3522 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3523 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3524
3e39ab90
AD
3525 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3526 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3527 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
d38ceaf9
AD
3528
 3529 /* mutex initializations are all done here so we
 3530 * can recall functions without running into locking issues */
0e5ca0d1 3531 mutex_init(&adev->firmware.mutex);
d38ceaf9
AD
3532 mutex_init(&adev->pm.mutex);
3533 mutex_init(&adev->gfx.gpu_clock_mutex);
3534 mutex_init(&adev->srbm_mutex);
b8866c26 3535 mutex_init(&adev->gfx.pipe_reserve_mutex);
d23ee13f 3536 mutex_init(&adev->gfx.gfx_off_mutex);
d38ceaf9 3537 mutex_init(&adev->grbm_idx_mutex);
d38ceaf9 3538 mutex_init(&adev->mn_lock);
e23b74aa 3539 mutex_init(&adev->virt.vf_errors.lock);
d38ceaf9 3540 hash_init(adev->mn_hash);
32eaeae0 3541 mutex_init(&adev->psp.mutex);
bd052211 3542 mutex_init(&adev->notifier_lock);
8cda7a4f 3543 mutex_init(&adev->pm.stable_pstate_ctx_lock);
f113cc32 3544 mutex_init(&adev->benchmark_mutex);
d38ceaf9 3545
ab3b9de6 3546 amdgpu_device_init_apu_flags(adev);
9f6a7857 3547
912dfc84
EQ
3548 r = amdgpu_device_check_arguments(adev);
3549 if (r)
3550 return r;
d38ceaf9 3551
d38ceaf9
AD
3552 spin_lock_init(&adev->mmio_idx_lock);
3553 spin_lock_init(&adev->smc_idx_lock);
3554 spin_lock_init(&adev->pcie_idx_lock);
3555 spin_lock_init(&adev->uvd_ctx_idx_lock);
3556 spin_lock_init(&adev->didt_idx_lock);
ccdbb20a 3557 spin_lock_init(&adev->gc_cac_idx_lock);
16abb5d2 3558 spin_lock_init(&adev->se_cac_idx_lock);
d38ceaf9 3559 spin_lock_init(&adev->audio_endpt_idx_lock);
95844d20 3560 spin_lock_init(&adev->mm_stats.lock);
d38ceaf9 3561
0c4e7fa5
CZ
3562 INIT_LIST_HEAD(&adev->shadow_list);
3563 mutex_init(&adev->shadow_list_lock);
3564
655ce9cb 3565 INIT_LIST_HEAD(&adev->reset_list);
3566
6492e1b0 3567 INIT_LIST_HEAD(&adev->ras_list);
3568
beff74bc
AD
3569 INIT_DELAYED_WORK(&adev->delayed_init_work,
3570 amdgpu_device_delayed_init_work_handler);
1e317b99
RZ
3571 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3572 amdgpu_device_delay_enable_gfx_off);
2dc80b00 3573
d4535e2c
AG
3574 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3575
d23ee13f 3576 adev->gfx.gfx_off_req_count = 1;
b6e79d9a 3577 adev->pm.ac_power = power_supply_is_system_supplied() > 0;
b1ddf548 3578
b265bdbd
EQ
3579 atomic_set(&adev->throttling_logging_enabled, 1);
3580 /*
3581 * If throttling continues, logging will be performed every minute
3582 * to avoid log flooding. "-1" is subtracted since the thermal
3583 * throttling interrupt comes every second. Thus, the total logging
 3584 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3585 * for throttling interrupt) = 60 seconds.
3586 */
3587 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3588 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3589
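	/*
	 * Worked example of the math above: ratelimit_state_init() takes
	 * (interval, burst), so (60 - 1) * HZ with burst 1 allows at most
	 * one throttling message per 59 s window; adding the ~1 s wait for
	 * the next thermal interrupt yields the 60 s period described in
	 * the comment.
	 */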
0fa49558
AX
3590 /* Registers mapping */
3591 /* TODO: block userspace mapping of io register */
da69c161
KW
3592 if (adev->asic_type >= CHIP_BONAIRE) {
3593 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3594 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3595 } else {
3596 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3597 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3598 }
d38ceaf9 3599
6c08e0ef
EQ
3600 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
3601 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
3602
d38ceaf9
AD
3603 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3604 if (adev->rmmio == NULL) {
3605 return -ENOMEM;
3606 }
3607 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3608 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3609
5494d864
AD
3610 amdgpu_device_get_pcie_info(adev);
3611
b239c017
JX
3612 if (amdgpu_mcbp)
3613 DRM_INFO("MCBP is enabled\n");
3614
928fe236
JX
3615 if (adev->asic_type >= CHIP_NAVI10) {
3616 if (amdgpu_mes || amdgpu_mes_kiq)
3617 adev->enable_mes = true;
3618
3619 if (amdgpu_mes_kiq)
3620 adev->enable_mes_kiq = true;
3621 }
5f84cc63 3622
436afdfa
PY
3623 /*
3624 * Reset domain needs to be present early, before XGMI hive discovered
 3625 * (if any) and initialized to use reset sem and in_gpu reset flag
3626 * early on during init and before calling to RREG32.
3627 */
3628 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
3629 if (!adev->reset_domain)
3630 return -ENOMEM;
3631
3aa0115d
ML
3632 /* detect hw virtualization here */
3633 amdgpu_detect_virtualization(adev);
3634
dffa11b4
ML
3635 r = amdgpu_device_get_job_timeout_settings(adev);
3636 if (r) {
3637 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
4ef87d8f 3638 return r;
a190d1c7
XY
3639 }
3640
d38ceaf9 3641 /* early init functions */
06ec9070 3642 r = amdgpu_device_ip_early_init(adev);
d38ceaf9 3643 if (r)
4ef87d8f 3644 return r;
d38ceaf9 3645
4d33e704
SK
3646 /* Enable TMZ based on IP_VERSION */
3647 amdgpu_gmc_tmz_set(adev);
3648
957b0787 3649 amdgpu_gmc_noretry_set(adev);
4a0165f0
VS
 3650 /* Need to get xgmi info early to decide the reset behavior */
3651 if (adev->gmc.xgmi.supported) {
3652 r = adev->gfxhub.funcs->get_xgmi_info(adev);
3653 if (r)
3654 return r;
3655 }
3656
8e6d0b69 3657 /* enable PCIE atomic ops */
3658 if (amdgpu_sriov_vf(adev))
3659 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
e15c9d06 3660 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
8e6d0b69 3661 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3662 else
3663 adev->have_atomics_support =
3664 !pci_enable_atomic_ops_to_root(adev->pdev,
3665 PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3666 PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3667 if (!adev->have_atomics_support)
3668 dev_info(adev->dev, "PCIE atomic ops is not supported\n");
3669
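	/*
	 * Sketch of the bare-metal path above: pci_enable_atomic_ops_to_root()
	 * returns 0 only when the whole path to the root port routes AtomicOps
	 * of the requested widths, so a caller could probe 64-bit support on
	 * its own like this (illustrative only):
	 */
#if 0	/* example only, never compiled */
	if (!pci_enable_atomic_ops_to_root(adev->pdev,
					   PCI_EXP_DEVCAP2_ATOMIC_COMP64))
		dev_info(adev->dev, "64-bit PCIe atomics supported\n");
#endif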
6585661d
OZ
 3670 /* doorbell bar mapping and doorbell index init */
3671 amdgpu_device_doorbell_init(adev);
3672
9475a943
SL
3673 if (amdgpu_emu_mode == 1) {
3674 /* post the asic on emulation mode */
3675 emu_soc_asic_init(adev);
bfca0289 3676 goto fence_driver_init;
9475a943 3677 }
bfca0289 3678
04442bf7
LL
3679 amdgpu_reset_init(adev);
3680
4e99a44e
ML
3681 /* detect if we are with an SRIOV vbios */
3682 amdgpu_device_detect_sriov_bios(adev);
048765ad 3683
95e8e59e
AD
3684 /* check if we need to reset the asic
3685 * E.g., driver was not cleanly unloaded previously, etc.
3686 */
f14899fd 3687 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
e3c1b071 3688 if (adev->gmc.xgmi.num_physical_nodes) {
3689 dev_info(adev->dev, "Pending hive reset.\n");
3690 adev->gmc.xgmi.pending_reset = true;
3691 /* Only need to init necessary block for SMU to handle the reset */
3692 for (i = 0; i < adev->num_ip_blocks; i++) {
3693 if (!adev->ip_blocks[i].status.valid)
3694 continue;
3695 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3696 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3697 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3698 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
751f43e7 3699 DRM_DEBUG("IP %s disabled for hw_init.\n",
e3c1b071 3700 adev->ip_blocks[i].version->funcs->name);
3701 adev->ip_blocks[i].status.hw = true;
3702 }
3703 }
3704 } else {
3705 r = amdgpu_asic_reset(adev);
3706 if (r) {
3707 dev_err(adev->dev, "asic reset on init failed\n");
3708 goto failed;
3709 }
95e8e59e
AD
3710 }
3711 }
3712
8f66090b 3713 pci_enable_pcie_error_reporting(adev->pdev);
c9a6b82f 3714
d38ceaf9 3715 /* Post card if necessary */
39c640c0 3716 if (amdgpu_device_need_post(adev)) {
d38ceaf9 3717 if (!adev->bios) {
bec86378 3718 dev_err(adev->dev, "no vBIOS found\n");
83ba126a
AD
3719 r = -EINVAL;
3720 goto failed;
d38ceaf9 3721 }
bec86378 3722 DRM_INFO("GPU posting now...\n");
4d2997ab 3723 r = amdgpu_device_asic_init(adev);
4e99a44e
ML
3724 if (r) {
3725 dev_err(adev->dev, "gpu post error!\n");
3726 goto failed;
3727 }
d38ceaf9
AD
3728 }
3729
88b64e95
AD
3730 if (adev->is_atom_fw) {
3731 /* Initialize clocks */
3732 r = amdgpu_atomfirmware_get_clock_info(adev);
3733 if (r) {
3734 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
e23b74aa 3735 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
88b64e95
AD
3736 goto failed;
3737 }
3738 } else {
a5bde2f9
AD
3739 /* Initialize clocks */
3740 r = amdgpu_atombios_get_clock_info(adev);
3741 if (r) {
3742 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
e23b74aa 3743 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
89041940 3744 goto failed;
a5bde2f9
AD
3745 }
3746 /* init i2c buses */
4562236b
HW
3747 if (!amdgpu_device_has_dc_support(adev))
3748 amdgpu_atombios_i2c_init(adev);
2c1a2784 3749 }
d38ceaf9 3750
bfca0289 3751fence_driver_init:
d38ceaf9 3752 /* Fence driver */
067f44c8 3753 r = amdgpu_fence_driver_sw_init(adev);
2c1a2784 3754 if (r) {
067f44c8 3755 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
e23b74aa 3756 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
83ba126a 3757 goto failed;
2c1a2784 3758 }
d38ceaf9
AD
3759
3760 /* init the mode config */
4a580877 3761 drm_mode_config_init(adev_to_drm(adev));
d38ceaf9 3762
06ec9070 3763 r = amdgpu_device_ip_init(adev);
d38ceaf9 3764 if (r) {
8840a387 3765 /* failed in exclusive mode due to timeout */
3766 if (amdgpu_sriov_vf(adev) &&
3767 !amdgpu_sriov_runtime(adev) &&
3768 amdgpu_virt_mmio_blocked(adev) &&
3769 !amdgpu_virt_wait_reset(adev)) {
3770 dev_err(adev->dev, "VF exclusive mode timeout\n");
1daee8b4
PD
3771 /* Don't send request since VF is inactive. */
3772 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3773 adev->virt.ops = NULL;
8840a387 3774 r = -EAGAIN;
970fd197 3775 goto release_ras_con;
8840a387 3776 }
06ec9070 3777 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
e23b74aa 3778 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
970fd197 3779 goto release_ras_con;
d38ceaf9
AD
3780 }
3781
8d35a259
LG
3782 amdgpu_fence_driver_hw_init(adev);
3783
d69b8971
YZ
3784 dev_info(adev->dev,
3785 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
d7f72fe4
YZ
3786 adev->gfx.config.max_shader_engines,
3787 adev->gfx.config.max_sh_per_se,
3788 adev->gfx.config.max_cu_per_sh,
3789 adev->gfx.cu_info.number);
3790
d38ceaf9
AD
3791 adev->accel_working = true;
3792
e59c0205
AX
3793 amdgpu_vm_check_compute_bug(adev);
3794
95844d20
MO
3795 /* Initialize the buffer migration limit. */
3796 if (amdgpu_moverate >= 0)
3797 max_MBps = amdgpu_moverate;
3798 else
3799 max_MBps = 8; /* Allow 8 MB/s. */
3800 /* Get a log2 for easy divisions. */
3801 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
3802
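	/*
	 * Worked example: with the default cap of 8 MB/s this stores
	 * ilog2(8) = 3, so later buffer-migration accounting can replace a
	 * division by max_MBps with a cheap shift by log2_max_MBps.
	 */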
d2f52ac8 3803 r = amdgpu_pm_sysfs_init(adev);
7c868b59
YT
3804 if (r) {
3805 adev->pm_sysfs_en = false;
d2f52ac8 3806 DRM_ERROR("registering pm debugfs failed (%d).\n", r);
7c868b59
YT
3807 } else
3808 adev->pm_sysfs_en = true;
d2f52ac8 3809
5bb23532 3810 r = amdgpu_ucode_sysfs_init(adev);
7c868b59
YT
3811 if (r) {
3812 adev->ucode_sysfs_en = false;
5bb23532 3813 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
7c868b59
YT
3814 } else
3815 adev->ucode_sysfs_en = true;
5bb23532 3816
8424f2cc
LG
3817 r = amdgpu_psp_sysfs_init(adev);
3818 if (r) {
3819 adev->psp_sysfs_en = false;
3820 if (!amdgpu_sriov_vf(adev))
3821 DRM_ERROR("Creating psp sysfs failed\n");
3822 } else
3823 adev->psp_sysfs_en = true;
3824
b0adca4d
EQ
3825 /*
3826 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
 3827 * Otherwise the mgpu fan boost feature will be skipped because the
 3828 * gpu instance count would be too low.
3829 */
3830 amdgpu_register_gpu_instance(adev);
3831
d38ceaf9
AD
3832 /* enable clockgating, etc. after ib tests, etc. since some blocks require
3833 * explicit gating rather than handling it automatically.
3834 */
e3c1b071 3835 if (!adev->gmc.xgmi.pending_reset) {
3836 r = amdgpu_device_ip_late_init(adev);
3837 if (r) {
3838 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3839 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
970fd197 3840 goto release_ras_con;
e3c1b071 3841 }
3842 /* must succeed. */
3843 amdgpu_ras_resume(adev);
3844 queue_delayed_work(system_wq, &adev->delayed_init_work,
3845 msecs_to_jiffies(AMDGPU_RESUME_MS));
2c1a2784 3846 }
d38ceaf9 3847
2c738637
ML
3848 if (amdgpu_sriov_vf(adev))
3849 flush_delayed_work(&adev->delayed_init_work);
3850
77f3a5cd 3851 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
5aea5327 3852 if (r)
77f3a5cd 3853 dev_err(adev->dev, "Could not create amdgpu device attr\n");
bd607166 3854
d155bef0
AB
3855 if (IS_ENABLED(CONFIG_PERF_EVENTS))
3856 r = amdgpu_pmu_init(adev);
9c7c85f7
JK
3857 if (r)
3858 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3859
c1dd4aa6
AG
3860 /* Have stored pci confspace at hand for restore in sudden PCI error */
3861 if (amdgpu_device_cache_pci_state(adev->pdev))
3862 pci_restore_state(pdev);
3863
8c3dd61c
KHF
3864 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
3865 /* this will fail for cards that aren't VGA class devices, just
3866 * ignore it */
3867 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
bf44e8ce 3868 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
8c3dd61c
KHF
3869
3870 if (amdgpu_device_supports_px(ddev)) {
3871 px = true;
3872 vga_switcheroo_register_client(adev->pdev,
3873 &amdgpu_switcheroo_ops, px);
3874 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3875 }
3876
e3c1b071 3877 if (adev->gmc.xgmi.pending_reset)
3878 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
3879 msecs_to_jiffies(AMDGPU_RESUME_MS));
3880
4a74c38c
PY
3881 amdgpu_device_check_iommu_direct_map(adev);
3882
d38ceaf9 3883 return 0;
83ba126a 3884
970fd197
SY
3885release_ras_con:
3886 amdgpu_release_ras_context(adev);
3887
83ba126a 3888failed:
89041940 3889 amdgpu_vf_error_trans_all(adev);
8840a387 3890
83ba126a 3891 return r;
d38ceaf9
AD
3892}
3893
07775fc1
AG
3894static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
3895{
62d5f9f7 3896
07775fc1
AG
3897 /* Clear all CPU mappings pointing to this device */
3898 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
3899
3900 /* Unmap all mapped bars - Doorbell, registers and VRAM */
3901 amdgpu_device_doorbell_fini(adev);
3902
3903 iounmap(adev->rmmio);
3904 adev->rmmio = NULL;
3905 if (adev->mman.aper_base_kaddr)
3906 iounmap(adev->mman.aper_base_kaddr);
3907 adev->mman.aper_base_kaddr = NULL;
3908
3909 /* Memory manager related */
3910 if (!adev->gmc.xgmi.connected_to_cpu) {
3911 arch_phys_wc_del(adev->gmc.vram_mtrr);
3912 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
3913 }
3914}
3915
d38ceaf9 3916/**
bbe04dec 3917 * amdgpu_device_fini_hw - tear down the driver
d38ceaf9
AD
3918 *
3919 * @adev: amdgpu_device pointer
3920 *
3921 * Tear down the driver info (all asics).
3922 * Called at driver shutdown.
3923 */
72c8c97b 3924void amdgpu_device_fini_hw(struct amdgpu_device *adev)
d38ceaf9 3925{
aac89168 3926 dev_info(adev->dev, "amdgpu: finishing device.\n");
9f875167 3927 flush_delayed_work(&adev->delayed_init_work);
d0d13fe8 3928 adev->shutdown = true;
9f875167 3929
752c683d
ML
 3930 /* make sure IB tests are finished before entering exclusive mode
 3931 * to avoid preemption during the IB tests
 3932 */
519b8b76 3933 if (amdgpu_sriov_vf(adev)) {
752c683d 3934 amdgpu_virt_request_full_gpu(adev, false);
519b8b76
BZ
3935 amdgpu_virt_fini_data_exchange(adev);
3936 }
752c683d 3937
e5b03032
ML
3938 /* disable all interrupts */
3939 amdgpu_irq_disable_all(adev);
ff97cba8 3940 if (adev->mode_info.mode_config_initialized){
1053b9c9 3941 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
4a580877 3942 drm_helper_force_disable_all(adev_to_drm(adev));
ff97cba8 3943 else
4a580877 3944 drm_atomic_helper_shutdown(adev_to_drm(adev));
ff97cba8 3945 }
8d35a259 3946 amdgpu_fence_driver_hw_fini(adev);
72c8c97b 3947
98f56188
YY
3948 if (adev->mman.initialized) {
3949 flush_delayed_work(&adev->mman.bdev.wq);
3950 ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
3951 }
3952
7c868b59
YT
3953 if (adev->pm_sysfs_en)
3954 amdgpu_pm_sysfs_fini(adev);
72c8c97b
AG
3955 if (adev->ucode_sysfs_en)
3956 amdgpu_ucode_sysfs_fini(adev);
8424f2cc
LG
3957 if (adev->psp_sysfs_en)
3958 amdgpu_psp_sysfs_fini(adev);
72c8c97b
AG
3959 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
3960
232d1d43
SY
3961 /* disable ras feature must before hw fini */
3962 amdgpu_ras_pre_fini(adev);
3963
e9669fb7 3964 amdgpu_device_ip_fini_early(adev);
d10d0daa 3965
a3848df6
YW
3966 amdgpu_irq_fini_hw(adev);
3967
b6fd6e0f
SK
3968 if (adev->mman.initialized)
3969 ttm_device_clear_dma_mappings(&adev->mman.bdev);
894c6890 3970
d10d0daa 3971 amdgpu_gart_dummy_page_fini(adev);
07775fc1 3972
87172e89
LS
3973 if (drm_dev_is_unplugged(adev_to_drm(adev)))
3974 amdgpu_device_unmap_mmio(adev);
3975
72c8c97b
AG
3976}
3977
3978void amdgpu_device_fini_sw(struct amdgpu_device *adev)
3979{
62d5f9f7
LS
3980 int idx;
3981
8d35a259 3982 amdgpu_fence_driver_sw_fini(adev);
a5c5d8d5 3983 amdgpu_device_ip_fini(adev);
75e1658e
ND
3984 release_firmware(adev->firmware.gpu_info_fw);
3985 adev->firmware.gpu_info_fw = NULL;
d38ceaf9 3986 adev->accel_working = false;
04442bf7
LL
3987
3988 amdgpu_reset_fini(adev);
3989
d38ceaf9 3990 /* free i2c buses */
4562236b
HW
3991 if (!amdgpu_device_has_dc_support(adev))
3992 amdgpu_i2c_fini(adev);
bfca0289
SL
3993
3994 if (amdgpu_emu_mode != 1)
3995 amdgpu_atombios_fini(adev);
3996
d38ceaf9
AD
3997 kfree(adev->bios);
3998 adev->bios = NULL;
b98c6299 3999 if (amdgpu_device_supports_px(adev_to_drm(adev))) {
84c8b22e 4000 vga_switcheroo_unregister_client(adev->pdev);
83ba126a 4001 vga_switcheroo_fini_domain_pm_ops(adev->dev);
b98c6299 4002 }
38d6be81 4003 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
b8779475 4004 vga_client_unregister(adev->pdev);
e9bc1bf7 4005
62d5f9f7
LS
4006 if (drm_dev_enter(adev_to_drm(adev), &idx)) {
4007
4008 iounmap(adev->rmmio);
4009 adev->rmmio = NULL;
4010 amdgpu_device_doorbell_fini(adev);
4011 drm_dev_exit(idx);
4012 }
4013
d155bef0
AB
4014 if (IS_ENABLED(CONFIG_PERF_EVENTS))
4015 amdgpu_pmu_fini(adev);
72de33f8 4016 if (adev->mman.discovery_bin)
a190d1c7 4017 amdgpu_discovery_fini(adev);
72c8c97b 4018
cfbb6b00
AG
4019 amdgpu_reset_put_reset_domain(adev->reset_domain);
4020 adev->reset_domain = NULL;
4021
72c8c97b
AG
4022 kfree(adev->pci_state);
4023
d38ceaf9
AD
4024}
4025
58144d28
ND
4026/**
4027 * amdgpu_device_evict_resources - evict device resources
4028 * @adev: amdgpu device object
4029 *
 4030 * Evicts all ttm device resources (vram BOs, gart table) from the lru list
4031 * of the vram memory type. Mainly used for evicting device resources
4032 * at suspend time.
4033 *
4034 */
4035static void amdgpu_device_evict_resources(struct amdgpu_device *adev)
4036{
e53d9665
ML
4037 /* No need to evict vram on APUs for suspend to ram or s2idle */
4038 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU))
58144d28
ND
4039 return;
4040
4041 if (amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM))
4042 DRM_WARN("evicting device resources failed\n");
4043
4044}
d38ceaf9
AD
4045
4046/*
4047 * Suspend & resume.
4048 */
4049/**
810ddc3a 4050 * amdgpu_device_suspend - initiate device suspend
d38ceaf9 4051 *
87e3f136 4052 * @dev: drm dev pointer
87e3f136 4053 * @fbcon : notify the fbdev of suspend
d38ceaf9
AD
4054 *
4055 * Puts the hw in the suspend state (all asics).
4056 * Returns 0 for success or an error on failure.
4057 * Called at driver suspend.
4058 */
de185019 4059int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
d38ceaf9 4060{
a2e15b0e 4061 struct amdgpu_device *adev = drm_to_adev(dev);
d38ceaf9 4062
d38ceaf9
AD
4063 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4064 return 0;
4065
44779b43 4066 adev->in_suspend = true;
3fa8f89d
S
4067
4068 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
4069 DRM_WARN("smart shift update failed\n");
4070
d38ceaf9
AD
4071 drm_kms_helper_poll_disable(dev);
4072
5f818173 4073 if (fbcon)
087451f3 4074 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true);
5f818173 4075
beff74bc 4076 cancel_delayed_work_sync(&adev->delayed_init_work);
a5459475 4077
5e6932fe 4078 amdgpu_ras_suspend(adev);
4079
2196927b 4080 amdgpu_device_ip_suspend_phase1(adev);
fe1053b7 4081
c004d44e 4082 if (!adev->in_s0ix)
5d3a2d95 4083 amdgpu_amdkfd_suspend(adev, adev->in_runpm);
94fa5660 4084
58144d28 4085 amdgpu_device_evict_resources(adev);
d38ceaf9 4086
8d35a259 4087 amdgpu_fence_driver_hw_fini(adev);
d38ceaf9 4088
2196927b 4089 amdgpu_device_ip_suspend_phase2(adev);
d38ceaf9 4090
d38ceaf9
AD
4091 return 0;
4092}
4093
4094/**
810ddc3a 4095 * amdgpu_device_resume - initiate device resume
d38ceaf9 4096 *
87e3f136 4097 * @dev: drm dev pointer
87e3f136 4098 * @fbcon : notify the fbdev of resume
d38ceaf9
AD
4099 *
4100 * Bring the hw back to operating state (all asics).
4101 * Returns 0 for success or an error on failure.
4102 * Called at driver resume.
4103 */
de185019 4104int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
d38ceaf9 4105{
1348969a 4106 struct amdgpu_device *adev = drm_to_adev(dev);
03161a6e 4107 int r = 0;
d38ceaf9
AD
4108
4109 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4110 return 0;
4111
62498733 4112 if (adev->in_s0ix)
bc143d8b 4113 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry);
628c36d7 4114
d38ceaf9 4115 /* post card */
39c640c0 4116 if (amdgpu_device_need_post(adev)) {
4d2997ab 4117 r = amdgpu_device_asic_init(adev);
74b0b157 4118 if (r)
aac89168 4119 dev_err(adev->dev, "amdgpu asic init failed\n");
74b0b157 4120 }
d38ceaf9 4121
06ec9070 4122 r = amdgpu_device_ip_resume(adev);
e6707218 4123 if (r) {
aac89168 4124 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
4d3b9ae5 4125 return r;
e6707218 4126 }
8d35a259 4127 amdgpu_fence_driver_hw_init(adev);
5ceb54c6 4128
06ec9070 4129 r = amdgpu_device_ip_late_init(adev);
03161a6e 4130 if (r)
4d3b9ae5 4131 return r;
d38ceaf9 4132
beff74bc
AD
4133 queue_delayed_work(system_wq, &adev->delayed_init_work,
4134 msecs_to_jiffies(AMDGPU_RESUME_MS));
4135
c004d44e 4136 if (!adev->in_s0ix) {
5d3a2d95
AD
4137 r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
4138 if (r)
4139 return r;
4140 }
756e6880 4141
96a5d8d4 4142 /* Make sure IB tests flushed */
beff74bc 4143 flush_delayed_work(&adev->delayed_init_work);
96a5d8d4 4144
a2e15b0e 4145 if (fbcon)
087451f3 4146 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false);
d38ceaf9
AD
4147
4148 drm_kms_helper_poll_enable(dev);
23a1a9e5 4149
5e6932fe 4150 amdgpu_ras_resume(adev);
4151
23a1a9e5
L
4152 /*
4153 * Most of the connector probing functions try to acquire runtime pm
4154 * refs to ensure that the GPU is powered on when connector polling is
4155 * performed. Since we're calling this from a runtime PM callback,
4156 * trying to acquire rpm refs will cause us to deadlock.
4157 *
4158 * Since we're guaranteed to be holding the rpm lock, it's safe to
4159 * temporarily disable the rpm helpers so this doesn't deadlock us.
4160 */
4161#ifdef CONFIG_PM
4162 dev->dev->power.disable_depth++;
4163#endif
4562236b
HW
4164 if (!amdgpu_device_has_dc_support(adev))
4165 drm_helper_hpd_irq_event(dev);
4166 else
4167 drm_kms_helper_hotplug_event(dev);
23a1a9e5
L
4168#ifdef CONFIG_PM
4169 dev->dev->power.disable_depth--;
4170#endif
44779b43
RZ
4171 adev->in_suspend = false;
4172
3fa8f89d
S
4173 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
4174 DRM_WARN("smart shift update failed\n");
4175
4d3b9ae5 4176 return 0;
d38ceaf9
AD
4177}
4178
e3ecdffa
AD
4179/**
4180 * amdgpu_device_ip_check_soft_reset - did soft reset succeed
4181 *
4182 * @adev: amdgpu_device pointer
4183 *
4184 * The list of all the hardware IPs that make up the asic is walked and
4185 * the check_soft_reset callbacks are run. check_soft_reset determines
4186 * if the asic is still hung or not.
4187 * Returns true if any of the IPs are still in a hung state, false if not.
4188 */
06ec9070 4189static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
63fbf42f
CZ
4190{
4191 int i;
4192 bool asic_hang = false;
4193
f993d628
ML
4194 if (amdgpu_sriov_vf(adev))
4195 return true;
4196
8bc04c29
AD
4197 if (amdgpu_asic_need_full_reset(adev))
4198 return true;
4199
63fbf42f 4200 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 4201 if (!adev->ip_blocks[i].status.valid)
63fbf42f 4202 continue;
a1255107
AD
4203 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
4204 adev->ip_blocks[i].status.hang =
4205 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
4206 if (adev->ip_blocks[i].status.hang) {
aac89168 4207 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
63fbf42f
CZ
4208 asic_hang = true;
4209 }
4210 }
4211 return asic_hang;
4212}
4213
e3ecdffa
AD
4214/**
4215 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
4216 *
4217 * @adev: amdgpu_device pointer
4218 *
4219 * The list of all the hardware IPs that make up the asic is walked and the
4220 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset
4221 * handles any IP specific hardware or software state changes that are
4222 * necessary for a soft reset to succeed.
4223 * Returns 0 on success, negative error code on failure.
4224 */
06ec9070 4225static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
d31a501e
CZ
4226{
4227 int i, r = 0;
4228
4229 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 4230 if (!adev->ip_blocks[i].status.valid)
d31a501e 4231 continue;
a1255107
AD
4232 if (adev->ip_blocks[i].status.hang &&
4233 adev->ip_blocks[i].version->funcs->pre_soft_reset) {
4234 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
d31a501e
CZ
4235 if (r)
4236 return r;
4237 }
4238 }
4239
4240 return 0;
4241}
4242
e3ecdffa
AD
4243/**
4244 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
4245 *
4246 * @adev: amdgpu_device pointer
4247 *
4248 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu
4249 * reset is necessary to recover.
4250 * Returns true if a full asic reset is required, false if not.
4251 */
06ec9070 4252static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
35d782fe 4253{
da146d3b
AD
4254 int i;
4255
8bc04c29
AD
4256 if (amdgpu_asic_need_full_reset(adev))
4257 return true;
4258
da146d3b 4259 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 4260 if (!adev->ip_blocks[i].status.valid)
da146d3b 4261 continue;
a1255107
AD
4262 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
4263 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
4264 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
98512bb8
KW
4265 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
4266 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
a1255107 4267 if (adev->ip_blocks[i].status.hang) {
aac89168 4268 dev_info(adev->dev, "Some block need full reset!\n");
da146d3b
AD
4269 return true;
4270 }
4271 }
35d782fe
CZ
4272 }
4273 return false;
4274}
4275
e3ecdffa
AD
4276/**
4277 * amdgpu_device_ip_soft_reset - do a soft reset
4278 *
4279 * @adev: amdgpu_device pointer
4280 *
4281 * The list of all the hardware IPs that make up the asic is walked and the
4282 * soft_reset callbacks are run if the block is hung. soft_reset handles any
4283 * IP specific hardware or software state changes that are necessary to soft
4284 * reset the IP.
4285 * Returns 0 on success, negative error code on failure.
4286 */
06ec9070 4287static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
35d782fe
CZ
4288{
4289 int i, r = 0;
4290
4291 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 4292 if (!adev->ip_blocks[i].status.valid)
35d782fe 4293 continue;
a1255107
AD
4294 if (adev->ip_blocks[i].status.hang &&
4295 adev->ip_blocks[i].version->funcs->soft_reset) {
4296 r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
35d782fe
CZ
4297 if (r)
4298 return r;
4299 }
4300 }
4301
4302 return 0;
4303}
4304
e3ecdffa
AD
4305/**
4306 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
4307 *
4308 * @adev: amdgpu_device pointer
4309 *
4310 * The list of all the hardware IPs that make up the asic is walked and the
4311 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
4312 * handles any IP specific hardware or software state changes that are
4313 * necessary after the IP has been soft reset.
4314 * Returns 0 on success, negative error code on failure.
4315 */
06ec9070 4316static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
35d782fe
CZ
4317{
4318 int i, r = 0;
4319
4320 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 4321 if (!adev->ip_blocks[i].status.valid)
35d782fe 4322 continue;
a1255107
AD
4323 if (adev->ip_blocks[i].status.hang &&
4324 adev->ip_blocks[i].version->funcs->post_soft_reset)
4325 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
35d782fe
CZ
4326 if (r)
4327 return r;
4328 }
4329
4330 return 0;
4331}
4332
e3ecdffa 4333/**
c33adbc7 4334 * amdgpu_device_recover_vram - Recover some VRAM contents
e3ecdffa
AD
4335 *
4336 * @adev: amdgpu_device pointer
4337 *
4338 * Restores the contents of VRAM buffers from the shadows in GTT. Used to
4339 * restore things like GPUVM page tables after a GPU reset where
4340 * the contents of VRAM might be lost.
403009bf
CK
4341 *
4342 * Returns:
4343 * 0 on success, negative error code on failure.
e3ecdffa 4344 */
c33adbc7 4345static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
c41d1cf6 4346{
c41d1cf6 4347 struct dma_fence *fence = NULL, *next = NULL;
403009bf 4348 struct amdgpu_bo *shadow;
e18aaea7 4349 struct amdgpu_bo_vm *vmbo;
403009bf 4350 long r = 1, tmo;
c41d1cf6
ML
4351
4352 if (amdgpu_sriov_runtime(adev))
b045d3af 4353 tmo = msecs_to_jiffies(8000);
c41d1cf6
ML
4354 else
4355 tmo = msecs_to_jiffies(100);
4356
aac89168 4357 dev_info(adev->dev, "recover vram bo from shadow start\n");
c41d1cf6 4358 mutex_lock(&adev->shadow_list_lock);
e18aaea7
ND
4359 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) {
4360 shadow = &vmbo->bo;
403009bf 4361 /* No need to recover an evicted BO */
d3116756
CK
4362 if (shadow->tbo.resource->mem_type != TTM_PL_TT ||
4363 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET ||
4364 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM)
403009bf
CK
4365 continue;
4366
4367 r = amdgpu_bo_restore_shadow(shadow, &next);
4368 if (r)
4369 break;
4370
c41d1cf6 4371 if (fence) {
1712fb1a 4372 tmo = dma_fence_wait_timeout(fence, false, tmo);
403009bf
CK
4373 dma_fence_put(fence);
4374 fence = next;
1712fb1a 4375 if (tmo == 0) {
4376 r = -ETIMEDOUT;
c41d1cf6 4377 break;
1712fb1a 4378 } else if (tmo < 0) {
4379 r = tmo;
4380 break;
4381 }
403009bf
CK
4382 } else {
4383 fence = next;
c41d1cf6 4384 }
c41d1cf6
ML
4385 }
4386 mutex_unlock(&adev->shadow_list_lock);
4387
403009bf
CK
4388 if (fence)
4389 tmo = dma_fence_wait_timeout(fence, false, tmo);
c41d1cf6
ML
4390 dma_fence_put(fence);
4391
1712fb1a 4392 if (r < 0 || tmo <= 0) {
aac89168 4393 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
403009bf
CK
4394 return -EIO;
4395 }
c41d1cf6 4396
aac89168 4397 dev_info(adev->dev, "recover vram bo from shadow done\n");
403009bf 4398 return 0;
c41d1cf6
ML
4399}
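/*
 * Flow sketch of the loop above: each shadow BO still resident in GTT is
 * copied back to its VRAM parent via amdgpu_bo_restore_shadow(), which
 * returns a fence; the previous fence is waited on while the next copy
 * is queued, and the last fence is waited on after the loop, all bounded
 * by the 8 s (SR-IOV runtime) or 100 ms budget held in tmo.
 */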
4400
a90ad3c2 4401
e3ecdffa 4402/**
06ec9070 4403 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
5740682e 4404 *
982a820b 4405 * @adev: amdgpu_device pointer
87e3f136 4406 * @from_hypervisor: request from hypervisor
5740682e
ML
4407 *
 4408 * do a VF FLR and reinitialize the ASIC
3f48c681 4409 * return 0 means succeeded, otherwise failed
e3ecdffa
AD
4410 */
4411static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4412 bool from_hypervisor)
5740682e
ML
4413{
4414 int r;
a5f67c93 4415 struct amdgpu_hive_info *hive = NULL;
7258fa31 4416 int retry_limit = 0;
5740682e 4417
7258fa31 4418retry:
c004d44e 4419 amdgpu_amdkfd_pre_reset(adev);
5740682e 4420
428890a3 4421 amdgpu_amdkfd_pre_reset(adev);
4422
5740682e
ML
4423 if (from_hypervisor)
4424 r = amdgpu_virt_request_full_gpu(adev, true);
4425 else
4426 r = amdgpu_virt_reset_gpu(adev);
4427 if (r)
4428 return r;
a90ad3c2
ML
4429
4430 /* Resume IP prior to SMC */
06ec9070 4431 r = amdgpu_device_ip_reinit_early_sriov(adev);
5740682e
ML
4432 if (r)
4433 goto error;
a90ad3c2 4434
c9ffa427 4435 amdgpu_virt_init_data_exchange(adev);
a90ad3c2 4436
7a3e0bb2
RZ
4437 r = amdgpu_device_fw_loading(adev);
4438 if (r)
4439 return r;
4440
a90ad3c2 4441 /* now we are okay to resume SMC/CP/SDMA */
06ec9070 4442 r = amdgpu_device_ip_reinit_late_sriov(adev);
5740682e
ML
4443 if (r)
4444 goto error;
a90ad3c2 4445
a5f67c93
ZL
4446 hive = amdgpu_get_xgmi_hive(adev);
4447 /* Update PSP FW topology after reset */
4448 if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
4449 r = amdgpu_xgmi_update_topology(hive, adev);
4450
4451 if (hive)
4452 amdgpu_put_xgmi_hive(hive);
4453
4454 if (!r) {
4455 amdgpu_irq_gpu_reset_resume_helper(adev);
4456 r = amdgpu_ib_ring_tests(adev);
9c12f5cd 4457
c004d44e 4458 amdgpu_amdkfd_post_reset(adev);
a5f67c93 4459 }
a90ad3c2 4460
abc34253 4461error:
c41d1cf6 4462 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
e3526257 4463 amdgpu_inc_vram_lost(adev);
c33adbc7 4464 r = amdgpu_device_recover_vram(adev);
a90ad3c2 4465 }
437f3e0b 4466 amdgpu_virt_release_full_gpu(adev, true);
a90ad3c2 4467
7258fa31
SK
4468 if (AMDGPU_RETRY_SRIOV_RESET(r)) {
4469 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) {
4470 retry_limit++;
4471 goto retry;
4472 } else
4473 DRM_ERROR("GPU reset retry is beyond the retry limit\n");
4474 }
4475
a90ad3c2
ML
4476 return r;
4477}
4478
9a1cddd6 4479/**
 4480 * amdgpu_device_has_job_running - check if there is any job in the pending list
4481 *
982a820b 4482 * @adev: amdgpu_device pointer
9a1cddd6 4483 *
 4484 * check if there is any job in the pending list
4485 */
4486bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4487{
4488 int i;
4489 struct drm_sched_job *job;
4490
4491 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4492 struct amdgpu_ring *ring = adev->rings[i];
4493
4494 if (!ring || !ring->sched.thread)
4495 continue;
4496
4497 spin_lock(&ring->sched.job_list_lock);
6efa4b46
LT
4498 job = list_first_entry_or_null(&ring->sched.pending_list,
4499 struct drm_sched_job, list);
9a1cddd6 4500 spin_unlock(&ring->sched.job_list_lock);
4501 if (job)
4502 return true;
4503 }
4504 return false;
4505}
4506
12938fad
CK
4507/**
4508 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4509 *
982a820b 4510 * @adev: amdgpu_device pointer
12938fad
CK
4511 *
4512 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4513 * a hung GPU.
4514 */
4515bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4516{
4517 if (!amdgpu_device_ip_check_soft_reset(adev)) {
aac89168 4518 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
12938fad
CK
4519 return false;
4520 }
4521
3ba7b418
AG
4522 if (amdgpu_gpu_recovery == 0)
4523 goto disabled;
4524
4525 if (amdgpu_sriov_vf(adev))
4526 return true;
4527
4528 if (amdgpu_gpu_recovery == -1) {
4529 switch (adev->asic_type) {
b3523c45
AD
4530#ifdef CONFIG_DRM_AMDGPU_SI
4531 case CHIP_VERDE:
4532 case CHIP_TAHITI:
4533 case CHIP_PITCAIRN:
4534 case CHIP_OLAND:
4535 case CHIP_HAINAN:
4536#endif
4537#ifdef CONFIG_DRM_AMDGPU_CIK
4538 case CHIP_KAVERI:
4539 case CHIP_KABINI:
4540 case CHIP_MULLINS:
4541#endif
4542 case CHIP_CARRIZO:
4543 case CHIP_STONEY:
4544 case CHIP_CYAN_SKILLFISH:
3ba7b418 4545 goto disabled;
b3523c45
AD
4546 default:
4547 break;
3ba7b418 4548 }
12938fad
CK
4549 }
4550
4551 return true;
3ba7b418
AG
4552
4553disabled:
aac89168 4554 dev_info(adev->dev, "GPU recovery disabled.\n");
3ba7b418 4555 return false;
12938fad
CK
4556}
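/*
 * In effect amdgpu_gpu_recovery behaves as a tristate module parameter
 * here: 0 disables recovery, a positive value forces it on, and -1 (auto)
 * defers to the per-ASIC table above, where SI/CIK-era parts and a few
 * others opt out; SR-IOV VFs attempt recovery unless it is explicitly
 * disabled.
 */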
4557
5c03e584
FX
4558int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
4559{
4560 u32 i;
4561 int ret = 0;
4562
4563 amdgpu_atombios_scratch_regs_engine_hung(adev, true);
4564
4565 dev_info(adev->dev, "GPU mode1 reset\n");
4566
4567 /* disable BM */
4568 pci_clear_master(adev->pdev);
4569
4570 amdgpu_device_cache_pci_state(adev->pdev);
4571
4572 if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
4573 dev_info(adev->dev, "GPU smu mode1 reset\n");
4574 ret = amdgpu_dpm_mode1_reset(adev);
4575 } else {
4576 dev_info(adev->dev, "GPU psp mode1 reset\n");
4577 ret = psp_gpu_reset(adev);
4578 }
4579
4580 if (ret)
4581 dev_err(adev->dev, "GPU mode1 reset failed\n");
4582
4583 amdgpu_device_load_pci_state(adev->pdev);
4584
4585 /* wait for asic to come out of reset */
4586 for (i = 0; i < adev->usec_timeout; i++) {
4587 u32 memsize = adev->nbio.funcs->get_memsize(adev);
4588
4589 if (memsize != 0xffffffff)
4590 break;
4591 udelay(1);
4592 }
4593
4594 amdgpu_atombios_scratch_regs_engine_hung(adev, false);
4595 return ret;
4596}
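/*
 * Note (assumption): get_memsize() returning 0xffffffff is the usual
 * "register reads are still blocked" pattern, so the poll above simply
 * spins, up to adev->usec_timeout microseconds, until MMIO decode comes
 * back after the mode1 reset before the scratch regs are restored.
 */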
5c6dd71e 4597
e3c1b071 4598int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
04442bf7 4599 struct amdgpu_reset_context *reset_context)
26bc5340 4600{
5c1e6fa4 4601 int i, r = 0;
04442bf7
LL
4602 struct amdgpu_job *job = NULL;
4603 bool need_full_reset =
4604 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4605
4606 if (reset_context->reset_req_dev == adev)
4607 job = reset_context->job;
71182665 4608
b602ca5f
TZ
4609 if (amdgpu_sriov_vf(adev)) {
4610 /* stop the data exchange thread */
4611 amdgpu_virt_fini_data_exchange(adev);
4612 }
4613
71182665 4614 /* block all schedulers and reset given job's ring */
0875dc9e
CZ
4615 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4616 struct amdgpu_ring *ring = adev->rings[i];
4617
51687759 4618 if (!ring || !ring->sched.thread)
0875dc9e 4619 continue;
5740682e 4620
c530b02f
JZ
 4621 /* clear job fences from the fence drv to avoid force_completion;
 4622 * leave NULL and the vm flush fence in the fence drv */
5c1e6fa4 4623 amdgpu_fence_driver_clear_job_fences(ring);
c530b02f 4624
2f9d4084
ML
4625 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4626 amdgpu_fence_driver_force_completion(ring);
0875dc9e 4627 }
d38ceaf9 4628
ff99849b 4629 if (job && job->vm)
222b5f04
AG
4630 drm_sched_increase_karma(&job->base);
4631
04442bf7 4632 r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
404b277b
LL
4633 /* If reset handler not implemented, continue; otherwise return */
4634 if (r == -ENOSYS)
4635 r = 0;
4636 else
04442bf7
LL
4637 return r;
4638
1d721ed6 4639 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
26bc5340
AG
4640 if (!amdgpu_sriov_vf(adev)) {
4641
4642 if (!need_full_reset)
4643 need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4644
4645 if (!need_full_reset) {
4646 amdgpu_device_ip_pre_soft_reset(adev);
4647 r = amdgpu_device_ip_soft_reset(adev);
4648 amdgpu_device_ip_post_soft_reset(adev);
4649 if (r || amdgpu_device_ip_check_soft_reset(adev)) {
aac89168 4650 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
26bc5340
AG
4651 need_full_reset = true;
4652 }
4653 }
4654
4655 if (need_full_reset)
4656 r = amdgpu_device_ip_suspend(adev);
04442bf7
LL
4657 if (need_full_reset)
4658 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4659 else
4660 clear_bit(AMDGPU_NEED_FULL_RESET,
4661 &reset_context->flags);
26bc5340
AG
4662 }
4663
4664 return r;
4665}
4666
15fd09a0
SA
4667static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
4668{
4669 uint32_t reg_value;
4670 int i;
4671
38a15ad9 4672 lockdep_assert_held(&adev->reset_domain->sem);
15fd09a0
SA
4673 dump_stack();
4674
4675 for (i = 0; i < adev->num_regs; i++) {
4676 reg_value = RREG32(adev->reset_dump_reg_list[i]);
4677 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], reg_value);
4678 }
4679
4680 return 0;
4681}
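/*
 * The values captured above are emitted through the
 * trace_amdgpu_reset_reg_dumps tracepoint, so after a reset they can be
 * read back from the ftrace ring buffer (e.g. debugfs tracing/trace)
 * alongside the dump_stack() output in dmesg.
 */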
4682
04442bf7
LL
4683int amdgpu_do_asic_reset(struct list_head *device_list_handle,
4684 struct amdgpu_reset_context *reset_context)
26bc5340
AG
4685{
4686 struct amdgpu_device *tmp_adev = NULL;
04442bf7 4687 bool need_full_reset, skip_hw_reset, vram_lost = false;
26bc5340
AG
4688 int r = 0;
4689
04442bf7
LL
4690 /* Try reset handler method first */
4691 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
4692 reset_list);
15fd09a0 4693 amdgpu_reset_reg_dumps(tmp_adev);
04442bf7 4694 r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
404b277b
LL
4695 /* If reset handler not implemented, continue; otherwise return */
4696 if (r == -ENOSYS)
4697 r = 0;
4698 else
04442bf7
LL
4699 return r;
4700
4701 /* Reset handler not implemented, use the default method */
4702 need_full_reset =
4703 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4704 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
4705
26bc5340 4706 /*
655ce9cb 4707 * ASIC reset has to be done on all XGMI hive nodes ASAP
26bc5340
AG
4708 * to allow proper links negotiation in FW (within 1 sec)
4709 */
7ac71382 4710 if (!skip_hw_reset && need_full_reset) {
655ce9cb 4711 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
041a62bc 4712 /* For XGMI run all resets in parallel to speed up the process */
d4535e2c 4713 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
e3c1b071 4714 tmp_adev->gmc.xgmi.pending_reset = false;
c96cf282 4715 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
d4535e2c
AG
4716 r = -EALREADY;
4717 } else
4718 r = amdgpu_asic_reset(tmp_adev);
d4535e2c 4719
041a62bc 4720 if (r) {
aac89168 4721 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
4a580877 4722 r, adev_to_drm(tmp_adev)->unique);
041a62bc 4723 break;
ce316fa5
LM
4724 }
4725 }
4726
041a62bc
AG
4727 /* For XGMI wait for all resets to complete before proceed */
4728 if (!r) {
655ce9cb 4729 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
ce316fa5
LM
4730 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4731 flush_work(&tmp_adev->xgmi_reset_work);
4732 r = tmp_adev->asic_reset_res;
4733 if (r)
4734 break;
ce316fa5
LM
4735 }
4736 }
4737 }
ce316fa5 4738 }
26bc5340 4739
43c4d576 4740 if (!r && amdgpu_ras_intr_triggered()) {
655ce9cb 4741 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5e67bba3 4742 if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops &&
4743 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
4744 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev);
43c4d576
JC
4745 }
4746
00eaa571 4747 amdgpu_ras_intr_cleared();
43c4d576 4748 }
00eaa571 4749
655ce9cb 4750 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
26bc5340
AG
4751 if (need_full_reset) {
4752 /* post card */
e3c1b071 4753 r = amdgpu_device_asic_init(tmp_adev);
4754 if (r) {
aac89168 4755 dev_warn(tmp_adev->dev, "asic atom init failed!");
e3c1b071 4756 } else {
26bc5340 4757 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
9cec53c1
JZ
4758 r = amdgpu_amdkfd_resume_iommu(tmp_adev);
4759 if (r)
4760 goto out;
4761
26bc5340
AG
4762 r = amdgpu_device_ip_resume_phase1(tmp_adev);
4763 if (r)
4764 goto out;
4765
4766 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4767 if (vram_lost) {
77e7f829 4768 DRM_INFO("VRAM is lost due to GPU reset!\n");
e3526257 4769 amdgpu_inc_vram_lost(tmp_adev);
26bc5340
AG
4770 }
4771
26bc5340
AG
4772 r = amdgpu_device_fw_loading(tmp_adev);
4773 if (r)
4774 return r;
4775
4776 r = amdgpu_device_ip_resume_phase2(tmp_adev);
4777 if (r)
4778 goto out;
4779
4780 if (vram_lost)
4781 amdgpu_device_fill_reset_magic(tmp_adev);
4782
fdafb359
EQ
4783 /*
4784 * Add this ASIC as tracked as reset was already
4785 * complete successfully.
4786 */
4787 amdgpu_register_gpu_instance(tmp_adev);
4788
04442bf7
LL
4789 if (!reset_context->hive &&
4790 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
e3c1b071 4791 amdgpu_xgmi_add_device(tmp_adev);
4792
7c04ca50 4793 r = amdgpu_device_ip_late_init(tmp_adev);
4794 if (r)
4795 goto out;
4796
087451f3 4797 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false);
565d1941 4798
e8fbaf03
GC
4799 /*
4800 * The GPU enters bad state once faulty pages
4801 * by ECC has reached the threshold, and ras
4802 * recovery is scheduled next. So add one check
4803 * here to break recovery if it indeed exceeds
 4805 * bad page threshold, and remind the user to
 4806 * retire this GPU or set a bigger
 4807 * bad_page_threshold value to fix this when
 4808 * probing the driver again.
4808 */
11003c68 4809 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
e8fbaf03
GC
4810 /* must succeed. */
4811 amdgpu_ras_resume(tmp_adev);
4812 } else {
4813 r = -EINVAL;
4814 goto out;
4815 }
e79a04d5 4816
26bc5340 4817 /* Update PSP FW topology after reset */
04442bf7
LL
4818 if (reset_context->hive &&
4819 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4820 r = amdgpu_xgmi_update_topology(
4821 reset_context->hive, tmp_adev);
26bc5340
AG
4822 }
4823 }
4824
26bc5340
AG
4825out:
4826 if (!r) {
4827 amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4828 r = amdgpu_ib_ring_tests(tmp_adev);
4829 if (r) {
4830 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
26bc5340
AG
4831 need_full_reset = true;
4832 r = -EAGAIN;
4833 goto end;
4834 }
4835 }
4836
4837 if (!r)
4838 r = amdgpu_device_recover_vram(tmp_adev);
4839 else
4840 tmp_adev->asic_reset_res = r;
4841 }
4842
4843end:
04442bf7
LL
4844 if (need_full_reset)
4845 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4846 else
4847 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
26bc5340
AG
4848 return r;
4849}
4850
e923be99 4851static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
26bc5340 4852{
5740682e 4853
a3a09142
AD
4854 switch (amdgpu_asic_reset_method(adev)) {
4855 case AMD_RESET_METHOD_MODE1:
4856 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
4857 break;
4858 case AMD_RESET_METHOD_MODE2:
4859 adev->mp1_state = PP_MP1_STATE_RESET;
4860 break;
4861 default:
4862 adev->mp1_state = PP_MP1_STATE_NONE;
4863 break;
4864 }
26bc5340 4865}
d38ceaf9 4866
e923be99 4867static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
26bc5340 4868{
89041940 4869 amdgpu_vf_error_trans_all(adev);
a3a09142 4870 adev->mp1_state = PP_MP1_STATE_NONE;
91fb309d
HC
4871}
4872
3f12acc8
EQ
4873static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
4874{
4875 struct pci_dev *p = NULL;
4876
4877 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4878 adev->pdev->bus->number, 1);
4879 if (p) {
4880 pm_runtime_enable(&(p->dev));
4881 pm_runtime_resume(&(p->dev));
4882 }
4883}
4884
4885static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
4886{
4887 enum amd_reset_method reset_method;
4888 struct pci_dev *p = NULL;
4889 u64 expires;
4890
4891 /*
4892 * For now, only BACO and mode1 reset are confirmed
4893 * to suffer the audio issue without proper suspended.
4894 */
4895 reset_method = amdgpu_asic_reset_method(adev);
4896 if ((reset_method != AMD_RESET_METHOD_BACO) &&
4897 (reset_method != AMD_RESET_METHOD_MODE1))
4898 return -EINVAL;
4899
4900 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4901 adev->pdev->bus->number, 1);
4902 if (!p)
4903 return -ENODEV;
4904
4905 expires = pm_runtime_autosuspend_expiration(&(p->dev));
4906 if (!expires)
4907 /*
4908 * If we cannot get the audio device autosuspend delay,
 4909 * a fixed 4s interval will be used. Since 3s is the audio
 4910 * controller's default autosuspend delay setting, the 4s
 4911 * used here is guaranteed to cover that.
4912 */
54b7feb9 4913 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
3f12acc8
EQ
4914
4915 while (!pm_runtime_status_suspended(&(p->dev))) {
4916 if (!pm_runtime_suspend(&(p->dev)))
4917 break;
4918
4919 if (expires < ktime_get_mono_fast_ns()) {
4920 dev_warn(adev->dev, "failed to suspend display audio\n");
4921 /* TODO: abort the succeeding gpu reset? */
4922 return -ETIMEDOUT;
4923 }
4924 }
4925
4926 pm_runtime_disable(&(p->dev));
4927
4928 return 0;
4929}
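/*
 * Pairing note: pm_runtime_disable() at the end keeps the audio function
 * suspended for the whole reset; amdgpu_device_resume_display_audio()
 * above re-enables and resumes it once the GPU reset has completed.
 */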
4930
9d8d96be 4931static void amdgpu_device_recheck_guilty_jobs(
04442bf7
LL
4932 struct amdgpu_device *adev, struct list_head *device_list_handle,
4933 struct amdgpu_reset_context *reset_context)
e6c6338f
JZ
4934{
4935 int i, r = 0;
4936
4937 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4938 struct amdgpu_ring *ring = adev->rings[i];
4939 int ret = 0;
4940 struct drm_sched_job *s_job;
4941
4942 if (!ring || !ring->sched.thread)
4943 continue;
4944
4945 s_job = list_first_entry_or_null(&ring->sched.pending_list,
4946 struct drm_sched_job, list);
4947 if (s_job == NULL)
4948 continue;
4949
 4950 /* clear the job's guilty and depend on the following step to decide the real one */
4951 drm_sched_reset_karma(s_job);
38d4e463
JC
4952 /* for the real bad job, it will be resubmitted twice, adding a dma_fence_get
4953 * to make sure fence is balanced */
4954 dma_fence_get(s_job->s_fence->parent);
e6c6338f
JZ
4955 drm_sched_resubmit_jobs_ext(&ring->sched, 1);
4956
4957 ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout);
4958 if (ret == 0) { /* timeout */
4959 DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n",
4960 ring->sched.name, s_job->id);
4961
4962 /* set guilty */
4963 drm_sched_increase_karma(s_job);
4964retry:
4965 /* do hw reset */
4966 if (amdgpu_sriov_vf(adev)) {
4967 amdgpu_virt_fini_data_exchange(adev);
4968 r = amdgpu_device_reset_sriov(adev, false);
4969 if (r)
4970 adev->asic_reset_res = r;
4971 } else {
04442bf7
LL
4972 clear_bit(AMDGPU_SKIP_HW_RESET,
4973 &reset_context->flags);
4974 r = amdgpu_do_asic_reset(device_list_handle,
4975 reset_context);
e6c6338f
JZ
4976 if (r && r == -EAGAIN)
4977 goto retry;
4978 }
4979
4980 /*
4981 * add reset counter so that the following
4982 * resubmitted job could flush vmid
4983 */
4984 atomic_inc(&adev->gpu_reset_counter);
4985 continue;
4986 }
4987
4988 /* got the hw fence, signal finished fence */
4989 atomic_dec(ring->sched.score);
38d4e463 4990 dma_fence_put(s_job->s_fence->parent);
e6c6338f
JZ
4991 dma_fence_get(&s_job->s_fence->finished);
4992 dma_fence_signal(&s_job->s_fence->finished);
4993 dma_fence_put(&s_job->s_fence->finished);
4994
4995 /* remove node from list and free the job */
4996 spin_lock(&ring->sched.job_list_lock);
4997 list_del_init(&s_job->list);
4998 spin_unlock(&ring->sched.job_list_lock);
4999 ring->sched.ops->free_job(s_job);
5000 }
5001}
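/*
 * Sketch of the strategy above: exactly one pending job per ring is
 * resubmitted via drm_sched_resubmit_jobs_ext(&ring->sched, 1); if its
 * hardware fence times out, that job is the real offender, its karma is
 * raised and the hardware reset is repeated, otherwise the finished
 * fence is signaled and the job is freed.
 */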
5002
26bc5340 5003/**
c7703ce3 5004 * amdgpu_device_gpu_recover_imp - reset the asic and recover scheduler
26bc5340 5005 *
982a820b 5006 * @adev: amdgpu_device pointer
26bc5340
AG
5007 * @job: which job trigger hang
5008 *
5009 * Attempt to reset the GPU if it has hung (all asics).
5010 * Attempt to do soft-reset or full-reset and reinitialize Asic
5011 * Returns 0 for success or an error on failure.
5012 */
5013
54f329cc 5014int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
26bc5340
AG
5015 struct amdgpu_job *job)
5016{
1d721ed6 5017 struct list_head device_list, *device_list_handle = NULL;
7dd8c205 5018 bool job_signaled = false;
26bc5340 5019 struct amdgpu_hive_info *hive = NULL;
26bc5340 5020 struct amdgpu_device *tmp_adev = NULL;
1d721ed6 5021 int i, r = 0;
bb5c7235 5022 bool need_emergency_restart = false;
3f12acc8 5023 bool audio_suspended = false;
e6c6338f 5024 int tmp_vram_lost_counter;
04442bf7
LL
5025 struct amdgpu_reset_context reset_context;
5026
5027 memset(&reset_context, 0, sizeof(reset_context));
26bc5340 5028
6e3cd2a9 5029 /*
bb5c7235
WS
5030 * Special case: RAS triggered and full reset isn't supported
5031 */
5032 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
5033
d5ea093e
AG
5034 /*
5035 * Flush RAM to disk so that after reboot
5036 * the user can read log and see why the system rebooted.
5037 */
bb5c7235 5038 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
d5ea093e
AG
5039 DRM_WARN("Emergency reboot.");
5040
5041 ksys_sync_helper();
5042 emergency_restart();
5043 }
5044
b823821f 5045 dev_info(adev->dev, "GPU %s begin!\n",
bb5c7235 5046 need_emergency_restart ? "jobs stop":"reset");
26bc5340 5047
175ac6ec
ZL
5048 if (!amdgpu_sriov_vf(adev))
5049 hive = amdgpu_get_xgmi_hive(adev);
681260df 5050 if (hive)
53b3f8f4 5051 mutex_lock(&hive->hive_lock);
26bc5340 5052
04442bf7
LL
5053 reset_context.method = AMD_RESET_METHOD_NONE;
5054 reset_context.reset_req_dev = adev;
5055 reset_context.job = job;
5056 reset_context.hive = hive;
5057 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
5058
9e94d22c
EQ
5059 /*
5060 * Build list of devices to reset.
5061 * In case we are in XGMI hive mode, re-sort the device list
5062 * to put adev in the first position.
5063 */
5064 INIT_LIST_HEAD(&device_list);
175ac6ec 5065 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
655ce9cb 5066 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
5067 list_add_tail(&tmp_adev->reset_list, &device_list);
5068 if (!list_is_first(&adev->reset_list, &device_list))
5069 list_rotate_to_front(&adev->reset_list, &device_list);
5070 device_list_handle = &device_list;
26bc5340 5071 } else {
655ce9cb 5072 list_add_tail(&adev->reset_list, &device_list);
26bc5340
AG
5073 device_list_handle = &device_list;
5074 }
5075
e923be99
AG
5076 /* We need to lock the reset domain only once, for both XGMI and single-device cases */
5077 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5078 reset_list);
3675c2f2 5079 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
e923be99 5080
1d721ed6 5081 /* block all schedulers and reset given job's ring */
655ce9cb 5082 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
f287a3c5 5083
e923be99 5084 amdgpu_device_set_mp1_state(tmp_adev);
f287a3c5 5085
3f12acc8
EQ
5086 /*
5087 * Try to put the audio codec into suspend state
5088 * before the GPU reset starts.
5089 *
5090 * The power domain of the graphics device is
5091 * shared with the AZ (audio) power domain. Without
5092 * this, we may change the audio hardware behind
5093 * the audio driver's back, which triggers audio
5094 * codec errors.
5095 */
5096 if (!amdgpu_device_suspend_display_audio(tmp_adev))
5097 audio_suspended = true;
5098
9e94d22c
EQ
5099 amdgpu_ras_set_error_query_ready(tmp_adev, false);
5100
52fb44cf
EQ
5101 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
5102
c004d44e 5103 if (!amdgpu_sriov_vf(tmp_adev))
428890a3 5104 amdgpu_amdkfd_pre_reset(tmp_adev);
9e94d22c 5105
12ffa55d
AG
5106 /*
5107 * Mark these ASICs to be reset as untracked first,
5108 * and add them back after the reset completes.
5109 */
5110 amdgpu_unregister_gpu_instance(tmp_adev);
5111
087451f3 5112 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true);
565d1941 5113
f1c1314b 5114 /* disable ras on ALL IPs */
bb5c7235 5115 if (!need_emergency_restart &&
b823821f 5116 amdgpu_device_ip_need_full_reset(tmp_adev))
f1c1314b 5117 amdgpu_ras_suspend(tmp_adev);
5118
1d721ed6
AG
5119 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5120 struct amdgpu_ring *ring = tmp_adev->rings[i];
5121
5122 if (!ring || !ring->sched.thread)
5123 continue;
5124
0b2d2c2e 5125 drm_sched_stop(&ring->sched, job ? &job->base : NULL);
7c6e68c7 5126
bb5c7235 5127 if (need_emergency_restart)
7c6e68c7 5128 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
1d721ed6 5129 }
8f8c80f4 5130 atomic_inc(&tmp_adev->gpu_reset_counter);
1d721ed6
AG
5131 }
5132
bb5c7235 5133 if (need_emergency_restart)
7c6e68c7
AG
5134 goto skip_sched_resume;
5135
1d721ed6
AG
5136 /*
5137 * Must check guilty signal here since after this point all old
5138 * HW fences are force signaled.
5139 *
5140 * job->base holds a reference to parent fence
5141 */
5142 if (job && job->base.s_fence->parent &&
7dd8c205 5143 dma_fence_is_signaled(job->base.s_fence->parent)) {
1d721ed6 5144 job_signaled = true;
1d721ed6
AG
5145 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
5146 goto skip_hw_reset;
5147 }
5148
26bc5340 5149retry: /* Rest of adevs pre asic reset from XGMI hive. */
655ce9cb 5150 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
04442bf7 5151 r = amdgpu_device_pre_asic_reset(tmp_adev, &reset_context);
26bc5340
AG
5152 /* TODO: should we stop? */
5153 if (r) {
aac89168 5154 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
4a580877 5155 r, adev_to_drm(tmp_adev)->unique);
26bc5340
AG
5156 tmp_adev->asic_reset_res = r;
5157 }
5158 }
5159
e6c6338f 5160 tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
26bc5340 5161 /* Actual ASIC resets if needed. */
4f30d920 5162 /* Host driver will handle XGMI hive reset for SRIOV */
26bc5340
AG
5163 if (amdgpu_sriov_vf(adev)) {
5164 r = amdgpu_device_reset_sriov(adev, job ? false : true);
5165 if (r)
5166 adev->asic_reset_res = r;
950d6425
SY
5167
5168 /* Aldebaran supports RAS in SRIOV, so RAS needs to be resumed during reset */
5169 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2))
5170 amdgpu_ras_resume(adev);
26bc5340 5171 } else {
04442bf7 5172 r = amdgpu_do_asic_reset(device_list_handle, &reset_context);
26bc5340
AG
5173 if (r && r == -EAGAIN)
5174 goto retry;
5175 }
5176
1d721ed6
AG
5177skip_hw_reset:
5178
26bc5340 5179 /* Post ASIC reset for all devs. */
655ce9cb 5180 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
7c6e68c7 5181
e6c6338f
JZ
5182 /*
5183 * Sometimes a later bad compute job can block a good gfx job, since the
5184 * gfx and compute rings share internal GC hardware. Add an additional
5185 * guilty-job recheck step to find the real guilty job: it synchronously
5186 * resubmits each job and waits for it to signal. A job that times out
5187 * is identified as the real guilty job.
5188 */
5189 if (amdgpu_gpu_recovery == 2 &&
5190 !(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter)))
04442bf7
LL
5191 amdgpu_device_recheck_guilty_jobs(
5192 tmp_adev, device_list_handle, &reset_context);
e6c6338f 5193
1d721ed6
AG
5194 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5195 struct amdgpu_ring *ring = tmp_adev->rings[i];
5196
5197 if (!ring || !ring->sched.thread)
5198 continue;
5199
5200 /* No point in resubmitting jobs if we didn't do a HW reset */
5201 if (!tmp_adev->asic_reset_res && !job_signaled)
5202 drm_sched_resubmit_jobs(&ring->sched);
5203
5204 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
5205 }
5206
1053b9c9 5207 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) {
4a580877 5208 drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
1d721ed6
AG
5209 }
5210
7258fa31
SK
5211 if (tmp_adev->asic_reset_res)
5212 r = tmp_adev->asic_reset_res;
5213
1d721ed6 5214 tmp_adev->asic_reset_res = 0;
26bc5340
AG
5215
5216 if (r) {
5217 /* Bad news: how do we tell userspace? */
12ffa55d 5218 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
26bc5340
AG
5219 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
5220 } else {
12ffa55d 5221 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
3fa8f89d
S
5222 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
5223 DRM_WARN("smart shift update failed\n");
26bc5340 5224 }
7c6e68c7 5225 }
26bc5340 5226
7c6e68c7 5227skip_sched_resume:
655ce9cb 5228 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
428890a3 5229 /* unlock kfd: SRIOV would do it separately */
c004d44e 5230 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
428890a3 5231 amdgpu_amdkfd_post_reset(tmp_adev);
8e2712e7 5232
5233 /* kfd_post_reset will do nothing if the kfd device is not initialized,
5234 * so bring up kfd here if it was not initialized before
5235 */
5236 if (!adev->kfd.init_complete)
5237 amdgpu_amdkfd_device_init(adev);
5238
3f12acc8
EQ
5239 if (audio_suspended)
5240 amdgpu_device_resume_display_audio(tmp_adev);
e923be99
AG
5241
5242 amdgpu_device_unset_mp1_state(tmp_adev);
26bc5340
AG
5243 }
5244
e923be99
AG
5245 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5246 reset_list);
5247 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
5248
9e94d22c 5249 if (hive) {
9e94d22c 5250 mutex_unlock(&hive->hive_lock);
d95e8e97 5251 amdgpu_put_xgmi_hive(hive);
9e94d22c 5252 }
26bc5340 5253
f287a3c5 5254 if (r)
26bc5340 5255 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
d38ceaf9
AD
5256 return r;
5257}
5258
54f329cc
AG
5259struct amdgpu_recover_work_struct {
5260 struct work_struct base;
5261 struct amdgpu_device *adev;
5262 struct amdgpu_job *job;
5263 int ret;
5264};
5265
5266static void amdgpu_device_queue_gpu_recover_work(struct work_struct *work)
5267{
5268 struct amdgpu_recover_work_struct *recover_work = container_of(work, struct amdgpu_recover_work_struct, base);
5269
5270 recover_work->ret = amdgpu_device_gpu_recover_imp(recover_work->adev, recover_work->job);
5271}
5272/*
5273 * Serialize GPU recovery onto the reset domain's single-threaded workqueue
5274 */
5275int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
5276 struct amdgpu_job *job)
5277{
5278 struct amdgpu_recover_work_struct work = {.adev = adev, .job = job};
5279
5280 INIT_WORK(&work.base, amdgpu_device_queue_gpu_recover_work);
5281
cfbb6b00 5282 if (!amdgpu_reset_domain_schedule(adev->reset_domain, &work.base))
54f329cc
AG
5283 return -EAGAIN;
5284
5285 flush_work(&work.base);
5286
5287 return work.ret;
5288}
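
The wrapper above is the usual on-stack work_struct idiom for funnelling calls into a single-threaded (ordered) workqueue: the caller packs its arguments and a result slot into a struct around the work item, queues it, blocks in flush_work(), and reads the result back. A standalone sketch of the same idiom follows; the queue, struct and handler names are generic placeholders, not the driver's.

#include <linux/workqueue.h>
#include <linux/kernel.h>
#include <linux/errno.h>

struct serialized_req {
	struct work_struct base;
	int arg;
	int ret;
};

static void serialized_req_func(struct work_struct *work)
{
	struct serialized_req *req =
		container_of(work, struct serialized_req, base);

	/* On an ordered workqueue only one of these runs at any time. */
	req->ret = req->arg;	/* placeholder for the real handler */
}

/* Queue one request on an ordered workqueue and wait for its result. */
static int serialized_call(struct workqueue_struct *wq, int arg)
{
	struct serialized_req req = { .arg = arg };

	INIT_WORK_ONSTACK(&req.base, serialized_req_func);
	if (!queue_work(wq, &req.base))
		return -EAGAIN;
	flush_work(&req.base);
	destroy_work_on_stack(&req.base);
	return req.ret;
}

/* The queue itself is created once, e.g. with
 * alloc_ordered_workqueue("example-reset-domain", 0). */
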
5289
e3ecdffa
AD
5290/**
5291 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
5292 *
5293 * @adev: amdgpu_device pointer
5294 *
5295 * Fetches and stores in the driver the PCIe capabilities (gen speed
5296 * and lanes) of the slot the device is in. Handles APUs and
5297 * virtualized environments where PCIe config space may not be available.
5298 */
5494d864 5299static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
d0dd7f0c 5300{
5d9a6330 5301 struct pci_dev *pdev;
c5313457
HK
5302 enum pci_bus_speed speed_cap, platform_speed_cap;
5303 enum pcie_link_width platform_link_width;
d0dd7f0c 5304
cd474ba0
AD
5305 if (amdgpu_pcie_gen_cap)
5306 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
d0dd7f0c 5307
cd474ba0
AD
5308 if (amdgpu_pcie_lane_cap)
5309 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
d0dd7f0c 5310
cd474ba0
AD
5311 /* covers APUs as well */
5312 if (pci_is_root_bus(adev->pdev->bus)) {
5313 if (adev->pm.pcie_gen_mask == 0)
5314 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
5315 if (adev->pm.pcie_mlw_mask == 0)
5316 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
d0dd7f0c 5317 return;
cd474ba0 5318 }
d0dd7f0c 5319
c5313457
HK
5320 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
5321 return;
5322
dbaa922b
AD
5323 pcie_bandwidth_available(adev->pdev, NULL,
5324 &platform_speed_cap, &platform_link_width);
c5313457 5325
cd474ba0 5326 if (adev->pm.pcie_gen_mask == 0) {
5d9a6330
AD
5327 /* asic caps */
5328 pdev = adev->pdev;
5329 speed_cap = pcie_get_speed_cap(pdev);
5330 if (speed_cap == PCI_SPEED_UNKNOWN) {
5331 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
cd474ba0
AD
5332 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5333 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
cd474ba0 5334 } else {
2b3a1f51
FX
5335 if (speed_cap == PCIE_SPEED_32_0GT)
5336 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5337 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5338 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5339 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5340 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
5341 else if (speed_cap == PCIE_SPEED_16_0GT)
5d9a6330
AD
5342 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5343 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5344 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5345 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
5346 else if (speed_cap == PCIE_SPEED_8_0GT)
5347 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5348 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5349 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5350 else if (speed_cap == PCIE_SPEED_5_0GT)
5351 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5352 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
5353 else
5354 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
5355 }
5356 /* platform caps */
c5313457 5357 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
5d9a6330
AD
5358 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5359 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5360 } else {
2b3a1f51
FX
5361 if (platform_speed_cap == PCIE_SPEED_32_0GT)
5362 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5363 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5364 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5365 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5366 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
5367 else if (platform_speed_cap == PCIE_SPEED_16_0GT)
5d9a6330
AD
5368 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5369 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5370 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5371 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
c5313457 5372 else if (platform_speed_cap == PCIE_SPEED_8_0GT)
5d9a6330
AD
5373 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5374 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5375 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
c5313457 5376 else if (platform_speed_cap == PCIE_SPEED_5_0GT)
5d9a6330
AD
5377 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5378 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5379 else
5380 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
5381
cd474ba0
AD
5382 }
5383 }
5384 if (adev->pm.pcie_mlw_mask == 0) {
c5313457 5385 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
5d9a6330
AD
5386 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
5387 } else {
c5313457 5388 switch (platform_link_width) {
5d9a6330 5389 case PCIE_LNK_X32:
cd474ba0
AD
5390 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
5391 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5392 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5393 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5394 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5395 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5396 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5397 break;
5d9a6330 5398 case PCIE_LNK_X16:
cd474ba0
AD
5399 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5400 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5401 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5402 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5403 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5404 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5405 break;
5d9a6330 5406 case PCIE_LNK_X12:
cd474ba0
AD
5407 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5408 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5409 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5410 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5411 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5412 break;
5d9a6330 5413 case PCIE_LNK_X8:
cd474ba0
AD
5414 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5415 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5416 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5417 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5418 break;
5d9a6330 5419 case PCIE_LNK_X4:
cd474ba0
AD
5420 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5421 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5422 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5423 break;
5d9a6330 5424 case PCIE_LNK_X2:
cd474ba0
AD
5425 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5426 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5427 break;
5d9a6330 5428 case PCIE_LNK_X1:
cd474ba0
AD
5429 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
5430 break;
5431 default:
5432 break;
5433 }
d0dd7f0c
AD
5434 }
5435 }
5436}
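
The gen-mask computation above is cumulative: a link that can run at a given PCIe speed also advertises every lower generation. The same cascade can be written compactly as a fall-through switch; in the sketch below GEN_BIT() is only a stand-in for the driver's CAIL_*_LINK_SPEED_SUPPORT_GENn flags (their real values are not assumed), and unknown speeds fall back to Gen1 rather than the driver's Gen1-3 default.

#include <linux/pci.h>
#include <linux/bits.h>

/* GEN_BIT(n) stands in for the CAIL_*_LINK_SPEED_SUPPORT_GENn flags. */
#define GEN_BIT(n)	BIT((n) - 1)

static u32 example_pcie_gen_mask(enum pci_bus_speed cap)
{
	u32 mask = 0;

	switch (cap) {
	case PCIE_SPEED_32_0GT:
		mask |= GEN_BIT(5);
		fallthrough;
	case PCIE_SPEED_16_0GT:
		mask |= GEN_BIT(4);
		fallthrough;
	case PCIE_SPEED_8_0GT:
		mask |= GEN_BIT(3);
		fallthrough;
	case PCIE_SPEED_5_0GT:
		mask |= GEN_BIT(2);
		fallthrough;
	default:
		mask |= GEN_BIT(1);	/* Gen1 is always assumed */
		break;
	}

	return mask;
}
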
d38ceaf9 5437
361dbd01
AD
5438int amdgpu_device_baco_enter(struct drm_device *dev)
5439{
1348969a 5440 struct amdgpu_device *adev = drm_to_adev(dev);
7a22677b 5441 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
361dbd01 5442
4a580877 5443 if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
361dbd01
AD
5444 return -ENOTSUPP;
5445
8ab0d6f0 5446 if (ras && adev->ras_enabled &&
acdae216 5447 adev->nbio.funcs->enable_doorbell_interrupt)
7a22677b
LM
5448 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
5449
9530273e 5450 return amdgpu_dpm_baco_enter(adev);
361dbd01
AD
5451}
5452
5453int amdgpu_device_baco_exit(struct drm_device *dev)
5454{
1348969a 5455 struct amdgpu_device *adev = drm_to_adev(dev);
7a22677b 5456 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
9530273e 5457 int ret = 0;
361dbd01 5458
4a580877 5459 if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
361dbd01
AD
5460 return -ENOTSUPP;
5461
9530273e
EQ
5462 ret = amdgpu_dpm_baco_exit(adev);
5463 if (ret)
5464 return ret;
7a22677b 5465
8ab0d6f0 5466 if (ras && adev->ras_enabled &&
acdae216 5467 adev->nbio.funcs->enable_doorbell_interrupt)
7a22677b
LM
5468 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
5469
1bece222
CL
5470 if (amdgpu_passthrough(adev) &&
5471 adev->nbio.funcs->clear_doorbell_interrupt)
5472 adev->nbio.funcs->clear_doorbell_interrupt(adev);
5473
7a22677b 5474 return 0;
361dbd01 5475}
c9a6b82f
AG
5476
5477/**
5478 * amdgpu_pci_error_detected - Called when a PCI error is detected.
5479 * @pdev: PCI device struct
5480 * @state: PCI channel state
5481 *
5482 * Description: Called when a PCI error is detected.
5483 *
5484 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
5485 */
5486pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
5487{
5488 struct drm_device *dev = pci_get_drvdata(pdev);
5489 struct amdgpu_device *adev = drm_to_adev(dev);
acd89fca 5490 int i;
c9a6b82f
AG
5491
5492 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
5493
6894305c
AG
5494 if (adev->gmc.xgmi.num_physical_nodes > 1) {
5495 DRM_WARN("No support for XGMI hive yet...");
5496 return PCI_ERS_RESULT_DISCONNECT;
5497 }
5498
e17e27f9
GC
5499 adev->pci_channel_state = state;
5500
c9a6b82f
AG
5501 switch (state) {
5502 case pci_channel_io_normal:
5503 return PCI_ERS_RESULT_CAN_RECOVER;
acd89fca 5504 /* Fatal error, prepare for slot reset */
8a11d283
TZ
5505 case pci_channel_io_frozen:
5506 /*
d0fb18b5 5507 * Locking adev->reset_domain->sem will prevent any external access
acd89fca
AG
5508 * to GPU during PCI error recovery
5509 */
3675c2f2 5510 amdgpu_device_lock_reset_domain(adev->reset_domain);
e923be99 5511 amdgpu_device_set_mp1_state(adev);
acd89fca
AG
5512
5513 /*
5514 * Block any work scheduling as we do for regular GPU reset
5515 * for the duration of the recovery
5516 */
5517 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5518 struct amdgpu_ring *ring = adev->rings[i];
5519
5520 if (!ring || !ring->sched.thread)
5521 continue;
5522
5523 drm_sched_stop(&ring->sched, NULL);
5524 }
8f8c80f4 5525 atomic_inc(&adev->gpu_reset_counter);
c9a6b82f
AG
5526 return PCI_ERS_RESULT_NEED_RESET;
5527 case pci_channel_io_perm_failure:
5528 /* Permanent error, prepare for device removal */
5529 return PCI_ERS_RESULT_DISCONNECT;
5530 }
5531
5532 return PCI_ERS_RESULT_NEED_RESET;
5533}
5534
5535/**
5536 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
5537 * @pdev: pointer to PCI device
5538 */
5539pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
5540{
5541
5542 DRM_INFO("PCI error: mmio enabled callback!!\n");
5543
5544 /* TODO - dump whatever for debugging purposes */
5545
5546 /* This is called only if amdgpu_pci_error_detected returns
5547 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
5548 * works, so there is no need to reset the slot.
5549 */
5550
5551 return PCI_ERS_RESULT_RECOVERED;
5552}
5553
5554/**
5555 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
5556 * @pdev: PCI device struct
5557 *
5558 * Description: This routine is called by the pci error recovery
5559 * code after the PCI slot has been reset, just before we
5560 * should resume normal operations.
5561 */
5562pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
5563{
5564 struct drm_device *dev = pci_get_drvdata(pdev);
5565 struct amdgpu_device *adev = drm_to_adev(dev);
362c7b91 5566 int r, i;
04442bf7 5567 struct amdgpu_reset_context reset_context;
362c7b91 5568 u32 memsize;
7ac71382 5569 struct list_head device_list;
c9a6b82f
AG
5570
5571 DRM_INFO("PCI error: slot reset callback!!\n");
5572
04442bf7
LL
5573 memset(&reset_context, 0, sizeof(reset_context));
5574
7ac71382 5575 INIT_LIST_HEAD(&device_list);
655ce9cb 5576 list_add_tail(&adev->reset_list, &device_list);
7ac71382 5577
362c7b91
AG
5578 /* wait for asic to come out of reset */
5579 msleep(500);
5580
7ac71382 5581 /* Restore PCI confspace */
c1dd4aa6 5582 amdgpu_device_load_pci_state(pdev);
c9a6b82f 5583
362c7b91
AG
5584 /* confirm ASIC came out of reset */
5585 for (i = 0; i < adev->usec_timeout; i++) {
5586 memsize = amdgpu_asic_get_config_memsize(adev);
5587
5588 if (memsize != 0xffffffff)
5589 break;
5590 udelay(1);
5591 }
5592 if (memsize == 0xffffffff) {
5593 r = -ETIME;
5594 goto out;
5595 }
5596
04442bf7
LL
5597 reset_context.method = AMD_RESET_METHOD_NONE;
5598 reset_context.reset_req_dev = adev;
5599 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
5600 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
5601
7afefb81 5602 adev->no_hw_access = true;
04442bf7 5603 r = amdgpu_device_pre_asic_reset(adev, &reset_context);
7afefb81 5604 adev->no_hw_access = false;
c9a6b82f
AG
5605 if (r)
5606 goto out;
5607
04442bf7 5608 r = amdgpu_do_asic_reset(&device_list, &reset_context);
c9a6b82f
AG
5609
5610out:
c9a6b82f 5611 if (!r) {
c1dd4aa6
AG
5612 if (amdgpu_device_cache_pci_state(adev->pdev))
5613 pci_restore_state(adev->pdev);
5614
c9a6b82f
AG
5615 DRM_INFO("PCIe error recovery succeeded\n");
5616 } else {
5617 DRM_ERROR("PCIe error recovery failed, err:%d", r);
e923be99
AG
5618 amdgpu_device_unset_mp1_state(adev);
5619 amdgpu_device_unlock_reset_domain(adev->reset_domain);
c9a6b82f
AG
5620 }
5621
5622 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5623}
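
The slot-reset path above confirms that the ASIC has really come back by polling a value that reads as 0xffffffff while the device is still in reset. A condensed, driver-agnostic sketch of that poll follows; the accessor callback and microsecond timeout stand in for amdgpu_asic_get_config_memsize() and adev->usec_timeout.

#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/types.h>

/* Illustrative: poll until the device stops reading back all-ones. */
static int example_wait_for_asic(u32 (*read_memsize)(void *ctx), void *ctx,
				 unsigned int timeout_us)
{
	unsigned int i;

	for (i = 0; i < timeout_us; i++) {
		if (read_memsize(ctx) != 0xffffffff)
			return 0;
		udelay(1);
	}
	return -ETIME;
}
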
5624
5625/**
5626 * amdgpu_pci_resume() - resume normal ops after PCI reset
5627 * @pdev: pointer to PCI device
5628 *
5629 * Called when the error recovery driver tells us that it's
505199a3 5630 * OK to resume normal operation.
c9a6b82f
AG
5631 */
5632void amdgpu_pci_resume(struct pci_dev *pdev)
5633{
5634 struct drm_device *dev = pci_get_drvdata(pdev);
5635 struct amdgpu_device *adev = drm_to_adev(dev);
acd89fca 5636 int i;
c9a6b82f 5637
c9a6b82f
AG
5638
5639 DRM_INFO("PCI error: resume callback!!\n");
acd89fca 5640
e17e27f9
GC
5641 /* Only continue execution for the case of pci_channel_io_frozen */
5642 if (adev->pci_channel_state != pci_channel_io_frozen)
5643 return;
5644
acd89fca
AG
5645 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5646 struct amdgpu_ring *ring = adev->rings[i];
5647
5648 if (!ring || !ring->sched.thread)
5649 continue;
5650
5651
5652 drm_sched_resubmit_jobs(&ring->sched);
5653 drm_sched_start(&ring->sched, true);
5654 }
5655
e923be99
AG
5656 amdgpu_device_unset_mp1_state(adev);
5657 amdgpu_device_unlock_reset_domain(adev->reset_domain);
c9a6b82f 5658}
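
The four callbacks above (error_detected, mmio_enabled, slot_reset, resume) follow the PCI core's advanced error recovery contract and are handed to it through a struct pci_error_handlers attached to the pci_driver's err_handler field. A sketch of that wiring is below; the field names come from include/linux/pci.h, while the table name is illustrative (in the real driver the table lives in amdgpu_drv.c).

#include <linux/pci.h>

/* Forward declarations of the callbacks defined above. */
pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev,
					   pci_channel_state_t state);
pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev);
pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev);
void amdgpu_pci_resume(struct pci_dev *pdev);

/* Handler table in the form the PCI core expects; referenced from
 * struct pci_driver::err_handler when the driver registers. */
static const struct pci_error_handlers example_pci_err_handlers = {
	.error_detected	= amdgpu_pci_error_detected,
	.mmio_enabled	= amdgpu_pci_mmio_enabled,
	.slot_reset	= amdgpu_pci_slot_reset,
	.resume		= amdgpu_pci_resume,
};
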
c1dd4aa6
AG
5659
5660bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5661{
5662 struct drm_device *dev = pci_get_drvdata(pdev);
5663 struct amdgpu_device *adev = drm_to_adev(dev);
5664 int r;
5665
5666 r = pci_save_state(pdev);
5667 if (!r) {
5668 kfree(adev->pci_state);
5669
5670 adev->pci_state = pci_store_saved_state(pdev);
5671
5672 if (!adev->pci_state) {
5673 DRM_ERROR("Failed to store PCI saved state");
5674 return false;
5675 }
5676 } else {
5677 DRM_WARN("Failed to save PCI state, err:%d\n", r);
5678 return false;
5679 }
5680
5681 return true;
5682}
5683
5684bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
5685{
5686 struct drm_device *dev = pci_get_drvdata(pdev);
5687 struct amdgpu_device *adev = drm_to_adev(dev);
5688 int r;
5689
5690 if (!adev->pci_state)
5691 return false;
5692
5693 r = pci_load_saved_state(pdev, adev->pci_state);
5694
5695 if (!r) {
5696 pci_restore_state(pdev);
5697 } else {
5698 DRM_WARN("Failed to load PCI state, err:%d\n", r);
5699 return false;
5700 }
5701
5702 return true;
5703}
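
amdgpu_device_cache_pci_state() and amdgpu_device_load_pci_state() wrap the kernel's pci_save_state()/pci_store_saved_state() and pci_load_saved_state()/pci_restore_state() pairs so that config space can be captured once and replayed after a reset. A minimal sketch of the same sequence using only the core PCI helpers; in the driver the saved-state pointer is kept in the device structure rather than returned.

#include <linux/pci.h>

/* Capture config space once, e.g. at probe time or before a reset. */
static struct pci_saved_state *example_cache_pci_state(struct pci_dev *pdev)
{
	if (pci_save_state(pdev))
		return NULL;
	return pci_store_saved_state(pdev);
}

/* Replay the captured config space, e.g. after the device was reset. */
static bool example_restore_pci_state(struct pci_dev *pdev,
				      struct pci_saved_state *state)
{
	if (!state || pci_load_saved_state(pdev, state))
		return false;
	pci_restore_state(pdev);
	return true;
}
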
5704
810085dd
EH
5705void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
5706 struct amdgpu_ring *ring)
5707{
5708#ifdef CONFIG_X86_64
b818a5d3 5709 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
810085dd
EH
5710 return;
5711#endif
5712 if (adev->gmc.xgmi.connected_to_cpu)
5713 return;
5714
5715 if (ring && ring->funcs->emit_hdp_flush)
5716 amdgpu_ring_emit_hdp_flush(ring);
5717 else
5718 amdgpu_asic_flush_hdp(adev, ring);
5719}
c1dd4aa6 5720
810085dd
EH
5721void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
5722 struct amdgpu_ring *ring)
5723{
5724#ifdef CONFIG_X86_64
b818a5d3 5725 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
810085dd
EH
5726 return;
5727#endif
5728 if (adev->gmc.xgmi.connected_to_cpu)
5729 return;
c1dd4aa6 5730
810085dd
EH
5731 amdgpu_asic_invalidate_hdp(adev, ring);
5732}
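
amdgpu_device_flush_hdp() picks the cheapest correct flush path: nothing on bare-metal APUs and CPU-coherent XGMI parts, a ring packet when a ring with emit_hdp_flush is available, and an MMIO flush otherwise. A minimal usage sketch of publishing a CPU write to the GPU through this helper; the buffer and value are purely illustrative.

#include "amdgpu.h"

/* Illustrative: make a CPU write to GPU-visible memory ordered with later
 * GPU reads by flushing HDP after the write. */
static void example_publish_to_gpu(struct amdgpu_device *adev,
				   struct amdgpu_ring *ring,
				   u32 *cpu_ptr, u32 value)
{
	*cpu_ptr = value;			/* CPU write through HDP */
	amdgpu_device_flush_hdp(adev, ring);	/* make it visible to the GPU */
}
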
34f3a4a9 5733
89a7a870
AG
5734int amdgpu_in_reset(struct amdgpu_device *adev)
5735{
5736 return atomic_read(&adev->reset_domain->in_gpu_reset);
5737}
5738
34f3a4a9
LY
5739/**
5740 * amdgpu_device_halt() - bring hardware to some kind of halt state
5741 *
5742 * @adev: amdgpu_device pointer
5743 *
5744 * Bring hardware to some kind of halt state so that no one can touch it
5745 * any more. This helps preserve the error context when an error occurs.
5746 * Compared to a simple hang, the system stays stable at least for SSH
5747 * access. It should then be trivial to inspect the hardware state and
5748 * see what's going on. Implemented as follows:
5749 *
5750 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc),
5751 * clears all CPU mappings to device, disallows remappings through page faults
5752 * 2. amdgpu_irq_disable_all() disables all interrupts
5753 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
5754 * 4. set adev->no_hw_access to avoid potential crashes after step 5
5755 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
5756 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
5757 * flush any in flight DMA operations
5758 */
5759void amdgpu_device_halt(struct amdgpu_device *adev)
5760{
5761 struct pci_dev *pdev = adev->pdev;
e0f943b4 5762 struct drm_device *ddev = adev_to_drm(adev);
34f3a4a9
LY
5763
5764 drm_dev_unplug(ddev);
5765
5766 amdgpu_irq_disable_all(adev);
5767
5768 amdgpu_fence_driver_hw_fini(adev);
5769
5770 adev->no_hw_access = true;
5771
5772 amdgpu_device_unmap_mmio(adev);
5773
5774 pci_disable_device(pdev);
5775 pci_wait_for_pending_transaction(pdev);
5776}
86700a40
XD
5777
5778u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
5779 u32 reg)
5780{
5781 unsigned long flags, address, data;
5782 u32 r;
5783
5784 address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
5785 data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
5786
5787 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
5788 WREG32(address, reg * 4);
5789 (void)RREG32(address);
5790 r = RREG32(data);
5791 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
5792 return r;
5793}
5794
5795void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
5796 u32 reg, u32 v)
5797{
5798 unsigned long flags, address, data;
5799
5800 address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
5801 data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
5802
5803 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
5804 WREG32(address, reg * 4);
5805 (void)RREG32(address);
5806 WREG32(data, v);
5807 (void)RREG32(data);
5808 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
5809}
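
amdgpu_device_pcie_port_rreg()/_wreg() implement the classic index/data indirect access: write the dword register index to the NBIO-provided index offset, read it back to post the write, then access the data offset, all under adev->pcie_idx_lock. A small read-modify-write sketch on top of them; the register offset and bit below are made up for illustration only.

#include "amdgpu.h"
#include <linux/bits.h>

#define EXAMPLE_PCIE_PORT_REG	0x10	/* made-up dword offset */
#define EXAMPLE_ENABLE_BIT	BIT(0)	/* made-up field */

/* Illustrative read-modify-write through the indirect PCIe port space. */
static void example_pcie_port_set_bit(struct amdgpu_device *adev)
{
	u32 v = amdgpu_device_pcie_port_rreg(adev, EXAMPLE_PCIE_PORT_REG);

	v |= EXAMPLE_ENABLE_BIT;
	amdgpu_device_pcie_port_wreg(adev, EXAMPLE_PCIE_PORT_REG, v);
}
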