drm/amdgpu: adjust fw_name string length for toc
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>

#include <drm/drm_atomic_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/pci.h>
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vangogh_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/yellow_carp_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"ALDEBARAN",
	"NAVI10",
	"NAVI14",
	"NAVI12",
	"SIENNA_CICHLID",
	"NAVY_FLOUNDER",
	"VANGOGH",
	"DIMGREY_CAVEFISH",
	"BEIGE_GOBY",
	"YELLOW_CARP",
	"LAST",
};

/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
 */

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return sysfs_emit(buf, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
		amdgpu_device_get_pcie_replay_count, NULL);

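/*
 * Illustrative userspace read of the attribute above (not part of the
 * driver); the card0 path is an assumption and varies by system:
 *
 *	unsigned long long replays;
 *	FILE *f = fopen("/sys/class/drm/card0/device/pcie_replay_count", "r");
 *
 *	if (f && fscanf(f, "%llu", &replays) == 1)
 *		printf("PCIe replays: %llu\n", replays);
 */
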
static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);

/**
 * DOC: product_name
 *
 * The amdgpu driver provides a sysfs API for reporting the product name
 * for the device.
 * The file product_name is used for this and returns the product name
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_product_name(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return sysfs_emit(buf, "%s\n", adev->product_name);
}

static DEVICE_ATTR(product_name, S_IRUGO,
		amdgpu_device_get_product_name, NULL);

/**
 * DOC: product_number
 *
 * The amdgpu driver provides a sysfs API for reporting the part number
 * for the device.
 * The file product_number is used for this and returns the part number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_product_number(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return sysfs_emit(buf, "%s\n", adev->product_number);
}

static DEVICE_ATTR(product_number, S_IRUGO,
		amdgpu_device_get_product_number, NULL);

/**
 * DOC: serial_number
 *
 * The amdgpu driver provides a sysfs API for reporting the serial number
 * for the device.
 * The file serial_number is used for this and returns the serial number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_serial_number(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return sysfs_emit(buf, "%s\n", adev->serial);
}

static DEVICE_ATTR(serial_number, S_IRUGO,
		amdgpu_device_get_serial_number, NULL);

/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise return false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

/**
 * amdgpu_device_supports_smart_shift - Is the device dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

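/*
 * Illustrative sketch (not part of the driver) of how these helpers can be
 * combined to pick a power-off strategy for a dGPU; the ordering shown is
 * an assumption for demonstration only:
 *
 *	if (amdgpu_device_supports_px(dev))
 *		;	// ATPX: platform controls power via vga_switcheroo
 *	else if (amdgpu_device_supports_boco(dev))
 *		;	// BOCO: D3cold handled through ACPI power resources
 *	else if (amdgpu_device_supports_baco(dev))
 *		;	// BACO: bus stays active while the chip powers off
 */
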
/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the size of @buf must be >= @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(&adev->ddev, &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

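/*
 * Illustrative use (not part of the driver): read one dword from VRAM
 * offset 0 through the MM_INDEX/MM_DATA window. Offsets and sizes must be
 * dword aligned, as asserted above.
 *
 *	uint32_t val;
 *
 *	amdgpu_device_mm_access(adev, 0, &val, sizeof(val), false);
 */
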
/**
 * amdgpu_device_aper_access - access vram via the vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the size of @buf must be >= @size
 * @write: true - write to vram, otherwise - read from vram
 *
 * Returns the number of bytes that have been transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			mb();
			memcpy_fromio(buf, addr, count);
		}
	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the size of @buf must be >= @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try using the vram aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM to access the rest of vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

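/*
 * Illustrative use (not part of the driver): copy a small buffer into VRAM,
 * letting the helper use the CPU-visible aperture where possible and fall
 * back to MM_INDEX/MM_DATA for the remainder; pos is a placeholder offset:
 *
 *	uint32_t pattern[4] = { 0xdeadbeef, 0, 0, 0 };
 *
 *	amdgpu_device_vram_access(adev, pos, pattern, sizeof(pattern), true);
 */
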
/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_sem))
			up_read(&adev->reset_sem);
		else
			lockdep_assert_held(&adev->reset_sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_sem)) {
			ret = amdgpu_kiq_rreg(adev, reg);
			up_read(&adev->reset_sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: bytes offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset: bytes offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_sem)) {
			amdgpu_kiq_wreg(adev, reg, v);
			up_read(&adev->reset_sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

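/*
 * Illustrative read-modify-write through these helpers (not part of the
 * driver); mmREG and SOME_ENABLE_MASK are placeholders for a real register
 * and bitmask. Most callers go through the RREG32()/WREG32() macros, which
 * expand to these helpers:
 *
 *	uint32_t tmp = amdgpu_device_rreg(adev, mmREG, 0);
 *
 *	tmp |= SOME_ENABLE_MASK;
 *	amdgpu_device_wreg(adev, mmREG, tmp, 0);
 */
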
/*
 * amdgpu_mm_wreg_mmio_rlc - write register either with mmio or with RLC path if in range
 *
 * This function is invoked only for debugfs register access.
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return adev->gfx.rlc.funcs->sriov_wreg(adev, reg, v, 0, 0);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_mm_rdoorbell - read a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (CIK).
 */
u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (index < adev->doorbell.num_doorbells) {
		return readl(adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}

/**
 * amdgpu_mm_wdoorbell - write a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (CIK).
 */
void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (index < adev->doorbell.num_doorbells) {
		writel(v, adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

/**
 * amdgpu_mm_rdoorbell64 - read a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (index < adev->doorbell.num_doorbells) {
		return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}

/**
 * amdgpu_mm_wdoorbell64 - write a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (index < adev->doorbell.num_doorbells) {
		atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

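/*
 * Illustrative 64-bit doorbell update (not part of the driver): write a
 * ring's write pointer to its assigned doorbell slot; ring->doorbell_index
 * is an assumption standing in for a real allocated index:
 *
 *	amdgpu_mm_wdoorbell64(adev, ring->doorbell_index, ring->wptr);
 */
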
/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 pcie_index, u32 pcie_data,
				u32 reg_addr)
{
	unsigned long flags;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 pcie_index, u32 pcie_data,
				  u32 reg_addr)
{
	unsigned long flags;
	u64 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register address
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 pcie_index, u32 pcie_data,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 pcie_index, u32 pcie_data,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

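/*
 * Illustrative indirect access (not part of the driver): pcie_index,
 * pcie_data, and addr below are assumptions; real callers pass the asic's
 * PCIE index/data register offsets (see the per-asic pcie_rreg/pcie_wreg
 * callbacks, which are typically built on these helpers):
 *
 *	u32 v = amdgpu_device_indirect_rreg(adev, pcie_index, pcie_data, addr);
 *
 *	amdgpu_device_indirect_wreg(adev, pcie_index, pcie_data, addr, v | 1);
 */
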
/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	amdgpu_asic_pre_asic_init(adev);

	return amdgpu_atom_asic_init(adev->mode_info.atom_context);
}

/**
 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
				       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
				       &adev->vram_scratch.robj,
				       &adev->vram_scratch.gpu_addr,
				       (void **)&adev->vram_scratch.ptr);
}

/**
 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

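/*
 * Illustrative golden-register list (not part of the driver): entries come
 * in {offset, AND mask, OR mask} triples; mmREG_A/mmREG_B and the values
 * below are made up:
 *
 *	static const u32 golden_settings[] = {
 *		mmREG_A, 0xffffffff, 0x00000100,	// full overwrite
 *		mmREG_B, 0x0000000f, 0x00000002,	// clear low nibble, set bit 1
 *	};
 *
 *	amdgpu_device_program_register_sequence(adev, golden_settings,
 *						ARRAY_SIZE(golden_settings));
 */
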
/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * GPU doorbell aperture helpers function.
 */
/**
 * amdgpu_device_doorbell_init - Init doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Init doorbell driver information (CIK)
 * Returns 0 on success, error on failure.
 */
static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
{
	/* No doorbell on SI hardware generation */
	if (adev->asic_type < CHIP_BONAIRE) {
		adev->doorbell.base = 0;
		adev->doorbell.size = 0;
		adev->doorbell.num_doorbells = 0;
		adev->doorbell.ptr = NULL;
		return 0;
	}

	if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
		return -EINVAL;

	amdgpu_asic_init_doorbell_index(adev);

	/* doorbell bar mapping */
	adev->doorbell.base = pci_resource_start(adev->pdev, 2);
	adev->doorbell.size = pci_resource_len(adev->pdev, 2);

	adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
					     adev->doorbell_index.max_assignment + 1);
	if (adev->doorbell.num_doorbells == 0)
		return -EINVAL;

	/* For Vega, reserve and map two pages on doorbell BAR since SDMA
	 * paging queue doorbells use the second page. The
	 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
	 * doorbells are in the first page. So with the paging queue enabled,
	 * num_doorbells needs to be extended by one page (0x400 dwords).
	 */
	if (adev->asic_type >= CHIP_VEGA10)
		adev->doorbell.num_doorbells += 0x400;

	adev->doorbell.ptr = ioremap(adev->doorbell.base,
				     adev->doorbell.num_doorbells *
				     sizeof(u32));
	if (adev->doorbell.ptr == NULL)
		return -ENOMEM;

	return 0;
}

/**
 * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Tear down doorbell driver information (CIK)
 */
static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
{
	iounmap(adev->doorbell.ptr);
	adev->doorbell.ptr = NULL;
}

/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or a negative error code on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

1133
1134/**
131b4b36 1135 * amdgpu_device_wb_get - Allocate a wb entry
d38ceaf9
AD
1136 *
1137 * @adev: amdgpu_device pointer
1138 * @wb: wb index
1139 *
1140 * Allocate a wb slot for use by the driver (all asics).
1141 * Returns 0 on success or -EINVAL on failure.
1142 */
131b4b36 1143int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
d38ceaf9
AD
1144{
1145 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
d38ceaf9 1146
97407b63 1147 if (offset < adev->wb.num_wb) {
7014285a 1148 __set_bit(offset, adev->wb.used);
63ae07ca 1149 *wb = offset << 3; /* convert to dw offset */
0915fdbc
ML
1150 return 0;
1151 } else {
1152 return -EINVAL;
1153 }
1154}
1155
d38ceaf9 1156/**
131b4b36 1157 * amdgpu_device_wb_free - Free a wb entry
d38ceaf9
AD
1158 *
1159 * @adev: amdgpu_device pointer
1160 * @wb: wb index
1161 *
1162 * Free a wb slot allocated for use by the driver (all asics)
1163 */
131b4b36 1164void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
d38ceaf9 1165{
73469585 1166 wb >>= 3;
d38ceaf9 1167 if (wb < adev->wb.num_wb)
73469585 1168 __clear_bit(wb, adev->wb.used);
d38ceaf9
AD
1169}
1170
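/*
 * Illustrative writeback slot lifecycle (not part of the driver): allocate
 * a slot, derive its CPU and GPU addresses as ring code does, and free it:
 *
 *	u32 wb;
 *
 *	if (!amdgpu_device_wb_get(adev, &wb)) {
 *		volatile u32 *cpu_addr = &adev->wb.wb[wb];
 *		u64 gpu_addr = adev->wb.gpu_addr + (wb * 4);
 *
 *		// ... hand gpu_addr to the GPU, poll *cpu_addr ...
 *		amdgpu_device_wb_free(adev, wb);
 *	}
 */
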
/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the size we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned i;
	u16 cmd;
	int r;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_device_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_device_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

/*
 * GPU helpers function.
 */
/**
 * amdgpu_device_need_post - check if the hw need post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if need or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
		 * some old smc fw still needs the driver to do vPost, otherwise the gpu
		 * hangs, while smc fw versions above 22.15 don't have this flaw, so we
		 * force vPost to be executed for smc versions below 22.15
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if an error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->gmc.xgmi.pending_reset)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @cookie: amdgpu_device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
{
	struct amdgpu_device *adev = cookie;

	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines the number of bits in the page table versus the
 * page directory; a page is 4KB so we have 12 bits of offset, a minimum of
 * 9 bits in the page table, and the remaining bits in the page directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}

/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
	struct sysinfo si;
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;

	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("Smu memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("Not enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}

static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
{
	if (!(adev->flags & AMD_IS_APU) ||
	    adev->asic_type < CHIP_RAVEN)
		return 0;

	switch (adev->asic_type) {
	case CHIP_RAVEN:
		if (adev->pdev->device == 0x15dd)
			adev->apu_flags |= AMD_APU_IS_RAVEN;
		if (adev->pdev->device == 0x15d8)
			adev->apu_flags |= AMD_APU_IS_PICASSO;
		break;
	case CHIP_RENOIR:
		if ((adev->pdev->device == 0x1636) ||
		    (adev->pdev->device == 0x164c))
			adev->apu_flags |= AMD_APU_IS_RENOIR;
		else
			adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
		break;
	case CHIP_VANGOGH:
		adev->apu_flags |= AMD_APU_IS_VANGOGH;
		break;
	case CHIP_YELLOW_CARP:
		break;
	default:
		return -EINVAL;
	}

	return 0;
}

/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
 */
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
{
	if (amdgpu_sched_jobs < 4) {
		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = 4;
	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
	}

	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
		/* gart size must be greater or equal to 32M */
		dev_warn(adev->dev, "gart size (%d) too small\n",
			 amdgpu_gart_size);
		amdgpu_gart_size = -1;
	}

	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
		/* gtt size must be greater or equal to 32M */
		dev_warn(adev->dev, "gtt size (%d) too small\n",
			 amdgpu_gtt_size);
		amdgpu_gtt_size = -1;
	}

	/* valid range is between 4 and 9 inclusive */
	if (amdgpu_vm_fragment_size != -1 &&
	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
		dev_warn(adev->dev, "valid range is between 4 and 9\n");
		amdgpu_vm_fragment_size = -1;
	}

	if (amdgpu_sched_hw_submission < 2) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = 2;
	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
	}

	amdgpu_device_check_smu_prv_buffer_size(adev);

	amdgpu_device_check_vm_size(adev);

	amdgpu_device_check_block_size(adev);

	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

	amdgpu_gmc_tmz_set(adev);

	amdgpu_gmc_noretry_set(adev);

	return 0;
}

/**
 * amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver. Suspends or resumes
 * the asics before or after it is powered up using ACPI methods.
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
					enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	int r;

	if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
		return;

	if (state == VGA_SWITCHEROO_ON) {
		pr_info("switched on\n");
		/* don't suspend or resume card normally */
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

		pci_set_power_state(pdev, PCI_D0);
		amdgpu_device_load_pci_state(pdev);
		r = pci_enable_device(pdev);
		if (r)
			DRM_WARN("pci_enable_device failed (%d)\n", r);
		amdgpu_device_resume(dev, true);

		dev->switch_power_state = DRM_SWITCH_POWER_ON;
	} else {
		pr_info("switched off\n");
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
		amdgpu_device_suspend(dev, true);
		amdgpu_device_cache_pci_state(pdev);
		/* Shut down the device */
		pci_disable_device(pdev);
		pci_set_power_state(pdev, PCI_D3cold);
		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
	}
}

1557
1558/**
1559 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1560 *
1561 * @pdev: pci dev pointer
1562 *
1563 * Callback for the switcheroo driver. Check of the switcheroo
1564 * state can be changed.
1565 * Returns true if the state can be changed, false if not.
1566 */
1567static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1568{
1569 struct drm_device *dev = pci_get_drvdata(pdev);
1570
1571 /*
1572 * FIXME: open_count is protected by drm_global_mutex but that would lead to
1573 * locking inversion with the driver load path. And the access here is
1574 * completely racy anyway. So don't bother with locking for now.
1575 */
7e13ad89 1576 return atomic_read(&dev->open_count) == 0;
d38ceaf9
AD
1577}
1578
1579static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1580 .set_gpu_state = amdgpu_switcheroo_set_state,
1581 .reprobe = NULL,
1582 .can_switch = amdgpu_switcheroo_can_switch,
1583};
1584
/**
 * amdgpu_device_ip_set_clockgating_state - set the CG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: clockgating state (gate or ungate)
 *
 * Sets the requested clockgating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_clockgating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_clockgating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_set_powergating_state - set the PG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: powergating state (gate or ungate)
 *
 * Sets the requested powergating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

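/*
 * Illustrative use (not part of the driver): gate clockgating and
 * powergating for all VCN instances, as a power-saving path might do:
 *
 *	amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_VCN,
 *					       AMD_CG_STATE_GATE);
 *	amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCN,
 *					       AMD_PG_STATE_GATE);
 */
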
/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u32 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
			if (r)
				return r;
			break;
		}
	}
	return 0;
}

AD
1705/**
1706 * amdgpu_device_ip_is_idle - is the hardware IP idle
1707 *
1708 * @adev: amdgpu_device pointer
1709 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1710 *
1711 * Check if the hardware IP is idle or not.
1712 * Returns true if it the IP is idle, false if not.
1713 */
2990a1fc
AD
1714bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1715 enum amd_ip_block_type block_type)
5dbbb60b
AD
1716{
1717 int i;
1718
1719 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 1720 if (!adev->ip_blocks[i].status.valid)
9ecbe7f5 1721 continue;
a1255107
AD
1722 if (adev->ip_blocks[i].version->type == block_type)
1723 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
5dbbb60b
AD
1724 }
1725 return true;
1726
1727}
1728
e3ecdffa
AD
1729/**
1730 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1731 *
1732 * @adev: amdgpu_device pointer
87e3f136 1733 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
e3ecdffa
AD
1734 *
1735 * Returns a pointer to the hardware IP block structure
1736 * if it exists for the asic, otherwise NULL.
1737 */
2990a1fc
AD
1738struct amdgpu_ip_block *
1739amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1740 enum amd_ip_block_type type)
d38ceaf9
AD
1741{
1742 int i;
1743
1744 for (i = 0; i < adev->num_ip_blocks; i++)
a1255107 1745 if (adev->ip_blocks[i].version->type == type)
d38ceaf9
AD
1746 return &adev->ip_blocks[i];
1747
1748 return NULL;
1749}
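/*
 * Hedged usage sketch (not in the original): look up a block and guard
 * against it being absent on this asic before dereferencing it.
 */
#if 0	/* illustrative only */
	struct amdgpu_ip_block *ip_block =
		amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP);

	if (!ip_block)
		return -ENOENT;
	DRM_INFO("PSP IP v%d.%d\n", ip_block->version->major,
		 ip_block->version->minor);
#endif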
1750
1751/**
2990a1fc 1752 * amdgpu_device_ip_block_version_cmp - compare IP block version to a minimum
d38ceaf9
AD
1753 *
1754 * @adev: amdgpu_device pointer
5fc3aeeb 1755 * @type: enum amd_ip_block_type
d38ceaf9
AD
1756 * @major: major version
1757 * @minor: minor version
1758 *
1759 * Returns 0 if the installed IP block version is equal or greater,
1760 * 1 if it is smaller or the ip_block doesn't exist.
1761 */
2990a1fc
AD
1762int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1763 enum amd_ip_block_type type,
1764 u32 major, u32 minor)
d38ceaf9 1765{
2990a1fc 1766 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
d38ceaf9 1767
a1255107
AD
1768 if (ip_block && ((ip_block->version->major > major) ||
1769 ((ip_block->version->major == major) &&
1770 (ip_block->version->minor >= minor))))
d38ceaf9
AD
1771 return 0;
1772
1773 return 1;
1774}
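/*
 * Hedged usage sketch (not in the original): note the inverted return
 * convention, 0 means the requirement is met. Gating a code path on
 * GMC >= 8.1 might look like this.
 */
#if 0	/* illustrative only */
	if (!amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_GMC,
						8, 1)) {
		/* GMC 8.1 or newer is present */
	}
#endif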
1775
a1255107 1776/**
2990a1fc 1777 * amdgpu_device_ip_block_add - add an IP block to the device
a1255107
AD
1778 *
1779 * @adev: amdgpu_device pointer
1780 * @ip_block_version: pointer to the IP to add
1781 *
1782 * Adds the IP block driver information to the collection of IPs
1783 * on the asic.
1784 */
2990a1fc
AD
1785int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1786 const struct amdgpu_ip_block_version *ip_block_version)
a1255107
AD
1787{
1788 if (!ip_block_version)
1789 return -EINVAL;
1790
7bd939d0
LG
1791 switch (ip_block_version->type) {
1792 case AMD_IP_BLOCK_TYPE_VCN:
1793 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
1794 return 0;
1795 break;
1796 case AMD_IP_BLOCK_TYPE_JPEG:
1797 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
1798 return 0;
1799 break;
1800 default:
1801 break;
1802 }
1803
e966a725 1804 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
a0bae357
HR
1805 ip_block_version->funcs->name);
1806
a1255107
AD
1807 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1808
1809 return 0;
1810}
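/*
 * Hedged usage sketch (not in the original): asic-specific code such as
 * vi_set_ip_blocks() registers its blocks in init order. The
 * foo_*_ip_block objects below are placeholders for whatever
 * amdgpu_ip_block_version instances the asic file actually defines.
 */
#if 0	/* illustrative only */
	int r;

	r = amdgpu_device_ip_block_add(adev, &foo_common_ip_block);
	if (r)
		return r;
	r = amdgpu_device_ip_block_add(adev, &foo_gmc_ip_block);
	if (r)
		return r;
#endif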
1811
e3ecdffa
AD
1812/**
1813 * amdgpu_device_enable_virtual_display - enable virtual display feature
1814 *
1815 * @adev: amdgpu_device pointer
1816 *
1817 * Enables the virtual display feature if the user has enabled it via
1818 * the module parameter virtual_display. This feature provides virtual
1819 * display hardware on headless boards or in virtualized environments.
1820 * This function parses and validates the configuration string specified by
1821 * the user and configures the virtual display configuration (number of
1822 * virtual connectors, crtcs, etc.) specified.
1823 */
483ef985 1824static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
9accf2fd
ED
1825{
1826 adev->enable_virtual_display = false;
1827
1828 if (amdgpu_virtual_display) {
8f66090b 1829 const char *pci_address_name = pci_name(adev->pdev);
0f66356d 1830 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
9accf2fd
ED
1831
1832 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1833 pciaddstr_tmp = pciaddstr;
0f66356d
ED
1834 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1835 pciaddname = strsep(&pciaddname_tmp, ",");
967de2a9
YT
1836 if (!strcmp("all", pciaddname)
1837 || !strcmp(pci_address_name, pciaddname)) {
0f66356d
ED
1838 long num_crtc;
1839 int res = -1;
1840
9accf2fd 1841 adev->enable_virtual_display = true;
0f66356d
ED
1842
1843 if (pciaddname_tmp)
1844 res = kstrtol(pciaddname_tmp, 10,
1845 &num_crtc);
1846
1847 if (!res) {
1848 if (num_crtc < 1)
1849 num_crtc = 1;
1850 if (num_crtc > 6)
1851 num_crtc = 6;
1852 adev->mode_info.num_crtc = num_crtc;
1853 } else {
1854 adev->mode_info.num_crtc = 1;
1855 }
9accf2fd
ED
1856 break;
1857 }
1858 }
1859
0f66356d
ED
1860 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1861 amdgpu_virtual_display, pci_address_name,
1862 adev->enable_virtual_display, adev->mode_info.num_crtc);
9accf2fd
ED
1863
1864 kfree(pciaddstr);
1865 }
1866}
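/*
 * Parameter format, spelled out from the parsing above: entries are
 * semicolon separated, each a PCI address optionally followed by a
 * comma and a crtc count (clamped to 1..6), and the literal "all"
 * matches every device, e.g.:
 *
 *   amdgpu.virtual_display=0000:03:00.0,2;0000:04:00.0
 */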
1867
e3ecdffa
AD
1868/**
1869 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1870 *
1871 * @adev: amdgpu_device pointer
1872 *
1873 * Parses the asic configuration parameters specified in the gpu info
1874 * firmware and makes them available to the driver for use in configuring
1875 * the asic.
1876 * Returns 0 on success, -EINVAL on failure.
1877 */
e2a75f88
AD
1878static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1879{
e2a75f88 1880 const char *chip_name;
c0a43457 1881 char fw_name[40];
e2a75f88
AD
1882 int err;
1883 const struct gpu_info_firmware_header_v1_0 *hdr;
1884
ab4fe3e1
HR
1885 adev->firmware.gpu_info_fw = NULL;
1886
72de33f8 1887 if (adev->mman.discovery_bin) {
258620d0 1888 amdgpu_discovery_get_gfx_info(adev);
cc375d8c
TY
1889
1890 /*
1891 * FIXME: The bounding box is still needed by Navi12, so
1892 * temporarily read it from gpu_info firmware. Should be dropped
1893 * when DAL no longer needs it.
1894 */
1895 if (adev->asic_type != CHIP_NAVI12)
1896 return 0;
258620d0
AD
1897 }
1898
e2a75f88 1899 switch (adev->asic_type) {
e2a75f88
AD
1900#ifdef CONFIG_DRM_AMDGPU_SI
1901 case CHIP_VERDE:
1902 case CHIP_TAHITI:
1903 case CHIP_PITCAIRN:
1904 case CHIP_OLAND:
1905 case CHIP_HAINAN:
1906#endif
1907#ifdef CONFIG_DRM_AMDGPU_CIK
1908 case CHIP_BONAIRE:
1909 case CHIP_HAWAII:
1910 case CHIP_KAVERI:
1911 case CHIP_KABINI:
1912 case CHIP_MULLINS:
1913#endif
da87c30b
AD
1914 case CHIP_TOPAZ:
1915 case CHIP_TONGA:
1916 case CHIP_FIJI:
1917 case CHIP_POLARIS10:
1918 case CHIP_POLARIS11:
1919 case CHIP_POLARIS12:
1920 case CHIP_VEGAM:
1921 case CHIP_CARRIZO:
1922 case CHIP_STONEY:
27c0bc71 1923 case CHIP_VEGA20:
44b3253a 1924 case CHIP_ALDEBARAN:
84d244a3
JC
1925 case CHIP_SIENNA_CICHLID:
1926 case CHIP_NAVY_FLOUNDER:
eac88a5f 1927 case CHIP_DIMGREY_CAVEFISH:
0e5f4b09 1928 case CHIP_BEIGE_GOBY:
e2a75f88
AD
1929 default:
1930 return 0;
1931 case CHIP_VEGA10:
1932 chip_name = "vega10";
1933 break;
3f76dced
AD
1934 case CHIP_VEGA12:
1935 chip_name = "vega12";
1936 break;
2d2e5e7e 1937 case CHIP_RAVEN:
54f78a76 1938 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
54c4d17e 1939 chip_name = "raven2";
54f78a76 1940 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
741deade 1941 chip_name = "picasso";
54c4d17e
FX
1942 else
1943 chip_name = "raven";
2d2e5e7e 1944 break;
65e60f6e
LM
1945 case CHIP_ARCTURUS:
1946 chip_name = "arcturus";
1947 break;
b51a26a0 1948 case CHIP_RENOIR:
2e62f0b5
PL
1949 if (adev->apu_flags & AMD_APU_IS_RENOIR)
1950 chip_name = "renoir";
1951 else
1952 chip_name = "green_sardine";
b51a26a0 1953 break;
23c6268e
HR
1954 case CHIP_NAVI10:
1955 chip_name = "navi10";
1956 break;
ed42cfe1
XY
1957 case CHIP_NAVI14:
1958 chip_name = "navi14";
1959 break;
42b325e5
XY
1960 case CHIP_NAVI12:
1961 chip_name = "navi12";
1962 break;
4e52a9f8
HR
1963 case CHIP_VANGOGH:
1964 chip_name = "vangogh";
1965 break;
8bf84f60
AL
1966 case CHIP_YELLOW_CARP:
1967 chip_name = "yellow_carp";
1968 break;
e2a75f88
AD
1969 }
1970
1971 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
ab4fe3e1 1972 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
e2a75f88
AD
1973 if (err) {
1974 dev_err(adev->dev,
1975 "Failed to load gpu_info firmware \"%s\"\n",
1976 fw_name);
1977 goto out;
1978 }
ab4fe3e1 1979 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
e2a75f88
AD
1980 if (err) {
1981 dev_err(adev->dev,
1982 "Failed to validate gpu_info firmware \"%s\"\n",
1983 fw_name);
1984 goto out;
1985 }
1986
ab4fe3e1 1987 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
e2a75f88
AD
1988 amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1989
1990 switch (hdr->version_major) {
1991 case 1:
1992 {
1993 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
ab4fe3e1 1994 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
e2a75f88
AD
1995 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1996
cc375d8c
TY
1997 /*
1998 * Should be dropped when DAL no longer needs it.
1999 */
2000 if (adev->asic_type == CHIP_NAVI12)
ec51d3fa
XY
2001 goto parse_soc_bounding_box;
2002
b5ab16bf
AD
2003 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
2004 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
2005 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
2006 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
e2a75f88 2007 adev->gfx.config.max_texture_channel_caches =
b5ab16bf
AD
2008 le32_to_cpu(gpu_info_fw->gc_num_tccs);
2009 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
2010 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
2011 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
2012 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
e2a75f88 2013 adev->gfx.config.double_offchip_lds_buf =
b5ab16bf
AD
2014 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
2015 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
51fd0370
HZ
2016 adev->gfx.cu_info.max_waves_per_simd =
2017 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
2018 adev->gfx.cu_info.max_scratch_slots_per_cu =
2019 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
2020 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
48321c3d 2021 if (hdr->version_minor >= 1) {
35c2e910
HZ
2022 const struct gpu_info_firmware_v1_1 *gpu_info_fw =
2023 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
2024 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2025 adev->gfx.config.num_sc_per_sh =
2026 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
2027 adev->gfx.config.num_packer_per_sc =
2028 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
2029 }
ec51d3fa
XY
2030
2031parse_soc_bounding_box:
ec51d3fa
XY
2032 /*
2033 * soc bounding box info is not integrated in the discovery table,
258620d0 2034 * so we still need to parse it from the gpu info firmware when needed.
ec51d3fa 2035 */
48321c3d
HW
2036 if (hdr->version_minor == 2) {
2037 const struct gpu_info_firmware_v1_2 *gpu_info_fw =
2038 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
2039 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2040 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
2041 }
e2a75f88
AD
2042 break;
2043 }
2044 default:
2045 dev_err(adev->dev,
2046 "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
2047 err = -EINVAL;
2048 goto out;
2049 }
2050out:
e2a75f88
AD
2051 return err;
2052}
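/*
 * Sizing note (my arithmetic, not from the file): fw_name[40] must hold
 * "amdgpu/" (7 chars) + chip_name + "_gpu_info.bin" (13 chars) + NUL.
 * The longest name above, "green_sardine" (13 chars), needs 34 bytes
 * and fits; snprintf() would truncate rather than overflow if a longer
 * name were ever added.
 */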
2053
e3ecdffa
AD
2054/**
2055 * amdgpu_device_ip_early_init - run early init for hardware IPs
2056 *
2057 * @adev: amdgpu_device pointer
2058 *
2059 * Early initialization pass for hardware IPs. The hardware IPs that make
2060 * up each asic are discovered and each IP's early_init callback is run. This
2061 * is the first stage in initializing the asic.
2062 * Returns 0 on success, negative error code on failure.
2063 */
06ec9070 2064static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
d38ceaf9 2065{
aaa36a97 2066 int i, r;
d38ceaf9 2067
483ef985 2068 amdgpu_device_enable_virtual_display(adev);
a6be7570 2069
00a979f3 2070 if (amdgpu_sriov_vf(adev)) {
00a979f3 2071 r = amdgpu_virt_request_full_gpu(adev, true);
aaa36a97
AD
2072 if (r)
2073 return r;
00a979f3
WS
2074 }
2075
d38ceaf9 2076 switch (adev->asic_type) {
33f34802
KW
2077#ifdef CONFIG_DRM_AMDGPU_SI
2078 case CHIP_VERDE:
2079 case CHIP_TAHITI:
2080 case CHIP_PITCAIRN:
2081 case CHIP_OLAND:
2082 case CHIP_HAINAN:
295d0daf 2083 adev->family = AMDGPU_FAMILY_SI;
33f34802
KW
2084 r = si_set_ip_blocks(adev);
2085 if (r)
2086 return r;
2087 break;
2088#endif
a2e73f56
AD
2089#ifdef CONFIG_DRM_AMDGPU_CIK
2090 case CHIP_BONAIRE:
2091 case CHIP_HAWAII:
2092 case CHIP_KAVERI:
2093 case CHIP_KABINI:
2094 case CHIP_MULLINS:
e1ad2d53 2095 if (adev->flags & AMD_IS_APU)
a2e73f56 2096 adev->family = AMDGPU_FAMILY_KV;
e1ad2d53
AD
2097 else
2098 adev->family = AMDGPU_FAMILY_CI;
a2e73f56
AD
2099
2100 r = cik_set_ip_blocks(adev);
2101 if (r)
2102 return r;
2103 break;
2104#endif
da87c30b
AD
2105 case CHIP_TOPAZ:
2106 case CHIP_TONGA:
2107 case CHIP_FIJI:
2108 case CHIP_POLARIS10:
2109 case CHIP_POLARIS11:
2110 case CHIP_POLARIS12:
2111 case CHIP_VEGAM:
2112 case CHIP_CARRIZO:
2113 case CHIP_STONEY:
2114 if (adev->flags & AMD_IS_APU)
2115 adev->family = AMDGPU_FAMILY_CZ;
2116 else
2117 adev->family = AMDGPU_FAMILY_VI;
2118
2119 r = vi_set_ip_blocks(adev);
2120 if (r)
2121 return r;
2122 break;
e48a3cd9
AD
2123 case CHIP_VEGA10:
2124 case CHIP_VEGA12:
e4bd8170 2125 case CHIP_VEGA20:
e48a3cd9 2126 case CHIP_RAVEN:
61cf44c1 2127 case CHIP_ARCTURUS:
b51a26a0 2128 case CHIP_RENOIR:
c00a18ec 2129 case CHIP_ALDEBARAN:
70534d1e 2130 if (adev->flags & AMD_IS_APU)
2ca8a5d2
CZ
2131 adev->family = AMDGPU_FAMILY_RV;
2132 else
2133 adev->family = AMDGPU_FAMILY_AI;
460826e6
KW
2134
2135 r = soc15_set_ip_blocks(adev);
2136 if (r)
2137 return r;
2138 break;
0a5b8c7b 2139 case CHIP_NAVI10:
7ecb5cd4 2140 case CHIP_NAVI14:
4808cf9c 2141 case CHIP_NAVI12:
11e8aef5 2142 case CHIP_SIENNA_CICHLID:
41f446bf 2143 case CHIP_NAVY_FLOUNDER:
144722fa 2144 case CHIP_DIMGREY_CAVEFISH:
b41f5b7a 2145 case CHIP_BEIGE_GOBY:
4e52a9f8 2146 case CHIP_VANGOGH:
8bf84f60 2147 case CHIP_YELLOW_CARP:
4e52a9f8
HR
2148 if (adev->asic_type == CHIP_VANGOGH)
2149 adev->family = AMDGPU_FAMILY_VGH;
8bf84f60
AL
2150 else if (adev->asic_type == CHIP_YELLOW_CARP)
2151 adev->family = AMDGPU_FAMILY_YC;
4e52a9f8
HR
2152 else
2153 adev->family = AMDGPU_FAMILY_NV;
0a5b8c7b
HR
2154
2155 r = nv_set_ip_blocks(adev);
2156 if (r)
2157 return r;
2158 break;
d38ceaf9
AD
2159 default:
2160 /* FIXME: not supported yet */
2161 return -EINVAL;
2162 }
2163
1884734a 2164 amdgpu_amdkfd_device_probe(adev);
2165
3b94fb10 2166 adev->pm.pp_feature = amdgpu_pp_feature_mask;
a35ad98b 2167 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
00544006 2168 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
4215a119
HC
2169 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2170 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
00f54b97 2171
d38ceaf9
AD
2172 for (i = 0; i < adev->num_ip_blocks; i++) {
2173 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
ed8cf00c
HR
2174 DRM_ERROR("disabled ip block: %d <%s>\n",
2175 i, adev->ip_blocks[i].version->funcs->name);
a1255107 2176 adev->ip_blocks[i].status.valid = false;
d38ceaf9 2177 } else {
a1255107
AD
2178 if (adev->ip_blocks[i].version->funcs->early_init) {
2179 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2c1a2784 2180 if (r == -ENOENT) {
a1255107 2181 adev->ip_blocks[i].status.valid = false;
2c1a2784 2182 } else if (r) {
a1255107
AD
2183 DRM_ERROR("early_init of IP block <%s> failed %d\n",
2184 adev->ip_blocks[i].version->funcs->name, r);
d38ceaf9 2185 return r;
2c1a2784 2186 } else {
a1255107 2187 adev->ip_blocks[i].status.valid = true;
2c1a2784 2188 }
974e6b64 2189 } else {
a1255107 2190 adev->ip_blocks[i].status.valid = true;
d38ceaf9 2191 }
d38ceaf9 2192 }
21a249ca
AD
2193 /* get the vbios after the asic_funcs are set up */
2194 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
6e29c227
AD
2195 r = amdgpu_device_parse_gpu_info_fw(adev);
2196 if (r)
2197 return r;
2198
21a249ca
AD
2199 /* Read BIOS */
2200 if (!amdgpu_get_bios(adev))
2201 return -EINVAL;
2202
2203 r = amdgpu_atombios_init(adev);
2204 if (r) {
2205 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2206 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2207 return r;
2208 }
77eabc6f
PJZ
2209
2210 /* get pf2vf msg info at its earliest time */
2211 if (amdgpu_sriov_vf(adev))
2212 amdgpu_virt_init_data_exchange(adev);
2213
21a249ca 2214 }
d38ceaf9
AD
2215 }
2216
395d1fb9
NH
2217 adev->cg_flags &= amdgpu_cg_mask;
2218 adev->pg_flags &= amdgpu_pg_mask;
2219
d38ceaf9
AD
2220 return 0;
2221}
2222
0a4f2520
RZ
2223static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2224{
2225 int i, r;
2226
2227 for (i = 0; i < adev->num_ip_blocks; i++) {
2228 if (!adev->ip_blocks[i].status.sw)
2229 continue;
2230 if (adev->ip_blocks[i].status.hw)
2231 continue;
2232 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2d11fd3f 2233 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
0a4f2520
RZ
2234 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2235 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2236 if (r) {
2237 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2238 adev->ip_blocks[i].version->funcs->name, r);
2239 return r;
2240 }
2241 adev->ip_blocks[i].status.hw = true;
2242 }
2243 }
2244
2245 return 0;
2246}
2247
2248static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2249{
2250 int i, r;
2251
2252 for (i = 0; i < adev->num_ip_blocks; i++) {
2253 if (!adev->ip_blocks[i].status.sw)
2254 continue;
2255 if (adev->ip_blocks[i].status.hw)
2256 continue;
2257 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2258 if (r) {
2259 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2260 adev->ip_blocks[i].version->funcs->name, r);
2261 return r;
2262 }
2263 adev->ip_blocks[i].status.hw = true;
2264 }
2265
2266 return 0;
2267}
2268
7a3e0bb2
RZ
2269static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2270{
2271 int r = 0;
2272 int i;
80f41f84 2273 uint32_t smu_version;
7a3e0bb2
RZ
2274
2275 if (adev->asic_type >= CHIP_VEGA10) {
2276 for (i = 0; i < adev->num_ip_blocks; i++) {
482f0e53
ML
2277 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2278 continue;
2279
e3c1b071 2280 if (!adev->ip_blocks[i].status.sw)
2281 continue;
2282
482f0e53
ML
2283 /* no need to do the fw loading again if already done*/
2284 if (adev->ip_blocks[i].status.hw == true)
2285 break;
2286
53b3f8f4 2287 if (amdgpu_in_reset(adev) || adev->in_suspend) {
482f0e53
ML
2288 r = adev->ip_blocks[i].version->funcs->resume(adev);
2289 if (r) {
2290 DRM_ERROR("resume of IP block <%s> failed %d\n",
7a3e0bb2 2291 adev->ip_blocks[i].version->funcs->name, r);
482f0e53
ML
2292 return r;
2293 }
2294 } else {
2295 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2296 if (r) {
2297 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2298 adev->ip_blocks[i].version->funcs->name, r);
2299 return r;
7a3e0bb2 2300 }
7a3e0bb2 2301 }
482f0e53
ML
2302
2303 adev->ip_blocks[i].status.hw = true;
2304 break;
7a3e0bb2
RZ
2305 }
2306 }
482f0e53 2307
8973d9ec
ED
2308 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2309 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
7a3e0bb2 2310
80f41f84 2311 return r;
7a3e0bb2
RZ
2312}
2313
e3ecdffa
AD
2314/**
2315 * amdgpu_device_ip_init - run init for hardware IPs
2316 *
2317 * @adev: amdgpu_device pointer
2318 *
2319 * Main initialization pass for hardware IPs. The list of all the hardware
2320 * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2321 * are run. sw_init initializes the software state associated with each IP
2322 * and hw_init initializes the hardware associated with each IP.
2323 * Returns 0 on success, negative error code on failure.
2324 */
06ec9070 2325static int amdgpu_device_ip_init(struct amdgpu_device *adev)
d38ceaf9
AD
2326{
2327 int i, r;
2328
c030f2e4 2329 r = amdgpu_ras_init(adev);
2330 if (r)
2331 return r;
2332
d38ceaf9 2333 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 2334 if (!adev->ip_blocks[i].status.valid)
d38ceaf9 2335 continue;
a1255107 2336 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2c1a2784 2337 if (r) {
a1255107
AD
2338 DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2339 adev->ip_blocks[i].version->funcs->name, r);
72d3f592 2340 goto init_failed;
2c1a2784 2341 }
a1255107 2342 adev->ip_blocks[i].status.sw = true;
bfca0289 2343
d38ceaf9 2344 /* need to do gmc hw init early so we can allocate gpu mem */
a1255107 2345 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
06ec9070 2346 r = amdgpu_device_vram_scratch_init(adev);
2c1a2784
AD
2347 if (r) {
2348 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
72d3f592 2349 goto init_failed;
2c1a2784 2350 }
a1255107 2351 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2c1a2784
AD
2352 if (r) {
2353 DRM_ERROR("hw_init %d failed %d\n", i, r);
72d3f592 2354 goto init_failed;
2c1a2784 2355 }
06ec9070 2356 r = amdgpu_device_wb_init(adev);
2c1a2784 2357 if (r) {
06ec9070 2358 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
72d3f592 2359 goto init_failed;
2c1a2784 2360 }
a1255107 2361 adev->ip_blocks[i].status.hw = true;
2493664f
ML
2362
2363 /* right after GMC hw init, we create CSA */
f92d5c61 2364 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
1e256e27
RZ
2365 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2366 AMDGPU_GEM_DOMAIN_VRAM,
2367 AMDGPU_CSA_SIZE);
2493664f
ML
2368 if (r) {
2369 DRM_ERROR("allocate CSA failed %d\n", r);
72d3f592 2370 goto init_failed;
2493664f
ML
2371 }
2372 }
d38ceaf9
AD
2373 }
2374 }
2375
c9ffa427
YT
2376 if (amdgpu_sriov_vf(adev))
2377 amdgpu_virt_init_data_exchange(adev);
2378
533aed27
AG
2379 r = amdgpu_ib_pool_init(adev);
2380 if (r) {
2381 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2382 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2383 goto init_failed;
2384 }
2385
c8963ea4
RZ
2386 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2387 if (r)
72d3f592 2388 goto init_failed;
0a4f2520
RZ
2389
2390 r = amdgpu_device_ip_hw_init_phase1(adev);
2391 if (r)
72d3f592 2392 goto init_failed;
0a4f2520 2393
7a3e0bb2
RZ
2394 r = amdgpu_device_fw_loading(adev);
2395 if (r)
72d3f592 2396 goto init_failed;
7a3e0bb2 2397
0a4f2520
RZ
2398 r = amdgpu_device_ip_hw_init_phase2(adev);
2399 if (r)
72d3f592 2400 goto init_failed;
d38ceaf9 2401
121a2bc6
AG
2402 /*
2403 * retired pages will be loaded from eeprom and reserved here,
2404 * it should be called after amdgpu_device_ip_hw_init_phase2 since
2405 * for some ASICs the RAS EEPROM code relies on SMU fully functioning
2406 * for I2C communication, which is only true at this point.
b82e65a9
GC
2407 *
2408 * amdgpu_ras_recovery_init may fail, but the upper layers only care
2409 * about failures caused by a bad gpu state, and stop the amdgpu init
2410 * process accordingly. For other failures, it still releases all
2411 * the resources and prints an error message, rather than returning a
2412 * negative value to the upper level.
121a2bc6
AG
2413 *
2414 * Note: theoretically, this should be called before all vram allocations
2415 * to keep retired pages from being reused
2416 */
b82e65a9
GC
2417 r = amdgpu_ras_recovery_init(adev);
2418 if (r)
2419 goto init_failed;
121a2bc6 2420
3e2e2ab5
HZ
2421 if (adev->gmc.xgmi.num_physical_nodes > 1)
2422 amdgpu_xgmi_add_device(adev);
e3c1b071 2423
2424 /* Don't init kfd if whole hive need to be reset during init */
2425 if (!adev->gmc.xgmi.pending_reset)
2426 amdgpu_amdkfd_device_init(adev);
c6332b97 2427
bd607166
KR
2428 amdgpu_fru_get_product_info(adev);
2429
72d3f592 2430init_failed:
c9ffa427 2431 if (amdgpu_sriov_vf(adev))
c6332b97 2432 amdgpu_virt_release_full_gpu(adev, true);
2433
72d3f592 2434 return r;
d38ceaf9
AD
2435}
2436
e3ecdffa
AD
2437/**
2438 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2439 *
2440 * @adev: amdgpu_device pointer
2441 *
2442 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
2443 * this function before a GPU reset. If the value is retained after a
2444 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
2445 */
06ec9070 2446static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
0c49e0b8
CZ
2447{
2448 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2449}
2450
e3ecdffa
AD
2451/**
2452 * amdgpu_device_check_vram_lost - check if vram is valid
2453 *
2454 * @adev: amdgpu_device pointer
2455 *
2456 * Checks the reset magic value written to the gart pointer in VRAM.
2457 * The driver calls this after a GPU reset to see if the contents of
2458 * VRAM are lost or not.
2459 * Returns true if vram is lost, false if not.
2460 */
06ec9070 2461static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
0c49e0b8 2462{
dadce777
EQ
2463 if (memcmp(adev->gart.ptr, adev->reset_magic,
2464 AMDGPU_RESET_MAGIC_NUM))
2465 return true;
2466
53b3f8f4 2467 if (!amdgpu_in_reset(adev))
dadce777
EQ
2468 return false;
2469
2470 /*
2471 * For all ASICs with baco/mode1 reset, the VRAM is
2472 * always assumed to be lost.
2473 */
2474 switch (amdgpu_asic_reset_method(adev)) {
2475 case AMD_RESET_METHOD_BACO:
2476 case AMD_RESET_METHOD_MODE1:
2477 return true;
2478 default:
2479 return false;
2480 }
0c49e0b8
CZ
2481}
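/*
 * Hedged sketch of the reset magic round trip (not literal code from
 * the recovery path): a snapshot is taken before the reset and compared
 * afterwards, with the first AMDGPU_RESET_MAGIC_NUM bytes of the GART
 * page acting as the canary.
 */
#if 0	/* illustrative only */
	amdgpu_device_fill_reset_magic(adev);	/* snapshot before reset */
	/* ... asic reset happens here ... */
	if (amdgpu_device_check_vram_lost(adev))
		DRM_INFO("VRAM lost, buffer contents must be restored\n");
#endif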
2482
e3ecdffa 2483/**
1112a46b 2484 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
e3ecdffa
AD
2485 *
2486 * @adev: amdgpu_device pointer
b8b72130 2487 * @state: clockgating state (gate or ungate)
e3ecdffa 2488 *
e3ecdffa 2489 * The list of all the hardware IPs that make up the asic is walked and the
1112a46b
RZ
2490 * set_clockgating_state callbacks are run.
2491 * During late init this enables clockgating for hardware IPs;
2492 * during fini or suspend it disables clockgating for hardware IPs.
e3ecdffa
AD
2493 * Returns 0 on success, negative error code on failure.
2494 */
fdd34271 2495
5d89bb2d
LL
2496int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2497 enum amd_clockgating_state state)
d38ceaf9 2498{
1112a46b 2499 int i, j, r;
d38ceaf9 2500
4a2ba394
SL
2501 if (amdgpu_emu_mode == 1)
2502 return 0;
2503
1112a46b
RZ
2504 for (j = 0; j < adev->num_ip_blocks; j++) {
2505 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
a2d31dc3 2506 if (!adev->ip_blocks[i].status.late_initialized)
d38ceaf9 2507 continue;
5d70a549
PV
2508 /* skip CG for GFX on S0ix */
2509 if (adev->in_s0ix &&
2510 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)
2511 continue;
4a446d55 2512 /* skip CG for VCE/UVD, it's handled specially */
a1255107 2513 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
57716327 2514 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
34319b32 2515 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
52f2e779 2516 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
57716327 2517 adev->ip_blocks[i].version->funcs->set_clockgating_state) {
4a446d55 2518 /* enable clockgating to save power */
a1255107 2519 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
1112a46b 2520 state);
4a446d55
AD
2521 if (r) {
2522 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
a1255107 2523 adev->ip_blocks[i].version->funcs->name, r);
4a446d55
AD
2524 return r;
2525 }
b0b00ff1 2526 }
d38ceaf9 2527 }
06b18f61 2528
c9f96fd5
RZ
2529 return 0;
2530}
2531
5d89bb2d
LL
2532int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
2533 enum amd_powergating_state state)
c9f96fd5 2534{
1112a46b 2535 int i, j, r;
06b18f61 2536
c9f96fd5
RZ
2537 if (amdgpu_emu_mode == 1)
2538 return 0;
2539
1112a46b
RZ
2540 for (j = 0; j < adev->num_ip_blocks; j++) {
2541 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
a2d31dc3 2542 if (!adev->ip_blocks[i].status.late_initialized)
c9f96fd5 2543 continue;
5d70a549
PV
2544 /* skip PG for GFX on S0ix */
2545 if (adev->in_s0ix &&
2546 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)
2547 continue;
c9f96fd5
RZ
2548 /* skip PG for VCE/UVD, it's handled specially */
2549 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2550 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2551 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
52f2e779 2552 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
c9f96fd5
RZ
2553 adev->ip_blocks[i].version->funcs->set_powergating_state) {
2554 /* enable powergating to save power */
2555 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
1112a46b 2556 state);
c9f96fd5
RZ
2557 if (r) {
2558 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2559 adev->ip_blocks[i].version->funcs->name, r);
2560 return r;
2561 }
2562 }
2563 }
2dc80b00
S
2564 return 0;
2565}
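/*
 * Ordering sketch, mirroring the calls made elsewhere in this file:
 * gating is enabled CG-first on the way up and torn down PG-first on
 * the way down, so the two state changes always nest.
 */
#if 0	/* illustrative only */
	/* late init */
	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);

	/* fini / suspend */
	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
#endif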
2566
beff74bc
AD
2567static int amdgpu_device_enable_mgpu_fan_boost(void)
2568{
2569 struct amdgpu_gpu_instance *gpu_ins;
2570 struct amdgpu_device *adev;
2571 int i, ret = 0;
2572
2573 mutex_lock(&mgpu_info.mutex);
2574
2575 /*
2576 * MGPU fan boost feature should be enabled
2577 * only when there are two or more dGPUs in
2578 * the system
2579 */
2580 if (mgpu_info.num_dgpu < 2)
2581 goto out;
2582
2583 for (i = 0; i < mgpu_info.num_dgpu; i++) {
2584 gpu_ins = &(mgpu_info.gpu_ins[i]);
2585 adev = gpu_ins->adev;
2586 if (!(adev->flags & AMD_IS_APU) &&
f10bb940 2587 !gpu_ins->mgpu_fan_enabled) {
beff74bc
AD
2588 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2589 if (ret)
2590 break;
2591
2592 gpu_ins->mgpu_fan_enabled = 1;
2593 }
2594 }
2595
2596out:
2597 mutex_unlock(&mgpu_info.mutex);
2598
2599 return ret;
2600}
2601
e3ecdffa
AD
2602/**
2603 * amdgpu_device_ip_late_init - run late init for hardware IPs
2604 *
2605 * @adev: amdgpu_device pointer
2606 *
2607 * Late initialization pass for hardware IPs. The list of all the hardware
2608 * IPs that make up the asic is walked and the late_init callbacks are run.
2609 * late_init covers any special initialization that an IP requires
2610 * after all of them have been initialized or something that needs to happen
2611 * late in the init process.
2612 * Returns 0 on success, negative error code on failure.
2613 */
06ec9070 2614static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2dc80b00 2615{
60599a03 2616 struct amdgpu_gpu_instance *gpu_instance;
2dc80b00
S
2617 int i = 0, r;
2618
2619 for (i = 0; i < adev->num_ip_blocks; i++) {
73f847db 2620 if (!adev->ip_blocks[i].status.hw)
2dc80b00
S
2621 continue;
2622 if (adev->ip_blocks[i].version->funcs->late_init) {
2623 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2624 if (r) {
2625 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2626 adev->ip_blocks[i].version->funcs->name, r);
2627 return r;
2628 }
2dc80b00 2629 }
73f847db 2630 adev->ip_blocks[i].status.late_initialized = true;
2dc80b00
S
2631 }
2632
a891d239
DL
2633 amdgpu_ras_set_error_query_ready(adev, true);
2634
1112a46b
RZ
2635 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2636 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
916ac57f 2637
06ec9070 2638 amdgpu_device_fill_reset_magic(adev);
d38ceaf9 2639
beff74bc
AD
2640 r = amdgpu_device_enable_mgpu_fan_boost();
2641 if (r)
2642 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2643
2d02893f 2644 /* For XGMI + passthrough configuration on arcturus, enable light SBR */
2645 if (adev->asic_type == CHIP_ARCTURUS &&
2646 amdgpu_passthrough(adev) &&
2647 adev->gmc.xgmi.num_physical_nodes > 1)
2648 smu_set_light_sbr(&adev->smu, true);
60599a03
EQ
2649
2650 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2651 mutex_lock(&mgpu_info.mutex);
2652
2653 /*
2654 * Reset device p-state to low as this was booted with high.
2655 *
2656 * This should be performed only after all devices from the same
2657 * hive get initialized.
2658 *
2659 * However, the number of devices in the hive is not known in advance,
2660 * as it is counted one by one during device initialization.
2661 *
2662 * So, we wait for all XGMI interlinked devices to be initialized.
2663 * This may bring some delays as those devices may come from
2664 * different hives. But that should be OK.
2665 */
2666 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2667 for (i = 0; i < mgpu_info.num_gpu; i++) {
2668 gpu_instance = &(mgpu_info.gpu_ins[i]);
2669 if (gpu_instance->adev->flags & AMD_IS_APU)
2670 continue;
2671
d84a430d
JK
2672 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2673 AMDGPU_XGMI_PSTATE_MIN);
60599a03
EQ
2674 if (r) {
2675 DRM_ERROR("pstate setting failed (%d).\n", r);
2676 break;
2677 }
2678 }
2679 }
2680
2681 mutex_unlock(&mgpu_info.mutex);
2682 }
2683
d38ceaf9
AD
2684 return 0;
2685}
2686
e9669fb7 2687static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
d38ceaf9
AD
2688{
2689 int i, r;
2690
e9669fb7
AG
2691 for (i = 0; i < adev->num_ip_blocks; i++) {
2692 if (!adev->ip_blocks[i].version->funcs->early_fini)
2693 continue;
5278a159 2694
e9669fb7
AG
2695 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev);
2696 if (r) {
2697 DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
2698 adev->ip_blocks[i].version->funcs->name, r);
2699 }
2700 }
c030f2e4 2701
e9669fb7 2702 amdgpu_amdkfd_suspend(adev, false);
a82400b5 2703
05df1f01 2704 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
fdd34271
RZ
2705 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2706
3e96dbfd
AD
2707 /* need to disable SMC first */
2708 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 2709 if (!adev->ip_blocks[i].status.hw)
3e96dbfd 2710 continue;
fdd34271 2711 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
a1255107 2712 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
3e96dbfd
AD
2713 /* XXX handle errors */
2714 if (r) {
2715 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
a1255107 2716 adev->ip_blocks[i].version->funcs->name, r);
3e96dbfd 2717 }
a1255107 2718 adev->ip_blocks[i].status.hw = false;
3e96dbfd
AD
2719 break;
2720 }
2721 }
2722
d38ceaf9 2723 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
a1255107 2724 if (!adev->ip_blocks[i].status.hw)
d38ceaf9 2725 continue;
8201a67a 2726
a1255107 2727 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
d38ceaf9 2728 /* XXX handle errors */
2c1a2784 2729 if (r) {
a1255107
AD
2730 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2731 adev->ip_blocks[i].version->funcs->name, r);
2c1a2784 2732 }
8201a67a 2733
a1255107 2734 adev->ip_blocks[i].status.hw = false;
d38ceaf9
AD
2735 }
2736
e9669fb7
AG
2737 return 0;
2738}
2739
2740/**
2741 * amdgpu_device_ip_fini - run fini for hardware IPs
2742 *
2743 * @adev: amdgpu_device pointer
2744 *
2745 * Main teardown pass for hardware IPs. The list of all the hardware
2746 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2747 * are run. hw_fini tears down the hardware associated with each IP
2748 * and sw_fini tears down any software state associated with each IP.
2749 * Returns 0 on success, negative error code on failure.
2750 */
2751static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2752{
2753 int i, r;
2754
2755 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2756 amdgpu_virt_release_ras_err_handler_data(adev);
2757
2758 amdgpu_ras_pre_fini(adev);
2759
2760 if (adev->gmc.xgmi.num_physical_nodes > 1)
2761 amdgpu_xgmi_remove_device(adev);
2762
2763 amdgpu_amdkfd_device_fini_sw(adev);
9950cda2 2764
d38ceaf9 2765 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
a1255107 2766 if (!adev->ip_blocks[i].status.sw)
d38ceaf9 2767 continue;
c12aba3a
ML
2768
2769 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
c8963ea4 2770 amdgpu_ucode_free_bo(adev);
1e256e27 2771 amdgpu_free_static_csa(&adev->virt.csa_obj);
c12aba3a
ML
2772 amdgpu_device_wb_fini(adev);
2773 amdgpu_device_vram_scratch_fini(adev);
533aed27 2774 amdgpu_ib_pool_fini(adev);
c12aba3a
ML
2775 }
2776
a1255107 2777 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
d38ceaf9 2778 /* XXX handle errors */
2c1a2784 2779 if (r) {
a1255107
AD
2780 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2781 adev->ip_blocks[i].version->funcs->name, r);
2c1a2784 2782 }
a1255107
AD
2783 adev->ip_blocks[i].status.sw = false;
2784 adev->ip_blocks[i].status.valid = false;
d38ceaf9
AD
2785 }
2786
a6dcfd9c 2787 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
a1255107 2788 if (!adev->ip_blocks[i].status.late_initialized)
8a2eef1d 2789 continue;
a1255107
AD
2790 if (adev->ip_blocks[i].version->funcs->late_fini)
2791 adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2792 adev->ip_blocks[i].status.late_initialized = false;
a6dcfd9c
ML
2793 }
2794
c030f2e4 2795 amdgpu_ras_fini(adev);
2796
030308fc 2797 if (amdgpu_sriov_vf(adev))
24136135
ML
2798 if (amdgpu_virt_release_full_gpu(adev, false))
2799 DRM_ERROR("failed to release exclusive mode on fini\n");
2493664f 2800
d38ceaf9
AD
2801 return 0;
2802}
2803
e3ecdffa 2804/**
beff74bc 2805 * amdgpu_device_delayed_init_work_handler - work handler for IB tests
e3ecdffa 2806 *
1112a46b 2807 * @work: work_struct.
e3ecdffa 2808 */
beff74bc 2809static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2dc80b00
S
2810{
2811 struct amdgpu_device *adev =
beff74bc 2812 container_of(work, struct amdgpu_device, delayed_init_work.work);
916ac57f
RZ
2813 int r;
2814
2815 r = amdgpu_ib_ring_tests(adev);
2816 if (r)
2817 DRM_ERROR("ib ring test failed (%d).\n", r);
2dc80b00
S
2818}
2819
1e317b99
RZ
2820static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2821{
2822 struct amdgpu_device *adev =
2823 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2824
2825 mutex_lock(&adev->gfx.gfx_off_mutex);
2826 if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
2827 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2828 adev->gfx.gfx_off_state = true;
2829 }
2830 mutex_unlock(&adev->gfx.gfx_off_mutex);
2831}
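/*
 * Hedged sketch of the scheduling side (it lives in amdgpu_gfx.c, so
 * this is an assumption about the caller, not code from this file):
 * when the last disallow request is dropped, the handler above is
 * queued with a delay so rapid allow/disallow cycles don't thrash
 * GFXOFF. The 100 ms delay is a placeholder value.
 */
#if 0	/* illustrative only */
	mutex_lock(&adev->gfx.gfx_off_mutex);
	if (adev->gfx.gfx_off_req_count > 0)
		adev->gfx.gfx_off_req_count--;
	if (!adev->gfx.gfx_off_req_count && !adev->gfx.gfx_off_state)
		schedule_delayed_work(&adev->gfx.gfx_off_delay_work,
				      msecs_to_jiffies(100));
	mutex_unlock(&adev->gfx.gfx_off_mutex);
#endif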
2832
e3ecdffa 2833/**
e7854a03 2834 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
e3ecdffa
AD
2835 *
2836 * @adev: amdgpu_device pointer
2837 *
2838 * Main suspend function for hardware IPs. The list of all the hardware
2839 * IPs that make up the asic is walked, clockgating is disabled and the
2840 * suspend callbacks are run. suspend puts the hardware and software state
2841 * in each IP into a state suitable for suspend.
2842 * Returns 0 on success, negative error code on failure.
2843 */
e7854a03
AD
2844static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2845{
2846 int i, r;
2847
50ec83f0
AD
2848 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2849 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
05df1f01 2850
e7854a03
AD
2851 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2852 if (!adev->ip_blocks[i].status.valid)
2853 continue;
2b9f7848 2854
e7854a03 2855 /* displays are handled separately */
2b9f7848
ND
2856 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2857 continue;
2858
2859 /* XXX handle errors */
2860 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2861 /* XXX handle errors */
2862 if (r) {
2863 DRM_ERROR("suspend of IP block <%s> failed %d\n",
2864 adev->ip_blocks[i].version->funcs->name, r);
2865 return r;
e7854a03 2866 }
2b9f7848
ND
2867
2868 adev->ip_blocks[i].status.hw = false;
e7854a03
AD
2869 }
2870
e7854a03
AD
2871 return 0;
2872}
2873
2874/**
2875 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2876 *
2877 * @adev: amdgpu_device pointer
2878 *
2879 * Main suspend function for hardware IPs. The list of all the hardware
2880 * IPs that make up the asic is walked, clockgating is disabled and the
2881 * suspend callbacks are run. suspend puts the hardware and software state
2882 * in each IP into a state suitable for suspend.
2883 * Returns 0 on success, negative error code on failure.
2884 */
2885static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
d38ceaf9
AD
2886{
2887 int i, r;
2888
557f42a2 2889 if (adev->in_s0ix)
34416931 2890 amdgpu_gfx_state_change_set(adev, sGpuChangeState_D3Entry);
34416931 2891
d38ceaf9 2892 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
a1255107 2893 if (!adev->ip_blocks[i].status.valid)
d38ceaf9 2894 continue;
e7854a03
AD
2895 /* displays are handled in phase1 */
2896 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2897 continue;
bff77e86
LM
2898 /* PSP lost connection when err_event_athub occurs */
2899 if (amdgpu_ras_intr_triggered() &&
2900 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2901 adev->ip_blocks[i].status.hw = false;
2902 continue;
2903 }
e3c1b071 2904
2905 /* skip unnecessary suspend if we do not initialize them yet */
2906 if (adev->gmc.xgmi.pending_reset &&
2907 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2908 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
2909 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2910 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
2911 adev->ip_blocks[i].status.hw = false;
2912 continue;
2913 }
557f42a2 2914
32ff160d
AD
2915 /* skip suspend of gfx and psp for S0ix
2916 * gfx is in gfxoff state, so on resume it will exit gfxoff just
2917 * like at runtime. PSP is also part of the always on hardware
2918 * so no need to suspend it.
2919 */
557f42a2 2920 if (adev->in_s0ix &&
32ff160d
AD
2921 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
2922 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX))
557f42a2
AD
2923 continue;
2924
d38ceaf9 2925 /* XXX handle errors */
a1255107 2926 r = adev->ip_blocks[i].version->funcs->suspend(adev);
d38ceaf9 2927 /* XXX handle errors */
2c1a2784 2928 if (r) {
a1255107
AD
2929 DRM_ERROR("suspend of IP block <%s> failed %d\n",
2930 adev->ip_blocks[i].version->funcs->name, r);
2c1a2784 2931 }
876923fb 2932 adev->ip_blocks[i].status.hw = false;
a3a09142 2933 /* handle putting the SMC in the appropriate state */
86b93fd6
JZ
2934 if (!amdgpu_sriov_vf(adev)) {
2935 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2936 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
2937 if (r) {
2938 DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
2939 adev->mp1_state, r);
2940 return r;
2941 }
a3a09142
AD
2942 }
2943 }
d38ceaf9
AD
2944 }
2945
2946 return 0;
2947}
2948
e7854a03
AD
2949/**
2950 * amdgpu_device_ip_suspend - run suspend for hardware IPs
2951 *
2952 * @adev: amdgpu_device pointer
2953 *
2954 * Main suspend function for hardware IPs. The list of all the hardware
2955 * IPs that make up the asic is walked, clockgating is disabled and the
2956 * suspend callbacks are run. suspend puts the hardware and software state
2957 * in each IP into a state suitable for suspend.
2958 * Returns 0 on success, negative error code on failure.
2959 */
2960int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
2961{
2962 int r;
2963
3c73683c
JC
2964 if (amdgpu_sriov_vf(adev)) {
2965 amdgpu_virt_fini_data_exchange(adev);
e7819644 2966 amdgpu_virt_request_full_gpu(adev, false);
3c73683c 2967 }
e7819644 2968
e7854a03
AD
2969 r = amdgpu_device_ip_suspend_phase1(adev);
2970 if (r)
2971 return r;
2972 r = amdgpu_device_ip_suspend_phase2(adev);
2973
e7819644
YT
2974 if (amdgpu_sriov_vf(adev))
2975 amdgpu_virt_release_full_gpu(adev, false);
2976
e7854a03
AD
2977 return r;
2978}
2979
06ec9070 2980static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
a90ad3c2
ML
2981{
2982 int i, r;
2983
2cb681b6
ML
2984 static enum amd_ip_block_type ip_order[] = {
2985 AMD_IP_BLOCK_TYPE_GMC,
2986 AMD_IP_BLOCK_TYPE_COMMON,
39186aef 2987 AMD_IP_BLOCK_TYPE_PSP,
2cb681b6
ML
2988 AMD_IP_BLOCK_TYPE_IH,
2989 };
a90ad3c2 2990
95ea3dbc 2991 for (i = 0; i < adev->num_ip_blocks; i++) {
2cb681b6
ML
2992 int j;
2993 struct amdgpu_ip_block *block;
a90ad3c2 2994
4cd2a96d
J
2995 block = &adev->ip_blocks[i];
2996 block->status.hw = false;
2cb681b6 2997
4cd2a96d 2998 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
2cb681b6 2999
4cd2a96d 3000 if (block->version->type != ip_order[j] ||
2cb681b6
ML
3001 !block->status.valid)
3002 continue;
3003
3004 r = block->version->funcs->hw_init(adev);
0aaeefcc 3005 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
c41d1cf6
ML
3006 if (r)
3007 return r;
482f0e53 3008 block->status.hw = true;
a90ad3c2
ML
3009 }
3010 }
3011
3012 return 0;
3013}
3014
06ec9070 3015static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
a90ad3c2
ML
3016{
3017 int i, r;
3018
2cb681b6
ML
3019 static enum amd_ip_block_type ip_order[] = {
3020 AMD_IP_BLOCK_TYPE_SMC,
3021 AMD_IP_BLOCK_TYPE_DCE,
3022 AMD_IP_BLOCK_TYPE_GFX,
3023 AMD_IP_BLOCK_TYPE_SDMA,
257deb8c 3024 AMD_IP_BLOCK_TYPE_UVD,
d83c7a07
JJ
3025 AMD_IP_BLOCK_TYPE_VCE,
3026 AMD_IP_BLOCK_TYPE_VCN
2cb681b6 3027 };
a90ad3c2 3028
2cb681b6
ML
3029 for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
3030 int j;
3031 struct amdgpu_ip_block *block;
a90ad3c2 3032
2cb681b6
ML
3033 for (j = 0; j < adev->num_ip_blocks; j++) {
3034 block = &adev->ip_blocks[j];
3035
3036 if (block->version->type != ip_order[i] ||
482f0e53
ML
3037 !block->status.valid ||
3038 block->status.hw)
2cb681b6
ML
3039 continue;
3040
895bd048
JZ
3041 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
3042 r = block->version->funcs->resume(adev);
3043 else
3044 r = block->version->funcs->hw_init(adev);
3045
0aaeefcc 3046 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
c41d1cf6
ML
3047 if (r)
3048 return r;
482f0e53 3049 block->status.hw = true;
a90ad3c2
ML
3050 }
3051 }
3052
3053 return 0;
3054}
3055
e3ecdffa
AD
3056/**
3057 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
3058 *
3059 * @adev: amdgpu_device pointer
3060 *
3061 * First resume function for hardware IPs. The list of all the hardware
3062 * IPs that make up the asic is walked and the resume callbacks are run for
3063 * COMMON, GMC, and IH. resume puts the hardware into a functional state
3064 * after a suspend and updates the software state as necessary. This
3065 * function is also used for restoring the GPU after a GPU reset.
3066 * Returns 0 on success, negative error code on failure.
3067 */
06ec9070 3068static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
d38ceaf9
AD
3069{
3070 int i, r;
3071
a90ad3c2 3072 for (i = 0; i < adev->num_ip_blocks; i++) {
482f0e53 3073 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
a90ad3c2 3074 continue;
a90ad3c2 3075 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
e3ecdffa
AD
3076 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3077 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
482f0e53 3078
fcf0649f
CZ
3079 r = adev->ip_blocks[i].version->funcs->resume(adev);
3080 if (r) {
3081 DRM_ERROR("resume of IP block <%s> failed %d\n",
3082 adev->ip_blocks[i].version->funcs->name, r);
3083 return r;
3084 }
482f0e53 3085 adev->ip_blocks[i].status.hw = true;
a90ad3c2
ML
3086 }
3087 }
3088
3089 return 0;
3090}
3091
e3ecdffa
AD
3092/**
3093 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3094 *
3095 * @adev: amdgpu_device pointer
3096 *
3097 * Second resume function for hardware IPs. The list of all the hardware
3098 * IPs that make up the asic is walked and the resume callbacks are run for
3099 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a
3100 * functional state after a suspend and updates the software state as
3101 * necessary. This function is also used for restoring the GPU after a GPU
3102 * reset.
3103 * Returns 0 on success, negative error code on failure.
3104 */
06ec9070 3105static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
d38ceaf9
AD
3106{
3107 int i, r;
3108
3109 for (i = 0; i < adev->num_ip_blocks; i++) {
482f0e53 3110 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
d38ceaf9 3111 continue;
fcf0649f 3112 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
e3ecdffa 3113 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
7a3e0bb2
RZ
3114 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3115 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
fcf0649f 3116 continue;
a1255107 3117 r = adev->ip_blocks[i].version->funcs->resume(adev);
2c1a2784 3118 if (r) {
a1255107
AD
3119 DRM_ERROR("resume of IP block <%s> failed %d\n",
3120 adev->ip_blocks[i].version->funcs->name, r);
d38ceaf9 3121 return r;
2c1a2784 3122 }
482f0e53 3123 adev->ip_blocks[i].status.hw = true;
d38ceaf9
AD
3124 }
3125
3126 return 0;
3127}
3128
e3ecdffa
AD
3129/**
3130 * amdgpu_device_ip_resume - run resume for hardware IPs
3131 *
3132 * @adev: amdgpu_device pointer
3133 *
3134 * Main resume function for hardware IPs. The hardware IPs
3135 * are split into two resume functions because they are
3136 * also used in recovering from a GPU reset and some additional
3137 * steps need to be taken between them. In this case (S3/S4) they are
3138 * run sequentially.
3139 * Returns 0 on success, negative error code on failure.
3140 */
06ec9070 3141static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
fcf0649f
CZ
3142{
3143 int r;
3144
06ec9070 3145 r = amdgpu_device_ip_resume_phase1(adev);
fcf0649f
CZ
3146 if (r)
3147 return r;
7a3e0bb2
RZ
3148
3149 r = amdgpu_device_fw_loading(adev);
3150 if (r)
3151 return r;
3152
06ec9070 3153 r = amdgpu_device_ip_resume_phase2(adev);
fcf0649f
CZ
3154
3155 return r;
3156}
3157
e3ecdffa
AD
3158/**
3159 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
3160 *
3161 * @adev: amdgpu_device pointer
3162 *
3163 * Query the VBIOS data tables to determine if the board supports SR-IOV.
3164 */
4e99a44e 3165static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
048765ad 3166{
6867e1b5
ML
3167 if (amdgpu_sriov_vf(adev)) {
3168 if (adev->is_atom_fw) {
58ff791a 3169 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
6867e1b5
ML
3170 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3171 } else {
3172 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3173 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3174 }
3175
3176 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3177 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
a5bde2f9 3178 }
048765ad
AR
3179}
3180
e3ecdffa
AD
3181/**
3182 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3183 *
3184 * @asic_type: AMD asic type
3185 *
3186 * Check if there is DC (new modesetting infrastructure) support for an asic.
3187 * Returns true if DC has support, false if not.
3188 */
4562236b
HW
3189bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3190{
3191 switch (asic_type) {
3192#if defined(CONFIG_DRM_AMD_DC)
64200c46
MR
3193#if defined(CONFIG_DRM_AMD_DC_SI)
3194 case CHIP_TAHITI:
3195 case CHIP_PITCAIRN:
3196 case CHIP_VERDE:
3197 case CHIP_OLAND:
3198#endif
4562236b 3199 case CHIP_BONAIRE:
0d6fbccb 3200 case CHIP_KAVERI:
367e6687
AD
3201 case CHIP_KABINI:
3202 case CHIP_MULLINS:
d9fda248
HW
3203 /*
3204 * We have systems in the wild with these ASICs that require
3205 * LVDS and VGA support which is not supported with DC.
3206 *
3207 * Fallback to the non-DC driver here by default so as not to
3208 * cause regressions.
3209 */
3210 return amdgpu_dc > 0;
3211 case CHIP_HAWAII:
4562236b
HW
3212 case CHIP_CARRIZO:
3213 case CHIP_STONEY:
4562236b 3214 case CHIP_POLARIS10:
675fd32b 3215 case CHIP_POLARIS11:
2c8ad2d5 3216 case CHIP_POLARIS12:
675fd32b 3217 case CHIP_VEGAM:
4562236b
HW
3218 case CHIP_TONGA:
3219 case CHIP_FIJI:
42f8ffa1 3220 case CHIP_VEGA10:
dca7b401 3221 case CHIP_VEGA12:
c6034aa2 3222 case CHIP_VEGA20:
b86a1aa3 3223#if defined(CONFIG_DRM_AMD_DC_DCN)
fd187853 3224 case CHIP_RAVEN:
b4f199c7 3225 case CHIP_NAVI10:
8fceceb6 3226 case CHIP_NAVI14:
078655d9 3227 case CHIP_NAVI12:
e1c14c43 3228 case CHIP_RENOIR:
81d9bfb8 3229 case CHIP_SIENNA_CICHLID:
a6c5308f 3230 case CHIP_NAVY_FLOUNDER:
7cc656e2 3231 case CHIP_DIMGREY_CAVEFISH:
ddaed58b 3232 case CHIP_BEIGE_GOBY:
84b934bc 3233 case CHIP_VANGOGH:
c8b73f7f 3234 case CHIP_YELLOW_CARP:
42f8ffa1 3235#endif
fd187853 3236 return amdgpu_dc != 0;
4562236b
HW
3237#endif
3238 default:
93b09a9a 3239 if (amdgpu_dc > 0)
044a48f4 3240 DRM_INFO_ONCE("Display Core has been requested via kernel parameter "
93b09a9a 3241 "but isn't supported by ASIC, ignoring\n");
4562236b
HW
3242 return false;
3243 }
3244}
3245
3246/**
3247 * amdgpu_device_has_dc_support - check if dc is supported
3248 *
982a820b 3249 * @adev: amdgpu_device pointer
4562236b
HW
3250 *
3251 * Returns true for supported, false for not supported
3252 */
3253bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3254{
abaf210c
AS
3255 if (amdgpu_sriov_vf(adev) ||
3256 adev->enable_virtual_display ||
3257 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
2555039d
XY
3258 return false;
3259
4562236b
HW
3260 return amdgpu_device_asic_has_dc_support(adev->asic_type);
3261}
3262
d4535e2c
AG
3263static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3264{
3265 struct amdgpu_device *adev =
3266 container_of(__work, struct amdgpu_device, xgmi_reset_work);
d95e8e97 3267 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
d4535e2c 3268
c6a6e2db
AG
3269 /* It's a bug to not have a hive within this function */
3270 if (WARN_ON(!hive))
3271 return;
3272
3273 /*
3274 * Use task barrier to synchronize all xgmi reset works across the
3275 * hive. task_barrier_enter and task_barrier_exit will block
3276 * until all the threads running the xgmi reset works reach
3277 * those points. task_barrier_full will do both blocks.
3278 */
3279 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3280
3281 task_barrier_enter(&hive->tb);
4a580877 3282 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
c6a6e2db
AG
3283
3284 if (adev->asic_reset_res)
3285 goto fail;
3286
3287 task_barrier_exit(&hive->tb);
4a580877 3288 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
c6a6e2db
AG
3289
3290 if (adev->asic_reset_res)
3291 goto fail;
43c4d576 3292
8bc7b360
HZ
3293 if (adev->mmhub.ras_funcs &&
3294 adev->mmhub.ras_funcs->reset_ras_error_count)
3295 adev->mmhub.ras_funcs->reset_ras_error_count(adev);
c6a6e2db
AG
3296 } else {
3297
3298 task_barrier_full(&hive->tb);
3299 adev->asic_reset_res = amdgpu_asic_reset(adev);
3300 }
ce316fa5 3301
c6a6e2db 3302fail:
d4535e2c 3303 if (adev->asic_reset_res)
fed184e9 3304 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
4a580877 3305 adev->asic_reset_res, adev_to_drm(adev)->unique);
d95e8e97 3306 amdgpu_put_xgmi_hive(hive);
d4535e2c
AG
3307}
3308
71f98027
AD
3309static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3310{
3311 char *input = amdgpu_lockup_timeout;
3312 char *timeout_setting = NULL;
3313 int index = 0;
3314 long timeout;
3315 int ret = 0;
3316
3317 /*
67387dfe
AD
3318 * By default, the timeout for non-compute jobs is 10000 ms
3319 * and 60000 ms for compute jobs.
71f98027 3320 * In SR-IOV or passthrough mode, the timeout for compute
b7b2a316 3321 * jobs is 60000 ms by default.
71f98027
AD
3322 */
3323 adev->gfx_timeout = msecs_to_jiffies(10000);
3324 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
9882e278
ED
3325 if (amdgpu_sriov_vf(adev))
3326 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
3327 msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
71f98027 3328 else
67387dfe 3329 adev->compute_timeout = msecs_to_jiffies(60000);
71f98027 3330
f440ff44 3331 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
71f98027 3332 while ((timeout_setting = strsep(&input, ",")) &&
f440ff44 3333 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
71f98027
AD
3334 ret = kstrtol(timeout_setting, 0, &timeout);
3335 if (ret)
3336 return ret;
3337
3338 if (timeout == 0) {
3339 index++;
3340 continue;
3341 } else if (timeout < 0) {
3342 timeout = MAX_SCHEDULE_TIMEOUT;
3343 } else {
3344 timeout = msecs_to_jiffies(timeout);
3345 }
3346
3347 switch (index++) {
3348 case 0:
3349 adev->gfx_timeout = timeout;
3350 break;
3351 case 1:
3352 adev->compute_timeout = timeout;
3353 break;
3354 case 2:
3355 adev->sdma_timeout = timeout;
3356 break;
3357 case 3:
3358 adev->video_timeout = timeout;
3359 break;
3360 default:
3361 break;
3362 }
3363 }
3364 /*
3365 * There is only one value specified and
3366 * it should apply to all non-compute jobs.
3367 */
bcccee89 3368 if (index == 1) {
71f98027 3369 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
bcccee89
ED
3370 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3371 adev->compute_timeout = adev->gfx_timeout;
3372 }
71f98027
AD
3373 }
3374
3375 return ret;
3376}
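/*
 * Parameter format, spelled out from the switch above: up to four
 * comma-separated values are consumed in the order gfx, compute, sdma,
 * video; 0 keeps the default for that slot and a negative value means
 * MAX_SCHEDULE_TIMEOUT, e.g.:
 *
 *   amdgpu.lockup_timeout=10000,60000,0,-1
 *
 * A single value applies to all non-compute queues (and to compute as
 * well under SR-IOV or passthrough).
 */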
d4535e2c 3377
77f3a5cd
ND
3378static const struct attribute *amdgpu_dev_attributes[] = {
3379 &dev_attr_product_name.attr,
3380 &dev_attr_product_number.attr,
3381 &dev_attr_serial_number.attr,
3382 &dev_attr_pcie_replay_count.attr,
3383 NULL
3384};
3385
d38ceaf9
AD
3386/**
3387 * amdgpu_device_init - initialize the driver
3388 *
3389 * @adev: amdgpu_device pointer
d38ceaf9
AD
3390 * @flags: driver flags
3391 *
3392 * Initializes the driver info and hw (all asics).
3393 * Returns 0 for success or an error on failure.
3394 * Called at driver startup.
3395 */
3396int amdgpu_device_init(struct amdgpu_device *adev,
d38ceaf9
AD
3397 uint32_t flags)
3398{
8aba21b7
LT
3399 struct drm_device *ddev = adev_to_drm(adev);
3400 struct pci_dev *pdev = adev->pdev;
d38ceaf9 3401 int r, i;
b98c6299 3402 bool px = false;
95844d20 3403 u32 max_MBps;
d38ceaf9
AD
3404
3405 adev->shutdown = false;
d38ceaf9 3406 adev->flags = flags;
4e66d7d2
YZ
3407
3408 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3409 adev->asic_type = amdgpu_force_asic_type;
3410 else
3411 adev->asic_type = flags & AMD_ASIC_MASK;
3412
d38ceaf9 3413 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
593aa2d2 3414 if (amdgpu_emu_mode == 1)
8bdab6bb 3415 adev->usec_timeout *= 10;
770d13b1 3416 adev->gmc.gart_size = 512 * 1024 * 1024;
d38ceaf9
AD
3417 adev->accel_working = false;
3418 adev->num_rings = 0;
3419 adev->mman.buffer_funcs = NULL;
3420 adev->mman.buffer_funcs_ring = NULL;
3421 adev->vm_manager.vm_pte_funcs = NULL;
0c88b430 3422 adev->vm_manager.vm_pte_num_scheds = 0;
132f34e4 3423 adev->gmc.gmc_funcs = NULL;
7bd939d0 3424 adev->harvest_ip_mask = 0x0;
f54d1867 3425 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
b8866c26 3426 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
d38ceaf9
AD
3427
3428 adev->smc_rreg = &amdgpu_invalid_rreg;
3429 adev->smc_wreg = &amdgpu_invalid_wreg;
3430 adev->pcie_rreg = &amdgpu_invalid_rreg;
3431 adev->pcie_wreg = &amdgpu_invalid_wreg;
36b9a952
HR
3432 adev->pciep_rreg = &amdgpu_invalid_rreg;
3433 adev->pciep_wreg = &amdgpu_invalid_wreg;
4fa1c6a6
TZ
3434 adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3435 adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
d38ceaf9
AD
3436 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3437 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3438 adev->didt_rreg = &amdgpu_invalid_rreg;
3439 adev->didt_wreg = &amdgpu_invalid_wreg;
ccdbb20a
RZ
3440 adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3441 adev->gc_cac_wreg = &amdgpu_invalid_wreg;
d38ceaf9
AD
3442 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3443 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3444
3e39ab90
AD
3445 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3446 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3447 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
d38ceaf9
AD
3448
3449 /* mutex initializations are all done here so we
3450 * can recall functions without having locking issues */
0e5ca0d1 3451 mutex_init(&adev->firmware.mutex);
d38ceaf9
AD
3452 mutex_init(&adev->pm.mutex);
3453 mutex_init(&adev->gfx.gpu_clock_mutex);
3454 mutex_init(&adev->srbm_mutex);
b8866c26 3455 mutex_init(&adev->gfx.pipe_reserve_mutex);
d23ee13f 3456 mutex_init(&adev->gfx.gfx_off_mutex);
d38ceaf9 3457 mutex_init(&adev->grbm_idx_mutex);
d38ceaf9 3458 mutex_init(&adev->mn_lock);
e23b74aa 3459 mutex_init(&adev->virt.vf_errors.lock);
d38ceaf9 3460 hash_init(adev->mn_hash);
53b3f8f4 3461 atomic_set(&adev->in_gpu_reset, 0);
6049db43 3462 init_rwsem(&adev->reset_sem);
32eaeae0 3463 mutex_init(&adev->psp.mutex);
bd052211 3464 mutex_init(&adev->notifier_lock);
d38ceaf9 3465
9f6a7857
HR
3466 r = amdgpu_device_init_apu_flags(adev);
3467 if (r)
3468 return r;
3469
912dfc84
EQ
3470 r = amdgpu_device_check_arguments(adev);
3471 if (r)
3472 return r;
d38ceaf9 3473
d38ceaf9
AD
3474 spin_lock_init(&adev->mmio_idx_lock);
3475 spin_lock_init(&adev->smc_idx_lock);
3476 spin_lock_init(&adev->pcie_idx_lock);
3477 spin_lock_init(&adev->uvd_ctx_idx_lock);
3478 spin_lock_init(&adev->didt_idx_lock);
ccdbb20a 3479 spin_lock_init(&adev->gc_cac_idx_lock);
16abb5d2 3480 spin_lock_init(&adev->se_cac_idx_lock);
d38ceaf9 3481 spin_lock_init(&adev->audio_endpt_idx_lock);
95844d20 3482 spin_lock_init(&adev->mm_stats.lock);
d38ceaf9 3483
0c4e7fa5
CZ
3484 INIT_LIST_HEAD(&adev->shadow_list);
3485 mutex_init(&adev->shadow_list_lock);
3486
655ce9cb 3487 INIT_LIST_HEAD(&adev->reset_list);
3488
beff74bc
AD
3489 INIT_DELAYED_WORK(&adev->delayed_init_work,
3490 amdgpu_device_delayed_init_work_handler);
1e317b99
RZ
3491 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3492 amdgpu_device_delay_enable_gfx_off);
2dc80b00 3493
d4535e2c
AG
3494 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3495
d23ee13f 3496 adev->gfx.gfx_off_req_count = 1;
b6e79d9a 3497 adev->pm.ac_power = power_supply_is_system_supplied() > 0;
b1ddf548 3498
b265bdbd
EQ
3499 atomic_set(&adev->throttling_logging_enabled, 1);
3500 /*
3501 * If throttling continues, logging will be performed every minute
3502 * to avoid log flooding. "-1" is subtracted since the thermal
3503 * throttling interrupt comes every second. Thus, the total logging
3504 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3505 * for throttling interrupt) = 60 seconds.
3506 */
3507 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3508 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3509
0fa49558
AX
3510 /* Registers mapping */
3511 /* TODO: block userspace mapping of io register */
da69c161
KW
3512 if (adev->asic_type >= CHIP_BONAIRE) {
3513 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3514 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3515 } else {
3516 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3517 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3518 }
d38ceaf9 3519
d38ceaf9
AD
3520 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3521 if (adev->rmmio == NULL) {
3522 return -ENOMEM;
3523 }
3524 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3525 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3526
b2109d8e
JX
3527 /* enable PCIE atomic ops */
3528 r = pci_enable_atomic_ops_to_root(adev->pdev,
3529 PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3530 PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3531 if (r) {
3532 adev->have_atomics_support = false;
3533 DRM_INFO("PCIE atomic ops are not supported\n");
3534 } else {
3535 adev->have_atomics_support = true;
3536 }
3537
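	/*
	 * For reference: the two DEVCAP2 flags above request that the root
	 * port complete 32- and 64-bit PCIe AtomicOps. Later consumers are
	 * expected to gate on the cached flag rather than re-probe, e.g.
	 * (hypothetical caller sketch):
	 *
	 *   if (!adev->have_atomics_support)
	 *           return -ENOTSUPP;
	 *
	 * amdkfd, for instance, cares about this on ASICs that need PCIe
	 * atomics.
	 */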
5494d864
AD
3538 amdgpu_device_get_pcie_info(adev);
3539
b239c017
JX
3540 if (amdgpu_mcbp)
3541 DRM_INFO("MCBP is enabled\n");
3542
5f84cc63
JX
3543 if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
3544 adev->enable_mes = true;
3545
3aa0115d
ML
3546 /* detect hw virtualization here */
3547 amdgpu_detect_virtualization(adev);
3548
dffa11b4
ML
3549 r = amdgpu_device_get_job_timeout_settings(adev);
3550 if (r) {
3551 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
4ef87d8f 3552 return r;
a190d1c7
XY
3553 }
3554
d38ceaf9 3555 /* early init functions */
06ec9070 3556 r = amdgpu_device_ip_early_init(adev);
d38ceaf9 3557 if (r)
4ef87d8f 3558 return r;
d38ceaf9 3559
6585661d
OZ
3560 /* doorbell bar mapping and doorbell index init */
3561 amdgpu_device_doorbell_init(adev);
3562
9475a943
SL
3563 if (amdgpu_emu_mode == 1) {
3564 /* post the asic in emulation mode */
3565 emu_soc_asic_init(adev);
bfca0289 3566 goto fence_driver_init;
9475a943 3567 }
bfca0289 3568
04442bf7
LL
3569 amdgpu_reset_init(adev);
3570
4e99a44e
ML
3571 /* detect if we are with an SRIOV vbios */
3572 amdgpu_device_detect_sriov_bios(adev);
048765ad 3573
95e8e59e
AD
3574 /* check if we need to reset the asic
3575 * E.g., driver was not cleanly unloaded previously, etc.
3576 */
f14899fd 3577 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
e3c1b071 3578 if (adev->gmc.xgmi.num_physical_nodes) {
3579 dev_info(adev->dev, "Pending hive reset.\n");
3580 adev->gmc.xgmi.pending_reset = true;
3581 /* Only need to init the necessary blocks for SMU to handle the reset */
3582 for (i = 0; i < adev->num_ip_blocks; i++) {
3583 if (!adev->ip_blocks[i].status.valid)
3584 continue;
3585 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3586 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3587 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3588 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
751f43e7 3589 DRM_DEBUG("IP %s disabled for hw_init.\n",
e3c1b071 3590 adev->ip_blocks[i].version->funcs->name);
3591 adev->ip_blocks[i].status.hw = true;
3592 }
3593 }
3594 } else {
3595 r = amdgpu_asic_reset(adev);
3596 if (r) {
3597 dev_err(adev->dev, "asic reset on init failed\n");
3598 goto failed;
3599 }
95e8e59e
AD
3600 }
3601 }
3602
8f66090b 3603 pci_enable_pcie_error_reporting(adev->pdev);
c9a6b82f 3604
d38ceaf9 3605 /* Post card if necessary */
39c640c0 3606 if (amdgpu_device_need_post(adev)) {
d38ceaf9 3607 if (!adev->bios) {
bec86378 3608 dev_err(adev->dev, "no vBIOS found\n");
83ba126a
AD
3609 r = -EINVAL;
3610 goto failed;
d38ceaf9 3611 }
bec86378 3612 DRM_INFO("GPU posting now...\n");
4d2997ab 3613 r = amdgpu_device_asic_init(adev);
4e99a44e
ML
3614 if (r) {
3615 dev_err(adev->dev, "gpu post error!\n");
3616 goto failed;
3617 }
d38ceaf9
AD
3618 }
3619
88b64e95
AD
3620 if (adev->is_atom_fw) {
3621 /* Initialize clocks */
3622 r = amdgpu_atomfirmware_get_clock_info(adev);
3623 if (r) {
3624 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
e23b74aa 3625 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
88b64e95
AD
3626 goto failed;
3627 }
3628 } else {
a5bde2f9
AD
3629 /* Initialize clocks */
3630 r = amdgpu_atombios_get_clock_info(adev);
3631 if (r) {
3632 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
e23b74aa 3633 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
89041940 3634 goto failed;
a5bde2f9
AD
3635 }
3636 /* init i2c buses */
4562236b
HW
3637 if (!amdgpu_device_has_dc_support(adev))
3638 amdgpu_atombios_i2c_init(adev);
2c1a2784 3639 }
d38ceaf9 3640
bfca0289 3641fence_driver_init:
d38ceaf9
AD
3642 /* Fence driver */
3643 r = amdgpu_fence_driver_init(adev);
2c1a2784
AD
3644 if (r) {
3645 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
e23b74aa 3646 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
83ba126a 3647 goto failed;
2c1a2784 3648 }
d38ceaf9
AD
3649
3650 /* init the mode config */
4a580877 3651 drm_mode_config_init(adev_to_drm(adev));
d38ceaf9 3652
06ec9070 3653 r = amdgpu_device_ip_init(adev);
d38ceaf9 3654 if (r) {
8840a387 3655 /* failed in exclusive mode due to timeout */
3656 if (amdgpu_sriov_vf(adev) &&
3657 !amdgpu_sriov_runtime(adev) &&
3658 amdgpu_virt_mmio_blocked(adev) &&
3659 !amdgpu_virt_wait_reset(adev)) {
3660 dev_err(adev->dev, "VF exclusive mode timeout\n");
1daee8b4
PD
3661 /* Don't send request since VF is inactive. */
3662 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3663 adev->virt.ops = NULL;
8840a387 3664 r = -EAGAIN;
970fd197 3665 goto release_ras_con;
8840a387 3666 }
06ec9070 3667 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
e23b74aa 3668 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
970fd197 3669 goto release_ras_con;
d38ceaf9
AD
3670 }
3671
d69b8971
YZ
3672 dev_info(adev->dev,
3673 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
d7f72fe4
YZ
3674 adev->gfx.config.max_shader_engines,
3675 adev->gfx.config.max_sh_per_se,
3676 adev->gfx.config.max_cu_per_sh,
3677 adev->gfx.cu_info.number);
3678
d38ceaf9
AD
3679 adev->accel_working = true;
3680
e59c0205
AX
3681 amdgpu_vm_check_compute_bug(adev);
3682
95844d20
MO
3683 /* Initialize the buffer migration limit. */
3684 if (amdgpu_moverate >= 0)
3685 max_MBps = amdgpu_moverate;
3686 else
3687 max_MBps = 8; /* Allow 8 MB/s. */
3688 /* Get a log2 for easy divisions. */
3689 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
3690
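	/*
	 * Worked example of the log2 trick (illustrative numbers): with the
	 * default max_MBps = 8, log2_max_MBps = ilog2(8) = 3. 8 MB/s is
	 * 8 bytes per microsecond, so a time budget in us can be turned into
	 * a byte budget with a shift, roughly
	 *
	 *   bytes = time_us << adev->mm_stats.log2_max_MBps;
	 *
	 * which avoids a 64-bit division on the command-submission path.
	 */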
9bc92b9c
ML
3691 amdgpu_fbdev_init(adev);
3692
d2f52ac8 3693 r = amdgpu_pm_sysfs_init(adev);
7c868b59
YT
3694 if (r) {
3695 adev->pm_sysfs_en = false;
d2f52ac8 3696 DRM_ERROR("registering pm debugfs failed (%d).\n", r);
7c868b59
YT
3697 } else
3698 adev->pm_sysfs_en = true;
d2f52ac8 3699
5bb23532 3700 r = amdgpu_ucode_sysfs_init(adev);
7c868b59
YT
3701 if (r) {
3702 adev->ucode_sysfs_en = false;
5bb23532 3703 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
7c868b59
YT
3704 } else
3705 adev->ucode_sysfs_en = true;
5bb23532 3706
d38ceaf9
AD
3707 if ((amdgpu_testing & 1)) {
3708 if (adev->accel_working)
3709 amdgpu_test_moves(adev);
3710 else
3711 DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");
3712 }
d38ceaf9
AD
3713 if (amdgpu_benchmarking) {
3714 if (adev->accel_working)
3715 amdgpu_benchmark(adev, amdgpu_benchmarking);
3716 else
3717 DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
3718 }
3719
b0adca4d
EQ
3720 /*
3721 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3722 * Otherwise the mgpu fan boost feature will be skipped because
3723 * the gpu instance count would be too low.
3724 */
3725 amdgpu_register_gpu_instance(adev);
3726
d38ceaf9
AD
3727 /* enable clockgating, etc., after ib tests since some blocks require
3728 * explicit gating rather than handling it automatically.
3729 */
e3c1b071 3730 if (!adev->gmc.xgmi.pending_reset) {
3731 r = amdgpu_device_ip_late_init(adev);
3732 if (r) {
3733 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3734 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
970fd197 3735 goto release_ras_con;
e3c1b071 3736 }
3737 /* must succeed. */
3738 amdgpu_ras_resume(adev);
3739 queue_delayed_work(system_wq, &adev->delayed_init_work,
3740 msecs_to_jiffies(AMDGPU_RESUME_MS));
2c1a2784 3741 }
d38ceaf9 3742
2c738637
ML
3743 if (amdgpu_sriov_vf(adev))
3744 flush_delayed_work(&adev->delayed_init_work);
3745
77f3a5cd 3746 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
5aea5327 3747 if (r)
77f3a5cd 3748 dev_err(adev->dev, "Could not create amdgpu device attr\n");
bd607166 3749
d155bef0
AB
3750 if (IS_ENABLED(CONFIG_PERF_EVENTS))
3751 r = amdgpu_pmu_init(adev);
9c7c85f7
JK
3752 if (r)
3753 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3754
c1dd4aa6
AG
3755 /* Keep the stored PCI config space at hand for restore after a sudden PCI error */
3756 if (amdgpu_device_cache_pci_state(adev->pdev))
3757 pci_restore_state(pdev);
3758
8c3dd61c
KHF
3759 /* if we have more than one VGA card, then disable the amdgpu VGA resources */
3760 /* this will fail for cards that aren't VGA class devices, just
3761 * ignore it */
3762 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
3763 vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode);
3764
3765 if (amdgpu_device_supports_px(ddev)) {
3766 px = true;
3767 vga_switcheroo_register_client(adev->pdev,
3768 &amdgpu_switcheroo_ops, px);
3769 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3770 }
3771
e3c1b071 3772 if (adev->gmc.xgmi.pending_reset)
3773 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
3774 msecs_to_jiffies(AMDGPU_RESUME_MS));
3775
d38ceaf9 3776 return 0;
83ba126a 3777
970fd197
SY
3778release_ras_con:
3779 amdgpu_release_ras_context(adev);
3780
83ba126a 3781failed:
89041940 3782 amdgpu_vf_error_trans_all(adev);
8840a387 3783
83ba126a 3784 return r;
d38ceaf9
AD
3785}
3786
07775fc1
AG
3787static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
3788{
3789 /* Clear all CPU mappings pointing to this device */
3790 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
3791
3792 /* Unmap all mapped bars - Doorbell, registers and VRAM */
3793 amdgpu_device_doorbell_fini(adev);
3794
3795 iounmap(adev->rmmio);
3796 adev->rmmio = NULL;
3797 if (adev->mman.aper_base_kaddr)
3798 iounmap(adev->mman.aper_base_kaddr);
3799 adev->mman.aper_base_kaddr = NULL;
3800
3801 /* Memory manager related */
3802 if (!adev->gmc.xgmi.connected_to_cpu) {
3803 arch_phys_wc_del(adev->gmc.vram_mtrr);
3804 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
3805 }
3806}
3807
d38ceaf9
AD
3808/**
3809 * amdgpu_device_fini - tear down the driver
3810 *
3811 * @adev: amdgpu_device pointer
3812 *
3813 * Tear down the driver info (all asics).
3814 * Called at driver shutdown.
3815 */
72c8c97b 3816void amdgpu_device_fini_hw(struct amdgpu_device *adev)
d38ceaf9 3817{
aac89168 3818 dev_info(adev->dev, "amdgpu: finishing device.\n");
9f875167 3819 flush_delayed_work(&adev->delayed_init_work);
bb0cd09b 3820 ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
d0d13fe8 3821 adev->shutdown = true;
9f875167 3822
752c683d
ML
3823 /* make sure IB test finished before entering exclusive mode
3824 * to avoid preemption on IB test
3825 */
519b8b76 3826 if (amdgpu_sriov_vf(adev)) {
752c683d 3827 amdgpu_virt_request_full_gpu(adev, false);
519b8b76
BZ
3828 amdgpu_virt_fini_data_exchange(adev);
3829 }
752c683d 3830
e5b03032
ML
3831 /* disable all interrupts */
3832 amdgpu_irq_disable_all(adev);
ff97cba8
ML
3833 if (adev->mode_info.mode_config_initialized){
3834 if (!amdgpu_device_has_dc_support(adev))
4a580877 3835 drm_helper_force_disable_all(adev_to_drm(adev));
ff97cba8 3836 else
4a580877 3837 drm_atomic_helper_shutdown(adev_to_drm(adev));
ff97cba8 3838 }
72c8c97b
AG
3839 amdgpu_fence_driver_fini_hw(adev);
3840
7c868b59
YT
3841 if (adev->pm_sysfs_en)
3842 amdgpu_pm_sysfs_fini(adev);
72c8c97b
AG
3843 if (adev->ucode_sysfs_en)
3844 amdgpu_ucode_sysfs_fini(adev);
3845 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
3846
d38ceaf9 3847 amdgpu_fbdev_fini(adev);
72c8c97b
AG
3848
3849 amdgpu_irq_fini_hw(adev);
e9669fb7
AG
3850
3851 amdgpu_device_ip_fini_early(adev);
d10d0daa
AG
3852
3853 amdgpu_gart_dummy_page_fini(adev);
07775fc1
AG
3854
3855 amdgpu_device_unmap_mmio(adev);
72c8c97b
AG
3856}
3857
3858void amdgpu_device_fini_sw(struct amdgpu_device *adev)
3859{
e230ac11 3860 amdgpu_device_ip_fini(adev);
72c8c97b 3861 amdgpu_fence_driver_fini_sw(adev);
75e1658e
ND
3862 release_firmware(adev->firmware.gpu_info_fw);
3863 adev->firmware.gpu_info_fw = NULL;
d38ceaf9 3864 adev->accel_working = false;
04442bf7
LL
3865
3866 amdgpu_reset_fini(adev);
3867
d38ceaf9 3868 /* free i2c buses */
4562236b
HW
3869 if (!amdgpu_device_has_dc_support(adev))
3870 amdgpu_i2c_fini(adev);
bfca0289
SL
3871
3872 if (amdgpu_emu_mode != 1)
3873 amdgpu_atombios_fini(adev);
3874
d38ceaf9
AD
3875 kfree(adev->bios);
3876 adev->bios = NULL;
b98c6299 3877 if (amdgpu_device_supports_px(adev_to_drm(adev))) {
84c8b22e 3878 vga_switcheroo_unregister_client(adev->pdev);
83ba126a 3879 vga_switcheroo_fini_domain_pm_ops(adev->dev);
b98c6299 3880 }
38d6be81
AD
3881 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
3882 vga_client_register(adev->pdev, NULL, NULL, NULL);
e9bc1bf7 3883
d155bef0
AB
3884 if (IS_ENABLED(CONFIG_PERF_EVENTS))
3885 amdgpu_pmu_fini(adev);
72de33f8 3886 if (adev->mman.discovery_bin)
a190d1c7 3887 amdgpu_discovery_fini(adev);
72c8c97b
AG
3888
3889 kfree(adev->pci_state);
3890
d38ceaf9
AD
3891}
3892
3893
3894/*
3895 * Suspend & resume.
3896 */
3897/**
810ddc3a 3898 * amdgpu_device_suspend - initiate device suspend
d38ceaf9 3899 *
87e3f136 3900 * @dev: drm dev pointer
87e3f136 3901 * @fbcon: notify the fbdev of suspend
d38ceaf9
AD
3902 *
3903 * Puts the hw in the suspend state (all asics).
3904 * Returns 0 for success or an error on failure.
3905 * Called at driver suspend.
3906 */
de185019 3907int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
d38ceaf9 3908{
a2e15b0e 3909 struct amdgpu_device *adev = drm_to_adev(dev);
d38ceaf9 3910
d38ceaf9
AD
3911 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3912 return 0;
3913
44779b43 3914 adev->in_suspend = true;
3fa8f89d
S
3915
3916 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
3917 DRM_WARN("smart shift update failed\n");
3918
d38ceaf9
AD
3919 drm_kms_helper_poll_disable(dev);
3920
5f818173
S
3921 if (fbcon)
3922 amdgpu_fbdev_set_suspend(adev, 1);
3923
beff74bc 3924 cancel_delayed_work_sync(&adev->delayed_init_work);
a5459475 3925
5e6932fe 3926 amdgpu_ras_suspend(adev);
3927
2196927b 3928 amdgpu_device_ip_suspend_phase1(adev);
fe1053b7 3929
5d3a2d95
AD
3930 if (!adev->in_s0ix)
3931 amdgpu_amdkfd_suspend(adev, adev->in_runpm);
94fa5660 3932
d38ceaf9
AD
3933 /* evict vram memory */
3934 amdgpu_bo_evict_vram(adev);
3935
5ceb54c6 3936 amdgpu_fence_driver_suspend(adev);
d38ceaf9 3937
2196927b 3938 amdgpu_device_ip_suspend_phase2(adev);
a0a71e49
AD
3939 /* evict remaining vram memory
3940 * This second call to evict vram is to evict the gart page table
3941 * using the CPU.
3942 */
d38ceaf9
AD
3943 amdgpu_bo_evict_vram(adev);
3944
d38ceaf9
AD
3945 return 0;
3946}
3947
3948/**
810ddc3a 3949 * amdgpu_device_resume - initiate device resume
d38ceaf9 3950 *
87e3f136 3951 * @dev: drm dev pointer
87e3f136 3952 * @fbcon: notify the fbdev of resume
d38ceaf9
AD
3953 *
3954 * Bring the hw back to operating state (all asics).
3955 * Returns 0 for success or an error on failure.
3956 * Called at driver resume.
3957 */
de185019 3958int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
d38ceaf9 3959{
1348969a 3960 struct amdgpu_device *adev = drm_to_adev(dev);
03161a6e 3961 int r = 0;
d38ceaf9
AD
3962
3963 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3964 return 0;
3965
62498733 3966 if (adev->in_s0ix)
628c36d7
PL
3967 amdgpu_gfx_state_change_set(adev, sGpuChangeState_D0Entry);
3968
d38ceaf9 3969 /* post card */
39c640c0 3970 if (amdgpu_device_need_post(adev)) {
4d2997ab 3971 r = amdgpu_device_asic_init(adev);
74b0b157 3972 if (r)
aac89168 3973 dev_err(adev->dev, "amdgpu asic init failed\n");
74b0b157 3974 }
d38ceaf9 3975
06ec9070 3976 r = amdgpu_device_ip_resume(adev);
e6707218 3977 if (r) {
aac89168 3978 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
4d3b9ae5 3979 return r;
e6707218 3980 }
5ceb54c6
AD
3981 amdgpu_fence_driver_resume(adev);
3982
d38ceaf9 3983
06ec9070 3984 r = amdgpu_device_ip_late_init(adev);
03161a6e 3985 if (r)
4d3b9ae5 3986 return r;
d38ceaf9 3987
beff74bc
AD
3988 queue_delayed_work(system_wq, &adev->delayed_init_work,
3989 msecs_to_jiffies(AMDGPU_RESUME_MS));
3990
5d3a2d95
AD
3991 if (!adev->in_s0ix) {
3992 r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
3993 if (r)
3994 return r;
3995 }
756e6880 3996
96a5d8d4 3997 /* Make sure IB tests flushed */
beff74bc 3998 flush_delayed_work(&adev->delayed_init_work);
96a5d8d4 3999
a2e15b0e 4000 if (fbcon)
4d3b9ae5 4001 amdgpu_fbdev_set_suspend(adev, 0);
d38ceaf9
AD
4002
4003 drm_kms_helper_poll_enable(dev);
23a1a9e5 4004
5e6932fe 4005 amdgpu_ras_resume(adev);
4006
23a1a9e5
L
4007 /*
4008 * Most of the connector probing functions try to acquire runtime pm
4009 * refs to ensure that the GPU is powered on when connector polling is
4010 * performed. Since we're calling this from a runtime PM callback,
4011 * trying to acquire rpm refs will cause us to deadlock.
4012 *
4013 * Since we're guaranteed to be holding the rpm lock, it's safe to
4014 * temporarily disable the rpm helpers so this doesn't deadlock us.
4015 */
4016#ifdef CONFIG_PM
4017 dev->dev->power.disable_depth++;
4018#endif
4562236b
HW
4019 if (!amdgpu_device_has_dc_support(adev))
4020 drm_helper_hpd_irq_event(dev);
4021 else
4022 drm_kms_helper_hotplug_event(dev);
23a1a9e5
L
4023#ifdef CONFIG_PM
4024 dev->dev->power.disable_depth--;
4025#endif
44779b43
RZ
4026 adev->in_suspend = false;
4027
3fa8f89d
S
4028 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
4029 DRM_WARN("smart shift update failed\n");
4030
4d3b9ae5 4031 return 0;
d38ceaf9
AD
4032}
4033
e3ecdffa
AD
4034/**
4035 * amdgpu_device_ip_check_soft_reset - did soft reset succeed
4036 *
4037 * @adev: amdgpu_device pointer
4038 *
4039 * The list of all the hardware IPs that make up the asic is walked and
4040 * the check_soft_reset callbacks are run. check_soft_reset determines
4041 * if the asic is still hung or not.
4042 * Returns true if any of the IPs are still in a hung state, false if not.
4043 */
06ec9070 4044static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
63fbf42f
CZ
4045{
4046 int i;
4047 bool asic_hang = false;
4048
f993d628
ML
4049 if (amdgpu_sriov_vf(adev))
4050 return true;
4051
8bc04c29
AD
4052 if (amdgpu_asic_need_full_reset(adev))
4053 return true;
4054
63fbf42f 4055 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 4056 if (!adev->ip_blocks[i].status.valid)
63fbf42f 4057 continue;
a1255107
AD
4058 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
4059 adev->ip_blocks[i].status.hang =
4060 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
4061 if (adev->ip_blocks[i].status.hang) {
aac89168 4062 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
63fbf42f
CZ
4063 asic_hang = true;
4064 }
4065 }
4066 return asic_hang;
4067}
4068
e3ecdffa
AD
4069/**
4070 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
4071 *
4072 * @adev: amdgpu_device pointer
4073 *
4074 * The list of all the hardware IPs that make up the asic is walked and the
4075 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset
4076 * handles any IP specific hardware or software state changes that are
4077 * necessary for a soft reset to succeed.
4078 * Returns 0 on success, negative error code on failure.
4079 */
06ec9070 4080static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
d31a501e
CZ
4081{
4082 int i, r = 0;
4083
4084 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 4085 if (!adev->ip_blocks[i].status.valid)
d31a501e 4086 continue;
a1255107
AD
4087 if (adev->ip_blocks[i].status.hang &&
4088 adev->ip_blocks[i].version->funcs->pre_soft_reset) {
4089 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
d31a501e
CZ
4090 if (r)
4091 return r;
4092 }
4093 }
4094
4095 return 0;
4096}
4097
e3ecdffa
AD
4098/**
4099 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
4100 *
4101 * @adev: amdgpu_device pointer
4102 *
4103 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu
4104 * reset is necessary to recover.
4105 * Returns true if a full asic reset is required, false if not.
4106 */
06ec9070 4107static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
35d782fe 4108{
da146d3b
AD
4109 int i;
4110
8bc04c29
AD
4111 if (amdgpu_asic_need_full_reset(adev))
4112 return true;
4113
da146d3b 4114 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 4115 if (!adev->ip_blocks[i].status.valid)
da146d3b 4116 continue;
a1255107
AD
4117 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
4118 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
4119 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
98512bb8
KW
4120 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
4121 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
a1255107 4122 if (adev->ip_blocks[i].status.hang) {
aac89168 4123 dev_info(adev->dev, "Some block need full reset!\n");
da146d3b
AD
4124 return true;
4125 }
4126 }
35d782fe
CZ
4127 }
4128 return false;
4129}
4130
e3ecdffa
AD
4131/**
4132 * amdgpu_device_ip_soft_reset - do a soft reset
4133 *
4134 * @adev: amdgpu_device pointer
4135 *
4136 * The list of all the hardware IPs that make up the asic is walked and the
4137 * soft_reset callbacks are run if the block is hung. soft_reset handles any
4138 * IP specific hardware or software state changes that are necessary to soft
4139 * reset the IP.
4140 * Returns 0 on success, negative error code on failure.
4141 */
06ec9070 4142static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
35d782fe
CZ
4143{
4144 int i, r = 0;
4145
4146 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 4147 if (!adev->ip_blocks[i].status.valid)
35d782fe 4148 continue;
a1255107
AD
4149 if (adev->ip_blocks[i].status.hang &&
4150 adev->ip_blocks[i].version->funcs->soft_reset) {
4151 r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
35d782fe
CZ
4152 if (r)
4153 return r;
4154 }
4155 }
4156
4157 return 0;
4158}
4159
e3ecdffa
AD
4160/**
4161 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
4162 *
4163 * @adev: amdgpu_device pointer
4164 *
4165 * The list of all the hardware IPs that make up the asic is walked and the
4166 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
4167 * handles any IP specific hardware or software state changes that are
4168 * necessary after the IP has been soft reset.
4169 * Returns 0 on success, negative error code on failure.
4170 */
06ec9070 4171static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
35d782fe
CZ
4172{
4173 int i, r = 0;
4174
4175 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 4176 if (!adev->ip_blocks[i].status.valid)
35d782fe 4177 continue;
a1255107
AD
4178 if (adev->ip_blocks[i].status.hang &&
4179 adev->ip_blocks[i].version->funcs->post_soft_reset)
4180 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
35d782fe
CZ
4181 if (r)
4182 return r;
4183 }
4184
4185 return 0;
4186}
4187
e3ecdffa 4188/**
c33adbc7 4189 * amdgpu_device_recover_vram - Recover some VRAM contents
e3ecdffa
AD
4190 *
4191 * @adev: amdgpu_device pointer
4192 *
4193 * Restores the contents of VRAM buffers from the shadows in GTT. Used to
4194 * restore things like GPUVM page tables after a GPU reset where
4195 * the contents of VRAM might be lost.
403009bf
CK
4196 *
4197 * Returns:
4198 * 0 on success, negative error code on failure.
e3ecdffa 4199 */
c33adbc7 4200static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
c41d1cf6 4201{
c41d1cf6 4202 struct dma_fence *fence = NULL, *next = NULL;
403009bf 4203 struct amdgpu_bo *shadow;
e18aaea7 4204 struct amdgpu_bo_vm *vmbo;
403009bf 4205 long r = 1, tmo;
c41d1cf6
ML
4206
4207 if (amdgpu_sriov_runtime(adev))
b045d3af 4208 tmo = msecs_to_jiffies(8000);
c41d1cf6
ML
4209 else
4210 tmo = msecs_to_jiffies(100);
4211
aac89168 4212 dev_info(adev->dev, "recover vram bo from shadow start\n");
c41d1cf6 4213 mutex_lock(&adev->shadow_list_lock);
e18aaea7
ND
4214 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) {
4215 shadow = &vmbo->bo;
403009bf 4216 /* No need to recover an evicted BO */
d3116756
CK
4217 if (shadow->tbo.resource->mem_type != TTM_PL_TT ||
4218 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET ||
4219 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM)
403009bf
CK
4220 continue;
4221
4222 r = amdgpu_bo_restore_shadow(shadow, &next);
4223 if (r)
4224 break;
4225
c41d1cf6 4226 if (fence) {
1712fb1a 4227 tmo = dma_fence_wait_timeout(fence, false, tmo);
403009bf
CK
4228 dma_fence_put(fence);
4229 fence = next;
1712fb1a 4230 if (tmo == 0) {
4231 r = -ETIMEDOUT;
c41d1cf6 4232 break;
1712fb1a 4233 } else if (tmo < 0) {
4234 r = tmo;
4235 break;
4236 }
403009bf
CK
4237 } else {
4238 fence = next;
c41d1cf6 4239 }
c41d1cf6
ML
4240 }
4241 mutex_unlock(&adev->shadow_list_lock);
4242
403009bf
CK
4243 if (fence)
4244 tmo = dma_fence_wait_timeout(fence, false, tmo);
c41d1cf6
ML
4245 dma_fence_put(fence);
4246
1712fb1a 4247 if (r < 0 || tmo <= 0) {
aac89168 4248 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
403009bf
CK
4249 return -EIO;
4250 }
c41d1cf6 4251
aac89168 4252 dev_info(adev->dev, "recover vram bo from shadow done\n");
403009bf 4253 return 0;
c41d1cf6
ML
4254}
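/*
 * On the timeout bookkeeping above: dma_fence_wait_timeout() returns the
 * remaining jiffies when the fence signals, 0 on timeout and a negative error
 * code otherwise, so the loop threads one shrinking budget (tmo) through the
 * successive shadow-restore fences rather than granting each fence a full
 * interval of its own.
 */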
4255
a90ad3c2 4256
e3ecdffa 4257/**
06ec9070 4258 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
5740682e 4259 *
982a820b 4260 * @adev: amdgpu_device pointer
87e3f136 4261 * @from_hypervisor: request from hypervisor
5740682e
ML
4262 *
4263 * do VF FLR and reinitialize the ASIC
3f48c681 4264 * Returns 0 on success, an error code otherwise.
e3ecdffa
AD
4265 */
4266static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4267 bool from_hypervisor)
5740682e
ML
4268{
4269 int r;
4270
4271 if (from_hypervisor)
4272 r = amdgpu_virt_request_full_gpu(adev, true);
4273 else
4274 r = amdgpu_virt_reset_gpu(adev);
4275 if (r)
4276 return r;
a90ad3c2 4277
b639c22c
JZ
4278 amdgpu_amdkfd_pre_reset(adev);
4279
a90ad3c2 4280 /* Resume IP prior to SMC */
06ec9070 4281 r = amdgpu_device_ip_reinit_early_sriov(adev);
5740682e
ML
4282 if (r)
4283 goto error;
a90ad3c2 4284
c9ffa427 4285 amdgpu_virt_init_data_exchange(adev);
a90ad3c2 4286 /* we need recover gart prior to run SMC/CP/SDMA resume */
6c28aed6 4287 amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
a90ad3c2 4288
7a3e0bb2
RZ
4289 r = amdgpu_device_fw_loading(adev);
4290 if (r)
4291 return r;
4292
a90ad3c2 4293 /* now we are okay to resume SMC/CP/SDMA */
06ec9070 4294 r = amdgpu_device_ip_reinit_late_sriov(adev);
5740682e
ML
4295 if (r)
4296 goto error;
a90ad3c2
ML
4297
4298 amdgpu_irq_gpu_reset_resume_helper(adev);
5740682e 4299 r = amdgpu_ib_ring_tests(adev);
f81e8d53 4300 amdgpu_amdkfd_post_reset(adev);
a90ad3c2 4301
abc34253 4302error:
c41d1cf6 4303 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
e3526257 4304 amdgpu_inc_vram_lost(adev);
c33adbc7 4305 r = amdgpu_device_recover_vram(adev);
a90ad3c2 4306 }
437f3e0b 4307 amdgpu_virt_release_full_gpu(adev, true);
a90ad3c2
ML
4308
4309 return r;
4310}
4311
9a1cddd6 4312/**
4313 * amdgpu_device_has_job_running - check if there is any job in the pending list
4314 *
982a820b 4315 * @adev: amdgpu_device pointer
9a1cddd6 4316 *
4317 * check if there is any job in the pending list
4318 */
4319bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4320{
4321 int i;
4322 struct drm_sched_job *job;
4323
4324 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4325 struct amdgpu_ring *ring = adev->rings[i];
4326
4327 if (!ring || !ring->sched.thread)
4328 continue;
4329
4330 spin_lock(&ring->sched.job_list_lock);
6efa4b46
LT
4331 job = list_first_entry_or_null(&ring->sched.pending_list,
4332 struct drm_sched_job, list);
9a1cddd6 4333 spin_unlock(&ring->sched.job_list_lock);
4334 if (job)
4335 return true;
4336 }
4337 return false;
4338}
4339
12938fad
CK
4340/**
4341 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4342 *
982a820b 4343 * @adev: amdgpu_device pointer
12938fad
CK
4344 *
4345 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4346 * a hung GPU.
4347 */
4348bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4349{
4350 if (!amdgpu_device_ip_check_soft_reset(adev)) {
aac89168 4351 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
12938fad
CK
4352 return false;
4353 }
4354
3ba7b418
AG
4355 if (amdgpu_gpu_recovery == 0)
4356 goto disabled;
4357
4358 if (amdgpu_sriov_vf(adev))
4359 return true;
4360
4361 if (amdgpu_gpu_recovery == -1) {
4362 switch (adev->asic_type) {
fc42d47c
AG
4363 case CHIP_BONAIRE:
4364 case CHIP_HAWAII:
3ba7b418
AG
4365 case CHIP_TOPAZ:
4366 case CHIP_TONGA:
4367 case CHIP_FIJI:
4368 case CHIP_POLARIS10:
4369 case CHIP_POLARIS11:
4370 case CHIP_POLARIS12:
4371 case CHIP_VEGAM:
4372 case CHIP_VEGA20:
4373 case CHIP_VEGA10:
4374 case CHIP_VEGA12:
c43b849f 4375 case CHIP_RAVEN:
e9d4cf91 4376 case CHIP_ARCTURUS:
2cb44fb0 4377 case CHIP_RENOIR:
658c6639
AD
4378 case CHIP_NAVI10:
4379 case CHIP_NAVI14:
4380 case CHIP_NAVI12:
131a3c74 4381 case CHIP_SIENNA_CICHLID:
665fe4dc 4382 case CHIP_NAVY_FLOUNDER:
27859ee3 4383 case CHIP_DIMGREY_CAVEFISH:
a2f55040 4384 case CHIP_BEIGE_GOBY:
fe68ceef 4385 case CHIP_VANGOGH:
ea4e96a7 4386 case CHIP_ALDEBARAN:
3ba7b418
AG
4387 break;
4388 default:
4389 goto disabled;
4390 }
12938fad
CK
4391 }
4392
4393 return true;
3ba7b418
AG
4394
4395disabled:
aac89168 4396 dev_info(adev->dev, "GPU recovery disabled.\n");
3ba7b418 4397 return false;
12938fad
CK
4398}
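/*
 * Illustrative usage (a sketch of a job-timeout handler; the real call site
 * lives outside this file):
 *
 *   if (amdgpu_device_should_recover_gpu(ring->adev))
 *           r = amdgpu_device_gpu_recover(ring->adev, job);
 *   else
 *           drm_sched_suspend_timeout(&ring->sched);
 */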
4399
5c03e584
FX
4400int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
4401{
4402 u32 i;
4403 int ret = 0;
4404
4405 amdgpu_atombios_scratch_regs_engine_hung(adev, true);
4406
4407 dev_info(adev->dev, "GPU mode1 reset\n");
4408
4409 /* disable BM */
4410 pci_clear_master(adev->pdev);
4411
4412 amdgpu_device_cache_pci_state(adev->pdev);
4413
4414 if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
4415 dev_info(adev->dev, "GPU smu mode1 reset\n");
4416 ret = amdgpu_dpm_mode1_reset(adev);
4417 } else {
4418 dev_info(adev->dev, "GPU psp mode1 reset\n");
4419 ret = psp_gpu_reset(adev);
4420 }
4421
4422 if (ret)
4423 dev_err(adev->dev, "GPU mode1 reset failed\n");
4424
4425 amdgpu_device_load_pci_state(adev->pdev);
4426
4427 /* wait for asic to come out of reset */
4428 for (i = 0; i < adev->usec_timeout; i++) {
4429 u32 memsize = adev->nbio.funcs->get_memsize(adev);
4430
4431 if (memsize != 0xffffffff)
4432 break;
4433 udelay(1);
4434 }
4435
4436 amdgpu_atombios_scratch_regs_engine_hung(adev, false);
4437 return ret;
4438}
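/*
 * Why the memsize poll above works (a reading of the loop, not a documented
 * contract): while the ASIC is still held in mode1 reset, MMIO reads come
 * back as all-ones, so get_memsize() returns 0xffffffff; the first different
 * value indicates register access is alive again and the reset has completed.
 */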
5c6dd71e 4439
e3c1b071 4440int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
04442bf7 4441 struct amdgpu_reset_context *reset_context)
26bc5340
AG
4442{
4443 int i, r = 0;
04442bf7
LL
4444 struct amdgpu_job *job = NULL;
4445 bool need_full_reset =
4446 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4447
4448 if (reset_context->reset_req_dev == adev)
4449 job = reset_context->job;
71182665 4450
e3c1b071 4451 /* no need to dump if the device is not in a good state during the probe period */
4452 if (!adev->gmc.xgmi.pending_reset)
4453 amdgpu_debugfs_wait_dump(adev);
728e7e0c 4454
b602ca5f
TZ
4455 if (amdgpu_sriov_vf(adev)) {
4456 /* stop the data exchange thread */
4457 amdgpu_virt_fini_data_exchange(adev);
4458 }
4459
71182665 4460 /* block all schedulers and reset given job's ring */
0875dc9e
CZ
4461 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4462 struct amdgpu_ring *ring = adev->rings[i];
4463
51687759 4464 if (!ring || !ring->sched.thread)
0875dc9e 4465 continue;
5740682e 4466
2f9d4084
ML
4467 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4468 amdgpu_fence_driver_force_completion(ring);
0875dc9e 4469 }
d38ceaf9 4470
ff99849b 4471 if (job && job->vm)
222b5f04
AG
4472 drm_sched_increase_karma(&job->base);
4473
04442bf7 4474 r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
404b277b
LL
4475 /* If reset handler not implemented, continue; otherwise return */
4476 if (r == -ENOSYS)
4477 r = 0;
4478 else
04442bf7
LL
4479 return r;
4480
1d721ed6 4481 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
26bc5340
AG
4482 if (!amdgpu_sriov_vf(adev)) {
4483
4484 if (!need_full_reset)
4485 need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4486
4487 if (!need_full_reset) {
4488 amdgpu_device_ip_pre_soft_reset(adev);
4489 r = amdgpu_device_ip_soft_reset(adev);
4490 amdgpu_device_ip_post_soft_reset(adev);
4491 if (r || amdgpu_device_ip_check_soft_reset(adev)) {
aac89168 4492 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
26bc5340
AG
4493 need_full_reset = true;
4494 }
4495 }
4496
4497 if (need_full_reset)
4498 r = amdgpu_device_ip_suspend(adev);
04442bf7
LL
4499 if (need_full_reset)
4500 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4501 else
4502 clear_bit(AMDGPU_NEED_FULL_RESET,
4503 &reset_context->flags);
26bc5340
AG
4504 }
4505
4506 return r;
4507}
4508
04442bf7
LL
4509int amdgpu_do_asic_reset(struct list_head *device_list_handle,
4510 struct amdgpu_reset_context *reset_context)
26bc5340
AG
4511{
4512 struct amdgpu_device *tmp_adev = NULL;
04442bf7 4513 bool need_full_reset, skip_hw_reset, vram_lost = false;
26bc5340
AG
4514 int r = 0;
4515
04442bf7
LL
4516 /* Try reset handler method first */
4517 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
4518 reset_list);
4519 r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
404b277b
LL
4520 /* If reset handler not implemented, continue; otherwise return */
4521 if (r == -ENOSYS)
4522 r = 0;
4523 else
04442bf7
LL
4524 return r;
4525
4526 /* Reset handler not implemented, use the default method */
4527 need_full_reset =
4528 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4529 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
4530
26bc5340 4531 /*
655ce9cb 4532 * ASIC reset has to be done on all XGMI hive nodes ASAP
26bc5340
AG
4533 * to allow proper link negotiation in FW (within 1 sec)
4534 */
7ac71382 4535 if (!skip_hw_reset && need_full_reset) {
655ce9cb 4536 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
041a62bc 4537 /* For XGMI run all resets in parallel to speed up the process */
d4535e2c 4538 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
e3c1b071 4539 tmp_adev->gmc.xgmi.pending_reset = false;
c96cf282 4540 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
d4535e2c
AG
4541 r = -EALREADY;
4542 } else
4543 r = amdgpu_asic_reset(tmp_adev);
d4535e2c 4544
041a62bc 4545 if (r) {
aac89168 4546 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
4a580877 4547 r, adev_to_drm(tmp_adev)->unique);
041a62bc 4548 break;
ce316fa5
LM
4549 }
4550 }
4551
041a62bc
AG
4552 /* For XGMI wait for all resets to complete before proceed */
4553 if (!r) {
655ce9cb 4554 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
ce316fa5
LM
4555 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4556 flush_work(&tmp_adev->xgmi_reset_work);
4557 r = tmp_adev->asic_reset_res;
4558 if (r)
4559 break;
ce316fa5
LM
4560 }
4561 }
4562 }
ce316fa5 4563 }
26bc5340 4564
43c4d576 4565 if (!r && amdgpu_ras_intr_triggered()) {
655ce9cb 4566 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
8bc7b360
HZ
4567 if (tmp_adev->mmhub.ras_funcs &&
4568 tmp_adev->mmhub.ras_funcs->reset_ras_error_count)
4569 tmp_adev->mmhub.ras_funcs->reset_ras_error_count(tmp_adev);
43c4d576
JC
4570 }
4571
00eaa571 4572 amdgpu_ras_intr_cleared();
43c4d576 4573 }
00eaa571 4574
655ce9cb 4575 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
26bc5340
AG
4576 if (need_full_reset) {
4577 /* post card */
e3c1b071 4578 r = amdgpu_device_asic_init(tmp_adev);
4579 if (r) {
aac89168 4580 dev_warn(tmp_adev->dev, "asic atom init failed!");
e3c1b071 4581 } else {
26bc5340
AG
4582 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4583 r = amdgpu_device_ip_resume_phase1(tmp_adev);
4584 if (r)
4585 goto out;
4586
4587 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4588 if (vram_lost) {
77e7f829 4589 DRM_INFO("VRAM is lost due to GPU reset!\n");
e3526257 4590 amdgpu_inc_vram_lost(tmp_adev);
26bc5340
AG
4591 }
4592
6c28aed6 4593 r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
26bc5340
AG
4594 if (r)
4595 goto out;
4596
4597 r = amdgpu_device_fw_loading(tmp_adev);
4598 if (r)
4599 return r;
4600
4601 r = amdgpu_device_ip_resume_phase2(tmp_adev);
4602 if (r)
4603 goto out;
4604
4605 if (vram_lost)
4606 amdgpu_device_fill_reset_magic(tmp_adev);
4607
fdafb359
EQ
4608 /*
4609 * Add this ASIC as tracked since the reset already
4610 * completed successfully.
4611 */
4612 amdgpu_register_gpu_instance(tmp_adev);
4613
04442bf7
LL
4614 if (!reset_context->hive &&
4615 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
e3c1b071 4616 amdgpu_xgmi_add_device(tmp_adev);
4617
7c04ca50 4618 r = amdgpu_device_ip_late_init(tmp_adev);
4619 if (r)
4620 goto out;
4621
565d1941
EQ
4622 amdgpu_fbdev_set_suspend(tmp_adev, 0);
4623
e8fbaf03
GC
4624 /*
4625 * The GPU enters a bad state once the number of faulty pages
4626 * detected by ECC has reached the threshold, and ras
4627 * recovery is scheduled next. So add one check
4628 * here to break recovery if it indeed exceeds the
4629 * bad page threshold, and remind the user to
4630 * retire this GPU or set a bigger
4631 * bad_page_threshold value to fix this when
4632 * probing the driver again.
4633 */
11003c68 4634 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
e8fbaf03
GC
4635 /* must succeed. */
4636 amdgpu_ras_resume(tmp_adev);
4637 } else {
4638 r = -EINVAL;
4639 goto out;
4640 }
e79a04d5 4641
26bc5340 4642 /* Update PSP FW topology after reset */
04442bf7
LL
4643 if (reset_context->hive &&
4644 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4645 r = amdgpu_xgmi_update_topology(
4646 reset_context->hive, tmp_adev);
26bc5340
AG
4647 }
4648 }
4649
26bc5340
AG
4650out:
4651 if (!r) {
4652 amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4653 r = amdgpu_ib_ring_tests(tmp_adev);
4654 if (r) {
4655 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
26bc5340
AG
4656 need_full_reset = true;
4657 r = -EAGAIN;
4658 goto end;
4659 }
4660 }
4661
4662 if (!r)
4663 r = amdgpu_device_recover_vram(tmp_adev);
4664 else
4665 tmp_adev->asic_reset_res = r;
4666 }
4667
4668end:
04442bf7
LL
4669 if (need_full_reset)
4670 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4671 else
4672 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
26bc5340
AG
4673 return r;
4674}
4675
08ebb485
DL
4676static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
4677 struct amdgpu_hive_info *hive)
26bc5340 4678{
53b3f8f4
DL
4679 if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
4680 return false;
4681
08ebb485
DL
4682 if (hive) {
4683 down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
4684 } else {
4685 down_write(&adev->reset_sem);
4686 }
5740682e 4687
a3a09142
AD
4688 switch (amdgpu_asic_reset_method(adev)) {
4689 case AMD_RESET_METHOD_MODE1:
4690 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
4691 break;
4692 case AMD_RESET_METHOD_MODE2:
4693 adev->mp1_state = PP_MP1_STATE_RESET;
4694 break;
4695 default:
4696 adev->mp1_state = PP_MP1_STATE_NONE;
4697 break;
4698 }
1d721ed6
AG
4699
4700 return true;
26bc5340 4701}
d38ceaf9 4702
26bc5340
AG
4703static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
4704{
89041940 4705 amdgpu_vf_error_trans_all(adev);
a3a09142 4706 adev->mp1_state = PP_MP1_STATE_NONE;
53b3f8f4 4707 atomic_set(&adev->in_gpu_reset, 0);
6049db43 4708 up_write(&adev->reset_sem);
26bc5340
AG
4709}
4710
91fb309d
HC
4711/*
4712 * to lock up a list of amdgpu devices in a hive safely; if not a hive
4713 * with multiple nodes, this behaves like amdgpu_device_lock_adev.
4714 *
4715 * unlock won't require roll back.
4716 */
4717static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
4718{
4719 struct amdgpu_device *tmp_adev = NULL;
4720
4721 if (adev->gmc.xgmi.num_physical_nodes > 1) {
4722 if (!hive) {
4723 dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
4724 return -ENODEV;
4725 }
4726 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
4727 if (!amdgpu_device_lock_adev(tmp_adev, hive))
4728 goto roll_back;
4729 }
4730 } else if (!amdgpu_device_lock_adev(adev, hive))
4731 return -EAGAIN;
4732
4733 return 0;
4734roll_back:
4735 if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) {
4736 /*
4737 * If the lock iteration broke in the middle of a hive,
4738 * it may mean there is a race issue,
4739 * or that a hive device locked up independently.
4740 * We may or may not be in trouble, so try to roll back
4741 * the locks and give a warning.
4742 */
4743 dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. Rolling back to unlock");
4744 list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) {
4745 amdgpu_device_unlock_adev(tmp_adev);
4746 }
4747 }
4748 return -EAGAIN;
4749}
4750
3f12acc8
EQ
4751static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
4752{
4753 struct pci_dev *p = NULL;
4754
4755 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4756 adev->pdev->bus->number, 1);
4757 if (p) {
4758 pm_runtime_enable(&(p->dev));
4759 pm_runtime_resume(&(p->dev));
4760 }
4761}
4762
4763static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
4764{
4765 enum amd_reset_method reset_method;
4766 struct pci_dev *p = NULL;
4767 u64 expires;
4768
4769 /*
4770 * For now, only BACO and mode1 reset are confirmed
4771 * to suffer the audio issue if not properly suspended.
4772 */
4773 reset_method = amdgpu_asic_reset_method(adev);
4774 if ((reset_method != AMD_RESET_METHOD_BACO) &&
4775 (reset_method != AMD_RESET_METHOD_MODE1))
4776 return -EINVAL;
4777
4778 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4779 adev->pdev->bus->number, 1);
4780 if (!p)
4781 return -ENODEV;
4782
4783 expires = pm_runtime_autosuspend_expiration(&(p->dev));
4784 if (!expires)
4785 /*
4786 * If we cannot get the audio device autosuspend delay,
4787 * a fixed 4S interval will be used. Considering that 3S is
4788 * the audio controller's default autosuspend delay setting,
4789 * the 4S used here is guaranteed to cover it.
4790 */
54b7feb9 4791 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
3f12acc8
EQ
4792
4793 while (!pm_runtime_status_suspended(&(p->dev))) {
4794 if (!pm_runtime_suspend(&(p->dev)))
4795 break;
4796
4797 if (expires < ktime_get_mono_fast_ns()) {
4798 dev_warn(adev->dev, "failed to suspend display audio\n");
4799 /* TODO: abort the succeeding gpu reset? */
4800 return -ETIMEDOUT;
4801 }
4802 }
4803
4804 pm_runtime_disable(&(p->dev));
4805
4806 return 0;
4807}
4808
9d8d96be 4809static void amdgpu_device_recheck_guilty_jobs(
04442bf7
LL
4810 struct amdgpu_device *adev, struct list_head *device_list_handle,
4811 struct amdgpu_reset_context *reset_context)
e6c6338f
JZ
4812{
4813 int i, r = 0;
4814
4815 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4816 struct amdgpu_ring *ring = adev->rings[i];
4817 int ret = 0;
4818 struct drm_sched_job *s_job;
4819
4820 if (!ring || !ring->sched.thread)
4821 continue;
4822
4823 s_job = list_first_entry_or_null(&ring->sched.pending_list,
4824 struct drm_sched_job, list);
4825 if (s_job == NULL)
4826 continue;
4827
4828 /* clear the job's guilty status and rely on the following step to decide the real one */
4829 drm_sched_reset_karma(s_job);
4830 drm_sched_resubmit_jobs_ext(&ring->sched, 1);
4831
4832 ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout);
4833 if (ret == 0) { /* timeout */
4834 DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n",
4835 ring->sched.name, s_job->id);
4836
4837 /* set guilty */
4838 drm_sched_increase_karma(s_job);
4839retry:
4840 /* do hw reset */
4841 if (amdgpu_sriov_vf(adev)) {
4842 amdgpu_virt_fini_data_exchange(adev);
4843 r = amdgpu_device_reset_sriov(adev, false);
4844 if (r)
4845 adev->asic_reset_res = r;
4846 } else {
04442bf7
LL
4847 clear_bit(AMDGPU_SKIP_HW_RESET,
4848 &reset_context->flags);
4849 r = amdgpu_do_asic_reset(device_list_handle,
4850 reset_context);
e6c6338f
JZ
4851 if (r && r == -EAGAIN)
4852 goto retry;
4853 }
4854
4855 /*
4856 * add reset counter so that the following
4857 * resubmitted job could flush vmid
4858 */
4859 atomic_inc(&adev->gpu_reset_counter);
4860 continue;
4861 }
4862
4863 /* got the hw fence, signal finished fence */
4864 atomic_dec(ring->sched.score);
4865 dma_fence_get(&s_job->s_fence->finished);
4866 dma_fence_signal(&s_job->s_fence->finished);
4867 dma_fence_put(&s_job->s_fence->finished);
4868
4869 /* remove node from list and free the job */
4870 spin_lock(&ring->sched.job_list_lock);
4871 list_del_init(&s_job->list);
4872 spin_unlock(&ring->sched.job_list_lock);
4873 ring->sched.ops->free_job(s_job);
4874 }
4875}
4876
26bc5340
AG
4877/**
4878 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
4879 *
982a820b 4880 * @adev: amdgpu_device pointer
26bc5340
AG
4881 * @job: which job triggered the hang
4882 *
4883 * Attempt to reset the GPU if it has hung (all asics).
4884 * Attempt to do soft-reset or full-reset and reinitialize the ASIC.
4885 * Returns 0 for success or an error on failure.
4886 */
4887
4888int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
4889 struct amdgpu_job *job)
4890{
1d721ed6 4891 struct list_head device_list, *device_list_handle = NULL;
7dd8c205 4892 bool job_signaled = false;
26bc5340 4893 struct amdgpu_hive_info *hive = NULL;
26bc5340 4894 struct amdgpu_device *tmp_adev = NULL;
1d721ed6 4895 int i, r = 0;
bb5c7235 4896 bool need_emergency_restart = false;
3f12acc8 4897 bool audio_suspended = false;
e6c6338f 4898 int tmp_vram_lost_counter;
04442bf7
LL
4899 struct amdgpu_reset_context reset_context;
4900
4901 memset(&reset_context, 0, sizeof(reset_context));
26bc5340 4902
6e3cd2a9 4903 /*
bb5c7235
WS
4904 * Special case: RAS triggered and full reset isn't supported
4905 */
4906 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
4907
d5ea093e
AG
4908 /*
4909 * Flush RAM to disk so that after reboot
4910 * the user can read log and see why the system rebooted.
4911 */
bb5c7235 4912 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
d5ea093e
AG
4913 DRM_WARN("Emergency reboot.");
4914
4915 ksys_sync_helper();
4916 emergency_restart();
4917 }
4918
b823821f 4919 dev_info(adev->dev, "GPU %s begin!\n",
bb5c7235 4920 need_emergency_restart ? "jobs stop":"reset");
26bc5340
AG
4921
4922 /*
1d721ed6
AG
4923 * Here we trylock to avoid a chain of resets executing, triggered
4924 * either by jobs on different adevs in an XGMI hive or by jobs on
4925 * different schedulers for the same device while this TO handler is running.
4926 * We always reset all schedulers for device and all devices for XGMI
4927 * hive so that should take care of them too.
26bc5340 4928 */
d95e8e97 4929 hive = amdgpu_get_xgmi_hive(adev);
53b3f8f4
DL
4930 if (hive) {
4931 if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
4932 DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
4933 job ? job->base.id : -1, hive->hive_id);
d95e8e97 4934 amdgpu_put_xgmi_hive(hive);
ff99849b 4935 if (job && job->vm)
91fb309d 4936 drm_sched_increase_karma(&job->base);
53b3f8f4
DL
4937 return 0;
4938 }
4939 mutex_lock(&hive->hive_lock);
1d721ed6 4940 }
26bc5340 4941
04442bf7
LL
4942 reset_context.method = AMD_RESET_METHOD_NONE;
4943 reset_context.reset_req_dev = adev;
4944 reset_context.job = job;
4945 reset_context.hive = hive;
4946 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
4947
91fb309d
HC
4948 /*
4949 * lock the device before we try to operate on the linked list;
4950 * if we didn't get the device lock, don't touch the linked list since
4951 * others may be iterating it.
4952 */
4953 r = amdgpu_device_lock_hive_adev(adev, hive);
4954 if (r) {
4955 dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
4956 job ? job->base.id : -1);
4957
4958 /* even though we skipped this reset, we still need to set the job as guilty */
ff99849b 4959 if (job && job->vm)
91fb309d
HC
4960 drm_sched_increase_karma(&job->base);
4961 goto skip_recovery;
4962 }
4963
9e94d22c
EQ
4964 /*
4965 * Build list of devices to reset.
4966 * In case we are in XGMI hive mode, resort the device list
4967 * to put adev in the 1st position.
4968 */
4969 INIT_LIST_HEAD(&device_list);
4970 if (adev->gmc.xgmi.num_physical_nodes > 1) {
655ce9cb 4971 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
4972 list_add_tail(&tmp_adev->reset_list, &device_list);
4973 if (!list_is_first(&adev->reset_list, &device_list))
4974 list_rotate_to_front(&adev->reset_list, &device_list);
4975 device_list_handle = &device_list;
26bc5340 4976 } else {
655ce9cb 4977 list_add_tail(&adev->reset_list, &device_list);
26bc5340
AG
4978 device_list_handle = &device_list;
4979 }
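	/*
	 * Worked example of the reordering above (hypothetical hive order):
	 * if hive->device_list yields [B, A, C] and adev == A, the reset list
	 * is built as [B, A, C] and list_rotate_to_front() turns it into
	 * [A, C, B]; cyclic order is preserved and A simply becomes first, so
	 * the device that triggered the recovery is handled first.
	 */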
4980
1d721ed6 4981 /* block all schedulers and reset given job's ring */
655ce9cb 4982 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
3f12acc8
EQ
4983 /*
4984 * Try to put the audio codec into suspend state
4985 * before the gpu reset starts.
4986 *
4987 * The power domain of the graphics device is
4988 * shared with the AZ power domain. Without this,
4989 * we may change the audio hardware from behind
4990 * the audio driver's back. That will trigger
4991 * some audio codec errors.
4992 */
4993 if (!amdgpu_device_suspend_display_audio(tmp_adev))
4994 audio_suspended = true;
4995
9e94d22c
EQ
4996 amdgpu_ras_set_error_query_ready(tmp_adev, false);
4997
52fb44cf
EQ
4998 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
4999
9e94d22c
EQ
5000 if (!amdgpu_sriov_vf(tmp_adev))
5001 amdgpu_amdkfd_pre_reset(tmp_adev);
5002
12ffa55d
AG
5003 /*
5004 * Mark these ASICs to be reset as untracked first,
5005 * and add them back after the reset completes.
5006 */
5007 amdgpu_unregister_gpu_instance(tmp_adev);
5008
a2f63ee8 5009 amdgpu_fbdev_set_suspend(tmp_adev, 1);
565d1941 5010
f1c1314b 5011 /* disable ras on ALL IPs */
bb5c7235 5012 if (!need_emergency_restart &&
b823821f 5013 amdgpu_device_ip_need_full_reset(tmp_adev))
f1c1314b 5014 amdgpu_ras_suspend(tmp_adev);
5015
1d721ed6
AG
5016 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5017 struct amdgpu_ring *ring = tmp_adev->rings[i];
5018
5019 if (!ring || !ring->sched.thread)
5020 continue;
5021
0b2d2c2e 5022 drm_sched_stop(&ring->sched, job ? &job->base : NULL);
7c6e68c7 5023
bb5c7235 5024 if (need_emergency_restart)
7c6e68c7 5025 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
1d721ed6 5026 }
8f8c80f4 5027 atomic_inc(&tmp_adev->gpu_reset_counter);
1d721ed6
AG
5028 }
5029
bb5c7235 5030 if (need_emergency_restart)
7c6e68c7
AG
5031 goto skip_sched_resume;
5032
1d721ed6
AG
5033 /*
5034 * Must check guilty signal here since after this point all old
5035 * HW fences are force signaled.
5036 *
5037 * job->base holds a reference to parent fence
5038 */
5039 if (job && job->base.s_fence->parent &&
7dd8c205 5040 dma_fence_is_signaled(job->base.s_fence->parent)) {
1d721ed6 5041 job_signaled = true;
1d721ed6
AG
5042 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
5043 goto skip_hw_reset;
5044 }
5045
26bc5340 5046retry: /* Rest of adevs pre asic reset from XGMI hive. */
655ce9cb 5047 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
04442bf7 5048 r = amdgpu_device_pre_asic_reset(tmp_adev, &reset_context);
26bc5340
AG
5049 /* TODO: should we stop? */
5050 if (r) {
aac89168 5051 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
4a580877 5052 r, adev_to_drm(tmp_adev)->unique);
26bc5340
AG
5053 tmp_adev->asic_reset_res = r;
5054 }
5055 }
5056
e6c6338f 5057 tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
26bc5340
AG
5058 /* Actual ASIC resets if needed. */
5059 /* TODO Implement XGMI hive reset logic for SRIOV */
5060 if (amdgpu_sriov_vf(adev)) {
5061 r = amdgpu_device_reset_sriov(adev, job ? false : true);
5062 if (r)
5063 adev->asic_reset_res = r;
5064 } else {
04442bf7 5065 r = amdgpu_do_asic_reset(device_list_handle, &reset_context);
26bc5340
AG
5066 if (r && r == -EAGAIN)
5067 goto retry;
5068 }
5069
1d721ed6
AG
5070skip_hw_reset:
5071
26bc5340 5072 /* Post ASIC reset for all devs .*/
655ce9cb 5073 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
7c6e68c7 5074
e6c6338f
JZ
5075 /*
5076 * Sometimes a later bad compute job can block a good gfx job, since
5077 * the gfx and compute rings share internal GC hardware. We add an
5078 * additional guilty-job recheck step to find the real culprit: it
5079 * synchronously resubmits and waits for the first job to signal; if
5080 * that job times out, we identify it as the real guilty job.
5081 */
5082 if (amdgpu_gpu_recovery == 2 &&
5083 !(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter)))
04442bf7
LL
5084 amdgpu_device_recheck_guilty_jobs(
5085 tmp_adev, device_list_handle, &reset_context);
e6c6338f 5086
1d721ed6
AG
5087 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5088 struct amdgpu_ring *ring = tmp_adev->rings[i];
5089
5090 if (!ring || !ring->sched.thread)
5091 continue;
5092
5093 /* No point in resubmitting jobs if we didn't do a HW reset */
5094 if (!tmp_adev->asic_reset_res && !job_signaled)
5095 drm_sched_resubmit_jobs(&ring->sched);
5096
5097 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
5098 }
5099
5100 if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
4a580877 5101 drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
1d721ed6
AG
5102 }
5103
5104 tmp_adev->asic_reset_res = 0;
26bc5340
AG
5105
5106 if (r) {
5107 /* bad news, how do we tell userspace? */
12ffa55d 5108 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
26bc5340
AG
5109 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
5110 } else {
12ffa55d 5111 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
3fa8f89d
S
5112 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
5113 DRM_WARN("smart shift update failed\n");
26bc5340 5114 }
7c6e68c7 5115 }
26bc5340 5116
7c6e68c7 5117skip_sched_resume:
655ce9cb 5118 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
8e2712e7 5119 /* unlock kfd: SRIOV would do it separately */
bb5c7235 5120 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
7c6e68c7 5121 amdgpu_amdkfd_post_reset(tmp_adev);
8e2712e7 5122
5123 /* kfd_post_reset will do nothing if the kfd device is not initialized,
5124 * so we need to bring up kfd here if it was not initialized before.
5125 */
5126 if (!tmp_adev->kfd.init_complete)
5127 amdgpu_amdkfd_device_init(tmp_adev);
5128
3f12acc8
EQ
5129 if (audio_suspended)
5130 amdgpu_device_resume_display_audio(tmp_adev);
26bc5340
AG
5131 amdgpu_device_unlock_adev(tmp_adev);
5132 }
5133
cbfd17f7 5134skip_recovery:
9e94d22c 5135 if (hive) {
53b3f8f4 5136 atomic_set(&hive->in_reset, 0);
9e94d22c 5137 mutex_unlock(&hive->hive_lock);
d95e8e97 5138 amdgpu_put_xgmi_hive(hive);
9e94d22c 5139 }
26bc5340 5140
91fb309d 5141 if (r && r != -EAGAIN)
26bc5340 5142 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
d38ceaf9
AD
5143 return r;
5144}
5145
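/*
 * Editorial example: a minimal sketch of how the recovery entry point
 * above is typically reached from a scheduler job-timeout (TDR)
 * handler. This mirrors the pattern used by amdgpu's job backend, but
 * the function below is illustrative, not the exact in-tree handler.
 */
static enum drm_gpu_sched_stat example_job_timedout(struct drm_sched_job *s_job)
{
	struct amdgpu_job *job = to_amdgpu_job(s_job);
	struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);

	if (amdgpu_device_should_recover_gpu(ring->adev)) {
		/* Hand the guilty job to the full recovery sequence. */
		amdgpu_device_gpu_recover(ring->adev, job);
		return DRM_GPU_SCHED_STAT_NOMINAL;
	}

	return DRM_GPU_SCHED_STAT_ENODEV;
}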
e3ecdffa
AD
5146/**
5147 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
5148 *
5149 * @adev: amdgpu_device pointer
5150 *
5151 * Fetches the PCIe capabilities (gen speed and lanes) of the slot
5152 * the device is in and stores them in the driver. Handles APUs and
5153 * virtualized environments where PCIe config space may not be available.
5154 */
5494d864 5155static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
d0dd7f0c 5156{
5d9a6330 5157 struct pci_dev *pdev;
c5313457
HK
5158 enum pci_bus_speed speed_cap, platform_speed_cap;
5159 enum pcie_link_width platform_link_width;
d0dd7f0c 5160
cd474ba0
AD
5161 if (amdgpu_pcie_gen_cap)
5162 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
d0dd7f0c 5163
cd474ba0
AD
5164 if (amdgpu_pcie_lane_cap)
5165 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
d0dd7f0c 5166
cd474ba0
AD
5167 /* covers APUs as well */
5168 if (pci_is_root_bus(adev->pdev->bus)) {
5169 if (adev->pm.pcie_gen_mask == 0)
5170 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
5171 if (adev->pm.pcie_mlw_mask == 0)
5172 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
d0dd7f0c 5173 return;
cd474ba0 5174 }
d0dd7f0c 5175
c5313457
HK
5176 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
5177 return;
5178
dbaa922b
AD
5179 pcie_bandwidth_available(adev->pdev, NULL,
5180 &platform_speed_cap, &platform_link_width);
c5313457 5181
cd474ba0 5182 if (adev->pm.pcie_gen_mask == 0) {
5d9a6330
AD
5183 /* asic caps */
5184 pdev = adev->pdev;
5185 speed_cap = pcie_get_speed_cap(pdev);
5186 if (speed_cap == PCI_SPEED_UNKNOWN) {
5187 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
cd474ba0
AD
5188 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5189 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
cd474ba0 5190 } else {
2b3a1f51
FX
5191 if (speed_cap == PCIE_SPEED_32_0GT)
5192 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5193 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5194 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5195 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5196 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
5197 else if (speed_cap == PCIE_SPEED_16_0GT)
5d9a6330
AD
5198 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5199 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5200 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5201 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
5202 else if (speed_cap == PCIE_SPEED_8_0GT)
5203 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5204 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5205 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5206 else if (speed_cap == PCIE_SPEED_5_0GT)
5207 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5208 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
5209 else
5210 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
5211 }
5212 /* platform caps */
c5313457 5213 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
5d9a6330
AD
5214 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5215 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5216 } else {
2b3a1f51
FX
5217 if (platform_speed_cap == PCIE_SPEED_32_0GT)
5218 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5219 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5220 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5221 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5222 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
5223 else if (platform_speed_cap == PCIE_SPEED_16_0GT)
5d9a6330
AD
5224 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5225 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5226 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5227 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
c5313457 5228 else if (platform_speed_cap == PCIE_SPEED_8_0GT)
5d9a6330
AD
5229 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5230 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5231 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
c5313457 5232 else if (platform_speed_cap == PCIE_SPEED_5_0GT)
5d9a6330
AD
5233 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5234 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5235 else
5236 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
5237
cd474ba0
AD
5238 }
5239 }
5240 if (adev->pm.pcie_mlw_mask == 0) {
c5313457 5241 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
5d9a6330
AD
5242 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
5243 } else {
c5313457 5244 switch (platform_link_width) {
5d9a6330 5245 case PCIE_LNK_X32:
cd474ba0
AD
5246 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
5247 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5248 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5249 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5250 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5251 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5252 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5253 break;
5d9a6330 5254 case PCIE_LNK_X16:
cd474ba0
AD
5255 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5256 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5257 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5258 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5259 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5260 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5261 break;
5d9a6330 5262 case PCIE_LNK_X12:
cd474ba0
AD
5263 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5264 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5265 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5266 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5267 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5268 break;
5d9a6330 5269 case PCIE_LNK_X8:
cd474ba0
AD
5270 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5271 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5272 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5273 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5274 break;
5d9a6330 5275 case PCIE_LNK_X4:
cd474ba0
AD
5276 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5277 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5278 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5279 break;
5d9a6330 5280 case PCIE_LNK_X2:
cd474ba0
AD
5281 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5282 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5283 break;
5d9a6330 5284 case PCIE_LNK_X1:
cd474ba0
AD
5285 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
5286 break;
5287 default:
5288 break;
5289 }
d0dd7f0c
AD
5290 }
5291 }
5292}
d38ceaf9 5293
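/*
 * Editorial example: a hedged sketch of consuming the masks computed
 * above. The helper name is hypothetical; it reports the highest PCIe
 * gen advertised by both the ASIC and the platform halves of the mask.
 */
static int example_max_pcie_gen(struct amdgpu_device *adev)
{
	u32 mask = adev->pm.pcie_gen_mask;

	if ((mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4) &&
	    (mask & CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4))
		return 4;
	if ((mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3) &&
	    (mask & CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3))
		return 3;
	if ((mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2) &&
	    (mask & CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2))
		return 2;
	return 1;
}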
361dbd01
AD
5294int amdgpu_device_baco_enter(struct drm_device *dev)
5295{
1348969a 5296 struct amdgpu_device *adev = drm_to_adev(dev);
7a22677b 5297 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
361dbd01 5298
4a580877 5299 if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
361dbd01
AD
5300 return -ENOTSUPP;
5301
8ab0d6f0 5302 if (ras && adev->ras_enabled &&
acdae216 5303 adev->nbio.funcs->enable_doorbell_interrupt)
7a22677b
LM
5304 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
5305
9530273e 5306 return amdgpu_dpm_baco_enter(adev);
361dbd01
AD
5307}
5308
5309int amdgpu_device_baco_exit(struct drm_device *dev)
5310{
1348969a 5311 struct amdgpu_device *adev = drm_to_adev(dev);
7a22677b 5312 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
9530273e 5313 int ret = 0;
361dbd01 5314
4a580877 5315 if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
361dbd01
AD
5316 return -ENOTSUPP;
5317
9530273e
EQ
5318 ret = amdgpu_dpm_baco_exit(adev);
5319 if (ret)
5320 return ret;
7a22677b 5321
8ab0d6f0 5322 if (ras && adev->ras_enabled &&
acdae216 5323 adev->nbio.funcs->enable_doorbell_interrupt)
7a22677b
LM
5324 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
5325
5326 return 0;
361dbd01 5327}
c9a6b82f 5328
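/*
 * Editorial example: from a caller's perspective, a BACO-based reset
 * cycle is just the enter/exit pair above back to back. A rough
 * sketch, assuming the caller already holds any reset locking it
 * needs; the function name is hypothetical.
 */
static int example_baco_reset(struct drm_device *dev)
{
	int r;

	r = amdgpu_device_baco_enter(dev);
	if (r)
		return r;

	/* The ASIC sits powered down in the BACO state here. */

	return amdgpu_device_baco_exit(dev);
}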
acd89fca
AG
5329static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev)
5330{
5331 int i;
5332
5333 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5334 struct amdgpu_ring *ring = adev->rings[i];
5335
5336 if (!ring || !ring->sched.thread)
5337 continue;
5338
5339 cancel_delayed_work_sync(&ring->sched.work_tdr);
5340 }
5341}
5342
c9a6b82f
AG
5343/**
5344 * amdgpu_pci_error_detected - Called when a PCI error is detected.
5345 * @pdev: PCI device struct
5346 * @state: PCI channel state
5347 *
5348 * Description: Called when a PCI error is detected.
5349 *
5350 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
5351 */
5352pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
5353{
5354 struct drm_device *dev = pci_get_drvdata(pdev);
5355 struct amdgpu_device *adev = drm_to_adev(dev);
acd89fca 5356 int i;
c9a6b82f
AG
5357
5358 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
5359
6894305c
AG
5360 if (adev->gmc.xgmi.num_physical_nodes > 1) {
5361 DRM_WARN("No support for XGMI hive yet...");
5362 return PCI_ERS_RESULT_DISCONNECT;
5363 }
5364
c9a6b82f
AG
5365 switch (state) {
5366 case pci_channel_io_normal:
5367 return PCI_ERS_RESULT_CAN_RECOVER;
acd89fca 5368 /* Fatal error, prepare for slot reset */
8a11d283
TZ
5369 case pci_channel_io_frozen:
5370 /*
acd89fca
AG
5371 * Cancel and wait for all TDRs in progress if we fail to
5372 * set adev->in_gpu_reset in amdgpu_device_lock_adev
5373 *
5374 * Locking adev->reset_sem will prevent any external access
5375 * to the GPU during PCI error recovery.
5376 */
5377 while (!amdgpu_device_lock_adev(adev, NULL))
5378 amdgpu_cancel_all_tdr(adev);
5379
5380 /*
5381 * Block any work scheduling as we do for regular GPU reset
5382 * for the duration of the recovery
5383 */
5384 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5385 struct amdgpu_ring *ring = adev->rings[i];
5386
5387 if (!ring || !ring->sched.thread)
5388 continue;
5389
5390 drm_sched_stop(&ring->sched, NULL);
5391 }
8f8c80f4 5392 atomic_inc(&adev->gpu_reset_counter);
c9a6b82f
AG
5393 return PCI_ERS_RESULT_NEED_RESET;
5394 case pci_channel_io_perm_failure:
5395 /* Permanent error, prepare for device removal */
5396 return PCI_ERS_RESULT_DISCONNECT;
5397 }
5398
5399 return PCI_ERS_RESULT_NEED_RESET;
5400}
5401
5402/**
5403 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
5404 * @pdev: pointer to PCI device
5405 */
5406pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
5407{
5408
5409 DRM_INFO("PCI error: mmio enabled callback!!\n");
5410
5411 /* TODO - dump whatever for debugging purposes */
5412
5413 /* This is called only if amdgpu_pci_error_detected returns
5414 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
5415 * works, no need to reset slot.
5416 */
5417
5418 return PCI_ERS_RESULT_RECOVERED;
5419}
5420
5421/**
5422 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
5423 * @pdev: PCI device struct
5424 *
5425 * Description: This routine is called by the pci error recovery
5426 * code after the PCI slot has been reset, just before we
5427 * should resume normal operations.
5428 */
5429pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
5430{
5431 struct drm_device *dev = pci_get_drvdata(pdev);
5432 struct amdgpu_device *adev = drm_to_adev(dev);
362c7b91 5433 int r, i;
04442bf7 5434 struct amdgpu_reset_context reset_context;
362c7b91 5435 u32 memsize;
7ac71382 5436 struct list_head device_list;
c9a6b82f
AG
5437
5438 DRM_INFO("PCI error: slot reset callback!!\n");
5439
04442bf7
LL
5440 memset(&reset_context, 0, sizeof(reset_context));
5441
7ac71382 5442 INIT_LIST_HEAD(&device_list);
655ce9cb 5443 list_add_tail(&adev->reset_list, &device_list);
7ac71382 5444
362c7b91
AG
5445 /* wait for asic to come out of reset */
5446 msleep(500);
5447
7ac71382 5448 /* Restore PCI confspace */
c1dd4aa6 5449 amdgpu_device_load_pci_state(pdev);
c9a6b82f 5450
362c7b91
AG
5451 /* confirm ASIC came out of reset */
5452 for (i = 0; i < adev->usec_timeout; i++) {
5453 memsize = amdgpu_asic_get_config_memsize(adev);
5454
5455 if (memsize != 0xffffffff)
5456 break;
5457 udelay(1);
5458 }
5459 if (memsize == 0xffffffff) {
5460 r = -ETIME;
5461 goto out;
5462 }
5463
04442bf7
LL
5464 reset_context.method = AMD_RESET_METHOD_NONE;
5465 reset_context.reset_req_dev = adev;
5466 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
5467 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
5468
7afefb81 5469 adev->no_hw_access = true;
04442bf7 5470 r = amdgpu_device_pre_asic_reset(adev, &reset_context);
7afefb81 5471 adev->no_hw_access = false;
c9a6b82f
AG
5472 if (r)
5473 goto out;
5474
04442bf7 5475 r = amdgpu_do_asic_reset(&device_list, &reset_context);
c9a6b82f
AG
5476
5477out:
c9a6b82f 5478 if (!r) {
c1dd4aa6
AG
5479 if (amdgpu_device_cache_pci_state(adev->pdev))
5480 pci_restore_state(adev->pdev);
5481
c9a6b82f
AG
5482 DRM_INFO("PCIe error recovery succeeded\n");
5483 } else {
5484 DRM_ERROR("PCIe error recovery failed, err:%d", r);
5485 amdgpu_device_unlock_adev(adev);
5486 }
5487
5488 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5489}
5490
5491/**
5492 * amdgpu_pci_resume() - resume normal ops after PCI reset
5493 * @pdev: pointer to PCI device
5494 *
5495 * Called when the error recovery driver tells us that it's
505199a3 5496 * OK to resume normal operation.
c9a6b82f
AG
5497 */
5498void amdgpu_pci_resume(struct pci_dev *pdev)
5499{
5500 struct drm_device *dev = pci_get_drvdata(pdev);
5501 struct amdgpu_device *adev = drm_to_adev(dev);
acd89fca 5502 int i;
c9a6b82f 5503
c9a6b82f
AG
5504
5505 DRM_INFO("PCI error: resume callback!!\n");
acd89fca
AG
5506
5507 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5508 struct amdgpu_ring *ring = adev->rings[i];
5509
5510 if (!ring || !ring->sched.thread)
5511 continue;
5512
5513
5514 drm_sched_resubmit_jobs(&ring->sched);
5515 drm_sched_start(&ring->sched, true);
5516 }
5517
5518 amdgpu_device_unlock_adev(adev);
c9a6b82f 5519}
c1dd4aa6
AG
5520
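/*
 * Editorial example: how the four callbacks above plug into the PCI
 * core. This follows the standard struct pci_error_handlers pattern;
 * amdgpu installs a table like this from its struct pci_driver.
 */
static const struct pci_error_handlers example_pci_err_handlers = {
	.error_detected	= amdgpu_pci_error_detected,
	.mmio_enabled	= amdgpu_pci_mmio_enabled,
	.slot_reset	= amdgpu_pci_slot_reset,
	.resume		= amdgpu_pci_resume,
};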
5521bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5522{
5523 struct drm_device *dev = pci_get_drvdata(pdev);
5524 struct amdgpu_device *adev = drm_to_adev(dev);
5525 int r;
5526
5527 r = pci_save_state(pdev);
5528 if (!r) {
5529 kfree(adev->pci_state);
5530
5531 adev->pci_state = pci_store_saved_state(pdev);
5532
5533 if (!adev->pci_state) {
5534 DRM_ERROR("Failed to store PCI saved state");
5535 return false;
5536 }
5537 } else {
5538 DRM_WARN("Failed to save PCI state, err:%d\n", r);
5539 return false;
5540 }
5541
5542 return true;
5543}
5544
5545bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
5546{
5547 struct drm_device *dev = pci_get_drvdata(pdev);
5548 struct amdgpu_device *adev = drm_to_adev(dev);
5549 int r;
5550
5551 if (!adev->pci_state)
5552 return false;
5553
5554 r = pci_load_saved_state(pdev, adev->pci_state);
5555
5556 if (!r) {
5557 pci_restore_state(pdev);
5558 } else {
5559 DRM_WARN("Failed to load PCI state, err:%d\n", r);
5560 return false;
5561 }
5562
5563 return true;
5564}
5565
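/*
 * Editorial example: the cache/load pair above brackets events that
 * clobber PCI config space, such as a full ASIC reset. A minimal
 * sketch; the helper name is hypothetical.
 */
static void example_pci_state_roundtrip(struct amdgpu_device *adev)
{
	/* Snapshot config space while the device is healthy. */
	if (!amdgpu_device_cache_pci_state(adev->pdev))
		return;

	/* ... a reset that clobbers config space happens here ... */

	/* Replay the snapshot so the device is addressable again. */
	amdgpu_device_load_pci_state(adev->pdev);
}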
810085dd
EH
5566void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
5567 struct amdgpu_ring *ring)
5568{
5569#ifdef CONFIG_X86_64
5570 if (adev->flags & AMD_IS_APU)
5571 return;
5572#endif
5573 if (adev->gmc.xgmi.connected_to_cpu)
5574 return;
5575
5576 if (ring && ring->funcs->emit_hdp_flush)
5577 amdgpu_ring_emit_hdp_flush(ring);
5578 else
5579 amdgpu_asic_flush_hdp(adev, ring);
5580}
c1dd4aa6 5581
810085dd
EH
5582void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
5583 struct amdgpu_ring *ring)
5584{
5585#ifdef CONFIG_X86_64
5586 if (adev->flags & AMD_IS_APU)
5587 return;
5588#endif
5589 if (adev->gmc.xgmi.connected_to_cpu)
5590 return;
c1dd4aa6 5591
810085dd
EH
5592 amdgpu_asic_invalidate_hdp(adev, ring);
5593}