/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>

#include <drm/drm_atomic_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/pci.h>
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"NAVI10",
	"NAVI14",
	"NAVI12",
	"SIENNA_CICHLID",
	"NAVY_FLOUNDER",
	"LAST",
};

120/**
121 * DOC: pcie_replay_count
122 *
123 * The amdgpu driver provides a sysfs API for reporting the total number
124 * of PCIe replays (NAKs)
125 * The file pcie_replay_count is used for this and returns the total
126 * number of replays as a sum of the NAKs generated and NAKs received
127 */
128
129static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
130 struct device_attribute *attr, char *buf)
131{
132 struct drm_device *ddev = dev_get_drvdata(dev);
1348969a 133 struct amdgpu_device *adev = drm_to_adev(ddev);
134 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
135
136 return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
137}
138
139static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
140 amdgpu_device_get_pcie_replay_count, NULL);
141
142static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
143
/**
 * DOC: product_name
 *
 * The amdgpu driver provides a sysfs API for reporting the product name
 * for the device.
 * The file product_name is used for this and returns the product name
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */
153
154static ssize_t amdgpu_device_get_product_name(struct device *dev,
155 struct device_attribute *attr, char *buf)
156{
157 struct drm_device *ddev = dev_get_drvdata(dev);
1348969a 158 struct amdgpu_device *adev = drm_to_adev(ddev);
159
160 return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name);
161}
162
163static DEVICE_ATTR(product_name, S_IRUGO,
164 amdgpu_device_get_product_name, NULL);
165
/**
 * DOC: product_number
 *
 * The amdgpu driver provides a sysfs API for reporting the part number
 * for the device.
 * The file product_number is used for this and returns the part number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */
175
176static ssize_t amdgpu_device_get_product_number(struct device *dev,
177 struct device_attribute *attr, char *buf)
178{
179 struct drm_device *ddev = dev_get_drvdata(dev);
1348969a 180 struct amdgpu_device *adev = drm_to_adev(ddev);
181
182 return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number);
183}
184
185static DEVICE_ATTR(product_number, S_IRUGO,
186 amdgpu_device_get_product_number, NULL);
187
188/**
189 * DOC: serial_number
190 *
191 * The amdgpu driver provides a sysfs API for reporting the serial number
192 * for the device
193 * The file serial_number is used for this and returns the serial number
194 * as returned from the FRU.
195 * NOTE: This is only available for certain server cards
196 */
197
198static ssize_t amdgpu_device_get_serial_number(struct device *dev,
199 struct device_attribute *attr, char *buf)
200{
201 struct drm_device *ddev = dev_get_drvdata(dev);
1348969a 202 struct amdgpu_device *adev = drm_to_adev(ddev);
203
204 return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial);
205}
206
207static DEVICE_ATTR(serial_number, S_IRUGO,
208 amdgpu_device_get_serial_number, NULL);
209
e3ecdffa 210/**
31af062a 211 * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control
212 *
213 * @dev: drm_device pointer
214 *
215 * Returns true if the device is a dGPU with HG/PX power control,
216 * otherwise return false.
217 */
31af062a 218bool amdgpu_device_supports_boco(struct drm_device *dev)
d38ceaf9 219{
1348969a 220 struct amdgpu_device *adev = drm_to_adev(dev);
d38ceaf9 221
2f7d10b3 222 if (adev->flags & AMD_IS_PX)
223 return true;
224 return false;
225}
226
/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise return false.
 */
235bool amdgpu_device_supports_baco(struct drm_device *dev)
236{
1348969a 237 struct amdgpu_device *adev = drm_to_adev(dev);
238
239 return amdgpu_asic_supports_baco(adev);
240}
241
/**
 * VRAM access helper functions.
 *
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size in bytes, must not exceed the size of @buf
 * @write: true - write to vram, otherwise - read from vram
 */
253void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
254 uint32_t *buf, size_t size, bool write)
255{
e35e2b11 256 unsigned long flags;
ce05ac56
CK
257 uint32_t hi = ~0;
258 uint64_t last;
259
9d11eb0d
CK
260
261#ifdef CONFIG_64BIT
262 last = min(pos + size, adev->gmc.visible_vram_size);
263 if (last > pos) {
264 void __iomem *addr = adev->mman.aper_base_kaddr + pos;
265 size_t count = last - pos;
266
267 if (write) {
268 memcpy_toio(addr, buf, count);
269 mb();
270 amdgpu_asic_flush_hdp(adev, NULL);
271 } else {
272 amdgpu_asic_invalidate_hdp(adev, NULL);
273 mb();
274 memcpy_fromio(buf, addr, count);
275 }
276
277 if (count == size)
278 return;
279
280 pos += count;
281 buf += count / 4;
282 size -= count;
283 }
284#endif
285
ce05ac56
CK
286 spin_lock_irqsave(&adev->mmio_idx_lock, flags);
287 for (last = pos + size; pos < last; pos += 4) {
288 uint32_t tmp = pos >> 31;
e35e2b11 289
e35e2b11 290 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
ce05ac56
CK
291 if (tmp != hi) {
292 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
293 hi = tmp;
294 }
e35e2b11
TY
295 if (write)
296 WREG32_NO_KIQ(mmMM_DATA, *buf++);
297 else
298 *buf++ = RREG32_NO_KIQ(mmMM_DATA);
e35e2b11 299 }
ce05ac56 300 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
e35e2b11
TY
301}
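
/*
 * Illustrative sketch (not part of the driver): reading a single dword back
 * from a VRAM offset with the helper above.  "vram_offset" is a hypothetical
 * placeholder; real callers include the RAS and debugfs code paths.
 *
 *	uint32_t value;
 *
 *	amdgpu_device_vram_access(adev, vram_offset, &value,
 *				  sizeof(value), false);
 *	dev_info(adev->dev, "VRAM[0x%llx] = 0x%08x\n",
 *		 (unsigned long long)vram_offset, value);
 */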
302
d38ceaf9 303/*
f7ee1874 304 * register access helper functions.
d38ceaf9 305 */
e3ecdffa 306/**
f7ee1874 307 * amdgpu_device_rreg - read a memory mapped IO or indirect register
e3ecdffa
AD
308 *
309 * @adev: amdgpu_device pointer
310 * @reg: dword aligned register offset
311 * @acc_flags: access flags which require special behavior
312 *
313 * Returns the 32 bit value from the offset specified.
314 */
f7ee1874
HZ
315uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
316 uint32_t reg, uint32_t acc_flags)
d38ceaf9 317{
f4b373f4
TSD
318 uint32_t ret;
319
bf36b52e
AG
320 if (adev->in_pci_err_recovery)
321 return 0;
322
f7ee1874
HZ
323 if ((reg * 4) < adev->rmmio_size) {
324 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
325 amdgpu_sriov_runtime(adev) &&
326 down_read_trylock(&adev->reset_sem)) {
327 ret = amdgpu_kiq_rreg(adev, reg);
328 up_read(&adev->reset_sem);
329 } else {
330 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
331 }
332 } else {
333 ret = adev->pcie_rreg(adev, reg * 4);
81202807 334 }
bc992ba5 335
f7ee1874 336 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
e78b579d 337
f4b373f4 338 return ret;
d38ceaf9
AD
339}
340
421a2a30
ML
/*
 * MMIO register read with bytes helper functions
 * @offset: bytes offset from MMIO start
 *
 */
346
e3ecdffa
AD
347/**
348 * amdgpu_mm_rreg8 - read a memory mapped IO register
349 *
350 * @adev: amdgpu_device pointer
351 * @offset: byte aligned register offset
352 *
353 * Returns the 8 bit value from the offset specified.
354 */
7cbbc745
AG
355uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
356{
bf36b52e
AG
357 if (adev->in_pci_err_recovery)
358 return 0;
359
421a2a30
ML
360 if (offset < adev->rmmio_size)
361 return (readb(adev->rmmio + offset));
362 BUG();
363}
364
/*
 * MMIO register write with bytes helper functions
 * @offset: bytes offset from MMIO start
 * @value: the value to be written to the register
 *
 */
/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
7cbbc745
AG
380void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
381{
bf36b52e
AG
382 if (adev->in_pci_err_recovery)
383 return;
384
421a2a30
ML
385 if (offset < adev->rmmio_size)
386 writeb(value, adev->rmmio + offset);
387 else
388 BUG();
389}
390
e3ecdffa 391/**
f7ee1874 392 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
e3ecdffa
AD
393 *
394 * @adev: amdgpu_device pointer
395 * @reg: dword aligned register offset
396 * @v: 32 bit value to write to the register
397 * @acc_flags: access flags which require special behavior
398 *
399 * Writes the value specified to the offset specified.
400 */
f7ee1874
HZ
401void amdgpu_device_wreg(struct amdgpu_device *adev,
402 uint32_t reg, uint32_t v,
403 uint32_t acc_flags)
d38ceaf9 404{
bf36b52e
AG
405 if (adev->in_pci_err_recovery)
406 return;
407
f7ee1874
HZ
408 if ((reg * 4) < adev->rmmio_size) {
409 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
410 amdgpu_sriov_runtime(adev) &&
411 down_read_trylock(&adev->reset_sem)) {
412 amdgpu_kiq_wreg(adev, reg, v);
413 up_read(&adev->reset_sem);
414 } else {
415 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
416 }
417 } else {
418 adev->pcie_wreg(adev, reg * 4, v);
81202807 419 }
bc992ba5 420
f7ee1874 421 trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
2e0cc4d4 422}
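
/*
 * Illustrative sketch (not part of the driver): IP code normally does not
 * call amdgpu_device_rreg()/amdgpu_device_wreg() directly; it goes through
 * the RREG32()/WREG32() style wrappers from amdgpu.h, which pass an
 * acc_flags of 0.  A hypothetical read-modify-write looks like:
 *
 *	uint32_t tmp;
 *
 *	tmp = RREG32(reg_offset);
 *	tmp |= enable_bit;
 *	WREG32(reg_offset, tmp);
 *
 * reg_offset and enable_bit stand in for real register definitions from
 * the ASIC headers.
 */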
d38ceaf9 423
/*
 * amdgpu_mm_wreg_mmio_rlc - write register either with mmio or with RLC path if in range
 *
 * this function is invoked only for debugfs register access
 */
f7ee1874
HZ
429void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
430 uint32_t reg, uint32_t v)
2e0cc4d4 431{
bf36b52e
AG
432 if (adev->in_pci_err_recovery)
433 return;
434
2e0cc4d4 435 if (amdgpu_sriov_fullaccess(adev) &&
436 adev->gfx.rlc.funcs &&
437 adev->gfx.rlc.funcs->is_rlcg_access_range) {
2e0cc4d4
ML
438 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
439 return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v);
f7ee1874
HZ
440 } else {
441 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
47ed4e1c 442 }
d38ceaf9
AD
443}
444
e3ecdffa
AD
445/**
446 * amdgpu_io_rreg - read an IO register
447 *
448 * @adev: amdgpu_device pointer
449 * @reg: dword aligned register offset
450 *
451 * Returns the 32 bit value from the offset specified.
452 */
d38ceaf9
AD
453u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
454{
bf36b52e
AG
455 if (adev->in_pci_err_recovery)
456 return 0;
457
d38ceaf9
AD
458 if ((reg * 4) < adev->rio_mem_size)
459 return ioread32(adev->rio_mem + (reg * 4));
460 else {
461 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
462 return ioread32(adev->rio_mem + (mmMM_DATA * 4));
463 }
464}
465
e3ecdffa
AD
466/**
467 * amdgpu_io_wreg - write to an IO register
468 *
469 * @adev: amdgpu_device pointer
470 * @reg: dword aligned register offset
471 * @v: 32 bit value to write to the register
472 *
473 * Writes the value specified to the offset specified.
474 */
d38ceaf9
AD
475void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
476{
bf36b52e
AG
477 if (adev->in_pci_err_recovery)
478 return;
479
d38ceaf9
AD
480 if ((reg * 4) < adev->rio_mem_size)
481 iowrite32(v, adev->rio_mem + (reg * 4));
482 else {
483 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
484 iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
485 }
486}
487
488/**
489 * amdgpu_mm_rdoorbell - read a doorbell dword
490 *
491 * @adev: amdgpu_device pointer
492 * @index: doorbell index
493 *
494 * Returns the value in the doorbell aperture at the
495 * requested doorbell index (CIK).
496 */
497u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
498{
bf36b52e
AG
499 if (adev->in_pci_err_recovery)
500 return 0;
501
d38ceaf9
AD
502 if (index < adev->doorbell.num_doorbells) {
503 return readl(adev->doorbell.ptr + index);
504 } else {
505 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
506 return 0;
507 }
508}
509
510/**
511 * amdgpu_mm_wdoorbell - write a doorbell dword
512 *
513 * @adev: amdgpu_device pointer
514 * @index: doorbell index
515 * @v: value to write
516 *
517 * Writes @v to the doorbell aperture at the
518 * requested doorbell index (CIK).
519 */
520void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
521{
bf36b52e
AG
522 if (adev->in_pci_err_recovery)
523 return;
524
d38ceaf9
AD
525 if (index < adev->doorbell.num_doorbells) {
526 writel(v, adev->doorbell.ptr + index);
527 } else {
528 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
529 }
530}
531
832be404
KW
532/**
533 * amdgpu_mm_rdoorbell64 - read a doorbell Qword
534 *
535 * @adev: amdgpu_device pointer
536 * @index: doorbell index
537 *
538 * Returns the value in the doorbell aperture at the
539 * requested doorbell index (VEGA10+).
540 */
541u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
542{
bf36b52e
AG
543 if (adev->in_pci_err_recovery)
544 return 0;
545
832be404
KW
546 if (index < adev->doorbell.num_doorbells) {
547 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
548 } else {
549 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
550 return 0;
551 }
552}
553
554/**
555 * amdgpu_mm_wdoorbell64 - write a doorbell Qword
556 *
557 * @adev: amdgpu_device pointer
558 * @index: doorbell index
559 * @v: value to write
560 *
561 * Writes @v to the doorbell aperture at the
562 * requested doorbell index (VEGA10+).
563 */
564void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
565{
bf36b52e
AG
566 if (adev->in_pci_err_recovery)
567 return;
568
832be404
KW
569 if (index < adev->doorbell.num_doorbells) {
570 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
571 } else {
572 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
573 }
574}
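
/*
 * Illustrative sketch (not part of the driver): ring code records its
 * doorbell index at init time and then kicks the GPU by writing the ring
 * write pointer to that doorbell.  The names below are simplified
 * placeholders for the real ring fields.
 *
 *	amdgpu_mm_wdoorbell64(adev, ring_doorbell_index, ring_wptr);
 */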
575
1bba3683
HZ
576/**
577 * amdgpu_device_indirect_rreg - read an indirect register
578 *
579 * @adev: amdgpu_device pointer
580 * @pcie_index: mmio register offset
581 * @pcie_data: mmio register offset
582 *
583 * Returns the value of indirect register @reg_addr
584 */
585u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
586 u32 pcie_index, u32 pcie_data,
587 u32 reg_addr)
588{
589 unsigned long flags;
590 u32 r;
591 void __iomem *pcie_index_offset;
592 void __iomem *pcie_data_offset;
593
594 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
595 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
596 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
597
598 writel(reg_addr, pcie_index_offset);
599 readl(pcie_index_offset);
600 r = readl(pcie_data_offset);
601 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
602
603 return r;
604}
605
606/**
607 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
608 *
609 * @adev: amdgpu_device pointer
610 * @pcie_index: mmio register offset
611 * @pcie_data: mmio register offset
612 *
613 * Returns the value of indirect register @reg_addr
614 */
615u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
616 u32 pcie_index, u32 pcie_data,
617 u32 reg_addr)
618{
619 unsigned long flags;
620 u64 r;
621 void __iomem *pcie_index_offset;
622 void __iomem *pcie_data_offset;
623
624 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
625 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
626 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
627
628 /* read low 32 bits */
629 writel(reg_addr, pcie_index_offset);
630 readl(pcie_index_offset);
631 r = readl(pcie_data_offset);
632 /* read high 32 bits */
633 writel(reg_addr + 4, pcie_index_offset);
634 readl(pcie_index_offset);
635 r |= ((u64)readl(pcie_data_offset) << 32);
636 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
637
638 return r;
639}
640
641/**
642 * amdgpu_device_indirect_wreg - write an indirect register address
643 *
644 * @adev: amdgpu_device pointer
645 * @pcie_index: mmio register offset
646 * @pcie_data: mmio register offset
647 * @reg_addr: indirect register offset
648 * @reg_data: indirect register data
649 *
650 */
651void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
652 u32 pcie_index, u32 pcie_data,
653 u32 reg_addr, u32 reg_data)
654{
655 unsigned long flags;
656 void __iomem *pcie_index_offset;
657 void __iomem *pcie_data_offset;
658
659 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
660 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
661 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
662
663 writel(reg_addr, pcie_index_offset);
664 readl(pcie_index_offset);
665 writel(reg_data, pcie_data_offset);
666 readl(pcie_data_offset);
667 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
668}
669
670/**
671 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
672 *
673 * @adev: amdgpu_device pointer
674 * @pcie_index: mmio register offset
675 * @pcie_data: mmio register offset
676 * @reg_addr: indirect register offset
677 * @reg_data: indirect register data
678 *
679 */
680void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
681 u32 pcie_index, u32 pcie_data,
682 u32 reg_addr, u64 reg_data)
683{
684 unsigned long flags;
685 void __iomem *pcie_index_offset;
686 void __iomem *pcie_data_offset;
687
688 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
689 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
690 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
691
692 /* write low 32 bits */
693 writel(reg_addr, pcie_index_offset);
694 readl(pcie_index_offset);
695 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
696 readl(pcie_data_offset);
697 /* write high 32 bits */
698 writel(reg_addr + 4, pcie_index_offset);
699 readl(pcie_index_offset);
700 writel((u32)(reg_data >> 32), pcie_data_offset);
701 readl(pcie_data_offset);
702 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
703}
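
/*
 * Illustrative sketch (not part of this file): SoC code typically wires its
 * adev->pcie_rreg/pcie_wreg callbacks through these helpers, passing the
 * PCIE index/data register offsets of its NBIO block.  The offsets below
 * are placeholders, not real register addresses.
 *
 *	static u32 soc_pcie_rreg(struct amdgpu_device *adev, u32 reg)
 *	{
 *		u32 pcie_index = SOC_PCIE_INDEX_OFFSET;
 *		u32 pcie_data = SOC_PCIE_DATA_OFFSET;
 *
 *		return amdgpu_device_indirect_rreg(adev, pcie_index,
 *						   pcie_data, reg);
 *	}
 */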
704
d38ceaf9
AD
705/**
706 * amdgpu_invalid_rreg - dummy reg read function
707 *
708 * @adev: amdgpu device pointer
709 * @reg: offset of register
710 *
711 * Dummy register read function. Used for register blocks
712 * that certain asics don't have (all asics).
713 * Returns the value in the register.
714 */
715static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
716{
717 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
718 BUG();
719 return 0;
720}
721
/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
732static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
733{
734 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
735 reg, v);
736 BUG();
737}
738
4fa1c6a6
TZ
739/**
740 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
741 *
742 * @adev: amdgpu device pointer
743 * @reg: offset of register
744 *
745 * Dummy register read function. Used for register blocks
746 * that certain asics don't have (all asics).
747 * Returns the value in the register.
748 */
749static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
750{
751 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
752 BUG();
753 return 0;
754}
755
/**
 * amdgpu_invalid_wreg64 - dummy 64 bit reg write function
 *
 * @adev: amdgpu device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
766static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
767{
768 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
769 reg, v);
770 BUG();
771}
772
d38ceaf9
AD
773/**
774 * amdgpu_block_invalid_rreg - dummy reg read function
775 *
776 * @adev: amdgpu device pointer
777 * @block: offset of instance
778 * @reg: offset of register
779 *
780 * Dummy register read function. Used for register blocks
781 * that certain asics don't have (all asics).
782 * Returns the value in the register.
783 */
784static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
785 uint32_t block, uint32_t reg)
786{
787 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
788 reg, block);
789 BUG();
790 return 0;
791}
792
/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
804static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
805 uint32_t block,
806 uint32_t reg, uint32_t v)
807{
808 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
809 reg, block, v);
810 BUG();
811}
812
/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
820static int amdgpu_device_asic_init(struct amdgpu_device *adev)
821{
822 amdgpu_asic_pre_asic_init(adev);
823
824 return amdgpu_atom_asic_init(adev->mode_info.atom_context);
825}
826
e3ecdffa
AD
827/**
828 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
829 *
830 * @adev: amdgpu device pointer
831 *
832 * Allocates a scratch page of VRAM for use by various things in the
833 * driver.
834 */
06ec9070 835static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
d38ceaf9 836{
a4a02777
CK
837 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
838 PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
839 &adev->vram_scratch.robj,
840 &adev->vram_scratch.gpu_addr,
841 (void **)&adev->vram_scratch.ptr);
d38ceaf9
AD
842}
843
e3ecdffa
AD
844/**
845 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
846 *
847 * @adev: amdgpu device pointer
848 *
849 * Frees the VRAM scratch page.
850 */
06ec9070 851static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
d38ceaf9 852{
078af1a3 853 amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
d38ceaf9
AD
854}
855
/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
866void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
867 const u32 *registers,
868 const u32 array_size)
d38ceaf9
AD
869{
870 u32 tmp, reg, and_mask, or_mask;
871 int i;
872
873 if (array_size % 3)
874 return;
875
876 for (i = 0; i < array_size; i +=3) {
877 reg = registers[i + 0];
878 and_mask = registers[i + 1];
879 or_mask = registers[i + 2];
880
881 if (and_mask == 0xffffffff) {
882 tmp = or_mask;
883 } else {
884 tmp = RREG32(reg);
885 tmp &= ~and_mask;
e0d07657
HZ
886 if (adev->family >= AMDGPU_FAMILY_AI)
887 tmp |= (or_mask & and_mask);
888 else
889 tmp |= or_mask;
d38ceaf9
AD
890 }
891 WREG32(reg, tmp);
892 }
893}
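
/*
 * Illustrative sketch (not part of the driver): golden register arrays are
 * triplets of {register, and_mask, or_mask}.  An and_mask of 0xffffffff
 * overwrites the register with or_mask; anything else results in a
 * read-modify-write.  The register names below are placeholders.
 *
 *	static const u32 example_golden_settings[] = {
 *		mmEXAMPLE_REG_A, 0xffffffff, 0x00000100,
 *		mmEXAMPLE_REG_B, 0x0000000f, 0x00000002,
 *	};
 *
 *	amdgpu_device_program_register_sequence(adev, example_golden_settings,
 *						ARRAY_SIZE(example_golden_settings));
 */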
894
e3ecdffa
AD
895/**
896 * amdgpu_device_pci_config_reset - reset the GPU
897 *
898 * @adev: amdgpu_device pointer
899 *
900 * Resets the GPU using the pci config reset sequence.
901 * Only applicable to asics prior to vega10.
902 */
8111c387 903void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
d38ceaf9
AD
904{
905 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
906}
907
908/*
909 * GPU doorbell aperture helpers function.
910 */
911/**
06ec9070 912 * amdgpu_device_doorbell_init - Init doorbell driver information.
d38ceaf9
AD
913 *
914 * @adev: amdgpu_device pointer
915 *
916 * Init doorbell driver information (CIK)
917 * Returns 0 on success, error on failure.
918 */
06ec9070 919static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
d38ceaf9 920{
6585661d 921
705e519e
CK
922 /* No doorbell on SI hardware generation */
923 if (adev->asic_type < CHIP_BONAIRE) {
924 adev->doorbell.base = 0;
925 adev->doorbell.size = 0;
926 adev->doorbell.num_doorbells = 0;
927 adev->doorbell.ptr = NULL;
928 return 0;
929 }
930
d6895ad3
CK
931 if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
932 return -EINVAL;
933
22357775
AD
934 amdgpu_asic_init_doorbell_index(adev);
935
d38ceaf9
AD
936 /* doorbell bar mapping */
937 adev->doorbell.base = pci_resource_start(adev->pdev, 2);
938 adev->doorbell.size = pci_resource_len(adev->pdev, 2);
939
edf600da 940 adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
9564f192 941 adev->doorbell_index.max_assignment+1);
d38ceaf9
AD
942 if (adev->doorbell.num_doorbells == 0)
943 return -EINVAL;
944
ec3db8a6 945 /* For Vega, reserve and map two pages on doorbell BAR since SDMA
88dc26e4
OZ
946 * paging queue doorbell use the second page. The
947 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
948 * doorbells are in the first page. So with paging queue enabled,
949 * the max num_doorbells should + 1 page (0x400 in dword)
ec3db8a6
PY
950 */
951 if (adev->asic_type >= CHIP_VEGA10)
88dc26e4 952 adev->doorbell.num_doorbells += 0x400;
ec3db8a6 953
8972e5d2
CK
954 adev->doorbell.ptr = ioremap(adev->doorbell.base,
955 adev->doorbell.num_doorbells *
956 sizeof(u32));
957 if (adev->doorbell.ptr == NULL)
d38ceaf9 958 return -ENOMEM;
d38ceaf9
AD
959
960 return 0;
961}
962
963/**
06ec9070 964 * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
d38ceaf9
AD
965 *
966 * @adev: amdgpu_device pointer
967 *
968 * Tear down doorbell driver information (CIK)
969 */
06ec9070 970static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
d38ceaf9
AD
971{
972 iounmap(adev->doorbell.ptr);
973 adev->doorbell.ptr = NULL;
974}
975
22cb0164 976
d38ceaf9
AD
977
/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */
983
984/**
06ec9070 985 * amdgpu_device_wb_fini - Disable Writeback and free memory
d38ceaf9
AD
986 *
987 * @adev: amdgpu_device pointer
988 *
989 * Disables Writeback and frees the Writeback memory (all asics).
990 * Used at driver shutdown.
991 */
06ec9070 992static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
d38ceaf9
AD
993{
994 if (adev->wb.wb_obj) {
a76ed485
AD
995 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
996 &adev->wb.gpu_addr,
997 (void **)&adev->wb.wb);
d38ceaf9
AD
998 adev->wb.wb_obj = NULL;
999 }
1000}
1001
1002/**
06ec9070 1003 * amdgpu_device_wb_init- Init Writeback driver info and allocate memory
d38ceaf9
AD
1004 *
1005 * @adev: amdgpu_device pointer
1006 *
455a7bc2 1007 * Initializes writeback and allocates writeback memory (all asics).
d38ceaf9
AD
1008 * Used at driver startup.
1009 * Returns 0 on success or an -error on failure.
1010 */
06ec9070 1011static int amdgpu_device_wb_init(struct amdgpu_device *adev)
d38ceaf9
AD
1012{
1013 int r;
1014
1015 if (adev->wb.wb_obj == NULL) {
97407b63
AD
1016 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1017 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
a76ed485
AD
1018 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1019 &adev->wb.wb_obj, &adev->wb.gpu_addr,
1020 (void **)&adev->wb.wb);
d38ceaf9
AD
1021 if (r) {
1022 dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1023 return r;
1024 }
d38ceaf9
AD
1025
1026 adev->wb.num_wb = AMDGPU_MAX_WB;
1027 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1028
1029 /* clear wb memory */
73469585 1030 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
d38ceaf9
AD
1031 }
1032
1033 return 0;
1034}
1035
1036/**
131b4b36 1037 * amdgpu_device_wb_get - Allocate a wb entry
d38ceaf9
AD
1038 *
1039 * @adev: amdgpu_device pointer
1040 * @wb: wb index
1041 *
1042 * Allocate a wb slot for use by the driver (all asics).
1043 * Returns 0 on success or -EINVAL on failure.
1044 */
131b4b36 1045int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
d38ceaf9
AD
1046{
1047 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
d38ceaf9 1048
97407b63 1049 if (offset < adev->wb.num_wb) {
7014285a 1050 __set_bit(offset, adev->wb.used);
63ae07ca 1051 *wb = offset << 3; /* convert to dw offset */
0915fdbc
ML
1052 return 0;
1053 } else {
1054 return -EINVAL;
1055 }
1056}
1057
d38ceaf9 1058/**
131b4b36 1059 * amdgpu_device_wb_free - Free a wb entry
d38ceaf9
AD
1060 *
1061 * @adev: amdgpu_device pointer
1062 * @wb: wb index
1063 *
1064 * Free a wb slot allocated for use by the driver (all asics)
1065 */
131b4b36 1066void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
d38ceaf9 1067{
73469585 1068 wb >>= 3;
d38ceaf9 1069 if (wb < adev->wb.num_wb)
73469585 1070 __clear_bit(wb, adev->wb.used);
d38ceaf9
AD
1071}
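
/*
 * Illustrative sketch (not part of the driver): rings and IP blocks grab a
 * writeback slot at init time and release it on teardown.  The returned
 * index is a dword offset into adev->wb.wb; the matching GPU address is
 * adev->wb.gpu_addr + index * 4.
 *
 *	u32 wb_idx;
 *	int r;
 *
 *	r = amdgpu_device_wb_get(adev, &wb_idx);
 *	if (r)
 *		return r;
 *
 *	... GPU writes status to adev->wb.gpu_addr + wb_idx * 4,
 *	    CPU reads it back via adev->wb.wb[wb_idx] ...
 *
 *	amdgpu_device_wb_free(adev, wb_idx);
 */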
1072
d6895ad3
CK
1073/**
1074 * amdgpu_device_resize_fb_bar - try to resize FB BAR
1075 *
1076 * @adev: amdgpu_device pointer
1077 *
1078 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1079 * to fail, but if any of the BARs is not accessible after the size we abort
1080 * driver loading by returning -ENODEV.
1081 */
1082int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1083{
770d13b1 1084 u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size);
d6895ad3 1085 u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1;
31b8adab
CK
1086 struct pci_bus *root;
1087 struct resource *res;
1088 unsigned i;
d6895ad3
CK
1089 u16 cmd;
1090 int r;
1091
0c03b912 1092 /* Bypass for VF */
1093 if (amdgpu_sriov_vf(adev))
1094 return 0;
1095
b7221f2b
AD
1096 /* skip if the bios has already enabled large BAR */
1097 if (adev->gmc.real_vram_size &&
1098 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1099 return 0;
1100
31b8adab
CK
1101 /* Check if the root BUS has 64bit memory resources */
1102 root = adev->pdev->bus;
1103 while (root->parent)
1104 root = root->parent;
1105
1106 pci_bus_for_each_resource(root, res, i) {
0ebb7c54 1107 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
31b8adab
CK
1108 res->start > 0x100000000ull)
1109 break;
1110 }
1111
1112 /* Trying to resize is pointless without a root hub window above 4GB */
1113 if (!res)
1114 return 0;
1115
d6895ad3
CK
1116 /* Disable memory decoding while we change the BAR addresses and size */
1117 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1118 pci_write_config_word(adev->pdev, PCI_COMMAND,
1119 cmd & ~PCI_COMMAND_MEMORY);
1120
1121 /* Free the VRAM and doorbell BAR, we most likely need to move both. */
06ec9070 1122 amdgpu_device_doorbell_fini(adev);
d6895ad3
CK
1123 if (adev->asic_type >= CHIP_BONAIRE)
1124 pci_release_resource(adev->pdev, 2);
1125
1126 pci_release_resource(adev->pdev, 0);
1127
1128 r = pci_resize_resource(adev->pdev, 0, rbar_size);
1129 if (r == -ENOSPC)
1130 DRM_INFO("Not enough PCI address space for a large BAR.");
1131 else if (r && r != -ENOTSUPP)
1132 DRM_ERROR("Problem resizing BAR0 (%d).", r);
1133
1134 pci_assign_unassigned_bus_resources(adev->pdev->bus);
1135
1136 /* When the doorbell or fb BAR isn't available we have no chance of
1137 * using the device.
1138 */
06ec9070 1139 r = amdgpu_device_doorbell_init(adev);
d6895ad3
CK
1140 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1141 return -ENODEV;
1142
1143 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1144
1145 return 0;
1146}
a05502e5 1147
d38ceaf9
AD
1148/*
1149 * GPU helpers function.
1150 */
1151/**
39c640c0 1152 * amdgpu_device_need_post - check if the hw need post or not
d38ceaf9
AD
1153 *
1154 * @adev: amdgpu_device pointer
1155 *
c836fec5
JQ
1156 * Check if the asic has been initialized (all asics) at driver startup
1157 * or post is needed if hw reset is performed.
1158 * Returns true if need or false if not.
d38ceaf9 1159 */
39c640c0 1160bool amdgpu_device_need_post(struct amdgpu_device *adev)
d38ceaf9
AD
1161{
1162 uint32_t reg;
1163
bec86378
ML
1164 if (amdgpu_sriov_vf(adev))
1165 return false;
1166
1167 if (amdgpu_passthrough(adev)) {
		/* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
		 * some old smc fw still need the driver to do vPost, otherwise the gpu hangs.
		 * smc fw versions above 22.15 don't have this flaw, so we force
		 * vPost to be executed for smc versions below 22.15
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;
			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if error occurred */
1178 if (err)
1179 return true;
1180
1181 fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1da2c326
ML
1182 if (fw_ver < 0x00160e00)
1183 return true;
bec86378 1184 }
bec86378 1185 }
91fe77eb 1186
1187 if (adev->has_hw_reset) {
1188 adev->has_hw_reset = false;
1189 return true;
1190 }
1191
1192 /* bios scratch used on CIK+ */
1193 if (adev->asic_type >= CHIP_BONAIRE)
1194 return amdgpu_atombios_scratch_need_asic_init(adev);
1195
1196 /* check MEM_SIZE for older asics */
1197 reg = amdgpu_asic_get_config_memsize(adev);
1198
1199 if ((reg != 0) && (reg != 0xffffffff))
1200 return false;
1201
1202 return true;
bec86378
ML
1203}
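
/*
 * Illustrative sketch (not part of the driver): device init code typically
 * pairs this check with the atom asic_init wrapper defined earlier in this
 * file:
 *
 *	if (amdgpu_device_need_post(adev))
 *		r = amdgpu_device_asic_init(adev);
 */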
1204
d38ceaf9
AD
1205/* if we get transitioned to only one device, take VGA back */
1206/**
06ec9070 1207 * amdgpu_device_vga_set_decode - enable/disable vga decode
d38ceaf9
AD
1208 *
1209 * @cookie: amdgpu_device pointer
1210 * @state: enable/disable vga decode
1211 *
1212 * Enable/disable vga decode (all asics).
1213 * Returns VGA resource flags.
1214 */
06ec9070 1215static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
d38ceaf9
AD
1216{
1217 struct amdgpu_device *adev = cookie;
1218 amdgpu_asic_set_vga_state(adev, state);
1219 if (state)
1220 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1221 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1222 else
1223 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1224}
1225
e3ecdffa
AD
1226/**
1227 * amdgpu_device_check_block_size - validate the vm block size
1228 *
1229 * @adev: amdgpu_device pointer
1230 *
1231 * Validates the vm block size specified via module parameter.
1232 * The vm block size defines number of bits in page table versus page directory,
1233 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1234 * page table and the remaining bits are in the page directory.
1235 */
06ec9070 1236static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
a1adf8be
CZ
1237{
1238 /* defines number of bits in page table versus page directory,
1239 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1240 * page table and the remaining bits are in the page directory */
bab4fee7
JZ
1241 if (amdgpu_vm_block_size == -1)
1242 return;
a1adf8be 1243
bab4fee7 1244 if (amdgpu_vm_block_size < 9) {
a1adf8be
CZ
1245 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1246 amdgpu_vm_block_size);
97489129 1247 amdgpu_vm_block_size = -1;
a1adf8be 1248 }
a1adf8be
CZ
1249}
1250
e3ecdffa
AD
1251/**
1252 * amdgpu_device_check_vm_size - validate the vm size
1253 *
1254 * @adev: amdgpu_device pointer
1255 *
1256 * Validates the vm size in GB specified via module parameter.
1257 * The VM size is the size of the GPU virtual memory space in GB.
1258 */
06ec9070 1259static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
83ca145d 1260{
64dab074
AD
1261 /* no need to check the default value */
1262 if (amdgpu_vm_size == -1)
1263 return;
1264
83ca145d
ZJ
1265 if (amdgpu_vm_size < 1) {
1266 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1267 amdgpu_vm_size);
f3368128 1268 amdgpu_vm_size = -1;
83ca145d 1269 }
83ca145d
ZJ
1270}
1271
7951e376
RZ
1272static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1273{
1274 struct sysinfo si;
a9d4fe2f 1275 bool is_os_64 = (sizeof(void *) == 8);
7951e376
RZ
1276 uint64_t total_memory;
1277 uint64_t dram_size_seven_GB = 0x1B8000000;
1278 uint64_t dram_size_three_GB = 0xB8000000;
1279
1280 if (amdgpu_smu_memory_pool_size == 0)
1281 return;
1282
1283 if (!is_os_64) {
1284 DRM_WARN("Not 64-bit OS, feature not supported\n");
1285 goto def_value;
1286 }
1287 si_meminfo(&si);
1288 total_memory = (uint64_t)si.totalram * si.mem_unit;
1289
1290 if ((amdgpu_smu_memory_pool_size == 1) ||
1291 (amdgpu_smu_memory_pool_size == 2)) {
1292 if (total_memory < dram_size_three_GB)
1293 goto def_value1;
1294 } else if ((amdgpu_smu_memory_pool_size == 4) ||
1295 (amdgpu_smu_memory_pool_size == 8)) {
1296 if (total_memory < dram_size_seven_GB)
1297 goto def_value1;
1298 } else {
1299 DRM_WARN("Smu memory pool size not supported\n");
1300 goto def_value;
1301 }
1302 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1303
1304 return;
1305
def_value1:
	DRM_WARN("Not enough system memory\n");
def_value:
1309 adev->pm.smu_prv_buffer_size = 0;
1310}
1311
d38ceaf9 1312/**
06ec9070 1313 * amdgpu_device_check_arguments - validate module params
d38ceaf9
AD
1314 *
1315 * @adev: amdgpu_device pointer
1316 *
1317 * Validates certain module parameters and updates
1318 * the associated values used by the driver (all asics).
1319 */
912dfc84 1320static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
d38ceaf9 1321{
5b011235
CZ
1322 if (amdgpu_sched_jobs < 4) {
1323 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1324 amdgpu_sched_jobs);
1325 amdgpu_sched_jobs = 4;
76117507 1326 } else if (!is_power_of_2(amdgpu_sched_jobs)){
5b011235
CZ
1327 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1328 amdgpu_sched_jobs);
1329 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1330 }
d38ceaf9 1331
83e74db6 1332 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
f9321cc4
CK
1333 /* gart size must be greater or equal to 32M */
1334 dev_warn(adev->dev, "gart size (%d) too small\n",
1335 amdgpu_gart_size);
83e74db6 1336 amdgpu_gart_size = -1;
d38ceaf9
AD
1337 }
1338
36d38372 1339 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
c4e1a13a 1340 /* gtt size must be greater or equal to 32M */
36d38372
CK
1341 dev_warn(adev->dev, "gtt size (%d) too small\n",
1342 amdgpu_gtt_size);
1343 amdgpu_gtt_size = -1;
d38ceaf9
AD
1344 }
1345
d07f14be
RH
1346 /* valid range is between 4 and 9 inclusive */
1347 if (amdgpu_vm_fragment_size != -1 &&
1348 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1349 dev_warn(adev->dev, "valid range is between 4 and 9\n");
1350 amdgpu_vm_fragment_size = -1;
1351 }
1352
5d5bd5e3
KW
1353 if (amdgpu_sched_hw_submission < 2) {
1354 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1355 amdgpu_sched_hw_submission);
1356 amdgpu_sched_hw_submission = 2;
1357 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1358 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1359 amdgpu_sched_hw_submission);
1360 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1361 }
1362
7951e376
RZ
1363 amdgpu_device_check_smu_prv_buffer_size(adev);
1364
06ec9070 1365 amdgpu_device_check_vm_size(adev);
d38ceaf9 1366
06ec9070 1367 amdgpu_device_check_block_size(adev);
6a7f76e7 1368
19aede77 1369 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
912dfc84 1370
c6252390 1371 amdgpu_gmc_tmz_set(adev);
01a8dcec 1372
a300de40
ML
1373 if (amdgpu_num_kcq > 8 || amdgpu_num_kcq < 0) {
1374 amdgpu_num_kcq = 8;
c16ce562 1375 dev_warn(adev->dev, "set kernel compute queue number to 8 due to invalid parameter provided by user\n");
a300de40
ML
1376 }
1377
9b498efa
AD
1378 amdgpu_gmc_noretry_set(adev);
1379
e3c00faa 1380 return 0;
d38ceaf9
AD
1381}
1382
/**
 * amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver. Suspends or resumes
 * the asics before or after it is powered up using ACPI methods.
 */
8aba21b7
LT
1392static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1393 enum vga_switcheroo_state state)
d38ceaf9
AD
1394{
1395 struct drm_device *dev = pci_get_drvdata(pdev);
de185019 1396 int r;
d38ceaf9 1397
31af062a 1398 if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF)
d38ceaf9
AD
1399 return;
1400
1401 if (state == VGA_SWITCHEROO_ON) {
dd4fa6c1 1402 pr_info("switched on\n");
d38ceaf9
AD
1403 /* don't suspend or resume card normally */
1404 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1405
de185019 1406 pci_set_power_state(dev->pdev, PCI_D0);
c1dd4aa6 1407 amdgpu_device_load_pci_state(dev->pdev);
de185019
AD
1408 r = pci_enable_device(dev->pdev);
1409 if (r)
1410 DRM_WARN("pci_enable_device failed (%d)\n", r);
1411 amdgpu_device_resume(dev, true);
d38ceaf9 1412
d38ceaf9
AD
1413 dev->switch_power_state = DRM_SWITCH_POWER_ON;
1414 drm_kms_helper_poll_enable(dev);
1415 } else {
dd4fa6c1 1416 pr_info("switched off\n");
d38ceaf9
AD
1417 drm_kms_helper_poll_disable(dev);
1418 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
de185019 1419 amdgpu_device_suspend(dev, true);
c1dd4aa6 1420 amdgpu_device_cache_pci_state(dev->pdev);
de185019
AD
1421 /* Shut down the device */
1422 pci_disable_device(dev->pdev);
1423 pci_set_power_state(dev->pdev, PCI_D3cold);
d38ceaf9
AD
1424 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1425 }
1426}
1427
1428/**
1429 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1430 *
1431 * @pdev: pci dev pointer
1432 *
1433 * Callback for the switcheroo driver. Check of the switcheroo
1434 * state can be changed.
1435 * Returns true if the state can be changed, false if not.
1436 */
1437static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1438{
1439 struct drm_device *dev = pci_get_drvdata(pdev);
1440
1441 /*
1442 * FIXME: open_count is protected by drm_global_mutex but that would lead to
1443 * locking inversion with the driver load path. And the access here is
1444 * completely racy anyway. So don't bother with locking for now.
1445 */
7e13ad89 1446 return atomic_read(&dev->open_count) == 0;
d38ceaf9
AD
1447}
1448
1449static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1450 .set_gpu_state = amdgpu_switcheroo_set_state,
1451 .reprobe = NULL,
1452 .can_switch = amdgpu_switcheroo_can_switch,
1453};
1454
e3ecdffa
AD
1455/**
1456 * amdgpu_device_ip_set_clockgating_state - set the CG state
1457 *
87e3f136 1458 * @dev: amdgpu_device pointer
e3ecdffa
AD
1459 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1460 * @state: clockgating state (gate or ungate)
1461 *
1462 * Sets the requested clockgating state for all instances of
1463 * the hardware IP specified.
1464 * Returns the error code from the last instance.
1465 */
43fa561f 1466int amdgpu_device_ip_set_clockgating_state(void *dev,
2990a1fc
AD
1467 enum amd_ip_block_type block_type,
1468 enum amd_clockgating_state state)
d38ceaf9 1469{
43fa561f 1470 struct amdgpu_device *adev = dev;
d38ceaf9
AD
1471 int i, r = 0;
1472
1473 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 1474 if (!adev->ip_blocks[i].status.valid)
9ecbe7f5 1475 continue;
c722865a
RZ
1476 if (adev->ip_blocks[i].version->type != block_type)
1477 continue;
1478 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1479 continue;
1480 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1481 (void *)adev, state);
1482 if (r)
1483 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1484 adev->ip_blocks[i].version->funcs->name, r);
d38ceaf9
AD
1485 }
1486 return r;
1487}
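
/*
 * Illustrative sketch (not part of the driver): a power-management path
 * might gate clocks for a single IP type like this; GFX is just an example
 * block type, and r is an int.
 *
 *	r = amdgpu_device_ip_set_clockgating_state(adev,
 *						   AMD_IP_BLOCK_TYPE_GFX,
 *						   AMD_CG_STATE_GATE);
 *	if (r)
 *		dev_warn(adev->dev, "failed to gate GFX clocks (%d)\n", r);
 */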
1488
e3ecdffa
AD
1489/**
1490 * amdgpu_device_ip_set_powergating_state - set the PG state
1491 *
87e3f136 1492 * @dev: amdgpu_device pointer
e3ecdffa
AD
1493 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1494 * @state: powergating state (gate or ungate)
1495 *
1496 * Sets the requested powergating state for all instances of
1497 * the hardware IP specified.
1498 * Returns the error code from the last instance.
1499 */
43fa561f 1500int amdgpu_device_ip_set_powergating_state(void *dev,
2990a1fc
AD
1501 enum amd_ip_block_type block_type,
1502 enum amd_powergating_state state)
d38ceaf9 1503{
43fa561f 1504 struct amdgpu_device *adev = dev;
d38ceaf9
AD
1505 int i, r = 0;
1506
1507 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 1508 if (!adev->ip_blocks[i].status.valid)
9ecbe7f5 1509 continue;
c722865a
RZ
1510 if (adev->ip_blocks[i].version->type != block_type)
1511 continue;
1512 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1513 continue;
1514 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1515 (void *)adev, state);
1516 if (r)
1517 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1518 adev->ip_blocks[i].version->funcs->name, r);
d38ceaf9
AD
1519 }
1520 return r;
1521}
1522
e3ecdffa
AD
1523/**
1524 * amdgpu_device_ip_get_clockgating_state - get the CG state
1525 *
1526 * @adev: amdgpu_device pointer
1527 * @flags: clockgating feature flags
1528 *
1529 * Walks the list of IPs on the device and updates the clockgating
1530 * flags for each IP.
1531 * Updates @flags with the feature flags for each hardware IP where
1532 * clockgating is enabled.
1533 */
2990a1fc
AD
1534void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1535 u32 *flags)
6cb2d4e4
HR
1536{
1537 int i;
1538
1539 for (i = 0; i < adev->num_ip_blocks; i++) {
1540 if (!adev->ip_blocks[i].status.valid)
1541 continue;
1542 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1543 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1544 }
1545}
1546
e3ecdffa
AD
1547/**
1548 * amdgpu_device_ip_wait_for_idle - wait for idle
1549 *
1550 * @adev: amdgpu_device pointer
1551 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1552 *
1553 * Waits for the request hardware IP to be idle.
1554 * Returns 0 for success or a negative error code on failure.
1555 */
2990a1fc
AD
1556int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1557 enum amd_ip_block_type block_type)
5dbbb60b
AD
1558{
1559 int i, r;
1560
1561 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 1562 if (!adev->ip_blocks[i].status.valid)
9ecbe7f5 1563 continue;
a1255107
AD
1564 if (adev->ip_blocks[i].version->type == block_type) {
1565 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
5dbbb60b
AD
1566 if (r)
1567 return r;
1568 break;
1569 }
1570 }
1571 return 0;
1572
1573}
1574
e3ecdffa
AD
1575/**
1576 * amdgpu_device_ip_is_idle - is the hardware IP idle
1577 *
1578 * @adev: amdgpu_device pointer
1579 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1580 *
1581 * Check if the hardware IP is idle or not.
 * Returns true if the IP is idle, false if not.
1583 */
2990a1fc
AD
1584bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1585 enum amd_ip_block_type block_type)
5dbbb60b
AD
1586{
1587 int i;
1588
1589 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 1590 if (!adev->ip_blocks[i].status.valid)
9ecbe7f5 1591 continue;
a1255107
AD
1592 if (adev->ip_blocks[i].version->type == block_type)
1593 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
5dbbb60b
AD
1594 }
1595 return true;
1596
1597}
1598
e3ecdffa
AD
1599/**
1600 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1601 *
1602 * @adev: amdgpu_device pointer
87e3f136 1603 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
e3ecdffa
AD
1604 *
1605 * Returns a pointer to the hardware IP block structure
1606 * if it exists for the asic, otherwise NULL.
1607 */
2990a1fc
AD
1608struct amdgpu_ip_block *
1609amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1610 enum amd_ip_block_type type)
d38ceaf9
AD
1611{
1612 int i;
1613
1614 for (i = 0; i < adev->num_ip_blocks; i++)
a1255107 1615 if (adev->ip_blocks[i].version->type == type)
d38ceaf9
AD
1616 return &adev->ip_blocks[i];
1617
1618 return NULL;
1619}
1620
1621/**
2990a1fc 1622 * amdgpu_device_ip_block_version_cmp
d38ceaf9
AD
1623 *
1624 * @adev: amdgpu_device pointer
5fc3aeeb 1625 * @type: enum amd_ip_block_type
d38ceaf9
AD
1626 * @major: major version
1627 * @minor: minor version
1628 *
1629 * return 0 if equal or greater
1630 * return 1 if smaller or the ip_block doesn't exist
1631 */
2990a1fc
AD
1632int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1633 enum amd_ip_block_type type,
1634 u32 major, u32 minor)
d38ceaf9 1635{
2990a1fc 1636 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
d38ceaf9 1637
a1255107
AD
1638 if (ip_block && ((ip_block->version->major > major) ||
1639 ((ip_block->version->major == major) &&
1640 (ip_block->version->minor >= minor))))
d38ceaf9
AD
1641 return 0;
1642
1643 return 1;
1644}
1645
a1255107 1646/**
2990a1fc 1647 * amdgpu_device_ip_block_add
a1255107
AD
1648 *
1649 * @adev: amdgpu_device pointer
1650 * @ip_block_version: pointer to the IP to add
1651 *
1652 * Adds the IP block driver information to the collection of IPs
1653 * on the asic.
1654 */
2990a1fc
AD
1655int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1656 const struct amdgpu_ip_block_version *ip_block_version)
a1255107
AD
1657{
1658 if (!ip_block_version)
1659 return -EINVAL;
1660
e966a725 1661 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
a0bae357
HR
1662 ip_block_version->funcs->name);
1663
a1255107
AD
1664 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1665
1666 return 0;
1667}
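
/*
 * Illustrative sketch (not part of this file): the per-ASIC code (e.g.
 * soc15.c or nv.c) builds the IP list at early-init time by repeatedly
 * calling amdgpu_device_ip_block_add().  The block versions named below
 * are examples only; the exact list depends on the ASIC.
 *
 *	amdgpu_device_ip_block_add(adev, &vega10_common_ip_block);
 *	amdgpu_device_ip_block_add(adev, &gmc_v9_0_ip_block);
 *	amdgpu_device_ip_block_add(adev, &vega10_ih_ip_block);
 *	...
 */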
1668
/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display. This feature provides a virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
 */
483ef985 1681static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
9accf2fd
ED
1682{
1683 adev->enable_virtual_display = false;
1684
1685 if (amdgpu_virtual_display) {
4a580877 1686 struct drm_device *ddev = adev_to_drm(adev);
9accf2fd 1687 const char *pci_address_name = pci_name(ddev->pdev);
0f66356d 1688 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
9accf2fd
ED
1689
1690 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1691 pciaddstr_tmp = pciaddstr;
0f66356d
ED
1692 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1693 pciaddname = strsep(&pciaddname_tmp, ",");
967de2a9
YT
1694 if (!strcmp("all", pciaddname)
1695 || !strcmp(pci_address_name, pciaddname)) {
0f66356d
ED
1696 long num_crtc;
1697 int res = -1;
1698
9accf2fd 1699 adev->enable_virtual_display = true;
0f66356d
ED
1700
1701 if (pciaddname_tmp)
1702 res = kstrtol(pciaddname_tmp, 10,
1703 &num_crtc);
1704
1705 if (!res) {
1706 if (num_crtc < 1)
1707 num_crtc = 1;
1708 if (num_crtc > 6)
1709 num_crtc = 6;
1710 adev->mode_info.num_crtc = num_crtc;
1711 } else {
1712 adev->mode_info.num_crtc = 1;
1713 }
9accf2fd
ED
1714 break;
1715 }
1716 }
1717
0f66356d
ED
1718 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1719 amdgpu_virtual_display, pci_address_name,
1720 adev->enable_virtual_display, adev->mode_info.num_crtc);
9accf2fd
ED
1721
1722 kfree(pciaddstr);
1723 }
1724}
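
/*
 * Illustrative sketch (not part of the driver): the virtual_display module
 * parameter is a semicolon separated list of PCI addresses, each optionally
 * followed by a comma and the number of virtual crtcs (clamped to 1..6),
 * e.g. on the kernel command line:
 *
 *	amdgpu.virtual_display=0000:01:00.0,2;0000:02:00.0
 *
 * or "all" to enable it on every amdgpu device.  The PCI addresses above
 * are examples only.
 */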
1725
/**
 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
 *
 * @adev: amdgpu_device pointer
 *
 * Parses the asic configuration parameters specified in the gpu info
 * firmware and makes them available to the driver for use in configuring
 * the asic.
 * Returns 0 on success, -EINVAL on failure.
 */
e2a75f88
AD
1736static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1737{
e2a75f88 1738 const char *chip_name;
c0a43457 1739 char fw_name[40];
e2a75f88
AD
1740 int err;
1741 const struct gpu_info_firmware_header_v1_0 *hdr;
1742
ab4fe3e1
HR
1743 adev->firmware.gpu_info_fw = NULL;
1744
72de33f8 1745 if (adev->mman.discovery_bin) {
258620d0 1746 amdgpu_discovery_get_gfx_info(adev);

		/*
		 * FIXME: The bounding box is still needed by Navi12, so
		 * temporarily read it from gpu_info firmware. Should be dropped
		 * when DAL no longer needs it.
		 */
1753 if (adev->asic_type != CHIP_NAVI12)
1754 return 0;
258620d0
AD
1755 }
1756
e2a75f88 1757 switch (adev->asic_type) {
e2a75f88
AD
1758#ifdef CONFIG_DRM_AMDGPU_SI
1759 case CHIP_VERDE:
1760 case CHIP_TAHITI:
1761 case CHIP_PITCAIRN:
1762 case CHIP_OLAND:
1763 case CHIP_HAINAN:
1764#endif
1765#ifdef CONFIG_DRM_AMDGPU_CIK
1766 case CHIP_BONAIRE:
1767 case CHIP_HAWAII:
1768 case CHIP_KAVERI:
1769 case CHIP_KABINI:
1770 case CHIP_MULLINS:
1771#endif
da87c30b
AD
1772 case CHIP_TOPAZ:
1773 case CHIP_TONGA:
1774 case CHIP_FIJI:
1775 case CHIP_POLARIS10:
1776 case CHIP_POLARIS11:
1777 case CHIP_POLARIS12:
1778 case CHIP_VEGAM:
1779 case CHIP_CARRIZO:
1780 case CHIP_STONEY:
27c0bc71 1781 case CHIP_VEGA20:
84d244a3
JC
1782 case CHIP_SIENNA_CICHLID:
1783 case CHIP_NAVY_FLOUNDER:
e2a75f88
AD
1784 default:
1785 return 0;
1786 case CHIP_VEGA10:
1787 chip_name = "vega10";
1788 break;
3f76dced
AD
1789 case CHIP_VEGA12:
1790 chip_name = "vega12";
1791 break;
2d2e5e7e 1792 case CHIP_RAVEN:
54f78a76 1793 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
54c4d17e 1794 chip_name = "raven2";
54f78a76 1795 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
741deade 1796 chip_name = "picasso";
54c4d17e
FX
1797 else
1798 chip_name = "raven";
2d2e5e7e 1799 break;
65e60f6e
LM
1800 case CHIP_ARCTURUS:
1801 chip_name = "arcturus";
1802 break;
b51a26a0
HR
1803 case CHIP_RENOIR:
1804 chip_name = "renoir";
1805 break;
23c6268e
HR
1806 case CHIP_NAVI10:
1807 chip_name = "navi10";
1808 break;
ed42cfe1
XY
1809 case CHIP_NAVI14:
1810 chip_name = "navi14";
1811 break;
42b325e5
XY
1812 case CHIP_NAVI12:
1813 chip_name = "navi12";
1814 break;
e2a75f88
AD
1815 }
1816
1817 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
ab4fe3e1 1818 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
e2a75f88
AD
1819 if (err) {
1820 dev_err(adev->dev,
1821 "Failed to load gpu_info firmware \"%s\"\n",
1822 fw_name);
1823 goto out;
1824 }
ab4fe3e1 1825 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
e2a75f88
AD
1826 if (err) {
1827 dev_err(adev->dev,
1828 "Failed to validate gpu_info firmware \"%s\"\n",
1829 fw_name);
1830 goto out;
1831 }
1832
ab4fe3e1 1833 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
e2a75f88
AD
1834 amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1835
1836 switch (hdr->version_major) {
1837 case 1:
1838 {
1839 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
ab4fe3e1 1840 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
e2a75f88
AD
1841 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1842
cc375d8c
TY
1843 /*
1844 * Should be dropped when DAL no longer needs it.
1845 */
1846 if (adev->asic_type == CHIP_NAVI12)
ec51d3fa
XY
1847 goto parse_soc_bounding_box;
1848
b5ab16bf
AD
1849 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1850 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1851 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1852 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
e2a75f88 1853 adev->gfx.config.max_texture_channel_caches =
b5ab16bf
AD
1854 le32_to_cpu(gpu_info_fw->gc_num_tccs);
1855 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1856 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1857 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1858 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
e2a75f88 1859 adev->gfx.config.double_offchip_lds_buf =
b5ab16bf
AD
1860 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1861 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
51fd0370
HZ
1862 adev->gfx.cu_info.max_waves_per_simd =
1863 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1864 adev->gfx.cu_info.max_scratch_slots_per_cu =
1865 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
1866 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
48321c3d 1867 if (hdr->version_minor >= 1) {
35c2e910
HZ
1868 const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1869 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1870 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1871 adev->gfx.config.num_sc_per_sh =
1872 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
1873 adev->gfx.config.num_packer_per_sc =
1874 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
1875 }
ec51d3fa
XY
1876
1877parse_soc_bounding_box:
ec51d3fa
XY
1878 /*
1879 * soc bounding box info is not integrated in the discovery table,
258620d0 1880 * so it always has to be parsed from the gpu info firmware when needed.
ec51d3fa 1881 */
48321c3d
HW
1882 if (hdr->version_minor == 2) {
1883 const struct gpu_info_firmware_v1_2 *gpu_info_fw =
1884 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
1885 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1886 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
1887 }
e2a75f88
AD
1888 break;
1889 }
1890 default:
1891 dev_err(adev->dev,
1892 "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
1893 err = -EINVAL;
1894 goto out;
1895 }
1896out:
e2a75f88
AD
1897 return err;
1898}
1899
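/*
 * Illustrative example of the resulting firmware lookup, assuming the
 * standard firmware loader search path (chip name hypothetical):
 *
 *	snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", "vega10");
 *	request_firmware() then typically resolves this to
 *	/lib/firmware/amdgpu/vega10_gpu_info.bin.
 */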
e3ecdffa
AD
1900/**
1901 * amdgpu_device_ip_early_init - run early init for hardware IPs
1902 *
1903 * @adev: amdgpu_device pointer
1904 *
1905 * Early initialization pass for hardware IPs. The hardware IPs that make
1906 * up each asic are discovered and each IP's early_init callback is run. This
1907 * is the first stage in initializing the asic.
1908 * Returns 0 on success, negative error code on failure.
1909 */
06ec9070 1910static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
d38ceaf9 1911{
aaa36a97 1912 int i, r;
d38ceaf9 1913
483ef985 1914 amdgpu_device_enable_virtual_display(adev);
a6be7570 1915
00a979f3 1916 if (amdgpu_sriov_vf(adev)) {
00a979f3 1917 r = amdgpu_virt_request_full_gpu(adev, true);
aaa36a97
AD
1918 if (r)
1919 return r;
00a979f3
WS
1920 }
1921
d38ceaf9 1922 switch (adev->asic_type) {
33f34802
KW
1923#ifdef CONFIG_DRM_AMDGPU_SI
1924 case CHIP_VERDE:
1925 case CHIP_TAHITI:
1926 case CHIP_PITCAIRN:
1927 case CHIP_OLAND:
1928 case CHIP_HAINAN:
295d0daf 1929 adev->family = AMDGPU_FAMILY_SI;
33f34802
KW
1930 r = si_set_ip_blocks(adev);
1931 if (r)
1932 return r;
1933 break;
1934#endif
a2e73f56
AD
1935#ifdef CONFIG_DRM_AMDGPU_CIK
1936 case CHIP_BONAIRE:
1937 case CHIP_HAWAII:
1938 case CHIP_KAVERI:
1939 case CHIP_KABINI:
1940 case CHIP_MULLINS:
e1ad2d53 1941 if (adev->flags & AMD_IS_APU)
a2e73f56 1942 adev->family = AMDGPU_FAMILY_KV;
e1ad2d53
AD
1943 else
1944 adev->family = AMDGPU_FAMILY_CI;
a2e73f56
AD
1945
1946 r = cik_set_ip_blocks(adev);
1947 if (r)
1948 return r;
1949 break;
1950#endif
da87c30b
AD
1951 case CHIP_TOPAZ:
1952 case CHIP_TONGA:
1953 case CHIP_FIJI:
1954 case CHIP_POLARIS10:
1955 case CHIP_POLARIS11:
1956 case CHIP_POLARIS12:
1957 case CHIP_VEGAM:
1958 case CHIP_CARRIZO:
1959 case CHIP_STONEY:
1960 if (adev->flags & AMD_IS_APU)
1961 adev->family = AMDGPU_FAMILY_CZ;
1962 else
1963 adev->family = AMDGPU_FAMILY_VI;
1964
1965 r = vi_set_ip_blocks(adev);
1966 if (r)
1967 return r;
1968 break;
e48a3cd9
AD
1969 case CHIP_VEGA10:
1970 case CHIP_VEGA12:
e4bd8170 1971 case CHIP_VEGA20:
e48a3cd9 1972 case CHIP_RAVEN:
61cf44c1 1973 case CHIP_ARCTURUS:
b51a26a0 1974 case CHIP_RENOIR:
70534d1e 1975 if (adev->flags & AMD_IS_APU)
2ca8a5d2
CZ
1976 adev->family = AMDGPU_FAMILY_RV;
1977 else
1978 adev->family = AMDGPU_FAMILY_AI;
460826e6
KW
1979
1980 r = soc15_set_ip_blocks(adev);
1981 if (r)
1982 return r;
1983 break;
0a5b8c7b 1984 case CHIP_NAVI10:
7ecb5cd4 1985 case CHIP_NAVI14:
4808cf9c 1986 case CHIP_NAVI12:
11e8aef5 1987 case CHIP_SIENNA_CICHLID:
41f446bf 1988 case CHIP_NAVY_FLOUNDER:
0a5b8c7b
HR
1989 adev->family = AMDGPU_FAMILY_NV;
1990
1991 r = nv_set_ip_blocks(adev);
1992 if (r)
1993 return r;
1994 break;
d38ceaf9
AD
1995 default:
1996 /* FIXME: not supported yet */
1997 return -EINVAL;
1998 }
1999
1884734a 2000 amdgpu_amdkfd_device_probe(adev);
2001
3b94fb10 2002 adev->pm.pp_feature = amdgpu_pp_feature_mask;
a35ad98b 2003 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
00544006 2004 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
00f54b97 2005
d38ceaf9
AD
2006 for (i = 0; i < adev->num_ip_blocks; i++) {
2007 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
ed8cf00c
HR
2008 DRM_ERROR("disabled ip block: %d <%s>\n",
2009 i, adev->ip_blocks[i].version->funcs->name);
a1255107 2010 adev->ip_blocks[i].status.valid = false;
d38ceaf9 2011 } else {
a1255107
AD
2012 if (adev->ip_blocks[i].version->funcs->early_init) {
2013 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2c1a2784 2014 if (r == -ENOENT) {
a1255107 2015 adev->ip_blocks[i].status.valid = false;
2c1a2784 2016 } else if (r) {
a1255107
AD
2017 DRM_ERROR("early_init of IP block <%s> failed %d\n",
2018 adev->ip_blocks[i].version->funcs->name, r);
d38ceaf9 2019 return r;
2c1a2784 2020 } else {
a1255107 2021 adev->ip_blocks[i].status.valid = true;
2c1a2784 2022 }
974e6b64 2023 } else {
a1255107 2024 adev->ip_blocks[i].status.valid = true;
d38ceaf9 2025 }
d38ceaf9 2026 }
21a249ca
AD
2027 /* get the vbios after the asic_funcs are set up */
2028 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
6e29c227
AD
2029 r = amdgpu_device_parse_gpu_info_fw(adev);
2030 if (r)
2031 return r;
2032
21a249ca
AD
2033 /* Read BIOS */
2034 if (!amdgpu_get_bios(adev))
2035 return -EINVAL;
2036
2037 r = amdgpu_atombios_init(adev);
2038 if (r) {
2039 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2040 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2041 return r;
2042 }
2043 }
d38ceaf9
AD
2044 }
2045
395d1fb9
NH
2046 adev->cg_flags &= amdgpu_cg_mask;
2047 adev->pg_flags &= amdgpu_pg_mask;
2048
d38ceaf9
AD
2049 return 0;
2050}
2051
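/*
 * Illustrative note on the ip_block_mask test above: bit i of the
 * amdgpu.ip_block_mask module parameter corresponds to the IP block
 * registered at index i (registration order); clearing a bit marks
 * that block invalid and skips its init. Hypothetical example:
 *
 *	modprobe amdgpu ip_block_mask=0xfffffffd	(disables block index 1)
 */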
0a4f2520
RZ
2052static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2053{
2054 int i, r;
2055
2056 for (i = 0; i < adev->num_ip_blocks; i++) {
2057 if (!adev->ip_blocks[i].status.sw)
2058 continue;
2059 if (adev->ip_blocks[i].status.hw)
2060 continue;
2061 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2d11fd3f 2062 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
0a4f2520
RZ
2063 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2064 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2065 if (r) {
2066 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2067 adev->ip_blocks[i].version->funcs->name, r);
2068 return r;
2069 }
2070 adev->ip_blocks[i].status.hw = true;
2071 }
2072 }
2073
2074 return 0;
2075}
2076
2077static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2078{
2079 int i, r;
2080
2081 for (i = 0; i < adev->num_ip_blocks; i++) {
2082 if (!adev->ip_blocks[i].status.sw)
2083 continue;
2084 if (adev->ip_blocks[i].status.hw)
2085 continue;
2086 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2087 if (r) {
2088 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2089 adev->ip_blocks[i].version->funcs->name, r);
2090 return r;
2091 }
2092 adev->ip_blocks[i].status.hw = true;
2093 }
2094
2095 return 0;
2096}
2097
7a3e0bb2
RZ
2098static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2099{
2100 int r = 0;
2101 int i;
80f41f84 2102 uint32_t smu_version;
7a3e0bb2
RZ
2103
2104 if (adev->asic_type >= CHIP_VEGA10) {
2105 for (i = 0; i < adev->num_ip_blocks; i++) {
482f0e53
ML
2106 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2107 continue;
2108
2109 /* no need to do the fw loading again if already done */
2110 if (adev->ip_blocks[i].status.hw == true)
2111 break;
2112
53b3f8f4 2113 if (amdgpu_in_reset(adev) || adev->in_suspend) {
482f0e53
ML
2114 r = adev->ip_blocks[i].version->funcs->resume(adev);
2115 if (r) {
2116 DRM_ERROR("resume of IP block <%s> failed %d\n",
7a3e0bb2 2117 adev->ip_blocks[i].version->funcs->name, r);
482f0e53
ML
2118 return r;
2119 }
2120 } else {
2121 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2122 if (r) {
2123 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2124 adev->ip_blocks[i].version->funcs->name, r);
2125 return r;
7a3e0bb2 2126 }
7a3e0bb2 2127 }
482f0e53
ML
2128
2129 adev->ip_blocks[i].status.hw = true;
2130 break;
7a3e0bb2
RZ
2131 }
2132 }
482f0e53 2133
8973d9ec
ED
2134 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2135 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
7a3e0bb2 2136
80f41f84 2137 return r;
7a3e0bb2
RZ
2138}
2139
e3ecdffa
AD
2140/**
2141 * amdgpu_device_ip_init - run init for hardware IPs
2142 *
2143 * @adev: amdgpu_device pointer
2144 *
2145 * Main initialization pass for hardware IPs. The list of all the hardware
2146 * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2147 * are run. sw_init initializes the software state associated with each IP
2148 * and hw_init initializes the hardware associated with each IP.
2149 * Returns 0 on success, negative error code on failure.
2150 */
06ec9070 2151static int amdgpu_device_ip_init(struct amdgpu_device *adev)
d38ceaf9
AD
2152{
2153 int i, r;
2154
c030f2e4 2155 r = amdgpu_ras_init(adev);
2156 if (r)
2157 return r;
2158
d38ceaf9 2159 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 2160 if (!adev->ip_blocks[i].status.valid)
d38ceaf9 2161 continue;
a1255107 2162 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2c1a2784 2163 if (r) {
a1255107
AD
2164 DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2165 adev->ip_blocks[i].version->funcs->name, r);
72d3f592 2166 goto init_failed;
2c1a2784 2167 }
a1255107 2168 adev->ip_blocks[i].status.sw = true;
bfca0289 2169
d38ceaf9 2170 /* need to do gmc hw init early so we can allocate gpu mem */
a1255107 2171 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
06ec9070 2172 r = amdgpu_device_vram_scratch_init(adev);
2c1a2784
AD
2173 if (r) {
2174 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
72d3f592 2175 goto init_failed;
2c1a2784 2176 }
a1255107 2177 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2c1a2784
AD
2178 if (r) {
2179 DRM_ERROR("hw_init %d failed %d\n", i, r);
72d3f592 2180 goto init_failed;
2c1a2784 2181 }
06ec9070 2182 r = amdgpu_device_wb_init(adev);
2c1a2784 2183 if (r) {
06ec9070 2184 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
72d3f592 2185 goto init_failed;
2c1a2784 2186 }
a1255107 2187 adev->ip_blocks[i].status.hw = true;
2493664f
ML
2188
2189 /* right after GMC hw init, we create CSA */
f92d5c61 2190 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
1e256e27
RZ
2191 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2192 AMDGPU_GEM_DOMAIN_VRAM,
2193 AMDGPU_CSA_SIZE);
2493664f
ML
2194 if (r) {
2195 DRM_ERROR("allocate CSA failed %d\n", r);
72d3f592 2196 goto init_failed;
2493664f
ML
2197 }
2198 }
d38ceaf9
AD
2199 }
2200 }
2201
c9ffa427
YT
2202 if (amdgpu_sriov_vf(adev))
2203 amdgpu_virt_init_data_exchange(adev);
2204
533aed27
AG
2205 r = amdgpu_ib_pool_init(adev);
2206 if (r) {
2207 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2208 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2209 goto init_failed;
2210 }
2211
c8963ea4
RZ
2212 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2213 if (r)
72d3f592 2214 goto init_failed;
0a4f2520
RZ
2215
2216 r = amdgpu_device_ip_hw_init_phase1(adev);
2217 if (r)
72d3f592 2218 goto init_failed;
0a4f2520 2219
7a3e0bb2
RZ
2220 r = amdgpu_device_fw_loading(adev);
2221 if (r)
72d3f592 2222 goto init_failed;
7a3e0bb2 2223
0a4f2520
RZ
2224 r = amdgpu_device_ip_hw_init_phase2(adev);
2225 if (r)
72d3f592 2226 goto init_failed;
d38ceaf9 2227
121a2bc6
AG
2228 /*
2229 * retired pages will be loaded from eeprom and reserved here,
2230 * it should be called after amdgpu_device_ip_hw_init_phase2 since
2231 * for some ASICs the RAS EEPROM code relies on SMU fully functioning
2232 * for I2C communication, which is only true at this point.
b82e65a9
GC
2233 *
2234 * amdgpu_ras_recovery_init may fail, but the upper layers only care about
2235 * failures caused by a bad gpu state and stop the amdgpu init process
2236 * accordingly. For other failure cases it still releases all the
2237 * resources and prints an error message, rather than returning a
2238 * negative value to the upper level.
121a2bc6
AG
2239 *
2240 * Note: theoretically, this should be called before all vram allocations
2241 * to protect retired pages from being reused
2242 */
b82e65a9
GC
2243 r = amdgpu_ras_recovery_init(adev);
2244 if (r)
2245 goto init_failed;
121a2bc6 2246
3e2e2ab5
HZ
2247 if (adev->gmc.xgmi.num_physical_nodes > 1)
2248 amdgpu_xgmi_add_device(adev);
1884734a 2249 amdgpu_amdkfd_device_init(adev);
c6332b97 2250
bd607166
KR
2251 amdgpu_fru_get_product_info(adev);
2252
72d3f592 2253init_failed:
c9ffa427 2254 if (amdgpu_sriov_vf(adev))
c6332b97 2255 amdgpu_virt_release_full_gpu(adev, true);
2256
72d3f592 2257 return r;
d38ceaf9
AD
2258}
2259
e3ecdffa
AD
2260/**
2261 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2262 *
2263 * @adev: amdgpu_device pointer
2264 *
2265 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
2266 * this function before a GPU reset. If the value is retained after a
2267 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
2268 */
06ec9070 2269static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
0c49e0b8
CZ
2270{
2271 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2272}
2273
e3ecdffa
AD
2274/**
2275 * amdgpu_device_check_vram_lost - check if vram is valid
2276 *
2277 * @adev: amdgpu_device pointer
2278 *
2279 * Checks the reset magic value written to the gart pointer in VRAM.
2280 * The driver calls this after a GPU reset to see if the contents of
2281 * VRAM are lost or not.
2282 * Returns true if vram is lost, false if not.
2283 */
06ec9070 2284static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
0c49e0b8 2285{
dadce777
EQ
2286 if (memcmp(adev->gart.ptr, adev->reset_magic,
2287 AMDGPU_RESET_MAGIC_NUM))
2288 return true;
2289
53b3f8f4 2290 if (!amdgpu_in_reset(adev))
dadce777
EQ
2291 return false;
2292
2293 /*
2294 * For all ASICs with baco/mode1 reset, the VRAM is
2295 * always assumed to be lost.
2296 */
2297 switch (amdgpu_asic_reset_method(adev)) {
2298 case AMD_RESET_METHOD_BACO:
2299 case AMD_RESET_METHOD_MODE1:
2300 return true;
2301 default:
2302 return false;
2303 }
0c49e0b8
CZ
2304}
2305
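/*
 * Illustrative sketch of how the two helpers above pair up: the driver
 * snapshots the magic while VRAM is known to be good (see the call in
 * amdgpu_device_ip_late_init() below) and compares it after an ASIC
 * reset; a mismatch, or a BACO/mode1 reset, means VRAM contents must be
 * treated as lost and restored. Hypothetical caller, error handling
 * omitted:
 */
#if 0
	amdgpu_device_fill_reset_magic(adev);
	r = amdgpu_asic_reset(adev);
	vram_lost = amdgpu_device_check_vram_lost(adev);
#endif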
e3ecdffa 2306/**
1112a46b 2307 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
e3ecdffa
AD
2308 *
2309 * @adev: amdgpu_device pointer
b8b72130 2310 * @state: clockgating state (gate or ungate)
e3ecdffa 2311 *
e3ecdffa 2312 * The list of all the hardware IPs that make up the asic is walked and the
1112a46b
RZ
2313 * set_clockgating_state callbacks are run.
2314 * During the late init pass this enables clockgating for hardware IPs;
2315 * during the fini or suspend pass it disables clockgating.
e3ecdffa
AD
2316 * Returns 0 on success, negative error code on failure.
2317 */
fdd34271 2318
1112a46b
RZ
2319static int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2320 enum amd_clockgating_state state)
d38ceaf9 2321{
1112a46b 2322 int i, j, r;
d38ceaf9 2323
4a2ba394
SL
2324 if (amdgpu_emu_mode == 1)
2325 return 0;
2326
1112a46b
RZ
2327 for (j = 0; j < adev->num_ip_blocks; j++) {
2328 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
a2d31dc3 2329 if (!adev->ip_blocks[i].status.late_initialized)
d38ceaf9 2330 continue;
4a446d55 2331 /* skip CG for VCE/UVD, it's handled specially */
a1255107 2332 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
57716327 2333 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
34319b32 2334 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
52f2e779 2335 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
57716327 2336 adev->ip_blocks[i].version->funcs->set_clockgating_state) {
4a446d55 2337 /* enable clockgating to save power */
a1255107 2338 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
1112a46b 2339 state);
4a446d55
AD
2340 if (r) {
2341 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
a1255107 2342 adev->ip_blocks[i].version->funcs->name, r);
4a446d55
AD
2343 return r;
2344 }
b0b00ff1 2345 }
d38ceaf9 2346 }
06b18f61 2347
c9f96fd5
RZ
2348 return 0;
2349}
2350
1112a46b 2351static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state)
c9f96fd5 2352{
1112a46b 2353 int i, j, r;
06b18f61 2354
c9f96fd5
RZ
2355 if (amdgpu_emu_mode == 1)
2356 return 0;
2357
1112a46b
RZ
2358 for (j = 0; j < adev->num_ip_blocks; j++) {
2359 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
a2d31dc3 2360 if (!adev->ip_blocks[i].status.late_initialized)
c9f96fd5
RZ
2361 continue;
2362 /* skip CG for VCE/UVD, it's handled specially */
2363 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2364 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2365 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
52f2e779 2366 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
c9f96fd5
RZ
2367 adev->ip_blocks[i].version->funcs->set_powergating_state) {
2368 /* enable powergating to save power */
2369 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
1112a46b 2370 state);
c9f96fd5
RZ
2371 if (r) {
2372 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2373 adev->ip_blocks[i].version->funcs->name, r);
2374 return r;
2375 }
2376 }
2377 }
2dc80b00
S
2378 return 0;
2379}
2380
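/*
 * Worked example of the index trick shared by the two helpers above:
 * with num_ip_blocks == 3, gating (AMD_CG_STATE_GATE/AMD_PG_STATE_GATE)
 * visits blocks 0, 1, 2 in registration order, while ungating visits
 * 2, 1, 0, so gating is torn down in the reverse order it was applied.
 */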
beff74bc
AD
2381static int amdgpu_device_enable_mgpu_fan_boost(void)
2382{
2383 struct amdgpu_gpu_instance *gpu_ins;
2384 struct amdgpu_device *adev;
2385 int i, ret = 0;
2386
2387 mutex_lock(&mgpu_info.mutex);
2388
2389 /*
2390 * MGPU fan boost feature should be enabled
2391 * only when there are two or more dGPUs in
2392 * the system
2393 */
2394 if (mgpu_info.num_dgpu < 2)
2395 goto out;
2396
2397 for (i = 0; i < mgpu_info.num_dgpu; i++) {
2398 gpu_ins = &(mgpu_info.gpu_ins[i]);
2399 adev = gpu_ins->adev;
2400 if (!(adev->flags & AMD_IS_APU) &&
f10bb940 2401 !gpu_ins->mgpu_fan_enabled) {
beff74bc
AD
2402 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2403 if (ret)
2404 break;
2405
2406 gpu_ins->mgpu_fan_enabled = 1;
2407 }
2408 }
2409
2410out:
2411 mutex_unlock(&mgpu_info.mutex);
2412
2413 return ret;
2414}
2415
e3ecdffa
AD
2416/**
2417 * amdgpu_device_ip_late_init - run late init for hardware IPs
2418 *
2419 * @adev: amdgpu_device pointer
2420 *
2421 * Late initialization pass for hardware IPs. The list of all the hardware
2422 * IPs that make up the asic is walked and the late_init callbacks are run.
2423 * late_init covers any special initialization that an IP requires
2424 * after all of the IPs have been initialized or something that needs to happen
2425 * late in the init process.
2426 * Returns 0 on success, negative error code on failure.
2427 */
06ec9070 2428static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2dc80b00 2429{
60599a03 2430 struct amdgpu_gpu_instance *gpu_instance;
2dc80b00
S
2431 int i = 0, r;
2432
2433 for (i = 0; i < adev->num_ip_blocks; i++) {
73f847db 2434 if (!adev->ip_blocks[i].status.hw)
2dc80b00
S
2435 continue;
2436 if (adev->ip_blocks[i].version->funcs->late_init) {
2437 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2438 if (r) {
2439 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2440 adev->ip_blocks[i].version->funcs->name, r);
2441 return r;
2442 }
2dc80b00 2443 }
73f847db 2444 adev->ip_blocks[i].status.late_initialized = true;
2dc80b00
S
2445 }
2446
a891d239
DL
2447 amdgpu_ras_set_error_query_ready(adev, true);
2448
1112a46b
RZ
2449 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2450 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
916ac57f 2451
06ec9070 2452 amdgpu_device_fill_reset_magic(adev);
d38ceaf9 2453
beff74bc
AD
2454 r = amdgpu_device_enable_mgpu_fan_boost();
2455 if (r)
2456 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2457
60599a03
EQ
2458
2459 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2460 mutex_lock(&mgpu_info.mutex);
2461
2462 /*
2463 * Reset device p-state to low as this was booted with high.
2464 *
2465 * This should be performed only after all devices from the same
2466 * hive get initialized.
2467 *
2468 * However, it's unknown in advance how many devices are in the hive,
2469 * as they are counted one by one during device initialization.
2470 *
2471 * So, we wait for all XGMI interlinked devices initialized.
2472 * This may bring some delays as those devices may come from
2473 * different hives. But that should be OK.
2474 */
2475 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2476 for (i = 0; i < mgpu_info.num_gpu; i++) {
2477 gpu_instance = &(mgpu_info.gpu_ins[i]);
2478 if (gpu_instance->adev->flags & AMD_IS_APU)
2479 continue;
2480
d84a430d
JK
2481 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2482 AMDGPU_XGMI_PSTATE_MIN);
60599a03
EQ
2483 if (r) {
2484 DRM_ERROR("pstate setting failed (%d).\n", r);
2485 break;
2486 }
2487 }
2488 }
2489
2490 mutex_unlock(&mgpu_info.mutex);
2491 }
2492
d38ceaf9
AD
2493 return 0;
2494}
2495
e3ecdffa
AD
2496/**
2497 * amdgpu_device_ip_fini - run fini for hardware IPs
2498 *
2499 * @adev: amdgpu_device pointer
2500 *
2501 * Main teardown pass for hardware IPs. The list of all the hardware
2502 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2503 * are run. hw_fini tears down the hardware associated with each IP
2504 * and sw_fini tears down any software state associated with each IP.
2505 * Returns 0 on success, negative error code on failure.
2506 */
06ec9070 2507static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
d38ceaf9
AD
2508{
2509 int i, r;
2510
5278a159
SY
2511 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2512 amdgpu_virt_release_ras_err_handler_data(adev);
2513
c030f2e4 2514 amdgpu_ras_pre_fini(adev);
2515
a82400b5
AG
2516 if (adev->gmc.xgmi.num_physical_nodes > 1)
2517 amdgpu_xgmi_remove_device(adev);
2518
1884734a 2519 amdgpu_amdkfd_device_fini(adev);
05df1f01
RZ
2520
2521 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
fdd34271
RZ
2522 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2523
3e96dbfd
AD
2524 /* need to disable SMC first */
2525 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 2526 if (!adev->ip_blocks[i].status.hw)
3e96dbfd 2527 continue;
fdd34271 2528 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
a1255107 2529 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
3e96dbfd
AD
2530 /* XXX handle errors */
2531 if (r) {
2532 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
a1255107 2533 adev->ip_blocks[i].version->funcs->name, r);
3e96dbfd 2534 }
a1255107 2535 adev->ip_blocks[i].status.hw = false;
3e96dbfd
AD
2536 break;
2537 }
2538 }
2539
d38ceaf9 2540 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
a1255107 2541 if (!adev->ip_blocks[i].status.hw)
d38ceaf9 2542 continue;
8201a67a 2543
a1255107 2544 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
d38ceaf9 2545 /* XXX handle errors */
2c1a2784 2546 if (r) {
a1255107
AD
2547 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2548 adev->ip_blocks[i].version->funcs->name, r);
2c1a2784 2549 }
8201a67a 2550
a1255107 2551 adev->ip_blocks[i].status.hw = false;
d38ceaf9
AD
2552 }
2553
9950cda2 2554
d38ceaf9 2555 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
a1255107 2556 if (!adev->ip_blocks[i].status.sw)
d38ceaf9 2557 continue;
c12aba3a
ML
2558
2559 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
c8963ea4 2560 amdgpu_ucode_free_bo(adev);
1e256e27 2561 amdgpu_free_static_csa(&adev->virt.csa_obj);
c12aba3a
ML
2562 amdgpu_device_wb_fini(adev);
2563 amdgpu_device_vram_scratch_fini(adev);
533aed27 2564 amdgpu_ib_pool_fini(adev);
c12aba3a
ML
2565 }
2566
a1255107 2567 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
d38ceaf9 2568 /* XXX handle errors */
2c1a2784 2569 if (r) {
a1255107
AD
2570 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2571 adev->ip_blocks[i].version->funcs->name, r);
2c1a2784 2572 }
a1255107
AD
2573 adev->ip_blocks[i].status.sw = false;
2574 adev->ip_blocks[i].status.valid = false;
d38ceaf9
AD
2575 }
2576
a6dcfd9c 2577 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
a1255107 2578 if (!adev->ip_blocks[i].status.late_initialized)
8a2eef1d 2579 continue;
a1255107
AD
2580 if (adev->ip_blocks[i].version->funcs->late_fini)
2581 adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2582 adev->ip_blocks[i].status.late_initialized = false;
a6dcfd9c
ML
2583 }
2584
c030f2e4 2585 amdgpu_ras_fini(adev);
2586
030308fc 2587 if (amdgpu_sriov_vf(adev))
24136135
ML
2588 if (amdgpu_virt_release_full_gpu(adev, false))
2589 DRM_ERROR("failed to release exclusive mode on fini\n");
2493664f 2590
d38ceaf9
AD
2591 return 0;
2592}
2593
e3ecdffa 2594/**
beff74bc 2595 * amdgpu_device_delayed_init_work_handler - work handler for IB tests
e3ecdffa 2596 *
1112a46b 2597 * @work: work_struct.
e3ecdffa 2598 */
beff74bc 2599static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2dc80b00
S
2600{
2601 struct amdgpu_device *adev =
beff74bc 2602 container_of(work, struct amdgpu_device, delayed_init_work.work);
916ac57f
RZ
2603 int r;
2604
2605 r = amdgpu_ib_ring_tests(adev);
2606 if (r)
2607 DRM_ERROR("ib ring test failed (%d).\n", r);
2dc80b00
S
2608}
2609
1e317b99
RZ
2610static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2611{
2612 struct amdgpu_device *adev =
2613 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2614
2615 mutex_lock(&adev->gfx.gfx_off_mutex);
2616 if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
2617 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2618 adev->gfx.gfx_off_state = true;
2619 }
2620 mutex_unlock(&adev->gfx.gfx_off_mutex);
2621}
2622
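/*
 * Illustrative, simplified sketch of the counterpart that drives this
 * delayed work (the real request-count handling lives in amdgpu_gfx.c
 * and differs in details such as underflow checks and the delay value):
 */
#if 0
	mutex_lock(&adev->gfx.gfx_off_mutex);
	if (enable)
		adev->gfx.gfx_off_req_count--;
	else
		adev->gfx.gfx_off_req_count++;
	if (enable && !adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count)
		schedule_delayed_work(&adev->gfx.gfx_off_delay_work,
				      msecs_to_jiffies(100) /* illustrative delay */);
	mutex_unlock(&adev->gfx.gfx_off_mutex);
#endif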
e3ecdffa 2623/**
e7854a03 2624 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
e3ecdffa
AD
2625 *
2626 * @adev: amdgpu_device pointer
2627 *
2628 * Main suspend function for hardware IPs. The list of all the hardware
2629 * IPs that make up the asic is walked, clockgating is disabled and the
2630 * suspend callbacks are run. suspend puts the hardware and software state
2631 * in each IP into a state suitable for suspend.
2632 * Returns 0 on success, negative error code on failure.
2633 */
e7854a03
AD
2634static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2635{
2636 int i, r;
2637
ced1ba97
PL
2638 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2639 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
05df1f01 2640
e7854a03
AD
2641 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2642 if (!adev->ip_blocks[i].status.valid)
2643 continue;
2b9f7848 2644
e7854a03 2645 /* displays are handled separately */
2b9f7848
ND
2646 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2647 continue;
2648
2649 /* XXX handle errors */
2650 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2651 /* XXX handle errors */
2652 if (r) {
2653 DRM_ERROR("suspend of IP block <%s> failed %d\n",
2654 adev->ip_blocks[i].version->funcs->name, r);
2655 return r;
e7854a03 2656 }
2b9f7848
ND
2657
2658 adev->ip_blocks[i].status.hw = false;
e7854a03
AD
2659 }
2660
e7854a03
AD
2661 return 0;
2662}
2663
2664/**
2665 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2666 *
2667 * @adev: amdgpu_device pointer
2668 *
2669 * Main suspend function for hardware IPs. The list of all the hardware
2670 * IPs that make up the asic is walked, clockgating is disabled and the
2671 * suspend callbacks are run. suspend puts the hardware and software state
2672 * in each IP into a state suitable for suspend.
2673 * Returns 0 on success, negative error code on failure.
2674 */
2675static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
d38ceaf9
AD
2676{
2677 int i, r;
2678
2679 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
a1255107 2680 if (!adev->ip_blocks[i].status.valid)
d38ceaf9 2681 continue;
e7854a03
AD
2682 /* displays are handled in phase1 */
2683 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2684 continue;
bff77e86
LM
2685 /* PSP lost connection when err_event_athub occurs */
2686 if (amdgpu_ras_intr_triggered() &&
2687 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2688 adev->ip_blocks[i].status.hw = false;
2689 continue;
2690 }
d38ceaf9 2691 /* XXX handle errors */
a1255107 2692 r = adev->ip_blocks[i].version->funcs->suspend(adev);
d38ceaf9 2693 /* XXX handle errors */
2c1a2784 2694 if (r) {
a1255107
AD
2695 DRM_ERROR("suspend of IP block <%s> failed %d\n",
2696 adev->ip_blocks[i].version->funcs->name, r);
2c1a2784 2697 }
876923fb 2698 adev->ip_blocks[i].status.hw = false;
a3a09142 2699 /* handle putting the SMC in the appropriate state */
86b93fd6
JZ
2700 if(!amdgpu_sriov_vf(adev)){
2701 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2702 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
2703 if (r) {
2704 DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
2705 adev->mp1_state, r);
2706 return r;
2707 }
a3a09142
AD
2708 }
2709 }
b5507c7e 2710 adev->ip_blocks[i].status.hw = false;
d38ceaf9
AD
2711 }
2712
2713 return 0;
2714}
2715
e7854a03
AD
2716/**
2717 * amdgpu_device_ip_suspend - run suspend for hardware IPs
2718 *
2719 * @adev: amdgpu_device pointer
2720 *
2721 * Main suspend function for hardware IPs. The list of all the hardware
2722 * IPs that make up the asic is walked, clockgating is disabled and the
2723 * suspend callbacks are run. suspend puts the hardware and software state
2724 * in each IP into a state suitable for suspend.
2725 * Returns 0 on success, negative error code on failure.
2726 */
2727int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
2728{
2729 int r;
2730
e7819644
YT
2731 if (amdgpu_sriov_vf(adev))
2732 amdgpu_virt_request_full_gpu(adev, false);
2733
e7854a03
AD
2734 r = amdgpu_device_ip_suspend_phase1(adev);
2735 if (r)
2736 return r;
2737 r = amdgpu_device_ip_suspend_phase2(adev);
2738
e7819644
YT
2739 if (amdgpu_sriov_vf(adev))
2740 amdgpu_virt_release_full_gpu(adev, false);
2741
e7854a03
AD
2742 return r;
2743}
2744
06ec9070 2745static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
a90ad3c2
ML
2746{
2747 int i, r;
2748
2cb681b6
ML
2749 static enum amd_ip_block_type ip_order[] = {
2750 AMD_IP_BLOCK_TYPE_GMC,
2751 AMD_IP_BLOCK_TYPE_COMMON,
39186aef 2752 AMD_IP_BLOCK_TYPE_PSP,
2cb681b6
ML
2753 AMD_IP_BLOCK_TYPE_IH,
2754 };
a90ad3c2 2755
2cb681b6
ML
2756 for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2757 int j;
2758 struct amdgpu_ip_block *block;
a90ad3c2 2759
4cd2a96d
J
2760 block = &adev->ip_blocks[i];
2761 block->status.hw = false;
2cb681b6 2762
4cd2a96d 2763 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
2cb681b6 2764
4cd2a96d 2765 if (block->version->type != ip_order[j] ||
2cb681b6
ML
2766 !block->status.valid)
2767 continue;
2768
2769 r = block->version->funcs->hw_init(adev);
0aaeefcc 2770 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
c41d1cf6
ML
2771 if (r)
2772 return r;
482f0e53 2773 block->status.hw = true;
a90ad3c2
ML
2774 }
2775 }
2776
2777 return 0;
2778}
2779
06ec9070 2780static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
a90ad3c2
ML
2781{
2782 int i, r;
2783
2cb681b6
ML
2784 static enum amd_ip_block_type ip_order[] = {
2785 AMD_IP_BLOCK_TYPE_SMC,
2786 AMD_IP_BLOCK_TYPE_DCE,
2787 AMD_IP_BLOCK_TYPE_GFX,
2788 AMD_IP_BLOCK_TYPE_SDMA,
257deb8c 2789 AMD_IP_BLOCK_TYPE_UVD,
d83c7a07
JJ
2790 AMD_IP_BLOCK_TYPE_VCE,
2791 AMD_IP_BLOCK_TYPE_VCN
2cb681b6 2792 };
a90ad3c2 2793
2cb681b6
ML
2794 for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2795 int j;
2796 struct amdgpu_ip_block *block;
a90ad3c2 2797
2cb681b6
ML
2798 for (j = 0; j < adev->num_ip_blocks; j++) {
2799 block = &adev->ip_blocks[j];
2800
2801 if (block->version->type != ip_order[i] ||
482f0e53
ML
2802 !block->status.valid ||
2803 block->status.hw)
2cb681b6
ML
2804 continue;
2805
895bd048
JZ
2806 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
2807 r = block->version->funcs->resume(adev);
2808 else
2809 r = block->version->funcs->hw_init(adev);
2810
0aaeefcc 2811 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
c41d1cf6
ML
2812 if (r)
2813 return r;
482f0e53 2814 block->status.hw = true;
a90ad3c2
ML
2815 }
2816 }
2817
2818 return 0;
2819}
2820
e3ecdffa
AD
2821/**
2822 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
2823 *
2824 * @adev: amdgpu_device pointer
2825 *
2826 * First resume function for hardware IPs. The list of all the hardware
2827 * IPs that make up the asic is walked and the resume callbacks are run for
2828 * COMMON, GMC, and IH. resume puts the hardware into a functional state
2829 * after a suspend and updates the software state as necessary. This
2830 * function is also used for restoring the GPU after a GPU reset.
2831 * Returns 0 on success, negative error code on failure.
2832 */
06ec9070 2833static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
d38ceaf9
AD
2834{
2835 int i, r;
2836
a90ad3c2 2837 for (i = 0; i < adev->num_ip_blocks; i++) {
482f0e53 2838 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
a90ad3c2 2839 continue;
a90ad3c2 2840 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
e3ecdffa
AD
2841 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2842 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
482f0e53 2843
fcf0649f
CZ
2844 r = adev->ip_blocks[i].version->funcs->resume(adev);
2845 if (r) {
2846 DRM_ERROR("resume of IP block <%s> failed %d\n",
2847 adev->ip_blocks[i].version->funcs->name, r);
2848 return r;
2849 }
482f0e53 2850 adev->ip_blocks[i].status.hw = true;
a90ad3c2
ML
2851 }
2852 }
2853
2854 return 0;
2855}
2856
e3ecdffa
AD
2857/**
2858 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
2859 *
2860 * @adev: amdgpu_device pointer
2861 *
2862 * Second resume function for hardware IPs. The list of all the hardware
2863 * IPs that make up the asic is walked and the resume callbacks are run for
2864 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a
2865 * functional state after a suspend and updates the software state as
2866 * necessary. This function is also used for restoring the GPU after a GPU
2867 * reset.
2868 * Returns 0 on success, negative error code on failure.
2869 */
06ec9070 2870static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
d38ceaf9
AD
2871{
2872 int i, r;
2873
2874 for (i = 0; i < adev->num_ip_blocks; i++) {
482f0e53 2875 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
d38ceaf9 2876 continue;
fcf0649f 2877 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
e3ecdffa 2878 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
7a3e0bb2
RZ
2879 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
2880 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
fcf0649f 2881 continue;
a1255107 2882 r = adev->ip_blocks[i].version->funcs->resume(adev);
2c1a2784 2883 if (r) {
a1255107
AD
2884 DRM_ERROR("resume of IP block <%s> failed %d\n",
2885 adev->ip_blocks[i].version->funcs->name, r);
d38ceaf9 2886 return r;
2c1a2784 2887 }
482f0e53 2888 adev->ip_blocks[i].status.hw = true;
d38ceaf9
AD
2889 }
2890
2891 return 0;
2892}
2893
e3ecdffa
AD
2894/**
2895 * amdgpu_device_ip_resume - run resume for hardware IPs
2896 *
2897 * @adev: amdgpu_device pointer
2898 *
2899 * Main resume function for hardware IPs. The hardware IPs
2900 * are split into two resume functions because they are
2901 * are also used in in recovering from a GPU reset and some additional
2902 * steps need to be take between them. In this case (S3/S4) they are
2903 * run sequentially.
2904 * Returns 0 on success, negative error code on failure.
2905 */
06ec9070 2906static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
fcf0649f
CZ
2907{
2908 int r;
2909
06ec9070 2910 r = amdgpu_device_ip_resume_phase1(adev);
fcf0649f
CZ
2911 if (r)
2912 return r;
7a3e0bb2
RZ
2913
2914 r = amdgpu_device_fw_loading(adev);
2915 if (r)
2916 return r;
2917
06ec9070 2918 r = amdgpu_device_ip_resume_phase2(adev);
fcf0649f
CZ
2919
2920 return r;
2921}
2922
e3ecdffa
AD
2923/**
2924 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
2925 *
2926 * @adev: amdgpu_device pointer
2927 *
2928 * Query the VBIOS data tables to determine if the board supports SR-IOV.
2929 */
4e99a44e 2930static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
048765ad 2931{
6867e1b5
ML
2932 if (amdgpu_sriov_vf(adev)) {
2933 if (adev->is_atom_fw) {
2934 if (amdgpu_atomfirmware_gpu_supports_virtualization(adev))
2935 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2936 } else {
2937 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
2938 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2939 }
2940
2941 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
2942 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
a5bde2f9 2943 }
048765ad
AR
2944}
2945
e3ecdffa
AD
2946/**
2947 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
2948 *
2949 * @asic_type: AMD asic type
2950 *
2951 * Check if there is DC (new modesetting infrastructure) support for an asic.
2952 * returns true if DC has support, false if not.
2953 */
4562236b
HW
2954bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
2955{
2956 switch (asic_type) {
2957#if defined(CONFIG_DRM_AMD_DC)
64200c46
MR
2958#if defined(CONFIG_DRM_AMD_DC_SI)
2959 case CHIP_TAHITI:
2960 case CHIP_PITCAIRN:
2961 case CHIP_VERDE:
2962 case CHIP_OLAND:
2963#endif
4562236b 2964 case CHIP_BONAIRE:
0d6fbccb 2965 case CHIP_KAVERI:
367e6687
AD
2966 case CHIP_KABINI:
2967 case CHIP_MULLINS:
d9fda248
HW
2968 /*
2969 * We have systems in the wild with these ASICs that require
2970 * LVDS and VGA support which is not supported with DC.
2971 *
2972 * Fall back to the non-DC driver here by default so as not to
2973 * cause regressions.
2974 */
2975 return amdgpu_dc > 0;
2976 case CHIP_HAWAII:
4562236b
HW
2977 case CHIP_CARRIZO:
2978 case CHIP_STONEY:
4562236b 2979 case CHIP_POLARIS10:
675fd32b 2980 case CHIP_POLARIS11:
2c8ad2d5 2981 case CHIP_POLARIS12:
675fd32b 2982 case CHIP_VEGAM:
4562236b
HW
2983 case CHIP_TONGA:
2984 case CHIP_FIJI:
42f8ffa1 2985 case CHIP_VEGA10:
dca7b401 2986 case CHIP_VEGA12:
c6034aa2 2987 case CHIP_VEGA20:
b86a1aa3 2988#if defined(CONFIG_DRM_AMD_DC_DCN)
fd187853 2989 case CHIP_RAVEN:
b4f199c7 2990 case CHIP_NAVI10:
8fceceb6 2991 case CHIP_NAVI14:
078655d9 2992 case CHIP_NAVI12:
e1c14c43 2993 case CHIP_RENOIR:
81d9bfb8
JFZ
2994#endif
2995#if defined(CONFIG_DRM_AMD_DC_DCN3_0)
2996 case CHIP_SIENNA_CICHLID:
a6c5308f 2997 case CHIP_NAVY_FLOUNDER:
42f8ffa1 2998#endif
fd187853 2999 return amdgpu_dc != 0;
4562236b
HW
3000#endif
3001 default:
93b09a9a
SS
3002 if (amdgpu_dc > 0)
3003 DRM_INFO("Display Core has been requested via kernel parameter "
3004 "but isn't supported by ASIC, ignoring\n");
4562236b
HW
3005 return false;
3006 }
3007}
3008
3009/**
3010 * amdgpu_device_has_dc_support - check if dc is supported
3011 *
3012 * @adev: amdgpu_device pointer
3013 *
3014 * Returns true for supported, false for not supported
3015 */
3016bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3017{
c997e8e2 3018 if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display)
2555039d
XY
3019 return false;
3020
4562236b
HW
3021 return amdgpu_device_asic_has_dc_support(adev->asic_type);
3022}
3023
d4535e2c
AG
3024
3025static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3026{
3027 struct amdgpu_device *adev =
3028 container_of(__work, struct amdgpu_device, xgmi_reset_work);
d95e8e97 3029 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
d4535e2c 3030
c6a6e2db
AG
3031 /* It's a bug to not have a hive within this function */
3032 if (WARN_ON(!hive))
3033 return;
3034
3035 /*
3036 * Use task barrier to synchronize all xgmi reset works across the
3037 * hive. task_barrier_enter and task_barrier_exit will block
3038 * until all the threads running the xgmi reset works reach
3039 * those points. task_barrier_full will do both blocks.
3040 */
3041 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3042
3043 task_barrier_enter(&hive->tb);
4a580877 3044 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
c6a6e2db
AG
3045
3046 if (adev->asic_reset_res)
3047 goto fail;
3048
3049 task_barrier_exit(&hive->tb);
4a580877 3050 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
c6a6e2db
AG
3051
3052 if (adev->asic_reset_res)
3053 goto fail;
43c4d576
JC
3054
3055 if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count)
3056 adev->mmhub.funcs->reset_ras_error_count(adev);
c6a6e2db
AG
3057 } else {
3058
3059 task_barrier_full(&hive->tb);
3060 adev->asic_reset_res = amdgpu_asic_reset(adev);
3061 }
ce316fa5 3062
c6a6e2db 3063fail:
d4535e2c 3064 if (adev->asic_reset_res)
fed184e9 3065 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
4a580877 3066 adev->asic_reset_res, adev_to_drm(adev)->unique);
d95e8e97 3067 amdgpu_put_xgmi_hive(hive);
d4535e2c
AG
3068}
3069
71f98027
AD
3070static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3071{
3072 char *input = amdgpu_lockup_timeout;
3073 char *timeout_setting = NULL;
3074 int index = 0;
3075 long timeout;
3076 int ret = 0;
3077
3078 /*
3079 * By default the timeout for non-compute jobs is 10000 ms and
3080 * there is no timeout enforced on compute jobs.
3081 * In SR-IOV or passthrough mode, the timeout for compute
b7b2a316 3082 * jobs is 60000 ms by default.
71f98027
AD
3083 */
3084 adev->gfx_timeout = msecs_to_jiffies(10000);
3085 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3086 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
b7b2a316 3087 adev->compute_timeout = msecs_to_jiffies(60000);
71f98027
AD
3088 else
3089 adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
3090
f440ff44 3091 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
71f98027 3092 while ((timeout_setting = strsep(&input, ",")) &&
f440ff44 3093 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
71f98027
AD
3094 ret = kstrtol(timeout_setting, 0, &timeout);
3095 if (ret)
3096 return ret;
3097
3098 if (timeout == 0) {
3099 index++;
3100 continue;
3101 } else if (timeout < 0) {
3102 timeout = MAX_SCHEDULE_TIMEOUT;
3103 } else {
3104 timeout = msecs_to_jiffies(timeout);
3105 }
3106
3107 switch (index++) {
3108 case 0:
3109 adev->gfx_timeout = timeout;
3110 break;
3111 case 1:
3112 adev->compute_timeout = timeout;
3113 break;
3114 case 2:
3115 adev->sdma_timeout = timeout;
3116 break;
3117 case 3:
3118 adev->video_timeout = timeout;
3119 break;
3120 default:
3121 break;
3122 }
3123 }
3124 /*
3125 * There is only one value specified and
3126 * it should apply to all non-compute jobs.
3127 */
bcccee89 3128 if (index == 1) {
71f98027 3129 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
bcccee89
ED
3130 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3131 adev->compute_timeout = adev->gfx_timeout;
3132 }
71f98027
AD
3133 }
3134
3135 return ret;
3136}
d4535e2c 3137
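/*
 * Illustrative examples of the lockup_timeout syntax parsed above
 * (values in milliseconds; 0 keeps the default, a negative value
 * means no timeout is enforced):
 *
 *	amdgpu.lockup_timeout=10000			one value: all non-compute queues
 *	amdgpu.lockup_timeout=10000,60000,10000,10000	gfx,compute,sdma,video
 */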
77f3a5cd
ND
3138static const struct attribute *amdgpu_dev_attributes[] = {
3139 &dev_attr_product_name.attr,
3140 &dev_attr_product_number.attr,
3141 &dev_attr_serial_number.attr,
3142 &dev_attr_pcie_replay_count.attr,
3143 NULL
3144};
3145
c9a6b82f 3146
d38ceaf9
AD
3147/**
3148 * amdgpu_device_init - initialize the driver
3149 *
3150 * @adev: amdgpu_device pointer
d38ceaf9
AD
3151 * @flags: driver flags
3152 *
3153 * Initializes the driver info and hw (all asics).
3154 * Returns 0 for success or an error on failure.
3155 * Called at driver startup.
3156 */
3157int amdgpu_device_init(struct amdgpu_device *adev,
d38ceaf9
AD
3158 uint32_t flags)
3159{
8aba21b7
LT
3160 struct drm_device *ddev = adev_to_drm(adev);
3161 struct pci_dev *pdev = adev->pdev;
d38ceaf9 3162 int r, i;
3840c5bc 3163 bool boco = false;
95844d20 3164 u32 max_MBps;
d38ceaf9
AD
3165
3166 adev->shutdown = false;
d38ceaf9 3167 adev->flags = flags;
4e66d7d2
YZ
3168
3169 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3170 adev->asic_type = amdgpu_force_asic_type;
3171 else
3172 adev->asic_type = flags & AMD_ASIC_MASK;
3173
d38ceaf9 3174 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
593aa2d2 3175 if (amdgpu_emu_mode == 1)
8bdab6bb 3176 adev->usec_timeout *= 10;
770d13b1 3177 adev->gmc.gart_size = 512 * 1024 * 1024;
d38ceaf9
AD
3178 adev->accel_working = false;
3179 adev->num_rings = 0;
3180 adev->mman.buffer_funcs = NULL;
3181 adev->mman.buffer_funcs_ring = NULL;
3182 adev->vm_manager.vm_pte_funcs = NULL;
0c88b430 3183 adev->vm_manager.vm_pte_num_scheds = 0;
132f34e4 3184 adev->gmc.gmc_funcs = NULL;
f54d1867 3185 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
b8866c26 3186 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
d38ceaf9
AD
3187
3188 adev->smc_rreg = &amdgpu_invalid_rreg;
3189 adev->smc_wreg = &amdgpu_invalid_wreg;
3190 adev->pcie_rreg = &amdgpu_invalid_rreg;
3191 adev->pcie_wreg = &amdgpu_invalid_wreg;
36b9a952
HR
3192 adev->pciep_rreg = &amdgpu_invalid_rreg;
3193 adev->pciep_wreg = &amdgpu_invalid_wreg;
4fa1c6a6
TZ
3194 adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3195 adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
d38ceaf9
AD
3196 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3197 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3198 adev->didt_rreg = &amdgpu_invalid_rreg;
3199 adev->didt_wreg = &amdgpu_invalid_wreg;
ccdbb20a
RZ
3200 adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3201 adev->gc_cac_wreg = &amdgpu_invalid_wreg;
d38ceaf9
AD
3202 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3203 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3204
3e39ab90
AD
3205 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3206 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3207 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
d38ceaf9
AD
3208
3209 /* mutex initializations are all done here so we
3210 * can recall functions without locking issues */
d38ceaf9 3211 atomic_set(&adev->irq.ih.lock, 0);
0e5ca0d1 3212 mutex_init(&adev->firmware.mutex);
d38ceaf9
AD
3213 mutex_init(&adev->pm.mutex);
3214 mutex_init(&adev->gfx.gpu_clock_mutex);
3215 mutex_init(&adev->srbm_mutex);
b8866c26 3216 mutex_init(&adev->gfx.pipe_reserve_mutex);
d23ee13f 3217 mutex_init(&adev->gfx.gfx_off_mutex);
d38ceaf9 3218 mutex_init(&adev->grbm_idx_mutex);
d38ceaf9 3219 mutex_init(&adev->mn_lock);
e23b74aa 3220 mutex_init(&adev->virt.vf_errors.lock);
d38ceaf9 3221 hash_init(adev->mn_hash);
53b3f8f4 3222 atomic_set(&adev->in_gpu_reset, 0);
6049db43 3223 init_rwsem(&adev->reset_sem);
32eaeae0 3224 mutex_init(&adev->psp.mutex);
bd052211 3225 mutex_init(&adev->notifier_lock);
d38ceaf9 3226
912dfc84
EQ
3227 r = amdgpu_device_check_arguments(adev);
3228 if (r)
3229 return r;
d38ceaf9 3230
d38ceaf9
AD
3231 spin_lock_init(&adev->mmio_idx_lock);
3232 spin_lock_init(&adev->smc_idx_lock);
3233 spin_lock_init(&adev->pcie_idx_lock);
3234 spin_lock_init(&adev->uvd_ctx_idx_lock);
3235 spin_lock_init(&adev->didt_idx_lock);
ccdbb20a 3236 spin_lock_init(&adev->gc_cac_idx_lock);
16abb5d2 3237 spin_lock_init(&adev->se_cac_idx_lock);
d38ceaf9 3238 spin_lock_init(&adev->audio_endpt_idx_lock);
95844d20 3239 spin_lock_init(&adev->mm_stats.lock);
d38ceaf9 3240
0c4e7fa5
CZ
3241 INIT_LIST_HEAD(&adev->shadow_list);
3242 mutex_init(&adev->shadow_list_lock);
3243
beff74bc
AD
3244 INIT_DELAYED_WORK(&adev->delayed_init_work,
3245 amdgpu_device_delayed_init_work_handler);
1e317b99
RZ
3246 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3247 amdgpu_device_delay_enable_gfx_off);
2dc80b00 3248
d4535e2c
AG
3249 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3250
d23ee13f 3251 adev->gfx.gfx_off_req_count = 1;
b6e79d9a 3252 adev->pm.ac_power = power_supply_is_system_supplied() > 0;
b1ddf548 3253
b265bdbd
EQ
3254 atomic_set(&adev->throttling_logging_enabled, 1);
3255 /*
3256 * If throttling continues, logging will be performed every minute
3257 * to avoid log flooding. "-1" is subtracted since the thermal
3258 * throttling interrupt comes every second. Thus, the total logging
3259 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3260 * for throttling interrupt) = 60 seconds.
3261 */
3262 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3263 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3264
0fa49558
AX
3265 /* Registers mapping */
3266 /* TODO: block userspace mapping of io register */
da69c161
KW
3267 if (adev->asic_type >= CHIP_BONAIRE) {
3268 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3269 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3270 } else {
3271 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3272 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3273 }
d38ceaf9 3274
d38ceaf9
AD
3275 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3276 if (adev->rmmio == NULL) {
3277 return -ENOMEM;
3278 }
3279 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3280 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3281
d38ceaf9
AD
3282 /* io port mapping */
3283 for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
3284 if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) {
3285 adev->rio_mem_size = pci_resource_len(adev->pdev, i);
3286 adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size);
3287 break;
3288 }
3289 }
3290 if (adev->rio_mem == NULL)
b64a18c5 3291 DRM_INFO("PCI I/O BAR is not found.\n");
d38ceaf9 3292
b2109d8e
JX
3293 /* enable PCIE atomic ops */
3294 r = pci_enable_atomic_ops_to_root(adev->pdev,
3295 PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3296 PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3297 if (r) {
3298 adev->have_atomics_support = false;
3299 DRM_INFO("PCIE atomic ops is not supported\n");
3300 } else {
3301 adev->have_atomics_support = true;
3302 }
3303
5494d864
AD
3304 amdgpu_device_get_pcie_info(adev);
3305
b239c017
JX
3306 if (amdgpu_mcbp)
3307 DRM_INFO("MCBP is enabled\n");
3308
5f84cc63
JX
3309 if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
3310 adev->enable_mes = true;
3311
3aa0115d
ML
3312 /* detect hw virtualization here */
3313 amdgpu_detect_virtualization(adev);
3314
dffa11b4
ML
3315 r = amdgpu_device_get_job_timeout_settings(adev);
3316 if (r) {
3317 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
4192f7b5 3318 goto failed_unmap;
a190d1c7
XY
3319 }
3320
d38ceaf9 3321 /* early init functions */
06ec9070 3322 r = amdgpu_device_ip_early_init(adev);
d38ceaf9 3323 if (r)
4192f7b5 3324 goto failed_unmap;
d38ceaf9 3325
6585661d
OZ
3326 /* doorbell bar mapping and doorbell index init*/
3327 amdgpu_device_doorbell_init(adev);
3328
d38ceaf9
AD
3329 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
3330 /* this will fail for cards that aren't VGA class devices, just
3331 * ignore it */
06ec9070 3332 vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode);
d38ceaf9 3333
31af062a 3334 if (amdgpu_device_supports_boco(ddev))
3840c5bc
AD
3335 boco = true;
3336 if (amdgpu_has_atpx() &&
3337 (amdgpu_is_atpx_hybrid() ||
3338 amdgpu_has_atpx_dgpu_power_cntl()) &&
3339 !pci_is_thunderbolt_attached(adev->pdev))
84c8b22e 3340 vga_switcheroo_register_client(adev->pdev,
3840c5bc
AD
3341 &amdgpu_switcheroo_ops, boco);
3342 if (boco)
d38ceaf9
AD
3343 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3344
9475a943
SL
3345 if (amdgpu_emu_mode == 1) {
3346 /* post the asic on emulation mode */
3347 emu_soc_asic_init(adev);
bfca0289 3348 goto fence_driver_init;
9475a943 3349 }
bfca0289 3350
4e99a44e
ML
3351 /* detect if we are with an SRIOV vbios */
3352 amdgpu_device_detect_sriov_bios(adev);
048765ad 3353
95e8e59e
AD
3354 /* check if we need to reset the asic
3355 * E.g., driver was not cleanly unloaded previously, etc.
3356 */
f14899fd 3357 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
95e8e59e
AD
3358 r = amdgpu_asic_reset(adev);
3359 if (r) {
3360 dev_err(adev->dev, "asic reset on init failed\n");
3361 goto failed;
3362 }
3363 }
3364
c9a6b82f
AG
3365 pci_enable_pcie_error_reporting(adev->ddev.pdev);
3366
d38ceaf9 3367 /* Post card if necessary */
39c640c0 3368 if (amdgpu_device_need_post(adev)) {
d38ceaf9 3369 if (!adev->bios) {
bec86378 3370 dev_err(adev->dev, "no vBIOS found\n");
83ba126a
AD
3371 r = -EINVAL;
3372 goto failed;
d38ceaf9 3373 }
bec86378 3374 DRM_INFO("GPU posting now...\n");
4d2997ab 3375 r = amdgpu_device_asic_init(adev);
4e99a44e
ML
3376 if (r) {
3377 dev_err(adev->dev, "gpu post error!\n");
3378 goto failed;
3379 }
d38ceaf9
AD
3380 }
3381
88b64e95
AD
3382 if (adev->is_atom_fw) {
3383 /* Initialize clocks */
3384 r = amdgpu_atomfirmware_get_clock_info(adev);
3385 if (r) {
3386 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
e23b74aa 3387 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
88b64e95
AD
3388 goto failed;
3389 }
3390 } else {
a5bde2f9
AD
3391 /* Initialize clocks */
3392 r = amdgpu_atombios_get_clock_info(adev);
3393 if (r) {
3394 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
e23b74aa 3395 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
89041940 3396 goto failed;
a5bde2f9
AD
3397 }
3398 /* init i2c buses */
4562236b
HW
3399 if (!amdgpu_device_has_dc_support(adev))
3400 amdgpu_atombios_i2c_init(adev);
2c1a2784 3401 }
d38ceaf9 3402
bfca0289 3403fence_driver_init:
d38ceaf9
AD
3404 /* Fence driver */
3405 r = amdgpu_fence_driver_init(adev);
2c1a2784
AD
3406 if (r) {
3407 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
e23b74aa 3408 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
83ba126a 3409 goto failed;
2c1a2784 3410 }
d38ceaf9
AD
3411
3412 /* init the mode config */
4a580877 3413 drm_mode_config_init(adev_to_drm(adev));
d38ceaf9 3414
06ec9070 3415 r = amdgpu_device_ip_init(adev);
d38ceaf9 3416 if (r) {
8840a387 3417 /* failed in exclusive mode due to timeout */
3418 if (amdgpu_sriov_vf(adev) &&
3419 !amdgpu_sriov_runtime(adev) &&
3420 amdgpu_virt_mmio_blocked(adev) &&
3421 !amdgpu_virt_wait_reset(adev)) {
3422 dev_err(adev->dev, "VF exclusive mode timeout\n");
1daee8b4
PD
3423 /* Don't send request since VF is inactive. */
3424 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3425 adev->virt.ops = NULL;
8840a387 3426 r = -EAGAIN;
3427 goto failed;
3428 }
06ec9070 3429 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
e23b74aa 3430 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
83ba126a 3431 goto failed;
d38ceaf9
AD
3432 }
3433
d69b8971
YZ
3434 dev_info(adev->dev,
3435 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
d7f72fe4
YZ
3436 adev->gfx.config.max_shader_engines,
3437 adev->gfx.config.max_sh_per_se,
3438 adev->gfx.config.max_cu_per_sh,
3439 adev->gfx.cu_info.number);
3440
d38ceaf9
AD
3441 adev->accel_working = true;
3442
e59c0205
AX
3443 amdgpu_vm_check_compute_bug(adev);
3444
95844d20
MO
3445 /* Initialize the buffer migration limit. */
3446 if (amdgpu_moverate >= 0)
3447 max_MBps = amdgpu_moverate;
3448 else
3449 max_MBps = 8; /* Allow 8 MB/s. */
3450 /* Get a log2 for easy divisions. */
3451 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
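	/* e.g. the default max_MBps of 8 yields log2_max_MBps = 3, letting the
	 * throttling math use shifts instead of divides; the max(1u, ...) guard
	 * keeps ilog2() well defined when amdgpu_moverate is set to 0.
	 */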
3452
9bc92b9c
ML
3453 amdgpu_fbdev_init(adev);
3454
d2f52ac8 3455 r = amdgpu_pm_sysfs_init(adev);
7c868b59
YT
3456 if (r) {
3457 adev->pm_sysfs_en = false;
d2f52ac8 3458 DRM_ERROR("registering pm debugfs failed (%d).\n", r);
7c868b59
YT
3459 } else
3460 adev->pm_sysfs_en = true;
d2f52ac8 3461
5bb23532 3462 r = amdgpu_ucode_sysfs_init(adev);
7c868b59
YT
3463 if (r) {
3464 adev->ucode_sysfs_en = false;
5bb23532 3465 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
7c868b59
YT
3466 } else
3467 adev->ucode_sysfs_en = true;
5bb23532 3468
d38ceaf9
AD
3469 if ((amdgpu_testing & 1)) {
3470 if (adev->accel_working)
3471 amdgpu_test_moves(adev);
3472 else
3473 DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");
3474 }
d38ceaf9
AD
3475 if (amdgpu_benchmarking) {
3476 if (adev->accel_working)
3477 amdgpu_benchmark(adev, amdgpu_benchmarking);
3478 else
3479 DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
3480 }
3481
b0adca4d
EQ
3482 /*
3483 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3484 	 * Otherwise the mgpu fan boost feature will be skipped because the
3485 	 * gpu instance count would come up short.
3486 */
3487 amdgpu_register_gpu_instance(adev);
3488
d38ceaf9
AD
3489 /* enable clockgating, etc. after ib tests, etc. since some blocks require
3490 * explicit gating rather than handling it automatically.
3491 */
06ec9070 3492 r = amdgpu_device_ip_late_init(adev);
2c1a2784 3493 if (r) {
06ec9070 3494 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
e23b74aa 3495 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
83ba126a 3496 goto failed;
2c1a2784 3497 }
d38ceaf9 3498
108c6a63 3499 /* must succeed. */
511fdbc3 3500 amdgpu_ras_resume(adev);
108c6a63 3501
beff74bc
AD
3502 queue_delayed_work(system_wq, &adev->delayed_init_work,
3503 msecs_to_jiffies(AMDGPU_RESUME_MS));
3504
2c738637
ML
3505 if (amdgpu_sriov_vf(adev))
3506 flush_delayed_work(&adev->delayed_init_work);
3507
77f3a5cd 3508 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
5aea5327 3509 if (r)
77f3a5cd 3510 dev_err(adev->dev, "Could not create amdgpu device attr\n");
bd607166 3511
d155bef0
AB
3512 if (IS_ENABLED(CONFIG_PERF_EVENTS))
3513 r = amdgpu_pmu_init(adev);
9c7c85f7
JK
3514 if (r)
3515 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3516
c1dd4aa6
AG
3517 	/* Keep the stored PCI config space at hand to restore it on a sudden PCI error */
3518 if (amdgpu_device_cache_pci_state(adev->pdev))
3519 pci_restore_state(pdev);
3520
d38ceaf9 3521 return 0;
83ba126a
AD
3522
3523failed:
89041940 3524 amdgpu_vf_error_trans_all(adev);
3840c5bc 3525 if (boco)
83ba126a 3526 vga_switcheroo_fini_domain_pm_ops(adev->dev);
8840a387 3527
4192f7b5
AD
3528failed_unmap:
3529 iounmap(adev->rmmio);
3530 adev->rmmio = NULL;
3531
83ba126a 3532 return r;
d38ceaf9
AD
3533}
3534
d38ceaf9
AD
3535/**
3536 * amdgpu_device_fini - tear down the driver
3537 *
3538 * @adev: amdgpu_device pointer
3539 *
3540 * Tear down the driver info (all asics).
3541 * Called at driver shutdown.
3542 */
3543void amdgpu_device_fini(struct amdgpu_device *adev)
3544{
aac89168 3545 dev_info(adev->dev, "amdgpu: finishing device.\n");
9f875167 3546 flush_delayed_work(&adev->delayed_init_work);
d0d13fe8 3547 adev->shutdown = true;
9f875167 3548
c1dd4aa6
AG
3549 kfree(adev->pci_state);
3550
752c683d
ML
3551 	/* make sure IB tests are finished before entering exclusive mode
3552 	 * to avoid preemption during the IB tests
3553 	 */
519b8b76 3554 if (amdgpu_sriov_vf(adev)) {
752c683d 3555 amdgpu_virt_request_full_gpu(adev, false);
519b8b76
BZ
3556 amdgpu_virt_fini_data_exchange(adev);
3557 }
752c683d 3558
e5b03032
ML
3559 /* disable all interrupts */
3560 amdgpu_irq_disable_all(adev);
ff97cba8
ML
3561 if (adev->mode_info.mode_config_initialized){
3562 if (!amdgpu_device_has_dc_support(adev))
4a580877 3563 drm_helper_force_disable_all(adev_to_drm(adev));
ff97cba8 3564 else
4a580877 3565 drm_atomic_helper_shutdown(adev_to_drm(adev));
ff97cba8 3566 }
d38ceaf9 3567 amdgpu_fence_driver_fini(adev);
7c868b59
YT
3568 if (adev->pm_sysfs_en)
3569 amdgpu_pm_sysfs_fini(adev);
d38ceaf9 3570 amdgpu_fbdev_fini(adev);
e230ac11 3571 amdgpu_device_ip_fini(adev);
75e1658e
ND
3572 release_firmware(adev->firmware.gpu_info_fw);
3573 adev->firmware.gpu_info_fw = NULL;
d38ceaf9
AD
3574 adev->accel_working = false;
3575 /* free i2c buses */
4562236b
HW
3576 if (!amdgpu_device_has_dc_support(adev))
3577 amdgpu_i2c_fini(adev);
bfca0289
SL
3578
3579 if (amdgpu_emu_mode != 1)
3580 amdgpu_atombios_fini(adev);
3581
d38ceaf9
AD
3582 kfree(adev->bios);
3583 adev->bios = NULL;
3840c5bc
AD
3584 if (amdgpu_has_atpx() &&
3585 (amdgpu_is_atpx_hybrid() ||
3586 amdgpu_has_atpx_dgpu_power_cntl()) &&
3587 !pci_is_thunderbolt_attached(adev->pdev))
84c8b22e 3588 vga_switcheroo_unregister_client(adev->pdev);
4a580877 3589 if (amdgpu_device_supports_boco(adev_to_drm(adev)))
83ba126a 3590 vga_switcheroo_fini_domain_pm_ops(adev->dev);
d38ceaf9
AD
3591 vga_client_register(adev->pdev, NULL, NULL, NULL);
3592 if (adev->rio_mem)
3593 pci_iounmap(adev->pdev, adev->rio_mem);
3594 adev->rio_mem = NULL;
3595 iounmap(adev->rmmio);
3596 adev->rmmio = NULL;
06ec9070 3597 amdgpu_device_doorbell_fini(adev);
e9bc1bf7 3598
7c868b59
YT
3599 if (adev->ucode_sysfs_en)
3600 amdgpu_ucode_sysfs_fini(adev);
77f3a5cd
ND
3601
3602 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
d155bef0
AB
3603 if (IS_ENABLED(CONFIG_PERF_EVENTS))
3604 amdgpu_pmu_fini(adev);
72de33f8 3605 if (adev->mman.discovery_bin)
a190d1c7 3606 amdgpu_discovery_fini(adev);
d38ceaf9
AD
3607}
3608
3609
3610/*
3611 * Suspend & resume.
3612 */
3613/**
810ddc3a 3614 * amdgpu_device_suspend - initiate device suspend
d38ceaf9 3615 *
87e3f136 3616 * @dev: drm dev pointer
87e3f136 3617  * @fbcon: notify the fbdev of suspend
d38ceaf9
AD
3618 *
3619 * Puts the hw in the suspend state (all asics).
3620 * Returns 0 for success or an error on failure.
3621 * Called at driver suspend.
3622 */
de185019 3623int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
d38ceaf9
AD
3624{
3625 struct amdgpu_device *adev;
3626 struct drm_crtc *crtc;
3627 struct drm_connector *connector;
f8d2d39e 3628 struct drm_connector_list_iter iter;
5ceb54c6 3629 int r;
d38ceaf9 3630
1348969a 3631 adev = drm_to_adev(dev);
d38ceaf9
AD
3632
3633 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3634 return 0;
3635
44779b43 3636 adev->in_suspend = true;
d38ceaf9
AD
3637 drm_kms_helper_poll_disable(dev);
3638
5f818173
S
3639 if (fbcon)
3640 amdgpu_fbdev_set_suspend(adev, 1);
3641
beff74bc 3642 cancel_delayed_work_sync(&adev->delayed_init_work);
a5459475 3643
4562236b
HW
3644 if (!amdgpu_device_has_dc_support(adev)) {
3645 /* turn off display hw */
3646 drm_modeset_lock_all(dev);
f8d2d39e
LP
3647 drm_connector_list_iter_begin(dev, &iter);
3648 drm_for_each_connector_iter(connector, &iter)
3649 drm_helper_connector_dpms(connector,
3650 DRM_MODE_DPMS_OFF);
3651 drm_connector_list_iter_end(&iter);
4562236b 3652 drm_modeset_unlock_all(dev);
fe1053b7
AD
3653 /* unpin the front buffers and cursors */
3654 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3655 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3656 struct drm_framebuffer *fb = crtc->primary->fb;
3657 struct amdgpu_bo *robj;
3658
91334223 3659 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
fe1053b7
AD
3660 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3661 r = amdgpu_bo_reserve(aobj, true);
3662 if (r == 0) {
3663 amdgpu_bo_unpin(aobj);
3664 amdgpu_bo_unreserve(aobj);
3665 }
756e6880 3666 }
756e6880 3667
fe1053b7
AD
3668 if (fb == NULL || fb->obj[0] == NULL) {
3669 continue;
3670 }
3671 robj = gem_to_amdgpu_bo(fb->obj[0]);
3672 /* don't unpin kernel fb objects */
3673 if (!amdgpu_fbdev_robj_is_fb(adev, robj)) {
3674 r = amdgpu_bo_reserve(robj, true);
3675 if (r == 0) {
3676 amdgpu_bo_unpin(robj);
3677 amdgpu_bo_unreserve(robj);
3678 }
d38ceaf9
AD
3679 }
3680 }
3681 }
fe1053b7 3682
5e6932fe 3683 amdgpu_ras_suspend(adev);
3684
fe1053b7
AD
3685 r = amdgpu_device_ip_suspend_phase1(adev);
3686
94fa5660
EQ
3687 amdgpu_amdkfd_suspend(adev, !fbcon);
3688
d38ceaf9
AD
3689 /* evict vram memory */
3690 amdgpu_bo_evict_vram(adev);
3691
5ceb54c6 3692 amdgpu_fence_driver_suspend(adev);
d38ceaf9 3693
fe1053b7 3694 r = amdgpu_device_ip_suspend_phase2(adev);
d38ceaf9 3695
a0a71e49
AD
3696 /* evict remaining vram memory
3697 * This second call to evict vram is to evict the gart page table
3698 * using the CPU.
3699 */
d38ceaf9
AD
3700 amdgpu_bo_evict_vram(adev);
3701
d38ceaf9
AD
3702 return 0;
3703}
3704
3705/**
810ddc3a 3706 * amdgpu_device_resume - initiate device resume
d38ceaf9 3707 *
87e3f136 3708 * @dev: drm dev pointer
87e3f136 3709  * @fbcon: notify the fbdev of resume
d38ceaf9
AD
3710 *
3711 * Bring the hw back to operating state (all asics).
3712 * Returns 0 for success or an error on failure.
3713 * Called at driver resume.
3714 */
de185019 3715int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
d38ceaf9
AD
3716{
3717 struct drm_connector *connector;
f8d2d39e 3718 struct drm_connector_list_iter iter;
1348969a 3719 struct amdgpu_device *adev = drm_to_adev(dev);
756e6880 3720 struct drm_crtc *crtc;
03161a6e 3721 int r = 0;
d38ceaf9
AD
3722
3723 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3724 return 0;
3725
d38ceaf9 3726 /* post card */
39c640c0 3727 if (amdgpu_device_need_post(adev)) {
4d2997ab 3728 r = amdgpu_device_asic_init(adev);
74b0b157 3729 if (r)
aac89168 3730 dev_err(adev->dev, "amdgpu asic init failed\n");
74b0b157 3731 }
d38ceaf9 3732
06ec9070 3733 r = amdgpu_device_ip_resume(adev);
e6707218 3734 if (r) {
aac89168 3735 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
4d3b9ae5 3736 return r;
e6707218 3737 }
5ceb54c6
AD
3738 amdgpu_fence_driver_resume(adev);
3739
d38ceaf9 3740
06ec9070 3741 r = amdgpu_device_ip_late_init(adev);
03161a6e 3742 if (r)
4d3b9ae5 3743 return r;
d38ceaf9 3744
beff74bc
AD
3745 queue_delayed_work(system_wq, &adev->delayed_init_work,
3746 msecs_to_jiffies(AMDGPU_RESUME_MS));
3747
fe1053b7
AD
3748 if (!amdgpu_device_has_dc_support(adev)) {
3749 /* pin cursors */
3750 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3751 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3752
91334223 3753 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
fe1053b7
AD
3754 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3755 r = amdgpu_bo_reserve(aobj, true);
3756 if (r == 0) {
3757 r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM);
3758 if (r != 0)
aac89168 3759 dev_err(adev->dev, "Failed to pin cursor BO (%d)\n", r);
fe1053b7
AD
3760 amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj);
3761 amdgpu_bo_unreserve(aobj);
3762 }
756e6880
AD
3763 }
3764 }
3765 }
9593f4d6 3766 r = amdgpu_amdkfd_resume(adev, !fbcon);
ba997709
YZ
3767 if (r)
3768 return r;
756e6880 3769
96a5d8d4 3770 /* Make sure IB tests flushed */
beff74bc 3771 flush_delayed_work(&adev->delayed_init_work);
96a5d8d4 3772
d38ceaf9
AD
3773 /* blat the mode back in */
3774 if (fbcon) {
4562236b
HW
3775 if (!amdgpu_device_has_dc_support(adev)) {
3776 /* pre DCE11 */
3777 drm_helper_resume_force_mode(dev);
3778
3779 /* turn on display hw */
3780 drm_modeset_lock_all(dev);
f8d2d39e
LP
3781
3782 drm_connector_list_iter_begin(dev, &iter);
3783 drm_for_each_connector_iter(connector, &iter)
3784 drm_helper_connector_dpms(connector,
3785 DRM_MODE_DPMS_ON);
3786 drm_connector_list_iter_end(&iter);
3787
4562236b 3788 drm_modeset_unlock_all(dev);
d38ceaf9 3789 }
4d3b9ae5 3790 amdgpu_fbdev_set_suspend(adev, 0);
d38ceaf9
AD
3791 }
3792
3793 drm_kms_helper_poll_enable(dev);
23a1a9e5 3794
5e6932fe 3795 amdgpu_ras_resume(adev);
3796
23a1a9e5
L
3797 /*
3798 * Most of the connector probing functions try to acquire runtime pm
3799 * refs to ensure that the GPU is powered on when connector polling is
3800 * performed. Since we're calling this from a runtime PM callback,
3801 * trying to acquire rpm refs will cause us to deadlock.
3802 *
3803 * Since we're guaranteed to be holding the rpm lock, it's safe to
3804 * temporarily disable the rpm helpers so this doesn't deadlock us.
3805 */
3806#ifdef CONFIG_PM
3807 dev->dev->power.disable_depth++;
3808#endif
4562236b
HW
3809 if (!amdgpu_device_has_dc_support(adev))
3810 drm_helper_hpd_irq_event(dev);
3811 else
3812 drm_kms_helper_hotplug_event(dev);
23a1a9e5
L
3813#ifdef CONFIG_PM
3814 dev->dev->power.disable_depth--;
3815#endif
44779b43
RZ
3816 adev->in_suspend = false;
3817
4d3b9ae5 3818 return 0;
d38ceaf9
AD
3819}
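/*
 * Illustrative sketch (not part of this file): how the suspend/resume entry
 * points above are typically consumed from dev_pm_ops callbacks. The wrapper
 * names below are assumptions for illustration; the real callbacks live
 * elsewhere in the driver.
 */
static int example_pmops_suspend(struct device *dev)
{
	struct drm_device *drm_dev = dev_get_drvdata(dev);

	/* fbcon = true so the fbdev emulation is notified of the transition */
	return amdgpu_device_suspend(drm_dev, true);
}

static int example_pmops_resume(struct device *dev)
{
	struct drm_device *drm_dev = dev_get_drvdata(dev);

	return amdgpu_device_resume(drm_dev, true);
}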
3820
e3ecdffa
AD
3821/**
3822 * amdgpu_device_ip_check_soft_reset - did soft reset succeed
3823 *
3824 * @adev: amdgpu_device pointer
3825 *
3826 * The list of all the hardware IPs that make up the asic is walked and
3827 * the check_soft_reset callbacks are run. check_soft_reset determines
3828 * if the asic is still hung or not.
3829 * Returns true if any of the IPs are still in a hung state, false if not.
3830 */
06ec9070 3831static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
63fbf42f
CZ
3832{
3833 int i;
3834 bool asic_hang = false;
3835
f993d628
ML
3836 if (amdgpu_sriov_vf(adev))
3837 return true;
3838
8bc04c29
AD
3839 if (amdgpu_asic_need_full_reset(adev))
3840 return true;
3841
63fbf42f 3842 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 3843 if (!adev->ip_blocks[i].status.valid)
63fbf42f 3844 continue;
a1255107
AD
3845 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
3846 adev->ip_blocks[i].status.hang =
3847 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
3848 if (adev->ip_blocks[i].status.hang) {
aac89168 3849 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
63fbf42f
CZ
3850 asic_hang = true;
3851 }
3852 }
3853 return asic_hang;
3854}
3855
e3ecdffa
AD
3856/**
3857 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
3858 *
3859 * @adev: amdgpu_device pointer
3860 *
3861 * The list of all the hardware IPs that make up the asic is walked and the
3862 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset
3863 * handles any IP specific hardware or software state changes that are
3864 * necessary for a soft reset to succeed.
3865 * Returns 0 on success, negative error code on failure.
3866 */
06ec9070 3867static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
d31a501e
CZ
3868{
3869 int i, r = 0;
3870
3871 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 3872 if (!adev->ip_blocks[i].status.valid)
d31a501e 3873 continue;
a1255107
AD
3874 if (adev->ip_blocks[i].status.hang &&
3875 adev->ip_blocks[i].version->funcs->pre_soft_reset) {
3876 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
d31a501e
CZ
3877 if (r)
3878 return r;
3879 }
3880 }
3881
3882 return 0;
3883}
3884
e3ecdffa
AD
3885/**
3886 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
3887 *
3888 * @adev: amdgpu_device pointer
3889 *
3890 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu
3891 * reset is necessary to recover.
3892 * Returns true if a full asic reset is required, false if not.
3893 */
06ec9070 3894static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
35d782fe 3895{
da146d3b
AD
3896 int i;
3897
8bc04c29
AD
3898 if (amdgpu_asic_need_full_reset(adev))
3899 return true;
3900
da146d3b 3901 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 3902 if (!adev->ip_blocks[i].status.valid)
da146d3b 3903 continue;
a1255107
AD
3904 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
3905 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
3906 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
98512bb8
KW
3907 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
3908 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
a1255107 3909 if (adev->ip_blocks[i].status.hang) {
aac89168 3910 				dev_info(adev->dev, "Some blocks need full reset!\n");
da146d3b
AD
3911 return true;
3912 }
3913 }
35d782fe
CZ
3914 }
3915 return false;
3916}
3917
e3ecdffa
AD
3918/**
3919 * amdgpu_device_ip_soft_reset - do a soft reset
3920 *
3921 * @adev: amdgpu_device pointer
3922 *
3923 * The list of all the hardware IPs that make up the asic is walked and the
3924 * soft_reset callbacks are run if the block is hung. soft_reset handles any
3925 * IP specific hardware or software state changes that are necessary to soft
3926 * reset the IP.
3927 * Returns 0 on success, negative error code on failure.
3928 */
06ec9070 3929static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
35d782fe
CZ
3930{
3931 int i, r = 0;
3932
3933 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 3934 if (!adev->ip_blocks[i].status.valid)
35d782fe 3935 continue;
a1255107
AD
3936 if (adev->ip_blocks[i].status.hang &&
3937 adev->ip_blocks[i].version->funcs->soft_reset) {
3938 r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
35d782fe
CZ
3939 if (r)
3940 return r;
3941 }
3942 }
3943
3944 return 0;
3945}
3946
e3ecdffa
AD
3947/**
3948 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
3949 *
3950 * @adev: amdgpu_device pointer
3951 *
3952 * The list of all the hardware IPs that make up the asic is walked and the
3953 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
3954 * handles any IP specific hardware or software state changes that are
3955 * necessary after the IP has been soft reset.
3956 * Returns 0 on success, negative error code on failure.
3957 */
06ec9070 3958static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
35d782fe
CZ
3959{
3960 int i, r = 0;
3961
3962 for (i = 0; i < adev->num_ip_blocks; i++) {
a1255107 3963 if (!adev->ip_blocks[i].status.valid)
35d782fe 3964 continue;
a1255107
AD
3965 if (adev->ip_blocks[i].status.hang &&
3966 adev->ip_blocks[i].version->funcs->post_soft_reset)
3967 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
35d782fe
CZ
3968 if (r)
3969 return r;
3970 }
3971
3972 return 0;
3973}
3974
e3ecdffa 3975/**
c33adbc7 3976 * amdgpu_device_recover_vram - Recover some VRAM contents
e3ecdffa
AD
3977 *
3978 * @adev: amdgpu_device pointer
3979 *
3980 * Restores the contents of VRAM buffers from the shadows in GTT. Used to
3981 * restore things like GPUVM page tables after a GPU reset where
3982 * the contents of VRAM might be lost.
403009bf
CK
3983 *
3984 * Returns:
3985 * 0 on success, negative error code on failure.
e3ecdffa 3986 */
c33adbc7 3987static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
c41d1cf6 3988{
c41d1cf6 3989 struct dma_fence *fence = NULL, *next = NULL;
403009bf
CK
3990 struct amdgpu_bo *shadow;
3991 long r = 1, tmo;
c41d1cf6
ML
3992
3993 if (amdgpu_sriov_runtime(adev))
b045d3af 3994 tmo = msecs_to_jiffies(8000);
c41d1cf6
ML
3995 else
3996 tmo = msecs_to_jiffies(100);
3997
aac89168 3998 dev_info(adev->dev, "recover vram bo from shadow start\n");
c41d1cf6 3999 mutex_lock(&adev->shadow_list_lock);
403009bf
CK
4000 list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {
4001
4002 /* No need to recover an evicted BO */
4003 if (shadow->tbo.mem.mem_type != TTM_PL_TT ||
b575f10d 4004 shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET ||
403009bf
CK
4005 shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM)
4006 continue;
4007
4008 r = amdgpu_bo_restore_shadow(shadow, &next);
4009 if (r)
4010 break;
4011
c41d1cf6 4012 if (fence) {
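			/* dma_fence_wait_timeout() returns the remaining timeout in
			 * jiffies on success, 0 on timeout, or a negative errno; the
			 * checks below map those to -ETIMEDOUT or the error code.
			 */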
1712fb1a 4013 tmo = dma_fence_wait_timeout(fence, false, tmo);
403009bf
CK
4014 dma_fence_put(fence);
4015 fence = next;
1712fb1a 4016 if (tmo == 0) {
4017 r = -ETIMEDOUT;
c41d1cf6 4018 break;
1712fb1a 4019 } else if (tmo < 0) {
4020 r = tmo;
4021 break;
4022 }
403009bf
CK
4023 } else {
4024 fence = next;
c41d1cf6 4025 }
c41d1cf6
ML
4026 }
4027 mutex_unlock(&adev->shadow_list_lock);
4028
403009bf
CK
4029 if (fence)
4030 tmo = dma_fence_wait_timeout(fence, false, tmo);
c41d1cf6
ML
4031 dma_fence_put(fence);
4032
1712fb1a 4033 if (r < 0 || tmo <= 0) {
aac89168 4034 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
403009bf
CK
4035 return -EIO;
4036 }
c41d1cf6 4037
aac89168 4038 dev_info(adev->dev, "recover vram bo from shadow done\n");
403009bf 4039 return 0;
c41d1cf6
ML
4040}
4041
a90ad3c2 4042
e3ecdffa 4043/**
06ec9070 4044 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
5740682e
ML
4045 *
4046 * @adev: amdgpu device pointer
87e3f136 4047 * @from_hypervisor: request from hypervisor
5740682e
ML
4048 *
4049  * Do VF FLR and reinitialize the ASIC.
3f48c681 4050  * Returns 0 on success, an error code otherwise.
e3ecdffa
AD
4051 */
4052static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4053 bool from_hypervisor)
5740682e
ML
4054{
4055 int r;
4056
4057 if (from_hypervisor)
4058 r = amdgpu_virt_request_full_gpu(adev, true);
4059 else
4060 r = amdgpu_virt_reset_gpu(adev);
4061 if (r)
4062 return r;
a90ad3c2 4063
b639c22c
JZ
4064 amdgpu_amdkfd_pre_reset(adev);
4065
a90ad3c2 4066 /* Resume IP prior to SMC */
06ec9070 4067 r = amdgpu_device_ip_reinit_early_sriov(adev);
5740682e
ML
4068 if (r)
4069 goto error;
a90ad3c2 4070
c9ffa427 4071 amdgpu_virt_init_data_exchange(adev);
a90ad3c2 4072 /* we need recover gart prior to run SMC/CP/SDMA resume */
6c28aed6 4073 amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
a90ad3c2 4074
7a3e0bb2
RZ
4075 r = amdgpu_device_fw_loading(adev);
4076 if (r)
4077 return r;
4078
a90ad3c2 4079 /* now we are okay to resume SMC/CP/SDMA */
06ec9070 4080 r = amdgpu_device_ip_reinit_late_sriov(adev);
5740682e
ML
4081 if (r)
4082 goto error;
a90ad3c2
ML
4083
4084 amdgpu_irq_gpu_reset_resume_helper(adev);
5740682e 4085 r = amdgpu_ib_ring_tests(adev);
f81e8d53 4086 amdgpu_amdkfd_post_reset(adev);
a90ad3c2 4087
abc34253
ED
4088error:
4089 amdgpu_virt_release_full_gpu(adev, true);
c41d1cf6 4090 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
e3526257 4091 amdgpu_inc_vram_lost(adev);
c33adbc7 4092 r = amdgpu_device_recover_vram(adev);
a90ad3c2
ML
4093 }
4094
4095 return r;
4096}
4097
9a1cddd6 4098/**
4099  * amdgpu_device_has_job_running - check if there is any job in the mirror list
4100 *
4101 * @adev: amdgpu device pointer
4102 *
4103  * check if there is any job in the mirror list
4104 */
4105bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4106{
4107 int i;
4108 struct drm_sched_job *job;
4109
4110 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4111 struct amdgpu_ring *ring = adev->rings[i];
4112
4113 if (!ring || !ring->sched.thread)
4114 continue;
4115
4116 spin_lock(&ring->sched.job_list_lock);
4117 job = list_first_entry_or_null(&ring->sched.ring_mirror_list,
4118 struct drm_sched_job, node);
4119 spin_unlock(&ring->sched.job_list_lock);
4120 if (job)
4121 return true;
4122 }
4123 return false;
4124}
4125
12938fad
CK
4126/**
4127 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4128 *
4129 * @adev: amdgpu device pointer
4130 *
4131 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4132 * a hung GPU.
4133 */
4134bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4135{
4136 if (!amdgpu_device_ip_check_soft_reset(adev)) {
aac89168 4137 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
12938fad
CK
4138 return false;
4139 }
4140
3ba7b418
AG
4141 if (amdgpu_gpu_recovery == 0)
4142 goto disabled;
4143
4144 if (amdgpu_sriov_vf(adev))
4145 return true;
4146
4147 if (amdgpu_gpu_recovery == -1) {
4148 switch (adev->asic_type) {
fc42d47c
AG
4149 case CHIP_BONAIRE:
4150 case CHIP_HAWAII:
3ba7b418
AG
4151 case CHIP_TOPAZ:
4152 case CHIP_TONGA:
4153 case CHIP_FIJI:
4154 case CHIP_POLARIS10:
4155 case CHIP_POLARIS11:
4156 case CHIP_POLARIS12:
4157 case CHIP_VEGAM:
4158 case CHIP_VEGA20:
4159 case CHIP_VEGA10:
4160 case CHIP_VEGA12:
c43b849f 4161 case CHIP_RAVEN:
e9d4cf91 4162 case CHIP_ARCTURUS:
2cb44fb0 4163 case CHIP_RENOIR:
658c6639
AD
4164 case CHIP_NAVI10:
4165 case CHIP_NAVI14:
4166 case CHIP_NAVI12:
131a3c74 4167 case CHIP_SIENNA_CICHLID:
3ba7b418
AG
4168 break;
4169 default:
4170 goto disabled;
4171 }
12938fad
CK
4172 }
4173
4174 return true;
3ba7b418
AG
4175
4176disabled:
aac89168 4177 dev_info(adev->dev, "GPU recovery disabled.\n");
3ba7b418 4178 return false;
12938fad
CK
4179}
4180
5c6dd71e 4181
26bc5340
AG
4182static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4183 struct amdgpu_job *job,
4184 bool *need_full_reset_arg)
4185{
4186 int i, r = 0;
4187 bool need_full_reset = *need_full_reset_arg;
71182665 4188
728e7e0c
JZ
4189 amdgpu_debugfs_wait_dump(adev);
4190
b602ca5f
TZ
4191 if (amdgpu_sriov_vf(adev)) {
4192 /* stop the data exchange thread */
4193 amdgpu_virt_fini_data_exchange(adev);
4194 }
4195
71182665 4196 /* block all schedulers and reset given job's ring */
0875dc9e
CZ
4197 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4198 struct amdgpu_ring *ring = adev->rings[i];
4199
51687759 4200 if (!ring || !ring->sched.thread)
0875dc9e 4201 continue;
5740682e 4202
2f9d4084
ML
4203 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4204 amdgpu_fence_driver_force_completion(ring);
0875dc9e 4205 }
d38ceaf9 4206
222b5f04
AG
4207 if(job)
4208 drm_sched_increase_karma(&job->base);
4209
1d721ed6 4210 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
26bc5340
AG
4211 if (!amdgpu_sriov_vf(adev)) {
4212
4213 if (!need_full_reset)
4214 need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4215
4216 if (!need_full_reset) {
4217 amdgpu_device_ip_pre_soft_reset(adev);
4218 r = amdgpu_device_ip_soft_reset(adev);
4219 amdgpu_device_ip_post_soft_reset(adev);
4220 if (r || amdgpu_device_ip_check_soft_reset(adev)) {
aac89168 4221 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
26bc5340
AG
4222 need_full_reset = true;
4223 }
4224 }
4225
4226 if (need_full_reset)
4227 r = amdgpu_device_ip_suspend(adev);
4228
4229 *need_full_reset_arg = need_full_reset;
4230 }
4231
4232 return r;
4233}
4234
041a62bc 4235static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
26bc5340 4236 struct list_head *device_list_handle,
7ac71382
AG
4237 bool *need_full_reset_arg,
4238 bool skip_hw_reset)
26bc5340
AG
4239{
4240 struct amdgpu_device *tmp_adev = NULL;
4241 bool need_full_reset = *need_full_reset_arg, vram_lost = false;
4242 int r = 0;
4243
4244 /*
4245 	 * ASIC reset has to be done on all XGMI hive nodes ASAP
4246 	 * to allow proper link negotiation in FW (within 1 sec)
4247 */
7ac71382 4248 if (!skip_hw_reset && need_full_reset) {
26bc5340 4249 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
041a62bc 4250 /* For XGMI run all resets in parallel to speed up the process */
d4535e2c 4251 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
c96cf282 4252 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
d4535e2c
AG
4253 r = -EALREADY;
4254 } else
4255 r = amdgpu_asic_reset(tmp_adev);
d4535e2c 4256
041a62bc 4257 if (r) {
aac89168 4258 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
4a580877 4259 r, adev_to_drm(tmp_adev)->unique);
041a62bc 4260 break;
ce316fa5
LM
4261 }
4262 }
4263
041a62bc
AG
4264 /* For XGMI wait for all resets to complete before proceed */
4265 if (!r) {
ce316fa5
LM
4266 list_for_each_entry(tmp_adev, device_list_handle,
4267 gmc.xgmi.head) {
4268 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4269 flush_work(&tmp_adev->xgmi_reset_work);
4270 r = tmp_adev->asic_reset_res;
4271 if (r)
4272 break;
ce316fa5
LM
4273 }
4274 }
4275 }
ce316fa5 4276 }
26bc5340 4277
43c4d576
JC
4278 if (!r && amdgpu_ras_intr_triggered()) {
4279 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4280 if (tmp_adev->mmhub.funcs &&
4281 tmp_adev->mmhub.funcs->reset_ras_error_count)
4282 tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev);
4283 }
4284
00eaa571 4285 amdgpu_ras_intr_cleared();
43c4d576 4286 }
00eaa571 4287
26bc5340
AG
4288 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4289 if (need_full_reset) {
4290 /* post card */
4d2997ab 4291 if (amdgpu_device_asic_init(tmp_adev))
aac89168 4292 dev_warn(tmp_adev->dev, "asic atom init failed!");
26bc5340
AG
4293
4294 if (!r) {
4295 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4296 r = amdgpu_device_ip_resume_phase1(tmp_adev);
4297 if (r)
4298 goto out;
4299
4300 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4301 if (vram_lost) {
77e7f829 4302 DRM_INFO("VRAM is lost due to GPU reset!\n");
e3526257 4303 amdgpu_inc_vram_lost(tmp_adev);
26bc5340
AG
4304 }
4305
6c28aed6 4306 r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
26bc5340
AG
4307 if (r)
4308 goto out;
4309
4310 r = amdgpu_device_fw_loading(tmp_adev);
4311 if (r)
4312 return r;
4313
4314 r = amdgpu_device_ip_resume_phase2(tmp_adev);
4315 if (r)
4316 goto out;
4317
4318 if (vram_lost)
4319 amdgpu_device_fill_reset_magic(tmp_adev);
4320
fdafb359
EQ
4321 /*
4322 * Add this ASIC as tracked as reset was already
4323 * complete successfully.
4324 */
4325 amdgpu_register_gpu_instance(tmp_adev);
4326
7c04ca50 4327 r = amdgpu_device_ip_late_init(tmp_adev);
4328 if (r)
4329 goto out;
4330
565d1941
EQ
4331 amdgpu_fbdev_set_suspend(tmp_adev, 0);
4332
e8fbaf03
GC
4333 /*
4334 			 * The GPU enters a bad state once the number of faulty
4335 			 * pages flagged by ECC reaches the threshold, and RAS
4336 			 * recovery is scheduled next. So add one check here to
4337 			 * break out of recovery if the bad page threshold has
4338 			 * indeed been exceeded, and remind the user to retire
4339 			 * this GPU or to set a bigger bad_page_threshold value
4340 			 * so the issue can be worked around the next time the
4341 			 * driver is probed.
4342 */
4343 if (!amdgpu_ras_check_err_threshold(tmp_adev)) {
4344 /* must succeed. */
4345 amdgpu_ras_resume(tmp_adev);
4346 } else {
4347 r = -EINVAL;
4348 goto out;
4349 }
e79a04d5 4350
26bc5340
AG
4351 /* Update PSP FW topology after reset */
4352 if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4353 r = amdgpu_xgmi_update_topology(hive, tmp_adev);
4354 }
4355 }
4356
26bc5340
AG
4357out:
4358 if (!r) {
4359 amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4360 r = amdgpu_ib_ring_tests(tmp_adev);
4361 if (r) {
4362 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
4363 r = amdgpu_device_ip_suspend(tmp_adev);
4364 need_full_reset = true;
4365 r = -EAGAIN;
4366 goto end;
4367 }
4368 }
4369
4370 if (!r)
4371 r = amdgpu_device_recover_vram(tmp_adev);
4372 else
4373 tmp_adev->asic_reset_res = r;
4374 }
4375
4376end:
4377 *need_full_reset_arg = need_full_reset;
4378 return r;
4379}
4380
08ebb485
DL
4381static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
4382 struct amdgpu_hive_info *hive)
26bc5340 4383{
53b3f8f4
DL
4384 if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
4385 return false;
4386
08ebb485
DL
4387 if (hive) {
4388 down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
4389 } else {
4390 down_write(&adev->reset_sem);
4391 }
5740682e 4392
26bc5340 4393 atomic_inc(&adev->gpu_reset_counter);
a3a09142
AD
4394 switch (amdgpu_asic_reset_method(adev)) {
4395 case AMD_RESET_METHOD_MODE1:
4396 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
4397 break;
4398 case AMD_RESET_METHOD_MODE2:
4399 adev->mp1_state = PP_MP1_STATE_RESET;
4400 break;
4401 default:
4402 adev->mp1_state = PP_MP1_STATE_NONE;
4403 break;
4404 }
1d721ed6
AG
4405
4406 return true;
26bc5340 4407}
d38ceaf9 4408
26bc5340
AG
4409static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
4410{
89041940 4411 amdgpu_vf_error_trans_all(adev);
a3a09142 4412 adev->mp1_state = PP_MP1_STATE_NONE;
53b3f8f4 4413 atomic_set(&adev->in_gpu_reset, 0);
6049db43 4414 up_write(&adev->reset_sem);
26bc5340
AG
4415}
4416
3f12acc8
EQ
4417static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
4418{
4419 struct pci_dev *p = NULL;
4420
4421 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4422 adev->pdev->bus->number, 1);
4423 if (p) {
4424 pm_runtime_enable(&(p->dev));
4425 pm_runtime_resume(&(p->dev));
4426 }
4427}
4428
4429static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
4430{
4431 enum amd_reset_method reset_method;
4432 struct pci_dev *p = NULL;
4433 u64 expires;
4434
4435 /*
4436 	 * For now, only BACO and mode1 reset are confirmed
4437 	 * to suffer the audio issue if not properly suspended.
4438 */
4439 reset_method = amdgpu_asic_reset_method(adev);
4440 if ((reset_method != AMD_RESET_METHOD_BACO) &&
4441 (reset_method != AMD_RESET_METHOD_MODE1))
4442 return -EINVAL;
4443
4444 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4445 adev->pdev->bus->number, 1);
4446 if (!p)
4447 return -ENODEV;
4448
4449 expires = pm_runtime_autosuspend_expiration(&(p->dev));
4450 if (!expires)
4451 /*
4452 		 * If we cannot get the audio device autosuspend delay,
4453 		 * a fixed 4s interval will be used. Since 3s is the
4454 		 * audio controller's default autosuspend delay setting,
4455 		 * the 4s used here is guaranteed to cover it.
4456 */
54b7feb9 4457 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
3f12acc8
EQ
4458
4459 while (!pm_runtime_status_suspended(&(p->dev))) {
4460 if (!pm_runtime_suspend(&(p->dev)))
4461 break;
4462
4463 if (expires < ktime_get_mono_fast_ns()) {
4464 dev_warn(adev->dev, "failed to suspend display audio\n");
4465 /* TODO: abort the succeeding gpu reset? */
4466 return -ETIMEDOUT;
4467 }
4468 }
4469
4470 pm_runtime_disable(&(p->dev));
4471
4472 return 0;
4473}
4474
26bc5340
AG
4475/**
4476 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
4477 *
4478 * @adev: amdgpu device pointer
4479 * @job: which job trigger hang
4480 *
4481 * Attempt to reset the GPU if it has hung (all asics).
4482 * Attempt to do soft-reset or full-reset and reinitialize Asic
4483 * Returns 0 for success or an error on failure.
4484 */
4485
4486int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
4487 struct amdgpu_job *job)
4488{
1d721ed6 4489 struct list_head device_list, *device_list_handle = NULL;
7dd8c205
EQ
4490 bool need_full_reset = false;
4491 bool job_signaled = false;
26bc5340 4492 struct amdgpu_hive_info *hive = NULL;
26bc5340 4493 struct amdgpu_device *tmp_adev = NULL;
1d721ed6 4494 int i, r = 0;
bb5c7235 4495 bool need_emergency_restart = false;
3f12acc8 4496 bool audio_suspended = false;
26bc5340 4497
bb5c7235
WS
4498 /**
4499 * Special case: RAS triggered and full reset isn't supported
4500 */
4501 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
4502
d5ea093e
AG
4503 /*
4504 * Flush RAM to disk so that after reboot
4505 * the user can read log and see why the system rebooted.
4506 */
bb5c7235 4507 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
d5ea093e
AG
4508 DRM_WARN("Emergency reboot.");
4509
4510 ksys_sync_helper();
4511 emergency_restart();
4512 }
4513
b823821f 4514 dev_info(adev->dev, "GPU %s begin!\n",
bb5c7235 4515 need_emergency_restart ? "jobs stop":"reset");
26bc5340
AG
4516
4517 /*
1d721ed6
AG
4518 	 * Here we trylock to avoid a chain of resets executing from
4519 	 * either a trigger by jobs on different adevs in an XGMI hive or jobs on
4520 	 * different schedulers for the same device while this TO handler is running.
4521 	 * We always reset all schedulers for a device and all devices for an XGMI
4522 	 * hive, so that should take care of them too.
26bc5340 4523 */
d95e8e97 4524 hive = amdgpu_get_xgmi_hive(adev);
53b3f8f4
DL
4525 if (hive) {
4526 if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
4527 DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
4528 job ? job->base.id : -1, hive->hive_id);
d95e8e97 4529 amdgpu_put_xgmi_hive(hive);
53b3f8f4
DL
4530 return 0;
4531 }
4532 mutex_lock(&hive->hive_lock);
1d721ed6 4533 }
26bc5340 4534
9e94d22c
EQ
4535 /*
4536 * Build list of devices to reset.
4537 * In case we are in XGMI hive mode, resort the device list
4538 * to put adev in the 1st position.
4539 */
4540 INIT_LIST_HEAD(&device_list);
4541 if (adev->gmc.xgmi.num_physical_nodes > 1) {
4542 if (!hive)
26bc5340 4543 return -ENODEV;
9e94d22c
EQ
4544 if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
4545 list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
26bc5340
AG
4546 device_list_handle = &hive->device_list;
4547 } else {
4548 list_add_tail(&adev->gmc.xgmi.head, &device_list);
4549 device_list_handle = &device_list;
4550 }
4551
1d721ed6
AG
4552 /* block all schedulers and reset given job's ring */
4553 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
08ebb485 4554 if (!amdgpu_device_lock_adev(tmp_adev, hive)) {
aac89168 4555 dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
9e94d22c 4556 job ? job->base.id : -1);
cbfd17f7
DL
4557 r = 0;
4558 goto skip_recovery;
7c6e68c7
AG
4559 }
4560
3f12acc8
EQ
4561 /*
4562 * Try to put the audio codec into suspend state
4563 * before gpu reset started.
4564 *
4565 		 * The power domain of the graphics device is
4566 		 * shared with the AZ power domain. Without this,
4567 * we may change the audio hardware from behind
4568 * the audio driver's back. That will trigger
4569 * some audio codec errors.
4570 */
4571 if (!amdgpu_device_suspend_display_audio(tmp_adev))
4572 audio_suspended = true;
4573
9e94d22c
EQ
4574 amdgpu_ras_set_error_query_ready(tmp_adev, false);
4575
52fb44cf
EQ
4576 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
4577
9e94d22c
EQ
4578 if (!amdgpu_sriov_vf(tmp_adev))
4579 amdgpu_amdkfd_pre_reset(tmp_adev);
4580
12ffa55d
AG
4581 /*
4582 		 * Mark these ASICs to be reset as untracked first
4583 		 * and add them back after the reset has completed
4584 */
4585 amdgpu_unregister_gpu_instance(tmp_adev);
4586
a2f63ee8 4587 amdgpu_fbdev_set_suspend(tmp_adev, 1);
565d1941 4588
f1c1314b 4589 /* disable ras on ALL IPs */
bb5c7235 4590 if (!need_emergency_restart &&
b823821f 4591 amdgpu_device_ip_need_full_reset(tmp_adev))
f1c1314b 4592 amdgpu_ras_suspend(tmp_adev);
4593
1d721ed6
AG
4594 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4595 struct amdgpu_ring *ring = tmp_adev->rings[i];
4596
4597 if (!ring || !ring->sched.thread)
4598 continue;
4599
0b2d2c2e 4600 drm_sched_stop(&ring->sched, job ? &job->base : NULL);
7c6e68c7 4601
bb5c7235 4602 if (need_emergency_restart)
7c6e68c7 4603 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
1d721ed6
AG
4604 }
4605 }
4606
bb5c7235 4607 if (need_emergency_restart)
7c6e68c7
AG
4608 goto skip_sched_resume;
4609
1d721ed6
AG
4610 /*
4611 * Must check guilty signal here since after this point all old
4612 * HW fences are force signaled.
4613 *
4614 * job->base holds a reference to parent fence
4615 */
4616 if (job && job->base.s_fence->parent &&
7dd8c205 4617 dma_fence_is_signaled(job->base.s_fence->parent)) {
1d721ed6 4618 job_signaled = true;
1d721ed6
AG
4619 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
4620 goto skip_hw_reset;
4621 }
4622
26bc5340
AG
4623retry: /* Rest of adevs pre asic reset from XGMI hive. */
4624 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
26bc5340
AG
4625 r = amdgpu_device_pre_asic_reset(tmp_adev,
4626 NULL,
4627 &need_full_reset);
4628 /*TODO Should we stop ?*/
4629 if (r) {
aac89168 4630 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
4a580877 4631 r, adev_to_drm(tmp_adev)->unique);
26bc5340
AG
4632 tmp_adev->asic_reset_res = r;
4633 }
4634 }
4635
4636 /* Actual ASIC resets if needed.*/
4637 /* TODO Implement XGMI hive reset logic for SRIOV */
4638 if (amdgpu_sriov_vf(adev)) {
4639 r = amdgpu_device_reset_sriov(adev, job ? false : true);
4640 if (r)
4641 adev->asic_reset_res = r;
4642 } else {
7ac71382 4643 r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false);
26bc5340
AG
4644 if (r && r == -EAGAIN)
4645 goto retry;
4646 }
4647
1d721ed6
AG
4648skip_hw_reset:
4649
26bc5340
AG
4650 /* Post ASIC reset for all devs .*/
4651 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
7c6e68c7 4652
1d721ed6
AG
4653 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4654 struct amdgpu_ring *ring = tmp_adev->rings[i];
4655
4656 if (!ring || !ring->sched.thread)
4657 continue;
4658
4659 			/* No point in resubmitting jobs if we didn't HW reset */
4660 if (!tmp_adev->asic_reset_res && !job_signaled)
4661 drm_sched_resubmit_jobs(&ring->sched);
4662
4663 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
4664 }
4665
4666 if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
4a580877 4667 drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
1d721ed6
AG
4668 }
4669
4670 tmp_adev->asic_reset_res = 0;
26bc5340
AG
4671
4672 if (r) {
4673 /* bad news, how to tell it to userspace ? */
12ffa55d 4674 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
26bc5340
AG
4675 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
4676 } else {
12ffa55d 4677 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
26bc5340 4678 }
7c6e68c7 4679 }
26bc5340 4680
7c6e68c7
AG
4681skip_sched_resume:
4682 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4683 /*unlock kfd: SRIOV would do it separately */
bb5c7235 4684 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
7c6e68c7 4685 amdgpu_amdkfd_post_reset(tmp_adev);
3f12acc8
EQ
4686 if (audio_suspended)
4687 amdgpu_device_resume_display_audio(tmp_adev);
26bc5340
AG
4688 amdgpu_device_unlock_adev(tmp_adev);
4689 }
4690
cbfd17f7 4691skip_recovery:
9e94d22c 4692 if (hive) {
53b3f8f4 4693 atomic_set(&hive->in_reset, 0);
9e94d22c 4694 mutex_unlock(&hive->hive_lock);
d95e8e97 4695 amdgpu_put_xgmi_hive(hive);
9e94d22c 4696 }
26bc5340
AG
4697
4698 if (r)
4699 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
d38ceaf9
AD
4700 return r;
4701}
4702
e3ecdffa
AD
4703/**
4704  * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
4705 *
4706 * @adev: amdgpu_device pointer
4707 *
4708  * Fetches and stores in the driver the PCIE capabilities (gen speed
4709 * and lanes) of the slot the device is in. Handles APUs and
4710 * virtualized environments where PCIE config space may not be available.
4711 */
5494d864 4712static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
d0dd7f0c 4713{
5d9a6330 4714 struct pci_dev *pdev;
c5313457
HK
4715 enum pci_bus_speed speed_cap, platform_speed_cap;
4716 enum pcie_link_width platform_link_width;
d0dd7f0c 4717
cd474ba0
AD
4718 if (amdgpu_pcie_gen_cap)
4719 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
d0dd7f0c 4720
cd474ba0
AD
4721 if (amdgpu_pcie_lane_cap)
4722 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
d0dd7f0c 4723
cd474ba0
AD
4724 /* covers APUs as well */
4725 if (pci_is_root_bus(adev->pdev->bus)) {
4726 if (adev->pm.pcie_gen_mask == 0)
4727 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
4728 if (adev->pm.pcie_mlw_mask == 0)
4729 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
d0dd7f0c 4730 return;
cd474ba0 4731 }
d0dd7f0c 4732
c5313457
HK
4733 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
4734 return;
4735
dbaa922b
AD
4736 pcie_bandwidth_available(adev->pdev, NULL,
4737 &platform_speed_cap, &platform_link_width);
c5313457 4738
cd474ba0 4739 if (adev->pm.pcie_gen_mask == 0) {
5d9a6330
AD
4740 /* asic caps */
4741 pdev = adev->pdev;
4742 speed_cap = pcie_get_speed_cap(pdev);
4743 if (speed_cap == PCI_SPEED_UNKNOWN) {
4744 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
cd474ba0
AD
4745 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4746 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
cd474ba0 4747 } else {
5d9a6330
AD
4748 if (speed_cap == PCIE_SPEED_16_0GT)
4749 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4750 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4751 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4752 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
4753 else if (speed_cap == PCIE_SPEED_8_0GT)
4754 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4755 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4756 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4757 else if (speed_cap == PCIE_SPEED_5_0GT)
4758 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4759 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
4760 else
4761 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
4762 }
4763 /* platform caps */
c5313457 4764 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
5d9a6330
AD
4765 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4766 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4767 } else {
c5313457 4768 if (platform_speed_cap == PCIE_SPEED_16_0GT)
5d9a6330
AD
4769 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4770 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4771 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4772 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
c5313457 4773 else if (platform_speed_cap == PCIE_SPEED_8_0GT)
5d9a6330
AD
4774 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4775 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4776 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
c5313457 4777 else if (platform_speed_cap == PCIE_SPEED_5_0GT)
5d9a6330
AD
4778 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4779 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4780 else
4781 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
4782
cd474ba0
AD
4783 }
4784 }
4785 if (adev->pm.pcie_mlw_mask == 0) {
c5313457 4786 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
5d9a6330
AD
4787 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
4788 } else {
c5313457 4789 switch (platform_link_width) {
5d9a6330 4790 case PCIE_LNK_X32:
cd474ba0
AD
4791 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
4792 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4793 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4794 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4795 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4796 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4797 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4798 break;
5d9a6330 4799 case PCIE_LNK_X16:
cd474ba0
AD
4800 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4801 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4802 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4803 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4804 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4805 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4806 break;
5d9a6330 4807 case PCIE_LNK_X12:
cd474ba0
AD
4808 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4809 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4810 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4811 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4812 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4813 break;
5d9a6330 4814 case PCIE_LNK_X8:
cd474ba0
AD
4815 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4816 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4817 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4818 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4819 break;
5d9a6330 4820 case PCIE_LNK_X4:
cd474ba0
AD
4821 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4822 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4823 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4824 break;
5d9a6330 4825 case PCIE_LNK_X2:
cd474ba0
AD
4826 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4827 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4828 break;
5d9a6330 4829 case PCIE_LNK_X1:
cd474ba0
AD
4830 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
4831 break;
4832 default:
4833 break;
4834 }
d0dd7f0c
AD
4835 }
4836 }
4837}
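/*
 * Illustrative sketch (not part of this file): a caller could require both
 * the ASIC-side and the platform-side bits of the combined mask before using
 * a given link speed. The helper name is an assumption; the CAIL_* flags are
 * the same amd_pcie.h definitions used above.
 */
static bool example_pcie_gen3_usable(struct amdgpu_device *adev)
{
	return (adev->pm.pcie_gen_mask & CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3) &&
	       (adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
}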
d38ceaf9 4838
361dbd01
AD
4839int amdgpu_device_baco_enter(struct drm_device *dev)
4840{
1348969a 4841 struct amdgpu_device *adev = drm_to_adev(dev);
7a22677b 4842 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
361dbd01 4843
4a580877 4844 if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
361dbd01
AD
4845 return -ENOTSUPP;
4846
7a22677b
LM
4847 if (ras && ras->supported)
4848 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
4849
9530273e 4850 return amdgpu_dpm_baco_enter(adev);
361dbd01
AD
4851}
4852
4853int amdgpu_device_baco_exit(struct drm_device *dev)
4854{
1348969a 4855 struct amdgpu_device *adev = drm_to_adev(dev);
7a22677b 4856 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
9530273e 4857 int ret = 0;
361dbd01 4858
4a580877 4859 if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
361dbd01
AD
4860 return -ENOTSUPP;
4861
9530273e
EQ
4862 ret = amdgpu_dpm_baco_exit(adev);
4863 if (ret)
4864 return ret;
7a22677b
LM
4865
4866 if (ras && ras->supported)
4867 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
4868
4869 return 0;
361dbd01 4870}
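/*
 * Illustrative sketch (not part of this file): a runtime-suspend style caller
 * driving the BACO helpers above. The function name and flow are assumptions
 * for illustration only.
 */
static int example_baco_runtime_suspend(struct drm_device *drm_dev)
{
	if (!amdgpu_device_supports_baco(drm_dev))
		return -ENOTSUPP;

	return amdgpu_device_baco_enter(drm_dev);
}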
c9a6b82f 4871
acd89fca
AG
4872static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev)
4873{
4874 int i;
4875
4876 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4877 struct amdgpu_ring *ring = adev->rings[i];
4878
4879 if (!ring || !ring->sched.thread)
4880 continue;
4881
4882 cancel_delayed_work_sync(&ring->sched.work_tdr);
4883 }
4884}
4885
c9a6b82f
AG
4886/**
4887 * amdgpu_pci_error_detected - Called when a PCI error is detected.
4888 * @pdev: PCI device struct
4889 * @state: PCI channel state
4890 *
4891 * Description: Called when a PCI error is detected.
4892 *
4893 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
4894 */
4895pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
4896{
4897 struct drm_device *dev = pci_get_drvdata(pdev);
4898 struct amdgpu_device *adev = drm_to_adev(dev);
acd89fca 4899 int i;
c9a6b82f
AG
4900
4901 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
4902
6894305c
AG
4903 if (adev->gmc.xgmi.num_physical_nodes > 1) {
4904 DRM_WARN("No support for XGMI hive yet...");
4905 return PCI_ERS_RESULT_DISCONNECT;
4906 }
4907
c9a6b82f
AG
4908 switch (state) {
4909 case pci_channel_io_normal:
4910 return PCI_ERS_RESULT_CAN_RECOVER;
acd89fca
AG
4911 /* Fatal error, prepare for slot reset */
4912 case pci_channel_io_frozen:
4913 /*
4914 * Cancel and wait for all TDRs in progress if failing to
4915 * set adev->in_gpu_reset in amdgpu_device_lock_adev
4916 *
4917 * Locking adev->reset_sem will prevent any external access
4918 * to GPU during PCI error recovery
4919 */
4920 while (!amdgpu_device_lock_adev(adev, NULL))
4921 amdgpu_cancel_all_tdr(adev);
4922
4923 /*
4924 * Block any work scheduling as we do for regular GPU reset
4925 * for the duration of the recovery
4926 */
4927 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4928 struct amdgpu_ring *ring = adev->rings[i];
4929
4930 if (!ring || !ring->sched.thread)
4931 continue;
4932
4933 drm_sched_stop(&ring->sched, NULL);
4934 }
c9a6b82f
AG
4935 return PCI_ERS_RESULT_NEED_RESET;
4936 case pci_channel_io_perm_failure:
4937 /* Permanent error, prepare for device removal */
4938 return PCI_ERS_RESULT_DISCONNECT;
4939 }
4940
4941 return PCI_ERS_RESULT_NEED_RESET;
4942}
4943
4944/**
4945 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
4946 * @pdev: pointer to PCI device
4947 */
4948pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
4949{
4950
4951 DRM_INFO("PCI error: mmio enabled callback!!\n");
4952
4953 /* TODO - dump whatever for debugging purposes */
4954
4955 	/* This is called only if amdgpu_pci_error_detected returns
4956 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
4957 * works, no need to reset slot.
4958 */
4959
4960 return PCI_ERS_RESULT_RECOVERED;
4961}
4962
4963/**
4964 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
4965 * @pdev: PCI device struct
4966 *
4967 * Description: This routine is called by the pci error recovery
4968 * code after the PCI slot has been reset, just before we
4969 * should resume normal operations.
4970 */
4971pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
4972{
4973 struct drm_device *dev = pci_get_drvdata(pdev);
4974 struct amdgpu_device *adev = drm_to_adev(dev);
362c7b91 4975 int r, i;
7ac71382 4976 bool need_full_reset = true;
362c7b91 4977 u32 memsize;
7ac71382 4978 struct list_head device_list;
c9a6b82f
AG
4979
4980 DRM_INFO("PCI error: slot reset callback!!\n");
4981
7ac71382
AG
4982 INIT_LIST_HEAD(&device_list);
4983 list_add_tail(&adev->gmc.xgmi.head, &device_list);
4984
362c7b91
AG
4985 /* wait for asic to come out of reset */
4986 msleep(500);
4987
7ac71382 4988 /* Restore PCI confspace */
c1dd4aa6 4989 amdgpu_device_load_pci_state(pdev);
c9a6b82f 4990
362c7b91
AG
4991 /* confirm ASIC came out of reset */
4992 for (i = 0; i < adev->usec_timeout; i++) {
4993 memsize = amdgpu_asic_get_config_memsize(adev);
4994
4995 if (memsize != 0xffffffff)
4996 break;
4997 udelay(1);
4998 }
4999 if (memsize == 0xffffffff) {
5000 r = -ETIME;
5001 goto out;
5002 }
5003
362c7b91 5004 adev->in_pci_err_recovery = true;
7ac71382 5005 r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset);
bf36b52e 5006 adev->in_pci_err_recovery = false;
c9a6b82f
AG
5007 if (r)
5008 goto out;
5009
7ac71382 5010 r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true);
c9a6b82f
AG
5011
5012out:
c9a6b82f 5013 if (!r) {
c1dd4aa6
AG
5014 if (amdgpu_device_cache_pci_state(adev->pdev))
5015 pci_restore_state(adev->pdev);
5016
c9a6b82f
AG
5017 DRM_INFO("PCIe error recovery succeeded\n");
5018 } else {
5019 DRM_ERROR("PCIe error recovery failed, err:%d", r);
5020 amdgpu_device_unlock_adev(adev);
5021 }
5022
5023 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5024}
5025
5026/**
5027 * amdgpu_pci_resume() - resume normal ops after PCI reset
5028 * @pdev: pointer to PCI device
5029 *
5030  * Called when the error recovery driver tells us that it's
5031 * OK to resume normal operation. Use completion to allow
5032 * halted scsi ops to resume.
5033 */
5034void amdgpu_pci_resume(struct pci_dev *pdev)
5035{
5036 struct drm_device *dev = pci_get_drvdata(pdev);
5037 struct amdgpu_device *adev = drm_to_adev(dev);
acd89fca 5038 int i;
c9a6b82f 5039
c9a6b82f
AG
5040
5041 DRM_INFO("PCI error: resume callback!!\n");
acd89fca
AG
5042
5043 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5044 struct amdgpu_ring *ring = adev->rings[i];
5045
5046 if (!ring || !ring->sched.thread)
5047 continue;
5048
5049
5050 drm_sched_resubmit_jobs(&ring->sched);
5051 drm_sched_start(&ring->sched, true);
5052 }
5053
5054 amdgpu_device_unlock_adev(adev);
c9a6b82f 5055}
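/*
 * For context, an illustrative sketch (not part of this file) of how the four
 * callbacks above are meant to be wired into the PCI driver via a
 * struct pci_error_handlers table; the variable name here is an assumption,
 * and the real table is registered from the driver's pci_driver definition.
 */
static const struct pci_error_handlers example_pci_err_handlers = {
	.error_detected	= amdgpu_pci_error_detected,
	.mmio_enabled	= amdgpu_pci_mmio_enabled,
	.slot_reset	= amdgpu_pci_slot_reset,
	.resume		= amdgpu_pci_resume,
};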
c1dd4aa6
AG
5056
5057bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5058{
5059 struct drm_device *dev = pci_get_drvdata(pdev);
5060 struct amdgpu_device *adev = drm_to_adev(dev);
5061 int r;
5062
5063 r = pci_save_state(pdev);
5064 if (!r) {
5065 kfree(adev->pci_state);
5066
5067 adev->pci_state = pci_store_saved_state(pdev);
5068
5069 if (!adev->pci_state) {
5070 DRM_ERROR("Failed to store PCI saved state");
5071 return false;
5072 }
5073 } else {
5074 DRM_WARN("Failed to save PCI state, err:%d\n", r);
5075 return false;
5076 }
5077
5078 return true;
5079}
5080
5081bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
5082{
5083 struct drm_device *dev = pci_get_drvdata(pdev);
5084 struct amdgpu_device *adev = drm_to_adev(dev);
5085 int r;
5086
5087 if (!adev->pci_state)
5088 return false;
5089
5090 r = pci_load_saved_state(pdev, adev->pci_state);
5091
5092 if (!r) {
5093 pci_restore_state(pdev);
5094 } else {
5095 DRM_WARN("Failed to load PCI state, err:%d\n", r);
5096 return false;
5097 }
5098
5099 return true;
5100}
5101
5102