drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

   1 /*
   2  * Copyright 2008 Advanced Micro Devices, Inc.
   3  * Copyright 2008 Red Hat Inc.
   4  * Copyright 2009 Jerome Glisse.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the "Software"),
   8  * to deal in the Software without restriction, including without limitation
   9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  10  * and/or sell copies of the Software, and to permit persons to whom the
  11  * Software is furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
  20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  22  * OTHER DEALINGS IN THE SOFTWARE.
  23  *
  24  * Authors: Dave Airlie
  25  *          Alex Deucher
  26  *          Jerome Glisse
  27  */
  28 #include <linux/power_supply.h>
  29 #include <linux/kthread.h>
  30 #include <linux/module.h>
  31 #include <linux/console.h>
  32 #include <linux/slab.h>
  33 #include <linux/iommu.h>
  34 #include <linux/pci.h>
  35 #include <linux/devcoredump.h>
  36 #include <generated/utsrelease.h>
  37 #include <linux/pci-p2pdma.h>
  38 #include <linux/apple-gmux.h>
  39
  40 #include <drm/drm_aperture.h>
  41 #include <drm/drm_atomic_helper.h>
  42 #include <drm/drm_crtc_helper.h>
  43 #include <drm/drm_fb_helper.h>
  44 #include <drm/drm_probe_helper.h>
  45 #include <drm/amdgpu_drm.h>
  46 #include <linux/vgaarb.h>
  47 #include <linux/vga_switcheroo.h>
  48 #include <linux/efi.h>
  49 #include "amdgpu.h"
  50 #include "amdgpu_trace.h"
  51 #include "amdgpu_i2c.h"
  52 #include "atom.h"
  53 #include "amdgpu_atombios.h"
  54 #include "amdgpu_atomfirmware.h"
  55 #include "amd_pcie.h"
  56 #ifdef CONFIG_DRM_AMDGPU_SI
  57 #include "si.h"
  58 #endif
  59 #ifdef CONFIG_DRM_AMDGPU_CIK
  60 #include "cik.h"
  61 #endif
  62 #include "vi.h"
  63 #include "soc15.h"
  64 #include "nv.h"
  65 #include "bif/bif_4_1_d.h"
  66 #include <linux/firmware.h>
  67 #include "amdgpu_vf_error.h"
  68
  69 #include "amdgpu_amdkfd.h"
  70 #include "amdgpu_pm.h"
  71
  72 #include "amdgpu_xgmi.h"
  73 #include "amdgpu_ras.h"
  74 #include "amdgpu_pmu.h"
  75 #include "amdgpu_fru_eeprom.h"
  76 #include "amdgpu_reset.h"
  77
  78 #include <linux/suspend.h>
  79 #include <drm/task_barrier.h>
  80 #include <linux/pm_runtime.h>
  81
  82 #include <drm/drm_drv.h>
  83
  84 #if IS_ENABLED(CONFIG_X86)
  85 #include <asm/intel-family.h>
  86 #endif
  87
  88 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
  89 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
  90 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
  91 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
  92 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
  93 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
  94 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
  95
  96 #define AMDGPU_RESUME_MS                2000
  97 #define AMDGPU_MAX_RETRY_LIMIT          2
  98 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
  99
 100 static const struct drm_driver amdgpu_kms_driver;
 101
 102 const char *amdgpu_asic_name[] = {
 103         "TAHITI",
 104         "PITCAIRN",
 105         "VERDE",
 106         "OLAND",
 107         "HAINAN",
 108         "BONAIRE",
 109         "KAVERI",
 110         "KABINI",
 111         "HAWAII",
 112         "MULLINS",
 113         "TOPAZ",
 114         "TONGA",
 115         "FIJI",
 116         "CARRIZO",
 117         "STONEY",
 118         "POLARIS10",
 119         "POLARIS11",
 120         "POLARIS12",
 121         "VEGAM",
 122         "VEGA10",
 123         "VEGA12",
 124         "VEGA20",
 125         "RAVEN",
 126         "ARCTURUS",
 127         "RENOIR",
 128         "ALDEBARAN",
 129         "NAVI10",
 130         "CYAN_SKILLFISH",
 131         "NAVI14",
 132         "NAVI12",
 133         "SIENNA_CICHLID",
 134         "NAVY_FLOUNDER",
 135         "VANGOGH",
 136         "DIMGREY_CAVEFISH",
 137         "BEIGE_GOBY",
 138         "YELLOW_CARP",
 139         "IP DISCOVERY",
 140         "LAST",
 141 };
 142
 143 /**
 144  * DOC: pcie_replay_count
 145  *
 146  * The amdgpu driver provides a sysfs API for reporting the total number
 147  * of PCIe replays (NAKs)
 148  * The file pcie_replay_count is used for this and returns the total
 149  * number of replays as a sum of the NAKs generated and NAKs received
 150  */
 151
 152 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
 153                 struct device_attribute *attr, char *buf)
 154 {
 155         struct drm_device *ddev = dev_get_drvdata(dev);
 156         struct amdgpu_device *adev = drm_to_adev(ddev);
 157         uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
 158
 159         return sysfs_emit(buf, "%llu\n", cnt);
 160 }
 161
 162 static DEVICE_ATTR(pcie_replay_count, 0444,
 163                 amdgpu_device_get_pcie_replay_count, NULL);
 164
 165 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
 166
 167
 168 /**
 169  * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 170  *
 171  * @dev: drm_device pointer
 172  *
 173  * Returns true if the device is a dGPU with ATPX power control,
 174  * otherwise return false.
 175  */
 176 bool amdgpu_device_supports_px(struct drm_device *dev)
 177 {
 178         struct amdgpu_device *adev = drm_to_adev(dev);
 179
 180         if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
 181                 return true;
 182         return false;
 183 }
 184
 185 /**
 186  * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 187  *
 188  * @dev: drm_device pointer
 189  *
 190  * Returns true if the device is a dGPU with ACPI power control,
 191  * otherwise return false.
 192  */
 193 bool amdgpu_device_supports_boco(struct drm_device *dev)
 194 {
 195         struct amdgpu_device *adev = drm_to_adev(dev);
 196
 197         if (adev->has_pr3 ||
 198             ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
 199                 return true;
 200         return false;
 201 }
 202
 203 /**
 204  * amdgpu_device_supports_baco - Does the device support BACO
 205  *
 206  * @dev: drm_device pointer
 207  *
 208  * Returns true if the device supporte BACO,
 209  * otherwise return false.
 210  */
 211 bool amdgpu_device_supports_baco(struct drm_device *dev)
 212 {
 213         struct amdgpu_device *adev = drm_to_adev(dev);
 214
 215         return amdgpu_asic_supports_baco(adev);
 216 }
 217
 218 /**
 219  * amdgpu_device_supports_smart_shift - Is the device dGPU with
 220  * smart shift support
 221  *
 222  * @dev: drm_device pointer
 223  *
 224  * Returns true if the device is a dGPU with Smart Shift support,
 225  * otherwise returns false.
 226  */
 227 bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
 228 {
 229         return (amdgpu_device_supports_boco(dev) &&
 230                 amdgpu_acpi_is_power_shift_control_supported());
 231 }
 232
 233 /*
 234  * VRAM access helper functions
 235  */
 236
 237 /**
 238  * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 239  *
 240  * @adev: amdgpu_device pointer
 241  * @pos: offset of the buffer in vram
 242  * @buf: virtual address of the buffer in system memory
 243  * @size: read/write size, sizeof(@buf) must > @size
 244  * @write: true - write to vram, otherwise - read from vram
 245  */
 246 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
 247                              void *buf, size_t size, bool write)
 248 {
 249         unsigned long flags;
 250         uint32_t hi = ~0, tmp = 0;
 251         uint32_t *data = buf;
 252         uint64_t last;
 253         int idx;
 254
 255         if (!drm_dev_enter(adev_to_drm(adev), &idx))
 256                 return;
 257
 258         BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));
 259
 260         spin_lock_irqsave(&adev->mmio_idx_lock, flags);
 261         for (last = pos + size; pos < last; pos += 4) {
 262                 tmp = pos >> 31;
 263
 264                 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
 265                 if (tmp != hi) {
 266                         WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
 267                         hi = tmp;
 268                 }
 269                 if (write)
 270                         WREG32_NO_KIQ(mmMM_DATA, *data++);
 271                 else
 272                         *data++ = RREG32_NO_KIQ(mmMM_DATA);
 273         }
 274
 275         spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
 276         drm_dev_exit(idx);
 277 }
 278
 279 /**
 280  * amdgpu_device_aper_access - access vram by vram aperature
 281  *
 282  * @adev: amdgpu_device pointer
 283  * @pos: offset of the buffer in vram
 284  * @buf: virtual address of the buffer in system memory
 285  * @size: read/write size, sizeof(@buf) must > @size
 286  * @write: true - write to vram, otherwise - read from vram
 287  *
 288  * The return value means how many bytes have been transferred.
 289  */
 290 size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
 291                                  void *buf, size_t size, bool write)
 292 {
 293 #ifdef CONFIG_64BIT
 294         void __iomem *addr;
 295         size_t count = 0;
 296         uint64_t last;
 297
 298         if (!adev->mman.aper_base_kaddr)
 299                 return 0;
 300
 301         last = min(pos + size, adev->gmc.visible_vram_size);
 302         if (last > pos) {
 303                 addr = adev->mman.aper_base_kaddr + pos;
 304                 count = last - pos;
 305
 306                 if (write) {
 307                         memcpy_toio(addr, buf, count);
 308                         mb();
 309                         amdgpu_device_flush_hdp(adev, NULL);
 310                 } else {
 311                         amdgpu_device_invalidate_hdp(adev, NULL);
 312                         mb();
 313                         memcpy_fromio(buf, addr, count);
 314                 }
 315
 316         }
 317
 318         return count;
 319 #else
 320         return 0;
 321 #endif
 322 }
 323
 324 /**
 325  * amdgpu_device_vram_access - read/write a buffer in vram
 326  *
 327  * @adev: amdgpu_device pointer
 328  * @pos: offset of the buffer in vram
 329  * @buf: virtual address of the buffer in system memory
 330  * @size: read/write size, sizeof(@buf) must > @size
 331  * @write: true - write to vram, otherwise - read from vram
 332  */
 333 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
 334                                void *buf, size_t size, bool write)
 335 {
 336         size_t count;
 337
 338         /* try to using vram apreature to access vram first */
 339         count = amdgpu_device_aper_access(adev, pos, buf, size, write);
 340         size -= count;
 341         if (size) {
 342                 /* using MM to access rest vram */
 343                 pos += count;
 344                 buf += count;
 345                 amdgpu_device_mm_access(adev, pos, buf, size, write);
 346         }
 347 }
 348
 349 /*
 350  * register access helper functions.
 351  */
 352
 353 /* Check if hw access should be skipped because of hotplug or device error */
 354 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
 355 {
 356         if (adev->no_hw_access)
 357                 return true;
 358
 359 #ifdef CONFIG_LOCKDEP
 360         /*
 361          * This is a bit complicated to understand, so worth a comment. What we assert
 362          * here is that the GPU reset is not running on another thread in parallel.
 363          *
 364          * For this we trylock the read side of the reset semaphore, if that succeeds
 365          * we know that the reset is not running in paralell.
 366          *
 367          * If the trylock fails we assert that we are either already holding the read
 368          * side of the lock or are the reset thread itself and hold the write side of
 369          * the lock.
 370          */
 371         if (in_task()) {
 372                 if (down_read_trylock(&adev->reset_domain->sem))
 373                         up_read(&adev->reset_domain->sem);
 374                 else
 375                         lockdep_assert_held(&adev->reset_domain->sem);
 376         }
 377 #endif
 378         return false;
 379 }
 380
 381 /**
 382  * amdgpu_device_rreg - read a memory mapped IO or indirect register
 383  *
 384  * @adev: amdgpu_device pointer
 385  * @reg: dword aligned register offset
 386  * @acc_flags: access flags which require special behavior
 387  *
 388  * Returns the 32 bit value from the offset specified.
 389  */
 390 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
 391                             uint32_t reg, uint32_t acc_flags)
 392 {
 393         uint32_t ret;
 394
 395         if (amdgpu_device_skip_hw_access(adev))
 396                 return 0;
 397
 398         if ((reg * 4) < adev->rmmio_size) {
 399                 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
 400                     amdgpu_sriov_runtime(adev) &&
 401                     down_read_trylock(&adev->reset_domain->sem)) {
 402                         ret = amdgpu_kiq_rreg(adev, reg);
 403                         up_read(&adev->reset_domain->sem);
 404                 } else {
 405                         ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
 406                 }
 407         } else {
 408                 ret = adev->pcie_rreg(adev, reg * 4);
 409         }
 410
 411         trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
 412
 413         return ret;
 414 }
 415
 416 /*
 417  * MMIO register read with bytes helper functions
 418  * @offset:bytes offset from MMIO start
 419  */
 420
 421 /**
 422  * amdgpu_mm_rreg8 - read a memory mapped IO register
 423  *
 424  * @adev: amdgpu_device pointer
 425  * @offset: byte aligned register offset
 426  *
 427  * Returns the 8 bit value from the offset specified.
 428  */
 429 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
 430 {
 431         if (amdgpu_device_skip_hw_access(adev))
 432                 return 0;
 433
 434         if (offset < adev->rmmio_size)
 435                 return (readb(adev->rmmio + offset));
 436         BUG();
 437 }
 438
 439 /*
 440  * MMIO register write with bytes helper functions
 441  * @offset:bytes offset from MMIO start
 442  * @value: the value want to be written to the register
 443  */
 444
 445 /**
 446  * amdgpu_mm_wreg8 - read a memory mapped IO register
 447  *
 448  * @adev: amdgpu_device pointer
 449  * @offset: byte aligned register offset
 450  * @value: 8 bit value to write
 451  *
 452  * Writes the value specified to the offset specified.
 453  */
 454 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
 455 {
 456         if (amdgpu_device_skip_hw_access(adev))
 457                 return;
 458
 459         if (offset < adev->rmmio_size)
 460                 writeb(value, adev->rmmio + offset);
 461         else
 462                 BUG();
 463 }
 464
 465 /**
 466  * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 467  *
 468  * @adev: amdgpu_device pointer
 469  * @reg: dword aligned register offset
 470  * @v: 32 bit value to write to the register
 471  * @acc_flags: access flags which require special behavior
 472  *
 473  * Writes the value specified to the offset specified.
 474  */
 475 void amdgpu_device_wreg(struct amdgpu_device *adev,
 476                         uint32_t reg, uint32_t v,
 477                         uint32_t acc_flags)
 478 {
 479         if (amdgpu_device_skip_hw_access(adev))
 480                 return;
 481
 482         if ((reg * 4) < adev->rmmio_size) {
 483                 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
 484                     amdgpu_sriov_runtime(adev) &&
 485                     down_read_trylock(&adev->reset_domain->sem)) {
 486                         amdgpu_kiq_wreg(adev, reg, v);
 487                         up_read(&adev->reset_domain->sem);
 488                 } else {
 489                         writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
 490                 }
 491         } else {
 492                 adev->pcie_wreg(adev, reg * 4, v);
 493         }
 494
 495         trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
 496 }
 497
 498 /**
 499  * amdgpu_mm_wreg_mmio_rlc -  write register either with direct/indirect mmio or with RLC path if in range
 500  *
 501  * @adev: amdgpu_device pointer
 502  * @reg: mmio/rlc register
 503  * @v: value to write
 504  *
 505  * this function is invoked only for the debugfs register access
 506  */
 507 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
 508                              uint32_t reg, uint32_t v,
 509                              uint32_t xcc_id)
 510 {
 511         if (amdgpu_device_skip_hw_access(adev))
 512                 return;
 513
 514         if (amdgpu_sriov_fullaccess(adev) &&
 515             adev->gfx.rlc.funcs &&
 516             adev->gfx.rlc.funcs->is_rlcg_access_range) {
 517                 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
 518                         return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
 519         } else if ((reg * 4) >= adev->rmmio_size) {
 520                 adev->pcie_wreg(adev, reg * 4, v);
 521         } else {
 522                 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
 523         }
 524 }
 525
 526 /**
 527  * amdgpu_device_indirect_rreg - read an indirect register
 528  *
 529  * @adev: amdgpu_device pointer
 530  * @reg_addr: indirect register address to read from
 531  *
 532  * Returns the value of indirect register @reg_addr
 533  */
 534 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
 535                                 u32 reg_addr)
 536 {
 537         unsigned long flags, pcie_index, pcie_data;
 538         void __iomem *pcie_index_offset;
 539         void __iomem *pcie_data_offset;
 540         u32 r;
 541
 542         pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
 543         pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
 544
 545         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 546         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 547         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 548
 549         writel(reg_addr, pcie_index_offset);
 550         readl(pcie_index_offset);
 551         r = readl(pcie_data_offset);
 552         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 553
 554         return r;
 555 }
 556
 557 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
 558                                     u64 reg_addr)
 559 {
 560         unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
 561         u32 r;
 562         void __iomem *pcie_index_offset;
 563         void __iomem *pcie_index_hi_offset;
 564         void __iomem *pcie_data_offset;
 565
 566         pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
 567         pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
 568         if (adev->nbio.funcs->get_pcie_index_hi_offset)
 569                 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
 570         else
 571                 pcie_index_hi = 0;
 572
 573         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 574         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 575         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 576         if (pcie_index_hi != 0)
 577                 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
 578                                 pcie_index_hi * 4;
 579
 580         writel(reg_addr, pcie_index_offset);
 581         readl(pcie_index_offset);
 582         if (pcie_index_hi != 0) {
 583                 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
 584                 readl(pcie_index_hi_offset);
 585         }
 586         r = readl(pcie_data_offset);
 587
 588         /* clear the high bits */
 589         if (pcie_index_hi != 0) {
 590                 writel(0, pcie_index_hi_offset);
 591                 readl(pcie_index_hi_offset);
 592         }
 593
 594         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 595
 596         return r;
 597 }
 598
 599 /**
 600  * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
 601  *
 602  * @adev: amdgpu_device pointer
 603  * @reg_addr: indirect register address to read from
 604  *
 605  * Returns the value of indirect register @reg_addr
 606  */
 607 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
 608                                   u32 reg_addr)
 609 {
 610         unsigned long flags, pcie_index, pcie_data;
 611         void __iomem *pcie_index_offset;
 612         void __iomem *pcie_data_offset;
 613         u64 r;
 614
 615         pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
 616         pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
 617
 618         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 619         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 620         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 621
 622         /* read low 32 bits */
 623         writel(reg_addr, pcie_index_offset);
 624         readl(pcie_index_offset);
 625         r = readl(pcie_data_offset);
 626         /* read high 32 bits */
 627         writel(reg_addr + 4, pcie_index_offset);
 628         readl(pcie_index_offset);
 629         r |= ((u64)readl(pcie_data_offset) << 32);
 630         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 631
 632         return r;
 633 }
 634
 635 /**
 636  * amdgpu_device_indirect_wreg - write an indirect register address
 637  *
 638  * @adev: amdgpu_device pointer
 639  * @reg_addr: indirect register offset
 640  * @reg_data: indirect register data
 641  *
 642  */
 643 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
 644                                  u32 reg_addr, u32 reg_data)
 645 {
 646         unsigned long flags, pcie_index, pcie_data;
 647         void __iomem *pcie_index_offset;
 648         void __iomem *pcie_data_offset;
 649
 650         pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
 651         pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
 652
 653         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 654         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 655         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 656
 657         writel(reg_addr, pcie_index_offset);
 658         readl(pcie_index_offset);
 659         writel(reg_data, pcie_data_offset);
 660         readl(pcie_data_offset);
 661         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 662 }
 663
 664 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
 665                                      u64 reg_addr, u32 reg_data)
 666 {
 667         unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
 668         void __iomem *pcie_index_offset;
 669         void __iomem *pcie_index_hi_offset;
 670         void __iomem *pcie_data_offset;
 671
 672         pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
 673         pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
 674         if (adev->nbio.funcs->get_pcie_index_hi_offset)
 675                 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
 676         else
 677                 pcie_index_hi = 0;
 678
 679         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 680         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 681         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 682         if (pcie_index_hi != 0)
 683                 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
 684                                 pcie_index_hi * 4;
 685
 686         writel(reg_addr, pcie_index_offset);
 687         readl(pcie_index_offset);
 688         if (pcie_index_hi != 0) {
 689                 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
 690                 readl(pcie_index_hi_offset);
 691         }
 692         writel(reg_data, pcie_data_offset);
 693         readl(pcie_data_offset);
 694
 695         /* clear the high bits */
 696         if (pcie_index_hi != 0) {
 697                 writel(0, pcie_index_hi_offset);
 698                 readl(pcie_index_hi_offset);
 699         }
 700
 701         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 702 }
 703
 704 /**
 705  * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
 706  *
 707  * @adev: amdgpu_device pointer
 708  * @reg_addr: indirect register offset
 709  * @reg_data: indirect register data
 710  *
 711  */
 712 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
 713                                    u32 reg_addr, u64 reg_data)
 714 {
 715         unsigned long flags, pcie_index, pcie_data;
 716         void __iomem *pcie_index_offset;
 717         void __iomem *pcie_data_offset;
 718
 719         pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
 720         pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
 721
 722         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 723         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 724         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 725
 726         /* write low 32 bits */
 727         writel(reg_addr, pcie_index_offset);
 728         readl(pcie_index_offset);
 729         writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
 730         readl(pcie_data_offset);
 731         /* write high 32 bits */
 732         writel(reg_addr + 4, pcie_index_offset);
 733         readl(pcie_index_offset);
 734         writel((u32)(reg_data >> 32), pcie_data_offset);
 735         readl(pcie_data_offset);
 736         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 737 }
 738
 739 /**
 740  * amdgpu_device_get_rev_id - query device rev_id
 741  *
 742  * @adev: amdgpu_device pointer
 743  *
 744  * Return device rev_id
 745  */
 746 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
 747 {
 748         return adev->nbio.funcs->get_rev_id(adev);
 749 }
 750
 751 /**
 752  * amdgpu_invalid_rreg - dummy reg read function
 753  *
 754  * @adev: amdgpu_device pointer
 755  * @reg: offset of register
 756  *
 757  * Dummy register read function.  Used for register blocks
 758  * that certain asics don't have (all asics).
 759  * Returns the value in the register.
 760  */
 761 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
 762 {
 763         DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
 764         BUG();
 765         return 0;
 766 }
 767
 768 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
 769 {
 770         DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
 771         BUG();
 772         return 0;
 773 }
 774
 775 /**
 776  * amdgpu_invalid_wreg - dummy reg write function
 777  *
 778  * @adev: amdgpu_device pointer
 779  * @reg: offset of register
 780  * @v: value to write to the register
 781  *
 782  * Dummy register read function.  Used for register blocks
 783  * that certain asics don't have (all asics).
 784  */
 785 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
 786 {
 787         DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
 788                   reg, v);
 789         BUG();
 790 }
 791
 792 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
 793 {
 794         DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
 795                   reg, v);
 796         BUG();
 797 }
 798
 799 /**
 800  * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 801  *
 802  * @adev: amdgpu_device pointer
 803  * @reg: offset of register
 804  *
 805  * Dummy register read function.  Used for register blocks
 806  * that certain asics don't have (all asics).
 807  * Returns the value in the register.
 808  */
 809 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
 810 {
 811         DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
 812         BUG();
 813         return 0;
 814 }
 815
 816 /**
 817  * amdgpu_invalid_wreg64 - dummy reg write function
 818  *
 819  * @adev: amdgpu_device pointer
 820  * @reg: offset of register
 821  * @v: value to write to the register
 822  *
 823  * Dummy register read function.  Used for register blocks
 824  * that certain asics don't have (all asics).
 825  */
 826 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
 827 {
 828         DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
 829                   reg, v);
 830         BUG();
 831 }
 832
 833 /**
 834  * amdgpu_block_invalid_rreg - dummy reg read function
 835  *
 836  * @adev: amdgpu_device pointer
 837  * @block: offset of instance
 838  * @reg: offset of register
 839  *
 840  * Dummy register read function.  Used for register blocks
 841  * that certain asics don't have (all asics).
 842  * Returns the value in the register.
 843  */
 844 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
 845                                           uint32_t block, uint32_t reg)
 846 {
 847         DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
 848                   reg, block);
 849         BUG();
 850         return 0;
 851 }
 852
 853 /**
 854  * amdgpu_block_invalid_wreg - dummy reg write function
 855  *
 856  * @adev: amdgpu_device pointer
 857  * @block: offset of instance
 858  * @reg: offset of register
 859  * @v: value to write to the register
 860  *
 861  * Dummy register read function.  Used for register blocks
 862  * that certain asics don't have (all asics).
 863  */
 864 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
 865                                       uint32_t block,
 866                                       uint32_t reg, uint32_t v)
 867 {
 868         DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
 869                   reg, block, v);
 870         BUG();
 871 }
 872
 873 /**
 874  * amdgpu_device_asic_init - Wrapper for atom asic_init
 875  *
 876  * @adev: amdgpu_device pointer
 877  *
 878  * Does any asic specific work and then calls atom asic init.
 879  */
 880 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
 881 {
 882         amdgpu_asic_pre_asic_init(adev);
 883
 884         if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 3) ||
 885             adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0))
 886                 return amdgpu_atomfirmware_asic_init(adev, true);
 887         else
 888                 return amdgpu_atom_asic_init(adev->mode_info.atom_context);
 889 }
 890
 891 /**
 892  * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 893  *
 894  * @adev: amdgpu_device pointer
 895  *
 896  * Allocates a scratch page of VRAM for use by various things in the
 897  * driver.
 898  */
 899 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
 900 {
 901         return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
 902                                        AMDGPU_GEM_DOMAIN_VRAM |
 903                                        AMDGPU_GEM_DOMAIN_GTT,
 904                                        &adev->mem_scratch.robj,
 905                                        &adev->mem_scratch.gpu_addr,
 906                                        (void **)&adev->mem_scratch.ptr);
 907 }
 908
 909 /**
 910  * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 911  *
 912  * @adev: amdgpu_device pointer
 913  *
 914  * Frees the VRAM scratch page.
 915  */
 916 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
 917 {
 918         amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
 919 }
 920
 921 /**
 922  * amdgpu_device_program_register_sequence - program an array of registers.
 923  *
 924  * @adev: amdgpu_device pointer
 925  * @registers: pointer to the register array
 926  * @array_size: size of the register array
 927  *
 928  * Programs an array or registers with and or masks.
 929  * This is a helper for setting golden registers.
 930  */
 931 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
 932                                              const u32 *registers,
 933                                              const u32 array_size)
 934 {
 935         u32 tmp, reg, and_mask, or_mask;
 936         int i;
 937
 938         if (array_size % 3)
 939                 return;
 940
 941         for (i = 0; i < array_size; i += 3) {
 942                 reg = registers[i + 0];
 943                 and_mask = registers[i + 1];
 944                 or_mask = registers[i + 2];
 945
 946                 if (and_mask == 0xffffffff) {
 947                         tmp = or_mask;
 948                 } else {
 949                         tmp = RREG32(reg);
 950                         tmp &= ~and_mask;
 951                         if (adev->family >= AMDGPU_FAMILY_AI)
 952                                 tmp |= (or_mask & and_mask);
 953                         else
 954                                 tmp |= or_mask;
 955                 }
 956                 WREG32(reg, tmp);
 957         }
 958 }
 959
 960 /**
 961  * amdgpu_device_pci_config_reset - reset the GPU
 962  *
 963  * @adev: amdgpu_device pointer
 964  *
 965  * Resets the GPU using the pci config reset sequence.
 966  * Only applicable to asics prior to vega10.
 967  */
 968 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
 969 {
 970         pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
 971 }
 972
 973 /**
 974  * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 975  *
 976  * @adev: amdgpu_device pointer
 977  *
 978  * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 979  */
 980 int amdgpu_device_pci_reset(struct amdgpu_device *adev)
 981 {
 982         return pci_reset_function(adev->pdev);
 983 }
 984
 985 /*
 986  * amdgpu_device_wb_*()
 987  * Writeback is the method by which the GPU updates special pages in memory
 988  * with the status of certain GPU events (fences, ring pointers,etc.).
 989  */
 990
 991 /**
 992  * amdgpu_device_wb_fini - Disable Writeback and free memory
 993  *
 994  * @adev: amdgpu_device pointer
 995  *
 996  * Disables Writeback and frees the Writeback memory (all asics).
 997  * Used at driver shutdown.
 998  */
 999 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
1000 {
1001         if (adev->wb.wb_obj) {
1002                 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
1003                                       &adev->wb.gpu_addr,
1004                                       (void **)&adev->wb.wb);
1005                 adev->wb.wb_obj = NULL;
1006         }
1007 }
1008
1009 /**
1010  * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
1011  *
1012  * @adev: amdgpu_device pointer
1013  *
1014  * Initializes writeback and allocates writeback memory (all asics).
1015  * Used at driver startup.
1016  * Returns 0 on success or an -error on failure.
1017  */
1018 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1019 {
1020         int r;
1021
1022         if (adev->wb.wb_obj == NULL) {
1023                 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1024                 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1025                                             PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1026                                             &adev->wb.wb_obj, &adev->wb.gpu_addr,
1027                                             (void **)&adev->wb.wb);
1028                 if (r) {
1029                         dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1030                         return r;
1031                 }
1032
1033                 adev->wb.num_wb = AMDGPU_MAX_WB;
1034                 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1035
1036                 /* clear wb memory */
1037                 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1038         }
1039
1040         return 0;
1041 }
1042
1043 /**
1044  * amdgpu_device_wb_get - Allocate a wb entry
1045  *
1046  * @adev: amdgpu_device pointer
1047  * @wb: wb index
1048  *
1049  * Allocate a wb slot for use by the driver (all asics).
1050  * Returns 0 on success or -EINVAL on failure.
1051  */
1052 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1053 {
1054         unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1055
1056         if (offset < adev->wb.num_wb) {
1057                 __set_bit(offset, adev->wb.used);
1058                 *wb = offset << 3; /* convert to dw offset */
1059                 return 0;
1060         } else {
1061                 return -EINVAL;
1062         }
1063 }
1064
1065 /**
1066  * amdgpu_device_wb_free - Free a wb entry
1067  *
1068  * @adev: amdgpu_device pointer
1069  * @wb: wb index
1070  *
1071  * Free a wb slot allocated for use by the driver (all asics)
1072  */
1073 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1074 {
1075         wb >>= 3;
1076         if (wb < adev->wb.num_wb)
1077                 __clear_bit(wb, adev->wb.used);
1078 }
1079
1080 /**
1081  * amdgpu_device_resize_fb_bar - try to resize FB BAR
1082  *
1083  * @adev: amdgpu_device pointer
1084  *
1085  * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1086  * to fail, but if any of the BARs is not accessible after the size we abort
1087  * driver loading by returning -ENODEV.
1088  */
1089 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1090 {
1091         int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
1092         struct pci_bus *root;
1093         struct resource *res;
1094         unsigned int i;
1095         u16 cmd;
1096         int r;
1097
1098         if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
1099                 return 0;
1100
1101         /* Bypass for VF */
1102         if (amdgpu_sriov_vf(adev))
1103                 return 0;
1104
1105         /* skip if the bios has already enabled large BAR */
1106         if (adev->gmc.real_vram_size &&
1107             (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1108                 return 0;
1109
1110         /* Check if the root BUS has 64bit memory resources */
1111         root = adev->pdev->bus;
1112         while (root->parent)
1113                 root = root->parent;
1114
1115         pci_bus_for_each_resource(root, res, i) {
1116                 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1117                     res->start > 0x100000000ull)
1118                         break;
1119         }
1120
1121         /* Trying to resize is pointless without a root hub window above 4GB */
1122         if (!res)
1123                 return 0;
1124
1125         /* Limit the BAR size to what is available */
1126         rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
1127                         rbar_size);
1128
1129         /* Disable memory decoding while we change the BAR addresses and size */
1130         pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1131         pci_write_config_word(adev->pdev, PCI_COMMAND,
1132                               cmd & ~PCI_COMMAND_MEMORY);
1133
1134         /* Free the VRAM and doorbell BAR, we most likely need to move both. */
1135         amdgpu_doorbell_fini(adev);
1136         if (adev->asic_type >= CHIP_BONAIRE)
1137                 pci_release_resource(adev->pdev, 2);
1138
1139         pci_release_resource(adev->pdev, 0);
1140
1141         r = pci_resize_resource(adev->pdev, 0, rbar_size);
1142         if (r == -ENOSPC)
1143                 DRM_INFO("Not enough PCI address space for a large BAR.");
1144         else if (r && r != -ENOTSUPP)
1145                 DRM_ERROR("Problem resizing BAR0 (%d).", r);
1146
1147         pci_assign_unassigned_bus_resources(adev->pdev->bus);
1148
1149         /* When the doorbell or fb BAR isn't available we have no chance of
1150          * using the device.
1151          */
1152         r = amdgpu_doorbell_init(adev);
1153         if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1154                 return -ENODEV;
1155
1156         pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1157
1158         return 0;
1159 }
1160
1161 static bool amdgpu_device_read_bios(struct amdgpu_device *adev)
1162 {
1163         if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
1164                 return false;
1165
1166         return true;
1167 }
1168
1169 /*
1170  * GPU helpers function.
1171  */
1172 /**
1173  * amdgpu_device_need_post - check if the hw need post or not
1174  *
1175  * @adev: amdgpu_device pointer
1176  *
1177  * Check if the asic has been initialized (all asics) at driver startup
1178  * or post is needed if  hw reset is performed.
1179  * Returns true if need or false if not.
1180  */
1181 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1182 {
1183         uint32_t reg;
1184
1185         if (amdgpu_sriov_vf(adev))
1186                 return false;
1187
1188         if (!amdgpu_device_read_bios(adev))
1189                 return false;
1190
1191         if (amdgpu_passthrough(adev)) {
1192                 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
1193                  * some old smc fw still need driver do vPost otherwise gpu hang, while
1194                  * those smc fw version above 22.15 doesn't have this flaw, so we force
1195                  * vpost executed for smc version below 22.15
1196                  */
1197                 if (adev->asic_type == CHIP_FIJI) {
1198                         int err;
1199                         uint32_t fw_ver;
1200
1201                         err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1202                         /* force vPost if error occured */
1203                         if (err)
1204                                 return true;
1205
1206                         fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1207                         if (fw_ver < 0x00160e00)
1208                                 return true;
1209                 }
1210         }
1211
1212         /* Don't post if we need to reset whole hive on init */
1213         if (adev->gmc.xgmi.pending_reset)
1214                 return false;
1215
1216         if (adev->has_hw_reset) {
1217                 adev->has_hw_reset = false;
1218                 return true;
1219         }
1220
1221         /* bios scratch used on CIK+ */
1222         if (adev->asic_type >= CHIP_BONAIRE)
1223                 return amdgpu_atombios_scratch_need_asic_init(adev);
1224
1225         /* check MEM_SIZE for older asics */
1226         reg = amdgpu_asic_get_config_memsize(adev);
1227
1228         if ((reg != 0) && (reg != 0xffffffff))
1229                 return false;
1230
1231         return true;
1232 }
1233
1234 /*
1235  * On APUs with >= 64GB white flickering has been observed w/ SG enabled.
1236  * Disable S/G on such systems until we have a proper fix.
1237  * https://gitlab.freedesktop.org/drm/amd/-/issues/2354
1238  * https://gitlab.freedesktop.org/drm/amd/-/issues/2735
1239  */
1240 bool amdgpu_sg_display_supported(struct amdgpu_device *adev)
1241 {
1242         switch (amdgpu_sg_display) {
1243         case -1:
1244                 break;
1245         case 0:
1246                 return false;
1247         case 1:
1248                 return true;
1249         default:
1250                 return false;
1251         }
1252         if ((totalram_pages() << (PAGE_SHIFT - 10)) +
1253             (adev->gmc.real_vram_size / 1024) >= 64000000) {
1254                 DRM_WARN("Disabling S/G due to >=64GB RAM\n");
1255                 return false;
1256         }
1257         return true;
1258 }
1259
1260 /*
1261  * Intel hosts such as Raptor Lake and Sapphire Rapids don't support dynamic
1262  * speed switching. Until we have confirmation from Intel that a specific host
1263  * supports it, it's safer that we keep it disabled for all.
1264  *
1265  * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
1266  * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
1267  */
1268 bool amdgpu_device_pcie_dynamic_switching_supported(void)
1269 {
1270 #if IS_ENABLED(CONFIG_X86)
1271         struct cpuinfo_x86 *c = &cpu_data(0);
1272
1273         if (c->x86_vendor == X86_VENDOR_INTEL)
1274                 return false;
1275 #endif
1276         return true;
1277 }
1278
1279 /**
1280  * amdgpu_device_should_use_aspm - check if the device should program ASPM
1281  *
1282  * @adev: amdgpu_device pointer
1283  *
1284  * Confirm whether the module parameter and pcie bridge agree that ASPM should
1285  * be set for this device.
1286  *
1287  * Returns true if it should be used or false if not.
1288  */
1289 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
1290 {
1291         switch (amdgpu_aspm) {
1292         case -1:
1293                 break;
1294         case 0:
1295                 return false;
1296         case 1:
1297                 return true;
1298         default:
1299                 return false;
1300         }
1301         return pcie_aspm_enabled(adev->pdev);
1302 }
1303
1304 bool amdgpu_device_aspm_support_quirk(void)
1305 {
1306 #if IS_ENABLED(CONFIG_X86)
1307         struct cpuinfo_x86 *c = &cpu_data(0);
1308
1309         return !(c->x86 == 6 && c->x86_model == INTEL_FAM6_ALDERLAKE);
1310 #else
1311         return true;
1312 #endif
1313 }
1314
1315 /* if we get transitioned to only one device, take VGA back */
1316 /**
1317  * amdgpu_device_vga_set_decode - enable/disable vga decode
1318  *
1319  * @pdev: PCI device pointer
1320  * @state: enable/disable vga decode
1321  *
1322  * Enable/disable vga decode (all asics).
1323  * Returns VGA resource flags.
1324  */
1325 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
1326                 bool state)
1327 {
1328         struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));
1329
1330         amdgpu_asic_set_vga_state(adev, state);
1331         if (state)
1332                 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1333                        VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1334         else
1335                 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1336 }
1337
1338 /**
1339  * amdgpu_device_check_block_size - validate the vm block size
1340  *
1341  * @adev: amdgpu_device pointer
1342  *
1343  * Validates the vm block size specified via module parameter.
1344  * The vm block size defines number of bits in page table versus page directory,
1345  * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1346  * page table and the remaining bits are in the page directory.
1347  */
1348 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1349 {
1350         /* defines number of bits in page table versus page directory,
1351          * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1352          * page table and the remaining bits are in the page directory
1353          */
1354         if (amdgpu_vm_block_size == -1)
1355                 return;
1356
1357         if (amdgpu_vm_block_size < 9) {
1358                 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1359                          amdgpu_vm_block_size);
1360                 amdgpu_vm_block_size = -1;
1361         }
1362 }
1363
1364 /**
1365  * amdgpu_device_check_vm_size - validate the vm size
1366  *
1367  * @adev: amdgpu_device pointer
1368  *
1369  * Validates the vm size in GB specified via module parameter.
1370  * The VM size is the size of the GPU virtual memory space in GB.
1371  */
1372 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1373 {
1374         /* no need to check the default value */
1375         if (amdgpu_vm_size == -1)
1376                 return;
1377
1378         if (amdgpu_vm_size < 1) {
1379                 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1380                          amdgpu_vm_size);
1381                 amdgpu_vm_size = -1;
1382         }
1383 }
1384
1385 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1386 {
1387         struct sysinfo si;
1388         bool is_os_64 = (sizeof(void *) == 8);
1389         uint64_t total_memory;
1390         uint64_t dram_size_seven_GB = 0x1B8000000;
1391         uint64_t dram_size_three_GB = 0xB8000000;
1392
1393         if (amdgpu_smu_memory_pool_size == 0)
1394                 return;
1395
1396         if (!is_os_64) {
1397                 DRM_WARN("Not 64-bit OS, feature not supported\n");
1398                 goto def_value;
1399         }
1400         si_meminfo(&si);
1401         total_memory = (uint64_t)si.totalram * si.mem_unit;
1402
1403         if ((amdgpu_smu_memory_pool_size == 1) ||
1404                 (amdgpu_smu_memory_pool_size == 2)) {
1405                 if (total_memory < dram_size_three_GB)
1406                         goto def_value1;
1407         } else if ((amdgpu_smu_memory_pool_size == 4) ||
1408                 (amdgpu_smu_memory_pool_size == 8)) {
1409                 if (total_memory < dram_size_seven_GB)
1410                         goto def_value1;
1411         } else {
1412                 DRM_WARN("Smu memory pool size not supported\n");
1413                 goto def_value;
1414         }
1415         adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1416
1417         return;
1418
1419 def_value1:
1420         DRM_WARN("No enough system memory\n");
1421 def_value:
1422         adev->pm.smu_prv_buffer_size = 0;
1423 }
1424
1425 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
1426 {
1427         if (!(adev->flags & AMD_IS_APU) ||
1428             adev->asic_type < CHIP_RAVEN)
1429                 return 0;
1430
1431         switch (adev->asic_type) {
1432         case CHIP_RAVEN:
1433                 if (adev->pdev->device == 0x15dd)
1434                         adev->apu_flags |= AMD_APU_IS_RAVEN;
1435                 if (adev->pdev->device == 0x15d8)
1436                         adev->apu_flags |= AMD_APU_IS_PICASSO;
1437                 break;
1438         case CHIP_RENOIR:
1439                 if ((adev->pdev->device == 0x1636) ||
1440                     (adev->pdev->device == 0x164c))
1441                         adev->apu_flags |= AMD_APU_IS_RENOIR;
1442                 else
1443                         adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
1444                 break;
1445         case CHIP_VANGOGH:
1446                 adev->apu_flags |= AMD_APU_IS_VANGOGH;
1447                 break;
1448         case CHIP_YELLOW_CARP:
1449                 break;
1450         case CHIP_CYAN_SKILLFISH:
1451                 if ((adev->pdev->device == 0x13FE) ||
1452                     (adev->pdev->device == 0x143F))
1453                         adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
1454                 break;
1455         default:
1456                 break;
1457         }
1458
1459         return 0;
1460 }
1461
1462 /**
1463  * amdgpu_device_check_arguments - validate module params
1464  *
1465  * @adev: amdgpu_device pointer
1466  *
1467  * Validates certain module parameters and updates
1468  * the associated values used by the driver (all asics).
1469  */
1470 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1471 {
1472         if (amdgpu_sched_jobs < 4) {
1473                 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1474                          amdgpu_sched_jobs);
1475                 amdgpu_sched_jobs = 4;
1476         } else if (!is_power_of_2(amdgpu_sched_jobs)) {
1477                 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1478                          amdgpu_sched_jobs);
1479                 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1480         }
1481
1482         if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1483                 /* gart size must be greater or equal to 32M */
1484                 dev_warn(adev->dev, "gart size (%d) too small\n",
1485                          amdgpu_gart_size);
1486                 amdgpu_gart_size = -1;
1487         }
1488
1489         if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1490                 /* gtt size must be greater or equal to 32M */
1491                 dev_warn(adev->dev, "gtt size (%d) too small\n",
1492                                  amdgpu_gtt_size);
1493                 amdgpu_gtt_size = -1;
1494         }
1495
1496         /* valid range is between 4 and 9 inclusive */
1497         if (amdgpu_vm_fragment_size != -1 &&
1498             (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1499                 dev_warn(adev->dev, "valid range is between 4 and 9\n");
1500                 amdgpu_vm_fragment_size = -1;
1501         }
1502
1503         if (amdgpu_sched_hw_submission < 2) {
1504                 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1505                          amdgpu_sched_hw_submission);
1506                 amdgpu_sched_hw_submission = 2;
1507         } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1508                 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1509                          amdgpu_sched_hw_submission);
1510                 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1511         }
1512
1513         if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
1514                 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
1515                 amdgpu_reset_method = -1;
1516         }
1517
1518         amdgpu_device_check_smu_prv_buffer_size(adev);
1519
1520         amdgpu_device_check_vm_size(adev);
1521
1522         amdgpu_device_check_block_size(adev);
1523
1524         adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1525
1526         return 0;
1527 }
1528
1529 /**
1530  * amdgpu_switcheroo_set_state - set switcheroo state
1531  *
1532  * @pdev: pci dev pointer
1533  * @state: vga_switcheroo state
1534  *
1535  * Callback for the switcheroo driver.  Suspends or resumes
1536  * the asics before or after it is powered up using ACPI methods.
1537  */
1538 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1539                                         enum vga_switcheroo_state state)
1540 {
1541         struct drm_device *dev = pci_get_drvdata(pdev);
1542         int r;
1543
1544         if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
1545                 return;
1546
1547         if (state == VGA_SWITCHEROO_ON) {
1548                 pr_info("switched on\n");
1549                 /* don't suspend or resume card normally */
1550                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1551
1552                 pci_set_power_state(pdev, PCI_D0);
1553                 amdgpu_device_load_pci_state(pdev);
1554                 r = pci_enable_device(pdev);
1555                 if (r)
1556                         DRM_WARN("pci_enable_device failed (%d)\n", r);
1557                 amdgpu_device_resume(dev, true);
1558
1559                 dev->switch_power_state = DRM_SWITCH_POWER_ON;
1560         } else {
1561                 pr_info("switched off\n");
1562                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1563                 amdgpu_device_suspend(dev, true);
1564                 amdgpu_device_cache_pci_state(pdev);
1565                 /* Shut down the device */
1566                 pci_disable_device(pdev);
1567                 pci_set_power_state(pdev, PCI_D3cold);
1568                 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1569         }
1570 }
1571
1572 /**
1573  * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1574  *
1575  * @pdev: pci dev pointer
1576  *
1577  * Callback for the switcheroo driver.  Check of the switcheroo
1578  * state can be changed.
1579  * Returns true if the state can be changed, false if not.
1580  */
1581 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1582 {
1583         struct drm_device *dev = pci_get_drvdata(pdev);
1584
1585        /*
1586         * FIXME: open_count is protected by drm_global_mutex but that would lead to
1587         * locking inversion with the driver load path. And the access here is
1588         * completely racy anyway. So don't bother with locking for now.
1589         */
1590         return atomic_read(&dev->open_count) == 0;
1591 }
1592
1593 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1594         .set_gpu_state = amdgpu_switcheroo_set_state,
1595         .reprobe = NULL,
1596         .can_switch = amdgpu_switcheroo_can_switch,
1597 };
1598
1599 /**
1600  * amdgpu_device_ip_set_clockgating_state - set the CG state
1601  *
1602  * @dev: amdgpu_device pointer
1603  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1604  * @state: clockgating state (gate or ungate)
1605  *
1606  * Sets the requested clockgating state for all instances of
1607  * the hardware IP specified.
1608  * Returns the error code from the last instance.
1609  */
1610 int amdgpu_device_ip_set_clockgating_state(void *dev,
1611                                            enum amd_ip_block_type block_type,
1612                                            enum amd_clockgating_state state)
1613 {
1614         struct amdgpu_device *adev = dev;
1615         int i, r = 0;
1616
1617         for (i = 0; i < adev->num_ip_blocks; i++) {
1618                 if (!adev->ip_blocks[i].status.valid)
1619                         continue;
1620                 if (adev->ip_blocks[i].version->type != block_type)
1621                         continue;
1622                 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1623                         continue;
1624                 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1625                         (void *)adev, state);
1626                 if (r)
1627                         DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1628                                   adev->ip_blocks[i].version->funcs->name, r);
1629         }
1630         return r;
1631 }
1632
1633 /**
1634  * amdgpu_device_ip_set_powergating_state - set the PG state
1635  *
1636  * @dev: amdgpu_device pointer
1637  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1638  * @state: powergating state (gate or ungate)
1639  *
1640  * Sets the requested powergating state for all instances of
1641  * the hardware IP specified.
1642  * Returns the error code from the last instance.
1643  */
1644 int amdgpu_device_ip_set_powergating_state(void *dev,
1645                                            enum amd_ip_block_type block_type,
1646                                            enum amd_powergating_state state)
1647 {
1648         struct amdgpu_device *adev = dev;
1649         int i, r = 0;
1650
1651         for (i = 0; i < adev->num_ip_blocks; i++) {
1652                 if (!adev->ip_blocks[i].status.valid)
1653                         continue;
1654                 if (adev->ip_blocks[i].version->type != block_type)
1655                         continue;
1656                 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1657                         continue;
1658                 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1659                         (void *)adev, state);
1660                 if (r)
1661                         DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1662                                   adev->ip_blocks[i].version->funcs->name, r);
1663         }
1664         return r;
1665 }
1666
1667 /**
1668  * amdgpu_device_ip_get_clockgating_state - get the CG state
1669  *
1670  * @adev: amdgpu_device pointer
1671  * @flags: clockgating feature flags
1672  *
1673  * Walks the list of IPs on the device and updates the clockgating
1674  * flags for each IP.
1675  * Updates @flags with the feature flags for each hardware IP where
1676  * clockgating is enabled.
1677  */
1678 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1679                                             u64 *flags)
1680 {
1681         int i;
1682
1683         for (i = 0; i < adev->num_ip_blocks; i++) {
1684                 if (!adev->ip_blocks[i].status.valid)
1685                         continue;
1686                 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1687                         adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1688         }
1689 }
1690
1691 /**
1692  * amdgpu_device_ip_wait_for_idle - wait for idle
1693  *
1694  * @adev: amdgpu_device pointer
1695  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1696  *
1697  * Waits for the request hardware IP to be idle.
1698  * Returns 0 for success or a negative error code on failure.
1699  */
1700 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1701                                    enum amd_ip_block_type block_type)
1702 {
1703         int i, r;
1704
1705         for (i = 0; i < adev->num_ip_blocks; i++) {
1706                 if (!adev->ip_blocks[i].status.valid)
1707                         continue;
1708                 if (adev->ip_blocks[i].version->type == block_type) {
1709                         r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1710                         if (r)
1711                                 return r;
1712                         break;
1713                 }
1714         }
1715         return 0;
1716
1717 }
1718
1719 /**
1720  * amdgpu_device_ip_is_idle - is the hardware IP idle
1721  *
1722  * @adev: amdgpu_device pointer
1723  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1724  *
1725  * Check if the hardware IP is idle or not.
1726  * Returns true if it the IP is idle, false if not.
1727  */
1728 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1729                               enum amd_ip_block_type block_type)
1730 {
1731         int i;
1732
1733         for (i = 0; i < adev->num_ip_blocks; i++) {
1734                 if (!adev->ip_blocks[i].status.valid)
1735                         continue;
1736                 if (adev->ip_blocks[i].version->type == block_type)
1737                         return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1738         }
1739         return true;
1740
1741 }
1742
1743 /**
1744  * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1745  *
1746  * @adev: amdgpu_device pointer
1747  * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1748  *
1749  * Returns a pointer to the hardware IP block structure
1750  * if it exists for the asic, otherwise NULL.
1751  */
1752 struct amdgpu_ip_block *
1753 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1754                               enum amd_ip_block_type type)
1755 {
1756         int i;
1757
1758         for (i = 0; i < adev->num_ip_blocks; i++)
1759                 if (adev->ip_blocks[i].version->type == type)
1760                         return &adev->ip_blocks[i];
1761
1762         return NULL;
1763 }
1764
1765 /**
1766  * amdgpu_device_ip_block_version_cmp
1767  *
1768  * @adev: amdgpu_device pointer
1769  * @type: enum amd_ip_block_type
1770  * @major: major version
1771  * @minor: minor version
1772  *
1773  * return 0 if equal or greater
1774  * return 1 if smaller or the ip_block doesn't exist
1775  */
1776 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1777                                        enum amd_ip_block_type type,
1778                                        u32 major, u32 minor)
1779 {
1780         struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1781
1782         if (ip_block && ((ip_block->version->major > major) ||
1783                         ((ip_block->version->major == major) &&
1784                         (ip_block->version->minor >= minor))))
1785                 return 0;
1786
1787         return 1;
1788 }
1789
1790 /**
1791  * amdgpu_device_ip_block_add
1792  *
1793  * @adev: amdgpu_device pointer
1794  * @ip_block_version: pointer to the IP to add
1795  *
1796  * Adds the IP block driver information to the collection of IPs
1797  * on the asic.
1798  */
1799 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1800                                const struct amdgpu_ip_block_version *ip_block_version)
1801 {
1802         if (!ip_block_version)
1803                 return -EINVAL;
1804
1805         switch (ip_block_version->type) {
1806         case AMD_IP_BLOCK_TYPE_VCN:
1807                 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
1808                         return 0;
1809                 break;
1810         case AMD_IP_BLOCK_TYPE_JPEG:
1811                 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
1812                         return 0;
1813                 break;
1814         default:
1815                 break;
1816         }
1817
1818         DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1819                   ip_block_version->funcs->name);
1820
1821         adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1822
1823         return 0;
1824 }
1825
1826 /**
1827  * amdgpu_device_enable_virtual_display - enable virtual display feature
1828  *
1829  * @adev: amdgpu_device pointer
1830  *
1831  * Enabled the virtual display feature if the user has enabled it via
1832  * the module parameter virtual_display.  This feature provides a virtual
1833  * display hardware on headless boards or in virtualized environments.
1834  * This function parses and validates the configuration string specified by
1835  * the user and configues the virtual display configuration (number of
1836  * virtual connectors, crtcs, etc.) specified.
1837  */
1838 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1839 {
1840         adev->enable_virtual_display = false;
1841
1842         if (amdgpu_virtual_display) {
1843                 const char *pci_address_name = pci_name(adev->pdev);
1844                 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1845
1846                 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1847                 pciaddstr_tmp = pciaddstr;
1848                 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1849                         pciaddname = strsep(&pciaddname_tmp, ",");
1850                         if (!strcmp("all", pciaddname)
1851                             || !strcmp(pci_address_name, pciaddname)) {
1852                                 long num_crtc;
1853                                 int res = -1;
1854
1855                                 adev->enable_virtual_display = true;
1856
1857                                 if (pciaddname_tmp)
1858                                         res = kstrtol(pciaddname_tmp, 10,
1859                                                       &num_crtc);
1860
1861                                 if (!res) {
1862                                         if (num_crtc < 1)
1863                                                 num_crtc = 1;
1864                                         if (num_crtc > 6)
1865                                                 num_crtc = 6;
1866                                         adev->mode_info.num_crtc = num_crtc;
1867                                 } else {
1868                                         adev->mode_info.num_crtc = 1;
1869                                 }
1870                                 break;
1871                         }
1872                 }
1873
1874                 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1875                          amdgpu_virtual_display, pci_address_name,
1876                          adev->enable_virtual_display, adev->mode_info.num_crtc);
1877
1878                 kfree(pciaddstr);
1879         }
1880 }
1881
1882 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
1883 {
1884         if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
1885                 adev->mode_info.num_crtc = 1;
1886                 adev->enable_virtual_display = true;
1887                 DRM_INFO("virtual_display:%d, num_crtc:%d\n",
1888                          adev->enable_virtual_display, adev->mode_info.num_crtc);
1889         }
1890 }
1891
1892 /**
1893  * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1894  *
1895  * @adev: amdgpu_device pointer
1896  *
1897  * Parses the asic configuration parameters specified in the gpu info
1898  * firmware and makes them availale to the driver for use in configuring
1899  * the asic.
1900  * Returns 0 on success, -EINVAL on failure.
1901  */
1902 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1903 {
1904         const char *chip_name;
1905         char fw_name[40];
1906         int err;
1907         const struct gpu_info_firmware_header_v1_0 *hdr;
1908
1909         adev->firmware.gpu_info_fw = NULL;
1910
1911         if (adev->mman.discovery_bin) {
1912                 /*
1913                  * FIXME: The bounding box is still needed by Navi12, so
1914                  * temporarily read it from gpu_info firmware. Should be dropped
1915                  * when DAL no longer needs it.
1916                  */
1917                 if (adev->asic_type != CHIP_NAVI12)
1918                         return 0;
1919         }
1920
1921         switch (adev->asic_type) {
1922         default:
1923                 return 0;
1924         case CHIP_VEGA10:
1925                 chip_name = "vega10";
1926                 break;
1927         case CHIP_VEGA12:
1928                 chip_name = "vega12";
1929                 break;
1930         case CHIP_RAVEN:
1931                 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1932                         chip_name = "raven2";
1933                 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1934                         chip_name = "picasso";
1935                 else
1936                         chip_name = "raven";
1937                 break;
1938         case CHIP_ARCTURUS:
1939                 chip_name = "arcturus";
1940                 break;
1941         case CHIP_NAVI12:
1942                 chip_name = "navi12";
1943                 break;
1944         }
1945
1946         snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1947         err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name);
1948         if (err) {
1949                 dev_err(adev->dev,
1950                         "Failed to get gpu_info firmware \"%s\"\n",
1951                         fw_name);
1952                 goto out;
1953         }
1954
1955         hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1956         amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1957
1958         switch (hdr->version_major) {
1959         case 1:
1960         {
1961                 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
1962                         (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
1963                                                                 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1964
1965                 /*
1966                  * Should be droped when DAL no longer needs it.
1967                  */
1968                 if (adev->asic_type == CHIP_NAVI12)
1969                         goto parse_soc_bounding_box;
1970
1971                 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1972                 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1973                 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1974                 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
1975                 adev->gfx.config.max_texture_channel_caches =
1976                         le32_to_cpu(gpu_info_fw->gc_num_tccs);
1977                 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1978                 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1979                 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1980                 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
1981                 adev->gfx.config.double_offchip_lds_buf =
1982                         le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1983                 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
1984                 adev->gfx.cu_info.max_waves_per_simd =
1985                         le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1986                 adev->gfx.cu_info.max_scratch_slots_per_cu =
1987                         le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
1988                 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
1989                 if (hdr->version_minor >= 1) {
1990                         const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1991                                 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1992                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1993                         adev->gfx.config.num_sc_per_sh =
1994                                 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
1995                         adev->gfx.config.num_packer_per_sc =
1996                                 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
1997                 }
1998
1999 parse_soc_bounding_box:
2000                 /*
2001                  * soc bounding box info is not integrated in disocovery table,
2002                  * we always need to parse it from gpu info firmware if needed.
2003                  */
2004                 if (hdr->version_minor == 2) {
2005                         const struct gpu_info_firmware_v1_2 *gpu_info_fw =
2006                                 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
2007                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2008                         adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
2009                 }
2010                 break;
2011         }
2012         default:
2013                 dev_err(adev->dev,
2014                         "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
2015                 err = -EINVAL;
2016                 goto out;
2017         }
2018 out:
2019         return err;
2020 }
2021
2022 /**
2023  * amdgpu_device_ip_early_init - run early init for hardware IPs
2024  *
2025  * @adev: amdgpu_device pointer
2026  *
2027  * Early initialization pass for hardware IPs.  The hardware IPs that make
2028  * up each asic are discovered each IP's early_init callback is run.  This
2029  * is the first stage in initializing the asic.
2030  * Returns 0 on success, negative error code on failure.
2031  */
2032 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
2033 {
2034         struct drm_device *dev = adev_to_drm(adev);
2035         struct pci_dev *parent;
2036         int i, r;
2037         bool total;
2038
2039         amdgpu_device_enable_virtual_display(adev);
2040
2041         if (amdgpu_sriov_vf(adev)) {
2042                 r = amdgpu_virt_request_full_gpu(adev, true);
2043                 if (r)
2044                         return r;
2045         }
2046
2047         switch (adev->asic_type) {
2048 #ifdef CONFIG_DRM_AMDGPU_SI
2049         case CHIP_VERDE:
2050         case CHIP_TAHITI:
2051         case CHIP_PITCAIRN:
2052         case CHIP_OLAND:
2053         case CHIP_HAINAN:
2054                 adev->family = AMDGPU_FAMILY_SI;
2055                 r = si_set_ip_blocks(adev);
2056                 if (r)
2057                         return r;
2058                 break;
2059 #endif
2060 #ifdef CONFIG_DRM_AMDGPU_CIK
2061         case CHIP_BONAIRE:
2062         case CHIP_HAWAII:
2063         case CHIP_KAVERI:
2064         case CHIP_KABINI:
2065         case CHIP_MULLINS:
2066                 if (adev->flags & AMD_IS_APU)
2067                         adev->family = AMDGPU_FAMILY_KV;
2068                 else
2069                         adev->family = AMDGPU_FAMILY_CI;
2070
2071                 r = cik_set_ip_blocks(adev);
2072                 if (r)
2073                         return r;
2074                 break;
2075 #endif
2076         case CHIP_TOPAZ:
2077         case CHIP_TONGA:
2078         case CHIP_FIJI:
2079         case CHIP_POLARIS10:
2080         case CHIP_POLARIS11:
2081         case CHIP_POLARIS12:
2082         case CHIP_VEGAM:
2083         case CHIP_CARRIZO:
2084         case CHIP_STONEY:
2085                 if (adev->flags & AMD_IS_APU)
2086                         adev->family = AMDGPU_FAMILY_CZ;
2087                 else
2088                         adev->family = AMDGPU_FAMILY_VI;
2089
2090                 r = vi_set_ip_blocks(adev);
2091                 if (r)
2092                         return r;
2093                 break;
2094         default:
2095                 r = amdgpu_discovery_set_ip_blocks(adev);
2096                 if (r)
2097                         return r;
2098                 break;
2099         }
2100
2101         if (amdgpu_has_atpx() &&
2102             (amdgpu_is_atpx_hybrid() ||
2103              amdgpu_has_atpx_dgpu_power_cntl()) &&
2104             ((adev->flags & AMD_IS_APU) == 0) &&
2105             !pci_is_thunderbolt_attached(to_pci_dev(dev->dev)))
2106                 adev->flags |= AMD_IS_PX;
2107
2108         if (!(adev->flags & AMD_IS_APU)) {
2109                 parent = pci_upstream_bridge(adev->pdev);
2110                 adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
2111         }
2112
2113
2114         adev->pm.pp_feature = amdgpu_pp_feature_mask;
2115         if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2116                 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2117         if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2118                 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
2119
2120         total = true;
2121         for (i = 0; i < adev->num_ip_blocks; i++) {
2122                 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2123                         DRM_WARN("disabled ip block: %d <%s>\n",
2124                                   i, adev->ip_blocks[i].version->funcs->name);
2125                         adev->ip_blocks[i].status.valid = false;
2126                 } else {
2127                         if (adev->ip_blocks[i].version->funcs->early_init) {
2128                                 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2129                                 if (r == -ENOENT) {
2130                                         adev->ip_blocks[i].status.valid = false;
2131                                 } else if (r) {
2132                                         DRM_ERROR("early_init of IP block <%s> failed %d\n",
2133                                                   adev->ip_blocks[i].version->funcs->name, r);
2134                                         total = false;
2135                                 } else {
2136                                         adev->ip_blocks[i].status.valid = true;
2137                                 }
2138                         } else {
2139                                 adev->ip_blocks[i].status.valid = true;
2140                         }
2141                 }
2142                 /* get the vbios after the asic_funcs are set up */
2143                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2144                         r = amdgpu_device_parse_gpu_info_fw(adev);
2145                         if (r)
2146                                 return r;
2147
2148                         /* Read BIOS */
2149                         if (amdgpu_device_read_bios(adev)) {
2150                                 if (!amdgpu_get_bios(adev))
2151                                         return -EINVAL;
2152
2153                                 r = amdgpu_atombios_init(adev);
2154                                 if (r) {
2155                                         dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2156                                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2157                                         return r;
2158                                 }
2159                         }
2160
2161                         /*get pf2vf msg info at it's earliest time*/
2162                         if (amdgpu_sriov_vf(adev))
2163                                 amdgpu_virt_init_data_exchange(adev);
2164
2165                 }
2166         }
2167         if (!total)
2168                 return -ENODEV;
2169
2170         amdgpu_amdkfd_device_probe(adev);
2171         adev->cg_flags &= amdgpu_cg_mask;
2172         adev->pg_flags &= amdgpu_pg_mask;
2173
2174         return 0;
2175 }
2176
2177 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2178 {
2179         int i, r;
2180
2181         for (i = 0; i < adev->num_ip_blocks; i++) {
2182                 if (!adev->ip_blocks[i].status.sw)
2183                         continue;
2184                 if (adev->ip_blocks[i].status.hw)
2185                         continue;
2186                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2187                     (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2188                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2189                         r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2190                         if (r) {
2191                                 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2192                                           adev->ip_blocks[i].version->funcs->name, r);
2193                                 return r;
2194                         }
2195                         adev->ip_blocks[i].status.hw = true;
2196                 }
2197         }
2198
2199         return 0;
2200 }
2201
2202 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2203 {
2204         int i, r;
2205
2206         for (i = 0; i < adev->num_ip_blocks; i++) {
2207                 if (!adev->ip_blocks[i].status.sw)
2208                         continue;
2209                 if (adev->ip_blocks[i].status.hw)
2210                         continue;
2211                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2212                 if (r) {
2213                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2214                                   adev->ip_blocks[i].version->funcs->name, r);
2215                         return r;
2216                 }
2217                 adev->ip_blocks[i].status.hw = true;
2218         }
2219
2220         return 0;
2221 }
2222
2223 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2224 {
2225         int r = 0;
2226         int i;
2227         uint32_t smu_version;
2228
2229         if (adev->asic_type >= CHIP_VEGA10) {
2230                 for (i = 0; i < adev->num_ip_blocks; i++) {
2231                         if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2232                                 continue;
2233
2234                         if (!adev->ip_blocks[i].status.sw)
2235                                 continue;
2236
2237                         /* no need to do the fw loading again if already done*/
2238                         if (adev->ip_blocks[i].status.hw == true)
2239                                 break;
2240
2241                         if (amdgpu_in_reset(adev) || adev->in_suspend) {
2242                                 r = adev->ip_blocks[i].version->funcs->resume(adev);
2243                                 if (r) {
2244                                         DRM_ERROR("resume of IP block <%s> failed %d\n",
2245                                                           adev->ip_blocks[i].version->funcs->name, r);
2246                                         return r;
2247                                 }
2248                         } else {
2249                                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2250                                 if (r) {
2251                                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2252                                                           adev->ip_blocks[i].version->funcs->name, r);
2253                                         return r;
2254                                 }
2255                         }
2256
2257                         adev->ip_blocks[i].status.hw = true;
2258                         break;
2259                 }
2260         }
2261
2262         if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2263                 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2264
2265         return r;
2266 }
2267
2268 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
2269 {
2270         long timeout;
2271         int r, i;
2272
2273         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
2274                 struct amdgpu_ring *ring = adev->rings[i];
2275
2276                 /* No need to setup the GPU scheduler for rings that don't need it */
2277                 if (!ring || ring->no_scheduler)
2278                         continue;
2279
2280                 switch (ring->funcs->type) {
2281                 case AMDGPU_RING_TYPE_GFX:
2282                         timeout = adev->gfx_timeout;
2283                         break;
2284                 case AMDGPU_RING_TYPE_COMPUTE:
2285                         timeout = adev->compute_timeout;
2286                         break;
2287                 case AMDGPU_RING_TYPE_SDMA:
2288                         timeout = adev->sdma_timeout;
2289                         break;
2290                 default:
2291                         timeout = adev->video_timeout;
2292                         break;
2293                 }
2294
2295                 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
2296                                    ring->num_hw_submission, 0,
2297                                    timeout, adev->reset_domain->wq,
2298                                    ring->sched_score, ring->name,
2299                                    adev->dev);
2300                 if (r) {
2301                         DRM_ERROR("Failed to create scheduler on ring %s.\n",
2302                                   ring->name);
2303                         return r;
2304                 }
2305         }
2306
2307         amdgpu_xcp_update_partition_sched_list(adev);
2308
2309         return 0;
2310 }
2311
2312
2313 /**
2314  * amdgpu_device_ip_init - run init for hardware IPs
2315  *
2316  * @adev: amdgpu_device pointer
2317  *
2318  * Main initialization pass for hardware IPs.  The list of all the hardware
2319  * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2320  * are run.  sw_init initializes the software state associated with each IP
2321  * and hw_init initializes the hardware associated with each IP.
2322  * Returns 0 on success, negative error code on failure.
2323  */
2324 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2325 {
2326         int i, r;
2327
2328         r = amdgpu_ras_init(adev);
2329         if (r)
2330                 return r;
2331
2332         for (i = 0; i < adev->num_ip_blocks; i++) {
2333                 if (!adev->ip_blocks[i].status.valid)
2334                         continue;
2335                 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2336                 if (r) {
2337                         DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2338                                   adev->ip_blocks[i].version->funcs->name, r);
2339                         goto init_failed;
2340                 }
2341                 adev->ip_blocks[i].status.sw = true;
2342
2343                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2344                         /* need to do common hw init early so everything is set up for gmc */
2345                         r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2346                         if (r) {
2347                                 DRM_ERROR("hw_init %d failed %d\n", i, r);
2348                                 goto init_failed;
2349                         }
2350                         adev->ip_blocks[i].status.hw = true;
2351                 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2352                         /* need to do gmc hw init early so we can allocate gpu mem */
2353                         /* Try to reserve bad pages early */
2354                         if (amdgpu_sriov_vf(adev))
2355                                 amdgpu_virt_exchange_data(adev);
2356
2357                         r = amdgpu_device_mem_scratch_init(adev);
2358                         if (r) {
2359                                 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r);
2360                                 goto init_failed;
2361                         }
2362                         r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2363                         if (r) {
2364                                 DRM_ERROR("hw_init %d failed %d\n", i, r);
2365                                 goto init_failed;
2366                         }
2367                         r = amdgpu_device_wb_init(adev);
2368                         if (r) {
2369                                 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2370                                 goto init_failed;
2371                         }
2372                         adev->ip_blocks[i].status.hw = true;
2373
2374                         /* right after GMC hw init, we create CSA */
2375                         if (adev->gfx.mcbp) {
2376                                 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2377                                                                AMDGPU_GEM_DOMAIN_VRAM |
2378                                                                AMDGPU_GEM_DOMAIN_GTT,
2379                                                                AMDGPU_CSA_SIZE);
2380                                 if (r) {
2381                                         DRM_ERROR("allocate CSA failed %d\n", r);
2382                                         goto init_failed;
2383                                 }
2384                         }
2385                 }
2386         }
2387
2388         if (amdgpu_sriov_vf(adev))
2389                 amdgpu_virt_init_data_exchange(adev);
2390
2391         r = amdgpu_ib_pool_init(adev);
2392         if (r) {
2393                 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2394                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2395                 goto init_failed;
2396         }
2397
2398         r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2399         if (r)
2400                 goto init_failed;
2401
2402         r = amdgpu_device_ip_hw_init_phase1(adev);
2403         if (r)
2404                 goto init_failed;
2405
2406         r = amdgpu_device_fw_loading(adev);
2407         if (r)
2408                 goto init_failed;
2409
2410         r = amdgpu_device_ip_hw_init_phase2(adev);
2411         if (r)
2412                 goto init_failed;
2413
2414         /*
2415          * retired pages will be loaded from eeprom and reserved here,
2416          * it should be called after amdgpu_device_ip_hw_init_phase2  since
2417          * for some ASICs the RAS EEPROM code relies on SMU fully functioning
2418          * for I2C communication which only true at this point.
2419          *
2420          * amdgpu_ras_recovery_init may fail, but the upper only cares the
2421          * failure from bad gpu situation and stop amdgpu init process
2422          * accordingly. For other failed cases, it will still release all
2423          * the resource and print error message, rather than returning one
2424          * negative value to upper level.
2425          *
2426          * Note: theoretically, this should be called before all vram allocations
2427          * to protect retired page from abusing
2428          */
2429         r = amdgpu_ras_recovery_init(adev);
2430         if (r)
2431                 goto init_failed;
2432
2433         /**
2434          * In case of XGMI grab extra reference for reset domain for this device
2435          */
2436         if (adev->gmc.xgmi.num_physical_nodes > 1) {
2437                 if (amdgpu_xgmi_add_device(adev) == 0) {
2438                         if (!amdgpu_sriov_vf(adev)) {
2439                                 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
2440
2441                                 if (WARN_ON(!hive)) {
2442                                         r = -ENOENT;
2443                                         goto init_failed;
2444                                 }
2445
2446                                 if (!hive->reset_domain ||
2447                                     !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
2448                                         r = -ENOENT;
2449                                         amdgpu_put_xgmi_hive(hive);
2450                                         goto init_failed;
2451                                 }
2452
2453                                 /* Drop the early temporary reset domain we created for device */
2454                                 amdgpu_reset_put_reset_domain(adev->reset_domain);
2455                                 adev->reset_domain = hive->reset_domain;
2456                                 amdgpu_put_xgmi_hive(hive);
2457                         }
2458                 }
2459         }
2460
2461         r = amdgpu_device_init_schedulers(adev);
2462         if (r)
2463                 goto init_failed;
2464
2465         /* Don't init kfd if whole hive need to be reset during init */
2466         if (!adev->gmc.xgmi.pending_reset) {
2467                 kgd2kfd_init_zone_device(adev);
2468                 amdgpu_amdkfd_device_init(adev);
2469         }
2470
2471         amdgpu_fru_get_product_info(adev);
2472
2473 init_failed:
2474
2475         return r;
2476 }
2477
2478 /**
2479  * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2480  *
2481  * @adev: amdgpu_device pointer
2482  *
2483  * Writes a reset magic value to the gart pointer in VRAM.  The driver calls
2484  * this function before a GPU reset.  If the value is retained after a
2485  * GPU reset, VRAM has not been lost.  Some GPU resets may destry VRAM contents.
2486  */
2487 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2488 {
2489         memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2490 }
2491
2492 /**
2493  * amdgpu_device_check_vram_lost - check if vram is valid
2494  *
2495  * @adev: amdgpu_device pointer
2496  *
2497  * Checks the reset magic value written to the gart pointer in VRAM.
2498  * The driver calls this after a GPU reset to see if the contents of
2499  * VRAM is lost or now.
2500  * returns true if vram is lost, false if not.
2501  */
2502 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2503 {
2504         if (memcmp(adev->gart.ptr, adev->reset_magic,
2505                         AMDGPU_RESET_MAGIC_NUM))
2506                 return true;
2507
2508         if (!amdgpu_in_reset(adev))
2509                 return false;
2510
2511         /*
2512          * For all ASICs with baco/mode1 reset, the VRAM is
2513          * always assumed to be lost.
2514          */
2515         switch (amdgpu_asic_reset_method(adev)) {
2516         case AMD_RESET_METHOD_BACO:
2517         case AMD_RESET_METHOD_MODE1:
2518                 return true;
2519         default:
2520                 return false;
2521         }
2522 }
2523
2524 /**
2525  * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2526  *
2527  * @adev: amdgpu_device pointer
2528  * @state: clockgating state (gate or ungate)
2529  *
2530  * The list of all the hardware IPs that make up the asic is walked and the
2531  * set_clockgating_state callbacks are run.
2532  * Late initialization pass enabling clockgating for hardware IPs.
2533  * Fini or suspend, pass disabling clockgating for hardware IPs.
2534  * Returns 0 on success, negative error code on failure.
2535  */
2536
2537 int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2538                                enum amd_clockgating_state state)
2539 {
2540         int i, j, r;
2541
2542         if (amdgpu_emu_mode == 1)
2543                 return 0;
2544
2545         for (j = 0; j < adev->num_ip_blocks; j++) {
2546                 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2547                 if (!adev->ip_blocks[i].status.late_initialized)
2548                         continue;
2549                 /* skip CG for GFX, SDMA on S0ix */
2550                 if (adev->in_s0ix &&
2551                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
2552                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
2553                         continue;
2554                 /* skip CG for VCE/UVD, it's handled specially */
2555                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2556                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2557                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2558                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2559                     adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2560                         /* enable clockgating to save power */
2561                         r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2562                                                                                      state);
2563                         if (r) {
2564                                 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2565                                           adev->ip_blocks[i].version->funcs->name, r);
2566                                 return r;
2567                         }
2568                 }
2569         }
2570
2571         return 0;
2572 }
2573
2574 int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
2575                                enum amd_powergating_state state)
2576 {
2577         int i, j, r;
2578
2579         if (amdgpu_emu_mode == 1)
2580                 return 0;
2581
2582         for (j = 0; j < adev->num_ip_blocks; j++) {
2583                 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2584                 if (!adev->ip_blocks[i].status.late_initialized)
2585                         continue;
2586                 /* skip PG for GFX, SDMA on S0ix */
2587                 if (adev->in_s0ix &&
2588                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
2589                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
2590                         continue;
2591                 /* skip CG for VCE/UVD, it's handled specially */
2592                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2593                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2594                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2595                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2596                     adev->ip_blocks[i].version->funcs->set_powergating_state) {
2597                         /* enable powergating to save power */
2598                         r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2599                                                                                         state);
2600                         if (r) {
2601                                 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2602                                           adev->ip_blocks[i].version->funcs->name, r);
2603                                 return r;
2604                         }
2605                 }
2606         }
2607         return 0;
2608 }
2609
2610 static int amdgpu_device_enable_mgpu_fan_boost(void)
2611 {
2612         struct amdgpu_gpu_instance *gpu_ins;
2613         struct amdgpu_device *adev;
2614         int i, ret = 0;
2615
2616         mutex_lock(&mgpu_info.mutex);
2617
2618         /*
2619          * MGPU fan boost feature should be enabled
2620          * only when there are two or more dGPUs in
2621          * the system
2622          */
2623         if (mgpu_info.num_dgpu < 2)
2624                 goto out;
2625
2626         for (i = 0; i < mgpu_info.num_dgpu; i++) {
2627                 gpu_ins = &(mgpu_info.gpu_ins[i]);
2628                 adev = gpu_ins->adev;
2629                 if (!(adev->flags & AMD_IS_APU) &&
2630                     !gpu_ins->mgpu_fan_enabled) {
2631                         ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2632                         if (ret)
2633                                 break;
2634
2635                         gpu_ins->mgpu_fan_enabled = 1;
2636                 }
2637         }
2638
2639 out:
2640         mutex_unlock(&mgpu_info.mutex);
2641
2642         return ret;
2643 }
2644
2645 /**
2646  * amdgpu_device_ip_late_init - run late init for hardware IPs
2647  *
2648  * @adev: amdgpu_device pointer
2649  *
2650  * Late initialization pass for hardware IPs.  The list of all the hardware
2651  * IPs that make up the asic is walked and the late_init callbacks are run.
2652  * late_init covers any special initialization that an IP requires
2653  * after all of the have been initialized or something that needs to happen
2654  * late in the init process.
2655  * Returns 0 on success, negative error code on failure.
2656  */
2657 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2658 {
2659         struct amdgpu_gpu_instance *gpu_instance;
2660         int i = 0, r;
2661
2662         for (i = 0; i < adev->num_ip_blocks; i++) {
2663                 if (!adev->ip_blocks[i].status.hw)
2664                         continue;
2665                 if (adev->ip_blocks[i].version->funcs->late_init) {
2666                         r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2667                         if (r) {
2668                                 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2669                                           adev->ip_blocks[i].version->funcs->name, r);
2670                                 return r;
2671                         }
2672                 }
2673                 adev->ip_blocks[i].status.late_initialized = true;
2674         }
2675
2676         r = amdgpu_ras_late_init(adev);
2677         if (r) {
2678                 DRM_ERROR("amdgpu_ras_late_init failed %d", r);
2679                 return r;
2680         }
2681
2682         amdgpu_ras_set_error_query_ready(adev, true);
2683
2684         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2685         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2686
2687         amdgpu_device_fill_reset_magic(adev);
2688
2689         r = amdgpu_device_enable_mgpu_fan_boost();
2690         if (r)
2691                 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2692
2693         /* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */
2694         if (amdgpu_passthrough(adev) &&
2695             ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
2696              adev->asic_type == CHIP_ALDEBARAN))
2697                 amdgpu_dpm_handle_passthrough_sbr(adev, true);
2698
2699         if (adev->gmc.xgmi.num_physical_nodes > 1) {
2700                 mutex_lock(&mgpu_info.mutex);
2701
2702                 /*
2703                  * Reset device p-state to low as this was booted with high.
2704                  *
2705                  * This should be performed only after all devices from the same
2706                  * hive get initialized.
2707                  *
2708                  * However, it's unknown how many device in the hive in advance.
2709                  * As this is counted one by one during devices initializations.
2710                  *
2711                  * So, we wait for all XGMI interlinked devices initialized.
2712                  * This may bring some delays as those devices may come from
2713                  * different hives. But that should be OK.
2714                  */
2715                 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2716                         for (i = 0; i < mgpu_info.num_gpu; i++) {
2717                                 gpu_instance = &(mgpu_info.gpu_ins[i]);
2718                                 if (gpu_instance->adev->flags & AMD_IS_APU)
2719                                         continue;
2720
2721                                 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2722                                                 AMDGPU_XGMI_PSTATE_MIN);
2723                                 if (r) {
2724                                         DRM_ERROR("pstate setting failed (%d).\n", r);
2725                                         break;
2726                                 }
2727                         }
2728                 }
2729
2730                 mutex_unlock(&mgpu_info.mutex);
2731         }
2732
2733         return 0;
2734 }
2735
2736 /**
2737  * amdgpu_device_smu_fini_early - smu hw_fini wrapper
2738  *
2739  * @adev: amdgpu_device pointer
2740  *
2741  * For ASICs need to disable SMC first
2742  */
2743 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
2744 {
2745         int i, r;
2746
2747         if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))
2748                 return;
2749
2750         for (i = 0; i < adev->num_ip_blocks; i++) {
2751                 if (!adev->ip_blocks[i].status.hw)
2752                         continue;
2753                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2754                         r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2755                         /* XXX handle errors */
2756                         if (r) {
2757                                 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2758                                           adev->ip_blocks[i].version->funcs->name, r);
2759                         }
2760                         adev->ip_blocks[i].status.hw = false;
2761                         break;
2762                 }
2763         }
2764 }
2765
2766 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
2767 {
2768         int i, r;
2769
2770         for (i = 0; i < adev->num_ip_blocks; i++) {
2771                 if (!adev->ip_blocks[i].version->funcs->early_fini)
2772                         continue;
2773
2774                 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev);
2775                 if (r) {
2776                         DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
2777                                   adev->ip_blocks[i].version->funcs->name, r);
2778                 }
2779         }
2780
2781         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2782         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2783
2784         amdgpu_amdkfd_suspend(adev, false);
2785
2786         /* Workaroud for ASICs need to disable SMC first */
2787         amdgpu_device_smu_fini_early(adev);
2788
2789         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2790                 if (!adev->ip_blocks[i].status.hw)
2791                         continue;
2792
2793                 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2794                 /* XXX handle errors */
2795                 if (r) {
2796                         DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2797                                   adev->ip_blocks[i].version->funcs->name, r);
2798                 }
2799
2800                 adev->ip_blocks[i].status.hw = false;
2801         }
2802
2803         if (amdgpu_sriov_vf(adev)) {
2804                 if (amdgpu_virt_release_full_gpu(adev, false))
2805                         DRM_ERROR("failed to release exclusive mode on fini\n");
2806         }
2807
2808         return 0;
2809 }
2810
2811 /**
2812  * amdgpu_device_ip_fini - run fini for hardware IPs
2813  *
2814  * @adev: amdgpu_device pointer
2815  *
2816  * Main teardown pass for hardware IPs.  The list of all the hardware
2817  * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2818  * are run.  hw_fini tears down the hardware associated with each IP
2819  * and sw_fini tears down any software state associated with each IP.
2820  * Returns 0 on success, negative error code on failure.
2821  */
2822 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2823 {
2824         int i, r;
2825
2826         if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2827                 amdgpu_virt_release_ras_err_handler_data(adev);
2828
2829         if (adev->gmc.xgmi.num_physical_nodes > 1)
2830                 amdgpu_xgmi_remove_device(adev);
2831
2832         amdgpu_amdkfd_device_fini_sw(adev);
2833
2834         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2835                 if (!adev->ip_blocks[i].status.sw)
2836                         continue;
2837
2838                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2839                         amdgpu_ucode_free_bo(adev);
2840                         amdgpu_free_static_csa(&adev->virt.csa_obj);
2841                         amdgpu_device_wb_fini(adev);
2842                         amdgpu_device_mem_scratch_fini(adev);
2843                         amdgpu_ib_pool_fini(adev);
2844                 }
2845
2846                 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2847                 /* XXX handle errors */
2848                 if (r) {
2849                         DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2850                                   adev->ip_blocks[i].version->funcs->name, r);
2851                 }
2852                 adev->ip_blocks[i].status.sw = false;
2853                 adev->ip_blocks[i].status.valid = false;
2854         }
2855
2856         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2857                 if (!adev->ip_blocks[i].status.late_initialized)
2858                         continue;
2859                 if (adev->ip_blocks[i].version->funcs->late_fini)
2860                         adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2861                 adev->ip_blocks[i].status.late_initialized = false;
2862         }
2863
2864         amdgpu_ras_fini(adev);
2865
2866         return 0;
2867 }
2868
2869 /**
2870  * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2871  *
2872  * @work: work_struct.
2873  */
2874 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2875 {
2876         struct amdgpu_device *adev =
2877                 container_of(work, struct amdgpu_device, delayed_init_work.work);
2878         int r;
2879
2880         r = amdgpu_ib_ring_tests(adev);
2881         if (r)
2882                 DRM_ERROR("ib ring test failed (%d).\n", r);
2883 }
2884
2885 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2886 {
2887         struct amdgpu_device *adev =
2888                 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2889
2890         WARN_ON_ONCE(adev->gfx.gfx_off_state);
2891         WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
2892
2893         if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2894                 adev->gfx.gfx_off_state = true;
2895 }
2896
2897 /**
2898  * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2899  *
2900  * @adev: amdgpu_device pointer
2901  *
2902  * Main suspend function for hardware IPs.  The list of all the hardware
2903  * IPs that make up the asic is walked, clockgating is disabled and the
2904  * suspend callbacks are run.  suspend puts the hardware and software state
2905  * in each IP into a state suitable for suspend.
2906  * Returns 0 on success, negative error code on failure.
2907  */
2908 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2909 {
2910         int i, r;
2911
2912         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2913         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2914
2915         /*
2916          * Per PMFW team's suggestion, driver needs to handle gfxoff
2917          * and df cstate features disablement for gpu reset(e.g. Mode1Reset)
2918          * scenario. Add the missing df cstate disablement here.
2919          */
2920         if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
2921                 dev_warn(adev->dev, "Failed to disallow df cstate");
2922
2923         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2924                 if (!adev->ip_blocks[i].status.valid)
2925                         continue;
2926
2927                 /* displays are handled separately */
2928                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2929                         continue;
2930
2931                 /* XXX handle errors */
2932                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2933                 /* XXX handle errors */
2934                 if (r) {
2935                         DRM_ERROR("suspend of IP block <%s> failed %d\n",
2936                                   adev->ip_blocks[i].version->funcs->name, r);
2937                         return r;
2938                 }
2939
2940                 adev->ip_blocks[i].status.hw = false;
2941         }
2942
2943         return 0;
2944 }
2945
2946 /**
2947  * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2948  *
2949  * @adev: amdgpu_device pointer
2950  *
2951  * Main suspend function for hardware IPs.  The list of all the hardware
2952  * IPs that make up the asic is walked, clockgating is disabled and the
2953  * suspend callbacks are run.  suspend puts the hardware and software state
2954  * in each IP into a state suitable for suspend.
2955  * Returns 0 on success, negative error code on failure.
2956  */
2957 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2958 {
2959         int i, r;
2960
2961         if (adev->in_s0ix)
2962                 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry);
2963
2964         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2965                 if (!adev->ip_blocks[i].status.valid)
2966                         continue;
2967                 /* displays are handled in phase1 */
2968                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2969                         continue;
2970                 /* PSP lost connection when err_event_athub occurs */
2971                 if (amdgpu_ras_intr_triggered() &&
2972                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2973                         adev->ip_blocks[i].status.hw = false;
2974                         continue;
2975                 }
2976
2977                 /* skip unnecessary suspend if we do not initialize them yet */
2978                 if (adev->gmc.xgmi.pending_reset &&
2979                     !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2980                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
2981                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2982                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
2983                         adev->ip_blocks[i].status.hw = false;
2984                         continue;
2985                 }
2986
2987                 /* skip suspend of gfx/mes and psp for S0ix
2988                  * gfx is in gfxoff state, so on resume it will exit gfxoff just
2989                  * like at runtime. PSP is also part of the always on hardware
2990                  * so no need to suspend it.
2991                  */
2992                 if (adev->in_s0ix &&
2993                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
2994                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
2995                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES))
2996                         continue;
2997
2998                 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */
2999                 if (adev->in_s0ix &&
3000                     (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) &&
3001                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3002                         continue;
3003
3004                 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot.
3005                  * These are in TMR, hence are expected to be reused by PSP-TOS to reload
3006                  * from this location and RLC Autoload automatically also gets loaded
3007                  * from here based on PMFW -> PSP message during re-init sequence.
3008                  * Therefore, the psp suspend & resume should be skipped to avoid destroy
3009                  * the TMR and reload FWs again for IMU enabled APU ASICs.
3010                  */
3011                 if (amdgpu_in_reset(adev) &&
3012                     (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs &&
3013                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3014                         continue;
3015
3016                 /* XXX handle errors */
3017                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
3018                 /* XXX handle errors */
3019                 if (r) {
3020                         DRM_ERROR("suspend of IP block <%s> failed %d\n",
3021                                   adev->ip_blocks[i].version->funcs->name, r);
3022                 }
3023                 adev->ip_blocks[i].status.hw = false;
3024                 /* handle putting the SMC in the appropriate state */
3025                 if (!amdgpu_sriov_vf(adev)) {
3026                         if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3027                                 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
3028                                 if (r) {
3029                                         DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
3030                                                         adev->mp1_state, r);
3031                                         return r;
3032                                 }
3033                         }
3034                 }
3035         }
3036
3037         return 0;
3038 }
3039
3040 /**
3041  * amdgpu_device_ip_suspend - run suspend for hardware IPs
3042  *
3043  * @adev: amdgpu_device pointer
3044  *
3045  * Main suspend function for hardware IPs.  The list of all the hardware
3046  * IPs that make up the asic is walked, clockgating is disabled and the
3047  * suspend callbacks are run.  suspend puts the hardware and software state
3048  * in each IP into a state suitable for suspend.
3049  * Returns 0 on success, negative error code on failure.
3050  */
3051 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
3052 {
3053         int r;
3054
3055         if (amdgpu_sriov_vf(adev)) {
3056                 amdgpu_virt_fini_data_exchange(adev);
3057                 amdgpu_virt_request_full_gpu(adev, false);
3058         }
3059
3060         r = amdgpu_device_ip_suspend_phase1(adev);
3061         if (r)
3062                 return r;
3063         r = amdgpu_device_ip_suspend_phase2(adev);
3064
3065         if (amdgpu_sriov_vf(adev))
3066                 amdgpu_virt_release_full_gpu(adev, false);
3067
3068         return r;
3069 }
3070
3071 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
3072 {
3073         int i, r;
3074
3075         static enum amd_ip_block_type ip_order[] = {
3076                 AMD_IP_BLOCK_TYPE_COMMON,
3077                 AMD_IP_BLOCK_TYPE_GMC,
3078                 AMD_IP_BLOCK_TYPE_PSP,
3079                 AMD_IP_BLOCK_TYPE_IH,
3080         };
3081
3082         for (i = 0; i < adev->num_ip_blocks; i++) {
3083                 int j;
3084                 struct amdgpu_ip_block *block;
3085
3086                 block = &adev->ip_blocks[i];
3087                 block->status.hw = false;
3088
3089                 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
3090
3091                         if (block->version->type != ip_order[j] ||
3092                                 !block->status.valid)
3093                                 continue;
3094
3095                         r = block->version->funcs->hw_init(adev);
3096                         DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3097                         if (r)
3098                                 return r;
3099                         block->status.hw = true;
3100                 }
3101         }
3102
3103         return 0;
3104 }
3105
3106 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
3107 {
3108         int i, r;
3109
3110         static enum amd_ip_block_type ip_order[] = {
3111                 AMD_IP_BLOCK_TYPE_SMC,
3112                 AMD_IP_BLOCK_TYPE_DCE,
3113                 AMD_IP_BLOCK_TYPE_GFX,
3114                 AMD_IP_BLOCK_TYPE_SDMA,
3115                 AMD_IP_BLOCK_TYPE_MES,
3116                 AMD_IP_BLOCK_TYPE_UVD,
3117                 AMD_IP_BLOCK_TYPE_VCE,
3118                 AMD_IP_BLOCK_TYPE_VCN,
3119                 AMD_IP_BLOCK_TYPE_JPEG
3120         };
3121
3122         for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
3123                 int j;
3124                 struct amdgpu_ip_block *block;
3125
3126                 for (j = 0; j < adev->num_ip_blocks; j++) {
3127                         block = &adev->ip_blocks[j];
3128
3129                         if (block->version->type != ip_order[i] ||
3130                                 !block->status.valid ||
3131                                 block->status.hw)
3132                                 continue;
3133
3134                         if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
3135                                 r = block->version->funcs->resume(adev);
3136                         else
3137                                 r = block->version->funcs->hw_init(adev);
3138
3139                         DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3140                         if (r)
3141                                 return r;
3142                         block->status.hw = true;
3143                 }
3144         }
3145
3146         return 0;
3147 }
3148
3149 /**
3150  * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
3151  *
3152  * @adev: amdgpu_device pointer
3153  *
3154  * First resume function for hardware IPs.  The list of all the hardware
3155  * IPs that make up the asic is walked and the resume callbacks are run for
3156  * COMMON, GMC, and IH.  resume puts the hardware into a functional state
3157  * after a suspend and updates the software state as necessary.  This
3158  * function is also used for restoring the GPU after a GPU reset.
3159  * Returns 0 on success, negative error code on failure.
3160  */
3161 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
3162 {
3163         int i, r;
3164
3165         for (i = 0; i < adev->num_ip_blocks; i++) {
3166                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3167                         continue;
3168                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3169                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3170                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3171                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {
3172
3173                         r = adev->ip_blocks[i].version->funcs->resume(adev);
3174                         if (r) {
3175                                 DRM_ERROR("resume of IP block <%s> failed %d\n",
3176                                           adev->ip_blocks[i].version->funcs->name, r);
3177                                 return r;
3178                         }
3179                         adev->ip_blocks[i].status.hw = true;
3180                 }
3181         }
3182
3183         return 0;
3184 }
3185
3186 /**
3187  * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3188  *
3189  * @adev: amdgpu_device pointer
3190  *
3191  * First resume function for hardware IPs.  The list of all the hardware
3192  * IPs that make up the asic is walked and the resume callbacks are run for
3193  * all blocks except COMMON, GMC, and IH.  resume puts the hardware into a
3194  * functional state after a suspend and updates the software state as
3195  * necessary.  This function is also used for restoring the GPU after a GPU
3196  * reset.
3197  * Returns 0 on success, negative error code on failure.
3198  */
3199 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
3200 {
3201         int i, r;
3202
3203         for (i = 0; i < adev->num_ip_blocks; i++) {
3204                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3205                         continue;
3206                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3207                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3208                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3209                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3210                         continue;
3211                 r = adev->ip_blocks[i].version->funcs->resume(adev);
3212                 if (r) {
3213                         DRM_ERROR("resume of IP block <%s> failed %d\n",
3214                                   adev->ip_blocks[i].version->funcs->name, r);
3215                         return r;
3216                 }
3217                 adev->ip_blocks[i].status.hw = true;
3218         }
3219
3220         return 0;
3221 }
3222
3223 /**
3224  * amdgpu_device_ip_resume - run resume for hardware IPs
3225  *
3226  * @adev: amdgpu_device pointer
3227  *
3228  * Main resume function for hardware IPs.  The hardware IPs
3229  * are split into two resume functions because they are
3230  * also used in recovering from a GPU reset and some additional
3231  * steps need to be take between them.  In this case (S3/S4) they are
3232  * run sequentially.
3233  * Returns 0 on success, negative error code on failure.
3234  */
3235 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
3236 {
3237         int r;
3238
3239         if (!adev->in_s0ix) {
3240                 r = amdgpu_amdkfd_resume_iommu(adev);
3241                 if (r)
3242                         return r;
3243         }
3244
3245         r = amdgpu_device_ip_resume_phase1(adev);
3246         if (r)
3247                 return r;
3248
3249         r = amdgpu_device_fw_loading(adev);
3250         if (r)
3251                 return r;
3252
3253         r = amdgpu_device_ip_resume_phase2(adev);
3254
3255         return r;
3256 }
3257
3258 /**
3259  * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
3260  *
3261  * @adev: amdgpu_device pointer
3262  *
3263  * Query the VBIOS data tables to determine if the board supports SR-IOV.
3264  */
3265 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
3266 {
3267         if (amdgpu_sriov_vf(adev)) {
3268                 if (adev->is_atom_fw) {
3269                         if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
3270                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3271                 } else {
3272                         if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3273                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3274                 }
3275
3276                 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3277                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
3278         }
3279 }
3280
3281 /**
3282  * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3283  *
3284  * @asic_type: AMD asic type
3285  *
3286  * Check if there is DC (new modesetting infrastructre) support for an asic.
3287  * returns true if DC has support, false if not.
3288  */
3289 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3290 {
3291         switch (asic_type) {
3292 #ifdef CONFIG_DRM_AMDGPU_SI
3293         case CHIP_HAINAN:
3294 #endif
3295         case CHIP_TOPAZ:
3296                 /* chips with no display hardware */
3297                 return false;
3298 #if defined(CONFIG_DRM_AMD_DC)
3299         case CHIP_TAHITI:
3300         case CHIP_PITCAIRN:
3301         case CHIP_VERDE:
3302         case CHIP_OLAND:
3303                 /*
3304                  * We have systems in the wild with these ASICs that require
3305                  * LVDS and VGA support which is not supported with DC.
3306                  *
3307                  * Fallback to the non-DC driver here by default so as not to
3308                  * cause regressions.
3309                  */
3310 #if defined(CONFIG_DRM_AMD_DC_SI)
3311                 return amdgpu_dc > 0;
3312 #else
3313                 return false;
3314 #endif
3315         case CHIP_BONAIRE:
3316         case CHIP_KAVERI:
3317         case CHIP_KABINI:
3318         case CHIP_MULLINS:
3319                 /*
3320                  * We have systems in the wild with these ASICs that require
3321                  * VGA support which is not supported with DC.
3322                  *
3323                  * Fallback to the non-DC driver here by default so as not to
3324                  * cause regressions.
3325                  */
3326                 return amdgpu_dc > 0;
3327         default:
3328                 return amdgpu_dc != 0;
3329 #else
3330         default:
3331                 if (amdgpu_dc > 0)
3332                         DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n");
3333                 return false;
3334 #endif
3335         }
3336 }
3337
3338 /**
3339  * amdgpu_device_has_dc_support - check if dc is supported
3340  *
3341  * @adev: amdgpu_device pointer
3342  *
3343  * Returns true for supported, false for not supported
3344  */
3345 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3346 {
3347         if (adev->enable_virtual_display ||
3348             (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
3349                 return false;
3350
3351         return amdgpu_device_asic_has_dc_support(adev->asic_type);
3352 }
3353
3354 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3355 {
3356         struct amdgpu_device *adev =
3357                 container_of(__work, struct amdgpu_device, xgmi_reset_work);
3358         struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3359
3360         /* It's a bug to not have a hive within this function */
3361         if (WARN_ON(!hive))
3362                 return;
3363
3364         /*
3365          * Use task barrier to synchronize all xgmi reset works across the
3366          * hive. task_barrier_enter and task_barrier_exit will block
3367          * until all the threads running the xgmi reset works reach
3368          * those points. task_barrier_full will do both blocks.
3369          */
3370         if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3371
3372                 task_barrier_enter(&hive->tb);
3373                 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3374
3375                 if (adev->asic_reset_res)
3376                         goto fail;
3377
3378                 task_barrier_exit(&hive->tb);
3379                 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3380
3381                 if (adev->asic_reset_res)
3382                         goto fail;
3383
3384                 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops &&
3385                     adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
3386                         adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev);
3387         } else {
3388
3389                 task_barrier_full(&hive->tb);
3390                 adev->asic_reset_res =  amdgpu_asic_reset(adev);
3391         }
3392
3393 fail:
3394         if (adev->asic_reset_res)
3395                 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
3396                          adev->asic_reset_res, adev_to_drm(adev)->unique);
3397         amdgpu_put_xgmi_hive(hive);
3398 }
3399
3400 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3401 {
3402         char *input = amdgpu_lockup_timeout;
3403         char *timeout_setting = NULL;
3404         int index = 0;
3405         long timeout;
3406         int ret = 0;
3407
3408         /*
3409          * By default timeout for non compute jobs is 10000
3410          * and 60000 for compute jobs.
3411          * In SR-IOV or passthrough mode, timeout for compute
3412          * jobs are 60000 by default.
3413          */
3414         adev->gfx_timeout = msecs_to_jiffies(10000);
3415         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3416         if (amdgpu_sriov_vf(adev))
3417                 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
3418                                         msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
3419         else
3420                 adev->compute_timeout =  msecs_to_jiffies(60000);
3421
3422         if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3423                 while ((timeout_setting = strsep(&input, ",")) &&
3424                                 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3425                         ret = kstrtol(timeout_setting, 0, &timeout);
3426                         if (ret)
3427                                 return ret;
3428
3429                         if (timeout == 0) {
3430                                 index++;
3431                                 continue;
3432                         } else if (timeout < 0) {
3433                                 timeout = MAX_SCHEDULE_TIMEOUT;
3434                                 dev_warn(adev->dev, "lockup timeout disabled");
3435                                 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
3436                         } else {
3437                                 timeout = msecs_to_jiffies(timeout);
3438                         }
3439
3440                         switch (index++) {
3441                         case 0:
3442                                 adev->gfx_timeout = timeout;
3443                                 break;
3444                         case 1:
3445                                 adev->compute_timeout = timeout;
3446                                 break;
3447                         case 2:
3448                                 adev->sdma_timeout = timeout;
3449                                 break;
3450                         case 3:
3451                                 adev->video_timeout = timeout;
3452                                 break;
3453                         default:
3454                                 break;
3455                         }
3456                 }
3457                 /*
3458                  * There is only one value specified and
3459                  * it should apply to all non-compute jobs.
3460                  */
3461                 if (index == 1) {
3462                         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3463                         if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3464                                 adev->compute_timeout = adev->gfx_timeout;
3465                 }
3466         }
3467
3468         return ret;
3469 }
3470
3471 /**
3472  * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU
3473  *
3474  * @adev: amdgpu_device pointer
3475  *
3476  * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode
3477  */
3478 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
3479 {
3480         struct iommu_domain *domain;
3481
3482         domain = iommu_get_domain_for_dev(adev->dev);
3483         if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
3484                 adev->ram_is_direct_mapped = true;
3485 }
3486
3487 static const struct attribute *amdgpu_dev_attributes[] = {
3488         &dev_attr_pcie_replay_count.attr,
3489         NULL
3490 };
3491
3492 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev)
3493 {
3494         if (amdgpu_mcbp == 1)
3495                 adev->gfx.mcbp = true;
3496
3497         if ((adev->ip_versions[GC_HWIP][0] >= IP_VERSION(9, 0, 0)) &&
3498             (adev->ip_versions[GC_HWIP][0] < IP_VERSION(10, 0, 0)) &&
3499             adev->gfx.num_gfx_rings)
3500                 adev->gfx.mcbp = true;
3501
3502         if (amdgpu_sriov_vf(adev))
3503                 adev->gfx.mcbp = true;
3504
3505         if (adev->gfx.mcbp)
3506                 DRM_INFO("MCBP is enabled\n");
3507 }
3508
3509 /**
3510  * amdgpu_device_init - initialize the driver
3511  *
3512  * @adev: amdgpu_device pointer
3513  * @flags: driver flags
3514  *
3515  * Initializes the driver info and hw (all asics).
3516  * Returns 0 for success or an error on failure.
3517  * Called at driver startup.
3518  */
3519 int amdgpu_device_init(struct amdgpu_device *adev,
3520                        uint32_t flags)
3521 {
3522         struct drm_device *ddev = adev_to_drm(adev);
3523         struct pci_dev *pdev = adev->pdev;
3524         int r, i;
3525         bool px = false;
3526         u32 max_MBps;
3527         int tmp;
3528
3529         adev->shutdown = false;
3530         adev->flags = flags;
3531
3532         if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3533                 adev->asic_type = amdgpu_force_asic_type;
3534         else
3535                 adev->asic_type = flags & AMD_ASIC_MASK;
3536
3537         adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3538         if (amdgpu_emu_mode == 1)
3539                 adev->usec_timeout *= 10;
3540         adev->gmc.gart_size = 512 * 1024 * 1024;
3541         adev->accel_working = false;
3542         adev->num_rings = 0;
3543         RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub());
3544         adev->mman.buffer_funcs = NULL;
3545         adev->mman.buffer_funcs_ring = NULL;
3546         adev->vm_manager.vm_pte_funcs = NULL;
3547         adev->vm_manager.vm_pte_num_scheds = 0;
3548         adev->gmc.gmc_funcs = NULL;
3549         adev->harvest_ip_mask = 0x0;
3550         adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3551         bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3552
3553         adev->smc_rreg = &amdgpu_invalid_rreg;
3554         adev->smc_wreg = &amdgpu_invalid_wreg;
3555         adev->pcie_rreg = &amdgpu_invalid_rreg;
3556         adev->pcie_wreg = &amdgpu_invalid_wreg;
3557         adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext;
3558         adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext;
3559         adev->pciep_rreg = &amdgpu_invalid_rreg;
3560         adev->pciep_wreg = &amdgpu_invalid_wreg;
3561         adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3562         adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3563         adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3564         adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3565         adev->didt_rreg = &amdgpu_invalid_rreg;
3566         adev->didt_wreg = &amdgpu_invalid_wreg;
3567         adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3568         adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3569         adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3570         adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3571
3572         DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3573                  amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3574                  pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3575
3576         /* mutex initialization are all done here so we
3577          * can recall function without having locking issues
3578          */
3579         mutex_init(&adev->firmware.mutex);
3580         mutex_init(&adev->pm.mutex);
3581         mutex_init(&adev->gfx.gpu_clock_mutex);
3582         mutex_init(&adev->srbm_mutex);
3583         mutex_init(&adev->gfx.pipe_reserve_mutex);
3584         mutex_init(&adev->gfx.gfx_off_mutex);
3585         mutex_init(&adev->gfx.partition_mutex);
3586         mutex_init(&adev->grbm_idx_mutex);
3587         mutex_init(&adev->mn_lock);
3588         mutex_init(&adev->virt.vf_errors.lock);
3589         hash_init(adev->mn_hash);
3590         mutex_init(&adev->psp.mutex);
3591         mutex_init(&adev->notifier_lock);
3592         mutex_init(&adev->pm.stable_pstate_ctx_lock);
3593         mutex_init(&adev->benchmark_mutex);
3594
3595         amdgpu_device_init_apu_flags(adev);
3596
3597         r = amdgpu_device_check_arguments(adev);
3598         if (r)
3599                 return r;
3600
3601         spin_lock_init(&adev->mmio_idx_lock);
3602         spin_lock_init(&adev->smc_idx_lock);
3603         spin_lock_init(&adev->pcie_idx_lock);
3604         spin_lock_init(&adev->uvd_ctx_idx_lock);
3605         spin_lock_init(&adev->didt_idx_lock);
3606         spin_lock_init(&adev->gc_cac_idx_lock);
3607         spin_lock_init(&adev->se_cac_idx_lock);
3608         spin_lock_init(&adev->audio_endpt_idx_lock);
3609         spin_lock_init(&adev->mm_stats.lock);
3610
3611         INIT_LIST_HEAD(&adev->shadow_list);
3612         mutex_init(&adev->shadow_list_lock);
3613
3614         INIT_LIST_HEAD(&adev->reset_list);
3615
3616         INIT_LIST_HEAD(&adev->ras_list);
3617
3618         INIT_DELAYED_WORK(&adev->delayed_init_work,
3619                           amdgpu_device_delayed_init_work_handler);
3620         INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3621                           amdgpu_device_delay_enable_gfx_off);
3622
3623         INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3624
3625         adev->gfx.gfx_off_req_count = 1;
3626         adev->gfx.gfx_off_residency = 0;
3627         adev->gfx.gfx_off_entrycount = 0;
3628         adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3629
3630         atomic_set(&adev->throttling_logging_enabled, 1);
3631         /*
3632          * If throttling continues, logging will be performed every minute
3633          * to avoid log flooding. "-1" is subtracted since the thermal
3634          * throttling interrupt comes every second. Thus, the total logging
3635          * interval is 59 seconds(retelimited printk interval) + 1(waiting
3636          * for throttling interrupt) = 60 seconds.
3637          */
3638         ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3639         ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3640
3641         /* Registers mapping */
3642         /* TODO: block userspace mapping of io register */
3643         if (adev->asic_type >= CHIP_BONAIRE) {
3644                 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3645                 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3646         } else {
3647                 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3648                 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3649         }
3650
3651         for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
3652                 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
3653
3654         adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3655         if (!adev->rmmio)
3656                 return -ENOMEM;
3657
3658         DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3659         DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size);
3660
3661         /*
3662          * Reset domain needs to be present early, before XGMI hive discovered
3663          * (if any) and intitialized to use reset sem and in_gpu reset flag
3664          * early on during init and before calling to RREG32.
3665          */
3666         adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
3667         if (!adev->reset_domain)
3668                 return -ENOMEM;
3669
3670         /* detect hw virtualization here */
3671         amdgpu_detect_virtualization(adev);
3672
3673         amdgpu_device_get_pcie_info(adev);
3674
3675         r = amdgpu_device_get_job_timeout_settings(adev);
3676         if (r) {
3677                 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3678                 return r;
3679         }
3680
3681         /* early init functions */
3682         r = amdgpu_device_ip_early_init(adev);
3683         if (r)
3684                 return r;
3685
3686         amdgpu_device_set_mcbp(adev);
3687
3688         /* Get rid of things like offb */
3689         r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver);
3690         if (r)
3691                 return r;
3692
3693         /* Enable TMZ based on IP_VERSION */
3694         amdgpu_gmc_tmz_set(adev);
3695
3696         amdgpu_gmc_noretry_set(adev);
3697         /* Need to get xgmi info early to decide the reset behavior*/
3698         if (adev->gmc.xgmi.supported) {
3699                 r = adev->gfxhub.funcs->get_xgmi_info(adev);
3700                 if (r)
3701                         return r;
3702         }
3703
3704         /* enable PCIE atomic ops */
3705         if (amdgpu_sriov_vf(adev)) {
3706                 if (adev->virt.fw_reserve.p_pf2vf)
3707                         adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
3708                                                       adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
3709                                 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3710         /* APUs w/ gfx9 onwards doesn't reply on PCIe atomics, rather it is a
3711          * internal path natively support atomics, set have_atomics_support to true.
3712          */
3713         } else if ((adev->flags & AMD_IS_APU) &&
3714                    (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))) {
3715                 adev->have_atomics_support = true;
3716         } else {
3717                 adev->have_atomics_support =
3718                         !pci_enable_atomic_ops_to_root(adev->pdev,
3719                                           PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3720                                           PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3721         }
3722
3723         if (!adev->have_atomics_support)
3724                 dev_info(adev->dev, "PCIE atomic ops is not supported\n");
3725
3726         /* doorbell bar mapping and doorbell index init*/
3727         amdgpu_doorbell_init(adev);
3728
3729         if (amdgpu_emu_mode == 1) {
3730                 /* post the asic on emulation mode */
3731                 emu_soc_asic_init(adev);
3732                 goto fence_driver_init;
3733         }
3734
3735         amdgpu_reset_init(adev);
3736
3737         /* detect if we are with an SRIOV vbios */
3738         if (adev->bios)
3739                 amdgpu_device_detect_sriov_bios(adev);
3740
3741         /* check if we need to reset the asic
3742          *  E.g., driver was not cleanly unloaded previously, etc.
3743          */
3744         if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3745                 if (adev->gmc.xgmi.num_physical_nodes) {
3746                         dev_info(adev->dev, "Pending hive reset.\n");
3747                         adev->gmc.xgmi.pending_reset = true;
3748                         /* Only need to init necessary block for SMU to handle the reset */
3749                         for (i = 0; i < adev->num_ip_blocks; i++) {
3750                                 if (!adev->ip_blocks[i].status.valid)
3751                                         continue;
3752                                 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3753                                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3754                                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3755                                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
3756                                         DRM_DEBUG("IP %s disabled for hw_init.\n",
3757                                                 adev->ip_blocks[i].version->funcs->name);
3758                                         adev->ip_blocks[i].status.hw = true;
3759                                 }
3760                         }
3761                 } else {
3762                         tmp = amdgpu_reset_method;
3763                         /* It should do a default reset when loading or reloading the driver,
3764                          * regardless of the module parameter reset_method.
3765                          */
3766                         amdgpu_reset_method = AMD_RESET_METHOD_NONE;
3767                         r = amdgpu_asic_reset(adev);
3768                         amdgpu_reset_method = tmp;
3769                         if (r) {
3770                                 dev_err(adev->dev, "asic reset on init failed\n");
3771                                 goto failed;
3772                         }
3773                 }
3774         }
3775
3776         /* Post card if necessary */
3777         if (amdgpu_device_need_post(adev)) {
3778                 if (!adev->bios) {
3779                         dev_err(adev->dev, "no vBIOS found\n");
3780                         r = -EINVAL;
3781                         goto failed;
3782                 }
3783                 DRM_INFO("GPU posting now...\n");
3784                 r = amdgpu_device_asic_init(adev);
3785                 if (r) {
3786                         dev_err(adev->dev, "gpu post error!\n");
3787                         goto failed;
3788                 }
3789         }
3790
3791         if (adev->bios) {
3792                 if (adev->is_atom_fw) {
3793                         /* Initialize clocks */
3794                         r = amdgpu_atomfirmware_get_clock_info(adev);
3795                         if (r) {
3796                                 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3797                                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3798                                 goto failed;
3799                         }
3800                 } else {
3801                         /* Initialize clocks */
3802                         r = amdgpu_atombios_get_clock_info(adev);
3803                         if (r) {
3804                                 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3805                                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3806                                 goto failed;
3807                         }
3808                         /* init i2c buses */
3809                         if (!amdgpu_device_has_dc_support(adev))
3810                                 amdgpu_atombios_i2c_init(adev);
3811                 }
3812         }
3813
3814 fence_driver_init:
3815         /* Fence driver */
3816         r = amdgpu_fence_driver_sw_init(adev);
3817         if (r) {
3818                 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
3819                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3820                 goto failed;
3821         }
3822
3823         /* init the mode config */
3824         drm_mode_config_init(adev_to_drm(adev));
3825
3826         r = amdgpu_device_ip_init(adev);
3827         if (r) {
3828                 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3829                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3830                 goto release_ras_con;
3831         }
3832
3833         amdgpu_fence_driver_hw_init(adev);
3834
3835         dev_info(adev->dev,
3836                 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3837                         adev->gfx.config.max_shader_engines,
3838                         adev->gfx.config.max_sh_per_se,
3839                         adev->gfx.config.max_cu_per_sh,
3840                         adev->gfx.cu_info.number);
3841
3842         adev->accel_working = true;
3843
3844         amdgpu_vm_check_compute_bug(adev);
3845
3846         /* Initialize the buffer migration limit. */
3847         if (amdgpu_moverate >= 0)
3848                 max_MBps = amdgpu_moverate;
3849         else
3850                 max_MBps = 8; /* Allow 8 MB/s. */
3851         /* Get a log2 for easy divisions. */
3852         adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
3853
3854         r = amdgpu_atombios_sysfs_init(adev);
3855         if (r)
3856                 drm_err(&adev->ddev,
3857                         "registering atombios sysfs failed (%d).\n", r);
3858
3859         r = amdgpu_pm_sysfs_init(adev);
3860         if (r)
3861                 DRM_ERROR("registering pm sysfs failed (%d).\n", r);
3862
3863         r = amdgpu_ucode_sysfs_init(adev);
3864         if (r) {
3865                 adev->ucode_sysfs_en = false;
3866                 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3867         } else
3868                 adev->ucode_sysfs_en = true;
3869
3870         /*
3871          * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3872          * Otherwise the mgpu fan boost feature will be skipped due to the
3873          * gpu instance is counted less.
3874          */
3875         amdgpu_register_gpu_instance(adev);
3876
3877         /* enable clockgating, etc. after ib tests, etc. since some blocks require
3878          * explicit gating rather than handling it automatically.
3879          */
3880         if (!adev->gmc.xgmi.pending_reset) {
3881                 r = amdgpu_device_ip_late_init(adev);
3882                 if (r) {
3883                         dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3884                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3885                         goto release_ras_con;
3886                 }
3887                 /* must succeed. */
3888                 amdgpu_ras_resume(adev);
3889                 queue_delayed_work(system_wq, &adev->delayed_init_work,
3890                                    msecs_to_jiffies(AMDGPU_RESUME_MS));
3891         }
3892
3893         if (amdgpu_sriov_vf(adev)) {
3894                 amdgpu_virt_release_full_gpu(adev, true);
3895                 flush_delayed_work(&adev->delayed_init_work);
3896         }
3897
3898         r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3899         if (r)
3900                 dev_err(adev->dev, "Could not create amdgpu device attr\n");
3901
3902         amdgpu_fru_sysfs_init(adev);
3903
3904         if (IS_ENABLED(CONFIG_PERF_EVENTS))
3905                 r = amdgpu_pmu_init(adev);
3906         if (r)
3907                 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3908
3909         /* Have stored pci confspace at hand for restore in sudden PCI error */
3910         if (amdgpu_device_cache_pci_state(adev->pdev))
3911                 pci_restore_state(pdev);
3912
3913         /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
3914         /* this will fail for cards that aren't VGA class devices, just
3915          * ignore it
3916          */
3917         if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
3918                 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
3919
3920         px = amdgpu_device_supports_px(ddev);
3921
3922         if (px || (!pci_is_thunderbolt_attached(adev->pdev) &&
3923                                 apple_gmux_detect(NULL, NULL)))
3924                 vga_switcheroo_register_client(adev->pdev,
3925                                                &amdgpu_switcheroo_ops, px);
3926
3927         if (px)
3928                 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3929
3930         if (adev->gmc.xgmi.pending_reset)
3931                 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
3932                                    msecs_to_jiffies(AMDGPU_RESUME_MS));
3933
3934         amdgpu_device_check_iommu_direct_map(adev);
3935
3936         return 0;
3937
3938 release_ras_con:
3939         if (amdgpu_sriov_vf(adev))
3940                 amdgpu_virt_release_full_gpu(adev, true);
3941
3942         /* failed in exclusive mode due to timeout */
3943         if (amdgpu_sriov_vf(adev) &&
3944                 !amdgpu_sriov_runtime(adev) &&
3945                 amdgpu_virt_mmio_blocked(adev) &&
3946                 !amdgpu_virt_wait_reset(adev)) {
3947                 dev_err(adev->dev, "VF exclusive mode timeout\n");
3948                 /* Don't send request since VF is inactive. */
3949                 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3950                 adev->virt.ops = NULL;
3951                 r = -EAGAIN;
3952         }
3953         amdgpu_release_ras_context(adev);
3954
3955 failed:
3956         amdgpu_vf_error_trans_all(adev);
3957
3958         return r;
3959 }
3960
3961 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
3962 {
3963
3964         /* Clear all CPU mappings pointing to this device */
3965         unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
3966
3967         /* Unmap all mapped bars - Doorbell, registers and VRAM */
3968         amdgpu_doorbell_fini(adev);
3969
3970         iounmap(adev->rmmio);
3971         adev->rmmio = NULL;
3972         if (adev->mman.aper_base_kaddr)
3973                 iounmap(adev->mman.aper_base_kaddr);
3974         adev->mman.aper_base_kaddr = NULL;
3975
3976         /* Memory manager related */
3977         if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
3978                 arch_phys_wc_del(adev->gmc.vram_mtrr);
3979                 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
3980         }
3981 }
3982
3983 /**
3984  * amdgpu_device_fini_hw - tear down the driver
3985  *
3986  * @adev: amdgpu_device pointer
3987  *
3988  * Tear down the driver info (all asics).
3989  * Called at driver shutdown.
3990  */
3991 void amdgpu_device_fini_hw(struct amdgpu_device *adev)
3992 {
3993         dev_info(adev->dev, "amdgpu: finishing device.\n");
3994         flush_delayed_work(&adev->delayed_init_work);
3995         adev->shutdown = true;
3996
3997         /* make sure IB test finished before entering exclusive mode
3998          * to avoid preemption on IB test
3999          */
4000         if (amdgpu_sriov_vf(adev)) {
4001                 amdgpu_virt_request_full_gpu(adev, false);
4002                 amdgpu_virt_fini_data_exchange(adev);
4003         }
4004
4005         /* disable all interrupts */
4006         amdgpu_irq_disable_all(adev);
4007         if (adev->mode_info.mode_config_initialized) {
4008                 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
4009                         drm_helper_force_disable_all(adev_to_drm(adev));
4010                 else
4011                         drm_atomic_helper_shutdown(adev_to_drm(adev));
4012         }
4013         amdgpu_fence_driver_hw_fini(adev);
4014
4015         if (adev->mman.initialized)
4016                 drain_workqueue(adev->mman.bdev.wq);
4017
4018         if (adev->pm.sysfs_initialized)
4019                 amdgpu_pm_sysfs_fini(adev);
4020         if (adev->ucode_sysfs_en)
4021                 amdgpu_ucode_sysfs_fini(adev);
4022         sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
4023         amdgpu_fru_sysfs_fini(adev);
4024
4025         /* disable ras feature must before hw fini */
4026         amdgpu_ras_pre_fini(adev);
4027
4028         amdgpu_device_ip_fini_early(adev);
4029
4030         amdgpu_irq_fini_hw(adev);
4031
4032         if (adev->mman.initialized)
4033                 ttm_device_clear_dma_mappings(&adev->mman.bdev);
4034
4035         amdgpu_gart_dummy_page_fini(adev);
4036
4037         if (drm_dev_is_unplugged(adev_to_drm(adev)))
4038                 amdgpu_device_unmap_mmio(adev);
4039
4040 }
4041
4042 void amdgpu_device_fini_sw(struct amdgpu_device *adev)
4043 {
4044         int idx;
4045         bool px;
4046
4047         amdgpu_fence_driver_sw_fini(adev);
4048         amdgpu_device_ip_fini(adev);
4049         amdgpu_ucode_release(&adev->firmware.gpu_info_fw);
4050         adev->accel_working = false;
4051         dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
4052
4053         amdgpu_reset_fini(adev);
4054
4055         /* free i2c buses */
4056         if (!amdgpu_device_has_dc_support(adev))
4057                 amdgpu_i2c_fini(adev);
4058
4059         if (amdgpu_emu_mode != 1)
4060                 amdgpu_atombios_fini(adev);
4061
4062         kfree(adev->bios);
4063         adev->bios = NULL;
4064
4065         px = amdgpu_device_supports_px(adev_to_drm(adev));
4066
4067         if (px || (!pci_is_thunderbolt_attached(adev->pdev) &&
4068                                 apple_gmux_detect(NULL, NULL)))
4069                 vga_switcheroo_unregister_client(adev->pdev);
4070
4071         if (px)
4072                 vga_switcheroo_fini_domain_pm_ops(adev->dev);
4073
4074         if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4075                 vga_client_unregister(adev->pdev);
4076
4077         if (drm_dev_enter(adev_to_drm(adev), &idx)) {
4078
4079                 iounmap(adev->rmmio);
4080                 adev->rmmio = NULL;
4081                 amdgpu_doorbell_fini(adev);
4082                 drm_dev_exit(idx);
4083         }
4084
4085         if (IS_ENABLED(CONFIG_PERF_EVENTS))
4086                 amdgpu_pmu_fini(adev);
4087         if (adev->mman.discovery_bin)
4088                 amdgpu_discovery_fini(adev);
4089
4090         amdgpu_reset_put_reset_domain(adev->reset_domain);
4091         adev->reset_domain = NULL;
4092
4093         kfree(adev->pci_state);
4094
4095 }
4096
4097 /**
4098  * amdgpu_device_evict_resources - evict device resources
4099  * @adev: amdgpu device object
4100  *
4101  * Evicts all ttm device resources(vram BOs, gart table) from the lru list
4102  * of the vram memory type. Mainly used for evicting device resources
4103  * at suspend time.
4104  *
4105  */
4106 static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
4107 {
4108         int ret;
4109
4110         /* No need to evict vram on APUs for suspend to ram or s2idle */
4111         if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU))
4112                 return 0;
4113
4114         ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM);
4115         if (ret)
4116                 DRM_WARN("evicting device resources failed\n");
4117         return ret;
4118 }
4119
4120 /*
4121  * Suspend & resume.
4122  */
4123 /**
4124  * amdgpu_device_suspend - initiate device suspend
4125  *
4126  * @dev: drm dev pointer
4127  * @fbcon : notify the fbdev of suspend
4128  *
4129  * Puts the hw in the suspend state (all asics).
4130  * Returns 0 for success or an error on failure.
4131  * Called at driver suspend.
4132  */
4133 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
4134 {
4135         struct amdgpu_device *adev = drm_to_adev(dev);
4136         int r = 0;
4137
4138         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4139                 return 0;
4140
4141         adev->in_suspend = true;
4142
4143         /* Evict the majority of BOs before grabbing the full access */
4144         r = amdgpu_device_evict_resources(adev);
4145         if (r)
4146                 return r;
4147
4148         if (amdgpu_sriov_vf(adev)) {
4149                 amdgpu_virt_fini_data_exchange(adev);
4150                 r = amdgpu_virt_request_full_gpu(adev, false);
4151                 if (r)
4152                         return r;
4153         }
4154
4155         if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
4156                 DRM_WARN("smart shift update failed\n");
4157
4158         if (fbcon)
4159                 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true);
4160
4161         cancel_delayed_work_sync(&adev->delayed_init_work);
4162
4163         amdgpu_ras_suspend(adev);
4164
4165         amdgpu_device_ip_suspend_phase1(adev);
4166
4167         if (!adev->in_s0ix)
4168                 amdgpu_amdkfd_suspend(adev, adev->in_runpm);
4169
4170         r = amdgpu_device_evict_resources(adev);
4171         if (r)
4172                 return r;
4173
4174         amdgpu_fence_driver_hw_fini(adev);
4175
4176         amdgpu_device_ip_suspend_phase2(adev);
4177
4178         if (amdgpu_sriov_vf(adev))
4179                 amdgpu_virt_release_full_gpu(adev, false);
4180
4181         return 0;
4182 }
4183
4184 /**
4185  * amdgpu_device_resume - initiate device resume
4186  *
4187  * @dev: drm dev pointer
4188  * @fbcon : notify the fbdev of resume
4189  *
4190  * Bring the hw back to operating state (all asics).
4191  * Returns 0 for success or an error on failure.
4192  * Called at driver resume.
4193  */
4194 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
4195 {
4196         struct amdgpu_device *adev = drm_to_adev(dev);
4197         int r = 0;
4198
4199         if (amdgpu_sriov_vf(adev)) {
4200                 r = amdgpu_virt_request_full_gpu(adev, true);
4201                 if (r)
4202                         return r;
4203         }
4204
4205         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4206                 return 0;
4207
4208         if (adev->in_s0ix)
4209                 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry);
4210
4211         /* post card */
4212         if (amdgpu_device_need_post(adev)) {
4213                 r = amdgpu_device_asic_init(adev);
4214                 if (r)
4215                         dev_err(adev->dev, "amdgpu asic init failed\n");
4216         }
4217
4218         r = amdgpu_device_ip_resume(adev);
4219
4220         if (r) {
4221                 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
4222                 goto exit;
4223         }
4224         amdgpu_fence_driver_hw_init(adev);
4225
4226         r = amdgpu_device_ip_late_init(adev);
4227         if (r)
4228                 goto exit;
4229
4230         queue_delayed_work(system_wq, &adev->delayed_init_work,
4231                            msecs_to_jiffies(AMDGPU_RESUME_MS));
4232
4233         if (!adev->in_s0ix) {
4234                 r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
4235                 if (r)
4236                         goto exit;
4237         }
4238
4239 exit:
4240         if (amdgpu_sriov_vf(adev)) {
4241                 amdgpu_virt_init_data_exchange(adev);
4242                 amdgpu_virt_release_full_gpu(adev, true);
4243         }
4244
4245         if (r)
4246                 return r;
4247
4248         /* Make sure IB tests flushed */
4249         flush_delayed_work(&adev->delayed_init_work);
4250
4251         if (fbcon)
4252                 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false);
4253
4254         amdgpu_ras_resume(adev);
4255
4256         if (adev->mode_info.num_crtc) {
4257                 /*
4258                  * Most of the connector probing functions try to acquire runtime pm
4259                  * refs to ensure that the GPU is powered on when connector polling is
4260                  * performed. Since we're calling this from a runtime PM callback,
4261                  * trying to acquire rpm refs will cause us to deadlock.
4262                  *
4263                  * Since we're guaranteed to be holding the rpm lock, it's safe to
4264                  * temporarily disable the rpm helpers so this doesn't deadlock us.
4265                  */
4266 #ifdef CONFIG_PM
4267                 dev->dev->power.disable_depth++;
4268 #endif
4269                 if (!adev->dc_enabled)
4270                         drm_helper_hpd_irq_event(dev);
4271                 else
4272                         drm_kms_helper_hotplug_event(dev);
4273 #ifdef CONFIG_PM
4274                 dev->dev->power.disable_depth--;
4275 #endif
4276         }
4277         adev->in_suspend = false;
4278
4279         if (adev->enable_mes)
4280                 amdgpu_mes_self_test(adev);
4281
4282         if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
4283                 DRM_WARN("smart shift update failed\n");
4284
4285         return 0;
4286 }
4287
4288 /**
4289  * amdgpu_device_ip_check_soft_reset - did soft reset succeed
4290  *
4291  * @adev: amdgpu_device pointer
4292  *
4293  * The list of all the hardware IPs that make up the asic is walked and
4294  * the check_soft_reset callbacks are run.  check_soft_reset determines
4295  * if the asic is still hung or not.
4296  * Returns true if any of the IPs are still in a hung state, false if not.
4297  */
4298 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
4299 {
4300         int i;
4301         bool asic_hang = false;
4302
4303         if (amdgpu_sriov_vf(adev))
4304                 return true;
4305
4306         if (amdgpu_asic_need_full_reset(adev))
4307                 return true;
4308
4309         for (i = 0; i < adev->num_ip_blocks; i++) {
4310                 if (!adev->ip_blocks[i].status.valid)
4311                         continue;
4312                 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
4313                         adev->ip_blocks[i].status.hang =
4314                                 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
4315                 if (adev->ip_blocks[i].status.hang) {
4316                         dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
4317                         asic_hang = true;
4318                 }
4319         }
4320         return asic_hang;
4321 }
4322
4323 /**
4324  * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
4325  *
4326  * @adev: amdgpu_device pointer
4327  *
4328  * The list of all the hardware IPs that make up the asic is walked and the
4329  * pre_soft_reset callbacks are run if the block is hung.  pre_soft_reset
4330  * handles any IP specific hardware or software state changes that are
4331  * necessary for a soft reset to succeed.
4332  * Returns 0 on success, negative error code on failure.
4333  */
4334 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
4335 {
4336         int i, r = 0;
4337
4338         for (i = 0; i < adev->num_ip_blocks; i++) {
4339                 if (!adev->ip_blocks[i].status.valid)
4340                         continue;
4341                 if (adev->ip_blocks[i].status.hang &&
4342                     adev->ip_blocks[i].version->funcs->pre_soft_reset) {
4343                         r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
4344                         if (r)
4345                                 return r;
4346                 }
4347         }
4348
4349         return 0;
4350 }
4351
4352 /**
4353  * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
4354  *
4355  * @adev: amdgpu_device pointer
4356  *
4357  * Some hardware IPs cannot be soft reset.  If they are hung, a full gpu
4358  * reset is necessary to recover.
4359  * Returns true if a full asic reset is required, false if not.
4360  */
4361 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
4362 {
4363         int i;
4364
4365         if (amdgpu_asic_need_full_reset(adev))
4366                 return true;
4367
4368         for (i = 0; i < adev->num_ip_blocks; i++) {
4369                 if (!adev->ip_blocks[i].status.valid)
4370                         continue;
4371                 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
4372                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
4373                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
4374                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
4375                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
4376                         if (adev->ip_blocks[i].status.hang) {
4377                                 dev_info(adev->dev, "Some block need full reset!\n");
4378                                 return true;
4379                         }
4380                 }
4381         }
4382         return false;
4383 }
4384
4385 /**
4386  * amdgpu_device_ip_soft_reset - do a soft reset
4387  *
4388  * @adev: amdgpu_device pointer
4389  *
4390  * The list of all the hardware IPs that make up the asic is walked and the
4391  * soft_reset callbacks are run if the block is hung.  soft_reset handles any
4392  * IP specific hardware or software state changes that are necessary to soft
4393  * reset the IP.
4394  * Returns 0 on success, negative error code on failure.
4395  */
4396 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
4397 {
4398         int i, r = 0;
4399
4400         for (i = 0; i < adev->num_ip_blocks; i++) {
4401                 if (!adev->ip_blocks[i].status.valid)
4402                         continue;
4403                 if (adev->ip_blocks[i].status.hang &&
4404                     adev->ip_blocks[i].version->funcs->soft_reset) {
4405                         r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
4406                         if (r)
4407                                 return r;
4408                 }
4409         }
4410
4411         return 0;
4412 }
4413
4414 /**
4415  * amdgpu_device_ip_post_soft_reset - clean up from soft reset
4416  *
4417  * @adev: amdgpu_device pointer
4418  *
4419  * The list of all the hardware IPs that make up the asic is walked and the
4420  * post_soft_reset callbacks are run if the asic was hung.  post_soft_reset
4421  * handles any IP specific hardware or software state changes that are
4422  * necessary after the IP has been soft reset.
4423  * Returns 0 on success, negative error code on failure.
4424  */
4425 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
4426 {
4427         int i, r = 0;
4428
4429         for (i = 0; i < adev->num_ip_blocks; i++) {
4430                 if (!adev->ip_blocks[i].status.valid)
4431                         continue;
4432                 if (adev->ip_blocks[i].status.hang &&
4433                     adev->ip_blocks[i].version->funcs->post_soft_reset)
4434                         r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
4435                 if (r)
4436                         return r;
4437         }
4438
4439         return 0;
4440 }
4441
4442 /**
4443  * amdgpu_device_recover_vram - Recover some VRAM contents
4444  *
4445  * @adev: amdgpu_device pointer
4446  *
4447  * Restores the contents of VRAM buffers from the shadows in GTT.  Used to
4448  * restore things like GPUVM page tables after a GPU reset where
4449  * the contents of VRAM might be lost.
4450  *
4451  * Returns:
4452  * 0 on success, negative error code on failure.
4453  */
4454 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
4455 {
4456         struct dma_fence *fence = NULL, *next = NULL;
4457         struct amdgpu_bo *shadow;
4458         struct amdgpu_bo_vm *vmbo;
4459         long r = 1, tmo;
4460
4461         if (amdgpu_sriov_runtime(adev))
4462                 tmo = msecs_to_jiffies(8000);
4463         else
4464                 tmo = msecs_to_jiffies(100);
4465
4466         dev_info(adev->dev, "recover vram bo from shadow start\n");
4467         mutex_lock(&adev->shadow_list_lock);
4468         list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) {
4469                 /* If vm is compute context or adev is APU, shadow will be NULL */
4470                 if (!vmbo->shadow)
4471                         continue;
4472                 shadow = vmbo->shadow;
4473
4474                 /* No need to recover an evicted BO */
4475                 if (shadow->tbo.resource->mem_type != TTM_PL_TT ||
4476                     shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET ||
4477                     shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM)
4478                         continue;
4479
4480                 r = amdgpu_bo_restore_shadow(shadow, &next);
4481                 if (r)
4482                         break;
4483
4484                 if (fence) {
4485                         tmo = dma_fence_wait_timeout(fence, false, tmo);
4486                         dma_fence_put(fence);
4487                         fence = next;
4488                         if (tmo == 0) {
4489                                 r = -ETIMEDOUT;
4490                                 break;
4491                         } else if (tmo < 0) {
4492                                 r = tmo;
4493                                 break;
4494                         }
4495                 } else {
4496                         fence = next;
4497                 }
4498         }
4499         mutex_unlock(&adev->shadow_list_lock);
4500
4501         if (fence)
4502                 tmo = dma_fence_wait_timeout(fence, false, tmo);
4503         dma_fence_put(fence);
4504
4505         if (r < 0 || tmo <= 0) {
4506                 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
4507                 return -EIO;
4508         }
4509
4510         dev_info(adev->dev, "recover vram bo from shadow done\n");
4511         return 0;
4512 }
4513
4514
4515 /**
4516  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
4517  *
4518  * @adev: amdgpu_device pointer
4519  * @from_hypervisor: request from hypervisor
4520  *
4521  * do VF FLR and reinitialize Asic
4522  * return 0 means succeeded otherwise failed
4523  */
4524 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4525                                      bool from_hypervisor)
4526 {
4527         int r;
4528         struct amdgpu_hive_info *hive = NULL;
4529         int retry_limit = 0;
4530
4531 retry:
4532         amdgpu_amdkfd_pre_reset(adev);
4533
4534         if (from_hypervisor)
4535                 r = amdgpu_virt_request_full_gpu(adev, true);
4536         else
4537                 r = amdgpu_virt_reset_gpu(adev);
4538         if (r)
4539                 return r;
4540         amdgpu_irq_gpu_reset_resume_helper(adev);
4541
4542         /* some sw clean up VF needs to do before recover */
4543         amdgpu_virt_post_reset(adev);
4544
4545         /* Resume IP prior to SMC */
4546         r = amdgpu_device_ip_reinit_early_sriov(adev);
4547         if (r)
4548                 goto error;
4549
4550         amdgpu_virt_init_data_exchange(adev);
4551
4552         r = amdgpu_device_fw_loading(adev);
4553         if (r)
4554                 return r;
4555
4556         /* now we are okay to resume SMC/CP/SDMA */
4557         r = amdgpu_device_ip_reinit_late_sriov(adev);
4558         if (r)
4559                 goto error;
4560
4561         hive = amdgpu_get_xgmi_hive(adev);
4562         /* Update PSP FW topology after reset */
4563         if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
4564                 r = amdgpu_xgmi_update_topology(hive, adev);
4565
4566         if (hive)
4567                 amdgpu_put_xgmi_hive(hive);
4568
4569         if (!r) {
4570                 r = amdgpu_ib_ring_tests(adev);
4571
4572                 amdgpu_amdkfd_post_reset(adev);
4573         }
4574
4575 error:
4576         if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
4577                 amdgpu_inc_vram_lost(adev);
4578                 r = amdgpu_device_recover_vram(adev);
4579         }
4580         amdgpu_virt_release_full_gpu(adev, true);
4581
4582         if (AMDGPU_RETRY_SRIOV_RESET(r)) {
4583                 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) {
4584                         retry_limit++;
4585                         goto retry;
4586                 } else
4587                         DRM_ERROR("GPU reset retry is beyond the retry limit\n");
4588         }
4589
4590         return r;
4591 }
4592
4593 /**
4594  * amdgpu_device_has_job_running - check if there is any job in mirror list
4595  *
4596  * @adev: amdgpu_device pointer
4597  *
4598  * check if there is any job in mirror list
4599  */
4600 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4601 {
4602         int i;
4603         struct drm_sched_job *job;
4604
4605         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4606                 struct amdgpu_ring *ring = adev->rings[i];
4607
4608                 if (!ring || !ring->sched.thread)
4609                         continue;
4610
4611                 spin_lock(&ring->sched.job_list_lock);
4612                 job = list_first_entry_or_null(&ring->sched.pending_list,
4613                                                struct drm_sched_job, list);
4614                 spin_unlock(&ring->sched.job_list_lock);
4615                 if (job)
4616                         return true;
4617         }
4618         return false;
4619 }
4620
4621 /**
4622  * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4623  *
4624  * @adev: amdgpu_device pointer
4625  *
4626  * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4627  * a hung GPU.
4628  */
4629 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4630 {
4631
4632         if (amdgpu_gpu_recovery == 0)
4633                 goto disabled;
4634
4635         /* Skip soft reset check in fatal error mode */
4636         if (!amdgpu_ras_is_poison_mode_supported(adev))
4637                 return true;
4638
4639         if (amdgpu_sriov_vf(adev))
4640                 return true;
4641
4642         if (amdgpu_gpu_recovery == -1) {
4643                 switch (adev->asic_type) {
4644 #ifdef CONFIG_DRM_AMDGPU_SI
4645                 case CHIP_VERDE:
4646                 case CHIP_TAHITI:
4647                 case CHIP_PITCAIRN:
4648                 case CHIP_OLAND:
4649                 case CHIP_HAINAN:
4650 #endif
4651 #ifdef CONFIG_DRM_AMDGPU_CIK
4652                 case CHIP_KAVERI:
4653                 case CHIP_KABINI:
4654                 case CHIP_MULLINS:
4655 #endif
4656                 case CHIP_CARRIZO:
4657                 case CHIP_STONEY:
4658                 case CHIP_CYAN_SKILLFISH:
4659                         goto disabled;
4660                 default:
4661                         break;
4662                 }
4663         }
4664
4665         return true;
4666
4667 disabled:
4668                 dev_info(adev->dev, "GPU recovery disabled.\n");
4669                 return false;
4670 }
4671
4672 int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
4673 {
4674         u32 i;
4675         int ret = 0;
4676
4677         amdgpu_atombios_scratch_regs_engine_hung(adev, true);
4678
4679         dev_info(adev->dev, "GPU mode1 reset\n");
4680
4681         /* disable BM */
4682         pci_clear_master(adev->pdev);
4683
4684         amdgpu_device_cache_pci_state(adev->pdev);
4685
4686         if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
4687                 dev_info(adev->dev, "GPU smu mode1 reset\n");
4688                 ret = amdgpu_dpm_mode1_reset(adev);
4689         } else {
4690                 dev_info(adev->dev, "GPU psp mode1 reset\n");
4691                 ret = psp_gpu_reset(adev);
4692         }
4693
4694         if (ret)
4695                 dev_err(adev->dev, "GPU mode1 reset failed\n");
4696
4697         amdgpu_device_load_pci_state(adev->pdev);
4698
4699         /* wait for asic to come out of reset */
4700         for (i = 0; i < adev->usec_timeout; i++) {
4701                 u32 memsize = adev->nbio.funcs->get_memsize(adev);
4702
4703                 if (memsize != 0xffffffff)
4704                         break;
4705                 udelay(1);
4706         }
4707
4708         amdgpu_atombios_scratch_regs_engine_hung(adev, false);
4709         return ret;
4710 }
4711
4712 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4713                                  struct amdgpu_reset_context *reset_context)
4714 {
4715         int i, r = 0;
4716         struct amdgpu_job *job = NULL;
4717         bool need_full_reset =
4718                 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4719
4720         if (reset_context->reset_req_dev == adev)
4721                 job = reset_context->job;
4722
4723         if (amdgpu_sriov_vf(adev)) {
4724                 /* stop the data exchange thread */
4725                 amdgpu_virt_fini_data_exchange(adev);
4726         }
4727
4728         amdgpu_fence_driver_isr_toggle(adev, true);
4729
4730         /* block all schedulers and reset given job's ring */
4731         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4732                 struct amdgpu_ring *ring = adev->rings[i];
4733
4734                 if (!ring || !ring->sched.thread)
4735                         continue;
4736
4737                 /* Clear job fence from fence drv to avoid force_completion
4738                  * leave NULL and vm flush fence in fence drv
4739                  */
4740                 amdgpu_fence_driver_clear_job_fences(ring);
4741
4742                 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4743                 amdgpu_fence_driver_force_completion(ring);
4744         }
4745
4746         amdgpu_fence_driver_isr_toggle(adev, false);
4747
4748         if (job && job->vm)
4749                 drm_sched_increase_karma(&job->base);
4750
4751         r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
4752         /* If reset handler not implemented, continue; otherwise return */
4753         if (r == -EOPNOTSUPP)
4754                 r = 0;
4755         else
4756                 return r;
4757
4758         /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4759         if (!amdgpu_sriov_vf(adev)) {
4760
4761                 if (!need_full_reset)
4762                         need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4763
4764                 if (!need_full_reset && amdgpu_gpu_recovery &&
4765                     amdgpu_device_ip_check_soft_reset(adev)) {
4766                         amdgpu_device_ip_pre_soft_reset(adev);
4767                         r = amdgpu_device_ip_soft_reset(adev);
4768                         amdgpu_device_ip_post_soft_reset(adev);
4769                         if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4770                                 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
4771                                 need_full_reset = true;
4772                         }
4773                 }
4774
4775                 if (need_full_reset)
4776                         r = amdgpu_device_ip_suspend(adev);
4777                 if (need_full_reset)
4778                         set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4779                 else
4780                         clear_bit(AMDGPU_NEED_FULL_RESET,
4781                                   &reset_context->flags);
4782         }
4783
4784         return r;
4785 }
4786
4787 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
4788 {
4789         int i;
4790
4791         lockdep_assert_held(&adev->reset_domain->sem);
4792
4793         for (i = 0; i < adev->num_regs; i++) {
4794                 adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]);
4795                 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i],
4796                                              adev->reset_dump_reg_value[i]);
4797         }
4798
4799         return 0;
4800 }
4801
4802 #ifdef CONFIG_DEV_COREDUMP
4803 static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
4804                 size_t count, void *data, size_t datalen)
4805 {
4806         struct drm_printer p;
4807         struct amdgpu_device *adev = data;
4808         struct drm_print_iterator iter;
4809         int i;
4810
4811         iter.data = buffer;
4812         iter.offset = 0;
4813         iter.start = offset;
4814         iter.remain = count;
4815
4816         p = drm_coredump_printer(&iter);
4817
4818         drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
4819         drm_printf(&p, "kernel: " UTS_RELEASE "\n");
4820         drm_printf(&p, "module: " KBUILD_MODNAME "\n");
4821         drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec);
4822         if (adev->reset_task_info.pid)
4823                 drm_printf(&p, "process_name: %s PID: %d\n",
4824                            adev->reset_task_info.process_name,
4825                            adev->reset_task_info.pid);
4826
4827         if (adev->reset_vram_lost)
4828                 drm_printf(&p, "VRAM is lost due to GPU reset!\n");
4829         if (adev->num_regs) {
4830                 drm_printf(&p, "AMDGPU register dumps:\nOffset:     Value:\n");
4831
4832                 for (i = 0; i < adev->num_regs; i++)
4833                         drm_printf(&p, "0x%08x: 0x%08x\n",
4834                                    adev->reset_dump_reg_list[i],
4835                                    adev->reset_dump_reg_value[i]);
4836         }
4837
4838         return count - iter.remain;
4839 }
4840
4841 static void amdgpu_devcoredump_free(void *data)
4842 {
4843 }
4844
4845 static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev)
4846 {
4847         struct drm_device *dev = adev_to_drm(adev);
4848
4849         ktime_get_ts64(&adev->reset_time);
4850         dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL,
4851                       amdgpu_devcoredump_read, amdgpu_devcoredump_free);
4852 }
4853 #endif
4854
4855 int amdgpu_do_asic_reset(struct list_head *device_list_handle,
4856                          struct amdgpu_reset_context *reset_context)
4857 {
4858         struct amdgpu_device *tmp_adev = NULL;
4859         bool need_full_reset, skip_hw_reset, vram_lost = false;
4860         int r = 0;
4861         bool gpu_reset_for_dev_remove = 0;
4862
4863         /* Try reset handler method first */
4864         tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
4865                                     reset_list);
4866         amdgpu_reset_reg_dumps(tmp_adev);
4867
4868         reset_context->reset_device_list = device_list_handle;
4869         r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
4870         /* If reset handler not implemented, continue; otherwise return */
4871         if (r == -EOPNOTSUPP)
4872                 r = 0;
4873         else
4874                 return r;
4875
4876         /* Reset handler not implemented, use the default method */
4877         need_full_reset =
4878                 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4879         skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
4880
4881         gpu_reset_for_dev_remove =
4882                 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
4883                         test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4884
4885         /*
4886          * ASIC reset has to be done on all XGMI hive nodes ASAP
4887          * to allow proper links negotiation in FW (within 1 sec)
4888          */
4889         if (!skip_hw_reset && need_full_reset) {
4890                 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4891                         /* For XGMI run all resets in parallel to speed up the process */
4892                         if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4893                                 tmp_adev->gmc.xgmi.pending_reset = false;
4894                                 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
4895                                         r = -EALREADY;
4896                         } else
4897                                 r = amdgpu_asic_reset(tmp_adev);
4898
4899                         if (r) {
4900                                 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
4901                                          r, adev_to_drm(tmp_adev)->unique);
4902                                 break;
4903                         }
4904                 }
4905
4906                 /* For XGMI wait for all resets to complete before proceed */
4907                 if (!r) {
4908                         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4909                                 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4910                                         flush_work(&tmp_adev->xgmi_reset_work);
4911                                         r = tmp_adev->asic_reset_res;
4912                                         if (r)
4913                                                 break;
4914                                 }
4915                         }
4916                 }
4917         }
4918
4919         if (!r && amdgpu_ras_intr_triggered()) {
4920                 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4921                         if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops &&
4922                             tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
4923                                 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev);
4924                 }
4925
4926                 amdgpu_ras_intr_cleared();
4927         }
4928
4929         /* Since the mode1 reset affects base ip blocks, the
4930          * phase1 ip blocks need to be resumed. Otherwise there
4931          * will be a BIOS signature error and the psp bootloader
4932          * can't load kdb on the next amdgpu install.
4933          */
4934         if (gpu_reset_for_dev_remove) {
4935                 list_for_each_entry(tmp_adev, device_list_handle, reset_list)
4936                         amdgpu_device_ip_resume_phase1(tmp_adev);
4937
4938                 goto end;
4939         }
4940
4941         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4942                 if (need_full_reset) {
4943                         /* post card */
4944                         r = amdgpu_device_asic_init(tmp_adev);
4945                         if (r) {
4946                                 dev_warn(tmp_adev->dev, "asic atom init failed!");
4947                         } else {
4948                                 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4949                                 r = amdgpu_amdkfd_resume_iommu(tmp_adev);
4950                                 if (r)
4951                                         goto out;
4952
4953                                 r = amdgpu_device_ip_resume_phase1(tmp_adev);
4954                                 if (r)
4955                                         goto out;
4956
4957                                 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4958 #ifdef CONFIG_DEV_COREDUMP
4959                                 tmp_adev->reset_vram_lost = vram_lost;
4960                                 memset(&tmp_adev->reset_task_info, 0,
4961                                                 sizeof(tmp_adev->reset_task_info));
4962                                 if (reset_context->job && reset_context->job->vm)
4963                                         tmp_adev->reset_task_info =
4964                                                 reset_context->job->vm->task_info;
4965                                 amdgpu_reset_capture_coredumpm(tmp_adev);
4966 #endif
4967                                 if (vram_lost) {
4968                                         DRM_INFO("VRAM is lost due to GPU reset!\n");
4969                                         amdgpu_inc_vram_lost(tmp_adev);
4970                                 }
4971
4972                                 r = amdgpu_device_fw_loading(tmp_adev);
4973                                 if (r)
4974                                         return r;
4975
4976                                 r = amdgpu_device_ip_resume_phase2(tmp_adev);
4977                                 if (r)
4978                                         goto out;
4979
4980                                 if (vram_lost)
4981                                         amdgpu_device_fill_reset_magic(tmp_adev);
4982
4983                                 /*
4984                                  * Add this ASIC as tracked as reset was already
4985                                  * complete successfully.
4986                                  */
4987                                 amdgpu_register_gpu_instance(tmp_adev);
4988
4989                                 if (!reset_context->hive &&
4990                                     tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4991                                         amdgpu_xgmi_add_device(tmp_adev);
4992
4993                                 r = amdgpu_device_ip_late_init(tmp_adev);
4994                                 if (r)
4995                                         goto out;
4996
4997                                 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false);
4998
4999                                 /*
5000                                  * The GPU enters bad state once faulty pages
5001                                  * by ECC has reached the threshold, and ras
5002                                  * recovery is scheduled next. So add one check
5003                                  * here to break recovery if it indeed exceeds
5004                                  * bad page threshold, and remind user to
5005                                  * retire this GPU or setting one bigger
5006                                  * bad_page_threshold value to fix this once
5007                                  * probing driver again.
5008                                  */
5009                                 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
5010                                         /* must succeed. */
5011                                         amdgpu_ras_resume(tmp_adev);
5012                                 } else {
5013                                         r = -EINVAL;
5014                                         goto out;
5015                                 }
5016
5017                                 /* Update PSP FW topology after reset */
5018                                 if (reset_context->hive &&
5019                                     tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5020                                         r = amdgpu_xgmi_update_topology(
5021                                                 reset_context->hive, tmp_adev);
5022                         }
5023                 }
5024
5025 out:
5026                 if (!r) {
5027                         amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
5028                         r = amdgpu_ib_ring_tests(tmp_adev);
5029                         if (r) {
5030                                 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
5031                                 need_full_reset = true;
5032                                 r = -EAGAIN;
5033                                 goto end;
5034                         }
5035                 }
5036
5037                 if (!r)
5038                         r = amdgpu_device_recover_vram(tmp_adev);
5039                 else
5040                         tmp_adev->asic_reset_res = r;
5041         }
5042
5043 end:
5044         if (need_full_reset)
5045                 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5046         else
5047                 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5048         return r;
5049 }
5050
5051 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
5052 {
5053
5054         switch (amdgpu_asic_reset_method(adev)) {
5055         case AMD_RESET_METHOD_MODE1:
5056                 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
5057                 break;
5058         case AMD_RESET_METHOD_MODE2:
5059                 adev->mp1_state = PP_MP1_STATE_RESET;
5060                 break;
5061         default:
5062                 adev->mp1_state = PP_MP1_STATE_NONE;
5063                 break;
5064         }
5065 }
5066
5067 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
5068 {
5069         amdgpu_vf_error_trans_all(adev);
5070         adev->mp1_state = PP_MP1_STATE_NONE;
5071 }
5072
5073 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
5074 {
5075         struct pci_dev *p = NULL;
5076
5077         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5078                         adev->pdev->bus->number, 1);
5079         if (p) {
5080                 pm_runtime_enable(&(p->dev));
5081                 pm_runtime_resume(&(p->dev));
5082         }
5083
5084         pci_dev_put(p);
5085 }
5086
5087 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
5088 {
5089         enum amd_reset_method reset_method;
5090         struct pci_dev *p = NULL;
5091         u64 expires;
5092
5093         /*
5094          * For now, only BACO and mode1 reset are confirmed
5095          * to suffer the audio issue without proper suspended.
5096          */
5097         reset_method = amdgpu_asic_reset_method(adev);
5098         if ((reset_method != AMD_RESET_METHOD_BACO) &&
5099              (reset_method != AMD_RESET_METHOD_MODE1))
5100                 return -EINVAL;
5101
5102         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5103                         adev->pdev->bus->number, 1);
5104         if (!p)
5105                 return -ENODEV;
5106
5107         expires = pm_runtime_autosuspend_expiration(&(p->dev));
5108         if (!expires)
5109                 /*
5110                  * If we cannot get the audio device autosuspend delay,
5111                  * a fixed 4S interval will be used. Considering 3S is
5112                  * the audio controller default autosuspend delay setting.
5113                  * 4S used here is guaranteed to cover that.
5114                  */
5115                 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
5116
5117         while (!pm_runtime_status_suspended(&(p->dev))) {
5118                 if (!pm_runtime_suspend(&(p->dev)))
5119                         break;
5120
5121                 if (expires < ktime_get_mono_fast_ns()) {
5122                         dev_warn(adev->dev, "failed to suspend display audio\n");
5123                         pci_dev_put(p);
5124                         /* TODO: abort the succeeding gpu reset? */
5125                         return -ETIMEDOUT;
5126                 }
5127         }
5128
5129         pm_runtime_disable(&(p->dev));
5130
5131         pci_dev_put(p);
5132         return 0;
5133 }
5134
5135 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
5136 {
5137         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
5138
5139 #if defined(CONFIG_DEBUG_FS)
5140         if (!amdgpu_sriov_vf(adev))
5141                 cancel_work(&adev->reset_work);
5142 #endif
5143
5144         if (adev->kfd.dev)
5145                 cancel_work(&adev->kfd.reset_work);
5146
5147         if (amdgpu_sriov_vf(adev))
5148                 cancel_work(&adev->virt.flr_work);
5149
5150         if (con && adev->ras_enabled)
5151                 cancel_work(&con->recovery_work);
5152
5153 }
5154
5155 /**
5156  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
5157  *
5158  * @adev: amdgpu_device pointer
5159  * @job: which job trigger hang
5160  * @reset_context: amdgpu reset context pointer
5161  *
5162  * Attempt to reset the GPU if it has hung (all asics).
5163  * Attempt to do soft-reset or full-reset and reinitialize Asic
5164  * Returns 0 for success or an error on failure.
5165  */
5166
5167 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
5168                               struct amdgpu_job *job,
5169                               struct amdgpu_reset_context *reset_context)
5170 {
5171         struct list_head device_list, *device_list_handle =  NULL;
5172         bool job_signaled = false;
5173         struct amdgpu_hive_info *hive = NULL;
5174         struct amdgpu_device *tmp_adev = NULL;
5175         int i, r = 0;
5176         bool need_emergency_restart = false;
5177         bool audio_suspended = false;
5178         bool gpu_reset_for_dev_remove = false;
5179
5180         gpu_reset_for_dev_remove =
5181                         test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
5182                                 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5183
5184         /*
5185          * Special case: RAS triggered and full reset isn't supported
5186          */
5187         need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
5188
5189         /*
5190          * Flush RAM to disk so that after reboot
5191          * the user can read log and see why the system rebooted.
5192          */
5193         if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
5194                 DRM_WARN("Emergency reboot.");
5195
5196                 ksys_sync_helper();
5197                 emergency_restart();
5198         }
5199
5200         dev_info(adev->dev, "GPU %s begin!\n",
5201                 need_emergency_restart ? "jobs stop":"reset");
5202
5203         if (!amdgpu_sriov_vf(adev))
5204                 hive = amdgpu_get_xgmi_hive(adev);
5205         if (hive)
5206                 mutex_lock(&hive->hive_lock);
5207
5208         reset_context->job = job;
5209         reset_context->hive = hive;
5210         /*
5211          * Build list of devices to reset.
5212          * In case we are in XGMI hive mode, resort the device list
5213          * to put adev in the 1st position.
5214          */
5215         INIT_LIST_HEAD(&device_list);
5216         if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
5217                 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
5218                         list_add_tail(&tmp_adev->reset_list, &device_list);
5219                         if (gpu_reset_for_dev_remove && adev->shutdown)
5220                                 tmp_adev->shutdown = true;
5221                 }
5222                 if (!list_is_first(&adev->reset_list, &device_list))
5223                         list_rotate_to_front(&adev->reset_list, &device_list);
5224                 device_list_handle = &device_list;
5225         } else {
5226                 list_add_tail(&adev->reset_list, &device_list);
5227                 device_list_handle = &device_list;
5228         }
5229
5230         /* We need to lock reset domain only once both for XGMI and single device */
5231         tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5232                                     reset_list);
5233         amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
5234
5235         /* block all schedulers and reset given job's ring */
5236         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5237
5238                 amdgpu_device_set_mp1_state(tmp_adev);
5239
5240                 /*
5241                  * Try to put the audio codec into suspend state
5242                  * before gpu reset started.
5243                  *
5244                  * Due to the power domain of the graphics device
5245                  * is shared with AZ power domain. Without this,
5246                  * we may change the audio hardware from behind
5247                  * the audio driver's back. That will trigger
5248                  * some audio codec errors.
5249                  */
5250                 if (!amdgpu_device_suspend_display_audio(tmp_adev))
5251                         audio_suspended = true;
5252
5253                 amdgpu_ras_set_error_query_ready(tmp_adev, false);
5254
5255                 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
5256
5257                 if (!amdgpu_sriov_vf(tmp_adev))
5258                         amdgpu_amdkfd_pre_reset(tmp_adev);
5259
5260                 /*
5261                  * Mark these ASICs to be reseted as untracked first
5262                  * And add them back after reset completed
5263                  */
5264                 amdgpu_unregister_gpu_instance(tmp_adev);
5265
5266                 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true);
5267
5268                 /* disable ras on ALL IPs */
5269                 if (!need_emergency_restart &&
5270                       amdgpu_device_ip_need_full_reset(tmp_adev))
5271                         amdgpu_ras_suspend(tmp_adev);
5272
5273                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5274                         struct amdgpu_ring *ring = tmp_adev->rings[i];
5275
5276                         if (!ring || !ring->sched.thread)
5277                                 continue;
5278
5279                         drm_sched_stop(&ring->sched, job ? &job->base : NULL);
5280
5281                         if (need_emergency_restart)
5282                                 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
5283                 }
5284                 atomic_inc(&tmp_adev->gpu_reset_counter);
5285         }
5286
5287         if (need_emergency_restart)
5288                 goto skip_sched_resume;
5289
5290         /*
5291          * Must check guilty signal here since after this point all old
5292          * HW fences are force signaled.
5293          *
5294          * job->base holds a reference to parent fence
5295          */
5296         if (job && dma_fence_is_signaled(&job->hw_fence)) {
5297                 job_signaled = true;
5298                 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
5299                 goto skip_hw_reset;
5300         }
5301
5302 retry:  /* Rest of adevs pre asic reset from XGMI hive. */
5303         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5304                 if (gpu_reset_for_dev_remove) {
5305                         /* Workaroud for ASICs need to disable SMC first */
5306                         amdgpu_device_smu_fini_early(tmp_adev);
5307                 }
5308                 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
5309                 /*TODO Should we stop ?*/
5310                 if (r) {
5311                         dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
5312                                   r, adev_to_drm(tmp_adev)->unique);
5313                         tmp_adev->asic_reset_res = r;
5314                 }
5315
5316                 /*
5317                  * Drop all pending non scheduler resets. Scheduler resets
5318                  * were already dropped during drm_sched_stop
5319                  */
5320                 amdgpu_device_stop_pending_resets(tmp_adev);
5321         }
5322
5323         /* Actual ASIC resets if needed.*/
5324         /* Host driver will handle XGMI hive reset for SRIOV */
5325         if (amdgpu_sriov_vf(adev)) {
5326                 r = amdgpu_device_reset_sriov(adev, job ? false : true);
5327                 if (r)
5328                         adev->asic_reset_res = r;
5329
5330                 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */
5331                 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2) ||
5332                     adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3))
5333                         amdgpu_ras_resume(adev);
5334         } else {
5335                 r = amdgpu_do_asic_reset(device_list_handle, reset_context);
5336                 if (r && r == -EAGAIN)
5337                         goto retry;
5338
5339                 if (!r && gpu_reset_for_dev_remove)
5340                         goto recover_end;
5341         }
5342
5343 skip_hw_reset:
5344
5345         /* Post ASIC reset for all devs .*/
5346         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5347
5348                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5349                         struct amdgpu_ring *ring = tmp_adev->rings[i];
5350
5351                         if (!ring || !ring->sched.thread)
5352                                 continue;
5353
5354                         drm_sched_start(&ring->sched, true);
5355                 }
5356
5357                 if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3))
5358                         amdgpu_mes_self_test(tmp_adev);
5359
5360                 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
5361                         drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
5362
5363                 if (tmp_adev->asic_reset_res)
5364                         r = tmp_adev->asic_reset_res;
5365
5366                 tmp_adev->asic_reset_res = 0;
5367
5368                 if (r) {
5369                         /* bad news, how to tell it to userspace ? */
5370                         dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
5371                         amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
5372                 } else {
5373                         dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
5374                         if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
5375                                 DRM_WARN("smart shift update failed\n");
5376                 }
5377         }
5378
5379 skip_sched_resume:
5380         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5381                 /* unlock kfd: SRIOV would do it separately */
5382                 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
5383                         amdgpu_amdkfd_post_reset(tmp_adev);
5384
5385                 /* kfd_post_reset will do nothing if kfd device is not initialized,
5386                  * need to bring up kfd here if it's not be initialized before
5387                  */
5388                 if (!adev->kfd.init_complete)
5389                         amdgpu_amdkfd_device_init(adev);
5390
5391                 if (audio_suspended)
5392                         amdgpu_device_resume_display_audio(tmp_adev);
5393
5394                 amdgpu_device_unset_mp1_state(tmp_adev);
5395
5396                 amdgpu_ras_set_error_query_ready(tmp_adev, true);
5397         }
5398
5399 recover_end:
5400         tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5401                                             reset_list);
5402         amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
5403
5404         if (hive) {
5405                 mutex_unlock(&hive->hive_lock);
5406                 amdgpu_put_xgmi_hive(hive);
5407         }
5408
5409         if (r)
5410                 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
5411
5412         atomic_set(&adev->reset_domain->reset_res, r);
5413         return r;
5414 }
5415
5416 /**
5417  * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot
5418  *
5419  * @adev: amdgpu_device pointer
5420  *
5421  * Fetchs and stores in the driver the PCIE capabilities (gen speed
5422  * and lanes) of the slot the device is in. Handles APUs and
5423  * virtualized environments where PCIE config space may not be available.
5424  */
5425 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
5426 {
5427         struct pci_dev *pdev;
5428         enum pci_bus_speed speed_cap, platform_speed_cap;
5429         enum pcie_link_width platform_link_width;
5430
5431         if (amdgpu_pcie_gen_cap)
5432                 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
5433
5434         if (amdgpu_pcie_lane_cap)
5435                 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
5436
5437         /* covers APUs as well */
5438         if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) {
5439                 if (adev->pm.pcie_gen_mask == 0)
5440                         adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
5441                 if (adev->pm.pcie_mlw_mask == 0)
5442                         adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
5443                 return;
5444         }
5445
5446         if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
5447                 return;
5448
5449         pcie_bandwidth_available(adev->pdev, NULL,
5450                                  &platform_speed_cap, &platform_link_width);
5451
5452         if (adev->pm.pcie_gen_mask == 0) {
5453                 /* asic caps */
5454                 pdev = adev->pdev;
5455                 speed_cap = pcie_get_speed_cap(pdev);
5456                 if (speed_cap == PCI_SPEED_UNKNOWN) {
5457                         adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5458                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5459                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5460                 } else {
5461                         if (speed_cap == PCIE_SPEED_32_0GT)
5462                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5463                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5464                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5465                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5466                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
5467                         else if (speed_cap == PCIE_SPEED_16_0GT)
5468                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5469                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5470                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5471                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
5472                         else if (speed_cap == PCIE_SPEED_8_0GT)
5473                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5474                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5475                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5476                         else if (speed_cap == PCIE_SPEED_5_0GT)
5477                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5478                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
5479                         else
5480                                 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
5481                 }
5482                 /* platform caps */
5483                 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
5484                         adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5485                                                    CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5486                 } else {
5487                         if (platform_speed_cap == PCIE_SPEED_32_0GT)
5488                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5489                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5490                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5491                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5492                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
5493                         else if (platform_speed_cap == PCIE_SPEED_16_0GT)
5494                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5495                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5496                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5497                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
5498                         else if (platform_speed_cap == PCIE_SPEED_8_0GT)
5499                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5500                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5501                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
5502                         else if (platform_speed_cap == PCIE_SPEED_5_0GT)
5503                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5504                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5505                         else
5506                                 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
5507
5508                 }
5509         }
5510         if (adev->pm.pcie_mlw_mask == 0) {
5511                 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
5512                         adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
5513                 } else {
5514                         switch (platform_link_width) {
5515                         case PCIE_LNK_X32:
5516                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
5517                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5518                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5519                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5520                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5521                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5522                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5523                                 break;
5524                         case PCIE_LNK_X16:
5525                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5526                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5527                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5528                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5529                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5530                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5531                                 break;
5532                         case PCIE_LNK_X12:
5533                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5534                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5535                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5536                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5537                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5538                                 break;
5539                         case PCIE_LNK_X8:
5540                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5541                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5542                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5543                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5544                                 break;
5545                         case PCIE_LNK_X4:
5546                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5547                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5548                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5549                                 break;
5550                         case PCIE_LNK_X2:
5551                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5552                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5553                                 break;
5554                         case PCIE_LNK_X1:
5555                                 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
5556                                 break;
5557                         default:
5558                                 break;
5559                         }
5560                 }
5561         }
5562 }
5563
5564 /**
5565  * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
5566  *
5567  * @adev: amdgpu_device pointer
5568  * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
5569  *
5570  * Return true if @peer_adev can access (DMA) @adev through the PCIe
5571  * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
5572  * @peer_adev.
5573  */
5574 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
5575                                       struct amdgpu_device *peer_adev)
5576 {
5577 #ifdef CONFIG_HSA_AMD_P2P
5578         uint64_t address_mask = peer_adev->dev->dma_mask ?
5579                 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
5580         resource_size_t aper_limit =
5581                 adev->gmc.aper_base + adev->gmc.aper_size - 1;
5582         bool p2p_access =
5583                 !adev->gmc.xgmi.connected_to_cpu &&
5584                 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);
5585
5586         return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
5587                 adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
5588                 !(adev->gmc.aper_base & address_mask ||
5589                   aper_limit & address_mask));
5590 #else
5591         return false;
5592 #endif
5593 }
5594
5595 int amdgpu_device_baco_enter(struct drm_device *dev)
5596 {
5597         struct amdgpu_device *adev = drm_to_adev(dev);
5598         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5599
5600         if (!amdgpu_device_supports_baco(dev))
5601                 return -ENOTSUPP;
5602
5603         if (ras && adev->ras_enabled &&
5604             adev->nbio.funcs->enable_doorbell_interrupt)
5605                 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
5606
5607         return amdgpu_dpm_baco_enter(adev);
5608 }
5609
5610 int amdgpu_device_baco_exit(struct drm_device *dev)
5611 {
5612         struct amdgpu_device *adev = drm_to_adev(dev);
5613         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5614         int ret = 0;
5615
5616         if (!amdgpu_device_supports_baco(dev))
5617                 return -ENOTSUPP;
5618
5619         ret = amdgpu_dpm_baco_exit(adev);
5620         if (ret)
5621                 return ret;
5622
5623         if (ras && adev->ras_enabled &&
5624             adev->nbio.funcs->enable_doorbell_interrupt)
5625                 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
5626
5627         if (amdgpu_passthrough(adev) &&
5628             adev->nbio.funcs->clear_doorbell_interrupt)
5629                 adev->nbio.funcs->clear_doorbell_interrupt(adev);
5630
5631         return 0;
5632 }
5633
5634 /**
5635  * amdgpu_pci_error_detected - Called when a PCI error is detected.
5636  * @pdev: PCI device struct
5637  * @state: PCI channel state
5638  *
5639  * Description: Called when a PCI error is detected.
5640  *
5641  * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
5642  */
5643 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
5644 {
5645         struct drm_device *dev = pci_get_drvdata(pdev);
5646         struct amdgpu_device *adev = drm_to_adev(dev);
5647         int i;
5648
5649         DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
5650
5651         if (adev->gmc.xgmi.num_physical_nodes > 1) {
5652                 DRM_WARN("No support for XGMI hive yet...");
5653                 return PCI_ERS_RESULT_DISCONNECT;
5654         }
5655
5656         adev->pci_channel_state = state;
5657
5658         switch (state) {
5659         case pci_channel_io_normal:
5660                 return PCI_ERS_RESULT_CAN_RECOVER;
5661         /* Fatal error, prepare for slot reset */
5662         case pci_channel_io_frozen:
5663                 /*
5664                  * Locking adev->reset_domain->sem will prevent any external access
5665                  * to GPU during PCI error recovery
5666                  */
5667                 amdgpu_device_lock_reset_domain(adev->reset_domain);
5668                 amdgpu_device_set_mp1_state(adev);
5669
5670                 /*
5671                  * Block any work scheduling as we do for regular GPU reset
5672                  * for the duration of the recovery
5673                  */
5674                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5675                         struct amdgpu_ring *ring = adev->rings[i];
5676
5677                         if (!ring || !ring->sched.thread)
5678                                 continue;
5679
5680                         drm_sched_stop(&ring->sched, NULL);
5681                 }
5682                 atomic_inc(&adev->gpu_reset_counter);
5683                 return PCI_ERS_RESULT_NEED_RESET;
5684         case pci_channel_io_perm_failure:
5685                 /* Permanent error, prepare for device removal */
5686                 return PCI_ERS_RESULT_DISCONNECT;
5687         }
5688
5689         return PCI_ERS_RESULT_NEED_RESET;
5690 }
5691
5692 /**
5693  * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
5694  * @pdev: pointer to PCI device
5695  */
5696 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
5697 {
5698
5699         DRM_INFO("PCI error: mmio enabled callback!!\n");
5700
5701         /* TODO - dump whatever for debugging purposes */
5702
5703         /* This called only if amdgpu_pci_error_detected returns
5704          * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
5705          * works, no need to reset slot.
5706          */
5707
5708         return PCI_ERS_RESULT_RECOVERED;
5709 }
5710
5711 /**
5712  * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
5713  * @pdev: PCI device struct
5714  *
5715  * Description: This routine is called by the pci error recovery
5716  * code after the PCI slot has been reset, just before we
5717  * should resume normal operations.
5718  */
5719 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
5720 {
5721         struct drm_device *dev = pci_get_drvdata(pdev);
5722         struct amdgpu_device *adev = drm_to_adev(dev);
5723         int r, i;
5724         struct amdgpu_reset_context reset_context;
5725         u32 memsize;
5726         struct list_head device_list;
5727
5728         DRM_INFO("PCI error: slot reset callback!!\n");
5729
5730         memset(&reset_context, 0, sizeof(reset_context));
5731
5732         INIT_LIST_HEAD(&device_list);
5733         list_add_tail(&adev->reset_list, &device_list);
5734
5735         /* wait for asic to come out of reset */
5736         msleep(500);
5737
5738         /* Restore PCI confspace */
5739         amdgpu_device_load_pci_state(pdev);
5740
5741         /* confirm  ASIC came out of reset */
5742         for (i = 0; i < adev->usec_timeout; i++) {
5743                 memsize = amdgpu_asic_get_config_memsize(adev);
5744
5745                 if (memsize != 0xffffffff)
5746                         break;
5747                 udelay(1);
5748         }
5749         if (memsize == 0xffffffff) {
5750                 r = -ETIME;
5751                 goto out;
5752         }
5753
5754         reset_context.method = AMD_RESET_METHOD_NONE;
5755         reset_context.reset_req_dev = adev;
5756         set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
5757         set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
5758
5759         adev->no_hw_access = true;
5760         r = amdgpu_device_pre_asic_reset(adev, &reset_context);
5761         adev->no_hw_access = false;
5762         if (r)
5763                 goto out;
5764
5765         r = amdgpu_do_asic_reset(&device_list, &reset_context);
5766
5767 out:
5768         if (!r) {
5769                 if (amdgpu_device_cache_pci_state(adev->pdev))
5770                         pci_restore_state(adev->pdev);
5771
5772                 DRM_INFO("PCIe error recovery succeeded\n");
5773         } else {
5774                 DRM_ERROR("PCIe error recovery failed, err:%d", r);
5775                 amdgpu_device_unset_mp1_state(adev);
5776                 amdgpu_device_unlock_reset_domain(adev->reset_domain);
5777         }
5778
5779         return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5780 }
5781
5782 /**
5783  * amdgpu_pci_resume() - resume normal ops after PCI reset
5784  * @pdev: pointer to PCI device
5785  *
5786  * Called when the error recovery driver tells us that its
5787  * OK to resume normal operation.
5788  */
5789 void amdgpu_pci_resume(struct pci_dev *pdev)
5790 {
5791         struct drm_device *dev = pci_get_drvdata(pdev);
5792         struct amdgpu_device *adev = drm_to_adev(dev);
5793         int i;
5794
5795
5796         DRM_INFO("PCI error: resume callback!!\n");
5797
5798         /* Only continue execution for the case of pci_channel_io_frozen */
5799         if (adev->pci_channel_state != pci_channel_io_frozen)
5800                 return;
5801
5802         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5803                 struct amdgpu_ring *ring = adev->rings[i];
5804
5805                 if (!ring || !ring->sched.thread)
5806                         continue;
5807
5808                 drm_sched_start(&ring->sched, true);
5809         }
5810
5811         amdgpu_device_unset_mp1_state(adev);
5812         amdgpu_device_unlock_reset_domain(adev->reset_domain);
5813 }
5814
5815 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5816 {
5817         struct drm_device *dev = pci_get_drvdata(pdev);
5818         struct amdgpu_device *adev = drm_to_adev(dev);
5819         int r;
5820
5821         r = pci_save_state(pdev);
5822         if (!r) {
5823                 kfree(adev->pci_state);
5824
5825                 adev->pci_state = pci_store_saved_state(pdev);
5826
5827                 if (!adev->pci_state) {
5828                         DRM_ERROR("Failed to store PCI saved state");
5829                         return false;
5830                 }
5831         } else {
5832                 DRM_WARN("Failed to save PCI state, err:%d\n", r);
5833                 return false;
5834         }
5835
5836         return true;
5837 }
5838
5839 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
5840 {
5841         struct drm_device *dev = pci_get_drvdata(pdev);
5842         struct amdgpu_device *adev = drm_to_adev(dev);
5843         int r;
5844
5845         if (!adev->pci_state)
5846                 return false;
5847
5848         r = pci_load_saved_state(pdev, adev->pci_state);
5849
5850         if (!r) {
5851                 pci_restore_state(pdev);
5852         } else {
5853                 DRM_WARN("Failed to load PCI state, err:%d\n", r);
5854                 return false;
5855         }
5856
5857         return true;
5858 }
5859
5860 void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
5861                 struct amdgpu_ring *ring)
5862 {
5863 #ifdef CONFIG_X86_64
5864         if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
5865                 return;
5866 #endif
5867         if (adev->gmc.xgmi.connected_to_cpu)
5868                 return;
5869
5870         if (ring && ring->funcs->emit_hdp_flush)
5871                 amdgpu_ring_emit_hdp_flush(ring);
5872         else
5873                 amdgpu_asic_flush_hdp(adev, ring);
5874 }
5875
5876 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
5877                 struct amdgpu_ring *ring)
5878 {
5879 #ifdef CONFIG_X86_64
5880         if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
5881                 return;
5882 #endif
5883         if (adev->gmc.xgmi.connected_to_cpu)
5884                 return;
5885
5886         amdgpu_asic_invalidate_hdp(adev, ring);
5887 }
5888
5889 int amdgpu_in_reset(struct amdgpu_device *adev)
5890 {
5891         return atomic_read(&adev->reset_domain->in_gpu_reset);
5892 }
5893
5894 /**
5895  * amdgpu_device_halt() - bring hardware to some kind of halt state
5896  *
5897  * @adev: amdgpu_device pointer
5898  *
5899  * Bring hardware to some kind of halt state so that no one can touch it
5900  * any more. It will help to maintain error context when error occurred.
5901  * Compare to a simple hang, the system will keep stable at least for SSH
5902  * access. Then it should be trivial to inspect the hardware state and
5903  * see what's going on. Implemented as following:
5904  *
5905  * 1. drm_dev_unplug() makes device inaccessible to user space(IOCTLs, etc),
5906  *    clears all CPU mappings to device, disallows remappings through page faults
5907  * 2. amdgpu_irq_disable_all() disables all interrupts
5908  * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
5909  * 4. set adev->no_hw_access to avoid potential crashes after setp 5
5910  * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
5911  * 6. pci_disable_device() and pci_wait_for_pending_transaction()
5912  *    flush any in flight DMA operations
5913  */
5914 void amdgpu_device_halt(struct amdgpu_device *adev)
5915 {
5916         struct pci_dev *pdev = adev->pdev;
5917         struct drm_device *ddev = adev_to_drm(adev);
5918
5919         amdgpu_xcp_dev_unplug(adev);
5920         drm_dev_unplug(ddev);
5921
5922         amdgpu_irq_disable_all(adev);
5923
5924         amdgpu_fence_driver_hw_fini(adev);
5925
5926         adev->no_hw_access = true;
5927
5928         amdgpu_device_unmap_mmio(adev);
5929
5930         pci_disable_device(pdev);
5931         pci_wait_for_pending_transaction(pdev);
5932 }
5933
5934 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
5935                                 u32 reg)
5936 {
5937         unsigned long flags, address, data;
5938         u32 r;
5939
5940         address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
5941         data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
5942
5943         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
5944         WREG32(address, reg * 4);
5945         (void)RREG32(address);
5946         r = RREG32(data);
5947         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
5948         return r;
5949 }
5950
5951 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
5952                                 u32 reg, u32 v)
5953 {
5954         unsigned long flags, address, data;
5955
5956         address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
5957         data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
5958
5959         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
5960         WREG32(address, reg * 4);
5961         (void)RREG32(address);
5962         WREG32(data, v);
5963         (void)RREG32(data);
5964         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
5965 }
5966
5967 /**
5968  * amdgpu_device_switch_gang - switch to a new gang
5969  * @adev: amdgpu_device pointer
5970  * @gang: the gang to switch to
5971  *
5972  * Try to switch to a new gang.
5973  * Returns: NULL if we switched to the new gang or a reference to the current
5974  * gang leader.
5975  */
5976 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
5977                                             struct dma_fence *gang)
5978 {
5979         struct dma_fence *old = NULL;
5980
5981         do {
5982                 dma_fence_put(old);
5983                 rcu_read_lock();
5984                 old = dma_fence_get_rcu_safe(&adev->gang_submit);
5985                 rcu_read_unlock();
5986
5987                 if (old == gang)
5988                         break;
5989
5990                 if (!dma_fence_is_signaled(old))
5991                         return old;
5992
5993         } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
5994                          old, gang) != old);
5995
5996         dma_fence_put(old);
5997         return NULL;
5998 }
5999
6000 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
6001 {
6002         switch (adev->asic_type) {
6003 #ifdef CONFIG_DRM_AMDGPU_SI
6004         case CHIP_HAINAN:
6005 #endif
6006         case CHIP_TOPAZ:
6007                 /* chips with no display hardware */
6008                 return false;
6009 #ifdef CONFIG_DRM_AMDGPU_SI
6010         case CHIP_TAHITI:
6011         case CHIP_PITCAIRN:
6012         case CHIP_VERDE:
6013         case CHIP_OLAND:
6014 #endif
6015 #ifdef CONFIG_DRM_AMDGPU_CIK
6016         case CHIP_BONAIRE:
6017         case CHIP_HAWAII:
6018         case CHIP_KAVERI:
6019         case CHIP_KABINI:
6020         case CHIP_MULLINS:
6021 #endif
6022         case CHIP_TONGA:
6023         case CHIP_FIJI:
6024         case CHIP_POLARIS10:
6025         case CHIP_POLARIS11:
6026         case CHIP_POLARIS12:
6027         case CHIP_VEGAM:
6028         case CHIP_CARRIZO:
6029         case CHIP_STONEY:
6030                 /* chips with display hardware */
6031                 return true;
6032         default:
6033                 /* IP discovery */
6034                 if (!adev->ip_versions[DCE_HWIP][0] ||
6035                     (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
6036                         return false;
6037                 return true;
6038         }
6039 }
6040
6041 uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
6042                 uint32_t inst, uint32_t reg_addr, char reg_name[],
6043                 uint32_t expected_value, uint32_t mask)
6044 {
6045         uint32_t ret = 0;
6046         uint32_t old_ = 0;
6047         uint32_t tmp_ = RREG32(reg_addr);
6048         uint32_t loop = adev->usec_timeout;
6049
6050         while ((tmp_ & (mask)) != (expected_value)) {
6051                 if (old_ != tmp_) {
6052                         loop = adev->usec_timeout;
6053                         old_ = tmp_;
6054                 } else
6055                         udelay(1);
6056                 tmp_ = RREG32(reg_addr);
6057                 loop--;
6058                 if (!loop) {
6059                         DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08xn",
6060                                   inst, reg_name, (uint32_t)expected_value,
6061                                   (uint32_t)(tmp_ & (mask)));
6062                         ret = -ETIMEDOUT;
6063                         break;
6064                 }
6065         }
6066         return ret;
6067 }