drm/amd/powerplay: optimize the interface for mgpu fan boost enablement
[linux-2.6-block.git] drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
1 /*
2  * Copyright 2008 Advanced Micro Devices, Inc.
3  * Copyright 2008 Red Hat Inc.
4  * Copyright 2009 Jerome Glisse.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22  * OTHER DEALINGS IN THE SOFTWARE.
23  *
24  * Authors: Dave Airlie
25  *          Alex Deucher
26  *          Jerome Glisse
27  */
28 #include <linux/power_supply.h>
29 #include <linux/kthread.h>
30 #include <linux/module.h>
31 #include <linux/console.h>
32 #include <linux/slab.h>
33
34 #include <drm/drm_atomic_helper.h>
35 #include <drm/drm_probe_helper.h>
36 #include <drm/amdgpu_drm.h>
37 #include <linux/vgaarb.h>
38 #include <linux/vga_switcheroo.h>
39 #include <linux/efi.h>
40 #include "amdgpu.h"
41 #include "amdgpu_trace.h"
42 #include "amdgpu_i2c.h"
43 #include "atom.h"
44 #include "amdgpu_atombios.h"
45 #include "amdgpu_atomfirmware.h"
46 #include "amd_pcie.h"
47 #ifdef CONFIG_DRM_AMDGPU_SI
48 #include "si.h"
49 #endif
50 #ifdef CONFIG_DRM_AMDGPU_CIK
51 #include "cik.h"
52 #endif
53 #include "vi.h"
54 #include "soc15.h"
55 #include "nv.h"
56 #include "bif/bif_4_1_d.h"
57 #include <linux/pci.h>
58 #include <linux/firmware.h>
59 #include "amdgpu_vf_error.h"
60
61 #include "amdgpu_amdkfd.h"
62 #include "amdgpu_pm.h"
63
64 #include "amdgpu_xgmi.h"
65 #include "amdgpu_ras.h"
66 #include "amdgpu_pmu.h"
67 #include "amdgpu_fru_eeprom.h"
68
69 #include <linux/suspend.h>
70 #include <drm/task_barrier.h>
71 #include <linux/pm_runtime.h>
72
73 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
74 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
75 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
76 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
77 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
78 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
79 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
80 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
81 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
82 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
83 MODULE_FIRMWARE("amdgpu/sienna_cichlid_gpu_info.bin");
84 MODULE_FIRMWARE("amdgpu/navy_flounder_gpu_info.bin");
85
86 #define AMDGPU_RESUME_MS                2000
87
88 const char *amdgpu_asic_name[] = {
89         "TAHITI",
90         "PITCAIRN",
91         "VERDE",
92         "OLAND",
93         "HAINAN",
94         "BONAIRE",
95         "KAVERI",
96         "KABINI",
97         "HAWAII",
98         "MULLINS",
99         "TOPAZ",
100         "TONGA",
101         "FIJI",
102         "CARRIZO",
103         "STONEY",
104         "POLARIS10",
105         "POLARIS11",
106         "POLARIS12",
107         "VEGAM",
108         "VEGA10",
109         "VEGA12",
110         "VEGA20",
111         "RAVEN",
112         "ARCTURUS",
113         "RENOIR",
114         "NAVI10",
115         "NAVI14",
116         "NAVI12",
117         "SIENNA_CICHLID",
118         "NAVY_FLOUNDER",
119         "LAST",
120 };
121
122 /**
123  * DOC: pcie_replay_count
124  *
125  * The amdgpu driver provides a sysfs API for reporting the total number
126  * of PCIe replays (NAKs).
127  * The file pcie_replay_count is used for this and returns the total
128  * number of replays as a sum of the NAKs generated and NAKs received
129  */
130
131 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
132                 struct device_attribute *attr, char *buf)
133 {
134         struct drm_device *ddev = dev_get_drvdata(dev);
135         struct amdgpu_device *adev = ddev->dev_private;
136         uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
137
138         return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
139 }
140
141 static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
142                 amdgpu_device_get_pcie_replay_count, NULL);
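/*
 * Usage sketch (hypothetical sysfs path, assuming the GPU is card0): the
 * attribute created above can be read from userspace, e.g.
 *
 *   $ cat /sys/class/drm/card0/device/pcie_replay_count
 */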
143
144 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
145
146 /**
147  * DOC: product_name
148  *
149  * The amdgpu driver provides a sysfs API for reporting the product name
150  * for the device.
151  * The file product_name is used for this and returns the product name
152  * as returned from the FRU.
153  * NOTE: This is only available for certain server cards
154  */
155
156 static ssize_t amdgpu_device_get_product_name(struct device *dev,
157                 struct device_attribute *attr, char *buf)
158 {
159         struct drm_device *ddev = dev_get_drvdata(dev);
160         struct amdgpu_device *adev = ddev->dev_private;
161
162         return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name);
163 }
164
165 static DEVICE_ATTR(product_name, S_IRUGO,
166                 amdgpu_device_get_product_name, NULL);
167
168 /**
169  * DOC: product_number
170  *
171  * The amdgpu driver provides a sysfs API for reporting the part number
172  * for the device.
173  * The file product_number is used for this and returns the part number
174  * as returned from the FRU.
175  * NOTE: This is only available for certain server cards
176  */
177
178 static ssize_t amdgpu_device_get_product_number(struct device *dev,
179                 struct device_attribute *attr, char *buf)
180 {
181         struct drm_device *ddev = dev_get_drvdata(dev);
182         struct amdgpu_device *adev = ddev->dev_private;
183
184         return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number);
185 }
186
187 static DEVICE_ATTR(product_number, S_IRUGO,
188                 amdgpu_device_get_product_number, NULL);
189
190 /**
191  * DOC: serial_number
192  *
193  * The amdgpu driver provides a sysfs API for reporting the serial number
194  * for the device.
195  * The file serial_number is used for this and returns the serial number
196  * as returned from the FRU.
197  * NOTE: This is only available for certain server cards
198  */
199
200 static ssize_t amdgpu_device_get_serial_number(struct device *dev,
201                 struct device_attribute *attr, char *buf)
202 {
203         struct drm_device *ddev = dev_get_drvdata(dev);
204         struct amdgpu_device *adev = ddev->dev_private;
205
206         return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial);
207 }
208
209 static DEVICE_ATTR(serial_number, S_IRUGO,
210                 amdgpu_device_get_serial_number, NULL);
211
212 /**
213  * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control
214  *
215  * @dev: drm_device pointer
216  *
217  * Returns true if the device is a dGPU with HG/PX power control,
218  * otherwise return false.
219  */
220 bool amdgpu_device_supports_boco(struct drm_device *dev)
221 {
222         struct amdgpu_device *adev = dev->dev_private;
223
224         if (adev->flags & AMD_IS_PX)
225                 return true;
226         return false;
227 }
228
229 /**
230  * amdgpu_device_supports_baco - Does the device support BACO
231  *
232  * @dev: drm_device pointer
233  *
234  * Returns true if the device supports BACO,
235  * otherwise return false.
236  */
237 bool amdgpu_device_supports_baco(struct drm_device *dev)
238 {
239         struct amdgpu_device *adev = dev->dev_private;
240
241         return amdgpu_asic_supports_baco(adev);
242 }
243
244 /**
245  * VRAM access helper functions.
246  *
247  * amdgpu_device_vram_access - read/write a buffer in vram
248  *
249  * @adev: amdgpu_device pointer
250  * @pos: offset of the buffer in vram
251  * @buf: virtual address of the buffer in system memory
252  * @size: read/write size in bytes; the buffer at @buf must be at least @size bytes
253  * @write: true - write to vram, otherwise - read from vram
254  */
255 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
256                                uint32_t *buf, size_t size, bool write)
257 {
258         unsigned long flags;
259         uint32_t hi = ~0;
260         uint64_t last;
261
262
263 #ifdef CONFIG_64BIT
264         last = min(pos + size, adev->gmc.visible_vram_size);
265         if (last > pos) {
266                 void __iomem *addr = adev->mman.aper_base_kaddr + pos;
267                 size_t count = last - pos;
268
269                 if (write) {
270                         memcpy_toio(addr, buf, count);
271                         mb();
272                         amdgpu_asic_flush_hdp(adev, NULL);
273                 } else {
274                         amdgpu_asic_invalidate_hdp(adev, NULL);
275                         mb();
276                         memcpy_fromio(buf, addr, count);
277                 }
278
279                 if (count == size)
280                         return;
281
282                 pos += count;
283                 buf += count / 4;
284                 size -= count;
285         }
286 #endif
287
288         spin_lock_irqsave(&adev->mmio_idx_lock, flags);
289         for (last = pos + size; pos < last; pos += 4) {
290                 uint32_t tmp = pos >> 31;
291
292                 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
293                 if (tmp != hi) {
294                         WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
295                         hi = tmp;
296                 }
297                 if (write)
298                         WREG32_NO_KIQ(mmMM_DATA, *buf++);
299                 else
300                         *buf++ = RREG32_NO_KIQ(mmMM_DATA);
301         }
302         spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
303 }
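/*
 * Usage sketch (hypothetical caller): read a few dwords from the start of
 * VRAM into a local buffer, then write them back. @size is in bytes.
 *
 *   uint32_t data[16];
 *
 *   amdgpu_device_vram_access(adev, 0, data, sizeof(data), false);   read
 *   amdgpu_device_vram_access(adev, 0, data, sizeof(data), true);    write back
 */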
304
305 /*
306  * MMIO register access helper functions.
307  */
308 /**
309  * amdgpu_mm_rreg - read a memory mapped IO register
310  *
311  * @adev: amdgpu_device pointer
312  * @reg: dword aligned register offset
313  * @acc_flags: access flags which require special behavior
314  *
315  * Returns the 32 bit value from the offset specified.
316  */
317 uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
318                         uint32_t acc_flags)
319 {
320         uint32_t ret;
321
322         if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
323                 return amdgpu_kiq_rreg(adev, reg);
324
325         if ((reg * 4) < adev->rmmio_size)
326                 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
327         else {
328                 unsigned long flags;
329
330                 spin_lock_irqsave(&adev->mmio_idx_lock, flags);
331                 writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4));
332                 ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
333                 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
334         }
335         trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret);
336         return ret;
337 }
338
339 /*
340  * Byte-wide MMIO register read helper
341  * @offset: byte offset from MMIO start
342  *
343  */
344
345 /**
346  * amdgpu_mm_rreg8 - read a memory mapped IO register
347  *
348  * @adev: amdgpu_device pointer
349  * @offset: byte aligned register offset
350  *
351  * Returns the 8 bit value from the offset specified.
352  */
353 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) {
354         if (offset < adev->rmmio_size)
355                 return (readb(adev->rmmio + offset));
356         BUG();
357 }
358
359 /*
360  * Byte-wide MMIO register write helper
361  * @offset: byte offset from MMIO start
362  * @value: the value to be written to the register
363  *
364  */
365 /**
366  * amdgpu_mm_wreg8 - write a memory mapped IO register
367  *
368  * @adev: amdgpu_device pointer
369  * @offset: byte aligned register offset
370  * @value: 8 bit value to write
371  *
372  * Writes the value specified to the offset specified.
373  */
374 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) {
375         if (offset < adev->rmmio_size)
376                 writeb(value, adev->rmmio + offset);
377         else
378                 BUG();
379 }
380
381 void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg, uint32_t v, uint32_t acc_flags)
382 {
383         trace_amdgpu_mm_wreg(adev->pdev->device, reg, v);
384
385         if ((reg * 4) < adev->rmmio_size)
386                 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
387         else {
388                 unsigned long flags;
389
390                 spin_lock_irqsave(&adev->mmio_idx_lock, flags);
391                 writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4));
392                 writel(v, ((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
393                 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
394         }
395 }
396
397 /**
398  * amdgpu_mm_wreg - write to a memory mapped IO register
399  *
400  * @adev: amdgpu_device pointer
401  * @reg: dword aligned register offset
402  * @v: 32 bit value to write to the register
403  * @acc_flags: access flags which require special behavior
404  *
405  * Writes the value specified to the offset specified.
406  */
407 void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
408                     uint32_t acc_flags)
409 {
410         if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
411                 return amdgpu_kiq_wreg(adev, reg, v);
412
413         amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags);
414 }
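/*
 * Usage sketch: most callers do not invoke these helpers directly but go
 * through the RREG32()/WREG32() style macros, which wrap them, e.g.
 *
 *   uint32_t val = RREG32(reg);
 *   WREG32(reg, val | 0x1);
 *
 * (reg here is an illustrative dword-aligned register offset.)
 */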
415
416 /*
417  * amdgpu_mm_wreg_mmio_rlc - write a register either via MMIO or via the RLC path if in range
418  *
419  * This function is invoked only for debugfs register access.
420  */
421 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
422                     uint32_t acc_flags)
423 {
424         if (amdgpu_sriov_fullaccess(adev) &&
425                 adev->gfx.rlc.funcs &&
426                 adev->gfx.rlc.funcs->is_rlcg_access_range) {
427
428                 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
429                         return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v);
430         }
431
432         amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags);
433 }
434
435 /**
436  * amdgpu_io_rreg - read an IO register
437  *
438  * @adev: amdgpu_device pointer
439  * @reg: dword aligned register offset
440  *
441  * Returns the 32 bit value from the offset specified.
442  */
443 u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
444 {
445         if ((reg * 4) < adev->rio_mem_size)
446                 return ioread32(adev->rio_mem + (reg * 4));
447         else {
448                 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
449                 return ioread32(adev->rio_mem + (mmMM_DATA * 4));
450         }
451 }
452
453 /**
454  * amdgpu_io_wreg - write to an IO register
455  *
456  * @adev: amdgpu_device pointer
457  * @reg: dword aligned register offset
458  * @v: 32 bit value to write to the register
459  *
460  * Writes the value specified to the offset specified.
461  */
462 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
463 {
464         if ((reg * 4) < adev->rio_mem_size)
465                 iowrite32(v, adev->rio_mem + (reg * 4));
466         else {
467                 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
468                 iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
469         }
470 }
471
472 /**
473  * amdgpu_mm_rdoorbell - read a doorbell dword
474  *
475  * @adev: amdgpu_device pointer
476  * @index: doorbell index
477  *
478  * Returns the value in the doorbell aperture at the
479  * requested doorbell index (CIK).
480  */
481 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
482 {
483         if (index < adev->doorbell.num_doorbells) {
484                 return readl(adev->doorbell.ptr + index);
485         } else {
486                 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
487                 return 0;
488         }
489 }
490
491 /**
492  * amdgpu_mm_wdoorbell - write a doorbell dword
493  *
494  * @adev: amdgpu_device pointer
495  * @index: doorbell index
496  * @v: value to write
497  *
498  * Writes @v to the doorbell aperture at the
499  * requested doorbell index (CIK).
500  */
501 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
502 {
503         if (index < adev->doorbell.num_doorbells) {
504                 writel(v, adev->doorbell.ptr + index);
505         } else {
506                 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
507         }
508 }
509
510 /**
511  * amdgpu_mm_rdoorbell64 - read a doorbell Qword
512  *
513  * @adev: amdgpu_device pointer
514  * @index: doorbell index
515  *
516  * Returns the value in the doorbell aperture at the
517  * requested doorbell index (VEGA10+).
518  */
519 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
520 {
521         if (index < adev->doorbell.num_doorbells) {
522                 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
523         } else {
524                 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
525                 return 0;
526         }
527 }
528
529 /**
530  * amdgpu_mm_wdoorbell64 - write a doorbell Qword
531  *
532  * @adev: amdgpu_device pointer
533  * @index: doorbell index
534  * @v: value to write
535  *
536  * Writes @v to the doorbell aperture at the
537  * requested doorbell index (VEGA10+).
538  */
539 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
540 {
541         if (index < adev->doorbell.num_doorbells) {
542                 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
543         } else {
544                 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
545         }
546 }
547
548 /**
549  * amdgpu_invalid_rreg - dummy reg read function
550  *
551  * @adev: amdgpu device pointer
552  * @reg: offset of register
553  *
554  * Dummy register read function.  Used for register blocks
555  * that certain asics don't have (all asics).
556  * Returns the value in the register.
557  */
558 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
559 {
560         DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
561         BUG();
562         return 0;
563 }
564
565 /**
566  * amdgpu_invalid_wreg - dummy reg write function
567  *
568  * @adev: amdgpu device pointer
569  * @reg: offset of register
570  * @v: value to write to the register
571  *
572  * Dummy register write function.  Used for register blocks
573  * that certain asics don't have (all asics).
574  */
575 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
576 {
577         DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
578                   reg, v);
579         BUG();
580 }
581
582 /**
583  * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
584  *
585  * @adev: amdgpu device pointer
586  * @reg: offset of register
587  *
588  * Dummy register read function.  Used for register blocks
589  * that certain asics don't have (all asics).
590  * Returns the value in the register.
591  */
592 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
593 {
594         DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
595         BUG();
596         return 0;
597 }
598
599 /**
600  * amdgpu_invalid_wreg64 - dummy reg write function
601  *
602  * @adev: amdgpu device pointer
603  * @reg: offset of register
604  * @v: value to write to the register
605  *
606  * Dummy register write function.  Used for register blocks
607  * that certain asics don't have (all asics).
608  */
609 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
610 {
611         DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
612                   reg, v);
613         BUG();
614 }
615
616 /**
617  * amdgpu_block_invalid_rreg - dummy reg read function
618  *
619  * @adev: amdgpu device pointer
620  * @block: offset of instance
621  * @reg: offset of register
622  *
623  * Dummy register read function.  Used for register blocks
624  * that certain asics don't have (all asics).
625  * Returns the value in the register.
626  */
627 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
628                                           uint32_t block, uint32_t reg)
629 {
630         DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
631                   reg, block);
632         BUG();
633         return 0;
634 }
635
636 /**
637  * amdgpu_block_invalid_wreg - dummy reg write function
638  *
639  * @adev: amdgpu device pointer
640  * @block: offset of instance
641  * @reg: offset of register
642  * @v: value to write to the register
643  *
644  * Dummy register write function.  Used for register blocks
645  * that certain asics don't have (all asics).
646  */
647 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
648                                       uint32_t block,
649                                       uint32_t reg, uint32_t v)
650 {
651         DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
652                   reg, block, v);
653         BUG();
654 }
655
656 /**
657  * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
658  *
659  * @adev: amdgpu device pointer
660  *
661  * Allocates a scratch page of VRAM for use by various things in the
662  * driver.
663  */
664 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
665 {
666         return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
667                                        PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
668                                        &adev->vram_scratch.robj,
669                                        &adev->vram_scratch.gpu_addr,
670                                        (void **)&adev->vram_scratch.ptr);
671 }
672
673 /**
674  * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
675  *
676  * @adev: amdgpu device pointer
677  *
678  * Frees the VRAM scratch page.
679  */
680 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
681 {
682         amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
683 }
684
685 /**
686  * amdgpu_device_program_register_sequence - program an array of registers.
687  *
688  * @adev: amdgpu_device pointer
689  * @registers: pointer to the register array
690  * @array_size: size of the register array
691  *
692  * Programs an array of registers with AND and OR masks.
693  * This is a helper for setting golden registers.
694  */
695 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
696                                              const u32 *registers,
697                                              const u32 array_size)
698 {
699         u32 tmp, reg, and_mask, or_mask;
700         int i;
701
702         if (array_size % 3)
703                 return;
704
705         for (i = 0; i < array_size; i += 3) {
706                 reg = registers[i + 0];
707                 and_mask = registers[i + 1];
708                 or_mask = registers[i + 2];
709
710                 if (and_mask == 0xffffffff) {
711                         tmp = or_mask;
712                 } else {
713                         tmp = RREG32(reg);
714                         tmp &= ~and_mask;
715                         if (adev->family >= AMDGPU_FAMILY_AI)
716                                 tmp |= (or_mask & and_mask);
717                         else
718                                 tmp |= or_mask;
719                 }
720                 WREG32(reg, tmp);
721         }
722 }
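/*
 * Example (illustrative values only): golden register lists are flat arrays
 * of (reg, and_mask, or_mask) triplets, e.g.
 *
 *   static const u32 golden_settings_example[] = {
 *           mmMM_INDEX, 0xffffffff, 0x00000000,
 *   };
 *
 *   amdgpu_device_program_register_sequence(adev, golden_settings_example,
 *                                           ARRAY_SIZE(golden_settings_example));
 *
 * An and_mask of 0xffffffff means the or_mask is written verbatim.
 */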
723
724 /**
725  * amdgpu_device_pci_config_reset - reset the GPU
726  *
727  * @adev: amdgpu_device pointer
728  *
729  * Resets the GPU using the pci config reset sequence.
730  * Only applicable to asics prior to vega10.
731  */
732 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
733 {
734         pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
735 }
736
737 /*
738  * GPU doorbell aperture helpers function.
739  */
740 /**
741  * amdgpu_device_doorbell_init - Init doorbell driver information.
742  *
743  * @adev: amdgpu_device pointer
744  *
745  * Init doorbell driver information (CIK)
746  * Returns 0 on success, error on failure.
747  */
748 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
749 {
750
751         /* No doorbell on SI hardware generation */
752         if (adev->asic_type < CHIP_BONAIRE) {
753                 adev->doorbell.base = 0;
754                 adev->doorbell.size = 0;
755                 adev->doorbell.num_doorbells = 0;
756                 adev->doorbell.ptr = NULL;
757                 return 0;
758         }
759
760         if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
761                 return -EINVAL;
762
763         amdgpu_asic_init_doorbell_index(adev);
764
765         /* doorbell bar mapping */
766         adev->doorbell.base = pci_resource_start(adev->pdev, 2);
767         adev->doorbell.size = pci_resource_len(adev->pdev, 2);
768
769         adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
770                                              adev->doorbell_index.max_assignment+1);
771         if (adev->doorbell.num_doorbells == 0)
772                 return -EINVAL;
773
774         /* For Vega, reserve and map two pages on doorbell BAR since SDMA
775          * paging queue doorbells use the second page. The
776          * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
777          * doorbells are in the first page. So with paging queue enabled,
778          * the max num_doorbells should be increased by one page (0x400 dwords)
779          */
780         if (adev->asic_type >= CHIP_VEGA10)
781                 adev->doorbell.num_doorbells += 0x400;
782
783         adev->doorbell.ptr = ioremap(adev->doorbell.base,
784                                      adev->doorbell.num_doorbells *
785                                      sizeof(u32));
786         if (adev->doorbell.ptr == NULL)
787                 return -ENOMEM;
788
789         return 0;
790 }
791
792 /**
793  * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
794  *
795  * @adev: amdgpu_device pointer
796  *
797  * Tear down doorbell driver information (CIK)
798  */
799 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
800 {
801         iounmap(adev->doorbell.ptr);
802         adev->doorbell.ptr = NULL;
803 }
804
805
806
807 /*
808  * amdgpu_device_wb_*()
809  * Writeback is the method by which the GPU updates special pages in memory
810  * with the status of certain GPU events (fences, ring pointers,etc.).
811  */
812
813 /**
814  * amdgpu_device_wb_fini - Disable Writeback and free memory
815  *
816  * @adev: amdgpu_device pointer
817  *
818  * Disables Writeback and frees the Writeback memory (all asics).
819  * Used at driver shutdown.
820  */
821 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
822 {
823         if (adev->wb.wb_obj) {
824                 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
825                                       &adev->wb.gpu_addr,
826                                       (void **)&adev->wb.wb);
827                 adev->wb.wb_obj = NULL;
828         }
829 }
830
831 /**
832  * amdgpu_device_wb_init- Init Writeback driver info and allocate memory
833  *
834  * @adev: amdgpu_device pointer
835  *
836  * Initializes writeback and allocates writeback memory (all asics).
837  * Used at driver startup.
838  * Returns 0 on success or a negative error code on failure.
839  */
840 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
841 {
842         int r;
843
844         if (adev->wb.wb_obj == NULL) {
845                 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
846                 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
847                                             PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
848                                             &adev->wb.wb_obj, &adev->wb.gpu_addr,
849                                             (void **)&adev->wb.wb);
850                 if (r) {
851                         dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
852                         return r;
853                 }
854
855                 adev->wb.num_wb = AMDGPU_MAX_WB;
856                 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
857
858                 /* clear wb memory */
859                 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
860         }
861
862         return 0;
863 }
864
865 /**
866  * amdgpu_device_wb_get - Allocate a wb entry
867  *
868  * @adev: amdgpu_device pointer
869  * @wb: wb index
870  *
871  * Allocate a wb slot for use by the driver (all asics).
872  * Returns 0 on success or -EINVAL on failure.
873  */
874 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
875 {
876         unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
877
878         if (offset < adev->wb.num_wb) {
879                 __set_bit(offset, adev->wb.used);
880                 *wb = offset << 3; /* convert to dw offset */
881                 return 0;
882         } else {
883                 return -EINVAL;
884         }
885 }
886
887 /**
888  * amdgpu_device_wb_free - Free a wb entry
889  *
890  * @adev: amdgpu_device pointer
891  * @wb: wb index
892  *
893  * Free a wb slot allocated for use by the driver (all asics)
894  */
895 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
896 {
897         wb >>= 3;
898         if (wb < adev->wb.num_wb)
899                 __clear_bit(wb, adev->wb.used);
900 }
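/*
 * Usage sketch (hypothetical caller): allocate a writeback slot, use the
 * returned dword offset to address the CPU view in adev->wb.wb[], and free
 * the slot when done.
 *
 *   u32 wb;
 *
 *   if (!amdgpu_device_wb_get(adev, &wb)) {
 *           adev->wb.wb[wb] = 0;
 *           amdgpu_device_wb_free(adev, wb);
 *   }
 */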
901
902 /**
903  * amdgpu_device_resize_fb_bar - try to resize FB BAR
904  *
905  * @adev: amdgpu_device pointer
906  *
907  * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
908  * to fail, but if any of the BARs is not accessible after the resize we abort
909  * driver loading by returning -ENODEV.
910  */
911 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
912 {
913         u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size);
914         u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1;
915         struct pci_bus *root;
916         struct resource *res;
917         unsigned i;
918         u16 cmd;
919         int r;
920
921         /* Bypass for VF */
922         if (amdgpu_sriov_vf(adev))
923                 return 0;
924
925         /* skip if the bios has already enabled large BAR */
926         if (adev->gmc.real_vram_size &&
927             (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
928                 return 0;
929
930         /* Check if the root BUS has 64bit memory resources */
931         root = adev->pdev->bus;
932         while (root->parent)
933                 root = root->parent;
934
935         pci_bus_for_each_resource(root, res, i) {
936                 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
937                     res->start > 0x100000000ull)
938                         break;
939         }
940
941         /* Trying to resize is pointless without a root hub window above 4GB */
942         if (!res)
943                 return 0;
944
945         /* Disable memory decoding while we change the BAR addresses and size */
946         pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
947         pci_write_config_word(adev->pdev, PCI_COMMAND,
948                               cmd & ~PCI_COMMAND_MEMORY);
949
950         /* Free the VRAM and doorbell BAR, we most likely need to move both. */
951         amdgpu_device_doorbell_fini(adev);
952         if (adev->asic_type >= CHIP_BONAIRE)
953                 pci_release_resource(adev->pdev, 2);
954
955         pci_release_resource(adev->pdev, 0);
956
957         r = pci_resize_resource(adev->pdev, 0, rbar_size);
958         if (r == -ENOSPC)
959                 DRM_INFO("Not enough PCI address space for a large BAR.");
960         else if (r && r != -ENOTSUPP)
961                 DRM_ERROR("Problem resizing BAR0 (%d).", r);
962
963         pci_assign_unassigned_bus_resources(adev->pdev->bus);
964
965         /* When the doorbell or fb BAR isn't available we have no chance of
966          * using the device.
967          */
968         r = amdgpu_device_doorbell_init(adev);
969         if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
970                 return -ENODEV;
971
972         pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
973
974         return 0;
975 }
976
977 /*
978  * GPU helpers function.
979  */
980 /**
981  * amdgpu_device_need_post - check if the hw need post or not
982  *
983  * @adev: amdgpu_device pointer
984  *
985  * Check if the asic needs to be posted, either at driver startup
986  * or after a hw reset has been performed (all asics).
987  * Returns true if post is needed, false if not.
988  */
989 bool amdgpu_device_need_post(struct amdgpu_device *adev)
990 {
991         uint32_t reg;
992
993         if (amdgpu_sriov_vf(adev))
994                 return false;
995
996         if (amdgpu_passthrough(adev)) {
997                 /* For FIJI: in the whole-GPU pass-through virtualization case, after a VM
998                  * reboot some old SMC firmware still needs the driver to do vPost, otherwise
999                  * the GPU hangs. SMC firmware versions above 22.15 don't have this flaw,
1000                  * so we force vPost for SMC versions below 22.15.
1001                  */
1002                 if (adev->asic_type == CHIP_FIJI) {
1003                         int err;
1004                         uint32_t fw_ver;
1005                         err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1006                         /* force vPost if error occurred */
1007                         if (err)
1008                                 return true;
1009
1010                         fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1011                         if (fw_ver < 0x00160e00)
1012                                 return true;
1013                 }
1014         }
1015
1016         if (adev->has_hw_reset) {
1017                 adev->has_hw_reset = false;
1018                 return true;
1019         }
1020
1021         /* bios scratch used on CIK+ */
1022         if (adev->asic_type >= CHIP_BONAIRE)
1023                 return amdgpu_atombios_scratch_need_asic_init(adev);
1024
1025         /* check MEM_SIZE for older asics */
1026         reg = amdgpu_asic_get_config_memsize(adev);
1027
1028         if ((reg != 0) && (reg != 0xffffffff))
1029                 return false;
1030
1031         return true;
1032 }
1033
1034 /* if we get transitioned to only one device, take VGA back */
1035 /**
1036  * amdgpu_device_vga_set_decode - enable/disable vga decode
1037  *
1038  * @cookie: amdgpu_device pointer
1039  * @state: enable/disable vga decode
1040  *
1041  * Enable/disable vga decode (all asics).
1042  * Returns VGA resource flags.
1043  */
1044 static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
1045 {
1046         struct amdgpu_device *adev = cookie;
1047         amdgpu_asic_set_vga_state(adev, state);
1048         if (state)
1049                 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1050                        VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1051         else
1052                 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1053 }
1054
1055 /**
1056  * amdgpu_device_check_block_size - validate the vm block size
1057  *
1058  * @adev: amdgpu_device pointer
1059  *
1060  * Validates the vm block size specified via module parameter.
1061  * The vm block size defines number of bits in page table versus page directory,
1062  * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1063  * page table and the remaining bits are in the page directory.
1064  */
1065 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1066 {
1067         /* defines number of bits in page table versus page directory,
1068          * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1069          * page table and the remaining bits are in the page directory */
1070         if (amdgpu_vm_block_size == -1)
1071                 return;
1072
1073         if (amdgpu_vm_block_size < 9) {
1074                 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1075                          amdgpu_vm_block_size);
1076                 amdgpu_vm_block_size = -1;
1077         }
1078 }
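/*
 * Worked example: with 4 KB pages there is a 12-bit in-page offset, so a
 * block size of 9 means 9 bits (512 entries) are resolved by the page table
 * and the remaining virtual address bits by the page directory.
 */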
1079
1080 /**
1081  * amdgpu_device_check_vm_size - validate the vm size
1082  *
1083  * @adev: amdgpu_device pointer
1084  *
1085  * Validates the vm size in GB specified via module parameter.
1086  * The VM size is the size of the GPU virtual memory space in GB.
1087  */
1088 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1089 {
1090         /* no need to check the default value */
1091         if (amdgpu_vm_size == -1)
1092                 return;
1093
1094         if (amdgpu_vm_size < 1) {
1095                 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1096                          amdgpu_vm_size);
1097                 amdgpu_vm_size = -1;
1098         }
1099 }
1100
1101 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1102 {
1103         struct sysinfo si;
1104         bool is_os_64 = (sizeof(void *) == 8);
1105         uint64_t total_memory;
1106         uint64_t dram_size_seven_GB = 0x1B8000000;
1107         uint64_t dram_size_three_GB = 0xB8000000;
1108
1109         if (amdgpu_smu_memory_pool_size == 0)
1110                 return;
1111
1112         if (!is_os_64) {
1113                 DRM_WARN("Not 64-bit OS, feature not supported\n");
1114                 goto def_value;
1115         }
1116         si_meminfo(&si);
1117         total_memory = (uint64_t)si.totalram * si.mem_unit;
1118
1119         if ((amdgpu_smu_memory_pool_size == 1) ||
1120                 (amdgpu_smu_memory_pool_size == 2)) {
1121                 if (total_memory < dram_size_three_GB)
1122                         goto def_value1;
1123         } else if ((amdgpu_smu_memory_pool_size == 4) ||
1124                 (amdgpu_smu_memory_pool_size == 8)) {
1125                 if (total_memory < dram_size_seven_GB)
1126                         goto def_value1;
1127         } else {
1128                 DRM_WARN("Smu memory pool size not supported\n");
1129                 goto def_value;
1130         }
1131         adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1132
1133         return;
1134
1135 def_value1:
1136         DRM_WARN("Not enough system memory\n");
1137 def_value:
1138         adev->pm.smu_prv_buffer_size = 0;
1139 }
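/*
 * Note on the math above: amdgpu_smu_memory_pool_size is given in units of
 * 256 MB (the << 28), so e.g. a module parameter of 2 reserves 2 << 28 bytes
 * = 512 MB, subject to the 3 GB / 7 GB system memory checks.
 */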
1140
1141 /**
1142  * amdgpu_device_check_arguments - validate module params
1143  *
1144  * @adev: amdgpu_device pointer
1145  *
1146  * Validates certain module parameters and updates
1147  * the associated values used by the driver (all asics).
1148  */
1149 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1150 {
1151         if (amdgpu_sched_jobs < 4) {
1152                 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1153                          amdgpu_sched_jobs);
1154                 amdgpu_sched_jobs = 4;
1155         } else if (!is_power_of_2(amdgpu_sched_jobs)){
1156                 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1157                          amdgpu_sched_jobs);
1158                 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1159         }
1160
1161         if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1162                 /* gart size must be greater or equal to 32M */
1163                 dev_warn(adev->dev, "gart size (%d) too small\n",
1164                          amdgpu_gart_size);
1165                 amdgpu_gart_size = -1;
1166         }
1167
1168         if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1169                 /* gtt size must be greater or equal to 32M */
1170                 dev_warn(adev->dev, "gtt size (%d) too small\n",
1171                                  amdgpu_gtt_size);
1172                 amdgpu_gtt_size = -1;
1173         }
1174
1175         /* valid range is between 4 and 9 inclusive */
1176         if (amdgpu_vm_fragment_size != -1 &&
1177             (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1178                 dev_warn(adev->dev, "valid range is between 4 and 9\n");
1179                 amdgpu_vm_fragment_size = -1;
1180         }
1181
1182         if (amdgpu_sched_hw_submission < 2) {
1183                 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1184                          amdgpu_sched_hw_submission);
1185                 amdgpu_sched_hw_submission = 2;
1186         } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1187                 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1188                          amdgpu_sched_hw_submission);
1189                 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1190         }
1191
1192         amdgpu_device_check_smu_prv_buffer_size(adev);
1193
1194         amdgpu_device_check_vm_size(adev);
1195
1196         amdgpu_device_check_block_size(adev);
1197
1198         adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1199
1200         amdgpu_gmc_tmz_set(adev);
1201
1202         if (amdgpu_num_kcq > 8 || amdgpu_num_kcq < 0) {
1203                 amdgpu_num_kcq = 8;
1204                 dev_warn(adev->dev, "set kernel compute queue number to 8 due to invalid parameter provided by user\n");
1205         }
1206
1207         return 0;
1208 }
1209
1210 /**
1211  * amdgpu_switcheroo_set_state - set switcheroo state
1212  *
1213  * @pdev: pci dev pointer
1214  * @state: vga_switcheroo state
1215  *
1216  * Callback for the switcheroo driver.  Suspends or resumes the
1217  * asic before or after it is powered up using ACPI methods.
1218  */
1219 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, enum vga_switcheroo_state state)
1220 {
1221         struct drm_device *dev = pci_get_drvdata(pdev);
1222         int r;
1223
1224         if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF)
1225                 return;
1226
1227         if (state == VGA_SWITCHEROO_ON) {
1228                 pr_info("switched on\n");
1229                 /* don't suspend or resume card normally */
1230                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1231
1232                 pci_set_power_state(dev->pdev, PCI_D0);
1233                 pci_restore_state(dev->pdev);
1234                 r = pci_enable_device(dev->pdev);
1235                 if (r)
1236                         DRM_WARN("pci_enable_device failed (%d)\n", r);
1237                 amdgpu_device_resume(dev, true);
1238
1239                 dev->switch_power_state = DRM_SWITCH_POWER_ON;
1240                 drm_kms_helper_poll_enable(dev);
1241         } else {
1242                 pr_info("switched off\n");
1243                 drm_kms_helper_poll_disable(dev);
1244                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1245                 amdgpu_device_suspend(dev, true);
1246                 pci_save_state(dev->pdev);
1247                 /* Shut down the device */
1248                 pci_disable_device(dev->pdev);
1249                 pci_set_power_state(dev->pdev, PCI_D3cold);
1250                 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1251         }
1252 }
1253
1254 /**
1255  * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1256  *
1257  * @pdev: pci dev pointer
1258  *
1259  * Callback for the switcheroo driver.  Check if the switcheroo
1260  * state can be changed.
1261  * Returns true if the state can be changed, false if not.
1262  */
1263 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1264 {
1265         struct drm_device *dev = pci_get_drvdata(pdev);
1266
1267         /*
1268         * FIXME: open_count is protected by drm_global_mutex but that would lead to
1269         * locking inversion with the driver load path. And the access here is
1270         * completely racy anyway. So don't bother with locking for now.
1271         */
1272         return atomic_read(&dev->open_count) == 0;
1273 }
1274
1275 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1276         .set_gpu_state = amdgpu_switcheroo_set_state,
1277         .reprobe = NULL,
1278         .can_switch = amdgpu_switcheroo_can_switch,
1279 };
1280
1281 /**
1282  * amdgpu_device_ip_set_clockgating_state - set the CG state
1283  *
1284  * @dev: amdgpu_device pointer
1285  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1286  * @state: clockgating state (gate or ungate)
1287  *
1288  * Sets the requested clockgating state for all instances of
1289  * the hardware IP specified.
1290  * Returns the error code from the last instance.
1291  */
1292 int amdgpu_device_ip_set_clockgating_state(void *dev,
1293                                            enum amd_ip_block_type block_type,
1294                                            enum amd_clockgating_state state)
1295 {
1296         struct amdgpu_device *adev = dev;
1297         int i, r = 0;
1298
1299         for (i = 0; i < adev->num_ip_blocks; i++) {
1300                 if (!adev->ip_blocks[i].status.valid)
1301                         continue;
1302                 if (adev->ip_blocks[i].version->type != block_type)
1303                         continue;
1304                 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1305                         continue;
1306                 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1307                         (void *)adev, state);
1308                 if (r)
1309                         DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1310                                   adev->ip_blocks[i].version->funcs->name, r);
1311         }
1312         return r;
1313 }
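/*
 * Usage sketch (illustrative call): gate clocks for all instances of the GFX IP.
 *
 *   amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_GFX,
 *                                          AMD_CG_STATE_GATE);
 */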
1314
1315 /**
1316  * amdgpu_device_ip_set_powergating_state - set the PG state
1317  *
1318  * @dev: amdgpu_device pointer
1319  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1320  * @state: powergating state (gate or ungate)
1321  *
1322  * Sets the requested powergating state for all instances of
1323  * the hardware IP specified.
1324  * Returns the error code from the last instance.
1325  */
1326 int amdgpu_device_ip_set_powergating_state(void *dev,
1327                                            enum amd_ip_block_type block_type,
1328                                            enum amd_powergating_state state)
1329 {
1330         struct amdgpu_device *adev = dev;
1331         int i, r = 0;
1332
1333         for (i = 0; i < adev->num_ip_blocks; i++) {
1334                 if (!adev->ip_blocks[i].status.valid)
1335                         continue;
1336                 if (adev->ip_blocks[i].version->type != block_type)
1337                         continue;
1338                 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1339                         continue;
1340                 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1341                         (void *)adev, state);
1342                 if (r)
1343                         DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1344                                   adev->ip_blocks[i].version->funcs->name, r);
1345         }
1346         return r;
1347 }
1348
1349 /**
1350  * amdgpu_device_ip_get_clockgating_state - get the CG state
1351  *
1352  * @adev: amdgpu_device pointer
1353  * @flags: clockgating feature flags
1354  *
1355  * Walks the list of IPs on the device and updates the clockgating
1356  * flags for each IP.
1357  * Updates @flags with the feature flags for each hardware IP where
1358  * clockgating is enabled.
1359  */
1360 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1361                                             u32 *flags)
1362 {
1363         int i;
1364
1365         for (i = 0; i < adev->num_ip_blocks; i++) {
1366                 if (!adev->ip_blocks[i].status.valid)
1367                         continue;
1368                 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1369                         adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1370         }
1371 }
1372
1373 /**
1374  * amdgpu_device_ip_wait_for_idle - wait for idle
1375  *
1376  * @adev: amdgpu_device pointer
1377  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1378  *
1379  * Waits for the requested hardware IP to be idle.
1380  * Returns 0 for success or a negative error code on failure.
1381  */
1382 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1383                                    enum amd_ip_block_type block_type)
1384 {
1385         int i, r;
1386
1387         for (i = 0; i < adev->num_ip_blocks; i++) {
1388                 if (!adev->ip_blocks[i].status.valid)
1389                         continue;
1390                 if (adev->ip_blocks[i].version->type == block_type) {
1391                         r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1392                         if (r)
1393                                 return r;
1394                         break;
1395                 }
1396         }
1397         return 0;
1398
1399 }
1400
1401 /**
1402  * amdgpu_device_ip_is_idle - is the hardware IP idle
1403  *
1404  * @adev: amdgpu_device pointer
1405  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1406  *
1407  * Check if the hardware IP is idle or not.
1408  * Returns true if the IP is idle, false if not.
1409  */
1410 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1411                               enum amd_ip_block_type block_type)
1412 {
1413         int i;
1414
1415         for (i = 0; i < adev->num_ip_blocks; i++) {
1416                 if (!adev->ip_blocks[i].status.valid)
1417                         continue;
1418                 if (adev->ip_blocks[i].version->type == block_type)
1419                         return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1420         }
1421         return true;
1422
1423 }
1424
1425 /**
1426  * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1427  *
1428  * @adev: amdgpu_device pointer
1429  * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1430  *
1431  * Returns a pointer to the hardware IP block structure
1432  * if it exists for the asic, otherwise NULL.
1433  */
1434 struct amdgpu_ip_block *
1435 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1436                               enum amd_ip_block_type type)
1437 {
1438         int i;
1439
1440         for (i = 0; i < adev->num_ip_blocks; i++)
1441                 if (adev->ip_blocks[i].version->type == type)
1442                         return &adev->ip_blocks[i];
1443
1444         return NULL;
1445 }
1446
1447 /**
1448  * amdgpu_device_ip_block_version_cmp
1449  *
1450  * @adev: amdgpu_device pointer
1451  * @type: enum amd_ip_block_type
1452  * @major: major version
1453  * @minor: minor version
1454  *
1455  * Return 0 if the IP block's version is equal to or greater than the one requested,
1456  * 1 if it is smaller or the ip_block doesn't exist.
1457  */
1458 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1459                                        enum amd_ip_block_type type,
1460                                        u32 major, u32 minor)
1461 {
1462         struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1463
1464         if (ip_block && ((ip_block->version->major > major) ||
1465                         ((ip_block->version->major == major) &&
1466                         (ip_block->version->minor >= minor))))
1467                 return 0;
1468
1469         return 1;
1470 }
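/*
 * Usage sketch (illustrative version numbers): check whether the GFX IP on
 * this asic is at least version 8.1.
 *
 *   if (amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_GFX, 8, 1) == 0) {
 *           GFX IP is version 8.1 or newer
 *   }
 */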
1471
1472 /**
1473  * amdgpu_device_ip_block_add
1474  *
1475  * @adev: amdgpu_device pointer
1476  * @ip_block_version: pointer to the IP to add
1477  *
1478  * Adds the IP block driver information to the collection of IPs
1479  * on the asic.
1480  */
1481 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1482                                const struct amdgpu_ip_block_version *ip_block_version)
1483 {
1484         if (!ip_block_version)
1485                 return -EINVAL;
1486
1487         DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1488                   ip_block_version->funcs->name);
1489
1490         adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1491
1492         return 0;
1493 }
1494
1495 /**
1496  * amdgpu_device_enable_virtual_display - enable virtual display feature
1497  *
1498  * @adev: amdgpu_device pointer
1499  *
1500  * Enables the virtual display feature if the user has enabled it via
1501  * the module parameter virtual_display.  This feature provides a virtual
1502  * display hardware on headless boards or in virtualized environments.
1503  * This function parses and validates the configuration string specified by
1504  * the user and configures the virtual display configuration (number of
1505  * virtual connectors, crtcs, etc.) specified.
1506  */
1507 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1508 {
1509         adev->enable_virtual_display = false;
1510
1511         if (amdgpu_virtual_display) {
1512                 struct drm_device *ddev = adev->ddev;
1513                 const char *pci_address_name = pci_name(ddev->pdev);
1514                 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1515
1516                 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1517                 pciaddstr_tmp = pciaddstr;
1518                 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1519                         pciaddname = strsep(&pciaddname_tmp, ",");
1520                         if (!strcmp("all", pciaddname)
1521                             || !strcmp(pci_address_name, pciaddname)) {
1522                                 long num_crtc;
1523                                 int res = -1;
1524
1525                                 adev->enable_virtual_display = true;
1526
1527                                 if (pciaddname_tmp)
1528                                         res = kstrtol(pciaddname_tmp, 10,
1529                                                       &num_crtc);
1530
1531                                 if (!res) {
1532                                         if (num_crtc < 1)
1533                                                 num_crtc = 1;
1534                                         if (num_crtc > 6)
1535                                                 num_crtc = 6;
1536                                         adev->mode_info.num_crtc = num_crtc;
1537                                 } else {
1538                                         adev->mode_info.num_crtc = 1;
1539                                 }
1540                                 break;
1541                         }
1542                 }
1543
1544                 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1545                          amdgpu_virtual_display, pci_address_name,
1546                          adev->enable_virtual_display, adev->mode_info.num_crtc);
1547
1548                 kfree(pciaddstr);
1549         }
1550 }
1551
1552 /**
1553  * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1554  *
1555  * @adev: amdgpu_device pointer
1556  *
1557  * Parses the asic configuration parameters specified in the gpu info
1558  * firmware and makes them available to the driver for use in configuring
1559  * the asic.
1560  * Returns 0 on success, -EINVAL on failure.
1561  */
1562 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1563 {
1564         const char *chip_name;
1565         char fw_name[40];
1566         int err;
1567         const struct gpu_info_firmware_header_v1_0 *hdr;
1568
1569         adev->firmware.gpu_info_fw = NULL;
1570
1571         if (adev->mman.discovery_bin) {
1572                 amdgpu_discovery_get_gfx_info(adev);
1573
1574                 /*
1575                  * FIXME: The bounding box is still needed by Navi12, so
1576                  * temporarily read it from gpu_info firmware. Should be dropped
1577                  * when DAL no longer needs it.
1578                  */
1579                 if (adev->asic_type != CHIP_NAVI12)
1580                         return 0;
1581         }
1582
1583         switch (adev->asic_type) {
1584 #ifdef CONFIG_DRM_AMDGPU_SI
1585         case CHIP_VERDE:
1586         case CHIP_TAHITI:
1587         case CHIP_PITCAIRN:
1588         case CHIP_OLAND:
1589         case CHIP_HAINAN:
1590 #endif
1591 #ifdef CONFIG_DRM_AMDGPU_CIK
1592         case CHIP_BONAIRE:
1593         case CHIP_HAWAII:
1594         case CHIP_KAVERI:
1595         case CHIP_KABINI:
1596         case CHIP_MULLINS:
1597 #endif
1598         case CHIP_TOPAZ:
1599         case CHIP_TONGA:
1600         case CHIP_FIJI:
1601         case CHIP_POLARIS10:
1602         case CHIP_POLARIS11:
1603         case CHIP_POLARIS12:
1604         case CHIP_VEGAM:
1605         case CHIP_CARRIZO:
1606         case CHIP_STONEY:
1607         case CHIP_VEGA20:
1608         default:
1609                 return 0;
1610         case CHIP_VEGA10:
1611                 chip_name = "vega10";
1612                 break;
1613         case CHIP_VEGA12:
1614                 chip_name = "vega12";
1615                 break;
1616         case CHIP_RAVEN:
1617                 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1618                         chip_name = "raven2";
1619                 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1620                         chip_name = "picasso";
1621                 else
1622                         chip_name = "raven";
1623                 break;
1624         case CHIP_ARCTURUS:
1625                 chip_name = "arcturus";
1626                 break;
1627         case CHIP_RENOIR:
1628                 chip_name = "renoir";
1629                 break;
1630         case CHIP_NAVI10:
1631                 chip_name = "navi10";
1632                 break;
1633         case CHIP_NAVI14:
1634                 chip_name = "navi14";
1635                 break;
1636         case CHIP_NAVI12:
1637                 chip_name = "navi12";
1638                 break;
1639         case CHIP_SIENNA_CICHLID:
1640                 chip_name = "sienna_cichlid";
1641                 break;
1642         case CHIP_NAVY_FLOUNDER:
1643                 chip_name = "navy_flounder";
1644                 break;
1645         }
1646
1647         snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1648         err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
1649         if (err) {
1650                 dev_err(adev->dev,
1651                         "Failed to load gpu_info firmware \"%s\"\n",
1652                         fw_name);
1653                 goto out;
1654         }
1655         err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
1656         if (err) {
1657                 dev_err(adev->dev,
1658                         "Failed to validate gpu_info firmware \"%s\"\n",
1659                         fw_name);
1660                 goto out;
1661         }
1662
1663         hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1664         amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1665
1666         switch (hdr->version_major) {
1667         case 1:
1668         {
1669                 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
1670                         (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
1671                                                                 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1672
1673                 /*
1674                  * Should be dropped when DAL no longer needs it.
1675                  */
1676                 if (adev->asic_type == CHIP_NAVI12)
1677                         goto parse_soc_bounding_box;
1678
1679                 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1680                 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1681                 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1682                 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
1683                 adev->gfx.config.max_texture_channel_caches =
1684                         le32_to_cpu(gpu_info_fw->gc_num_tccs);
1685                 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1686                 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1687                 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1688                 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
1689                 adev->gfx.config.double_offchip_lds_buf =
1690                         le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1691                 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
1692                 adev->gfx.cu_info.max_waves_per_simd =
1693                         le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1694                 adev->gfx.cu_info.max_scratch_slots_per_cu =
1695                         le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
1696                 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
1697                 if (hdr->version_minor >= 1) {
1698                         const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1699                                 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1700                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1701                         adev->gfx.config.num_sc_per_sh =
1702                                 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
1703                         adev->gfx.config.num_packer_per_sc =
1704                                 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
1705                 }
1706
1707 parse_soc_bounding_box:
1708                 /*
1709                  * The SOC bounding box info is not integrated in the discovery table,
1710                  * so we always need to parse it from the gpu_info firmware when required.
1711                  */
1712                 if (hdr->version_minor == 2) {
1713                         const struct gpu_info_firmware_v1_2 *gpu_info_fw =
1714                                 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
1715                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1716                         adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
1717                 }
1718                 break;
1719         }
1720         default:
1721                 dev_err(adev->dev,
1722                         "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
1723                 err = -EINVAL;
1724                 goto out;
1725         }
1726 out:
1727         return err;
1728 }
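
/*
 * For example, when chip_name is "navi10" the snprintf() above requests the
 * file "amdgpu/navi10_gpu_info.bin" from the firmware loader.
 */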
1729
1730 /**
1731  * amdgpu_device_ip_early_init - run early init for hardware IPs
1732  *
1733  * @adev: amdgpu_device pointer
1734  *
1735  * Early initialization pass for hardware IPs.  The hardware IPs that make
1736  * up each asic are discovered and each IP's early_init callback is run.  This
1737  * is the first stage in initializing the asic.
1738  * Returns 0 on success, negative error code on failure.
1739  */
1740 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
1741 {
1742         int i, r;
1743
1744         amdgpu_device_enable_virtual_display(adev);
1745
1746         if (amdgpu_sriov_vf(adev)) {
1747                 r = amdgpu_virt_request_full_gpu(adev, true);
1748                 if (r)
1749                         return r;
1750         }
1751
1752         switch (adev->asic_type) {
1753 #ifdef CONFIG_DRM_AMDGPU_SI
1754         case CHIP_VERDE:
1755         case CHIP_TAHITI:
1756         case CHIP_PITCAIRN:
1757         case CHIP_OLAND:
1758         case CHIP_HAINAN:
1759                 adev->family = AMDGPU_FAMILY_SI;
1760                 r = si_set_ip_blocks(adev);
1761                 if (r)
1762                         return r;
1763                 break;
1764 #endif
1765 #ifdef CONFIG_DRM_AMDGPU_CIK
1766         case CHIP_BONAIRE:
1767         case CHIP_HAWAII:
1768         case CHIP_KAVERI:
1769         case CHIP_KABINI:
1770         case CHIP_MULLINS:
1771                 if (adev->flags & AMD_IS_APU)
1772                         adev->family = AMDGPU_FAMILY_KV;
1773                 else
1774                         adev->family = AMDGPU_FAMILY_CI;
1775
1776                 r = cik_set_ip_blocks(adev);
1777                 if (r)
1778                         return r;
1779                 break;
1780 #endif
1781         case CHIP_TOPAZ:
1782         case CHIP_TONGA:
1783         case CHIP_FIJI:
1784         case CHIP_POLARIS10:
1785         case CHIP_POLARIS11:
1786         case CHIP_POLARIS12:
1787         case CHIP_VEGAM:
1788         case CHIP_CARRIZO:
1789         case CHIP_STONEY:
1790                 if (adev->flags & AMD_IS_APU)
1791                         adev->family = AMDGPU_FAMILY_CZ;
1792                 else
1793                         adev->family = AMDGPU_FAMILY_VI;
1794
1795                 r = vi_set_ip_blocks(adev);
1796                 if (r)
1797                         return r;
1798                 break;
1799         case CHIP_VEGA10:
1800         case CHIP_VEGA12:
1801         case CHIP_VEGA20:
1802         case CHIP_RAVEN:
1803         case CHIP_ARCTURUS:
1804         case CHIP_RENOIR:
1805                 if (adev->flags & AMD_IS_APU)
1806                         adev->family = AMDGPU_FAMILY_RV;
1807                 else
1808                         adev->family = AMDGPU_FAMILY_AI;
1809
1810                 r = soc15_set_ip_blocks(adev);
1811                 if (r)
1812                         return r;
1813                 break;
1814         case  CHIP_NAVI10:
1815         case  CHIP_NAVI14:
1816         case  CHIP_NAVI12:
1817         case  CHIP_SIENNA_CICHLID:
1818         case  CHIP_NAVY_FLOUNDER:
1819                 adev->family = AMDGPU_FAMILY_NV;
1820
1821                 r = nv_set_ip_blocks(adev);
1822                 if (r)
1823                         return r;
1824                 break;
1825         default:
1826                 /* FIXME: not supported yet */
1827                 return -EINVAL;
1828         }
1829
1830         amdgpu_amdkfd_device_probe(adev);
1831
1832         adev->pm.pp_feature = amdgpu_pp_feature_mask;
1833         if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
1834                 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
1835
1836         for (i = 0; i < adev->num_ip_blocks; i++) {
1837                 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
1838                         DRM_ERROR("disabled ip block: %d <%s>\n",
1839                                   i, adev->ip_blocks[i].version->funcs->name);
1840                         adev->ip_blocks[i].status.valid = false;
1841                 } else {
1842                         if (adev->ip_blocks[i].version->funcs->early_init) {
1843                                 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
1844                                 if (r == -ENOENT) {
1845                                         adev->ip_blocks[i].status.valid = false;
1846                                 } else if (r) {
1847                                         DRM_ERROR("early_init of IP block <%s> failed %d\n",
1848                                                   adev->ip_blocks[i].version->funcs->name, r);
1849                                         return r;
1850                                 } else {
1851                                         adev->ip_blocks[i].status.valid = true;
1852                                 }
1853                         } else {
1854                                 adev->ip_blocks[i].status.valid = true;
1855                         }
1856                 }
1857                 /* get the vbios after the asic_funcs are set up */
1858                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
1859                         r = amdgpu_device_parse_gpu_info_fw(adev);
1860                         if (r)
1861                                 return r;
1862
1863                         /* Read BIOS */
1864                         if (!amdgpu_get_bios(adev))
1865                                 return -EINVAL;
1866
1867                         r = amdgpu_atombios_init(adev);
1868                         if (r) {
1869                                 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
1870                                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
1871                                 return r;
1872                         }
1873                 }
1874         }
1875
1876         adev->cg_flags &= amdgpu_cg_mask;
1877         adev->pg_flags &= amdgpu_pg_mask;
1878
1879         return 0;
1880 }
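
/*
 * Illustrative note: the ip_block_mask check above treats the module
 * parameter as a bitmask indexed by IP block position; a cleared bit i marks
 * block i invalid.  For example (hypothetical value),
 * amdgpu.ip_block_mask=0xfffffffd would disable the block at index 1.
 */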
1881
1882 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
1883 {
1884         int i, r;
1885
1886         for (i = 0; i < adev->num_ip_blocks; i++) {
1887                 if (!adev->ip_blocks[i].status.sw)
1888                         continue;
1889                 if (adev->ip_blocks[i].status.hw)
1890                         continue;
1891                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
1892                     (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
1893                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
1894                         r = adev->ip_blocks[i].version->funcs->hw_init(adev);
1895                         if (r) {
1896                                 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
1897                                           adev->ip_blocks[i].version->funcs->name, r);
1898                                 return r;
1899                         }
1900                         adev->ip_blocks[i].status.hw = true;
1901                 }
1902         }
1903
1904         return 0;
1905 }
1906
1907 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
1908 {
1909         int i, r;
1910
1911         for (i = 0; i < adev->num_ip_blocks; i++) {
1912                 if (!adev->ip_blocks[i].status.sw)
1913                         continue;
1914                 if (adev->ip_blocks[i].status.hw)
1915                         continue;
1916                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
1917                 if (r) {
1918                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
1919                                   adev->ip_blocks[i].version->funcs->name, r);
1920                         return r;
1921                 }
1922                 adev->ip_blocks[i].status.hw = true;
1923         }
1924
1925         return 0;
1926 }
1927
1928 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
1929 {
1930         int r = 0;
1931         int i;
1932         uint32_t smu_version;
1933
1934         if (adev->asic_type >= CHIP_VEGA10) {
1935                 for (i = 0; i < adev->num_ip_blocks; i++) {
1936                         if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
1937                                 continue;
1938
1939                         /* no need to do the fw loading again if already done */
1940                         if (adev->ip_blocks[i].status.hw == true)
1941                                 break;
1942
1943                         if (amdgpu_in_reset(adev) || adev->in_suspend) {
1944                                 r = adev->ip_blocks[i].version->funcs->resume(adev);
1945                                 if (r) {
1946                                         DRM_ERROR("resume of IP block <%s> failed %d\n",
1947                                                           adev->ip_blocks[i].version->funcs->name, r);
1948                                         return r;
1949                                 }
1950                         } else {
1951                                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
1952                                 if (r) {
1953                                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
1954                                                           adev->ip_blocks[i].version->funcs->name, r);
1955                                         return r;
1956                                 }
1957                         }
1958
1959                         adev->ip_blocks[i].status.hw = true;
1960                         break;
1961                 }
1962         }
1963
1964         if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
1965                 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
1966
1967         return r;
1968 }
1969
1970 /**
1971  * amdgpu_device_ip_init - run init for hardware IPs
1972  *
1973  * @adev: amdgpu_device pointer
1974  *
1975  * Main initialization pass for hardware IPs.  The list of all the hardware
1976  * IPs that make up the asic is walked and the sw_init and hw_init callbacks
1977  * are run.  sw_init initializes the software state associated with each IP
1978  * and hw_init initializes the hardware associated with each IP.
1979  * Returns 0 on success, negative error code on failure.
1980  */
1981 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
1982 {
1983         int i, r;
1984
1985         r = amdgpu_ras_init(adev);
1986         if (r)
1987                 return r;
1988
1989         for (i = 0; i < adev->num_ip_blocks; i++) {
1990                 if (!adev->ip_blocks[i].status.valid)
1991                         continue;
1992                 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
1993                 if (r) {
1994                         DRM_ERROR("sw_init of IP block <%s> failed %d\n",
1995                                   adev->ip_blocks[i].version->funcs->name, r);
1996                         goto init_failed;
1997                 }
1998                 adev->ip_blocks[i].status.sw = true;
1999
2000                 /* need to do gmc hw init early so we can allocate gpu mem */
2001                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2002                         r = amdgpu_device_vram_scratch_init(adev);
2003                         if (r) {
2004                                 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
2005                                 goto init_failed;
2006                         }
2007                         r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2008                         if (r) {
2009                                 DRM_ERROR("hw_init %d failed %d\n", i, r);
2010                                 goto init_failed;
2011                         }
2012                         r = amdgpu_device_wb_init(adev);
2013                         if (r) {
2014                                 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2015                                 goto init_failed;
2016                         }
2017                         adev->ip_blocks[i].status.hw = true;
2018
2019                         /* right after GMC hw init, we create CSA */
2020                         if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
2021                                 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2022                                                                 AMDGPU_GEM_DOMAIN_VRAM,
2023                                                                 AMDGPU_CSA_SIZE);
2024                                 if (r) {
2025                                         DRM_ERROR("allocate CSA failed %d\n", r);
2026                                         goto init_failed;
2027                                 }
2028                         }
2029                 }
2030         }
2031
2032         if (amdgpu_sriov_vf(adev))
2033                 amdgpu_virt_init_data_exchange(adev);
2034
2035         r = amdgpu_ib_pool_init(adev);
2036         if (r) {
2037                 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2038                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2039                 goto init_failed;
2040         }
2041
2042         r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete */
2043         if (r)
2044                 goto init_failed;
2045
2046         r = amdgpu_device_ip_hw_init_phase1(adev);
2047         if (r)
2048                 goto init_failed;
2049
2050         r = amdgpu_device_fw_loading(adev);
2051         if (r)
2052                 goto init_failed;
2053
2054         r = amdgpu_device_ip_hw_init_phase2(adev);
2055         if (r)
2056                 goto init_failed;
2057
2058         /*
2059          * Retired pages will be loaded from eeprom and reserved here;
2060          * this should be called after amdgpu_device_ip_hw_init_phase2 since
2061          * for some ASICs the RAS EEPROM code relies on the SMU being fully
2062          * functional for I2C communication, which is only true at this point.
2063          *
2064          * amdgpu_ras_recovery_init may fail, but the upper level only cares
2065          * about failures caused by a bad GPU and stops the amdgpu init
2066          * process accordingly. For other failures, it still releases all
2067          * the resources and prints an error message, rather than returning
2068          * a negative value to the upper level.
2069          *
2070          * Note: theoretically, this should be called before all vram
2071          * allocations to protect retired pages from being abused.
2072          */
2073         r = amdgpu_ras_recovery_init(adev);
2074         if (r)
2075                 goto init_failed;
2076
2077         if (adev->gmc.xgmi.num_physical_nodes > 1)
2078                 amdgpu_xgmi_add_device(adev);
2079         amdgpu_amdkfd_device_init(adev);
2080
2081         amdgpu_fru_get_product_info(adev);
2082
2083 init_failed:
2084         if (amdgpu_sriov_vf(adev))
2085                 amdgpu_virt_release_full_gpu(adev, true);
2086
2087         return r;
2088 }
2089
2090 /**
2091  * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2092  *
2093  * @adev: amdgpu_device pointer
2094  *
2095  * Writes a reset magic value to the gart pointer in VRAM.  The driver calls
2096  * this function before a GPU reset.  If the value is retained after a
2097  * GPU reset, VRAM has not been lost.  Some GPU resets may destroy VRAM contents.
2098  */
2099 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2100 {
2101         memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2102 }
2103
2104 /**
2105  * amdgpu_device_check_vram_lost - check if vram is valid
2106  *
2107  * @adev: amdgpu_device pointer
2108  *
2109  * Checks the reset magic value written to the gart pointer in VRAM.
2110  * The driver calls this after a GPU reset to see if the contents of
2111  * VRAM have been lost or not.
2112  * Returns true if vram is lost, false if not.
2113  */
2114 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2115 {
2116         if (memcmp(adev->gart.ptr, adev->reset_magic,
2117                         AMDGPU_RESET_MAGIC_NUM))
2118                 return true;
2119
2120         if (!amdgpu_in_reset(adev))
2121                 return false;
2122
2123         /*
2124          * For all ASICs with baco/mode1 reset, the VRAM is
2125          * always assumed to be lost.
2126          */
2127         switch (amdgpu_asic_reset_method(adev)) {
2128         case AMD_RESET_METHOD_BACO:
2129         case AMD_RESET_METHOD_MODE1:
2130                 return true;
2131         default:
2132                 return false;
2133         }
2134 }
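
/*
 * Usage sketch (illustrative only, not taken from this file): after a GPU
 * reset a caller would typically do
 *
 *	vram_lost = amdgpu_device_check_vram_lost(adev);
 *	if (vram_lost)
 *		... restore or re-validate VRAM contents ...
 *
 * before bringing the remaining IP blocks back up.
 */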
2135
2136 /**
2137  * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2138  *
2139  * @adev: amdgpu_device pointer
2140  * @state: clockgating state (gate or ungate)
2141  *
2142  * The list of all the hardware IPs that make up the asic is walked and the
2143  * set_clockgating_state callbacks are run.
2144  * The late initialization pass enables clockgating for hardware IPs, while
2145  * the fini or suspend pass disables it.
2146  * Returns 0 on success, negative error code on failure.
2147  */
2148
2149 static int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2150                                                 enum amd_clockgating_state state)
2151 {
2152         int i, j, r;
2153
2154         if (amdgpu_emu_mode == 1)
2155                 return 0;
2156
2157         for (j = 0; j < adev->num_ip_blocks; j++) {
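                /* gate in IP block order, ungate in reverse order */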
2158                 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2159                 if (!adev->ip_blocks[i].status.late_initialized)
2160                         continue;
2161                 /* skip CG for VCE/UVD, it's handled specially */
2162                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2163                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2164                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2165                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2166                     adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2167                         /* enable clockgating to save power */
2168                         r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2169                                                                                      state);
2170                         if (r) {
2171                                 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
2172                                           adev->ip_blocks[i].version->funcs->name, r);
2173                                 return r;
2174                         }
2175                 }
2176         }
2177
2178         return 0;
2179 }
2180
2181 static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state)
2182 {
2183         int i, j, r;
2184
2185         if (amdgpu_emu_mode == 1)
2186                 return 0;
2187
2188         for (j = 0; j < adev->num_ip_blocks; j++) {
2189                 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2190                 if (!adev->ip_blocks[i].status.late_initialized)
2191                         continue;
2192                 /* skip PG for VCE/UVD, it's handled specially */
2193                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2194                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2195                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2196                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2197                     adev->ip_blocks[i].version->funcs->set_powergating_state) {
2198                         /* enable powergating to save power */
2199                         r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2200                                                                                         state);
2201                         if (r) {
2202                                 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
2203                                           adev->ip_blocks[i].version->funcs->name, r);
2204                                 return r;
2205                         }
2206                 }
2207         }
2208         return 0;
2209 }
2210
2211 static int amdgpu_device_enable_mgpu_fan_boost(void)
2212 {
2213         struct amdgpu_gpu_instance *gpu_ins;
2214         struct amdgpu_device *adev;
2215         int i, ret = 0;
2216
2217         mutex_lock(&mgpu_info.mutex);
2218
2219         /*
2220          * MGPU fan boost feature should be enabled
2221          * only when there are two or more dGPUs in
2222          * the system
2223          */
2224         if (mgpu_info.num_dgpu < 2)
2225                 goto out;
2226
2227         for (i = 0; i < mgpu_info.num_dgpu; i++) {
2228                 gpu_ins = &(mgpu_info.gpu_ins[i]);
2229                 adev = gpu_ins->adev;
2230                 if (!(adev->flags & AMD_IS_APU) &&
2231                     !gpu_ins->mgpu_fan_enabled) {
2232                         ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2233                         if (ret)
2234                                 break;
2235
2236                         gpu_ins->mgpu_fan_enabled = 1;
2237                 }
2238         }
2239
2240 out:
2241         mutex_unlock(&mgpu_info.mutex);
2242
2243         return ret;
2244 }
2245
2246 /**
2247  * amdgpu_device_ip_late_init - run late init for hardware IPs
2248  *
2249  * @adev: amdgpu_device pointer
2250  *
2251  * Late initialization pass for hardware IPs.  The list of all the hardware
2252  * IPs that make up the asic is walked and the late_init callbacks are run.
2253  * late_init covers any special initialization that an IP requires
2254  * after all of the IP blocks have been initialized or something that needs to happen
2255  * late in the init process.
2256  * Returns 0 on success, negative error code on failure.
2257  */
2258 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2259 {
2260         struct amdgpu_gpu_instance *gpu_instance;
2261         int i = 0, r;
2262
2263         for (i = 0; i < adev->num_ip_blocks; i++) {
2264                 if (!adev->ip_blocks[i].status.hw)
2265                         continue;
2266                 if (adev->ip_blocks[i].version->funcs->late_init) {
2267                         r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2268                         if (r) {
2269                                 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2270                                           adev->ip_blocks[i].version->funcs->name, r);
2271                                 return r;
2272                         }
2273                 }
2274                 adev->ip_blocks[i].status.late_initialized = true;
2275         }
2276
2277         amdgpu_ras_set_error_query_ready(adev, true);
2278
2279         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2280         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2281
2282         amdgpu_device_fill_reset_magic(adev);
2283
2284         r = amdgpu_device_enable_mgpu_fan_boost();
2285         if (r)
2286                 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2287
2288
2289         if (adev->gmc.xgmi.num_physical_nodes > 1) {
2290                 mutex_lock(&mgpu_info.mutex);
2291
2292                 /*
2293                  * Reset the device p-state to low, as it was booted with high.
2294                  *
2295                  * This should be performed only after all devices from the same
2296                  * hive have been initialized.
2297                  *
2298                  * However, the number of devices in the hive is not known in
2299                  * advance; they are counted one by one as they initialize.
2300                  *
2301                  * So we wait until all XGMI interlinked devices are initialized.
2302                  * This may bring some delay as those devices may come from
2303                  * different hives, but that should be OK.
2304                  */
2305                 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2306                         for (i = 0; i < mgpu_info.num_gpu; i++) {
2307                                 gpu_instance = &(mgpu_info.gpu_ins[i]);
2308                                 if (gpu_instance->adev->flags & AMD_IS_APU)
2309                                         continue;
2310
2311                                 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2312                                                 AMDGPU_XGMI_PSTATE_MIN);
2313                                 if (r) {
2314                                         DRM_ERROR("pstate setting failed (%d).\n", r);
2315                                         break;
2316                                 }
2317                         }
2318                 }
2319
2320                 mutex_unlock(&mgpu_info.mutex);
2321         }
2322
2323         return 0;
2324 }
2325
2326 /**
2327  * amdgpu_device_ip_fini - run fini for hardware IPs
2328  *
2329  * @adev: amdgpu_device pointer
2330  *
2331  * Main teardown pass for hardware IPs.  The list of all the hardware
2332  * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2333  * are run.  hw_fini tears down the hardware associated with each IP
2334  * and sw_fini tears down any software state associated with each IP.
2335  * Returns 0 on success, negative error code on failure.
2336  */
2337 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2338 {
2339         int i, r;
2340
2341         if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2342                 amdgpu_virt_release_ras_err_handler_data(adev);
2343
2344         amdgpu_ras_pre_fini(adev);
2345
2346         if (adev->gmc.xgmi.num_physical_nodes > 1)
2347                 amdgpu_xgmi_remove_device(adev);
2348
2349         amdgpu_amdkfd_device_fini(adev);
2350
2351         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2352         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2353
2354         /* need to disable SMC first */
2355         for (i = 0; i < adev->num_ip_blocks; i++) {
2356                 if (!adev->ip_blocks[i].status.hw)
2357                         continue;
2358                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2359                         r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2360                         /* XXX handle errors */
2361                         if (r) {
2362                                 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2363                                           adev->ip_blocks[i].version->funcs->name, r);
2364                         }
2365                         adev->ip_blocks[i].status.hw = false;
2366                         break;
2367                 }
2368         }
2369
2370         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2371                 if (!adev->ip_blocks[i].status.hw)
2372                         continue;
2373
2374                 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2375                 /* XXX handle errors */
2376                 if (r) {
2377                         DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2378                                   adev->ip_blocks[i].version->funcs->name, r);
2379                 }
2380
2381                 adev->ip_blocks[i].status.hw = false;
2382         }
2383
2384
2385         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2386                 if (!adev->ip_blocks[i].status.sw)
2387                         continue;
2388
2389                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2390                         amdgpu_ucode_free_bo(adev);
2391                         amdgpu_free_static_csa(&adev->virt.csa_obj);
2392                         amdgpu_device_wb_fini(adev);
2393                         amdgpu_device_vram_scratch_fini(adev);
2394                         amdgpu_ib_pool_fini(adev);
2395                 }
2396
2397                 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2398                 /* XXX handle errors */
2399                 if (r) {
2400                         DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2401                                   adev->ip_blocks[i].version->funcs->name, r);
2402                 }
2403                 adev->ip_blocks[i].status.sw = false;
2404                 adev->ip_blocks[i].status.valid = false;
2405         }
2406
2407         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2408                 if (!adev->ip_blocks[i].status.late_initialized)
2409                         continue;
2410                 if (adev->ip_blocks[i].version->funcs->late_fini)
2411                         adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2412                 adev->ip_blocks[i].status.late_initialized = false;
2413         }
2414
2415         amdgpu_ras_fini(adev);
2416
2417         if (amdgpu_sriov_vf(adev))
2418                 if (amdgpu_virt_release_full_gpu(adev, false))
2419                         DRM_ERROR("failed to release exclusive mode on fini\n");
2420
2421         return 0;
2422 }
2423
2424 /**
2425  * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2426  *
2427  * @work: work_struct.
2428  */
2429 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2430 {
2431         struct amdgpu_device *adev =
2432                 container_of(work, struct amdgpu_device, delayed_init_work.work);
2433         int r;
2434
2435         r = amdgpu_ib_ring_tests(adev);
2436         if (r)
2437                 DRM_ERROR("ib ring test failed (%d).\n", r);
2438 }
2439
2440 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2441 {
2442         struct amdgpu_device *adev =
2443                 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2444
2445         mutex_lock(&adev->gfx.gfx_off_mutex);
2446         if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
2447                 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2448                         adev->gfx.gfx_off_state = true;
2449         }
2450         mutex_unlock(&adev->gfx.gfx_off_mutex);
2451 }
2452
2453 /**
2454  * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2455  *
2456  * @adev: amdgpu_device pointer
2457  *
2458  * Main suspend function for hardware IPs.  The list of all the hardware
2459  * IPs that make up the asic is walked, clockgating is disabled and the
2460  * suspend callbacks are run.  suspend puts the hardware and software state
2461  * in each IP into a state suitable for suspend.
2462  * Returns 0 on success, negative error code on failure.
2463  */
2464 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2465 {
2466         int i, r;
2467
2468         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2469         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2470
2471         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2472                 if (!adev->ip_blocks[i].status.valid)
2473                         continue;
2474
2475                 /* displays are handled separately */
2476                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2477                         continue;
2478
2480                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2481                 /* XXX handle errors */
2482                 if (r) {
2483                         DRM_ERROR("suspend of IP block <%s> failed %d\n",
2484                                   adev->ip_blocks[i].version->funcs->name, r);
2485                         return r;
2486                 }
2487
2488                 adev->ip_blocks[i].status.hw = false;
2489         }
2490
2491         return 0;
2492 }
2493
2494 /**
2495  * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2496  *
2497  * @adev: amdgpu_device pointer
2498  *
2499  * Main suspend function for hardware IPs.  The list of all the hardware
2500  * IPs that make up the asic is walked, clockgating is disabled and the
2501  * suspend callbacks are run.  suspend puts the hardware and software state
2502  * in each IP into a state suitable for suspend.
2503  * Returns 0 on success, negative error code on failure.
2504  */
2505 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2506 {
2507         int i, r;
2508
2509         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2510                 if (!adev->ip_blocks[i].status.valid)
2511                         continue;
2512                 /* displays are handled in phase1 */
2513                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2514                         continue;
2515                 /* PSP lost connection when err_event_athub occurs */
2516                 if (amdgpu_ras_intr_triggered() &&
2517                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2518                         adev->ip_blocks[i].status.hw = false;
2519                         continue;
2520                 }
2522                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2523                 /* XXX handle errors */
2524                 if (r) {
2525                         DRM_ERROR("suspend of IP block <%s> failed %d\n",
2526                                   adev->ip_blocks[i].version->funcs->name, r);
2527                 }
2528                 adev->ip_blocks[i].status.hw = false;
2529                 /* handle putting the SMC in the appropriate state */
2530                 if (!amdgpu_sriov_vf(adev)) {
2531                         if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2532                                 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
2533                                 if (r) {
2534                                         DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
2535                                                         adev->mp1_state, r);
2536                                         return r;
2537                                 }
2538                         }
2539                 }
2540                 adev->ip_blocks[i].status.hw = false;
2541         }
2542
2543         return 0;
2544 }
2545
2546 /**
2547  * amdgpu_device_ip_suspend - run suspend for hardware IPs
2548  *
2549  * @adev: amdgpu_device pointer
2550  *
2551  * Main suspend function for hardware IPs.  The list of all the hardware
2552  * IPs that make up the asic is walked, clockgating is disabled and the
2553  * suspend callbacks are run.  suspend puts the hardware and software state
2554  * in each IP into a state suitable for suspend.
2555  * Returns 0 on success, negative error code on failure.
2556  */
2557 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
2558 {
2559         int r;
2560
2561         if (amdgpu_sriov_vf(adev))
2562                 amdgpu_virt_request_full_gpu(adev, false);
2563
2564         r = amdgpu_device_ip_suspend_phase1(adev);
2565         if (r)
2566                 return r;
2567         r = amdgpu_device_ip_suspend_phase2(adev);
2568
2569         if (amdgpu_sriov_vf(adev))
2570                 amdgpu_virt_release_full_gpu(adev, false);
2571
2572         return r;
2573 }
2574
2575 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
2576 {
2577         int i, r;
2578
2579         static enum amd_ip_block_type ip_order[] = {
2580                 AMD_IP_BLOCK_TYPE_GMC,
2581                 AMD_IP_BLOCK_TYPE_COMMON,
2582                 AMD_IP_BLOCK_TYPE_PSP,
2583                 AMD_IP_BLOCK_TYPE_IH,
2584         };
2585
2586         for (i = 0; i < adev->num_ip_blocks; i++)
2587                 adev->ip_blocks[i].status.hw = false;
2588
2589         for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2590                 int j;
2591                 struct amdgpu_ip_block *block;
2592
2593                 for (j = 0; j < adev->num_ip_blocks; j++) {
2594                         block = &adev->ip_blocks[j];
2595
2596                         if (block->version->type != ip_order[i] ||
2597                                 !block->status.valid)
2598                                 continue;
2599
2600                         r = block->version->funcs->hw_init(adev);
2601                         DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2602                         if (r)
2603                                 return r;
2604                         block->status.hw = true;
2605                 }
2606         }
2607
2608         return 0;
2609 }
2610
2611 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
2612 {
2613         int i, r;
2614
2615         static enum amd_ip_block_type ip_order[] = {
2616                 AMD_IP_BLOCK_TYPE_SMC,
2617                 AMD_IP_BLOCK_TYPE_DCE,
2618                 AMD_IP_BLOCK_TYPE_GFX,
2619                 AMD_IP_BLOCK_TYPE_SDMA,
2620                 AMD_IP_BLOCK_TYPE_UVD,
2621                 AMD_IP_BLOCK_TYPE_VCE,
2622                 AMD_IP_BLOCK_TYPE_VCN
2623         };
2624
2625         for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2626                 int j;
2627                 struct amdgpu_ip_block *block;
2628
2629                 for (j = 0; j < adev->num_ip_blocks; j++) {
2630                         block = &adev->ip_blocks[j];
2631
2632                         if (block->version->type != ip_order[i] ||
2633                                 !block->status.valid ||
2634                                 block->status.hw)
2635                                 continue;
2636
2637                         if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
2638                                 r = block->version->funcs->resume(adev);
2639                         else
2640                                 r = block->version->funcs->hw_init(adev);
2641
2642                         DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2643                         if (r)
2644                                 return r;
2645                         block->status.hw = true;
2646                 }
2647         }
2648
2649         return 0;
2650 }
2651
2652 /**
2653  * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
2654  *
2655  * @adev: amdgpu_device pointer
2656  *
2657  * First resume function for hardware IPs.  The list of all the hardware
2658  * IPs that make up the asic is walked and the resume callbacks are run for
2659  * COMMON, GMC, and IH.  resume puts the hardware into a functional state
2660  * after a suspend and updates the software state as necessary.  This
2661  * function is also used for restoring the GPU after a GPU reset.
2662  * Returns 0 on success, negative error code on failure.
2663  */
2664 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
2665 {
2666         int i, r;
2667
2668         for (i = 0; i < adev->num_ip_blocks; i++) {
2669                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2670                         continue;
2671                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2672                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2673                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2674
2675                         r = adev->ip_blocks[i].version->funcs->resume(adev);
2676                         if (r) {
2677                                 DRM_ERROR("resume of IP block <%s> failed %d\n",
2678                                           adev->ip_blocks[i].version->funcs->name, r);
2679                                 return r;
2680                         }
2681                         adev->ip_blocks[i].status.hw = true;
2682                 }
2683         }
2684
2685         return 0;
2686 }
2687
2688 /**
2689  * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
2690  *
2691  * @adev: amdgpu_device pointer
2692  *
2693  * Second resume function for hardware IPs.  The list of all the hardware
2694  * IPs that make up the asic is walked and the resume callbacks are run for
2695  * all blocks except COMMON, GMC, and IH.  resume puts the hardware into a
2696  * functional state after a suspend and updates the software state as
2697  * necessary.  This function is also used for restoring the GPU after a GPU
2698  * reset.
2699  * Returns 0 on success, negative error code on failure.
2700  */
2701 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
2702 {
2703         int i, r;
2704
2705         for (i = 0; i < adev->num_ip_blocks; i++) {
2706                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2707                         continue;
2708                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2709                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2710                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
2711                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
2712                         continue;
2713                 r = adev->ip_blocks[i].version->funcs->resume(adev);
2714                 if (r) {
2715                         DRM_ERROR("resume of IP block <%s> failed %d\n",
2716                                   adev->ip_blocks[i].version->funcs->name, r);
2717                         return r;
2718                 }
2719                 adev->ip_blocks[i].status.hw = true;
2720         }
2721
2722         return 0;
2723 }
2724
2725 /**
2726  * amdgpu_device_ip_resume - run resume for hardware IPs
2727  *
2728  * @adev: amdgpu_device pointer
2729  *
2730  * Main resume function for hardware IPs.  The hardware IPs
2731  * are split into two resume functions because they are
2732  * also used in recovering from a GPU reset and some additional
2733  * steps need to be taken between them.  In this case (S3/S4) they are
2734  * run sequentially.
2735  * Returns 0 on success, negative error code on failure.
2736  */
2737 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
2738 {
2739         int r;
2740
2741         r = amdgpu_device_ip_resume_phase1(adev);
2742         if (r)
2743                 return r;
2744
2745         r = amdgpu_device_fw_loading(adev);
2746         if (r)
2747                 return r;
2748
2749         r = amdgpu_device_ip_resume_phase2(adev);
2750
2751         return r;
2752 }
2753
2754 /**
2755  * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
2756  *
2757  * @adev: amdgpu_device pointer
2758  *
2759  * Query the VBIOS data tables to determine if the board supports SR-IOV.
2760  */
2761 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
2762 {
2763         if (amdgpu_sriov_vf(adev)) {
2764                 if (adev->is_atom_fw) {
2765                         if (amdgpu_atomfirmware_gpu_supports_virtualization(adev))
2766                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2767                 } else {
2768                         if (amdgpu_atombios_has_gpu_virtualization_table(adev))
2769                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2770                 }
2771
2772                 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
2773                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
2774         }
2775 }
2776
2777 /**
2778  * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
2779  *
2780  * @asic_type: AMD asic type
2781  *
2782  * Check if there is DC (new modesetting infrastructure) support for an asic.
2783  * returns true if DC has support, false if not.
2784  */
2785 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
2786 {
2787         switch (asic_type) {
2788 #if defined(CONFIG_DRM_AMD_DC)
2789 #if defined(CONFIG_DRM_AMD_DC_SI)
2790         case CHIP_TAHITI:
2791         case CHIP_PITCAIRN:
2792         case CHIP_VERDE:
2793         case CHIP_OLAND:
2794 #endif
2795         case CHIP_BONAIRE:
2796         case CHIP_KAVERI:
2797         case CHIP_KABINI:
2798         case CHIP_MULLINS:
2799                 /*
2800                  * We have systems in the wild with these ASICs that require
2801                  * LVDS and VGA support which is not supported with DC.
2802                  *
2803                  * Fallback to the non-DC driver here by default so as not to
2804                  * cause regressions.
2805                  */
2806                 return amdgpu_dc > 0;
2807         case CHIP_HAWAII:
2808         case CHIP_CARRIZO:
2809         case CHIP_STONEY:
2810         case CHIP_POLARIS10:
2811         case CHIP_POLARIS11:
2812         case CHIP_POLARIS12:
2813         case CHIP_VEGAM:
2814         case CHIP_TONGA:
2815         case CHIP_FIJI:
2816         case CHIP_VEGA10:
2817         case CHIP_VEGA12:
2818         case CHIP_VEGA20:
2819 #if defined(CONFIG_DRM_AMD_DC_DCN)
2820         case CHIP_RAVEN:
2821         case CHIP_NAVI10:
2822         case CHIP_NAVI14:
2823         case CHIP_NAVI12:
2824         case CHIP_RENOIR:
2825 #endif
2826 #if defined(CONFIG_DRM_AMD_DC_DCN3_0)
2827         case CHIP_SIENNA_CICHLID:
2828         case CHIP_NAVY_FLOUNDER:
2829 #endif
2830                 return amdgpu_dc != 0;
2831 #endif
2832         default:
2833                 if (amdgpu_dc > 0)
2834                         DRM_INFO("Display Core has been requested via kernel parameter "
2835                                          "but isn't supported by ASIC, ignoring\n");
2836                 return false;
2837         }
2838 }
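
/*
 * Illustrative note (parameter name assumed from the amdgpu_dc variable
 * used above): amdgpu.dc=1 requests DC even on the ASICs that default to
 * the legacy display code, while amdgpu.dc=0 disables DC where a fallback
 * exists.
 */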
2839
2840 /**
2841  * amdgpu_device_has_dc_support - check if dc is supported
2842  *
2843  * @adev: amdgpu_device pointer
2844  *
2845  * Returns true for supported, false for not supported
2846  */
2847 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
2848 {
2849         if (amdgpu_sriov_vf(adev))
2850                 return false;
2851
2852         return amdgpu_device_asic_has_dc_support(adev->asic_type);
2853 }
2854
2855
2856 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
2857 {
2858         struct amdgpu_device *adev =
2859                 container_of(__work, struct amdgpu_device, xgmi_reset_work);
2860         struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
2861
2862         /* It's a bug to not have a hive within this function */
2863         if (WARN_ON(!hive))
2864                 return;
2865
2866         /*
2867          * Use task barrier to synchronize all xgmi reset works across the
2868          * hive. task_barrier_enter and task_barrier_exit will block
2869          * until all the threads running the xgmi reset works reach
2870          * those points. task_barrier_full will do both blocks.
2871          */
2872         if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
2873
2874                 task_barrier_enter(&hive->tb);
2875                 adev->asic_reset_res = amdgpu_device_baco_enter(adev->ddev);
2876
2877                 if (adev->asic_reset_res)
2878                         goto fail;
2879
2880                 task_barrier_exit(&hive->tb);
2881                 adev->asic_reset_res = amdgpu_device_baco_exit(adev->ddev);
2882
2883                 if (adev->asic_reset_res)
2884                         goto fail;
2885
2886                 if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count)
2887                         adev->mmhub.funcs->reset_ras_error_count(adev);
2888         } else {
2889
2890                 task_barrier_full(&hive->tb);
2891                 adev->asic_reset_res =  amdgpu_asic_reset(adev);
2892         }
2893
2894 fail:
2895         if (adev->asic_reset_res)
2896                 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
2897                          adev->asic_reset_res, adev->ddev->unique);
2898 }
2899
2900 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
2901 {
2902         char *input = amdgpu_lockup_timeout;
2903         char *timeout_setting = NULL;
2904         int index = 0;
2905         long timeout;
2906         int ret = 0;
2907
2908         /*
2909          * By default the timeout for non-compute jobs is 10000 msec and
2910          * there is no timeout enforced on compute jobs. In SR-IOV or
2911          * passthrough mode, the timeout for compute jobs is 60000 msec
2912          * by default.
2913          */
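        /*
         * Illustrative example (not in the original source): booting with
         *   amdgpu.lockup_timeout=10000,60000,10000,10000
         * sets the gfx, compute, sdma and video timeouts in that order, per
         * the parsing below; a single value such as
         *   amdgpu.lockup_timeout=5000
         * applies to all non-compute jobs (and to compute jobs as well under
         * SR-IOV/passthrough).
         */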
2914         adev->gfx_timeout = msecs_to_jiffies(10000);
2915         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
2916         if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
2917                 adev->compute_timeout =  msecs_to_jiffies(60000);
2918         else
2919                 adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
2920
2921         if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
2922                 while ((timeout_setting = strsep(&input, ",")) &&
2923                                 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
2924                         ret = kstrtol(timeout_setting, 0, &timeout);
2925                         if (ret)
2926                                 return ret;
2927
2928                         if (timeout == 0) {
2929                                 index++;
2930                                 continue;
2931                         } else if (timeout < 0) {
2932                                 timeout = MAX_SCHEDULE_TIMEOUT;
2933                         } else {
2934                                 timeout = msecs_to_jiffies(timeout);
2935                         }
2936
2937                         switch (index++) {
2938                         case 0:
2939                                 adev->gfx_timeout = timeout;
2940                                 break;
2941                         case 1:
2942                                 adev->compute_timeout = timeout;
2943                                 break;
2944                         case 2:
2945                                 adev->sdma_timeout = timeout;
2946                                 break;
2947                         case 3:
2948                                 adev->video_timeout = timeout;
2949                                 break;
2950                         default:
2951                                 break;
2952                         }
2953                 }
2954                 /*
2955                  * There is only one value specified and
2956                  * it should apply to all non-compute jobs.
2957                  */
2958                 if (index == 1) {
2959                         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
2960                         if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
2961                                 adev->compute_timeout = adev->gfx_timeout;
2962                 }
2963         }
2964
2965         return ret;
2966 }
2967
2968 static const struct attribute *amdgpu_dev_attributes[] = {
2969         &dev_attr_product_name.attr,
2970         &dev_attr_product_number.attr,
2971         &dev_attr_serial_number.attr,
2972         &dev_attr_pcie_replay_count.attr,
2973         NULL
2974 };
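/*
 * Note added for orientation (not in the original source): these attributes
 * are created on the device kobject in amdgpu_device_init() below, so they
 * typically appear under the PCI device's sysfs directory, e.g.
 * /sys/bus/pci/devices/0000:03:00.0/product_name (the path is illustrative).
 */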
2975
2976 /**
2977  * amdgpu_device_init - initialize the driver
2978  *
2979  * @adev: amdgpu_device pointer
2980  * @ddev: drm dev pointer
2981  * @pdev: pci dev pointer
2982  * @flags: driver flags
2983  *
2984  * Initializes the driver info and hw (all asics).
2985  * Returns 0 for success or an error on failure.
2986  * Called at driver startup.
2987  */
2988 int amdgpu_device_init(struct amdgpu_device *adev,
2989                        struct drm_device *ddev,
2990                        struct pci_dev *pdev,
2991                        uint32_t flags)
2992 {
2993         int r, i;
2994         bool boco = false;
2995         u32 max_MBps;
2996
2997         adev->shutdown = false;
2998         adev->dev = &pdev->dev;
2999         adev->ddev = ddev;
3000         adev->pdev = pdev;
3001         adev->flags = flags;
3002
3003         if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3004                 adev->asic_type = amdgpu_force_asic_type;
3005         else
3006                 adev->asic_type = flags & AMD_ASIC_MASK;
3007
3008         adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3009         if (amdgpu_emu_mode == 1)
3010                 adev->usec_timeout *= 10;
3011         adev->gmc.gart_size = 512 * 1024 * 1024;
3012         adev->accel_working = false;
3013         adev->num_rings = 0;
3014         adev->mman.buffer_funcs = NULL;
3015         adev->mman.buffer_funcs_ring = NULL;
3016         adev->vm_manager.vm_pte_funcs = NULL;
3017         adev->vm_manager.vm_pte_num_scheds = 0;
3018         adev->gmc.gmc_funcs = NULL;
3019         adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3020         bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3021
3022         adev->smc_rreg = &amdgpu_invalid_rreg;
3023         adev->smc_wreg = &amdgpu_invalid_wreg;
3024         adev->pcie_rreg = &amdgpu_invalid_rreg;
3025         adev->pcie_wreg = &amdgpu_invalid_wreg;
3026         adev->pciep_rreg = &amdgpu_invalid_rreg;
3027         adev->pciep_wreg = &amdgpu_invalid_wreg;
3028         adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3029         adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3030         adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3031         adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3032         adev->didt_rreg = &amdgpu_invalid_rreg;
3033         adev->didt_wreg = &amdgpu_invalid_wreg;
3034         adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3035         adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3036         adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3037         adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3038
3039         DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3040                  amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3041                  pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3042
3043         /* mutex initializations are all done here so we
3044          * can recall these functions without locking issues */
3045         atomic_set(&adev->irq.ih.lock, 0);
3046         mutex_init(&adev->firmware.mutex);
3047         mutex_init(&adev->pm.mutex);
3048         mutex_init(&adev->gfx.gpu_clock_mutex);
3049         mutex_init(&adev->srbm_mutex);
3050         mutex_init(&adev->gfx.pipe_reserve_mutex);
3051         mutex_init(&adev->gfx.gfx_off_mutex);
3052         mutex_init(&adev->grbm_idx_mutex);
3053         mutex_init(&adev->mn_lock);
3054         mutex_init(&adev->virt.vf_errors.lock);
3055         hash_init(adev->mn_hash);
3056         init_rwsem(&adev->reset_sem);
3057         atomic_set(&adev->in_gpu_reset, 0);
3058         mutex_init(&adev->psp.mutex);
3059         mutex_init(&adev->notifier_lock);
3060
3061         r = amdgpu_device_check_arguments(adev);
3062         if (r)
3063                 return r;
3064
3065         spin_lock_init(&adev->mmio_idx_lock);
3066         spin_lock_init(&adev->smc_idx_lock);
3067         spin_lock_init(&adev->pcie_idx_lock);
3068         spin_lock_init(&adev->uvd_ctx_idx_lock);
3069         spin_lock_init(&adev->didt_idx_lock);
3070         spin_lock_init(&adev->gc_cac_idx_lock);
3071         spin_lock_init(&adev->se_cac_idx_lock);
3072         spin_lock_init(&adev->audio_endpt_idx_lock);
3073         spin_lock_init(&adev->mm_stats.lock);
3074
3075         INIT_LIST_HEAD(&adev->shadow_list);
3076         mutex_init(&adev->shadow_list_lock);
3077
3078         INIT_DELAYED_WORK(&adev->delayed_init_work,
3079                           amdgpu_device_delayed_init_work_handler);
3080         INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3081                           amdgpu_device_delay_enable_gfx_off);
3082
3083         INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3084
3085         adev->gfx.gfx_off_req_count = 1;
3086         adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3087
3088         atomic_set(&adev->throttling_logging_enabled, 1);
3089         /*
3090          * If throttling continues, logging will be performed every minute
3091          * to avoid log flooding. "-1" is subtracted since the thermal
3092          * throttling interrupt comes every second. Thus, the total logging
3093          * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3094          * for the throttling interrupt) = 60 seconds.
3095          */
3096         ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3097         ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3098
3099         /* Registers mapping */
3100         /* TODO: block userspace mapping of io register */
3101         if (adev->asic_type >= CHIP_BONAIRE) {
3102                 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3103                 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3104         } else {
3105                 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3106                 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3107         }
3108
3109         adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3110         if (adev->rmmio == NULL) {
3111                 return -ENOMEM;
3112         }
3113         DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3114         DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3115
3116         /* io port mapping */
3117         for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
3118                 if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) {
3119                         adev->rio_mem_size = pci_resource_len(adev->pdev, i);
3120                         adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size);
3121                         break;
3122                 }
3123         }
3124         if (adev->rio_mem == NULL)
3125                 DRM_INFO("PCI I/O BAR is not found.\n");
3126
3127         /* enable PCIE atomic ops */
3128         r = pci_enable_atomic_ops_to_root(adev->pdev,
3129                                           PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3130                                           PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3131         if (r) {
3132                 adev->have_atomics_support = false;
3133                 DRM_INFO("PCIE atomic ops are not supported\n");
3134         } else {
3135                 adev->have_atomics_support = true;
3136         }
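        /*
         * Note (assumption, not in the original source): have_atomics_support
         * records whether 32- and 64-bit AtomicOp completion is available on
         * the path to the root port; compute users such as KFD are assumed to
         * consult this flag before relying on PCIe atomics.
         */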
3137
3138         amdgpu_device_get_pcie_info(adev);
3139
3140         if (amdgpu_mcbp)
3141                 DRM_INFO("MCBP is enabled\n");
3142
3143         if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
3144                 adev->enable_mes = true;
3145
3146         /* detect hw virtualization here */
3147         amdgpu_detect_virtualization(adev);
3148
3149         r = amdgpu_device_get_job_timeout_settings(adev);
3150         if (r) {
3151                 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3152                 return r;
3153         }
3154
3155         /* early init functions */
3156         r = amdgpu_device_ip_early_init(adev);
3157         if (r)
3158                 return r;
3159
3160         /* doorbell bar mapping and doorbell index init */
3161         amdgpu_device_doorbell_init(adev);
3162
3163         /* if we have more than one VGA card, then disable the amdgpu VGA resources */
3164         /* this will fail for cards that aren't VGA class devices, just
3165          * ignore it */
3166         vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode);
3167
3168         if (amdgpu_device_supports_boco(ddev))
3169                 boco = true;
3170         if (amdgpu_has_atpx() &&
3171             (amdgpu_is_atpx_hybrid() ||
3172              amdgpu_has_atpx_dgpu_power_cntl()) &&
3173             !pci_is_thunderbolt_attached(adev->pdev))
3174                 vga_switcheroo_register_client(adev->pdev,
3175                                                &amdgpu_switcheroo_ops, boco);
3176         if (boco)
3177                 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3178
3179         if (amdgpu_emu_mode == 1) {
3180                 /* post the asic in emulation mode */
3181                 emu_soc_asic_init(adev);
3182                 goto fence_driver_init;
3183         }
3184
3185         /* detect whether we are running with an SR-IOV vBIOS */
3186         amdgpu_device_detect_sriov_bios(adev);
3187
3188         /* check if we need to reset the asic
3189          *  E.g., driver was not cleanly unloaded previously, etc.
3190          */
3191         if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3192                 r = amdgpu_asic_reset(adev);
3193                 if (r) {
3194                         dev_err(adev->dev, "asic reset on init failed\n");
3195                         goto failed;
3196                 }
3197         }
3198
3199         /* Post card if necessary */
3200         if (amdgpu_device_need_post(adev)) {
3201                 if (!adev->bios) {
3202                         dev_err(adev->dev, "no vBIOS found\n");
3203                         r = -EINVAL;
3204                         goto failed;
3205                 }
3206                 DRM_INFO("GPU posting now...\n");
3207                 r = amdgpu_atom_asic_init(adev->mode_info.atom_context);
3208                 if (r) {
3209                         dev_err(adev->dev, "gpu post error!\n");
3210                         goto failed;
3211                 }
3212         }
3213
3214         if (adev->is_atom_fw) {
3215                 /* Initialize clocks */
3216                 r = amdgpu_atomfirmware_get_clock_info(adev);
3217                 if (r) {
3218                         dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3219                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3220                         goto failed;
3221                 }
3222         } else {
3223                 /* Initialize clocks */
3224                 r = amdgpu_atombios_get_clock_info(adev);
3225                 if (r) {
3226                         dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3227                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3228                         goto failed;
3229                 }
3230                 /* init i2c buses */
3231                 if (!amdgpu_device_has_dc_support(adev))
3232                         amdgpu_atombios_i2c_init(adev);
3233         }
3234
3235 fence_driver_init:
3236         /* Fence driver */
3237         r = amdgpu_fence_driver_init(adev);
3238         if (r) {
3239                 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
3240                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3241                 goto failed;
3242         }
3243
3244         /* init the mode config */
3245         drm_mode_config_init(adev->ddev);
3246
3247         r = amdgpu_device_ip_init(adev);
3248         if (r) {
3249                 /* failed in exclusive mode due to timeout */
3250                 if (amdgpu_sriov_vf(adev) &&
3251                     !amdgpu_sriov_runtime(adev) &&
3252                     amdgpu_virt_mmio_blocked(adev) &&
3253                     !amdgpu_virt_wait_reset(adev)) {
3254                         dev_err(adev->dev, "VF exclusive mode timeout\n");
3255                         /* Don't send request since VF is inactive. */
3256                         adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3257                         adev->virt.ops = NULL;
3258                         r = -EAGAIN;
3259                         goto failed;
3260                 }
3261                 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3262                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3263                 goto failed;
3264         }
3265
3266         dev_info(adev->dev,
3267                 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3268                         adev->gfx.config.max_shader_engines,
3269                         adev->gfx.config.max_sh_per_se,
3270                         adev->gfx.config.max_cu_per_sh,
3271                         adev->gfx.cu_info.number);
3272
3273         adev->accel_working = true;
3274
3275         amdgpu_vm_check_compute_bug(adev);
3276
3277         /* Initialize the buffer migration limit. */
3278         if (amdgpu_moverate >= 0)
3279                 max_MBps = amdgpu_moverate;
3280         else
3281                 max_MBps = 8; /* Allow 8 MB/s. */
3282         /* Get a log2 for easy divisions. */
3283         adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
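        /*
         * Worked example (added for clarity): with the default 8 MB/s,
         * ilog2(8) = 3, so a later consumer can approximate "bytes / rate"
         * with a cheap right shift (bytes >> log2_max_MBps) instead of a
         * division.
         */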
3284
3285         amdgpu_fbdev_init(adev);
3286
3287         r = amdgpu_pm_sysfs_init(adev);
3288         if (r) {
3289                 adev->pm_sysfs_en = false;
3290                 DRM_ERROR("registering pm debugfs failed (%d).\n", r);
3291         } else
3292                 adev->pm_sysfs_en = true;
3293
3294         r = amdgpu_ucode_sysfs_init(adev);
3295         if (r) {
3296                 adev->ucode_sysfs_en = false;
3297                 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3298         } else
3299                 adev->ucode_sysfs_en = true;
3300
3301         if ((amdgpu_testing & 1)) {
3302                 if (adev->accel_working)
3303                         amdgpu_test_moves(adev);
3304                 else
3305                         DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");
3306         }
3307         if (amdgpu_benchmarking) {
3308                 if (adev->accel_working)
3309                         amdgpu_benchmark(adev, amdgpu_benchmarking);
3310                 else
3311                         DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
3312         }
3313
3314         /*
3315          * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3316          * Otherwise the mgpu fan boost feature will be skipped because
3317          * the gpu instance count would be too low.
3318          */
3319         amdgpu_register_gpu_instance(adev);
3320
3321         /* enable clockgating, etc. after ib tests, etc. since some blocks require
3322          * explicit gating rather than handling it automatically.
3323          */
3324         r = amdgpu_device_ip_late_init(adev);
3325         if (r) {
3326                 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3327                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3328                 goto failed;
3329         }
3330
3331         /* must succeed. */
3332         amdgpu_ras_resume(adev);
3333
3334         queue_delayed_work(system_wq, &adev->delayed_init_work,
3335                            msecs_to_jiffies(AMDGPU_RESUME_MS));
3336
3337         if (amdgpu_sriov_vf(adev))
3338                 flush_delayed_work(&adev->delayed_init_work);
3339
3340         r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3341         if (r) {
3342                 dev_err(adev->dev, "Could not create amdgpu device attr\n");
3343                 return r;
3344         }
3345
3346         if (IS_ENABLED(CONFIG_PERF_EVENTS))
3347                 r = amdgpu_pmu_init(adev);
3348         if (r)
3349                 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3350
3351         return 0;
3352
3353 failed:
3354         amdgpu_vf_error_trans_all(adev);
3355         if (boco)
3356                 vga_switcheroo_fini_domain_pm_ops(adev->dev);
3357
3358         return r;
3359 }
3360
3361 /**
3362  * amdgpu_device_fini - tear down the driver
3363  *
3364  * @adev: amdgpu_device pointer
3365  *
3366  * Tear down the driver info (all asics).
3367  * Called at driver shutdown.
3368  */
3369 void amdgpu_device_fini(struct amdgpu_device *adev)
3370 {
3371         int r;
3372
3373         DRM_INFO("amdgpu: finishing device.\n");
3374         flush_delayed_work(&adev->delayed_init_work);
3375         adev->shutdown = true;
3376
3377         /* make sure the IB test has finished before entering exclusive mode
3378          * to avoid preemption during the IB test
3379          */
3380         if (amdgpu_sriov_vf(adev))
3381                 amdgpu_virt_request_full_gpu(adev, false);
3382
3383         /* disable all interrupts */
3384         amdgpu_irq_disable_all(adev);
3385         if (adev->mode_info.mode_config_initialized){
3386                 if (!amdgpu_device_has_dc_support(adev))
3387                         drm_helper_force_disable_all(adev->ddev);
3388                 else
3389                         drm_atomic_helper_shutdown(adev->ddev);
3390         }
3391         amdgpu_fence_driver_fini(adev);
3392         if (adev->pm_sysfs_en)
3393                 amdgpu_pm_sysfs_fini(adev);
3394         amdgpu_fbdev_fini(adev);
3395         r = amdgpu_device_ip_fini(adev);
3396         release_firmware(adev->firmware.gpu_info_fw);
3397         adev->firmware.gpu_info_fw = NULL;
3398         adev->accel_working = false;
3399         /* free i2c buses */
3400         if (!amdgpu_device_has_dc_support(adev))
3401                 amdgpu_i2c_fini(adev);
3402
3403         if (amdgpu_emu_mode != 1)
3404                 amdgpu_atombios_fini(adev);
3405
3406         kfree(adev->bios);
3407         adev->bios = NULL;
3408         if (amdgpu_has_atpx() &&
3409             (amdgpu_is_atpx_hybrid() ||
3410              amdgpu_has_atpx_dgpu_power_cntl()) &&
3411             !pci_is_thunderbolt_attached(adev->pdev))
3412                 vga_switcheroo_unregister_client(adev->pdev);
3413         if (amdgpu_device_supports_boco(adev->ddev))
3414                 vga_switcheroo_fini_domain_pm_ops(adev->dev);
3415         vga_client_register(adev->pdev, NULL, NULL, NULL);
3416         if (adev->rio_mem)
3417                 pci_iounmap(adev->pdev, adev->rio_mem);
3418         adev->rio_mem = NULL;
3419         iounmap(adev->rmmio);
3420         adev->rmmio = NULL;
3421         amdgpu_device_doorbell_fini(adev);
3422
3423         if (adev->ucode_sysfs_en)
3424                 amdgpu_ucode_sysfs_fini(adev);
3425
3426         sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
3427         if (IS_ENABLED(CONFIG_PERF_EVENTS))
3428                 amdgpu_pmu_fini(adev);
3429         if (adev->mman.discovery_bin)
3430                 amdgpu_discovery_fini(adev);
3431 }
3432
3433
3434 /*
3435  * Suspend & resume.
3436  */
3437 /**
3438  * amdgpu_device_suspend - initiate device suspend
3439  *
3440  * @dev: drm dev pointer
3441  * @fbcon: notify the fbdev of suspend
3442  *
3443  * Puts the hw in the suspend state (all asics).
3444  * Returns 0 for success or an error on failure.
3445  * Called at driver suspend.
3446  */
3447 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
3448 {
3449         struct amdgpu_device *adev;
3450         struct drm_crtc *crtc;
3451         struct drm_connector *connector;
3452         struct drm_connector_list_iter iter;
3453         int r;
3454
3455         if (dev == NULL || dev->dev_private == NULL) {
3456                 return -ENODEV;
3457         }
3458
3459         adev = dev->dev_private;
3460
3461         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3462                 return 0;
3463
3464         adev->in_suspend = true;
3465         drm_kms_helper_poll_disable(dev);
3466
3467         if (fbcon)
3468                 amdgpu_fbdev_set_suspend(adev, 1);
3469
3470         cancel_delayed_work_sync(&adev->delayed_init_work);
3471
3472         if (!amdgpu_device_has_dc_support(adev)) {
3473                 /* turn off display hw */
3474                 drm_modeset_lock_all(dev);
3475                 drm_connector_list_iter_begin(dev, &iter);
3476                 drm_for_each_connector_iter(connector, &iter)
3477                         drm_helper_connector_dpms(connector,
3478                                                   DRM_MODE_DPMS_OFF);
3479                 drm_connector_list_iter_end(&iter);
3480                 drm_modeset_unlock_all(dev);
3481                         /* unpin the front buffers and cursors */
3482                 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3483                         struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3484                         struct drm_framebuffer *fb = crtc->primary->fb;
3485                         struct amdgpu_bo *robj;
3486
3487                         if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3488                                 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3489                                 r = amdgpu_bo_reserve(aobj, true);
3490                                 if (r == 0) {
3491                                         amdgpu_bo_unpin(aobj);
3492                                         amdgpu_bo_unreserve(aobj);
3493                                 }
3494                         }
3495
3496                         if (fb == NULL || fb->obj[0] == NULL) {
3497                                 continue;
3498                         }
3499                         robj = gem_to_amdgpu_bo(fb->obj[0]);
3500                         /* don't unpin kernel fb objects */
3501                         if (!amdgpu_fbdev_robj_is_fb(adev, robj)) {
3502                                 r = amdgpu_bo_reserve(robj, true);
3503                                 if (r == 0) {
3504                                         amdgpu_bo_unpin(robj);
3505                                         amdgpu_bo_unreserve(robj);
3506                                 }
3507                         }
3508                 }
3509         }
3510
3511         amdgpu_ras_suspend(adev);
3512
3513         r = amdgpu_device_ip_suspend_phase1(adev);
3514
3515         amdgpu_amdkfd_suspend(adev, !fbcon);
3516
3517         /* evict vram memory */
3518         amdgpu_bo_evict_vram(adev);
3519
3520         amdgpu_fence_driver_suspend(adev);
3521
3522         r = amdgpu_device_ip_suspend_phase2(adev);
3523
3524         /* evict remaining vram memory
3525          * This second call to evict vram is to evict the gart page table
3526          * using the CPU.
3527          */
3528         amdgpu_bo_evict_vram(adev);
3529
3530         return 0;
3531 }
3532
3533 /**
3534  * amdgpu_device_resume - initiate device resume
3535  *
3536  * @dev: drm dev pointer
3537  * @fbcon: notify the fbdev of resume
3538  *
3539  * Bring the hw back to operating state (all asics).
3540  * Returns 0 for success or an error on failure.
3541  * Called at driver resume.
3542  */
3543 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
3544 {
3545         struct drm_connector *connector;
3546         struct drm_connector_list_iter iter;
3547         struct amdgpu_device *adev = dev->dev_private;
3548         struct drm_crtc *crtc;
3549         int r = 0;
3550
3551         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3552                 return 0;
3553
3554         /* post card */
3555         if (amdgpu_device_need_post(adev)) {
3556                 r = amdgpu_atom_asic_init(adev->mode_info.atom_context);
3557                 if (r)
3558                         DRM_ERROR("amdgpu asic init failed\n");
3559         }
3560
3561         r = amdgpu_device_ip_resume(adev);
3562         if (r) {
3563                 DRM_ERROR("amdgpu_device_ip_resume failed (%d).\n", r);
3564                 return r;
3565         }
3566         amdgpu_fence_driver_resume(adev);
3567
3568
3569         r = amdgpu_device_ip_late_init(adev);
3570         if (r)
3571                 return r;
3572
3573         queue_delayed_work(system_wq, &adev->delayed_init_work,
3574                            msecs_to_jiffies(AMDGPU_RESUME_MS));
3575
3576         if (!amdgpu_device_has_dc_support(adev)) {
3577                 /* pin cursors */
3578                 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3579                         struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3580
3581                         if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3582                                 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3583                                 r = amdgpu_bo_reserve(aobj, true);
3584                                 if (r == 0) {
3585                                         r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM);
3586                                         if (r != 0)
3587                                                 DRM_ERROR("Failed to pin cursor BO (%d)\n", r);
3588                                         amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj);
3589                                         amdgpu_bo_unreserve(aobj);
3590                                 }
3591                         }
3592                 }
3593         }
3594         r = amdgpu_amdkfd_resume(adev, !fbcon);
3595         if (r)
3596                 return r;
3597
3598         /* Make sure IB tests flushed */
3599         flush_delayed_work(&adev->delayed_init_work);
3600
3601         /* blat the mode back in */
3602         if (fbcon) {
3603                 if (!amdgpu_device_has_dc_support(adev)) {
3604                         /* pre DCE11 */
3605                         drm_helper_resume_force_mode(dev);
3606
3607                         /* turn on display hw */
3608                         drm_modeset_lock_all(dev);
3609
3610                         drm_connector_list_iter_begin(dev, &iter);
3611                         drm_for_each_connector_iter(connector, &iter)
3612                                 drm_helper_connector_dpms(connector,
3613                                                           DRM_MODE_DPMS_ON);
3614                         drm_connector_list_iter_end(&iter);
3615
3616                         drm_modeset_unlock_all(dev);
3617                 }
3618                 amdgpu_fbdev_set_suspend(adev, 0);
3619         }
3620
3621         drm_kms_helper_poll_enable(dev);
3622
3623         amdgpu_ras_resume(adev);
3624
3625         /*
3626          * Most of the connector probing functions try to acquire runtime pm
3627          * refs to ensure that the GPU is powered on when connector polling is
3628          * performed. Since we're calling this from a runtime PM callback,
3629          * trying to acquire rpm refs will cause us to deadlock.
3630          *
3631          * Since we're guaranteed to be holding the rpm lock, it's safe to
3632          * temporarily disable the rpm helpers so this doesn't deadlock us.
3633          */
3634 #ifdef CONFIG_PM
3635         dev->dev->power.disable_depth++;
3636 #endif
3637         if (!amdgpu_device_has_dc_support(adev))
3638                 drm_helper_hpd_irq_event(dev);
3639         else
3640                 drm_kms_helper_hotplug_event(dev);
3641 #ifdef CONFIG_PM
3642         dev->dev->power.disable_depth--;
3643 #endif
3644         adev->in_suspend = false;
3645
3646         return 0;
3647 }
3648
3649 /**
3650  * amdgpu_device_ip_check_soft_reset - did soft reset succeed
3651  *
3652  * @adev: amdgpu_device pointer
3653  *
3654  * The list of all the hardware IPs that make up the asic is walked and
3655  * the check_soft_reset callbacks are run.  check_soft_reset determines
3656  * if the asic is still hung or not.
3657  * Returns true if any of the IPs are still in a hung state, false if not.
3658  */
3659 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
3660 {
3661         int i;
3662         bool asic_hang = false;
3663
3664         if (amdgpu_sriov_vf(adev))
3665                 return true;
3666
3667         if (amdgpu_asic_need_full_reset(adev))
3668                 return true;
3669
3670         for (i = 0; i < adev->num_ip_blocks; i++) {
3671                 if (!adev->ip_blocks[i].status.valid)
3672                         continue;
3673                 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
3674                         adev->ip_blocks[i].status.hang =
3675                                 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
3676                 if (adev->ip_blocks[i].status.hang) {
3677                         DRM_INFO("IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
3678                         asic_hang = true;
3679                 }
3680         }
3681         return asic_hang;
3682 }
3683
3684 /**
3685  * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
3686  *
3687  * @adev: amdgpu_device pointer
3688  *
3689  * The list of all the hardware IPs that make up the asic is walked and the
3690  * pre_soft_reset callbacks are run if the block is hung.  pre_soft_reset
3691  * handles any IP specific hardware or software state changes that are
3692  * necessary for a soft reset to succeed.
3693  * Returns 0 on success, negative error code on failure.
3694  */
3695 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
3696 {
3697         int i, r = 0;
3698
3699         for (i = 0; i < adev->num_ip_blocks; i++) {
3700                 if (!adev->ip_blocks[i].status.valid)
3701                         continue;
3702                 if (adev->ip_blocks[i].status.hang &&
3703                     adev->ip_blocks[i].version->funcs->pre_soft_reset) {
3704                         r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
3705                         if (r)
3706                                 return r;
3707                 }
3708         }
3709
3710         return 0;
3711 }
3712
3713 /**
3714  * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
3715  *
3716  * @adev: amdgpu_device pointer
3717  *
3718  * Some hardware IPs cannot be soft reset.  If they are hung, a full gpu
3719  * reset is necessary to recover.
3720  * Returns true if a full asic reset is required, false if not.
3721  */
3722 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
3723 {
3724         int i;
3725
3726         if (amdgpu_asic_need_full_reset(adev))
3727                 return true;
3728
3729         for (i = 0; i < adev->num_ip_blocks; i++) {
3730                 if (!adev->ip_blocks[i].status.valid)
3731                         continue;
3732                 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
3733                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
3734                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
3735                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
3736                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3737                         if (adev->ip_blocks[i].status.hang) {
3738                                 DRM_INFO("Some blocks need a full reset!\n");
3739                                 return true;
3740                         }
3741                 }
3742         }
3743         return false;
3744 }
3745
3746 /**
3747  * amdgpu_device_ip_soft_reset - do a soft reset
3748  *
3749  * @adev: amdgpu_device pointer
3750  *
3751  * The list of all the hardware IPs that make up the asic is walked and the
3752  * soft_reset callbacks are run if the block is hung.  soft_reset handles any
3753  * IP specific hardware or software state changes that are necessary to soft
3754  * reset the IP.
3755  * Returns 0 on success, negative error code on failure.
3756  */
3757 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
3758 {
3759         int i, r = 0;
3760
3761         for (i = 0; i < adev->num_ip_blocks; i++) {
3762                 if (!adev->ip_blocks[i].status.valid)
3763                         continue;
3764                 if (adev->ip_blocks[i].status.hang &&
3765                     adev->ip_blocks[i].version->funcs->soft_reset) {
3766                         r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
3767                         if (r)
3768                                 return r;
3769                 }
3770         }
3771
3772         return 0;
3773 }
3774
3775 /**
3776  * amdgpu_device_ip_post_soft_reset - clean up from soft reset
3777  *
3778  * @adev: amdgpu_device pointer
3779  *
3780  * The list of all the hardware IPs that make up the asic is walked and the
3781  * post_soft_reset callbacks are run if the asic was hung.  post_soft_reset
3782  * handles any IP specific hardware or software state changes that are
3783  * necessary after the IP has been soft reset.
3784  * Returns 0 on success, negative error code on failure.
3785  */
3786 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
3787 {
3788         int i, r = 0;
3789
3790         for (i = 0; i < adev->num_ip_blocks; i++) {
3791                 if (!adev->ip_blocks[i].status.valid)
3792                         continue;
3793                 if (adev->ip_blocks[i].status.hang &&
3794                     adev->ip_blocks[i].version->funcs->post_soft_reset)
3795                         r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
3796                 if (r)
3797                         return r;
3798         }
3799
3800         return 0;
3801 }
3802
3803 /**
3804  * amdgpu_device_recover_vram - Recover some VRAM contents
3805  *
3806  * @adev: amdgpu_device pointer
3807  *
3808  * Restores the contents of VRAM buffers from the shadows in GTT.  Used to
3809  * restore things like GPUVM page tables after a GPU reset where
3810  * the contents of VRAM might be lost.
3811  *
3812  * Returns:
3813  * 0 on success, negative error code on failure.
3814  */
3815 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
3816 {
3817         struct dma_fence *fence = NULL, *next = NULL;
3818         struct amdgpu_bo *shadow;
3819         long r = 1, tmo;
3820
3821         if (amdgpu_sriov_runtime(adev))
3822                 tmo = msecs_to_jiffies(8000);
3823         else
3824                 tmo = msecs_to_jiffies(100);
3825
3826         DRM_INFO("recover vram bo from shadow start\n");
3827         mutex_lock(&adev->shadow_list_lock);
3828         list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {
3829
3830                 /* No need to recover an evicted BO */
3831                 if (shadow->tbo.mem.mem_type != TTM_PL_TT ||
3832                     shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET ||
3833                     shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM)
3834                         continue;
3835
3836                 r = amdgpu_bo_restore_shadow(shadow, &next);
3837                 if (r)
3838                         break;
3839
3840                 if (fence) {
3841                         tmo = dma_fence_wait_timeout(fence, false, tmo);
3842                         dma_fence_put(fence);
3843                         fence = next;
3844                         if (tmo == 0) {
3845                                 r = -ETIMEDOUT;
3846                                 break;
3847                         } else if (tmo < 0) {
3848                                 r = tmo;
3849                                 break;
3850                         }
3851                 } else {
3852                         fence = next;
3853                 }
3854         }
3855         mutex_unlock(&adev->shadow_list_lock);
3856
3857         if (fence)
3858                 tmo = dma_fence_wait_timeout(fence, false, tmo);
3859         dma_fence_put(fence);
3860
3861         if (r < 0 || tmo <= 0) {
3862                 DRM_ERROR("recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
3863                 return -EIO;
3864         }
3865
3866         DRM_INFO("recover vram bo from shadow done\n");
3867         return 0;
3868 }
3869
3870
3871 /**
3872  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
3873  *
3874  * @adev: amdgpu device pointer
3875  * @from_hypervisor: request from hypervisor
3876  *
3877  * Do a VF FLR and reinitialize the ASIC.
3878  * Returns 0 if it succeeded, otherwise an error code.
3879  */
3880 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
3881                                      bool from_hypervisor)
3882 {
3883         int r;
3884
3885         if (from_hypervisor)
3886                 r = amdgpu_virt_request_full_gpu(adev, true);
3887         else
3888                 r = amdgpu_virt_reset_gpu(adev);
3889         if (r)
3890                 return r;
3891
3892         amdgpu_amdkfd_pre_reset(adev);
3893
3894         /* Resume IP prior to SMC */
3895         r = amdgpu_device_ip_reinit_early_sriov(adev);
3896         if (r)
3897                 goto error;
3898
3899         amdgpu_virt_init_data_exchange(adev);
3900         /* we need to recover the GART prior to running SMC/CP/SDMA resume */
3901         amdgpu_gtt_mgr_recover(&adev->mman.bdev.man[TTM_PL_TT]);
3902
3903         r = amdgpu_device_fw_loading(adev);
3904         if (r)
3905                 return r;
3906
3907         /* now we are okay to resume SMC/CP/SDMA */
3908         r = amdgpu_device_ip_reinit_late_sriov(adev);
3909         if (r)
3910                 goto error;
3911
3912         amdgpu_irq_gpu_reset_resume_helper(adev);
3913         r = amdgpu_ib_ring_tests(adev);
3914         amdgpu_amdkfd_post_reset(adev);
3915
3916 error:
3917         amdgpu_virt_release_full_gpu(adev, true);
3918         if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
3919                 amdgpu_inc_vram_lost(adev);
3920                 r = amdgpu_device_recover_vram(adev);
3921         }
3922
3923         return r;
3924 }
3925
3926 /**
3927  * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
3928  *
3929  * @adev: amdgpu device pointer
3930  *
3931  * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
3932  * a hung GPU.
3933  */
3934 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
3935 {
3936         if (!amdgpu_device_ip_check_soft_reset(adev)) {
3937                 DRM_INFO("Timeout, but no hardware hang detected.\n");
3938                 return false;
3939         }
3940
3941         if (amdgpu_gpu_recovery == 0)
3942                 goto disabled;
3943
3944         if (amdgpu_sriov_vf(adev))
3945                 return true;
3946
3947         if (amdgpu_gpu_recovery == -1) {
3948                 switch (adev->asic_type) {
3949                 case CHIP_BONAIRE:
3950                 case CHIP_HAWAII:
3951                 case CHIP_TOPAZ:
3952                 case CHIP_TONGA:
3953                 case CHIP_FIJI:
3954                 case CHIP_POLARIS10:
3955                 case CHIP_POLARIS11:
3956                 case CHIP_POLARIS12:
3957                 case CHIP_VEGAM:
3958                 case CHIP_VEGA20:
3959                 case CHIP_VEGA10:
3960                 case CHIP_VEGA12:
3961                 case CHIP_RAVEN:
3962                 case CHIP_ARCTURUS:
3963                 case CHIP_RENOIR:
3964                 case CHIP_NAVI10:
3965                 case CHIP_NAVI14:
3966                 case CHIP_NAVI12:
3967                 case CHIP_SIENNA_CICHLID:
3968                         break;
3969                 default:
3970                         goto disabled;
3971                 }
3972         }
3973
3974         return true;
3975
3976 disabled:
3977         DRM_INFO("GPU recovery disabled.\n");
3978         return false;
3979 }
3980
3981
3982 static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
3983                                         struct amdgpu_job *job,
3984                                         bool *need_full_reset_arg)
3985 {
3986         int i, r = 0;
3987         bool need_full_reset  = *need_full_reset_arg;
3988
3989         amdgpu_debugfs_wait_dump(adev);
3990
3991         /* block all schedulers and reset given job's ring */
3992         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
3993                 struct amdgpu_ring *ring = adev->rings[i];
3994
3995                 if (!ring || !ring->sched.thread)
3996                         continue;
3997
3998                 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
3999                 amdgpu_fence_driver_force_completion(ring);
4000         }
4001
4002         if (job)
4003                 drm_sched_increase_karma(&job->base);
4004
4005         /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4006         if (!amdgpu_sriov_vf(adev)) {
4007
4008                 if (!need_full_reset)
4009                         need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4010
4011                 if (!need_full_reset) {
4012                         amdgpu_device_ip_pre_soft_reset(adev);
4013                         r = amdgpu_device_ip_soft_reset(adev);
4014                         amdgpu_device_ip_post_soft_reset(adev);
4015                         if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4016                                 DRM_INFO("soft reset failed, will fallback to full reset!\n");
4017                                 need_full_reset = true;
4018                         }
4019                 }
4020
4021                 if (need_full_reset)
4022                         r = amdgpu_device_ip_suspend(adev);
4023
4024                 *need_full_reset_arg = need_full_reset;
4025         }
4026
4027         return r;
4028 }
4029
4030 static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
4031                                struct list_head *device_list_handle,
4032                                bool *need_full_reset_arg)
4033 {
4034         struct amdgpu_device *tmp_adev = NULL;
4035         bool need_full_reset = *need_full_reset_arg, vram_lost = false;
4036         int r = 0;
4037
4038         /*
4039          * ASIC reset has to be done on all XGMI hive nodes ASAP
4040          * to allow proper links negotiation in FW (within 1 sec)
4041          */
4042         if (need_full_reset) {
4043                 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4044                         /* For XGMI run all resets in parallel to speed up the process */
4045                         if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4046                                 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
4047                                         r = -EALREADY;
4048                         } else
4049                                 r = amdgpu_asic_reset(tmp_adev);
4050
4051                         if (r) {
4052                                 DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
4053                                          r, tmp_adev->ddev->unique);
4054                                 break;
4055                         }
4056                 }
4057
4058                 /* For XGMI wait for all resets to complete before proceed */
4059                 if (!r) {
4060                         list_for_each_entry(tmp_adev, device_list_handle,
4061                                             gmc.xgmi.head) {
4062                                 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4063                                         flush_work(&tmp_adev->xgmi_reset_work);
4064                                         r = tmp_adev->asic_reset_res;
4065                                         if (r)
4066                                                 break;
4067                                 }
4068                         }
4069                 }
4070         }
4071
4072         if (!r && amdgpu_ras_intr_triggered()) {
4073                 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4074                         if (tmp_adev->mmhub.funcs &&
4075                             tmp_adev->mmhub.funcs->reset_ras_error_count)
4076                                 tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev);
4077                 }
4078
4079                 amdgpu_ras_intr_cleared();
4080         }
4081
4082         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4083                 if (need_full_reset) {
4084                         /* post card */
4085                         if (amdgpu_atom_asic_init(tmp_adev->mode_info.atom_context)) {
4086                                 dev_warn(tmp_adev->dev, "asic atom init failed!");
4087                                 r = -EAGAIN;
4088                                 goto out;
4089                         }
4090
4091                         if (!r) {
4092                                 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4093                                 r = amdgpu_device_ip_resume_phase1(tmp_adev);
4094                                 if (r)
4095                                         goto out;
4096
4097                                 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4098                                 if (vram_lost) {
4099                                         DRM_INFO("VRAM is lost due to GPU reset!\n");
4100                                         amdgpu_inc_vram_lost(tmp_adev);
4101                                 }
4102
4103                                 r = amdgpu_gtt_mgr_recover(
4104                                         &tmp_adev->mman.bdev.man[TTM_PL_TT]);
4105                                 if (r)
4106                                         goto out;
4107
4108                                 r = amdgpu_device_fw_loading(tmp_adev);
4109                                 if (r)
4110                                         return r;
4111
4112                                 r = amdgpu_device_ip_resume_phase2(tmp_adev);
4113                                 if (r)
4114                                         goto out;
4115
4116                                 if (vram_lost)
4117                                         amdgpu_device_fill_reset_magic(tmp_adev);
4118
4119                                 /*
4120                                  * Add this ASIC as tracked, since the reset has
4121                                  * already completed successfully.
4122                                  */
4123                                 amdgpu_register_gpu_instance(tmp_adev);
4124
4125                                 r = amdgpu_device_ip_late_init(tmp_adev);
4126                                 if (r)
4127                                         goto out;
4128
4129                                 amdgpu_fbdev_set_suspend(tmp_adev, 0);
4130
4131                                 /*
4132                                  * The GPU enters a bad state once the number of
4133                                  * faulty pages reported by ECC reaches the
4134                                  * threshold, and RAS recovery is scheduled next.
4135                                  * So add a check here to break recovery if the
4136                                  * bad page threshold has indeed been exceeded,
4137                                  * and remind the user to retire this GPU or set
4138                                  * a bigger bad_page_threshold value to fix this
4139                                  * the next time the driver is probed.
4140                                  */
4141                                 if (!amdgpu_ras_check_err_threshold(tmp_adev)) {
4142                                         /* must succeed. */
4143                                         amdgpu_ras_resume(tmp_adev);
4144                                 } else {
4145                                         r = -EINVAL;
4146                                         goto out;
4147                                 }
4148
4149                                 /* Update PSP FW topology after reset */
4150                                 if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4151                                         r = amdgpu_xgmi_update_topology(hive, tmp_adev);
4152                         }
4153                 }
4154
4155 out:
4156                 if (!r) {
4157                         amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4158                         r = amdgpu_ib_ring_tests(tmp_adev);
4159                         if (r) {
4160                                 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
4161                                 r = amdgpu_device_ip_suspend(tmp_adev);
4162                                 need_full_reset = true;
4163                                 r = -EAGAIN;
4164                                 goto end;
4165                         }
4166                 }
4167
4168                 if (!r)
4169                         r = amdgpu_device_recover_vram(tmp_adev);
4170                 else
4171                         tmp_adev->asic_reset_res = r;
4172         }
4173
4174 end:
4175         *need_full_reset_arg = need_full_reset;
4176         return r;
4177 }
4178
4179 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
4180 {
4181         if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
4182                 return false;
4183
4184         if (hive) {
4185                 down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
4186         } else {
4187                 down_write(&adev->reset_sem);
4188         }
4189
4190         atomic_inc(&adev->gpu_reset_counter);
4191         switch (amdgpu_asic_reset_method(adev)) {
4192         case AMD_RESET_METHOD_MODE1:
4193                 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
4194                 break;
4195         case AMD_RESET_METHOD_MODE2:
4196                 adev->mp1_state = PP_MP1_STATE_RESET;
4197                 break;
4198         default:
4199                 adev->mp1_state = PP_MP1_STATE_NONE;
4200                 break;
4201         }
4202
4203         return true;
4204 }
4205
4206 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
4207 {
4208         amdgpu_vf_error_trans_all(adev);
4209         adev->mp1_state = PP_MP1_STATE_NONE;
4210         atomic_set(&adev->in_gpu_reset, 0);
4211         up_write(&adev->reset_sem);
4212 }
4213
4214 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
4215 {
4216         struct pci_dev *p = NULL;
4217
4218         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4219                         adev->pdev->bus->number, 1);
4220         if (p) {
4221                 pm_runtime_enable(&(p->dev));
4222                 pm_runtime_resume(&(p->dev));
4223         }
4224 }
4225
4226 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
4227 {
4228         enum amd_reset_method reset_method;
4229         struct pci_dev *p = NULL;
4230         u64 expires;
4231
4232         /*
4233          * For now, only BACO and mode1 reset are confirmed to suffer
4234          * the audio issue if the audio device is not properly suspended.
4235          */
4236         reset_method = amdgpu_asic_reset_method(adev);
4237         if ((reset_method != AMD_RESET_METHOD_BACO) &&
4238              (reset_method != AMD_RESET_METHOD_MODE1))
4239                 return -EINVAL;
4240
4241         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4242                         adev->pdev->bus->number, 1);
4243         if (!p)
4244                 return -ENODEV;
4245
4246         expires = pm_runtime_autosuspend_expiration(&(p->dev));
4247         if (!expires)
4248                 /*
4249                  * If we cannot get the audio device autosuspend delay,
4250                  * a fixed 4s interval is used. Since 3s is the audio
4251                  * controller's default autosuspend delay setting, the 4s
4252                  * used here is guaranteed to cover it.
4253                  */
4254                 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
4255
4256         while (!pm_runtime_status_suspended(&(p->dev))) {
4257                 if (!pm_runtime_suspend(&(p->dev)))
4258                         break;
4259
4260                 if (expires < ktime_get_mono_fast_ns()) {
4261                         dev_warn(adev->dev, "failed to suspend display audio\n");
4262                         /* TODO: abort the succeeding gpu reset? */
4263                         return -ETIMEDOUT;
4264                 }
4265         }
4266
4267         pm_runtime_disable(&(p->dev));
4268
4269         return 0;
4270 }
4271
4272 /**
4273  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
4274  *
4275  * @adev: amdgpu device pointer
4276  * @job: which job triggered the hang
4277  *
4278  * Attempts to reset the GPU if it has hung (all ASICs).
4279  * Attempts either a soft reset or a full reset and reinitializes the ASIC.
4280  * Returns 0 for success or an error on failure.
4281  */
4282
4283 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
4284                               struct amdgpu_job *job)
4285 {
4286         struct list_head device_list, *device_list_handle =  NULL;
4287         bool need_full_reset = false;
4288         bool job_signaled = false;
4289         struct amdgpu_hive_info *hive = NULL;
4290         struct amdgpu_device *tmp_adev = NULL;
4291         int i, r = 0;
4292         bool need_emergency_restart = false;
4293         bool audio_suspended = false;
4294
4295         /*
4296          * Special case: RAS triggered and full reset isn't supported
4297          */
4298         need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
4299
4300         /*
4301          * Flush RAM to disk so that after reboot
4302          * the user can read log and see why the system rebooted.
4303          */
4304         if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
4305                 DRM_WARN("Emergency reboot.");
4306
4307                 ksys_sync_helper();
4308                 emergency_restart();
4309         }
4310
4311         dev_info(adev->dev, "GPU %s begin!\n",
4312                 need_emergency_restart ? "jobs stop" : "reset");
4313
4314         /*
4315          * Here we trylock to avoid a chain of resets being triggered
4316          * either by jobs on different adevs in an XGMI hive or by jobs on
4317          * different schedulers for the same device while this TO handler is running.
4318          * We always reset all schedulers for a device and all devices in an XGMI
4319          * hive, so that should take care of them too.
4320          */
4321         hive = amdgpu_get_xgmi_hive(adev, false);
4322         if (hive) {
4323                 if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
4324                         DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
4325                                 job ? job->base.id : -1, hive->hive_id);
4326                         return 0;
4327                 }
4328                 mutex_lock(&hive->hive_lock);
4329         }
4330
4331         /*
4332          * Build list of devices to reset.
4333          * In case we are in XGMI hive mode, resort the device list
4334          * to put adev in the 1st position.
4335          */
4336         INIT_LIST_HEAD(&device_list);
4337         if (adev->gmc.xgmi.num_physical_nodes > 1) {
4338                 if (!hive)
4339                         return -ENODEV;
4340                 if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
4341                         list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
4342                 device_list_handle = &hive->device_list;
4343         } else {
4344                 list_add_tail(&adev->gmc.xgmi.head, &device_list);
4345                 device_list_handle = &device_list;
4346         }
4347
4348         /* block all schedulers and reset given job's ring */
4349         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4350                 if (!amdgpu_device_lock_adev(tmp_adev, hive)) {
4351                         DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress",
4352                                   job ? job->base.id : -1);
4353                         r = 0;
4354                         goto skip_recovery;
4355                 }
4356
4357                 /*
4358                  * Try to put the audio codec into suspend state
4359                  * before the gpu reset starts.
4360                  *
4361                  * Because the power domain of the graphics device is
4362                  * shared with the AZ power domain, without this we
4363                  * may change the audio hardware from behind the
4364                  * audio driver's back and trigger audio codec
4365                  * errors.
4366                  */
4367                 if (!amdgpu_device_suspend_display_audio(tmp_adev))
4368                         audio_suspended = true;
4369
4370                 amdgpu_ras_set_error_query_ready(tmp_adev, false);
4371
4372                 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
4373
4374                 if (!amdgpu_sriov_vf(tmp_adev))
4375                         amdgpu_amdkfd_pre_reset(tmp_adev);
4376
4377                 /*
4378                  * Mark the ASICs to be reset as untracked first,
4379                  * and add them back after the reset completes.
4380                  */
4381                 amdgpu_unregister_gpu_instance(tmp_adev);
4382
4383                 amdgpu_fbdev_set_suspend(tmp_adev, 1);
4384
4385                 /* disable ras on ALL IPs */
4386                 if (!need_emergency_restart &&
4387                       amdgpu_device_ip_need_full_reset(tmp_adev))
4388                         amdgpu_ras_suspend(tmp_adev);
4389
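                     /* Stop every scheduler so no new jobs reach the hardware during the reset. */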
4390                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4391                         struct amdgpu_ring *ring = tmp_adev->rings[i];
4392
4393                         if (!ring || !ring->sched.thread)
4394                                 continue;
4395
4396                         drm_sched_stop(&ring->sched, job ? &job->base : NULL);
4397
4398                         if (need_emergency_restart)
4399                                 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
4400                 }
4401         }
4402
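             /*
              * For a RAS-triggered emergency restart there is nothing to recover;
              * skip the HW reset and scheduler resume and go straight to cleanup.
              */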
4403         if (need_emergency_restart)
4404                 goto skip_sched_resume;
4405
4406         /*
4407          * Must check guilty signal here since after this point all old
4408          * HW fences are force signaled.
4409          *
4410          * job->base holds a reference to parent fence
4411          */
4412         if (job && job->base.s_fence->parent &&
4413             dma_fence_is_signaled(job->base.s_fence->parent)) {
4414                 job_signaled = true;
4415                 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset\n");
4416                 goto skip_hw_reset;
4417         }
4418
4419 retry:  /* Rest of adevs pre asic reset from XGMI hive. */
4420         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4421                 r = amdgpu_device_pre_asic_reset(tmp_adev,
4422                                                  NULL,
4423                                                  &need_full_reset);
4424                 /* TODO: Should we stop? */
4425                 if (r) {
4426                         DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
4427                                   r, tmp_adev->ddev->unique);
4428                         tmp_adev->asic_reset_res = r;
4429                 }
4430         }
4431
4432         /* Actual ASIC resets if needed. */
4433         /* TODO Implement XGMI hive reset logic for SRIOV */
4434         if (amdgpu_sriov_vf(adev)) {
4435                 r = amdgpu_device_reset_sriov(adev, job ? false : true);
4436                 if (r)
4437                         adev->asic_reset_res = r;
4438         } else {
4439                 r  = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset);
4440                 if (r == -EAGAIN)
4441                         goto retry;
4442         }
4443
4444 skip_hw_reset:
4445
4446         /* Post ASIC reset for all devs. */
4447         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4448
4449                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4450                         struct amdgpu_ring *ring = tmp_adev->rings[i];
4451
4452                         if (!ring || !ring->sched.thread)
4453                                 continue;
4454
4455                         /* No point in resubmitting jobs if we didn't do a HW reset */
4456                         if (!tmp_adev->asic_reset_res && !job_signaled)
4457                                 drm_sched_resubmit_jobs(&ring->sched);
4458
4459                         drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
4460                 }
4461
4462                 if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
4463                         drm_helper_resume_force_mode(tmp_adev->ddev);
4464                 }
4465
4466                 tmp_adev->asic_reset_res = 0;
4467
4468                 if (r) {
4469                         /* bad news, how to tell it to userspace ? */
4470                         dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
4471                         amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
4472                 } else {
4473                         dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
4474                 }
4475         }
4476
4477 skip_sched_resume:
4478         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4479                 /* unlock kfd: SRIOV would do it separately */
4480                 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
4481                         amdgpu_amdkfd_post_reset(tmp_adev);
4482                 if (audio_suspended)
4483                         amdgpu_device_resume_display_audio(tmp_adev);
4484                 amdgpu_device_unlock_adev(tmp_adev);
4485         }
4486
4487 skip_recovery:
4488         if (hive) {
4489                 atomic_set(&hive->in_reset, 0);
4490                 mutex_unlock(&hive->hive_lock);
4491         }
4492
4493         if (r)
4494                 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
4495         return r;
4496 }
4497
4498 /**
4499  * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
4500  *
4501  * @adev: amdgpu_device pointer
4502  *
4503  * Fetches and stores in the driver the PCIe capabilities (gen speed
4504  * and lanes) of the slot the device is in. Handles APUs and
4505  * virtualized environments where PCIe config space may not be available.
4506  */
4507 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
4508 {
4509         struct pci_dev *pdev;
4510         enum pci_bus_speed speed_cap, platform_speed_cap;
4511         enum pcie_link_width platform_link_width;
4512
4513         if (amdgpu_pcie_gen_cap)
4514                 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
4515
4516         if (amdgpu_pcie_lane_cap)
4517                 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
4518
4519         /* covers APUs as well */
4520         if (pci_is_root_bus(adev->pdev->bus)) {
4521                 if (adev->pm.pcie_gen_mask == 0)
4522                         adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
4523                 if (adev->pm.pcie_mlw_mask == 0)
4524                         adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
4525                 return;
4526         }
4527
4528         if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
4529                 return;
4530
4531         pcie_bandwidth_available(adev->pdev, NULL,
4532                                  &platform_speed_cap, &platform_link_width);
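             /*
              * platform_speed_cap/platform_link_width describe the most limiting
              * link between the device and the root port, i.e. what the platform
              * can actually sustain, as opposed to the asic's own capability below.
              */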
4533
4534         if (adev->pm.pcie_gen_mask == 0) {
4535                 /* asic caps */
4536                 pdev = adev->pdev;
4537                 speed_cap = pcie_get_speed_cap(pdev);
4538                 if (speed_cap == PCI_SPEED_UNKNOWN) {
4539                         adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4540                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4541                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4542                 } else {
4543                         if (speed_cap == PCIE_SPEED_16_0GT)
4544                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4545                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4546                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4547                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
4548                         else if (speed_cap == PCIE_SPEED_8_0GT)
4549                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4550                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4551                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4552                         else if (speed_cap == PCIE_SPEED_5_0GT)
4553                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4554                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
4555                         else
4556                                 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
4557                 }
4558                 /* platform caps */
4559                 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
4560                         adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4561                                                    CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4562                 } else {
4563                         if (platform_speed_cap == PCIE_SPEED_16_0GT)
4564                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4565                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4566                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4567                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
4568                         else if (platform_speed_cap == PCIE_SPEED_8_0GT)
4569                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4570                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4571                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
4572                         else if (platform_speed_cap == PCIE_SPEED_5_0GT)
4573                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4574                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4575                         else
4576                                 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
4577
4578                 }
4579         }
4580         if (adev->pm.pcie_mlw_mask == 0) {
4581                 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
4582                         adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
4583                 } else {
4584                         switch (platform_link_width) {
4585                         case PCIE_LNK_X32:
4586                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
4587                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4588                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4589                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4590                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4591                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4592                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4593                                 break;
4594                         case PCIE_LNK_X16:
4595                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4596                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4597                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4598                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4599                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4600                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4601                                 break;
4602                         case PCIE_LNK_X12:
4603                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4604                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4605                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4606                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4607                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4608                                 break;
4609                         case PCIE_LNK_X8:
4610                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4611                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4612                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4613                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4614                                 break;
4615                         case PCIE_LNK_X4:
4616                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4617                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4618                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4619                                 break;
4620                         case PCIE_LNK_X2:
4621                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4622                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4623                                 break;
4624                         case PCIE_LNK_X1:
4625                                 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
4626                                 break;
4627                         default:
4628                                 break;
4629                         }
4630                 }
4631         }
4632 }
4633
4634 int amdgpu_device_baco_enter(struct drm_device *dev)
4635 {
4636         struct amdgpu_device *adev = dev->dev_private;
4637         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4638
4639         if (!amdgpu_device_supports_baco(adev->ddev))
4640                 return -ENOTSUPP;
4641
4642         if (ras && ras->supported)
4643                 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
4644
4645         return amdgpu_dpm_baco_enter(adev);
4646 }
4647
4648 int amdgpu_device_baco_exit(struct drm_device *dev)
4649 {
4650         struct amdgpu_device *adev = dev->dev_private;
4651         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4652         int ret = 0;
4653
4654         if (!amdgpu_device_supports_baco(adev->ddev))
4655                 return -ENOTSUPP;
4656
4657         ret = amdgpu_dpm_baco_exit(adev);
4658         if (ret)
4659                 return ret;
4660
4661         if (ras && ras->supported)
4662                 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
4663
4664         return 0;
4665 }