drm/amdgpu: enable watchdog feature for SQ of aldebaran
authorDennis Li <Dennis.Li@amd.com>
Fri, 5 Mar 2021 21:30:54 +0000 (16:30 -0500)
committerAlex Deucher <alexander.deucher@amd.com>
Wed, 24 Mar 2021 02:59:52 +0000 (22:59 -0400)
SQ's watchdog timer monitors forward progress. A mask of which waves
caused the watchdog timeout is recorded into RAS status registers, which
then triggers a system fatal error event.

v2:
1. change *query_timeout_status to *query_sq_timeout_status.
2. move query_sq_timeout_status into amdgpu_ras_do_recovery.
3. add module parameters to enable/disable fatal error event and modify
the watchdog timer.

v3:
1. remove unused parameters of *enable_watchdog_timer

Signed-off-by: Dennis Li <Dennis.Li@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu.h
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h
drivers/gpu/drm/amd/amdgpu/soc15_common.h

index bdaaba42bda40c3f201279de00d7ee569a2712ec..951a2a19c19e30e6efc772c20f936c01f7ae9aa8 100644 (file)
@@ -126,6 +126,12 @@ struct amdgpu_mgpu_info
        uint32_t                        num_apu;
 };
 
+/* Configuration for the SQ (shader sequencer) watchdog timer, settable via
+ * the timeout_fatal_disable/timeout_period module parameters.
+ */
+struct amdgpu_watchdog_timer
+{
+       bool timeout_fatal_disable; /* true = do not raise a fatal error event on timeout */
+       uint32_t period; /* maxCycles = (1 << period), the number of cycles before a timeout */
+};
+
 #define AMDGPU_MAX_TIMEOUT_PARAM_LENGTH        256
 
 /*
@@ -187,6 +193,7 @@ extern struct amdgpu_mgpu_info mgpu_info;
 extern int amdgpu_ras_enable;
 extern uint amdgpu_ras_mask;
 extern int amdgpu_bad_page_threshold;
+extern struct amdgpu_watchdog_timer amdgpu_watchdog_timer;
 extern int amdgpu_async_gfx_ring;
 extern int amdgpu_mcbp;
 extern int amdgpu_discovery;
index 5179d5f032ee5fb185a94d4345627159f29e4ac4..e39d81b68169338a3d2e9387888f3bb79a319c1b 100644 (file)
@@ -175,6 +175,10 @@ struct amdgpu_mgpu_info mgpu_info = {
 int amdgpu_ras_enable = -1;
 uint amdgpu_ras_mask = 0xffffffff;
 int amdgpu_bad_page_threshold = 100;
+/* Defaults for the SQ watchdog module parameters declared below. */
+struct amdgpu_watchdog_timer amdgpu_watchdog_timer = {
+       .timeout_fatal_disable = false,
+       .period = 0x3f, /* about 8s */
+       /* NOTE(review): 1 << 0x3f cycles is far more than 8s at GHz-range
+        * clocks — confirm whether the period value or the "about 8s"
+        * claim is the intended default.
+        */
+};
 
 /**
  * DOC: vramlimit (int)
@@ -530,6 +534,20 @@ module_param_named(ras_enable, amdgpu_ras_enable, int, 0444);
 MODULE_PARM_DESC(ras_mask, "Mask of RAS features to enable (default 0xffffffff), only valid when ras_enable == 1");
 module_param_named(ras_mask, amdgpu_ras_mask, uint, 0444);
 
+/**
+ * DOC: timeout_fatal_disable (bool)
+ * Disable Watchdog timeout fatal error event
+ */
+MODULE_PARM_DESC(timeout_fatal_disable, "disable watchdog timeout fatal error (false = default)");
+module_param_named(timeout_fatal_disable, amdgpu_watchdog_timer.timeout_fatal_disable, bool, 0644);
+
+/**
+ * DOC: timeout_period (uint)
+ * Modify the watchdog timeout max_cycles as (1 << period)
+ */
+/* The description below previously said "0x1F = default", which contradicted
+ * the actual default (.period = 0x3f) set in amdgpu_watchdog_timer above.
+ */
+MODULE_PARM_DESC(timeout_period, "watchdog timeout period (0x3f = default), timeout maxCycles = (1 << period)");
+module_param_named(timeout_period, amdgpu_watchdog_timer.period, uint, 0644);
+
 /**
  * DOC: si_support (int)
  * Set SI support driver. This parameter works after set config CONFIG_DRM_AMDGPU_SI. For SI asic, when radeon driver is enabled,
index 1ab9632282d4695441f6f058b1a49cbc0fb2e1ac..d92f0f14cbebc5bf488101b25169f72d1c9bc160 100644 (file)
@@ -226,6 +226,8 @@ struct amdgpu_gfx_funcs {
        void (*init_spm_golden)(struct amdgpu_device *adev);
        void (*query_ras_error_status) (struct amdgpu_device *adev);
        void (*update_perfmon_mgcg)(struct amdgpu_device *adev, bool enable);
+       void (*enable_watchdog_timer)(struct amdgpu_device *adev);
+       void (*query_sq_timeout_status)(struct amdgpu_device *adev);
 };
 
 struct sq_work {
index c669435ccc747b85c46de9ad500906ed5036ad9a..c1516d871881ba7679fccc8e79a62c6777b74dd6 100644 (file)
@@ -1467,6 +1467,9 @@ static void amdgpu_ras_error_status_query(struct amdgpu_device *adev,
        case AMDGPU_RAS_BLOCK__GFX:
                if (adev->gfx.funcs->query_ras_error_status)
                        adev->gfx.funcs->query_ras_error_status(adev);
+
+               if (adev->gfx.funcs->query_sq_timeout_status)
+                       adev->gfx.funcs->query_sq_timeout_status(adev);
                break;
        case AMDGPU_RAS_BLOCK__MMHUB:
                if (adev->mmhub.funcs->query_ras_error_status)
index 6af4ecf8e9f0cefc413f5293d240826a58b36ebc..8b6ba1594f413a9ae7f8c740b80af4cb267c776b 100644 (file)
@@ -2124,6 +2124,8 @@ static const struct amdgpu_gfx_funcs gfx_v9_4_2_gfx_funcs = {
        .query_ras_error_count = &gfx_v9_4_2_query_ras_error_count,
        .reset_ras_error_count = &gfx_v9_4_2_reset_ras_error_count,
        .query_ras_error_status = &gfx_v9_4_2_query_ras_error_status,
+       .enable_watchdog_timer = &gfx_v9_4_2_enable_watchdog_timer,
+       .query_sq_timeout_status = &gfx_v9_4_2_query_sq_timeout_status,
 };
 
 static int gfx_v9_0_gpu_early_init(struct amdgpu_device *adev)
@@ -3968,6 +3970,9 @@ static int gfx_v9_0_hw_init(void *handle)
        if (adev->asic_type == CHIP_ALDEBARAN)
                gfx_v9_4_2_set_power_brake_sequence(adev);
 
+       if (adev->gfx.funcs->enable_watchdog_timer)
+               adev->gfx.funcs->enable_watchdog_timer(adev);
+
        return r;
 }
 
index b2e2026c3ec74ced702355a88c95abf57e97cea4..1faeae14ead90a6667d0db04fafbcf15e43a0110 100644 (file)
@@ -1129,3 +1129,109 @@ void gfx_v9_4_2_query_ras_error_status(struct amdgpu_device *adev)
        gfx_v9_4_2_query_ea_err_status(adev);
        gfx_v9_4_2_query_utc_err_status(adev);
 }
+
+/* Program the SQ watchdog on every shader engine.  The timeout period and
+ * whether a timeout raises a fatal error event come from the
+ * amdgpu_watchdog_timer module parameters.
+ */
+void gfx_v9_4_2_enable_watchdog_timer(struct amdgpu_device *adev)
+{
+       uint32_t i;
+       uint32_t data;
+
+       /* Build the SQ_TIMEOUT_CONFIG value from the module parameters. */
+       data = REG_SET_FIELD(0, SQ_TIMEOUT_CONFIG, TIMEOUT_FATAL_DISABLE,
+                            amdgpu_watchdog_timer.timeout_fatal_disable ? 1 :
+                                                                          0);
+       data = REG_SET_FIELD(data, SQ_TIMEOUT_CONFIG, PERIOD_SEL,
+                            amdgpu_watchdog_timer.period);
+
+       mutex_lock(&adev->grbm_idx_mutex);
+       for (i = 0; i < adev->gfx.config.max_shader_engines; i++) {
+               /* Select SE i with SH/CU broadcast, then write the config. */
+               gfx_v9_4_2_select_se_sh(adev, i, 0xffffffff, 0xffffffff);
+               WREG32_SOC15(GC, 0, regSQ_TIMEOUT_CONFIG, data);
+       }
+       /* Restore full broadcast selection before releasing the mutex. */
+       gfx_v9_4_2_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff);
+       mutex_unlock(&adev->grbm_idx_mutex);
+}
+
+/* Read one wave-indexed SQ register: program (wave, simd, address) into
+ * SQ_IND_INDEX (with FORCE_READ set) and fetch the result from SQ_IND_DATA.
+ * Callers in this file hold grbm_idx_mutex with the target SE/SH/CU already
+ * selected, so the index/data register pair is not raced.
+ */
+static uint32_t wave_read_ind(struct amdgpu_device *adev, uint32_t simd, uint32_t wave, uint32_t address)
+{
+       WREG32_SOC15_RLC_EX(reg, GC, 0, regSQ_IND_INDEX,
+               (wave << SQ_IND_INDEX__WAVE_ID__SHIFT) |
+               (simd << SQ_IND_INDEX__SIMD_ID__SHIFT) |
+               (address << SQ_IND_INDEX__INDEX__SHIFT) |
+               (SQ_IND_INDEX__FORCE_READ_MASK));
+       return RREG32_SOC15(GC, 0, regSQ_IND_DATA);
+}
+
+/* Decode @status — a bitmask of timed-out waves on the currently selected
+ * CU — and dump each offending wave's context (status, PC, EXEC mask,
+ * current instruction words, IB state) through the SQ indexed-register
+ * interface.  Caller holds grbm_idx_mutex with the CU selected.
+ */
+static void gfx_v9_4_2_log_cu_timeout_status(struct amdgpu_device *adev,
+                                       uint32_t status)
+{
+       struct amdgpu_cu_info *cu_info = &adev->gfx.cu_info;
+       uint32_t i, simd, wave;
+       uint32_t wave_status;
+       uint32_t wave_pc_lo, wave_pc_hi;
+       uint32_t wave_exec_lo, wave_exec_hi;
+       uint32_t wave_inst_dw0, wave_inst_dw1;
+       uint32_t wave_ib_sts;
+
+       for (i = 0; i < 32; i++) {
+               /* Bit i of @status marks wave index i as timed out.  The
+                * previous test, (i << 1) & status, computed the value 2*i
+                * rather than the bit mask, so wave 0 was never logged and
+                * multi-bit values matched the wrong waves.  Use an unsigned
+                * shift so i == 31 does not shift into the sign bit.
+                */
+               if (!((1U << i) & status))
+                       continue;
+
+               /* Map the flat wave index onto (simd, wave) coordinates. */
+               simd = i / cu_info->max_waves_per_simd;
+               wave = i % cu_info->max_waves_per_simd;
+
+               wave_status = wave_read_ind(adev, simd, wave, ixSQ_WAVE_STATUS);
+               wave_pc_lo = wave_read_ind(adev, simd, wave, ixSQ_WAVE_PC_LO);
+               wave_pc_hi = wave_read_ind(adev, simd, wave, ixSQ_WAVE_PC_HI);
+               wave_exec_lo =
+                       wave_read_ind(adev, simd, wave, ixSQ_WAVE_EXEC_LO);
+               wave_exec_hi =
+                       wave_read_ind(adev, simd, wave, ixSQ_WAVE_EXEC_HI);
+               wave_inst_dw0 =
+                       wave_read_ind(adev, simd, wave, ixSQ_WAVE_INST_DW0);
+               wave_inst_dw1 =
+                       wave_read_ind(adev, simd, wave, ixSQ_WAVE_INST_DW1);
+               wave_ib_sts = wave_read_ind(adev, simd, wave, ixSQ_WAVE_IB_STS);
+
+               /* Combine the hi/lo register halves into 64-bit values. */
+               dev_info(
+                       adev->dev,
+                       "\t SIMD %d, Wave %d: status 0x%x, pc 0x%llx, exec 0x%llx, inst 0x%llx, ib_sts 0x%x\n",
+                       simd, wave, wave_status,
+                       ((uint64_t)wave_pc_hi << 32 | wave_pc_lo),
+                       ((uint64_t)wave_exec_hi << 32 | wave_exec_lo),
+                       ((uint64_t)wave_inst_dw1 << 32 | wave_inst_dw0),
+                       wave_ib_sts);
+       }
+}
+
+/* Scan SQ_TIMEOUT_STATUS on every CU of every SH/SE.  For each CU that
+ * reports a watchdog timeout, log the offending waves, then clear the
+ * status register so stale bits do not re-report on the next query.
+ */
+void gfx_v9_4_2_query_sq_timeout_status(struct amdgpu_device *adev)
+{
+       uint32_t se_idx, sh_idx, cu_idx;
+       uint32_t status;
+
+       mutex_lock(&adev->grbm_idx_mutex);
+       for (se_idx = 0; se_idx < adev->gfx.config.max_shader_engines;
+            se_idx++) {
+               for (sh_idx = 0; sh_idx < adev->gfx.config.max_sh_per_se;
+                    sh_idx++) {
+                       for (cu_idx = 0;
+                            cu_idx < adev->gfx.config.max_cu_per_sh;
+                            cu_idx++) {
+                               /* Target one specific SE/SH/CU at a time. */
+                               gfx_v9_4_2_select_se_sh(adev, se_idx, sh_idx,
+                                                       cu_idx);
+                               status = RREG32_SOC15(GC, 0,
+                                                     regSQ_TIMEOUT_STATUS);
+                               if (status != 0) {
+                                       dev_info(
+                                               adev->dev,
+                                               "GFX Watchdog Timeout: SE %d, SH %d, CU %d\n",
+                                               se_idx, sh_idx, cu_idx);
+                                       gfx_v9_4_2_log_cu_timeout_status(
+                                               adev, status);
+                               }
+                               /* clear old status */
+                               WREG32_SOC15(GC, 0, regSQ_TIMEOUT_STATUS, 0);
+                       }
+               }
+       }
+       /* Restore full broadcast selection before releasing the mutex. */
+       gfx_v9_4_2_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff);
+       mutex_unlock(&adev->grbm_idx_mutex);
+}
\ No newline at end of file
index d7e3041947e8ef7432153de9570e8aa101faf321..e01fa6afa8e4f70a6a57ea2ed2cd4870a95dfa55 100644 (file)
@@ -35,4 +35,7 @@ int gfx_v9_4_2_ras_error_inject(struct amdgpu_device *adev, void *inject_if);
 void gfx_v9_4_2_query_ras_error_status(struct amdgpu_device *adev);
 int gfx_v9_4_2_query_ras_error_count(struct amdgpu_device *adev,
                                   void *ras_error_status);
+
+void gfx_v9_4_2_enable_watchdog_timer(struct amdgpu_device *adev);
+void gfx_v9_4_2_query_sq_timeout_status(struct amdgpu_device *adev);
 #endif /* __GFX_V9_4_2_H__ */
index 52ffbea63a4f6413ea09a6fc888fc18d9c6db7e9..8cdf5d1685cbbb789287fd17d37cacd0cd1120b5 100644 (file)
                }       \
        } while (0)
 
+/*
+ * Write @value to absolute register offset @reg.  Under SRIOV full access
+ * the write is routed through the RLC: the value and target offset are
+ * staged in <prefix>SCRATCH_REG0/REG1 (bit 31 of REG1 flags the request),
+ * the RLC is kicked via <prefix>RLC_SPARE_INT, and we poll up to
+ * 50000 x 10us for the RLC to acknowledge by clearing bit 31.  Without
+ * SRIOV full access this is a plain WREG32.  Comments cannot go inside
+ * the macro body without continuation backslashes, hence this header.
+ */
+#define WREG32_RLC_EX(prefix, reg, value) \
+       do {                                                    \
+               if (amdgpu_sriov_fullaccess(adev)) {    \
+                       uint32_t i = 0; \
+                       uint32_t retries = 50000;       \
+                       uint32_t r0 = adev->reg_offset[GC_HWIP][0][prefix##SCRATCH_REG0_BASE_IDX] + prefix##SCRATCH_REG0;       \
+                       uint32_t r1 = adev->reg_offset[GC_HWIP][0][prefix##SCRATCH_REG1_BASE_IDX] + prefix##SCRATCH_REG1;       \
+                       uint32_t spare_int = adev->reg_offset[GC_HWIP][0][prefix##RLC_SPARE_INT_BASE_IDX] + prefix##RLC_SPARE_INT;      \
+                       WREG32(r0, value);      \
+                       WREG32(r1, (reg | 0x80000000)); \
+                       WREG32(spare_int, 0x1); \
+                       for (i = 0; i < retries; i++) { \
+                               u32 tmp = RREG32(r1);   \
+                               if (!(tmp & 0x80000000))        \
+                                       break;  \
+                               udelay(10);     \
+                       }       \
+                       if (i >= retries)       \
+                               pr_err("timeout: rlcg program reg:0x%05x failed !\n", reg);     \
+               } else {        \
+                       WREG32(reg, value); \
+               }       \
+       } while (0)
+
 #define WREG32_SOC15_RLC_SHADOW(ip, inst, reg, value) \
        do {                                                    \
                uint32_t target_reg = adev->reg_offset[ip##_HWIP][inst][reg##_BASE_IDX] + reg;\
                        WREG32_RLC(target_reg, value); \
        } while (0)
 
+#define WREG32_SOC15_RLC_EX(prefix, ip, inst, reg, value) \
+       do {                                                    \
+                       uint32_t target_reg = adev->reg_offset[GC_HWIP][0][reg##_BASE_IDX] + reg;\
+                       WREG32_RLC_EX(prefix, target_reg, value); \
+       } while (0)
+
 #define WREG32_FIELD15_RLC(ip, idx, reg, field, val)   \
     WREG32_RLC((adev->reg_offset[ip##_HWIP][idx][mm##reg##_BASE_IDX] + mm##reg), \
     (RREG32(adev->reg_offset[ip##_HWIP][idx][mm##reg##_BASE_IDX] + mm##reg) \