drm/amdgpu: enable watchdog feature for SQ of aldebaran
authorDennis Li <Dennis.Li@amd.com>
Fri, 5 Mar 2021 21:30:54 +0000 (16:30 -0500)
committerAlex Deucher <alexander.deucher@amd.com>
Wed, 24 Mar 2021 02:59:52 +0000 (22:59 -0400)
SQ's watchdog timer monitors forward progress. A mask of which waves
caused the watchdog timeout is recorded into RAS status registers, which
then triggers a system fatal error event.

v2:
1. change *query_timeout_status to *query_sq_timeout_status.
2. move query_sq_timeout_status into amdgpu_ras_do_recovery.
3. add module parameters to enable/disable fatal error event and modify
the watchdog timer.

v3:
1. remove unused parameters of *enable_watchdog_timer

Signed-off-by: Dennis Li <Dennis.Li@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu.h
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h
drivers/gpu/drm/amd/amdgpu/soc15_common.h

index bdaaba42bda40c3f201279de00d7ee569a2712ec..951a2a19c19e30e6efc772c20f936c01f7ae9aa8 100644 (file)
@@ -126,6 +126,12 @@ struct amdgpu_mgpu_info
        uint32_t                        num_apu;
 };
 
+/* Configuration for the SQ (shader sequencer) watchdog timer, settable via
+ * the timeout_fatal_disable/timeout_period module parameters.
+ */
+struct amdgpu_watchdog_timer
+{
+       bool timeout_fatal_disable; /* true = do not raise a fatal error event on timeout */
+       uint32_t period; /* maxCycles = (1 << period), the number of cycles before a timeout */
+};
+
 #define AMDGPU_MAX_TIMEOUT_PARAM_LENGTH        256
 
 /*
@@ -187,6 +193,7 @@ extern struct amdgpu_mgpu_info mgpu_info;
 extern int amdgpu_ras_enable;
 extern uint amdgpu_ras_mask;
 extern int amdgpu_bad_page_threshold;
+extern struct amdgpu_watchdog_timer amdgpu_watchdog_timer;
 extern int amdgpu_async_gfx_ring;
 extern int amdgpu_mcbp;
 extern int amdgpu_discovery;
index 5179d5f032ee5fb185a94d4345627159f29e4ac4..e39d81b68169338a3d2e9387888f3bb79a319c1b 100644 (file)
@@ -175,6 +175,10 @@ struct amdgpu_mgpu_info mgpu_info = {
 int amdgpu_ras_enable = -1;
 uint amdgpu_ras_mask = 0xffffffff;
 int amdgpu_bad_page_threshold = 100;
+/* Defaults for the SQ watchdog module parameters declared below. */
+struct amdgpu_watchdog_timer amdgpu_watchdog_timer = {
+       .timeout_fatal_disable = false,
+       .period = 0x3f, /* about 8s */
+       /* NOTE(review): 1 << 0x3f cycles is far more than 8s at GHz-range
+        * clocks — confirm whether the period value or the "about 8s"
+        * claim is the intended default.
+        */
+};
 
 /**
  * DOC: vramlimit (int)
@@ -530,6 +534,20 @@ module_param_named(ras_enable, amdgpu_ras_enable, int, 0444);
 MODULE_PARM_DESC(ras_mask, "Mask of RAS features to enable (default 0xffffffff), only valid when ras_enable == 1");
 module_param_named(ras_mask, amdgpu_ras_mask, uint, 0444);
 
+/**
+ * DOC: timeout_fatal_disable (bool)
+ * Disable Watchdog timeout fatal error event
+ */
+MODULE_PARM_DESC(timeout_fatal_disable, "disable watchdog timeout fatal error (false = default)");
+module_param_named(timeout_fatal_disable, amdgpu_watchdog_timer.timeout_fatal_disable, bool, 0644);
+
+/**
+ * DOC: timeout_period (uint)
+ * Modify the watchdog timeout max_cycles as (1 << period)
+ */
+/* The description below previously said "0x1F = default", which contradicted
+ * the actual default (.period = 0x3f) set in amdgpu_watchdog_timer above.
+ */
+MODULE_PARM_DESC(timeout_period, "watchdog timeout period (0x3f = default), timeout maxCycles = (1 << period)");
+module_param_named(timeout_period, amdgpu_watchdog_timer.period, uint, 0644);
+
 /**
  * DOC: si_support (int)
  * Set SI support driver. This parameter works after set config CONFIG_DRM_AMDGPU_SI. For SI asic, when radeon driver is enabled,
index 1ab9632282d4695441f6f058b1a49cbc0fb2e1ac..d92f0f14cbebc5bf488101b25169f72d1c9bc160 100644 (file)
@@ -226,6 +226,8 @@ struct amdgpu_gfx_funcs {
        void (*init_spm_golden)(struct amdgpu_device *adev);
        void (*query_ras_error_status) (struct amdgpu_device *adev);
        void (*update_perfmon_mgcg)(struct amdgpu_device *adev, bool enable);
+       void (*enable_watchdog_timer)(struct amdgpu_device *adev);
+       void (*query_sq_timeout_status)(struct amdgpu_device *adev);
 };
 
 struct sq_work {
index c669435ccc747b85c46de9ad500906ed5036ad9a..c1516d871881ba7679fccc8e79a62c6777b74dd6 100644 (file)
@@ -1467,6 +1467,9 @@ static void amdgpu_ras_error_status_query(struct amdgpu_device *adev,
        case AMDGPU_RAS_BLOCK__GFX:
                if (adev->gfx.funcs->query_ras_error_status)
                        adev->gfx.funcs->query_ras_error_status(adev);
+
+               if (adev->gfx.funcs->query_sq_timeout_status)
+                       adev->gfx.funcs->query_sq_timeout_status(adev);
                break;
        case AMDGPU_RAS_BLOCK__MMHUB:
                if (adev->mmhub.funcs->query_ras_error_status)
index 6af4ecf8e9f0cefc413f5293d240826a58b36ebc..8b6ba1594f413a9ae7f8c740b80af4cb267c776b 100644 (file)
@@ -2124,6 +2124,8 @@ static const struct amdgpu_gfx_funcs gfx_v9_4_2_gfx_funcs = {
        .query_ras_error_count = &gfx_v9_4_2_query_ras_error_count,
        .reset_ras_error_count = &gfx_v9_4_2_reset_ras_error_count,
        .query_ras_error_status = &gfx_v9_4_2_query_ras_error_status,
+       .enable_watchdog_timer = &gfx_v9_4_2_enable_watchdog_timer,
+       .query_sq_timeout_status = &gfx_v9_4_2_query_sq_timeout_status,
 };
 
 static int gfx_v9_0_gpu_early_init(struct amdgpu_device *adev)
@@ -3968,6 +3970,9 @@ static int gfx_v9_0_hw_init(void *handle)
        if (adev->asic_type == CHIP_ALDEBARAN)
                gfx_v9_4_2_set_power_brake_sequence(adev);
 
+       if (adev->gfx.funcs->enable_watchdog_timer)
+               adev->gfx.funcs->enable_watchdog_timer(adev);
+
        return r;
 }
 
index b2e2026c3ec74ced702355a88c95abf57e97cea4..1faeae14ead90a6667d0db04fafbcf15e43a0110 100644 (file)
@@ -1129,3 +1129,109 @@ void gfx_v9_4_2_query_ras_error_status(struct amdgpu_device *adev)
        gfx_v9_4_2_query_ea_err_status(adev);
        gfx_v9_4_2_query_utc_err_status(adev);
 }
+
+/* Program the SQ watchdog on every shader engine.  The timeout period and
+ * whether a timeout raises a fatal error event come from the
+ * amdgpu_watchdog_timer module parameters.
+ */
+void gfx_v9_4_2_enable_watchdog_timer(struct amdgpu_device *adev)
+{
+       uint32_t i;
+       uint32_t data;
+
+       /* Build the SQ_TIMEOUT_CONFIG value from the module parameters. */
+       data = REG_SET_FIELD(0, SQ_TIMEOUT_CONFIG, TIMEOUT_FATAL_DISABLE,
+                            amdgpu_watchdog_timer.timeout_fatal_disable ? 1 :
+                                                                          0);
+       data = REG_SET_FIELD(data, SQ_TIMEOUT_CONFIG, PERIOD_SEL,
+                            amdgpu_watchdog_timer.period);
+
+       mutex_lock(&adev->grbm_idx_mutex);
+       for (i = 0; i < adev->gfx.config.max_shader_engines; i++) {
+               /* Select SE i with SH/CU broadcast, then write the config. */
+               gfx_v9_4_2_select_se_sh(adev, i, 0xffffffff, 0xffffffff);
+               WREG32_SOC15(GC, 0, regSQ_TIMEOUT_CONFIG, data);
+       }
+       /* Restore full broadcast selection before releasing the mutex. */
+       gfx_v9_4_2_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff);
+       mutex_unlock(&adev->grbm_idx_mutex);
+}
+
+/* Read one wave-indexed SQ register: program (wave, simd, address) into
+ * SQ_IND_INDEX (with FORCE_READ set) and fetch the result from SQ_IND_DATA.
+ * Callers in this file hold grbm_idx_mutex with the target SE/SH/CU already
+ * selected, so the index/data register pair is not raced.
+ */
+static uint32_t wave_read_ind(struct amdgpu_device *adev, uint32_t simd, uint32_t wave, uint32_t address)
+{
+       WREG32_SOC15_RLC_EX(reg, GC, 0, regSQ_IND_INDEX,
+               (wave << SQ_IND_INDEX__WAVE_ID__SHIFT) |
+               (simd << SQ_IND_INDEX__SIMD_ID__SHIFT) |
+               (address << SQ_IND_INDEX__INDEX__SHIFT) |
+               (SQ_IND_INDEX__FORCE_READ_MASK));
+       return RREG32_SOC15(GC, 0, regSQ_IND_DATA);
+}
+
+/* Decode @status — a bitmask of timed-out waves on the currently selected
+ * CU — and dump each offending wave's context (status, PC, EXEC mask,
+ * current instruction words, IB state) through the SQ indexed-register
+ * interface.  Caller holds grbm_idx_mutex with the CU selected.
+ */
+static void gfx_v9_4_2_log_cu_timeout_status(struct amdgpu_device *adev,
+                                       uint32_t status)
+{
+       struct amdgpu_cu_info *cu_info = &adev->gfx.cu_info;
+       uint32_t i, simd, wave;
+       uint32_t wave_status;
+       uint32_t wave_pc_lo, wave_pc_hi;
+       uint32_t wave_exec_lo, wave_exec_hi;
+       uint32_t wave_inst_dw0, wave_inst_dw1;
+       uint32_t wave_ib_sts;
+
+       for (i = 0; i < 32; i++) {
+               /* Bit i of @status marks wave index i as timed out.  The
+                * previous test, (i << 1) & status, computed the value 2*i
+                * rather than the bit mask, so wave 0 was never logged and
+                * multi-bit values matched the wrong waves.  Use an unsigned
+                * shift so i == 31 does not shift into the sign bit.
+                */
+               if (!((1U << i) & status))
+                       continue;
+
+               /* Map the flat wave index onto (simd, wave) coordinates. */
+               simd = i / cu_info->max_waves_per_simd;
+               wave = i % cu_info->max_waves_per_simd;
+
+               wave_status = wave_read_ind(adev, simd, wave, ixSQ_WAVE_STATUS);
+               wave_pc_lo = wave_read_ind(adev, simd, wave, ixSQ_WAVE_PC_LO);
+               wave_pc_hi = wave_read_ind(adev, simd, wave, ixSQ_WAVE_PC_HI);
+               wave_exec_lo =
+                       wave_read_ind(adev, simd, wave, ixSQ_WAVE_EXEC_LO);
+               wave_exec_hi =
+                       wave_read_ind(adev, simd, wave, ixSQ_WAVE_EXEC_HI);
+               wave_inst_dw0 =
+                       wave_read_ind(adev, simd, wave, ixSQ_WAVE_INST_DW0);
+               wave_inst_dw1 =
+                       wave_read_ind(adev, simd, wave, ixSQ_WAVE_INST_DW1);
+               wave_ib_sts = wave_read_ind(adev, simd, wave, ixSQ_WAVE_IB_STS);
+
+               /* Combine the hi/lo register halves into 64-bit values. */
+               dev_info(
+                       adev->dev,
+                       "\t SIMD %d, Wave %d: status 0x%x, pc 0x%llx, exec 0x%llx, inst 0x%llx, ib_sts 0x%x\n",
+                       simd, wave, wave_status,
+                       ((uint64_t)wave_pc_hi << 32 | wave_pc_lo),
+                       ((uint64_t)wave_exec_hi << 32 | wave_exec_lo),
+                       ((uint64_t)wave_inst_dw1 << 32 | wave_inst_dw0),
+                       wave_ib_sts);
+       }
+}
+
+/* Scan SQ_TIMEOUT_STATUS on every CU of every SH/SE.  For each CU that
+ * reports a watchdog timeout, log the offending waves, then clear the
+ * status register so stale bits do not re-report on the next query.
+ */
+void gfx_v9_4_2_query_sq_timeout_status(struct amdgpu_device *adev)
+{
+       uint32_t se_idx, sh_idx, cu_idx;
+       uint32_t status;
+
+       mutex_lock(&adev->grbm_idx_mutex);
+       for (se_idx = 0; se_idx < adev->gfx.config.max_shader_engines;
+            se_idx++) {
+               for (sh_idx = 0; sh_idx < adev->gfx.config.max_sh_per_se;
+                    sh_idx++) {
+                       for (cu_idx = 0;
+                            cu_idx < adev->gfx.config.max_cu_per_sh;
+                            cu_idx++) {
+                               /* Target one specific SE/SH/CU at a time. */
+                               gfx_v9_4_2_select_se_sh(adev, se_idx, sh_idx,
+                                                       cu_idx);
+                               status = RREG32_SOC15(GC, 0,
+                                                     regSQ_TIMEOUT_STATUS);
+                               if (status != 0) {
+                                       dev_info(
+                                               adev->dev,
+                                               "GFX Watchdog Timeout: SE %d, SH %d, CU %d\n",
+                                               se_idx, sh_idx, cu_idx);
+                                       gfx_v9_4_2_log_cu_timeout_status(
+                                               adev, status);
+                               }
+                               /* clear old status */
+                               WREG32_SOC15(GC, 0, regSQ_TIMEOUT_STATUS, 0);
+                       }
+               }
+       }
+       /* Restore full broadcast selection before releasing the mutex. */
+       gfx_v9_4_2_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff);
+       mutex_unlock(&adev->grbm_idx_mutex);
+}
\ No newline at end of file
index d7e3041947e8ef7432153de9570e8aa101faf321..e01fa6afa8e4f70a6a57ea2ed2cd4870a95dfa55 100644 (file)
@@ -35,4 +35,7 @@ int gfx_v9_4_2_ras_error_inject(struct amdgpu_device *adev, void *inject_if);
 void gfx_v9_4_2_query_ras_error_status(struct amdgpu_device *adev);
 int gfx_v9_4_2_query_ras_error_count(struct amdgpu_device *adev,
                                   void *ras_error_status);
+
+void gfx_v9_4_2_enable_watchdog_timer(struct amdgpu_device *adev);
+void gfx_v9_4_2_query_sq_timeout_status(struct amdgpu_device *adev);
 #endif /* __GFX_V9_4_2_H__ */
index 52ffbea63a4f6413ea09a6fc888fc18d9c6db7e9..8cdf5d1685cbbb789287fd17d37cacd0cd1120b5 100644 (file)
                }       \
        } while (0)
 
+/*
+ * Write @value to absolute register offset @reg.  Under SRIOV full access
+ * the write is routed through the RLC: the value and target offset are
+ * staged in <prefix>SCRATCH_REG0/REG1 (bit 31 of REG1 flags the request),
+ * the RLC is kicked via <prefix>RLC_SPARE_INT, and we poll up to
+ * 50000 x 10us for the RLC to acknowledge by clearing bit 31.  Without
+ * SRIOV full access this is a plain WREG32.  Comments cannot go inside
+ * the macro body without continuation backslashes, hence this header.
+ */
+#define WREG32_RLC_EX(prefix, reg, value) \
+       do {                                                    \
+               if (amdgpu_sriov_fullaccess(adev)) {    \
+                       uint32_t i = 0; \
+                       uint32_t retries = 50000;       \
+                       uint32_t r0 = adev->reg_offset[GC_HWIP][0][prefix##SCRATCH_REG0_BASE_IDX] + prefix##SCRATCH_REG0;       \
+                       uint32_t r1 = adev->reg_offset[GC_HWIP][0][prefix##SCRATCH_REG1_BASE_IDX] + prefix##SCRATCH_REG1;       \
+                       uint32_t spare_int = adev->reg_offset[GC_HWIP][0][prefix##RLC_SPARE_INT_BASE_IDX] + prefix##RLC_SPARE_INT;      \
+                       WREG32(r0, value);      \
+                       WREG32(r1, (reg | 0x80000000)); \
+                       WREG32(spare_int, 0x1); \
+                       for (i = 0; i < retries; i++) { \
+                               u32 tmp = RREG32(r1);   \
+                               if (!(tmp & 0x80000000))        \
+                                       break;  \
+                               udelay(10);     \
+                       }       \
+                       if (i >= retries)       \
+                               pr_err("timeout: rlcg program reg:0x%05x failed !\n", reg);     \
+               } else {        \
+                       WREG32(reg, value); \
+               }       \
+       } while (0)
+
 #define WREG32_SOC15_RLC_SHADOW(ip, inst, reg, value) \
        do {                                                    \
                uint32_t target_reg = adev->reg_offset[ip##_HWIP][inst][reg##_BASE_IDX] + reg;\
                        WREG32_RLC(target_reg, value); \
        } while (0)
 
+#define WREG32_SOC15_RLC_EX(prefix, ip, inst, reg, value) \
+       do {                                                    \
+                       uint32_t target_reg = adev->reg_offset[GC_HWIP][0][reg##_BASE_IDX] + reg;\
+                       WREG32_RLC_EX(prefix, target_reg, value); \
+       } while (0)
+
 #define WREG32_FIELD15_RLC(ip, idx, reg, field, val)   \
     WREG32_RLC((adev->reg_offset[ip##_HWIP][idx][mm##reg##_BASE_IDX] + mm##reg), \
     (RREG32(adev->reg_offset[ip##_HWIP][idx][mm##reg##_BASE_IDX] + mm##reg) \