drm/amdgpu: Generate bad page threshold cper records
author	Xiang Liu <xiang.liu@amd.com>
Tue, 11 Feb 2025 11:45:52 +0000 (19:45 +0800)
committer	Alex Deucher <alexander.deucher@amd.com>
Mon, 17 Feb 2025 19:09:30 +0000 (14:09 -0500)
Generate a CPER record when the bad page threshold is exceeded and
commit it to the CPER ring.

v2: return -ENOMEM instead of false
v2: check return value of fill section function

Signed-off-by: Xiang Liu <xiang.liu@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c
drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h
drivers/gpu/drm/amd/pm/amdgpu_dpm.c

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c
index 26e0655e7ed49332dbba7d5cdb9f412a5e00051f..8805381e19b990a3562e745f35042626a4f55b18 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c
@@ -207,7 +207,7 @@ int amdgpu_cper_entry_fill_bad_page_threshold_section(struct amdgpu_device *adev
                   NONSTD_SEC_OFFSET(hdr->sec_cnt, idx));
 
        amdgpu_cper_entry_fill_section_desc(adev, section_desc, true, false,
-                                           CPER_SEV_FATAL, RUNTIME, NONSTD_SEC_LEN,
+                                           CPER_SEV_NUM, RUNTIME, NONSTD_SEC_LEN,
                                            NONSTD_SEC_OFFSET(hdr->sec_cnt, idx));
 
        section->hdr.valid_bits.err_info_cnt = 1;
@@ -308,6 +308,28 @@ int amdgpu_cper_generate_ue_record(struct amdgpu_device *adev,
        return 0;
 }
 
+int amdgpu_cper_generate_bp_threshold_record(struct amdgpu_device *adev)
+{
+       struct cper_hdr *bp_threshold = NULL;
+       struct amdgpu_ring *ring = &adev->cper.ring_buf;
+       int ret;
+
+       bp_threshold = amdgpu_cper_alloc_entry(adev, AMDGPU_CPER_TYPE_BP_THRESHOLD, 1);
+       if (!bp_threshold) {
+               dev_err(adev->dev, "fail to alloc cper entry for bad page threshold record\n");
+               return -ENOMEM;
+       }
+
+       amdgpu_cper_entry_fill_hdr(adev, bp_threshold, AMDGPU_CPER_TYPE_BP_THRESHOLD, CPER_SEV_NUM);
+       ret = amdgpu_cper_entry_fill_bad_page_threshold_section(adev, bp_threshold, 0);
+       if (ret)
+               return ret;
+
+       amdgpu_cper_ring_write(ring, bp_threshold, bp_threshold->record_length);
+
+       return 0;
+}
+
 static enum cper_error_severity amdgpu_aca_err_type_to_cper_sev(struct amdgpu_device *adev,
                                                                enum aca_error_type aca_err_type)
 {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h
index d35d1ddac7ccf01b327757d3a5bffa54d20ed9d7..bcb97d245673bf6cd7426c7a71c3247ebbad688a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h
@@ -95,6 +95,8 @@ int amdgpu_cper_generate_ue_record(struct amdgpu_device *adev,
 int amdgpu_cper_generate_ce_records(struct amdgpu_device *adev,
                                    struct aca_banks *banks,
                                    uint16_t bank_count);
+/* Bad page threshold is encoded into a separate cper entry */
+int amdgpu_cper_generate_bp_threshold_record(struct amdgpu_device *adev);
 void amdgpu_cper_ring_write(struct amdgpu_ring *ring,
                        void *src, int count);
 int amdgpu_cper_init(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
index 7a22aef6e59c38cd59cae8de661eb587a86f1294..faae9bf48aa4f673b6e724014d34f97384462205 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
@@ -716,6 +716,9 @@ int amdgpu_dpm_send_rma_reason(struct amdgpu_device *adev)
        ret = smu_send_rma_reason(smu);
        mutex_unlock(&adev->pm.mutex);
 
+       if (amdgpu_cper_generate_bp_threshold_record(adev))
+               dev_warn(adev->dev, "fail to generate bad page threshold cper records\n");
+
        return ret;
 }
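
A minimal usage sketch, not part of the patch: the wrapper function below is hypothetical, and only amdgpu_cper_generate_bp_threshold_record() and the warning pattern come from this change. The helper allocates the CPER entry, fills the header and the bad page threshold section, and commits the record to adev->cper.ring_buf, so a caller only needs to check its return value.

    #include "amdgpu.h"
    #include "amdgpu_cper.h"

    /* Hypothetical example caller, modeled on amdgpu_dpm_send_rma_reason() above. */
    static void example_report_bp_threshold(struct amdgpu_device *adev)
    {
            /* Record generation failure is logged but not treated as fatal. */
            if (amdgpu_cper_generate_bp_threshold_record(adev))
                    dev_warn(adev->dev,
                             "fail to generate bad page threshold cper records\n");
    }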