drm/amdgpu: Support nbif v6_3_1 fatal error handling
Author:     Candice Li <candice.li@amd.com>
AuthorDate: Thu, 8 Aug 2024 08:40:41 +0000 (16:40 +0800)
Commit:     Alex Deucher <alexander.deucher@amd.com>
CommitDate: Tue, 10 Dec 2024 15:31:00 +0000 (10:31 -0500)
Add nbif v6_3_1 fatal error handling support.

Signed-off-by: Candice Li <candice.li@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
drivers/gpu/drm/amd/amdgpu/nbif_v6_3_1.c
drivers/gpu/drm/amd/amdgpu/nbif_v6_3_1.h
drivers/gpu/drm/amd/amdgpu/soc24.c

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 623ae9b3880037df767a162da56f3d75963d5410..db081618e85c3b609ba785c1c70f9c9fc3eaae42 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -36,6 +36,7 @@
 #include "amdgpu_xgmi.h"
 #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
 #include "nbio_v4_3.h"
+#include "nbif_v6_3_1.h"
 #include "nbio_v7_9.h"
 #include "atom.h"
 #include "amdgpu_reset.h"
@@ -3911,6 +3912,17 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
                         * check DF RAS */
                        adev->nbio.ras = &nbio_v4_3_ras;
                break;
+       case IP_VERSION(6, 3, 1):
+               if (adev->ras_hw_enabled & (1 << AMDGPU_RAS_BLOCK__DF))
+                       /* unlike other generations of nbio ras,
+                        * nbif v6_3_1 only supports the fatal error interrupt
+                        * to inform software that DF is frozen due to a
+                        * system fatal error event. The driver should not
+                        * enable nbio ras in such a case. Instead,
+                        * check DF RAS.
+                        */
+                       adev->nbio.ras = &nbif_v6_3_1_ras;
+               break;
        case IP_VERSION(7, 9, 0):
        case IP_VERSION(7, 9, 1):
                if (!adev->gmc.is_app_apu)
diff --git a/drivers/gpu/drm/amd/amdgpu/nbif_v6_3_1.c b/drivers/gpu/drm/amd/amdgpu/nbif_v6_3_1.c
index 39919e0892c1480ce3917369ef277bfd33a7997d..c92875ceb31f4505138758e1327c5bd5f825b700 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbif_v6_3_1.c
+++ b/drivers/gpu/drm/amd/amdgpu/nbif_v6_3_1.c
@@ -28,6 +28,7 @@
 #include "nbif/nbif_6_3_1_sh_mask.h"
 #include "pcie/pcie_6_1_0_offset.h"
 #include "pcie/pcie_6_1_0_sh_mask.h"
+#include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
 #include <uapi/linux/kfd_ioctl.h>
 
 static void nbif_v6_3_1_remap_hdp_registers(struct amdgpu_device *adev)
@@ -518,3 +519,83 @@ const struct amdgpu_nbio_funcs nbif_v6_3_1_sriov_funcs = {
        .get_rom_offset = nbif_v6_3_1_get_rom_offset,
        .set_reg_remap = nbif_v6_3_1_set_reg_remap,
 };
+
+static int nbif_v6_3_1_set_ras_err_event_athub_irq_state(struct amdgpu_device *adev,
+                                                      struct amdgpu_irq_src *src,
+                                                      unsigned type,
+                                                      enum amdgpu_interrupt_state state)
+{
+       /* The ras_controller_irq enablement should be done in psp bl when it
+        * tries to enable the ras feature. The driver only needs to set the correct
+        * interrupt vector for the bare-metal and sriov use cases, respectively.
+        */
+       uint32_t bif_doorbell_int_cntl;
+
+       bif_doorbell_int_cntl = RREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL);
+       bif_doorbell_int_cntl = REG_SET_FIELD(bif_doorbell_int_cntl,
+                                             BIF_BX0_BIF_DOORBELL_INT_CNTL,
+                                             RAS_ATHUB_ERR_EVENT_INTERRUPT_DISABLE,
+                                             (state == AMDGPU_IRQ_STATE_ENABLE) ? 0 : 1);
+       WREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL, bif_doorbell_int_cntl);
+
+       return 0;
+}
+
+static int nbif_v6_3_1_process_err_event_athub_irq(struct amdgpu_device *adev,
+                                                struct amdgpu_irq_src *source,
+                                                struct amdgpu_iv_entry *entry)
+{
+       /* By design, the ih cookie for err_event_athub_irq should be written
+        * to the bif ring. Since the bif ring is not enabled, just leave the
+        * process callback as a dummy one.
+        */
+       return 0;
+}
+
+static const struct amdgpu_irq_src_funcs nbif_v6_3_1_ras_err_event_athub_irq_funcs = {
+       .set = nbif_v6_3_1_set_ras_err_event_athub_irq_state,
+       .process = nbif_v6_3_1_process_err_event_athub_irq,
+};
+
+static void nbif_v6_3_1_handle_ras_err_event_athub_intr_no_bifring(struct amdgpu_device *adev)
+{
+       uint32_t bif_doorbell_int_cntl;
+
+       bif_doorbell_int_cntl = RREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL);
+       if (REG_GET_FIELD(bif_doorbell_int_cntl,
+                         BIF_BX0_BIF_DOORBELL_INT_CNTL,
+                         RAS_ATHUB_ERR_EVENT_INTERRUPT_STATUS)) {
+               /* driver has to clear the interrupt status when bif ring is disabled */
+               bif_doorbell_int_cntl = REG_SET_FIELD(bif_doorbell_int_cntl,
+                                               BIF_BX0_BIF_DOORBELL_INT_CNTL,
+                                               RAS_ATHUB_ERR_EVENT_INTERRUPT_CLEAR, 1);
+               WREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL, bif_doorbell_int_cntl);
+               amdgpu_ras_global_ras_isr(adev);
+       }
+}
+
+static int nbif_v6_3_1_init_ras_err_event_athub_interrupt(struct amdgpu_device *adev)
+{
+       int r;
+
+       /* init the irq funcs */
+       adev->nbio.ras_err_event_athub_irq.funcs =
+               &nbif_v6_3_1_ras_err_event_athub_irq_funcs;
+       adev->nbio.ras_err_event_athub_irq.num_types = 1;
+
+       /* register ras err event athub interrupt
+        * nbif v6_3_1 uses the same irq source as nbio v7_4
+        */
+       r = amdgpu_irq_add_id(adev, SOC21_IH_CLIENTID_BIF,
+                             NBIF_7_4__SRCID__ERREVENT_ATHUB_INTERRUPT,
+                             &adev->nbio.ras_err_event_athub_irq);
+
+       return r;
+}
+
+struct amdgpu_nbio_ras nbif_v6_3_1_ras = {
+       .handle_ras_err_event_athub_intr_no_bifring =
+               nbif_v6_3_1_handle_ras_err_event_athub_intr_no_bifring,
+       .init_ras_err_event_athub_interrupt =
+               nbif_v6_3_1_init_ras_err_event_athub_interrupt,
+};
diff --git a/drivers/gpu/drm/amd/amdgpu/nbif_v6_3_1.h b/drivers/gpu/drm/amd/amdgpu/nbif_v6_3_1.h
index b7f2e0d88905d203787722442fde8292bf40cb5b..9ac4831d39e17bb23f9d2a70554a5db6df20a422 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbif_v6_3_1.h
+++ b/drivers/gpu/drm/amd/amdgpu/nbif_v6_3_1.h
@@ -29,5 +29,6 @@
 extern const struct nbio_hdp_flush_reg nbif_v6_3_1_hdp_flush_reg;
 extern const struct amdgpu_nbio_funcs nbif_v6_3_1_funcs;
 extern const struct amdgpu_nbio_funcs nbif_v6_3_1_sriov_funcs;
+extern struct amdgpu_nbio_ras nbif_v6_3_1_ras;
 
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/soc24.c b/drivers/gpu/drm/amd/amdgpu/soc24.c
index eda03d40d76589e49ebaaf6bf3bb96bde40bf7fc..6b8e078ee7c7519c5ebdcd76ba56c5064117dca0 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc24.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc24.c
@@ -444,8 +444,18 @@ static int soc24_common_late_init(struct amdgpu_ip_block *ip_block)
 {
        struct amdgpu_device *adev = ip_block->adev;
 
-       if (amdgpu_sriov_vf(adev))
+       if (amdgpu_sriov_vf(adev)) {
                xgpu_nv_mailbox_get_irq(adev);
+       } else {
+               if (adev->nbio.ras &&
+                   adev->nbio.ras_err_event_athub_irq.funcs)
+                       /* no need to fail gpu late init
+                        * if enabling the athub_err_event interrupt fails.
+                        * nbif v6_3_1 only supports fatal error handling,
+                        * so just enable the interrupt directly.
+                        */
+                       amdgpu_irq_get(adev, &adev->nbio.ras_err_event_athub_irq, 0);
+       }
 
        /* Enable selfring doorbell aperture late because doorbell BAR
         * aperture will change if resize BAR successfully in gmc sw_init.
@@ -501,8 +511,13 @@ static int soc24_common_hw_fini(struct amdgpu_ip_block *ip_block)
        adev->nbio.funcs->enable_doorbell_aperture(adev, false);
        adev->nbio.funcs->enable_doorbell_selfring_aperture(adev, false);
 
-       if (amdgpu_sriov_vf(adev))
+       if (amdgpu_sriov_vf(adev)) {
                xgpu_nv_mailbox_put_irq(adev);
+       } else {
+               if (adev->nbio.ras &&
+                   adev->nbio.ras_err_event_athub_irq.funcs)
+                       amdgpu_irq_put(adev, &adev->nbio.ras_err_event_athub_irq, 0);
+       }
 
        return 0;
 }
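
Below is a minimal sketch (not part of the patch) of how the two nbif_v6_3_1_ras callbacks are expected to be exercised by the amdgpu RAS core: the init hook registers the fatal error interrupt once during RAS setup, and the no-bifring handler is called on the fatal error path to clear the doorbell status and kick the global RAS ISR. The example_* wrappers are hypothetical names used for illustration only; the real call sites live in the existing amdgpu_ras.c paths.

/* Illustrative only: hypothetical wrappers mirroring the RAS core call flow
 * around the hooks added by this patch.
 */
#include "amdgpu.h"

/* During RAS init, once adev->nbio.ras has been set to &nbif_v6_3_1_ras: */
static int example_init_nbif_fatal_irq(struct amdgpu_device *adev)
{
	if (adev->nbio.ras &&
	    adev->nbio.ras->init_ras_err_event_athub_interrupt)
		/* registers NBIF_7_4__SRCID__ERREVENT_ATHUB_INTERRUPT as above */
		return adev->nbio.ras->init_ras_err_event_athub_interrupt(adev);

	return 0;
}

/* On a fatal error event while the bif ring is disabled: */
static void example_handle_nbif_fatal_intr(struct amdgpu_device *adev)
{
	if (adev->nbio.ras &&
	    adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring)
		/* clears RAS_ATHUB_ERR_EVENT_INTERRUPT_STATUS and invokes
		 * amdgpu_ras_global_ras_isr(), as implemented above
		 */
		adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring(adev);
}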