accel/ivpu: Implement heartbeat-based TDR mechanism
author Karol Wachowski <karol.wachowski@intel.com>
Wed, 16 Apr 2025 10:25:55 +0000 (12:25 +0200)
committer Jacek Lawrynowicz <jacek.lawrynowicz@linux.intel.com>
Fri, 25 Apr 2025 07:49:11 +0000 (09:49 +0200)
Introduce a heartbeat-based Timeout Detection and Recovery (TDR) mechanism.
The enhancement aims to improve the reliability of device hang detection by
monitoring heartbeat updates.

Each progressing inference updates a heartbeat counter, allowing the driver to
monitor its progress. When the heartbeat indicates progress, limit the maximum
number of reschedules to 30. This increases the maximum running time of a
single inference to about 60 seconds.

The heartbeat mechanism provides a more robust method for detecting device
hangs, potentially reducing false positive recoveries due to long running
inferences.

Signed-off-by: Karol Wachowski <karol.wachowski@intel.com>
Signed-off-by: Maciej Falkowski <maciej.falkowski@linux.intel.com>
Reviewed-by: Jeff Hugo <jeff.hugo@oss.qualcomm.com>
Signed-off-by: Jacek Lawrynowicz <jacek.lawrynowicz@linux.intel.com>
Link: https://lore.kernel.org/r/20250416102555.384526-1-maciej.falkowski@linux.intel.com
drivers/accel/ivpu/ivpu_drv.c
drivers/accel/ivpu/ivpu_drv.h
drivers/accel/ivpu/ivpu_fw.h
drivers/accel/ivpu/ivpu_pm.c

index 4fa73189502e1a9c869d1a272f4ddc0e835f22d8..392d6790b4cdd93eec2dfaf363036c7fe36b4c98 100644 (file)
@@ -374,6 +374,9 @@ int ivpu_boot(struct ivpu_device *vdev)
 {
        int ret;
 
+       drm_WARN_ON(&vdev->drm, atomic_read(&vdev->job_timeout_counter));
+       drm_WARN_ON(&vdev->drm, !xa_empty(&vdev->submitted_jobs_xa));
+
        /* Update boot params located at first 4KB of FW memory */
        ivpu_fw_boot_params_setup(vdev, ivpu_bo_vaddr(vdev->fw->mem));
 
@@ -573,6 +576,7 @@ static int ivpu_dev_init(struct ivpu_device *vdev)
        vdev->context_xa_limit.min = IVPU_USER_CONTEXT_MIN_SSID;
        vdev->context_xa_limit.max = IVPU_USER_CONTEXT_MAX_SSID;
        atomic64_set(&vdev->unique_id_counter, 0);
+       atomic_set(&vdev->job_timeout_counter, 0);
        xa_init_flags(&vdev->context_xa, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ);
        xa_init_flags(&vdev->submitted_jobs_xa, XA_FLAGS_ALLOC1);
        xa_init_flags(&vdev->db_xa, XA_FLAGS_ALLOC1);
index 92753effb1c9966289f26f2183d77d61753f2b7f..5497e7030e91570d3bb7d85cf76868bfd0f85300 100644 (file)
@@ -154,6 +154,7 @@ struct ivpu_device {
        struct mutex submitted_jobs_lock; /* Protects submitted_jobs */
        struct xarray submitted_jobs_xa;
        struct ivpu_ipc_consumer job_done_consumer;
+       atomic_t job_timeout_counter;
 
        atomic64_t unique_id_counter;
 
index 1d0b2bd9d65cf02ac8a6ba2592ae551504135ff4..9a3935be1c0574e7dbb46b30cdcb52536d2883b6 100644 (file)
@@ -39,6 +39,7 @@ struct ivpu_fw_info {
        u64 read_only_addr;
        u32 read_only_size;
        u32 sched_mode;
+       u64 last_heartbeat;
 };
 
 int ivpu_fw_init(struct ivpu_device *vdev);
index b5891e91f7abaf56f711267c0dc2bd19bee214f6..1fe03fc16bbc84faacfca49dade63667f916e9c8 100644 (file)
@@ -34,6 +34,7 @@ module_param_named(tdr_timeout_ms, ivpu_tdr_timeout_ms, ulong, 0644);
 MODULE_PARM_DESC(tdr_timeout_ms, "Timeout for device hang detection, in milliseconds, 0 - default");
 
 #define PM_RESCHEDULE_LIMIT     5
+#define PM_TDR_HEARTBEAT_LIMIT  30
 
 static void ivpu_pm_prepare_cold_boot(struct ivpu_device *vdev)
 {
@@ -44,6 +45,7 @@ static void ivpu_pm_prepare_cold_boot(struct ivpu_device *vdev)
        ivpu_fw_log_reset(vdev);
        ivpu_fw_load(vdev);
        fw->entry_point = fw->cold_boot_entry_point;
+       fw->last_heartbeat = 0;
 }
 
 static void ivpu_pm_prepare_warm_boot(struct ivpu_device *vdev)
@@ -189,7 +191,24 @@ static void ivpu_job_timeout_work(struct work_struct *work)
 {
        struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info, job_timeout_work.work);
        struct ivpu_device *vdev = pm->vdev;
+       u64 heartbeat;
 
+       if (ivpu_jsm_get_heartbeat(vdev, 0, &heartbeat) || heartbeat <= vdev->fw->last_heartbeat) {
+               ivpu_err(vdev, "Job timeout detected, heartbeat not progressed\n");
+               goto recovery;
+       }
+
+       if (atomic_fetch_inc(&vdev->job_timeout_counter) > PM_TDR_HEARTBEAT_LIMIT) {
+               ivpu_err(vdev, "Job timeout detected, heartbeat limit exceeded\n");
+               goto recovery;
+       }
+
+       vdev->fw->last_heartbeat = heartbeat;
+       ivpu_start_job_timeout_detection(vdev);
+       return;
+
+recovery:
+       atomic_set(&vdev->job_timeout_counter, 0);
        ivpu_pm_trigger_recovery(vdev, "TDR");
 }
 
@@ -204,6 +223,7 @@ void ivpu_start_job_timeout_detection(struct ivpu_device *vdev)
 void ivpu_stop_job_timeout_detection(struct ivpu_device *vdev)
 {
        cancel_delayed_work_sync(&vdev->pm->job_timeout_work);
+       atomic_set(&vdev->job_timeout_counter, 0);
 }
 
 int ivpu_pm_suspend_cb(struct device *dev)