From 1dab4561a341afdbaafe0ce6091106d0c63c79e0 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Wed, 29 Jun 2022 21:39:59 -0700 Subject: [PATCH] drm/i915/reset: Handle reset timeouts under unrelated kernel hangs When resuming after hibernate sometimes we see hangs in unrelated kernel subsystems. These hangs often result in the following i915 trace: i915 0000:00:02.0: [drm] *ERROR* \ intel_gt_reset_global timed out, cancelling all in-flight rendering implying our reset task has been starved by the hanging kernel subsystem, causing us to inappropiately declare the system as wedged beyond recovery. The trace would be caused by our synchronize_srcu_expedited() taking more than the allowed 5s due to the unrelated kernel hang. But we neither need to perform that synchronisation inside the reset watchdog, nor do we need such a short timeout before declaring the device as unrecoverable. v2: Restore watchdog timeout to the previous 5 seconds (Ashutosh) Bug: https://gitlab.freedesktop.org/drm/intel/-/issues/3575 Signed-off-by: Chris Wilson Signed-off-by: Ashutosh Dixit Reviewed-by: Ashutosh Dixit Signed-off-by: Matthew Auld Link: https://patchwork.freedesktop.org/patch/msgid/20220630043959.5708-1-ashutosh.dixit@intel.com --- drivers/gpu/drm/i915/gt/intel_reset.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c index c68d36fb5bbd..1211774e1d91 100644 --- a/drivers/gpu/drm/i915/gt/intel_reset.c +++ b/drivers/gpu/drm/i915/gt/intel_reset.c @@ -1281,9 +1281,6 @@ static void intel_gt_reset_global(struct intel_gt *gt, intel_wedge_on_timeout(&w, gt, 5 * HZ) { intel_display_prepare_reset(gt->i915); - /* Flush everyone using a resource about to be clobbered */ - synchronize_srcu_expedited(>->reset.backoff_srcu); - intel_gt_reset(gt, engine_mask, reason); intel_display_finish_reset(gt->i915); @@ -1392,6 +1389,9 @@ void intel_gt_handle_error(struct intel_gt *gt, } } + /* Flush everyone using a resource about to be clobbered */ + synchronize_srcu_expedited(>->reset.backoff_srcu); + intel_gt_reset_global(gt, engine_mask, msg); if (!intel_uc_uses_guc_submission(>->uc)) { -- 2.25.1