drm/i915: Reboot CI if we get wedged during driver init
author: Michał Winiarski <michal.winiarski@intel.com>
Mon, 6 Jul 2020 14:41:05 +0000 (16:41 +0200)
committer: Chris Wilson <chris@chris-wilson.co.uk>
Mon, 6 Jul 2020 18:21:07 +0000 (19:21 +0100)
Getting a wedged device on driver init is pretty much unrecoverable.
We run various scenarios in CI that may potentially hit this (module
reload / selftests / hotunplug), and if it happens, we can't trust any
subsequent CI results. So we should just apply the taint to let the CI
know that it should reboot (CI checks the taint between test runs).

v2: Comment that WEDGED_ON_INIT is non-recoverable, distinguish
    WEDGED_ON_INIT from WEDGED_ON_FINI (Chris)
v3: Appease checkpatch, fixup search-replace logic expression mindbomb
    in assert (Chris)

Signed-off-by: Michał Winiarski <michal.winiarski@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Petri Latvala <petri.latvala@intel.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20200706144107.204821-1-michal@hardline.pl
drivers/gpu/drm/i915/gt/intel_engine_user.c
drivers/gpu/drm/i915/gt/intel_gt.c
drivers/gpu/drm/i915/gt/intel_gt.h
drivers/gpu/drm/i915/gt/intel_gt_pm.c
drivers/gpu/drm/i915/gt/intel_reset.c
drivers/gpu/drm/i915/gt/intel_reset.h
drivers/gpu/drm/i915/gt/intel_reset_types.h

index 848decee90665ab4460e40bbe22c609c3c9def02..34e6096f196ed8eb647cfcce68faf427a23e60cf 100644 (file)
@@ -201,7 +201,7 @@ void intel_engines_driver_register(struct drm_i915_private *i915)
                                     uabi_node);
                char old[sizeof(engine->name)];
 
-               if (intel_gt_has_init_error(engine->gt))
+               if (intel_gt_has_unrecoverable_error(engine->gt))
                        continue; /* ignore incomplete engines */
 
                GEM_BUG_ON(engine->class >= ARRAY_SIZE(uabi_classes));
index ebc29b6ee86cbd738279d219eae04b44bbfcdd03..876f78759095bb79a3d19af75c6b5ba328a5e2f0 100644 (file)
@@ -510,7 +510,7 @@ static int __engines_verify_workarounds(struct intel_gt *gt)
 
 static void __intel_gt_disable(struct intel_gt *gt)
 {
-       intel_gt_set_wedged_on_init(gt);
+       intel_gt_set_wedged_on_fini(gt);
 
        intel_gt_suspend_prepare(gt);
        intel_gt_suspend_late(gt);
index 4fac043750aa39ad2fa3d5c96a10686f951d6277..982957ca4e62c1227632483d13c1f43662546903 100644 (file)
@@ -58,14 +58,18 @@ static inline u32 intel_gt_scratch_offset(const struct intel_gt *gt,
        return i915_ggtt_offset(gt->scratch) + field;
 }
 
-static inline bool intel_gt_is_wedged(const struct intel_gt *gt)
+static inline bool intel_gt_has_unrecoverable_error(const struct intel_gt *gt)
 {
-       return __intel_reset_failed(&gt->reset);
+       return test_bit(I915_WEDGED_ON_INIT, &gt->reset.flags) ||
+              test_bit(I915_WEDGED_ON_FINI, &gt->reset.flags);
 }
 
-static inline bool intel_gt_has_init_error(const struct intel_gt *gt)
+static inline bool intel_gt_is_wedged(const struct intel_gt *gt)
 {
-       return test_bit(I915_WEDGED_ON_INIT, &gt->reset.flags);
+       GEM_BUG_ON(intel_gt_has_unrecoverable_error(gt) &&
+                  !test_bit(I915_WEDGED, &gt->reset.flags));
+
+       return unlikely(test_bit(I915_WEDGED, &gt->reset.flags));
 }
 
 #endif /* __INTEL_GT_H__ */
index f1d5333f9456335191af534b7ea9c96ed40a45e3..274aa0dd7050e9bf271ecc2fc985f0166355c95a 100644 (file)
@@ -188,7 +188,7 @@ int intel_gt_resume(struct intel_gt *gt)
        enum intel_engine_id id;
        int err;
 
-       err = intel_gt_has_init_error(gt);
+       err = intel_gt_has_unrecoverable_error(gt);
        if (err)
                return err;
 
index 0156f1f5c736fd441a33d8f621065f5717fc4947..6f94b6479a2f6b736e4436bae61c60bb9da2bbd4 100644 (file)
@@ -880,7 +880,7 @@ static bool __intel_gt_unset_wedged(struct intel_gt *gt)
                return true;
 
        /* Never fully initialised, recovery impossible */
-       if (test_bit(I915_WEDGED_ON_INIT, &gt->reset.flags))
+       if (intel_gt_has_unrecoverable_error(gt))
                return false;
 
        GT_TRACE(gt, "start\n");
@@ -1342,7 +1342,7 @@ int intel_gt_terminally_wedged(struct intel_gt *gt)
        if (!intel_gt_is_wedged(gt))
                return 0;
 
-       if (intel_gt_has_init_error(gt))
+       if (intel_gt_has_unrecoverable_error(gt))
                return -EIO;
 
        /* Reset still in progress? Maybe we will recover? */
@@ -1360,6 +1360,15 @@ void intel_gt_set_wedged_on_init(struct intel_gt *gt)
                     I915_WEDGED_ON_INIT);
        intel_gt_set_wedged(gt);
        set_bit(I915_WEDGED_ON_INIT, &gt->reset.flags);
+
+       /* Wedged on init is non-recoverable */
+       add_taint_for_CI(TAINT_WARN);
+}
+
+void intel_gt_set_wedged_on_fini(struct intel_gt *gt)
+{
+       intel_gt_set_wedged(gt);
+       set_bit(I915_WEDGED_ON_FINI, &gt->reset.flags);
 }
 
 void intel_gt_init_reset(struct intel_gt *gt)
index 8e8d5f7611665904565275c99f6037bd206e1709..a0eec7c11c0ccd1d7a484ffecc18353519ac1ebf 100644 (file)
@@ -47,8 +47,10 @@ int intel_gt_terminally_wedged(struct intel_gt *gt);
 /*
  * There's no unset_wedged_on_init paired with this one.
  * Once we're wedged on init, there's no going back.
+ * Same thing for unset_wedged_on_fini.
  */
 void intel_gt_set_wedged_on_init(struct intel_gt *gt);
+void intel_gt_set_wedged_on_fini(struct intel_gt *gt);
 
 int __intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask);
 
@@ -71,14 +73,6 @@ void __intel_fini_wedge(struct intel_wedge_me *w);
             (W)->gt;                                                   \
             __intel_fini_wedge((W)))
 
-static inline bool __intel_reset_failed(const struct intel_reset *reset)
-{
-       GEM_BUG_ON(test_bit(I915_WEDGED_ON_INIT, &reset->flags) ?
-                  !test_bit(I915_WEDGED, &reset->flags) : false);
-
-       return unlikely(test_bit(I915_WEDGED, &reset->flags));
-}
-
 bool intel_has_gpu_reset(const struct intel_gt *gt);
 bool intel_has_reset_engine(const struct intel_gt *gt);
 
index f43bc3a0fe4feb149985b45a143ff2ab36f3a66a..add6b86d9d03f471be600b205e0b332c7e2be313 100644 (file)
@@ -34,12 +34,17 @@ struct intel_reset {
         * longer use the GPU - similar to #I915_WEDGED bit. The difference in
         * in the way we're handling "forced" unwedged (e.g. through debugfs),
         * which is not allowed in case we failed to initialize.
+        *
+        * #I915_WEDGED_ON_FINI - Similar to #I915_WEDGED_ON_INIT, except we
+        * use it to mark that the GPU is no longer available (and prevent
+        * users from using it).
         */
        unsigned long flags;
 #define I915_RESET_BACKOFF     0
 #define I915_RESET_MODESET     1
 #define I915_RESET_ENGINE      2
-#define I915_WEDGED_ON_INIT    (BITS_PER_LONG - 2)
+#define I915_WEDGED_ON_INIT    (BITS_PER_LONG - 3)
+#define I915_WEDGED_ON_FINI    (BITS_PER_LONG - 2)
 #define I915_WEDGED            (BITS_PER_LONG - 1)
 
        struct mutex mutex; /* serialises wedging/unwedging */