drm/i915: Harden detection of missed interrupts
author Chris Wilson <chris@chris-wilson.co.uk>
Sat, 9 Apr 2016 09:57:55 +0000 (10:57 +0100)
committer Chris Wilson <chris@chris-wilson.co.uk>
Sat, 9 Apr 2016 11:09:29 +0000 (12:09 +0100)
Only declare a missed interrupt if we find that the GPU is idle with
waiters and a hangcheck interval has passed in which no new user
interrupts have been raised.

v2: Clear the stuck interrupt marker between successful batches

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/1460195877-20520-3-git-send-email-chris@chris-wilson.co.uk
drivers/gpu/drm/i915/i915_debugfs.c
drivers/gpu/drm/i915/i915_irq.c
drivers/gpu/drm/i915/intel_ringbuffer.h

index 919c05ba9932c041df1d19775c4ddae6d0c979fa..9640738aabf27d5e2cb00140857bffd0af6b5390 100644 (file)
@@ -728,10 +728,10 @@ static int i915_gem_request_info(struct seq_file *m, void *data)
 static void i915_ring_seqno_info(struct seq_file *m,
                                 struct intel_engine_cs *engine)
 {
-       if (engine->get_seqno) {
-               seq_printf(m, "Current sequence (%s): %x\n",
-                          engine->name, engine->get_seqno(engine));
-       }
+       seq_printf(m, "Current sequence (%s): %x\n",
+                  engine->name, engine->get_seqno(engine));
+       seq_printf(m, "Current user interrupts (%s): %x\n",
+                  engine->name, READ_ONCE(engine->user_interrupts));
 }
 
 static int i915_gem_seqno_info(struct seq_file *m, void *data)
@@ -1367,6 +1367,9 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
                           engine->hangcheck.seqno,
                           seqno[id],
                           engine->last_submitted_seqno);
+               seq_printf(m, "\tuser interrupts = %x [current %x]\n",
+                          engine->hangcheck.user_interrupts,
+                          READ_ONCE(engine->user_interrupts));
                seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n",
                           (long long)engine->hangcheck.acthd,
                           (long long)acthd[id]);
index 3b946e1c76146d3a049abbdc705254f793513e7d..679f08c944ef6cc9be8a56818ac73d4729437409 100644 (file)
@@ -1000,6 +1000,7 @@ static void notify_ring(struct intel_engine_cs *engine)
                return;
 
        trace_i915_gem_request_notify(engine);
+       engine->user_interrupts++;
 
        wake_up_all(&engine->irq_queue);
 }
@@ -3054,6 +3055,24 @@ ring_stuck(struct intel_engine_cs *engine, u64 acthd)
        return HANGCHECK_HUNG;
 }
 
+static unsigned kick_waiters(struct intel_engine_cs *engine)
+{
+       struct drm_i915_private *i915 = to_i915(engine->dev);
+       unsigned user_interrupts = READ_ONCE(engine->user_interrupts);
+
+       if (engine->hangcheck.user_interrupts == user_interrupts &&
+           !test_and_set_bit(engine->id, &i915->gpu_error.missed_irq_rings)) {
+               if (!(i915->gpu_error.test_irq_rings & intel_engine_flag(engine)))
+                       DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
+                                 engine->name);
+               else
+                       DRM_INFO("Fake missed irq on %s\n",
+                                engine->name);
+               wake_up_all(&engine->irq_queue);
+       }
+
+       return user_interrupts;
+}
 /*
  * This is called when the chip hasn't reported back with completed
  * batchbuffers in a long time. We keep track per ring seqno progress and
@@ -3096,6 +3115,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
        for_each_engine_id(engine, dev_priv, id) {
                u64 acthd;
                u32 seqno;
+               unsigned user_interrupts;
                bool busy = true;
 
                semaphore_clear_deadlocks(dev_priv);
@@ -3113,22 +3133,15 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
                acthd = intel_ring_get_active_head(engine);
                seqno = engine->get_seqno(engine);
 
+               /* Reset stuck interrupts between batch advances */
+               user_interrupts = 0;
+
                if (engine->hangcheck.seqno == seqno) {
                        if (ring_idle(engine, seqno)) {
                                engine->hangcheck.action = HANGCHECK_IDLE;
-
                                if (waitqueue_active(&engine->irq_queue)) {
-                                       /* Issue a wake-up to catch stuck h/w. */
-                                       if (!test_and_set_bit(engine->id, &dev_priv->gpu_error.missed_irq_rings)) {
-                                               if (!(dev_priv->gpu_error.test_irq_rings & intel_engine_flag(engine)))
-                                                       DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
-                                                                 engine->name);
-                                               else
-                                                       DRM_INFO("Fake missed irq on %s\n",
-                                                                engine->name);
-                                               wake_up_all(&engine->irq_queue);
-                                       }
                                        /* Safeguard against driver failure */
+                                       user_interrupts = kick_waiters(engine);
                                        engine->hangcheck.score += BUSY;
                                } else
                                        busy = false;
@@ -3179,7 +3192,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
                                engine->hangcheck.score = 0;
 
                        /* Clear head and subunit states on seqno movement */
-                       engine->hangcheck.acthd = 0;
+                       acthd = 0;
 
                        memset(engine->hangcheck.instdone, 0,
                               sizeof(engine->hangcheck.instdone));
@@ -3187,6 +3200,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
 
                engine->hangcheck.seqno = seqno;
                engine->hangcheck.acthd = acthd;
+               engine->hangcheck.user_interrupts = user_interrupts;
                busy_count += busy;
        }
 
index 3f04906a081fbe7025788b2c2d93c739bac2bf73..29c54cc1ee5cf06bc9d3d83da8c797317c974b5a 100644 (file)
@@ -87,6 +87,7 @@ enum intel_ring_hangcheck_action {
 struct intel_ring_hangcheck {
        u64 acthd;
        u32 seqno;
+       unsigned user_interrupts;
        int score;
        enum intel_ring_hangcheck_action action;
        int deadlock;
@@ -305,6 +306,7 @@ struct  intel_engine_cs {
         * inspecting request list.
         */
        u32 last_submitted_seqno;
+       unsigned user_interrupts;
 
        bool gpu_caches_dirty;