drm/i915: Track context current active time
authorTvrtko Ursulin <tvrtko.ursulin@intel.com>
Fri, 1 Apr 2022 14:22:02 +0000 (15:22 +0100)
committerTvrtko Ursulin <tvrtko.ursulin@intel.com>
Tue, 5 Apr 2022 07:39:10 +0000 (08:39 +0100)
Track context active (on hardware) status together with the start
timestamp.

This will be used to provide better granularity of context
runtime reporting in conjunction with already tracked pphwsp accumulated
runtime.

The latter is only updated on context save so does not give us visibility
to any currently executing work.

As part of the patch the existing runtime tracking data is moved under the
new ce->stats member and updated under the seqlock. This provides the
ability to atomically read out accumulated plus active runtime.

v2:
 * Rename and make __intel_context_get_active_time unlocked.

v3:
 * Use GRAPHICS_VER.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Aravind Iddamsetty <aravind.iddamsetty@intel.com> # v1
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20220401142205.3123159-6-tvrtko.ursulin@linux.intel.com
drivers/gpu/drm/i915/gt/intel_context.c
drivers/gpu/drm/i915/gt/intel_context.h
drivers/gpu/drm/i915/gt/intel_context_types.h
drivers/gpu/drm/i915/gt/intel_execlists_submission.c
drivers/gpu/drm/i915/gt/intel_gt_clock_utils.c
drivers/gpu/drm/i915/gt/intel_lrc.c
drivers/gpu/drm/i915/gt/intel_lrc.h
drivers/gpu/drm/i915/gt/selftest_lrc.c
drivers/gpu/drm/i915/i915_gpu_error.c
drivers/gpu/drm/i915/i915_gpu_error.h

index d87145b8fca0c973e4d2e62d9ad9a142059135d8..4070cb5711d88f672ee3505d98177d74cc0af98c 100644 (file)
@@ -386,7 +386,7 @@ intel_context_init(struct intel_context *ce, struct intel_engine_cs *engine)
        ce->ring = NULL;
        ce->ring_size = SZ_4K;
 
-       ewma_runtime_init(&ce->runtime.avg);
+       ewma_runtime_init(&ce->stats.runtime.avg);
 
        ce->vm = i915_vm_get(engine->gt->vm);
 
@@ -576,6 +576,31 @@ void intel_context_bind_parent_child(struct intel_context *parent,
        child->parallel.parent = parent;
 }
 
+u64 intel_context_get_total_runtime_ns(const struct intel_context *ce)
+{
+       u64 total, active;
+
+       total = ce->stats.runtime.total;
+       if (ce->ops->flags & COPS_RUNTIME_CYCLES)
+               total *= ce->engine->gt->clock_period_ns;
+
+       active = READ_ONCE(ce->stats.active);
+       if (active)
+               active = intel_context_clock() - active;
+
+       return total + active;
+}
+
+u64 intel_context_get_avg_runtime_ns(struct intel_context *ce)
+{
+       u64 avg = ewma_runtime_read(&ce->stats.runtime.avg);
+
+       if (ce->ops->flags & COPS_RUNTIME_CYCLES)
+               avg *= ce->engine->gt->clock_period_ns;
+
+       return avg;
+}
+
 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
 #include "selftest_context.c"
 #endif
index d8c74bbf9aae2bd80358c907af868ea95b0757da..b7d3214d2cdd8a4d933308159101797720ecbf35 100644 (file)
@@ -351,18 +351,13 @@ intel_context_clear_nopreempt(struct intel_context *ce)
        clear_bit(CONTEXT_NOPREEMPT, &ce->flags);
 }
 
-static inline u64 intel_context_get_total_runtime_ns(struct intel_context *ce)
-{
-       const u32 period = ce->engine->gt->clock_period_ns;
-
-       return READ_ONCE(ce->runtime.total) * period;
-}
+u64 intel_context_get_total_runtime_ns(const struct intel_context *ce);
+u64 intel_context_get_avg_runtime_ns(struct intel_context *ce);
 
-static inline u64 intel_context_get_avg_runtime_ns(struct intel_context *ce)
+static inline u64 intel_context_clock(void)
 {
-       const u32 period = ce->engine->gt->clock_period_ns;
-
-       return mul_u32_u32(ewma_runtime_read(&ce->runtime.avg), period);
+       /* As we mix CS cycles with CPU clocks, use the raw monotonic clock. */
+       return ktime_get_raw_fast_ns();
 }
 
 #endif /* __INTEL_CONTEXT_H__ */
index 30cd81ad8911a1fc98bc14f460fc223035b44e60..09f82545789f194b305cbadd42305d82d66b22d4 100644 (file)
@@ -35,6 +35,9 @@ struct intel_context_ops {
 #define COPS_HAS_INFLIGHT_BIT 0
 #define COPS_HAS_INFLIGHT BIT(COPS_HAS_INFLIGHT_BIT)
 
+#define COPS_RUNTIME_CYCLES_BIT 1
+#define COPS_RUNTIME_CYCLES BIT(COPS_RUNTIME_CYCLES_BIT)
+
        int (*alloc)(struct intel_context *ce);
 
        void (*ban)(struct intel_context *ce, struct i915_request *rq);
@@ -134,14 +137,19 @@ struct intel_context {
        } lrc;
        u32 tag; /* cookie passed to HW to track this context on submission */
 
-       /* Time on GPU as tracked by the hw. */
-       struct {
-               struct ewma_runtime avg;
-               u64 total;
-               u32 last;
-               I915_SELFTEST_DECLARE(u32 num_underflow);
-               I915_SELFTEST_DECLARE(u32 max_underflow);
-       } runtime;
+       /** stats: Context GPU engine busyness tracking. */
+       struct intel_context_stats {
+               u64 active;
+
+               /* Time on GPU as tracked by the hw. */
+               struct {
+                       struct ewma_runtime avg;
+                       u64 total;
+                       u32 last;
+                       I915_SELFTEST_DECLARE(u32 num_underflow);
+                       I915_SELFTEST_DECLARE(u32 max_underflow);
+               } runtime;
+       } stats;
 
        unsigned int active_count; /* protected by timeline->mutex */
 
index e181029c1e3e46f6d465a75055086ca8a5607776..94d41a04dd75cc6a0baf6b7832bb4c4e3f640f10 100644 (file)
@@ -624,8 +624,6 @@ static void __execlists_schedule_out(struct i915_request * const rq,
                GEM_BUG_ON(test_bit(ccid - 1, &engine->context_tag));
                __set_bit(ccid - 1, &engine->context_tag);
        }
-
-       lrc_update_runtime(ce);
        intel_engine_context_out(engine);
        execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
        if (engine->fw_domain && !--engine->fw_active)
@@ -2004,8 +2002,23 @@ process_csb(struct intel_engine_cs *engine, struct i915_request **inactive)
         * and merits a fresh timeslice. We reinstall the timer after
         * inspecting the queue to see if we need to resumbit.
         */
-       if (*prev != *execlists->active) /* elide lite-restores */
+       if (*prev != *execlists->active) { /* elide lite-restores */
+               /*
+                * Note the inherent discrepancy between the HW runtime,
+                * recorded as part of the context switch, and the CPU
+                * adjustment for active contexts. We have to hope that
+                * the delay in processing the CS event is very small
+                * and consistent. It works to our advantage to have
+                * the CPU adjustment _undershoot_ (i.e. start later than)
+                * the CS timestamp so we never overreport the runtime
+                * and correct ourselves later when updating from HW.
+                */
+               if (*prev)
+                       lrc_runtime_stop((*prev)->context);
+               if (*execlists->active)
+                       lrc_runtime_start((*execlists->active)->context);
                new_timeslice(execlists);
+       }
 
        return inactive;
 }
@@ -2637,7 +2650,7 @@ unwind:
 }
 
 static const struct intel_context_ops execlists_context_ops = {
-       .flags = COPS_HAS_INFLIGHT,
+       .flags = COPS_HAS_INFLIGHT | COPS_RUNTIME_CYCLES,
 
        .alloc = execlists_context_alloc,
 
@@ -3695,7 +3708,7 @@ virtual_get_sibling(struct intel_engine_cs *engine, unsigned int sibling)
 }
 
 static const struct intel_context_ops virtual_context_ops = {
-       .flags = COPS_HAS_INFLIGHT,
+       .flags = COPS_HAS_INFLIGHT | COPS_RUNTIME_CYCLES,
 
        .alloc = virtual_context_alloc,
 
index 0db822c3b7e50f3bb4282c7fafb067583dae55c9..d5d1b04dbcad2ce714257b7fa792ace03a114a1e 100644 (file)
@@ -161,6 +161,10 @@ void intel_gt_init_clock_frequency(struct intel_gt *gt)
        if (gt->clock_frequency)
                gt->clock_period_ns = intel_gt_clock_interval_to_ns(gt, 1);
 
+       /* Icelake appears to use another fixed frequency for CTX_TIMESTAMP */
+       if (GRAPHICS_VER(gt->i915) == 11)
+               gt->clock_period_ns = NSEC_PER_SEC / 13750000;
+
        GT_TRACE(gt,
                 "Using clock frequency: %dkHz, period: %dns, wrap: %lldms\n",
                 gt->clock_frequency / 1000,
index dffef6ab4baf79bad26f7cda64ab25f9dbc18078..3f83a9038e13b7897f6274902b5f7bea787e5238 100644 (file)
@@ -778,7 +778,7 @@ static void init_common_regs(u32 * const regs,
                                           CTX_CTRL_RS_CTX_ENABLE);
        regs[CTX_CONTEXT_CONTROL] = ctl;
 
-       regs[CTX_TIMESTAMP] = ce->runtime.last;
+       regs[CTX_TIMESTAMP] = ce->stats.runtime.last;
 }
 
 static void init_wa_bb_regs(u32 * const regs,
@@ -1734,11 +1734,12 @@ err:
        }
 }
 
-static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
+static void st_runtime_underflow(struct intel_context_stats *stats, s32 dt)
 {
 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
-       ce->runtime.num_underflow++;
-       ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
+       stats->runtime.num_underflow++;
+       stats->runtime.max_underflow =
+               max_t(u32, stats->runtime.max_underflow, -dt);
 #endif
 }
 
@@ -1755,25 +1756,25 @@ static u32 lrc_get_runtime(const struct intel_context *ce)
 
 void lrc_update_runtime(struct intel_context *ce)
 {
+       struct intel_context_stats *stats = &ce->stats;
        u32 old;
        s32 dt;
 
-       if (intel_context_is_barrier(ce))
+       old = stats->runtime.last;
+       stats->runtime.last = lrc_get_runtime(ce);
+       dt = stats->runtime.last - old;
+       if (!dt)
                return;
 
-       old = ce->runtime.last;
-       ce->runtime.last = lrc_get_runtime(ce);
-       dt = ce->runtime.last - old;
-
        if (unlikely(dt < 0)) {
                CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
-                        old, ce->runtime.last, dt);
-               st_update_runtime_underflow(ce, dt);
+                        old, stats->runtime.last, dt);
+               st_runtime_underflow(stats, dt);
                return;
        }
 
-       ewma_runtime_add(&ce->runtime.avg, dt);
-       ce->runtime.total += dt;
+       ewma_runtime_add(&stats->runtime.avg, dt);
+       stats->runtime.total += dt;
 }
 
 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
index 6e4f9f58fca5b73bae16c5f4094a2bb935eeaf5f..7371bb5c8129348f96aefe3adbbee3c10c417b45 100644 (file)
 #include <linux/bitfield.h>
 #include <linux/types.h>
 
+#include "intel_context.h"
+
 struct drm_i915_gem_object;
 struct i915_gem_ww_ctx;
-struct intel_context;
 struct intel_engine_cs;
 struct intel_ring;
 struct kref;
@@ -120,4 +121,28 @@ static inline u32 lrc_desc_priority(int prio)
                return GEN12_CTX_PRIORITY_NORMAL;
 }
 
+static inline void lrc_runtime_start(struct intel_context *ce)
+{
+       struct intel_context_stats *stats = &ce->stats;
+
+       if (intel_context_is_barrier(ce))
+               return;
+
+       if (stats->active)
+               return;
+
+       WRITE_ONCE(stats->active, intel_context_clock());
+}
+
+static inline void lrc_runtime_stop(struct intel_context *ce)
+{
+       struct intel_context_stats *stats = &ce->stats;
+
+       if (!stats->active)
+               return;
+
+       lrc_update_runtime(ce);
+       WRITE_ONCE(stats->active, 0);
+}
+
 #endif /* __INTEL_LRC_H__ */
index 21c29d315cc0b7de45985313f9c2e644b5b24df1..6ba52ef1acb8e085ca205898bb68137755e8c1ea 100644 (file)
@@ -1753,8 +1753,8 @@ static int __live_pphwsp_runtime(struct intel_engine_cs *engine)
        if (IS_ERR(ce))
                return PTR_ERR(ce);
 
-       ce->runtime.num_underflow = 0;
-       ce->runtime.max_underflow = 0;
+       ce->stats.runtime.num_underflow = 0;
+       ce->stats.runtime.max_underflow = 0;
 
        do {
                unsigned int loop = 1024;
@@ -1792,11 +1792,11 @@ static int __live_pphwsp_runtime(struct intel_engine_cs *engine)
                intel_context_get_avg_runtime_ns(ce));
 
        err = 0;
-       if (ce->runtime.num_underflow) {
+       if (ce->stats.runtime.num_underflow) {
                pr_err("%s: pphwsp underflow %u time(s), max %u cycles!\n",
                       engine->name,
-                      ce->runtime.num_underflow,
-                      ce->runtime.max_underflow);
+                      ce->stats.runtime.num_underflow,
+                      ce->stats.runtime.max_underflow);
                GEM_TRACE_DUMP();
                err = -EOVERFLOW;
        }
index f41eb4d12b3459f18aa6355f21f34317047dbc13..7d09f92695af46389f2e8c7401d080105b919f59 100644 (file)
@@ -509,13 +509,10 @@ static void error_print_context(struct drm_i915_error_state_buf *m,
                                const char *header,
                                const struct i915_gem_context_coredump *ctx)
 {
-       const u32 period = to_gt(m->i915)->clock_period_ns;
-
        err_printf(m, "%s%s[%d] prio %d, guilty %d active %d, runtime total %lluns, avg %lluns\n",
                   header, ctx->comm, ctx->pid, ctx->sched_attr.priority,
                   ctx->guilty, ctx->active,
-                  ctx->total_runtime * period,
-                  mul_u32_u32(ctx->avg_runtime, period));
+                  ctx->total_runtime, ctx->avg_runtime);
 }
 
 static struct i915_vma_coredump *
@@ -1364,8 +1361,8 @@ static bool record_context(struct i915_gem_context_coredump *e,
        e->guilty = atomic_read(&ctx->guilty_count);
        e->active = atomic_read(&ctx->active_count);
 
-       e->total_runtime = rq->context->runtime.total;
-       e->avg_runtime = ewma_runtime_read(&rq->context->runtime.avg);
+       e->total_runtime = intel_context_get_total_runtime_ns(rq->context);
+       e->avg_runtime = intel_context_get_avg_runtime_ns(rq->context);
 
        simulated = i915_gem_context_no_error_capture(ctx);
 
index 09159ff01411ab31e9f65fc3908864e1d0947e31..72d86071c0d1b67e30e7f46fabec36b43c18e809 100644 (file)
@@ -94,7 +94,7 @@ struct intel_engine_coredump {
                char comm[TASK_COMM_LEN];
 
                u64 total_runtime;
-               u32 avg_runtime;
+               u64 avg_runtime;
 
                pid_t pid;
                int active;