drm/i915/perf: implement active wait for noa configurations
authorLionel Landwerlin <lionel.g.landwerlin@intel.com>
Sat, 12 Oct 2019 07:23:07 +0000 (08:23 +0100)
committerChris Wilson <chris@chris-wilson.co.uk>
Sat, 12 Oct 2019 08:08:33 +0000 (09:08 +0100)
NOA configuration take some amount of time to apply. That amount of
time depends on the size of the GT. There is no documented time for
this. For example, past experimentations with powergating
configuration changes seem to indicate a 60~70us delay. We go with
500us as default for now which should be over the required amount of
time (according to HW architects).

v2: Don't forget to save/restore registers used for the wait (Chris)

v3: Name used CS_GPR registers (Chris)
    Fix compile issue due to rebase (Lionel)

v4: Fix save/restore helpers (Umesh)

v5: Move noa_wait from drm_i915_private to i915_perf_stream (Lionel)

v6: Add missing struct declarations in i915_perf.h

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20191012072308.30312-2-chris@chris-wilson.co.uk
drivers/gpu/drm/i915/gt/intel_gpu_commands.h
drivers/gpu/drm/i915/gt/intel_gt_types.h
drivers/gpu/drm/i915/i915_debugfs.c
drivers/gpu/drm/i915/i915_perf.c
drivers/gpu/drm/i915/i915_perf_types.h
drivers/gpu/drm/i915/i915_reg.h
drivers/gpu/drm/i915/selftests/i915_live_selftests.h
drivers/gpu/drm/i915/selftests/i915_perf.c [new file with mode: 0644]

index 0987100c786b0e9c4831483e5f7491cf2df8e6a7..8e63cffcabe06c28be94e8c5ea014a054656d65e 100644 (file)
 #define MI_BATCH_BUFFER_START  MI_INSTR(0x31, 0)
 #define   MI_BATCH_GTT             (2<<6) /* aliased with (1<<7) on gen4 */
 #define MI_BATCH_BUFFER_START_GEN8     MI_INSTR(0x31, 1)
-#define   MI_BATCH_RESOURCE_STREAMER (1<<10)
+#define   MI_BATCH_RESOURCE_STREAMER REG_BIT(10)
+#define   MI_BATCH_PREDICATE         REG_BIT(15) /* HSW+ on RCS only*/
 
 /*
  * 3D instructions used by the kernel
 #define   PIPE_CONTROL_CS_STALL                                (1<<20)
 #define   PIPE_CONTROL_TLB_INVALIDATE                  (1<<18)
 #define   PIPE_CONTROL_MEDIA_STATE_CLEAR               (1<<16)
+#define   PIPE_CONTROL_WRITE_TIMESTAMP                 (3<<14)
 #define   PIPE_CONTROL_QW_WRITE                                (1<<14)
 #define   PIPE_CONTROL_POST_SYNC_OP_MASK                (3<<14)
 #define   PIPE_CONTROL_DEPTH_STALL                     (1<<13)
index 802f516a34301f739b1c814d8d095230e0bde02e..be4b263621c8fddee7cb124c1960248cd0f7636e 100644 (file)
@@ -109,6 +109,11 @@ enum intel_gt_scratch_field {
        /* 8 bytes */
        INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA = 256,
 
+       /* 6 * 8 bytes */
+       INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR = 2048,
+
+       /* 4 bytes */
+       INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1 = 2096,
 };
 
 #endif /* __INTEL_GT_TYPES_H__ */
index e575761550ac56af5eb6de71e5d8d493e8f9aae8..a541b6ae534fa76593c69ed9459a092d3c409945 100644 (file)
@@ -3590,6 +3590,37 @@ DEFINE_SIMPLE_ATTRIBUTE(i915_wedged_fops,
                        i915_wedged_get, i915_wedged_set,
                        "%llu\n");
 
+static int
+i915_perf_noa_delay_set(void *data, u64 val)
+{
+       struct drm_i915_private *i915 = data;
+       const u32 clk = RUNTIME_INFO(i915)->cs_timestamp_frequency_khz;
+
+       /*
+        * This would lead to infinite waits as we're doing timestamp
+        * difference on the CS with only 32bits.
+        */
+       if (val > mul_u32_u32(U32_MAX, clk))
+               return -EINVAL;
+
+       atomic64_set(&i915->perf.noa_programming_delay, val);
+       return 0;
+}
+
+static int
+i915_perf_noa_delay_get(void *data, u64 *val)
+{
+       struct drm_i915_private *i915 = data;
+
+       *val = atomic64_read(&i915->perf.noa_programming_delay);
+       return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(i915_perf_noa_delay_fops,
+                       i915_perf_noa_delay_get,
+                       i915_perf_noa_delay_set,
+                       "%llu\n");
+
 #define DROP_UNBOUND   BIT(0)
 #define DROP_BOUND     BIT(1)
 #define DROP_RETIRE    BIT(2)
@@ -4345,6 +4376,7 @@ static const struct i915_debugfs_files {
        const char *name;
        const struct file_operations *fops;
 } i915_debugfs_files[] = {
+       {"i915_perf_noa_delay", &i915_perf_noa_delay_fops},
        {"i915_wedged", &i915_wedged_fops},
        {"i915_cache_sharing", &i915_cache_sharing_fops},
        {"i915_gem_drop_caches", &i915_drop_caches_fops},
index 50f2f972020df31071a17d6d29c8e7bc5f3f3159..81e8a79340019968cdfbb90805b250a20871380f 100644 (file)
 #include "gem/i915_gem_context.h"
 #include "gt/intel_engine_pm.h"
 #include "gt/intel_engine_user.h"
+#include "gt/intel_gt.h"
 #include "gt/intel_lrc_reg.h"
 
 #include "i915_drv.h"
@@ -1347,6 +1348,12 @@ free_oa_configs(struct i915_perf_stream *stream)
                free_oa_config_bo(oa_bo);
 }
 
+static void
+free_noa_wait(struct i915_perf_stream *stream)
+{
+       i915_vma_unpin_and_release(&stream->noa_wait, 0);
+}
+
 static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
 {
        struct i915_perf *perf = stream->perf;
@@ -1369,6 +1376,7 @@ static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
                oa_put_render_ctx_id(stream);
 
        free_oa_configs(stream);
+       free_noa_wait(stream);
 
        if (perf->spurious_report_rs.missed) {
                DRM_NOTE("%d spurious OA report notices suppressed due to ratelimiting\n",
@@ -1529,6 +1537,206 @@ err_unref:
        return ret;
 }
 
+static u32 *save_restore_register(struct i915_perf_stream *stream, u32 *cs,
+                                 bool save, i915_reg_t reg, u32 offset,
+                                 u32 dword_count)
+{
+       u32 cmd;
+       u32 d;
+
+       cmd = save ? MI_STORE_REGISTER_MEM : MI_LOAD_REGISTER_MEM;
+       if (INTEL_GEN(stream->perf->i915) >= 8)
+               cmd++;
+
+       for (d = 0; d < dword_count; d++) {
+               *cs++ = cmd;
+               *cs++ = i915_mmio_reg_offset(reg) + 4 * d;
+               *cs++ = intel_gt_scratch_offset(stream->engine->gt,
+                                               offset) + 4 * d;
+               *cs++ = 0;
+       }
+
+       return cs;
+}
+
+static int alloc_noa_wait(struct i915_perf_stream *stream)
+{
+       struct drm_i915_private *i915 = stream->perf->i915;
+       struct drm_i915_gem_object *bo;
+       struct i915_vma *vma;
+       const u64 delay_ticks = 0xffffffffffffffff -
+               DIV64_U64_ROUND_UP(
+                       atomic64_read(&stream->perf->noa_programming_delay) *
+                       RUNTIME_INFO(i915)->cs_timestamp_frequency_khz,
+                       1000000ull);
+       const u32 base = stream->engine->mmio_base;
+#define CS_GPR(x) GEN8_RING_CS_GPR(base, x)
+       u32 *batch, *ts0, *cs, *jump;
+       int ret, i;
+       enum {
+               START_TS,
+               NOW_TS,
+               DELTA_TS,
+               JUMP_PREDICATE,
+               DELTA_TARGET,
+               N_CS_GPR
+       };
+
+       bo = i915_gem_object_create_internal(i915, 4096);
+       if (IS_ERR(bo)) {
+               DRM_ERROR("Failed to allocate NOA wait batchbuffer\n");
+               return PTR_ERR(bo);
+       }
+
+       /*
+        * We pin in GGTT because we jump into this buffer now because
+        * multiple OA config BOs will have a jump to this address and it
+        * needs to be fixed during the lifetime of the i915/perf stream.
+        */
+       vma = i915_gem_object_ggtt_pin(bo, NULL, 0, 0, PIN_HIGH);
+       if (IS_ERR(vma)) {
+               ret = PTR_ERR(vma);
+               goto err_unref;
+       }
+
+       batch = cs = i915_gem_object_pin_map(bo, I915_MAP_WB);
+       if (IS_ERR(batch)) {
+               ret = PTR_ERR(batch);
+               goto err_unpin;
+       }
+
+       /* Save registers. */
+       for (i = 0; i < N_CS_GPR; i++)
+               cs = save_restore_register(
+                       stream, cs, true /* save */, CS_GPR(i),
+                       INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR + 8 * i, 2);
+       cs = save_restore_register(
+               stream, cs, true /* save */, MI_PREDICATE_RESULT_1,
+               INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1, 1);
+
+       /* First timestamp snapshot location. */
+       ts0 = cs;
+
+       /*
+        * Initial snapshot of the timestamp register to implement the wait.
+        * We work with 32b values, so clear out the top 32b bits of the
+        * register because the ALU works 64bits.
+        */
+       *cs++ = MI_LOAD_REGISTER_IMM(1);
+       *cs++ = i915_mmio_reg_offset(CS_GPR(START_TS)) + 4;
+       *cs++ = 0;
+       *cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
+       *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP(base));
+       *cs++ = i915_mmio_reg_offset(CS_GPR(START_TS));
+
+       /*
+        * This is the location we're going to jump back into until the
+        * required amount of time has passed.
+        */
+       jump = cs;
+
+       /*
+        * Take another snapshot of the timestamp register. Take care to clear
+        * up the top 32bits of CS_GPR(1) as we're using it for other
+        * operations below.
+        */
+       *cs++ = MI_LOAD_REGISTER_IMM(1);
+       *cs++ = i915_mmio_reg_offset(CS_GPR(NOW_TS)) + 4;
+       *cs++ = 0;
+       *cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
+       *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP(base));
+       *cs++ = i915_mmio_reg_offset(CS_GPR(NOW_TS));
+
+       /*
+        * Do a diff between the 2 timestamps and store the result back into
+        * CS_GPR(1).
+        */
+       *cs++ = MI_MATH(5);
+       *cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(NOW_TS));
+       *cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(START_TS));
+       *cs++ = MI_MATH_SUB;
+       *cs++ = MI_MATH_STORE(MI_MATH_REG(DELTA_TS), MI_MATH_REG_ACCU);
+       *cs++ = MI_MATH_STORE(MI_MATH_REG(JUMP_PREDICATE), MI_MATH_REG_CF);
+
+       /*
+        * Transfer the carry flag (set to 1 if ts1 < ts0, meaning the
+        * timestamp have rolled over the 32bits) into the predicate register
+        * to be used for the predicated jump.
+        */
+       *cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
+       *cs++ = i915_mmio_reg_offset(CS_GPR(JUMP_PREDICATE));
+       *cs++ = i915_mmio_reg_offset(MI_PREDICATE_RESULT_1);
+
+       /* Restart from the beginning if we had timestamps roll over. */
+       *cs++ = (INTEL_GEN(i915) < 8 ?
+                MI_BATCH_BUFFER_START :
+                MI_BATCH_BUFFER_START_GEN8) |
+               MI_BATCH_PREDICATE;
+       *cs++ = i915_ggtt_offset(vma) + (ts0 - batch) * 4;
+       *cs++ = 0;
+
+       /*
+        * Now add the diff between to previous timestamps and add it to :
+        *      (((1 * << 64) - 1) - delay_ns)
+        *
+        * When the Carry Flag contains 1 this means the elapsed time is
+        * longer than the expected delay, and we can exit the wait loop.
+        */
+       *cs++ = MI_LOAD_REGISTER_IMM(2);
+       *cs++ = i915_mmio_reg_offset(CS_GPR(DELTA_TARGET));
+       *cs++ = lower_32_bits(delay_ticks);
+       *cs++ = i915_mmio_reg_offset(CS_GPR(DELTA_TARGET)) + 4;
+       *cs++ = upper_32_bits(delay_ticks);
+
+       *cs++ = MI_MATH(4);
+       *cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(DELTA_TS));
+       *cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(DELTA_TARGET));
+       *cs++ = MI_MATH_ADD;
+       *cs++ = MI_MATH_STOREINV(MI_MATH_REG(JUMP_PREDICATE), MI_MATH_REG_CF);
+
+       /*
+        * Transfer the result into the predicate register to be used for the
+        * predicated jump.
+        */
+       *cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
+       *cs++ = i915_mmio_reg_offset(CS_GPR(JUMP_PREDICATE));
+       *cs++ = i915_mmio_reg_offset(MI_PREDICATE_RESULT_1);
+
+       /* Predicate the jump.  */
+       *cs++ = (INTEL_GEN(i915) < 8 ?
+                MI_BATCH_BUFFER_START :
+                MI_BATCH_BUFFER_START_GEN8) |
+               MI_BATCH_PREDICATE;
+       *cs++ = i915_ggtt_offset(vma) + (jump - batch) * 4;
+       *cs++ = 0;
+
+       /* Restore registers. */
+       for (i = 0; i < N_CS_GPR; i++)
+               cs = save_restore_register(
+                       stream, cs, false /* restore */, CS_GPR(i),
+                       INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR + 8 * i, 2);
+       cs = save_restore_register(
+               stream, cs, false /* restore */, MI_PREDICATE_RESULT_1,
+               INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1, 1);
+
+       /* And return to the ring. */
+       *cs++ = MI_BATCH_BUFFER_END;
+
+       GEM_BUG_ON(cs - batch > PAGE_SIZE / sizeof(*batch));
+
+       i915_gem_object_flush_map(bo);
+       i915_gem_object_unpin_map(bo);
+
+       stream->noa_wait = vma;
+       return 0;
+
+err_unpin:
+       __i915_vma_unpin(vma);
+err_unref:
+       i915_gem_object_put(bo);
+       return ret;
+}
+
 static void config_oa_regs(struct intel_uncore *uncore,
                           const struct i915_oa_reg *regs,
                           u32 n_regs)
@@ -2206,6 +2414,12 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
                }
        }
 
+       ret = alloc_noa_wait(stream);
+       if (ret) {
+               DRM_DEBUG("Unable to allocate NOA wait batch buffer\n");
+               goto err_noa_wait_alloc;
+       }
+
        stream->oa_config = i915_perf_get_oa_config(perf, props->metrics_set);
        if (!stream->oa_config) {
                DRM_DEBUG("Invalid OA config id=%i\n", props->metrics_set);
@@ -2265,6 +2479,9 @@ err_oa_buf_alloc:
        intel_engine_pm_put(stream->engine);
 
 err_config:
+       free_noa_wait(stream);
+
+err_noa_wait_alloc:
        if (stream->ctx)
                oa_put_render_ctx_id(stream);
 
@@ -3651,6 +3868,9 @@ void i915_perf_init(struct drm_i915_private *i915)
                ratelimit_set_flags(&perf->spurious_report_rs,
                                    RATELIMIT_MSG_ON_RELEASE);
 
+               atomic64_set(&perf->noa_programming_delay,
+                            500 * 1000 /* 500us */);
+
                perf->i915 = i915;
        }
 }
@@ -3680,3 +3900,7 @@ void i915_perf_fini(struct drm_i915_private *i915)
        memset(&perf->ops, 0, sizeof(perf->ops));
        perf->i915 = NULL;
 }
+
+#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
+#include "selftests/i915_perf.c"
+#endif
index 337cd7d2ad779f49ece0dc9289664b8de5e488dc..d35a3c1946c35030ad7a6a2f9559db3606ae4107 100644 (file)
@@ -266,6 +266,12 @@ struct i915_perf_stream {
                 */
                u32 head;
        } oa_buffer;
+
+       /**
+        * A batch buffer doing a wait on the GPU for the NOA logic to be
+        * reprogrammed.
+        */
+       struct i915_vma *noa_wait;
 };
 
 /**
@@ -385,6 +391,8 @@ struct i915_perf {
 
        struct i915_oa_ops ops;
        const struct i915_oa_format *oa_formats;
+
+       atomic64_t noa_programming_delay;
 };
 
 #endif /* _I915_PERF_TYPES_H_ */
index 0fb9030b89f1d57b957e778e8c345b0835623b48..e24991e548973700f4abffc2bd6cb84fe43cd0a0 100644 (file)
@@ -545,7 +545,9 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define MI_PREDICATE_SRC0_UDW  _MMIO(0x2400 + 4)
 #define MI_PREDICATE_SRC1      _MMIO(0x2408)
 #define MI_PREDICATE_SRC1_UDW  _MMIO(0x2408 + 4)
-
+#define MI_PREDICATE_DATA       _MMIO(0x2410)
+#define MI_PREDICATE_RESULT     _MMIO(0x2418)
+#define MI_PREDICATE_RESULT_1   _MMIO(0x241c)
 #define MI_PREDICATE_RESULT_2  _MMIO(0x2214)
 #define  LOWER_SLICE_ENABLED   (1 << 0)
 #define  LOWER_SLICE_DISABLED  (0 << 0)
index 6713efea350b706ea89a5ffc428857b7fc8b0595..6daf6599ec79d1a6a52819a464badcce80248557 100644 (file)
@@ -35,3 +35,4 @@ selftest(reset, intel_reset_live_selftests)
 selftest(hangcheck, intel_hangcheck_live_selftests)
 selftest(execlists, intel_execlists_live_selftests)
 selftest(guc, intel_guc_live_selftest)
+selftest(perf, i915_perf_live_selftests)
diff --git a/drivers/gpu/drm/i915/selftests/i915_perf.c b/drivers/gpu/drm/i915/selftests/i915_perf.c
new file mode 100644 (file)
index 0000000..dc6d689
--- /dev/null
@@ -0,0 +1,216 @@
+/*
+ * SPDX-License-Identifier: MIT
+ *
+ * Copyright © 2019 Intel Corporation
+ */
+
+#include <linux/kref.h>
+
+#include "gem/i915_gem_pm.h"
+#include "gt/intel_gt.h"
+
+#include "i915_selftest.h"
+
+#include "igt_flush_test.h"
+#include "lib_sw_fence.h"
+
+static struct i915_perf_stream *
+test_stream(struct i915_perf *perf)
+{
+       struct drm_i915_perf_open_param param = {};
+       struct perf_open_properties props = {
+               .engine = intel_engine_lookup_user(perf->i915,
+                                                  I915_ENGINE_CLASS_RENDER,
+                                                  0),
+               .sample_flags = SAMPLE_OA_REPORT,
+               .oa_format = I915_OA_FORMAT_C4_B8,
+               .metrics_set = 1,
+       };
+       struct i915_perf_stream *stream;
+
+       stream = kzalloc(sizeof(*stream), GFP_KERNEL);
+       if (!stream)
+               return NULL;
+
+       stream->perf = perf;
+
+       mutex_lock(&perf->lock);
+       if (i915_oa_stream_init(stream, &param, &props)) {
+               kfree(stream);
+               stream =  NULL;
+       }
+       mutex_unlock(&perf->lock);
+
+       return stream;
+}
+
+static void stream_destroy(struct i915_perf_stream *stream)
+{
+       struct i915_perf *perf = stream->perf;
+
+       mutex_lock(&perf->lock);
+       i915_perf_destroy_locked(stream);
+       mutex_unlock(&perf->lock);
+}
+
+static int live_sanitycheck(void *arg)
+{
+       struct drm_i915_private *i915 = arg;
+       struct i915_perf_stream *stream;
+
+       /* Quick check we can create a perf stream */
+
+       stream = test_stream(&i915->perf);
+       if (!stream)
+               return -EINVAL;
+
+       stream_destroy(stream);
+       return 0;
+}
+
+static int write_timestamp(struct i915_request *rq, int slot)
+{
+       u32 *cs;
+       int len;
+
+       cs = intel_ring_begin(rq, 6);
+       if (IS_ERR(cs))
+               return PTR_ERR(cs);
+
+       len = 5;
+       if (INTEL_GEN(rq->i915) >= 8)
+               len++;
+
+       *cs++ = GFX_OP_PIPE_CONTROL(len);
+       *cs++ = PIPE_CONTROL_GLOBAL_GTT_IVB |
+               PIPE_CONTROL_STORE_DATA_INDEX |
+               PIPE_CONTROL_WRITE_TIMESTAMP;
+       *cs++ = slot * sizeof(u32);
+       *cs++ = 0;
+       *cs++ = 0;
+       *cs++ = 0;
+
+       intel_ring_advance(rq, cs);
+
+       return 0;
+}
+
+static ktime_t poll_status(struct i915_request *rq, int slot)
+{
+       while (!intel_read_status_page(rq->engine, slot) &&
+              !i915_request_completed(rq))
+               cpu_relax();
+
+       return ktime_get();
+}
+
+static int live_noa_delay(void *arg)
+{
+       struct drm_i915_private *i915 = arg;
+       struct i915_perf_stream *stream;
+       struct i915_request *rq;
+       ktime_t t0, t1;
+       u64 expected;
+       u32 delay;
+       int err;
+       int i;
+
+       /* Check that the GPU delays matches expectations */
+
+       stream = test_stream(&i915->perf);
+       if (!stream)
+               return -ENOMEM;
+
+       expected = atomic64_read(&stream->perf->noa_programming_delay);
+
+       if (stream->engine->class != RENDER_CLASS) {
+               err = -ENODEV;
+               goto out;
+       }
+
+       for (i = 0; i < 4; i++)
+               intel_write_status_page(stream->engine, 0x100 + i, 0);
+
+       rq = i915_request_create(stream->engine->kernel_context);
+       if (IS_ERR(rq)) {
+               err = PTR_ERR(rq);
+               goto out;
+       }
+
+       if (rq->engine->emit_init_breadcrumb &&
+           i915_request_timeline(rq)->has_initial_breadcrumb) {
+               err = rq->engine->emit_init_breadcrumb(rq);
+               if (err) {
+                       i915_request_add(rq);
+                       goto out;
+               }
+       }
+
+       err = write_timestamp(rq, 0x100);
+       if (err) {
+               i915_request_add(rq);
+               goto out;
+       }
+
+       err = rq->engine->emit_bb_start(rq,
+                                       i915_ggtt_offset(stream->noa_wait), 0,
+                                       I915_DISPATCH_SECURE);
+       if (err) {
+               i915_request_add(rq);
+               goto out;
+       }
+
+       err = write_timestamp(rq, 0x102);
+       if (err) {
+               i915_request_add(rq);
+               goto out;
+       }
+
+       i915_request_get(rq);
+       i915_request_add(rq);
+
+       preempt_disable();
+       t0 = poll_status(rq, 0x100);
+       t1 = poll_status(rq, 0x102);
+       preempt_enable();
+
+       pr_info("CPU delay: %lluns, expected %lluns\n",
+               ktime_sub(t1, t0), expected);
+
+       delay = intel_read_status_page(stream->engine, 0x102);
+       delay -= intel_read_status_page(stream->engine, 0x100);
+       delay = div_u64(mul_u32_u32(delay, 1000 * 1000),
+                       RUNTIME_INFO(i915)->cs_timestamp_frequency_khz);
+       pr_info("GPU delay: %uns, expected %lluns\n",
+               delay, expected);
+
+       if (4 * delay < 3 * expected || 2 * delay > 3 * expected) {
+               pr_err("GPU delay [%uus] outside of expected threshold! [%lluus, %lluus]\n",
+                      delay / 1000,
+                      div_u64(3 * expected, 4000),
+                      div_u64(3 * expected, 2000));
+               err = -EINVAL;
+       }
+
+       i915_request_put(rq);
+out:
+       stream_destroy(stream);
+       return err;
+}
+
+int i915_perf_live_selftests(struct drm_i915_private *i915)
+{
+       static const struct i915_subtest tests[] = {
+               SUBTEST(live_sanitycheck),
+               SUBTEST(live_noa_delay),
+       };
+       struct i915_perf *perf = &i915->perf;
+
+       if (!perf->metrics_kobj || !perf->ops.enable_metric_set)
+               return 0;
+
+       if (intel_gt_is_wedged(&i915->gt))
+               return 0;
+
+       return i915_subtests(tests, i915);
+}