drm/xe/uapi: Use hint for guc to set GT frequency
authorTejas Upadhyay <tejas.upadhyay@intel.com>
Fri, 28 Feb 2025 07:02:24 +0000 (12:32 +0530)
committerTejas Upadhyay <tejas.upadhyay@intel.com>
Wed, 5 Mar 2025 04:24:24 +0000 (09:54 +0530)
Allow user to provide a low latency hint. When set, KMD sends a hint
to GuC which results in special handling for that process. SLPC will
ramp the GT frequency aggressively every time it switches to this
process.

We need to enable the use of SLPC Compute strategy during init, but
it will apply only to processes that set this bit during process
creation.

Improvement with this approach as below:

Before,

:~$ NEOReadDebugKeys=1 EnableDirectSubmission=0 clpeak --kernel-latency
Platform: Intel(R) OpenCL Graphics
  Device: Intel(R) Graphics [0xe20b]
    Driver version  : 24.52.0 (Linux x64)
    Compute units   : 160
    Clock frequency : 2850 MHz
    Kernel launch latency : 283.16 us

After,

:~$ NEOReadDebugKeys=1 EnableDirectSubmission=0 clpeak --kernel-latency
Platform: Intel(R) OpenCL Graphics
  Device: Intel(R) Graphics [0xe20b]
    Driver version  : 24.52.0 (Linux x64)
    Compute units   : 160
    Clock frequency : 2850 MHz
    Kernel launch latency : 63.38 us

Compute PR: https://github.com/intel/compute-runtime/pull/794
Mesa PR: https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33214
IGT PR: https://patchwork.freedesktop.org/patch/639989/

V10(Lucas):
  - Remove doc from drm-uapi.rst
v9(Vinay):
  - remove extra line, align commit message
v8(Vinay):
  - Add separate example for using low latency hint
v7(Jose):
  - Update UMD PR
  - applicable to all gpus
V6:
  - init flags, remove redundant flags check (MAuld)
V5:
  - Move uapi doc to documentation and GuC ABI specific change (Rodrigo)
  - Modify logic to restrict exec queue flags (MAuld)
V4:
  - To make it clear, don't use exec queue word (Vinay)
  - Correct typo in description of flag (Jose/Vinay)
  - rename set_strategy api and replace ctx with exec queue(Vinay)
  - Start with 0th bit to identify user flags (Jose)
V3:
  - Convert user flag to kernel internal flag and use (Oak)
  - Support query config for user to check kernel support (Jose)
  - Don't need to take runtime pm (Vinay)
V2:
  - DRM_XE_EXEC_QUEUE_LOW_LATENCY_HINT 1 planned for other hint(Szymon)
  - Add motivation to description (Lucas)

Acked-by: Lucas De Marchi <lucas.demarchi@intel.com>
Reviewed-by: Vinay Belgaumkar <vinay.belgaumkar@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20250228070224.739295-2-tejas.upadhyay@intel.com
Signed-off-by: Tejas Upadhyay <tejas.upadhyay@intel.com>
drivers/gpu/drm/xe/abi/guc_actions_slpc_abi.h
drivers/gpu/drm/xe/xe_exec_queue.c
drivers/gpu/drm/xe/xe_exec_queue_types.h
drivers/gpu/drm/xe/xe_guc_pc.c
drivers/gpu/drm/xe/xe_guc_submit.c
drivers/gpu/drm/xe/xe_query.c
include/uapi/drm/xe_drm.h

index 85abe4f09ae27146ecf24180def9af466dc0522f..b28c8fa061f7be75972f66d44286b72d88d905ee 100644 (file)
@@ -174,6 +174,9 @@ struct slpc_task_state_data {
        };
 } __packed;
 
+#define SLPC_CTX_FREQ_REQ_IS_COMPUTE           REG_BIT(28)
+#define SLPC_OPTIMIZED_STRATEGY_COMPUTE                REG_BIT(0)
+
 struct slpc_shared_data_header {
        /* Total size in bytes of this shared buffer. */
        u32 size;
index 23a9f519ce1c7d2529519fdd686930dd61a970c6..7c5c003d3c408a1fd528f7f8aec03db44fbe225d 100644 (file)
@@ -604,11 +604,12 @@ int xe_exec_queue_create_ioctl(struct drm_device *dev, void *data,
        struct xe_tile *tile;
        struct xe_exec_queue *q = NULL;
        u32 logical_mask;
+       u32 flags = 0;
        u32 id;
        u32 len;
        int err;
 
-       if (XE_IOCTL_DBG(xe, args->flags) ||
+       if (XE_IOCTL_DBG(xe, args->flags & ~DRM_XE_EXEC_QUEUE_LOW_LATENCY_HINT) ||
            XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
                return -EINVAL;
 
@@ -625,6 +626,9 @@ int xe_exec_queue_create_ioctl(struct drm_device *dev, void *data,
        if (XE_IOCTL_DBG(xe, eci[0].gt_id >= xe->info.gt_count))
                return -EINVAL;
 
+       if (args->flags & DRM_XE_EXEC_QUEUE_LOW_LATENCY_HINT)
+               flags |= EXEC_QUEUE_FLAG_LOW_LATENCY;
+
        if (eci[0].engine_class == DRM_XE_ENGINE_CLASS_VM_BIND) {
                if (XE_IOCTL_DBG(xe, args->width != 1) ||
                    XE_IOCTL_DBG(xe, args->num_placements != 1) ||
@@ -633,8 +637,8 @@ int xe_exec_queue_create_ioctl(struct drm_device *dev, void *data,
 
                for_each_tile(tile, xe, id) {
                        struct xe_exec_queue *new;
-                       u32 flags = EXEC_QUEUE_FLAG_VM;
 
+                       flags |= EXEC_QUEUE_FLAG_VM;
                        if (id)
                                flags |= EXEC_QUEUE_FLAG_BIND_ENGINE_CHILD;
 
@@ -680,7 +684,7 @@ int xe_exec_queue_create_ioctl(struct drm_device *dev, void *data,
                }
 
                q = xe_exec_queue_create(xe, vm, logical_mask,
-                                        args->width, hwe, 0,
+                                        args->width, hwe, flags,
                                         args->extensions);
                up_read(&vm->lock);
                xe_vm_put(vm);
index 6eb7ff091534f42e0f9dddbff40109769080c100..cc1cffb5c87f1d0a4d2cea9e7b57cee9835762af 100644 (file)
@@ -85,6 +85,8 @@ struct xe_exec_queue {
 #define EXEC_QUEUE_FLAG_BIND_ENGINE_CHILD      BIT(3)
 /* kernel exec_queue only, set priority to highest level */
 #define EXEC_QUEUE_FLAG_HIGH_PRIORITY          BIT(4)
+/* flag to indicate low latency hint to GuC */
+#define EXEC_QUEUE_FLAG_LOW_LATENCY            BIT(5)
 
        /**
         * @flags: flags for this exec queue, should statically setup aside from ban
index 02409eedb91438dca98a0f5b5af4a11658167cad..25040efa043fa26fcc5b9f394767885b9017f35a 100644 (file)
@@ -995,6 +995,17 @@ out:
        return ret;
 }
 
+static int pc_action_set_strategy(struct xe_guc_pc *pc, u32 val)
+{
+       /*
+        * Thin wrapper around pc_action_set_param(): the strategy
+        * bitmask in @val is sent under SLPC_PARAM_STRATEGIES and
+        * the callee's return value is handed back unchanged, so
+        * the caller sees exactly what pc_action_set_param() returned.
+        */
+       return pc_action_set_param(pc, SLPC_PARAM_STRATEGIES, val);
+}
+
 /**
  * xe_guc_pc_start - Start GuC's Power Conservation component
  * @pc: Xe_GuC_PC instance
@@ -1054,6 +1065,11 @@ int xe_guc_pc_start(struct xe_guc_pc *pc)
        }
 
        ret = pc_action_setup_gucrc(pc, GUCRC_FIRMWARE_CONTROL);
+       if (ret)
+               goto out;
+
+       /* Enable SLPC Optimized Strategy for compute */
+       ret = pc_action_set_strategy(pc, SLPC_OPTIMIZED_STRATEGY_COMPUTE);
 
 out:
        xe_force_wake_put(gt_to_fw(gt), fw_ref);
index b6a2dd742ebdc08a02bb8cf29adb22902fc40c3c..b95934055f727baf975f8f642927e800010ba26e 100644 (file)
@@ -15,6 +15,7 @@
 #include <drm/drm_managed.h>
 
 #include "abi/guc_actions_abi.h"
+#include "abi/guc_actions_slpc_abi.h"
 #include "abi/guc_klvs_abi.h"
 #include "regs/xe_lrc_layout.h"
 #include "xe_assert.h"
@@ -400,6 +401,7 @@ static void __guc_exec_queue_policy_add_##func(struct exec_queue_policy *policy,
 MAKE_EXEC_QUEUE_POLICY_ADD(execution_quantum, EXECUTION_QUANTUM)
 MAKE_EXEC_QUEUE_POLICY_ADD(preemption_timeout, PREEMPTION_TIMEOUT)
 MAKE_EXEC_QUEUE_POLICY_ADD(priority, SCHEDULING_PRIORITY)
+MAKE_EXEC_QUEUE_POLICY_ADD(slpc_exec_queue_freq_req, SLPM_GT_FREQUENCY)
 #undef MAKE_EXEC_QUEUE_POLICY_ADD
 
 static const int xe_exec_queue_prio_to_guc[] = {
@@ -414,14 +416,20 @@ static void init_policies(struct xe_guc *guc, struct xe_exec_queue *q)
        struct exec_queue_policy policy;
        enum xe_exec_queue_priority prio = q->sched_props.priority;
        u32 timeslice_us = q->sched_props.timeslice_us;
+       u32 slpc_exec_queue_freq_req = 0;
        u32 preempt_timeout_us = q->sched_props.preempt_timeout_us;
 
        xe_gt_assert(guc_to_gt(guc), exec_queue_registered(q));
 
+       if (q->flags & EXEC_QUEUE_FLAG_LOW_LATENCY)
+               slpc_exec_queue_freq_req |= SLPC_CTX_FREQ_REQ_IS_COMPUTE;
+
        __guc_exec_queue_policy_start_klv(&policy, q->guc->id);
        __guc_exec_queue_policy_add_priority(&policy, xe_exec_queue_prio_to_guc[prio]);
        __guc_exec_queue_policy_add_execution_quantum(&policy, timeslice_us);
        __guc_exec_queue_policy_add_preemption_timeout(&policy, preempt_timeout_us);
+       __guc_exec_queue_policy_add_slpc_exec_queue_freq_req(&policy,
+                                                            slpc_exec_queue_freq_req);
 
        xe_guc_ct_send(&guc->ct, (u32 *)&policy.h2g,
                       __guc_exec_queue_policy_action_size(&policy), 0, 0);
index 781dd21682e56cf4349d1ef5398da89887c645c5..ce2a2767de1a3ae3d3161fda167faa1bad5acc92 100644 (file)
@@ -340,6 +340,8 @@ static int query_config(struct xe_device *xe, struct drm_xe_device_query *query)
        if (xe_device_get_root_tile(xe)->mem.vram.usable_size)
                config->info[DRM_XE_QUERY_CONFIG_FLAGS] =
                        DRM_XE_QUERY_CONFIG_FLAG_HAS_VRAM;
+       config->info[DRM_XE_QUERY_CONFIG_FLAGS] |=
+                       DRM_XE_QUERY_CONFIG_FLAG_HAS_LOW_LATENCY;
        config->info[DRM_XE_QUERY_CONFIG_MIN_ALIGNMENT] =
                xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K ? SZ_64K : SZ_4K;
        config->info[DRM_XE_QUERY_CONFIG_VA_BITS] = xe->info.va_bits;
index 76a462fae05ff8e2230dd5d390a5046a09a93631..d1f0018342b60bcba7f16168b4b2c96e0d275db0 100644 (file)
@@ -393,6 +393,8 @@ struct drm_xe_query_mem_regions {
  *
  *    - %DRM_XE_QUERY_CONFIG_FLAG_HAS_VRAM - Flag is set if the device
  *      has usable VRAM
+ *    - %DRM_XE_QUERY_CONFIG_FLAG_HAS_LOW_LATENCY - Flag is set if the device
+ *      has low latency hint support
  *  - %DRM_XE_QUERY_CONFIG_MIN_ALIGNMENT - Minimal memory alignment
  *    required by this device, typically SZ_4K or SZ_64K
  *  - %DRM_XE_QUERY_CONFIG_VA_BITS - Maximum bits of a virtual address
@@ -409,6 +411,7 @@ struct drm_xe_query_config {
 #define DRM_XE_QUERY_CONFIG_REV_AND_DEVICE_ID  0
 #define DRM_XE_QUERY_CONFIG_FLAGS                      1
        #define DRM_XE_QUERY_CONFIG_FLAG_HAS_VRAM       (1 << 0)
+       #define DRM_XE_QUERY_CONFIG_FLAG_HAS_LOW_LATENCY        (1 << 1)
 #define DRM_XE_QUERY_CONFIG_MIN_ALIGNMENT              2
 #define DRM_XE_QUERY_CONFIG_VA_BITS                    3
 #define DRM_XE_QUERY_CONFIG_MAX_EXEC_QUEUE_PRIORITY    4
@@ -1205,6 +1208,21 @@ struct drm_xe_vm_bind {
  *     };
  *     ioctl(fd, DRM_IOCTL_XE_EXEC_QUEUE_CREATE, &exec_queue_create);
  *
+ *     Allow users to provide a hint to the kernel for cases demanding a low
+ *     latency profile. Note that this will impact power consumption. Users
+ *     can indicate the low latency hint with a flag while creating an exec
+ *     queue, as shown below:
+ *
+ *     struct drm_xe_exec_queue_create exec_queue_create = {
+ *          .flags = DRM_XE_EXEC_QUEUE_LOW_LATENCY_HINT,
+ *          .extensions = 0,
+ *          .vm_id = vm,
+ *          .num_bb_per_exec = 1,
+ *          .num_eng_per_bb = 1,
+ *          .instances = to_user_pointer(&instance),
+ *     };
+ *     ioctl(fd, DRM_IOCTL_XE_EXEC_QUEUE_CREATE, &exec_queue_create);
+ *
  */
 struct drm_xe_exec_queue_create {
 #define DRM_XE_EXEC_QUEUE_EXTENSION_SET_PROPERTY               0
@@ -1223,7 +1241,8 @@ struct drm_xe_exec_queue_create {
        /** @vm_id: VM to use for this exec queue */
        __u32 vm_id;
 
-       /** @flags: MBZ */
+#define DRM_XE_EXEC_QUEUE_LOW_LATENCY_HINT     (1 << 0)
+       /** @flags: flags to use for this exec queue */
        __u32 flags;
 
        /** @exec_queue_id: Returned exec queue ID */