drm/amdkfd: Store queue cwsr area size to node properties
author Philip Yang <Philip.Yang@amd.com>
Wed, 26 Jun 2024 18:52:28 +0000 (14:52 -0400)
committer Alex Deucher <alexander.deucher@amd.com>
Wed, 24 Jul 2024 18:45:58 +0000 (14:45 -0400)
Use the Thunk's queue eop buffer size, cwsr area size and ctl stack
size calculations, and store the values in the KFD node properties.

These values will be used to validate the queue eop buffer size, cwsr
area size and ctl stack size when creating a KFD user compute queue.
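As a rough illustration, the queue creation path could compare the
user-supplied sizes against the stored properties along the lines
below. This is a minimal sketch, not the follow-up patch itself: it
reuses the existing queue_properties fields from kfd_priv.h, and
whether each check is an exact match or a lower bound is an assumption
here.

static int kfd_queue_size_check(const struct kfd_node_properties *props,
                                const struct queue_properties *q)
{
        /* EOP ring buffer must match the per-ASIC requirement */
        if (props->eop_buffer_size &&
            q->eop_ring_buffer_size != props->eop_buffer_size)
                return -EINVAL;

        /* CWSR area must cover ctl stack plus workgroup context data */
        if (props->cwsr_size &&
            q->ctx_save_restore_area_size < props->cwsr_size)
                return -EINVAL;

        /* Control stack size must match the computed value */
        if (props->ctl_stack_size &&
            q->ctl_stack_size != props->ctl_stack_size)
                return -EINVAL;

        return 0;
}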

They will also be exposed to user space via the sysfs KFD node
properties, so that the duplicate calculation code can be removed from
the Thunk.
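For example, once exposed, a user-space consumer such as the Thunk
could read the values from the topology node instead of recomputing
them. A sketch, assuming the sysfs property names match the new
struct kfd_node_properties fields (the sysfs plumbing itself is not
part of this patch):

#include <stdio.h>
#include <string.h>

/* Scan a KFD topology node's properties file for one "name value" pair */
static int kfd_read_node_property(int node, const char *name,
                                  unsigned long long *value)
{
        char path[128], key[64];
        unsigned long long val;
        FILE *f;

        snprintf(path, sizeof(path),
                 "/sys/devices/virtual/kfd/kfd/topology/nodes/%d/properties",
                 node);
        f = fopen(path, "r");
        if (!f)
                return -1;

        while (fscanf(f, "%63s %llu", key, &val) == 2) {
                if (!strcmp(key, name)) {
                        *value = val;
                        fclose(f);
                        return 0;
                }
        }

        fclose(f);
        return -1;
}

/* e.g. kfd_read_node_property(1, "cwsr_size", &cwsr_size) */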

Signed-off-by: Philip Yang <Philip.Yang@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdkfd/kfd_priv.h
drivers/gpu/drm/amd/amdkfd/kfd_queue.c
drivers/gpu/drm/amd/amdkfd/kfd_topology.c
drivers/gpu/drm/amd/amdkfd/kfd_topology.h

index c31589043d5bd21cc430f36f2220264f508442bb..b5cae48dff669939efbe4b4931766d2986c9d703 100644 (file)
@@ -1295,6 +1295,7 @@ int kfd_queue_buffer_get(struct amdgpu_vm *vm, void __user *addr, struct amdgpu_
 void kfd_queue_buffer_put(struct amdgpu_vm *vm, struct amdgpu_bo **bo);
 int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_properties *properties);
 int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct queue_properties *properties);
+void kfd_queue_ctx_save_restore_size(struct kfd_topology_device *dev);
 
 struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type,
                struct kfd_node *dev);
index 67242ce051b5c0fd97a21a370fbd68ff54f4b8ad..adcda9730c9feaa94884383fc6168ff05c7b64af 100644 (file)
@@ -24,6 +24,7 @@
 
 #include <linux/slab.h>
 #include "kfd_priv.h"
+#include "kfd_topology.h"
 #include "kfd_svm.h"
 
 void print_queue_properties(struct queue_properties *q)
@@ -305,3 +306,77 @@ int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct queue_prope
                                 properties->ctx_save_restore_area_size);
        return 0;
 }
+
+#define SGPR_SIZE_PER_CU       0x4000
+#define LDS_SIZE_PER_CU                0x10000
+#define HWREG_SIZE_PER_CU      0x1000
+#define DEBUGGER_BYTES_ALIGN   64
+#define DEBUGGER_BYTES_PER_WAVE        32
+
+static u32 kfd_get_vgpr_size_per_cu(u32 gfxv)
+{
+       u32 vgpr_size = 0x40000;
+
+       if ((gfxv / 100 * 100) == 90400 ||      /* GFX_VERSION_AQUA_VANJARAM */
+           gfxv == 90010 ||                    /* GFX_VERSION_ALDEBARAN */
+           gfxv == 90008)                      /* GFX_VERSION_ARCTURUS */
+               vgpr_size = 0x80000;
+       else if (gfxv == 110000 ||              /* GFX_VERSION_PLUM_BONITO */
+                gfxv == 110001 ||              /* GFX_VERSION_WHEAT_NAS */
+                gfxv == 120000 ||              /* GFX_VERSION_GFX1200 */
+                gfxv == 120001)                /* GFX_VERSION_GFX1201 */
+               vgpr_size = 0x60000;
+
+       return vgpr_size;
+}
+
+#define WG_CONTEXT_DATA_SIZE_PER_CU(gfxv)      \
+       (kfd_get_vgpr_size_per_cu(gfxv) + SGPR_SIZE_PER_CU +\
+        LDS_SIZE_PER_CU + HWREG_SIZE_PER_CU)
+
+#define CNTL_STACK_BYTES_PER_WAVE(gfxv)        \
+       ((gfxv) >= 100100 ? 12 : 8)     /* GFX_VERSION_NAVI10 */
+
+#define SIZEOF_HSA_USER_CONTEXT_SAVE_AREA_HEADER 40
+
+void kfd_queue_ctx_save_restore_size(struct kfd_topology_device *dev)
+{
+       struct kfd_node_properties *props = &dev->node_props;
+       u32 gfxv = props->gfx_target_version;
+       u32 ctl_stack_size;
+       u32 wg_data_size;
+       u32 wave_num;
+       u32 cu_num;
+
+       if (gfxv < 80001)       /* GFX_VERSION_CARRIZO */
+               return;
+
+       cu_num = props->simd_count / props->simd_per_cu / NUM_XCC(dev->gpu->xcc_mask);
+       wave_num = (gfxv < 100100) ?    /* GFX_VERSION_NAVI10 */
+                   min(cu_num * 40, props->array_count / props->simd_arrays_per_engine * 512)
+                   : cu_num * 32;
+
+       wg_data_size = ALIGN(cu_num * WG_CONTEXT_DATA_SIZE_PER_CU(gfxv), PAGE_SIZE);
+       ctl_stack_size = wave_num * CNTL_STACK_BYTES_PER_WAVE(gfxv) + 8;
+       ctl_stack_size = ALIGN(SIZEOF_HSA_USER_CONTEXT_SAVE_AREA_HEADER + ctl_stack_size,
+                              PAGE_SIZE);
+
+       if ((gfxv / 10000 * 10000) == 100000) {
+               /* HW design limits control stack size to 0x7000.
+                * This is insufficient for theoretical PM4 cases
+                * but sufficient for AQL, limited by SPI events.
+                */
+               ctl_stack_size = min(ctl_stack_size, 0x7000);
+       }
+
+       props->ctl_stack_size = ctl_stack_size;
+       props->debug_memory_size = ALIGN(wave_num * DEBUGGER_BYTES_PER_WAVE, DEBUGGER_BYTES_ALIGN);
+       props->cwsr_size = ctl_stack_size + wg_data_size;
+
+       if (gfxv == 80002)      /* GFX_VERSION_TONGA */
+               props->eop_buffer_size = 0x8000;
+       else if ((gfxv / 100 * 100) == 90400)   /* GFX_VERSION_AQUA_VANJARAM */
+               props->eop_buffer_size = 4096;
+       else if (gfxv >= 80000)
+               props->eop_buffer_size = 4096;
+}
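To make the arithmetic above concrete, here is a stand-alone sketch
that mirrors the formulas for a hypothetical Aldebaran-class node
(gfxv 90010) with 104 CUs. The CU count is illustrative, and it
assumes 4 KiB pages and that the cu_num term wins the wave_num min().

#include <stdio.h>

#define PAGE_SIZE       0x1000u
#define ALIGN(x, a)     (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
        unsigned int cu_num = 104;              /* hypothetical CU count */
        unsigned int vgpr_size = 0x80000;       /* kfd_get_vgpr_size_per_cu(90010) */
        /* VGPR + SGPR + LDS + HWREG, per WG_CONTEXT_DATA_SIZE_PER_CU() */
        unsigned int wg_per_cu = vgpr_size + 0x4000 + 0x10000 + 0x1000;
        unsigned int wave_num = cu_num * 40;    /* pre-gfx10: 40 waves per CU */
        unsigned int wg_data_size = ALIGN(cu_num * wg_per_cu, PAGE_SIZE);
        /* 40-byte header + 8 bytes per wave + 8, page aligned */
        unsigned int ctl_stack_size = ALIGN(40 + wave_num * 8 + 8, PAGE_SIZE);

        printf("ctl_stack_size: 0x%x\n", ctl_stack_size);      /* 0x9000 */
        printf("cwsr_size:      0x%x\n", ctl_stack_size + wg_data_size);
                                        /* 0x3c91000, ~60.6 MiB per queue */
        return 0;
}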
index 6f89b06f89d3803a4028e869e99fb0f72a3b9beb..a9b3eda65a2ccbf16fcfc4f1c6ab302d77e69d50 100644 (file)
@@ -2120,6 +2120,8 @@ int kfd_topology_add_device(struct kfd_node *gpu)
                dev->gpu->adev->gmc.xgmi.connected_to_cpu)
                dev->node_props.capability |= HSA_CAP_FLAGS_COHERENTHOSTACCESS;
 
+       kfd_queue_ctx_save_restore_size(dev);
+
        kfd_debug_print_topology();
 
        kfd_notify_gpu_change(gpu_id, 1);
index 2d1c9d771bef2df57cbac276b151a7bb5fb766a4..43ba0d32e5bd7145d41855eb07ed3ff5c6a23b66 100644 (file)
@@ -74,6 +74,10 @@ struct kfd_node_properties {
        uint32_t num_sdma_xgmi_engines;
        uint32_t num_sdma_queues_per_engine;
        uint32_t num_cp_queues;
+       uint32_t cwsr_size;
+       uint32_t ctl_stack_size;
+       uint32_t eop_buffer_size;
+       uint32_t debug_memory_size;
        char name[KFD_TOPOLOGY_PUBLIC_NAME_SIZE];
 };