drm/amdgpu: Store CU info from all XCCs for GFX v9.4.3
author: Mukul Joshi <mukul.joshi@amd.com>
Fri, 25 Aug 2023 15:59:09 +0000 (11:59 -0400)
committer: Alex Deucher <alexander.deucher@amd.com>
Mon, 11 Sep 2023 21:10:19 +0000 (17:10 -0400)
Currently, we store CU info for only a single XCC, assuming
that it is the same for all XCCs. However, that may not be
true. Therefore, store CU info for all XCCs. This info is
later used for CU masking.

Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
14 files changed:
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c
drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
drivers/gpu/drm/amd/amdkfd/kfd_crat.c
drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
drivers/gpu/drm/amd/amdkfd/kfd_topology.c
drivers/gpu/drm/amd/include/kgd_kfd_interface.h

index cdf6087706aa832a3cd6c67378877b50be9f1910..25d5fda5b243e3c598c141952f9537fe3c9652d7 100644 (file)
@@ -478,7 +478,7 @@ void amdgpu_amdkfd_get_cu_info(struct amdgpu_device *adev, struct kfd_cu_info *c
        cu_info->cu_active_number = acu_info.number;
        cu_info->cu_ao_mask = acu_info.ao_cu_mask;
        memcpy(&cu_info->cu_bitmap[0], &acu_info.bitmap[0],
-              sizeof(acu_info.bitmap));
+              sizeof(cu_info->cu_bitmap));
        cu_info->num_shader_engines = adev->gfx.config.max_shader_engines;
        cu_info->num_shader_arrays_per_engine = adev->gfx.config.max_sh_per_se;
        cu_info->num_cu_per_sh = adev->gfx.config.max_cu_per_sh;
index 395c1768b9fc7d867416fca45b513f0573a19bd2..0ca95c4d4bfbe1634b270ac5afe04aec57636347 100644 (file)
@@ -43,6 +43,7 @@
 #define AMDGPU_GFX_LBPW_DISABLED_MODE          0x00000008L
 
 #define AMDGPU_MAX_GC_INSTANCES                8
+#define KGD_MAX_QUEUES                 128
 
 #define AMDGPU_MAX_GFX_QUEUES KGD_MAX_QUEUES
 #define AMDGPU_MAX_COMPUTE_QUEUES KGD_MAX_QUEUES
@@ -257,7 +258,7 @@ struct amdgpu_cu_info {
        uint32_t number;
        uint32_t ao_cu_mask;
        uint32_t ao_cu_bitmap[4][4];
-       uint32_t bitmap[4][4];
+       uint32_t bitmap[AMDGPU_MAX_GC_INSTANCES][4][4];
 };
 
 struct amdgpu_gfx_ras {
index 3a48bec10aea8bbecfc106c6d8e04224bbc991d1..d462b36adf4b147289a5954ea2186aba753740aa 100644 (file)
@@ -850,7 +850,7 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
                memcpy(&dev_info->cu_ao_bitmap[0], &adev->gfx.cu_info.ao_cu_bitmap[0],
                       sizeof(adev->gfx.cu_info.ao_cu_bitmap));
                memcpy(&dev_info->cu_bitmap[0], &adev->gfx.cu_info.bitmap[0],
-                      sizeof(adev->gfx.cu_info.bitmap));
+                      sizeof(dev_info->cu_bitmap));
                dev_info->vram_type = adev->gmc.vram_type;
                dev_info->vram_bit_width = adev->gmc.vram_width;
                dev_info->vce_harvest_config = adev->vce.harvest_config;
index 6ccde07ed63e1c1a298797dcb6c8dd8cd353b811..62329a8220227a551e2225c6fd3459abff89e3a5 100644 (file)
@@ -9442,7 +9442,7 @@ static int gfx_v10_0_get_cu_info(struct amdgpu_device *adev,
                                gfx_v10_0_set_user_wgp_inactive_bitmap_per_sh(
                                        adev, disable_masks[i * 2 + j]);
                        bitmap = gfx_v10_0_get_cu_active_bitmap_per_sh(adev);
-                       cu_info->bitmap[i][j] = bitmap;
+                       cu_info->bitmap[0][i][j] = bitmap;
 
                        for (k = 0; k < adev->gfx.config.max_cu_per_sh; k++) {
                                if (bitmap & mask) {
index 337ed771605f7ad2b91e2e9a21ca3cf7482ca84a..39c434ca0dad322537b0aa4deb7182addd687a62 100644 (file)
@@ -6392,7 +6392,7 @@ static int gfx_v11_0_get_cu_info(struct amdgpu_device *adev,
                         *    SE6: {SH0,SH1} --> {bitmap[2][2], bitmap[2][3]}
                         *    SE7: {SH0,SH1} --> {bitmap[3][2], bitmap[3][3]}
                         */
-                       cu_info->bitmap[i % 4][j + (i / 4) * 2] = bitmap;
+                       cu_info->bitmap[0][i % 4][j + (i / 4) * 2] = bitmap;
 
                        for (k = 0; k < adev->gfx.config.max_cu_per_sh; k++) {
                                if (bitmap & mask)
index da6caff78c22be23ba918c49f4334c97236c0104..34f9211b26793ff705aef4ab75e926dfba1f1dbb 100644 (file)
@@ -3577,7 +3577,7 @@ static void gfx_v6_0_get_cu_info(struct amdgpu_device *adev)
                                gfx_v6_0_set_user_cu_inactive_bitmap(
                                        adev, disable_masks[i * 2 + j]);
                        bitmap = gfx_v6_0_get_cu_enabled(adev);
-                       cu_info->bitmap[i][j] = bitmap;
+                       cu_info->bitmap[0][i][j] = bitmap;
 
                        for (k = 0; k < adev->gfx.config.max_cu_per_sh; k++) {
                                if (bitmap & mask) {
index 90b034b173c1cc9385c4668eb478c93e88b93ed1..c2faf6b4c2fced463cc24598cf10c9775a9663ee 100644 (file)
@@ -5119,7 +5119,7 @@ static void gfx_v7_0_get_cu_info(struct amdgpu_device *adev)
                                gfx_v7_0_set_user_cu_inactive_bitmap(
                                        adev, disable_masks[i * 2 + j]);
                        bitmap = gfx_v7_0_get_cu_active_bitmap(adev);
-                       cu_info->bitmap[i][j] = bitmap;
+                       cu_info->bitmap[0][i][j] = bitmap;
 
                        for (k = 0; k < adev->gfx.config.max_cu_per_sh; k++) {
                                if (bitmap & mask) {
index 51c1745c83697b04a7eab230f5a64847fceabed3..885ebd703260f05a92aa15928910e171112b2cc7 100644 (file)
@@ -7121,7 +7121,7 @@ static void gfx_v8_0_get_cu_info(struct amdgpu_device *adev)
                                gfx_v8_0_set_user_cu_inactive_bitmap(
                                        adev, disable_masks[i * 2 + j]);
                        bitmap = gfx_v8_0_get_cu_active_bitmap(adev);
-                       cu_info->bitmap[i][j] = bitmap;
+                       cu_info->bitmap[0][i][j] = bitmap;
 
                        for (k = 0; k < adev->gfx.config.max_cu_per_sh; k ++) {
                                if (bitmap & mask) {
index 5c85ac34360f98e3ebc3727b172fd28e3c6873e7..f99a3a6bfd9157e6db75cc8db8c4f20954340399 100644 (file)
@@ -1499,7 +1499,7 @@ static void gfx_v9_0_init_always_on_cu_mask(struct amdgpu_device *adev)
                        amdgpu_gfx_select_se_sh(adev, i, j, 0xffffffff, 0);
 
                        for (k = 0; k < adev->gfx.config.max_cu_per_sh; k ++) {
-                               if (cu_info->bitmap[i][j] & mask) {
+                               if (cu_info->bitmap[0][i][j] & mask) {
                                        if (counter == pg_always_on_cu_num)
                                                WREG32_SOC15(GC, 0, mmRLC_PG_ALWAYS_ON_CU_MASK, cu_bitmap);
                                        if (counter < always_on_cu_num)
@@ -7233,7 +7233,7 @@ static int gfx_v9_0_get_cu_info(struct amdgpu_device *adev,
                         *    SE6,SH0 --> bitmap[2][1]
                         *    SE7,SH0 --> bitmap[3][1]
                         */
-                       cu_info->bitmap[i % 4][j + i / 4] = bitmap;
+                       cu_info->bitmap[0][i % 4][j + i / 4] = bitmap;
 
                        for (k = 0; k < adev->gfx.config.max_cu_per_sh; k ++) {
                                if (bitmap & mask) {
index a60d1a8405d484b603aed016623c6ad431dbfa23..32a740104868aa07bdbf7114f3f0177454d1865f 100644 (file)
@@ -4259,7 +4259,7 @@ static void gfx_v9_4_3_set_gds_init(struct amdgpu_device *adev)
 }
 
 static void gfx_v9_4_3_set_user_cu_inactive_bitmap(struct amdgpu_device *adev,
-                                                u32 bitmap)
+                                                u32 bitmap, int xcc_id)
 {
        u32 data;
 
@@ -4269,15 +4269,15 @@ static void gfx_v9_4_3_set_user_cu_inactive_bitmap(struct amdgpu_device *adev,
        data = bitmap << GC_USER_SHADER_ARRAY_CONFIG__INACTIVE_CUS__SHIFT;
        data &= GC_USER_SHADER_ARRAY_CONFIG__INACTIVE_CUS_MASK;
 
-       WREG32_SOC15(GC, GET_INST(GC, 0), regGC_USER_SHADER_ARRAY_CONFIG, data);
+       WREG32_SOC15(GC, GET_INST(GC, xcc_id), regGC_USER_SHADER_ARRAY_CONFIG, data);
 }
 
-static u32 gfx_v9_4_3_get_cu_active_bitmap(struct amdgpu_device *adev)
+static u32 gfx_v9_4_3_get_cu_active_bitmap(struct amdgpu_device *adev, int xcc_id)
 {
        u32 data, mask;
 
-       data = RREG32_SOC15(GC, GET_INST(GC, 0), regCC_GC_SHADER_ARRAY_CONFIG);
-       data |= RREG32_SOC15(GC, GET_INST(GC, 0), regGC_USER_SHADER_ARRAY_CONFIG);
+       data = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCC_GC_SHADER_ARRAY_CONFIG);
+       data |= RREG32_SOC15(GC, GET_INST(GC, xcc_id), regGC_USER_SHADER_ARRAY_CONFIG);
 
        data &= CC_GC_SHADER_ARRAY_CONFIG__INACTIVE_CUS_MASK;
        data >>= CC_GC_SHADER_ARRAY_CONFIG__INACTIVE_CUS__SHIFT;
@@ -4290,7 +4290,7 @@ static u32 gfx_v9_4_3_get_cu_active_bitmap(struct amdgpu_device *adev)
 static int gfx_v9_4_3_get_cu_info(struct amdgpu_device *adev,
                                 struct amdgpu_cu_info *cu_info)
 {
-       int i, j, k, counter, active_cu_number = 0;
+       int i, j, k, counter, xcc_id, active_cu_number = 0;
        u32 mask, bitmap, ao_bitmap, ao_cu_mask = 0;
        unsigned disable_masks[4 * 4];
 
@@ -4309,46 +4309,38 @@ static int gfx_v9_4_3_get_cu_info(struct amdgpu_device *adev,
                                    adev->gfx.config.max_sh_per_se);
 
        mutex_lock(&adev->grbm_idx_mutex);
-       for (i = 0; i < adev->gfx.config.max_shader_engines; i++) {
-               for (j = 0; j < adev->gfx.config.max_sh_per_se; j++) {
-                       mask = 1;
-                       ao_bitmap = 0;
-                       counter = 0;
-                       gfx_v9_4_3_xcc_select_se_sh(adev, i, j, 0xffffffff, 0);
-                       gfx_v9_4_3_set_user_cu_inactive_bitmap(
-                               adev, disable_masks[i * adev->gfx.config.max_sh_per_se + j]);
-                       bitmap = gfx_v9_4_3_get_cu_active_bitmap(adev);
-
-                       /*
-                        * The bitmap(and ao_cu_bitmap) in cu_info structure is
-                        * 4x4 size array, and it's usually suitable for Vega
-                        * ASICs which has 4*2 SE/SH layout.
-                        * But for Arcturus, SE/SH layout is changed to 8*1.
-                        * To mostly reduce the impact, we make it compatible
-                        * with current bitmap array as below:
-                        *    SE4,SH0 --> bitmap[0][1]
-                        *    SE5,SH0 --> bitmap[1][1]
-                        *    SE6,SH0 --> bitmap[2][1]
-                        *    SE7,SH0 --> bitmap[3][1]
-                        */
-                       cu_info->bitmap[i % 4][j + i / 4] = bitmap;
-
-                       for (k = 0; k < adev->gfx.config.max_cu_per_sh; k++) {
-                               if (bitmap & mask) {
-                                       if (counter < adev->gfx.config.max_cu_per_sh)
-                                               ao_bitmap |= mask;
-                                       counter++;
+       for (xcc_id = 0; xcc_id < NUM_XCC(adev->gfx.xcc_mask); xcc_id++) {
+               for (i = 0; i < adev->gfx.config.max_shader_engines; i++) {
+                       for (j = 0; j < adev->gfx.config.max_sh_per_se; j++) {
+                               mask = 1;
+                               ao_bitmap = 0;
+                               counter = 0;
+                               gfx_v9_4_3_xcc_select_se_sh(adev, i, j, 0xffffffff, xcc_id);
+                               gfx_v9_4_3_set_user_cu_inactive_bitmap(
+                                       adev,
+                                       disable_masks[i * adev->gfx.config.max_sh_per_se + j],
+                                       xcc_id);
+                               bitmap = gfx_v9_4_3_get_cu_active_bitmap(adev, xcc_id);
+
+                               cu_info->bitmap[xcc_id][i][j] = bitmap;
+
+                               for (k = 0; k < adev->gfx.config.max_cu_per_sh; k++) {
+                                       if (bitmap & mask) {
+                                               if (counter < adev->gfx.config.max_cu_per_sh)
+                                                       ao_bitmap |= mask;
+                                               counter++;
+                                       }
+                                       mask <<= 1;
                                }
-                               mask <<= 1;
+                               active_cu_number += counter;
+                               if (i < 2 && j < 2)
+                                       ao_cu_mask |= (ao_bitmap << (i * 16 + j * 8));
+                               cu_info->ao_cu_bitmap[i][j] = ao_bitmap;
                        }
-                       active_cu_number += counter;
-                       if (i < 2 && j < 2)
-                               ao_cu_mask |= (ao_bitmap << (i * 16 + j * 8));
-                       cu_info->ao_cu_bitmap[i % 4][j + i / 4] = ao_bitmap;
                }
+               gfx_v9_4_3_xcc_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff,
+                                           xcc_id);
        }
-       gfx_v9_4_3_xcc_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff,
-                                   0);
        mutex_unlock(&adev->grbm_idx_mutex);
 
        cu_info->number = active_cu_number;
index 2e9612cf56ae0a27c75a592c009d858b84737693..950810bb5c71122d4250e4ae0063a711a1b89e64 100644 (file)
@@ -2088,7 +2088,8 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
 
        amdgpu_amdkfd_get_cu_info(kdev->adev, &cu_info);
        cu->num_simd_per_cu = cu_info.simd_per_cu;
-       cu->num_simd_cores = cu_info.simd_per_cu * cu_info.cu_active_number;
+       cu->num_simd_cores = cu_info.simd_per_cu *
+                       (cu_info.cu_active_number / kdev->kfd->num_nodes);
        cu->max_waves_simd = cu_info.max_waves_per_simd;
 
        cu->wave_front_size = cu_info.wave_front_size;
index d01bb57733b368c4ed692c873027352b3aa865a3..763966236658e01c012247835558ccb8daba3c17 100644 (file)
@@ -104,11 +104,13 @@ void mqd_symmetrically_map_cu_mask(struct mqd_manager *mm,
        bool wgp_mode_req = KFD_GC_VERSION(mm->dev) >= IP_VERSION(10, 0, 0);
        uint32_t en_mask = wgp_mode_req ? 0x3 : 0x1;
        int i, se, sh, cu, cu_bitmap_sh_mul, inc = wgp_mode_req ? 2 : 1;
+       uint32_t cu_active_per_node;
 
        amdgpu_amdkfd_get_cu_info(mm->dev->adev, &cu_info);
 
-       if (cu_mask_count > cu_info.cu_active_number)
-               cu_mask_count = cu_info.cu_active_number;
+       cu_active_per_node = cu_info.cu_active_number / mm->dev->kfd->num_nodes;
+       if (cu_mask_count > cu_active_per_node)
+               cu_mask_count = cu_active_per_node;
 
        /* Exceeding these bounds corrupts the stack and indicates a coding error.
         * Returning with no CU's enabled will hang the queue, which should be
@@ -141,7 +143,7 @@ void mqd_symmetrically_map_cu_mask(struct mqd_manager *mm,
        for (se = 0; se < cu_info.num_shader_engines; se++)
                for (sh = 0; sh < cu_info.num_shader_arrays_per_engine; sh++)
                        cu_per_sh[se][sh] = hweight32(
-                               cu_info.cu_bitmap[se % 4][sh + (se / 4) * cu_bitmap_sh_mul]);
+                               cu_info.cu_bitmap[0][se % 4][sh + (se / 4) * cu_bitmap_sh_mul]);
 
        /* Symmetrically map cu_mask to all SEs & SHs:
         * se_mask programs up to 2 SH in the upper and lower 16 bits.
index ff98fded953499017a90b57c066570c09721e7e0..c54795682dfbd1e4c65ef1ae31d9696fda86b5c0 100644 (file)
@@ -450,8 +450,7 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr,
        sysfs_show_32bit_prop(buffer, offs, "cpu_cores_count",
                              dev->node_props.cpu_cores_count);
        sysfs_show_32bit_prop(buffer, offs, "simd_count",
-                             dev->gpu ? (dev->node_props.simd_count *
-                                         NUM_XCC(dev->gpu->xcc_mask)) : 0);
+                             dev->gpu ? dev->node_props.simd_count : 0);
        sysfs_show_32bit_prop(buffer, offs, "mem_banks_count",
                              dev->node_props.mem_banks_count);
        sysfs_show_32bit_prop(buffer, offs, "caches_count",
@@ -1604,7 +1603,7 @@ static int fill_in_l2_l3_pcache(struct kfd_cache_properties **props_ext,
        int i, j, k;
        struct kfd_cache_properties *pcache = NULL;
 
-       cu_sibling_map_mask = cu_info->cu_bitmap[0][0];
+       cu_sibling_map_mask = cu_info->cu_bitmap[0][0][0];
        cu_sibling_map_mask &=
                ((1 << pcache_info[cache_type].num_cu_shared) - 1);
        first_active_cu = ffs(cu_sibling_map_mask);
@@ -1647,7 +1646,7 @@ static int fill_in_l2_l3_pcache(struct kfd_cache_properties **props_ext,
                                pcache->sibling_map[k+3] = (uint8_t)((cu_sibling_map_mask >> 24) & 0xFF);
                                k += 4;
 
-                               cu_sibling_map_mask = cu_info->cu_bitmap[i % 4][j + i / 4];
+                               cu_sibling_map_mask = cu_info->cu_bitmap[0][i % 4][j + i / 4];
                                cu_sibling_map_mask &= ((1 << pcache_info[cache_type].num_cu_shared) - 1);
                        }
                }
@@ -1708,8 +1707,8 @@ static void kfd_fill_cache_non_crat_info(struct kfd_topology_device *dev, struct
                                        for (k = 0; k < pcu_info->num_cu_per_sh; k += pcache_info[ct].num_cu_shared) {
 
                                                ret = fill_in_l1_pcache(&props_ext, pcache_info, pcu_info,
-                                                                               pcu_info->cu_bitmap[i % 4][j + i / 4], ct,
-                                                                               cu_processor_id, k);
+                                                                       pcu_info->cu_bitmap[0][i % 4][j + i / 4], ct,
+                                                                       cu_processor_id, k);
 
                                                if (ret < 0)
                                                        break;
index f3f40dbb8ff71b86f9ab094413932d3cd1ff7daa..3b5a56585c4b7283017651aad21ed1c0565ad37f 100644 (file)
 #include <linux/types.h>
 #include <linux/bitmap.h>
 #include <linux/dma-fence.h>
+#include "amdgpu_irq.h"
+#include "amdgpu_gfx.h"
 
 struct pci_dev;
 struct amdgpu_device;
 
-#define KGD_MAX_QUEUES 128
-
 struct kfd_dev;
 struct kgd_mem;
 
@@ -68,7 +68,7 @@ struct kfd_cu_info {
        uint32_t wave_front_size;
        uint32_t max_scratch_slots_per_cu;
        uint32_t lds_size;
-       uint32_t cu_bitmap[4][4];
+       uint32_t cu_bitmap[AMDGPU_MAX_GC_INSTANCES][4][4];
 };
 
 /* For getting GPU local memory information from KGD */