drm/amdkfd: allow users to target recommended SDMA engines
author Jonathan Kim <Jonathan.Kim@amd.com>
Tue, 21 May 2024 17:22:15 +0000 (13:22 -0400)
committer Alex Deucher <alexander.deucher@amd.com>
Thu, 25 Jul 2024 21:43:41 +0000 (17:43 -0400)
Certain GPUs have better copy performance over xGMI on specific
SDMA engines depending on the source and destination GPU.
Allow users to create SDMA queues on these recommended engines.
Close to 2x overall performance has been observed with this
optimization.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
drivers/gpu/drm/amd/amdkfd/kfd_priv.h
drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
drivers/gpu/drm/amd/amdkfd/kfd_topology.c
drivers/gpu/drm/amd/amdkfd/kfd_topology.h
include/uapi/linux/kfd_ioctl.h

index 65a37ac5a0f0bb56bd990bdd46b0ad935b7a27e9..0622ebd7e8efa50eca203d8ea31cf37008b2c136 100644 (file)
@@ -255,6 +255,7 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties,
                        args->ctx_save_restore_address;
        q_properties->ctx_save_restore_area_size = args->ctx_save_restore_size;
        q_properties->ctl_stack_size = args->ctl_stack_size;
+       q_properties->sdma_engine_id = args->sdma_engine_id;
        if (args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE ||
                args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE_AQL)
                q_properties->type = KFD_QUEUE_TYPE_COMPUTE;
@@ -262,6 +263,8 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties,
                q_properties->type = KFD_QUEUE_TYPE_SDMA;
        else if (args->queue_type == KFD_IOC_QUEUE_TYPE_SDMA_XGMI)
                q_properties->type = KFD_QUEUE_TYPE_SDMA_XGMI;
+       else if (args->queue_type == KFD_IOC_QUEUE_TYPE_SDMA_BY_ENG_ID)
+               q_properties->type = KFD_QUEUE_TYPE_SDMA_BY_ENG_ID;
        else
                return -ENOTSUPP;
 
@@ -333,6 +336,18 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
                goto err_bind_process;
        }
 
+       if (q_properties.type == KFD_QUEUE_TYPE_SDMA_BY_ENG_ID) {
+               int max_sdma_eng_id = kfd_get_num_sdma_engines(dev) +
+                                     kfd_get_num_xgmi_sdma_engines(dev) - 1;
+
+               if (q_properties.sdma_engine_id > max_sdma_eng_id) {
+                       err = -EINVAL;
+                       pr_err("sdma_engine_id %i exceeds maximum id of %i\n",
+                              q_properties.sdma_engine_id, max_sdma_eng_id);
+                       goto err_sdma_engine_id;
+               }
+       }
+
        if (!pdd->qpd.proc_doorbells) {
                err = kfd_alloc_process_doorbells(dev->kfd, pdd);
                if (err) {
@@ -387,6 +402,7 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
 err_create_queue:
        kfd_queue_release_buffers(pdd, &q_properties);
 err_acquire_queue_buf:
+err_sdma_engine_id:
 err_bind_process:
 err_pdd:
        mutex_unlock(&p->mutex);
index fdc76c24b2e72ad32a2165fdfef842d380a7d05c..f0bfeb35246ffac5a5acf31e5ac2dc9087fc693d 100644 (file)
@@ -1532,6 +1532,41 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
                        q->sdma_id % kfd_get_num_xgmi_sdma_engines(dqm->dev);
                q->properties.sdma_queue_id = q->sdma_id /
                        kfd_get_num_xgmi_sdma_engines(dqm->dev);
+       } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_BY_ENG_ID) {
+               int i, num_queues, num_engines, eng_offset = 0, start_engine;
+               bool free_bit_found = false, is_xgmi = false;
+
+               if (q->properties.sdma_engine_id < kfd_get_num_sdma_engines(dqm->dev)) {
+                       num_queues = get_num_sdma_queues(dqm);
+                       num_engines = kfd_get_num_sdma_engines(dqm->dev);
+                       q->properties.type = KFD_QUEUE_TYPE_SDMA;
+               } else {
+                       num_queues = get_num_xgmi_sdma_queues(dqm);
+                       num_engines = kfd_get_num_xgmi_sdma_engines(dqm->dev);
+                       eng_offset = kfd_get_num_sdma_engines(dqm->dev);
+                       q->properties.type = KFD_QUEUE_TYPE_SDMA_XGMI;
+                       is_xgmi = true;
+               }
+
+               /* Scan available bit based on target engine ID. */
+               start_engine = q->properties.sdma_engine_id - eng_offset;
+               for (i = start_engine; i < num_queues; i += num_engines) {
+
+                       if (!test_bit(i, is_xgmi ? dqm->xgmi_sdma_bitmap : dqm->sdma_bitmap))
+                               continue;
+
+                       clear_bit(i, is_xgmi ? dqm->xgmi_sdma_bitmap : dqm->sdma_bitmap);
+                       q->sdma_id = i;
+                       q->properties.sdma_queue_id = q->sdma_id / num_engines;
+                       free_bit_found = true;
+                       break;
+               }
+
+               if (!free_bit_found) {
+                       dev_err(dev, "No more SDMA queue to allocate for target ID %i\n",
+                               q->properties.sdma_engine_id);
+                       return -ENOMEM;
+               }
        }
 
        pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id);
@@ -1784,7 +1819,8 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
        }
 
        if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
-               q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
+               q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI ||
+               q->properties.type == KFD_QUEUE_TYPE_SDMA_BY_ENG_ID) {
                dqm_lock(dqm);
                retval = allocate_sdma_queue(dqm, q, qd ? &qd->sdma_id : NULL);
                dqm_unlock(dqm);
index b5cae48dff669939efbe4b4931766d2986c9d703..4190fa3399138b66bed1726937ca5b4983b6e25b 100644 (file)
@@ -414,13 +414,16 @@ enum kfd_unmap_queues_filter {
  * @KFD_QUEUE_TYPE_DIQ: DIQ queue type.
  *
  * @KFD_QUEUE_TYPE_SDMA_XGMI: Special SDMA queue for XGMI interface.
+ *
+ * @KFD_QUEUE_TYPE_SDMA_BY_ENG_ID:  SDMA user mode queue with target SDMA engine ID.
  */
 enum kfd_queue_type  {
        KFD_QUEUE_TYPE_COMPUTE,
        KFD_QUEUE_TYPE_SDMA,
        KFD_QUEUE_TYPE_HIQ,
        KFD_QUEUE_TYPE_DIQ,
-       KFD_QUEUE_TYPE_SDMA_XGMI
+       KFD_QUEUE_TYPE_SDMA_XGMI,
+       KFD_QUEUE_TYPE_SDMA_BY_ENG_ID
 };
 
 enum kfd_queue_format {
index 9995dbb433599a3b4c9f0d66e4c0e6c34048bc76..f732ee35b53157d7a4dc76cc9cd4eee713d1840a 100644 (file)
@@ -366,6 +366,7 @@ int pqm_create_queue(struct process_queue_manager *pqm,
        switch (type) {
        case KFD_QUEUE_TYPE_SDMA:
        case KFD_QUEUE_TYPE_SDMA_XGMI:
+       case KFD_QUEUE_TYPE_SDMA_BY_ENG_ID:
                /* SDMA queues are always allocated statically no matter
                 * which scheduler mode is used. We also do not need to
                 * check whether a SDMA queue can be allocated here, because
index a9b3eda65a2ccbf16fcfc4f1c6ab302d77e69d50..40771f8752cbc6a6db6c55669794a7c7a0e3d355 100644 (file)
@@ -292,6 +292,8 @@ static ssize_t iolink_show(struct kobject *kobj, struct attribute *attr,
                              iolink->max_bandwidth);
        sysfs_show_32bit_prop(buffer, offs, "recommended_transfer_size",
                              iolink->rec_transfer_size);
+       sysfs_show_32bit_prop(buffer, offs, "recommended_sdma_engine_id_mask",
+                             iolink->rec_sdma_eng_id_mask);
        sysfs_show_32bit_prop(buffer, offs, "flags", iolink->flags);
 
        return offs;
@@ -1265,6 +1267,55 @@ static void kfd_set_iolink_non_coherent(struct kfd_topology_device *to_dev,
        }
 }
 
+#define REC_SDMA_NUM_GPU       8 /* number of rows and of columns in rec_sdma_eng_map */
+static const int rec_sdma_eng_map[REC_SDMA_NUM_GPU][REC_SDMA_NUM_GPU] = { /* [src socket id][dst socket id] -> bit position used for rec_sdma_eng_id_mask; -1 on the diagonal */
+                                                       { -1, 14, 12, 2, 4, 8, 10, 6 },
+                                                       { 14, -1, 2, 10, 8, 4, 6, 12 },
+                                                       { 10, 2, -1, 12, 14, 6, 4, 8 },
+                                                       { 2, 12, 10, -1, 6, 14, 8, 4 },
+                                                       { 4, 8, 14, 6, -1, 10, 12, 2 },
+                                                       { 8, 4, 6, 14, 12, -1, 2, 10 },
+                                                       { 10, 6, 4, 8, 12, 2, -1, 14 },
+                                                       { 6, 12, 8, 4, 2, 10, 14, -1 }};
+/*
+ * kfd_set_recommended_sdma_engines - fill in rec_sdma_eng_id_mask for an
+ * IO link and for the link treated as its reverse direction.
+ *
+ * @to_dev:        peer topology device; when it has a GPU, that GPU supplies
+ *                 the destination socket id
+ * @outbound_link: link whose ->gpu is used as the source GPU
+ * @inbound_link:  link that receives the dst -> src recommendation
+ *
+ * A single table-selected bit is written per direction only when all of the
+ * following hold for the source device: it is not an SR-IOV VF, the peer has
+ * a GPU, aid_mask is non-zero, the partition mode is SPX, it is not an APU
+ * and it reports exactly 8 physical xGMI nodes. In every other case one bit
+ * per engine of the selected engine class is OR-ed into both masks.
+ */
+static void kfd_set_recommended_sdma_engines(struct kfd_topology_device *to_dev,
+                                            struct kfd_iolink_properties *outbound_link,
+                                            struct kfd_iolink_properties *inbound_link)
+{
+       struct kfd_node *gpu = outbound_link->gpu;
+       struct amdgpu_device *adev = gpu->adev;
+       int num_xgmi_nodes = adev->gmc.xgmi.num_physical_nodes;
+       bool support_rec_eng = !amdgpu_sriov_vf(adev) && to_dev->gpu &&
+               adev->aid_mask && num_xgmi_nodes &&
+               (amdgpu_xcp_query_partition_mode(adev->xcp_mgr, AMDGPU_XCP_FL_NONE) ==
+                     AMDGPU_SPX_PARTITION_MODE) &&
+               (!(adev->flags & AMD_IS_APU) && num_xgmi_nodes == 8);
+       /* Table-driven case: exactly one recommended engine bit per direction. */
+       if (support_rec_eng) {
+               int src_socket_id = adev->gmc.xgmi.physical_node_id;
+               int dst_socket_id = to_dev->gpu->adev->gmc.xgmi.physical_node_id;
+               /*
+                * NOTE(review): rec_sdma_eng_map holds -1 on its diagonal and
+                * the looked-up value is used directly as a shift count; this
+                * presumably relies on the two socket ids differing and both
+                * being below REC_SDMA_NUM_GPU - confirm against callers.
+                */
+               outbound_link->rec_sdma_eng_id_mask =
+                       1 << rec_sdma_eng_map[src_socket_id][dst_socket_id];
+               inbound_link->rec_sdma_eng_id_mask =
+                       1 << rec_sdma_eng_map[dst_socket_id][src_socket_id];
+       } else {
+               int num_sdma_eng = kfd_get_num_sdma_engines(gpu);
+               int i, eng_offset = 0;
+               /*
+                * For an xGMI link to a GPU peer on a device that has xGMI
+                * SDMA engines, recommend those engines instead; their bits
+                * start right after the regular SDMA engines.
+                */
+               if (outbound_link->iolink_type == CRAT_IOLINK_TYPE_XGMI &&
+                   kfd_get_num_xgmi_sdma_engines(gpu) && to_dev->gpu) {
+                       eng_offset = num_sdma_eng;
+                       num_sdma_eng = kfd_get_num_xgmi_sdma_engines(gpu);
+               }
+               /* OR in one bit per engine; bits already set in a mask are kept. */
+               for (i = 0; i < num_sdma_eng; i++) {
+                       outbound_link->rec_sdma_eng_id_mask |= (1 << (i + eng_offset));
+                       inbound_link->rec_sdma_eng_id_mask |= (1 << (i + eng_offset));
+               }
+       }
+}
+
 static void kfd_fill_iolink_non_crat_info(struct kfd_topology_device *dev)
 {
        struct kfd_iolink_properties *link, *inbound_link;
@@ -1303,6 +1354,7 @@ static void kfd_fill_iolink_non_crat_info(struct kfd_topology_device *dev)
                        inbound_link->flags = CRAT_IOLINK_FLAGS_ENABLED;
                        kfd_set_iolink_no_atomics(peer_dev, dev, inbound_link);
                        kfd_set_iolink_non_coherent(peer_dev, link, inbound_link);
+                       kfd_set_recommended_sdma_engines(peer_dev, link, inbound_link);
                }
        }
 
index 43ba0d32e5bd7145d41855eb07ed3ff5c6a23b66..155b5c410af1651ae245fdb0aa2ce8ac33a5edf5 100644 (file)
@@ -125,6 +125,7 @@ struct kfd_iolink_properties {
        uint32_t                min_bandwidth;
        uint32_t                max_bandwidth;
        uint32_t                rec_transfer_size;
+       uint32_t                rec_sdma_eng_id_mask;
        uint32_t                flags;
        struct kfd_node         *gpu;
        struct kobject          *kobj;
index 285a36601dc927dc4d7272d051ad1df82d21c918..71a7ce5f2d4c0391516ecaa4e43ff4f4fd9e8261 100644 (file)
  * - 1.14 - Update kfd_event_data
  * - 1.15 - Enable managing mappings in compute VMs with GEM_VA ioctl
  * - 1.16 - Add contiguous VRAM allocation flag
+ * - 1.17 - Add SDMA queue creation with target SDMA engine ID
  */
 #define KFD_IOCTL_MAJOR_VERSION 1
-#define KFD_IOCTL_MINOR_VERSION 16
+#define KFD_IOCTL_MINOR_VERSION 17
 
 struct kfd_ioctl_get_version_args {
        __u32 major_version;    /* from KFD */
@@ -56,6 +57,7 @@ struct kfd_ioctl_get_version_args {
 #define KFD_IOC_QUEUE_TYPE_SDMA                        0x1
 #define KFD_IOC_QUEUE_TYPE_COMPUTE_AQL         0x2
 #define KFD_IOC_QUEUE_TYPE_SDMA_XGMI           0x3
+#define KFD_IOC_QUEUE_TYPE_SDMA_BY_ENG_ID      0x4
 
 #define KFD_MAX_QUEUE_PERCENTAGE       100
 #define KFD_MAX_QUEUE_PRIORITY         15
@@ -78,6 +80,8 @@ struct kfd_ioctl_create_queue_args {
        __u64 ctx_save_restore_address; /* to KFD */
        __u32 ctx_save_restore_size;    /* to KFD */
        __u32 ctl_stack_size;           /* to KFD */
+       __u32 sdma_engine_id;           /* to KFD */
+       __u32 pad;
 };
 
 struct kfd_ioctl_destroy_queue_args {