drm/amdkfd: Fix checkpoint-restore on multi-xcc
authorDavid Yat Sin <David.YatSin@amd.com>
Wed, 16 Jul 2025 22:04:28 +0000 (22:04 +0000)
committerAlex Deucher <alexander.deucher@amd.com>
Mon, 4 Aug 2025 19:38:49 +0000 (15:38 -0400)
GPUs with multi-xcc have multiple MQDs per queue. This patch saves and
restores all the MQDs within the partition.

Signed-off-by: David Yat Sin <David.YatSin@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
(cherry picked from commit a578f2a58c3ab38f0643b1b6e7534af860233cb1)
Cc: stable@vger.kernel.org
drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c

index 2d91027e2a747dc18600c2bc4a4a42895a26723a..6c5c7c1bf5eda25ec15cdae291050e265594fb20 100644 (file)
@@ -2725,7 +2725,7 @@ static void get_queue_checkpoint_info(struct device_queue_manager *dqm,
 
        dqm_lock(dqm);
        mqd_mgr = dqm->mqd_mgrs[mqd_type];
-       *mqd_size = mqd_mgr->mqd_size;
+       *mqd_size = mqd_mgr->mqd_size * NUM_XCC(mqd_mgr->dev->xcc_mask);
        *ctl_stack_size = 0;
 
        if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE && mqd_mgr->get_checkpoint_info)
index 97933d2a3803233855829d9cbb07c74f7d90e6e0..f2dee320fada42306ee8259edf912b2a72d8413b 100644 (file)
@@ -373,7 +373,7 @@ static void get_checkpoint_info(struct mqd_manager *mm, void *mqd, u32 *ctl_stac
 {
        struct v9_mqd *m = get_mqd(mqd);
 
-       *ctl_stack_size = m->cp_hqd_cntl_stack_size;
+       *ctl_stack_size = m->cp_hqd_cntl_stack_size * NUM_XCC(mm->dev->xcc_mask);
 }
 
 static void checkpoint_mqd(struct mqd_manager *mm, void *mqd, void *mqd_dst, void *ctl_stack_dst)
@@ -388,6 +388,24 @@ static void checkpoint_mqd(struct mqd_manager *mm, void *mqd, void *mqd_dst, voi
        memcpy(ctl_stack_dst, ctl_stack, m->cp_hqd_cntl_stack_size);
 }
 
+static void checkpoint_mqd_v9_4_3(struct mqd_manager *mm,
+                                                                 void *mqd,
+                                                                 void *mqd_dst,
+                                                                 void *ctl_stack_dst)
+{
+       struct v9_mqd *m;
+       int xcc;
+       uint64_t size = get_mqd(mqd)->cp_mqd_stride_size;
+
+       for (xcc = 0; xcc < NUM_XCC(mm->dev->xcc_mask); xcc++) {
+               m = get_mqd(mqd + size * xcc);
+
+               checkpoint_mqd(mm, m,
+                               (uint8_t *)mqd_dst + sizeof(*m) * xcc,
+                               (uint8_t *)ctl_stack_dst + m->cp_hqd_cntl_stack_size * xcc);
+       }
+}
+
 static void restore_mqd(struct mqd_manager *mm, void **mqd,
                        struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
                        struct queue_properties *qp,
@@ -764,13 +782,35 @@ static void restore_mqd_v9_4_3(struct mqd_manager *mm, void **mqd,
                        const void *mqd_src,
                        const void *ctl_stack_src, u32 ctl_stack_size)
 {
-       restore_mqd(mm, mqd, mqd_mem_obj, gart_addr, qp, mqd_src, ctl_stack_src, ctl_stack_size);
-       if (amdgpu_sriov_multi_vf_mode(mm->dev->adev)) {
-               struct v9_mqd *m;
+       struct kfd_mem_obj xcc_mqd_mem_obj;
+       u32 mqd_ctl_stack_size;
+       struct v9_mqd *m;
+       u32 num_xcc;
+       int xcc;
 
-               m = (struct v9_mqd *) mqd_mem_obj->cpu_ptr;
-               m->cp_hqd_pq_doorbell_control |= 1 <<
-                               CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_MODE__SHIFT;
+       uint64_t offset = mm->mqd_stride(mm, qp);
+
+       mm->dev->dqm->current_logical_xcc_start++;
+
+       num_xcc = NUM_XCC(mm->dev->xcc_mask);
+       mqd_ctl_stack_size = ctl_stack_size / num_xcc;
+
+       memset(&xcc_mqd_mem_obj, 0x0, sizeof(struct kfd_mem_obj));
+
+       /* Set the MQD pointer and gart address to XCC0 MQD */
+       *mqd = mqd_mem_obj->cpu_ptr;
+       if (gart_addr)
+               *gart_addr = mqd_mem_obj->gpu_addr;
+
+       for (xcc = 0; xcc < num_xcc; xcc++) {
+               get_xcc_mqd(mqd_mem_obj, &xcc_mqd_mem_obj, offset * xcc);
+               restore_mqd(mm, (void **)&m,
+                                       &xcc_mqd_mem_obj,
+                                       NULL,
+                                       qp,
+                                       (uint8_t *)mqd_src + xcc * sizeof(*m),
+                                       (uint8_t *)ctl_stack_src + xcc *  mqd_ctl_stack_size,
+                                       mqd_ctl_stack_size);
        }
 }
 static int destroy_mqd_v9_4_3(struct mqd_manager *mm, void *mqd,
@@ -906,7 +946,6 @@ struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type,
                mqd->free_mqd = kfd_free_mqd_cp;
                mqd->is_occupied = kfd_is_occupied_cp;
                mqd->get_checkpoint_info = get_checkpoint_info;
-               mqd->checkpoint_mqd = checkpoint_mqd;
                mqd->mqd_size = sizeof(struct v9_mqd);
                mqd->mqd_stride = mqd_stride_v9;
 #if defined(CONFIG_DEBUG_FS)
@@ -918,16 +957,18 @@ struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type,
                        mqd->init_mqd = init_mqd_v9_4_3;
                        mqd->load_mqd = load_mqd_v9_4_3;
                        mqd->update_mqd = update_mqd_v9_4_3;
-                       mqd->restore_mqd = restore_mqd_v9_4_3;
                        mqd->destroy_mqd = destroy_mqd_v9_4_3;
                        mqd->get_wave_state = get_wave_state_v9_4_3;
+                       mqd->checkpoint_mqd = checkpoint_mqd_v9_4_3;
+                       mqd->restore_mqd = restore_mqd_v9_4_3;
                } else {
                        mqd->init_mqd = init_mqd;
                        mqd->load_mqd = load_mqd;
                        mqd->update_mqd = update_mqd;
-                       mqd->restore_mqd = restore_mqd;
                        mqd->destroy_mqd = kfd_destroy_mqd_cp;
                        mqd->get_wave_state = get_wave_state;
+                       mqd->checkpoint_mqd = checkpoint_mqd;
+                       mqd->restore_mqd = restore_mqd;
                }
                break;
        case KFD_MQD_TYPE_HIQ:
index c643e0ccec52b02cf3301ae334ee126917cfc2b6..7fbb5c274ccc4296529417a002110329fc95ba4b 100644 (file)
@@ -914,7 +914,10 @@ static int criu_checkpoint_queues_device(struct kfd_process_device *pdd,
 
                q_data = (struct kfd_criu_queue_priv_data *)q_private_data;
 
-               /* data stored in this order: priv_data, mqd, ctl_stack */
+               /*
+                * data stored in this order:
+                * priv_data, mqd[xcc0], mqd[xcc1],..., ctl_stack[xcc0], ctl_stack[xcc1]...
+                */
                q_data->mqd_size = mqd_size;
                q_data->ctl_stack_size = ctl_stack_size;
 
@@ -963,7 +966,7 @@ int kfd_criu_checkpoint_queues(struct kfd_process *p,
 }
 
 static void set_queue_properties_from_criu(struct queue_properties *qp,
-                                         struct kfd_criu_queue_priv_data *q_data)
+                                         struct kfd_criu_queue_priv_data *q_data, uint32_t num_xcc)
 {
        qp->is_interop = false;
        qp->queue_percent = q_data->q_percent;
@@ -976,7 +979,11 @@ static void set_queue_properties_from_criu(struct queue_properties *qp,
        qp->eop_ring_buffer_size = q_data->eop_ring_buffer_size;
        qp->ctx_save_restore_area_address = q_data->ctx_save_restore_area_address;
        qp->ctx_save_restore_area_size = q_data->ctx_save_restore_area_size;
-       qp->ctl_stack_size = q_data->ctl_stack_size;
+       if (q_data->type == KFD_QUEUE_TYPE_COMPUTE)
+               qp->ctl_stack_size = q_data->ctl_stack_size / num_xcc;
+       else
+               qp->ctl_stack_size = q_data->ctl_stack_size;
+
        qp->type = q_data->type;
        qp->format = q_data->format;
 }
@@ -1036,12 +1043,15 @@ int kfd_criu_restore_queue(struct kfd_process *p,
                goto exit;
        }
 
-       /* data stored in this order: mqd, ctl_stack */
+       /*
+        * data stored in this order:
+        * mqd[xcc0], mqd[xcc1],..., ctl_stack[xcc0], ctl_stack[xcc1]...
+        */
        mqd = q_extra_data;
        ctl_stack = mqd + q_data->mqd_size;
 
        memset(&qp, 0, sizeof(qp));
-       set_queue_properties_from_criu(&qp, q_data);
+       set_queue_properties_from_criu(&qp, q_data, NUM_XCC(pdd->dev->adev->gfx.xcc_mask));
 
        print_queue_properties(&qp);