Host will send a notification when new bad pages are available.
Uopn guest request, the first 256 bad page addresses
will be placed into the PF2VF region.
Guest should pause the PF2VF worker thread while
the copy is in progress.
Reviewed-by: Shravan Kumar Gande <Shravankumar.Gande@amd.com>
Signed-off-by: Victor Skvortsov <victor.skvortsov@amd.com>
Signed-off-by: Ellen Pan <yunru.pan@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
return true;
}
+
+/*
+ * amdgpu_virt_request_bad_pages() - request bad pages
+ * @adev: amdgpu device.
+ * Send command to GPU hypervisor to write new bad pages into the shared PF2VF region
+ */
+void amdgpu_virt_request_bad_pages(struct amdgpu_device *adev)
+{
+ struct amdgpu_virt *virt = &adev->virt;
+
+ if (virt->ops && virt->ops->req_bad_pages)
+ virt->ops->req_bad_pages(adev);
+}
bool (*rcvd_ras_intr)(struct amdgpu_device *adev);
int (*req_ras_err_count)(struct amdgpu_device *adev);
int (*req_ras_cper_dump)(struct amdgpu_device *adev, u64 vf_rptr);
+ int (*req_bad_pages)(struct amdgpu_device *adev);
};
/*
uint32_t reg_val_offs;
struct amdgpu_irq_src ack_irq;
struct amdgpu_irq_src rcv_irq;
+
struct work_struct flr_work;
+ struct work_struct bad_pages_work;
+
struct amdgpu_mm_table mm_table;
const struct amdgpu_virt_ops *ops;
struct amdgpu_vf_error_buffer vf_errors;
int amdgpu_virt_ras_telemetry_post_reset(struct amdgpu_device *adev);
bool amdgpu_virt_ras_telemetry_block_en(struct amdgpu_device *adev,
enum amdgpu_ras_block block);
+void amdgpu_virt_request_bad_pages(struct amdgpu_device *adev);
#endif
{
struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
+ struct amdgpu_reset_context reset_context = { 0 };
amdgpu_virt_fini_data_exchange(adev);
if (amdgpu_device_should_recover_gpu(adev)
&& (!amdgpu_device_has_job_running(adev) ||
adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT)) {
- struct amdgpu_reset_context reset_context;
- memset(&reset_context, 0, sizeof(reset_context));
reset_context.method = AMD_RESET_METHOD_NONE;
reset_context.reset_req_dev = adev;
}
}
+static void xgpu_ai_mailbox_bad_pages_work(struct work_struct *work)
+{
+ struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, bad_pages_work);
+ struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
+
+ if (down_read_trylock(&adev->reset_domain->sem)) {
+ amdgpu_virt_fini_data_exchange(adev);
+ amdgpu_virt_request_bad_pages(adev);
+ amdgpu_virt_init_data_exchange(adev);
+ up_read(&adev->reset_domain->sem);
+ }
+}
+
static int xgpu_ai_set_mailbox_rcv_irq(struct amdgpu_device *adev,
struct amdgpu_irq_src *src,
unsigned type,
enum idh_event event = xgpu_ai_mailbox_peek_msg(adev);
switch (event) {
- case IDH_FLR_NOTIFICATION:
+ case IDH_RAS_BAD_PAGES_NOTIFICATION:
+ xgpu_ai_mailbox_send_ack(adev);
+ if (amdgpu_sriov_runtime(adev))
+ schedule_work(&adev->virt.bad_pages_work);
+ break;
+ case IDH_FLR_NOTIFICATION:
if (amdgpu_sriov_runtime(adev))
WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain,
&adev->virt.flr_work),
"Failed to queue work! at %s",
__func__);
break;
- case IDH_QUERY_ALIVE:
- xgpu_ai_mailbox_send_ack(adev);
- break;
- /* READY_TO_ACCESS_GPU is fetched by kernel polling, IRQ can ignore
- * it byfar since that polling thread will handle it,
- * other msg like flr complete is not handled here.
- */
- case IDH_CLR_MSG_BUF:
- case IDH_FLR_NOTIFICATION_CMPL:
- case IDH_READY_TO_ACCESS_GPU:
- default:
+ case IDH_QUERY_ALIVE:
+ xgpu_ai_mailbox_send_ack(adev);
+ break;
+ /* READY_TO_ACCESS_GPU is fetched by kernel polling, IRQ can ignore
+ * it byfar since that polling thread will handle it,
+ * other msg like flr complete is not handled here.
+ */
+ case IDH_CLR_MSG_BUF:
+ case IDH_FLR_NOTIFICATION_CMPL:
+ case IDH_READY_TO_ACCESS_GPU:
+ default:
break;
}
}
INIT_WORK(&adev->virt.flr_work, xgpu_ai_mailbox_flr_work);
+ INIT_WORK(&adev->virt.bad_pages_work, xgpu_ai_mailbox_bad_pages_work);
return 0;
}
IDH_LOG_VF_ERROR = 200,
IDH_READY_TO_RESET = 201,
IDH_RAS_POISON = 202,
+ IDH_REQ_RAS_BAD_PAGES = 205,
};
enum idh_event {
IDH_RAS_POISON_READY,
IDH_PF_SOFT_FLR_NOTIFICATION,
IDH_RAS_ERROR_DETECTED,
+ IDH_RAS_BAD_PAGES_READY = 15,
+ IDH_RAS_BAD_PAGES_NOTIFICATION = 16,
IDH_TEXT_MESSAGE = 255,
};
case IDH_REQ_RAS_CPER_DUMP:
event = IDH_RAS_CPER_DUMP_READY;
break;
+ case IDH_REQ_RAS_BAD_PAGES:
+ event = IDH_RAS_BAD_PAGES_READY;
+ break;
default:
break;
}
}
}
+static void xgpu_nv_mailbox_bad_pages_work(struct work_struct *work)
+{
+ struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, bad_pages_work);
+ struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
+
+ if (down_read_trylock(&adev->reset_domain->sem)) {
+ amdgpu_virt_fini_data_exchange(adev);
+ amdgpu_virt_request_bad_pages(adev);
+ amdgpu_virt_init_data_exchange(adev);
+ up_read(&adev->reset_domain->sem);
+ }
+}
+
static int xgpu_nv_set_mailbox_rcv_irq(struct amdgpu_device *adev,
struct amdgpu_irq_src *src,
unsigned type,
enum idh_event event = xgpu_nv_mailbox_peek_msg(adev);
switch (event) {
+ case IDH_RAS_BAD_PAGES_NOTIFICATION:
+ xgpu_nv_mailbox_send_ack(adev);
+ if (amdgpu_sriov_runtime(adev))
+ schedule_work(&adev->virt.bad_pages_work);
+ break;
case IDH_FLR_NOTIFICATION:
if (amdgpu_sriov_runtime(adev))
WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain,
}
INIT_WORK(&adev->virt.flr_work, xgpu_nv_mailbox_flr_work);
+ INIT_WORK(&adev->virt.bad_pages_work, xgpu_nv_mailbox_bad_pages_work);
return 0;
}
adev, IDH_REQ_RAS_CPER_DUMP, vf_rptr_hi, vf_rptr_lo, 0);
}
+static int xgpu_nv_req_ras_bad_pages(struct amdgpu_device *adev)
+{
+ return xgpu_nv_send_access_requests(adev, IDH_REQ_RAS_BAD_PAGES);
+}
+
const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
.req_full_gpu = xgpu_nv_request_full_gpu_access,
.rel_full_gpu = xgpu_nv_release_full_gpu_access,
.rcvd_ras_intr = xgpu_nv_rcvd_ras_intr,
.req_ras_err_count = xgpu_nv_req_ras_err_count,
.req_ras_cper_dump = xgpu_nv_req_ras_cper_dump,
+ .req_bad_pages = xgpu_nv_req_ras_bad_pages,
};
IDH_RAS_POISON = 202,
IDH_REQ_RAS_ERROR_COUNT = 203,
IDH_REQ_RAS_CPER_DUMP = 204,
+ IDH_REQ_RAS_BAD_PAGES = 205,
};
enum idh_event {
IDH_RAS_ERROR_DETECTED,
IDH_RAS_ERROR_COUNT_READY = 11,
IDH_RAS_CPER_DUMP_READY = 14,
+ IDH_RAS_BAD_PAGES_READY = 15,
+ IDH_RAS_BAD_PAGES_NOTIFICATION = 16,
IDH_TEXT_MESSAGE = 255,
};