.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
.hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr,
.hqd_reset = kgd_gfx_v9_hqd_reset,
+ .hqd_sdma_get_doorbell = kgd_gfx_v9_hqd_sdma_get_doorbell
};
.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
.hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr,
- .hqd_reset = kgd_gfx_v9_hqd_reset
+ .hqd_reset = kgd_gfx_v9_hqd_reset,
+ .hqd_sdma_get_doorbell = kgd_gfx_v9_hqd_sdma_get_doorbell
};
return 0;
}
+static uint32_t kgd_gfx_v9_4_3_hqd_sdma_get_doorbell(struct amdgpu_device *adev,
+ int engine, int queue)
+{
+ uint32_t reg_offset = get_sdma_rlc_reg_offset(adev, engine, queue);
+ uint32_t status = RREG32(regSDMA_RLC0_CONTEXT_STATUS + reg_offset);
+ uint32_t doorbell_off = RREG32(regSDMA_RLC0_DOORBELL_OFFSET + reg_offset);
+ bool is_active = !!REG_GET_FIELD(status, SDMA_RLC0_CONTEXT_STATUS, SELECTED);
+
+ return is_active ? doorbell_off >> 2 : 0;
+}
+
const struct kfd2kgd_calls gc_9_4_3_kfd2kgd = {
.program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
.set_pasid_vmid_mapping = kgd_gfx_v9_4_3_set_pasid_vmid_mapping,
.set_address_watch = kgd_gfx_v9_4_3_set_address_watch,
.clear_address_watch = kgd_gfx_v9_4_3_clear_address_watch,
.hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr,
- .hqd_reset = kgd_gfx_v9_hqd_reset
+ .hqd_reset = kgd_gfx_v9_hqd_reset,
+ .hqd_sdma_get_doorbell = kgd_gfx_v9_4_3_hqd_sdma_get_doorbell
};
return 0;
}
+uint32_t kgd_gfx_v10_hqd_sdma_get_doorbell(struct amdgpu_device *adev,
+ int engine, int queue)
+{
+ return 0;
+}
+
const struct kfd2kgd_calls gfx_v10_kfd2kgd = {
.program_sh_mem_settings = kgd_program_sh_mem_settings,
.set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping,
.build_grace_period_packet_info = kgd_gfx_v10_build_grace_period_packet_info,
.program_trap_handler_settings = program_trap_handler_settings,
.hqd_get_pq_addr = kgd_gfx_v10_hqd_get_pq_addr,
- .hqd_reset = kgd_gfx_v10_hqd_reset
+ .hqd_reset = kgd_gfx_v10_hqd_reset,
+ .hqd_sdma_get_doorbell = kgd_gfx_v10_hqd_sdma_get_doorbell
};
uint32_t queue_id,
uint32_t inst,
unsigned int utimeout);
+uint32_t kgd_gfx_v10_hqd_sdma_get_doorbell(struct amdgpu_device *adev,
+ int engine, int queue);
.set_address_watch = kgd_gfx_v10_set_address_watch,
.clear_address_watch = kgd_gfx_v10_clear_address_watch,
.hqd_get_pq_addr = kgd_gfx_v10_hqd_get_pq_addr,
- .hqd_reset = kgd_gfx_v10_hqd_reset
+ .hqd_reset = kgd_gfx_v10_hqd_reset,
+ .hqd_sdma_get_doorbell = kgd_gfx_v10_hqd_sdma_get_doorbell
};
return 0;
}
+static uint32_t kgd_gfx_v11_hqd_sdma_get_doorbell(struct amdgpu_device *adev,
+ int engine, int queue)
+{
+ return 0;
+}
+
const struct kfd2kgd_calls gfx_v11_kfd2kgd = {
.program_sh_mem_settings = program_sh_mem_settings_v11,
.set_pasid_vmid_mapping = set_pasid_vmid_mapping_v11,
.set_address_watch = kgd_gfx_v11_set_address_watch,
.clear_address_watch = kgd_gfx_v11_clear_address_watch,
.hqd_get_pq_addr = kgd_gfx_v11_hqd_get_pq_addr,
- .hqd_reset = kgd_gfx_v11_hqd_reset
+ .hqd_reset = kgd_gfx_v11_hqd_reset,
+ .hqd_sdma_get_doorbell = kgd_gfx_v11_hqd_sdma_get_doorbell
};
return 0;
}
+static uint32_t kgd_gfx_v12_hqd_sdma_get_doorbell(struct amdgpu_device *adev,
+ int engine, int queue)
+{
+ return 0;
+}
+
const struct kfd2kgd_calls gfx_v12_kfd2kgd = {
.init_interrupts = init_interrupts_v12,
.hqd_dump = hqd_dump_v12,
.set_wave_launch_mode = kgd_gfx_v12_set_wave_launch_mode,
.set_address_watch = kgd_gfx_v12_set_address_watch,
.clear_address_watch = kgd_gfx_v12_clear_address_watch,
+ .hqd_sdma_get_doorbell = kgd_gfx_v12_hqd_sdma_get_doorbell
};
uint32_t low, high;
uint64_t queue_addr = 0;
- if (!amdgpu_gpu_recovery)
- return 0;
-
kgd_gfx_v9_acquire_queue(adev, pipe_id, queue_id, inst);
amdgpu_gfx_rlc_enter_safe_mode(adev, inst);
uint32_t low, high, pipe_reset_data = 0;
uint64_t queue_addr = 0;
- if (!amdgpu_gpu_recovery)
- return 0;
-
kgd_gfx_v9_acquire_queue(adev, pipe_id, queue_id, inst);
amdgpu_gfx_rlc_enter_safe_mode(adev, inst);
return queue_addr;
}
+uint32_t kgd_gfx_v9_hqd_sdma_get_doorbell(struct amdgpu_device *adev,
+ int engine, int queue)
+
+{
+ return 0;
+}
+
const struct kfd2kgd_calls gfx_v9_kfd2kgd = {
.program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
.set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping,
.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
.hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr,
- .hqd_reset = kgd_gfx_v9_hqd_reset
+ .hqd_reset = kgd_gfx_v9_hqd_reset,
+ .hqd_sdma_get_doorbell = kgd_gfx_v9_hqd_sdma_get_doorbell
};
uint32_t queue_id,
uint32_t inst,
unsigned int utimeout);
+uint32_t kgd_gfx_v9_hqd_sdma_get_doorbell(struct amdgpu_device *adev,
+ int engine, int queue);
#include "kfd_kernel_queue.h"
#include "amdgpu_amdkfd.h"
#include "amdgpu_reset.h"
+#include "amdgpu_sdma.h"
#include "mes_v11_api_def.h"
#include "kfd_debug.h"
static int allocate_sdma_queue(struct device_queue_manager *dqm,
struct queue *q, const uint32_t *restore_sdma_id);
+static int reset_queues_on_hws_hang(struct device_queue_manager *dqm, bool is_sdma);
+
static inline
enum KFD_MQD_TYPE get_mqd_type_from_queue_type(enum kfd_queue_type type)
{
return NULL;
}
-/* only for compute queue */
-static int reset_queues_on_hws_hang(struct device_queue_manager *dqm)
+static int reset_hung_queues(struct device_queue_manager *dqm)
{
int r = 0, reset_count = 0, i;
return r;
}
+static bool sdma_has_hang(struct device_queue_manager *dqm)
+{
+ int engine_start = dqm->dev->node_id * get_num_all_sdma_engines(dqm);
+ int engine_end = engine_start + get_num_all_sdma_engines(dqm);
+ int num_queues_per_eng = dqm->dev->kfd->device_info.num_sdma_queues_per_engine;
+ int i, j;
+
+ for (i = engine_start; i < engine_end; i++) {
+ for (j = 0; j < num_queues_per_eng; j++) {
+ if (!dqm->dev->kfd2kgd->hqd_sdma_get_doorbell(dqm->dev->adev, i, j))
+ continue;
+
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static bool set_sdma_queue_as_reset(struct device_queue_manager *dqm,
+ uint32_t doorbell_off)
+{
+ struct device_process_node *cur;
+ struct qcm_process_device *qpd;
+ struct queue *q;
+
+ list_for_each_entry(cur, &dqm->queues, list) {
+ qpd = cur->qpd;
+ list_for_each_entry(q, &qpd->queues_list, list) {
+ if ((q->properties.type == KFD_QUEUE_TYPE_SDMA ||
+ q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) &&
+ q->properties.doorbell_off == doorbell_off) {
+ set_queue_as_reset(dqm, q, qpd);
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+static int reset_hung_queues_sdma(struct device_queue_manager *dqm)
+{
+ int engine_start = dqm->dev->node_id * get_num_all_sdma_engines(dqm);
+ int engine_end = engine_start + get_num_all_sdma_engines(dqm);
+ int num_queues_per_eng = dqm->dev->kfd->device_info.num_sdma_queues_per_engine;
+ int r = 0, i, j;
+
+ if (dqm->is_hws_hang)
+ return -EIO;
+
+ /* Scan for hung HW queues and reset engine. */
+ dqm->detect_hang_count = 0;
+ for (i = engine_start; i < engine_end; i++) {
+ for (j = 0; j < num_queues_per_eng; j++) {
+ uint32_t doorbell_off =
+ dqm->dev->kfd2kgd->hqd_sdma_get_doorbell(dqm->dev->adev, i, j);
+
+ if (!doorbell_off)
+ continue;
+
+ /* Reset engine and check. */
+ if (amdgpu_sdma_reset_engine(dqm->dev->adev, i, false) ||
+ dqm->dev->kfd2kgd->hqd_sdma_get_doorbell(dqm->dev->adev, i, j) ||
+ !set_sdma_queue_as_reset(dqm, doorbell_off)) {
+ r = -ENOTRECOVERABLE;
+ goto reset_fail;
+ }
+
+ /* Should only expect one queue active per engine */
+ dqm->detect_hang_count++;
+ break;
+ }
+ }
+
+ /* Signal process reset */
+ if (dqm->detect_hang_count)
+ kfd_signal_reset_event(dqm->dev);
+ else
+ r = -ENOTRECOVERABLE;
+
+reset_fail:
+ dqm->detect_hang_count = 0;
+
+ return r;
+}
+
+static int reset_queues_on_hws_hang(struct device_queue_manager *dqm, bool is_sdma)
+{
+ while (halt_if_hws_hang)
+ schedule();
+
+ if (!amdgpu_gpu_recovery)
+ return -ENOTRECOVERABLE;
+
+ return is_sdma ? reset_hung_queues_sdma(dqm) : reset_hung_queues(dqm);
+}
+
/* dqm->lock mutex has to be locked before calling this function */
static int unmap_queues_cpsch(struct device_queue_manager *dqm,
enum kfd_unmap_queues_filter filter,
* check those fields
*/
mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ];
- if (mqd_mgr->check_preemption_failed(mqd_mgr, dqm->packet_mgr.priv_queue->queue->mqd)) {
- while (halt_if_hws_hang)
- schedule();
- if (reset_queues_on_hws_hang(dqm)) {
- dqm->is_hws_hang = true;
- kfd_hws_hang(dqm);
- retval = -ETIME;
- goto out;
- }
- }
+ if (mqd_mgr->check_preemption_failed(mqd_mgr, dqm->packet_mgr.priv_queue->queue->mqd) &&
+ reset_queues_on_hws_hang(dqm, false))
+ goto reset_fail;
+
+ /* Check for SDMA hang and attempt SDMA reset */
+ if (sdma_has_hang(dqm) && reset_queues_on_hws_hang(dqm, true))
+ goto reset_fail;
/* We need to reset the grace period value for this device */
if (grace_period != USE_DEFAULT_GRACE_PERIOD) {
pm_release_ib(&dqm->packet_mgr);
dqm->active_runlist = false;
-
out:
up_read(&dqm->dev->adev->reset_domain->sem);
return retval;
+
+reset_fail:
+ dqm->is_hws_hang = true;
+ kfd_hws_hang(dqm);
+ up_read(&dqm->dev->adev->reset_domain->sem);
+ return -ETIME;
}
/* only for compute queue */
uint64_t (*hqd_reset)(struct amdgpu_device *adev,
uint32_t pipe_id, uint32_t queue_id,
uint32_t inst, unsigned int utimeout);
+ uint32_t (*hqd_sdma_get_doorbell)(struct amdgpu_device *adev,
+ int engine, int queue);
};
#endif /* KGD_KFD_INTERFACE_H_INCLUDED */