#include "xe_macros.h"
#include "xe_map.h"
#include "xe_mocs.h"
+#include "xe_module.h"
#include "xe_ring_ops_types.h"
#include "xe_sched_job.h"
#include "xe_trace.h"
#define ENGINE_STATE_SUSPENDED (1 << 5)
#define EXEC_QUEUE_STATE_RESET (1 << 6)
#define ENGINE_STATE_KILLED (1 << 7)
+#define EXEC_QUEUE_STATE_WEDGED (1 << 8)
static bool exec_queue_registered(struct xe_exec_queue *q)
{
return atomic_read(&q->guc->state) & EXEC_QUEUE_STATE_REGISTERED;
}

static void set_exec_queue_killed(struct xe_exec_queue *q)
{
atomic_or(ENGINE_STATE_KILLED, &q->guc->state);
}
-static bool exec_queue_killed_or_banned(struct xe_exec_queue *q)
+static bool exec_queue_wedged(struct xe_exec_queue *q)
{
- return exec_queue_killed(q) || exec_queue_banned(q);
+ return atomic_read(&q->guc->state) & EXEC_QUEUE_STATE_WEDGED;
+}
+
+static void set_exec_queue_wedged(struct xe_exec_queue *q)
+{
+ atomic_or(EXEC_QUEUE_STATE_WEDGED, &q->guc->state);
+}
+
+static bool exec_queue_killed_or_banned_or_wedged(struct xe_exec_queue *q)
+{
+ return exec_queue_banned(q) || (atomic_read(&q->guc->state) &
+ (EXEC_QUEUE_STATE_WEDGED | ENGINE_STATE_KILLED));
}
#ifdef CONFIG_PROVE_LOCKING
free_submit_wq(guc);
}
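+/* Drop the extra exec queue references taken in guc_submit_wedged() */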
+static void guc_submit_wedged_fini(struct drm_device *drm, void *arg)
+{
+ struct xe_guc *guc = arg;
+ struct xe_exec_queue *q;
+ unsigned long index;
+
+ xa_for_each(&guc->submission_state.exec_queue_lookup, index, q)
+ if (exec_queue_wedged(q))
+ xe_exec_queue_put(q);
+}
+
static const struct xe_exec_queue_ops guc_exec_queue_ops;
static void primelockdep(struct xe_guc *guc)
trace_xe_sched_job_run(job);
- if (!exec_queue_killed_or_banned(q) && !xe_sched_job_is_error(job)) {
+ if (!exec_queue_killed_or_banned_or_wedged(q) && !xe_sched_job_is_error(job)) {
if (!exec_queue_registered(q))
register_engine(q);
if (!lr) /* LR jobs are emitted in the exec IOCTL */
xe_sched_tdr_queue_imm(&q->guc->sched);
}
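+/*
+ * Declare the device wedged: stop GuC submission and CT communication, then
+ * take an extra reference on every exec queue so it stays around; the
+ * references are dropped again in guc_submit_wedged_fini().
+ */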
+static void guc_submit_wedged(struct xe_guc *guc)
+{
+ struct xe_exec_queue *q;
+ unsigned long index;
+ int err;
+
+ xe_device_declare_wedged(guc_to_xe(guc));
+ xe_guc_submit_reset_prepare(guc);
+ xe_guc_ct_stop(&guc->ct);
+
+ err = drmm_add_action_or_reset(&guc_to_xe(guc)->drm,
+ guc_submit_wedged_fini, guc);
+ if (err)
+ return;
+
+ mutex_lock(&guc->submission_state.lock);
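+ /* Take a reference on each queue still alive and mark it wedged */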
+ xa_for_each(&guc->submission_state.exec_queue_lookup, index, q)
+ if (xe_exec_queue_get_unless_zero(q))
+ set_exec_queue_wedged(q);
+ mutex_unlock(&guc->submission_state.lock);
+}
+
static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
{
struct xe_guc_exec_queue *ge =
struct xe_guc *guc = exec_queue_to_guc(q);
struct xe_device *xe = guc_to_xe(guc);
struct xe_gpu_scheduler *sched = &ge->sched;
+ bool wedged = xe_device_wedged(xe);
xe_assert(xe, xe_exec_queue_is_lr(q));
trace_xe_exec_queue_lr_cleanup(q);
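+ /* wedged_mode == 2: escalate this hang to a device-wide wedged state */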
+ if (!wedged && xe_modparam.wedged_mode == 2) {
+ guc_submit_wedged(exec_queue_to_guc(q));
+ wedged = true;
+ }
+
/* Kill the run_job / process_msg entry points */
xe_sched_submission_stop(sched);
* xe_guc_deregister_done_handler() which treats it as an unexpected
* state.
*/
- if (exec_queue_registered(q) && !exec_queue_destroyed(q)) {
+ if (!wedged && exec_queue_registered(q) && !exec_queue_destroyed(q)) {
struct xe_guc *guc = exec_queue_to_guc(q);
int ret;
struct xe_device *xe = guc_to_xe(exec_queue_to_guc(q));
int err = -ETIME;
int i = 0;
+ bool wedged = xe_device_wedged(xe);
/*
* TDR has fired before free job worker. Common if exec queue
trace_xe_sched_job_timedout(job);
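+ /* As in the LR cleanup path, wedged_mode == 2 wedges the whole device */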
+ if (!wedged && xe_modparam.wedged_mode == 2) {
+ guc_submit_wedged(exec_queue_to_guc(q));
+ wedged = true;
+ }
+
/* Kill the run_job entry point */
xe_sched_submission_stop(sched);
* Kernel jobs should never fail, nor should VM jobs; if they do,
* something has gone wrong and the GT needs a reset
*/
- if (q->flags & EXEC_QUEUE_FLAG_KERNEL ||
- (q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q))) {
+ if (!wedged && (q->flags & EXEC_QUEUE_FLAG_KERNEL ||
+ (q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q)))) {
if (!xe_sched_invalidate_job(job, 2)) {
xe_sched_add_pending_job(sched, job);
xe_sched_submission_start(sched);
}
/* Engine state now stable, disable scheduling if needed */
- if (exec_queue_registered(q)) {
+ if (!wedged && exec_queue_registered(q)) {
struct xe_guc *guc = exec_queue_to_guc(q);
int ret;
*/
xe_sched_add_pending_job(sched, job);
xe_sched_submission_start(sched);
+
xe_guc_exec_queue_trigger_cleanup(q);
/* Mark all outstanding jobs as bad, thus completing them */
INIT_WORK(&q->guc->fini_async, __guc_exec_queue_fini_async);
/* We must block on kernel engines so slabs are empty on driver unload */
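+ /* Wedged queues are finally released at driver unload, so block on them too */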
- if (q->flags & EXEC_QUEUE_FLAG_PERMANENT)
+ if (q->flags & EXEC_QUEUE_FLAG_PERMANENT || exec_queue_wedged(q))
__guc_exec_queue_fini_async(&q->guc->fini_async);
else
queue_work(system_wq, &q->guc->fini_async);
static bool guc_exec_queue_allowed_to_change_state(struct xe_exec_queue *q)
{
- return !exec_queue_killed_or_banned(q) && exec_queue_registered(q);
+ return !exec_queue_killed_or_banned_or_wedged(q) && exec_queue_registered(q);
}
static void __guc_exec_queue_process_msg_set_sched_props(struct xe_sched_msg *msg)
{
struct xe_sched_msg *msg = q->guc->static_msgs + STATIC_MSG_CLEANUP;
- if (!(q->flags & EXEC_QUEUE_FLAG_PERMANENT))
+ if (!(q->flags & EXEC_QUEUE_FLAG_PERMANENT) && !exec_queue_wedged(q))
guc_exec_queue_add_msg(q, msg, CLEANUP);
else
__guc_exec_queue_fini(exec_queue_to_guc(q), q);
{
struct xe_sched_msg *msg;
- if (q->sched_props.priority == priority || exec_queue_killed_or_banned(q))
+ if (q->sched_props.priority == priority ||
+ exec_queue_killed_or_banned_or_wedged(q))
return 0;
msg = kmalloc(sizeof(*msg), GFP_KERNEL);
struct xe_sched_msg *msg;
if (q->sched_props.timeslice_us == timeslice_us ||
- exec_queue_killed_or_banned(q))
+ exec_queue_killed_or_banned_or_wedged(q))
return 0;
msg = kmalloc(sizeof(*msg), GFP_KERNEL);
struct xe_sched_msg *msg;
if (q->sched_props.preempt_timeout_us == preempt_timeout_us ||
- exec_queue_killed_or_banned(q))
+ exec_queue_killed_or_banned_or_wedged(q))
return 0;
msg = kmalloc(sizeof(*msg), GFP_KERNEL);
{
struct xe_sched_msg *msg = q->guc->static_msgs + STATIC_MSG_SUSPEND;
- if (exec_queue_killed_or_banned(q) || q->guc->suspend_pending)
+ if (exec_queue_killed_or_banned_or_wedged(q) || q->guc->suspend_pending)
return -EINVAL;
q->guc->suspend_pending = true;
{
struct xe_gpu_scheduler *sched = &q->guc->sched;
- if (!exec_queue_killed_or_banned(q)) {
+ if (!exec_queue_killed_or_banned_or_wedged(q)) {
int i;
trace_xe_exec_queue_resubmit(q);