return 0;
}
+static void sob_reset_work(struct work_struct *work)
+{
+ struct hl_cs_compl *hl_cs_cmpl =
+ container_of(work, struct hl_cs_compl, sob_reset_work);
+ struct hl_device *hdev = hl_cs_cmpl->hdev;
+
+ /*
+ * Handler for the sob_reset_work item embedded in struct
+ * hl_cs_compl. It drops this CS's reference on the HW SOB, resets
+ * the SOB group for a collective wait CS, and then frees the
+ * completion object.
+ *
+ * A signal CS can get completion while the corresponding wait
+ * for signal CS is on its way to the PQ. The wait for signal CS
+ * will get stuck if the signal CS incremented the SOB to its
+ * max value and there are no pending (submitted) waits on this
+ * SOB.
+ * We do the following to avoid this situation:
+ * 1. The wait for signal CS must get a ref for the signal CS as
+ * soon as possible in cs_ioctl_signal_wait() and put it
+ * before being submitted to the PQ but after it incremented
+ * the SOB refcnt in init_signal_wait_cs().
+ * 2. Signal/Wait for signal CS will decrement the SOB refcnt
+ * here.
+ * These two measures guarantee that the wait for signal CS will
+ * reset the SOB upon completion rather than the signal CS and
+ * hence the above scenario is avoided.
+ */
+ kref_put(&hl_cs_cmpl->hw_sob->kref, hl_sob_reset);
+
+ if (hl_cs_cmpl->type == CS_TYPE_COLLECTIVE_WAIT)
+ hdev->asic_funcs->reset_sob_group(hdev,
+ hl_cs_cmpl->sob_group);
+
+ kfree(hl_cs_cmpl); /* the completion object is released by this handler */
+}
+
static void hl_fence_release(struct kref *kref)
{
struct hl_fence *fence =
hl_cs_cmpl->hw_sob->sob_id,
hl_cs_cmpl->sob_val);
- /*
- * A signal CS can get completion while the corresponding wait
- * for signal CS is on its way to the PQ. The wait for signal CS
- * will get stuck if the signal CS incremented the SOB to its
- * max value and there are no pending (submitted) waits on this
- * SOB.
- * We do the following to void this situation:
- * 1. The wait for signal CS must get a ref for the signal CS as
- * soon as possible in cs_ioctl_signal_wait() and put it
- * before being submitted to the PQ but after it incremented
- * the SOB refcnt in init_signal_wait_cs().
- * 2. Signal/Wait for signal CS will decrement the SOB refcnt
- * here.
- * These two measures guarantee that the wait for signal CS will
- * reset the SOB upon completion rather than the signal CS and
- * hence the above scenario is avoided.
- */
- kref_put(&hl_cs_cmpl->hw_sob->kref, hl_sob_reset);
+ queue_work(hdev->sob_reset_wq, &hl_cs_cmpl->sob_reset_work);
- if (hl_cs_cmpl->type == CS_TYPE_COLLECTIVE_WAIT)
- hdev->asic_funcs->reset_sob_group(hdev,
- hl_cs_cmpl->sob_group);
+ return;
}
free:
goto free_cs;
}
+ cs->jobs_in_queue_cnt = kcalloc(hdev->asic_prop.max_queues,
+ sizeof(*cs->jobs_in_queue_cnt), GFP_ATOMIC);
+ if (!cs->jobs_in_queue_cnt)
+ cs->jobs_in_queue_cnt = kcalloc(hdev->asic_prop.max_queues,
+ sizeof(*cs->jobs_in_queue_cnt), GFP_KERNEL);
+
+ if (!cs->jobs_in_queue_cnt) {
+ atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
+ atomic64_inc(&cntr->out_of_mem_drop_cnt);
+ rc = -ENOMEM;
+ goto free_cs_cmpl;
+ }
+
cs_cmpl->hdev = hdev;
cs_cmpl->type = cs->type;
spin_lock_init(&cs_cmpl->lock);
+ INIT_WORK(&cs_cmpl->sob_reset_work, sob_reset_work);
cs->fence = &cs_cmpl->base_fence;
spin_lock(&ctx->cs_lock);
goto free_fence;
}
- cs->jobs_in_queue_cnt = kcalloc(hdev->asic_prop.max_queues,
- sizeof(*cs->jobs_in_queue_cnt), GFP_ATOMIC);
- if (!cs->jobs_in_queue_cnt)
- cs->jobs_in_queue_cnt = kcalloc(hdev->asic_prop.max_queues,
- sizeof(*cs->jobs_in_queue_cnt), GFP_KERNEL);
-
- if (!cs->jobs_in_queue_cnt) {
- atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
- atomic64_inc(&cntr->out_of_mem_drop_cnt);
- rc = -ENOMEM;
- goto free_fence;
- }
-
/* init hl_fence */
hl_fence_init(&cs_cmpl->base_fence, cs_cmpl->cs_seq);
free_fence:
spin_unlock(&ctx->cs_lock);
+ kfree(cs->jobs_in_queue_cnt);
+free_cs_cmpl:
kfree(cs_cmpl);
free_cs:
kfree(cs);
int i;
struct hl_cs *cs, *tmp;
+ flush_workqueue(hdev->sob_reset_wq);
+
/* flush all completions before iterating over the CS mirror list in
* order to avoid a race with the release functions
*/
goto free_cq_wq;
}
+ hdev->sob_reset_wq = alloc_workqueue("hl-sob-reset", WQ_UNBOUND, 0);
+ if (!hdev->sob_reset_wq) {
+ dev_err(hdev->dev,
+ "Failed to allocate SOB reset workqueue\n");
+ rc = -ENOMEM;
+ goto free_eq_wq;
+ }
+
hdev->hl_chip_info = kzalloc(sizeof(struct hwmon_chip_info),
GFP_KERNEL);
if (!hdev->hl_chip_info) {
rc = -ENOMEM;
- goto free_eq_wq;
+ goto free_sob_reset_wq;
}
hdev->idle_busy_ts_arr = kmalloc_array(HL_IDLE_BUSY_TS_ARR_SIZE,
kfree(hdev->idle_busy_ts_arr);
free_chip_info:
kfree(hdev->hl_chip_info);
+free_sob_reset_wq:
+ destroy_workqueue(hdev->sob_reset_wq);
free_eq_wq:
destroy_workqueue(hdev->eq_wq);
free_cq_wq:
kfree(hdev->idle_busy_ts_arr);
kfree(hdev->hl_chip_info);
+ destroy_workqueue(hdev->sob_reset_wq);
destroy_workqueue(hdev->eq_wq);
destroy_workqueue(hdev->device_reset_work.wq);
/**
* struct hl_cs_compl - command submission completion object.
+ * @sob_reset_work: work item used to run the SOB reset flow.
* @base_fence: hl fence object.
* @lock: spinlock to protect fence.
* @hdev: habanalabs device structure.
* @sob_group: the SOB group that is used in this collective wait CS.
*/
struct hl_cs_compl {
+ struct work_struct sob_reset_work;
struct hl_fence base_fence;
spinlock_t lock;
struct hl_device *hdev;
* @cq_wq: work queues of completion queues for executing work in process
* context.
* @eq_wq: work queue of event queue for executing work in process context.
+ * @sob_reset_wq: work queue for SOB reset executions.
* @kernel_ctx: Kernel driver context structure.
* @kernel_queues: array of hl_hw_queue.
* @cs_mirror_list: CS mirror list for TDR.
struct hl_user_interrupt common_user_interrupt;
struct workqueue_struct **cq_wq;
struct workqueue_struct *eq_wq;
+ struct workqueue_struct *sob_reset_wq;
struct hl_ctx *kernel_ctx;
struct hl_hw_queue *kernel_queues;
struct list_head cs_mirror_list;