drm/sched: fix the bug of time out calculation(v4)
author Monk Liu <Monk.Liu@amd.com>
Wed, 1 Sep 2021 00:46:46 +0000 (08:46 +0800)
committer Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Wed, 15 Sep 2021 14:21:30 +0000 (10:21 -0400)
issue:
in cleanup_job, cancel_delayed_work will cancel a TO (timeout) timer
even if its corresponding job is still running.

fix:
do not cancel the timer unconditionally in cleanup_job; instead, cancel it
only when the head job is signaled, and if there is a "next" job,
call start_timeout again for it.

v2:
further clean up the logic, and cancel the TDR timer if the signaled job
is the last one in its scheduler.

v3:
change the issue description
remove the cancel_delayed_work at the beginning of cleanup_job
restore the implementation of drm_sched_job_begin.

v4:
remove the kthread_should_park() check in the cleanup_job routine;
we should clean up the signaled job as soon as possible

TODO:
1) introduce pause/resume of the scheduler in job_timeout to serialize the
handling of the scheduler and job_timeout.
2) drop the bad job's del and insert in the scheduler thanks to the above
serialization (no race issue anymore with the serialization)

Tested-by: jingwen <jingwen.chen@amd.com>
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Link: https://patchwork.freedesktop.org/patch/msgid/1630457207-13107-1-git-send-email-Monk.Liu@amd.com
drivers/gpu/drm/scheduler/sched_main.c

index 6987d412a946c031d80aa5af19fc0bcf5eba6177..042c16b5d54a0d485a46b04fcf78bcaad6abc6cc 100644 (file)
@@ -827,15 +827,6 @@ drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
 {
        struct drm_sched_job *job, *next;
 
-       /*
-        * Don't destroy jobs while the timeout worker is running  OR thread
-        * is being parked and hence assumed to not touch pending_list
-        */
-       if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
-           !cancel_delayed_work(&sched->work_tdr)) ||
-           kthread_should_park())
-               return NULL;
-
        spin_lock(&sched->job_list_lock);
 
        job = list_first_entry_or_null(&sched->pending_list,
@@ -844,17 +835,21 @@ drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
        if (job && dma_fence_is_signaled(&job->s_fence->finished)) {
                /* remove job from pending_list */
                list_del_init(&job->list);
+
+               /* cancel this job's TO timer */
+               cancel_delayed_work(&sched->work_tdr);
                /* make the scheduled timestamp more accurate */
                next = list_first_entry_or_null(&sched->pending_list,
                                                typeof(*next), list);
-               if (next)
+
+               if (next) {
                        next->s_fence->scheduled.timestamp =
                                job->s_fence->finished.timestamp;
-
+                       /* start TO timer for next job */
+                       drm_sched_start_timeout(sched);
+               }
        } else {
                job = NULL;
-               /* queue timeout for next job */
-               drm_sched_start_timeout(sched);
        }
 
        spin_unlock(&sched->job_list_lock);
@@ -942,11 +937,8 @@ static int drm_sched_main(void *param)
                                          (entity = drm_sched_select_entity(sched))) ||
                                         kthread_should_stop());
 
-               if (cleanup_job) {
+               if (cleanup_job)
                        sched->ops->free_job(cleanup_job);
-                       /* queue timeout for next job */
-                       drm_sched_start_timeout(sched);
-               }
 
                if (!entity)
                        continue;