drm/amdgpu: Fix the warning info in mode1 reset
author    Ma Jun <Jun.Ma2@amd.com>
          Fri, 5 Jan 2024 06:05:25 +0000 (14:05 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
          Wed, 31 Jan 2024 14:40:42 +0000 (09:40 -0500)
Fix the warning below, which is printed during mode1 reset.
[  +0.000004] Call Trace:
[  +0.000004]  <TASK>
[  +0.000006]  ? show_regs+0x6e/0x80
[  +0.000011]  ? __flush_work.isra.0+0x2e8/0x390
[  +0.000005]  ? __warn+0x91/0x150
[  +0.000009]  ? __flush_work.isra.0+0x2e8/0x390
[  +0.000006]  ? report_bug+0x19d/0x1b0
[  +0.000013]  ? handle_bug+0x46/0x80
[  +0.000012]  ? exc_invalid_op+0x1d/0x80
[  +0.000011]  ? asm_exc_invalid_op+0x1f/0x30
[  +0.000014]  ? __flush_work.isra.0+0x2e8/0x390
[  +0.000007]  ? __flush_work.isra.0+0x208/0x390
[  +0.000007]  ? _prb_read_valid+0x216/0x290
[  +0.000008]  __cancel_work_timer+0x11d/0x1a0
[  +0.000007]  ? try_to_grab_pending+0xe8/0x190
[  +0.000012]  cancel_work_sync+0x14/0x20
[  +0.000008]  amddrm_sched_stop+0x3c/0x1d0 [amd_sched]
[  +0.000032]  amdgpu_device_gpu_recover+0x29a/0xe90 [amdgpu]

This warning is printed after applying the patch
"drm/sched: Convert drm scheduler to use a work queue rather than kthread".
The root cause is that the amdgpu driver tries to use an uninitialized
work_struct in struct drm_gpu_scheduler: rings that never call
drm_sched_init() (ring->no_scheduler) can still have ring->sched.ready
set by amdgpu_ring_test_helper(), so checking drm_sched_wqueue_ready()
alone lets the reset path flush a work item that was never initialized.
Fix this by guarding every such access with a common helper,
amdgpu_ring_sched_ready(), which also rejects rings that have no
scheduler.
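
For illustration only (not part of the commit), here is a standalone
userspace model of the guard this patch centralizes. The structs are
simplified stand-ins for the real amdgpu/drm types, and
drm_sched_wqueue_ready() is modeled as a direct read of sched.ready:

#include <stdbool.h>
#include <stdio.h>

struct drm_gpu_scheduler {
	bool ready;		/* set once the scheduler is initialized */
};

struct amdgpu_ring {
	const char *name;
	bool no_scheduler;	/* ring runs without a drm scheduler */
	struct drm_gpu_scheduler sched;
};

/* Mirrors the helper added to amdgpu_ring.c: only touch a ring's
 * scheduler if the ring exists, actually owns a scheduler, and that
 * scheduler finished initialization. */
static bool amdgpu_ring_sched_ready(struct amdgpu_ring *ring)
{
	if (!ring)
		return false;

	if (ring->no_scheduler || !ring->sched.ready)
		return false;

	return true;
}

int main(void)
{
	/* A no_scheduler ring can still end up with sched.ready set
	 * (amdgpu_ring_test_helper() does "ring->sched.ready = !r"), so
	 * the bare ready check alone would wave it through and the
	 * reset path would cancel_work_sync() a never-initialized
	 * work item. */
	struct amdgpu_ring kiq = { .name = "kiq",
				   .no_scheduler = true,
				   .sched.ready = true };
	struct amdgpu_ring gfx = { .name = "gfx", .sched.ready = true };

	printf("%s: %d\n", kiq.name, amdgpu_ring_sched_ready(&kiq)); /* 0 */
	printf("%s: %d\n", gfx.name, amdgpu_ring_sched_ready(&gfx)); /* 1 */
	return 0;
}

Centralizing the null/no_scheduler/ready triple in one helper also keeps
the ten call sites in the diff below from each repeating it.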

v2:
 - Rename the function to amdgpu_ring_sched_ready and move it to
   amdgpu_ring.c (Alex)
v3:
 - Fix a few more checks based on Vitaly's patch (Alex)
v4:
 - squash in fix noticed by Bert in
   https://gitlab.freedesktop.org/drm/amd/-/issues/3139

Fixes: 11b3b9f461c5 ("drm/sched: Check scheduler ready before calling timeout handling")
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Vitaly Prosyak <vitaly.prosyak@amd.com>
Signed-off-by: Ma Jun <Jun.Ma2@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h

index 899e31e3a5e81d2be343a668e295a564efee10af..3a3f3ce09f00dbe77f61455f24fed7bd0db0dec5 100644 (file)
@@ -290,7 +290,7 @@ static int suspend_resume_compute_scheduler(struct amdgpu_device *adev, bool sus
        for (i = 0; i < adev->gfx.num_compute_rings; i++) {
                struct amdgpu_ring *ring = &adev->gfx.compute_ring[i];
 
-               if (!(ring && drm_sched_wqueue_ready(&ring->sched)))
+               if (!amdgpu_ring_sched_ready(ring))
                        continue;
 
                /* stop scheduler and drain ring. */
index e485dd3357c63fd225b3fb7e3847675749f018da..1afbb2e932c6b58a9e26cbabe61370151373a4af 100644 (file)
@@ -1678,7 +1678,7 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file *m, void *unused)
        for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
                struct amdgpu_ring *ring = adev->rings[i];
 
-               if (!ring || !drm_sched_wqueue_ready(&ring->sched))
+               if (!amdgpu_ring_sched_ready(ring))
                        continue;
                drm_sched_wqueue_stop(&ring->sched);
        }
@@ -1694,7 +1694,7 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file *m, void *unused)
        for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
                struct amdgpu_ring *ring = adev->rings[i];
 
-               if (!ring || !drm_sched_wqueue_ready(&ring->sched))
+               if (!amdgpu_ring_sched_ready(ring))
                        continue;
                drm_sched_wqueue_start(&ring->sched);
        }
@@ -1916,8 +1916,8 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
 
        ring = adev->rings[val];
 
-       if (!ring || !ring->funcs->preempt_ib ||
-           !drm_sched_wqueue_ready(&ring->sched))
+       if (!amdgpu_ring_sched_ready(ring) ||
+           !ring->funcs->preempt_ib)
                return -EINVAL;
 
        /* the last preemption failed */
index 9689756bf9f5f8d27108cda07f2ee940b0224e92..6a930c98387f2c13ce704047ce8a0ba02850f2c8 100644 (file)
@@ -5036,7 +5036,7 @@ bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
        for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
                struct amdgpu_ring *ring = adev->rings[i];
 
-               if (!ring || !drm_sched_wqueue_ready(&ring->sched))
+               if (!amdgpu_ring_sched_ready(ring))
                        continue;
 
                spin_lock(&ring->sched.job_list_lock);
@@ -5175,7 +5175,7 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
        for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
                struct amdgpu_ring *ring = adev->rings[i];
 
-               if (!ring || !drm_sched_wqueue_ready(&ring->sched))
+               if (!amdgpu_ring_sched_ready(ring))
                        continue;
 
                /* Clear job fence from fence drv to avoid force_completion
@@ -5642,7 +5642,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
                for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
                        struct amdgpu_ring *ring = tmp_adev->rings[i];
 
-                       if (!ring || !drm_sched_wqueue_ready(&ring->sched))
+                       if (!amdgpu_ring_sched_ready(ring))
                                continue;
 
                        drm_sched_stop(&ring->sched, job ? &job->base : NULL);
@@ -5711,7 +5711,7 @@ skip_hw_reset:
                for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
                        struct amdgpu_ring *ring = tmp_adev->rings[i];
 
-                       if (!ring || !drm_sched_wqueue_ready(&ring->sched))
+                       if (!amdgpu_ring_sched_ready(ring))
                                continue;
 
                        drm_sched_start(&ring->sched, true);
@@ -6066,7 +6066,7 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
                for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
                        struct amdgpu_ring *ring = adev->rings[i];
 
-                       if (!ring || !drm_sched_wqueue_ready(&ring->sched))
+                       if (!amdgpu_ring_sched_ready(ring))
                                continue;
 
                        drm_sched_stop(&ring->sched, NULL);
@@ -6208,7 +6208,7 @@ void amdgpu_pci_resume(struct pci_dev *pdev)
        for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
                struct amdgpu_ring *ring = adev->rings[i];
 
-               if (!ring || !drm_sched_wqueue_ready(&ring->sched))
+               if (!amdgpu_ring_sched_ready(ring))
                        continue;
 
                drm_sched_start(&ring->sched, true);
index 45424ebf9681430fefc21bdc33d6aa2c6e5f6c91..5505d646f43aa8f963d8d8732846b00fc612a3a7 100644 (file)
@@ -635,6 +635,7 @@ int amdgpu_ring_test_helper(struct amdgpu_ring *ring)
                              ring->name);
 
        ring->sched.ready = !r;
+
        return r;
 }
 
@@ -717,3 +718,14 @@ void amdgpu_ring_ib_on_emit_de(struct amdgpu_ring *ring)
        if (ring->is_sw_ring)
                amdgpu_sw_ring_ib_mark_offset(ring, AMDGPU_MUX_OFFSET_TYPE_DE);
 }
+
+bool amdgpu_ring_sched_ready(struct amdgpu_ring *ring)
+{
+       if (!ring)
+               return false;
+
+       if (ring->no_scheduler || !drm_sched_wqueue_ready(&ring->sched))
+               return false;
+
+       return true;
+}
index bbb53720a0181d93cf9fdfd6f7721ee006699004..fe1a61eb6e4c0809c1bccd41bc89f32bcd8304f2 100644 (file)
@@ -450,5 +450,5 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
 int amdgpu_ib_pool_init(struct amdgpu_device *adev);
 void amdgpu_ib_pool_fini(struct amdgpu_device *adev);
 int amdgpu_ib_ring_tests(struct amdgpu_device *adev);
-
+bool amdgpu_ring_sched_ready(struct amdgpu_ring *ring);
 #endif