drm/amdgpu: Create an option to disable soft recovery
author André Almeida <andrealmeid@igalia.com>
Mon, 11 Sep 2023 17:12:55 +0000 (14:12 -0300)
committer Alex Deucher <alexander.deucher@amd.com>
Mon, 11 Sep 2023 21:22:23 +0000 (17:22 -0400)
Create a module option to disable soft recoveries on amdgpu, making
every recovery go through the device reset path. This option makes it
easier to force device resets for testing and debugging purposes.

Signed-off-by: André Almeida <andrealmeid@igalia.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Hamza Mahfooz <hamza.mahfooz@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu.h
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c

index eaebd28845034b31426e4007e7cb8764d03260f1..62bbfdd502af3059e1b0219090caefbb7221c056 100644 (file)
@@ -1102,6 +1102,7 @@ struct amdgpu_device {
        /* Debug */
        bool                            debug_vm;
        bool                            debug_largebar;
+       bool                            debug_disable_soft_recovery;
 };
 
 static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
index 745174d196d679f4e6b02356eb6d726361c749c7..f735e91eef5cc757c4a4a1e52fae448b9297f6b3 100644 (file)
 enum AMDGPU_DEBUG_MASK {
        AMDGPU_DEBUG_VM = BIT(0),
        AMDGPU_DEBUG_LARGEBAR = BIT(1),
+       AMDGPU_DEBUG_DISABLE_GPU_SOFT_RECOVERY = BIT(2),
 };
 
 unsigned int amdgpu_vram_limit = UINT_MAX;
@@ -945,6 +946,7 @@ MODULE_PARM_DESC(enforce_isolation, "enforce process isolation between graphics
  * - 0x2: Enable simulating large-bar capability on non-large bar system. This
  *   limits the VRAM size reported to ROCm applications to the visible
  *   size, usually 256MB.
+ * - 0x4: Disable GPU soft recovery, always do a full reset
  */
 MODULE_PARM_DESC(debug_mask, "debug options for amdgpu, disabled by default");
 module_param_named(debug_mask, amdgpu_debug_mask, uint, 0444);
@@ -2064,6 +2066,11 @@ static void amdgpu_init_debug_options(struct amdgpu_device *adev)
                pr_info("debug: enabled simulating large-bar capability on non-large bar system\n");
                adev->debug_largebar = true;
        }
+
+       if (amdgpu_debug_mask & AMDGPU_DEBUG_DISABLE_GPU_SOFT_RECOVERY) {
+               pr_info("debug: soft reset for GPU recovery disabled\n");
+               adev->debug_disable_soft_recovery = true;
+       }
 }
 
 static int amdgpu_pci_probe(struct pci_dev *pdev,
index 80d6e132e4095d6b30811dca266bdfa88f77d240..6a80d3ec887e98972d84e8c1f75b6fdba5a86143 100644 (file)
@@ -434,8 +434,12 @@ bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid,
                               struct dma_fence *fence)
 {
        unsigned long flags;
+       ktime_t deadline;
 
-       ktime_t deadline = ktime_add_us(ktime_get(), 10000);
+       if (unlikely(ring->adev->debug_disable_soft_recovery))
+               return false;
+
+       deadline = ktime_add_us(ktime_get(), 10000);
 
        if (amdgpu_sriov_vf(ring->adev) || !ring->funcs->soft_recovery || !fence)
                return false;