habanalabs: reset after device is actually released
author Oded Gabbay <ogabbay@kernel.org>
Tue, 16 Feb 2021 20:34:24 +0000 (22:34 +0200)
committer Oded Gabbay <ogabbay@kernel.org>
Fri, 9 Apr 2021 11:09:22 +0000 (14:09 +0300)
The device is actually released only after the refcnt of the hpriv
structure is 0, which means all its contexts were closed.

If we reset the device while a context is still open, unexpected
behavior and crashes are possible. For example, if the process has a
mapping of a register block that is currently being reset, and the
process reads from or writes to that block during the reset, the device
can get stuck.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
drivers/misc/habanalabs/common/device.c

index c74bdf4ae6aa8fb9946f59aa7a97aee08c68718b..cba23e5f1bb361e6d16f3832dbd38219d514e273 100644 (file)
@@ -70,6 +70,22 @@ static void hpriv_release(struct kref *ref)
        mutex_unlock(&hdev->fpriv_list_lock);
 
        kfree(hpriv);
+
+       if (hdev->reset_upon_device_release) {
+               u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
+
+               /* We try soft reset first */
+               hl_device_reset(hdev, false, false);
+
+               /* If device is not idle perform hard reset */
+               if (!hdev->asic_funcs->is_device_idle(hdev, idle_mask,
+                               HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL)) {
+                       dev_info(hdev->dev,
+                               "device is not idle (mask %#llx %#llx) after soft reset, performing hard reset",
+                               idle_mask[0], idle_mask[1]);
+                       hl_device_reset(hdev, true, false);
+               }
+       }
 }
 
 void hl_hpriv_get(struct hl_fpriv *hpriv)
@@ -106,22 +122,6 @@ static int hl_device_release(struct inode *inode, struct file *filp)
        hl_cb_mgr_fini(hdev, &hpriv->cb_mgr);
        hl_ctx_mgr_fini(hdev, &hpriv->ctx_mgr);
 
-       if (hdev->reset_upon_device_release) {
-               u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
-
-               /* We try soft reset first */
-               hl_device_reset(hdev, false, false);
-
-               /* If device is not idle perform hard reset */
-               if (!hdev->asic_funcs->is_device_idle(hdev, idle_mask,
-                               HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL)) {
-                       dev_info(hdev->dev,
-                               "device is not idle (mask %#llx %#llx) after soft reset, performing hard reset",
-                               idle_mask[0], idle_mask[1]);
-                       hl_device_reset(hdev, true, false);
-               }
-       }
-
        hl_hpriv_put(hpriv);
 
        return 0;