drm/xe/pf: Reset GuC VF config when unprovisioning critical resource
author Michal Wajdeczko <michal.wajdeczko@intel.com>
Wed, 29 Jan 2025 19:59:47 +0000 (20:59 +0100)
committer Michal Wajdeczko <michal.wajdeczko@intel.com>
Thu, 30 Jan 2025 16:10:41 +0000 (17:10 +0100)
GuC firmware counts received VF configuration KLVs and may start
validation of the complete VF config even if some resources were
unprovisioned in the meantime, leading to unexpected errors like:

 $ echo 1 | sudo tee /sys/kernel/debug/dri/0000:00:02.0/gt0/vf1/contexts_quota
 $ echo 0 | sudo tee /sys/kernel/debug/dri/0000:00:02.0/gt0/vf1/contexts_quota
 $ echo 1 | sudo tee /sys/kernel/debug/dri/0000:00:02.0/gt0/vf1/doorbells_quota
 $ echo 0 | sudo tee /sys/kernel/debug/dri/0000:00:02.0/gt0/vf1/doorbells_quota
 $ echo 1 | sudo tee /sys/kernel/debug/dri/0000:00:02.0/gt0/vf1/ggtt_quota
 tee: '/sys/kernel/debug/dri/0000:00:02.0/gt0/vf1/ggtt_quota': Input/output error

To mitigate this problem, trigger an explicit VF config reset after
unprovisioning any of the critical resources (GGTT, context or
doorbell IDs) that GuC is monitoring.

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Reviewed-by: Michał Winiarski <michal.winiarski@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20250129195947.764-3-michal.wajdeczko@intel.com
drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c

index c219c55f23ab3d2d18a690a9aed02cdacee9601c..b1d994d65589639c15058ca40a96f981bfd5833b 100644 (file)
@@ -338,6 +338,26 @@ static int pf_push_full_vf_config(struct xe_gt *gt, unsigned int vfid)
        return err;
 }
 
+static int pf_push_vf_cfg(struct xe_gt *gt, unsigned int vfid, bool reset)
+{
+       int err = 0;
+       /* Sanity checks: vfid must be non-zero and the caller must hold the PF master mutex. */
+       xe_gt_assert(gt, vfid);
+       lockdep_assert_held(xe_gt_sriov_pf_master_mutex(gt));
+       /* Optionally reset the VF config first; push the full config only if nothing failed so far. */
+       if (reset)
+               err = pf_send_vf_cfg_reset(gt, vfid);
+       if (!err)
+               err = pf_push_full_vf_config(gt, vfid);
+
+       return err; /* result of the push, or the reset error when the push was skipped */
+}
+
+static int pf_refresh_vf_cfg(struct xe_gt *gt, unsigned int vfid)
+{
+       return pf_push_vf_cfg(gt, vfid, true); /* reset == true: reset the VF config, then push it again */
+}
+
 static u64 pf_get_ggtt_alignment(struct xe_gt *gt)
 {
        struct xe_device *xe = gt_to_xe(gt);
@@ -434,6 +454,10 @@ static int pf_provision_vf_ggtt(struct xe_gt *gt, unsigned int vfid, u64 size)
                        return err;
 
                pf_release_vf_config_ggtt(gt, config);
+
+               err = pf_refresh_vf_cfg(gt, vfid);
+               if (unlikely(err))
+                       return err;
        }
        xe_gt_assert(gt, !xe_ggtt_node_allocated(config->ggtt_region));
 
@@ -759,6 +783,10 @@ static int pf_provision_vf_ctxs(struct xe_gt *gt, unsigned int vfid, u32 num_ctx
                        return ret;
 
                pf_release_config_ctxs(gt, config);
+
+               ret = pf_refresh_vf_cfg(gt, vfid);
+               if (unlikely(ret))
+                       return ret;
        }
 
        if (!num_ctxs)
@@ -1056,6 +1084,10 @@ static int pf_provision_vf_dbs(struct xe_gt *gt, unsigned int vfid, u32 num_dbs)
                        return ret;
 
                pf_release_config_dbs(gt, config);
+
+               ret = pf_refresh_vf_cfg(gt, vfid);
+               if (unlikely(ret))
+                       return ret;
        }
 
        if (!num_dbs)
@@ -2087,10 +2119,7 @@ int xe_gt_sriov_pf_config_push(struct xe_gt *gt, unsigned int vfid, bool refresh
        xe_gt_assert(gt, vfid);
 
        mutex_lock(xe_gt_sriov_pf_master_mutex(gt));
-       if (refresh)
-               err = pf_send_vf_cfg_reset(gt, vfid);
-       if (!err)
-               err = pf_push_full_vf_config(gt, vfid);
+       err = pf_push_vf_cfg(gt, vfid, refresh);
        mutex_unlock(xe_gt_sriov_pf_master_mutex(gt));
 
        if (unlikely(err)) {