drm/amdgpu: Optimize gfx v9 GPU page fault handling
authorPhilip Yang <Philip.Yang@amd.com>
Wed, 13 Nov 2024 03:07:33 +0000 (22:07 -0500)
committerAlex Deucher <alexander.deucher@amd.com>
Wed, 18 Dec 2024 17:39:07 +0000 (12:39 -0500)
After a GPU page fault, many page fault interrupts are generated within a
short period, even with the CAM filter enabled, because the fault addresses
differ. Each page fault is copied to the KFD ih fifo so that the KFD
interrupt worker can send an event to user space; this can overflow the
KFD ih fifo while other processes generate events at the same time.

The KFD process is aborted after a GPU page fault, so only one GPU page
fault interrupt needs to be sent to the KFD ih fifo to deliver the memory
exception event to user space.

Increase the KFD ih fifo size to 2 times the IH primary ring size, to
handle bursts of events.

This patch handles the gfx v9 path and covers the retry on/off and CAM
filter on/off cases.

Signed-off-by: Philip Yang <Philip.Yang@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
drivers/gpu/drm/amd/amdkfd/kfd_device.c
drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c
drivers/gpu/drm/amd/amdkfd/kfd_priv.h

index 4b80ad860639c7f2c3136c8124f04854621b69b5..8af67f18500a7486bdc48a5647fbb520736eeaa9 100644 (file)
@@ -433,6 +433,9 @@ void kgd2kfd_unlock_kfd(void);
 int kgd2kfd_start_sched(struct kfd_dev *kfd, uint32_t node_id);
 int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id);
 bool kgd2kfd_compute_active(struct kfd_dev *kfd, uint32_t node_id);
+bool kgd2kfd_vmfault_fast_path(struct amdgpu_device *adev, struct amdgpu_iv_entry *entry,
+                              bool retry_fault);
+
 #else
 static inline int kgd2kfd_init(void)
 {
@@ -518,5 +521,12 @@ static inline bool kgd2kfd_compute_active(struct kfd_dev *kfd, uint32_t node_id)
 {
        return false;
 }
+
+static inline bool kgd2kfd_vmfault_fast_path(struct amdgpu_device *adev, struct amdgpu_iv_entry *entry,
+                                     bool retry_fault)
+{
+       return false; /* stub: never take the fast path, caller uses the normal path */
+}
+
 #endif
 #endif /* AMDGPU_AMDKFD_H_INCLUDED */
index 48f1b9cf1f8824dd50497d030d98669a0513a4e3..291549765c38c5b18f92d477d9038bd044192f0c 100644 (file)
@@ -623,6 +623,9 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
                }
        }
 
+       if (kgd2kfd_vmfault_fast_path(adev, entry, retry_fault))
+               return 1;
+
        if (!printk_ratelimit())
                return 0;
 
index ac0fdaa1ea23eb56363219d498b0ee95a1cddd71..9b49563f2c42a2e483a19f5fc70515916f6d83aa 100644 (file)
@@ -1521,6 +1521,73 @@ bool kgd2kfd_compute_active(struct kfd_dev *kfd, uint32_t node_id)
        return kfd_compute_active(node);
 }
 
+/**
+ * kgd2kfd_vmfault_fast_path() - KFD vm page fault interrupt handling fast path for gmc v9
+ * @adev: amdgpu device
+ * @entry: vm fault interrupt vector
+ * @retry_fault: true if this is a retry fault
+ *
+ * retry fault -
+ *    with CAM enabled, adev primary ring
+ *                           |  gmc_v9_0_process_interrupt()
+ *                      adev soft_ring
+ *                           |  gmc_v9_0_process_interrupt() worker failed to recover page fault
+ *                      KFD node ih_fifo
+ *                           |  KFD interrupt_wq worker
+ *                      kfd_signal_vm_fault_event
+ *
+ *    without CAM,      adev primary ring1
+ *                           |  gmc_v9_0_process_interrupt worker failed to recover page fault
+ *                      KFD node ih_fifo
+ *                           |  KFD interrupt_wq worker
+ *                      kfd_signal_vm_fault_event
+ *
+ * no-retry fault -
+ *                      adev primary ring
+ *                           |  gmc_v9_0_process_interrupt()
+ *                      KFD node ih_fifo
+ *                           |  KFD interrupt_wq worker
+ *                      kfd_signal_vm_fault_event
+ *
+ * fast path - After kfd_signal_vm_fault_event, gmc_v9_0_process_interrupt drops further page
+ *            faults of the same process and does not copy them to the KFD node ih_fifo.
+ *            With gdb debugger enabled, the retry fault must be converted to a no-retry fault
+ *            for the debugger, so the fast path cannot be used.
+ *
+ * Return:
+ *   true - fast path: fault is dropped (also when no KFD process matches entry->pasid)
+ *   false - use normal path to handle it
+ */
+bool kgd2kfd_vmfault_fast_path(struct amdgpu_device *adev, struct amdgpu_iv_entry *entry,
+                              bool retry_fault)
+{
+       struct kfd_process *p;
+       u32 cam_index;
+
+       if (entry->ih == &adev->irq.ih_soft || entry->ih == &adev->irq.ih1) {
+               p = kfd_lookup_process_by_pasid(entry->pasid);
+               if (!p)
+                       return true;
+
+               if (p->gpu_page_fault && !p->debug_trap_enabled) {
+                       if (retry_fault && adev->irq.retry_cam_enabled) {
+                               cam_index = entry->src_data[2] & 0x3ff;
+                               WDOORBELL32(adev->irq.retry_cam_doorbell_index, cam_index);
+                       }
+
+                       kfd_unref_process(p);
+                       return true;
+               }
+
+               /*
+                * First page fault of this process: set the flag, normal path signals user space
+                */
+               p->gpu_page_fault = true;
+               kfd_unref_process(p);
+       }
+       return false;
+}
+
 #if defined(CONFIG_DEBUG_FS)
 
 /* This function will send a package to HIQ to hang the HWS
index e7412de9a0ac592881ae593451168f7e2e8317ac..8e00800f3207e175c62cdda216be2e620463f375 100644 (file)
@@ -46,7 +46,7 @@
 #include <linux/kfifo.h>
 #include "kfd_priv.h"
 
-#define KFD_IH_NUM_ENTRIES 8192
+#define KFD_IH_NUM_ENTRIES 16384
 
 static void interrupt_wq(struct work_struct *);
 
index 800e4ae5b5735663ad0b99b8819080cf2290fefd..e529fdc1b422edeb2a03a021cd3b05ba34a1f659 100644 (file)
@@ -1003,6 +1003,9 @@ struct kfd_process {
        struct semaphore runtime_enable_sema;
        bool is_runtime_retry;
        struct kfd_runtime_info runtime_info;
+
+       /* if gpu page fault sent to KFD */
+       bool gpu_page_fault;
 };
 
 #define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */