drm/amdgpu/gfx10: Add cleaner shader for GFX10.1.10

author Srinivasan Shanmugam <srinivasan.shanmugam@amd.com>

Tue, 21 Jan 2025 07:02:07 +0000 (12:32 +0530)

committer Alex Deucher <alexander.deucher@amd.com>

Thu, 13 Feb 2025 02:02:59 +0000 (21:02 -0500)
author Srinivasan Shanmugam <srinivasan.shanmugam@amd.com>
Tue, 21 Jan 2025 07:02:07 +0000 (12:32 +0530)
committer Alex Deucher <alexander.deucher@amd.com>
Thu, 13 Feb 2025 02:02:59 +0000 (21:02 -0500)
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c

index 4b5e65affb8152decefc1b19a667fc8413974403..1878c83ff7e3da95b2dd3a344f23bc7f0d6a83ae 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -4794,6 +4794,20 @@ static int gfx_v10_0_sw_init(struct amdgpu_ip_block *ip_block)
                 break;
         }
         switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
+       case IP_VERSION(10, 1, 10):
+               adev->gfx.cleaner_shader_ptr = gfx_10_1_10_cleaner_shader_hex;
+               adev->gfx.cleaner_shader_size = sizeof(gfx_10_1_10_cleaner_shader_hex);
+               if (adev->gfx.me_fw_version >= 101 &&
+                   adev->gfx.pfp_fw_version  >= 158 &&
+                   adev->gfx.mec_fw_version >= 152) {
+                       adev->gfx.enable_cleaner_shader = true;
+                       r = amdgpu_gfx_cleaner_shader_sw_init(adev, adev->gfx.cleaner_shader_size);
+                       if (r) {
+                               adev->gfx.enable_cleaner_shader = false;
+                               dev_err(adev->dev, "Failed to initialize cleaner shader\n");
+                       }
+               }
+               break;
         case IP_VERSION(10, 3, 0):
         case IP_VERSION(10, 3, 2):
         case IP_VERSION(10, 3, 4):
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0_cleaner_shader.h b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0_cleaner_shader.h

index 663c2572d440a1103586ff5c99af1d8e7364a39e..5255378af53c0a2d8941aea9226996542c356a24 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0_cleaner_shader.h
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0_cleaner_shader.h
@@ -21,6 +21,41 @@
   * OTHER DEALINGS IN THE SOFTWARE.
   */
  
+/* Define the cleaner shader gfx_10_1_10 */
+static const u32 gfx_10_1_10_cleaner_shader_hex[] = {
+       0xb0804004, 0xbf8a0000,
+       0xbf068100, 0xbf840023,
+       0xbe8203b8, 0xbefc0380,
+       0x7e008480, 0x7e028480,
+       0x7e048480, 0x7e068480,
+       0x7e088480, 0x7e0a8480,
+       0x7e0c8480, 0x7e0e8480,
+       0xbefc0302, 0x80828802,
+       0xbf84fff5, 0xbe8203ff,
+       0x80000000, 0x87020102,
+       0xbf840012, 0xbefe03c1,
+       0xbeff03c1, 0xd7650001,
+       0x0001007f, 0xd7660001,
+       0x0002027e, 0x16020288,
+       0xbe8203bf, 0xbefc03c1,
+       0xd9382000, 0x00020201,
+       0xd9386040, 0x00040401,
+       0xd70f6a01, 0x000202ff,
+       0x00000400, 0x80828102,
+       0xbf84fff7, 0xbefc03ff,
+       0x00000068, 0xbe803080,
+       0xbe813080, 0xbe823080,
+       0xbe833080, 0x80fc847c,
+       0xbf84fffa, 0xbeea0480,
+       0xbeec0480, 0xbeee0480,
+       0xbef00480, 0xbef20480,
+       0xbef40480, 0xbef60480,
+       0xbef80480, 0xbefa0480,
+       0xbf810000, 0xbf9f0000,
+       0xbf9f0000, 0xbf9f0000,
+       0xbf9f0000, 0xbf9f0000,
+};
+
  /* Define the cleaner shader gfx_10_3_0 */
  static const u32 gfx_10_3_0_cleaner_shader_hex[] = {
         0xb0804004, 0xbf8a0000,
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_1_10_cleaner_shader.asm b/drivers/gpu/drm/amd/amdgpu/gfx_v10_1_10_cleaner_shader.asm

new file mode 100644 (file)

index 0000000..9ba3359
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_1_10_cleaner_shader.asm
@@ -0,0 +1,126 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+// This shader cleans LDS, SGPRs and VGPRs. It is the first 64 dwords (256 bytes) of the 256-dword cleaner shader.
+
+// GFX10.1 : Clear SGPRs, VGPRs and LDS
+//   Launch 32 waves per CU (16 per SIMD) as a workgroup (threadgroup) to fill every wave slot
+//   Waves are "wave32" and have 64 VGPRs each, which uses all 1024 VGPRs per SIMD
+//   Waves are launched in "CU" mode, and the workgroup shares 64KB of LDS (half of the WGP's LDS)
+//      It takes 2 workgroups to use all of LDS: one on each CU of the WGP
+//   Each wave clears SGPRs 0 - 107
+//   Each wave clears VGPRs 0 - 63
+//   The first wave of the workgroup clears its 64KB of LDS
+//   The shader starts with "S_BARRIER" to ensure SPI has launched all waves of the workgroup
+//       before any wave in the workgroup could end.  Without this, it is possible not all SGPRs get cleared.
+
+
+shader main
+  asic(GFX10.1)
+  type(CS)
+  wave_size(32)
+// Note: original source code from SQ team
+
+//
+// Create 32 waves in a threadgroup (CS waves)
+// Each allocates 64 VGPRs
+// The workgroup allocates all of LDS (64kbytes)
+//
+// Takes about 2500 clocks to run.
+//   (theoretical fastest = 1024clks vgpr + 640lds = 1660 clks)
+//
+  S_BARRIER
+  s_cmp_eq_u32 s0, 1                                // sgpr0 == 1 (Bit0 set by FW via COMPUTE_USER_DATA_0) means clear VGPRs and LDS
+  s_cbranch_scc0  label_0023                        // Clean VGPRs and LDS if sgpr0 of wave is set, scc = (s0 == 1)
+
+  s_mov_b32     s2, 0x00000038  // Loop 64/8=8 times  (loop unrolled for performance)
+  s_mov_b32     m0, 0
+  //
+  // CLEAR VGPRs
+  //
+label_0005:
+  v_movreld_b32     v0, 0
+  v_movreld_b32     v1, 0
+  v_movreld_b32     v2, 0
+  v_movreld_b32     v3, 0
+  v_movreld_b32     v4, 0
+  v_movreld_b32     v5, 0
+  v_movreld_b32     v6, 0
+  v_movreld_b32     v7, 0
+  s_mov_b32         m0, s2
+  s_sub_u32     s2, s2, 8
+  s_cbranch_scc0  label_0005
+  //
+  s_mov_b32     s2, 0x80000000                     // Bit31 is first_wave
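+  s_and_b32     s2, s2, s0                                  // sgpr0 has tg_size (first_wave) term as in ucode only COMPUTE_PGM_RSRC2.tg_size_en is set. NOTE(review): the hex in gfx_v10_0_cleaner_shader.h (0x87020102) encodes s1 as the second source, not s0 -- confirm which is intended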
+  s_cbranch_scc0  label_0023                         // Clean LDS if it is the first wave of ThreadGroup/WorkGroup
+  // CLEAR LDS
+  //
+  s_mov_b32 exec_lo, 0xffffffff
+  s_mov_b32 exec_hi, 0xffffffff
+  v_mbcnt_lo_u32_b32  v1, exec_hi, 0          // Set V1 to thread-ID (0..63)
+  v_mbcnt_hi_u32_b32  v1, exec_lo, v1        // Set V1 to thread-ID (0..63)
+  v_mul_u32_u24  v1, 0x00000008, v1          // * 8, so each thread is a double-dword address (8byte)
+  s_mov_b32     s2, 0x00000003f                    // 64 loop iterations
+  s_mov_b32     m0, 0xffffffff
+  // Clear all of LDS space
+  // Each FirstWave of WorkGroup clears 64kbyte block
+
+label_001F:
+  ds_write2_b64  v1, v[2:3], v[2:3] offset1:32
+  ds_write2_b64  v1, v[4:5], v[4:5] offset0:64 offset1:96
+  v_add_co_u32     v1, vcc, 0x00000400, v1
+  s_sub_u32     s2, s2, 1
+  s_cbranch_scc0  label_001F
+
+  //
+  // CLEAR SGPRs
+  //
+label_0023:
+  s_mov_b32     m0, 0x00000068  // Loop 108/4=27 times  (loop unrolled for performance)
+label_sgpr_loop:
+  s_movreld_b32     s0, 0
+  s_movreld_b32     s1, 0
+  s_movreld_b32     s2, 0
+  s_movreld_b32     s3, 0
+  s_sub_u32         m0, m0, 4
+  s_cbranch_scc0  label_sgpr_loop
+
+  //clear vcc
+  s_mov_b64 vcc, 0          //clear vcc
+  //s_setreg_imm32_b32 hw_reg_shader_flat_scratch_lo, 0   //clear  flat scratch lo SGPR
+  //s_setreg_imm32_b32 hw_reg_shader_flat_scratch_hi, 0    //clear  flat scratch hi SGPR
+  s_mov_b64 ttmp0, 0        //Clear ttmp0 and ttmp1
+  s_mov_b64 ttmp2, 0        //Clear ttmp2 and ttmp3
+  s_mov_b64 ttmp4, 0        //Clear ttmp4 and ttmp5
+  s_mov_b64 ttmp6, 0        //Clear ttmp6 and ttmp7
+  s_mov_b64 ttmp8, 0        //Clear ttmp8 and ttmp9
+  s_mov_b64 ttmp10, 0       //Clear ttmp10 and ttmp11
+  s_mov_b64 ttmp12, 0       //Clear ttmp12 and ttmp13
+  s_mov_b64 ttmp14, 0       //Clear ttmp14 and ttmp15
+
+ s_endpgm
+
+end
+
+
author	Srinivasan Shanmugam <srinivasan.shanmugam@amd.com>
	Tue, 21 Jan 2025 07:02:07 +0000 (12:32 +0530)
committer	Alex Deucher <alexander.deucher@amd.com>
	Thu, 13 Feb 2025 02:02:59 +0000 (21:02 -0500)
drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c		patch \| blob \| blame \| history
drivers/gpu/drm/amd/amdgpu/gfx_v10_0_cleaner_shader.h		patch \| blob \| blame \| history
drivers/gpu/drm/amd/amdgpu/gfx_v10_1_10_cleaner_shader.asm	[new file with mode: 0644]	patch \| blob