// SPDX-License-Identifier: MIT
/*
 * Copyright © 2021 Intel Corporation
 */

#include "xe_lrc.h"

#include <generated/xe_wa_oob.h>

#include <linux/ascii85.h>

#include "instructions/xe_mi_commands.h"
#include "instructions/xe_gfxpipe_commands.h"
#include "instructions/xe_gfx_state_commands.h"
#include "regs/xe_engine_regs.h"
#include "regs/xe_lrc_layout.h"
#include "xe_bb.h"
#include "xe_bo.h"
#include "xe_device.h"
#include "xe_drm_client.h"
#include "xe_exec_queue_types.h"
#include "xe_gt.h"
#include "xe_gt_printk.h"
#include "xe_hw_fence.h"
#include "xe_map.h"
#include "xe_memirq.h"
#include "xe_sriov.h"
#include "xe_trace_lrc.h"
#include "xe_vm.h"
#include "xe_wa.h"
#define LRC_VALID				BIT_ULL(0)
#define LRC_PRIVILEGE				BIT_ULL(8)
#define LRC_ADDRESSING_MODE			GENMASK_ULL(4, 3)
#define LRC_LEGACY_64B_CONTEXT			3

#define LRC_ENGINE_CLASS			GENMASK_ULL(63, 61)
#define LRC_ENGINE_INSTANCE			GENMASK_ULL(53, 48)

#define LRC_PPHWSP_SIZE				SZ_4K
#define LRC_INDIRECT_RING_STATE_SIZE		SZ_4K
#define LRC_WA_BB_SIZE				SZ_4K
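
/*
 * Overall layout of the LRC backing object, as implied by the offset helpers
 * below (an editorial sketch derived from this file, not taken from a spec):
 *
 *	+----------------------------------+ 0
 *	| ring buffer (ring_size)          |
 *	+----------------------------------+ xe_lrc_pphwsp_offset()
 *	| PPHWSP (4K, incl. driver-defined |
 *	| seqno/timestamp/engine-id slots) |
 *	+----------------------------------+ xe_lrc_regs_offset()
 *	| context image (register state)   |
 *	+----------------------------------+
 *	| indirect ring state (4K, only on |
 *	| GTs that support it)             |
 *	+----------------------------------+ wa_bb_offset()
 *	| workaround batch buffer (4K)     |
 *	+----------------------------------+ bo->size
 */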
static struct xe_device *
lrc_to_xe(struct xe_lrc *lrc)
{
	return gt_to_xe(lrc->fence_ctx.gt);
}
size_t xe_gt_lrc_size(struct xe_gt *gt, enum xe_engine_class class)
{
	struct xe_device *xe = gt_to_xe(gt);
	size_t size;

	/* Per-process HW status page (PPHWSP) */
	size = LRC_PPHWSP_SIZE;

	/* Engine context image */
	switch (class) {
	case XE_ENGINE_CLASS_RENDER:
		if (GRAPHICS_VER(xe) >= 20)
			size += 3 * SZ_4K;
		else
			size += 13 * SZ_4K;
		break;
	case XE_ENGINE_CLASS_COMPUTE:
		if (GRAPHICS_VER(xe) >= 20)
			size += 3 * SZ_4K;
		else
			size += 13 * SZ_4K;
		break;
	default:
		WARN(1, "Unknown engine class: %d", class);
		fallthrough;
	case XE_ENGINE_CLASS_COPY:
	case XE_ENGINE_CLASS_VIDEO_DECODE:
	case XE_ENGINE_CLASS_VIDEO_ENHANCE:
	case XE_ENGINE_CLASS_OTHER:
		size += 1 * SZ_4K;
	}

	/* Add indirect ring state page */
	if (xe_gt_has_indirect_ring_state(gt))
		size += LRC_INDIRECT_RING_STATE_SIZE;

	return size;
}
/*
 * The per-platform tables are u8-encoded in @data. Decode @data and set the
 * addresses' offset and commands in @regs. The following encoding is used
 * for each byte. There are 2 steps: decoding commands and decoding addresses.
 *
 * Commands:
 * [7]: create NOPs - the number of NOPs is set in the lower bits
 * [6]: when creating an MI_LOAD_REGISTER_IMM command, allow setting
 *      MI_LRI_FORCE_POSTED
 * [5:0]: number of NOPs, or number of registers to set values for in the
 *        case of MI_LOAD_REGISTER_IMM
 *
 * Addresses: these are decoded after an MI_LOAD_REGISTER_IMM command by
 * "count" number of registers. They are set by using the REG/REG16 macros:
 * the former is used for offsets smaller than 0x200 while the latter is for
 * offsets bigger than that. Those macros already set all the bits documented
 * below correctly:
 *
 * [7]: when a register offset needs more than 6 bits, use additional bytes
 *      (which follow) for the lower bits
 * [6:0]: register offset, without considering the engine base.
 *
 * This function only tweaks the commands and register offsets. Values are
 * not filled out.
 */
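/*
 * Worked example (illustrative, not one of the real tables below): the byte
 * sequence
 *
 *	LRI(2, POSTED), REG(0x034), REG(0x030), NOP(1)
 *
 * decodes to an MI_LOAD_REGISTER_IMM header with MI_LRI_NUM_REGS(2) and
 * MI_LRI_FORCE_POSTED set, followed by two register slots pointing at
 * mmio_base + 0x34 (RING_BUFFER_HEAD) and mmio_base + 0x30
 * (RING_BUFFER_TAIL); NOP(1) then skips one dword in @regs. The value
 * dwords are left untouched, to be filled in later (e.g. by xe_lrc_init()).
 */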
static void set_offsets(u32 *regs,
			const u8 *data,
			const struct xe_hw_engine *hwe)
#define NOP(x) (BIT(7) | (x))
#define LRI(count, flags) ((flags) << 6 | (count) | \
			   BUILD_BUG_ON_ZERO(count >= BIT(6)))
#define POSTED BIT(0)
#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
#define REG16(x) \
	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
	(((x) >> 2) & 0x7f)
{
	const u32 base = hwe->mmio_base;

	while (*data) {
		u8 count, flags;

		if (*data & BIT(7)) { /* skip */
			count = *data++ & ~BIT(7);
			regs += count;
			continue;
		}

		count = *data & 0x3f;
		flags = *data >> 6;
		data++;

		*regs = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(count);
		if (flags & POSTED)
			*regs |= MI_LRI_FORCE_POSTED;
		*regs |= MI_LRI_LRM_CS_MMIO;
		regs++;

		xe_gt_assert(hwe->gt, count);
		do {
			u32 offset = 0;
			u8 v;

			do {
				v = *data++;
				offset <<= 7;
				offset |= v & ~BIT(7);
			} while (v & BIT(7));

			regs[0] = base + (offset << 2);
			regs += 2;
		} while (--count);
	}

	*regs = MI_BATCH_BUFFER_END | BIT(0);
}
static const u8 gen12_xcs_offsets[] = {

static const u8 dg2_xcs_offsets[] = {

static const u8 gen12_rcs_offsets[] = {

static const u8 xehp_rcs_offsets[] = {

static const u8 dg2_rcs_offsets[] = {

static const u8 mtl_rcs_offsets[] = {
#define XE2_CTX_COMMON \
	NOP(1),			/* [0x00] */ \
	LRI(15, POSTED),	/* [0x01] */ \
	REG16(0x244),		/* [0x02] CTXT_SR_CTL */ \
	REG(0x034),		/* [0x04] RING_BUFFER_HEAD */ \
	REG(0x030),		/* [0x06] RING_BUFFER_TAIL */ \
	REG(0x038),		/* [0x08] RING_BUFFER_START */ \
	REG(0x03c),		/* [0x0a] RING_BUFFER_CONTROL */ \
	REG(0x168),		/* [0x0c] BB_ADDR_UDW */ \
	REG(0x140),		/* [0x0e] BB_ADDR */ \
	REG(0x110),		/* [0x10] BB_STATE */ \
	REG(0x1c0),		/* [0x12] BB_PER_CTX_PTR */ \
	REG(0x1c4),		/* [0x14] RCS_INDIRECT_CTX */ \
	REG(0x1c8),		/* [0x16] RCS_INDIRECT_CTX_OFFSET */ \
	REG(0x180),		/* [0x18] CCID */ \
	REG16(0x2b4),		/* [0x1a] SEMAPHORE_TOKEN */ \
	REG(0x120),		/* [0x1c] PRT_BB_STATE */ \
	REG(0x124),		/* [0x1e] PRT_BB_STATE_UDW */ \
	\
	NOP(1),			/* [0x20] */ \
	LRI(9, POSTED),		/* [0x21] */ \
	REG16(0x3a8),		/* [0x22] CTX_TIMESTAMP */ \
	REG16(0x3ac),		/* [0x24] CTX_TIMESTAMP_UDW */ \
	REG(0x108),		/* [0x26] INDIRECT_RING_STATE */ \
	REG16(0x284),		/* [0x28] dummy reg */ \
	REG16(0x280),		/* [0x2a] CS_ACC_CTR_THOLD */ \
	REG16(0x27c),		/* [0x2c] CS_CTX_SYS_PASID */ \
	REG16(0x278),		/* [0x2e] CS_CTX_ASID */ \
	REG16(0x274),		/* [0x30] PTBP_UDW */ \
	REG16(0x270)		/* [0x32] PTBP_LDW */
static const u8 xe2_rcs_offsets[] = {
	XE2_CTX_COMMON,

	NOP(2),			/* [0x34] */
	LRI(2, POSTED),		/* [0x36] */
	REG16(0x5a8),		/* [0x37] CONTEXT_SCHEDULING_ATTRIBUTES */
	REG16(0x5ac),		/* [0x39] PREEMPTION_STATUS */

	NOP(12),		/* [0x3b] */
	LRI(1, 0),		/* [0x47] */
	REG(0x0c8),		/* [0x48] R_PWR_CLK_STATE */

	0
};

static const u8 xe2_bcs_offsets[] = {
	XE2_CTX_COMMON,

	NOP(4 + 8 + 1),		/* [0x34] */
	LRI(2, POSTED),		/* [0x41] */
	REG16(0x200),		/* [0x42] BCS_SWCTRL */
	REG16(0x204),		/* [0x44] BLIT_CCTL */

	0
};

static const u8 xe2_xcs_offsets[] = {
	XE2_CTX_COMMON,

	0
};

static const u8 xe2_indirect_ring_state_offsets[] = {
	NOP(1),			/* [0x00] */
	LRI(5, POSTED),		/* [0x01] */
	REG(0x034),		/* [0x02] RING_BUFFER_HEAD */
	REG(0x030),		/* [0x04] RING_BUFFER_TAIL */
	REG(0x038),		/* [0x06] RING_BUFFER_START */
	REG(0x048),		/* [0x08] RING_BUFFER_START_UDW */
	REG(0x03c),		/* [0x0a] RING_BUFFER_CONTROL */

	NOP(5),			/* [0x0c] */
	LRI(9, POSTED),		/* [0x11] */
	REG(0x168),		/* [0x12] BB_ADDR_UDW */
	REG(0x140),		/* [0x14] BB_ADDR */
	REG(0x110),		/* [0x16] BB_STATE */
	REG16(0x588),		/* [0x18] BB_STACK_WRITE_PORT */
	REG16(0x588),		/* [0x20] BB_STACK_WRITE_PORT */
	REG16(0x588),		/* [0x22] BB_STACK_WRITE_PORT */
	REG16(0x588),		/* [0x24] BB_STACK_WRITE_PORT */
	REG16(0x588),		/* [0x26] BB_STACK_WRITE_PORT */
	REG16(0x588),		/* [0x28] BB_STACK_WRITE_PORT */

	NOP(12),		/* [0x00] */

	0
};
static const u8 *reg_offsets(struct xe_device *xe, enum xe_engine_class class)
{
	if (class == XE_ENGINE_CLASS_RENDER) {
		if (GRAPHICS_VER(xe) >= 20)
			return xe2_rcs_offsets;
		else if (GRAPHICS_VERx100(xe) >= 1270)
			return mtl_rcs_offsets;
		else if (GRAPHICS_VERx100(xe) >= 1255)
			return dg2_rcs_offsets;
		else if (GRAPHICS_VERx100(xe) >= 1250)
			return xehp_rcs_offsets;
		else
			return gen12_rcs_offsets;
	} else if (class == XE_ENGINE_CLASS_COPY) {
		if (GRAPHICS_VER(xe) >= 20)
			return xe2_bcs_offsets;
		else
			return gen12_xcs_offsets;
	} else {
		if (GRAPHICS_VER(xe) >= 20)
			return xe2_xcs_offsets;
		else if (GRAPHICS_VERx100(xe) >= 1255)
			return dg2_xcs_offsets;
		else
			return gen12_xcs_offsets;
	}
}
static void set_context_control(u32 *regs, struct xe_hw_engine *hwe)
{
	regs[CTX_CONTEXT_CONTROL] = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH |
						       CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);

	if (xe_gt_has_indirect_ring_state(hwe->gt))
		regs[CTX_CONTEXT_CONTROL] |=
			_MASKED_BIT_ENABLE(CTX_CTRL_INDIRECT_RING_STATE_ENABLE);

	/* TODO: Timestamp */
}
static void set_memory_based_intr(u32 *regs, struct xe_hw_engine *hwe)
{
	struct xe_memirq *memirq = &gt_to_tile(hwe->gt)->memirq;
	struct xe_device *xe = gt_to_xe(hwe->gt);
	u8 num_regs;

	if (!xe_device_uses_memirq(xe))
		return;

	regs[CTX_LRM_INT_MASK_ENABLE] = MI_LOAD_REGISTER_MEM |
					MI_LRI_LRM_CS_MMIO | MI_LRM_USE_GGTT;
	regs[CTX_INT_MASK_ENABLE_REG] = RING_IMR(0).addr;
	regs[CTX_INT_MASK_ENABLE_PTR] = xe_memirq_enable_ptr(memirq);

	num_regs = xe_device_has_msix(xe) ? 3 : 2;
	regs[CTX_LRI_INT_REPORT_PTR] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(num_regs) |
				       MI_LRI_LRM_CS_MMIO | MI_LRI_FORCE_POSTED;
	regs[CTX_INT_STATUS_REPORT_REG] = RING_INT_STATUS_RPT_PTR(0).addr;
	regs[CTX_INT_STATUS_REPORT_PTR] = xe_memirq_status_ptr(memirq, hwe);
	regs[CTX_INT_SRC_REPORT_REG] = RING_INT_SRC_RPT_PTR(0).addr;
	regs[CTX_INT_SRC_REPORT_PTR] = xe_memirq_source_ptr(memirq, hwe);

	if (xe_device_has_msix(xe)) {
		regs[CTX_CS_INT_VEC_REG] = CS_INT_VEC(0).addr;
		/* CTX_CS_INT_VEC_DATA will be set in xe_lrc_init */
	}
}
static int lrc_ring_mi_mode(struct xe_hw_engine *hwe)
{
	struct xe_device *xe = gt_to_xe(hwe->gt);

	if (GRAPHICS_VERx100(xe) >= 1250)
		return 0x70;
	else
		return 0x60;
}

static void reset_stop_ring(u32 *regs, struct xe_hw_engine *hwe)
{
	int x;

	x = lrc_ring_mi_mode(hwe);
	regs[x + 1] &= ~STOP_RING;
	regs[x + 1] |= STOP_RING << 16;
}
static inline bool xe_lrc_has_indirect_ring_state(struct xe_lrc *lrc)
{
	return lrc->flags & XE_LRC_FLAG_INDIRECT_RING_STATE;
}

static inline u32 __xe_lrc_ring_offset(struct xe_lrc *lrc)
{
	return 0;
}

u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc)
{
	return lrc->ring.size;
}
/* Make the magic macros work */
#define __xe_lrc_pphwsp_offset xe_lrc_pphwsp_offset
#define __xe_lrc_regs_offset xe_lrc_regs_offset

#define LRC_SEQNO_PPHWSP_OFFSET 512
#define LRC_START_SEQNO_PPHWSP_OFFSET (LRC_SEQNO_PPHWSP_OFFSET + 8)
#define LRC_CTX_JOB_TIMESTAMP_OFFSET (LRC_START_SEQNO_PPHWSP_OFFSET + 8)
#define LRC_PARALLEL_PPHWSP_OFFSET 2048
#define LRC_ENGINE_ID_PPHWSP_OFFSET 2096
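
/*
 * Driver-defined PPHWSP layout implied by the offsets above, in bytes from
 * the start of the PPHWSP (gaps are unused here):
 *
 *	[512]	seqno
 *	[520]	start seqno
 *	[528]	ctx job timestamp
 *	[2048]	parallel scratch area
 *	[2096]	engine id written by the utilization WA BB
 */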
u32 xe_lrc_regs_offset(struct xe_lrc *lrc)
{
	return xe_lrc_pphwsp_offset(lrc) + LRC_PPHWSP_SIZE;
}

static size_t lrc_reg_size(struct xe_device *xe)
{
	if (GRAPHICS_VERx100(xe) >= 1250)
		return 96 * sizeof(u32);
	else
		return 80 * sizeof(u32);
}

size_t xe_lrc_skip_size(struct xe_device *xe)
{
	return LRC_PPHWSP_SIZE + lrc_reg_size(xe);
}
static inline u32 __xe_lrc_seqno_offset(struct xe_lrc *lrc)
{
	/* The seqno is stored in the driver-defined portion of PPHWSP */
	return xe_lrc_pphwsp_offset(lrc) + LRC_SEQNO_PPHWSP_OFFSET;
}

static inline u32 __xe_lrc_start_seqno_offset(struct xe_lrc *lrc)
{
	/* The start seqno is stored in the driver-defined portion of PPHWSP */
	return xe_lrc_pphwsp_offset(lrc) + LRC_START_SEQNO_PPHWSP_OFFSET;
}

static u32 __xe_lrc_ctx_job_timestamp_offset(struct xe_lrc *lrc)
{
	/* This is stored in the driver-defined portion of PPHWSP */
	return xe_lrc_pphwsp_offset(lrc) + LRC_CTX_JOB_TIMESTAMP_OFFSET;
}

static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc)
{
	/* The parallel area is stored in the driver-defined portion of PPHWSP */
	return xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET;
}

static inline u32 __xe_lrc_engine_id_offset(struct xe_lrc *lrc)
{
	return xe_lrc_pphwsp_offset(lrc) + LRC_ENGINE_ID_PPHWSP_OFFSET;
}

static u32 __xe_lrc_ctx_timestamp_offset(struct xe_lrc *lrc)
{
	return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP * sizeof(u32);
}

static u32 __xe_lrc_ctx_timestamp_udw_offset(struct xe_lrc *lrc)
{
	return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP_UDW * sizeof(u32);
}

static inline u32 __xe_lrc_indirect_ring_offset(struct xe_lrc *lrc)
{
	/* Indirect ring state page is at the very end of LRC */
	return lrc->size - LRC_INDIRECT_RING_STATE_SIZE;
}
#define DECL_MAP_ADDR_HELPERS(elem) \
static inline struct iosys_map __xe_lrc_##elem##_map(struct xe_lrc *lrc) \
{ \
	struct iosys_map map = lrc->bo->vmap; \
\
	xe_assert(lrc_to_xe(lrc), !iosys_map_is_null(&map)); \
	iosys_map_incr(&map, __xe_lrc_##elem##_offset(lrc)); \
	return map; \
} \
static inline u32 __maybe_unused __xe_lrc_##elem##_ggtt_addr(struct xe_lrc *lrc) \
{ \
	return xe_bo_ggtt_addr(lrc->bo) + __xe_lrc_##elem##_offset(lrc); \
}

DECL_MAP_ADDR_HELPERS(ring)
DECL_MAP_ADDR_HELPERS(pphwsp)
DECL_MAP_ADDR_HELPERS(seqno)
DECL_MAP_ADDR_HELPERS(regs)
DECL_MAP_ADDR_HELPERS(start_seqno)
DECL_MAP_ADDR_HELPERS(ctx_job_timestamp)
DECL_MAP_ADDR_HELPERS(ctx_timestamp)
DECL_MAP_ADDR_HELPERS(ctx_timestamp_udw)
DECL_MAP_ADDR_HELPERS(parallel)
DECL_MAP_ADDR_HELPERS(indirect_ring)
DECL_MAP_ADDR_HELPERS(engine_id)

#undef DECL_MAP_ADDR_HELPERS
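
/*
 * As an example of what the macro above generates:
 * DECL_MAP_ADDR_HELPERS(seqno) defines __xe_lrc_seqno_map(), an iosys_map
 * advanced to __xe_lrc_seqno_offset(), and __xe_lrc_seqno_ggtt_addr(), the
 * GGTT address of that same location.
 */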
/**
 * xe_lrc_ctx_timestamp_ggtt_addr() - Get ctx timestamp GGTT address
 * @lrc: Pointer to the lrc.
 *
 * Returns: ctx timestamp GGTT address
 */
u32 xe_lrc_ctx_timestamp_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
}

/**
 * xe_lrc_ctx_timestamp_udw_ggtt_addr() - Get ctx timestamp udw GGTT address
 * @lrc: Pointer to the lrc.
 *
 * Returns: ctx timestamp udw GGTT address
 */
u32 xe_lrc_ctx_timestamp_udw_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
}

/**
 * xe_lrc_ctx_timestamp() - Read ctx timestamp value
 * @lrc: Pointer to the lrc.
 *
 * Returns: ctx timestamp value
 */
u64 xe_lrc_ctx_timestamp(struct xe_lrc *lrc)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;
	u32 ldw, udw = 0;

	map = __xe_lrc_ctx_timestamp_map(lrc);
	ldw = xe_map_read32(xe, &map);

	if (xe->info.has_64bit_timestamp) {
		map = __xe_lrc_ctx_timestamp_udw_map(lrc);
		udw = xe_map_read32(xe, &map);
	}

	return (u64)udw << 32 | ldw;
}

/**
 * xe_lrc_ctx_job_timestamp_ggtt_addr() - Get ctx job timestamp GGTT address
 * @lrc: Pointer to the lrc.
 *
 * Returns: ctx job timestamp GGTT address
 */
u32 xe_lrc_ctx_job_timestamp_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_ctx_job_timestamp_ggtt_addr(lrc);
}

/**
 * xe_lrc_ctx_job_timestamp() - Read ctx job timestamp value
 * @lrc: Pointer to the lrc.
 *
 * Returns: ctx job timestamp value
 */
u32 xe_lrc_ctx_job_timestamp(struct xe_lrc *lrc)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_ctx_job_timestamp_map(lrc);
	return xe_map_read32(xe, &map);
}

u32 xe_lrc_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_pphwsp_ggtt_addr(lrc);
}

u32 xe_lrc_indirect_ring_ggtt_addr(struct xe_lrc *lrc)
{
	if (!xe_lrc_has_indirect_ring_state(lrc))
		return 0;

	return __xe_lrc_indirect_ring_ggtt_addr(lrc);
}
static u32 xe_lrc_read_indirect_ctx_reg(struct xe_lrc *lrc, int reg_nr)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_indirect_ring_map(lrc);
	iosys_map_incr(&map, reg_nr * sizeof(u32));
	return xe_map_read32(xe, &map);
}

static void xe_lrc_write_indirect_ctx_reg(struct xe_lrc *lrc,
					  int reg_nr, u32 val)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_indirect_ring_map(lrc);
	iosys_map_incr(&map, reg_nr * sizeof(u32));
	xe_map_write32(xe, &map, val);
}

u32 xe_lrc_read_ctx_reg(struct xe_lrc *lrc, int reg_nr)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_regs_map(lrc);
	iosys_map_incr(&map, reg_nr * sizeof(u32));
	return xe_map_read32(xe, &map);
}

void xe_lrc_write_ctx_reg(struct xe_lrc *lrc, int reg_nr, u32 val)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_regs_map(lrc);
	iosys_map_incr(&map, reg_nr * sizeof(u32));
	xe_map_write32(xe, &map, val);
}
static void *empty_lrc_data(struct xe_hw_engine *hwe)
{
	struct xe_gt *gt = hwe->gt;
	void *data;
	u32 *regs;

	data = kzalloc(xe_gt_lrc_size(gt, hwe->class), GFP_KERNEL);
	if (!data)
		return NULL;

	/* 1st page: Per-Process HW Status Page (PPHWSP) */
	regs = data + LRC_PPHWSP_SIZE;
	set_offsets(regs, reg_offsets(gt_to_xe(gt), hwe->class), hwe);
	set_context_control(regs, hwe);
	set_memory_based_intr(regs, hwe);
	reset_stop_ring(regs, hwe);
	if (xe_gt_has_indirect_ring_state(gt)) {
		regs = data + xe_gt_lrc_size(gt, hwe->class) -
		       LRC_INDIRECT_RING_STATE_SIZE;
		set_offsets(regs, xe2_indirect_ring_state_offsets, hwe);
	}

	return data;
}
static void xe_lrc_set_ppgtt(struct xe_lrc *lrc, struct xe_vm *vm)
{
	u64 desc = xe_vm_pdp4_descriptor(vm, gt_to_tile(lrc->gt));

	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_UDW, upper_32_bits(desc));
	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_LDW, lower_32_bits(desc));
}

static void xe_lrc_finish(struct xe_lrc *lrc)
{
	xe_hw_fence_ctx_finish(&lrc->fence_ctx);
	xe_bo_unpin_map_no_vm(lrc->bo);
}

static size_t wa_bb_offset(struct xe_lrc *lrc)
{
	return lrc->bo->size - LRC_WA_BB_SIZE;
}
/**
 * xe_lrc_setup_utilization() - Setup wa bb to assist in calculating active
 * context run ticks
 * @lrc: Pointer to the lrc.
 *
 * Context Timestamp (CTX_TIMESTAMP) in the LRC accumulates the run ticks of the
 * context, but only gets updated when the context switches out. In order to
 * check how long a context has been active before it switches out, two things
 * are required:
 *
 * (1) Determine if the context is running:
 * To do so, we program the WA BB to set an initial value for CTX_TIMESTAMP in
 * the LRC. The value chosen is 1 since 0 is the initial value when the LRC is
 * initialized. During a query, we just check for this value to determine if the
 * context is active. If the context switched out, it would overwrite this
 * location with the actual CTX_TIMESTAMP MMIO value. Note that WA BB runs as
 * the last part of context restore, so reusing this LRC location will not
 * clobber anything.
 *
 * (2) Calculate the time that the context has been active for:
 * The CTX_TIMESTAMP ticks only when the context is active. If a context is
 * active, we just use the CTX_TIMESTAMP MMIO as the new value of utilization.
 * While doing so, we need to read the CTX_TIMESTAMP MMIO for the specific
 * engine instance. Since we do not know which instance the context is running
 * on until it is scheduled, we also read the ENGINE_ID MMIO in the WA BB and
 * store it in the PPHWSP.
 */
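/*
 * Sketch of the resulting WA BB stream emitted by xe_lrc_setup_utilization()
 * below (MI_STORE_DATA_IMM address dwords abbreviated):
 *
 *	MI_STORE_REGISTER_MEM	ENGINE_ID	-> PPHWSP engine-id slot
 *	MI_STORE_DATA_IMM	CONTEXT_ACTIVE	-> CTX_TIMESTAMP in the LRC
 *	MI_STORE_DATA_IMM	CONTEXT_ACTIVE upper half (64-bit timestamp only)
 *	MI_BATCH_BUFFER_END
 */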
#define CONTEXT_ACTIVE 1ULL
static int xe_lrc_setup_utilization(struct xe_lrc *lrc)
{
	const size_t max_size = LRC_WA_BB_SIZE;
	u32 *cmd, *buf = NULL;

	if (lrc->bo->vmap.is_iomem) {
		buf = kmalloc(max_size, GFP_KERNEL);
		if (!buf)
			return -ENOMEM;
		cmd = buf;
	} else {
		cmd = lrc->bo->vmap.vaddr + wa_bb_offset(lrc);
	}

	*cmd++ = MI_STORE_REGISTER_MEM | MI_SRM_USE_GGTT | MI_SRM_ADD_CS_OFFSET;
	*cmd++ = ENGINE_ID(0).addr;
	*cmd++ = __xe_lrc_engine_id_ggtt_addr(lrc);

	*cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
	*cmd++ = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
	*cmd++ = 0;
	*cmd++ = lower_32_bits(CONTEXT_ACTIVE);

	if (lrc_to_xe(lrc)->info.has_64bit_timestamp) {
		*cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
		*cmd++ = __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
		*cmd++ = 0;
		*cmd++ = upper_32_bits(CONTEXT_ACTIVE);
	}

	*cmd++ = MI_BATCH_BUFFER_END;

	if (buf) {
		xe_map_memcpy_to(gt_to_xe(lrc->gt), &lrc->bo->vmap,
				 wa_bb_offset(lrc), buf,
				 (cmd - buf) * sizeof(*cmd));
		kfree(buf);
	}
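
	/*
	 * Bit 0 of BB_PER_CTX_PTR is assumed to act as the "valid/enable"
	 * flag for the per-context batch buffer pointer, hence the "+ 1"
	 * below on top of the page-aligned GGTT address.
	 */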
	xe_lrc_write_ctx_reg(lrc, CTX_BB_PER_CTX_PTR, xe_bo_ggtt_addr(lrc->bo) +
			     wa_bb_offset(lrc) + 1);

	return 0;
}
#define PVC_CTX_ASID		(0x2e + 1)
#define PVC_CTX_ACC_CTR_THOLD	(0x2a + 1)
static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
		       struct xe_vm *vm, u32 ring_size, u16 msix_vec,
		       u32 init_flags)
{
	struct xe_gt *gt = hwe->gt;
	struct xe_tile *tile = gt_to_tile(gt);
	struct xe_device *xe = gt_to_xe(gt);
	struct iosys_map map;
	void *init_data = NULL;
	u32 arb_enable;
	u32 lrc_size;
	u32 bo_flags;
	int err;

	kref_init(&lrc->refcount);
	lrc->gt = gt;
	lrc_size = ring_size + xe_gt_lrc_size(gt, hwe->class);
	if (xe_gt_has_indirect_ring_state(gt))
		lrc->flags |= XE_LRC_FLAG_INDIRECT_RING_STATE;

	bo_flags = XE_BO_FLAG_VRAM_IF_DGFX(tile) | XE_BO_FLAG_GGTT |
		   XE_BO_FLAG_GGTT_INVALIDATE;
	if (vm && vm->xef) /* userspace */
		bo_flags |= XE_BO_FLAG_PINNED_LATE_RESTORE;

	/*
	 * FIXME: Perma-pinning LRC as we don't yet support moving GGTT address
	 * via VM bind calls.
	 */
	lrc->bo = xe_bo_create_pin_map(xe, tile, NULL,
				       lrc_size + LRC_WA_BB_SIZE,
				       ttm_bo_type_kernel,
				       bo_flags);
	if (IS_ERR(lrc->bo))
		return PTR_ERR(lrc->bo);

	lrc->size = lrc_size;
	lrc->ring.size = ring_size;
	lrc->ring.tail = 0;

	xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt,
			     hwe->fence_irq, hwe->name);

	if (!gt->default_lrc[hwe->class]) {
		init_data = empty_lrc_data(hwe);
		if (!init_data) {
			err = -ENOMEM;
			goto err_lrc_finish;
		}
	}
	/*
	 * Init the Per-Process HW Status Page (PPHWSP) and the LRC / context
	 * state to known values
	 */
	map = __xe_lrc_pphwsp_map(lrc);
	if (!init_data) {
		xe_map_memset(xe, &map, 0, 0, LRC_PPHWSP_SIZE);	/* PPHWSP */
		xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
				 gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE,
				 xe_gt_lrc_size(gt, hwe->class) - LRC_PPHWSP_SIZE);
	} else {
		xe_map_memcpy_to(xe, &map, 0, init_data,
				 xe_gt_lrc_size(gt, hwe->class));
		kfree(init_data);
	}

	if (vm) {
		xe_lrc_set_ppgtt(lrc, vm);

		if (vm->xef)
			xe_drm_client_add_bo(vm->xef->client, lrc->bo);
	}

	if (xe_device_has_msix(xe)) {
		xe_lrc_write_ctx_reg(lrc, CTX_INT_STATUS_REPORT_PTR,
				     xe_memirq_status_ptr(&tile->memirq, hwe));
		xe_lrc_write_ctx_reg(lrc, CTX_INT_SRC_REPORT_PTR,
				     xe_memirq_source_ptr(&tile->memirq, hwe));
		xe_lrc_write_ctx_reg(lrc, CTX_CS_INT_VEC_DATA, msix_vec << 16 | msix_vec);
	}
	if (xe_gt_has_indirect_ring_state(gt)) {
		xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE,
				     __xe_lrc_indirect_ring_ggtt_addr(lrc));

		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START,
					      __xe_lrc_ring_ggtt_addr(lrc));
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START_UDW, 0);
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, 0);
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, lrc->ring.tail);
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_CTL,
					      RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
	} else {
		xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
		xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, 0);
		xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, lrc->ring.tail);
		xe_lrc_write_ctx_reg(lrc, CTX_RING_CTL,
				     RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
	}
	if (init_flags & XE_LRC_CREATE_RUNALONE)
		xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
				     xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
				     _MASKED_BIT_ENABLE(CTX_CTRL_RUN_ALONE));

	if (init_flags & XE_LRC_CREATE_PXP)
		xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
				     xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
				     _MASKED_BIT_ENABLE(CTX_CTRL_PXP_ENABLE));

	lrc->ctx_timestamp = 0;
	xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP, 0);
	if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
		xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP_UDW, 0);

	if (xe->info.has_asid && vm)
		xe_lrc_write_ctx_reg(lrc, PVC_CTX_ASID, vm->usm.asid);

	lrc->desc = LRC_VALID;
	lrc->desc |= FIELD_PREP(LRC_ADDRESSING_MODE, LRC_LEGACY_64B_CONTEXT);
	/* TODO: Priority */

	/*
	 * While this appears to have something about privileged batches or
	 * some such, it really just means PPGTT mode.
	 */
	if (vm)
		lrc->desc |= LRC_PRIVILEGE;

	if (GRAPHICS_VERx100(xe) < 1250) {
		lrc->desc |= FIELD_PREP(LRC_ENGINE_INSTANCE, hwe->instance);
		lrc->desc |= FIELD_PREP(LRC_ENGINE_CLASS, hwe->class);
	}
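
	/*
	 * Illustrative example: with a PPHWSP GGTT address of 0x00100000, the
	 * final value returned by xe_lrc_descriptor() is
	 *
	 *	LRC_VALID | LRC_PRIVILEGE |
	 *	FIELD_PREP(LRC_ADDRESSING_MODE, LRC_LEGACY_64B_CONTEXT) |
	 *	0x00100000
	 *
	 * with the engine class/instance fields added on pre-12.50 platforms.
	 */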
	arb_enable = MI_ARB_ON_OFF | MI_ARB_ENABLE;
	xe_lrc_write_ring(lrc, &arb_enable, sizeof(arb_enable));

	map = __xe_lrc_seqno_map(lrc);
	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);

	map = __xe_lrc_start_seqno_map(lrc);
	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);

	err = xe_lrc_setup_utilization(lrc);
	if (err)
		goto err_lrc_finish;

	return 0;

err_lrc_finish:
	xe_lrc_finish(lrc);
	return err;
}
/**
 * xe_lrc_create - Create a LRC
 * @hwe: Hardware Engine
 * @vm: The VM (address space)
 * @ring_size: LRC ring size
 * @msix_vec: MSI-X interrupt vector (for platforms that support it)
 * @flags: LRC initialization flags
 *
 * Allocate and initialize the Logical Ring Context (LRC).
 *
 * Return: pointer to the created LRC upon success or an error pointer upon
 * failure.
 */
struct xe_lrc *xe_lrc_create(struct xe_hw_engine *hwe, struct xe_vm *vm,
			     u32 ring_size, u16 msix_vec, u32 flags)
{
	struct xe_lrc *lrc;
	int err;

	lrc = kzalloc(sizeof(*lrc), GFP_KERNEL);
	if (!lrc)
		return ERR_PTR(-ENOMEM);

	err = xe_lrc_init(lrc, hwe, vm, ring_size, msix_vec, flags);
	if (err) {
		kfree(lrc);
		return ERR_PTR(err);
	}

	return lrc;
}
/**
 * xe_lrc_destroy - Destroy the LRC
 * @ref: reference to LRC
 *
 * Called when ref == 0, release resources held by the Logical Ring Context
 * (LRC) and free the LRC memory.
 */
void xe_lrc_destroy(struct kref *ref)
{
	struct xe_lrc *lrc = container_of(ref, struct xe_lrc, refcount);

	xe_lrc_finish(lrc);
	kfree(lrc);
}
void xe_lrc_set_ring_tail(struct xe_lrc *lrc, u32 tail)
{
	if (xe_lrc_has_indirect_ring_state(lrc))
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, tail);
	else
		xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, tail);
}

u32 xe_lrc_ring_tail(struct xe_lrc *lrc)
{
	if (xe_lrc_has_indirect_ring_state(lrc))
		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL) & TAIL_ADDR;
	else
		return xe_lrc_read_ctx_reg(lrc, CTX_RING_TAIL) & TAIL_ADDR;
}

static u32 xe_lrc_ring_start(struct xe_lrc *lrc)
{
	if (xe_lrc_has_indirect_ring_state(lrc))
		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START);
	else
		return xe_lrc_read_ctx_reg(lrc, CTX_RING_START);
}

void xe_lrc_set_ring_head(struct xe_lrc *lrc, u32 head)
{
	if (xe_lrc_has_indirect_ring_state(lrc))
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, head);
	else
		xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, head);
}

u32 xe_lrc_ring_head(struct xe_lrc *lrc)
{
	if (xe_lrc_has_indirect_ring_state(lrc))
		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD) & HEAD_ADDR;
	else
		return xe_lrc_read_ctx_reg(lrc, CTX_RING_HEAD) & HEAD_ADDR;
}
u32 xe_lrc_ring_space(struct xe_lrc *lrc)
{
	const u32 head = xe_lrc_ring_head(lrc);
	const u32 tail = lrc->ring.tail;
	const u32 size = lrc->ring.size;

	return ((head - tail - 1) & (size - 1)) + 1;
}
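
/*
 * The ring-space math above relies on the ring size being a power of two.
 * E.g. with size = SZ_16K: head == tail (idle ring) yields
 * ((head - tail - 1) & (SZ_16K - 1)) + 1 == SZ_16K, i.e. the whole ring is
 * available, while tail == head - 1 (the writer almost wrapped into the
 * reader) yields just 1 byte.
 */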
static void __xe_lrc_write_ring(struct xe_lrc *lrc, struct iosys_map ring,
				const void *data, size_t size)
{
	struct xe_device *xe = lrc_to_xe(lrc);

	iosys_map_incr(&ring, lrc->ring.tail);
	xe_map_memcpy_to(xe, &ring, 0, data, size);
	lrc->ring.tail = (lrc->ring.tail + size) & (lrc->ring.size - 1);
}
void xe_lrc_write_ring(struct xe_lrc *lrc, const void *data, size_t size)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map ring;
	u32 rhs;
	size_t aligned_size;

	xe_assert(xe, IS_ALIGNED(size, 4));
	aligned_size = ALIGN(size, 8);

	ring = __xe_lrc_ring_map(lrc);

	xe_assert(xe, lrc->ring.tail < lrc->ring.size);
	rhs = lrc->ring.size - lrc->ring.tail;
	if (size > rhs) {
		__xe_lrc_write_ring(lrc, ring, data, rhs);
		__xe_lrc_write_ring(lrc, ring, data + rhs, size - rhs);
	} else {
		__xe_lrc_write_ring(lrc, ring, data, size);
	}

	if (aligned_size > size) {
		u32 noop = MI_NOOP;

		__xe_lrc_write_ring(lrc, ring, &noop, sizeof(noop));
	}
}
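
/*
 * Note on xe_lrc_write_ring() above: callers may emit any multiple of 4
 * bytes, but aligned_size rounds up to 8, so an odd number of dwords gets
 * one trailing MI_NOOP of padding to keep the ring tail qword aligned.
 */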
u64 xe_lrc_descriptor(struct xe_lrc *lrc)
{
	return lrc->desc | xe_lrc_ggtt_addr(lrc);
}

u32 xe_lrc_seqno_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_seqno_ggtt_addr(lrc);
}
/**
 * xe_lrc_alloc_seqno_fence() - Allocate an lrc seqno fence.
 *
 * Allocate but don't initialize an lrc seqno fence.
 *
 * Return: Pointer to the allocated fence or
 * negative error pointer on error.
 */
struct dma_fence *xe_lrc_alloc_seqno_fence(void)
{
	return xe_hw_fence_alloc();
}

/**
 * xe_lrc_free_seqno_fence() - Free an lrc seqno fence.
 * @fence: Pointer to the fence to free.
 *
 * Frees an lrc seqno fence that hasn't yet been
 * initialized.
 */
void xe_lrc_free_seqno_fence(struct dma_fence *fence)
{
	xe_hw_fence_free(fence);
}

/**
 * xe_lrc_init_seqno_fence() - Initialize an lrc seqno fence.
 * @lrc: Pointer to the lrc.
 * @fence: Pointer to the fence to initialize.
 *
 * Initializes a pre-allocated lrc seqno fence.
 * After initialization, the fence is subject to normal
 * dma-fence refcounting.
 */
void xe_lrc_init_seqno_fence(struct xe_lrc *lrc, struct dma_fence *fence)
{
	xe_hw_fence_init(fence, &lrc->fence_ctx, __xe_lrc_seqno_map(lrc));
}
s32 xe_lrc_seqno(struct xe_lrc *lrc)
{
	struct iosys_map map = __xe_lrc_seqno_map(lrc);

	return xe_map_read32(lrc_to_xe(lrc), &map);
}

s32 xe_lrc_start_seqno(struct xe_lrc *lrc)
{
	struct iosys_map map = __xe_lrc_start_seqno_map(lrc);

	return xe_map_read32(lrc_to_xe(lrc), &map);
}

u32 xe_lrc_start_seqno_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_start_seqno_ggtt_addr(lrc);
}

u32 xe_lrc_parallel_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_parallel_ggtt_addr(lrc);
}

struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc)
{
	return __xe_lrc_parallel_map(lrc);
}
/**
 * xe_lrc_engine_id() - Read engine id value
 * @lrc: Pointer to the lrc.
 *
 * Returns: engine id value
 */
static u32 xe_lrc_engine_id(struct xe_lrc *lrc)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_engine_id_map(lrc);
	return xe_map_read32(xe, &map);
}
static int instr_dw(u32 cmd_header)
{
	/* GFXPIPE "SINGLE_DW" opcodes are a single dword */
	if ((cmd_header & (XE_INSTR_CMD_TYPE | GFXPIPE_PIPELINE)) ==
	    GFXPIPE_SINGLE_DW_CMD(0, 0))
		return 1;

	/* 3DSTATE_SO_DECL_LIST has a 9-bit dword length rather than 8 */
	if ((cmd_header & GFXPIPE_MATCH_MASK) == CMD_3DSTATE_SO_DECL_LIST)
		return REG_FIELD_GET(CMD_3DSTATE_SO_DECL_LIST_DW_LEN, cmd_header) + 2;

	/* Most instructions have the # of dwords (minus 2) in 7:0 */
	return REG_FIELD_GET(XE_INSTR_LEN_MASK, cmd_header) + 2;
}
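
/*
 * Example of instr_dw(): a header whose XE_INSTR_LEN_MASK field reads 3
 * describes a 5-dword instruction, since the length field excludes the
 * first two dwords.
 */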
static int dump_mi_command(struct drm_printer *p,
			   struct xe_gt *gt,
			   u32 *dw,
			   int remaining_dw)
{
	u32 inst_header = *dw;
	u32 numdw = instr_dw(inst_header);
	u32 opcode = REG_FIELD_GET(MI_OPCODE, inst_header);
	int num_noop;

	/* First check for commands that don't have/use a '# DW' field */
	switch (inst_header & MI_OPCODE) {
	case MI_NOOP:
		num_noop = 1;
		while (num_noop < remaining_dw &&
		       (*(++dw) & REG_GENMASK(31, 23)) == MI_NOOP)
			num_noop++;
		drm_printf(p, "[%#010x] MI_NOOP (%d dwords)\n", inst_header, num_noop);
		return num_noop;

	case MI_TOPOLOGY_FILTER:
		drm_printf(p, "[%#010x] MI_TOPOLOGY_FILTER\n", inst_header);
		return 1;

	case MI_BATCH_BUFFER_END:
		drm_printf(p, "[%#010x] MI_BATCH_BUFFER_END\n", inst_header);
		/* Return 'remaining_dw' to consume the rest of the LRC */
		return remaining_dw;
	}

	/*
	 * Any remaining commands include a # of dwords. We should make sure
	 * it doesn't exceed the remaining size of the LRC.
	 */
	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
		numdw = remaining_dw;

	switch (inst_header & MI_OPCODE) {
	case MI_LOAD_REGISTER_IMM:
		drm_printf(p, "[%#010x] MI_LOAD_REGISTER_IMM: %d regs\n",
			   inst_header, (numdw - 1) / 2);
		for (int i = 1; i < numdw; i += 2)
			drm_printf(p, " - %#6x = %#010x\n", dw[i], dw[i + 1]);
		return numdw;

	case MI_LOAD_REGISTER_MEM & MI_OPCODE:
		drm_printf(p, "[%#010x] MI_LOAD_REGISTER_MEM: %s%s\n",
			   inst_header,
			   dw[0] & MI_LRI_LRM_CS_MMIO ? "CS_MMIO " : "",
			   dw[0] & MI_LRM_USE_GGTT ? "USE_GGTT " : "");
		if (numdw == 4)
			drm_printf(p, " - %#6x = %#010llx\n",
				   dw[1], ((u64)(dw[3]) << 32 | (u64)(dw[2])));
		else
			drm_printf(p, " - %*ph (%s)\n",
				   (int)sizeof(u32) * (numdw - 1), dw + 1,
				   numdw < 4 ? "truncated" : "malformed");
		return numdw;

	case MI_FORCE_WAKEUP:
		drm_printf(p, "[%#010x] MI_FORCE_WAKEUP\n", inst_header);
		return numdw;

	default:
		drm_printf(p, "[%#010x] unknown MI opcode %#x, likely %d dwords\n",
			   inst_header, opcode, numdw);
		return numdw;
	}
}
static int dump_gfxpipe_command(struct drm_printer *p,
				struct xe_gt *gt,
				u32 *dw,
				int remaining_dw)
{
	u32 numdw = instr_dw(*dw);
	u32 pipeline = REG_FIELD_GET(GFXPIPE_PIPELINE, *dw);
	u32 opcode = REG_FIELD_GET(GFXPIPE_OPCODE, *dw);
	u32 subopcode = REG_FIELD_GET(GFXPIPE_SUBOPCODE, *dw);

	/*
	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
	 * remaining size of the LRC.
	 */
	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
		numdw = remaining_dw;

	switch (*dw & GFXPIPE_MATCH_MASK) {
#define MATCH(cmd) \
	case cmd: \
		drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
		return numdw
#define MATCH3D(cmd) \
	case CMD_##cmd: \
		drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
		return numdw
	MATCH(STATE_BASE_ADDRESS);
	MATCH(GPGPU_CSR_BASE_ADDRESS);
	MATCH(STATE_COMPUTE_MODE);
	MATCH3D(3DSTATE_BTD);
	MATCH(STATE_SYSTEM_MEM_FENCE_ADDRESS);
	MATCH(STATE_CONTEXT_DATA_BASE_ADDRESS);

	MATCH3D(3DSTATE_VF_STATISTICS);

	MATCH(PIPELINE_SELECT);

	MATCH3D(3DSTATE_DRAWING_RECTANGLE_FAST);
	MATCH3D(3DSTATE_CLEAR_PARAMS);
	MATCH3D(3DSTATE_DEPTH_BUFFER);
	MATCH3D(3DSTATE_STENCIL_BUFFER);
	MATCH3D(3DSTATE_HIER_DEPTH_BUFFER);
	MATCH3D(3DSTATE_VERTEX_BUFFERS);
	MATCH3D(3DSTATE_VERTEX_ELEMENTS);
	MATCH3D(3DSTATE_INDEX_BUFFER);
	MATCH3D(3DSTATE_VF);
	MATCH3D(3DSTATE_MULTISAMPLE);
	MATCH3D(3DSTATE_CC_STATE_POINTERS);
	MATCH3D(3DSTATE_SCISSOR_STATE_POINTERS);
	MATCH3D(3DSTATE_VS);
	MATCH3D(3DSTATE_GS);
	MATCH3D(3DSTATE_CLIP);
	MATCH3D(3DSTATE_SF);
	MATCH3D(3DSTATE_WM);
	MATCH3D(3DSTATE_CONSTANT_VS);
	MATCH3D(3DSTATE_CONSTANT_GS);
	MATCH3D(3DSTATE_CONSTANT_PS);
	MATCH3D(3DSTATE_SAMPLE_MASK);
	MATCH3D(3DSTATE_CONSTANT_HS);
	MATCH3D(3DSTATE_CONSTANT_DS);
	MATCH3D(3DSTATE_HS);
	MATCH3D(3DSTATE_TE);
	MATCH3D(3DSTATE_DS);
	MATCH3D(3DSTATE_STREAMOUT);
	MATCH3D(3DSTATE_SBE);
	MATCH3D(3DSTATE_PS);
	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP);
	MATCH3D(3DSTATE_CPS_POINTERS);
	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_CC);
	MATCH3D(3DSTATE_BLEND_STATE_POINTERS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_VS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_HS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_DS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_GS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_PS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_VS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_HS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_DS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_GS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_PS);
	MATCH3D(3DSTATE_VF_INSTANCING);
	MATCH3D(3DSTATE_VF_SGVS);
	MATCH3D(3DSTATE_VF_TOPOLOGY);
	MATCH3D(3DSTATE_WM_CHROMAKEY);
	MATCH3D(3DSTATE_PS_BLEND);
	MATCH3D(3DSTATE_WM_DEPTH_STENCIL);
	MATCH3D(3DSTATE_PS_EXTRA);
	MATCH3D(3DSTATE_RASTER);
	MATCH3D(3DSTATE_SBE_SWIZ);
	MATCH3D(3DSTATE_WM_HZ_OP);
	MATCH3D(3DSTATE_VF_COMPONENT_PACKING);
	MATCH3D(3DSTATE_VF_SGVS_2);
	MATCH3D(3DSTATE_VFG);
	MATCH3D(3DSTATE_URB_ALLOC_VS);
	MATCH3D(3DSTATE_URB_ALLOC_HS);
	MATCH3D(3DSTATE_URB_ALLOC_DS);
	MATCH3D(3DSTATE_URB_ALLOC_GS);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_0);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_1);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_2);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_3);
	MATCH3D(3DSTATE_PRIMITIVE_REPLICATION);
	MATCH3D(3DSTATE_TBIMR_TILE_PASS_INFO);
	MATCH3D(3DSTATE_AMFS);
	MATCH3D(3DSTATE_DEPTH_BOUNDS);
	MATCH3D(3DSTATE_AMFS_TEXTURE_POINTERS);
	MATCH3D(3DSTATE_CONSTANT_TS_POINTER);
	MATCH3D(3DSTATE_MESH_CONTROL);
	MATCH3D(3DSTATE_MESH_DISTRIB);
	MATCH3D(3DSTATE_TASK_REDISTRIB);
	MATCH3D(3DSTATE_MESH_SHADER);
	MATCH3D(3DSTATE_MESH_SHADER_DATA);
	MATCH3D(3DSTATE_TASK_CONTROL);
	MATCH3D(3DSTATE_TASK_SHADER);
	MATCH3D(3DSTATE_TASK_SHADER_DATA);
	MATCH3D(3DSTATE_URB_ALLOC_MESH);
	MATCH3D(3DSTATE_URB_ALLOC_TASK);
	MATCH3D(3DSTATE_CLIP_MESH);
	MATCH3D(3DSTATE_SBE_MESH);
	MATCH3D(3DSTATE_CPSIZE_CONTROL_BUFFER);
	MATCH3D(3DSTATE_COARSE_PIXEL);

	MATCH3D(3DSTATE_DRAWING_RECTANGLE);
	MATCH3D(3DSTATE_CHROMA_KEY);
	MATCH3D(3DSTATE_POLY_STIPPLE_OFFSET);
	MATCH3D(3DSTATE_POLY_STIPPLE_PATTERN);
	MATCH3D(3DSTATE_LINE_STIPPLE);
	MATCH3D(3DSTATE_AA_LINE_PARAMETERS);
	MATCH3D(3DSTATE_MONOFILTER_SIZE);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_VS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_HS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_DS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_GS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_PS);
	MATCH3D(3DSTATE_SO_DECL_LIST);
	MATCH3D(3DSTATE_SO_BUFFER);
	MATCH3D(3DSTATE_BINDING_TABLE_POOL_ALLOC);
	MATCH3D(3DSTATE_SAMPLE_PATTERN);
	MATCH3D(3DSTATE_3D_MODE);
	MATCH3D(3DSTATE_SUBSLICE_HASH_TABLE);
	MATCH3D(3DSTATE_SLICE_TABLE_STATE_POINTERS);
	MATCH3D(3DSTATE_PTBR_TILE_PASS_INFO);

	default:
		drm_printf(p, "[%#010x] unknown GFXPIPE command (pipeline=%#x, opcode=%#x, subopcode=%#x), likely %d dwords\n",
			   *dw, pipeline, opcode, subopcode, numdw);
		return numdw;
	}
}
static int dump_gfx_state_command(struct drm_printer *p,
				  struct xe_gt *gt,
				  u32 *dw,
				  int remaining_dw)
{
	u32 numdw = instr_dw(*dw);
	u32 opcode = REG_FIELD_GET(GFX_STATE_OPCODE, *dw);

	/*
	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
	 * remaining size of the LRC.
	 */
	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
		numdw = remaining_dw;

	switch (*dw & (XE_INSTR_GFX_STATE | GFX_STATE_OPCODE)) {
	MATCH(STATE_WRITE_INLINE);

	default:
		drm_printf(p, "[%#010x] unknown GFX_STATE command (opcode=%#x), likely %d dwords\n",
			   *dw, opcode, numdw);
		return numdw;
	}
}
void xe_lrc_dump_default(struct drm_printer *p,
			 struct xe_gt *gt,
			 enum xe_engine_class hwe_class)
{
	u32 *dw;
	int remaining_dw, num_dw;

	if (!gt->default_lrc[hwe_class]) {
		drm_printf(p, "No default LRC for class %d\n", hwe_class);
		return;
	}

	/*
	 * Skip the beginning of the LRC since it contains the per-process
	 * hardware status page.
	 */
	dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE;
	remaining_dw = (xe_gt_lrc_size(gt, hwe_class) - LRC_PPHWSP_SIZE) / 4;

	while (remaining_dw > 0) {
		if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_MI) {
			num_dw = dump_mi_command(p, gt, dw, remaining_dw);
		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE) {
			num_dw = dump_gfxpipe_command(p, gt, dw, remaining_dw);
		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFX_STATE) {
			num_dw = dump_gfx_state_command(p, gt, dw, remaining_dw);
		} else {
			num_dw = min(instr_dw(*dw), remaining_dw);
			drm_printf(p, "[%#10x] Unknown instruction of type %#x, likely %d dwords\n",
				   *dw, REG_FIELD_GET(XE_INSTR_CMD_TYPE, *dw),
				   num_dw);
		}

		dw += num_dw;
		remaining_dw -= num_dw;
	}
}
struct instr_state {
	u32 instr;
	u16 num_dw;
};
static const struct instr_state xe_hpg_svg_state[] = {
	{ .instr = CMD_3DSTATE_CONSTANT_VS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_HS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_DS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_GS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_VERTEX_ELEMENTS, .num_dw = 69 },
	{ .instr = CMD_3DSTATE_VF_COMPONENT_PACKING, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_VF_SGVS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_VF_SGVS_2, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_VS, .num_dw = 9 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_VS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_VS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_VS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_STREAMOUT, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_0, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_1, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_2, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_3, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_CLIP, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_PRIMITIVE_REPLICATION, .num_dw = 6 },
	{ .instr = CMD_3DSTATE_CLIP_MESH, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SF, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_SCISSOR_STATE_POINTERS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_RASTER, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_TBIMR_TILE_PASS_INFO, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_WM_HZ_OP, .num_dw = 6 },
	{ .instr = CMD_3DSTATE_MULTISAMPLE, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_HS, .num_dw = 9 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_HS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_HS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_HS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TASK_CONTROL, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TASK_SHADER, .num_dw = 7 },
	{ .instr = CMD_3DSTATE_TASK_SHADER_DATA, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_TASK, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TE, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_TASK_REDISTRIB, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_DS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_DS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_DS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_DS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_GS, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_GS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_GS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_GS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_CONTROL, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_SHADER_DATA, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_MESH, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_SHADER, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_DRAWING_RECTANGLE, .num_dw = 4 },
};
void xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, struct xe_bb *bb)
{
	struct xe_gt *gt = q->hwe->gt;
	struct xe_device *xe = gt_to_xe(gt);
	const struct instr_state *state_table = NULL;
	int state_table_size = 0;

	/*
	 * Wa_14019789679
	 *
	 * If the driver doesn't explicitly emit the SVG instructions while
	 * setting up the default LRC, the context switch will write 0's
	 * (noops) into the LRC memory rather than the expected instruction
	 * headers. Application contexts start out as a copy of the default
	 * LRC, and if they also do not emit specific settings for some SVG
	 * state, then on context restore they'll unintentionally inherit
	 * whatever state setting the previous context had programmed into the
	 * hardware (i.e., the lack of a 3DSTATE_* instruction in the LRC will
	 * prevent the hardware from resetting that state back to any specific
	 * value).
	 *
	 * The official workaround only requires emitting 3DSTATE_MESH_CONTROL
	 * since that's a specific state setting that can easily cause GPU
	 * hangs if unintentionally inherited. However to be safe we'll
	 * continue to emit all of the SVG state since it's best not to leak
	 * any of the state between contexts, even if that leakage is harmless.
	 */
	if (XE_WA(gt, 14019789679) && q->hwe->class == XE_ENGINE_CLASS_RENDER) {
		state_table = xe_hpg_svg_state;
		state_table_size = ARRAY_SIZE(xe_hpg_svg_state);
	}

	if (!state_table) {
		xe_gt_dbg(gt, "No non-register state to emit on graphics ver %d.%02d\n",
			  GRAPHICS_VER(xe), GRAPHICS_VERx100(xe) % 100);
		return;
	}

	for (int i = 0; i < state_table_size; i++) {
		u32 instr = state_table[i].instr;
		u16 num_dw = state_table[i].num_dw;
		bool is_single_dw = ((instr & GFXPIPE_PIPELINE) == PIPELINE_SINGLE_DW);

		xe_gt_assert(gt, (instr & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE);
		xe_gt_assert(gt, num_dw != 0);
		xe_gt_assert(gt, is_single_dw ^ (num_dw > 1));

		/*
		 * Xe2's SVG context is the same as the one on DG2 / MTL
		 * except that 3DSTATE_DRAWING_RECTANGLE (non-pipelined) has
		 * been replaced by 3DSTATE_DRAWING_RECTANGLE_FAST (pipelined).
		 * Just make the replacement here rather than defining a
		 * whole separate table for the single trivial change.
		 */
		if (GRAPHICS_VER(xe) >= 20 &&
		    instr == CMD_3DSTATE_DRAWING_RECTANGLE)
			instr = CMD_3DSTATE_DRAWING_RECTANGLE_FAST;

		bb->cs[bb->len] = instr;
		if (!is_single_dw)
			bb->cs[bb->len] |= (num_dw - 2);

		bb->len += num_dw;
	}
}
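
/*
 * Example of the header math above: CMD_3DSTATE_VF_COMPONENT_PACKING with
 * num_dw = 5 is emitted as its header OR'd with 3, matching the
 * "length = dwords - 2" convention that instr_dw() decodes.
 */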
struct xe_lrc_snapshot *xe_lrc_snapshot_capture(struct xe_lrc *lrc)
{
	struct xe_lrc_snapshot *snapshot = kmalloc(sizeof(*snapshot), GFP_NOWAIT);

	if (!snapshot)
		return NULL;

	snapshot->context_desc = xe_lrc_ggtt_addr(lrc);
	snapshot->ring_addr = __xe_lrc_ring_ggtt_addr(lrc);
	snapshot->indirect_context_desc = xe_lrc_indirect_ring_ggtt_addr(lrc);
	snapshot->head = xe_lrc_ring_head(lrc);
	snapshot->tail.internal = lrc->ring.tail;
	snapshot->tail.memory = xe_lrc_ring_tail(lrc);
	snapshot->start = xe_lrc_ring_start(lrc);
	snapshot->start_seqno = xe_lrc_start_seqno(lrc);
	snapshot->seqno = xe_lrc_seqno(lrc);
	snapshot->lrc_bo = xe_bo_get(lrc->bo);
	snapshot->lrc_offset = xe_lrc_pphwsp_offset(lrc);
	snapshot->lrc_size = lrc->bo->size - snapshot->lrc_offset -
			     LRC_WA_BB_SIZE;
	snapshot->lrc_snapshot = NULL;
	snapshot->ctx_timestamp = lower_32_bits(xe_lrc_ctx_timestamp(lrc));
	snapshot->ctx_job_timestamp = xe_lrc_ctx_job_timestamp(lrc);

	return snapshot;
}
void xe_lrc_snapshot_capture_delayed(struct xe_lrc_snapshot *snapshot)
{
	struct xe_bo *bo;
	struct iosys_map src;

	if (!snapshot)
		return;

	bo = snapshot->lrc_bo;
	snapshot->lrc_bo = NULL;

	snapshot->lrc_snapshot = kvmalloc(snapshot->lrc_size, GFP_KERNEL);
	if (!snapshot->lrc_snapshot)
		goto put_bo;

	xe_bo_lock(bo, false);
	if (!ttm_bo_vmap(&bo->ttm, &src)) {
		xe_map_memcpy_from(xe_bo_device(bo),
				   snapshot->lrc_snapshot, &src, snapshot->lrc_offset,
				   snapshot->lrc_size);
		ttm_bo_vunmap(&bo->ttm, &src);
	} else {
		kvfree(snapshot->lrc_snapshot);
		snapshot->lrc_snapshot = NULL;
	}
	xe_bo_unlock(bo);
put_bo:
	xe_bo_put(bo);
}
void xe_lrc_snapshot_print(struct xe_lrc_snapshot *snapshot, struct drm_printer *p)
{
	unsigned long i;

	if (!snapshot)
		return;

	drm_printf(p, "\tHW Context Desc: 0x%08x\n", snapshot->context_desc);
	drm_printf(p, "\tHW Ring address: 0x%08x\n",
		   snapshot->ring_addr);
	drm_printf(p, "\tHW Indirect Ring State: 0x%08x\n",
		   snapshot->indirect_context_desc);
	drm_printf(p, "\tLRC Head: (memory) %u\n", snapshot->head);
	drm_printf(p, "\tLRC Tail: (internal) %u, (memory) %u\n",
		   snapshot->tail.internal, snapshot->tail.memory);
	drm_printf(p, "\tRing start: (memory) 0x%08x\n", snapshot->start);
	drm_printf(p, "\tStart seqno: (memory) %d\n", snapshot->start_seqno);
	drm_printf(p, "\tSeqno: (memory) %d\n", snapshot->seqno);
	drm_printf(p, "\tTimestamp: 0x%08x\n", snapshot->ctx_timestamp);
	drm_printf(p, "\tJob Timestamp: 0x%08x\n", snapshot->ctx_job_timestamp);

	if (!snapshot->lrc_snapshot)
		return;

	drm_printf(p, "\t[HWSP].length: 0x%x\n", LRC_PPHWSP_SIZE);
	drm_puts(p, "\t[HWSP].data: ");
	for (i = 0; i < LRC_PPHWSP_SIZE; i += sizeof(u32)) {
		u32 *val = snapshot->lrc_snapshot + i;
		char dumped[ASCII85_BUFSZ];

		drm_puts(p, ascii85_encode(*val, dumped));
	}

	drm_printf(p, "\n\t[HWCTX].length: 0x%lx\n", snapshot->lrc_size - LRC_PPHWSP_SIZE);
	drm_puts(p, "\t[HWCTX].data: ");
	for (; i < snapshot->lrc_size; i += sizeof(u32)) {
		u32 *val = snapshot->lrc_snapshot + i;
		char dumped[ASCII85_BUFSZ];

		drm_puts(p, ascii85_encode(*val, dumped));
	}
	drm_puts(p, "\n");
}
void xe_lrc_snapshot_free(struct xe_lrc_snapshot *snapshot)
{
	if (!snapshot)
		return;

	kvfree(snapshot->lrc_snapshot);
	if (snapshot->lrc_bo)
		xe_bo_put(snapshot->lrc_bo);

	kfree(snapshot);
}
static int get_ctx_timestamp(struct xe_lrc *lrc, u32 engine_id, u64 *reg_ctx_ts)
{
	u16 class = REG_FIELD_GET(ENGINE_CLASS_ID, engine_id);
	u16 instance = REG_FIELD_GET(ENGINE_INSTANCE_ID, engine_id);
	struct xe_hw_engine *hwe;
	u64 val;

	hwe = xe_gt_hw_engine(lrc->gt, class, instance, false);
	if (xe_gt_WARN_ONCE(lrc->gt, !hwe || xe_hw_engine_is_reserved(hwe),
			    "Unexpected engine class:instance %d:%d for context utilization\n",
			    class, instance))
		return -1;

	if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
		val = xe_mmio_read64_2x32(&hwe->gt->mmio,
					  RING_CTX_TIMESTAMP(hwe->mmio_base));
	else
		val = xe_mmio_read32(&hwe->gt->mmio,
				     RING_CTX_TIMESTAMP(hwe->mmio_base));

	*reg_ctx_ts = val;

	return 0;
}
/**
 * xe_lrc_update_timestamp() - Update ctx timestamp
 * @lrc: Pointer to the lrc.
 * @old_ts: Old timestamp value
 *
 * Populate @old_ts with the current saved ctx timestamp, read the new ctx
 * timestamp and update the saved value. With support for active contexts,
 * the calculation may be slightly racy, so follow a read-again logic to
 * ensure that the context is still active before returning the right
 * timestamp.
 *
 * Returns: New ctx timestamp value
 */
u64 xe_lrc_update_timestamp(struct xe_lrc *lrc, u64 *old_ts)
{
	u64 lrc_ts, reg_ts;
	u32 engine_id;

	*old_ts = lrc->ctx_timestamp;

	lrc_ts = xe_lrc_ctx_timestamp(lrc);
	/* CTX_TIMESTAMP mmio read is invalid on VF, so return the LRC value */
	if (IS_SRIOV_VF(lrc_to_xe(lrc))) {
		lrc->ctx_timestamp = lrc_ts;
		goto done;
	}

	if (lrc_ts == CONTEXT_ACTIVE) {
		engine_id = xe_lrc_engine_id(lrc);
		if (!get_ctx_timestamp(lrc, engine_id, &reg_ts))
			lrc->ctx_timestamp = reg_ts;

		/* read lrc again to ensure context is still active */
		lrc_ts = xe_lrc_ctx_timestamp(lrc);
	}

	/*
	 * If context switched out, just use the lrc_ts. Note that this needs
	 * to be a separate if condition.
	 */
	if (lrc_ts != CONTEXT_ACTIVE)
		lrc->ctx_timestamp = lrc_ts;

done:
	trace_xe_lrc_update_timestamp(lrc, *old_ts);

	return lrc->ctx_timestamp;
}
/**
 * xe_lrc_ring_is_idle() - LRC is idle
 * @lrc: Pointer to the lrc.
 *
 * Compare LRC ring head and tail to determine if idle.
 *
 * Return: True if ring is idle, False otherwise
 */
bool xe_lrc_ring_is_idle(struct xe_lrc *lrc)
{
	return xe_lrc_ring_head(lrc) == xe_lrc_ring_tail(lrc);
}