[linux-2.6-block.git] / drivers / gpu / drm / i915 / gt / gen8_engine_cs.c

// SPDX-License-Identifier: MIT
/*
 * Copyright © 2014 Intel Corporation
 */

#include "gen8_engine_cs.h"
#include "i915_drv.h"
#include "intel_lrc.h"
#include "intel_gpu_commands.h"
#include "intel_ring.h"

/*
 * Emit a PIPE_CONTROL based flush and/or invalidate into @rq's ring.
 *
 * PIPE_CONTROL_CS_STALL is always requested. EMIT_FLUSH adds the cache
 * flush bits; EMIT_INVALIDATE adds the invalidate bits together with a
 * qword write indexed at LRC_PPHWSP_SCRATCH_ADDR. When invalidating, gen9
 * additionally gets a leading NULL PIPE_CONTROL, and the KBL revisions
 * matched by IS_KBL_GT_REVID(0..B0) get the main PIPE_CONTROL bracketed by
 * a DC flush and a CS stall (WaForGAMHang).
 *
 * Returns 0 on success or the error reported by intel_ring_begin().
 */
int gen8_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
	bool need_null_pc = false, need_gam_wa = false;
	u32 pc_flags = PIPE_CONTROL_CS_STALL;
	int dwords = 6; /* the main PIPE_CONTROL */
	u32 *cs;

	if (mode & EMIT_FLUSH)
		pc_flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
			    PIPE_CONTROL_DEPTH_CACHE_FLUSH |
			    PIPE_CONTROL_DC_FLUSH_ENABLE |
			    PIPE_CONTROL_FLUSH_ENABLE;

	if (mode & EMIT_INVALIDATE) {
		pc_flags |= PIPE_CONTROL_TLB_INVALIDATE |
			    PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE |
			    PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
			    PIPE_CONTROL_VF_CACHE_INVALIDATE |
			    PIPE_CONTROL_CONST_CACHE_INVALIDATE |
			    PIPE_CONTROL_STATE_CACHE_INVALIDATE |
			    PIPE_CONTROL_QW_WRITE |
			    PIPE_CONTROL_STORE_DATA_INDEX;

		/*
		 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
		 * pipe control.
		 */
		need_null_pc = IS_GEN(rq->engine->i915, 9);

		/* WaForGAMHang:kbl */
		need_gam_wa = IS_KBL_GT_REVID(rq->engine->i915,
					      0, KBL_REVID_B0);
	}

	/* Each extra PIPE_CONTROL costs another 6 dwords of ring space. */
	if (need_null_pc)
		dwords += 6;
	if (need_gam_wa)
		dwords += 12;

	cs = intel_ring_begin(rq, dwords);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	if (need_null_pc)
		cs = gen8_emit_pipe_control(cs, 0, 0);

	if (need_gam_wa)
		cs = gen8_emit_pipe_control(cs,
					    PIPE_CONTROL_DC_FLUSH_ENABLE, 0);

	cs = gen8_emit_pipe_control(cs, pc_flags, LRC_PPHWSP_SCRATCH_ADDR);

	if (need_gam_wa)
		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);

	intel_ring_advance(rq, cs);

	return 0;
}

/*
 * Emit a 4-dword MI_FLUSH_DW into @rq's ring for a non-render engine.
 *
 * The flush always performs an indexed dword store of 0 at
 * LRC_PPHWSP_SCRATCH_ADDR. EMIT_INVALIDATE additionally requests a TLB
 * invalidate, plus MI_INVALIDATE_BSD on video decode engines.
 *
 * Returns 0 on success or the error reported by intel_ring_begin().
 */
int gen8_emit_flush_xcs(struct i915_request *rq, u32 mode)
{
	/*
	 * We always require a command barrier so that subsequent
	 * commands, such as breadcrumb interrupts, are strictly ordered
	 * wrt the contents of the write cache being flushed to memory
	 * (and thus being coherent from the CPU).
	 */
	u32 flush_dw = (MI_FLUSH_DW + 1) |
		       MI_FLUSH_DW_STORE_INDEX |
		       MI_FLUSH_DW_OP_STOREDW;
	u32 *cs;

	if (mode & EMIT_INVALIDATE) {
		flush_dw |= MI_INVALIDATE_TLB;
		if (rq->engine->class == VIDEO_DECODE_CLASS)
			flush_dw |= MI_INVALIDATE_BSD;
	}

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	cs[0] = flush_dw;
	cs[1] = LRC_PPHWSP_SCRATCH_ADDR;
	cs[2] = 0; /* upper addr */
	cs[3] = 0; /* value */
	intel_ring_advance(rq, cs + 4);

	return 0;
}

/*
 * Emit the gen11 render-engine flush and/or invalidate.
 *
 * EMIT_FLUSH and EMIT_INVALIDATE are handled independently: each one that
 * is set in @mode reserves 6 dwords and emits its own PIPE_CONTROL with a
 * CS stall and an indexed qword write at LRC_PPHWSP_SCRATCH_ADDR. The
 * flush, if requested, is emitted before the invalidate.
 *
 * Returns 0 on success or the error reported by intel_ring_begin().
 */
int gen11_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
	u32 *cs;

	if (mode & EMIT_FLUSH) {
		const u32 flush_flags =
			PIPE_CONTROL_CS_STALL |
			PIPE_CONTROL_TILE_CACHE_FLUSH |
			PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
			PIPE_CONTROL_DEPTH_CACHE_FLUSH |
			PIPE_CONTROL_DC_FLUSH_ENABLE |
			PIPE_CONTROL_FLUSH_ENABLE |
			PIPE_CONTROL_QW_WRITE |
			PIPE_CONTROL_STORE_DATA_INDEX;

		cs = intel_ring_begin(rq, 6);
		if (IS_ERR(cs))
			return PTR_ERR(cs);

		cs = gen8_emit_pipe_control(cs, flush_flags,
					    LRC_PPHWSP_SCRATCH_ADDR);
		intel_ring_advance(rq, cs);
	}

	if (mode & EMIT_INVALIDATE) {
		const u32 inv_flags =
			PIPE_CONTROL_CS_STALL |
			PIPE_CONTROL_COMMAND_CACHE_INVALIDATE |
			PIPE_CONTROL_TLB_INVALIDATE |
			PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE |
			PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
			PIPE_CONTROL_VF_CACHE_INVALIDATE |
			PIPE_CONTROL_CONST_CACHE_INVALIDATE |
			PIPE_CONTROL_STATE_CACHE_INVALIDATE |
			PIPE_CONTROL_QW_WRITE |
			PIPE_CONTROL_STORE_DATA_INDEX;

		cs = intel_ring_begin(rq, 6);
		if (IS_ERR(cs))
			return PTR_ERR(cs);

		cs = gen8_emit_pipe_control(cs, inv_flags,
					    LRC_PPHWSP_SCRATCH_ADDR);
		intel_ring_advance(rq, cs);
	}

	return 0;
}

/*
 * Build the MI_ARB_CHECK dword used to toggle the CS pre-parser: bit 8 is
 * always set and bit 0 carries @state. Callers in this file pass true
 * before the commands they want protected and false afterwards.
 */
static u32 preparser_disable(bool state)
{
	const u32 state_bit = state ? 1 : 0;

	return MI_ARB_CHECK | (1 << 8) | state_bit;
}

/*
 * Look up the AUX_NV register belonging to a video decode or video
 * enhancement @engine, selected by engine class and instance.
 *
 * Any other engine class trips GEM_BUG_ON() and yields INVALID_MMIO_REG.
 *
 * NOTE(review): engine->instance indexes the tables unchecked; this
 * assumes at most 4 VD and 2 VE instances - confirm against callers.
 */
static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine)
{
	static const i915_reg_t vdbox_aux_nv[] = {
		GEN12_VD0_AUX_NV,
		GEN12_VD1_AUX_NV,
		GEN12_VD2_AUX_NV,
		GEN12_VD3_AUX_NV,
	};
	static const i915_reg_t vebox_aux_nv[] = {
		GEN12_VE0_AUX_NV,
		GEN12_VE1_AUX_NV,
	};

	switch (engine->class) {
	case VIDEO_DECODE_CLASS:
		return vdbox_aux_nv[engine->instance];
	case VIDEO_ENHANCEMENT_CLASS:
		return vebox_aux_nv[engine->instance];
	default:
		break;
	}

	GEM_BUG_ON("unknown aux_inv reg\n");
	return INVALID_MMIO_REG;
}

/*
 * Write a single-register MI_LOAD_REGISTER_IMM that stores AUX_INV into
 * @inv_reg, padded with one MI_NOOP (4 dwords in total).
 *
 * Returns the command stream pointer advanced past the emitted dwords.
 */
static u32 *gen12_emit_aux_table_inv(const i915_reg_t inv_reg, u32 *cs)
{
	cs[0] = MI_LOAD_REGISTER_IMM(1);
	cs[1] = i915_mmio_reg_offset(inv_reg);
	cs[2] = AUX_INV;
	cs[3] = MI_NOOP;

	return cs + 4;
}

/*
 * Emit the gen12 render-engine flush and/or invalidate.
 *
 * EMIT_FLUSH emits one 6-dword PIPE_CONTROL (with HDC pipeline flush).
 * EMIT_INVALIDATE emits 12 dwords: the invalidating PIPE_CONTROL followed
 * by the GFX CCS aux-table invalidation, both wrapped in a pre-parser
 * disable/enable pair. The flush, if requested, comes first.
 *
 * Returns 0 on success or the error reported by intel_ring_begin().
 */
int gen12_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
	u32 *cs;

	if (mode & EMIT_FLUSH) {
		const u32 flush_flags =
			PIPE_CONTROL_TILE_CACHE_FLUSH |
			PIPE_CONTROL_FLUSH_L3 |
			PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
			PIPE_CONTROL_DEPTH_CACHE_FLUSH |
			/* Wa_1409600907:tgl */
			PIPE_CONTROL_DEPTH_STALL |
			PIPE_CONTROL_DC_FLUSH_ENABLE |
			PIPE_CONTROL_FLUSH_ENABLE |
			PIPE_CONTROL_STORE_DATA_INDEX |
			PIPE_CONTROL_QW_WRITE |
			PIPE_CONTROL_CS_STALL;

		cs = intel_ring_begin(rq, 6);
		if (IS_ERR(cs))
			return PTR_ERR(cs);

		cs = gen12_emit_pipe_control(cs,
					     PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
					     flush_flags,
					     LRC_PPHWSP_SCRATCH_ADDR);
		intel_ring_advance(rq, cs);
	}

	if (mode & EMIT_INVALIDATE) {
		const u32 inv_flags =
			PIPE_CONTROL_COMMAND_CACHE_INVALIDATE |
			PIPE_CONTROL_TLB_INVALIDATE |
			PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE |
			PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
			PIPE_CONTROL_VF_CACHE_INVALIDATE |
			PIPE_CONTROL_CONST_CACHE_INVALIDATE |
			PIPE_CONTROL_STATE_CACHE_INVALIDATE |
			PIPE_CONTROL_STORE_DATA_INDEX |
			PIPE_CONTROL_QW_WRITE |
			PIPE_CONTROL_CS_STALL;

		/* 1 + 6 (pipe control) + 4 (aux table inv) + 1 dwords */
		cs = intel_ring_begin(rq, 8 + 4);
		if (IS_ERR(cs))
			return PTR_ERR(cs);

		/*
		 * Prevent the pre-parser from skipping past the TLB
		 * invalidate and loading a stale page for the batch
		 * buffer / request payload.
		 */
		*cs++ = preparser_disable(true);

		cs = gen8_emit_pipe_control(cs, inv_flags,
					    LRC_PPHWSP_SCRATCH_ADDR);

		/* hsdes: 1809175790 */
		cs = gen12_emit_aux_table_inv(GEN12_GFX_CCS_AUX_NV, cs);

		*cs++ = preparser_disable(false);
		intel_ring_advance(rq, cs);
	}

	return 0;
}

/*
 * Emit the gen12 MI_FLUSH_DW based flush for a non-render engine.
 *
 * The MI_FLUSH_DW (4 dwords) is always emitted. With EMIT_INVALIDATE it
 * also invalidates the TLB, is wrapped in a pre-parser disable/enable
 * pair, and - for every engine in rq->engine->mask other than BCS0 - is
 * followed by an MI_LOAD_REGISTER_IMM writing AUX_INV to that engine's
 * aux-invalidation register.
 *
 * Returns 0 on success or the error reported by intel_ring_begin().
 */
int gen12_emit_flush_xcs(struct i915_request *rq, u32 mode)
{
	const bool invalidate = mode & EMIT_INVALIDATE;
	intel_engine_mask_t aux_inv = 0;
	u32 dwords = 4; /* MI_FLUSH_DW */
	u32 flush_dw, *cs;

	if (invalidate) {
		dwords += 2; /* pre-parser disable + enable */
		aux_inv = rq->engine->mask & ~BIT(BCS0);
	}
	if (aux_inv) /* LRI header + NOOP, plus a reg/value pair per engine */
		dwords += 2 * hweight8(aux_inv) + 2;

	cs = intel_ring_begin(rq, dwords);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	if (invalidate)
		*cs++ = preparser_disable(true);

	/*
	 * We always require a command barrier so that subsequent
	 * commands, such as breadcrumb interrupts, are strictly ordered
	 * wrt the contents of the write cache being flushed to memory
	 * (and thus being coherent from the CPU).
	 */
	flush_dw = (MI_FLUSH_DW + 1) |
		   MI_FLUSH_DW_STORE_INDEX |
		   MI_FLUSH_DW_OP_STOREDW;

	if (invalidate) {
		flush_dw |= MI_INVALIDATE_TLB;
		if (rq->engine->class == VIDEO_DECODE_CLASS)
			flush_dw |= MI_INVALIDATE_BSD;
	}

	*cs++ = flush_dw;
	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
	*cs++ = 0; /* upper addr */
	*cs++ = 0; /* value */

	if (aux_inv) { /* hsdes: 1809175790 */
		struct intel_engine_cs *engine;
		unsigned int tmp;

		*cs++ = MI_LOAD_REGISTER_IMM(hweight8(aux_inv));
		for_each_engine_masked(engine, rq->engine->gt,
				       aux_inv, tmp) {
			*cs++ = i915_mmio_reg_offset(aux_inv_reg(engine));
			*cs++ = AUX_INV;
		}
		*cs++ = MI_NOOP;
	}

	if (invalidate)
		*cs++ = preparser_disable(false);

	intel_ring_advance(rq, cs);

	return 0;
}

/*
 * GGTT address of the preemption semaphore: the engine status page's GGTT
 * offset plus I915_GEM_HWS_PREEMPT_ADDR.
 */
static u32 preempt_address(struct intel_engine_cs *engine)
{
	const u32 status_page = i915_ggtt_offset(engine->status_page.vma);

	return status_page + I915_GEM_HWS_PREEMPT_ADDR;
}

/*
 * GGTT offset at which @rq's seqno is written: the request's own HWSP
 * cacheline when it has one, otherwise its timeline's hwsp_offset.
 */
static u32 hwsp_offset(const struct i915_request *rq)
{
	const struct intel_timeline_cacheline *cacheline;

	/* Before the request is executed, the timeline/cacheline is fixed */
	cacheline = rcu_dereference_protected(rq->hwsp_cacheline, 1);
	if (!cacheline)
		return rcu_dereference_protected(rq->timeline, 1)->hwsp_offset;

	return cacheline->ggtt_offset;
}

/*
 * Emit the initial breadcrumb for @rq, if its timeline uses one.
 *
 * Writes (seqno - 1) to the request's HWSP slot via a GGTT store, follows
 * it with an arbitration check, records the resulting ring position in
 * rq->infix and marks the request with I915_FENCE_FLAG_INITIAL_BREADCRUMB.
 *
 * Returns 0 on success (including when the timeline has no initial
 * breadcrumb and nothing is emitted), or the error from intel_ring_begin().
 */
int gen8_emit_init_breadcrumb(struct i915_request *rq)
{
	u32 *cs;

	/* The initial breadcrumb must not have been emitted already. */
	GEM_BUG_ON(i915_request_has_initial_breadcrumb(rq));
	if (!i915_request_timeline(rq)->has_initial_breadcrumb)
		return 0;

	/* 4 dwords for the store + MI_NOOP + MI_ARB_CHECK */
	cs = intel_ring_begin(rq, 6);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
	*cs++ = hwsp_offset(rq);
	*cs++ = 0;
	*cs++ = rq->fence.seqno - 1;

	/*
	 * Check if we have been preempted before we even get started.
	 *
	 * After this point i915_request_started() reports true, even if
	 * we get preempted and so are no longer running.
	 *
	 * i915_request_started() is used during preemption processing
	 * to decide if the request is currently inside the user payload
	 * or spinning on a kernel semaphore (or earlier). For no-preemption
	 * requests, we do allow preemption on the semaphore before the user
	 * payload, but do not allow preemption once the request is started.
	 *
	 * i915_request_started() is similarly used during GPU hangs to
	 * determine if the user's payload was guilty, and if so, the
	 * request is banned. Before the request is started, it is assumed
	 * to be unharmed and an innocent victim of another's hang.
	 */
	*cs++ = MI_NOOP;
	*cs++ = MI_ARB_CHECK;

	intel_ring_advance(rq, cs);

	/* Record the updated position of the request's payload */
	rq->infix = intel_ring_offset(rq, cs);

	__set_bit(I915_FENCE_FLAG_INITIAL_BREADCRUMB, &rq->fence.flags);

	return 0;
}

/*
 * Emit a batch buffer start at @offset with arbitration disabled
 * beforehand (4 dwords). @len is unused here. Bit 8 of the
 * MI_BATCH_BUFFER_START dword is set unless @flags contains
 * I915_DISPATCH_SECURE.
 *
 * Returns 0 on success or the error reported by intel_ring_begin().
 */
int gen8_emit_bb_start_noarb(struct i915_request *rq,
			     u64 offset, u32 len,
			     const unsigned int flags)
{
	/* FIXME(BDW+): Address space and security selectors. */
	u32 bb_start = MI_BATCH_BUFFER_START_GEN8;
	u32 *cs;

	if (!(flags & I915_DISPATCH_SECURE))
		bb_start |= BIT(8);

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	/*
	 * WaDisableCtxRestoreArbitration:bdw,chv
	 *
	 * We don't need to perform MI_ARB_ENABLE as often as we do (in
	 * particular all the gen that do not need the w/a at all!), if we
	 * took care to make sure that on every switch into this context
	 * (both ordinary and for preemption) that arbitration was enabled
	 * we would be fine.  However, for gen8 there is another w/a that
	 * requires us to not preempt inside GPGPU execution, so we keep
	 * arbitration disabled for gen8 batches. Arbitration will be
	 * re-enabled before we close the request
	 * (engine->emit_fini_breadcrumb).
	 */
	cs[0] = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	cs[1] = bb_start;
	cs[2] = lower_32_bits(offset);
	cs[3] = upper_32_bits(offset);

	intel_ring_advance(rq, cs + 4);

	return 0;
}

/*
 * Emit a batch buffer start at @offset with arbitration enabled for the
 * duration of the batch and disabled again afterwards (6 dwords).
 * Requests flagged no-preempt are routed to gen8_emit_bb_start_noarb().
 *
 * Returns 0 on success or the error reported by intel_ring_begin().
 */
int gen8_emit_bb_start(struct i915_request *rq,
		       u64 offset, u32 len,
		       const unsigned int flags)
{
	u32 bb_start = MI_BATCH_BUFFER_START_GEN8;
	u32 *cs;

	if (unlikely(i915_request_has_nopreempt(rq)))
		return gen8_emit_bb_start_noarb(rq, offset, len, flags);

	if (!(flags & I915_DISPATCH_SECURE))
		bb_start |= BIT(8);

	cs = intel_ring_begin(rq, 6);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	cs[0] = MI_ARB_ON_OFF | MI_ARB_ENABLE;

	cs[1] = bb_start;
	cs[2] = lower_32_bits(offset);
	cs[3] = upper_32_bits(offset);

	cs[4] = MI_ARB_ON_OFF | MI_ARB_DISABLE;
	cs[5] = MI_NOOP;

	intel_ring_advance(rq, cs + 6);

	return 0;
}

/*
 * Debug check that @rq's wa_tail lies strictly ahead of its head in ring
 * order, as reported by intel_ring_direction().
 */
static void assert_request_valid(struct i915_request *rq)
{
	/* Can we unwind this request without appearing to go forwards? */
	GEM_BUG_ON(intel_ring_direction(rq->ring,
					rq->wa_tail, rq->head) <= 0);
}

/*
 * Emit the two trailing dwords reserved at the end of each request as a
 * workaround for not being allowed to do lite restore with HEAD==TAIL
 * (WaIdleLiteRestore), and record the resulting position in rq->wa_tail.
 *
 * Returns the command stream pointer advanced past the two dwords.
 */
static u32 *gen8_emit_wa_tail(struct i915_request *rq, u32 *cs)
{
	/* Ensure there's always at least one preemption point per-request. */
	cs[0] = MI_ARB_CHECK;
	cs[1] = MI_NOOP;
	cs += 2;

	rq->wa_tail = intel_ring_offset(rq, cs);

	/* Check that entire request is less than half the ring */
	assert_request_valid(rq);

	return cs;
}

/*
 * Emit a 6-dword sequence: an arbitration check followed by a polling
 * MI_SEMAPHORE_WAIT (SAD_EQ_SDD, global GTT) on the engine's preempt
 * address, padded with an MI_NOOP.
 *
 * Returns the command stream pointer advanced past the emitted dwords.
 */
static u32 *emit_preempt_busywait(struct i915_request *rq, u32 *cs)
{
	const u32 sema_wait = MI_SEMAPHORE_WAIT |
			      MI_SEMAPHORE_GLOBAL_GTT |
			      MI_SEMAPHORE_POLL |
			      MI_SEMAPHORE_SAD_EQ_SDD;

	cs[0] = MI_ARB_CHECK; /* trigger IDLE->ACTIVE first */
	cs[1] = sema_wait;
	cs[2] = 0;
	cs[3] = preempt_address(rq->engine);
	cs[4] = 0;
	cs[5] = MI_NOOP;

	return cs + 6;
}

/*
 * Common end-of-request sequence: user interrupt, re-enable arbitration,
 * the preempt busywait when the engine has semaphores, then the
 * workaround tail. rq->tail is recorded before the workaround tail.
 *
 * Returns the command stream pointer after everything emitted.
 */
static __always_inline u32*
gen8_emit_fini_breadcrumb_tail(struct i915_request *rq, u32 *cs)
{
	cs[0] = MI_USER_INTERRUPT;
	cs[1] = MI_ARB_ON_OFF | MI_ARB_ENABLE;
	cs += 2;

	if (intel_engine_has_semaphores(rq->engine))
		cs = emit_preempt_busywait(rq, cs);

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return gen8_emit_wa_tail(rq, cs);
}

/*
 * Write @rq's fence seqno to its HWSP slot with gen8_emit_ggtt_write()
 * (no extra flags). Returns the advanced command stream pointer.
 */
static u32 *emit_xcs_breadcrumb(struct i915_request *rq, u32 *cs)
{
	const u32 seqno_addr = hwsp_offset(rq);

	return gen8_emit_ggtt_write(cs, rq->fence.seqno, seqno_addr, 0);
}

/*
 * Final breadcrumb for non-render engines: the seqno write followed by
 * the common gen8 tail. Returns the advanced command stream pointer.
 */
u32 *gen8_emit_fini_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
{
	cs = emit_xcs_breadcrumb(rq, cs);

	return gen8_emit_fini_breadcrumb_tail(rq, cs);
}

/*
 * Final breadcrumb for the gen8 render engine: a cache-flushing
 * PIPE_CONTROL, then a separate seqno write with FLUSH_ENABLE and
 * CS_STALL, then the common gen8 tail.
 *
 * Returns the advanced command stream pointer.
 */
u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
	const u32 flush_flags = PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
				PIPE_CONTROL_DEPTH_CACHE_FLUSH |
				PIPE_CONTROL_DC_FLUSH_ENABLE;
	const u32 write_flags = PIPE_CONTROL_FLUSH_ENABLE |
				PIPE_CONTROL_CS_STALL;

	cs = gen8_emit_pipe_control(cs, flush_flags, 0);

	/* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
	cs = gen8_emit_ggtt_write_rcs(cs, rq->fence.seqno, hwsp_offset(rq),
				      write_flags);

	return gen8_emit_fini_breadcrumb_tail(rq, cs);
}

/*
 * Final breadcrumb for the gen11 render engine: a single flushing,
 * stalling seqno write followed by the common gen8 tail.
 *
 * Returns the advanced command stream pointer.
 */
u32 *gen11_emit_fini_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
	const u32 write_flags = PIPE_CONTROL_CS_STALL |
				PIPE_CONTROL_TILE_CACHE_FLUSH |
				PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
				PIPE_CONTROL_DEPTH_CACHE_FLUSH |
				PIPE_CONTROL_DC_FLUSH_ENABLE |
				PIPE_CONTROL_FLUSH_ENABLE;

	cs = gen8_emit_ggtt_write_rcs(cs, rq->fence.seqno, hwsp_offset(rq),
				      write_flags);

	return gen8_emit_fini_breadcrumb_tail(rq, cs);
}

/*
 * Note that the CS instruction pre-parser will not stall on the breadcrumb
 * flush and will continue pre-fetching the instructions after it before the
 * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
 * BB_START/END instructions, so, even though we might pre-fetch the pre-amble
 * of the next request before the memory has been flushed, we're guaranteed that
 * we won't access the batch itself too early.
 * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
 * so, if the current request is modifying an instruction in the next request on
 * the same intel_context, we might pre-fetch and then execute the pre-update
 * instruction. To avoid this, the users of self-modifying code should either
 * disable the parser around the code emitting the memory writes, via a new flag
 * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
 * the in-kernel use-cases we've opted to use a separate context, see
 * reloc_gpu() as an example.
 * All the above applies only to the instructions themselves. Non-inline data
 * used by the instructions is not pre-fetched.
 */

/*
 * Gen12 variant of the preempt busywait: an arbitration check followed
 * by a polling MI_SEMAPHORE_WAIT_TOKEN (SAD_EQ_SDD, global GTT) on the
 * engine's preempt address; 6 dwords, the last one being 0.
 *
 * Returns the command stream pointer advanced past the emitted dwords.
 */
static u32 *gen12_emit_preempt_busywait(struct i915_request *rq, u32 *cs)
{
	const u32 sema_wait = MI_SEMAPHORE_WAIT_TOKEN |
			      MI_SEMAPHORE_GLOBAL_GTT |
			      MI_SEMAPHORE_POLL |
			      MI_SEMAPHORE_SAD_EQ_SDD;

	cs[0] = MI_ARB_CHECK; /* trigger IDLE->ACTIVE first */
	cs[1] = sema_wait;
	cs[2] = 0;
	cs[3] = preempt_address(rq->engine);
	cs[4] = 0;
	cs[5] = 0;

	return cs + 6;
}

/*
 * Common gen12 end-of-request sequence: user interrupt, re-enable
 * arbitration, the gen12 preempt busywait when the engine has
 * semaphores, then the workaround tail. rq->tail is recorded before the
 * workaround tail.
 *
 * Returns the command stream pointer after everything emitted.
 */
static __always_inline u32*
gen12_emit_fini_breadcrumb_tail(struct i915_request *rq, u32 *cs)
{
	cs[0] = MI_USER_INTERRUPT;
	cs[1] = MI_ARB_ON_OFF | MI_ARB_ENABLE;
	cs += 2;

	if (intel_engine_has_semaphores(rq->engine))
		cs = gen12_emit_preempt_busywait(rq, cs);

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return gen8_emit_wa_tail(rq, cs);
}

/*
 * Final breadcrumb for gen12 non-render engines: a bare flush, the seqno
 * write, then the common gen12 tail.
 *
 * Returns the advanced command stream pointer.
 */
u32 *gen12_emit_fini_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
{
	/* XXX Stalling flush before seqno write; post-sync not */
	cs = __gen8_emit_flush_dw(cs, 0, 0, 0);
	cs = emit_xcs_breadcrumb(rq, cs);

	return gen12_emit_fini_breadcrumb_tail(rq, cs);
}

/*
 * Final breadcrumb for the gen12 render engine: a single flushing,
 * stalling seqno write (with HDC pipeline flush) followed by the common
 * gen12 tail.
 *
 * Returns the advanced command stream pointer.
 */
u32 *gen12_emit_fini_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
	const u32 write_flags = PIPE_CONTROL_CS_STALL |
				PIPE_CONTROL_TILE_CACHE_FLUSH |
				PIPE_CONTROL_FLUSH_L3 |
				PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
				PIPE_CONTROL_DEPTH_CACHE_FLUSH |
				/* Wa_1409600907:tgl */
				PIPE_CONTROL_DEPTH_STALL |
				PIPE_CONTROL_DC_FLUSH_ENABLE |
				PIPE_CONTROL_FLUSH_ENABLE;

	cs = gen12_emit_ggtt_write_rcs(cs, rq->fence.seqno, hwsp_offset(rq),
				       PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
				       write_flags);

	return gen12_emit_fini_breadcrumb_tail(rq, cs);
}
Commit	Line	Data
d0d829e5 DCS	1	// SPDX-License-Identifier: MIT
	2	/*
	3	* Copyright © 2014 Intel Corporation
	4	*/
	5
	6	#include "gen8_engine_cs.h"
	7	#include "i915_drv.h"
a0d3fdb6	8	#include "intel_lrc.h"
d0d829e5 DCS	9	#include "intel_gpu_commands.h"
	10	#include "intel_ring.h"
	11
	12	int gen8_emit_flush_rcs(struct i915_request *rq, u32 mode)
	13	{
	14	bool vf_flush_wa = false, dc_flush_wa = false;
	15	u32 *cs, flags = 0;
	16	int len;
	17
	18	flags \|= PIPE_CONTROL_CS_STALL;
	19
	20	if (mode & EMIT_FLUSH) {
	21	flags \|= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
	22	flags \|= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
	23	flags \|= PIPE_CONTROL_DC_FLUSH_ENABLE;
	24	flags \|= PIPE_CONTROL_FLUSH_ENABLE;
	25	}
	26
	27	if (mode & EMIT_INVALIDATE) {
	28	flags \|= PIPE_CONTROL_TLB_INVALIDATE;
	29	flags \|= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
	30	flags \|= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
	31	flags \|= PIPE_CONTROL_VF_CACHE_INVALIDATE;
	32	flags \|= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
	33	flags \|= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
	34	flags \|= PIPE_CONTROL_QW_WRITE;
	35	flags \|= PIPE_CONTROL_STORE_DATA_INDEX;
	36
	37	/*
	38	* On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
	39	* pipe control.
	40	*/
	41	if (IS_GEN(rq->engine->i915, 9))
	42	vf_flush_wa = true;
	43
	44	/* WaForGAMHang:kbl */
	45	if (IS_KBL_GT_REVID(rq->engine->i915, 0, KBL_REVID_B0))
	46	dc_flush_wa = true;
	47	}
	48
	49	len = 6;
	50
	51	if (vf_flush_wa)
	52	len += 6;
	53
	54	if (dc_flush_wa)
	55	len += 12;
	56
	57	cs = intel_ring_begin(rq, len);
	58	if (IS_ERR(cs))
	59	return PTR_ERR(cs);
	60
	61	if (vf_flush_wa)
	62	cs = gen8_emit_pipe_control(cs, 0, 0);
	63
	64	if (dc_flush_wa)
	65	cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
	66	0);
	67
	68	cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
	69
	70	if (dc_flush_wa)
	71	cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
	72
73	intel_ring_advance(rq, cs);
74
75	return 0;
76	}
77
78	int gen8_emit_flush_xcs(struct i915_request *rq, u32 mode)
79	{
80	u32 cmd, *cs;
81
82	cs = intel_ring_begin(rq, 4);
83	if (IS_ERR(cs))
84	return PTR_ERR(cs);
85
86	cmd = MI_FLUSH_DW + 1;
87
88	/*
89	* We always require a command barrier so that subsequent
90	* commands, such as breadcrumb interrupts, are strictly ordered
91	* wrt the contents of the write cache being flushed to memory
92	* (and thus being coherent from the CPU).
93	*/
94	cmd \|= MI_FLUSH_DW_STORE_INDEX \| MI_FLUSH_DW_OP_STOREDW;
95
96	if (mode & EMIT_INVALIDATE) {
97	cmd \|= MI_INVALIDATE_TLB;
98	if (rq->engine->class == VIDEO_DECODE_CLASS)
99	cmd \|= MI_INVALIDATE_BSD;
100	}
101
102	*cs++ = cmd;
103	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
104	cs++ = 0; / upper addr */
105	cs++ = 0; / value */
106	intel_ring_advance(rq, cs);
107
108	return 0;
109	}
110
111	int gen11_emit_flush_rcs(struct i915_request *rq, u32 mode)
112	{
113	if (mode & EMIT_FLUSH) {
114	u32 *cs;
115	u32 flags = 0;
116
117	flags \|= PIPE_CONTROL_CS_STALL;
118
119	flags \|= PIPE_CONTROL_TILE_CACHE_FLUSH;
120	flags \|= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
121	flags \|= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
122	flags \|= PIPE_CONTROL_DC_FLUSH_ENABLE;
123	flags \|= PIPE_CONTROL_FLUSH_ENABLE;
124	flags \|= PIPE_CONTROL_QW_WRITE;
125	flags \|= PIPE_CONTROL_STORE_DATA_INDEX;
126
127	cs = intel_ring_begin(rq, 6);
128	if (IS_ERR(cs))
129	return PTR_ERR(cs);
130
131	cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
132	intel_ring_advance(rq, cs);
133	}
134
135	if (mode & EMIT_INVALIDATE) {
136	u32 *cs;
137	u32 flags = 0;
138
139	flags \|= PIPE_CONTROL_CS_STALL;
140
141	flags \|= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
142	flags \|= PIPE_CONTROL_TLB_INVALIDATE;
143	flags \|= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
144	flags \|= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
145	flags \|= PIPE_CONTROL_VF_CACHE_INVALIDATE;
146	flags \|= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
147	flags \|= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
148	flags \|= PIPE_CONTROL_QW_WRITE;
149	flags \|= PIPE_CONTROL_STORE_DATA_INDEX;
150
151	cs = intel_ring_begin(rq, 6);
152	if (IS_ERR(cs))
153	return PTR_ERR(cs);
154
155	cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
156	intel_ring_advance(rq, cs);
157	}
158
159	return 0;
160	}
161
162	static u32 preparser_disable(bool state)
163	{
164	return MI_ARB_CHECK \| 1 << 8 \| state;
165	}
166
167	static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine)
168	{
169	static const i915_reg_t vd[] = {
170	GEN12_VD0_AUX_NV,
171	GEN12_VD1_AUX_NV,
172	GEN12_VD2_AUX_NV,
173	GEN12_VD3_AUX_NV,
174	};
175
176	static const i915_reg_t ve[] = {
177	GEN12_VE0_AUX_NV,
178	GEN12_VE1_AUX_NV,
179	};
180
181	if (engine->class == VIDEO_DECODE_CLASS)
182	return vd[engine->instance];
183
184	if (engine->class == VIDEO_ENHANCEMENT_CLASS)
185	return ve[engine->instance];
186
187	GEM_BUG_ON("unknown aux_inv reg\n");
188	return INVALID_MMIO_REG;
189	}
190
191	static u32 gen12_emit_aux_table_inv(const i915_reg_t inv_reg, u32 cs)
192	{
193	*cs++ = MI_LOAD_REGISTER_IMM(1);
194	*cs++ = i915_mmio_reg_offset(inv_reg);
195	*cs++ = AUX_INV;
196	*cs++ = MI_NOOP;
197
198	return cs;
199	}
200
201	int gen12_emit_flush_rcs(struct i915_request *rq, u32 mode)
202	{
203	if (mode & EMIT_FLUSH) {
204	u32 flags = 0;
205	u32 *cs;
206
207	flags \|= PIPE_CONTROL_TILE_CACHE_FLUSH;
208	flags \|= PIPE_CONTROL_FLUSH_L3;
209	flags \|= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
210	flags \|= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
211	/* Wa_1409600907:tgl */
212	flags \|= PIPE_CONTROL_DEPTH_STALL;
213	flags \|= PIPE_CONTROL_DC_FLUSH_ENABLE;
214	flags \|= PIPE_CONTROL_FLUSH_ENABLE;
215
216	flags \|= PIPE_CONTROL_STORE_DATA_INDEX;
217	flags \|= PIPE_CONTROL_QW_WRITE;
218
219	flags \|= PIPE_CONTROL_CS_STALL;
220
221	cs = intel_ring_begin(rq, 6);
222	if (IS_ERR(cs))
223	return PTR_ERR(cs);
224
225	cs = gen12_emit_pipe_control(cs,
226	PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
227	flags, LRC_PPHWSP_SCRATCH_ADDR);
228	intel_ring_advance(rq, cs);
229	}
230
231	if (mode & EMIT_INVALIDATE) {
232	u32 flags = 0;
233	u32 *cs;
234
235	flags \|= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
236	flags \|= PIPE_CONTROL_TLB_INVALIDATE;
237	flags \|= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
238	flags \|= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
239	flags \|= PIPE_CONTROL_VF_CACHE_INVALIDATE;
240	flags \|= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
241	flags \|= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
242
243	flags \|= PIPE_CONTROL_STORE_DATA_INDEX;
244	flags \|= PIPE_CONTROL_QW_WRITE;
245
246	flags \|= PIPE_CONTROL_CS_STALL;
247
248	cs = intel_ring_begin(rq, 8 + 4);
249	if (IS_ERR(cs))
250	return PTR_ERR(cs);
251
252	/*
253	* Prevent the pre-parser from skipping past the TLB
254	* invalidate and loading a stale page for the batch
255	* buffer / request payload.
256	*/
257	*cs++ = preparser_disable(true);
258
259	cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
260
261	/* hsdes: 1809175790 */
262	cs = gen12_emit_aux_table_inv(GEN12_GFX_CCS_AUX_NV, cs);
263
264	*cs++ = preparser_disable(false);
265	intel_ring_advance(rq, cs);
266	}
267
268	return 0;
269	}
270
271	int gen12_emit_flush_xcs(struct i915_request *rq, u32 mode)
272	{
273	intel_engine_mask_t aux_inv = 0;
274	u32 cmd, *cs;
275
276	cmd = 4;
277	if (mode & EMIT_INVALIDATE)
278	cmd += 2;
279	if (mode & EMIT_INVALIDATE)
280	aux_inv = rq->engine->mask & ~BIT(BCS0);
281	if (aux_inv)
282	cmd += 2 * hweight8(aux_inv) + 2;
283
284	cs = intel_ring_begin(rq, cmd);
285	if (IS_ERR(cs))
286	return PTR_ERR(cs);
287
288	if (mode & EMIT_INVALIDATE)
289	*cs++ = preparser_disable(true);
290
291	cmd = MI_FLUSH_DW + 1;
292
293	/*
294	* We always require a command barrier so that subsequent
295	* commands, such as breadcrumb interrupts, are strictly ordered
296	* wrt the contents of the write cache being flushed to memory
297	* (and thus being coherent from the CPU).
298	*/
299	cmd \|= MI_FLUSH_DW_STORE_INDEX \| MI_FLUSH_DW_OP_STOREDW;
300
301	if (mode & EMIT_INVALIDATE) {
302	cmd \|= MI_INVALIDATE_TLB;
303	if (rq->engine->class == VIDEO_DECODE_CLASS)
304	cmd \|= MI_INVALIDATE_BSD;
305	}
306
307	*cs++ = cmd;
308	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
309	cs++ = 0; / upper addr */
310	cs++ = 0; / value */
311
312	if (aux_inv) { /* hsdes: 1809175790 */
313	struct intel_engine_cs *engine;
314	unsigned int tmp;
315
316	*cs++ = MI_LOAD_REGISTER_IMM(hweight8(aux_inv));
317	for_each_engine_masked(engine, rq->engine->gt,
318	aux_inv, tmp) {
319	*cs++ = i915_mmio_reg_offset(aux_inv_reg(engine));
320	*cs++ = AUX_INV;
321	}
322	*cs++ = MI_NOOP;
323	}
324
325	if (mode & EMIT_INVALIDATE)
326	*cs++ = preparser_disable(false);
327
328	intel_ring_advance(rq, cs);
329
330	return 0;
331	}
332
9834dfef	333	static u32 preempt_address(struct intel_engine_cs *engine)
d0d829e5 DCS	334	{
	335	return (i915_ggtt_offset(engine->status_page.vma) +
	336	I915_GEM_HWS_PREEMPT_ADDR);
	337	}
	338
	339	static u32 hwsp_offset(const struct i915_request *rq)
	340	{
	341	const struct intel_timeline_cacheline *cl;
	342
	343	/* Before the request is executed, the timeline/cachline is fixed */
	344
	345	cl = rcu_dereference_protected(rq->hwsp_cacheline, 1);
	346	if (cl)
	347	return cl->ggtt_offset;
	348
	349	return rcu_dereference_protected(rq->timeline, 1)->hwsp_offset;
	350	}
	351
	352	int gen8_emit_init_breadcrumb(struct i915_request *rq)
	353	{
	354	u32 *cs;
	355
	356	GEM_BUG_ON(i915_request_has_initial_breadcrumb(rq));
	357	if (!i915_request_timeline(rq)->has_initial_breadcrumb)
	358	return 0;
	359
	360	cs = intel_ring_begin(rq, 6);
	361	if (IS_ERR(cs))
	362	return PTR_ERR(cs);
	363
1a51b50c CW	364	*cs++ = MI_STORE_DWORD_IMM_GEN4 \| MI_USE_GGTT;
	365	*cs++ = hwsp_offset(rq);
	366	*cs++ = 0;
	367	*cs++ = rq->fence.seqno - 1;
	368
d0d829e5 DCS	369	/*
	370	* Check if we have been preempted before we even get started.
	371	*
	372	* After this point i915_request_started() reports true, even if
	373	* we get preempted and so are no longer running.
1a51b50c CW	374	*
	375	* i915_request_started() is used during preemption processing
	376	* to decide if the request is currently inside the user payload
	377	* or spinning on a kernel semaphore (or earlier). For no-preemption
	378	* requests, we do allow preemption on the semaphore before the user
	379	* payload, but do not allow preemption once the request is started.
	380	*
	381	* i915_request_started() is similarly used during GPU hangs to
	382	* determine if the user's payload was guilty, and if so, the
	383	* request is banned. Before the request is started, it is assumed
	384	* to be unharmed and an innocent victim of another's hang.
d0d829e5	385	*/
d0d829e5	386	*cs++ = MI_NOOP;
1a51b50c	387	*cs++ = MI_ARB_CHECK;
d0d829e5 DCS	388
	389	intel_ring_advance(rq, cs);
	390
	391	/* Record the updated position of the request's payload */
	392	rq->infix = intel_ring_offset(rq, cs);
	393
	394	__set_bit(I915_FENCE_FLAG_INITIAL_BREADCRUMB, &rq->fence.flags);
	395
	396	return 0;
	397	}
	398
	399	int gen8_emit_bb_start_noarb(struct i915_request *rq,
	400	u64 offset, u32 len,
	401	const unsigned int flags)
	402	{
	403	u32 *cs;
	404
	405	cs = intel_ring_begin(rq, 4);
	406	if (IS_ERR(cs))
	407	return PTR_ERR(cs);
	408
	409	/*
	410	* WaDisableCtxRestoreArbitration:bdw,chv
	411	*
	412	* We don't need to perform MI_ARB_ENABLE as often as we do (in
	413	* particular all the gen that do not need the w/a at all!), if we
	414	* took care to make sure that on every switch into this context
	415	* (both ordinary and for preemption) that arbitrartion was enabled
	416	* we would be fine. However, for gen8 there is another w/a that
	417	* requires us to not preempt inside GPGPU execution, so we keep
	418	* arbitration disabled for gen8 batches. Arbitration will be
	419	* re-enabled before we close the request
	420	* (engine->emit_fini_breadcrumb).
	421	*/
	422	*cs++ = MI_ARB_ON_OFF \| MI_ARB_DISABLE;
	423
	424	/* FIXME(BDW+): Address space and security selectors. */
	425	*cs++ = MI_BATCH_BUFFER_START_GEN8 \|
	426	(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
	427	*cs++ = lower_32_bits(offset);
	428	*cs++ = upper_32_bits(offset);
	429
	430	intel_ring_advance(rq, cs);
	431
	432	return 0;
	433	}
	434
	435	int gen8_emit_bb_start(struct i915_request *rq,
	436	u64 offset, u32 len,
	437	const unsigned int flags)
	438	{
	439	u32 *cs;
	440
9b3a8f55 CW	441	if (unlikely(i915_request_has_nopreempt(rq)))
	442	return gen8_emit_bb_start_noarb(rq, offset, len, flags);
	443
d0d829e5 DCS	444	cs = intel_ring_begin(rq, 6);
	445	if (IS_ERR(cs))
	446	return PTR_ERR(cs);
	447
	448	*cs++ = MI_ARB_ON_OFF \| MI_ARB_ENABLE;
	449
	450	*cs++ = MI_BATCH_BUFFER_START_GEN8 \|
	451	(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
	452	*cs++ = lower_32_bits(offset);
	453	*cs++ = upper_32_bits(offset);
	454
	455	*cs++ = MI_ARB_ON_OFF \| MI_ARB_DISABLE;
	456	*cs++ = MI_NOOP;
	457
	458	intel_ring_advance(rq, cs);
	459
	460	return 0;
	461	}
	462
	463	static void assert_request_valid(struct i915_request *rq)
	464	{
	465	struct intel_ring *ring __maybe_unused = rq->ring;
	466
	467	/* Can we unwind this request without appearing to go forwards? */
	468	GEM_BUG_ON(intel_ring_direction(ring, rq->wa_tail, rq->head) <= 0);
	469	}
	470
	471	/*
	472	* Reserve space for 2 NOOPs at the end of each request to be
	473	* used as a workaround for not being allowed to do lite
	474	* restore with HEAD==TAIL (WaIdleLiteRestore).
	475	*/
	476	static u32 gen8_emit_wa_tail(struct i915_request rq, u32 *cs)
	477	{
	478	/* Ensure there's always at least one preemption point per-request. */
	479	*cs++ = MI_ARB_CHECK;
	480	*cs++ = MI_NOOP;
	481	rq->wa_tail = intel_ring_offset(rq, cs);
	482
	483	/* Check that entire request is less than half the ring */
	484	assert_request_valid(rq);
	485
	486	return cs;
	487	}
	488
	489	static u32 emit_preempt_busywait(struct i915_request rq, u32 *cs)
	490	{
b06b8103	491	cs++ = MI_ARB_CHECK; / trigger IDLE->ACTIVE first */
d0d829e5 DCS	492	*cs++ = MI_SEMAPHORE_WAIT \|
	493	MI_SEMAPHORE_GLOBAL_GTT \|
	494	MI_SEMAPHORE_POLL \|
	495	MI_SEMAPHORE_SAD_EQ_SDD;
	496	*cs++ = 0;
	497	*cs++ = preempt_address(rq->engine);
	498	*cs++ = 0;
b06b8103	499	*cs++ = MI_NOOP;
d0d829e5 DCS	500
	501	return cs;
	502	}
	503
	504	static __always_inline u32*
	505	gen8_emit_fini_breadcrumb_tail(struct i915_request rq, u32 cs)
	506	{
	507	*cs++ = MI_USER_INTERRUPT;
	508
	509	*cs++ = MI_ARB_ON_OFF \| MI_ARB_ENABLE;
	510	if (intel_engine_has_semaphores(rq->engine))
	511	cs = emit_preempt_busywait(rq, cs);
	512
	513	rq->tail = intel_ring_offset(rq, cs);
	514	assert_ring_tail_valid(rq->ring, rq->tail);
	515
	516	return gen8_emit_wa_tail(rq, cs);
	517	}
	518
	519	static u32 emit_xcs_breadcrumb(struct i915_request rq, u32 *cs)
	520	{
	521	return gen8_emit_ggtt_write(cs, rq->fence.seqno, hwsp_offset(rq), 0);
	522	}
	523
	524	u32 gen8_emit_fini_breadcrumb_xcs(struct i915_request rq, u32 *cs)
	525	{
	526	return gen8_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs));
	527	}
	528
	529	u32 gen8_emit_fini_breadcrumb_rcs(struct i915_request rq, u32 *cs)
	530	{
	531	cs = gen8_emit_pipe_control(cs,
	532	PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH \|
	533	PIPE_CONTROL_DEPTH_CACHE_FLUSH \|
	534	PIPE_CONTROL_DC_FLUSH_ENABLE,
	535	0);
	536
	537	/* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
	538	cs = gen8_emit_ggtt_write_rcs(cs,
	539	rq->fence.seqno,
	540	hwsp_offset(rq),
	541	PIPE_CONTROL_FLUSH_ENABLE \|
	542	PIPE_CONTROL_CS_STALL);
	543
	544	return gen8_emit_fini_breadcrumb_tail(rq, cs);
	545	}
	546
	547	u32 gen11_emit_fini_breadcrumb_rcs(struct i915_request rq, u32 *cs)
	548	{
	549	cs = gen8_emit_ggtt_write_rcs(cs,
	550	rq->fence.seqno,
	551	hwsp_offset(rq),
	552	PIPE_CONTROL_CS_STALL \|
	553	PIPE_CONTROL_TILE_CACHE_FLUSH \|
	554	PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH \|
	555	PIPE_CONTROL_DEPTH_CACHE_FLUSH \|
	556	PIPE_CONTROL_DC_FLUSH_ENABLE \|
	557	PIPE_CONTROL_FLUSH_ENABLE);
	558
	559	return gen8_emit_fini_breadcrumb_tail(rq, cs);
	560	}
	561
	562	/*
	563	* Note that the CS instruction pre-parser will not stall on the breadcrumb
564	* flush and will continue pre-fetching the instructions after it before the
565	* memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
566	* BB_START/END instructions, so, even though we might pre-fetch the pre-amble
567	* of the next request before the memory has been flushed, we're guaranteed that
568	* we won't access the batch itself too early.
569	* However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
570	* so, if the current request is modifying an instruction in the next request on
571	* the same intel_context, we might pre-fetch and then execute the pre-update
572	* instruction. To avoid this, the users of self-modifying code should either
573	* disable the parser around the code emitting the memory writes, via a new flag
574	* added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
575	* the in-kernel use-cases we've opted to use a separate context, see
576	* reloc_gpu() as an example.
577	* All the above applies only to the instructions themselves. Non-inline data
578	* used by the instructions is not pre-fetched.
579	*/
580
581	static u32 gen12_emit_preempt_busywait(struct i915_request rq, u32 *cs)
582	{
49b20dbf	583	cs++ = MI_ARB_CHECK; / trigger IDLE->ACTIVE first */
d0d829e5 DCS	584	*cs++ = MI_SEMAPHORE_WAIT_TOKEN \|
	585	MI_SEMAPHORE_GLOBAL_GTT \|
	586	MI_SEMAPHORE_POLL \|
	587	MI_SEMAPHORE_SAD_EQ_SDD;
	588	*cs++ = 0;
	589	*cs++ = preempt_address(rq->engine);
	590	*cs++ = 0;
	591	*cs++ = 0;
d0d829e5 DCS	592
	593	return cs;
	594	}
	595
	596	static __always_inline u32*
	597	gen12_emit_fini_breadcrumb_tail(struct i915_request rq, u32 cs)
	598	{
	599	*cs++ = MI_USER_INTERRUPT;
	600
	601	*cs++ = MI_ARB_ON_OFF \| MI_ARB_ENABLE;
	602	if (intel_engine_has_semaphores(rq->engine))
	603	cs = gen12_emit_preempt_busywait(rq, cs);
	604
	605	rq->tail = intel_ring_offset(rq, cs);
	606	assert_ring_tail_valid(rq->ring, rq->tail);
	607
	608	return gen8_emit_wa_tail(rq, cs);
	609	}
	610
	611	u32 gen12_emit_fini_breadcrumb_xcs(struct i915_request rq, u32 *cs)
	612	{
	613	/* XXX Stalling flush before seqno write; post-sync not */
	614	cs = emit_xcs_breadcrumb(rq, __gen8_emit_flush_dw(cs, 0, 0, 0));
	615	return gen12_emit_fini_breadcrumb_tail(rq, cs);
	616	}
	617
	618	u32 gen12_emit_fini_breadcrumb_rcs(struct i915_request rq, u32 *cs)
	619	{
	620	cs = gen12_emit_ggtt_write_rcs(cs,
	621	rq->fence.seqno,
	622	hwsp_offset(rq),
	623	PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
	624	PIPE_CONTROL_CS_STALL \|
	625	PIPE_CONTROL_TILE_CACHE_FLUSH \|
	626	PIPE_CONTROL_FLUSH_L3 \|
	627	PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH \|
	628	PIPE_CONTROL_DEPTH_CACHE_FLUSH \|
	629	/* Wa_1409600907:tgl */
	630	PIPE_CONTROL_DEPTH_STALL \|
	631	PIPE_CONTROL_DC_FLUSH_ENABLE \|
	632	PIPE_CONTROL_FLUSH_ENABLE);
	633
	634	return gen12_emit_fini_breadcrumb_tail(rq, cs);
	635	}