cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

gen6_engine_cs.c (12125B)


// SPDX-License-Identifier: MIT
/*
 * Copyright © 2020 Intel Corporation
 */

#include "gen6_engine_cs.h"
#include "intel_engine.h"
#include "intel_engine_regs.h"
#include "intel_gpu_commands.h"
#include "intel_gt.h"
#include "intel_gt_irq.h"
#include "intel_gt_pm_irq.h"
#include "intel_ring.h"

#define HWS_SCRATCH_ADDR	(I915_GEM_HWS_SCRATCH * sizeof(u32))

/*
 * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
 * implementing two workarounds on gen6.  From section 1.4.7.1
 * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:
 *
 * [DevSNB-C+{W/A}] Before any depth stall flush (including those
 * produced by non-pipelined state commands), software needs to first
 * send a PIPE_CONTROL with no bits set except Post-Sync Operation !=
 * 0.
 *
 * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
 * =1, a PIPE_CONTROL with any non-zero post-sync-op is required.
 *
 * And the workaround for these two requires this workaround first:
 *
 * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
 * BEFORE the pipe-control with a post-sync op and no write-cache
 * flushes.
 *
 * And this last workaround is tricky because of the requirements on
 * that bit.  From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
 * volume 2 part 1:
 *
 *     "1 of the following must also be set:
 *      - Render Target Cache Flush Enable ([12] of DW1)
 *      - Depth Cache Flush Enable ([0] of DW1)
 *      - Stall at Pixel Scoreboard ([1] of DW1)
 *      - Depth Stall ([13] of DW1)
 *      - Post-Sync Operation ([13] of DW1)
 *      - Notify Enable ([8] of DW1)"
 *
 * The cache flushes require the workaround flush that triggered this
 * one, so we can't use it.  Depth stall would trigger the same.
 * Post-sync nonzero is what triggered this second workaround, so we
 * can't use that one either.  Notify enable is IRQs, which aren't
 * really our business.  That leaves only stall at scoreboard.
 */
static int
gen6_emit_post_sync_nonzero_flush(struct i915_request *rq)
{
	u32 scratch_addr =
		intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
	u32 *cs;

	cs = intel_ring_begin(rq, 6);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = GFX_OP_PIPE_CONTROL(5);
	*cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
	*cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = 0; /* low dword */
	*cs++ = 0; /* high dword */
	*cs++ = MI_NOOP;
	intel_ring_advance(rq, cs);

	cs = intel_ring_begin(rq, 6);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = GFX_OP_PIPE_CONTROL(5);
	*cs++ = PIPE_CONTROL_QW_WRITE;
	*cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = 0;
	*cs++ = 0;
	*cs++ = MI_NOOP;
	intel_ring_advance(rq, cs);

	return 0;
}

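/*
 * Gen6 render-ring flush: apply the post-sync-nonzero workaround above,
 * then emit a single PIPE_CONTROL whose flush/invalidate bits are
 * selected from @mode, with a post-sync write to scratch.
 */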
int gen6_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
	u32 scratch_addr =
		intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
	u32 *cs, flags = 0;
	int ret;

	/* Force SNB workarounds for PIPE_CONTROL flushes */
	ret = gen6_emit_post_sync_nonzero_flush(rq);
	if (ret)
		return ret;

	/*
	 * Just flush everything.  Experiments have shown that reducing the
	 * number of bits based on the write domains has little performance
	 * impact. And when rearranging requests, the order of flushes is
	 * unknown.
	 */
	if (mode & EMIT_FLUSH) {
		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
		/*
		 * Ensure that any following seqno writes only happen
		 * when the render cache is indeed flushed.
		 */
		flags |= PIPE_CONTROL_CS_STALL;
	}
	if (mode & EMIT_INVALIDATE) {
		flags |= PIPE_CONTROL_TLB_INVALIDATE;
		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
		/*
		 * TLB invalidate requires a post-sync write.
		 */
		flags |= PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL;
	}

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = flags;
	*cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = 0;
	intel_ring_advance(rq, cs);

	return 0;
}

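/*
 * Gen6 render breadcrumb: replay the post-sync-nonzero workaround
 * inline, then emit a flushing PIPE_CONTROL that writes the request's
 * seqno, followed by a user interrupt.
 */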
u32 *gen6_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
	/* First we do the gen6_emit_post_sync_nonzero_flush w/a */
	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
	*cs++ = 0;
	*cs++ = 0;

	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = PIPE_CONTROL_QW_WRITE;
	*cs++ = intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_DEFAULT) |
		PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = 0;

	/* Finally we can flush and with it emit the breadcrumb */
	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
		 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
		 PIPE_CONTROL_DC_FLUSH_ENABLE |
		 PIPE_CONTROL_QW_WRITE |
		 PIPE_CONTROL_CS_STALL);
	*cs++ = i915_request_active_seqno(rq) |
		PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = rq->fence.seqno;

	*cs++ = MI_USER_INTERRUPT;
	*cs++ = MI_NOOP;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}

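/*
 * Emit MI_FLUSH_DW with a post-sync dword write to the HWS scratch
 * slot; @flags supplies any additional invalidation bits.
 */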
static int mi_flush_dw(struct i915_request *rq, u32 flags)
{
	u32 cmd, *cs;

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	cmd = MI_FLUSH_DW;

	/*
	 * We always require a command barrier so that subsequent
	 * commands, such as breadcrumb interrupts, are strictly ordered
	 * wrt the contents of the write cache being flushed to memory
	 * (and thus being coherent from the CPU).
	 */
	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;

	/*
	 * Bspec vol 1c.3 - blitter engine command streamer:
	 * "If ENABLED, all TLBs will be invalidated once the flush
	 * operation is complete. This bit is only valid when the
	 * Post-Sync Operation field is a value of 1h or 3h."
	 */
	cmd |= flags;

	*cs++ = cmd;
	*cs++ = HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
	*cs++ = 0;
	*cs++ = MI_NOOP;

	intel_ring_advance(rq, cs);

	return 0;
}

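/*
 * MI_FLUSH_DW helpers for the non-render rings: the invalidate flags
 * (MI_INVALIDATE_TLB, plus MI_INVALIDATE_BSD on the video ring) are
 * applied only when EMIT_INVALIDATE is requested.
 */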
static int gen6_flush_dw(struct i915_request *rq, u32 mode, u32 invflags)
{
	return mi_flush_dw(rq, mode & EMIT_INVALIDATE ? invflags : 0);
}

int gen6_emit_flush_xcs(struct i915_request *rq, u32 mode)
{
	return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB);
}

int gen6_emit_flush_vcs(struct i915_request *rq, u32 mode)
{
	return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB | MI_INVALIDATE_BSD);
}

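/*
 * Start execution of a batch buffer; the batch runs non-secure unless
 * I915_DISPATCH_SECURE was requested.
 */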
int gen6_emit_bb_start(struct i915_request *rq,
		       u64 offset, u32 len,
		       unsigned int dispatch_flags)
{
	u32 security;
	u32 *cs;

	security = MI_BATCH_NON_SECURE_I965;
	if (dispatch_flags & I915_DISPATCH_SECURE)
		security = 0;

	cs = intel_ring_begin(rq, 2);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	cs = __gen6_emit_bb_start(cs, offset, security);
	intel_ring_advance(rq, cs);

	return 0;
}

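/*
 * Haswell variant: non-secure batches carry MI_BATCH_PPGTT_HSW in
 * addition to the non-secure bit.
 */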
int
hsw_emit_bb_start(struct i915_request *rq,
		  u64 offset, u32 len,
		  unsigned int dispatch_flags)
{
	u32 security;
	u32 *cs;

	security = MI_BATCH_PPGTT_HSW | MI_BATCH_NON_SECURE_HSW;
	if (dispatch_flags & I915_DISPATCH_SECURE)
		security = 0;

	cs = intel_ring_begin(rq, 2);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	cs = __gen6_emit_bb_start(cs, offset, security);
	intel_ring_advance(rq, cs);

	return 0;
}

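/* Emit a CS-stalling PIPE_CONTROL (stall at pixel scoreboard, no flushes). */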
static int gen7_stall_cs(struct i915_request *rq)
{
	u32 *cs;

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
	*cs++ = 0;
	*cs++ = 0;
	intel_ring_advance(rq, cs);

	return 0;
}

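/*
 * Gen7 render-ring flush: one PIPE_CONTROL with an unconditional CS
 * stall and post-sync write, plus the flush/invalidate bits selected
 * from @mode.
 */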
int gen7_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
	u32 scratch_addr =
		intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
	u32 *cs, flags = 0;

	/*
	 * Ensure that any following seqno writes only happen when the render
	 * cache is indeed flushed.
	 *
	 * Workaround: 4th PIPE_CONTROL command (except the ones with only
	 * read-cache invalidate bits set) must have the CS_STALL bit set. We
	 * don't try to be clever and just set it unconditionally.
	 */
	flags |= PIPE_CONTROL_CS_STALL;

	/*
	 * CS_STALL suggests at least a post-sync write.
	 */
	flags |= PIPE_CONTROL_QW_WRITE;
	flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;

	/*
	 * Just flush everything.  Experiments have shown that reducing the
	 * number of bits based on the write domains has little performance
	 * impact.
	 */
	if (mode & EMIT_FLUSH) {
		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
		flags |= PIPE_CONTROL_FLUSH_ENABLE;
	}
	if (mode & EMIT_INVALIDATE) {
		flags |= PIPE_CONTROL_TLB_INVALIDATE;
		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_MEDIA_STATE_CLEAR;

		/*
		 * Workaround: we must issue a pipe_control with CS-stall bit
		 * set before a pipe_control command that has the state cache
		 * invalidate bit set.
		 */
		gen7_stall_cs(rq);
	}

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = flags;
	*cs++ = scratch_addr;
	*cs++ = 0;
	intel_ring_advance(rq, cs);

	return 0;
}

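/*
 * Gen7 render breadcrumb: a flushing PIPE_CONTROL writes the request's
 * seqno, followed by a user interrupt.
 */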
u32 *gen7_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
		 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
		 PIPE_CONTROL_DC_FLUSH_ENABLE |
		 PIPE_CONTROL_FLUSH_ENABLE |
		 PIPE_CONTROL_QW_WRITE |
		 PIPE_CONTROL_GLOBAL_GTT_IVB |
		 PIPE_CONTROL_CS_STALL);
	*cs++ = i915_request_active_seqno(rq);
	*cs++ = rq->fence.seqno;

	*cs++ = MI_USER_INTERRUPT;
	*cs++ = MI_NOOP;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}

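/*
 * Non-render breadcrumb: MI_FLUSH_DW stores the seqno into the status
 * page, followed by a user interrupt.
 */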
u32 *gen6_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
{
	GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
	GEM_BUG_ON(offset_in_page(rq->hwsp_seqno) != I915_GEM_HWS_SEQNO_ADDR);

	*cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
	*cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
	*cs++ = rq->fence.seqno;

	*cs++ = MI_USER_INTERRUPT;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}

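/*
 * Gen7 variant of the above: the seqno store is repeated GEN7_XCS_WA
 * (32) times and chased with an extra MI_FLUSH_DW before the user
 * interrupt is raised.
 */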
#define GEN7_XCS_WA 32
u32 *gen7_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
{
	int i;

	GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
	GEM_BUG_ON(offset_in_page(rq->hwsp_seqno) != I915_GEM_HWS_SEQNO_ADDR);

	*cs++ = MI_FLUSH_DW | MI_INVALIDATE_TLB |
		MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
	*cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
	*cs++ = rq->fence.seqno;

	for (i = 0; i < GEN7_XCS_WA; i++) {
		*cs++ = MI_STORE_DWORD_INDEX;
		*cs++ = I915_GEM_HWS_SEQNO_ADDR;
		*cs++ = rq->fence.seqno;
	}

	*cs++ = MI_FLUSH_DW;
	*cs++ = 0;
	*cs++ = 0;

	*cs++ = MI_USER_INTERRUPT;
	*cs++ = MI_NOOP;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}
#undef GEN7_XCS_WA

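/*
 * Unmask this engine's interrupts in RING_IMR and then in the GT IMR;
 * gen6_irq_disable reverses both steps.
 */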
void gen6_irq_enable(struct intel_engine_cs *engine)
{
	ENGINE_WRITE(engine, RING_IMR,
		     ~(engine->irq_enable_mask | engine->irq_keep_mask));

	/* Flush/delay to ensure the RING_IMR is active before the GT IMR */
	ENGINE_POSTING_READ(engine, RING_IMR);

	gen5_gt_enable_irq(engine->gt, engine->irq_enable_mask);
}

void gen6_irq_disable(struct intel_engine_cs *engine)
{
	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
	gen5_gt_disable_irq(engine->gt, engine->irq_enable_mask);
}

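/*
 * The Haswell VECS ring is unmasked/masked via the GT PM interrupt
 * registers rather than the GT IMR.
 */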
void hsw_irq_enable_vecs(struct intel_engine_cs *engine)
{
	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_enable_mask);

	/* Flush/delay to ensure the RING_IMR is active before the GT IMR */
	ENGINE_POSTING_READ(engine, RING_IMR);

	gen6_gt_pm_unmask_irq(engine->gt, engine->irq_enable_mask);
}

void hsw_irq_disable_vecs(struct intel_engine_cs *engine)
{
	ENGINE_WRITE(engine, RING_IMR, ~0);
	gen6_gt_pm_mask_irq(engine->gt, engine->irq_enable_mask);
}