gen7_renderclear.c - cachepc-linux - Fork of AMDESE/linux with modifications for CachePC side-channel attack

	cachepc-linux Fork of AMDESE/linux with modifications for CachePC side-channel attack
	git clone https://git.sinitax.com/sinitax/cachepc-linux
	Log \| Files \| Refs \| README \| LICENSE \| sfeed.txt
gen7_renderclear.c (10703B)
      1// SPDX-License-Identifier: MIT
      2/*
      3 * Copyright © 2019 Intel Corporation
      4 */
      5
      6#include "gen7_renderclear.h"
      7#include "i915_drv.h"
      8#include "intel_gpu_commands.h"
      9#include "intel_gt_regs.h"
     10
     11#define GT3_INLINE_DATA_DELAYS 0x1E00
     12#define batch_advance(Y, CS) GEM_BUG_ON((Y)->end != (CS))
     13
     14struct cb_kernel {
     15	const void *data;
     16	u32 size;
     17};
     18
     19#define CB_KERNEL(name) { .data = (name), .size = sizeof(name) }
     20
     21#include "ivb_clear_kernel.c"
     22static const struct cb_kernel cb_kernel_ivb = CB_KERNEL(ivb_clear_kernel);
     23
     24#include "hsw_clear_kernel.c"
     25static const struct cb_kernel cb_kernel_hsw = CB_KERNEL(hsw_clear_kernel);
     26
     27struct batch_chunk {
     28	struct i915_vma *vma;
     29	u32 offset;
     30	u32 *start;
     31	u32 *end;
     32	u32 max_items;
     33};
     34
     35struct batch_vals {
     36	u32 max_threads;
     37	u32 state_start;
     38	u32 surface_start;
     39	u32 surface_height;
     40	u32 surface_width;
     41	u32 size;
     42};
     43
     44static int num_primitives(const struct batch_vals *bv)
     45{
     46	/*
     47	 * We need to saturate the GPU with work in order to dispatch
     48	 * a shader on every HW thread, and clear the thread-local registers.
     49	 * In short, we have to dispatch work faster than the shaders can
     50	 * run in order to fill the EU and occupy each HW thread.
     51	 */
     52	return bv->max_threads;
     53}
     54
     55static void
     56batch_get_defaults(struct drm_i915_private *i915, struct batch_vals *bv)
     57{
     58	if (IS_HASWELL(i915)) {
     59		switch (INTEL_INFO(i915)->gt) {
     60		default:
     61		case 1:
     62			bv->max_threads = 70;
     63			break;
     64		case 2:
     65			bv->max_threads = 140;
     66			break;
     67		case 3:
     68			bv->max_threads = 280;
     69			break;
     70		}
     71		bv->surface_height = 16 * 16;
     72		bv->surface_width = 32 * 2 * 16;
     73	} else {
     74		switch (INTEL_INFO(i915)->gt) {
     75		default:
     76		case 1: /* including vlv */
     77			bv->max_threads = 36;
     78			break;
     79		case 2:
     80			bv->max_threads = 128;
     81			break;
     82		}
     83		bv->surface_height = 16 * 8;
     84		bv->surface_width = 32 * 16;
     85	}
     86	bv->state_start = round_up(SZ_1K + num_primitives(bv) * 64, SZ_4K);
     87	bv->surface_start = bv->state_start + SZ_4K;
     88	bv->size = bv->surface_start + bv->surface_height * bv->surface_width;
     89}
     90
     91static void batch_init(struct batch_chunk *bc,
     92		       struct i915_vma *vma,
     93		       u32 *start, u32 offset, u32 max_bytes)
     94{
     95	bc->vma = vma;
     96	bc->offset = offset;
     97	bc->start = start + bc->offset / sizeof(*bc->start);
     98	bc->end = bc->start;
     99	bc->max_items = max_bytes / sizeof(*bc->start);
    100}
    101
    102static u32 batch_offset(const struct batch_chunk *bc, u32 *cs)
    103{
    104	return (cs - bc->start) * sizeof(*bc->start) + bc->offset;
    105}
    106
    107static u32 batch_addr(const struct batch_chunk *bc)
    108{
    109	return bc->vma->node.start;
    110}
    111
    112static void batch_add(struct batch_chunk *bc, const u32 d)
    113{
    114	GEM_BUG_ON((bc->end - bc->start) >= bc->max_items);
    115	*bc->end++ = d;
    116}
    117
    118static u32 *batch_alloc_items(struct batch_chunk *bc, u32 align, u32 items)
    119{
    120	u32 *map;
    121
    122	if (align) {
    123		u32 *end = PTR_ALIGN(bc->end, align);
    124
    125		memset32(bc->end, 0, end - bc->end);
    126		bc->end = end;
    127	}
    128
    129	map = bc->end;
    130	bc->end += items;
    131
    132	return map;
    133}
    134
    135static u32 *batch_alloc_bytes(struct batch_chunk *bc, u32 align, u32 bytes)
    136{
    137	GEM_BUG_ON(!IS_ALIGNED(bytes, sizeof(*bc->start)));
    138	return batch_alloc_items(bc, align, bytes / sizeof(*bc->start));
    139}
    140
    141static u32
    142gen7_fill_surface_state(struct batch_chunk *state,
    143			const u32 dst_offset,
    144			const struct batch_vals *bv)
    145{
    146	u32 surface_h = bv->surface_height;
    147	u32 surface_w = bv->surface_width;
    148	u32 *cs = batch_alloc_items(state, 32, 8);
    149	u32 offset = batch_offset(state, cs);
    150
    151#define SURFACE_2D 1
    152#define SURFACEFORMAT_B8G8R8A8_UNORM 0x0C0
    153#define RENDER_CACHE_READ_WRITE 1
    154
    155	*cs++ = SURFACE_2D << 29 |
    156		(SURFACEFORMAT_B8G8R8A8_UNORM << 18) |
    157		(RENDER_CACHE_READ_WRITE << 8);
    158
    159	*cs++ = batch_addr(state) + dst_offset;
    160
    161	*cs++ = ((surface_h / 4 - 1) << 16) | (surface_w / 4 - 1);
    162	*cs++ = surface_w;
    163	*cs++ = 0;
    164	*cs++ = 0;
    165	*cs++ = 0;
    166#define SHADER_CHANNELS(r, g, b, a) \
    167	(((r) << 25) | ((g) << 22) | ((b) << 19) | ((a) << 16))
    168	*cs++ = SHADER_CHANNELS(4, 5, 6, 7);
    169	batch_advance(state, cs);
    170
    171	return offset;
    172}
    173
    174static u32
    175gen7_fill_binding_table(struct batch_chunk *state,
    176			const struct batch_vals *bv)
    177{
    178	u32 surface_start =
    179		gen7_fill_surface_state(state, bv->surface_start, bv);
    180	u32 *cs = batch_alloc_items(state, 32, 8);
    181	u32 offset = batch_offset(state, cs);
    182
    183	*cs++ = surface_start - state->offset;
    184	*cs++ = 0;
    185	*cs++ = 0;
    186	*cs++ = 0;
    187	*cs++ = 0;
    188	*cs++ = 0;
    189	*cs++ = 0;
    190	*cs++ = 0;
    191	batch_advance(state, cs);
    192
    193	return offset;
    194}
    195
    196static u32
    197gen7_fill_kernel_data(struct batch_chunk *state,
    198		      const u32 *data,
    199		      const u32 size)
    200{
    201	return batch_offset(state,
    202			    memcpy(batch_alloc_bytes(state, 64, size),
    203				   data, size));
    204}
    205
    206static u32
    207gen7_fill_interface_descriptor(struct batch_chunk *state,
    208			       const struct batch_vals *bv,
    209			       const struct cb_kernel *kernel,
    210			       unsigned int count)
    211{
    212	u32 kernel_offset =
    213		gen7_fill_kernel_data(state, kernel->data, kernel->size);
    214	u32 binding_table = gen7_fill_binding_table(state, bv);
    215	u32 *cs = batch_alloc_items(state, 32, 8 * count);
    216	u32 offset = batch_offset(state, cs);
    217
    218	*cs++ = kernel_offset;
    219	*cs++ = (1 << 7) | (1 << 13);
    220	*cs++ = 0;
    221	*cs++ = (binding_table - state->offset) | 1;
    222	*cs++ = 0;
    223	*cs++ = 0;
    224	*cs++ = 0;
    225	*cs++ = 0;
    226
    227	/* 1 - 63dummy idds */
    228	memset32(cs, 0x00, (count - 1) * 8);
    229	batch_advance(state, cs + (count - 1) * 8);
    230
    231	return offset;
    232}
    233
    234static void
    235gen7_emit_state_base_address(struct batch_chunk *batch,
    236			     u32 surface_state_base)
    237{
    238	u32 *cs = batch_alloc_items(batch, 0, 10);
    239
    240	*cs++ = STATE_BASE_ADDRESS | (10 - 2);
    241	/* general */
    242	*cs++ = batch_addr(batch) | BASE_ADDRESS_MODIFY;
    243	/* surface */
    244	*cs++ = (batch_addr(batch) + surface_state_base) | BASE_ADDRESS_MODIFY;
    245	/* dynamic */
    246	*cs++ = batch_addr(batch) | BASE_ADDRESS_MODIFY;
    247	/* indirect */
    248	*cs++ = batch_addr(batch) | BASE_ADDRESS_MODIFY;
    249	/* instruction */
    250	*cs++ = batch_addr(batch) | BASE_ADDRESS_MODIFY;
    251
    252	/* general/dynamic/indirect/instruction access Bound */
    253	*cs++ = 0;
    254	*cs++ = BASE_ADDRESS_MODIFY;
    255	*cs++ = 0;
    256	*cs++ = BASE_ADDRESS_MODIFY;
    257	batch_advance(batch, cs);
    258}
    259
    260static void
    261gen7_emit_vfe_state(struct batch_chunk *batch,
    262		    const struct batch_vals *bv,
    263		    u32 urb_size, u32 curbe_size,
    264		    u32 mode)
    265{
    266	u32 threads = bv->max_threads - 1;
    267	u32 *cs = batch_alloc_items(batch, 32, 8);
    268
    269	*cs++ = MEDIA_VFE_STATE | (8 - 2);
    270
    271	/* scratch buffer */
    272	*cs++ = 0;
    273
    274	/* number of threads & urb entries for GPGPU vs Media Mode */
    275	*cs++ = threads << 16 | 1 << 8 | mode << 2;
    276
    277	*cs++ = 0;
    278
    279	/* urb entry size & curbe size in 256 bits unit */
    280	*cs++ = urb_size << 16 | curbe_size;
    281
    282	/* scoreboard */
    283	*cs++ = 0;
    284	*cs++ = 0;
    285	*cs++ = 0;
    286	batch_advance(batch, cs);
    287}
    288
    289static void
    290gen7_emit_interface_descriptor_load(struct batch_chunk *batch,
    291				    const u32 interface_descriptor,
    292				    unsigned int count)
    293{
    294	u32 *cs = batch_alloc_items(batch, 8, 4);
    295
    296	*cs++ = MEDIA_INTERFACE_DESCRIPTOR_LOAD | (4 - 2);
    297	*cs++ = 0;
    298	*cs++ = count * 8 * sizeof(*cs);
    299
    300	/*
    301	 * interface descriptor address - it is relative to the dynamics base
    302	 * address
    303	 */
    304	*cs++ = interface_descriptor;
    305	batch_advance(batch, cs);
    306}
    307
    308static void
    309gen7_emit_media_object(struct batch_chunk *batch,
    310		       unsigned int media_object_index)
    311{
    312	unsigned int x_offset = (media_object_index % 16) * 64;
    313	unsigned int y_offset = (media_object_index / 16) * 16;
    314	unsigned int pkt = 6 + 3;
    315	u32 *cs;
    316
    317	cs = batch_alloc_items(batch, 8, pkt);
    318
    319	*cs++ = MEDIA_OBJECT | (pkt - 2);
    320
    321	/* interface descriptor offset */
    322	*cs++ = 0;
    323
    324	/* without indirect data */
    325	*cs++ = 0;
    326	*cs++ = 0;
    327
    328	/* scoreboard */
    329	*cs++ = 0;
    330	*cs++ = 0;
    331
    332	/* inline */
    333	*cs++ = y_offset << 16 | x_offset;
    334	*cs++ = 0;
    335	*cs++ = GT3_INLINE_DATA_DELAYS;
    336
    337	batch_advance(batch, cs);
    338}
    339
    340static void gen7_emit_pipeline_flush(struct batch_chunk *batch)
    341{
    342	u32 *cs = batch_alloc_items(batch, 0, 4);
    343
    344	*cs++ = GFX_OP_PIPE_CONTROL(4);
    345	*cs++ = PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
    346		PIPE_CONTROL_DEPTH_CACHE_FLUSH |
    347		PIPE_CONTROL_DC_FLUSH_ENABLE |
    348		PIPE_CONTROL_CS_STALL;
    349	*cs++ = 0;
    350	*cs++ = 0;
    351
    352	batch_advance(batch, cs);
    353}
    354
    355static void gen7_emit_pipeline_invalidate(struct batch_chunk *batch)
    356{
    357	u32 *cs = batch_alloc_items(batch, 0, 10);
    358
    359	/* ivb: Stall before STATE_CACHE_INVALIDATE */
    360	*cs++ = GFX_OP_PIPE_CONTROL(5);
    361	*cs++ = PIPE_CONTROL_STALL_AT_SCOREBOARD |
    362		PIPE_CONTROL_CS_STALL;
    363	*cs++ = 0;
    364	*cs++ = 0;
    365	*cs++ = 0;
    366
    367	*cs++ = GFX_OP_PIPE_CONTROL(5);
    368	*cs++ = PIPE_CONTROL_STATE_CACHE_INVALIDATE;
    369	*cs++ = 0;
    370	*cs++ = 0;
    371	*cs++ = 0;
    372
    373	batch_advance(batch, cs);
    374}
    375
    376static void emit_batch(struct i915_vma * const vma,
    377		       u32 *start,
    378		       const struct batch_vals *bv)
    379{
    380	struct drm_i915_private *i915 = vma->vm->i915;
    381	const unsigned int desc_count = 1;
    382	const unsigned int urb_size = 1;
    383	struct batch_chunk cmds, state;
    384	u32 descriptors;
    385	unsigned int i;
    386
    387	batch_init(&cmds, vma, start, 0, bv->state_start);
    388	batch_init(&state, vma, start, bv->state_start, SZ_4K);
    389
    390	descriptors = gen7_fill_interface_descriptor(&state, bv,
    391						     IS_HASWELL(i915) ?
    392						     &cb_kernel_hsw :
    393						     &cb_kernel_ivb,
    394						     desc_count);
    395
    396	/* Reset inherited context registers */
    397	gen7_emit_pipeline_flush(&cmds);
    398	gen7_emit_pipeline_invalidate(&cmds);
    399	batch_add(&cmds, MI_LOAD_REGISTER_IMM(2));
    400	batch_add(&cmds, i915_mmio_reg_offset(CACHE_MODE_0_GEN7));
    401	batch_add(&cmds, 0xffff0000 |
    402			((IS_IVB_GT1(i915) || IS_VALLEYVIEW(i915)) ?
    403			 HIZ_RAW_STALL_OPT_DISABLE :
    404			 0));
    405	batch_add(&cmds, i915_mmio_reg_offset(CACHE_MODE_1));
    406	batch_add(&cmds, 0xffff0000 | PIXEL_SUBSPAN_COLLECT_OPT_DISABLE);
    407	gen7_emit_pipeline_invalidate(&cmds);
    408	gen7_emit_pipeline_flush(&cmds);
    409
    410	/* Switch to the media pipeline and our base address */
    411	gen7_emit_pipeline_invalidate(&cmds);
    412	batch_add(&cmds, PIPELINE_SELECT | PIPELINE_SELECT_MEDIA);
    413	batch_add(&cmds, MI_NOOP);
    414	gen7_emit_pipeline_invalidate(&cmds);
    415
    416	gen7_emit_pipeline_flush(&cmds);
    417	gen7_emit_state_base_address(&cmds, descriptors);
    418	gen7_emit_pipeline_invalidate(&cmds);
    419
    420	/* Set the clear-residual kernel state */
    421	gen7_emit_vfe_state(&cmds, bv, urb_size - 1, 0, 0);
    422	gen7_emit_interface_descriptor_load(&cmds, descriptors, desc_count);
    423
    424	/* Execute the kernel on all HW threads */
    425	for (i = 0; i < num_primitives(bv); i++)
    426		gen7_emit_media_object(&cmds, i);
    427
    428	batch_add(&cmds, MI_BATCH_BUFFER_END);
    429}
    430
    431int gen7_setup_clear_gpr_bb(struct intel_engine_cs * const engine,
    432			    struct i915_vma * const vma)
    433{
    434	struct batch_vals bv;
    435	u32 *batch;
    436
    437	batch_get_defaults(engine->i915, &bv);
    438	if (!vma)
    439		return bv.size;
    440
    441	GEM_BUG_ON(vma->obj->base.size < bv.size);
    442
    443	batch = i915_gem_object_pin_map(vma->obj, I915_MAP_WC);
    444	if (IS_ERR(batch))
    445		return PTR_ERR(batch);
    446
    447	emit_batch(vma, memset(batch, 0, bv.size), &bv);
    448
    449	i915_gem_object_flush_map(vma->obj);
    450	__i915_gem_object_release_map(vma->obj);
    451
    452	return 0;
    453}