cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

intel_lrc.c (41310B)


      1// SPDX-License-Identifier: MIT
      2/*
      3 * Copyright © 2014 Intel Corporation
      4 */
      5
      6#include "gem/i915_gem_lmem.h"
      7
      8#include "gen8_engine_cs.h"
      9#include "i915_drv.h"
     10#include "i915_perf.h"
     11#include "i915_reg.h"
     12#include "intel_context.h"
     13#include "intel_engine.h"
     14#include "intel_engine_regs.h"
     15#include "intel_gpu_commands.h"
     16#include "intel_gt.h"
     17#include "intel_gt_regs.h"
     18#include "intel_lrc.h"
     19#include "intel_lrc_reg.h"
     20#include "intel_ring.h"
     21#include "shmem_utils.h"
     22
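        /*
         * set_offsets() expands the compact per-gen byte streams below into
         * the (reg, value) slots of a context image:
         *
         *  - NOP(x): BIT(7) | x, skips x dwords in the image;
         *  - LRI(count, flags): flags in bits 7:6 and count in bits 5:0,
         *    emitted as MI_LOAD_REGISTER_IMM(count) (force-posted when the
         *    POSTED flag is set), followed by count register offsets;
         *  - REG()/REG16(): a dword offset encoded 7 bits per byte, high
         *    bits first, with BIT(7) marking a continuation byte.
         *
         * Worked example: REG16(0x244) encodes as the bytes 0x81, 0x11. The
         * decoder reads 0x81 (continuation, payload 1) then 0x11, giving
         * offset = (1 << 7) | 0x11 = 0x91, so regs[0] = base + (0x91 << 2) =
         * base + 0x244. Each register consumes two slots; the value slot is
         * left for the default state or the hardware to fill in.
         */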
     23static void set_offsets(u32 *regs,
     24			const u8 *data,
     25			const struct intel_engine_cs *engine,
     26			bool close)
     27#define NOP(x) (BIT(7) | (x))
     28#define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
     29#define POSTED BIT(0)
     30#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
     31#define REG16(x) \
     32	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
     33	(((x) >> 2) & 0x7f)
     34#define END 0
     35{
     36	const u32 base = engine->mmio_base;
     37
     38	while (*data) {
     39		u8 count, flags;
     40
     41		if (*data & BIT(7)) { /* skip */
     42			count = *data++ & ~BIT(7);
     43			regs += count;
     44			continue;
     45		}
     46
     47		count = *data & 0x3f;
     48		flags = *data >> 6;
     49		data++;
     50
     51		*regs = MI_LOAD_REGISTER_IMM(count);
     52		if (flags & POSTED)
     53			*regs |= MI_LRI_FORCE_POSTED;
     54		if (GRAPHICS_VER(engine->i915) >= 11)
     55			*regs |= MI_LRI_LRM_CS_MMIO;
     56		regs++;
     57
     58		GEM_BUG_ON(!count);
     59		do {
     60			u32 offset = 0;
     61			u8 v;
     62
     63			do {
     64				v = *data++;
     65				offset <<= 7;
     66				offset |= v & ~BIT(7);
     67			} while (v & BIT(7));
     68
     69			regs[0] = base + (offset << 2);
     70			regs += 2;
     71		} while (--count);
     72	}
     73
     74	if (close) {
     75		/* Close the batch; used mainly by live_lrc_layout() */
     76		*regs = MI_BATCH_BUFFER_END;
     77		if (GRAPHICS_VER(engine->i915) >= 11)
     78			*regs |= BIT(0);
     79	}
     80}
     81
     82static const u8 gen8_xcs_offsets[] = {
     83	NOP(1),
     84	LRI(11, 0),
     85	REG16(0x244),
     86	REG(0x034),
     87	REG(0x030),
     88	REG(0x038),
     89	REG(0x03c),
     90	REG(0x168),
     91	REG(0x140),
     92	REG(0x110),
     93	REG(0x11c),
     94	REG(0x114),
     95	REG(0x118),
     96
     97	NOP(9),
     98	LRI(9, 0),
     99	REG16(0x3a8),
    100	REG16(0x28c),
    101	REG16(0x288),
    102	REG16(0x284),
    103	REG16(0x280),
    104	REG16(0x27c),
    105	REG16(0x278),
    106	REG16(0x274),
    107	REG16(0x270),
    108
    109	NOP(13),
    110	LRI(2, 0),
    111	REG16(0x200),
    112	REG(0x028),
    113
    114	END
    115};
    116
    117static const u8 gen9_xcs_offsets[] = {
    118	NOP(1),
    119	LRI(14, POSTED),
    120	REG16(0x244),
    121	REG(0x034),
    122	REG(0x030),
    123	REG(0x038),
    124	REG(0x03c),
    125	REG(0x168),
    126	REG(0x140),
    127	REG(0x110),
    128	REG(0x11c),
    129	REG(0x114),
    130	REG(0x118),
    131	REG(0x1c0),
    132	REG(0x1c4),
    133	REG(0x1c8),
    134
    135	NOP(3),
    136	LRI(9, POSTED),
    137	REG16(0x3a8),
    138	REG16(0x28c),
    139	REG16(0x288),
    140	REG16(0x284),
    141	REG16(0x280),
    142	REG16(0x27c),
    143	REG16(0x278),
    144	REG16(0x274),
    145	REG16(0x270),
    146
    147	NOP(13),
    148	LRI(1, POSTED),
    149	REG16(0x200),
    150
    151	NOP(13),
    152	LRI(44, POSTED),
    153	REG(0x028),
    154	REG(0x09c),
    155	REG(0x0c0),
    156	REG(0x178),
    157	REG(0x17c),
    158	REG16(0x358),
    159	REG(0x170),
    160	REG(0x150),
    161	REG(0x154),
    162	REG(0x158),
    163	REG16(0x41c),
    164	REG16(0x600),
    165	REG16(0x604),
    166	REG16(0x608),
    167	REG16(0x60c),
    168	REG16(0x610),
    169	REG16(0x614),
    170	REG16(0x618),
    171	REG16(0x61c),
    172	REG16(0x620),
    173	REG16(0x624),
    174	REG16(0x628),
    175	REG16(0x62c),
    176	REG16(0x630),
    177	REG16(0x634),
    178	REG16(0x638),
    179	REG16(0x63c),
    180	REG16(0x640),
    181	REG16(0x644),
    182	REG16(0x648),
    183	REG16(0x64c),
    184	REG16(0x650),
    185	REG16(0x654),
    186	REG16(0x658),
    187	REG16(0x65c),
    188	REG16(0x660),
    189	REG16(0x664),
    190	REG16(0x668),
    191	REG16(0x66c),
    192	REG16(0x670),
    193	REG16(0x674),
    194	REG16(0x678),
    195	REG16(0x67c),
    196	REG(0x068),
    197
    198	END
    199};
    200
    201static const u8 gen12_xcs_offsets[] = {
    202	NOP(1),
    203	LRI(13, POSTED),
    204	REG16(0x244),
    205	REG(0x034),
    206	REG(0x030),
    207	REG(0x038),
    208	REG(0x03c),
    209	REG(0x168),
    210	REG(0x140),
    211	REG(0x110),
    212	REG(0x1c0),
    213	REG(0x1c4),
    214	REG(0x1c8),
    215	REG(0x180),
    216	REG16(0x2b4),
    217
    218	NOP(5),
    219	LRI(9, POSTED),
    220	REG16(0x3a8),
    221	REG16(0x28c),
    222	REG16(0x288),
    223	REG16(0x284),
    224	REG16(0x280),
    225	REG16(0x27c),
    226	REG16(0x278),
    227	REG16(0x274),
    228	REG16(0x270),
    229
    230	END
    231};
    232
    233static const u8 dg2_xcs_offsets[] = {
    234	NOP(1),
    235	LRI(15, POSTED),
    236	REG16(0x244),
    237	REG(0x034),
    238	REG(0x030),
    239	REG(0x038),
    240	REG(0x03c),
    241	REG(0x168),
    242	REG(0x140),
    243	REG(0x110),
    244	REG(0x1c0),
    245	REG(0x1c4),
    246	REG(0x1c8),
    247	REG(0x180),
    248	REG16(0x2b4),
    249	REG(0x120),
    250	REG(0x124),
    251
    252	NOP(1),
    253	LRI(9, POSTED),
    254	REG16(0x3a8),
    255	REG16(0x28c),
    256	REG16(0x288),
    257	REG16(0x284),
    258	REG16(0x280),
    259	REG16(0x27c),
    260	REG16(0x278),
    261	REG16(0x274),
    262	REG16(0x270),
    263
    264	END
    265};
    266
    267static const u8 gen8_rcs_offsets[] = {
    268	NOP(1),
    269	LRI(14, POSTED),
    270	REG16(0x244),
    271	REG(0x034),
    272	REG(0x030),
    273	REG(0x038),
    274	REG(0x03c),
    275	REG(0x168),
    276	REG(0x140),
    277	REG(0x110),
    278	REG(0x11c),
    279	REG(0x114),
    280	REG(0x118),
    281	REG(0x1c0),
    282	REG(0x1c4),
    283	REG(0x1c8),
    284
    285	NOP(3),
    286	LRI(9, POSTED),
    287	REG16(0x3a8),
    288	REG16(0x28c),
    289	REG16(0x288),
    290	REG16(0x284),
    291	REG16(0x280),
    292	REG16(0x27c),
    293	REG16(0x278),
    294	REG16(0x274),
    295	REG16(0x270),
    296
    297	NOP(13),
    298	LRI(1, 0),
    299	REG(0x0c8),
    300
    301	END
    302};
    303
    304static const u8 gen9_rcs_offsets[] = {
    305	NOP(1),
    306	LRI(14, POSTED),
    307	REG16(0x244),
    308	REG(0x34),
    309	REG(0x30),
    310	REG(0x38),
    311	REG(0x3c),
    312	REG(0x168),
    313	REG(0x140),
    314	REG(0x110),
    315	REG(0x11c),
    316	REG(0x114),
    317	REG(0x118),
    318	REG(0x1c0),
    319	REG(0x1c4),
    320	REG(0x1c8),
    321
    322	NOP(3),
    323	LRI(9, POSTED),
    324	REG16(0x3a8),
    325	REG16(0x28c),
    326	REG16(0x288),
    327	REG16(0x284),
    328	REG16(0x280),
    329	REG16(0x27c),
    330	REG16(0x278),
    331	REG16(0x274),
    332	REG16(0x270),
    333
    334	NOP(13),
    335	LRI(1, 0),
    336	REG(0xc8),
    337
    338	NOP(13),
    339	LRI(44, POSTED),
    340	REG(0x28),
    341	REG(0x9c),
    342	REG(0xc0),
    343	REG(0x178),
    344	REG(0x17c),
    345	REG16(0x358),
    346	REG(0x170),
    347	REG(0x150),
    348	REG(0x154),
    349	REG(0x158),
    350	REG16(0x41c),
    351	REG16(0x600),
    352	REG16(0x604),
    353	REG16(0x608),
    354	REG16(0x60c),
    355	REG16(0x610),
    356	REG16(0x614),
    357	REG16(0x618),
    358	REG16(0x61c),
    359	REG16(0x620),
    360	REG16(0x624),
    361	REG16(0x628),
    362	REG16(0x62c),
    363	REG16(0x630),
    364	REG16(0x634),
    365	REG16(0x638),
    366	REG16(0x63c),
    367	REG16(0x640),
    368	REG16(0x644),
    369	REG16(0x648),
    370	REG16(0x64c),
    371	REG16(0x650),
    372	REG16(0x654),
    373	REG16(0x658),
    374	REG16(0x65c),
    375	REG16(0x660),
    376	REG16(0x664),
    377	REG16(0x668),
    378	REG16(0x66c),
    379	REG16(0x670),
    380	REG16(0x674),
    381	REG16(0x678),
    382	REG16(0x67c),
    383	REG(0x68),
    384
    385	END
    386};
    387
    388static const u8 gen11_rcs_offsets[] = {
    389	NOP(1),
    390	LRI(15, POSTED),
    391	REG16(0x244),
    392	REG(0x034),
    393	REG(0x030),
    394	REG(0x038),
    395	REG(0x03c),
    396	REG(0x168),
    397	REG(0x140),
    398	REG(0x110),
    399	REG(0x11c),
    400	REG(0x114),
    401	REG(0x118),
    402	REG(0x1c0),
    403	REG(0x1c4),
    404	REG(0x1c8),
    405	REG(0x180),
    406
    407	NOP(1),
    408	LRI(9, POSTED),
    409	REG16(0x3a8),
    410	REG16(0x28c),
    411	REG16(0x288),
    412	REG16(0x284),
    413	REG16(0x280),
    414	REG16(0x27c),
    415	REG16(0x278),
    416	REG16(0x274),
    417	REG16(0x270),
    418
    419	LRI(1, POSTED),
    420	REG(0x1b0),
    421
    422	NOP(10),
    423	LRI(1, 0),
    424	REG(0x0c8),
    425
    426	END
    427};
    428
    429static const u8 gen12_rcs_offsets[] = {
    430	NOP(1),
    431	LRI(13, POSTED),
    432	REG16(0x244),
    433	REG(0x034),
    434	REG(0x030),
    435	REG(0x038),
    436	REG(0x03c),
    437	REG(0x168),
    438	REG(0x140),
    439	REG(0x110),
    440	REG(0x1c0),
    441	REG(0x1c4),
    442	REG(0x1c8),
    443	REG(0x180),
    444	REG16(0x2b4),
    445
    446	NOP(5),
    447	LRI(9, POSTED),
    448	REG16(0x3a8),
    449	REG16(0x28c),
    450	REG16(0x288),
    451	REG16(0x284),
    452	REG16(0x280),
    453	REG16(0x27c),
    454	REG16(0x278),
    455	REG16(0x274),
    456	REG16(0x270),
    457
    458	LRI(3, POSTED),
    459	REG(0x1b0),
    460	REG16(0x5a8),
    461	REG16(0x5ac),
    462
    463	NOP(6),
    464	LRI(1, 0),
    465	REG(0x0c8),
    466	NOP(3 + 9 + 1),
    467
    468	LRI(51, POSTED),
    469	REG16(0x588),
    470	REG16(0x588),
    471	REG16(0x588),
    472	REG16(0x588),
    473	REG16(0x588),
    474	REG16(0x588),
    475	REG(0x028),
    476	REG(0x09c),
    477	REG(0x0c0),
    478	REG(0x178),
    479	REG(0x17c),
    480	REG16(0x358),
    481	REG(0x170),
    482	REG(0x150),
    483	REG(0x154),
    484	REG(0x158),
    485	REG16(0x41c),
    486	REG16(0x600),
    487	REG16(0x604),
    488	REG16(0x608),
    489	REG16(0x60c),
    490	REG16(0x610),
    491	REG16(0x614),
    492	REG16(0x618),
    493	REG16(0x61c),
    494	REG16(0x620),
    495	REG16(0x624),
    496	REG16(0x628),
    497	REG16(0x62c),
    498	REG16(0x630),
    499	REG16(0x634),
    500	REG16(0x638),
    501	REG16(0x63c),
    502	REG16(0x640),
    503	REG16(0x644),
    504	REG16(0x648),
    505	REG16(0x64c),
    506	REG16(0x650),
    507	REG16(0x654),
    508	REG16(0x658),
    509	REG16(0x65c),
    510	REG16(0x660),
    511	REG16(0x664),
    512	REG16(0x668),
    513	REG16(0x66c),
    514	REG16(0x670),
    515	REG16(0x674),
    516	REG16(0x678),
    517	REG16(0x67c),
    518	REG(0x068),
    519	REG(0x084),
    520	NOP(1),
    521
    522	END
    523};
    524
    525static const u8 xehp_rcs_offsets[] = {
    526	NOP(1),
    527	LRI(13, POSTED),
    528	REG16(0x244),
    529	REG(0x034),
    530	REG(0x030),
    531	REG(0x038),
    532	REG(0x03c),
    533	REG(0x168),
    534	REG(0x140),
    535	REG(0x110),
    536	REG(0x1c0),
    537	REG(0x1c4),
    538	REG(0x1c8),
    539	REG(0x180),
    540	REG16(0x2b4),
    541
    542	NOP(5),
    543	LRI(9, POSTED),
    544	REG16(0x3a8),
    545	REG16(0x28c),
    546	REG16(0x288),
    547	REG16(0x284),
    548	REG16(0x280),
    549	REG16(0x27c),
    550	REG16(0x278),
    551	REG16(0x274),
    552	REG16(0x270),
    553
    554	LRI(3, POSTED),
    555	REG(0x1b0),
    556	REG16(0x5a8),
    557	REG16(0x5ac),
    558
    559	NOP(6),
    560	LRI(1, 0),
    561	REG(0x0c8),
    562
    563	END
    564};
    565
    566static const u8 dg2_rcs_offsets[] = {
    567	NOP(1),
    568	LRI(15, POSTED),
    569	REG16(0x244),
    570	REG(0x034),
    571	REG(0x030),
    572	REG(0x038),
    573	REG(0x03c),
    574	REG(0x168),
    575	REG(0x140),
    576	REG(0x110),
    577	REG(0x1c0),
    578	REG(0x1c4),
    579	REG(0x1c8),
    580	REG(0x180),
    581	REG16(0x2b4),
    582	REG(0x120),
    583	REG(0x124),
    584
    585	NOP(1),
    586	LRI(9, POSTED),
    587	REG16(0x3a8),
    588	REG16(0x28c),
    589	REG16(0x288),
    590	REG16(0x284),
    591	REG16(0x280),
    592	REG16(0x27c),
    593	REG16(0x278),
    594	REG16(0x274),
    595	REG16(0x270),
    596
    597	LRI(3, POSTED),
    598	REG(0x1b0),
    599	REG16(0x5a8),
    600	REG16(0x5ac),
    601
    602	NOP(6),
    603	LRI(1, 0),
    604	REG(0x0c8),
    605
    606	END
    607};
    608
    609#undef END
    610#undef REG16
    611#undef REG
    612#undef LRI
    613#undef NOP
    614
    615static const u8 *reg_offsets(const struct intel_engine_cs *engine)
    616{
    617	/*
    618	 * The gen12+ lists only have the registers we program in the basic
    619	 * default state. We rely on the context image using relative
     620	 * addressing to automatically fix up the register state between the
     621	 * physical engines of a virtual engine.
    622	 */
    623	GEM_BUG_ON(GRAPHICS_VER(engine->i915) >= 12 &&
    624		   !intel_engine_has_relative_mmio(engine));
    625
    626	if (engine->flags & I915_ENGINE_HAS_RCS_REG_STATE) {
    627		if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
    628			return dg2_rcs_offsets;
    629		else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
    630			return xehp_rcs_offsets;
    631		else if (GRAPHICS_VER(engine->i915) >= 12)
    632			return gen12_rcs_offsets;
    633		else if (GRAPHICS_VER(engine->i915) >= 11)
    634			return gen11_rcs_offsets;
    635		else if (GRAPHICS_VER(engine->i915) >= 9)
    636			return gen9_rcs_offsets;
    637		else
    638			return gen8_rcs_offsets;
    639	} else {
    640		if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
    641			return dg2_xcs_offsets;
    642		else if (GRAPHICS_VER(engine->i915) >= 12)
    643			return gen12_xcs_offsets;
    644		else if (GRAPHICS_VER(engine->i915) >= 9)
    645			return gen9_xcs_offsets;
    646		else
    647			return gen8_xcs_offsets;
    648	}
    649}
    650
    651static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
    652{
    653	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
    654		return 0x70;
    655	else if (GRAPHICS_VER(engine->i915) >= 12)
    656		return 0x60;
    657	else if (GRAPHICS_VER(engine->i915) >= 9)
    658		return 0x54;
    659	else if (engine->class == RENDER_CLASS)
    660		return 0x58;
    661	else
    662		return -1;
    663}
    664
    665static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
    666{
    667	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
    668		return 0x84;
    669	else if (GRAPHICS_VER(engine->i915) >= 12)
    670		return 0x74;
    671	else if (GRAPHICS_VER(engine->i915) >= 9)
    672		return 0x68;
    673	else if (engine->class == RENDER_CLASS)
    674		return 0xd8;
    675	else
    676		return -1;
    677}
    678
    679static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
    680{
    681	if (GRAPHICS_VER(engine->i915) >= 12)
    682		return 0x12;
    683	else if (GRAPHICS_VER(engine->i915) >= 9 || engine->class == RENDER_CLASS)
    684		return 0x18;
    685	else
    686		return -1;
    687}
    688
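        /*
         * The lrc_ring_*() helpers return the dword index of a register's
         * offset slot within the context image; the register's value lives
         * at index + 1 (hence the "[x + 1]" indexing at the use sites), and
         * the next (reg, value) pair begins at index + 2, which is why the
         * helpers below step by two.
         */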
    689static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
    690{
    691	int x;
    692
    693	x = lrc_ring_wa_bb_per_ctx(engine);
    694	if (x < 0)
    695		return x;
    696
    697	return x + 2;
    698}
    699
    700static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
    701{
    702	int x;
    703
    704	x = lrc_ring_indirect_ptr(engine);
    705	if (x < 0)
    706		return x;
    707
    708	return x + 2;
    709}
    710
    711static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
    712{
    713
    714	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
    715		/*
    716		 * Note that the CSFE context has a dummy slot for CMD_BUF_CCTL
    717		 * simply to match the RCS context image layout.
    718		 */
    719		return 0xc6;
    720	else if (engine->class != RENDER_CLASS)
    721		return -1;
    722	else if (GRAPHICS_VER(engine->i915) >= 12)
    723		return 0xb6;
    724	else if (GRAPHICS_VER(engine->i915) >= 11)
    725		return 0xaa;
    726	else
    727		return -1;
    728}
    729
    730static u32
    731lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
    732{
    733	switch (GRAPHICS_VER(engine->i915)) {
    734	default:
    735		MISSING_CASE(GRAPHICS_VER(engine->i915));
    736		fallthrough;
    737	case 12:
    738		return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
    739	case 11:
    740		return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
    741	case 9:
    742		return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
    743	case 8:
    744		return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
    745	}
    746}
    747
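        /*
         * INDIRECT_CTX packs the (cacheline-aligned) GGTT address of the
         * per-context WA batch together with its size in cachelines in the
         * register's low bits, while INDIRECT_CTX_OFFSET takes the per-gen
         * default offset shifted into place (<< 6).
         */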
    748static void
    749lrc_setup_indirect_ctx(u32 *regs,
    750		       const struct intel_engine_cs *engine,
    751		       u32 ctx_bb_ggtt_addr,
    752		       u32 size)
    753{
    754	GEM_BUG_ON(!size);
    755	GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
    756	GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
    757	regs[lrc_ring_indirect_ptr(engine) + 1] =
    758		ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
    759
    760	GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
    761	regs[lrc_ring_indirect_offset(engine) + 1] =
    762		lrc_ring_indirect_offset_default(engine) << 6;
    763}
    764
    765static void init_common_regs(u32 * const regs,
    766			     const struct intel_context *ce,
    767			     const struct intel_engine_cs *engine,
    768			     bool inhibit)
    769{
    770	u32 ctl;
    771
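        	/*
        	 * CTX_CONTEXT_CONTROL is a masked register: the high 16 bits of
        	 * the value written select which of the low 16 bits take effect,
        	 * which is what _MASKED_BIT_ENABLE()/_MASKED_BIT_DISABLE() encode.
        	 */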
    772	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
    773	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
    774	if (inhibit)
    775		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
    776	if (GRAPHICS_VER(engine->i915) < 11)
    777		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
    778					   CTX_CTRL_RS_CTX_ENABLE);
    779	regs[CTX_CONTEXT_CONTROL] = ctl;
    780
    781	regs[CTX_TIMESTAMP] = ce->stats.runtime.last;
    782}
    783
    784static void init_wa_bb_regs(u32 * const regs,
    785			    const struct intel_engine_cs *engine)
    786{
    787	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
    788
    789	if (wa_ctx->per_ctx.size) {
    790		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
    791
    792		GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
    793		regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
    794			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
    795	}
    796
    797	if (wa_ctx->indirect_ctx.size) {
    798		lrc_setup_indirect_ctx(regs, engine,
    799				       i915_ggtt_offset(wa_ctx->vma) +
    800				       wa_ctx->indirect_ctx.offset,
    801				       wa_ctx->indirect_ctx.size);
    802	}
    803}
    804
    805static void init_ppgtt_regs(u32 *regs, const struct i915_ppgtt *ppgtt)
    806{
    807	if (i915_vm_is_4lvl(&ppgtt->vm)) {
    808		/* 64b PPGTT (48bit canonical)
     809		 * PDP0_DESCRIPTOR contains the base address of the PML4 and
    810		 * other PDP Descriptors are ignored.
    811		 */
    812		ASSIGN_CTX_PML4(ppgtt, regs);
    813	} else {
    814		ASSIGN_CTX_PDP(ppgtt, regs, 3);
    815		ASSIGN_CTX_PDP(ppgtt, regs, 2);
    816		ASSIGN_CTX_PDP(ppgtt, regs, 1);
    817		ASSIGN_CTX_PDP(ppgtt, regs, 0);
    818	}
    819}
    820
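        /*
         * Contexts bound to the GGTT take their PPGTT register state from
         * the aliasing PPGTT; anything else is already a full PPGTT.
         */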
    821static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
    822{
    823	if (i915_is_ggtt(vm))
    824		return i915_vm_to_ggtt(vm)->alias;
    825	else
    826		return i915_vm_to_ppgtt(vm);
    827}
    828
    829static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
    830{
    831	int x;
    832
    833	x = lrc_ring_mi_mode(engine);
    834	if (x != -1) {
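        		/*
        		 * Masked write: set the STOP_RING mask bit (high word)
        		 * while clearing the value bit, so only STOP_RING is
        		 * cleared and the other MI_MODE bits are left untouched.
        		 */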
    835		regs[x + 1] &= ~STOP_RING;
    836		regs[x + 1] |= STOP_RING << 16;
    837	}
    838}
    839
    840static void __lrc_init_regs(u32 *regs,
    841			    const struct intel_context *ce,
    842			    const struct intel_engine_cs *engine,
    843			    bool inhibit)
    844{
    845	/*
    846	 * A context is actually a big batch buffer with several
    847	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
    848	 * values we are setting here are only for the first context restore:
     849	 * on a subsequent save, the GPU will recreate this batch buffer with new
    850	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
    851	 * we are not initializing here).
    852	 *
    853	 * Must keep consistent with virtual_update_register_offsets().
    854	 */
    855
    856	if (inhibit)
    857		memset(regs, 0, PAGE_SIZE);
    858
    859	set_offsets(regs, reg_offsets(engine), engine, inhibit);
    860
    861	init_common_regs(regs, ce, engine, inhibit);
    862	init_ppgtt_regs(regs, vm_alias(ce->vm));
    863
    864	init_wa_bb_regs(regs, engine);
    865
    866	__reset_stop_ring(regs, engine);
    867}
    868
    869void lrc_init_regs(const struct intel_context *ce,
    870		   const struct intel_engine_cs *engine,
    871		   bool inhibit)
    872{
    873	__lrc_init_regs(ce->lrc_reg_state, ce, engine, inhibit);
    874}
    875
    876void lrc_reset_regs(const struct intel_context *ce,
    877		    const struct intel_engine_cs *engine)
    878{
    879	__reset_stop_ring(ce->lrc_reg_state, engine);
    880}
    881
    882static void
    883set_redzone(void *vaddr, const struct intel_engine_cs *engine)
    884{
    885	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
    886		return;
    887
    888	vaddr += engine->context_size;
    889
    890	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
    891}
    892
    893static void
    894check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
    895{
    896	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
    897		return;
    898
    899	vaddr += engine->context_size;
    900
    901	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
    902		drm_err_once(&engine->i915->drm,
    903			     "%s context redzone overwritten!\n",
    904			     engine->name);
    905}
    906
    907static u32 context_wa_bb_offset(const struct intel_context *ce)
    908{
    909	return PAGE_SIZE * ce->wa_bb_page;
    910}
    911
    912static u32 *context_indirect_bb(const struct intel_context *ce)
    913{
    914	void *ptr;
    915
    916	GEM_BUG_ON(!ce->wa_bb_page);
    917
    918	ptr = ce->lrc_reg_state;
    919	ptr -= LRC_STATE_OFFSET; /* back to start of context image */
    920	ptr += context_wa_bb_offset(ce);
    921
    922	return ptr;
    923}
    924
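        /*
         * Populate a freshly allocated context image: copy the golden
         * default state when one is recorded (otherwise inhibit the first
         * restore), scrub the ppHWSP and the WA page, then write the
         * register defaults into the state page.
         */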
    925void lrc_init_state(struct intel_context *ce,
    926		    struct intel_engine_cs *engine,
    927		    void *state)
    928{
    929	bool inhibit = true;
    930
    931	set_redzone(state, engine);
    932
    933	if (engine->default_state) {
    934		shmem_read(engine->default_state, 0,
    935			   state, engine->context_size);
    936		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
    937		inhibit = false;
    938	}
    939
    940	/* Clear the ppHWSP (inc. per-context counters) */
    941	memset(state, 0, PAGE_SIZE);
    942
    943	/* Clear the indirect wa and storage */
    944	if (ce->wa_bb_page)
    945		memset(state + context_wa_bb_offset(ce), 0, PAGE_SIZE);
    946
    947	/*
    948	 * The second page of the context object contains some registers which
    949	 * must be set up prior to the first execution.
    950	 */
    951	__lrc_init_regs(state + LRC_STATE_OFFSET, ce, engine, inhibit);
    952}
    953
    954u32 lrc_indirect_bb(const struct intel_context *ce)
    955{
    956	return i915_ggtt_offset(ce->state) + context_wa_bb_offset(ce);
    957}
    958
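        /*
         * Build the DG2 predicate-disable batch in the indirect WA page: it
         * clears, and later re-arms, the scratch dword at
         * DG2_PREDICATE_RESULT_WA around an MI_SET_PREDICATE disable, with a
         * predicated MI_BATCH_BUFFER_END providing the early-out.
         */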
    959static u32 *setup_predicate_disable_wa(const struct intel_context *ce, u32 *cs)
    960{
    961	/* If predication is active, this will be noop'ed */
    962	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
    963	*cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
    964	*cs++ = 0;
    965	*cs++ = 0; /* No predication */
    966
    967	/* predicated end, only terminates if SET_PREDICATE_RESULT:0 is clear */
    968	*cs++ = MI_BATCH_BUFFER_END | BIT(15);
    969	*cs++ = MI_SET_PREDICATE | MI_SET_PREDICATE_DISABLE;
    970
     971	/* Instructions are no longer predicated (disabled); we can proceed */
    972	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
    973	*cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
    974	*cs++ = 0;
    975	*cs++ = 1; /* enable predication before the next BB */
    976
    977	*cs++ = MI_BATCH_BUFFER_END;
    978	GEM_BUG_ON(offset_in_page(cs) > DG2_PREDICATE_RESULT_WA);
    979
    980	return cs;
    981}
    982
    983static struct i915_vma *
    984__lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
    985{
    986	struct drm_i915_gem_object *obj;
    987	struct i915_vma *vma;
    988	u32 context_size;
    989
    990	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
    991
    992	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
    993		context_size += I915_GTT_PAGE_SIZE; /* for redzone */
    994
    995	if (GRAPHICS_VER(engine->i915) == 12) {
    996		ce->wa_bb_page = context_size / PAGE_SIZE;
    997		context_size += PAGE_SIZE;
    998	}
    999
   1000	if (intel_context_is_parent(ce) && intel_engine_uses_guc(engine)) {
   1001		ce->parallel.guc.parent_page = context_size / PAGE_SIZE;
   1002		context_size += PARENT_SCRATCH_SIZE;
   1003	}
   1004
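        	/*
        	 * Prefer device-local memory for the context image; fall back
        	 * to shmem if the lmem allocation fails (e.g. on integrated
        	 * parts without lmem).
        	 */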
   1005	obj = i915_gem_object_create_lmem(engine->i915, context_size,
   1006					  I915_BO_ALLOC_PM_VOLATILE);
   1007	if (IS_ERR(obj))
   1008		obj = i915_gem_object_create_shmem(engine->i915, context_size);
   1009	if (IS_ERR(obj))
   1010		return ERR_CAST(obj);
   1011
   1012	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
   1013	if (IS_ERR(vma)) {
   1014		i915_gem_object_put(obj);
   1015		return vma;
   1016	}
   1017
   1018	return vma;
   1019}
   1020
   1021static struct intel_timeline *
   1022pinned_timeline(struct intel_context *ce, struct intel_engine_cs *engine)
   1023{
   1024	struct intel_timeline *tl = fetch_and_zero(&ce->timeline);
   1025
   1026	return intel_timeline_create_from_engine(engine, page_unmask_bits(tl));
   1027}
   1028
   1029int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine)
   1030{
   1031	struct intel_ring *ring;
   1032	struct i915_vma *vma;
   1033	int err;
   1034
   1035	GEM_BUG_ON(ce->state);
   1036
   1037	vma = __lrc_alloc_state(ce, engine);
   1038	if (IS_ERR(vma))
   1039		return PTR_ERR(vma);
   1040
   1041	ring = intel_engine_create_ring(engine, ce->ring_size);
   1042	if (IS_ERR(ring)) {
   1043		err = PTR_ERR(ring);
   1044		goto err_vma;
   1045	}
   1046
   1047	if (!page_mask_bits(ce->timeline)) {
   1048		struct intel_timeline *tl;
   1049
   1050		/*
   1051		 * Use the static global HWSP for the kernel context, and
   1052		 * a dynamically allocated cacheline for everyone else.
   1053		 */
   1054		if (unlikely(ce->timeline))
   1055			tl = pinned_timeline(ce, engine);
   1056		else
   1057			tl = intel_timeline_create(engine->gt);
   1058		if (IS_ERR(tl)) {
   1059			err = PTR_ERR(tl);
   1060			goto err_ring;
   1061		}
   1062
   1063		ce->timeline = tl;
   1064	}
   1065
   1066	ce->ring = ring;
   1067	ce->state = vma;
   1068
   1069	return 0;
   1070
   1071err_ring:
   1072	intel_ring_put(ring);
   1073err_vma:
   1074	i915_vma_put(vma);
   1075	return err;
   1076}
   1077
   1078void lrc_reset(struct intel_context *ce)
   1079{
   1080	GEM_BUG_ON(!intel_context_is_pinned(ce));
   1081
   1082	intel_ring_reset(ce->ring, ce->ring->emit);
   1083
   1084	/* Scrub away the garbage */
   1085	lrc_init_regs(ce, ce->engine, true);
   1086	ce->lrc.lrca = lrc_update_regs(ce, ce->engine, ce->ring->tail);
   1087}
   1088
   1089int
   1090lrc_pre_pin(struct intel_context *ce,
   1091	    struct intel_engine_cs *engine,
   1092	    struct i915_gem_ww_ctx *ww,
   1093	    void **vaddr)
   1094{
   1095	GEM_BUG_ON(!ce->state);
   1096	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
   1097
   1098	*vaddr = i915_gem_object_pin_map(ce->state->obj,
   1099					 i915_coherent_map_type(ce->engine->i915,
   1100								ce->state->obj,
   1101								false) |
   1102					 I915_MAP_OVERRIDE);
   1103
   1104	return PTR_ERR_OR_ZERO(*vaddr);
   1105}
   1106
   1107int
   1108lrc_pin(struct intel_context *ce,
   1109	struct intel_engine_cs *engine,
   1110	void *vaddr)
   1111{
   1112	ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
   1113
   1114	if (!__test_and_set_bit(CONTEXT_INIT_BIT, &ce->flags))
   1115		lrc_init_state(ce, engine, vaddr);
   1116
   1117	ce->lrc.lrca = lrc_update_regs(ce, engine, ce->ring->tail);
   1118	return 0;
   1119}
   1120
   1121void lrc_unpin(struct intel_context *ce)
   1122{
   1123	if (unlikely(ce->parallel.last_rq)) {
   1124		i915_request_put(ce->parallel.last_rq);
   1125		ce->parallel.last_rq = NULL;
   1126	}
   1127	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
   1128		      ce->engine);
   1129}
   1130
   1131void lrc_post_unpin(struct intel_context *ce)
   1132{
   1133	i915_gem_object_unpin_map(ce->state->obj);
   1134}
   1135
   1136void lrc_fini(struct intel_context *ce)
   1137{
   1138	if (!ce->state)
   1139		return;
   1140
   1141	intel_ring_put(fetch_and_zero(&ce->ring));
   1142	i915_vma_put(fetch_and_zero(&ce->state));
   1143}
   1144
   1145void lrc_destroy(struct kref *kref)
   1146{
   1147	struct intel_context *ce = container_of(kref, typeof(*ce), ref);
   1148
   1149	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
   1150	GEM_BUG_ON(intel_context_is_pinned(ce));
   1151
   1152	lrc_fini(ce);
   1153
   1154	intel_context_fini(ce);
   1155	intel_context_free(ce);
   1156}
   1157
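        /*
         * Reload the CTX_TIMESTAMP dword saved in the context image into CS
         * GPR0 with MI_LRM, then copy it back into RING_CTX_TIMESTAMP with
         * MI_LRR; note the MI_LRR copy is issued twice.
         */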
   1158static u32 *
   1159gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
   1160{
   1161	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
   1162		MI_SRM_LRM_GLOBAL_GTT |
   1163		MI_LRI_LRM_CS_MMIO;
   1164	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
   1165	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
   1166		CTX_TIMESTAMP * sizeof(u32);
   1167	*cs++ = 0;
   1168
   1169	*cs++ = MI_LOAD_REGISTER_REG |
   1170		MI_LRR_SOURCE_CS_MMIO |
   1171		MI_LRI_LRM_CS_MMIO;
   1172	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
   1173	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
   1174
   1175	*cs++ = MI_LOAD_REGISTER_REG |
   1176		MI_LRR_SOURCE_CS_MMIO |
   1177		MI_LRI_LRM_CS_MMIO;
   1178	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
   1179	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
   1180
   1181	return cs;
   1182}
   1183
   1184static u32 *
   1185gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
   1186{
   1187	GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
   1188
   1189	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
   1190		MI_SRM_LRM_GLOBAL_GTT |
   1191		MI_LRI_LRM_CS_MMIO;
   1192	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
   1193	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
   1194		(lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
   1195	*cs++ = 0;
   1196
   1197	return cs;
   1198}
   1199
   1200static u32 *
   1201gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
   1202{
   1203	GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
   1204
   1205	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
   1206		MI_SRM_LRM_GLOBAL_GTT |
   1207		MI_LRI_LRM_CS_MMIO;
   1208	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
   1209	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
   1210		(lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
   1211	*cs++ = 0;
   1212
   1213	*cs++ = MI_LOAD_REGISTER_REG |
   1214		MI_LRR_SOURCE_CS_MMIO |
   1215		MI_LRI_LRM_CS_MMIO;
   1216	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
   1217	*cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
   1218
   1219	return cs;
   1220}
   1221
   1222/*
   1223 * On DG2 during context restore of a preempted context in GPGPU mode,
    1224 * an RCS restore hang is detected. This is extremely timing dependent.
    1225 * To address this, the SW wabb below is implemented for DG2 A steppings.
   1226 */
   1227static u32 *
   1228dg2_emit_rcs_hang_wabb(const struct intel_context *ce, u32 *cs)
   1229{
   1230	*cs++ = MI_LOAD_REGISTER_IMM(1);
   1231	*cs++ = i915_mmio_reg_offset(GEN12_STATE_ACK_DEBUG);
   1232	*cs++ = 0x21;
   1233
   1234	*cs++ = MI_LOAD_REGISTER_REG;
   1235	*cs++ = i915_mmio_reg_offset(RING_NOPID(ce->engine->mmio_base));
   1236	*cs++ = i915_mmio_reg_offset(GEN12_CULLBIT1);
   1237
   1238	*cs++ = MI_LOAD_REGISTER_REG;
   1239	*cs++ = i915_mmio_reg_offset(RING_NOPID(ce->engine->mmio_base));
   1240	*cs++ = i915_mmio_reg_offset(GEN12_CULLBIT2);
   1241
   1242	return cs;
   1243}
   1244
   1245static u32 *
   1246gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
   1247{
   1248	cs = gen12_emit_timestamp_wa(ce, cs);
   1249	cs = gen12_emit_cmd_buf_wa(ce, cs);
   1250	cs = gen12_emit_restore_scratch(ce, cs);
   1251
   1252	/* Wa_22011450934:dg2 */
   1253	if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_A0, STEP_B0) ||
   1254	    IS_DG2_GRAPHICS_STEP(ce->engine->i915, G11, STEP_A0, STEP_B0))
   1255		cs = dg2_emit_rcs_hang_wabb(ce, cs);
   1256
   1257	/* Wa_16013000631:dg2 */
   1258	if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_B0, STEP_C0) ||
   1259	    IS_DG2_G11(ce->engine->i915))
   1260		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE, 0);
   1261
   1262	/* hsdes: 1809175790 */
   1263	if (!HAS_FLAT_CCS(ce->engine->i915))
   1264		cs = gen12_emit_aux_table_inv(cs, GEN12_GFX_CCS_AUX_NV);
   1265
   1266	return cs;
   1267}
   1268
   1269static u32 *
   1270gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
   1271{
   1272	cs = gen12_emit_timestamp_wa(ce, cs);
   1273	cs = gen12_emit_restore_scratch(ce, cs);
   1274
   1275	/* Wa_16013000631:dg2 */
   1276	if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_B0, STEP_C0) ||
   1277	    IS_DG2_G11(ce->engine->i915))
   1278		if (ce->engine->class == COMPUTE_CLASS)
   1279			cs = gen8_emit_pipe_control(cs,
   1280						    PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE,
   1281						    0);
   1282
   1283	/* hsdes: 1809175790 */
   1284	if (!HAS_FLAT_CCS(ce->engine->i915)) {
   1285		if (ce->engine->class == VIDEO_DECODE_CLASS)
   1286			cs = gen12_emit_aux_table_inv(cs, GEN12_VD0_AUX_NV);
   1287		else if (ce->engine->class == VIDEO_ENHANCEMENT_CLASS)
   1288			cs = gen12_emit_aux_table_inv(cs, GEN12_VE0_AUX_NV);
   1289	}
   1290
   1291	return cs;
   1292}
   1293
   1294static void
   1295setup_indirect_ctx_bb(const struct intel_context *ce,
   1296		      const struct intel_engine_cs *engine,
   1297		      u32 *(*emit)(const struct intel_context *, u32 *))
   1298{
   1299	u32 * const start = context_indirect_bb(ce);
   1300	u32 *cs;
   1301
   1302	cs = emit(ce, start);
   1303	GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
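        	/*
        	 * Pad with MI_NOOPs up to a cacheline boundary: INDIRECT_CTX
        	 * expresses the batch length in whole cachelines (see
        	 * lrc_setup_indirect_ctx()).
        	 */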
   1304	while ((unsigned long)cs % CACHELINE_BYTES)
   1305		*cs++ = MI_NOOP;
   1306
   1307	GEM_BUG_ON(cs - start > DG2_PREDICATE_RESULT_BB / sizeof(*start));
   1308	setup_predicate_disable_wa(ce, start + DG2_PREDICATE_RESULT_BB / sizeof(*start));
   1309
   1310	lrc_setup_indirect_ctx(ce->lrc_reg_state, engine,
   1311			       lrc_indirect_bb(ce),
   1312			       (cs - start) * sizeof(*cs));
   1313}
   1314
   1315/*
   1316 * The context descriptor encodes various attributes of a context,
   1317 * including its GTT address and some flags. Because it's fairly
   1318 * expensive to calculate, we'll just do it once and cache the result,
   1319 * which remains valid until the context is unpinned.
   1320 *
   1321 * This is what a descriptor looks like, from LSB to MSB::
   1322 *
   1323 *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
   1324 *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
   1325 *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
   1326 *      bits 53-54:    mbz, reserved for use by hardware
   1327 *      bits 55-63:    group ID, currently unused and set to 0
   1328 *
   1329 * Starting from Gen11, the upper dword of the descriptor has a new format:
   1330 *
   1331 *      bits 32-36:    reserved
   1332 *      bits 37-47:    SW context ID
    1333 *      bits 48-53:    engine instance
   1334 *      bit 54:        mbz, reserved for use by hardware
   1335 *      bits 55-60:    SW counter
   1336 *      bits 61-63:    engine class
   1337 *
   1338 * On Xe_HP, the upper dword of the descriptor has a new format:
   1339 *
   1340 *      bits 32-37:    virtual function number
   1341 *      bit 38:        mbz, reserved for use by hardware
   1342 *      bits 39-54:    SW context ID
   1343 *      bits 55-57:    reserved
   1344 *      bits 58-63:    SW counter
   1345 *
   1346 * engine info, SW context ID and SW counter need to form a unique number
   1347 * (Context ID) per lrc.
   1348 */
   1349static u32 lrc_descriptor(const struct intel_context *ce)
   1350{
   1351	u32 desc;
   1352
   1353	desc = INTEL_LEGACY_32B_CONTEXT;
   1354	if (i915_vm_is_4lvl(ce->vm))
   1355		desc = INTEL_LEGACY_64B_CONTEXT;
   1356	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
   1357
   1358	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
   1359	if (GRAPHICS_VER(ce->vm->i915) == 8)
   1360		desc |= GEN8_CTX_L3LLC_COHERENT;
   1361
   1362	return i915_ggtt_offset(ce->state) | desc;
   1363}
   1364
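        /*
         * Sync the context image with the current software state (ring
         * start/head/tail, RPCS, OA and the indirect WA batch) and return
         * the context descriptor with CTX_DESC_FORCE_RESTORE set, so the
         * hardware reloads the updated image on the next submission.
         */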
   1365u32 lrc_update_regs(const struct intel_context *ce,
   1366		    const struct intel_engine_cs *engine,
   1367		    u32 head)
   1368{
   1369	struct intel_ring *ring = ce->ring;
   1370	u32 *regs = ce->lrc_reg_state;
   1371
   1372	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
   1373	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
   1374
   1375	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
   1376	regs[CTX_RING_HEAD] = head;
   1377	regs[CTX_RING_TAIL] = ring->tail;
   1378	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
   1379
   1380	/* RPCS */
   1381	if (engine->class == RENDER_CLASS) {
   1382		regs[CTX_R_PWR_CLK_STATE] =
   1383			intel_sseu_make_rpcs(engine->gt, &ce->sseu);
   1384
   1385		i915_oa_init_reg_state(ce, engine);
   1386	}
   1387
   1388	if (ce->wa_bb_page) {
   1389		u32 *(*fn)(const struct intel_context *ce, u32 *cs);
   1390
   1391		fn = gen12_emit_indirect_ctx_xcs;
   1392		if (ce->engine->class == RENDER_CLASS)
   1393			fn = gen12_emit_indirect_ctx_rcs;
   1394
    1395		/* Mutually exclusive with the global indirect bb */
   1396		GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
   1397		setup_indirect_ctx_bb(ce, engine, fn);
   1398	}
   1399
   1400	return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE;
   1401}
   1402
   1403void lrc_update_offsets(struct intel_context *ce,
   1404			struct intel_engine_cs *engine)
   1405{
   1406	set_offsets(ce->lrc_reg_state, reg_offsets(engine), engine, false);
   1407}
   1408
   1409void lrc_check_regs(const struct intel_context *ce,
   1410		    const struct intel_engine_cs *engine,
   1411		    const char *when)
   1412{
   1413	const struct intel_ring *ring = ce->ring;
   1414	u32 *regs = ce->lrc_reg_state;
   1415	bool valid = true;
   1416	int x;
   1417
   1418	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
   1419		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
   1420		       engine->name,
   1421		       regs[CTX_RING_START],
   1422		       i915_ggtt_offset(ring->vma));
   1423		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
   1424		valid = false;
   1425	}
   1426
   1427	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
   1428	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
   1429		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
   1430		       engine->name,
   1431		       regs[CTX_RING_CTL],
   1432		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
   1433		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
   1434		valid = false;
   1435	}
   1436
   1437	x = lrc_ring_mi_mode(engine);
   1438	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
   1439		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
   1440		       engine->name, regs[x + 1]);
   1441		regs[x + 1] &= ~STOP_RING;
   1442		regs[x + 1] |= STOP_RING << 16;
   1443		valid = false;
   1444	}
   1445
   1446	WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when);
   1447}
   1448
   1449/*
   1450 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
   1451 * PIPE_CONTROL instruction. This is required for the flush to happen correctly
   1452 * but there is a slight complication as this is applied in WA batch where the
   1453 * values are only initialized once so we cannot take register value at the
    1454 * values are only initialized once, so we cannot read the register value at the
   1455 * constant value with bit21 set and then we restore it back with the saved value.
   1456 * To simplify the WA, a constant value is formed by using the default value
   1457 * of this register. This shouldn't be a problem because we are only modifying
    1458 * it for a short period and this batch is non-preemptible. We can of course
   1459 * use additional instructions that read the actual value of the register
   1460 * at that time and set our bit of interest but it makes the WA complicated.
   1461 *
   1462 * This WA is also required for Gen9 so extracting as a function avoids
   1463 * code duplication.
   1464 */
   1465static u32 *
   1466gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
   1467{
   1468	/* NB no one else is allowed to scribble over scratch + 256! */
   1469	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
   1470	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
   1471	*batch++ = intel_gt_scratch_offset(engine->gt,
   1472					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
   1473	*batch++ = 0;
   1474
   1475	*batch++ = MI_LOAD_REGISTER_IMM(1);
   1476	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
   1477	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
   1478
   1479	batch = gen8_emit_pipe_control(batch,
   1480				       PIPE_CONTROL_CS_STALL |
   1481				       PIPE_CONTROL_DC_FLUSH_ENABLE,
   1482				       0);
   1483
   1484	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
   1485	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
   1486	*batch++ = intel_gt_scratch_offset(engine->gt,
   1487					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
   1488	*batch++ = 0;
   1489
   1490	return batch;
   1491}
   1492
   1493/*
    1494 * Typically we only have one indirect_ctx and one per_ctx batch buffer, which are
   1495 * initialized at the beginning and shared across all contexts but this field
   1496 * helps us to have multiple batches at different offsets and select them based
    1497 * on given criteria. At the moment this batch always starts at the beginning of the page
   1498 * and at this point we don't have multiple wa_ctx batch buffers.
   1499 *
    1500 * The number of WAs applied is not known at the beginning; we use this field
    1501 * to return the number of DWORDs written.
   1502 *
   1503 * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
   1504 * so it adds NOOPs as padding to make it cacheline aligned.
    1505 * MI_BATCH_BUFFER_END will be added to the per-ctx batch, and both of them together
    1506 * make a complete batch buffer.
   1507 */
   1508static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
   1509{
   1510	/* WaDisableCtxRestoreArbitration:bdw,chv */
   1511	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
   1512
   1513	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
   1514	if (IS_BROADWELL(engine->i915))
   1515		batch = gen8_emit_flush_coherentl3_wa(engine, batch);
   1516
   1517	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
    1518	/* Actual scratch location is at a 128 byte offset */
   1519	batch = gen8_emit_pipe_control(batch,
   1520				       PIPE_CONTROL_FLUSH_L3 |
   1521				       PIPE_CONTROL_STORE_DATA_INDEX |
   1522				       PIPE_CONTROL_CS_STALL |
   1523				       PIPE_CONTROL_QW_WRITE,
   1524				       LRC_PPHWSP_SCRATCH_ADDR);
   1525
   1526	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
   1527
   1528	/* Pad to end of cacheline */
   1529	while ((unsigned long)batch % CACHELINE_BYTES)
   1530		*batch++ = MI_NOOP;
   1531
   1532	/*
   1533	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
   1534	 * execution depends on the length specified in terms of cache lines
    1535	 * in the register CTX_RCS_INDIRECT_CTX.
   1536	 */
   1537
   1538	return batch;
   1539}
   1540
   1541struct lri {
   1542	i915_reg_t reg;
   1543	u32 value;
   1544};
   1545
   1546static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
   1547{
   1548	GEM_BUG_ON(!count || count > 63);
   1549
   1550	*batch++ = MI_LOAD_REGISTER_IMM(count);
   1551	do {
   1552		*batch++ = i915_mmio_reg_offset(lri->reg);
   1553		*batch++ = lri->value;
   1554	} while (lri++, --count);
   1555	*batch++ = MI_NOOP;
   1556
   1557	return batch;
   1558}
   1559
   1560static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
   1561{
   1562	static const struct lri lri[] = {
   1563		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
   1564		{
   1565			COMMON_SLICE_CHICKEN2,
   1566			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
   1567				       0),
   1568		},
   1569
   1570		/* BSpec: 11391 */
   1571		{
   1572			FF_SLICE_CHICKEN,
   1573			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
   1574				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
   1575		},
   1576
   1577		/* BSpec: 11299 */
   1578		{
   1579			_3D_CHICKEN3,
   1580			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
   1581				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
   1582		}
   1583	};
   1584
   1585	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
   1586
   1587	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
   1588	batch = gen8_emit_flush_coherentl3_wa(engine, batch);
   1589
   1590	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
   1591	batch = gen8_emit_pipe_control(batch,
   1592				       PIPE_CONTROL_FLUSH_L3 |
   1593				       PIPE_CONTROL_STORE_DATA_INDEX |
   1594				       PIPE_CONTROL_CS_STALL |
   1595				       PIPE_CONTROL_QW_WRITE,
   1596				       LRC_PPHWSP_SCRATCH_ADDR);
   1597
   1598	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
   1599
   1600	/* WaMediaPoolStateCmdInWABB:bxt,glk */
   1601	if (HAS_POOLED_EU(engine->i915)) {
   1602		/*
    1603		 * EU pool configuration is set up along with the golden context
    1604		 * during context initialization. This value depends on the
    1605		 * device type (2x6 or 3x6) and needs to be updated based
    1606		 * on which subslice is disabled, especially for 2x6
    1607		 * devices; however, it is safe to load the default
    1608		 * configuration of a 3x6 device instead of masking off
    1609		 * corresponding bits because the HW ignores bits of a disabled
    1610		 * subslice and drops down to the appropriate config. Please
   1611		 * see render_state_setup() in i915_gem_render_state.c for
   1612		 * possible configurations, to avoid duplication they are
   1613		 * not shown here again.
   1614		 */
   1615		*batch++ = GEN9_MEDIA_POOL_STATE;
   1616		*batch++ = GEN9_MEDIA_POOL_ENABLE;
   1617		*batch++ = 0x00777000;
   1618		*batch++ = 0;
   1619		*batch++ = 0;
   1620		*batch++ = 0;
   1621	}
   1622
   1623	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
   1624
   1625	/* Pad to end of cacheline */
   1626	while ((unsigned long)batch % CACHELINE_BYTES)
   1627		*batch++ = MI_NOOP;
   1628
   1629	return batch;
   1630}
   1631
   1632#define CTX_WA_BB_SIZE (PAGE_SIZE)
   1633
   1634static int lrc_create_wa_ctx(struct intel_engine_cs *engine)
   1635{
   1636	struct drm_i915_gem_object *obj;
   1637	struct i915_vma *vma;
   1638	int err;
   1639
   1640	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_SIZE);
   1641	if (IS_ERR(obj))
   1642		return PTR_ERR(obj);
   1643
   1644	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
   1645	if (IS_ERR(vma)) {
   1646		err = PTR_ERR(vma);
   1647		goto err;
   1648	}
   1649
   1650	engine->wa_ctx.vma = vma;
   1651	return 0;
   1652
   1653err:
   1654	i915_gem_object_put(obj);
   1655	return err;
   1656}
   1657
   1658void lrc_fini_wa_ctx(struct intel_engine_cs *engine)
   1659{
   1660	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
   1661}
   1662
   1663typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
   1664
   1665void lrc_init_wa_ctx(struct intel_engine_cs *engine)
   1666{
   1667	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
   1668	struct i915_wa_ctx_bb *wa_bb[] = {
   1669		&wa_ctx->indirect_ctx, &wa_ctx->per_ctx
   1670	};
   1671	wa_bb_func_t wa_bb_fn[ARRAY_SIZE(wa_bb)];
   1672	struct i915_gem_ww_ctx ww;
   1673	void *batch, *batch_ptr;
   1674	unsigned int i;
   1675	int err;
   1676
   1677	if (!(engine->flags & I915_ENGINE_HAS_RCS_REG_STATE))
   1678		return;
   1679
   1680	switch (GRAPHICS_VER(engine->i915)) {
   1681	case 12:
   1682	case 11:
   1683		return;
   1684	case 9:
   1685		wa_bb_fn[0] = gen9_init_indirectctx_bb;
   1686		wa_bb_fn[1] = NULL;
   1687		break;
   1688	case 8:
   1689		wa_bb_fn[0] = gen8_init_indirectctx_bb;
   1690		wa_bb_fn[1] = NULL;
   1691		break;
   1692	default:
   1693		MISSING_CASE(GRAPHICS_VER(engine->i915));
   1694		return;
   1695	}
   1696
   1697	err = lrc_create_wa_ctx(engine);
   1698	if (err) {
   1699		/*
    1700		 * We continue even if we fail to initialize the WA batch
    1701		 * because we only expect rare glitches but nothing
    1702		 * critical enough to prevent us from using the GPU.
   1703		 */
   1704		drm_err(&engine->i915->drm,
   1705			"Ignoring context switch w/a allocation error:%d\n",
   1706			err);
   1707		return;
   1708	}
   1709
   1710	if (!engine->wa_ctx.vma)
   1711		return;
   1712
   1713	i915_gem_ww_ctx_init(&ww, true);
   1714retry:
   1715	err = i915_gem_object_lock(wa_ctx->vma->obj, &ww);
   1716	if (!err)
   1717		err = i915_ggtt_pin(wa_ctx->vma, &ww, 0, PIN_HIGH);
   1718	if (err)
   1719		goto err;
   1720
   1721	batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
   1722	if (IS_ERR(batch)) {
   1723		err = PTR_ERR(batch);
   1724		goto err_unpin;
   1725	}
   1726
   1727	/*
   1728	 * Emit the two workaround batch buffers, recording the offset from the
   1729	 * start of the workaround batch buffer object for each and their
   1730	 * respective sizes.
   1731	 */
   1732	batch_ptr = batch;
   1733	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
   1734		wa_bb[i]->offset = batch_ptr - batch;
   1735		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
   1736						  CACHELINE_BYTES))) {
   1737			err = -EINVAL;
   1738			break;
   1739		}
   1740		if (wa_bb_fn[i])
   1741			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
   1742		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
   1743	}
   1744	GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_SIZE);
   1745
   1746	__i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
   1747	__i915_gem_object_release_map(wa_ctx->vma->obj);
   1748
   1749	/* Verify that we can handle failure to setup the wa_ctx */
   1750	if (!err)
   1751		err = i915_inject_probe_error(engine->i915, -ENODEV);
   1752
   1753err_unpin:
   1754	if (err)
   1755		i915_vma_unpin(wa_ctx->vma);
   1756err:
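        	/*
        	 * -EDEADLK means the ww machinery detected a lock-order
        	 * inversion against another transaction: back off, dropping
        	 * our locks, and retry the whole sequence from the top.
        	 */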
   1757	if (err == -EDEADLK) {
   1758		err = i915_gem_ww_ctx_backoff(&ww);
   1759		if (!err)
   1760			goto retry;
   1761	}
   1762	i915_gem_ww_ctx_fini(&ww);
   1763
   1764	if (err) {
   1765		i915_vma_put(engine->wa_ctx.vma);
   1766
   1767		/* Clear all flags to prevent further use */
   1768		memset(wa_ctx, 0, sizeof(*wa_ctx));
   1769	}
   1770}
   1771
   1772static void st_runtime_underflow(struct intel_context_stats *stats, s32 dt)
   1773{
   1774#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
   1775	stats->runtime.num_underflow++;
   1776	stats->runtime.max_underflow =
   1777		max_t(u32, stats->runtime.max_underflow, -dt);
   1778#endif
   1779}
   1780
   1781static u32 lrc_get_runtime(const struct intel_context *ce)
   1782{
   1783	/*
   1784	 * We can use either ppHWSP[16] which is recorded before the context
   1785	 * switch (and so excludes the cost of context switches) or use the
   1786	 * value from the context image itself, which is saved/restored earlier
   1787	 * and so includes the cost of the save.
   1788	 */
   1789	return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
   1790}
   1791
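        /*
         * Fold the latest saved CTX_TIMESTAMP into the context's runtime
         * statistics. The counter is 32 bits wide, so the delta is taken in
         * 32-bit arithmetic and stays correct across ordinary counter
         * wraparound; a negative delta means the saved value itself went
         * backwards and is reported as an underflow rather than accumulated.
         */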
   1792void lrc_update_runtime(struct intel_context *ce)
   1793{
   1794	struct intel_context_stats *stats = &ce->stats;
   1795	u32 old;
   1796	s32 dt;
   1797
   1798	old = stats->runtime.last;
   1799	stats->runtime.last = lrc_get_runtime(ce);
   1800	dt = stats->runtime.last - old;
   1801	if (!dt)
   1802		return;
   1803
   1804	if (unlikely(dt < 0)) {
   1805		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
   1806			 old, stats->runtime.last, dt);
   1807		st_runtime_underflow(stats, dt);
   1808		return;
   1809	}
   1810
   1811	ewma_runtime_add(&stats->runtime.avg, dt);
   1812	stats->runtime.total += dt;
   1813}
   1814
   1815#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
   1816#include "selftest_lrc.c"
   1817#endif