cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

intel_migrate.c (28557B)


      1// SPDX-License-Identifier: MIT
      2/*
      3 * Copyright © 2020 Intel Corporation
      4 */
      5
      6#include "i915_drv.h"
      7#include "intel_context.h"
      8#include "intel_gpu_commands.h"
      9#include "intel_gt.h"
     10#include "intel_gtt.h"
     11#include "intel_migrate.h"
     12#include "intel_ring.h"
     13
     14struct insert_pte_data {
     15	u64 offset;
     16};
     17
     18#define CHUNK_SZ SZ_8M /* ~1ms at 8GiB/s preemption delay */
     19
     20#define GET_CCS_BYTES(i915, size)	(HAS_FLAT_CCS(i915) ? \
     21					 DIV_ROUND_UP(size, NUM_BYTES_PER_CCS_BYTE) : 0)
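
/*
 * Back-of-the-envelope numbers behind the two definitions above (an
 * illustrative sketch only): with CHUNK_SZ = 8 MiB and a blitter sustaining
 * roughly 8 GiB/s, one chunk takes about 8 MiB / 8 GiB/s ~= 1 ms, which
 * bounds how long arbitration stays disabled. Assuming NUM_BYTES_PER_CCS_BYTE
 * is 256, GET_CCS_BYTES() for a 64 MiB lmem object works out to
 * 64 MiB / 256 = 256 KiB of CCS state.
 */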
     22static bool engine_supports_migration(struct intel_engine_cs *engine)
     23{
     24	if (!engine)
     25		return false;
     26
     27	/*
      28	 * We need the ability to prevent arbitration (MI_ARB_ON_OFF),
      29	 * the ability to write PTEs using inline data (MI_STORE_DATA_IMM)
     30	 * and of course the ability to do the block transfer (blits).
     31	 */
     32	GEM_BUG_ON(engine->class != COPY_ENGINE_CLASS);
     33
     34	return true;
     35}
     36
     37static void xehpsdv_toggle_pdes(struct i915_address_space *vm,
     38				struct i915_page_table *pt,
     39				void *data)
     40{
     41	struct insert_pte_data *d = data;
     42
     43	/*
     44	 * Insert a dummy PTE into every PT that will map to LMEM to ensure
      45	 * we have a correctly set up PDE structure for later use.
     46	 */
     47	vm->insert_page(vm, 0, d->offset, I915_CACHE_NONE, PTE_LM);
     48	GEM_BUG_ON(!pt->is_compact);
     49	d->offset += SZ_2M;
     50}
     51
     52static void xehpsdv_insert_pte(struct i915_address_space *vm,
     53			       struct i915_page_table *pt,
     54			       void *data)
     55{
     56	struct insert_pte_data *d = data;
     57
     58	/*
     59	 * We are playing tricks here, since the actual pt, from the hw
      60	 * pov, is only 256 bytes with 32 entries, or 4096 bytes with 512
     61	 * entries, but we are still guaranteed that the physical
     62	 * alignment is 64K underneath for the pt, and we are careful
     63	 * not to access the space in the void.
     64	 */
     65	vm->insert_page(vm, px_dma(pt), d->offset, I915_CACHE_NONE, PTE_LM);
     66	d->offset += SZ_64K;
     67}
     68
     69static void insert_pte(struct i915_address_space *vm,
     70		       struct i915_page_table *pt,
     71		       void *data)
     72{
     73	struct insert_pte_data *d = data;
     74
     75	vm->insert_page(vm, px_dma(pt), d->offset, I915_CACHE_NONE,
     76			i915_gem_object_is_lmem(pt->base) ? PTE_LM : 0);
     77	d->offset += PAGE_SIZE;
     78}
     79
     80static struct i915_address_space *migrate_vm(struct intel_gt *gt)
     81{
     82	struct i915_vm_pt_stash stash = {};
     83	struct i915_ppgtt *vm;
     84	int err;
     85	int i;
     86
     87	/*
      88	 * We construct a very special VM for use by all migration contexts;
     89	 * it is kept pinned so that it can be used at any time. As we need
     90	 * to pre-allocate the page directories for the migration VM, this
     91	 * limits us to only using a small number of prepared vma.
     92	 *
     93	 * To be able to pipeline and reschedule migration operations while
     94	 * avoiding unnecessary contention on the vm itself, the PTE updates
     95	 * are inline with the blits. All the blits use the same fixed
     96	 * addresses, with the backing store redirection being updated on the
     97	 * fly. Only 2 implicit vma are used for all migration operations.
     98	 *
     99	 * We lay the ppGTT out as:
    100	 *
    101	 *	[0, CHUNK_SZ) -> first object
    102	 *	[CHUNK_SZ, 2 * CHUNK_SZ) -> second object
    103	 *	[2 * CHUNK_SZ, 2 * CHUNK_SZ + 2 * CHUNK_SZ >> 9] -> PTE
    104	 *
    105	 * By exposing the dma addresses of the page directories themselves
    106	 * within the ppGTT, we are then able to rewrite the PTE prior to use.
    107	 * But the PTE update and subsequent migration operation must be atomic,
    108	 * i.e. within the same non-preemptible window so that we do not switch
    109	 * to another migration context that overwrites the PTE.
    110	 *
    111	 * This changes quite a bit on platforms with HAS_64K_PAGES support,
     112	 * where we instead have three windows, each CHUNK_SZ in size. The
     113	 * first is reserved for mapping system memory, and that just uses the
     114	 * 512-entry layout with 4K GTT pages. The other two windows just map
     115	 * lmem pages and must use the new compact 32-entry layout with 64K GTT
     116	 * pages, which ensures we can address any lmem object that the user
     117	 * throws at us. We then also use xehpsdv_toggle_pdes as a way of
     118	 * just toggling the PDE bit (GEN12_PDE_64K) for us, to enable the
     119	 * compact layout for each of the page-tables that fall within the
     120	 * [CHUNK_SZ, 3 * CHUNK_SZ) range.
    121	 *
    122	 * We lay the ppGTT out as:
    123	 *
    124	 * [0, CHUNK_SZ) -> first window/object, maps smem
    125	 * [CHUNK_SZ, 2 * CHUNK_SZ) -> second window/object, maps lmem src
    126	 * [2 * CHUNK_SZ, 3 * CHUNK_SZ) -> third window/object, maps lmem dst
    127	 *
    128	 * For the PTE window it's also quite different, since each PTE must
     129	 * point to some 64K page, one for each PT (since it's in lmem), and yet
     130	 * each is only <= 4096 bytes, but since the unused space within that PTE
    131	 * range is never touched, this should be fine.
    132	 *
    133	 * So basically each PT now needs 64K of virtual memory, instead of 4K,
    134	 * which looks like:
    135	 *
    136	 * [3 * CHUNK_SZ, 3 * CHUNK_SZ + ((3 * CHUNK_SZ / SZ_2M) * SZ_64K)] -> PTE
    137	 */
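	/*
	 * A concrete sketch of the layout above with CHUNK_SZ = 8 MiB
	 * (illustrative numbers only), per engine instance at base = i << 32:
	 *
	 * legacy (4K GTT pages):
	 *	[base + 0M,  base + 8M)  -> first object window
	 *	[base + 8M,  base + 16M) -> second object window
	 *	[base + 16M, base + 16M + 32K) -> PTE window
	 *	(16 MiB of VA needs 16M / 4K = 4096 PTEs = 32 KiB of qwords)
	 *
	 * HAS_64K_PAGES:
	 *	[base + 0M,  base + 8M)  -> smem window
	 *	[base + 8M,  base + 16M) -> lmem src window
	 *	[base + 16M, base + 24M) -> lmem dst window
	 *	[base + 24M, base + 24M + 768K) -> PTE window
	 *	(24 MiB spans 24M / 2M = 12 PTs, each given 64 KiB of VA)
	 */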
    138
    139	vm = i915_ppgtt_create(gt, I915_BO_ALLOC_PM_EARLY);
    140	if (IS_ERR(vm))
    141		return ERR_CAST(vm);
    142
    143	if (!vm->vm.allocate_va_range || !vm->vm.foreach) {
    144		err = -ENODEV;
    145		goto err_vm;
    146	}
    147
    148	if (HAS_64K_PAGES(gt->i915))
    149		stash.pt_sz = I915_GTT_PAGE_SIZE_64K;
    150
    151	/*
    152	 * Each engine instance is assigned its own chunk in the VM, so
    153	 * that we can run multiple instances concurrently
    154	 */
    155	for (i = 0; i < ARRAY_SIZE(gt->engine_class[COPY_ENGINE_CLASS]); i++) {
    156		struct intel_engine_cs *engine;
    157		u64 base = (u64)i << 32;
    158		struct insert_pte_data d = {};
    159		struct i915_gem_ww_ctx ww;
    160		u64 sz;
    161
    162		engine = gt->engine_class[COPY_ENGINE_CLASS][i];
    163		if (!engine_supports_migration(engine))
    164			continue;
    165
    166		/*
    167		 * We copy in 8MiB chunks. Each PDE covers 2MiB, so we need
    168		 * 4x2 page directories for source/destination.
    169		 */
    170		if (HAS_64K_PAGES(gt->i915))
    171			sz = 3 * CHUNK_SZ;
    172		else
    173			sz = 2 * CHUNK_SZ;
    174		d.offset = base + sz;
    175
    176		/*
    177		 * We need another page directory setup so that we can write
     178	 * the 8x512 PTEs in each chunk.
    179		 */
    180		if (HAS_64K_PAGES(gt->i915))
    181			sz += (sz / SZ_2M) * SZ_64K;
    182		else
    183			sz += (sz >> 12) * sizeof(u64);
    184
    185		err = i915_vm_alloc_pt_stash(&vm->vm, &stash, sz);
    186		if (err)
    187			goto err_vm;
    188
    189		for_i915_gem_ww(&ww, err, true) {
    190			err = i915_vm_lock_objects(&vm->vm, &ww);
    191			if (err)
    192				continue;
    193			err = i915_vm_map_pt_stash(&vm->vm, &stash);
    194			if (err)
    195				continue;
    196
    197			vm->vm.allocate_va_range(&vm->vm, &stash, base, sz);
    198		}
    199		i915_vm_free_pt_stash(&vm->vm, &stash);
    200		if (err)
    201			goto err_vm;
    202
    203		/* Now allow the GPU to rewrite the PTE via its own ppGTT */
    204		if (HAS_64K_PAGES(gt->i915)) {
    205			vm->vm.foreach(&vm->vm, base, d.offset - base,
    206				       xehpsdv_insert_pte, &d);
    207			d.offset = base + CHUNK_SZ;
    208			vm->vm.foreach(&vm->vm,
    209				       d.offset,
    210				       2 * CHUNK_SZ,
    211				       xehpsdv_toggle_pdes, &d);
    212		} else {
    213			vm->vm.foreach(&vm->vm, base, d.offset - base,
    214				       insert_pte, &d);
    215		}
    216	}
    217
    218	return &vm->vm;
    219
    220err_vm:
    221	i915_vm_put(&vm->vm);
    222	return ERR_PTR(err);
    223}
    224
    225static struct intel_engine_cs *first_copy_engine(struct intel_gt *gt)
    226{
    227	struct intel_engine_cs *engine;
    228	int i;
    229
    230	for (i = 0; i < ARRAY_SIZE(gt->engine_class[COPY_ENGINE_CLASS]); i++) {
    231		engine = gt->engine_class[COPY_ENGINE_CLASS][i];
    232		if (engine_supports_migration(engine))
    233			return engine;
    234	}
    235
    236	return NULL;
    237}
    238
    239static struct intel_context *pinned_context(struct intel_gt *gt)
    240{
    241	static struct lock_class_key key;
    242	struct intel_engine_cs *engine;
    243	struct i915_address_space *vm;
    244	struct intel_context *ce;
    245
    246	engine = first_copy_engine(gt);
    247	if (!engine)
    248		return ERR_PTR(-ENODEV);
    249
    250	vm = migrate_vm(gt);
    251	if (IS_ERR(vm))
    252		return ERR_CAST(vm);
    253
    254	ce = intel_engine_create_pinned_context(engine, vm, SZ_512K,
    255						I915_GEM_HWS_MIGRATE,
    256						&key, "migrate");
    257	i915_vm_put(vm);
    258	return ce;
    259}
    260
    261int intel_migrate_init(struct intel_migrate *m, struct intel_gt *gt)
    262{
    263	struct intel_context *ce;
    264
    265	memset(m, 0, sizeof(*m));
    266
    267	ce = pinned_context(gt);
    268	if (IS_ERR(ce))
    269		return PTR_ERR(ce);
    270
    271	m->context = ce;
    272	return 0;
    273}
    274
    275static int random_index(unsigned int max)
    276{
    277	return upper_32_bits(mul_u32_u32(get_random_u32(), max));
    278}
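
/*
 * random_index() maps a uniform 32-bit random value into [0, max) using a
 * widening multiply plus shift rather than a modulo. Illustrative arithmetic
 * only: get_random_u32() == 0x80000000 with max == 4 gives
 * upper_32_bits(0x80000000ULL * 4) == 2, i.e. the middle of the range.
 */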
    279
    280static struct intel_context *__migrate_engines(struct intel_gt *gt)
    281{
    282	struct intel_engine_cs *engines[MAX_ENGINE_INSTANCE];
    283	struct intel_engine_cs *engine;
    284	unsigned int count, i;
    285
    286	count = 0;
    287	for (i = 0; i < ARRAY_SIZE(gt->engine_class[COPY_ENGINE_CLASS]); i++) {
    288		engine = gt->engine_class[COPY_ENGINE_CLASS][i];
    289		if (engine_supports_migration(engine))
    290			engines[count++] = engine;
    291	}
    292
    293	return intel_context_create(engines[random_index(count)]);
    294}
    295
    296struct intel_context *intel_migrate_create_context(struct intel_migrate *m)
    297{
    298	struct intel_context *ce;
    299
    300	/*
     301	 * We randomly distribute contexts across the engines upon construction,
    302	 * as they all share the same pinned vm, and so in order to allow
    303	 * multiple blits to run in parallel, we must construct each blit
    304	 * to use a different range of the vm for its GTT. This has to be
     305	 * known at construction, so we cannot use the late greedy load
    306	 * balancing of the virtual-engine.
    307	 */
    308	ce = __migrate_engines(m->context->engine->gt);
    309	if (IS_ERR(ce))
    310		return ce;
    311
    312	ce->ring = NULL;
    313	ce->ring_size = SZ_256K;
    314
    315	i915_vm_put(ce->vm);
    316	ce->vm = i915_vm_get(m->context->vm);
    317
    318	return ce;
    319}
    320
    321static inline struct sgt_dma sg_sgt(struct scatterlist *sg)
    322{
    323	dma_addr_t addr = sg_dma_address(sg);
    324
    325	return (struct sgt_dma){ sg, addr, addr + sg_dma_len(sg) };
    326}
    327
    328static int emit_no_arbitration(struct i915_request *rq)
    329{
    330	u32 *cs;
    331
    332	cs = intel_ring_begin(rq, 2);
    333	if (IS_ERR(cs))
    334		return PTR_ERR(cs);
    335
    336	/* Explicitly disable preemption for this request. */
    337	*cs++ = MI_ARB_ON_OFF;
    338	*cs++ = MI_NOOP;
    339	intel_ring_advance(rq, cs);
    340
    341	return 0;
    342}
    343
    344static int emit_pte(struct i915_request *rq,
    345		    struct sgt_dma *it,
    346		    enum i915_cache_level cache_level,
    347		    bool is_lmem,
    348		    u64 offset,
    349		    int length)
    350{
    351	bool has_64K_pages = HAS_64K_PAGES(rq->engine->i915);
    352	const u64 encode = rq->context->vm->pte_encode(0, cache_level,
    353						       is_lmem ? PTE_LM : 0);
    354	struct intel_ring *ring = rq->ring;
    355	int pkt, dword_length;
    356	u32 total = 0;
    357	u32 page_size;
    358	u32 *hdr, *cs;
    359
    360	GEM_BUG_ON(GRAPHICS_VER(rq->engine->i915) < 8);
    361
    362	page_size = I915_GTT_PAGE_SIZE;
    363	dword_length = 0x400;
    364
    365	/* Compute the page directory offset for the target address range */
    366	if (has_64K_pages) {
    367		GEM_BUG_ON(!IS_ALIGNED(offset, SZ_2M));
    368
    369		offset /= SZ_2M;
    370		offset *= SZ_64K;
    371		offset += 3 * CHUNK_SZ;
    372
    373		if (is_lmem) {
    374			page_size = I915_GTT_PAGE_SIZE_64K;
    375			dword_length = 0x40;
    376		}
    377	} else {
    378		offset >>= 12;
    379		offset *= sizeof(u64);
    380		offset += 2 * CHUNK_SZ;
    381	}
    382
    383	offset += (u64)rq->engine->instance << 32;
    384
    385	cs = intel_ring_begin(rq, 6);
    386	if (IS_ERR(cs))
    387		return PTR_ERR(cs);
    388
    389	/* Pack as many PTE updates as possible into a single MI command */
    390	pkt = min_t(int, dword_length, ring->space / sizeof(u32) + 5);
    391	pkt = min_t(int, pkt, (ring->size - ring->emit) / sizeof(u32) + 5);
    392
    393	hdr = cs;
    394	*cs++ = MI_STORE_DATA_IMM | REG_BIT(21); /* as qword elements */
    395	*cs++ = lower_32_bits(offset);
    396	*cs++ = upper_32_bits(offset);
    397
    398	do {
    399		if (cs - hdr >= pkt) {
    400			int dword_rem;
    401
    402			*hdr += cs - hdr - 2;
    403			*cs++ = MI_NOOP;
    404
    405			ring->emit = (void *)cs - ring->vaddr;
    406			intel_ring_advance(rq, cs);
    407			intel_ring_update_space(ring);
    408
    409			cs = intel_ring_begin(rq, 6);
    410			if (IS_ERR(cs))
    411				return PTR_ERR(cs);
    412
    413			dword_rem = dword_length;
    414			if (has_64K_pages) {
    415				if (IS_ALIGNED(total, SZ_2M)) {
    416					offset = round_up(offset, SZ_64K);
    417				} else {
    418					dword_rem = SZ_2M - (total & (SZ_2M - 1));
    419					dword_rem /= page_size;
    420					dword_rem *= 2;
    421				}
    422			}
    423
    424			pkt = min_t(int, dword_rem, ring->space / sizeof(u32) + 5);
    425			pkt = min_t(int, pkt, (ring->size - ring->emit) / sizeof(u32) + 5);
    426
    427			hdr = cs;
    428			*cs++ = MI_STORE_DATA_IMM | REG_BIT(21);
    429			*cs++ = lower_32_bits(offset);
    430			*cs++ = upper_32_bits(offset);
    431		}
    432
    433		GEM_BUG_ON(!IS_ALIGNED(it->dma, page_size));
    434
    435		*cs++ = lower_32_bits(encode | it->dma);
    436		*cs++ = upper_32_bits(encode | it->dma);
    437
    438		offset += 8;
    439		total += page_size;
    440
    441		it->dma += page_size;
    442		if (it->dma >= it->max) {
    443			it->sg = __sg_next(it->sg);
    444			if (!it->sg || sg_dma_len(it->sg) == 0)
    445				break;
    446
    447			it->dma = sg_dma_address(it->sg);
    448			it->max = it->dma + sg_dma_len(it->sg);
    449		}
    450	} while (total < length);
    451
    452	*hdr += cs - hdr - 2;
    453	*cs++ = MI_NOOP;
    454
    455	ring->emit = (void *)cs - ring->vaddr;
    456	intel_ring_advance(rq, cs);
    457	intel_ring_update_space(ring);
    458
    459	return total;
    460}
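
/*
 * Worked example of the offset math in emit_pte() (illustrative numbers,
 * legacy !has_64K_pages path): for the destination window at
 * offset = CHUNK_SZ = 8 MiB, we get 8M >> 12 = 2048 PTE slots,
 * 2048 * sizeof(u64) = 16 KiB, plus 2 * CHUNK_SZ, so the qwords land 16 KiB
 * into the per-engine PTE window that migrate_vm() placed at
 * [2 * CHUNK_SZ, 2 * CHUNK_SZ + 32K), before the engine instance is folded
 * into the upper 32 bits.
 */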
    461
    462static bool wa_1209644611_applies(int ver, u32 size)
    463{
    464	u32 height = size >> PAGE_SHIFT;
    465
    466	if (ver != 11)
    467		return false;
    468
    469	return height % 4 == 3 && height <= 8;
    470}
    471
    472/**
    473 * DOC: Flat-CCS - Memory compression for Local memory
    474 *
    475 * On Xe-HP and later devices, we use dedicated compression control state (CCS)
    476 * stored in local memory for each surface, to support the 3D and media
    477 * compression formats.
    478 *
    479 * The memory required for the CCS of the entire local memory is 1/256 of the
     480 * local memory size. So before the kernel boots, the required memory is reserved
     481 * for the CCS data and a secure register is programmed with the CCS base
    482 * address.
    483 *
     484 * Flat CCS data needs to be cleared when an lmem object is allocated,
     485 * and CCS data can be copied in and out of the CCS region through
     486 * XY_CTRL_SURF_COPY_BLT. The CPU can't access the CCS data directly.
    487 *
     488 * i915 supports Flat-CCS on lmem-only objects. When an object has smem in
     489 * its preference list, i915 needs to migrate the lmem content into smem on
     490 * memory pressure. If the lmem object is Flat-CCS compressed by userspace,
     491 * then i915 would need to decompress it, but it lacks the required information
     492 * for such decompression. Hence i915 supports Flat-CCS only on lmem-only objects.
    493 *
    494 * When we exhaust the lmem, Flat-CCS capable objects' lmem backing memory can
    495 * be temporarily evicted to smem, along with the auxiliary CCS state, where
     496 * it can potentially be swapped out at a later point, if required.
    497 * If userspace later touches the evicted pages, then we always move
    498 * the backing memory back to lmem, which includes restoring the saved CCS state,
    499 * and potentially performing any required swap-in.
    500 *
     501 * For the migration of lmem objects with smem in the placement list, such as
     502 * {lmem, smem}, the objects are treated as non-Flat-CCS-capable objects.
    503 */
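
/*
 * A minimal sketch of the 1/256 sizing rule described above, using the
 * GET_CCS_BYTES() helper from this file. The helper name and the example
 * object size are hypothetical, for illustration only.
 */
static inline u32 example_ccs_footprint(struct drm_i915_private *i915)
{
	/* e.g. a 64 MiB lmem surface needs 64 MiB / 256 = 256 KiB of CCS */
	return GET_CCS_BYTES(i915, SZ_64M);
}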
    504
    505static inline u32 *i915_flush_dw(u32 *cmd, u32 flags)
    506{
    507	*cmd++ = MI_FLUSH_DW | flags;
    508	*cmd++ = 0;
    509	*cmd++ = 0;
    510
    511	return cmd;
    512}
    513
    514static u32 calc_ctrl_surf_instr_size(struct drm_i915_private *i915, int size)
    515{
    516	u32 num_cmds, num_blks, total_size;
    517
    518	if (!GET_CCS_BYTES(i915, size))
    519		return 0;
    520
    521	/*
    522	 * XY_CTRL_SURF_COPY_BLT transfers CCS in 256 byte
    523	 * blocks. one XY_CTRL_SURF_COPY_BLT command can
    524	 * transfer upto 1024 blocks.
    525	 */
    526	num_blks = DIV_ROUND_UP(GET_CCS_BYTES(i915, size),
    527				NUM_CCS_BYTES_PER_BLOCK);
    528	num_cmds = DIV_ROUND_UP(num_blks, NUM_CCS_BLKS_PER_XFER);
    529	total_size = XY_CTRL_SURF_INSTR_SIZE * num_cmds;
    530
    531	/*
    532	 * Adding a flush before and after XY_CTRL_SURF_COPY_BLT
    533	 */
    534	total_size += 2 * MI_FLUSH_DW_SIZE;
    535
    536	return total_size;
    537}
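
/*
 * Illustrative numbers for the sizing above, assuming 256-byte CCS blocks
 * (NUM_CCS_BYTES_PER_BLOCK) and up to 1024 blocks per transfer
 * (NUM_CCS_BLKS_PER_XFER): one CHUNK_SZ (8 MiB) of lmem carries
 * 8 MiB / 256 = 32 KiB of CCS, i.e. 128 blocks, which fits in a single
 * XY_CTRL_SURF_COPY_BLT, so the ring space required is one
 * XY_CTRL_SURF_INSTR_SIZE plus the two surrounding MI_FLUSH_DW packets.
 */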
    538
    539static int emit_copy_ccs(struct i915_request *rq,
    540			 u32 dst_offset, u8 dst_access,
    541			 u32 src_offset, u8 src_access, int size)
    542{
    543	struct drm_i915_private *i915 = rq->engine->i915;
    544	int mocs = rq->engine->gt->mocs.uc_index << 1;
    545	u32 num_ccs_blks, ccs_ring_size;
    546	u32 *cs;
    547
    548	ccs_ring_size = calc_ctrl_surf_instr_size(i915, size);
    549	WARN_ON(!ccs_ring_size);
    550
    551	cs = intel_ring_begin(rq, round_up(ccs_ring_size, 2));
    552	if (IS_ERR(cs))
    553		return PTR_ERR(cs);
    554
    555	num_ccs_blks = DIV_ROUND_UP(GET_CCS_BYTES(i915, size),
    556				    NUM_CCS_BYTES_PER_BLOCK);
    557	GEM_BUG_ON(num_ccs_blks > NUM_CCS_BLKS_PER_XFER);
    558	cs = i915_flush_dw(cs, MI_FLUSH_DW_LLC | MI_FLUSH_DW_CCS);
    559
    560	/*
    561	 * The XY_CTRL_SURF_COPY_BLT instruction is used to copy the CCS
    562	 * data in and out of the CCS region.
    563	 *
    564	 * We can copy at most 1024 blocks of 256 bytes using one
    565	 * XY_CTRL_SURF_COPY_BLT instruction.
    566	 *
    567	 * In case we need to copy more than 1024 blocks, we need to add
    568	 * another instruction to the same batch buffer.
    569	 *
     570	 * 1024 blocks of 256 bytes of CCS represent a total of 256 KB of CCS.
    571	 *
    572	 * 256 KB of CCS represents 256 * 256 KB = 64 MB of LMEM.
    573	 */
    574	*cs++ = XY_CTRL_SURF_COPY_BLT |
    575		src_access << SRC_ACCESS_TYPE_SHIFT |
    576		dst_access << DST_ACCESS_TYPE_SHIFT |
    577		((num_ccs_blks - 1) & CCS_SIZE_MASK) << CCS_SIZE_SHIFT;
    578	*cs++ = src_offset;
    579	*cs++ = rq->engine->instance |
    580		FIELD_PREP(XY_CTRL_SURF_MOCS_MASK, mocs);
    581	*cs++ = dst_offset;
    582	*cs++ = rq->engine->instance |
    583		FIELD_PREP(XY_CTRL_SURF_MOCS_MASK, mocs);
    584
    585	cs = i915_flush_dw(cs, MI_FLUSH_DW_LLC | MI_FLUSH_DW_CCS);
    586	if (ccs_ring_size & 1)
    587		*cs++ = MI_NOOP;
    588
    589	intel_ring_advance(rq, cs);
    590
    591	return 0;
    592}
    593
    594static int emit_copy(struct i915_request *rq,
    595		     u32 dst_offset, u32 src_offset, int size)
    596{
    597	const int ver = GRAPHICS_VER(rq->engine->i915);
    598	u32 instance = rq->engine->instance;
    599	u32 *cs;
    600
    601	cs = intel_ring_begin(rq, ver >= 8 ? 10 : 6);
    602	if (IS_ERR(cs))
    603		return PTR_ERR(cs);
    604
    605	if (ver >= 9 && !wa_1209644611_applies(ver, size)) {
    606		*cs++ = GEN9_XY_FAST_COPY_BLT_CMD | (10 - 2);
    607		*cs++ = BLT_DEPTH_32 | PAGE_SIZE;
    608		*cs++ = 0;
    609		*cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
    610		*cs++ = dst_offset;
    611		*cs++ = instance;
    612		*cs++ = 0;
    613		*cs++ = PAGE_SIZE;
    614		*cs++ = src_offset;
    615		*cs++ = instance;
    616	} else if (ver >= 8) {
    617		*cs++ = XY_SRC_COPY_BLT_CMD | BLT_WRITE_RGBA | (10 - 2);
    618		*cs++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | PAGE_SIZE;
    619		*cs++ = 0;
    620		*cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
    621		*cs++ = dst_offset;
    622		*cs++ = instance;
    623		*cs++ = 0;
    624		*cs++ = PAGE_SIZE;
    625		*cs++ = src_offset;
    626		*cs++ = instance;
    627	} else {
    628		GEM_BUG_ON(instance);
    629		*cs++ = SRC_COPY_BLT_CMD | BLT_WRITE_RGBA | (6 - 2);
    630		*cs++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | PAGE_SIZE;
    631		*cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE;
    632		*cs++ = dst_offset;
    633		*cs++ = PAGE_SIZE;
    634		*cs++ = src_offset;
    635	}
    636
    637	intel_ring_advance(rq, cs);
    638	return 0;
    639}
    640
    641static int scatter_list_length(struct scatterlist *sg)
    642{
    643	int len = 0;
    644
    645	while (sg && sg_dma_len(sg)) {
    646		len += sg_dma_len(sg);
    647		sg = sg_next(sg);
     648	}
    649
    650	return len;
    651}
    652
    653static void
    654calculate_chunk_sz(struct drm_i915_private *i915, bool src_is_lmem,
    655		   int *src_sz, u32 bytes_to_cpy, u32 ccs_bytes_to_cpy)
    656{
    657	if (ccs_bytes_to_cpy) {
    658		if (!src_is_lmem)
    659			/*
     660			 * When CHUNK_SZ is passed, all the pages up to CHUNK_SZ
     661			 * will be taken for the blt. On Flat-CCS supported
     662			 * platforms the smem object will have more pages than
     663			 * required for main memory, hence limit it to the
     664			 * required size for main memory.
    665			 */
    666			*src_sz = min_t(int, bytes_to_cpy, CHUNK_SZ);
    667	} else { /* ccs handling is not required */
    668		*src_sz = CHUNK_SZ;
    669	}
    670}
    671
    672static void get_ccs_sg_sgt(struct sgt_dma *it, u32 bytes_to_cpy)
    673{
    674	u32 len;
    675
    676	do {
    677		GEM_BUG_ON(!it->sg || !sg_dma_len(it->sg));
    678		len = it->max - it->dma;
    679		if (len > bytes_to_cpy) {
    680			it->dma += bytes_to_cpy;
    681			break;
    682		}
    683
    684		bytes_to_cpy -= len;
    685
    686		it->sg = __sg_next(it->sg);
    687		it->dma = sg_dma_address(it->sg);
    688		it->max = it->dma + sg_dma_len(it->sg);
    689	} while (bytes_to_cpy);
    690}
    691
    692int
    693intel_context_migrate_copy(struct intel_context *ce,
    694			   const struct i915_deps *deps,
    695			   struct scatterlist *src,
    696			   enum i915_cache_level src_cache_level,
    697			   bool src_is_lmem,
    698			   struct scatterlist *dst,
    699			   enum i915_cache_level dst_cache_level,
    700			   bool dst_is_lmem,
    701			   struct i915_request **out)
    702{
    703	struct sgt_dma it_src = sg_sgt(src), it_dst = sg_sgt(dst), it_ccs;
    704	struct drm_i915_private *i915 = ce->engine->i915;
    705	u32 ccs_bytes_to_cpy = 0, bytes_to_cpy;
    706	enum i915_cache_level ccs_cache_level;
    707	u32 src_offset, dst_offset;
    708	u8 src_access, dst_access;
    709	struct i915_request *rq;
    710	int src_sz, dst_sz;
    711	bool ccs_is_src;
    712	int err;
    713
    714	GEM_BUG_ON(ce->vm != ce->engine->gt->migrate.context->vm);
    715	GEM_BUG_ON(IS_DGFX(ce->engine->i915) && (!src_is_lmem && !dst_is_lmem));
    716	*out = NULL;
    717
    718	GEM_BUG_ON(ce->ring->size < SZ_64K);
    719
    720	src_sz = scatter_list_length(src);
    721	bytes_to_cpy = src_sz;
    722
    723	if (HAS_FLAT_CCS(i915) && src_is_lmem ^ dst_is_lmem) {
    724		src_access = !src_is_lmem && dst_is_lmem;
    725		dst_access = !src_access;
    726
    727		dst_sz = scatter_list_length(dst);
    728		if (src_is_lmem) {
    729			it_ccs = it_dst;
    730			ccs_cache_level = dst_cache_level;
    731			ccs_is_src = false;
    732		} else if (dst_is_lmem) {
    733			bytes_to_cpy = dst_sz;
    734			it_ccs = it_src;
    735			ccs_cache_level = src_cache_level;
    736			ccs_is_src = true;
    737		}
    738
    739		/*
     740		 * When an eviction needs the CCS data preserved, smem will have
     741		 * the extra pages for the CCS data.
     742		 *
     743		 * TODO: Want to move the size mismatch check to a WARN_ON,
     744		 * but we still have some smem->lmem requests with the same size.
     745		 * Need to fix it.
    746		 */
    747		ccs_bytes_to_cpy = src_sz != dst_sz ? GET_CCS_BYTES(i915, bytes_to_cpy) : 0;
    748		if (ccs_bytes_to_cpy)
    749			get_ccs_sg_sgt(&it_ccs, bytes_to_cpy);
    750	}
    751
    752	src_offset = 0;
    753	dst_offset = CHUNK_SZ;
    754	if (HAS_64K_PAGES(ce->engine->i915)) {
    755		src_offset = 0;
    756		dst_offset = 0;
    757		if (src_is_lmem)
    758			src_offset = CHUNK_SZ;
    759		if (dst_is_lmem)
    760			dst_offset = 2 * CHUNK_SZ;
    761	}
    762
    763	do {
    764		int len;
    765
    766		rq = i915_request_create(ce);
    767		if (IS_ERR(rq)) {
    768			err = PTR_ERR(rq);
    769			goto out_ce;
    770		}
    771
    772		if (deps) {
    773			err = i915_request_await_deps(rq, deps);
    774			if (err)
    775				goto out_rq;
    776
    777			if (rq->engine->emit_init_breadcrumb) {
    778				err = rq->engine->emit_init_breadcrumb(rq);
    779				if (err)
    780					goto out_rq;
    781			}
    782
    783			deps = NULL;
    784		}
    785
    786		/* The PTE updates + copy must not be interrupted. */
    787		err = emit_no_arbitration(rq);
    788		if (err)
    789			goto out_rq;
    790
    791		calculate_chunk_sz(i915, src_is_lmem, &src_sz,
    792				   bytes_to_cpy, ccs_bytes_to_cpy);
    793
    794		len = emit_pte(rq, &it_src, src_cache_level, src_is_lmem,
    795			       src_offset, src_sz);
    796		if (!len) {
    797			err = -EINVAL;
    798			goto out_rq;
    799		}
    800		if (len < 0) {
    801			err = len;
    802			goto out_rq;
    803		}
    804
    805		err = emit_pte(rq, &it_dst, dst_cache_level, dst_is_lmem,
    806			       dst_offset, len);
    807		if (err < 0)
    808			goto out_rq;
    809		if (err < len) {
    810			err = -EINVAL;
    811			goto out_rq;
    812		}
    813
    814		err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
    815		if (err)
    816			goto out_rq;
    817
    818		err = emit_copy(rq, dst_offset,	src_offset, len);
    819		if (err)
    820			goto out_rq;
    821
    822		bytes_to_cpy -= len;
    823
    824		if (ccs_bytes_to_cpy) {
    825			int ccs_sz;
    826
    827			err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
    828			if (err)
    829				goto out_rq;
    830
    831			ccs_sz = GET_CCS_BYTES(i915, len);
    832			err = emit_pte(rq, &it_ccs, ccs_cache_level, false,
    833				       ccs_is_src ? src_offset : dst_offset,
    834				       ccs_sz);
    835			if (err < 0)
    836				goto out_rq;
    837			if (err < ccs_sz) {
    838				err = -EINVAL;
    839				goto out_rq;
    840			}
    841
    842			err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
    843			if (err)
    844				goto out_rq;
    845
    846			err = emit_copy_ccs(rq, dst_offset, dst_access,
    847					    src_offset, src_access, len);
    848			if (err)
    849				goto out_rq;
    850
    851			err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
    852			if (err)
    853				goto out_rq;
    854			ccs_bytes_to_cpy -= ccs_sz;
    855		}
    856
    857		/* Arbitration is re-enabled between requests. */
    858out_rq:
    859		if (*out)
    860			i915_request_put(*out);
    861		*out = i915_request_get(rq);
    862		i915_request_add(rq);
    863
    864		if (err)
    865			break;
    866
    867		if (!bytes_to_cpy && !ccs_bytes_to_cpy) {
    868			if (src_is_lmem)
    869				WARN_ON(it_src.sg && sg_dma_len(it_src.sg));
    870			else
    871				WARN_ON(it_dst.sg && sg_dma_len(it_dst.sg));
    872			break;
    873		}
    874
    875		if (WARN_ON(!it_src.sg || !sg_dma_len(it_src.sg) ||
    876			    !it_dst.sg || !sg_dma_len(it_dst.sg) ||
    877			    (ccs_bytes_to_cpy && (!it_ccs.sg ||
    878						  !sg_dma_len(it_ccs.sg))))) {
    879			err = -EINVAL;
    880			break;
    881		}
    882
    883		cond_resched();
    884	} while (1);
    885
    886out_ce:
    887	return err;
    888}
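
/*
 * A minimal, hypothetical caller sketch for the copy helper above: it issues
 * the full request chain and waits on the last request returned through @out.
 * The function name and the cache levels used are illustrative only.
 */
static int example_copy_sync(struct intel_context *ce,
			     struct scatterlist *src, bool src_is_lmem,
			     struct scatterlist *dst, bool dst_is_lmem)
{
	struct i915_request *rq = NULL;
	int err;

	err = intel_context_migrate_copy(ce, NULL,
					 src, I915_CACHE_NONE, src_is_lmem,
					 dst, I915_CACHE_NONE, dst_is_lmem,
					 &rq);
	if (rq) {
		/* Wait for the final request in the chain before reporting. */
		if (i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT) < 0 && !err)
			err = -ETIME;
		i915_request_put(rq);
	}

	return err;
}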
    889
    890static int emit_clear(struct i915_request *rq, u32 offset, int size,
    891		      u32 value, bool is_lmem)
    892{
    893	struct drm_i915_private *i915 = rq->engine->i915;
    894	int mocs = rq->engine->gt->mocs.uc_index << 1;
    895	const int ver = GRAPHICS_VER(i915);
    896	int ring_sz;
    897	u32 *cs;
    898
    899	GEM_BUG_ON(size >> PAGE_SHIFT > S16_MAX);
    900
    901	if (HAS_FLAT_CCS(i915) && ver >= 12)
    902		ring_sz = XY_FAST_COLOR_BLT_DW;
    903	else if (ver >= 8)
    904		ring_sz = 8;
    905	else
    906		ring_sz = 6;
    907
    908	cs = intel_ring_begin(rq, ring_sz);
    909	if (IS_ERR(cs))
    910		return PTR_ERR(cs);
    911
    912	if (HAS_FLAT_CCS(i915) && ver >= 12) {
    913		*cs++ = XY_FAST_COLOR_BLT_CMD | XY_FAST_COLOR_BLT_DEPTH_32 |
    914			(XY_FAST_COLOR_BLT_DW - 2);
    915		*cs++ = FIELD_PREP(XY_FAST_COLOR_BLT_MOCS_MASK, mocs) |
    916			(PAGE_SIZE - 1);
    917		*cs++ = 0;
    918		*cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
    919		*cs++ = offset;
    920		*cs++ = rq->engine->instance;
    921		*cs++ = !is_lmem << XY_FAST_COLOR_BLT_MEM_TYPE_SHIFT;
    922		/* BG7 */
    923		*cs++ = value;
    924		*cs++ = 0;
    925		*cs++ = 0;
    926		*cs++ = 0;
    927		/* BG11 */
    928		*cs++ = 0;
    929		*cs++ = 0;
    930		/* BG13 */
    931		*cs++ = 0;
    932		*cs++ = 0;
    933		*cs++ = 0;
    934	} else if (ver >= 8) {
    935		*cs++ = XY_COLOR_BLT_CMD | BLT_WRITE_RGBA | (7 - 2);
    936		*cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | PAGE_SIZE;
    937		*cs++ = 0;
    938		*cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
    939		*cs++ = offset;
    940		*cs++ = rq->engine->instance;
    941		*cs++ = value;
    942		*cs++ = MI_NOOP;
    943	} else {
    944		*cs++ = XY_COLOR_BLT_CMD | BLT_WRITE_RGBA | (6 - 2);
    945		*cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | PAGE_SIZE;
    946		*cs++ = 0;
    947		*cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
    948		*cs++ = offset;
    949		*cs++ = value;
    950	}
    951
    952	intel_ring_advance(rq, cs);
    953	return 0;
    954}
    955
    956int
    957intel_context_migrate_clear(struct intel_context *ce,
    958			    const struct i915_deps *deps,
    959			    struct scatterlist *sg,
    960			    enum i915_cache_level cache_level,
    961			    bool is_lmem,
    962			    u32 value,
    963			    struct i915_request **out)
    964{
    965	struct drm_i915_private *i915 = ce->engine->i915;
    966	struct sgt_dma it = sg_sgt(sg);
    967	struct i915_request *rq;
    968	u32 offset;
    969	int err;
    970
    971	GEM_BUG_ON(ce->vm != ce->engine->gt->migrate.context->vm);
    972	*out = NULL;
    973
    974	GEM_BUG_ON(ce->ring->size < SZ_64K);
    975
    976	offset = 0;
    977	if (HAS_64K_PAGES(i915) && is_lmem)
    978		offset = CHUNK_SZ;
    979
    980	do {
    981		int len;
    982
    983		rq = i915_request_create(ce);
    984		if (IS_ERR(rq)) {
    985			err = PTR_ERR(rq);
    986			goto out_ce;
    987		}
    988
    989		if (deps) {
    990			err = i915_request_await_deps(rq, deps);
    991			if (err)
    992				goto out_rq;
    993
    994			if (rq->engine->emit_init_breadcrumb) {
    995				err = rq->engine->emit_init_breadcrumb(rq);
    996				if (err)
    997					goto out_rq;
    998			}
    999
   1000			deps = NULL;
   1001		}
   1002
   1003		/* The PTE updates + clear must not be interrupted. */
   1004		err = emit_no_arbitration(rq);
   1005		if (err)
   1006			goto out_rq;
   1007
   1008		len = emit_pte(rq, &it, cache_level, is_lmem, offset, CHUNK_SZ);
   1009		if (len <= 0) {
   1010			err = len;
   1011			goto out_rq;
   1012		}
   1013
   1014		err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
   1015		if (err)
   1016			goto out_rq;
   1017
   1018		err = emit_clear(rq, offset, len, value, is_lmem);
   1019		if (err)
   1020			goto out_rq;
   1021
   1022		if (HAS_FLAT_CCS(i915) && is_lmem && !value) {
   1023			/*
    1024			 * Copy the content of memory into the corresponding
    1025			 * CCS surface.
   1026			 */
   1027			err = emit_copy_ccs(rq, offset, INDIRECT_ACCESS, offset,
   1028					    DIRECT_ACCESS, len);
   1029			if (err)
   1030				goto out_rq;
   1031		}
   1032
   1033		err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
   1034
   1035		/* Arbitration is re-enabled between requests. */
   1036out_rq:
   1037		if (*out)
   1038			i915_request_put(*out);
   1039		*out = i915_request_get(rq);
   1040		i915_request_add(rq);
   1041		if (err || !it.sg || !sg_dma_len(it.sg))
   1042			break;
   1043
   1044		cond_resched();
   1045	} while (1);
   1046
   1047out_ce:
   1048	return err;
   1049}
   1050
   1051int intel_migrate_copy(struct intel_migrate *m,
   1052		       struct i915_gem_ww_ctx *ww,
   1053		       const struct i915_deps *deps,
   1054		       struct scatterlist *src,
   1055		       enum i915_cache_level src_cache_level,
   1056		       bool src_is_lmem,
   1057		       struct scatterlist *dst,
   1058		       enum i915_cache_level dst_cache_level,
   1059		       bool dst_is_lmem,
   1060		       struct i915_request **out)
   1061{
   1062	struct intel_context *ce;
   1063	int err;
   1064
   1065	*out = NULL;
   1066	if (!m->context)
   1067		return -ENODEV;
   1068
   1069	ce = intel_migrate_create_context(m);
   1070	if (IS_ERR(ce))
   1071		ce = intel_context_get(m->context);
   1072	GEM_BUG_ON(IS_ERR(ce));
   1073
   1074	err = intel_context_pin_ww(ce, ww);
   1075	if (err)
   1076		goto out;
   1077
   1078	err = intel_context_migrate_copy(ce, deps,
   1079					 src, src_cache_level, src_is_lmem,
   1080					 dst, dst_cache_level, dst_is_lmem,
   1081					 out);
   1082
   1083	intel_context_unpin(ce);
   1084out:
   1085	intel_context_put(ce);
   1086	return err;
   1087}
   1088
   1089int
   1090intel_migrate_clear(struct intel_migrate *m,
   1091		    struct i915_gem_ww_ctx *ww,
   1092		    const struct i915_deps *deps,
   1093		    struct scatterlist *sg,
   1094		    enum i915_cache_level cache_level,
   1095		    bool is_lmem,
   1096		    u32 value,
   1097		    struct i915_request **out)
   1098{
   1099	struct intel_context *ce;
   1100	int err;
   1101
   1102	*out = NULL;
   1103	if (!m->context)
   1104		return -ENODEV;
   1105
   1106	ce = intel_migrate_create_context(m);
   1107	if (IS_ERR(ce))
   1108		ce = intel_context_get(m->context);
   1109	GEM_BUG_ON(IS_ERR(ce));
   1110
   1111	err = intel_context_pin_ww(ce, ww);
   1112	if (err)
   1113		goto out;
   1114
   1115	err = intel_context_migrate_clear(ce, deps, sg, cache_level,
   1116					  is_lmem, value, out);
   1117
   1118	intel_context_unpin(ce);
   1119out:
   1120	intel_context_put(ce);
   1121	return err;
   1122}
   1123
   1124void intel_migrate_fini(struct intel_migrate *m)
   1125{
   1126	struct intel_context *ce;
   1127
   1128	ce = fetch_and_zero(&m->context);
   1129	if (!ce)
   1130		return;
   1131
   1132	intel_engine_destroy_pinned_context(ce);
   1133}
   1134
   1135#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
   1136#include "selftest_migrate.c"
   1137#endif