cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

amdgpu_amdkfd_gfx_v9.c (27106B)


      1/*
      2 * Copyright 2014-2018 Advanced Micro Devices, Inc.
      3 *
      4 * Permission is hereby granted, free of charge, to any person obtaining a
      5 * copy of this software and associated documentation files (the "Software"),
      6 * to deal in the Software without restriction, including without limitation
      7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8 * and/or sell copies of the Software, and to permit persons to whom the
      9 * Software is furnished to do so, subject to the following conditions:
     10 *
     11 * The above copyright notice and this permission notice shall be included in
     12 * all copies or substantial portions of the Software.
     13 *
     14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
     18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
     19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
     20 * OTHER DEALINGS IN THE SOFTWARE.
     21 */
     22#include "amdgpu.h"
     23#include "amdgpu_amdkfd.h"
     24#include "gc/gc_9_0_offset.h"
     25#include "gc/gc_9_0_sh_mask.h"
     26#include "vega10_enum.h"
     27#include "sdma0/sdma0_4_0_offset.h"
     28#include "sdma0/sdma0_4_0_sh_mask.h"
     29#include "sdma1/sdma1_4_0_offset.h"
     30#include "sdma1/sdma1_4_0_sh_mask.h"
     31#include "athub/athub_1_0_offset.h"
     32#include "athub/athub_1_0_sh_mask.h"
     33#include "oss/osssys_4_0_offset.h"
     34#include "oss/osssys_4_0_sh_mask.h"
     35#include "soc15_common.h"
     36#include "v9_structs.h"
     37#include "soc15.h"
     38#include "soc15d.h"
     39#include "gfx_v9_0.h"
     40#include "amdgpu_amdkfd_gfx_v9.h"
     41
     42enum hqd_dequeue_request_type {
     43	NO_ACTION = 0,
     44	DRAIN_PIPE,
     45	RESET_WAVES,
     46	SAVE_WAVES
     47};
     48
     49static void lock_srbm(struct amdgpu_device *adev, uint32_t mec, uint32_t pipe,
     50			uint32_t queue, uint32_t vmid)
     51{
     52	mutex_lock(&adev->srbm_mutex);
     53	soc15_grbm_select(adev, mec, pipe, queue, vmid);
     54}
     55
     56static void unlock_srbm(struct amdgpu_device *adev)
     57{
     58	soc15_grbm_select(adev, 0, 0, 0, 0);
     59	mutex_unlock(&adev->srbm_mutex);
     60}
     61
     62static void acquire_queue(struct amdgpu_device *adev, uint32_t pipe_id,
     63				uint32_t queue_id)
     64{
     65	uint32_t mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
     66	uint32_t pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);
     67
     68	lock_srbm(adev, mec, pipe, queue_id, 0);
     69}
     70
     71static uint64_t get_queue_mask(struct amdgpu_device *adev,
     72			       uint32_t pipe_id, uint32_t queue_id)
     73{
     74	unsigned int bit = pipe_id * adev->gfx.mec.num_queue_per_pipe +
     75			queue_id;
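	/* For example, with 8 queues per pipe (typical for GFX9), pipe 1
	 * queue 3 maps to bit 1 * 8 + 3 = 11 of the queue mask. */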
     76
     77	return 1ull << bit;
     78}
     79
     80static void release_queue(struct amdgpu_device *adev)
     81{
     82	unlock_srbm(adev);
     83}
     84
     85void kgd_gfx_v9_program_sh_mem_settings(struct amdgpu_device *adev, uint32_t vmid,
     86					uint32_t sh_mem_config,
     87					uint32_t sh_mem_ape1_base,
     88					uint32_t sh_mem_ape1_limit,
     89					uint32_t sh_mem_bases)
     90{
     91	lock_srbm(adev, 0, 0, 0, vmid);
     92
     93	WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_CONFIG), sh_mem_config);
     94	WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_BASES), sh_mem_bases);
     95	/* APE1 no longer exists on GFX9 */
     96
     97	unlock_srbm(adev);
     98}
     99
    100int kgd_gfx_v9_set_pasid_vmid_mapping(struct amdgpu_device *adev, u32 pasid,
    101					unsigned int vmid)
    102{
    103	/*
    104	 * We have to assume that there is no outstanding mapping.
    105	 * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 because
    106	 * a mapping is in progress or because a mapping finished
    107	 * and the SW cleared it.
    108	 * So the protocol is to always wait & clear.
    109	 */
    110	uint32_t pasid_mapping = (pasid == 0) ? 0 : (uint32_t)pasid |
    111			ATC_VMID0_PASID_MAPPING__VALID_MASK;
    112
    113	/*
     114	 * We need to do this twice, once for gfx and once for mmhub:
     115	 * for the ATC, add 16 to the VMID for mmhub; the IH uses different registers.
    116	 * ATC_VMID0..15 registers are separate from ATC_VMID16..31.
    117	 */
    118
    119	WREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) + vmid,
    120	       pasid_mapping);
    121
    122	while (!(RREG32(SOC15_REG_OFFSET(
    123				ATHUB, 0,
    124				mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) &
    125		 (1U << vmid)))
    126		cpu_relax();
    127
    128	WREG32(SOC15_REG_OFFSET(ATHUB, 0,
    129				mmATC_VMID_PASID_MAPPING_UPDATE_STATUS),
    130	       1U << vmid);
    131
    132	/* Mapping vmid to pasid also for IH block */
    133	WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT) + vmid,
    134	       pasid_mapping);
    135
    136	WREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID16_PASID_MAPPING) + vmid,
    137	       pasid_mapping);
    138
    139	while (!(RREG32(SOC15_REG_OFFSET(
    140				ATHUB, 0,
    141				mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) &
    142		 (1U << (vmid + 16))))
    143		cpu_relax();
    144
    145	WREG32(SOC15_REG_OFFSET(ATHUB, 0,
    146				mmATC_VMID_PASID_MAPPING_UPDATE_STATUS),
    147	       1U << (vmid + 16));
    148
    149	/* Mapping vmid to pasid also for IH block */
    150	WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT_MM) + vmid,
    151	       pasid_mapping);
    152	return 0;
    153}
    154
    155/* TODO - RING0 form of field is obsolete, seems to date back to SI
    156 * but still works
    157 */
    158
    159int kgd_gfx_v9_init_interrupts(struct amdgpu_device *adev, uint32_t pipe_id)
    160{
    161	uint32_t mec;
    162	uint32_t pipe;
    163
    164	mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
    165	pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);
    166
    167	lock_srbm(adev, mec, pipe, 0, 0);
    168
    169	WREG32_SOC15(GC, 0, mmCPC_INT_CNTL,
    170		CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK |
    171		CP_INT_CNTL_RING0__OPCODE_ERROR_INT_ENABLE_MASK);
    172
    173	unlock_srbm(adev);
    174
    175	return 0;
    176}
    177
    178static uint32_t get_sdma_rlc_reg_offset(struct amdgpu_device *adev,
    179				unsigned int engine_id,
    180				unsigned int queue_id)
    181{
    182	uint32_t sdma_engine_reg_base = 0;
    183	uint32_t sdma_rlc_reg_offset;
    184
    185	switch (engine_id) {
    186	default:
    187		dev_warn(adev->dev,
    188			 "Invalid sdma engine id (%d), using engine id 0\n",
    189			 engine_id);
    190		fallthrough;
    191	case 0:
    192		sdma_engine_reg_base = SOC15_REG_OFFSET(SDMA0, 0,
    193				mmSDMA0_RLC0_RB_CNTL) - mmSDMA0_RLC0_RB_CNTL;
    194		break;
    195	case 1:
    196		sdma_engine_reg_base = SOC15_REG_OFFSET(SDMA1, 0,
    197				mmSDMA1_RLC0_RB_CNTL) - mmSDMA0_RLC0_RB_CNTL;
    198		break;
    199	}
    200
    201	sdma_rlc_reg_offset = sdma_engine_reg_base
    202		+ queue_id * (mmSDMA0_RLC1_RB_CNTL - mmSDMA0_RLC0_RB_CNTL);
    203
    204	pr_debug("RLC register offset for SDMA%d RLC%d: 0x%x\n", engine_id,
    205		 queue_id, sdma_rlc_reg_offset);
    206
    207	return sdma_rlc_reg_offset;
    208}
    209
    210static inline struct v9_mqd *get_mqd(void *mqd)
    211{
    212	return (struct v9_mqd *)mqd;
    213}
    214
    215static inline struct v9_sdma_mqd *get_sdma_mqd(void *mqd)
    216{
    217	return (struct v9_sdma_mqd *)mqd;
    218}
    219
    220int kgd_gfx_v9_hqd_load(struct amdgpu_device *adev, void *mqd,
    221			uint32_t pipe_id, uint32_t queue_id,
    222			uint32_t __user *wptr, uint32_t wptr_shift,
    223			uint32_t wptr_mask, struct mm_struct *mm)
    224{
    225	struct v9_mqd *m;
    226	uint32_t *mqd_hqd;
    227	uint32_t reg, hqd_base, data;
    228
    229	m = get_mqd(mqd);
    230
    231	acquire_queue(adev, pipe_id, queue_id);
    232
    233	/* HQD registers extend from CP_MQD_BASE_ADDR to CP_HQD_EOP_WPTR_MEM. */
    234	mqd_hqd = &m->cp_mqd_base_addr_lo;
    235	hqd_base = SOC15_REG_OFFSET(GC, 0, mmCP_MQD_BASE_ADDR);
    236
    237	for (reg = hqd_base;
    238	     reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI); reg++)
    239		WREG32_RLC(reg, mqd_hqd[reg - hqd_base]);
    240
    241
    242	/* Activate doorbell logic before triggering WPTR poll. */
    243	data = REG_SET_FIELD(m->cp_hqd_pq_doorbell_control,
    244			     CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_EN, 1);
    245	WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_DOORBELL_CONTROL), data);
    246
    247	if (wptr) {
    248		/* Don't read wptr with get_user because the user
    249		 * context may not be accessible (if this function
    250		 * runs in a work queue). Instead trigger a one-shot
    251		 * polling read from memory in the CP. This assumes
    252		 * that wptr is GPU-accessible in the queue's VMID via
     253		 * ATC or SVM. Set WPTR==RPTR before starting the poll so
     254		 * that the CP starts fetching new commands from the right
     255		 * place.
    256		 *
    257		 * Guessing a 64-bit WPTR from a 32-bit RPTR is a bit
    258		 * tricky. Assume that the queue didn't overflow. The
    259		 * number of valid bits in the 32-bit RPTR depends on
    260		 * the queue size. The remaining bits are taken from
    261		 * the saved 64-bit WPTR. If the WPTR wrapped, add the
    262		 * queue size.
    263		 */
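		/*
		 * Worked example with hypothetical values: a queue size of
		 * 2 << 9 = 0x400 dwords, a saved RPTR of 0x3f0 and a saved
		 * WPTR of 0x410. The low WPTR bits (0x10) are below the RPTR
		 * offset, so the queue size is added; together with the upper
		 * WPTR bits (0x400) the guess becomes 0xbf0, which has the
		 * same offset within the ring (0x3f0) as the RPTR.
		 */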
    264		uint32_t queue_size =
    265			2 << REG_GET_FIELD(m->cp_hqd_pq_control,
    266					   CP_HQD_PQ_CONTROL, QUEUE_SIZE);
    267		uint64_t guessed_wptr = m->cp_hqd_pq_rptr & (queue_size - 1);
    268
    269		if ((m->cp_hqd_pq_wptr_lo & (queue_size - 1)) < guessed_wptr)
    270			guessed_wptr += queue_size;
    271		guessed_wptr += m->cp_hqd_pq_wptr_lo & ~(queue_size - 1);
    272		guessed_wptr += (uint64_t)m->cp_hqd_pq_wptr_hi << 32;
    273
    274		WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_LO),
    275		       lower_32_bits(guessed_wptr));
    276		WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI),
    277		       upper_32_bits(guessed_wptr));
    278		WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR),
    279		       lower_32_bits((uintptr_t)wptr));
    280		WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR_HI),
    281		       upper_32_bits((uintptr_t)wptr));
    282		WREG32_SOC15(GC, 0, mmCP_PQ_WPTR_POLL_CNTL1,
    283		       (uint32_t)get_queue_mask(adev, pipe_id, queue_id));
    284	}
    285
    286	/* Start the EOP fetcher */
    287	WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_EOP_RPTR),
    288	       REG_SET_FIELD(m->cp_hqd_eop_rptr,
    289			     CP_HQD_EOP_RPTR, INIT_FETCHER, 1));
    290
    291	data = REG_SET_FIELD(m->cp_hqd_active, CP_HQD_ACTIVE, ACTIVE, 1);
    292	WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE), data);
    293
    294	release_queue(adev);
    295
    296	return 0;
    297}
    298
    299int kgd_gfx_v9_hiq_mqd_load(struct amdgpu_device *adev, void *mqd,
    300			    uint32_t pipe_id, uint32_t queue_id,
    301			    uint32_t doorbell_off)
    302{
    303	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
    304	struct v9_mqd *m;
    305	uint32_t mec, pipe;
    306	int r;
    307
    308	m = get_mqd(mqd);
    309
    310	acquire_queue(adev, pipe_id, queue_id);
    311
    312	mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
    313	pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);
    314
    315	pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n",
    316		 mec, pipe, queue_id);
    317
    318	spin_lock(&adev->gfx.kiq.ring_lock);
    319	r = amdgpu_ring_alloc(kiq_ring, 7);
    320	if (r) {
    321		pr_err("Failed to alloc KIQ (%d).\n", r);
    322		goto out_unlock;
    323	}
    324
    325	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_MAP_QUEUES, 5));
    326	amdgpu_ring_write(kiq_ring,
    327			  PACKET3_MAP_QUEUES_QUEUE_SEL(0) | /* Queue_Sel */
    328			  PACKET3_MAP_QUEUES_VMID(m->cp_hqd_vmid) | /* VMID */
    329			  PACKET3_MAP_QUEUES_QUEUE(queue_id) |
    330			  PACKET3_MAP_QUEUES_PIPE(pipe) |
    331			  PACKET3_MAP_QUEUES_ME((mec - 1)) |
    332			  PACKET3_MAP_QUEUES_QUEUE_TYPE(0) | /*queue_type: normal compute queue */
    333			  PACKET3_MAP_QUEUES_ALLOC_FORMAT(0) | /* alloc format: all_on_one_pipe */
    334			  PACKET3_MAP_QUEUES_ENGINE_SEL(1) | /* engine_sel: hiq */
    335			  PACKET3_MAP_QUEUES_NUM_QUEUES(1)); /* num_queues: must be 1 */
    336	amdgpu_ring_write(kiq_ring,
    337			  PACKET3_MAP_QUEUES_DOORBELL_OFFSET(doorbell_off));
    338	amdgpu_ring_write(kiq_ring, m->cp_mqd_base_addr_lo);
    339	amdgpu_ring_write(kiq_ring, m->cp_mqd_base_addr_hi);
    340	amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_lo);
    341	amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_hi);
    342	amdgpu_ring_commit(kiq_ring);
    343
    344out_unlock:
    345	spin_unlock(&adev->gfx.kiq.ring_lock);
    346	release_queue(adev);
    347
    348	return r;
    349}
    350
    351int kgd_gfx_v9_hqd_dump(struct amdgpu_device *adev,
    352			uint32_t pipe_id, uint32_t queue_id,
    353			uint32_t (**dump)[2], uint32_t *n_regs)
    354{
    355	uint32_t i = 0, reg;
    356#define HQD_N_REGS 56
    357#define DUMP_REG(addr) do {				\
    358		if (WARN_ON_ONCE(i >= HQD_N_REGS))	\
    359			break;				\
    360		(*dump)[i][0] = (addr) << 2;		\
    361		(*dump)[i++][1] = RREG32(addr);		\
    362	} while (0)
    363
    364	*dump = kmalloc_array(HQD_N_REGS * 2, sizeof(uint32_t), GFP_KERNEL);
    365	if (*dump == NULL)
    366		return -ENOMEM;
    367
    368	acquire_queue(adev, pipe_id, queue_id);
    369
    370	for (reg = SOC15_REG_OFFSET(GC, 0, mmCP_MQD_BASE_ADDR);
    371	     reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI); reg++)
    372		DUMP_REG(reg);
    373
    374	release_queue(adev);
    375
    376	WARN_ON_ONCE(i != HQD_N_REGS);
    377	*n_regs = i;
    378
    379	return 0;
    380}
    381
    382static int kgd_hqd_sdma_load(struct amdgpu_device *adev, void *mqd,
    383			     uint32_t __user *wptr, struct mm_struct *mm)
    384{
    385	struct v9_sdma_mqd *m;
    386	uint32_t sdma_rlc_reg_offset;
    387	unsigned long end_jiffies;
    388	uint32_t data;
    389	uint64_t data64;
    390	uint64_t __user *wptr64 = (uint64_t __user *)wptr;
    391
    392	m = get_sdma_mqd(mqd);
    393	sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id,
    394					    m->sdma_queue_id);
    395
    396	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL,
    397		m->sdmax_rlcx_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK));
    398
    399	end_jiffies = msecs_to_jiffies(2000) + jiffies;
    400	while (true) {
    401		data = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_CONTEXT_STATUS);
    402		if (data & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK)
    403			break;
    404		if (time_after(jiffies, end_jiffies)) {
    405			pr_err("SDMA RLC not idle in %s\n", __func__);
    406			return -ETIME;
    407		}
    408		usleep_range(500, 1000);
    409	}
    410
    411	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_DOORBELL_OFFSET,
    412	       m->sdmax_rlcx_doorbell_offset);
    413
    414	data = REG_SET_FIELD(m->sdmax_rlcx_doorbell, SDMA0_RLC0_DOORBELL,
    415			     ENABLE, 1);
    416	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_DOORBELL, data);
    417	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR,
    418				m->sdmax_rlcx_rb_rptr);
    419	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR_HI,
    420				m->sdmax_rlcx_rb_rptr_hi);
    421
    422	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 1);
    423	if (read_user_wptr(mm, wptr64, data64)) {
    424		WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR,
    425		       lower_32_bits(data64));
    426		WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR_HI,
    427		       upper_32_bits(data64));
    428	} else {
    429		WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR,
    430		       m->sdmax_rlcx_rb_rptr);
    431		WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR_HI,
    432		       m->sdmax_rlcx_rb_rptr_hi);
    433	}
    434	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 0);
    435
    436	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_BASE, m->sdmax_rlcx_rb_base);
    437	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_BASE_HI,
    438			m->sdmax_rlcx_rb_base_hi);
    439	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR_ADDR_LO,
    440			m->sdmax_rlcx_rb_rptr_addr_lo);
    441	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR_ADDR_HI,
    442			m->sdmax_rlcx_rb_rptr_addr_hi);
    443
    444	data = REG_SET_FIELD(m->sdmax_rlcx_rb_cntl, SDMA0_RLC0_RB_CNTL,
    445			     RB_ENABLE, 1);
    446	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL, data);
    447
    448	return 0;
    449}
    450
    451static int kgd_hqd_sdma_dump(struct amdgpu_device *adev,
    452			     uint32_t engine_id, uint32_t queue_id,
    453			     uint32_t (**dump)[2], uint32_t *n_regs)
    454{
    455	uint32_t sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev,
    456			engine_id, queue_id);
    457	uint32_t i = 0, reg;
    458#undef HQD_N_REGS
    459#define HQD_N_REGS (19+6+7+10)
    460
    461	*dump = kmalloc_array(HQD_N_REGS * 2, sizeof(uint32_t), GFP_KERNEL);
    462	if (*dump == NULL)
    463		return -ENOMEM;
    464
    465	for (reg = mmSDMA0_RLC0_RB_CNTL; reg <= mmSDMA0_RLC0_DOORBELL; reg++)
    466		DUMP_REG(sdma_rlc_reg_offset + reg);
    467	for (reg = mmSDMA0_RLC0_STATUS; reg <= mmSDMA0_RLC0_CSA_ADDR_HI; reg++)
    468		DUMP_REG(sdma_rlc_reg_offset + reg);
    469	for (reg = mmSDMA0_RLC0_IB_SUB_REMAIN;
    470	     reg <= mmSDMA0_RLC0_MINOR_PTR_UPDATE; reg++)
    471		DUMP_REG(sdma_rlc_reg_offset + reg);
    472	for (reg = mmSDMA0_RLC0_MIDCMD_DATA0;
    473	     reg <= mmSDMA0_RLC0_MIDCMD_CNTL; reg++)
    474		DUMP_REG(sdma_rlc_reg_offset + reg);
    475
    476	WARN_ON_ONCE(i != HQD_N_REGS);
    477	*n_regs = i;
    478
    479	return 0;
    480}
    481
    482bool kgd_gfx_v9_hqd_is_occupied(struct amdgpu_device *adev,
    483				uint64_t queue_address, uint32_t pipe_id,
    484				uint32_t queue_id)
    485{
    486	uint32_t act;
    487	bool retval = false;
    488	uint32_t low, high;
    489
    490	acquire_queue(adev, pipe_id, queue_id);
    491	act = RREG32_SOC15(GC, 0, mmCP_HQD_ACTIVE);
    492	if (act) {
    493		low = lower_32_bits(queue_address >> 8);
    494		high = upper_32_bits(queue_address >> 8);
    495
    496		if (low == RREG32_SOC15(GC, 0, mmCP_HQD_PQ_BASE) &&
    497		   high == RREG32_SOC15(GC, 0, mmCP_HQD_PQ_BASE_HI))
    498			retval = true;
    499	}
    500	release_queue(adev);
    501	return retval;
    502}
    503
    504static bool kgd_hqd_sdma_is_occupied(struct amdgpu_device *adev, void *mqd)
    505{
    506	struct v9_sdma_mqd *m;
    507	uint32_t sdma_rlc_reg_offset;
    508	uint32_t sdma_rlc_rb_cntl;
    509
    510	m = get_sdma_mqd(mqd);
    511	sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id,
    512					    m->sdma_queue_id);
    513
    514	sdma_rlc_rb_cntl = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL);
    515
    516	if (sdma_rlc_rb_cntl & SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK)
    517		return true;
    518
    519	return false;
    520}
    521
    522int kgd_gfx_v9_hqd_destroy(struct amdgpu_device *adev, void *mqd,
    523				enum kfd_preempt_type reset_type,
    524				unsigned int utimeout, uint32_t pipe_id,
    525				uint32_t queue_id)
    526{
    527	enum hqd_dequeue_request_type type;
    528	unsigned long end_jiffies;
    529	uint32_t temp;
    530	struct v9_mqd *m = get_mqd(mqd);
    531
    532	if (amdgpu_in_reset(adev))
    533		return -EIO;
    534
    535	acquire_queue(adev, pipe_id, queue_id);
    536
    537	if (m->cp_hqd_vmid == 0)
    538		WREG32_FIELD15_RLC(GC, 0, RLC_CP_SCHEDULERS, scheduler1, 0);
    539
    540	switch (reset_type) {
    541	case KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN:
    542		type = DRAIN_PIPE;
    543		break;
    544	case KFD_PREEMPT_TYPE_WAVEFRONT_RESET:
    545		type = RESET_WAVES;
    546		break;
    547	case KFD_PREEMPT_TYPE_WAVEFRONT_SAVE:
    548		type = SAVE_WAVES;
    549		break;
    550	default:
    551		type = DRAIN_PIPE;
    552		break;
    553	}
    554
    555	WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_DEQUEUE_REQUEST), type);
    556
    557	end_jiffies = (utimeout * HZ / 1000) + jiffies;
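	/* utimeout is in milliseconds; e.g. utimeout = 2000 with HZ = 250
	 * gives a 500-jiffy deadline, equivalent to msecs_to_jiffies(2000). */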
    558	while (true) {
    559		temp = RREG32_SOC15(GC, 0, mmCP_HQD_ACTIVE);
    560		if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK))
    561			break;
    562		if (time_after(jiffies, end_jiffies)) {
    563			pr_err("cp queue preemption time out.\n");
    564			release_queue(adev);
    565			return -ETIME;
    566		}
    567		usleep_range(500, 1000);
    568	}
    569
    570	release_queue(adev);
    571	return 0;
    572}
    573
    574static int kgd_hqd_sdma_destroy(struct amdgpu_device *adev, void *mqd,
    575				unsigned int utimeout)
    576{
    577	struct v9_sdma_mqd *m;
    578	uint32_t sdma_rlc_reg_offset;
    579	uint32_t temp;
    580	unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies;
    581
    582	m = get_sdma_mqd(mqd);
    583	sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id,
    584					    m->sdma_queue_id);
    585
    586	temp = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL);
    587	temp = temp & ~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK;
    588	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL, temp);
    589
    590	while (true) {
    591		temp = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_CONTEXT_STATUS);
    592		if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK)
    593			break;
    594		if (time_after(jiffies, end_jiffies)) {
    595			pr_err("SDMA RLC not idle in %s\n", __func__);
    596			return -ETIME;
    597		}
    598		usleep_range(500, 1000);
    599	}
    600
    601	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_DOORBELL, 0);
    602	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL,
    603		RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL) |
    604		SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK);
    605
    606	m->sdmax_rlcx_rb_rptr = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR);
    607	m->sdmax_rlcx_rb_rptr_hi =
    608		RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR_HI);
    609
    610	return 0;
    611}
    612
    613bool kgd_gfx_v9_get_atc_vmid_pasid_mapping_info(struct amdgpu_device *adev,
    614					uint8_t vmid, uint16_t *p_pasid)
    615{
    616	uint32_t value;
    617
    618	value = RREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING)
    619		     + vmid);
    620	*p_pasid = value & ATC_VMID0_PASID_MAPPING__PASID_MASK;
    621
    622	return !!(value & ATC_VMID0_PASID_MAPPING__VALID_MASK);
    623}
    624
    625int kgd_gfx_v9_wave_control_execute(struct amdgpu_device *adev,
    626					uint32_t gfx_index_val,
    627					uint32_t sq_cmd)
    628{
    629	uint32_t data = 0;
    630
    631	mutex_lock(&adev->grbm_idx_mutex);
    632
    633	WREG32_SOC15_RLC_SHADOW(GC, 0, mmGRBM_GFX_INDEX, gfx_index_val);
    634	WREG32_SOC15(GC, 0, mmSQ_CMD, sq_cmd);
    635
    636	data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
    637		INSTANCE_BROADCAST_WRITES, 1);
    638	data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
    639		SH_BROADCAST_WRITES, 1);
    640	data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
    641		SE_BROADCAST_WRITES, 1);
    642
    643	WREG32_SOC15_RLC_SHADOW(GC, 0, mmGRBM_GFX_INDEX, data);
    644	mutex_unlock(&adev->grbm_idx_mutex);
    645
    646	return 0;
    647}
    648
    649void kgd_gfx_v9_set_vm_context_page_table_base(struct amdgpu_device *adev,
    650			uint32_t vmid, uint64_t page_table_base)
    651{
    652	if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) {
    653		pr_err("trying to set page table base for wrong VMID %u\n",
    654		       vmid);
    655		return;
    656	}
    657
    658	adev->mmhub.funcs->setup_vm_pt_regs(adev, vmid, page_table_base);
    659
    660	adev->gfxhub.funcs->setup_vm_pt_regs(adev, vmid, page_table_base);
    661}
    662
    663static void lock_spi_csq_mutexes(struct amdgpu_device *adev)
    664{
    665	mutex_lock(&adev->srbm_mutex);
    666	mutex_lock(&adev->grbm_idx_mutex);
    667
    668}
    669
    670static void unlock_spi_csq_mutexes(struct amdgpu_device *adev)
    671{
    672	mutex_unlock(&adev->grbm_idx_mutex);
    673	mutex_unlock(&adev->srbm_mutex);
    674}
    675
    676/**
    677 * get_wave_count: Read device registers to get number of waves in flight for
    678 * a particular queue. The method also returns the VMID associated with the
    679 * queue.
    680 *
    681 * @adev: Handle of device whose registers are to be read
    682 * @queue_idx: Index of queue in the queue-map bit-field
    683 * @wave_cnt: Output parameter updated with number of waves in flight
    684 * @vmid: Output parameter updated with VMID of queue whose wave count
    685 * is being collected
    686 */
    687static void get_wave_count(struct amdgpu_device *adev, int queue_idx,
    688		int *wave_cnt, int *vmid)
    689{
    690	int pipe_idx;
    691	int queue_slot;
    692	unsigned int reg_val;
    693
    694	/*
    695	 * Program GRBM with appropriate MEID, PIPEID, QUEUEID and VMID
    696	 * parameters to read out waves in flight. Get VMID if there are
    697	 * non-zero waves in flight.
    698	 */
    699	*vmid = 0xFF;
    700	*wave_cnt = 0;
    701	pipe_idx = queue_idx / adev->gfx.mec.num_queue_per_pipe;
    702	queue_slot = queue_idx % adev->gfx.mec.num_queue_per_pipe;
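	/* For example, with 8 queues per pipe (typical for GFX9), queue_idx 10
	 * selects pipe 1, queue slot 2 for the GRBM select below. */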
    703	soc15_grbm_select(adev, 1, pipe_idx, queue_slot, 0);
    704	reg_val = RREG32_SOC15_IP(GC, SOC15_REG_OFFSET(GC, 0, mmSPI_CSQ_WF_ACTIVE_COUNT_0) +
    705			 queue_slot);
    706	*wave_cnt = reg_val & SPI_CSQ_WF_ACTIVE_COUNT_0__COUNT_MASK;
    707	if (*wave_cnt != 0)
    708		*vmid = (RREG32_SOC15(GC, 0, mmCP_HQD_VMID) &
    709			 CP_HQD_VMID__VMID_MASK) >> CP_HQD_VMID__VMID__SHIFT;
    710}
    711
    712/**
    713 * kgd_gfx_v9_get_cu_occupancy: Reads relevant registers associated with each
    714 * shader engine and aggregates the number of waves that are in flight for the
    715 * process whose pasid is provided as a parameter. The process could have ZERO
    716 * or more queues running and submitting waves to compute units.
    717 *
    718 * @adev: Handle of device from which to get number of waves in flight
    719 * @pasid: Identifies the process for which this query call is invoked
    720 * @pasid_wave_cnt: Output parameter updated with number of waves in flight that
    721 * belong to process with given pasid
    722 * @max_waves_per_cu: Output parameter updated with maximum number of waves
    723 * possible per Compute Unit
    724 *
     725 * Note: It's possible that the device has too many queues (oversubscription),
     726 * in which case a VMID could be remapped to a different PASID. This could lead
     727 * to an inaccurate wave count. The following is a high-level sequence:
     728 *    Time T1: vmid = getVmid(); vmid is associated with pasid P1
     729 *    Time T2: pasid = getPasid(vmid); vmid is now associated with pasid P2
     730 * In the sequence above, the wave count obtained at time T1 is incorrectly
     731 * lost from, or added to, the total wave count.
    732 *
    733 * The registers that provide the waves in flight are:
    734 *
    735 *  SPI_CSQ_WF_ACTIVE_STATUS - bit-map of queues per pipe. The bit is ON if a
    736 *  queue is slotted, OFF if there is no queue. A process could have ZERO or
    737 *  more queues slotted and submitting waves to be run on compute units. Even
     738 *  when there is a queue it is possible that there are zero wave fronts; this
     739 *  can happen when the queue is waiting on top-of-pipe events, e.g. a
     740 *  waitRegMem command.
    741 *
    742 *  For each bit that is ON from above:
    743 *
    744 *    Read (SPI_CSQ_WF_ACTIVE_COUNT_0 + queue_idx) register. It provides the
    745 *    number of waves that are in flight for the queue at specified index. The
    746 *    index ranges from 0 to 7.
    747 *
    748 *    If non-zero waves are in flight, read CP_HQD_VMID register to obtain VMID
    749 *    of the wave(s).
    750 *
     751 *    Determine if the VMID from the above step maps to the pasid provided as a
     752 *    parameter. If it matches, aggregate the wave count. A VMID that does not
     753 *    match the pasid is a normal condition, i.e. a device is expected to support
     754 *    multiple queues from multiple processes.
    755 *
     756 *  Reading the registers referenced above involves programming the GRBM appropriately.
    757 */
    758void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid,
    759		int *pasid_wave_cnt, int *max_waves_per_cu)
    760{
    761	int qidx;
    762	int vmid;
    763	int se_idx;
    764	int sh_idx;
    765	int se_cnt;
    766	int sh_cnt;
    767	int wave_cnt;
    768	int queue_map;
    769	int pasid_tmp;
    770	int max_queue_cnt;
    771	int vmid_wave_cnt = 0;
    772	DECLARE_BITMAP(cp_queue_bitmap, KGD_MAX_QUEUES);
    773
    774	lock_spi_csq_mutexes(adev);
    775	soc15_grbm_select(adev, 1, 0, 0, 0);
    776
    777	/*
    778	 * Iterate through the shader engines and arrays of the device
    779	 * to get number of waves in flight
    780	 */
    781	bitmap_complement(cp_queue_bitmap, adev->gfx.mec.queue_bitmap,
    782			  KGD_MAX_QUEUES);
    783	max_queue_cnt = adev->gfx.mec.num_pipe_per_mec *
    784			adev->gfx.mec.num_queue_per_pipe;
    785	sh_cnt = adev->gfx.config.max_sh_per_se;
    786	se_cnt = adev->gfx.config.max_shader_engines;
    787	for (se_idx = 0; se_idx < se_cnt; se_idx++) {
    788		for (sh_idx = 0; sh_idx < sh_cnt; sh_idx++) {
    789
    790			gfx_v9_0_select_se_sh(adev, se_idx, sh_idx, 0xffffffff);
    791			queue_map = RREG32_SOC15(GC, 0, mmSPI_CSQ_WF_ACTIVE_STATUS);
    792
    793			/*
     794			 * Assumption: the queue map encodes the following schema:
     795			 * four pipes per micro-engine, with each pipe mapping
     796			 * eight queues. This schema holds for GFX9 devices and
     797			 * must be verified for newer device families.
    798			 */
    799			for (qidx = 0; qidx < max_queue_cnt; qidx++) {
    800
     801				/* Skip queues that are not associated with
    802				 * compute functions
    803				 */
    804				if (!test_bit(qidx, cp_queue_bitmap))
    805					continue;
    806
    807				if (!(queue_map & (1 << qidx)))
    808					continue;
    809
    810				/* Get number of waves in flight and aggregate them */
    811				get_wave_count(adev, qidx, &wave_cnt, &vmid);
    812				if (wave_cnt != 0) {
    813					pasid_tmp =
    814					  RREG32(SOC15_REG_OFFSET(OSSSYS, 0,
    815						 mmIH_VMID_0_LUT) + vmid);
    816					if (pasid_tmp == pasid)
    817						vmid_wave_cnt += wave_cnt;
    818				}
    819			}
    820		}
    821	}
    822
    823	gfx_v9_0_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff);
    824	soc15_grbm_select(adev, 0, 0, 0, 0);
    825	unlock_spi_csq_mutexes(adev);
    826
    827	/* Update the output parameters and return */
    828	*pasid_wave_cnt = vmid_wave_cnt;
    829	*max_waves_per_cu = adev->gfx.cu_info.simd_per_cu *
    830				adev->gfx.cu_info.max_waves_per_simd;
    831}
    832
    833void kgd_gfx_v9_program_trap_handler_settings(struct amdgpu_device *adev,
    834                        uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr)
    835{
    836	lock_srbm(adev, 0, 0, 0, vmid);
    837
    838	/*
    839	 * Program TBA registers
    840	 */
    841	WREG32_SOC15(GC, 0, mmSQ_SHADER_TBA_LO,
    842                        lower_32_bits(tba_addr >> 8));
    843	WREG32_SOC15(GC, 0, mmSQ_SHADER_TBA_HI,
    844                        upper_32_bits(tba_addr >> 8));
    845
    846	/*
    847	 * Program TMA registers
    848	 */
    849	WREG32_SOC15(GC, 0, mmSQ_SHADER_TMA_LO,
    850			lower_32_bits(tma_addr >> 8));
    851	WREG32_SOC15(GC, 0, mmSQ_SHADER_TMA_HI,
    852			upper_32_bits(tma_addr >> 8));
    853
    854	unlock_srbm(adev);
    855}
    856
    857const struct kfd2kgd_calls gfx_v9_kfd2kgd = {
    858	.program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
    859	.set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping,
    860	.init_interrupts = kgd_gfx_v9_init_interrupts,
    861	.hqd_load = kgd_gfx_v9_hqd_load,
    862	.hiq_mqd_load = kgd_gfx_v9_hiq_mqd_load,
    863	.hqd_sdma_load = kgd_hqd_sdma_load,
    864	.hqd_dump = kgd_gfx_v9_hqd_dump,
    865	.hqd_sdma_dump = kgd_hqd_sdma_dump,
    866	.hqd_is_occupied = kgd_gfx_v9_hqd_is_occupied,
    867	.hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied,
    868	.hqd_destroy = kgd_gfx_v9_hqd_destroy,
    869	.hqd_sdma_destroy = kgd_hqd_sdma_destroy,
    870	.wave_control_execute = kgd_gfx_v9_wave_control_execute,
    871	.get_atc_vmid_pasid_mapping_info =
    872			kgd_gfx_v9_get_atc_vmid_pasid_mapping_info,
    873	.set_vm_context_page_table_base = kgd_gfx_v9_set_vm_context_page_table_base,
    874	.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
    875	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
    876};