amdgpu_amdkfd_gfx_v10_3.c
/*
 * Copyright 2019 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
#include <linux/mmu_context.h>
#include "amdgpu.h"
#include "amdgpu_amdkfd.h"
#include "gc/gc_10_3_0_offset.h"
#include "gc/gc_10_3_0_sh_mask.h"
#include "oss/osssys_5_0_0_offset.h"
#include "oss/osssys_5_0_0_sh_mask.h"
#include "athub/athub_2_1_0_offset.h"
#include "athub/athub_2_1_0_sh_mask.h"
#include "soc15_common.h"
#include "v10_structs.h"
#include "nv.h"
#include "nvd.h"

enum hqd_dequeue_request_type {
	NO_ACTION = 0,
	DRAIN_PIPE,
	RESET_WAVES,
	SAVE_WAVES
};

static void lock_srbm(struct amdgpu_device *adev, uint32_t mec, uint32_t pipe,
		      uint32_t queue, uint32_t vmid)
{
	mutex_lock(&adev->srbm_mutex);
	nv_grbm_select(adev, mec, pipe, queue, vmid);
}

static void unlock_srbm(struct amdgpu_device *adev)
{
	nv_grbm_select(adev, 0, 0, 0, 0);
	mutex_unlock(&adev->srbm_mutex);
}

static void acquire_queue(struct amdgpu_device *adev, uint32_t pipe_id,
			  uint32_t queue_id)
{
	uint32_t mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
	uint32_t pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);

	lock_srbm(adev, mec, pipe, queue_id, 0);
}

static uint64_t get_queue_mask(struct amdgpu_device *adev,
			       uint32_t pipe_id, uint32_t queue_id)
{
	unsigned int bit = pipe_id * adev->gfx.mec.num_queue_per_pipe +
			queue_id;

	return 1ull << bit;
}

static void release_queue(struct amdgpu_device *adev)
{
	unlock_srbm(adev);
}

static void program_sh_mem_settings_v10_3(struct amdgpu_device *adev, uint32_t vmid,
					  uint32_t sh_mem_config,
					  uint32_t sh_mem_ape1_base,
					  uint32_t sh_mem_ape1_limit,
					  uint32_t sh_mem_bases)
{
	lock_srbm(adev, 0, 0, 0, vmid);

	WREG32_SOC15(GC, 0, mmSH_MEM_CONFIG, sh_mem_config);
	WREG32_SOC15(GC, 0, mmSH_MEM_BASES, sh_mem_bases);
	/* APE1 no longer exists on GFX9 */

	unlock_srbm(adev);
}

/* ATC is defeatured on Sienna_Cichlid */
static int set_pasid_vmid_mapping_v10_3(struct amdgpu_device *adev, unsigned int pasid,
					unsigned int vmid)
{
	uint32_t value = pasid << IH_VMID_0_LUT__PASID__SHIFT;

	/* Mapping vmid to pasid also for IH block */
	pr_debug("mapping vmid %d -> pasid %d in IH block for GFX client\n",
			vmid, pasid);
	WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT) + vmid, value);

	return 0;
}

static int init_interrupts_v10_3(struct amdgpu_device *adev, uint32_t pipe_id)
{
	uint32_t mec;
	uint32_t pipe;

	mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
	pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);

	lock_srbm(adev, mec, pipe, 0, 0);

	WREG32_SOC15(GC, 0, mmCPC_INT_CNTL,
		CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK |
		CP_INT_CNTL_RING0__OPCODE_ERROR_INT_ENABLE_MASK);

	unlock_srbm(adev);

	return 0;
}

static uint32_t get_sdma_rlc_reg_offset(struct amdgpu_device *adev,
				unsigned int engine_id,
				unsigned int queue_id)
{
	uint32_t sdma_engine_reg_base = 0;
	uint32_t sdma_rlc_reg_offset;

	switch (engine_id) {
	default:
		dev_warn(adev->dev,
			 "Invalid sdma engine id (%d), using engine id 0\n",
			 engine_id);
		fallthrough;
	case 0:
		sdma_engine_reg_base = SOC15_REG_OFFSET(SDMA0, 0,
				mmSDMA0_RLC0_RB_CNTL) - mmSDMA0_RLC0_RB_CNTL;
		break;
	case 1:
		sdma_engine_reg_base = SOC15_REG_OFFSET(SDMA0, 0,
				mmSDMA1_RLC0_RB_CNTL) - mmSDMA0_RLC0_RB_CNTL;
		break;
	case 2:
		sdma_engine_reg_base = SOC15_REG_OFFSET(SDMA0, 0,
				mmSDMA2_RLC0_RB_CNTL) - mmSDMA0_RLC0_RB_CNTL;
		break;
	case 3:
		sdma_engine_reg_base = SOC15_REG_OFFSET(SDMA0, 0,
				mmSDMA3_RLC0_RB_CNTL) - mmSDMA0_RLC0_RB_CNTL;
		break;
	}

	sdma_rlc_reg_offset = sdma_engine_reg_base
		+ queue_id * (mmSDMA0_RLC1_RB_CNTL - mmSDMA0_RLC0_RB_CNTL);

	pr_debug("RLC register offset for SDMA%d RLC%d: 0x%x\n", engine_id,
			queue_id, sdma_rlc_reg_offset);

	return sdma_rlc_reg_offset;
}

static inline struct v10_compute_mqd *get_mqd(void *mqd)
{
	return (struct v10_compute_mqd *)mqd;
}

static inline struct v10_sdma_mqd *get_sdma_mqd(void *mqd)
{
	return (struct v10_sdma_mqd *)mqd;
}

static int hqd_load_v10_3(struct amdgpu_device *adev, void *mqd,
			uint32_t pipe_id, uint32_t queue_id,
			uint32_t __user *wptr, uint32_t wptr_shift,
			uint32_t wptr_mask, struct mm_struct *mm)
{
	struct v10_compute_mqd *m;
	uint32_t *mqd_hqd;
	uint32_t reg, hqd_base, data;

	m = get_mqd(mqd);

	pr_debug("Load hqd of pipe %d queue %d\n", pipe_id, queue_id);
	acquire_queue(adev, pipe_id, queue_id);

	/* HIQ is set during driver init period with vmid set to 0*/
	if (m->cp_hqd_vmid == 0) {
		uint32_t value, mec, pipe;

		mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
		pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);

		pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n",
			mec, pipe, queue_id);
		value = RREG32_SOC15(GC, 0, mmRLC_CP_SCHEDULERS);
		value = REG_SET_FIELD(value, RLC_CP_SCHEDULERS, scheduler1,
			((mec << 5) | (pipe << 3) | queue_id | 0x80));
		WREG32_SOC15(GC, 0, mmRLC_CP_SCHEDULERS, value);
	}

	/* HQD registers extend from CP_MQD_BASE_ADDR to CP_HQD_EOP_WPTR_MEM. */
	mqd_hqd = &m->cp_mqd_base_addr_lo;
	hqd_base = SOC15_REG_OFFSET(GC, 0, mmCP_MQD_BASE_ADDR);

	for (reg = hqd_base;
	     reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI); reg++)
		WREG32_SOC15_IP(GC, reg, mqd_hqd[reg - hqd_base]);


	/* Activate doorbell logic before triggering WPTR poll. */
	data = REG_SET_FIELD(m->cp_hqd_pq_doorbell_control,
			     CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_EN, 1);
	WREG32_SOC15(GC, 0, mmCP_HQD_PQ_DOORBELL_CONTROL, data);

	if (wptr) {
		/* Don't read wptr with get_user because the user
		 * context may not be accessible (if this function
		 * runs in a work queue). Instead trigger a one-shot
		 * polling read from memory in the CP. This assumes
		 * that wptr is GPU-accessible in the queue's VMID via
		 * ATC or SVM. WPTR==RPTR before starting the poll so
		 * the CP starts fetching new commands from the right
		 * place.
		 *
		 * Guessing a 64-bit WPTR from a 32-bit RPTR is a bit
		 * tricky. Assume that the queue didn't overflow. The
		 * number of valid bits in the 32-bit RPTR depends on
		 * the queue size. The remaining bits are taken from
		 * the saved 64-bit WPTR. If the WPTR wrapped, add the
		 * queue size.
		 */
		uint32_t queue_size =
			2 << REG_GET_FIELD(m->cp_hqd_pq_control,
					   CP_HQD_PQ_CONTROL, QUEUE_SIZE);
		uint64_t guessed_wptr = m->cp_hqd_pq_rptr & (queue_size - 1);

		if ((m->cp_hqd_pq_wptr_lo & (queue_size - 1)) < guessed_wptr)
			guessed_wptr += queue_size;
		guessed_wptr += m->cp_hqd_pq_wptr_lo & ~(queue_size - 1);
		guessed_wptr += (uint64_t)m->cp_hqd_pq_wptr_hi << 32;

		WREG32_SOC15(GC, 0, mmCP_HQD_PQ_WPTR_LO,
		       lower_32_bits(guessed_wptr));
		WREG32_SOC15(GC, 0, mmCP_HQD_PQ_WPTR_HI,
		       upper_32_bits(guessed_wptr));
		WREG32_SOC15(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR,
		       lower_32_bits((uint64_t)wptr));
		WREG32_SOC15(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR_HI,
		       upper_32_bits((uint64_t)wptr));
		pr_debug("%s setting CP_PQ_WPTR_POLL_CNTL1 to %x\n", __func__,
			 (uint32_t)get_queue_mask(adev, pipe_id, queue_id));
		WREG32_SOC15(GC, 0, mmCP_PQ_WPTR_POLL_CNTL1,
		       (uint32_t)get_queue_mask(adev, pipe_id, queue_id));
	}

	/* Start the EOP fetcher */
	WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_EOP_RPTR),
	       REG_SET_FIELD(m->cp_hqd_eop_rptr,
			     CP_HQD_EOP_RPTR, INIT_FETCHER, 1));

	data = REG_SET_FIELD(m->cp_hqd_active, CP_HQD_ACTIVE, ACTIVE, 1);
	WREG32_SOC15(GC, 0, mmCP_HQD_ACTIVE, data);

	release_queue(adev);

	return 0;
}

static int hiq_mqd_load_v10_3(struct amdgpu_device *adev, void *mqd,
			    uint32_t pipe_id, uint32_t queue_id,
			    uint32_t doorbell_off)
{
	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
	struct v10_compute_mqd *m;
	uint32_t mec, pipe;
	int r;

	m = get_mqd(mqd);

	acquire_queue(adev, pipe_id, queue_id);

	mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
	pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);

	pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n",
		 mec, pipe, queue_id);

	spin_lock(&adev->gfx.kiq.ring_lock);
	r = amdgpu_ring_alloc(kiq_ring, 7);
	if (r) {
		pr_err("Failed to alloc KIQ (%d).\n", r);
		goto out_unlock;
	}

	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_MAP_QUEUES, 5));
	amdgpu_ring_write(kiq_ring,
			  PACKET3_MAP_QUEUES_QUEUE_SEL(0) | /* Queue_Sel */
			  PACKET3_MAP_QUEUES_VMID(m->cp_hqd_vmid) | /* VMID */
			  PACKET3_MAP_QUEUES_QUEUE(queue_id) |
			  PACKET3_MAP_QUEUES_PIPE(pipe) |
			  PACKET3_MAP_QUEUES_ME((mec - 1)) |
			  PACKET3_MAP_QUEUES_QUEUE_TYPE(0) | /*queue_type: normal compute queue */
			  PACKET3_MAP_QUEUES_ALLOC_FORMAT(0) | /* alloc format: all_on_one_pipe */
			  PACKET3_MAP_QUEUES_ENGINE_SEL(1) | /* engine_sel: hiq */
			  PACKET3_MAP_QUEUES_NUM_QUEUES(1)); /* num_queues: must be 1 */
	amdgpu_ring_write(kiq_ring,
			  PACKET3_MAP_QUEUES_DOORBELL_OFFSET(doorbell_off));
	amdgpu_ring_write(kiq_ring, m->cp_mqd_base_addr_lo);
	amdgpu_ring_write(kiq_ring, m->cp_mqd_base_addr_hi);
	amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_lo);
	amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_hi);
	amdgpu_ring_commit(kiq_ring);

out_unlock:
	spin_unlock(&adev->gfx.kiq.ring_lock);
	release_queue(adev);

	return r;
}

static int hqd_dump_v10_3(struct amdgpu_device *adev,
			uint32_t pipe_id, uint32_t queue_id,
			uint32_t (**dump)[2], uint32_t *n_regs)
{
	uint32_t i = 0, reg;
#define HQD_N_REGS 56
#define DUMP_REG(addr) do {				\
		if (WARN_ON_ONCE(i >= HQD_N_REGS))	\
			break;				\
		(*dump)[i][0] = (addr) << 2;		\
		(*dump)[i++][1] = RREG32_SOC15_IP(GC, addr);	\
	} while (0)

	*dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL);
	if (*dump == NULL)
		return -ENOMEM;

	acquire_queue(adev, pipe_id, queue_id);

	for (reg = SOC15_REG_OFFSET(GC, 0, mmCP_MQD_BASE_ADDR);
	     reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI); reg++)
		DUMP_REG(reg);

	release_queue(adev);

	WARN_ON_ONCE(i != HQD_N_REGS);
	*n_regs = i;

	return 0;
}

static int hqd_sdma_load_v10_3(struct amdgpu_device *adev, void *mqd,
			     uint32_t __user *wptr, struct mm_struct *mm)
{
	struct v10_sdma_mqd *m;
	uint32_t sdma_rlc_reg_offset;
	unsigned long end_jiffies;
	uint32_t data;
	uint64_t data64;
	uint64_t __user *wptr64 = (uint64_t __user *)wptr;

	m = get_sdma_mqd(mqd);
	sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id,
					    m->sdma_queue_id);

	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL,
		m->sdmax_rlcx_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK));

	end_jiffies = msecs_to_jiffies(2000) + jiffies;
	while (true) {
		data = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_CONTEXT_STATUS);
		if (data & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK)
			break;
		if (time_after(jiffies, end_jiffies)) {
			pr_err("SDMA RLC not idle in %s\n", __func__);
			return -ETIME;
		}
		usleep_range(500, 1000);
	}

	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_DOORBELL_OFFSET,
	       m->sdmax_rlcx_doorbell_offset);

	data = REG_SET_FIELD(m->sdmax_rlcx_doorbell, SDMA0_RLC0_DOORBELL,
			     ENABLE, 1);
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_DOORBELL, data);
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR,
				m->sdmax_rlcx_rb_rptr);
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR_HI,
				m->sdmax_rlcx_rb_rptr_hi);

	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 1);
	if (read_user_wptr(mm, wptr64, data64)) {
		WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR,
		       lower_32_bits(data64));
		WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR_HI,
		       upper_32_bits(data64));
	} else {
		WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR,
		       m->sdmax_rlcx_rb_rptr);
		WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR_HI,
		       m->sdmax_rlcx_rb_rptr_hi);
	}
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 0);

	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_BASE, m->sdmax_rlcx_rb_base);
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_BASE_HI,
			m->sdmax_rlcx_rb_base_hi);
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR_ADDR_LO,
			m->sdmax_rlcx_rb_rptr_addr_lo);
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR_ADDR_HI,
			m->sdmax_rlcx_rb_rptr_addr_hi);

	data = REG_SET_FIELD(m->sdmax_rlcx_rb_cntl, SDMA0_RLC0_RB_CNTL,
			     RB_ENABLE, 1);
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL, data);

	return 0;
}

static int hqd_sdma_dump_v10_3(struct amdgpu_device *adev,
			     uint32_t engine_id, uint32_t queue_id,
			     uint32_t (**dump)[2], uint32_t *n_regs)
{
	uint32_t sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev,
			engine_id, queue_id);
	uint32_t i = 0, reg;
#undef HQD_N_REGS
#define HQD_N_REGS (19+6+7+12)

	*dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL);
	if (*dump == NULL)
		return -ENOMEM;

	for (reg = mmSDMA0_RLC0_RB_CNTL; reg <= mmSDMA0_RLC0_DOORBELL; reg++)
		DUMP_REG(sdma_rlc_reg_offset + reg);
	for (reg = mmSDMA0_RLC0_STATUS; reg <= mmSDMA0_RLC0_CSA_ADDR_HI; reg++)
		DUMP_REG(sdma_rlc_reg_offset + reg);
	for (reg = mmSDMA0_RLC0_IB_SUB_REMAIN;
	     reg <= mmSDMA0_RLC0_MINOR_PTR_UPDATE; reg++)
		DUMP_REG(sdma_rlc_reg_offset + reg);
	for (reg = mmSDMA0_RLC0_MIDCMD_DATA0;
	     reg <= mmSDMA0_RLC0_MIDCMD_CNTL; reg++)
		DUMP_REG(sdma_rlc_reg_offset + reg);

	WARN_ON_ONCE(i != HQD_N_REGS);
	*n_regs = i;

	return 0;
}

static bool hqd_is_occupied_v10_3(struct amdgpu_device *adev,
				uint64_t queue_address, uint32_t pipe_id,
				uint32_t queue_id)
{
	uint32_t act;
	bool retval = false;
	uint32_t low, high;

	acquire_queue(adev, pipe_id, queue_id);
	act = RREG32_SOC15(GC, 0, mmCP_HQD_ACTIVE);
	if (act) {
		low = lower_32_bits(queue_address >> 8);
		high = upper_32_bits(queue_address >> 8);

		if (low == RREG32_SOC15(GC, 0, mmCP_HQD_PQ_BASE) &&
		   high == RREG32_SOC15(GC, 0, mmCP_HQD_PQ_BASE_HI))
			retval = true;
	}
	release_queue(adev);
	return retval;
}

static bool hqd_sdma_is_occupied_v10_3(struct amdgpu_device *adev,
				void *mqd)
{
	struct v10_sdma_mqd *m;
	uint32_t sdma_rlc_reg_offset;
	uint32_t sdma_rlc_rb_cntl;

	m = get_sdma_mqd(mqd);
	sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id,
					    m->sdma_queue_id);

	sdma_rlc_rb_cntl = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL);

	if (sdma_rlc_rb_cntl & SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK)
		return true;

	return false;
}

static int hqd_destroy_v10_3(struct amdgpu_device *adev, void *mqd,
				enum kfd_preempt_type reset_type,
				unsigned int utimeout, uint32_t pipe_id,
				uint32_t queue_id)
{
	enum hqd_dequeue_request_type type;
	unsigned long end_jiffies;
	uint32_t temp;
	struct v10_compute_mqd *m = get_mqd(mqd);

	acquire_queue(adev, pipe_id, queue_id);

	if (m->cp_hqd_vmid == 0)
		WREG32_FIELD15(GC, 0, RLC_CP_SCHEDULERS, scheduler1, 0);

	switch (reset_type) {
	case KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN:
		type = DRAIN_PIPE;
		break;
	case KFD_PREEMPT_TYPE_WAVEFRONT_RESET:
		type = RESET_WAVES;
		break;
	case KFD_PREEMPT_TYPE_WAVEFRONT_SAVE:
		type = SAVE_WAVES;
		break;
	default:
		type = DRAIN_PIPE;
		break;
	}

	WREG32_SOC15(GC, 0, mmCP_HQD_DEQUEUE_REQUEST, type);

	end_jiffies = (utimeout * HZ / 1000) + jiffies;
	while (true) {
		temp = RREG32_SOC15(GC, 0, mmCP_HQD_ACTIVE);
		if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK))
			break;
		if (time_after(jiffies, end_jiffies)) {
			pr_err("cp queue pipe %d queue %d preemption failed\n",
					pipe_id, queue_id);
			release_queue(adev);
			return -ETIME;
		}
		usleep_range(500, 1000);
	}

	release_queue(adev);
	return 0;
}

static int hqd_sdma_destroy_v10_3(struct amdgpu_device *adev, void *mqd,
				unsigned int utimeout)
{
	struct v10_sdma_mqd *m;
	uint32_t sdma_rlc_reg_offset;
	uint32_t temp;
	unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies;

	m = get_sdma_mqd(mqd);
	sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id,
					    m->sdma_queue_id);

	temp = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL);
	temp = temp & ~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK;
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL, temp);

	while (true) {
		temp = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_CONTEXT_STATUS);
		if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK)
			break;
		if (time_after(jiffies, end_jiffies)) {
			pr_err("SDMA RLC not idle in %s\n", __func__);
			return -ETIME;
		}
		usleep_range(500, 1000);
	}

	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_DOORBELL, 0);
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL,
		RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL) |
		SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK);

	m->sdmax_rlcx_rb_rptr = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR);
	m->sdmax_rlcx_rb_rptr_hi =
		RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR_HI);

	return 0;
}

static int wave_control_execute_v10_3(struct amdgpu_device *adev,
					uint32_t gfx_index_val,
					uint32_t sq_cmd)
{
	uint32_t data = 0;

	mutex_lock(&adev->grbm_idx_mutex);

	WREG32_SOC15(GC, 0, mmGRBM_GFX_INDEX, gfx_index_val);
	WREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_CMD), sq_cmd);

	data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
		INSTANCE_BROADCAST_WRITES, 1);
	data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
		SA_BROADCAST_WRITES, 1);
	data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
		SE_BROADCAST_WRITES, 1);

	WREG32_SOC15(GC, 0, mmGRBM_GFX_INDEX, data);
	mutex_unlock(&adev->grbm_idx_mutex);

	return 0;
}

static bool get_atc_vmid_pasid_mapping_info_v10_3(struct amdgpu_device *adev,
					uint8_t vmid, uint16_t *p_pasid)
{
	uint32_t value;

	value = RREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING)
		     + vmid);
	*p_pasid = value & ATC_VMID0_PASID_MAPPING__PASID_MASK;

	return !!(value & ATC_VMID0_PASID_MAPPING__VALID_MASK);
}

static void set_vm_context_page_table_base_v10_3(struct amdgpu_device *adev,
		uint32_t vmid, uint64_t page_table_base)
{
	/* SDMA is on gfxhub as well for Navi1* series */
	adev->gfxhub.funcs->setup_vm_pt_regs(adev, vmid, page_table_base);
}

static void program_trap_handler_settings_v10_3(struct amdgpu_device *adev,
			uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr)
{
	lock_srbm(adev, 0, 0, 0, vmid);

	/*
	 * Program TBA registers
	 */
	WREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_SHADER_TBA_LO),
			lower_32_bits(tba_addr >> 8));
	WREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_SHADER_TBA_HI),
			upper_32_bits(tba_addr >> 8) |
			(1 << SQ_SHADER_TBA_HI__TRAP_EN__SHIFT));

	/*
	 * Program TMA registers
	 */
	WREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_SHADER_TMA_LO),
			lower_32_bits(tma_addr >> 8));
	WREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_SHADER_TMA_HI),
			upper_32_bits(tma_addr >> 8));

	unlock_srbm(adev);
}

#if 0
uint32_t enable_debug_trap_v10_3(struct amdgpu_device *adev,
				uint32_t trap_debug_wave_launch_mode,
				uint32_t vmid)
{
	uint32_t data = 0;
	uint32_t orig_wave_cntl_value;
	uint32_t orig_stall_vmid;

	mutex_lock(&adev->grbm_idx_mutex);

	orig_wave_cntl_value = RREG32(SOC15_REG_OFFSET(GC,
			0,
			mmSPI_GDBG_WAVE_CNTL));
	orig_stall_vmid = REG_GET_FIELD(orig_wave_cntl_value,
			SPI_GDBG_WAVE_CNTL,
			STALL_VMID);

	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL, STALL_RA, 1);
	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL), data);

	data = 0;
	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), data);

	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL), orig_stall_vmid);

	mutex_unlock(&adev->grbm_idx_mutex);

	return 0;
}

uint32_t disable_debug_trap_v10_3(struct amdgpu_device *adev)
{
	mutex_lock(&adev->grbm_idx_mutex);

	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);

	mutex_unlock(&adev->grbm_idx_mutex);

	return 0;
}

uint32_t set_wave_launch_trap_override_v10_3(struct amdgpu_device *adev,
						uint32_t trap_override,
						uint32_t trap_mask)
{
	uint32_t data = 0;

	mutex_lock(&adev->grbm_idx_mutex);

	data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL));
	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL, STALL_RA, 1);
	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL), data);

	data = 0;
	data = REG_SET_FIELD(data, SPI_GDBG_TRAP_MASK,
			EXCP_EN, trap_mask);
	data = REG_SET_FIELD(data, SPI_GDBG_TRAP_MASK,
			REPLACE, trap_override);
	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), data);

	data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL));
	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL, STALL_RA, 0);
	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL), data);

	mutex_unlock(&adev->grbm_idx_mutex);

	return 0;
}

uint32_t set_wave_launch_mode_v10_3(struct amdgpu_device *adev,
					uint8_t wave_launch_mode,
					uint32_t vmid)
{
	uint32_t data = 0;
	bool is_stall_mode;
	bool is_mode_set;

	is_stall_mode = (wave_launch_mode == 4);
	is_mode_set = (wave_launch_mode != 0 && wave_launch_mode != 4);

	mutex_lock(&adev->grbm_idx_mutex);

	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL2,
			VMID_MASK, is_mode_set ? 1 << vmid : 0);
	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL2,
			MODE, is_mode_set ? wave_launch_mode : 0);
	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL2), data);

	data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL));
	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL,
			STALL_VMID, is_stall_mode ? 1 << vmid : 0);
	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL,
			STALL_RA, is_stall_mode ? 1 : 0);
	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL), data);

	mutex_unlock(&adev->grbm_idx_mutex);

	return 0;
}

/* kgd_get_iq_wait_times: Returns the mmCP_IQ_WAIT_TIME1/2 values
 * The values read are:
 *	ib_offload_wait_time     -- Wait Count for Indirect Buffer Offloads.
 *	atomic_offload_wait_time -- Wait Count for L2 and GDS Atomics Offloads.
 *	wrm_offload_wait_time    -- Wait Count for WAIT_REG_MEM Offloads.
 *	gws_wait_time            -- Wait Count for Global Wave Syncs.
 *	que_sleep_wait_time      -- Wait Count for Dequeue Retry.
 *	sch_wave_wait_time       -- Wait Count for Scheduling Wave Message.
 *	sem_rearm_wait_time      -- Wait Count for Semaphore re-arm.
 *	deq_retry_wait_time      -- Wait Count for Global Wave Syncs.
 */
void get_iq_wait_times_v10_3(struct amdgpu_device *adev,
					uint32_t *wait_times)

{
	*wait_times = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_IQ_WAIT_TIME2));
}

void build_grace_period_packet_info_v10_3(struct amdgpu_device *adev,
						uint32_t wait_times,
						uint32_t grace_period,
						uint32_t *reg_offset,
						uint32_t *reg_data)
{
	*reg_data = wait_times;

	*reg_data = REG_SET_FIELD(*reg_data,
			CP_IQ_WAIT_TIME2,
			SCH_WAVE,
			grace_period);

	*reg_offset = mmCP_IQ_WAIT_TIME2;
}
#endif

const struct kfd2kgd_calls gfx_v10_3_kfd2kgd = {
	.program_sh_mem_settings = program_sh_mem_settings_v10_3,
	.set_pasid_vmid_mapping = set_pasid_vmid_mapping_v10_3,
	.init_interrupts = init_interrupts_v10_3,
	.hqd_load = hqd_load_v10_3,
	.hiq_mqd_load = hiq_mqd_load_v10_3,
	.hqd_sdma_load = hqd_sdma_load_v10_3,
	.hqd_dump = hqd_dump_v10_3,
	.hqd_sdma_dump = hqd_sdma_dump_v10_3,
	.hqd_is_occupied = hqd_is_occupied_v10_3,
	.hqd_sdma_is_occupied = hqd_sdma_is_occupied_v10_3,
	.hqd_destroy = hqd_destroy_v10_3,
	.hqd_sdma_destroy = hqd_sdma_destroy_v10_3,
	.wave_control_execute = wave_control_execute_v10_3,
	.get_atc_vmid_pasid_mapping_info = get_atc_vmid_pasid_mapping_info_v10_3,
	.set_vm_context_page_table_base = set_vm_context_page_table_base_v10_3,
	.program_trap_handler_settings = program_trap_handler_settings_v10_3,
#if 0
	.enable_debug_trap = enable_debug_trap_v10_3,
	.disable_debug_trap = disable_debug_trap_v10_3,
	.set_wave_launch_trap_override = set_wave_launch_trap_override_v10_3,
	.set_wave_launch_mode = set_wave_launch_mode_v10_3,
	.get_iq_wait_times = get_iq_wait_times_v10_3,
	.build_grace_period_packet_info = build_grace_period_packet_info_v10_3,
#endif
};