gen6_engine_cs.c (12125B)
// SPDX-License-Identifier: MIT
/*
 * Copyright © 2020 Intel Corporation
 */

#include "gen6_engine_cs.h"
#include "intel_engine.h"
#include "intel_engine_regs.h"
#include "intel_gpu_commands.h"
#include "intel_gt.h"
#include "intel_gt_irq.h"
#include "intel_gt_pm_irq.h"
#include "intel_ring.h"

#define HWS_SCRATCH_ADDR	(I915_GEM_HWS_SCRATCH * sizeof(u32))

/*
 * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
 * implementing two workarounds on gen6. From section 1.4.7.1
 * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:
 *
 * [DevSNB-C+{W/A}] Before any depth stall flush (including those
 * produced by non-pipelined state commands), software needs to first
 * send a PIPE_CONTROL with no bits set except Post-Sync Operation !=
 * 0.
 *
 * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
 * =1, a PIPE_CONTROL with any non-zero post-sync-op is required.
 *
 * And the workaround for these two requires this workaround first:
 *
 * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
 * BEFORE the pipe-control with a post-sync op and no write-cache
 * flushes.
 *
 * And this last workaround is tricky because of the requirements on
 * that bit. From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
 * volume 2 part 1:
 *
 *     "1 of the following must also be set:
 *      - Render Target Cache Flush Enable ([12] of DW1)
 *      - Depth Cache Flush Enable ([0] of DW1)
 *      - Stall at Pixel Scoreboard ([1] of DW1)
 *      - Depth Stall ([13] of DW1)
 *      - Post-Sync Operation ([13] of DW1)
 *      - Notify Enable ([8] of DW1)"
 *
 * The cache flushes require the workaround flush that triggered this
 * one, so we can't use it. Depth stall would trigger the same.
 * Post-sync nonzero is what triggered this second workaround, so we
 * can't use that one either. Notify enable is IRQs, which aren't
 * really our business. That leaves only stall at scoreboard.
 */
static int
gen6_emit_post_sync_nonzero_flush(struct i915_request *rq)
{
	u32 scratch_addr =
		intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
	u32 *cs;

	cs = intel_ring_begin(rq, 6);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = GFX_OP_PIPE_CONTROL(5);
	*cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
	*cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = 0; /* low dword */
	*cs++ = 0; /* high dword */
	*cs++ = MI_NOOP;
	intel_ring_advance(rq, cs);

	cs = intel_ring_begin(rq, 6);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = GFX_OP_PIPE_CONTROL(5);
	*cs++ = PIPE_CONTROL_QW_WRITE;
	*cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = 0;
	*cs++ = 0;
	*cs++ = MI_NOOP;
	intel_ring_advance(rq, cs);

	return 0;
}

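/*
 * Full flush/invalidate for the gen6 render ring: apply the SNB
 * post-sync-nonzero workaround above, then emit a single PIPE_CONTROL
 * carrying the requested flush and invalidate bits.
 */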
int gen6_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
	u32 scratch_addr =
		intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
	u32 *cs, flags = 0;
	int ret;

	/* Force SNB workarounds for PIPE_CONTROL flushes */
	ret = gen6_emit_post_sync_nonzero_flush(rq);
	if (ret)
		return ret;

	/*
	 * Just flush everything. Experiments have shown that reducing the
	 * number of bits based on the write domains has little performance
	 * impact. And when rearranging requests, the order of flushes is
	 * unknown.
	 */
	if (mode & EMIT_FLUSH) {
		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
		/*
		 * Ensure that any following seqno writes only happen
		 * when the render cache is indeed flushed.
		 */
		flags |= PIPE_CONTROL_CS_STALL;
	}
	if (mode & EMIT_INVALIDATE) {
		flags |= PIPE_CONTROL_TLB_INVALIDATE;
		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
		/*
		 * TLB invalidate requires a post-sync write.
		 */
		flags |= PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL;
	}

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = flags;
	*cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = 0;
	intel_ring_advance(rq, cs);

	return 0;
}

u32 *gen6_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
	/* First we do the gen6_emit_post_sync_nonzero_flush w/a */
	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
	*cs++ = 0;
	*cs++ = 0;

	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = PIPE_CONTROL_QW_WRITE;
	*cs++ = intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_DEFAULT) |
		PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = 0;

	/* Finally we can flush and with it emit the breadcrumb */
	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
		 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
		 PIPE_CONTROL_DC_FLUSH_ENABLE |
		 PIPE_CONTROL_QW_WRITE |
		 PIPE_CONTROL_CS_STALL);
	*cs++ = i915_request_active_seqno(rq) |
		PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = rq->fence.seqno;

	*cs++ = MI_USER_INTERRUPT;
	*cs++ = MI_NOOP;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}

static int mi_flush_dw(struct i915_request *rq, u32 flags)
{
	u32 cmd, *cs;

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	cmd = MI_FLUSH_DW;

	/*
	 * We always require a command barrier so that subsequent
	 * commands, such as breadcrumb interrupts, are strictly ordered
	 * wrt the contents of the write cache being flushed to memory
	 * (and thus being coherent from the CPU).
	 */
	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;

	/*
	 * Bspec vol 1c.3 - blitter engine command streamer:
	 * "If ENABLED, all TLBs will be invalidated once the flush
	 * operation is complete. This bit is only valid when the
	 * Post-Sync Operation field is a value of 1h or 3h."
	 */
	cmd |= flags;

	*cs++ = cmd;
	*cs++ = HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
	*cs++ = 0;
	*cs++ = MI_NOOP;

	intel_ring_advance(rq, cs);

	return 0;
}

static int gen6_flush_dw(struct i915_request *rq, u32 mode, u32 invflags)
{
	return mi_flush_dw(rq, mode & EMIT_INVALIDATE ? invflags : 0);
}

int gen6_emit_flush_xcs(struct i915_request *rq, u32 mode)
{
	return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB);
}

int gen6_emit_flush_vcs(struct i915_request *rq, u32 mode)
{
	return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB | MI_INVALIDATE_BSD);
}

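/*
 * Start a batch buffer. By default the batch is dispatched with the
 * non-secure bit set, i.e. as an ordinary unprivileged user batch; a
 * secure dispatch clears that bit so the batch executes with full
 * privileges from the global GTT.
 */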
int gen6_emit_bb_start(struct i915_request *rq,
		       u64 offset, u32 len,
		       unsigned int dispatch_flags)
{
	u32 security;
	u32 *cs;

	security = MI_BATCH_NON_SECURE_I965;
	if (dispatch_flags & I915_DISPATCH_SECURE)
		security = 0;

	cs = intel_ring_begin(rq, 2);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	cs = __gen6_emit_bb_start(cs, offset, security);
	intel_ring_advance(rq, cs);

	return 0;
}

int
hsw_emit_bb_start(struct i915_request *rq,
		  u64 offset, u32 len,
		  unsigned int dispatch_flags)
{
	u32 security;
	u32 *cs;

	security = MI_BATCH_PPGTT_HSW | MI_BATCH_NON_SECURE_HSW;
	if (dispatch_flags & I915_DISPATCH_SECURE)
		security = 0;

	cs = intel_ring_begin(rq, 2);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	cs = __gen6_emit_bb_start(cs, offset, security);
	intel_ring_advance(rq, cs);

	return 0;
}

static int gen7_stall_cs(struct i915_request *rq)
{
	u32 *cs;

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
	*cs++ = 0;
	*cs++ = 0;
	intel_ring_advance(rq, cs);

	return 0;
}

int gen7_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
	u32 scratch_addr =
		intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
	u32 *cs, flags = 0;

	/*
	 * Ensure that any following seqno writes only happen when the render
	 * cache is indeed flushed.
	 *
	 * Workaround: 4th PIPE_CONTROL command (except the ones with only
	 * read-cache invalidate bits set) must have the CS_STALL bit set. We
	 * don't try to be clever and just set it unconditionally.
	 */
	flags |= PIPE_CONTROL_CS_STALL;

	/*
	 * CS_STALL suggests at least a post-sync write.
	 */
	flags |= PIPE_CONTROL_QW_WRITE;
	flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;

	/*
	 * Just flush everything. Experiments have shown that reducing the
	 * number of bits based on the write domains has little performance
	 * impact.
	 */
	if (mode & EMIT_FLUSH) {
		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
		flags |= PIPE_CONTROL_FLUSH_ENABLE;
	}
	if (mode & EMIT_INVALIDATE) {
		flags |= PIPE_CONTROL_TLB_INVALIDATE;
		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_MEDIA_STATE_CLEAR;

		/*
		 * Workaround: we must issue a pipe_control with CS-stall bit
		 * set before a pipe_control command that has the state cache
		 * invalidate bit set.
		 */
		gen7_stall_cs(rq);
	}

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = flags;
	*cs++ = scratch_addr;
	*cs++ = 0;
	intel_ring_advance(rq, cs);

	return 0;
}

u32 *gen7_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
		 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
		 PIPE_CONTROL_DC_FLUSH_ENABLE |
		 PIPE_CONTROL_FLUSH_ENABLE |
		 PIPE_CONTROL_QW_WRITE |
		 PIPE_CONTROL_GLOBAL_GTT_IVB |
		 PIPE_CONTROL_CS_STALL);
	*cs++ = i915_request_active_seqno(rq);
	*cs++ = rq->fence.seqno;

	*cs++ = MI_USER_INTERRUPT;
	*cs++ = MI_NOOP;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}

u32 *gen6_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
{
	GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
	GEM_BUG_ON(offset_in_page(rq->hwsp_seqno) != I915_GEM_HWS_SEQNO_ADDR);

	*cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
	*cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
	*cs++ = rq->fence.seqno;

	*cs++ = MI_USER_INTERRUPT;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}

#define GEN7_XCS_WA 32
u32 *gen7_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
{
	int i;

	GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
	GEM_BUG_ON(offset_in_page(rq->hwsp_seqno) != I915_GEM_HWS_SEQNO_ADDR);

	*cs++ = MI_FLUSH_DW | MI_INVALIDATE_TLB |
		MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
	*cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
	*cs++ = rq->fence.seqno;

	for (i = 0; i < GEN7_XCS_WA; i++) {
		*cs++ = MI_STORE_DWORD_INDEX;
		*cs++ = I915_GEM_HWS_SEQNO_ADDR;
		*cs++ = rq->fence.seqno;
	}

	*cs++ = MI_FLUSH_DW;
	*cs++ = 0;
	*cs++ = 0;

	*cs++ = MI_USER_INTERRUPT;
	*cs++ = MI_NOOP;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}
#undef GEN7_XCS_WA

void gen6_irq_enable(struct intel_engine_cs *engine)
{
	ENGINE_WRITE(engine, RING_IMR,
		     ~(engine->irq_enable_mask | engine->irq_keep_mask));

	/* Flush/delay to ensure the RING_IMR is active before the GT IMR */
	ENGINE_POSTING_READ(engine, RING_IMR);

	gen5_gt_enable_irq(engine->gt, engine->irq_enable_mask);
}

void gen6_irq_disable(struct intel_engine_cs *engine)
{
	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
	gen5_gt_disable_irq(engine->gt, engine->irq_enable_mask);
}

void hsw_irq_enable_vecs(struct intel_engine_cs *engine)
{
	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_enable_mask);

	/* Flush/delay to ensure the RING_IMR is active before the GT IMR */
	ENGINE_POSTING_READ(engine, RING_IMR);

	gen6_gt_pm_unmask_irq(engine->gt, engine->irq_enable_mask);
}

void hsw_irq_disable_vecs(struct intel_engine_cs *engine)
{
	ENGINE_WRITE(engine, RING_IMR, ~0);
	gen6_gt_pm_mask_irq(engine->gt, engine->irq_enable_mask);
}
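
The emitters and IRQ helpers above are not called directly; the legacy ring submission backend installs them as per-engine callbacks. As a minimal, illustrative sketch only, the block below shows how a gen6/gen7 render engine might be wired up, assuming the struct intel_engine_cs callback names (emit_flush, emit_bb_start, emit_fini_breadcrumb, irq_enable, irq_disable); the authoritative assignments live in intel_ring_submission.c, which also covers the non-render engines and the Haswell variants.

/* Hypothetical helper, for illustration only; not part of this file. */
static void example_setup_rcs_gen7(struct intel_engine_cs *engine)
{
	engine->emit_flush = gen7_emit_flush_rcs;
	engine->emit_bb_start = gen6_emit_bb_start;
	engine->emit_fini_breadcrumb = gen7_emit_breadcrumb_rcs;
	engine->irq_enable = gen6_irq_enable;
	engine->irq_disable = gen6_irq_disable;
}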