cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

intel_reset.c (39763B)


      1// SPDX-License-Identifier: MIT
      2/*
      3 * Copyright © 2008-2018 Intel Corporation
      4 */
      5
      6#include <linux/sched/mm.h>
      7#include <linux/stop_machine.h>
      8#include <linux/string_helpers.h>
      9
     10#include "display/intel_display.h"
     11#include "display/intel_overlay.h"
     12
     13#include "gem/i915_gem_context.h"
     14
     15#include "gt/intel_gt_regs.h"
     16
     17#include "i915_drv.h"
     18#include "i915_file_private.h"
     19#include "i915_gpu_error.h"
     20#include "i915_irq.h"
     21#include "intel_breadcrumbs.h"
     22#include "intel_engine_pm.h"
     23#include "intel_engine_regs.h"
     24#include "intel_gt.h"
     25#include "intel_gt_pm.h"
     26#include "intel_gt_requests.h"
     27#include "intel_mchbar_regs.h"
     28#include "intel_pci_config.h"
     29#include "intel_reset.h"
     30
     31#include "uc/intel_guc.h"
     32
     33#define RESET_MAX_RETRIES 3
     34
     35/* XXX How to handle concurrent GGTT updates using tiling registers? */
     36#define RESET_UNDER_STOP_MACHINE 0
     37
     38static void rmw_set_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 set)
     39{
     40	intel_uncore_rmw_fw(uncore, reg, 0, set);
     41}
     42
     43static void rmw_clear_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 clr)
     44{
     45	intel_uncore_rmw_fw(uncore, reg, clr, 0);
     46}
     47
     48static void client_mark_guilty(struct i915_gem_context *ctx, bool banned)
     49{
     50	struct drm_i915_file_private *file_priv = ctx->file_priv;
     51	unsigned long prev_hang;
     52	unsigned int score;
     53
     54	if (IS_ERR_OR_NULL(file_priv))
     55		return;
     56
     57	score = 0;
     58	if (banned)
     59		score = I915_CLIENT_SCORE_CONTEXT_BAN;
     60
     61	prev_hang = xchg(&file_priv->hang_timestamp, jiffies);
     62	if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES))
     63		score += I915_CLIENT_SCORE_HANG_FAST;
     64
     65	if (score) {
     66		atomic_add(score, &file_priv->ban_score);
     67
     68		drm_dbg(&ctx->i915->drm,
     69			"client %s: gained %u ban score, now %u\n",
     70			ctx->name, score,
     71			atomic_read(&file_priv->ban_score));
     72	}
     73}
     74
     75static bool mark_guilty(struct i915_request *rq)
     76{
     77	struct i915_gem_context *ctx;
     78	unsigned long prev_hang;
     79	bool banned;
     80	int i;
     81
     82	if (intel_context_is_closed(rq->context))
     83		return true;
     84
     85	rcu_read_lock();
     86	ctx = rcu_dereference(rq->context->gem_context);
     87	if (ctx && !kref_get_unless_zero(&ctx->ref))
     88		ctx = NULL;
     89	rcu_read_unlock();
     90	if (!ctx)
     91		return intel_context_is_banned(rq->context);
     92
     93	atomic_inc(&ctx->guilty_count);
     94
     95	/* Cool contexts are too cool to be banned! (Used for reset testing.) */
     96	if (!i915_gem_context_is_bannable(ctx)) {
     97		banned = false;
     98		goto out;
     99	}
    100
    101	drm_notice(&ctx->i915->drm,
    102		   "%s context reset due to GPU hang\n",
    103		   ctx->name);
    104
    105	/* Record the timestamp for the last N hangs */
    106	prev_hang = ctx->hang_timestamp[0];
    107	for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp) - 1; i++)
    108		ctx->hang_timestamp[i] = ctx->hang_timestamp[i + 1];
    109	ctx->hang_timestamp[i] = jiffies;
    110
    111	/* If we have hung N+1 times in rapid succession, we ban the context! */
    112	banned = !i915_gem_context_is_recoverable(ctx);
    113	if (time_before(jiffies, prev_hang + CONTEXT_FAST_HANG_JIFFIES))
    114		banned = true;
    115	if (banned)
    116		drm_dbg(&ctx->i915->drm, "context %s: guilty %d, banned\n",
    117			ctx->name, atomic_read(&ctx->guilty_count));
    118
    119	client_mark_guilty(ctx, banned);
    120
    121out:
    122	i915_gem_context_put(ctx);
    123	return banned;
    124}
    125
    126static void mark_innocent(struct i915_request *rq)
    127{
    128	struct i915_gem_context *ctx;
    129
    130	rcu_read_lock();
    131	ctx = rcu_dereference(rq->context->gem_context);
    132	if (ctx)
    133		atomic_inc(&ctx->active_count);
    134	rcu_read_unlock();
    135}
    136
    137void __i915_request_reset(struct i915_request *rq, bool guilty)
    138{
    139	bool banned = false;
    140
    141	RQ_TRACE(rq, "guilty? %s\n", str_yes_no(guilty));
    142	GEM_BUG_ON(__i915_request_is_complete(rq));
    143
    144	rcu_read_lock(); /* protect the GEM context */
    145	if (guilty) {
    146		i915_request_set_error_once(rq, -EIO);
    147		__i915_request_skip(rq);
    148		banned = mark_guilty(rq);
    149	} else {
    150		i915_request_set_error_once(rq, -EAGAIN);
    151		mark_innocent(rq);
    152	}
    153	rcu_read_unlock();
    154
    155	if (banned)
    156		intel_context_ban(rq->context, rq);
    157}
    158
    159static bool i915_in_reset(struct pci_dev *pdev)
    160{
    161	u8 gdrst;
    162
    163	pci_read_config_byte(pdev, I915_GDRST, &gdrst);
    164	return gdrst & GRDOM_RESET_STATUS;
    165}
    166
    167static int i915_do_reset(struct intel_gt *gt,
    168			 intel_engine_mask_t engine_mask,
    169			 unsigned int retry)
    170{
    171	struct pci_dev *pdev = to_pci_dev(gt->i915->drm.dev);
    172	int err;
    173
    174	/* Assert reset for at least 20 usec, and wait for acknowledgement. */
    175	pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
    176	udelay(50);
    177	err = wait_for_atomic(i915_in_reset(pdev), 50);
    178
    179	/* Clear the reset request. */
    180	pci_write_config_byte(pdev, I915_GDRST, 0);
    181	udelay(50);
    182	if (!err)
    183		err = wait_for_atomic(!i915_in_reset(pdev), 50);
    184
    185	return err;
    186}
    187
    188static bool g4x_reset_complete(struct pci_dev *pdev)
    189{
    190	u8 gdrst;
    191
    192	pci_read_config_byte(pdev, I915_GDRST, &gdrst);
    193	return (gdrst & GRDOM_RESET_ENABLE) == 0;
    194}
    195
    196static int g33_do_reset(struct intel_gt *gt,
    197			intel_engine_mask_t engine_mask,
    198			unsigned int retry)
    199{
    200	struct pci_dev *pdev = to_pci_dev(gt->i915->drm.dev);
    201
    202	pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
    203	return wait_for_atomic(g4x_reset_complete(pdev), 50);
    204}
    205
    206static int g4x_do_reset(struct intel_gt *gt,
    207			intel_engine_mask_t engine_mask,
    208			unsigned int retry)
    209{
    210	struct pci_dev *pdev = to_pci_dev(gt->i915->drm.dev);
    211	struct intel_uncore *uncore = gt->uncore;
    212	int ret;
    213
    214	/* WaVcpClkGateDisableForMediaReset:ctg,elk */
    215	rmw_set_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE);
    216	intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D);
    217
    218	pci_write_config_byte(pdev, I915_GDRST,
    219			      GRDOM_MEDIA | GRDOM_RESET_ENABLE);
    220	ret = wait_for_atomic(g4x_reset_complete(pdev), 50);
    221	if (ret) {
    222		GT_TRACE(gt, "Wait for media reset failed\n");
    223		goto out;
    224	}
    225
    226	pci_write_config_byte(pdev, I915_GDRST,
    227			      GRDOM_RENDER | GRDOM_RESET_ENABLE);
    228	ret = wait_for_atomic(g4x_reset_complete(pdev), 50);
    229	if (ret) {
    230		GT_TRACE(gt, "Wait for render reset failed\n");
    231		goto out;
    232	}
    233
    234out:
    235	pci_write_config_byte(pdev, I915_GDRST, 0);
    236
    237	rmw_clear_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE);
    238	intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D);
    239
    240	return ret;
    241}
    242
    243static int ilk_do_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask,
    244			unsigned int retry)
    245{
    246	struct intel_uncore *uncore = gt->uncore;
    247	int ret;
    248
    249	intel_uncore_write_fw(uncore, ILK_GDSR,
    250			      ILK_GRDOM_RENDER | ILK_GRDOM_RESET_ENABLE);
    251	ret = __intel_wait_for_register_fw(uncore, ILK_GDSR,
    252					   ILK_GRDOM_RESET_ENABLE, 0,
    253					   5000, 0,
    254					   NULL);
    255	if (ret) {
    256		GT_TRACE(gt, "Wait for render reset failed\n");
    257		goto out;
    258	}
    259
    260	intel_uncore_write_fw(uncore, ILK_GDSR,
    261			      ILK_GRDOM_MEDIA | ILK_GRDOM_RESET_ENABLE);
    262	ret = __intel_wait_for_register_fw(uncore, ILK_GDSR,
    263					   ILK_GRDOM_RESET_ENABLE, 0,
    264					   5000, 0,
    265					   NULL);
    266	if (ret) {
    267		GT_TRACE(gt, "Wait for media reset failed\n");
    268		goto out;
    269	}
    270
    271out:
    272	intel_uncore_write_fw(uncore, ILK_GDSR, 0);
    273	intel_uncore_posting_read_fw(uncore, ILK_GDSR);
    274	return ret;
    275}
    276
    277/* Reset the hardware domains (GENX_GRDOM_*) specified by mask */
    278static int gen6_hw_domain_reset(struct intel_gt *gt, u32 hw_domain_mask)
    279{
    280	struct intel_uncore *uncore = gt->uncore;
    281	int err;
    282
    283	/*
    284	 * GEN6_GDRST is not in the gt power well, no need to check
    285	 * for fifo space for the write or forcewake the chip for
    286	 * the read
    287	 */
    288	intel_uncore_write_fw(uncore, GEN6_GDRST, hw_domain_mask);
    289
    290	/* Wait for the device to ack the reset requests */
    291	err = __intel_wait_for_register_fw(uncore,
    292					   GEN6_GDRST, hw_domain_mask, 0,
    293					   500, 0,
    294					   NULL);
    295	if (err)
    296		GT_TRACE(gt,
    297			 "Wait for 0x%08x engines reset failed\n",
    298			 hw_domain_mask);
    299
    300	return err;
    301}
    302
    303static int gen6_reset_engines(struct intel_gt *gt,
    304			      intel_engine_mask_t engine_mask,
    305			      unsigned int retry)
    306{
    307	struct intel_engine_cs *engine;
    308	u32 hw_mask;
    309
    310	if (engine_mask == ALL_ENGINES) {
    311		hw_mask = GEN6_GRDOM_FULL;
    312	} else {
    313		intel_engine_mask_t tmp;
    314
    315		hw_mask = 0;
    316		for_each_engine_masked(engine, gt, engine_mask, tmp) {
    317			hw_mask |= engine->reset_domain;
    318		}
    319	}
    320
    321	return gen6_hw_domain_reset(gt, hw_mask);
    322}
    323
    324static struct intel_engine_cs *find_sfc_paired_vecs_engine(struct intel_engine_cs *engine)
    325{
    326	int vecs_id;
    327
    328	GEM_BUG_ON(engine->class != VIDEO_DECODE_CLASS);
    329
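       	/*
       	 * Each pair of VCS engines shares its SFC with a single VECS
       	 * engine, hence the instance / 2 mapping below.
       	 */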
    330	vecs_id = _VECS((engine->instance) / 2);
    331
    332	return engine->gt->engine[vecs_id];
    333}
    334
    335struct sfc_lock_data {
    336	i915_reg_t lock_reg;
    337	i915_reg_t ack_reg;
    338	i915_reg_t usage_reg;
    339	u32 lock_bit;
    340	u32 ack_bit;
    341	u32 usage_bit;
    342	u32 reset_bit;
    343};
    344
    345static void get_sfc_forced_lock_data(struct intel_engine_cs *engine,
    346				     struct sfc_lock_data *sfc_lock)
    347{
    348	switch (engine->class) {
    349	default:
    350		MISSING_CASE(engine->class);
    351		fallthrough;
    352	case VIDEO_DECODE_CLASS:
    353		sfc_lock->lock_reg = GEN11_VCS_SFC_FORCED_LOCK(engine->mmio_base);
    354		sfc_lock->lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT;
    355
    356		sfc_lock->ack_reg = GEN11_VCS_SFC_LOCK_STATUS(engine->mmio_base);
    357		sfc_lock->ack_bit  = GEN11_VCS_SFC_LOCK_ACK_BIT;
    358
    359		sfc_lock->usage_reg = GEN11_VCS_SFC_LOCK_STATUS(engine->mmio_base);
    360		sfc_lock->usage_bit = GEN11_VCS_SFC_USAGE_BIT;
    361		sfc_lock->reset_bit = GEN11_VCS_SFC_RESET_BIT(engine->instance);
    362
    363		break;
    364	case VIDEO_ENHANCEMENT_CLASS:
    365		sfc_lock->lock_reg = GEN11_VECS_SFC_FORCED_LOCK(engine->mmio_base);
    366		sfc_lock->lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT;
    367
    368		sfc_lock->ack_reg = GEN11_VECS_SFC_LOCK_ACK(engine->mmio_base);
    369		sfc_lock->ack_bit  = GEN11_VECS_SFC_LOCK_ACK_BIT;
    370
    371		sfc_lock->usage_reg = GEN11_VECS_SFC_USAGE(engine->mmio_base);
    372		sfc_lock->usage_bit = GEN11_VECS_SFC_USAGE_BIT;
    373		sfc_lock->reset_bit = GEN11_VECS_SFC_RESET_BIT(engine->instance);
    374
    375		break;
    376	}
    377}
    378
    379static int gen11_lock_sfc(struct intel_engine_cs *engine,
    380			  u32 *reset_mask,
    381			  u32 *unlock_mask)
    382{
    383	struct intel_uncore *uncore = engine->uncore;
    384	u8 vdbox_sfc_access = engine->gt->info.vdbox_sfc_access;
    385	struct sfc_lock_data sfc_lock;
    386	bool lock_obtained, lock_to_other = false;
    387	int ret;
    388
    389	switch (engine->class) {
    390	case VIDEO_DECODE_CLASS:
    391		if ((BIT(engine->instance) & vdbox_sfc_access) == 0)
    392			return 0;
    393
    394		fallthrough;
    395	case VIDEO_ENHANCEMENT_CLASS:
    396		get_sfc_forced_lock_data(engine, &sfc_lock);
    397
    398		break;
    399	default:
    400		return 0;
    401	}
    402
    403	if (!(intel_uncore_read_fw(uncore, sfc_lock.usage_reg) & sfc_lock.usage_bit)) {
    404		struct intel_engine_cs *paired_vecs;
    405
    406		if (engine->class != VIDEO_DECODE_CLASS ||
    407		    GRAPHICS_VER(engine->i915) != 12)
    408			return 0;
    409
    410		/*
    411		 * Wa_14010733141
    412		 *
    413		 * If the VCS-MFX isn't using the SFC, we also need to check
    414		 * whether VCS-HCP is using it.  If so, we need to issue a *VE*
    415		 * forced lock on the VE engine that shares the same SFC.
    416		 */
    417		if (!(intel_uncore_read_fw(uncore,
    418					   GEN12_HCP_SFC_LOCK_STATUS(engine->mmio_base)) &
    419		      GEN12_HCP_SFC_USAGE_BIT))
    420			return 0;
    421
    422		paired_vecs = find_sfc_paired_vecs_engine(engine);
    423		get_sfc_forced_lock_data(paired_vecs, &sfc_lock);
    424		lock_to_other = true;
    425		*unlock_mask |= paired_vecs->mask;
    426	} else {
    427		*unlock_mask |= engine->mask;
    428	}
    429
    430	/*
    431	 * If the engine is using an SFC, tell the engine that a software reset
    432	 * is going to happen. The engine will then try to force lock the SFC.
    433	 * If SFC ends up being locked to the engine we want to reset, we have
    434	 * to reset it as well (we will unlock it once the reset sequence is
    435	 * completed).
    436	 */
    437	rmw_set_fw(uncore, sfc_lock.lock_reg, sfc_lock.lock_bit);
    438
    439	ret = __intel_wait_for_register_fw(uncore,
    440					   sfc_lock.ack_reg,
    441					   sfc_lock.ack_bit,
    442					   sfc_lock.ack_bit,
    443					   1000, 0, NULL);
    444
    445	/*
    446	 * Was the SFC released while we were trying to lock it?
    447	 *
    448	 * We should reset both the engine and the SFC if:
    449	 *  - We were locking the SFC to this engine and the lock succeeded
    450	 *       OR
    451	 *  - We were locking the SFC to a different engine (Wa_14010733141)
    452	 *    but the SFC was released before the lock was obtained.
    453	 *
    454	 * Otherwise we need only reset the engine by itself and we can
    455	 * leave the SFC alone.
    456	 */
    457	lock_obtained = (intel_uncore_read_fw(uncore, sfc_lock.usage_reg) &
    458			sfc_lock.usage_bit) != 0;
    459	if (lock_obtained == lock_to_other)
    460		return 0;
    461
    462	if (ret) {
    463		ENGINE_TRACE(engine, "Wait for SFC forced lock ack failed\n");
    464		return ret;
    465	}
    466
    467	*reset_mask |= sfc_lock.reset_bit;
    468	return 0;
    469}
    470
    471static void gen11_unlock_sfc(struct intel_engine_cs *engine)
    472{
    473	struct intel_uncore *uncore = engine->uncore;
    474	u8 vdbox_sfc_access = engine->gt->info.vdbox_sfc_access;
    475	struct sfc_lock_data sfc_lock = {};
    476
    477	if (engine->class != VIDEO_DECODE_CLASS &&
    478	    engine->class != VIDEO_ENHANCEMENT_CLASS)
    479		return;
    480
    481	if (engine->class == VIDEO_DECODE_CLASS &&
    482	    (BIT(engine->instance) & vdbox_sfc_access) == 0)
    483		return;
    484
    485	get_sfc_forced_lock_data(engine, &sfc_lock);
    486
    487	rmw_clear_fw(uncore, sfc_lock.lock_reg, sfc_lock.lock_bit);
    488}
    489
    490static int gen11_reset_engines(struct intel_gt *gt,
    491			       intel_engine_mask_t engine_mask,
    492			       unsigned int retry)
    493{
    494	struct intel_engine_cs *engine;
    495	intel_engine_mask_t tmp;
    496	u32 reset_mask, unlock_mask = 0;
    497	int ret;
    498
    499	if (engine_mask == ALL_ENGINES) {
    500		reset_mask = GEN11_GRDOM_FULL;
    501	} else {
    502		reset_mask = 0;
    503		for_each_engine_masked(engine, gt, engine_mask, tmp) {
    504			reset_mask |= engine->reset_domain;
    505			ret = gen11_lock_sfc(engine, &reset_mask, &unlock_mask);
    506			if (ret)
    507				goto sfc_unlock;
    508		}
    509	}
    510
    511	ret = gen6_hw_domain_reset(gt, reset_mask);
    512
    513sfc_unlock:
    514	/*
    515	 * We unlock the SFC based on the lock status and not the result of
    516	 * gen11_lock_sfc to make sure that we clean up properly if something
    517	 * went wrong during the lock (e.g. lock acquired after timeout
    518	 * expiration).
    519	 *
    520	 * Due to Wa_14010733141, we may have locked an SFC to an engine that
    521	 * wasn't being reset.  So instead of calling gen11_unlock_sfc()
    522	 * on engine_mask, we instead call it on the mask of engines that our
    523	 * on engine_mask, we call it on the mask of engines that our
    524	 */
    525	for_each_engine_masked(engine, gt, unlock_mask, tmp)
    526		gen11_unlock_sfc(engine);
    527
    528	return ret;
    529}
    530
    531static int gen8_engine_reset_prepare(struct intel_engine_cs *engine)
    532{
    533	struct intel_uncore *uncore = engine->uncore;
    534	const i915_reg_t reg = RING_RESET_CTL(engine->mmio_base);
    535	u32 request, mask, ack;
    536	int ret;
    537
    538	if (I915_SELFTEST_ONLY(should_fail(&engine->reset_timeout, 1)))
    539		return -ETIMEDOUT;
    540
    541	ack = intel_uncore_read_fw(uncore, reg);
    542	if (ack & RESET_CTL_CAT_ERROR) {
    543		/*
    544		 * For catastrophic errors, ready-for-reset sequence
    545		 * needs to be bypassed: HAS#396813
    546		 */
    547		request = RESET_CTL_CAT_ERROR;
    548		mask = RESET_CTL_CAT_ERROR;
    549
    550		/* Catastrophic errors need to be cleared by HW */
    551		ack = 0;
    552	} else if (!(ack & RESET_CTL_READY_TO_RESET)) {
    553		request = RESET_CTL_REQUEST_RESET;
    554		mask = RESET_CTL_READY_TO_RESET;
    555		ack = RESET_CTL_READY_TO_RESET;
    556	} else {
    557		return 0;
    558	}
    559
    560	intel_uncore_write_fw(uncore, reg, _MASKED_BIT_ENABLE(request));
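       	/* Poll RESET_CTL until the masked bits read back as the expected ack. */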
    561	ret = __intel_wait_for_register_fw(uncore, reg, mask, ack,
    562					   700, 0, NULL);
    563	if (ret)
    564		drm_err(&engine->i915->drm,
    565			"%s reset request timed out: {request: %08x, RESET_CTL: %08x}\n",
    566			engine->name, request,
    567			intel_uncore_read_fw(uncore, reg));
    568
    569	return ret;
    570}
    571
    572static void gen8_engine_reset_cancel(struct intel_engine_cs *engine)
    573{
    574	intel_uncore_write_fw(engine->uncore,
    575			      RING_RESET_CTL(engine->mmio_base),
    576			      _MASKED_BIT_DISABLE(RESET_CTL_REQUEST_RESET));
    577}
    578
    579static int gen8_reset_engines(struct intel_gt *gt,
    580			      intel_engine_mask_t engine_mask,
    581			      unsigned int retry)
    582{
    583	struct intel_engine_cs *engine;
    584	const bool reset_non_ready = retry >= 1;
    585	intel_engine_mask_t tmp;
    586	int ret;
    587
    588	for_each_engine_masked(engine, gt, engine_mask, tmp) {
    589		ret = gen8_engine_reset_prepare(engine);
    590		if (ret && !reset_non_ready)
    591			goto skip_reset;
    592
    593		/*
    594		 * If this is not the first failed attempt to prepare,
    595		 * we decide to proceed anyway.
    596		 *
    597		 * By doing so we risk context corruption and with
    598		 * some gens (kbl), possible system hang if reset
    599		 * happens during active bb execution.
    600		 *
    601		 * We would rather take context corruption than a
    602		 * failed reset with a wedged driver/GPU. The active
    603		 * bb execution case should be covered by the
    604		 * stop_engines() we have before the reset.
    605		 */
    606	}
    607
    608	/*
    609	 * Wa_22011100796:dg2, whenever a full soft reset is required,
    610	 * reset all individual engines first, and then do a full soft reset.
    611	 *
    612	 * This is best effort, so ignore any error from the initial reset.
    613	 */
    614	if (IS_DG2(gt->i915) && engine_mask == ALL_ENGINES)
    615		gen11_reset_engines(gt, gt->info.engine_mask, 0);
    616
    617	if (GRAPHICS_VER(gt->i915) >= 11)
    618		ret = gen11_reset_engines(gt, engine_mask, retry);
    619	else
    620		ret = gen6_reset_engines(gt, engine_mask, retry);
    621
    622skip_reset:
    623	for_each_engine_masked(engine, gt, engine_mask, tmp)
    624		gen8_engine_reset_cancel(engine);
    625
    626	return ret;
    627}
    628
    629static int mock_reset(struct intel_gt *gt,
    630		      intel_engine_mask_t mask,
    631		      unsigned int retry)
    632{
    633	return 0;
    634}
    635
    636typedef int (*reset_func)(struct intel_gt *,
    637			  intel_engine_mask_t engine_mask,
    638			  unsigned int retry);
    639
    640static reset_func intel_get_gpu_reset(const struct intel_gt *gt)
    641{
    642	struct drm_i915_private *i915 = gt->i915;
    643
    644	if (is_mock_gt(gt))
    645		return mock_reset;
    646	else if (GRAPHICS_VER(i915) >= 8)
    647		return gen8_reset_engines;
    648	else if (GRAPHICS_VER(i915) >= 6)
    649		return gen6_reset_engines;
    650	else if (GRAPHICS_VER(i915) >= 5)
    651		return ilk_do_reset;
    652	else if (IS_G4X(i915))
    653		return g4x_do_reset;
    654	else if (IS_G33(i915) || IS_PINEVIEW(i915))
    655		return g33_do_reset;
    656	else if (GRAPHICS_VER(i915) >= 3)
    657		return i915_do_reset;
    658	else
    659		return NULL;
    660}
    661
    662int __intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask)
    663{
    664	const int retries = engine_mask == ALL_ENGINES ? RESET_MAX_RETRIES : 1;
    665	reset_func reset;
    666	int ret = -ETIMEDOUT;
    667	int retry;
    668
    669	reset = intel_get_gpu_reset(gt);
    670	if (!reset)
    671		return -ENODEV;
    672
    673	/*
    674	 * If the power well sleeps during the reset, the reset
    675	 * request may be dropped and never completes (causing -EIO).
    676	 */
    677	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
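       	/*
       	 * Only a timed-out attempt is retried, and only a full
       	 * (ALL_ENGINES) reset is given more than one attempt.
       	 */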
    678	for (retry = 0; ret == -ETIMEDOUT && retry < retries; retry++) {
    679		GT_TRACE(gt, "engine_mask=%x\n", engine_mask);
    680		preempt_disable();
    681		ret = reset(gt, engine_mask, retry);
    682		preempt_enable();
    683	}
    684	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
    685
    686	return ret;
    687}
    688
    689bool intel_has_gpu_reset(const struct intel_gt *gt)
    690{
    691	if (!gt->i915->params.reset)
    692		return false;
    693
    694	return intel_get_gpu_reset(gt);
    695}
    696
    697bool intel_has_reset_engine(const struct intel_gt *gt)
    698{
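       	/* The i915.reset modparam must be at least 2 to allow engine resets. */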
    699	if (gt->i915->params.reset < 2)
    700		return false;
    701
    702	return INTEL_INFO(gt->i915)->has_reset_engine;
    703}
    704
    705int intel_reset_guc(struct intel_gt *gt)
    706{
    707	u32 guc_domain =
    708		GRAPHICS_VER(gt->i915) >= 11 ? GEN11_GRDOM_GUC : GEN9_GRDOM_GUC;
    709	int ret;
    710
    711	GEM_BUG_ON(!HAS_GT_UC(gt->i915));
    712
    713	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
    714	ret = gen6_hw_domain_reset(gt, guc_domain);
    715	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
    716
    717	return ret;
    718}
    719
    720/*
    721 * Ensure the irq handler finishes, and does not run again.
    722 * Also return the active request so that we only search for it once.
    723 */
    724static void reset_prepare_engine(struct intel_engine_cs *engine)
    725{
    726	/*
    727	 * During the reset sequence, we must prevent the engine from
    728	 * entering RC6. As the context state is undefined until we restart
    729	 * the engine, if it does enter RC6 during the reset, the state
    730	 * written to the powercontext is undefined and so we may lose
    731	 * GPU state upon resume, i.e. fail to restart after a reset.
    732	 */
    733	intel_uncore_forcewake_get(engine->uncore, FORCEWAKE_ALL);
    734	if (engine->reset.prepare)
    735		engine->reset.prepare(engine);
    736}
    737
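       /*
        * Revoke any userspace CPU mmaps of fence-backed GGTT vmas; they are
        * invalidated here and re-faulted once the fences have been restored
        * after the reset (see intel_ggtt_restore_fences() in gt_reset()).
        */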
    738static void revoke_mmaps(struct intel_gt *gt)
    739{
    740	int i;
    741
    742	for (i = 0; i < gt->ggtt->num_fences; i++) {
    743		struct drm_vma_offset_node *node;
    744		struct i915_vma *vma;
    745		u64 vma_offset;
    746
    747		vma = READ_ONCE(gt->ggtt->fence_regs[i].vma);
    748		if (!vma)
    749			continue;
    750
    751		if (!i915_vma_has_userfault(vma))
    752			continue;
    753
    754		GEM_BUG_ON(vma->fence != &gt->ggtt->fence_regs[i]);
    755
    756		if (!vma->mmo)
    757			continue;
    758
    759		node = &vma->mmo->vma_node;
    760		vma_offset = vma->ggtt_view.partial.offset << PAGE_SHIFT;
    761
    762		unmap_mapping_range(gt->i915->drm.anon_inode->i_mapping,
    763				    drm_vma_node_offset_addr(node) + vma_offset,
    764				    vma->size,
    765				    1);
    766	}
    767}
    768
    769static intel_engine_mask_t reset_prepare(struct intel_gt *gt)
    770{
    771	struct intel_engine_cs *engine;
    772	intel_engine_mask_t awake = 0;
    773	enum intel_engine_id id;
    774
    775	/* For GuC mode, ensure submission is disabled before stopping ring */
    776	intel_uc_reset_prepare(&gt->uc);
    777
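       	/*
       	 * Remember which engines were awake so that reset_finish() can
       	 * drop the matching PM references.
       	 */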
    778	for_each_engine(engine, gt, id) {
    779		if (intel_engine_pm_get_if_awake(engine))
    780			awake |= engine->mask;
    781		reset_prepare_engine(engine);
    782	}
    783
    784	return awake;
    785}
    786
    787static void gt_revoke(struct intel_gt *gt)
    788{
    789	revoke_mmaps(gt);
    790}
    791
    792static int gt_reset(struct intel_gt *gt, intel_engine_mask_t stalled_mask)
    793{
    794	struct intel_engine_cs *engine;
    795	enum intel_engine_id id;
    796	int err;
    797
    798	/*
    799	 * Everything depends on having the GTT running, so we need to start
    800	 * there.
    801	 */
    802	err = i915_ggtt_enable_hw(gt->i915);
    803	if (err)
    804		return err;
    805
    806	local_bh_disable();
    807	for_each_engine(engine, gt, id)
    808		__intel_engine_reset(engine, stalled_mask & engine->mask);
    809	local_bh_enable();
    810
    811	intel_uc_reset(&gt->uc, ALL_ENGINES);
    812
    813	intel_ggtt_restore_fences(gt->ggtt);
    814
    815	return err;
    816}
    817
    818static void reset_finish_engine(struct intel_engine_cs *engine)
    819{
    820	if (engine->reset.finish)
    821		engine->reset.finish(engine);
    822	intel_uncore_forcewake_put(engine->uncore, FORCEWAKE_ALL);
    823
    824	intel_engine_signal_breadcrumbs(engine);
    825}
    826
    827static void reset_finish(struct intel_gt *gt, intel_engine_mask_t awake)
    828{
    829	struct intel_engine_cs *engine;
    830	enum intel_engine_id id;
    831
    832	for_each_engine(engine, gt, id) {
    833		reset_finish_engine(engine);
    834		if (awake & engine->mask)
    835			intel_engine_pm_put(engine);
    836	}
    837
    838	intel_uc_reset_finish(&gt->uc);
    839}
    840
    841static void nop_submit_request(struct i915_request *request)
    842{
    843	RQ_TRACE(request, "-EIO\n");
    844
    845	request = i915_request_mark_eio(request);
    846	if (request) {
    847		i915_request_submit(request);
    848		intel_engine_signal_breadcrumbs(request->engine);
    849
    850		i915_request_put(request);
    851	}
    852}
    853
    854static void __intel_gt_set_wedged(struct intel_gt *gt)
    855{
    856	struct intel_engine_cs *engine;
    857	intel_engine_mask_t awake;
    858	enum intel_engine_id id;
    859
    860	if (test_bit(I915_WEDGED, &gt->reset.flags))
    861		return;
    862
    863	GT_TRACE(gt, "start\n");
    864
    865	/*
    866	 * First, stop submission to hw, but do not yet complete requests by
    867	 * rolling the global seqno forward (since this would complete requests
    868	 * for which we haven't set the fence error to EIO yet).
    869	 */
    870	awake = reset_prepare(gt);
    871
    872	/* Even if the GPU reset fails, it should still stop the engines */
    873	if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
    874		__intel_gt_reset(gt, ALL_ENGINES);
    875
    876	for_each_engine(engine, gt, id)
    877		engine->submit_request = nop_submit_request;
    878
    879	/*
    880	 * Make sure no request can slip through without getting completed by
    881	 * either this call here to intel_engine_write_global_seqno, or the one
    882	 * in nop_submit_request.
    883	 */
    884	synchronize_rcu_expedited();
    885	set_bit(I915_WEDGED, &gt->reset.flags);
    886
    887	/* Mark all executing requests as skipped */
    888	local_bh_disable();
    889	for_each_engine(engine, gt, id)
    890		if (engine->reset.cancel)
    891			engine->reset.cancel(engine);
    892	intel_uc_cancel_requests(&gt->uc);
    893	local_bh_enable();
    894
    895	reset_finish(gt, awake);
    896
    897	GT_TRACE(gt, "end\n");
    898}
    899
    900void intel_gt_set_wedged(struct intel_gt *gt)
    901{
    902	intel_wakeref_t wakeref;
    903
    904	if (test_bit(I915_WEDGED, &gt->reset.flags))
    905		return;
    906
    907	wakeref = intel_runtime_pm_get(gt->uncore->rpm);
    908	mutex_lock(&gt->reset.mutex);
    909
    910	if (GEM_SHOW_DEBUG()) {
    911		struct drm_printer p = drm_debug_printer(__func__);
    912		struct intel_engine_cs *engine;
    913		enum intel_engine_id id;
    914
    915		drm_printf(&p, "called from %pS\n", (void *)_RET_IP_);
    916		for_each_engine(engine, gt, id) {
    917			if (intel_engine_is_idle(engine))
    918				continue;
    919
    920			intel_engine_dump(engine, &p, "%s\n", engine->name);
    921		}
    922	}
    923
    924	__intel_gt_set_wedged(gt);
    925
    926	mutex_unlock(&gt->reset.mutex);
    927	intel_runtime_pm_put(gt->uncore->rpm, wakeref);
    928}
    929
    930static bool __intel_gt_unset_wedged(struct intel_gt *gt)
    931{
    932	struct intel_gt_timelines *timelines = &gt->timelines;
    933	struct intel_timeline *tl;
    934	bool ok;
    935
    936	if (!test_bit(I915_WEDGED, &gt->reset.flags))
    937		return true;
    938
    939	/* Never fully initialised, recovery impossible */
    940	if (intel_gt_has_unrecoverable_error(gt))
    941		return false;
    942
    943	GT_TRACE(gt, "start\n");
    944
    945	/*
    946	 * Before unwedging, make sure that all pending operations
    947	 * are flushed and errored out - we may have requests waiting upon
    948	 * third party fences. We marked all inflight requests as EIO, and
    949	 * every execbuf since has returned EIO; for consistency we want all
    950	 * the currently pending requests to also be marked as EIO, which
    951	 * is done inside our nop_submit_request - and so we must wait.
    952	 *
    953	 * No more can be submitted until we reset the wedged bit.
    954	 */
    955	spin_lock(&timelines->lock);
    956	list_for_each_entry(tl, &timelines->active_list, link) {
    957		struct dma_fence *fence;
    958
    959		fence = i915_active_fence_get(&tl->last_request);
    960		if (!fence)
    961			continue;
    962
    963		spin_unlock(&timelines->lock);
    964
    965		/*
    966		 * All internal dependencies (i915_requests) will have
    967		 * been flushed by the set-wedge, but we may be stuck waiting
    968		 * for external fences. These should all be capped to 10s
    969		 * (I915_FENCE_TIMEOUT) so this wait should not be unbounded
    970		 * in the worst case.
    971		 */
    972		dma_fence_default_wait(fence, false, MAX_SCHEDULE_TIMEOUT);
    973		dma_fence_put(fence);
    974
    975		/* Restart iteration after dropping the lock */
    976		spin_lock(&timelines->lock);
    977		tl = list_entry(&timelines->active_list, typeof(*tl), link);
    978	}
    979	spin_unlock(&timelines->lock);
    980
    981	/* We must reset pending GPU events before restoring our submission */
    982	ok = !HAS_EXECLISTS(gt->i915); /* XXX better agnosticism desired */
    983	if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
    984		ok = __intel_gt_reset(gt, ALL_ENGINES) == 0;
    985	if (!ok) {
    986		/*
    987		 * Warn CI about the unrecoverable wedged condition.
    988		 * Time for a reboot.
    989		 */
    990		add_taint_for_CI(gt->i915, TAINT_WARN);
    991		return false;
    992	}
    993
    994	/*
    995	 * Undo nop_submit_request. We prevent all new i915 requests from
    996	 * being queued (by disallowing execbuf whilst wedged) so having
    997	 * waited for all active requests above, we know the system is idle
    998	 * and do not have to worry about a thread being inside
    999	 * engine->submit_request() as we swap over. So unlike installing
   1000	 * the nop_submit_request on reset, we can do this from normal
   1001	 * context and do not require stop_machine().
   1002	 */
   1003	intel_engines_reset_default_submission(gt);
   1004
   1005	GT_TRACE(gt, "end\n");
   1006
   1007	smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
   1008	clear_bit(I915_WEDGED, &gt->reset.flags);
   1009
   1010	return true;
   1011}
   1012
   1013bool intel_gt_unset_wedged(struct intel_gt *gt)
   1014{
   1015	bool result;
   1016
   1017	mutex_lock(&gt->reset.mutex);
   1018	result = __intel_gt_unset_wedged(gt);
   1019	mutex_unlock(&gt->reset.mutex);
   1020
   1021	return result;
   1022}
   1023
   1024static int do_reset(struct intel_gt *gt, intel_engine_mask_t stalled_mask)
   1025{
   1026	int err, i;
   1027
   1028	err = __intel_gt_reset(gt, ALL_ENGINES);
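       	/* Retry a failed reset with an increasing back-off delay. */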
   1029	for (i = 0; err && i < RESET_MAX_RETRIES; i++) {
   1030		msleep(10 * (i + 1));
   1031		err = __intel_gt_reset(gt, ALL_ENGINES);
   1032	}
   1033	if (err)
   1034		return err;
   1035
   1036	return gt_reset(gt, stalled_mask);
   1037}
   1038
   1039static int resume(struct intel_gt *gt)
   1040{
   1041	struct intel_engine_cs *engine;
   1042	enum intel_engine_id id;
   1043	int ret;
   1044
   1045	for_each_engine(engine, gt, id) {
   1046		ret = intel_engine_resume(engine);
   1047		if (ret)
   1048			return ret;
   1049	}
   1050
   1051	return 0;
   1052}
   1053
   1054/**
   1055 * intel_gt_reset - reset chip after a hang
   1056 * @gt: #intel_gt to reset
   1057 * @stalled_mask: mask of the stalled engines with the guilty requests
   1058 * @reason: user error message for why we are resetting
   1059 *
   1060 * Reset the chip.  Useful if a hang is detected. Marks the device as wedged
   1061 * on failure.
   1062 *
   1063 * Procedure is fairly simple:
   1064 *   - reset the chip using the reset reg
   1065 *   - re-init context state
   1066 *   - re-init hardware status page
   1067 *   - re-init ring buffer
   1068 *   - re-init interrupt state
   1069 *   - re-init display
   1070 */
   1071void intel_gt_reset(struct intel_gt *gt,
   1072		    intel_engine_mask_t stalled_mask,
   1073		    const char *reason)
   1074{
   1075	intel_engine_mask_t awake;
   1076	int ret;
   1077
   1078	GT_TRACE(gt, "flags=%lx\n", gt->reset.flags);
   1079
   1080	might_sleep();
   1081	GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &gt->reset.flags));
   1082
   1083	/*
   1084	 * FIXME: Revoking cpu mmap ptes cannot be done from a dma_fence
   1085	 * critical section like gpu reset.
   1086	 */
   1087	gt_revoke(gt);
   1088
   1089	mutex_lock(&gt->reset.mutex);
   1090
   1091	/* Clear any previous failed attempts at recovery. Time to try again. */
   1092	if (!__intel_gt_unset_wedged(gt))
   1093		goto unlock;
   1094
   1095	if (reason)
   1096		drm_notice(&gt->i915->drm,
   1097			   "Resetting chip for %s\n", reason);
   1098	atomic_inc(&gt->i915->gpu_error.reset_count);
   1099
   1100	awake = reset_prepare(gt);
   1101
   1102	if (!intel_has_gpu_reset(gt)) {
   1103		if (gt->i915->params.reset)
   1104			drm_err(&gt->i915->drm, "GPU reset not supported\n");
   1105		else
   1106			drm_dbg(&gt->i915->drm, "GPU reset disabled\n");
   1107		goto error;
   1108	}
   1109
   1110	if (INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
   1111		intel_runtime_pm_disable_interrupts(gt->i915);
   1112
   1113	if (do_reset(gt, stalled_mask)) {
   1114		drm_err(&gt->i915->drm, "Failed to reset chip\n");
   1115		goto taint;
   1116	}
   1117
   1118	if (INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
   1119		intel_runtime_pm_enable_interrupts(gt->i915);
   1120
   1121	intel_overlay_reset(gt->i915);
   1122
   1123	/*
   1124	 * Next we need to restore the context, but we don't use those
   1125	 * yet either...
   1126	 *
   1127	 * Ring buffer needs to be re-initialized in the KMS case, or if X
   1128	 * was running at the time of the reset (i.e. we weren't VT
   1129	 * switched away).
   1130	 */
   1131	ret = intel_gt_init_hw(gt);
   1132	if (ret) {
   1133		drm_err(&gt->i915->drm,
   1134			"Failed to initialise HW following reset (%d)\n",
   1135			ret);
   1136		goto taint;
   1137	}
   1138
   1139	ret = resume(gt);
   1140	if (ret)
   1141		goto taint;
   1142
   1143finish:
   1144	reset_finish(gt, awake);
   1145unlock:
   1146	mutex_unlock(&gt->reset.mutex);
   1147	return;
   1148
   1149taint:
   1150	/*
   1151	 * History tells us that if we cannot reset the GPU now, we
   1152	 * never will. This then impacts everything that is run
   1153	 * subsequently. On failing the reset, we mark the driver
   1154	 * as wedged, preventing further execution on the GPU.
   1155	 * We also want to go one step further and add a taint to the
   1156	 * kernel so that any subsequent faults can be traced back to
   1157	 * this failure. This is important for CI, where if the
   1158	 * GPU/driver fails we would like to reboot and restart testing
   1159	 * rather than continue on into oblivion. For everyone else,
   1160	 * the system should still plod along, but they have been warned!
   1161	 */
   1162	add_taint_for_CI(gt->i915, TAINT_WARN);
   1163error:
   1164	__intel_gt_set_wedged(gt);
   1165	goto finish;
   1166}
   1167
   1168static int intel_gt_reset_engine(struct intel_engine_cs *engine)
   1169{
   1170	return __intel_gt_reset(engine->gt, engine->mask);
   1171}
   1172
   1173int __intel_engine_reset_bh(struct intel_engine_cs *engine, const char *msg)
   1174{
   1175	struct intel_gt *gt = engine->gt;
   1176	int ret;
   1177
   1178	ENGINE_TRACE(engine, "flags=%lx\n", gt->reset.flags);
   1179	GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &gt->reset.flags));
   1180
   1181	if (intel_engine_uses_guc(engine))
   1182		return -ENODEV;
   1183
   1184	if (!intel_engine_pm_get_if_awake(engine))
   1185		return 0;
   1186
   1187	reset_prepare_engine(engine);
   1188
   1189	if (msg)
   1190		drm_notice(&engine->i915->drm,
   1191			   "Resetting %s for %s\n", engine->name, msg);
   1192	atomic_inc(&engine->i915->gpu_error.reset_engine_count[engine->uabi_class]);
   1193
   1194	ret = intel_gt_reset_engine(engine);
   1195	if (ret) {
   1196		/* If we fail here, we expect to fall back to a global reset */
   1197		ENGINE_TRACE(engine, "Failed to reset %s, err: %d\n", engine->name, ret);
   1198		goto out;
   1199	}
   1200
   1201	/*
   1202	 * The request that caused the hang is stuck on elsp; we know the
   1203	 * active request and can drop it, adjusting the head to skip the
   1204	 * offending request and resume the remaining requests in the queue.
   1205	 */
   1206	__intel_engine_reset(engine, true);
   1207
   1208	/*
   1209	 * The engine and its registers (and workarounds in case of render)
   1210	 * have been reset to their default values. Follow the init_ring
   1211	 * process to program RING_MODE, HWSP and re-enable submission.
   1212	 */
   1213	ret = intel_engine_resume(engine);
   1214
   1215out:
   1216	intel_engine_cancel_stop_cs(engine);
   1217	reset_finish_engine(engine);
   1218	intel_engine_pm_put_async(engine);
   1219	return ret;
   1220}
   1221
   1222/**
   1223 * intel_engine_reset - reset GPU engine to recover from a hang
   1224 * @engine: engine to reset
   1225 * @msg: reason for GPU reset; or NULL for no drm_notice()
   1226 *
   1227 * Reset a specific GPU engine. Useful if a hang is detected.
   1228 * Returns zero on successful reset or otherwise an error code.
   1229 *
   1230 * Procedure is:
   1231 *  - identify the request that caused the hang and drop it
   1232 *  - reset engine (which will force the engine to idle)
   1233 *  - re-init/configure engine
   1234 */
   1235int intel_engine_reset(struct intel_engine_cs *engine, const char *msg)
   1236{
   1237	int err;
   1238
   1239	local_bh_disable();
   1240	err = __intel_engine_reset_bh(engine, msg);
   1241	local_bh_enable();
   1242
   1243	return err;
   1244}
   1245
   1246static void intel_gt_reset_global(struct intel_gt *gt,
   1247				  u32 engine_mask,
   1248				  const char *reason)
   1249{
   1250	struct kobject *kobj = &gt->i915->drm.primary->kdev->kobj;
   1251	char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
   1252	char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
   1253	char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
   1254	struct intel_wedge_me w;
   1255
   1256	kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);
   1257
   1258	GT_TRACE(gt, "resetting chip, engines=%x\n", engine_mask);
   1259	kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);
   1260
   1261	/* Use a watchdog to ensure that our reset completes */
   1262	intel_wedge_on_timeout(&w, gt, 5 * HZ) {
   1263		intel_display_prepare_reset(gt->i915);
   1264
   1265		/* Flush everyone using a resource about to be clobbered */
   1266		synchronize_srcu_expedited(&gt->reset.backoff_srcu);
   1267
   1268		intel_gt_reset(gt, engine_mask, reason);
   1269
   1270		intel_display_finish_reset(gt->i915);
   1271	}
   1272
   1273	if (!test_bit(I915_WEDGED, &gt->reset.flags))
   1274		kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event);
   1275}
   1276
   1277/**
   1278 * intel_gt_handle_error - handle a gpu error
   1279 * @gt: the intel_gt
   1280 * @engine_mask: mask representing engines that are hung
   1281 * @flags: control flags
   1282 * @fmt: Error message format string
   1283 *
   1284 * Do some basic checking of register state at error time and
   1285 * dump it to the syslog.  Also call i915_capture_error_state() to make
   1286 * sure we get a record and make it available in debugfs.  Fire a uevent
   1287 * so userspace knows something bad happened (should trigger collection
   1288 * of a ring dump etc.).
   1289 */
   1290void intel_gt_handle_error(struct intel_gt *gt,
   1291			   intel_engine_mask_t engine_mask,
   1292			   unsigned long flags,
   1293			   const char *fmt, ...)
   1294{
   1295	struct intel_engine_cs *engine;
   1296	intel_wakeref_t wakeref;
   1297	intel_engine_mask_t tmp;
   1298	char error_msg[80];
   1299	char *msg = NULL;
   1300
   1301	if (fmt) {
   1302		va_list args;
   1303
   1304		va_start(args, fmt);
   1305		vscnprintf(error_msg, sizeof(error_msg), fmt, args);
   1306		va_end(args);
   1307
   1308		msg = error_msg;
   1309	}
   1310
   1311	/*
   1312	 * In most cases it's guaranteed that we get here with an RPM
   1313	 * reference held, for example because there is a pending GPU
   1314	 * request that won't finish until the reset is done. This
   1315	 * isn't the case at least when we get here by doing a
   1316	 * simulated reset via debugfs, so get an RPM reference.
   1317	 */
   1318	wakeref = intel_runtime_pm_get(gt->uncore->rpm);
   1319
   1320	engine_mask &= gt->info.engine_mask;
   1321
   1322	if (flags & I915_ERROR_CAPTURE) {
   1323		i915_capture_error_state(gt, engine_mask, CORE_DUMP_FLAG_NONE);
   1324		intel_gt_clear_error_registers(gt, engine_mask);
   1325	}
   1326
   1327	/*
   1328	 * Try engine reset when available. We fall back to full reset if
   1329	 * single reset fails.
   1330	 */
   1331	if (!intel_uc_uses_guc_submission(&gt->uc) &&
   1332	    intel_has_reset_engine(gt) && !intel_gt_is_wedged(gt)) {
   1333		local_bh_disable();
   1334		for_each_engine_masked(engine, gt, engine_mask, tmp) {
   1335			BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE);
   1336			if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
   1337					     &gt->reset.flags))
   1338				continue;
   1339
   1340			if (__intel_engine_reset_bh(engine, msg) == 0)
   1341				engine_mask &= ~engine->mask;
   1342
   1343			clear_and_wake_up_bit(I915_RESET_ENGINE + engine->id,
   1344					      &gt->reset.flags);
   1345		}
   1346		local_bh_enable();
   1347	}
   1348
   1349	if (!engine_mask)
   1350		goto out;
   1351
   1352	/* Full reset needs the mutex, stop any other user trying to do so. */
   1353	if (test_and_set_bit(I915_RESET_BACKOFF, &gt->reset.flags)) {
   1354		wait_event(gt->reset.queue,
   1355			   !test_bit(I915_RESET_BACKOFF, &gt->reset.flags));
   1356		goto out; /* piggy-back on the other reset */
   1357	}
   1358
   1359	/* Make sure i915_reset_trylock() sees the I915_RESET_BACKOFF */
   1360	synchronize_rcu_expedited();
   1361
   1362	/*
   1363	 * Prevent any other reset-engine attempt. We don't do this for GuC
   1364	 * submission, as the GuC owns the per-engine reset, not the i915.
   1365	 */
   1366	if (!intel_uc_uses_guc_submission(&gt->uc)) {
   1367		for_each_engine(engine, gt, tmp) {
   1368			while (test_and_set_bit(I915_RESET_ENGINE + engine->id,
   1369						&gt->reset.flags))
   1370				wait_on_bit(&gt->reset.flags,
   1371					    I915_RESET_ENGINE + engine->id,
   1372					    TASK_UNINTERRUPTIBLE);
   1373		}
   1374	}
   1375
   1376	intel_gt_reset_global(gt, engine_mask, msg);
   1377
   1378	if (!intel_uc_uses_guc_submission(&gt->uc)) {
   1379		for_each_engine(engine, gt, tmp)
   1380			clear_bit_unlock(I915_RESET_ENGINE + engine->id,
   1381					 &gt->reset.flags);
   1382	}
   1383	clear_bit_unlock(I915_RESET_BACKOFF, &gt->reset.flags);
   1384	smp_mb__after_atomic();
   1385	wake_up_all(&gt->reset.queue);
   1386
   1387out:
   1388	intel_runtime_pm_put(gt->uncore->rpm, wakeref);
   1389}
   1390
   1391int intel_gt_reset_trylock(struct intel_gt *gt, int *srcu)
   1392{
   1393	might_lock(&gt->reset.backoff_srcu);
   1394	might_sleep();
   1395
   1396	rcu_read_lock();
   1397	while (test_bit(I915_RESET_BACKOFF, &gt->reset.flags)) {
   1398		rcu_read_unlock();
   1399
   1400		if (wait_event_interruptible(gt->reset.queue,
   1401					     !test_bit(I915_RESET_BACKOFF,
   1402						       &gt->reset.flags)))
   1403			return -EINTR;
   1404
   1405		rcu_read_lock();
   1406	}
   1407	*srcu = srcu_read_lock(&gt->reset.backoff_srcu);
   1408	rcu_read_unlock();
   1409
   1410	return 0;
   1411}
   1412
   1413void intel_gt_reset_unlock(struct intel_gt *gt, int tag)
   1414__releases(&gt->reset.backoff_srcu)
   1415{
   1416	srcu_read_unlock(&gt->reset.backoff_srcu, tag);
   1417}
   1418
   1419int intel_gt_terminally_wedged(struct intel_gt *gt)
   1420{
   1421	might_sleep();
   1422
   1423	if (!intel_gt_is_wedged(gt))
   1424		return 0;
   1425
   1426	if (intel_gt_has_unrecoverable_error(gt))
   1427		return -EIO;
   1428
   1429	/* Reset still in progress? Maybe we will recover? */
   1430	if (wait_event_interruptible(gt->reset.queue,
   1431				     !test_bit(I915_RESET_BACKOFF,
   1432					       &gt->reset.flags)))
   1433		return -EINTR;
   1434
   1435	return intel_gt_is_wedged(gt) ? -EIO : 0;
   1436}
   1437
   1438void intel_gt_set_wedged_on_init(struct intel_gt *gt)
   1439{
   1440	BUILD_BUG_ON(I915_RESET_ENGINE + I915_NUM_ENGINES >
   1441		     I915_WEDGED_ON_INIT);
   1442	intel_gt_set_wedged(gt);
   1443	i915_disable_error_state(gt->i915, -ENODEV);
   1444	set_bit(I915_WEDGED_ON_INIT, &gt->reset.flags);
   1445
   1446	/* Wedged on init is non-recoverable */
   1447	add_taint_for_CI(gt->i915, TAINT_WARN);
   1448}
   1449
   1450void intel_gt_set_wedged_on_fini(struct intel_gt *gt)
   1451{
   1452	intel_gt_set_wedged(gt);
   1453	i915_disable_error_state(gt->i915, -ENODEV);
   1454	set_bit(I915_WEDGED_ON_FINI, &gt->reset.flags);
   1455	intel_gt_retire_requests(gt); /* cleanup any wedged requests */
   1456}
   1457
   1458void intel_gt_init_reset(struct intel_gt *gt)
   1459{
   1460	init_waitqueue_head(&gt->reset.queue);
   1461	mutex_init(&gt->reset.mutex);
   1462	init_srcu_struct(&gt->reset.backoff_srcu);
   1463
   1464	/*
   1465	 * While undesirable to wait inside the shrinker, complain anyway.
   1466	 *
   1467	 * If we have to wait during shrinking, we guarantee forward progress
   1468	 * by forcing the reset. Therefore during the reset we must not
   1469	 * re-enter the shrinker. By declaring that we take the reset mutex
   1470	 * within the shrinker, we forbid ourselves from performing any
   1471	 * fs-reclaim or taking related locks during reset.
   1472	 */
   1473	i915_gem_shrinker_taints_mutex(gt->i915, &gt->reset.mutex);
   1474
   1475	/* no GPU until we are ready! */
   1476	__set_bit(I915_WEDGED, &gt->reset.flags);
   1477}
   1478
   1479void intel_gt_fini_reset(struct intel_gt *gt)
   1480{
   1481	cleanup_srcu_struct(&gt->reset.backoff_srcu);
   1482}
   1483
   1484static void intel_wedge_me(struct work_struct *work)
   1485{
   1486	struct intel_wedge_me *w = container_of(work, typeof(*w), work.work);
   1487
   1488	drm_err(&w->gt->i915->drm,
   1489		"%s timed out, cancelling all in-flight rendering.\n",
   1490		w->name);
   1491	intel_gt_set_wedged(w->gt);
   1492}
   1493
   1494void __intel_init_wedge(struct intel_wedge_me *w,
   1495			struct intel_gt *gt,
   1496			long timeout,
   1497			const char *name)
   1498{
   1499	w->gt = gt;
   1500	w->name = name;
   1501
   1502	INIT_DELAYED_WORK_ONSTACK(&w->work, intel_wedge_me);
   1503	schedule_delayed_work(&w->work, timeout);
   1504}
   1505
   1506void __intel_fini_wedge(struct intel_wedge_me *w)
   1507{
   1508	cancel_delayed_work_sync(&w->work);
   1509	destroy_delayed_work_on_stack(&w->work);
   1510	w->gt = NULL;
   1511}
   1512
   1513#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
   1514#include "selftest_reset.c"
   1515#include "selftest_hangcheck.c"
   1516#endif