cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

intel_engine_heartbeat.c (9295B)


// SPDX-License-Identifier: MIT
/*
 * Copyright © 2019 Intel Corporation
 */

#include "i915_drv.h"
#include "i915_request.h"

#include "intel_context.h"
#include "intel_engine_heartbeat.h"
#include "intel_engine_pm.h"
#include "intel_engine.h"
#include "intel_gt.h"
#include "intel_reset.h"

/*
 * While the engine is active, we send a periodic pulse along the engine
 * to check on its health and to flush any idle-barriers. If that request
 * is stuck, and we fail to preempt it, we declare the engine hung and
 * issue a reset -- in the hope that this restores progress.
 */

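/*
 * Re-arm the heartbeat worker. Returns false if the heartbeat interval is
 * zero (heartbeats disabled); otherwise it schedules the next tick on the
 * high-priority system workqueue, rounding intervals of a second or more
 * so the timer can coalesce with other wakeups.
 */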
static bool next_heartbeat(struct intel_engine_cs *engine)
{
	long delay;

	delay = READ_ONCE(engine->props.heartbeat_interval_ms);
	if (!delay)
		return false;

	delay = msecs_to_jiffies_timeout(delay);
	if (delay >= HZ)
		delay = round_jiffies_up_relative(delay);
	mod_delayed_work(system_highpri_wq, &engine->heartbeat.work, delay + 1);

	return true;
}

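/*
 * Allocate a request on the given (kernel) context, bracketed by
 * intel_context_enter/exit so the context is tracked as active while the
 * request is constructed. Callers commit and queue the request separately.
 */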
static struct i915_request *
heartbeat_create(struct intel_context *ce, gfp_t gfp)
{
	struct i915_request *rq;

	intel_context_enter(ce);
	rq = __i915_request_create(ce, gfp);
	intel_context_exit(ce);

	return rq;
}

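/*
 * Fold the engine's outstanding work into this pulse: advance the
 * wakeref_serial so the heartbeat only emits another pulse once fresh work
 * arrives, attach any pending active-barriers to the request, and (when
 * heartbeats are enabled) track the request as the current systole.
 */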
static void idle_pulse(struct intel_engine_cs *engine, struct i915_request *rq)
{
	engine->wakeref_serial = READ_ONCE(engine->serial) + 1;
	i915_request_add_active_barriers(rq);
	if (!engine->heartbeat.systole && intel_engine_has_heartbeat(engine))
		engine->heartbeat.systole = i915_request_get(rq);
}

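/*
 * Mark the request as a pulse (see idle_pulse), then commit and queue it
 * with the given scheduling attributes.
 */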
static void heartbeat_commit(struct i915_request *rq,
			     const struct i915_sched_attr *attr)
{
	idle_pulse(rq->engine, rq);

	__i915_request_commit(rq);
	__i915_request_queue(rq, attr);
}

static void show_heartbeat(const struct i915_request *rq,
			   struct intel_engine_cs *engine)
{
	struct drm_printer p = drm_debug_printer("heartbeat");

	if (!rq) {
		intel_engine_dump(engine, &p,
				  "%s heartbeat not ticking\n",
				  engine->name);
	} else {
		intel_engine_dump(engine, &p,
				  "%s heartbeat {seqno:%llx:%lld, prio:%d} not ticking\n",
				  engine->name,
				  rq->fence.context,
				  rq->fence.seqno,
				  rq->sched.attr.priority);
	}
}

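/*
 * The heartbeat has given up on the engine: dump debug state if enabled,
 * let GuC pick out the hung context when GuC submission is in use, and
 * then hand the engine over to the reset machinery.
 */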
static void
reset_engine(struct intel_engine_cs *engine, struct i915_request *rq)
{
	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		show_heartbeat(rq, engine);

	if (intel_engine_uses_guc(engine))
		/*
		 * GuC itself is toast or GuC's hang detection
		 * is disabled. Either way, need to find the
		 * hang culprit manually.
		 */
		intel_guc_find_hung_context(engine);

	intel_gt_handle_error(engine->gt, engine->mask,
			      I915_ERROR_CAPTURE,
			      "stopped heartbeat on %s",
			      engine->name);
}

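/*
 * The heartbeat worker. Each tick retires a completed pulse and then either
 * escalates the priority of a still-pending pulse (stepping through 0,
 * HEARTBEAT and finally BARRIER before resetting the engine) or, if the
 * engine has seen new work since the last pulse, emits a fresh one at
 * minimum priority. Parked engines and wedged GTs are left alone, and
 * next_heartbeat() schedules the following tick.
 */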
static void heartbeat(struct work_struct *wrk)
{
	struct i915_sched_attr attr = { .priority = I915_PRIORITY_MIN };
	struct intel_engine_cs *engine =
		container_of(wrk, typeof(*engine), heartbeat.work.work);
	struct intel_context *ce = engine->kernel_context;
	struct i915_request *rq;
	unsigned long serial;

	/* Just in case everything has gone horribly wrong, give it a kick */
	intel_engine_flush_submission(engine);

	rq = engine->heartbeat.systole;
	if (rq && i915_request_completed(rq)) {
		i915_request_put(rq);
		engine->heartbeat.systole = NULL;
	}

	if (!intel_engine_pm_get_if_awake(engine))
		return;

	if (intel_gt_is_wedged(engine->gt))
		goto out;

	if (i915_sched_engine_disabled(engine->sched_engine)) {
		reset_engine(engine, engine->heartbeat.systole);
		goto out;
	}

	if (engine->heartbeat.systole) {
		long delay = READ_ONCE(engine->props.heartbeat_interval_ms);

		/* Safeguard against too-fast worker invocations */
		if (!time_after(jiffies,
				rq->emitted_jiffies + msecs_to_jiffies(delay)))
			goto out;

		if (!i915_sw_fence_signaled(&rq->submit)) {
			/*
			 * Not yet submitted, system is stalled.
			 *
			 * This more often happens for ring submission,
			 * where all contexts are funnelled into a common
			 * ringbuffer. If one context is blocked on an
			 * external fence, not only is it not submitted,
			 * but all other contexts, including the kernel
			 * context, are stuck waiting for the signal.
			 */
		} else if (engine->sched_engine->schedule &&
			   rq->sched.attr.priority < I915_PRIORITY_BARRIER) {
			/*
			 * Gradually raise the priority of the heartbeat to
			 * give high priority work [which presumably desires
			 * low latency and no jitter] the chance to naturally
			 * complete before being preempted.
			 */
			attr.priority = 0;
			if (rq->sched.attr.priority >= attr.priority)
				attr.priority = I915_PRIORITY_HEARTBEAT;
			if (rq->sched.attr.priority >= attr.priority)
				attr.priority = I915_PRIORITY_BARRIER;

			local_bh_disable();
			engine->sched_engine->schedule(rq, &attr);
			local_bh_enable();
		} else {
			reset_engine(engine, rq);
		}

		rq->emitted_jiffies = jiffies;
		goto out;
	}

	serial = READ_ONCE(engine->serial);
	if (engine->wakeref_serial == serial)
		goto out;

	if (!mutex_trylock(&ce->timeline->mutex)) {
		/* Unable to lock the kernel timeline, is the engine stuck? */
		if (xchg(&engine->heartbeat.blocked, serial) == serial)
			intel_gt_handle_error(engine->gt, engine->mask,
					      I915_ERROR_CAPTURE,
					      "no heartbeat on %s",
					      engine->name);
		goto out;
	}

	rq = heartbeat_create(ce, GFP_NOWAIT | __GFP_NOWARN);
	if (IS_ERR(rq))
		goto unlock;

	heartbeat_commit(rq, &attr);

unlock:
	mutex_unlock(&ce->timeline->mutex);
out:
	if (!engine->i915->params.enable_hangcheck || !next_heartbeat(engine))
		i915_request_put(fetch_and_zero(&engine->heartbeat.systole));
	intel_engine_pm_put(engine);
}

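/*
 * Pausing and resuming the heartbeat follows engine power management:
 * unparking re-arms the worker (when heartbeats are built in and an
 * interval is set), parking cancels it and releases any pulse still in
 * flight.
 */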
void intel_engine_unpark_heartbeat(struct intel_engine_cs *engine)
{
	if (!CONFIG_DRM_I915_HEARTBEAT_INTERVAL)
		return;

	next_heartbeat(engine);
}

void intel_engine_park_heartbeat(struct intel_engine_cs *engine)
{
	if (cancel_delayed_work(&engine->heartbeat.work))
		i915_request_put(fetch_and_zero(&engine->heartbeat.systole));
}

void intel_gt_unpark_heartbeats(struct intel_gt *gt)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt, id)
		if (intel_engine_pm_is_awake(engine))
			intel_engine_unpark_heartbeat(engine);
}

void intel_gt_park_heartbeats(struct intel_gt *gt)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt, id)
		intel_engine_park_heartbeat(engine);
}

void intel_engine_init_heartbeat(struct intel_engine_cs *engine)
{
	INIT_DELAYED_WORK(&engine->heartbeat.work, heartbeat);
}

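/*
 * Send a single pulse on the kernel context, flagged as a sentinel and
 * queued at barrier priority so that it preempts whatever is currently
 * running. The caller must hold the kernel timeline mutex and an awake
 * engine wakeref, and the engine must support preemption.
 */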
static int __intel_engine_pulse(struct intel_engine_cs *engine)
{
	struct i915_sched_attr attr = { .priority = I915_PRIORITY_BARRIER };
	struct intel_context *ce = engine->kernel_context;
	struct i915_request *rq;

	lockdep_assert_held(&ce->timeline->mutex);
	GEM_BUG_ON(!intel_engine_has_preemption(engine));
	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));

	rq = heartbeat_create(ce, GFP_NOWAIT | __GFP_NOWARN);
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	__set_bit(I915_FENCE_FLAG_SENTINEL, &rq->fence.flags);

	heartbeat_commit(rq, &attr);
	GEM_BUG_ON(rq->sched.attr.priority < I915_PRIORITY_BARRIER);

	return 0;
}

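/*
 * Swap in a new heartbeat interval, returning the old one. A non-zero
 * interval (re)starts the worker; zero parks it.
 */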
static unsigned long set_heartbeat(struct intel_engine_cs *engine,
				   unsigned long delay)
{
	unsigned long old;

	old = xchg(&engine->props.heartbeat_interval_ms, delay);
	if (delay)
		intel_engine_unpark_heartbeat(engine);
	else
		intel_engine_park_heartbeat(engine);

	return old;
}

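/*
 * Update the heartbeat interval for an engine. When the interval actually
 * changes, a pulse is sent so that whatever is executing is re-evaluated
 * under the new setting; if that pulse cannot be emitted, the previous
 * interval is restored. Disabling the heartbeat is refused unless the
 * engine supports preempt-to-reset.
 */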
int intel_engine_set_heartbeat(struct intel_engine_cs *engine,
			       unsigned long delay)
{
	struct intel_context *ce = engine->kernel_context;
	int err = 0;

	if (!delay && !intel_engine_has_preempt_reset(engine))
		return -ENODEV;

	intel_engine_pm_get(engine);

	err = mutex_lock_interruptible(&ce->timeline->mutex);
	if (err)
		goto out_rpm;

	if (delay != engine->props.heartbeat_interval_ms) {
		unsigned long saved = set_heartbeat(engine, delay);

		/* recheck current execution */
		if (intel_engine_has_preemption(engine)) {
			err = __intel_engine_pulse(engine);
			if (err)
				set_heartbeat(engine, saved);
		}
	}

	mutex_unlock(&ce->timeline->mutex);

out_rpm:
	intel_engine_pm_put(engine);
	return err;
}

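/*
 * Send a barrier-priority pulse from process context to preempt whatever is
 * currently running on the engine. Returns 0 if the engine is parked
 * (nothing to preempt), -ENODEV if the engine lacks preemption, -EINTR if
 * interrupted while waiting for the kernel timeline mutex, otherwise the
 * result of emitting the pulse.
 */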
int intel_engine_pulse(struct intel_engine_cs *engine)
{
	struct intel_context *ce = engine->kernel_context;
	int err;

	if (!intel_engine_has_preemption(engine))
		return -ENODEV;

	if (!intel_engine_pm_get_if_awake(engine))
		return 0;

	err = -EINTR;
	if (!mutex_lock_interruptible(&ce->timeline->mutex)) {
		err = __intel_engine_pulse(engine);
		mutex_unlock(&ce->timeline->mutex);
	}

	intel_engine_flush_submission(engine);
	intel_engine_pm_put(engine);
	return err;
}

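/*
 * Flush any pending idle-barrier tasks by wrapping them in a minimum
 * priority kernel-context request. Does nothing if no barriers are queued
 * or the engine is already parked.
 */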
int intel_engine_flush_barriers(struct intel_engine_cs *engine)
{
	struct i915_sched_attr attr = { .priority = I915_PRIORITY_MIN };
	struct intel_context *ce = engine->kernel_context;
	struct i915_request *rq;
	int err;

	if (llist_empty(&engine->barrier_tasks))
		return 0;

	if (!intel_engine_pm_get_if_awake(engine))
		return 0;

	if (mutex_lock_interruptible(&ce->timeline->mutex)) {
		err = -EINTR;
		goto out_rpm;
	}

	rq = heartbeat_create(ce, GFP_KERNEL);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_unlock;
	}

	heartbeat_commit(rq, &attr);

	err = 0;
out_unlock:
	mutex_unlock(&ce->timeline->mutex);
out_rpm:
	intel_engine_pm_put(engine);
	return err;
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftest_engine_heartbeat.c"
#endif