cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

i915_request.c (71365B)


      1/*
      2 * Copyright © 2016 Intel Corporation
      3 *
      4 * Permission is hereby granted, free of charge, to any person obtaining a
      5 * copy of this software and associated documentation files (the "Software"),
      6 * to deal in the Software without restriction, including without limitation
      7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8 * and/or sell copies of the Software, and to permit persons to whom the
      9 * Software is furnished to do so, subject to the following conditions:
     10 *
     11 * The above copyright notice and this permission notice (including the next
     12 * paragraph) shall be included in all copies or substantial portions of the
     13 * Software.
     14 *
     15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     21 * IN THE SOFTWARE.
     22 *
     23 */
     24
     25#include <linux/prime_numbers.h>
     26#include <linux/pm_qos.h>
     27#include <linux/sort.h>
     28
     29#include "gem/i915_gem_internal.h"
     30#include "gem/i915_gem_pm.h"
     31#include "gem/selftests/mock_context.h"
     32
     33#include "gt/intel_engine_heartbeat.h"
     34#include "gt/intel_engine_pm.h"
     35#include "gt/intel_engine_user.h"
     36#include "gt/intel_gt.h"
     37#include "gt/intel_gt_clock_utils.h"
     38#include "gt/intel_gt_requests.h"
     39#include "gt/selftest_engine_heartbeat.h"
     40
     41#include "i915_random.h"
     42#include "i915_selftest.h"
     43#include "igt_flush_test.h"
     44#include "igt_live_test.h"
     45#include "igt_spinner.h"
     46#include "lib_sw_fence.h"
     47
     48#include "mock_drm.h"
     49#include "mock_gem_device.h"
     50
     51static unsigned int num_uabi_engines(struct drm_i915_private *i915)
     52{
     53	struct intel_engine_cs *engine;
     54	unsigned int count;
     55
     56	count = 0;
     57	for_each_uabi_engine(engine, i915)
     58		count++;
     59
     60	return count;
     61}
     62
     63static struct intel_engine_cs *rcs0(struct drm_i915_private *i915)
     64{
     65	return intel_engine_lookup_user(i915, I915_ENGINE_CLASS_RENDER, 0);
     66}
     67
     68static int igt_add_request(void *arg)
     69{
     70	struct drm_i915_private *i915 = arg;
     71	struct i915_request *request;
     72
     73	/* Basic preliminary test to create a request and let it loose! */
     74
     75	request = mock_request(rcs0(i915)->kernel_context, HZ / 10);
     76	if (!request)
     77		return -ENOMEM;
     78
     79	i915_request_add(request);
     80
     81	return 0;
     82}
     83
     84static int igt_wait_request(void *arg)
     85{
     86	const long T = HZ / 4;
     87	struct drm_i915_private *i915 = arg;
     88	struct i915_request *request;
     89	int err = -EINVAL;
     90
     91	/* Submit a request, then wait upon it */
     92
     93	request = mock_request(rcs0(i915)->kernel_context, T);
     94	if (!request)
     95		return -ENOMEM;
     96
     97	i915_request_get(request);
     98
     99	if (i915_request_wait(request, 0, 0) != -ETIME) {
    100		pr_err("request wait (busy query) succeeded (expected timeout before submit!)\n");
    101		goto out_request;
    102	}
    103
    104	if (i915_request_wait(request, 0, T) != -ETIME) {
    105		pr_err("request wait succeeded (expected timeout before submit!)\n");
    106		goto out_request;
    107	}
    108
    109	if (i915_request_completed(request)) {
    110		pr_err("request completed before submit!!\n");
    111		goto out_request;
    112	}
    113
    114	i915_request_add(request);
    115
    116	if (i915_request_wait(request, 0, 0) != -ETIME) {
    117		pr_err("request wait (busy query) succeeded (expected timeout after submit!)\n");
    118		goto out_request;
    119	}
    120
    121	if (i915_request_completed(request)) {
    122		pr_err("request completed immediately!\n");
    123		goto out_request;
    124	}
    125
    126	if (i915_request_wait(request, 0, T / 2) != -ETIME) {
    127		pr_err("request wait succeeded (expected timeout!)\n");
    128		goto out_request;
    129	}
    130
    131	if (i915_request_wait(request, 0, T) == -ETIME) {
    132		pr_err("request wait timed out!\n");
    133		goto out_request;
    134	}
    135
    136	if (!i915_request_completed(request)) {
    137		pr_err("request not complete after waiting!\n");
    138		goto out_request;
    139	}
    140
    141	if (i915_request_wait(request, 0, T) == -ETIME) {
    142		pr_err("request wait timed out when already complete!\n");
    143		goto out_request;
    144	}
    145
    146	err = 0;
    147out_request:
    148	i915_request_put(request);
    149	mock_device_flush(i915);
    150	return err;
    151}
    152
    153static int igt_fence_wait(void *arg)
    154{
    155	const long T = HZ / 4;
    156	struct drm_i915_private *i915 = arg;
    157	struct i915_request *request;
    158	int err = -EINVAL;
    159
    160	/* Submit a request, treat it as a fence and wait upon it */
    161
    162	request = mock_request(rcs0(i915)->kernel_context, T);
    163	if (!request)
    164		return -ENOMEM;
    165
    166	if (dma_fence_wait_timeout(&request->fence, false, T) != -ETIME) {
    167		pr_err("fence wait success before submit (expected timeout)!\n");
    168		goto out;
    169	}
    170
    171	i915_request_add(request);
    172
    173	if (dma_fence_is_signaled(&request->fence)) {
    174		pr_err("fence signaled immediately!\n");
    175		goto out;
    176	}
    177
    178	if (dma_fence_wait_timeout(&request->fence, false, T / 2) != -ETIME) {
    179		pr_err("fence wait success after submit (expected timeout)!\n");
    180		goto out;
    181	}
    182
    183	if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
    184		pr_err("fence wait timed out (expected success)!\n");
    185		goto out;
    186	}
    187
    188	if (!dma_fence_is_signaled(&request->fence)) {
    189		pr_err("fence unsignaled after waiting!\n");
    190		goto out;
    191	}
    192
    193	if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
    194		pr_err("fence wait timed out when complete (expected success)!\n");
    195		goto out;
    196	}
    197
    198	err = 0;
    199out:
    200	mock_device_flush(i915);
    201	return err;
    202}
    203
    204static int igt_request_rewind(void *arg)
    205{
    206	struct drm_i915_private *i915 = arg;
    207	struct i915_request *request, *vip;
    208	struct i915_gem_context *ctx[2];
    209	struct intel_context *ce;
    210	int err = -EINVAL;
    211
    212	ctx[0] = mock_context(i915, "A");
    213	if (!ctx[0]) {
    214		err = -ENOMEM;
    215		goto err_ctx_0;
    216	}
    217
    218	ce = i915_gem_context_get_engine(ctx[0], RCS0);
    219	GEM_BUG_ON(IS_ERR(ce));
    220	request = mock_request(ce, 2 * HZ);
    221	intel_context_put(ce);
    222	if (!request) {
    223		err = -ENOMEM;
    224		goto err_context_0;
    225	}
    226
    227	i915_request_get(request);
    228	i915_request_add(request);
    229
    230	ctx[1] = mock_context(i915, "B");
    231	if (!ctx[1]) {
    232		err = -ENOMEM;
    233		goto err_ctx_1;
    234	}
    235
    236	ce = i915_gem_context_get_engine(ctx[1], RCS0);
    237	GEM_BUG_ON(IS_ERR(ce));
    238	vip = mock_request(ce, 0);
    239	intel_context_put(ce);
    240	if (!vip) {
    241		err = -ENOMEM;
    242		goto err_context_1;
    243	}
    244
    245	/* Simulate preemption by manual reordering */
    246	if (!mock_cancel_request(request)) {
    247		pr_err("failed to cancel request (already executed)!\n");
    248		i915_request_add(vip);
    249		goto err_context_1;
    250	}
    251	i915_request_get(vip);
    252	i915_request_add(vip);
    253	rcu_read_lock();
    254	request->engine->submit_request(request);
    255	rcu_read_unlock();
    256
    257
    258	if (i915_request_wait(vip, 0, HZ) == -ETIME) {
    259		pr_err("timed out waiting for high priority request\n");
    260		goto err;
    261	}
    262
    263	if (i915_request_completed(request)) {
    264		pr_err("low priority request already completed\n");
    265		goto err;
    266	}
    267
    268	err = 0;
    269err:
    270	i915_request_put(vip);
    271err_context_1:
    272	mock_context_close(ctx[1]);
    273err_ctx_1:
    274	i915_request_put(request);
    275err_context_0:
    276	mock_context_close(ctx[0]);
    277err_ctx_0:
    278	mock_device_flush(i915);
    279	return err;
    280}
    281
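/*
 * Parameters for the breadcrumb smoketests: the engine under test, a pool
 * of contexts to spread requests across, counters for how many waits and
 * fences were exercised, and a request_alloc() hook so the same worker
 * loop can drive either mock requests or real ones (see
 * __mock_request_alloc() and __live_request_alloc() below).
 */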
    282struct smoketest {
    283	struct intel_engine_cs *engine;
    284	struct i915_gem_context **contexts;
    285	atomic_long_t num_waits, num_fences;
    286	int ncontexts, max_batch;
    287	struct i915_request *(*request_alloc)(struct intel_context *ce);
    288};
    289
    290static struct i915_request *
    291__mock_request_alloc(struct intel_context *ce)
    292{
    293	return mock_request(ce, 0);
    294}
    295
    296static struct i915_request *
    297__live_request_alloc(struct intel_context *ce)
    298{
    299	return intel_context_create_request(ce);
    300}
    301
    302static int __igt_breadcrumbs_smoketest(void *arg)
    303{
    304	struct smoketest *t = arg;
    305	const unsigned int max_batch = min(t->ncontexts, t->max_batch) - 1;
    306	const unsigned int total = 4 * t->ncontexts + 1;
    307	unsigned int num_waits = 0, num_fences = 0;
    308	struct i915_request **requests;
    309	I915_RND_STATE(prng);
    310	unsigned int *order;
    311	int err = 0;
    312
    313	/*
    314	 * A very simple test to catch the most egregious of list handling bugs.
    315	 *
    316	 * At its heart, we simply create oodles of requests running across
    317	 * multiple kthreads and enable signaling on them, for the sole purpose
    318	 * of stressing our breadcrumb handling. The only inspection we do is
    319	 * that the fences were marked as signaled.
    320	 */
    321
    322	requests = kcalloc(total, sizeof(*requests), GFP_KERNEL);
    323	if (!requests)
    324		return -ENOMEM;
    325
    326	order = i915_random_order(total, &prng);
    327	if (!order) {
    328		err = -ENOMEM;
    329		goto out_requests;
    330	}
    331
    332	while (!kthread_should_stop()) {
    333		struct i915_sw_fence *submit, *wait;
    334		unsigned int n, count;
    335
    336		submit = heap_fence_create(GFP_KERNEL);
    337		if (!submit) {
    338			err = -ENOMEM;
    339			break;
    340		}
    341
    342		wait = heap_fence_create(GFP_KERNEL);
    343		if (!wait) {
    344			i915_sw_fence_commit(submit);
    345			heap_fence_put(submit);
    346			err = -ENOMEM;
    347			break;
    348		}
    349
    350		i915_random_reorder(order, total, &prng);
    351		count = 1 + i915_prandom_u32_max_state(max_batch, &prng);
    352
    353		for (n = 0; n < count; n++) {
    354			struct i915_gem_context *ctx =
    355				t->contexts[order[n] % t->ncontexts];
    356			struct i915_request *rq;
    357			struct intel_context *ce;
    358
    359			ce = i915_gem_context_get_engine(ctx, t->engine->legacy_idx);
    360			GEM_BUG_ON(IS_ERR(ce));
    361			rq = t->request_alloc(ce);
    362			intel_context_put(ce);
    363			if (IS_ERR(rq)) {
    364				err = PTR_ERR(rq);
    365				count = n;
    366				break;
    367			}
    368
    369			err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
    370							       submit,
    371							       GFP_KERNEL);
    372
    373			requests[n] = i915_request_get(rq);
    374			i915_request_add(rq);
    375
    376			if (err >= 0)
    377				err = i915_sw_fence_await_dma_fence(wait,
    378								    &rq->fence,
    379								    0,
    380								    GFP_KERNEL);
    381
    382			if (err < 0) {
    383				i915_request_put(rq);
    384				count = n;
    385				break;
    386			}
    387		}
    388
    389		i915_sw_fence_commit(submit);
    390		i915_sw_fence_commit(wait);
    391
    392		if (!wait_event_timeout(wait->wait,
    393					i915_sw_fence_done(wait),
    394					5 * HZ)) {
    395			struct i915_request *rq = requests[count - 1];
    396
    397			pr_err("waiting for %d/%d fences (last %llx:%lld) on %s timed out!\n",
    398			       atomic_read(&wait->pending), count,
    399			       rq->fence.context, rq->fence.seqno,
    400			       t->engine->name);
    401			GEM_TRACE_DUMP();
    402
    403			intel_gt_set_wedged(t->engine->gt);
    404			GEM_BUG_ON(!i915_request_completed(rq));
    405			i915_sw_fence_wait(wait);
    406			err = -EIO;
    407		}
    408
    409		for (n = 0; n < count; n++) {
    410			struct i915_request *rq = requests[n];
    411
    412			if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
    413				      &rq->fence.flags)) {
    414				pr_err("%llu:%llu was not signaled!\n",
    415				       rq->fence.context, rq->fence.seqno);
    416				err = -EINVAL;
    417			}
    418
    419			i915_request_put(rq);
    420		}
    421
    422		heap_fence_put(wait);
    423		heap_fence_put(submit);
    424
    425		if (err < 0)
    426			break;
    427
    428		num_fences += count;
    429		num_waits++;
    430
    431		cond_resched();
    432	}
    433
    434	atomic_long_add(num_fences, &t->num_fences);
    435	atomic_long_add(num_waits, &t->num_waits);
    436
    437	kfree(order);
    438out_requests:
    439	kfree(requests);
    440	return err;
    441}
    442
    443static int mock_breadcrumbs_smoketest(void *arg)
    444{
    445	struct drm_i915_private *i915 = arg;
    446	struct smoketest t = {
    447		.engine = rcs0(i915),
    448		.ncontexts = 1024,
    449		.max_batch = 1024,
    450		.request_alloc = __mock_request_alloc
    451	};
    452	unsigned int ncpus = num_online_cpus();
    453	struct task_struct **threads;
    454	unsigned int n;
    455	int ret = 0;
    456
    457	/*
    458	 * Smoketest our breadcrumb/signal handling for requests across multiple
    459	 * threads. A very simple test to only catch the most egregious of bugs.
    460	 * See __igt_breadcrumbs_smoketest();
    461	 */
    462
    463	threads = kcalloc(ncpus, sizeof(*threads), GFP_KERNEL);
    464	if (!threads)
    465		return -ENOMEM;
    466
    467	t.contexts = kcalloc(t.ncontexts, sizeof(*t.contexts), GFP_KERNEL);
    468	if (!t.contexts) {
    469		ret = -ENOMEM;
    470		goto out_threads;
    471	}
    472
    473	for (n = 0; n < t.ncontexts; n++) {
    474		t.contexts[n] = mock_context(t.engine->i915, "mock");
    475		if (!t.contexts[n]) {
    476			ret = -ENOMEM;
    477			goto out_contexts;
    478		}
    479	}
    480
    481	for (n = 0; n < ncpus; n++) {
    482		threads[n] = kthread_run(__igt_breadcrumbs_smoketest,
    483					 &t, "igt/%d", n);
    484		if (IS_ERR(threads[n])) {
    485			ret = PTR_ERR(threads[n]);
    486			ncpus = n;
    487			break;
    488		}
    489
    490		get_task_struct(threads[n]);
    491	}
    492
    493	yield(); /* start all threads before we begin */
    494	msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));
    495
    496	for (n = 0; n < ncpus; n++) {
    497		int err;
    498
    499		err = kthread_stop(threads[n]);
    500		if (err < 0 && !ret)
    501			ret = err;
    502
    503		put_task_struct(threads[n]);
    504	}
     505	pr_info("Completed %lu waits for %lu fences across %d cpus\n",
    506		atomic_long_read(&t.num_waits),
    507		atomic_long_read(&t.num_fences),
    508		ncpus);
    509
    510out_contexts:
    511	for (n = 0; n < t.ncontexts; n++) {
    512		if (!t.contexts[n])
    513			break;
    514		mock_context_close(t.contexts[n]);
    515	}
    516	kfree(t.contexts);
    517out_threads:
    518	kfree(threads);
    519	return ret;
    520}
    521
    522int i915_request_mock_selftests(void)
    523{
    524	static const struct i915_subtest tests[] = {
    525		SUBTEST(igt_add_request),
    526		SUBTEST(igt_wait_request),
    527		SUBTEST(igt_fence_wait),
    528		SUBTEST(igt_request_rewind),
    529		SUBTEST(mock_breadcrumbs_smoketest),
    530	};
    531	struct drm_i915_private *i915;
    532	intel_wakeref_t wakeref;
    533	int err = 0;
    534
    535	i915 = mock_gem_device();
    536	if (!i915)
    537		return -ENOMEM;
    538
    539	with_intel_runtime_pm(&i915->runtime_pm, wakeref)
    540		err = i915_subtests(tests, i915);
    541
    542	mock_destroy_device(i915);
    543
    544	return err;
    545}
    546
    547static int live_nop_request(void *arg)
    548{
    549	struct drm_i915_private *i915 = arg;
    550	struct intel_engine_cs *engine;
    551	struct igt_live_test t;
    552	int err = -ENODEV;
    553
    554	/*
    555	 * Submit various sized batches of empty requests, to each engine
    556	 * (individually), and wait for the batch to complete. We can check
    557	 * the overhead of submitting requests to the hardware.
    558	 */
    559
    560	for_each_uabi_engine(engine, i915) {
    561		unsigned long n, prime;
    562		IGT_TIMEOUT(end_time);
    563		ktime_t times[2] = {};
    564
    565		err = igt_live_test_begin(&t, i915, __func__, engine->name);
    566		if (err)
    567			return err;
    568
    569		intel_engine_pm_get(engine);
    570		for_each_prime_number_from(prime, 1, 8192) {
    571			struct i915_request *request = NULL;
    572
    573			times[1] = ktime_get_raw();
    574
    575			for (n = 0; n < prime; n++) {
    576				i915_request_put(request);
    577				request = i915_request_create(engine->kernel_context);
    578				if (IS_ERR(request))
    579					return PTR_ERR(request);
    580
    581				/*
    582				 * This space is left intentionally blank.
    583				 *
    584				 * We do not actually want to perform any
    585				 * action with this request, we just want
    586				 * to measure the latency in allocation
    587				 * and submission of our breadcrumbs -
    588				 * ensuring that the bare request is sufficient
    589				 * for the system to work (i.e. proper HEAD
    590				 * tracking of the rings, interrupt handling,
    591				 * etc). It also gives us the lowest bounds
    592				 * for latency.
    593				 */
    594
    595				i915_request_get(request);
    596				i915_request_add(request);
    597			}
    598			i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
    599			i915_request_put(request);
    600
    601			times[1] = ktime_sub(ktime_get_raw(), times[1]);
    602			if (prime == 1)
    603				times[0] = times[1];
    604
    605			if (__igt_timeout(end_time, NULL))
    606				break;
    607		}
    608		intel_engine_pm_put(engine);
    609
    610		err = igt_live_test_end(&t);
    611		if (err)
    612			return err;
    613
    614		pr_info("Request latencies on %s: 1 = %lluns, %lu = %lluns\n",
    615			engine->name,
    616			ktime_to_ns(times[0]),
    617			prime, div64_u64(ktime_to_ns(times[1]), prime));
    618	}
    619
    620	return err;
    621}
    622
    623static int __cancel_inactive(struct intel_engine_cs *engine)
    624{
    625	struct intel_context *ce;
    626	struct igt_spinner spin;
    627	struct i915_request *rq;
    628	int err = 0;
    629
    630	if (igt_spinner_init(&spin, engine->gt))
    631		return -ENOMEM;
    632
    633	ce = intel_context_create(engine);
    634	if (IS_ERR(ce)) {
    635		err = PTR_ERR(ce);
    636		goto out_spin;
    637	}
    638
    639	rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
    640	if (IS_ERR(rq)) {
    641		err = PTR_ERR(rq);
    642		goto out_ce;
    643	}
    644
    645	pr_debug("%s: Cancelling inactive request\n", engine->name);
    646	i915_request_cancel(rq, -EINTR);
    647	i915_request_get(rq);
    648	i915_request_add(rq);
    649
    650	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
    651		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
    652
    653		pr_err("%s: Failed to cancel inactive request\n", engine->name);
    654		intel_engine_dump(engine, &p, "%s\n", engine->name);
    655		err = -ETIME;
    656		goto out_rq;
    657	}
    658
    659	if (rq->fence.error != -EINTR) {
    660		pr_err("%s: fence not cancelled (%u)\n",
    661		       engine->name, rq->fence.error);
    662		err = -EINVAL;
    663	}
    664
    665out_rq:
    666	i915_request_put(rq);
    667out_ce:
    668	intel_context_put(ce);
    669out_spin:
    670	igt_spinner_fini(&spin);
    671	if (err)
    672		pr_err("%s: %s error %d\n", __func__, engine->name, err);
    673	return err;
    674}
    675
    676static int __cancel_active(struct intel_engine_cs *engine)
    677{
    678	struct intel_context *ce;
    679	struct igt_spinner spin;
    680	struct i915_request *rq;
    681	int err = 0;
    682
    683	if (igt_spinner_init(&spin, engine->gt))
    684		return -ENOMEM;
    685
    686	ce = intel_context_create(engine);
    687	if (IS_ERR(ce)) {
    688		err = PTR_ERR(ce);
    689		goto out_spin;
    690	}
    691
    692	rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
    693	if (IS_ERR(rq)) {
    694		err = PTR_ERR(rq);
    695		goto out_ce;
    696	}
    697
    698	pr_debug("%s: Cancelling active request\n", engine->name);
    699	i915_request_get(rq);
    700	i915_request_add(rq);
    701	if (!igt_wait_for_spinner(&spin, rq)) {
    702		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
    703
    704		pr_err("Failed to start spinner on %s\n", engine->name);
    705		intel_engine_dump(engine, &p, "%s\n", engine->name);
    706		err = -ETIME;
    707		goto out_rq;
    708	}
    709	i915_request_cancel(rq, -EINTR);
    710
    711	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
    712		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
    713
    714		pr_err("%s: Failed to cancel active request\n", engine->name);
    715		intel_engine_dump(engine, &p, "%s\n", engine->name);
    716		err = -ETIME;
    717		goto out_rq;
    718	}
    719
    720	if (rq->fence.error != -EINTR) {
    721		pr_err("%s: fence not cancelled (%u)\n",
    722		       engine->name, rq->fence.error);
    723		err = -EINVAL;
    724	}
    725
    726out_rq:
    727	i915_request_put(rq);
    728out_ce:
    729	intel_context_put(ce);
    730out_spin:
    731	igt_spinner_fini(&spin);
    732	if (err)
    733		pr_err("%s: %s error %d\n", __func__, engine->name, err);
    734	return err;
    735}
    736
    737static int __cancel_completed(struct intel_engine_cs *engine)
    738{
    739	struct intel_context *ce;
    740	struct igt_spinner spin;
    741	struct i915_request *rq;
    742	int err = 0;
    743
    744	if (igt_spinner_init(&spin, engine->gt))
    745		return -ENOMEM;
    746
    747	ce = intel_context_create(engine);
    748	if (IS_ERR(ce)) {
    749		err = PTR_ERR(ce);
    750		goto out_spin;
    751	}
    752
    753	rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
    754	if (IS_ERR(rq)) {
    755		err = PTR_ERR(rq);
    756		goto out_ce;
    757	}
    758	igt_spinner_end(&spin);
    759	i915_request_get(rq);
    760	i915_request_add(rq);
    761
    762	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
    763		err = -ETIME;
    764		goto out_rq;
    765	}
    766
    767	pr_debug("%s: Cancelling completed request\n", engine->name);
    768	i915_request_cancel(rq, -EINTR);
    769	if (rq->fence.error) {
    770		pr_err("%s: fence not cancelled (%u)\n",
    771		       engine->name, rq->fence.error);
    772		err = -EINVAL;
    773	}
    774
    775out_rq:
    776	i915_request_put(rq);
    777out_ce:
    778	intel_context_put(ce);
    779out_spin:
    780	igt_spinner_fini(&spin);
    781	if (err)
    782		pr_err("%s: %s error %d\n", __func__, engine->name, err);
    783	return err;
    784}
    785
    786/*
     787 * Test to prove a non-preemptible request can be cancelled and a subsequent
    788 * request on the same context can successfully complete after cancellation.
    789 *
    790 * Testing methodology is to create a non-preemptible request and submit it,
    791 * wait for spinner to start, create a NOP request and submit it, cancel the
    792 * spinner, wait for spinner to complete and verify it failed with an error,
     793 * finally wait for the NOP request to complete and verify it succeeded without
     794 * an error. The preemption timeout is also reduced / restored so the test runs
     795 * in a timely manner.
    796 */
    797static int __cancel_reset(struct drm_i915_private *i915,
    798			  struct intel_engine_cs *engine)
    799{
    800	struct intel_context *ce;
    801	struct igt_spinner spin;
    802	struct i915_request *rq, *nop;
    803	unsigned long preempt_timeout_ms;
    804	int err = 0;
    805
    806	if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT ||
    807	    !intel_has_reset_engine(engine->gt))
    808		return 0;
    809
    810	preempt_timeout_ms = engine->props.preempt_timeout_ms;
    811	engine->props.preempt_timeout_ms = 100;
    812
    813	if (igt_spinner_init(&spin, engine->gt))
    814		goto out_restore;
    815
    816	ce = intel_context_create(engine);
    817	if (IS_ERR(ce)) {
    818		err = PTR_ERR(ce);
    819		goto out_spin;
    820	}
    821
    822	rq = igt_spinner_create_request(&spin, ce, MI_NOOP);
    823	if (IS_ERR(rq)) {
    824		err = PTR_ERR(rq);
    825		goto out_ce;
    826	}
    827
    828	pr_debug("%s: Cancelling active non-preemptable request\n",
    829		 engine->name);
    830	i915_request_get(rq);
    831	i915_request_add(rq);
    832	if (!igt_wait_for_spinner(&spin, rq)) {
    833		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
    834
    835		pr_err("Failed to start spinner on %s\n", engine->name);
    836		intel_engine_dump(engine, &p, "%s\n", engine->name);
    837		err = -ETIME;
    838		goto out_rq;
    839	}
    840
    841	nop = intel_context_create_request(ce);
     842	if (IS_ERR(nop)) {
     843		err = PTR_ERR(nop);
		goto out_rq;
	}
    844	i915_request_get(nop);
    845	i915_request_add(nop);
    846
    847	i915_request_cancel(rq, -EINTR);
    848
    849	if (i915_request_wait(rq, 0, HZ) < 0) {
    850		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
    851
    852		pr_err("%s: Failed to cancel hung request\n", engine->name);
    853		intel_engine_dump(engine, &p, "%s\n", engine->name);
    854		err = -ETIME;
    855		goto out_nop;
    856	}
    857
    858	if (rq->fence.error != -EINTR) {
    859		pr_err("%s: fence not cancelled (%u)\n",
    860		       engine->name, rq->fence.error);
    861		err = -EINVAL;
    862		goto out_nop;
    863	}
    864
    865	if (i915_request_wait(nop, 0, HZ) < 0) {
    866		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
    867
    868		pr_err("%s: Failed to complete nop request\n", engine->name);
    869		intel_engine_dump(engine, &p, "%s\n", engine->name);
    870		err = -ETIME;
    871		goto out_nop;
    872	}
    873
    874	if (nop->fence.error != 0) {
    875		pr_err("%s: Nop request errored (%u)\n",
    876		       engine->name, nop->fence.error);
    877		err = -EINVAL;
    878	}
    879
    880out_nop:
    881	i915_request_put(nop);
    882out_rq:
    883	i915_request_put(rq);
    884out_ce:
    885	intel_context_put(ce);
    886out_spin:
    887	igt_spinner_fini(&spin);
    888out_restore:
    889	engine->props.preempt_timeout_ms = preempt_timeout_ms;
    890	if (err)
    891		pr_err("%s: %s error %d\n", __func__, engine->name, err);
    892	return err;
    893}
    894
    895static int live_cancel_request(void *arg)
    896{
    897	struct drm_i915_private *i915 = arg;
    898	struct intel_engine_cs *engine;
    899
    900	/*
    901	 * Check cancellation of requests. We expect to be able to immediately
    902	 * cancel active requests, even if they are currently on the GPU.
    903	 */
    904
    905	for_each_uabi_engine(engine, i915) {
    906		struct igt_live_test t;
    907		int err, err2;
    908
    909		if (!intel_engine_has_preemption(engine))
    910			continue;
    911
    912		err = igt_live_test_begin(&t, i915, __func__, engine->name);
    913		if (err)
    914			return err;
    915
    916		err = __cancel_inactive(engine);
    917		if (err == 0)
    918			err = __cancel_active(engine);
    919		if (err == 0)
    920			err = __cancel_completed(engine);
    921
    922		err2 = igt_live_test_end(&t);
    923		if (err)
    924			return err;
    925		if (err2)
    926			return err2;
    927
    928		/* Expects reset so call outside of igt_live_test_* */
    929		err = __cancel_reset(i915, engine);
    930		if (err)
    931			return err;
    932
    933		if (igt_flush_test(i915))
    934			return -EIO;
    935	}
    936
    937	return 0;
    938}
    939
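/*
 * Build the smallest possible batch: a single MI_BATCH_BUFFER_END, pinned
 * into the global GTT so empty_request() can submit it with
 * I915_DISPATCH_SECURE. The only work it represents is the submission
 * overhead being measured.
 */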
    940static struct i915_vma *empty_batch(struct drm_i915_private *i915)
    941{
    942	struct drm_i915_gem_object *obj;
    943	struct i915_vma *vma;
    944	u32 *cmd;
    945	int err;
    946
    947	obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
    948	if (IS_ERR(obj))
    949		return ERR_CAST(obj);
    950
    951	cmd = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WB);
    952	if (IS_ERR(cmd)) {
    953		err = PTR_ERR(cmd);
    954		goto err;
    955	}
    956
    957	*cmd = MI_BATCH_BUFFER_END;
    958
    959	__i915_gem_object_flush_map(obj, 0, 64);
    960	i915_gem_object_unpin_map(obj);
    961
    962	intel_gt_chipset_flush(to_gt(i915));
    963
    964	vma = i915_vma_instance(obj, &to_gt(i915)->ggtt->vm, NULL);
    965	if (IS_ERR(vma)) {
    966		err = PTR_ERR(vma);
    967		goto err;
    968	}
    969
    970	err = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_GLOBAL);
    971	if (err)
    972		goto err;
    973
     974	/* Force the wait now to avoid including it in the benchmark */
    975	err = i915_vma_sync(vma);
    976	if (err)
    977		goto err_pin;
    978
    979	return vma;
    980
    981err_pin:
    982	i915_vma_unpin(vma);
    983err:
    984	i915_gem_object_put(obj);
    985	return ERR_PTR(err);
    986}
    987
    988static struct i915_request *
    989empty_request(struct intel_engine_cs *engine,
    990	      struct i915_vma *batch)
    991{
    992	struct i915_request *request;
    993	int err;
    994
    995	request = i915_request_create(engine->kernel_context);
    996	if (IS_ERR(request))
    997		return request;
    998
    999	err = engine->emit_bb_start(request,
   1000				    batch->node.start,
   1001				    batch->node.size,
   1002				    I915_DISPATCH_SECURE);
   1003	if (err)
   1004		goto out_request;
   1005
   1006	i915_request_get(request);
   1007out_request:
   1008	i915_request_add(request);
   1009	return err ? ERR_PTR(err) : request;
   1010}
   1011
   1012static int live_empty_request(void *arg)
   1013{
   1014	struct drm_i915_private *i915 = arg;
   1015	struct intel_engine_cs *engine;
   1016	struct igt_live_test t;
   1017	struct i915_vma *batch;
   1018	int err = 0;
   1019
   1020	/*
   1021	 * Submit various sized batches of empty requests, to each engine
   1022	 * (individually), and wait for the batch to complete. We can check
   1023	 * the overhead of submitting requests to the hardware.
   1024	 */
   1025
   1026	batch = empty_batch(i915);
   1027	if (IS_ERR(batch))
   1028		return PTR_ERR(batch);
   1029
   1030	for_each_uabi_engine(engine, i915) {
   1031		IGT_TIMEOUT(end_time);
   1032		struct i915_request *request;
   1033		unsigned long n, prime;
   1034		ktime_t times[2] = {};
   1035
   1036		err = igt_live_test_begin(&t, i915, __func__, engine->name);
   1037		if (err)
   1038			goto out_batch;
   1039
   1040		intel_engine_pm_get(engine);
   1041
   1042		/* Warmup / preload */
   1043		request = empty_request(engine, batch);
   1044		if (IS_ERR(request)) {
   1045			err = PTR_ERR(request);
   1046			intel_engine_pm_put(engine);
   1047			goto out_batch;
   1048		}
   1049		i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
   1050
   1051		for_each_prime_number_from(prime, 1, 8192) {
   1052			times[1] = ktime_get_raw();
   1053
   1054			for (n = 0; n < prime; n++) {
   1055				i915_request_put(request);
   1056				request = empty_request(engine, batch);
   1057				if (IS_ERR(request)) {
   1058					err = PTR_ERR(request);
   1059					intel_engine_pm_put(engine);
   1060					goto out_batch;
   1061				}
   1062			}
   1063			i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
   1064
   1065			times[1] = ktime_sub(ktime_get_raw(), times[1]);
   1066			if (prime == 1)
   1067				times[0] = times[1];
   1068
   1069			if (__igt_timeout(end_time, NULL))
   1070				break;
   1071		}
   1072		i915_request_put(request);
   1073		intel_engine_pm_put(engine);
   1074
   1075		err = igt_live_test_end(&t);
   1076		if (err)
   1077			goto out_batch;
   1078
   1079		pr_info("Batch latencies on %s: 1 = %lluns, %lu = %lluns\n",
   1080			engine->name,
   1081			ktime_to_ns(times[0]),
   1082			prime, div64_u64(ktime_to_ns(times[1]), prime));
   1083	}
   1084
   1085out_batch:
   1086	i915_vma_unpin(batch);
   1087	i915_vma_put(batch);
   1088	return err;
   1089}
   1090
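/*
 * Build a batch whose first instruction is an MI_BATCH_BUFFER_START that
 * branches back to its own start, so it spins forever once executed. The
 * loop is broken later by recursive_batch_resolve(), which overwrites that
 * first dword with MI_BATCH_BUFFER_END.
 */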
   1091static struct i915_vma *recursive_batch(struct drm_i915_private *i915)
   1092{
   1093	struct drm_i915_gem_object *obj;
   1094	const int ver = GRAPHICS_VER(i915);
   1095	struct i915_vma *vma;
   1096	u32 *cmd;
   1097	int err;
   1098
   1099	obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
   1100	if (IS_ERR(obj))
   1101		return ERR_CAST(obj);
   1102
   1103	vma = i915_vma_instance(obj, to_gt(i915)->vm, NULL);
   1104	if (IS_ERR(vma)) {
   1105		err = PTR_ERR(vma);
   1106		goto err;
   1107	}
   1108
   1109	err = i915_vma_pin(vma, 0, 0, PIN_USER);
   1110	if (err)
   1111		goto err;
   1112
   1113	cmd = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WC);
   1114	if (IS_ERR(cmd)) {
   1115		err = PTR_ERR(cmd);
   1116		goto err;
   1117	}
   1118
   1119	if (ver >= 8) {
   1120		*cmd++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
   1121		*cmd++ = lower_32_bits(vma->node.start);
   1122		*cmd++ = upper_32_bits(vma->node.start);
   1123	} else if (ver >= 6) {
   1124		*cmd++ = MI_BATCH_BUFFER_START | 1 << 8;
   1125		*cmd++ = lower_32_bits(vma->node.start);
   1126	} else {
   1127		*cmd++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
   1128		*cmd++ = lower_32_bits(vma->node.start);
   1129	}
   1130	*cmd++ = MI_BATCH_BUFFER_END; /* terminate early in case of error */
   1131
   1132	__i915_gem_object_flush_map(obj, 0, 64);
   1133	i915_gem_object_unpin_map(obj);
   1134
   1135	intel_gt_chipset_flush(to_gt(i915));
   1136
   1137	return vma;
   1138
   1139err:
   1140	i915_gem_object_put(obj);
   1141	return ERR_PTR(err);
   1142}
   1143
   1144static int recursive_batch_resolve(struct i915_vma *batch)
   1145{
   1146	u32 *cmd;
   1147
   1148	cmd = i915_gem_object_pin_map_unlocked(batch->obj, I915_MAP_WC);
   1149	if (IS_ERR(cmd))
   1150		return PTR_ERR(cmd);
   1151
   1152	*cmd = MI_BATCH_BUFFER_END;
   1153
   1154	__i915_gem_object_flush_map(batch->obj, 0, sizeof(*cmd));
   1155	i915_gem_object_unpin_map(batch->obj);
   1156
   1157	intel_gt_chipset_flush(batch->vm->gt);
   1158
   1159	return 0;
   1160}
   1161
   1162static int live_all_engines(void *arg)
   1163{
   1164	struct drm_i915_private *i915 = arg;
   1165	const unsigned int nengines = num_uabi_engines(i915);
   1166	struct intel_engine_cs *engine;
   1167	struct i915_request **request;
   1168	struct igt_live_test t;
   1169	struct i915_vma *batch;
   1170	unsigned int idx;
   1171	int err;
   1172
   1173	/*
   1174	 * Check we can submit requests to all engines simultaneously. We
   1175	 * send a recursive batch to each engine - checking that we don't
   1176	 * block doing so, and that they don't complete too soon.
   1177	 */
   1178
   1179	request = kcalloc(nengines, sizeof(*request), GFP_KERNEL);
   1180	if (!request)
   1181		return -ENOMEM;
   1182
   1183	err = igt_live_test_begin(&t, i915, __func__, "");
   1184	if (err)
   1185		goto out_free;
   1186
   1187	batch = recursive_batch(i915);
   1188	if (IS_ERR(batch)) {
   1189		err = PTR_ERR(batch);
   1190		pr_err("%s: Unable to create batch, err=%d\n", __func__, err);
   1191		goto out_free;
   1192	}
   1193
   1194	i915_vma_lock(batch);
   1195
   1196	idx = 0;
   1197	for_each_uabi_engine(engine, i915) {
   1198		request[idx] = intel_engine_create_kernel_request(engine);
   1199		if (IS_ERR(request[idx])) {
   1200			err = PTR_ERR(request[idx]);
   1201			pr_err("%s: Request allocation failed with err=%d\n",
   1202			       __func__, err);
   1203			goto out_request;
   1204		}
   1205
   1206		err = i915_request_await_object(request[idx], batch->obj, 0);
   1207		if (err == 0)
   1208			err = i915_vma_move_to_active(batch, request[idx], 0);
   1209		GEM_BUG_ON(err);
   1210
   1211		err = engine->emit_bb_start(request[idx],
   1212					    batch->node.start,
   1213					    batch->node.size,
   1214					    0);
   1215		GEM_BUG_ON(err);
   1216		request[idx]->batch = batch;
   1217
   1218		i915_request_get(request[idx]);
   1219		i915_request_add(request[idx]);
   1220		idx++;
   1221	}
   1222
   1223	i915_vma_unlock(batch);
   1224
   1225	idx = 0;
   1226	for_each_uabi_engine(engine, i915) {
   1227		if (i915_request_completed(request[idx])) {
   1228			pr_err("%s(%s): request completed too early!\n",
   1229			       __func__, engine->name);
   1230			err = -EINVAL;
   1231			goto out_request;
   1232		}
   1233		idx++;
   1234	}
   1235
   1236	err = recursive_batch_resolve(batch);
   1237	if (err) {
   1238		pr_err("%s: failed to resolve batch, err=%d\n", __func__, err);
   1239		goto out_request;
   1240	}
   1241
   1242	idx = 0;
   1243	for_each_uabi_engine(engine, i915) {
   1244		long timeout;
   1245
   1246		timeout = i915_request_wait(request[idx], 0,
   1247					    MAX_SCHEDULE_TIMEOUT);
   1248		if (timeout < 0) {
   1249			err = timeout;
   1250			pr_err("%s: error waiting for request on %s, err=%d\n",
   1251			       __func__, engine->name, err);
   1252			goto out_request;
   1253		}
   1254
   1255		GEM_BUG_ON(!i915_request_completed(request[idx]));
   1256		i915_request_put(request[idx]);
   1257		request[idx] = NULL;
   1258		idx++;
   1259	}
   1260
   1261	err = igt_live_test_end(&t);
   1262
   1263out_request:
   1264	idx = 0;
   1265	for_each_uabi_engine(engine, i915) {
   1266		if (request[idx])
   1267			i915_request_put(request[idx]);
   1268		idx++;
   1269	}
   1270	i915_vma_unpin(batch);
   1271	i915_vma_put(batch);
   1272out_free:
   1273	kfree(request);
   1274	return err;
   1275}
   1276
   1277static int live_sequential_engines(void *arg)
   1278{
   1279	struct drm_i915_private *i915 = arg;
   1280	const unsigned int nengines = num_uabi_engines(i915);
   1281	struct i915_request **request;
   1282	struct i915_request *prev = NULL;
   1283	struct intel_engine_cs *engine;
   1284	struct igt_live_test t;
   1285	unsigned int idx;
   1286	int err;
   1287
   1288	/*
   1289	 * Check we can submit requests to all engines sequentially, such
   1290	 * that each successive request waits for the earlier ones. This
   1291	 * tests that we don't execute requests out of order, even though
   1292	 * they are running on independent engines.
   1293	 */
   1294
   1295	request = kcalloc(nengines, sizeof(*request), GFP_KERNEL);
   1296	if (!request)
   1297		return -ENOMEM;
   1298
   1299	err = igt_live_test_begin(&t, i915, __func__, "");
   1300	if (err)
   1301		goto out_free;
   1302
   1303	idx = 0;
   1304	for_each_uabi_engine(engine, i915) {
   1305		struct i915_vma *batch;
   1306
   1307		batch = recursive_batch(i915);
   1308		if (IS_ERR(batch)) {
   1309			err = PTR_ERR(batch);
   1310			pr_err("%s: Unable to create batch for %s, err=%d\n",
   1311			       __func__, engine->name, err);
   1312			goto out_free;
   1313		}
   1314
   1315		i915_vma_lock(batch);
   1316		request[idx] = intel_engine_create_kernel_request(engine);
   1317		if (IS_ERR(request[idx])) {
   1318			err = PTR_ERR(request[idx]);
   1319			pr_err("%s: Request allocation failed for %s with err=%d\n",
   1320			       __func__, engine->name, err);
   1321			goto out_unlock;
   1322		}
   1323
   1324		if (prev) {
   1325			err = i915_request_await_dma_fence(request[idx],
   1326							   &prev->fence);
   1327			if (err) {
   1328				i915_request_add(request[idx]);
   1329				pr_err("%s: Request await failed for %s with err=%d\n",
   1330				       __func__, engine->name, err);
   1331				goto out_unlock;
   1332			}
   1333		}
   1334
   1335		err = i915_request_await_object(request[idx],
   1336						batch->obj, false);
   1337		if (err == 0)
   1338			err = i915_vma_move_to_active(batch, request[idx], 0);
   1339		GEM_BUG_ON(err);
   1340
   1341		err = engine->emit_bb_start(request[idx],
   1342					    batch->node.start,
   1343					    batch->node.size,
   1344					    0);
   1345		GEM_BUG_ON(err);
   1346		request[idx]->batch = batch;
   1347
   1348		i915_request_get(request[idx]);
   1349		i915_request_add(request[idx]);
   1350
   1351		prev = request[idx];
   1352		idx++;
   1353
   1354out_unlock:
   1355		i915_vma_unlock(batch);
   1356		if (err)
   1357			goto out_request;
   1358	}
   1359
   1360	idx = 0;
   1361	for_each_uabi_engine(engine, i915) {
   1362		long timeout;
   1363
   1364		if (i915_request_completed(request[idx])) {
   1365			pr_err("%s(%s): request completed too early!\n",
   1366			       __func__, engine->name);
   1367			err = -EINVAL;
   1368			goto out_request;
   1369		}
   1370
   1371		err = recursive_batch_resolve(request[idx]->batch);
   1372		if (err) {
   1373			pr_err("%s: failed to resolve batch, err=%d\n",
   1374			       __func__, err);
   1375			goto out_request;
   1376		}
   1377
   1378		timeout = i915_request_wait(request[idx], 0,
   1379					    MAX_SCHEDULE_TIMEOUT);
   1380		if (timeout < 0) {
   1381			err = timeout;
   1382			pr_err("%s: error waiting for request on %s, err=%d\n",
   1383			       __func__, engine->name, err);
   1384			goto out_request;
   1385		}
   1386
   1387		GEM_BUG_ON(!i915_request_completed(request[idx]));
   1388		idx++;
   1389	}
   1390
   1391	err = igt_live_test_end(&t);
   1392
   1393out_request:
   1394	idx = 0;
   1395	for_each_uabi_engine(engine, i915) {
   1396		u32 *cmd;
   1397
   1398		if (!request[idx])
   1399			break;
   1400
   1401		cmd = i915_gem_object_pin_map_unlocked(request[idx]->batch->obj,
   1402						       I915_MAP_WC);
   1403		if (!IS_ERR(cmd)) {
   1404			*cmd = MI_BATCH_BUFFER_END;
   1405
   1406			__i915_gem_object_flush_map(request[idx]->batch->obj,
   1407						    0, sizeof(*cmd));
   1408			i915_gem_object_unpin_map(request[idx]->batch->obj);
   1409
   1410			intel_gt_chipset_flush(engine->gt);
   1411		}
   1412
   1413		i915_vma_put(request[idx]->batch);
   1414		i915_request_put(request[idx]);
   1415		idx++;
   1416	}
   1417out_free:
   1418	kfree(request);
   1419	return err;
   1420}
   1421
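/*
 * Per-engine workers for live_parallel_engines():
 * __live_parallel_engine1() submits one request at a time and waits for
 * each synchronously, __live_parallel_engineN() floods the engine with
 * requests without waiting, and __live_parallel_spin() occupies each
 * engine with a single long-running spinner for the duration of the test.
 */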
   1422static int __live_parallel_engine1(void *arg)
   1423{
   1424	struct intel_engine_cs *engine = arg;
   1425	IGT_TIMEOUT(end_time);
   1426	unsigned long count;
   1427	int err = 0;
   1428
   1429	count = 0;
   1430	intel_engine_pm_get(engine);
   1431	do {
   1432		struct i915_request *rq;
   1433
   1434		rq = i915_request_create(engine->kernel_context);
   1435		if (IS_ERR(rq)) {
   1436			err = PTR_ERR(rq);
   1437			break;
   1438		}
   1439
   1440		i915_request_get(rq);
   1441		i915_request_add(rq);
   1442
   1443		err = 0;
   1444		if (i915_request_wait(rq, 0, HZ) < 0)
   1445			err = -ETIME;
   1446		i915_request_put(rq);
   1447		if (err)
   1448			break;
   1449
   1450		count++;
   1451	} while (!__igt_timeout(end_time, NULL));
   1452	intel_engine_pm_put(engine);
   1453
   1454	pr_info("%s: %lu request + sync\n", engine->name, count);
   1455	return err;
   1456}
   1457
   1458static int __live_parallel_engineN(void *arg)
   1459{
   1460	struct intel_engine_cs *engine = arg;
   1461	IGT_TIMEOUT(end_time);
   1462	unsigned long count;
   1463	int err = 0;
   1464
   1465	count = 0;
   1466	intel_engine_pm_get(engine);
   1467	do {
   1468		struct i915_request *rq;
   1469
   1470		rq = i915_request_create(engine->kernel_context);
   1471		if (IS_ERR(rq)) {
   1472			err = PTR_ERR(rq);
   1473			break;
   1474		}
   1475
   1476		i915_request_add(rq);
   1477		count++;
   1478	} while (!__igt_timeout(end_time, NULL));
   1479	intel_engine_pm_put(engine);
   1480
   1481	pr_info("%s: %lu requests\n", engine->name, count);
   1482	return err;
   1483}
   1484
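/*
 * A small barrier built on i915->selftest.counter: wake_all() decrements
 * the counter (waking everyone when it hits zero) and wait_for_all()
 * blocks until every engine's worker has checked in or the selftest
 * timeout expires.
 */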
   1485static bool wake_all(struct drm_i915_private *i915)
   1486{
   1487	if (atomic_dec_and_test(&i915->selftest.counter)) {
   1488		wake_up_var(&i915->selftest.counter);
   1489		return true;
   1490	}
   1491
   1492	return false;
   1493}
   1494
   1495static int wait_for_all(struct drm_i915_private *i915)
   1496{
   1497	if (wake_all(i915))
   1498		return 0;
   1499
   1500	if (wait_var_event_timeout(&i915->selftest.counter,
   1501				   !atomic_read(&i915->selftest.counter),
   1502				   i915_selftest.timeout_jiffies))
   1503		return 0;
   1504
   1505	return -ETIME;
   1506}
   1507
   1508static int __live_parallel_spin(void *arg)
   1509{
   1510	struct intel_engine_cs *engine = arg;
   1511	struct igt_spinner spin;
   1512	struct i915_request *rq;
   1513	int err = 0;
   1514
   1515	/*
   1516	 * Create a spinner running for eternity on each engine. If a second
   1517	 * spinner is incorrectly placed on the same engine, it will not be
   1518	 * able to start in time.
   1519	 */
   1520
   1521	if (igt_spinner_init(&spin, engine->gt)) {
   1522		wake_all(engine->i915);
   1523		return -ENOMEM;
   1524	}
   1525
   1526	intel_engine_pm_get(engine);
   1527	rq = igt_spinner_create_request(&spin,
   1528					engine->kernel_context,
   1529					MI_NOOP); /* no preemption */
   1530	intel_engine_pm_put(engine);
   1531	if (IS_ERR(rq)) {
   1532		err = PTR_ERR(rq);
   1533		if (err == -ENODEV)
   1534			err = 0;
   1535		wake_all(engine->i915);
   1536		goto out_spin;
   1537	}
   1538
   1539	i915_request_get(rq);
   1540	i915_request_add(rq);
   1541	if (igt_wait_for_spinner(&spin, rq)) {
   1542		/* Occupy this engine for the whole test */
   1543		err = wait_for_all(engine->i915);
   1544	} else {
   1545		pr_err("Failed to start spinner on %s\n", engine->name);
   1546		err = -EINVAL;
   1547	}
   1548	igt_spinner_end(&spin);
   1549
   1550	if (err == 0 && i915_request_wait(rq, 0, HZ) < 0)
   1551		err = -EIO;
   1552	i915_request_put(rq);
   1553
   1554out_spin:
   1555	igt_spinner_fini(&spin);
   1556	return err;
   1557}
   1558
   1559static int live_parallel_engines(void *arg)
   1560{
   1561	struct drm_i915_private *i915 = arg;
   1562	static int (* const func[])(void *arg) = {
   1563		__live_parallel_engine1,
   1564		__live_parallel_engineN,
   1565		__live_parallel_spin,
   1566		NULL,
   1567	};
   1568	const unsigned int nengines = num_uabi_engines(i915);
   1569	struct intel_engine_cs *engine;
   1570	int (* const *fn)(void *arg);
   1571	struct task_struct **tsk;
   1572	int err = 0;
   1573
   1574	/*
   1575	 * Check we can submit requests to all engines concurrently. This
   1576	 * tests that we load up the system maximally.
   1577	 */
   1578
   1579	tsk = kcalloc(nengines, sizeof(*tsk), GFP_KERNEL);
   1580	if (!tsk)
   1581		return -ENOMEM;
   1582
   1583	for (fn = func; !err && *fn; fn++) {
   1584		char name[KSYM_NAME_LEN];
   1585		struct igt_live_test t;
   1586		unsigned int idx;
   1587
   1588		snprintf(name, sizeof(name), "%ps", *fn);
   1589		err = igt_live_test_begin(&t, i915, __func__, name);
   1590		if (err)
   1591			break;
   1592
   1593		atomic_set(&i915->selftest.counter, nengines);
   1594
   1595		idx = 0;
   1596		for_each_uabi_engine(engine, i915) {
   1597			tsk[idx] = kthread_run(*fn, engine,
   1598					       "igt/parallel:%s",
   1599					       engine->name);
   1600			if (IS_ERR(tsk[idx])) {
   1601				err = PTR_ERR(tsk[idx]);
   1602				break;
   1603			}
   1604			get_task_struct(tsk[idx++]);
   1605		}
   1606
   1607		yield(); /* start all threads before we kthread_stop() */
   1608
   1609		idx = 0;
   1610		for_each_uabi_engine(engine, i915) {
   1611			int status;
   1612
   1613			if (IS_ERR(tsk[idx]))
   1614				break;
   1615
   1616			status = kthread_stop(tsk[idx]);
   1617			if (status && !err)
   1618				err = status;
   1619
   1620			put_task_struct(tsk[idx++]);
   1621		}
   1622
   1623		if (igt_live_test_end(&t))
   1624			err = -EIO;
   1625	}
   1626
   1627	kfree(tsk);
   1628	return err;
   1629}
   1630
   1631static int
   1632max_batches(struct i915_gem_context *ctx, struct intel_engine_cs *engine)
   1633{
   1634	struct i915_request *rq;
   1635	int ret;
   1636
   1637	/*
   1638	 * Before execlists, all contexts share the same ringbuffer. With
   1639	 * execlists, each context/engine has a separate ringbuffer and
   1640	 * for the purposes of this test, inexhaustible.
   1641	 *
   1642	 * For the global ringbuffer though, we have to be very careful
   1643	 * that we do not wrap while preventing the execution of requests
    1644	 * with an unsignaled fence.
   1645	 */
   1646	if (HAS_EXECLISTS(ctx->i915))
   1647		return INT_MAX;
   1648
   1649	rq = igt_request_alloc(ctx, engine);
   1650	if (IS_ERR(rq)) {
   1651		ret = PTR_ERR(rq);
   1652	} else {
   1653		int sz;
   1654
   1655		ret = rq->ring->size - rq->reserved_space;
   1656		i915_request_add(rq);
   1657
   1658		sz = rq->ring->emit - rq->head;
   1659		if (sz < 0)
   1660			sz += rq->ring->size;
   1661		ret /= sz;
   1662		ret /= 2; /* leave half spare, in case of emergency! */
   1663	}
   1664
   1665	return ret;
   1666}
   1667
   1668static int live_breadcrumbs_smoketest(void *arg)
   1669{
   1670	struct drm_i915_private *i915 = arg;
   1671	const unsigned int nengines = num_uabi_engines(i915);
   1672	const unsigned int ncpus = num_online_cpus();
   1673	unsigned long num_waits, num_fences;
   1674	struct intel_engine_cs *engine;
   1675	struct task_struct **threads;
   1676	struct igt_live_test live;
   1677	intel_wakeref_t wakeref;
   1678	struct smoketest *smoke;
   1679	unsigned int n, idx;
   1680	struct file *file;
   1681	int ret = 0;
   1682
   1683	/*
   1684	 * Smoketest our breadcrumb/signal handling for requests across multiple
   1685	 * threads. A very simple test to only catch the most egregious of bugs.
   1686	 * See __igt_breadcrumbs_smoketest();
   1687	 *
   1688	 * On real hardware this time.
   1689	 */
   1690
   1691	wakeref = intel_runtime_pm_get(&i915->runtime_pm);
   1692
   1693	file = mock_file(i915);
   1694	if (IS_ERR(file)) {
   1695		ret = PTR_ERR(file);
   1696		goto out_rpm;
   1697	}
   1698
   1699	smoke = kcalloc(nengines, sizeof(*smoke), GFP_KERNEL);
   1700	if (!smoke) {
   1701		ret = -ENOMEM;
   1702		goto out_file;
   1703	}
   1704
   1705	threads = kcalloc(ncpus * nengines, sizeof(*threads), GFP_KERNEL);
   1706	if (!threads) {
   1707		ret = -ENOMEM;
   1708		goto out_smoke;
   1709	}
   1710
   1711	smoke[0].request_alloc = __live_request_alloc;
   1712	smoke[0].ncontexts = 64;
   1713	smoke[0].contexts = kcalloc(smoke[0].ncontexts,
   1714				    sizeof(*smoke[0].contexts),
   1715				    GFP_KERNEL);
   1716	if (!smoke[0].contexts) {
   1717		ret = -ENOMEM;
   1718		goto out_threads;
   1719	}
   1720
   1721	for (n = 0; n < smoke[0].ncontexts; n++) {
   1722		smoke[0].contexts[n] = live_context(i915, file);
   1723		if (IS_ERR(smoke[0].contexts[n])) {
   1724			ret = PTR_ERR(smoke[0].contexts[n]);
   1725			goto out_contexts;
   1726		}
   1727	}
   1728
   1729	ret = igt_live_test_begin(&live, i915, __func__, "");
   1730	if (ret)
   1731		goto out_contexts;
   1732
   1733	idx = 0;
   1734	for_each_uabi_engine(engine, i915) {
   1735		smoke[idx] = smoke[0];
   1736		smoke[idx].engine = engine;
   1737		smoke[idx].max_batch =
   1738			max_batches(smoke[0].contexts[0], engine);
   1739		if (smoke[idx].max_batch < 0) {
   1740			ret = smoke[idx].max_batch;
   1741			goto out_flush;
   1742		}
   1743		/* One ring interleaved between requests from all cpus */
   1744		smoke[idx].max_batch /= num_online_cpus() + 1;
   1745		pr_debug("Limiting batches to %d requests on %s\n",
   1746			 smoke[idx].max_batch, engine->name);
   1747
   1748		for (n = 0; n < ncpus; n++) {
   1749			struct task_struct *tsk;
   1750
   1751			tsk = kthread_run(__igt_breadcrumbs_smoketest,
   1752					  &smoke[idx], "igt/%d.%d", idx, n);
   1753			if (IS_ERR(tsk)) {
   1754				ret = PTR_ERR(tsk);
   1755				goto out_flush;
   1756			}
   1757
   1758			get_task_struct(tsk);
   1759			threads[idx * ncpus + n] = tsk;
   1760		}
   1761
   1762		idx++;
   1763	}
   1764
   1765	yield(); /* start all threads before we begin */
   1766	msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));
   1767
   1768out_flush:
   1769	idx = 0;
   1770	num_waits = 0;
   1771	num_fences = 0;
   1772	for_each_uabi_engine(engine, i915) {
   1773		for (n = 0; n < ncpus; n++) {
   1774			struct task_struct *tsk = threads[idx * ncpus + n];
   1775			int err;
   1776
   1777			if (!tsk)
   1778				continue;
   1779
   1780			err = kthread_stop(tsk);
   1781			if (err < 0 && !ret)
   1782				ret = err;
   1783
   1784			put_task_struct(tsk);
   1785		}
   1786
   1787		num_waits += atomic_long_read(&smoke[idx].num_waits);
   1788		num_fences += atomic_long_read(&smoke[idx].num_fences);
   1789		idx++;
   1790	}
   1791	pr_info("Completed %lu waits for %lu fences across %d engines and %d cpus\n",
   1792		num_waits, num_fences, idx, ncpus);
   1793
   1794	ret = igt_live_test_end(&live) ?: ret;
   1795out_contexts:
   1796	kfree(smoke[0].contexts);
   1797out_threads:
   1798	kfree(threads);
   1799out_smoke:
   1800	kfree(smoke);
   1801out_file:
   1802	fput(file);
   1803out_rpm:
   1804	intel_runtime_pm_put(&i915->runtime_pm, wakeref);
   1805
   1806	return ret;
   1807}
   1808
   1809int i915_request_live_selftests(struct drm_i915_private *i915)
   1810{
   1811	static const struct i915_subtest tests[] = {
   1812		SUBTEST(live_nop_request),
   1813		SUBTEST(live_all_engines),
   1814		SUBTEST(live_sequential_engines),
   1815		SUBTEST(live_parallel_engines),
   1816		SUBTEST(live_empty_request),
   1817		SUBTEST(live_cancel_request),
   1818		SUBTEST(live_breadcrumbs_smoketest),
   1819	};
   1820
   1821	if (intel_gt_is_wedged(to_gt(i915)))
   1822		return 0;
   1823
   1824	return i915_subtests(tests, i915);
   1825}
   1826
   1827static int switch_to_kernel_sync(struct intel_context *ce, int err)
   1828{
   1829	struct i915_request *rq;
   1830	struct dma_fence *fence;
   1831
   1832	rq = intel_engine_create_kernel_request(ce->engine);
   1833	if (IS_ERR(rq))
   1834		return PTR_ERR(rq);
   1835
   1836	fence = i915_active_fence_get(&ce->timeline->last_request);
   1837	if (fence) {
   1838		i915_request_await_dma_fence(rq, fence);
   1839		dma_fence_put(fence);
   1840	}
   1841
   1842	rq = i915_request_get(rq);
   1843	i915_request_add(rq);
   1844	if (i915_request_wait(rq, 0, HZ / 2) < 0 && !err)
   1845		err = -ETIME;
   1846	i915_request_put(rq);
   1847
   1848	while (!err && !intel_engine_is_idle(ce->engine))
   1849		intel_engine_flush_submission(ce->engine);
   1850
   1851	return err;
   1852}
   1853
   1854struct perf_stats {
   1855	struct intel_engine_cs *engine;
   1856	unsigned long count;
   1857	ktime_t time;
   1858	ktime_t busy;
   1859	u64 runtime;
   1860};
   1861
   1862struct perf_series {
   1863	struct drm_i915_private *i915;
   1864	unsigned int nengines;
   1865	struct intel_context *ce[];
   1866};
   1867
   1868static int cmp_u32(const void *A, const void *B)
   1869{
   1870	const u32 *a = A, *b = B;
   1871
   1872	return *a - *b;
   1873}
   1874
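/*
 * trifilter() takes TF_COUNT (5) samples, sorts them and returns the
 * weighted median a[1] + 2 * a[2] + a[3], i.e. the filtered value scaled
 * up by 1 << TF_BIAS; cycles_to_ns() divides that scaling back out.
 * For example, samples {9, 3, 7, 5, 100} sort to {3, 5, 7, 9, 100} and
 * give 5 + 2 * 7 + 9 = 28, a filtered result of 28 >> TF_BIAS = 7 cycles.
 */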
   1875static u32 trifilter(u32 *a)
   1876{
   1877	u64 sum;
   1878
   1879#define TF_COUNT 5
   1880	sort(a, TF_COUNT, sizeof(*a), cmp_u32, NULL);
   1881
   1882	sum = mul_u32_u32(a[2], 2);
   1883	sum += a[1];
   1884	sum += a[3];
   1885
   1886	GEM_BUG_ON(sum > U32_MAX);
   1887	return sum;
   1888#define TF_BIAS 2
   1889}
   1890
   1891static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
   1892{
   1893	u64 ns = intel_gt_clock_interval_to_ns(engine->gt, cycles);
   1894
   1895	return DIV_ROUND_CLOSEST(ns, 1 << TF_BIAS);
   1896}
   1897
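/*
 * Store the engine's RING_TIMESTAMP register into memory at the given
 * GGTT offset via MI_STORE_REGISTER_MEM; this captures the GPU-side
 * timestamps used by the measurements below.
 */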
   1898static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset)
   1899{
   1900	*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
   1901	*cs++ = i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base)));
   1902	*cs++ = offset;
   1903	*cs++ = 0;
   1904
   1905	return cs;
   1906}
   1907
   1908static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value)
   1909{
   1910	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
   1911	*cs++ = offset;
   1912	*cs++ = 0;
   1913	*cs++ = value;
   1914
   1915	return cs;
   1916}
   1917
   1918static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset)
   1919{
   1920	*cs++ = MI_SEMAPHORE_WAIT |
   1921		MI_SEMAPHORE_GLOBAL_GTT |
   1922		MI_SEMAPHORE_POLL |
   1923		mode;
   1924	*cs++ = value;
   1925	*cs++ = offset;
   1926	*cs++ = 0;
   1927
   1928	return cs;
   1929}
   1930
   1931static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value)
   1932{
   1933	return emit_semaphore_poll(cs, MI_SEMAPHORE_SAD_EQ_SDD, value, offset);
   1934}
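/*
 * Each of the emit_*() helpers above writes exactly four dwords (a command
 * header plus its operands, with the GGTT address split across two dwords),
 * which is why the measurement loops below reserve ring space in multiples
 * of four: e.g. intel_ring_begin(rq, 12) for a store + semaphore-poll +
 * timestamp sequence, or 4 + 12 * ARRAY_SIZE(elapsed) in
 * measure_semaphore_response().
 */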
   1935
   1936static void semaphore_set(u32 *sema, u32 value)
   1937{
   1938	WRITE_ONCE(*sema, value);
   1939	wmb(); /* flush the update to the cache, and beyond */
   1940}
   1941
   1942static u32 *hwsp_scratch(const struct intel_context *ce)
   1943{
   1944	return memset32(ce->engine->status_page.addr + 1000, 0, 21);
   1945}
   1946
   1947static u32 hwsp_offset(const struct intel_context *ce, u32 *dw)
   1948{
   1949	return (i915_ggtt_offset(ce->engine->status_page.vma) +
   1950		offset_in_page(dw));
   1951}
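/*
 * hwsp_scratch() hands back a small block of zeroed dwords inside the
 * engine's hardware status page, and hwsp_offset() converts a CPU pointer
 * into that page into the GGTT offset used by the emitted commands, so
 * sema[i] on the CPU and "offset + i * sizeof(u32)" on the GPU name the same
 * dword. In most of the measure_*() functions below, dword 0 acts as the
 * semaphore itself while the higher dwords collect GPU timestamps.
 */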
   1952
   1953static int measure_semaphore_response(struct intel_context *ce)
   1954{
   1955	u32 *sema = hwsp_scratch(ce);
   1956	const u32 offset = hwsp_offset(ce, sema);
   1957	u32 elapsed[TF_COUNT], cycles;
   1958	struct i915_request *rq;
   1959	u32 *cs;
   1960	int err;
   1961	int i;
   1962
   1963	/*
   1964	 * Measure how many cycles it takes for the HW to detect the change
   1965	 * in a semaphore value.
   1966	 *
   1967	 *    A: read CS_TIMESTAMP from CPU
   1968	 *    poke semaphore
   1969	 *    B: read CS_TIMESTAMP on GPU
   1970	 *
   1971	 * Semaphore latency: B - A
   1972	 */
   1973
   1974	semaphore_set(sema, -1);
   1975
   1976	rq = i915_request_create(ce);
   1977	if (IS_ERR(rq))
   1978		return PTR_ERR(rq);
   1979
   1980	cs = intel_ring_begin(rq, 4 + 12 * ARRAY_SIZE(elapsed));
   1981	if (IS_ERR(cs)) {
   1982		i915_request_add(rq);
   1983		err = PTR_ERR(cs);
   1984		goto err;
   1985	}
   1986
   1987	cs = emit_store_dw(cs, offset, 0);
   1988	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
   1989		cs = emit_semaphore_poll_until(cs, offset, i);
   1990		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
   1991		cs = emit_store_dw(cs, offset, 0);
   1992	}
   1993
   1994	intel_ring_advance(rq, cs);
   1995	i915_request_add(rq);
   1996
   1997	if (wait_for(READ_ONCE(*sema) == 0, 50)) {
   1998		err = -EIO;
   1999		goto err;
   2000	}
   2001
   2002	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
   2003		preempt_disable();
   2004		cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
   2005		semaphore_set(sema, i);
   2006		preempt_enable();
   2007
   2008		if (wait_for(READ_ONCE(*sema) == 0, 50)) {
   2009			err = -EIO;
   2010			goto err;
   2011		}
   2012
   2013		elapsed[i - 1] = sema[i] - cycles;
   2014	}
   2015
   2016	cycles = trifilter(elapsed);
   2017	pr_info("%s: semaphore response %d cycles, %lluns\n",
   2018		ce->engine->name, cycles >> TF_BIAS,
   2019		cycles_to_ns(ce->engine, cycles));
   2020
   2021	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
   2022
   2023err:
   2024	intel_gt_set_wedged(ce->engine->gt);
   2025	return err;
   2026}
   2027
   2028static int measure_idle_dispatch(struct intel_context *ce)
   2029{
   2030	u32 *sema = hwsp_scratch(ce);
   2031	const u32 offset = hwsp_offset(ce, sema);
   2032	u32 elapsed[TF_COUNT], cycles;
   2033	u32 *cs;
   2034	int err;
   2035	int i;
   2036
   2037	/*
   2038	 * Measure how long it takes for us to submit a request while the
   2039	 * engine is idle, but is resting in our context.
   2040	 *
   2041	 *    A: read CS_TIMESTAMP from CPU
   2042	 *    submit request
   2043	 *    B: read CS_TIMESTAMP on GPU
   2044	 *
   2045	 * Submission latency: B - A
   2046	 */
   2047
   2048	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
   2049		struct i915_request *rq;
   2050
   2051		err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
   2052		if (err)
   2053			return err;
   2054
   2055		rq = i915_request_create(ce);
   2056		if (IS_ERR(rq)) {
   2057			err = PTR_ERR(rq);
   2058			goto err;
   2059		}
   2060
   2061		cs = intel_ring_begin(rq, 4);
   2062		if (IS_ERR(cs)) {
   2063			i915_request_add(rq);
   2064			err = PTR_ERR(cs);
   2065			goto err;
   2066		}
   2067
   2068		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
   2069
   2070		intel_ring_advance(rq, cs);
   2071
   2072		preempt_disable();
   2073		local_bh_disable();
   2074		elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
   2075		i915_request_add(rq);
   2076		local_bh_enable();
   2077		preempt_enable();
   2078	}
   2079
   2080	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
   2081	if (err)
   2082		goto err;
   2083
   2084	for (i = 0; i < ARRAY_SIZE(elapsed); i++)
   2085		elapsed[i] = sema[i] - elapsed[i];
   2086
   2087	cycles = trifilter(elapsed);
   2088	pr_info("%s: idle dispatch latency %d cycles, %lluns\n",
   2089		ce->engine->name, cycles >> TF_BIAS,
   2090		cycles_to_ns(ce->engine, cycles));
   2091
   2092	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
   2093
   2094err:
   2095	intel_gt_set_wedged(ce->engine->gt);
   2096	return err;
   2097}
   2098
   2099static int measure_busy_dispatch(struct intel_context *ce)
   2100{
   2101	u32 *sema = hwsp_scratch(ce);
   2102	const u32 offset = hwsp_offset(ce, sema);
   2103	u32 elapsed[TF_COUNT + 1], cycles;
   2104	u32 *cs;
   2105	int err;
   2106	int i;
   2107
   2108	/*
   2109	 * Measure how long it takes for us to submit a request while the
   2110	 * engine is busy, polling on a semaphore in our context. With
   2111	 * direct submission, this will include the cost of a lite restore.
   2112	 *
   2113	 *    A: read CS_TIMESTAMP from CPU
   2114	 *    submit request
   2115	 *    B: read CS_TIMESTAMP on GPU
   2116	 *
   2117	 * Submission latency: B - A
   2118	 */
   2119
   2120	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
   2121		struct i915_request *rq;
   2122
   2123		rq = i915_request_create(ce);
   2124		if (IS_ERR(rq)) {
   2125			err = PTR_ERR(rq);
   2126			goto err;
   2127		}
   2128
   2129		cs = intel_ring_begin(rq, 12);
   2130		if (IS_ERR(cs)) {
   2131			i915_request_add(rq);
   2132			err = PTR_ERR(cs);
   2133			goto err;
   2134		}
   2135
   2136		cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
   2137		cs = emit_semaphore_poll_until(cs, offset, i);
   2138		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
   2139
   2140		intel_ring_advance(rq, cs);
   2141
   2142		if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) {
   2143			err = -EIO;
   2144			goto err;
   2145		}
   2146
   2147		preempt_disable();
   2148		local_bh_disable();
   2149		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
   2150		i915_request_add(rq);
   2151		local_bh_enable();
   2152		semaphore_set(sema, i - 1);
   2153		preempt_enable();
   2154	}
   2155
   2156	wait_for(READ_ONCE(sema[i - 1]), 500);
   2157	semaphore_set(sema, i - 1);
   2158
   2159	for (i = 1; i <= TF_COUNT; i++) {
   2160		GEM_BUG_ON(sema[i] == -1);
   2161		elapsed[i - 1] = sema[i] - elapsed[i];
   2162	}
   2163
   2164	cycles = trifilter(elapsed);
   2165	pr_info("%s: busy dispatch latency %d cycles, %lluns\n",
   2166		ce->engine->name, cycles >> TF_BIAS,
   2167		cycles_to_ns(ce->engine, cycles));
   2168
   2169	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
   2170
   2171err:
   2172	intel_gt_set_wedged(ce->engine->gt);
   2173	return err;
   2174}
   2175
   2176static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
   2177{
   2178	const u32 offset =
   2179		i915_ggtt_offset(engine->status_page.vma) +
   2180		offset_in_page(sema);
   2181	struct i915_request *rq;
   2182	u32 *cs;
   2183
   2184	rq = i915_request_create(engine->kernel_context);
   2185	if (IS_ERR(rq))
   2186		return PTR_ERR(rq);
   2187
   2188	cs = intel_ring_begin(rq, 4);
   2189	if (IS_ERR(cs)) {
   2190		i915_request_add(rq);
   2191		return PTR_ERR(cs);
   2192	}
   2193
   2194	cs = emit_semaphore_poll(cs, mode, value, offset);
   2195
   2196	intel_ring_advance(rq, cs);
   2197	i915_request_add(rq);
   2198
   2199	return 0;
   2200}
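/*
 * plug() queues a request on the engine's kernel context that spins on a
 * semaphore poll in the status page, so the measurement requests built
 * afterwards back up behind it and can then run back-to-back. The callers
 * below pass MI_SEMAPHORE_SAD_NEQ_SDD with value 0 and release the plug with
 * semaphore_set(sema, 1) once the whole batch has been queued.
 */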
   2201
   2202static int measure_inter_request(struct intel_context *ce)
   2203{
   2204	u32 *sema = hwsp_scratch(ce);
   2205	const u32 offset = hwsp_offset(ce, sema);
   2206	u32 elapsed[TF_COUNT + 1], cycles;
   2207	struct i915_sw_fence *submit;
   2208	int i, err;
   2209
   2210	/*
   2211	 * Measure how long it takes to advance from one request into the
   2212	 * next. Between each request we flush the GPU caches to memory,
   2213	 * update the breadcrumbs, and then invalidate those caches.
   2214	 * We queue up all the requests to be submitted in one batch so
   2215	 * it should be one set of contiguous measurements.
   2216	 *
   2217	 *    A: read CS_TIMESTAMP on GPU
   2218	 *    advance request
   2219	 *    B: read CS_TIMESTAMP on GPU
   2220	 *
   2221	 * Request latency: B - A
   2222	 */
   2223
   2224	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
   2225	if (err)
   2226		return err;
   2227
   2228	submit = heap_fence_create(GFP_KERNEL);
   2229	if (!submit) {
   2230		semaphore_set(sema, 1);
   2231		return -ENOMEM;
   2232	}
   2233
   2234	intel_engine_flush_submission(ce->engine);
   2235	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
   2236		struct i915_request *rq;
   2237		u32 *cs;
   2238
   2239		rq = i915_request_create(ce);
   2240		if (IS_ERR(rq)) {
   2241			err = PTR_ERR(rq);
   2242			goto err_submit;
   2243		}
   2244
   2245		err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
   2246						       submit,
   2247						       GFP_KERNEL);
   2248		if (err < 0) {
   2249			i915_request_add(rq);
   2250			goto err_submit;
   2251		}
   2252
   2253		cs = intel_ring_begin(rq, 4);
   2254		if (IS_ERR(cs)) {
   2255			i915_request_add(rq);
   2256			err = PTR_ERR(cs);
   2257			goto err_submit;
   2258		}
   2259
   2260		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
   2261
   2262		intel_ring_advance(rq, cs);
   2263		i915_request_add(rq);
   2264	}
   2265	i915_sw_fence_commit(submit);
   2266	intel_engine_flush_submission(ce->engine);
   2267	heap_fence_put(submit);
   2268
   2269	semaphore_set(sema, 1);
   2270	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
   2271	if (err)
   2272		goto err;
   2273
   2274	for (i = 1; i <= TF_COUNT; i++)
   2275		elapsed[i - 1] = sema[i + 1] - sema[i];
   2276
   2277	cycles = trifilter(elapsed);
   2278	pr_info("%s: inter-request latency %d cycles, %lluns\n",
   2279		ce->engine->name, cycles >> TF_BIAS,
   2280		cycles_to_ns(ce->engine, cycles));
   2281
   2282	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
   2283
   2284err_submit:
   2285	i915_sw_fence_commit(submit);
   2286	heap_fence_put(submit);
   2287	semaphore_set(sema, 1);
   2288err:
   2289	intel_gt_set_wedged(ce->engine->gt);
   2290	return err;
   2291}
   2292
   2293static int measure_context_switch(struct intel_context *ce)
   2294{
   2295	u32 *sema = hwsp_scratch(ce);
   2296	const u32 offset = hwsp_offset(ce, sema);
   2297	struct i915_request *fence = NULL;
   2298	u32 elapsed[TF_COUNT + 1], cycles;
   2299	int i, j, err;
   2300	u32 *cs;
   2301
   2302	/*
   2303	 * Measure how long it takes to advance from one request in one
   2304	 * context to a request in another context. This allows us to
   2305	 * measure how long the context save/restore take, along with all
   2306	 * the inter-context setup we require.
   2307	 *
   2308	 *    A: read CS_TIMESTAMP on GPU
   2309	 *    switch context
   2310	 *    B: read CS_TIMESTAMP on GPU
   2311	 *
   2312	 * Context switch latency: B - A
   2313	 */
   2314
   2315	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
   2316	if (err)
   2317		return err;
   2318
   2319	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
   2320		struct intel_context *arr[] = {
   2321			ce, ce->engine->kernel_context
   2322		};
   2323		u32 addr = offset + ARRAY_SIZE(arr) * i * sizeof(u32);
   2324
   2325		for (j = 0; j < ARRAY_SIZE(arr); j++) {
   2326			struct i915_request *rq;
   2327
   2328			rq = i915_request_create(arr[j]);
   2329			if (IS_ERR(rq)) {
   2330				err = PTR_ERR(rq);
   2331				goto err_fence;
   2332			}
   2333
   2334			if (fence) {
   2335				err = i915_request_await_dma_fence(rq,
   2336								   &fence->fence);
   2337				if (err) {
   2338					i915_request_add(rq);
   2339					goto err_fence;
   2340				}
   2341			}
   2342
   2343			cs = intel_ring_begin(rq, 4);
   2344			if (IS_ERR(cs)) {
   2345				i915_request_add(rq);
   2346				err = PTR_ERR(cs);
   2347				goto err_fence;
   2348			}
   2349
   2350			cs = emit_timestamp_store(cs, ce, addr);
   2351			addr += sizeof(u32);
   2352
   2353			intel_ring_advance(rq, cs);
   2354
   2355			i915_request_put(fence);
   2356			fence = i915_request_get(rq);
   2357
   2358			i915_request_add(rq);
   2359		}
   2360	}
   2361	i915_request_put(fence);
   2362	intel_engine_flush_submission(ce->engine);
   2363
   2364	semaphore_set(sema, 1);
   2365	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
   2366	if (err)
   2367		goto err;
   2368
   2369	for (i = 1; i <= TF_COUNT; i++)
   2370		elapsed[i - 1] = sema[2 * i + 2] - sema[2 * i + 1];
   2371
   2372	cycles = trifilter(elapsed);
   2373	pr_info("%s: context switch latency %d cycles, %lluns\n",
   2374		ce->engine->name, cycles >> TF_BIAS,
   2375		cycles_to_ns(ce->engine, cycles));
   2376
   2377	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
   2378
   2379err_fence:
   2380	i915_request_put(fence);
   2381	semaphore_set(sema, 1);
   2382err:
   2383	intel_gt_set_wedged(ce->engine->gt);
   2384	return err;
   2385}
   2386
   2387static int measure_preemption(struct intel_context *ce)
   2388{
   2389	u32 *sema = hwsp_scratch(ce);
   2390	const u32 offset = hwsp_offset(ce, sema);
   2391	u32 elapsed[TF_COUNT], cycles;
   2392	u32 *cs;
   2393	int err;
   2394	int i;
   2395
   2396	/*
   2397	 * We measure two latencies while triggering preemption. The first
   2398	 * latency is how long it takes for us to submit a preempting request.
   2399	 * The second latency is how long it takes for us to return from the
   2400	 * preemption back to the original context.
   2401	 *
   2402	 *    A: read CS_TIMESTAMP from CPU
   2403	 *    submit preemption
   2404	 *    B: read CS_TIMESTAMP on GPU (in preempting context)
   2405	 *    context switch
   2406	 *    C: read CS_TIMESTAMP on GPU (in original context)
   2407	 *
   2408	 * Preemption dispatch latency: B - A
   2409	 * Preemption switch latency: C - B
   2410	 */
   2411
   2412	if (!intel_engine_has_preemption(ce->engine))
   2413		return 0;
   2414
   2415	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
   2416		u32 addr = offset + 2 * i * sizeof(u32);
   2417		struct i915_request *rq;
   2418
   2419		rq = i915_request_create(ce);
   2420		if (IS_ERR(rq)) {
   2421			err = PTR_ERR(rq);
   2422			goto err;
   2423		}
   2424
   2425		cs = intel_ring_begin(rq, 12);
   2426		if (IS_ERR(cs)) {
   2427			i915_request_add(rq);
   2428			err = PTR_ERR(cs);
   2429			goto err;
   2430		}
   2431
   2432		cs = emit_store_dw(cs, addr, -1);
   2433		cs = emit_semaphore_poll_until(cs, offset, i);
   2434		cs = emit_timestamp_store(cs, ce, addr + sizeof(u32));
   2435
   2436		intel_ring_advance(rq, cs);
   2437		i915_request_add(rq);
   2438
   2439		if (wait_for(READ_ONCE(sema[2 * i]) == -1, 500)) {
   2440			err = -EIO;
   2441			goto err;
   2442		}
   2443
   2444		rq = i915_request_create(ce->engine->kernel_context);
   2445		if (IS_ERR(rq)) {
   2446			err = PTR_ERR(rq);
   2447			goto err;
   2448		}
   2449
   2450		cs = intel_ring_begin(rq, 8);
   2451		if (IS_ERR(cs)) {
   2452			i915_request_add(rq);
   2453			err = PTR_ERR(cs);
   2454			goto err;
   2455		}
   2456
   2457		cs = emit_timestamp_store(cs, ce, addr);
   2458		cs = emit_store_dw(cs, offset, i);
   2459
   2460		intel_ring_advance(rq, cs);
   2461		rq->sched.attr.priority = I915_PRIORITY_BARRIER;
   2462
   2463		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
   2464		i915_request_add(rq);
   2465	}
   2466
   2467	if (wait_for(READ_ONCE(sema[2 * i - 2]) != -1, 500)) {
   2468		err = -EIO;
   2469		goto err;
   2470	}
   2471
   2472	for (i = 1; i <= TF_COUNT; i++)
   2473		elapsed[i - 1] = sema[2 * i + 0] - elapsed[i - 1];
   2474
   2475	cycles = trifilter(elapsed);
   2476	pr_info("%s: preemption dispatch latency %d cycles, %lluns\n",
   2477		ce->engine->name, cycles >> TF_BIAS,
   2478		cycles_to_ns(ce->engine, cycles));
   2479
   2480	for (i = 1; i <= TF_COUNT; i++)
   2481		elapsed[i - 1] = sema[2 * i + 1] - sema[2 * i + 0];
   2482
   2483	cycles = trifilter(elapsed);
   2484	pr_info("%s: preemption switch latency %d cycles, %lluns\n",
   2485		ce->engine->name, cycles >> TF_BIAS,
   2486		cycles_to_ns(ce->engine, cycles));
   2487
   2488	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
   2489
   2490err:
   2491	intel_gt_set_wedged(ce->engine->gt);
   2492	return err;
   2493}
   2494
   2495struct signal_cb {
   2496	struct dma_fence_cb base;
   2497	bool seen;
   2498};
   2499
   2500static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
   2501{
   2502	struct signal_cb *s = container_of(cb, typeof(*s), base);
   2503
   2504	smp_store_mb(s->seen, true); /* be safe, be strong */
   2505}
   2506
   2507static int measure_completion(struct intel_context *ce)
   2508{
   2509	u32 *sema = hwsp_scratch(ce);
   2510	const u32 offset = hwsp_offset(ce, sema);
   2511	u32 elapsed[TF_COUNT], cycles;
   2512	u32 *cs;
   2513	int err;
   2514	int i;
   2515
   2516	/*
   2517	 * Measure how long it takes for the signal (interrupt) to be
   2518	 * sent from the GPU and then processed by the CPU.
   2519	 *
   2520	 *    A: read CS_TIMESTAMP on GPU
   2521	 *    signal
   2522	 *    B: read CS_TIMESTAMP from CPU
   2523	 *
   2524	 * Completion latency: B - A
   2525	 */
   2526
   2527	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
   2528		struct signal_cb cb = { .seen = false };
   2529		struct i915_request *rq;
   2530
   2531		rq = i915_request_create(ce);
   2532		if (IS_ERR(rq)) {
   2533			err = PTR_ERR(rq);
   2534			goto err;
   2535		}
   2536
   2537		cs = intel_ring_begin(rq, 12);
   2538		if (IS_ERR(cs)) {
   2539			i915_request_add(rq);
   2540			err = PTR_ERR(cs);
   2541			goto err;
   2542		}
   2543
   2544		cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
   2545		cs = emit_semaphore_poll_until(cs, offset, i);
   2546		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
   2547
   2548		intel_ring_advance(rq, cs);
   2549
   2550		dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
   2551		i915_request_add(rq);
   2552
   2553		intel_engine_flush_submission(ce->engine);
   2554		if (wait_for(READ_ONCE(sema[i]) == -1, 50)) {
   2555			err = -EIO;
   2556			goto err;
   2557		}
   2558
   2559		preempt_disable();
   2560		semaphore_set(sema, i);
   2561		while (!READ_ONCE(cb.seen))
   2562			cpu_relax();
   2563
   2564		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
   2565		preempt_enable();
   2566	}
   2567
   2568	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
   2569	if (err)
   2570		goto err;
   2571
   2572	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
   2573		GEM_BUG_ON(sema[i + 1] == -1);
   2574		elapsed[i] = elapsed[i] - sema[i + 1];
   2575	}
   2576
   2577	cycles = trifilter(elapsed);
   2578	pr_info("%s: completion latency %d cycles, %lluns\n",
   2579		ce->engine->name, cycles >> TF_BIAS,
   2580		cycles_to_ns(ce->engine, cycles));
   2581
   2582	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
   2583
   2584err:
   2585	intel_gt_set_wedged(ce->engine->gt);
   2586	return err;
   2587}
   2588
   2589static void rps_pin(struct intel_gt *gt)
   2590{
   2591	/* Pin the frequency to max */
   2592	atomic_inc(&gt->rps.num_waiters);
   2593	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
   2594
   2595	mutex_lock(&gt->rps.lock);
   2596	intel_rps_set(&gt->rps, gt->rps.max_freq);
   2597	mutex_unlock(&gt->rps.lock);
   2598}
   2599
   2600static void rps_unpin(struct intel_gt *gt)
   2601{
   2602	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
   2603	atomic_dec(&gt->rps.num_waiters);
   2604}
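/*
 * rps_pin()/rps_unpin() bracket the latency measurements below: taking an
 * rps waiter reference and forcing the maximum frequency (with forcewake
 * held) presumably keeps the engine from idling or ramping its frequency in
 * the middle of a measurement, which would otherwise be folded into the
 * reported latencies.
 */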
   2605
   2606static int perf_request_latency(void *arg)
   2607{
   2608	struct drm_i915_private *i915 = arg;
   2609	struct intel_engine_cs *engine;
   2610	struct pm_qos_request qos;
   2611	int err = 0;
   2612
   2613	if (GRAPHICS_VER(i915) < 8) /* per-engine CS timestamp, semaphores */
   2614		return 0;
   2615
   2616	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
   2617
   2618	for_each_uabi_engine(engine, i915) {
   2619		struct intel_context *ce;
   2620
   2621		ce = intel_context_create(engine);
   2622		if (IS_ERR(ce)) {
   2623			err = PTR_ERR(ce);
   2624			goto out;
   2625		}
   2626
   2627		err = intel_context_pin(ce);
   2628		if (err) {
   2629			intel_context_put(ce);
   2630			goto out;
   2631		}
   2632
   2633		st_engine_heartbeat_disable(engine);
   2634		rps_pin(engine->gt);
   2635
   2636		if (err == 0)
   2637			err = measure_semaphore_response(ce);
   2638		if (err == 0)
   2639			err = measure_idle_dispatch(ce);
   2640		if (err == 0)
   2641			err = measure_busy_dispatch(ce);
   2642		if (err == 0)
   2643			err = measure_inter_request(ce);
   2644		if (err == 0)
   2645			err = measure_context_switch(ce);
   2646		if (err == 0)
   2647			err = measure_preemption(ce);
   2648		if (err == 0)
   2649			err = measure_completion(ce);
   2650
   2651		rps_unpin(engine->gt);
   2652		st_engine_heartbeat_enable(engine);
   2653
   2654		intel_context_unpin(ce);
   2655		intel_context_put(ce);
   2656		if (err)
   2657			goto out;
   2658	}
   2659
   2660out:
   2661	if (igt_flush_test(i915))
   2662		err = -EIO;
   2663
   2664	cpu_latency_qos_remove_request(&qos);
   2665	return err;
   2666}
   2667
   2668static int s_sync0(void *arg)
   2669{
   2670	struct perf_series *ps = arg;
   2671	IGT_TIMEOUT(end_time);
   2672	unsigned int idx = 0;
   2673	int err = 0;
   2674
   2675	GEM_BUG_ON(!ps->nengines);
   2676	do {
   2677		struct i915_request *rq;
   2678
   2679		rq = i915_request_create(ps->ce[idx]);
   2680		if (IS_ERR(rq)) {
   2681			err = PTR_ERR(rq);
   2682			break;
   2683		}
   2684
   2685		i915_request_get(rq);
   2686		i915_request_add(rq);
   2687
   2688		if (i915_request_wait(rq, 0, HZ / 5) < 0)
   2689			err = -ETIME;
   2690		i915_request_put(rq);
   2691		if (err)
   2692			break;
   2693
   2694		if (++idx == ps->nengines)
   2695			idx = 0;
   2696	} while (!__igt_timeout(end_time, NULL));
   2697
   2698	return err;
   2699}
   2700
   2701static int s_sync1(void *arg)
   2702{
   2703	struct perf_series *ps = arg;
   2704	struct i915_request *prev = NULL;
   2705	IGT_TIMEOUT(end_time);
   2706	unsigned int idx = 0;
   2707	int err = 0;
   2708
   2709	GEM_BUG_ON(!ps->nengines);
   2710	do {
   2711		struct i915_request *rq;
   2712
   2713		rq = i915_request_create(ps->ce[idx]);
   2714		if (IS_ERR(rq)) {
   2715			err = PTR_ERR(rq);
   2716			break;
   2717		}
   2718
   2719		i915_request_get(rq);
   2720		i915_request_add(rq);
   2721
   2722		if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
   2723			err = -ETIME;
   2724		i915_request_put(prev);
   2725		prev = rq;
   2726		if (err)
   2727			break;
   2728
   2729		if (++idx == ps->nengines)
   2730			idx = 0;
   2731	} while (!__igt_timeout(end_time, NULL));
   2732	i915_request_put(prev);
   2733
   2734	return err;
   2735}
   2736
   2737static int s_many(void *arg)
   2738{
   2739	struct perf_series *ps = arg;
   2740	IGT_TIMEOUT(end_time);
   2741	unsigned int idx = 0;
   2742
   2743	GEM_BUG_ON(!ps->nengines);
   2744	do {
   2745		struct i915_request *rq;
   2746
   2747		rq = i915_request_create(ps->ce[idx]);
   2748		if (IS_ERR(rq))
   2749			return PTR_ERR(rq);
   2750
   2751		i915_request_add(rq);
   2752
   2753		if (++idx == ps->nengines)
   2754			idx = 0;
   2755	} while (!__igt_timeout(end_time, NULL));
   2756
   2757	return 0;
   2758}
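/*
 * Rough summary of the three series workloads above: s_sync0 submits one
 * request per context and waits for it to complete before moving on to the
 * next engine; s_sync1 keeps one request in flight, waiting on the previous
 * request while the next is already queued; s_many streams requests without
 * waiting at all. perf_parallel_engines() further below repeats the same
 * pattern per engine as p_sync0/p_sync1/p_many.
 */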
   2759
   2760static int perf_series_engines(void *arg)
   2761{
   2762	struct drm_i915_private *i915 = arg;
   2763	static int (* const func[])(void *arg) = {
   2764		s_sync0,
   2765		s_sync1,
   2766		s_many,
   2767		NULL,
   2768	};
   2769	const unsigned int nengines = num_uabi_engines(i915);
   2770	struct intel_engine_cs *engine;
   2771	int (* const *fn)(void *arg);
   2772	struct pm_qos_request qos;
   2773	struct perf_stats *stats;
   2774	struct perf_series *ps;
   2775	unsigned int idx;
   2776	int err = 0;
   2777
   2778	stats = kcalloc(nengines, sizeof(*stats), GFP_KERNEL);
   2779	if (!stats)
   2780		return -ENOMEM;
   2781
   2782	ps = kzalloc(struct_size(ps, ce, nengines), GFP_KERNEL);
   2783	if (!ps) {
   2784		kfree(stats);
   2785		return -ENOMEM;
   2786	}
   2787
   2788	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
   2789
   2790	ps->i915 = i915;
   2791	ps->nengines = nengines;
   2792
   2793	idx = 0;
   2794	for_each_uabi_engine(engine, i915) {
   2795		struct intel_context *ce;
   2796
   2797		ce = intel_context_create(engine);
   2798		if (IS_ERR(ce)) {
   2799			err = PTR_ERR(ce);
   2800			goto out;
   2801		}
   2802
   2803		err = intel_context_pin(ce);
   2804		if (err) {
   2805			intel_context_put(ce);
   2806			goto out;
   2807		}
   2808
   2809		ps->ce[idx++] = ce;
   2810	}
   2811	GEM_BUG_ON(idx != ps->nengines);
   2812
   2813	for (fn = func; *fn && !err; fn++) {
   2814		char name[KSYM_NAME_LEN];
   2815		struct igt_live_test t;
   2816
   2817		snprintf(name, sizeof(name), "%ps", *fn);
   2818		err = igt_live_test_begin(&t, i915, __func__, name);
   2819		if (err)
   2820			break;
   2821
   2822		for (idx = 0; idx < nengines; idx++) {
   2823			struct perf_stats *p =
   2824				memset(&stats[idx], 0, sizeof(stats[idx]));
   2825			struct intel_context *ce = ps->ce[idx];
   2826
   2827			p->engine = ps->ce[idx]->engine;
   2828			intel_engine_pm_get(p->engine);
   2829
   2830			if (intel_engine_supports_stats(p->engine))
   2831				p->busy = intel_engine_get_busy_time(p->engine,
   2832								     &p->time) + 1;
   2833			else
   2834				p->time = ktime_get();
   2835			p->runtime = -intel_context_get_total_runtime_ns(ce);
   2836		}
   2837
   2838		err = (*fn)(ps);
   2839		if (igt_live_test_end(&t))
   2840			err = -EIO;
   2841
   2842		for (idx = 0; idx < nengines; idx++) {
   2843			struct perf_stats *p = &stats[idx];
   2844			struct intel_context *ce = ps->ce[idx];
   2845			int integer, decimal;
   2846			u64 busy, dt, now;
   2847
   2848			if (p->busy)
   2849				p->busy = ktime_sub(intel_engine_get_busy_time(p->engine,
   2850									       &now),
   2851						    p->busy - 1);
   2852			else
   2853				now = ktime_get();
   2854			p->time = ktime_sub(now, p->time);
   2855
   2856			err = switch_to_kernel_sync(ce, err);
   2857			p->runtime += intel_context_get_total_runtime_ns(ce);
   2858			intel_engine_pm_put(p->engine);
   2859
   2860			busy = 100 * ktime_to_ns(p->busy);
   2861			dt = ktime_to_ns(p->time);
   2862			if (dt) {
   2863				integer = div64_u64(busy, dt);
   2864				busy -= integer * dt;
   2865				decimal = div64_u64(100 * busy, dt);
   2866			} else {
   2867				integer = 0;
   2868				decimal = 0;
   2869			}
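			/*
			 * Fixed-point busyness, e.g. with busy:dt = 123:456,
			 * busy becomes 12300, integer = 26, remainder 444,
			 * decimal = 97, printed as "26.97%" (~123/456). The
			 * same computation is reused in
			 * perf_parallel_engines().
			 */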
   2870
   2871			pr_info("%s %5s: { seqno:%d, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
   2872				name, p->engine->name, ce->timeline->seqno,
   2873				integer, decimal,
   2874				div_u64(p->runtime, 1000 * 1000),
   2875				div_u64(ktime_to_ns(p->time), 1000 * 1000));
   2876		}
   2877	}
   2878
   2879out:
   2880	for (idx = 0; idx < nengines; idx++) {
   2881		if (IS_ERR_OR_NULL(ps->ce[idx]))
   2882			break;
   2883
   2884		intel_context_unpin(ps->ce[idx]);
   2885		intel_context_put(ps->ce[idx]);
   2886	}
   2887	kfree(ps);
   2888
   2889	cpu_latency_qos_remove_request(&qos);
   2890	kfree(stats);
   2891	return err;
   2892}
   2893
   2894static int p_sync0(void *arg)
   2895{
   2896	struct perf_stats *p = arg;
   2897	struct intel_engine_cs *engine = p->engine;
   2898	struct intel_context *ce;
   2899	IGT_TIMEOUT(end_time);
   2900	unsigned long count;
   2901	bool busy;
   2902	int err = 0;
   2903
   2904	ce = intel_context_create(engine);
   2905	if (IS_ERR(ce))
   2906		return PTR_ERR(ce);
   2907
   2908	err = intel_context_pin(ce);
   2909	if (err) {
   2910		intel_context_put(ce);
   2911		return err;
   2912	}
   2913
   2914	if (intel_engine_supports_stats(engine)) {
   2915		p->busy = intel_engine_get_busy_time(engine, &p->time);
   2916		busy = true;
   2917	} else {
   2918		p->time = ktime_get();
   2919		busy = false;
   2920	}
   2921
   2922	count = 0;
   2923	do {
   2924		struct i915_request *rq;
   2925
   2926		rq = i915_request_create(ce);
   2927		if (IS_ERR(rq)) {
   2928			err = PTR_ERR(rq);
   2929			break;
   2930		}
   2931
   2932		i915_request_get(rq);
   2933		i915_request_add(rq);
   2934
   2935		err = 0;
   2936		if (i915_request_wait(rq, 0, HZ) < 0)
   2937			err = -ETIME;
   2938		i915_request_put(rq);
   2939		if (err)
   2940			break;
   2941
   2942		count++;
   2943	} while (!__igt_timeout(end_time, NULL));
   2944
   2945	if (busy) {
   2946		ktime_t now;
   2947
   2948		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
   2949				    p->busy);
   2950		p->time = ktime_sub(now, p->time);
   2951	} else {
   2952		p->time = ktime_sub(ktime_get(), p->time);
   2953	}
   2954
   2955	err = switch_to_kernel_sync(ce, err);
   2956	p->runtime = intel_context_get_total_runtime_ns(ce);
   2957	p->count = count;
   2958
   2959	intel_context_unpin(ce);
   2960	intel_context_put(ce);
   2961	return err;
   2962}
   2963
   2964static int p_sync1(void *arg)
   2965{
   2966	struct perf_stats *p = arg;
   2967	struct intel_engine_cs *engine = p->engine;
   2968	struct i915_request *prev = NULL;
   2969	struct intel_context *ce;
   2970	IGT_TIMEOUT(end_time);
   2971	unsigned long count;
   2972	bool busy;
   2973	int err = 0;
   2974
   2975	ce = intel_context_create(engine);
   2976	if (IS_ERR(ce))
   2977		return PTR_ERR(ce);
   2978
   2979	err = intel_context_pin(ce);
   2980	if (err) {
   2981		intel_context_put(ce);
   2982		return err;
   2983	}
   2984
   2985	if (intel_engine_supports_stats(engine)) {
   2986		p->busy = intel_engine_get_busy_time(engine, &p->time);
   2987		busy = true;
   2988	} else {
   2989		p->time = ktime_get();
   2990		busy = false;
   2991	}
   2992
   2993	count = 0;
   2994	do {
   2995		struct i915_request *rq;
   2996
   2997		rq = i915_request_create(ce);
   2998		if (IS_ERR(rq)) {
   2999			err = PTR_ERR(rq);
   3000			break;
   3001		}
   3002
   3003		i915_request_get(rq);
   3004		i915_request_add(rq);
   3005
   3006		err = 0;
   3007		if (prev && i915_request_wait(prev, 0, HZ) < 0)
   3008			err = -ETIME;
   3009		i915_request_put(prev);
   3010		prev = rq;
   3011		if (err)
   3012			break;
   3013
   3014		count++;
   3015	} while (!__igt_timeout(end_time, NULL));
   3016	i915_request_put(prev);
   3017
   3018	if (busy) {
   3019		ktime_t now;
   3020
   3021		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
   3022				    p->busy);
   3023		p->time = ktime_sub(now, p->time);
   3024	} else {
   3025		p->time = ktime_sub(ktime_get(), p->time);
   3026	}
   3027
   3028	err = switch_to_kernel_sync(ce, err);
   3029	p->runtime = intel_context_get_total_runtime_ns(ce);
   3030	p->count = count;
   3031
   3032	intel_context_unpin(ce);
   3033	intel_context_put(ce);
   3034	return err;
   3035}
   3036
   3037static int p_many(void *arg)
   3038{
   3039	struct perf_stats *p = arg;
   3040	struct intel_engine_cs *engine = p->engine;
   3041	struct intel_context *ce;
   3042	IGT_TIMEOUT(end_time);
   3043	unsigned long count;
   3044	int err = 0;
   3045	bool busy;
   3046
   3047	ce = intel_context_create(engine);
   3048	if (IS_ERR(ce))
   3049		return PTR_ERR(ce);
   3050
   3051	err = intel_context_pin(ce);
   3052	if (err) {
   3053		intel_context_put(ce);
   3054		return err;
   3055	}
   3056
   3057	if (intel_engine_supports_stats(engine)) {
   3058		p->busy = intel_engine_get_busy_time(engine, &p->time);
   3059		busy = true;
   3060	} else {
   3061		p->time = ktime_get();
   3062		busy = false;
   3063	}
   3064
   3065	count = 0;
   3066	do {
   3067		struct i915_request *rq;
   3068
   3069		rq = i915_request_create(ce);
   3070		if (IS_ERR(rq)) {
   3071			err = PTR_ERR(rq);
   3072			break;
   3073		}
   3074
   3075		i915_request_add(rq);
   3076		count++;
   3077	} while (!__igt_timeout(end_time, NULL));
   3078
   3079	if (busy) {
   3080		ktime_t now;
   3081
   3082		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
   3083				    p->busy);
   3084		p->time = ktime_sub(now, p->time);
   3085	} else {
   3086		p->time = ktime_sub(ktime_get(), p->time);
   3087	}
   3088
   3089	err = switch_to_kernel_sync(ce, err);
   3090	p->runtime = intel_context_get_total_runtime_ns(ce);
   3091	p->count = count;
   3092
   3093	intel_context_unpin(ce);
   3094	intel_context_put(ce);
   3095	return err;
   3096}
   3097
   3098static int perf_parallel_engines(void *arg)
   3099{
   3100	struct drm_i915_private *i915 = arg;
   3101	static int (* const func[])(void *arg) = {
   3102		p_sync0,
   3103		p_sync1,
   3104		p_many,
   3105		NULL,
   3106	};
   3107	const unsigned int nengines = num_uabi_engines(i915);
   3108	struct intel_engine_cs *engine;
   3109	int (* const *fn)(void *arg);
   3110	struct pm_qos_request qos;
   3111	struct {
   3112		struct perf_stats p;
   3113		struct task_struct *tsk;
   3114	} *engines;
   3115	int err = 0;
   3116
   3117	engines = kcalloc(nengines, sizeof(*engines), GFP_KERNEL);
   3118	if (!engines)
   3119		return -ENOMEM;
   3120
   3121	cpu_latency_qos_add_request(&qos, 0);
   3122
   3123	for (fn = func; *fn; fn++) {
   3124		char name[KSYM_NAME_LEN];
   3125		struct igt_live_test t;
   3126		unsigned int idx;
   3127
   3128		snprintf(name, sizeof(name), "%ps", *fn);
   3129		err = igt_live_test_begin(&t, i915, __func__, name);
   3130		if (err)
   3131			break;
   3132
   3133		atomic_set(&i915->selftest.counter, nengines);
   3134
   3135		idx = 0;
   3136		for_each_uabi_engine(engine, i915) {
   3137			intel_engine_pm_get(engine);
   3138
   3139			memset(&engines[idx].p, 0, sizeof(engines[idx].p));
   3140			engines[idx].p.engine = engine;
   3141
   3142			engines[idx].tsk = kthread_run(*fn, &engines[idx].p,
   3143						       "igt:%s", engine->name);
   3144			if (IS_ERR(engines[idx].tsk)) {
   3145				err = PTR_ERR(engines[idx].tsk);
   3146				intel_engine_pm_put(engine);
   3147				break;
   3148			}
   3149			get_task_struct(engines[idx++].tsk);
   3150		}
   3151
   3152		yield(); /* start all threads before we kthread_stop() */
   3153
   3154		idx = 0;
   3155		for_each_uabi_engine(engine, i915) {
   3156			int status;
   3157
   3158			if (IS_ERR(engines[idx].tsk))
   3159				break;
   3160
   3161			status = kthread_stop(engines[idx].tsk);
   3162			if (status && !err)
   3163				err = status;
   3164
   3165			intel_engine_pm_put(engine);
   3166			put_task_struct(engines[idx++].tsk);
   3167		}
   3168
   3169		if (igt_live_test_end(&t))
   3170			err = -EIO;
   3171		if (err)
   3172			break;
   3173
   3174		idx = 0;
   3175		for_each_uabi_engine(engine, i915) {
   3176			struct perf_stats *p = &engines[idx].p;
   3177			u64 busy = 100 * ktime_to_ns(p->busy);
   3178			u64 dt = ktime_to_ns(p->time);
   3179			int integer, decimal;
   3180
   3181			if (dt) {
   3182				integer = div64_u64(busy, dt);
   3183				busy -= integer * dt;
   3184				decimal = div64_u64(100 * busy, dt);
   3185			} else {
   3186				integer = 0;
   3187				decimal = 0;
   3188			}
   3189
   3190			GEM_BUG_ON(engine != p->engine);
   3191			pr_info("%s %5s: { count:%lu, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
   3192				name, engine->name, p->count, integer, decimal,
   3193				div_u64(p->runtime, 1000 * 1000),
   3194				div_u64(ktime_to_ns(p->time), 1000 * 1000));
   3195			idx++;
   3196		}
   3197	}
   3198
   3199	cpu_latency_qos_remove_request(&qos);
   3200	kfree(engines);
   3201	return err;
   3202}
   3203
   3204int i915_request_perf_selftests(struct drm_i915_private *i915)
   3205{
   3206	static const struct i915_subtest tests[] = {
   3207		SUBTEST(perf_request_latency),
   3208		SUBTEST(perf_series_engines),
   3209		SUBTEST(perf_parallel_engines),
   3210	};
   3211
   3212	if (intel_gt_is_wedged(to_gt(i915)))
   3213		return 0;
   3214
   3215	return i915_subtests(tests, i915);
   3216}