sched_main.c - cachepc-linux - Fork of AMDESE/linux with modifications for CachePC side-channel attack

	cachepc-linux Fork of AMDESE/linux with modifications for CachePC side-channel attack
	git clone https://git.sinitax.com/sinitax/cachepc-linux
	Log \| Files \| Refs \| README \| LICENSE \| sfeed.txt
sched_main.c (30402B)
      1/*
      2 * Copyright 2015 Advanced Micro Devices, Inc.
      3 *
      4 * Permission is hereby granted, free of charge, to any person obtaining a
      5 * copy of this software and associated documentation files (the "Software"),
      6 * to deal in the Software without restriction, including without limitation
      7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8 * and/or sell copies of the Software, and to permit persons to whom the
      9 * Software is furnished to do so, subject to the following conditions:
     10 *
     11 * The above copyright notice and this permission notice shall be included in
     12 * all copies or substantial portions of the Software.
     13 *
     14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
     18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
     19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
     20 * OTHER DEALINGS IN THE SOFTWARE.
     21 *
     22 */
     23
     24/**
     25 * DOC: Overview
     26 *
     27 * The GPU scheduler provides entities which allow userspace to push jobs
     28 * into software queues which are then scheduled on a hardware run queue.
     29 * The software queues have a priority among them. The scheduler selects the entities
     30 * from the run queue using a FIFO. The scheduler provides dependency handling
     31 * features among jobs. The driver is supposed to provide callback functions for
     32 * backend operations to the scheduler like submitting a job to hardware run queue,
     33 * returning the dependencies of a job etc.
     34 *
     35 * The organisation of the scheduler is the following:
     36 *
     37 * 1. Each hw run queue has one scheduler
     38 * 2. Each scheduler has multiple run queues with different priorities
     39 *    (e.g., HIGH_HW, HIGH_SW, KERNEL, NORMAL)
     40 * 3. Each scheduler run queue has a queue of entities to schedule
     41 * 4. Entities themselves maintain a queue of jobs that will be scheduled on
     42 *    the hardware.
     43 *
     44 * The jobs in an entity are always scheduled in the order that they were pushed.
     45 */
     46
     47#include <linux/kthread.h>
     48#include <linux/wait.h>
     49#include <linux/sched.h>
     50#include <linux/completion.h>
     51#include <linux/dma-resv.h>
     52#include <uapi/linux/sched/types.h>
     53
     54#include <drm/drm_print.h>
     55#include <drm/drm_gem.h>
     56#include <drm/gpu_scheduler.h>
     57#include <drm/spsc_queue.h>
     58
     59#define CREATE_TRACE_POINTS
     60#include "gpu_scheduler_trace.h"
     61
     62#define to_drm_sched_job(sched_job)		\
     63		container_of((sched_job), struct drm_sched_job, queue_node)
     64
     65/**
     66 * drm_sched_rq_init - initialize a given run queue struct
     67 *
     68 * @sched: scheduler instance to associate with this run queue
     69 * @rq: scheduler run queue
     70 *
     71 * Initializes a scheduler runqueue.
     72 */
     73static void drm_sched_rq_init(struct drm_gpu_scheduler *sched,
     74			      struct drm_sched_rq *rq)
     75{
     76	spin_lock_init(&rq->lock);
     77	INIT_LIST_HEAD(&rq->entities);
     78	rq->current_entity = NULL;
     79	rq->sched = sched;
     80}
     81
     82/**
     83 * drm_sched_rq_add_entity - add an entity
     84 *
     85 * @rq: scheduler run queue
     86 * @entity: scheduler entity
     87 *
     88 * Adds a scheduler entity to the run queue.
     89 */
     90void drm_sched_rq_add_entity(struct drm_sched_rq *rq,
     91			     struct drm_sched_entity *entity)
     92{
     93	if (!list_empty(&entity->list))
     94		return;
     95	spin_lock(&rq->lock);
     96	atomic_inc(rq->sched->score);
     97	list_add_tail(&entity->list, &rq->entities);
     98	spin_unlock(&rq->lock);
     99}
    100
    101/**
    102 * drm_sched_rq_remove_entity - remove an entity
    103 *
    104 * @rq: scheduler run queue
    105 * @entity: scheduler entity
    106 *
    107 * Removes a scheduler entity from the run queue.
    108 */
    109void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
    110				struct drm_sched_entity *entity)
    111{
    112	if (list_empty(&entity->list))
    113		return;
    114	spin_lock(&rq->lock);
    115	atomic_dec(rq->sched->score);
    116	list_del_init(&entity->list);
    117	if (rq->current_entity == entity)
    118		rq->current_entity = NULL;
    119	spin_unlock(&rq->lock);
    120}
    121
    122/**
    123 * drm_sched_rq_select_entity - Select an entity which could provide a job to run
    124 *
    125 * @rq: scheduler run queue to check.
    126 *
    127 * Try to find a ready entity, returns NULL if none found.
    128 */
    129static struct drm_sched_entity *
    130drm_sched_rq_select_entity(struct drm_sched_rq *rq)
    131{
    132	struct drm_sched_entity *entity;
    133
    134	spin_lock(&rq->lock);
    135
    136	entity = rq->current_entity;
    137	if (entity) {
    138		list_for_each_entry_continue(entity, &rq->entities, list) {
    139			if (drm_sched_entity_is_ready(entity)) {
    140				rq->current_entity = entity;
    141				reinit_completion(&entity->entity_idle);
    142				spin_unlock(&rq->lock);
    143				return entity;
    144			}
    145		}
    146	}
    147
    148	list_for_each_entry(entity, &rq->entities, list) {
    149
    150		if (drm_sched_entity_is_ready(entity)) {
    151			rq->current_entity = entity;
    152			reinit_completion(&entity->entity_idle);
    153			spin_unlock(&rq->lock);
    154			return entity;
    155		}
    156
    157		if (entity == rq->current_entity)
    158			break;
    159	}
    160
    161	spin_unlock(&rq->lock);
    162
    163	return NULL;
    164}
    165
    166/**
    167 * drm_sched_job_done - complete a job
    168 * @s_job: pointer to the job which is done
    169 *
    170 * Finish the job's fence and wake up the worker thread.
    171 */
    172static void drm_sched_job_done(struct drm_sched_job *s_job)
    173{
    174	struct drm_sched_fence *s_fence = s_job->s_fence;
    175	struct drm_gpu_scheduler *sched = s_fence->sched;
    176
    177	atomic_dec(&sched->hw_rq_count);
    178	atomic_dec(sched->score);
    179
    180	trace_drm_sched_process_job(s_fence);
    181
    182	dma_fence_get(&s_fence->finished);
    183	drm_sched_fence_finished(s_fence);
    184	dma_fence_put(&s_fence->finished);
    185	wake_up_interruptible(&sched->wake_up_worker);
    186}
    187
    188/**
    189 * drm_sched_job_done_cb - the callback for a done job
    190 * @f: fence
    191 * @cb: fence callbacks
    192 */
    193static void drm_sched_job_done_cb(struct dma_fence *f, struct dma_fence_cb *cb)
    194{
    195	struct drm_sched_job *s_job = container_of(cb, struct drm_sched_job, cb);
    196
    197	drm_sched_job_done(s_job);
    198}
    199
    200/**
    201 * drm_sched_dependency_optimized
    202 *
    203 * @fence: the dependency fence
    204 * @entity: the entity which depends on the above fence
    205 *
    206 * Returns true if the dependency can be optimized and false otherwise
    207 */
    208bool drm_sched_dependency_optimized(struct dma_fence* fence,
    209				    struct drm_sched_entity *entity)
    210{
    211	struct drm_gpu_scheduler *sched = entity->rq->sched;
    212	struct drm_sched_fence *s_fence;
    213
    214	if (!fence || dma_fence_is_signaled(fence))
    215		return false;
    216	if (fence->context == entity->fence_context)
    217		return true;
    218	s_fence = to_drm_sched_fence(fence);
    219	if (s_fence && s_fence->sched == sched)
    220		return true;
    221
    222	return false;
    223}
    224EXPORT_SYMBOL(drm_sched_dependency_optimized);
    225
    226/**
    227 * drm_sched_start_timeout - start timeout for reset worker
    228 *
    229 * @sched: scheduler instance to start the worker for
    230 *
    231 * Start the timeout for the given scheduler.
    232 */
    233static void drm_sched_start_timeout(struct drm_gpu_scheduler *sched)
    234{
    235	if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
    236	    !list_empty(&sched->pending_list))
    237		queue_delayed_work(sched->timeout_wq, &sched->work_tdr, sched->timeout);
    238}
    239
    240/**
    241 * drm_sched_fault - immediately start timeout handler
    242 *
    243 * @sched: scheduler where the timeout handling should be started.
    244 *
    245 * Start timeout handling immediately when the driver detects a hardware fault.
    246 */
    247void drm_sched_fault(struct drm_gpu_scheduler *sched)
    248{
    249	mod_delayed_work(sched->timeout_wq, &sched->work_tdr, 0);
    250}
    251EXPORT_SYMBOL(drm_sched_fault);
    252
    253/**
    254 * drm_sched_suspend_timeout - Suspend scheduler job timeout
    255 *
    256 * @sched: scheduler instance for which to suspend the timeout
    257 *
    258 * Suspend the delayed work timeout for the scheduler. This is done by
    259 * modifying the delayed work timeout to an arbitrary large value,
    260 * MAX_SCHEDULE_TIMEOUT in this case.
    261 *
    262 * Returns the timeout remaining
    263 *
    264 */
    265unsigned long drm_sched_suspend_timeout(struct drm_gpu_scheduler *sched)
    266{
    267	unsigned long sched_timeout, now = jiffies;
    268
    269	sched_timeout = sched->work_tdr.timer.expires;
    270
    271	/*
    272	 * Modify the timeout to an arbitrarily large value. This also prevents
    273	 * the timeout to be restarted when new submissions arrive
    274	 */
    275	if (mod_delayed_work(sched->timeout_wq, &sched->work_tdr, MAX_SCHEDULE_TIMEOUT)
    276			&& time_after(sched_timeout, now))
    277		return sched_timeout - now;
    278	else
    279		return sched->timeout;
    280}
    281EXPORT_SYMBOL(drm_sched_suspend_timeout);
    282
    283/**
    284 * drm_sched_resume_timeout - Resume scheduler job timeout
    285 *
    286 * @sched: scheduler instance for which to resume the timeout
    287 * @remaining: remaining timeout
    288 *
    289 * Resume the delayed work timeout for the scheduler.
    290 */
    291void drm_sched_resume_timeout(struct drm_gpu_scheduler *sched,
    292		unsigned long remaining)
    293{
    294	spin_lock(&sched->job_list_lock);
    295
    296	if (list_empty(&sched->pending_list))
    297		cancel_delayed_work(&sched->work_tdr);
    298	else
    299		mod_delayed_work(sched->timeout_wq, &sched->work_tdr, remaining);
    300
    301	spin_unlock(&sched->job_list_lock);
    302}
    303EXPORT_SYMBOL(drm_sched_resume_timeout);
    304
    305static void drm_sched_job_begin(struct drm_sched_job *s_job)
    306{
    307	struct drm_gpu_scheduler *sched = s_job->sched;
    308
    309	spin_lock(&sched->job_list_lock);
    310	list_add_tail(&s_job->list, &sched->pending_list);
    311	drm_sched_start_timeout(sched);
    312	spin_unlock(&sched->job_list_lock);
    313}
    314
    315static void drm_sched_job_timedout(struct work_struct *work)
    316{
    317	struct drm_gpu_scheduler *sched;
    318	struct drm_sched_job *job;
    319	enum drm_gpu_sched_stat status = DRM_GPU_SCHED_STAT_NOMINAL;
    320
    321	sched = container_of(work, struct drm_gpu_scheduler, work_tdr.work);
    322
    323	/* Protects against concurrent deletion in drm_sched_get_cleanup_job */
    324	spin_lock(&sched->job_list_lock);
    325	job = list_first_entry_or_null(&sched->pending_list,
    326				       struct drm_sched_job, list);
    327
    328	if (job) {
    329		/*
    330		 * Remove the bad job so it cannot be freed by concurrent
    331		 * drm_sched_cleanup_jobs. It will be reinserted back after sched->thread
    332		 * is parked at which point it's safe.
    333		 */
    334		list_del_init(&job->list);
    335		spin_unlock(&sched->job_list_lock);
    336
    337		status = job->sched->ops->timedout_job(job);
    338
    339		/*
    340		 * Guilty job did complete and hence needs to be manually removed
    341		 * See drm_sched_stop doc.
    342		 */
    343		if (sched->free_guilty) {
    344			job->sched->ops->free_job(job);
    345			sched->free_guilty = false;
    346		}
    347	} else {
    348		spin_unlock(&sched->job_list_lock);
    349	}
    350
    351	if (status != DRM_GPU_SCHED_STAT_ENODEV) {
    352		spin_lock(&sched->job_list_lock);
    353		drm_sched_start_timeout(sched);
    354		spin_unlock(&sched->job_list_lock);
    355	}
    356}
    357
    358 /**
    359  * drm_sched_increase_karma - Update sched_entity guilty flag
    360  *
    361  * @bad: The job guilty of time out
    362  *
    363  * Increment on every hang caused by the 'bad' job. If this exceeds the hang
    364  * limit of the scheduler then the respective sched entity is marked guilty and
    365  * jobs from it will not be scheduled further
    366  */
    367void drm_sched_increase_karma(struct drm_sched_job *bad)
    368{
    369	drm_sched_increase_karma_ext(bad, 1);
    370}
    371EXPORT_SYMBOL(drm_sched_increase_karma);
    372
    373void drm_sched_reset_karma(struct drm_sched_job *bad)
    374{
    375	drm_sched_increase_karma_ext(bad, 0);
    376}
    377EXPORT_SYMBOL(drm_sched_reset_karma);
    378
    379/**
    380 * drm_sched_stop - stop the scheduler
    381 *
    382 * @sched: scheduler instance
    383 * @bad: job which caused the time out
    384 *
    385 * Stop the scheduler and also removes and frees all completed jobs.
    386 * Note: bad job will not be freed as it might be used later and so it's
    387 * callers responsibility to release it manually if it's not part of the
    388 * pending list any more.
    389 *
    390 */
    391void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
    392{
    393	struct drm_sched_job *s_job, *tmp;
    394
    395	kthread_park(sched->thread);
    396
    397	/*
    398	 * Reinsert back the bad job here - now it's safe as
    399	 * drm_sched_get_cleanup_job cannot race against us and release the
    400	 * bad job at this point - we parked (waited for) any in progress
    401	 * (earlier) cleanups and drm_sched_get_cleanup_job will not be called
    402	 * now until the scheduler thread is unparked.
    403	 */
    404	if (bad && bad->sched == sched)
    405		/*
    406		 * Add at the head of the queue to reflect it was the earliest
    407		 * job extracted.
    408		 */
    409		list_add(&bad->list, &sched->pending_list);
    410
    411	/*
    412	 * Iterate the job list from later to  earlier one and either deactive
    413	 * their HW callbacks or remove them from pending list if they already
    414	 * signaled.
    415	 * This iteration is thread safe as sched thread is stopped.
    416	 */
    417	list_for_each_entry_safe_reverse(s_job, tmp, &sched->pending_list,
    418					 list) {
    419		if (s_job->s_fence->parent &&
    420		    dma_fence_remove_callback(s_job->s_fence->parent,
    421					      &s_job->cb)) {
    422			atomic_dec(&sched->hw_rq_count);
    423		} else {
    424			/*
    425			 * remove job from pending_list.
    426			 * Locking here is for concurrent resume timeout
    427			 */
    428			spin_lock(&sched->job_list_lock);
    429			list_del_init(&s_job->list);
    430			spin_unlock(&sched->job_list_lock);
    431
    432			/*
    433			 * Wait for job's HW fence callback to finish using s_job
    434			 * before releasing it.
    435			 *
    436			 * Job is still alive so fence refcount at least 1
    437			 */
    438			dma_fence_wait(&s_job->s_fence->finished, false);
    439
    440			/*
    441			 * We must keep bad job alive for later use during
    442			 * recovery by some of the drivers but leave a hint
    443			 * that the guilty job must be released.
    444			 */
    445			if (bad != s_job)
    446				sched->ops->free_job(s_job);
    447			else
    448				sched->free_guilty = true;
    449		}
    450	}
    451
    452	/*
    453	 * Stop pending timer in flight as we rearm it in  drm_sched_start. This
    454	 * avoids the pending timeout work in progress to fire right away after
    455	 * this TDR finished and before the newly restarted jobs had a
    456	 * chance to complete.
    457	 */
    458	cancel_delayed_work(&sched->work_tdr);
    459}
    460
    461EXPORT_SYMBOL(drm_sched_stop);
    462
    463/**
    464 * drm_sched_start - recover jobs after a reset
    465 *
    466 * @sched: scheduler instance
    467 * @full_recovery: proceed with complete sched restart
    468 *
    469 */
    470void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
    471{
    472	struct drm_sched_job *s_job, *tmp;
    473	int r;
    474
    475	/*
    476	 * Locking the list is not required here as the sched thread is parked
    477	 * so no new jobs are being inserted or removed. Also concurrent
    478	 * GPU recovers can't run in parallel.
    479	 */
    480	list_for_each_entry_safe(s_job, tmp, &sched->pending_list, list) {
    481		struct dma_fence *fence = s_job->s_fence->parent;
    482
    483		atomic_inc(&sched->hw_rq_count);
    484
    485		if (!full_recovery)
    486			continue;
    487
    488		if (fence) {
    489			r = dma_fence_add_callback(fence, &s_job->cb,
    490						   drm_sched_job_done_cb);
    491			if (r == -ENOENT)
    492				drm_sched_job_done(s_job);
    493			else if (r)
    494				DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n",
    495					  r);
    496		} else
    497			drm_sched_job_done(s_job);
    498	}
    499
    500	if (full_recovery) {
    501		spin_lock(&sched->job_list_lock);
    502		drm_sched_start_timeout(sched);
    503		spin_unlock(&sched->job_list_lock);
    504	}
    505
    506	kthread_unpark(sched->thread);
    507}
    508EXPORT_SYMBOL(drm_sched_start);
    509
    510/**
    511 * drm_sched_resubmit_jobs - helper to relaunch jobs from the pending list
    512 *
    513 * @sched: scheduler instance
    514 *
    515 */
    516void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched)
    517{
    518	drm_sched_resubmit_jobs_ext(sched, INT_MAX);
    519}
    520EXPORT_SYMBOL(drm_sched_resubmit_jobs);
    521
    522/**
    523 * drm_sched_resubmit_jobs_ext - helper to relunch certain number of jobs from mirror ring list
    524 *
    525 * @sched: scheduler instance
    526 * @max: job numbers to relaunch
    527 *
    528 */
    529void drm_sched_resubmit_jobs_ext(struct drm_gpu_scheduler *sched, int max)
    530{
    531	struct drm_sched_job *s_job, *tmp;
    532	uint64_t guilty_context;
    533	bool found_guilty = false;
    534	struct dma_fence *fence;
    535	int i = 0;
    536
    537	list_for_each_entry_safe(s_job, tmp, &sched->pending_list, list) {
    538		struct drm_sched_fence *s_fence = s_job->s_fence;
    539
    540		if (i >= max)
    541			break;
    542
    543		if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) {
    544			found_guilty = true;
    545			guilty_context = s_job->s_fence->scheduled.context;
    546		}
    547
    548		if (found_guilty && s_job->s_fence->scheduled.context == guilty_context)
    549			dma_fence_set_error(&s_fence->finished, -ECANCELED);
    550
    551		dma_fence_put(s_job->s_fence->parent);
    552		fence = sched->ops->run_job(s_job);
    553		i++;
    554
    555		if (IS_ERR_OR_NULL(fence)) {
    556			if (IS_ERR(fence))
    557				dma_fence_set_error(&s_fence->finished, PTR_ERR(fence));
    558
    559			s_job->s_fence->parent = NULL;
    560		} else {
    561			s_job->s_fence->parent = fence;
    562		}
    563	}
    564}
    565EXPORT_SYMBOL(drm_sched_resubmit_jobs_ext);
    566
    567/**
    568 * drm_sched_job_init - init a scheduler job
    569 * @job: scheduler job to init
    570 * @entity: scheduler entity to use
    571 * @owner: job owner for debugging
    572 *
    573 * Refer to drm_sched_entity_push_job() documentation
    574 * for locking considerations.
    575 *
    576 * Drivers must make sure drm_sched_job_cleanup() if this function returns
    577 * successfully, even when @job is aborted before drm_sched_job_arm() is called.
    578 *
    579 * WARNING: amdgpu abuses &drm_sched.ready to signal when the hardware
    580 * has died, which can mean that there's no valid runqueue for a @entity.
    581 * This function returns -ENOENT in this case (which probably should be -EIO as
    582 * a more meanigful return value).
    583 *
    584 * Returns 0 for success, negative error code otherwise.
    585 */
    586int drm_sched_job_init(struct drm_sched_job *job,
    587		       struct drm_sched_entity *entity,
    588		       void *owner)
    589{
    590	drm_sched_entity_select_rq(entity);
    591	if (!entity->rq)
    592		return -ENOENT;
    593
    594	job->entity = entity;
    595	job->s_fence = drm_sched_fence_alloc(entity, owner);
    596	if (!job->s_fence)
    597		return -ENOMEM;
    598
    599	INIT_LIST_HEAD(&job->list);
    600
    601	xa_init_flags(&job->dependencies, XA_FLAGS_ALLOC);
    602
    603	return 0;
    604}
    605EXPORT_SYMBOL(drm_sched_job_init);
    606
    607/**
    608 * drm_sched_job_arm - arm a scheduler job for execution
    609 * @job: scheduler job to arm
    610 *
    611 * This arms a scheduler job for execution. Specifically it initializes the
    612 * &drm_sched_job.s_fence of @job, so that it can be attached to struct dma_resv
    613 * or other places that need to track the completion of this job.
    614 *
    615 * Refer to drm_sched_entity_push_job() documentation for locking
    616 * considerations.
    617 *
    618 * This can only be called if drm_sched_job_init() succeeded.
    619 */
    620void drm_sched_job_arm(struct drm_sched_job *job)
    621{
    622	struct drm_gpu_scheduler *sched;
    623	struct drm_sched_entity *entity = job->entity;
    624
    625	BUG_ON(!entity);
    626
    627	sched = entity->rq->sched;
    628
    629	job->sched = sched;
    630	job->s_priority = entity->rq - sched->sched_rq;
    631	job->id = atomic64_inc_return(&sched->job_id_count);
    632
    633	drm_sched_fence_init(job->s_fence, job->entity);
    634}
    635EXPORT_SYMBOL(drm_sched_job_arm);
    636
    637/**
    638 * drm_sched_job_add_dependency - adds the fence as a job dependency
    639 * @job: scheduler job to add the dependencies to
    640 * @fence: the dma_fence to add to the list of dependencies.
    641 *
    642 * Note that @fence is consumed in both the success and error cases.
    643 *
    644 * Returns:
    645 * 0 on success, or an error on failing to expand the array.
    646 */
    647int drm_sched_job_add_dependency(struct drm_sched_job *job,
    648				 struct dma_fence *fence)
    649{
    650	struct dma_fence *entry;
    651	unsigned long index;
    652	u32 id = 0;
    653	int ret;
    654
    655	if (!fence)
    656		return 0;
    657
    658	/* Deduplicate if we already depend on a fence from the same context.
    659	 * This lets the size of the array of deps scale with the number of
    660	 * engines involved, rather than the number of BOs.
    661	 */
    662	xa_for_each(&job->dependencies, index, entry) {
    663		if (entry->context != fence->context)
    664			continue;
    665
    666		if (dma_fence_is_later(fence, entry)) {
    667			dma_fence_put(entry);
    668			xa_store(&job->dependencies, index, fence, GFP_KERNEL);
    669		} else {
    670			dma_fence_put(fence);
    671		}
    672		return 0;
    673	}
    674
    675	ret = xa_alloc(&job->dependencies, &id, fence, xa_limit_32b, GFP_KERNEL);
    676	if (ret != 0)
    677		dma_fence_put(fence);
    678
    679	return ret;
    680}
    681EXPORT_SYMBOL(drm_sched_job_add_dependency);
    682
    683/**
    684 * drm_sched_job_add_implicit_dependencies - adds implicit dependencies as job
    685 *   dependencies
    686 * @job: scheduler job to add the dependencies to
    687 * @obj: the gem object to add new dependencies from.
    688 * @write: whether the job might write the object (so we need to depend on
    689 * shared fences in the reservation object).
    690 *
    691 * This should be called after drm_gem_lock_reservations() on your array of
    692 * GEM objects used in the job but before updating the reservations with your
    693 * own fences.
    694 *
    695 * Returns:
    696 * 0 on success, or an error on failing to expand the array.
    697 */
    698int drm_sched_job_add_implicit_dependencies(struct drm_sched_job *job,
    699					    struct drm_gem_object *obj,
    700					    bool write)
    701{
    702	struct dma_resv_iter cursor;
    703	struct dma_fence *fence;
    704	int ret;
    705
    706	dma_resv_assert_held(obj->resv);
    707
    708	dma_resv_for_each_fence(&cursor, obj->resv, dma_resv_usage_rw(write),
    709				fence) {
    710		/* Make sure to grab an additional ref on the added fence */
    711		dma_fence_get(fence);
    712		ret = drm_sched_job_add_dependency(job, fence);
    713		if (ret) {
    714			dma_fence_put(fence);
    715			return ret;
    716		}
    717	}
    718	return 0;
    719}
    720EXPORT_SYMBOL(drm_sched_job_add_implicit_dependencies);
    721
    722
    723/**
    724 * drm_sched_job_cleanup - clean up scheduler job resources
    725 * @job: scheduler job to clean up
    726 *
    727 * Cleans up the resources allocated with drm_sched_job_init().
    728 *
    729 * Drivers should call this from their error unwind code if @job is aborted
    730 * before drm_sched_job_arm() is called.
    731 *
    732 * After that point of no return @job is committed to be executed by the
    733 * scheduler, and this function should be called from the
    734 * &drm_sched_backend_ops.free_job callback.
    735 */
    736void drm_sched_job_cleanup(struct drm_sched_job *job)
    737{
    738	struct dma_fence *fence;
    739	unsigned long index;
    740
    741	if (kref_read(&job->s_fence->finished.refcount)) {
    742		/* drm_sched_job_arm() has been called */
    743		dma_fence_put(&job->s_fence->finished);
    744	} else {
    745		/* aborted job before committing to run it */
    746		drm_sched_fence_free(job->s_fence);
    747	}
    748
    749	job->s_fence = NULL;
    750
    751	xa_for_each(&job->dependencies, index, fence) {
    752		dma_fence_put(fence);
    753	}
    754	xa_destroy(&job->dependencies);
    755
    756}
    757EXPORT_SYMBOL(drm_sched_job_cleanup);
    758
    759/**
    760 * drm_sched_ready - is the scheduler ready
    761 *
    762 * @sched: scheduler instance
    763 *
    764 * Return true if we can push more jobs to the hw, otherwise false.
    765 */
    766static bool drm_sched_ready(struct drm_gpu_scheduler *sched)
    767{
    768	return atomic_read(&sched->hw_rq_count) <
    769		sched->hw_submission_limit;
    770}
    771
    772/**
    773 * drm_sched_wakeup - Wake up the scheduler when it is ready
    774 *
    775 * @sched: scheduler instance
    776 *
    777 */
    778void drm_sched_wakeup(struct drm_gpu_scheduler *sched)
    779{
    780	if (drm_sched_ready(sched))
    781		wake_up_interruptible(&sched->wake_up_worker);
    782}
    783
    784/**
    785 * drm_sched_select_entity - Select next entity to process
    786 *
    787 * @sched: scheduler instance
    788 *
    789 * Returns the entity to process or NULL if none are found.
    790 */
    791static struct drm_sched_entity *
    792drm_sched_select_entity(struct drm_gpu_scheduler *sched)
    793{
    794	struct drm_sched_entity *entity;
    795	int i;
    796
    797	if (!drm_sched_ready(sched))
    798		return NULL;
    799
    800	/* Kernel run queue has higher priority than normal run queue*/
    801	for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
    802		entity = drm_sched_rq_select_entity(&sched->sched_rq[i]);
    803		if (entity)
    804			break;
    805	}
    806
    807	return entity;
    808}
    809
    810/**
    811 * drm_sched_get_cleanup_job - fetch the next finished job to be destroyed
    812 *
    813 * @sched: scheduler instance
    814 *
    815 * Returns the next finished job from the pending list (if there is one)
    816 * ready for it to be destroyed.
    817 */
    818static struct drm_sched_job *
    819drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
    820{
    821	struct drm_sched_job *job, *next;
    822
    823	spin_lock(&sched->job_list_lock);
    824
    825	job = list_first_entry_or_null(&sched->pending_list,
    826				       struct drm_sched_job, list);
    827
    828	if (job && dma_fence_is_signaled(&job->s_fence->finished)) {
    829		/* remove job from pending_list */
    830		list_del_init(&job->list);
    831
    832		/* cancel this job's TO timer */
    833		cancel_delayed_work(&sched->work_tdr);
    834		/* make the scheduled timestamp more accurate */
    835		next = list_first_entry_or_null(&sched->pending_list,
    836						typeof(*next), list);
    837
    838		if (next) {
    839			next->s_fence->scheduled.timestamp =
    840				job->s_fence->finished.timestamp;
    841			/* start TO timer for next job */
    842			drm_sched_start_timeout(sched);
    843		}
    844	} else {
    845		job = NULL;
    846	}
    847
    848	spin_unlock(&sched->job_list_lock);
    849
    850	return job;
    851}
    852
    853/**
    854 * drm_sched_pick_best - Get a drm sched from a sched_list with the least load
    855 * @sched_list: list of drm_gpu_schedulers
    856 * @num_sched_list: number of drm_gpu_schedulers in the sched_list
    857 *
    858 * Returns pointer of the sched with the least load or NULL if none of the
    859 * drm_gpu_schedulers are ready
    860 */
    861struct drm_gpu_scheduler *
    862drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
    863		     unsigned int num_sched_list)
    864{
    865	struct drm_gpu_scheduler *sched, *picked_sched = NULL;
    866	int i;
    867	unsigned int min_score = UINT_MAX, num_score;
    868
    869	for (i = 0; i < num_sched_list; ++i) {
    870		sched = sched_list[i];
    871
    872		if (!sched->ready) {
    873			DRM_WARN("scheduler %s is not ready, skipping",
    874				 sched->name);
    875			continue;
    876		}
    877
    878		num_score = atomic_read(sched->score);
    879		if (num_score < min_score) {
    880			min_score = num_score;
    881			picked_sched = sched;
    882		}
    883	}
    884
    885	return picked_sched;
    886}
    887EXPORT_SYMBOL(drm_sched_pick_best);
    888
    889/**
    890 * drm_sched_blocked - check if the scheduler is blocked
    891 *
    892 * @sched: scheduler instance
    893 *
    894 * Returns true if blocked, otherwise false.
    895 */
    896static bool drm_sched_blocked(struct drm_gpu_scheduler *sched)
    897{
    898	if (kthread_should_park()) {
    899		kthread_parkme();
    900		return true;
    901	}
    902
    903	return false;
    904}
    905
    906/**
    907 * drm_sched_main - main scheduler thread
    908 *
    909 * @param: scheduler instance
    910 *
    911 * Returns 0.
    912 */
    913static int drm_sched_main(void *param)
    914{
    915	struct drm_gpu_scheduler *sched = (struct drm_gpu_scheduler *)param;
    916	int r;
    917
    918	sched_set_fifo_low(current);
    919
    920	while (!kthread_should_stop()) {
    921		struct drm_sched_entity *entity = NULL;
    922		struct drm_sched_fence *s_fence;
    923		struct drm_sched_job *sched_job;
    924		struct dma_fence *fence;
    925		struct drm_sched_job *cleanup_job = NULL;
    926
    927		wait_event_interruptible(sched->wake_up_worker,
    928					 (cleanup_job = drm_sched_get_cleanup_job(sched)) ||
    929					 (!drm_sched_blocked(sched) &&
    930					  (entity = drm_sched_select_entity(sched))) ||
    931					 kthread_should_stop());
    932
    933		if (cleanup_job)
    934			sched->ops->free_job(cleanup_job);
    935
    936		if (!entity)
    937			continue;
    938
    939		sched_job = drm_sched_entity_pop_job(entity);
    940
    941		if (!sched_job) {
    942			complete(&entity->entity_idle);
    943			continue;
    944		}
    945
    946		s_fence = sched_job->s_fence;
    947
    948		atomic_inc(&sched->hw_rq_count);
    949		drm_sched_job_begin(sched_job);
    950
    951		trace_drm_run_job(sched_job, entity);
    952		fence = sched->ops->run_job(sched_job);
    953		complete(&entity->entity_idle);
    954		drm_sched_fence_scheduled(s_fence);
    955
    956		if (!IS_ERR_OR_NULL(fence)) {
    957			s_fence->parent = dma_fence_get(fence);
    958			r = dma_fence_add_callback(fence, &sched_job->cb,
    959						   drm_sched_job_done_cb);
    960			if (r == -ENOENT)
    961				drm_sched_job_done(sched_job);
    962			else if (r)
    963				DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n",
    964					  r);
    965			dma_fence_put(fence);
    966		} else {
    967			if (IS_ERR(fence))
    968				dma_fence_set_error(&s_fence->finished, PTR_ERR(fence));
    969
    970			drm_sched_job_done(sched_job);
    971		}
    972
    973		wake_up(&sched->job_scheduled);
    974	}
    975	return 0;
    976}
    977
    978/**
    979 * drm_sched_init - Init a gpu scheduler instance
    980 *
    981 * @sched: scheduler instance
    982 * @ops: backend operations for this scheduler
    983 * @hw_submission: number of hw submissions that can be in flight
    984 * @hang_limit: number of times to allow a job to hang before dropping it
    985 * @timeout: timeout value in jiffies for the scheduler
    986 * @timeout_wq: workqueue to use for timeout work. If NULL, the system_wq is
    987 *		used
    988 * @score: optional score atomic shared with other schedulers
    989 * @name: name used for debugging
    990 *
    991 * Return 0 on success, otherwise error code.
    992 */
    993int drm_sched_init(struct drm_gpu_scheduler *sched,
    994		   const struct drm_sched_backend_ops *ops,
    995		   unsigned hw_submission, unsigned hang_limit,
    996		   long timeout, struct workqueue_struct *timeout_wq,
    997		   atomic_t *score, const char *name, struct device *dev)
    998{
    999	int i, ret;
   1000	sched->ops = ops;
   1001	sched->hw_submission_limit = hw_submission;
   1002	sched->name = name;
   1003	sched->timeout = timeout;
   1004	sched->timeout_wq = timeout_wq ? : system_wq;
   1005	sched->hang_limit = hang_limit;
   1006	sched->score = score ? score : &sched->_score;
   1007	sched->dev = dev;
   1008	for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_COUNT; i++)
   1009		drm_sched_rq_init(sched, &sched->sched_rq[i]);
   1010
   1011	init_waitqueue_head(&sched->wake_up_worker);
   1012	init_waitqueue_head(&sched->job_scheduled);
   1013	INIT_LIST_HEAD(&sched->pending_list);
   1014	spin_lock_init(&sched->job_list_lock);
   1015	atomic_set(&sched->hw_rq_count, 0);
   1016	INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
   1017	atomic_set(&sched->_score, 0);
   1018	atomic64_set(&sched->job_id_count, 0);
   1019
   1020	/* Each scheduler will run on a seperate kernel thread */
   1021	sched->thread = kthread_run(drm_sched_main, sched, sched->name);
   1022	if (IS_ERR(sched->thread)) {
   1023		ret = PTR_ERR(sched->thread);
   1024		sched->thread = NULL;
   1025		DRM_DEV_ERROR(sched->dev, "Failed to create scheduler for %s.\n", name);
   1026		return ret;
   1027	}
   1028
   1029	sched->ready = true;
   1030	return 0;
   1031}
   1032EXPORT_SYMBOL(drm_sched_init);
   1033
   1034/**
   1035 * drm_sched_fini - Destroy a gpu scheduler
   1036 *
   1037 * @sched: scheduler instance
   1038 *
   1039 * Tears down and cleans up the scheduler.
   1040 */
   1041void drm_sched_fini(struct drm_gpu_scheduler *sched)
   1042{
   1043	struct drm_sched_entity *s_entity;
   1044	int i;
   1045
   1046	if (sched->thread)
   1047		kthread_stop(sched->thread);
   1048
   1049	for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
   1050		struct drm_sched_rq *rq = &sched->sched_rq[i];
   1051
   1052		if (!rq)
   1053			continue;
   1054
   1055		spin_lock(&rq->lock);
   1056		list_for_each_entry(s_entity, &rq->entities, list)
   1057			/*
   1058			 * Prevents reinsertion and marks job_queue as idle,
   1059			 * it will removed from rq in drm_sched_entity_fini
   1060			 * eventually
   1061			 */
   1062			s_entity->stopped = true;
   1063		spin_unlock(&rq->lock);
   1064
   1065	}
   1066
   1067	/* Wakeup everyone stuck in drm_sched_entity_flush for this scheduler */
   1068	wake_up_all(&sched->job_scheduled);
   1069
   1070	/* Confirm no work left behind accessing device structures */
   1071	cancel_delayed_work_sync(&sched->work_tdr);
   1072
   1073	sched->ready = false;
   1074}
   1075EXPORT_SYMBOL(drm_sched_fini);
   1076
   1077/**
   1078 * drm_sched_increase_karma_ext - Update sched_entity guilty flag
   1079 *
   1080 * @bad: The job guilty of time out
   1081 * @type: type for increase/reset karma
   1082 *
   1083 */
   1084void drm_sched_increase_karma_ext(struct drm_sched_job *bad, int type)
   1085{
   1086	int i;
   1087	struct drm_sched_entity *tmp;
   1088	struct drm_sched_entity *entity;
   1089	struct drm_gpu_scheduler *sched = bad->sched;
   1090
   1091	/* don't change @bad's karma if it's from KERNEL RQ,
   1092	 * because sometimes GPU hang would cause kernel jobs (like VM updating jobs)
   1093	 * corrupt but keep in mind that kernel jobs always considered good.
   1094	 */
   1095	if (bad->s_priority != DRM_SCHED_PRIORITY_KERNEL) {
   1096		if (type == 0)
   1097			atomic_set(&bad->karma, 0);
   1098		else if (type == 1)
   1099			atomic_inc(&bad->karma);
   1100
   1101		for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_KERNEL;
   1102		     i++) {
   1103			struct drm_sched_rq *rq = &sched->sched_rq[i];
   1104
   1105			spin_lock(&rq->lock);
   1106			list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
   1107				if (bad->s_fence->scheduled.context ==
   1108				    entity->fence_context) {
   1109					if (entity->guilty)
   1110						atomic_set(entity->guilty, type);
   1111					break;
   1112				}
   1113			}
   1114			spin_unlock(&rq->lock);
   1115			if (&entity->list != &rq->entities)
   1116				break;
   1117		}
   1118	}
   1119}
   1120EXPORT_SYMBOL(drm_sched_increase_karma_ext);