command_submission.c - cachepc-linux - Fork of AMDESE/linux with modifications for CachePC side-channel attack

	cachepc-linux Fork of AMDESE/linux with modifications for CachePC side-channel attack
	git clone https://git.sinitax.com/sinitax/cachepc-linux
	Log \| Files \| Refs \| README \| LICENSE \| sfeed.txt
command_submission.c (90625B)
      1// SPDX-License-Identifier: GPL-2.0
      2
      3/*
      4 * Copyright 2016-2021 HabanaLabs, Ltd.
      5 * All Rights Reserved.
      6 */
      7
      8#include <uapi/misc/habanalabs.h>
      9#include "habanalabs.h"
     10
     11#include <linux/uaccess.h>
     12#include <linux/slab.h>
     13
     14#define HL_CS_FLAGS_TYPE_MASK	(HL_CS_FLAGS_SIGNAL | HL_CS_FLAGS_WAIT | \
     15				HL_CS_FLAGS_COLLECTIVE_WAIT)
     16
     17#define MAX_TS_ITER_NUM 10
     18
     19/**
     20 * enum hl_cs_wait_status - cs wait status
     21 * @CS_WAIT_STATUS_BUSY: cs was not completed yet
     22 * @CS_WAIT_STATUS_COMPLETED: cs completed
     23 * @CS_WAIT_STATUS_GONE: cs completed but fence is already gone
     24 */
     25enum hl_cs_wait_status {
     26	CS_WAIT_STATUS_BUSY,
     27	CS_WAIT_STATUS_COMPLETED,
     28	CS_WAIT_STATUS_GONE
     29};
     30
     31static void job_wq_completion(struct work_struct *work);
     32static int _hl_cs_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
     33				u64 timeout_us, u64 seq,
     34				enum hl_cs_wait_status *status, s64 *timestamp);
     35static void cs_do_release(struct kref *ref);
     36
     37static void hl_sob_reset(struct kref *ref)
     38{
     39	struct hl_hw_sob *hw_sob = container_of(ref, struct hl_hw_sob,
     40							kref);
     41	struct hl_device *hdev = hw_sob->hdev;
     42
     43	dev_dbg(hdev->dev, "reset sob id %u\n", hw_sob->sob_id);
     44
     45	hdev->asic_funcs->reset_sob(hdev, hw_sob);
     46
     47	hw_sob->need_reset = false;
     48}
     49
     50void hl_sob_reset_error(struct kref *ref)
     51{
     52	struct hl_hw_sob *hw_sob = container_of(ref, struct hl_hw_sob,
     53							kref);
     54	struct hl_device *hdev = hw_sob->hdev;
     55
     56	dev_crit(hdev->dev,
     57		"SOB release shouldn't be called here, q_idx: %d, sob_id: %d\n",
     58		hw_sob->q_idx, hw_sob->sob_id);
     59}
     60
     61void hw_sob_put(struct hl_hw_sob *hw_sob)
     62{
     63	if (hw_sob)
     64		kref_put(&hw_sob->kref, hl_sob_reset);
     65}
     66
     67static void hw_sob_put_err(struct hl_hw_sob *hw_sob)
     68{
     69	if (hw_sob)
     70		kref_put(&hw_sob->kref, hl_sob_reset_error);
     71}
     72
     73void hw_sob_get(struct hl_hw_sob *hw_sob)
     74{
     75	if (hw_sob)
     76		kref_get(&hw_sob->kref);
     77}
     78
     79/**
     80 * hl_gen_sob_mask() - Generates a sob mask to be used in a monitor arm packet
     81 * @sob_base: sob base id
     82 * @sob_mask: sob user mask, each bit represents a sob offset from sob base
     83 * @mask: generated mask
     84 *
     85 * Return: 0 if given parameters are valid
     86 */
     87int hl_gen_sob_mask(u16 sob_base, u8 sob_mask, u8 *mask)
     88{
     89	int i;
     90
     91	if (sob_mask == 0)
     92		return -EINVAL;
     93
     94	if (sob_mask == 0x1) {
     95		*mask = ~(1 << (sob_base & 0x7));
     96	} else {
     97		/* find msb in order to verify sob range is valid */
     98		for (i = BITS_PER_BYTE - 1 ; i >= 0 ; i--)
     99			if (BIT(i) & sob_mask)
    100				break;
    101
    102		if (i > (HL_MAX_SOBS_PER_MONITOR - (sob_base & 0x7) - 1))
    103			return -EINVAL;
    104
    105		*mask = ~sob_mask;
    106	}
    107
    108	return 0;
    109}
    110
    111static void hl_fence_release(struct kref *kref)
    112{
    113	struct hl_fence *fence =
    114		container_of(kref, struct hl_fence, refcount);
    115	struct hl_cs_compl *hl_cs_cmpl =
    116		container_of(fence, struct hl_cs_compl, base_fence);
    117
    118	kfree(hl_cs_cmpl);
    119}
    120
    121void hl_fence_put(struct hl_fence *fence)
    122{
    123	if (IS_ERR_OR_NULL(fence))
    124		return;
    125	kref_put(&fence->refcount, hl_fence_release);
    126}
    127
    128void hl_fences_put(struct hl_fence **fence, int len)
    129{
    130	int i;
    131
    132	for (i = 0; i < len; i++, fence++)
    133		hl_fence_put(*fence);
    134}
    135
    136void hl_fence_get(struct hl_fence *fence)
    137{
    138	if (fence)
    139		kref_get(&fence->refcount);
    140}
    141
    142static void hl_fence_init(struct hl_fence *fence, u64 sequence)
    143{
    144	kref_init(&fence->refcount);
    145	fence->cs_sequence = sequence;
    146	fence->error = 0;
    147	fence->timestamp = ktime_set(0, 0);
    148	fence->mcs_handling_done = false;
    149	init_completion(&fence->completion);
    150}
    151
    152void cs_get(struct hl_cs *cs)
    153{
    154	kref_get(&cs->refcount);
    155}
    156
    157static int cs_get_unless_zero(struct hl_cs *cs)
    158{
    159	return kref_get_unless_zero(&cs->refcount);
    160}
    161
    162static void cs_put(struct hl_cs *cs)
    163{
    164	kref_put(&cs->refcount, cs_do_release);
    165}
    166
    167static void cs_job_do_release(struct kref *ref)
    168{
    169	struct hl_cs_job *job = container_of(ref, struct hl_cs_job, refcount);
    170
    171	kfree(job);
    172}
    173
    174static void cs_job_put(struct hl_cs_job *job)
    175{
    176	kref_put(&job->refcount, cs_job_do_release);
    177}
    178
    179bool cs_needs_completion(struct hl_cs *cs)
    180{
    181	/* In case this is a staged CS, only the last CS in sequence should
    182	 * get a completion, any non staged CS will always get a completion
    183	 */
    184	if (cs->staged_cs && !cs->staged_last)
    185		return false;
    186
    187	return true;
    188}
    189
    190bool cs_needs_timeout(struct hl_cs *cs)
    191{
    192	/* In case this is a staged CS, only the first CS in sequence should
    193	 * get a timeout, any non staged CS will always get a timeout
    194	 */
    195	if (cs->staged_cs && !cs->staged_first)
    196		return false;
    197
    198	return true;
    199}
    200
    201static bool is_cb_patched(struct hl_device *hdev, struct hl_cs_job *job)
    202{
    203	/*
    204	 * Patched CB is created for external queues jobs, and for H/W queues
    205	 * jobs if the user CB was allocated by driver and MMU is disabled.
    206	 */
    207	return (job->queue_type == QUEUE_TYPE_EXT ||
    208			(job->queue_type == QUEUE_TYPE_HW &&
    209					job->is_kernel_allocated_cb &&
    210					!hdev->mmu_enable));
    211}
    212
    213/*
    214 * cs_parser - parse the user command submission
    215 *
    216 * @hpriv	: pointer to the private data of the fd
    217 * @job        : pointer to the job that holds the command submission info
    218 *
    219 * The function parses the command submission of the user. It calls the
    220 * ASIC specific parser, which returns a list of memory blocks to send
    221 * to the device as different command buffers
    222 *
    223 */
    224static int cs_parser(struct hl_fpriv *hpriv, struct hl_cs_job *job)
    225{
    226	struct hl_device *hdev = hpriv->hdev;
    227	struct hl_cs_parser parser;
    228	int rc;
    229
    230	parser.ctx_id = job->cs->ctx->asid;
    231	parser.cs_sequence = job->cs->sequence;
    232	parser.job_id = job->id;
    233
    234	parser.hw_queue_id = job->hw_queue_id;
    235	parser.job_userptr_list = &job->userptr_list;
    236	parser.patched_cb = NULL;
    237	parser.user_cb = job->user_cb;
    238	parser.user_cb_size = job->user_cb_size;
    239	parser.queue_type = job->queue_type;
    240	parser.is_kernel_allocated_cb = job->is_kernel_allocated_cb;
    241	job->patched_cb = NULL;
    242	parser.completion = cs_needs_completion(job->cs);
    243
    244	rc = hdev->asic_funcs->cs_parser(hdev, &parser);
    245
    246	if (is_cb_patched(hdev, job)) {
    247		if (!rc) {
    248			job->patched_cb = parser.patched_cb;
    249			job->job_cb_size = parser.patched_cb_size;
    250			job->contains_dma_pkt = parser.contains_dma_pkt;
    251			atomic_inc(&job->patched_cb->cs_cnt);
    252		}
    253
    254		/*
    255		 * Whether the parsing worked or not, we don't need the
    256		 * original CB anymore because it was already parsed and
    257		 * won't be accessed again for this CS
    258		 */
    259		atomic_dec(&job->user_cb->cs_cnt);
    260		hl_cb_put(job->user_cb);
    261		job->user_cb = NULL;
    262	} else if (!rc) {
    263		job->job_cb_size = job->user_cb_size;
    264	}
    265
    266	return rc;
    267}
    268
    269static void complete_job(struct hl_device *hdev, struct hl_cs_job *job)
    270{
    271	struct hl_cs *cs = job->cs;
    272
    273	if (is_cb_patched(hdev, job)) {
    274		hl_userptr_delete_list(hdev, &job->userptr_list);
    275
    276		/*
    277		 * We might arrive here from rollback and patched CB wasn't
    278		 * created, so we need to check it's not NULL
    279		 */
    280		if (job->patched_cb) {
    281			atomic_dec(&job->patched_cb->cs_cnt);
    282			hl_cb_put(job->patched_cb);
    283		}
    284	}
    285
    286	/* For H/W queue jobs, if a user CB was allocated by driver and MMU is
    287	 * enabled, the user CB isn't released in cs_parser() and thus should be
    288	 * released here.
    289	 * This is also true for INT queues jobs which were allocated by driver
    290	 */
    291	if (job->is_kernel_allocated_cb &&
    292		((job->queue_type == QUEUE_TYPE_HW && hdev->mmu_enable) ||
    293				job->queue_type == QUEUE_TYPE_INT)) {
    294		atomic_dec(&job->user_cb->cs_cnt);
    295		hl_cb_put(job->user_cb);
    296	}
    297
    298	/*
    299	 * This is the only place where there can be multiple threads
    300	 * modifying the list at the same time
    301	 */
    302	spin_lock(&cs->job_lock);
    303	list_del(&job->cs_node);
    304	spin_unlock(&cs->job_lock);
    305
    306	hl_debugfs_remove_job(hdev, job);
    307
    308	/* We decrement reference only for a CS that gets completion
    309	 * because the reference was incremented only for this kind of CS
    310	 * right before it was scheduled.
    311	 *
    312	 * In staged submission, only the last CS marked as 'staged_last'
    313	 * gets completion, hence its release function will be called from here.
    314	 * As for all the rest CS's in the staged submission which do not get
    315	 * completion, their CS reference will be decremented by the
    316	 * 'staged_last' CS during the CS release flow.
    317	 * All relevant PQ CI counters will be incremented during the CS release
    318	 * flow by calling 'hl_hw_queue_update_ci'.
    319	 */
    320	if (cs_needs_completion(cs) &&
    321		(job->queue_type == QUEUE_TYPE_EXT ||
    322			job->queue_type == QUEUE_TYPE_HW))
    323		cs_put(cs);
    324
    325	cs_job_put(job);
    326}
    327
    328/*
    329 * hl_staged_cs_find_first - locate the first CS in this staged submission
    330 *
    331 * @hdev: pointer to device structure
    332 * @cs_seq: staged submission sequence number
    333 *
    334 * @note: This function must be called under 'hdev->cs_mirror_lock'
    335 *
    336 * Find and return a CS pointer with the given sequence
    337 */
    338struct hl_cs *hl_staged_cs_find_first(struct hl_device *hdev, u64 cs_seq)
    339{
    340	struct hl_cs *cs;
    341
    342	list_for_each_entry_reverse(cs, &hdev->cs_mirror_list, mirror_node)
    343		if (cs->staged_cs && cs->staged_first &&
    344				cs->sequence == cs_seq)
    345			return cs;
    346
    347	return NULL;
    348}
    349
    350/*
    351 * is_staged_cs_last_exists - returns true if the last CS in sequence exists
    352 *
    353 * @hdev: pointer to device structure
    354 * @cs: staged submission member
    355 *
    356 */
    357bool is_staged_cs_last_exists(struct hl_device *hdev, struct hl_cs *cs)
    358{
    359	struct hl_cs *last_entry;
    360
    361	last_entry = list_last_entry(&cs->staged_cs_node, struct hl_cs,
    362								staged_cs_node);
    363
    364	if (last_entry->staged_last)
    365		return true;
    366
    367	return false;
    368}
    369
    370/*
    371 * staged_cs_get - get CS reference if this CS is a part of a staged CS
    372 *
    373 * @hdev: pointer to device structure
    374 * @cs: current CS
    375 * @cs_seq: staged submission sequence number
    376 *
    377 * Increment CS reference for every CS in this staged submission except for
    378 * the CS which get completion.
    379 */
    380static void staged_cs_get(struct hl_device *hdev, struct hl_cs *cs)
    381{
    382	/* Only the last CS in this staged submission will get a completion.
    383	 * We must increment the reference for all other CS's in this
    384	 * staged submission.
    385	 * Once we get a completion we will release the whole staged submission.
    386	 */
    387	if (!cs->staged_last)
    388		cs_get(cs);
    389}
    390
    391/*
    392 * staged_cs_put - put a CS in case it is part of staged submission
    393 *
    394 * @hdev: pointer to device structure
    395 * @cs: CS to put
    396 *
    397 * This function decrements a CS reference (for a non completion CS)
    398 */
    399static void staged_cs_put(struct hl_device *hdev, struct hl_cs *cs)
    400{
    401	/* We release all CS's in a staged submission except the last
    402	 * CS which we have never incremented its reference.
    403	 */
    404	if (!cs_needs_completion(cs))
    405		cs_put(cs);
    406}
    407
    408static void cs_handle_tdr(struct hl_device *hdev, struct hl_cs *cs)
    409{
    410	struct hl_cs *next = NULL, *iter, *first_cs;
    411
    412	if (!cs_needs_timeout(cs))
    413		return;
    414
    415	spin_lock(&hdev->cs_mirror_lock);
    416
    417	/* We need to handle tdr only once for the complete staged submission.
    418	 * Hence, we choose the CS that reaches this function first which is
    419	 * the CS marked as 'staged_last'.
    420	 * In case single staged cs was submitted which has both first and last
    421	 * indications, then "cs_find_first" below will return NULL, since we
    422	 * removed the cs node from the list before getting here,
    423	 * in such cases just continue with the cs to cancel it's TDR work.
    424	 */
    425	if (cs->staged_cs && cs->staged_last) {
    426		first_cs = hl_staged_cs_find_first(hdev, cs->staged_sequence);
    427		if (first_cs)
    428			cs = first_cs;
    429	}
    430
    431	spin_unlock(&hdev->cs_mirror_lock);
    432
    433	/* Don't cancel TDR in case this CS was timedout because we might be
    434	 * running from the TDR context
    435	 */
    436	if (cs->timedout || hdev->timeout_jiffies == MAX_SCHEDULE_TIMEOUT)
    437		return;
    438
    439	if (cs->tdr_active)
    440		cancel_delayed_work_sync(&cs->work_tdr);
    441
    442	spin_lock(&hdev->cs_mirror_lock);
    443
    444	/* queue TDR for next CS */
    445	list_for_each_entry(iter, &hdev->cs_mirror_list, mirror_node)
    446		if (cs_needs_timeout(iter)) {
    447			next = iter;
    448			break;
    449		}
    450
    451	if (next && !next->tdr_active) {
    452		next->tdr_active = true;
    453		schedule_delayed_work(&next->work_tdr, next->timeout_jiffies);
    454	}
    455
    456	spin_unlock(&hdev->cs_mirror_lock);
    457}
    458
    459/*
    460 * force_complete_multi_cs - complete all contexts that wait on multi-CS
    461 *
    462 * @hdev: pointer to habanalabs device structure
    463 */
    464static void force_complete_multi_cs(struct hl_device *hdev)
    465{
    466	int i;
    467
    468	for (i = 0; i < MULTI_CS_MAX_USER_CTX; i++) {
    469		struct multi_cs_completion *mcs_compl;
    470
    471		mcs_compl = &hdev->multi_cs_completion[i];
    472
    473		spin_lock(&mcs_compl->lock);
    474
    475		if (!mcs_compl->used) {
    476			spin_unlock(&mcs_compl->lock);
    477			continue;
    478		}
    479
    480		/* when calling force complete no context should be waiting on
    481		 * multi-cS.
    482		 * We are calling the function as a protection for such case
    483		 * to free any pending context and print error message
    484		 */
    485		dev_err(hdev->dev,
    486				"multi-CS completion context %d still waiting when calling force completion\n",
    487				i);
    488		complete_all(&mcs_compl->completion);
    489		spin_unlock(&mcs_compl->lock);
    490	}
    491}
    492
    493/*
    494 * complete_multi_cs - complete all waiting entities on multi-CS
    495 *
    496 * @hdev: pointer to habanalabs device structure
    497 * @cs: CS structure
    498 * The function signals a waiting entity that has an overlapping stream masters
    499 * with the completed CS.
    500 * For example:
    501 * - a completed CS worked on stream master QID 4, multi CS completion
    502 *   is actively waiting on stream master QIDs 3, 5. don't send signal as no
    503 *   common stream master QID
    504 * - a completed CS worked on stream master QID 4, multi CS completion
    505 *   is actively waiting on stream master QIDs 3, 4. send signal as stream
    506 *   master QID 4 is common
    507 */
    508static void complete_multi_cs(struct hl_device *hdev, struct hl_cs *cs)
    509{
    510	struct hl_fence *fence = cs->fence;
    511	int i;
    512
    513	/* in case of multi CS check for completion only for the first CS */
    514	if (cs->staged_cs && !cs->staged_first)
    515		return;
    516
    517	for (i = 0; i < MULTI_CS_MAX_USER_CTX; i++) {
    518		struct multi_cs_completion *mcs_compl;
    519
    520		mcs_compl = &hdev->multi_cs_completion[i];
    521		if (!mcs_compl->used)
    522			continue;
    523
    524		spin_lock(&mcs_compl->lock);
    525
    526		/*
    527		 * complete if:
    528		 * 1. still waiting for completion
    529		 * 2. the completed CS has at least one overlapping stream
    530		 *    master with the stream masters in the completion
    531		 */
    532		if (mcs_compl->used &&
    533				(fence->stream_master_qid_map &
    534					mcs_compl->stream_master_qid_map)) {
    535			/* extract the timestamp only of first completed CS */
    536			if (!mcs_compl->timestamp)
    537				mcs_compl->timestamp = ktime_to_ns(fence->timestamp);
    538
    539			complete_all(&mcs_compl->completion);
    540
    541			/*
    542			 * Setting mcs_handling_done inside the lock ensures
    543			 * at least one fence have mcs_handling_done set to
    544			 * true before wait for mcs finish. This ensures at
    545			 * least one CS will be set as completed when polling
    546			 * mcs fences.
    547			 */
    548			fence->mcs_handling_done = true;
    549		}
    550
    551		spin_unlock(&mcs_compl->lock);
    552	}
    553	/* In case CS completed without mcs completion initialized */
    554	fence->mcs_handling_done = true;
    555}
    556
    557static inline void cs_release_sob_reset_handler(struct hl_device *hdev,
    558					struct hl_cs *cs,
    559					struct hl_cs_compl *hl_cs_cmpl)
    560{
    561	/* Skip this handler if the cs wasn't submitted, to avoid putting
    562	 * the hw_sob twice, since this case already handled at this point,
    563	 * also skip if the hw_sob pointer wasn't set.
    564	 */
    565	if (!hl_cs_cmpl->hw_sob || !cs->submitted)
    566		return;
    567
    568	spin_lock(&hl_cs_cmpl->lock);
    569
    570	/*
    571	 * we get refcount upon reservation of signals or signal/wait cs for the
    572	 * hw_sob object, and need to put it when the first staged cs
    573	 * (which cotains the encaps signals) or cs signal/wait is completed.
    574	 */
    575	if ((hl_cs_cmpl->type == CS_TYPE_SIGNAL) ||
    576			(hl_cs_cmpl->type == CS_TYPE_WAIT) ||
    577			(hl_cs_cmpl->type == CS_TYPE_COLLECTIVE_WAIT) ||
    578			(!!hl_cs_cmpl->encaps_signals)) {
    579		dev_dbg(hdev->dev,
    580				"CS 0x%llx type %d finished, sob_id: %d, sob_val: %u\n",
    581				hl_cs_cmpl->cs_seq,
    582				hl_cs_cmpl->type,
    583				hl_cs_cmpl->hw_sob->sob_id,
    584				hl_cs_cmpl->sob_val);
    585
    586		hw_sob_put(hl_cs_cmpl->hw_sob);
    587
    588		if (hl_cs_cmpl->type == CS_TYPE_COLLECTIVE_WAIT)
    589			hdev->asic_funcs->reset_sob_group(hdev,
    590					hl_cs_cmpl->sob_group);
    591	}
    592
    593	spin_unlock(&hl_cs_cmpl->lock);
    594}
    595
    596static void cs_do_release(struct kref *ref)
    597{
    598	struct hl_cs *cs = container_of(ref, struct hl_cs, refcount);
    599	struct hl_device *hdev = cs->ctx->hdev;
    600	struct hl_cs_job *job, *tmp;
    601	struct hl_cs_compl *hl_cs_cmpl =
    602			container_of(cs->fence, struct hl_cs_compl, base_fence);
    603
    604	cs->completed = true;
    605
    606	/*
    607	 * Although if we reached here it means that all external jobs have
    608	 * finished, because each one of them took refcnt to CS, we still
    609	 * need to go over the internal jobs and complete them. Otherwise, we
    610	 * will have leaked memory and what's worse, the CS object (and
    611	 * potentially the CTX object) could be released, while the JOB
    612	 * still holds a pointer to them (but no reference).
    613	 */
    614	list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node)
    615		complete_job(hdev, job);
    616
    617	if (!cs->submitted) {
    618		/*
    619		 * In case the wait for signal CS was submitted, the fence put
    620		 * occurs in init_signal_wait_cs() or collective_wait_init_cs()
    621		 * right before hanging on the PQ.
    622		 */
    623		if (cs->type == CS_TYPE_WAIT ||
    624				cs->type == CS_TYPE_COLLECTIVE_WAIT)
    625			hl_fence_put(cs->signal_fence);
    626
    627		goto out;
    628	}
    629
    630	/* Need to update CI for all queue jobs that does not get completion */
    631	hl_hw_queue_update_ci(cs);
    632
    633	/* remove CS from CS mirror list */
    634	spin_lock(&hdev->cs_mirror_lock);
    635	list_del_init(&cs->mirror_node);
    636	spin_unlock(&hdev->cs_mirror_lock);
    637
    638	cs_handle_tdr(hdev, cs);
    639
    640	if (cs->staged_cs) {
    641		/* the completion CS decrements reference for the entire
    642		 * staged submission
    643		 */
    644		if (cs->staged_last) {
    645			struct hl_cs *staged_cs, *tmp;
    646
    647			list_for_each_entry_safe(staged_cs, tmp,
    648					&cs->staged_cs_node, staged_cs_node)
    649				staged_cs_put(hdev, staged_cs);
    650		}
    651
    652		/* A staged CS will be a member in the list only after it
    653		 * was submitted. We used 'cs_mirror_lock' when inserting
    654		 * it to list so we will use it again when removing it
    655		 */
    656		if (cs->submitted) {
    657			spin_lock(&hdev->cs_mirror_lock);
    658			list_del(&cs->staged_cs_node);
    659			spin_unlock(&hdev->cs_mirror_lock);
    660		}
    661
    662		/* decrement refcount to handle when first staged cs
    663		 * with encaps signals is completed.
    664		 */
    665		if (hl_cs_cmpl->encaps_signals)
    666			kref_put(&hl_cs_cmpl->encaps_sig_hdl->refcount,
    667						hl_encaps_handle_do_release);
    668	}
    669
    670	if ((cs->type == CS_TYPE_WAIT || cs->type == CS_TYPE_COLLECTIVE_WAIT)
    671			&& cs->encaps_signals)
    672		kref_put(&cs->encaps_sig_hdl->refcount,
    673					hl_encaps_handle_do_release);
    674
    675out:
    676	/* Must be called before hl_ctx_put because inside we use ctx to get
    677	 * the device
    678	 */
    679	hl_debugfs_remove_cs(cs);
    680
    681	hl_ctx_put(cs->ctx);
    682
    683	/* We need to mark an error for not submitted because in that case
    684	 * the hl fence release flow is different. Mainly, we don't need
    685	 * to handle hw_sob for signal/wait
    686	 */
    687	if (cs->timedout)
    688		cs->fence->error = -ETIMEDOUT;
    689	else if (cs->aborted)
    690		cs->fence->error = -EIO;
    691	else if (!cs->submitted)
    692		cs->fence->error = -EBUSY;
    693
    694	if (unlikely(cs->skip_reset_on_timeout)) {
    695		dev_err(hdev->dev,
    696			"Command submission %llu completed after %llu (s)\n",
    697			cs->sequence,
    698			div_u64(jiffies - cs->submission_time_jiffies, HZ));
    699	}
    700
    701	if (cs->timestamp)
    702		cs->fence->timestamp = ktime_get();
    703	complete_all(&cs->fence->completion);
    704	complete_multi_cs(hdev, cs);
    705
    706	cs_release_sob_reset_handler(hdev, cs, hl_cs_cmpl);
    707
    708	hl_fence_put(cs->fence);
    709
    710	kfree(cs->jobs_in_queue_cnt);
    711	kfree(cs);
    712}
    713
    714static void cs_timedout(struct work_struct *work)
    715{
    716	struct hl_device *hdev;
    717	int rc;
    718	struct hl_cs *cs = container_of(work, struct hl_cs,
    719						 work_tdr.work);
    720	bool skip_reset_on_timeout = cs->skip_reset_on_timeout;
    721
    722	rc = cs_get_unless_zero(cs);
    723	if (!rc)
    724		return;
    725
    726	if ((!cs->submitted) || (cs->completed)) {
    727		cs_put(cs);
    728		return;
    729	}
    730
    731	/* Mark the CS is timed out so we won't try to cancel its TDR */
    732	if (likely(!skip_reset_on_timeout))
    733		cs->timedout = true;
    734
    735	hdev = cs->ctx->hdev;
    736
    737	/* Save only the first CS timeout parameters */
    738	rc = atomic_cmpxchg(&hdev->last_error.cs_timeout.write_disable, 0, 1);
    739	if (!rc) {
    740		hdev->last_error.cs_timeout.timestamp = ktime_get();
    741		hdev->last_error.cs_timeout.seq = cs->sequence;
    742	}
    743
    744	switch (cs->type) {
    745	case CS_TYPE_SIGNAL:
    746		dev_err(hdev->dev,
    747			"Signal command submission %llu has not finished in time!\n",
    748			cs->sequence);
    749		break;
    750
    751	case CS_TYPE_WAIT:
    752		dev_err(hdev->dev,
    753			"Wait command submission %llu has not finished in time!\n",
    754			cs->sequence);
    755		break;
    756
    757	case CS_TYPE_COLLECTIVE_WAIT:
    758		dev_err(hdev->dev,
    759			"Collective Wait command submission %llu has not finished in time!\n",
    760			cs->sequence);
    761		break;
    762
    763	default:
    764		dev_err(hdev->dev,
    765			"Command submission %llu has not finished in time!\n",
    766			cs->sequence);
    767		break;
    768	}
    769
    770	rc = hl_state_dump(hdev);
    771	if (rc)
    772		dev_err(hdev->dev, "Error during system state dump %d\n", rc);
    773
    774	cs_put(cs);
    775
    776	if (likely(!skip_reset_on_timeout)) {
    777		if (hdev->reset_on_lockup)
    778			hl_device_reset(hdev, HL_DRV_RESET_TDR);
    779		else
    780			hdev->reset_info.needs_reset = true;
    781	}
    782}
    783
    784static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
    785			enum hl_cs_type cs_type, u64 user_sequence,
    786			struct hl_cs **cs_new, u32 flags, u32 timeout)
    787{
    788	struct hl_cs_counters_atomic *cntr;
    789	struct hl_fence *other = NULL;
    790	struct hl_cs_compl *cs_cmpl;
    791	struct hl_cs *cs;
    792	int rc;
    793
    794	cntr = &hdev->aggregated_cs_counters;
    795
    796	cs = kzalloc(sizeof(*cs), GFP_ATOMIC);
    797	if (!cs)
    798		cs = kzalloc(sizeof(*cs), GFP_KERNEL);
    799
    800	if (!cs) {
    801		atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
    802		atomic64_inc(&cntr->out_of_mem_drop_cnt);
    803		return -ENOMEM;
    804	}
    805
    806	/* increment refcnt for context */
    807	hl_ctx_get(ctx);
    808
    809	cs->ctx = ctx;
    810	cs->submitted = false;
    811	cs->completed = false;
    812	cs->type = cs_type;
    813	cs->timestamp = !!(flags & HL_CS_FLAGS_TIMESTAMP);
    814	cs->encaps_signals = !!(flags & HL_CS_FLAGS_ENCAP_SIGNALS);
    815	cs->timeout_jiffies = timeout;
    816	cs->skip_reset_on_timeout =
    817		hdev->reset_info.skip_reset_on_timeout ||
    818		!!(flags & HL_CS_FLAGS_SKIP_RESET_ON_TIMEOUT);
    819	cs->submission_time_jiffies = jiffies;
    820	INIT_LIST_HEAD(&cs->job_list);
    821	INIT_DELAYED_WORK(&cs->work_tdr, cs_timedout);
    822	kref_init(&cs->refcount);
    823	spin_lock_init(&cs->job_lock);
    824
    825	cs_cmpl = kzalloc(sizeof(*cs_cmpl), GFP_ATOMIC);
    826	if (!cs_cmpl)
    827		cs_cmpl = kzalloc(sizeof(*cs_cmpl), GFP_KERNEL);
    828
    829	if (!cs_cmpl) {
    830		atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
    831		atomic64_inc(&cntr->out_of_mem_drop_cnt);
    832		rc = -ENOMEM;
    833		goto free_cs;
    834	}
    835
    836	cs->jobs_in_queue_cnt = kcalloc(hdev->asic_prop.max_queues,
    837			sizeof(*cs->jobs_in_queue_cnt), GFP_ATOMIC);
    838	if (!cs->jobs_in_queue_cnt)
    839		cs->jobs_in_queue_cnt = kcalloc(hdev->asic_prop.max_queues,
    840				sizeof(*cs->jobs_in_queue_cnt), GFP_KERNEL);
    841
    842	if (!cs->jobs_in_queue_cnt) {
    843		atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
    844		atomic64_inc(&cntr->out_of_mem_drop_cnt);
    845		rc = -ENOMEM;
    846		goto free_cs_cmpl;
    847	}
    848
    849	cs_cmpl->hdev = hdev;
    850	cs_cmpl->type = cs->type;
    851	spin_lock_init(&cs_cmpl->lock);
    852	cs->fence = &cs_cmpl->base_fence;
    853
    854	spin_lock(&ctx->cs_lock);
    855
    856	cs_cmpl->cs_seq = ctx->cs_sequence;
    857	other = ctx->cs_pending[cs_cmpl->cs_seq &
    858				(hdev->asic_prop.max_pending_cs - 1)];
    859
    860	if (other && !completion_done(&other->completion)) {
    861		/* If the following statement is true, it means we have reached
    862		 * a point in which only part of the staged submission was
    863		 * submitted and we don't have enough room in the 'cs_pending'
    864		 * array for the rest of the submission.
    865		 * This causes a deadlock because this CS will never be
    866		 * completed as it depends on future CS's for completion.
    867		 */
    868		if (other->cs_sequence == user_sequence)
    869			dev_crit_ratelimited(hdev->dev,
    870				"Staged CS %llu deadlock due to lack of resources",
    871				user_sequence);
    872
    873		dev_dbg_ratelimited(hdev->dev,
    874			"Rejecting CS because of too many in-flights CS\n");
    875		atomic64_inc(&ctx->cs_counters.max_cs_in_flight_drop_cnt);
    876		atomic64_inc(&cntr->max_cs_in_flight_drop_cnt);
    877		rc = -EAGAIN;
    878		goto free_fence;
    879	}
    880
    881	/* init hl_fence */
    882	hl_fence_init(&cs_cmpl->base_fence, cs_cmpl->cs_seq);
    883
    884	cs->sequence = cs_cmpl->cs_seq;
    885
    886	ctx->cs_pending[cs_cmpl->cs_seq &
    887			(hdev->asic_prop.max_pending_cs - 1)] =
    888							&cs_cmpl->base_fence;
    889	ctx->cs_sequence++;
    890
    891	hl_fence_get(&cs_cmpl->base_fence);
    892
    893	hl_fence_put(other);
    894
    895	spin_unlock(&ctx->cs_lock);
    896
    897	*cs_new = cs;
    898
    899	return 0;
    900
    901free_fence:
    902	spin_unlock(&ctx->cs_lock);
    903	kfree(cs->jobs_in_queue_cnt);
    904free_cs_cmpl:
    905	kfree(cs_cmpl);
    906free_cs:
    907	kfree(cs);
    908	hl_ctx_put(ctx);
    909	return rc;
    910}
    911
    912static void cs_rollback(struct hl_device *hdev, struct hl_cs *cs)
    913{
    914	struct hl_cs_job *job, *tmp;
    915
    916	staged_cs_put(hdev, cs);
    917
    918	list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node)
    919		complete_job(hdev, job);
    920}
    921
    922void hl_cs_rollback_all(struct hl_device *hdev, bool skip_wq_flush)
    923{
    924	int i;
    925	struct hl_cs *cs, *tmp;
    926
    927	if (!skip_wq_flush) {
    928		flush_workqueue(hdev->ts_free_obj_wq);
    929
    930		/* flush all completions before iterating over the CS mirror list in
    931		 * order to avoid a race with the release functions
    932		 */
    933		for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
    934			flush_workqueue(hdev->cq_wq[i]);
    935
    936	}
    937
    938	/* Make sure we don't have leftovers in the CS mirror list */
    939	list_for_each_entry_safe(cs, tmp, &hdev->cs_mirror_list, mirror_node) {
    940		cs_get(cs);
    941		cs->aborted = true;
    942		dev_warn_ratelimited(hdev->dev, "Killing CS %d.%llu\n",
    943				cs->ctx->asid, cs->sequence);
    944		cs_rollback(hdev, cs);
    945		cs_put(cs);
    946	}
    947
    948	force_complete_multi_cs(hdev);
    949}
    950
    951static void
    952wake_pending_user_interrupt_threads(struct hl_user_interrupt *interrupt)
    953{
    954	struct hl_user_pending_interrupt *pend, *temp;
    955	unsigned long flags;
    956
    957	spin_lock_irqsave(&interrupt->wait_list_lock, flags);
    958	list_for_each_entry_safe(pend, temp, &interrupt->wait_list_head, wait_list_node) {
    959		if (pend->ts_reg_info.buf) {
    960			list_del(&pend->wait_list_node);
    961			hl_mmap_mem_buf_put(pend->ts_reg_info.buf);
    962			hl_cb_put(pend->ts_reg_info.cq_cb);
    963		} else {
    964			pend->fence.error = -EIO;
    965			complete_all(&pend->fence.completion);
    966		}
    967	}
    968	spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);
    969}
    970
    971void hl_release_pending_user_interrupts(struct hl_device *hdev)
    972{
    973	struct asic_fixed_properties *prop = &hdev->asic_prop;
    974	struct hl_user_interrupt *interrupt;
    975	int i;
    976
    977	if (!prop->user_interrupt_count)
    978		return;
    979
    980	/* We iterate through the user interrupt requests and waking up all
    981	 * user threads waiting for interrupt completion. We iterate the
    982	 * list under a lock, this is why all user threads, once awake,
    983	 * will wait on the same lock and will release the waiting object upon
    984	 * unlock.
    985	 */
    986
    987	for (i = 0 ; i < prop->user_interrupt_count ; i++) {
    988		interrupt = &hdev->user_interrupt[i];
    989		wake_pending_user_interrupt_threads(interrupt);
    990	}
    991
    992	interrupt = &hdev->common_user_interrupt;
    993	wake_pending_user_interrupt_threads(interrupt);
    994}
    995
    996static void job_wq_completion(struct work_struct *work)
    997{
    998	struct hl_cs_job *job = container_of(work, struct hl_cs_job,
    999						finish_work);
   1000	struct hl_cs *cs = job->cs;
   1001	struct hl_device *hdev = cs->ctx->hdev;
   1002
   1003	/* job is no longer needed */
   1004	complete_job(hdev, job);
   1005}
   1006
   1007static int validate_queue_index(struct hl_device *hdev,
   1008				struct hl_cs_chunk *chunk,
   1009				enum hl_queue_type *queue_type,
   1010				bool *is_kernel_allocated_cb)
   1011{
   1012	struct asic_fixed_properties *asic = &hdev->asic_prop;
   1013	struct hw_queue_properties *hw_queue_prop;
   1014
   1015	/* This must be checked here to prevent out-of-bounds access to
   1016	 * hw_queues_props array
   1017	 */
   1018	if (chunk->queue_index >= asic->max_queues) {
   1019		dev_err(hdev->dev, "Queue index %d is invalid\n",
   1020			chunk->queue_index);
   1021		return -EINVAL;
   1022	}
   1023
   1024	hw_queue_prop = &asic->hw_queues_props[chunk->queue_index];
   1025
   1026	if (hw_queue_prop->type == QUEUE_TYPE_NA) {
   1027		dev_err(hdev->dev, "Queue index %d is invalid\n",
   1028			chunk->queue_index);
   1029		return -EINVAL;
   1030	}
   1031
   1032	if (hw_queue_prop->driver_only) {
   1033		dev_err(hdev->dev,
   1034			"Queue index %d is restricted for the kernel driver\n",
   1035			chunk->queue_index);
   1036		return -EINVAL;
   1037	}
   1038
   1039	/* When hw queue type isn't QUEUE_TYPE_HW,
   1040	 * USER_ALLOC_CB flag shall be referred as "don't care".
   1041	 */
   1042	if (hw_queue_prop->type == QUEUE_TYPE_HW) {
   1043		if (chunk->cs_chunk_flags & HL_CS_CHUNK_FLAGS_USER_ALLOC_CB) {
   1044			if (!(hw_queue_prop->cb_alloc_flags & CB_ALLOC_USER)) {
   1045				dev_err(hdev->dev,
   1046					"Queue index %d doesn't support user CB\n",
   1047					chunk->queue_index);
   1048				return -EINVAL;
   1049			}
   1050
   1051			*is_kernel_allocated_cb = false;
   1052		} else {
   1053			if (!(hw_queue_prop->cb_alloc_flags &
   1054					CB_ALLOC_KERNEL)) {
   1055				dev_err(hdev->dev,
   1056					"Queue index %d doesn't support kernel CB\n",
   1057					chunk->queue_index);
   1058				return -EINVAL;
   1059			}
   1060
   1061			*is_kernel_allocated_cb = true;
   1062		}
   1063	} else {
   1064		*is_kernel_allocated_cb = !!(hw_queue_prop->cb_alloc_flags
   1065						& CB_ALLOC_KERNEL);
   1066	}
   1067
   1068	*queue_type = hw_queue_prop->type;
   1069	return 0;
   1070}
   1071
   1072static struct hl_cb *get_cb_from_cs_chunk(struct hl_device *hdev,
   1073					struct hl_mem_mgr *mmg,
   1074					struct hl_cs_chunk *chunk)
   1075{
   1076	struct hl_cb *cb;
   1077
   1078	cb = hl_cb_get(mmg, chunk->cb_handle);
   1079	if (!cb) {
   1080		dev_err(hdev->dev, "CB handle 0x%llx invalid\n", chunk->cb_handle);
   1081		return NULL;
   1082	}
   1083
   1084	if ((chunk->cb_size < 8) || (chunk->cb_size > cb->size)) {
   1085		dev_err(hdev->dev, "CB size %u invalid\n", chunk->cb_size);
   1086		goto release_cb;
   1087	}
   1088
   1089	atomic_inc(&cb->cs_cnt);
   1090
   1091	return cb;
   1092
   1093release_cb:
   1094	hl_cb_put(cb);
   1095	return NULL;
   1096}
   1097
   1098struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev,
   1099		enum hl_queue_type queue_type, bool is_kernel_allocated_cb)
   1100{
   1101	struct hl_cs_job *job;
   1102
   1103	job = kzalloc(sizeof(*job), GFP_ATOMIC);
   1104	if (!job)
   1105		job = kzalloc(sizeof(*job), GFP_KERNEL);
   1106
   1107	if (!job)
   1108		return NULL;
   1109
   1110	kref_init(&job->refcount);
   1111	job->queue_type = queue_type;
   1112	job->is_kernel_allocated_cb = is_kernel_allocated_cb;
   1113
   1114	if (is_cb_patched(hdev, job))
   1115		INIT_LIST_HEAD(&job->userptr_list);
   1116
   1117	if (job->queue_type == QUEUE_TYPE_EXT)
   1118		INIT_WORK(&job->finish_work, job_wq_completion);
   1119
   1120	return job;
   1121}
   1122
   1123static enum hl_cs_type hl_cs_get_cs_type(u32 cs_type_flags)
   1124{
   1125	if (cs_type_flags & HL_CS_FLAGS_SIGNAL)
   1126		return CS_TYPE_SIGNAL;
   1127	else if (cs_type_flags & HL_CS_FLAGS_WAIT)
   1128		return CS_TYPE_WAIT;
   1129	else if (cs_type_flags & HL_CS_FLAGS_COLLECTIVE_WAIT)
   1130		return CS_TYPE_COLLECTIVE_WAIT;
   1131	else if (cs_type_flags & HL_CS_FLAGS_RESERVE_SIGNALS_ONLY)
   1132		return CS_RESERVE_SIGNALS;
   1133	else if (cs_type_flags & HL_CS_FLAGS_UNRESERVE_SIGNALS_ONLY)
   1134		return CS_UNRESERVE_SIGNALS;
   1135	else
   1136		return CS_TYPE_DEFAULT;
   1137}
   1138
   1139static int hl_cs_sanity_checks(struct hl_fpriv *hpriv, union hl_cs_args *args)
   1140{
   1141	struct hl_device *hdev = hpriv->hdev;
   1142	struct hl_ctx *ctx = hpriv->ctx;
   1143	u32 cs_type_flags, num_chunks;
   1144	enum hl_device_status status;
   1145	enum hl_cs_type cs_type;
   1146
   1147	if (!hl_device_operational(hdev, &status)) {
   1148		return -EBUSY;
   1149	}
   1150
   1151	if ((args->in.cs_flags & HL_CS_FLAGS_STAGED_SUBMISSION) &&
   1152			!hdev->supports_staged_submission) {
   1153		dev_err(hdev->dev, "staged submission not supported");
   1154		return -EPERM;
   1155	}
   1156
   1157	cs_type_flags = args->in.cs_flags & HL_CS_FLAGS_TYPE_MASK;
   1158
   1159	if (unlikely(cs_type_flags && !is_power_of_2(cs_type_flags))) {
   1160		dev_err(hdev->dev,
   1161			"CS type flags are mutually exclusive, context %d\n",
   1162			ctx->asid);
   1163		return -EINVAL;
   1164	}
   1165
   1166	cs_type = hl_cs_get_cs_type(cs_type_flags);
   1167	num_chunks = args->in.num_chunks_execute;
   1168
   1169	if (unlikely((cs_type != CS_TYPE_DEFAULT) &&
   1170					!hdev->supports_sync_stream)) {
   1171		dev_err(hdev->dev, "Sync stream CS is not supported\n");
   1172		return -EINVAL;
   1173	}
   1174
   1175	if (cs_type == CS_TYPE_DEFAULT) {
   1176		if (!num_chunks) {
   1177			dev_err(hdev->dev,
   1178				"Got execute CS with 0 chunks, context %d\n",
   1179				ctx->asid);
   1180			return -EINVAL;
   1181		}
   1182	} else if (num_chunks != 1) {
   1183		dev_err(hdev->dev,
   1184			"Sync stream CS mandates one chunk only, context %d\n",
   1185			ctx->asid);
   1186		return -EINVAL;
   1187	}
   1188
   1189	return 0;
   1190}
   1191
   1192static int hl_cs_copy_chunk_array(struct hl_device *hdev,
   1193					struct hl_cs_chunk **cs_chunk_array,
   1194					void __user *chunks, u32 num_chunks,
   1195					struct hl_ctx *ctx)
   1196{
   1197	u32 size_to_copy;
   1198
   1199	if (num_chunks > HL_MAX_JOBS_PER_CS) {
   1200		atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
   1201		atomic64_inc(&hdev->aggregated_cs_counters.validation_drop_cnt);
   1202		dev_err(hdev->dev,
   1203			"Number of chunks can NOT be larger than %d\n",
   1204			HL_MAX_JOBS_PER_CS);
   1205		return -EINVAL;
   1206	}
   1207
   1208	*cs_chunk_array = kmalloc_array(num_chunks, sizeof(**cs_chunk_array),
   1209					GFP_ATOMIC);
   1210	if (!*cs_chunk_array)
   1211		*cs_chunk_array = kmalloc_array(num_chunks,
   1212					sizeof(**cs_chunk_array), GFP_KERNEL);
   1213	if (!*cs_chunk_array) {
   1214		atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
   1215		atomic64_inc(&hdev->aggregated_cs_counters.out_of_mem_drop_cnt);
   1216		return -ENOMEM;
   1217	}
   1218
   1219	size_to_copy = num_chunks * sizeof(struct hl_cs_chunk);
   1220	if (copy_from_user(*cs_chunk_array, chunks, size_to_copy)) {
   1221		atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
   1222		atomic64_inc(&hdev->aggregated_cs_counters.validation_drop_cnt);
   1223		dev_err(hdev->dev, "Failed to copy cs chunk array from user\n");
   1224		kfree(*cs_chunk_array);
   1225		return -EFAULT;
   1226	}
   1227
   1228	return 0;
   1229}
   1230
   1231static int cs_staged_submission(struct hl_device *hdev, struct hl_cs *cs,
   1232				u64 sequence, u32 flags,
   1233				u32 encaps_signal_handle)
   1234{
   1235	if (!(flags & HL_CS_FLAGS_STAGED_SUBMISSION))
   1236		return 0;
   1237
   1238	cs->staged_last = !!(flags & HL_CS_FLAGS_STAGED_SUBMISSION_LAST);
   1239	cs->staged_first = !!(flags & HL_CS_FLAGS_STAGED_SUBMISSION_FIRST);
   1240
   1241	if (cs->staged_first) {
   1242		/* Staged CS sequence is the first CS sequence */
   1243		INIT_LIST_HEAD(&cs->staged_cs_node);
   1244		cs->staged_sequence = cs->sequence;
   1245
   1246		if (cs->encaps_signals)
   1247			cs->encaps_sig_hdl_id = encaps_signal_handle;
   1248	} else {
   1249		/* User sequence will be validated in 'hl_hw_queue_schedule_cs'
   1250		 * under the cs_mirror_lock
   1251		 */
   1252		cs->staged_sequence = sequence;
   1253	}
   1254
   1255	/* Increment CS reference if needed */
   1256	staged_cs_get(hdev, cs);
   1257
   1258	cs->staged_cs = true;
   1259
   1260	return 0;
   1261}
   1262
   1263static u32 get_stream_master_qid_mask(struct hl_device *hdev, u32 qid)
   1264{
   1265	int i;
   1266
   1267	for (i = 0; i < hdev->stream_master_qid_arr_size; i++)
   1268		if (qid == hdev->stream_master_qid_arr[i])
   1269			return BIT(i);
   1270
   1271	return 0;
   1272}
   1273
   1274static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
   1275				u32 num_chunks, u64 *cs_seq, u32 flags,
   1276				u32 encaps_signals_handle, u32 timeout,
   1277				u16 *signal_initial_sob_count)
   1278{
   1279	bool staged_mid, int_queues_only = true;
   1280	struct hl_device *hdev = hpriv->hdev;
   1281	struct hl_cs_chunk *cs_chunk_array;
   1282	struct hl_cs_counters_atomic *cntr;
   1283	struct hl_ctx *ctx = hpriv->ctx;
   1284	struct hl_cs_job *job;
   1285	struct hl_cs *cs;
   1286	struct hl_cb *cb;
   1287	u64 user_sequence;
   1288	u8 stream_master_qid_map = 0;
   1289	int rc, i;
   1290
   1291	cntr = &hdev->aggregated_cs_counters;
   1292	user_sequence = *cs_seq;
   1293	*cs_seq = ULLONG_MAX;
   1294
   1295	rc = hl_cs_copy_chunk_array(hdev, &cs_chunk_array, chunks, num_chunks,
   1296			hpriv->ctx);
   1297	if (rc)
   1298		goto out;
   1299
   1300	if ((flags & HL_CS_FLAGS_STAGED_SUBMISSION) &&
   1301			!(flags & HL_CS_FLAGS_STAGED_SUBMISSION_FIRST))
   1302		staged_mid = true;
   1303	else
   1304		staged_mid = false;
   1305
   1306	rc = allocate_cs(hdev, hpriv->ctx, CS_TYPE_DEFAULT,
   1307			staged_mid ? user_sequence : ULLONG_MAX, &cs, flags,
   1308			timeout);
   1309	if (rc)
   1310		goto free_cs_chunk_array;
   1311
   1312	*cs_seq = cs->sequence;
   1313
   1314	hl_debugfs_add_cs(cs);
   1315
   1316	rc = cs_staged_submission(hdev, cs, user_sequence, flags,
   1317						encaps_signals_handle);
   1318	if (rc)
   1319		goto free_cs_object;
   1320
   1321	/* If this is a staged submission we must return the staged sequence
   1322	 * rather than the internal CS sequence
   1323	 */
   1324	if (cs->staged_cs)
   1325		*cs_seq = cs->staged_sequence;
   1326
   1327	/* Validate ALL the CS chunks before submitting the CS */
   1328	for (i = 0 ; i < num_chunks ; i++) {
   1329		struct hl_cs_chunk *chunk = &cs_chunk_array[i];
   1330		enum hl_queue_type queue_type;
   1331		bool is_kernel_allocated_cb;
   1332
   1333		rc = validate_queue_index(hdev, chunk, &queue_type,
   1334						&is_kernel_allocated_cb);
   1335		if (rc) {
   1336			atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
   1337			atomic64_inc(&cntr->validation_drop_cnt);
   1338			goto free_cs_object;
   1339		}
   1340
   1341		if (is_kernel_allocated_cb) {
   1342			cb = get_cb_from_cs_chunk(hdev, &hpriv->mem_mgr, chunk);
   1343			if (!cb) {
   1344				atomic64_inc(
   1345					&ctx->cs_counters.validation_drop_cnt);
   1346				atomic64_inc(&cntr->validation_drop_cnt);
   1347				rc = -EINVAL;
   1348				goto free_cs_object;
   1349			}
   1350		} else {
   1351			cb = (struct hl_cb *) (uintptr_t) chunk->cb_handle;
   1352		}
   1353
   1354		if (queue_type == QUEUE_TYPE_EXT ||
   1355						queue_type == QUEUE_TYPE_HW) {
   1356			int_queues_only = false;
   1357
   1358			/*
   1359			 * store which stream are being used for external/HW
   1360			 * queues of this CS
   1361			 */
   1362			if (hdev->supports_wait_for_multi_cs)
   1363				stream_master_qid_map |=
   1364					get_stream_master_qid_mask(hdev,
   1365							chunk->queue_index);
   1366		}
   1367
   1368		job = hl_cs_allocate_job(hdev, queue_type,
   1369						is_kernel_allocated_cb);
   1370		if (!job) {
   1371			atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
   1372			atomic64_inc(&cntr->out_of_mem_drop_cnt);
   1373			dev_err(hdev->dev, "Failed to allocate a new job\n");
   1374			rc = -ENOMEM;
   1375			if (is_kernel_allocated_cb)
   1376				goto release_cb;
   1377
   1378			goto free_cs_object;
   1379		}
   1380
   1381		job->id = i + 1;
   1382		job->cs = cs;
   1383		job->user_cb = cb;
   1384		job->user_cb_size = chunk->cb_size;
   1385		job->hw_queue_id = chunk->queue_index;
   1386
   1387		cs->jobs_in_queue_cnt[job->hw_queue_id]++;
   1388
   1389		list_add_tail(&job->cs_node, &cs->job_list);
   1390
   1391		/*
   1392		 * Increment CS reference. When CS reference is 0, CS is
   1393		 * done and can be signaled to user and free all its resources
   1394		 * Only increment for JOB on external or H/W queues, because
   1395		 * only for those JOBs we get completion
   1396		 */
   1397		if (cs_needs_completion(cs) &&
   1398			(job->queue_type == QUEUE_TYPE_EXT ||
   1399				job->queue_type == QUEUE_TYPE_HW))
   1400			cs_get(cs);
   1401
   1402		hl_debugfs_add_job(hdev, job);
   1403
   1404		rc = cs_parser(hpriv, job);
   1405		if (rc) {
   1406			atomic64_inc(&ctx->cs_counters.parsing_drop_cnt);
   1407			atomic64_inc(&cntr->parsing_drop_cnt);
   1408			dev_err(hdev->dev,
   1409				"Failed to parse JOB %d.%llu.%d, err %d, rejecting the CS\n",
   1410				cs->ctx->asid, cs->sequence, job->id, rc);
   1411			goto free_cs_object;
   1412		}
   1413	}
   1414
   1415	/* We allow a CS with any queue type combination as long as it does
   1416	 * not get a completion
   1417	 */
   1418	if (int_queues_only && cs_needs_completion(cs)) {
   1419		atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
   1420		atomic64_inc(&cntr->validation_drop_cnt);
   1421		dev_err(hdev->dev,
   1422			"Reject CS %d.%llu since it contains only internal queues jobs and needs completion\n",
   1423			cs->ctx->asid, cs->sequence);
   1424		rc = -EINVAL;
   1425		goto free_cs_object;
   1426	}
   1427
   1428	/*
   1429	 * store the (external/HW queues) streams used by the CS in the
   1430	 * fence object for multi-CS completion
   1431	 */
   1432	if (hdev->supports_wait_for_multi_cs)
   1433		cs->fence->stream_master_qid_map = stream_master_qid_map;
   1434
   1435	rc = hl_hw_queue_schedule_cs(cs);
   1436	if (rc) {
   1437		if (rc != -EAGAIN)
   1438			dev_err(hdev->dev,
   1439				"Failed to submit CS %d.%llu to H/W queues, error %d\n",
   1440				cs->ctx->asid, cs->sequence, rc);
   1441		goto free_cs_object;
   1442	}
   1443
   1444	*signal_initial_sob_count = cs->initial_sob_count;
   1445
   1446	rc = HL_CS_STATUS_SUCCESS;
   1447	goto put_cs;
   1448
   1449release_cb:
   1450	atomic_dec(&cb->cs_cnt);
   1451	hl_cb_put(cb);
   1452free_cs_object:
   1453	cs_rollback(hdev, cs);
   1454	*cs_seq = ULLONG_MAX;
   1455	/* The path below is both for good and erroneous exits */
   1456put_cs:
   1457	/* We finished with the CS in this function, so put the ref */
   1458	cs_put(cs);
   1459free_cs_chunk_array:
   1460	kfree(cs_chunk_array);
   1461out:
   1462	return rc;
   1463}
   1464
   1465static int hl_cs_ctx_switch(struct hl_fpriv *hpriv, union hl_cs_args *args,
   1466				u64 *cs_seq)
   1467{
   1468	struct hl_device *hdev = hpriv->hdev;
   1469	struct hl_ctx *ctx = hpriv->ctx;
   1470	bool need_soft_reset = false;
   1471	int rc = 0, do_ctx_switch;
   1472	void __user *chunks;
   1473	u32 num_chunks, tmp;
   1474	u16 sob_count;
   1475	int ret;
   1476
   1477	do_ctx_switch = atomic_cmpxchg(&ctx->thread_ctx_switch_token, 1, 0);
   1478
   1479	if (do_ctx_switch || (args->in.cs_flags & HL_CS_FLAGS_FORCE_RESTORE)) {
   1480		mutex_lock(&hpriv->restore_phase_mutex);
   1481
   1482		if (do_ctx_switch) {
   1483			rc = hdev->asic_funcs->context_switch(hdev, ctx->asid);
   1484			if (rc) {
   1485				dev_err_ratelimited(hdev->dev,
   1486					"Failed to switch to context %d, rejecting CS! %d\n",
   1487					ctx->asid, rc);
   1488				/*
   1489				 * If we timedout, or if the device is not IDLE
   1490				 * while we want to do context-switch (-EBUSY),
   1491				 * we need to soft-reset because QMAN is
   1492				 * probably stuck. However, we can't call to
   1493				 * reset here directly because of deadlock, so
   1494				 * need to do it at the very end of this
   1495				 * function
   1496				 */
   1497				if ((rc == -ETIMEDOUT) || (rc == -EBUSY))
   1498					need_soft_reset = true;
   1499				mutex_unlock(&hpriv->restore_phase_mutex);
   1500				goto out;
   1501			}
   1502		}
   1503
   1504		hdev->asic_funcs->restore_phase_topology(hdev);
   1505
   1506		chunks = (void __user *) (uintptr_t) args->in.chunks_restore;
   1507		num_chunks = args->in.num_chunks_restore;
   1508
   1509		if (!num_chunks) {
   1510			dev_dbg(hdev->dev,
   1511				"Need to run restore phase but restore CS is empty\n");
   1512			rc = 0;
   1513		} else {
   1514			rc = cs_ioctl_default(hpriv, chunks, num_chunks,
   1515					cs_seq, 0, 0, hdev->timeout_jiffies, &sob_count);
   1516		}
   1517
   1518		mutex_unlock(&hpriv->restore_phase_mutex);
   1519
   1520		if (rc) {
   1521			dev_err(hdev->dev,
   1522				"Failed to submit restore CS for context %d (%d)\n",
   1523				ctx->asid, rc);
   1524			goto out;
   1525		}
   1526
   1527		/* Need to wait for restore completion before execution phase */
   1528		if (num_chunks) {
   1529			enum hl_cs_wait_status status;
   1530wait_again:
   1531			ret = _hl_cs_wait_ioctl(hdev, ctx,
   1532					jiffies_to_usecs(hdev->timeout_jiffies),
   1533					*cs_seq, &status, NULL);
   1534			if (ret) {
   1535				if (ret == -ERESTARTSYS) {
   1536					usleep_range(100, 200);
   1537					goto wait_again;
   1538				}
   1539
   1540				dev_err(hdev->dev,
   1541					"Restore CS for context %d failed to complete %d\n",
   1542					ctx->asid, ret);
   1543				rc = -ENOEXEC;
   1544				goto out;
   1545			}
   1546		}
   1547
   1548		ctx->thread_ctx_switch_wait_token = 1;
   1549
   1550	} else if (!ctx->thread_ctx_switch_wait_token) {
   1551		rc = hl_poll_timeout_memory(hdev,
   1552			&ctx->thread_ctx_switch_wait_token, tmp, (tmp == 1),
   1553			100, jiffies_to_usecs(hdev->timeout_jiffies), false);
   1554
   1555		if (rc == -ETIMEDOUT) {
   1556			dev_err(hdev->dev,
   1557				"context switch phase timeout (%d)\n", tmp);
   1558			goto out;
   1559		}
   1560	}
   1561
   1562out:
   1563	if ((rc == -ETIMEDOUT || rc == -EBUSY) && (need_soft_reset))
   1564		hl_device_reset(hdev, 0);
   1565
   1566	return rc;
   1567}
   1568
   1569/*
   1570 * hl_cs_signal_sob_wraparound_handler: handle SOB value wrapaound case.
   1571 * if the SOB value reaches the max value move to the other SOB reserved
   1572 * to the queue.
   1573 * @hdev: pointer to device structure
   1574 * @q_idx: stream queue index
   1575 * @hw_sob: the H/W SOB used in this signal CS.
   1576 * @count: signals count
   1577 * @encaps_sig: tells whether it's reservation for encaps signals or not.
   1578 *
   1579 * Note that this function must be called while hw_queues_lock is taken.
   1580 */
   1581int hl_cs_signal_sob_wraparound_handler(struct hl_device *hdev, u32 q_idx,
   1582			struct hl_hw_sob **hw_sob, u32 count, bool encaps_sig)
   1583
   1584{
   1585	struct hl_sync_stream_properties *prop;
   1586	struct hl_hw_sob *sob = *hw_sob, *other_sob;
   1587	u8 other_sob_offset;
   1588
   1589	prop = &hdev->kernel_queues[q_idx].sync_stream_prop;
   1590
   1591	hw_sob_get(sob);
   1592
   1593	/* check for wraparound */
   1594	if (prop->next_sob_val + count >= HL_MAX_SOB_VAL) {
   1595		/*
   1596		 * Decrement as we reached the max value.
   1597		 * The release function won't be called here as we've
   1598		 * just incremented the refcount right before calling this
   1599		 * function.
   1600		 */
   1601		hw_sob_put_err(sob);
   1602
   1603		/*
   1604		 * check the other sob value, if it still in use then fail
   1605		 * otherwise make the switch
   1606		 */
   1607		other_sob_offset = (prop->curr_sob_offset + 1) % HL_RSVD_SOBS;
   1608		other_sob = &prop->hw_sob[other_sob_offset];
   1609
   1610		if (kref_read(&other_sob->kref) != 1) {
   1611			dev_err(hdev->dev, "error: Cannot switch SOBs q_idx: %d\n",
   1612								q_idx);
   1613			return -EINVAL;
   1614		}
   1615
   1616		/*
   1617		 * next_sob_val always points to the next available signal
   1618		 * in the sob, so in encaps signals it will be the next one
   1619		 * after reserving the required amount.
   1620		 */
   1621		if (encaps_sig)
   1622			prop->next_sob_val = count + 1;
   1623		else
   1624			prop->next_sob_val = count;
   1625
   1626		/* only two SOBs are currently in use */
   1627		prop->curr_sob_offset = other_sob_offset;
   1628		*hw_sob = other_sob;
   1629
   1630		/*
   1631		 * check if other_sob needs reset, then do it before using it
   1632		 * for the reservation or the next signal cs.
   1633		 * we do it here, and for both encaps and regular signal cs
   1634		 * cases in order to avoid possible races of two kref_put
   1635		 * of the sob which can occur at the same time if we move the
   1636		 * sob reset(kref_put) to cs_do_release function.
   1637		 * in addition, if we have combination of cs signal and
   1638		 * encaps, and at the point we need to reset the sob there was
   1639		 * no more reservations and only signal cs keep coming,
   1640		 * in such case we need signal_cs to put the refcount and
   1641		 * reset the sob.
   1642		 */
   1643		if (other_sob->need_reset)
   1644			hw_sob_put(other_sob);
   1645
   1646		if (encaps_sig) {
   1647			/* set reset indication for the sob */
   1648			sob->need_reset = true;
   1649			hw_sob_get(other_sob);
   1650		}
   1651
   1652		dev_dbg(hdev->dev, "switched to SOB %d, q_idx: %d\n",
   1653				prop->curr_sob_offset, q_idx);
   1654	} else {
   1655		prop->next_sob_val += count;
   1656	}
   1657
   1658	return 0;
   1659}
   1660
   1661static int cs_ioctl_extract_signal_seq(struct hl_device *hdev,
   1662		struct hl_cs_chunk *chunk, u64 *signal_seq, struct hl_ctx *ctx,
   1663		bool encaps_signals)
   1664{
   1665	u64 *signal_seq_arr = NULL;
   1666	u32 size_to_copy, signal_seq_arr_len;
   1667	int rc = 0;
   1668
   1669	if (encaps_signals) {
   1670		*signal_seq = chunk->encaps_signal_seq;
   1671		return 0;
   1672	}
   1673
   1674	signal_seq_arr_len = chunk->num_signal_seq_arr;
   1675
   1676	/* currently only one signal seq is supported */
   1677	if (signal_seq_arr_len != 1) {
   1678		atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
   1679		atomic64_inc(&hdev->aggregated_cs_counters.validation_drop_cnt);
   1680		dev_err(hdev->dev,
   1681			"Wait for signal CS supports only one signal CS seq\n");
   1682		return -EINVAL;
   1683	}
   1684
   1685	signal_seq_arr = kmalloc_array(signal_seq_arr_len,
   1686					sizeof(*signal_seq_arr),
   1687					GFP_ATOMIC);
   1688	if (!signal_seq_arr)
   1689		signal_seq_arr = kmalloc_array(signal_seq_arr_len,
   1690					sizeof(*signal_seq_arr),
   1691					GFP_KERNEL);
   1692	if (!signal_seq_arr) {
   1693		atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
   1694		atomic64_inc(&hdev->aggregated_cs_counters.out_of_mem_drop_cnt);
   1695		return -ENOMEM;
   1696	}
   1697
   1698	size_to_copy = signal_seq_arr_len * sizeof(*signal_seq_arr);
   1699	if (copy_from_user(signal_seq_arr,
   1700				u64_to_user_ptr(chunk->signal_seq_arr),
   1701				size_to_copy)) {
   1702		atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
   1703		atomic64_inc(&hdev->aggregated_cs_counters.validation_drop_cnt);
   1704		dev_err(hdev->dev,
   1705			"Failed to copy signal seq array from user\n");
   1706		rc = -EFAULT;
   1707		goto out;
   1708	}
   1709
   1710	/* currently it is guaranteed to have only one signal seq */
   1711	*signal_seq = signal_seq_arr[0];
   1712
   1713out:
   1714	kfree(signal_seq_arr);
   1715
   1716	return rc;
   1717}
   1718
   1719static int cs_ioctl_signal_wait_create_jobs(struct hl_device *hdev,
   1720		struct hl_ctx *ctx, struct hl_cs *cs,
   1721		enum hl_queue_type q_type, u32 q_idx, u32 encaps_signal_offset)
   1722{
   1723	struct hl_cs_counters_atomic *cntr;
   1724	struct hl_cs_job *job;
   1725	struct hl_cb *cb;
   1726	u32 cb_size;
   1727
   1728	cntr = &hdev->aggregated_cs_counters;
   1729
   1730	job = hl_cs_allocate_job(hdev, q_type, true);
   1731	if (!job) {
   1732		atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
   1733		atomic64_inc(&cntr->out_of_mem_drop_cnt);
   1734		dev_err(hdev->dev, "Failed to allocate a new job\n");
   1735		return -ENOMEM;
   1736	}
   1737
   1738	if (cs->type == CS_TYPE_WAIT)
   1739		cb_size = hdev->asic_funcs->get_wait_cb_size(hdev);
   1740	else
   1741		cb_size = hdev->asic_funcs->get_signal_cb_size(hdev);
   1742
   1743	cb = hl_cb_kernel_create(hdev, cb_size,
   1744				q_type == QUEUE_TYPE_HW && hdev->mmu_enable);
   1745	if (!cb) {
   1746		atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
   1747		atomic64_inc(&cntr->out_of_mem_drop_cnt);
   1748		kfree(job);
   1749		return -EFAULT;
   1750	}
   1751
   1752	job->id = 0;
   1753	job->cs = cs;
   1754	job->user_cb = cb;
   1755	atomic_inc(&job->user_cb->cs_cnt);
   1756	job->user_cb_size = cb_size;
   1757	job->hw_queue_id = q_idx;
   1758
   1759	if ((cs->type == CS_TYPE_WAIT || cs->type == CS_TYPE_COLLECTIVE_WAIT)
   1760			&& cs->encaps_signals)
   1761		job->encaps_sig_wait_offset = encaps_signal_offset;
   1762	/*
   1763	 * No need in parsing, user CB is the patched CB.
   1764	 * We call hl_cb_destroy() out of two reasons - we don't need the CB in
   1765	 * the CB idr anymore and to decrement its refcount as it was
   1766	 * incremented inside hl_cb_kernel_create().
   1767	 */
   1768	job->patched_cb = job->user_cb;
   1769	job->job_cb_size = job->user_cb_size;
   1770	hl_cb_destroy(&hdev->kernel_mem_mgr, cb->buf->handle);
   1771
   1772	/* increment refcount as for external queues we get completion */
   1773	cs_get(cs);
   1774
   1775	cs->jobs_in_queue_cnt[job->hw_queue_id]++;
   1776
   1777	list_add_tail(&job->cs_node, &cs->job_list);
   1778
   1779	hl_debugfs_add_job(hdev, job);
   1780
   1781	return 0;
   1782}
   1783
   1784static int cs_ioctl_reserve_signals(struct hl_fpriv *hpriv,
   1785				u32 q_idx, u32 count,
   1786				u32 *handle_id, u32 *sob_addr,
   1787				u32 *signals_count)
   1788{
   1789	struct hw_queue_properties *hw_queue_prop;
   1790	struct hl_sync_stream_properties *prop;
   1791	struct hl_device *hdev = hpriv->hdev;
   1792	struct hl_cs_encaps_sig_handle *handle;
   1793	struct hl_encaps_signals_mgr *mgr;
   1794	struct hl_hw_sob *hw_sob;
   1795	int hdl_id;
   1796	int rc = 0;
   1797
   1798	if (count >= HL_MAX_SOB_VAL) {
   1799		dev_err(hdev->dev, "signals count(%u) exceeds the max SOB value\n",
   1800						count);
   1801		rc = -EINVAL;
   1802		goto out;
   1803	}
   1804
   1805	if (q_idx >= hdev->asic_prop.max_queues) {
   1806		dev_err(hdev->dev, "Queue index %d is invalid\n",
   1807			q_idx);
   1808		rc = -EINVAL;
   1809		goto out;
   1810	}
   1811
   1812	hw_queue_prop = &hdev->asic_prop.hw_queues_props[q_idx];
   1813
   1814	if (!hw_queue_prop->supports_sync_stream) {
   1815		dev_err(hdev->dev,
   1816			"Queue index %d does not support sync stream operations\n",
   1817									q_idx);
   1818		rc = -EINVAL;
   1819		goto out;
   1820	}
   1821
   1822	prop = &hdev->kernel_queues[q_idx].sync_stream_prop;
   1823
   1824	handle = kzalloc(sizeof(*handle), GFP_KERNEL);
   1825	if (!handle) {
   1826		rc = -ENOMEM;
   1827		goto out;
   1828	}
   1829
   1830	handle->count = count;
   1831
   1832	hl_ctx_get(hpriv->ctx);
   1833	handle->ctx = hpriv->ctx;
   1834	mgr = &hpriv->ctx->sig_mgr;
   1835
   1836	spin_lock(&mgr->lock);
   1837	hdl_id = idr_alloc(&mgr->handles, handle, 1, 0, GFP_ATOMIC);
   1838	spin_unlock(&mgr->lock);
   1839
   1840	if (hdl_id < 0) {
   1841		dev_err(hdev->dev, "Failed to allocate IDR for a new signal reservation\n");
   1842		rc = -EINVAL;
   1843		goto put_ctx;
   1844	}
   1845
   1846	handle->id = hdl_id;
   1847	handle->q_idx = q_idx;
   1848	handle->hdev = hdev;
   1849	kref_init(&handle->refcount);
   1850
   1851	hdev->asic_funcs->hw_queues_lock(hdev);
   1852
   1853	hw_sob = &prop->hw_sob[prop->curr_sob_offset];
   1854
   1855	/*
   1856	 * Increment the SOB value by count by user request
   1857	 * to reserve those signals
   1858	 * check if the signals amount to reserve is not exceeding the max sob
   1859	 * value, if yes then switch sob.
   1860	 */
   1861	rc = hl_cs_signal_sob_wraparound_handler(hdev, q_idx, &hw_sob, count,
   1862								true);
   1863	if (rc) {
   1864		dev_err(hdev->dev, "Failed to switch SOB\n");
   1865		hdev->asic_funcs->hw_queues_unlock(hdev);
   1866		rc = -EINVAL;
   1867		goto remove_idr;
   1868	}
   1869	/* set the hw_sob to the handle after calling the sob wraparound handler
   1870	 * since sob could have changed.
   1871	 */
   1872	handle->hw_sob = hw_sob;
   1873
   1874	/* store the current sob value for unreserve validity check, and
   1875	 * signal offset support
   1876	 */
   1877	handle->pre_sob_val = prop->next_sob_val - handle->count;
   1878
   1879	*signals_count = prop->next_sob_val;
   1880	hdev->asic_funcs->hw_queues_unlock(hdev);
   1881
   1882	*sob_addr = handle->hw_sob->sob_addr;
   1883	*handle_id = hdl_id;
   1884
   1885	dev_dbg(hdev->dev,
   1886		"Signals reserved, sob_id: %d, sob addr: 0x%x, last sob_val: %u, q_idx: %d, hdl_id: %d\n",
   1887			hw_sob->sob_id, handle->hw_sob->sob_addr,
   1888			prop->next_sob_val - 1, q_idx, hdl_id);
   1889	goto out;
   1890
   1891remove_idr:
   1892	spin_lock(&mgr->lock);
   1893	idr_remove(&mgr->handles, hdl_id);
   1894	spin_unlock(&mgr->lock);
   1895
   1896put_ctx:
   1897	hl_ctx_put(handle->ctx);
   1898	kfree(handle);
   1899
   1900out:
   1901	return rc;
   1902}
   1903
   1904static int cs_ioctl_unreserve_signals(struct hl_fpriv *hpriv, u32 handle_id)
   1905{
   1906	struct hl_cs_encaps_sig_handle *encaps_sig_hdl;
   1907	struct hl_sync_stream_properties *prop;
   1908	struct hl_device *hdev = hpriv->hdev;
   1909	struct hl_encaps_signals_mgr *mgr;
   1910	struct hl_hw_sob *hw_sob;
   1911	u32 q_idx, sob_addr;
   1912	int rc = 0;
   1913
   1914	mgr = &hpriv->ctx->sig_mgr;
   1915
   1916	spin_lock(&mgr->lock);
   1917	encaps_sig_hdl = idr_find(&mgr->handles, handle_id);
   1918	if (encaps_sig_hdl) {
   1919		dev_dbg(hdev->dev, "unreserve signals, handle: %u, SOB:0x%x, count: %u\n",
   1920				handle_id, encaps_sig_hdl->hw_sob->sob_addr,
   1921					encaps_sig_hdl->count);
   1922
   1923		hdev->asic_funcs->hw_queues_lock(hdev);
   1924
   1925		q_idx = encaps_sig_hdl->q_idx;
   1926		prop = &hdev->kernel_queues[q_idx].sync_stream_prop;
   1927		hw_sob = &prop->hw_sob[prop->curr_sob_offset];
   1928		sob_addr = hdev->asic_funcs->get_sob_addr(hdev, hw_sob->sob_id);
   1929
   1930		/* Check if sob_val got out of sync due to other
   1931		 * signal submission requests which were handled
   1932		 * between the reserve-unreserve calls or SOB switch
   1933		 * upon reaching SOB max value.
   1934		 */
   1935		if (encaps_sig_hdl->pre_sob_val + encaps_sig_hdl->count
   1936				!= prop->next_sob_val ||
   1937				sob_addr != encaps_sig_hdl->hw_sob->sob_addr) {
   1938			dev_err(hdev->dev, "Cannot unreserve signals, SOB val ran out of sync, expected: %u, actual val: %u\n",
   1939				encaps_sig_hdl->pre_sob_val,
   1940				(prop->next_sob_val - encaps_sig_hdl->count));
   1941
   1942			hdev->asic_funcs->hw_queues_unlock(hdev);
   1943			rc = -EINVAL;
   1944			goto out;
   1945		}
   1946
   1947		/*
   1948		 * Decrement the SOB value by count by user request
   1949		 * to unreserve those signals
   1950		 */
   1951		prop->next_sob_val -= encaps_sig_hdl->count;
   1952
   1953		hdev->asic_funcs->hw_queues_unlock(hdev);
   1954
   1955		hw_sob_put(hw_sob);
   1956
   1957		/* Release the id and free allocated memory of the handle */
   1958		idr_remove(&mgr->handles, handle_id);
   1959		hl_ctx_put(encaps_sig_hdl->ctx);
   1960		kfree(encaps_sig_hdl);
   1961	} else {
   1962		rc = -EINVAL;
   1963		dev_err(hdev->dev, "failed to unreserve signals, cannot find handler\n");
   1964	}
   1965out:
   1966	spin_unlock(&mgr->lock);
   1967
   1968	return rc;
   1969}
   1970
   1971static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
   1972				void __user *chunks, u32 num_chunks,
   1973				u64 *cs_seq, u32 flags, u32 timeout,
   1974				u32 *signal_sob_addr_offset, u16 *signal_initial_sob_count)
   1975{
   1976	struct hl_cs_encaps_sig_handle *encaps_sig_hdl = NULL;
   1977	bool handle_found = false, is_wait_cs = false,
   1978			wait_cs_submitted = false,
   1979			cs_encaps_signals = false;
   1980	struct hl_cs_chunk *cs_chunk_array, *chunk;
   1981	bool staged_cs_with_encaps_signals = false;
   1982	struct hw_queue_properties *hw_queue_prop;
   1983	struct hl_device *hdev = hpriv->hdev;
   1984	struct hl_cs_compl *sig_waitcs_cmpl;
   1985	u32 q_idx, collective_engine_id = 0;
   1986	struct hl_cs_counters_atomic *cntr;
   1987	struct hl_fence *sig_fence = NULL;
   1988	struct hl_ctx *ctx = hpriv->ctx;
   1989	enum hl_queue_type q_type;
   1990	struct hl_cs *cs;
   1991	u64 signal_seq;
   1992	int rc;
   1993
   1994	cntr = &hdev->aggregated_cs_counters;
   1995	*cs_seq = ULLONG_MAX;
   1996
   1997	rc = hl_cs_copy_chunk_array(hdev, &cs_chunk_array, chunks, num_chunks,
   1998			ctx);
   1999	if (rc)
   2000		goto out;
   2001
   2002	/* currently it is guaranteed to have only one chunk */
   2003	chunk = &cs_chunk_array[0];
   2004
   2005	if (chunk->queue_index >= hdev->asic_prop.max_queues) {
   2006		atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
   2007		atomic64_inc(&cntr->validation_drop_cnt);
   2008		dev_err(hdev->dev, "Queue index %d is invalid\n",
   2009			chunk->queue_index);
   2010		rc = -EINVAL;
   2011		goto free_cs_chunk_array;
   2012	}
   2013
   2014	q_idx = chunk->queue_index;
   2015	hw_queue_prop = &hdev->asic_prop.hw_queues_props[q_idx];
   2016	q_type = hw_queue_prop->type;
   2017
   2018	if (!hw_queue_prop->supports_sync_stream) {
   2019		atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
   2020		atomic64_inc(&cntr->validation_drop_cnt);
   2021		dev_err(hdev->dev,
   2022			"Queue index %d does not support sync stream operations\n",
   2023			q_idx);
   2024		rc = -EINVAL;
   2025		goto free_cs_chunk_array;
   2026	}
   2027
   2028	if (cs_type == CS_TYPE_COLLECTIVE_WAIT) {
   2029		if (!(hw_queue_prop->collective_mode == HL_COLLECTIVE_MASTER)) {
   2030			atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
   2031			atomic64_inc(&cntr->validation_drop_cnt);
   2032			dev_err(hdev->dev,
   2033				"Queue index %d is invalid\n", q_idx);
   2034			rc = -EINVAL;
   2035			goto free_cs_chunk_array;
   2036		}
   2037
   2038		if (!hdev->nic_ports_mask) {
   2039			atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
   2040			atomic64_inc(&cntr->validation_drop_cnt);
   2041			dev_err(hdev->dev,
   2042				"Collective operations not supported when NIC ports are disabled");
   2043			rc = -EINVAL;
   2044			goto free_cs_chunk_array;
   2045		}
   2046
   2047		collective_engine_id = chunk->collective_engine_id;
   2048	}
   2049
   2050	is_wait_cs = !!(cs_type == CS_TYPE_WAIT ||
   2051			cs_type == CS_TYPE_COLLECTIVE_WAIT);
   2052
   2053	cs_encaps_signals = !!(flags & HL_CS_FLAGS_ENCAP_SIGNALS);
   2054
   2055	if (is_wait_cs) {
   2056		rc = cs_ioctl_extract_signal_seq(hdev, chunk, &signal_seq,
   2057				ctx, cs_encaps_signals);
   2058		if (rc)
   2059			goto free_cs_chunk_array;
   2060
   2061		if (cs_encaps_signals) {
   2062			/* check if cs sequence has encapsulated
   2063			 * signals handle
   2064			 */
   2065			struct idr *idp;
   2066			u32 id;
   2067
   2068			spin_lock(&ctx->sig_mgr.lock);
   2069			idp = &ctx->sig_mgr.handles;
   2070			idr_for_each_entry(idp, encaps_sig_hdl, id) {
   2071				if (encaps_sig_hdl->cs_seq == signal_seq) {
   2072					/* get refcount to protect removing this handle from idr,
   2073					 * needed when multiple wait cs are used with offset
   2074					 * to wait on reserved encaps signals.
   2075					 * Since kref_put of this handle is executed outside the
   2076					 * current lock, it is possible that the handle refcount
   2077					 * is 0 but it yet to be removed from the list. In this
   2078					 * case need to consider the handle as not valid.
   2079					 */
   2080					if (kref_get_unless_zero(&encaps_sig_hdl->refcount))
   2081						handle_found = true;
   2082					break;
   2083				}
   2084			}
   2085			spin_unlock(&ctx->sig_mgr.lock);
   2086
   2087			if (!handle_found) {
   2088				/* treat as signal CS already finished */
   2089				dev_dbg(hdev->dev, "Cannot find encapsulated signals handle for seq 0x%llx\n",
   2090						signal_seq);
   2091				rc = 0;
   2092				goto free_cs_chunk_array;
   2093			}
   2094
   2095			/* validate also the signal offset value */
   2096			if (chunk->encaps_signal_offset >
   2097					encaps_sig_hdl->count) {
   2098				dev_err(hdev->dev, "offset(%u) value exceed max reserved signals count(%u)!\n",
   2099						chunk->encaps_signal_offset,
   2100						encaps_sig_hdl->count);
   2101				rc = -EINVAL;
   2102				goto free_cs_chunk_array;
   2103			}
   2104		}
   2105
   2106		sig_fence = hl_ctx_get_fence(ctx, signal_seq);
   2107		if (IS_ERR(sig_fence)) {
   2108			atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
   2109			atomic64_inc(&cntr->validation_drop_cnt);
   2110			dev_err(hdev->dev,
   2111				"Failed to get signal CS with seq 0x%llx\n",
   2112				signal_seq);
   2113			rc = PTR_ERR(sig_fence);
   2114			goto free_cs_chunk_array;
   2115		}
   2116
   2117		if (!sig_fence) {
   2118			/* signal CS already finished */
   2119			rc = 0;
   2120			goto free_cs_chunk_array;
   2121		}
   2122
   2123		sig_waitcs_cmpl =
   2124			container_of(sig_fence, struct hl_cs_compl, base_fence);
   2125
   2126		staged_cs_with_encaps_signals = !!
   2127				(sig_waitcs_cmpl->type == CS_TYPE_DEFAULT &&
   2128				(flags & HL_CS_FLAGS_ENCAP_SIGNALS));
   2129
   2130		if (sig_waitcs_cmpl->type != CS_TYPE_SIGNAL &&
   2131				!staged_cs_with_encaps_signals) {
   2132			atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
   2133			atomic64_inc(&cntr->validation_drop_cnt);
   2134			dev_err(hdev->dev,
   2135				"CS seq 0x%llx is not of a signal/encaps-signal CS\n",
   2136				signal_seq);
   2137			hl_fence_put(sig_fence);
   2138			rc = -EINVAL;
   2139			goto free_cs_chunk_array;
   2140		}
   2141
   2142		if (completion_done(&sig_fence->completion)) {
   2143			/* signal CS already finished */
   2144			hl_fence_put(sig_fence);
   2145			rc = 0;
   2146			goto free_cs_chunk_array;
   2147		}
   2148	}
   2149
   2150	rc = allocate_cs(hdev, ctx, cs_type, ULLONG_MAX, &cs, flags, timeout);
   2151	if (rc) {
   2152		if (is_wait_cs)
   2153			hl_fence_put(sig_fence);
   2154
   2155		goto free_cs_chunk_array;
   2156	}
   2157
   2158	/*
   2159	 * Save the signal CS fence for later initialization right before
   2160	 * hanging the wait CS on the queue.
   2161	 * for encaps signals case, we save the cs sequence and handle pointer
   2162	 * for later initialization.
   2163	 */
   2164	if (is_wait_cs) {
   2165		cs->signal_fence = sig_fence;
   2166		/* store the handle pointer, so we don't have to
   2167		 * look for it again, later on the flow
   2168		 * when we need to set SOB info in hw_queue.
   2169		 */
   2170		if (cs->encaps_signals)
   2171			cs->encaps_sig_hdl = encaps_sig_hdl;
   2172	}
   2173
   2174	hl_debugfs_add_cs(cs);
   2175
   2176	*cs_seq = cs->sequence;
   2177
   2178	if (cs_type == CS_TYPE_WAIT || cs_type == CS_TYPE_SIGNAL)
   2179		rc = cs_ioctl_signal_wait_create_jobs(hdev, ctx, cs, q_type,
   2180				q_idx, chunk->encaps_signal_offset);
   2181	else if (cs_type == CS_TYPE_COLLECTIVE_WAIT)
   2182		rc = hdev->asic_funcs->collective_wait_create_jobs(hdev, ctx,
   2183				cs, q_idx, collective_engine_id,
   2184				chunk->encaps_signal_offset);
   2185	else {
   2186		atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
   2187		atomic64_inc(&cntr->validation_drop_cnt);
   2188		rc = -EINVAL;
   2189	}
   2190
   2191	if (rc)
   2192		goto free_cs_object;
   2193
   2194	rc = hl_hw_queue_schedule_cs(cs);
   2195	if (rc) {
   2196		/* In case wait cs failed here, it means the signal cs
   2197		 * already completed. we want to free all it's related objects
   2198		 * but we don't want to fail the ioctl.
   2199		 */
   2200		if (is_wait_cs)
   2201			rc = 0;
   2202		else if (rc != -EAGAIN)
   2203			dev_err(hdev->dev,
   2204				"Failed to submit CS %d.%llu to H/W queues, error %d\n",
   2205				ctx->asid, cs->sequence, rc);
   2206		goto free_cs_object;
   2207	}
   2208
   2209	*signal_sob_addr_offset = cs->sob_addr_offset;
   2210	*signal_initial_sob_count = cs->initial_sob_count;
   2211
   2212	rc = HL_CS_STATUS_SUCCESS;
   2213	if (is_wait_cs)
   2214		wait_cs_submitted = true;
   2215	goto put_cs;
   2216
   2217free_cs_object:
   2218	cs_rollback(hdev, cs);
   2219	*cs_seq = ULLONG_MAX;
   2220	/* The path below is both for good and erroneous exits */
   2221put_cs:
   2222	/* We finished with the CS in this function, so put the ref */
   2223	cs_put(cs);
   2224free_cs_chunk_array:
   2225	if (!wait_cs_submitted && cs_encaps_signals && handle_found &&
   2226							is_wait_cs)
   2227		kref_put(&encaps_sig_hdl->refcount,
   2228				hl_encaps_handle_do_release);
   2229	kfree(cs_chunk_array);
   2230out:
   2231	return rc;
   2232}
   2233
   2234int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
   2235{
   2236	union hl_cs_args *args = data;
   2237	enum hl_cs_type cs_type = 0;
   2238	u64 cs_seq = ULONG_MAX;
   2239	void __user *chunks;
   2240	u32 num_chunks, flags, timeout,
   2241		signals_count = 0, sob_addr = 0, handle_id = 0;
   2242	u16 sob_initial_count = 0;
   2243	int rc;
   2244
   2245	rc = hl_cs_sanity_checks(hpriv, args);
   2246	if (rc)
   2247		goto out;
   2248
   2249	rc = hl_cs_ctx_switch(hpriv, args, &cs_seq);
   2250	if (rc)
   2251		goto out;
   2252
   2253	cs_type = hl_cs_get_cs_type(args->in.cs_flags &
   2254					~HL_CS_FLAGS_FORCE_RESTORE);
   2255	chunks = (void __user *) (uintptr_t) args->in.chunks_execute;
   2256	num_chunks = args->in.num_chunks_execute;
   2257	flags = args->in.cs_flags;
   2258
   2259	/* In case this is a staged CS, user should supply the CS sequence */
   2260	if ((flags & HL_CS_FLAGS_STAGED_SUBMISSION) &&
   2261			!(flags & HL_CS_FLAGS_STAGED_SUBMISSION_FIRST))
   2262		cs_seq = args->in.seq;
   2263
   2264	timeout = flags & HL_CS_FLAGS_CUSTOM_TIMEOUT
   2265			? msecs_to_jiffies(args->in.timeout * 1000)
   2266			: hpriv->hdev->timeout_jiffies;
   2267
   2268	switch (cs_type) {
   2269	case CS_TYPE_SIGNAL:
   2270	case CS_TYPE_WAIT:
   2271	case CS_TYPE_COLLECTIVE_WAIT:
   2272		rc = cs_ioctl_signal_wait(hpriv, cs_type, chunks, num_chunks,
   2273					&cs_seq, args->in.cs_flags, timeout,
   2274					&sob_addr, &sob_initial_count);
   2275		break;
   2276	case CS_RESERVE_SIGNALS:
   2277		rc = cs_ioctl_reserve_signals(hpriv,
   2278					args->in.encaps_signals_q_idx,
   2279					args->in.encaps_signals_count,
   2280					&handle_id, &sob_addr, &signals_count);
   2281		break;
   2282	case CS_UNRESERVE_SIGNALS:
   2283		rc = cs_ioctl_unreserve_signals(hpriv,
   2284					args->in.encaps_sig_handle_id);
   2285		break;
   2286	default:
   2287		rc = cs_ioctl_default(hpriv, chunks, num_chunks, &cs_seq,
   2288						args->in.cs_flags,
   2289						args->in.encaps_sig_handle_id,
   2290						timeout, &sob_initial_count);
   2291		break;
   2292	}
   2293out:
   2294	if (rc != -EAGAIN) {
   2295		memset(args, 0, sizeof(*args));
   2296
   2297		switch (cs_type) {
   2298		case CS_RESERVE_SIGNALS:
   2299			args->out.handle_id = handle_id;
   2300			args->out.sob_base_addr_offset = sob_addr;
   2301			args->out.count = signals_count;
   2302			break;
   2303		case CS_TYPE_SIGNAL:
   2304			args->out.sob_base_addr_offset = sob_addr;
   2305			args->out.sob_count_before_submission = sob_initial_count;
   2306			args->out.seq = cs_seq;
   2307			break;
   2308		case CS_TYPE_DEFAULT:
   2309			args->out.sob_count_before_submission = sob_initial_count;
   2310			args->out.seq = cs_seq;
   2311			break;
   2312		default:
   2313			args->out.seq = cs_seq;
   2314			break;
   2315		}
   2316
   2317		args->out.status = rc;
   2318	}
   2319
   2320	return rc;
   2321}
   2322
   2323static int hl_wait_for_fence(struct hl_ctx *ctx, u64 seq, struct hl_fence *fence,
   2324				enum hl_cs_wait_status *status, u64 timeout_us,
   2325				s64 *timestamp)
   2326{
   2327	struct hl_device *hdev = ctx->hdev;
   2328	long completion_rc;
   2329	int rc = 0;
   2330
   2331	if (IS_ERR(fence)) {
   2332		rc = PTR_ERR(fence);
   2333		if (rc == -EINVAL)
   2334			dev_notice_ratelimited(hdev->dev,
   2335				"Can't wait on CS %llu because current CS is at seq %llu\n",
   2336				seq, ctx->cs_sequence);
   2337		return rc;
   2338	}
   2339
   2340	if (!fence) {
   2341		dev_dbg(hdev->dev,
   2342			"Can't wait on seq %llu because current CS is at seq %llu (Fence is gone)\n",
   2343				seq, ctx->cs_sequence);
   2344
   2345		*status = CS_WAIT_STATUS_GONE;
   2346		return 0;
   2347	}
   2348
   2349	if (!timeout_us) {
   2350		completion_rc = completion_done(&fence->completion);
   2351	} else {
   2352		unsigned long timeout;
   2353
   2354		timeout = (timeout_us == MAX_SCHEDULE_TIMEOUT) ?
   2355				timeout_us : usecs_to_jiffies(timeout_us);
   2356		completion_rc =
   2357			wait_for_completion_interruptible_timeout(
   2358				&fence->completion, timeout);
   2359	}
   2360
   2361	if (completion_rc > 0) {
   2362		*status = CS_WAIT_STATUS_COMPLETED;
   2363		if (timestamp)
   2364			*timestamp = ktime_to_ns(fence->timestamp);
   2365	} else {
   2366		*status = CS_WAIT_STATUS_BUSY;
   2367	}
   2368
   2369	if (fence->error == -ETIMEDOUT)
   2370		rc = -ETIMEDOUT;
   2371	else if (fence->error == -EIO)
   2372		rc = -EIO;
   2373
   2374	return rc;
   2375}
   2376
   2377/*
   2378 * hl_cs_poll_fences - iterate CS fences to check for CS completion
   2379 *
   2380 * @mcs_data: multi-CS internal data
   2381 * @mcs_compl: multi-CS completion structure
   2382 *
   2383 * @return 0 on success, otherwise non 0 error code
   2384 *
   2385 * The function iterates on all CS sequence in the list and set bit in
   2386 * completion_bitmap for each completed CS.
   2387 * While iterating, the function sets the stream map of each fence in the fence
   2388 * array in the completion QID stream map to be used by CSs to perform
   2389 * completion to the multi-CS context.
   2390 * This function shall be called after taking context ref
   2391 */
   2392static int hl_cs_poll_fences(struct multi_cs_data *mcs_data, struct multi_cs_completion *mcs_compl)
   2393{
   2394	struct hl_fence **fence_ptr = mcs_data->fence_arr;
   2395	struct hl_device *hdev = mcs_data->ctx->hdev;
   2396	int i, rc, arr_len = mcs_data->arr_len;
   2397	u64 *seq_arr = mcs_data->seq_arr;
   2398	ktime_t max_ktime, first_cs_time;
   2399	enum hl_cs_wait_status status;
   2400
   2401	memset(fence_ptr, 0, arr_len * sizeof(*fence_ptr));
   2402
   2403	/* get all fences under the same lock */
   2404	rc = hl_ctx_get_fences(mcs_data->ctx, seq_arr, fence_ptr, arr_len);
   2405	if (rc)
   2406		return rc;
   2407
   2408	/*
   2409	 * re-initialize the completion here to handle 2 possible cases:
   2410	 * 1. CS will complete the multi-CS prior clearing the completion. in which
   2411	 *    case the fence iteration is guaranteed to catch the CS completion.
   2412	 * 2. the completion will occur after re-init of the completion.
   2413	 *    in which case we will wake up immediately in wait_for_completion.
   2414	 */
   2415	reinit_completion(&mcs_compl->completion);
   2416
   2417	/*
   2418	 * set to maximum time to verify timestamp is valid: if at the end
   2419	 * this value is maintained- no timestamp was updated
   2420	 */
   2421	max_ktime = ktime_set(KTIME_SEC_MAX, 0);
   2422	first_cs_time = max_ktime;
   2423
   2424	for (i = 0; i < arr_len; i++, fence_ptr++) {
   2425		struct hl_fence *fence = *fence_ptr;
   2426
   2427		/*
   2428		 * In order to prevent case where we wait until timeout even though a CS associated
   2429		 * with the multi-CS actually completed we do things in the below order:
   2430		 * 1. for each fence set it's QID map in the multi-CS completion QID map. This way
   2431		 *    any CS can, potentially, complete the multi CS for the specific QID (note
   2432		 *    that once completion is initialized, calling complete* and then wait on the
   2433		 *    completion will cause it to return at once)
   2434		 * 2. only after allowing multi-CS completion for the specific QID we check whether
   2435		 *    the specific CS already completed (and thus the wait for completion part will
   2436		 *    be skipped). if the CS not completed it is guaranteed that completing CS will
   2437		 *    wake up the completion.
   2438		 */
   2439		if (fence)
   2440			mcs_compl->stream_master_qid_map |= fence->stream_master_qid_map;
   2441
   2442		/*
   2443		 * function won't sleep as it is called with timeout 0 (i.e.
   2444		 * poll the fence)
   2445		 */
   2446		rc = hl_wait_for_fence(mcs_data->ctx, seq_arr[i], fence,
   2447						&status, 0, NULL);
   2448		if (rc) {
   2449			dev_err(hdev->dev,
   2450				"wait_for_fence error :%d for CS seq %llu\n",
   2451								rc, seq_arr[i]);
   2452			break;
   2453		}
   2454
   2455		switch (status) {
   2456		case CS_WAIT_STATUS_BUSY:
   2457			/* CS did not finished, QID to wait on already stored */
   2458			break;
   2459		case CS_WAIT_STATUS_COMPLETED:
   2460			/*
   2461			 * Using mcs_handling_done to avoid possibility of mcs_data
   2462			 * returns to user indicating CS completed before it finished
   2463			 * all of its mcs handling, to avoid race the next time the
   2464			 * user waits for mcs.
   2465			 * note: when reaching this case fence is definitely not NULL
   2466			 *       but NULL check was added to overcome static analysis
   2467			 */
   2468			if (fence && !fence->mcs_handling_done) {
   2469				/*
   2470				 * in case multi CS is completed but MCS handling not done
   2471				 * we "complete" the multi CS to prevent it from waiting
   2472				 * until time-out and the "multi-CS handling done" will have
   2473				 * another chance at the next iteration
   2474				 */
   2475				complete_all(&mcs_compl->completion);
   2476				break;
   2477			}
   2478
   2479			mcs_data->completion_bitmap |= BIT(i);
   2480			/*
   2481			 * For all completed CSs we take the earliest timestamp.
   2482			 * For this we have to validate that the timestamp is
   2483			 * earliest of all timestamps so far.
   2484			 */
   2485			if (mcs_data->update_ts &&
   2486					(ktime_compare(fence->timestamp, first_cs_time) < 0))
   2487				first_cs_time = fence->timestamp;
   2488			break;
   2489		case CS_WAIT_STATUS_GONE:
   2490			mcs_data->update_ts = false;
   2491			mcs_data->gone_cs = true;
   2492			/*
   2493			 * It is possible to get an old sequence numbers from user
   2494			 * which related to already completed CSs and their fences
   2495			 * already gone. In this case, CS set as completed but
   2496			 * no need to consider its QID for mcs completion.
   2497			 */
   2498			mcs_data->completion_bitmap |= BIT(i);
   2499			break;
   2500		default:
   2501			dev_err(hdev->dev, "Invalid fence status\n");
   2502			return -EINVAL;
   2503		}
   2504
   2505	}
   2506
   2507	hl_fences_put(mcs_data->fence_arr, arr_len);
   2508
   2509	if (mcs_data->update_ts &&
   2510			(ktime_compare(first_cs_time, max_ktime) != 0))
   2511		mcs_data->timestamp = ktime_to_ns(first_cs_time);
   2512
   2513	return rc;
   2514}
   2515
   2516static int _hl_cs_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
   2517				u64 timeout_us, u64 seq,
   2518				enum hl_cs_wait_status *status, s64 *timestamp)
   2519{
   2520	struct hl_fence *fence;
   2521	int rc = 0;
   2522
   2523	if (timestamp)
   2524		*timestamp = 0;
   2525
   2526	hl_ctx_get(ctx);
   2527
   2528	fence = hl_ctx_get_fence(ctx, seq);
   2529
   2530	rc = hl_wait_for_fence(ctx, seq, fence, status, timeout_us, timestamp);
   2531	hl_fence_put(fence);
   2532	hl_ctx_put(ctx);
   2533
   2534	return rc;
   2535}
   2536
   2537static inline unsigned long hl_usecs64_to_jiffies(const u64 usecs)
   2538{
   2539	if (usecs <= U32_MAX)
   2540		return usecs_to_jiffies(usecs);
   2541
   2542	/*
   2543	 * If the value in nanoseconds is larger than 64 bit, use the largest
   2544	 * 64 bit value.
   2545	 */
   2546	if (usecs >= ((u64)(U64_MAX / NSEC_PER_USEC)))
   2547		return nsecs_to_jiffies(U64_MAX);
   2548
   2549	return nsecs_to_jiffies(usecs * NSEC_PER_USEC);
   2550}
   2551
   2552/*
   2553 * hl_wait_multi_cs_completion_init - init completion structure
   2554 *
   2555 * @hdev: pointer to habanalabs device structure
   2556 * @stream_master_bitmap: stream master QIDs map, set bit indicates stream
   2557 *                        master QID to wait on
   2558 *
   2559 * @return valid completion struct pointer on success, otherwise error pointer
   2560 *
   2561 * up to MULTI_CS_MAX_USER_CTX calls can be done concurrently to the driver.
   2562 * the function gets the first available completion (by marking it "used")
   2563 * and initialize its values.
   2564 */
   2565static struct multi_cs_completion *hl_wait_multi_cs_completion_init(struct hl_device *hdev)
   2566{
   2567	struct multi_cs_completion *mcs_compl;
   2568	int i;
   2569
   2570	/* find free multi_cs completion structure */
   2571	for (i = 0; i < MULTI_CS_MAX_USER_CTX; i++) {
   2572		mcs_compl = &hdev->multi_cs_completion[i];
   2573		spin_lock(&mcs_compl->lock);
   2574		if (!mcs_compl->used) {
   2575			mcs_compl->used = 1;
   2576			mcs_compl->timestamp = 0;
   2577			/*
   2578			 * init QID map to 0 to avoid completion by CSs. the actual QID map
   2579			 * to multi-CS CSs will be set incrementally at a later stage
   2580			 */
   2581			mcs_compl->stream_master_qid_map = 0;
   2582			spin_unlock(&mcs_compl->lock);
   2583			break;
   2584		}
   2585		spin_unlock(&mcs_compl->lock);
   2586	}
   2587
   2588	if (i == MULTI_CS_MAX_USER_CTX) {
   2589		dev_err(hdev->dev, "no available multi-CS completion structure\n");
   2590		return ERR_PTR(-ENOMEM);
   2591	}
   2592	return mcs_compl;
   2593}
   2594
   2595/*
   2596 * hl_wait_multi_cs_completion_fini - return completion structure and set as
   2597 *                                    unused
   2598 *
   2599 * @mcs_compl: pointer to the completion structure
   2600 */
   2601static void hl_wait_multi_cs_completion_fini(
   2602					struct multi_cs_completion *mcs_compl)
   2603{
   2604	/*
   2605	 * free completion structure, do it under lock to be in-sync with the
   2606	 * thread that signals completion
   2607	 */
   2608	spin_lock(&mcs_compl->lock);
   2609	mcs_compl->used = 0;
   2610	spin_unlock(&mcs_compl->lock);
   2611}
   2612
   2613/*
   2614 * hl_wait_multi_cs_completion - wait for first CS to complete
   2615 *
   2616 * @mcs_data: multi-CS internal data
   2617 *
   2618 * @return 0 on success, otherwise non 0 error code
   2619 */
   2620static int hl_wait_multi_cs_completion(struct multi_cs_data *mcs_data,
   2621						struct multi_cs_completion *mcs_compl)
   2622{
   2623	long completion_rc;
   2624
   2625	completion_rc = wait_for_completion_interruptible_timeout(&mcs_compl->completion,
   2626									mcs_data->timeout_jiffies);
   2627
   2628	/* update timestamp */
   2629	if (completion_rc > 0)
   2630		mcs_data->timestamp = mcs_compl->timestamp;
   2631
   2632	mcs_data->wait_status = completion_rc;
   2633
   2634	return 0;
   2635}
   2636
   2637/*
   2638 * hl_multi_cs_completion_init - init array of multi-CS completion structures
   2639 *
   2640 * @hdev: pointer to habanalabs device structure
   2641 */
   2642void hl_multi_cs_completion_init(struct hl_device *hdev)
   2643{
   2644	struct multi_cs_completion *mcs_cmpl;
   2645	int i;
   2646
   2647	for (i = 0; i < MULTI_CS_MAX_USER_CTX; i++) {
   2648		mcs_cmpl = &hdev->multi_cs_completion[i];
   2649		mcs_cmpl->used = 0;
   2650		spin_lock_init(&mcs_cmpl->lock);
   2651		init_completion(&mcs_cmpl->completion);
   2652	}
   2653}
   2654
   2655/*
   2656 * hl_multi_cs_wait_ioctl - implementation of the multi-CS wait ioctl
   2657 *
   2658 * @hpriv: pointer to the private data of the fd
   2659 * @data: pointer to multi-CS wait ioctl in/out args
   2660 *
   2661 */
   2662static int hl_multi_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data)
   2663{
   2664	struct multi_cs_completion *mcs_compl;
   2665	struct hl_device *hdev = hpriv->hdev;
   2666	struct multi_cs_data mcs_data = {};
   2667	union hl_wait_cs_args *args = data;
   2668	struct hl_ctx *ctx = hpriv->ctx;
   2669	struct hl_fence **fence_arr;
   2670	void __user *seq_arr;
   2671	u32 size_to_copy;
   2672	u64 *cs_seq_arr;
   2673	u8 seq_arr_len;
   2674	int rc;
   2675
   2676	if (!hdev->supports_wait_for_multi_cs) {
   2677		dev_err(hdev->dev, "Wait for multi CS is not supported\n");
   2678		return -EPERM;
   2679	}
   2680
   2681	seq_arr_len = args->in.seq_arr_len;
   2682
   2683	if (seq_arr_len > HL_WAIT_MULTI_CS_LIST_MAX_LEN) {
   2684		dev_err(hdev->dev, "Can wait only up to %d CSs, input sequence is of length %u\n",
   2685				HL_WAIT_MULTI_CS_LIST_MAX_LEN, seq_arr_len);
   2686		return -EINVAL;
   2687	}
   2688
   2689	/* allocate memory for sequence array */
   2690	cs_seq_arr =
   2691		kmalloc_array(seq_arr_len, sizeof(*cs_seq_arr), GFP_KERNEL);
   2692	if (!cs_seq_arr)
   2693		return -ENOMEM;
   2694
   2695	/* copy CS sequence array from user */
   2696	seq_arr = (void __user *) (uintptr_t) args->in.seq;
   2697	size_to_copy = seq_arr_len * sizeof(*cs_seq_arr);
   2698	if (copy_from_user(cs_seq_arr, seq_arr, size_to_copy)) {
   2699		dev_err(hdev->dev, "Failed to copy multi-cs sequence array from user\n");
   2700		rc = -EFAULT;
   2701		goto free_seq_arr;
   2702	}
   2703
   2704	/* allocate array for the fences */
   2705	fence_arr = kmalloc_array(seq_arr_len, sizeof(*fence_arr), GFP_KERNEL);
   2706	if (!fence_arr) {
   2707		rc = -ENOMEM;
   2708		goto free_seq_arr;
   2709	}
   2710
   2711	/* initialize the multi-CS internal data */
   2712	mcs_data.ctx = ctx;
   2713	mcs_data.seq_arr = cs_seq_arr;
   2714	mcs_data.fence_arr = fence_arr;
   2715	mcs_data.arr_len = seq_arr_len;
   2716
   2717	hl_ctx_get(ctx);
   2718
   2719	/* wait (with timeout) for the first CS to be completed */
   2720	mcs_data.timeout_jiffies = hl_usecs64_to_jiffies(args->in.timeout_us);
   2721	mcs_compl = hl_wait_multi_cs_completion_init(hdev);
   2722	if (IS_ERR(mcs_compl)) {
   2723		rc = PTR_ERR(mcs_compl);
   2724		goto put_ctx;
   2725	}
   2726
   2727	/* poll all CS fences, extract timestamp */
   2728	mcs_data.update_ts = true;
   2729	rc = hl_cs_poll_fences(&mcs_data, mcs_compl);
   2730	/*
   2731	 * skip wait for CS completion when one of the below is true:
   2732	 * - an error on the poll function
   2733	 * - one or more CS in the list completed
   2734	 * - the user called ioctl with timeout 0
   2735	 */
   2736	if (rc || mcs_data.completion_bitmap || !args->in.timeout_us)
   2737		goto completion_fini;
   2738
   2739	while (true) {
   2740		rc = hl_wait_multi_cs_completion(&mcs_data, mcs_compl);
   2741		if (rc || (mcs_data.wait_status == 0))
   2742			break;
   2743
   2744		/*
   2745		 * poll fences once again to update the CS map.
   2746		 * no timestamp should be updated this time.
   2747		 */
   2748		mcs_data.update_ts = false;
   2749		rc = hl_cs_poll_fences(&mcs_data, mcs_compl);
   2750
   2751		if (rc || mcs_data.completion_bitmap)
   2752			break;
   2753
   2754		/*
   2755		 * if hl_wait_multi_cs_completion returned before timeout (i.e.
   2756		 * it got a completion) it either got completed by CS in the multi CS list
   2757		 * (in which case the indication will be non empty completion_bitmap) or it
   2758		 * got completed by CS submitted to one of the shared stream master but
   2759		 * not in the multi CS list (in which case we should wait again but modify
   2760		 * the timeout and set timestamp as zero to let a CS related to the current
   2761		 * multi-CS set a new, relevant, timestamp)
   2762		 */
   2763		mcs_data.timeout_jiffies = mcs_data.wait_status;
   2764		mcs_compl->timestamp = 0;
   2765	}
   2766
   2767completion_fini:
   2768	hl_wait_multi_cs_completion_fini(mcs_compl);
   2769
   2770put_ctx:
   2771	hl_ctx_put(ctx);
   2772	kfree(fence_arr);
   2773
   2774free_seq_arr:
   2775	kfree(cs_seq_arr);
   2776
   2777	if (rc)
   2778		return rc;
   2779
   2780	if (mcs_data.wait_status == -ERESTARTSYS) {
   2781		dev_err_ratelimited(hdev->dev,
   2782				"user process got signal while waiting for Multi-CS\n");
   2783		return -EINTR;
   2784	}
   2785
   2786	/* update output args */
   2787	memset(args, 0, sizeof(*args));
   2788
   2789	if (mcs_data.completion_bitmap) {
   2790		args->out.status = HL_WAIT_CS_STATUS_COMPLETED;
   2791		args->out.cs_completion_map = mcs_data.completion_bitmap;
   2792
   2793		/* if timestamp not 0- it's valid */
   2794		if (mcs_data.timestamp) {
   2795			args->out.timestamp_nsec = mcs_data.timestamp;
   2796			args->out.flags |= HL_WAIT_CS_STATUS_FLAG_TIMESTAMP_VLD;
   2797		}
   2798
   2799		/* update if some CS was gone */
   2800		if (!mcs_data.timestamp)
   2801			args->out.flags |= HL_WAIT_CS_STATUS_FLAG_GONE;
   2802	} else {
   2803		args->out.status = HL_WAIT_CS_STATUS_BUSY;
   2804	}
   2805
   2806	return 0;
   2807}
   2808
   2809static int hl_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data)
   2810{
   2811	struct hl_device *hdev = hpriv->hdev;
   2812	union hl_wait_cs_args *args = data;
   2813	enum hl_cs_wait_status status;
   2814	u64 seq = args->in.seq;
   2815	s64 timestamp;
   2816	int rc;
   2817
   2818	rc = _hl_cs_wait_ioctl(hdev, hpriv->ctx, args->in.timeout_us, seq,
   2819				&status, &timestamp);
   2820
   2821	if (rc == -ERESTARTSYS) {
   2822		dev_err_ratelimited(hdev->dev,
   2823			"user process got signal while waiting for CS handle %llu\n",
   2824			seq);
   2825		return -EINTR;
   2826	}
   2827
   2828	memset(args, 0, sizeof(*args));
   2829
   2830	if (rc) {
   2831		if (rc == -ETIMEDOUT) {
   2832			dev_err_ratelimited(hdev->dev,
   2833				"CS %llu has timed-out while user process is waiting for it\n",
   2834				seq);
   2835			args->out.status = HL_WAIT_CS_STATUS_TIMEDOUT;
   2836		} else if (rc == -EIO) {
   2837			dev_err_ratelimited(hdev->dev,
   2838				"CS %llu has been aborted while user process is waiting for it\n",
   2839				seq);
   2840			args->out.status = HL_WAIT_CS_STATUS_ABORTED;
   2841		}
   2842		return rc;
   2843	}
   2844
   2845	if (timestamp) {
   2846		args->out.flags |= HL_WAIT_CS_STATUS_FLAG_TIMESTAMP_VLD;
   2847		args->out.timestamp_nsec = timestamp;
   2848	}
   2849
   2850	switch (status) {
   2851	case CS_WAIT_STATUS_GONE:
   2852		args->out.flags |= HL_WAIT_CS_STATUS_FLAG_GONE;
   2853		fallthrough;
   2854	case CS_WAIT_STATUS_COMPLETED:
   2855		args->out.status = HL_WAIT_CS_STATUS_COMPLETED;
   2856		break;
   2857	case CS_WAIT_STATUS_BUSY:
   2858	default:
   2859		args->out.status = HL_WAIT_CS_STATUS_BUSY;
   2860		break;
   2861	}
   2862
   2863	return 0;
   2864}
   2865
   2866static int ts_buff_get_kernel_ts_record(struct hl_mmap_mem_buf *buf,
   2867					struct hl_cb *cq_cb,
   2868					u64 ts_offset, u64 cq_offset, u64 target_value,
   2869					spinlock_t *wait_list_lock,
   2870					struct hl_user_pending_interrupt **pend)
   2871{
   2872	struct hl_ts_buff *ts_buff = buf->private;
   2873	struct hl_user_pending_interrupt *requested_offset_record =
   2874				(struct hl_user_pending_interrupt *)ts_buff->kernel_buff_address +
   2875				ts_offset;
   2876	struct hl_user_pending_interrupt *cb_last =
   2877			(struct hl_user_pending_interrupt *)ts_buff->kernel_buff_address +
   2878			(ts_buff->kernel_buff_size / sizeof(struct hl_user_pending_interrupt));
   2879	unsigned long flags, iter_counter = 0;
   2880	u64 current_cq_counter;
   2881
   2882	/* Validate ts_offset not exceeding last max */
   2883	if (requested_offset_record > cb_last) {
   2884		dev_err(buf->mmg->dev, "Ts offset exceeds max CB offset(0x%llx)\n",
   2885								(u64)(uintptr_t)cb_last);
   2886		return -EINVAL;
   2887	}
   2888
   2889start_over:
   2890	spin_lock_irqsave(wait_list_lock, flags);
   2891
   2892	/* Unregister only if we didn't reach the target value
   2893	 * since in this case there will be no handling in irq context
   2894	 * and then it's safe to delete the node out of the interrupt list
   2895	 * then re-use it on other interrupt
   2896	 */
   2897	if (requested_offset_record->ts_reg_info.in_use) {
   2898		current_cq_counter = *requested_offset_record->cq_kernel_addr;
   2899		if (current_cq_counter < requested_offset_record->cq_target_value) {
   2900			list_del(&requested_offset_record->wait_list_node);
   2901			spin_unlock_irqrestore(wait_list_lock, flags);
   2902
   2903			hl_mmap_mem_buf_put(requested_offset_record->ts_reg_info.buf);
   2904			hl_cb_put(requested_offset_record->ts_reg_info.cq_cb);
   2905
   2906			dev_dbg(buf->mmg->dev,
   2907				"ts node removed from interrupt list now can re-use\n");
   2908		} else {
   2909			dev_dbg(buf->mmg->dev,
   2910				"ts node in middle of irq handling\n");
   2911
   2912			/* irq handling in the middle give it time to finish */
   2913			spin_unlock_irqrestore(wait_list_lock, flags);
   2914			usleep_range(1, 10);
   2915			if (++iter_counter == MAX_TS_ITER_NUM) {
   2916				dev_err(buf->mmg->dev,
   2917					"handling registration interrupt took too long!!\n");
   2918				return -EINVAL;
   2919			}
   2920
   2921			goto start_over;
   2922		}
   2923	} else {
   2924		spin_unlock_irqrestore(wait_list_lock, flags);
   2925	}
   2926
   2927	/* Fill up the new registration node info */
   2928	requested_offset_record->ts_reg_info.in_use = 1;
   2929	requested_offset_record->ts_reg_info.buf = buf;
   2930	requested_offset_record->ts_reg_info.cq_cb = cq_cb;
   2931	requested_offset_record->ts_reg_info.timestamp_kernel_addr =
   2932			(u64 *) ts_buff->user_buff_address + ts_offset;
   2933	requested_offset_record->cq_kernel_addr =
   2934			(u64 *) cq_cb->kernel_address + cq_offset;
   2935	requested_offset_record->cq_target_value = target_value;
   2936
   2937	*pend = requested_offset_record;
   2938
   2939	dev_dbg(buf->mmg->dev, "Found available node in TS kernel CB(0x%llx)\n",
   2940						(u64)(uintptr_t)requested_offset_record);
   2941	return 0;
   2942}
   2943
   2944static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
   2945				struct hl_mem_mgr *cb_mmg, struct hl_mem_mgr *mmg,
   2946				u64 timeout_us, u64 cq_counters_handle,	u64 cq_counters_offset,
   2947				u64 target_value, struct hl_user_interrupt *interrupt,
   2948				bool register_ts_record, u64 ts_handle, u64 ts_offset,
   2949				u32 *status, u64 *timestamp)
   2950{
   2951	struct hl_user_pending_interrupt *pend;
   2952	struct hl_mmap_mem_buf *buf;
   2953	struct hl_cb *cq_cb;
   2954	unsigned long timeout, flags;
   2955	long completion_rc;
   2956	int rc = 0;
   2957
   2958	timeout = hl_usecs64_to_jiffies(timeout_us);
   2959
   2960	hl_ctx_get(ctx);
   2961
   2962	cq_cb = hl_cb_get(cb_mmg, cq_counters_handle);
   2963	if (!cq_cb) {
   2964		rc = -EINVAL;
   2965		goto put_ctx;
   2966	}
   2967
   2968	if (register_ts_record) {
   2969		dev_dbg(hdev->dev, "Timestamp registration: interrupt id: %u, ts offset: %llu, cq_offset: %llu\n",
   2970					interrupt->interrupt_id, ts_offset, cq_counters_offset);
   2971		buf = hl_mmap_mem_buf_get(mmg, ts_handle);
   2972		if (!buf) {
   2973			rc = -EINVAL;
   2974			goto put_cq_cb;
   2975		}
   2976
   2977		/* Find first available record */
   2978		rc = ts_buff_get_kernel_ts_record(buf, cq_cb, ts_offset,
   2979						cq_counters_offset, target_value,
   2980						&interrupt->wait_list_lock, &pend);
   2981		if (rc)
   2982			goto put_ts_buff;
   2983	} else {
   2984		pend = kzalloc(sizeof(*pend), GFP_KERNEL);
   2985		if (!pend) {
   2986			rc = -ENOMEM;
   2987			goto put_cq_cb;
   2988		}
   2989		hl_fence_init(&pend->fence, ULONG_MAX);
   2990		pend->cq_kernel_addr = (u64 *) cq_cb->kernel_address + cq_counters_offset;
   2991		pend->cq_target_value = target_value;
   2992	}
   2993
   2994	spin_lock_irqsave(&interrupt->wait_list_lock, flags);
   2995
   2996	/* We check for completion value as interrupt could have been received
   2997	 * before we added the node to the wait list
   2998	 */
   2999	if (*pend->cq_kernel_addr >= target_value) {
   3000		if (register_ts_record)
   3001			pend->ts_reg_info.in_use = 0;
   3002		spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);
   3003
   3004		*status = HL_WAIT_CS_STATUS_COMPLETED;
   3005
   3006		if (register_ts_record) {
   3007			*pend->ts_reg_info.timestamp_kernel_addr = ktime_get_ns();
   3008			goto put_ts_buff;
   3009		} else {
   3010			pend->fence.timestamp = ktime_get();
   3011			goto set_timestamp;
   3012		}
   3013	} else if (!timeout_us) {
   3014		spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);
   3015		*status = HL_WAIT_CS_STATUS_BUSY;
   3016		pend->fence.timestamp = ktime_get();
   3017		goto set_timestamp;
   3018	}
   3019
   3020	/* Add pending user interrupt to relevant list for the interrupt
   3021	 * handler to monitor.
   3022	 * Note that we cannot have sorted list by target value,
   3023	 * in order to shorten the list pass loop, since
   3024	 * same list could have nodes for different cq counter handle.
   3025	 */
   3026	list_add_tail(&pend->wait_list_node, &interrupt->wait_list_head);
   3027	spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);
   3028
   3029	if (register_ts_record) {
   3030		rc = *status = HL_WAIT_CS_STATUS_COMPLETED;
   3031		goto ts_registration_exit;
   3032	}
   3033
   3034	/* Wait for interrupt handler to signal completion */
   3035	completion_rc = wait_for_completion_interruptible_timeout(&pend->fence.completion,
   3036								timeout);
   3037	if (completion_rc > 0) {
   3038		*status = HL_WAIT_CS_STATUS_COMPLETED;
   3039	} else {
   3040		if (completion_rc == -ERESTARTSYS) {
   3041			dev_err_ratelimited(hdev->dev,
   3042					"user process got signal while waiting for interrupt ID %d\n",
   3043					interrupt->interrupt_id);
   3044			rc = -EINTR;
   3045			*status = HL_WAIT_CS_STATUS_ABORTED;
   3046		} else {
   3047			if (pend->fence.error == -EIO) {
   3048				dev_err_ratelimited(hdev->dev,
   3049						"interrupt based wait ioctl aborted(error:%d) due to a reset cycle initiated\n",
   3050						pend->fence.error);
   3051				rc = -EIO;
   3052				*status = HL_WAIT_CS_STATUS_ABORTED;
   3053			} else {
   3054				/* The wait has timed-out. We don't know anything beyond that
   3055				 * because the workload wasn't submitted through the driver.
   3056				 * Therefore, from driver's perspective, the workload is still
   3057				 * executing.
   3058				 */
   3059				rc = 0;
   3060				*status = HL_WAIT_CS_STATUS_BUSY;
   3061			}
   3062		}
   3063	}
   3064
   3065	/*
   3066	 * We keep removing the node from list here, and not at the irq handler
   3067	 * for completion timeout case. and if it's a registration
   3068	 * for ts record, the node will be deleted in the irq handler after
   3069	 * we reach the target value.
   3070	 */
   3071	spin_lock_irqsave(&interrupt->wait_list_lock, flags);
   3072	list_del(&pend->wait_list_node);
   3073	spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);
   3074
   3075set_timestamp:
   3076	*timestamp = ktime_to_ns(pend->fence.timestamp);
   3077	kfree(pend);
   3078	hl_cb_put(cq_cb);
   3079ts_registration_exit:
   3080	hl_ctx_put(ctx);
   3081
   3082	return rc;
   3083
   3084put_ts_buff:
   3085	hl_mmap_mem_buf_put(buf);
   3086put_cq_cb:
   3087	hl_cb_put(cq_cb);
   3088put_ctx:
   3089	hl_ctx_put(ctx);
   3090
   3091	return rc;
   3092}
   3093
   3094static int _hl_interrupt_wait_ioctl_user_addr(struct hl_device *hdev, struct hl_ctx *ctx,
   3095				u64 timeout_us, u64 user_address,
   3096				u64 target_value, struct hl_user_interrupt *interrupt,
   3097
   3098				u32 *status,
   3099				u64 *timestamp)
   3100{
   3101	struct hl_user_pending_interrupt *pend;
   3102	unsigned long timeout, flags;
   3103	u64 completion_value;
   3104	long completion_rc;
   3105	int rc = 0;
   3106
   3107	timeout = hl_usecs64_to_jiffies(timeout_us);
   3108
   3109	hl_ctx_get(ctx);
   3110
   3111	pend = kzalloc(sizeof(*pend), GFP_KERNEL);
   3112	if (!pend) {
   3113		hl_ctx_put(ctx);
   3114		return -ENOMEM;
   3115	}
   3116
   3117	hl_fence_init(&pend->fence, ULONG_MAX);
   3118
   3119	/* Add pending user interrupt to relevant list for the interrupt
   3120	 * handler to monitor
   3121	 */
   3122	spin_lock_irqsave(&interrupt->wait_list_lock, flags);
   3123	list_add_tail(&pend->wait_list_node, &interrupt->wait_list_head);
   3124	spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);
   3125
   3126	/* We check for completion value as interrupt could have been received
   3127	 * before we added the node to the wait list
   3128	 */
   3129	if (copy_from_user(&completion_value, u64_to_user_ptr(user_address), 8)) {
   3130		dev_err(hdev->dev, "Failed to copy completion value from user\n");
   3131		rc = -EFAULT;
   3132		goto remove_pending_user_interrupt;
   3133	}
   3134
   3135	if (completion_value >= target_value) {
   3136		*status = HL_WAIT_CS_STATUS_COMPLETED;
   3137		/* There was no interrupt, we assume the completion is now. */
   3138		pend->fence.timestamp = ktime_get();
   3139	} else {
   3140		*status = HL_WAIT_CS_STATUS_BUSY;
   3141	}
   3142
   3143	if (!timeout_us || (*status == HL_WAIT_CS_STATUS_COMPLETED))
   3144		goto remove_pending_user_interrupt;
   3145
   3146wait_again:
   3147	/* Wait for interrupt handler to signal completion */
   3148	completion_rc = wait_for_completion_interruptible_timeout(&pend->fence.completion,
   3149										timeout);
   3150
   3151	/* If timeout did not expire we need to perform the comparison.
   3152	 * If comparison fails, keep waiting until timeout expires
   3153	 */
   3154	if (completion_rc > 0) {
   3155		spin_lock_irqsave(&interrupt->wait_list_lock, flags);
   3156		/* reinit_completion must be called before we check for user
   3157		 * completion value, otherwise, if interrupt is received after
   3158		 * the comparison and before the next wait_for_completion,
   3159		 * we will reach timeout and fail
   3160		 */
   3161		reinit_completion(&pend->fence.completion);
   3162		spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);
   3163
   3164		if (copy_from_user(&completion_value, u64_to_user_ptr(user_address), 8)) {
   3165			dev_err(hdev->dev, "Failed to copy completion value from user\n");
   3166			rc = -EFAULT;
   3167
   3168			goto remove_pending_user_interrupt;
   3169		}
   3170
   3171		if (completion_value >= target_value) {
   3172			*status = HL_WAIT_CS_STATUS_COMPLETED;
   3173		} else if (pend->fence.error) {
   3174			dev_err_ratelimited(hdev->dev,
   3175				"interrupt based wait ioctl aborted(error:%d) due to a reset cycle initiated\n",
   3176				pend->fence.error);
   3177			/* set the command completion status as ABORTED */
   3178			*status = HL_WAIT_CS_STATUS_ABORTED;
   3179		} else {
   3180			timeout = completion_rc;
   3181			goto wait_again;
   3182		}
   3183	} else if (completion_rc == -ERESTARTSYS) {
   3184		dev_err_ratelimited(hdev->dev,
   3185			"user process got signal while waiting for interrupt ID %d\n",
   3186			interrupt->interrupt_id);
   3187		rc = -EINTR;
   3188	} else {
   3189		/* The wait has timed-out. We don't know anything beyond that
   3190		 * because the workload wasn't submitted through the driver.
   3191		 * Therefore, from driver's perspective, the workload is still
   3192		 * executing.
   3193		 */
   3194		rc = 0;
   3195		*status = HL_WAIT_CS_STATUS_BUSY;
   3196	}
   3197
   3198remove_pending_user_interrupt:
   3199	spin_lock_irqsave(&interrupt->wait_list_lock, flags);
   3200	list_del(&pend->wait_list_node);
   3201	spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);
   3202
   3203	*timestamp = ktime_to_ns(pend->fence.timestamp);
   3204
   3205	kfree(pend);
   3206	hl_ctx_put(ctx);
   3207
   3208	return rc;
   3209}
   3210
   3211static int hl_interrupt_wait_ioctl(struct hl_fpriv *hpriv, void *data)
   3212{
   3213	u16 interrupt_id, first_interrupt, last_interrupt;
   3214	struct hl_device *hdev = hpriv->hdev;
   3215	struct asic_fixed_properties *prop;
   3216	struct hl_user_interrupt *interrupt;
   3217	union hl_wait_cs_args *args = data;
   3218	u32 status = HL_WAIT_CS_STATUS_BUSY;
   3219	u64 timestamp;
   3220	int rc;
   3221
   3222	prop = &hdev->asic_prop;
   3223
   3224	if (!prop->user_interrupt_count) {
   3225		dev_err(hdev->dev, "no user interrupts allowed");
   3226		return -EPERM;
   3227	}
   3228
   3229	interrupt_id = FIELD_GET(HL_WAIT_CS_FLAGS_INTERRUPT_MASK, args->in.flags);
   3230
   3231	first_interrupt = prop->first_available_user_msix_interrupt;
   3232	last_interrupt = prop->first_available_user_msix_interrupt +
   3233						prop->user_interrupt_count - 1;
   3234
   3235	if ((interrupt_id < first_interrupt || interrupt_id > last_interrupt) &&
   3236			interrupt_id != HL_COMMON_USER_INTERRUPT_ID) {
   3237		dev_err(hdev->dev, "invalid user interrupt %u", interrupt_id);
   3238		return -EINVAL;
   3239	}
   3240
   3241	if (interrupt_id == HL_COMMON_USER_INTERRUPT_ID)
   3242		interrupt = &hdev->common_user_interrupt;
   3243	else
   3244		interrupt = &hdev->user_interrupt[interrupt_id - first_interrupt];
   3245
   3246	if (args->in.flags & HL_WAIT_CS_FLAGS_INTERRUPT_KERNEL_CQ)
   3247		rc = _hl_interrupt_wait_ioctl(hdev, hpriv->ctx, &hpriv->mem_mgr, &hpriv->mem_mgr,
   3248				args->in.interrupt_timeout_us, args->in.cq_counters_handle,
   3249				args->in.cq_counters_offset,
   3250				args->in.target, interrupt,
   3251				!!(args->in.flags & HL_WAIT_CS_FLAGS_REGISTER_INTERRUPT),
   3252				args->in.timestamp_handle, args->in.timestamp_offset,
   3253				&status, &timestamp);
   3254	else
   3255		rc = _hl_interrupt_wait_ioctl_user_addr(hdev, hpriv->ctx,
   3256				args->in.interrupt_timeout_us, args->in.addr,
   3257				args->in.target, interrupt, &status,
   3258				&timestamp);
   3259	if (rc)
   3260		return rc;
   3261
   3262	memset(args, 0, sizeof(*args));
   3263	args->out.status = status;
   3264
   3265	if (timestamp) {
   3266		args->out.timestamp_nsec = timestamp;
   3267		args->out.flags |= HL_WAIT_CS_STATUS_FLAG_TIMESTAMP_VLD;
   3268	}
   3269
   3270	return 0;
   3271}
   3272
   3273int hl_wait_ioctl(struct hl_fpriv *hpriv, void *data)
   3274{
   3275	union hl_wait_cs_args *args = data;
   3276	u32 flags = args->in.flags;
   3277	int rc;
   3278
   3279	/* If the device is not operational, no point in waiting for any command submission or
   3280	 * user interrupt
   3281	 */
   3282	if (!hl_device_operational(hpriv->hdev, NULL))
   3283		return -EBUSY;
   3284
   3285	if (flags & HL_WAIT_CS_FLAGS_INTERRUPT)
   3286		rc = hl_interrupt_wait_ioctl(hpriv, data);
   3287	else if (flags & HL_WAIT_CS_FLAGS_MULTI_CS)
   3288		rc = hl_multi_cs_wait_ioctl(hpriv, data);
   3289	else
   3290		rc = hl_cs_wait_ioctl(hpriv, data);
   3291
   3292	return rc;
   3293}