cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

sdma.c (89945B)


      1// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
      2/*
      3 * Copyright(c) 2015 - 2018 Intel Corporation.
      4 */
      5
      6#include <linux/spinlock.h>
      7#include <linux/seqlock.h>
      8#include <linux/netdevice.h>
      9#include <linux/moduleparam.h>
     10#include <linux/bitops.h>
     11#include <linux/timer.h>
     12#include <linux/vmalloc.h>
     13#include <linux/highmem.h>
     14
     15#include "hfi.h"
     16#include "common.h"
     17#include "qp.h"
     18#include "sdma.h"
     19#include "iowait.h"
     20#include "trace.h"
     21
      22/* must be a power of 2, >= 64 and <= 32768 */
     23#define SDMA_DESCQ_CNT 2048
     24#define SDMA_DESC_INTR 64
     25#define INVALID_TAIL 0xffff
     26#define SDMA_PAD max_t(size_t, MAX_16B_PADDING, sizeof(u32))
     27
     28static uint sdma_descq_cnt = SDMA_DESCQ_CNT;
     29module_param(sdma_descq_cnt, uint, S_IRUGO);
     30MODULE_PARM_DESC(sdma_descq_cnt, "Number of SDMA descq entries");
     31
     32static uint sdma_idle_cnt = 250;
     33module_param(sdma_idle_cnt, uint, S_IRUGO);
     34MODULE_PARM_DESC(sdma_idle_cnt, "sdma interrupt idle delay (ns,default 250)");
     35
     36uint mod_num_sdma;
     37module_param_named(num_sdma, mod_num_sdma, uint, S_IRUGO);
     38MODULE_PARM_DESC(num_sdma, "Set max number SDMA engines to use");
     39
     40static uint sdma_desct_intr = SDMA_DESC_INTR;
     41module_param_named(desct_intr, sdma_desct_intr, uint, S_IRUGO | S_IWUSR);
     42MODULE_PARM_DESC(desct_intr, "Number of SDMA descriptor before interrupt");
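
/*
 * Illustrative usage of the parameters above (assuming the driver
 * module is named hfi1): they can be supplied at load time, e.g.
 * "modprobe hfi1 sdma_descq_cnt=4096 num_sdma=8". Of the four, only
 * desct_intr is additionally writable at runtime (S_IWUSR).
 */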
     43
     44#define SDMA_WAIT_BATCH_SIZE 20
      45/* max wait time for an SDMA engine to indicate it has halted */
     46#define SDMA_ERR_HALT_TIMEOUT 10 /* ms */
     47/* all SDMA engine errors that cause a halt */
     48
     49#define SD(name) SEND_DMA_##name
     50#define ALL_SDMA_ENG_HALT_ERRS \
     51	(SD(ENG_ERR_STATUS_SDMA_WRONG_DW_ERR_SMASK) \
     52	| SD(ENG_ERR_STATUS_SDMA_GEN_MISMATCH_ERR_SMASK) \
     53	| SD(ENG_ERR_STATUS_SDMA_TOO_LONG_ERR_SMASK) \
     54	| SD(ENG_ERR_STATUS_SDMA_TAIL_OUT_OF_BOUNDS_ERR_SMASK) \
     55	| SD(ENG_ERR_STATUS_SDMA_FIRST_DESC_ERR_SMASK) \
     56	| SD(ENG_ERR_STATUS_SDMA_MEM_READ_ERR_SMASK) \
     57	| SD(ENG_ERR_STATUS_SDMA_HALT_ERR_SMASK) \
     58	| SD(ENG_ERR_STATUS_SDMA_LENGTH_MISMATCH_ERR_SMASK) \
     59	| SD(ENG_ERR_STATUS_SDMA_PACKET_DESC_OVERFLOW_ERR_SMASK) \
     60	| SD(ENG_ERR_STATUS_SDMA_HEADER_SELECT_ERR_SMASK) \
     61	| SD(ENG_ERR_STATUS_SDMA_HEADER_ADDRESS_ERR_SMASK) \
     62	| SD(ENG_ERR_STATUS_SDMA_HEADER_LENGTH_ERR_SMASK) \
     63	| SD(ENG_ERR_STATUS_SDMA_TIMEOUT_ERR_SMASK) \
     64	| SD(ENG_ERR_STATUS_SDMA_DESC_TABLE_UNC_ERR_SMASK) \
     65	| SD(ENG_ERR_STATUS_SDMA_ASSEMBLY_UNC_ERR_SMASK) \
     66	| SD(ENG_ERR_STATUS_SDMA_PACKET_TRACKING_UNC_ERR_SMASK) \
     67	| SD(ENG_ERR_STATUS_SDMA_HEADER_STORAGE_UNC_ERR_SMASK) \
     68	| SD(ENG_ERR_STATUS_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SMASK))
     69
     70/* sdma_sendctrl operations */
     71#define SDMA_SENDCTRL_OP_ENABLE    BIT(0)
     72#define SDMA_SENDCTRL_OP_INTENABLE BIT(1)
     73#define SDMA_SENDCTRL_OP_HALT      BIT(2)
     74#define SDMA_SENDCTRL_OP_CLEANUP   BIT(3)
     75
     76/* handle long defines */
     77#define SDMA_EGRESS_PACKET_OCCUPANCY_SMASK \
     78SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SMASK
     79#define SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT \
     80SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT
     81
     82static const char * const sdma_state_names[] = {
     83	[sdma_state_s00_hw_down]                = "s00_HwDown",
     84	[sdma_state_s10_hw_start_up_halt_wait]  = "s10_HwStartUpHaltWait",
     85	[sdma_state_s15_hw_start_up_clean_wait] = "s15_HwStartUpCleanWait",
     86	[sdma_state_s20_idle]                   = "s20_Idle",
     87	[sdma_state_s30_sw_clean_up_wait]       = "s30_SwCleanUpWait",
     88	[sdma_state_s40_hw_clean_up_wait]       = "s40_HwCleanUpWait",
     89	[sdma_state_s50_hw_halt_wait]           = "s50_HwHaltWait",
     90	[sdma_state_s60_idle_halt_wait]         = "s60_IdleHaltWait",
     91	[sdma_state_s80_hw_freeze]		= "s80_HwFreeze",
     92	[sdma_state_s82_freeze_sw_clean]	= "s82_FreezeSwClean",
     93	[sdma_state_s99_running]                = "s99_Running",
     94};
     95
     96#ifdef CONFIG_SDMA_VERBOSITY
     97static const char * const sdma_event_names[] = {
     98	[sdma_event_e00_go_hw_down]   = "e00_GoHwDown",
     99	[sdma_event_e10_go_hw_start]  = "e10_GoHwStart",
    100	[sdma_event_e15_hw_halt_done] = "e15_HwHaltDone",
    101	[sdma_event_e25_hw_clean_up_done] = "e25_HwCleanUpDone",
    102	[sdma_event_e30_go_running]   = "e30_GoRunning",
    103	[sdma_event_e40_sw_cleaned]   = "e40_SwCleaned",
    104	[sdma_event_e50_hw_cleaned]   = "e50_HwCleaned",
    105	[sdma_event_e60_hw_halted]    = "e60_HwHalted",
    106	[sdma_event_e70_go_idle]      = "e70_GoIdle",
    107	[sdma_event_e80_hw_freeze]    = "e80_HwFreeze",
    108	[sdma_event_e81_hw_frozen]    = "e81_HwFrozen",
    109	[sdma_event_e82_hw_unfreeze]  = "e82_HwUnfreeze",
    110	[sdma_event_e85_link_down]    = "e85_LinkDown",
    111	[sdma_event_e90_sw_halted]    = "e90_SwHalted",
    112};
    113#endif
    114
    115static const struct sdma_set_state_action sdma_action_table[] = {
    116	[sdma_state_s00_hw_down] = {
    117		.go_s99_running_tofalse = 1,
    118		.op_enable = 0,
    119		.op_intenable = 0,
    120		.op_halt = 0,
    121		.op_cleanup = 0,
    122	},
    123	[sdma_state_s10_hw_start_up_halt_wait] = {
    124		.op_enable = 0,
    125		.op_intenable = 0,
    126		.op_halt = 1,
    127		.op_cleanup = 0,
    128	},
    129	[sdma_state_s15_hw_start_up_clean_wait] = {
    130		.op_enable = 0,
    131		.op_intenable = 1,
    132		.op_halt = 0,
    133		.op_cleanup = 1,
    134	},
    135	[sdma_state_s20_idle] = {
    136		.op_enable = 0,
    137		.op_intenable = 1,
    138		.op_halt = 0,
    139		.op_cleanup = 0,
    140	},
    141	[sdma_state_s30_sw_clean_up_wait] = {
    142		.op_enable = 0,
    143		.op_intenable = 0,
    144		.op_halt = 0,
    145		.op_cleanup = 0,
    146	},
    147	[sdma_state_s40_hw_clean_up_wait] = {
    148		.op_enable = 0,
    149		.op_intenable = 0,
    150		.op_halt = 0,
    151		.op_cleanup = 1,
    152	},
    153	[sdma_state_s50_hw_halt_wait] = {
    154		.op_enable = 0,
    155		.op_intenable = 0,
    156		.op_halt = 0,
    157		.op_cleanup = 0,
    158	},
    159	[sdma_state_s60_idle_halt_wait] = {
    160		.go_s99_running_tofalse = 1,
    161		.op_enable = 0,
    162		.op_intenable = 0,
    163		.op_halt = 1,
    164		.op_cleanup = 0,
    165	},
    166	[sdma_state_s80_hw_freeze] = {
    167		.op_enable = 0,
    168		.op_intenable = 0,
    169		.op_halt = 0,
    170		.op_cleanup = 0,
    171	},
    172	[sdma_state_s82_freeze_sw_clean] = {
    173		.op_enable = 0,
    174		.op_intenable = 0,
    175		.op_halt = 0,
    176		.op_cleanup = 0,
    177	},
    178	[sdma_state_s99_running] = {
    179		.op_enable = 1,
    180		.op_intenable = 1,
    181		.op_halt = 0,
    182		.op_cleanup = 0,
    183		.go_s99_running_totrue = 1,
    184	},
    185};
    186
    187#define SDMA_TAIL_UPDATE_THRESH 0x1F
    188
    189/* declare all statics here rather than keep sorting */
    190static void sdma_complete(struct kref *);
    191static void sdma_finalput(struct sdma_state *);
    192static void sdma_get(struct sdma_state *);
    193static void sdma_hw_clean_up_task(struct tasklet_struct *);
    194static void sdma_put(struct sdma_state *);
    195static void sdma_set_state(struct sdma_engine *, enum sdma_states);
    196static void sdma_start_hw_clean_up(struct sdma_engine *);
    197static void sdma_sw_clean_up_task(struct tasklet_struct *);
    198static void sdma_sendctrl(struct sdma_engine *, unsigned);
    199static void init_sdma_regs(struct sdma_engine *, u32, uint);
    200static void sdma_process_event(
    201	struct sdma_engine *sde,
    202	enum sdma_events event);
    203static void __sdma_process_event(
    204	struct sdma_engine *sde,
    205	enum sdma_events event);
    206static void dump_sdma_state(struct sdma_engine *sde);
    207static void sdma_make_progress(struct sdma_engine *sde, u64 status);
    208static void sdma_desc_avail(struct sdma_engine *sde, uint avail);
    209static void sdma_flush_descq(struct sdma_engine *sde);
    210
    211/**
    212 * sdma_state_name() - return state string from enum
    213 * @state: state
    214 */
    215static const char *sdma_state_name(enum sdma_states state)
    216{
    217	return sdma_state_names[state];
    218}
    219
    220static void sdma_get(struct sdma_state *ss)
    221{
    222	kref_get(&ss->kref);
    223}
    224
    225static void sdma_complete(struct kref *kref)
    226{
    227	struct sdma_state *ss =
    228		container_of(kref, struct sdma_state, kref);
    229
    230	complete(&ss->comp);
    231}
    232
    233static void sdma_put(struct sdma_state *ss)
    234{
    235	kref_put(&ss->kref, sdma_complete);
    236}
    237
    238static void sdma_finalput(struct sdma_state *ss)
    239{
    240	sdma_put(ss);
    241	wait_for_completion(&ss->comp);
    242}
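
/*
 * Note: the four helpers above form a reference-counted teardown.
 * sdma_get()/sdma_put() take and drop references on the state machine,
 * sdma_complete() signals ss->comp when the last reference goes away,
 * and sdma_finalput() drops the caller's reference and then blocks in
 * wait_for_completion() until every other holder has released theirs.
 */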
    243
    244static inline void write_sde_csr(
    245	struct sdma_engine *sde,
    246	u32 offset0,
    247	u64 value)
    248{
    249	write_kctxt_csr(sde->dd, sde->this_idx, offset0, value);
    250}
    251
    252static inline u64 read_sde_csr(
    253	struct sdma_engine *sde,
    254	u32 offset0)
    255{
    256	return read_kctxt_csr(sde->dd, sde->this_idx, offset0);
    257}
    258
    259/*
    260 * sdma_wait_for_packet_egress() - wait for the VL FIFO occupancy for
    261 * sdma engine 'sde' to drop to 0.
    262 */
    263static void sdma_wait_for_packet_egress(struct sdma_engine *sde,
    264					int pause)
    265{
    266	u64 off = 8 * sde->this_idx;
    267	struct hfi1_devdata *dd = sde->dd;
    268	int lcnt = 0;
    269	u64 reg_prev;
    270	u64 reg = 0;
    271
    272	while (1) {
    273		reg_prev = reg;
    274		reg = read_csr(dd, off + SEND_EGRESS_SEND_DMA_STATUS);
    275
    276		reg &= SDMA_EGRESS_PACKET_OCCUPANCY_SMASK;
    277		reg >>= SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT;
    278		if (reg == 0)
    279			break;
     280		/* counter is reset if occupancy count changes */
    281		if (reg != reg_prev)
    282			lcnt = 0;
    283		if (lcnt++ > 500) {
    284			/* timed out - bounce the link */
    285			dd_dev_err(dd, "%s: engine %u timeout waiting for packets to egress, remaining count %u, bouncing link\n",
    286				   __func__, sde->this_idx, (u32)reg);
    287			queue_work(dd->pport->link_wq,
    288				   &dd->pport->link_bounce_work);
    289			break;
    290		}
    291		udelay(1);
    292	}
    293}
    294
    295/*
    296 * sdma_wait() - wait for packet egress to complete for all SDMA engines,
    297 * and pause for credit return.
    298 */
    299void sdma_wait(struct hfi1_devdata *dd)
    300{
    301	int i;
    302
    303	for (i = 0; i < dd->num_sdma; i++) {
    304		struct sdma_engine *sde = &dd->per_sdma[i];
    305
    306		sdma_wait_for_packet_egress(sde, 0);
    307	}
    308}
    309
    310static inline void sdma_set_desc_cnt(struct sdma_engine *sde, unsigned cnt)
    311{
    312	u64 reg;
    313
    314	if (!(sde->dd->flags & HFI1_HAS_SDMA_TIMEOUT))
    315		return;
    316	reg = cnt;
    317	reg &= SD(DESC_CNT_CNT_MASK);
    318	reg <<= SD(DESC_CNT_CNT_SHIFT);
    319	write_sde_csr(sde, SD(DESC_CNT), reg);
    320}
    321
    322static inline void complete_tx(struct sdma_engine *sde,
    323			       struct sdma_txreq *tx,
    324			       int res)
    325{
    326	/* protect against complete modifying */
    327	struct iowait *wait = tx->wait;
    328	callback_t complete = tx->complete;
    329
    330#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
    331	trace_hfi1_sdma_out_sn(sde, tx->sn);
    332	if (WARN_ON_ONCE(sde->head_sn != tx->sn))
    333		dd_dev_err(sde->dd, "expected %llu got %llu\n",
    334			   sde->head_sn, tx->sn);
    335	sde->head_sn++;
    336#endif
    337	__sdma_txclean(sde->dd, tx);
    338	if (complete)
    339		(*complete)(tx, res);
    340	if (iowait_sdma_dec(wait))
    341		iowait_drain_wakeup(wait);
    342}
    343
    344/*
    345 * Complete all the sdma requests with a SDMA_TXREQ_S_ABORTED status
    346 *
    347 * Depending on timing there can be txreqs in two places:
    348 * - in the descq ring
    349 * - in the flush list
    350 *
    351 * To avoid ordering issues the descq ring needs to be flushed
    352 * first followed by the flush list.
    353 *
    354 * This routine is called from two places
    355 * - From a work queue item
    356 * - Directly from the state machine just before setting the
    357 *   state to running
    358 *
    359 * Must be called with head_lock held
    360 *
    361 */
    362static void sdma_flush(struct sdma_engine *sde)
    363{
    364	struct sdma_txreq *txp, *txp_next;
    365	LIST_HEAD(flushlist);
    366	unsigned long flags;
    367	uint seq;
    368
    369	/* flush from head to tail */
    370	sdma_flush_descq(sde);
    371	spin_lock_irqsave(&sde->flushlist_lock, flags);
    372	/* copy flush list */
    373	list_splice_init(&sde->flushlist, &flushlist);
    374	spin_unlock_irqrestore(&sde->flushlist_lock, flags);
    375	/* flush from flush list */
    376	list_for_each_entry_safe(txp, txp_next, &flushlist, list)
    377		complete_tx(sde, txp, SDMA_TXREQ_S_ABORTED);
    378	/* wakeup QPs orphaned on the dmawait list */
    379	do {
    380		struct iowait *w, *nw;
    381
    382		seq = read_seqbegin(&sde->waitlock);
    383		if (!list_empty(&sde->dmawait)) {
    384			write_seqlock(&sde->waitlock);
    385			list_for_each_entry_safe(w, nw, &sde->dmawait, list) {
    386				if (w->wakeup) {
    387					w->wakeup(w, SDMA_AVAIL_REASON);
    388					list_del_init(&w->list);
    389				}
    390			}
    391			write_sequnlock(&sde->waitlock);
    392		}
    393	} while (read_seqretry(&sde->waitlock, seq));
    394}
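
/*
 * Note: the wakeup loop above uses the waitlock seqlock in a
 * check-then-lock pattern: dmawait is sampled under a read section and
 * the write lock is only taken when the list is non-empty, with
 * read_seqretry() restarting the scan if the list changed under the
 * reader.
 */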
    395
    396/*
    397 * Fields a work request for flushing the descq ring
    398 * and the flush list
    399 *
    400 * If the engine has been brought to running during
    401 * the scheduling delay, the flush is ignored, assuming
    402 * that the process of bringing the engine to running
    403 * would have done this flush prior to going to running.
    404 *
    405 */
    406static void sdma_field_flush(struct work_struct *work)
    407{
    408	unsigned long flags;
    409	struct sdma_engine *sde =
    410		container_of(work, struct sdma_engine, flush_worker);
    411
    412	write_seqlock_irqsave(&sde->head_lock, flags);
    413	if (!__sdma_running(sde))
    414		sdma_flush(sde);
    415	write_sequnlock_irqrestore(&sde->head_lock, flags);
    416}
    417
    418static void sdma_err_halt_wait(struct work_struct *work)
    419{
    420	struct sdma_engine *sde = container_of(work, struct sdma_engine,
    421						err_halt_worker);
    422	u64 statuscsr;
    423	unsigned long timeout;
    424
    425	timeout = jiffies + msecs_to_jiffies(SDMA_ERR_HALT_TIMEOUT);
    426	while (1) {
    427		statuscsr = read_sde_csr(sde, SD(STATUS));
    428		statuscsr &= SD(STATUS_ENG_HALTED_SMASK);
    429		if (statuscsr)
    430			break;
    431		if (time_after(jiffies, timeout)) {
    432			dd_dev_err(sde->dd,
    433				   "SDMA engine %d - timeout waiting for engine to halt\n",
    434				   sde->this_idx);
    435			/*
    436			 * Continue anyway.  This could happen if there was
    437			 * an uncorrectable error in the wrong spot.
    438			 */
    439			break;
    440		}
    441		usleep_range(80, 120);
    442	}
    443
    444	sdma_process_event(sde, sdma_event_e15_hw_halt_done);
    445}
    446
    447static void sdma_err_progress_check_schedule(struct sdma_engine *sde)
    448{
    449	if (!is_bx(sde->dd) && HFI1_CAP_IS_KSET(SDMA_AHG)) {
    450		unsigned index;
    451		struct hfi1_devdata *dd = sde->dd;
    452
    453		for (index = 0; index < dd->num_sdma; index++) {
    454			struct sdma_engine *curr_sdma = &dd->per_sdma[index];
    455
    456			if (curr_sdma != sde)
    457				curr_sdma->progress_check_head =
    458							curr_sdma->descq_head;
    459		}
    460		dd_dev_err(sde->dd,
    461			   "SDMA engine %d - check scheduled\n",
    462				sde->this_idx);
    463		mod_timer(&sde->err_progress_check_timer, jiffies + 10);
    464	}
    465}
    466
    467static void sdma_err_progress_check(struct timer_list *t)
    468{
    469	unsigned index;
    470	struct sdma_engine *sde = from_timer(sde, t, err_progress_check_timer);
    471
    472	dd_dev_err(sde->dd, "SDE progress check event\n");
    473	for (index = 0; index < sde->dd->num_sdma; index++) {
    474		struct sdma_engine *curr_sde = &sde->dd->per_sdma[index];
    475		unsigned long flags;
    476
    477		/* check progress on each engine except the current one */
    478		if (curr_sde == sde)
    479			continue;
    480		/*
    481		 * We must lock interrupts when acquiring sde->lock,
    482		 * to avoid a deadlock if interrupt triggers and spins on
    483		 * the same lock on same CPU
    484		 */
    485		spin_lock_irqsave(&curr_sde->tail_lock, flags);
    486		write_seqlock(&curr_sde->head_lock);
    487
    488		/* skip non-running queues */
    489		if (curr_sde->state.current_state != sdma_state_s99_running) {
    490			write_sequnlock(&curr_sde->head_lock);
    491			spin_unlock_irqrestore(&curr_sde->tail_lock, flags);
    492			continue;
    493		}
    494
    495		if ((curr_sde->descq_head != curr_sde->descq_tail) &&
    496		    (curr_sde->descq_head ==
    497				curr_sde->progress_check_head))
    498			__sdma_process_event(curr_sde,
    499					     sdma_event_e90_sw_halted);
    500		write_sequnlock(&curr_sde->head_lock);
    501		spin_unlock_irqrestore(&curr_sde->tail_lock, flags);
    502	}
    503	schedule_work(&sde->err_halt_worker);
    504}
    505
    506static void sdma_hw_clean_up_task(struct tasklet_struct *t)
    507{
    508	struct sdma_engine *sde = from_tasklet(sde, t,
    509					       sdma_hw_clean_up_task);
    510	u64 statuscsr;
    511
    512	while (1) {
    513#ifdef CONFIG_SDMA_VERBOSITY
    514		dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n",
    515			   sde->this_idx, slashstrip(__FILE__), __LINE__,
    516			__func__);
    517#endif
    518		statuscsr = read_sde_csr(sde, SD(STATUS));
    519		statuscsr &= SD(STATUS_ENG_CLEANED_UP_SMASK);
    520		if (statuscsr)
    521			break;
    522		udelay(10);
    523	}
    524
    525	sdma_process_event(sde, sdma_event_e25_hw_clean_up_done);
    526}
    527
    528static inline struct sdma_txreq *get_txhead(struct sdma_engine *sde)
    529{
    530	return sde->tx_ring[sde->tx_head & sde->sdma_mask];
    531}
    532
    533/*
    534 * flush ring for recovery
    535 */
    536static void sdma_flush_descq(struct sdma_engine *sde)
    537{
    538	u16 head, tail;
    539	int progress = 0;
    540	struct sdma_txreq *txp = get_txhead(sde);
    541
    542	/* The reason for some of the complexity of this code is that
    543	 * not all descriptors have corresponding txps.  So, we have to
    544	 * be able to skip over descs until we wander into the range of
    545	 * the next txp on the list.
    546	 */
    547	head = sde->descq_head & sde->sdma_mask;
    548	tail = sde->descq_tail & sde->sdma_mask;
    549	while (head != tail) {
    550		/* advance head, wrap if needed */
    551		head = ++sde->descq_head & sde->sdma_mask;
    552		/* if now past this txp's descs, do the callback */
    553		if (txp && txp->next_descq_idx == head) {
    554			/* remove from list */
    555			sde->tx_ring[sde->tx_head++ & sde->sdma_mask] = NULL;
    556			complete_tx(sde, txp, SDMA_TXREQ_S_ABORTED);
    557			trace_hfi1_sdma_progress(sde, head, tail, txp);
    558			txp = get_txhead(sde);
    559		}
    560		progress++;
    561	}
    562	if (progress)
    563		sdma_desc_avail(sde, sdma_descq_freecnt(sde));
    564}
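
/*
 * Worked example of the ring arithmetic above: with the default
 * descq_cnt of 2048 the mask is 0x7ff, so incrementing a head of 2047
 * yields 2048 & 0x7ff == 0, i.e. the head wraps back to the start of
 * the descriptor ring.
 */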
    565
    566static void sdma_sw_clean_up_task(struct tasklet_struct *t)
    567{
    568	struct sdma_engine *sde = from_tasklet(sde, t, sdma_sw_clean_up_task);
    569	unsigned long flags;
    570
    571	spin_lock_irqsave(&sde->tail_lock, flags);
    572	write_seqlock(&sde->head_lock);
    573
    574	/*
    575	 * At this point, the following should always be true:
    576	 * - We are halted, so no more descriptors are getting retired.
    577	 * - We are not running, so no one is submitting new work.
    578	 * - Only we can send the e40_sw_cleaned, so we can't start
    579	 *   running again until we say so.  So, the active list and
    580	 *   descq are ours to play with.
    581	 */
    582
    583	/*
    584	 * In the error clean up sequence, software clean must be called
    585	 * before the hardware clean so we can use the hardware head in
    586	 * the progress routine.  A hardware clean or SPC unfreeze will
    587	 * reset the hardware head.
    588	 *
    589	 * Process all retired requests. The progress routine will use the
    590	 * latest physical hardware head - we are not running so speed does
    591	 * not matter.
    592	 */
    593	sdma_make_progress(sde, 0);
    594
    595	sdma_flush(sde);
    596
    597	/*
    598	 * Reset our notion of head and tail.
    599	 * Note that the HW registers have been reset via an earlier
    600	 * clean up.
    601	 */
    602	sde->descq_tail = 0;
    603	sde->descq_head = 0;
    604	sde->desc_avail = sdma_descq_freecnt(sde);
    605	*sde->head_dma = 0;
    606
    607	__sdma_process_event(sde, sdma_event_e40_sw_cleaned);
    608
    609	write_sequnlock(&sde->head_lock);
    610	spin_unlock_irqrestore(&sde->tail_lock, flags);
    611}
    612
    613static void sdma_sw_tear_down(struct sdma_engine *sde)
    614{
    615	struct sdma_state *ss = &sde->state;
    616
    617	/* Releasing this reference means the state machine has stopped. */
    618	sdma_put(ss);
    619
    620	/* stop waiting for all unfreeze events to complete */
    621	atomic_set(&sde->dd->sdma_unfreeze_count, -1);
    622	wake_up_interruptible(&sde->dd->sdma_unfreeze_wq);
    623}
    624
    625static void sdma_start_hw_clean_up(struct sdma_engine *sde)
    626{
    627	tasklet_hi_schedule(&sde->sdma_hw_clean_up_task);
    628}
    629
    630static void sdma_set_state(struct sdma_engine *sde,
    631			   enum sdma_states next_state)
    632{
    633	struct sdma_state *ss = &sde->state;
    634	const struct sdma_set_state_action *action = sdma_action_table;
    635	unsigned op = 0;
    636
    637	trace_hfi1_sdma_state(
    638		sde,
    639		sdma_state_names[ss->current_state],
    640		sdma_state_names[next_state]);
    641
    642	/* debugging bookkeeping */
    643	ss->previous_state = ss->current_state;
    644	ss->previous_op = ss->current_op;
    645	ss->current_state = next_state;
    646
    647	if (ss->previous_state != sdma_state_s99_running &&
    648	    next_state == sdma_state_s99_running)
    649		sdma_flush(sde);
    650
    651	if (action[next_state].op_enable)
    652		op |= SDMA_SENDCTRL_OP_ENABLE;
    653
    654	if (action[next_state].op_intenable)
    655		op |= SDMA_SENDCTRL_OP_INTENABLE;
    656
    657	if (action[next_state].op_halt)
    658		op |= SDMA_SENDCTRL_OP_HALT;
    659
    660	if (action[next_state].op_cleanup)
    661		op |= SDMA_SENDCTRL_OP_CLEANUP;
    662
    663	if (action[next_state].go_s99_running_tofalse)
    664		ss->go_s99_running = 0;
    665
    666	if (action[next_state].go_s99_running_totrue)
    667		ss->go_s99_running = 1;
    668
    669	ss->current_op = op;
    670	sdma_sendctrl(sde, ss->current_op);
    671}
    672
    673/**
    674 * sdma_get_descq_cnt() - called when device probed
    675 *
    676 * Return a validated descq count.
    677 *
    678 * This is currently only used in the verbs initialization to build the tx
    679 * list.
    680 *
    681 * This will probably be deleted in favor of a more scalable approach to
    682 * alloc tx's.
    683 *
    684 */
    685u16 sdma_get_descq_cnt(void)
    686{
    687	u16 count = sdma_descq_cnt;
    688
    689	if (!count)
    690		return SDMA_DESCQ_CNT;
     691	/* count must be a power of 2 between 64 and 32768 inclusive.
     692	 * Otherwise return default.
    693	 */
    694	if (!is_power_of_2(count))
    695		return SDMA_DESCQ_CNT;
    696	if (count < 64 || count > 32768)
    697		return SDMA_DESCQ_CNT;
    698	return count;
    699}
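
/*
 * Examples of the validation above: sdma_descq_cnt=1000 is rejected
 * (not a power of 2) and the default of 2048 is used instead, while
 * sdma_descq_cnt=4096 passes both checks and is used as given.
 */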
    700
    701/**
    702 * sdma_engine_get_vl() - return vl for a given sdma engine
    703 * @sde: sdma engine
    704 *
    705 * This function returns the vl mapped to a given engine, or an error if
    706 * the mapping can't be found. The mapping fields are protected by RCU.
    707 */
    708int sdma_engine_get_vl(struct sdma_engine *sde)
    709{
    710	struct hfi1_devdata *dd = sde->dd;
    711	struct sdma_vl_map *m;
    712	u8 vl;
    713
    714	if (sde->this_idx >= TXE_NUM_SDMA_ENGINES)
    715		return -EINVAL;
    716
    717	rcu_read_lock();
    718	m = rcu_dereference(dd->sdma_map);
    719	if (unlikely(!m)) {
    720		rcu_read_unlock();
    721		return -EINVAL;
    722	}
    723	vl = m->engine_to_vl[sde->this_idx];
    724	rcu_read_unlock();
    725
    726	return vl;
    727}
    728
    729/**
    730 * sdma_select_engine_vl() - select sdma engine
    731 * @dd: devdata
    732 * @selector: a spreading factor
    733 * @vl: this vl
    734 *
    735 *
    736 * This function returns an engine based on the selector and a vl.  The
    737 * mapping fields are protected by RCU.
    738 */
    739struct sdma_engine *sdma_select_engine_vl(
    740	struct hfi1_devdata *dd,
    741	u32 selector,
    742	u8 vl)
    743{
    744	struct sdma_vl_map *m;
    745	struct sdma_map_elem *e;
    746	struct sdma_engine *rval;
    747
    748	/* NOTE This should only happen if SC->VL changed after the initial
    749	 *      checks on the QP/AH
    750	 *      Default will return engine 0 below
    751	 */
    752	if (vl >= num_vls) {
    753		rval = NULL;
    754		goto done;
    755	}
    756
    757	rcu_read_lock();
    758	m = rcu_dereference(dd->sdma_map);
    759	if (unlikely(!m)) {
    760		rcu_read_unlock();
    761		return &dd->per_sdma[0];
    762	}
    763	e = m->map[vl & m->mask];
    764	rval = e->sde[selector & e->mask];
    765	rcu_read_unlock();
    766
    767done:
    768	rval =  !rval ? &dd->per_sdma[0] : rval;
    769	trace_hfi1_sdma_engine_select(dd, selector, vl, rval->this_idx);
    770	return rval;
    771}
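
/*
 * Note: the lookup above is two masked indexes. "vl & m->mask" selects
 * the per-VL element and "selector & e->mask" selects one of the
 * engines replicated for that VL, so any 32-bit selector maps to a
 * valid engine without a modulo operation.
 */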
    772
    773/**
    774 * sdma_select_engine_sc() - select sdma engine
    775 * @dd: devdata
    776 * @selector: a spreading factor
    777 * @sc5: the 5 bit sc
    778 *
    779 *
    780 * This function returns an engine based on the selector and an sc.
    781 */
    782struct sdma_engine *sdma_select_engine_sc(
    783	struct hfi1_devdata *dd,
    784	u32 selector,
    785	u8 sc5)
    786{
    787	u8 vl = sc_to_vlt(dd, sc5);
    788
    789	return sdma_select_engine_vl(dd, selector, vl);
    790}
    791
    792struct sdma_rht_map_elem {
    793	u32 mask;
    794	u8 ctr;
    795	struct sdma_engine *sde[];
    796};
    797
    798struct sdma_rht_node {
    799	unsigned long cpu_id;
    800	struct sdma_rht_map_elem *map[HFI1_MAX_VLS_SUPPORTED];
    801	struct rhash_head node;
    802};
    803
    804#define NR_CPUS_HINT 192
    805
    806static const struct rhashtable_params sdma_rht_params = {
    807	.nelem_hint = NR_CPUS_HINT,
    808	.head_offset = offsetof(struct sdma_rht_node, node),
    809	.key_offset = offsetof(struct sdma_rht_node, cpu_id),
    810	.key_len = sizeof_field(struct sdma_rht_node, cpu_id),
    811	.max_size = NR_CPUS,
    812	.min_size = 8,
    813	.automatic_shrinking = true,
    814};
    815
    816/*
    817 * sdma_select_user_engine() - select sdma engine based on user setup
    818 * @dd: devdata
    819 * @selector: a spreading factor
    820 * @vl: this vl
    821 *
    822 * This function returns an sdma engine for a user sdma request.
    823 * User defined sdma engine affinity setting is honored when applicable,
    824 * otherwise system default sdma engine mapping is used. To ensure correct
    825 * ordering, the mapping from <selector, vl> to sde must remain unchanged.
    826 */
    827struct sdma_engine *sdma_select_user_engine(struct hfi1_devdata *dd,
    828					    u32 selector, u8 vl)
    829{
    830	struct sdma_rht_node *rht_node;
    831	struct sdma_engine *sde = NULL;
    832	unsigned long cpu_id;
    833
    834	/*
     835	 * To ensure that the same sdma engine(s) are always selected,
     836	 * make sure the process is pinned to this CPU only.
    837	 */
    838	if (current->nr_cpus_allowed != 1)
    839		goto out;
    840
    841	rcu_read_lock();
    842	cpu_id = smp_processor_id();
    843	rht_node = rhashtable_lookup(dd->sdma_rht, &cpu_id,
    844				     sdma_rht_params);
    845
    846	if (rht_node && rht_node->map[vl]) {
    847		struct sdma_rht_map_elem *map = rht_node->map[vl];
    848
    849		sde = map->sde[selector & map->mask];
    850	}
    851	rcu_read_unlock();
    852
    853	if (sde)
    854		return sde;
    855
    856out:
    857	return sdma_select_engine_vl(dd, selector, vl);
    858}
    859
    860static void sdma_populate_sde_map(struct sdma_rht_map_elem *map)
    861{
    862	int i;
    863
    864	for (i = 0; i < roundup_pow_of_two(map->ctr ? : 1) - map->ctr; i++)
    865		map->sde[map->ctr + i] = map->sde[i];
    866}
    867
    868static void sdma_cleanup_sde_map(struct sdma_rht_map_elem *map,
    869				 struct sdma_engine *sde)
    870{
    871	unsigned int i, pow;
    872
    873	/* only need to check the first ctr entries for a match */
    874	for (i = 0; i < map->ctr; i++) {
    875		if (map->sde[i] == sde) {
    876			memmove(&map->sde[i], &map->sde[i + 1],
    877				(map->ctr - i - 1) * sizeof(map->sde[0]));
    878			map->ctr--;
    879			pow = roundup_pow_of_two(map->ctr ? : 1);
    880			map->mask = pow - 1;
    881			sdma_populate_sde_map(map);
    882			break;
    883		}
    884	}
    885}
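
/*
 * Note: after an engine is removed above, the surviving entries are
 * re-replicated out to the next power of two by sdma_populate_sde_map()
 * so that the "selector & map->mask" indexing used during engine
 * selection always lands on a valid engine.
 */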
    886
    887/*
    888 * Prevents concurrent reads and writes of the sdma engine cpu_mask
    889 */
    890static DEFINE_MUTEX(process_to_sde_mutex);
    891
    892ssize_t sdma_set_cpu_to_sde_map(struct sdma_engine *sde, const char *buf,
    893				size_t count)
    894{
    895	struct hfi1_devdata *dd = sde->dd;
    896	cpumask_var_t mask, new_mask;
    897	unsigned long cpu;
    898	int ret, vl, sz;
    899	struct sdma_rht_node *rht_node;
    900
    901	vl = sdma_engine_get_vl(sde);
    902	if (unlikely(vl < 0 || vl >= ARRAY_SIZE(rht_node->map)))
    903		return -EINVAL;
    904
    905	ret = zalloc_cpumask_var(&mask, GFP_KERNEL);
    906	if (!ret)
    907		return -ENOMEM;
    908
    909	ret = zalloc_cpumask_var(&new_mask, GFP_KERNEL);
    910	if (!ret) {
    911		free_cpumask_var(mask);
    912		return -ENOMEM;
    913	}
    914	ret = cpulist_parse(buf, mask);
    915	if (ret)
    916		goto out_free;
    917
    918	if (!cpumask_subset(mask, cpu_online_mask)) {
    919		dd_dev_warn(sde->dd, "Invalid CPU mask\n");
    920		ret = -EINVAL;
    921		goto out_free;
    922	}
    923
    924	sz = sizeof(struct sdma_rht_map_elem) +
    925			(TXE_NUM_SDMA_ENGINES * sizeof(struct sdma_engine *));
    926
    927	mutex_lock(&process_to_sde_mutex);
    928
    929	for_each_cpu(cpu, mask) {
    930		/* Check if we have this already mapped */
    931		if (cpumask_test_cpu(cpu, &sde->cpu_mask)) {
    932			cpumask_set_cpu(cpu, new_mask);
    933			continue;
    934		}
    935
    936		rht_node = rhashtable_lookup_fast(dd->sdma_rht, &cpu,
    937						  sdma_rht_params);
    938		if (!rht_node) {
    939			rht_node = kzalloc(sizeof(*rht_node), GFP_KERNEL);
    940			if (!rht_node) {
    941				ret = -ENOMEM;
    942				goto out;
    943			}
    944
    945			rht_node->map[vl] = kzalloc(sz, GFP_KERNEL);
    946			if (!rht_node->map[vl]) {
    947				kfree(rht_node);
    948				ret = -ENOMEM;
    949				goto out;
    950			}
    951			rht_node->cpu_id = cpu;
    952			rht_node->map[vl]->mask = 0;
    953			rht_node->map[vl]->ctr = 1;
    954			rht_node->map[vl]->sde[0] = sde;
    955
    956			ret = rhashtable_insert_fast(dd->sdma_rht,
    957						     &rht_node->node,
    958						     sdma_rht_params);
    959			if (ret) {
    960				kfree(rht_node->map[vl]);
    961				kfree(rht_node);
    962				dd_dev_err(sde->dd, "Failed to set process to sde affinity for cpu %lu\n",
    963					   cpu);
    964				goto out;
    965			}
    966
    967		} else {
    968			int ctr, pow;
    969
    970			/* Add new user mappings */
    971			if (!rht_node->map[vl])
    972				rht_node->map[vl] = kzalloc(sz, GFP_KERNEL);
    973
    974			if (!rht_node->map[vl]) {
    975				ret = -ENOMEM;
    976				goto out;
    977			}
    978
    979			rht_node->map[vl]->ctr++;
    980			ctr = rht_node->map[vl]->ctr;
    981			rht_node->map[vl]->sde[ctr - 1] = sde;
    982			pow = roundup_pow_of_two(ctr);
    983			rht_node->map[vl]->mask = pow - 1;
    984
    985			/* Populate the sde map table */
    986			sdma_populate_sde_map(rht_node->map[vl]);
    987		}
    988		cpumask_set_cpu(cpu, new_mask);
    989	}
    990
    991	/* Clean up old mappings */
    992	for_each_cpu(cpu, cpu_online_mask) {
    993		struct sdma_rht_node *rht_node;
    994
    995		/* Don't cleanup sdes that are set in the new mask */
    996		if (cpumask_test_cpu(cpu, mask))
    997			continue;
    998
    999		rht_node = rhashtable_lookup_fast(dd->sdma_rht, &cpu,
   1000						  sdma_rht_params);
   1001		if (rht_node) {
   1002			bool empty = true;
   1003			int i;
   1004
   1005			/* Remove mappings for old sde */
   1006			for (i = 0; i < HFI1_MAX_VLS_SUPPORTED; i++)
   1007				if (rht_node->map[i])
   1008					sdma_cleanup_sde_map(rht_node->map[i],
   1009							     sde);
   1010
   1011			/* Free empty hash table entries */
   1012			for (i = 0; i < HFI1_MAX_VLS_SUPPORTED; i++) {
   1013				if (!rht_node->map[i])
   1014					continue;
   1015
   1016				if (rht_node->map[i]->ctr) {
   1017					empty = false;
   1018					break;
   1019				}
   1020			}
   1021
   1022			if (empty) {
   1023				ret = rhashtable_remove_fast(dd->sdma_rht,
   1024							     &rht_node->node,
   1025							     sdma_rht_params);
   1026				WARN_ON(ret);
   1027
   1028				for (i = 0; i < HFI1_MAX_VLS_SUPPORTED; i++)
   1029					kfree(rht_node->map[i]);
   1030
   1031				kfree(rht_node);
   1032			}
   1033		}
   1034	}
   1035
   1036	cpumask_copy(&sde->cpu_mask, new_mask);
   1037out:
   1038	mutex_unlock(&process_to_sde_mutex);
   1039out_free:
   1040	free_cpumask_var(mask);
   1041	free_cpumask_var(new_mask);
   1042	return ret ? : strnlen(buf, PAGE_SIZE);
   1043}
   1044
   1045ssize_t sdma_get_cpu_to_sde_map(struct sdma_engine *sde, char *buf)
   1046{
   1047	mutex_lock(&process_to_sde_mutex);
   1048	if (cpumask_empty(&sde->cpu_mask))
   1049		snprintf(buf, PAGE_SIZE, "%s\n", "empty");
   1050	else
   1051		cpumap_print_to_pagebuf(true, buf, &sde->cpu_mask);
   1052	mutex_unlock(&process_to_sde_mutex);
   1053	return strnlen(buf, PAGE_SIZE);
   1054}
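
/*
 * Illustrative usage, assuming the cpu mask is exposed through a
 * per-engine sysfs attribute (e.g. sdma<N>/cpu_list): writing a cpu
 * list such as "0-3" routes user SDMA requests from processes pinned
 * to one of those CPUs to this engine, and reading it back prints the
 * current mask, or "empty" if none is set.
 */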
   1055
   1056static void sdma_rht_free(void *ptr, void *arg)
   1057{
   1058	struct sdma_rht_node *rht_node = ptr;
   1059	int i;
   1060
   1061	for (i = 0; i < HFI1_MAX_VLS_SUPPORTED; i++)
   1062		kfree(rht_node->map[i]);
   1063
   1064	kfree(rht_node);
   1065}
   1066
   1067/**
   1068 * sdma_seqfile_dump_cpu_list() - debugfs dump the cpu to sdma mappings
   1069 * @s: seq file
   1070 * @dd: hfi1_devdata
   1071 * @cpuid: cpu id
   1072 *
   1073 * This routine dumps the process to sde mappings per cpu
   1074 */
   1075void sdma_seqfile_dump_cpu_list(struct seq_file *s,
   1076				struct hfi1_devdata *dd,
   1077				unsigned long cpuid)
   1078{
   1079	struct sdma_rht_node *rht_node;
   1080	int i, j;
   1081
   1082	rht_node = rhashtable_lookup_fast(dd->sdma_rht, &cpuid,
   1083					  sdma_rht_params);
   1084	if (!rht_node)
   1085		return;
   1086
   1087	seq_printf(s, "cpu%3lu: ", cpuid);
   1088	for (i = 0; i < HFI1_MAX_VLS_SUPPORTED; i++) {
   1089		if (!rht_node->map[i] || !rht_node->map[i]->ctr)
   1090			continue;
   1091
   1092		seq_printf(s, " vl%d: [", i);
   1093
   1094		for (j = 0; j < rht_node->map[i]->ctr; j++) {
   1095			if (!rht_node->map[i]->sde[j])
   1096				continue;
   1097
   1098			if (j > 0)
   1099				seq_puts(s, ",");
   1100
   1101			seq_printf(s, " sdma%2d",
   1102				   rht_node->map[i]->sde[j]->this_idx);
   1103		}
   1104		seq_puts(s, " ]");
   1105	}
   1106
   1107	seq_puts(s, "\n");
   1108}
   1109
   1110/*
   1111 * Free the indicated map struct
   1112 */
   1113static void sdma_map_free(struct sdma_vl_map *m)
   1114{
   1115	int i;
   1116
   1117	for (i = 0; m && i < m->actual_vls; i++)
   1118		kfree(m->map[i]);
   1119	kfree(m);
   1120}
   1121
   1122/*
   1123 * Handle RCU callback
   1124 */
   1125static void sdma_map_rcu_callback(struct rcu_head *list)
   1126{
   1127	struct sdma_vl_map *m = container_of(list, struct sdma_vl_map, list);
   1128
   1129	sdma_map_free(m);
   1130}
   1131
   1132/**
   1133 * sdma_map_init - called when # vls change
   1134 * @dd: hfi1_devdata
   1135 * @port: port number
   1136 * @num_vls: number of vls
   1137 * @vl_engines: per vl engine mapping (optional)
   1138 *
   1139 * This routine changes the mapping based on the number of vls.
   1140 *
   1141 * vl_engines is used to specify a non-uniform vl/engine loading. NULL
    1142 * implies auto computing the loading and giving each VL a uniform
   1143 * distribution of engines per VL.
   1144 *
   1145 * The auto algorithm computes the sde_per_vl and the number of extra
   1146 * engines.  Any extra engines are added from the last VL on down.
   1147 *
   1148 * rcu locking is used here to control access to the mapping fields.
   1149 *
   1150 * If either the num_vls or num_sdma are non-power of 2, the array sizes
   1151 * in the struct sdma_vl_map and the struct sdma_map_elem are rounded
   1152 * up to the next highest power of 2 and the first entry is reused
   1153 * in a round robin fashion.
   1154 *
   1155 * If an error occurs the map change is not done and the mapping is
   1156 * not changed.
   1157 *
   1158 */
   1159int sdma_map_init(struct hfi1_devdata *dd, u8 port, u8 num_vls, u8 *vl_engines)
   1160{
   1161	int i, j;
   1162	int extra, sde_per_vl;
   1163	int engine = 0;
   1164	u8 lvl_engines[OPA_MAX_VLS];
   1165	struct sdma_vl_map *oldmap, *newmap;
   1166
   1167	if (!(dd->flags & HFI1_HAS_SEND_DMA))
   1168		return 0;
   1169
   1170	if (!vl_engines) {
   1171		/* truncate divide */
   1172		sde_per_vl = dd->num_sdma / num_vls;
   1173		/* extras */
   1174		extra = dd->num_sdma % num_vls;
   1175		vl_engines = lvl_engines;
   1176		/* add extras from last vl down */
   1177		for (i = num_vls - 1; i >= 0; i--, extra--)
   1178			vl_engines[i] = sde_per_vl + (extra > 0 ? 1 : 0);
   1179	}
   1180	/* build new map */
   1181	newmap = kzalloc(
   1182		sizeof(struct sdma_vl_map) +
   1183			roundup_pow_of_two(num_vls) *
   1184			sizeof(struct sdma_map_elem *),
   1185		GFP_KERNEL);
   1186	if (!newmap)
   1187		goto bail;
   1188	newmap->actual_vls = num_vls;
   1189	newmap->vls = roundup_pow_of_two(num_vls);
   1190	newmap->mask = (1 << ilog2(newmap->vls)) - 1;
   1191	/* initialize back-map */
   1192	for (i = 0; i < TXE_NUM_SDMA_ENGINES; i++)
   1193		newmap->engine_to_vl[i] = -1;
   1194	for (i = 0; i < newmap->vls; i++) {
   1195		/* save for wrap around */
   1196		int first_engine = engine;
   1197
   1198		if (i < newmap->actual_vls) {
   1199			int sz = roundup_pow_of_two(vl_engines[i]);
   1200
   1201			/* only allocate once */
   1202			newmap->map[i] = kzalloc(
   1203				sizeof(struct sdma_map_elem) +
   1204					sz * sizeof(struct sdma_engine *),
   1205				GFP_KERNEL);
   1206			if (!newmap->map[i])
   1207				goto bail;
   1208			newmap->map[i]->mask = (1 << ilog2(sz)) - 1;
   1209			/* assign engines */
   1210			for (j = 0; j < sz; j++) {
   1211				newmap->map[i]->sde[j] =
   1212					&dd->per_sdma[engine];
   1213				if (++engine >= first_engine + vl_engines[i])
   1214					/* wrap back to first engine */
   1215					engine = first_engine;
   1216			}
   1217			/* assign back-map */
   1218			for (j = 0; j < vl_engines[i]; j++)
   1219				newmap->engine_to_vl[first_engine + j] = i;
   1220		} else {
   1221			/* just re-use entry without allocating */
   1222			newmap->map[i] = newmap->map[i % num_vls];
   1223		}
   1224		engine = first_engine + vl_engines[i];
   1225	}
   1226	/* newmap in hand, save old map */
   1227	spin_lock_irq(&dd->sde_map_lock);
   1228	oldmap = rcu_dereference_protected(dd->sdma_map,
   1229					   lockdep_is_held(&dd->sde_map_lock));
   1230
   1231	/* publish newmap */
   1232	rcu_assign_pointer(dd->sdma_map, newmap);
   1233
   1234	spin_unlock_irq(&dd->sde_map_lock);
   1235	/* success, free any old map after grace period */
   1236	if (oldmap)
   1237		call_rcu(&oldmap->list, sdma_map_rcu_callback);
   1238	return 0;
   1239bail:
   1240	/* free any partial allocation */
   1241	sdma_map_free(newmap);
   1242	return -ENOMEM;
   1243}
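
/*
 * Worked example of the auto distribution above: with 16 engines and
 * 3 VLs, sde_per_vl is 5 and there is 1 extra engine, which is added
 * from the last VL down, giving vl_engines = {5, 5, 6}.
 */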
   1244
   1245/**
   1246 * sdma_clean - Clean up allocated memory
   1247 * @dd:          struct hfi1_devdata
   1248 * @num_engines: num sdma engines
   1249 *
   1250 * This routine can be called regardless of the success of
   1251 * sdma_init()
   1252 */
   1253void sdma_clean(struct hfi1_devdata *dd, size_t num_engines)
   1254{
   1255	size_t i;
   1256	struct sdma_engine *sde;
   1257
   1258	if (dd->sdma_pad_dma) {
   1259		dma_free_coherent(&dd->pcidev->dev, SDMA_PAD,
   1260				  (void *)dd->sdma_pad_dma,
   1261				  dd->sdma_pad_phys);
   1262		dd->sdma_pad_dma = NULL;
   1263		dd->sdma_pad_phys = 0;
   1264	}
   1265	if (dd->sdma_heads_dma) {
   1266		dma_free_coherent(&dd->pcidev->dev, dd->sdma_heads_size,
   1267				  (void *)dd->sdma_heads_dma,
   1268				  dd->sdma_heads_phys);
   1269		dd->sdma_heads_dma = NULL;
   1270		dd->sdma_heads_phys = 0;
   1271	}
   1272	for (i = 0; dd->per_sdma && i < num_engines; ++i) {
   1273		sde = &dd->per_sdma[i];
   1274
   1275		sde->head_dma = NULL;
   1276		sde->head_phys = 0;
   1277
   1278		if (sde->descq) {
   1279			dma_free_coherent(
   1280				&dd->pcidev->dev,
   1281				sde->descq_cnt * sizeof(u64[2]),
   1282				sde->descq,
   1283				sde->descq_phys
   1284			);
   1285			sde->descq = NULL;
   1286			sde->descq_phys = 0;
   1287		}
   1288		kvfree(sde->tx_ring);
   1289		sde->tx_ring = NULL;
   1290	}
   1291	if (rcu_access_pointer(dd->sdma_map)) {
   1292		spin_lock_irq(&dd->sde_map_lock);
   1293		sdma_map_free(rcu_access_pointer(dd->sdma_map));
   1294		RCU_INIT_POINTER(dd->sdma_map, NULL);
   1295		spin_unlock_irq(&dd->sde_map_lock);
   1296		synchronize_rcu();
   1297	}
   1298	kfree(dd->per_sdma);
   1299	dd->per_sdma = NULL;
   1300
   1301	if (dd->sdma_rht) {
   1302		rhashtable_free_and_destroy(dd->sdma_rht, sdma_rht_free, NULL);
   1303		kfree(dd->sdma_rht);
   1304		dd->sdma_rht = NULL;
   1305	}
   1306}
   1307
   1308/**
   1309 * sdma_init() - called when device probed
   1310 * @dd: hfi1_devdata
   1311 * @port: port number (currently only zero)
   1312 *
   1313 * Initializes each sde and its csrs.
   1314 * Interrupts are not required to be enabled.
   1315 *
   1316 * Returns:
   1317 * 0 - success, -errno on failure
   1318 */
   1319int sdma_init(struct hfi1_devdata *dd, u8 port)
   1320{
   1321	unsigned this_idx;
   1322	struct sdma_engine *sde;
   1323	struct rhashtable *tmp_sdma_rht;
   1324	u16 descq_cnt;
   1325	void *curr_head;
   1326	struct hfi1_pportdata *ppd = dd->pport + port;
   1327	u32 per_sdma_credits;
   1328	uint idle_cnt = sdma_idle_cnt;
   1329	size_t num_engines = chip_sdma_engines(dd);
   1330	int ret = -ENOMEM;
   1331
   1332	if (!HFI1_CAP_IS_KSET(SDMA)) {
   1333		HFI1_CAP_CLEAR(SDMA_AHG);
   1334		return 0;
   1335	}
   1336	if (mod_num_sdma &&
   1337	    /* can't exceed chip support */
   1338	    mod_num_sdma <= chip_sdma_engines(dd) &&
   1339	    /* count must be >= vls */
   1340	    mod_num_sdma >= num_vls)
   1341		num_engines = mod_num_sdma;
   1342
   1343	dd_dev_info(dd, "SDMA mod_num_sdma: %u\n", mod_num_sdma);
   1344	dd_dev_info(dd, "SDMA chip_sdma_engines: %u\n", chip_sdma_engines(dd));
   1345	dd_dev_info(dd, "SDMA chip_sdma_mem_size: %u\n",
   1346		    chip_sdma_mem_size(dd));
   1347
   1348	per_sdma_credits =
   1349		chip_sdma_mem_size(dd) / (num_engines * SDMA_BLOCK_SIZE);
   1350
   1351	/* set up freeze waitqueue */
   1352	init_waitqueue_head(&dd->sdma_unfreeze_wq);
   1353	atomic_set(&dd->sdma_unfreeze_count, 0);
   1354
   1355	descq_cnt = sdma_get_descq_cnt();
   1356	dd_dev_info(dd, "SDMA engines %zu descq_cnt %u\n",
   1357		    num_engines, descq_cnt);
   1358
   1359	/* alloc memory for array of send engines */
   1360	dd->per_sdma = kcalloc_node(num_engines, sizeof(*dd->per_sdma),
   1361				    GFP_KERNEL, dd->node);
   1362	if (!dd->per_sdma)
   1363		return ret;
   1364
   1365	idle_cnt = ns_to_cclock(dd, idle_cnt);
   1366	if (idle_cnt)
   1367		dd->default_desc1 =
   1368			SDMA_DESC1_HEAD_TO_HOST_FLAG;
   1369	else
   1370		dd->default_desc1 =
   1371			SDMA_DESC1_INT_REQ_FLAG;
   1372
   1373	if (!sdma_desct_intr)
   1374		sdma_desct_intr = SDMA_DESC_INTR;
   1375
   1376	/* Allocate memory for SendDMA descriptor FIFOs */
   1377	for (this_idx = 0; this_idx < num_engines; ++this_idx) {
   1378		sde = &dd->per_sdma[this_idx];
   1379		sde->dd = dd;
   1380		sde->ppd = ppd;
   1381		sde->this_idx = this_idx;
   1382		sde->descq_cnt = descq_cnt;
   1383		sde->desc_avail = sdma_descq_freecnt(sde);
   1384		sde->sdma_shift = ilog2(descq_cnt);
   1385		sde->sdma_mask = (1 << sde->sdma_shift) - 1;
   1386
   1387		/* Create a mask specifically for each interrupt source */
   1388		sde->int_mask = (u64)1 << (0 * TXE_NUM_SDMA_ENGINES +
   1389					   this_idx);
   1390		sde->progress_mask = (u64)1 << (1 * TXE_NUM_SDMA_ENGINES +
   1391						this_idx);
   1392		sde->idle_mask = (u64)1 << (2 * TXE_NUM_SDMA_ENGINES +
   1393					    this_idx);
   1394		/* Create a combined mask to cover all 3 interrupt sources */
   1395		sde->imask = sde->int_mask | sde->progress_mask |
   1396			     sde->idle_mask;
   1397
   1398		spin_lock_init(&sde->tail_lock);
   1399		seqlock_init(&sde->head_lock);
   1400		spin_lock_init(&sde->senddmactrl_lock);
   1401		spin_lock_init(&sde->flushlist_lock);
   1402		seqlock_init(&sde->waitlock);
    1403		/* ensure there is always a zero bit */
   1404		sde->ahg_bits = 0xfffffffe00000000ULL;
   1405
   1406		sdma_set_state(sde, sdma_state_s00_hw_down);
   1407
   1408		/* set up reference counting */
   1409		kref_init(&sde->state.kref);
   1410		init_completion(&sde->state.comp);
   1411
   1412		INIT_LIST_HEAD(&sde->flushlist);
   1413		INIT_LIST_HEAD(&sde->dmawait);
   1414
   1415		sde->tail_csr =
   1416			get_kctxt_csr_addr(dd, this_idx, SD(TAIL));
   1417
   1418		tasklet_setup(&sde->sdma_hw_clean_up_task,
   1419			      sdma_hw_clean_up_task);
   1420		tasklet_setup(&sde->sdma_sw_clean_up_task,
   1421			      sdma_sw_clean_up_task);
   1422		INIT_WORK(&sde->err_halt_worker, sdma_err_halt_wait);
   1423		INIT_WORK(&sde->flush_worker, sdma_field_flush);
   1424
   1425		sde->progress_check_head = 0;
   1426
   1427		timer_setup(&sde->err_progress_check_timer,
   1428			    sdma_err_progress_check, 0);
   1429
   1430		sde->descq = dma_alloc_coherent(&dd->pcidev->dev,
   1431						descq_cnt * sizeof(u64[2]),
   1432						&sde->descq_phys, GFP_KERNEL);
   1433		if (!sde->descq)
   1434			goto bail;
   1435		sde->tx_ring =
   1436			kvzalloc_node(array_size(descq_cnt,
   1437						 sizeof(struct sdma_txreq *)),
   1438				      GFP_KERNEL, dd->node);
   1439		if (!sde->tx_ring)
   1440			goto bail;
   1441	}
   1442
   1443	dd->sdma_heads_size = L1_CACHE_BYTES * num_engines;
   1444	/* Allocate memory for DMA of head registers to memory */
   1445	dd->sdma_heads_dma = dma_alloc_coherent(&dd->pcidev->dev,
   1446						dd->sdma_heads_size,
   1447						&dd->sdma_heads_phys,
   1448						GFP_KERNEL);
   1449	if (!dd->sdma_heads_dma) {
   1450		dd_dev_err(dd, "failed to allocate SendDMA head memory\n");
   1451		goto bail;
   1452	}
   1453
   1454	/* Allocate memory for pad */
   1455	dd->sdma_pad_dma = dma_alloc_coherent(&dd->pcidev->dev, SDMA_PAD,
   1456					      &dd->sdma_pad_phys, GFP_KERNEL);
   1457	if (!dd->sdma_pad_dma) {
   1458		dd_dev_err(dd, "failed to allocate SendDMA pad memory\n");
   1459		goto bail;
   1460	}
   1461
   1462	/* assign each engine to different cacheline and init registers */
   1463	curr_head = (void *)dd->sdma_heads_dma;
   1464	for (this_idx = 0; this_idx < num_engines; ++this_idx) {
   1465		unsigned long phys_offset;
   1466
   1467		sde = &dd->per_sdma[this_idx];
   1468
   1469		sde->head_dma = curr_head;
   1470		curr_head += L1_CACHE_BYTES;
   1471		phys_offset = (unsigned long)sde->head_dma -
   1472			      (unsigned long)dd->sdma_heads_dma;
   1473		sde->head_phys = dd->sdma_heads_phys + phys_offset;
   1474		init_sdma_regs(sde, per_sdma_credits, idle_cnt);
   1475	}
   1476	dd->flags |= HFI1_HAS_SEND_DMA;
   1477	dd->flags |= idle_cnt ? HFI1_HAS_SDMA_TIMEOUT : 0;
   1478	dd->num_sdma = num_engines;
   1479	ret = sdma_map_init(dd, port, ppd->vls_operational, NULL);
   1480	if (ret < 0)
   1481		goto bail;
   1482
   1483	tmp_sdma_rht = kzalloc(sizeof(*tmp_sdma_rht), GFP_KERNEL);
   1484	if (!tmp_sdma_rht) {
   1485		ret = -ENOMEM;
   1486		goto bail;
   1487	}
   1488
   1489	ret = rhashtable_init(tmp_sdma_rht, &sdma_rht_params);
   1490	if (ret < 0) {
   1491		kfree(tmp_sdma_rht);
   1492		goto bail;
   1493	}
   1494
   1495	dd->sdma_rht = tmp_sdma_rht;
   1496
   1497	dd_dev_info(dd, "SDMA num_sdma: %u\n", dd->num_sdma);
   1498	return 0;
   1499
   1500bail:
   1501	sdma_clean(dd, num_engines);
   1502	return ret;
   1503}
   1504
   1505/**
   1506 * sdma_all_running() - called when the link goes up
   1507 * @dd: hfi1_devdata
   1508 *
   1509 * This routine moves all engines to the running state.
   1510 */
   1511void sdma_all_running(struct hfi1_devdata *dd)
   1512{
   1513	struct sdma_engine *sde;
   1514	unsigned int i;
   1515
   1516	/* move all engines to running */
   1517	for (i = 0; i < dd->num_sdma; ++i) {
   1518		sde = &dd->per_sdma[i];
   1519		sdma_process_event(sde, sdma_event_e30_go_running);
   1520	}
   1521}
   1522
   1523/**
   1524 * sdma_all_idle() - called when the link goes down
   1525 * @dd: hfi1_devdata
   1526 *
   1527 * This routine moves all engines to the idle state.
   1528 */
   1529void sdma_all_idle(struct hfi1_devdata *dd)
   1530{
   1531	struct sdma_engine *sde;
   1532	unsigned int i;
   1533
   1534	/* idle all engines */
   1535	for (i = 0; i < dd->num_sdma; ++i) {
   1536		sde = &dd->per_sdma[i];
   1537		sdma_process_event(sde, sdma_event_e70_go_idle);
   1538	}
   1539}
   1540
   1541/**
   1542 * sdma_start() - called to kick off state processing for all engines
   1543 * @dd: hfi1_devdata
   1544 *
   1545 * This routine is for kicking off the state processing for all required
   1546 * sdma engines.  Interrupts need to be working at this point.
   1547 *
   1548 */
   1549void sdma_start(struct hfi1_devdata *dd)
   1550{
   1551	unsigned i;
   1552	struct sdma_engine *sde;
   1553
   1554	/* kick off the engines state processing */
   1555	for (i = 0; i < dd->num_sdma; ++i) {
   1556		sde = &dd->per_sdma[i];
   1557		sdma_process_event(sde, sdma_event_e10_go_hw_start);
   1558	}
   1559}
   1560
   1561/**
   1562 * sdma_exit() - used when module is removed
   1563 * @dd: hfi1_devdata
   1564 */
   1565void sdma_exit(struct hfi1_devdata *dd)
   1566{
   1567	unsigned this_idx;
   1568	struct sdma_engine *sde;
   1569
   1570	for (this_idx = 0; dd->per_sdma && this_idx < dd->num_sdma;
   1571			++this_idx) {
   1572		sde = &dd->per_sdma[this_idx];
   1573		if (!list_empty(&sde->dmawait))
   1574			dd_dev_err(dd, "sde %u: dmawait list not empty!\n",
   1575				   sde->this_idx);
   1576		sdma_process_event(sde, sdma_event_e00_go_hw_down);
   1577
   1578		del_timer_sync(&sde->err_progress_check_timer);
   1579
   1580		/*
   1581		 * This waits for the state machine to exit so it is not
   1582		 * necessary to kill the sdma_sw_clean_up_task to make sure
   1583		 * it is not running.
   1584		 */
   1585		sdma_finalput(&sde->state);
   1586	}
   1587}
   1588
   1589/*
   1590 * unmap the indicated descriptor
   1591 */
   1592static inline void sdma_unmap_desc(
   1593	struct hfi1_devdata *dd,
   1594	struct sdma_desc *descp)
   1595{
   1596	switch (sdma_mapping_type(descp)) {
   1597	case SDMA_MAP_SINGLE:
   1598		dma_unmap_single(
   1599			&dd->pcidev->dev,
   1600			sdma_mapping_addr(descp),
   1601			sdma_mapping_len(descp),
   1602			DMA_TO_DEVICE);
   1603		break;
   1604	case SDMA_MAP_PAGE:
   1605		dma_unmap_page(
   1606			&dd->pcidev->dev,
   1607			sdma_mapping_addr(descp),
   1608			sdma_mapping_len(descp),
   1609			DMA_TO_DEVICE);
   1610		break;
   1611	}
   1612}
   1613
   1614/*
   1615 * return the mode as indicated by the first
   1616 * descriptor in the tx.
   1617 */
   1618static inline u8 ahg_mode(struct sdma_txreq *tx)
   1619{
   1620	return (tx->descp[0].qw[1] & SDMA_DESC1_HEADER_MODE_SMASK)
   1621		>> SDMA_DESC1_HEADER_MODE_SHIFT;
   1622}
   1623
   1624/**
   1625 * __sdma_txclean() - clean tx of mappings, descp *kmalloc's
   1626 * @dd: hfi1_devdata for unmapping
   1627 * @tx: tx request to clean
   1628 *
   1629 * This is used in the progress routine to clean the tx or
   1630 * by the ULP to toss an in-process tx build.
   1631 *
   1632 * The code can be called multiple times without issue.
   1633 *
   1634 */
   1635void __sdma_txclean(
   1636	struct hfi1_devdata *dd,
   1637	struct sdma_txreq *tx)
   1638{
   1639	u16 i;
   1640
   1641	if (tx->num_desc) {
   1642		u8 skip = 0, mode = ahg_mode(tx);
   1643
   1644		/* unmap first */
   1645		sdma_unmap_desc(dd, &tx->descp[0]);
   1646		/* determine number of AHG descriptors to skip */
   1647		if (mode > SDMA_AHG_APPLY_UPDATE1)
   1648			skip = mode >> 1;
   1649		for (i = 1 + skip; i < tx->num_desc; i++)
   1650			sdma_unmap_desc(dd, &tx->descp[i]);
   1651		tx->num_desc = 0;
   1652	}
   1653	kfree(tx->coalesce_buf);
   1654	tx->coalesce_buf = NULL;
   1655	/* kmalloc'ed descp */
   1656	if (unlikely(tx->desc_limit > ARRAY_SIZE(tx->descs))) {
   1657		tx->desc_limit = ARRAY_SIZE(tx->descs);
   1658		kfree(tx->descp);
   1659	}
   1660}
   1661
   1662static inline u16 sdma_gethead(struct sdma_engine *sde)
   1663{
   1664	struct hfi1_devdata *dd = sde->dd;
   1665	int use_dmahead;
   1666	u16 hwhead;
   1667
   1668#ifdef CONFIG_SDMA_VERBOSITY
   1669	dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n",
   1670		   sde->this_idx, slashstrip(__FILE__), __LINE__, __func__);
   1671#endif
   1672
   1673retry:
   1674	use_dmahead = HFI1_CAP_IS_KSET(USE_SDMA_HEAD) && __sdma_running(sde) &&
   1675					(dd->flags & HFI1_HAS_SDMA_TIMEOUT);
   1676	hwhead = use_dmahead ?
   1677		(u16)le64_to_cpu(*sde->head_dma) :
   1678		(u16)read_sde_csr(sde, SD(HEAD));
   1679
   1680	if (unlikely(HFI1_CAP_IS_KSET(SDMA_HEAD_CHECK))) {
   1681		u16 cnt;
   1682		u16 swtail;
   1683		u16 swhead;
   1684		int sane;
   1685
   1686		swhead = sde->descq_head & sde->sdma_mask;
   1687		/* this code is really bad for cache line trading */
   1688		swtail = READ_ONCE(sde->descq_tail) & sde->sdma_mask;
   1689		cnt = sde->descq_cnt;
   1690
   1691		if (swhead < swtail)
   1692			/* not wrapped */
   1693			sane = (hwhead >= swhead) & (hwhead <= swtail);
   1694		else if (swhead > swtail)
   1695			/* wrapped around */
   1696			sane = ((hwhead >= swhead) && (hwhead < cnt)) ||
   1697				(hwhead <= swtail);
   1698		else
   1699			/* empty */
   1700			sane = (hwhead == swhead);
   1701
   1702		if (unlikely(!sane)) {
   1703			dd_dev_err(dd, "SDMA(%u) bad head (%s) hwhd=%u swhd=%u swtl=%u cnt=%u\n",
   1704				   sde->this_idx,
   1705				   use_dmahead ? "dma" : "kreg",
   1706				   hwhead, swhead, swtail, cnt);
   1707			if (use_dmahead) {
   1708				/* try one more time, using csr */
   1709				use_dmahead = 0;
   1710				goto retry;
   1711			}
   1712			/* proceed as if no progress */
   1713			hwhead = swhead;
   1714		}
   1715	}
   1716	return hwhead;
   1717}
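
/*
 * Example of the sanity check above: if swhead=10 and swtail=100 (no
 * wrap) any hwhead in [10, 100] is sane; if swhead=100 and swtail=10
 * (wrapped) hwhead must lie in [100, cnt) or be <= 10; if head and
 * tail are equal the ring is empty and hwhead must equal swhead.
 */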
   1718
   1719/*
   1720 * This is called when there are send DMA descriptors that might be
   1721 * available.
   1722 *
   1723 * This is called with head_lock held.
   1724 */
   1725static void sdma_desc_avail(struct sdma_engine *sde, uint avail)
   1726{
   1727	struct iowait *wait, *nw, *twait;
   1728	struct iowait *waits[SDMA_WAIT_BATCH_SIZE];
   1729	uint i, n = 0, seq, tidx = 0;
   1730
   1731#ifdef CONFIG_SDMA_VERBOSITY
   1732	dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx,
   1733		   slashstrip(__FILE__), __LINE__, __func__);
   1734	dd_dev_err(sde->dd, "avail: %u\n", avail);
   1735#endif
   1736
   1737	do {
   1738		seq = read_seqbegin(&sde->waitlock);
   1739		if (!list_empty(&sde->dmawait)) {
   1740			/* at least one item */
   1741			write_seqlock(&sde->waitlock);
   1742			/* Harvest waiters wanting DMA descriptors */
   1743			list_for_each_entry_safe(
   1744					wait,
   1745					nw,
   1746					&sde->dmawait,
   1747					list) {
   1748				u32 num_desc;
   1749
   1750				if (!wait->wakeup)
   1751					continue;
   1752				if (n == ARRAY_SIZE(waits))
   1753					break;
   1754				iowait_init_priority(wait);
   1755				num_desc = iowait_get_all_desc(wait);
   1756				if (num_desc > avail)
   1757					break;
   1758				avail -= num_desc;
    1759				/* Find the top-priority wait member */
   1760				if (n) {
   1761					twait = waits[tidx];
   1762					tidx =
   1763					    iowait_priority_update_top(wait,
   1764								       twait,
   1765								       n,
   1766								       tidx);
   1767				}
   1768				list_del_init(&wait->list);
   1769				waits[n++] = wait;
   1770			}
   1771			write_sequnlock(&sde->waitlock);
   1772			break;
   1773		}
   1774	} while (read_seqretry(&sde->waitlock, seq));
   1775
   1776	/* Schedule the top-priority entry first */
   1777	if (n)
   1778		waits[tidx]->wakeup(waits[tidx], SDMA_AVAIL_REASON);
   1779
   1780	for (i = 0; i < n; i++)
   1781		if (i != tidx)
   1782			waits[i]->wakeup(waits[i], SDMA_AVAIL_REASON);
   1783}
   1784
   1785/* head_lock must be held */
   1786static void sdma_make_progress(struct sdma_engine *sde, u64 status)
   1787{
   1788	struct sdma_txreq *txp = NULL;
   1789	int progress = 0;
   1790	u16 hwhead, swhead;
   1791	int idle_check_done = 0;
   1792
   1793	hwhead = sdma_gethead(sde);
   1794
   1795	/* The reason for some of the complexity of this code is that
   1796	 * not all descriptors have corresponding txps.  So, we have to
   1797	 * be able to skip over descs until we wander into the range of
   1798	 * the next txp on the list.
   1799	 */
   1800
   1801retry:
   1802	txp = get_txhead(sde);
   1803	swhead = sde->descq_head & sde->sdma_mask;
   1804	trace_hfi1_sdma_progress(sde, hwhead, swhead, txp);
   1805	while (swhead != hwhead) {
   1806		/* advance head, wrap if needed */
   1807		swhead = ++sde->descq_head & sde->sdma_mask;
   1808
   1809		/* if now past this txp's descs, do the callback */
   1810		if (txp && txp->next_descq_idx == swhead) {
   1811			/* remove from list */
   1812			sde->tx_ring[sde->tx_head++ & sde->sdma_mask] = NULL;
   1813			complete_tx(sde, txp, SDMA_TXREQ_S_OK);
   1814			/* see if there is another txp */
   1815			txp = get_txhead(sde);
   1816		}
   1817		trace_hfi1_sdma_progress(sde, hwhead, swhead, txp);
   1818		progress++;
   1819	}
   1820
   1821	/*
   1822	 * The SDMA idle interrupt is not guaranteed to be ordered with respect
   1823	 * to updates to the dma_head location in host memory. The head
   1824	 * value read might not be fully up to date. If there are pending
   1825	 * descriptors and the SDMA idle interrupt fired then read from the
   1826	 * CSR SDMA head instead to get the latest value from the hardware.
    1827	 * The hardware SDMA head should be read at most once in this invocation
    1828	 * of sdma_make_progress(), which is ensured by the idle_check_done flag.
   1829	 */
   1830	if ((status & sde->idle_mask) && !idle_check_done) {
   1831		u16 swtail;
   1832
   1833		swtail = READ_ONCE(sde->descq_tail) & sde->sdma_mask;
   1834		if (swtail != hwhead) {
   1835			hwhead = (u16)read_sde_csr(sde, SD(HEAD));
   1836			idle_check_done = 1;
   1837			goto retry;
   1838		}
   1839	}
   1840
   1841	sde->last_status = status;
   1842	if (progress)
   1843		sdma_desc_avail(sde, sdma_descq_freecnt(sde));
   1844}
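
/*
 * Illustrative trace of the retirement loop above (hypothetical values):
 * with hwhead = 10, descq_head = 6 and the head txreq having
 * next_descq_idx = 8, swhead advances through 7, 8, 9, 10; at
 * swhead == 8 that txreq is removed from tx_ring and completed with
 * SDMA_TXREQ_S_OK, and get_txhead() is consulted for the next one.
 * Four descriptors of progress are counted, so sdma_desc_avail() runs
 * with the updated free count.
 */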
   1845
   1846/*
   1847 * sdma_engine_interrupt() - interrupt handler for engine
   1848 * @sde: sdma engine
   1849 * @status: sdma interrupt reason
   1850 *
   1851 * Status is a mask of the 3 possible interrupts for this engine.  It will
    1852 * contain bits _only_ for this SDMA engine.  It will contain at least one
    1853 * bit and may contain more.
   1854 */
   1855void sdma_engine_interrupt(struct sdma_engine *sde, u64 status)
   1856{
   1857	trace_hfi1_sdma_engine_interrupt(sde, status);
   1858	write_seqlock(&sde->head_lock);
   1859	sdma_set_desc_cnt(sde, sdma_desct_intr);
   1860	if (status & sde->idle_mask)
   1861		sde->idle_int_cnt++;
   1862	else if (status & sde->progress_mask)
   1863		sde->progress_int_cnt++;
   1864	else if (status & sde->int_mask)
   1865		sde->sdma_int_cnt++;
   1866	sdma_make_progress(sde, status);
   1867	write_sequnlock(&sde->head_lock);
   1868}
   1869
   1870/**
   1871 * sdma_engine_error() - error handler for engine
   1872 * @sde: sdma engine
   1873 * @status: sdma interrupt reason
   1874 */
   1875void sdma_engine_error(struct sdma_engine *sde, u64 status)
   1876{
   1877	unsigned long flags;
   1878
   1879#ifdef CONFIG_SDMA_VERBOSITY
   1880	dd_dev_err(sde->dd, "CONFIG SDMA(%u) error status 0x%llx state %s\n",
   1881		   sde->this_idx,
   1882		   (unsigned long long)status,
   1883		   sdma_state_names[sde->state.current_state]);
   1884#endif
   1885	spin_lock_irqsave(&sde->tail_lock, flags);
   1886	write_seqlock(&sde->head_lock);
   1887	if (status & ALL_SDMA_ENG_HALT_ERRS)
   1888		__sdma_process_event(sde, sdma_event_e60_hw_halted);
   1889	if (status & ~SD(ENG_ERR_STATUS_SDMA_HALT_ERR_SMASK)) {
   1890		dd_dev_err(sde->dd,
   1891			   "SDMA (%u) engine error: 0x%llx state %s\n",
   1892			   sde->this_idx,
   1893			   (unsigned long long)status,
   1894			   sdma_state_names[sde->state.current_state]);
   1895		dump_sdma_state(sde);
   1896	}
   1897	write_sequnlock(&sde->head_lock);
   1898	spin_unlock_irqrestore(&sde->tail_lock, flags);
   1899}
   1900
   1901static void sdma_sendctrl(struct sdma_engine *sde, unsigned op)
   1902{
   1903	u64 set_senddmactrl = 0;
   1904	u64 clr_senddmactrl = 0;
   1905	unsigned long flags;
   1906
   1907#ifdef CONFIG_SDMA_VERBOSITY
   1908	dd_dev_err(sde->dd, "CONFIG SDMA(%u) senddmactrl E=%d I=%d H=%d C=%d\n",
   1909		   sde->this_idx,
   1910		   (op & SDMA_SENDCTRL_OP_ENABLE) ? 1 : 0,
   1911		   (op & SDMA_SENDCTRL_OP_INTENABLE) ? 1 : 0,
   1912		   (op & SDMA_SENDCTRL_OP_HALT) ? 1 : 0,
   1913		   (op & SDMA_SENDCTRL_OP_CLEANUP) ? 1 : 0);
   1914#endif
   1915
   1916	if (op & SDMA_SENDCTRL_OP_ENABLE)
   1917		set_senddmactrl |= SD(CTRL_SDMA_ENABLE_SMASK);
   1918	else
   1919		clr_senddmactrl |= SD(CTRL_SDMA_ENABLE_SMASK);
   1920
   1921	if (op & SDMA_SENDCTRL_OP_INTENABLE)
   1922		set_senddmactrl |= SD(CTRL_SDMA_INT_ENABLE_SMASK);
   1923	else
   1924		clr_senddmactrl |= SD(CTRL_SDMA_INT_ENABLE_SMASK);
   1925
   1926	if (op & SDMA_SENDCTRL_OP_HALT)
   1927		set_senddmactrl |= SD(CTRL_SDMA_HALT_SMASK);
   1928	else
   1929		clr_senddmactrl |= SD(CTRL_SDMA_HALT_SMASK);
   1930
   1931	spin_lock_irqsave(&sde->senddmactrl_lock, flags);
   1932
   1933	sde->p_senddmactrl |= set_senddmactrl;
   1934	sde->p_senddmactrl &= ~clr_senddmactrl;
   1935
   1936	if (op & SDMA_SENDCTRL_OP_CLEANUP)
   1937		write_sde_csr(sde, SD(CTRL),
   1938			      sde->p_senddmactrl |
   1939			      SD(CTRL_SDMA_CLEANUP_SMASK));
   1940	else
   1941		write_sde_csr(sde, SD(CTRL), sde->p_senddmactrl);
   1942
   1943	spin_unlock_irqrestore(&sde->senddmactrl_lock, flags);
   1944
   1945#ifdef CONFIG_SDMA_VERBOSITY
   1946	sdma_dumpstate(sde);
   1947#endif
   1948}
   1949
   1950static void sdma_setlengen(struct sdma_engine *sde)
   1951{
   1952#ifdef CONFIG_SDMA_VERBOSITY
   1953	dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n",
   1954		   sde->this_idx, slashstrip(__FILE__), __LINE__, __func__);
   1955#endif
   1956
   1957	/*
   1958	 * Set SendDmaLenGen and clear-then-set the MSB of the generation
   1959	 * count to enable generation checking and load the internal
   1960	 * generation counter.
   1961	 */
   1962	write_sde_csr(sde, SD(LEN_GEN),
   1963		      (sde->descq_cnt / 64) << SD(LEN_GEN_LENGTH_SHIFT));
   1964	write_sde_csr(sde, SD(LEN_GEN),
   1965		      ((sde->descq_cnt / 64) << SD(LEN_GEN_LENGTH_SHIFT)) |
   1966		      (4ULL << SD(LEN_GEN_GENERATION_SHIFT)));
   1967}
   1968
   1969static inline void sdma_update_tail(struct sdma_engine *sde, u16 tail)
   1970{
   1971	/* Commit writes to memory and advance the tail on the chip */
   1972	smp_wmb(); /* see get_txhead() */
   1973	writeq(tail, sde->tail_csr);
   1974}
   1975
   1976/*
   1977 * This is called when changing to state s10_hw_start_up_halt_wait as
   1978 * a result of send buffer errors or send DMA descriptor errors.
   1979 */
   1980static void sdma_hw_start_up(struct sdma_engine *sde)
   1981{
   1982	u64 reg;
   1983
   1984#ifdef CONFIG_SDMA_VERBOSITY
   1985	dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n",
   1986		   sde->this_idx, slashstrip(__FILE__), __LINE__, __func__);
   1987#endif
   1988
   1989	sdma_setlengen(sde);
   1990	sdma_update_tail(sde, 0); /* Set SendDmaTail */
   1991	*sde->head_dma = 0;
   1992
   1993	reg = SD(ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_MASK) <<
   1994	      SD(ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SHIFT);
   1995	write_sde_csr(sde, SD(ENG_ERR_CLEAR), reg);
   1996}
   1997
   1998/*
   1999 * set_sdma_integrity
   2000 *
   2001 * Set the SEND_DMA_CHECK_ENABLE register for send DMA engine 'sde'.
   2002 */
   2003static void set_sdma_integrity(struct sdma_engine *sde)
   2004{
   2005	struct hfi1_devdata *dd = sde->dd;
   2006
   2007	write_sde_csr(sde, SD(CHECK_ENABLE),
   2008		      hfi1_pkt_base_sdma_integrity(dd));
   2009}
   2010
   2011static void init_sdma_regs(
   2012	struct sdma_engine *sde,
   2013	u32 credits,
   2014	uint idle_cnt)
   2015{
   2016	u8 opval, opmask;
   2017#ifdef CONFIG_SDMA_VERBOSITY
   2018	struct hfi1_devdata *dd = sde->dd;
   2019
   2020	dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n",
   2021		   sde->this_idx, slashstrip(__FILE__), __LINE__, __func__);
   2022#endif
   2023
   2024	write_sde_csr(sde, SD(BASE_ADDR), sde->descq_phys);
   2025	sdma_setlengen(sde);
   2026	sdma_update_tail(sde, 0); /* Set SendDmaTail */
   2027	write_sde_csr(sde, SD(RELOAD_CNT), idle_cnt);
   2028	write_sde_csr(sde, SD(DESC_CNT), 0);
   2029	write_sde_csr(sde, SD(HEAD_ADDR), sde->head_phys);
   2030	write_sde_csr(sde, SD(MEMORY),
   2031		      ((u64)credits << SD(MEMORY_SDMA_MEMORY_CNT_SHIFT)) |
   2032		      ((u64)(credits * sde->this_idx) <<
   2033		       SD(MEMORY_SDMA_MEMORY_INDEX_SHIFT)));
   2034	write_sde_csr(sde, SD(ENG_ERR_MASK), ~0ull);
   2035	set_sdma_integrity(sde);
   2036	opmask = OPCODE_CHECK_MASK_DISABLED;
   2037	opval = OPCODE_CHECK_VAL_DISABLED;
   2038	write_sde_csr(sde, SD(CHECK_OPCODE),
   2039		      (opmask << SEND_CTXT_CHECK_OPCODE_MASK_SHIFT) |
   2040		      (opval << SEND_CTXT_CHECK_OPCODE_VALUE_SHIFT));
   2041}
   2042
   2043#ifdef CONFIG_SDMA_VERBOSITY
   2044
   2045#define sdma_dumpstate_helper0(reg) do { \
   2046		csr = read_csr(sde->dd, reg); \
   2047		dd_dev_err(sde->dd, "%36s     0x%016llx\n", #reg, csr); \
   2048	} while (0)
   2049
   2050#define sdma_dumpstate_helper(reg) do { \
   2051		csr = read_sde_csr(sde, reg); \
   2052		dd_dev_err(sde->dd, "%36s[%02u] 0x%016llx\n", \
   2053			#reg, sde->this_idx, csr); \
   2054	} while (0)
   2055
   2056#define sdma_dumpstate_helper2(reg) do { \
   2057		csr = read_csr(sde->dd, reg + (8 * i)); \
   2058		dd_dev_err(sde->dd, "%33s_%02u     0x%016llx\n", \
   2059				#reg, i, csr); \
   2060	} while (0)
   2061
   2062void sdma_dumpstate(struct sdma_engine *sde)
   2063{
   2064	u64 csr;
   2065	unsigned i;
   2066
   2067	sdma_dumpstate_helper(SD(CTRL));
   2068	sdma_dumpstate_helper(SD(STATUS));
   2069	sdma_dumpstate_helper0(SD(ERR_STATUS));
   2070	sdma_dumpstate_helper0(SD(ERR_MASK));
   2071	sdma_dumpstate_helper(SD(ENG_ERR_STATUS));
   2072	sdma_dumpstate_helper(SD(ENG_ERR_MASK));
   2073
   2074	for (i = 0; i < CCE_NUM_INT_CSRS; ++i) {
   2075		sdma_dumpstate_helper2(CCE_INT_STATUS);
   2076		sdma_dumpstate_helper2(CCE_INT_MASK);
   2077		sdma_dumpstate_helper2(CCE_INT_BLOCKED);
   2078	}
   2079
   2080	sdma_dumpstate_helper(SD(TAIL));
   2081	sdma_dumpstate_helper(SD(HEAD));
   2082	sdma_dumpstate_helper(SD(PRIORITY_THLD));
   2083	sdma_dumpstate_helper(SD(IDLE_CNT));
   2084	sdma_dumpstate_helper(SD(RELOAD_CNT));
   2085	sdma_dumpstate_helper(SD(DESC_CNT));
   2086	sdma_dumpstate_helper(SD(DESC_FETCHED_CNT));
   2087	sdma_dumpstate_helper(SD(MEMORY));
   2088	sdma_dumpstate_helper0(SD(ENGINES));
   2089	sdma_dumpstate_helper0(SD(MEM_SIZE));
   2090	/* sdma_dumpstate_helper(SEND_EGRESS_SEND_DMA_STATUS);  */
   2091	sdma_dumpstate_helper(SD(BASE_ADDR));
   2092	sdma_dumpstate_helper(SD(LEN_GEN));
   2093	sdma_dumpstate_helper(SD(HEAD_ADDR));
   2094	sdma_dumpstate_helper(SD(CHECK_ENABLE));
   2095	sdma_dumpstate_helper(SD(CHECK_VL));
   2096	sdma_dumpstate_helper(SD(CHECK_JOB_KEY));
   2097	sdma_dumpstate_helper(SD(CHECK_PARTITION_KEY));
   2098	sdma_dumpstate_helper(SD(CHECK_SLID));
   2099	sdma_dumpstate_helper(SD(CHECK_OPCODE));
   2100}
   2101#endif
   2102
   2103static void dump_sdma_state(struct sdma_engine *sde)
   2104{
   2105	struct hw_sdma_desc *descqp;
   2106	u64 desc[2];
   2107	u64 addr;
   2108	u8 gen;
   2109	u16 len;
   2110	u16 head, tail, cnt;
   2111
   2112	head = sde->descq_head & sde->sdma_mask;
   2113	tail = sde->descq_tail & sde->sdma_mask;
   2114	cnt = sdma_descq_freecnt(sde);
   2115
   2116	dd_dev_err(sde->dd,
   2117		   "SDMA (%u) descq_head: %u descq_tail: %u freecnt: %u FLE %d\n",
   2118		   sde->this_idx, head, tail, cnt,
   2119		   !list_empty(&sde->flushlist));
   2120
   2121	/* print info for each entry in the descriptor queue */
   2122	while (head != tail) {
   2123		char flags[6] = { 'x', 'x', 'x', 'x', 0 };
   2124
   2125		descqp = &sde->descq[head];
   2126		desc[0] = le64_to_cpu(descqp->qw[0]);
   2127		desc[1] = le64_to_cpu(descqp->qw[1]);
   2128		flags[0] = (desc[1] & SDMA_DESC1_INT_REQ_FLAG) ? 'I' : '-';
   2129		flags[1] = (desc[1] & SDMA_DESC1_HEAD_TO_HOST_FLAG) ?
   2130				'H' : '-';
   2131		flags[2] = (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG) ? 'F' : '-';
   2132		flags[3] = (desc[0] & SDMA_DESC0_LAST_DESC_FLAG) ? 'L' : '-';
   2133		addr = (desc[0] >> SDMA_DESC0_PHY_ADDR_SHIFT)
   2134			& SDMA_DESC0_PHY_ADDR_MASK;
   2135		gen = (desc[1] >> SDMA_DESC1_GENERATION_SHIFT)
   2136			& SDMA_DESC1_GENERATION_MASK;
   2137		len = (desc[0] >> SDMA_DESC0_BYTE_COUNT_SHIFT)
   2138			& SDMA_DESC0_BYTE_COUNT_MASK;
   2139		dd_dev_err(sde->dd,
   2140			   "SDMA sdmadesc[%u]: flags:%s addr:0x%016llx gen:%u len:%u bytes\n",
   2141			   head, flags, addr, gen, len);
   2142		dd_dev_err(sde->dd,
   2143			   "\tdesc0:0x%016llx desc1 0x%016llx\n",
   2144			   desc[0], desc[1]);
   2145		if (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG)
   2146			dd_dev_err(sde->dd,
   2147				   "\taidx: %u amode: %u alen: %u\n",
   2148				   (u8)((desc[1] &
   2149					 SDMA_DESC1_HEADER_INDEX_SMASK) >>
   2150					SDMA_DESC1_HEADER_INDEX_SHIFT),
   2151				   (u8)((desc[1] &
   2152					 SDMA_DESC1_HEADER_MODE_SMASK) >>
   2153					SDMA_DESC1_HEADER_MODE_SHIFT),
   2154				   (u8)((desc[1] &
   2155					 SDMA_DESC1_HEADER_DWS_SMASK) >>
   2156					SDMA_DESC1_HEADER_DWS_SHIFT));
   2157		head++;
   2158		head &= sde->sdma_mask;
   2159	}
   2160}
   2161
   2162#define SDE_FMT \
   2163	"SDE %u CPU %d STE %s C 0x%llx S 0x%016llx E 0x%llx T(HW) 0x%llx T(SW) 0x%x H(HW) 0x%llx H(SW) 0x%x H(D) 0x%llx DM 0x%llx GL 0x%llx R 0x%llx LIS 0x%llx AHGI 0x%llx TXT %u TXH %u DT %u DH %u FLNE %d DQF %u SLC 0x%llx\n"
   2164/**
   2165 * sdma_seqfile_dump_sde() - debugfs dump of sde
   2166 * @s: seq file
   2167 * @sde: send dma engine to dump
   2168 *
   2169 * This routine dumps the sde to the indicated seq file.
   2170 */
   2171void sdma_seqfile_dump_sde(struct seq_file *s, struct sdma_engine *sde)
   2172{
   2173	u16 head, tail;
   2174	struct hw_sdma_desc *descqp;
   2175	u64 desc[2];
   2176	u64 addr;
   2177	u8 gen;
   2178	u16 len;
   2179
   2180	head = sde->descq_head & sde->sdma_mask;
   2181	tail = READ_ONCE(sde->descq_tail) & sde->sdma_mask;
   2182	seq_printf(s, SDE_FMT, sde->this_idx,
   2183		   sde->cpu,
   2184		   sdma_state_name(sde->state.current_state),
   2185		   (unsigned long long)read_sde_csr(sde, SD(CTRL)),
   2186		   (unsigned long long)read_sde_csr(sde, SD(STATUS)),
   2187		   (unsigned long long)read_sde_csr(sde, SD(ENG_ERR_STATUS)),
   2188		   (unsigned long long)read_sde_csr(sde, SD(TAIL)), tail,
   2189		   (unsigned long long)read_sde_csr(sde, SD(HEAD)), head,
   2190		   (unsigned long long)le64_to_cpu(*sde->head_dma),
   2191		   (unsigned long long)read_sde_csr(sde, SD(MEMORY)),
   2192		   (unsigned long long)read_sde_csr(sde, SD(LEN_GEN)),
   2193		   (unsigned long long)read_sde_csr(sde, SD(RELOAD_CNT)),
   2194		   (unsigned long long)sde->last_status,
   2195		   (unsigned long long)sde->ahg_bits,
   2196		   sde->tx_tail,
   2197		   sde->tx_head,
   2198		   sde->descq_tail,
   2199		   sde->descq_head,
   2200		   !list_empty(&sde->flushlist),
   2201		   sde->descq_full_count,
   2202		   (unsigned long long)read_sde_csr(sde, SEND_DMA_CHECK_SLID));
   2203
   2204	/* print info for each entry in the descriptor queue */
   2205	while (head != tail) {
   2206		char flags[6] = { 'x', 'x', 'x', 'x', 0 };
   2207
   2208		descqp = &sde->descq[head];
   2209		desc[0] = le64_to_cpu(descqp->qw[0]);
   2210		desc[1] = le64_to_cpu(descqp->qw[1]);
   2211		flags[0] = (desc[1] & SDMA_DESC1_INT_REQ_FLAG) ? 'I' : '-';
   2212		flags[1] = (desc[1] & SDMA_DESC1_HEAD_TO_HOST_FLAG) ?
   2213				'H' : '-';
   2214		flags[2] = (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG) ? 'F' : '-';
   2215		flags[3] = (desc[0] & SDMA_DESC0_LAST_DESC_FLAG) ? 'L' : '-';
   2216		addr = (desc[0] >> SDMA_DESC0_PHY_ADDR_SHIFT)
   2217			& SDMA_DESC0_PHY_ADDR_MASK;
   2218		gen = (desc[1] >> SDMA_DESC1_GENERATION_SHIFT)
   2219			& SDMA_DESC1_GENERATION_MASK;
   2220		len = (desc[0] >> SDMA_DESC0_BYTE_COUNT_SHIFT)
   2221			& SDMA_DESC0_BYTE_COUNT_MASK;
   2222		seq_printf(s,
   2223			   "\tdesc[%u]: flags:%s addr:0x%016llx gen:%u len:%u bytes\n",
   2224			   head, flags, addr, gen, len);
   2225		if (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG)
   2226			seq_printf(s, "\t\tahgidx: %u ahgmode: %u\n",
   2227				   (u8)((desc[1] &
   2228					 SDMA_DESC1_HEADER_INDEX_SMASK) >>
   2229					SDMA_DESC1_HEADER_INDEX_SHIFT),
   2230				   (u8)((desc[1] &
   2231					 SDMA_DESC1_HEADER_MODE_SMASK) >>
   2232					SDMA_DESC1_HEADER_MODE_SHIFT));
   2233		head = (head + 1) & sde->sdma_mask;
   2234	}
   2235}
   2236
   2237/*
   2238 * add the generation number into
    2239 * qw1 and return it
   2240 */
   2241static inline u64 add_gen(struct sdma_engine *sde, u64 qw1)
   2242{
   2243	u8 generation = (sde->descq_tail >> sde->sdma_shift) & 3;
   2244
   2245	qw1 &= ~SDMA_DESC1_GENERATION_SMASK;
   2246	qw1 |= ((u64)generation & SDMA_DESC1_GENERATION_MASK)
   2247			<< SDMA_DESC1_GENERATION_SHIFT;
   2248	return qw1;
   2249}
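
/*
 * Generation example (assuming sdma_shift is ilog2(descq_cnt), as the
 * engine init sets it up): with a 2048-entry ring, descq_tail values
 * 0..2047 yield generation 0, 2048..4095 yield generation 1, and so on,
 * wrapping modulo 4.  The hardware compares this 2-bit value against its
 * internal generation counter (loaded by sdma_setlengen()) to detect
 * stale descriptors.
 */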
   2250
   2251/*
   2252 * This routine submits the indicated tx
   2253 *
   2254 * Space has already been guaranteed and
    2255 * the tail side of the ring is locked.
   2256 *
   2257 * The hardware tail update is done
   2258 * in the caller and that is facilitated
   2259 * by returning the new tail.
   2260 *
   2261 * There is special case logic for ahg
   2262 * to not add the generation number for
   2263 * up to 2 descriptors that follow the
   2264 * first descriptor.
   2265 *
   2266 */
   2267static inline u16 submit_tx(struct sdma_engine *sde, struct sdma_txreq *tx)
   2268{
   2269	int i;
   2270	u16 tail;
   2271	struct sdma_desc *descp = tx->descp;
   2272	u8 skip = 0, mode = ahg_mode(tx);
   2273
   2274	tail = sde->descq_tail & sde->sdma_mask;
   2275	sde->descq[tail].qw[0] = cpu_to_le64(descp->qw[0]);
   2276	sde->descq[tail].qw[1] = cpu_to_le64(add_gen(sde, descp->qw[1]));
   2277	trace_hfi1_sdma_descriptor(sde, descp->qw[0], descp->qw[1],
   2278				   tail, &sde->descq[tail]);
   2279	tail = ++sde->descq_tail & sde->sdma_mask;
   2280	descp++;
   2281	if (mode > SDMA_AHG_APPLY_UPDATE1)
   2282		skip = mode >> 1;
   2283	for (i = 1; i < tx->num_desc; i++, descp++) {
   2284		u64 qw1;
   2285
   2286		sde->descq[tail].qw[0] = cpu_to_le64(descp->qw[0]);
   2287		if (skip) {
   2288			/* edits don't have generation */
   2289			qw1 = descp->qw[1];
   2290			skip--;
   2291		} else {
   2292			/* replace generation with real one for non-edits */
   2293			qw1 = add_gen(sde, descp->qw[1]);
   2294		}
   2295		sde->descq[tail].qw[1] = cpu_to_le64(qw1);
   2296		trace_hfi1_sdma_descriptor(sde, descp->qw[0], qw1,
   2297					   tail, &sde->descq[tail]);
   2298		tail = ++sde->descq_tail & sde->sdma_mask;
   2299	}
   2300	tx->next_descq_idx = tail;
   2301#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
   2302	tx->sn = sde->tail_sn++;
   2303	trace_hfi1_sdma_in_sn(sde, tx->sn);
   2304	WARN_ON_ONCE(sde->tx_ring[sde->tx_tail & sde->sdma_mask]);
   2305#endif
   2306	sde->tx_ring[sde->tx_tail++ & sde->sdma_mask] = tx;
   2307	sde->desc_avail -= tx->num_desc;
   2308	return tail;
   2309}
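
/*
 * AHG skip example (assuming the usual mode encoding, in which
 * SDMA_AHG_APPLY_UPDATE2 and SDMA_AHG_APPLY_UPDATE3 consume one and two
 * extra descriptors): mode >> 1 is then 1 or 2, so that many descriptors
 * after descriptor 0 keep the qw[1] built by _sdma_txreq_ahgadd(), since
 * they carry AHG update words rather than a generation, while every
 * later descriptor gets the real generation via add_gen().  The same
 * skip count is used by __sdma_txclean() to avoid unmapping them.
 */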
   2310
   2311/*
   2312 * Check for progress
   2313 */
   2314static int sdma_check_progress(
   2315	struct sdma_engine *sde,
   2316	struct iowait_work *wait,
   2317	struct sdma_txreq *tx,
   2318	bool pkts_sent)
   2319{
   2320	int ret;
   2321
   2322	sde->desc_avail = sdma_descq_freecnt(sde);
   2323	if (tx->num_desc <= sde->desc_avail)
   2324		return -EAGAIN;
   2325	/* pulse the head_lock */
   2326	if (wait && iowait_ioww_to_iow(wait)->sleep) {
   2327		unsigned seq;
   2328
   2329		seq = raw_seqcount_begin(
   2330			(const seqcount_t *)&sde->head_lock.seqcount);
   2331		ret = wait->iow->sleep(sde, wait, tx, seq, pkts_sent);
   2332		if (ret == -EAGAIN)
   2333			sde->desc_avail = sdma_descq_freecnt(sde);
   2334	} else {
   2335		ret = -EBUSY;
   2336	}
   2337	return ret;
   2338}
   2339
   2340/**
   2341 * sdma_send_txreq() - submit a tx req to ring
   2342 * @sde: sdma engine to use
   2343 * @wait: SE wait structure to use when full (may be NULL)
   2344 * @tx: sdma_txreq to submit
   2345 * @pkts_sent: has any packet been sent yet?
   2346 *
    2347 * The call submits the tx into the ring.  If an iowait structure is non-NULL,
   2348 * the packet will be queued to the list in wait.
   2349 *
   2350 * Return:
   2351 * 0 - Success, -EINVAL - sdma_txreq incomplete, -EBUSY - no space in
    2352 * ring (wait == NULL),
    2353 * -EIOCBQUEUED - tx queued to iowait, -ECOMM - bad sdma state
   2354 */
   2355int sdma_send_txreq(struct sdma_engine *sde,
   2356		    struct iowait_work *wait,
   2357		    struct sdma_txreq *tx,
   2358		    bool pkts_sent)
   2359{
   2360	int ret = 0;
   2361	u16 tail;
   2362	unsigned long flags;
   2363
   2364	/* user should have supplied entire packet */
   2365	if (unlikely(tx->tlen))
   2366		return -EINVAL;
   2367	tx->wait = iowait_ioww_to_iow(wait);
   2368	spin_lock_irqsave(&sde->tail_lock, flags);
   2369retry:
   2370	if (unlikely(!__sdma_running(sde)))
   2371		goto unlock_noconn;
   2372	if (unlikely(tx->num_desc > sde->desc_avail))
   2373		goto nodesc;
   2374	tail = submit_tx(sde, tx);
   2375	if (wait)
   2376		iowait_sdma_inc(iowait_ioww_to_iow(wait));
   2377	sdma_update_tail(sde, tail);
   2378unlock:
   2379	spin_unlock_irqrestore(&sde->tail_lock, flags);
   2380	return ret;
   2381unlock_noconn:
   2382	if (wait)
   2383		iowait_sdma_inc(iowait_ioww_to_iow(wait));
   2384	tx->next_descq_idx = 0;
   2385#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
   2386	tx->sn = sde->tail_sn++;
   2387	trace_hfi1_sdma_in_sn(sde, tx->sn);
   2388#endif
   2389	spin_lock(&sde->flushlist_lock);
   2390	list_add_tail(&tx->list, &sde->flushlist);
   2391	spin_unlock(&sde->flushlist_lock);
   2392	iowait_inc_wait_count(wait, tx->num_desc);
   2393	queue_work_on(sde->cpu, system_highpri_wq, &sde->flush_worker);
   2394	ret = -ECOMM;
   2395	goto unlock;
   2396nodesc:
   2397	ret = sdma_check_progress(sde, wait, tx, pkts_sent);
   2398	if (ret == -EAGAIN) {
   2399		ret = 0;
   2400		goto retry;
   2401	}
   2402	sde->descq_full_count++;
   2403	goto unlock;
   2404}
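
/*
 * Caller sketch (hypothetical; the wrapper, its name and its policy for
 * the documented return codes are illustrative, not part of this driver):
 *
 *	static int send_one(struct sdma_engine *sde, struct iowait_work *wait,
 *			    struct sdma_txreq *tx, bool pkts_sent)
 *	{
 *		int ret = sdma_send_txreq(sde, wait, tx, pkts_sent);
 *
 *		if (ret == -EIOCBQUEUED)
 *			return 0;	// queued to the iowait, resent on wakeup
 *		if (ret == -ECOMM)
 *			return 0;	// engine not running, tx was flushed
 *		return ret;		// 0 on success, -EINVAL or -EBUSY otherwise
 *	}
 */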
   2405
   2406/**
   2407 * sdma_send_txlist() - submit a list of tx req to ring
   2408 * @sde: sdma engine to use
   2409 * @wait: SE wait structure to use when full (may be NULL)
   2410 * @tx_list: list of sdma_txreqs to submit
    2411 * @count_out: pointer to a u16 which, after return, will contain the total number of
   2412 *             sdma_txreqs removed from the tx_list. This will include sdma_txreqs
   2413 *             whose SDMA descriptors are submitted to the ring and the sdma_txreqs
   2414 *             which are added to SDMA engine flush list if the SDMA engine state is
   2415 *             not running.
   2416 *
   2417 * The call submits the list into the ring.
   2418 *
    2419 * If the iowait structure is non-NULL and not equal to the iowait list,
    2420 * the unprocessed part of the list will be appended to the list in wait.
   2421 *
   2422 * In all cases, the tx_list will be updated so the head of the tx_list is
   2423 * the list of descriptors that have yet to be transmitted.
   2424 *
   2425 * The intent of this call is to provide a more efficient
   2426 * way of submitting multiple packets to SDMA while holding the tail
    2427 * side lock.
   2428 *
   2429 * Return:
   2430 * 0 - Success,
   2431 * -EINVAL - sdma_txreq incomplete, -EBUSY - no space in ring (wait == NULL)
    2432 * -EIOCBQUEUED - tx queued to iowait, -ECOMM - bad sdma state
   2433 */
   2434int sdma_send_txlist(struct sdma_engine *sde, struct iowait_work *wait,
   2435		     struct list_head *tx_list, u16 *count_out)
   2436{
   2437	struct sdma_txreq *tx, *tx_next;
   2438	int ret = 0;
   2439	unsigned long flags;
   2440	u16 tail = INVALID_TAIL;
   2441	u32 submit_count = 0, flush_count = 0, total_count;
   2442
   2443	spin_lock_irqsave(&sde->tail_lock, flags);
   2444retry:
   2445	list_for_each_entry_safe(tx, tx_next, tx_list, list) {
   2446		tx->wait = iowait_ioww_to_iow(wait);
   2447		if (unlikely(!__sdma_running(sde)))
   2448			goto unlock_noconn;
   2449		if (unlikely(tx->num_desc > sde->desc_avail))
   2450			goto nodesc;
   2451		if (unlikely(tx->tlen)) {
   2452			ret = -EINVAL;
   2453			goto update_tail;
   2454		}
   2455		list_del_init(&tx->list);
   2456		tail = submit_tx(sde, tx);
   2457		submit_count++;
   2458		if (tail != INVALID_TAIL &&
   2459		    (submit_count & SDMA_TAIL_UPDATE_THRESH) == 0) {
   2460			sdma_update_tail(sde, tail);
   2461			tail = INVALID_TAIL;
   2462		}
   2463	}
   2464update_tail:
   2465	total_count = submit_count + flush_count;
   2466	if (wait) {
   2467		iowait_sdma_add(iowait_ioww_to_iow(wait), total_count);
   2468		iowait_starve_clear(submit_count > 0,
   2469				    iowait_ioww_to_iow(wait));
   2470	}
   2471	if (tail != INVALID_TAIL)
   2472		sdma_update_tail(sde, tail);
   2473	spin_unlock_irqrestore(&sde->tail_lock, flags);
   2474	*count_out = total_count;
   2475	return ret;
   2476unlock_noconn:
   2477	spin_lock(&sde->flushlist_lock);
   2478	list_for_each_entry_safe(tx, tx_next, tx_list, list) {
   2479		tx->wait = iowait_ioww_to_iow(wait);
   2480		list_del_init(&tx->list);
   2481		tx->next_descq_idx = 0;
   2482#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
   2483		tx->sn = sde->tail_sn++;
   2484		trace_hfi1_sdma_in_sn(sde, tx->sn);
   2485#endif
   2486		list_add_tail(&tx->list, &sde->flushlist);
   2487		flush_count++;
   2488		iowait_inc_wait_count(wait, tx->num_desc);
   2489	}
   2490	spin_unlock(&sde->flushlist_lock);
   2491	queue_work_on(sde->cpu, system_highpri_wq, &sde->flush_worker);
   2492	ret = -ECOMM;
   2493	goto update_tail;
   2494nodesc:
   2495	ret = sdma_check_progress(sde, wait, tx, submit_count > 0);
   2496	if (ret == -EAGAIN) {
   2497		ret = 0;
   2498		goto retry;
   2499	}
   2500	sde->descq_full_count++;
   2501	goto update_tail;
   2502}
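
/*
 * Caller sketch (hypothetical; credit_back() is an illustrative helper,
 * not part of this driver), showing how count_out is typically consumed:
 *
 *	u16 count = 0;
 *	int ret = sdma_send_txlist(sde, wait, &tx_list, &count);
 *
 *	// count covers both submitted and flushed txreqs; anything still
 *	// on tx_list was neither and remains the caller's to retry.
 *	credit_back(count);
 */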
   2503
   2504static void sdma_process_event(struct sdma_engine *sde, enum sdma_events event)
   2505{
   2506	unsigned long flags;
   2507
   2508	spin_lock_irqsave(&sde->tail_lock, flags);
   2509	write_seqlock(&sde->head_lock);
   2510
   2511	__sdma_process_event(sde, event);
   2512
   2513	if (sde->state.current_state == sdma_state_s99_running)
   2514		sdma_desc_avail(sde, sdma_descq_freecnt(sde));
   2515
   2516	write_sequnlock(&sde->head_lock);
   2517	spin_unlock_irqrestore(&sde->tail_lock, flags);
   2518}
   2519
   2520static void __sdma_process_event(struct sdma_engine *sde,
   2521				 enum sdma_events event)
   2522{
   2523	struct sdma_state *ss = &sde->state;
   2524	int need_progress = 0;
   2525
   2526	/* CONFIG SDMA temporary */
   2527#ifdef CONFIG_SDMA_VERBOSITY
   2528	dd_dev_err(sde->dd, "CONFIG SDMA(%u) [%s] %s\n", sde->this_idx,
   2529		   sdma_state_names[ss->current_state],
   2530		   sdma_event_names[event]);
   2531#endif
   2532
   2533	switch (ss->current_state) {
   2534	case sdma_state_s00_hw_down:
   2535		switch (event) {
   2536		case sdma_event_e00_go_hw_down:
   2537			break;
   2538		case sdma_event_e30_go_running:
   2539			/*
    2540			 * If down, but running requested (usually the result
    2541			 * of link up), then we need to start up.
   2542			 * This can happen when hw down is requested while
   2543			 * bringing the link up with traffic active on
   2544			 * 7220, e.g.
   2545			 */
   2546			ss->go_s99_running = 1;
   2547			fallthrough;	/* and start dma engine */
   2548		case sdma_event_e10_go_hw_start:
   2549			/* This reference means the state machine is started */
   2550			sdma_get(&sde->state);
   2551			sdma_set_state(sde,
   2552				       sdma_state_s10_hw_start_up_halt_wait);
   2553			break;
   2554		case sdma_event_e15_hw_halt_done:
   2555			break;
   2556		case sdma_event_e25_hw_clean_up_done:
   2557			break;
   2558		case sdma_event_e40_sw_cleaned:
   2559			sdma_sw_tear_down(sde);
   2560			break;
   2561		case sdma_event_e50_hw_cleaned:
   2562			break;
   2563		case sdma_event_e60_hw_halted:
   2564			break;
   2565		case sdma_event_e70_go_idle:
   2566			break;
   2567		case sdma_event_e80_hw_freeze:
   2568			break;
   2569		case sdma_event_e81_hw_frozen:
   2570			break;
   2571		case sdma_event_e82_hw_unfreeze:
   2572			break;
   2573		case sdma_event_e85_link_down:
   2574			break;
   2575		case sdma_event_e90_sw_halted:
   2576			break;
   2577		}
   2578		break;
   2579
   2580	case sdma_state_s10_hw_start_up_halt_wait:
   2581		switch (event) {
   2582		case sdma_event_e00_go_hw_down:
   2583			sdma_set_state(sde, sdma_state_s00_hw_down);
   2584			sdma_sw_tear_down(sde);
   2585			break;
   2586		case sdma_event_e10_go_hw_start:
   2587			break;
   2588		case sdma_event_e15_hw_halt_done:
   2589			sdma_set_state(sde,
   2590				       sdma_state_s15_hw_start_up_clean_wait);
   2591			sdma_start_hw_clean_up(sde);
   2592			break;
   2593		case sdma_event_e25_hw_clean_up_done:
   2594			break;
   2595		case sdma_event_e30_go_running:
   2596			ss->go_s99_running = 1;
   2597			break;
   2598		case sdma_event_e40_sw_cleaned:
   2599			break;
   2600		case sdma_event_e50_hw_cleaned:
   2601			break;
   2602		case sdma_event_e60_hw_halted:
   2603			schedule_work(&sde->err_halt_worker);
   2604			break;
   2605		case sdma_event_e70_go_idle:
   2606			ss->go_s99_running = 0;
   2607			break;
   2608		case sdma_event_e80_hw_freeze:
   2609			break;
   2610		case sdma_event_e81_hw_frozen:
   2611			break;
   2612		case sdma_event_e82_hw_unfreeze:
   2613			break;
   2614		case sdma_event_e85_link_down:
   2615			break;
   2616		case sdma_event_e90_sw_halted:
   2617			break;
   2618		}
   2619		break;
   2620
   2621	case sdma_state_s15_hw_start_up_clean_wait:
   2622		switch (event) {
   2623		case sdma_event_e00_go_hw_down:
   2624			sdma_set_state(sde, sdma_state_s00_hw_down);
   2625			sdma_sw_tear_down(sde);
   2626			break;
   2627		case sdma_event_e10_go_hw_start:
   2628			break;
   2629		case sdma_event_e15_hw_halt_done:
   2630			break;
   2631		case sdma_event_e25_hw_clean_up_done:
   2632			sdma_hw_start_up(sde);
   2633			sdma_set_state(sde, ss->go_s99_running ?
   2634				       sdma_state_s99_running :
   2635				       sdma_state_s20_idle);
   2636			break;
   2637		case sdma_event_e30_go_running:
   2638			ss->go_s99_running = 1;
   2639			break;
   2640		case sdma_event_e40_sw_cleaned:
   2641			break;
   2642		case sdma_event_e50_hw_cleaned:
   2643			break;
   2644		case sdma_event_e60_hw_halted:
   2645			break;
   2646		case sdma_event_e70_go_idle:
   2647			ss->go_s99_running = 0;
   2648			break;
   2649		case sdma_event_e80_hw_freeze:
   2650			break;
   2651		case sdma_event_e81_hw_frozen:
   2652			break;
   2653		case sdma_event_e82_hw_unfreeze:
   2654			break;
   2655		case sdma_event_e85_link_down:
   2656			break;
   2657		case sdma_event_e90_sw_halted:
   2658			break;
   2659		}
   2660		break;
   2661
   2662	case sdma_state_s20_idle:
   2663		switch (event) {
   2664		case sdma_event_e00_go_hw_down:
   2665			sdma_set_state(sde, sdma_state_s00_hw_down);
   2666			sdma_sw_tear_down(sde);
   2667			break;
   2668		case sdma_event_e10_go_hw_start:
   2669			break;
   2670		case sdma_event_e15_hw_halt_done:
   2671			break;
   2672		case sdma_event_e25_hw_clean_up_done:
   2673			break;
   2674		case sdma_event_e30_go_running:
   2675			sdma_set_state(sde, sdma_state_s99_running);
   2676			ss->go_s99_running = 1;
   2677			break;
   2678		case sdma_event_e40_sw_cleaned:
   2679			break;
   2680		case sdma_event_e50_hw_cleaned:
   2681			break;
   2682		case sdma_event_e60_hw_halted:
   2683			sdma_set_state(sde, sdma_state_s50_hw_halt_wait);
   2684			schedule_work(&sde->err_halt_worker);
   2685			break;
   2686		case sdma_event_e70_go_idle:
   2687			break;
   2688		case sdma_event_e85_link_down:
   2689		case sdma_event_e80_hw_freeze:
   2690			sdma_set_state(sde, sdma_state_s80_hw_freeze);
   2691			atomic_dec(&sde->dd->sdma_unfreeze_count);
   2692			wake_up_interruptible(&sde->dd->sdma_unfreeze_wq);
   2693			break;
   2694		case sdma_event_e81_hw_frozen:
   2695			break;
   2696		case sdma_event_e82_hw_unfreeze:
   2697			break;
   2698		case sdma_event_e90_sw_halted:
   2699			break;
   2700		}
   2701		break;
   2702
   2703	case sdma_state_s30_sw_clean_up_wait:
   2704		switch (event) {
   2705		case sdma_event_e00_go_hw_down:
   2706			sdma_set_state(sde, sdma_state_s00_hw_down);
   2707			break;
   2708		case sdma_event_e10_go_hw_start:
   2709			break;
   2710		case sdma_event_e15_hw_halt_done:
   2711			break;
   2712		case sdma_event_e25_hw_clean_up_done:
   2713			break;
   2714		case sdma_event_e30_go_running:
   2715			ss->go_s99_running = 1;
   2716			break;
   2717		case sdma_event_e40_sw_cleaned:
   2718			sdma_set_state(sde, sdma_state_s40_hw_clean_up_wait);
   2719			sdma_start_hw_clean_up(sde);
   2720			break;
   2721		case sdma_event_e50_hw_cleaned:
   2722			break;
   2723		case sdma_event_e60_hw_halted:
   2724			break;
   2725		case sdma_event_e70_go_idle:
   2726			ss->go_s99_running = 0;
   2727			break;
   2728		case sdma_event_e80_hw_freeze:
   2729			break;
   2730		case sdma_event_e81_hw_frozen:
   2731			break;
   2732		case sdma_event_e82_hw_unfreeze:
   2733			break;
   2734		case sdma_event_e85_link_down:
   2735			ss->go_s99_running = 0;
   2736			break;
   2737		case sdma_event_e90_sw_halted:
   2738			break;
   2739		}
   2740		break;
   2741
   2742	case sdma_state_s40_hw_clean_up_wait:
   2743		switch (event) {
   2744		case sdma_event_e00_go_hw_down:
   2745			sdma_set_state(sde, sdma_state_s00_hw_down);
   2746			tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
   2747			break;
   2748		case sdma_event_e10_go_hw_start:
   2749			break;
   2750		case sdma_event_e15_hw_halt_done:
   2751			break;
   2752		case sdma_event_e25_hw_clean_up_done:
   2753			sdma_hw_start_up(sde);
   2754			sdma_set_state(sde, ss->go_s99_running ?
   2755				       sdma_state_s99_running :
   2756				       sdma_state_s20_idle);
   2757			break;
   2758		case sdma_event_e30_go_running:
   2759			ss->go_s99_running = 1;
   2760			break;
   2761		case sdma_event_e40_sw_cleaned:
   2762			break;
   2763		case sdma_event_e50_hw_cleaned:
   2764			break;
   2765		case sdma_event_e60_hw_halted:
   2766			break;
   2767		case sdma_event_e70_go_idle:
   2768			ss->go_s99_running = 0;
   2769			break;
   2770		case sdma_event_e80_hw_freeze:
   2771			break;
   2772		case sdma_event_e81_hw_frozen:
   2773			break;
   2774		case sdma_event_e82_hw_unfreeze:
   2775			break;
   2776		case sdma_event_e85_link_down:
   2777			ss->go_s99_running = 0;
   2778			break;
   2779		case sdma_event_e90_sw_halted:
   2780			break;
   2781		}
   2782		break;
   2783
   2784	case sdma_state_s50_hw_halt_wait:
   2785		switch (event) {
   2786		case sdma_event_e00_go_hw_down:
   2787			sdma_set_state(sde, sdma_state_s00_hw_down);
   2788			tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
   2789			break;
   2790		case sdma_event_e10_go_hw_start:
   2791			break;
   2792		case sdma_event_e15_hw_halt_done:
   2793			sdma_set_state(sde, sdma_state_s30_sw_clean_up_wait);
   2794			tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
   2795			break;
   2796		case sdma_event_e25_hw_clean_up_done:
   2797			break;
   2798		case sdma_event_e30_go_running:
   2799			ss->go_s99_running = 1;
   2800			break;
   2801		case sdma_event_e40_sw_cleaned:
   2802			break;
   2803		case sdma_event_e50_hw_cleaned:
   2804			break;
   2805		case sdma_event_e60_hw_halted:
   2806			schedule_work(&sde->err_halt_worker);
   2807			break;
   2808		case sdma_event_e70_go_idle:
   2809			ss->go_s99_running = 0;
   2810			break;
   2811		case sdma_event_e80_hw_freeze:
   2812			break;
   2813		case sdma_event_e81_hw_frozen:
   2814			break;
   2815		case sdma_event_e82_hw_unfreeze:
   2816			break;
   2817		case sdma_event_e85_link_down:
   2818			ss->go_s99_running = 0;
   2819			break;
   2820		case sdma_event_e90_sw_halted:
   2821			break;
   2822		}
   2823		break;
   2824
   2825	case sdma_state_s60_idle_halt_wait:
   2826		switch (event) {
   2827		case sdma_event_e00_go_hw_down:
   2828			sdma_set_state(sde, sdma_state_s00_hw_down);
   2829			tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
   2830			break;
   2831		case sdma_event_e10_go_hw_start:
   2832			break;
   2833		case sdma_event_e15_hw_halt_done:
   2834			sdma_set_state(sde, sdma_state_s30_sw_clean_up_wait);
   2835			tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
   2836			break;
   2837		case sdma_event_e25_hw_clean_up_done:
   2838			break;
   2839		case sdma_event_e30_go_running:
   2840			ss->go_s99_running = 1;
   2841			break;
   2842		case sdma_event_e40_sw_cleaned:
   2843			break;
   2844		case sdma_event_e50_hw_cleaned:
   2845			break;
   2846		case sdma_event_e60_hw_halted:
   2847			schedule_work(&sde->err_halt_worker);
   2848			break;
   2849		case sdma_event_e70_go_idle:
   2850			ss->go_s99_running = 0;
   2851			break;
   2852		case sdma_event_e80_hw_freeze:
   2853			break;
   2854		case sdma_event_e81_hw_frozen:
   2855			break;
   2856		case sdma_event_e82_hw_unfreeze:
   2857			break;
   2858		case sdma_event_e85_link_down:
   2859			break;
   2860		case sdma_event_e90_sw_halted:
   2861			break;
   2862		}
   2863		break;
   2864
   2865	case sdma_state_s80_hw_freeze:
   2866		switch (event) {
   2867		case sdma_event_e00_go_hw_down:
   2868			sdma_set_state(sde, sdma_state_s00_hw_down);
   2869			tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
   2870			break;
   2871		case sdma_event_e10_go_hw_start:
   2872			break;
   2873		case sdma_event_e15_hw_halt_done:
   2874			break;
   2875		case sdma_event_e25_hw_clean_up_done:
   2876			break;
   2877		case sdma_event_e30_go_running:
   2878			ss->go_s99_running = 1;
   2879			break;
   2880		case sdma_event_e40_sw_cleaned:
   2881			break;
   2882		case sdma_event_e50_hw_cleaned:
   2883			break;
   2884		case sdma_event_e60_hw_halted:
   2885			break;
   2886		case sdma_event_e70_go_idle:
   2887			ss->go_s99_running = 0;
   2888			break;
   2889		case sdma_event_e80_hw_freeze:
   2890			break;
   2891		case sdma_event_e81_hw_frozen:
   2892			sdma_set_state(sde, sdma_state_s82_freeze_sw_clean);
   2893			tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
   2894			break;
   2895		case sdma_event_e82_hw_unfreeze:
   2896			break;
   2897		case sdma_event_e85_link_down:
   2898			break;
   2899		case sdma_event_e90_sw_halted:
   2900			break;
   2901		}
   2902		break;
   2903
   2904	case sdma_state_s82_freeze_sw_clean:
   2905		switch (event) {
   2906		case sdma_event_e00_go_hw_down:
   2907			sdma_set_state(sde, sdma_state_s00_hw_down);
   2908			tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
   2909			break;
   2910		case sdma_event_e10_go_hw_start:
   2911			break;
   2912		case sdma_event_e15_hw_halt_done:
   2913			break;
   2914		case sdma_event_e25_hw_clean_up_done:
   2915			break;
   2916		case sdma_event_e30_go_running:
   2917			ss->go_s99_running = 1;
   2918			break;
   2919		case sdma_event_e40_sw_cleaned:
   2920			/* notify caller this engine is done cleaning */
   2921			atomic_dec(&sde->dd->sdma_unfreeze_count);
   2922			wake_up_interruptible(&sde->dd->sdma_unfreeze_wq);
   2923			break;
   2924		case sdma_event_e50_hw_cleaned:
   2925			break;
   2926		case sdma_event_e60_hw_halted:
   2927			break;
   2928		case sdma_event_e70_go_idle:
   2929			ss->go_s99_running = 0;
   2930			break;
   2931		case sdma_event_e80_hw_freeze:
   2932			break;
   2933		case sdma_event_e81_hw_frozen:
   2934			break;
   2935		case sdma_event_e82_hw_unfreeze:
   2936			sdma_hw_start_up(sde);
   2937			sdma_set_state(sde, ss->go_s99_running ?
   2938				       sdma_state_s99_running :
   2939				       sdma_state_s20_idle);
   2940			break;
   2941		case sdma_event_e85_link_down:
   2942			break;
   2943		case sdma_event_e90_sw_halted:
   2944			break;
   2945		}
   2946		break;
   2947
   2948	case sdma_state_s99_running:
   2949		switch (event) {
   2950		case sdma_event_e00_go_hw_down:
   2951			sdma_set_state(sde, sdma_state_s00_hw_down);
   2952			tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
   2953			break;
   2954		case sdma_event_e10_go_hw_start:
   2955			break;
   2956		case sdma_event_e15_hw_halt_done:
   2957			break;
   2958		case sdma_event_e25_hw_clean_up_done:
   2959			break;
   2960		case sdma_event_e30_go_running:
   2961			break;
   2962		case sdma_event_e40_sw_cleaned:
   2963			break;
   2964		case sdma_event_e50_hw_cleaned:
   2965			break;
   2966		case sdma_event_e60_hw_halted:
   2967			need_progress = 1;
   2968			sdma_err_progress_check_schedule(sde);
   2969			fallthrough;
   2970		case sdma_event_e90_sw_halted:
   2971			/*
    2972			 * A SW-initiated halt does not perform the engine
    2973			 * progress check.
    2974			 */
   2975			sdma_set_state(sde, sdma_state_s50_hw_halt_wait);
   2976			schedule_work(&sde->err_halt_worker);
   2977			break;
   2978		case sdma_event_e70_go_idle:
   2979			sdma_set_state(sde, sdma_state_s60_idle_halt_wait);
   2980			break;
   2981		case sdma_event_e85_link_down:
   2982			ss->go_s99_running = 0;
   2983			fallthrough;
   2984		case sdma_event_e80_hw_freeze:
   2985			sdma_set_state(sde, sdma_state_s80_hw_freeze);
   2986			atomic_dec(&sde->dd->sdma_unfreeze_count);
   2987			wake_up_interruptible(&sde->dd->sdma_unfreeze_wq);
   2988			break;
   2989		case sdma_event_e81_hw_frozen:
   2990			break;
   2991		case sdma_event_e82_hw_unfreeze:
   2992			break;
   2993		}
   2994		break;
   2995	}
   2996
   2997	ss->last_event = event;
   2998	if (need_progress)
   2999		sdma_make_progress(sde, 0);
   3000}
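
/*
 * Example event flow for a normal bring-up, as encoded in the switch
 * above: from s00_hw_down, e10_go_hw_start moves the engine to
 * s10_hw_start_up_halt_wait; e15_hw_halt_done advances it to
 * s15_hw_start_up_clean_wait and kicks sdma_start_hw_clean_up(); and
 * e25_hw_clean_up_done restarts the hardware, landing in s99_running or
 * s20_idle depending on go_s99_running.
 */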
   3001
   3002/*
   3003 * _extend_sdma_tx_descs() - helper to extend txreq
   3004 *
   3005 * This is called once the initial nominal allocation
   3006 * of descriptors in the sdma_txreq is exhausted.
   3007 *
   3008 * The code will bump the allocation up to the max
    3009 * of MAX_DESC (64) descriptors. There doesn't seem to be
    3010 * much point in an interim step. The last descriptor
    3011 * is reserved for the coalesce buffer in order to support
    3012 * cases where the input packet has >MAX_DESC iovecs.
   3013 *
   3014 */
   3015static int _extend_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx)
   3016{
   3017	int i;
   3018	struct sdma_desc *descp;
   3019
   3020	/* Handle last descriptor */
    3021	if (unlikely(tx->num_desc == (MAX_DESC - 1))) {
   3022		/* if tlen is 0, it is for padding, release last descriptor */
   3023		if (!tx->tlen) {
   3024			tx->desc_limit = MAX_DESC;
   3025		} else if (!tx->coalesce_buf) {
   3026			/* allocate coalesce buffer with space for padding */
   3027			tx->coalesce_buf = kmalloc(tx->tlen + sizeof(u32),
   3028						   GFP_ATOMIC);
   3029			if (!tx->coalesce_buf)
   3030				goto enomem;
   3031			tx->coalesce_idx = 0;
   3032		}
   3033		return 0;
   3034	}
   3035
   3036	if (unlikely(tx->num_desc == MAX_DESC))
   3037		goto enomem;
   3038
   3039	descp = kmalloc_array(MAX_DESC, sizeof(struct sdma_desc), GFP_ATOMIC);
   3040	if (!descp)
   3041		goto enomem;
   3042	tx->descp = descp;
   3043
   3044	/* reserve last descriptor for coalescing */
   3045	tx->desc_limit = MAX_DESC - 1;
   3046	/* copy ones already built */
   3047	for (i = 0; i < tx->num_desc; i++)
   3048		tx->descp[i] = tx->descs[i];
   3049	return 0;
   3050enomem:
   3051	__sdma_txclean(dd, tx);
   3052	return -ENOMEM;
   3053}
   3054
   3055/*
   3056 * ext_coal_sdma_tx_descs() - extend or coalesce sdma tx descriptors
   3057 *
   3058 * This is called once the initial nominal allocation of descriptors
   3059 * in the sdma_txreq is exhausted.
   3060 *
    3061 * This function calls _extend_sdma_tx_descs to extend or allocate the
    3062 * coalesce buffer. If there is an allocated coalesce buffer, it will
    3063 * copy the input packet data into the coalesce buffer. It also adds the
    3064 * coalesce buffer descriptor once the whole packet is received.
   3065 *
   3066 * Return:
   3067 * <0 - error
   3068 * 0 - coalescing, don't populate descriptor
   3069 * 1 - continue with populating descriptor
   3070 */
   3071int ext_coal_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx,
   3072			   int type, void *kvaddr, struct page *page,
   3073			   unsigned long offset, u16 len)
   3074{
   3075	int pad_len, rval;
   3076	dma_addr_t addr;
   3077
   3078	rval = _extend_sdma_tx_descs(dd, tx);
   3079	if (rval) {
   3080		__sdma_txclean(dd, tx);
   3081		return rval;
   3082	}
   3083
   3084	/* If coalesce buffer is allocated, copy data into it */
   3085	if (tx->coalesce_buf) {
   3086		if (type == SDMA_MAP_NONE) {
   3087			__sdma_txclean(dd, tx);
   3088			return -EINVAL;
   3089		}
   3090
   3091		if (type == SDMA_MAP_PAGE) {
   3092			kvaddr = kmap_local_page(page);
   3093			kvaddr += offset;
   3094		} else if (WARN_ON(!kvaddr)) {
   3095			__sdma_txclean(dd, tx);
   3096			return -EINVAL;
   3097		}
   3098
   3099		memcpy(tx->coalesce_buf + tx->coalesce_idx, kvaddr, len);
   3100		tx->coalesce_idx += len;
   3101		if (type == SDMA_MAP_PAGE)
   3102			kunmap_local(kvaddr);
   3103
   3104		/* If there is more data, return */
   3105		if (tx->tlen - tx->coalesce_idx)
   3106			return 0;
   3107
   3108		/* Whole packet is received; add any padding */
   3109		pad_len = tx->packet_len & (sizeof(u32) - 1);
   3110		if (pad_len) {
   3111			pad_len = sizeof(u32) - pad_len;
   3112			memset(tx->coalesce_buf + tx->coalesce_idx, 0, pad_len);
   3113			/* padding is taken care of for coalescing case */
   3114			tx->packet_len += pad_len;
   3115			tx->tlen += pad_len;
   3116		}
   3117
   3118		/* dma map the coalesce buffer */
   3119		addr = dma_map_single(&dd->pcidev->dev,
   3120				      tx->coalesce_buf,
   3121				      tx->tlen,
   3122				      DMA_TO_DEVICE);
   3123
   3124		if (unlikely(dma_mapping_error(&dd->pcidev->dev, addr))) {
   3125			__sdma_txclean(dd, tx);
   3126			return -ENOSPC;
   3127		}
   3128
   3129		/* Add descriptor for coalesce buffer */
   3130		tx->desc_limit = MAX_DESC;
   3131		return _sdma_txadd_daddr(dd, SDMA_MAP_SINGLE, tx,
   3132					 addr, tx->tlen);
   3133	}
   3134
   3135	return 1;
   3136}
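
/*
 * Caller sketch (hypothetical) for the tri-state return above:
 *
 *	rval = ext_coal_sdma_tx_descs(dd, tx, type, kvaddr, page, offset, len);
 *	if (rval < 0)
 *		return rval;	// tx was already cleaned by the helper
 *	if (rval == 0)
 *		return 0;	// data copied into coalesce_buf, no descriptor yet
 *	// rval == 1: fall through and build the descriptor as usual
 */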
   3137
   3138/* Update sdes when the lmc changes */
   3139void sdma_update_lmc(struct hfi1_devdata *dd, u64 mask, u32 lid)
   3140{
   3141	struct sdma_engine *sde;
   3142	int i;
   3143	u64 sreg;
   3144
   3145	sreg = ((mask & SD(CHECK_SLID_MASK_MASK)) <<
   3146		SD(CHECK_SLID_MASK_SHIFT)) |
   3147		(((lid & mask) & SD(CHECK_SLID_VALUE_MASK)) <<
   3148		SD(CHECK_SLID_VALUE_SHIFT));
   3149
   3150	for (i = 0; i < dd->num_sdma; i++) {
   3151		hfi1_cdbg(LINKVERB, "SendDmaEngine[%d].SLID_CHECK = 0x%x",
   3152			  i, (u32)sreg);
   3153		sde = &dd->per_sdma[i];
   3154		write_sde_csr(sde, SD(CHECK_SLID), sreg);
   3155	}
   3156}
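
/*
 * Worked example (illustrative values): with lid = 0x1234 and
 * mask = 0xFFFC (an LMC of 2), CHECK_SLID_MASK ignores the low two SLID
 * bits and CHECK_SLID_VALUE is 0x1234, so the check passes for SLIDs
 * 0x1234 through 0x1237 on every engine.
 */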
   3157
   3158/* tx not dword sized - pad */
   3159int _pad_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx)
   3160{
   3161	int rval = 0;
   3162
   3163	tx->num_desc++;
    3164	if (unlikely(tx->num_desc == tx->desc_limit)) {
   3165		rval = _extend_sdma_tx_descs(dd, tx);
   3166		if (rval) {
   3167			__sdma_txclean(dd, tx);
   3168			return rval;
   3169		}
   3170	}
   3171	/* finish the one just added */
   3172	make_tx_sdma_desc(
   3173		tx,
   3174		SDMA_MAP_NONE,
   3175		dd->sdma_pad_phys,
   3176		sizeof(u32) - (tx->packet_len & (sizeof(u32) - 1)));
   3177	_sdma_close_tx(dd, tx);
   3178	return rval;
   3179}
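
/*
 * Padding arithmetic example: this path runs only when packet_len is not
 * a multiple of four, so for packet_len = 61 the pad descriptor covers
 * sizeof(u32) - (61 & 3) = 3 bytes, sourced from the pre-mapped
 * dd->sdma_pad_phys scratch buffer rather than from caller memory.
 */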
   3180
   3181/*
   3182 * Add ahg to the sdma_txreq
   3183 *
   3184 * The logic will consume up to 3
   3185 * descriptors at the beginning of
   3186 * sdma_txreq.
   3187 */
   3188void _sdma_txreq_ahgadd(
   3189	struct sdma_txreq *tx,
   3190	u8 num_ahg,
   3191	u8 ahg_entry,
   3192	u32 *ahg,
   3193	u8 ahg_hlen)
   3194{
   3195	u32 i, shift = 0, desc = 0;
   3196	u8 mode;
   3197
   3198	WARN_ON_ONCE(num_ahg > 9 || (ahg_hlen & 3) || ahg_hlen == 4);
   3199	/* compute mode */
   3200	if (num_ahg == 1)
   3201		mode = SDMA_AHG_APPLY_UPDATE1;
   3202	else if (num_ahg <= 5)
   3203		mode = SDMA_AHG_APPLY_UPDATE2;
   3204	else
   3205		mode = SDMA_AHG_APPLY_UPDATE3;
   3206	tx->num_desc++;
    3207	/* initialize the consumed descriptors to zero */
   3208	switch (mode) {
   3209	case SDMA_AHG_APPLY_UPDATE3:
   3210		tx->num_desc++;
   3211		tx->descs[2].qw[0] = 0;
   3212		tx->descs[2].qw[1] = 0;
   3213		fallthrough;
   3214	case SDMA_AHG_APPLY_UPDATE2:
   3215		tx->num_desc++;
   3216		tx->descs[1].qw[0] = 0;
   3217		tx->descs[1].qw[1] = 0;
   3218		break;
   3219	}
   3220	ahg_hlen >>= 2;
   3221	tx->descs[0].qw[1] |=
   3222		(((u64)ahg_entry & SDMA_DESC1_HEADER_INDEX_MASK)
   3223			<< SDMA_DESC1_HEADER_INDEX_SHIFT) |
   3224		(((u64)ahg_hlen & SDMA_DESC1_HEADER_DWS_MASK)
   3225			<< SDMA_DESC1_HEADER_DWS_SHIFT) |
   3226		(((u64)mode & SDMA_DESC1_HEADER_MODE_MASK)
   3227			<< SDMA_DESC1_HEADER_MODE_SHIFT) |
   3228		(((u64)ahg[0] & SDMA_DESC1_HEADER_UPDATE1_MASK)
   3229			<< SDMA_DESC1_HEADER_UPDATE1_SHIFT);
   3230	for (i = 0; i < (num_ahg - 1); i++) {
   3231		if (!shift && !(i & 2))
   3232			desc++;
   3233		tx->descs[desc].qw[!!(i & 2)] |=
   3234			(((u64)ahg[i + 1])
   3235				<< shift);
   3236		shift = (shift + 32) & 63;
   3237	}
   3238}
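
/*
 * Packing example for the loop above: with num_ahg = 4 the mode is
 * SDMA_AHG_APPLY_UPDATE2 and one extra descriptor is consumed.  ahg[0]
 * lands in the UPDATE1 field of descs[0].qw[1]; ahg[1] and ahg[2] fill
 * the low and high 32 bits of descs[1].qw[0]; ahg[3] occupies the low
 * 32 bits of descs[1].qw[1].
 */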
   3239
   3240/**
   3241 * sdma_ahg_alloc - allocate an AHG entry
   3242 * @sde: engine to allocate from
   3243 *
   3244 * Return:
   3245 * 0-31 when successful, -EOPNOTSUPP if AHG is not enabled,
   3246 * -ENOSPC if an entry is not available
   3247 */
   3248int sdma_ahg_alloc(struct sdma_engine *sde)
   3249{
   3250	int nr;
   3251	int oldbit;
   3252
   3253	if (!sde) {
   3254		trace_hfi1_ahg_allocate(sde, -EINVAL);
   3255		return -EINVAL;
   3256	}
   3257	while (1) {
   3258		nr = ffz(READ_ONCE(sde->ahg_bits));
   3259		if (nr > 31) {
   3260			trace_hfi1_ahg_allocate(sde, -ENOSPC);
   3261			return -ENOSPC;
   3262		}
   3263		oldbit = test_and_set_bit(nr, &sde->ahg_bits);
   3264		if (!oldbit)
   3265			break;
   3266		cpu_relax();
   3267	}
   3268	trace_hfi1_ahg_allocate(sde, nr);
   3269	return nr;
   3270}
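
/*
 * Usage sketch (hypothetical): a user of AHG typically allocates one
 * entry per header stream and returns it when the stream is torn down:
 *
 *	int ahg_idx = sdma_ahg_alloc(sde);
 *
 *	if (ahg_idx < 0)
 *		return ahg_idx;	// -EINVAL, -EOPNOTSUPP or -ENOSPC
 *	// use ahg_idx as the ahg_entry argument of _sdma_txreq_ahgadd()
 *	sdma_ahg_free(sde, ahg_idx);
 */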
   3271
   3272/**
   3273 * sdma_ahg_free - free an AHG entry
   3274 * @sde: engine to return AHG entry
   3275 * @ahg_index: index to free
   3276 *
    3277 * This routine frees the indicated AHG entry.
   3278 */
   3279void sdma_ahg_free(struct sdma_engine *sde, int ahg_index)
   3280{
   3281	if (!sde)
   3282		return;
   3283	trace_hfi1_ahg_deallocate(sde, ahg_index);
   3284	if (ahg_index < 0 || ahg_index > 31)
   3285		return;
   3286	clear_bit(ahg_index, &sde->ahg_bits);
   3287}
   3288
   3289/*
   3290 * SPC freeze handling for SDMA engines.  Called when the driver knows
   3291 * the SPC is going into a freeze but before the freeze is fully
   3292 * settled.  Generally an error interrupt.
   3293 *
   3294 * This event will pull the engine out of running so no more entries can be
   3295 * added to the engine's queue.
   3296 */
   3297void sdma_freeze_notify(struct hfi1_devdata *dd, int link_down)
   3298{
   3299	int i;
   3300	enum sdma_events event = link_down ? sdma_event_e85_link_down :
   3301					     sdma_event_e80_hw_freeze;
   3302
   3303	/* set up the wait but do not wait here */
   3304	atomic_set(&dd->sdma_unfreeze_count, dd->num_sdma);
   3305
   3306	/* tell all engines to stop running and wait */
   3307	for (i = 0; i < dd->num_sdma; i++)
   3308		sdma_process_event(&dd->per_sdma[i], event);
   3309
   3310	/* sdma_freeze() will wait for all engines to have stopped */
   3311}
   3312
   3313/*
   3314 * SPC freeze handling for SDMA engines.  Called when the driver knows
   3315 * the SPC is fully frozen.
   3316 */
   3317void sdma_freeze(struct hfi1_devdata *dd)
   3318{
   3319	int i;
   3320	int ret;
   3321
   3322	/*
   3323	 * Make sure all engines have moved out of the running state before
   3324	 * continuing.
   3325	 */
   3326	ret = wait_event_interruptible(dd->sdma_unfreeze_wq,
   3327				       atomic_read(&dd->sdma_unfreeze_count) <=
   3328				       0);
    3329	/* if interrupted or the count is negative (unloading), just exit */
   3330	if (ret || atomic_read(&dd->sdma_unfreeze_count) < 0)
   3331		return;
   3332
   3333	/* set up the count for the next wait */
   3334	atomic_set(&dd->sdma_unfreeze_count, dd->num_sdma);
   3335
   3336	/* tell all engines that the SPC is frozen, they can start cleaning */
   3337	for (i = 0; i < dd->num_sdma; i++)
   3338		sdma_process_event(&dd->per_sdma[i], sdma_event_e81_hw_frozen);
   3339
   3340	/*
   3341	 * Wait for everyone to finish software clean before exiting.  The
   3342	 * software clean will read engine CSRs, so must be completed before
   3343	 * the next step, which will clear the engine CSRs.
   3344	 */
   3345	(void)wait_event_interruptible(dd->sdma_unfreeze_wq,
   3346				atomic_read(&dd->sdma_unfreeze_count) <= 0);
   3347	/* no need to check results - done no matter what */
   3348}
   3349
   3350/*
   3351 * SPC freeze handling for the SDMA engines.  Called after the SPC is unfrozen.
   3352 *
   3353 * The SPC freeze acts like a SDMA halt and a hardware clean combined.  All
   3354 * that is left is a software clean.  We could do it after the SPC is fully
   3355 * frozen, but then we'd have to add another state to wait for the unfreeze.
   3356 * Instead, just defer the software clean until the unfreeze step.
   3357 */
   3358void sdma_unfreeze(struct hfi1_devdata *dd)
   3359{
   3360	int i;
   3361
    3362	/* tell all engines to start freeze clean up */
   3363	for (i = 0; i < dd->num_sdma; i++)
   3364		sdma_process_event(&dd->per_sdma[i],
   3365				   sdma_event_e82_hw_unfreeze);
   3366}
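
/*
 * Typical SPC freeze sequence across the three helpers above:
 * sdma_freeze_notify() arms sdma_unfreeze_count and posts e80/e85 to
 * every engine; sdma_freeze() waits for that count to drain, posts
 * e81_hw_frozen and waits again for the software clean; sdma_unfreeze()
 * finally posts e82_hw_unfreeze, which restarts each engine into
 * s99_running or s20_idle.
 */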
   3367
   3368/**
   3369 * _sdma_engine_progress_schedule() - schedule progress on engine
   3370 * @sde: sdma_engine to schedule progress
   3371 *
   3372 */
   3373void _sdma_engine_progress_schedule(
   3374	struct sdma_engine *sde)
   3375{
   3376	trace_hfi1_sdma_engine_progress(sde, sde->progress_mask);
   3377	/* assume we have selected a good cpu */
   3378	write_csr(sde->dd,
   3379		  CCE_INT_FORCE + (8 * (IS_SDMA_START / 64)),
   3380		  sde->progress_mask);
   3381}