cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

qib_user_sdma.c (37047B)


      1/*
      2 * Copyright (c) 2007, 2008, 2009 QLogic Corporation. All rights reserved.
      3 *
      4 * This software is available to you under a choice of one of two
      5 * licenses.  You may choose to be licensed under the terms of the GNU
      6 * General Public License (GPL) Version 2, available from the file
      7 * COPYING in the main directory of this source tree, or the
      8 * OpenIB.org BSD license below:
      9 *
     10 *     Redistribution and use in source and binary forms, with or
     11 *     without modification, are permitted provided that the following
     12 *     conditions are met:
     13 *
     14 *      - Redistributions of source code must retain the above
     15 *        copyright notice, this list of conditions and the following
     16 *        disclaimer.
     17 *
     18 *      - Redistributions in binary form must reproduce the above
     19 *        copyright notice, this list of conditions and the following
     20 *        disclaimer in the documentation and/or other materials
     21 *        provided with the distribution.
     22 *
     23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
     24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
     25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
     26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
     27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
     28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
     29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
     30 * SOFTWARE.
     31 */
     32#include <linux/mm.h>
     33#include <linux/types.h>
     34#include <linux/device.h>
     35#include <linux/dmapool.h>
     36#include <linux/slab.h>
     37#include <linux/list.h>
     38#include <linux/highmem.h>
     39#include <linux/io.h>
     40#include <linux/uio.h>
     41#include <linux/rbtree.h>
     42#include <linux/spinlock.h>
     43#include <linux/delay.h>
     44
     45#include "qib.h"
     46#include "qib_user_sdma.h"
     47
     48/* minimum size of header */
     49#define QIB_USER_SDMA_MIN_HEADER_LENGTH 64
     50/* expected size of headers (for dma_pool) */
     51#define QIB_USER_SDMA_EXP_HEADER_LENGTH 64
     52/* attempt to drain the queue for 5 secs (250 polls of 20 ms each) */
     53#define QIB_USER_SDMA_DRAIN_TIMEOUT 250
     54
     55/*
     56 * track how many times a process opens this driver.
     57 */
     58static struct rb_root qib_user_sdma_rb_root = RB_ROOT;
     59
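        /*
         * One node per pid: refcount is the number of user sdma queues the
         * process currently has open.  qib_user_sdma_push_pkts() uses
         * refcount > 1 to select the non-blocking submit path.
         */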
     60struct qib_user_sdma_rb_node {
     61	struct rb_node node;
     62	int refcount;
     63	pid_t pid;
     64};
     65
     66struct qib_user_sdma_pkt {
     67	struct list_head list;  /* list element */
     68
     69	u8  tiddma;		/* if this is NEW tid-sdma */
     70	u8  largepkt;		/* this is large pkt from kmalloc */
     71	u16 frag_size;		/* frag size used by PSM */
     72	u16 index;              /* last header index or push index */
     73	u16 naddr;              /* dimension of addr (1..3) ... */
     74	u16 addrlimit;		/* addr array size */
     75	u16 tidsmidx;		/* current tidsm index */
     76	u16 tidsmcount;		/* tidsm array item count */
     77	u16 payload_size;	/* payload size so far for header */
     78	u32 bytes_togo;		/* bytes for processing */
     79	u32 counter;            /* sdma pkts queued counter for this entry */
     80	struct qib_tid_session_member *tidsm;	/* tid session member array */
     81	struct qib_user_sdma_queue *pq;	/* which pq this pkt belongs to */
     82	u64 added;              /* global descq number of entries */
     83
     84	struct {
     85		u16 offset;                     /* offset for kvaddr, addr */
     86		u16 length;                     /* length in page */
     87		u16 first_desc;			/* first desc */
     88		u16 last_desc;			/* last desc */
     89		u16 put_page;                   /* should we put_page? */
     90		u16 dma_mapped;                 /* is page dma_mapped? */
     91		u16 dma_length;			/* for dma_unmap_page() */
     92		u16 padding;
     93		struct page *page;              /* may be NULL (coherent mem) */
     94		void *kvaddr;                   /* FIXME: only for pio hack */
     95		dma_addr_t addr;
     96	} addr[4];   /* max pages, any more and we coalesce */
     97};
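        /*
         * The addr[4] array above keeps the common case inline: one fragment
         * for the sdma header plus a few payload fragments.  Larger requests
         * either allocate a "largepkt" with a bigger addr[] array (when the
         * PBC carries a frag size) or coalesce the user pages into a single
         * freshly allocated page, see qib_user_sdma_coalesce().
         */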
     98
     99struct qib_user_sdma_queue {
    100	/*
    101	 * pkts sent to dma engine are queued on this
    102	 * list head.  the elements of this list are
    103	 * struct qib_user_sdma_pkt...
    104	 */
    105	struct list_head sent;
    106
    107	/*
    108	 * Because the above list is accessed by both process context
    109	 * and the interrupt handler, we need a spinlock for it.
    110	 */
    111	spinlock_t sent_lock ____cacheline_aligned_in_smp;
    112
    113	/* headers with expected length are allocated from here... */
    114	char header_cache_name[64];
    115	struct dma_pool *header_cache;
    116
    117	/* packets are allocated from the slab cache... */
    118	char pkt_slab_name[64];
    119	struct kmem_cache *pkt_slab;
    120
    121	/* as packets are queued, they are counted... */
    122	u32 counter;
    123	u32 sent_counter;
    124	/* pending packets, not sending yet */
    125	u32 num_pending;
    126	/* sending packets, not complete yet */
    127	u32 num_sending;
    128	/* global descq number of entry of last sending packet */
    129	u64 added;
    130
    131	/* dma page table */
    132	struct rb_root dma_pages_root;
    133
    134	struct qib_user_sdma_rb_node *sdma_rb_node;
    135
    136	/* protect everything above... */
    137	struct mutex lock;
    138};
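        /*
         * Locking summary: pq->lock (a mutex) serializes submission and
         * cleanup for this queue; pq->sent_lock guards the sent list, which
         * the interrupt path also touches via qib_user_sdma_send_desc();
         * the port-wide ppd->sdma_lock guards the hardware descriptor ring.
         */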
    139
    140static struct qib_user_sdma_rb_node *
    141qib_user_sdma_rb_search(struct rb_root *root, pid_t pid)
    142{
    143	struct qib_user_sdma_rb_node *sdma_rb_node;
    144	struct rb_node *node = root->rb_node;
    145
    146	while (node) {
    147		sdma_rb_node = rb_entry(node, struct qib_user_sdma_rb_node,
    148					node);
    149		if (pid < sdma_rb_node->pid)
    150			node = node->rb_left;
    151		else if (pid > sdma_rb_node->pid)
    152			node = node->rb_right;
    153		else
    154			return sdma_rb_node;
    155	}
    156	return NULL;
    157}
    158
    159static int
    160qib_user_sdma_rb_insert(struct rb_root *root, struct qib_user_sdma_rb_node *new)
    161{
    162	struct rb_node **node = &(root->rb_node);
    163	struct rb_node *parent = NULL;
    164	struct qib_user_sdma_rb_node *got;
    165
    166	while (*node) {
    167		got = rb_entry(*node, struct qib_user_sdma_rb_node, node);
    168		parent = *node;
    169		if (new->pid < got->pid)
    170			node = &((*node)->rb_left);
    171		else if (new->pid > got->pid)
    172			node = &((*node)->rb_right);
    173		else
    174			return 0;
    175	}
    176
    177	rb_link_node(&new->node, parent, node);
    178	rb_insert_color(&new->node, root);
    179	return 1;
    180}
    181
    182struct qib_user_sdma_queue *
    183qib_user_sdma_queue_create(struct device *dev, int unit, int ctxt, int sctxt)
    184{
    185	struct qib_user_sdma_queue *pq =
    186		kmalloc(sizeof(struct qib_user_sdma_queue), GFP_KERNEL);
    187	struct qib_user_sdma_rb_node *sdma_rb_node;
    188
    189	if (!pq)
    190		goto done;
    191
    192	pq->counter = 0;
    193	pq->sent_counter = 0;
    194	pq->num_pending = 0;
    195	pq->num_sending = 0;
    196	pq->added = 0;
    197	pq->sdma_rb_node = NULL;
    198
    199	INIT_LIST_HEAD(&pq->sent);
    200	spin_lock_init(&pq->sent_lock);
    201	mutex_init(&pq->lock);
    202
    203	snprintf(pq->pkt_slab_name, sizeof(pq->pkt_slab_name),
    204		 "qib-user-sdma-pkts-%u-%02u.%02u", unit, ctxt, sctxt);
    205	pq->pkt_slab = kmem_cache_create(pq->pkt_slab_name,
    206					 sizeof(struct qib_user_sdma_pkt),
    207					 0, 0, NULL);
    208
    209	if (!pq->pkt_slab)
    210		goto err_kfree;
    211
    212	snprintf(pq->header_cache_name, sizeof(pq->header_cache_name),
    213		 "qib-user-sdma-headers-%u-%02u.%02u", unit, ctxt, sctxt);
    214	pq->header_cache = dma_pool_create(pq->header_cache_name,
    215					   dev,
    216					   QIB_USER_SDMA_EXP_HEADER_LENGTH,
    217					   4, 0);
    218	if (!pq->header_cache)
    219		goto err_slab;
    220
    221	pq->dma_pages_root = RB_ROOT;
    222
    223	sdma_rb_node = qib_user_sdma_rb_search(&qib_user_sdma_rb_root,
    224					current->pid);
    225	if (sdma_rb_node) {
    226		sdma_rb_node->refcount++;
    227	} else {
    228		sdma_rb_node = kmalloc(sizeof(
    229			struct qib_user_sdma_rb_node), GFP_KERNEL);
    230		if (!sdma_rb_node)
    231			goto err_rb;
    232
    233		sdma_rb_node->refcount = 1;
    234		sdma_rb_node->pid = current->pid;
    235
    236		qib_user_sdma_rb_insert(&qib_user_sdma_rb_root, sdma_rb_node);
    237	}
    238	pq->sdma_rb_node = sdma_rb_node;
    239
    240	goto done;
    241
    242err_rb:
    243	dma_pool_destroy(pq->header_cache);
    244err_slab:
    245	kmem_cache_destroy(pq->pkt_slab);
    246err_kfree:
    247	kfree(pq);
    248	pq = NULL;
    249
    250done:
    251	return pq;
    252}
    253
    254static void qib_user_sdma_init_frag(struct qib_user_sdma_pkt *pkt,
    255				    int i, u16 offset, u16 len,
    256				    u16 first_desc, u16 last_desc,
    257				    u16 put_page, u16 dma_mapped,
    258				    struct page *page, void *kvaddr,
    259				    dma_addr_t dma_addr, u16 dma_length)
    260{
    261	pkt->addr[i].offset = offset;
    262	pkt->addr[i].length = len;
    263	pkt->addr[i].first_desc = first_desc;
    264	pkt->addr[i].last_desc = last_desc;
    265	pkt->addr[i].put_page = put_page;
    266	pkt->addr[i].dma_mapped = dma_mapped;
    267	pkt->addr[i].page = page;
    268	pkt->addr[i].kvaddr = kvaddr;
    269	pkt->addr[i].addr = dma_addr;
    270	pkt->addr[i].dma_length = dma_length;
    271}
    272
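        /*
         * Allocate an sdma header buffer.  Headers of the expected length
         * come from the dma pool and return with *dma_addr already set;
         * anything else falls back to kmalloc() and leaves *dma_addr at 0,
         * which callers treat as "not dma mapped yet" and map with
         * dma_map_single() just before submission.
         */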
    273static void *qib_user_sdma_alloc_header(struct qib_user_sdma_queue *pq,
    274				size_t len, dma_addr_t *dma_addr)
    275{
    276	void *hdr;
    277
    278	if (len == QIB_USER_SDMA_EXP_HEADER_LENGTH)
    279		hdr = dma_pool_alloc(pq->header_cache, GFP_KERNEL,
    280					     dma_addr);
    281	else
    282		hdr = NULL;
    283
    284	if (!hdr) {
    285		hdr = kmalloc(len, GFP_KERNEL);
    286		if (!hdr)
    287			return NULL;
    288
    289		*dma_addr = 0;
    290	}
    291
    292	return hdr;
    293}
    294
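        /*
         * Split one user page (or coalesced buffer) into sdma fragments,
         * honoring the frag size and, for tid-sdma, the receiver's tid page
         * lengths.  When a packet boundary falls inside the page, a new sdma
         * header is allocated, copied from the previous one and patched so
         * the remainder goes out as the next packet.  On a dma mapping error
         * the page is released here and the caller must not touch it again.
         */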
    295static int qib_user_sdma_page_to_frags(const struct qib_devdata *dd,
    296				       struct qib_user_sdma_queue *pq,
    297				       struct qib_user_sdma_pkt *pkt,
    298				       struct page *page, u16 put,
    299				       u16 offset, u16 len, void *kvaddr)
    300{
    301	__le16 *pbc16;
    302	void *pbcvaddr;
    303	struct qib_message_header *hdr;
    304	u16 newlen, pbclen, lastdesc, dma_mapped;
    305	u32 vcto;
    306	union qib_seqnum seqnum;
    307	dma_addr_t pbcdaddr;
    308	dma_addr_t dma_addr =
    309		dma_map_page(&dd->pcidev->dev,
    310			page, offset, len, DMA_TO_DEVICE);
    311	int ret = 0;
    312
    313	if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) {
    314		/*
    315		 * dma mapping error: the pkt does not own
    316		 * this page yet, so release the page here and
    317		 * let the caller ignore it.
    318		 */
    319		if (put) {
    320			unpin_user_page(page);
    321		} else {
    322			/* coalesce case */
    323			kunmap(page);
    324			__free_page(page);
    325		}
    326		ret = -ENOMEM;
    327		goto done;
    328	}
    329	offset = 0;
    330	dma_mapped = 1;
    331
    332
    333next_fragment:
    334
    335	/*
    336	 * In tid-sdma, the transfer length is restricted by
    337	 * the current tid page length on the receiver side.
    338	 */
    339	if (pkt->tiddma && len > pkt->tidsm[pkt->tidsmidx].length)
    340		newlen = pkt->tidsm[pkt->tidsmidx].length;
    341	else
    342		newlen = len;
    343
    344	/*
    345	 * Then the transfer length is restricted by MTU.
    346	 * The last descriptor flag is set when:
    347	 * 1. the current packet reaches its frag size, or
    348	 * 2. the current tid page is done (tid-sdma), or
    349	 * 3. there are no more bytes to go (plain sdma).
    350	 */
    351	lastdesc = 0;
    352	if ((pkt->payload_size + newlen) >= pkt->frag_size) {
    353		newlen = pkt->frag_size - pkt->payload_size;
    354		lastdesc = 1;
    355	} else if (pkt->tiddma) {
    356		if (newlen == pkt->tidsm[pkt->tidsmidx].length)
    357			lastdesc = 1;
    358	} else {
    359		if (newlen == pkt->bytes_togo)
    360			lastdesc = 1;
    361	}
    362
    363	/* fill the next fragment in this page */
    364	qib_user_sdma_init_frag(pkt, pkt->naddr, /* index */
    365		offset, newlen,		/* offset, len */
    366		0, lastdesc,		/* first last desc */
    367		put, dma_mapped,	/* put page, dma mapped */
    368		page, kvaddr,		/* struct page, virt addr */
    369		dma_addr, len);		/* dma addr, dma length */
    370	pkt->bytes_togo -= newlen;
    371	pkt->payload_size += newlen;
    372	pkt->naddr++;
    373	if (pkt->naddr == pkt->addrlimit) {
    374		ret = -EFAULT;
    375		goto done;
    376	}
    377
    378	/* If there are no more bytes to go (lastdesc == 1). */
    379	if (pkt->bytes_togo == 0) {
    380		/* The packet is done but the header is not dma mapped yet;
    381		 * it must have come from kmalloc. */
    382		if (!pkt->addr[pkt->index].addr) {
    383			pkt->addr[pkt->index].addr =
    384				dma_map_single(&dd->pcidev->dev,
    385					pkt->addr[pkt->index].kvaddr,
    386					pkt->addr[pkt->index].dma_length,
    387					DMA_TO_DEVICE);
    388			if (dma_mapping_error(&dd->pcidev->dev,
    389					pkt->addr[pkt->index].addr)) {
    390				ret = -ENOMEM;
    391				goto done;
    392			}
    393			pkt->addr[pkt->index].dma_mapped = 1;
    394		}
    395
    396		goto done;
    397	}
    398
    399	/* If tid-sdma, advance tid info. */
    400	if (pkt->tiddma) {
    401		pkt->tidsm[pkt->tidsmidx].length -= newlen;
    402		if (pkt->tidsm[pkt->tidsmidx].length) {
    403			pkt->tidsm[pkt->tidsmidx].offset += newlen;
    404		} else {
    405			pkt->tidsmidx++;
    406			if (pkt->tidsmidx == pkt->tidsmcount) {
    407				ret = -EFAULT;
    408				goto done;
    409			}
    410		}
    411	}
    412
    413	/*
    414	 * If this is NOT the last descriptor (newlen == len),
    415	 * the current packet is not done yet, but the current
    416	 * send side page is done.
    417	 */
    418	if (lastdesc == 0)
    419		goto done;
    420
    421	/*
    422	 * When running this driver under PSM with a message size
    423	 * that fits into one transfer unit, it is not possible
    424	 * to reach this point; if we do, it is a bug.
    425	 */
    426
    427	/*
    428	 * Since the current packet is done and there are more
    429	 * bytes to go, we need to create a new sdma header, copying
    430	 * it from the previous sdma header and then modifying both.
    431	 */
    432	pbclen = pkt->addr[pkt->index].length;
    433	pbcvaddr = qib_user_sdma_alloc_header(pq, pbclen, &pbcdaddr);
    434	if (!pbcvaddr) {
    435		ret = -ENOMEM;
    436		goto done;
    437	}
    438	/* Copy the previous sdma header to new sdma header */
    439	pbc16 = (__le16 *)pkt->addr[pkt->index].kvaddr;
    440	memcpy(pbcvaddr, pbc16, pbclen);
    441
    442	/* Modify the previous sdma header */
    443	hdr = (struct qib_message_header *)&pbc16[4];
    444
    445	/* New pbc length */
    446	pbc16[0] = cpu_to_le16(le16_to_cpu(pbc16[0])-(pkt->bytes_togo>>2));
    447
    448	/* New packet length */
    449	hdr->lrh[2] = cpu_to_be16(le16_to_cpu(pbc16[0]));
    450
    451	if (pkt->tiddma) {
    452		/* turn on the header suppression */
    453		hdr->iph.pkt_flags =
    454			cpu_to_le16(le16_to_cpu(hdr->iph.pkt_flags)|0x2);
    455		/* turn off ACK_REQ: 0x04 and EXPECTED_DONE: 0x20 */
    456		hdr->flags &= ~(0x04|0x20);
    457	} else {
    458		/* turn off extra bytes: 20-21 bits */
    459		hdr->bth[0] = cpu_to_be32(be32_to_cpu(hdr->bth[0])&0xFFCFFFFF);
    460		/* turn off ACK_REQ: 0x04 */
    461		hdr->flags &= ~(0x04);
    462	}
    463
    464	/* New kdeth checksum */
    465	vcto = le32_to_cpu(hdr->iph.ver_ctxt_tid_offset);
    466	hdr->iph.chksum = cpu_to_le16(QIB_LRH_BTH +
    467		be16_to_cpu(hdr->lrh[2]) -
    468		((vcto>>16)&0xFFFF) - (vcto&0xFFFF) -
    469		le16_to_cpu(hdr->iph.pkt_flags));
    470
    471	/* The packet is done but the header is not dma mapped yet;
    472	 * it must have come from kmalloc. */
    473	if (!pkt->addr[pkt->index].addr) {
    474		pkt->addr[pkt->index].addr =
    475			dma_map_single(&dd->pcidev->dev,
    476				pkt->addr[pkt->index].kvaddr,
    477				pkt->addr[pkt->index].dma_length,
    478				DMA_TO_DEVICE);
    479		if (dma_mapping_error(&dd->pcidev->dev,
    480				pkt->addr[pkt->index].addr)) {
    481			ret = -ENOMEM;
    482			goto done;
    483		}
    484		pkt->addr[pkt->index].dma_mapped = 1;
    485	}
    486
    487	/* Modify the new sdma header */
    488	pbc16 = (__le16 *)pbcvaddr;
    489	hdr = (struct qib_message_header *)&pbc16[4];
    490
    491	/* New pbc length */
    492	pbc16[0] = cpu_to_le16(le16_to_cpu(pbc16[0])-(pkt->payload_size>>2));
    493
    494	/* New packet length */
    495	hdr->lrh[2] = cpu_to_be16(le16_to_cpu(pbc16[0]));
    496
    497	if (pkt->tiddma) {
    498		/* Set new tid and offset for new sdma header */
    499		hdr->iph.ver_ctxt_tid_offset = cpu_to_le32(
    500			(le32_to_cpu(hdr->iph.ver_ctxt_tid_offset)&0xFF000000) +
    501			(pkt->tidsm[pkt->tidsmidx].tid<<QLOGIC_IB_I_TID_SHIFT) +
    502			(pkt->tidsm[pkt->tidsmidx].offset>>2));
    503	} else {
    504		/* Middle protocol new packet offset */
    505		hdr->uwords[2] += pkt->payload_size;
    506	}
    507
    508	/* New kdeth checksum */
    509	vcto = le32_to_cpu(hdr->iph.ver_ctxt_tid_offset);
    510	hdr->iph.chksum = cpu_to_le16(QIB_LRH_BTH +
    511		be16_to_cpu(hdr->lrh[2]) -
    512		((vcto>>16)&0xFFFF) - (vcto&0xFFFF) -
    513		le16_to_cpu(hdr->iph.pkt_flags));
    514
    515	/* Next sequence number in new sdma header */
    516	seqnum.val = be32_to_cpu(hdr->bth[2]);
    517	if (pkt->tiddma)
    518		seqnum.seq++;
    519	else
    520		seqnum.pkt++;
    521	hdr->bth[2] = cpu_to_be32(seqnum.val);
    522
    523	/* Init new sdma header. */
    524	qib_user_sdma_init_frag(pkt, pkt->naddr, /* index */
    525		0, pbclen,		/* offset, len */
    526		1, 0,			/* first last desc */
    527		0, 0,			/* put page, dma mapped */
    528		NULL, pbcvaddr,		/* struct page, virt addr */
    529		pbcdaddr, pbclen);	/* dma addr, dma length */
    530	pkt->index = pkt->naddr;
    531	pkt->payload_size = 0;
    532	pkt->naddr++;
    533	if (pkt->naddr == pkt->addrlimit) {
    534		ret = -EFAULT;
    535		goto done;
    536	}
    537
    538	/* Prepare for next fragment in this page */
    539	if (newlen != len) {
    540		if (dma_mapped) {
    541			put = 0;
    542			dma_mapped = 0;
    543			page = NULL;
    544			kvaddr = NULL;
    545		}
    546		len -= newlen;
    547		offset += newlen;
    548
    549		goto next_fragment;
    550	}
    551
    552done:
    553	return ret;
    554}
    555
    556/* we have too many pages in the iovec, so coalesce them into a single page */
    557static int qib_user_sdma_coalesce(const struct qib_devdata *dd,
    558				  struct qib_user_sdma_queue *pq,
    559				  struct qib_user_sdma_pkt *pkt,
    560				  const struct iovec *iov,
    561				  unsigned long niov)
    562{
    563	int ret = 0;
    564	struct page *page = alloc_page(GFP_KERNEL);
    565	void *mpage_save;
    566	char *mpage;
    567	int i;
    568	int len = 0;
    569
    570	if (!page) {
    571		ret = -ENOMEM;
    572		goto done;
    573	}
    574
    575	mpage = kmap(page);
    576	mpage_save = mpage;
    577	for (i = 0; i < niov; i++) {
    578		int cfur;
    579
    580		cfur = copy_from_user(mpage,
    581				      iov[i].iov_base, iov[i].iov_len);
    582		if (cfur) {
    583			ret = -EFAULT;
    584			goto free_unmap;
    585		}
    586
    587		mpage += iov[i].iov_len;
    588		len += iov[i].iov_len;
    589	}
    590
    591	ret = qib_user_sdma_page_to_frags(dd, pq, pkt,
    592			page, 0, 0, len, mpage_save);
    593	goto done;
    594
    595free_unmap:
    596	kunmap(page);
    597	__free_page(page);
    598done:
    599	return ret;
    600}
    601
    602/*
    603 * How many pages in this iovec element?
    604 */
    605static size_t qib_user_sdma_num_pages(const struct iovec *iov)
    606{
    607	const unsigned long addr  = (unsigned long) iov->iov_base;
    608	const unsigned long  len  = iov->iov_len;
    609	const unsigned long spage = addr & PAGE_MASK;
    610	const unsigned long epage = (addr + len - 1) & PAGE_MASK;
    611
    612	return 1 + ((epage - spage) >> PAGE_SHIFT);
    613}
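        /*
         * Example: a 16 byte iovec that starts 4 bytes before a page boundary
         * has spage and epage one page apart, so the function above reports
         * 2 pages even though the length is far below PAGE_SIZE.
         */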
    614
    615static void qib_user_sdma_free_pkt_frag(struct device *dev,
    616					struct qib_user_sdma_queue *pq,
    617					struct qib_user_sdma_pkt *pkt,
    618					int frag)
    619{
    620	const int i = frag;
    621
    622	if (pkt->addr[i].page) {
    623		/* only user data has page */
    624		if (pkt->addr[i].dma_mapped)
    625			dma_unmap_page(dev,
    626				       pkt->addr[i].addr,
    627				       pkt->addr[i].dma_length,
    628				       DMA_TO_DEVICE);
    629
    630		if (pkt->addr[i].kvaddr)
    631			kunmap(pkt->addr[i].page);
    632
    633		if (pkt->addr[i].put_page)
    634			unpin_user_page(pkt->addr[i].page);
    635		else
    636			__free_page(pkt->addr[i].page);
    637	} else if (pkt->addr[i].kvaddr) {
    638		/* for headers */
    639		if (pkt->addr[i].dma_mapped) {
    640			/* from kmalloc & dma mapped */
    641			dma_unmap_single(dev,
    642				       pkt->addr[i].addr,
    643				       pkt->addr[i].dma_length,
    644				       DMA_TO_DEVICE);
    645			kfree(pkt->addr[i].kvaddr);
    646		} else if (pkt->addr[i].addr) {
    647			/* free coherent mem from cache... */
    648			dma_pool_free(pq->header_cache,
    649			      pkt->addr[i].kvaddr, pkt->addr[i].addr);
    650		} else {
    651			/* from kmalloc but not dma mapped */
    652			kfree(pkt->addr[i].kvaddr);
    653		}
    654	}
    655}
    656
    657/* return number of pages pinned... */
    658static int qib_user_sdma_pin_pages(const struct qib_devdata *dd,
    659				   struct qib_user_sdma_queue *pq,
    660				   struct qib_user_sdma_pkt *pkt,
    661				   unsigned long addr, int tlen, size_t npages)
    662{
    663	struct page *pages[8];
    664	int i, j;
    665	int ret = 0;
    666
    667	while (npages) {
    668		if (npages > 8)
    669			j = 8;
    670		else
    671			j = npages;
    672
    673		ret = pin_user_pages_fast(addr, j, FOLL_LONGTERM, pages);
    674		if (ret != j) {
    675			i = 0;
    676			j = ret;
    677			ret = -ENOMEM;
    678			goto free_pages;
    679		}
    680
    681		for (i = 0; i < j; i++) {
    682			/* map the pages... */
    683			unsigned long fofs = addr & ~PAGE_MASK;
    684			int flen = ((fofs + tlen) > PAGE_SIZE) ?
    685				(PAGE_SIZE - fofs) : tlen;
    686
    687			ret = qib_user_sdma_page_to_frags(dd, pq, pkt,
    688				pages[i], 1, fofs, flen, NULL);
    689			if (ret < 0) {
    690				/* the current page has been taken
    691				 * care of inside the above call.
    692				 */
    693				i++;
    694				goto free_pages;
    695			}
    696
    697			addr += flen;
    698			tlen -= flen;
    699		}
    700
    701		npages -= j;
    702	}
    703
    704	goto done;
    705
    706	/* if error, return all pages not managed by pkt */
    707free_pages:
    708	while (i < j)
    709		unpin_user_page(pages[i++]);
    710
    711done:
    712	return ret;
    713}
    714
    715static int qib_user_sdma_pin_pkt(const struct qib_devdata *dd,
    716				 struct qib_user_sdma_queue *pq,
    717				 struct qib_user_sdma_pkt *pkt,
    718				 const struct iovec *iov,
    719				 unsigned long niov)
    720{
    721	int ret = 0;
    722	unsigned long idx;
    723
    724	for (idx = 0; idx < niov; idx++) {
    725		const size_t npages = qib_user_sdma_num_pages(iov + idx);
    726		const unsigned long addr = (unsigned long) iov[idx].iov_base;
    727
    728		ret = qib_user_sdma_pin_pages(dd, pq, pkt, addr,
    729					      iov[idx].iov_len, npages);
    730		if (ret < 0)
    731			goto free_pkt;
    732	}
    733
    734	goto done;
    735
    736free_pkt:
    737	/* skip the first entry (the header) here; it is handled just below */
    738	for (idx = 1; idx < pkt->naddr; idx++)
    739		qib_user_sdma_free_pkt_frag(&dd->pcidev->dev, pq, pkt, idx);
    740
    741	/* We need to dma unmap the first entry to restore it to its
    742	 * original state so that the caller can free the memory on
    743	 * error; the caller does not know whether it was dma mapped. */
    744	if (pkt->addr[0].dma_mapped) {
    745		dma_unmap_single(&dd->pcidev->dev,
    746		       pkt->addr[0].addr,
    747		       pkt->addr[0].dma_length,
    748		       DMA_TO_DEVICE);
    749		pkt->addr[0].addr = 0;
    750		pkt->addr[0].dma_mapped = 0;
    751	}
    752
    753done:
    754	return ret;
    755}
    756
    757static int qib_user_sdma_init_payload(const struct qib_devdata *dd,
    758				      struct qib_user_sdma_queue *pq,
    759				      struct qib_user_sdma_pkt *pkt,
    760				      const struct iovec *iov,
    761				      unsigned long niov, int npages)
    762{
    763	int ret = 0;
    764
    765	if (pkt->frag_size == pkt->bytes_togo &&
    766			npages >= ARRAY_SIZE(pkt->addr))
    767		ret = qib_user_sdma_coalesce(dd, pq, pkt, iov, niov);
    768	else
    769		ret = qib_user_sdma_pin_pkt(dd, pq, pkt, iov, niov);
    770
    771	return ret;
    772}
    773
    774/* free a packet list */
    775static void qib_user_sdma_free_pkt_list(struct device *dev,
    776					struct qib_user_sdma_queue *pq,
    777					struct list_head *list)
    778{
    779	struct qib_user_sdma_pkt *pkt, *pkt_next;
    780
    781	list_for_each_entry_safe(pkt, pkt_next, list, list) {
    782		int i;
    783
    784		for (i = 0; i < pkt->naddr; i++)
    785			qib_user_sdma_free_pkt_frag(dev, pq, pkt, i);
    786
    787		if (pkt->largepkt)
    788			kfree(pkt);
    789		else
    790			kmem_cache_free(pq->pkt_slab, pkt);
    791	}
    792	INIT_LIST_HEAD(list);
    793}
    794
    795/*
    796 * copy headers, coalesce etc -- pq->lock must be held
    797 *
    798 * we queue all the packets on the list, returning the
    799 * number of iovec entries consumed.  list must be empty
    800 * initially, as, if there is an error, we clean it...
    801 */
    802static int qib_user_sdma_queue_pkts(const struct qib_devdata *dd,
    803				    struct qib_pportdata *ppd,
    804				    struct qib_user_sdma_queue *pq,
    805				    const struct iovec *iov,
    806				    unsigned long niov,
    807				    struct list_head *list,
    808				    int *maxpkts, int *ndesc)
    809{
    810	unsigned long idx = 0;
    811	int ret = 0;
    812	int npkts = 0;
    813	__le32 *pbc;
    814	dma_addr_t dma_addr;
    815	struct qib_user_sdma_pkt *pkt = NULL;
    816	size_t len;
    817	size_t nw;
    818	u32 counter = pq->counter;
    819	u16 frag_size;
    820
    821	while (idx < niov && npkts < *maxpkts) {
    822		const unsigned long addr = (unsigned long) iov[idx].iov_base;
    823		const unsigned long idx_save = idx;
    824		unsigned pktnw;
    825		unsigned pktnwc;
    826		int nfrags = 0;
    827		size_t npages = 0;
    828		size_t bytes_togo = 0;
    829		int tiddma = 0;
    830		int cfur;
    831
    832		len = iov[idx].iov_len;
    833		nw = len >> 2;
    834
    835		if (len < QIB_USER_SDMA_MIN_HEADER_LENGTH ||
    836		    len > PAGE_SIZE || len & 3 || addr & 3) {
    837			ret = -EINVAL;
    838			goto free_list;
    839		}
    840
    841		pbc = qib_user_sdma_alloc_header(pq, len, &dma_addr);
    842		if (!pbc) {
    843			ret = -ENOMEM;
    844			goto free_list;
    845		}
    846
    847		cfur = copy_from_user(pbc, iov[idx].iov_base, len);
    848		if (cfur) {
    849			ret = -EFAULT;
    850			goto free_pbc;
    851		}
    852
    853		/*
    854		 * This assignment is a bit strange.  It's because the
    855		 * pbc counts the number of 32 bit words in the full
    856		 * packet _except_ the first word of the pbc itself...
    857		 */
    858		pktnwc = nw - 1;
    859
    860		/*
    861		 * pktnw computation yields the number of 32 bit words
    862		 * that the caller has indicated in the PBC.  note that
    863		 * this is one less than the total number of words that
    864		 * goes to the send DMA engine as the first 32 bit word
    865		 * of the PBC itself is not counted.  Armed with this count,
    866		 * we can verify that the packet is consistent with the
    867		 * iovec lengths.
    868		 */
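        		/*
        		 * For example, a 64 byte header iovec gives nw = 16 and
        		 * pktnwc = 15; if the PBC word count (pktnw) below is 31,
        		 * the loop that follows must find another 16 dwords
        		 * (64 bytes) of payload in the subsequent iovec entries.
        		 */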
    869		pktnw = le32_to_cpu(*pbc) & 0xFFFF;
    870		if (pktnw < pktnwc) {
    871			ret = -EINVAL;
    872			goto free_pbc;
    873		}
    874
    875		idx++;
    876		while (pktnwc < pktnw && idx < niov) {
    877			const size_t slen = iov[idx].iov_len;
    878			const unsigned long faddr =
    879				(unsigned long) iov[idx].iov_base;
    880
    881			if (slen & 3 || faddr & 3 || !slen) {
    882				ret = -EINVAL;
    883				goto free_pbc;
    884			}
    885
    886			npages += qib_user_sdma_num_pages(&iov[idx]);
    887
    888			if (check_add_overflow(bytes_togo, slen, &bytes_togo) ||
    889			    bytes_togo > type_max(typeof(pkt->bytes_togo))) {
    890				ret = -EINVAL;
    891				goto free_pbc;
    892			}
    893			pktnwc += slen >> 2;
    894			idx++;
    895			nfrags++;
    896		}
    897
    898		if (pktnwc != pktnw) {
    899			ret = -EINVAL;
    900			goto free_pbc;
    901		}
    902
    903		frag_size = ((le32_to_cpu(*pbc))>>16) & 0xFFFF;
    904		if (((frag_size ? frag_size : bytes_togo) + len) >
    905						ppd->ibmaxlen) {
    906			ret = -EINVAL;
    907			goto free_pbc;
    908		}
    909
    910		if (frag_size) {
    911			size_t tidsmsize, n, pktsize, sz, addrlimit;
    912
    913			n = npages*((2*PAGE_SIZE/frag_size)+1);
    914			pktsize = struct_size(pkt, addr, n);
    915
    916			/*
    917			 * Determine if this is tid-sdma or just sdma.
    918			 */
    919			tiddma = (((le32_to_cpu(pbc[7])>>
    920				QLOGIC_IB_I_TID_SHIFT)&
    921				QLOGIC_IB_I_TID_MASK) !=
    922				QLOGIC_IB_I_TID_MASK);
    923
    924			if (tiddma)
    925				tidsmsize = iov[idx].iov_len;
    926			else
    927				tidsmsize = 0;
    928
    929			if (check_add_overflow(pktsize, tidsmsize, &sz)) {
    930				ret = -EINVAL;
    931				goto free_pbc;
    932			}
    933			pkt = kmalloc(sz, GFP_KERNEL);
    934			if (!pkt) {
    935				ret = -ENOMEM;
    936				goto free_pbc;
    937			}
    938			pkt->largepkt = 1;
    939			pkt->frag_size = frag_size;
    940			if (check_add_overflow(n, ARRAY_SIZE(pkt->addr),
    941					       &addrlimit) ||
    942			    addrlimit > type_max(typeof(pkt->addrlimit))) {
    943				ret = -EINVAL;
    944				goto free_pkt;
    945			}
    946			pkt->addrlimit = addrlimit;
    947
    948			if (tiddma) {
    949				char *tidsm = (char *)pkt + pktsize;
    950
    951				cfur = copy_from_user(tidsm,
    952					iov[idx].iov_base, tidsmsize);
    953				if (cfur) {
    954					ret = -EFAULT;
    955					goto free_pkt;
    956				}
    957				pkt->tidsm =
    958					(struct qib_tid_session_member *)tidsm;
    959				pkt->tidsmcount = tidsmsize/
    960					sizeof(struct qib_tid_session_member);
    961				pkt->tidsmidx = 0;
    962				idx++;
    963			}
    964
    965			/*
    966			 * The pbc 'fill1' field is borrowed to pass the frag size;
    967			 * we need to clear it after reading the frag size, since
    968			 * the hardware requires this field to be zero.
    969			 */
    970			*pbc = cpu_to_le32(le32_to_cpu(*pbc) & 0x0000FFFF);
    971		} else {
    972			pkt = kmem_cache_alloc(pq->pkt_slab, GFP_KERNEL);
    973			if (!pkt) {
    974				ret = -ENOMEM;
    975				goto free_pbc;
    976			}
    977			pkt->largepkt = 0;
    978			pkt->frag_size = bytes_togo;
    979			pkt->addrlimit = ARRAY_SIZE(pkt->addr);
    980		}
    981		pkt->bytes_togo = bytes_togo;
    982		pkt->payload_size = 0;
    983		pkt->counter = counter;
    984		pkt->tiddma = tiddma;
    985
    986		/* setup the first header */
    987		qib_user_sdma_init_frag(pkt, 0, /* index */
    988			0, len,		/* offset, len */
    989			1, 0,		/* first last desc */
    990			0, 0,		/* put page, dma mapped */
    991			NULL, pbc,	/* struct page, virt addr */
    992			dma_addr, len);	/* dma addr, dma length */
    993		pkt->index = 0;
    994		pkt->naddr = 1;
    995
    996		if (nfrags) {
    997			ret = qib_user_sdma_init_payload(dd, pq, pkt,
    998							 iov + idx_save + 1,
    999							 nfrags, npages);
   1000			if (ret < 0)
   1001				goto free_pkt;
   1002		} else {
   1003			/* since there is no payload, mark the
   1004			 * header as the last desc. */
   1005			pkt->addr[0].last_desc = 1;
   1006
   1007			if (dma_addr == 0) {
   1008				/*
   1009				 * the header is not dma mapped yet,
   1010				 * so it must have come from kmalloc.
   1011				 */
   1012				dma_addr = dma_map_single(&dd->pcidev->dev,
   1013					pbc, len, DMA_TO_DEVICE);
   1014				if (dma_mapping_error(&dd->pcidev->dev,
   1015								dma_addr)) {
   1016					ret = -ENOMEM;
   1017					goto free_pkt;
   1018				}
   1019				pkt->addr[0].addr = dma_addr;
   1020				pkt->addr[0].dma_mapped = 1;
   1021			}
   1022		}
   1023
   1024		counter++;
   1025		npkts++;
   1026		pkt->pq = pq;
   1027		pkt->index = 0; /* reset index for push on hw */
   1028		*ndesc += pkt->naddr;
   1029
   1030		list_add_tail(&pkt->list, list);
   1031	}
   1032
   1033	*maxpkts = npkts;
   1034	ret = idx;
   1035	goto done;
   1036
   1037free_pkt:
   1038	if (pkt->largepkt)
   1039		kfree(pkt);
   1040	else
   1041		kmem_cache_free(pq->pkt_slab, pkt);
   1042free_pbc:
   1043	if (dma_addr)
   1044		dma_pool_free(pq->header_cache, pbc, dma_addr);
   1045	else
   1046		kfree(pbc);
   1047free_list:
   1048	qib_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, list);
   1049done:
   1050	return ret;
   1051}
   1052
   1053static void qib_user_sdma_set_complete_counter(struct qib_user_sdma_queue *pq,
   1054					       u32 c)
   1055{
   1056	pq->sent_counter = c;
   1057}
   1058
   1059/* try to clean out queue -- needs pq->lock */
   1060static int qib_user_sdma_queue_clean(struct qib_pportdata *ppd,
   1061				     struct qib_user_sdma_queue *pq)
   1062{
   1063	struct qib_devdata *dd = ppd->dd;
   1064	struct list_head free_list;
   1065	struct qib_user_sdma_pkt *pkt;
   1066	struct qib_user_sdma_pkt *pkt_prev;
   1067	unsigned long flags;
   1068	int ret = 0;
   1069
   1070	if (!pq->num_sending)
   1071		return 0;
   1072
   1073	INIT_LIST_HEAD(&free_list);
   1074
   1075	/*
   1076	 * We need this spinlock here because the interrupt handler
   1077	 * might modify this list in qib_user_sdma_send_desc(); we also
   1078	 * must not be interrupted while holding it, or we deadlock.
   1079	 */
   1080	spin_lock_irqsave(&pq->sent_lock, flags);
   1081	list_for_each_entry_safe(pkt, pkt_prev, &pq->sent, list) {
   1082		s64 descd = ppd->sdma_descq_removed - pkt->added;
   1083
   1084		if (descd < 0)
   1085			break;
   1086
   1087		list_move_tail(&pkt->list, &free_list);
   1088
   1089		/* one more packet cleaned */
   1090		ret++;
   1091		pq->num_sending--;
   1092	}
   1093	spin_unlock_irqrestore(&pq->sent_lock, flags);
   1094
   1095	if (!list_empty(&free_list)) {
   1096		u32 counter;
   1097
   1098		pkt = list_entry(free_list.prev,
   1099				 struct qib_user_sdma_pkt, list);
   1100		counter = pkt->counter;
   1101
   1102		qib_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &free_list);
   1103		qib_user_sdma_set_complete_counter(pq, counter);
   1104	}
   1105
   1106	return ret;
   1107}
   1108
   1109void qib_user_sdma_queue_destroy(struct qib_user_sdma_queue *pq)
   1110{
   1111	if (!pq)
   1112		return;
   1113
   1114	pq->sdma_rb_node->refcount--;
   1115	if (pq->sdma_rb_node->refcount == 0) {
   1116		rb_erase(&pq->sdma_rb_node->node, &qib_user_sdma_rb_root);
   1117		kfree(pq->sdma_rb_node);
   1118	}
   1119	dma_pool_destroy(pq->header_cache);
   1120	kmem_cache_destroy(pq->pkt_slab);
   1121	kfree(pq);
   1122}
   1123
   1124/* clean descriptor queue, returns > 0 if some elements cleaned */
   1125static int qib_user_sdma_hwqueue_clean(struct qib_pportdata *ppd)
   1126{
   1127	int ret;
   1128	unsigned long flags;
   1129
   1130	spin_lock_irqsave(&ppd->sdma_lock, flags);
   1131	ret = qib_sdma_make_progress(ppd);
   1132	spin_unlock_irqrestore(&ppd->sdma_lock, flags);
   1133
   1134	return ret;
   1135}
   1136
   1137/* we're in close, drain packets so that we can clean up successfully... */
   1138void qib_user_sdma_queue_drain(struct qib_pportdata *ppd,
   1139			       struct qib_user_sdma_queue *pq)
   1140{
   1141	struct qib_devdata *dd = ppd->dd;
   1142	unsigned long flags;
   1143	int i;
   1144
   1145	if (!pq)
   1146		return;
   1147
   1148	for (i = 0; i < QIB_USER_SDMA_DRAIN_TIMEOUT; i++) {
   1149		mutex_lock(&pq->lock);
   1150		if (!pq->num_pending && !pq->num_sending) {
   1151			mutex_unlock(&pq->lock);
   1152			break;
   1153		}
   1154		qib_user_sdma_hwqueue_clean(ppd);
   1155		qib_user_sdma_queue_clean(ppd, pq);
   1156		mutex_unlock(&pq->lock);
   1157		msleep(20);
   1158	}
   1159
   1160	if (pq->num_pending || pq->num_sending) {
   1161		struct qib_user_sdma_pkt *pkt;
   1162		struct qib_user_sdma_pkt *pkt_prev;
   1163		struct list_head free_list;
   1164
   1165		mutex_lock(&pq->lock);
   1166		spin_lock_irqsave(&ppd->sdma_lock, flags);
   1167		/*
   1168		 * Since we hold sdma_lock, it is safe without sent_lock.
   1169		 */
   1170		if (pq->num_pending) {
   1171			list_for_each_entry_safe(pkt, pkt_prev,
   1172					&ppd->sdma_userpending, list) {
   1173				if (pkt->pq == pq) {
   1174					list_move_tail(&pkt->list, &pq->sent);
   1175					pq->num_pending--;
   1176					pq->num_sending++;
   1177				}
   1178			}
   1179		}
   1180		spin_unlock_irqrestore(&ppd->sdma_lock, flags);
   1181
   1182		qib_dev_err(dd, "user sdma lists not empty: forcing!\n");
   1183		INIT_LIST_HEAD(&free_list);
   1184		list_splice_init(&pq->sent, &free_list);
   1185		pq->num_sending = 0;
   1186		qib_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &free_list);
   1187		mutex_unlock(&pq->lock);
   1188	}
   1189}
   1190
   1191static inline __le64 qib_sdma_make_desc0(u8 gen,
   1192					 u64 addr, u64 dwlen, u64 dwoffset)
   1193{
   1194	return cpu_to_le64(/* SDmaPhyAddr[31:0] */
   1195			   ((addr & 0xfffffffcULL) << 32) |
   1196			   /* SDmaGeneration[1:0] */
   1197			   ((gen & 3ULL) << 30) |
   1198			   /* SDmaDwordCount[10:0] */
   1199			   ((dwlen & 0x7ffULL) << 16) |
   1200			   /* SDmaBufOffset[12:2] */
   1201			   (dwoffset & 0x7ffULL));
   1202}
   1203
   1204static inline __le64 qib_sdma_make_first_desc0(__le64 descq)
   1205{
   1206	return descq | cpu_to_le64(1ULL << 12);
   1207}
   1208
   1209static inline __le64 qib_sdma_make_last_desc0(__le64 descq)
   1210{
   1211					      /* last */  /* dma head */
   1212	return descq | cpu_to_le64(1ULL << 11 | 1ULL << 13);
   1213}
   1214
   1215static inline __le64 qib_sdma_make_desc1(u64 addr)
   1216{
   1217	/* SDmaPhyAddr[47:32] */
   1218	return cpu_to_le64(addr >> 32);
   1219}
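        /*
         * Taken together, the helpers above build descriptor qword 0 as:
         *   [63:32] SDmaPhyAddr[31:0] (low two bits forced to zero),
         *   [31:30] SDmaGeneration, [26:16] SDmaDwordCount,
         *   [15] interrupt request (set in qib_user_sdma_send_frag()),
         *   [14] large-buffer flag (set in qib_user_sdma_send_desc()),
         *   [13] dma head, [12] first descriptor, [11] last descriptor,
         *   [10:0] SDmaBufOffset[12:2];
         * qword 1 carries SDmaPhyAddr[47:32].
         */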
   1220
   1221static void qib_user_sdma_send_frag(struct qib_pportdata *ppd,
   1222				    struct qib_user_sdma_pkt *pkt, int idx,
   1223				    unsigned ofs, u16 tail, u8 gen)
   1224{
   1225	const u64 addr = (u64) pkt->addr[idx].addr +
   1226		(u64) pkt->addr[idx].offset;
   1227	const u64 dwlen = (u64) pkt->addr[idx].length / 4;
   1228	__le64 *descqp;
   1229	__le64 descq0;
   1230
   1231	descqp = &ppd->sdma_descq[tail].qw[0];
   1232
   1233	descq0 = qib_sdma_make_desc0(gen, addr, dwlen, ofs);
   1234	if (pkt->addr[idx].first_desc)
   1235		descq0 = qib_sdma_make_first_desc0(descq0);
   1236	if (pkt->addr[idx].last_desc) {
   1237		descq0 = qib_sdma_make_last_desc0(descq0);
   1238		if (ppd->sdma_intrequest) {
   1239			descq0 |= cpu_to_le64(1ULL << 15);
   1240			ppd->sdma_intrequest = 0;
   1241		}
   1242	}
   1243
   1244	descqp[0] = descq0;
   1245	descqp[1] = qib_sdma_make_desc1(addr);
   1246}
   1247
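        /*
         * Feed descriptors from pktlist to the hardware ring until the ring
         * fills up or the list drains.  Fully pushed packets are moved to
         * their queue's sent list under sent_lock; the tail on the chip is
         * only advanced once per pass, after the inner loop.
         */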
   1248void qib_user_sdma_send_desc(struct qib_pportdata *ppd,
   1249				struct list_head *pktlist)
   1250{
   1251	struct qib_devdata *dd = ppd->dd;
   1252	u16 nfree, nsent;
   1253	u16 tail, tail_c;
   1254	u8 gen, gen_c;
   1255
   1256	nfree = qib_sdma_descq_freecnt(ppd);
   1257	if (!nfree)
   1258		return;
   1259
   1260retry:
   1261	nsent = 0;
   1262	tail_c = tail = ppd->sdma_descq_tail;
   1263	gen_c = gen = ppd->sdma_generation;
   1264	while (!list_empty(pktlist)) {
   1265		struct qib_user_sdma_pkt *pkt =
   1266			list_entry(pktlist->next, struct qib_user_sdma_pkt,
   1267				   list);
   1268		int i, j, c = 0;
   1269		unsigned ofs = 0;
   1270		u16 dtail = tail;
   1271
   1272		for (i = pkt->index; i < pkt->naddr && nfree; i++) {
   1273			qib_user_sdma_send_frag(ppd, pkt, i, ofs, tail, gen);
   1274			ofs += pkt->addr[i].length >> 2;
   1275
   1276			if (++tail == ppd->sdma_descq_cnt) {
   1277				tail = 0;
   1278				++gen;
   1279				ppd->sdma_intrequest = 1;
   1280			} else if (tail == (ppd->sdma_descq_cnt>>1)) {
   1281				ppd->sdma_intrequest = 1;
   1282			}
   1283			nfree--;
   1284			if (pkt->addr[i].last_desc == 0)
   1285				continue;
   1286
   1287			/*
   1288			 * If the packet is >= 2KB mtu equivalent, we
   1289			 * have to use the large buffers, and have to
   1290			 * mark each descriptor as part of a large
   1291			 * buffer packet.
   1292			 */
   1293			if (ofs > dd->piosize2kmax_dwords) {
   1294				for (j = pkt->index; j <= i; j++) {
   1295					ppd->sdma_descq[dtail].qw[0] |=
   1296						cpu_to_le64(1ULL << 14);
   1297					if (++dtail == ppd->sdma_descq_cnt)
   1298						dtail = 0;
   1299				}
   1300			}
   1301			c += i + 1 - pkt->index;
   1302			pkt->index = i + 1; /* index for next first */
   1303			tail_c = dtail = tail;
   1304			gen_c = gen;
   1305			ofs = 0;  /* reset for next packet */
   1306		}
   1307
   1308		ppd->sdma_descq_added += c;
   1309		nsent += c;
   1310		if (pkt->index == pkt->naddr) {
   1311			pkt->added = ppd->sdma_descq_added;
   1312			pkt->pq->added = pkt->added;
   1313			pkt->pq->num_pending--;
   1314			spin_lock(&pkt->pq->sent_lock);
   1315			pkt->pq->num_sending++;
   1316			list_move_tail(&pkt->list, &pkt->pq->sent);
   1317			spin_unlock(&pkt->pq->sent_lock);
   1318		}
   1319		if (!nfree || (nsent<<2) > ppd->sdma_descq_cnt)
   1320			break;
   1321	}
   1322
   1323	/* advance the tail on the chip if necessary */
   1324	if (ppd->sdma_descq_tail != tail_c) {
   1325		ppd->sdma_generation = gen_c;
   1326		dd->f_sdma_update_tail(ppd, tail_c);
   1327	}
   1328
   1329	if (nfree && !list_empty(pktlist))
   1330		goto retry;
   1331}
   1332
   1333/* pq->lock must be held, get packets on the wire... */
   1334static int qib_user_sdma_push_pkts(struct qib_pportdata *ppd,
   1335				 struct qib_user_sdma_queue *pq,
   1336				 struct list_head *pktlist, int count)
   1337{
   1338	unsigned long flags;
   1339
   1340	if (unlikely(!(ppd->lflags & QIBL_LINKACTIVE)))
   1341		return -ECOMM;
   1342
   1343	/* non-blocking mode */
   1344	if (pq->sdma_rb_node->refcount > 1) {
   1345		spin_lock_irqsave(&ppd->sdma_lock, flags);
   1346		if (unlikely(!__qib_sdma_running(ppd))) {
   1347			spin_unlock_irqrestore(&ppd->sdma_lock, flags);
   1348			return -ECOMM;
   1349		}
   1350		pq->num_pending += count;
   1351		list_splice_tail_init(pktlist, &ppd->sdma_userpending);
   1352		qib_user_sdma_send_desc(ppd, &ppd->sdma_userpending);
   1353		spin_unlock_irqrestore(&ppd->sdma_lock, flags);
   1354		return 0;
   1355	}
   1356
   1357	/* In this case, descriptors from this process are not
   1358	 * linked to the ppd pending queue and the interrupt handler
   1359	 * won't touch this process's packets, so it is OK to modify
   1360	 * the counts directly without the sdma lock.
   1361	 */
   1362
   1363
   1364	pq->num_pending += count;
   1365	/*
   1366	 * Blocking mode for a single rail process: we must
   1367	 * release/regain sdma_lock to give other processes a
   1368	 * chance to make progress. This is important for
   1369	 * performance.
   1370	 */
   1371	do {
   1372		spin_lock_irqsave(&ppd->sdma_lock, flags);
   1373		if (unlikely(!__qib_sdma_running(ppd))) {
   1374			spin_unlock_irqrestore(&ppd->sdma_lock, flags);
   1375			return -ECOMM;
   1376		}
   1377		qib_user_sdma_send_desc(ppd, pktlist);
   1378		if (!list_empty(pktlist))
   1379			qib_sdma_make_progress(ppd);
   1380		spin_unlock_irqrestore(&ppd->sdma_lock, flags);
   1381	} while (!list_empty(pktlist));
   1382
   1383	return 0;
   1384}
   1385
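        /*
         * Entry point for user sdma writev requests: parse the iovecs into
         * packets, lazily reclaim completed descriptors, and push the new
         * packets to the hardware.  Returns the number of packets queued or
         * a negative errno.
         */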
   1386int qib_user_sdma_writev(struct qib_ctxtdata *rcd,
   1387			 struct qib_user_sdma_queue *pq,
   1388			 const struct iovec *iov,
   1389			 unsigned long dim)
   1390{
   1391	struct qib_devdata *dd = rcd->dd;
   1392	struct qib_pportdata *ppd = rcd->ppd;
   1393	int ret = 0;
   1394	struct list_head list;
   1395	int npkts = 0;
   1396
   1397	INIT_LIST_HEAD(&list);
   1398
   1399	mutex_lock(&pq->lock);
   1400
   1401	/* why not -ECOMM like qib_user_sdma_push_pkts() below? */
   1402	if (!qib_sdma_running(ppd))
   1403		goto done_unlock;
   1404
   1405	/* if I have packets not complete yet */
   1406	if (pq->added > ppd->sdma_descq_removed)
   1407		qib_user_sdma_hwqueue_clean(ppd);
   1408	/* if I have complete packets to be freed */
   1409	if (pq->num_sending)
   1410		qib_user_sdma_queue_clean(ppd, pq);
   1411
   1412	while (dim) {
   1413		int mxp = 1;
   1414		int ndesc = 0;
   1415
   1416		ret = qib_user_sdma_queue_pkts(dd, ppd, pq,
   1417				iov, dim, &list, &mxp, &ndesc);
   1418		if (ret < 0)
   1419			goto done_unlock;
   1420		else {
   1421			dim -= ret;
   1422			iov += ret;
   1423		}
   1424
   1425		/* force packets onto the sdma hw queue... */
   1426		if (!list_empty(&list)) {
   1427			/*
   1428			 * Lazily clean hw queue.
   1429			 */
   1430			if (qib_sdma_descq_freecnt(ppd) < ndesc) {
   1431				qib_user_sdma_hwqueue_clean(ppd);
   1432				if (pq->num_sending)
   1433					qib_user_sdma_queue_clean(ppd, pq);
   1434			}
   1435
   1436			ret = qib_user_sdma_push_pkts(ppd, pq, &list, mxp);
   1437			if (ret < 0)
   1438				goto done_unlock;
   1439			else {
   1440				npkts += mxp;
   1441				pq->counter += mxp;
   1442			}
   1443		}
   1444	}
   1445
   1446done_unlock:
   1447	if (!list_empty(&list))
   1448		qib_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &list);
   1449	mutex_unlock(&pq->lock);
   1450
   1451	return (ret < 0) ? ret : npkts;
   1452}
   1453
   1454int qib_user_sdma_make_progress(struct qib_pportdata *ppd,
   1455				struct qib_user_sdma_queue *pq)
   1456{
   1457	int ret = 0;
   1458
   1459	mutex_lock(&pq->lock);
   1460	qib_user_sdma_hwqueue_clean(ppd);
   1461	ret = qib_user_sdma_queue_clean(ppd, pq);
   1462	mutex_unlock(&pq->lock);
   1463
   1464	return ret;
   1465}
   1466
   1467u32 qib_user_sdma_complete_counter(const struct qib_user_sdma_queue *pq)
   1468{
   1469	return pq ? pq->sent_counter : 0;
   1470}
   1471
   1472u32 qib_user_sdma_inflight_counter(struct qib_user_sdma_queue *pq)
   1473{
   1474	return pq ? pq->counter : 0;
   1475}