cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

sge.c (95891B)


      1/*
      2 * Copyright (c) 2005-2008 Chelsio, Inc. All rights reserved.
      3 *
      4 * This software is available to you under a choice of one of two
      5 * licenses.  You may choose to be licensed under the terms of the GNU
      6 * General Public License (GPL) Version 2, available from the file
      7 * COPYING in the main directory of this source tree, or the
      8 * OpenIB.org BSD license below:
      9 *
     10 *     Redistribution and use in source and binary forms, with or
     11 *     without modification, are permitted provided that the following
     12 *     conditions are met:
     13 *
     14 *      - Redistributions of source code must retain the above
     15 *        copyright notice, this list of conditions and the following
     16 *        disclaimer.
     17 *
     18 *      - Redistributions in binary form must reproduce the above
     19 *        copyright notice, this list of conditions and the following
     20 *        disclaimer in the documentation and/or other materials
     21 *        provided with the distribution.
     22 *
     23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
     24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
     25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
     26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
     27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
     28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
     29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
     30 * SOFTWARE.
     31 */
     32#include <linux/skbuff.h>
     33#include <linux/netdevice.h>
     34#include <linux/etherdevice.h>
     35#include <linux/if_vlan.h>
     36#include <linux/ip.h>
     37#include <linux/tcp.h>
     38#include <linux/dma-mapping.h>
     39#include <linux/slab.h>
     40#include <linux/prefetch.h>
     41#include <net/arp.h>
     42#include "common.h"
     43#include "regs.h"
     44#include "sge_defs.h"
     45#include "t3_cpl.h"
     46#include "firmware_exports.h"
     47#include "cxgb3_offload.h"
     48
     49#define USE_GTS 0
     50
     51#define SGE_RX_SM_BUF_SIZE 1536
     52
     53#define SGE_RX_COPY_THRES  256
     54#define SGE_RX_PULL_LEN    128
     55
     56#define SGE_PG_RSVD SMP_CACHE_BYTES
     57/*
     58 * Page chunk size for FL0 buffers if FL0 is to be populated with page chunks.
     59 * It must be a divisor of PAGE_SIZE.  If set to 0 FL0 will use sk_buffs
     60 * directly.
     61 */
     62#define FL0_PG_CHUNK_SIZE  2048
     63#define FL0_PG_ORDER 0
     64#define FL0_PG_ALLOC_SIZE (PAGE_SIZE << FL0_PG_ORDER)
     65#define FL1_PG_CHUNK_SIZE (PAGE_SIZE > 8192 ? 16384 : 8192)
     66#define FL1_PG_ORDER (PAGE_SIZE > 8192 ? 0 : 1)
     67#define FL1_PG_ALLOC_SIZE (PAGE_SIZE << FL1_PG_ORDER)
     68
     69#define SGE_RX_DROP_THRES 16
     70#define RX_RECLAIM_PERIOD (HZ/4)
     71
     72/*
     73 * Max number of Rx buffers we replenish at a time.
     74 */
     75#define MAX_RX_REFILL 16U
     76/*
     77 * Period of the Tx buffer reclaim timer.  This timer does not need to run
     78 * frequently as Tx buffers are usually reclaimed by new Tx packets.
     79 */
     80#define TX_RECLAIM_PERIOD (HZ / 4)
     81#define TX_RECLAIM_TIMER_CHUNK 64U
     82#define TX_RECLAIM_CHUNK 16U
     83
     84/* WR size in bytes */
     85#define WR_LEN (WR_FLITS * 8)
     86
     87/*
     88 * Types of Tx queues in each queue set.  Order here matters, do not change.
     89 */
     90enum { TXQ_ETH, TXQ_OFLD, TXQ_CTRL };
     91
     92/* Values for sge_txq.flags */
     93enum {
     94	TXQ_RUNNING = 1 << 0,	/* fetch engine is running */
     95	TXQ_LAST_PKT_DB = 1 << 1,	/* last packet rang the doorbell */
     96};
     97
     98struct tx_desc {
     99	__be64 flit[TX_DESC_FLITS];
    100};
    101
    102struct rx_desc {
    103	__be32 addr_lo;
    104	__be32 len_gen;
    105	__be32 gen2;
    106	__be32 addr_hi;
    107};
    108
    109struct tx_sw_desc {		/* SW state per Tx descriptor */
    110	struct sk_buff *skb;
    111	u8 eop;       /* set if last descriptor for packet */
    112	u8 addr_idx;  /* buffer index of first SGL entry in descriptor */
    113	u8 fragidx;   /* first page fragment associated with descriptor */
    114	s8 sflit;     /* start flit of first SGL entry in descriptor */
    115};
    116
    117struct rx_sw_desc {                /* SW state per Rx descriptor */
    118	union {
    119		struct sk_buff *skb;
    120		struct fl_pg_chunk pg_chunk;
    121	};
    122	DEFINE_DMA_UNMAP_ADDR(dma_addr);
    123};
    124
    125struct rsp_desc {		/* response queue descriptor */
    126	struct rss_header rss_hdr;
    127	__be32 flags;
    128	__be32 len_cq;
    129	struct_group(immediate,
    130		u8 imm_data[47];
    131		u8 intr_gen;
    132	);
    133};
    134
    135/*
    136 * Holds unmapping information for Tx packets that need deferred unmapping.
    137 * This structure lives at skb->head and must be allocated by callers.
    138 */
    139struct deferred_unmap_info {
    140	struct pci_dev *pdev;
    141	dma_addr_t addr[MAX_SKB_FRAGS + 1];
    142};
    143
    144/*
    145 * Maps a number of flits to the number of Tx descriptors that can hold them.
    146 * The formula is
    147 *
    148 * desc = 1 + (flits - 2) / (WR_FLITS - 1).
    149 *
    150 * HW allows up to 4 descriptors to be combined into a WR.
    151 */
    152static u8 flit_desc_map[] = {
    153	0,
    154#if SGE_NUM_GENBITS == 1
    155	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    156	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    157	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    158	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
    159#elif SGE_NUM_GENBITS == 2
    160	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    161	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    162	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    163	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    164#else
    165# error "SGE_NUM_GENBITS must be 1 or 2"
    166#endif
    167};
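/*
 * Editorial sketch (not part of the driver): the table above is simply a
 * lookup of the formula in the preceding comment, desc = 1 + (flits - 2) /
 * (WR_FLITS - 1), capped at 4 descriptors per WR.  Computed directly it
 * would look roughly like the helper below; the table avoids the division
 * on the transmit fast path.
 */
#if 0	/* illustration only, never compiled */
static unsigned int flits_to_desc_by_formula(unsigned int flits)
{
	if (flits <= 2)
		return flits ? 1 : 0;
	return 1 + (flits - 2) / (WR_FLITS - 1);
}
#endif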
    168
    169static inline struct sge_qset *fl_to_qset(const struct sge_fl *q, int qidx)
    170{
    171	return container_of(q, struct sge_qset, fl[qidx]);
    172}
    173
    174static inline struct sge_qset *rspq_to_qset(const struct sge_rspq *q)
    175{
    176	return container_of(q, struct sge_qset, rspq);
    177}
    178
    179static inline struct sge_qset *txq_to_qset(const struct sge_txq *q, int qidx)
    180{
    181	return container_of(q, struct sge_qset, txq[qidx]);
    182}
    183
    184/**
    185 *	refill_rspq - replenish an SGE response queue
    186 *	@adapter: the adapter
    187 *	@q: the response queue to replenish
    188 *	@credits: how many new responses to make available
    189 *
    190 *	Replenishes a response queue by making the supplied number of responses
    191 *	available to HW.
    192 */
    193static inline void refill_rspq(struct adapter *adapter,
    194			       const struct sge_rspq *q, unsigned int credits)
    195{
    196	rmb();
    197	t3_write_reg(adapter, A_SG_RSPQ_CREDIT_RETURN,
    198		     V_RSPQ(q->cntxt_id) | V_CREDITS(credits));
    199}
    200
    201/**
    202 *	need_skb_unmap - does the platform need unmapping of sk_buffs?
    203 *
     204 *	Returns true if the platform needs sk_buff unmapping.  The compiler
     205 *	optimizes away the unmapping code on platforms where this returns false.
    206 */
    207static inline int need_skb_unmap(void)
    208{
    209#ifdef CONFIG_NEED_DMA_MAP_STATE
    210	return 1;
    211#else
    212	return 0;
    213#endif
    214}
    215
    216/**
    217 *	unmap_skb - unmap a packet main body and its page fragments
    218 *	@skb: the packet
    219 *	@q: the Tx queue containing Tx descriptors for the packet
    220 *	@cidx: index of Tx descriptor
    221 *	@pdev: the PCI device
    222 *
    223 *	Unmap the main body of an sk_buff and its page fragments, if any.
    224 *	Because of the fairly complicated structure of our SGLs and the desire
    225 *	to conserve space for metadata, the information necessary to unmap an
    226 *	sk_buff is spread across the sk_buff itself (buffer lengths), the HW Tx
    227 *	descriptors (the physical addresses of the various data buffers), and
    228 *	the SW descriptor state (assorted indices).  The send functions
    229 *	initialize the indices for the first packet descriptor so we can unmap
    230 *	the buffers held in the first Tx descriptor here, and we have enough
    231 *	information at this point to set the state for the next Tx descriptor.
    232 *
    233 *	Note that it is possible to clean up the first descriptor of a packet
    234 *	before the send routines have written the next descriptors, but this
    235 *	race does not cause any problem.  We just end up writing the unmapping
    236 *	info for the descriptor first.
    237 */
    238static inline void unmap_skb(struct sk_buff *skb, struct sge_txq *q,
    239			     unsigned int cidx, struct pci_dev *pdev)
    240{
    241	const struct sg_ent *sgp;
    242	struct tx_sw_desc *d = &q->sdesc[cidx];
    243	int nfrags, frag_idx, curflit, j = d->addr_idx;
    244
    245	sgp = (struct sg_ent *)&q->desc[cidx].flit[d->sflit];
    246	frag_idx = d->fragidx;
    247
    248	if (frag_idx == 0 && skb_headlen(skb)) {
    249		dma_unmap_single(&pdev->dev, be64_to_cpu(sgp->addr[0]),
    250				 skb_headlen(skb), DMA_TO_DEVICE);
    251		j = 1;
    252	}
    253
    254	curflit = d->sflit + 1 + j;
    255	nfrags = skb_shinfo(skb)->nr_frags;
    256
    257	while (frag_idx < nfrags && curflit < WR_FLITS) {
    258		dma_unmap_page(&pdev->dev, be64_to_cpu(sgp->addr[j]),
    259			       skb_frag_size(&skb_shinfo(skb)->frags[frag_idx]),
    260			       DMA_TO_DEVICE);
    261		j ^= 1;
    262		if (j == 0) {
    263			sgp++;
    264			curflit++;
    265		}
    266		curflit++;
    267		frag_idx++;
    268	}
    269
    270	if (frag_idx < nfrags) {   /* SGL continues into next Tx descriptor */
    271		d = cidx + 1 == q->size ? q->sdesc : d + 1;
    272		d->fragidx = frag_idx;
    273		d->addr_idx = j;
    274		d->sflit = curflit - WR_FLITS - j; /* sflit can be -1 */
    275	}
    276}
    277
    278/**
    279 *	free_tx_desc - reclaims Tx descriptors and their buffers
    280 *	@adapter: the adapter
    281 *	@q: the Tx queue to reclaim descriptors from
    282 *	@n: the number of descriptors to reclaim
    283 *
    284 *	Reclaims Tx descriptors from an SGE Tx queue and frees the associated
    285 *	Tx buffers.  Called with the Tx queue lock held.
    286 */
    287static void free_tx_desc(struct adapter *adapter, struct sge_txq *q,
    288			 unsigned int n)
    289{
    290	struct tx_sw_desc *d;
    291	struct pci_dev *pdev = adapter->pdev;
    292	unsigned int cidx = q->cidx;
    293
    294	const int need_unmap = need_skb_unmap() &&
    295			       q->cntxt_id >= FW_TUNNEL_SGEEC_START;
    296
    297	d = &q->sdesc[cidx];
    298	while (n--) {
    299		if (d->skb) {	/* an SGL is present */
    300			if (need_unmap)
    301				unmap_skb(d->skb, q, cidx, pdev);
    302			if (d->eop) {
    303				dev_consume_skb_any(d->skb);
    304				d->skb = NULL;
    305			}
    306		}
    307		++d;
    308		if (++cidx == q->size) {
    309			cidx = 0;
    310			d = q->sdesc;
    311		}
    312	}
    313	q->cidx = cidx;
    314}
    315
    316/**
    317 *	reclaim_completed_tx - reclaims completed Tx descriptors
    318 *	@adapter: the adapter
    319 *	@q: the Tx queue to reclaim completed descriptors from
    320 *	@chunk: maximum number of descriptors to reclaim
    321 *
    322 *	Reclaims Tx descriptors that the SGE has indicated it has processed,
    323 *	and frees the associated buffers if possible.  Called with the Tx
    324 *	queue's lock held.
    325 */
    326static inline unsigned int reclaim_completed_tx(struct adapter *adapter,
    327						struct sge_txq *q,
    328						unsigned int chunk)
    329{
    330	unsigned int reclaim = q->processed - q->cleaned;
    331
    332	reclaim = min(chunk, reclaim);
    333	if (reclaim) {
    334		free_tx_desc(adapter, q, reclaim);
    335		q->cleaned += reclaim;
    336		q->in_use -= reclaim;
    337	}
    338	return q->processed - q->cleaned;
    339}
    340
    341/**
    342 *	should_restart_tx - are there enough resources to restart a Tx queue?
    343 *	@q: the Tx queue
    344 *
    345 *	Checks if there are enough descriptors to restart a suspended Tx queue.
    346 */
    347static inline int should_restart_tx(const struct sge_txq *q)
    348{
    349	unsigned int r = q->processed - q->cleaned;
    350
    351	return q->in_use - r < (q->size >> 1);
    352}
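/*
 * Editorial note (illustrative numbers, not from the driver): with a
 * 1024-entry Tx queue, q->size >> 1 == 512.  If q->in_use == 700 but 300 of
 * those descriptors have already been processed by the SGE and merely await
 * cleaning (q->processed - q->cleaned == 300), only 400 are genuinely
 * outstanding, so should_restart_tx() returns true and a suspended queue
 * may be restarted once those completed descriptors are reclaimed.
 */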
    353
    354static void clear_rx_desc(struct pci_dev *pdev, const struct sge_fl *q,
    355			  struct rx_sw_desc *d)
    356{
    357	if (q->use_pages && d->pg_chunk.page) {
    358		(*d->pg_chunk.p_cnt)--;
    359		if (!*d->pg_chunk.p_cnt)
    360			dma_unmap_page(&pdev->dev, d->pg_chunk.mapping,
    361				       q->alloc_size, DMA_FROM_DEVICE);
    362
    363		put_page(d->pg_chunk.page);
    364		d->pg_chunk.page = NULL;
    365	} else {
    366		dma_unmap_single(&pdev->dev, dma_unmap_addr(d, dma_addr),
    367				 q->buf_size, DMA_FROM_DEVICE);
    368		kfree_skb(d->skb);
    369		d->skb = NULL;
    370	}
    371}
    372
    373/**
    374 *	free_rx_bufs - free the Rx buffers on an SGE free list
    375 *	@pdev: the PCI device associated with the adapter
    376 *	@q: the SGE free list to clean up
    377 *
    378 *	Release the buffers on an SGE free-buffer Rx queue.  HW fetching from
    379 *	this queue should be stopped before calling this function.
    380 */
    381static void free_rx_bufs(struct pci_dev *pdev, struct sge_fl *q)
    382{
    383	unsigned int cidx = q->cidx;
    384
    385	while (q->credits--) {
    386		struct rx_sw_desc *d = &q->sdesc[cidx];
    387
    388
    389		clear_rx_desc(pdev, q, d);
    390		if (++cidx == q->size)
    391			cidx = 0;
    392	}
    393
    394	if (q->pg_chunk.page) {
    395		__free_pages(q->pg_chunk.page, q->order);
    396		q->pg_chunk.page = NULL;
    397	}
    398}
    399
    400/**
    401 *	add_one_rx_buf - add a packet buffer to a free-buffer list
    402 *	@va:  buffer start VA
    403 *	@len: the buffer length
    404 *	@d: the HW Rx descriptor to write
    405 *	@sd: the SW Rx descriptor to write
    406 *	@gen: the generation bit value
    407 *	@pdev: the PCI device associated with the adapter
    408 *
    409 *	Add a buffer of the given length to the supplied HW and SW Rx
    410 *	descriptors.
    411 */
    412static inline int add_one_rx_buf(void *va, unsigned int len,
    413				 struct rx_desc *d, struct rx_sw_desc *sd,
    414				 unsigned int gen, struct pci_dev *pdev)
    415{
    416	dma_addr_t mapping;
    417
    418	mapping = dma_map_single(&pdev->dev, va, len, DMA_FROM_DEVICE);
    419	if (unlikely(dma_mapping_error(&pdev->dev, mapping)))
    420		return -ENOMEM;
    421
    422	dma_unmap_addr_set(sd, dma_addr, mapping);
    423
    424	d->addr_lo = cpu_to_be32(mapping);
    425	d->addr_hi = cpu_to_be32((u64) mapping >> 32);
    426	dma_wmb();
    427	d->len_gen = cpu_to_be32(V_FLD_GEN1(gen));
    428	d->gen2 = cpu_to_be32(V_FLD_GEN2(gen));
    429	return 0;
    430}
    431
    432static inline int add_one_rx_chunk(dma_addr_t mapping, struct rx_desc *d,
    433				   unsigned int gen)
    434{
    435	d->addr_lo = cpu_to_be32(mapping);
    436	d->addr_hi = cpu_to_be32((u64) mapping >> 32);
    437	dma_wmb();
    438	d->len_gen = cpu_to_be32(V_FLD_GEN1(gen));
    439	d->gen2 = cpu_to_be32(V_FLD_GEN2(gen));
    440	return 0;
    441}
    442
    443static int alloc_pg_chunk(struct adapter *adapter, struct sge_fl *q,
    444			  struct rx_sw_desc *sd, gfp_t gfp,
    445			  unsigned int order)
    446{
    447	if (!q->pg_chunk.page) {
    448		dma_addr_t mapping;
    449
    450		q->pg_chunk.page = alloc_pages(gfp, order);
    451		if (unlikely(!q->pg_chunk.page))
    452			return -ENOMEM;
    453		q->pg_chunk.va = page_address(q->pg_chunk.page);
    454		q->pg_chunk.p_cnt = q->pg_chunk.va + (PAGE_SIZE << order) -
    455				    SGE_PG_RSVD;
    456		q->pg_chunk.offset = 0;
    457		mapping = dma_map_page(&adapter->pdev->dev, q->pg_chunk.page,
    458				       0, q->alloc_size, DMA_FROM_DEVICE);
    459		if (unlikely(dma_mapping_error(&adapter->pdev->dev, mapping))) {
    460			__free_pages(q->pg_chunk.page, order);
    461			q->pg_chunk.page = NULL;
    462			return -EIO;
    463		}
    464		q->pg_chunk.mapping = mapping;
    465	}
    466	sd->pg_chunk = q->pg_chunk;
    467
    468	prefetch(sd->pg_chunk.p_cnt);
    469
    470	q->pg_chunk.offset += q->buf_size;
    471	if (q->pg_chunk.offset == (PAGE_SIZE << order))
    472		q->pg_chunk.page = NULL;
    473	else {
    474		q->pg_chunk.va += q->buf_size;
    475		get_page(q->pg_chunk.page);
    476	}
    477
    478	if (sd->pg_chunk.offset == 0)
    479		*sd->pg_chunk.p_cnt = 1;
    480	else
    481		*sd->pg_chunk.p_cnt += 1;
    482
    483	return 0;
    484}
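/*
 * Editorial note on the page-chunk scheme above: a page of order @order is
 * DMA-mapped once (q->alloc_size bytes) and carved into q->buf_size chunks
 * that are handed out one per Rx descriptor.  The shared chunk count
 * *p_cnt lives in the last SGE_PG_RSVD bytes of the page; the first chunk
 * of a page sets it to 1 and every later chunk increments it.
 * clear_rx_desc() and get_packet_pg() decrement it and unmap the page only
 * when the count reaches zero, so the single DMA mapping outlives every
 * chunk cut from the page.
 */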
    485
    486static inline void ring_fl_db(struct adapter *adap, struct sge_fl *q)
    487{
    488	if (q->pend_cred >= q->credits / 4) {
    489		q->pend_cred = 0;
    490		wmb();
    491		t3_write_reg(adap, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
    492	}
    493}
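/*
 * Editorial note: free-list doorbells are deliberately batched.  ring_fl_db()
 * only tells the HW about newly added buffers once at least a quarter of the
 * list's current credits are pending, keeping the MMIO doorbell write off
 * the per-buffer path.
 */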
    494
    495/**
    496 *	refill_fl - refill an SGE free-buffer list
    497 *	@adap: the adapter
    498 *	@q: the free-list to refill
    499 *	@n: the number of new buffers to allocate
    500 *	@gfp: the gfp flags for allocating new buffers
    501 *
    502 *	(Re)populate an SGE free-buffer list with up to @n new packet buffers,
     503 *	allocated with the supplied gfp flags.  The caller must ensure that
    504 *	@n does not exceed the queue's capacity.
    505 */
    506static int refill_fl(struct adapter *adap, struct sge_fl *q, int n, gfp_t gfp)
    507{
    508	struct rx_sw_desc *sd = &q->sdesc[q->pidx];
    509	struct rx_desc *d = &q->desc[q->pidx];
    510	unsigned int count = 0;
    511
    512	while (n--) {
    513		dma_addr_t mapping;
    514		int err;
    515
    516		if (q->use_pages) {
    517			if (unlikely(alloc_pg_chunk(adap, q, sd, gfp,
    518						    q->order))) {
    519nomem:				q->alloc_failed++;
    520				break;
    521			}
    522			mapping = sd->pg_chunk.mapping + sd->pg_chunk.offset;
    523			dma_unmap_addr_set(sd, dma_addr, mapping);
    524
    525			add_one_rx_chunk(mapping, d, q->gen);
    526			dma_sync_single_for_device(&adap->pdev->dev, mapping,
    527						   q->buf_size - SGE_PG_RSVD,
    528						   DMA_FROM_DEVICE);
    529		} else {
    530			void *buf_start;
    531
    532			struct sk_buff *skb = alloc_skb(q->buf_size, gfp);
    533			if (!skb)
    534				goto nomem;
    535
    536			sd->skb = skb;
    537			buf_start = skb->data;
    538			err = add_one_rx_buf(buf_start, q->buf_size, d, sd,
    539					     q->gen, adap->pdev);
    540			if (unlikely(err)) {
    541				clear_rx_desc(adap->pdev, q, sd);
    542				break;
    543			}
    544		}
    545
    546		d++;
    547		sd++;
    548		if (++q->pidx == q->size) {
    549			q->pidx = 0;
    550			q->gen ^= 1;
    551			sd = q->sdesc;
    552			d = q->desc;
    553		}
    554		count++;
    555	}
    556
    557	q->credits += count;
    558	q->pend_cred += count;
    559	ring_fl_db(adap, q);
    560
    561	return count;
    562}
    563
    564static inline void __refill_fl(struct adapter *adap, struct sge_fl *fl)
    565{
    566	refill_fl(adap, fl, min(MAX_RX_REFILL, fl->size - fl->credits),
    567		  GFP_ATOMIC | __GFP_COMP);
    568}
    569
    570/**
    571 *	recycle_rx_buf - recycle a receive buffer
    572 *	@adap: the adapter
    573 *	@q: the SGE free list
    574 *	@idx: index of buffer to recycle
    575 *
    576 *	Recycles the specified buffer on the given free list by adding it at
    577 *	the next available slot on the list.
    578 */
    579static void recycle_rx_buf(struct adapter *adap, struct sge_fl *q,
    580			   unsigned int idx)
    581{
    582	struct rx_desc *from = &q->desc[idx];
    583	struct rx_desc *to = &q->desc[q->pidx];
    584
    585	q->sdesc[q->pidx] = q->sdesc[idx];
    586	to->addr_lo = from->addr_lo;	/* already big endian */
    587	to->addr_hi = from->addr_hi;	/* likewise */
    588	dma_wmb();
    589	to->len_gen = cpu_to_be32(V_FLD_GEN1(q->gen));
    590	to->gen2 = cpu_to_be32(V_FLD_GEN2(q->gen));
    591
    592	if (++q->pidx == q->size) {
    593		q->pidx = 0;
    594		q->gen ^= 1;
    595	}
    596
    597	q->credits++;
    598	q->pend_cred++;
    599	ring_fl_db(adap, q);
    600}
    601
    602/**
    603 *	alloc_ring - allocate resources for an SGE descriptor ring
    604 *	@pdev: the PCI device
    605 *	@nelem: the number of descriptors
    606 *	@elem_size: the size of each descriptor
    607 *	@sw_size: the size of the SW state associated with each ring element
    608 *	@phys: the physical address of the allocated ring
    609 *	@metadata: address of the array holding the SW state for the ring
    610 *
    611 *	Allocates resources for an SGE descriptor ring, such as Tx queues,
    612 *	free buffer lists, or response queues.  Each SGE ring requires
    613 *	space for its HW descriptors plus, optionally, space for the SW state
    614 *	associated with each HW entry (the metadata).  The function returns
    615 *	three values: the virtual address for the HW ring (the return value
    616 *	of the function), the physical address of the HW ring, and the address
    617 *	of the SW ring.
    618 */
    619static void *alloc_ring(struct pci_dev *pdev, size_t nelem, size_t elem_size,
    620			size_t sw_size, dma_addr_t * phys, void *metadata)
    621{
    622	size_t len = nelem * elem_size;
    623	void *s = NULL;
    624	void *p = dma_alloc_coherent(&pdev->dev, len, phys, GFP_KERNEL);
    625
    626	if (!p)
    627		return NULL;
    628	if (sw_size && metadata) {
    629		s = kcalloc(nelem, sw_size, GFP_KERNEL);
    630
    631		if (!s) {
    632			dma_free_coherent(&pdev->dev, len, p, *phys);
    633			return NULL;
    634		}
    635		*(void **)metadata = s;
    636	}
    637	return p;
    638}
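/*
 * Editorial sketch (hypothetical helper, not part of this file): a typical
 * alloc_ring() call for a free-buffer list, using the same sge_fl fields
 * that t3_free_qset() below releases.  The real call sites are in
 * t3_sge_alloc_qset() later in the file.
 */
#if 0	/* illustration only, never compiled */
static int example_alloc_fl_ring(struct pci_dev *pdev, struct sge_fl *fl)
{
	/* HW descriptors plus one rx_sw_desc of SW state per entry */
	fl->desc = alloc_ring(pdev, fl->size, sizeof(struct rx_desc),
			      sizeof(struct rx_sw_desc), &fl->phys_addr,
			      &fl->sdesc);
	return fl->desc ? 0 : -ENOMEM;
}
#endif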
    639
    640/**
     641 *	t3_reset_qset - reset an SGE qset
     642 *	@q: the queue set
     643 *
     644 *	Reset the qset structure.  The NAPI structure is preserved in
     645 *	the event of the qset's reincarnation, for example during EEH
     646 *	recovery.
    647 */
    648static void t3_reset_qset(struct sge_qset *q)
    649{
    650	if (q->adap &&
    651	    !(q->adap->flags & NAPI_INIT)) {
    652		memset(q, 0, sizeof(*q));
    653		return;
    654	}
    655
    656	q->adap = NULL;
    657	memset(&q->rspq, 0, sizeof(q->rspq));
    658	memset(q->fl, 0, sizeof(struct sge_fl) * SGE_RXQ_PER_SET);
    659	memset(q->txq, 0, sizeof(struct sge_txq) * SGE_TXQ_PER_SET);
    660	q->txq_stopped = 0;
    661	q->tx_reclaim_timer.function = NULL; /* for t3_stop_sge_timers() */
    662	q->rx_reclaim_timer.function = NULL;
    663	q->nomem = 0;
    664	napi_free_frags(&q->napi);
    665}
    666
    667
    668/**
    669 *	t3_free_qset - free the resources of an SGE queue set
    670 *	@adapter: the adapter owning the queue set
    671 *	@q: the queue set
    672 *
    673 *	Release the HW and SW resources associated with an SGE queue set, such
    674 *	as HW contexts, packet buffers, and descriptor rings.  Traffic to the
    675 *	queue set must be quiesced prior to calling this.
    676 */
    677static void t3_free_qset(struct adapter *adapter, struct sge_qset *q)
    678{
    679	int i;
    680	struct pci_dev *pdev = adapter->pdev;
    681
    682	for (i = 0; i < SGE_RXQ_PER_SET; ++i)
    683		if (q->fl[i].desc) {
    684			spin_lock_irq(&adapter->sge.reg_lock);
    685			t3_sge_disable_fl(adapter, q->fl[i].cntxt_id);
    686			spin_unlock_irq(&adapter->sge.reg_lock);
    687			free_rx_bufs(pdev, &q->fl[i]);
    688			kfree(q->fl[i].sdesc);
    689			dma_free_coherent(&pdev->dev,
    690					  q->fl[i].size *
    691					  sizeof(struct rx_desc), q->fl[i].desc,
    692					  q->fl[i].phys_addr);
    693		}
    694
    695	for (i = 0; i < SGE_TXQ_PER_SET; ++i)
    696		if (q->txq[i].desc) {
    697			spin_lock_irq(&adapter->sge.reg_lock);
    698			t3_sge_enable_ecntxt(adapter, q->txq[i].cntxt_id, 0);
    699			spin_unlock_irq(&adapter->sge.reg_lock);
    700			if (q->txq[i].sdesc) {
    701				free_tx_desc(adapter, &q->txq[i],
    702					     q->txq[i].in_use);
    703				kfree(q->txq[i].sdesc);
    704			}
    705			dma_free_coherent(&pdev->dev,
    706					  q->txq[i].size *
    707					  sizeof(struct tx_desc),
    708					  q->txq[i].desc, q->txq[i].phys_addr);
    709			__skb_queue_purge(&q->txq[i].sendq);
    710		}
    711
    712	if (q->rspq.desc) {
    713		spin_lock_irq(&adapter->sge.reg_lock);
    714		t3_sge_disable_rspcntxt(adapter, q->rspq.cntxt_id);
    715		spin_unlock_irq(&adapter->sge.reg_lock);
    716		dma_free_coherent(&pdev->dev,
    717				  q->rspq.size * sizeof(struct rsp_desc),
    718				  q->rspq.desc, q->rspq.phys_addr);
    719	}
    720
    721	t3_reset_qset(q);
    722}
    723
    724/**
    725 *	init_qset_cntxt - initialize an SGE queue set context info
    726 *	@qs: the queue set
    727 *	@id: the queue set id
    728 *
    729 *	Initializes the TIDs and context ids for the queues of a queue set.
    730 */
    731static void init_qset_cntxt(struct sge_qset *qs, unsigned int id)
    732{
    733	qs->rspq.cntxt_id = id;
    734	qs->fl[0].cntxt_id = 2 * id;
    735	qs->fl[1].cntxt_id = 2 * id + 1;
    736	qs->txq[TXQ_ETH].cntxt_id = FW_TUNNEL_SGEEC_START + id;
    737	qs->txq[TXQ_ETH].token = FW_TUNNEL_TID_START + id;
    738	qs->txq[TXQ_OFLD].cntxt_id = FW_OFLD_SGEEC_START + id;
    739	qs->txq[TXQ_CTRL].cntxt_id = FW_CTRL_SGEEC_START + id;
    740	qs->txq[TXQ_CTRL].token = FW_CTRL_TID_START + id;
    741}
    742
    743/**
    744 *	sgl_len - calculates the size of an SGL of the given capacity
    745 *	@n: the number of SGL entries
    746 *
    747 *	Calculates the number of flits needed for a scatter/gather list that
    748 *	can hold the given number of entries.
    749 */
    750static inline unsigned int sgl_len(unsigned int n)
    751{
    752	/* alternatively: 3 * (n / 2) + 2 * (n & 1) */
    753	return (3 * n) / 2 + (n & 1);
    754}
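/*
 * Editorial note: each struct sg_ent packs two buffers into 3 flits (one
 * flit for the two 32-bit lengths, two flits for the two 64-bit addresses),
 * so an unpaired final buffer costs 2 flits.  Hence sgl_len(1) == 2,
 * sgl_len(2) == 3, sgl_len(3) == 5 and sgl_len(4) == 6.
 */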
    755
    756/**
    757 *	flits_to_desc - returns the num of Tx descriptors for the given flits
    758 *	@n: the number of flits
    759 *
    760 *	Calculates the number of Tx descriptors needed for the supplied number
    761 *	of flits.
    762 */
    763static inline unsigned int flits_to_desc(unsigned int n)
    764{
    765	BUG_ON(n >= ARRAY_SIZE(flit_desc_map));
    766	return flit_desc_map[n];
    767}
    768
    769/**
    770 *	get_packet - return the next ingress packet buffer from a free list
    771 *	@adap: the adapter that received the packet
    772 *	@fl: the SGE free list holding the packet
    773 *	@len: the packet length including any SGE padding
    774 *	@drop_thres: # of remaining buffers before we start dropping packets
    775 *
    776 *	Get the next packet from a free list and complete setup of the
    777 *	sk_buff.  If the packet is small we make a copy and recycle the
    778 *	original buffer, otherwise we use the original buffer itself.  If a
    779 *	positive drop threshold is supplied packets are dropped and their
    780 *	buffers recycled if (a) the number of remaining buffers is under the
    781 *	threshold and the packet is too big to copy, or (b) the packet should
    782 *	be copied but there is no memory for the copy.
    783 */
    784static struct sk_buff *get_packet(struct adapter *adap, struct sge_fl *fl,
    785				  unsigned int len, unsigned int drop_thres)
    786{
    787	struct sk_buff *skb = NULL;
    788	struct rx_sw_desc *sd = &fl->sdesc[fl->cidx];
    789
    790	prefetch(sd->skb->data);
    791	fl->credits--;
    792
    793	if (len <= SGE_RX_COPY_THRES) {
    794		skb = alloc_skb(len, GFP_ATOMIC);
    795		if (likely(skb != NULL)) {
    796			__skb_put(skb, len);
    797			dma_sync_single_for_cpu(&adap->pdev->dev,
    798						dma_unmap_addr(sd, dma_addr),
    799						len, DMA_FROM_DEVICE);
    800			memcpy(skb->data, sd->skb->data, len);
    801			dma_sync_single_for_device(&adap->pdev->dev,
    802						   dma_unmap_addr(sd, dma_addr),
    803						   len, DMA_FROM_DEVICE);
    804		} else if (!drop_thres)
    805			goto use_orig_buf;
    806recycle:
    807		recycle_rx_buf(adap, fl, fl->cidx);
    808		return skb;
    809	}
    810
    811	if (unlikely(fl->credits < drop_thres) &&
    812	    refill_fl(adap, fl, min(MAX_RX_REFILL, fl->size - fl->credits - 1),
    813		      GFP_ATOMIC | __GFP_COMP) == 0)
    814		goto recycle;
    815
    816use_orig_buf:
    817	dma_unmap_single(&adap->pdev->dev, dma_unmap_addr(sd, dma_addr),
    818			 fl->buf_size, DMA_FROM_DEVICE);
    819	skb = sd->skb;
    820	skb_put(skb, len);
    821	__refill_fl(adap, fl);
    822	return skb;
    823}
    824
    825/**
    826 *	get_packet_pg - return the next ingress packet buffer from a free list
    827 *	@adap: the adapter that received the packet
    828 *	@fl: the SGE free list holding the packet
    829 *	@q: the queue
    830 *	@len: the packet length including any SGE padding
    831 *	@drop_thres: # of remaining buffers before we start dropping packets
    832 *
    833 *	Get the next packet from a free list populated with page chunks.
    834 *	If the packet is small we make a copy and recycle the original buffer,
    835 *	otherwise we attach the original buffer as a page fragment to a fresh
    836 *	sk_buff.  If a positive drop threshold is supplied packets are dropped
    837 *	and their buffers recycled if (a) the number of remaining buffers is
    838 *	under the threshold and the packet is too big to copy, or (b) there's
    839 *	no system memory.
    840 *
    841 * 	Note: this function is similar to @get_packet but deals with Rx buffers
    842 * 	that are page chunks rather than sk_buffs.
    843 */
    844static struct sk_buff *get_packet_pg(struct adapter *adap, struct sge_fl *fl,
    845				     struct sge_rspq *q, unsigned int len,
    846				     unsigned int drop_thres)
    847{
    848	struct sk_buff *newskb, *skb;
    849	struct rx_sw_desc *sd = &fl->sdesc[fl->cidx];
    850
    851	dma_addr_t dma_addr = dma_unmap_addr(sd, dma_addr);
    852
    853	newskb = skb = q->pg_skb;
    854	if (!skb && (len <= SGE_RX_COPY_THRES)) {
    855		newskb = alloc_skb(len, GFP_ATOMIC);
    856		if (likely(newskb != NULL)) {
    857			__skb_put(newskb, len);
    858			dma_sync_single_for_cpu(&adap->pdev->dev, dma_addr,
    859						len, DMA_FROM_DEVICE);
    860			memcpy(newskb->data, sd->pg_chunk.va, len);
    861			dma_sync_single_for_device(&adap->pdev->dev, dma_addr,
    862						   len, DMA_FROM_DEVICE);
    863		} else if (!drop_thres)
    864			return NULL;
    865recycle:
    866		fl->credits--;
    867		recycle_rx_buf(adap, fl, fl->cidx);
    868		q->rx_recycle_buf++;
    869		return newskb;
    870	}
    871
    872	if (unlikely(q->rx_recycle_buf || (!skb && fl->credits <= drop_thres)))
    873		goto recycle;
    874
    875	prefetch(sd->pg_chunk.p_cnt);
    876
    877	if (!skb)
    878		newskb = alloc_skb(SGE_RX_PULL_LEN, GFP_ATOMIC);
    879
    880	if (unlikely(!newskb)) {
    881		if (!drop_thres)
    882			return NULL;
    883		goto recycle;
    884	}
    885
    886	dma_sync_single_for_cpu(&adap->pdev->dev, dma_addr, len,
    887				DMA_FROM_DEVICE);
    888	(*sd->pg_chunk.p_cnt)--;
    889	if (!*sd->pg_chunk.p_cnt && sd->pg_chunk.page != fl->pg_chunk.page)
    890		dma_unmap_page(&adap->pdev->dev, sd->pg_chunk.mapping,
    891			       fl->alloc_size, DMA_FROM_DEVICE);
    892	if (!skb) {
    893		__skb_put(newskb, SGE_RX_PULL_LEN);
    894		memcpy(newskb->data, sd->pg_chunk.va, SGE_RX_PULL_LEN);
    895		skb_fill_page_desc(newskb, 0, sd->pg_chunk.page,
    896				   sd->pg_chunk.offset + SGE_RX_PULL_LEN,
    897				   len - SGE_RX_PULL_LEN);
    898		newskb->len = len;
    899		newskb->data_len = len - SGE_RX_PULL_LEN;
    900		newskb->truesize += newskb->data_len;
    901	} else {
    902		skb_fill_page_desc(newskb, skb_shinfo(newskb)->nr_frags,
    903				   sd->pg_chunk.page,
    904				   sd->pg_chunk.offset, len);
    905		newskb->len += len;
    906		newskb->data_len += len;
    907		newskb->truesize += len;
    908	}
    909
    910	fl->credits--;
    911	/*
    912	 * We do not refill FLs here, we let the caller do it to overlap a
    913	 * prefetch.
    914	 */
    915	return newskb;
    916}
    917
    918/**
    919 *	get_imm_packet - return the next ingress packet buffer from a response
    920 *	@resp: the response descriptor containing the packet data
    921 *
    922 *	Return a packet containing the immediate data of the given response.
    923 */
    924static inline struct sk_buff *get_imm_packet(const struct rsp_desc *resp)
    925{
    926	struct sk_buff *skb = alloc_skb(IMMED_PKT_SIZE, GFP_ATOMIC);
    927
    928	if (skb) {
    929		__skb_put(skb, IMMED_PKT_SIZE);
    930		BUILD_BUG_ON(IMMED_PKT_SIZE != sizeof(resp->immediate));
    931		skb_copy_to_linear_data(skb, &resp->immediate, IMMED_PKT_SIZE);
    932	}
    933	return skb;
    934}
    935
    936/**
    937 *	calc_tx_descs - calculate the number of Tx descriptors for a packet
    938 *	@skb: the packet
    939 *
    940 * 	Returns the number of Tx descriptors needed for the given Ethernet
    941 * 	packet.  Ethernet packets require addition of WR and CPL headers.
    942 */
    943static inline unsigned int calc_tx_descs(const struct sk_buff *skb)
    944{
    945	unsigned int flits;
    946
    947	if (skb->len <= WR_LEN - sizeof(struct cpl_tx_pkt))
    948		return 1;
    949
    950	flits = sgl_len(skb_shinfo(skb)->nr_frags + 1) + 2;
    951	if (skb_shinfo(skb)->gso_size)
    952		flits++;
    953	return flits_to_desc(flits);
    954}
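/*
 * Editorial note (worked example): a non-TSO packet with linear data plus
 * three page fragments needs sgl_len(3 + 1) + 2 == 8 flits, which
 * flits_to_desc() maps to a single Tx descriptor; a TSO packet of the same
 * shape needs one more flit for the LSO header.
 */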
    955
    956/*	map_skb - map a packet main body and its page fragments
    957 *	@pdev: the PCI device
    958 *	@skb: the packet
     959 *	@addr: array in which to save the mapped DMA addresses
     960 *
     961 *	Map the main body of an sk_buff and its page fragments, if any.
    962 */
    963static int map_skb(struct pci_dev *pdev, const struct sk_buff *skb,
    964		   dma_addr_t *addr)
    965{
    966	const skb_frag_t *fp, *end;
    967	const struct skb_shared_info *si;
    968
    969	if (skb_headlen(skb)) {
    970		*addr = dma_map_single(&pdev->dev, skb->data,
    971				       skb_headlen(skb), DMA_TO_DEVICE);
    972		if (dma_mapping_error(&pdev->dev, *addr))
    973			goto out_err;
    974		addr++;
    975	}
    976
    977	si = skb_shinfo(skb);
    978	end = &si->frags[si->nr_frags];
    979
    980	for (fp = si->frags; fp < end; fp++) {
    981		*addr = skb_frag_dma_map(&pdev->dev, fp, 0, skb_frag_size(fp),
    982					 DMA_TO_DEVICE);
    983		if (dma_mapping_error(&pdev->dev, *addr))
    984			goto unwind;
    985		addr++;
    986	}
    987	return 0;
    988
    989unwind:
    990	while (fp-- > si->frags)
    991		dma_unmap_page(&pdev->dev, *--addr, skb_frag_size(fp),
    992			       DMA_TO_DEVICE);
    993
    994	dma_unmap_single(&pdev->dev, addr[-1], skb_headlen(skb),
    995			 DMA_TO_DEVICE);
    996out_err:
    997	return -ENOMEM;
    998}
    999
   1000/**
   1001 *	write_sgl - populate a scatter/gather list for a packet
   1002 *	@skb: the packet
   1003 *	@sgp: the SGL to populate
   1004 *	@start: start address of skb main body data to include in the SGL
   1005 *	@len: length of skb main body data to include in the SGL
   1006 *	@addr: the list of the mapped addresses
   1007 *
   1008 *	Copies the scatter/gather list for the buffers that make up a packet
   1009 *	and returns the SGL size in 8-byte words.  The caller must size the SGL
   1010 *	appropriately.
   1011 */
   1012static inline unsigned int write_sgl(const struct sk_buff *skb,
   1013				     struct sg_ent *sgp, unsigned char *start,
   1014				     unsigned int len, const dma_addr_t *addr)
   1015{
   1016	unsigned int i, j = 0, k = 0, nfrags;
   1017
   1018	if (len) {
   1019		sgp->len[0] = cpu_to_be32(len);
   1020		sgp->addr[j++] = cpu_to_be64(addr[k++]);
   1021	}
   1022
   1023	nfrags = skb_shinfo(skb)->nr_frags;
   1024	for (i = 0; i < nfrags; i++) {
   1025		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
   1026
   1027		sgp->len[j] = cpu_to_be32(skb_frag_size(frag));
   1028		sgp->addr[j] = cpu_to_be64(addr[k++]);
   1029		j ^= 1;
   1030		if (j == 0)
   1031			++sgp;
   1032	}
   1033	if (j)
   1034		sgp->len[j] = 0;
   1035	return ((nfrags + (len != 0)) * 3) / 2 + j;
   1036}
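/*
 * Editorial note: the value returned above, ((nfrags + (len != 0)) * 3) / 2
 * + j, is simply sgl_len() evaluated for the number of buffers written (the
 * optional main body plus nfrags page fragments).
 */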
   1037
   1038/**
   1039 *	check_ring_tx_db - check and potentially ring a Tx queue's doorbell
   1040 *	@adap: the adapter
   1041 *	@q: the Tx queue
   1042 *
    1043 *	Ring the doorbell if a Tx queue is asleep.  There is a natural race
    1044 *	where the HW may go to sleep just after we check, but in that case
    1045 *	the interrupt handler will detect the outstanding TX packet and
    1046 *	ring the doorbell for us.
   1047 *
   1048 *	When GTS is disabled we unconditionally ring the doorbell.
   1049 */
   1050static inline void check_ring_tx_db(struct adapter *adap, struct sge_txq *q)
   1051{
   1052#if USE_GTS
   1053	clear_bit(TXQ_LAST_PKT_DB, &q->flags);
   1054	if (test_and_set_bit(TXQ_RUNNING, &q->flags) == 0) {
   1055		set_bit(TXQ_LAST_PKT_DB, &q->flags);
   1056		t3_write_reg(adap, A_SG_KDOORBELL,
   1057			     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
   1058	}
   1059#else
   1060	wmb();			/* write descriptors before telling HW */
   1061	t3_write_reg(adap, A_SG_KDOORBELL,
   1062		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
   1063#endif
   1064}
   1065
   1066static inline void wr_gen2(struct tx_desc *d, unsigned int gen)
   1067{
   1068#if SGE_NUM_GENBITS == 2
   1069	d->flit[TX_DESC_FLITS - 1] = cpu_to_be64(gen);
   1070#endif
   1071}
   1072
   1073/**
   1074 *	write_wr_hdr_sgl - write a WR header and, optionally, SGL
   1075 *	@ndesc: number of Tx descriptors spanned by the SGL
   1076 *	@skb: the packet corresponding to the WR
   1077 *	@d: first Tx descriptor to be written
   1078 *	@pidx: index of above descriptors
   1079 *	@q: the SGE Tx queue
   1080 *	@sgl: the SGL
   1081 *	@flits: number of flits to the start of the SGL in the first descriptor
   1082 *	@sgl_flits: the SGL size in flits
   1083 *	@gen: the Tx descriptor generation
   1084 *	@wr_hi: top 32 bits of WR header based on WR type (big endian)
   1085 *	@wr_lo: low 32 bits of WR header based on WR type (big endian)
   1086 *
   1087 *	Write a work request header and an associated SGL.  If the SGL is
   1088 *	small enough to fit into one Tx descriptor it has already been written
   1089 *	and we just need to write the WR header.  Otherwise we distribute the
   1090 *	SGL across the number of descriptors it spans.
   1091 */
   1092static void write_wr_hdr_sgl(unsigned int ndesc, struct sk_buff *skb,
   1093			     struct tx_desc *d, unsigned int pidx,
   1094			     const struct sge_txq *q,
   1095			     const struct sg_ent *sgl,
   1096			     unsigned int flits, unsigned int sgl_flits,
   1097			     unsigned int gen, __be32 wr_hi,
   1098			     __be32 wr_lo)
   1099{
   1100	struct work_request_hdr *wrp = (struct work_request_hdr *)d;
   1101	struct tx_sw_desc *sd = &q->sdesc[pidx];
   1102
   1103	sd->skb = skb;
   1104	if (need_skb_unmap()) {
   1105		sd->fragidx = 0;
   1106		sd->addr_idx = 0;
   1107		sd->sflit = flits;
   1108	}
   1109
   1110	if (likely(ndesc == 1)) {
   1111		sd->eop = 1;
   1112		wrp->wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
   1113				   V_WR_SGLSFLT(flits)) | wr_hi;
   1114		dma_wmb();
   1115		wrp->wr_lo = htonl(V_WR_LEN(flits + sgl_flits) |
   1116				   V_WR_GEN(gen)) | wr_lo;
   1117		wr_gen2(d, gen);
   1118	} else {
   1119		unsigned int ogen = gen;
   1120		const u64 *fp = (const u64 *)sgl;
   1121		struct work_request_hdr *wp = wrp;
   1122
   1123		wrp->wr_hi = htonl(F_WR_SOP | V_WR_DATATYPE(1) |
   1124				   V_WR_SGLSFLT(flits)) | wr_hi;
   1125
   1126		while (sgl_flits) {
   1127			unsigned int avail = WR_FLITS - flits;
   1128
   1129			if (avail > sgl_flits)
   1130				avail = sgl_flits;
   1131			memcpy(&d->flit[flits], fp, avail * sizeof(*fp));
   1132			sgl_flits -= avail;
   1133			ndesc--;
   1134			if (!sgl_flits)
   1135				break;
   1136
   1137			fp += avail;
   1138			d++;
   1139			sd->eop = 0;
   1140			sd++;
   1141			if (++pidx == q->size) {
   1142				pidx = 0;
   1143				gen ^= 1;
   1144				d = q->desc;
   1145				sd = q->sdesc;
   1146			}
   1147
   1148			sd->skb = skb;
   1149			wrp = (struct work_request_hdr *)d;
   1150			wrp->wr_hi = htonl(V_WR_DATATYPE(1) |
   1151					   V_WR_SGLSFLT(1)) | wr_hi;
   1152			wrp->wr_lo = htonl(V_WR_LEN(min(WR_FLITS,
   1153							sgl_flits + 1)) |
   1154					   V_WR_GEN(gen)) | wr_lo;
   1155			wr_gen2(d, gen);
   1156			flits = 1;
   1157		}
   1158		sd->eop = 1;
   1159		wrp->wr_hi |= htonl(F_WR_EOP);
   1160		dma_wmb();
   1161		wp->wr_lo = htonl(V_WR_LEN(WR_FLITS) | V_WR_GEN(ogen)) | wr_lo;
   1162		wr_gen2((struct tx_desc *)wp, ogen);
   1163		WARN_ON(ndesc != 0);
   1164	}
   1165}
   1166
   1167/**
   1168 *	write_tx_pkt_wr - write a TX_PKT work request
   1169 *	@adap: the adapter
   1170 *	@skb: the packet to send
   1171 *	@pi: the egress interface
   1172 *	@pidx: index of the first Tx descriptor to write
   1173 *	@gen: the generation value to use
   1174 *	@q: the Tx queue
   1175 *	@ndesc: number of descriptors the packet will occupy
   1176 *	@compl: the value of the COMPL bit to use
    1177 *	@addr: the DMA-mapped addresses of the packet's buffers
   1178 *
   1179 *	Generate a TX_PKT work request to send the supplied packet.
   1180 */
   1181static void write_tx_pkt_wr(struct adapter *adap, struct sk_buff *skb,
   1182			    const struct port_info *pi,
   1183			    unsigned int pidx, unsigned int gen,
   1184			    struct sge_txq *q, unsigned int ndesc,
   1185			    unsigned int compl, const dma_addr_t *addr)
   1186{
   1187	unsigned int flits, sgl_flits, cntrl, tso_info;
   1188	struct sg_ent *sgp, sgl[MAX_SKB_FRAGS / 2 + 1];
   1189	struct tx_desc *d = &q->desc[pidx];
   1190	struct cpl_tx_pkt *cpl = (struct cpl_tx_pkt *)d;
   1191
   1192	cpl->len = htonl(skb->len);
   1193	cntrl = V_TXPKT_INTF(pi->port_id);
   1194
   1195	if (skb_vlan_tag_present(skb))
   1196		cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(skb_vlan_tag_get(skb));
   1197
   1198	tso_info = V_LSO_MSS(skb_shinfo(skb)->gso_size);
   1199	if (tso_info) {
   1200		int eth_type;
   1201		struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *)cpl;
   1202
   1203		d->flit[2] = 0;
   1204		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT_LSO);
   1205		hdr->cntrl = htonl(cntrl);
   1206		eth_type = skb_network_offset(skb) == ETH_HLEN ?
   1207		    CPL_ETH_II : CPL_ETH_II_VLAN;
   1208		tso_info |= V_LSO_ETH_TYPE(eth_type) |
   1209		    V_LSO_IPHDR_WORDS(ip_hdr(skb)->ihl) |
   1210		    V_LSO_TCPHDR_WORDS(tcp_hdr(skb)->doff);
   1211		hdr->lso_info = htonl(tso_info);
   1212		flits = 3;
   1213	} else {
   1214		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
   1215		cntrl |= F_TXPKT_IPCSUM_DIS;	/* SW calculates IP csum */
   1216		cntrl |= V_TXPKT_L4CSUM_DIS(skb->ip_summed != CHECKSUM_PARTIAL);
   1217		cpl->cntrl = htonl(cntrl);
   1218
   1219		if (skb->len <= WR_LEN - sizeof(*cpl)) {
   1220			q->sdesc[pidx].skb = NULL;
   1221			if (!skb->data_len)
   1222				skb_copy_from_linear_data(skb, &d->flit[2],
   1223							  skb->len);
   1224			else
   1225				skb_copy_bits(skb, 0, &d->flit[2], skb->len);
   1226
   1227			flits = (skb->len + 7) / 8 + 2;
   1228			cpl->wr.wr_hi = htonl(V_WR_BCNTLFLT(skb->len & 7) |
   1229					      V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT)
   1230					      | F_WR_SOP | F_WR_EOP | compl);
   1231			dma_wmb();
   1232			cpl->wr.wr_lo = htonl(V_WR_LEN(flits) | V_WR_GEN(gen) |
   1233					      V_WR_TID(q->token));
   1234			wr_gen2(d, gen);
   1235			dev_consume_skb_any(skb);
   1236			return;
   1237		}
   1238
   1239		flits = 2;
   1240	}
   1241
   1242	sgp = ndesc == 1 ? (struct sg_ent *)&d->flit[flits] : sgl;
   1243	sgl_flits = write_sgl(skb, sgp, skb->data, skb_headlen(skb), addr);
   1244
   1245	write_wr_hdr_sgl(ndesc, skb, d, pidx, q, sgl, flits, sgl_flits, gen,
   1246			 htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | compl),
   1247			 htonl(V_WR_TID(q->token)));
   1248}
   1249
   1250static inline void t3_stop_tx_queue(struct netdev_queue *txq,
   1251				    struct sge_qset *qs, struct sge_txq *q)
   1252{
   1253	netif_tx_stop_queue(txq);
   1254	set_bit(TXQ_ETH, &qs->txq_stopped);
   1255	q->stops++;
   1256}
   1257
   1258/**
   1259 *	t3_eth_xmit - add a packet to the Ethernet Tx queue
   1260 *	@skb: the packet
   1261 *	@dev: the egress net device
   1262 *
   1263 *	Add a packet to an SGE Tx queue.  Runs with softirqs disabled.
   1264 */
   1265netdev_tx_t t3_eth_xmit(struct sk_buff *skb, struct net_device *dev)
   1266{
   1267	int qidx;
   1268	unsigned int ndesc, pidx, credits, gen, compl;
   1269	const struct port_info *pi = netdev_priv(dev);
   1270	struct adapter *adap = pi->adapter;
   1271	struct netdev_queue *txq;
   1272	struct sge_qset *qs;
   1273	struct sge_txq *q;
   1274	dma_addr_t addr[MAX_SKB_FRAGS + 1];
   1275
   1276	/*
    1277	 * The chip min packet length is 9 octets, but play it safe and reject
   1278	 * anything shorter than an Ethernet header.
   1279	 */
   1280	if (unlikely(skb->len < ETH_HLEN)) {
   1281		dev_kfree_skb_any(skb);
   1282		return NETDEV_TX_OK;
   1283	}
   1284
   1285	qidx = skb_get_queue_mapping(skb);
   1286	qs = &pi->qs[qidx];
   1287	q = &qs->txq[TXQ_ETH];
   1288	txq = netdev_get_tx_queue(dev, qidx);
   1289
   1290	reclaim_completed_tx(adap, q, TX_RECLAIM_CHUNK);
   1291
   1292	credits = q->size - q->in_use;
   1293	ndesc = calc_tx_descs(skb);
   1294
   1295	if (unlikely(credits < ndesc)) {
   1296		t3_stop_tx_queue(txq, qs, q);
   1297		dev_err(&adap->pdev->dev,
   1298			"%s: Tx ring %u full while queue awake!\n",
   1299			dev->name, q->cntxt_id & 7);
   1300		return NETDEV_TX_BUSY;
   1301	}
   1302
    1303	/* If the packet can't be sent as immediate data, map it for DMA */
   1304	if (skb->len > (WR_LEN - sizeof(struct cpl_tx_pkt))) {
   1305		if (unlikely(map_skb(adap->pdev, skb, addr) < 0)) {
   1306			dev_kfree_skb(skb);
   1307			return NETDEV_TX_OK;
   1308		}
   1309	}
   1310
   1311	q->in_use += ndesc;
   1312	if (unlikely(credits - ndesc < q->stop_thres)) {
   1313		t3_stop_tx_queue(txq, qs, q);
   1314
   1315		if (should_restart_tx(q) &&
   1316		    test_and_clear_bit(TXQ_ETH, &qs->txq_stopped)) {
   1317			q->restarts++;
   1318			netif_tx_start_queue(txq);
   1319		}
   1320	}
   1321
   1322	gen = q->gen;
   1323	q->unacked += ndesc;
   1324	compl = (q->unacked & 8) << (S_WR_COMPL - 3);
   1325	q->unacked &= 7;
   1326	pidx = q->pidx;
   1327	q->pidx += ndesc;
   1328	if (q->pidx >= q->size) {
   1329		q->pidx -= q->size;
   1330		q->gen ^= 1;
   1331	}
   1332
   1333	/* update port statistics */
   1334	if (skb->ip_summed == CHECKSUM_PARTIAL)
   1335		qs->port_stats[SGE_PSTAT_TX_CSUM]++;
   1336	if (skb_shinfo(skb)->gso_size)
   1337		qs->port_stats[SGE_PSTAT_TSO]++;
   1338	if (skb_vlan_tag_present(skb))
   1339		qs->port_stats[SGE_PSTAT_VLANINS]++;
   1340
   1341	/*
   1342	 * We do not use Tx completion interrupts to free DMAd Tx packets.
   1343	 * This is good for performance but means that we rely on new Tx
   1344	 * packets arriving to run the destructors of completed packets,
   1345	 * which open up space in their sockets' send queues.  Sometimes
    1346	 * we do not get such new packets, causing Tx to stall.  A single
   1347	 * UDP transmitter is a good example of this situation.  We have
   1348	 * a clean up timer that periodically reclaims completed packets
   1349	 * but it doesn't run often enough (nor do we want it to) to prevent
   1350	 * lengthy stalls.  A solution to this problem is to run the
   1351	 * destructor early, after the packet is queued but before it's DMAd.
    1352	 * A downside is that we lie to socket memory accounting, but the amount
   1353	 * of extra memory is reasonable (limited by the number of Tx
   1354	 * descriptors), the packets do actually get freed quickly by new
   1355	 * packets almost always, and for protocols like TCP that wait for
   1356	 * acks to really free up the data the extra memory is even less.
   1357	 * On the positive side we run the destructors on the sending CPU
   1358	 * rather than on a potentially different completing CPU, usually a
   1359	 * good thing.  We also run them without holding our Tx queue lock,
   1360	 * unlike what reclaim_completed_tx() would otherwise do.
   1361	 *
   1362	 * Run the destructor before telling the DMA engine about the packet
   1363	 * to make sure it doesn't complete and get freed prematurely.
   1364	 */
   1365	if (likely(!skb_shared(skb)))
   1366		skb_orphan(skb);
   1367
   1368	write_tx_pkt_wr(adap, skb, pi, pidx, gen, q, ndesc, compl, addr);
   1369	check_ring_tx_db(adap, q);
   1370	return NETDEV_TX_OK;
   1371}
   1372
   1373/**
   1374 *	write_imm - write a packet into a Tx descriptor as immediate data
   1375 *	@d: the Tx descriptor to write
   1376 *	@skb: the packet
   1377 *	@len: the length of packet data to write as immediate data
   1378 *	@gen: the generation bit value to write
   1379 *
   1380 *	Writes a packet as immediate data into a Tx descriptor.  The packet
   1381 *	contains a work request at its beginning.  We must write the packet
   1382 *	carefully so the SGE doesn't read it accidentally before it's written
   1383 *	in its entirety.
   1384 */
   1385static inline void write_imm(struct tx_desc *d, struct sk_buff *skb,
   1386			     unsigned int len, unsigned int gen)
   1387{
   1388	struct work_request_hdr *from = (struct work_request_hdr *)skb->data;
   1389	struct work_request_hdr *to = (struct work_request_hdr *)d;
   1390
   1391	if (likely(!skb->data_len))
   1392		memcpy(&to[1], &from[1], len - sizeof(*from));
   1393	else
   1394		skb_copy_bits(skb, sizeof(*from), &to[1], len - sizeof(*from));
   1395
   1396	to->wr_hi = from->wr_hi | htonl(F_WR_SOP | F_WR_EOP |
   1397					V_WR_BCNTLFLT(len & 7));
   1398	dma_wmb();
   1399	to->wr_lo = from->wr_lo | htonl(V_WR_GEN(gen) |
   1400					V_WR_LEN((len + 7) / 8));
   1401	wr_gen2(d, gen);
   1402	kfree_skb(skb);
   1403}
   1404
   1405/**
   1406 *	check_desc_avail - check descriptor availability on a send queue
   1407 *	@adap: the adapter
   1408 *	@q: the send queue
   1409 *	@skb: the packet needing the descriptors
   1410 *	@ndesc: the number of Tx descriptors needed
   1411 *	@qid: the Tx queue number in its queue set (TXQ_OFLD or TXQ_CTRL)
   1412 *
   1413 *	Checks if the requested number of Tx descriptors is available on an
   1414 *	SGE send queue.  If the queue is already suspended or not enough
   1415 *	descriptors are available the packet is queued for later transmission.
   1416 *	Must be called with the Tx queue locked.
   1417 *
   1418 *	Returns 0 if enough descriptors are available, 1 if there aren't
   1419 *	enough descriptors and the packet has been queued, and 2 if the caller
   1420 *	needs to retry because there weren't enough descriptors at the
   1421 *	beginning of the call but some freed up in the mean time.
   1422 */
   1423static inline int check_desc_avail(struct adapter *adap, struct sge_txq *q,
   1424				   struct sk_buff *skb, unsigned int ndesc,
   1425				   unsigned int qid)
   1426{
   1427	if (unlikely(!skb_queue_empty(&q->sendq))) {
   1428	      addq_exit:__skb_queue_tail(&q->sendq, skb);
   1429		return 1;
   1430	}
   1431	if (unlikely(q->size - q->in_use < ndesc)) {
   1432		struct sge_qset *qs = txq_to_qset(q, qid);
   1433
   1434		set_bit(qid, &qs->txq_stopped);
   1435		smp_mb__after_atomic();
   1436
   1437		if (should_restart_tx(q) &&
   1438		    test_and_clear_bit(qid, &qs->txq_stopped))
   1439			return 2;
   1440
   1441		q->stops++;
   1442		goto addq_exit;
   1443	}
   1444	return 0;
   1445}
   1446
   1447/**
   1448 *	reclaim_completed_tx_imm - reclaim completed control-queue Tx descs
   1449 *	@q: the SGE control Tx queue
   1450 *
   1451 *	This is a variant of reclaim_completed_tx() that is used for Tx queues
   1452 *	that send only immediate data (presently just the control queues) and
   1453 *	thus do not have any sk_buffs to release.
   1454 */
   1455static inline void reclaim_completed_tx_imm(struct sge_txq *q)
   1456{
   1457	unsigned int reclaim = q->processed - q->cleaned;
   1458
   1459	q->in_use -= reclaim;
   1460	q->cleaned += reclaim;
   1461}
   1462
   1463static inline int immediate(const struct sk_buff *skb)
   1464{
   1465	return skb->len <= WR_LEN;
   1466}
   1467
   1468/**
   1469 *	ctrl_xmit - send a packet through an SGE control Tx queue
   1470 *	@adap: the adapter
   1471 *	@q: the control queue
   1472 *	@skb: the packet
   1473 *
   1474 *	Send a packet through an SGE control Tx queue.  Packets sent through
   1475 *	a control queue must fit entirely as immediate data in a single Tx
   1476 *	descriptor and have no page fragments.
   1477 */
   1478static int ctrl_xmit(struct adapter *adap, struct sge_txq *q,
   1479		     struct sk_buff *skb)
   1480{
   1481	int ret;
   1482	struct work_request_hdr *wrp = (struct work_request_hdr *)skb->data;
   1483
   1484	if (unlikely(!immediate(skb))) {
   1485		WARN_ON(1);
   1486		dev_kfree_skb(skb);
   1487		return NET_XMIT_SUCCESS;
   1488	}
   1489
   1490	wrp->wr_hi |= htonl(F_WR_SOP | F_WR_EOP);
   1491	wrp->wr_lo = htonl(V_WR_TID(q->token));
   1492
   1493	spin_lock(&q->lock);
   1494      again:reclaim_completed_tx_imm(q);
   1495
   1496	ret = check_desc_avail(adap, q, skb, 1, TXQ_CTRL);
   1497	if (unlikely(ret)) {
   1498		if (ret == 1) {
   1499			spin_unlock(&q->lock);
   1500			return NET_XMIT_CN;
   1501		}
   1502		goto again;
   1503	}
   1504
   1505	write_imm(&q->desc[q->pidx], skb, skb->len, q->gen);
   1506
   1507	q->in_use++;
   1508	if (++q->pidx >= q->size) {
   1509		q->pidx = 0;
   1510		q->gen ^= 1;
   1511	}
   1512	spin_unlock(&q->lock);
   1513	wmb();
   1514	t3_write_reg(adap, A_SG_KDOORBELL,
   1515		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
   1516	return NET_XMIT_SUCCESS;
   1517}
   1518
   1519/**
   1520 *	restart_ctrlq - restart a suspended control queue
   1521 *	@w: pointer to the work associated with this handler
   1522 *
   1523 *	Resumes transmission on a suspended Tx control queue.
   1524 */
   1525static void restart_ctrlq(struct work_struct *w)
   1526{
   1527	struct sk_buff *skb;
   1528	struct sge_qset *qs = container_of(w, struct sge_qset,
   1529					   txq[TXQ_CTRL].qresume_task);
   1530	struct sge_txq *q = &qs->txq[TXQ_CTRL];
   1531
   1532	spin_lock(&q->lock);
   1533      again:reclaim_completed_tx_imm(q);
   1534
   1535	while (q->in_use < q->size &&
   1536	       (skb = __skb_dequeue(&q->sendq)) != NULL) {
   1537
   1538		write_imm(&q->desc[q->pidx], skb, skb->len, q->gen);
   1539
   1540		if (++q->pidx >= q->size) {
   1541			q->pidx = 0;
   1542			q->gen ^= 1;
   1543		}
   1544		q->in_use++;
   1545	}
   1546
   1547	if (!skb_queue_empty(&q->sendq)) {
   1548		set_bit(TXQ_CTRL, &qs->txq_stopped);
   1549		smp_mb__after_atomic();
   1550
   1551		if (should_restart_tx(q) &&
   1552		    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped))
   1553			goto again;
   1554		q->stops++;
   1555	}
   1556
   1557	spin_unlock(&q->lock);
   1558	wmb();
   1559	t3_write_reg(qs->adap, A_SG_KDOORBELL,
   1560		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
   1561}
   1562
   1563/*
   1564 * Send a management message through control queue 0
   1565 */
   1566int t3_mgmt_tx(struct adapter *adap, struct sk_buff *skb)
   1567{
   1568	int ret;
   1569	local_bh_disable();
   1570	ret = ctrl_xmit(adap, &adap->sge.qs[0].txq[TXQ_CTRL], skb);
   1571	local_bh_enable();
   1572
   1573	return ret;
   1574}
   1575
   1576/**
   1577 *	deferred_unmap_destructor - unmap a packet when it is freed
   1578 *	@skb: the packet
   1579 *
   1580 *	This is the packet destructor used for Tx packets that need to remain
   1581 *	mapped until they are freed rather than until their Tx descriptors are
   1582 *	freed.
   1583 */
   1584static void deferred_unmap_destructor(struct sk_buff *skb)
   1585{
   1586	int i;
   1587	const dma_addr_t *p;
   1588	const struct skb_shared_info *si;
   1589	const struct deferred_unmap_info *dui;
   1590
   1591	dui = (struct deferred_unmap_info *)skb->head;
   1592	p = dui->addr;
   1593
   1594	if (skb_tail_pointer(skb) - skb_transport_header(skb))
   1595		dma_unmap_single(&dui->pdev->dev, *p++,
   1596				 skb_tail_pointer(skb) - skb_transport_header(skb),
   1597				 DMA_TO_DEVICE);
   1598
   1599	si = skb_shinfo(skb);
   1600	for (i = 0; i < si->nr_frags; i++)
   1601		dma_unmap_page(&dui->pdev->dev, *p++,
   1602			       skb_frag_size(&si->frags[i]), DMA_TO_DEVICE);
   1603}
   1604
   1605static void setup_deferred_unmapping(struct sk_buff *skb, struct pci_dev *pdev,
   1606				     const struct sg_ent *sgl, int sgl_flits)
   1607{
   1608	dma_addr_t *p;
   1609	struct deferred_unmap_info *dui;
   1610
   1611	dui = (struct deferred_unmap_info *)skb->head;
   1612	dui->pdev = pdev;
   1613	for (p = dui->addr; sgl_flits >= 3; sgl++, sgl_flits -= 3) {
   1614		*p++ = be64_to_cpu(sgl->addr[0]);
   1615		*p++ = be64_to_cpu(sgl->addr[1]);
   1616	}
   1617	if (sgl_flits)
   1618		*p = be64_to_cpu(sgl->addr[0]);
   1619}
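
/*
 * Note: each sg_ent holds two DMA addresses in three flits, which is why the
 * loop above consumes the SGL in steps of three flits and copies two
 * addresses per entry, with a single trailing address for an odd-sized SGL.
 * The addresses are stored in the deferred_unmap_info at skb->head so that
 * deferred_unmap_destructor() can unmap them when the skb is freed rather
 * than when its Tx descriptors are reclaimed.
 */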
   1620
   1621/**
   1622 *	write_ofld_wr - write an offload work request
   1623 *	@adap: the adapter
   1624 *	@skb: the packet to send
   1625 *	@q: the Tx queue
   1626 *	@pidx: index of the first Tx descriptor to write
   1627 *	@gen: the generation value to use
   1628 *	@ndesc: number of descriptors the packet will occupy
    1629 *	@addr: the DMA addresses of the packet's data buffers
   1630 *
   1631 *	Write an offload work request to send the supplied packet.  The packet
   1632 *	data already carry the work request with most fields populated.
   1633 */
   1634static void write_ofld_wr(struct adapter *adap, struct sk_buff *skb,
   1635			  struct sge_txq *q, unsigned int pidx,
   1636			  unsigned int gen, unsigned int ndesc,
   1637			  const dma_addr_t *addr)
   1638{
   1639	unsigned int sgl_flits, flits;
   1640	struct work_request_hdr *from;
   1641	struct sg_ent *sgp, sgl[MAX_SKB_FRAGS / 2 + 1];
   1642	struct tx_desc *d = &q->desc[pidx];
   1643
   1644	if (immediate(skb)) {
   1645		q->sdesc[pidx].skb = NULL;
   1646		write_imm(d, skb, skb->len, gen);
   1647		return;
   1648	}
   1649
   1650	/* Only TX_DATA builds SGLs */
   1651
   1652	from = (struct work_request_hdr *)skb->data;
   1653	memcpy(&d->flit[1], &from[1],
   1654	       skb_transport_offset(skb) - sizeof(*from));
   1655
   1656	flits = skb_transport_offset(skb) / 8;
   1657	sgp = ndesc == 1 ? (struct sg_ent *)&d->flit[flits] : sgl;
   1658	sgl_flits = write_sgl(skb, sgp, skb_transport_header(skb),
   1659			      skb_tail_pointer(skb) - skb_transport_header(skb),
   1660			      addr);
   1661	if (need_skb_unmap()) {
   1662		setup_deferred_unmapping(skb, adap->pdev, sgp, sgl_flits);
   1663		skb->destructor = deferred_unmap_destructor;
   1664	}
   1665
   1666	write_wr_hdr_sgl(ndesc, skb, d, pidx, q, sgl, flits, sgl_flits,
   1667			 gen, from->wr_hi, from->wr_lo);
   1668}
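
/*
 * Note: when the work request fits in a single descriptor (ndesc == 1) the
 * SGL is built in place in that descriptor's flits; otherwise it is
 * assembled in the on-stack sgl[] array and write_wr_hdr_sgl() spreads it
 * across the ndesc descriptors.
 */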
   1669
   1670/**
   1671 *	calc_tx_descs_ofld - calculate # of Tx descriptors for an offload packet
   1672 *	@skb: the packet
   1673 *
   1674 * 	Returns the number of Tx descriptors needed for the given offload
   1675 * 	packet.  These packets are already fully constructed.
   1676 */
   1677static inline unsigned int calc_tx_descs_ofld(const struct sk_buff *skb)
   1678{
   1679	unsigned int flits, cnt;
   1680
   1681	if (skb->len <= WR_LEN)
   1682		return 1;	/* packet fits as immediate data */
   1683
   1684	flits = skb_transport_offset(skb) / 8;	/* headers */
   1685	cnt = skb_shinfo(skb)->nr_frags;
   1686	if (skb_tail_pointer(skb) != skb_transport_header(skb))
   1687		cnt++;
   1688	return flits_to_desc(flits + sgl_len(cnt));
   1689}
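
/*
 * Note: the SGL entry count above is one per page fragment plus one more if
 * the linear area extends past the transport header; anything no longer than
 * WR_LEN is sent as immediate data and always fits in a single descriptor.
 */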
   1690
   1691/**
   1692 *	ofld_xmit - send a packet through an offload queue
   1693 *	@adap: the adapter
   1694 *	@q: the Tx offload queue
   1695 *	@skb: the packet
   1696 *
   1697 *	Send an offload packet through an SGE offload queue.
   1698 */
   1699static int ofld_xmit(struct adapter *adap, struct sge_txq *q,
   1700		     struct sk_buff *skb)
   1701{
   1702	int ret;
   1703	unsigned int ndesc = calc_tx_descs_ofld(skb), pidx, gen;
   1704
   1705	spin_lock(&q->lock);
   1706again:	reclaim_completed_tx(adap, q, TX_RECLAIM_CHUNK);
   1707
   1708	ret = check_desc_avail(adap, q, skb, ndesc, TXQ_OFLD);
   1709	if (unlikely(ret)) {
   1710		if (ret == 1) {
   1711			skb->priority = ndesc;	/* save for restart */
   1712			spin_unlock(&q->lock);
   1713			return NET_XMIT_CN;
   1714		}
   1715		goto again;
   1716	}
   1717
   1718	if (!immediate(skb) &&
   1719	    map_skb(adap->pdev, skb, (dma_addr_t *)skb->head)) {
   1720		spin_unlock(&q->lock);
   1721		return NET_XMIT_SUCCESS;
   1722	}
   1723
   1724	gen = q->gen;
   1725	q->in_use += ndesc;
   1726	pidx = q->pidx;
   1727	q->pidx += ndesc;
   1728	if (q->pidx >= q->size) {
   1729		q->pidx -= q->size;
   1730		q->gen ^= 1;
   1731	}
   1732	spin_unlock(&q->lock);
   1733
   1734	write_ofld_wr(adap, skb, q, pidx, gen, ndesc, (dma_addr_t *)skb->head);
   1735	check_ring_tx_db(adap, q);
   1736	return NET_XMIT_SUCCESS;
   1737}
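
/*
 * Note: when the offload queue is out of descriptors, ofld_xmit() saves the
 * computed descriptor count in skb->priority before returning NET_XMIT_CN so
 * that restart_offloadq() can reuse it without recomputing.  The DMA
 * addresses written by map_skb() at skb->head are likewise consumed later by
 * write_ofld_wr().
 */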
   1738
   1739/**
   1740 *	restart_offloadq - restart a suspended offload queue
   1741 *	@w: pointer to the work associated with this handler
   1742 *
   1743 *	Resumes transmission on a suspended Tx offload queue.
   1744 */
   1745static void restart_offloadq(struct work_struct *w)
   1746{
   1747	struct sk_buff *skb;
   1748	struct sge_qset *qs = container_of(w, struct sge_qset,
   1749					   txq[TXQ_OFLD].qresume_task);
   1750	struct sge_txq *q = &qs->txq[TXQ_OFLD];
   1751	const struct port_info *pi = netdev_priv(qs->netdev);
   1752	struct adapter *adap = pi->adapter;
   1753	unsigned int written = 0;
   1754
   1755	spin_lock(&q->lock);
   1756again:	reclaim_completed_tx(adap, q, TX_RECLAIM_CHUNK);
   1757
   1758	while ((skb = skb_peek(&q->sendq)) != NULL) {
   1759		unsigned int gen, pidx;
   1760		unsigned int ndesc = skb->priority;
   1761
   1762		if (unlikely(q->size - q->in_use < ndesc)) {
   1763			set_bit(TXQ_OFLD, &qs->txq_stopped);
   1764			smp_mb__after_atomic();
   1765
   1766			if (should_restart_tx(q) &&
   1767			    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped))
   1768				goto again;
   1769			q->stops++;
   1770			break;
   1771		}
   1772
   1773		if (!immediate(skb) &&
   1774		    map_skb(adap->pdev, skb, (dma_addr_t *)skb->head))
   1775			break;
   1776
   1777		gen = q->gen;
   1778		q->in_use += ndesc;
   1779		pidx = q->pidx;
   1780		q->pidx += ndesc;
   1781		written += ndesc;
   1782		if (q->pidx >= q->size) {
   1783			q->pidx -= q->size;
   1784			q->gen ^= 1;
   1785		}
   1786		__skb_unlink(skb, &q->sendq);
   1787		spin_unlock(&q->lock);
   1788
   1789		write_ofld_wr(adap, skb, q, pidx, gen, ndesc,
   1790			      (dma_addr_t *)skb->head);
   1791		spin_lock(&q->lock);
   1792	}
   1793	spin_unlock(&q->lock);
   1794
   1795#if USE_GTS
   1796	set_bit(TXQ_RUNNING, &q->flags);
   1797	set_bit(TXQ_LAST_PKT_DB, &q->flags);
   1798#endif
   1799	wmb();
   1800	if (likely(written))
   1801		t3_write_reg(adap, A_SG_KDOORBELL,
   1802			     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
   1803}
   1804
   1805/**
   1806 *	queue_set - return the queue set a packet should use
   1807 *	@skb: the packet
   1808 *
   1809 *	Maps a packet to the SGE queue set it should use.  The desired queue
   1810 *	set is carried in bits 1-3 in the packet's priority.
   1811 */
   1812static inline int queue_set(const struct sk_buff *skb)
   1813{
   1814	return skb->priority >> 1;
   1815}
   1816
   1817/**
   1818 *	is_ctrl_pkt - return whether an offload packet is a control packet
   1819 *	@skb: the packet
   1820 *
   1821 *	Determines whether an offload packet should use an OFLD or a CTRL
   1822 *	Tx queue.  This is indicated by bit 0 in the packet's priority.
   1823 */
   1824static inline int is_ctrl_pkt(const struct sk_buff *skb)
   1825{
   1826	return skb->priority & 1;
   1827}
   1828
   1829/**
   1830 *	t3_offload_tx - send an offload packet
   1831 *	@tdev: the offload device to send to
   1832 *	@skb: the packet
   1833 *
   1834 *	Sends an offload packet.  We use the packet priority to select the
   1835 *	appropriate Tx queue as follows: bit 0 indicates whether the packet
   1836 *	should be sent as regular or control, bits 1-3 select the queue set.
   1837 */
   1838int t3_offload_tx(struct t3cdev *tdev, struct sk_buff *skb)
   1839{
   1840	struct adapter *adap = tdev2adap(tdev);
   1841	struct sge_qset *qs = &adap->sge.qs[queue_set(skb)];
   1842
   1843	if (unlikely(is_ctrl_pkt(skb)))
   1844		return ctrl_xmit(adap, &qs->txq[TXQ_CTRL], skb);
   1845
   1846	return ofld_xmit(adap, &qs->txq[TXQ_OFLD], skb);
   1847}
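
/*
 * Example of the priority encoding consumed above: a caller targeting the
 * control queue of queue set 2 would set skb->priority = (2 << 1) | 1,
 * i.e. bit 0 selects CTRL vs. OFLD and bits 1-3 select the queue set.
 */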
   1848
   1849/**
   1850 *	offload_enqueue - add an offload packet to an SGE offload receive queue
   1851 *	@q: the SGE response queue
   1852 *	@skb: the packet
   1853 *
   1854 *	Add a new offload packet to an SGE response queue's offload packet
   1855 *	queue.  If the packet is the first on the queue it schedules the RX
   1856 *	softirq to process the queue.
   1857 */
   1858static inline void offload_enqueue(struct sge_rspq *q, struct sk_buff *skb)
   1859{
   1860	int was_empty = skb_queue_empty(&q->rx_queue);
   1861
   1862	__skb_queue_tail(&q->rx_queue, skb);
   1863
   1864	if (was_empty) {
   1865		struct sge_qset *qs = rspq_to_qset(q);
   1866
   1867		napi_schedule(&qs->napi);
   1868	}
   1869}
   1870
   1871/**
   1872 *	deliver_partial_bundle - deliver a (partial) bundle of Rx offload pkts
   1873 *	@tdev: the offload device that will be receiving the packets
   1874 *	@q: the SGE response queue that assembled the bundle
   1875 *	@skbs: the partial bundle
   1876 *	@n: the number of packets in the bundle
   1877 *
   1878 *	Delivers a (partial) bundle of Rx offload packets to an offload device.
   1879 */
   1880static inline void deliver_partial_bundle(struct t3cdev *tdev,
   1881					  struct sge_rspq *q,
   1882					  struct sk_buff *skbs[], int n)
   1883{
   1884	if (n) {
   1885		q->offload_bundles++;
   1886		tdev->recv(tdev, skbs, n);
   1887	}
   1888}
   1889
   1890/**
   1891 *	ofld_poll - NAPI handler for offload packets in interrupt mode
   1892 *	@napi: the network device doing the polling
   1893 *	@budget: polling budget
   1894 *
   1895 *	The NAPI handler for offload packets when a response queue is serviced
   1896 *	by the hard interrupt handler, i.e., when it's operating in non-polling
   1897 *	mode.  Creates small packet batches and sends them through the offload
   1898 *	receive handler.  Batches need to be of modest size as we do prefetches
   1899 *	on the packets in each.
   1900 */
   1901static int ofld_poll(struct napi_struct *napi, int budget)
   1902{
   1903	struct sge_qset *qs = container_of(napi, struct sge_qset, napi);
   1904	struct sge_rspq *q = &qs->rspq;
   1905	struct adapter *adapter = qs->adap;
   1906	int work_done = 0;
   1907
   1908	while (work_done < budget) {
   1909		struct sk_buff *skb, *tmp, *skbs[RX_BUNDLE_SIZE];
   1910		struct sk_buff_head queue;
   1911		int ngathered;
   1912
   1913		spin_lock_irq(&q->lock);
   1914		__skb_queue_head_init(&queue);
   1915		skb_queue_splice_init(&q->rx_queue, &queue);
   1916		if (skb_queue_empty(&queue)) {
   1917			napi_complete_done(napi, work_done);
   1918			spin_unlock_irq(&q->lock);
   1919			return work_done;
   1920		}
   1921		spin_unlock_irq(&q->lock);
   1922
   1923		ngathered = 0;
   1924		skb_queue_walk_safe(&queue, skb, tmp) {
   1925			if (work_done >= budget)
   1926				break;
   1927			work_done++;
   1928
   1929			__skb_unlink(skb, &queue);
   1930			prefetch(skb->data);
   1931			skbs[ngathered] = skb;
   1932			if (++ngathered == RX_BUNDLE_SIZE) {
   1933				q->offload_bundles++;
   1934				adapter->tdev.recv(&adapter->tdev, skbs,
   1935						   ngathered);
   1936				ngathered = 0;
   1937			}
   1938		}
   1939		if (!skb_queue_empty(&queue)) {
   1940			/* splice remaining packets back onto Rx queue */
   1941			spin_lock_irq(&q->lock);
   1942			skb_queue_splice(&queue, &q->rx_queue);
   1943			spin_unlock_irq(&q->lock);
   1944		}
   1945		deliver_partial_bundle(&adapter->tdev, q, skbs, ngathered);
   1946	}
   1947
   1948	return work_done;
   1949}
   1950
   1951/**
   1952 *	rx_offload - process a received offload packet
   1953 *	@tdev: the offload device receiving the packet
   1954 *	@rq: the response queue that received the packet
   1955 *	@skb: the packet
   1956 *	@rx_gather: a gather list of packets if we are building a bundle
   1957 *	@gather_idx: index of the next available slot in the bundle
   1958 *
   1959 *	Process an ingress offload packet and add it to the offload ingress
    1960 *	queue.  Returns the index of the next available slot in the bundle.
   1961 */
   1962static inline int rx_offload(struct t3cdev *tdev, struct sge_rspq *rq,
   1963			     struct sk_buff *skb, struct sk_buff *rx_gather[],
   1964			     unsigned int gather_idx)
   1965{
   1966	skb_reset_mac_header(skb);
   1967	skb_reset_network_header(skb);
   1968	skb_reset_transport_header(skb);
   1969
   1970	if (rq->polling) {
   1971		rx_gather[gather_idx++] = skb;
   1972		if (gather_idx == RX_BUNDLE_SIZE) {
   1973			tdev->recv(tdev, rx_gather, RX_BUNDLE_SIZE);
   1974			gather_idx = 0;
   1975			rq->offload_bundles++;
   1976		}
   1977	} else
   1978		offload_enqueue(rq, skb);
   1979
   1980	return gather_idx;
   1981}
   1982
   1983/**
   1984 *	restart_tx - check whether to restart suspended Tx queues
   1985 *	@qs: the queue set to resume
   1986 *
   1987 *	Restarts suspended Tx queues of an SGE queue set if they have enough
   1988 *	free resources to resume operation.
   1989 */
   1990static void restart_tx(struct sge_qset *qs)
   1991{
   1992	if (test_bit(TXQ_ETH, &qs->txq_stopped) &&
   1993	    should_restart_tx(&qs->txq[TXQ_ETH]) &&
   1994	    test_and_clear_bit(TXQ_ETH, &qs->txq_stopped)) {
   1995		qs->txq[TXQ_ETH].restarts++;
   1996		if (netif_running(qs->netdev))
   1997			netif_tx_wake_queue(qs->tx_q);
   1998	}
   1999
   2000	if (test_bit(TXQ_OFLD, &qs->txq_stopped) &&
   2001	    should_restart_tx(&qs->txq[TXQ_OFLD]) &&
   2002	    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) {
   2003		qs->txq[TXQ_OFLD].restarts++;
   2004
    2005		/* The work can be quite lengthy so we use the driver's own workqueue */
   2006		queue_work(cxgb3_wq, &qs->txq[TXQ_OFLD].qresume_task);
   2007	}
   2008	if (test_bit(TXQ_CTRL, &qs->txq_stopped) &&
   2009	    should_restart_tx(&qs->txq[TXQ_CTRL]) &&
   2010	    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) {
   2011		qs->txq[TXQ_CTRL].restarts++;
   2012
    2013		/* The work can be quite lengthy so we use the driver's own workqueue */
   2014		queue_work(cxgb3_wq, &qs->txq[TXQ_CTRL].qresume_task);
   2015	}
   2016}
   2017
   2018/**
   2019 *	cxgb3_arp_process - process an ARP request probing a private IP address
   2020 *	@pi: the port info
   2021 *	@skb: the skbuff containing the ARP request
   2022 *
   2023 *	Check if the ARP request is probing the private IP address
    2024 *	dedicated to iSCSI and generate an ARP reply if so.
   2025 */
   2026static void cxgb3_arp_process(struct port_info *pi, struct sk_buff *skb)
   2027{
   2028	struct net_device *dev = skb->dev;
   2029	struct arphdr *arp;
   2030	unsigned char *arp_ptr;
   2031	unsigned char *sha;
   2032	__be32 sip, tip;
   2033
   2034	if (!dev)
   2035		return;
   2036
   2037	skb_reset_network_header(skb);
   2038	arp = arp_hdr(skb);
   2039
   2040	if (arp->ar_op != htons(ARPOP_REQUEST))
   2041		return;
   2042
   2043	arp_ptr = (unsigned char *)(arp + 1);
   2044	sha = arp_ptr;
   2045	arp_ptr += dev->addr_len;
   2046	memcpy(&sip, arp_ptr, sizeof(sip));
   2047	arp_ptr += sizeof(sip);
   2048	arp_ptr += dev->addr_len;
   2049	memcpy(&tip, arp_ptr, sizeof(tip));
   2050
   2051	if (tip != pi->iscsi_ipv4addr)
   2052		return;
   2053
   2054	arp_send(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha,
   2055		 pi->iscsic.mac_addr, sha);
   2056
   2057}
   2058
   2059static inline int is_arp(struct sk_buff *skb)
   2060{
   2061	return skb->protocol == htons(ETH_P_ARP);
   2062}
   2063
   2064static void cxgb3_process_iscsi_prov_pack(struct port_info *pi,
   2065					struct sk_buff *skb)
   2066{
   2067	if (is_arp(skb)) {
   2068		cxgb3_arp_process(pi, skb);
   2069		return;
   2070	}
   2071
   2072	if (pi->iscsic.recv)
   2073		pi->iscsic.recv(pi, skb);
   2074
   2075}
   2076
   2077/**
   2078 *	rx_eth - process an ingress ethernet packet
   2079 *	@adap: the adapter
   2080 *	@rq: the response queue that received the packet
   2081 *	@skb: the packet
   2082 *	@pad: padding
   2083 *	@lro: large receive offload
   2084 *
   2085 *	Process an ingress ethernet packet and deliver it to the stack.
   2086 *	The padding is 2 if the packet was delivered in an Rx buffer and 0
   2087 *	if it was immediate data in a response.
   2088 */
   2089static void rx_eth(struct adapter *adap, struct sge_rspq *rq,
   2090		   struct sk_buff *skb, int pad, int lro)
   2091{
   2092	struct cpl_rx_pkt *p = (struct cpl_rx_pkt *)(skb->data + pad);
   2093	struct sge_qset *qs = rspq_to_qset(rq);
   2094	struct port_info *pi;
   2095
   2096	skb_pull(skb, sizeof(*p) + pad);
   2097	skb->protocol = eth_type_trans(skb, adap->port[p->iff]);
   2098	pi = netdev_priv(skb->dev);
   2099	if ((skb->dev->features & NETIF_F_RXCSUM) && p->csum_valid &&
   2100	    p->csum == htons(0xffff) && !p->fragment) {
   2101		qs->port_stats[SGE_PSTAT_RX_CSUM_GOOD]++;
   2102		skb->ip_summed = CHECKSUM_UNNECESSARY;
   2103	} else
   2104		skb_checksum_none_assert(skb);
   2105	skb_record_rx_queue(skb, qs - &adap->sge.qs[pi->first_qset]);
   2106
   2107	if (p->vlan_valid) {
   2108		qs->port_stats[SGE_PSTAT_VLANEX]++;
   2109		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), ntohs(p->vlan));
   2110	}
   2111	if (rq->polling) {
   2112		if (lro)
   2113			napi_gro_receive(&qs->napi, skb);
   2114		else {
   2115			if (unlikely(pi->iscsic.flags))
   2116				cxgb3_process_iscsi_prov_pack(pi, skb);
   2117			netif_receive_skb(skb);
   2118		}
   2119	} else
   2120		netif_rx(skb);
   2121}
   2122
   2123static inline int is_eth_tcp(u32 rss)
   2124{
   2125	return G_HASHTYPE(ntohl(rss)) == RSS_HASH_4_TUPLE;
   2126}
   2127
   2128/**
   2129 *	lro_add_page - add a page chunk to an LRO session
   2130 *	@adap: the adapter
   2131 *	@qs: the associated queue set
   2132 *	@fl: the free list containing the page chunk to add
   2133 *	@len: packet length
   2134 *	@complete: Indicates the last fragment of a frame
   2135 *
   2136 *	Add a received packet contained in a page chunk to an existing LRO
   2137 *	session.
   2138 */
   2139static void lro_add_page(struct adapter *adap, struct sge_qset *qs,
   2140			 struct sge_fl *fl, int len, int complete)
   2141{
   2142	struct rx_sw_desc *sd = &fl->sdesc[fl->cidx];
   2143	struct port_info *pi = netdev_priv(qs->netdev);
   2144	struct sk_buff *skb = NULL;
   2145	struct cpl_rx_pkt *cpl;
   2146	skb_frag_t *rx_frag;
   2147	int nr_frags;
   2148	int offset = 0;
   2149
   2150	if (!qs->nomem) {
   2151		skb = napi_get_frags(&qs->napi);
   2152		qs->nomem = !skb;
   2153	}
   2154
   2155	fl->credits--;
   2156
   2157	dma_sync_single_for_cpu(&adap->pdev->dev,
   2158				dma_unmap_addr(sd, dma_addr),
   2159				fl->buf_size - SGE_PG_RSVD, DMA_FROM_DEVICE);
   2160
   2161	(*sd->pg_chunk.p_cnt)--;
   2162	if (!*sd->pg_chunk.p_cnt && sd->pg_chunk.page != fl->pg_chunk.page)
   2163		dma_unmap_page(&adap->pdev->dev, sd->pg_chunk.mapping,
   2164			       fl->alloc_size, DMA_FROM_DEVICE);
   2165
   2166	if (!skb) {
   2167		put_page(sd->pg_chunk.page);
   2168		if (complete)
   2169			qs->nomem = 0;
   2170		return;
   2171	}
   2172
   2173	rx_frag = skb_shinfo(skb)->frags;
   2174	nr_frags = skb_shinfo(skb)->nr_frags;
   2175
   2176	if (!nr_frags) {
   2177		offset = 2 + sizeof(struct cpl_rx_pkt);
   2178		cpl = qs->lro_va = sd->pg_chunk.va + 2;
   2179
   2180		if ((qs->netdev->features & NETIF_F_RXCSUM) &&
   2181		     cpl->csum_valid && cpl->csum == htons(0xffff)) {
   2182			skb->ip_summed = CHECKSUM_UNNECESSARY;
   2183			qs->port_stats[SGE_PSTAT_RX_CSUM_GOOD]++;
   2184		} else
   2185			skb->ip_summed = CHECKSUM_NONE;
   2186	} else
   2187		cpl = qs->lro_va;
   2188
   2189	len -= offset;
   2190
   2191	rx_frag += nr_frags;
   2192	__skb_frag_set_page(rx_frag, sd->pg_chunk.page);
   2193	skb_frag_off_set(rx_frag, sd->pg_chunk.offset + offset);
   2194	skb_frag_size_set(rx_frag, len);
   2195
   2196	skb->len += len;
   2197	skb->data_len += len;
   2198	skb->truesize += len;
   2199	skb_shinfo(skb)->nr_frags++;
   2200
   2201	if (!complete)
   2202		return;
   2203
   2204	skb_record_rx_queue(skb, qs - &adap->sge.qs[pi->first_qset]);
   2205
   2206	if (cpl->vlan_valid) {
   2207		qs->port_stats[SGE_PSTAT_VLANEX]++;
   2208		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), ntohs(cpl->vlan));
   2209	}
   2210	napi_gro_frags(&qs->napi);
   2211}
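
/*
 * Note: sd->pg_chunk.p_cnt counts the outstanding chunks that still reference
 * the underlying page; the page is DMA-unmapped only once that count reaches
 * zero and the page is no longer the free list's current allocation page.
 * The 2-byte offset skipped ahead of the CPL header corresponds to the
 * alignment padding described for rx_eth().
 */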
   2212
   2213/**
   2214 *	handle_rsp_cntrl_info - handles control information in a response
   2215 *	@qs: the queue set corresponding to the response
   2216 *	@flags: the response control flags
   2217 *
   2218 *	Handles the control information of an SGE response, such as GTS
   2219 *	indications and completion credits for the queue set's Tx queues.
    2220 *	HW coalesces credits; we don't do any extra SW coalescing.
   2221 */
   2222static inline void handle_rsp_cntrl_info(struct sge_qset *qs, u32 flags)
   2223{
   2224	unsigned int credits;
   2225
   2226#if USE_GTS
   2227	if (flags & F_RSPD_TXQ0_GTS)
   2228		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_ETH].flags);
   2229#endif
   2230
   2231	credits = G_RSPD_TXQ0_CR(flags);
   2232	if (credits)
   2233		qs->txq[TXQ_ETH].processed += credits;
   2234
   2235	credits = G_RSPD_TXQ2_CR(flags);
   2236	if (credits)
   2237		qs->txq[TXQ_CTRL].processed += credits;
   2238
   2239# if USE_GTS
   2240	if (flags & F_RSPD_TXQ1_GTS)
   2241		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_OFLD].flags);
   2242# endif
   2243	credits = G_RSPD_TXQ1_CR(flags);
   2244	if (credits)
   2245		qs->txq[TXQ_OFLD].processed += credits;
   2246}
   2247
   2248/**
   2249 *	check_ring_db - check if we need to ring any doorbells
   2250 *	@adap: the adapter
   2251 *	@qs: the queue set whose Tx queues are to be examined
   2252 *	@sleeping: indicates which Tx queue sent GTS
   2253 *
   2254 *	Checks if some of a queue set's Tx queues need to ring their doorbells
   2255 *	to resume transmission after idling while they still have unprocessed
   2256 *	descriptors.
   2257 */
   2258static void check_ring_db(struct adapter *adap, struct sge_qset *qs,
   2259			  unsigned int sleeping)
   2260{
   2261	if (sleeping & F_RSPD_TXQ0_GTS) {
   2262		struct sge_txq *txq = &qs->txq[TXQ_ETH];
   2263
   2264		if (txq->cleaned + txq->in_use != txq->processed &&
   2265		    !test_and_set_bit(TXQ_LAST_PKT_DB, &txq->flags)) {
   2266			set_bit(TXQ_RUNNING, &txq->flags);
   2267			t3_write_reg(adap, A_SG_KDOORBELL, F_SELEGRCNTX |
   2268				     V_EGRCNTX(txq->cntxt_id));
   2269		}
   2270	}
   2271
   2272	if (sleeping & F_RSPD_TXQ1_GTS) {
   2273		struct sge_txq *txq = &qs->txq[TXQ_OFLD];
   2274
   2275		if (txq->cleaned + txq->in_use != txq->processed &&
   2276		    !test_and_set_bit(TXQ_LAST_PKT_DB, &txq->flags)) {
   2277			set_bit(TXQ_RUNNING, &txq->flags);
   2278			t3_write_reg(adap, A_SG_KDOORBELL, F_SELEGRCNTX |
   2279				     V_EGRCNTX(txq->cntxt_id));
   2280		}
   2281	}
   2282}
   2283
   2284/**
   2285 *	is_new_response - check if a response is newly written
   2286 *	@r: the response descriptor
   2287 *	@q: the response queue
   2288 *
   2289 *	Returns true if a response descriptor contains a yet unprocessed
   2290 *	response.
   2291 */
   2292static inline int is_new_response(const struct rsp_desc *r,
   2293				  const struct sge_rspq *q)
   2294{
   2295	return (r->intr_gen & F_RSPD_GEN2) == q->gen;
   2296}
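
/*
 * Note: the driver toggles q->gen each time cidx wraps (see
 * process_responses()), so comparing the descriptor's generation bit against
 * q->gen distinguishes freshly written responses from stale entries left
 * over from the previous pass around the ring.
 */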
   2297
   2298static inline void clear_rspq_bufstate(struct sge_rspq * const q)
   2299{
   2300	q->pg_skb = NULL;
   2301	q->rx_recycle_buf = 0;
   2302}
   2303
   2304#define RSPD_GTS_MASK  (F_RSPD_TXQ0_GTS | F_RSPD_TXQ1_GTS)
   2305#define RSPD_CTRL_MASK (RSPD_GTS_MASK | \
   2306			V_RSPD_TXQ0_CR(M_RSPD_TXQ0_CR) | \
   2307			V_RSPD_TXQ1_CR(M_RSPD_TXQ1_CR) | \
   2308			V_RSPD_TXQ2_CR(M_RSPD_TXQ2_CR))
   2309
   2310/* How long to delay the next interrupt in case of memory shortage, in 0.1us. */
   2311#define NOMEM_INTR_DELAY 2500
   2312
   2313/**
   2314 *	process_responses - process responses from an SGE response queue
   2315 *	@adap: the adapter
   2316 *	@qs: the queue set to which the response queue belongs
   2317 *	@budget: how many responses can be processed in this round
   2318 *
   2319 *	Process responses from an SGE response queue up to the supplied budget.
   2320 *	Responses include received packets as well as credits and other events
   2321 *	for the queues that belong to the response queue's queue set.
   2322 *	A negative budget is effectively unlimited.
   2323 *
   2324 *	Additionally choose the interrupt holdoff time for the next interrupt
   2325 *	on this queue.  If the system is under memory shortage use a fairly
   2326 *	long delay to help recovery.
   2327 */
   2328static int process_responses(struct adapter *adap, struct sge_qset *qs,
   2329			     int budget)
   2330{
   2331	struct sge_rspq *q = &qs->rspq;
   2332	struct rsp_desc *r = &q->desc[q->cidx];
   2333	int budget_left = budget;
   2334	unsigned int sleeping = 0;
   2335	struct sk_buff *offload_skbs[RX_BUNDLE_SIZE];
   2336	int ngathered = 0;
   2337
   2338	q->next_holdoff = q->holdoff_tmr;
   2339
   2340	while (likely(budget_left && is_new_response(r, q))) {
   2341		int packet_complete, eth, ethpad = 2;
   2342		int lro = !!(qs->netdev->features & NETIF_F_GRO);
   2343		struct sk_buff *skb = NULL;
   2344		u32 len, flags;
   2345		__be32 rss_hi, rss_lo;
   2346
   2347		dma_rmb();
   2348		eth = r->rss_hdr.opcode == CPL_RX_PKT;
   2349		rss_hi = *(const __be32 *)r;
   2350		rss_lo = r->rss_hdr.rss_hash_val;
   2351		flags = ntohl(r->flags);
   2352
   2353		if (unlikely(flags & F_RSPD_ASYNC_NOTIF)) {
   2354			skb = alloc_skb(AN_PKT_SIZE, GFP_ATOMIC);
   2355			if (!skb)
   2356				goto no_mem;
   2357
   2358			__skb_put_data(skb, r, AN_PKT_SIZE);
   2359			skb->data[0] = CPL_ASYNC_NOTIF;
   2360			rss_hi = htonl(CPL_ASYNC_NOTIF << 24);
   2361			q->async_notif++;
   2362		} else if (flags & F_RSPD_IMM_DATA_VALID) {
   2363			skb = get_imm_packet(r);
   2364			if (unlikely(!skb)) {
   2365no_mem:
   2366				q->next_holdoff = NOMEM_INTR_DELAY;
   2367				q->nomem++;
   2368				/* consume one credit since we tried */
   2369				budget_left--;
   2370				break;
   2371			}
   2372			q->imm_data++;
   2373			ethpad = 0;
   2374		} else if ((len = ntohl(r->len_cq)) != 0) {
   2375			struct sge_fl *fl;
   2376
   2377			lro &= eth && is_eth_tcp(rss_hi);
   2378
   2379			fl = (len & F_RSPD_FLQ) ? &qs->fl[1] : &qs->fl[0];
   2380			if (fl->use_pages) {
   2381				void *addr = fl->sdesc[fl->cidx].pg_chunk.va;
   2382
   2383				net_prefetch(addr);
   2384				__refill_fl(adap, fl);
   2385				if (lro > 0) {
   2386					lro_add_page(adap, qs, fl,
   2387						     G_RSPD_LEN(len),
   2388						     flags & F_RSPD_EOP);
   2389					goto next_fl;
   2390				}
   2391
   2392				skb = get_packet_pg(adap, fl, q,
   2393						    G_RSPD_LEN(len),
   2394						    eth ?
   2395						    SGE_RX_DROP_THRES : 0);
   2396				q->pg_skb = skb;
   2397			} else
   2398				skb = get_packet(adap, fl, G_RSPD_LEN(len),
   2399						 eth ? SGE_RX_DROP_THRES : 0);
   2400			if (unlikely(!skb)) {
   2401				if (!eth)
   2402					goto no_mem;
   2403				q->rx_drops++;
   2404			} else if (unlikely(r->rss_hdr.opcode == CPL_TRACE_PKT))
   2405				__skb_pull(skb, 2);
   2406next_fl:
   2407			if (++fl->cidx == fl->size)
   2408				fl->cidx = 0;
   2409		} else
   2410			q->pure_rsps++;
   2411
   2412		if (flags & RSPD_CTRL_MASK) {
   2413			sleeping |= flags & RSPD_GTS_MASK;
   2414			handle_rsp_cntrl_info(qs, flags);
   2415		}
   2416
   2417		r++;
   2418		if (unlikely(++q->cidx == q->size)) {
   2419			q->cidx = 0;
   2420			q->gen ^= 1;
   2421			r = q->desc;
   2422		}
   2423		prefetch(r);
   2424
   2425		if (++q->credits >= (q->size / 4)) {
   2426			refill_rspq(adap, q, q->credits);
   2427			q->credits = 0;
   2428		}
   2429
   2430		packet_complete = flags &
   2431				  (F_RSPD_EOP | F_RSPD_IMM_DATA_VALID |
   2432				   F_RSPD_ASYNC_NOTIF);
   2433
   2434		if (skb != NULL && packet_complete) {
   2435			if (eth)
   2436				rx_eth(adap, q, skb, ethpad, lro);
   2437			else {
   2438				q->offload_pkts++;
   2439				/* Preserve the RSS info in csum & priority */
   2440				skb->csum = rss_hi;
   2441				skb->priority = rss_lo;
   2442				ngathered = rx_offload(&adap->tdev, q, skb,
   2443						       offload_skbs,
   2444						       ngathered);
   2445			}
   2446
   2447			if (flags & F_RSPD_EOP)
   2448				clear_rspq_bufstate(q);
   2449		}
   2450		--budget_left;
   2451	}
   2452
   2453	deliver_partial_bundle(&adap->tdev, q, offload_skbs, ngathered);
   2454
   2455	if (sleeping)
   2456		check_ring_db(adap, qs, sleeping);
   2457
   2458	smp_mb();		/* commit Tx queue .processed updates */
   2459	if (unlikely(qs->txq_stopped != 0))
   2460		restart_tx(qs);
   2461
   2462	budget -= budget_left;
   2463	return budget;
   2464}
   2465
   2466static inline int is_pure_response(const struct rsp_desc *r)
   2467{
   2468	__be32 n = r->flags & htonl(F_RSPD_ASYNC_NOTIF | F_RSPD_IMM_DATA_VALID);
   2469
   2470	return (n | r->len_cq) == 0;
   2471}
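
/*
 * Note: a pure response carries no async notification, no immediate data and
 * a zero len_cq, i.e. it only returns credits and GTS state and never
 * delivers a packet, which is why process_pure_responses() can handle it
 * entirely in hard interrupt context.
 */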
   2472
   2473/**
   2474 *	napi_rx_handler - the NAPI handler for Rx processing
   2475 *	@napi: the napi instance
   2476 *	@budget: how many packets we can process in this round
   2477 *
   2478 *	Handler for new data events when using NAPI.
   2479 */
   2480static int napi_rx_handler(struct napi_struct *napi, int budget)
   2481{
   2482	struct sge_qset *qs = container_of(napi, struct sge_qset, napi);
   2483	struct adapter *adap = qs->adap;
   2484	int work_done = process_responses(adap, qs, budget);
   2485
   2486	if (likely(work_done < budget)) {
   2487		napi_complete_done(napi, work_done);
   2488
   2489		/*
   2490		 * Because we don't atomically flush the following
   2491		 * write it is possible that in very rare cases it can
   2492		 * reach the device in a way that races with a new
   2493		 * response being written plus an error interrupt
   2494		 * causing the NAPI interrupt handler below to return
   2495		 * unhandled status to the OS.  To protect against
   2496		 * this would require flushing the write and doing
   2497		 * both the write and the flush with interrupts off.
   2498		 * Way too expensive and unjustifiable given the
   2499		 * rarity of the race.
   2500		 *
   2501		 * The race cannot happen at all with MSI-X.
   2502		 */
   2503		t3_write_reg(adap, A_SG_GTS, V_RSPQ(qs->rspq.cntxt_id) |
   2504			     V_NEWTIMER(qs->rspq.next_holdoff) |
   2505			     V_NEWINDEX(qs->rspq.cidx));
   2506	}
   2507	return work_done;
   2508}
   2509
   2510/*
   2511 * Returns true if the device is already scheduled for polling.
   2512 */
   2513static inline int napi_is_scheduled(struct napi_struct *napi)
   2514{
   2515	return test_bit(NAPI_STATE_SCHED, &napi->state);
   2516}
   2517
   2518/**
   2519 *	process_pure_responses - process pure responses from a response queue
   2520 *	@adap: the adapter
   2521 *	@qs: the queue set owning the response queue
   2522 *	@r: the first pure response to process
   2523 *
   2524 *	A simpler version of process_responses() that handles only pure (i.e.,
    2525 *	non data-carrying) responses.  Such responses are too light-weight to
   2526 *	justify calling a softirq under NAPI, so we handle them specially in
   2527 *	the interrupt handler.  The function is called with a pointer to a
   2528 *	response, which the caller must ensure is a valid pure response.
   2529 *
   2530 *	Returns 1 if it encounters a valid data-carrying response, 0 otherwise.
   2531 */
   2532static int process_pure_responses(struct adapter *adap, struct sge_qset *qs,
   2533				  struct rsp_desc *r)
   2534{
   2535	struct sge_rspq *q = &qs->rspq;
   2536	unsigned int sleeping = 0;
   2537
   2538	do {
   2539		u32 flags = ntohl(r->flags);
   2540
   2541		r++;
   2542		if (unlikely(++q->cidx == q->size)) {
   2543			q->cidx = 0;
   2544			q->gen ^= 1;
   2545			r = q->desc;
   2546		}
   2547		prefetch(r);
   2548
   2549		if (flags & RSPD_CTRL_MASK) {
   2550			sleeping |= flags & RSPD_GTS_MASK;
   2551			handle_rsp_cntrl_info(qs, flags);
   2552		}
   2553
   2554		q->pure_rsps++;
   2555		if (++q->credits >= (q->size / 4)) {
   2556			refill_rspq(adap, q, q->credits);
   2557			q->credits = 0;
   2558		}
   2559		if (!is_new_response(r, q))
   2560			break;
   2561		dma_rmb();
   2562	} while (is_pure_response(r));
   2563
   2564	if (sleeping)
   2565		check_ring_db(adap, qs, sleeping);
   2566
   2567	smp_mb();		/* commit Tx queue .processed updates */
   2568	if (unlikely(qs->txq_stopped != 0))
   2569		restart_tx(qs);
   2570
   2571	return is_new_response(r, q);
   2572}
   2573
   2574/**
   2575 *	handle_responses - decide what to do with new responses in NAPI mode
   2576 *	@adap: the adapter
   2577 *	@q: the response queue
   2578 *
   2579 *	This is used by the NAPI interrupt handlers to decide what to do with
   2580 *	new SGE responses.  If there are no new responses it returns -1.  If
   2581 *	there are new responses and they are pure (i.e., non-data carrying)
   2582 *	it handles them straight in hard interrupt context as they are very
   2583 *	cheap and don't deliver any packets.  Finally, if there are any data
   2584 *	signaling responses it schedules the NAPI handler.  Returns 1 if it
   2585 *	schedules NAPI, 0 if all new responses were pure.
   2586 *
   2587 *	The caller must ascertain NAPI is not already running.
   2588 */
   2589static inline int handle_responses(struct adapter *adap, struct sge_rspq *q)
   2590{
   2591	struct sge_qset *qs = rspq_to_qset(q);
   2592	struct rsp_desc *r = &q->desc[q->cidx];
   2593
   2594	if (!is_new_response(r, q))
   2595		return -1;
   2596	dma_rmb();
   2597	if (is_pure_response(r) && process_pure_responses(adap, qs, r) == 0) {
   2598		t3_write_reg(adap, A_SG_GTS, V_RSPQ(q->cntxt_id) |
   2599			     V_NEWTIMER(q->holdoff_tmr) | V_NEWINDEX(q->cidx));
   2600		return 0;
   2601	}
   2602	napi_schedule(&qs->napi);
   2603	return 1;
   2604}
   2605
   2606/*
   2607 * The MSI-X interrupt handler for an SGE response queue for the non-NAPI case
   2608 * (i.e., response queue serviced in hard interrupt).
   2609 */
   2610static irqreturn_t t3_sge_intr_msix(int irq, void *cookie)
   2611{
   2612	struct sge_qset *qs = cookie;
   2613	struct adapter *adap = qs->adap;
   2614	struct sge_rspq *q = &qs->rspq;
   2615
   2616	spin_lock(&q->lock);
   2617	if (process_responses(adap, qs, -1) == 0)
   2618		q->unhandled_irqs++;
   2619	t3_write_reg(adap, A_SG_GTS, V_RSPQ(q->cntxt_id) |
   2620		     V_NEWTIMER(q->next_holdoff) | V_NEWINDEX(q->cidx));
   2621	spin_unlock(&q->lock);
   2622	return IRQ_HANDLED;
   2623}
   2624
   2625/*
   2626 * The MSI-X interrupt handler for an SGE response queue for the NAPI case
   2627 * (i.e., response queue serviced by NAPI polling).
   2628 */
   2629static irqreturn_t t3_sge_intr_msix_napi(int irq, void *cookie)
   2630{
   2631	struct sge_qset *qs = cookie;
   2632	struct sge_rspq *q = &qs->rspq;
   2633
   2634	spin_lock(&q->lock);
   2635
   2636	if (handle_responses(qs->adap, q) < 0)
   2637		q->unhandled_irqs++;
   2638	spin_unlock(&q->lock);
   2639	return IRQ_HANDLED;
   2640}
   2641
   2642/*
   2643 * The non-NAPI MSI interrupt handler.  This needs to handle data events from
   2644 * SGE response queues as well as error and other async events as they all use
   2645 * the same MSI vector.  We use one SGE response queue per port in this mode
   2646 * and protect all response queues with queue 0's lock.
   2647 */
   2648static irqreturn_t t3_intr_msi(int irq, void *cookie)
   2649{
   2650	int new_packets = 0;
   2651	struct adapter *adap = cookie;
   2652	struct sge_rspq *q = &adap->sge.qs[0].rspq;
   2653
   2654	spin_lock(&q->lock);
   2655
   2656	if (process_responses(adap, &adap->sge.qs[0], -1)) {
   2657		t3_write_reg(adap, A_SG_GTS, V_RSPQ(q->cntxt_id) |
   2658			     V_NEWTIMER(q->next_holdoff) | V_NEWINDEX(q->cidx));
   2659		new_packets = 1;
   2660	}
   2661
   2662	if (adap->params.nports == 2 &&
   2663	    process_responses(adap, &adap->sge.qs[1], -1)) {
   2664		struct sge_rspq *q1 = &adap->sge.qs[1].rspq;
   2665
   2666		t3_write_reg(adap, A_SG_GTS, V_RSPQ(q1->cntxt_id) |
   2667			     V_NEWTIMER(q1->next_holdoff) |
   2668			     V_NEWINDEX(q1->cidx));
   2669		new_packets = 1;
   2670	}
   2671
   2672	if (!new_packets && t3_slow_intr_handler(adap) == 0)
   2673		q->unhandled_irqs++;
   2674
   2675	spin_unlock(&q->lock);
   2676	return IRQ_HANDLED;
   2677}
   2678
   2679static int rspq_check_napi(struct sge_qset *qs)
   2680{
   2681	struct sge_rspq *q = &qs->rspq;
   2682
   2683	if (!napi_is_scheduled(&qs->napi) &&
   2684	    is_new_response(&q->desc[q->cidx], q)) {
   2685		napi_schedule(&qs->napi);
   2686		return 1;
   2687	}
   2688	return 0;
   2689}
   2690
   2691/*
   2692 * The MSI interrupt handler for the NAPI case (i.e., response queues serviced
   2693 * by NAPI polling).  Handles data events from SGE response queues as well as
   2694 * error and other async events as they all use the same MSI vector.  We use
   2695 * one SGE response queue per port in this mode and protect all response
   2696 * queues with queue 0's lock.
   2697 */
   2698static irqreturn_t t3_intr_msi_napi(int irq, void *cookie)
   2699{
   2700	int new_packets;
   2701	struct adapter *adap = cookie;
   2702	struct sge_rspq *q = &adap->sge.qs[0].rspq;
   2703
   2704	spin_lock(&q->lock);
   2705
   2706	new_packets = rspq_check_napi(&adap->sge.qs[0]);
   2707	if (adap->params.nports == 2)
   2708		new_packets += rspq_check_napi(&adap->sge.qs[1]);
   2709	if (!new_packets && t3_slow_intr_handler(adap) == 0)
   2710		q->unhandled_irqs++;
   2711
   2712	spin_unlock(&q->lock);
   2713	return IRQ_HANDLED;
   2714}
   2715
   2716/*
   2717 * A helper function that processes responses and issues GTS.
   2718 */
   2719static inline int process_responses_gts(struct adapter *adap,
   2720					struct sge_rspq *rq)
   2721{
   2722	int work;
   2723
   2724	work = process_responses(adap, rspq_to_qset(rq), -1);
   2725	t3_write_reg(adap, A_SG_GTS, V_RSPQ(rq->cntxt_id) |
   2726		     V_NEWTIMER(rq->next_holdoff) | V_NEWINDEX(rq->cidx));
   2727	return work;
   2728}
   2729
   2730/*
   2731 * The legacy INTx interrupt handler.  This needs to handle data events from
   2732 * SGE response queues as well as error and other async events as they all use
   2733 * the same interrupt pin.  We use one SGE response queue per port in this mode
   2734 * and protect all response queues with queue 0's lock.
   2735 */
   2736static irqreturn_t t3_intr(int irq, void *cookie)
   2737{
   2738	int work_done, w0, w1;
   2739	struct adapter *adap = cookie;
   2740	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
   2741	struct sge_rspq *q1 = &adap->sge.qs[1].rspq;
   2742
   2743	spin_lock(&q0->lock);
   2744
   2745	w0 = is_new_response(&q0->desc[q0->cidx], q0);
   2746	w1 = adap->params.nports == 2 &&
   2747	    is_new_response(&q1->desc[q1->cidx], q1);
   2748
   2749	if (likely(w0 | w1)) {
   2750		t3_write_reg(adap, A_PL_CLI, 0);
   2751		t3_read_reg(adap, A_PL_CLI);	/* flush */
   2752
   2753		if (likely(w0))
   2754			process_responses_gts(adap, q0);
   2755
   2756		if (w1)
   2757			process_responses_gts(adap, q1);
   2758
   2759		work_done = w0 | w1;
   2760	} else
   2761		work_done = t3_slow_intr_handler(adap);
   2762
   2763	spin_unlock(&q0->lock);
   2764	return IRQ_RETVAL(work_done != 0);
   2765}
   2766
   2767/*
   2768 * Interrupt handler for legacy INTx interrupts for T3B-based cards.
   2769 * Handles data events from SGE response queues as well as error and other
   2770 * async events as they all use the same interrupt pin.  We use one SGE
   2771 * response queue per port in this mode and protect all response queues with
   2772 * queue 0's lock.
   2773 */
   2774static irqreturn_t t3b_intr(int irq, void *cookie)
   2775{
   2776	u32 map;
   2777	struct adapter *adap = cookie;
   2778	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
   2779
   2780	t3_write_reg(adap, A_PL_CLI, 0);
   2781	map = t3_read_reg(adap, A_SG_DATA_INTR);
   2782
   2783	if (unlikely(!map))	/* shared interrupt, most likely */
   2784		return IRQ_NONE;
   2785
   2786	spin_lock(&q0->lock);
   2787
   2788	if (unlikely(map & F_ERRINTR))
   2789		t3_slow_intr_handler(adap);
   2790
   2791	if (likely(map & 1))
   2792		process_responses_gts(adap, q0);
   2793
   2794	if (map & 2)
   2795		process_responses_gts(adap, &adap->sge.qs[1].rspq);
   2796
   2797	spin_unlock(&q0->lock);
   2798	return IRQ_HANDLED;
   2799}
   2800
   2801/*
   2802 * NAPI interrupt handler for legacy INTx interrupts for T3B-based cards.
   2803 * Handles data events from SGE response queues as well as error and other
   2804 * async events as they all use the same interrupt pin.  We use one SGE
   2805 * response queue per port in this mode and protect all response queues with
   2806 * queue 0's lock.
   2807 */
   2808static irqreturn_t t3b_intr_napi(int irq, void *cookie)
   2809{
   2810	u32 map;
   2811	struct adapter *adap = cookie;
   2812	struct sge_qset *qs0 = &adap->sge.qs[0];
   2813	struct sge_rspq *q0 = &qs0->rspq;
   2814
   2815	t3_write_reg(adap, A_PL_CLI, 0);
   2816	map = t3_read_reg(adap, A_SG_DATA_INTR);
   2817
   2818	if (unlikely(!map))	/* shared interrupt, most likely */
   2819		return IRQ_NONE;
   2820
   2821	spin_lock(&q0->lock);
   2822
   2823	if (unlikely(map & F_ERRINTR))
   2824		t3_slow_intr_handler(adap);
   2825
   2826	if (likely(map & 1))
   2827		napi_schedule(&qs0->napi);
   2828
   2829	if (map & 2)
   2830		napi_schedule(&adap->sge.qs[1].napi);
   2831
   2832	spin_unlock(&q0->lock);
   2833	return IRQ_HANDLED;
   2834}
   2835
   2836/**
   2837 *	t3_intr_handler - select the top-level interrupt handler
   2838 *	@adap: the adapter
   2839 *	@polling: whether using NAPI to service response queues
   2840 *
   2841 *	Selects the top-level interrupt handler based on the type of interrupts
   2842 *	(MSI-X, MSI, or legacy) and whether NAPI will be used to service the
   2843 *	response queues.
   2844 */
   2845irq_handler_t t3_intr_handler(struct adapter *adap, int polling)
   2846{
   2847	if (adap->flags & USING_MSIX)
   2848		return polling ? t3_sge_intr_msix_napi : t3_sge_intr_msix;
   2849	if (adap->flags & USING_MSI)
   2850		return polling ? t3_intr_msi_napi : t3_intr_msi;
   2851	if (adap->params.rev > 0)
   2852		return polling ? t3b_intr_napi : t3b_intr;
   2853	return t3_intr;
   2854}
   2855
   2856#define SGE_PARERR (F_CPPARITYERROR | F_OCPARITYERROR | F_RCPARITYERROR | \
   2857		    F_IRPARITYERROR | V_ITPARITYERROR(M_ITPARITYERROR) | \
   2858		    V_FLPARITYERROR(M_FLPARITYERROR) | F_LODRBPARITYERROR | \
   2859		    F_HIDRBPARITYERROR | F_LORCQPARITYERROR | \
   2860		    F_HIRCQPARITYERROR)
   2861#define SGE_FRAMINGERR (F_UC_REQ_FRAMINGERROR | F_R_REQ_FRAMINGERROR)
   2862#define SGE_FATALERR (SGE_PARERR | SGE_FRAMINGERR | F_RSPQCREDITOVERFOW | \
   2863		      F_RSPQDISABLED)
   2864
   2865/**
   2866 *	t3_sge_err_intr_handler - SGE async event interrupt handler
   2867 *	@adapter: the adapter
   2868 *
   2869 *	Interrupt handler for SGE asynchronous (non-data) events.
   2870 */
   2871void t3_sge_err_intr_handler(struct adapter *adapter)
   2872{
   2873	unsigned int v, status = t3_read_reg(adapter, A_SG_INT_CAUSE) &
   2874				 ~F_FLEMPTY;
   2875
   2876	if (status & SGE_PARERR)
   2877		CH_ALERT(adapter, "SGE parity error (0x%x)\n",
   2878			 status & SGE_PARERR);
   2879	if (status & SGE_FRAMINGERR)
   2880		CH_ALERT(adapter, "SGE framing error (0x%x)\n",
   2881			 status & SGE_FRAMINGERR);
   2882
   2883	if (status & F_RSPQCREDITOVERFOW)
   2884		CH_ALERT(adapter, "SGE response queue credit overflow\n");
   2885
   2886	if (status & F_RSPQDISABLED) {
   2887		v = t3_read_reg(adapter, A_SG_RSPQ_FL_STATUS);
   2888
   2889		CH_ALERT(adapter,
   2890			 "packet delivered to disabled response queue "
   2891			 "(0x%x)\n", (v >> S_RSPQ0DISABLED) & 0xff);
   2892	}
   2893
   2894	if (status & (F_HIPIODRBDROPERR | F_LOPIODRBDROPERR))
   2895		queue_work(cxgb3_wq, &adapter->db_drop_task);
   2896
   2897	if (status & (F_HIPRIORITYDBFULL | F_LOPRIORITYDBFULL))
   2898		queue_work(cxgb3_wq, &adapter->db_full_task);
   2899
   2900	if (status & (F_HIPRIORITYDBEMPTY | F_LOPRIORITYDBEMPTY))
   2901		queue_work(cxgb3_wq, &adapter->db_empty_task);
   2902
   2903	t3_write_reg(adapter, A_SG_INT_CAUSE, status);
   2904	if (status &  SGE_FATALERR)
   2905		t3_fatal_err(adapter);
   2906}
   2907
   2908/**
   2909 *	sge_timer_tx - perform periodic maintenance of an SGE qset
   2910 *	@t: a timer list containing the SGE queue set to maintain
   2911 *
   2912 *	Runs periodically from a timer to perform maintenance of an SGE queue
   2913 *	set.  It performs two tasks:
   2914 *
   2915 *	Cleans up any completed Tx descriptors that may still be pending.
   2916 *	Normal descriptor cleanup happens when new packets are added to a Tx
   2917 *	queue so this timer is relatively infrequent and does any cleanup only
   2918 *	if the Tx queue has not seen any new packets in a while.  We make a
   2919 *	best effort attempt to reclaim descriptors, in that we don't wait
   2920 *	around if we cannot get a queue's lock (which most likely is because
   2921 *	someone else is queueing new packets and so will also handle the clean
   2922 *	up).  Since control queues use immediate data exclusively we don't
   2923 *	bother cleaning them up here.
   2924 *
   2925 */
   2926static void sge_timer_tx(struct timer_list *t)
   2927{
   2928	struct sge_qset *qs = from_timer(qs, t, tx_reclaim_timer);
   2929	struct port_info *pi = netdev_priv(qs->netdev);
   2930	struct adapter *adap = pi->adapter;
   2931	unsigned int tbd[SGE_TXQ_PER_SET] = {0, 0};
   2932	unsigned long next_period;
   2933
   2934	if (__netif_tx_trylock(qs->tx_q)) {
   2935                tbd[TXQ_ETH] = reclaim_completed_tx(adap, &qs->txq[TXQ_ETH],
   2936                                                     TX_RECLAIM_TIMER_CHUNK);
   2937		__netif_tx_unlock(qs->tx_q);
   2938	}
   2939
   2940	if (spin_trylock(&qs->txq[TXQ_OFLD].lock)) {
   2941		tbd[TXQ_OFLD] = reclaim_completed_tx(adap, &qs->txq[TXQ_OFLD],
   2942						     TX_RECLAIM_TIMER_CHUNK);
   2943		spin_unlock(&qs->txq[TXQ_OFLD].lock);
   2944	}
   2945
   2946	next_period = TX_RECLAIM_PERIOD >>
   2947                      (max(tbd[TXQ_ETH], tbd[TXQ_OFLD]) /
   2948                      TX_RECLAIM_TIMER_CHUNK);
   2949	mod_timer(&qs->tx_reclaim_timer, jiffies + next_period);
   2950}
   2951
   2952/**
   2953 *	sge_timer_rx - perform periodic maintenance of an SGE qset
   2954 *	@t: the timer list containing the SGE queue set to maintain
   2955 *
   2956 *	a) Replenishes Rx queues that have run out due to memory shortage.
   2957 *	Normally new Rx buffers are added when existing ones are consumed but
   2958 *	when out of memory a queue can become empty.  We try to add only a few
   2959 *	buffers here, the queue will be replenished fully as these new buffers
   2960 *	are used up if memory shortage has subsided.
   2961 *
   2962 *	b) Return coalesced response queue credits in case a response queue is
   2963 *	starved.
   2964 *
   2965 */
   2966static void sge_timer_rx(struct timer_list *t)
   2967{
   2968	spinlock_t *lock;
   2969	struct sge_qset *qs = from_timer(qs, t, rx_reclaim_timer);
   2970	struct port_info *pi = netdev_priv(qs->netdev);
   2971	struct adapter *adap = pi->adapter;
   2972	u32 status;
   2973
   2974	lock = adap->params.rev > 0 ?
   2975	       &qs->rspq.lock : &adap->sge.qs[0].rspq.lock;
   2976
   2977	if (!spin_trylock_irq(lock))
   2978		goto out;
   2979
   2980	if (napi_is_scheduled(&qs->napi))
   2981		goto unlock;
   2982
   2983	if (adap->params.rev < 4) {
   2984		status = t3_read_reg(adap, A_SG_RSPQ_FL_STATUS);
   2985
   2986		if (status & (1 << qs->rspq.cntxt_id)) {
   2987			qs->rspq.starved++;
   2988			if (qs->rspq.credits) {
   2989				qs->rspq.credits--;
   2990				refill_rspq(adap, &qs->rspq, 1);
   2991				qs->rspq.restarted++;
   2992				t3_write_reg(adap, A_SG_RSPQ_FL_STATUS,
   2993					     1 << qs->rspq.cntxt_id);
   2994			}
   2995		}
   2996	}
   2997
   2998	if (qs->fl[0].credits < qs->fl[0].size)
   2999		__refill_fl(adap, &qs->fl[0]);
   3000	if (qs->fl[1].credits < qs->fl[1].size)
   3001		__refill_fl(adap, &qs->fl[1]);
   3002
   3003unlock:
   3004	spin_unlock_irq(lock);
   3005out:
   3006	mod_timer(&qs->rx_reclaim_timer, jiffies + RX_RECLAIM_PERIOD);
   3007}
   3008
   3009/**
   3010 *	t3_update_qset_coalesce - update coalescing settings for a queue set
   3011 *	@qs: the SGE queue set
   3012 *	@p: new queue set parameters
   3013 *
   3014 *	Update the coalescing settings for an SGE queue set.  Nothing is done
   3015 *	if the queue set is not initialized yet.
   3016 */
   3017void t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p)
   3018{
   3019	qs->rspq.holdoff_tmr = max(p->coalesce_usecs * 10, 1U);/* can't be 0 */
   3020	qs->rspq.polling = p->polling;
   3021	qs->napi.poll = p->polling ? napi_rx_handler : ofld_poll;
   3022}
   3023
   3024/**
   3025 *	t3_sge_alloc_qset - initialize an SGE queue set
   3026 *	@adapter: the adapter
   3027 *	@id: the queue set id
   3028 *	@nports: how many Ethernet ports will be using this queue set
   3029 *	@irq_vec_idx: the IRQ vector index for response queue interrupts
   3030 *	@p: configuration parameters for this queue set
   3031 *	@ntxq: number of Tx queues for the queue set
   3032 *	@dev: net device associated with this queue set
   3033 *	@netdevq: net device TX queue associated with this queue set
   3034 *
   3035 *	Allocate resources and initialize an SGE queue set.  A queue set
   3036 *	comprises a response queue, two Rx free-buffer queues, and up to 3
   3037 *	Tx queues.  The Tx queues are assigned roles in the order Ethernet
   3038 *	queue, offload queue, and control queue.
   3039 */
   3040int t3_sge_alloc_qset(struct adapter *adapter, unsigned int id, int nports,
   3041		      int irq_vec_idx, const struct qset_params *p,
   3042		      int ntxq, struct net_device *dev,
   3043		      struct netdev_queue *netdevq)
   3044{
   3045	int i, avail, ret = -ENOMEM;
   3046	struct sge_qset *q = &adapter->sge.qs[id];
   3047
   3048	init_qset_cntxt(q, id);
   3049	timer_setup(&q->tx_reclaim_timer, sge_timer_tx, 0);
   3050	timer_setup(&q->rx_reclaim_timer, sge_timer_rx, 0);
   3051
   3052	q->fl[0].desc = alloc_ring(adapter->pdev, p->fl_size,
   3053				   sizeof(struct rx_desc),
   3054				   sizeof(struct rx_sw_desc),
   3055				   &q->fl[0].phys_addr, &q->fl[0].sdesc);
   3056	if (!q->fl[0].desc)
   3057		goto err;
   3058
   3059	q->fl[1].desc = alloc_ring(adapter->pdev, p->jumbo_size,
   3060				   sizeof(struct rx_desc),
   3061				   sizeof(struct rx_sw_desc),
   3062				   &q->fl[1].phys_addr, &q->fl[1].sdesc);
   3063	if (!q->fl[1].desc)
   3064		goto err;
   3065
   3066	q->rspq.desc = alloc_ring(adapter->pdev, p->rspq_size,
   3067				  sizeof(struct rsp_desc), 0,
   3068				  &q->rspq.phys_addr, NULL);
   3069	if (!q->rspq.desc)
   3070		goto err;
   3071
   3072	for (i = 0; i < ntxq; ++i) {
   3073		/*
   3074		 * The control queue always uses immediate data so does not
   3075		 * need to keep track of any sk_buffs.
   3076		 */
   3077		size_t sz = i == TXQ_CTRL ? 0 : sizeof(struct tx_sw_desc);
   3078
   3079		q->txq[i].desc = alloc_ring(adapter->pdev, p->txq_size[i],
   3080					    sizeof(struct tx_desc), sz,
   3081					    &q->txq[i].phys_addr,
   3082					    &q->txq[i].sdesc);
   3083		if (!q->txq[i].desc)
   3084			goto err;
   3085
   3086		q->txq[i].gen = 1;
   3087		q->txq[i].size = p->txq_size[i];
   3088		spin_lock_init(&q->txq[i].lock);
   3089		skb_queue_head_init(&q->txq[i].sendq);
   3090	}
   3091
   3092	INIT_WORK(&q->txq[TXQ_OFLD].qresume_task, restart_offloadq);
   3093	INIT_WORK(&q->txq[TXQ_CTRL].qresume_task, restart_ctrlq);
   3094
   3095	q->fl[0].gen = q->fl[1].gen = 1;
   3096	q->fl[0].size = p->fl_size;
   3097	q->fl[1].size = p->jumbo_size;
   3098
   3099	q->rspq.gen = 1;
   3100	q->rspq.size = p->rspq_size;
   3101	spin_lock_init(&q->rspq.lock);
   3102	skb_queue_head_init(&q->rspq.rx_queue);
   3103
   3104	q->txq[TXQ_ETH].stop_thres = nports *
   3105	    flits_to_desc(sgl_len(MAX_SKB_FRAGS + 1) + 3);
   3106
   3107#if FL0_PG_CHUNK_SIZE > 0
   3108	q->fl[0].buf_size = FL0_PG_CHUNK_SIZE;
   3109#else
   3110	q->fl[0].buf_size = SGE_RX_SM_BUF_SIZE + sizeof(struct cpl_rx_data);
   3111#endif
   3112#if FL1_PG_CHUNK_SIZE > 0
   3113	q->fl[1].buf_size = FL1_PG_CHUNK_SIZE;
   3114#else
   3115	q->fl[1].buf_size = is_offload(adapter) ?
   3116		(16 * 1024) - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) :
   3117		MAX_FRAME_SIZE + 2 + sizeof(struct cpl_rx_pkt);
   3118#endif
   3119
   3120	q->fl[0].use_pages = FL0_PG_CHUNK_SIZE > 0;
   3121	q->fl[1].use_pages = FL1_PG_CHUNK_SIZE > 0;
   3122	q->fl[0].order = FL0_PG_ORDER;
   3123	q->fl[1].order = FL1_PG_ORDER;
   3124	q->fl[0].alloc_size = FL0_PG_ALLOC_SIZE;
   3125	q->fl[1].alloc_size = FL1_PG_ALLOC_SIZE;
   3126
   3127	spin_lock_irq(&adapter->sge.reg_lock);
   3128
   3129	/* FL threshold comparison uses < */
   3130	ret = t3_sge_init_rspcntxt(adapter, q->rspq.cntxt_id, irq_vec_idx,
   3131				   q->rspq.phys_addr, q->rspq.size,
   3132				   q->fl[0].buf_size - SGE_PG_RSVD, 1, 0);
   3133	if (ret)
   3134		goto err_unlock;
   3135
   3136	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
   3137		ret = t3_sge_init_flcntxt(adapter, q->fl[i].cntxt_id, 0,
   3138					  q->fl[i].phys_addr, q->fl[i].size,
   3139					  q->fl[i].buf_size - SGE_PG_RSVD,
   3140					  p->cong_thres, 1, 0);
   3141		if (ret)
   3142			goto err_unlock;
   3143	}
   3144
   3145	ret = t3_sge_init_ecntxt(adapter, q->txq[TXQ_ETH].cntxt_id, USE_GTS,
   3146				 SGE_CNTXT_ETH, id, q->txq[TXQ_ETH].phys_addr,
   3147				 q->txq[TXQ_ETH].size, q->txq[TXQ_ETH].token,
   3148				 1, 0);
   3149	if (ret)
   3150		goto err_unlock;
   3151
   3152	if (ntxq > 1) {
   3153		ret = t3_sge_init_ecntxt(adapter, q->txq[TXQ_OFLD].cntxt_id,
   3154					 USE_GTS, SGE_CNTXT_OFLD, id,
   3155					 q->txq[TXQ_OFLD].phys_addr,
   3156					 q->txq[TXQ_OFLD].size, 0, 1, 0);
   3157		if (ret)
   3158			goto err_unlock;
   3159	}
   3160
   3161	if (ntxq > 2) {
   3162		ret = t3_sge_init_ecntxt(adapter, q->txq[TXQ_CTRL].cntxt_id, 0,
   3163					 SGE_CNTXT_CTRL, id,
   3164					 q->txq[TXQ_CTRL].phys_addr,
   3165					 q->txq[TXQ_CTRL].size,
   3166					 q->txq[TXQ_CTRL].token, 1, 0);
   3167		if (ret)
   3168			goto err_unlock;
   3169	}
   3170
   3171	spin_unlock_irq(&adapter->sge.reg_lock);
   3172
   3173	q->adap = adapter;
   3174	q->netdev = dev;
   3175	q->tx_q = netdevq;
   3176	t3_update_qset_coalesce(q, p);
   3177
   3178	avail = refill_fl(adapter, &q->fl[0], q->fl[0].size,
   3179			  GFP_KERNEL | __GFP_COMP);
   3180	if (!avail) {
   3181		CH_ALERT(adapter, "free list queue 0 initialization failed\n");
   3182		ret = -ENOMEM;
   3183		goto err;
   3184	}
   3185	if (avail < q->fl[0].size)
   3186		CH_WARN(adapter, "free list queue 0 enabled with %d credits\n",
   3187			avail);
   3188
   3189	avail = refill_fl(adapter, &q->fl[1], q->fl[1].size,
   3190			  GFP_KERNEL | __GFP_COMP);
   3191	if (avail < q->fl[1].size)
   3192		CH_WARN(adapter, "free list queue 1 enabled with %d credits\n",
   3193			avail);
   3194	refill_rspq(adapter, &q->rspq, q->rspq.size - 1);
   3195
   3196	t3_write_reg(adapter, A_SG_GTS, V_RSPQ(q->rspq.cntxt_id) |
   3197		     V_NEWTIMER(q->rspq.holdoff_tmr));
   3198
   3199	return 0;
   3200
   3201err_unlock:
   3202	spin_unlock_irq(&adapter->sge.reg_lock);
   3203err:
   3204	t3_free_qset(adapter, q);
   3205	return ret;
   3206}
   3207
   3208/**
    3209 *      t3_start_sge_timers - start SGE timer callbacks
   3210 *      @adap: the adapter
   3211 *
    3212 *      Starts each SGE queue set's timer callback
   3213 */
   3214void t3_start_sge_timers(struct adapter *adap)
   3215{
   3216	int i;
   3217
   3218	for (i = 0; i < SGE_QSETS; ++i) {
   3219		struct sge_qset *q = &adap->sge.qs[i];
   3220
   3221		if (q->tx_reclaim_timer.function)
   3222			mod_timer(&q->tx_reclaim_timer,
   3223				  jiffies + TX_RECLAIM_PERIOD);
   3224
   3225		if (q->rx_reclaim_timer.function)
   3226			mod_timer(&q->rx_reclaim_timer,
   3227				  jiffies + RX_RECLAIM_PERIOD);
   3228	}
   3229}
   3230
   3231/**
    3232 *	t3_stop_sge_timers - stop SGE timer callbacks
    3233 *	@adap: the adapter
    3234 *
    3235 *	Stops each SGE queue set's timer callback
   3236 */
   3237void t3_stop_sge_timers(struct adapter *adap)
   3238{
   3239	int i;
   3240
   3241	for (i = 0; i < SGE_QSETS; ++i) {
   3242		struct sge_qset *q = &adap->sge.qs[i];
   3243
   3244		if (q->tx_reclaim_timer.function)
   3245			del_timer_sync(&q->tx_reclaim_timer);
   3246		if (q->rx_reclaim_timer.function)
   3247			del_timer_sync(&q->rx_reclaim_timer);
   3248	}
   3249}
   3250
   3251/**
   3252 *	t3_free_sge_resources - free SGE resources
   3253 *	@adap: the adapter
   3254 *
   3255 *	Frees resources used by the SGE queue sets.
   3256 */
   3257void t3_free_sge_resources(struct adapter *adap)
   3258{
   3259	int i;
   3260
   3261	for (i = 0; i < SGE_QSETS; ++i)
   3262		t3_free_qset(adap, &adap->sge.qs[i]);
   3263}
   3264
   3265/**
   3266 *	t3_sge_start - enable SGE
   3267 *	@adap: the adapter
   3268 *
   3269 *	Enables the SGE for DMAs.  This is the last step in starting packet
   3270 *	transfers.
   3271 */
   3272void t3_sge_start(struct adapter *adap)
   3273{
   3274	t3_set_reg_field(adap, A_SG_CONTROL, F_GLOBALENABLE, F_GLOBALENABLE);
   3275}
   3276
   3277/**
    3278 *	t3_sge_stop_dma - disable SGE DMA engine operation
   3279 *	@adap: the adapter
   3280 *
    3281 *	Can be invoked from interrupt context, e.g. from an error handler.
    3282 *
    3283 *	Note that this function cannot cancel the restart work items as it
    3284 *	cannot wait if called from interrupt context; however, the work
    3285 *	items have no effect since the doorbells are disabled.  The driver
    3286 *	will call t3_sge_stop() later from process context, at which time
    3287 *	the work items will be cancelled if they are still pending.
    3288 */
   3289void t3_sge_stop_dma(struct adapter *adap)
   3290{
   3291	t3_set_reg_field(adap, A_SG_CONTROL, F_GLOBALENABLE, 0);
   3292}
   3293
   3294/**
    3295 *	t3_sge_stop - disable SGE operation completely
    3296 *	@adap: the adapter
    3297 *
    3298 *	Called from process context.  Disables the DMA engine and cancels
    3299 *	any pending queue restart work items.
   3300 */
   3301void t3_sge_stop(struct adapter *adap)
   3302{
   3303	int i;
   3304
   3305	t3_sge_stop_dma(adap);
   3306
   3307	/* workqueues aren't initialized otherwise */
   3308	if (!(adap->flags & FULL_INIT_DONE))
   3309		return;
   3310	for (i = 0; i < SGE_QSETS; ++i) {
   3311		struct sge_qset *qs = &adap->sge.qs[i];
   3312
   3313		cancel_work_sync(&qs->txq[TXQ_OFLD].qresume_task);
   3314		cancel_work_sync(&qs->txq[TXQ_CTRL].qresume_task);
   3315	}
   3316}
   3317
   3318/**
   3319 *	t3_sge_init - initialize SGE
   3320 *	@adap: the adapter
   3321 *	@p: the SGE parameters
   3322 *
    3323 *	Performs the SGE initialization needed after every chip reset.
    3324 *	We do not initialize any of the queue sets here; instead, the
    3325 *	driver top-level must request those individually.  We also do not
    3326 *	enable DMA here; that should be done after the queues are set up.
   3327 */
   3328void t3_sge_init(struct adapter *adap, struct sge_params *p)
   3329{
   3330	unsigned int ctrl, ups = ffs(pci_resource_len(adap->pdev, 2) >> 12);
   3331
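	/* Assemble the global SG_CONTROL mode bits; the user-space region
	 * size is derived from the length of PCI BAR2.
	 */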
   3332	ctrl = F_DROPPKT | V_PKTSHIFT(2) | F_FLMODE | F_AVOIDCQOVFL |
   3333	    F_CQCRDTCTRL | F_CONGMODE | F_TNLFLMODE | F_FATLPERREN |
   3334	    V_HOSTPAGESIZE(PAGE_SHIFT - 11) | F_BIGENDIANINGRESS |
   3335	    V_USERSPACESIZE(ups ? ups - 1 : 0) | F_ISCSICOALESCING;
   3336#if SGE_NUM_GENBITS == 1
   3337	ctrl |= F_EGRGENCTRL;
   3338#endif
   3339	if (adap->params.rev > 0) {
   3340		if (!(adap->flags & (USING_MSIX | USING_MSI)))
   3341			ctrl |= F_ONEINTMULTQ | F_OPTONEINTMULTQ;
   3342	}
   3343	t3_write_reg(adap, A_SG_CONTROL, ctrl);
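	/* Program the doorbell thresholds, the SGE timer tick, and the
	 * command queue credit threshold.
	 */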
   3344	t3_write_reg(adap, A_SG_EGR_RCQ_DRB_THRSH, V_HIRCQDRBTHRSH(512) |
   3345		     V_LORCQDRBTHRSH(512));
   3346	t3_write_reg(adap, A_SG_TIMER_TICK, core_ticks_per_usec(adap) / 10);
   3347	t3_write_reg(adap, A_SG_CMDQ_CREDIT_TH, V_THRESHOLD(32) |
   3348		     V_TIMEOUT(200 * core_ticks_per_usec(adap)));
   3349	t3_write_reg(adap, A_SG_HI_DRB_HI_THRSH,
   3350		     adap->params.rev < T3_REV_C ? 1000 : 500);
   3351	t3_write_reg(adap, A_SG_HI_DRB_LO_THRSH, 256);
   3352	t3_write_reg(adap, A_SG_LO_DRB_HI_THRSH, 1000);
   3353	t3_write_reg(adap, A_SG_LO_DRB_LO_THRSH, 256);
   3354	t3_write_reg(adap, A_SG_OCO_BASE, V_BASE1(0xfff));
   3355	t3_write_reg(adap, A_SG_DRB_PRI_THRESH, 63 * 1024);
   3356}
   3357
   3358/**
   3359 *	t3_sge_prep - one-time SGE initialization
   3360 *	@adap: the associated adapter
   3361 *	@p: SGE parameters
   3362 *
    3363 *	Performs one-time initialization of the SGE software state.  This
    3364 *	includes determining defaults for the assorted SGE parameters, which
    3365 *	administrators can change until they are used to initialize the SGE.
   3366 */
   3367void t3_sge_prep(struct adapter *adap, struct sge_params *p)
   3368{
   3369	int i;
   3370
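	/* Largest receive payload that fits in a 16KB buffer once the CPL
	 * header and the skb_shared_info area are accounted for.
	 */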
   3371	p->max_pkt_size = (16 * 1024) - sizeof(struct cpl_rx_data) -
   3372	    SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
   3373
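	/* Default per-queue-set parameters; these may be overridden before
	 * the queue sets are initialized.
	 */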
   3374	for (i = 0; i < SGE_QSETS; ++i) {
   3375		struct qset_params *q = p->qset + i;
   3376
   3377		q->polling = adap->params.rev > 0;
   3378		q->coalesce_usecs = 5;
   3379		q->rspq_size = 1024;
   3380		q->fl_size = 1024;
   3381		q->jumbo_size = 512;
   3382		q->txq_size[TXQ_ETH] = 1024;
   3383		q->txq_size[TXQ_OFLD] = 1024;
   3384		q->txq_size[TXQ_CTRL] = 256;
   3385		q->cong_thres = 0;
   3386	}
   3387
   3388	spin_lock_init(&adap->sge.reg_lock);
   3389}