iavf_txrx.c - cachepc-linux - Fork of AMDESE/linux with modifications for CachePC side-channel attack

	cachepc-linux Fork of AMDESE/linux with modifications for CachePC side-channel attack
	git clone https://git.sinitax.com/sinitax/cachepc-linux
	Log \| Files \| Refs \| README \| LICENSE \| sfeed.txt
iavf_txrx.c (72851B)
      1// SPDX-License-Identifier: GPL-2.0
      2/* Copyright(c) 2013 - 2018 Intel Corporation. */
      3
      4#include <linux/prefetch.h>
      5
      6#include "iavf.h"
      7#include "iavf_trace.h"
      8#include "iavf_prototype.h"
      9
     10static inline __le64 build_ctob(u32 td_cmd, u32 td_offset, unsigned int size,
     11				u32 td_tag)
     12{
     13	return cpu_to_le64(IAVF_TX_DESC_DTYPE_DATA |
     14			   ((u64)td_cmd  << IAVF_TXD_QW1_CMD_SHIFT) |
     15			   ((u64)td_offset << IAVF_TXD_QW1_OFFSET_SHIFT) |
     16			   ((u64)size  << IAVF_TXD_QW1_TX_BUF_SZ_SHIFT) |
     17			   ((u64)td_tag  << IAVF_TXD_QW1_L2TAG1_SHIFT));
     18}
     19
/* Tx descriptor command bits: the EOP flag combined with the RS flag */
#define IAVF_TXD_CMD (IAVF_TX_DESC_CMD_EOP | IAVF_TX_DESC_CMD_RS)
     21
     22/**
     23 * iavf_unmap_and_free_tx_resource - Release a Tx buffer
     24 * @ring:      the ring that owns the buffer
     25 * @tx_buffer: the buffer to free
     26 **/
     27static void iavf_unmap_and_free_tx_resource(struct iavf_ring *ring,
     28					    struct iavf_tx_buffer *tx_buffer)
     29{
     30	if (tx_buffer->skb) {
     31		if (tx_buffer->tx_flags & IAVF_TX_FLAGS_FD_SB)
     32			kfree(tx_buffer->raw_buf);
     33		else
     34			dev_kfree_skb_any(tx_buffer->skb);
     35		if (dma_unmap_len(tx_buffer, len))
     36			dma_unmap_single(ring->dev,
     37					 dma_unmap_addr(tx_buffer, dma),
     38					 dma_unmap_len(tx_buffer, len),
     39					 DMA_TO_DEVICE);
     40	} else if (dma_unmap_len(tx_buffer, len)) {
     41		dma_unmap_page(ring->dev,
     42			       dma_unmap_addr(tx_buffer, dma),
     43			       dma_unmap_len(tx_buffer, len),
     44			       DMA_TO_DEVICE);
     45	}
     46
     47	tx_buffer->next_to_watch = NULL;
     48	tx_buffer->skb = NULL;
     49	dma_unmap_len_set(tx_buffer, len, 0);
     50	/* tx_buffer must be completely set up in the transmit path */
     51}
     52
     53/**
     54 * iavf_clean_tx_ring - Free any empty Tx buffers
     55 * @tx_ring: ring to be cleaned
     56 **/
     57void iavf_clean_tx_ring(struct iavf_ring *tx_ring)
     58{
     59	unsigned long bi_size;
     60	u16 i;
     61
     62	/* ring already cleared, nothing to do */
     63	if (!tx_ring->tx_bi)
     64		return;
     65
     66	/* Free all the Tx ring sk_buffs */
     67	for (i = 0; i < tx_ring->count; i++)
     68		iavf_unmap_and_free_tx_resource(tx_ring, &tx_ring->tx_bi[i]);
     69
     70	bi_size = sizeof(struct iavf_tx_buffer) * tx_ring->count;
     71	memset(tx_ring->tx_bi, 0, bi_size);
     72
     73	/* Zero out the descriptor ring */
     74	memset(tx_ring->desc, 0, tx_ring->size);
     75
     76	tx_ring->next_to_use = 0;
     77	tx_ring->next_to_clean = 0;
     78
     79	if (!tx_ring->netdev)
     80		return;
     81
     82	/* cleanup Tx queue statistics */
     83	netdev_tx_reset_queue(txring_txq(tx_ring));
     84}
     85
     86/**
     87 * iavf_free_tx_resources - Free Tx resources per queue
     88 * @tx_ring: Tx descriptor ring for a specific queue
     89 *
     90 * Free all transmit software resources
     91 **/
     92void iavf_free_tx_resources(struct iavf_ring *tx_ring)
     93{
     94	iavf_clean_tx_ring(tx_ring);
     95	kfree(tx_ring->tx_bi);
     96	tx_ring->tx_bi = NULL;
     97
     98	if (tx_ring->desc) {
     99		dma_free_coherent(tx_ring->dev, tx_ring->size,
    100				  tx_ring->desc, tx_ring->dma);
    101		tx_ring->desc = NULL;
    102	}
    103}
    104
    105/**
    106 * iavf_get_tx_pending - how many Tx descriptors not processed
    107 * @ring: the ring of descriptors
    108 * @in_sw: is tx_pending being checked in SW or HW
    109 *
    110 * Since there is no access to the ring head register
    111 * in XL710, we need to use our local copies
    112 **/
    113u32 iavf_get_tx_pending(struct iavf_ring *ring, bool in_sw)
    114{
    115	u32 head, tail;
    116
    117	head = ring->next_to_clean;
    118	tail = readl(ring->tail);
    119
    120	if (head != tail)
    121		return (head < tail) ?
    122			tail - head : (tail + ring->count - head);
    123
    124	return 0;
    125}
    126
    127/**
    128 * iavf_detect_recover_hung - Function to detect and recover hung_queues
    129 * @vsi:  pointer to vsi struct with tx queues
    130 *
    131 * VSI has netdev and netdev has TX queues. This function is to check each of
    132 * those TX queues if they are hung, trigger recovery by issuing SW interrupt.
    133 **/
    134void iavf_detect_recover_hung(struct iavf_vsi *vsi)
    135{
    136	struct iavf_ring *tx_ring = NULL;
    137	struct net_device *netdev;
    138	unsigned int i;
    139	int packets;
    140
    141	if (!vsi)
    142		return;
    143
    144	if (test_bit(__IAVF_VSI_DOWN, vsi->state))
    145		return;
    146
    147	netdev = vsi->netdev;
    148	if (!netdev)
    149		return;
    150
    151	if (!netif_carrier_ok(netdev))
    152		return;
    153
    154	for (i = 0; i < vsi->back->num_active_queues; i++) {
    155		tx_ring = &vsi->back->tx_rings[i];
    156		if (tx_ring && tx_ring->desc) {
    157			/* If packet counter has not changed the queue is
    158			 * likely stalled, so force an interrupt for this
    159			 * queue.
    160			 *
    161			 * prev_pkt_ctr would be negative if there was no
    162			 * pending work.
    163			 */
    164			packets = tx_ring->stats.packets & INT_MAX;
    165			if (tx_ring->tx_stats.prev_pkt_ctr == packets) {
    166				iavf_force_wb(vsi, tx_ring->q_vector);
    167				continue;
    168			}
    169
    170			/* Memory barrier between read of packet count and call
    171			 * to iavf_get_tx_pending()
    172			 */
    173			smp_rmb();
    174			tx_ring->tx_stats.prev_pkt_ctr =
    175			  iavf_get_tx_pending(tx_ring, true) ? packets : -1;
    176		}
    177	}
    178}
    179
/* iavf_clean_tx_irq() considers arming a write-back only while the number
 * of pending descriptors is non-zero and below this value.
 */
#define WB_STRIDE 4
    181
/**
 * iavf_clean_tx_irq - Reclaim resources after transmit completes
 * @vsi: the VSI we care about
 * @tx_ring: Tx ring to clean
 * @napi_budget: Used to determine if we are in netpoll; handed unchanged
 *               to napi_consume_skb()
 *
 * Starting at next_to_clean, processes one packet per loop iteration until
 * a buffer has no next_to_watch, the watched descriptor is not marked done,
 * or vsi->work_limit packets have been handled. Each completed packet has
 * its skb released and all of its buffers unmapped. Afterwards the ring and
 * vector statistics are updated, the stack is told how much completed, and
 * a stopped subqueue is woken if enough descriptors are free.
 *
 * Returns true if there's any budget left (e.g. the clean is finished)
 **/
static bool iavf_clean_tx_irq(struct iavf_vsi *vsi,
			      struct iavf_ring *tx_ring, int napi_budget)
{
	int i = tx_ring->next_to_clean;
	struct iavf_tx_buffer *tx_buf;
	struct iavf_tx_desc *tx_desc;
	unsigned int total_bytes = 0, total_packets = 0;
	unsigned int budget = vsi->work_limit;

	tx_buf = &tx_ring->tx_bi[i];
	tx_desc = IAVF_TX_DESC(tx_ring, i);
	/* bias the index by -count so that reaching the end of the ring
	 * shows up as i == 0; the bias is removed again after the loop
	 */
	i -= tx_ring->count;

	do {
		struct iavf_tx_desc *eop_desc = tx_buf->next_to_watch;

		/* if next_to_watch is not set then there is no work pending */
		if (!eop_desc)
			break;

		/* prevent any other reads prior to eop_desc */
		smp_rmb();

		iavf_trace(clean_tx_irq, tx_ring, tx_desc, tx_buf);
		/* if the descriptor isn't done, no work yet to do */
		if (!(eop_desc->cmd_type_offset_bsz &
		      cpu_to_le64(IAVF_TX_DESC_DTYPE_DESC_DONE)))
			break;

		/* clear next_to_watch to prevent false hangs */
		tx_buf->next_to_watch = NULL;

		/* update the statistics for this packet */
		total_bytes += tx_buf->bytecount;
		total_packets += tx_buf->gso_segs;

		/* free the skb */
		napi_consume_skb(tx_buf->skb, napi_budget);

		/* unmap skb header data */
		dma_unmap_single(tx_ring->dev,
				 dma_unmap_addr(tx_buf, dma),
				 dma_unmap_len(tx_buf, len),
				 DMA_TO_DEVICE);

		/* clear tx_buffer data */
		tx_buf->skb = NULL;
		dma_unmap_len_set(tx_buf, len, 0);

		/* unmap remaining buffers */
		while (tx_desc != eop_desc) {
			iavf_trace(clean_tx_irq_unmap,
				   tx_ring, tx_desc, tx_buf);

			tx_buf++;
			tx_desc++;
			i++;
			/* i == 0 means we stepped past the last ring entry */
			if (unlikely(!i)) {
				i -= tx_ring->count;
				tx_buf = tx_ring->tx_bi;
				tx_desc = IAVF_TX_DESC(tx_ring, 0);
			}

			/* unmap any remaining paged data */
			if (dma_unmap_len(tx_buf, len)) {
				dma_unmap_page(tx_ring->dev,
					       dma_unmap_addr(tx_buf, dma),
					       dma_unmap_len(tx_buf, len),
					       DMA_TO_DEVICE);
				dma_unmap_len_set(tx_buf, len, 0);
			}
		}

		/* move us one more past the eop_desc for start of next pkt */
		tx_buf++;
		tx_desc++;
		i++;
		if (unlikely(!i)) {
			i -= tx_ring->count;
			tx_buf = tx_ring->tx_bi;
			tx_desc = IAVF_TX_DESC(tx_ring, 0);
		}

		prefetch(tx_desc);

		/* update budget accounting */
		budget--;
	} while (likely(budget));

	/* undo the -count bias to get back a real ring index */
	i += tx_ring->count;
	tx_ring->next_to_clean = i;
	u64_stats_update_begin(&tx_ring->syncp);
	tx_ring->stats.bytes += total_bytes;
	tx_ring->stats.packets += total_packets;
	u64_stats_update_end(&tx_ring->syncp);
	tx_ring->q_vector->tx.total_bytes += total_bytes;
	tx_ring->q_vector->tx.total_packets += total_packets;

	if (tx_ring->flags & IAVF_TXR_FLAGS_WB_ON_ITR) {
		/* check to see if there are < 4 descriptors
		 * waiting to be written back, then kick the hardware to force
		 * them to be written back in case we stay in NAPI.
		 * In this mode on X722 we do not enable Interrupt.
		 */
		unsigned int j = iavf_get_tx_pending(tx_ring, false);

		if (budget &&
		    ((j / WB_STRIDE) == 0) && (j > 0) &&
		    !test_bit(__IAVF_VSI_DOWN, vsi->state) &&
		    (IAVF_DESC_UNUSED(tx_ring) != tx_ring->count))
			tx_ring->arm_wb = true;
	}

	/* notify netdev of completed buffers */
	netdev_tx_completed_queue(txring_txq(tx_ring),
				  total_packets, total_bytes);

/* free-descriptor count at or above which a stopped subqueue is woken */
#define TX_WAKE_THRESHOLD ((s16)(DESC_NEEDED * 2))
	if (unlikely(total_packets && netif_carrier_ok(tx_ring->netdev) &&
		     (IAVF_DESC_UNUSED(tx_ring) >= TX_WAKE_THRESHOLD))) {
		/* Make sure that anybody stopping the queue after this
		 * sees the new next_to_clean.
		 */
		smp_mb();
		if (__netif_subqueue_stopped(tx_ring->netdev,
					     tx_ring->queue_index) &&
		   !test_bit(__IAVF_VSI_DOWN, vsi->state)) {
			netif_wake_subqueue(tx_ring->netdev,
					    tx_ring->queue_index);
			++tx_ring->tx_stats.restart_queue;
		}
	}

	/* non-zero budget means the loop stopped before exhausting it */
	return !!budget;
}
    325
    326/**
    327 * iavf_enable_wb_on_itr - Arm hardware to do a wb, interrupts are not enabled
    328 * @vsi: the VSI we care about
    329 * @q_vector: the vector on which to enable writeback
    330 *
    331 **/
    332static void iavf_enable_wb_on_itr(struct iavf_vsi *vsi,
    333				  struct iavf_q_vector *q_vector)
    334{
    335	u16 flags = q_vector->tx.ring[0].flags;
    336	u32 val;
    337
    338	if (!(flags & IAVF_TXR_FLAGS_WB_ON_ITR))
    339		return;
    340
    341	if (q_vector->arm_wb_state)
    342		return;
    343
    344	val = IAVF_VFINT_DYN_CTLN1_WB_ON_ITR_MASK |
    345	      IAVF_VFINT_DYN_CTLN1_ITR_INDX_MASK; /* set noitr */
    346
    347	wr32(&vsi->back->hw,
    348	     IAVF_VFINT_DYN_CTLN1(q_vector->reg_idx), val);
    349	q_vector->arm_wb_state = true;
    350}
    351
    352/**
    353 * iavf_force_wb - Issue SW Interrupt so HW does a wb
    354 * @vsi: the VSI we care about
    355 * @q_vector: the vector  on which to force writeback
    356 *
    357 **/
    358void iavf_force_wb(struct iavf_vsi *vsi, struct iavf_q_vector *q_vector)
    359{
    360	u32 val = IAVF_VFINT_DYN_CTLN1_INTENA_MASK |
    361		  IAVF_VFINT_DYN_CTLN1_ITR_INDX_MASK | /* set noitr */
    362		  IAVF_VFINT_DYN_CTLN1_SWINT_TRIG_MASK |
    363		  IAVF_VFINT_DYN_CTLN1_SW_ITR_INDX_ENA_MASK
    364		  /* allow 00 to be written to the index */;
    365
    366	wr32(&vsi->back->hw,
    367	     IAVF_VFINT_DYN_CTLN1(q_vector->reg_idx),
    368	     val);
    369}
    370
    371static inline bool iavf_container_is_rx(struct iavf_q_vector *q_vector,
    372					struct iavf_ring_container *rc)
    373{
    374	return &q_vector->rx == rc;
    375}
    376
/* Per-link-speed multipliers returned by iavf_mbps_itr_multiplier() and
 * iavf_virtchnl_itr_multiplier(); iavf_itr_divisor() scales them by
 * IAVF_ITR_ADAPTIVE_MIN_INC.
 */
#define IAVF_AIM_MULTIPLIER_100G	2560
#define IAVF_AIM_MULTIPLIER_50G		1280
#define IAVF_AIM_MULTIPLIER_40G		1024
#define IAVF_AIM_MULTIPLIER_20G		512
#define IAVF_AIM_MULTIPLIER_10G		256
#define IAVF_AIM_MULTIPLIER_1G		32
    383
    384static unsigned int iavf_mbps_itr_multiplier(u32 speed_mbps)
    385{
    386	switch (speed_mbps) {
    387	case SPEED_100000:
    388		return IAVF_AIM_MULTIPLIER_100G;
    389	case SPEED_50000:
    390		return IAVF_AIM_MULTIPLIER_50G;
    391	case SPEED_40000:
    392		return IAVF_AIM_MULTIPLIER_40G;
    393	case SPEED_25000:
    394	case SPEED_20000:
    395		return IAVF_AIM_MULTIPLIER_20G;
    396	case SPEED_10000:
    397	default:
    398		return IAVF_AIM_MULTIPLIER_10G;
    399	case SPEED_1000:
    400	case SPEED_100:
    401		return IAVF_AIM_MULTIPLIER_1G;
    402	}
    403}
    404
    405static unsigned int
    406iavf_virtchnl_itr_multiplier(enum virtchnl_link_speed speed_virtchnl)
    407{
    408	switch (speed_virtchnl) {
    409	case VIRTCHNL_LINK_SPEED_40GB:
    410		return IAVF_AIM_MULTIPLIER_40G;
    411	case VIRTCHNL_LINK_SPEED_25GB:
    412	case VIRTCHNL_LINK_SPEED_20GB:
    413		return IAVF_AIM_MULTIPLIER_20G;
    414	case VIRTCHNL_LINK_SPEED_10GB:
    415	default:
    416		return IAVF_AIM_MULTIPLIER_10G;
    417	case VIRTCHNL_LINK_SPEED_1GB:
    418	case VIRTCHNL_LINK_SPEED_100MB:
    419		return IAVF_AIM_MULTIPLIER_1G;
    420	}
    421}
    422
    423static unsigned int iavf_itr_divisor(struct iavf_adapter *adapter)
    424{
    425	if (ADV_LINK_SUPPORT(adapter))
    426		return IAVF_ITR_ADAPTIVE_MIN_INC *
    427			iavf_mbps_itr_multiplier(adapter->link_speed_mbps);
    428	else
    429		return IAVF_ITR_ADAPTIVE_MIN_INC *
    430			iavf_virtchnl_itr_multiplier(adapter->link_speed);
    431}
    432
/**
 * iavf_update_itr - update the dynamic ITR value based on statistics
 * @q_vector: structure containing interrupt and ring information
 * @rc: structure containing ring performance data
 *
 * Stores a new ITR value based on packets and byte
 * counts during the last interrupt.  The advantage of per interrupt
 * computation is faster updates and more accurate ITR for the current
 * traffic pattern.  Constants in this function were computed
 * based on theoretical maximum wire speed and thresholds were set based
 * on testing data as well as attempting to minimize response time
 * while increasing bulk throughput.
 *
 * Returns nothing. Unless it bails out early because @rc has no ring or the
 * ring's ITR setting is not dynamic, it writes rc->target_itr, moves
 * rc->next_update one jiffy past the current time and zeroes the
 * packet/byte totals in @rc.
 **/
static void iavf_update_itr(struct iavf_q_vector *q_vector,
			    struct iavf_ring_container *rc)
{
	unsigned int avg_wire_size, packets, bytes, itr;
	unsigned long next_update = jiffies;

	/* If we don't have any rings just leave ourselves set for maximum
	 * possible latency so we take ourselves out of the equation.
	 */
	if (!rc->ring || !ITR_IS_DYNAMIC(rc->ring->itr_setting))
		return;

	/* For Rx we want to push the delay up and default to low latency.
	 * for Tx we want to pull the delay down and default to high latency.
	 */
	itr = iavf_container_is_rx(q_vector, rc) ?
	      IAVF_ITR_ADAPTIVE_MIN_USECS | IAVF_ITR_ADAPTIVE_LATENCY :
	      IAVF_ITR_ADAPTIVE_MAX_USECS | IAVF_ITR_ADAPTIVE_LATENCY;

	/* If we didn't update within up to 1 - 2 jiffies we can assume
	 * that either packets are coming in so slow there hasn't been
	 * any work, or that there is so much work that NAPI is dealing
	 * with interrupt moderation and we don't need to do anything.
	 */
	if (time_after(next_update, rc->next_update))
		goto clear_counts;

	/* If itr_countdown is set it means we programmed an ITR within
	 * the last 4 interrupt cycles. This has a side effect of us
	 * potentially firing an early interrupt. In order to work around
	 * this we need to throw out any data received for a few
	 * interrupts following the update.
	 */
	if (q_vector->itr_countdown) {
		itr = rc->target_itr;
		goto clear_counts;
	}

	packets = rc->total_packets;
	bytes = rc->total_bytes;

	if (iavf_container_is_rx(q_vector, rc)) {
		/* If Rx there are 1 to 4 packets and bytes are less than
		 * 9000 assume insufficient data to use bulk rate limiting
		 * approach unless Tx is already in bulk rate limiting. We
		 * are likely latency driven.
		 */
		if (packets && packets < 4 && bytes < 9000 &&
		    (q_vector->tx.target_itr & IAVF_ITR_ADAPTIVE_LATENCY)) {
			itr = IAVF_ITR_ADAPTIVE_LATENCY;
			goto adjust_by_size;
		}
	} else if (packets < 4) {
		/* If we have Tx and Rx ITR maxed and Tx ITR is running in
		 * bulk mode and we are receiving 4 or fewer packets just
		 * reset the ITR_ADAPTIVE_LATENCY bit for latency mode so
		 * that the Rx can relax.
		 */
		if (rc->target_itr == IAVF_ITR_ADAPTIVE_MAX_USECS &&
		    (q_vector->rx.target_itr & IAVF_ITR_MASK) ==
		     IAVF_ITR_ADAPTIVE_MAX_USECS)
			goto clear_counts;
	} else if (packets > 32) {
		/* If we have processed over 32 packets in a single interrupt
		 * for Tx assume we need to switch over to "bulk" mode.
		 */
		rc->target_itr &= ~IAVF_ITR_ADAPTIVE_LATENCY;
	}

	/* We have no packets to actually measure against. This means
	 * either one of the other queues on this vector is active or
	 * we are a Tx queue doing TSO with too high of an interrupt rate.
	 *
	 * Between 4 and 56 we can assume that our current interrupt delay
	 * is only slightly too low. As such we should increase it by a small
	 * fixed amount.
	 */
	if (packets < 56) {
		itr = rc->target_itr + IAVF_ITR_ADAPTIVE_MIN_INC;
		/* clamp to the maximum while keeping only the latency flag */
		if ((itr & IAVF_ITR_MASK) > IAVF_ITR_ADAPTIVE_MAX_USECS) {
			itr &= IAVF_ITR_ADAPTIVE_LATENCY;
			itr += IAVF_ITR_ADAPTIVE_MAX_USECS;
		}
		goto clear_counts;
	}

	if (packets <= 256) {
		itr = min(q_vector->tx.current_itr, q_vector->rx.current_itr);
		itr &= IAVF_ITR_MASK;

		/* Between 56 and 112 is our "goldilocks" zone where we are
		 * working out "just right". Just report that our current
		 * ITR is good for us.
		 */
		if (packets <= 112)
			goto clear_counts;

		/* With a packet count above 112 (and at most 256) we are
		 * likely looking at a slight overrun of the delay we want.
		 * Try halving our delay to see if that will cut the number
		 * of packets in half per interrupt.
		 */
		itr /= 2;
		itr &= IAVF_ITR_MASK;
		if (itr < IAVF_ITR_ADAPTIVE_MIN_USECS)
			itr = IAVF_ITR_ADAPTIVE_MIN_USECS;

		goto clear_counts;
	}

	/* The paths below assume we are dealing with a bulk ITR since
	 * number of packets is greater than 256. We are just going to have
	 * to compute a value and try to bring the count under control,
	 * though for smaller packet sizes there isn't much we can do as
	 * NAPI polling will likely be kicking in sooner rather than later.
	 */
	itr = IAVF_ITR_ADAPTIVE_BULK;

adjust_by_size:
	/* If packet counts are 256 or greater we can assume we have a gross
	 * overestimation of what the rate should be. Instead of trying to fine
	 * tune it just use the formula below to try and dial in an exact value
	 * given the current packet size of the frame.
	 *
	 * packets is non-zero here: the Rx jump to this label requires
	 * packets != 0, and the fall-through path requires packets > 256.
	 */
	avg_wire_size = bytes / packets;

	/* The following is a crude approximation of:
	 *  wmem_default / (size + overhead) = desired_pkts_per_int
	 *  rate / bits_per_byte / (size + ethernet overhead) = pkt_rate
	 *  (desired_pkt_rate / pkt_rate) * usecs_per_sec = ITR value
	 *
	 * Assuming wmem_default is 212992 and overhead is 640 bytes per
	 * packet, (256 skb, 64 headroom, 320 shared info), we can reduce the
	 * formula down to
	 *
	 *  (170 * (size + 24)) / (size + 640) = ITR
	 *
	 * We first do some math on the packet size and then finally bitshift
	 * by 8 after rounding up. We also have to account for PCIe link speed
	 * difference as ITR scales based on this.
	 */
	if (avg_wire_size <= 60) {
		/* Start at 250k ints/sec */
		avg_wire_size = 4096;
	} else if (avg_wire_size <= 380) {
		/* 250K ints/sec to 60K ints/sec */
		avg_wire_size *= 40;
		avg_wire_size += 1696;
	} else if (avg_wire_size <= 1084) {
		/* 60K ints/sec to 36K ints/sec */
		avg_wire_size *= 15;
		avg_wire_size += 11452;
	} else if (avg_wire_size <= 1980) {
		/* 36K ints/sec to 30K ints/sec */
		avg_wire_size *= 5;
		avg_wire_size += 22420;
	} else {
		/* plateau at a limit of 30K ints/sec */
		avg_wire_size = 32256;
	}

	/* If we are in low latency mode halve our delay which doubles the
	 * rate to somewhere between 100K to 16K ints/sec
	 */
	if (itr & IAVF_ITR_ADAPTIVE_LATENCY)
		avg_wire_size /= 2;

	/* Resultant value is 256 times larger than it needs to be. This
	 * gives us room to adjust the value as needed to either increase
	 * or decrease the value based on link speeds of 10G, 2.5G, 1G, etc.
	 *
	 * Use addition as we have already recorded the new latency flag
	 * for the ITR value.
	 */
	itr += DIV_ROUND_UP(avg_wire_size,
			    iavf_itr_divisor(q_vector->adapter)) *
		IAVF_ITR_ADAPTIVE_MIN_INC;

	/* clamp to the maximum while keeping only the latency flag */
	if ((itr & IAVF_ITR_MASK) > IAVF_ITR_ADAPTIVE_MAX_USECS) {
		itr &= IAVF_ITR_ADAPTIVE_LATENCY;
		itr += IAVF_ITR_ADAPTIVE_MAX_USECS;
	}

clear_counts:
	/* write back value */
	rc->target_itr = itr;

	/* next update should occur within next jiffy */
	rc->next_update = next_update + 1;

	rc->total_bytes = 0;
	rc->total_packets = 0;
}
    639
    640/**
    641 * iavf_setup_tx_descriptors - Allocate the Tx descriptors
    642 * @tx_ring: the tx ring to set up
    643 *
    644 * Return 0 on success, negative on error
    645 **/
    646int iavf_setup_tx_descriptors(struct iavf_ring *tx_ring)
    647{
    648	struct device *dev = tx_ring->dev;
    649	int bi_size;
    650
    651	if (!dev)
    652		return -ENOMEM;
    653
    654	/* warn if we are about to overwrite the pointer */
    655	WARN_ON(tx_ring->tx_bi);
    656	bi_size = sizeof(struct iavf_tx_buffer) * tx_ring->count;
    657	tx_ring->tx_bi = kzalloc(bi_size, GFP_KERNEL);
    658	if (!tx_ring->tx_bi)
    659		goto err;
    660
    661	/* round up to nearest 4K */
    662	tx_ring->size = tx_ring->count * sizeof(struct iavf_tx_desc);
    663	tx_ring->size = ALIGN(tx_ring->size, 4096);
    664	tx_ring->desc = dma_alloc_coherent(dev, tx_ring->size,
    665					   &tx_ring->dma, GFP_KERNEL);
    666	if (!tx_ring->desc) {
    667		dev_info(dev, "Unable to allocate memory for the Tx descriptor ring, size=%d\n",
    668			 tx_ring->size);
    669		goto err;
    670	}
    671
    672	tx_ring->next_to_use = 0;
    673	tx_ring->next_to_clean = 0;
    674	tx_ring->tx_stats.prev_pkt_ctr = -1;
    675	return 0;
    676
    677err:
    678	kfree(tx_ring->tx_bi);
    679	tx_ring->tx_bi = NULL;
    680	return -ENOMEM;
    681}
    682
    683/**
    684 * iavf_clean_rx_ring - Free Rx buffers
    685 * @rx_ring: ring to be cleaned
    686 **/
    687void iavf_clean_rx_ring(struct iavf_ring *rx_ring)
    688{
    689	unsigned long bi_size;
    690	u16 i;
    691
    692	/* ring already cleared, nothing to do */
    693	if (!rx_ring->rx_bi)
    694		return;
    695
    696	if (rx_ring->skb) {
    697		dev_kfree_skb(rx_ring->skb);
    698		rx_ring->skb = NULL;
    699	}
    700
    701	/* Free all the Rx ring sk_buffs */
    702	for (i = 0; i < rx_ring->count; i++) {
    703		struct iavf_rx_buffer *rx_bi = &rx_ring->rx_bi[i];
    704
    705		if (!rx_bi->page)
    706			continue;
    707
    708		/* Invalidate cache lines that may have been written to by
    709		 * device so that we avoid corrupting memory.
    710		 */
    711		dma_sync_single_range_for_cpu(rx_ring->dev,
    712					      rx_bi->dma,
    713					      rx_bi->page_offset,
    714					      rx_ring->rx_buf_len,
    715					      DMA_FROM_DEVICE);
    716
    717		/* free resources associated with mapping */
    718		dma_unmap_page_attrs(rx_ring->dev, rx_bi->dma,
    719				     iavf_rx_pg_size(rx_ring),
    720				     DMA_FROM_DEVICE,
    721				     IAVF_RX_DMA_ATTR);
    722
    723		__page_frag_cache_drain(rx_bi->page, rx_bi->pagecnt_bias);
    724
    725		rx_bi->page = NULL;
    726		rx_bi->page_offset = 0;
    727	}
    728
    729	bi_size = sizeof(struct iavf_rx_buffer) * rx_ring->count;
    730	memset(rx_ring->rx_bi, 0, bi_size);
    731
    732	/* Zero out the descriptor ring */
    733	memset(rx_ring->desc, 0, rx_ring->size);
    734
    735	rx_ring->next_to_alloc = 0;
    736	rx_ring->next_to_clean = 0;
    737	rx_ring->next_to_use = 0;
    738}
    739
    740/**
    741 * iavf_free_rx_resources - Free Rx resources
    742 * @rx_ring: ring to clean the resources from
    743 *
    744 * Free all receive software resources
    745 **/
    746void iavf_free_rx_resources(struct iavf_ring *rx_ring)
    747{
    748	iavf_clean_rx_ring(rx_ring);
    749	kfree(rx_ring->rx_bi);
    750	rx_ring->rx_bi = NULL;
    751
    752	if (rx_ring->desc) {
    753		dma_free_coherent(rx_ring->dev, rx_ring->size,
    754				  rx_ring->desc, rx_ring->dma);
    755		rx_ring->desc = NULL;
    756	}
    757}
    758
    759/**
    760 * iavf_setup_rx_descriptors - Allocate Rx descriptors
    761 * @rx_ring: Rx descriptor ring (for a specific queue) to setup
    762 *
    763 * Returns 0 on success, negative on failure
    764 **/
    765int iavf_setup_rx_descriptors(struct iavf_ring *rx_ring)
    766{
    767	struct device *dev = rx_ring->dev;
    768	int bi_size;
    769
    770	/* warn if we are about to overwrite the pointer */
    771	WARN_ON(rx_ring->rx_bi);
    772	bi_size = sizeof(struct iavf_rx_buffer) * rx_ring->count;
    773	rx_ring->rx_bi = kzalloc(bi_size, GFP_KERNEL);
    774	if (!rx_ring->rx_bi)
    775		goto err;
    776
    777	u64_stats_init(&rx_ring->syncp);
    778
    779	/* Round up to nearest 4K */
    780	rx_ring->size = rx_ring->count * sizeof(union iavf_32byte_rx_desc);
    781	rx_ring->size = ALIGN(rx_ring->size, 4096);
    782	rx_ring->desc = dma_alloc_coherent(dev, rx_ring->size,
    783					   &rx_ring->dma, GFP_KERNEL);
    784
    785	if (!rx_ring->desc) {
    786		dev_info(dev, "Unable to allocate memory for the Rx descriptor ring, size=%d\n",
    787			 rx_ring->size);
    788		goto err;
    789	}
    790
    791	rx_ring->next_to_alloc = 0;
    792	rx_ring->next_to_clean = 0;
    793	rx_ring->next_to_use = 0;
    794
    795	return 0;
    796err:
    797	kfree(rx_ring->rx_bi);
    798	rx_ring->rx_bi = NULL;
    799	return -ENOMEM;
    800}
    801
    802/**
    803 * iavf_release_rx_desc - Store the new tail and head values
    804 * @rx_ring: ring to bump
    805 * @val: new head index
    806 **/
    807static inline void iavf_release_rx_desc(struct iavf_ring *rx_ring, u32 val)
    808{
    809	rx_ring->next_to_use = val;
    810
    811	/* update next to alloc since we have filled the ring */
    812	rx_ring->next_to_alloc = val;
    813
    814	/* Force memory writes to complete before letting h/w
    815	 * know there are new descriptors to fetch.  (Only
    816	 * applicable for weak-ordered memory model archs,
    817	 * such as IA-64).
    818	 */
    819	wmb();
    820	writel(val, rx_ring->tail);
    821}
    822
    823/**
    824 * iavf_rx_offset - Return expected offset into page to access data
    825 * @rx_ring: Ring we are requesting offset of
    826 *
    827 * Returns the offset value for ring into the data buffer.
    828 */
    829static inline unsigned int iavf_rx_offset(struct iavf_ring *rx_ring)
    830{
    831	return ring_uses_build_skb(rx_ring) ? IAVF_SKB_PAD : 0;
    832}
    833
    834/**
    835 * iavf_alloc_mapped_page - recycle or make a new page
    836 * @rx_ring: ring to use
    837 * @bi: rx_buffer struct to modify
    838 *
    839 * Returns true if the page was successfully allocated or
    840 * reused.
    841 **/
    842static bool iavf_alloc_mapped_page(struct iavf_ring *rx_ring,
    843				   struct iavf_rx_buffer *bi)
    844{
    845	struct page *page = bi->page;
    846	dma_addr_t dma;
    847
    848	/* since we are recycling buffers we should seldom need to alloc */
    849	if (likely(page)) {
    850		rx_ring->rx_stats.page_reuse_count++;
    851		return true;
    852	}
    853
    854	/* alloc new page for storage */
    855	page = dev_alloc_pages(iavf_rx_pg_order(rx_ring));
    856	if (unlikely(!page)) {
    857		rx_ring->rx_stats.alloc_page_failed++;
    858		return false;
    859	}
    860
    861	/* map page for use */
    862	dma = dma_map_page_attrs(rx_ring->dev, page, 0,
    863				 iavf_rx_pg_size(rx_ring),
    864				 DMA_FROM_DEVICE,
    865				 IAVF_RX_DMA_ATTR);
    866
    867	/* if mapping failed free memory back to system since
    868	 * there isn't much point in holding memory we can't use
    869	 */
    870	if (dma_mapping_error(rx_ring->dev, dma)) {
    871		__free_pages(page, iavf_rx_pg_order(rx_ring));
    872		rx_ring->rx_stats.alloc_page_failed++;
    873		return false;
    874	}
    875
    876	bi->dma = dma;
    877	bi->page = page;
    878	bi->page_offset = iavf_rx_offset(rx_ring);
    879
    880	/* initialize pagecnt_bias to 1 representing we fully own page */
    881	bi->pagecnt_bias = 1;
    882
    883	return true;
    884}
    885
    886/**
    887 * iavf_receive_skb - Send a completed packet up the stack
    888 * @rx_ring:  rx ring in play
    889 * @skb: packet to send up
    890 * @vlan_tag: vlan tag for packet
    891 **/
    892static void iavf_receive_skb(struct iavf_ring *rx_ring,
    893			     struct sk_buff *skb, u16 vlan_tag)
    894{
    895	struct iavf_q_vector *q_vector = rx_ring->q_vector;
    896
    897	if ((rx_ring->netdev->features & NETIF_F_HW_VLAN_CTAG_RX) &&
    898	    (vlan_tag & VLAN_VID_MASK))
    899		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tag);
    900	else if ((rx_ring->netdev->features & NETIF_F_HW_VLAN_STAG_RX) &&
    901		 vlan_tag & VLAN_VID_MASK)
    902		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021AD), vlan_tag);
    903
    904	napi_gro_receive(&q_vector->napi, skb);
    905}
    906
    907/**
    908 * iavf_alloc_rx_buffers - Replace used receive buffers
    909 * @rx_ring: ring to place buffers on
    910 * @cleaned_count: number of buffers to replace
    911 *
    912 * Returns false if all allocations were successful, true if any fail
    913 **/
    914bool iavf_alloc_rx_buffers(struct iavf_ring *rx_ring, u16 cleaned_count)
    915{
    916	u16 ntu = rx_ring->next_to_use;
    917	union iavf_rx_desc *rx_desc;
    918	struct iavf_rx_buffer *bi;
    919
    920	/* do nothing if no valid netdev defined */
    921	if (!rx_ring->netdev || !cleaned_count)
    922		return false;
    923
    924	rx_desc = IAVF_RX_DESC(rx_ring, ntu);
    925	bi = &rx_ring->rx_bi[ntu];
    926
    927	do {
    928		if (!iavf_alloc_mapped_page(rx_ring, bi))
    929			goto no_buffers;
    930
    931		/* sync the buffer for use by the device */
    932		dma_sync_single_range_for_device(rx_ring->dev, bi->dma,
    933						 bi->page_offset,
    934						 rx_ring->rx_buf_len,
    935						 DMA_FROM_DEVICE);
    936
    937		/* Refresh the desc even if buffer_addrs didn't change
    938		 * because each write-back erases this info.
    939		 */
    940		rx_desc->read.pkt_addr = cpu_to_le64(bi->dma + bi->page_offset);
    941
    942		rx_desc++;
    943		bi++;
    944		ntu++;
    945		if (unlikely(ntu == rx_ring->count)) {
    946			rx_desc = IAVF_RX_DESC(rx_ring, 0);
    947			bi = rx_ring->rx_bi;
    948			ntu = 0;
    949		}
    950
    951		/* clear the status bits for the next_to_use descriptor */
    952		rx_desc->wb.qword1.status_error_len = 0;
    953
    954		cleaned_count--;
    955	} while (cleaned_count);
    956
    957	if (rx_ring->next_to_use != ntu)
    958		iavf_release_rx_desc(rx_ring, ntu);
    959
    960	return false;
    961
    962no_buffers:
    963	if (rx_ring->next_to_use != ntu)
    964		iavf_release_rx_desc(rx_ring, ntu);
    965
    966	/* make sure to come back via polling to try again after
    967	 * allocation failure
    968	 */
    969	return true;
    970}
    971
    972/**
    973 * iavf_rx_checksum - Indicate in skb if hw indicated a good cksum
    974 * @vsi: the VSI we care about
    975 * @skb: skb currently being received and modified
    976 * @rx_desc: the receive descriptor
    977 **/
    978static inline void iavf_rx_checksum(struct iavf_vsi *vsi,
    979				    struct sk_buff *skb,
    980				    union iavf_rx_desc *rx_desc)
    981{
    982	struct iavf_rx_ptype_decoded decoded;
    983	u32 rx_error, rx_status;
    984	bool ipv4, ipv6;
    985	u8 ptype;
    986	u64 qword;
    987
    988	qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
    989	ptype = (qword & IAVF_RXD_QW1_PTYPE_MASK) >> IAVF_RXD_QW1_PTYPE_SHIFT;
    990	rx_error = (qword & IAVF_RXD_QW1_ERROR_MASK) >>
    991		   IAVF_RXD_QW1_ERROR_SHIFT;
    992	rx_status = (qword & IAVF_RXD_QW1_STATUS_MASK) >>
    993		    IAVF_RXD_QW1_STATUS_SHIFT;
    994	decoded = decode_rx_desc_ptype(ptype);
    995
    996	skb->ip_summed = CHECKSUM_NONE;
    997
    998	skb_checksum_none_assert(skb);
    999
   1000	/* Rx csum enabled and ip headers found? */
   1001	if (!(vsi->netdev->features & NETIF_F_RXCSUM))
   1002		return;
   1003
   1004	/* did the hardware decode the packet and checksum? */
   1005	if (!(rx_status & BIT(IAVF_RX_DESC_STATUS_L3L4P_SHIFT)))
   1006		return;
   1007
   1008	/* both known and outer_ip must be set for the below code to work */
   1009	if (!(decoded.known && decoded.outer_ip))
   1010		return;
   1011
   1012	ipv4 = (decoded.outer_ip == IAVF_RX_PTYPE_OUTER_IP) &&
   1013	       (decoded.outer_ip_ver == IAVF_RX_PTYPE_OUTER_IPV4);
   1014	ipv6 = (decoded.outer_ip == IAVF_RX_PTYPE_OUTER_IP) &&
   1015	       (decoded.outer_ip_ver == IAVF_RX_PTYPE_OUTER_IPV6);
   1016
   1017	if (ipv4 &&
   1018	    (rx_error & (BIT(IAVF_RX_DESC_ERROR_IPE_SHIFT) |
   1019			 BIT(IAVF_RX_DESC_ERROR_EIPE_SHIFT))))
   1020		goto checksum_fail;
   1021
   1022	/* likely incorrect csum if alternate IP extension headers found */
   1023	if (ipv6 &&
   1024	    rx_status & BIT(IAVF_RX_DESC_STATUS_IPV6EXADD_SHIFT))
   1025		/* don't increment checksum err here, non-fatal err */
   1026		return;
   1027
   1028	/* there was some L4 error, count error and punt packet to the stack */
   1029	if (rx_error & BIT(IAVF_RX_DESC_ERROR_L4E_SHIFT))
   1030		goto checksum_fail;
   1031
   1032	/* handle packets that were not able to be checksummed due
   1033	 * to arrival speed, in this case the stack can compute
   1034	 * the csum.
   1035	 */
   1036	if (rx_error & BIT(IAVF_RX_DESC_ERROR_PPRS_SHIFT))
   1037		return;
   1038
   1039	/* Only report checksum unnecessary for TCP, UDP, or SCTP */
   1040	switch (decoded.inner_prot) {
   1041	case IAVF_RX_PTYPE_INNER_PROT_TCP:
   1042	case IAVF_RX_PTYPE_INNER_PROT_UDP:
   1043	case IAVF_RX_PTYPE_INNER_PROT_SCTP:
   1044		skb->ip_summed = CHECKSUM_UNNECESSARY;
   1045		fallthrough;
   1046	default:
   1047		break;
   1048	}
   1049
   1050	return;
   1051
   1052checksum_fail:
   1053	vsi->back->hw_csum_rx_error++;
   1054}
   1055
   1056/**
   1057 * iavf_ptype_to_htype - get a hash type
   1058 * @ptype: the ptype value from the descriptor
   1059 *
   1060 * Returns a hash type to be used by skb_set_hash
   1061 **/
   1062static inline int iavf_ptype_to_htype(u8 ptype)
   1063{
   1064	struct iavf_rx_ptype_decoded decoded = decode_rx_desc_ptype(ptype);
   1065
   1066	if (!decoded.known)
   1067		return PKT_HASH_TYPE_NONE;
   1068
   1069	if (decoded.outer_ip == IAVF_RX_PTYPE_OUTER_IP &&
   1070	    decoded.payload_layer == IAVF_RX_PTYPE_PAYLOAD_LAYER_PAY4)
   1071		return PKT_HASH_TYPE_L4;
   1072	else if (decoded.outer_ip == IAVF_RX_PTYPE_OUTER_IP &&
   1073		 decoded.payload_layer == IAVF_RX_PTYPE_PAYLOAD_LAYER_PAY3)
   1074		return PKT_HASH_TYPE_L3;
   1075	else
   1076		return PKT_HASH_TYPE_L2;
   1077}
   1078
   1079/**
   1080 * iavf_rx_hash - set the hash value in the skb
   1081 * @ring: descriptor ring
   1082 * @rx_desc: specific descriptor
   1083 * @skb: skb currently being received and modified
   1084 * @rx_ptype: Rx packet type
   1085 **/
   1086static inline void iavf_rx_hash(struct iavf_ring *ring,
   1087				union iavf_rx_desc *rx_desc,
   1088				struct sk_buff *skb,
   1089				u8 rx_ptype)
   1090{
   1091	u32 hash;
   1092	const __le64 rss_mask =
   1093		cpu_to_le64((u64)IAVF_RX_DESC_FLTSTAT_RSS_HASH <<
   1094			    IAVF_RX_DESC_STATUS_FLTSTAT_SHIFT);
   1095
   1096	if (ring->netdev->features & NETIF_F_RXHASH)
   1097		return;
   1098
   1099	if ((rx_desc->wb.qword1.status_error_len & rss_mask) == rss_mask) {
   1100		hash = le32_to_cpu(rx_desc->wb.qword0.hi_dword.rss);
   1101		skb_set_hash(skb, hash, iavf_ptype_to_htype(rx_ptype));
   1102	}
   1103}
   1104
   1105/**
   1106 * iavf_process_skb_fields - Populate skb header fields from Rx descriptor
   1107 * @rx_ring: rx descriptor ring packet is being transacted on
   1108 * @rx_desc: pointer to the EOP Rx descriptor
   1109 * @skb: pointer to current skb being populated
   1110 * @rx_ptype: the packet type decoded by hardware
   1111 *
   1112 * This function checks the ring, descriptor, and packet information in
   1113 * order to populate the hash, checksum, VLAN, protocol, and
   1114 * other fields within the skb.
   1115 **/
   1116static inline
   1117void iavf_process_skb_fields(struct iavf_ring *rx_ring,
   1118			     union iavf_rx_desc *rx_desc, struct sk_buff *skb,
   1119			     u8 rx_ptype)
   1120{
   1121	iavf_rx_hash(rx_ring, rx_desc, skb, rx_ptype);
   1122
   1123	iavf_rx_checksum(rx_ring->vsi, skb, rx_desc);
   1124
   1125	skb_record_rx_queue(skb, rx_ring->queue_index);
   1126
   1127	/* modifies the skb - consumes the enet header */
   1128	skb->protocol = eth_type_trans(skb, rx_ring->netdev);
   1129}
   1130
   1131/**
   1132 * iavf_cleanup_headers - Correct empty headers
   1133 * @rx_ring: rx descriptor ring packet is being transacted on
   1134 * @skb: pointer to current skb being fixed
   1135 *
   1136 * Also address the case where we are pulling data in on pages only
   1137 * and as such no data is present in the skb header.
   1138 *
   1139 * In addition if skb is not at least 60 bytes we need to pad it so that
   1140 * it is large enough to qualify as a valid Ethernet frame.
   1141 *
   1142 * Returns true if an error was encountered and skb was freed.
   1143 **/
   1144static bool iavf_cleanup_headers(struct iavf_ring *rx_ring, struct sk_buff *skb)
   1145{
   1146	/* if eth_skb_pad returns an error the skb was freed */
   1147	if (eth_skb_pad(skb))
   1148		return true;
   1149
   1150	return false;
   1151}
   1152
   1153/**
   1154 * iavf_reuse_rx_page - page flip buffer and store it back on the ring
   1155 * @rx_ring: rx descriptor ring to store buffers on
   1156 * @old_buff: donor buffer to have page reused
   1157 *
   1158 * Synchronizes page for reuse by the adapter
   1159 **/
   1160static void iavf_reuse_rx_page(struct iavf_ring *rx_ring,
   1161			       struct iavf_rx_buffer *old_buff)
   1162{
   1163	struct iavf_rx_buffer *new_buff;
   1164	u16 nta = rx_ring->next_to_alloc;
   1165
   1166	new_buff = &rx_ring->rx_bi[nta];
   1167
   1168	/* update, and store next to alloc */
   1169	nta++;
   1170	rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
   1171
   1172	/* transfer page from old buffer to new buffer */
   1173	new_buff->dma		= old_buff->dma;
   1174	new_buff->page		= old_buff->page;
   1175	new_buff->page_offset	= old_buff->page_offset;
   1176	new_buff->pagecnt_bias	= old_buff->pagecnt_bias;
   1177}
   1178
   1179/**
   1180 * iavf_can_reuse_rx_page - Determine if this page can be reused by
   1181 * the adapter for another receive
   1182 *
   1183 * @rx_buffer: buffer containing the page
   1184 *
   1185 * If page is reusable, rx_buffer->page_offset is adjusted to point to
   1186 * an unused region in the page.
   1187 *
   1188 * For small pages, @truesize will be a constant value, half the size
   1189 * of the memory at page.  We'll attempt to alternate between high and
   1190 * low halves of the page, with one half ready for use by the hardware
   1191 * and the other half being consumed by the stack.  We use the page
   1192 * ref count to determine whether the stack has finished consuming the
   1193 * portion of this page that was passed up with a previous packet.  If
   1194 * the page ref count is >1, we'll assume the "other" half page is
   1195 * still busy, and this page cannot be reused.
   1196 *
   1197 * For larger pages, @truesize will be the actual space used by the
   1198 * received packet (adjusted upward to an even multiple of the cache
   1199 * line size).  This will advance through the page by the amount
   1200 * actually consumed by the received packets while there is still
   1201 * space for a buffer.  Each region of larger pages will be used at
   1202 * most once, after which the page will not be reused.
   1203 *
   1204 * In either case, if the page is reusable its refcount is increased.
   1205 **/
   1206static bool iavf_can_reuse_rx_page(struct iavf_rx_buffer *rx_buffer)
   1207{
   1208	unsigned int pagecnt_bias = rx_buffer->pagecnt_bias;
   1209	struct page *page = rx_buffer->page;
   1210
   1211	/* Is any reuse possible? */
   1212	if (!dev_page_is_reusable(page))
   1213		return false;
   1214
   1215#if (PAGE_SIZE < 8192)
   1216	/* if we are only owner of page we can reuse it */
   1217	if (unlikely((page_count(page) - pagecnt_bias) > 1))
   1218		return false;
   1219#else
   1220#define IAVF_LAST_OFFSET \
   1221	(SKB_WITH_OVERHEAD(PAGE_SIZE) - IAVF_RXBUFFER_2048)
   1222	if (rx_buffer->page_offset > IAVF_LAST_OFFSET)
   1223		return false;
   1224#endif
   1225
   1226	/* If we have drained the page fragment pool we need to update
   1227	 * the pagecnt_bias and page count so that we fully restock the
   1228	 * number of references the driver holds.
   1229	 */
   1230	if (unlikely(!pagecnt_bias)) {
   1231		page_ref_add(page, USHRT_MAX);
   1232		rx_buffer->pagecnt_bias = USHRT_MAX;
   1233	}
   1234
   1235	return true;
   1236}
   1237
   1238/**
   1239 * iavf_add_rx_frag - Add contents of Rx buffer to sk_buff
   1240 * @rx_ring: rx descriptor ring to transact packets on
   1241 * @rx_buffer: buffer containing page to add
   1242 * @skb: sk_buff to place the data into
   1243 * @size: packet length from rx_desc
   1244 *
   1245 * This function will add the data contained in rx_buffer->page to the skb.
   1246 * It will just attach the page as a frag to the skb.
   1247 *
   1248 * The function will then update the page offset.
   1249 **/
   1250static void iavf_add_rx_frag(struct iavf_ring *rx_ring,
   1251			     struct iavf_rx_buffer *rx_buffer,
   1252			     struct sk_buff *skb,
   1253			     unsigned int size)
   1254{
   1255#if (PAGE_SIZE < 8192)
   1256	unsigned int truesize = iavf_rx_pg_size(rx_ring) / 2;
   1257#else
   1258	unsigned int truesize = SKB_DATA_ALIGN(size + iavf_rx_offset(rx_ring));
   1259#endif
   1260
   1261	if (!size)
   1262		return;
   1263
   1264	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_buffer->page,
   1265			rx_buffer->page_offset, size, truesize);
   1266
   1267	/* page is being used so we must update the page offset */
   1268#if (PAGE_SIZE < 8192)
   1269	rx_buffer->page_offset ^= truesize;
   1270#else
   1271	rx_buffer->page_offset += truesize;
   1272#endif
   1273}
   1274
   1275/**
   1276 * iavf_get_rx_buffer - Fetch Rx buffer and synchronize data for use
   1277 * @rx_ring: rx descriptor ring to transact packets on
   1278 * @size: size of buffer to add to skb
   1279 *
   1280 * This function will pull an Rx buffer from the ring and synchronize it
   1281 * for use by the CPU.
   1282 */
   1283static struct iavf_rx_buffer *iavf_get_rx_buffer(struct iavf_ring *rx_ring,
   1284						 const unsigned int size)
   1285{
   1286	struct iavf_rx_buffer *rx_buffer;
   1287
   1288	if (!size)
   1289		return NULL;
   1290
   1291	rx_buffer = &rx_ring->rx_bi[rx_ring->next_to_clean];
   1292	prefetchw(rx_buffer->page);
   1293
   1294	/* we are reusing so sync this buffer for CPU use */
   1295	dma_sync_single_range_for_cpu(rx_ring->dev,
   1296				      rx_buffer->dma,
   1297				      rx_buffer->page_offset,
   1298				      size,
   1299				      DMA_FROM_DEVICE);
   1300
   1301	/* We have pulled a buffer for use, so decrement pagecnt_bias */
   1302	rx_buffer->pagecnt_bias--;
   1303
   1304	return rx_buffer;
   1305}
   1306
   1307/**
   1308 * iavf_construct_skb - Allocate skb and populate it
   1309 * @rx_ring: rx descriptor ring to transact packets on
   1310 * @rx_buffer: rx buffer to pull data from
   1311 * @size: size of buffer to add to skb
   1312 *
   1313 * This function allocates an skb.  It then populates it with the page
   1314 * data from the current receive descriptor, taking care to set up the
   1315 * skb correctly.
   1316 */
   1317static struct sk_buff *iavf_construct_skb(struct iavf_ring *rx_ring,
   1318					  struct iavf_rx_buffer *rx_buffer,
   1319					  unsigned int size)
   1320{
   1321	void *va;
   1322#if (PAGE_SIZE < 8192)
   1323	unsigned int truesize = iavf_rx_pg_size(rx_ring) / 2;
   1324#else
   1325	unsigned int truesize = SKB_DATA_ALIGN(size);
   1326#endif
   1327	unsigned int headlen;
   1328	struct sk_buff *skb;
   1329
   1330	if (!rx_buffer)
   1331		return NULL;
   1332	/* prefetch first cache line of first page */
   1333	va = page_address(rx_buffer->page) + rx_buffer->page_offset;
   1334	net_prefetch(va);
   1335
   1336	/* allocate a skb to store the frags */
   1337	skb = __napi_alloc_skb(&rx_ring->q_vector->napi,
   1338			       IAVF_RX_HDR_SIZE,
   1339			       GFP_ATOMIC | __GFP_NOWARN);
   1340	if (unlikely(!skb))
   1341		return NULL;
   1342
   1343	/* Determine available headroom for copy */
   1344	headlen = size;
   1345	if (headlen > IAVF_RX_HDR_SIZE)
   1346		headlen = eth_get_headlen(skb->dev, va, IAVF_RX_HDR_SIZE);
   1347
   1348	/* align pull length to size of long to optimize memcpy performance */
   1349	memcpy(__skb_put(skb, headlen), va, ALIGN(headlen, sizeof(long)));
   1350
   1351	/* update all of the pointers */
   1352	size -= headlen;
   1353	if (size) {
   1354		skb_add_rx_frag(skb, 0, rx_buffer->page,
   1355				rx_buffer->page_offset + headlen,
   1356				size, truesize);
   1357
   1358		/* buffer is used by skb, update page_offset */
   1359#if (PAGE_SIZE < 8192)
   1360		rx_buffer->page_offset ^= truesize;
   1361#else
   1362		rx_buffer->page_offset += truesize;
   1363#endif
   1364	} else {
   1365		/* buffer is unused, reset bias back to rx_buffer */
   1366		rx_buffer->pagecnt_bias++;
   1367	}
   1368
   1369	return skb;
   1370}
   1371
   1372/**
   1373 * iavf_build_skb - Build skb around an existing buffer
   1374 * @rx_ring: Rx descriptor ring to transact packets on
   1375 * @rx_buffer: Rx buffer to pull data from
   1376 * @size: size of buffer to add to skb
   1377 *
   1378 * This function builds an skb around an existing Rx buffer, taking care
   1379 * to set up the skb correctly and avoid any memcpy overhead.
   1380 */
   1381static struct sk_buff *iavf_build_skb(struct iavf_ring *rx_ring,
   1382				      struct iavf_rx_buffer *rx_buffer,
   1383				      unsigned int size)
   1384{
   1385	void *va;
   1386#if (PAGE_SIZE < 8192)
   1387	unsigned int truesize = iavf_rx_pg_size(rx_ring) / 2;
   1388#else
   1389	unsigned int truesize = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) +
   1390				SKB_DATA_ALIGN(IAVF_SKB_PAD + size);
   1391#endif
   1392	struct sk_buff *skb;
   1393
   1394	if (!rx_buffer)
   1395		return NULL;
   1396	/* prefetch first cache line of first page */
   1397	va = page_address(rx_buffer->page) + rx_buffer->page_offset;
   1398	net_prefetch(va);
   1399
   1400	/* build an skb around the page buffer */
   1401	skb = napi_build_skb(va - IAVF_SKB_PAD, truesize);
   1402	if (unlikely(!skb))
   1403		return NULL;
   1404
   1405	/* update pointers within the skb to store the data */
   1406	skb_reserve(skb, IAVF_SKB_PAD);
   1407	__skb_put(skb, size);
   1408
   1409	/* buffer is used by skb, update page_offset */
   1410#if (PAGE_SIZE < 8192)
   1411	rx_buffer->page_offset ^= truesize;
   1412#else
   1413	rx_buffer->page_offset += truesize;
   1414#endif
   1415
   1416	return skb;
   1417}
   1418
   1419/**
   1420 * iavf_put_rx_buffer - Clean up used buffer and either recycle or free
   1421 * @rx_ring: rx descriptor ring to transact packets on
   1422 * @rx_buffer: rx buffer to pull data from
   1423 *
   1424 * This function will clean up the contents of the rx_buffer.  It will
   1425 * either recycle the buffer or unmap it and free the associated resources.
   1426 */
   1427static void iavf_put_rx_buffer(struct iavf_ring *rx_ring,
   1428			       struct iavf_rx_buffer *rx_buffer)
   1429{
   1430	if (!rx_buffer)
   1431		return;
   1432
   1433	if (iavf_can_reuse_rx_page(rx_buffer)) {
   1434		/* hand second half of page back to the ring */
   1435		iavf_reuse_rx_page(rx_ring, rx_buffer);
   1436		rx_ring->rx_stats.page_reuse_count++;
   1437	} else {
   1438		/* we are not reusing the buffer so unmap it */
   1439		dma_unmap_page_attrs(rx_ring->dev, rx_buffer->dma,
   1440				     iavf_rx_pg_size(rx_ring),
   1441				     DMA_FROM_DEVICE, IAVF_RX_DMA_ATTR);
   1442		__page_frag_cache_drain(rx_buffer->page,
   1443					rx_buffer->pagecnt_bias);
   1444	}
   1445
   1446	/* clear contents of buffer_info */
   1447	rx_buffer->page = NULL;
   1448}
   1449
   1450/**
   1451 * iavf_is_non_eop - process handling of non-EOP buffers
   1452 * @rx_ring: Rx ring being processed
   1453 * @rx_desc: Rx descriptor for current buffer
   1454 * @skb: Current socket buffer containing buffer in progress
   1455 *
   1456 * This function updates next to clean.  If the buffer is an EOP buffer
   1457 * this function exits returning false, otherwise it will place the
   1458 * sk_buff in the next buffer to be chained and return true indicating
   1459 * that this is in fact a non-EOP buffer.
   1460 **/
   1461static bool iavf_is_non_eop(struct iavf_ring *rx_ring,
   1462			    union iavf_rx_desc *rx_desc,
   1463			    struct sk_buff *skb)
   1464{
   1465	u32 ntc = rx_ring->next_to_clean + 1;
   1466
   1467	/* fetch, update, and store next to clean */
   1468	ntc = (ntc < rx_ring->count) ? ntc : 0;
   1469	rx_ring->next_to_clean = ntc;
   1470
   1471	prefetch(IAVF_RX_DESC(rx_ring, ntc));
   1472
   1473	/* if we are the last buffer then there is nothing else to do */
   1474#define IAVF_RXD_EOF BIT(IAVF_RX_DESC_STATUS_EOF_SHIFT)
   1475	if (likely(iavf_test_staterr(rx_desc, IAVF_RXD_EOF)))
   1476		return false;
   1477
   1478	rx_ring->rx_stats.non_eop_descs++;
   1479
   1480	return true;
   1481}
   1482
   1483/**
   1484 * iavf_clean_rx_irq - Clean completed descriptors from Rx ring - bounce buf
   1485 * @rx_ring: rx descriptor ring to transact packets on
   1486 * @budget: Total limit on number of packets to process
   1487 *
   1488 * This function provides a "bounce buffer" approach to Rx interrupt
   1489 * processing.  The advantage to this is that on systems that have
   1490 * expensive overhead for IOMMU access this provides a means of avoiding
   1491 * it by maintaining the mapping of the page to the system.
   1492 *
   1493 * Returns amount of work completed
   1494 **/
   1495static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget)
   1496{
   1497	unsigned int total_rx_bytes = 0, total_rx_packets = 0;
   1498	struct sk_buff *skb = rx_ring->skb;
   1499	u16 cleaned_count = IAVF_DESC_UNUSED(rx_ring);
   1500	bool failure = false;
   1501
   1502	while (likely(total_rx_packets < (unsigned int)budget)) {
   1503		struct iavf_rx_buffer *rx_buffer;
   1504		union iavf_rx_desc *rx_desc;
   1505		unsigned int size;
   1506		u16 vlan_tag = 0;
   1507		u8 rx_ptype;
   1508		u64 qword;
   1509
   1510		/* return some buffers to hardware, one at a time is too slow */
   1511		if (cleaned_count >= IAVF_RX_BUFFER_WRITE) {
   1512			failure = failure ||
   1513				  iavf_alloc_rx_buffers(rx_ring, cleaned_count);
   1514			cleaned_count = 0;
   1515		}
   1516
   1517		rx_desc = IAVF_RX_DESC(rx_ring, rx_ring->next_to_clean);
   1518
   1519		/* status_error_len will always be zero for unused descriptors
   1520		 * because it's cleared in cleanup, and overlaps with hdr_addr
   1521		 * which is always zero because packet split isn't used, if the
   1522		 * hardware wrote DD then the length will be non-zero
   1523		 */
   1524		qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
   1525
   1526		/* This memory barrier is needed to keep us from reading
   1527		 * any other fields out of the rx_desc until we have
   1528		 * verified the descriptor has been written back.
   1529		 */
   1530		dma_rmb();
   1531#define IAVF_RXD_DD BIT(IAVF_RX_DESC_STATUS_DD_SHIFT)
   1532		if (!iavf_test_staterr(rx_desc, IAVF_RXD_DD))
   1533			break;
   1534
   1535		size = (qword & IAVF_RXD_QW1_LENGTH_PBUF_MASK) >>
   1536		       IAVF_RXD_QW1_LENGTH_PBUF_SHIFT;
   1537
   1538		iavf_trace(clean_rx_irq, rx_ring, rx_desc, skb);
   1539		rx_buffer = iavf_get_rx_buffer(rx_ring, size);
   1540
   1541		/* retrieve a buffer from the ring */
   1542		if (skb)
   1543			iavf_add_rx_frag(rx_ring, rx_buffer, skb, size);
   1544		else if (ring_uses_build_skb(rx_ring))
   1545			skb = iavf_build_skb(rx_ring, rx_buffer, size);
   1546		else
   1547			skb = iavf_construct_skb(rx_ring, rx_buffer, size);
   1548
   1549		/* exit if we failed to retrieve a buffer */
   1550		if (!skb) {
   1551			rx_ring->rx_stats.alloc_buff_failed++;
   1552			if (rx_buffer)
   1553				rx_buffer->pagecnt_bias++;
   1554			break;
   1555		}
   1556
   1557		iavf_put_rx_buffer(rx_ring, rx_buffer);
   1558		cleaned_count++;
   1559
   1560		if (iavf_is_non_eop(rx_ring, rx_desc, skb))
   1561			continue;
   1562
   1563		/* ERR_MASK will only have valid bits if EOP set, and
   1564		 * what we are doing here is actually checking
   1565		 * IAVF_RX_DESC_ERROR_RXE_SHIFT, since it is the zeroth bit in
   1566		 * the error field
   1567		 */
   1568		if (unlikely(iavf_test_staterr(rx_desc, BIT(IAVF_RXD_QW1_ERROR_SHIFT)))) {
   1569			dev_kfree_skb_any(skb);
   1570			skb = NULL;
   1571			continue;
   1572		}
   1573
   1574		if (iavf_cleanup_headers(rx_ring, skb)) {
   1575			skb = NULL;
   1576			continue;
   1577		}
   1578
   1579		/* probably a little skewed due to removing CRC */
   1580		total_rx_bytes += skb->len;
   1581
   1582		qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
   1583		rx_ptype = (qword & IAVF_RXD_QW1_PTYPE_MASK) >>
   1584			   IAVF_RXD_QW1_PTYPE_SHIFT;
   1585
   1586		/* populate checksum, VLAN, and protocol */
   1587		iavf_process_skb_fields(rx_ring, rx_desc, skb, rx_ptype);
   1588
   1589		if (qword & BIT(IAVF_RX_DESC_STATUS_L2TAG1P_SHIFT) &&
   1590		    rx_ring->flags & IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1)
   1591			vlan_tag = le16_to_cpu(rx_desc->wb.qword0.lo_dword.l2tag1);
   1592		if (rx_desc->wb.qword2.ext_status &
   1593		    cpu_to_le16(BIT(IAVF_RX_DESC_EXT_STATUS_L2TAG2P_SHIFT)) &&
   1594		    rx_ring->flags & IAVF_RXR_FLAGS_VLAN_TAG_LOC_L2TAG2_2)
   1595			vlan_tag = le16_to_cpu(rx_desc->wb.qword2.l2tag2_2);
   1596
   1597		iavf_trace(clean_rx_irq_rx, rx_ring, rx_desc, skb);
   1598		iavf_receive_skb(rx_ring, skb, vlan_tag);
   1599		skb = NULL;
   1600
   1601		/* update budget accounting */
   1602		total_rx_packets++;
   1603	}
   1604
   1605	rx_ring->skb = skb;
   1606
   1607	u64_stats_update_begin(&rx_ring->syncp);
   1608	rx_ring->stats.packets += total_rx_packets;
   1609	rx_ring->stats.bytes += total_rx_bytes;
   1610	u64_stats_update_end(&rx_ring->syncp);
   1611	rx_ring->q_vector->rx.total_packets += total_rx_packets;
   1612	rx_ring->q_vector->rx.total_bytes += total_rx_bytes;
   1613
   1614	/* guarantee a trip back through this routine if there was a failure */
   1615	return failure ? budget : (int)total_rx_packets;
   1616}
   1617
   1618static inline u32 iavf_buildreg_itr(const int type, u16 itr)
   1619{
   1620	u32 val;
   1621
   1622	/* We don't bother with setting the CLEARPBA bit as the data sheet
   1623	 * points out doing so is "meaningless since it was already
   1624	 * auto-cleared". The auto-clearing happens when the interrupt is
   1625	 * asserted.
   1626	 *
   1627	 * Hardware errata 28 for also indicates that writing to a
   1628	 * xxINT_DYN_CTLx CSR with INTENA_MSK (bit 31) set to 0 will clear
   1629	 * an event in the PBA anyway so we need to rely on the automask
   1630	 * to hold pending events for us until the interrupt is re-enabled
   1631	 *
   1632	 * The itr value is reported in microseconds, and the register
   1633	 * value is recorded in 2 microsecond units. For this reason we
   1634	 * only need to shift by the interval shift - 1 instead of the
   1635	 * full value.
   1636	 */
   1637	itr &= IAVF_ITR_MASK;
   1638
   1639	val = IAVF_VFINT_DYN_CTLN1_INTENA_MASK |
   1640	      (type << IAVF_VFINT_DYN_CTLN1_ITR_INDX_SHIFT) |
   1641	      (itr << (IAVF_VFINT_DYN_CTLN1_INTERVAL_SHIFT - 1));
   1642
   1643	return val;
   1644}
   1645
   1646/* a small macro to shorten up some long lines */
   1647#define INTREG IAVF_VFINT_DYN_CTLN1
   1648
   1649/* The act of updating the ITR will cause it to immediately trigger. In order
   1650 * to prevent this from throwing off adaptive update statistics we defer the
   1651 * update so that it can only happen so often. So after either Tx or Rx are
   1652 * updated we make the adaptive scheme wait until either the ITR completely
   1653 * expires via the next_update expiration or we have been through at least
   1654 * 3 interrupts.
   1655 */
   1656#define ITR_COUNTDOWN_START 3
   1657
   1658/**
   1659 * iavf_update_enable_itr - Update itr and re-enable MSIX interrupt
   1660 * @vsi: the VSI we care about
   1661 * @q_vector: q_vector for which itr is being updated and interrupt enabled
   1662 *
   1663 **/
   1664static inline void iavf_update_enable_itr(struct iavf_vsi *vsi,
   1665					  struct iavf_q_vector *q_vector)
   1666{
   1667	struct iavf_hw *hw = &vsi->back->hw;
   1668	u32 intval;
   1669
   1670	/* These will do nothing if dynamic updates are not enabled */
   1671	iavf_update_itr(q_vector, &q_vector->tx);
   1672	iavf_update_itr(q_vector, &q_vector->rx);
   1673
   1674	/* This block of logic allows us to get away with only updating
   1675	 * one ITR value with each interrupt. The idea is to perform a
   1676	 * pseudo-lazy update with the following criteria.
   1677	 *
   1678	 * 1. Rx is given higher priority than Tx if both are in same state
   1679	 * 2. If we must reduce an ITR that is given highest priority.
   1680	 * 3. We then give priority to increasing ITR based on amount.
   1681	 */
   1682	if (q_vector->rx.target_itr < q_vector->rx.current_itr) {
   1683		/* Rx ITR needs to be reduced, this is highest priority */
   1684		intval = iavf_buildreg_itr(IAVF_RX_ITR,
   1685					   q_vector->rx.target_itr);
   1686		q_vector->rx.current_itr = q_vector->rx.target_itr;
   1687		q_vector->itr_countdown = ITR_COUNTDOWN_START;
   1688	} else if ((q_vector->tx.target_itr < q_vector->tx.current_itr) ||
   1689		   ((q_vector->rx.target_itr - q_vector->rx.current_itr) <
   1690		    (q_vector->tx.target_itr - q_vector->tx.current_itr))) {
   1691		/* Tx ITR needs to be reduced, this is second priority
   1692		 * Tx ITR needs to be increased more than Rx, fourth priority
   1693		 */
   1694		intval = iavf_buildreg_itr(IAVF_TX_ITR,
   1695					   q_vector->tx.target_itr);
   1696		q_vector->tx.current_itr = q_vector->tx.target_itr;
   1697		q_vector->itr_countdown = ITR_COUNTDOWN_START;
   1698	} else if (q_vector->rx.current_itr != q_vector->rx.target_itr) {
   1699		/* Rx ITR needs to be increased, third priority */
   1700		intval = iavf_buildreg_itr(IAVF_RX_ITR,
   1701					   q_vector->rx.target_itr);
   1702		q_vector->rx.current_itr = q_vector->rx.target_itr;
   1703		q_vector->itr_countdown = ITR_COUNTDOWN_START;
   1704	} else {
   1705		/* No ITR update, lowest priority */
   1706		intval = iavf_buildreg_itr(IAVF_ITR_NONE, 0);
   1707		if (q_vector->itr_countdown)
   1708			q_vector->itr_countdown--;
   1709	}
   1710
   1711	if (!test_bit(__IAVF_VSI_DOWN, vsi->state))
   1712		wr32(hw, INTREG(q_vector->reg_idx), intval);
   1713}
   1714
   1715/**
   1716 * iavf_napi_poll - NAPI polling Rx/Tx cleanup routine
   1717 * @napi: napi struct with our devices info in it
   1718 * @budget: amount of work driver is allowed to do this pass, in packets
   1719 *
   1720 * This function will clean all queues associated with a q_vector.
   1721 *
   1722 * Returns the amount of work done
   1723 **/
   1724int iavf_napi_poll(struct napi_struct *napi, int budget)
   1725{
   1726	struct iavf_q_vector *q_vector =
   1727			       container_of(napi, struct iavf_q_vector, napi);
   1728	struct iavf_vsi *vsi = q_vector->vsi;
   1729	struct iavf_ring *ring;
   1730	bool clean_complete = true;
   1731	bool arm_wb = false;
   1732	int budget_per_ring;
   1733	int work_done = 0;
   1734
   1735	if (test_bit(__IAVF_VSI_DOWN, vsi->state)) {
   1736		napi_complete(napi);
   1737		return 0;
   1738	}
   1739
   1740	/* Since the actual Tx work is minimal, we can give the Tx a larger
   1741	 * budget and be more aggressive about cleaning up the Tx descriptors.
   1742	 */
   1743	iavf_for_each_ring(ring, q_vector->tx) {
   1744		if (!iavf_clean_tx_irq(vsi, ring, budget)) {
   1745			clean_complete = false;
   1746			continue;
   1747		}
   1748		arm_wb |= ring->arm_wb;
   1749		ring->arm_wb = false;
   1750	}
   1751
   1752	/* Handle case where we are called by netpoll with a budget of 0 */
   1753	if (budget <= 0)
   1754		goto tx_only;
   1755
   1756	/* We attempt to distribute budget to each Rx queue fairly, but don't
   1757	 * allow the budget to go below 1 because that would exit polling early.
   1758	 */
   1759	budget_per_ring = max(budget/q_vector->num_ringpairs, 1);
   1760
   1761	iavf_for_each_ring(ring, q_vector->rx) {
   1762		int cleaned = iavf_clean_rx_irq(ring, budget_per_ring);
   1763
   1764		work_done += cleaned;
   1765		/* if we clean as many as budgeted, we must not be done */
   1766		if (cleaned >= budget_per_ring)
   1767			clean_complete = false;
   1768	}
   1769
   1770	/* If work not completed, return budget and polling will return */
   1771	if (!clean_complete) {
   1772		int cpu_id = smp_processor_id();
   1773
   1774		/* It is possible that the interrupt affinity has changed but,
   1775		 * if the cpu is pegged at 100%, polling will never exit while
   1776		 * traffic continues and the interrupt will be stuck on this
   1777		 * cpu.  We check to make sure affinity is correct before we
   1778		 * continue to poll, otherwise we must stop polling so the
   1779		 * interrupt can move to the correct cpu.
   1780		 */
   1781		if (!cpumask_test_cpu(cpu_id, &q_vector->affinity_mask)) {
   1782			/* Tell napi that we are done polling */
   1783			napi_complete_done(napi, work_done);
   1784
   1785			/* Force an interrupt */
   1786			iavf_force_wb(vsi, q_vector);
   1787
   1788			/* Return budget-1 so that polling stops */
   1789			return budget - 1;
   1790		}
   1791tx_only:
   1792		if (arm_wb) {
   1793			q_vector->tx.ring[0].tx_stats.tx_force_wb++;
   1794			iavf_enable_wb_on_itr(vsi, q_vector);
   1795		}
   1796		return budget;
   1797	}
   1798
   1799	if (vsi->back->flags & IAVF_TXR_FLAGS_WB_ON_ITR)
   1800		q_vector->arm_wb_state = false;
   1801
   1802	/* Exit the polling mode, but don't re-enable interrupts if stack might
   1803	 * poll us due to busy-polling
   1804	 */
   1805	if (likely(napi_complete_done(napi, work_done)))
   1806		iavf_update_enable_itr(vsi, q_vector);
   1807
   1808	return min_t(int, work_done, budget - 1);
   1809}
   1810
   1811/**
   1812 * iavf_tx_prepare_vlan_flags - prepare generic TX VLAN tagging flags for HW
   1813 * @skb:     send buffer
   1814 * @tx_ring: ring to send buffer on
   1815 * @flags:   the tx flags to be set
   1816 *
   1817 * Checks the skb and set up correspondingly several generic transmit flags
   1818 * related to VLAN tagging for the HW, such as VLAN, DCB, etc.
   1819 *
   1820 * Returns error code indicate the frame should be dropped upon error and the
   1821 * otherwise  returns 0 to indicate the flags has been set properly.
   1822 **/
   1823static void iavf_tx_prepare_vlan_flags(struct sk_buff *skb,
   1824				       struct iavf_ring *tx_ring, u32 *flags)
   1825{
   1826	u32  tx_flags = 0;
   1827
   1828
   1829	/* stack will only request hardware VLAN insertion offload for protocols
   1830	 * that the driver supports and has enabled
   1831	 */
   1832	if (!skb_vlan_tag_present(skb))
   1833		return;
   1834
   1835	tx_flags |= skb_vlan_tag_get(skb) << IAVF_TX_FLAGS_VLAN_SHIFT;
   1836	if (tx_ring->flags & IAVF_TXR_FLAGS_VLAN_TAG_LOC_L2TAG2) {
   1837		tx_flags |= IAVF_TX_FLAGS_HW_OUTER_SINGLE_VLAN;
   1838	} else if (tx_ring->flags & IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1) {
   1839		tx_flags |= IAVF_TX_FLAGS_HW_VLAN;
   1840	} else {
   1841		dev_dbg(tx_ring->dev, "Unsupported Tx VLAN tag location requested\n");
   1842		return;
   1843	}
   1844
   1845	*flags = tx_flags;
   1846}
   1847
   1848/**
   1849 * iavf_tso - set up the tso context descriptor
   1850 * @first:    pointer to first Tx buffer for xmit
   1851 * @hdr_len:  ptr to the size of the packet header
   1852 * @cd_type_cmd_tso_mss: Quad Word 1
   1853 *
   1854 * Returns 0 if no TSO can happen, 1 if tso is going, or error
   1855 **/
   1856static int iavf_tso(struct iavf_tx_buffer *first, u8 *hdr_len,
   1857		    u64 *cd_type_cmd_tso_mss)
   1858{
   1859	struct sk_buff *skb = first->skb;
   1860	u64 cd_cmd, cd_tso_len, cd_mss;
   1861	union {
   1862		struct iphdr *v4;
   1863		struct ipv6hdr *v6;
   1864		unsigned char *hdr;
   1865	} ip;
   1866	union {
   1867		struct tcphdr *tcp;
   1868		struct udphdr *udp;
   1869		unsigned char *hdr;
   1870	} l4;
   1871	u32 paylen, l4_offset;
   1872	u16 gso_segs, gso_size;
   1873	int err;
   1874
   1875	if (skb->ip_summed != CHECKSUM_PARTIAL)
   1876		return 0;
   1877
   1878	if (!skb_is_gso(skb))
   1879		return 0;
   1880
   1881	err = skb_cow_head(skb, 0);
   1882	if (err < 0)
   1883		return err;
   1884
   1885	ip.hdr = skb_network_header(skb);
   1886	l4.hdr = skb_transport_header(skb);
   1887
   1888	/* initialize outer IP header fields */
   1889	if (ip.v4->version == 4) {
   1890		ip.v4->tot_len = 0;
   1891		ip.v4->check = 0;
   1892	} else {
   1893		ip.v6->payload_len = 0;
   1894	}
   1895
   1896	if (skb_shinfo(skb)->gso_type & (SKB_GSO_GRE |
   1897					 SKB_GSO_GRE_CSUM |
   1898					 SKB_GSO_IPXIP4 |
   1899					 SKB_GSO_IPXIP6 |
   1900					 SKB_GSO_UDP_TUNNEL |
   1901					 SKB_GSO_UDP_TUNNEL_CSUM)) {
   1902		if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL) &&
   1903		    (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM)) {
   1904			l4.udp->len = 0;
   1905
   1906			/* determine offset of outer transport header */
   1907			l4_offset = l4.hdr - skb->data;
   1908
   1909			/* remove payload length from outer checksum */
   1910			paylen = skb->len - l4_offset;
   1911			csum_replace_by_diff(&l4.udp->check,
   1912					     (__force __wsum)htonl(paylen));
   1913		}
   1914
   1915		/* reset pointers to inner headers */
   1916		ip.hdr = skb_inner_network_header(skb);
   1917		l4.hdr = skb_inner_transport_header(skb);
   1918
   1919		/* initialize inner IP header fields */
   1920		if (ip.v4->version == 4) {
   1921			ip.v4->tot_len = 0;
   1922			ip.v4->check = 0;
   1923		} else {
   1924			ip.v6->payload_len = 0;
   1925		}
   1926	}
   1927
   1928	/* determine offset of inner transport header */
   1929	l4_offset = l4.hdr - skb->data;
   1930	/* remove payload length from inner checksum */
   1931	paylen = skb->len - l4_offset;
   1932
   1933	if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) {
   1934		csum_replace_by_diff(&l4.udp->check,
   1935				     (__force __wsum)htonl(paylen));
   1936		/* compute length of UDP segmentation header */
   1937		*hdr_len = (u8)sizeof(l4.udp) + l4_offset;
   1938	} else {
   1939		csum_replace_by_diff(&l4.tcp->check,
   1940				     (__force __wsum)htonl(paylen));
   1941		/* compute length of TCP segmentation header */
   1942		*hdr_len = (u8)((l4.tcp->doff * 4) + l4_offset);
   1943	}
   1944
   1945	/* pull values out of skb_shinfo */
   1946	gso_size = skb_shinfo(skb)->gso_size;
   1947	gso_segs = skb_shinfo(skb)->gso_segs;
   1948
   1949	/* update GSO size and bytecount with header size */
   1950	first->gso_segs = gso_segs;
   1951	first->bytecount += (first->gso_segs - 1) * *hdr_len;
   1952
   1953	/* find the field values */
   1954	cd_cmd = IAVF_TX_CTX_DESC_TSO;
   1955	cd_tso_len = skb->len - *hdr_len;
   1956	cd_mss = gso_size;
   1957	*cd_type_cmd_tso_mss |= (cd_cmd << IAVF_TXD_CTX_QW1_CMD_SHIFT) |
   1958				(cd_tso_len << IAVF_TXD_CTX_QW1_TSO_LEN_SHIFT) |
   1959				(cd_mss << IAVF_TXD_CTX_QW1_MSS_SHIFT);
   1960	return 1;
   1961}
   1962
   1963/**
   1964 * iavf_tx_enable_csum - Enable Tx checksum offloads
   1965 * @skb: send buffer
   1966 * @tx_flags: pointer to Tx flags currently set
   1967 * @td_cmd: Tx descriptor command bits to set
   1968 * @td_offset: Tx descriptor header offsets to set
   1969 * @tx_ring: Tx descriptor ring
   1970 * @cd_tunneling: ptr to context desc bits
   1971 **/
   1972static int iavf_tx_enable_csum(struct sk_buff *skb, u32 *tx_flags,
   1973			       u32 *td_cmd, u32 *td_offset,
   1974			       struct iavf_ring *tx_ring,
   1975			       u32 *cd_tunneling)
   1976{
   1977	union {
   1978		struct iphdr *v4;
   1979		struct ipv6hdr *v6;
   1980		unsigned char *hdr;
   1981	} ip;
   1982	union {
   1983		struct tcphdr *tcp;
   1984		struct udphdr *udp;
   1985		unsigned char *hdr;
   1986	} l4;
   1987	unsigned char *exthdr;
   1988	u32 offset, cmd = 0;
   1989	__be16 frag_off;
   1990	u8 l4_proto = 0;
   1991
   1992	if (skb->ip_summed != CHECKSUM_PARTIAL)
   1993		return 0;
   1994
   1995	ip.hdr = skb_network_header(skb);
   1996	l4.hdr = skb_transport_header(skb);
   1997
   1998	/* compute outer L2 header size */
   1999	offset = ((ip.hdr - skb->data) / 2) << IAVF_TX_DESC_LENGTH_MACLEN_SHIFT;
   2000
   2001	if (skb->encapsulation) {
   2002		u32 tunnel = 0;
   2003		/* define outer network header type */
   2004		if (*tx_flags & IAVF_TX_FLAGS_IPV4) {
   2005			tunnel |= (*tx_flags & IAVF_TX_FLAGS_TSO) ?
   2006				  IAVF_TX_CTX_EXT_IP_IPV4 :
   2007				  IAVF_TX_CTX_EXT_IP_IPV4_NO_CSUM;
   2008
   2009			l4_proto = ip.v4->protocol;
   2010		} else if (*tx_flags & IAVF_TX_FLAGS_IPV6) {
   2011			tunnel |= IAVF_TX_CTX_EXT_IP_IPV6;
   2012
   2013			exthdr = ip.hdr + sizeof(*ip.v6);
   2014			l4_proto = ip.v6->nexthdr;
   2015			if (l4.hdr != exthdr)
   2016				ipv6_skip_exthdr(skb, exthdr - skb->data,
   2017						 &l4_proto, &frag_off);
   2018		}
   2019
   2020		/* define outer transport */
   2021		switch (l4_proto) {
   2022		case IPPROTO_UDP:
   2023			tunnel |= IAVF_TXD_CTX_UDP_TUNNELING;
   2024			*tx_flags |= IAVF_TX_FLAGS_VXLAN_TUNNEL;
   2025			break;
   2026		case IPPROTO_GRE:
   2027			tunnel |= IAVF_TXD_CTX_GRE_TUNNELING;
   2028			*tx_flags |= IAVF_TX_FLAGS_VXLAN_TUNNEL;
   2029			break;
   2030		case IPPROTO_IPIP:
   2031		case IPPROTO_IPV6:
   2032			*tx_flags |= IAVF_TX_FLAGS_VXLAN_TUNNEL;
   2033			l4.hdr = skb_inner_network_header(skb);
   2034			break;
   2035		default:
   2036			if (*tx_flags & IAVF_TX_FLAGS_TSO)
   2037				return -1;
   2038
   2039			skb_checksum_help(skb);
   2040			return 0;
   2041		}
   2042
   2043		/* compute outer L3 header size */
   2044		tunnel |= ((l4.hdr - ip.hdr) / 4) <<
   2045			  IAVF_TXD_CTX_QW0_EXT_IPLEN_SHIFT;
   2046
   2047		/* switch IP header pointer from outer to inner header */
   2048		ip.hdr = skb_inner_network_header(skb);
   2049
   2050		/* compute tunnel header size */
   2051		tunnel |= ((ip.hdr - l4.hdr) / 2) <<
   2052			  IAVF_TXD_CTX_QW0_NATLEN_SHIFT;
   2053
   2054		/* indicate if we need to offload outer UDP header */
   2055		if ((*tx_flags & IAVF_TX_FLAGS_TSO) &&
   2056		    !(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL) &&
   2057		    (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM))
   2058			tunnel |= IAVF_TXD_CTX_QW0_L4T_CS_MASK;
   2059
   2060		/* record tunnel offload values */
   2061		*cd_tunneling |= tunnel;
   2062
   2063		/* switch L4 header pointer from outer to inner */
   2064		l4.hdr = skb_inner_transport_header(skb);
   2065		l4_proto = 0;
   2066
   2067		/* reset type as we transition from outer to inner headers */
   2068		*tx_flags &= ~(IAVF_TX_FLAGS_IPV4 | IAVF_TX_FLAGS_IPV6);
   2069		if (ip.v4->version == 4)
   2070			*tx_flags |= IAVF_TX_FLAGS_IPV4;
   2071		if (ip.v6->version == 6)
   2072			*tx_flags |= IAVF_TX_FLAGS_IPV6;
   2073	}
   2074
   2075	/* Enable IP checksum offloads */
   2076	if (*tx_flags & IAVF_TX_FLAGS_IPV4) {
   2077		l4_proto = ip.v4->protocol;
   2078		/* the stack computes the IP header already, the only time we
   2079		 * need the hardware to recompute it is in the case of TSO.
   2080		 */
   2081		cmd |= (*tx_flags & IAVF_TX_FLAGS_TSO) ?
   2082		       IAVF_TX_DESC_CMD_IIPT_IPV4_CSUM :
   2083		       IAVF_TX_DESC_CMD_IIPT_IPV4;
   2084	} else if (*tx_flags & IAVF_TX_FLAGS_IPV6) {
   2085		cmd |= IAVF_TX_DESC_CMD_IIPT_IPV6;
   2086
   2087		exthdr = ip.hdr + sizeof(*ip.v6);
   2088		l4_proto = ip.v6->nexthdr;
   2089		if (l4.hdr != exthdr)
   2090			ipv6_skip_exthdr(skb, exthdr - skb->data,
   2091					 &l4_proto, &frag_off);
   2092	}
   2093
   2094	/* compute inner L3 header size */
   2095	offset |= ((l4.hdr - ip.hdr) / 4) << IAVF_TX_DESC_LENGTH_IPLEN_SHIFT;
   2096
   2097	/* Enable L4 checksum offloads */
   2098	switch (l4_proto) {
   2099	case IPPROTO_TCP:
   2100		/* enable checksum offloads */
   2101		cmd |= IAVF_TX_DESC_CMD_L4T_EOFT_TCP;
   2102		offset |= l4.tcp->doff << IAVF_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
   2103		break;
   2104	case IPPROTO_SCTP:
   2105		/* enable SCTP checksum offload */
   2106		cmd |= IAVF_TX_DESC_CMD_L4T_EOFT_SCTP;
   2107		offset |= (sizeof(struct sctphdr) >> 2) <<
   2108			  IAVF_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
   2109		break;
   2110	case IPPROTO_UDP:
   2111		/* enable UDP checksum offload */
   2112		cmd |= IAVF_TX_DESC_CMD_L4T_EOFT_UDP;
   2113		offset |= (sizeof(struct udphdr) >> 2) <<
   2114			  IAVF_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
   2115		break;
   2116	default:
   2117		if (*tx_flags & IAVF_TX_FLAGS_TSO)
   2118			return -1;
   2119		skb_checksum_help(skb);
   2120		return 0;
   2121	}
   2122
   2123	*td_cmd |= cmd;
   2124	*td_offset |= offset;
   2125
   2126	return 1;
   2127}
   2128
   2129/**
   2130 * iavf_create_tx_ctx - Build the Tx context descriptor
   2131 * @tx_ring:  ring to create the descriptor on
   2132 * @cd_type_cmd_tso_mss: Quad Word 1
   2133 * @cd_tunneling: Quad Word 0 - bits 0-31
   2134 * @cd_l2tag2: Quad Word 0 - bits 32-63
   2135 **/
   2136static void iavf_create_tx_ctx(struct iavf_ring *tx_ring,
   2137			       const u64 cd_type_cmd_tso_mss,
   2138			       const u32 cd_tunneling, const u32 cd_l2tag2)
   2139{
   2140	struct iavf_tx_context_desc *context_desc;
   2141	int i = tx_ring->next_to_use;
   2142
   2143	if ((cd_type_cmd_tso_mss == IAVF_TX_DESC_DTYPE_CONTEXT) &&
   2144	    !cd_tunneling && !cd_l2tag2)
   2145		return;
   2146
   2147	/* grab the next descriptor */
   2148	context_desc = IAVF_TX_CTXTDESC(tx_ring, i);
   2149
   2150	i++;
   2151	tx_ring->next_to_use = (i < tx_ring->count) ? i : 0;
   2152
   2153	/* cpu_to_le32 and assign to struct fields */
   2154	context_desc->tunneling_params = cpu_to_le32(cd_tunneling);
   2155	context_desc->l2tag2 = cpu_to_le16(cd_l2tag2);
   2156	context_desc->rsvd = cpu_to_le16(0);
   2157	context_desc->type_cmd_tso_mss = cpu_to_le64(cd_type_cmd_tso_mss);
   2158}
   2159
   2160/**
   2161 * __iavf_chk_linearize - Check if there are more than 8 buffers per packet
   2162 * @skb:      send buffer
   2163 *
   2164 * Note: Our HW can't DMA more than 8 buffers to build a packet on the wire
   2165 * and so we need to figure out the cases where we need to linearize the skb.
   2166 *
   2167 * For TSO we need to count the TSO header and segment payload separately.
   2168 * As such we need to check cases where we have 7 fragments or more as we
   2169 * can potentially require 9 DMA transactions, 1 for the TSO header, 1 for
   2170 * the segment payload in the first descriptor, and another 7 for the
   2171 * fragments.
   2172 **/
   2173bool __iavf_chk_linearize(struct sk_buff *skb)
   2174{
   2175	const skb_frag_t *frag, *stale;
   2176	int nr_frags, sum;
   2177
   2178	/* no need to check if number of frags is less than 7 */
   2179	nr_frags = skb_shinfo(skb)->nr_frags;
   2180	if (nr_frags < (IAVF_MAX_BUFFER_TXD - 1))
   2181		return false;
   2182
   2183	/* We need to walk through the list and validate that each group
   2184	 * of 6 fragments totals at least gso_size.
   2185	 */
   2186	nr_frags -= IAVF_MAX_BUFFER_TXD - 2;
   2187	frag = &skb_shinfo(skb)->frags[0];
   2188
   2189	/* Initialize size to the negative value of gso_size minus 1.  We
   2190	 * use this as the worst case scenerio in which the frag ahead
   2191	 * of us only provides one byte which is why we are limited to 6
   2192	 * descriptors for a single transmit as the header and previous
   2193	 * fragment are already consuming 2 descriptors.
   2194	 */
   2195	sum = 1 - skb_shinfo(skb)->gso_size;
   2196
   2197	/* Add size of frags 0 through 4 to create our initial sum */
   2198	sum += skb_frag_size(frag++);
   2199	sum += skb_frag_size(frag++);
   2200	sum += skb_frag_size(frag++);
   2201	sum += skb_frag_size(frag++);
   2202	sum += skb_frag_size(frag++);
   2203
   2204	/* Walk through fragments adding latest fragment, testing it, and
   2205	 * then removing stale fragments from the sum.
   2206	 */
   2207	for (stale = &skb_shinfo(skb)->frags[0];; stale++) {
   2208		int stale_size = skb_frag_size(stale);
   2209
   2210		sum += skb_frag_size(frag++);
   2211
   2212		/* The stale fragment may present us with a smaller
   2213		 * descriptor than the actual fragment size. To account
   2214		 * for that we need to remove all the data on the front and
   2215		 * figure out what the remainder would be in the last
   2216		 * descriptor associated with the fragment.
   2217		 */
   2218		if (stale_size > IAVF_MAX_DATA_PER_TXD) {
   2219			int align_pad = -(skb_frag_off(stale)) &
   2220					(IAVF_MAX_READ_REQ_SIZE - 1);
   2221
   2222			sum -= align_pad;
   2223			stale_size -= align_pad;
   2224
   2225			do {
   2226				sum -= IAVF_MAX_DATA_PER_TXD_ALIGNED;
   2227				stale_size -= IAVF_MAX_DATA_PER_TXD_ALIGNED;
   2228			} while (stale_size > IAVF_MAX_DATA_PER_TXD);
   2229		}
   2230
   2231		/* if sum is negative we failed to make sufficient progress */
   2232		if (sum < 0)
   2233			return true;
   2234
   2235		if (!nr_frags--)
   2236			break;
   2237
   2238		sum -= stale_size;
   2239	}
   2240
   2241	return false;
   2242}
   2243
   2244/**
   2245 * __iavf_maybe_stop_tx - 2nd level check for tx stop conditions
   2246 * @tx_ring: the ring to be checked
   2247 * @size:    the size buffer we want to assure is available
   2248 *
   2249 * Returns -EBUSY if a stop is needed, else 0
   2250 **/
   2251int __iavf_maybe_stop_tx(struct iavf_ring *tx_ring, int size)
   2252{
   2253	netif_stop_subqueue(tx_ring->netdev, tx_ring->queue_index);
   2254	/* Memory barrier before checking head and tail */
   2255	smp_mb();
   2256
   2257	/* Check again in a case another CPU has just made room available. */
   2258	if (likely(IAVF_DESC_UNUSED(tx_ring) < size))
   2259		return -EBUSY;
   2260
   2261	/* A reprieve! - use start_queue because it doesn't call schedule */
   2262	netif_start_subqueue(tx_ring->netdev, tx_ring->queue_index);
   2263	++tx_ring->tx_stats.restart_queue;
   2264	return 0;
   2265}
   2266
   2267/**
   2268 * iavf_tx_map - Build the Tx descriptor
   2269 * @tx_ring:  ring to send buffer on
   2270 * @skb:      send buffer
   2271 * @first:    first buffer info buffer to use
   2272 * @tx_flags: collected send information
   2273 * @hdr_len:  size of the packet header
   2274 * @td_cmd:   the command field in the descriptor
   2275 * @td_offset: offset for checksum or crc
   2276 **/
   2277static inline void iavf_tx_map(struct iavf_ring *tx_ring, struct sk_buff *skb,
   2278			       struct iavf_tx_buffer *first, u32 tx_flags,
   2279			       const u8 hdr_len, u32 td_cmd, u32 td_offset)
   2280{
   2281	unsigned int data_len = skb->data_len;
   2282	unsigned int size = skb_headlen(skb);
   2283	skb_frag_t *frag;
   2284	struct iavf_tx_buffer *tx_bi;
   2285	struct iavf_tx_desc *tx_desc;
   2286	u16 i = tx_ring->next_to_use;
   2287	u32 td_tag = 0;
   2288	dma_addr_t dma;
   2289
   2290	if (tx_flags & IAVF_TX_FLAGS_HW_VLAN) {
   2291		td_cmd |= IAVF_TX_DESC_CMD_IL2TAG1;
   2292		td_tag = (tx_flags & IAVF_TX_FLAGS_VLAN_MASK) >>
   2293			 IAVF_TX_FLAGS_VLAN_SHIFT;
   2294	}
   2295
   2296	first->tx_flags = tx_flags;
   2297
   2298	dma = dma_map_single(tx_ring->dev, skb->data, size, DMA_TO_DEVICE);
   2299
   2300	tx_desc = IAVF_TX_DESC(tx_ring, i);
   2301	tx_bi = first;
   2302
   2303	for (frag = &skb_shinfo(skb)->frags[0];; frag++) {
   2304		unsigned int max_data = IAVF_MAX_DATA_PER_TXD_ALIGNED;
   2305
   2306		if (dma_mapping_error(tx_ring->dev, dma))
   2307			goto dma_error;
   2308
   2309		/* record length, and DMA address */
   2310		dma_unmap_len_set(tx_bi, len, size);
   2311		dma_unmap_addr_set(tx_bi, dma, dma);
   2312
   2313		/* align size to end of page */
   2314		max_data += -dma & (IAVF_MAX_READ_REQ_SIZE - 1);
   2315		tx_desc->buffer_addr = cpu_to_le64(dma);
   2316
   2317		while (unlikely(size > IAVF_MAX_DATA_PER_TXD)) {
   2318			tx_desc->cmd_type_offset_bsz =
   2319				build_ctob(td_cmd, td_offset,
   2320					   max_data, td_tag);
   2321
   2322			tx_desc++;
   2323			i++;
   2324
   2325			if (i == tx_ring->count) {
   2326				tx_desc = IAVF_TX_DESC(tx_ring, 0);
   2327				i = 0;
   2328			}
   2329
   2330			dma += max_data;
   2331			size -= max_data;
   2332
   2333			max_data = IAVF_MAX_DATA_PER_TXD_ALIGNED;
   2334			tx_desc->buffer_addr = cpu_to_le64(dma);
   2335		}
   2336
   2337		if (likely(!data_len))
   2338			break;
   2339
   2340		tx_desc->cmd_type_offset_bsz = build_ctob(td_cmd, td_offset,
   2341							  size, td_tag);
   2342
   2343		tx_desc++;
   2344		i++;
   2345
   2346		if (i == tx_ring->count) {
   2347			tx_desc = IAVF_TX_DESC(tx_ring, 0);
   2348			i = 0;
   2349		}
   2350
   2351		size = skb_frag_size(frag);
   2352		data_len -= size;
   2353
   2354		dma = skb_frag_dma_map(tx_ring->dev, frag, 0, size,
   2355				       DMA_TO_DEVICE);
   2356
   2357		tx_bi = &tx_ring->tx_bi[i];
   2358	}
   2359
   2360	netdev_tx_sent_queue(txring_txq(tx_ring), first->bytecount);
   2361
   2362	i++;
   2363	if (i == tx_ring->count)
   2364		i = 0;
   2365
   2366	tx_ring->next_to_use = i;
   2367
   2368	iavf_maybe_stop_tx(tx_ring, DESC_NEEDED);
   2369
   2370	/* write last descriptor with RS and EOP bits */
   2371	td_cmd |= IAVF_TXD_CMD;
   2372	tx_desc->cmd_type_offset_bsz =
   2373			build_ctob(td_cmd, td_offset, size, td_tag);
   2374
   2375	skb_tx_timestamp(skb);
   2376
   2377	/* Force memory writes to complete before letting h/w know there
   2378	 * are new descriptors to fetch.
   2379	 *
   2380	 * We also use this memory barrier to make certain all of the
   2381	 * status bits have been updated before next_to_watch is written.
   2382	 */
   2383	wmb();
   2384
   2385	/* set next_to_watch value indicating a packet is present */
   2386	first->next_to_watch = tx_desc;
   2387
   2388	/* notify HW of packet */
   2389	if (netif_xmit_stopped(txring_txq(tx_ring)) || !netdev_xmit_more()) {
   2390		writel(i, tx_ring->tail);
   2391	}
   2392
   2393	return;
   2394
   2395dma_error:
   2396	dev_info(tx_ring->dev, "TX DMA map failed\n");
   2397
   2398	/* clear dma mappings for failed tx_bi map */
   2399	for (;;) {
   2400		tx_bi = &tx_ring->tx_bi[i];
   2401		iavf_unmap_and_free_tx_resource(tx_ring, tx_bi);
   2402		if (tx_bi == first)
   2403			break;
   2404		if (i == 0)
   2405			i = tx_ring->count;
   2406		i--;
   2407	}
   2408
   2409	tx_ring->next_to_use = i;
   2410}
   2411
   2412/**
   2413 * iavf_xmit_frame_ring - Sends buffer on Tx ring
   2414 * @skb:     send buffer
   2415 * @tx_ring: ring to send buffer on
   2416 *
   2417 * Returns NETDEV_TX_OK if sent, else an error code
   2418 **/
   2419static netdev_tx_t iavf_xmit_frame_ring(struct sk_buff *skb,
   2420					struct iavf_ring *tx_ring)
   2421{
   2422	u64 cd_type_cmd_tso_mss = IAVF_TX_DESC_DTYPE_CONTEXT;
   2423	u32 cd_tunneling = 0, cd_l2tag2 = 0;
   2424	struct iavf_tx_buffer *first;
   2425	u32 td_offset = 0;
   2426	u32 tx_flags = 0;
   2427	__be16 protocol;
   2428	u32 td_cmd = 0;
   2429	u8 hdr_len = 0;
   2430	int tso, count;
   2431
   2432	/* prefetch the data, we'll need it later */
   2433	prefetch(skb->data);
   2434
   2435	iavf_trace(xmit_frame_ring, skb, tx_ring);
   2436
   2437	count = iavf_xmit_descriptor_count(skb);
   2438	if (iavf_chk_linearize(skb, count)) {
   2439		if (__skb_linearize(skb)) {
   2440			dev_kfree_skb_any(skb);
   2441			return NETDEV_TX_OK;
   2442		}
   2443		count = iavf_txd_use_count(skb->len);
   2444		tx_ring->tx_stats.tx_linearize++;
   2445	}
   2446
   2447	/* need: 1 descriptor per page * PAGE_SIZE/IAVF_MAX_DATA_PER_TXD,
   2448	 *       + 1 desc for skb_head_len/IAVF_MAX_DATA_PER_TXD,
   2449	 *       + 4 desc gap to avoid the cache line where head is,
   2450	 *       + 1 desc for context descriptor,
   2451	 * otherwise try next time
   2452	 */
   2453	if (iavf_maybe_stop_tx(tx_ring, count + 4 + 1)) {
   2454		tx_ring->tx_stats.tx_busy++;
   2455		return NETDEV_TX_BUSY;
   2456	}
   2457
   2458	/* record the location of the first descriptor for this packet */
   2459	first = &tx_ring->tx_bi[tx_ring->next_to_use];
   2460	first->skb = skb;
   2461	first->bytecount = skb->len;
   2462	first->gso_segs = 1;
   2463
   2464	/* prepare the xmit flags */
   2465	iavf_tx_prepare_vlan_flags(skb, tx_ring, &tx_flags);
   2466	if (tx_flags & IAVF_TX_FLAGS_HW_OUTER_SINGLE_VLAN) {
   2467		cd_type_cmd_tso_mss |= IAVF_TX_CTX_DESC_IL2TAG2 <<
   2468			IAVF_TXD_CTX_QW1_CMD_SHIFT;
   2469		cd_l2tag2 = (tx_flags & IAVF_TX_FLAGS_VLAN_MASK) >>
   2470			IAVF_TX_FLAGS_VLAN_SHIFT;
   2471	}
   2472
   2473	/* obtain protocol of skb */
   2474	protocol = vlan_get_protocol(skb);
   2475
   2476	/* setup IPv4/IPv6 offloads */
   2477	if (protocol == htons(ETH_P_IP))
   2478		tx_flags |= IAVF_TX_FLAGS_IPV4;
   2479	else if (protocol == htons(ETH_P_IPV6))
   2480		tx_flags |= IAVF_TX_FLAGS_IPV6;
   2481
   2482	tso = iavf_tso(first, &hdr_len, &cd_type_cmd_tso_mss);
   2483
   2484	if (tso < 0)
   2485		goto out_drop;
   2486	else if (tso)
   2487		tx_flags |= IAVF_TX_FLAGS_TSO;
   2488
   2489	/* Always offload the checksum, since it's in the data descriptor */
   2490	tso = iavf_tx_enable_csum(skb, &tx_flags, &td_cmd, &td_offset,
   2491				  tx_ring, &cd_tunneling);
   2492	if (tso < 0)
   2493		goto out_drop;
   2494
   2495	/* always enable CRC insertion offload */
   2496	td_cmd |= IAVF_TX_DESC_CMD_ICRC;
   2497
   2498	iavf_create_tx_ctx(tx_ring, cd_type_cmd_tso_mss,
   2499			   cd_tunneling, cd_l2tag2);
   2500
   2501	iavf_tx_map(tx_ring, skb, first, tx_flags, hdr_len,
   2502		    td_cmd, td_offset);
   2503
   2504	return NETDEV_TX_OK;
   2505
   2506out_drop:
   2507	iavf_trace(xmit_frame_ring_drop, first->skb, tx_ring);
   2508	dev_kfree_skb_any(first->skb);
   2509	first->skb = NULL;
   2510	return NETDEV_TX_OK;
   2511}
   2512
   2513/**
   2514 * iavf_xmit_frame - Selects the correct VSI and Tx queue to send buffer
   2515 * @skb:    send buffer
   2516 * @netdev: network interface device structure
   2517 *
   2518 * Returns NETDEV_TX_OK if sent, else an error code
   2519 **/
   2520netdev_tx_t iavf_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
   2521{
   2522	struct iavf_adapter *adapter = netdev_priv(netdev);
   2523	struct iavf_ring *tx_ring = &adapter->tx_rings[skb->queue_mapping];
   2524
   2525	/* hardware can't handle really short frames, hardware padding works
   2526	 * beyond this point
   2527	 */
   2528	if (unlikely(skb->len < IAVF_MIN_TX_LEN)) {
   2529		if (skb_pad(skb, IAVF_MIN_TX_LEN - skb->len))
   2530			return NETDEV_TX_OK;
   2531		skb->len = IAVF_MIN_TX_LEN;
   2532		skb_set_tail_pointer(skb, IAVF_MIN_TX_LEN);
   2533	}
   2534
   2535	return iavf_xmit_frame_ring(skb, tx_ring);
   2536}