cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

gve_tx.c (21090B)


      1// SPDX-License-Identifier: (GPL-2.0 OR MIT)
      2/* Google virtual Ethernet (gve) driver
      3 *
      4 * Copyright (C) 2015-2021 Google, Inc.
      5 */
      6
      7#include "gve.h"
      8#include "gve_adminq.h"
      9#include "gve_utils.h"
     10#include <linux/ip.h>
     11#include <linux/tcp.h>
     12#include <linux/vmalloc.h>
     13#include <linux/skbuff.h>
     14
     15static inline void gve_tx_put_doorbell(struct gve_priv *priv,
     16				       struct gve_queue_resources *q_resources,
     17				       u32 val)
     18{
     19	iowrite32be(val, &priv->db_bar2[be32_to_cpu(q_resources->db_index)]);
     20}
     21
     22/* gvnic can only transmit from a Registered Segment.
     23 * We copy skb payloads into the registered segment before writing Tx
     24 * descriptors and ringing the Tx doorbell.
     25 *
     26 * gve_tx_fifo_* manages the Registered Segment as a FIFO - clients must
     27 * free allocations in the order they were allocated.
     28 */
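/* A rough sketch of how these helpers are used further down in this file
 * (copy-mode queues only; raw-addressing queues bypass the FIFO entirely):
 *
 *	gve_tx_fifo_init()      - gve_tx_alloc_ring(): vmap the queue page list
 *	gve_tx_fifo_can_alloc() - gve_can_tx(): admission check before xmit
 *	gve_tx_alloc_fifo()     - gve_tx_add_skb_copy(): carve header/payload space
 *	gve_tx_free_fifo()      - gve_clean_tx_done(): return space in completion
 *	                          (i.e. allocation) order
 *	gve_tx_fifo_release()   - gve_tx_free_ring(): vunmap the queue page list
 */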
     29
     30static int gve_tx_fifo_init(struct gve_priv *priv, struct gve_tx_fifo *fifo)
     31{
     32	fifo->base = vmap(fifo->qpl->pages, fifo->qpl->num_entries, VM_MAP,
     33			  PAGE_KERNEL);
     34	if (unlikely(!fifo->base)) {
     35		netif_err(priv, drv, priv->dev, "Failed to vmap fifo, qpl_id = %d\n",
     36			  fifo->qpl->id);
     37		return -ENOMEM;
     38	}
     39
     40	fifo->size = fifo->qpl->num_entries * PAGE_SIZE;
     41	atomic_set(&fifo->available, fifo->size);
     42	fifo->head = 0;
     43	return 0;
     44}
     45
     46static void gve_tx_fifo_release(struct gve_priv *priv, struct gve_tx_fifo *fifo)
     47{
     48	WARN(atomic_read(&fifo->available) != fifo->size,
     49	     "Releasing non-empty fifo");
     50
     51	vunmap(fifo->base);
     52}
     53
     54static int gve_tx_fifo_pad_alloc_one_frag(struct gve_tx_fifo *fifo,
     55					  size_t bytes)
     56{
     57	return (fifo->head + bytes < fifo->size) ? 0 : fifo->size - fifo->head;
     58}
     59
     60static bool gve_tx_fifo_can_alloc(struct gve_tx_fifo *fifo, size_t bytes)
     61{
     62	return (atomic_read(&fifo->available) <= bytes) ? false : true;
     63}
     64
     65/* gve_tx_alloc_fifo - Allocate fragment(s) from Tx FIFO
     66 * @fifo: FIFO to allocate from
     67 * @bytes: Allocation size
     68 * @iov: Scatter-gather elements to fill with allocation fragment base/len
     69 *
     70 * Returns number of valid elements in iov[] or negative on error.
     71 *
     72 * Allocations from a given FIFO must be externally synchronized but concurrent
     73 * allocation and frees are allowed.
     74 */
     75static int gve_tx_alloc_fifo(struct gve_tx_fifo *fifo, size_t bytes,
     76			     struct gve_tx_iovec iov[2])
     77{
     78	size_t overflow, padding;
     79	u32 aligned_head;
     80	int nfrags = 0;
     81
     82	if (!bytes)
     83		return 0;
     84
     85	/* This check happens before we know how much padding is needed to
     86	 * align to a cacheline boundary for the payload, but that is fine,
      87	 * because the FIFO head always starts aligned, and the FIFO's boundaries
     88	 * are aligned, so if there is space for the data, there is space for
     89	 * the padding to the next alignment.
     90	 */
     91	WARN(!gve_tx_fifo_can_alloc(fifo, bytes),
     92	     "Reached %s when there's not enough space in the fifo", __func__);
     93
     94	nfrags++;
     95
     96	iov[0].iov_offset = fifo->head;
     97	iov[0].iov_len = bytes;
     98	fifo->head += bytes;
     99
    100	if (fifo->head > fifo->size) {
    101		/* If the allocation did not fit in the tail fragment of the
    102		 * FIFO, also use the head fragment.
    103		 */
    104		nfrags++;
    105		overflow = fifo->head - fifo->size;
    106		iov[0].iov_len -= overflow;
     107		iov[1].iov_offset = 0;	/* Start of fifo */
    108		iov[1].iov_len = overflow;
    109
    110		fifo->head = overflow;
    111	}
    112
    113	/* Re-align to a cacheline boundary */
    114	aligned_head = L1_CACHE_ALIGN(fifo->head);
    115	padding = aligned_head - fifo->head;
    116	iov[nfrags - 1].iov_padding = padding;
    117	atomic_sub(bytes + padding, &fifo->available);
    118	fifo->head = aligned_head;
    119
    120	if (fifo->head == fifo->size)
    121		fifo->head = 0;
    122
    123	return nfrags;
    124}
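
/* A worked example of the wrap-around path above, with illustrative numbers
 * (assuming an 8192-byte FIFO and 64-byte cachelines): with fifo->head at
 * 8000, an allocation of 300 bytes overruns the tail by 108 bytes and comes
 * back as two fragments,
 *
 *	iov[0] = { .iov_offset = 8000, .iov_len = 192 }
 *	iov[1] = { .iov_offset = 0,    .iov_len = 108, .iov_padding = 20 }
 *
 * so nfrags is 2, fifo->head ends up at L1_CACHE_ALIGN(108) = 128, and
 * fifo->available shrinks by 300 + 20 = 320 bytes.
 */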
    125
    126/* gve_tx_free_fifo - Return space to Tx FIFO
    127 * @fifo: FIFO to return fragments to
    128 * @bytes: Bytes to free
    129 */
    130static void gve_tx_free_fifo(struct gve_tx_fifo *fifo, size_t bytes)
    131{
    132	atomic_add(bytes, &fifo->available);
    133}
    134
    135static int gve_clean_tx_done(struct gve_priv *priv, struct gve_tx_ring *tx,
    136			     u32 to_do, bool try_to_wake);
    137
    138static void gve_tx_free_ring(struct gve_priv *priv, int idx)
    139{
    140	struct gve_tx_ring *tx = &priv->tx[idx];
    141	struct device *hdev = &priv->pdev->dev;
    142	size_t bytes;
    143	u32 slots;
    144
    145	gve_tx_remove_from_block(priv, idx);
    146	slots = tx->mask + 1;
    147	gve_clean_tx_done(priv, tx, priv->tx_desc_cnt, false);
    148	netdev_tx_reset_queue(tx->netdev_txq);
    149
    150	dma_free_coherent(hdev, sizeof(*tx->q_resources),
    151			  tx->q_resources, tx->q_resources_bus);
    152	tx->q_resources = NULL;
    153
    154	if (!tx->raw_addressing) {
    155		gve_tx_fifo_release(priv, &tx->tx_fifo);
    156		gve_unassign_qpl(priv, tx->tx_fifo.qpl->id);
    157		tx->tx_fifo.qpl = NULL;
    158	}
    159
    160	bytes = sizeof(*tx->desc) * slots;
    161	dma_free_coherent(hdev, bytes, tx->desc, tx->bus);
    162	tx->desc = NULL;
    163
    164	vfree(tx->info);
    165	tx->info = NULL;
    166
    167	netif_dbg(priv, drv, priv->dev, "freed tx queue %d\n", idx);
    168}
    169
    170static int gve_tx_alloc_ring(struct gve_priv *priv, int idx)
    171{
    172	struct gve_tx_ring *tx = &priv->tx[idx];
    173	struct device *hdev = &priv->pdev->dev;
    174	u32 slots = priv->tx_desc_cnt;
    175	size_t bytes;
    176
    177	/* Make sure everything is zeroed to start */
    178	memset(tx, 0, sizeof(*tx));
    179	spin_lock_init(&tx->clean_lock);
    180	tx->q_num = idx;
    181
    182	tx->mask = slots - 1;
    183
    184	/* alloc metadata */
    185	tx->info = vzalloc(sizeof(*tx->info) * slots);
    186	if (!tx->info)
    187		return -ENOMEM;
    188
    189	/* alloc tx queue */
    190	bytes = sizeof(*tx->desc) * slots;
    191	tx->desc = dma_alloc_coherent(hdev, bytes, &tx->bus, GFP_KERNEL);
    192	if (!tx->desc)
    193		goto abort_with_info;
    194
    195	tx->raw_addressing = priv->queue_format == GVE_GQI_RDA_FORMAT;
    196	tx->dev = &priv->pdev->dev;
    197	if (!tx->raw_addressing) {
    198		tx->tx_fifo.qpl = gve_assign_tx_qpl(priv);
    199		if (!tx->tx_fifo.qpl)
    200			goto abort_with_desc;
    201		/* map Tx FIFO */
    202		if (gve_tx_fifo_init(priv, &tx->tx_fifo))
    203			goto abort_with_qpl;
    204	}
    205
    206	tx->q_resources =
    207		dma_alloc_coherent(hdev,
    208				   sizeof(*tx->q_resources),
    209				   &tx->q_resources_bus,
    210				   GFP_KERNEL);
    211	if (!tx->q_resources)
    212		goto abort_with_fifo;
    213
    214	netif_dbg(priv, drv, priv->dev, "tx[%d]->bus=%lx\n", idx,
    215		  (unsigned long)tx->bus);
    216	tx->netdev_txq = netdev_get_tx_queue(priv->dev, idx);
    217	gve_tx_add_to_block(priv, idx);
    218
    219	return 0;
    220
    221abort_with_fifo:
    222	if (!tx->raw_addressing)
    223		gve_tx_fifo_release(priv, &tx->tx_fifo);
    224abort_with_qpl:
    225	if (!tx->raw_addressing)
    226		gve_unassign_qpl(priv, tx->tx_fifo.qpl->id);
    227abort_with_desc:
    228	dma_free_coherent(hdev, bytes, tx->desc, tx->bus);
    229	tx->desc = NULL;
    230abort_with_info:
    231	vfree(tx->info);
    232	tx->info = NULL;
    233	return -ENOMEM;
    234}
    235
    236int gve_tx_alloc_rings(struct gve_priv *priv)
    237{
    238	int err = 0;
    239	int i;
    240
    241	for (i = 0; i < priv->tx_cfg.num_queues; i++) {
    242		err = gve_tx_alloc_ring(priv, i);
    243		if (err) {
    244			netif_err(priv, drv, priv->dev,
    245				  "Failed to alloc tx ring=%d: err=%d\n",
    246				  i, err);
    247			break;
    248		}
    249	}
     250	/* Free any rings allocated before the error */
    251	if (err) {
    252		int j;
    253
    254		for (j = 0; j < i; j++)
    255			gve_tx_free_ring(priv, j);
    256	}
    257	return err;
    258}
    259
    260void gve_tx_free_rings_gqi(struct gve_priv *priv)
    261{
    262	int i;
    263
    264	for (i = 0; i < priv->tx_cfg.num_queues; i++)
    265		gve_tx_free_ring(priv, i);
    266}
    267
    268/* gve_tx_avail - Calculates the number of slots available in the ring
    269 * @tx: tx ring to check
    270 *
    271 * Returns the number of slots available
    272 *
    273 * The capacity of the queue is mask + 1. We don't need to reserve an entry.
    274 **/
    275static inline u32 gve_tx_avail(struct gve_tx_ring *tx)
    276{
    277	return tx->mask + 1 - (tx->req - tx->done);
    278}
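
/* For example, with 256 slots (mask = 255), req = 300 and done = 260 leave
 * 256 - 40 = 216 slots available; since req, done and the result are u32,
 * the subtraction stays correct even after the counters wrap around.
 */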
    279
    280static inline int gve_skb_fifo_bytes_required(struct gve_tx_ring *tx,
    281					      struct sk_buff *skb)
    282{
    283	int pad_bytes, align_hdr_pad;
    284	int bytes;
    285	int hlen;
    286
    287	hlen = skb_is_gso(skb) ? skb_checksum_start_offset(skb) +
    288				 tcp_hdrlen(skb) : skb_headlen(skb);
    289
    290	pad_bytes = gve_tx_fifo_pad_alloc_one_frag(&tx->tx_fifo,
    291						   hlen);
    292	/* We need to take into account the header alignment padding. */
    293	align_hdr_pad = L1_CACHE_ALIGN(hlen) - hlen;
    294	bytes = align_hdr_pad + pad_bytes + skb->len;
    295
    296	return bytes;
    297}
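
/* Illustrative numbers (assuming 64-byte cachelines): a non-GSO skb with
 * skb_headlen() = 130 and skb->len = 1400, hitting a FIFO head far enough
 * from the end that no end-of-FIFO padding is needed, yields
 * (192 - 130) + 0 + 1400 = 1462 bytes. gve_maybe_stop_tx() checks this
 * estimate against the FIFO before gve_tx_add_skb_copy() does the actual
 * allocations.
 */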
    298
     299/* The most descriptors we could need is MAX_SKB_FRAGS + 4:
    300 * 1 for each skb frag
    301 * 1 for the skb linear portion
    302 * 1 for when tcp hdr needs to be in separate descriptor
    303 * 1 if the payload wraps to the beginning of the FIFO
    304 * 1 for metadata descriptor
    305 */
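/* Both send paths stay within this bound: gve_tx_add_skb_copy() issues at
 * most 1 + 1 + 2 descriptors (packet, optional metadata, up to two payload
 * fragments when the FIFO wraps), and gve_tx_add_skb_no_copy() issues at
 * most 1 + 1 + 1 + MAX_SKB_FRAGS (packet, optional metadata, optional extra
 * descriptor for the rest of the linear portion, one per frag).
 */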
    306#define MAX_TX_DESC_NEEDED	(MAX_SKB_FRAGS + 4)
    307static void gve_tx_unmap_buf(struct device *dev, struct gve_tx_buffer_state *info)
    308{
    309	if (info->skb) {
    310		dma_unmap_single(dev, dma_unmap_addr(info, dma),
    311				 dma_unmap_len(info, len),
    312				 DMA_TO_DEVICE);
    313		dma_unmap_len_set(info, len, 0);
    314	} else {
    315		dma_unmap_page(dev, dma_unmap_addr(info, dma),
    316			       dma_unmap_len(info, len),
    317			       DMA_TO_DEVICE);
    318		dma_unmap_len_set(info, len, 0);
    319	}
    320}
    321
    322/* Check if sufficient resources (descriptor ring space, FIFO space) are
    323 * available to transmit the given number of bytes.
    324 */
    325static inline bool gve_can_tx(struct gve_tx_ring *tx, int bytes_required)
    326{
    327	bool can_alloc = true;
    328
    329	if (!tx->raw_addressing)
    330		can_alloc = gve_tx_fifo_can_alloc(&tx->tx_fifo, bytes_required);
    331
    332	return (gve_tx_avail(tx) >= MAX_TX_DESC_NEEDED && can_alloc);
    333}
    334
    335static_assert(NAPI_POLL_WEIGHT >= MAX_TX_DESC_NEEDED);
    336
    337/* Stops the queue if the skb cannot be transmitted. */
    338static int gve_maybe_stop_tx(struct gve_priv *priv, struct gve_tx_ring *tx,
    339			     struct sk_buff *skb)
    340{
    341	int bytes_required = 0;
    342	u32 nic_done;
    343	u32 to_do;
    344	int ret;
    345
    346	if (!tx->raw_addressing)
    347		bytes_required = gve_skb_fifo_bytes_required(tx, skb);
    348
    349	if (likely(gve_can_tx(tx, bytes_required)))
    350		return 0;
    351
    352	ret = -EBUSY;
    353	spin_lock(&tx->clean_lock);
    354	nic_done = gve_tx_load_event_counter(priv, tx);
    355	to_do = nic_done - tx->done;
    356
    357	/* Only try to clean if there is hope for TX */
    358	if (to_do + gve_tx_avail(tx) >= MAX_TX_DESC_NEEDED) {
    359		if (to_do > 0) {
    360			to_do = min_t(u32, to_do, NAPI_POLL_WEIGHT);
    361			gve_clean_tx_done(priv, tx, to_do, false);
    362		}
    363		if (likely(gve_can_tx(tx, bytes_required)))
    364			ret = 0;
    365	}
    366	if (ret) {
    367		/* No space, so stop the queue */
    368		tx->stop_queue++;
    369		netif_tx_stop_queue(tx->netdev_txq);
    370	}
    371	spin_unlock(&tx->clean_lock);
    372
    373	return ret;
    374}
    375
    376static void gve_tx_fill_pkt_desc(union gve_tx_desc *pkt_desc,
    377				 struct sk_buff *skb, bool is_gso,
    378				 int l4_hdr_offset, u32 desc_cnt,
    379				 u16 hlen, u64 addr)
    380{
    381	/* l4_hdr_offset and csum_offset are in units of 16-bit words */
    382	if (is_gso) {
    383		pkt_desc->pkt.type_flags = GVE_TXD_TSO | GVE_TXF_L4CSUM;
    384		pkt_desc->pkt.l4_csum_offset = skb->csum_offset >> 1;
    385		pkt_desc->pkt.l4_hdr_offset = l4_hdr_offset >> 1;
    386	} else if (likely(skb->ip_summed == CHECKSUM_PARTIAL)) {
    387		pkt_desc->pkt.type_flags = GVE_TXD_STD | GVE_TXF_L4CSUM;
    388		pkt_desc->pkt.l4_csum_offset = skb->csum_offset >> 1;
    389		pkt_desc->pkt.l4_hdr_offset = l4_hdr_offset >> 1;
    390	} else {
    391		pkt_desc->pkt.type_flags = GVE_TXD_STD;
    392		pkt_desc->pkt.l4_csum_offset = 0;
    393		pkt_desc->pkt.l4_hdr_offset = 0;
    394	}
    395	pkt_desc->pkt.desc_cnt = desc_cnt;
    396	pkt_desc->pkt.len = cpu_to_be16(skb->len);
    397	pkt_desc->pkt.seg_len = cpu_to_be16(hlen);
    398	pkt_desc->pkt.seg_addr = cpu_to_be64(addr);
    399}
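
/* An illustration of the 16-bit-word encoding above, assuming an untagged
 * TCP/IPv4 packet with a 20-byte IP header: skb_checksum_start_offset() is
 * 14 + 20 = 34, so l4_hdr_offset is written as 34 >> 1 = 17, and
 * skb->csum_offset (16 for TCP) is written as 16 >> 1 = 8.
 */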
    400
    401static void gve_tx_fill_mtd_desc(union gve_tx_desc *mtd_desc,
    402				 struct sk_buff *skb)
    403{
    404	BUILD_BUG_ON(sizeof(mtd_desc->mtd) != sizeof(mtd_desc->pkt));
    405
    406	mtd_desc->mtd.type_flags = GVE_TXD_MTD | GVE_MTD_SUBTYPE_PATH;
    407	mtd_desc->mtd.path_state = GVE_MTD_PATH_STATE_DEFAULT |
    408				   GVE_MTD_PATH_HASH_L4;
    409	mtd_desc->mtd.path_hash = cpu_to_be32(skb->hash);
    410	mtd_desc->mtd.reserved0 = 0;
    411	mtd_desc->mtd.reserved1 = 0;
    412}
    413
    414static void gve_tx_fill_seg_desc(union gve_tx_desc *seg_desc,
    415				 struct sk_buff *skb, bool is_gso,
    416				 u16 len, u64 addr)
    417{
    418	seg_desc->seg.type_flags = GVE_TXD_SEG;
    419	if (is_gso) {
    420		if (skb_is_gso_v6(skb))
    421			seg_desc->seg.type_flags |= GVE_TXSF_IPV6;
    422		seg_desc->seg.l3_offset = skb_network_offset(skb) >> 1;
    423		seg_desc->seg.mss = cpu_to_be16(skb_shinfo(skb)->gso_size);
    424	}
    425	seg_desc->seg.seg_len = cpu_to_be16(len);
    426	seg_desc->seg.seg_addr = cpu_to_be64(addr);
    427}
    428
    429static void gve_dma_sync_for_device(struct device *dev, dma_addr_t *page_buses,
    430				    u64 iov_offset, u64 iov_len)
    431{
    432	u64 last_page = (iov_offset + iov_len - 1) / PAGE_SIZE;
    433	u64 first_page = iov_offset / PAGE_SIZE;
    434	u64 page;
    435
    436	for (page = first_page; page <= last_page; page++)
    437		dma_sync_single_for_device(dev, page_buses[page], PAGE_SIZE, DMA_TO_DEVICE);
    438}
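
/* E.g. with 4096-byte pages, an iov at offset 4000 with length 200 spans
 * page_buses[0] (first_page = 4000 / 4096 = 0) and page_buses[1]
 * (last_page = 4199 / 4096 = 1), so both backing pages are synced.
 */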
    439
    440static int gve_tx_add_skb_copy(struct gve_priv *priv, struct gve_tx_ring *tx, struct sk_buff *skb)
    441{
    442	int pad_bytes, hlen, hdr_nfrags, payload_nfrags, l4_hdr_offset;
    443	union gve_tx_desc *pkt_desc, *seg_desc;
    444	struct gve_tx_buffer_state *info;
    445	int mtd_desc_nr = !!skb->l4_hash;
    446	bool is_gso = skb_is_gso(skb);
    447	u32 idx = tx->req & tx->mask;
    448	int payload_iov = 2;
    449	int copy_offset;
    450	u32 next_idx;
    451	int i;
    452
    453	info = &tx->info[idx];
    454	pkt_desc = &tx->desc[idx];
    455
    456	l4_hdr_offset = skb_checksum_start_offset(skb);
     457	/* If the skb is gso, then we want the tcp header in the first segment;
    458	 * otherwise we want the linear portion of the skb (which will contain
    459	 * the checksum because skb->csum_start and skb->csum_offset are given
    460	 * relative to skb->head) in the first segment.
    461	 */
    462	hlen = is_gso ? l4_hdr_offset + tcp_hdrlen(skb) :
    463			skb_headlen(skb);
    464
     465	info->skb = skb;
    466	/* We don't want to split the header, so if necessary, pad to the end
    467	 * of the fifo and then put the header at the beginning of the fifo.
    468	 */
    469	pad_bytes = gve_tx_fifo_pad_alloc_one_frag(&tx->tx_fifo, hlen);
    470	hdr_nfrags = gve_tx_alloc_fifo(&tx->tx_fifo, hlen + pad_bytes,
    471				       &info->iov[0]);
    472	WARN(!hdr_nfrags, "hdr_nfrags should never be 0!");
    473	payload_nfrags = gve_tx_alloc_fifo(&tx->tx_fifo, skb->len - hlen,
    474					   &info->iov[payload_iov]);
    475
    476	gve_tx_fill_pkt_desc(pkt_desc, skb, is_gso, l4_hdr_offset,
    477			     1 + mtd_desc_nr + payload_nfrags, hlen,
    478			     info->iov[hdr_nfrags - 1].iov_offset);
    479
    480	skb_copy_bits(skb, 0,
    481		      tx->tx_fifo.base + info->iov[hdr_nfrags - 1].iov_offset,
    482		      hlen);
    483	gve_dma_sync_for_device(&priv->pdev->dev, tx->tx_fifo.qpl->page_buses,
    484				info->iov[hdr_nfrags - 1].iov_offset,
    485				info->iov[hdr_nfrags - 1].iov_len);
    486	copy_offset = hlen;
    487
    488	if (mtd_desc_nr) {
    489		next_idx = (tx->req + 1) & tx->mask;
    490		gve_tx_fill_mtd_desc(&tx->desc[next_idx], skb);
    491	}
    492
    493	for (i = payload_iov; i < payload_nfrags + payload_iov; i++) {
    494		next_idx = (tx->req + 1 + mtd_desc_nr + i - payload_iov) & tx->mask;
    495		seg_desc = &tx->desc[next_idx];
    496
    497		gve_tx_fill_seg_desc(seg_desc, skb, is_gso,
    498				     info->iov[i].iov_len,
    499				     info->iov[i].iov_offset);
    500
    501		skb_copy_bits(skb, copy_offset,
    502			      tx->tx_fifo.base + info->iov[i].iov_offset,
    503			      info->iov[i].iov_len);
    504		gve_dma_sync_for_device(&priv->pdev->dev, tx->tx_fifo.qpl->page_buses,
    505					info->iov[i].iov_offset,
    506					info->iov[i].iov_len);
    507		copy_offset += info->iov[i].iov_len;
    508	}
    509
    510	return 1 + mtd_desc_nr + payload_nfrags;
    511}
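
/* Descriptor layout produced above for a TSO skb with skb->l4_hash set and
 * a payload allocation that wraps around the FIFO (the worst case for this
 * path): tx->desc[req] is the packet descriptor, [req + 1] the metadata
 * descriptor, [req + 2] and [req + 3] one segment descriptor per payload
 * fragment, and the function returns 4.
 */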
    512
    513static int gve_tx_add_skb_no_copy(struct gve_priv *priv, struct gve_tx_ring *tx,
    514				  struct sk_buff *skb)
    515{
    516	const struct skb_shared_info *shinfo = skb_shinfo(skb);
    517	int hlen, num_descriptors, l4_hdr_offset;
    518	union gve_tx_desc *pkt_desc, *mtd_desc, *seg_desc;
    519	struct gve_tx_buffer_state *info;
    520	int mtd_desc_nr = !!skb->l4_hash;
    521	bool is_gso = skb_is_gso(skb);
    522	u32 idx = tx->req & tx->mask;
    523	u64 addr;
    524	u32 len;
    525	int i;
    526
    527	info = &tx->info[idx];
    528	pkt_desc = &tx->desc[idx];
    529
    530	l4_hdr_offset = skb_checksum_start_offset(skb);
    531	/* If the skb is gso, then we want only up to the tcp header in the first segment
     532	 * to efficiently replicate it on each segment; otherwise we want the linear portion
    533	 * of the skb (which will contain the checksum because skb->csum_start and
    534	 * skb->csum_offset are given relative to skb->head) in the first segment.
    535	 */
    536	hlen = is_gso ? l4_hdr_offset + tcp_hdrlen(skb) : skb_headlen(skb);
    537	len = skb_headlen(skb);
    538
     539	info->skb = skb;
    540
    541	addr = dma_map_single(tx->dev, skb->data, len, DMA_TO_DEVICE);
    542	if (unlikely(dma_mapping_error(tx->dev, addr))) {
    543		tx->dma_mapping_error++;
    544		goto drop;
    545	}
    546	dma_unmap_len_set(info, len, len);
    547	dma_unmap_addr_set(info, dma, addr);
    548
    549	num_descriptors = 1 + shinfo->nr_frags;
    550	if (hlen < len)
    551		num_descriptors++;
    552	if (mtd_desc_nr)
    553		num_descriptors++;
    554
    555	gve_tx_fill_pkt_desc(pkt_desc, skb, is_gso, l4_hdr_offset,
    556			     num_descriptors, hlen, addr);
    557
    558	if (mtd_desc_nr) {
    559		idx = (idx + 1) & tx->mask;
    560		mtd_desc = &tx->desc[idx];
    561		gve_tx_fill_mtd_desc(mtd_desc, skb);
    562	}
    563
    564	if (hlen < len) {
    565		/* For gso the rest of the linear portion of the skb needs to
    566		 * be in its own descriptor.
    567		 */
    568		len -= hlen;
    569		addr += hlen;
    570		idx = (idx + 1) & tx->mask;
    571		seg_desc = &tx->desc[idx];
    572		gve_tx_fill_seg_desc(seg_desc, skb, is_gso, len, addr);
    573	}
    574
    575	for (i = 0; i < shinfo->nr_frags; i++) {
    576		const skb_frag_t *frag = &shinfo->frags[i];
    577
    578		idx = (idx + 1) & tx->mask;
    579		seg_desc = &tx->desc[idx];
    580		len = skb_frag_size(frag);
    581		addr = skb_frag_dma_map(tx->dev, frag, 0, len, DMA_TO_DEVICE);
    582		if (unlikely(dma_mapping_error(tx->dev, addr))) {
    583			tx->dma_mapping_error++;
    584			goto unmap_drop;
    585		}
    586		tx->info[idx].skb = NULL;
    587		dma_unmap_len_set(&tx->info[idx], len, len);
    588		dma_unmap_addr_set(&tx->info[idx], dma, addr);
    589
    590		gve_tx_fill_seg_desc(seg_desc, skb, is_gso, len, addr);
    591	}
    592
    593	return num_descriptors;
    594
    595unmap_drop:
    596	i += num_descriptors - shinfo->nr_frags;
    597	while (i--) {
    598		/* Skip metadata descriptor, if set */
    599		if (i == 1 && mtd_desc_nr == 1)
    600			continue;
    601		idx--;
    602		gve_tx_unmap_buf(tx->dev, &tx->info[idx & tx->mask]);
    603	}
    604drop:
    605	tx->dropped_pkt++;
    606	return 0;
    607}
    608
    609netdev_tx_t gve_tx(struct sk_buff *skb, struct net_device *dev)
    610{
    611	struct gve_priv *priv = netdev_priv(dev);
    612	struct gve_tx_ring *tx;
    613	int nsegs;
    614
    615	WARN(skb_get_queue_mapping(skb) >= priv->tx_cfg.num_queues,
    616	     "skb queue index out of range");
    617	tx = &priv->tx[skb_get_queue_mapping(skb)];
    618	if (unlikely(gve_maybe_stop_tx(priv, tx, skb))) {
    619		/* We need to ring the txq doorbell -- we have stopped the Tx
    620		 * queue for want of resources, but prior calls to gve_tx()
    621		 * may have added descriptors without ringing the doorbell.
    622		 */
    623
    624		gve_tx_put_doorbell(priv, tx->q_resources, tx->req);
    625		return NETDEV_TX_BUSY;
    626	}
    627	if (tx->raw_addressing)
    628		nsegs = gve_tx_add_skb_no_copy(priv, tx, skb);
    629	else
    630		nsegs = gve_tx_add_skb_copy(priv, tx, skb);
    631
    632	/* If the packet is getting sent, we need to update the skb */
    633	if (nsegs) {
    634		netdev_tx_sent_queue(tx->netdev_txq, skb->len);
    635		skb_tx_timestamp(skb);
    636		tx->req += nsegs;
    637	} else {
    638		dev_kfree_skb_any(skb);
    639	}
    640
    641	if (!netif_xmit_stopped(tx->netdev_txq) && netdev_xmit_more())
    642		return NETDEV_TX_OK;
    643
     644	/* Give packets to NIC. Even if this packet failed to send, the doorbell
    645	 * might need to be rung because of xmit_more.
    646	 */
    647	gve_tx_put_doorbell(priv, tx->q_resources, tx->req);
    648	return NETDEV_TX_OK;
    649}
    650
    651#define GVE_TX_START_THRESH	PAGE_SIZE
    652
    653static int gve_clean_tx_done(struct gve_priv *priv, struct gve_tx_ring *tx,
    654			     u32 to_do, bool try_to_wake)
    655{
    656	struct gve_tx_buffer_state *info;
    657	u64 pkts = 0, bytes = 0;
    658	size_t space_freed = 0;
    659	struct sk_buff *skb;
    660	int i, j;
    661	u32 idx;
    662
    663	for (j = 0; j < to_do; j++) {
    664		idx = tx->done & tx->mask;
    665		netif_info(priv, tx_done, priv->dev,
    666			   "[%d] %s: idx=%d (req=%u done=%u)\n",
    667			   tx->q_num, __func__, idx, tx->req, tx->done);
    668		info = &tx->info[idx];
    669		skb = info->skb;
    670
    671		/* Unmap the buffer */
    672		if (tx->raw_addressing)
    673			gve_tx_unmap_buf(tx->dev, info);
    674		tx->done++;
    675		/* Mark as free */
    676		if (skb) {
    677			info->skb = NULL;
    678			bytes += skb->len;
    679			pkts++;
    680			dev_consume_skb_any(skb);
    681			if (tx->raw_addressing)
    682				continue;
    683			/* FIFO free */
    684			for (i = 0; i < ARRAY_SIZE(info->iov); i++) {
    685				space_freed += info->iov[i].iov_len + info->iov[i].iov_padding;
    686				info->iov[i].iov_len = 0;
    687				info->iov[i].iov_padding = 0;
    688			}
    689		}
    690	}
    691
    692	if (!tx->raw_addressing)
    693		gve_tx_free_fifo(&tx->tx_fifo, space_freed);
    694	u64_stats_update_begin(&tx->statss);
    695	tx->bytes_done += bytes;
    696	tx->pkt_done += pkts;
    697	u64_stats_update_end(&tx->statss);
    698	netdev_tx_completed_queue(tx->netdev_txq, pkts, bytes);
    699
    700	/* start the queue if we've stopped it */
    701#ifndef CONFIG_BQL
    702	/* Make sure that the doorbells are synced */
    703	smp_mb();
    704#endif
    705	if (try_to_wake && netif_tx_queue_stopped(tx->netdev_txq) &&
    706	    likely(gve_can_tx(tx, GVE_TX_START_THRESH))) {
    707		tx->wake_queue++;
    708		netif_tx_wake_queue(tx->netdev_txq);
    709	}
    710
    711	return pkts;
    712}
    713
    714u32 gve_tx_load_event_counter(struct gve_priv *priv,
    715			      struct gve_tx_ring *tx)
    716{
    717	u32 counter_index = be32_to_cpu(tx->q_resources->counter_index);
    718	__be32 counter = READ_ONCE(priv->counter_array[counter_index]);
    719
    720	return be32_to_cpu(counter);
    721}
    722
    723bool gve_tx_poll(struct gve_notify_block *block, int budget)
    724{
    725	struct gve_priv *priv = block->priv;
    726	struct gve_tx_ring *tx = block->tx;
    727	u32 nic_done;
    728	u32 to_do;
    729
    730	/* If budget is 0, do all the work */
    731	if (budget == 0)
    732		budget = INT_MAX;
    733
     734	/* The TX path may try to clean completed pkts in order to xmit, so
     735	 * use spin_lock() to avoid a cleaning conflict; it yields better
     736	 * concurrency between xmit/clean than netif's lock.
    737	 */
    738	spin_lock(&tx->clean_lock);
    739	/* Find out how much work there is to be done */
    740	nic_done = gve_tx_load_event_counter(priv, tx);
    741	to_do = min_t(u32, (nic_done - tx->done), budget);
    742	gve_clean_tx_done(priv, tx, to_do, true);
    743	spin_unlock(&tx->clean_lock);
    744	/* If we still have work we want to repoll */
    745	return nic_done != tx->done;
    746}
    747
    748bool gve_tx_clean_pending(struct gve_priv *priv, struct gve_tx_ring *tx)
    749{
    750	u32 nic_done = gve_tx_load_event_counter(priv, tx);
    751
    752	return nic_done != tx->done;
    753}