cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

dp.c (39975B)


      1// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
      2/* Copyright (C) 2015-2019 Netronome Systems, Inc. */
      3
      4#include <linux/bpf_trace.h>
      5#include <linux/netdevice.h>
      6#include <linux/overflow.h>
      7#include <linux/sizes.h>
      8#include <linux/bitfield.h>
      9
     10#include "../nfp_app.h"
     11#include "../nfp_net.h"
     12#include "../nfp_net_dp.h"
     13#include "../crypto/crypto.h"
     14#include "../crypto/fw.h"
     15#include "nfdk.h"
     16
     17static int nfp_nfdk_tx_ring_should_wake(struct nfp_net_tx_ring *tx_ring)
     18{
     19	return !nfp_net_tx_full(tx_ring, NFDK_TX_DESC_STOP_CNT * 2);
     20}
     21
     22static int nfp_nfdk_tx_ring_should_stop(struct nfp_net_tx_ring *tx_ring)
     23{
     24	return nfp_net_tx_full(tx_ring, NFDK_TX_DESC_STOP_CNT);
     25}
     26
     27static void nfp_nfdk_tx_ring_stop(struct netdev_queue *nd_q,
     28				  struct nfp_net_tx_ring *tx_ring)
     29{
     30	netif_tx_stop_queue(nd_q);
     31
     32	/* We can race with the TX completion out of NAPI so recheck */
     33	smp_mb();
     34	if (unlikely(nfp_nfdk_tx_ring_should_wake(tx_ring)))
     35		netif_tx_start_queue(nd_q);
     36}
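/* Editor's note: the stop/recheck sequence above guards against a lost
 * wakeup: the completion path may free descriptors between the caller's
 * "ring full" test and netif_tx_stop_queue(), and would then see a queue
 * that is not yet stopped and skip the wake.  A minimal sketch of the same
 * pattern on a hypothetical ring type (example only, not driver code;
 * example_ring and example_ring_has_space() are assumptions):
 */
#if 0
static void example_ring_stop(struct netdev_queue *nd_q,
			      struct example_ring *ring)
{
	netif_tx_stop_queue(nd_q);
	/* Pairs with the barrier the completion path issues after advancing
	 * its read pointer and before checking whether the queue is stopped.
	 */
	smp_mb();
	if (example_ring_has_space(ring))
		netif_tx_start_queue(nd_q);
}
#endif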
     37
     38static __le64
     39nfp_nfdk_tx_tso(struct nfp_net_r_vector *r_vec, struct nfp_nfdk_tx_buf *txbuf,
     40		struct sk_buff *skb)
     41{
     42	u32 segs, hdrlen, l3_offset, l4_offset;
     43	struct nfp_nfdk_tx_desc txd;
     44	u16 mss;
     45
     46	if (!skb->encapsulation) {
     47		l3_offset = skb_network_offset(skb);
     48		l4_offset = skb_transport_offset(skb);
     49		hdrlen = skb_transport_offset(skb) + tcp_hdrlen(skb);
     50	} else {
     51		l3_offset = skb_inner_network_offset(skb);
     52		l4_offset = skb_inner_transport_offset(skb);
     53		hdrlen = skb_inner_transport_header(skb) - skb->data +
     54			inner_tcp_hdrlen(skb);
     55	}
     56
     57	segs = skb_shinfo(skb)->gso_segs;
     58	mss = skb_shinfo(skb)->gso_size & NFDK_DESC_TX_MSS_MASK;
     59
     60	/* Note: TSO of the packet with metadata prepended to skb is not
     61	 * supported yet, in which case l3/l4_offset and lso_hdrlen need
      62	 * to be correctly handled here.
     63	 * Concern:
     64	 * The driver doesn't have md_bytes easily available at this point.
     65	 * The PCI.IN PD ME won't have md_bytes bytes to add to lso_hdrlen,
     66	 * so it needs the full length there.  The app MEs might prefer
     67	 * l3_offset and l4_offset relative to the start of packet data,
     68	 * but could probably cope with it being relative to the CTM buf
     69	 * data offset.
     70	 */
     71	txd.l3_offset = l3_offset;
     72	txd.l4_offset = l4_offset;
     73	txd.lso_meta_res = 0;
     74	txd.mss = cpu_to_le16(mss);
     75	txd.lso_hdrlen = hdrlen;
     76	txd.lso_totsegs = segs;
     77
     78	txbuf->pkt_cnt = segs;
     79	txbuf->real_len = skb->len + hdrlen * (txbuf->pkt_cnt - 1);
     80
     81	u64_stats_update_begin(&r_vec->tx_sync);
     82	r_vec->tx_lso++;
     83	u64_stats_update_end(&r_vec->tx_sync);
     84
     85	return txd.raw;
     86}
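/* Editor's note: worked example for the accounting above (numbers are
 * illustrative).  For a non-encapsulated TCP skb with 14B Ethernet, 20B IPv4
 * and 20B TCP headers, gso_size 1448 and 4344B of payload:
 *
 *   l4_offset = 34, hdrlen = 34 + 20 = 54, segs = 3
 *   skb->len  = 54 + 4344 = 4398
 *   real_len  = skb->len + hdrlen * (segs - 1) = 4398 + 108 = 4506
 *             = 3 * (54 + 1448), i.e. the bytes that actually hit the wire
 *
 * real_len and pkt_cnt are what the completion path later feeds to BQL and
 * the tx_bytes/tx_pkts counters, so they must describe the traffic after
 * hardware segmentation rather than the single GSO skb.
 */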
     87
     88static u8
     89nfp_nfdk_tx_csum(struct nfp_net_dp *dp, struct nfp_net_r_vector *r_vec,
     90		 unsigned int pkt_cnt, struct sk_buff *skb, u64 flags)
     91{
     92	struct ipv6hdr *ipv6h;
     93	struct iphdr *iph;
     94
     95	if (!(dp->ctrl & NFP_NET_CFG_CTRL_TXCSUM))
     96		return flags;
     97
     98	if (skb->ip_summed != CHECKSUM_PARTIAL)
     99		return flags;
    100
    101	flags |= NFDK_DESC_TX_L4_CSUM;
    102
    103	iph = skb->encapsulation ? inner_ip_hdr(skb) : ip_hdr(skb);
    104	ipv6h = skb->encapsulation ? inner_ipv6_hdr(skb) : ipv6_hdr(skb);
    105
    106	/* L3 checksum offloading flag is not required for ipv6 */
    107	if (iph->version == 4) {
    108		flags |= NFDK_DESC_TX_L3_CSUM;
    109	} else if (ipv6h->version != 6) {
    110		nn_dp_warn(dp, "partial checksum but ipv=%x!\n", iph->version);
    111		return flags;
    112	}
    113
    114	u64_stats_update_begin(&r_vec->tx_sync);
    115	if (!skb->encapsulation) {
    116		r_vec->hw_csum_tx += pkt_cnt;
    117	} else {
    118		flags |= NFDK_DESC_TX_ENCAP;
    119		r_vec->hw_csum_tx_inner += pkt_cnt;
    120	}
    121	u64_stats_update_end(&r_vec->tx_sync);
    122
    123	return flags;
    124}
    125
    126static int
    127nfp_nfdk_tx_maybe_close_block(struct nfp_net_tx_ring *tx_ring,
    128			      unsigned int nr_frags, struct sk_buff *skb)
    129{
    130	unsigned int n_descs, wr_p, nop_slots;
    131	const skb_frag_t *frag, *fend;
    132	struct nfp_nfdk_tx_desc *txd;
    133	unsigned int wr_idx;
    134	int err;
    135
    136recount_descs:
    137	n_descs = nfp_nfdk_headlen_to_segs(skb_headlen(skb));
    138
    139	frag = skb_shinfo(skb)->frags;
    140	fend = frag + nr_frags;
    141	for (; frag < fend; frag++)
    142		n_descs += DIV_ROUND_UP(skb_frag_size(frag),
    143					NFDK_TX_MAX_DATA_PER_DESC);
    144
    145	if (unlikely(n_descs > NFDK_TX_DESC_GATHER_MAX)) {
    146		if (skb_is_nonlinear(skb)) {
    147			err = skb_linearize(skb);
    148			if (err)
    149				return err;
    150			goto recount_descs;
    151		}
    152		return -EINVAL;
    153	}
    154
     155	/* Undercount by 1 (don't count the meta desc) so the round down works out */
    156	n_descs += !!skb_is_gso(skb);
    157
    158	if (round_down(tx_ring->wr_p, NFDK_TX_DESC_BLOCK_CNT) !=
    159	    round_down(tx_ring->wr_p + n_descs, NFDK_TX_DESC_BLOCK_CNT))
    160		goto close_block;
    161
    162	if ((u32)tx_ring->data_pending + skb->len > NFDK_TX_MAX_DATA_PER_BLOCK)
    163		goto close_block;
    164
    165	return 0;
    166
    167close_block:
    168	wr_p = tx_ring->wr_p;
    169	nop_slots = D_BLOCK_CPL(wr_p);
    170
    171	wr_idx = D_IDX(tx_ring, wr_p);
    172	tx_ring->ktxbufs[wr_idx].skb = NULL;
    173	txd = &tx_ring->ktxds[wr_idx];
    174
    175	memset(txd, 0, array_size(nop_slots, sizeof(struct nfp_nfdk_tx_desc)));
    176
    177	tx_ring->data_pending = 0;
    178	tx_ring->wr_p += nop_slots;
    179	tx_ring->wr_ptr_add += nop_slots;
    180
    181	return 0;
    182}
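/* Editor's note: worked example for the block handling above, assuming a
 * block of 32 descriptors (NFDK_TX_DESC_BLOCK_CNT is derived from the block
 * size in nfdk.h; 32 is an assumption used only for illustration).  With
 * wr_p == 60 and a packet needing n_descs == 6:
 *
 *   round_down(60, 32) == 32  !=  round_down(66, 32) == 64
 *
 * so the packet would straddle a block boundary.  The close_block path then
 * zeroes the remaining D_BLOCK_CPL(60) == 4 slots (indices 60..63) as nop
 * descriptors, clears ktxbufs[60].skb so the completion path can recognise
 * the padding, and advances wr_p to 64, the start of the next block.
 * data_pending is reset because the per-block byte budget
 * (NFDK_TX_MAX_DATA_PER_BLOCK) starts over with the new block.
 */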
    183
    184static int nfp_nfdk_prep_port_id(struct sk_buff *skb)
    185{
    186	struct metadata_dst *md_dst = skb_metadata_dst(skb);
    187	unsigned char *data;
    188
    189	if (likely(!md_dst))
    190		return 0;
    191	if (unlikely(md_dst->type != METADATA_HW_PORT_MUX))
    192		return 0;
    193
     194	/* Note: TSO of an skb with metadata prepended is an unsupported case.
    195	 * See the comments in `nfp_nfdk_tx_tso` for details.
    196	 */
    197	if (unlikely(md_dst && skb_is_gso(skb)))
    198		return -EOPNOTSUPP;
    199
    200	if (unlikely(skb_cow_head(skb, sizeof(md_dst->u.port_info.port_id))))
    201		return -ENOMEM;
    202
    203	data = skb_push(skb, sizeof(md_dst->u.port_info.port_id));
    204	put_unaligned_be32(md_dst->u.port_info.port_id, data);
    205
    206	return sizeof(md_dst->u.port_info.port_id);
    207}
    208
    209static int
    210nfp_nfdk_prep_tx_meta(struct nfp_app *app, struct sk_buff *skb,
    211		      struct nfp_net_r_vector *r_vec)
    212{
    213	unsigned char *data;
    214	int res, md_bytes;
    215	u32 meta_id = 0;
    216
    217	res = nfp_nfdk_prep_port_id(skb);
    218	if (unlikely(res <= 0))
    219		return res;
    220
    221	md_bytes = res;
    222	meta_id = NFP_NET_META_PORTID;
    223
    224	if (unlikely(skb_cow_head(skb, sizeof(meta_id))))
    225		return -ENOMEM;
    226
    227	md_bytes += sizeof(meta_id);
    228
    229	meta_id = FIELD_PREP(NFDK_META_LEN, md_bytes) |
    230		  FIELD_PREP(NFDK_META_FIELDS, meta_id);
    231
    232	data = skb_push(skb, sizeof(meta_id));
    233	put_unaligned_be32(meta_id, data);
    234
    235	return NFDK_DESC_TX_CHAIN_META;
    236}
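/* Editor's note: resulting prepend layout when a port id is added above
 * (field widths of NFDK_META_LEN/NFDK_META_FIELDS are defined in nfdk.h and
 * are not repeated here; the sketch only shows the byte layout):
 *
 *   skb->data -> +----------------------------------------------------+
 *                | meta_id (BE32): NFDK_META_LEN    = 8                |
 *                |                 NFDK_META_FIELDS = NFP_NET_META_PORTID
 *                +----------------------------------------------------+
 *                | port_id (BE32)                                      |
 *                +----------------------------------------------------+
 *                | original packet data ...                            |
 *
 * The return value NFDK_DESC_TX_CHAIN_META ends up in the caller's metadata
 * descriptor so the firmware knows a chained prepend is present; 0 means
 * "no metadata" and a negative value aborts the transmit.
 */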
    237
    238/**
    239 * nfp_nfdk_tx() - Main transmit entry point
    240 * @skb:    SKB to transmit
    241 * @netdev: netdev structure
    242 *
    243 * Return: NETDEV_TX_OK on success.
    244 */
    245netdev_tx_t nfp_nfdk_tx(struct sk_buff *skb, struct net_device *netdev)
    246{
    247	struct nfp_net *nn = netdev_priv(netdev);
    248	struct nfp_nfdk_tx_buf *txbuf, *etxbuf;
    249	u32 cnt, tmp_dlen, dlen_type = 0;
    250	struct nfp_net_tx_ring *tx_ring;
    251	struct nfp_net_r_vector *r_vec;
    252	const skb_frag_t *frag, *fend;
    253	struct nfp_nfdk_tx_desc *txd;
    254	unsigned int real_len, qidx;
    255	unsigned int dma_len, type;
    256	struct netdev_queue *nd_q;
    257	struct nfp_net_dp *dp;
    258	int nr_frags, wr_idx;
    259	dma_addr_t dma_addr;
    260	u64 metadata;
    261
    262	dp = &nn->dp;
    263	qidx = skb_get_queue_mapping(skb);
    264	tx_ring = &dp->tx_rings[qidx];
    265	r_vec = tx_ring->r_vec;
    266	nd_q = netdev_get_tx_queue(dp->netdev, qidx);
    267
    268	/* Don't bother counting frags, assume the worst */
    269	if (unlikely(nfp_net_tx_full(tx_ring, NFDK_TX_DESC_STOP_CNT))) {
    270		nn_dp_warn(dp, "TX ring %d busy. wrp=%u rdp=%u\n",
    271			   qidx, tx_ring->wr_p, tx_ring->rd_p);
    272		netif_tx_stop_queue(nd_q);
    273		nfp_net_tx_xmit_more_flush(tx_ring);
    274		u64_stats_update_begin(&r_vec->tx_sync);
    275		r_vec->tx_busy++;
    276		u64_stats_update_end(&r_vec->tx_sync);
    277		return NETDEV_TX_BUSY;
    278	}
    279
    280	metadata = nfp_nfdk_prep_tx_meta(nn->app, skb, r_vec);
    281	if (unlikely((int)metadata < 0))
    282		goto err_flush;
    283
    284	nr_frags = skb_shinfo(skb)->nr_frags;
    285	if (nfp_nfdk_tx_maybe_close_block(tx_ring, nr_frags, skb))
    286		goto err_flush;
    287
    288	/* DMA map all */
    289	wr_idx = D_IDX(tx_ring, tx_ring->wr_p);
    290	txd = &tx_ring->ktxds[wr_idx];
    291	txbuf = &tx_ring->ktxbufs[wr_idx];
    292
    293	dma_len = skb_headlen(skb);
    294	if (skb_is_gso(skb))
    295		type = NFDK_DESC_TX_TYPE_TSO;
    296	else if (!nr_frags && dma_len < NFDK_TX_MAX_DATA_PER_HEAD)
    297		type = NFDK_DESC_TX_TYPE_SIMPLE;
    298	else
    299		type = NFDK_DESC_TX_TYPE_GATHER;
    300
    301	dma_addr = dma_map_single(dp->dev, skb->data, dma_len, DMA_TO_DEVICE);
    302	if (dma_mapping_error(dp->dev, dma_addr))
    303		goto err_warn_dma;
    304
    305	txbuf->skb = skb;
    306	txbuf++;
    307
    308	txbuf->dma_addr = dma_addr;
    309	txbuf++;
    310
    311	/* FIELD_PREP() implicitly truncates to chunk */
    312	dma_len -= 1;
    313	dlen_type = FIELD_PREP(NFDK_DESC_TX_DMA_LEN_HEAD, dma_len) |
    314		    FIELD_PREP(NFDK_DESC_TX_TYPE_HEAD, type);
    315
    316	txd->dma_len_type = cpu_to_le16(dlen_type);
    317	nfp_nfdk_tx_desc_set_dma_addr(txd, dma_addr);
    318
    319	/* starts at bit 0 */
    320	BUILD_BUG_ON(!(NFDK_DESC_TX_DMA_LEN_HEAD & 1));
    321
     322	/* Preserve the original dlen_type so that the EOP logic below
     323	 * can reuse dlen_type.
    324	 */
    325	tmp_dlen = dlen_type & NFDK_DESC_TX_DMA_LEN_HEAD;
    326	dma_len -= tmp_dlen;
    327	dma_addr += tmp_dlen + 1;
    328	txd++;
    329
     330	/* The rest of the data (if any) will be in larger dma descriptors
    331	 * and is handled with the fragment loop.
    332	 */
    333	frag = skb_shinfo(skb)->frags;
    334	fend = frag + nr_frags;
    335
    336	while (true) {
    337		while (dma_len > 0) {
    338			dma_len -= 1;
    339			dlen_type = FIELD_PREP(NFDK_DESC_TX_DMA_LEN, dma_len);
    340
    341			txd->dma_len_type = cpu_to_le16(dlen_type);
    342			nfp_nfdk_tx_desc_set_dma_addr(txd, dma_addr);
    343
    344			dma_len -= dlen_type;
    345			dma_addr += dlen_type + 1;
    346			txd++;
    347		}
    348
    349		if (frag >= fend)
    350			break;
    351
    352		dma_len = skb_frag_size(frag);
    353		dma_addr = skb_frag_dma_map(dp->dev, frag, 0, dma_len,
    354					    DMA_TO_DEVICE);
    355		if (dma_mapping_error(dp->dev, dma_addr))
    356			goto err_unmap;
    357
    358		txbuf->dma_addr = dma_addr;
    359		txbuf++;
    360
    361		frag++;
    362	}
    363
    364	(txd - 1)->dma_len_type = cpu_to_le16(dlen_type | NFDK_DESC_TX_EOP);
    365
    366	if (!skb_is_gso(skb)) {
    367		real_len = skb->len;
    368		/* Metadata desc */
    369		metadata = nfp_nfdk_tx_csum(dp, r_vec, 1, skb, metadata);
    370		txd->raw = cpu_to_le64(metadata);
    371		txd++;
    372	} else {
    373		/* lso desc should be placed after metadata desc */
    374		(txd + 1)->raw = nfp_nfdk_tx_tso(r_vec, txbuf, skb);
    375		real_len = txbuf->real_len;
    376		/* Metadata desc */
    377		metadata = nfp_nfdk_tx_csum(dp, r_vec, txbuf->pkt_cnt, skb, metadata);
    378		txd->raw = cpu_to_le64(metadata);
    379		txd += 2;
    380		txbuf++;
    381	}
    382
    383	cnt = txd - tx_ring->ktxds - wr_idx;
    384	if (unlikely(round_down(wr_idx, NFDK_TX_DESC_BLOCK_CNT) !=
    385		     round_down(wr_idx + cnt - 1, NFDK_TX_DESC_BLOCK_CNT)))
    386		goto err_warn_overflow;
    387
    388	skb_tx_timestamp(skb);
    389
    390	tx_ring->wr_p += cnt;
    391	if (tx_ring->wr_p % NFDK_TX_DESC_BLOCK_CNT)
    392		tx_ring->data_pending += skb->len;
    393	else
    394		tx_ring->data_pending = 0;
    395
    396	if (nfp_nfdk_tx_ring_should_stop(tx_ring))
    397		nfp_nfdk_tx_ring_stop(nd_q, tx_ring);
    398
    399	tx_ring->wr_ptr_add += cnt;
    400	if (__netdev_tx_sent_queue(nd_q, real_len, netdev_xmit_more()))
    401		nfp_net_tx_xmit_more_flush(tx_ring);
    402
    403	return NETDEV_TX_OK;
    404
    405err_warn_overflow:
    406	WARN_ONCE(1, "unable to fit packet into a descriptor wr_idx:%d head:%d frags:%d cnt:%d",
    407		  wr_idx, skb_headlen(skb), nr_frags, cnt);
    408	if (skb_is_gso(skb))
    409		txbuf--;
    410err_unmap:
    411	/* txbuf pointed to the next-to-use */
    412	etxbuf = txbuf;
    413	/* first txbuf holds the skb */
    414	txbuf = &tx_ring->ktxbufs[wr_idx + 1];
    415	if (txbuf < etxbuf) {
    416		dma_unmap_single(dp->dev, txbuf->dma_addr,
    417				 skb_headlen(skb), DMA_TO_DEVICE);
    418		txbuf->raw = 0;
    419		txbuf++;
    420	}
    421	frag = skb_shinfo(skb)->frags;
     422	while (txbuf < etxbuf) {
    423		dma_unmap_page(dp->dev, txbuf->dma_addr,
    424			       skb_frag_size(frag), DMA_TO_DEVICE);
    425		txbuf->raw = 0;
    426		frag++;
    427		txbuf++;
    428	}
    429err_warn_dma:
    430	nn_dp_warn(dp, "Failed to map DMA TX buffer\n");
    431err_flush:
    432	nfp_net_tx_xmit_more_flush(tx_ring);
    433	u64_stats_update_begin(&r_vec->tx_sync);
    434	r_vec->tx_errors++;
    435	u64_stats_update_end(&r_vec->tx_sync);
    436	dev_kfree_skb_any(skb);
    437	return NETDEV_TX_OK;
    438}
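/* Editor's note: minimal sketch of the "length minus one" chunking used by
 * the gather loops in nfp_nfdk_tx() above.  Descriptor length fields store
 * (len - 1), so an N-bit field can cover 2^N bytes; masking the value that
 * FIELD_PREP() actually kept tells us how many bytes the descriptor covers,
 * and the DMA address/length then advance by that amount plus one.  The
 * helper below mirrors the arithmetic with a plain low-bits mask instead of
 * the NFDK_DESC_TX_DMA_LEN* fields (example only, not driver code).
 */
#if 0
static u32 example_consume_chunk(u32 *dma_len, u32 len_mask)
{
	u32 stored;

	/* Caller loops while *dma_len > 0, so the subtraction is safe. */
	stored = min_t(u32, *dma_len - 1, len_mask);	/* value in the desc */
	*dma_len -= stored + 1;				/* bytes consumed */
	return stored;	/* caller ORs in the type (and EOP on the last one) */
}
#endif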
    439
    440/**
     441 * nfp_nfdk_tx_complete() - Handle completed TX packets
    442 * @tx_ring:	TX ring structure
    443 * @budget:	NAPI budget (only used as bool to determine if in NAPI context)
    444 */
    445static void nfp_nfdk_tx_complete(struct nfp_net_tx_ring *tx_ring, int budget)
    446{
    447	struct nfp_net_r_vector *r_vec = tx_ring->r_vec;
    448	struct nfp_net_dp *dp = &r_vec->nfp_net->dp;
    449	u32 done_pkts = 0, done_bytes = 0;
    450	struct nfp_nfdk_tx_buf *ktxbufs;
    451	struct device *dev = dp->dev;
    452	struct netdev_queue *nd_q;
    453	u32 rd_p, qcp_rd_p;
    454	int todo;
    455
    456	rd_p = tx_ring->rd_p;
    457	if (tx_ring->wr_p == rd_p)
    458		return;
    459
    460	/* Work out how many descriptors have been transmitted */
    461	qcp_rd_p = nfp_net_read_tx_cmpl(tx_ring, dp);
    462
    463	if (qcp_rd_p == tx_ring->qcp_rd_p)
    464		return;
    465
    466	todo = D_IDX(tx_ring, qcp_rd_p - tx_ring->qcp_rd_p);
    467	ktxbufs = tx_ring->ktxbufs;
    468
    469	while (todo > 0) {
    470		const skb_frag_t *frag, *fend;
    471		unsigned int size, n_descs = 1;
    472		struct nfp_nfdk_tx_buf *txbuf;
    473		struct sk_buff *skb;
    474
    475		txbuf = &ktxbufs[D_IDX(tx_ring, rd_p)];
    476		skb = txbuf->skb;
    477		txbuf++;
    478
    479		/* Closed block */
    480		if (!skb) {
    481			n_descs = D_BLOCK_CPL(rd_p);
    482			goto next;
    483		}
    484
    485		/* Unmap head */
    486		size = skb_headlen(skb);
    487		n_descs += nfp_nfdk_headlen_to_segs(size);
    488		dma_unmap_single(dev, txbuf->dma_addr, size, DMA_TO_DEVICE);
    489		txbuf++;
    490
    491		/* Unmap frags */
    492		frag = skb_shinfo(skb)->frags;
    493		fend = frag + skb_shinfo(skb)->nr_frags;
    494		for (; frag < fend; frag++) {
    495			size = skb_frag_size(frag);
    496			n_descs += DIV_ROUND_UP(size,
    497						NFDK_TX_MAX_DATA_PER_DESC);
    498			dma_unmap_page(dev, txbuf->dma_addr,
    499				       skb_frag_size(frag), DMA_TO_DEVICE);
    500			txbuf++;
    501		}
    502
    503		if (!skb_is_gso(skb)) {
    504			done_bytes += skb->len;
    505			done_pkts++;
    506		} else {
    507			done_bytes += txbuf->real_len;
    508			done_pkts += txbuf->pkt_cnt;
    509			n_descs++;
    510		}
    511
    512		napi_consume_skb(skb, budget);
    513next:
    514		rd_p += n_descs;
    515		todo -= n_descs;
    516	}
    517
    518	tx_ring->rd_p = rd_p;
    519	tx_ring->qcp_rd_p = qcp_rd_p;
    520
    521	u64_stats_update_begin(&r_vec->tx_sync);
    522	r_vec->tx_bytes += done_bytes;
    523	r_vec->tx_pkts += done_pkts;
    524	u64_stats_update_end(&r_vec->tx_sync);
    525
    526	if (!dp->netdev)
    527		return;
    528
    529	nd_q = netdev_get_tx_queue(dp->netdev, tx_ring->idx);
    530	netdev_tx_completed_queue(nd_q, done_pkts, done_bytes);
    531	if (nfp_nfdk_tx_ring_should_wake(tx_ring)) {
    532		/* Make sure TX thread will see updated tx_ring->rd_p */
    533		smp_mb();
    534
    535		if (unlikely(netif_tx_queue_stopped(nd_q)))
    536			netif_tx_wake_queue(nd_q);
    537	}
    538
    539	WARN_ONCE(tx_ring->wr_p - tx_ring->rd_p > tx_ring->cnt,
    540		  "TX ring corruption rd_p=%u wr_p=%u cnt=%u\n",
    541		  tx_ring->rd_p, tx_ring->wr_p, tx_ring->cnt);
    542}
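/* Editor's note: the ring pointers above are free-running counters that
 * D_IDX() masks down to slot indices, which relies on the ring size being a
 * power of two.  Worked example with cnt == 1024 (illustrative), a previous
 * completion pointer of 1020 and a new hardware pointer of 10:
 *
 *   qcp_rd_p - tx_ring->qcp_rd_p == 10 - 1020   (wraps as unsigned)
 *   D_IDX(tx_ring, 10 - 1020)    == (10 - 1020) & (1024 - 1) == 14
 *
 * so "todo" correctly comes out as 14 descriptors (4 up to the end of the
 * ring plus 10 after the wrap), provided the hardware completion pointer
 * wraps at a multiple of the ring size.
 */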
    543
    544/* Receive processing */
    545static void *
    546nfp_nfdk_napi_alloc_one(struct nfp_net_dp *dp, dma_addr_t *dma_addr)
    547{
    548	void *frag;
    549
    550	if (!dp->xdp_prog) {
    551		frag = napi_alloc_frag(dp->fl_bufsz);
    552		if (unlikely(!frag))
    553			return NULL;
    554	} else {
    555		struct page *page;
    556
    557		page = dev_alloc_page();
    558		if (unlikely(!page))
    559			return NULL;
    560		frag = page_address(page);
    561	}
    562
    563	*dma_addr = nfp_net_dma_map_rx(dp, frag);
    564	if (dma_mapping_error(dp->dev, *dma_addr)) {
    565		nfp_net_free_frag(frag, dp->xdp_prog);
    566		nn_dp_warn(dp, "Failed to map DMA RX buffer\n");
    567		return NULL;
    568	}
    569
    570	return frag;
    571}
    572
    573/**
     574 * nfp_nfdk_rx_give_one() - Put a mapped frag on the software and hardware rings
    575 * @dp:		NFP Net data path struct
    576 * @rx_ring:	RX ring structure
    577 * @frag:	page fragment buffer
     578 * @dma_addr:	DMA address of the frag mapping
    579 */
    580static void
    581nfp_nfdk_rx_give_one(const struct nfp_net_dp *dp,
    582		     struct nfp_net_rx_ring *rx_ring,
    583		     void *frag, dma_addr_t dma_addr)
    584{
    585	unsigned int wr_idx;
    586
    587	wr_idx = D_IDX(rx_ring, rx_ring->wr_p);
    588
    589	nfp_net_dma_sync_dev_rx(dp, dma_addr);
    590
    591	/* Stash SKB and DMA address away */
    592	rx_ring->rxbufs[wr_idx].frag = frag;
    593	rx_ring->rxbufs[wr_idx].dma_addr = dma_addr;
    594
    595	/* Fill freelist descriptor */
    596	rx_ring->rxds[wr_idx].fld.reserved = 0;
    597	rx_ring->rxds[wr_idx].fld.meta_len_dd = 0;
    598	nfp_desc_set_dma_addr(&rx_ring->rxds[wr_idx].fld,
    599			      dma_addr + dp->rx_dma_off);
    600
    601	rx_ring->wr_p++;
    602	if (!(rx_ring->wr_p % NFP_NET_FL_BATCH)) {
    603		/* Update write pointer of the freelist queue. Make
    604		 * sure all writes are flushed before telling the hardware.
    605		 */
    606		wmb();
    607		nfp_qcp_wr_ptr_add(rx_ring->qcp_fl, NFP_NET_FL_BATCH);
    608	}
    609}
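/* Editor's note: the freelist write pointer is only pushed to hardware every
 * NFP_NET_FL_BATCH buffers.  With a batch of 64 (an assumption used for the
 * example), the first 63 buffers are only staged in the freelist descriptor
 * ring; when the 64th is added and wr_p becomes a multiple of the batch, the
 * wmb() makes all the descriptor writes visible to the device before
 * nfp_qcp_wr_ptr_add() hands over the whole batch in one doorbell write,
 * amortising the cost of the MMIO access.
 */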
    610
    611/**
    612 * nfp_nfdk_rx_ring_fill_freelist() - Give buffers from the ring to FW
    613 * @dp:	     NFP Net data path struct
    614 * @rx_ring: RX ring to fill
    615 */
    616void nfp_nfdk_rx_ring_fill_freelist(struct nfp_net_dp *dp,
    617				    struct nfp_net_rx_ring *rx_ring)
    618{
    619	unsigned int i;
    620
    621	for (i = 0; i < rx_ring->cnt - 1; i++)
    622		nfp_nfdk_rx_give_one(dp, rx_ring, rx_ring->rxbufs[i].frag,
    623				     rx_ring->rxbufs[i].dma_addr);
    624}
    625
    626/**
     627 * nfp_nfdk_rx_csum_has_errors() - check as a group whether rxd reports any csum errors
    628 * @flags: RX descriptor flags field in CPU byte order
    629 */
    630static int nfp_nfdk_rx_csum_has_errors(u16 flags)
    631{
    632	u16 csum_all_checked, csum_all_ok;
    633
    634	csum_all_checked = flags & __PCIE_DESC_RX_CSUM_ALL;
    635	csum_all_ok = flags & __PCIE_DESC_RX_CSUM_ALL_OK;
    636
    637	return csum_all_checked != (csum_all_ok << PCIE_DESC_RX_CSUM_OK_SHIFT);
    638}
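/* Editor's note: worked view of the check above.  Each "csum checked" flag
 * sits PCIE_DESC_RX_CSUM_OK_SHIFT bits above its "csum OK" counterpart, so
 * shifting the OK bits left aligns the two masks.  If every protocol that
 * was checked also has its OK bit set, the shifted OK mask equals the
 * checked mask and the function returns 0; any checked-but-not-OK protocol
 * leaves the masks unequal and the descriptor is treated as carrying a
 * checksum error.
 */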
    639
    640/**
    641 * nfp_nfdk_rx_csum() - set SKB checksum field based on RX descriptor flags
    642 * @dp:  NFP Net data path struct
    643 * @r_vec: per-ring structure
    644 * @rxd: Pointer to RX descriptor
    645 * @meta: Parsed metadata prepend
    646 * @skb: Pointer to SKB
    647 */
    648static void
    649nfp_nfdk_rx_csum(struct nfp_net_dp *dp, struct nfp_net_r_vector *r_vec,
    650		 struct nfp_net_rx_desc *rxd, struct nfp_meta_parsed *meta,
    651		 struct sk_buff *skb)
    652{
    653	skb_checksum_none_assert(skb);
    654
    655	if (!(dp->netdev->features & NETIF_F_RXCSUM))
    656		return;
    657
    658	if (meta->csum_type) {
    659		skb->ip_summed = meta->csum_type;
    660		skb->csum = meta->csum;
    661		u64_stats_update_begin(&r_vec->rx_sync);
    662		r_vec->hw_csum_rx_complete++;
    663		u64_stats_update_end(&r_vec->rx_sync);
    664		return;
    665	}
    666
    667	if (nfp_nfdk_rx_csum_has_errors(le16_to_cpu(rxd->rxd.flags))) {
    668		u64_stats_update_begin(&r_vec->rx_sync);
    669		r_vec->hw_csum_rx_error++;
    670		u64_stats_update_end(&r_vec->rx_sync);
    671		return;
    672	}
    673
    674	/* Assume that the firmware will never report inner CSUM_OK unless outer
    675	 * L4 headers were successfully parsed. FW will always report zero UDP
    676	 * checksum as CSUM_OK.
    677	 */
    678	if (rxd->rxd.flags & PCIE_DESC_RX_TCP_CSUM_OK ||
    679	    rxd->rxd.flags & PCIE_DESC_RX_UDP_CSUM_OK) {
    680		__skb_incr_checksum_unnecessary(skb);
    681		u64_stats_update_begin(&r_vec->rx_sync);
    682		r_vec->hw_csum_rx_ok++;
    683		u64_stats_update_end(&r_vec->rx_sync);
    684	}
    685
    686	if (rxd->rxd.flags & PCIE_DESC_RX_I_TCP_CSUM_OK ||
    687	    rxd->rxd.flags & PCIE_DESC_RX_I_UDP_CSUM_OK) {
    688		__skb_incr_checksum_unnecessary(skb);
    689		u64_stats_update_begin(&r_vec->rx_sync);
    690		r_vec->hw_csum_rx_inner_ok++;
    691		u64_stats_update_end(&r_vec->rx_sync);
    692	}
    693}
    694
    695static void
    696nfp_nfdk_set_hash(struct net_device *netdev, struct nfp_meta_parsed *meta,
    697		  unsigned int type, __be32 *hash)
    698{
    699	if (!(netdev->features & NETIF_F_RXHASH))
    700		return;
    701
    702	switch (type) {
    703	case NFP_NET_RSS_IPV4:
    704	case NFP_NET_RSS_IPV6:
    705	case NFP_NET_RSS_IPV6_EX:
    706		meta->hash_type = PKT_HASH_TYPE_L3;
    707		break;
    708	default:
    709		meta->hash_type = PKT_HASH_TYPE_L4;
    710		break;
    711	}
    712
    713	meta->hash = get_unaligned_be32(hash);
    714}
    715
    716static bool
    717nfp_nfdk_parse_meta(struct net_device *netdev, struct nfp_meta_parsed *meta,
    718		    void *data, void *pkt, unsigned int pkt_len, int meta_len)
    719{
    720	u32 meta_info;
    721
    722	meta_info = get_unaligned_be32(data);
    723	data += 4;
    724
    725	while (meta_info) {
    726		switch (meta_info & NFP_NET_META_FIELD_MASK) {
    727		case NFP_NET_META_HASH:
    728			meta_info >>= NFP_NET_META_FIELD_SIZE;
    729			nfp_nfdk_set_hash(netdev, meta,
    730					  meta_info & NFP_NET_META_FIELD_MASK,
    731					  (__be32 *)data);
    732			data += 4;
    733			break;
    734		case NFP_NET_META_MARK:
    735			meta->mark = get_unaligned_be32(data);
    736			data += 4;
    737			break;
    738		case NFP_NET_META_PORTID:
    739			meta->portid = get_unaligned_be32(data);
    740			data += 4;
    741			break;
    742		case NFP_NET_META_CSUM:
    743			meta->csum_type = CHECKSUM_COMPLETE;
    744			meta->csum =
    745				(__force __wsum)__get_unaligned_cpu32(data);
    746			data += 4;
    747			break;
    748		case NFP_NET_META_RESYNC_INFO:
    749			if (nfp_net_tls_rx_resync_req(netdev, data, pkt,
    750						      pkt_len))
    751				return false;
    752			data += sizeof(struct nfp_net_tls_resync_req);
    753			break;
    754		default:
    755			return true;
    756		}
    757
    758		meta_info >>= NFP_NET_META_FIELD_SIZE;
    759	}
    760
    761	return data != pkt;
    762}
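/* Editor's note: worked example for the parse loop above.  meta_info is read
 * as a big-endian word of NFP_NET_META_FIELD_SIZE-bit field codes, consumed
 * from the least significant end; each consumed field shifts the word right,
 * so meta_info == 0 means "no more fields".  For a prepend advertising a
 * hash followed by a mark:
 *
 *   meta_info = NFP_NET_META_HASH
 *             | (hash_type         << NFP_NET_META_FIELD_SIZE)
 *             | (NFP_NET_META_MARK << (NFP_NET_META_FIELD_SIZE * 2));
 *
 *   pass 1: HASH -> shift once to expose hash_type, read 4B hash, shift
 *   pass 2: MARK -> read 4B mark, shift
 *   pass 3: meta_info == 0 -> loop ends
 *
 * The final "data != pkt" test then reports failure unless parsing consumed
 * exactly the metadata area, i.e. ended right at the start of the packet.
 */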
    763
    764static void
    765nfp_nfdk_rx_drop(const struct nfp_net_dp *dp, struct nfp_net_r_vector *r_vec,
    766		 struct nfp_net_rx_ring *rx_ring, struct nfp_net_rx_buf *rxbuf,
    767		 struct sk_buff *skb)
    768{
    769	u64_stats_update_begin(&r_vec->rx_sync);
    770	r_vec->rx_drops++;
    771	/* If we have both skb and rxbuf the replacement buffer allocation
    772	 * must have failed, count this as an alloc failure.
    773	 */
    774	if (skb && rxbuf)
    775		r_vec->rx_replace_buf_alloc_fail++;
    776	u64_stats_update_end(&r_vec->rx_sync);
    777
     778	/* The skb is built on top of the frag; freeing the skb would free
     779	 * the frag too, so take an extra ref to be able to reuse it.
    780	 */
    781	if (skb && rxbuf && skb->head == rxbuf->frag)
    782		page_ref_inc(virt_to_head_page(rxbuf->frag));
    783	if (rxbuf)
    784		nfp_nfdk_rx_give_one(dp, rx_ring, rxbuf->frag, rxbuf->dma_addr);
    785	if (skb)
    786		dev_kfree_skb_any(skb);
    787}
    788
    789static bool nfp_nfdk_xdp_complete(struct nfp_net_tx_ring *tx_ring)
    790{
    791	struct nfp_net_r_vector *r_vec = tx_ring->r_vec;
    792	struct nfp_net_dp *dp = &r_vec->nfp_net->dp;
    793	struct nfp_net_rx_ring *rx_ring;
    794	u32 qcp_rd_p, done = 0;
    795	bool done_all;
    796	int todo;
    797
    798	/* Work out how many descriptors have been transmitted */
    799	qcp_rd_p = nfp_net_read_tx_cmpl(tx_ring, dp);
    800	if (qcp_rd_p == tx_ring->qcp_rd_p)
    801		return true;
    802
    803	todo = D_IDX(tx_ring, qcp_rd_p - tx_ring->qcp_rd_p);
    804
    805	done_all = todo <= NFP_NET_XDP_MAX_COMPLETE;
    806	todo = min(todo, NFP_NET_XDP_MAX_COMPLETE);
    807
    808	rx_ring = r_vec->rx_ring;
    809	while (todo > 0) {
    810		int idx = D_IDX(tx_ring, tx_ring->rd_p + done);
    811		struct nfp_nfdk_tx_buf *txbuf;
    812		unsigned int step = 1;
    813
    814		txbuf = &tx_ring->ktxbufs[idx];
    815		if (!txbuf->raw)
    816			goto next;
    817
    818		if (NFDK_TX_BUF_INFO(txbuf->val) != NFDK_TX_BUF_INFO_SOP) {
    819			WARN_ONCE(1, "Unexpected TX buffer in XDP TX ring\n");
    820			goto next;
    821		}
    822
     823		/* Two successive txbufs are used to stash the virtual and DMA
     824		 * addresses respectively; recycle and clean them here.
    825		 */
    826		nfp_nfdk_rx_give_one(dp, rx_ring,
    827				     (void *)NFDK_TX_BUF_PTR(txbuf[0].val),
    828				     txbuf[1].dma_addr);
    829		txbuf[0].raw = 0;
    830		txbuf[1].raw = 0;
    831		step = 2;
    832
    833		u64_stats_update_begin(&r_vec->tx_sync);
    834		/* Note: tx_bytes not accumulated. */
    835		r_vec->tx_pkts++;
    836		u64_stats_update_end(&r_vec->tx_sync);
    837next:
    838		todo -= step;
    839		done += step;
    840	}
    841
    842	tx_ring->qcp_rd_p = D_IDX(tx_ring, tx_ring->qcp_rd_p + done);
    843	tx_ring->rd_p += done;
    844
    845	WARN_ONCE(tx_ring->wr_p - tx_ring->rd_p > tx_ring->cnt,
    846		  "XDP TX ring corruption rd_p=%u wr_p=%u cnt=%u\n",
    847		  tx_ring->rd_p, tx_ring->wr_p, tx_ring->cnt);
    848
    849	return done_all;
    850}
    851
    852static bool
    853nfp_nfdk_tx_xdp_buf(struct nfp_net_dp *dp, struct nfp_net_rx_ring *rx_ring,
    854		    struct nfp_net_tx_ring *tx_ring,
    855		    struct nfp_net_rx_buf *rxbuf, unsigned int dma_off,
    856		    unsigned int pkt_len, bool *completed)
    857{
    858	unsigned int dma_map_sz = dp->fl_bufsz - NFP_NET_RX_BUF_NON_DATA;
    859	unsigned int dma_len, type, cnt, dlen_type, tmp_dlen;
    860	struct nfp_nfdk_tx_buf *txbuf;
    861	struct nfp_nfdk_tx_desc *txd;
    862	unsigned int n_descs;
    863	dma_addr_t dma_addr;
    864	int wr_idx;
    865
     866	/* Reject if xdp_adjust_tail grew the packet beyond the DMA area */
    867	if (pkt_len + dma_off > dma_map_sz)
    868		return false;
    869
    870	/* Make sure there's still at least one block available after
    871	 * aligning to block boundary, so that the txds used below
    872	 * won't wrap around the tx_ring.
    873	 */
    874	if (unlikely(nfp_net_tx_full(tx_ring, NFDK_TX_DESC_STOP_CNT))) {
    875		if (!*completed) {
    876			nfp_nfdk_xdp_complete(tx_ring);
    877			*completed = true;
    878		}
    879
    880		if (unlikely(nfp_net_tx_full(tx_ring, NFDK_TX_DESC_STOP_CNT))) {
    881			nfp_nfdk_rx_drop(dp, rx_ring->r_vec, rx_ring, rxbuf,
    882					 NULL);
    883			return false;
    884		}
    885	}
    886
     887	/* Check if we would cross a block boundary */
    888	n_descs = nfp_nfdk_headlen_to_segs(pkt_len);
    889	if ((round_down(tx_ring->wr_p, NFDK_TX_DESC_BLOCK_CNT) !=
    890	     round_down(tx_ring->wr_p + n_descs, NFDK_TX_DESC_BLOCK_CNT)) ||
    891	    ((u32)tx_ring->data_pending + pkt_len >
    892	     NFDK_TX_MAX_DATA_PER_BLOCK)) {
    893		unsigned int nop_slots = D_BLOCK_CPL(tx_ring->wr_p);
    894
    895		wr_idx = D_IDX(tx_ring, tx_ring->wr_p);
    896		txd = &tx_ring->ktxds[wr_idx];
    897		memset(txd, 0,
    898		       array_size(nop_slots, sizeof(struct nfp_nfdk_tx_desc)));
    899
    900		tx_ring->data_pending = 0;
    901		tx_ring->wr_p += nop_slots;
    902		tx_ring->wr_ptr_add += nop_slots;
    903	}
    904
    905	wr_idx = D_IDX(tx_ring, tx_ring->wr_p);
    906
    907	txbuf = &tx_ring->ktxbufs[wr_idx];
    908
    909	txbuf[0].val = (unsigned long)rxbuf->frag | NFDK_TX_BUF_INFO_SOP;
    910	txbuf[1].dma_addr = rxbuf->dma_addr;
    911	/* Note: pkt len not stored */
    912
    913	dma_sync_single_for_device(dp->dev, rxbuf->dma_addr + dma_off,
    914				   pkt_len, DMA_BIDIRECTIONAL);
    915
    916	/* Build TX descriptor */
    917	txd = &tx_ring->ktxds[wr_idx];
    918	dma_len = pkt_len;
    919	dma_addr = rxbuf->dma_addr + dma_off;
    920
    921	if (dma_len < NFDK_TX_MAX_DATA_PER_HEAD)
    922		type = NFDK_DESC_TX_TYPE_SIMPLE;
    923	else
    924		type = NFDK_DESC_TX_TYPE_GATHER;
    925
    926	/* FIELD_PREP() implicitly truncates to chunk */
    927	dma_len -= 1;
    928	dlen_type = FIELD_PREP(NFDK_DESC_TX_DMA_LEN_HEAD, dma_len) |
    929		    FIELD_PREP(NFDK_DESC_TX_TYPE_HEAD, type);
    930
    931	txd->dma_len_type = cpu_to_le16(dlen_type);
    932	nfp_nfdk_tx_desc_set_dma_addr(txd, dma_addr);
    933
    934	tmp_dlen = dlen_type & NFDK_DESC_TX_DMA_LEN_HEAD;
    935	dma_len -= tmp_dlen;
    936	dma_addr += tmp_dlen + 1;
    937	txd++;
    938
    939	while (dma_len > 0) {
    940		dma_len -= 1;
    941		dlen_type = FIELD_PREP(NFDK_DESC_TX_DMA_LEN, dma_len);
    942		txd->dma_len_type = cpu_to_le16(dlen_type);
    943		nfp_nfdk_tx_desc_set_dma_addr(txd, dma_addr);
    944
    945		dlen_type &= NFDK_DESC_TX_DMA_LEN;
    946		dma_len -= dlen_type;
    947		dma_addr += dlen_type + 1;
    948		txd++;
    949	}
    950
    951	(txd - 1)->dma_len_type = cpu_to_le16(dlen_type | NFDK_DESC_TX_EOP);
    952
    953	/* Metadata desc */
    954	txd->raw = 0;
    955	txd++;
    956
    957	cnt = txd - tx_ring->ktxds - wr_idx;
    958	tx_ring->wr_p += cnt;
    959	if (tx_ring->wr_p % NFDK_TX_DESC_BLOCK_CNT)
    960		tx_ring->data_pending += pkt_len;
    961	else
    962		tx_ring->data_pending = 0;
    963
    964	tx_ring->wr_ptr_add += cnt;
    965	return true;
    966}
    967
    968/**
    969 * nfp_nfdk_rx() - receive up to @budget packets on @rx_ring
    970 * @rx_ring:   RX ring to receive from
    971 * @budget:    NAPI budget
    972 *
    973 * Note, this function is separated out from the napi poll function to
    974 * more cleanly separate packet receive code from other bookkeeping
    975 * functions performed in the napi poll function.
    976 *
    977 * Return: Number of packets received.
    978 */
    979static int nfp_nfdk_rx(struct nfp_net_rx_ring *rx_ring, int budget)
    980{
    981	struct nfp_net_r_vector *r_vec = rx_ring->r_vec;
    982	struct nfp_net_dp *dp = &r_vec->nfp_net->dp;
    983	struct nfp_net_tx_ring *tx_ring;
    984	struct bpf_prog *xdp_prog;
    985	bool xdp_tx_cmpl = false;
    986	unsigned int true_bufsz;
    987	struct sk_buff *skb;
    988	int pkts_polled = 0;
    989	struct xdp_buff xdp;
    990	int idx;
    991
    992	xdp_prog = READ_ONCE(dp->xdp_prog);
    993	true_bufsz = xdp_prog ? PAGE_SIZE : dp->fl_bufsz;
    994	xdp_init_buff(&xdp, PAGE_SIZE - NFP_NET_RX_BUF_HEADROOM,
    995		      &rx_ring->xdp_rxq);
    996	tx_ring = r_vec->xdp_ring;
    997
    998	while (pkts_polled < budget) {
    999		unsigned int meta_len, data_len, meta_off, pkt_len, pkt_off;
   1000		struct nfp_net_rx_buf *rxbuf;
   1001		struct nfp_net_rx_desc *rxd;
   1002		struct nfp_meta_parsed meta;
   1003		bool redir_egress = false;
   1004		struct net_device *netdev;
   1005		dma_addr_t new_dma_addr;
   1006		u32 meta_len_xdp = 0;
   1007		void *new_frag;
   1008
   1009		idx = D_IDX(rx_ring, rx_ring->rd_p);
   1010
   1011		rxd = &rx_ring->rxds[idx];
   1012		if (!(rxd->rxd.meta_len_dd & PCIE_DESC_RX_DD))
   1013			break;
   1014
   1015		/* Memory barrier to ensure that we won't do other reads
   1016		 * before the DD bit.
   1017		 */
   1018		dma_rmb();
   1019
   1020		memset(&meta, 0, sizeof(meta));
   1021
   1022		rx_ring->rd_p++;
   1023		pkts_polled++;
   1024
   1025		rxbuf =	&rx_ring->rxbufs[idx];
   1026		/*         < meta_len >
   1027		 *  <-- [rx_offset] -->
   1028		 *  ---------------------------------------------------------
   1029		 * | [XX] |  metadata  |             packet           | XXXX |
   1030		 *  ---------------------------------------------------------
   1031		 *         <---------------- data_len --------------->
   1032		 *
   1033		 * The rx_offset is fixed for all packets, the meta_len can vary
   1034		 * on a packet by packet basis. If rx_offset is set to zero
   1035		 * (_RX_OFFSET_DYNAMIC) metadata starts at the beginning of the
   1036		 * buffer and is immediately followed by the packet (no [XX]).
   1037		 */
   1038		meta_len = rxd->rxd.meta_len_dd & PCIE_DESC_RX_META_LEN_MASK;
   1039		data_len = le16_to_cpu(rxd->rxd.data_len);
   1040		pkt_len = data_len - meta_len;
   1041
   1042		pkt_off = NFP_NET_RX_BUF_HEADROOM + dp->rx_dma_off;
   1043		if (dp->rx_offset == NFP_NET_CFG_RX_OFFSET_DYNAMIC)
   1044			pkt_off += meta_len;
   1045		else
   1046			pkt_off += dp->rx_offset;
   1047		meta_off = pkt_off - meta_len;
   1048
   1049		/* Stats update */
   1050		u64_stats_update_begin(&r_vec->rx_sync);
   1051		r_vec->rx_pkts++;
   1052		r_vec->rx_bytes += pkt_len;
   1053		u64_stats_update_end(&r_vec->rx_sync);
   1054
   1055		if (unlikely(meta_len > NFP_NET_MAX_PREPEND ||
   1056			     (dp->rx_offset && meta_len > dp->rx_offset))) {
   1057			nn_dp_warn(dp, "oversized RX packet metadata %u\n",
   1058				   meta_len);
   1059			nfp_nfdk_rx_drop(dp, r_vec, rx_ring, rxbuf, NULL);
   1060			continue;
   1061		}
   1062
   1063		nfp_net_dma_sync_cpu_rx(dp, rxbuf->dma_addr + meta_off,
   1064					data_len);
   1065
   1066		if (meta_len) {
   1067			if (unlikely(nfp_nfdk_parse_meta(dp->netdev, &meta,
   1068							 rxbuf->frag + meta_off,
   1069							 rxbuf->frag + pkt_off,
   1070							 pkt_len, meta_len))) {
   1071				nn_dp_warn(dp, "invalid RX packet metadata\n");
   1072				nfp_nfdk_rx_drop(dp, r_vec, rx_ring, rxbuf,
   1073						 NULL);
   1074				continue;
   1075			}
   1076		}
   1077
   1078		if (xdp_prog && !meta.portid) {
   1079			void *orig_data = rxbuf->frag + pkt_off;
   1080			unsigned int dma_off;
   1081			int act;
   1082
   1083			xdp_prepare_buff(&xdp,
   1084					 rxbuf->frag + NFP_NET_RX_BUF_HEADROOM,
   1085					 pkt_off - NFP_NET_RX_BUF_HEADROOM,
   1086					 pkt_len, true);
   1087
   1088			act = bpf_prog_run_xdp(xdp_prog, &xdp);
   1089
   1090			pkt_len = xdp.data_end - xdp.data;
   1091			pkt_off += xdp.data - orig_data;
   1092
   1093			switch (act) {
   1094			case XDP_PASS:
   1095				meta_len_xdp = xdp.data - xdp.data_meta;
   1096				break;
   1097			case XDP_TX:
   1098				dma_off = pkt_off - NFP_NET_RX_BUF_HEADROOM;
   1099				if (unlikely(!nfp_nfdk_tx_xdp_buf(dp, rx_ring,
   1100								  tx_ring,
   1101								  rxbuf,
   1102								  dma_off,
   1103								  pkt_len,
   1104								  &xdp_tx_cmpl)))
   1105					trace_xdp_exception(dp->netdev,
   1106							    xdp_prog, act);
   1107				continue;
   1108			default:
   1109				bpf_warn_invalid_xdp_action(dp->netdev, xdp_prog, act);
   1110				fallthrough;
   1111			case XDP_ABORTED:
   1112				trace_xdp_exception(dp->netdev, xdp_prog, act);
   1113				fallthrough;
   1114			case XDP_DROP:
   1115				nfp_nfdk_rx_give_one(dp, rx_ring, rxbuf->frag,
   1116						     rxbuf->dma_addr);
   1117				continue;
   1118			}
   1119		}
   1120
   1121		if (likely(!meta.portid)) {
   1122			netdev = dp->netdev;
   1123		} else if (meta.portid == NFP_META_PORT_ID_CTRL) {
   1124			struct nfp_net *nn = netdev_priv(dp->netdev);
   1125
   1126			nfp_app_ctrl_rx_raw(nn->app, rxbuf->frag + pkt_off,
   1127					    pkt_len);
   1128			nfp_nfdk_rx_give_one(dp, rx_ring, rxbuf->frag,
   1129					     rxbuf->dma_addr);
   1130			continue;
   1131		} else {
   1132			struct nfp_net *nn;
   1133
   1134			nn = netdev_priv(dp->netdev);
   1135			netdev = nfp_app_dev_get(nn->app, meta.portid,
   1136						 &redir_egress);
   1137			if (unlikely(!netdev)) {
   1138				nfp_nfdk_rx_drop(dp, r_vec, rx_ring, rxbuf,
   1139						 NULL);
   1140				continue;
   1141			}
   1142
   1143			if (nfp_netdev_is_nfp_repr(netdev))
   1144				nfp_repr_inc_rx_stats(netdev, pkt_len);
   1145		}
   1146
   1147		skb = build_skb(rxbuf->frag, true_bufsz);
   1148		if (unlikely(!skb)) {
   1149			nfp_nfdk_rx_drop(dp, r_vec, rx_ring, rxbuf, NULL);
   1150			continue;
   1151		}
   1152		new_frag = nfp_nfdk_napi_alloc_one(dp, &new_dma_addr);
   1153		if (unlikely(!new_frag)) {
   1154			nfp_nfdk_rx_drop(dp, r_vec, rx_ring, rxbuf, skb);
   1155			continue;
   1156		}
   1157
   1158		nfp_net_dma_unmap_rx(dp, rxbuf->dma_addr);
   1159
   1160		nfp_nfdk_rx_give_one(dp, rx_ring, new_frag, new_dma_addr);
   1161
   1162		skb_reserve(skb, pkt_off);
   1163		skb_put(skb, pkt_len);
   1164
   1165		skb->mark = meta.mark;
   1166		skb_set_hash(skb, meta.hash, meta.hash_type);
   1167
   1168		skb_record_rx_queue(skb, rx_ring->idx);
   1169		skb->protocol = eth_type_trans(skb, netdev);
   1170
   1171		nfp_nfdk_rx_csum(dp, r_vec, rxd, &meta, skb);
   1172
   1173		if (rxd->rxd.flags & PCIE_DESC_RX_VLAN)
   1174			__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q),
   1175					       le16_to_cpu(rxd->rxd.vlan));
   1176		if (meta_len_xdp)
   1177			skb_metadata_set(skb, meta_len_xdp);
   1178
   1179		if (likely(!redir_egress)) {
   1180			napi_gro_receive(&rx_ring->r_vec->napi, skb);
   1181		} else {
   1182			skb->dev = netdev;
   1183			skb_reset_network_header(skb);
   1184			__skb_push(skb, ETH_HLEN);
   1185			dev_queue_xmit(skb);
   1186		}
   1187	}
   1188
   1189	if (xdp_prog) {
   1190		if (tx_ring->wr_ptr_add)
   1191			nfp_net_tx_xmit_more_flush(tx_ring);
   1192		else if (unlikely(tx_ring->wr_p != tx_ring->rd_p) &&
   1193			 !xdp_tx_cmpl)
   1194			if (!nfp_nfdk_xdp_complete(tx_ring))
   1195				pkts_polled = budget;
   1196	}
   1197
   1198	return pkts_polled;
   1199}
   1200
   1201/**
   1202 * nfp_nfdk_poll() - napi poll function
   1203 * @napi:    NAPI structure
   1204 * @budget:  NAPI budget
   1205 *
   1206 * Return: number of packets polled.
   1207 */
   1208int nfp_nfdk_poll(struct napi_struct *napi, int budget)
   1209{
   1210	struct nfp_net_r_vector *r_vec =
   1211		container_of(napi, struct nfp_net_r_vector, napi);
   1212	unsigned int pkts_polled = 0;
   1213
   1214	if (r_vec->tx_ring)
   1215		nfp_nfdk_tx_complete(r_vec->tx_ring, budget);
   1216	if (r_vec->rx_ring)
   1217		pkts_polled = nfp_nfdk_rx(r_vec->rx_ring, budget);
   1218
   1219	if (pkts_polled < budget)
   1220		if (napi_complete_done(napi, pkts_polled))
   1221			nfp_net_irq_unmask(r_vec->nfp_net, r_vec->irq_entry);
   1222
   1223	if (r_vec->nfp_net->rx_coalesce_adapt_on && r_vec->rx_ring) {
   1224		struct dim_sample dim_sample = {};
   1225		unsigned int start;
   1226		u64 pkts, bytes;
   1227
   1228		do {
   1229			start = u64_stats_fetch_begin(&r_vec->rx_sync);
   1230			pkts = r_vec->rx_pkts;
   1231			bytes = r_vec->rx_bytes;
   1232		} while (u64_stats_fetch_retry(&r_vec->rx_sync, start));
   1233
   1234		dim_update_sample(r_vec->event_ctr, pkts, bytes, &dim_sample);
   1235		net_dim(&r_vec->rx_dim, dim_sample);
   1236	}
   1237
   1238	if (r_vec->nfp_net->tx_coalesce_adapt_on && r_vec->tx_ring) {
   1239		struct dim_sample dim_sample = {};
   1240		unsigned int start;
   1241		u64 pkts, bytes;
   1242
   1243		do {
   1244			start = u64_stats_fetch_begin(&r_vec->tx_sync);
   1245			pkts = r_vec->tx_pkts;
   1246			bytes = r_vec->tx_bytes;
   1247		} while (u64_stats_fetch_retry(&r_vec->tx_sync, start));
   1248
   1249		dim_update_sample(r_vec->event_ctr, pkts, bytes, &dim_sample);
   1250		net_dim(&r_vec->tx_dim, dim_sample);
   1251	}
   1252
   1253	return pkts_polled;
   1254}
   1255
   1256/* Control device data path
   1257 */
   1258
   1259bool
   1260nfp_nfdk_ctrl_tx_one(struct nfp_net *nn, struct nfp_net_r_vector *r_vec,
   1261		     struct sk_buff *skb, bool old)
   1262{
   1263	u32 cnt, tmp_dlen, dlen_type = 0;
   1264	struct nfp_net_tx_ring *tx_ring;
   1265	struct nfp_nfdk_tx_buf *txbuf;
   1266	struct nfp_nfdk_tx_desc *txd;
   1267	unsigned int dma_len, type;
   1268	struct nfp_net_dp *dp;
   1269	dma_addr_t dma_addr;
   1270	u64 metadata = 0;
   1271	int wr_idx;
   1272
   1273	dp = &r_vec->nfp_net->dp;
   1274	tx_ring = r_vec->tx_ring;
   1275
   1276	if (WARN_ON_ONCE(skb_shinfo(skb)->nr_frags)) {
   1277		nn_dp_warn(dp, "Driver's CTRL TX does not implement gather\n");
   1278		goto err_free;
   1279	}
   1280
   1281	/* Don't bother counting frags, assume the worst */
   1282	if (unlikely(nfp_net_tx_full(tx_ring, NFDK_TX_DESC_STOP_CNT))) {
   1283		u64_stats_update_begin(&r_vec->tx_sync);
   1284		r_vec->tx_busy++;
   1285		u64_stats_update_end(&r_vec->tx_sync);
   1286		if (!old)
   1287			__skb_queue_tail(&r_vec->queue, skb);
   1288		else
   1289			__skb_queue_head(&r_vec->queue, skb);
   1290		return NETDEV_TX_BUSY;
   1291	}
   1292
   1293	if (nfp_app_ctrl_has_meta(nn->app)) {
   1294		if (unlikely(skb_headroom(skb) < 8)) {
   1295			nn_dp_warn(dp, "CTRL TX on skb without headroom\n");
   1296			goto err_free;
   1297		}
   1298		metadata = NFDK_DESC_TX_CHAIN_META;
   1299		put_unaligned_be32(NFP_META_PORT_ID_CTRL, skb_push(skb, 4));
   1300		put_unaligned_be32(FIELD_PREP(NFDK_META_LEN, 8) |
   1301				   FIELD_PREP(NFDK_META_FIELDS,
   1302					      NFP_NET_META_PORTID),
   1303				   skb_push(skb, 4));
   1304	}
   1305
   1306	if (nfp_nfdk_tx_maybe_close_block(tx_ring, 0, skb))
   1307		goto err_free;
   1308
   1309	/* DMA map all */
   1310	wr_idx = D_IDX(tx_ring, tx_ring->wr_p);
   1311	txd = &tx_ring->ktxds[wr_idx];
   1312	txbuf = &tx_ring->ktxbufs[wr_idx];
   1313
   1314	dma_len = skb_headlen(skb);
   1315	if (dma_len < NFDK_TX_MAX_DATA_PER_HEAD)
   1316		type = NFDK_DESC_TX_TYPE_SIMPLE;
   1317	else
   1318		type = NFDK_DESC_TX_TYPE_GATHER;
   1319
   1320	dma_addr = dma_map_single(dp->dev, skb->data, dma_len, DMA_TO_DEVICE);
   1321	if (dma_mapping_error(dp->dev, dma_addr))
   1322		goto err_warn_dma;
   1323
   1324	txbuf->skb = skb;
   1325	txbuf++;
   1326
   1327	txbuf->dma_addr = dma_addr;
   1328	txbuf++;
   1329
   1330	dma_len -= 1;
   1331	dlen_type = FIELD_PREP(NFDK_DESC_TX_DMA_LEN_HEAD, dma_len) |
   1332		    FIELD_PREP(NFDK_DESC_TX_TYPE_HEAD, type);
   1333
   1334	txd->dma_len_type = cpu_to_le16(dlen_type);
   1335	nfp_nfdk_tx_desc_set_dma_addr(txd, dma_addr);
   1336
   1337	tmp_dlen = dlen_type & NFDK_DESC_TX_DMA_LEN_HEAD;
   1338	dma_len -= tmp_dlen;
   1339	dma_addr += tmp_dlen + 1;
   1340	txd++;
   1341
   1342	while (dma_len > 0) {
   1343		dma_len -= 1;
   1344		dlen_type = FIELD_PREP(NFDK_DESC_TX_DMA_LEN, dma_len);
   1345		txd->dma_len_type = cpu_to_le16(dlen_type);
   1346		nfp_nfdk_tx_desc_set_dma_addr(txd, dma_addr);
   1347
   1348		dlen_type &= NFDK_DESC_TX_DMA_LEN;
   1349		dma_len -= dlen_type;
   1350		dma_addr += dlen_type + 1;
   1351		txd++;
   1352	}
   1353
   1354	(txd - 1)->dma_len_type = cpu_to_le16(dlen_type | NFDK_DESC_TX_EOP);
   1355
   1356	/* Metadata desc */
   1357	txd->raw = cpu_to_le64(metadata);
   1358	txd++;
   1359
   1360	cnt = txd - tx_ring->ktxds - wr_idx;
   1361	if (unlikely(round_down(wr_idx, NFDK_TX_DESC_BLOCK_CNT) !=
   1362		     round_down(wr_idx + cnt - 1, NFDK_TX_DESC_BLOCK_CNT)))
   1363		goto err_warn_overflow;
   1364
   1365	tx_ring->wr_p += cnt;
   1366	if (tx_ring->wr_p % NFDK_TX_DESC_BLOCK_CNT)
   1367		tx_ring->data_pending += skb->len;
   1368	else
   1369		tx_ring->data_pending = 0;
   1370
   1371	tx_ring->wr_ptr_add += cnt;
   1372	nfp_net_tx_xmit_more_flush(tx_ring);
   1373
   1374	return NETDEV_TX_OK;
   1375
   1376err_warn_overflow:
   1377	WARN_ONCE(1, "unable to fit packet into a descriptor wr_idx:%d head:%d frags:%d cnt:%d",
   1378		  wr_idx, skb_headlen(skb), 0, cnt);
   1379	txbuf--;
   1380	dma_unmap_single(dp->dev, txbuf->dma_addr,
   1381			 skb_headlen(skb), DMA_TO_DEVICE);
   1382	txbuf->raw = 0;
   1383err_warn_dma:
   1384	nn_dp_warn(dp, "Failed to map DMA TX buffer\n");
   1385err_free:
   1386	u64_stats_update_begin(&r_vec->tx_sync);
   1387	r_vec->tx_errors++;
   1388	u64_stats_update_end(&r_vec->tx_sync);
   1389	dev_kfree_skb_any(skb);
   1390	return NETDEV_TX_OK;
   1391}
   1392
   1393static void __nfp_ctrl_tx_queued(struct nfp_net_r_vector *r_vec)
   1394{
   1395	struct sk_buff *skb;
   1396
   1397	while ((skb = __skb_dequeue(&r_vec->queue)))
   1398		if (nfp_nfdk_ctrl_tx_one(r_vec->nfp_net, r_vec, skb, true))
   1399			return;
   1400}
   1401
   1402static bool
   1403nfp_ctrl_meta_ok(struct nfp_net *nn, void *data, unsigned int meta_len)
   1404{
   1405	u32 meta_type, meta_tag;
   1406
   1407	if (!nfp_app_ctrl_has_meta(nn->app))
   1408		return !meta_len;
   1409
   1410	if (meta_len != 8)
   1411		return false;
   1412
   1413	meta_type = get_unaligned_be32(data);
   1414	meta_tag = get_unaligned_be32(data + 4);
   1415
   1416	return (meta_type == NFP_NET_META_PORTID &&
   1417		meta_tag == NFP_META_PORT_ID_CTRL);
   1418}
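/* Editor's note: worked view of the check above.  When the app uses control
 * metadata, a control packet carries an 8-byte prepend of two big-endian
 * words: a field-type word that must decode to NFP_NET_META_PORTID and a
 * port id that must be NFP_META_PORT_ID_CTRL.  This is the receive-side
 * counterpart of the prepend that nfp_nfdk_ctrl_tx_one() pushes before
 * transmitting a control message; anything else (wrong length, wrong type,
 * wrong port id) causes the caller to drop the frame as malformed.
 */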
   1419
   1420static bool
   1421nfp_ctrl_rx_one(struct nfp_net *nn, struct nfp_net_dp *dp,
   1422		struct nfp_net_r_vector *r_vec, struct nfp_net_rx_ring *rx_ring)
   1423{
   1424	unsigned int meta_len, data_len, meta_off, pkt_len, pkt_off;
   1425	struct nfp_net_rx_buf *rxbuf;
   1426	struct nfp_net_rx_desc *rxd;
   1427	dma_addr_t new_dma_addr;
   1428	struct sk_buff *skb;
   1429	void *new_frag;
   1430	int idx;
   1431
   1432	idx = D_IDX(rx_ring, rx_ring->rd_p);
   1433
   1434	rxd = &rx_ring->rxds[idx];
   1435	if (!(rxd->rxd.meta_len_dd & PCIE_DESC_RX_DD))
   1436		return false;
   1437
   1438	/* Memory barrier to ensure that we won't do other reads
   1439	 * before the DD bit.
   1440	 */
   1441	dma_rmb();
   1442
   1443	rx_ring->rd_p++;
   1444
   1445	rxbuf =	&rx_ring->rxbufs[idx];
   1446	meta_len = rxd->rxd.meta_len_dd & PCIE_DESC_RX_META_LEN_MASK;
   1447	data_len = le16_to_cpu(rxd->rxd.data_len);
   1448	pkt_len = data_len - meta_len;
   1449
   1450	pkt_off = NFP_NET_RX_BUF_HEADROOM + dp->rx_dma_off;
   1451	if (dp->rx_offset == NFP_NET_CFG_RX_OFFSET_DYNAMIC)
   1452		pkt_off += meta_len;
   1453	else
   1454		pkt_off += dp->rx_offset;
   1455	meta_off = pkt_off - meta_len;
   1456
   1457	/* Stats update */
   1458	u64_stats_update_begin(&r_vec->rx_sync);
   1459	r_vec->rx_pkts++;
   1460	r_vec->rx_bytes += pkt_len;
   1461	u64_stats_update_end(&r_vec->rx_sync);
   1462
   1463	nfp_net_dma_sync_cpu_rx(dp, rxbuf->dma_addr + meta_off,	data_len);
   1464
   1465	if (unlikely(!nfp_ctrl_meta_ok(nn, rxbuf->frag + meta_off, meta_len))) {
   1466		nn_dp_warn(dp, "incorrect metadata for ctrl packet (%d)\n",
   1467			   meta_len);
   1468		nfp_nfdk_rx_drop(dp, r_vec, rx_ring, rxbuf, NULL);
   1469		return true;
   1470	}
   1471
   1472	skb = build_skb(rxbuf->frag, dp->fl_bufsz);
   1473	if (unlikely(!skb)) {
   1474		nfp_nfdk_rx_drop(dp, r_vec, rx_ring, rxbuf, NULL);
   1475		return true;
   1476	}
   1477	new_frag = nfp_nfdk_napi_alloc_one(dp, &new_dma_addr);
   1478	if (unlikely(!new_frag)) {
   1479		nfp_nfdk_rx_drop(dp, r_vec, rx_ring, rxbuf, skb);
   1480		return true;
   1481	}
   1482
   1483	nfp_net_dma_unmap_rx(dp, rxbuf->dma_addr);
   1484
   1485	nfp_nfdk_rx_give_one(dp, rx_ring, new_frag, new_dma_addr);
   1486
   1487	skb_reserve(skb, pkt_off);
   1488	skb_put(skb, pkt_len);
   1489
   1490	nfp_app_ctrl_rx(nn->app, skb);
   1491
   1492	return true;
   1493}
   1494
   1495static bool nfp_ctrl_rx(struct nfp_net_r_vector *r_vec)
   1496{
   1497	struct nfp_net_rx_ring *rx_ring = r_vec->rx_ring;
   1498	struct nfp_net *nn = r_vec->nfp_net;
   1499	struct nfp_net_dp *dp = &nn->dp;
   1500	unsigned int budget = 512;
   1501
   1502	while (nfp_ctrl_rx_one(nn, dp, r_vec, rx_ring) && budget--)
   1503		continue;
   1504
   1505	return budget;
   1506}
   1507
   1508void nfp_nfdk_ctrl_poll(struct tasklet_struct *t)
   1509{
   1510	struct nfp_net_r_vector *r_vec = from_tasklet(r_vec, t, tasklet);
   1511
   1512	spin_lock(&r_vec->lock);
   1513	nfp_nfdk_tx_complete(r_vec->tx_ring, 0);
   1514	__nfp_ctrl_tx_queued(r_vec);
   1515	spin_unlock(&r_vec->lock);
   1516
   1517	if (nfp_ctrl_rx(r_vec)) {
   1518		nfp_net_irq_unmask(r_vec->nfp_net, r_vec->irq_entry);
   1519	} else {
   1520		tasklet_schedule(&r_vec->tasklet);
   1521		nn_dp_warn(&r_vec->nfp_net->dp,
   1522			   "control message budget exceeded!\n");
   1523	}
   1524}