cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

veth.c (45471B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 *  drivers/net/veth.c
      4 *
      5 *  Copyright (C) 2007 OpenVZ http://openvz.org, SWsoft Inc
      6 *
      7 * Author: Pavel Emelianov <xemul@openvz.org>
      8 * Ethtool interface from: Eric W. Biederman <ebiederm@xmission.com>
      9 *
     10 */
     11
     12#include <linux/netdevice.h>
     13#include <linux/slab.h>
     14#include <linux/ethtool.h>
     15#include <linux/etherdevice.h>
     16#include <linux/u64_stats_sync.h>
     17
     18#include <net/rtnetlink.h>
     19#include <net/dst.h>
     20#include <net/xfrm.h>
     21#include <net/xdp.h>
     22#include <linux/veth.h>
     23#include <linux/module.h>
     24#include <linux/bpf.h>
     25#include <linux/filter.h>
     26#include <linux/ptr_ring.h>
     27#include <linux/bpf_trace.h>
     28#include <linux/net_tstamp.h>
     29
     30#define DRV_NAME	"veth"
     31#define DRV_VERSION	"1.0"
     32
     33#define VETH_XDP_FLAG		BIT(0)
     34#define VETH_RING_SIZE		256
     35#define VETH_XDP_HEADROOM	(XDP_PACKET_HEADROOM + NET_IP_ALIGN)
     36
     37#define VETH_XDP_TX_BULK_SIZE	16
     38#define VETH_XDP_BATCH		16
     39
     40struct veth_stats {
     41	u64	rx_drops;
     42	/* xdp */
     43	u64	xdp_packets;
     44	u64	xdp_bytes;
     45	u64	xdp_redirect;
     46	u64	xdp_drops;
     47	u64	xdp_tx;
     48	u64	xdp_tx_err;
     49	u64	peer_tq_xdp_xmit;
     50	u64	peer_tq_xdp_xmit_err;
     51};
     52
     53struct veth_rq_stats {
     54	struct veth_stats	vs;
     55	struct u64_stats_sync	syncp;
     56};
     57
     58struct veth_rq {
     59	struct napi_struct	xdp_napi;
     60	struct napi_struct __rcu *napi; /* points to xdp_napi when the latter is initialized */
     61	struct net_device	*dev;
     62	struct bpf_prog __rcu	*xdp_prog;
     63	struct xdp_mem_info	xdp_mem;
     64	struct veth_rq_stats	stats;
     65	bool			rx_notify_masked;
     66	struct ptr_ring		xdp_ring;
     67	struct xdp_rxq_info	xdp_rxq;
     68};
     69
     70struct veth_priv {
     71	struct net_device __rcu	*peer;
     72	atomic64_t		dropped;
     73	struct bpf_prog		*_xdp_prog;
     74	struct veth_rq		*rq;
     75	unsigned int		requested_headroom;
     76};
     77
     78struct veth_xdp_tx_bq {
     79	struct xdp_frame *q[VETH_XDP_TX_BULK_SIZE];
     80	unsigned int count;
     81};
     82
     83/*
     84 * ethtool interface
     85 */
     86
     87struct veth_q_stat_desc {
     88	char	desc[ETH_GSTRING_LEN];
     89	size_t	offset;
     90};
     91
     92#define VETH_RQ_STAT(m)	offsetof(struct veth_stats, m)
     93
     94static const struct veth_q_stat_desc veth_rq_stats_desc[] = {
     95	{ "xdp_packets",	VETH_RQ_STAT(xdp_packets) },
     96	{ "xdp_bytes",		VETH_RQ_STAT(xdp_bytes) },
     97	{ "drops",		VETH_RQ_STAT(rx_drops) },
     98	{ "xdp_redirect",	VETH_RQ_STAT(xdp_redirect) },
     99	{ "xdp_drops",		VETH_RQ_STAT(xdp_drops) },
    100	{ "xdp_tx",		VETH_RQ_STAT(xdp_tx) },
    101	{ "xdp_tx_errors",	VETH_RQ_STAT(xdp_tx_err) },
    102};
    103
    104#define VETH_RQ_STATS_LEN	ARRAY_SIZE(veth_rq_stats_desc)
    105
    106static const struct veth_q_stat_desc veth_tq_stats_desc[] = {
    107	{ "xdp_xmit",		VETH_RQ_STAT(peer_tq_xdp_xmit) },
    108	{ "xdp_xmit_errors",	VETH_RQ_STAT(peer_tq_xdp_xmit_err) },
    109};
    110
    111#define VETH_TQ_STATS_LEN	ARRAY_SIZE(veth_tq_stats_desc)
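
An editorial aside, not part of the original source: the two descriptor tables above drive both the ethtool string generation and the stats dump further down. Each entry pairs a name with an offsetof() into struct veth_stats, so veth_get_ethtool_stats() can read any counter as a u64 at stats_base + offset, and veth_get_strings() expands the names into per-queue strings such as "rx_queue_0_xdp_packets". A minimal userspace sketch of the same pattern follows; the struct and field names in it are made up for illustration.

	#include <stddef.h>
	#include <stdio.h>

	/* stand-in for struct veth_stats */
	struct demo_stats {
		unsigned long long packets;
		unsigned long long bytes;
	};

	/* stand-in for veth_rq_stats_desc[] */
	static const struct {
		const char *desc;
		size_t offset;
	} demo_desc[] = {
		{ "packets", offsetof(struct demo_stats, packets) },
		{ "bytes",   offsetof(struct demo_stats, bytes)   },
	};

	int main(void)
	{
		struct demo_stats s = { .packets = 3, .bytes = 180 };
		const void *base = &s;
		size_t i;

		/* mirrors the data[idx + j] = *(u64 *)(stats_base + offset) loop below */
		for (i = 0; i < sizeof(demo_desc) / sizeof(demo_desc[0]); i++)
			printf("%s = %llu\n", demo_desc[i].desc,
			       *(const unsigned long long *)((const char *)base +
							     demo_desc[i].offset));
		return 0;
	}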
    112
    113static struct {
    114	const char string[ETH_GSTRING_LEN];
    115} ethtool_stats_keys[] = {
    116	{ "peer_ifindex" },
    117};
    118
    119static int veth_get_link_ksettings(struct net_device *dev,
    120				   struct ethtool_link_ksettings *cmd)
    121{
    122	cmd->base.speed		= SPEED_10000;
    123	cmd->base.duplex	= DUPLEX_FULL;
    124	cmd->base.port		= PORT_TP;
    125	cmd->base.autoneg	= AUTONEG_DISABLE;
    126	return 0;
    127}
    128
    129static void veth_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
    130{
    131	strlcpy(info->driver, DRV_NAME, sizeof(info->driver));
    132	strlcpy(info->version, DRV_VERSION, sizeof(info->version));
    133}
    134
    135static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf)
    136{
    137	u8 *p = buf;
    138	int i, j;
    139
      140	switch (stringset) {
    141	case ETH_SS_STATS:
    142		memcpy(p, &ethtool_stats_keys, sizeof(ethtool_stats_keys));
    143		p += sizeof(ethtool_stats_keys);
    144		for (i = 0; i < dev->real_num_rx_queues; i++)
    145			for (j = 0; j < VETH_RQ_STATS_LEN; j++)
    146				ethtool_sprintf(&p, "rx_queue_%u_%.18s",
    147						i, veth_rq_stats_desc[j].desc);
    148
    149		for (i = 0; i < dev->real_num_tx_queues; i++)
    150			for (j = 0; j < VETH_TQ_STATS_LEN; j++)
    151				ethtool_sprintf(&p, "tx_queue_%u_%.18s",
    152						i, veth_tq_stats_desc[j].desc);
    153		break;
    154	}
    155}
    156
    157static int veth_get_sset_count(struct net_device *dev, int sset)
    158{
    159	switch (sset) {
    160	case ETH_SS_STATS:
    161		return ARRAY_SIZE(ethtool_stats_keys) +
    162		       VETH_RQ_STATS_LEN * dev->real_num_rx_queues +
    163		       VETH_TQ_STATS_LEN * dev->real_num_tx_queues;
    164	default:
    165		return -EOPNOTSUPP;
    166	}
    167}
    168
    169static void veth_get_ethtool_stats(struct net_device *dev,
    170		struct ethtool_stats *stats, u64 *data)
    171{
    172	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
    173	struct net_device *peer = rtnl_dereference(priv->peer);
    174	int i, j, idx;
    175
    176	data[0] = peer ? peer->ifindex : 0;
    177	idx = 1;
    178	for (i = 0; i < dev->real_num_rx_queues; i++) {
    179		const struct veth_rq_stats *rq_stats = &priv->rq[i].stats;
    180		const void *stats_base = (void *)&rq_stats->vs;
    181		unsigned int start;
    182		size_t offset;
    183
    184		do {
    185			start = u64_stats_fetch_begin_irq(&rq_stats->syncp);
    186			for (j = 0; j < VETH_RQ_STATS_LEN; j++) {
    187				offset = veth_rq_stats_desc[j].offset;
    188				data[idx + j] = *(u64 *)(stats_base + offset);
    189			}
    190		} while (u64_stats_fetch_retry_irq(&rq_stats->syncp, start));
    191		idx += VETH_RQ_STATS_LEN;
    192	}
    193
    194	if (!peer)
    195		return;
    196
    197	rcv_priv = netdev_priv(peer);
    198	for (i = 0; i < peer->real_num_rx_queues; i++) {
    199		const struct veth_rq_stats *rq_stats = &rcv_priv->rq[i].stats;
    200		const void *base = (void *)&rq_stats->vs;
    201		unsigned int start, tx_idx = idx;
    202		size_t offset;
    203
    204		tx_idx += (i % dev->real_num_tx_queues) * VETH_TQ_STATS_LEN;
    205		do {
    206			start = u64_stats_fetch_begin_irq(&rq_stats->syncp);
    207			for (j = 0; j < VETH_TQ_STATS_LEN; j++) {
    208				offset = veth_tq_stats_desc[j].offset;
    209				data[tx_idx + j] += *(u64 *)(base + offset);
    210			}
    211		} while (u64_stats_fetch_retry_irq(&rq_stats->syncp, start));
    212	}
    213}
    214
    215static void veth_get_channels(struct net_device *dev,
    216			      struct ethtool_channels *channels)
    217{
    218	channels->tx_count = dev->real_num_tx_queues;
    219	channels->rx_count = dev->real_num_rx_queues;
    220	channels->max_tx = dev->num_tx_queues;
    221	channels->max_rx = dev->num_rx_queues;
    222}
    223
    224static int veth_set_channels(struct net_device *dev,
    225			     struct ethtool_channels *ch);
    226
    227static const struct ethtool_ops veth_ethtool_ops = {
    228	.get_drvinfo		= veth_get_drvinfo,
    229	.get_link		= ethtool_op_get_link,
    230	.get_strings		= veth_get_strings,
    231	.get_sset_count		= veth_get_sset_count,
    232	.get_ethtool_stats	= veth_get_ethtool_stats,
    233	.get_link_ksettings	= veth_get_link_ksettings,
    234	.get_ts_info		= ethtool_op_get_ts_info,
    235	.get_channels		= veth_get_channels,
    236	.set_channels		= veth_set_channels,
    237};
    238
    239/* general routines */
    240
    241static bool veth_is_xdp_frame(void *ptr)
    242{
    243	return (unsigned long)ptr & VETH_XDP_FLAG;
    244}
    245
    246static struct xdp_frame *veth_ptr_to_xdp(void *ptr)
    247{
    248	return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG);
    249}
    250
    251static void *veth_xdp_to_ptr(struct xdp_frame *xdp)
    252{
    253	return (void *)((unsigned long)xdp | VETH_XDP_FLAG);
    254}
    255
    256static void veth_ptr_free(void *ptr)
    257{
    258	if (veth_is_xdp_frame(ptr))
    259		xdp_return_frame(veth_ptr_to_xdp(ptr));
    260	else
    261		kfree_skb(ptr);
    262}
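
An editorial aside, not part of the original source: the xdp_ring carries two kinds of pointers, struct xdp_frame entries queued by ndo_xdp_xmit and struct sk_buff entries queued by ndo_start_xmit. The helpers above tell them apart by setting VETH_XDP_FLAG (bit 0) on xdp_frame pointers, which is safe because both object types are at least word-aligned. A minimal userspace sketch of the same tag/untag round trip, with made-up names:

	#include <assert.h>
	#include <stdint.h>

	#define DEMO_XDP_FLAG 0x1UL	/* plays the role of VETH_XDP_FLAG */

	static void *demo_tag(void *frame)   { return (void *)((uintptr_t)frame | DEMO_XDP_FLAG); }
	static void *demo_untag(void *ptr)   { return (void *)((uintptr_t)ptr & ~DEMO_XDP_FLAG); }
	static int demo_is_tagged(void *ptr) { return (uintptr_t)ptr & DEMO_XDP_FLAG; }

	int main(void)
	{
		long frame;			/* stands in for a struct xdp_frame */
		void *ptr = demo_tag(&frame);	/* like veth_xdp_to_ptr() */

		assert(demo_is_tagged(ptr));		   /* like veth_is_xdp_frame() */
		assert(demo_untag(ptr) == (void *)&frame); /* like veth_ptr_to_xdp() */
		return 0;
	}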
    263
    264static void __veth_xdp_flush(struct veth_rq *rq)
    265{
    266	/* Write ptr_ring before reading rx_notify_masked */
    267	smp_mb();
    268	if (!READ_ONCE(rq->rx_notify_masked) &&
    269	    napi_schedule_prep(&rq->xdp_napi)) {
    270		WRITE_ONCE(rq->rx_notify_masked, true);
    271		__napi_schedule(&rq->xdp_napi);
    272	}
    273}
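
An editorial note, not part of the original source: the smp_mb() above pairs with the smp_store_mb(rq->rx_notify_masked, false) in veth_poll(). The producer first publishes the new ring entry and only then reads rx_notify_masked; the consumer first clears rx_notify_masked and only then re-checks the ring before completing NAPI. With both orderings in place at least one side is guaranteed to notice a late entry, so a packet cannot be left sitting in xdp_ring while NAPI stays idle. This reading follows the two in-code comments ("Write ptr_ring before reading rx_notify_masked" here and "Write rx_notify_masked before reading ptr_ring" in veth_poll()).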
    274
    275static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb)
    276{
    277	if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) {
    278		dev_kfree_skb_any(skb);
    279		return NET_RX_DROP;
    280	}
    281
    282	return NET_RX_SUCCESS;
    283}
    284
    285static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb,
    286			    struct veth_rq *rq, bool xdp)
    287{
    288	return __dev_forward_skb(dev, skb) ?: xdp ?
    289		veth_xdp_rx(rq, skb) :
    290		__netif_rx(skb);
    291}
    292
      293/* Return true if the specified skb has a chance of GRO aggregation.
      294 * Don't strive for accuracy, but try to avoid GRO overhead in the most
      295 * common scenarios.
      296 * When XDP is enabled, all traffic is considered eligible, as the xmit
      297 * device has TSO off.
      298 * When TSO is enabled on the xmit device, we are likely interested only
      299 * in UDP aggregation, so explicitly check whether the skb is suspected
      300 * to belong to locally generated UDP traffic (the sock_wfree destructor
      301 * is used by UDP, ICMP and XDP sockets).
      302 */
    303static bool veth_skb_is_eligible_for_gro(const struct net_device *dev,
    304					 const struct net_device *rcv,
    305					 const struct sk_buff *skb)
    306{
    307	return !(dev->features & NETIF_F_ALL_TSO) ||
    308		(skb->destructor == sock_wfree &&
    309		 rcv->features & (NETIF_F_GRO_FRAGLIST | NETIF_F_GRO_UDP_FWD));
    310}
    311
    312static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
    313{
    314	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
    315	struct netdev_queue *queue = NULL;
    316	struct veth_rq *rq = NULL;
    317	struct net_device *rcv;
    318	int length = skb->len;
    319	bool use_napi = false;
    320	int rxq;
    321
    322	rcu_read_lock();
    323	rcv = rcu_dereference(priv->peer);
    324	if (unlikely(!rcv) || !pskb_may_pull(skb, ETH_HLEN)) {
    325		kfree_skb(skb);
    326		goto drop;
    327	}
    328
    329	rcv_priv = netdev_priv(rcv);
    330	rxq = skb_get_queue_mapping(skb);
    331	if (rxq < rcv->real_num_rx_queues) {
    332		rq = &rcv_priv->rq[rxq];
    333		queue = netdev_get_tx_queue(dev, rxq);
    334
      335		/* The napi pointer is available when an XDP program is
      336		 * attached or when GRO is enabled.
      337		 * Don't bother with napi/GRO if the skb can't be aggregated.
      338		 */
    339		use_napi = rcu_access_pointer(rq->napi) &&
    340			   veth_skb_is_eligible_for_gro(dev, rcv, skb);
    341	}
    342
    343	skb_tx_timestamp(skb);
    344	if (likely(veth_forward_skb(rcv, skb, rq, use_napi) == NET_RX_SUCCESS)) {
    345		if (queue)
    346			txq_trans_cond_update(queue);
    347		if (!use_napi)
    348			dev_lstats_add(dev, length);
    349	} else {
    350drop:
    351		atomic64_inc(&priv->dropped);
    352	}
    353
    354	if (use_napi)
    355		__veth_xdp_flush(rq);
    356
    357	rcu_read_unlock();
    358
    359	return NETDEV_TX_OK;
    360}
    361
    362static u64 veth_stats_tx(struct net_device *dev, u64 *packets, u64 *bytes)
    363{
    364	struct veth_priv *priv = netdev_priv(dev);
    365
    366	dev_lstats_read(dev, packets, bytes);
    367	return atomic64_read(&priv->dropped);
    368}
    369
    370static void veth_stats_rx(struct veth_stats *result, struct net_device *dev)
    371{
    372	struct veth_priv *priv = netdev_priv(dev);
    373	int i;
    374
    375	result->peer_tq_xdp_xmit_err = 0;
    376	result->xdp_packets = 0;
    377	result->xdp_tx_err = 0;
    378	result->xdp_bytes = 0;
    379	result->rx_drops = 0;
    380	for (i = 0; i < dev->num_rx_queues; i++) {
    381		u64 packets, bytes, drops, xdp_tx_err, peer_tq_xdp_xmit_err;
    382		struct veth_rq_stats *stats = &priv->rq[i].stats;
    383		unsigned int start;
    384
    385		do {
    386			start = u64_stats_fetch_begin_irq(&stats->syncp);
    387			peer_tq_xdp_xmit_err = stats->vs.peer_tq_xdp_xmit_err;
    388			xdp_tx_err = stats->vs.xdp_tx_err;
    389			packets = stats->vs.xdp_packets;
    390			bytes = stats->vs.xdp_bytes;
    391			drops = stats->vs.rx_drops;
    392		} while (u64_stats_fetch_retry_irq(&stats->syncp, start));
    393		result->peer_tq_xdp_xmit_err += peer_tq_xdp_xmit_err;
    394		result->xdp_tx_err += xdp_tx_err;
    395		result->xdp_packets += packets;
    396		result->xdp_bytes += bytes;
    397		result->rx_drops += drops;
    398	}
    399}
    400
    401static void veth_get_stats64(struct net_device *dev,
    402			     struct rtnl_link_stats64 *tot)
    403{
    404	struct veth_priv *priv = netdev_priv(dev);
    405	struct net_device *peer;
    406	struct veth_stats rx;
    407	u64 packets, bytes;
    408
    409	tot->tx_dropped = veth_stats_tx(dev, &packets, &bytes);
    410	tot->tx_bytes = bytes;
    411	tot->tx_packets = packets;
    412
    413	veth_stats_rx(&rx, dev);
    414	tot->tx_dropped += rx.xdp_tx_err;
    415	tot->rx_dropped = rx.rx_drops + rx.peer_tq_xdp_xmit_err;
    416	tot->rx_bytes = rx.xdp_bytes;
    417	tot->rx_packets = rx.xdp_packets;
    418
    419	rcu_read_lock();
    420	peer = rcu_dereference(priv->peer);
    421	if (peer) {
    422		veth_stats_tx(peer, &packets, &bytes);
    423		tot->rx_bytes += bytes;
    424		tot->rx_packets += packets;
    425
    426		veth_stats_rx(&rx, peer);
    427		tot->tx_dropped += rx.peer_tq_xdp_xmit_err;
    428		tot->rx_dropped += rx.xdp_tx_err;
    429		tot->tx_bytes += rx.xdp_bytes;
    430		tot->tx_packets += rx.xdp_packets;
    431	}
    432	rcu_read_unlock();
    433}
    434
    435/* fake multicast ability */
    436static void veth_set_multicast_list(struct net_device *dev)
    437{
    438}
    439
    440static int veth_select_rxq(struct net_device *dev)
    441{
    442	return smp_processor_id() % dev->real_num_rx_queues;
    443}
    444
    445static struct net_device *veth_peer_dev(struct net_device *dev)
    446{
    447	struct veth_priv *priv = netdev_priv(dev);
    448
    449	/* Callers must be under RCU read side. */
    450	return rcu_dereference(priv->peer);
    451}
    452
    453static int veth_xdp_xmit(struct net_device *dev, int n,
    454			 struct xdp_frame **frames,
    455			 u32 flags, bool ndo_xmit)
    456{
    457	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
    458	int i, ret = -ENXIO, nxmit = 0;
    459	struct net_device *rcv;
    460	unsigned int max_len;
    461	struct veth_rq *rq;
    462
    463	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
    464		return -EINVAL;
    465
    466	rcu_read_lock();
    467	rcv = rcu_dereference(priv->peer);
    468	if (unlikely(!rcv))
    469		goto out;
    470
    471	rcv_priv = netdev_priv(rcv);
    472	rq = &rcv_priv->rq[veth_select_rxq(rcv)];
    473	/* The napi pointer is set if NAPI is enabled, which ensures that
    474	 * xdp_ring is initialized on receive side and the peer device is up.
    475	 */
    476	if (!rcu_access_pointer(rq->napi))
    477		goto out;
    478
    479	max_len = rcv->mtu + rcv->hard_header_len + VLAN_HLEN;
    480
    481	spin_lock(&rq->xdp_ring.producer_lock);
    482	for (i = 0; i < n; i++) {
    483		struct xdp_frame *frame = frames[i];
    484		void *ptr = veth_xdp_to_ptr(frame);
    485
    486		if (unlikely(xdp_get_frame_len(frame) > max_len ||
    487			     __ptr_ring_produce(&rq->xdp_ring, ptr)))
    488			break;
    489		nxmit++;
    490	}
    491	spin_unlock(&rq->xdp_ring.producer_lock);
    492
    493	if (flags & XDP_XMIT_FLUSH)
    494		__veth_xdp_flush(rq);
    495
    496	ret = nxmit;
    497	if (ndo_xmit) {
    498		u64_stats_update_begin(&rq->stats.syncp);
    499		rq->stats.vs.peer_tq_xdp_xmit += nxmit;
    500		rq->stats.vs.peer_tq_xdp_xmit_err += n - nxmit;
    501		u64_stats_update_end(&rq->stats.syncp);
    502	}
    503
    504out:
    505	rcu_read_unlock();
    506
    507	return ret;
    508}
    509
    510static int veth_ndo_xdp_xmit(struct net_device *dev, int n,
    511			     struct xdp_frame **frames, u32 flags)
    512{
    513	int err;
    514
    515	err = veth_xdp_xmit(dev, n, frames, flags, true);
    516	if (err < 0) {
    517		struct veth_priv *priv = netdev_priv(dev);
    518
    519		atomic64_add(n, &priv->dropped);
    520	}
    521
    522	return err;
    523}
    524
    525static void veth_xdp_flush_bq(struct veth_rq *rq, struct veth_xdp_tx_bq *bq)
    526{
    527	int sent, i, err = 0, drops;
    528
    529	sent = veth_xdp_xmit(rq->dev, bq->count, bq->q, 0, false);
    530	if (sent < 0) {
    531		err = sent;
    532		sent = 0;
    533	}
    534
    535	for (i = sent; unlikely(i < bq->count); i++)
    536		xdp_return_frame(bq->q[i]);
    537
    538	drops = bq->count - sent;
    539	trace_xdp_bulk_tx(rq->dev, sent, drops, err);
    540
    541	u64_stats_update_begin(&rq->stats.syncp);
    542	rq->stats.vs.xdp_tx += sent;
    543	rq->stats.vs.xdp_tx_err += drops;
    544	u64_stats_update_end(&rq->stats.syncp);
    545
    546	bq->count = 0;
    547}
    548
    549static void veth_xdp_flush(struct veth_rq *rq, struct veth_xdp_tx_bq *bq)
    550{
    551	struct veth_priv *rcv_priv, *priv = netdev_priv(rq->dev);
    552	struct net_device *rcv;
    553	struct veth_rq *rcv_rq;
    554
    555	rcu_read_lock();
    556	veth_xdp_flush_bq(rq, bq);
    557	rcv = rcu_dereference(priv->peer);
    558	if (unlikely(!rcv))
    559		goto out;
    560
    561	rcv_priv = netdev_priv(rcv);
    562	rcv_rq = &rcv_priv->rq[veth_select_rxq(rcv)];
    563	/* xdp_ring is initialized on receive side? */
    564	if (unlikely(!rcu_access_pointer(rcv_rq->xdp_prog)))
    565		goto out;
    566
    567	__veth_xdp_flush(rcv_rq);
    568out:
    569	rcu_read_unlock();
    570}
    571
    572static int veth_xdp_tx(struct veth_rq *rq, struct xdp_buff *xdp,
    573		       struct veth_xdp_tx_bq *bq)
    574{
    575	struct xdp_frame *frame = xdp_convert_buff_to_frame(xdp);
    576
    577	if (unlikely(!frame))
    578		return -EOVERFLOW;
    579
    580	if (unlikely(bq->count == VETH_XDP_TX_BULK_SIZE))
    581		veth_xdp_flush_bq(rq, bq);
    582
    583	bq->q[bq->count++] = frame;
    584
    585	return 0;
    586}
    587
    588static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq,
    589					  struct xdp_frame *frame,
    590					  struct veth_xdp_tx_bq *bq,
    591					  struct veth_stats *stats)
    592{
    593	struct xdp_frame orig_frame;
    594	struct bpf_prog *xdp_prog;
    595
    596	rcu_read_lock();
    597	xdp_prog = rcu_dereference(rq->xdp_prog);
    598	if (likely(xdp_prog)) {
    599		struct xdp_buff xdp;
    600		u32 act;
    601
    602		xdp_convert_frame_to_buff(frame, &xdp);
    603		xdp.rxq = &rq->xdp_rxq;
    604
    605		act = bpf_prog_run_xdp(xdp_prog, &xdp);
    606
    607		switch (act) {
    608		case XDP_PASS:
    609			if (xdp_update_frame_from_buff(&xdp, frame))
    610				goto err_xdp;
    611			break;
    612		case XDP_TX:
    613			orig_frame = *frame;
    614			xdp.rxq->mem = frame->mem;
    615			if (unlikely(veth_xdp_tx(rq, &xdp, bq) < 0)) {
    616				trace_xdp_exception(rq->dev, xdp_prog, act);
    617				frame = &orig_frame;
    618				stats->rx_drops++;
    619				goto err_xdp;
    620			}
    621			stats->xdp_tx++;
    622			rcu_read_unlock();
    623			goto xdp_xmit;
    624		case XDP_REDIRECT:
    625			orig_frame = *frame;
    626			xdp.rxq->mem = frame->mem;
    627			if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) {
    628				frame = &orig_frame;
    629				stats->rx_drops++;
    630				goto err_xdp;
    631			}
    632			stats->xdp_redirect++;
    633			rcu_read_unlock();
    634			goto xdp_xmit;
    635		default:
    636			bpf_warn_invalid_xdp_action(rq->dev, xdp_prog, act);
    637			fallthrough;
    638		case XDP_ABORTED:
    639			trace_xdp_exception(rq->dev, xdp_prog, act);
    640			fallthrough;
    641		case XDP_DROP:
    642			stats->xdp_drops++;
    643			goto err_xdp;
    644		}
    645	}
    646	rcu_read_unlock();
    647
    648	return frame;
    649err_xdp:
    650	rcu_read_unlock();
    651	xdp_return_frame(frame);
    652xdp_xmit:
    653	return NULL;
    654}
    655
      656/* The frames array contains at most VETH_XDP_BATCH entries */
    657static void veth_xdp_rcv_bulk_skb(struct veth_rq *rq, void **frames,
    658				  int n_xdpf, struct veth_xdp_tx_bq *bq,
    659				  struct veth_stats *stats)
    660{
    661	void *skbs[VETH_XDP_BATCH];
    662	int i;
    663
    664	if (xdp_alloc_skb_bulk(skbs, n_xdpf,
    665			       GFP_ATOMIC | __GFP_ZERO) < 0) {
    666		for (i = 0; i < n_xdpf; i++)
    667			xdp_return_frame(frames[i]);
    668		stats->rx_drops += n_xdpf;
    669
    670		return;
    671	}
    672
    673	for (i = 0; i < n_xdpf; i++) {
    674		struct sk_buff *skb = skbs[i];
    675
    676		skb = __xdp_build_skb_from_frame(frames[i], skb,
    677						 rq->dev);
    678		if (!skb) {
    679			xdp_return_frame(frames[i]);
    680			stats->rx_drops++;
    681			continue;
    682		}
    683		napi_gro_receive(&rq->xdp_napi, skb);
    684	}
    685}
    686
    687static void veth_xdp_get(struct xdp_buff *xdp)
    688{
    689	struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
    690	int i;
    691
    692	get_page(virt_to_page(xdp->data));
    693	if (likely(!xdp_buff_has_frags(xdp)))
    694		return;
    695
    696	for (i = 0; i < sinfo->nr_frags; i++)
    697		__skb_frag_ref(&sinfo->frags[i]);
    698}
    699
    700static int veth_convert_skb_to_xdp_buff(struct veth_rq *rq,
    701					struct xdp_buff *xdp,
    702					struct sk_buff **pskb)
    703{
    704	struct sk_buff *skb = *pskb;
    705	u32 frame_sz;
    706
    707	if (skb_shared(skb) || skb_head_is_locked(skb) ||
    708	    skb_shinfo(skb)->nr_frags) {
    709		u32 size, len, max_head_size, off;
    710		struct sk_buff *nskb;
    711		struct page *page;
    712		int i, head_off;
    713
      714		/* We need a private copy of the skb and data buffers since
      715		 * the eBPF program can modify them. We segment the original skb
      716		 * into order-0 pages without linearizing it.
      717		 *
      718		 * Make sure we have enough space for the linear and paged areas.
      719		 */
    720		max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE -
    721						  VETH_XDP_HEADROOM);
    722		if (skb->len > PAGE_SIZE * MAX_SKB_FRAGS + max_head_size)
    723			goto drop;
    724
    725		/* Allocate skb head */
    726		page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
    727		if (!page)
    728			goto drop;
    729
    730		nskb = build_skb(page_address(page), PAGE_SIZE);
    731		if (!nskb) {
    732			put_page(page);
    733			goto drop;
    734		}
    735
    736		skb_reserve(nskb, VETH_XDP_HEADROOM);
    737		size = min_t(u32, skb->len, max_head_size);
    738		if (skb_copy_bits(skb, 0, nskb->data, size)) {
    739			consume_skb(nskb);
    740			goto drop;
    741		}
    742		skb_put(nskb, size);
    743
    744		skb_copy_header(nskb, skb);
    745		head_off = skb_headroom(nskb) - skb_headroom(skb);
    746		skb_headers_offset_update(nskb, head_off);
    747
    748		/* Allocate paged area of new skb */
    749		off = size;
    750		len = skb->len - off;
    751
    752		for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) {
    753			page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
    754			if (!page) {
    755				consume_skb(nskb);
    756				goto drop;
    757			}
    758
    759			size = min_t(u32, len, PAGE_SIZE);
    760			skb_add_rx_frag(nskb, i, page, 0, size, PAGE_SIZE);
    761			if (skb_copy_bits(skb, off, page_address(page),
    762					  size)) {
    763				consume_skb(nskb);
    764				goto drop;
    765			}
    766
    767			len -= size;
    768			off += size;
    769		}
    770
    771		consume_skb(skb);
    772		skb = nskb;
    773	} else if (skb_headroom(skb) < XDP_PACKET_HEADROOM &&
    774		   pskb_expand_head(skb, VETH_XDP_HEADROOM, 0, GFP_ATOMIC)) {
    775		goto drop;
    776	}
    777
      778	/* SKB "head" area always has tailroom for skb_shared_info */
    779	frame_sz = skb_end_pointer(skb) - skb->head;
    780	frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
    781	xdp_init_buff(xdp, frame_sz, &rq->xdp_rxq);
    782	xdp_prepare_buff(xdp, skb->head, skb_headroom(skb),
    783			 skb_headlen(skb), true);
    784
    785	if (skb_is_nonlinear(skb)) {
    786		skb_shinfo(skb)->xdp_frags_size = skb->data_len;
    787		xdp_buff_set_frags_flag(xdp);
    788	} else {
    789		xdp_buff_clear_frags_flag(xdp);
    790	}
    791	*pskb = skb;
    792
    793	return 0;
    794drop:
    795	consume_skb(skb);
    796	*pskb = NULL;
    797
    798	return -ENOMEM;
    799}
    800
    801static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
    802					struct sk_buff *skb,
    803					struct veth_xdp_tx_bq *bq,
    804					struct veth_stats *stats)
    805{
    806	void *orig_data, *orig_data_end;
    807	struct bpf_prog *xdp_prog;
    808	struct xdp_buff xdp;
    809	u32 act, metalen;
    810	int off;
    811
    812	skb_prepare_for_gro(skb);
    813
    814	rcu_read_lock();
    815	xdp_prog = rcu_dereference(rq->xdp_prog);
    816	if (unlikely(!xdp_prog)) {
    817		rcu_read_unlock();
    818		goto out;
    819	}
    820
    821	__skb_push(skb, skb->data - skb_mac_header(skb));
    822	if (veth_convert_skb_to_xdp_buff(rq, &xdp, &skb))
    823		goto drop;
    824
    825	orig_data = xdp.data;
    826	orig_data_end = xdp.data_end;
    827
    828	act = bpf_prog_run_xdp(xdp_prog, &xdp);
    829
    830	switch (act) {
    831	case XDP_PASS:
    832		break;
    833	case XDP_TX:
    834		veth_xdp_get(&xdp);
    835		consume_skb(skb);
    836		xdp.rxq->mem = rq->xdp_mem;
    837		if (unlikely(veth_xdp_tx(rq, &xdp, bq) < 0)) {
    838			trace_xdp_exception(rq->dev, xdp_prog, act);
    839			stats->rx_drops++;
    840			goto err_xdp;
    841		}
    842		stats->xdp_tx++;
    843		rcu_read_unlock();
    844		goto xdp_xmit;
    845	case XDP_REDIRECT:
    846		veth_xdp_get(&xdp);
    847		consume_skb(skb);
    848		xdp.rxq->mem = rq->xdp_mem;
    849		if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) {
    850			stats->rx_drops++;
    851			goto err_xdp;
    852		}
    853		stats->xdp_redirect++;
    854		rcu_read_unlock();
    855		goto xdp_xmit;
    856	default:
    857		bpf_warn_invalid_xdp_action(rq->dev, xdp_prog, act);
    858		fallthrough;
    859	case XDP_ABORTED:
    860		trace_xdp_exception(rq->dev, xdp_prog, act);
    861		fallthrough;
    862	case XDP_DROP:
    863		stats->xdp_drops++;
    864		goto xdp_drop;
    865	}
    866	rcu_read_unlock();
    867
    868	/* check if bpf_xdp_adjust_head was used */
    869	off = orig_data - xdp.data;
    870	if (off > 0)
    871		__skb_push(skb, off);
    872	else if (off < 0)
    873		__skb_pull(skb, -off);
    874
    875	skb_reset_mac_header(skb);
    876
    877	/* check if bpf_xdp_adjust_tail was used */
    878	off = xdp.data_end - orig_data_end;
    879	if (off != 0)
    880		__skb_put(skb, off); /* positive on grow, negative on shrink */
    881
      882	/* XDP frag metadata (e.g. nr_frags) is updated in eBPF helpers
      883	 * (e.g. bpf_xdp_adjust_tail), so we need to update data_len here.
      884	 */
    885	if (xdp_buff_has_frags(&xdp))
    886		skb->data_len = skb_shinfo(skb)->xdp_frags_size;
    887	else
    888		skb->data_len = 0;
    889
    890	skb->protocol = eth_type_trans(skb, rq->dev);
    891
    892	metalen = xdp.data - xdp.data_meta;
    893	if (metalen)
    894		skb_metadata_set(skb, metalen);
    895out:
    896	return skb;
    897drop:
    898	stats->rx_drops++;
    899xdp_drop:
    900	rcu_read_unlock();
    901	kfree_skb(skb);
    902	return NULL;
    903err_xdp:
    904	rcu_read_unlock();
    905	xdp_return_buff(&xdp);
    906xdp_xmit:
    907	return NULL;
    908}
    909
    910static int veth_xdp_rcv(struct veth_rq *rq, int budget,
    911			struct veth_xdp_tx_bq *bq,
    912			struct veth_stats *stats)
    913{
    914	int i, done = 0, n_xdpf = 0;
    915	void *xdpf[VETH_XDP_BATCH];
    916
    917	for (i = 0; i < budget; i++) {
    918		void *ptr = __ptr_ring_consume(&rq->xdp_ring);
    919
    920		if (!ptr)
    921			break;
    922
    923		if (veth_is_xdp_frame(ptr)) {
    924			/* ndo_xdp_xmit */
    925			struct xdp_frame *frame = veth_ptr_to_xdp(ptr);
    926
    927			stats->xdp_bytes += xdp_get_frame_len(frame);
    928			frame = veth_xdp_rcv_one(rq, frame, bq, stats);
    929			if (frame) {
    930				/* XDP_PASS */
    931				xdpf[n_xdpf++] = frame;
    932				if (n_xdpf == VETH_XDP_BATCH) {
    933					veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf,
    934							      bq, stats);
    935					n_xdpf = 0;
    936				}
    937			}
    938		} else {
    939			/* ndo_start_xmit */
    940			struct sk_buff *skb = ptr;
    941
    942			stats->xdp_bytes += skb->len;
    943			skb = veth_xdp_rcv_skb(rq, skb, bq, stats);
    944			if (skb) {
    945				if (skb_shared(skb) || skb_unclone(skb, GFP_ATOMIC))
    946					netif_receive_skb(skb);
    947				else
    948					napi_gro_receive(&rq->xdp_napi, skb);
    949			}
    950		}
    951		done++;
    952	}
    953
    954	if (n_xdpf)
    955		veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, bq, stats);
    956
    957	u64_stats_update_begin(&rq->stats.syncp);
    958	rq->stats.vs.xdp_redirect += stats->xdp_redirect;
    959	rq->stats.vs.xdp_bytes += stats->xdp_bytes;
    960	rq->stats.vs.xdp_drops += stats->xdp_drops;
    961	rq->stats.vs.rx_drops += stats->rx_drops;
    962	rq->stats.vs.xdp_packets += done;
    963	u64_stats_update_end(&rq->stats.syncp);
    964
    965	return done;
    966}
    967
    968static int veth_poll(struct napi_struct *napi, int budget)
    969{
    970	struct veth_rq *rq =
    971		container_of(napi, struct veth_rq, xdp_napi);
    972	struct veth_stats stats = {};
    973	struct veth_xdp_tx_bq bq;
    974	int done;
    975
    976	bq.count = 0;
    977
    978	xdp_set_return_frame_no_direct();
    979	done = veth_xdp_rcv(rq, budget, &bq, &stats);
    980
    981	if (done < budget && napi_complete_done(napi, done)) {
    982		/* Write rx_notify_masked before reading ptr_ring */
    983		smp_store_mb(rq->rx_notify_masked, false);
    984		if (unlikely(!__ptr_ring_empty(&rq->xdp_ring))) {
    985			if (napi_schedule_prep(&rq->xdp_napi)) {
    986				WRITE_ONCE(rq->rx_notify_masked, true);
    987				__napi_schedule(&rq->xdp_napi);
    988			}
    989		}
    990	}
    991
    992	if (stats.xdp_tx > 0)
    993		veth_xdp_flush(rq, &bq);
    994	if (stats.xdp_redirect > 0)
    995		xdp_do_flush();
    996	xdp_clear_return_frame_no_direct();
    997
    998	return done;
    999}
   1000
   1001static int __veth_napi_enable_range(struct net_device *dev, int start, int end)
   1002{
   1003	struct veth_priv *priv = netdev_priv(dev);
   1004	int err, i;
   1005
   1006	for (i = start; i < end; i++) {
   1007		struct veth_rq *rq = &priv->rq[i];
   1008
   1009		err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL);
   1010		if (err)
   1011			goto err_xdp_ring;
   1012	}
   1013
   1014	for (i = start; i < end; i++) {
   1015		struct veth_rq *rq = &priv->rq[i];
   1016
   1017		napi_enable(&rq->xdp_napi);
   1018		rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi);
   1019	}
   1020
   1021	return 0;
   1022
   1023err_xdp_ring:
   1024	for (i--; i >= start; i--)
   1025		ptr_ring_cleanup(&priv->rq[i].xdp_ring, veth_ptr_free);
   1026
   1027	return err;
   1028}
   1029
   1030static int __veth_napi_enable(struct net_device *dev)
   1031{
   1032	return __veth_napi_enable_range(dev, 0, dev->real_num_rx_queues);
   1033}
   1034
   1035static void veth_napi_del_range(struct net_device *dev, int start, int end)
   1036{
   1037	struct veth_priv *priv = netdev_priv(dev);
   1038	int i;
   1039
   1040	for (i = start; i < end; i++) {
   1041		struct veth_rq *rq = &priv->rq[i];
   1042
   1043		rcu_assign_pointer(priv->rq[i].napi, NULL);
   1044		napi_disable(&rq->xdp_napi);
   1045		__netif_napi_del(&rq->xdp_napi);
   1046	}
   1047	synchronize_net();
   1048
   1049	for (i = start; i < end; i++) {
   1050		struct veth_rq *rq = &priv->rq[i];
   1051
   1052		rq->rx_notify_masked = false;
   1053		ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free);
   1054	}
   1055}
   1056
   1057static void veth_napi_del(struct net_device *dev)
   1058{
   1059	veth_napi_del_range(dev, 0, dev->real_num_rx_queues);
   1060}
   1061
   1062static bool veth_gro_requested(const struct net_device *dev)
   1063{
   1064	return !!(dev->wanted_features & NETIF_F_GRO);
   1065}
   1066
   1067static int veth_enable_xdp_range(struct net_device *dev, int start, int end,
   1068				 bool napi_already_on)
   1069{
   1070	struct veth_priv *priv = netdev_priv(dev);
   1071	int err, i;
   1072
   1073	for (i = start; i < end; i++) {
   1074		struct veth_rq *rq = &priv->rq[i];
   1075
   1076		if (!napi_already_on)
   1077			netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT);
   1078		err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i, rq->xdp_napi.napi_id);
   1079		if (err < 0)
   1080			goto err_rxq_reg;
   1081
   1082		err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
   1083						 MEM_TYPE_PAGE_SHARED,
   1084						 NULL);
   1085		if (err < 0)
   1086			goto err_reg_mem;
   1087
   1088		/* Save original mem info as it can be overwritten */
   1089		rq->xdp_mem = rq->xdp_rxq.mem;
   1090	}
   1091	return 0;
   1092
   1093err_reg_mem:
   1094	xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq);
   1095err_rxq_reg:
   1096	for (i--; i >= start; i--) {
   1097		struct veth_rq *rq = &priv->rq[i];
   1098
   1099		xdp_rxq_info_unreg(&rq->xdp_rxq);
   1100		if (!napi_already_on)
   1101			netif_napi_del(&rq->xdp_napi);
   1102	}
   1103
   1104	return err;
   1105}
   1106
   1107static void veth_disable_xdp_range(struct net_device *dev, int start, int end,
   1108				   bool delete_napi)
   1109{
   1110	struct veth_priv *priv = netdev_priv(dev);
   1111	int i;
   1112
   1113	for (i = start; i < end; i++) {
   1114		struct veth_rq *rq = &priv->rq[i];
   1115
   1116		rq->xdp_rxq.mem = rq->xdp_mem;
   1117		xdp_rxq_info_unreg(&rq->xdp_rxq);
   1118
   1119		if (delete_napi)
   1120			netif_napi_del(&rq->xdp_napi);
   1121	}
   1122}
   1123
   1124static int veth_enable_xdp(struct net_device *dev)
   1125{
   1126	bool napi_already_on = veth_gro_requested(dev) && (dev->flags & IFF_UP);
   1127	struct veth_priv *priv = netdev_priv(dev);
   1128	int err, i;
   1129
   1130	if (!xdp_rxq_info_is_reg(&priv->rq[0].xdp_rxq)) {
   1131		err = veth_enable_xdp_range(dev, 0, dev->real_num_rx_queues, napi_already_on);
   1132		if (err)
   1133			return err;
   1134
   1135		if (!napi_already_on) {
   1136			err = __veth_napi_enable(dev);
   1137			if (err) {
   1138				veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, true);
   1139				return err;
   1140			}
   1141
   1142			if (!veth_gro_requested(dev)) {
   1143				/* user-space did not require GRO, but adding XDP
   1144				 * is supposed to get GRO working
   1145				 */
   1146				dev->features |= NETIF_F_GRO;
   1147				netdev_features_change(dev);
   1148			}
   1149		}
   1150	}
   1151
   1152	for (i = 0; i < dev->real_num_rx_queues; i++) {
   1153		rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog);
   1154		rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi);
   1155	}
   1156
   1157	return 0;
   1158}
   1159
   1160static void veth_disable_xdp(struct net_device *dev)
   1161{
   1162	struct veth_priv *priv = netdev_priv(dev);
   1163	int i;
   1164
   1165	for (i = 0; i < dev->real_num_rx_queues; i++)
   1166		rcu_assign_pointer(priv->rq[i].xdp_prog, NULL);
   1167
   1168	if (!netif_running(dev) || !veth_gro_requested(dev)) {
   1169		veth_napi_del(dev);
   1170
    1171		/* if user-space did not request GRO and it was only enabled
    1172		 * because XDP was added, clear it now
    1173		 */
   1174		if (!veth_gro_requested(dev) && netif_running(dev)) {
   1175			dev->features &= ~NETIF_F_GRO;
   1176			netdev_features_change(dev);
   1177		}
   1178	}
   1179
   1180	veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, false);
   1181}
   1182
   1183static int veth_napi_enable_range(struct net_device *dev, int start, int end)
   1184{
   1185	struct veth_priv *priv = netdev_priv(dev);
   1186	int err, i;
   1187
   1188	for (i = start; i < end; i++) {
   1189		struct veth_rq *rq = &priv->rq[i];
   1190
   1191		netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT);
   1192	}
   1193
   1194	err = __veth_napi_enable_range(dev, start, end);
   1195	if (err) {
   1196		for (i = start; i < end; i++) {
   1197			struct veth_rq *rq = &priv->rq[i];
   1198
   1199			netif_napi_del(&rq->xdp_napi);
   1200		}
   1201		return err;
   1202	}
   1203	return err;
   1204}
   1205
   1206static int veth_napi_enable(struct net_device *dev)
   1207{
   1208	return veth_napi_enable_range(dev, 0, dev->real_num_rx_queues);
   1209}
   1210
   1211static void veth_disable_range_safe(struct net_device *dev, int start, int end)
   1212{
   1213	struct veth_priv *priv = netdev_priv(dev);
   1214
   1215	if (start >= end)
   1216		return;
   1217
   1218	if (priv->_xdp_prog) {
   1219		veth_napi_del_range(dev, start, end);
   1220		veth_disable_xdp_range(dev, start, end, false);
   1221	} else if (veth_gro_requested(dev)) {
   1222		veth_napi_del_range(dev, start, end);
   1223	}
   1224}
   1225
   1226static int veth_enable_range_safe(struct net_device *dev, int start, int end)
   1227{
   1228	struct veth_priv *priv = netdev_priv(dev);
   1229	int err;
   1230
   1231	if (start >= end)
   1232		return 0;
   1233
   1234	if (priv->_xdp_prog) {
    1235		/* these channels are freshly initialized, NAPI is not yet enabled
    1236		 * on them even when GRO is requested
    1237		 */
   1238		err = veth_enable_xdp_range(dev, start, end, false);
   1239		if (err)
   1240			return err;
   1241
   1242		err = __veth_napi_enable_range(dev, start, end);
   1243		if (err) {
   1244			/* on error always delete the newly added napis */
   1245			veth_disable_xdp_range(dev, start, end, true);
   1246			return err;
   1247		}
   1248	} else if (veth_gro_requested(dev)) {
   1249		return veth_napi_enable_range(dev, start, end);
   1250	}
   1251	return 0;
   1252}
   1253
   1254static int veth_set_channels(struct net_device *dev,
   1255			     struct ethtool_channels *ch)
   1256{
   1257	struct veth_priv *priv = netdev_priv(dev);
   1258	unsigned int old_rx_count, new_rx_count;
   1259	struct veth_priv *peer_priv;
   1260	struct net_device *peer;
   1261	int err;
   1262
   1263	/* sanity check. Upper bounds are already enforced by the caller */
   1264	if (!ch->rx_count || !ch->tx_count)
   1265		return -EINVAL;
   1266
    1267	/* avoid breaking XDP, if it is enabled */
   1268	peer = rtnl_dereference(priv->peer);
   1269	peer_priv = peer ? netdev_priv(peer) : NULL;
   1270	if (priv->_xdp_prog && peer && ch->rx_count < peer->real_num_tx_queues)
   1271		return -EINVAL;
   1272
   1273	if (peer && peer_priv && peer_priv->_xdp_prog && ch->tx_count > peer->real_num_rx_queues)
   1274		return -EINVAL;
   1275
   1276	old_rx_count = dev->real_num_rx_queues;
   1277	new_rx_count = ch->rx_count;
   1278	if (netif_running(dev)) {
   1279		/* turn device off */
   1280		netif_carrier_off(dev);
   1281		if (peer)
   1282			netif_carrier_off(peer);
   1283
    1284		/* try to allocate new resources, as needed */
   1285		err = veth_enable_range_safe(dev, old_rx_count, new_rx_count);
   1286		if (err)
   1287			goto out;
   1288	}
   1289
   1290	err = netif_set_real_num_rx_queues(dev, ch->rx_count);
   1291	if (err)
   1292		goto revert;
   1293
   1294	err = netif_set_real_num_tx_queues(dev, ch->tx_count);
   1295	if (err) {
   1296		int err2 = netif_set_real_num_rx_queues(dev, old_rx_count);
   1297
    1298		/* this error condition can only happen if rx and tx change
    1299		 * in opposite directions (e.g. the tx count rises while the
    1300		 * rx count decreases) and we can't do anything to fully
    1301		 * restore the original status
    1302		 */
   1303		if (err2)
    1304			pr_warn("Can't restore rx queues config %d -> %d %d\n",
   1305				new_rx_count, old_rx_count, err2);
   1306		else
   1307			goto revert;
   1308	}
   1309
   1310out:
   1311	if (netif_running(dev)) {
   1312		/* note that we need to swap the arguments WRT the enable part
   1313		 * to identify the range we have to disable
   1314		 */
   1315		veth_disable_range_safe(dev, new_rx_count, old_rx_count);
   1316		netif_carrier_on(dev);
   1317		if (peer)
   1318			netif_carrier_on(peer);
   1319	}
   1320	return err;
   1321
   1322revert:
   1323	new_rx_count = old_rx_count;
   1324	old_rx_count = ch->rx_count;
   1325	goto out;
   1326}
   1327
   1328static int veth_open(struct net_device *dev)
   1329{
   1330	struct veth_priv *priv = netdev_priv(dev);
   1331	struct net_device *peer = rtnl_dereference(priv->peer);
   1332	int err;
   1333
   1334	if (!peer)
   1335		return -ENOTCONN;
   1336
   1337	if (priv->_xdp_prog) {
   1338		err = veth_enable_xdp(dev);
   1339		if (err)
   1340			return err;
   1341	} else if (veth_gro_requested(dev)) {
   1342		err = veth_napi_enable(dev);
   1343		if (err)
   1344			return err;
   1345	}
   1346
   1347	if (peer->flags & IFF_UP) {
   1348		netif_carrier_on(dev);
   1349		netif_carrier_on(peer);
   1350	}
   1351
   1352	return 0;
   1353}
   1354
   1355static int veth_close(struct net_device *dev)
   1356{
   1357	struct veth_priv *priv = netdev_priv(dev);
   1358	struct net_device *peer = rtnl_dereference(priv->peer);
   1359
   1360	netif_carrier_off(dev);
   1361	if (peer)
   1362		netif_carrier_off(peer);
   1363
   1364	if (priv->_xdp_prog)
   1365		veth_disable_xdp(dev);
   1366	else if (veth_gro_requested(dev))
   1367		veth_napi_del(dev);
   1368
   1369	return 0;
   1370}
   1371
   1372static int is_valid_veth_mtu(int mtu)
   1373{
   1374	return mtu >= ETH_MIN_MTU && mtu <= ETH_MAX_MTU;
   1375}
   1376
   1377static int veth_alloc_queues(struct net_device *dev)
   1378{
   1379	struct veth_priv *priv = netdev_priv(dev);
   1380	int i;
   1381
   1382	priv->rq = kcalloc(dev->num_rx_queues, sizeof(*priv->rq), GFP_KERNEL_ACCOUNT);
   1383	if (!priv->rq)
   1384		return -ENOMEM;
   1385
   1386	for (i = 0; i < dev->num_rx_queues; i++) {
   1387		priv->rq[i].dev = dev;
   1388		u64_stats_init(&priv->rq[i].stats.syncp);
   1389	}
   1390
   1391	return 0;
   1392}
   1393
   1394static void veth_free_queues(struct net_device *dev)
   1395{
   1396	struct veth_priv *priv = netdev_priv(dev);
   1397
   1398	kfree(priv->rq);
   1399}
   1400
   1401static int veth_dev_init(struct net_device *dev)
   1402{
   1403	int err;
   1404
   1405	dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats);
   1406	if (!dev->lstats)
   1407		return -ENOMEM;
   1408
   1409	err = veth_alloc_queues(dev);
   1410	if (err) {
   1411		free_percpu(dev->lstats);
   1412		return err;
   1413	}
   1414
   1415	return 0;
   1416}
   1417
   1418static void veth_dev_free(struct net_device *dev)
   1419{
   1420	veth_free_queues(dev);
   1421	free_percpu(dev->lstats);
   1422}
   1423
   1424#ifdef CONFIG_NET_POLL_CONTROLLER
   1425static void veth_poll_controller(struct net_device *dev)
   1426{
    1427	/* veth only receives frames when its peer sends one.
    1428	 * Since it has nothing to do with disabling irqs, we are guaranteed
    1429	 * never to have pending data when we poll for it, so
    1430	 * there is nothing to do here.
    1431	 *
    1432	 * We need this though, so netpoll recognizes us as an interface that
    1433	 * supports polling, which enables bridge devices in virt setups to
    1434	 * still use netconsole.
    1435	 */
   1436}
   1437#endif	/* CONFIG_NET_POLL_CONTROLLER */
   1438
   1439static int veth_get_iflink(const struct net_device *dev)
   1440{
   1441	struct veth_priv *priv = netdev_priv(dev);
   1442	struct net_device *peer;
   1443	int iflink;
   1444
   1445	rcu_read_lock();
   1446	peer = rcu_dereference(priv->peer);
   1447	iflink = peer ? peer->ifindex : 0;
   1448	rcu_read_unlock();
   1449
   1450	return iflink;
   1451}
   1452
   1453static netdev_features_t veth_fix_features(struct net_device *dev,
   1454					   netdev_features_t features)
   1455{
   1456	struct veth_priv *priv = netdev_priv(dev);
   1457	struct net_device *peer;
   1458
   1459	peer = rtnl_dereference(priv->peer);
   1460	if (peer) {
   1461		struct veth_priv *peer_priv = netdev_priv(peer);
   1462
   1463		if (peer_priv->_xdp_prog)
   1464			features &= ~NETIF_F_GSO_SOFTWARE;
   1465	}
   1466	if (priv->_xdp_prog)
   1467		features |= NETIF_F_GRO;
   1468
   1469	return features;
   1470}
   1471
   1472static int veth_set_features(struct net_device *dev,
   1473			     netdev_features_t features)
   1474{
   1475	netdev_features_t changed = features ^ dev->features;
   1476	struct veth_priv *priv = netdev_priv(dev);
   1477	int err;
   1478
   1479	if (!(changed & NETIF_F_GRO) || !(dev->flags & IFF_UP) || priv->_xdp_prog)
   1480		return 0;
   1481
   1482	if (features & NETIF_F_GRO) {
   1483		err = veth_napi_enable(dev);
   1484		if (err)
   1485			return err;
   1486	} else {
   1487		veth_napi_del(dev);
   1488	}
   1489	return 0;
   1490}
   1491
   1492static void veth_set_rx_headroom(struct net_device *dev, int new_hr)
   1493{
   1494	struct veth_priv *peer_priv, *priv = netdev_priv(dev);
   1495	struct net_device *peer;
   1496
   1497	if (new_hr < 0)
   1498		new_hr = 0;
   1499
   1500	rcu_read_lock();
   1501	peer = rcu_dereference(priv->peer);
   1502	if (unlikely(!peer))
   1503		goto out;
   1504
   1505	peer_priv = netdev_priv(peer);
   1506	priv->requested_headroom = new_hr;
   1507	new_hr = max(priv->requested_headroom, peer_priv->requested_headroom);
   1508	dev->needed_headroom = new_hr;
   1509	peer->needed_headroom = new_hr;
   1510
   1511out:
   1512	rcu_read_unlock();
   1513}
   1514
   1515static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog,
   1516			struct netlink_ext_ack *extack)
   1517{
   1518	struct veth_priv *priv = netdev_priv(dev);
   1519	struct bpf_prog *old_prog;
   1520	struct net_device *peer;
   1521	unsigned int max_mtu;
   1522	int err;
   1523
   1524	old_prog = priv->_xdp_prog;
   1525	priv->_xdp_prog = prog;
   1526	peer = rtnl_dereference(priv->peer);
   1527
   1528	if (prog) {
   1529		if (!peer) {
   1530			NL_SET_ERR_MSG_MOD(extack, "Cannot set XDP when peer is detached");
   1531			err = -ENOTCONN;
   1532			goto err;
   1533		}
   1534
   1535		max_mtu = SKB_WITH_OVERHEAD(PAGE_SIZE - VETH_XDP_HEADROOM) -
   1536			  peer->hard_header_len;
   1537		/* Allow increasing the max_mtu if the program supports
   1538		 * XDP fragments.
   1539		 */
   1540		if (prog->aux->xdp_has_frags)
   1541			max_mtu += PAGE_SIZE * MAX_SKB_FRAGS;
   1542
   1543		if (peer->mtu > max_mtu) {
   1544			NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP");
   1545			err = -ERANGE;
   1546			goto err;
   1547		}
   1548
   1549		if (dev->real_num_rx_queues < peer->real_num_tx_queues) {
   1550			NL_SET_ERR_MSG_MOD(extack, "XDP expects number of rx queues not less than peer tx queues");
   1551			err = -ENOSPC;
   1552			goto err;
   1553		}
   1554
   1555		if (dev->flags & IFF_UP) {
   1556			err = veth_enable_xdp(dev);
   1557			if (err) {
   1558				NL_SET_ERR_MSG_MOD(extack, "Setup for XDP failed");
   1559				goto err;
   1560			}
   1561		}
   1562
   1563		if (!old_prog) {
   1564			peer->hw_features &= ~NETIF_F_GSO_SOFTWARE;
   1565			peer->max_mtu = max_mtu;
   1566		}
   1567	}
   1568
   1569	if (old_prog) {
   1570		if (!prog) {
   1571			if (dev->flags & IFF_UP)
   1572				veth_disable_xdp(dev);
   1573
   1574			if (peer) {
   1575				peer->hw_features |= NETIF_F_GSO_SOFTWARE;
   1576				peer->max_mtu = ETH_MAX_MTU;
   1577			}
   1578		}
   1579		bpf_prog_put(old_prog);
   1580	}
   1581
   1582	if ((!!old_prog ^ !!prog) && peer)
   1583		netdev_update_features(peer);
   1584
   1585	return 0;
   1586err:
   1587	priv->_xdp_prog = old_prog;
   1588
   1589	return err;
   1590}
   1591
   1592static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp)
   1593{
   1594	switch (xdp->command) {
   1595	case XDP_SETUP_PROG:
   1596		return veth_xdp_set(dev, xdp->prog, xdp->extack);
   1597	default:
   1598		return -EINVAL;
   1599	}
   1600}
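
An editorial aside, not part of the original source: XDP_SETUP_PROG above is what ultimately runs when an XDP object is attached to a veth device from userspace (for example via iproute2's "ip link set dev <veth> xdp obj <file.o>"; the exact command is shown only as an illustration). A minimal program that exercises the XDP_PASS path handled earlier in this file might look like the following sketch, typically built with clang targeting BPF:

	/* SPDX-License-Identifier: GPL-2.0 */
	#include <linux/bpf.h>
	#include <bpf/bpf_helpers.h>

	/* Pass every packet through unchanged; veth_xdp_rcv_skb() and
	 * veth_xdp_rcv_one() then take the XDP_PASS branch and hand the
	 * packet to the stack as usual. */
	SEC("xdp")
	int xdp_pass_all(struct xdp_md *ctx)
	{
		return XDP_PASS;
	}

	char _license[] SEC("license") = "GPL";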
   1601
   1602static const struct net_device_ops veth_netdev_ops = {
   1603	.ndo_init            = veth_dev_init,
   1604	.ndo_open            = veth_open,
   1605	.ndo_stop            = veth_close,
   1606	.ndo_start_xmit      = veth_xmit,
   1607	.ndo_get_stats64     = veth_get_stats64,
   1608	.ndo_set_rx_mode     = veth_set_multicast_list,
   1609	.ndo_set_mac_address = eth_mac_addr,
   1610#ifdef CONFIG_NET_POLL_CONTROLLER
   1611	.ndo_poll_controller	= veth_poll_controller,
   1612#endif
   1613	.ndo_get_iflink		= veth_get_iflink,
   1614	.ndo_fix_features	= veth_fix_features,
   1615	.ndo_set_features	= veth_set_features,
   1616	.ndo_features_check	= passthru_features_check,
   1617	.ndo_set_rx_headroom	= veth_set_rx_headroom,
   1618	.ndo_bpf		= veth_xdp,
   1619	.ndo_xdp_xmit		= veth_ndo_xdp_xmit,
   1620	.ndo_get_peer_dev	= veth_peer_dev,
   1621};
   1622
   1623#define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
   1624		       NETIF_F_RXCSUM | NETIF_F_SCTP_CRC | NETIF_F_HIGHDMA | \
   1625		       NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | \
   1626		       NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | \
   1627		       NETIF_F_HW_VLAN_STAG_TX | NETIF_F_HW_VLAN_STAG_RX )
   1628
   1629static void veth_setup(struct net_device *dev)
   1630{
   1631	ether_setup(dev);
   1632
   1633	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
   1634	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
   1635	dev->priv_flags |= IFF_NO_QUEUE;
   1636	dev->priv_flags |= IFF_PHONY_HEADROOM;
   1637
   1638	dev->netdev_ops = &veth_netdev_ops;
   1639	dev->ethtool_ops = &veth_ethtool_ops;
   1640	dev->features |= NETIF_F_LLTX;
   1641	dev->features |= VETH_FEATURES;
   1642	dev->vlan_features = dev->features &
   1643			     ~(NETIF_F_HW_VLAN_CTAG_TX |
   1644			       NETIF_F_HW_VLAN_STAG_TX |
   1645			       NETIF_F_HW_VLAN_CTAG_RX |
   1646			       NETIF_F_HW_VLAN_STAG_RX);
   1647	dev->needs_free_netdev = true;
   1648	dev->priv_destructor = veth_dev_free;
   1649	dev->max_mtu = ETH_MAX_MTU;
   1650
   1651	dev->hw_features = VETH_FEATURES;
   1652	dev->hw_enc_features = VETH_FEATURES;
   1653	dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE;
   1654	netif_set_tso_max_size(dev, GSO_MAX_SIZE);
   1655}
   1656
   1657/*
   1658 * netlink interface
   1659 */
   1660
   1661static int veth_validate(struct nlattr *tb[], struct nlattr *data[],
   1662			 struct netlink_ext_ack *extack)
   1663{
   1664	if (tb[IFLA_ADDRESS]) {
   1665		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
   1666			return -EINVAL;
   1667		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
   1668			return -EADDRNOTAVAIL;
   1669	}
   1670	if (tb[IFLA_MTU]) {
   1671		if (!is_valid_veth_mtu(nla_get_u32(tb[IFLA_MTU])))
   1672			return -EINVAL;
   1673	}
   1674	return 0;
   1675}
   1676
   1677static struct rtnl_link_ops veth_link_ops;
   1678
   1679static void veth_disable_gro(struct net_device *dev)
   1680{
   1681	dev->features &= ~NETIF_F_GRO;
   1682	dev->wanted_features &= ~NETIF_F_GRO;
   1683	netdev_update_features(dev);
   1684}
   1685
   1686static int veth_init_queues(struct net_device *dev, struct nlattr *tb[])
   1687{
   1688	int err;
   1689
   1690	if (!tb[IFLA_NUM_TX_QUEUES] && dev->num_tx_queues > 1) {
   1691		err = netif_set_real_num_tx_queues(dev, 1);
   1692		if (err)
   1693			return err;
   1694	}
   1695	if (!tb[IFLA_NUM_RX_QUEUES] && dev->num_rx_queues > 1) {
   1696		err = netif_set_real_num_rx_queues(dev, 1);
   1697		if (err)
   1698			return err;
   1699	}
   1700	return 0;
   1701}
   1702
   1703static int veth_newlink(struct net *src_net, struct net_device *dev,
   1704			struct nlattr *tb[], struct nlattr *data[],
   1705			struct netlink_ext_ack *extack)
   1706{
   1707	int err;
   1708	struct net_device *peer;
   1709	struct veth_priv *priv;
   1710	char ifname[IFNAMSIZ];
   1711	struct nlattr *peer_tb[IFLA_MAX + 1], **tbp;
   1712	unsigned char name_assign_type;
   1713	struct ifinfomsg *ifmp;
   1714	struct net *net;
   1715
   1716	/*
   1717	 * create and register peer first
   1718	 */
   1719	if (data != NULL && data[VETH_INFO_PEER] != NULL) {
   1720		struct nlattr *nla_peer;
   1721
   1722		nla_peer = data[VETH_INFO_PEER];
   1723		ifmp = nla_data(nla_peer);
   1724		err = rtnl_nla_parse_ifla(peer_tb,
   1725					  nla_data(nla_peer) + sizeof(struct ifinfomsg),
   1726					  nla_len(nla_peer) - sizeof(struct ifinfomsg),
   1727					  NULL);
   1728		if (err < 0)
   1729			return err;
   1730
   1731		err = veth_validate(peer_tb, NULL, extack);
   1732		if (err < 0)
   1733			return err;
   1734
   1735		tbp = peer_tb;
   1736	} else {
   1737		ifmp = NULL;
   1738		tbp = tb;
   1739	}
   1740
   1741	if (ifmp && tbp[IFLA_IFNAME]) {
   1742		nla_strscpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ);
   1743		name_assign_type = NET_NAME_USER;
   1744	} else {
   1745		snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d");
   1746		name_assign_type = NET_NAME_ENUM;
   1747	}
   1748
   1749	net = rtnl_link_get_net(src_net, tbp);
   1750	if (IS_ERR(net))
   1751		return PTR_ERR(net);
   1752
   1753	peer = rtnl_create_link(net, ifname, name_assign_type,
   1754				&veth_link_ops, tbp, extack);
   1755	if (IS_ERR(peer)) {
   1756		put_net(net);
   1757		return PTR_ERR(peer);
   1758	}
   1759
   1760	if (!ifmp || !tbp[IFLA_ADDRESS])
   1761		eth_hw_addr_random(peer);
   1762
   1763	if (ifmp && (dev->ifindex != 0))
   1764		peer->ifindex = ifmp->ifi_index;
   1765
   1766	netif_inherit_tso_max(peer, dev);
   1767
   1768	err = register_netdevice(peer);
   1769	put_net(net);
   1770	net = NULL;
   1771	if (err < 0)
   1772		goto err_register_peer;
   1773
   1774	/* keep GRO disabled by default to be consistent with the established
   1775	 * veth behavior
   1776	 */
   1777	veth_disable_gro(peer);
   1778	netif_carrier_off(peer);
   1779
   1780	err = rtnl_configure_link(peer, ifmp);
   1781	if (err < 0)
   1782		goto err_configure_peer;
   1783
   1784	/*
   1785	 * register dev last
   1786	 *
    1787	 * note that, since we've registered a new device, the dev's name
    1788	 * should be re-allocated
   1789	 */
   1790
   1791	if (tb[IFLA_ADDRESS] == NULL)
   1792		eth_hw_addr_random(dev);
   1793
   1794	if (tb[IFLA_IFNAME])
   1795		nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ);
   1796	else
   1797		snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d");
   1798
   1799	err = register_netdevice(dev);
   1800	if (err < 0)
   1801		goto err_register_dev;
   1802
   1803	netif_carrier_off(dev);
   1804
   1805	/*
    1806	 * tie the devices together
   1807	 */
   1808
   1809	priv = netdev_priv(dev);
   1810	rcu_assign_pointer(priv->peer, peer);
   1811	err = veth_init_queues(dev, tb);
   1812	if (err)
   1813		goto err_queues;
   1814
   1815	priv = netdev_priv(peer);
   1816	rcu_assign_pointer(priv->peer, dev);
   1817	err = veth_init_queues(peer, tb);
   1818	if (err)
   1819		goto err_queues;
   1820
   1821	veth_disable_gro(dev);
   1822	return 0;
   1823
   1824err_queues:
   1825	unregister_netdevice(dev);
   1826err_register_dev:
   1827	/* nothing to do */
   1828err_configure_peer:
   1829	unregister_netdevice(peer);
   1830	return err;
   1831
   1832err_register_peer:
   1833	free_netdev(peer);
   1834	return err;
   1835}
   1836
   1837static void veth_dellink(struct net_device *dev, struct list_head *head)
   1838{
   1839	struct veth_priv *priv;
   1840	struct net_device *peer;
   1841
   1842	priv = netdev_priv(dev);
   1843	peer = rtnl_dereference(priv->peer);
   1844
    1845	/* Note: dellink() is called from default_device_exit_batch(),
    1846	 * before an RCU synchronization point. The devices are guaranteed
    1847	 * not to be freed before one RCU grace period.
    1848	 */
   1849	RCU_INIT_POINTER(priv->peer, NULL);
   1850	unregister_netdevice_queue(dev, head);
   1851
   1852	if (peer) {
   1853		priv = netdev_priv(peer);
   1854		RCU_INIT_POINTER(priv->peer, NULL);
   1855		unregister_netdevice_queue(peer, head);
   1856	}
   1857}
   1858
   1859static const struct nla_policy veth_policy[VETH_INFO_MAX + 1] = {
   1860	[VETH_INFO_PEER]	= { .len = sizeof(struct ifinfomsg) },
   1861};
   1862
   1863static struct net *veth_get_link_net(const struct net_device *dev)
   1864{
   1865	struct veth_priv *priv = netdev_priv(dev);
   1866	struct net_device *peer = rtnl_dereference(priv->peer);
   1867
   1868	return peer ? dev_net(peer) : dev_net(dev);
   1869}
   1870
   1871static unsigned int veth_get_num_queues(void)
   1872{
   1873	/* enforce the same queue limit as rtnl_create_link */
   1874	int queues = num_possible_cpus();
   1875
   1876	if (queues > 4096)
   1877		queues = 4096;
   1878	return queues;
   1879}
   1880
   1881static struct rtnl_link_ops veth_link_ops = {
   1882	.kind		= DRV_NAME,
   1883	.priv_size	= sizeof(struct veth_priv),
   1884	.setup		= veth_setup,
   1885	.validate	= veth_validate,
   1886	.newlink	= veth_newlink,
   1887	.dellink	= veth_dellink,
   1888	.policy		= veth_policy,
   1889	.maxtype	= VETH_INFO_MAX,
   1890	.get_link_net	= veth_get_link_net,
   1891	.get_num_tx_queues	= veth_get_num_queues,
   1892	.get_num_rx_queues	= veth_get_num_queues,
   1893};
   1894
   1895/*
   1896 * init/fini
   1897 */
   1898
   1899static __init int veth_init(void)
   1900{
   1901	return rtnl_link_register(&veth_link_ops);
   1902}
   1903
   1904static __exit void veth_exit(void)
   1905{
   1906	rtnl_link_unregister(&veth_link_ops);
   1907}
   1908
   1909module_init(veth_init);
   1910module_exit(veth_exit);
   1911
   1912MODULE_DESCRIPTION("Virtual Ethernet Tunnel");
   1913MODULE_LICENSE("GPL v2");
   1914MODULE_ALIAS_RTNL_LINK(DRV_NAME);