cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

netvsc_drv.c (71688B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 * Copyright (c) 2009, Microsoft Corporation.
      4 *
      5 * Authors:
      6 *   Haiyang Zhang <haiyangz@microsoft.com>
      7 *   Hank Janssen  <hjanssen@microsoft.com>
      8 */
      9#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
     10
     11#include <linux/init.h>
     12#include <linux/atomic.h>
     13#include <linux/ethtool.h>
     14#include <linux/module.h>
     15#include <linux/highmem.h>
     16#include <linux/device.h>
     17#include <linux/io.h>
     18#include <linux/delay.h>
     19#include <linux/netdevice.h>
     20#include <linux/inetdevice.h>
     21#include <linux/etherdevice.h>
     22#include <linux/pci.h>
     23#include <linux/skbuff.h>
     24#include <linux/if_vlan.h>
     25#include <linux/in.h>
     26#include <linux/slab.h>
     27#include <linux/rtnetlink.h>
     28#include <linux/netpoll.h>
     29#include <linux/bpf.h>
     30
     31#include <net/arp.h>
     32#include <net/route.h>
     33#include <net/sock.h>
     34#include <net/pkt_sched.h>
     35#include <net/checksum.h>
     36#include <net/ip6_checksum.h>
     37
     38#include "hyperv_net.h"
     39
     40#define RING_SIZE_MIN	64
     41
     42#define LINKCHANGE_INT (2 * HZ)
     43#define VF_TAKEOVER_INT (HZ / 10)
     44
     45static unsigned int ring_size __ro_after_init = 128;
     46module_param(ring_size, uint, 0444);
     47MODULE_PARM_DESC(ring_size, "Ring buffer size (# of pages)");
     48unsigned int netvsc_ring_bytes __ro_after_init;
     49
     50static const u32 default_msg = NETIF_MSG_DRV | NETIF_MSG_PROBE |
     51				NETIF_MSG_LINK | NETIF_MSG_IFUP |
     52				NETIF_MSG_IFDOWN | NETIF_MSG_RX_ERR |
     53				NETIF_MSG_TX_ERR;
     54
     55static int debug = -1;
     56module_param(debug, int, 0444);
     57MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all)");
     58
     59static LIST_HEAD(netvsc_dev_list);
     60
     61static void netvsc_change_rx_flags(struct net_device *net, int change)
     62{
     63	struct net_device_context *ndev_ctx = netdev_priv(net);
     64	struct net_device *vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev);
     65	int inc;
     66
     67	if (!vf_netdev)
     68		return;
     69
     70	if (change & IFF_PROMISC) {
     71		inc = (net->flags & IFF_PROMISC) ? 1 : -1;
     72		dev_set_promiscuity(vf_netdev, inc);
     73	}
     74
     75	if (change & IFF_ALLMULTI) {
     76		inc = (net->flags & IFF_ALLMULTI) ? 1 : -1;
     77		dev_set_allmulti(vf_netdev, inc);
     78	}
     79}
     80
     81static void netvsc_set_rx_mode(struct net_device *net)
     82{
     83	struct net_device_context *ndev_ctx = netdev_priv(net);
     84	struct net_device *vf_netdev;
     85	struct netvsc_device *nvdev;
     86
     87	rcu_read_lock();
     88	vf_netdev = rcu_dereference(ndev_ctx->vf_netdev);
     89	if (vf_netdev) {
     90		dev_uc_sync(vf_netdev, net);
     91		dev_mc_sync(vf_netdev, net);
     92	}
     93
     94	nvdev = rcu_dereference(ndev_ctx->nvdev);
     95	if (nvdev)
     96		rndis_filter_update(nvdev);
     97	rcu_read_unlock();
     98}
     99
    100static void netvsc_tx_enable(struct netvsc_device *nvscdev,
    101			     struct net_device *ndev)
    102{
    103	nvscdev->tx_disable = false;
    104	virt_wmb(); /* ensure queue wake up mechanism is on */
    105
    106	netif_tx_wake_all_queues(ndev);
    107}
    108
    109static int netvsc_open(struct net_device *net)
    110{
    111	struct net_device_context *ndev_ctx = netdev_priv(net);
    112	struct net_device *vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev);
    113	struct netvsc_device *nvdev = rtnl_dereference(ndev_ctx->nvdev);
    114	struct rndis_device *rdev;
    115	int ret = 0;
    116
    117	netif_carrier_off(net);
    118
    119	/* Open up the device */
    120	ret = rndis_filter_open(nvdev);
    121	if (ret != 0) {
    122		netdev_err(net, "unable to open device (ret %d).\n", ret);
    123		return ret;
    124	}
    125
    126	rdev = nvdev->extension;
    127	if (!rdev->link_state) {
    128		netif_carrier_on(net);
    129		netvsc_tx_enable(nvdev, net);
    130	}
    131
    132	if (vf_netdev) {
    133		/* Setting synthetic device up transparently sets
     134	 * slave as up. If open fails, then the slave will
     135	 * still be offline (and not used).
    136		 */
    137		ret = dev_open(vf_netdev, NULL);
    138		if (ret)
    139			netdev_warn(net,
    140				    "unable to open slave: %s: %d\n",
    141				    vf_netdev->name, ret);
    142	}
    143	return 0;
    144}
    145
    146static int netvsc_wait_until_empty(struct netvsc_device *nvdev)
    147{
    148	unsigned int retry = 0;
    149	int i;
    150
    151	/* Ensure pending bytes in ring are read */
    152	for (;;) {
    153		u32 aread = 0;
    154
    155		for (i = 0; i < nvdev->num_chn; i++) {
    156			struct vmbus_channel *chn
    157				= nvdev->chan_table[i].channel;
    158
    159			if (!chn)
    160				continue;
    161
     162			/* make sure receive is not running now */
    163			napi_synchronize(&nvdev->chan_table[i].napi);
    164
    165			aread = hv_get_bytes_to_read(&chn->inbound);
    166			if (aread)
    167				break;
    168
    169			aread = hv_get_bytes_to_read(&chn->outbound);
    170			if (aread)
    171				break;
    172		}
    173
    174		if (aread == 0)
    175			return 0;
    176
    177		if (++retry > RETRY_MAX)
    178			return -ETIMEDOUT;
    179
    180		usleep_range(RETRY_US_LO, RETRY_US_HI);
    181	}
    182}
    183
    184static void netvsc_tx_disable(struct netvsc_device *nvscdev,
    185			      struct net_device *ndev)
    186{
    187	if (nvscdev) {
    188		nvscdev->tx_disable = true;
    189		virt_wmb(); /* ensure txq will not wake up after stop */
    190	}
    191
    192	netif_tx_disable(ndev);
    193}
    194
    195static int netvsc_close(struct net_device *net)
    196{
    197	struct net_device_context *net_device_ctx = netdev_priv(net);
    198	struct net_device *vf_netdev
    199		= rtnl_dereference(net_device_ctx->vf_netdev);
    200	struct netvsc_device *nvdev = rtnl_dereference(net_device_ctx->nvdev);
    201	int ret;
    202
    203	netvsc_tx_disable(nvdev, net);
    204
    205	/* No need to close rndis filter if it is removed already */
    206	if (!nvdev)
    207		return 0;
    208
    209	ret = rndis_filter_close(nvdev);
    210	if (ret != 0) {
    211		netdev_err(net, "unable to close device (ret %d).\n", ret);
    212		return ret;
    213	}
    214
    215	ret = netvsc_wait_until_empty(nvdev);
    216	if (ret)
    217		netdev_err(net, "Ring buffer not empty after closing rndis\n");
    218
    219	if (vf_netdev)
    220		dev_close(vf_netdev);
    221
    222	return ret;
    223}
    224
    225static inline void *init_ppi_data(struct rndis_message *msg,
    226				  u32 ppi_size, u32 pkt_type)
    227{
    228	struct rndis_packet *rndis_pkt = &msg->msg.pkt;
    229	struct rndis_per_packet_info *ppi;
    230
    231	rndis_pkt->data_offset += ppi_size;
    232	ppi = (void *)rndis_pkt + rndis_pkt->per_pkt_info_offset
    233		+ rndis_pkt->per_pkt_info_len;
    234
    235	ppi->size = ppi_size;
    236	ppi->type = pkt_type;
    237	ppi->internal = 0;
    238	ppi->ppi_offset = sizeof(struct rndis_per_packet_info);
    239
    240	rndis_pkt->per_pkt_info_len += ppi_size;
    241
    242	return ppi + 1;
    243}
    244
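        /* Look up a tx queue for the skb by hashing the flow into the
         * host-provided send indirection table; remember the choice on the
         * socket so later packets of the same flow reuse it.
         */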
    245static inline int netvsc_get_tx_queue(struct net_device *ndev,
    246				      struct sk_buff *skb, int old_idx)
    247{
    248	const struct net_device_context *ndc = netdev_priv(ndev);
    249	struct sock *sk = skb->sk;
    250	int q_idx;
    251
    252	q_idx = ndc->tx_table[netvsc_get_hash(skb, ndc) &
    253			      (VRSS_SEND_TAB_SIZE - 1)];
    254
    255	/* If queue index changed record the new value */
    256	if (q_idx != old_idx &&
    257	    sk && sk_fullsock(sk) && rcu_access_pointer(sk->sk_dst_cache))
    258		sk_tx_queue_set(sk, q_idx);
    259
    260	return q_idx;
    261}
    262
    263/*
    264 * Select queue for transmit.
    265 *
    266 * If a valid queue has already been assigned, then use that.
    267 * Otherwise compute tx queue based on hash and the send table.
    268 *
    269 * This is basically similar to default (netdev_pick_tx) with the added step
    270 * of using the host send_table when no other queue has been assigned.
    271 *
    272 * TODO support XPS - but get_xps_queue not exported
    273 */
    274static u16 netvsc_pick_tx(struct net_device *ndev, struct sk_buff *skb)
    275{
    276	int q_idx = sk_tx_queue_get(skb->sk);
    277
    278	if (q_idx < 0 || skb->ooo_okay || q_idx >= ndev->real_num_tx_queues) {
    279		/* If forwarding a packet, we use the recorded queue when
    280		 * available for better cache locality.
    281		 */
    282		if (skb_rx_queue_recorded(skb))
    283			q_idx = skb_get_rx_queue(skb);
    284		else
    285			q_idx = netvsc_get_tx_queue(ndev, skb, q_idx);
    286	}
    287
    288	return q_idx;
    289}
    290
    291static u16 netvsc_select_queue(struct net_device *ndev, struct sk_buff *skb,
    292			       struct net_device *sb_dev)
    293{
    294	struct net_device_context *ndc = netdev_priv(ndev);
    295	struct net_device *vf_netdev;
    296	u16 txq;
    297
    298	rcu_read_lock();
    299	vf_netdev = rcu_dereference(ndc->vf_netdev);
    300	if (vf_netdev) {
    301		const struct net_device_ops *vf_ops = vf_netdev->netdev_ops;
    302
    303		if (vf_ops->ndo_select_queue)
    304			txq = vf_ops->ndo_select_queue(vf_netdev, skb, sb_dev);
    305		else
    306			txq = netdev_pick_tx(vf_netdev, skb, NULL);
    307
    308		/* Record the queue selected by VF so that it can be
    309		 * used for common case where VF has more queues than
    310		 * the synthetic device.
    311		 */
    312		qdisc_skb_cb(skb)->slave_dev_queue_mapping = txq;
    313	} else {
    314		txq = netvsc_pick_tx(ndev, skb);
    315	}
    316	rcu_read_unlock();
    317
    318	while (txq >= ndev->real_num_tx_queues)
    319		txq -= ndev->real_num_tx_queues;
    320
    321	return txq;
    322}
    323
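        /* Describe len bytes starting at Hyper-V page frame hvpfn + offset
         * as one hv_page_buffer entry per page touched; returns the number
         * of entries filled in pb.
         */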
    324static u32 fill_pg_buf(unsigned long hvpfn, u32 offset, u32 len,
    325		       struct hv_page_buffer *pb)
    326{
    327	int j = 0;
    328
    329	hvpfn += offset >> HV_HYP_PAGE_SHIFT;
    330	offset = offset & ~HV_HYP_PAGE_MASK;
    331
    332	while (len > 0) {
    333		unsigned long bytes;
    334
    335		bytes = HV_HYP_PAGE_SIZE - offset;
    336		if (bytes > len)
    337			bytes = len;
    338		pb[j].pfn = hvpfn;
    339		pb[j].offset = offset;
    340		pb[j].len = bytes;
    341
    342		offset += bytes;
    343		len -= bytes;
    344
    345		if (offset == HV_HYP_PAGE_SIZE && len) {
    346			hvpfn++;
    347			offset = 0;
    348			j++;
    349		}
    350	}
    351
    352	return j + 1;
    353}
    354
    355static u32 init_page_array(void *hdr, u32 len, struct sk_buff *skb,
    356			   struct hv_netvsc_packet *packet,
    357			   struct hv_page_buffer *pb)
    358{
    359	u32 slots_used = 0;
    360	char *data = skb->data;
    361	int frags = skb_shinfo(skb)->nr_frags;
    362	int i;
    363
    364	/* The packet is laid out thus:
    365	 * 1. hdr: RNDIS header and PPI
    366	 * 2. skb linear data
    367	 * 3. skb fragment data
    368	 */
    369	slots_used += fill_pg_buf(virt_to_hvpfn(hdr),
    370				  offset_in_hvpage(hdr),
    371				  len,
    372				  &pb[slots_used]);
    373
    374	packet->rmsg_size = len;
    375	packet->rmsg_pgcnt = slots_used;
    376
    377	slots_used += fill_pg_buf(virt_to_hvpfn(data),
    378				  offset_in_hvpage(data),
    379				  skb_headlen(skb),
    380				  &pb[slots_used]);
    381
    382	for (i = 0; i < frags; i++) {
    383		skb_frag_t *frag = skb_shinfo(skb)->frags + i;
    384
    385		slots_used += fill_pg_buf(page_to_hvpfn(skb_frag_page(frag)),
    386					  skb_frag_off(frag),
    387					  skb_frag_size(frag),
    388					  &pb[slots_used]);
    389	}
    390	return slots_used;
    391}
    392
    393static int count_skb_frag_slots(struct sk_buff *skb)
    394{
    395	int i, frags = skb_shinfo(skb)->nr_frags;
    396	int pages = 0;
    397
    398	for (i = 0; i < frags; i++) {
    399		skb_frag_t *frag = skb_shinfo(skb)->frags + i;
    400		unsigned long size = skb_frag_size(frag);
    401		unsigned long offset = skb_frag_off(frag);
    402
    403		/* Skip unused frames from start of page */
    404		offset &= ~HV_HYP_PAGE_MASK;
    405		pages += HVPFN_UP(offset + size);
    406	}
    407	return pages;
    408}
    409
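        /* Count the Hyper-V page buffer slots needed to describe the skb's
         * linear data plus all of its fragments.
         */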
    410static int netvsc_get_slots(struct sk_buff *skb)
    411{
    412	char *data = skb->data;
    413	unsigned int offset = offset_in_hvpage(data);
    414	unsigned int len = skb_headlen(skb);
    415	int slots;
    416	int frag_slots;
    417
    418	slots = DIV_ROUND_UP(offset + len, HV_HYP_PAGE_SIZE);
    419	frag_slots = count_skb_frag_slots(skb);
    420	return slots + frag_slots;
    421}
    422
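        /* Classify the skb's transport protocol (IPv4/IPv6, TCP or UDP) so
         * the caller can decide whether checksum offload is possible;
         * anything else is reported as TRANSPORT_INFO_NOT_IP.
         */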
    423static u32 net_checksum_info(struct sk_buff *skb)
    424{
    425	if (skb->protocol == htons(ETH_P_IP)) {
    426		struct iphdr *ip = ip_hdr(skb);
    427
    428		if (ip->protocol == IPPROTO_TCP)
    429			return TRANSPORT_INFO_IPV4_TCP;
    430		else if (ip->protocol == IPPROTO_UDP)
    431			return TRANSPORT_INFO_IPV4_UDP;
    432	} else {
    433		struct ipv6hdr *ip6 = ipv6_hdr(skb);
    434
    435		if (ip6->nexthdr == IPPROTO_TCP)
    436			return TRANSPORT_INFO_IPV6_TCP;
    437		else if (ip6->nexthdr == IPPROTO_UDP)
    438			return TRANSPORT_INFO_IPV6_UDP;
    439	}
    440
    441	return TRANSPORT_INFO_NOT_IP;
    442}
    443
    444/* Send skb on the slave VF device. */
    445static int netvsc_vf_xmit(struct net_device *net, struct net_device *vf_netdev,
    446			  struct sk_buff *skb)
    447{
    448	struct net_device_context *ndev_ctx = netdev_priv(net);
    449	unsigned int len = skb->len;
    450	int rc;
    451
    452	skb->dev = vf_netdev;
    453	skb_record_rx_queue(skb, qdisc_skb_cb(skb)->slave_dev_queue_mapping);
    454
    455	rc = dev_queue_xmit(skb);
    456	if (likely(rc == NET_XMIT_SUCCESS || rc == NET_XMIT_CN)) {
    457		struct netvsc_vf_pcpu_stats *pcpu_stats
    458			= this_cpu_ptr(ndev_ctx->vf_stats);
    459
    460		u64_stats_update_begin(&pcpu_stats->syncp);
    461		pcpu_stats->tx_packets++;
    462		pcpu_stats->tx_bytes += len;
    463		u64_stats_update_end(&pcpu_stats->syncp);
    464	} else {
    465		this_cpu_inc(ndev_ctx->vf_stats->tx_dropped);
    466	}
    467
    468	return rc;
    469}
    470
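        /* Main transmit path: hand the skb to the VF when it owns the data
         * path, otherwise build the RNDIS header and per-packet info and
         * pass the packet to netvsc_send().
         */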
    471static int netvsc_xmit(struct sk_buff *skb, struct net_device *net, bool xdp_tx)
    472{
    473	struct net_device_context *net_device_ctx = netdev_priv(net);
    474	struct hv_netvsc_packet *packet = NULL;
    475	int ret;
    476	unsigned int num_data_pgs;
    477	struct rndis_message *rndis_msg;
    478	struct net_device *vf_netdev;
    479	u32 rndis_msg_size;
    480	u32 hash;
    481	struct hv_page_buffer pb[MAX_PAGE_BUFFER_COUNT];
    482
    483	/* If VF is present and up then redirect packets to it.
    484	 * Skip the VF if it is marked down or has no carrier.
     485	 * If netpoll is in use, then the VF cannot be used either.
    486	 */
    487	vf_netdev = rcu_dereference_bh(net_device_ctx->vf_netdev);
    488	if (vf_netdev && netif_running(vf_netdev) &&
    489	    netif_carrier_ok(vf_netdev) && !netpoll_tx_running(net) &&
    490	    net_device_ctx->data_path_is_vf)
    491		return netvsc_vf_xmit(net, vf_netdev, skb);
    492
     493	/* We will need at most two pages to describe the rndis
    494	 * header. We can only transmit MAX_PAGE_BUFFER_COUNT number
    495	 * of pages in a single packet. If skb is scattered around
    496	 * more pages we try linearizing it.
    497	 */
    498
    499	num_data_pgs = netvsc_get_slots(skb) + 2;
    500
    501	if (unlikely(num_data_pgs > MAX_PAGE_BUFFER_COUNT)) {
    502		++net_device_ctx->eth_stats.tx_scattered;
    503
    504		if (skb_linearize(skb))
    505			goto no_memory;
    506
    507		num_data_pgs = netvsc_get_slots(skb) + 2;
    508		if (num_data_pgs > MAX_PAGE_BUFFER_COUNT) {
    509			++net_device_ctx->eth_stats.tx_too_big;
    510			goto drop;
    511		}
    512	}
    513
    514	/*
    515	 * Place the rndis header in the skb head room and
    516	 * the skb->cb will be used for hv_netvsc_packet
    517	 * structure.
    518	 */
    519	ret = skb_cow_head(skb, RNDIS_AND_PPI_SIZE);
    520	if (ret)
    521		goto no_memory;
    522
    523	/* Use the skb control buffer for building up the packet */
    524	BUILD_BUG_ON(sizeof(struct hv_netvsc_packet) >
    525			sizeof_field(struct sk_buff, cb));
    526	packet = (struct hv_netvsc_packet *)skb->cb;
    527
    528	packet->q_idx = skb_get_queue_mapping(skb);
    529
    530	packet->total_data_buflen = skb->len;
    531	packet->total_bytes = skb->len;
    532	packet->total_packets = 1;
    533
    534	rndis_msg = (struct rndis_message *)skb->head;
    535
    536	/* Add the rndis header */
    537	rndis_msg->ndis_msg_type = RNDIS_MSG_PACKET;
    538	rndis_msg->msg_len = packet->total_data_buflen;
    539
    540	rndis_msg->msg.pkt = (struct rndis_packet) {
    541		.data_offset = sizeof(struct rndis_packet),
    542		.data_len = packet->total_data_buflen,
    543		.per_pkt_info_offset = sizeof(struct rndis_packet),
    544	};
    545
    546	rndis_msg_size = RNDIS_MESSAGE_SIZE(struct rndis_packet);
    547
    548	hash = skb_get_hash_raw(skb);
    549	if (hash != 0 && net->real_num_tx_queues > 1) {
    550		u32 *hash_info;
    551
    552		rndis_msg_size += NDIS_HASH_PPI_SIZE;
    553		hash_info = init_ppi_data(rndis_msg, NDIS_HASH_PPI_SIZE,
    554					  NBL_HASH_VALUE);
    555		*hash_info = hash;
    556	}
    557
    558	/* When using AF_PACKET we need to drop VLAN header from
    559	 * the frame and update the SKB to allow the HOST OS
    560	 * to transmit the 802.1Q packet
    561	 */
    562	if (skb->protocol == htons(ETH_P_8021Q)) {
    563		u16 vlan_tci;
    564
    565		skb_reset_mac_header(skb);
    566		if (eth_type_vlan(eth_hdr(skb)->h_proto)) {
    567			if (unlikely(__skb_vlan_pop(skb, &vlan_tci) != 0)) {
    568				++net_device_ctx->eth_stats.vlan_error;
    569				goto drop;
    570			}
    571
    572			__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tci);
    573			/* Update the NDIS header pkt lengths */
    574			packet->total_data_buflen -= VLAN_HLEN;
    575			packet->total_bytes -= VLAN_HLEN;
    576			rndis_msg->msg_len = packet->total_data_buflen;
    577			rndis_msg->msg.pkt.data_len = packet->total_data_buflen;
    578		}
    579	}
    580
    581	if (skb_vlan_tag_present(skb)) {
    582		struct ndis_pkt_8021q_info *vlan;
    583
    584		rndis_msg_size += NDIS_VLAN_PPI_SIZE;
    585		vlan = init_ppi_data(rndis_msg, NDIS_VLAN_PPI_SIZE,
    586				     IEEE_8021Q_INFO);
    587
    588		vlan->value = 0;
    589		vlan->vlanid = skb_vlan_tag_get_id(skb);
    590		vlan->cfi = skb_vlan_tag_get_cfi(skb);
    591		vlan->pri = skb_vlan_tag_get_prio(skb);
    592	}
    593
    594	if (skb_is_gso(skb)) {
    595		struct ndis_tcp_lso_info *lso_info;
    596
    597		rndis_msg_size += NDIS_LSO_PPI_SIZE;
    598		lso_info = init_ppi_data(rndis_msg, NDIS_LSO_PPI_SIZE,
    599					 TCP_LARGESEND_PKTINFO);
    600
    601		lso_info->value = 0;
    602		lso_info->lso_v2_transmit.type = NDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE;
    603		if (skb->protocol == htons(ETH_P_IP)) {
    604			lso_info->lso_v2_transmit.ip_version =
    605				NDIS_TCP_LARGE_SEND_OFFLOAD_IPV4;
    606			ip_hdr(skb)->tot_len = 0;
    607			ip_hdr(skb)->check = 0;
    608			tcp_hdr(skb)->check =
    609				~csum_tcpudp_magic(ip_hdr(skb)->saddr,
    610						   ip_hdr(skb)->daddr, 0, IPPROTO_TCP, 0);
    611		} else {
    612			lso_info->lso_v2_transmit.ip_version =
    613				NDIS_TCP_LARGE_SEND_OFFLOAD_IPV6;
    614			tcp_v6_gso_csum_prep(skb);
    615		}
    616		lso_info->lso_v2_transmit.tcp_header_offset = skb_transport_offset(skb);
    617		lso_info->lso_v2_transmit.mss = skb_shinfo(skb)->gso_size;
    618	} else if (skb->ip_summed == CHECKSUM_PARTIAL) {
    619		if (net_checksum_info(skb) & net_device_ctx->tx_checksum_mask) {
    620			struct ndis_tcp_ip_checksum_info *csum_info;
    621
    622			rndis_msg_size += NDIS_CSUM_PPI_SIZE;
    623			csum_info = init_ppi_data(rndis_msg, NDIS_CSUM_PPI_SIZE,
    624						  TCPIP_CHKSUM_PKTINFO);
    625
    626			csum_info->value = 0;
    627			csum_info->transmit.tcp_header_offset = skb_transport_offset(skb);
    628
    629			if (skb->protocol == htons(ETH_P_IP)) {
    630				csum_info->transmit.is_ipv4 = 1;
    631
    632				if (ip_hdr(skb)->protocol == IPPROTO_TCP)
    633					csum_info->transmit.tcp_checksum = 1;
    634				else
    635					csum_info->transmit.udp_checksum = 1;
    636			} else {
    637				csum_info->transmit.is_ipv6 = 1;
    638
    639				if (ipv6_hdr(skb)->nexthdr == IPPROTO_TCP)
    640					csum_info->transmit.tcp_checksum = 1;
    641				else
    642					csum_info->transmit.udp_checksum = 1;
    643			}
    644		} else {
    645			/* Can't do offload of this type of checksum */
    646			if (skb_checksum_help(skb))
    647				goto drop;
    648		}
    649	}
    650
    651	/* Start filling in the page buffers with the rndis hdr */
    652	rndis_msg->msg_len += rndis_msg_size;
    653	packet->total_data_buflen = rndis_msg->msg_len;
    654	packet->page_buf_cnt = init_page_array(rndis_msg, rndis_msg_size,
    655					       skb, packet, pb);
    656
    657	/* timestamp packet in software */
    658	skb_tx_timestamp(skb);
    659
    660	ret = netvsc_send(net, packet, rndis_msg, pb, skb, xdp_tx);
    661	if (likely(ret == 0))
    662		return NETDEV_TX_OK;
    663
    664	if (ret == -EAGAIN) {
    665		++net_device_ctx->eth_stats.tx_busy;
    666		return NETDEV_TX_BUSY;
    667	}
    668
    669	if (ret == -ENOSPC)
    670		++net_device_ctx->eth_stats.tx_no_space;
    671
    672drop:
    673	dev_kfree_skb_any(skb);
    674	net->stats.tx_dropped++;
    675
    676	return NETDEV_TX_OK;
    677
    678no_memory:
    679	++net_device_ctx->eth_stats.tx_no_memory;
    680	goto drop;
    681}
    682
    683static netdev_tx_t netvsc_start_xmit(struct sk_buff *skb,
    684				     struct net_device *ndev)
    685{
    686	return netvsc_xmit(skb, ndev, false);
    687}
    688
    689/*
    690 * netvsc_linkstatus_callback - Link up/down notification
    691 */
    692void netvsc_linkstatus_callback(struct net_device *net,
    693				struct rndis_message *resp,
    694				void *data, u32 data_buflen)
    695{
    696	struct rndis_indicate_status *indicate = &resp->msg.indicate_status;
    697	struct net_device_context *ndev_ctx = netdev_priv(net);
    698	struct netvsc_reconfig *event;
    699	unsigned long flags;
    700
    701	/* Ensure the packet is big enough to access its fields */
    702	if (resp->msg_len - RNDIS_HEADER_SIZE < sizeof(struct rndis_indicate_status)) {
    703		netdev_err(net, "invalid rndis_indicate_status packet, len: %u\n",
    704			   resp->msg_len);
    705		return;
    706	}
    707
    708	/* Copy the RNDIS indicate status into nvchan->recv_buf */
    709	memcpy(indicate, data + RNDIS_HEADER_SIZE, sizeof(*indicate));
    710
    711	/* Update the physical link speed when changing to another vSwitch */
    712	if (indicate->status == RNDIS_STATUS_LINK_SPEED_CHANGE) {
    713		u32 speed;
    714
    715		/* Validate status_buf_offset and status_buflen.
    716		 *
    717		 * Certain (pre-Fe) implementations of Hyper-V's vSwitch didn't account
    718		 * for the status buffer field in resp->msg_len; perform the validation
    719		 * using data_buflen (>= resp->msg_len).
    720		 */
    721		if (indicate->status_buflen < sizeof(speed) ||
    722		    indicate->status_buf_offset < sizeof(*indicate) ||
    723		    data_buflen - RNDIS_HEADER_SIZE < indicate->status_buf_offset ||
    724		    data_buflen - RNDIS_HEADER_SIZE - indicate->status_buf_offset
    725				< indicate->status_buflen) {
    726			netdev_err(net, "invalid rndis_indicate_status packet\n");
    727			return;
    728		}
    729
    730		speed = *(u32 *)(data + RNDIS_HEADER_SIZE + indicate->status_buf_offset) / 10000;
    731		ndev_ctx->speed = speed;
    732		return;
    733	}
    734
    735	/* Handle these link change statuses below */
    736	if (indicate->status != RNDIS_STATUS_NETWORK_CHANGE &&
    737	    indicate->status != RNDIS_STATUS_MEDIA_CONNECT &&
    738	    indicate->status != RNDIS_STATUS_MEDIA_DISCONNECT)
    739		return;
    740
    741	if (net->reg_state != NETREG_REGISTERED)
    742		return;
    743
    744	event = kzalloc(sizeof(*event), GFP_ATOMIC);
    745	if (!event)
    746		return;
    747	event->event = indicate->status;
    748
    749	spin_lock_irqsave(&ndev_ctx->lock, flags);
    750	list_add_tail(&event->list, &ndev_ctx->reconfig_events);
    751	spin_unlock_irqrestore(&ndev_ctx->lock, flags);
    752
    753	schedule_delayed_work(&ndev_ctx->dwork, 0);
    754}
    755
    756/* This function should only be called after skb_record_rx_queue() */
    757void netvsc_xdp_xmit(struct sk_buff *skb, struct net_device *ndev)
    758{
    759	int rc;
    760
    761	skb->queue_mapping = skb_get_rx_queue(skb);
    762	__skb_push(skb, ETH_HLEN);
    763
    764	rc = netvsc_xmit(skb, ndev, true);
    765
    766	if (dev_xmit_complete(rc))
    767		return;
    768
    769	dev_kfree_skb_any(skb);
    770	ndev->stats.tx_dropped++;
    771}
    772
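        /* Recompute the IPv4 header checksum of the received packet in place. */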
    773static void netvsc_comp_ipcsum(struct sk_buff *skb)
    774{
    775	struct iphdr *iph = (struct iphdr *)skb->data;
    776
    777	iph->check = 0;
    778	iph->check = ip_fast_csum(iph, iph->ihl);
    779}
    780
    781static struct sk_buff *netvsc_alloc_recv_skb(struct net_device *net,
    782					     struct netvsc_channel *nvchan,
    783					     struct xdp_buff *xdp)
    784{
    785	struct napi_struct *napi = &nvchan->napi;
    786	const struct ndis_pkt_8021q_info *vlan = &nvchan->rsc.vlan;
    787	const struct ndis_tcp_ip_checksum_info *csum_info =
    788						&nvchan->rsc.csum_info;
    789	const u32 *hash_info = &nvchan->rsc.hash_info;
    790	u8 ppi_flags = nvchan->rsc.ppi_flags;
    791	struct sk_buff *skb;
    792	void *xbuf = xdp->data_hard_start;
    793	int i;
    794
    795	if (xbuf) {
    796		unsigned int hdroom = xdp->data - xdp->data_hard_start;
    797		unsigned int xlen = xdp->data_end - xdp->data;
    798		unsigned int frag_size = xdp->frame_sz;
    799
    800		skb = build_skb(xbuf, frag_size);
    801
    802		if (!skb) {
    803			__free_page(virt_to_page(xbuf));
    804			return NULL;
    805		}
    806
    807		skb_reserve(skb, hdroom);
    808		skb_put(skb, xlen);
    809		skb->dev = napi->dev;
    810	} else {
    811		skb = napi_alloc_skb(napi, nvchan->rsc.pktlen);
    812
    813		if (!skb)
    814			return NULL;
    815
    816		/* Copy to skb. This copy is needed here since the memory
     817		 * pointed to by hv_netvsc_packet cannot be deallocated.
    818		 */
    819		for (i = 0; i < nvchan->rsc.cnt; i++)
    820			skb_put_data(skb, nvchan->rsc.data[i],
    821				     nvchan->rsc.len[i]);
    822	}
    823
    824	skb->protocol = eth_type_trans(skb, net);
    825
    826	/* skb is already created with CHECKSUM_NONE */
    827	skb_checksum_none_assert(skb);
    828
    829	/* Incoming packets may have IP header checksum verified by the host.
    830	 * They may not have IP header checksum computed after coalescing.
    831	 * We compute it here if the flags are set, because on Linux, the IP
    832	 * checksum is always checked.
    833	 */
    834	if ((ppi_flags & NVSC_RSC_CSUM_INFO) && csum_info->receive.ip_checksum_value_invalid &&
    835	    csum_info->receive.ip_checksum_succeeded &&
    836	    skb->protocol == htons(ETH_P_IP)) {
    837		/* Check that there is enough space to hold the IP header. */
    838		if (skb_headlen(skb) < sizeof(struct iphdr)) {
    839			kfree_skb(skb);
    840			return NULL;
    841		}
    842		netvsc_comp_ipcsum(skb);
    843	}
    844
    845	/* Do L4 checksum offload if enabled and present. */
    846	if ((ppi_flags & NVSC_RSC_CSUM_INFO) && (net->features & NETIF_F_RXCSUM)) {
    847		if (csum_info->receive.tcp_checksum_succeeded ||
    848		    csum_info->receive.udp_checksum_succeeded)
    849			skb->ip_summed = CHECKSUM_UNNECESSARY;
    850	}
    851
    852	if ((ppi_flags & NVSC_RSC_HASH_INFO) && (net->features & NETIF_F_RXHASH))
    853		skb_set_hash(skb, *hash_info, PKT_HASH_TYPE_L4);
    854
    855	if (ppi_flags & NVSC_RSC_VLAN) {
    856		u16 vlan_tci = vlan->vlanid | (vlan->pri << VLAN_PRIO_SHIFT) |
    857			(vlan->cfi ? VLAN_CFI_MASK : 0);
    858
    859		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q),
    860				       vlan_tci);
    861	}
    862
    863	return skb;
    864}
    865
    866/*
    867 * netvsc_recv_callback -  Callback when we receive a packet from the
    868 * "wire" on the specified device.
    869 */
    870int netvsc_recv_callback(struct net_device *net,
    871			 struct netvsc_device *net_device,
    872			 struct netvsc_channel *nvchan)
    873{
    874	struct net_device_context *net_device_ctx = netdev_priv(net);
    875	struct vmbus_channel *channel = nvchan->channel;
    876	u16 q_idx = channel->offermsg.offer.sub_channel_index;
    877	struct sk_buff *skb;
    878	struct netvsc_stats_rx *rx_stats = &nvchan->rx_stats;
    879	struct xdp_buff xdp;
    880	u32 act;
    881
    882	if (net->reg_state != NETREG_REGISTERED)
    883		return NVSP_STAT_FAIL;
    884
    885	act = netvsc_run_xdp(net, nvchan, &xdp);
    886
    887	if (act == XDP_REDIRECT)
    888		return NVSP_STAT_SUCCESS;
    889
    890	if (act != XDP_PASS && act != XDP_TX) {
    891		u64_stats_update_begin(&rx_stats->syncp);
    892		rx_stats->xdp_drop++;
    893		u64_stats_update_end(&rx_stats->syncp);
    894
    895		return NVSP_STAT_SUCCESS; /* consumed by XDP */
    896	}
    897
    898	/* Allocate a skb - TODO direct I/O to pages? */
    899	skb = netvsc_alloc_recv_skb(net, nvchan, &xdp);
    900
    901	if (unlikely(!skb)) {
    902		++net_device_ctx->eth_stats.rx_no_memory;
    903		return NVSP_STAT_FAIL;
    904	}
    905
    906	skb_record_rx_queue(skb, q_idx);
    907
    908	/*
    909	 * Even if injecting the packet, record the statistics
    910	 * on the synthetic device because modifying the VF device
    911	 * statistics will not work correctly.
    912	 */
    913	u64_stats_update_begin(&rx_stats->syncp);
    914	if (act == XDP_TX)
    915		rx_stats->xdp_tx++;
    916
    917	rx_stats->packets++;
    918	rx_stats->bytes += nvchan->rsc.pktlen;
    919
    920	if (skb->pkt_type == PACKET_BROADCAST)
    921		++rx_stats->broadcast;
    922	else if (skb->pkt_type == PACKET_MULTICAST)
    923		++rx_stats->multicast;
    924	u64_stats_update_end(&rx_stats->syncp);
    925
    926	if (act == XDP_TX) {
    927		netvsc_xdp_xmit(skb, net);
    928		return NVSP_STAT_SUCCESS;
    929	}
    930
    931	napi_gro_receive(&nvchan->napi, skb);
    932	return NVSP_STAT_SUCCESS;
    933}
    934
    935static void netvsc_get_drvinfo(struct net_device *net,
    936			       struct ethtool_drvinfo *info)
    937{
    938	strlcpy(info->driver, KBUILD_MODNAME, sizeof(info->driver));
    939	strlcpy(info->fw_version, "N/A", sizeof(info->fw_version));
    940}
    941
    942static void netvsc_get_channels(struct net_device *net,
    943				struct ethtool_channels *channel)
    944{
    945	struct net_device_context *net_device_ctx = netdev_priv(net);
    946	struct netvsc_device *nvdev = rtnl_dereference(net_device_ctx->nvdev);
    947
    948	if (nvdev) {
    949		channel->max_combined	= nvdev->max_chn;
    950		channel->combined_count = nvdev->num_chn;
    951	}
    952}
    953
     954/* Alloc struct netvsc_device_info, and initialize it either from an existing
     955 * struct netvsc_device or from default values.
    956 */
    957static
    958struct netvsc_device_info *netvsc_devinfo_get(struct netvsc_device *nvdev)
    959{
    960	struct netvsc_device_info *dev_info;
    961	struct bpf_prog *prog;
    962
    963	dev_info = kzalloc(sizeof(*dev_info), GFP_ATOMIC);
    964
    965	if (!dev_info)
    966		return NULL;
    967
    968	if (nvdev) {
    969		ASSERT_RTNL();
    970
    971		dev_info->num_chn = nvdev->num_chn;
    972		dev_info->send_sections = nvdev->send_section_cnt;
    973		dev_info->send_section_size = nvdev->send_section_size;
    974		dev_info->recv_sections = nvdev->recv_section_cnt;
    975		dev_info->recv_section_size = nvdev->recv_section_size;
    976
    977		memcpy(dev_info->rss_key, nvdev->extension->rss_key,
    978		       NETVSC_HASH_KEYLEN);
    979
    980		prog = netvsc_xdp_get(nvdev);
    981		if (prog) {
    982			bpf_prog_inc(prog);
    983			dev_info->bprog = prog;
    984		}
    985	} else {
    986		dev_info->num_chn = VRSS_CHANNEL_DEFAULT;
    987		dev_info->send_sections = NETVSC_DEFAULT_TX;
    988		dev_info->send_section_size = NETVSC_SEND_SECTION_SIZE;
    989		dev_info->recv_sections = NETVSC_DEFAULT_RX;
    990		dev_info->recv_section_size = NETVSC_RECV_SECTION_SIZE;
    991	}
    992
    993	return dev_info;
    994}
    995
    996/* Free struct netvsc_device_info */
    997static void netvsc_devinfo_put(struct netvsc_device_info *dev_info)
    998{
    999	if (dev_info->bprog) {
   1000		ASSERT_RTNL();
   1001		bpf_prog_put(dev_info->bprog);
   1002	}
   1003
   1004	kfree(dev_info);
   1005}
   1006
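        /* Quiesce and tear down the RNDIS/netvsc device so that it can be
         * re-created (e.g. with a different channel count, ring size or MTU).
         */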
   1007static int netvsc_detach(struct net_device *ndev,
   1008			 struct netvsc_device *nvdev)
   1009{
   1010	struct net_device_context *ndev_ctx = netdev_priv(ndev);
   1011	struct hv_device *hdev = ndev_ctx->device_ctx;
   1012	int ret;
   1013
    1014	/* Don't keep trying to set up sub-channels */
   1015	if (cancel_work_sync(&nvdev->subchan_work))
   1016		nvdev->num_chn = 1;
   1017
   1018	netvsc_xdp_set(ndev, NULL, NULL, nvdev);
   1019
   1020	/* If device was up (receiving) then shutdown */
   1021	if (netif_running(ndev)) {
   1022		netvsc_tx_disable(nvdev, ndev);
   1023
   1024		ret = rndis_filter_close(nvdev);
   1025		if (ret) {
   1026			netdev_err(ndev,
   1027				   "unable to close device (ret %d).\n", ret);
   1028			return ret;
   1029		}
   1030
   1031		ret = netvsc_wait_until_empty(nvdev);
   1032		if (ret) {
   1033			netdev_err(ndev,
   1034				   "Ring buffer not empty after closing rndis\n");
   1035			return ret;
   1036		}
   1037	}
   1038
   1039	netif_device_detach(ndev);
   1040
   1041	rndis_filter_device_remove(hdev, nvdev);
   1042
   1043	return 0;
   1044}
   1045
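        /* Re-create the RNDIS/netvsc device from a saved netvsc_device_info,
         * restoring sub-channels, any XDP program and the carrier state.
         */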
   1046static int netvsc_attach(struct net_device *ndev,
   1047			 struct netvsc_device_info *dev_info)
   1048{
   1049	struct net_device_context *ndev_ctx = netdev_priv(ndev);
   1050	struct hv_device *hdev = ndev_ctx->device_ctx;
   1051	struct netvsc_device *nvdev;
   1052	struct rndis_device *rdev;
   1053	struct bpf_prog *prog;
   1054	int ret = 0;
   1055
   1056	nvdev = rndis_filter_device_add(hdev, dev_info);
   1057	if (IS_ERR(nvdev))
   1058		return PTR_ERR(nvdev);
   1059
   1060	if (nvdev->num_chn > 1) {
   1061		ret = rndis_set_subchannel(ndev, nvdev, dev_info);
   1062
   1063		/* if unavailable, just proceed with one queue */
   1064		if (ret) {
   1065			nvdev->max_chn = 1;
   1066			nvdev->num_chn = 1;
   1067		}
   1068	}
   1069
   1070	prog = dev_info->bprog;
   1071	if (prog) {
   1072		bpf_prog_inc(prog);
   1073		ret = netvsc_xdp_set(ndev, prog, NULL, nvdev);
   1074		if (ret) {
   1075			bpf_prog_put(prog);
   1076			goto err1;
   1077		}
   1078	}
   1079
   1080	/* In any case device is now ready */
   1081	nvdev->tx_disable = false;
   1082	netif_device_attach(ndev);
   1083
    1084	/* Note: enable and attach happen when sub-channels are set up */
   1085	netif_carrier_off(ndev);
   1086
   1087	if (netif_running(ndev)) {
   1088		ret = rndis_filter_open(nvdev);
   1089		if (ret)
   1090			goto err2;
   1091
   1092		rdev = nvdev->extension;
   1093		if (!rdev->link_state)
   1094			netif_carrier_on(ndev);
   1095	}
   1096
   1097	return 0;
   1098
   1099err2:
   1100	netif_device_detach(ndev);
   1101
   1102err1:
   1103	rndis_filter_device_remove(hdev, nvdev);
   1104
   1105	return ret;
   1106}
   1107
   1108static int netvsc_set_channels(struct net_device *net,
   1109			       struct ethtool_channels *channels)
   1110{
   1111	struct net_device_context *net_device_ctx = netdev_priv(net);
   1112	struct netvsc_device *nvdev = rtnl_dereference(net_device_ctx->nvdev);
   1113	unsigned int orig, count = channels->combined_count;
   1114	struct netvsc_device_info *device_info;
   1115	int ret;
   1116
   1117	/* We do not support separate count for rx, tx, or other */
   1118	if (count == 0 ||
   1119	    channels->rx_count || channels->tx_count || channels->other_count)
   1120		return -EINVAL;
   1121
   1122	if (!nvdev || nvdev->destroy)
   1123		return -ENODEV;
   1124
   1125	if (nvdev->nvsp_version < NVSP_PROTOCOL_VERSION_5)
   1126		return -EINVAL;
   1127
   1128	if (count > nvdev->max_chn)
   1129		return -EINVAL;
   1130
   1131	orig = nvdev->num_chn;
   1132
   1133	device_info = netvsc_devinfo_get(nvdev);
   1134
   1135	if (!device_info)
   1136		return -ENOMEM;
   1137
   1138	device_info->num_chn = count;
   1139
   1140	ret = netvsc_detach(net, nvdev);
   1141	if (ret)
   1142		goto out;
   1143
   1144	ret = netvsc_attach(net, device_info);
   1145	if (ret) {
   1146		device_info->num_chn = orig;
   1147		if (netvsc_attach(net, device_info))
   1148			netdev_err(net, "restoring channel setting failed\n");
   1149	}
   1150
   1151out:
   1152	netvsc_devinfo_put(device_info);
   1153	return ret;
   1154}
   1155
   1156static void netvsc_init_settings(struct net_device *dev)
   1157{
   1158	struct net_device_context *ndc = netdev_priv(dev);
   1159
   1160	ndc->l4_hash = HV_DEFAULT_L4HASH;
   1161
   1162	ndc->speed = SPEED_UNKNOWN;
   1163	ndc->duplex = DUPLEX_FULL;
   1164
   1165	dev->features = NETIF_F_LRO;
   1166}
   1167
   1168static int netvsc_get_link_ksettings(struct net_device *dev,
   1169				     struct ethtool_link_ksettings *cmd)
   1170{
   1171	struct net_device_context *ndc = netdev_priv(dev);
   1172	struct net_device *vf_netdev;
   1173
   1174	vf_netdev = rtnl_dereference(ndc->vf_netdev);
   1175
   1176	if (vf_netdev)
   1177		return __ethtool_get_link_ksettings(vf_netdev, cmd);
   1178
   1179	cmd->base.speed = ndc->speed;
   1180	cmd->base.duplex = ndc->duplex;
   1181	cmd->base.port = PORT_OTHER;
   1182
   1183	return 0;
   1184}
   1185
   1186static int netvsc_set_link_ksettings(struct net_device *dev,
   1187				     const struct ethtool_link_ksettings *cmd)
   1188{
   1189	struct net_device_context *ndc = netdev_priv(dev);
   1190	struct net_device *vf_netdev = rtnl_dereference(ndc->vf_netdev);
   1191
   1192	if (vf_netdev) {
   1193		if (!vf_netdev->ethtool_ops->set_link_ksettings)
   1194			return -EOPNOTSUPP;
   1195
   1196		return vf_netdev->ethtool_ops->set_link_ksettings(vf_netdev,
   1197								  cmd);
   1198	}
   1199
   1200	return ethtool_virtdev_set_link_ksettings(dev, cmd,
   1201						  &ndc->speed, &ndc->duplex);
   1202}
   1203
   1204static int netvsc_change_mtu(struct net_device *ndev, int mtu)
   1205{
   1206	struct net_device_context *ndevctx = netdev_priv(ndev);
   1207	struct net_device *vf_netdev = rtnl_dereference(ndevctx->vf_netdev);
   1208	struct netvsc_device *nvdev = rtnl_dereference(ndevctx->nvdev);
   1209	int orig_mtu = ndev->mtu;
   1210	struct netvsc_device_info *device_info;
   1211	int ret = 0;
   1212
   1213	if (!nvdev || nvdev->destroy)
   1214		return -ENODEV;
   1215
   1216	device_info = netvsc_devinfo_get(nvdev);
   1217
   1218	if (!device_info)
   1219		return -ENOMEM;
   1220
   1221	/* Change MTU of underlying VF netdev first. */
   1222	if (vf_netdev) {
   1223		ret = dev_set_mtu(vf_netdev, mtu);
   1224		if (ret)
   1225			goto out;
   1226	}
   1227
   1228	ret = netvsc_detach(ndev, nvdev);
   1229	if (ret)
   1230		goto rollback_vf;
   1231
   1232	ndev->mtu = mtu;
   1233
   1234	ret = netvsc_attach(ndev, device_info);
   1235	if (!ret)
   1236		goto out;
   1237
   1238	/* Attempt rollback to original MTU */
   1239	ndev->mtu = orig_mtu;
   1240
   1241	if (netvsc_attach(ndev, device_info))
   1242		netdev_err(ndev, "restoring mtu failed\n");
   1243rollback_vf:
   1244	if (vf_netdev)
   1245		dev_set_mtu(vf_netdev, orig_mtu);
   1246
   1247out:
   1248	netvsc_devinfo_put(device_info);
   1249	return ret;
   1250}
   1251
   1252static void netvsc_get_vf_stats(struct net_device *net,
   1253				struct netvsc_vf_pcpu_stats *tot)
   1254{
   1255	struct net_device_context *ndev_ctx = netdev_priv(net);
   1256	int i;
   1257
   1258	memset(tot, 0, sizeof(*tot));
   1259
   1260	for_each_possible_cpu(i) {
   1261		const struct netvsc_vf_pcpu_stats *stats
   1262			= per_cpu_ptr(ndev_ctx->vf_stats, i);
   1263		u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
   1264		unsigned int start;
   1265
   1266		do {
   1267			start = u64_stats_fetch_begin_irq(&stats->syncp);
   1268			rx_packets = stats->rx_packets;
   1269			tx_packets = stats->tx_packets;
   1270			rx_bytes = stats->rx_bytes;
   1271			tx_bytes = stats->tx_bytes;
   1272		} while (u64_stats_fetch_retry_irq(&stats->syncp, start));
   1273
   1274		tot->rx_packets += rx_packets;
   1275		tot->tx_packets += tx_packets;
   1276		tot->rx_bytes   += rx_bytes;
   1277		tot->tx_bytes   += tx_bytes;
   1278		tot->tx_dropped += stats->tx_dropped;
   1279	}
   1280}
   1281
   1282static void netvsc_get_pcpu_stats(struct net_device *net,
   1283				  struct netvsc_ethtool_pcpu_stats *pcpu_tot)
   1284{
   1285	struct net_device_context *ndev_ctx = netdev_priv(net);
   1286	struct netvsc_device *nvdev = rcu_dereference_rtnl(ndev_ctx->nvdev);
   1287	int i;
   1288
   1289	/* fetch percpu stats of vf */
   1290	for_each_possible_cpu(i) {
   1291		const struct netvsc_vf_pcpu_stats *stats =
   1292			per_cpu_ptr(ndev_ctx->vf_stats, i);
   1293		struct netvsc_ethtool_pcpu_stats *this_tot = &pcpu_tot[i];
   1294		unsigned int start;
   1295
   1296		do {
   1297			start = u64_stats_fetch_begin_irq(&stats->syncp);
   1298			this_tot->vf_rx_packets = stats->rx_packets;
   1299			this_tot->vf_tx_packets = stats->tx_packets;
   1300			this_tot->vf_rx_bytes = stats->rx_bytes;
   1301			this_tot->vf_tx_bytes = stats->tx_bytes;
   1302		} while (u64_stats_fetch_retry_irq(&stats->syncp, start));
   1303		this_tot->rx_packets = this_tot->vf_rx_packets;
   1304		this_tot->tx_packets = this_tot->vf_tx_packets;
   1305		this_tot->rx_bytes   = this_tot->vf_rx_bytes;
   1306		this_tot->tx_bytes   = this_tot->vf_tx_bytes;
   1307	}
   1308
   1309	/* fetch percpu stats of netvsc */
   1310	for (i = 0; i < nvdev->num_chn; i++) {
   1311		const struct netvsc_channel *nvchan = &nvdev->chan_table[i];
   1312		const struct netvsc_stats_tx *tx_stats;
   1313		const struct netvsc_stats_rx *rx_stats;
   1314		struct netvsc_ethtool_pcpu_stats *this_tot =
   1315			&pcpu_tot[nvchan->channel->target_cpu];
   1316		u64 packets, bytes;
   1317		unsigned int start;
   1318
   1319		tx_stats = &nvchan->tx_stats;
   1320		do {
   1321			start = u64_stats_fetch_begin_irq(&tx_stats->syncp);
   1322			packets = tx_stats->packets;
   1323			bytes = tx_stats->bytes;
   1324		} while (u64_stats_fetch_retry_irq(&tx_stats->syncp, start));
   1325
   1326		this_tot->tx_bytes	+= bytes;
   1327		this_tot->tx_packets	+= packets;
   1328
   1329		rx_stats = &nvchan->rx_stats;
   1330		do {
   1331			start = u64_stats_fetch_begin_irq(&rx_stats->syncp);
   1332			packets = rx_stats->packets;
   1333			bytes = rx_stats->bytes;
   1334		} while (u64_stats_fetch_retry_irq(&rx_stats->syncp, start));
   1335
   1336		this_tot->rx_bytes	+= bytes;
   1337		this_tot->rx_packets	+= packets;
   1338	}
   1339}
   1340
   1341static void netvsc_get_stats64(struct net_device *net,
   1342			       struct rtnl_link_stats64 *t)
   1343{
   1344	struct net_device_context *ndev_ctx = netdev_priv(net);
   1345	struct netvsc_device *nvdev;
   1346	struct netvsc_vf_pcpu_stats vf_tot;
   1347	int i;
   1348
   1349	rcu_read_lock();
   1350
   1351	nvdev = rcu_dereference(ndev_ctx->nvdev);
   1352	if (!nvdev)
   1353		goto out;
   1354
   1355	netdev_stats_to_stats64(t, &net->stats);
   1356
   1357	netvsc_get_vf_stats(net, &vf_tot);
   1358	t->rx_packets += vf_tot.rx_packets;
   1359	t->tx_packets += vf_tot.tx_packets;
   1360	t->rx_bytes   += vf_tot.rx_bytes;
   1361	t->tx_bytes   += vf_tot.tx_bytes;
   1362	t->tx_dropped += vf_tot.tx_dropped;
   1363
   1364	for (i = 0; i < nvdev->num_chn; i++) {
   1365		const struct netvsc_channel *nvchan = &nvdev->chan_table[i];
   1366		const struct netvsc_stats_tx *tx_stats;
   1367		const struct netvsc_stats_rx *rx_stats;
   1368		u64 packets, bytes, multicast;
   1369		unsigned int start;
   1370
   1371		tx_stats = &nvchan->tx_stats;
   1372		do {
   1373			start = u64_stats_fetch_begin_irq(&tx_stats->syncp);
   1374			packets = tx_stats->packets;
   1375			bytes = tx_stats->bytes;
   1376		} while (u64_stats_fetch_retry_irq(&tx_stats->syncp, start));
   1377
   1378		t->tx_bytes	+= bytes;
   1379		t->tx_packets	+= packets;
   1380
   1381		rx_stats = &nvchan->rx_stats;
   1382		do {
   1383			start = u64_stats_fetch_begin_irq(&rx_stats->syncp);
   1384			packets = rx_stats->packets;
   1385			bytes = rx_stats->bytes;
   1386			multicast = rx_stats->multicast + rx_stats->broadcast;
   1387		} while (u64_stats_fetch_retry_irq(&rx_stats->syncp, start));
   1388
   1389		t->rx_bytes	+= bytes;
   1390		t->rx_packets	+= packets;
   1391		t->multicast	+= multicast;
   1392	}
   1393out:
   1394	rcu_read_unlock();
   1395}
   1396
   1397static int netvsc_set_mac_addr(struct net_device *ndev, void *p)
   1398{
   1399	struct net_device_context *ndc = netdev_priv(ndev);
   1400	struct net_device *vf_netdev = rtnl_dereference(ndc->vf_netdev);
   1401	struct netvsc_device *nvdev = rtnl_dereference(ndc->nvdev);
   1402	struct sockaddr *addr = p;
   1403	int err;
   1404
   1405	err = eth_prepare_mac_addr_change(ndev, p);
   1406	if (err)
   1407		return err;
   1408
   1409	if (!nvdev)
   1410		return -ENODEV;
   1411
   1412	if (vf_netdev) {
   1413		err = dev_set_mac_address(vf_netdev, addr, NULL);
   1414		if (err)
   1415			return err;
   1416	}
   1417
   1418	err = rndis_filter_set_device_mac(nvdev, addr->sa_data);
   1419	if (!err) {
   1420		eth_commit_mac_addr_change(ndev, p);
   1421	} else if (vf_netdev) {
   1422		/* rollback change on VF */
   1423		memcpy(addr->sa_data, ndev->dev_addr, ETH_ALEN);
   1424		dev_set_mac_address(vf_netdev, addr, NULL);
   1425	}
   1426
   1427	return err;
   1428}
   1429
   1430static const struct {
   1431	char name[ETH_GSTRING_LEN];
   1432	u16 offset;
   1433} netvsc_stats[] = {
   1434	{ "tx_scattered", offsetof(struct netvsc_ethtool_stats, tx_scattered) },
   1435	{ "tx_no_memory", offsetof(struct netvsc_ethtool_stats, tx_no_memory) },
   1436	{ "tx_no_space",  offsetof(struct netvsc_ethtool_stats, tx_no_space) },
   1437	{ "tx_too_big",	  offsetof(struct netvsc_ethtool_stats, tx_too_big) },
   1438	{ "tx_busy",	  offsetof(struct netvsc_ethtool_stats, tx_busy) },
   1439	{ "tx_send_full", offsetof(struct netvsc_ethtool_stats, tx_send_full) },
   1440	{ "rx_comp_busy", offsetof(struct netvsc_ethtool_stats, rx_comp_busy) },
   1441	{ "rx_no_memory", offsetof(struct netvsc_ethtool_stats, rx_no_memory) },
   1442	{ "stop_queue", offsetof(struct netvsc_ethtool_stats, stop_queue) },
   1443	{ "wake_queue", offsetof(struct netvsc_ethtool_stats, wake_queue) },
   1444	{ "vlan_error", offsetof(struct netvsc_ethtool_stats, vlan_error) },
   1445}, pcpu_stats[] = {
   1446	{ "cpu%u_rx_packets",
   1447		offsetof(struct netvsc_ethtool_pcpu_stats, rx_packets) },
   1448	{ "cpu%u_rx_bytes",
   1449		offsetof(struct netvsc_ethtool_pcpu_stats, rx_bytes) },
   1450	{ "cpu%u_tx_packets",
   1451		offsetof(struct netvsc_ethtool_pcpu_stats, tx_packets) },
   1452	{ "cpu%u_tx_bytes",
   1453		offsetof(struct netvsc_ethtool_pcpu_stats, tx_bytes) },
   1454	{ "cpu%u_vf_rx_packets",
   1455		offsetof(struct netvsc_ethtool_pcpu_stats, vf_rx_packets) },
   1456	{ "cpu%u_vf_rx_bytes",
   1457		offsetof(struct netvsc_ethtool_pcpu_stats, vf_rx_bytes) },
   1458	{ "cpu%u_vf_tx_packets",
   1459		offsetof(struct netvsc_ethtool_pcpu_stats, vf_tx_packets) },
   1460	{ "cpu%u_vf_tx_bytes",
   1461		offsetof(struct netvsc_ethtool_pcpu_stats, vf_tx_bytes) },
   1462}, vf_stats[] = {
   1463	{ "vf_rx_packets", offsetof(struct netvsc_vf_pcpu_stats, rx_packets) },
   1464	{ "vf_rx_bytes",   offsetof(struct netvsc_vf_pcpu_stats, rx_bytes) },
   1465	{ "vf_tx_packets", offsetof(struct netvsc_vf_pcpu_stats, tx_packets) },
   1466	{ "vf_tx_bytes",   offsetof(struct netvsc_vf_pcpu_stats, tx_bytes) },
   1467	{ "vf_tx_dropped", offsetof(struct netvsc_vf_pcpu_stats, tx_dropped) },
   1468};
   1469
   1470#define NETVSC_GLOBAL_STATS_LEN	ARRAY_SIZE(netvsc_stats)
   1471#define NETVSC_VF_STATS_LEN	ARRAY_SIZE(vf_stats)
   1472
    1473/* statistics per present CPU (rx/tx packets/bytes) */
   1474#define NETVSC_PCPU_STATS_LEN (num_present_cpus() * ARRAY_SIZE(pcpu_stats))
   1475
   1476/* 8 statistics per queue (rx/tx packets/bytes, XDP actions) */
   1477#define NETVSC_QUEUE_STATS_LEN(dev) ((dev)->num_chn * 8)
   1478
   1479static int netvsc_get_sset_count(struct net_device *dev, int string_set)
   1480{
   1481	struct net_device_context *ndc = netdev_priv(dev);
   1482	struct netvsc_device *nvdev = rtnl_dereference(ndc->nvdev);
   1483
   1484	if (!nvdev)
   1485		return -ENODEV;
   1486
   1487	switch (string_set) {
   1488	case ETH_SS_STATS:
   1489		return NETVSC_GLOBAL_STATS_LEN
   1490			+ NETVSC_VF_STATS_LEN
   1491			+ NETVSC_QUEUE_STATS_LEN(nvdev)
   1492			+ NETVSC_PCPU_STATS_LEN;
   1493	default:
   1494		return -EINVAL;
   1495	}
   1496}
   1497
   1498static void netvsc_get_ethtool_stats(struct net_device *dev,
   1499				     struct ethtool_stats *stats, u64 *data)
   1500{
   1501	struct net_device_context *ndc = netdev_priv(dev);
   1502	struct netvsc_device *nvdev = rtnl_dereference(ndc->nvdev);
   1503	const void *nds = &ndc->eth_stats;
   1504	const struct netvsc_stats_tx *tx_stats;
   1505	const struct netvsc_stats_rx *rx_stats;
   1506	struct netvsc_vf_pcpu_stats sum;
   1507	struct netvsc_ethtool_pcpu_stats *pcpu_sum;
   1508	unsigned int start;
   1509	u64 packets, bytes;
   1510	u64 xdp_drop;
   1511	u64 xdp_redirect;
   1512	u64 xdp_tx;
   1513	u64 xdp_xmit;
   1514	int i, j, cpu;
   1515
   1516	if (!nvdev)
   1517		return;
   1518
   1519	for (i = 0; i < NETVSC_GLOBAL_STATS_LEN; i++)
   1520		data[i] = *(unsigned long *)(nds + netvsc_stats[i].offset);
   1521
   1522	netvsc_get_vf_stats(dev, &sum);
   1523	for (j = 0; j < NETVSC_VF_STATS_LEN; j++)
   1524		data[i++] = *(u64 *)((void *)&sum + vf_stats[j].offset);
   1525
   1526	for (j = 0; j < nvdev->num_chn; j++) {
   1527		tx_stats = &nvdev->chan_table[j].tx_stats;
   1528
   1529		do {
   1530			start = u64_stats_fetch_begin_irq(&tx_stats->syncp);
   1531			packets = tx_stats->packets;
   1532			bytes = tx_stats->bytes;
   1533			xdp_xmit = tx_stats->xdp_xmit;
   1534		} while (u64_stats_fetch_retry_irq(&tx_stats->syncp, start));
   1535		data[i++] = packets;
   1536		data[i++] = bytes;
   1537		data[i++] = xdp_xmit;
   1538
   1539		rx_stats = &nvdev->chan_table[j].rx_stats;
   1540		do {
   1541			start = u64_stats_fetch_begin_irq(&rx_stats->syncp);
   1542			packets = rx_stats->packets;
   1543			bytes = rx_stats->bytes;
   1544			xdp_drop = rx_stats->xdp_drop;
   1545			xdp_redirect = rx_stats->xdp_redirect;
   1546			xdp_tx = rx_stats->xdp_tx;
   1547		} while (u64_stats_fetch_retry_irq(&rx_stats->syncp, start));
   1548		data[i++] = packets;
   1549		data[i++] = bytes;
   1550		data[i++] = xdp_drop;
   1551		data[i++] = xdp_redirect;
   1552		data[i++] = xdp_tx;
   1553	}
   1554
   1555	pcpu_sum = kvmalloc_array(num_possible_cpus(),
   1556				  sizeof(struct netvsc_ethtool_pcpu_stats),
   1557				  GFP_KERNEL);
   1558	if (!pcpu_sum)
   1559		return;
   1560
   1561	netvsc_get_pcpu_stats(dev, pcpu_sum);
   1562	for_each_present_cpu(cpu) {
   1563		struct netvsc_ethtool_pcpu_stats *this_sum = &pcpu_sum[cpu];
   1564
   1565		for (j = 0; j < ARRAY_SIZE(pcpu_stats); j++)
   1566			data[i++] = *(u64 *)((void *)this_sum
   1567					     + pcpu_stats[j].offset);
   1568	}
   1569	kvfree(pcpu_sum);
   1570}
   1571
   1572static void netvsc_get_strings(struct net_device *dev, u32 stringset, u8 *data)
   1573{
   1574	struct net_device_context *ndc = netdev_priv(dev);
   1575	struct netvsc_device *nvdev = rtnl_dereference(ndc->nvdev);
   1576	u8 *p = data;
   1577	int i, cpu;
   1578
   1579	if (!nvdev)
   1580		return;
   1581
   1582	switch (stringset) {
   1583	case ETH_SS_STATS:
   1584		for (i = 0; i < ARRAY_SIZE(netvsc_stats); i++)
   1585			ethtool_sprintf(&p, netvsc_stats[i].name);
   1586
   1587		for (i = 0; i < ARRAY_SIZE(vf_stats); i++)
   1588			ethtool_sprintf(&p, vf_stats[i].name);
   1589
   1590		for (i = 0; i < nvdev->num_chn; i++) {
   1591			ethtool_sprintf(&p, "tx_queue_%u_packets", i);
   1592			ethtool_sprintf(&p, "tx_queue_%u_bytes", i);
   1593			ethtool_sprintf(&p, "tx_queue_%u_xdp_xmit", i);
   1594			ethtool_sprintf(&p, "rx_queue_%u_packets", i);
   1595			ethtool_sprintf(&p, "rx_queue_%u_bytes", i);
   1596			ethtool_sprintf(&p, "rx_queue_%u_xdp_drop", i);
   1597			ethtool_sprintf(&p, "rx_queue_%u_xdp_redirect", i);
   1598			ethtool_sprintf(&p, "rx_queue_%u_xdp_tx", i);
   1599		}
   1600
   1601		for_each_present_cpu(cpu) {
   1602			for (i = 0; i < ARRAY_SIZE(pcpu_stats); i++)
   1603				ethtool_sprintf(&p, pcpu_stats[i].name, cpu);
   1604		}
   1605
   1606		break;
   1607	}
   1608}
   1609
   1610static int
   1611netvsc_get_rss_hash_opts(struct net_device_context *ndc,
   1612			 struct ethtool_rxnfc *info)
   1613{
   1614	const u32 l4_flag = RXH_L4_B_0_1 | RXH_L4_B_2_3;
   1615
   1616	info->data = RXH_IP_SRC | RXH_IP_DST;
   1617
   1618	switch (info->flow_type) {
   1619	case TCP_V4_FLOW:
   1620		if (ndc->l4_hash & HV_TCP4_L4HASH)
   1621			info->data |= l4_flag;
   1622
   1623		break;
   1624
   1625	case TCP_V6_FLOW:
   1626		if (ndc->l4_hash & HV_TCP6_L4HASH)
   1627			info->data |= l4_flag;
   1628
   1629		break;
   1630
   1631	case UDP_V4_FLOW:
   1632		if (ndc->l4_hash & HV_UDP4_L4HASH)
   1633			info->data |= l4_flag;
   1634
   1635		break;
   1636
   1637	case UDP_V6_FLOW:
   1638		if (ndc->l4_hash & HV_UDP6_L4HASH)
   1639			info->data |= l4_flag;
   1640
   1641		break;
   1642
   1643	case IPV4_FLOW:
   1644	case IPV6_FLOW:
   1645		break;
   1646	default:
   1647		info->data = 0;
   1648		break;
   1649	}
   1650
   1651	return 0;
   1652}
   1653
   1654static int
   1655netvsc_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *info,
   1656		 u32 *rules)
   1657{
   1658	struct net_device_context *ndc = netdev_priv(dev);
   1659	struct netvsc_device *nvdev = rtnl_dereference(ndc->nvdev);
   1660
   1661	if (!nvdev)
   1662		return -ENODEV;
   1663
   1664	switch (info->cmd) {
   1665	case ETHTOOL_GRXRINGS:
   1666		info->data = nvdev->num_chn;
   1667		return 0;
   1668
   1669	case ETHTOOL_GRXFH:
   1670		return netvsc_get_rss_hash_opts(ndc, info);
   1671	}
   1672	return -EOPNOTSUPP;
   1673}
   1674
   1675static int netvsc_set_rss_hash_opts(struct net_device_context *ndc,
   1676				    struct ethtool_rxnfc *info)
   1677{
   1678	if (info->data == (RXH_IP_SRC | RXH_IP_DST |
   1679			   RXH_L4_B_0_1 | RXH_L4_B_2_3)) {
   1680		switch (info->flow_type) {
   1681		case TCP_V4_FLOW:
   1682			ndc->l4_hash |= HV_TCP4_L4HASH;
   1683			break;
   1684
   1685		case TCP_V6_FLOW:
   1686			ndc->l4_hash |= HV_TCP6_L4HASH;
   1687			break;
   1688
   1689		case UDP_V4_FLOW:
   1690			ndc->l4_hash |= HV_UDP4_L4HASH;
   1691			break;
   1692
   1693		case UDP_V6_FLOW:
   1694			ndc->l4_hash |= HV_UDP6_L4HASH;
   1695			break;
   1696
   1697		default:
   1698			return -EOPNOTSUPP;
   1699		}
   1700
   1701		return 0;
   1702	}
   1703
   1704	if (info->data == (RXH_IP_SRC | RXH_IP_DST)) {
   1705		switch (info->flow_type) {
   1706		case TCP_V4_FLOW:
   1707			ndc->l4_hash &= ~HV_TCP4_L4HASH;
   1708			break;
   1709
   1710		case TCP_V6_FLOW:
   1711			ndc->l4_hash &= ~HV_TCP6_L4HASH;
   1712			break;
   1713
   1714		case UDP_V4_FLOW:
   1715			ndc->l4_hash &= ~HV_UDP4_L4HASH;
   1716			break;
   1717
   1718		case UDP_V6_FLOW:
   1719			ndc->l4_hash &= ~HV_UDP6_L4HASH;
   1720			break;
   1721
   1722		default:
   1723			return -EOPNOTSUPP;
   1724		}
   1725
   1726		return 0;
   1727	}
   1728
   1729	return -EOPNOTSUPP;
   1730}
   1731
   1732static int
   1733netvsc_set_rxnfc(struct net_device *ndev, struct ethtool_rxnfc *info)
   1734{
   1735	struct net_device_context *ndc = netdev_priv(ndev);
   1736
   1737	if (info->cmd == ETHTOOL_SRXFH)
   1738		return netvsc_set_rss_hash_opts(ndc, info);
   1739
   1740	return -EOPNOTSUPP;
   1741}
   1742
   1743static u32 netvsc_get_rxfh_key_size(struct net_device *dev)
   1744{
   1745	return NETVSC_HASH_KEYLEN;
   1746}
   1747
   1748static u32 netvsc_rss_indir_size(struct net_device *dev)
   1749{
   1750	return ITAB_NUM;
   1751}
   1752
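/* Read back the RSS indirection table (ITAB_NUM entries) and the Toeplitz
 * hash key (NETVSC_HASH_KEYLEN bytes), as reported by "ethtool -x <dev>".
 */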
   1753static int netvsc_get_rxfh(struct net_device *dev, u32 *indir, u8 *key,
   1754			   u8 *hfunc)
   1755{
   1756	struct net_device_context *ndc = netdev_priv(dev);
   1757	struct netvsc_device *ndev = rtnl_dereference(ndc->nvdev);
   1758	struct rndis_device *rndis_dev;
   1759	int i;
   1760
   1761	if (!ndev)
   1762		return -ENODEV;
   1763
   1764	if (hfunc)
   1765		*hfunc = ETH_RSS_HASH_TOP;	/* Toeplitz */
   1766
   1767	rndis_dev = ndev->extension;
   1768	if (indir) {
   1769		for (i = 0; i < ITAB_NUM; i++)
   1770			indir[i] = ndc->rx_table[i];
   1771	}
   1772
   1773	if (key)
   1774		memcpy(key, rndis_dev->rss_key, NETVSC_HASH_KEYLEN);
   1775
   1776	return 0;
   1777}
   1778
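/* Update the RSS indirection table and/or hash key ("ethtool -X <dev>").
 * Every indirection entry must reference an existing channel; when only
 * the table is supplied, the current key is re-programmed so the new
 * table takes effect.
 */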
   1779static int netvsc_set_rxfh(struct net_device *dev, const u32 *indir,
   1780			   const u8 *key, const u8 hfunc)
   1781{
   1782	struct net_device_context *ndc = netdev_priv(dev);
   1783	struct netvsc_device *ndev = rtnl_dereference(ndc->nvdev);
   1784	struct rndis_device *rndis_dev;
   1785	int i;
   1786
   1787	if (!ndev)
   1788		return -ENODEV;
   1789
   1790	if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP)
   1791		return -EOPNOTSUPP;
   1792
   1793	rndis_dev = ndev->extension;
   1794	if (indir) {
   1795		for (i = 0; i < ITAB_NUM; i++)
   1796			if (indir[i] >= ndev->num_chn)
   1797				return -EINVAL;
   1798
   1799		for (i = 0; i < ITAB_NUM; i++)
   1800			ndc->rx_table[i] = indir[i];
   1801	}
   1802
   1803	if (!key) {
   1804		if (!indir)
   1805			return 0;
   1806
   1807		key = rndis_dev->rss_key;
   1808	}
   1809
   1810	return rndis_filter_set_rss_param(rndis_dev, key);
   1811}
   1812
   1813/* The Hyper-V RNDIS protocol does not have a ring buffer in the HW sense.
   1814 * It does have a pre-allocated receive area which is divided into sections.
   1815 */
   1816static void __netvsc_get_ringparam(struct netvsc_device *nvdev,
   1817				   struct ethtool_ringparam *ring)
   1818{
   1819	u32 max_buf_size;
   1820
   1821	ring->rx_pending = nvdev->recv_section_cnt;
   1822	ring->tx_pending = nvdev->send_section_cnt;
   1823
   1824	if (nvdev->nvsp_version <= NVSP_PROTOCOL_VERSION_2)
   1825		max_buf_size = NETVSC_RECEIVE_BUFFER_SIZE_LEGACY;
   1826	else
   1827		max_buf_size = NETVSC_RECEIVE_BUFFER_SIZE;
   1828
   1829	ring->rx_max_pending = max_buf_size / nvdev->recv_section_size;
   1830	ring->tx_max_pending = NETVSC_SEND_BUFFER_SIZE
   1831		/ nvdev->send_section_size;
   1832}
   1833
   1834static void netvsc_get_ringparam(struct net_device *ndev,
   1835				 struct ethtool_ringparam *ring,
   1836				 struct kernel_ethtool_ringparam *kernel_ring,
   1837				 struct netlink_ext_ack *extack)
   1838{
   1839	struct net_device_context *ndevctx = netdev_priv(ndev);
   1840	struct netvsc_device *nvdev = rtnl_dereference(ndevctx->nvdev);
   1841
   1842	if (!nvdev)
   1843		return;
   1844
   1845	__netvsc_get_ringparam(nvdev, ring);
   1846}
   1847
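/* "ethtool -G <dev> rx N tx M": sizes are in buffer sections, not
 * descriptors, and resizing requires a full detach/attach cycle because
 * the send/receive sections are carved out when the device is set up.
 * On attach failure the original sizes are restored.
 */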
   1848static int netvsc_set_ringparam(struct net_device *ndev,
   1849				struct ethtool_ringparam *ring,
   1850				struct kernel_ethtool_ringparam *kernel_ring,
   1851				struct netlink_ext_ack *extack)
   1852{
   1853	struct net_device_context *ndevctx = netdev_priv(ndev);
   1854	struct netvsc_device *nvdev = rtnl_dereference(ndevctx->nvdev);
   1855	struct netvsc_device_info *device_info;
   1856	struct ethtool_ringparam orig;
   1857	u32 new_tx, new_rx;
   1858	int ret = 0;
   1859
   1860	if (!nvdev || nvdev->destroy)
   1861		return -ENODEV;
   1862
   1863	memset(&orig, 0, sizeof(orig));
   1864	__netvsc_get_ringparam(nvdev, &orig);
   1865
   1866	new_tx = clamp_t(u32, ring->tx_pending,
   1867			 NETVSC_MIN_TX_SECTIONS, orig.tx_max_pending);
   1868	new_rx = clamp_t(u32, ring->rx_pending,
   1869			 NETVSC_MIN_RX_SECTIONS, orig.rx_max_pending);
   1870
   1871	if (new_tx == orig.tx_pending &&
   1872	    new_rx == orig.rx_pending)
   1873		return 0;	 /* no change */
   1874
   1875	device_info = netvsc_devinfo_get(nvdev);
   1876
   1877	if (!device_info)
   1878		return -ENOMEM;
   1879
   1880	device_info->send_sections = new_tx;
   1881	device_info->recv_sections = new_rx;
   1882
   1883	ret = netvsc_detach(ndev, nvdev);
   1884	if (ret)
   1885		goto out;
   1886
   1887	ret = netvsc_attach(ndev, device_info);
   1888	if (ret) {
   1889		device_info->send_sections = orig.tx_pending;
   1890		device_info->recv_sections = orig.rx_pending;
   1891
   1892		if (netvsc_attach(ndev, device_info))
   1893			netdev_err(ndev, "restoring ringparam failed\n");
   1894	}
   1895
   1896out:
   1897	netvsc_devinfo_put(device_info);
   1898	return ret;
   1899}
   1900
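/* ndo_fix_features: LRO (hardware RSC) cannot be used together with an
 * attached XDP program, so it is masked out while a program is loaded.
 */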
   1901static netdev_features_t netvsc_fix_features(struct net_device *ndev,
   1902					     netdev_features_t features)
   1903{
   1904	struct net_device_context *ndevctx = netdev_priv(ndev);
   1905	struct netvsc_device *nvdev = rtnl_dereference(ndevctx->nvdev);
   1906
   1907	if (!nvdev || nvdev->destroy)
   1908		return features;
   1909
   1910	if ((features & NETIF_F_LRO) && netvsc_xdp_get(nvdev)) {
   1911		features ^= NETIF_F_LRO;
   1912		netdev_info(ndev, "Skip LRO - unsupported with XDP\n");
   1913	}
   1914
   1915	return features;
   1916}
   1917
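/* ndo_set_features: toggling NETIF_F_LRO maps to the RNDIS RSC offload
 * parameters; the resulting feature set is also propagated to the slave
 * VF device so both data paths stay in sync.
 */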
   1918static int netvsc_set_features(struct net_device *ndev,
   1919			       netdev_features_t features)
   1920{
   1921	netdev_features_t change = features ^ ndev->features;
   1922	struct net_device_context *ndevctx = netdev_priv(ndev);
   1923	struct netvsc_device *nvdev = rtnl_dereference(ndevctx->nvdev);
   1924	struct net_device *vf_netdev = rtnl_dereference(ndevctx->vf_netdev);
   1925	struct ndis_offload_params offloads;
   1926	int ret = 0;
   1927
   1928	if (!nvdev || nvdev->destroy)
   1929		return -ENODEV;
   1930
   1931	if (!(change & NETIF_F_LRO))
   1932		goto syncvf;
   1933
   1934	memset(&offloads, 0, sizeof(struct ndis_offload_params));
   1935
   1936	if (features & NETIF_F_LRO) {
   1937		offloads.rsc_ip_v4 = NDIS_OFFLOAD_PARAMETERS_RSC_ENABLED;
   1938		offloads.rsc_ip_v6 = NDIS_OFFLOAD_PARAMETERS_RSC_ENABLED;
   1939	} else {
   1940		offloads.rsc_ip_v4 = NDIS_OFFLOAD_PARAMETERS_RSC_DISABLED;
   1941		offloads.rsc_ip_v6 = NDIS_OFFLOAD_PARAMETERS_RSC_DISABLED;
   1942	}
   1943
   1944	ret = rndis_filter_set_offload_params(ndev, nvdev, &offloads);
   1945
   1946	if (ret) {
   1947		features ^= NETIF_F_LRO;
   1948		ndev->features = features;
   1949	}
   1950
   1951syncvf:
   1952	if (!vf_netdev)
   1953		return ret;
   1954
   1955	vf_netdev->wanted_features = features;
   1956	netdev_update_features(vf_netdev);
   1957
   1958	return ret;
   1959}
   1960
   1961static int netvsc_get_regs_len(struct net_device *netdev)
   1962{
   1963	return VRSS_SEND_TAB_SIZE * sizeof(u32);
   1964}
   1965
   1966static void netvsc_get_regs(struct net_device *netdev,
   1967			    struct ethtool_regs *regs, void *p)
   1968{
   1969	struct net_device_context *ndc = netdev_priv(netdev);
   1970	u32 *regs_buff = p;
   1971
   1972	/* Increase the version if the buffer format is changed. */
   1973	regs->version = 1;
   1974
   1975	memcpy(regs_buff, ndc->tx_table, VRSS_SEND_TAB_SIZE * sizeof(u32));
   1976}
   1977
   1978static u32 netvsc_get_msglevel(struct net_device *ndev)
   1979{
   1980	struct net_device_context *ndev_ctx = netdev_priv(ndev);
   1981
   1982	return ndev_ctx->msg_enable;
   1983}
   1984
   1985static void netvsc_set_msglevel(struct net_device *ndev, u32 val)
   1986{
   1987	struct net_device_context *ndev_ctx = netdev_priv(ndev);
   1988
   1989	ndev_ctx->msg_enable = val;
   1990}
   1991
   1992static const struct ethtool_ops ethtool_ops = {
   1993	.get_drvinfo	= netvsc_get_drvinfo,
   1994	.get_regs_len	= netvsc_get_regs_len,
   1995	.get_regs	= netvsc_get_regs,
   1996	.get_msglevel	= netvsc_get_msglevel,
   1997	.set_msglevel	= netvsc_set_msglevel,
   1998	.get_link	= ethtool_op_get_link,
   1999	.get_ethtool_stats = netvsc_get_ethtool_stats,
   2000	.get_sset_count = netvsc_get_sset_count,
   2001	.get_strings	= netvsc_get_strings,
   2002	.get_channels   = netvsc_get_channels,
   2003	.set_channels   = netvsc_set_channels,
   2004	.get_ts_info	= ethtool_op_get_ts_info,
   2005	.get_rxnfc	= netvsc_get_rxnfc,
   2006	.set_rxnfc	= netvsc_set_rxnfc,
   2007	.get_rxfh_key_size = netvsc_get_rxfh_key_size,
   2008	.get_rxfh_indir_size = netvsc_rss_indir_size,
   2009	.get_rxfh	= netvsc_get_rxfh,
   2010	.set_rxfh	= netvsc_set_rxfh,
   2011	.get_link_ksettings = netvsc_get_link_ksettings,
   2012	.set_link_ksettings = netvsc_set_link_ksettings,
   2013	.get_ringparam	= netvsc_get_ringparam,
   2014	.set_ringparam	= netvsc_set_ringparam,
   2015};
   2016
   2017static const struct net_device_ops device_ops = {
   2018	.ndo_open =			netvsc_open,
   2019	.ndo_stop =			netvsc_close,
   2020	.ndo_start_xmit =		netvsc_start_xmit,
   2021	.ndo_change_rx_flags =		netvsc_change_rx_flags,
   2022	.ndo_set_rx_mode =		netvsc_set_rx_mode,
   2023	.ndo_fix_features =		netvsc_fix_features,
   2024	.ndo_set_features =		netvsc_set_features,
   2025	.ndo_change_mtu =		netvsc_change_mtu,
   2026	.ndo_validate_addr =		eth_validate_addr,
   2027	.ndo_set_mac_address =		netvsc_set_mac_addr,
   2028	.ndo_select_queue =		netvsc_select_queue,
   2029	.ndo_get_stats64 =		netvsc_get_stats64,
   2030	.ndo_bpf =			netvsc_bpf,
   2031	.ndo_xdp_xmit =			netvsc_ndoxdp_xmit,
   2032};
   2033
   2034/*
   2035 * Handle link status changes. For RNDIS_STATUS_NETWORK_CHANGE, emulate a
   2036 * link down/up sequence. For RNDIS_STATUS_MEDIA_CONNECT with the carrier
   2037 * already present, send a GARP packet to peers via __netdev_notify_peers().
   2038 */
   2039static void netvsc_link_change(struct work_struct *w)
   2040{
   2041	struct net_device_context *ndev_ctx =
   2042		container_of(w, struct net_device_context, dwork.work);
   2043	struct hv_device *device_obj = ndev_ctx->device_ctx;
   2044	struct net_device *net = hv_get_drvdata(device_obj);
   2045	unsigned long flags, next_reconfig, delay;
   2046	struct netvsc_reconfig *event = NULL;
   2047	struct netvsc_device *net_device;
   2048	struct rndis_device *rdev;
   2049	bool reschedule = false;
   2050
   2051	/* if changes are happening, come back later */
   2052	if (!rtnl_trylock()) {
   2053		schedule_delayed_work(&ndev_ctx->dwork, LINKCHANGE_INT);
   2054		return;
   2055	}
   2056
   2057	net_device = rtnl_dereference(ndev_ctx->nvdev);
   2058	if (!net_device)
   2059		goto out_unlock;
   2060
   2061	rdev = net_device->extension;
   2062
   2063	next_reconfig = ndev_ctx->last_reconfig + LINKCHANGE_INT;
   2064	if (time_is_after_jiffies(next_reconfig)) {
   2065		/* link_watch only sends one notification with current state
   2066		 * per second; avoid doing reconfig more frequently. Handle
   2067		 * wrap-around.
   2068		 */
   2069		delay = next_reconfig - jiffies;
   2070		delay = delay < LINKCHANGE_INT ? delay : LINKCHANGE_INT;
   2071		schedule_delayed_work(&ndev_ctx->dwork, delay);
   2072		goto out_unlock;
   2073	}
   2074	ndev_ctx->last_reconfig = jiffies;
   2075
   2076	spin_lock_irqsave(&ndev_ctx->lock, flags);
   2077	if (!list_empty(&ndev_ctx->reconfig_events)) {
   2078		event = list_first_entry(&ndev_ctx->reconfig_events,
   2079					 struct netvsc_reconfig, list);
   2080		list_del(&event->list);
   2081		reschedule = !list_empty(&ndev_ctx->reconfig_events);
   2082	}
   2083	spin_unlock_irqrestore(&ndev_ctx->lock, flags);
   2084
   2085	if (!event)
   2086		goto out_unlock;
   2087
   2088	switch (event->event) {
   2089		/* Only the following events are possible due to the check in
   2090		 * netvsc_linkstatus_callback()
   2091		 */
   2092	case RNDIS_STATUS_MEDIA_CONNECT:
   2093		if (rdev->link_state) {
   2094			rdev->link_state = false;
   2095			netif_carrier_on(net);
   2096			netvsc_tx_enable(net_device, net);
   2097		} else {
   2098			__netdev_notify_peers(net);
   2099		}
   2100		kfree(event);
   2101		break;
   2102	case RNDIS_STATUS_MEDIA_DISCONNECT:
   2103		if (!rdev->link_state) {
   2104			rdev->link_state = true;
   2105			netif_carrier_off(net);
   2106			netvsc_tx_disable(net_device, net);
   2107		}
   2108		kfree(event);
   2109		break;
   2110	case RNDIS_STATUS_NETWORK_CHANGE:
   2111		/* Only makes sense if carrier is present */
   2112		if (!rdev->link_state) {
   2113			rdev->link_state = true;
   2114			netif_carrier_off(net);
   2115			netvsc_tx_disable(net_device, net);
   2116			event->event = RNDIS_STATUS_MEDIA_CONNECT;
   2117			spin_lock_irqsave(&ndev_ctx->lock, flags);
   2118			list_add(&event->list, &ndev_ctx->reconfig_events);
   2119			spin_unlock_irqrestore(&ndev_ctx->lock, flags);
   2120			reschedule = true;
   2121		}
   2122		break;
   2123	}
   2124
   2125	rtnl_unlock();
   2126
   2127	/* link_watch only sends one notification with current state per
   2128	 * second; handle the next reconfig event in 2 seconds.
   2129	 */
   2130	if (reschedule)
   2131		schedule_delayed_work(&ndev_ctx->dwork, LINKCHANGE_INT);
   2132
   2133	return;
   2134
   2135out_unlock:
   2136	rtnl_unlock();
   2137}
   2138
   2139static struct net_device *get_netvsc_byref(struct net_device *vf_netdev)
   2140{
   2141	struct net_device_context *net_device_ctx;
   2142	struct net_device *dev;
   2143
   2144	dev = netdev_master_upper_dev_get(vf_netdev);
   2145	if (!dev || dev->netdev_ops != &device_ops)
   2146		return NULL;	/* not a netvsc device */
   2147
   2148	net_device_ctx = netdev_priv(dev);
   2149	if (!rtnl_dereference(net_device_ctx->nvdev))
   2150		return NULL;	/* device is removed */
   2151
   2152	return dev;
   2153}
   2154
   2155/* Called when the VF is injecting data into the network stack.
   2156 * Change the associated network device from the VF to netvsc.
   2157 * Note: already called with rcu_read_lock held.
   2158 */
   2159static rx_handler_result_t netvsc_vf_handle_frame(struct sk_buff **pskb)
   2160{
   2161	struct sk_buff *skb = *pskb;
   2162	struct net_device *ndev = rcu_dereference(skb->dev->rx_handler_data);
   2163	struct net_device_context *ndev_ctx = netdev_priv(ndev);
   2164	struct netvsc_vf_pcpu_stats *pcpu_stats
   2165		 = this_cpu_ptr(ndev_ctx->vf_stats);
   2166
   2167	skb = skb_share_check(skb, GFP_ATOMIC);
   2168	if (unlikely(!skb))
   2169		return RX_HANDLER_CONSUMED;
   2170
   2171	*pskb = skb;
   2172
   2173	skb->dev = ndev;
   2174
   2175	u64_stats_update_begin(&pcpu_stats->syncp);
   2176	pcpu_stats->rx_packets++;
   2177	pcpu_stats->rx_bytes += skb->len;
   2178	u64_stats_update_end(&pcpu_stats->syncp);
   2179
   2180	return RX_HANDLER_ANOTHER;
   2181}
   2182
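/* Attach the VF as a transparent slave of the synthetic device: register
 * an rx_handler so VF traffic is accounted to and delivered through the
 * netvsc interface, link it as the master upper device, and schedule the
 * delayed takeover that copies MTU, flags and addresses onto the VF.
 */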
   2183static int netvsc_vf_join(struct net_device *vf_netdev,
   2184			  struct net_device *ndev)
   2185{
   2186	struct net_device_context *ndev_ctx = netdev_priv(ndev);
   2187	int ret;
   2188
   2189	ret = netdev_rx_handler_register(vf_netdev,
   2190					 netvsc_vf_handle_frame, ndev);
   2191	if (ret != 0) {
   2192		netdev_err(vf_netdev,
   2193			   "cannot register netvsc VF receive handler (err = %d)\n",
   2194			   ret);
   2195		goto rx_handler_failed;
   2196	}
   2197
   2198	ret = netdev_master_upper_dev_link(vf_netdev, ndev,
   2199					   NULL, NULL, NULL);
   2200	if (ret != 0) {
   2201		netdev_err(vf_netdev,
   2202			   "cannot set master device %s (err = %d)\n",
   2203			   ndev->name, ret);
   2204		goto upper_link_failed;
   2205	}
   2206
   2207	/* set slave flag before open to prevent IPv6 addrconf */
   2208	vf_netdev->flags |= IFF_SLAVE;
   2209
   2210	schedule_delayed_work(&ndev_ctx->vf_takeover, VF_TAKEOVER_INT);
   2211
   2212	call_netdevice_notifiers(NETDEV_JOIN, vf_netdev);
   2213
   2214	netdev_info(vf_netdev, "joined to %s\n", ndev->name);
   2215	return 0;
   2216
   2217upper_link_failed:
   2218	netdev_rx_handler_unregister(vf_netdev);
   2219rx_handler_failed:
   2220	return ret;
   2221}
   2222
   2223static void __netvsc_vf_setup(struct net_device *ndev,
   2224			      struct net_device *vf_netdev)
   2225{
   2226	int ret;
   2227
   2228	/* Align MTU of VF with master */
   2229	ret = dev_set_mtu(vf_netdev, ndev->mtu);
   2230	if (ret)
   2231		netdev_warn(vf_netdev,
   2232			    "unable to change mtu to %u\n", ndev->mtu);
   2233
   2234	/* set multicast etc flags on VF */
   2235	dev_change_flags(vf_netdev, ndev->flags | IFF_SLAVE, NULL);
   2236
   2237	/* sync address list from ndev to VF */
   2238	netif_addr_lock_bh(ndev);
   2239	dev_uc_sync(vf_netdev, ndev);
   2240	dev_mc_sync(vf_netdev, ndev);
   2241	netif_addr_unlock_bh(ndev);
   2242
   2243	if (netif_running(ndev)) {
   2244		ret = dev_open(vf_netdev, NULL);
   2245		if (ret)
   2246			netdev_warn(vf_netdev,
   2247				    "unable to open: %d\n", ret);
   2248	}
   2249}
   2250
   2251/* Set up the VF as a slave of the synthetic device.
   2252 * Runs in a workqueue to avoid recursion in netlink callbacks.
   2253 */
   2254static void netvsc_vf_setup(struct work_struct *w)
   2255{
   2256	struct net_device_context *ndev_ctx
   2257		= container_of(w, struct net_device_context, vf_takeover.work);
   2258	struct net_device *ndev = hv_get_drvdata(ndev_ctx->device_ctx);
   2259	struct net_device *vf_netdev;
   2260
   2261	if (!rtnl_trylock()) {
   2262		schedule_delayed_work(&ndev_ctx->vf_takeover, 0);
   2263		return;
   2264	}
   2265
   2266	vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev);
   2267	if (vf_netdev)
   2268		__netvsc_vf_setup(ndev, vf_netdev);
   2269
   2270	rtnl_unlock();
   2271}
   2272
   2273/* Find netvsc by VF serial number.
   2274 * The PCI hyperv controller records the serial number as the slot kobj name.
   2275 */
   2276static struct net_device *get_netvsc_byslot(const struct net_device *vf_netdev)
   2277{
   2278	struct device *parent = vf_netdev->dev.parent;
   2279	struct net_device_context *ndev_ctx;
   2280	struct net_device *ndev;
   2281	struct pci_dev *pdev;
   2282	u32 serial;
   2283
   2284	if (!parent || !dev_is_pci(parent))
   2285		return NULL; /* not a PCI device */
   2286
   2287	pdev = to_pci_dev(parent);
   2288	if (!pdev->slot) {
   2289		netdev_notice(vf_netdev, "no PCI slot information\n");
   2290		return NULL;
   2291	}
   2292
   2293	if (kstrtou32(pci_slot_name(pdev->slot), 10, &serial)) {
   2294		netdev_notice(vf_netdev, "Invalid vf serial:%s\n",
   2295			      pci_slot_name(pdev->slot));
   2296		return NULL;
   2297	}
   2298
   2299	list_for_each_entry(ndev_ctx, &netvsc_dev_list, list) {
   2300		if (!ndev_ctx->vf_alloc)
   2301			continue;
   2302
   2303		if (ndev_ctx->vf_serial != serial)
   2304			continue;
   2305
   2306		ndev = hv_get_drvdata(ndev_ctx->device_ctx);
   2307		if (ndev->addr_len != vf_netdev->addr_len ||
   2308		    memcmp(ndev->perm_addr, vf_netdev->perm_addr,
   2309			   ndev->addr_len) != 0)
   2310			continue;
   2311
   2312		return ndev;
   2314	}
   2315
   2316	netdev_notice(vf_netdev,
   2317		      "no netdev found for vf serial:%u\n", serial);
   2318	return NULL;
   2319}
   2320
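/* NETDEV_REGISTER handler: match a newly registered PCI VF to its
 * synthetic netvsc twin (by slot serial number and permanent MAC), move
 * it into the same netns if needed, then enslave it and mirror the
 * netvsc features and any attached XDP program onto it.
 */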
   2321static int netvsc_register_vf(struct net_device *vf_netdev)
   2322{
   2323	struct net_device_context *net_device_ctx;
   2324	struct netvsc_device *netvsc_dev;
   2325	struct bpf_prog *prog;
   2326	struct net_device *ndev;
   2327	int ret;
   2328
   2329	if (vf_netdev->addr_len != ETH_ALEN)
   2330		return NOTIFY_DONE;
   2331
   2332	ndev = get_netvsc_byslot(vf_netdev);
   2333	if (!ndev)
   2334		return NOTIFY_DONE;
   2335
   2336	net_device_ctx = netdev_priv(ndev);
   2337	netvsc_dev = rtnl_dereference(net_device_ctx->nvdev);
   2338	if (!netvsc_dev || rtnl_dereference(net_device_ctx->vf_netdev))
   2339		return NOTIFY_DONE;
   2340
   2341	/* If the synthetic interface is in a different namespace,
   2342	 * then move the VF to that namespace; the join will be
   2343	 * done again in that context.
   2344	 */
   2345	if (!net_eq(dev_net(ndev), dev_net(vf_netdev))) {
   2346		ret = dev_change_net_namespace(vf_netdev,
   2347					       dev_net(ndev), "eth%d");
   2348		if (ret)
   2349			netdev_err(vf_netdev,
   2350				   "could not move to same namespace as %s: %d\n",
   2351				   ndev->name, ret);
   2352		else
   2353			netdev_info(vf_netdev,
   2354				    "VF moved to namespace with: %s\n",
   2355				    ndev->name);
   2356		return NOTIFY_DONE;
   2357	}
   2358
   2359	netdev_info(ndev, "VF registering: %s\n", vf_netdev->name);
   2360
   2361	if (netvsc_vf_join(vf_netdev, ndev) != 0)
   2362		return NOTIFY_DONE;
   2363
   2364	dev_hold(vf_netdev);
   2365	rcu_assign_pointer(net_device_ctx->vf_netdev, vf_netdev);
   2366
   2367	if (ndev->needed_headroom < vf_netdev->needed_headroom)
   2368		ndev->needed_headroom = vf_netdev->needed_headroom;
   2369
   2370	vf_netdev->wanted_features = ndev->features;
   2371	netdev_update_features(vf_netdev);
   2372
   2373	prog = netvsc_xdp_get(netvsc_dev);
   2374	netvsc_vf_setxdp(vf_netdev, prog);
   2375
   2376	return NOTIFY_OK;
   2377}
   2378
   2379/* Change the data path when VF UP/DOWN/CHANGE are detected.
   2380 *
   2381 * Typically a UP or DOWN event is followed by a CHANGE event, so
   2382 * net_device_ctx->data_path_is_vf is used to cache the current data path
   2383 * to avoid the duplicate call of netvsc_switch_datapath() and the duplicate
   2384 * message.
   2385 *
   2386 * During hibernation, if a VF NIC driver (e.g. mlx5) preserves the network
   2387 * interface, there is only the CHANGE event and no UP or DOWN event.
   2388 */
   2389static int netvsc_vf_changed(struct net_device *vf_netdev, unsigned long event)
   2390{
   2391	struct net_device_context *net_device_ctx;
   2392	struct netvsc_device *netvsc_dev;
   2393	struct net_device *ndev;
   2394	bool vf_is_up = false;
   2395	int ret;
   2396
   2397	if (event != NETDEV_GOING_DOWN)
   2398		vf_is_up = netif_running(vf_netdev);
   2399
   2400	ndev = get_netvsc_byref(vf_netdev);
   2401	if (!ndev)
   2402		return NOTIFY_DONE;
   2403
   2404	net_device_ctx = netdev_priv(ndev);
   2405	netvsc_dev = rtnl_dereference(net_device_ctx->nvdev);
   2406	if (!netvsc_dev)
   2407		return NOTIFY_DONE;
   2408
   2409	if (net_device_ctx->data_path_is_vf == vf_is_up)
   2410		return NOTIFY_OK;
   2411
   2412	ret = netvsc_switch_datapath(ndev, vf_is_up);
   2413
   2414	if (ret) {
   2415		netdev_err(ndev,
   2416			   "Data path failed to switch %s VF: %s, err: %d\n",
   2417			   vf_is_up ? "to" : "from", vf_netdev->name, ret);
   2418		return NOTIFY_DONE;
   2419	} else {
   2420		netdev_info(ndev, "Data path switched %s VF: %s\n",
   2421			    vf_is_up ? "to" : "from", vf_netdev->name);
   2422	}
   2423
   2424	return NOTIFY_OK;
   2425}
   2426
   2427static int netvsc_unregister_vf(struct net_device *vf_netdev)
   2428{
   2429	struct net_device *ndev;
   2430	struct net_device_context *net_device_ctx;
   2431
   2432	ndev = get_netvsc_byref(vf_netdev);
   2433	if (!ndev)
   2434		return NOTIFY_DONE;
   2435
   2436	net_device_ctx = netdev_priv(ndev);
   2437	cancel_delayed_work_sync(&net_device_ctx->vf_takeover);
   2438
   2439	netdev_info(ndev, "VF unregistering: %s\n", vf_netdev->name);
   2440
   2441	netvsc_vf_setxdp(vf_netdev, NULL);
   2442
   2443	netdev_rx_handler_unregister(vf_netdev);
   2444	netdev_upper_dev_unlink(vf_netdev, ndev);
   2445	RCU_INIT_POINTER(net_device_ctx->vf_netdev, NULL);
   2446	dev_put(vf_netdev);
   2447
   2448	ndev->needed_headroom = RNDIS_AND_PPI_SIZE;
   2449
   2450	return NOTIFY_OK;
   2451}
   2452
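/* VMBus probe: allocate the net_device and per-device context, create the
 * RNDIS/NETVSC device, then take the rtnl lock before scheduling subchannel
 * work and registering the netdev (see the comment above rtnl_lock() below).
 */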
   2453static int netvsc_probe(struct hv_device *dev,
   2454			const struct hv_vmbus_device_id *dev_id)
   2455{
   2456	struct net_device *net = NULL;
   2457	struct net_device_context *net_device_ctx;
   2458	struct netvsc_device_info *device_info = NULL;
   2459	struct netvsc_device *nvdev;
   2460	int ret = -ENOMEM;
   2461
   2462	net = alloc_etherdev_mq(sizeof(struct net_device_context),
   2463				VRSS_CHANNEL_MAX);
   2464	if (!net)
   2465		goto no_net;
   2466
   2467	netif_carrier_off(net);
   2468
   2469	netvsc_init_settings(net);
   2470
   2471	net_device_ctx = netdev_priv(net);
   2472	net_device_ctx->device_ctx = dev;
   2473	net_device_ctx->msg_enable = netif_msg_init(debug, default_msg);
   2474	if (netif_msg_probe(net_device_ctx))
   2475		netdev_dbg(net, "netvsc msg_enable: %d\n",
   2476			   net_device_ctx->msg_enable);
   2477
   2478	hv_set_drvdata(dev, net);
   2479
   2480	INIT_DELAYED_WORK(&net_device_ctx->dwork, netvsc_link_change);
   2481
   2482	spin_lock_init(&net_device_ctx->lock);
   2483	INIT_LIST_HEAD(&net_device_ctx->reconfig_events);
   2484	INIT_DELAYED_WORK(&net_device_ctx->vf_takeover, netvsc_vf_setup);
   2485
   2486	net_device_ctx->vf_stats
   2487		= netdev_alloc_pcpu_stats(struct netvsc_vf_pcpu_stats);
   2488	if (!net_device_ctx->vf_stats)
   2489		goto no_stats;
   2490
   2491	net->netdev_ops = &device_ops;
   2492	net->ethtool_ops = &ethtool_ops;
   2493	SET_NETDEV_DEV(net, &dev->device);
   2494	dma_set_min_align_mask(&dev->device, HV_HYP_PAGE_SIZE - 1);
   2495
   2496	/* We always need headroom for rndis header */
   2497	net->needed_headroom = RNDIS_AND_PPI_SIZE;
   2498
   2499	/* Initialize the number of queues to 1; we may change it if more
   2500	 * channels are offered later.
   2501	 */
   2502	netif_set_real_num_tx_queues(net, 1);
   2503	netif_set_real_num_rx_queues(net, 1);
   2504
   2505	/* Notify the netvsc driver of the new device */
   2506	device_info = netvsc_devinfo_get(NULL);
   2507
   2508	if (!device_info) {
   2509		ret = -ENOMEM;
   2510		goto devinfo_failed;
   2511	}
   2512
   2513	nvdev = rndis_filter_device_add(dev, device_info);
   2514	if (IS_ERR(nvdev)) {
   2515		ret = PTR_ERR(nvdev);
   2516		netdev_err(net, "unable to add netvsc device (ret %d)\n", ret);
   2517		goto rndis_failed;
   2518	}
   2519
   2520	eth_hw_addr_set(net, device_info->mac_adr);
   2521
   2522	/* We must get rtnl lock before scheduling nvdev->subchan_work,
   2523	 * otherwise netvsc_subchan_work() can get rtnl lock first and wait
   2524	 * all subchannels to show up, but that may not happen because
   2525	 * netvsc_probe() can't get rtnl lock and as a result vmbus_onoffer()
   2526	 * -> ... -> device_add() -> ... -> __device_attach() can't get
   2527	 * the device lock, so all the subchannels can't be processed --
   2528	 * finally netvsc_subchan_work() hangs forever.
   2529	 */
   2530	rtnl_lock();
   2531
   2532	if (nvdev->num_chn > 1)
   2533		schedule_work(&nvdev->subchan_work);
   2534
   2535	/* hw_features computed in rndis_netdev_set_hwcaps() */
   2536	net->features = net->hw_features |
   2537		NETIF_F_HIGHDMA | NETIF_F_HW_VLAN_CTAG_TX |
   2538		NETIF_F_HW_VLAN_CTAG_RX;
   2539	net->vlan_features = net->features;
   2540
   2541	netdev_lockdep_set_classes(net);
   2542
   2543	/* MTU range: 68 - 1500 or 65521 */
   2544	net->min_mtu = NETVSC_MTU_MIN;
   2545	if (nvdev->nvsp_version >= NVSP_PROTOCOL_VERSION_2)
   2546		net->max_mtu = NETVSC_MTU - ETH_HLEN;
   2547	else
   2548		net->max_mtu = ETH_DATA_LEN;
   2549
   2550	nvdev->tx_disable = false;
   2551
   2552	ret = register_netdevice(net);
   2553	if (ret != 0) {
   2554		pr_err("Unable to register netdev.\n");
   2555		goto register_failed;
   2556	}
   2557
   2558	list_add(&net_device_ctx->list, &netvsc_dev_list);
   2559	rtnl_unlock();
   2560
   2561	netvsc_devinfo_put(device_info);
   2562	return 0;
   2563
   2564register_failed:
   2565	rtnl_unlock();
   2566	rndis_filter_device_remove(dev, nvdev);
   2567rndis_failed:
   2568	netvsc_devinfo_put(device_info);
   2569devinfo_failed:
   2570	free_percpu(net_device_ctx->vf_stats);
   2571no_stats:
   2572	hv_set_drvdata(dev, NULL);
   2573	free_netdev(net);
   2574no_net:
   2575	return ret;
   2576}
   2577
   2578static int netvsc_remove(struct hv_device *dev)
   2579{
   2580	struct net_device_context *ndev_ctx;
   2581	struct net_device *vf_netdev, *net;
   2582	struct netvsc_device *nvdev;
   2583
   2584	net = hv_get_drvdata(dev);
   2585	if (net == NULL) {
   2586		dev_err(&dev->device, "No net device to remove\n");
   2587		return 0;
   2588	}
   2589
   2590	ndev_ctx = netdev_priv(net);
   2591
   2592	cancel_delayed_work_sync(&ndev_ctx->dwork);
   2593
   2594	rtnl_lock();
   2595	nvdev = rtnl_dereference(ndev_ctx->nvdev);
   2596	if (nvdev) {
   2597		cancel_work_sync(&nvdev->subchan_work);
   2598		netvsc_xdp_set(net, NULL, NULL, nvdev);
   2599	}
   2600
   2601	/*
   2602	 * Call into the VSC driver to let it know that the device is being
   2603	 * removed. This also blocks MTU and channel changes.
   2604	 */
   2605	vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev);
   2606	if (vf_netdev)
   2607		netvsc_unregister_vf(vf_netdev);
   2608
   2609	if (nvdev)
   2610		rndis_filter_device_remove(dev, nvdev);
   2611
   2612	unregister_netdevice(net);
   2613	list_del(&ndev_ctx->list);
   2614
   2615	rtnl_unlock();
   2616
   2617	hv_set_drvdata(dev, NULL);
   2618
   2619	free_percpu(ndev_ctx->vf_stats);
   2620	free_netdev(net);
   2621	return 0;
   2622}
   2623
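/* PM suspend: snapshot the current device configuration and detach the
 * NETVSC device; netvsc_resume() re-creates it from the snapshot with the
 * data path reset to the synthetic NIC until the VF comes back up.
 */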
   2624static int netvsc_suspend(struct hv_device *dev)
   2625{
   2626	struct net_device_context *ndev_ctx;
   2627	struct netvsc_device *nvdev;
   2628	struct net_device *net;
   2629	int ret;
   2630
   2631	net = hv_get_drvdata(dev);
   2632
   2633	ndev_ctx = netdev_priv(net);
   2634	cancel_delayed_work_sync(&ndev_ctx->dwork);
   2635
   2636	rtnl_lock();
   2637
   2638	nvdev = rtnl_dereference(ndev_ctx->nvdev);
   2639	if (nvdev == NULL) {
   2640		ret = -ENODEV;
   2641		goto out;
   2642	}
   2643
   2644	/* Save the current config info */
   2645	ndev_ctx->saved_netvsc_dev_info = netvsc_devinfo_get(nvdev);
   2646	if (!ndev_ctx->saved_netvsc_dev_info) {
   2647		ret = -ENOMEM;
   2648		goto out;
   2649	}
   2650	ret = netvsc_detach(net, nvdev);
   2651out:
   2652	rtnl_unlock();
   2653
   2654	return ret;
   2655}
   2656
   2657static int netvsc_resume(struct hv_device *dev)
   2658{
   2659	struct net_device *net = hv_get_drvdata(dev);
   2660	struct net_device_context *net_device_ctx;
   2661	struct netvsc_device_info *device_info;
   2662	int ret;
   2663
   2664	rtnl_lock();
   2665
   2666	net_device_ctx = netdev_priv(net);
   2667
   2668	/* Reset the data path to the netvsc NIC before re-opening the vmbus
   2669	 * channel. Later netvsc_netdev_event() will switch the data path to
   2670	 * the VF upon the UP or CHANGE event.
   2671	 */
   2672	net_device_ctx->data_path_is_vf = false;
   2673	device_info = net_device_ctx->saved_netvsc_dev_info;
   2674
   2675	ret = netvsc_attach(net, device_info);
   2676
   2677	netvsc_devinfo_put(device_info);
   2678	net_device_ctx->saved_netvsc_dev_info = NULL;
   2679
   2680	rtnl_unlock();
   2681
   2682	return ret;
   2683}

   2684static const struct hv_vmbus_device_id id_table[] = {
   2685	/* Network guid */
   2686	{ HV_NIC_GUID, },
   2687	{ },
   2688};
   2689
   2690MODULE_DEVICE_TABLE(vmbus, id_table);
   2691
   2692/* The one and only one */
   2693static struct hv_driver netvsc_drv = {
   2694	.name = KBUILD_MODNAME,
   2695	.id_table = id_table,
   2696	.probe = netvsc_probe,
   2697	.remove = netvsc_remove,
   2698	.suspend = netvsc_suspend,
   2699	.resume = netvsc_resume,
   2700	.driver = {
   2701		.probe_type = PROBE_FORCE_SYNCHRONOUS,
   2702	},
   2703};
   2704
   2705/*
   2706 * On Hyper-V, every VF interface is matched with a corresponding
   2707 * synthetic interface. The synthetic interface is presented first
   2708 * to the guest. When the corresponding VF instance is registered,
   2709 * we will take care of switching the data path.
   2710 */
   2711static int netvsc_netdev_event(struct notifier_block *this,
   2712			       unsigned long event, void *ptr)
   2713{
   2714	struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
   2715
   2716	/* Skip our own events */
   2717	if (event_dev->netdev_ops == &device_ops)
   2718		return NOTIFY_DONE;
   2719
   2720	/* Avoid non-Ethernet type devices */
   2721	if (event_dev->type != ARPHRD_ETHER)
   2722		return NOTIFY_DONE;
   2723
   2724	/* Avoid Vlan dev with same MAC registering as VF */
   2725	if (is_vlan_dev(event_dev))
   2726		return NOTIFY_DONE;
   2727
   2728	/* Avoid Bonding master dev with same MAC registering as VF */
   2729	if (netif_is_bond_master(event_dev))
   2730		return NOTIFY_DONE;
   2731
   2732	switch (event) {
   2733	case NETDEV_REGISTER:
   2734		return netvsc_register_vf(event_dev);
   2735	case NETDEV_UNREGISTER:
   2736		return netvsc_unregister_vf(event_dev);
   2737	case NETDEV_UP:
   2738	case NETDEV_DOWN:
   2739	case NETDEV_CHANGE:
   2740	case NETDEV_GOING_DOWN:
   2741		return netvsc_vf_changed(event_dev, event);
   2742	default:
   2743		return NOTIFY_DONE;
   2744	}
   2745}
   2746
   2747static struct notifier_block netvsc_netdev_notifier = {
   2748	.notifier_call = netvsc_netdev_event,
   2749};
   2750
   2751static void __exit netvsc_drv_exit(void)
   2752{
   2753	unregister_netdevice_notifier(&netvsc_netdev_notifier);
   2754	vmbus_driver_unregister(&netvsc_drv);
   2755}
   2756
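/* Module init: a ring_size (in pages) below RING_SIZE_MIN is raised to the
 * minimum and converted to bytes before registering with VMBus; e.g.
 * ring_size=256 on 4 KiB pages gives a 1 MiB VMBus ring buffer.
 */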
   2757static int __init netvsc_drv_init(void)
   2758{
   2759	int ret;
   2760
   2761	if (ring_size < RING_SIZE_MIN) {
   2762		ring_size = RING_SIZE_MIN;
   2763		pr_info("Increased ring_size to %u (min allowed)\n",
   2764			ring_size);
   2765	}
   2766	netvsc_ring_bytes = ring_size * PAGE_SIZE;
   2767
   2768	ret = vmbus_driver_register(&netvsc_drv);
   2769	if (ret)
   2770		return ret;
   2771
   2772	register_netdevice_notifier(&netvsc_netdev_notifier);
   2773	return 0;
   2774}
   2775
   2776MODULE_LICENSE("GPL");
   2777MODULE_DESCRIPTION("Microsoft Hyper-V network driver");
   2778
   2779module_init(netvsc_drv_init);
   2780module_exit(netvsc_drv_exit);