cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

gve_main.c (45485B)


      1// SPDX-License-Identifier: (GPL-2.0 OR MIT)
      2/* Google virtual Ethernet (gve) driver
      3 *
      4 * Copyright (C) 2015-2021 Google, Inc.
      5 */
      6
      7#include <linux/cpumask.h>
      8#include <linux/etherdevice.h>
      9#include <linux/interrupt.h>
     10#include <linux/module.h>
     11#include <linux/pci.h>
     12#include <linux/sched.h>
     13#include <linux/timer.h>
     14#include <linux/workqueue.h>
     15#include <net/sch_generic.h>
     16#include "gve.h"
     17#include "gve_dqo.h"
     18#include "gve_adminq.h"
     19#include "gve_register.h"
     20
     21#define GVE_DEFAULT_RX_COPYBREAK	(256)
     22
     23#define DEFAULT_MSG_LEVEL	(NETIF_MSG_DRV | NETIF_MSG_LINK)
     24#define GVE_VERSION		"1.0.0"
     25#define GVE_VERSION_PREFIX	"GVE-"
     26
     27// Minimum amount of time between queue kicks in msec (10 seconds)
     28#define MIN_TX_TIMEOUT_GAP (1000 * 10)
     29
     30const char gve_version_str[] = GVE_VERSION;
     31static const char gve_version_prefix[] = GVE_VERSION_PREFIX;
     32
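        /* ndo_start_xmit: hand the skb to the GQI or DQO transmit path,
         * depending on the queue format negotiated with the device.
         */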
     33static netdev_tx_t gve_start_xmit(struct sk_buff *skb, struct net_device *dev)
     34{
     35	struct gve_priv *priv = netdev_priv(dev);
     36
     37	if (gve_is_gqi(priv))
     38		return gve_tx(skb, dev);
     39	else
     40		return gve_tx_dqo(skb, dev);
     41}
     42
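        /* ndo_get_stats64: fold the per-ring packet and byte counters into
         * the aggregate stats, using the u64_stats seqcount to read
         * consistent snapshots.
         */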
     43static void gve_get_stats(struct net_device *dev, struct rtnl_link_stats64 *s)
     44{
     45	struct gve_priv *priv = netdev_priv(dev);
     46	unsigned int start;
     47	u64 packets, bytes;
     48	int ring;
     49
     50	if (priv->rx) {
     51		for (ring = 0; ring < priv->rx_cfg.num_queues; ring++) {
     52			do {
     53				start =
     54				  u64_stats_fetch_begin(&priv->rx[ring].statss);
     55				packets = priv->rx[ring].rpackets;
     56				bytes = priv->rx[ring].rbytes;
     57			} while (u64_stats_fetch_retry(&priv->rx[ring].statss,
     58						       start));
     59			s->rx_packets += packets;
     60			s->rx_bytes += bytes;
     61		}
     62	}
     63	if (priv->tx) {
     64		for (ring = 0; ring < priv->tx_cfg.num_queues; ring++) {
     65			do {
     66				start =
     67				  u64_stats_fetch_begin(&priv->tx[ring].statss);
     68				packets = priv->tx[ring].pkt_done;
     69				bytes = priv->tx[ring].bytes_done;
     70			} while (u64_stats_fetch_retry(&priv->tx[ring].statss,
     71						       start));
     72			s->tx_packets += packets;
     73			s->tx_bytes += bytes;
     74		}
     75	}
     76}
     77
     78static int gve_alloc_counter_array(struct gve_priv *priv)
     79{
     80	priv->counter_array =
     81		dma_alloc_coherent(&priv->pdev->dev,
     82				   priv->num_event_counters *
     83				   sizeof(*priv->counter_array),
     84				   &priv->counter_array_bus, GFP_KERNEL);
     85	if (!priv->counter_array)
     86		return -ENOMEM;
     87
     88	return 0;
     89}
     90
     91static void gve_free_counter_array(struct gve_priv *priv)
     92{
     93	if (!priv->counter_array)
     94		return;
     95
     96	dma_free_coherent(&priv->pdev->dev,
     97			  priv->num_event_counters *
     98			  sizeof(*priv->counter_array),
     99			  priv->counter_array, priv->counter_array_bus);
    100	priv->counter_array = NULL;
    101}
    102
    103/* NIC requests to report stats */
    104static void gve_stats_report_task(struct work_struct *work)
    105{
    106	struct gve_priv *priv = container_of(work, struct gve_priv,
    107					     stats_report_task);
    108	if (gve_get_do_report_stats(priv)) {
    109		gve_handle_report_stats(priv);
    110		gve_clear_do_report_stats(priv);
    111	}
    112}
    113
    114static void gve_stats_report_schedule(struct gve_priv *priv)
    115{
    116	if (!gve_get_probe_in_progress(priv) &&
    117	    !gve_get_reset_in_progress(priv)) {
    118		gve_set_do_report_stats(priv);
    119		queue_work(priv->gve_wq, &priv->stats_report_task);
    120	}
    121}
    122
    123static void gve_stats_report_timer(struct timer_list *t)
    124{
    125	struct gve_priv *priv = from_timer(priv, t, stats_report_timer);
    126
    127	mod_timer(&priv->stats_report_timer,
    128		  round_jiffies(jiffies +
    129		  msecs_to_jiffies(priv->stats_report_timer_period)));
    130	gve_stats_report_schedule(priv);
    131}
    132
    133static int gve_alloc_stats_report(struct gve_priv *priv)
    134{
    135	int tx_stats_num, rx_stats_num;
    136
    137	tx_stats_num = (GVE_TX_STATS_REPORT_NUM + NIC_TX_STATS_REPORT_NUM) *
    138		       priv->tx_cfg.num_queues;
    139	rx_stats_num = (GVE_RX_STATS_REPORT_NUM + NIC_RX_STATS_REPORT_NUM) *
    140		       priv->rx_cfg.num_queues;
    141	priv->stats_report_len = struct_size(priv->stats_report, stats,
    142					     tx_stats_num + rx_stats_num);
    143	priv->stats_report =
    144		dma_alloc_coherent(&priv->pdev->dev, priv->stats_report_len,
    145				   &priv->stats_report_bus, GFP_KERNEL);
    146	if (!priv->stats_report)
    147		return -ENOMEM;
    148	/* Set up timer for the report-stats task */
    149	timer_setup(&priv->stats_report_timer, gve_stats_report_timer, 0);
    150	priv->stats_report_timer_period = GVE_STATS_REPORT_TIMER_PERIOD;
    151	return 0;
    152}
    153
    154static void gve_free_stats_report(struct gve_priv *priv)
    155{
    156	if (!priv->stats_report)
    157		return;
    158
    159	del_timer_sync(&priv->stats_report_timer);
    160	dma_free_coherent(&priv->pdev->dev, priv->stats_report_len,
    161			  priv->stats_report, priv->stats_report_bus);
    162	priv->stats_report = NULL;
    163}
    164
    165static irqreturn_t gve_mgmnt_intr(int irq, void *arg)
    166{
    167	struct gve_priv *priv = arg;
    168
    169	queue_work(priv->gve_wq, &priv->service_task);
    170	return IRQ_HANDLED;
    171}
    172
    173static irqreturn_t gve_intr(int irq, void *arg)
    174{
    175	struct gve_notify_block *block = arg;
    176	struct gve_priv *priv = block->priv;
    177
    178	iowrite32be(GVE_IRQ_MASK, gve_irq_doorbell(priv, block));
    179	napi_schedule_irqoff(&block->napi);
    180	return IRQ_HANDLED;
    181}
    182
    183static irqreturn_t gve_intr_dqo(int irq, void *arg)
    184{
    185	struct gve_notify_block *block = arg;
    186
    187	/* Interrupts are automatically masked */
    188	napi_schedule_irqoff(&block->napi);
    189	return IRQ_HANDLED;
    190}
    191
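        /* NAPI poll handler for the GQI queue format. Polls TX and RX work
         * and, once the budget is no longer exhausted, acks the IRQ
         * doorbell. The memory barrier plus the re-check for pending work
         * closes the race where new completions arrive between the last
         * poll and the IRQ ack.
         */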
    192static int gve_napi_poll(struct napi_struct *napi, int budget)
    193{
    194	struct gve_notify_block *block;
    195	__be32 __iomem *irq_doorbell;
    196	bool reschedule = false;
    197	struct gve_priv *priv;
    198	int work_done = 0;
    199
    200	block = container_of(napi, struct gve_notify_block, napi);
    201	priv = block->priv;
    202
    203	if (block->tx)
    204		reschedule |= gve_tx_poll(block, budget);
    205	if (block->rx) {
    206		work_done = gve_rx_poll(block, budget);
    207		reschedule |= work_done == budget;
    208	}
    209
    210	if (reschedule)
    211		return budget;
    212
     213	/* Complete processing - don't unmask irq if busy polling is enabled */
    214	if (likely(napi_complete_done(napi, work_done))) {
    215		irq_doorbell = gve_irq_doorbell(priv, block);
    216		iowrite32be(GVE_IRQ_ACK | GVE_IRQ_EVENT, irq_doorbell);
    217
    218		/* Ensure IRQ ACK is visible before we check pending work.
     219		 * If the queue had issued any updates, they are now visible.
    220		 */
    221		mb();
    222
    223		if (block->tx)
    224			reschedule |= gve_tx_clean_pending(priv, block->tx);
    225		if (block->rx)
    226			reschedule |= gve_rx_work_pending(block->rx);
    227
    228		if (reschedule && napi_reschedule(napi))
    229			iowrite32be(GVE_IRQ_MASK, irq_doorbell);
    230	}
    231	return work_done;
    232}
    233
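        /* NAPI poll handler for the DQO queue format. Clears the MSI-X PBA
         * bit up front so a stale pending interrupt does not refire, then
         * polls TX/RX and re-enables the interrupt on completion.
         */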
    234static int gve_napi_poll_dqo(struct napi_struct *napi, int budget)
    235{
    236	struct gve_notify_block *block =
    237		container_of(napi, struct gve_notify_block, napi);
    238	struct gve_priv *priv = block->priv;
    239	bool reschedule = false;
    240	int work_done = 0;
    241
    242	/* Clear PCI MSI-X Pending Bit Array (PBA)
    243	 *
    244	 * This bit is set if an interrupt event occurs while the vector is
    245	 * masked. If this bit is set and we reenable the interrupt, it will
    246	 * fire again. Since we're just about to poll the queue state, we don't
    247	 * need it to fire again.
    248	 *
    249	 * Under high softirq load, it's possible that the interrupt condition
    250	 * is triggered twice before we got the chance to process it.
    251	 */
    252	gve_write_irq_doorbell_dqo(priv, block,
    253				   GVE_ITR_NO_UPDATE_DQO | GVE_ITR_CLEAR_PBA_BIT_DQO);
    254
    255	if (block->tx)
    256		reschedule |= gve_tx_poll_dqo(block, /*do_clean=*/true);
    257
    258	if (block->rx) {
    259		work_done = gve_rx_poll_dqo(block, budget);
    260		reschedule |= work_done == budget;
    261	}
    262
    263	if (reschedule)
    264		return budget;
    265
    266	if (likely(napi_complete_done(napi, work_done))) {
    267		/* Enable interrupts again.
    268		 *
    269		 * We don't need to repoll afterwards because HW supports the
    270		 * PCI MSI-X PBA feature.
    271		 *
    272		 * Another interrupt would be triggered if a new event came in
    273		 * since the last one.
    274		 */
    275		gve_write_irq_doorbell_dqo(priv, block,
    276					   GVE_ITR_NO_UPDATE_DQO | GVE_ITR_ENABLE_BIT_DQO);
    277	}
    278
    279	return work_done;
    280}
    281
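        /* Allocate and wire up the MSI-X vectors: one vector per
         * notification block plus a final management vector. If fewer
         * vectors are granted than requested, scale the TX/RX queue maxima
         * down to fit. Also allocates the per-block IRQ doorbell index
         * array shared with the device.
         */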
    282static int gve_alloc_notify_blocks(struct gve_priv *priv)
    283{
    284	int num_vecs_requested = priv->num_ntfy_blks + 1;
    285	char *name = priv->dev->name;
    286	unsigned int active_cpus;
    287	int vecs_enabled;
    288	int i, j;
    289	int err;
    290
    291	priv->msix_vectors = kvcalloc(num_vecs_requested,
    292				      sizeof(*priv->msix_vectors), GFP_KERNEL);
    293	if (!priv->msix_vectors)
    294		return -ENOMEM;
    295	for (i = 0; i < num_vecs_requested; i++)
    296		priv->msix_vectors[i].entry = i;
    297	vecs_enabled = pci_enable_msix_range(priv->pdev, priv->msix_vectors,
    298					     GVE_MIN_MSIX, num_vecs_requested);
    299	if (vecs_enabled < 0) {
    300		dev_err(&priv->pdev->dev, "Could not enable min msix %d/%d\n",
    301			GVE_MIN_MSIX, vecs_enabled);
    302		err = vecs_enabled;
    303		goto abort_with_msix_vectors;
    304	}
    305	if (vecs_enabled != num_vecs_requested) {
    306		int new_num_ntfy_blks = (vecs_enabled - 1) & ~0x1;
    307		int vecs_per_type = new_num_ntfy_blks / 2;
    308		int vecs_left = new_num_ntfy_blks % 2;
    309
    310		priv->num_ntfy_blks = new_num_ntfy_blks;
    311		priv->mgmt_msix_idx = priv->num_ntfy_blks;
    312		priv->tx_cfg.max_queues = min_t(int, priv->tx_cfg.max_queues,
    313						vecs_per_type);
    314		priv->rx_cfg.max_queues = min_t(int, priv->rx_cfg.max_queues,
    315						vecs_per_type + vecs_left);
    316		dev_err(&priv->pdev->dev,
    317			"Could not enable desired msix, only enabled %d, adjusting tx max queues to %d, and rx max queues to %d\n",
    318			vecs_enabled, priv->tx_cfg.max_queues,
    319			priv->rx_cfg.max_queues);
    320		if (priv->tx_cfg.num_queues > priv->tx_cfg.max_queues)
    321			priv->tx_cfg.num_queues = priv->tx_cfg.max_queues;
    322		if (priv->rx_cfg.num_queues > priv->rx_cfg.max_queues)
    323			priv->rx_cfg.num_queues = priv->rx_cfg.max_queues;
    324	}
    325	/* Half the notification blocks go to TX and half to RX */
    326	active_cpus = min_t(int, priv->num_ntfy_blks / 2, num_online_cpus());
    327
     328	/* Setup Management Vector - the last vector */
    329	snprintf(priv->mgmt_msix_name, sizeof(priv->mgmt_msix_name), "%s-mgmnt",
    330		 name);
    331	err = request_irq(priv->msix_vectors[priv->mgmt_msix_idx].vector,
    332			  gve_mgmnt_intr, 0, priv->mgmt_msix_name, priv);
    333	if (err) {
    334		dev_err(&priv->pdev->dev, "Did not receive management vector.\n");
    335		goto abort_with_msix_enabled;
    336	}
    337	priv->irq_db_indices =
    338		dma_alloc_coherent(&priv->pdev->dev,
    339				   priv->num_ntfy_blks *
    340				   sizeof(*priv->irq_db_indices),
    341				   &priv->irq_db_indices_bus, GFP_KERNEL);
    342	if (!priv->irq_db_indices) {
    343		err = -ENOMEM;
    344		goto abort_with_mgmt_vector;
    345	}
    346
    347	priv->ntfy_blocks = kvzalloc(priv->num_ntfy_blks *
    348				     sizeof(*priv->ntfy_blocks), GFP_KERNEL);
    349	if (!priv->ntfy_blocks) {
    350		err = -ENOMEM;
    351		goto abort_with_irq_db_indices;
    352	}
    353
    354	/* Setup the other blocks - the first n-1 vectors */
    355	for (i = 0; i < priv->num_ntfy_blks; i++) {
    356		struct gve_notify_block *block = &priv->ntfy_blocks[i];
    357		int msix_idx = i;
    358
    359		snprintf(block->name, sizeof(block->name), "%s-ntfy-block.%d",
    360			 name, i);
    361		block->priv = priv;
    362		err = request_irq(priv->msix_vectors[msix_idx].vector,
    363				  gve_is_gqi(priv) ? gve_intr : gve_intr_dqo,
    364				  0, block->name, block);
    365		if (err) {
    366			dev_err(&priv->pdev->dev,
    367				"Failed to receive msix vector %d\n", i);
    368			goto abort_with_some_ntfy_blocks;
    369		}
    370		irq_set_affinity_hint(priv->msix_vectors[msix_idx].vector,
    371				      get_cpu_mask(i % active_cpus));
    372		block->irq_db_index = &priv->irq_db_indices[i].index;
    373	}
    374	return 0;
    375abort_with_some_ntfy_blocks:
    376	for (j = 0; j < i; j++) {
    377		struct gve_notify_block *block = &priv->ntfy_blocks[j];
    378		int msix_idx = j;
    379
    380		irq_set_affinity_hint(priv->msix_vectors[msix_idx].vector,
    381				      NULL);
    382		free_irq(priv->msix_vectors[msix_idx].vector, block);
    383	}
    384	kvfree(priv->ntfy_blocks);
    385	priv->ntfy_blocks = NULL;
    386abort_with_irq_db_indices:
    387	dma_free_coherent(&priv->pdev->dev, priv->num_ntfy_blks *
    388			  sizeof(*priv->irq_db_indices),
    389			  priv->irq_db_indices, priv->irq_db_indices_bus);
    390	priv->irq_db_indices = NULL;
    391abort_with_mgmt_vector:
    392	free_irq(priv->msix_vectors[priv->mgmt_msix_idx].vector, priv);
    393abort_with_msix_enabled:
    394	pci_disable_msix(priv->pdev);
    395abort_with_msix_vectors:
    396	kvfree(priv->msix_vectors);
    397	priv->msix_vectors = NULL;
    398	return err;
    399}
    400
    401static void gve_free_notify_blocks(struct gve_priv *priv)
    402{
    403	int i;
    404
    405	if (!priv->msix_vectors)
    406		return;
    407
    408	/* Free the irqs */
    409	for (i = 0; i < priv->num_ntfy_blks; i++) {
    410		struct gve_notify_block *block = &priv->ntfy_blocks[i];
    411		int msix_idx = i;
    412
    413		irq_set_affinity_hint(priv->msix_vectors[msix_idx].vector,
    414				      NULL);
    415		free_irq(priv->msix_vectors[msix_idx].vector, block);
    416	}
    417	free_irq(priv->msix_vectors[priv->mgmt_msix_idx].vector, priv);
    418	kvfree(priv->ntfy_blocks);
    419	priv->ntfy_blocks = NULL;
    420	dma_free_coherent(&priv->pdev->dev, priv->num_ntfy_blks *
    421			  sizeof(*priv->irq_db_indices),
    422			  priv->irq_db_indices, priv->irq_db_indices_bus);
    423	priv->irq_db_indices = NULL;
    424	pci_disable_msix(priv->pdev);
    425	kvfree(priv->msix_vectors);
    426	priv->msix_vectors = NULL;
    427}
    428
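        /* Allocate the event counter array, notification blocks and stats
         * report buffer, then describe them to the device over the admin
         * queue. For the DQO-RDA format the packet type LUT is also fetched
         * here.
         */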
    429static int gve_setup_device_resources(struct gve_priv *priv)
    430{
    431	int err;
    432
    433	err = gve_alloc_counter_array(priv);
    434	if (err)
    435		return err;
    436	err = gve_alloc_notify_blocks(priv);
    437	if (err)
    438		goto abort_with_counter;
    439	err = gve_alloc_stats_report(priv);
    440	if (err)
    441		goto abort_with_ntfy_blocks;
    442	err = gve_adminq_configure_device_resources(priv,
    443						    priv->counter_array_bus,
    444						    priv->num_event_counters,
    445						    priv->irq_db_indices_bus,
    446						    priv->num_ntfy_blks);
    447	if (unlikely(err)) {
    448		dev_err(&priv->pdev->dev,
    449			"could not setup device_resources: err=%d\n", err);
    450		err = -ENXIO;
    451		goto abort_with_stats_report;
    452	}
    453
    454	if (priv->queue_format == GVE_DQO_RDA_FORMAT) {
    455		priv->ptype_lut_dqo = kvzalloc(sizeof(*priv->ptype_lut_dqo),
    456					       GFP_KERNEL);
    457		if (!priv->ptype_lut_dqo) {
    458			err = -ENOMEM;
    459			goto abort_with_stats_report;
    460		}
    461		err = gve_adminq_get_ptype_map_dqo(priv, priv->ptype_lut_dqo);
    462		if (err) {
    463			dev_err(&priv->pdev->dev,
    464				"Failed to get ptype map: err=%d\n", err);
    465			goto abort_with_ptype_lut;
    466		}
    467	}
    468
    469	err = gve_adminq_report_stats(priv, priv->stats_report_len,
    470				      priv->stats_report_bus,
    471				      GVE_STATS_REPORT_TIMER_PERIOD);
    472	if (err)
    473		dev_err(&priv->pdev->dev,
    474			"Failed to report stats: err=%d\n", err);
    475	gve_set_device_resources_ok(priv);
    476	return 0;
    477
    478abort_with_ptype_lut:
    479	kvfree(priv->ptype_lut_dqo);
    480	priv->ptype_lut_dqo = NULL;
    481abort_with_stats_report:
    482	gve_free_stats_report(priv);
    483abort_with_ntfy_blocks:
    484	gve_free_notify_blocks(priv);
    485abort_with_counter:
    486	gve_free_counter_array(priv);
    487
    488	return err;
    489}
    490
    491static void gve_trigger_reset(struct gve_priv *priv);
    492
    493static void gve_teardown_device_resources(struct gve_priv *priv)
    494{
    495	int err;
    496
    497	/* Tell device its resources are being freed */
    498	if (gve_get_device_resources_ok(priv)) {
    499		/* detach the stats report */
    500		err = gve_adminq_report_stats(priv, 0, 0x0, GVE_STATS_REPORT_TIMER_PERIOD);
    501		if (err) {
    502			dev_err(&priv->pdev->dev,
    503				"Failed to detach stats report: err=%d\n", err);
    504			gve_trigger_reset(priv);
    505		}
    506		err = gve_adminq_deconfigure_device_resources(priv);
    507		if (err) {
    508			dev_err(&priv->pdev->dev,
    509				"Could not deconfigure device resources: err=%d\n",
    510				err);
    511			gve_trigger_reset(priv);
    512		}
    513	}
    514
    515	kvfree(priv->ptype_lut_dqo);
    516	priv->ptype_lut_dqo = NULL;
    517
    518	gve_free_counter_array(priv);
    519	gve_free_notify_blocks(priv);
    520	gve_free_stats_report(priv);
    521	gve_clear_device_resources_ok(priv);
    522}
    523
    524static void gve_add_napi(struct gve_priv *priv, int ntfy_idx,
    525			 int (*gve_poll)(struct napi_struct *, int))
    526{
    527	struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx];
    528
    529	netif_napi_add(priv->dev, &block->napi, gve_poll,
    530		       NAPI_POLL_WEIGHT);
    531}
    532
    533static void gve_remove_napi(struct gve_priv *priv, int ntfy_idx)
    534{
    535	struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx];
    536
    537	netif_napi_del(&block->napi);
    538}
    539
    540static int gve_register_qpls(struct gve_priv *priv)
    541{
    542	int num_qpls = gve_num_tx_qpls(priv) + gve_num_rx_qpls(priv);
    543	int err;
    544	int i;
    545
    546	for (i = 0; i < num_qpls; i++) {
    547		err = gve_adminq_register_page_list(priv, &priv->qpls[i]);
    548		if (err) {
    549			netif_err(priv, drv, priv->dev,
    550				  "failed to register queue page list %d\n",
    551				  priv->qpls[i].id);
    552			/* This failure will trigger a reset - no need to clean
    553			 * up
    554			 */
    555			return err;
    556		}
    557	}
    558	return 0;
    559}
    560
    561static int gve_unregister_qpls(struct gve_priv *priv)
    562{
    563	int num_qpls = gve_num_tx_qpls(priv) + gve_num_rx_qpls(priv);
    564	int err;
    565	int i;
    566
    567	for (i = 0; i < num_qpls; i++) {
    568		err = gve_adminq_unregister_page_list(priv, priv->qpls[i].id);
    569		/* This failure will trigger a reset - no need to clean up */
    570		if (err) {
    571			netif_err(priv, drv, priv->dev,
    572				  "Failed to unregister queue page list %d\n",
    573				  priv->qpls[i].id);
    574			return err;
    575		}
    576	}
    577	return 0;
    578}
    579
    580static int gve_create_rings(struct gve_priv *priv)
    581{
    582	int err;
    583	int i;
    584
    585	err = gve_adminq_create_tx_queues(priv, priv->tx_cfg.num_queues);
    586	if (err) {
    587		netif_err(priv, drv, priv->dev, "failed to create %d tx queues\n",
    588			  priv->tx_cfg.num_queues);
    589		/* This failure will trigger a reset - no need to clean
    590		 * up
    591		 */
    592		return err;
    593	}
    594	netif_dbg(priv, drv, priv->dev, "created %d tx queues\n",
    595		  priv->tx_cfg.num_queues);
    596
    597	err = gve_adminq_create_rx_queues(priv, priv->rx_cfg.num_queues);
    598	if (err) {
    599		netif_err(priv, drv, priv->dev, "failed to create %d rx queues\n",
    600			  priv->rx_cfg.num_queues);
    601		/* This failure will trigger a reset - no need to clean
    602		 * up
    603		 */
    604		return err;
    605	}
    606	netif_dbg(priv, drv, priv->dev, "created %d rx queues\n",
    607		  priv->rx_cfg.num_queues);
    608
    609	if (gve_is_gqi(priv)) {
    610		/* Rx data ring has been prefilled with packet buffers at queue
    611		 * allocation time.
    612		 *
    613		 * Write the doorbell to provide descriptor slots and packet
    614		 * buffers to the NIC.
    615		 */
    616		for (i = 0; i < priv->rx_cfg.num_queues; i++)
    617			gve_rx_write_doorbell(priv, &priv->rx[i]);
    618	} else {
    619		for (i = 0; i < priv->rx_cfg.num_queues; i++) {
    620			/* Post buffers and ring doorbell. */
    621			gve_rx_post_buffers_dqo(&priv->rx[i]);
    622		}
    623	}
    624
    625	return 0;
    626}
    627
    628static void add_napi_init_sync_stats(struct gve_priv *priv,
    629				     int (*napi_poll)(struct napi_struct *napi,
    630						      int budget))
    631{
    632	int i;
    633
     634	/* Add tx napi & init sync stats */
    635	for (i = 0; i < priv->tx_cfg.num_queues; i++) {
    636		int ntfy_idx = gve_tx_idx_to_ntfy(priv, i);
    637
    638		u64_stats_init(&priv->tx[i].statss);
    639		priv->tx[i].ntfy_id = ntfy_idx;
    640		gve_add_napi(priv, ntfy_idx, napi_poll);
    641	}
     642	/* Add rx napi & init sync stats */
    643	for (i = 0; i < priv->rx_cfg.num_queues; i++) {
    644		int ntfy_idx = gve_rx_idx_to_ntfy(priv, i);
    645
    646		u64_stats_init(&priv->rx[i].statss);
    647		priv->rx[i].ntfy_id = ntfy_idx;
    648		gve_add_napi(priv, ntfy_idx, napi_poll);
    649	}
    650}
    651
    652static void gve_tx_free_rings(struct gve_priv *priv)
    653{
    654	if (gve_is_gqi(priv)) {
    655		gve_tx_free_rings_gqi(priv);
    656	} else {
    657		gve_tx_free_rings_dqo(priv);
    658	}
    659}
    660
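        /* Allocate TX and RX ring state for the configured number of
         * queues (GQI or DQO variants), then register the per-ring NAPI
         * contexts and initialize the stats seqcounts.
         */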
    661static int gve_alloc_rings(struct gve_priv *priv)
    662{
    663	int err;
    664
    665	/* Setup tx rings */
    666	priv->tx = kvcalloc(priv->tx_cfg.num_queues, sizeof(*priv->tx),
    667			    GFP_KERNEL);
    668	if (!priv->tx)
    669		return -ENOMEM;
    670
    671	if (gve_is_gqi(priv))
    672		err = gve_tx_alloc_rings(priv);
    673	else
    674		err = gve_tx_alloc_rings_dqo(priv);
    675	if (err)
    676		goto free_tx;
    677
    678	/* Setup rx rings */
    679	priv->rx = kvcalloc(priv->rx_cfg.num_queues, sizeof(*priv->rx),
    680			    GFP_KERNEL);
    681	if (!priv->rx) {
    682		err = -ENOMEM;
    683		goto free_tx_queue;
    684	}
    685
    686	if (gve_is_gqi(priv))
    687		err = gve_rx_alloc_rings(priv);
    688	else
    689		err = gve_rx_alloc_rings_dqo(priv);
    690	if (err)
    691		goto free_rx;
    692
    693	if (gve_is_gqi(priv))
    694		add_napi_init_sync_stats(priv, gve_napi_poll);
    695	else
    696		add_napi_init_sync_stats(priv, gve_napi_poll_dqo);
    697
    698	return 0;
    699
    700free_rx:
    701	kvfree(priv->rx);
    702	priv->rx = NULL;
    703free_tx_queue:
    704	gve_tx_free_rings(priv);
    705free_tx:
    706	kvfree(priv->tx);
    707	priv->tx = NULL;
    708	return err;
    709}
    710
    711static int gve_destroy_rings(struct gve_priv *priv)
    712{
    713	int err;
    714
    715	err = gve_adminq_destroy_tx_queues(priv, priv->tx_cfg.num_queues);
    716	if (err) {
    717		netif_err(priv, drv, priv->dev,
    718			  "failed to destroy tx queues\n");
    719		/* This failure will trigger a reset - no need to clean up */
    720		return err;
    721	}
    722	netif_dbg(priv, drv, priv->dev, "destroyed tx queues\n");
    723	err = gve_adminq_destroy_rx_queues(priv, priv->rx_cfg.num_queues);
    724	if (err) {
    725		netif_err(priv, drv, priv->dev,
    726			  "failed to destroy rx queues\n");
    727		/* This failure will trigger a reset - no need to clean up */
    728		return err;
    729	}
    730	netif_dbg(priv, drv, priv->dev, "destroyed rx queues\n");
    731	return 0;
    732}
    733
    734static void gve_rx_free_rings(struct gve_priv *priv)
    735{
    736	if (gve_is_gqi(priv))
    737		gve_rx_free_rings_gqi(priv);
    738	else
    739		gve_rx_free_rings_dqo(priv);
    740}
    741
    742static void gve_free_rings(struct gve_priv *priv)
    743{
    744	int ntfy_idx;
    745	int i;
    746
    747	if (priv->tx) {
    748		for (i = 0; i < priv->tx_cfg.num_queues; i++) {
    749			ntfy_idx = gve_tx_idx_to_ntfy(priv, i);
    750			gve_remove_napi(priv, ntfy_idx);
    751		}
    752		gve_tx_free_rings(priv);
    753		kvfree(priv->tx);
    754		priv->tx = NULL;
    755	}
    756	if (priv->rx) {
    757		for (i = 0; i < priv->rx_cfg.num_queues; i++) {
    758			ntfy_idx = gve_rx_idx_to_ntfy(priv, i);
    759			gve_remove_napi(priv, ntfy_idx);
    760		}
    761		gve_rx_free_rings(priv);
    762		kvfree(priv->rx);
    763		priv->rx = NULL;
    764	}
    765}
    766
    767int gve_alloc_page(struct gve_priv *priv, struct device *dev,
    768		   struct page **page, dma_addr_t *dma,
    769		   enum dma_data_direction dir, gfp_t gfp_flags)
    770{
    771	*page = alloc_page(gfp_flags);
    772	if (!*page) {
    773		priv->page_alloc_fail++;
    774		return -ENOMEM;
    775	}
    776	*dma = dma_map_page(dev, *page, 0, PAGE_SIZE, dir);
    777	if (dma_mapping_error(dev, *dma)) {
    778		priv->dma_mapping_error++;
    779		put_page(*page);
    780		return -ENOMEM;
    781	}
    782	return 0;
    783}
    784
    785static int gve_alloc_queue_page_list(struct gve_priv *priv, u32 id,
    786				     int pages)
    787{
    788	struct gve_queue_page_list *qpl = &priv->qpls[id];
    789	int err;
    790	int i;
    791
    792	if (pages + priv->num_registered_pages > priv->max_registered_pages) {
    793		netif_err(priv, drv, priv->dev,
    794			  "Reached max number of registered pages %llu > %llu\n",
    795			  pages + priv->num_registered_pages,
    796			  priv->max_registered_pages);
    797		return -EINVAL;
    798	}
    799
    800	qpl->id = id;
    801	qpl->num_entries = 0;
    802	qpl->pages = kvcalloc(pages, sizeof(*qpl->pages), GFP_KERNEL);
    803	/* caller handles clean up */
    804	if (!qpl->pages)
    805		return -ENOMEM;
    806	qpl->page_buses = kvcalloc(pages, sizeof(*qpl->page_buses), GFP_KERNEL);
    807	/* caller handles clean up */
    808	if (!qpl->page_buses)
    809		return -ENOMEM;
    810
    811	for (i = 0; i < pages; i++) {
    812		err = gve_alloc_page(priv, &priv->pdev->dev, &qpl->pages[i],
    813				     &qpl->page_buses[i],
    814				     gve_qpl_dma_dir(priv, id), GFP_KERNEL);
    815		/* caller handles clean up */
    816		if (err)
    817			return -ENOMEM;
    818		qpl->num_entries++;
    819	}
    820	priv->num_registered_pages += pages;
    821
    822	return 0;
    823}
    824
    825void gve_free_page(struct device *dev, struct page *page, dma_addr_t dma,
    826		   enum dma_data_direction dir)
    827{
    828	if (!dma_mapping_error(dev, dma))
    829		dma_unmap_page(dev, dma, PAGE_SIZE, dir);
    830	if (page)
    831		put_page(page);
    832}
    833
    834static void gve_free_queue_page_list(struct gve_priv *priv, u32 id)
    835{
    836	struct gve_queue_page_list *qpl = &priv->qpls[id];
    837	int i;
    838
    839	if (!qpl->pages)
    840		return;
    841	if (!qpl->page_buses)
    842		goto free_pages;
    843
    844	for (i = 0; i < qpl->num_entries; i++)
    845		gve_free_page(&priv->pdev->dev, qpl->pages[i],
    846			      qpl->page_buses[i], gve_qpl_dma_dir(priv, id));
    847
    848	kvfree(qpl->page_buses);
    849free_pages:
    850	kvfree(qpl->pages);
    851	priv->num_registered_pages -= qpl->num_entries;
    852}
    853
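        /* Allocate the queue page lists (QPLs) backing the TX and RX
         * queues, plus the bitmap used to hand QPL ids out to rings.
         * Returns immediately when no QPLs are needed (num_qpls == 0).
         */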
    854static int gve_alloc_qpls(struct gve_priv *priv)
    855{
    856	int num_qpls = gve_num_tx_qpls(priv) + gve_num_rx_qpls(priv);
    857	int i, j;
    858	int err;
    859
    860	if (num_qpls == 0)
    861		return 0;
    862
    863	priv->qpls = kvcalloc(num_qpls, sizeof(*priv->qpls), GFP_KERNEL);
    864	if (!priv->qpls)
    865		return -ENOMEM;
    866
    867	for (i = 0; i < gve_num_tx_qpls(priv); i++) {
    868		err = gve_alloc_queue_page_list(priv, i,
    869						priv->tx_pages_per_qpl);
    870		if (err)
    871			goto free_qpls;
    872	}
    873	for (; i < num_qpls; i++) {
    874		err = gve_alloc_queue_page_list(priv, i,
    875						priv->rx_data_slot_cnt);
    876		if (err)
    877			goto free_qpls;
    878	}
    879
    880	priv->qpl_cfg.qpl_map_size = BITS_TO_LONGS(num_qpls) *
    881				     sizeof(unsigned long) * BITS_PER_BYTE;
    882	priv->qpl_cfg.qpl_id_map = kvcalloc(BITS_TO_LONGS(num_qpls),
    883					    sizeof(unsigned long), GFP_KERNEL);
    884	if (!priv->qpl_cfg.qpl_id_map) {
    885		err = -ENOMEM;
    886		goto free_qpls;
    887	}
    888
    889	return 0;
    890
    891free_qpls:
    892	for (j = 0; j <= i; j++)
    893		gve_free_queue_page_list(priv, j);
    894	kvfree(priv->qpls);
    895	return err;
    896}
    897
    898static void gve_free_qpls(struct gve_priv *priv)
    899{
    900	int num_qpls = gve_num_tx_qpls(priv) + gve_num_rx_qpls(priv);
    901	int i;
    902
    903	if (num_qpls == 0)
    904		return;
    905
    906	kvfree(priv->qpl_cfg.qpl_id_map);
    907
    908	for (i = 0; i < num_qpls; i++)
    909		gve_free_queue_page_list(priv, i);
    910
    911	kvfree(priv->qpls);
    912}
    913
    914/* Use this to schedule a reset when the device is capable of continuing
    915 * to handle other requests in its current state. If it is not, do a reset
    916 * in thread instead.
    917 */
    918void gve_schedule_reset(struct gve_priv *priv)
    919{
    920	gve_set_do_reset(priv);
    921	queue_work(priv->gve_wq, &priv->service_task);
    922}
    923
    924static void gve_reset_and_teardown(struct gve_priv *priv, bool was_up);
    925static int gve_reset_recovery(struct gve_priv *priv, bool was_up);
    926static void gve_turndown(struct gve_priv *priv);
    927static void gve_turnup(struct gve_priv *priv);
    928
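        /* ndo_open: allocate QPLs and rings, register the page lists and
         * create the queues via the admin queue, then enable NAPI and TX.
         * Failures after the device has been touched fall through to a
         * reset rather than a plain unwind.
         */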
    929static int gve_open(struct net_device *dev)
    930{
    931	struct gve_priv *priv = netdev_priv(dev);
    932	int err;
    933
    934	err = gve_alloc_qpls(priv);
    935	if (err)
    936		return err;
    937
    938	err = gve_alloc_rings(priv);
    939	if (err)
    940		goto free_qpls;
    941
    942	err = netif_set_real_num_tx_queues(dev, priv->tx_cfg.num_queues);
    943	if (err)
    944		goto free_rings;
    945	err = netif_set_real_num_rx_queues(dev, priv->rx_cfg.num_queues);
    946	if (err)
    947		goto free_rings;
    948
    949	err = gve_register_qpls(priv);
    950	if (err)
    951		goto reset;
    952
    953	if (!gve_is_gqi(priv)) {
    954		/* Hard code this for now. This may be tuned in the future for
    955		 * performance.
    956		 */
    957		priv->data_buffer_size_dqo = GVE_RX_BUFFER_SIZE_DQO;
    958	}
    959	err = gve_create_rings(priv);
    960	if (err)
    961		goto reset;
    962
    963	gve_set_device_rings_ok(priv);
    964
    965	if (gve_get_report_stats(priv))
    966		mod_timer(&priv->stats_report_timer,
    967			  round_jiffies(jiffies +
    968				msecs_to_jiffies(priv->stats_report_timer_period)));
    969
    970	gve_turnup(priv);
    971	queue_work(priv->gve_wq, &priv->service_task);
    972	priv->interface_up_cnt++;
    973	return 0;
    974
    975free_rings:
    976	gve_free_rings(priv);
    977free_qpls:
    978	gve_free_qpls(priv);
    979	return err;
    980
    981reset:
    982	/* This must have been called from a reset due to the rtnl lock
    983	 * so just return at this point.
    984	 */
    985	if (gve_get_reset_in_progress(priv))
    986		return err;
    987	/* Otherwise reset before returning */
    988	gve_reset_and_teardown(priv, true);
    989	/* if this fails there is nothing we can do so just ignore the return */
    990	gve_reset_recovery(priv, false);
    991	/* return the original error */
    992	return err;
    993}
    994
    995static int gve_close(struct net_device *dev)
    996{
    997	struct gve_priv *priv = netdev_priv(dev);
    998	int err;
    999
   1000	netif_carrier_off(dev);
   1001	if (gve_get_device_rings_ok(priv)) {
   1002		gve_turndown(priv);
   1003		err = gve_destroy_rings(priv);
   1004		if (err)
   1005			goto err;
   1006		err = gve_unregister_qpls(priv);
   1007		if (err)
   1008			goto err;
   1009		gve_clear_device_rings_ok(priv);
   1010	}
   1011	del_timer_sync(&priv->stats_report_timer);
   1012
   1013	gve_free_rings(priv);
   1014	gve_free_qpls(priv);
   1015	priv->interface_down_cnt++;
   1016	return 0;
   1017
   1018err:
   1019	/* This must have been called from a reset due to the rtnl lock
   1020	 * so just return at this point.
   1021	 */
   1022	if (gve_get_reset_in_progress(priv))
   1023		return err;
   1024	/* Otherwise reset before returning */
   1025	gve_reset_and_teardown(priv, true);
   1026	return gve_reset_recovery(priv, false);
   1027}
   1028
   1029int gve_adjust_queues(struct gve_priv *priv,
   1030		      struct gve_queue_config new_rx_config,
   1031		      struct gve_queue_config new_tx_config)
   1032{
   1033	int err;
   1034
   1035	if (netif_carrier_ok(priv->dev)) {
   1036		/* To make this process as simple as possible we teardown the
   1037		 * device, set the new configuration, and then bring the device
   1038		 * up again.
   1039		 */
   1040		err = gve_close(priv->dev);
   1041		/* we have already tried to reset in close,
   1042		 * just fail at this point
   1043		 */
   1044		if (err)
   1045			return err;
   1046		priv->tx_cfg = new_tx_config;
   1047		priv->rx_cfg = new_rx_config;
   1048
   1049		err = gve_open(priv->dev);
   1050		if (err)
   1051			goto err;
   1052
   1053		return 0;
   1054	}
   1055	/* Set the config for the next up. */
   1056	priv->tx_cfg = new_tx_config;
   1057	priv->rx_cfg = new_rx_config;
   1058
   1059	return 0;
   1060err:
   1061	netif_err(priv, drv, priv->dev,
   1062		  "Adjust queues failed! !!! DISABLING ALL QUEUES !!!\n");
   1063	gve_turndown(priv);
   1064	return err;
   1065}
   1066
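        /* Quiesce the data path: drop the carrier, disable every NAPI
         * context and stop the TX queues so no new work is generated.
         * gve_turnup() below is the inverse operation.
         */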
   1067static void gve_turndown(struct gve_priv *priv)
   1068{
   1069	int idx;
   1070
   1071	if (netif_carrier_ok(priv->dev))
   1072		netif_carrier_off(priv->dev);
   1073
   1074	if (!gve_get_napi_enabled(priv))
   1075		return;
   1076
   1077	/* Disable napi to prevent more work from coming in */
   1078	for (idx = 0; idx < priv->tx_cfg.num_queues; idx++) {
   1079		int ntfy_idx = gve_tx_idx_to_ntfy(priv, idx);
   1080		struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx];
   1081
   1082		napi_disable(&block->napi);
   1083	}
   1084	for (idx = 0; idx < priv->rx_cfg.num_queues; idx++) {
   1085		int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx);
   1086		struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx];
   1087
   1088		napi_disable(&block->napi);
   1089	}
   1090
   1091	/* Stop tx queues */
   1092	netif_tx_disable(priv->dev);
   1093
   1094	gve_clear_napi_enabled(priv);
   1095	gve_clear_report_stats(priv);
   1096}
   1097
   1098static void gve_turnup(struct gve_priv *priv)
   1099{
   1100	int idx;
   1101
   1102	/* Start the tx queues */
   1103	netif_tx_start_all_queues(priv->dev);
   1104
   1105	/* Enable napi and unmask interrupts for all queues */
   1106	for (idx = 0; idx < priv->tx_cfg.num_queues; idx++) {
   1107		int ntfy_idx = gve_tx_idx_to_ntfy(priv, idx);
   1108		struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx];
   1109
   1110		napi_enable(&block->napi);
   1111		if (gve_is_gqi(priv)) {
   1112			iowrite32be(0, gve_irq_doorbell(priv, block));
   1113		} else {
   1114			gve_set_itr_coalesce_usecs_dqo(priv, block,
   1115						       priv->tx_coalesce_usecs);
   1116		}
   1117	}
   1118	for (idx = 0; idx < priv->rx_cfg.num_queues; idx++) {
   1119		int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx);
   1120		struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx];
   1121
   1122		napi_enable(&block->napi);
   1123		if (gve_is_gqi(priv)) {
   1124			iowrite32be(0, gve_irq_doorbell(priv, block));
   1125		} else {
   1126			gve_set_itr_coalesce_usecs_dqo(priv, block,
   1127						       priv->rx_coalesce_usecs);
   1128		}
   1129	}
   1130
   1131	gve_set_napi_enabled(priv);
   1132}
   1133
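        /* ndo_tx_timeout: if the NIC reports completions we have not yet
         * processed and the queue was not kicked within the last
         * MIN_TX_TIMEOUT_GAP, mask the IRQ doorbell and schedule NAPI to
         * reap them; otherwise schedule a device reset.
         */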
   1134static void gve_tx_timeout(struct net_device *dev, unsigned int txqueue)
   1135{
   1136	struct gve_notify_block *block;
   1137	struct gve_tx_ring *tx = NULL;
   1138	struct gve_priv *priv;
   1139	u32 last_nic_done;
   1140	u32 current_time;
   1141	u32 ntfy_idx;
   1142
    1143	netdev_info(dev, "Timeout on tx queue %d", txqueue);
   1144	priv = netdev_priv(dev);
   1145	if (txqueue > priv->tx_cfg.num_queues)
   1146		goto reset;
   1147
   1148	ntfy_idx = gve_tx_idx_to_ntfy(priv, txqueue);
   1149	if (ntfy_idx >= priv->num_ntfy_blks)
   1150		goto reset;
   1151
   1152	block = &priv->ntfy_blocks[ntfy_idx];
   1153	tx = block->tx;
   1154
   1155	current_time = jiffies_to_msecs(jiffies);
   1156	if (tx->last_kick_msec + MIN_TX_TIMEOUT_GAP > current_time)
   1157		goto reset;
   1158
   1159	/* Check to see if there are missed completions, which will allow us to
   1160	 * kick the queue.
   1161	 */
   1162	last_nic_done = gve_tx_load_event_counter(priv, tx);
   1163	if (last_nic_done - tx->done) {
   1164		netdev_info(dev, "Kicking queue %d", txqueue);
   1165		iowrite32be(GVE_IRQ_MASK, gve_irq_doorbell(priv, block));
   1166		napi_schedule(&block->napi);
   1167		tx->last_kick_msec = current_time;
   1168		goto out;
   1169	} // Else reset.
   1170
   1171reset:
   1172	gve_schedule_reset(priv);
   1173
   1174out:
   1175	if (tx)
   1176		tx->queue_timeout++;
   1177	priv->tx_timeo_cnt++;
   1178}
   1179
   1180static int gve_set_features(struct net_device *netdev,
   1181			    netdev_features_t features)
   1182{
   1183	const netdev_features_t orig_features = netdev->features;
   1184	struct gve_priv *priv = netdev_priv(netdev);
   1185	int err;
   1186
   1187	if ((netdev->features & NETIF_F_LRO) != (features & NETIF_F_LRO)) {
   1188		netdev->features ^= NETIF_F_LRO;
   1189		if (netif_carrier_ok(netdev)) {
   1190			/* To make this process as simple as possible we
   1191			 * teardown the device, set the new configuration,
   1192			 * and then bring the device up again.
   1193			 */
   1194			err = gve_close(netdev);
   1195			/* We have already tried to reset in close, just fail
   1196			 * at this point.
   1197			 */
   1198			if (err)
   1199				goto err;
   1200
   1201			err = gve_open(netdev);
   1202			if (err)
   1203				goto err;
   1204		}
   1205	}
   1206
   1207	return 0;
   1208err:
   1209	/* Reverts the change on error. */
   1210	netdev->features = orig_features;
   1211	netif_err(priv, drv, netdev,
   1212		  "Set features failed! !!! DISABLING ALL QUEUES !!!\n");
   1213	return err;
   1214}
   1215
   1216static const struct net_device_ops gve_netdev_ops = {
   1217	.ndo_start_xmit		=	gve_start_xmit,
   1218	.ndo_open		=	gve_open,
   1219	.ndo_stop		=	gve_close,
   1220	.ndo_get_stats64	=	gve_get_stats,
   1221	.ndo_tx_timeout         =       gve_tx_timeout,
   1222	.ndo_set_features	=	gve_set_features,
   1223};
   1224
   1225static void gve_handle_status(struct gve_priv *priv, u32 status)
   1226{
   1227	if (GVE_DEVICE_STATUS_RESET_MASK & status) {
   1228		dev_info(&priv->pdev->dev, "Device requested reset.\n");
   1229		gve_set_do_reset(priv);
   1230	}
   1231	if (GVE_DEVICE_STATUS_REPORT_STATS_MASK & status) {
   1232		priv->stats_report_trigger_cnt++;
   1233		gve_set_do_report_stats(priv);
   1234	}
   1235}
   1236
   1237static void gve_handle_reset(struct gve_priv *priv)
   1238{
   1239	/* A service task will be scheduled at the end of probe to catch any
   1240	 * resets that need to happen, and we don't want to reset until
   1241	 * probe is done.
   1242	 */
   1243	if (gve_get_probe_in_progress(priv))
   1244		return;
   1245
   1246	if (gve_get_do_reset(priv)) {
   1247		rtnl_lock();
   1248		gve_reset(priv, false);
   1249		rtnl_unlock();
   1250	}
   1251}
   1252
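        /* Fill the DMA'd stats report buffer with per-queue TX and RX
         * counters for the device to collect, and bump written_count to
         * mark a new snapshot.
         */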
   1253void gve_handle_report_stats(struct gve_priv *priv)
   1254{
   1255	struct stats *stats = priv->stats_report->stats;
   1256	int idx, stats_idx = 0;
   1257	unsigned int start = 0;
   1258	u64 tx_bytes;
   1259
   1260	if (!gve_get_report_stats(priv))
   1261		return;
   1262
   1263	be64_add_cpu(&priv->stats_report->written_count, 1);
   1264	/* tx stats */
   1265	if (priv->tx) {
   1266		for (idx = 0; idx < priv->tx_cfg.num_queues; idx++) {
   1267			u32 last_completion = 0;
   1268			u32 tx_frames = 0;
   1269
   1270			/* DQO doesn't currently support these metrics. */
   1271			if (gve_is_gqi(priv)) {
   1272				last_completion = priv->tx[idx].done;
   1273				tx_frames = priv->tx[idx].req;
   1274			}
   1275
   1276			do {
   1277				start = u64_stats_fetch_begin(&priv->tx[idx].statss);
   1278				tx_bytes = priv->tx[idx].bytes_done;
   1279			} while (u64_stats_fetch_retry(&priv->tx[idx].statss, start));
   1280			stats[stats_idx++] = (struct stats) {
   1281				.stat_name = cpu_to_be32(TX_WAKE_CNT),
   1282				.value = cpu_to_be64(priv->tx[idx].wake_queue),
   1283				.queue_id = cpu_to_be32(idx),
   1284			};
   1285			stats[stats_idx++] = (struct stats) {
   1286				.stat_name = cpu_to_be32(TX_STOP_CNT),
   1287				.value = cpu_to_be64(priv->tx[idx].stop_queue),
   1288				.queue_id = cpu_to_be32(idx),
   1289			};
   1290			stats[stats_idx++] = (struct stats) {
   1291				.stat_name = cpu_to_be32(TX_FRAMES_SENT),
   1292				.value = cpu_to_be64(tx_frames),
   1293				.queue_id = cpu_to_be32(idx),
   1294			};
   1295			stats[stats_idx++] = (struct stats) {
   1296				.stat_name = cpu_to_be32(TX_BYTES_SENT),
   1297				.value = cpu_to_be64(tx_bytes),
   1298				.queue_id = cpu_to_be32(idx),
   1299			};
   1300			stats[stats_idx++] = (struct stats) {
   1301				.stat_name = cpu_to_be32(TX_LAST_COMPLETION_PROCESSED),
   1302				.value = cpu_to_be64(last_completion),
   1303				.queue_id = cpu_to_be32(idx),
   1304			};
   1305			stats[stats_idx++] = (struct stats) {
   1306				.stat_name = cpu_to_be32(TX_TIMEOUT_CNT),
   1307				.value = cpu_to_be64(priv->tx[idx].queue_timeout),
   1308				.queue_id = cpu_to_be32(idx),
   1309			};
   1310		}
   1311	}
   1312	/* rx stats */
   1313	if (priv->rx) {
   1314		for (idx = 0; idx < priv->rx_cfg.num_queues; idx++) {
   1315			stats[stats_idx++] = (struct stats) {
   1316				.stat_name = cpu_to_be32(RX_NEXT_EXPECTED_SEQUENCE),
   1317				.value = cpu_to_be64(priv->rx[idx].desc.seqno),
   1318				.queue_id = cpu_to_be32(idx),
   1319			};
   1320			stats[stats_idx++] = (struct stats) {
   1321				.stat_name = cpu_to_be32(RX_BUFFERS_POSTED),
   1322				.value = cpu_to_be64(priv->rx[0].fill_cnt),
   1323				.queue_id = cpu_to_be32(idx),
   1324			};
   1325		}
   1326	}
   1327}
   1328
   1329static void gve_handle_link_status(struct gve_priv *priv, bool link_status)
   1330{
   1331	if (!gve_get_napi_enabled(priv))
   1332		return;
   1333
   1334	if (link_status == netif_carrier_ok(priv->dev))
   1335		return;
   1336
   1337	if (link_status) {
   1338		netdev_info(priv->dev, "Device link is up.\n");
   1339		netif_carrier_on(priv->dev);
   1340	} else {
   1341		netdev_info(priv->dev, "Device link is down.\n");
   1342		netif_carrier_off(priv->dev);
   1343	}
   1344}
   1345
   1346/* Handle NIC status register changes, reset requests and report stats */
   1347static void gve_service_task(struct work_struct *work)
   1348{
   1349	struct gve_priv *priv = container_of(work, struct gve_priv,
   1350					     service_task);
   1351	u32 status = ioread32be(&priv->reg_bar0->device_status);
   1352
   1353	gve_handle_status(priv, status);
   1354
   1355	gve_handle_reset(priv);
   1356	gve_handle_link_status(priv, GVE_DEVICE_STATUS_LINK_STATUS_MASK & status);
   1357}
   1358
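        /* Bring up the admin queue and, unless this is a reset recovery
         * (which skips re-describing the device), query the device
         * description, size the MSI-X/notification block layout and default
         * queue counts, then set up the device resources.
         */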
   1359static int gve_init_priv(struct gve_priv *priv, bool skip_describe_device)
   1360{
   1361	int num_ntfy;
   1362	int err;
   1363
   1364	/* Set up the adminq */
   1365	err = gve_adminq_alloc(&priv->pdev->dev, priv);
   1366	if (err) {
   1367		dev_err(&priv->pdev->dev,
   1368			"Failed to alloc admin queue: err=%d\n", err);
   1369		return err;
   1370	}
   1371
   1372	if (skip_describe_device)
   1373		goto setup_device;
   1374
   1375	priv->queue_format = GVE_QUEUE_FORMAT_UNSPECIFIED;
   1376	/* Get the initial information we need from the device */
   1377	err = gve_adminq_describe_device(priv);
   1378	if (err) {
   1379		dev_err(&priv->pdev->dev,
   1380			"Could not get device information: err=%d\n", err);
   1381		goto err;
   1382	}
   1383	priv->dev->mtu = priv->dev->max_mtu;
   1384	num_ntfy = pci_msix_vec_count(priv->pdev);
   1385	if (num_ntfy <= 0) {
   1386		dev_err(&priv->pdev->dev,
   1387			"could not count MSI-x vectors: err=%d\n", num_ntfy);
   1388		err = num_ntfy;
   1389		goto err;
   1390	} else if (num_ntfy < GVE_MIN_MSIX) {
   1391		dev_err(&priv->pdev->dev, "gve needs at least %d MSI-x vectors, but only has %d\n",
   1392			GVE_MIN_MSIX, num_ntfy);
   1393		err = -EINVAL;
   1394		goto err;
   1395	}
   1396
   1397	priv->num_registered_pages = 0;
   1398	priv->rx_copybreak = GVE_DEFAULT_RX_COPYBREAK;
   1399	/* gvnic has one Notification Block per MSI-x vector, except for the
   1400	 * management vector
   1401	 */
   1402	priv->num_ntfy_blks = (num_ntfy - 1) & ~0x1;
   1403	priv->mgmt_msix_idx = priv->num_ntfy_blks;
   1404
   1405	priv->tx_cfg.max_queues =
   1406		min_t(int, priv->tx_cfg.max_queues, priv->num_ntfy_blks / 2);
   1407	priv->rx_cfg.max_queues =
   1408		min_t(int, priv->rx_cfg.max_queues, priv->num_ntfy_blks / 2);
   1409
   1410	priv->tx_cfg.num_queues = priv->tx_cfg.max_queues;
   1411	priv->rx_cfg.num_queues = priv->rx_cfg.max_queues;
   1412	if (priv->default_num_queues > 0) {
   1413		priv->tx_cfg.num_queues = min_t(int, priv->default_num_queues,
   1414						priv->tx_cfg.num_queues);
   1415		priv->rx_cfg.num_queues = min_t(int, priv->default_num_queues,
   1416						priv->rx_cfg.num_queues);
   1417	}
   1418
   1419	dev_info(&priv->pdev->dev, "TX queues %d, RX queues %d\n",
   1420		 priv->tx_cfg.num_queues, priv->rx_cfg.num_queues);
   1421	dev_info(&priv->pdev->dev, "Max TX queues %d, Max RX queues %d\n",
   1422		 priv->tx_cfg.max_queues, priv->rx_cfg.max_queues);
   1423
   1424	if (!gve_is_gqi(priv)) {
   1425		priv->tx_coalesce_usecs = GVE_TX_IRQ_RATELIMIT_US_DQO;
   1426		priv->rx_coalesce_usecs = GVE_RX_IRQ_RATELIMIT_US_DQO;
   1427	}
   1428
   1429setup_device:
   1430	err = gve_setup_device_resources(priv);
   1431	if (!err)
   1432		return 0;
   1433err:
   1434	gve_adminq_free(&priv->pdev->dev, priv);
   1435	return err;
   1436}
   1437
   1438static void gve_teardown_priv_resources(struct gve_priv *priv)
   1439{
   1440	gve_teardown_device_resources(priv);
   1441	gve_adminq_free(&priv->pdev->dev, priv);
   1442}
   1443
   1444static void gve_trigger_reset(struct gve_priv *priv)
   1445{
   1446	/* Reset the device by releasing the AQ */
   1447	gve_adminq_release(priv);
   1448}
   1449
   1450static void gve_reset_and_teardown(struct gve_priv *priv, bool was_up)
   1451{
   1452	gve_trigger_reset(priv);
   1453	/* With the reset having already happened, close cannot fail */
   1454	if (was_up)
   1455		gve_close(priv->dev);
   1456	gve_teardown_priv_resources(priv);
   1457}
   1458
   1459static int gve_reset_recovery(struct gve_priv *priv, bool was_up)
   1460{
   1461	int err;
   1462
   1463	err = gve_init_priv(priv, true);
   1464	if (err)
   1465		goto err;
   1466	if (was_up) {
   1467		err = gve_open(priv->dev);
   1468		if (err)
   1469			goto err;
   1470	}
   1471	return 0;
   1472err:
   1473	dev_err(&priv->pdev->dev, "Reset failed! !!! DISABLING ALL QUEUES !!!\n");
   1474	gve_turndown(priv);
   1475	return err;
   1476}
   1477
   1478int gve_reset(struct gve_priv *priv, bool attempt_teardown)
   1479{
   1480	bool was_up = netif_carrier_ok(priv->dev);
   1481	int err;
   1482
   1483	dev_info(&priv->pdev->dev, "Performing reset\n");
   1484	gve_clear_do_reset(priv);
   1485	gve_set_reset_in_progress(priv);
   1486	/* If we aren't attempting to teardown normally, just go turndown and
   1487	 * reset right away.
   1488	 */
   1489	if (!attempt_teardown) {
   1490		gve_turndown(priv);
   1491		gve_reset_and_teardown(priv, was_up);
   1492	} else {
   1493		/* Otherwise attempt to close normally */
   1494		if (was_up) {
   1495			err = gve_close(priv->dev);
   1496			/* If that fails reset as we did above */
   1497			if (err)
   1498				gve_reset_and_teardown(priv, was_up);
   1499		}
   1500		/* Clean up any remaining resources */
   1501		gve_teardown_priv_resources(priv);
   1502	}
   1503
   1504	/* Set it all back up */
   1505	err = gve_reset_recovery(priv, was_up);
   1506	gve_clear_reset_in_progress(priv);
   1507	priv->reset_cnt++;
   1508	priv->interface_up_cnt = 0;
   1509	priv->interface_down_cnt = 0;
   1510	priv->stats_report_trigger_cnt = 0;
   1511	return err;
   1512}
   1513
   1514static void gve_write_version(u8 __iomem *driver_version_register)
   1515{
   1516	const char *c = gve_version_prefix;
   1517
   1518	while (*c) {
   1519		writeb(*c, driver_version_register);
   1520		c++;
   1521	}
   1522
   1523	c = gve_version_str;
   1524	while (*c) {
   1525		writeb(*c, driver_version_register);
   1526		c++;
   1527	}
   1528	writeb('\n', driver_version_register);
   1529}
   1530
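        /* PCI probe: enable the device, map the register and doorbell BARs,
         * allocate the netdev with the advertised queue maxima, set the
         * default feature flags, initialize the private state and register
         * the netdev.
         */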
   1531static int gve_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
   1532{
   1533	int max_tx_queues, max_rx_queues;
   1534	struct net_device *dev;
   1535	__be32 __iomem *db_bar;
   1536	struct gve_registers __iomem *reg_bar;
   1537	struct gve_priv *priv;
   1538	int err;
   1539
   1540	err = pci_enable_device(pdev);
   1541	if (err)
   1542		return err;
   1543
   1544	err = pci_request_regions(pdev, "gvnic-cfg");
   1545	if (err)
   1546		goto abort_with_enabled;
   1547
   1548	pci_set_master(pdev);
   1549
   1550	err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
   1551	if (err) {
   1552		dev_err(&pdev->dev, "Failed to set dma mask: err=%d\n", err);
   1553		goto abort_with_pci_region;
   1554	}
   1555
   1556	reg_bar = pci_iomap(pdev, GVE_REGISTER_BAR, 0);
   1557	if (!reg_bar) {
   1558		dev_err(&pdev->dev, "Failed to map pci bar!\n");
   1559		err = -ENOMEM;
   1560		goto abort_with_pci_region;
   1561	}
   1562
   1563	db_bar = pci_iomap(pdev, GVE_DOORBELL_BAR, 0);
   1564	if (!db_bar) {
   1565		dev_err(&pdev->dev, "Failed to map doorbell bar!\n");
   1566		err = -ENOMEM;
   1567		goto abort_with_reg_bar;
   1568	}
   1569
   1570	gve_write_version(&reg_bar->driver_version);
   1571	/* Get max queues to alloc etherdev */
   1572	max_tx_queues = ioread32be(&reg_bar->max_tx_queues);
   1573	max_rx_queues = ioread32be(&reg_bar->max_rx_queues);
   1574	/* Alloc and setup the netdev and priv */
   1575	dev = alloc_etherdev_mqs(sizeof(*priv), max_tx_queues, max_rx_queues);
   1576	if (!dev) {
   1577		dev_err(&pdev->dev, "could not allocate netdev\n");
   1578		err = -ENOMEM;
   1579		goto abort_with_db_bar;
   1580	}
   1581	SET_NETDEV_DEV(dev, &pdev->dev);
   1582	pci_set_drvdata(pdev, dev);
   1583	dev->ethtool_ops = &gve_ethtool_ops;
   1584	dev->netdev_ops = &gve_netdev_ops;
   1585
   1586	/* Set default and supported features.
   1587	 *
   1588	 * Features might be set in other locations as well (such as
   1589	 * `gve_adminq_describe_device`).
   1590	 */
   1591	dev->hw_features = NETIF_F_HIGHDMA;
   1592	dev->hw_features |= NETIF_F_SG;
   1593	dev->hw_features |= NETIF_F_HW_CSUM;
   1594	dev->hw_features |= NETIF_F_TSO;
   1595	dev->hw_features |= NETIF_F_TSO6;
   1596	dev->hw_features |= NETIF_F_TSO_ECN;
   1597	dev->hw_features |= NETIF_F_RXCSUM;
   1598	dev->hw_features |= NETIF_F_RXHASH;
   1599	dev->features = dev->hw_features;
   1600	dev->watchdog_timeo = 5 * HZ;
   1601	dev->min_mtu = ETH_MIN_MTU;
   1602	netif_carrier_off(dev);
   1603
   1604	priv = netdev_priv(dev);
   1605	priv->dev = dev;
   1606	priv->pdev = pdev;
   1607	priv->msg_enable = DEFAULT_MSG_LEVEL;
   1608	priv->reg_bar0 = reg_bar;
   1609	priv->db_bar2 = db_bar;
   1610	priv->service_task_flags = 0x0;
   1611	priv->state_flags = 0x0;
   1612	priv->ethtool_flags = 0x0;
   1613
   1614	gve_set_probe_in_progress(priv);
   1615	priv->gve_wq = alloc_ordered_workqueue("gve", 0);
   1616	if (!priv->gve_wq) {
   1617		dev_err(&pdev->dev, "Could not allocate workqueue");
   1618		err = -ENOMEM;
   1619		goto abort_with_netdev;
   1620	}
   1621	INIT_WORK(&priv->service_task, gve_service_task);
   1622	INIT_WORK(&priv->stats_report_task, gve_stats_report_task);
   1623	priv->tx_cfg.max_queues = max_tx_queues;
   1624	priv->rx_cfg.max_queues = max_rx_queues;
   1625
   1626	err = gve_init_priv(priv, false);
   1627	if (err)
   1628		goto abort_with_wq;
   1629
   1630	err = register_netdev(dev);
   1631	if (err)
   1632		goto abort_with_gve_init;
   1633
   1634	dev_info(&pdev->dev, "GVE version %s\n", gve_version_str);
   1635	dev_info(&pdev->dev, "GVE queue format %d\n", (int)priv->queue_format);
   1636	gve_clear_probe_in_progress(priv);
   1637	queue_work(priv->gve_wq, &priv->service_task);
   1638	return 0;
   1639
   1640abort_with_gve_init:
   1641	gve_teardown_priv_resources(priv);
   1642
   1643abort_with_wq:
   1644	destroy_workqueue(priv->gve_wq);
   1645
   1646abort_with_netdev:
   1647	free_netdev(dev);
   1648
   1649abort_with_db_bar:
   1650	pci_iounmap(pdev, db_bar);
   1651
   1652abort_with_reg_bar:
   1653	pci_iounmap(pdev, reg_bar);
   1654
   1655abort_with_pci_region:
   1656	pci_release_regions(pdev);
   1657
   1658abort_with_enabled:
   1659	pci_disable_device(pdev);
   1660	return err;
   1661}
   1662
   1663static void gve_remove(struct pci_dev *pdev)
   1664{
   1665	struct net_device *netdev = pci_get_drvdata(pdev);
   1666	struct gve_priv *priv = netdev_priv(netdev);
   1667	__be32 __iomem *db_bar = priv->db_bar2;
   1668	void __iomem *reg_bar = priv->reg_bar0;
   1669
   1670	unregister_netdev(netdev);
   1671	gve_teardown_priv_resources(priv);
   1672	destroy_workqueue(priv->gve_wq);
   1673	free_netdev(netdev);
   1674	pci_iounmap(pdev, db_bar);
   1675	pci_iounmap(pdev, reg_bar);
   1676	pci_release_regions(pdev);
   1677	pci_disable_device(pdev);
   1678}
   1679
   1680static void gve_shutdown(struct pci_dev *pdev)
   1681{
   1682	struct net_device *netdev = pci_get_drvdata(pdev);
   1683	struct gve_priv *priv = netdev_priv(netdev);
   1684	bool was_up = netif_carrier_ok(priv->dev);
   1685
   1686	rtnl_lock();
   1687	if (was_up && gve_close(priv->dev)) {
   1688		/* If the dev was up, attempt to close, if close fails, reset */
   1689		gve_reset_and_teardown(priv, was_up);
   1690	} else {
   1691		/* If the dev wasn't up or close worked, finish tearing down */
   1692		gve_teardown_priv_resources(priv);
   1693	}
   1694	rtnl_unlock();
   1695}
   1696
   1697#ifdef CONFIG_PM
   1698static int gve_suspend(struct pci_dev *pdev, pm_message_t state)
   1699{
   1700	struct net_device *netdev = pci_get_drvdata(pdev);
   1701	struct gve_priv *priv = netdev_priv(netdev);
   1702	bool was_up = netif_carrier_ok(priv->dev);
   1703
   1704	priv->suspend_cnt++;
   1705	rtnl_lock();
   1706	if (was_up && gve_close(priv->dev)) {
   1707		/* If the dev was up, attempt to close, if close fails, reset */
   1708		gve_reset_and_teardown(priv, was_up);
   1709	} else {
   1710		/* If the dev wasn't up or close worked, finish tearing down */
   1711		gve_teardown_priv_resources(priv);
   1712	}
   1713	priv->up_before_suspend = was_up;
   1714	rtnl_unlock();
   1715	return 0;
   1716}
   1717
   1718static int gve_resume(struct pci_dev *pdev)
   1719{
   1720	struct net_device *netdev = pci_get_drvdata(pdev);
   1721	struct gve_priv *priv = netdev_priv(netdev);
   1722	int err;
   1723
   1724	priv->resume_cnt++;
   1725	rtnl_lock();
   1726	err = gve_reset_recovery(priv, priv->up_before_suspend);
   1727	rtnl_unlock();
   1728	return err;
   1729}
   1730#endif /* CONFIG_PM */
   1731
   1732static const struct pci_device_id gve_id_table[] = {
   1733	{ PCI_DEVICE(PCI_VENDOR_ID_GOOGLE, PCI_DEV_ID_GVNIC) },
   1734	{ }
   1735};
   1736
   1737static struct pci_driver gvnic_driver = {
   1738	.name		= "gvnic",
   1739	.id_table	= gve_id_table,
   1740	.probe		= gve_probe,
   1741	.remove		= gve_remove,
   1742	.shutdown	= gve_shutdown,
   1743#ifdef CONFIG_PM
   1744	.suspend        = gve_suspend,
   1745	.resume         = gve_resume,
   1746#endif
   1747};
   1748
   1749module_pci_driver(gvnic_driver);
   1750
   1751MODULE_DEVICE_TABLE(pci, gve_id_table);
   1752MODULE_AUTHOR("Google, Inc.");
   1753MODULE_DESCRIPTION("gVNIC Driver");
   1754MODULE_LICENSE("Dual MIT/GPL");
   1755MODULE_VERSION(GVE_VERSION);