cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

lag.c (37016B)


      1/*
      2 * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
      3 *
      4 * This software is available to you under a choice of one of two
      5 * licenses.  You may choose to be licensed under the terms of the GNU
      6 * General Public License (GPL) Version 2, available from the file
      7 * COPYING in the main directory of this source tree, or the
      8 * OpenIB.org BSD license below:
      9 *
     10 *     Redistribution and use in source and binary forms, with or
     11 *     without modification, are permitted provided that the following
     12 *     conditions are met:
     13 *
     14 *      - Redistributions of source code must retain the above
     15 *        copyright notice, this list of conditions and the following
     16 *        disclaimer.
     17 *
     18 *      - Redistributions in binary form must reproduce the above
     19 *        copyright notice, this list of conditions and the following
     20 *        disclaimer in the documentation and/or other materials
     21 *        provided with the distribution.
     22 *
     23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
     24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
     25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
     26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
     27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
     28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
     29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
     30 * SOFTWARE.
     31 */
     32
     33#include <linux/netdevice.h>
     34#include <net/bonding.h>
     35#include <linux/mlx5/driver.h>
     36#include <linux/mlx5/eswitch.h>
     37#include <linux/mlx5/vport.h>
     38#include "lib/devcom.h"
     39#include "mlx5_core.h"
     40#include "eswitch.h"
     41#include "esw/acl/ofld.h"
     42#include "lag.h"
     43#include "mp.h"
     44#include "mpesw.h"
     45
     46enum {
     47	MLX5_LAG_EGRESS_PORT_1 = 1,
     48	MLX5_LAG_EGRESS_PORT_2,
     49};
     50
     51/* General purpose, use for short periods of time.
     52 * Beware of lock dependencies (preferably, no locks should be acquired
     53 * under it).
     54 */
     55static DEFINE_SPINLOCK(lag_lock);
     56
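/* Port selection mode, roughly: the hash-based flag selects the port
 * selection flow table (FT) mode, MPESW selects its own mode, and
 * everything else falls back to queue affinity.
 */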
     57static int get_port_sel_mode(enum mlx5_lag_mode mode, unsigned long flags)
     58{
     59	if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags))
     60		return MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_FT;
     61
     62	if (mode == MLX5_LAG_MODE_MPESW)
     63		return MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_MPESW;
     64
     65	return MLX5_LAG_PORT_SELECT_MODE_QUEUE_AFFINITY;
     66}
     67
     68static int mlx5_cmd_create_lag(struct mlx5_core_dev *dev, u8 *ports, int mode,
     69			       unsigned long flags)
     70{
     71	bool shared_fdb = test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &flags);
     72	int port_sel_mode = get_port_sel_mode(mode, flags);
     73	u32 in[MLX5_ST_SZ_DW(create_lag_in)] = {};
     74	void *lag_ctx;
     75
     76	lag_ctx = MLX5_ADDR_OF(create_lag_in, in, ctx);
     77	MLX5_SET(create_lag_in, in, opcode, MLX5_CMD_OP_CREATE_LAG);
     78	MLX5_SET(lagc, lag_ctx, fdb_selection_mode, shared_fdb);
     79	if (port_sel_mode == MLX5_LAG_PORT_SELECT_MODE_QUEUE_AFFINITY) {
     80		MLX5_SET(lagc, lag_ctx, tx_remap_affinity_1, ports[0]);
     81		MLX5_SET(lagc, lag_ctx, tx_remap_affinity_2, ports[1]);
     82	}
     83	MLX5_SET(lagc, lag_ctx, port_select_mode, port_sel_mode);
     84
     85	return mlx5_cmd_exec_in(dev, create_lag, in);
     86}
     87
     88static int mlx5_cmd_modify_lag(struct mlx5_core_dev *dev, u8 num_ports,
     89			       u8 *ports)
     90{
     91	u32 in[MLX5_ST_SZ_DW(modify_lag_in)] = {};
     92	void *lag_ctx = MLX5_ADDR_OF(modify_lag_in, in, ctx);
     93
     94	MLX5_SET(modify_lag_in, in, opcode, MLX5_CMD_OP_MODIFY_LAG);
     95	MLX5_SET(modify_lag_in, in, field_select, 0x1);
     96
     97	MLX5_SET(lagc, lag_ctx, tx_remap_affinity_1, ports[0]);
     98	MLX5_SET(lagc, lag_ctx, tx_remap_affinity_2, ports[1]);
     99
    100	return mlx5_cmd_exec_in(dev, modify_lag, in);
    101}
    102
    103int mlx5_cmd_create_vport_lag(struct mlx5_core_dev *dev)
    104{
    105	u32 in[MLX5_ST_SZ_DW(create_vport_lag_in)] = {};
    106
    107	MLX5_SET(create_vport_lag_in, in, opcode, MLX5_CMD_OP_CREATE_VPORT_LAG);
    108
    109	return mlx5_cmd_exec_in(dev, create_vport_lag, in);
    110}
    111EXPORT_SYMBOL(mlx5_cmd_create_vport_lag);
    112
    113int mlx5_cmd_destroy_vport_lag(struct mlx5_core_dev *dev)
    114{
    115	u32 in[MLX5_ST_SZ_DW(destroy_vport_lag_in)] = {};
    116
    117	MLX5_SET(destroy_vport_lag_in, in, opcode, MLX5_CMD_OP_DESTROY_VPORT_LAG);
    118
    119	return mlx5_cmd_exec_in(dev, destroy_vport_lag, in);
    120}
    121EXPORT_SYMBOL(mlx5_cmd_destroy_vport_lag);
    122
    123static void mlx5_infer_tx_disabled(struct lag_tracker *tracker, u8 num_ports,
    124				   u8 *ports, int *num_disabled)
    125{
    126	int i;
    127
    128	*num_disabled = 0;
    129	for (i = 0; i < num_ports; i++) {
    130		if (!tracker->netdev_state[i].tx_enabled ||
    131		    !tracker->netdev_state[i].link_up)
    132			ports[(*num_disabled)++] = i;
    133	}
    134}
    135
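/* Collect the indices of ports that are both tx-enabled and link-up.
 * If no port qualifies, fall back to the disabled set so callers still
 * get a non-empty list of ports to map to.
 */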
    136void mlx5_infer_tx_enabled(struct lag_tracker *tracker, u8 num_ports,
    137			   u8 *ports, int *num_enabled)
    138{
    139	int i;
    140
    141	*num_enabled = 0;
    142	for (i = 0; i < num_ports; i++) {
    143		if (tracker->netdev_state[i].tx_enabled &&
    144		    tracker->netdev_state[i].link_up)
    145			ports[(*num_enabled)++] = i;
    146	}
    147
    148	if (*num_enabled == 0)
    149		mlx5_infer_tx_disabled(tracker, num_ports, ports, num_enabled);
    150}
    151
    152static void mlx5_lag_print_mapping(struct mlx5_core_dev *dev,
    153				   struct mlx5_lag *ldev,
    154				   struct lag_tracker *tracker,
    155				   unsigned long flags)
    156{
    157	char buf[MLX5_MAX_PORTS * 10 + 1] = {};
    158	u8 enabled_ports[MLX5_MAX_PORTS] = {};
    159	int written = 0;
    160	int num_enabled;
    161	int idx;
    162	int err;
    163	int i;
    164	int j;
    165
    166	if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags)) {
    167		mlx5_infer_tx_enabled(tracker, ldev->ports, enabled_ports,
    168				      &num_enabled);
    169		for (i = 0; i < num_enabled; i++) {
    170			err = scnprintf(buf + written, 4, "%d, ", enabled_ports[i] + 1);
    171			if (err != 3)
    172				return;
    173			written += err;
    174		}
    175		buf[written - 2] = 0;
    176		mlx5_core_info(dev, "lag map active ports: %s\n", buf);
    177	} else {
    178		for (i = 0; i < ldev->ports; i++) {
    179			for (j  = 0; j < ldev->buckets; j++) {
    180				idx = i * ldev->buckets + j;
    181				err = scnprintf(buf + written, 10,
    182						" port %d:%d", i + 1, ldev->v2p_map[idx]);
    183				if (err != 9)
    184					return;
    185				written += err;
    186			}
    187		}
    188		mlx5_core_info(dev, "lag map:%s\n", buf);
    189	}
    190}
    191
    192static int mlx5_lag_netdev_event(struct notifier_block *this,
    193				 unsigned long event, void *ptr);
    194static void mlx5_do_bond_work(struct work_struct *work);
    195
    196static void mlx5_ldev_free(struct kref *ref)
    197{
    198	struct mlx5_lag *ldev = container_of(ref, struct mlx5_lag, ref);
    199
    200	if (ldev->nb.notifier_call)
    201		unregister_netdevice_notifier_net(&init_net, &ldev->nb);
    202	mlx5_lag_mp_cleanup(ldev);
    203	mlx5_lag_mpesw_cleanup(ldev);
    204	cancel_work_sync(&ldev->mpesw_work);
    205	destroy_workqueue(ldev->wq);
    206	mutex_destroy(&ldev->lock);
    207	kfree(ldev);
    208}
    209
    210static void mlx5_ldev_put(struct mlx5_lag *ldev)
    211{
    212	kref_put(&ldev->ref, mlx5_ldev_free);
    213}
    214
    215static void mlx5_ldev_get(struct mlx5_lag *ldev)
    216{
    217	kref_get(&ldev->ref);
    218}
    219
    220static struct mlx5_lag *mlx5_lag_dev_alloc(struct mlx5_core_dev *dev)
    221{
    222	struct mlx5_lag *ldev;
    223	int err;
    224
    225	ldev = kzalloc(sizeof(*ldev), GFP_KERNEL);
    226	if (!ldev)
    227		return NULL;
    228
    229	ldev->wq = create_singlethread_workqueue("mlx5_lag");
    230	if (!ldev->wq) {
    231		kfree(ldev);
    232		return NULL;
    233	}
    234
    235	kref_init(&ldev->ref);
    236	mutex_init(&ldev->lock);
    237	INIT_DELAYED_WORK(&ldev->bond_work, mlx5_do_bond_work);
    238
    239	ldev->nb.notifier_call = mlx5_lag_netdev_event;
    240	if (register_netdevice_notifier_net(&init_net, &ldev->nb)) {
    241		ldev->nb.notifier_call = NULL;
    242		mlx5_core_err(dev, "Failed to register LAG netdev notifier\n");
    243	}
    244	ldev->mode = MLX5_LAG_MODE_NONE;
    245
    246	err = mlx5_lag_mp_init(ldev);
    247	if (err)
    248		mlx5_core_err(dev, "Failed to init multipath lag err=%d\n",
    249			      err);
    250
    251	mlx5_lag_mpesw_init(ldev);
    252	ldev->ports = MLX5_CAP_GEN(dev, num_lag_ports);
    253	ldev->buckets = 1;
    254
    255	return ldev;
    256}
    257
    258int mlx5_lag_dev_get_netdev_idx(struct mlx5_lag *ldev,
    259				struct net_device *ndev)
    260{
    261	int i;
    262
    263	for (i = 0; i < ldev->ports; i++)
    264		if (ldev->pf[i].netdev == ndev)
    265			return i;
    266
    267	return -ENOENT;
    268}
    269
    270static bool __mlx5_lag_is_roce(struct mlx5_lag *ldev)
    271{
    272	return ldev->mode == MLX5_LAG_MODE_ROCE;
    273}
    274
    275static bool __mlx5_lag_is_sriov(struct mlx5_lag *ldev)
    276{
    277	return ldev->mode == MLX5_LAG_MODE_SRIOV;
    278}
    279
    280/* Create a mapping between steering slots and active ports.
    281 * As we have ldev->buckets slots per port, first assume the native
    282 * mapping should be used.
    283 * If there are ports that are disabled, fill the relevant slots
    284 * with a mapping that points to active ports.
    285 */
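/* For example: with num_ports = 2 and buckets = 2 the native mapping
 * below is { 1, 1, 2, 2 }. If port 2 is down (or has tx disabled) while
 * port 1 is active, both of port 2's buckets are remapped to the only
 * enabled port, giving { 1, 1, 1, 1 }.
 */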
    286static void mlx5_infer_tx_affinity_mapping(struct lag_tracker *tracker,
    287					   u8 num_ports,
    288					   u8 buckets,
    289					   u8 *ports)
    290{
    291	int disabled[MLX5_MAX_PORTS] = {};
    292	int enabled[MLX5_MAX_PORTS] = {};
    293	int disabled_ports_num = 0;
    294	int enabled_ports_num = 0;
    295	int idx;
    296	u32 rand;
    297	int i;
    298	int j;
    299
    300	for (i = 0; i < num_ports; i++) {
    301		if (tracker->netdev_state[i].tx_enabled &&
    302		    tracker->netdev_state[i].link_up)
    303			enabled[enabled_ports_num++] = i;
    304		else
    305			disabled[disabled_ports_num++] = i;
    306	}
    307
    308	/* Use native mapping by default where each port's buckets
    309	 * point to the native port: 1 1 1 .. 1 2 2 2 ... 2 3 3 3 ... 3 etc
    310	 */
    311	for (i = 0; i < num_ports; i++)
    312		for (j = 0; j < buckets; j++) {
    313			idx = i * buckets + j;
    314			ports[idx] = MLX5_LAG_EGRESS_PORT_1 + i;
    315		}
    316
    317	/* If all ports are disabled/enabled keep native mapping */
    318	if (enabled_ports_num == num_ports ||
    319	    disabled_ports_num == num_ports)
    320		return;
    321
    322	/* Go over the disabled ports and for each assign a random active port */
    323	for (i = 0; i < disabled_ports_num; i++) {
    324		for (j = 0; j < buckets; j++) {
    325			get_random_bytes(&rand, 4);
    326			ports[disabled[i] * buckets + j] = enabled[rand % enabled_ports_num] + 1;
    327		}
    328	}
    329}
    330
    331static bool mlx5_lag_has_drop_rule(struct mlx5_lag *ldev)
    332{
    333	int i;
    334
    335	for (i = 0; i < ldev->ports; i++)
    336		if (ldev->pf[i].has_drop)
    337			return true;
    338	return false;
    339}
    340
    341static void mlx5_lag_drop_rule_cleanup(struct mlx5_lag *ldev)
    342{
    343	int i;
    344
    345	for (i = 0; i < ldev->ports; i++) {
    346		if (!ldev->pf[i].has_drop)
    347			continue;
    348
    349		mlx5_esw_acl_ingress_vport_drop_rule_destroy(ldev->pf[i].dev->priv.eswitch,
    350							     MLX5_VPORT_UPLINK);
    351		ldev->pf[i].has_drop = false;
    352	}
    353}
    354
    355static void mlx5_lag_drop_rule_setup(struct mlx5_lag *ldev,
    356				     struct lag_tracker *tracker)
    357{
    358	u8 disabled_ports[MLX5_MAX_PORTS] = {};
    359	struct mlx5_core_dev *dev;
    360	int disabled_index;
    361	int num_disabled;
    362	int err;
    363	int i;
    364
    365	/* First delete the current drop rule so there won't be any dropped
    366	 * packets
    367	 */
    368	mlx5_lag_drop_rule_cleanup(ldev);
    369
    370	if (!ldev->tracker.has_inactive)
    371		return;
    372
    373	mlx5_infer_tx_disabled(tracker, ldev->ports, disabled_ports, &num_disabled);
    374
    375	for (i = 0; i < num_disabled; i++) {
    376		disabled_index = disabled_ports[i];
    377		dev = ldev->pf[disabled_index].dev;
    378		err = mlx5_esw_acl_ingress_vport_drop_rule_create(dev->priv.eswitch,
    379								  MLX5_VPORT_UPLINK);
    380		if (!err)
    381			ldev->pf[disabled_index].has_drop = true;
    382		else
    383			mlx5_core_err(dev,
    384				      "Failed to create lag drop rule, error: %d", err);
    385	}
    386}
    387
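/* Apply a new port map: in hash (port selection FT) mode the selection
 * flow table is updated, otherwise a MODIFY_LAG command reprograms the
 * queue-affinity mapping in firmware.
 */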
    388static int _mlx5_modify_lag(struct mlx5_lag *ldev, u8 *ports)
    389{
    390	struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
    391
    392	if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &ldev->mode_flags))
    393		return mlx5_lag_port_sel_modify(ldev, ports);
    394	return mlx5_cmd_modify_lag(dev0, ldev->ports, ports);
    395}
    396
    397void mlx5_modify_lag(struct mlx5_lag *ldev,
    398		     struct lag_tracker *tracker)
    399{
    400	u8 ports[MLX5_MAX_PORTS * MLX5_LAG_MAX_HASH_BUCKETS] = {};
    401	struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
    402	int idx;
    403	int err;
    404	int i;
    405	int j;
    406
    407	mlx5_infer_tx_affinity_mapping(tracker, ldev->ports, ldev->buckets, ports);
    408
    409	for (i = 0; i < ldev->ports; i++) {
    410		for (j = 0; j < ldev->buckets; j++) {
    411			idx = i * ldev->buckets + j;
    412			if (ports[idx] == ldev->v2p_map[idx])
    413				continue;
    414			err = _mlx5_modify_lag(ldev, ports);
    415			if (err) {
    416				mlx5_core_err(dev0,
    417					      "Failed to modify LAG (%d)\n",
    418					      err);
    419				return;
    420			}
    421			memcpy(ldev->v2p_map, ports, sizeof(ports));
    422
    423			mlx5_lag_print_mapping(dev0, ldev, tracker,
    424					       ldev->mode_flags);
    425			break;
    426		}
    427	}
    428
    429	if (tracker->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP &&
    430	    !(ldev->mode == MLX5_LAG_MODE_ROCE))
    431		mlx5_lag_drop_rule_setup(ldev, tracker);
    432}
    433
    434#define MLX5_LAG_ROCE_HASH_PORTS_SUPPORTED 4
    435static int mlx5_lag_set_port_sel_mode_roce(struct mlx5_lag *ldev,
    436					   unsigned long *flags)
    437{
    438	struct lag_func *dev0 = &ldev->pf[MLX5_LAG_P1];
    439
    440	if (ldev->ports == MLX5_LAG_ROCE_HASH_PORTS_SUPPORTED) {
    441		/* Four ports are supported only in hash mode */
    442		if (!MLX5_CAP_PORT_SELECTION(dev0->dev, port_select_flow_table))
    443			return -EINVAL;
    444		set_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, flags);
    445		if (ldev->ports > 2)
    446			ldev->buckets = MLX5_LAG_MAX_HASH_BUCKETS;
    447	}
    448
    449	return 0;
    450}
    451
    452static void mlx5_lag_set_port_sel_mode_offloads(struct mlx5_lag *ldev,
    453						struct lag_tracker *tracker,
    454						enum mlx5_lag_mode mode,
    455						unsigned long *flags)
    456{
    457	struct lag_func *dev0 = &ldev->pf[MLX5_LAG_P1];
    458
    459	if (mode == MLX5_LAG_MODE_MPESW)
    460		return;
    461
    462	if (MLX5_CAP_PORT_SELECTION(dev0->dev, port_select_flow_table) &&
    463	    tracker->tx_type == NETDEV_LAG_TX_TYPE_HASH)
    464		set_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, flags);
    465}
    466
    467static int mlx5_lag_set_flags(struct mlx5_lag *ldev, enum mlx5_lag_mode mode,
    468			      struct lag_tracker *tracker, bool shared_fdb,
    469			      unsigned long *flags)
    470{
    471	bool roce_lag = mode == MLX5_LAG_MODE_ROCE;
    472
    473	*flags = 0;
    474	if (shared_fdb)
    475		set_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, flags);
    476
    477	if (roce_lag)
    478		return mlx5_lag_set_port_sel_mode_roce(ldev, flags);
    479
    480	mlx5_lag_set_port_sel_mode_offloads(ldev, tracker, mode, flags);
    481	return 0;
    482}
    483
    484char *mlx5_get_str_port_sel_mode(struct mlx5_lag *ldev)
    485{
    486	int port_sel_mode = get_port_sel_mode(ldev->mode, ldev->mode_flags);
    487
    488	switch (port_sel_mode) {
    489	case MLX5_LAG_PORT_SELECT_MODE_QUEUE_AFFINITY: return "queue_affinity";
    490	case MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_FT: return "hash";
    491	case MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_MPESW: return "mpesw";
    492	default: return "invalid";
    493	}
    494}
    495
    496static int mlx5_create_lag(struct mlx5_lag *ldev,
    497			   struct lag_tracker *tracker,
    498			   enum mlx5_lag_mode mode,
    499			   unsigned long flags)
    500{
    501	bool shared_fdb = test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &flags);
    502	struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
    503	struct mlx5_core_dev *dev1 = ldev->pf[MLX5_LAG_P2].dev;
    504	u32 in[MLX5_ST_SZ_DW(destroy_lag_in)] = {};
    505	int err;
    506
    507	if (tracker)
    508		mlx5_lag_print_mapping(dev0, ldev, tracker, flags);
    509	mlx5_core_info(dev0, "shared_fdb:%d mode:%s\n",
    510		       shared_fdb, mlx5_get_str_port_sel_mode(ldev));
    511
    512	err = mlx5_cmd_create_lag(dev0, ldev->v2p_map, mode, flags);
    513	if (err) {
    514		mlx5_core_err(dev0,
    515			      "Failed to create LAG (%d)\n",
    516			      err);
    517		return err;
    518	}
    519
    520	if (shared_fdb) {
    521		err = mlx5_eswitch_offloads_config_single_fdb(dev0->priv.eswitch,
    522							      dev1->priv.eswitch);
    523		if (err)
    524			mlx5_core_err(dev0, "Can't enable single FDB mode\n");
    525		else
    526			mlx5_core_info(dev0, "Operation mode is single FDB\n");
    527	}
    528
    529	if (err) {
    530		MLX5_SET(destroy_lag_in, in, opcode, MLX5_CMD_OP_DESTROY_LAG);
    531		if (mlx5_cmd_exec_in(dev0, destroy_lag, in))
    532			mlx5_core_err(dev0,
    533				      "Failed to deactivate RoCE LAG; driver restart required\n");
    534	}
    535
    536	return err;
    537}
    538
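/* Activation, roughly: derive the mode flags, infer the port map and
 * (for hash mode) create the port selection flow table, then issue
 * CREATE_LAG. For active-backup bonds that are not RoCE LAG, ingress
 * drop rules are installed on the inactive ports afterwards.
 */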
    539int mlx5_activate_lag(struct mlx5_lag *ldev,
    540		      struct lag_tracker *tracker,
    541		      enum mlx5_lag_mode mode,
    542		      bool shared_fdb)
    543{
    544	bool roce_lag = mode == MLX5_LAG_MODE_ROCE;
    545	struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
    546	unsigned long flags = 0;
    547	int err;
    548
    549	err = mlx5_lag_set_flags(ldev, mode, tracker, shared_fdb, &flags);
    550	if (err)
    551		return err;
    552
    553	if (mode != MLX5_LAG_MODE_MPESW) {
    554		mlx5_infer_tx_affinity_mapping(tracker, ldev->ports, ldev->buckets, ldev->v2p_map);
    555		if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags)) {
    556			err = mlx5_lag_port_sel_create(ldev, tracker->hash_type,
    557						       ldev->v2p_map);
    558			if (err) {
    559				mlx5_core_err(dev0,
    560					      "Failed to create LAG port selection(%d)\n",
    561					      err);
    562				return err;
    563			}
    564		}
    565	}
    566
    567	err = mlx5_create_lag(ldev, tracker, mode, flags);
    568	if (err) {
    569		if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags))
    570			mlx5_lag_port_sel_destroy(ldev);
    571		if (roce_lag)
    572			mlx5_core_err(dev0,
    573				      "Failed to activate RoCE LAG\n");
    574		else
    575			mlx5_core_err(dev0,
    576				      "Failed to activate VF LAG\n"
    577				      "Make sure all VFs are unbound prior to VF LAG activation or deactivation\n");
    578		return err;
    579	}
    580
    581	if (tracker && tracker->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP &&
    582	    !roce_lag)
    583		mlx5_lag_drop_rule_setup(ldev, tracker);
    584
    585	ldev->mode = mode;
    586	ldev->mode_flags = flags;
    587	return 0;
    588}
    589
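/* Deactivation mirrors activation: reset the mode, undo the shared-FDB
 * pairing if it was set, issue DESTROY_LAG, and finally tear down the
 * port selection flow table and any ingress drop rules.
 */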
    590static int mlx5_deactivate_lag(struct mlx5_lag *ldev)
    591{
    592	struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
    593	struct mlx5_core_dev *dev1 = ldev->pf[MLX5_LAG_P2].dev;
    594	u32 in[MLX5_ST_SZ_DW(destroy_lag_in)] = {};
    595	bool roce_lag = __mlx5_lag_is_roce(ldev);
    596	unsigned long flags = ldev->mode_flags;
    597	int err;
    598
    599	ldev->mode = MLX5_LAG_MODE_NONE;
    600	ldev->mode_flags = 0;
    601	mlx5_lag_mp_reset(ldev);
    602
    603	if (test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &flags)) {
    604		mlx5_eswitch_offloads_destroy_single_fdb(dev0->priv.eswitch,
    605							 dev1->priv.eswitch);
    606		clear_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &flags);
    607	}
    608
    609	MLX5_SET(destroy_lag_in, in, opcode, MLX5_CMD_OP_DESTROY_LAG);
    610	err = mlx5_cmd_exec_in(dev0, destroy_lag, in);
    611	if (err) {
    612		if (roce_lag) {
    613			mlx5_core_err(dev0,
    614				      "Failed to deactivate RoCE LAG; driver restart required\n");
    615		} else {
    616			mlx5_core_err(dev0,
    617				      "Failed to deactivate VF LAG; driver restart required\n"
    618				      "Make sure all VFs are unbound prior to VF LAG activation or deactivation\n");
    619		}
    620		return err;
    621	}
    622
    623	if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags))
    624		mlx5_lag_port_sel_destroy(ldev);
    625	if (mlx5_lag_has_drop_rule(ldev))
    626		mlx5_lag_drop_rule_cleanup(ldev);
    627
    628	return 0;
    629}
    630
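/* LAG prerequisites, as checked below: every pf slot must hold a device;
 * with CONFIG_MLX5_ESWITCH all devices must share the same eswitch mode
 * (none or offloads), and offloads mode is limited to two ports; without
 * eswitch support, SR-IOV must be disabled on every port.
 */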
    631#define MLX5_LAG_OFFLOADS_SUPPORTED_PORTS 2
    632static bool mlx5_lag_check_prereq(struct mlx5_lag *ldev)
    633{
    634#ifdef CONFIG_MLX5_ESWITCH
    635	u8 mode;
    636#endif
    637	int i;
    638
    639	for (i = 0; i < ldev->ports; i++)
    640		if (!ldev->pf[i].dev)
    641			return false;
    642
    643#ifdef CONFIG_MLX5_ESWITCH
    644	mode = mlx5_eswitch_mode(ldev->pf[MLX5_LAG_P1].dev);
    645
    646	if (mode != MLX5_ESWITCH_NONE && mode != MLX5_ESWITCH_OFFLOADS)
    647		return false;
    648
    649	for (i = 0; i < ldev->ports; i++)
    650		if (mlx5_eswitch_mode(ldev->pf[i].dev) != mode)
    651			return false;
    652
    653	if (mode == MLX5_ESWITCH_OFFLOADS && ldev->ports != MLX5_LAG_OFFLOADS_SUPPORTED_PORTS)
    654		return false;
    655#else
    656	for (i = 0; i < ldev->ports; i++)
    657		if (mlx5_sriov_is_enabled(ldev->pf[i].dev))
    658			return false;
    659#endif
    660	return true;
    661}
    662
    663static void mlx5_lag_add_devices(struct mlx5_lag *ldev)
    664{
    665	int i;
    666
    667	for (i = 0; i < ldev->ports; i++) {
    668		if (!ldev->pf[i].dev)
    669			continue;
    670
    671		if (ldev->pf[i].dev->priv.flags &
    672		    MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV)
    673			continue;
    674
    675		ldev->pf[i].dev->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
    676		mlx5_rescan_drivers_locked(ldev->pf[i].dev);
    677	}
    678}
    679
    680static void mlx5_lag_remove_devices(struct mlx5_lag *ldev)
    681{
    682	int i;
    683
    684	for (i = 0; i < ldev->ports; i++) {
    685		if (!ldev->pf[i].dev)
    686			continue;
    687
    688		if (ldev->pf[i].dev->priv.flags &
    689		    MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV)
    690			continue;
    691
    692		ldev->pf[i].dev->priv.flags |= MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
    693		mlx5_rescan_drivers_locked(ldev->pf[i].dev);
    694	}
    695}
    696
    697void mlx5_disable_lag(struct mlx5_lag *ldev)
    698{
    699	bool shared_fdb = test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &ldev->mode_flags);
    700	struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
    701	struct mlx5_core_dev *dev1 = ldev->pf[MLX5_LAG_P2].dev;
    702	bool roce_lag;
    703	int err;
    704	int i;
    705
    706	roce_lag = __mlx5_lag_is_roce(ldev);
    707
    708	if (shared_fdb) {
    709		mlx5_lag_remove_devices(ldev);
    710	} else if (roce_lag) {
    711		if (!(dev0->priv.flags & MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV)) {
    712			dev0->priv.flags |= MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
    713			mlx5_rescan_drivers_locked(dev0);
    714		}
    715		for (i = 1; i < ldev->ports; i++)
    716			mlx5_nic_vport_disable_roce(ldev->pf[i].dev);
    717	}
    718
    719	err = mlx5_deactivate_lag(ldev);
    720	if (err)
    721		return;
    722
    723	if (shared_fdb || roce_lag)
    724		mlx5_lag_add_devices(ldev);
    725
    726	if (shared_fdb) {
    727		if (!(dev0->priv.flags & MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV))
    728			mlx5_eswitch_reload_reps(dev0->priv.eswitch);
    729		if (!(dev1->priv.flags & MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV))
    730			mlx5_eswitch_reload_reps(dev1->priv.eswitch);
    731	}
    732}
    733
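/* Shared FDB is reported as supported only when both devices are in
 * switchdev mode with vport match metadata enabled, their eswitches are
 * paired via devcom, and the firmware exposes lag_native_fdb_selection,
 * root_ft_on_other_esw and esw_shared_ingress_acl.
 */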
    734bool mlx5_shared_fdb_supported(struct mlx5_lag *ldev)
    735{
    736	struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
    737	struct mlx5_core_dev *dev1 = ldev->pf[MLX5_LAG_P2].dev;
    738
    739	if (is_mdev_switchdev_mode(dev0) &&
    740	    is_mdev_switchdev_mode(dev1) &&
    741	    mlx5_eswitch_vport_match_metadata_enabled(dev0->priv.eswitch) &&
    742	    mlx5_eswitch_vport_match_metadata_enabled(dev1->priv.eswitch) &&
    743	    mlx5_devcom_is_paired(dev0->priv.devcom,
    744				  MLX5_DEVCOM_ESW_OFFLOADS) &&
    745	    MLX5_CAP_GEN(dev1, lag_native_fdb_selection) &&
    746	    MLX5_CAP_ESW(dev1, root_ft_on_other_esw) &&
    747	    MLX5_CAP_ESW(dev0, esw_shared_ingress_acl))
    748		return true;
    749
    750	return false;
    751}
    752
    753static bool mlx5_lag_is_roce_lag(struct mlx5_lag *ldev)
    754{
    755	bool roce_lag = true;
    756	int i;
    757
    758	for (i = 0; i < ldev->ports; i++)
    759		roce_lag = roce_lag && !mlx5_sriov_is_enabled(ldev->pf[i].dev);
    760
    761#ifdef CONFIG_MLX5_ESWITCH
    762	for (i = 0; i < ldev->ports; i++)
    763		roce_lag = roce_lag &&
    764			ldev->pf[i].dev->priv.eswitch->mode == MLX5_ESWITCH_NONE;
    765#endif
    766
    767	return roce_lag;
    768}
    769
    770static bool mlx5_lag_should_modify_lag(struct mlx5_lag *ldev, bool do_bond)
    771{
    772	return do_bond && __mlx5_lag_is_active(ldev) &&
    773	       ldev->mode != MLX5_LAG_MODE_MPESW;
    774}
    775
    776static bool mlx5_lag_should_disable_lag(struct mlx5_lag *ldev, bool do_bond)
    777{
    778	return !do_bond && __mlx5_lag_is_active(ldev) &&
    779	       ldev->mode != MLX5_LAG_MODE_MPESW;
    780}
    781
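/* Bond worker: based on the tracker state this either activates LAG
 * (as RoCE LAG or VF/SR-IOV LAG, optionally with a shared FDB),
 * re-maps an already active LAG, or disables it. MPESW and multipath
 * modes are left untouched here.
 */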
    782static void mlx5_do_bond(struct mlx5_lag *ldev)
    783{
    784	struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
    785	struct mlx5_core_dev *dev1 = ldev->pf[MLX5_LAG_P2].dev;
    786	struct lag_tracker tracker = { };
    787	bool do_bond, roce_lag;
    788	int err;
    789	int i;
    790
    791	if (!mlx5_lag_is_ready(ldev)) {
    792		do_bond = false;
    793	} else {
    794		/* VF LAG is in multipath mode, ignore bond change requests */
    795		if (mlx5_lag_is_multipath(dev0))
    796			return;
    797
    798		tracker = ldev->tracker;
    799
    800		do_bond = tracker.is_bonded && mlx5_lag_check_prereq(ldev);
    801	}
    802
    803	if (do_bond && !__mlx5_lag_is_active(ldev)) {
    804		bool shared_fdb = mlx5_shared_fdb_supported(ldev);
    805
    806		roce_lag = mlx5_lag_is_roce_lag(ldev);
    807
    808		if (shared_fdb || roce_lag)
    809			mlx5_lag_remove_devices(ldev);
    810
    811		err = mlx5_activate_lag(ldev, &tracker,
    812					roce_lag ? MLX5_LAG_MODE_ROCE :
    813						   MLX5_LAG_MODE_SRIOV,
    814					shared_fdb);
    815		if (err) {
    816			if (shared_fdb || roce_lag)
    817				mlx5_lag_add_devices(ldev);
    818
    819			return;
    820		} else if (roce_lag) {
    821			dev0->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
    822			mlx5_rescan_drivers_locked(dev0);
    823			for (i = 1; i < ldev->ports; i++)
    824				mlx5_nic_vport_enable_roce(ldev->pf[i].dev);
    825		} else if (shared_fdb) {
    826			dev0->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
    827			mlx5_rescan_drivers_locked(dev0);
    828
    829			err = mlx5_eswitch_reload_reps(dev0->priv.eswitch);
    830			if (!err)
    831				err = mlx5_eswitch_reload_reps(dev1->priv.eswitch);
    832
    833			if (err) {
    834				dev0->priv.flags |= MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
    835				mlx5_rescan_drivers_locked(dev0);
    836				mlx5_deactivate_lag(ldev);
    837				mlx5_lag_add_devices(ldev);
    838				mlx5_eswitch_reload_reps(dev0->priv.eswitch);
    839				mlx5_eswitch_reload_reps(dev1->priv.eswitch);
    840				mlx5_core_err(dev0, "Failed to enable lag\n");
    841				return;
    842			}
    843		}
    844	} else if (mlx5_lag_should_modify_lag(ldev, do_bond)) {
    845		mlx5_modify_lag(ldev, &tracker);
    846	} else if (mlx5_lag_should_disable_lag(ldev, do_bond)) {
    847		mlx5_disable_lag(ldev);
    848	}
    849}
    850
    851static void mlx5_queue_bond_work(struct mlx5_lag *ldev, unsigned long delay)
    852{
    853	queue_delayed_work(ldev->wq, &ldev->bond_work, delay);
    854}
    855
    856static void mlx5_do_bond_work(struct work_struct *work)
    857{
    858	struct delayed_work *delayed_work = to_delayed_work(work);
    859	struct mlx5_lag *ldev = container_of(delayed_work, struct mlx5_lag,
    860					     bond_work);
    861	int status;
    862
    863	status = mlx5_dev_list_trylock();
    864	if (!status) {
    865		mlx5_queue_bond_work(ldev, HZ);
    866		return;
    867	}
    868
    869	mutex_lock(&ldev->lock);
    870	if (ldev->mode_changes_in_progress) {
    871		mutex_unlock(&ldev->lock);
    872		mlx5_dev_list_unlock();
    873		mlx5_queue_bond_work(ldev, HZ);
    874		return;
    875	}
    876
    877	mlx5_do_bond(ldev);
    878	mutex_unlock(&ldev->lock);
    879	mlx5_dev_list_unlock();
    880}
    881
    882static int mlx5_handle_changeupper_event(struct mlx5_lag *ldev,
    883					 struct lag_tracker *tracker,
    884					 struct netdev_notifier_changeupper_info *info)
    885{
    886	struct net_device *upper = info->upper_dev, *ndev_tmp;
    887	struct netdev_lag_upper_info *lag_upper_info = NULL;
    888	bool is_bonded, is_in_lag, mode_supported;
    889	bool has_inactive = 0;
    890	struct slave *slave;
    891	u8 bond_status = 0;
    892	int num_slaves = 0;
    893	int changed = 0;
    894	int idx;
    895
    896	if (!netif_is_lag_master(upper))
    897		return 0;
    898
    899	if (info->linking)
    900		lag_upper_info = info->upper_info;
    901
    902	/* The event may still be of interest if the slave does not belong to
    903	 * us, but is enslaved to a master which has one or more of our netdevs
    904	 * as slaves (e.g., if a new slave is added to a master that bonds two
    905	 * of our netdevs, we should unbond).
    906	 */
    907	rcu_read_lock();
    908	for_each_netdev_in_bond_rcu(upper, ndev_tmp) {
    909		idx = mlx5_lag_dev_get_netdev_idx(ldev, ndev_tmp);
    910		if (idx >= 0) {
    911			slave = bond_slave_get_rcu(ndev_tmp);
    912			if (slave)
    913				has_inactive |= bond_is_slave_inactive(slave);
    914			bond_status |= (1 << idx);
    915		}
    916
    917		num_slaves++;
    918	}
    919	rcu_read_unlock();
    920
    921	/* None of this lagdev's netdevs are slaves of this master. */
    922	if (!(bond_status & GENMASK(ldev->ports - 1, 0)))
    923		return 0;
    924
    925	if (lag_upper_info) {
    926		tracker->tx_type = lag_upper_info->tx_type;
    927		tracker->hash_type = lag_upper_info->hash_type;
    928	}
    929
    930	tracker->has_inactive = has_inactive;
    931	/* Determine bonding status:
    932	 * A device is considered bonded if all its physical ports are slaves
    933	 * of the same lag master, and only them.
    934	 */
    935	is_in_lag = num_slaves == ldev->ports &&
    936		bond_status == GENMASK(ldev->ports - 1, 0);
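	/* e.g. with ldev->ports == 2, GENMASK(1, 0) is 0x3: both of our
	 * netdevs must be enslaved to this master and no other slaves may
	 * be present (num_slaves == ldev->ports).
	 */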
    937
    938	/* Lag mode must be activebackup or hash. */
    939	mode_supported = tracker->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP ||
    940			 tracker->tx_type == NETDEV_LAG_TX_TYPE_HASH;
    941
    942	is_bonded = is_in_lag && mode_supported;
    943	if (tracker->is_bonded != is_bonded) {
    944		tracker->is_bonded = is_bonded;
    945		changed = 1;
    946	}
    947
    948	if (!is_in_lag)
    949		return changed;
    950
    951	if (!mlx5_lag_is_ready(ldev))
    952		NL_SET_ERR_MSG_MOD(info->info.extack,
    953				   "Can't activate LAG offload, PF is configured with more than 64 VFs");
    954	else if (!mode_supported)
    955		NL_SET_ERR_MSG_MOD(info->info.extack,
    956				   "Can't activate LAG offload, TX type isn't supported");
    957
    958	return changed;
    959}
    960
    961static int mlx5_handle_changelowerstate_event(struct mlx5_lag *ldev,
    962					      struct lag_tracker *tracker,
    963					      struct net_device *ndev,
    964					      struct netdev_notifier_changelowerstate_info *info)
    965{
    966	struct netdev_lag_lower_state_info *lag_lower_info;
    967	int idx;
    968
    969	if (!netif_is_lag_port(ndev))
    970		return 0;
    971
    972	idx = mlx5_lag_dev_get_netdev_idx(ldev, ndev);
    973	if (idx < 0)
    974		return 0;
    975
    976	/* This information is used to determine virtual to physical
    977	 * port mapping.
    978	 */
    979	lag_lower_info = info->lower_state_info;
    980	if (!lag_lower_info)
    981		return 0;
    982
    983	tracker->netdev_state[idx] = *lag_lower_info;
    984
    985	return 1;
    986}
    987
    988static int mlx5_handle_changeinfodata_event(struct mlx5_lag *ldev,
    989					    struct lag_tracker *tracker,
    990					    struct net_device *ndev)
    991{
    992	struct net_device *ndev_tmp;
    993	struct slave *slave;
    994	bool has_inactive = 0;
    995	int idx;
    996
    997	if (!netif_is_lag_master(ndev))
    998		return 0;
    999
   1000	rcu_read_lock();
   1001	for_each_netdev_in_bond_rcu(ndev, ndev_tmp) {
   1002		idx = mlx5_lag_dev_get_netdev_idx(ldev, ndev_tmp);
   1003		if (idx < 0)
   1004			continue;
   1005
   1006		slave = bond_slave_get_rcu(ndev_tmp);
   1007		if (slave)
   1008			has_inactive |= bond_is_slave_inactive(slave);
   1009	}
   1010	rcu_read_unlock();
   1011
   1012	if (tracker->has_inactive == has_inactive)
   1013		return 0;
   1014
   1015	tracker->has_inactive = has_inactive;
   1016
   1017	return 1;
   1018}
   1019
   1020/* this handler is always registered to netdev events */
   1021static int mlx5_lag_netdev_event(struct notifier_block *this,
   1022				 unsigned long event, void *ptr)
   1023{
   1024	struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
   1025	struct lag_tracker tracker;
   1026	struct mlx5_lag *ldev;
   1027	int changed = 0;
   1028
   1029	if (event != NETDEV_CHANGEUPPER &&
   1030	    event != NETDEV_CHANGELOWERSTATE &&
   1031	    event != NETDEV_CHANGEINFODATA)
   1032		return NOTIFY_DONE;
   1033
   1034	ldev    = container_of(this, struct mlx5_lag, nb);
   1035
   1036	tracker = ldev->tracker;
   1037
   1038	switch (event) {
   1039	case NETDEV_CHANGEUPPER:
   1040		changed = mlx5_handle_changeupper_event(ldev, &tracker, ptr);
   1041		break;
   1042	case NETDEV_CHANGELOWERSTATE:
   1043		changed = mlx5_handle_changelowerstate_event(ldev, &tracker,
   1044							     ndev, ptr);
   1045		break;
   1046	case NETDEV_CHANGEINFODATA:
   1047		changed = mlx5_handle_changeinfodata_event(ldev, &tracker, ndev);
   1048		break;
   1049	}
   1050
   1051	ldev->tracker = tracker;
   1052
   1053	if (changed)
   1054		mlx5_queue_bond_work(ldev, 0);
   1055
   1056	return NOTIFY_DONE;
   1057}
   1058
   1059static void mlx5_ldev_add_netdev(struct mlx5_lag *ldev,
   1060				 struct mlx5_core_dev *dev,
   1061				 struct net_device *netdev)
   1062{
   1063	unsigned int fn = mlx5_get_dev_index(dev);
   1064
   1065	if (fn >= ldev->ports)
   1066		return;
   1067
   1068	spin_lock(&lag_lock);
   1069	ldev->pf[fn].netdev = netdev;
   1070	ldev->tracker.netdev_state[fn].link_up = 0;
   1071	ldev->tracker.netdev_state[fn].tx_enabled = 0;
   1072	spin_unlock(&lag_lock);
   1073}
   1074
   1075static void mlx5_ldev_remove_netdev(struct mlx5_lag *ldev,
   1076				    struct net_device *netdev)
   1077{
   1078	int i;
   1079
   1080	spin_lock(&lag_lock);
   1081	for (i = 0; i < ldev->ports; i++) {
   1082		if (ldev->pf[i].netdev == netdev) {
   1083			ldev->pf[i].netdev = NULL;
   1084			break;
   1085		}
   1086	}
   1087	spin_unlock(&lag_lock);
   1088}
   1089
   1090static void mlx5_ldev_add_mdev(struct mlx5_lag *ldev,
   1091			       struct mlx5_core_dev *dev)
   1092{
   1093	unsigned int fn = mlx5_get_dev_index(dev);
   1094
   1095	if (fn >= ldev->ports)
   1096		return;
   1097
   1098	ldev->pf[fn].dev = dev;
   1099	dev->priv.lag = ldev;
   1100}
   1101
   1102static void mlx5_ldev_remove_mdev(struct mlx5_lag *ldev,
   1103				  struct mlx5_core_dev *dev)
   1104{
   1105	int i;
   1106
   1107	for (i = 0; i < ldev->ports; i++)
   1108		if (ldev->pf[i].dev == dev)
   1109			break;
   1110
   1111	if (i == ldev->ports)
   1112		return;
   1113
   1114	ldev->pf[i].dev = NULL;
   1115	dev->priv.lag = NULL;
   1116}
   1117
   1118/* Must be called with intf_mutex held */
   1119static int __mlx5_lag_dev_add_mdev(struct mlx5_core_dev *dev)
   1120{
   1121	struct mlx5_lag *ldev = NULL;
   1122	struct mlx5_core_dev *tmp_dev;
   1123
   1124	tmp_dev = mlx5_get_next_phys_dev_lag(dev);
   1125	if (tmp_dev)
   1126		ldev = tmp_dev->priv.lag;
   1127
   1128	if (!ldev) {
   1129		ldev = mlx5_lag_dev_alloc(dev);
   1130		if (!ldev) {
   1131			mlx5_core_err(dev, "Failed to alloc lag dev\n");
   1132			return 0;
   1133		}
   1134		mlx5_ldev_add_mdev(ldev, dev);
   1135		return 0;
   1136	}
   1137
   1138	mutex_lock(&ldev->lock);
   1139	if (ldev->mode_changes_in_progress) {
   1140		mutex_unlock(&ldev->lock);
   1141		return -EAGAIN;
   1142	}
   1143	mlx5_ldev_get(ldev);
   1144	mlx5_ldev_add_mdev(ldev, dev);
   1145	mutex_unlock(&ldev->lock);
   1146
   1147	return 0;
   1148}
   1149
   1150void mlx5_lag_remove_mdev(struct mlx5_core_dev *dev)
   1151{
   1152	struct mlx5_lag *ldev;
   1153
   1154	ldev = mlx5_lag_dev(dev);
   1155	if (!ldev)
   1156		return;
   1157
   1158	/* mdev is being removed, might as well remove debugfs
   1159	 * as early as possible.
   1160	 */
   1161	mlx5_ldev_remove_debugfs(dev->priv.dbg.lag_debugfs);
   1162recheck:
   1163	mutex_lock(&ldev->lock);
   1164	if (ldev->mode_changes_in_progress) {
   1165		mutex_unlock(&ldev->lock);
   1166		msleep(100);
   1167		goto recheck;
   1168	}
   1169	mlx5_ldev_remove_mdev(ldev, dev);
   1170	mutex_unlock(&ldev->lock);
   1171	mlx5_ldev_put(ldev);
   1172}
   1173
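/* LAG is only considered for devices that are vport group managers and
 * lag masters with between 2 and MLX5_MAX_PORTS lag ports. The add is
 * retried every 100 ms while a LAG mode change is in progress (the
 * helper above returns -EAGAIN in that case).
 */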
   1174void mlx5_lag_add_mdev(struct mlx5_core_dev *dev)
   1175{
   1176	int err;
   1177
   1178	if (!MLX5_CAP_GEN(dev, vport_group_manager) ||
   1179	    !MLX5_CAP_GEN(dev, lag_master) ||
   1180	    (MLX5_CAP_GEN(dev, num_lag_ports) > MLX5_MAX_PORTS ||
   1181	     MLX5_CAP_GEN(dev, num_lag_ports) <= 1))
   1182		return;
   1183
   1184recheck:
   1185	mlx5_dev_list_lock();
   1186	err = __mlx5_lag_dev_add_mdev(dev);
   1187	mlx5_dev_list_unlock();
   1188
   1189	if (err) {
   1190		msleep(100);
   1191		goto recheck;
   1192	}
   1193	mlx5_ldev_add_debugfs(dev);
   1194}
   1195
   1196void mlx5_lag_remove_netdev(struct mlx5_core_dev *dev,
   1197			    struct net_device *netdev)
   1198{
   1199	struct mlx5_lag *ldev;
   1200	bool lag_is_active;
   1201
   1202	ldev = mlx5_lag_dev(dev);
   1203	if (!ldev)
   1204		return;
   1205
   1206	mutex_lock(&ldev->lock);
   1207	mlx5_ldev_remove_netdev(ldev, netdev);
   1208	clear_bit(MLX5_LAG_FLAG_NDEVS_READY, &ldev->state_flags);
   1209
   1210	lag_is_active = __mlx5_lag_is_active(ldev);
   1211	mutex_unlock(&ldev->lock);
   1212
   1213	if (lag_is_active)
   1214		mlx5_queue_bond_work(ldev, 0);
   1215}
   1216
   1217void mlx5_lag_add_netdev(struct mlx5_core_dev *dev,
   1218			 struct net_device *netdev)
   1219{
   1220	struct mlx5_lag *ldev;
   1221	int i;
   1222
   1223	ldev = mlx5_lag_dev(dev);
   1224	if (!ldev)
   1225		return;
   1226
   1227	mutex_lock(&ldev->lock);
   1228	mlx5_ldev_add_netdev(ldev, dev, netdev);
   1229
   1230	for (i = 0; i < ldev->ports; i++)
   1231		if (!ldev->pf[i].dev)
   1232			break;
   1233
   1234	if (i >= ldev->ports)
   1235		set_bit(MLX5_LAG_FLAG_NDEVS_READY, &ldev->state_flags);
   1236	mutex_unlock(&ldev->lock);
   1237	mlx5_queue_bond_work(ldev, 0);
   1238}
   1239
   1240bool mlx5_lag_is_roce(struct mlx5_core_dev *dev)
   1241{
   1242	struct mlx5_lag *ldev;
   1243	bool res;
   1244
   1245	spin_lock(&lag_lock);
   1246	ldev = mlx5_lag_dev(dev);
   1247	res  = ldev && __mlx5_lag_is_roce(ldev);
   1248	spin_unlock(&lag_lock);
   1249
   1250	return res;
   1251}
   1252EXPORT_SYMBOL(mlx5_lag_is_roce);
   1253
   1254bool mlx5_lag_is_active(struct mlx5_core_dev *dev)
   1255{
   1256	struct mlx5_lag *ldev;
   1257	bool res;
   1258
   1259	spin_lock(&lag_lock);
   1260	ldev = mlx5_lag_dev(dev);
   1261	res  = ldev && __mlx5_lag_is_active(ldev);
   1262	spin_unlock(&lag_lock);
   1263
   1264	return res;
   1265}
   1266EXPORT_SYMBOL(mlx5_lag_is_active);
   1267
   1268bool mlx5_lag_is_master(struct mlx5_core_dev *dev)
   1269{
   1270	struct mlx5_lag *ldev;
   1271	bool res;
   1272
   1273	spin_lock(&lag_lock);
   1274	ldev = mlx5_lag_dev(dev);
   1275	res = ldev && __mlx5_lag_is_active(ldev) &&
   1276		dev == ldev->pf[MLX5_LAG_P1].dev;
   1277	spin_unlock(&lag_lock);
   1278
   1279	return res;
   1280}
   1281EXPORT_SYMBOL(mlx5_lag_is_master);
   1282
   1283bool mlx5_lag_is_sriov(struct mlx5_core_dev *dev)
   1284{
   1285	struct mlx5_lag *ldev;
   1286	bool res;
   1287
   1288	spin_lock(&lag_lock);
   1289	ldev = mlx5_lag_dev(dev);
   1290	res  = ldev && __mlx5_lag_is_sriov(ldev);
   1291	spin_unlock(&lag_lock);
   1292
   1293	return res;
   1294}
   1295EXPORT_SYMBOL(mlx5_lag_is_sriov);
   1296
   1297bool mlx5_lag_is_shared_fdb(struct mlx5_core_dev *dev)
   1298{
   1299	struct mlx5_lag *ldev;
   1300	bool res;
   1301
   1302	spin_lock(&lag_lock);
   1303	ldev = mlx5_lag_dev(dev);
   1304	res = ldev && __mlx5_lag_is_sriov(ldev) &&
   1305	      test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &ldev->mode_flags);
   1306	spin_unlock(&lag_lock);
   1307
   1308	return res;
   1309}
   1310EXPORT_SYMBOL(mlx5_lag_is_shared_fdb);
   1311
   1312void mlx5_lag_disable_change(struct mlx5_core_dev *dev)
   1313{
   1314	struct mlx5_lag *ldev;
   1315
   1316	ldev = mlx5_lag_dev(dev);
   1317	if (!ldev)
   1318		return;
   1319
   1320	mlx5_dev_list_lock();
   1321	mutex_lock(&ldev->lock);
   1322
   1323	ldev->mode_changes_in_progress++;
   1324	if (__mlx5_lag_is_active(ldev))
   1325		mlx5_disable_lag(ldev);
   1326
   1327	mutex_unlock(&ldev->lock);
   1328	mlx5_dev_list_unlock();
   1329}
   1330
   1331void mlx5_lag_enable_change(struct mlx5_core_dev *dev)
   1332{
   1333	struct mlx5_lag *ldev;
   1334
   1335	ldev = mlx5_lag_dev(dev);
   1336	if (!ldev)
   1337		return;
   1338
   1339	mutex_lock(&ldev->lock);
   1340	ldev->mode_changes_in_progress--;
   1341	mutex_unlock(&ldev->lock);
   1342	mlx5_queue_bond_work(ldev, 0);
   1343}
   1344
   1345struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev)
   1346{
   1347	struct net_device *ndev = NULL;
   1348	struct mlx5_lag *ldev;
   1349	int i;
   1350
   1351	spin_lock(&lag_lock);
   1352	ldev = mlx5_lag_dev(dev);
   1353
   1354	if (!(ldev && __mlx5_lag_is_roce(ldev)))
   1355		goto unlock;
   1356
   1357	if (ldev->tracker.tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) {
   1358		for (i = 0; i < ldev->ports; i++)
   1359			if (ldev->tracker.netdev_state[i].tx_enabled)
   1360				ndev = ldev->pf[i].netdev;
   1361		if (!ndev)
   1362			ndev = ldev->pf[ldev->ports - 1].netdev;
   1363	} else {
   1364		ndev = ldev->pf[MLX5_LAG_P1].netdev;
   1365	}
   1366	if (ndev)
   1367		dev_hold(ndev);
   1368
   1369unlock:
   1370	spin_unlock(&lag_lock);
   1371
   1372	return ndev;
   1373}
   1374EXPORT_SYMBOL(mlx5_lag_get_roce_netdev);
   1375
   1376u8 mlx5_lag_get_slave_port(struct mlx5_core_dev *dev,
   1377			   struct net_device *slave)
   1378{
   1379	struct mlx5_lag *ldev;
   1380	u8 port = 0;
   1381	int i;
   1382
   1383	spin_lock(&lag_lock);
   1384	ldev = mlx5_lag_dev(dev);
   1385	if (!(ldev && __mlx5_lag_is_roce(ldev)))
   1386		goto unlock;
   1387
   1388	for (i = 0; i < ldev->ports; i++) {
   1389		if (ldev->pf[i].netdev == slave) {
   1390			port = i;
   1391			break;
   1392		}
   1393	}
   1394
   1395	port = ldev->v2p_map[port * ldev->buckets];
   1396
   1397unlock:
   1398	spin_unlock(&lag_lock);
   1399	return port;
   1400}
   1401EXPORT_SYMBOL(mlx5_lag_get_slave_port);
   1402
   1403u8 mlx5_lag_get_num_ports(struct mlx5_core_dev *dev)
   1404{
   1405	struct mlx5_lag *ldev;
   1406
   1407	ldev = mlx5_lag_dev(dev);
   1408	if (!ldev)
   1409		return 0;
   1410
   1411	return ldev->ports;
   1412}
   1413EXPORT_SYMBOL(mlx5_lag_get_num_ports);
   1414
   1415struct mlx5_core_dev *mlx5_lag_get_peer_mdev(struct mlx5_core_dev *dev)
   1416{
   1417	struct mlx5_core_dev *peer_dev = NULL;
   1418	struct mlx5_lag *ldev;
   1419
   1420	spin_lock(&lag_lock);
   1421	ldev = mlx5_lag_dev(dev);
   1422	if (!ldev)
   1423		goto unlock;
   1424
   1425	peer_dev = ldev->pf[MLX5_LAG_P1].dev == dev ?
   1426			   ldev->pf[MLX5_LAG_P2].dev :
   1427			   ldev->pf[MLX5_LAG_P1].dev;
   1428
   1429unlock:
   1430	spin_unlock(&lag_lock);
   1431	return peer_dev;
   1432}
   1433EXPORT_SYMBOL(mlx5_lag_get_peer_mdev);
   1434
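/* Sum the congestion counters selected by offsets[] across all LAG
 * member devices (or just the given device when LAG is not active).
 * values[] is expected to hold num_counters entries; each offset is a
 * byte offset into the query_cong_statistics_out mailbox.
 */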
   1435int mlx5_lag_query_cong_counters(struct mlx5_core_dev *dev,
   1436				 u64 *values,
   1437				 int num_counters,
   1438				 size_t *offsets)
   1439{
   1440	int outlen = MLX5_ST_SZ_BYTES(query_cong_statistics_out);
   1441	struct mlx5_core_dev **mdev;
   1442	struct mlx5_lag *ldev;
   1443	int num_ports;
   1444	int ret, i, j;
   1445	void *out;
   1446
   1447	out = kvzalloc(outlen, GFP_KERNEL);
   1448	if (!out)
   1449		return -ENOMEM;
   1450
   1451	mdev = kvzalloc(sizeof(mdev[0]) * MLX5_MAX_PORTS, GFP_KERNEL);
   1452	if (!mdev) {
   1453		ret = -ENOMEM;
   1454		goto free_out;
   1455	}
   1456
   1457	memset(values, 0, sizeof(*values) * num_counters);
   1458
   1459	spin_lock(&lag_lock);
   1460	ldev = mlx5_lag_dev(dev);
   1461	if (ldev && __mlx5_lag_is_active(ldev)) {
   1462		num_ports = ldev->ports;
   1463		for (i = 0; i < ldev->ports; i++)
   1464			mdev[i] = ldev->pf[i].dev;
   1465	} else {
   1466		num_ports = 1;
   1467		mdev[MLX5_LAG_P1] = dev;
   1468	}
   1469	spin_unlock(&lag_lock);
   1470
   1471	for (i = 0; i < num_ports; ++i) {
   1472		u32 in[MLX5_ST_SZ_DW(query_cong_statistics_in)] = {};
   1473
   1474		MLX5_SET(query_cong_statistics_in, in, opcode,
   1475			 MLX5_CMD_OP_QUERY_CONG_STATISTICS);
   1476		ret = mlx5_cmd_exec_inout(mdev[i], query_cong_statistics, in,
   1477					  out);
   1478		if (ret)
   1479			goto free_mdev;
   1480
   1481		for (j = 0; j < num_counters; ++j)
   1482			values[j] += be64_to_cpup((__be64 *)(out + offsets[j]));
   1483	}
   1484
   1485free_mdev:
   1486	kvfree(mdev);
   1487free_out:
   1488	kvfree(out);
   1489	return ret;
   1490}
   1491EXPORT_SYMBOL(mlx5_lag_query_cong_counters);