cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

smc_ib.c (27206B)


// SPDX-License-Identifier: GPL-2.0
/*
 *  Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 *  IB infrastructure:
 *  Establish SMC-R as an Infiniband Client to be notified about added and
 *  removed IB devices of type RDMA.
 *  Determine device and port characteristics for these IB devices.
 *
 *  Copyright IBM Corp. 2016
 *
 *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
 */

#include <linux/etherdevice.h>
#include <linux/if_vlan.h>
#include <linux/random.h>
#include <linux/workqueue.h>
#include <linux/scatterlist.h>
#include <linux/wait.h>
#include <linux/mutex.h>
#include <linux/inetdevice.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_cache.h>

#include "smc_pnet.h"
#include "smc_ib.h"
#include "smc_core.h"
#include "smc_wr.h"
#include "smc.h"
#include "smc_netlink.h"

#define SMC_MAX_CQE 32766	/* max. # of completion queue elements */

#define SMC_QP_MIN_RNR_TIMER		5
#define SMC_QP_TIMEOUT			15 /* 4096 * 2 ** timeout usec */
#define SMC_QP_RETRY_CNT			7 /* 7: infinite */
#define SMC_QP_RNR_RETRY			7 /* 7: infinite */

struct smc_ib_devices smc_ib_devices = {	/* smc-registered ib devices */
	.mutex = __MUTEX_INITIALIZER(smc_ib_devices.mutex),
	.list = LIST_HEAD_INIT(smc_ib_devices.list),
};

u8 local_systemid[SMC_SYSTEMID_LEN];		/* unique system identifier */

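/* move the link's RC QP into the INIT state: bind it to the IB port,
 * select pkey index 0 and grant local and remote write access
 */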
static int smc_ib_modify_qp_init(struct smc_link *lnk)
{
	struct ib_qp_attr qp_attr;

	memset(&qp_attr, 0, sizeof(qp_attr));
	qp_attr.qp_state = IB_QPS_INIT;
	qp_attr.pkey_index = 0;
	qp_attr.port_num = lnk->ibport;
	qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE
				| IB_ACCESS_REMOTE_WRITE;
	return ib_modify_qp(lnk->roce_qp, &qp_attr,
			    IB_QP_STATE | IB_QP_PKEY_INDEX |
			    IB_QP_ACCESS_FLAGS | IB_QP_PORT);
}

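/* move the QP to RTR (ready to receive): program the RoCE address handle
 * with the peer GID, the source GID index and the destination MAC (the
 * next-hop MAC when an SMC-Rv2 link crosses a gateway), the negotiated
 * path MTU, the peer QP number and the starting receive packet sequence
 * number
 */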
static int smc_ib_modify_qp_rtr(struct smc_link *lnk)
{
	enum ib_qp_attr_mask qp_attr_mask =
		IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN |
		IB_QP_RQ_PSN | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER;
	struct ib_qp_attr qp_attr;
	u8 hop_lim = 1;

	memset(&qp_attr, 0, sizeof(qp_attr));
	qp_attr.qp_state = IB_QPS_RTR;
	qp_attr.path_mtu = min(lnk->path_mtu, lnk->peer_mtu);
	qp_attr.ah_attr.type = RDMA_AH_ATTR_TYPE_ROCE;
	rdma_ah_set_port_num(&qp_attr.ah_attr, lnk->ibport);
	if (lnk->lgr->smc_version == SMC_V2 && lnk->lgr->uses_gateway)
		hop_lim = IPV6_DEFAULT_HOPLIMIT;
	rdma_ah_set_grh(&qp_attr.ah_attr, NULL, 0, lnk->sgid_index, hop_lim, 0);
	rdma_ah_set_dgid_raw(&qp_attr.ah_attr, lnk->peer_gid);
	if (lnk->lgr->smc_version == SMC_V2 && lnk->lgr->uses_gateway)
		memcpy(&qp_attr.ah_attr.roce.dmac, lnk->lgr->nexthop_mac,
		       sizeof(lnk->lgr->nexthop_mac));
	else
		memcpy(&qp_attr.ah_attr.roce.dmac, lnk->peer_mac,
		       sizeof(lnk->peer_mac));
	qp_attr.dest_qp_num = lnk->peer_qpn;
	qp_attr.rq_psn = lnk->peer_psn; /* starting receive packet seq # */
	qp_attr.max_dest_rd_atomic = 1; /* max # of resources for incoming
					 * requests
					 */
	qp_attr.min_rnr_timer = SMC_QP_MIN_RNR_TIMER;

	return ib_modify_qp(lnk->roce_qp, &qp_attr, qp_attr_mask);
}

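/* move the QP to RTS (ready to send): set the local ack timeout, the
 * retry counters and the initial send packet sequence number
 */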
int smc_ib_modify_qp_rts(struct smc_link *lnk)
{
	struct ib_qp_attr qp_attr;

	memset(&qp_attr, 0, sizeof(qp_attr));
	qp_attr.qp_state = IB_QPS_RTS;
	qp_attr.timeout = SMC_QP_TIMEOUT;	/* local ack timeout */
	qp_attr.retry_cnt = SMC_QP_RETRY_CNT;	/* retry count */
	qp_attr.rnr_retry = SMC_QP_RNR_RETRY;	/* RNR retries, 7=infinite */
	qp_attr.sq_psn = lnk->psn_initial;	/* starting send packet seq # */
	qp_attr.max_rd_atomic = 1;	/* # of outstanding RDMA reads and
					 * atomic ops allowed
					 */
	return ib_modify_qp(lnk->roce_qp, &qp_attr,
			    IB_QP_STATE | IB_QP_TIMEOUT | IB_QP_RETRY_CNT |
			    IB_QP_SQ_PSN | IB_QP_RNR_RETRY |
			    IB_QP_MAX_QP_RD_ATOMIC);
}

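/* move the QP to the error state to flush all outstanding work requests */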
int smc_ib_modify_qp_error(struct smc_link *lnk)
{
	struct ib_qp_attr qp_attr;

	memset(&qp_attr, 0, sizeof(qp_attr));
	qp_attr.qp_state = IB_QPS_ERR;
	return ib_modify_qp(lnk->roce_qp, &qp_attr, IB_QP_STATE);
}

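/* bring up the QP of a new link: walk it through INIT and RTR, arm the
 * receive completion queue and post the initial receive work requests;
 * the server side additionally moves the QP to RTS right away
 */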
int smc_ib_ready_link(struct smc_link *lnk)
{
	struct smc_link_group *lgr = smc_get_lgr(lnk);
	int rc = 0;

	rc = smc_ib_modify_qp_init(lnk);
	if (rc)
		goto out;

	rc = smc_ib_modify_qp_rtr(lnk);
	if (rc)
		goto out;
	smc_wr_remember_qp_attr(lnk);
	rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_recv,
			      IB_CQ_SOLICITED_MASK);
	if (rc)
		goto out;
	rc = smc_wr_rx_post_init(lnk);
	if (rc)
		goto out;
	smc_wr_remember_qp_attr(lnk);

	if (lgr->role == SMC_SERV) {
		rc = smc_ib_modify_qp_rts(lnk);
		if (rc)
			goto out;
		smc_wr_remember_qp_attr(lnk);
	}
out:
	return rc;
}

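/* cache the MAC address belonging to GID index 0 of the given IB port */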
static int smc_ib_fill_mac(struct smc_ib_device *smcibdev, u8 ibport)
{
	const struct ib_gid_attr *attr;
	int rc;

	attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, 0);
	if (IS_ERR(attr))
		return -ENODEV;

	rc = rdma_read_gid_l2_fields(attr, NULL, smcibdev->mac[ibport - 1]);
	rdma_put_gid_attr(attr);
	return rc;
}

/* Create an identifier unique for this instance of SMC-R.
 * The MAC-address of the first active registered IB device
 * plus a random 2-byte number is used to create this identifier.
 * This name is delivered to the peer during connection initialization.
 */
static inline void smc_ib_define_local_systemid(struct smc_ib_device *smcibdev,
						u8 ibport)
{
	memcpy(&local_systemid[2], &smcibdev->mac[ibport - 1],
	       sizeof(smcibdev->mac[ibport - 1]));
}

bool smc_ib_is_valid_local_systemid(void)
{
	return !is_zero_ether_addr(&local_systemid[2]);
}

static void smc_ib_init_local_systemid(void)
{
	get_random_bytes(&local_systemid[0], 2);
}

bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport)
{
	return smcibdev->pattr[ibport - 1].state == IB_PORT_ACTIVE;
}

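/* resolve the IPv4 route from saddr to daddr in the initial namespace and
 * report the next-hop MAC address and whether a gateway is used
 */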
int smc_ib_find_route(__be32 saddr, __be32 daddr,
		      u8 nexthop_mac[], u8 *uses_gateway)
{
	struct neighbour *neigh = NULL;
	struct rtable *rt = NULL;
	struct flowi4 fl4 = {
		.saddr = saddr,
		.daddr = daddr
	};

	if (daddr == cpu_to_be32(INADDR_NONE))
		goto out;
	rt = ip_route_output_flow(&init_net, &fl4, NULL);
	if (IS_ERR(rt))
		goto out;
	if (rt->rt_uses_gateway && rt->rt_gw_family != AF_INET)
		goto out;
	neigh = rt->dst.ops->neigh_lookup(&rt->dst, NULL, &fl4.daddr);
	if (neigh) {
		memcpy(nexthop_mac, neigh->ha, ETH_ALEN);
		*uses_gateway = rt->rt_uses_gateway;
		return 0;
	}
out:
	return -ENOENT;
}

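/* check a single GID table entry under RCU: SMC-Rv1 accepts any RoCEv1 GID,
 * SMC-Rv2 requires a RoCEv2 GID whose IPv4 address shares a subnet with the
 * source address and for which a route to the peer can be resolved
 */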
static int smc_ib_determine_gid_rcu(const struct net_device *ndev,
				    const struct ib_gid_attr *attr,
				    u8 gid[], u8 *sgid_index,
				    struct smc_init_info_smcrv2 *smcrv2)
{
	if (!smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE) {
		if (gid)
			memcpy(gid, &attr->gid, SMC_GID_SIZE);
		if (sgid_index)
			*sgid_index = attr->index;
		return 0;
	}
	if (smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP &&
	    smc_ib_gid_to_ipv4((u8 *)&attr->gid) != cpu_to_be32(INADDR_NONE)) {
		struct in_device *in_dev = __in_dev_get_rcu(ndev);
		const struct in_ifaddr *ifa;
		bool subnet_match = false;

		if (!in_dev)
			goto out;
		in_dev_for_each_ifa_rcu(ifa, in_dev) {
			if (!inet_ifa_match(smcrv2->saddr, ifa))
				continue;
			subnet_match = true;
			break;
		}
		if (!subnet_match)
			goto out;
		if (smcrv2->daddr && smc_ib_find_route(smcrv2->saddr,
						       smcrv2->daddr,
						       smcrv2->nexthop_mac,
						       &smcrv2->uses_gateway))
			goto out;

		if (gid)
			memcpy(gid, &attr->gid, SMC_GID_SIZE);
		if (sgid_index)
			*sgid_index = attr->index;
		return 0;
	}
out:
	return -ENODEV;
}

/* determine the gid for an ib-device port and vlan id */
int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport,
			 unsigned short vlan_id, u8 gid[], u8 *sgid_index,
			 struct smc_init_info_smcrv2 *smcrv2)
{
	const struct ib_gid_attr *attr;
	const struct net_device *ndev;
	int i;

	for (i = 0; i < smcibdev->pattr[ibport - 1].gid_tbl_len; i++) {
		attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, i);
		if (IS_ERR(attr))
			continue;

		rcu_read_lock();
		ndev = rdma_read_gid_attr_ndev_rcu(attr);
		if (!IS_ERR(ndev) &&
		    ((!vlan_id && !is_vlan_dev(ndev)) ||
		     (vlan_id && is_vlan_dev(ndev) &&
		      vlan_dev_vlan_id(ndev) == vlan_id))) {
			if (!smc_ib_determine_gid_rcu(ndev, attr, gid,
						      sgid_index, smcrv2)) {
				rcu_read_unlock();
				rdma_put_gid_attr(attr);
				return 0;
			}
		}
		rcu_read_unlock();
		rdma_put_gid_attr(attr);
	}
	return -ENODEV;
}

/* check if gid is still defined on smcibdev */
static bool smc_ib_check_link_gid(u8 gid[SMC_GID_SIZE], bool smcrv2,
				  struct smc_ib_device *smcibdev, u8 ibport)
{
	const struct ib_gid_attr *attr;
	bool rc = false;
	int i;

	for (i = 0; !rc && i < smcibdev->pattr[ibport - 1].gid_tbl_len; i++) {
		attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, i);
		if (IS_ERR(attr))
			continue;

		rcu_read_lock();
		if ((!smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE) ||
		    (smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP &&
		     !(ipv6_addr_type((const struct in6_addr *)&attr->gid)
				     & IPV6_ADDR_LINKLOCAL)))
			if (!memcmp(gid, &attr->gid, SMC_GID_SIZE))
				rc = true;
		rcu_read_unlock();
		rdma_put_gid_attr(attr);
	}
	return rc;
}

/* check all links if the gid is still defined on smcibdev */
static void smc_ib_gid_check(struct smc_ib_device *smcibdev, u8 ibport)
{
	struct smc_link_group *lgr;
	int i;

	spin_lock_bh(&smc_lgr_list.lock);
	list_for_each_entry(lgr, &smc_lgr_list.list, list) {
		if (strncmp(smcibdev->pnetid[ibport - 1], lgr->pnet_id,
			    SMC_MAX_PNETID_LEN))
			continue; /* lgr is not affected */
		if (list_empty(&lgr->list))
			continue;
		for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
			if (lgr->lnk[i].state == SMC_LNK_UNUSED ||
			    lgr->lnk[i].smcibdev != smcibdev)
				continue;
			if (!smc_ib_check_link_gid(lgr->lnk[i].gid,
						   lgr->smc_version == SMC_V2,
						   smcibdev, ibport))
				smcr_port_err(smcibdev, ibport);
		}
	}
	spin_unlock_bh(&smc_lgr_list.lock);
}

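/* (re)query the attributes of an IB port, cache its MAC address and, if
 * no valid system identifier exists yet and the port is active, derive
 * the local system identifier from that MAC address
 */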
static int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport)
{
	int rc;

	memset(&smcibdev->pattr[ibport - 1], 0,
	       sizeof(smcibdev->pattr[ibport - 1]));
	rc = ib_query_port(smcibdev->ibdev, ibport,
			   &smcibdev->pattr[ibport - 1]);
	if (rc)
		goto out;
	/* the SMC protocol requires specification of the RoCE MAC address */
	rc = smc_ib_fill_mac(smcibdev, ibport);
	if (rc)
		goto out;
	if (!smc_ib_is_valid_local_systemid() &&
	    smc_ib_port_active(smcibdev, ibport))
		/* create unique system identifier */
		smc_ib_define_local_systemid(smcibdev, ibport);
out:
	return rc;
}

/* process context wrapper for might_sleep smc_ib_remember_port_attr */
static void smc_ib_port_event_work(struct work_struct *work)
{
	struct smc_ib_device *smcibdev = container_of(
		work, struct smc_ib_device, port_event_work);
	u8 port_idx;

	for_each_set_bit(port_idx, &smcibdev->port_event_mask, SMC_MAX_PORTS) {
		smc_ib_remember_port_attr(smcibdev, port_idx + 1);
		clear_bit(port_idx, &smcibdev->port_event_mask);
		if (!smc_ib_port_active(smcibdev, port_idx + 1)) {
			set_bit(port_idx, smcibdev->ports_going_away);
			smcr_port_err(smcibdev, port_idx + 1);
		} else {
			clear_bit(port_idx, smcibdev->ports_going_away);
			smcr_port_add(smcibdev, port_idx + 1);
			smc_ib_gid_check(smcibdev, port_idx + 1);
		}
	}
}

/* can be called in IRQ context */
static void smc_ib_global_event_handler(struct ib_event_handler *handler,
					struct ib_event *ibevent)
{
	struct smc_ib_device *smcibdev;
	bool schedule = false;
	u8 port_idx;

	smcibdev = container_of(handler, struct smc_ib_device, event_handler);

	switch (ibevent->event) {
	case IB_EVENT_DEVICE_FATAL:
		/* terminate all ports on device */
		for (port_idx = 0; port_idx < SMC_MAX_PORTS; port_idx++) {
			set_bit(port_idx, &smcibdev->port_event_mask);
			if (!test_and_set_bit(port_idx,
					      smcibdev->ports_going_away))
				schedule = true;
		}
		if (schedule)
			schedule_work(&smcibdev->port_event_work);
		break;
	case IB_EVENT_PORT_ACTIVE:
		port_idx = ibevent->element.port_num - 1;
		if (port_idx >= SMC_MAX_PORTS)
			break;
		set_bit(port_idx, &smcibdev->port_event_mask);
		if (test_and_clear_bit(port_idx, smcibdev->ports_going_away))
			schedule_work(&smcibdev->port_event_work);
		break;
	case IB_EVENT_PORT_ERR:
		port_idx = ibevent->element.port_num - 1;
		if (port_idx >= SMC_MAX_PORTS)
			break;
		set_bit(port_idx, &smcibdev->port_event_mask);
		if (!test_and_set_bit(port_idx, smcibdev->ports_going_away))
			schedule_work(&smcibdev->port_event_work);
		break;
	case IB_EVENT_GID_CHANGE:
		port_idx = ibevent->element.port_num - 1;
		if (port_idx >= SMC_MAX_PORTS)
			break;
		set_bit(port_idx, &smcibdev->port_event_mask);
		schedule_work(&smcibdev->port_event_work);
		break;
	default:
		break;
	}
}

void smc_ib_dealloc_protection_domain(struct smc_link *lnk)
{
	if (lnk->roce_pd)
		ib_dealloc_pd(lnk->roce_pd);
	lnk->roce_pd = NULL;
}

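/* allocate the protection domain used by the QP and memory regions of a link */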
int smc_ib_create_protection_domain(struct smc_link *lnk)
{
	int rc;

	lnk->roce_pd = ib_alloc_pd(lnk->smcibdev->ibdev, 0);
	rc = PTR_ERR_OR_ZERO(lnk->roce_pd);
	if (IS_ERR(lnk->roce_pd))
		lnk->roce_pd = NULL;
	return rc;
}

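/* for the netlink diagnostic interface: a device counts as critical if any
 * SMC-R link group uses one of its links while the group has no redundant
 * local path (type SINGLE or ASYMMETRIC_LOCAL)
 */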
static bool smcr_diag_is_dev_critical(struct smc_lgr_list *smc_lgr,
				      struct smc_ib_device *smcibdev)
{
	struct smc_link_group *lgr;
	bool rc = false;
	int i;

	spin_lock_bh(&smc_lgr->lock);
	list_for_each_entry(lgr, &smc_lgr->list, list) {
		if (lgr->is_smcd)
			continue;
		for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
			if (lgr->lnk[i].state == SMC_LNK_UNUSED ||
			    lgr->lnk[i].smcibdev != smcibdev)
				continue;
			if (lgr->type == SMC_LGR_SINGLE ||
			    lgr->type == SMC_LGR_ASYMMETRIC_LOCAL) {
				rc = true;
				goto out;
			}
		}
	}
out:
	spin_unlock_bh(&smc_lgr->lock);
	return rc;
}

static int smc_nl_handle_dev_port(struct sk_buff *skb,
				  struct ib_device *ibdev,
				  struct smc_ib_device *smcibdev,
				  int port)
{
	char smc_pnet[SMC_MAX_PNETID_LEN + 1];
	struct nlattr *port_attrs;
	unsigned char port_state;
	int lnk_count = 0;

	port_attrs = nla_nest_start(skb, SMC_NLA_DEV_PORT + port);
	if (!port_attrs)
		goto errout;

	if (nla_put_u8(skb, SMC_NLA_DEV_PORT_PNET_USR,
		       smcibdev->pnetid_by_user[port]))
		goto errattr;
	memcpy(smc_pnet, &smcibdev->pnetid[port], SMC_MAX_PNETID_LEN);
	smc_pnet[SMC_MAX_PNETID_LEN] = 0;
	if (nla_put_string(skb, SMC_NLA_DEV_PORT_PNETID, smc_pnet))
		goto errattr;
	if (nla_put_u32(skb, SMC_NLA_DEV_PORT_NETDEV,
			smcibdev->ndev_ifidx[port]))
		goto errattr;
	if (nla_put_u8(skb, SMC_NLA_DEV_PORT_VALID, 1))
		goto errattr;
	port_state = smc_ib_port_active(smcibdev, port + 1);
	if (nla_put_u8(skb, SMC_NLA_DEV_PORT_STATE, port_state))
		goto errattr;
	lnk_count = atomic_read(&smcibdev->lnk_cnt_by_port[port]);
	if (nla_put_u32(skb, SMC_NLA_DEV_PORT_LNK_CNT, lnk_count))
		goto errattr;
	nla_nest_end(skb, port_attrs);
	return 0;
errattr:
	nla_nest_cancel(skb, port_attrs);
errout:
	return -EMSGSIZE;
}

static bool smc_nl_handle_pci_values(const struct smc_pci_dev *smc_pci_dev,
				     struct sk_buff *skb)
{
	if (nla_put_u32(skb, SMC_NLA_DEV_PCI_FID, smc_pci_dev->pci_fid))
		return false;
	if (nla_put_u16(skb, SMC_NLA_DEV_PCI_CHID, smc_pci_dev->pci_pchid))
		return false;
	if (nla_put_u16(skb, SMC_NLA_DEV_PCI_VENDOR, smc_pci_dev->pci_vendor))
		return false;
	if (nla_put_u16(skb, SMC_NLA_DEV_PCI_DEVICE, smc_pci_dev->pci_device))
		return false;
	if (nla_put_string(skb, SMC_NLA_DEV_PCI_ID, smc_pci_dev->pci_id))
		return false;
	return true;
}

static int smc_nl_handle_smcr_dev(struct smc_ib_device *smcibdev,
				  struct sk_buff *skb,
				  struct netlink_callback *cb)
{
	char smc_ibname[IB_DEVICE_NAME_MAX];
	struct smc_pci_dev smc_pci_dev;
	struct pci_dev *pci_dev;
	unsigned char is_crit;
	struct nlattr *attrs;
	void *nlh;
	int i;

	nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
			  &smc_gen_nl_family, NLM_F_MULTI,
			  SMC_NETLINK_GET_DEV_SMCR);
	if (!nlh)
		goto errmsg;
	attrs = nla_nest_start(skb, SMC_GEN_DEV_SMCR);
	if (!attrs)
		goto errout;
	is_crit = smcr_diag_is_dev_critical(&smc_lgr_list, smcibdev);
	if (nla_put_u8(skb, SMC_NLA_DEV_IS_CRIT, is_crit))
		goto errattr;
	if (smcibdev->ibdev->dev.parent) {
		memset(&smc_pci_dev, 0, sizeof(smc_pci_dev));
		pci_dev = to_pci_dev(smcibdev->ibdev->dev.parent);
		smc_set_pci_values(pci_dev, &smc_pci_dev);
		if (!smc_nl_handle_pci_values(&smc_pci_dev, skb))
			goto errattr;
	}
	snprintf(smc_ibname, sizeof(smc_ibname), "%s", smcibdev->ibdev->name);
	if (nla_put_string(skb, SMC_NLA_DEV_IB_NAME, smc_ibname))
		goto errattr;
	for (i = 1; i <= SMC_MAX_PORTS; i++) {
		if (!rdma_is_port_valid(smcibdev->ibdev, i))
			continue;
		if (smc_nl_handle_dev_port(skb, smcibdev->ibdev,
					   smcibdev, i - 1))
			goto errattr;
	}

	nla_nest_end(skb, attrs);
	genlmsg_end(skb, nlh);
	return 0;

errattr:
	nla_nest_cancel(skb, attrs);
errout:
	genlmsg_cancel(skb, nlh);
errmsg:
	return -EMSGSIZE;
}

static void smc_nl_prep_smcr_dev(struct smc_ib_devices *dev_list,
				 struct sk_buff *skb,
				 struct netlink_callback *cb)
{
	struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb);
	struct smc_ib_device *smcibdev;
	int snum = cb_ctx->pos[0];
	int num = 0;

	mutex_lock(&dev_list->mutex);
	list_for_each_entry(smcibdev, &dev_list->list, list) {
		if (num < snum)
			goto next;
		if (smc_nl_handle_smcr_dev(smcibdev, skb, cb))
			goto errout;
next:
		num++;
	}
errout:
	mutex_unlock(&dev_list->mutex);
	cb_ctx->pos[0] = num;
}

int smcr_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb)
{
	smc_nl_prep_smcr_dev(&smc_ib_devices, skb, cb);
	return skb->len;
}

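/* handle fatal QP events: flag the affected port and let the port event
 * worker deal with the failing links
 */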
static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv)
{
	struct smc_link *lnk = (struct smc_link *)priv;
	struct smc_ib_device *smcibdev = lnk->smcibdev;
	u8 port_idx;

	switch (ibevent->event) {
	case IB_EVENT_QP_FATAL:
	case IB_EVENT_QP_ACCESS_ERR:
		port_idx = ibevent->element.qp->port - 1;
		if (port_idx >= SMC_MAX_PORTS)
			break;
		set_bit(port_idx, &smcibdev->port_event_mask);
		if (!test_and_set_bit(port_idx, smcibdev->ports_going_away))
			schedule_work(&smcibdev->port_event_work);
		break;
	default:
		break;
	}
}

void smc_ib_destroy_queue_pair(struct smc_link *lnk)
{
	if (lnk->roce_qp)
		ib_destroy_qp(lnk->roce_qp);
	lnk->roce_qp = NULL;
}

/* create a queue pair within the protection domain for a link */
int smc_ib_create_queue_pair(struct smc_link *lnk)
{
	int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1;
	struct ib_qp_init_attr qp_attr = {
		.event_handler = smc_ib_qp_event_handler,
		.qp_context = lnk,
		.send_cq = lnk->smcibdev->roce_cq_send,
		.recv_cq = lnk->smcibdev->roce_cq_recv,
		.srq = NULL,
		.cap = {
				/* include unsolicited rdma_writes as well,
				 * there are max. 2 RDMA_WRITE per 1 WR_SEND
				 */
			.max_send_wr = SMC_WR_BUF_CNT * 3,
			.max_recv_wr = SMC_WR_BUF_CNT * 3,
			.max_send_sge = SMC_IB_MAX_SEND_SGE,
			.max_recv_sge = sges_per_buf,
			.max_inline_data = 0,
		},
		.sq_sig_type = IB_SIGNAL_REQ_WR,
		.qp_type = IB_QPT_RC,
	};
	int rc;

	lnk->roce_qp = ib_create_qp(lnk->roce_pd, &qp_attr);
	rc = PTR_ERR_OR_ZERO(lnk->roce_qp);
	if (IS_ERR(lnk->roce_qp))
		lnk->roce_qp = NULL;
	else
		smc_wr_remember_qp_attr(lnk);
	return rc;
}

void smc_ib_put_memory_region(struct ib_mr *mr)
{
	ib_dereg_mr(mr);
}

static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot, u8 link_idx)
{
	unsigned int offset = 0;
	int sg_num;

	/* map the largest prefix of a dma mapped SG list */
	sg_num = ib_map_mr_sg(buf_slot->mr_rx[link_idx],
			      buf_slot->sgt[link_idx].sgl,
			      buf_slot->sgt[link_idx].orig_nents,
			      &offset, PAGE_SIZE);

	return sg_num;
}

/* Allocate a memory region and map the dma mapped SG list of buf_slot */
int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags,
			     struct smc_buf_desc *buf_slot, u8 link_idx)
{
	if (buf_slot->mr_rx[link_idx])
		return 0; /* already done */

	buf_slot->mr_rx[link_idx] =
		ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, 1 << buf_slot->order);
	if (IS_ERR(buf_slot->mr_rx[link_idx])) {
		int rc;

		rc = PTR_ERR(buf_slot->mr_rx[link_idx]);
		buf_slot->mr_rx[link_idx] = NULL;
		return rc;
	}

	if (smc_ib_map_mr_sg(buf_slot, link_idx) != 1)
		return -EINVAL;

	return 0;
}

/* synchronize buffer usage for cpu access */
void smc_ib_sync_sg_for_cpu(struct smc_link *lnk,
			    struct smc_buf_desc *buf_slot,
			    enum dma_data_direction data_direction)
{
	struct scatterlist *sg;
	unsigned int i;

	/* for now there is just one DMA address */
	for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg,
		    buf_slot->sgt[lnk->link_idx].nents, i) {
		if (!sg_dma_len(sg))
			break;
		ib_dma_sync_single_for_cpu(lnk->smcibdev->ibdev,
					   sg_dma_address(sg),
					   sg_dma_len(sg),
					   data_direction);
	}
}

/* synchronize buffer usage for device access */
void smc_ib_sync_sg_for_device(struct smc_link *lnk,
			       struct smc_buf_desc *buf_slot,
			       enum dma_data_direction data_direction)
{
	struct scatterlist *sg;
	unsigned int i;

	/* for now there is just one DMA address */
	for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg,
		    buf_slot->sgt[lnk->link_idx].nents, i) {
		if (!sg_dma_len(sg))
			break;
		ib_dma_sync_single_for_device(lnk->smcibdev->ibdev,
					      sg_dma_address(sg),
					      sg_dma_len(sg),
					      data_direction);
	}
}

/* Map a new TX or RX buffer SG-table to DMA */
int smc_ib_buf_map_sg(struct smc_link *lnk,
		      struct smc_buf_desc *buf_slot,
		      enum dma_data_direction data_direction)
{
	int mapped_nents;

	mapped_nents = ib_dma_map_sg(lnk->smcibdev->ibdev,
				     buf_slot->sgt[lnk->link_idx].sgl,
				     buf_slot->sgt[lnk->link_idx].orig_nents,
				     data_direction);
	if (!mapped_nents)
		return -ENOMEM;

	return mapped_nents;
}

void smc_ib_buf_unmap_sg(struct smc_link *lnk,
			 struct smc_buf_desc *buf_slot,
			 enum dma_data_direction data_direction)
{
	if (!buf_slot->sgt[lnk->link_idx].sgl->dma_address)
		return; /* already unmapped */

	ib_dma_unmap_sg(lnk->smcibdev->ibdev,
			buf_slot->sgt[lnk->link_idx].sgl,
			buf_slot->sgt[lnk->link_idx].orig_nents,
			data_direction);
	buf_slot->sgt[lnk->link_idx].sgl->dma_address = 0;
}

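/* first-time setup of an IB device: create the send and receive completion
 * queues (capped so the number of CQ entries fits the mlx5 CQ allocation
 * scheme) and register the device with the work request layer
 */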
long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev)
{
	struct ib_cq_init_attr cqattr =	{
		.cqe = SMC_MAX_CQE, .comp_vector = 0 };
	int cqe_size_order, smc_order;
	long rc;

	mutex_lock(&smcibdev->mutex);
	rc = 0;
	if (smcibdev->initialized)
		goto out;
	/* the calculated number of cq entries fits to mlx5 cq allocation */
	cqe_size_order = cache_line_size() == 128 ? 7 : 6;
	smc_order = MAX_ORDER - cqe_size_order - 1;
	if (SMC_MAX_CQE + 2 > (0x00000001 << smc_order) * PAGE_SIZE)
		cqattr.cqe = (0x00000001 << smc_order) * PAGE_SIZE - 2;
	smcibdev->roce_cq_send = ib_create_cq(smcibdev->ibdev,
					      smc_wr_tx_cq_handler, NULL,
					      smcibdev, &cqattr);
	rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_send);
	if (IS_ERR(smcibdev->roce_cq_send)) {
		smcibdev->roce_cq_send = NULL;
		goto out;
	}
	smcibdev->roce_cq_recv = ib_create_cq(smcibdev->ibdev,
					      smc_wr_rx_cq_handler, NULL,
					      smcibdev, &cqattr);
	rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_recv);
	if (IS_ERR(smcibdev->roce_cq_recv)) {
		smcibdev->roce_cq_recv = NULL;
		goto err;
	}
	smc_wr_add_dev(smcibdev);
	smcibdev->initialized = 1;
	goto out;

err:
	ib_destroy_cq(smcibdev->roce_cq_send);
out:
	mutex_unlock(&smcibdev->mutex);
	return rc;
}

static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev)
{
	mutex_lock(&smcibdev->mutex);
	if (!smcibdev->initialized)
		goto out;
	smcibdev->initialized = 0;
	ib_destroy_cq(smcibdev->roce_cq_recv);
	ib_destroy_cq(smcibdev->roce_cq_send);
	smc_wr_remove_dev(smcibdev);
out:
	mutex_unlock(&smcibdev->mutex);
}

static struct ib_client smc_ib_client;

static void smc_copy_netdev_ifindex(struct smc_ib_device *smcibdev, int port)
{
	struct ib_device *ibdev = smcibdev->ibdev;
	struct net_device *ndev;

	if (!ibdev->ops.get_netdev)
		return;
	ndev = ibdev->ops.get_netdev(ibdev, port + 1);
	if (ndev) {
		smcibdev->ndev_ifidx[port] = ndev->ifindex;
		dev_put(ndev);
	}
}

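/* a net device was registered or unregistered: update the cached ifindex
 * of every IB port that maps to it
 */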
void smc_ib_ndev_change(struct net_device *ndev, unsigned long event)
{
	struct smc_ib_device *smcibdev;
	struct ib_device *libdev;
	struct net_device *lndev;
	u8 port_cnt;
	int i;

	mutex_lock(&smc_ib_devices.mutex);
	list_for_each_entry(smcibdev, &smc_ib_devices.list, list) {
		port_cnt = smcibdev->ibdev->phys_port_cnt;
		for (i = 0; i < min_t(size_t, port_cnt, SMC_MAX_PORTS); i++) {
			libdev = smcibdev->ibdev;
			if (!libdev->ops.get_netdev)
				continue;
			lndev = libdev->ops.get_netdev(libdev, i + 1);
			dev_put(lndev);
			if (lndev != ndev)
				continue;
			if (event == NETDEV_REGISTER)
				smcibdev->ndev_ifidx[i] = ndev->ifindex;
			if (event == NETDEV_UNREGISTER)
				smcibdev->ndev_ifidx[i] = 0;
		}
	}
	mutex_unlock(&smc_ib_devices.mutex);
}

/* callback function for ib_register_client() */
static int smc_ib_add_dev(struct ib_device *ibdev)
{
	struct smc_ib_device *smcibdev;
	u8 port_cnt;
	int i;

	if (ibdev->node_type != RDMA_NODE_IB_CA)
		return -EOPNOTSUPP;

	smcibdev = kzalloc(sizeof(*smcibdev), GFP_KERNEL);
	if (!smcibdev)
		return -ENOMEM;

	smcibdev->ibdev = ibdev;
	INIT_WORK(&smcibdev->port_event_work, smc_ib_port_event_work);
	atomic_set(&smcibdev->lnk_cnt, 0);
	init_waitqueue_head(&smcibdev->lnks_deleted);
	mutex_init(&smcibdev->mutex);
	mutex_lock(&smc_ib_devices.mutex);
	list_add_tail(&smcibdev->list, &smc_ib_devices.list);
	mutex_unlock(&smc_ib_devices.mutex);
	ib_set_client_data(ibdev, &smc_ib_client, smcibdev);
	INIT_IB_EVENT_HANDLER(&smcibdev->event_handler, smcibdev->ibdev,
			      smc_ib_global_event_handler);
	ib_register_event_handler(&smcibdev->event_handler);

	/* trigger reading of the port attributes */
	port_cnt = smcibdev->ibdev->phys_port_cnt;
	pr_warn_ratelimited("smc: adding ib device %s with port count %d\n",
			    smcibdev->ibdev->name, port_cnt);
	for (i = 0;
	     i < min_t(size_t, port_cnt, SMC_MAX_PORTS);
	     i++) {
		set_bit(i, &smcibdev->port_event_mask);
		/* determine pnetids of the port */
		if (smc_pnetid_by_dev_port(ibdev->dev.parent, i,
					   smcibdev->pnetid[i]))
			smc_pnetid_by_table_ib(smcibdev, i + 1);
		smc_copy_netdev_ifindex(smcibdev, i);
		pr_warn_ratelimited("smc:    ib device %s port %d has pnetid "
				    "%.16s%s\n",
				    smcibdev->ibdev->name, i + 1,
				    smcibdev->pnetid[i],
				    smcibdev->pnetid_by_user[i] ?
				     " (user defined)" :
				     "");
	}
	schedule_work(&smcibdev->port_event_work);
	return 0;
}

/* callback function for ib_unregister_client() */
static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data)
{
	struct smc_ib_device *smcibdev = client_data;

	mutex_lock(&smc_ib_devices.mutex);
	list_del_init(&smcibdev->list); /* remove from smc_ib_devices */
	mutex_unlock(&smc_ib_devices.mutex);
	pr_warn_ratelimited("smc: removing ib device %s\n",
			    smcibdev->ibdev->name);
	smc_smcr_terminate_all(smcibdev);
	smc_ib_cleanup_per_ibdev(smcibdev);
	ib_unregister_event_handler(&smcibdev->event_handler);
	cancel_work_sync(&smcibdev->port_event_work);
	kfree(smcibdev);
}

static struct ib_client smc_ib_client = {
	.name	= "smc_ib",
	.add	= smc_ib_add_dev,
	.remove = smc_ib_remove_dev,
};

int __init smc_ib_register_client(void)
{
	smc_ib_init_local_systemid();
	return ib_register_client(&smc_ib_client);
}

void smc_ib_unregister_client(void)
{
	ib_unregister_client(&smc_ib_client);
}