cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

af_smc.c (89274B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 *  Shared Memory Communications over RDMA (SMC-R) and RoCE
      4 *
      5 *  AF_SMC protocol family socket handler keeping the AF_INET sock address type
      6 *  applies to SOCK_STREAM sockets only
      7 *  offers an alternative communication option for TCP-protocol sockets
      8 *  applicable with RoCE-cards only
      9 *
     10 *  Initial restrictions:
     11 *    - support for alternate links postponed
     12 *
     13 *  Copyright IBM Corp. 2016, 2018
     14 *
     15 *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
     16 *              based on prototype from Frank Blaschka
     17 */
     18
     19#define KMSG_COMPONENT "smc"
     20#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
     21
     22#include <linux/module.h>
     23#include <linux/socket.h>
     24#include <linux/workqueue.h>
     25#include <linux/in.h>
     26#include <linux/sched/signal.h>
     27#include <linux/if_vlan.h>
     28#include <linux/rcupdate_wait.h>
     29#include <linux/ctype.h>
     30
     31#include <net/sock.h>
     32#include <net/tcp.h>
     33#include <net/smc.h>
     34#include <asm/ioctls.h>
     35
     36#include <net/net_namespace.h>
     37#include <net/netns/generic.h>
     38#include "smc_netns.h"
     39
     40#include "smc.h"
     41#include "smc_clc.h"
     42#include "smc_llc.h"
     43#include "smc_cdc.h"
     44#include "smc_core.h"
     45#include "smc_ib.h"
     46#include "smc_ism.h"
     47#include "smc_pnet.h"
     48#include "smc_netlink.h"
     49#include "smc_tx.h"
     50#include "smc_rx.h"
     51#include "smc_close.h"
     52#include "smc_stats.h"
     53#include "smc_tracepoint.h"
     54#include "smc_sysctl.h"
     55
     /* serialize link group creation on server */
     static DEFINE_MUTEX(smc_server_lgr_pending);
     /* serialize link group creation on client */
     static DEFINE_MUTEX(smc_client_lgr_pending);

     /* wq for tcp listen work */
     static struct workqueue_struct *smc_tcp_ls_wq;
     /* wq for handshake work */
     struct workqueue_struct *smc_hs_wq;
     /* wq for close work */
     struct workqueue_struct *smc_close_wq;

     /* work handlers that smc_sock_alloc() attaches to every new smc sock */
     static void smc_tcp_listen_work(struct work_struct *work);
     static void smc_connect_work(struct work_struct *work);
     69
     70int smc_nl_dump_hs_limitation(struct sk_buff *skb, struct netlink_callback *cb)
     71{
     72	struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb);
     73	void *hdr;
     74
     75	if (cb_ctx->pos[0])
     76		goto out;
     77
     78	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
     79			  &smc_gen_nl_family, NLM_F_MULTI,
     80			  SMC_NETLINK_DUMP_HS_LIMITATION);
     81	if (!hdr)
     82		return -ENOMEM;
     83
     84	if (nla_put_u8(skb, SMC_NLA_HS_LIMITATION_ENABLED,
     85		       sock_net(skb->sk)->smc.limit_smc_hs))
     86		goto err;
     87
     88	genlmsg_end(skb, hdr);
     89	cb_ctx->pos[0] = 1;
     90out:
     91	return skb->len;
     92err:
     93	genlmsg_cancel(skb, hdr);
     94	return -EMSGSIZE;
     95}
     96
     97int smc_nl_enable_hs_limitation(struct sk_buff *skb, struct genl_info *info)
     98{
     99	sock_net(skb->sk)->smc.limit_smc_hs = true;
    100	return 0;
    101}
    102
    103int smc_nl_disable_hs_limitation(struct sk_buff *skb, struct genl_info *info)
    104{
    105	sock_net(skb->sk)->smc.limit_smc_hs = false;
    106	return 0;
    107}
    108
    109static void smc_set_keepalive(struct sock *sk, int val)
    110{
    111	struct smc_sock *smc = smc_sk(sk);
    112
    113	smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
    114}
    115
    116static struct sock *smc_tcp_syn_recv_sock(const struct sock *sk,
    117					  struct sk_buff *skb,
    118					  struct request_sock *req,
    119					  struct dst_entry *dst,
    120					  struct request_sock *req_unhash,
    121					  bool *own_req)
    122{
    123	struct smc_sock *smc;
    124	struct sock *child;
    125
    126	smc = smc_clcsock_user_data(sk);
    127
    128	if (READ_ONCE(sk->sk_ack_backlog) + atomic_read(&smc->queued_smc_hs) >
    129				sk->sk_max_ack_backlog)
    130		goto drop;
    131
    132	if (sk_acceptq_is_full(&smc->sk)) {
    133		NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
    134		goto drop;
    135	}
    136
    137	/* passthrough to original syn recv sock fct */
    138	child = smc->ori_af_ops->syn_recv_sock(sk, skb, req, dst, req_unhash,
    139					       own_req);
    140	/* child must not inherit smc or its ops */
    141	if (child) {
    142		rcu_assign_sk_user_data(child, NULL);
    143
    144		/* v4-mapped sockets don't inherit parent ops. Don't restore. */
    145		if (inet_csk(child)->icsk_af_ops == inet_csk(sk)->icsk_af_ops)
    146			inet_csk(child)->icsk_af_ops = smc->ori_af_ops;
    147	}
    148	return child;
    149
    150drop:
    151	dst_release(dst);
    152	tcp_listendrop(sk);
    153	return NULL;
    154}
    155
    156static bool smc_hs_congested(const struct sock *sk)
    157{
    158	const struct smc_sock *smc;
    159
    160	smc = smc_clcsock_user_data(sk);
    161
    162	if (!smc)
    163		return true;
    164
    165	if (workqueue_congested(WORK_CPU_UNBOUND, smc_hs_wq))
    166		return true;
    167
    168	return false;
    169}
    170
    171static struct smc_hashinfo smc_v4_hashinfo = {
    172	.lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
    173};
    174
    175static struct smc_hashinfo smc_v6_hashinfo = {
    176	.lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock),
    177};
    178
    179int smc_hash_sk(struct sock *sk)
    180{
    181	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
    182	struct hlist_head *head;
    183
    184	head = &h->ht;
    185
    186	write_lock_bh(&h->lock);
    187	sk_add_node(sk, head);
    188	write_unlock_bh(&h->lock);
    189	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
    190
    191	return 0;
    192}
    193EXPORT_SYMBOL_GPL(smc_hash_sk);
    194
    195void smc_unhash_sk(struct sock *sk)
    196{
    197	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
    198
    199	write_lock_bh(&h->lock);
    200	if (sk_del_node_init(sk))
    201		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
    202	write_unlock_bh(&h->lock);
    203}
    204EXPORT_SYMBOL_GPL(smc_unhash_sk);
    205
    206/* This will be called before user really release sock_lock. So do the
    207 * work which we didn't do because of user hold the sock_lock in the
    208 * BH context
    209 */
    210static void smc_release_cb(struct sock *sk)
    211{
    212	struct smc_sock *smc = smc_sk(sk);
    213
    214	if (smc->conn.tx_in_release_sock) {
    215		smc_tx_pending(&smc->conn);
    216		smc->conn.tx_in_release_sock = false;
    217	}
    218}
    219
    220struct proto smc_proto = {
    221	.name		= "SMC",
    222	.owner		= THIS_MODULE,
    223	.keepalive	= smc_set_keepalive,
    224	.hash		= smc_hash_sk,
    225	.unhash		= smc_unhash_sk,
    226	.release_cb	= smc_release_cb,
    227	.obj_size	= sizeof(struct smc_sock),
    228	.h.smc_hash	= &smc_v4_hashinfo,
    229	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
    230};
    231EXPORT_SYMBOL_GPL(smc_proto);
    232
    233struct proto smc_proto6 = {
    234	.name		= "SMC6",
    235	.owner		= THIS_MODULE,
    236	.keepalive	= smc_set_keepalive,
    237	.hash		= smc_hash_sk,
    238	.unhash		= smc_unhash_sk,
    239	.release_cb	= smc_release_cb,
    240	.obj_size	= sizeof(struct smc_sock),
    241	.h.smc_hash	= &smc_v6_hashinfo,
    242	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
    243};
    244EXPORT_SYMBOL_GPL(smc_proto6);
    245
    246static void smc_fback_restore_callbacks(struct smc_sock *smc)
    247{
    248	struct sock *clcsk = smc->clcsock->sk;
    249
    250	write_lock_bh(&clcsk->sk_callback_lock);
    251	clcsk->sk_user_data = NULL;
    252
    253	smc_clcsock_restore_cb(&clcsk->sk_state_change, &smc->clcsk_state_change);
    254	smc_clcsock_restore_cb(&clcsk->sk_data_ready, &smc->clcsk_data_ready);
    255	smc_clcsock_restore_cb(&clcsk->sk_write_space, &smc->clcsk_write_space);
    256	smc_clcsock_restore_cb(&clcsk->sk_error_report, &smc->clcsk_error_report);
    257
    258	write_unlock_bh(&clcsk->sk_callback_lock);
    259}
    260
    261static void smc_restore_fallback_changes(struct smc_sock *smc)
    262{
    263	if (smc->clcsock->file) { /* non-accepted sockets have no file yet */
    264		smc->clcsock->file->private_data = smc->sk.sk_socket;
    265		smc->clcsock->file = NULL;
    266		smc_fback_restore_callbacks(smc);
    267	}
    268}
    269
    270static int __smc_release(struct smc_sock *smc)
    271{
    272	struct sock *sk = &smc->sk;
    273	int rc = 0;
    274
    275	if (!smc->use_fallback) {
    276		rc = smc_close_active(smc);
    277		sock_set_flag(sk, SOCK_DEAD);
    278		sk->sk_shutdown |= SHUTDOWN_MASK;
    279	} else {
    280		if (sk->sk_state != SMC_CLOSED) {
    281			if (sk->sk_state != SMC_LISTEN &&
    282			    sk->sk_state != SMC_INIT)
    283				sock_put(sk); /* passive closing */
    284			if (sk->sk_state == SMC_LISTEN) {
    285				/* wake up clcsock accept */
    286				rc = kernel_sock_shutdown(smc->clcsock,
    287							  SHUT_RDWR);
    288			}
    289			sk->sk_state = SMC_CLOSED;
    290			sk->sk_state_change(sk);
    291		}
    292		smc_restore_fallback_changes(smc);
    293	}
    294
    295	sk->sk_prot->unhash(sk);
    296
    297	if (sk->sk_state == SMC_CLOSED) {
    298		if (smc->clcsock) {
    299			release_sock(sk);
    300			smc_clcsock_release(smc);
    301			lock_sock(sk);
    302		}
    303		if (!smc->use_fallback)
    304			smc_conn_free(&smc->conn);
    305	}
    306
    307	return rc;
    308}
    309
    310static int smc_release(struct socket *sock)
    311{
    312	struct sock *sk = sock->sk;
    313	struct smc_sock *smc;
    314	int old_state, rc = 0;
    315
    316	if (!sk)
    317		goto out;
    318
    319	sock_hold(sk); /* sock_put below */
    320	smc = smc_sk(sk);
    321
    322	old_state = sk->sk_state;
    323
    324	/* cleanup for a dangling non-blocking connect */
    325	if (smc->connect_nonblock && old_state == SMC_INIT)
    326		tcp_abort(smc->clcsock->sk, ECONNABORTED);
    327
    328	if (cancel_work_sync(&smc->connect_work))
    329		sock_put(&smc->sk); /* sock_hold in smc_connect for passive closing */
    330
    331	if (sk->sk_state == SMC_LISTEN)
    332		/* smc_close_non_accepted() is called and acquires
    333		 * sock lock for child sockets again
    334		 */
    335		lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
    336	else
    337		lock_sock(sk);
    338
    339	if (old_state == SMC_INIT && sk->sk_state == SMC_ACTIVE &&
    340	    !smc->use_fallback)
    341		smc_close_active_abort(smc);
    342
    343	rc = __smc_release(smc);
    344
    345	/* detach socket */
    346	sock_orphan(sk);
    347	sock->sk = NULL;
    348	release_sock(sk);
    349
    350	sock_put(sk); /* sock_hold above */
    351	sock_put(sk); /* final sock_put */
    352out:
    353	return rc;
    354}
    355
    356static void smc_destruct(struct sock *sk)
    357{
    358	if (sk->sk_state != SMC_CLOSED)
    359		return;
    360	if (!sock_flag(sk, SOCK_DEAD))
    361		return;
    362
    363	sk_refcnt_debug_dec(sk);
    364}
    365
    366static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
    367				   int protocol)
    368{
    369	struct smc_sock *smc;
    370	struct proto *prot;
    371	struct sock *sk;
    372
    373	prot = (protocol == SMCPROTO_SMC6) ? &smc_proto6 : &smc_proto;
    374	sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0);
    375	if (!sk)
    376		return NULL;
    377
    378	sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
    379	sk->sk_state = SMC_INIT;
    380	sk->sk_destruct = smc_destruct;
    381	sk->sk_protocol = protocol;
    382	smc = smc_sk(sk);
    383	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
    384	INIT_WORK(&smc->connect_work, smc_connect_work);
    385	INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work);
    386	INIT_LIST_HEAD(&smc->accept_q);
    387	spin_lock_init(&smc->accept_q_lock);
    388	spin_lock_init(&smc->conn.send_lock);
    389	sk->sk_prot->hash(sk);
    390	sk_refcnt_debug_inc(sk);
    391	mutex_init(&smc->clcsock_release_lock);
    392	smc_init_saved_callbacks(smc);
    393
    394	return sk;
    395}
    396
    397static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
    398		    int addr_len)
    399{
    400	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
    401	struct sock *sk = sock->sk;
    402	struct smc_sock *smc;
    403	int rc;
    404
    405	smc = smc_sk(sk);
    406
    407	/* replicate tests from inet_bind(), to be safe wrt. future changes */
    408	rc = -EINVAL;
    409	if (addr_len < sizeof(struct sockaddr_in))
    410		goto out;
    411
    412	rc = -EAFNOSUPPORT;
    413	if (addr->sin_family != AF_INET &&
    414	    addr->sin_family != AF_INET6 &&
    415	    addr->sin_family != AF_UNSPEC)
    416		goto out;
    417	/* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
    418	if (addr->sin_family == AF_UNSPEC &&
    419	    addr->sin_addr.s_addr != htonl(INADDR_ANY))
    420		goto out;
    421
    422	lock_sock(sk);
    423
    424	/* Check if socket is already active */
    425	rc = -EINVAL;
    426	if (sk->sk_state != SMC_INIT || smc->connect_nonblock)
    427		goto out_rel;
    428
    429	smc->clcsock->sk->sk_reuse = sk->sk_reuse;
    430	rc = kernel_bind(smc->clcsock, uaddr, addr_len);
    431
    432out_rel:
    433	release_sock(sk);
    434out:
    435	return rc;
    436}
    437
    438static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
    439				   unsigned long mask)
    440{
    441	/* options we don't get control via setsockopt for */
    442	nsk->sk_type = osk->sk_type;
    443	nsk->sk_sndbuf = osk->sk_sndbuf;
    444	nsk->sk_rcvbuf = osk->sk_rcvbuf;
    445	nsk->sk_sndtimeo = osk->sk_sndtimeo;
    446	nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
    447	nsk->sk_mark = osk->sk_mark;
    448	nsk->sk_priority = osk->sk_priority;
    449	nsk->sk_rcvlowat = osk->sk_rcvlowat;
    450	nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
    451	nsk->sk_err = osk->sk_err;
    452
    453	nsk->sk_flags &= ~mask;
    454	nsk->sk_flags |= osk->sk_flags & mask;
    455}
    456
    457#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
    458			     (1UL << SOCK_KEEPOPEN) | \
    459			     (1UL << SOCK_LINGER) | \
    460			     (1UL << SOCK_BROADCAST) | \
    461			     (1UL << SOCK_TIMESTAMP) | \
    462			     (1UL << SOCK_DBG) | \
    463			     (1UL << SOCK_RCVTSTAMP) | \
    464			     (1UL << SOCK_RCVTSTAMPNS) | \
    465			     (1UL << SOCK_LOCALROUTE) | \
    466			     (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
    467			     (1UL << SOCK_RXQ_OVFL) | \
    468			     (1UL << SOCK_WIFI_STATUS) | \
    469			     (1UL << SOCK_NOFCS) | \
    470			     (1UL << SOCK_FILTER_LOCKED) | \
    471			     (1UL << SOCK_TSTAMP_NEW))
    472/* copy only relevant settings and flags of SOL_SOCKET level from smc to
    473 * clc socket (since smc is not called for these options from net/core)
    474 */
    475static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
    476{
    477	smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
    478}
    479
    480#define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
    481			     (1UL << SOCK_KEEPOPEN) | \
    482			     (1UL << SOCK_LINGER) | \
    483			     (1UL << SOCK_DBG))
    484/* copy only settings and flags relevant for smc from clc to smc socket */
    485static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
    486{
    487	smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
    488}
    489
    490/* register the new rmb on all links */
    491static int smcr_lgr_reg_rmbs(struct smc_link *link,
    492			     struct smc_buf_desc *rmb_desc)
    493{
    494	struct smc_link_group *lgr = link->lgr;
    495	int i, rc = 0;
    496
    497	rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY);
    498	if (rc)
    499		return rc;
    500	/* protect against parallel smc_llc_cli_rkey_exchange() and
    501	 * parallel smcr_link_reg_rmb()
    502	 */
    503	mutex_lock(&lgr->llc_conf_mutex);
    504	for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
    505		if (!smc_link_active(&lgr->lnk[i]))
    506			continue;
    507		rc = smcr_link_reg_rmb(&lgr->lnk[i], rmb_desc);
    508		if (rc)
    509			goto out;
    510	}
    511
    512	/* exchange confirm_rkey msg with peer */
    513	rc = smc_llc_do_confirm_rkey(link, rmb_desc);
    514	if (rc) {
    515		rc = -EFAULT;
    516		goto out;
    517	}
    518	rmb_desc->is_conf_rkey = true;
    519out:
    520	mutex_unlock(&lgr->llc_conf_mutex);
    521	smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl);
    522	return rc;
    523}
    524
    525static int smcr_clnt_conf_first_link(struct smc_sock *smc)
    526{
    527	struct smc_link *link = smc->conn.lnk;
    528	struct smc_llc_qentry *qentry;
    529	int rc;
    530
    531	/* receive CONFIRM LINK request from server over RoCE fabric */
    532	qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME,
    533			      SMC_LLC_CONFIRM_LINK);
    534	if (!qentry) {
    535		struct smc_clc_msg_decline dclc;
    536
    537		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
    538				      SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
    539		return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
    540	}
    541	smc_llc_save_peer_uid(qentry);
    542	rc = smc_llc_eval_conf_link(qentry, SMC_LLC_REQ);
    543	smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl);
    544	if (rc)
    545		return SMC_CLC_DECL_RMBE_EC;
    546
    547	rc = smc_ib_modify_qp_rts(link);
    548	if (rc)
    549		return SMC_CLC_DECL_ERR_RDYLNK;
    550
    551	smc_wr_remember_qp_attr(link);
    552
    553	if (smcr_link_reg_rmb(link, smc->conn.rmb_desc))
    554		return SMC_CLC_DECL_ERR_REGRMB;
    555
    556	/* confirm_rkey is implicit on 1st contact */
    557	smc->conn.rmb_desc->is_conf_rkey = true;
    558
    559	/* send CONFIRM LINK response over RoCE fabric */
    560	rc = smc_llc_send_confirm_link(link, SMC_LLC_RESP);
    561	if (rc < 0)
    562		return SMC_CLC_DECL_TIMEOUT_CL;
    563
    564	smc_llc_link_active(link);
    565	smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE);
    566
    567	/* optional 2nd link, receive ADD LINK request from server */
    568	qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME,
    569			      SMC_LLC_ADD_LINK);
    570	if (!qentry) {
    571		struct smc_clc_msg_decline dclc;
    572
    573		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
    574				      SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
    575		if (rc == -EAGAIN)
    576			rc = 0; /* no DECLINE received, go with one link */
    577		return rc;
    578	}
    579	smc_llc_flow_qentry_clr(&link->lgr->llc_flow_lcl);
    580	smc_llc_cli_add_link(link, qentry);
    581	return 0;
    582}
    583
    584static bool smc_isascii(char *hostname)
    585{
    586	int i;
    587
    588	for (i = 0; i < SMC_MAX_HOSTNAME_LEN; i++)
    589		if (!isascii(hostname[i]))
    590			return false;
    591	return true;
    592}
    593
    594static void smc_conn_save_peer_info_fce(struct smc_sock *smc,
    595					struct smc_clc_msg_accept_confirm *clc)
    596{
    597	struct smc_clc_msg_accept_confirm_v2 *clc_v2 =
    598		(struct smc_clc_msg_accept_confirm_v2 *)clc;
    599	struct smc_clc_first_contact_ext *fce;
    600	int clc_v2_len;
    601
    602	if (clc->hdr.version == SMC_V1 ||
    603	    !(clc->hdr.typev2 & SMC_FIRST_CONTACT_MASK))
    604		return;
    605
    606	if (smc->conn.lgr->is_smcd) {
    607		memcpy(smc->conn.lgr->negotiated_eid, clc_v2->d1.eid,
    608		       SMC_MAX_EID_LEN);
    609		clc_v2_len = offsetofend(struct smc_clc_msg_accept_confirm_v2,
    610					 d1);
    611	} else {
    612		memcpy(smc->conn.lgr->negotiated_eid, clc_v2->r1.eid,
    613		       SMC_MAX_EID_LEN);
    614		clc_v2_len = offsetofend(struct smc_clc_msg_accept_confirm_v2,
    615					 r1);
    616	}
    617	fce = (struct smc_clc_first_contact_ext *)(((u8 *)clc_v2) + clc_v2_len);
    618	smc->conn.lgr->peer_os = fce->os_type;
    619	smc->conn.lgr->peer_smc_release = fce->release;
    620	if (smc_isascii(fce->hostname))
    621		memcpy(smc->conn.lgr->peer_hostname, fce->hostname,
    622		       SMC_MAX_HOSTNAME_LEN);
    623}
    624
    625static void smcr_conn_save_peer_info(struct smc_sock *smc,
    626				     struct smc_clc_msg_accept_confirm *clc)
    627{
    628	int bufsize = smc_uncompress_bufsize(clc->r0.rmbe_size);
    629
    630	smc->conn.peer_rmbe_idx = clc->r0.rmbe_idx;
    631	smc->conn.local_tx_ctrl.token = ntohl(clc->r0.rmbe_alert_token);
    632	smc->conn.peer_rmbe_size = bufsize;
    633	atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
    634	smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1);
    635}
    636
    637static void smcd_conn_save_peer_info(struct smc_sock *smc,
    638				     struct smc_clc_msg_accept_confirm *clc)
    639{
    640	int bufsize = smc_uncompress_bufsize(clc->d0.dmbe_size);
    641
    642	smc->conn.peer_rmbe_idx = clc->d0.dmbe_idx;
    643	smc->conn.peer_token = clc->d0.token;
    644	/* msg header takes up space in the buffer */
    645	smc->conn.peer_rmbe_size = bufsize - sizeof(struct smcd_cdc_msg);
    646	atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
    647	smc->conn.tx_off = bufsize * smc->conn.peer_rmbe_idx;
    648}
    649
    650static void smc_conn_save_peer_info(struct smc_sock *smc,
    651				    struct smc_clc_msg_accept_confirm *clc)
    652{
    653	if (smc->conn.lgr->is_smcd)
    654		smcd_conn_save_peer_info(smc, clc);
    655	else
    656		smcr_conn_save_peer_info(smc, clc);
    657	smc_conn_save_peer_info_fce(smc, clc);
    658}
    659
    660static void smc_link_save_peer_info(struct smc_link *link,
    661				    struct smc_clc_msg_accept_confirm *clc,
    662				    struct smc_init_info *ini)
    663{
    664	link->peer_qpn = ntoh24(clc->r0.qpn);
    665	memcpy(link->peer_gid, ini->peer_gid, SMC_GID_SIZE);
    666	memcpy(link->peer_mac, ini->peer_mac, sizeof(link->peer_mac));
    667	link->peer_psn = ntoh24(clc->r0.psn);
    668	link->peer_mtu = clc->r0.qp_mtu;
    669}
    670
    671static void smc_stat_inc_fback_rsn_cnt(struct smc_sock *smc,
    672				       struct smc_stats_fback *fback_arr)
    673{
    674	int cnt;
    675
    676	for (cnt = 0; cnt < SMC_MAX_FBACK_RSN_CNT; cnt++) {
    677		if (fback_arr[cnt].fback_code == smc->fallback_rsn) {
    678			fback_arr[cnt].count++;
    679			break;
    680		}
    681		if (!fback_arr[cnt].fback_code) {
    682			fback_arr[cnt].fback_code = smc->fallback_rsn;
    683			fback_arr[cnt].count++;
    684			break;
    685		}
    686	}
    687}
    688
    689static void smc_stat_fallback(struct smc_sock *smc)
    690{
    691	struct net *net = sock_net(&smc->sk);
    692
    693	mutex_lock(&net->smc.mutex_fback_rsn);
    694	if (smc->listen_smc) {
    695		smc_stat_inc_fback_rsn_cnt(smc, net->smc.fback_rsn->srv);
    696		net->smc.fback_rsn->srv_fback_cnt++;
    697	} else {
    698		smc_stat_inc_fback_rsn_cnt(smc, net->smc.fback_rsn->clnt);
    699		net->smc.fback_rsn->clnt_fback_cnt++;
    700	}
    701	mutex_unlock(&net->smc.mutex_fback_rsn);
    702}
    703
    704/* must be called under rcu read lock */
    705static void smc_fback_wakeup_waitqueue(struct smc_sock *smc, void *key)
    706{
    707	struct socket_wq *wq;
    708	__poll_t flags;
    709
    710	wq = rcu_dereference(smc->sk.sk_wq);
    711	if (!skwq_has_sleeper(wq))
    712		return;
    713
    714	/* wake up smc sk->sk_wq */
    715	if (!key) {
    716		/* sk_state_change */
    717		wake_up_interruptible_all(&wq->wait);
    718	} else {
    719		flags = key_to_poll(key);
    720		if (flags & (EPOLLIN | EPOLLOUT))
    721			/* sk_data_ready or sk_write_space */
    722			wake_up_interruptible_sync_poll(&wq->wait, flags);
    723		else if (flags & EPOLLERR)
    724			/* sk_error_report */
    725			wake_up_interruptible_poll(&wq->wait, flags);
    726	}
    727}
    728
    729static int smc_fback_mark_woken(wait_queue_entry_t *wait,
    730				unsigned int mode, int sync, void *key)
    731{
    732	struct smc_mark_woken *mark =
    733		container_of(wait, struct smc_mark_woken, wait_entry);
    734
    735	mark->woken = true;
    736	mark->key = key;
    737	return 0;
    738}
    739
    740static void smc_fback_forward_wakeup(struct smc_sock *smc, struct sock *clcsk,
    741				     void (*clcsock_callback)(struct sock *sk))
    742{
    743	struct smc_mark_woken mark = { .woken = false };
    744	struct socket_wq *wq;
    745
    746	init_waitqueue_func_entry(&mark.wait_entry,
    747				  smc_fback_mark_woken);
    748	rcu_read_lock();
    749	wq = rcu_dereference(clcsk->sk_wq);
    750	if (!wq)
    751		goto out;
    752	add_wait_queue(sk_sleep(clcsk), &mark.wait_entry);
    753	clcsock_callback(clcsk);
    754	remove_wait_queue(sk_sleep(clcsk), &mark.wait_entry);
    755
    756	if (mark.woken)
    757		smc_fback_wakeup_waitqueue(smc, mark.key);
    758out:
    759	rcu_read_unlock();
    760}
    761
    762static void smc_fback_state_change(struct sock *clcsk)
    763{
    764	struct smc_sock *smc;
    765
    766	read_lock_bh(&clcsk->sk_callback_lock);
    767	smc = smc_clcsock_user_data(clcsk);
    768	if (smc)
    769		smc_fback_forward_wakeup(smc, clcsk,
    770					 smc->clcsk_state_change);
    771	read_unlock_bh(&clcsk->sk_callback_lock);
    772}
    773
    774static void smc_fback_data_ready(struct sock *clcsk)
    775{
    776	struct smc_sock *smc;
    777
    778	read_lock_bh(&clcsk->sk_callback_lock);
    779	smc = smc_clcsock_user_data(clcsk);
    780	if (smc)
    781		smc_fback_forward_wakeup(smc, clcsk,
    782					 smc->clcsk_data_ready);
    783	read_unlock_bh(&clcsk->sk_callback_lock);
    784}
    785
    786static void smc_fback_write_space(struct sock *clcsk)
    787{
    788	struct smc_sock *smc;
    789
    790	read_lock_bh(&clcsk->sk_callback_lock);
    791	smc = smc_clcsock_user_data(clcsk);
    792	if (smc)
    793		smc_fback_forward_wakeup(smc, clcsk,
    794					 smc->clcsk_write_space);
    795	read_unlock_bh(&clcsk->sk_callback_lock);
    796}
    797
    798static void smc_fback_error_report(struct sock *clcsk)
    799{
    800	struct smc_sock *smc;
    801
    802	read_lock_bh(&clcsk->sk_callback_lock);
    803	smc = smc_clcsock_user_data(clcsk);
    804	if (smc)
    805		smc_fback_forward_wakeup(smc, clcsk,
    806					 smc->clcsk_error_report);
    807	read_unlock_bh(&clcsk->sk_callback_lock);
    808}
    809
    810static void smc_fback_replace_callbacks(struct smc_sock *smc)
    811{
    812	struct sock *clcsk = smc->clcsock->sk;
    813
    814	write_lock_bh(&clcsk->sk_callback_lock);
    815	clcsk->sk_user_data = (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY);
    816
    817	smc_clcsock_replace_cb(&clcsk->sk_state_change, smc_fback_state_change,
    818			       &smc->clcsk_state_change);
    819	smc_clcsock_replace_cb(&clcsk->sk_data_ready, smc_fback_data_ready,
    820			       &smc->clcsk_data_ready);
    821	smc_clcsock_replace_cb(&clcsk->sk_write_space, smc_fback_write_space,
    822			       &smc->clcsk_write_space);
    823	smc_clcsock_replace_cb(&clcsk->sk_error_report, smc_fback_error_report,
    824			       &smc->clcsk_error_report);
    825
    826	write_unlock_bh(&clcsk->sk_callback_lock);
    827}
    828
    829static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code)
    830{
    831	int rc = 0;
    832
    833	mutex_lock(&smc->clcsock_release_lock);
    834	if (!smc->clcsock) {
    835		rc = -EBADF;
    836		goto out;
    837	}
    838
    839	smc->use_fallback = true;
    840	smc->fallback_rsn = reason_code;
    841	smc_stat_fallback(smc);
    842	trace_smc_switch_to_fallback(smc, reason_code);
    843	if (smc->sk.sk_socket && smc->sk.sk_socket->file) {
    844		smc->clcsock->file = smc->sk.sk_socket->file;
    845		smc->clcsock->file->private_data = smc->clcsock;
    846		smc->clcsock->wq.fasync_list =
    847			smc->sk.sk_socket->wq.fasync_list;
    848
    849		/* There might be some wait entries remaining
    850		 * in smc sk->sk_wq and they should be woken up
    851		 * as clcsock's wait queue is woken up.
    852		 */
    853		smc_fback_replace_callbacks(smc);
    854	}
    855out:
    856	mutex_unlock(&smc->clcsock_release_lock);
    857	return rc;
    858}
    859
    860/* fall back during connect */
    861static int smc_connect_fallback(struct smc_sock *smc, int reason_code)
    862{
    863	struct net *net = sock_net(&smc->sk);
    864	int rc = 0;
    865
    866	rc = smc_switch_to_fallback(smc, reason_code);
    867	if (rc) { /* fallback fails */
    868		this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt);
    869		if (smc->sk.sk_state == SMC_INIT)
    870			sock_put(&smc->sk); /* passive closing */
    871		return rc;
    872	}
    873	smc_copy_sock_settings_to_clc(smc);
    874	smc->connect_nonblock = 0;
    875	if (smc->sk.sk_state == SMC_INIT)
    876		smc->sk.sk_state = SMC_ACTIVE;
    877	return 0;
    878}
    879
    880/* decline and fall back during connect */
    881static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code,
    882					u8 version)
    883{
    884	struct net *net = sock_net(&smc->sk);
    885	int rc;
    886
    887	if (reason_code < 0) { /* error, fallback is not possible */
    888		this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt);
    889		if (smc->sk.sk_state == SMC_INIT)
    890			sock_put(&smc->sk); /* passive closing */
    891		return reason_code;
    892	}
    893	if (reason_code != SMC_CLC_DECL_PEERDECL) {
    894		rc = smc_clc_send_decline(smc, reason_code, version);
    895		if (rc < 0) {
    896			this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt);
    897			if (smc->sk.sk_state == SMC_INIT)
    898				sock_put(&smc->sk); /* passive closing */
    899			return rc;
    900		}
    901	}
    902	return smc_connect_fallback(smc, reason_code);
    903}
    904
    905static void smc_conn_abort(struct smc_sock *smc, int local_first)
    906{
    907	struct smc_connection *conn = &smc->conn;
    908	struct smc_link_group *lgr = conn->lgr;
    909	bool lgr_valid = false;
    910
    911	if (smc_conn_lgr_valid(conn))
    912		lgr_valid = true;
    913
    914	smc_conn_free(conn);
    915	if (local_first && lgr_valid)
    916		smc_lgr_cleanup_early(lgr);
    917}
    918
    919/* check if there is a rdma device available for this connection. */
    920/* called for connect and listen */
    921static int smc_find_rdma_device(struct smc_sock *smc, struct smc_init_info *ini)
    922{
    923	/* PNET table look up: search active ib_device and port
    924	 * within same PNETID that also contains the ethernet device
    925	 * used for the internal TCP socket
    926	 */
    927	smc_pnet_find_roce_resource(smc->clcsock->sk, ini);
    928	if (!ini->check_smcrv2 && !ini->ib_dev)
    929		return SMC_CLC_DECL_NOSMCRDEV;
    930	if (ini->check_smcrv2 && !ini->smcrv2.ib_dev_v2)
    931		return SMC_CLC_DECL_NOSMCRDEV;
    932	return 0;
    933}
    934
    935/* check if there is an ISM device available for this connection. */
    936/* called for connect and listen */
    937static int smc_find_ism_device(struct smc_sock *smc, struct smc_init_info *ini)
    938{
    939	/* Find ISM device with same PNETID as connecting interface  */
    940	smc_pnet_find_ism_resource(smc->clcsock->sk, ini);
    941	if (!ini->ism_dev[0])
    942		return SMC_CLC_DECL_NOSMCDDEV;
    943	else
    944		ini->ism_chid[0] = smc_ism_get_chid(ini->ism_dev[0]);
    945	return 0;
    946}
    947
    948/* is chid unique for the ism devices that are already determined? */
    949static bool smc_find_ism_v2_is_unique_chid(u16 chid, struct smc_init_info *ini,
    950					   int cnt)
    951{
    952	int i = (!ini->ism_dev[0]) ? 1 : 0;
    953
    954	for (; i < cnt; i++)
    955		if (ini->ism_chid[i] == chid)
    956			return false;
    957	return true;
    958}
    959
    960/* determine possible V2 ISM devices (either without PNETID or with PNETID plus
    961 * PNETID matching net_device)
    962 */
    963static int smc_find_ism_v2_device_clnt(struct smc_sock *smc,
    964				       struct smc_init_info *ini)
    965{
    966	int rc = SMC_CLC_DECL_NOSMCDDEV;
    967	struct smcd_dev *smcd;
    968	int i = 1;
    969	u16 chid;
    970
    971	if (smcd_indicated(ini->smc_type_v1))
    972		rc = 0;		/* already initialized for V1 */
    973	mutex_lock(&smcd_dev_list.mutex);
    974	list_for_each_entry(smcd, &smcd_dev_list.list, list) {
    975		if (smcd->going_away || smcd == ini->ism_dev[0])
    976			continue;
    977		chid = smc_ism_get_chid(smcd);
    978		if (!smc_find_ism_v2_is_unique_chid(chid, ini, i))
    979			continue;
    980		if (!smc_pnet_is_pnetid_set(smcd->pnetid) ||
    981		    smc_pnet_is_ndev_pnetid(sock_net(&smc->sk), smcd->pnetid)) {
    982			ini->ism_dev[i] = smcd;
    983			ini->ism_chid[i] = chid;
    984			ini->is_smcd = true;
    985			rc = 0;
    986			i++;
    987			if (i > SMC_MAX_ISM_DEVS)
    988				break;
    989		}
    990	}
    991	mutex_unlock(&smcd_dev_list.mutex);
    992	ini->ism_offered_cnt = i - 1;
    993	if (!ini->ism_dev[0] && !ini->ism_dev[1])
    994		ini->smcd_version = 0;
    995
    996	return rc;
    997}
    998
    999/* Check for VLAN ID and register it on ISM device just for CLC handshake */
   1000static int smc_connect_ism_vlan_setup(struct smc_sock *smc,
   1001				      struct smc_init_info *ini)
   1002{
   1003	if (ini->vlan_id && smc_ism_get_vlan(ini->ism_dev[0], ini->vlan_id))
   1004		return SMC_CLC_DECL_ISMVLANERR;
   1005	return 0;
   1006}
   1007
/* Determine which SMC variants (SMC-D / SMC-R, V1 / V2) can be offered and
 * record the result in @ini.
 *
 * For each variant whose precondition or device lookup fails, the matching
 * SMC_V1 / SMC_V2 bit is cleared in ini->smcd_version or ini->smcr_version.
 * ini->smc_type_v1 and ini->smc_type_v2 are then derived from the bits left.
 *
 * Returns 0 if at least one type remains, SMC_CLC_DECL_NOSMCDEV if both
 * types ended up as SMC_TYPE_N.
 */
static int smc_find_proposal_devices(struct smc_sock *smc,
				     struct smc_init_info *ini)
{
	int rc = 0;

	/* check if there is an ism device available */
	/* the checks short-circuit: the VLAN setup only runs once a device
	 * has been found
	 */
	if (!(ini->smcd_version & SMC_V1) ||
	    smc_find_ism_device(smc, ini) ||
	    smc_connect_ism_vlan_setup(smc, ini))
		ini->smcd_version &= ~SMC_V1;
	/* else ISM V1 is supported for this connection */

	/* check if there is an rdma device available */
	if (!(ini->smcr_version & SMC_V1) ||
	    smc_find_rdma_device(smc, ini))
		ini->smcr_version &= ~SMC_V1;
	/* else RDMA is supported for this connection */

	ini->smc_type_v1 = smc_indicated_type(ini->smcd_version & SMC_V1,
					      ini->smcr_version & SMC_V1);

	/* check if there is an ism v2 device available */
	if (!(ini->smcd_version & SMC_V2) ||
	    !smc_ism_is_v2_capable() ||
	    smc_find_ism_v2_device_clnt(smc, ini))
		ini->smcd_version &= ~SMC_V2;

	/* check if there is an rdma v2 device available */
	/* check_smcrv2 makes smc_find_rdma_device() test the V2 device
	 * (ini->smcrv2.ib_dev_v2) instead of ini->ib_dev; it is reset below
	 */
	ini->check_smcrv2 = true;
	ini->smcrv2.saddr = smc->clcsock->sk->sk_rcv_saddr;
	if (!(ini->smcr_version & SMC_V2) ||
	    smc->clcsock->sk->sk_family != AF_INET ||
	    !smc_clc_ueid_count() ||
	    smc_find_rdma_device(smc, ini))
		ini->smcr_version &= ~SMC_V2;
	ini->check_smcrv2 = false;

	ini->smc_type_v2 = smc_indicated_type(ini->smcd_version & SMC_V2,
					      ini->smcr_version & SMC_V2);

	/* if neither ISM nor RDMA are supported, fallback */
	if (ini->smc_type_v1 == SMC_TYPE_N && ini->smc_type_v2 == SMC_TYPE_N)
		rc = SMC_CLC_DECL_NOSMCDEV;

	return rc;
}
   1054
   1055/* cleanup temporary VLAN ID registration used for CLC handshake. If ISM is
   1056 * used, the VLAN ID will be registered again during the connection setup.
   1057 */
   1058static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc,
   1059					struct smc_init_info *ini)
   1060{
   1061	if (!smcd_indicated(ini->smc_type_v1))
   1062		return 0;
   1063	if (ini->vlan_id && smc_ism_put_vlan(ini->ism_dev[0], ini->vlan_id))
   1064		return SMC_CLC_DECL_CNFERR;
   1065	return 0;
   1066}
   1067
/* Size of the receive buffer for a CLC accept message: a V2 accept/confirm
 * message plus a first contact extension plus the message trailer.
 */
#define SMC_CLC_MAX_ACCEPT_LEN \
	(sizeof(struct smc_clc_msg_accept_confirm_v2) + \
	 sizeof(struct smc_clc_first_contact_ext) + \
	 sizeof(struct smc_clc_msg_trail))
   1072
   1073/* CLC handshake during connect */
   1074static int smc_connect_clc(struct smc_sock *smc,
   1075			   struct smc_clc_msg_accept_confirm_v2 *aclc2,
   1076			   struct smc_init_info *ini)
   1077{
   1078	int rc = 0;
   1079
   1080	/* do inband token exchange */
   1081	rc = smc_clc_send_proposal(smc, ini);
   1082	if (rc)
   1083		return rc;
   1084	/* receive SMC Accept CLC message */
   1085	return smc_clc_wait_msg(smc, aclc2, SMC_CLC_MAX_ACCEPT_LEN,
   1086				SMC_CLC_ACCEPT, CLC_WAIT_TIME);
   1087}
   1088
   1089void smc_fill_gid_list(struct smc_link_group *lgr,
   1090		       struct smc_gidlist *gidlist,
   1091		       struct smc_ib_device *known_dev, u8 *known_gid)
   1092{
   1093	struct smc_init_info *alt_ini = NULL;
   1094
   1095	memset(gidlist, 0, sizeof(*gidlist));
   1096	memcpy(gidlist->list[gidlist->len++], known_gid, SMC_GID_SIZE);
   1097
   1098	alt_ini = kzalloc(sizeof(*alt_ini), GFP_KERNEL);
   1099	if (!alt_ini)
   1100		goto out;
   1101
   1102	alt_ini->vlan_id = lgr->vlan_id;
   1103	alt_ini->check_smcrv2 = true;
   1104	alt_ini->smcrv2.saddr = lgr->saddr;
   1105	smc_pnet_find_alt_roce(lgr, alt_ini, known_dev);
   1106
   1107	if (!alt_ini->smcrv2.ib_dev_v2)
   1108		goto out;
   1109
   1110	memcpy(gidlist->list[gidlist->len++], alt_ini->smcrv2.ib_gid_v2,
   1111	       SMC_GID_SIZE);
   1112
   1113out:
   1114	kfree(alt_ini);
   1115}
   1116
   1117static int smc_connect_rdma_v2_prepare(struct smc_sock *smc,
   1118				       struct smc_clc_msg_accept_confirm *aclc,
   1119				       struct smc_init_info *ini)
   1120{
   1121	struct smc_clc_msg_accept_confirm_v2 *clc_v2 =
   1122		(struct smc_clc_msg_accept_confirm_v2 *)aclc;
   1123	struct smc_clc_first_contact_ext *fce =
   1124		(struct smc_clc_first_contact_ext *)
   1125			(((u8 *)clc_v2) + sizeof(*clc_v2));
   1126
   1127	if (!ini->first_contact_peer || aclc->hdr.version == SMC_V1)
   1128		return 0;
   1129
   1130	if (fce->v2_direct) {
   1131		memcpy(ini->smcrv2.nexthop_mac, &aclc->r0.lcl.mac, ETH_ALEN);
   1132		ini->smcrv2.uses_gateway = false;
   1133	} else {
   1134		if (smc_ib_find_route(smc->clcsock->sk->sk_rcv_saddr,
   1135				      smc_ib_gid_to_ipv4(aclc->r0.lcl.gid),
   1136				      ini->smcrv2.nexthop_mac,
   1137				      &ini->smcrv2.uses_gateway))
   1138			return SMC_CLC_DECL_NOROUTE;
   1139		if (!ini->smcrv2.uses_gateway) {
   1140			/* mismatch: peer claims indirect, but its direct */
   1141			return SMC_CLC_DECL_NOINDIRECT;
   1142		}
   1143	}
   1144	return 0;
   1145}
   1146
/* setup for RDMA connection of client
 *
 * @smc:  client socket being connected
 * @aclc: received CLC accept message
 * @ini:  init info; peer data taken from @aclc is stored here
 *
 * Returns 0 on success, otherwise a reason code. Once smc_conn_create() has
 * succeeded, every failure leaves through connect_abort, which calls
 * smc_conn_abort() before smc_client_lgr_pending is released.
 */
static int smc_connect_rdma(struct smc_sock *smc,
			    struct smc_clc_msg_accept_confirm *aclc,
			    struct smc_init_info *ini)
{
	int i, reason_code = 0;
	struct smc_link *link;
	u8 *eid = NULL;

	/* take over the peer's data from the CLC accept */
	ini->is_smcd = false;
	ini->ib_clcqpn = ntoh24(aclc->r0.qpn);
	ini->first_contact_peer = aclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK;
	memcpy(ini->peer_systemid, aclc->r0.lcl.id_for_peer, SMC_SYSTEMID_LEN);
	memcpy(ini->peer_gid, aclc->r0.lcl.gid, SMC_GID_SIZE);
	memcpy(ini->peer_mac, aclc->r0.lcl.mac, ETH_ALEN);

	reason_code = smc_connect_rdma_v2_prepare(smc, aclc, ini);
	if (reason_code)
		return reason_code;

	/* held until the connection is fully set up or aborted */
	mutex_lock(&smc_client_lgr_pending);
	reason_code = smc_conn_create(smc, ini);
	if (reason_code) {
		mutex_unlock(&smc_client_lgr_pending);
		return reason_code;
	}

	smc_conn_save_peer_info(smc, aclc);

	if (ini->first_contact_local) {
		link = smc->conn.lnk;
	} else {
		/* set link that was assigned by server */
		link = NULL;
		for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
			struct smc_link *l = &smc->conn.lgr->lnk[i];

			/* match on peer QP number and GID; the peer MAC is
			 * only compared for V1 accepts
			 */
			if (l->peer_qpn == ntoh24(aclc->r0.qpn) &&
			    !memcmp(l->peer_gid, &aclc->r0.lcl.gid,
				    SMC_GID_SIZE) &&
			    (aclc->hdr.version > SMC_V1 ||
			     !memcmp(l->peer_mac, &aclc->r0.lcl.mac,
				     sizeof(l->peer_mac)))) {
				link = l;
				break;
			}
		}
		if (!link) {
			reason_code = SMC_CLC_DECL_NOSRVLINK;
			goto connect_abort;
		}
		smc_switch_link_and_count(&smc->conn, link);
	}

	/* create send buffer and rmb */
	if (smc_buf_create(smc, false)) {
		reason_code = SMC_CLC_DECL_MEM;
		goto connect_abort;
	}

	if (ini->first_contact_local)
		smc_link_save_peer_info(link, aclc, ini);

	if (smc_rmb_rtoken_handling(&smc->conn, link, aclc)) {
		reason_code = SMC_CLC_DECL_ERR_RTOK;
		goto connect_abort;
	}

	smc_close_init(smc);
	smc_rx_init(smc);

	/* local first contact: bring the new link up; otherwise register
	 * the rmb with the existing link group
	 */
	if (ini->first_contact_local) {
		if (smc_ib_ready_link(link)) {
			reason_code = SMC_CLC_DECL_ERR_RDYLNK;
			goto connect_abort;
		}
	} else {
		if (smcr_lgr_reg_rmbs(link, smc->conn.rmb_desc)) {
			reason_code = SMC_CLC_DECL_ERR_REGRMB;
			goto connect_abort;
		}
	}
	smc_rmb_sync_sg_for_device(&smc->conn);

	/* V2 accepts carry an EID; on local first contact the GID list for
	 * the confirm is filled as well
	 */
	if (aclc->hdr.version > SMC_V1) {
		struct smc_clc_msg_accept_confirm_v2 *clc_v2 =
			(struct smc_clc_msg_accept_confirm_v2 *)aclc;

		eid = clc_v2->r1.eid;
		if (ini->first_contact_local)
			smc_fill_gid_list(link->lgr, &ini->smcrv2.gidlist,
					  link->smcibdev, link->gid);
	}

	reason_code = smc_clc_send_confirm(smc, ini->first_contact_local,
					   aclc->hdr.version, eid, ini);
	if (reason_code)
		goto connect_abort;

	smc_tx_init(smc);

	if (ini->first_contact_local) {
		/* QP confirmation over RoCE fabric */
		smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK);
		reason_code = smcr_clnt_conf_first_link(smc);
		smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl);
		if (reason_code)
			goto connect_abort;
	}
	mutex_unlock(&smc_client_lgr_pending);

	smc_copy_sock_settings_to_clc(smc);
	smc->connect_nonblock = 0;
	if (smc->sk.sk_state == SMC_INIT)
		smc->sk.sk_state = SMC_ACTIVE;

	return 0;
connect_abort:
	/* abort while smc_client_lgr_pending is still held */
	smc_conn_abort(smc, ini->first_contact_local);
	mutex_unlock(&smc_client_lgr_pending);
	smc->connect_nonblock = 0;

	return reason_code;
}
   1271
   1272/* The server has chosen one of the proposed ISM devices for the communication.
   1273 * Determine from the CHID of the received CLC ACCEPT the ISM device chosen.
   1274 */
   1275static int
   1276smc_v2_determine_accepted_chid(struct smc_clc_msg_accept_confirm_v2 *aclc,
   1277			       struct smc_init_info *ini)
   1278{
   1279	int i;
   1280
   1281	for (i = 0; i < ini->ism_offered_cnt + 1; i++) {
   1282		if (ini->ism_chid[i] == ntohs(aclc->d1.chid)) {
   1283			ini->ism_selected = i;
   1284			return 0;
   1285		}
   1286	}
   1287
   1288	return -EPROTO;
   1289}
   1290
/* setup for ISM connection of client
 *
 * @smc:  client socket being connected
 * @aclc: received CLC accept message
 * @ini:  init info; the selected ISM slot and peer GID are stored here
 *
 * Returns 0 on success. On failure it returns the error of
 * smc_v2_determine_accepted_chid(), smc_conn_create() or
 * smc_clc_send_confirm(), or SMC_CLC_DECL_MAX_DMB / SMC_CLC_DECL_MEM if
 * buffer creation failed. Failures after smc_conn_create() succeeded go
 * through connect_abort.
 */
static int smc_connect_ism(struct smc_sock *smc,
			   struct smc_clc_msg_accept_confirm *aclc,
			   struct smc_init_info *ini)
{
	u8 *eid = NULL;
	int rc = 0;

	ini->is_smcd = true;
	ini->first_contact_peer = aclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK;

	/* for V2 find out which of the offered devices the server picked */
	if (aclc->hdr.version == SMC_V2) {
		struct smc_clc_msg_accept_confirm_v2 *aclc_v2 =
			(struct smc_clc_msg_accept_confirm_v2 *)aclc;

		rc = smc_v2_determine_accepted_chid(aclc_v2, ini);
		if (rc)
			return rc;
	}
	ini->ism_peer_gid[ini->ism_selected] = aclc->d0.gid;

	/* there is only one lgr role for SMC-D; use server lock */
	mutex_lock(&smc_server_lgr_pending);
	rc = smc_conn_create(smc, ini);
	if (rc) {
		mutex_unlock(&smc_server_lgr_pending);
		return rc;
	}

	/* Create send and receive buffers */
	rc = smc_buf_create(smc, true);
	if (rc) {
		/* map the error to a CLC decline reason code */
		rc = (rc == -ENOSPC) ? SMC_CLC_DECL_MAX_DMB : SMC_CLC_DECL_MEM;
		goto connect_abort;
	}

	smc_conn_save_peer_info(smc, aclc);
	smc_close_init(smc);
	smc_rx_init(smc);
	smc_tx_init(smc);

	/* V2 accepts carry an EID that is passed on with the confirm */
	if (aclc->hdr.version > SMC_V1) {
		struct smc_clc_msg_accept_confirm_v2 *clc_v2 =
			(struct smc_clc_msg_accept_confirm_v2 *)aclc;

		eid = clc_v2->d1.eid;
	}

	rc = smc_clc_send_confirm(smc, ini->first_contact_local,
				  aclc->hdr.version, eid, NULL);
	if (rc)
		goto connect_abort;
	mutex_unlock(&smc_server_lgr_pending);

	smc_copy_sock_settings_to_clc(smc);
	smc->connect_nonblock = 0;
	if (smc->sk.sk_state == SMC_INIT)
		smc->sk.sk_state = SMC_ACTIVE;

	return 0;
connect_abort:
	/* abort while smc_server_lgr_pending is still held */
	smc_conn_abort(smc, ini->first_contact_local);
	mutex_unlock(&smc_server_lgr_pending);
	smc->connect_nonblock = 0;

	return rc;
}
   1358
   1359/* check if received accept type and version matches a proposed one */
   1360static int smc_connect_check_aclc(struct smc_init_info *ini,
   1361				  struct smc_clc_msg_accept_confirm *aclc)
   1362{
   1363	if (aclc->hdr.typev1 != SMC_TYPE_R &&
   1364	    aclc->hdr.typev1 != SMC_TYPE_D)
   1365		return SMC_CLC_DECL_MODEUNSUPP;
   1366
   1367	if (aclc->hdr.version >= SMC_V2) {
   1368		if ((aclc->hdr.typev1 == SMC_TYPE_R &&
   1369		     !smcr_indicated(ini->smc_type_v2)) ||
   1370		    (aclc->hdr.typev1 == SMC_TYPE_D &&
   1371		     !smcd_indicated(ini->smc_type_v2)))
   1372			return SMC_CLC_DECL_MODEUNSUPP;
   1373	} else {
   1374		if ((aclc->hdr.typev1 == SMC_TYPE_R &&
   1375		     !smcr_indicated(ini->smc_type_v1)) ||
   1376		    (aclc->hdr.typev1 == SMC_TYPE_D &&
   1377		     !smcd_indicated(ini->smc_type_v1)))
   1378			return SMC_CLC_DECL_MODEUNSUPP;
   1379	}
   1380
   1381	return 0;
   1382}
   1383
/* perform steps before actually connecting
 *
 * Decides between fallback and SMC, determines the devices to propose, runs
 * the CLC handshake and finally sets up the connection via
 * smc_connect_rdma() or smc_connect_ism(), depending on the accepted type.
 *
 * Returns 0 when the SMC connection was set up. In all other cases the
 * result of smc_connect_fallback() or smc_connect_decline_fallback() is
 * returned.
 */
static int __smc_connect(struct smc_sock *smc)
{
	u8 version = smc_ism_is_v2_capable() ? SMC_V2 : SMC_V1;
	struct smc_clc_msg_accept_confirm_v2 *aclc2;
	struct smc_clc_msg_accept_confirm *aclc;
	struct smc_init_info *ini = NULL;
	u8 *buf = NULL;
	int rc = 0;

	if (smc->use_fallback)
		return smc_connect_fallback(smc, smc->fallback_rsn);

	/* if peer has not signalled SMC-capability, fall back */
	if (!tcp_sk(smc->clcsock->sk)->syn_smc)
		return smc_connect_fallback(smc, SMC_CLC_DECL_PEERNOSMC);

	/* IPSec connections opt out of SMC optimizations */
	if (using_ipsec(smc))
		return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC,
						    version);

	ini = kzalloc(sizeof(*ini), GFP_KERNEL);
	if (!ini)
		return smc_connect_decline_fallback(smc, SMC_CLC_DECL_MEM,
						    version);

	/* start with everything offered; unavailable variants are removed
	 * below and in smc_find_proposal_devices()
	 */
	ini->smcd_version = SMC_V1 | SMC_V2;
	ini->smcr_version = SMC_V1 | SMC_V2;
	ini->smc_type_v1 = SMC_TYPE_B;
	ini->smc_type_v2 = SMC_TYPE_B;

	/* get vlan id from IP device */
	if (smc_vlan_by_tcpsk(smc->clcsock, ini)) {
		/* without a vlan id only SMC-D V2 may remain */
		ini->smcd_version &= ~SMC_V1;
		ini->smcr_version = 0;
		ini->smc_type_v1 = SMC_TYPE_N;
		if (!ini->smcd_version) {
			rc = SMC_CLC_DECL_GETVLANERR;
			goto fallback;
		}
	}

	rc = smc_find_proposal_devices(smc, ini);
	if (rc)
		goto fallback;

	/* one buffer, viewed both as V2 and as V1 accept message */
	buf = kzalloc(SMC_CLC_MAX_ACCEPT_LEN, GFP_KERNEL);
	if (!buf) {
		rc = SMC_CLC_DECL_MEM;
		goto fallback;
	}
	aclc2 = (struct smc_clc_msg_accept_confirm_v2 *)buf;
	aclc = (struct smc_clc_msg_accept_confirm *)aclc2;

	/* perform CLC handshake */
	rc = smc_connect_clc(smc, aclc2, ini);
	if (rc) {
		/* -EAGAIN on timeout, see tcp_recvmsg() */
		if (rc == -EAGAIN) {
			rc = -ETIMEDOUT;
			smc->sk.sk_err = ETIMEDOUT;
		}
		goto vlan_cleanup;
	}

	/* check if smc modes and versions of CLC proposal and accept match */
	rc = smc_connect_check_aclc(ini, aclc);
	/* from here on the version of the accept is used, also for a
	 * decline sent on the error path
	 */
	version = aclc->hdr.version == SMC_V1 ? SMC_V1 : SMC_V2;
	if (rc)
		goto vlan_cleanup;

	/* depending on previous steps, connect using rdma or ism */
	if (aclc->hdr.typev1 == SMC_TYPE_R) {
		ini->smcr_version = version;
		rc = smc_connect_rdma(smc, aclc, ini);
	} else if (aclc->hdr.typev1 == SMC_TYPE_D) {
		ini->smcd_version = version;
		rc = smc_connect_ism(smc, aclc, ini);
	}
	if (rc)
		goto vlan_cleanup;

	SMC_STAT_CLNT_SUCC_INC(sock_net(smc->clcsock->sk), aclc);
	smc_connect_ism_vlan_cleanup(smc, ini);
	kfree(buf);
	kfree(ini);
	return 0;

vlan_cleanup:
	/* drop the temporary VLAN registration and the accept buffer, then
	 * continue with the common fallback handling
	 */
	smc_connect_ism_vlan_cleanup(smc, ini);
	kfree(buf);
fallback:
	kfree(ini);
	return smc_connect_decline_fallback(smc, rc, version);
}
   1480
/* Work function for smc->connect_work, which smc_connect() queues for a
 * nonblocking connect.
 *
 * Waits, under the clcsock's sock lock, until the internal TCP socket has
 * left the SYN_SENT / SYN_RECV states, then runs __smc_connect() under the
 * SMC sock lock. On failure the SMC sock is closed and sk_err is set.
 */
static void smc_connect_work(struct work_struct *work)
{
	struct smc_sock *smc = container_of(work, struct smc_sock,
					    connect_work);
	long timeo = smc->sk.sk_sndtimeo;
	int rc = 0;

	/* no send timeout configured: wait without limit */
	if (!timeo)
		timeo = MAX_SCHEDULE_TIMEOUT;
	lock_sock(smc->clcsock->sk);
	if (smc->clcsock->sk->sk_err) {
		/* propagate an error of the internal TCP socket */
		smc->sk.sk_err = smc->clcsock->sk->sk_err;
	} else if ((1 << smc->clcsock->sk->sk_state) &
					(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
		rc = sk_stream_wait_connect(smc->clcsock->sk, &timeo);
		/* -EPIPE is not treated as an error if the TCP socket is
		 * established or in close-wait
		 */
		if ((rc == -EPIPE) &&
		    ((1 << smc->clcsock->sk->sk_state) &
					(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)))
			rc = 0;
	}
	release_sock(smc->clcsock->sk);
	lock_sock(&smc->sk);
	if (rc != 0 || smc->sk.sk_err) {
		smc->sk.sk_state = SMC_CLOSED;
		if (rc == -EPIPE || rc == -EAGAIN)
			smc->sk.sk_err = EPIPE;
		else if (rc == -ECONNREFUSED)
			smc->sk.sk_err = ECONNREFUSED;
		else if (signal_pending(current))
			smc->sk.sk_err = -sock_intr_errno(timeo);
		sock_put(&smc->sk); /* passive closing */
		goto out;
	}

	rc = __smc_connect(smc);
	if (rc < 0)
		smc->sk.sk_err = -rc;

out:
	/* notify waiters unless the sock is already dead */
	if (!sock_flag(&smc->sk, SOCK_DEAD)) {
		if (smc->sk.sk_err) {
			smc->sk.sk_state_change(&smc->sk);
		} else { /* allow polling before and after fallback decision */
			smc->clcsock->sk->sk_write_space(smc->clcsock->sk);
			smc->sk.sk_write_space(&smc->sk);
		}
	}
	release_sock(&smc->sk);
}
   1530
   1531static int smc_connect(struct socket *sock, struct sockaddr *addr,
   1532		       int alen, int flags)
   1533{
   1534	struct sock *sk = sock->sk;
   1535	struct smc_sock *smc;
   1536	int rc = -EINVAL;
   1537
   1538	smc = smc_sk(sk);
   1539
   1540	/* separate smc parameter checking to be safe */
   1541	if (alen < sizeof(addr->sa_family))
   1542		goto out_err;
   1543	if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6)
   1544		goto out_err;
   1545
   1546	lock_sock(sk);
   1547	switch (sock->state) {
   1548	default:
   1549		rc = -EINVAL;
   1550		goto out;
   1551	case SS_CONNECTED:
   1552		rc = sk->sk_state == SMC_ACTIVE ? -EISCONN : -EINVAL;
   1553		goto out;
   1554	case SS_CONNECTING:
   1555		if (sk->sk_state == SMC_ACTIVE)
   1556			goto connected;
   1557		break;
   1558	case SS_UNCONNECTED:
   1559		sock->state = SS_CONNECTING;
   1560		break;
   1561	}
   1562
   1563	switch (sk->sk_state) {
   1564	default:
   1565		goto out;
   1566	case SMC_CLOSED:
   1567		rc = sock_error(sk) ? : -ECONNABORTED;
   1568		sock->state = SS_UNCONNECTED;
   1569		goto out;
   1570	case SMC_ACTIVE:
   1571		rc = -EISCONN;
   1572		goto out;
   1573	case SMC_INIT:
   1574		break;
   1575	}
   1576
   1577	smc_copy_sock_settings_to_clc(smc);
   1578	tcp_sk(smc->clcsock->sk)->syn_smc = 1;
   1579	if (smc->connect_nonblock) {
   1580		rc = -EALREADY;
   1581		goto out;
   1582	}
   1583	rc = kernel_connect(smc->clcsock, addr, alen, flags);
   1584	if (rc && rc != -EINPROGRESS)
   1585		goto out;
   1586
   1587	if (smc->use_fallback) {
   1588		sock->state = rc ? SS_CONNECTING : SS_CONNECTED;
   1589		goto out;
   1590	}
   1591	sock_hold(&smc->sk); /* sock put in passive closing */
   1592	if (flags & O_NONBLOCK) {
   1593		if (queue_work(smc_hs_wq, &smc->connect_work))
   1594			smc->connect_nonblock = 1;
   1595		rc = -EINPROGRESS;
   1596		goto out;
   1597	} else {
   1598		rc = __smc_connect(smc);
   1599		if (rc < 0)
   1600			goto out;
   1601	}
   1602
   1603connected:
   1604	rc = 0;
   1605	sock->state = SS_CONNECTED;
   1606out:
   1607	release_sock(sk);
   1608out_err:
   1609	return rc;
   1610}
   1611
/* Allocate a new SMC sock and accept a connection on the internal clcsock of
 * the listening socket @lsmc.
 *
 * The sock lock of the listen sock is released at entry and taken again
 * before returning, on all paths.
 *
 * On success *@new_smc points to the new SMC sock with its clcsock set. On
 * failure, or if the listen sock has been closed meanwhile, the new sock is
 * torn down and *@new_smc is NULL.
 *
 * Returns the result of kernel_accept(), -ENOMEM if the sock allocation
 * failed, or -EINVAL if @lsmc has no clcsock.
 */
static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
{
	struct socket *new_clcsock = NULL;
	struct sock *lsk = &lsmc->sk;
	struct sock *new_sk;
	int rc = -EINVAL;

	release_sock(lsk);
	new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol);
	if (!new_sk) {
		rc = -ENOMEM;
		lsk->sk_err = ENOMEM;
		*new_smc = NULL;
		lock_sock(lsk);
		goto out;
	}
	*new_smc = smc_sk(new_sk);

	/* the clcsock is only used while clcsock_release_lock is held */
	mutex_lock(&lsmc->clcsock_release_lock);
	if (lsmc->clcsock)
		rc = kernel_accept(lsmc->clcsock, &new_clcsock, SOCK_NONBLOCK);
	mutex_unlock(&lsmc->clcsock_release_lock);
	lock_sock(lsk);
	/* -EAGAIN is not recorded as an error on the listen sock */
	if  (rc < 0 && rc != -EAGAIN)
		lsk->sk_err = -rc;
	if (rc < 0 || lsk->sk_state == SMC_CLOSED) {
		/* undo the allocation, including an accepted clcsock */
		new_sk->sk_prot->unhash(new_sk);
		if (new_clcsock)
			sock_release(new_clcsock);
		new_sk->sk_state = SMC_CLOSED;
		sock_set_flag(new_sk, SOCK_DEAD);
		sock_put(new_sk); /* final */
		*new_smc = NULL;
		goto out;
	}

	/* new clcsock has inherited the smc listen-specific sk_data_ready
	 * function; switch it back to the original sk_data_ready function
	 */
	new_clcsock->sk->sk_data_ready = lsmc->clcsk_data_ready;

	/* if new clcsock has also inherited the fallback-specific callback
	 * functions, switch them back to the original ones.
	 */
	if (lsmc->use_fallback) {
		if (lsmc->clcsk_state_change)
			new_clcsock->sk->sk_state_change = lsmc->clcsk_state_change;
		if (lsmc->clcsk_write_space)
			new_clcsock->sk->sk_write_space = lsmc->clcsk_write_space;
		if (lsmc->clcsk_error_report)
			new_clcsock->sk->sk_error_report = lsmc->clcsk_error_report;
	}

	(*new_smc)->clcsock = new_clcsock;
out:
	return rc;
}
   1669
   1670/* add a just created sock to the accept queue of the listen sock as
   1671 * candidate for a following socket accept call from user space
   1672 */
   1673static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
   1674{
   1675	struct smc_sock *par = smc_sk(parent);
   1676
   1677	sock_hold(sk); /* sock_put in smc_accept_unlink () */
   1678	spin_lock(&par->accept_q_lock);
   1679	list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
   1680	spin_unlock(&par->accept_q_lock);
   1681	sk_acceptq_added(parent);
   1682}
   1683
   1684/* remove a socket from the accept queue of its parental listening socket */
   1685static void smc_accept_unlink(struct sock *sk)
   1686{
   1687	struct smc_sock *par = smc_sk(sk)->listen_smc;
   1688
   1689	spin_lock(&par->accept_q_lock);
   1690	list_del_init(&smc_sk(sk)->accept_q);
   1691	spin_unlock(&par->accept_q_lock);
   1692	sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
   1693	sock_put(sk); /* sock_hold in smc_accept_enqueue */
   1694}
   1695
/* remove a sock from the accept queue to bind it to a new socket created
 * for a socket accept call from user space
 *
 * Socks found in state SMC_CLOSED are unlinked and released on the way.
 * If @new_sock is given, the dequeued sock is grafted onto it and
 * @new_sock is marked connected.
 *
 * Returns the first sock in the queue that is not closed, or NULL if there
 * is none.
 */
struct sock *smc_accept_dequeue(struct sock *parent,
				struct socket *new_sock)
{
	struct smc_sock *isk, *n;
	struct sock *new_sk;

	/* _safe variant: the current entry is unlinked inside the loop */
	list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
		new_sk = (struct sock *)isk;

		smc_accept_unlink(new_sk);
		if (new_sk->sk_state == SMC_CLOSED) {
			/* already closed: release it and try the next one */
			new_sk->sk_prot->unhash(new_sk);
			if (isk->clcsock) {
				sock_release(isk->clcsock);
				isk->clcsock = NULL;
			}
			sock_put(new_sk); /* final */
			continue;
		}
		if (new_sock) {
			sock_graft(new_sk, new_sock);
			new_sock->state = SS_CONNECTED;
			/* in fallback mode the clcsock shares the file of
			 * the new socket, and that file's private_data is
			 * pointed at the clcsock
			 */
			if (isk->use_fallback) {
				smc_sk(new_sk)->clcsock->file = new_sock->file;
				isk->clcsock->file->private_data = isk->clcsock;
			}
		}
		return new_sk;
	}
	return NULL;
}
   1730
   1731/* clean up for a created but never accepted sock */
   1732void smc_close_non_accepted(struct sock *sk)
   1733{
   1734	struct smc_sock *smc = smc_sk(sk);
   1735
   1736	sock_hold(sk); /* sock_put below */
   1737	lock_sock(sk);
   1738	if (!sk->sk_lingertime)
   1739		/* wait for peer closing */
   1740		sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
   1741	__smc_release(smc);
   1742	release_sock(sk);
   1743	sock_put(sk); /* sock_hold above */
   1744	sock_put(sk); /* final sock_put */
   1745}
   1746
/* Server side confirmation of the first link of a link group: register the
 * rmb on the link, exchange CONFIRM LINK request and response with the
 * client, activate the link and try to add a second link.
 *
 * Returns 0 on success, otherwise a reason code. If no CONFIRM LINK response
 * arrives, the result of waiting for a CLC decline from the client is
 * returned instead, with -EAGAIN mapped to SMC_CLC_DECL_TIMEOUT_CL.
 */
static int smcr_serv_conf_first_link(struct smc_sock *smc)
{
	struct smc_link *link = smc->conn.lnk;
	struct smc_llc_qentry *qentry;
	int rc;

	if (smcr_link_reg_rmb(link, smc->conn.rmb_desc))
		return SMC_CLC_DECL_ERR_REGRMB;

	/* send CONFIRM LINK request to client over the RoCE fabric */
	rc = smc_llc_send_confirm_link(link, SMC_LLC_REQ);
	if (rc < 0)
		return SMC_CLC_DECL_TIMEOUT_CL;

	/* receive CONFIRM LINK response from client over the RoCE fabric */
	qentry = smc_llc_wait(link->lgr, link, SMC_LLC_WAIT_TIME,
			      SMC_LLC_CONFIRM_LINK);
	if (!qentry) {
		struct smc_clc_msg_decline dclc;

		/* no response: check whether the client sent a CLC decline */
		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
		return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
	}
	smc_llc_save_peer_uid(qentry);
	rc = smc_llc_eval_conf_link(qentry, SMC_LLC_RESP);
	/* the queue entry is deleted before the result is evaluated */
	smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl);
	if (rc)
		return SMC_CLC_DECL_RMBE_EC;

	/* confirm_rkey is implicit on 1st contact */
	smc->conn.rmb_desc->is_conf_rkey = true;

	smc_llc_link_active(link);
	smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE);

	/* initial contact - try to establish second link */
	smc_llc_srv_add_link(link, NULL);
	return 0;
}
   1787
   1788/* listen worker: finish */
   1789static void smc_listen_out(struct smc_sock *new_smc)
   1790{
   1791	struct smc_sock *lsmc = new_smc->listen_smc;
   1792	struct sock *newsmcsk = &new_smc->sk;
   1793
   1794	if (tcp_sk(new_smc->clcsock->sk)->syn_smc)
   1795		atomic_dec(&lsmc->queued_smc_hs);
   1796
   1797	if (lsmc->sk.sk_state == SMC_LISTEN) {
   1798		lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
   1799		smc_accept_enqueue(&lsmc->sk, newsmcsk);
   1800		release_sock(&lsmc->sk);
   1801	} else { /* no longer listening */
   1802		smc_close_non_accepted(newsmcsk);
   1803	}
   1804
   1805	/* Wake up accept */
   1806	lsmc->sk.sk_data_ready(&lsmc->sk);
   1807	sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
   1808}
   1809
   1810/* listen worker: finish in state connected */
   1811static void smc_listen_out_connected(struct smc_sock *new_smc)
   1812{
   1813	struct sock *newsmcsk = &new_smc->sk;
   1814
   1815	sk_refcnt_debug_inc(newsmcsk);
   1816	if (newsmcsk->sk_state == SMC_INIT)
   1817		newsmcsk->sk_state = SMC_ACTIVE;
   1818
   1819	smc_listen_out(new_smc);
   1820}
   1821
   1822/* listen worker: finish in error state */
   1823static void smc_listen_out_err(struct smc_sock *new_smc)
   1824{
   1825	struct sock *newsmcsk = &new_smc->sk;
   1826	struct net *net = sock_net(newsmcsk);
   1827
   1828	this_cpu_inc(net->smc.smc_stats->srv_hshake_err_cnt);
   1829	if (newsmcsk->sk_state == SMC_INIT)
   1830		sock_put(&new_smc->sk); /* passive closing */
   1831	newsmcsk->sk_state = SMC_CLOSED;
   1832
   1833	smc_listen_out(new_smc);
   1834}
   1835
   1836/* listen worker: decline and fall back if possible */
   1837static void smc_listen_decline(struct smc_sock *new_smc, int reason_code,
   1838			       int local_first, u8 version)
   1839{
   1840	/* RDMA setup failed, switch back to TCP */
   1841	smc_conn_abort(new_smc, local_first);
   1842	if (reason_code < 0 ||
   1843	    smc_switch_to_fallback(new_smc, reason_code)) {
   1844		/* error, no fallback possible */
   1845		smc_listen_out_err(new_smc);
   1846		return;
   1847	}
   1848	if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) {
   1849		if (smc_clc_send_decline(new_smc, reason_code, version) < 0) {
   1850			smc_listen_out_err(new_smc);
   1851			return;
   1852		}
   1853	}
   1854	smc_listen_out_connected(new_smc);
   1855}
   1856
/* listen worker: version checking
 *
 * Derives from the received proposal @pclc which SMC-D / SMC-R versions the
 * peer offers and stores them in ini->smcd_version and ini->smcr_version.
 * The SMC_V2 bits are cleared again when a V2 precondition is not met.
 *
 * Returns 0 if at least one version remains. Otherwise the reason code that
 * was assigned last is returned, SMC_CLC_DECL_PEERNOSMC by default.
 */
static int smc_listen_v2_check(struct smc_sock *new_smc,
			       struct smc_clc_msg_proposal *pclc,
			       struct smc_init_info *ini)
{
	struct smc_clc_smcd_v2_extension *pclc_smcd_v2_ext;
	struct smc_clc_v2_extension *pclc_v2_ext;
	int rc = SMC_CLC_DECL_PEERNOSMC;

	ini->smc_type_v1 = pclc->hdr.typev1;
	ini->smc_type_v2 = pclc->hdr.typev2;
	ini->smcd_version = smcd_indicated(ini->smc_type_v1) ? SMC_V1 : 0;
	ini->smcr_version = smcr_indicated(ini->smc_type_v1) ? SMC_V1 : 0;
	/* the V2 type is only evaluated for proposals above V1 */
	if (pclc->hdr.version > SMC_V1) {
		if (smcd_indicated(ini->smc_type_v2))
			ini->smcd_version |= SMC_V2;
		if (smcr_indicated(ini->smc_type_v2))
			ini->smcr_version |= SMC_V2;
	}
	/* no V2 offered at all: the V2 checks below do not apply */
	if (!(ini->smcd_version & SMC_V2) && !(ini->smcr_version & SMC_V2)) {
		rc = SMC_CLC_DECL_PEERNOSMC;
		goto out;
	}
	pclc_v2_ext = smc_get_clc_v2_ext(pclc);
	if (!pclc_v2_ext) {
		ini->smcd_version &= ~SMC_V2;
		ini->smcr_version &= ~SMC_V2;
		rc = SMC_CLC_DECL_NOV2EXT;
		goto out;
	}
	pclc_smcd_v2_ext = smc_get_clc_smcd_v2_ext(pclc_v2_ext);
	if (ini->smcd_version & SMC_V2) {
		if (!smc_ism_is_v2_capable()) {
			ini->smcd_version &= ~SMC_V2;
			rc = SMC_CLC_DECL_NOISM2SUPP;
		} else if (!pclc_smcd_v2_ext) {
			ini->smcd_version &= ~SMC_V2;
			rc = SMC_CLC_DECL_NOV2DEXT;
		} else if (!pclc_v2_ext->hdr.eid_cnt &&
			   !pclc_v2_ext->hdr.flag.seid) {
			ini->smcd_version &= ~SMC_V2;
			rc = SMC_CLC_DECL_NOUEID;
		}
	}
	/* checked after SMC-D, so its reason code overrides an earlier one */
	if (ini->smcr_version & SMC_V2) {
		if (!pclc_v2_ext->hdr.eid_cnt) {
			ini->smcr_version &= ~SMC_V2;
			rc = SMC_CLC_DECL_NOUEID;
		}
	}

out:
	/* rc only matters if nothing usable is left */
	if (!ini->smcd_version && !ini->smcr_version)
		return rc;

	return 0;
}
   1914
   1915/* listen worker: check prefixes */
   1916static int smc_listen_prfx_check(struct smc_sock *new_smc,
   1917				 struct smc_clc_msg_proposal *pclc)
   1918{
   1919	struct smc_clc_msg_proposal_prefix *pclc_prfx;
   1920	struct socket *newclcsock = new_smc->clcsock;
   1921
   1922	if (pclc->hdr.typev1 == SMC_TYPE_N)
   1923		return 0;
   1924	pclc_prfx = smc_clc_proposal_get_prefix(pclc);
   1925	if (smc_clc_prfx_match(newclcsock, pclc_prfx))
   1926		return SMC_CLC_DECL_DIFFPREFIX;
   1927
   1928	return 0;
   1929}
   1930
   1931/* listen worker: initialize connection and buffers */
   1932static int smc_listen_rdma_init(struct smc_sock *new_smc,
   1933				struct smc_init_info *ini)
   1934{
   1935	int rc;
   1936
   1937	/* allocate connection / link group */
   1938	rc = smc_conn_create(new_smc, ini);
   1939	if (rc)
   1940		return rc;
   1941
   1942	/* create send buffer and rmb */
   1943	if (smc_buf_create(new_smc, false))
   1944		return SMC_CLC_DECL_MEM;
   1945
   1946	return 0;
   1947}
   1948
   1949/* listen worker: initialize connection and buffers for SMC-D */
   1950static int smc_listen_ism_init(struct smc_sock *new_smc,
   1951			       struct smc_init_info *ini)
   1952{
   1953	int rc;
   1954
   1955	rc = smc_conn_create(new_smc, ini);
   1956	if (rc)
   1957		return rc;
   1958
   1959	/* Create send and receive buffers */
   1960	rc = smc_buf_create(new_smc, true);
   1961	if (rc) {
   1962		smc_conn_abort(new_smc, ini->first_contact_local);
   1963		return (rc == -ENOSPC) ? SMC_CLC_DECL_MAX_DMB :
   1964					 SMC_CLC_DECL_MEM;
   1965	}
   1966
   1967	return 0;
   1968}
   1969
   1970static bool smc_is_already_selected(struct smcd_dev *smcd,
   1971				    struct smc_init_info *ini,
   1972				    int matches)
   1973{
   1974	int i;
   1975
   1976	for (i = 0; i < matches; i++)
   1977		if (smcd == ini->ism_dev[i])
   1978			return true;
   1979
   1980	return false;
   1981}
   1982
   1983/* check for ISM devices matching proposed ISM devices */
   1984static void smc_check_ism_v2_match(struct smc_init_info *ini,
   1985				   u16 proposed_chid, u64 proposed_gid,
   1986				   unsigned int *matches)
   1987{
   1988	struct smcd_dev *smcd;
   1989
   1990	list_for_each_entry(smcd, &smcd_dev_list.list, list) {
   1991		if (smcd->going_away)
   1992			continue;
   1993		if (smc_is_already_selected(smcd, ini, *matches))
   1994			continue;
   1995		if (smc_ism_get_chid(smcd) == proposed_chid &&
   1996		    !smc_ism_cantalk(proposed_gid, ISM_RESERVED_VLANID, smcd)) {
   1997			ini->ism_peer_gid[*matches] = proposed_gid;
   1998			ini->ism_dev[*matches] = smcd;
   1999			(*matches)++;
   2000			break;
   2001		}
   2002	}
   2003}
   2004
   2005static void smc_find_ism_store_rc(u32 rc, struct smc_init_info *ini)
   2006{
   2007	if (!ini->rc)
   2008		ini->rc = rc;
   2009}
   2010
   2011static void smc_find_ism_v2_device_serv(struct smc_sock *new_smc,
   2012					struct smc_clc_msg_proposal *pclc,
   2013					struct smc_init_info *ini)
   2014{
   2015	struct smc_clc_smcd_v2_extension *smcd_v2_ext;
   2016	struct smc_clc_v2_extension *smc_v2_ext;
   2017	struct smc_clc_msg_smcd *pclc_smcd;
   2018	unsigned int matches = 0;
   2019	u8 smcd_version;
   2020	u8 *eid = NULL;
   2021	int i, rc;
   2022
   2023	if (!(ini->smcd_version & SMC_V2) || !smcd_indicated(ini->smc_type_v2))
   2024		goto not_found;
   2025
   2026	pclc_smcd = smc_get_clc_msg_smcd(pclc);
   2027	smc_v2_ext = smc_get_clc_v2_ext(pclc);
   2028	smcd_v2_ext = smc_get_clc_smcd_v2_ext(smc_v2_ext);
   2029
   2030	mutex_lock(&smcd_dev_list.mutex);
   2031	if (pclc_smcd->ism.chid)
   2032		/* check for ISM device matching proposed native ISM device */
   2033		smc_check_ism_v2_match(ini, ntohs(pclc_smcd->ism.chid),
   2034				       ntohll(pclc_smcd->ism.gid), &matches);
   2035	for (i = 1; i <= smc_v2_ext->hdr.ism_gid_cnt; i++) {
   2036		/* check for ISM devices matching proposed non-native ISM
   2037		 * devices
   2038		 */
   2039		smc_check_ism_v2_match(ini,
   2040				       ntohs(smcd_v2_ext->gidchid[i - 1].chid),
   2041				       ntohll(smcd_v2_ext->gidchid[i - 1].gid),
   2042				       &matches);
   2043	}
   2044	mutex_unlock(&smcd_dev_list.mutex);
   2045
   2046	if (!ini->ism_dev[0]) {
   2047		smc_find_ism_store_rc(SMC_CLC_DECL_NOSMCD2DEV, ini);
   2048		goto not_found;
   2049	}
   2050
   2051	smc_ism_get_system_eid(&eid);
   2052	if (!smc_clc_match_eid(ini->negotiated_eid, smc_v2_ext,
   2053			       smcd_v2_ext->system_eid, eid))
   2054		goto not_found;
   2055
   2056	/* separate - outside the smcd_dev_list.lock */
   2057	smcd_version = ini->smcd_version;
   2058	for (i = 0; i < matches; i++) {
   2059		ini->smcd_version = SMC_V2;
   2060		ini->is_smcd = true;
   2061		ini->ism_selected = i;
   2062		rc = smc_listen_ism_init(new_smc, ini);
   2063		if (rc) {
   2064			smc_find_ism_store_rc(rc, ini);
   2065			/* try next active ISM device */
   2066			continue;
   2067		}
   2068		return; /* matching and usable V2 ISM device found */
   2069	}
   2070	/* no V2 ISM device could be initialized */
   2071	ini->smcd_version = smcd_version;	/* restore original value */
   2072	ini->negotiated_eid[0] = 0;
   2073
   2074not_found:
   2075	ini->smcd_version &= ~SMC_V2;
   2076	ini->ism_dev[0] = NULL;
   2077	ini->is_smcd = false;
   2078}
   2079
   2080static void smc_find_ism_v1_device_serv(struct smc_sock *new_smc,
   2081					struct smc_clc_msg_proposal *pclc,
   2082					struct smc_init_info *ini)
   2083{
   2084	struct smc_clc_msg_smcd *pclc_smcd = smc_get_clc_msg_smcd(pclc);
   2085	int rc = 0;
   2086
   2087	/* check if ISM V1 is available */
   2088	if (!(ini->smcd_version & SMC_V1) || !smcd_indicated(ini->smc_type_v1))
   2089		goto not_found;
   2090	ini->is_smcd = true; /* prepare ISM check */
   2091	ini->ism_peer_gid[0] = ntohll(pclc_smcd->ism.gid);
   2092	rc = smc_find_ism_device(new_smc, ini);
   2093	if (rc)
   2094		goto not_found;
   2095	ini->ism_selected = 0;
   2096	rc = smc_listen_ism_init(new_smc, ini);
   2097	if (!rc)
   2098		return;		/* V1 ISM device found */
   2099
   2100not_found:
   2101	smc_find_ism_store_rc(rc, ini);
   2102	ini->smcd_version &= ~SMC_V1;
   2103	ini->ism_dev[0] = NULL;
   2104	ini->is_smcd = false;
   2105}
   2106
   2107/* listen worker: register buffers */
   2108static int smc_listen_rdma_reg(struct smc_sock *new_smc, bool local_first)
   2109{
   2110	struct smc_connection *conn = &new_smc->conn;
   2111
   2112	if (!local_first) {
   2113		if (smcr_lgr_reg_rmbs(conn->lnk, conn->rmb_desc))
   2114			return SMC_CLC_DECL_ERR_REGRMB;
   2115	}
   2116	smc_rmb_sync_sg_for_device(&new_smc->conn);
   2117
   2118	return 0;
   2119}
   2120
   2121static void smc_find_rdma_v2_device_serv(struct smc_sock *new_smc,
   2122					 struct smc_clc_msg_proposal *pclc,
   2123					 struct smc_init_info *ini)
   2124{
   2125	struct smc_clc_v2_extension *smc_v2_ext;
   2126	u8 smcr_version;
   2127	int rc;
   2128
   2129	if (!(ini->smcr_version & SMC_V2) || !smcr_indicated(ini->smc_type_v2))
   2130		goto not_found;
   2131
   2132	smc_v2_ext = smc_get_clc_v2_ext(pclc);
   2133	if (!smc_clc_match_eid(ini->negotiated_eid, smc_v2_ext, NULL, NULL))
   2134		goto not_found;
   2135
   2136	/* prepare RDMA check */
   2137	memcpy(ini->peer_systemid, pclc->lcl.id_for_peer, SMC_SYSTEMID_LEN);
   2138	memcpy(ini->peer_gid, smc_v2_ext->roce, SMC_GID_SIZE);
   2139	memcpy(ini->peer_mac, pclc->lcl.mac, ETH_ALEN);
   2140	ini->check_smcrv2 = true;
   2141	ini->smcrv2.clc_sk = new_smc->clcsock->sk;
   2142	ini->smcrv2.saddr = new_smc->clcsock->sk->sk_rcv_saddr;
   2143	ini->smcrv2.daddr = smc_ib_gid_to_ipv4(smc_v2_ext->roce);
   2144	rc = smc_find_rdma_device(new_smc, ini);
   2145	if (rc) {
   2146		smc_find_ism_store_rc(rc, ini);
   2147		goto not_found;
   2148	}
   2149	if (!ini->smcrv2.uses_gateway)
   2150		memcpy(ini->smcrv2.nexthop_mac, pclc->lcl.mac, ETH_ALEN);
   2151
   2152	smcr_version = ini->smcr_version;
   2153	ini->smcr_version = SMC_V2;
   2154	rc = smc_listen_rdma_init(new_smc, ini);
   2155	if (!rc)
   2156		rc = smc_listen_rdma_reg(new_smc, ini->first_contact_local);
   2157	if (!rc)
   2158		return;
   2159	ini->smcr_version = smcr_version;
   2160	smc_find_ism_store_rc(rc, ini);
   2161
   2162not_found:
   2163	ini->smcr_version &= ~SMC_V2;
   2164	ini->smcrv2.ib_dev_v2 = NULL;
   2165	ini->check_smcrv2 = false;
   2166}
   2167
   2168static int smc_find_rdma_v1_device_serv(struct smc_sock *new_smc,
   2169					struct smc_clc_msg_proposal *pclc,
   2170					struct smc_init_info *ini)
   2171{
   2172	int rc;
   2173
   2174	if (!(ini->smcr_version & SMC_V1) || !smcr_indicated(ini->smc_type_v1))
   2175		return SMC_CLC_DECL_NOSMCDEV;
   2176
   2177	/* prepare RDMA check */
   2178	memcpy(ini->peer_systemid, pclc->lcl.id_for_peer, SMC_SYSTEMID_LEN);
   2179	memcpy(ini->peer_gid, pclc->lcl.gid, SMC_GID_SIZE);
   2180	memcpy(ini->peer_mac, pclc->lcl.mac, ETH_ALEN);
   2181	rc = smc_find_rdma_device(new_smc, ini);
   2182	if (rc) {
   2183		/* no RDMA device found */
   2184		return SMC_CLC_DECL_NOSMCDEV;
   2185	}
   2186	rc = smc_listen_rdma_init(new_smc, ini);
   2187	if (rc)
   2188		return rc;
   2189	return smc_listen_rdma_reg(new_smc, ini->first_contact_local);
   2190}
   2191
   2192/* determine the local device matching to proposal */
   2193static int smc_listen_find_device(struct smc_sock *new_smc,
   2194				  struct smc_clc_msg_proposal *pclc,
   2195				  struct smc_init_info *ini)
   2196{
   2197	int prfx_rc;
   2198
   2199	/* check for ISM device matching V2 proposed device */
   2200	smc_find_ism_v2_device_serv(new_smc, pclc, ini);
   2201	if (ini->ism_dev[0])
   2202		return 0;
   2203
   2204	/* check for matching IP prefix and subnet length (V1) */
   2205	prfx_rc = smc_listen_prfx_check(new_smc, pclc);
   2206	if (prfx_rc)
   2207		smc_find_ism_store_rc(prfx_rc, ini);
   2208
   2209	/* get vlan id from IP device */
   2210	if (smc_vlan_by_tcpsk(new_smc->clcsock, ini))
   2211		return ini->rc ?: SMC_CLC_DECL_GETVLANERR;
   2212
   2213	/* check for ISM device matching V1 proposed device */
   2214	if (!prfx_rc)
   2215		smc_find_ism_v1_device_serv(new_smc, pclc, ini);
   2216	if (ini->ism_dev[0])
   2217		return 0;
   2218
   2219	if (!smcr_indicated(pclc->hdr.typev1) &&
   2220	    !smcr_indicated(pclc->hdr.typev2))
   2221		/* skip RDMA and decline */
   2222		return ini->rc ?: SMC_CLC_DECL_NOSMCDDEV;
   2223
   2224	/* check if RDMA V2 is available */
   2225	smc_find_rdma_v2_device_serv(new_smc, pclc, ini);
   2226	if (ini->smcrv2.ib_dev_v2)
   2227		return 0;
   2228
   2229	/* check if RDMA V1 is available */
   2230	if (!prfx_rc) {
   2231		int rc;
   2232
   2233		rc = smc_find_rdma_v1_device_serv(new_smc, pclc, ini);
   2234		smc_find_ism_store_rc(rc, ini);
   2235		return (!rc) ? 0 : ini->rc;
   2236	}
   2237	return SMC_CLC_DECL_NOSMCDEV;
   2238}
   2239
   2240/* listen worker: finish RDMA setup */
   2241static int smc_listen_rdma_finish(struct smc_sock *new_smc,
   2242				  struct smc_clc_msg_accept_confirm *cclc,
   2243				  bool local_first,
   2244				  struct smc_init_info *ini)
   2245{
   2246	struct smc_link *link = new_smc->conn.lnk;
   2247	int reason_code = 0;
   2248
   2249	if (local_first)
   2250		smc_link_save_peer_info(link, cclc, ini);
   2251
   2252	if (smc_rmb_rtoken_handling(&new_smc->conn, link, cclc))
   2253		return SMC_CLC_DECL_ERR_RTOK;
   2254
   2255	if (local_first) {
   2256		if (smc_ib_ready_link(link))
   2257			return SMC_CLC_DECL_ERR_RDYLNK;
   2258		/* QP confirmation over RoCE fabric */
   2259		smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK);
   2260		reason_code = smcr_serv_conf_first_link(new_smc);
   2261		smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl);
   2262	}
   2263	return reason_code;
   2264}
   2265
   2266/* setup for connection of server */
   2267static void smc_listen_work(struct work_struct *work)
   2268{
   2269	struct smc_sock *new_smc = container_of(work, struct smc_sock,
   2270						smc_listen_work);
   2271	struct socket *newclcsock = new_smc->clcsock;
   2272	struct smc_clc_msg_accept_confirm *cclc;
   2273	struct smc_clc_msg_proposal_area *buf;
   2274	struct smc_clc_msg_proposal *pclc;
   2275	struct smc_init_info *ini = NULL;
   2276	u8 proposal_version = SMC_V1;
   2277	u8 accept_version;
   2278	int rc = 0;
   2279
   2280	if (new_smc->listen_smc->sk.sk_state != SMC_LISTEN)
   2281		return smc_listen_out_err(new_smc);
   2282
   2283	if (new_smc->use_fallback) {
   2284		smc_listen_out_connected(new_smc);
   2285		return;
   2286	}
   2287
   2288	/* check if peer is smc capable */
   2289	if (!tcp_sk(newclcsock->sk)->syn_smc) {
   2290		rc = smc_switch_to_fallback(new_smc, SMC_CLC_DECL_PEERNOSMC);
   2291		if (rc)
   2292			smc_listen_out_err(new_smc);
   2293		else
   2294			smc_listen_out_connected(new_smc);
   2295		return;
   2296	}
   2297
   2298	/* do inband token exchange -
   2299	 * wait for and receive SMC Proposal CLC message
   2300	 */
   2301	buf = kzalloc(sizeof(*buf), GFP_KERNEL);
   2302	if (!buf) {
   2303		rc = SMC_CLC_DECL_MEM;
   2304		goto out_decl;
   2305	}
   2306	pclc = (struct smc_clc_msg_proposal *)buf;
   2307	rc = smc_clc_wait_msg(new_smc, pclc, sizeof(*buf),
   2308			      SMC_CLC_PROPOSAL, CLC_WAIT_TIME);
   2309	if (rc)
   2310		goto out_decl;
   2311
   2312	if (pclc->hdr.version > SMC_V1)
   2313		proposal_version = SMC_V2;
   2314
   2315	/* IPSec connections opt out of SMC optimizations */
   2316	if (using_ipsec(new_smc)) {
   2317		rc = SMC_CLC_DECL_IPSEC;
   2318		goto out_decl;
   2319	}
   2320
   2321	ini = kzalloc(sizeof(*ini), GFP_KERNEL);
   2322	if (!ini) {
   2323		rc = SMC_CLC_DECL_MEM;
   2324		goto out_decl;
   2325	}
   2326
   2327	/* initial version checking */
   2328	rc = smc_listen_v2_check(new_smc, pclc, ini);
   2329	if (rc)
   2330		goto out_decl;
   2331
   2332	mutex_lock(&smc_server_lgr_pending);
   2333	smc_close_init(new_smc);
   2334	smc_rx_init(new_smc);
   2335	smc_tx_init(new_smc);
   2336
   2337	/* determine ISM or RoCE device used for connection */
   2338	rc = smc_listen_find_device(new_smc, pclc, ini);
   2339	if (rc)
   2340		goto out_unlock;
   2341
   2342	/* send SMC Accept CLC message */
   2343	accept_version = ini->is_smcd ? ini->smcd_version : ini->smcr_version;
   2344	rc = smc_clc_send_accept(new_smc, ini->first_contact_local,
   2345				 accept_version, ini->negotiated_eid);
   2346	if (rc)
   2347		goto out_unlock;
   2348
   2349	/* SMC-D does not need this lock any more */
   2350	if (ini->is_smcd)
   2351		mutex_unlock(&smc_server_lgr_pending);
   2352
   2353	/* receive SMC Confirm CLC message */
   2354	memset(buf, 0, sizeof(*buf));
   2355	cclc = (struct smc_clc_msg_accept_confirm *)buf;
   2356	rc = smc_clc_wait_msg(new_smc, cclc, sizeof(*buf),
   2357			      SMC_CLC_CONFIRM, CLC_WAIT_TIME);
   2358	if (rc) {
   2359		if (!ini->is_smcd)
   2360			goto out_unlock;
   2361		goto out_decl;
   2362	}
   2363
   2364	/* finish worker */
   2365	if (!ini->is_smcd) {
   2366		rc = smc_listen_rdma_finish(new_smc, cclc,
   2367					    ini->first_contact_local, ini);
   2368		if (rc)
   2369			goto out_unlock;
   2370		mutex_unlock(&smc_server_lgr_pending);
   2371	}
   2372	smc_conn_save_peer_info(new_smc, cclc);
   2373	smc_listen_out_connected(new_smc);
   2374	SMC_STAT_SERV_SUCC_INC(sock_net(newclcsock->sk), ini);
   2375	goto out_free;
   2376
   2377out_unlock:
   2378	mutex_unlock(&smc_server_lgr_pending);
   2379out_decl:
   2380	smc_listen_decline(new_smc, rc, ini ? ini->first_contact_local : 0,
   2381			   proposal_version);
   2382out_free:
   2383	kfree(ini);
   2384	kfree(buf);
   2385}
   2386
   2387static void smc_tcp_listen_work(struct work_struct *work)
   2388{
   2389	struct smc_sock *lsmc = container_of(work, struct smc_sock,
   2390					     tcp_listen_work);
   2391	struct sock *lsk = &lsmc->sk;
   2392	struct smc_sock *new_smc;
   2393	int rc = 0;
   2394
   2395	lock_sock(lsk);
   2396	while (lsk->sk_state == SMC_LISTEN) {
   2397		rc = smc_clcsock_accept(lsmc, &new_smc);
   2398		if (rc) /* clcsock accept queue empty or error */
   2399			goto out;
   2400		if (!new_smc)
   2401			continue;
   2402
   2403		if (tcp_sk(new_smc->clcsock->sk)->syn_smc)
   2404			atomic_inc(&lsmc->queued_smc_hs);
   2405
   2406		new_smc->listen_smc = lsmc;
   2407		new_smc->use_fallback = lsmc->use_fallback;
   2408		new_smc->fallback_rsn = lsmc->fallback_rsn;
   2409		sock_hold(lsk); /* sock_put in smc_listen_work */
   2410		INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
   2411		smc_copy_sock_settings_to_smc(new_smc);
   2412		new_smc->sk.sk_sndbuf = lsmc->sk.sk_sndbuf;
   2413		new_smc->sk.sk_rcvbuf = lsmc->sk.sk_rcvbuf;
   2414		sock_hold(&new_smc->sk); /* sock_put in passive closing */
   2415		if (!queue_work(smc_hs_wq, &new_smc->smc_listen_work))
   2416			sock_put(&new_smc->sk);
   2417	}
   2418
   2419out:
   2420	release_sock(lsk);
   2421	sock_put(&lsmc->sk); /* sock_hold in smc_clcsock_data_ready() */
   2422}
   2423
   2424static void smc_clcsock_data_ready(struct sock *listen_clcsock)
   2425{
   2426	struct smc_sock *lsmc;
   2427
   2428	read_lock_bh(&listen_clcsock->sk_callback_lock);
   2429	lsmc = smc_clcsock_user_data(listen_clcsock);
   2430	if (!lsmc)
   2431		goto out;
   2432	lsmc->clcsk_data_ready(listen_clcsock);
   2433	if (lsmc->sk.sk_state == SMC_LISTEN) {
   2434		sock_hold(&lsmc->sk); /* sock_put in smc_tcp_listen_work() */
   2435		if (!queue_work(smc_tcp_ls_wq, &lsmc->tcp_listen_work))
   2436			sock_put(&lsmc->sk);
   2437	}
   2438out:
   2439	read_unlock_bh(&listen_clcsock->sk_callback_lock);
   2440}
   2441
   2442static int smc_listen(struct socket *sock, int backlog)
   2443{
   2444	struct sock *sk = sock->sk;
   2445	struct smc_sock *smc;
   2446	int rc;
   2447
   2448	smc = smc_sk(sk);
   2449	lock_sock(sk);
   2450
   2451	rc = -EINVAL;
   2452	if ((sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) ||
   2453	    smc->connect_nonblock || sock->state != SS_UNCONNECTED)
   2454		goto out;
   2455
   2456	rc = 0;
   2457	if (sk->sk_state == SMC_LISTEN) {
   2458		sk->sk_max_ack_backlog = backlog;
   2459		goto out;
   2460	}
   2461	/* some socket options are handled in core, so we could not apply
   2462	 * them to the clc socket -- copy smc socket options to clc socket
   2463	 */
   2464	smc_copy_sock_settings_to_clc(smc);
   2465	if (!smc->use_fallback)
   2466		tcp_sk(smc->clcsock->sk)->syn_smc = 1;
   2467
   2468	/* save original sk_data_ready function and establish
   2469	 * smc-specific sk_data_ready function
   2470	 */
   2471	write_lock_bh(&smc->clcsock->sk->sk_callback_lock);
   2472	smc->clcsock->sk->sk_user_data =
   2473		(void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY);
   2474	smc_clcsock_replace_cb(&smc->clcsock->sk->sk_data_ready,
   2475			       smc_clcsock_data_ready, &smc->clcsk_data_ready);
   2476	write_unlock_bh(&smc->clcsock->sk->sk_callback_lock);
   2477
   2478	/* save original ops */
   2479	smc->ori_af_ops = inet_csk(smc->clcsock->sk)->icsk_af_ops;
   2480
   2481	smc->af_ops = *smc->ori_af_ops;
   2482	smc->af_ops.syn_recv_sock = smc_tcp_syn_recv_sock;
   2483
   2484	inet_csk(smc->clcsock->sk)->icsk_af_ops = &smc->af_ops;
   2485
   2486	if (smc->limit_smc_hs)
   2487		tcp_sk(smc->clcsock->sk)->smc_hs_congested = smc_hs_congested;
   2488
   2489	rc = kernel_listen(smc->clcsock, backlog);
   2490	if (rc) {
   2491		write_lock_bh(&smc->clcsock->sk->sk_callback_lock);
   2492		smc_clcsock_restore_cb(&smc->clcsock->sk->sk_data_ready,
   2493				       &smc->clcsk_data_ready);
   2494		smc->clcsock->sk->sk_user_data = NULL;
   2495		write_unlock_bh(&smc->clcsock->sk->sk_callback_lock);
   2496		goto out;
   2497	}
   2498	sk->sk_max_ack_backlog = backlog;
   2499	sk->sk_ack_backlog = 0;
   2500	sk->sk_state = SMC_LISTEN;
   2501
   2502out:
   2503	release_sock(sk);
   2504	return rc;
   2505}
   2506
   2507static int smc_accept(struct socket *sock, struct socket *new_sock,
   2508		      int flags, bool kern)
   2509{
   2510	struct sock *sk = sock->sk, *nsk;
   2511	DECLARE_WAITQUEUE(wait, current);
   2512	struct smc_sock *lsmc;
   2513	long timeo;
   2514	int rc = 0;
   2515
   2516	lsmc = smc_sk(sk);
   2517	sock_hold(sk); /* sock_put below */
   2518	lock_sock(sk);
   2519
   2520	if (lsmc->sk.sk_state != SMC_LISTEN) {
   2521		rc = -EINVAL;
   2522		release_sock(sk);
   2523		goto out;
   2524	}
   2525
   2526	/* Wait for an incoming connection */
   2527	timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
   2528	add_wait_queue_exclusive(sk_sleep(sk), &wait);
   2529	while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
   2530		set_current_state(TASK_INTERRUPTIBLE);
   2531		if (!timeo) {
   2532			rc = -EAGAIN;
   2533			break;
   2534		}
   2535		release_sock(sk);
   2536		timeo = schedule_timeout(timeo);
   2537		/* wakeup by sk_data_ready in smc_listen_work() */
   2538		sched_annotate_sleep();
   2539		lock_sock(sk);
   2540		if (signal_pending(current)) {
   2541			rc = sock_intr_errno(timeo);
   2542			break;
   2543		}
   2544	}
   2545	set_current_state(TASK_RUNNING);
   2546	remove_wait_queue(sk_sleep(sk), &wait);
   2547
   2548	if (!rc)
   2549		rc = sock_error(nsk);
   2550	release_sock(sk);
   2551	if (rc)
   2552		goto out;
   2553
   2554	if (lsmc->sockopt_defer_accept && !(flags & O_NONBLOCK)) {
   2555		/* wait till data arrives on the socket */
   2556		timeo = msecs_to_jiffies(lsmc->sockopt_defer_accept *
   2557								MSEC_PER_SEC);
   2558		if (smc_sk(nsk)->use_fallback) {
   2559			struct sock *clcsk = smc_sk(nsk)->clcsock->sk;
   2560
   2561			lock_sock(clcsk);
   2562			if (skb_queue_empty(&clcsk->sk_receive_queue))
   2563				sk_wait_data(clcsk, &timeo, NULL);
   2564			release_sock(clcsk);
   2565		} else if (!atomic_read(&smc_sk(nsk)->conn.bytes_to_rcv)) {
   2566			lock_sock(nsk);
   2567			smc_rx_wait(smc_sk(nsk), &timeo, smc_rx_data_available);
   2568			release_sock(nsk);
   2569		}
   2570	}
   2571
   2572out:
   2573	sock_put(sk); /* sock_hold above */
   2574	return rc;
   2575}
   2576
   2577static int smc_getname(struct socket *sock, struct sockaddr *addr,
   2578		       int peer)
   2579{
   2580	struct smc_sock *smc;
   2581
   2582	if (peer && (sock->sk->sk_state != SMC_ACTIVE) &&
   2583	    (sock->sk->sk_state != SMC_APPCLOSEWAIT1))
   2584		return -ENOTCONN;
   2585
   2586	smc = smc_sk(sock->sk);
   2587
   2588	return smc->clcsock->ops->getname(smc->clcsock, addr, peer);
   2589}
   2590
   2591static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
   2592{
   2593	struct sock *sk = sock->sk;
   2594	struct smc_sock *smc;
   2595	int rc = -EPIPE;
   2596
   2597	smc = smc_sk(sk);
   2598	lock_sock(sk);
   2599	if ((sk->sk_state != SMC_ACTIVE) &&
   2600	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
   2601	    (sk->sk_state != SMC_INIT))
   2602		goto out;
   2603
   2604	if (msg->msg_flags & MSG_FASTOPEN) {
   2605		if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) {
   2606			rc = smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP);
   2607			if (rc)
   2608				goto out;
   2609		} else {
   2610			rc = -EINVAL;
   2611			goto out;
   2612		}
   2613	}
   2614
   2615	if (smc->use_fallback) {
   2616		rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
   2617	} else {
   2618		rc = smc_tx_sendmsg(smc, msg, len);
   2619		SMC_STAT_TX_PAYLOAD(smc, len, rc);
   2620	}
   2621out:
   2622	release_sock(sk);
   2623	return rc;
   2624}
   2625
   2626static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
   2627		       int flags)
   2628{
   2629	struct sock *sk = sock->sk;
   2630	struct smc_sock *smc;
   2631	int rc = -ENOTCONN;
   2632
   2633	smc = smc_sk(sk);
   2634	lock_sock(sk);
   2635	if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) {
   2636		/* socket was connected before, no more data to read */
   2637		rc = 0;
   2638		goto out;
   2639	}
   2640	if ((sk->sk_state == SMC_INIT) ||
   2641	    (sk->sk_state == SMC_LISTEN) ||
   2642	    (sk->sk_state == SMC_CLOSED))
   2643		goto out;
   2644
   2645	if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
   2646		rc = 0;
   2647		goto out;
   2648	}
   2649
   2650	if (smc->use_fallback) {
   2651		rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
   2652	} else {
   2653		msg->msg_namelen = 0;
   2654		rc = smc_rx_recvmsg(smc, msg, NULL, len, flags);
   2655		SMC_STAT_RX_PAYLOAD(smc, rc, rc);
   2656	}
   2657
   2658out:
   2659	release_sock(sk);
   2660	return rc;
   2661}
   2662
   2663static __poll_t smc_accept_poll(struct sock *parent)
   2664{
   2665	struct smc_sock *isk = smc_sk(parent);
   2666	__poll_t mask = 0;
   2667
   2668	spin_lock(&isk->accept_q_lock);
   2669	if (!list_empty(&isk->accept_q))
   2670		mask = EPOLLIN | EPOLLRDNORM;
   2671	spin_unlock(&isk->accept_q_lock);
   2672
   2673	return mask;
   2674}
   2675
   2676static __poll_t smc_poll(struct file *file, struct socket *sock,
   2677			     poll_table *wait)
   2678{
   2679	struct sock *sk = sock->sk;
   2680	struct smc_sock *smc;
   2681	__poll_t mask = 0;
   2682
   2683	if (!sk)
   2684		return EPOLLNVAL;
   2685
   2686	smc = smc_sk(sock->sk);
   2687	if (smc->use_fallback) {
   2688		/* delegate to CLC child sock */
   2689		mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
   2690		sk->sk_err = smc->clcsock->sk->sk_err;
   2691	} else {
   2692		if (sk->sk_state != SMC_CLOSED)
   2693			sock_poll_wait(file, sock, wait);
   2694		if (sk->sk_err)
   2695			mask |= EPOLLERR;
   2696		if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
   2697		    (sk->sk_state == SMC_CLOSED))
   2698			mask |= EPOLLHUP;
   2699		if (sk->sk_state == SMC_LISTEN) {
   2700			/* woken up by sk_data_ready in smc_listen_work() */
   2701			mask |= smc_accept_poll(sk);
   2702		} else if (smc->use_fallback) { /* as result of connect_work()*/
   2703			mask |= smc->clcsock->ops->poll(file, smc->clcsock,
   2704							   wait);
   2705			sk->sk_err = smc->clcsock->sk->sk_err;
   2706		} else {
   2707			if ((sk->sk_state != SMC_INIT &&
   2708			     atomic_read(&smc->conn.sndbuf_space)) ||
   2709			    sk->sk_shutdown & SEND_SHUTDOWN) {
   2710				mask |= EPOLLOUT | EPOLLWRNORM;
   2711			} else {
   2712				sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
   2713				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
   2714			}
   2715			if (atomic_read(&smc->conn.bytes_to_rcv))
   2716				mask |= EPOLLIN | EPOLLRDNORM;
   2717			if (sk->sk_shutdown & RCV_SHUTDOWN)
   2718				mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
   2719			if (sk->sk_state == SMC_APPCLOSEWAIT1)
   2720				mask |= EPOLLIN;
   2721			if (smc->conn.urg_state == SMC_URG_VALID)
   2722				mask |= EPOLLPRI;
   2723		}
   2724	}
   2725
   2726	return mask;
   2727}
   2728
   2729static int smc_shutdown(struct socket *sock, int how)
   2730{
   2731	struct sock *sk = sock->sk;
   2732	bool do_shutdown = true;
   2733	struct smc_sock *smc;
   2734	int rc = -EINVAL;
   2735	int old_state;
   2736	int rc1 = 0;
   2737
   2738	smc = smc_sk(sk);
   2739
   2740	if ((how < SHUT_RD) || (how > SHUT_RDWR))
   2741		return rc;
   2742
   2743	lock_sock(sk);
   2744
   2745	if (sock->state == SS_CONNECTING) {
   2746		if (sk->sk_state == SMC_ACTIVE)
   2747			sock->state = SS_CONNECTED;
   2748		else if (sk->sk_state == SMC_PEERCLOSEWAIT1 ||
   2749			 sk->sk_state == SMC_PEERCLOSEWAIT2 ||
   2750			 sk->sk_state == SMC_APPCLOSEWAIT1 ||
   2751			 sk->sk_state == SMC_APPCLOSEWAIT2 ||
   2752			 sk->sk_state == SMC_APPFINCLOSEWAIT)
   2753			sock->state = SS_DISCONNECTING;
   2754	}
   2755
   2756	rc = -ENOTCONN;
   2757	if ((sk->sk_state != SMC_ACTIVE) &&
   2758	    (sk->sk_state != SMC_PEERCLOSEWAIT1) &&
   2759	    (sk->sk_state != SMC_PEERCLOSEWAIT2) &&
   2760	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
   2761	    (sk->sk_state != SMC_APPCLOSEWAIT2) &&
   2762	    (sk->sk_state != SMC_APPFINCLOSEWAIT))
   2763		goto out;
   2764	if (smc->use_fallback) {
   2765		rc = kernel_sock_shutdown(smc->clcsock, how);
   2766		sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
   2767		if (sk->sk_shutdown == SHUTDOWN_MASK) {
   2768			sk->sk_state = SMC_CLOSED;
   2769			sk->sk_socket->state = SS_UNCONNECTED;
   2770			sock_put(sk);
   2771		}
   2772		goto out;
   2773	}
   2774	switch (how) {
   2775	case SHUT_RDWR:		/* shutdown in both directions */
   2776		old_state = sk->sk_state;
   2777		rc = smc_close_active(smc);
   2778		if (old_state == SMC_ACTIVE &&
   2779		    sk->sk_state == SMC_PEERCLOSEWAIT1)
   2780			do_shutdown = false;
   2781		break;
   2782	case SHUT_WR:
   2783		rc = smc_close_shutdown_write(smc);
   2784		break;
   2785	case SHUT_RD:
   2786		rc = 0;
   2787		/* nothing more to do because peer is not involved */
   2788		break;
   2789	}
   2790	if (do_shutdown && smc->clcsock)
   2791		rc1 = kernel_sock_shutdown(smc->clcsock, how);
   2792	/* map sock_shutdown_cmd constants to sk_shutdown value range */
   2793	sk->sk_shutdown |= how + 1;
   2794
   2795	if (sk->sk_state == SMC_CLOSED)
   2796		sock->state = SS_UNCONNECTED;
   2797	else
   2798		sock->state = SS_DISCONNECTING;
   2799out:
   2800	release_sock(sk);
   2801	return rc ? rc : rc1;
   2802}
   2803
   2804static int __smc_getsockopt(struct socket *sock, int level, int optname,
   2805			    char __user *optval, int __user *optlen)
   2806{
   2807	struct smc_sock *smc;
   2808	int val, len;
   2809
   2810	smc = smc_sk(sock->sk);
   2811
   2812	if (get_user(len, optlen))
   2813		return -EFAULT;
   2814
   2815	len = min_t(int, len, sizeof(int));
   2816
   2817	if (len < 0)
   2818		return -EINVAL;
   2819
   2820	switch (optname) {
   2821	case SMC_LIMIT_HS:
   2822		val = smc->limit_smc_hs;
   2823		break;
   2824	default:
   2825		return -EOPNOTSUPP;
   2826	}
   2827
   2828	if (put_user(len, optlen))
   2829		return -EFAULT;
   2830	if (copy_to_user(optval, &val, len))
   2831		return -EFAULT;
   2832
   2833	return 0;
   2834}
   2835
   2836static int __smc_setsockopt(struct socket *sock, int level, int optname,
   2837			    sockptr_t optval, unsigned int optlen)
   2838{
   2839	struct sock *sk = sock->sk;
   2840	struct smc_sock *smc;
   2841	int val, rc;
   2842
   2843	smc = smc_sk(sk);
   2844
   2845	lock_sock(sk);
   2846	switch (optname) {
   2847	case SMC_LIMIT_HS:
   2848		if (optlen < sizeof(int)) {
   2849			rc = -EINVAL;
   2850			break;
   2851		}
   2852		if (copy_from_sockptr(&val, optval, sizeof(int))) {
   2853			rc = -EFAULT;
   2854			break;
   2855		}
   2856
   2857		smc->limit_smc_hs = !!val;
   2858		rc = 0;
   2859		break;
   2860	default:
   2861		rc = -EOPNOTSUPP;
   2862		break;
   2863	}
   2864	release_sock(sk);
   2865
   2866	return rc;
   2867}
   2868
/* setsockopt() entry point of the SMC socket.
 *
 * - SOL_TCP/TCP_ULP is rejected with -EOPNOTSUPP.
 * - SOL_SMC options are delegated to __smc_setsockopt().
 * - Everything else is first applied to the internal CLC socket under
 *   clcsock_release_lock (-EBADF if that socket is already gone); a pending
 *   error on the CLC sock is mirrored to the SMC sock. Afterwards, unless the
 *   CLC call failed or the socket runs in fallback mode, a few TCP options
 *   get additional SMC-side handling.
 *
 * Returns 0 or a negative errno.
 */
static int smc_setsockopt(struct socket *sock, int level, int optname,
			  sockptr_t optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int val, rc;

	if (level == SOL_TCP && optname == TCP_ULP)
		return -EOPNOTSUPP;
	else if (level == SOL_SMC)
		return __smc_setsockopt(sock, level, optname, optval, optlen);

	smc = smc_sk(sk);

	/* generic setsockopts reaching us here always apply to the
	 * CLC socket
	 */
	mutex_lock(&smc->clcsock_release_lock);
	if (!smc->clcsock) {
		mutex_unlock(&smc->clcsock_release_lock);
		return -EBADF;
	}
	if (unlikely(!smc->clcsock->ops->setsockopt))
		rc = -EOPNOTSUPP;
	else
		rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
						   optval, optlen);
	/* propagate an error recorded on the CLC sock to the SMC sock */
	if (smc->clcsock->sk->sk_err) {
		sk->sk_err = smc->clcsock->sk->sk_err;
		sk_error_report(sk);
	}
	mutex_unlock(&smc->clcsock_release_lock);

	/* NOTE(review): this length check runs after the option has already
	 * been handed to the CLC socket, so a short optlen returns -EINVAL
	 * here regardless of the CLC result in rc - confirm this is intended.
	 */
	if (optlen < sizeof(int))
		return -EINVAL;
	if (copy_from_sockptr(&val, optval, sizeof(int)))
		return -EFAULT;

	lock_sock(sk);
	/* no SMC-side handling if the CLC call failed or in fallback mode */
	if (rc || smc->use_fallback)
		goto out;
	switch (optname) {
	case TCP_FASTOPEN:
	case TCP_FASTOPEN_CONNECT:
	case TCP_FASTOPEN_KEY:
	case TCP_FASTOPEN_NO_COOKIE:
		/* option not supported by SMC */
		if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) {
			rc = smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP);
		} else {
			rc = -EINVAL;
		}
		break;
	case TCP_NODELAY:
		/* only acts outside the INIT, LISTEN and CLOSED states */
		if (sk->sk_state != SMC_INIT &&
		    sk->sk_state != SMC_LISTEN &&
		    sk->sk_state != SMC_CLOSED) {
			if (val) {
				SMC_STAT_INC(smc, ndly_cnt);
				smc_tx_pending(&smc->conn);
				cancel_delayed_work(&smc->conn.tx_work);
			}
		}
		break;
	case TCP_CORK:
		/* same as TCP_NODELAY, but triggered when the option is cleared */
		if (sk->sk_state != SMC_INIT &&
		    sk->sk_state != SMC_LISTEN &&
		    sk->sk_state != SMC_CLOSED) {
			if (!val) {
				SMC_STAT_INC(smc, cork_cnt);
				smc_tx_pending(&smc->conn);
				cancel_delayed_work(&smc->conn.tx_work);
			}
		}
		break;
	case TCP_DEFER_ACCEPT:
		/* remember the value on the SMC sock */
		smc->sockopt_defer_accept = val;
		break;
	default:
		break;
	}
out:
	release_sock(sk);

	return rc;
}
   2955
   2956static int smc_getsockopt(struct socket *sock, int level, int optname,
   2957			  char __user *optval, int __user *optlen)
   2958{
   2959	struct smc_sock *smc;
   2960	int rc;
   2961
   2962	if (level == SOL_SMC)
   2963		return __smc_getsockopt(sock, level, optname, optval, optlen);
   2964
   2965	smc = smc_sk(sock->sk);
   2966	mutex_lock(&smc->clcsock_release_lock);
   2967	if (!smc->clcsock) {
   2968		mutex_unlock(&smc->clcsock_release_lock);
   2969		return -EBADF;
   2970	}
   2971	/* socket options apply to the CLC socket */
   2972	if (unlikely(!smc->clcsock->ops->getsockopt)) {
   2973		mutex_unlock(&smc->clcsock_release_lock);
   2974		return -EOPNOTSUPP;
   2975	}
   2976	rc = smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
   2977					   optval, optlen);
   2978	mutex_unlock(&smc->clcsock_release_lock);
   2979	return rc;
   2980}
   2981
   2982static int smc_ioctl(struct socket *sock, unsigned int cmd,
   2983		     unsigned long arg)
   2984{
   2985	union smc_host_cursor cons, urg;
   2986	struct smc_connection *conn;
   2987	struct smc_sock *smc;
   2988	int answ;
   2989
   2990	smc = smc_sk(sock->sk);
   2991	conn = &smc->conn;
   2992	lock_sock(&smc->sk);
   2993	if (smc->use_fallback) {
   2994		if (!smc->clcsock) {
   2995			release_sock(&smc->sk);
   2996			return -EBADF;
   2997		}
   2998		answ = smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
   2999		release_sock(&smc->sk);
   3000		return answ;
   3001	}
   3002	switch (cmd) {
   3003	case SIOCINQ: /* same as FIONREAD */
   3004		if (smc->sk.sk_state == SMC_LISTEN) {
   3005			release_sock(&smc->sk);
   3006			return -EINVAL;
   3007		}
   3008		if (smc->sk.sk_state == SMC_INIT ||
   3009		    smc->sk.sk_state == SMC_CLOSED)
   3010			answ = 0;
   3011		else
   3012			answ = atomic_read(&smc->conn.bytes_to_rcv);
   3013		break;
   3014	case SIOCOUTQ:
   3015		/* output queue size (not send + not acked) */
   3016		if (smc->sk.sk_state == SMC_LISTEN) {
   3017			release_sock(&smc->sk);
   3018			return -EINVAL;
   3019		}
   3020		if (smc->sk.sk_state == SMC_INIT ||
   3021		    smc->sk.sk_state == SMC_CLOSED)
   3022			answ = 0;
   3023		else
   3024			answ = smc->conn.sndbuf_desc->len -
   3025					atomic_read(&smc->conn.sndbuf_space);
   3026		break;
   3027	case SIOCOUTQNSD:
   3028		/* output queue size (not send only) */
   3029		if (smc->sk.sk_state == SMC_LISTEN) {
   3030			release_sock(&smc->sk);
   3031			return -EINVAL;
   3032		}
   3033		if (smc->sk.sk_state == SMC_INIT ||
   3034		    smc->sk.sk_state == SMC_CLOSED)
   3035			answ = 0;
   3036		else
   3037			answ = smc_tx_prepared_sends(&smc->conn);
   3038		break;
   3039	case SIOCATMARK:
   3040		if (smc->sk.sk_state == SMC_LISTEN) {
   3041			release_sock(&smc->sk);
   3042			return -EINVAL;
   3043		}
   3044		if (smc->sk.sk_state == SMC_INIT ||
   3045		    smc->sk.sk_state == SMC_CLOSED) {
   3046			answ = 0;
   3047		} else {
   3048			smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
   3049			smc_curs_copy(&urg, &conn->urg_curs, conn);
   3050			answ = smc_curs_diff(conn->rmb_desc->len,
   3051					     &cons, &urg) == 1;
   3052		}
   3053		break;
   3054	default:
   3055		release_sock(&smc->sk);
   3056		return -ENOIOCTLCMD;
   3057	}
   3058	release_sock(&smc->sk);
   3059
   3060	return put_user(answ, (int __user *)arg);
   3061}
   3062
   3063static ssize_t smc_sendpage(struct socket *sock, struct page *page,
   3064			    int offset, size_t size, int flags)
   3065{
   3066	struct sock *sk = sock->sk;
   3067	struct smc_sock *smc;
   3068	int rc = -EPIPE;
   3069
   3070	smc = smc_sk(sk);
   3071	lock_sock(sk);
   3072	if (sk->sk_state != SMC_ACTIVE) {
   3073		release_sock(sk);
   3074		goto out;
   3075	}
   3076	release_sock(sk);
   3077	if (smc->use_fallback) {
   3078		rc = kernel_sendpage(smc->clcsock, page, offset,
   3079				     size, flags);
   3080	} else {
   3081		lock_sock(sk);
   3082		rc = smc_tx_sendpage(smc, page, offset, size, flags);
   3083		release_sock(sk);
   3084		SMC_STAT_INC(smc, sendpage_cnt);
   3085	}
   3086
   3087out:
   3088	return rc;
   3089}
   3090
   3091/* Map the affected portions of the rmbe into an spd, note the number of bytes
   3092 * to splice in conn->splice_pending, and press 'go'. Delays consumer cursor
   3093 * updates till whenever a respective page has been fully processed.
   3094 * Note that subsequent recv() calls have to wait till all splice() processing
   3095 * completed.
   3096 */
   3097static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
   3098			       struct pipe_inode_info *pipe, size_t len,
   3099			       unsigned int flags)
   3100{
   3101	struct sock *sk = sock->sk;
   3102	struct smc_sock *smc;
   3103	int rc = -ENOTCONN;
   3104
   3105	smc = smc_sk(sk);
   3106	lock_sock(sk);
   3107	if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) {
   3108		/* socket was connected before, no more data to read */
   3109		rc = 0;
   3110		goto out;
   3111	}
   3112	if (sk->sk_state == SMC_INIT ||
   3113	    sk->sk_state == SMC_LISTEN ||
   3114	    sk->sk_state == SMC_CLOSED)
   3115		goto out;
   3116
   3117	if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
   3118		rc = 0;
   3119		goto out;
   3120	}
   3121
   3122	if (smc->use_fallback) {
   3123		rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
   3124						    pipe, len, flags);
   3125	} else {
   3126		if (*ppos) {
   3127			rc = -ESPIPE;
   3128			goto out;
   3129		}
   3130		if (flags & SPLICE_F_NONBLOCK)
   3131			flags = MSG_DONTWAIT;
   3132		else
   3133			flags = 0;
   3134		SMC_STAT_INC(smc, splice_cnt);
   3135		rc = smc_rx_recvmsg(smc, NULL, pipe, len, flags);
   3136	}
   3137out:
   3138	release_sock(sk);
   3139
   3140	return rc;
   3141}
   3142
/* must look like tcp */
/* Socket-level operations installed on every SMC socket by __smc_create().
 * socketpair and mmap use the generic "not supported" stubs.
 */
static const struct proto_ops smc_sock_ops = {
	.family		= PF_SMC,
	.owner		= THIS_MODULE,
	.release	= smc_release,
	.bind		= smc_bind,
	.connect	= smc_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= smc_accept,
	.getname	= smc_getname,
	.poll		= smc_poll,
	.ioctl		= smc_ioctl,
	.listen		= smc_listen,
	.shutdown	= smc_shutdown,
	.setsockopt	= smc_setsockopt,
	.getsockopt	= smc_getsockopt,
	.sendmsg	= smc_sendmsg,
	.recvmsg	= smc_recvmsg,
	.mmap		= sock_no_mmap,
	.sendpage	= smc_sendpage,
	.splice_read	= smc_splice_read,
};
   3165
   3166static int __smc_create(struct net *net, struct socket *sock, int protocol,
   3167			int kern, struct socket *clcsock)
   3168{
   3169	int family = (protocol == SMCPROTO_SMC6) ? PF_INET6 : PF_INET;
   3170	struct smc_sock *smc;
   3171	struct sock *sk;
   3172	int rc;
   3173
   3174	rc = -ESOCKTNOSUPPORT;
   3175	if (sock->type != SOCK_STREAM)
   3176		goto out;
   3177
   3178	rc = -EPROTONOSUPPORT;
   3179	if (protocol != SMCPROTO_SMC && protocol != SMCPROTO_SMC6)
   3180		goto out;
   3181
   3182	rc = -ENOBUFS;
   3183	sock->ops = &smc_sock_ops;
   3184	sock->state = SS_UNCONNECTED;
   3185	sk = smc_sock_alloc(net, sock, protocol);
   3186	if (!sk)
   3187		goto out;
   3188
   3189	/* create internal TCP socket for CLC handshake and fallback */
   3190	smc = smc_sk(sk);
   3191	smc->use_fallback = false; /* assume rdma capability first */
   3192	smc->fallback_rsn = 0;
   3193
   3194	/* default behavior from limit_smc_hs in every net namespace */
   3195	smc->limit_smc_hs = net->smc.limit_smc_hs;
   3196
   3197	rc = 0;
   3198	if (!clcsock) {
   3199		rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP,
   3200				      &smc->clcsock);
   3201		if (rc) {
   3202			sk_common_release(sk);
   3203			goto out;
   3204		}
   3205	} else {
   3206		smc->clcsock = clcsock;
   3207	}
   3208
   3209	smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
   3210	smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);
   3211
   3212out:
   3213	return rc;
   3214}
   3215
/* ->create() handler of the PF_SMC family: set up @sock via __smc_create()
 * without a pre-existing CLC socket (NULL is passed for it).
 */
static int smc_create(struct net *net, struct socket *sock, int protocol,
		      int kern)
{
	return __smc_create(net, sock, protocol, kern, NULL);
}
   3221
/* PF_SMC protocol family descriptor; registered with sock_register() in
 * smc_init().
 */
static const struct net_proto_family smc_sock_family_ops = {
	.family	= PF_SMC,
	.owner	= THIS_MODULE,
	.create	= smc_create,
};
   3227
/* ->init() handler of the "smc" TCP ULP: turn the TCP socket owning @sk into
 * the CLC socket of a freshly allocated SMC socket and hand the TCP socket's
 * file over to the SMC socket.
 *
 * Returns 0 on success, -ESOCKTNOSUPPORT if @sk is not an IPv4/IPv6 TCP
 * stream socket, -ENOTCONN if the socket is not in state SS_UNCONNECTED, has
 * no file or has a fasync list, -ENFILE if no socket can be allocated, or
 * the error of __smc_create().
 */
static int smc_ulp_init(struct sock *sk)
{
	struct socket *tcp = sk->sk_socket;
	struct net *net = sock_net(sk);
	struct socket *smcsock;
	int protocol, ret;

	/* only TCP can be replaced */
	if (tcp->type != SOCK_STREAM || sk->sk_protocol != IPPROTO_TCP ||
	    (sk->sk_family != AF_INET && sk->sk_family != AF_INET6))
		return -ESOCKTNOSUPPORT;
	/* don't handle wq now */
	if (tcp->state != SS_UNCONNECTED || !tcp->file || tcp->wq.fasync_list)
		return -ENOTCONN;

	/* pick the SMC protocol matching the TCP socket's address family */
	if (sk->sk_family == AF_INET)
		protocol = SMCPROTO_SMC;
	else
		protocol = SMCPROTO_SMC6;

	smcsock = sock_alloc();
	if (!smcsock)
		return -ENFILE;

	smcsock->type = SOCK_STREAM;
	__module_get(THIS_MODULE); /* tried in __tcp_ulp_find_autoload */
	/* the existing TCP socket becomes the CLC socket of the new SMC sock */
	ret = __smc_create(net, smcsock, protocol, 1, tcp);
	if (ret) {
		sock_release(smcsock); /* module_put() which ops won't be NULL */
		return ret;
	}

	/* replace tcp socket to smc */
	smcsock->file = tcp->file;
	smcsock->file->private_data = smcsock;
	smcsock->file->f_inode = SOCK_INODE(smcsock); /* replace inode when sock_close */
	smcsock->file->f_path.dentry->d_inode = SOCK_INODE(smcsock); /* dput() in __fput */
	tcp->file = NULL;

	return ret;
}
   3269
/* ->clone() handler of the "smc" TCP ULP: clear the ULP ops on @newsk.
 * @req and @priority are not used.
 */
static void smc_ulp_clone(const struct request_sock *req, struct sock *newsk,
			  const gfp_t priority)
{
	struct inet_connection_sock *icsk = inet_csk(newsk);

	/* don't inherit ulp ops to child when listen */
	icsk->icsk_ulp_ops = NULL;
}
   3278
/* TCP ULP named "smc"; registered with tcp_register_ulp() in smc_init(). */
static struct tcp_ulp_ops smc_ulp_ops __read_mostly = {
	.name		= "smc",
	.owner		= THIS_MODULE,
	.init		= smc_ulp_init,
	.clone		= smc_ulp_clone,
};
   3285
/* pernet id of the SMC subsystem; referenced by smc_net_ops.id */
unsigned int smc_net_id;
   3287
   3288static __net_init int smc_net_init(struct net *net)
   3289{
   3290	int rc;
   3291
   3292	rc = smc_sysctl_net_init(net);
   3293	if (rc)
   3294		return rc;
   3295	return smc_pnet_net_init(net);
   3296}
   3297
/* Per-namespace exit: tear down the sysctl and pnet state of @net that
 * smc_net_init() set up.
 */
static void __net_exit smc_net_exit(struct net *net)
{
	smc_sysctl_net_exit(net);
	smc_pnet_net_exit(net);
}
   3303
/* Per-namespace init of the SMC statistics; returns the result of
 * smc_stats_init().
 */
static __net_init int smc_net_stat_init(struct net *net)
{
	return smc_stats_init(net);
}
   3308
/* Per-namespace exit of the SMC statistics, counterpart of
 * smc_net_stat_init().
 */
static void __net_exit smc_net_stat_exit(struct net *net)
{
	smc_stats_exit(net);
}
   3313
/* pernet operations for the SMC per-namespace state: .id points at
 * smc_net_id and .size is sizeof(struct smc_net).
 */
static struct pernet_operations smc_net_ops = {
	.init = smc_net_init,
	.exit = smc_net_exit,
	.id   = &smc_net_id,
	.size = sizeof(struct smc_net),
};
   3320
/* pernet operations for the SMC per-namespace statistics */
static struct pernet_operations smc_net_stat_ops = {
	.init = smc_net_stat_init,
	.exit = smc_net_stat_exit,
};
   3325
   3326static int __init smc_init(void)
   3327{
   3328	int rc;
   3329
   3330	rc = register_pernet_subsys(&smc_net_ops);
   3331	if (rc)
   3332		return rc;
   3333
   3334	rc = register_pernet_subsys(&smc_net_stat_ops);
   3335	if (rc)
   3336		return rc;
   3337
   3338	smc_ism_init();
   3339	smc_clc_init();
   3340
   3341	rc = smc_nl_init();
   3342	if (rc)
   3343		goto out_pernet_subsys;
   3344
   3345	rc = smc_pnet_init();
   3346	if (rc)
   3347		goto out_nl;
   3348
   3349	rc = -ENOMEM;
   3350
   3351	smc_tcp_ls_wq = alloc_workqueue("smc_tcp_ls_wq", 0, 0);
   3352	if (!smc_tcp_ls_wq)
   3353		goto out_pnet;
   3354
   3355	smc_hs_wq = alloc_workqueue("smc_hs_wq", 0, 0);
   3356	if (!smc_hs_wq)
   3357		goto out_alloc_tcp_ls_wq;
   3358
   3359	smc_close_wq = alloc_workqueue("smc_close_wq", 0, 0);
   3360	if (!smc_close_wq)
   3361		goto out_alloc_hs_wq;
   3362
   3363	rc = smc_core_init();
   3364	if (rc) {
   3365		pr_err("%s: smc_core_init fails with %d\n", __func__, rc);
   3366		goto out_alloc_wqs;
   3367	}
   3368
   3369	rc = smc_llc_init();
   3370	if (rc) {
   3371		pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
   3372		goto out_core;
   3373	}
   3374
   3375	rc = smc_cdc_init();
   3376	if (rc) {
   3377		pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
   3378		goto out_core;
   3379	}
   3380
   3381	rc = proto_register(&smc_proto, 1);
   3382	if (rc) {
   3383		pr_err("%s: proto_register(v4) fails with %d\n", __func__, rc);
   3384		goto out_core;
   3385	}
   3386
   3387	rc = proto_register(&smc_proto6, 1);
   3388	if (rc) {
   3389		pr_err("%s: proto_register(v6) fails with %d\n", __func__, rc);
   3390		goto out_proto;
   3391	}
   3392
   3393	rc = sock_register(&smc_sock_family_ops);
   3394	if (rc) {
   3395		pr_err("%s: sock_register fails with %d\n", __func__, rc);
   3396		goto out_proto6;
   3397	}
   3398	INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);
   3399	INIT_HLIST_HEAD(&smc_v6_hashinfo.ht);
   3400
   3401	rc = smc_ib_register_client();
   3402	if (rc) {
   3403		pr_err("%s: ib_register fails with %d\n", __func__, rc);
   3404		goto out_sock;
   3405	}
   3406
   3407	rc = tcp_register_ulp(&smc_ulp_ops);
   3408	if (rc) {
   3409		pr_err("%s: tcp_ulp_register fails with %d\n", __func__, rc);
   3410		goto out_ib;
   3411	}
   3412
   3413	static_branch_enable(&tcp_have_smc);
   3414	return 0;
   3415
   3416out_ib:
   3417	smc_ib_unregister_client();
   3418out_sock:
   3419	sock_unregister(PF_SMC);
   3420out_proto6:
   3421	proto_unregister(&smc_proto6);
   3422out_proto:
   3423	proto_unregister(&smc_proto);
   3424out_core:
   3425	smc_core_exit();
   3426out_alloc_wqs:
   3427	destroy_workqueue(smc_close_wq);
   3428out_alloc_hs_wq:
   3429	destroy_workqueue(smc_hs_wq);
   3430out_alloc_tcp_ls_wq:
   3431	destroy_workqueue(smc_tcp_ls_wq);
   3432out_pnet:
   3433	smc_pnet_exit();
   3434out_nl:
   3435	smc_nl_exit();
   3436out_pernet_subsys:
   3437	unregister_pernet_subsys(&smc_net_ops);
   3438
   3439	return rc;
   3440}
   3441
/* Module exit: undo what smc_init() set up. The TCP ULP and the PF_SMC
 * socket family are unregistered first; the pernet subsystems go last,
 * followed by an rcu_barrier().
 */
static void __exit smc_exit(void)
{
	static_branch_disable(&tcp_have_smc);
	tcp_unregister_ulp(&smc_ulp_ops);
	sock_unregister(PF_SMC);
	smc_core_exit();
	smc_ib_unregister_client();
	destroy_workqueue(smc_close_wq);
	destroy_workqueue(smc_tcp_ls_wq);
	destroy_workqueue(smc_hs_wq);
	proto_unregister(&smc_proto6);
	proto_unregister(&smc_proto);
	smc_pnet_exit();
	smc_nl_exit();
	smc_clc_exit();
	unregister_pernet_subsys(&smc_net_stat_ops);
	unregister_pernet_subsys(&smc_net_ops);
	rcu_barrier();
}
   3461
/* module entry and exit points */
module_init(smc_init);
module_exit(smc_exit);

/* module metadata and aliases for the PF_SMC family and the "smc" TCP ULP */
MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
MODULE_DESCRIPTION("smc socket address family");
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_SMC);
MODULE_ALIAS_TCP_ULP("smc");