cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

tcp.c (21168B)


/*
 * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/in.h>
#include <linux/module.h>
#include <net/tcp.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/addrconf.h>

#include "rds.h"
#include "tcp.h"

/* only for info exporting */
static DEFINE_SPINLOCK(rds_tcp_tc_list_lock);
static LIST_HEAD(rds_tcp_tc_list);

/* rds_tcp_tc_count counts only IPv4 connections.
 * rds6_tcp_tc_count counts both IPv4 and IPv6 connections.
 */
static unsigned int rds_tcp_tc_count;
#if IS_ENABLED(CONFIG_IPV6)
static unsigned int rds6_tcp_tc_count;
#endif

/* Track rds_tcp_connection structs so they can be cleaned up */
static DEFINE_SPINLOCK(rds_tcp_conn_lock);
static LIST_HEAD(rds_tcp_conn_list);
static atomic_t rds_tcp_unloading = ATOMIC_INIT(0);

static struct kmem_cache *rds_tcp_conn_slab;

static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write,
				 void *buffer, size_t *lenp, loff_t *fpos);

static int rds_tcp_min_sndbuf = SOCK_MIN_SNDBUF;
static int rds_tcp_min_rcvbuf = SOCK_MIN_RCVBUF;

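/* These tunables appear under /proc/sys/net/rds/tcp/ in each netns
 * (registered in rds_tcp_init_net() below); e.g.
 *   echo 1048576 > /proc/sys/net/rds/tcp/rds_tcp_sndbuf
 * pins the send buffer for connections (re)established after the write.
 * proc_dointvec_minmax() rejects values below the SOCK_MIN_{SND,RCV}BUF
 * floors referenced via ->extra1.
 */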
static struct ctl_table rds_tcp_sysctl_table[] = {
#define	RDS_TCP_SNDBUF	0
	{
		.procname       = "rds_tcp_sndbuf",
		/* data is per-net pointer */
		.maxlen         = sizeof(int),
		.mode           = 0644,
		.proc_handler   = rds_tcp_skbuf_handler,
		.extra1		= &rds_tcp_min_sndbuf,
	},
#define	RDS_TCP_RCVBUF	1
	{
		.procname       = "rds_tcp_rcvbuf",
		/* data is per-net pointer */
		.maxlen         = sizeof(int),
		.mode           = 0644,
		.proc_handler   = rds_tcp_skbuf_handler,
		.extra1		= &rds_tcp_min_rcvbuf,
	},
	{ }
};

u32 rds_tcp_write_seq(struct rds_tcp_connection *tc)
{
	/* seq# of the last byte of data in tcp send buffer */
	return tcp_sk(tc->t_sock->sk)->write_seq;
}

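/* seq# of the oldest byte not yet acknowledged by the peer */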
u32 rds_tcp_snd_una(struct rds_tcp_connection *tc)
{
	return tcp_sk(tc->t_sock->sk)->snd_una;
}

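/* Inverse of rds_tcp_set_callbacks(): unlink tc from rds_tcp_tc_list,
 * clear tc->t_sock and restore the socket's original callbacks.
 */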
void rds_tcp_restore_callbacks(struct socket *sock,
			       struct rds_tcp_connection *tc)
{
	rdsdebug("restoring sock %p callbacks from tc %p\n", sock, tc);
	write_lock_bh(&sock->sk->sk_callback_lock);

	/* done under the callback_lock to serialize with write_space */
	spin_lock(&rds_tcp_tc_list_lock);
	list_del_init(&tc->t_list_item);
#if IS_ENABLED(CONFIG_IPV6)
	rds6_tcp_tc_count--;
#endif
	if (!tc->t_cpath->cp_conn->c_isv6)
		rds_tcp_tc_count--;
	spin_unlock(&rds_tcp_tc_list_lock);

	tc->t_sock = NULL;

	sock->sk->sk_write_space = tc->t_orig_write_space;
	sock->sk->sk_data_ready = tc->t_orig_data_ready;
	sock->sk->sk_state_change = tc->t_orig_state_change;
	sock->sk->sk_user_data = NULL;

	write_unlock_bh(&sock->sk->sk_callback_lock);
}

/*
 * rds_tcp_reset_callbacks() switches to the new sock and releases the
 * existing tc->t_sock.
 *
 * The only functions that set tc->t_sock are rds_tcp_set_callbacks
 * and rds_tcp_reset_callbacks.  Send and receive trust that
 * it is set.  The absence of RDS_CONN_UP bit protects those paths
 * from being called while it isn't set.
 */
void rds_tcp_reset_callbacks(struct socket *sock,
			     struct rds_conn_path *cp)
{
	struct rds_tcp_connection *tc = cp->cp_transport_data;
	struct socket *osock = tc->t_sock;

	if (!osock)
		goto newsock;

	/* Need to resolve a duelling SYN between peers.
	 * We have an outstanding SYN to this peer, which may
	 * potentially have transitioned to the RDS_CONN_UP state,
	 * so we must quiesce any send threads before resetting
	 * cp_transport_data. We quiesce these threads by setting
	 * cp_state to something other than RDS_CONN_UP, and then
	 * waiting for any existing threads in rds_send_xmit to
	 * complete release_in_xmit(). (Subsequent threads entering
	 * rds_send_xmit() will bail on !rds_conn_up().)
	 *
	 * However an incoming syn-ack at this point would end up
	 * marking the conn as RDS_CONN_UP, and would again permit
	 * rds_send_xmit() threads through, so ideally we would
	 * synchronize on RDS_CONN_UP after lock_sock(), but cannot
	 * do that: waiting on !RDS_IN_XMIT after lock_sock() may
	 * end up deadlocking with tcp_sendmsg(), and the RDS_IN_XMIT
	 * would not get set. As a result, we set cp_state to
	 * RDS_CONN_RESETTING, to ensure that rds_tcp_state_change
	 * cannot mark rds_conn_path_up() in the window before lock_sock().
	 */
	atomic_set(&cp->cp_state, RDS_CONN_RESETTING);
	wait_event(cp->cp_waitq, !test_bit(RDS_IN_XMIT, &cp->cp_flags));
	lock_sock(osock->sk);
	/* reset receive side state for rds_tcp_data_recv() for osock */
	cancel_delayed_work_sync(&cp->cp_send_w);
	cancel_delayed_work_sync(&cp->cp_recv_w);
	if (tc->t_tinc) {
		rds_inc_put(&tc->t_tinc->ti_inc);
		tc->t_tinc = NULL;
	}
	tc->t_tinc_hdr_rem = sizeof(struct rds_header);
	tc->t_tinc_data_rem = 0;
	rds_tcp_restore_callbacks(osock, tc);
	release_sock(osock->sk);
	sock_release(osock);
newsock:
	rds_send_path_reset(cp);
	lock_sock(sock->sk);
	rds_tcp_set_callbacks(sock, cp);
	release_sock(sock->sk);
}

/* Add tc to rds_tcp_tc_list and set tc->t_sock. See comments
 * above rds_tcp_reset_callbacks for notes about synchronization
 * with the data path.
 */
void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp)
{
	struct rds_tcp_connection *tc = cp->cp_transport_data;

	rdsdebug("setting sock %p callbacks to tc %p\n", sock, tc);
	write_lock_bh(&sock->sk->sk_callback_lock);

	/* done under the callback_lock to serialize with write_space */
	spin_lock(&rds_tcp_tc_list_lock);
	list_add_tail(&tc->t_list_item, &rds_tcp_tc_list);
#if IS_ENABLED(CONFIG_IPV6)
	rds6_tcp_tc_count++;
#endif
	if (!tc->t_cpath->cp_conn->c_isv6)
		rds_tcp_tc_count++;
	spin_unlock(&rds_tcp_tc_list_lock);

	/* accepted sockets need our listen data ready undone */
	if (sock->sk->sk_data_ready == rds_tcp_listen_data_ready)
		sock->sk->sk_data_ready = sock->sk->sk_user_data;

	tc->t_sock = sock;
	tc->t_cpath = cp;
	tc->t_orig_data_ready = sock->sk->sk_data_ready;
	tc->t_orig_write_space = sock->sk->sk_write_space;
	tc->t_orig_state_change = sock->sk->sk_state_change;

	sock->sk->sk_user_data = cp;
	sock->sk->sk_data_ready = rds_tcp_data_ready;
	sock->sk->sk_write_space = rds_tcp_write_space;
	sock->sk->sk_state_change = rds_tcp_state_change;

	write_unlock_bh(&sock->sk->sk_callback_lock);
}

/* Handle RDS_INFO_TCP_SOCKETS socket option.  It only returns IPv4
 * connections for backward compatibility.
 */
static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len,
			    struct rds_info_iterator *iter,
			    struct rds_info_lengths *lens)
{
	struct rds_info_tcp_socket tsinfo;
	struct rds_tcp_connection *tc;
	unsigned long flags;

	spin_lock_irqsave(&rds_tcp_tc_list_lock, flags);

	if (len / sizeof(tsinfo) < rds_tcp_tc_count)
		goto out;

	list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) {
		struct inet_sock *inet = inet_sk(tc->t_sock->sk);

		if (tc->t_cpath->cp_conn->c_isv6)
			continue;

		tsinfo.local_addr = inet->inet_saddr;
		tsinfo.local_port = inet->inet_sport;
		tsinfo.peer_addr = inet->inet_daddr;
		tsinfo.peer_port = inet->inet_dport;

		tsinfo.hdr_rem = tc->t_tinc_hdr_rem;
		tsinfo.data_rem = tc->t_tinc_data_rem;
		tsinfo.last_sent_nxt = tc->t_last_sent_nxt;
		tsinfo.last_expected_una = tc->t_last_expected_una;
		tsinfo.last_seen_una = tc->t_last_seen_una;
		tsinfo.tos = tc->t_cpath->cp_conn->c_tos;

		rds_info_copy(iter, &tsinfo, sizeof(tsinfo));
	}

out:
	lens->nr = rds_tcp_tc_count;
	lens->each = sizeof(tsinfo);

	spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags);
}

#if IS_ENABLED(CONFIG_IPV6)
/* Handle RDS6_INFO_TCP_SOCKETS socket option. It returns both IPv4 and
 * IPv6 connections. An IPv4 connection's address is returned as an
 * IPv4-mapped IPv6 address.
 */
static void rds6_tcp_tc_info(struct socket *sock, unsigned int len,
			     struct rds_info_iterator *iter,
			     struct rds_info_lengths *lens)
{
	struct rds6_info_tcp_socket tsinfo6;
	struct rds_tcp_connection *tc;
	unsigned long flags;

	spin_lock_irqsave(&rds_tcp_tc_list_lock, flags);

	if (len / sizeof(tsinfo6) < rds6_tcp_tc_count)
		goto out;

	list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) {
		struct sock *sk = tc->t_sock->sk;
		struct inet_sock *inet = inet_sk(sk);

		tsinfo6.local_addr = sk->sk_v6_rcv_saddr;
		tsinfo6.local_port = inet->inet_sport;
		tsinfo6.peer_addr = sk->sk_v6_daddr;
		tsinfo6.peer_port = inet->inet_dport;

		tsinfo6.hdr_rem = tc->t_tinc_hdr_rem;
		tsinfo6.data_rem = tc->t_tinc_data_rem;
		tsinfo6.last_sent_nxt = tc->t_last_sent_nxt;
		tsinfo6.last_expected_una = tc->t_last_expected_una;
		tsinfo6.last_seen_una = tc->t_last_seen_una;

		rds_info_copy(iter, &tsinfo6, sizeof(tsinfo6));
	}

out:
	lens->nr = rds6_tcp_tc_count;
	lens->each = sizeof(tsinfo6);

	spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags);
}
#endif

int rds_tcp_laddr_check(struct net *net, const struct in6_addr *addr,
			__u32 scope_id)
{
	struct net_device *dev = NULL;
#if IS_ENABLED(CONFIG_IPV6)
	int ret;
#endif

	if (ipv6_addr_v4mapped(addr)) {
		if (inet_addr_type(net, addr->s6_addr32[3]) == RTN_LOCAL)
			return 0;
		return -EADDRNOTAVAIL;
	}

	/* If the scope_id is specified, check only those addresses
	 * hosted on the specified interface.
	 */
	if (scope_id != 0) {
		rcu_read_lock();
		dev = dev_get_by_index_rcu(net, scope_id);
		/* scope_id is not valid... */
		if (!dev) {
			rcu_read_unlock();
			return -EADDRNOTAVAIL;
		}
		rcu_read_unlock();
	}
#if IS_ENABLED(CONFIG_IPV6)
	ret = ipv6_chk_addr(net, addr, dev, 0);
	if (ret)
		return 0;
#endif
	return -EADDRNOTAVAIL;
}

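/* Unlink tc from rds_tcp_conn_list (unless already detached) and free it. */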
static void rds_tcp_conn_free(void *arg)
{
	struct rds_tcp_connection *tc = arg;
	unsigned long flags;

	rdsdebug("freeing tc %p\n", tc);

	spin_lock_irqsave(&rds_tcp_conn_lock, flags);
	if (!tc->t_tcp_node_detached)
		list_del(&tc->t_tcp_node);
	spin_unlock_irqrestore(&rds_tcp_conn_lock, flags);

	kmem_cache_free(rds_tcp_conn_slab, tc);
}

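/* Allocate transport data for each of the connection's RDS_MPATH_WORKERS
 * paths, then link them all onto rds_tcp_conn_list; on allocation failure,
 * free whatever was allocated so far.
 */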
static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp)
{
	struct rds_tcp_connection *tc;
	int i, j;
	int ret = 0;

	for (i = 0; i < RDS_MPATH_WORKERS; i++) {
		tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp);
		if (!tc) {
			ret = -ENOMEM;
			goto fail;
		}
		mutex_init(&tc->t_conn_path_lock);
		tc->t_sock = NULL;
		tc->t_tinc = NULL;
		tc->t_tinc_hdr_rem = sizeof(struct rds_header);
		tc->t_tinc_data_rem = 0;

		conn->c_path[i].cp_transport_data = tc;
		tc->t_cpath = &conn->c_path[i];
		tc->t_tcp_node_detached = true;

		rdsdebug("rds_conn_path [%d] tc %p\n", i,
			 conn->c_path[i].cp_transport_data);
	}
	spin_lock_irq(&rds_tcp_conn_lock);
	for (i = 0; i < RDS_MPATH_WORKERS; i++) {
		tc = conn->c_path[i].cp_transport_data;
		tc->t_tcp_node_detached = false;
		list_add_tail(&tc->t_tcp_node, &rds_tcp_conn_list);
	}
	spin_unlock_irq(&rds_tcp_conn_lock);
fail:
	if (ret) {
		for (j = 0; j < i; j++)
			rds_tcp_conn_free(conn->c_path[j].cp_transport_data);
	}
	return ret;
}

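/* Return true if any path of @conn is on @list; callers hold
 * rds_tcp_conn_lock.
 */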
static bool list_has_conn(struct list_head *list, struct rds_connection *conn)
{
	struct rds_tcp_connection *tc, *_tc;

	list_for_each_entry_safe(tc, _tc, list, t_tcp_node) {
		if (tc->t_cpath->cp_conn == conn)
			return true;
	}
	return false;
}

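/* rds_tcp_unloading is set once at module exit; rds_tcp_is_unloading()
 * exposes it to core RDS as the ->t_unloading hook of rds_tcp_transport.
 */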
static void rds_tcp_set_unloading(void)
{
	atomic_set(&rds_tcp_unloading, 1);
}

static bool rds_tcp_is_unloading(struct rds_connection *conn)
{
	return atomic_read(&rds_tcp_unloading) != 0;
}

static void rds_tcp_destroy_conns(void)
{
	struct rds_tcp_connection *tc, *_tc;
	LIST_HEAD(tmp_list);

	/* avoid calling conn_destroy with irqs off */
	spin_lock_irq(&rds_tcp_conn_lock);
	list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
		if (!list_has_conn(&tmp_list, tc->t_cpath->cp_conn))
			list_move_tail(&tc->t_tcp_node, &tmp_list);
	}
	spin_unlock_irq(&rds_tcp_conn_lock);

	list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node)
		rds_conn_destroy(tc->t_cpath->cp_conn);
}

static void rds_tcp_exit(void);

static u8 rds_tcp_get_tos_map(u8 tos)
{
	/* all user tos mapped to default 0 for TCP transport */
	return 0;
}

struct rds_transport rds_tcp_transport = {
	.laddr_check		= rds_tcp_laddr_check,
	.xmit_path_prepare	= rds_tcp_xmit_path_prepare,
	.xmit_path_complete	= rds_tcp_xmit_path_complete,
	.xmit			= rds_tcp_xmit,
	.recv_path		= rds_tcp_recv_path,
	.conn_alloc		= rds_tcp_conn_alloc,
	.conn_free		= rds_tcp_conn_free,
	.conn_path_connect	= rds_tcp_conn_path_connect,
	.conn_path_shutdown	= rds_tcp_conn_path_shutdown,
	.inc_copy_to_user	= rds_tcp_inc_copy_to_user,
	.inc_free		= rds_tcp_inc_free,
	.stats_info_copy	= rds_tcp_stats_info_copy,
	.exit			= rds_tcp_exit,
	.get_tos_map		= rds_tcp_get_tos_map,
	.t_owner		= THIS_MODULE,
	.t_name			= "tcp",
	.t_type			= RDS_TRANS_TCP,
	.t_prefer_loopback	= 1,
	.t_mp_capable		= 1,
	.t_unloading		= rds_tcp_is_unloading,
};

static unsigned int rds_tcp_netid;

/* per-network namespace private data for this module */
struct rds_tcp_net {
	struct socket *rds_tcp_listen_sock;
	struct work_struct rds_tcp_accept_w;
	struct ctl_table_header *rds_tcp_sysctl;
	struct ctl_table *ctl_table;
	int sndbuf_size;
	int rcvbuf_size;
};

/* All module-specific customizations to the RDS-TCP socket should be done in
 * rds_tcp_tune() and applied after socket creation.
 */
bool rds_tcp_tune(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct rds_tcp_net *rtn;

	tcp_sock_set_nodelay(sock->sk);
	lock_sock(sk);
	/* TCP timer functions might access the net namespace even after
	 * the process that created it has terminated.
	 */
	if (!sk->sk_net_refcnt) {
		if (!maybe_get_net(net)) {
			release_sock(sk);
			return false;
		}
		sk->sk_net_refcnt = 1;
		netns_tracker_alloc(net, &sk->ns_tracker, GFP_KERNEL);
		sock_inuse_add(net, 1);
	}
	rtn = net_generic(net, rds_tcp_netid);
	if (rtn->sndbuf_size > 0) {
		sk->sk_sndbuf = rtn->sndbuf_size;
		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
	}
	if (rtn->rcvbuf_size > 0) {
		sk->sk_rcvbuf = rtn->rcvbuf_size;
		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
	}
	release_sock(sk);
	return true;
}

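/* Work handler for rds_tcp_accept_w: accept pending connections on the
 * per-netns listen socket until none remain.
 */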
static void rds_tcp_accept_worker(struct work_struct *work)
{
	struct rds_tcp_net *rtn = container_of(work,
					       struct rds_tcp_net,
					       rds_tcp_accept_w);

	while (rds_tcp_accept_one(rtn->rds_tcp_listen_sock) == 0)
		cond_resched();
}

void rds_tcp_accept_work(struct sock *sk)
{
	struct net *net = sock_net(sk);
	struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);

	queue_work(rds_wq, &rtn->rds_tcp_accept_w);
}

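/* Per-netns init: register the net/rds/tcp sysctls and create the listen
 * socket, preferring IPv6 with an IPv4 fallback when IPv6 is enabled.
 */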
static __net_init int rds_tcp_init_net(struct net *net)
{
	struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
	struct ctl_table *tbl;
	int err = 0;

	memset(rtn, 0, sizeof(*rtn));

	/* {snd, rcv}buf_size default to 0, which implies we let the
	 * stack pick the value, and permit auto-tuning of buffer size.
	 */
	if (net == &init_net) {
		tbl = rds_tcp_sysctl_table;
	} else {
		tbl = kmemdup(rds_tcp_sysctl_table,
			      sizeof(rds_tcp_sysctl_table), GFP_KERNEL);
		if (!tbl) {
			pr_warn("could not allocate sysctl table\n");
			return -ENOMEM;
		}
		rtn->ctl_table = tbl;
	}
	tbl[RDS_TCP_SNDBUF].data = &rtn->sndbuf_size;
	tbl[RDS_TCP_RCVBUF].data = &rtn->rcvbuf_size;
	rtn->rds_tcp_sysctl = register_net_sysctl(net, "net/rds/tcp", tbl);
	if (!rtn->rds_tcp_sysctl) {
		pr_warn("could not register sysctl\n");
		err = -ENOMEM;
		goto fail;
	}

#if IS_ENABLED(CONFIG_IPV6)
	rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net, true);
#else
	rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net, false);
#endif
	if (!rtn->rds_tcp_listen_sock) {
		pr_warn("could not set up IPv6 listen sock\n");

#if IS_ENABLED(CONFIG_IPV6)
		/* Try IPv4 as some systems disable IPv6 */
		rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net, false);
		if (!rtn->rds_tcp_listen_sock) {
#endif
			unregister_net_sysctl_table(rtn->rds_tcp_sysctl);
			rtn->rds_tcp_sysctl = NULL;
			err = -EAFNOSUPPORT;
			goto fail;
#if IS_ENABLED(CONFIG_IPV6)
		}
#endif
	}
	INIT_WORK(&rtn->rds_tcp_accept_w, rds_tcp_accept_worker);
	return 0;

fail:
	if (net != &init_net)
		kfree(tbl);
	return err;
}

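/* Netns teardown helper: stop the listen socket and destroy every
 * connection belonging to @net.
 */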
static void rds_tcp_kill_sock(struct net *net)
{
	struct rds_tcp_connection *tc, *_tc;
	LIST_HEAD(tmp_list);
	struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
	struct socket *lsock = rtn->rds_tcp_listen_sock;

	rtn->rds_tcp_listen_sock = NULL;
	rds_tcp_listen_stop(lsock, &rtn->rds_tcp_accept_w);
	spin_lock_irq(&rds_tcp_conn_lock);
	list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
		struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net);

		if (net != c_net)
			continue;
		if (!list_has_conn(&tmp_list, tc->t_cpath->cp_conn)) {
			list_move_tail(&tc->t_tcp_node, &tmp_list);
		} else {
			list_del(&tc->t_tcp_node);
			tc->t_tcp_node_detached = true;
		}
	}
	spin_unlock_irq(&rds_tcp_conn_lock);
	list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node)
		rds_conn_destroy(tc->t_cpath->cp_conn);
}

static void __net_exit rds_tcp_exit_net(struct net *net)
{
	struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);

	rds_tcp_kill_sock(net);

	if (rtn->rds_tcp_sysctl)
		unregister_net_sysctl_table(rtn->rds_tcp_sysctl);

	if (net != &init_net)
		kfree(rtn->ctl_table);
}

static struct pernet_operations rds_tcp_net_ops = {
	.init = rds_tcp_init_net,
	.exit = rds_tcp_exit_net,
	.id = &rds_tcp_netid,
	.size = sizeof(struct rds_tcp_net),
};

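/* Return the listen socket's sk_user_data (which holds its original
 * sk_data_ready; cf. the restore logic in rds_tcp_set_callbacks() above),
 * or NULL if this netns has no listen socket.
 */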
void *rds_tcp_listen_sock_def_readable(struct net *net)
{
	struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
	struct socket *lsock = rtn->rds_tcp_listen_sock;

	if (!lsock)
		return NULL;

	return lsock->sk->sk_user_data;
}

/* when sysctl is used to modify some kernel socket parameters, this
 * function resets the RDS connections in that netns so that we can
 * restart with new parameters.  The assumption is that such reset
 * events are few and far between.
 */
static void rds_tcp_sysctl_reset(struct net *net)
{
	struct rds_tcp_connection *tc, *_tc;

	spin_lock_irq(&rds_tcp_conn_lock);
	list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
		struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net);

		if (net != c_net || !tc->t_sock)
			continue;

		/* reconnect with new parameters */
		rds_conn_path_drop(tc->t_cpath, false);
	}
	spin_unlock_irq(&rds_tcp_conn_lock);
}

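/* Sysctl handler for rds_tcp_{snd,rcv}buf: validate the new value against
 * the SOCK_MIN_* floor in ->extra1, then drop this netns's connections so
 * they reconnect with the new buffer sizes.
 */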
static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write,
				 void *buffer, size_t *lenp, loff_t *fpos)
{
	struct net *net = current->nsproxy->net_ns;
	int err;

	err = proc_dointvec_minmax(ctl, write, buffer, lenp, fpos);
	if (err < 0) {
		pr_warn("Invalid input. Must be >= %d\n",
			*(int *)(ctl->extra1));
		return err;
	}
	if (write)
		rds_tcp_sysctl_reset(net);
	return 0;
}

static void rds_tcp_exit(void)
{
	rds_tcp_set_unloading();
	synchronize_rcu();
	rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
#if IS_ENABLED(CONFIG_IPV6)
	rds_info_deregister_func(RDS6_INFO_TCP_SOCKETS, rds6_tcp_tc_info);
#endif
	unregister_pernet_device(&rds_tcp_net_ops);
	rds_tcp_destroy_conns();
	rds_trans_unregister(&rds_tcp_transport);
	rds_tcp_recv_exit();
	kmem_cache_destroy(rds_tcp_conn_slab);
}
module_exit(rds_tcp_exit);

static int rds_tcp_init(void)
{
	int ret;

	rds_tcp_conn_slab = kmem_cache_create("rds_tcp_connection",
					      sizeof(struct rds_tcp_connection),
					      0, 0, NULL);
	if (!rds_tcp_conn_slab) {
		ret = -ENOMEM;
		goto out;
	}

	ret = rds_tcp_recv_init();
	if (ret)
		goto out_slab;

	ret = register_pernet_device(&rds_tcp_net_ops);
	if (ret)
		goto out_recv;

	rds_trans_register(&rds_tcp_transport);

	rds_info_register_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
#if IS_ENABLED(CONFIG_IPV6)
	rds_info_register_func(RDS6_INFO_TCP_SOCKETS, rds6_tcp_tc_info);
#endif

	goto out;
out_recv:
	rds_tcp_recv_exit();
out_slab:
	kmem_cache_destroy(rds_tcp_conn_slab);
out:
	return ret;
}
module_init(rds_tcp_init);

MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
MODULE_DESCRIPTION("RDS: TCP transport");
MODULE_LICENSE("Dual BSD/GPL");