cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

rds.h (31627B)


      1/* SPDX-License-Identifier: GPL-2.0 */
      2#ifndef _RDS_RDS_H
      3#define _RDS_RDS_H
      4
      5#include <net/sock.h>
      6#include <linux/scatterlist.h>
      7#include <linux/highmem.h>
      8#include <rdma/rdma_cm.h>
      9#include <linux/mutex.h>
     10#include <linux/rds.h>
     11#include <linux/rhashtable.h>
     12#include <linux/refcount.h>
     13#include <linux/in6.h>
     14
     15#include "info.h"
     16
     17/*
     18 * RDS Network protocol version
     19 */
     20#define RDS_PROTOCOL_3_0	0x0300
     21#define RDS_PROTOCOL_3_1	0x0301
     22#define RDS_PROTOCOL_4_0	0x0400
     23#define RDS_PROTOCOL_4_1	0x0401
     24#define RDS_PROTOCOL_VERSION	RDS_PROTOCOL_3_1
     25#define RDS_PROTOCOL_MAJOR(v)	((v) >> 8)
     26#define RDS_PROTOCOL_MINOR(v)	((v) & 255)
     27#define RDS_PROTOCOL(maj, min)	(((maj) << 8) | min)
     28#define RDS_PROTOCOL_COMPAT_VERSION	RDS_PROTOCOL_3_1
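/*
 * Editorial illustration (not part of the original header): how the macros
 * above pack and unpack a 16-bit protocol version.
 *
 *   RDS_PROTOCOL(4, 1)                   == 0x0401 == RDS_PROTOCOL_4_1
 *   RDS_PROTOCOL_MAJOR(RDS_PROTOCOL_3_1) == 3
 *   RDS_PROTOCOL_MINOR(RDS_PROTOCOL_3_1) == 1
 */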
     29
     30/* The following ports, 16385, 18634, 18635, are registered with IANA as
     31 * the ports to be used for RDS over TCP and UDP.  Currently, only RDS over
     32 * TCP and RDS over IB/RDMA are implemented.  18634 is the historical value
     33 * used for the RDMA_CM listener port.  RDS/TCP uses port 16385.  After
     34 * IPv6 work, RDMA_CM also uses 16385 as the listener port.  18634 is kept
     35 * to ensure compatibility with older RDS modules.  Those ports are defined
     36 * in each transport's header file.
     37 */
     38#define RDS_PORT	18634
     39
     40#ifdef ATOMIC64_INIT
     41#define KERNEL_HAS_ATOMIC64
     42#endif
     43#ifdef RDS_DEBUG
     44#define rdsdebug(fmt, args...) pr_debug("%s(): " fmt, __func__ , ##args)
     45#else
     46/* sigh, pr_debug() causes unused variable warnings */
     47static inline __printf(1, 2)
     48void rdsdebug(char *fmt, ...)
     49{
     50}
     51#endif
     52
     53#define RDS_FRAG_SHIFT	12
     54#define RDS_FRAG_SIZE	((unsigned int)(1 << RDS_FRAG_SHIFT))
     55
     56/* Used to limit both RDMA and non-RDMA RDS message to 1MB */
     57#define RDS_MAX_MSG_SIZE	((unsigned int)(1 << 20))
     58
     59#define RDS_CONG_MAP_BYTES	(65536 / 8)
     60#define RDS_CONG_MAP_PAGES	(PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE)
     61#define RDS_CONG_MAP_PAGE_BITS	(PAGE_SIZE * 8)
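/*
 * Editorial illustration (not part of the original header), assuming 4 KiB
 * pages: RDS_FRAG_SIZE == 4096, RDS_MAX_MSG_SIZE == 1048576 (1 MiB),
 * RDS_CONG_MAP_BYTES == 8192 (one congestion bit per 16-bit port number),
 * RDS_CONG_MAP_PAGES == 2 and RDS_CONG_MAP_PAGE_BITS == 32768.
 */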
     62
     63struct rds_cong_map {
     64	struct rb_node		m_rb_node;
     65	struct in6_addr		m_addr;
     66	wait_queue_head_t	m_waitq;
     67	struct list_head	m_conn_list;
     68	unsigned long		m_page_addrs[RDS_CONG_MAP_PAGES];
     69};
     70
     71
     72/*
     73 * This is how we will track the connection state:
     74 * A connection is always in one of the following
     75 * states. Updates to the state are atomic and imply
     76 * a memory barrier.
     77 */
     78enum {
     79	RDS_CONN_DOWN = 0,
     80	RDS_CONN_CONNECTING,
     81	RDS_CONN_DISCONNECTING,
     82	RDS_CONN_UP,
     83	RDS_CONN_RESETTING,
     84	RDS_CONN_ERROR,
     85};
     86
     87/* Bits for c_flags */
     88#define RDS_LL_SEND_FULL	0
     89#define RDS_RECONNECT_PENDING	1
     90#define RDS_IN_XMIT		2
     91#define RDS_RECV_REFILL		3
     92#define	RDS_DESTROY_PENDING	4
     93
     94/* Max number of multipaths per RDS connection. Must be a power of 2 */
     95#define	RDS_MPATH_WORKERS	8
     96#define	RDS_MPATH_HASH(rs, n) (jhash_1word((rs)->rs_bound_port, \
     97			       (rs)->rs_hash_initval) & ((n) - 1))
     98
     99#define IS_CANONICAL(laddr, faddr) (htonl(laddr) < htonl(faddr))
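/*
 * Editorial sketch (not part of the original header), assuming an mp-capable
 * transport: a socket is mapped to one of the paths by hashing its bound
 * port, and IS_CANONICAL() gives both peers the same ordering of the address
 * pair when deciding which side drives the multipath handshake.
 *
 *   int idx = RDS_MPATH_HASH(rs, RDS_MPATH_WORKERS);  /* 0 .. 7 */
 *   struct rds_conn_path *cp = &conn->c_path[idx];
 */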
    100
    101/* Per mpath connection state */
    102struct rds_conn_path {
    103	struct rds_connection	*cp_conn;
    104	struct rds_message	*cp_xmit_rm;
    105	unsigned long		cp_xmit_sg;
    106	unsigned int		cp_xmit_hdr_off;
    107	unsigned int		cp_xmit_data_off;
    108	unsigned int		cp_xmit_atomic_sent;
    109	unsigned int		cp_xmit_rdma_sent;
    110	unsigned int		cp_xmit_data_sent;
    111
    112	spinlock_t		cp_lock;		/* protect msg queues */
    113	u64			cp_next_tx_seq;
    114	struct list_head	cp_send_queue;
    115	struct list_head	cp_retrans;
    116
    117	u64			cp_next_rx_seq;
    118
    119	void			*cp_transport_data;
    120
    121	atomic_t		cp_state;
    122	unsigned long		cp_send_gen;
    123	unsigned long		cp_flags;
    124	unsigned long		cp_reconnect_jiffies;
    125	struct delayed_work	cp_send_w;
    126	struct delayed_work	cp_recv_w;
    127	struct delayed_work	cp_conn_w;
    128	struct work_struct	cp_down_w;
    129	struct mutex		cp_cm_lock;	/* protect cp_state & cm */
    130	wait_queue_head_t	cp_waitq;
    131
    132	unsigned int		cp_unacked_packets;
    133	unsigned int		cp_unacked_bytes;
    134	unsigned int		cp_index;
    135};
    136
    137/* One rds_connection per RDS address pair */
    138struct rds_connection {
    139	struct hlist_node	c_hash_node;
    140	struct in6_addr		c_laddr;
    141	struct in6_addr		c_faddr;
    142	int			c_dev_if; /* ifindex used for this conn */
    143	int			c_bound_if; /* ifindex of c_laddr */
    144	unsigned int		c_loopback:1,
    145				c_isv6:1,
    146				c_ping_triggered:1,
    147				c_pad_to_32:29;
    148	int			c_npaths;
    149	struct rds_connection	*c_passive;
    150	struct rds_transport	*c_trans;
    151
    152	struct rds_cong_map	*c_lcong;
    153	struct rds_cong_map	*c_fcong;
    154
    155	/* Protocol version */
    156	unsigned int		c_proposed_version;
    157	unsigned int		c_version;
    158	possible_net_t		c_net;
    159
    160	/* TOS */
    161	u8			c_tos;
    162
    163	struct list_head	c_map_item;
    164	unsigned long		c_map_queued;
    165
    166	struct rds_conn_path	*c_path;
    167	wait_queue_head_t	c_hs_waitq; /* handshake waitq */
    168
    169	u32			c_my_gen_num;
    170	u32			c_peer_gen_num;
    171};
    172
    173static inline
    174struct net *rds_conn_net(struct rds_connection *conn)
    175{
    176	return read_pnet(&conn->c_net);
    177}
    178
    179static inline
    180void rds_conn_net_set(struct rds_connection *conn, struct net *net)
    181{
    182	write_pnet(&conn->c_net, net);
    183}
    184
    185#define RDS_FLAG_CONG_BITMAP	0x01
    186#define RDS_FLAG_ACK_REQUIRED	0x02
    187#define RDS_FLAG_RETRANSMITTED	0x04
    188#define RDS_MAX_ADV_CREDIT	255
    189
    190/* RDS_FLAG_PROBE_PORT is the reserved sport used for sending a ping
    191 * probe to exchange control information before establishing a connection.
    192 * Currently the control information that is exchanged is the number of
    193 * supported paths. If the peer is a legacy (older kernel revision) peer,
    194 * it returns a pong message without the additional control information,
    195 * which alerts the sender that the peer is an older rev.
    196 */
    197#define RDS_FLAG_PROBE_PORT	1
    198#define	RDS_HS_PROBE(sport, dport) \
    199		((sport == RDS_FLAG_PROBE_PORT && dport == 0) || \
    200		 (sport == 0 && dport == RDS_FLAG_PROBE_PORT))
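/*
 * Editorial illustration (not part of the original header): the probe ping
 * carries (sport == RDS_FLAG_PROBE_PORT, dport == 0) and the pong reply
 * carries the reverse, so both directions match RDS_HS_PROBE():
 *
 *   RDS_HS_PROBE(RDS_FLAG_PROBE_PORT, 0)   evaluates true  (probe ping)
 *   RDS_HS_PROBE(0, RDS_FLAG_PROBE_PORT)   evaluates true  (probe pong)
 *   RDS_HS_PROBE(0, 0)                     evaluates false
 */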
    201/*
    202 * Maximum space available for extension headers.
    203 */
    204#define RDS_HEADER_EXT_SPACE	16
    205
    206struct rds_header {
    207	__be64	h_sequence;
    208	__be64	h_ack;
    209	__be32	h_len;
    210	__be16	h_sport;
    211	__be16	h_dport;
    212	u8	h_flags;
    213	u8	h_credit;
    214	u8	h_padding[4];
    215	__sum16	h_csum;
    216
    217	u8	h_exthdr[RDS_HEADER_EXT_SPACE];
    218};
    219
    220/*
    221 * Reserved - indicates end of extensions
    222 */
    223#define RDS_EXTHDR_NONE		0
    224
    225/*
    226 * This extension header is included in the very
    227 * first message that is sent on a new connection,
    228 * and identifies the protocol level. This will help
    229 * rolling updates if a future change requires breaking
    230 * the protocol.
    231 * NB: This is no longer true for IB, where we do a version
    232 * negotiation during the connection setup phase (protocol
    233 * version information is included in the RDMA CM private data).
    234 */
    235#define RDS_EXTHDR_VERSION	1
    236struct rds_ext_header_version {
    237	__be32			h_version;
    238};
    239
    240/*
    241 * This extension header is included in the RDS message
    242 * chasing an RDMA operation.
    243 */
    244#define RDS_EXTHDR_RDMA		2
    245struct rds_ext_header_rdma {
    246	__be32			h_rdma_rkey;
    247};
    248
    249/*
    250 * This extension header tells the peer about the
    251 * destination <R_Key,offset> of the requested RDMA
    252 * operation.
    253 */
    254#define RDS_EXTHDR_RDMA_DEST	3
    255struct rds_ext_header_rdma_dest {
    256	__be32			h_rdma_rkey;
    257	__be32			h_rdma_offset;
    258};
    259
    260/* Extension header announcing number of paths.
    261 * Implicit length = 2 bytes.
    262 */
    263#define RDS_EXTHDR_NPATHS	5
    264#define RDS_EXTHDR_GEN_NUM	6
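/*
 * Editorial sketch (not part of the original header) of how the 16-byte
 * h_exthdr area is filled by rds_message_add_extension(): a one-byte type
 * code, that type's fixed-size payload, then RDS_EXTHDR_NONE.  For an
 * NPATHS announcement the bytes would look roughly like:
 *
 *   { RDS_EXTHDR_NPATHS, <__be16 npaths>, RDS_EXTHDR_NONE, 0, ... }
 */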
    265
    266#define __RDS_EXTHDR_MAX	16 /* for now */
    267#define RDS_RX_MAX_TRACES	(RDS_MSG_RX_DGRAM_TRACE_MAX + 1)
    268#define	RDS_MSG_RX_HDR		0
    269#define	RDS_MSG_RX_START	1
    270#define	RDS_MSG_RX_END		2
    271#define	RDS_MSG_RX_CMSG		3
    272
    273/* The following values are whitelisted for usercopy */
    274struct rds_inc_usercopy {
    275	rds_rdma_cookie_t	rdma_cookie;
    276	ktime_t			rx_tstamp;
    277};
    278
    279struct rds_incoming {
    280	refcount_t		i_refcount;
    281	struct list_head	i_item;
    282	struct rds_connection	*i_conn;
    283	struct rds_conn_path	*i_conn_path;
    284	struct rds_header	i_hdr;
    285	unsigned long		i_rx_jiffies;
    286	struct in6_addr		i_saddr;
    287
    288	struct rds_inc_usercopy i_usercopy;
    289	u64			i_rx_lat_trace[RDS_RX_MAX_TRACES];
    290};
    291
    292struct rds_mr {
    293	struct rb_node		r_rb_node;
    294	struct kref		r_kref;
    295	u32			r_key;
    296
    297	/* A copy of the creation flags */
    298	unsigned int		r_use_once:1;
    299	unsigned int		r_invalidate:1;
    300	unsigned int		r_write:1;
    301
    302	struct rds_sock		*r_sock; /* back pointer to the socket that owns us */
    303	struct rds_transport	*r_trans;
    304	void			*r_trans_private;
    305};
    306
    307static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset)
    308{
    309	return r_key | (((u64) offset) << 32);
    310}
    311
    312static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie)
    313{
    314	return cookie;
    315}
    316
    317static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
    318{
    319	return cookie >> 32;
    320}
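/*
 * Editorial illustration (not part of the original header): the cookie packs
 * the 32-bit R_Key in the low half and the byte offset in the high half.
 *
 *   rds_rdma_make_cookie(0x1234, 8)  == 0x0000000800001234ULL
 *   rds_rdma_cookie_key(cookie)      == 0x1234
 *   rds_rdma_cookie_offset(cookie)   == 8
 */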
    321
    322/* atomic operation types */
    323#define RDS_ATOMIC_TYPE_CSWP		0
    324#define RDS_ATOMIC_TYPE_FADD		1
    325
    326/*
    327 * m_sock_item and m_conn_item are on lists that are serialized under
    328 * conn->c_lock.  m_sock_item has additional meaning in that once it is empty
    329 * the message will not be put back on the retransmit list after being sent.
    330 * Messages that are canceled while being sent rely on this.
    331 *
    332 * m_inc is used by loopback so that it can pass an incoming message straight
    333 * back up into the rx path.  It embeds a wire header which is also used by
    334 * the send path, which is kind of awkward.
    335 *
    336 * m_sock_item indicates the message's presence on a socket's send or receive
    337 * queue.  m_rs will point to that socket.
    338 *
    339 * m_daddr is used by cancellation to prune messages to a given destination.
    340 *
    341 * The RDS_MSG_ON_SOCK and RDS_MSG_ON_CONN flags are used to avoid lock
    342 * nesting.  As paths iterate over messages on a sock, or conn, they must
    343 * also lock the conn, or sock, to remove the message from those lists too.
    344 * Testing the flag to determine if the message is still on the lists lets
    345 * us avoid testing the list_head directly.  That means each path can use
    346 * the message's list_head to keep it on a local list while juggling locks
    347 * without confusing the other path.
    348 *
    349 * m_ack_seq is an optional field set by transports who need a different
    350 * sequence number range to invalidate.  They can use this in a callback
    351 * that they pass to rds_send_drop_acked() to see if each message has been
    352 * acked.  The HAS_ACK_SEQ flag can be used to detect messages which haven't
    353 * had ack_seq set yet.
    354 */
    355#define RDS_MSG_ON_SOCK		1
    356#define RDS_MSG_ON_CONN		2
    357#define RDS_MSG_HAS_ACK_SEQ	3
    358#define RDS_MSG_ACK_REQUIRED	4
    359#define RDS_MSG_RETRANSMITTED	5
    360#define RDS_MSG_MAPPED		6
    361#define RDS_MSG_PAGEVEC		7
    362#define RDS_MSG_FLUSH		8
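/*
 * Editorial note (not part of the original header): these are bit numbers
 * for rm->m_flags and are manipulated with the atomic bitops, e.g.
 *
 *   set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
 *   if (test_bit(RDS_MSG_ON_CONN, &rm->m_flags))
 *           list_del_init(&rm->m_conn_item);
 */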
    363
    364struct rds_znotifier {
    365	struct mmpin		z_mmp;
    366	u32			z_cookie;
    367};
    368
    369struct rds_msg_zcopy_info {
    370	struct list_head rs_zcookie_next;
    371	union {
    372		struct rds_znotifier znotif;
    373		struct rds_zcopy_cookies zcookies;
    374	};
    375};
    376
    377struct rds_msg_zcopy_queue {
    378	struct list_head zcookie_head;
    379	spinlock_t lock; /* protects zcookie_head queue */
    380};
    381
    382static inline void rds_message_zcopy_queue_init(struct rds_msg_zcopy_queue *q)
    383{
    384	spin_lock_init(&q->lock);
    385	INIT_LIST_HEAD(&q->zcookie_head);
    386}
    387
    388struct rds_iov_vector {
    389	struct rds_iovec *iov;
    390	int               len;
    391};
    392
    393struct rds_iov_vector_arr {
    394	struct rds_iov_vector *vec;
    395	int                    len;
    396	int                    indx;
    397	int                    incr;
    398};
    399
    400struct rds_message {
    401	refcount_t		m_refcount;
    402	struct list_head	m_sock_item;
    403	struct list_head	m_conn_item;
    404	struct rds_incoming	m_inc;
    405	u64			m_ack_seq;
    406	struct in6_addr		m_daddr;
    407	unsigned long		m_flags;
    408
    409	/* Never access m_rs without holding m_rs_lock.
    410	 * Lock nesting is
    411	 *  rm->m_rs_lock
    412	 *   -> rs->rs_lock
    413	 */
    414	spinlock_t		m_rs_lock;
    415	wait_queue_head_t	m_flush_wait;
    416
    417	struct rds_sock		*m_rs;
    418
    419	/* cookie to send to remote, in rds header */
    420	rds_rdma_cookie_t	m_rdma_cookie;
    421
    422	unsigned int		m_used_sgs;
    423	unsigned int		m_total_sgs;
    424
    425	void			*m_final_op;
    426
    427	struct {
    428		struct rm_atomic_op {
    429			int			op_type;
    430			union {
    431				struct {
    432					uint64_t	compare;
    433					uint64_t	swap;
    434					uint64_t	compare_mask;
    435					uint64_t	swap_mask;
    436				} op_m_cswp;
    437				struct {
    438					uint64_t	add;
    439					uint64_t	nocarry_mask;
    440				} op_m_fadd;
    441			};
    442
    443			u32			op_rkey;
    444			u64			op_remote_addr;
    445			unsigned int		op_notify:1;
    446			unsigned int		op_recverr:1;
    447			unsigned int		op_mapped:1;
    448			unsigned int		op_silent:1;
    449			unsigned int		op_active:1;
    450			struct scatterlist	*op_sg;
    451			struct rds_notifier	*op_notifier;
    452
    453			struct rds_mr		*op_rdma_mr;
    454		} atomic;
    455		struct rm_rdma_op {
    456			u32			op_rkey;
    457			u64			op_remote_addr;
    458			unsigned int		op_write:1;
    459			unsigned int		op_fence:1;
    460			unsigned int		op_notify:1;
    461			unsigned int		op_recverr:1;
    462			unsigned int		op_mapped:1;
    463			unsigned int		op_silent:1;
    464			unsigned int		op_active:1;
    465			unsigned int		op_bytes;
    466			unsigned int		op_nents;
    467			unsigned int		op_count;
    468			struct scatterlist	*op_sg;
    469			struct rds_notifier	*op_notifier;
    470
    471			struct rds_mr		*op_rdma_mr;
    472
    473			u64			op_odp_addr;
    474			struct rds_mr		*op_odp_mr;
    475		} rdma;
    476		struct rm_data_op {
    477			unsigned int		op_active:1;
    478			unsigned int		op_nents;
    479			unsigned int		op_count;
    480			unsigned int		op_dmasg;
    481			unsigned int		op_dmaoff;
    482			struct rds_znotifier	*op_mmp_znotifier;
    483			struct scatterlist	*op_sg;
    484		} data;
    485	};
    486
    487	struct rds_conn_path *m_conn_path;
    488};
    489
    490/*
    491 * The RDS notifier is used (optionally) to tell the application about
    492 * completed RDMA operations. Rather than keeping the whole rds message
    493 * around on the queue, we allocate a small notifier that is put on the
    494 * socket's notifier_list. Notifications are delivered to the application
    495 * through control messages.
    496 */
    497struct rds_notifier {
    498	struct list_head	n_list;
    499	uint64_t		n_user_token;
    500	int			n_status;
    501};
    502
    503/* Available as part of RDS core, so doesn't need to participate
    504 * in get_preferred transport etc
    505 */
    506#define	RDS_TRANS_LOOP	3
    507
    508/**
    509 * struct rds_transport -  transport specific behavioural hooks
    510 *
    511 * @xmit: .xmit is called by rds_send_xmit() to tell the transport to send
    512 *        part of a message.  The caller serializes on the send_sem so this
    513 *        doesn't need to be reentrant for a given conn.  The header must be
    514 *        sent before the data payload.  .xmit must be prepared to send a
    515 *        message with no data payload.  .xmit should return the number of
    516 *        bytes that were sent down the connection, including header bytes.
    517 *        Returning 0 tells the caller that it doesn't need to perform any
    518 *        additional work now.  This is usually the case when the transport has
    519 *        filled the sending queue for its connection and will handle
    520 *        triggering the rds thread to continue the send when space becomes
    521 *        available.  Returning -EAGAIN tells the caller to retry the send
    522 *        immediately.  Returning -ENOMEM tells the caller to retry the send at
    523 *        some point in the future.
    524 *
    525 * @conn_shutdown: conn_shutdown stops traffic on the given connection.  Once
    526 *                 it returns the connection can not call rds_recv_incoming().
    527 *                 This will only be called once after conn_connect returns
    528 *                 non-zero success.  The caller serializes this with
    529 *                 the send and connecting paths (xmit_* and conn_*).  The
    530 *                 transport is responsible for other serialization, including
    531 *                 rds_recv_incoming().  This is called in process context but
    532 *                 should try hard not to block.
    533 */
    534
    535struct rds_transport {
    536	char			t_name[TRANSNAMSIZ];
    537	struct list_head	t_item;
    538	struct module		*t_owner;
    539	unsigned int		t_prefer_loopback:1,
    540				t_mp_capable:1;
    541	unsigned int		t_type;
    542
    543	int (*laddr_check)(struct net *net, const struct in6_addr *addr,
    544			   __u32 scope_id);
    545	int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp);
    546	void (*conn_free)(void *data);
    547	int (*conn_path_connect)(struct rds_conn_path *cp);
    548	void (*conn_path_shutdown)(struct rds_conn_path *conn);
    549	void (*xmit_path_prepare)(struct rds_conn_path *cp);
    550	void (*xmit_path_complete)(struct rds_conn_path *cp);
    551	int (*xmit)(struct rds_connection *conn, struct rds_message *rm,
    552		    unsigned int hdr_off, unsigned int sg, unsigned int off);
    553	int (*xmit_rdma)(struct rds_connection *conn, struct rm_rdma_op *op);
    554	int (*xmit_atomic)(struct rds_connection *conn, struct rm_atomic_op *op);
    555	int (*recv_path)(struct rds_conn_path *cp);
    556	int (*inc_copy_to_user)(struct rds_incoming *inc, struct iov_iter *to);
    557	void (*inc_free)(struct rds_incoming *inc);
    558
    559	int (*cm_handle_connect)(struct rdma_cm_id *cm_id,
    560				 struct rdma_cm_event *event, bool isv6);
    561	int (*cm_initiate_connect)(struct rdma_cm_id *cm_id, bool isv6);
    562	void (*cm_connect_complete)(struct rds_connection *conn,
    563				    struct rdma_cm_event *event);
    564
    565	unsigned int (*stats_info_copy)(struct rds_info_iterator *iter,
    566					unsigned int avail);
    567	void (*exit)(void);
    568	void *(*get_mr)(struct scatterlist *sg, unsigned long nr_sg,
    569			struct rds_sock *rs, u32 *key_ret,
    570			struct rds_connection *conn,
    571			u64 start, u64 length, int need_odp);
    572	void (*sync_mr)(void *trans_private, int direction);
    573	void (*free_mr)(void *trans_private, int invalidate);
    574	void (*flush_mrs)(void);
    575	bool (*t_unloading)(struct rds_connection *conn);
    576	u8 (*get_tos_map)(u8 tos);
    577};
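/*
 * Editorial sketch (not part of the original header): a transport fills in
 * this ops table and hands it to rds_trans_register() (declared below).
 * "foo" and the rds_foo_* callbacks are hypothetical placeholders.
 *
 *   static struct rds_transport rds_foo_transport = {
 *           .t_name      = "foo",
 *           .t_type      = RDS_TRANS_TCP,   /* from <linux/rds.h> */
 *           .laddr_check = rds_foo_laddr_check,
 *           .conn_alloc  = rds_foo_conn_alloc,
 *           .xmit        = rds_foo_xmit,
 *           ...
 *   };
 *   rds_trans_register(&rds_foo_transport);
 */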
    578
    579/* Bind hash table key length.  It is the sum of the size of a struct
    580 * in6_addr, a scope_id and a port.
    581 */
    582#define RDS_BOUND_KEY_LEN \
    583	(sizeof(struct in6_addr) + sizeof(__u32) + sizeof(__be16))
    584
    585struct rds_sock {
    586	struct sock		rs_sk;
    587
    588	u64			rs_user_addr;
    589	u64			rs_user_bytes;
    590
    591	/*
    592	 * bound_addr used for both incoming and outgoing, no INADDR_ANY
    593	 * support.
    594	 */
    595	struct rhash_head	rs_bound_node;
    596	u8			rs_bound_key[RDS_BOUND_KEY_LEN];
    597	struct sockaddr_in6	rs_bound_sin6;
    598#define rs_bound_addr		rs_bound_sin6.sin6_addr
    599#define rs_bound_addr_v4	rs_bound_sin6.sin6_addr.s6_addr32[3]
    600#define rs_bound_port		rs_bound_sin6.sin6_port
    601#define rs_bound_scope_id	rs_bound_sin6.sin6_scope_id
    602	struct in6_addr		rs_conn_addr;
    603#define rs_conn_addr_v4		rs_conn_addr.s6_addr32[3]
    604	__be16			rs_conn_port;
    605	struct rds_transport    *rs_transport;
    606
    607	/*
    608	 * rds_sendmsg caches the conn it used the last time around.
    609	 * This helps avoid costly lookups.
    610	 */
    611	struct rds_connection	*rs_conn;
    612
    613	/* flag indicating we were congested or not */
    614	int			rs_congested;
    615	/* seen congestion (ENOBUFS) when sending? */
    616	int			rs_seen_congestion;
    617
    618	/* rs_lock protects all these adjacent members before the newline */
    619	spinlock_t		rs_lock;
    620	struct list_head	rs_send_queue;
    621	u32			rs_snd_bytes;
    622	int			rs_rcv_bytes;
    623	struct list_head	rs_notify_queue;	/* currently used for failed RDMAs */
    624
    625	/* Congestion wake_up. If rs_cong_monitor is set, we use cong_mask
    626	 * to decide whether the application should be woken up.
    627	 * If not set, we use rs_cong_track to find out whether a cong map
    628	 * update arrived.
    629	 */
    630	uint64_t		rs_cong_mask;
    631	uint64_t		rs_cong_notify;
    632	struct list_head	rs_cong_list;
    633	unsigned long		rs_cong_track;
    634
    635	/*
    636	 * rs_recv_lock protects the receive queue, and is
    637	 * used to serialize with rds_release.
    638	 */
    639	rwlock_t		rs_recv_lock;
    640	struct list_head	rs_recv_queue;
    641
    642	/* just for stats reporting */
    643	struct list_head	rs_item;
    644
    645	/* these have their own lock */
    646	spinlock_t		rs_rdma_lock;
    647	struct rb_root		rs_rdma_keys;
    648
    649	/* Socket options - in case there will be more */
    650	unsigned char		rs_recverr,
    651				rs_cong_monitor;
    652	u32			rs_hash_initval;
    653
    654	/* Socket receive path trace points */
    655	u8			rs_rx_traces;
    656	u8			rs_rx_trace[RDS_MSG_RX_DGRAM_TRACE_MAX];
    657	struct rds_msg_zcopy_queue rs_zcookie_queue;
    658	u8			rs_tos;
    659};
    660
    661static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk)
    662{
    663	return container_of(sk, struct rds_sock, rs_sk);
    664}
    665static inline struct sock *rds_rs_to_sk(struct rds_sock *rs)
    666{
    667	return &rs->rs_sk;
    668}
    669
    670/*
    671 * The stack assigns sk_sndbuf and sk_rcvbuf to twice the specified value
    672 * to account for overhead.  We don't account for overhead, we just apply
    673 * the number of payload bytes to the specified value.
    674 */
    675static inline int rds_sk_sndbuf(struct rds_sock *rs)
    676{
    677	return rds_rs_to_sk(rs)->sk_sndbuf / 2;
    678}
    679static inline int rds_sk_rcvbuf(struct rds_sock *rs)
    680{
    681	return rds_rs_to_sk(rs)->sk_rcvbuf / 2;
    682}
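/*
 * Editorial illustration (not part of the original header): setting
 * SO_SNDBUF to 65536 makes the core store sk_sndbuf == 131072, and
 * rds_sk_sndbuf() reports 65536 payload bytes again.
 */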
    683
    684struct rds_statistics {
    685	uint64_t	s_conn_reset;
    686	uint64_t	s_recv_drop_bad_checksum;
    687	uint64_t	s_recv_drop_old_seq;
    688	uint64_t	s_recv_drop_no_sock;
    689	uint64_t	s_recv_drop_dead_sock;
    690	uint64_t	s_recv_deliver_raced;
    691	uint64_t	s_recv_delivered;
    692	uint64_t	s_recv_queued;
    693	uint64_t	s_recv_immediate_retry;
    694	uint64_t	s_recv_delayed_retry;
    695	uint64_t	s_recv_ack_required;
    696	uint64_t	s_recv_rdma_bytes;
    697	uint64_t	s_recv_ping;
    698	uint64_t	s_send_queue_empty;
    699	uint64_t	s_send_queue_full;
    700	uint64_t	s_send_lock_contention;
    701	uint64_t	s_send_lock_queue_raced;
    702	uint64_t	s_send_immediate_retry;
    703	uint64_t	s_send_delayed_retry;
    704	uint64_t	s_send_drop_acked;
    705	uint64_t	s_send_ack_required;
    706	uint64_t	s_send_queued;
    707	uint64_t	s_send_rdma;
    708	uint64_t	s_send_rdma_bytes;
    709	uint64_t	s_send_pong;
    710	uint64_t	s_page_remainder_hit;
    711	uint64_t	s_page_remainder_miss;
    712	uint64_t	s_copy_to_user;
    713	uint64_t	s_copy_from_user;
    714	uint64_t	s_cong_update_queued;
    715	uint64_t	s_cong_update_received;
    716	uint64_t	s_cong_send_error;
    717	uint64_t	s_cong_send_blocked;
    718	uint64_t	s_recv_bytes_added_to_socket;
    719	uint64_t	s_recv_bytes_removed_from_socket;
    720	uint64_t	s_send_stuck_rm;
    721};
    722
    723/* af_rds.c */
    724void rds_sock_addref(struct rds_sock *rs);
    725void rds_sock_put(struct rds_sock *rs);
    726void rds_wake_sk_sleep(struct rds_sock *rs);
    727static inline void __rds_wake_sk_sleep(struct sock *sk)
    728{
    729	wait_queue_head_t *waitq = sk_sleep(sk);
    730
    731	if (!sock_flag(sk, SOCK_DEAD) && waitq)
    732		wake_up(waitq);
    733}
    734extern wait_queue_head_t rds_poll_waitq;
    735
    736
    737/* bind.c */
    738int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len);
    739void rds_remove_bound(struct rds_sock *rs);
    740struct rds_sock *rds_find_bound(const struct in6_addr *addr, __be16 port,
    741				__u32 scope_id);
    742int rds_bind_lock_init(void);
    743void rds_bind_lock_destroy(void);
    744
    745/* cong.c */
    746int rds_cong_get_maps(struct rds_connection *conn);
    747void rds_cong_add_conn(struct rds_connection *conn);
    748void rds_cong_remove_conn(struct rds_connection *conn);
    749void rds_cong_set_bit(struct rds_cong_map *map, __be16 port);
    750void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port);
    751int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock, struct rds_sock *rs);
    752void rds_cong_queue_updates(struct rds_cong_map *map);
    753void rds_cong_map_updated(struct rds_cong_map *map, uint64_t);
    754int rds_cong_updated_since(unsigned long *recent);
    755void rds_cong_add_socket(struct rds_sock *);
    756void rds_cong_remove_socket(struct rds_sock *);
    757void rds_cong_exit(void);
    758struct rds_message *rds_cong_update_alloc(struct rds_connection *conn);
    759
    760/* connection.c */
    761extern u32 rds_gen_num;
    762int rds_conn_init(void);
    763void rds_conn_exit(void);
    764struct rds_connection *rds_conn_create(struct net *net,
    765				       const struct in6_addr *laddr,
    766				       const struct in6_addr *faddr,
    767				       struct rds_transport *trans,
    768				       u8 tos, gfp_t gfp,
    769				       int dev_if);
    770struct rds_connection *rds_conn_create_outgoing(struct net *net,
    771						const struct in6_addr *laddr,
    772						const struct in6_addr *faddr,
    773						struct rds_transport *trans,
    774						u8 tos, gfp_t gfp, int dev_if);
    775void rds_conn_shutdown(struct rds_conn_path *cpath);
    776void rds_conn_destroy(struct rds_connection *conn);
    777void rds_conn_drop(struct rds_connection *conn);
    778void rds_conn_path_drop(struct rds_conn_path *cpath, bool destroy);
    779void rds_conn_connect_if_down(struct rds_connection *conn);
    780void rds_conn_path_connect_if_down(struct rds_conn_path *cp);
    781void rds_check_all_paths(struct rds_connection *conn);
    782void rds_for_each_conn_info(struct socket *sock, unsigned int len,
    783			  struct rds_info_iterator *iter,
    784			  struct rds_info_lengths *lens,
    785			  int (*visitor)(struct rds_connection *, void *),
    786			  u64 *buffer,
    787			  size_t item_len);
    788
    789__printf(2, 3)
    790void __rds_conn_path_error(struct rds_conn_path *cp, const char *, ...);
    791#define rds_conn_path_error(cp, fmt...) \
    792	__rds_conn_path_error(cp, KERN_WARNING "RDS: " fmt)
    793
    794static inline int
    795rds_conn_path_transition(struct rds_conn_path *cp, int old, int new)
    796{
    797	return atomic_cmpxchg(&cp->cp_state, old, new) == old;
    798}
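/*
 * Editorial sketch (not part of the original header): callers race for a
 * state transition with cmpxchg, so only one path wins DOWN -> CONNECTING
 * and, for example, queues the connect worker:
 *
 *   if (rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_CONNECTING))
 *           queue_delayed_work(rds_wq, &cp->cp_conn_w, 0);
 */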
    799
    800static inline int
    801rds_conn_transition(struct rds_connection *conn, int old, int new)
    802{
    803	WARN_ON(conn->c_trans->t_mp_capable);
    804	return rds_conn_path_transition(&conn->c_path[0], old, new);
    805}
    806
    807static inline int
    808rds_conn_path_state(struct rds_conn_path *cp)
    809{
    810	return atomic_read(&cp->cp_state);
    811}
    812
    813static inline int
    814rds_conn_state(struct rds_connection *conn)
    815{
    816	WARN_ON(conn->c_trans->t_mp_capable);
    817	return rds_conn_path_state(&conn->c_path[0]);
    818}
    819
    820static inline int
    821rds_conn_path_up(struct rds_conn_path *cp)
    822{
    823	return atomic_read(&cp->cp_state) == RDS_CONN_UP;
    824}
    825
    826static inline int
    827rds_conn_path_down(struct rds_conn_path *cp)
    828{
    829	return atomic_read(&cp->cp_state) == RDS_CONN_DOWN;
    830}
    831
    832static inline int
    833rds_conn_up(struct rds_connection *conn)
    834{
    835	WARN_ON(conn->c_trans->t_mp_capable);
    836	return rds_conn_path_up(&conn->c_path[0]);
    837}
    838
    839static inline int
    840rds_conn_path_connecting(struct rds_conn_path *cp)
    841{
    842	return atomic_read(&cp->cp_state) == RDS_CONN_CONNECTING;
    843}
    844
    845static inline int
    846rds_conn_connecting(struct rds_connection *conn)
    847{
    848	WARN_ON(conn->c_trans->t_mp_capable);
    849	return rds_conn_path_connecting(&conn->c_path[0]);
    850}
    851
    852/* message.c */
    853struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp);
    854struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents);
    855int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from,
    856			       bool zcopy);
    857struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len);
    858void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
    859				 __be16 dport, u64 seq);
    860int rds_message_add_extension(struct rds_header *hdr,
    861			      unsigned int type, const void *data, unsigned int len);
    862int rds_message_next_extension(struct rds_header *hdr,
    863			       unsigned int *pos, void *buf, unsigned int *buflen);
    864int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset);
    865int rds_message_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to);
    866void rds_message_inc_free(struct rds_incoming *inc);
    867void rds_message_addref(struct rds_message *rm);
    868void rds_message_put(struct rds_message *rm);
    869void rds_message_wait(struct rds_message *rm);
    870void rds_message_unmapped(struct rds_message *rm);
    871void rds_notify_msg_zcopy_purge(struct rds_msg_zcopy_queue *info);
    872
    873static inline void rds_message_make_checksum(struct rds_header *hdr)
    874{
    875	hdr->h_csum = 0;
    876	hdr->h_csum = ip_fast_csum((void *) hdr, sizeof(*hdr) >> 2);
    877}
    878
    879static inline int rds_message_verify_checksum(const struct rds_header *hdr)
    880{
    881	return !hdr->h_csum || ip_fast_csum((void *) hdr, sizeof(*hdr) >> 2) == 0;
    882}
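/*
 * Editorial note (not part of the original header): struct rds_header is
 * 48 bytes, so sizeof(*hdr) >> 2 hands ip_fast_csum() twelve 32-bit words;
 * a zero h_csum means "no checksum" and always verifies.
 */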
    883
    884
    885/* page.c */
    886int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
    887			     gfp_t gfp);
    888void rds_page_exit(void);
    889
    890/* recv.c */
    891void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
    892		  struct in6_addr *saddr);
    893void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *conn,
    894		       struct in6_addr *saddr);
    895void rds_inc_put(struct rds_incoming *inc);
    896void rds_recv_incoming(struct rds_connection *conn, struct in6_addr *saddr,
    897		       struct in6_addr *daddr,
    898		       struct rds_incoming *inc, gfp_t gfp);
    899int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
    900		int msg_flags);
    901void rds_clear_recv_queue(struct rds_sock *rs);
    902int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msg);
    903void rds_inc_info_copy(struct rds_incoming *inc,
    904		       struct rds_info_iterator *iter,
    905		       __be32 saddr, __be32 daddr, int flip);
    906void rds6_inc_info_copy(struct rds_incoming *inc,
    907			struct rds_info_iterator *iter,
    908			struct in6_addr *saddr, struct in6_addr *daddr,
    909			int flip);
    910
    911/* send.c */
    912int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len);
    913void rds_send_path_reset(struct rds_conn_path *conn);
    914int rds_send_xmit(struct rds_conn_path *cp);
    915struct sockaddr_in;
    916void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in6 *dest);
    917typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack);
    918void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
    919			 is_acked_func is_acked);
    920void rds_send_path_drop_acked(struct rds_conn_path *cp, u64 ack,
    921			      is_acked_func is_acked);
    922void rds_send_ping(struct rds_connection *conn, int cp_index);
    923int rds_send_pong(struct rds_conn_path *cp, __be16 dport);
    924
    925/* rdma.c */
    926void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force);
    927int rds_get_mr(struct rds_sock *rs, sockptr_t optval, int optlen);
    928int rds_get_mr_for_dest(struct rds_sock *rs, sockptr_t optval, int optlen);
    929int rds_free_mr(struct rds_sock *rs, sockptr_t optval, int optlen);
    930void rds_rdma_drop_keys(struct rds_sock *rs);
    931int rds_rdma_extra_size(struct rds_rdma_args *args,
    932			struct rds_iov_vector *iov);
    933int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
    934			  struct cmsghdr *cmsg);
    935int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
    936			  struct cmsghdr *cmsg,
    937			  struct rds_iov_vector *vec);
    938int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
    939			  struct cmsghdr *cmsg);
    940void rds_rdma_free_op(struct rm_rdma_op *ro);
    941void rds_atomic_free_op(struct rm_atomic_op *ao);
    942void rds_rdma_send_complete(struct rds_message *rm, int wc_status);
    943void rds_atomic_send_complete(struct rds_message *rm, int wc_status);
    944int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
    945		    struct cmsghdr *cmsg);
    946
    947void __rds_put_mr_final(struct kref *kref);
    948
    949static inline bool rds_destroy_pending(struct rds_connection *conn)
    950{
    951	return !check_net(rds_conn_net(conn)) ||
    952	       (conn->c_trans->t_unloading && conn->c_trans->t_unloading(conn));
    953}
    954
    955enum {
    956	ODP_NOT_NEEDED,
    957	ODP_ZEROBASED,
    958	ODP_VIRTUAL
    959};
    960
    961/* stats.c */
    962DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
    963#define rds_stats_inc_which(which, member) do {		\
    964	per_cpu(which, get_cpu()).member++;		\
    965	put_cpu();					\
    966} while (0)
    967#define rds_stats_inc(member) rds_stats_inc_which(rds_stats, member)
    968#define rds_stats_add_which(which, member, count) do {		\
    969	per_cpu(which, get_cpu()).member += count;	\
    970	put_cpu();					\
    971} while (0)
    972#define rds_stats_add(member, count) rds_stats_add_which(rds_stats, member, count)
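/*
 * Editorial illustration (not part of the original header): the counters are
 * kept per CPU and only summed when reported, so callers simply bump a
 * member by name:
 *
 *   rds_stats_inc(s_recv_ping);
 *   rds_stats_add(s_copy_to_user, copied);
 */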
    973int rds_stats_init(void);
    974void rds_stats_exit(void);
    975void rds_stats_info_copy(struct rds_info_iterator *iter,
    976			 uint64_t *values, const char *const *names,
    977			 size_t nr);
    978
    979/* sysctl.c */
    980int rds_sysctl_init(void);
    981void rds_sysctl_exit(void);
    982extern unsigned long rds_sysctl_sndbuf_min;
    983extern unsigned long rds_sysctl_sndbuf_default;
    984extern unsigned long rds_sysctl_sndbuf_max;
    985extern unsigned long rds_sysctl_reconnect_min_jiffies;
    986extern unsigned long rds_sysctl_reconnect_max_jiffies;
    987extern unsigned int  rds_sysctl_max_unacked_packets;
    988extern unsigned int  rds_sysctl_max_unacked_bytes;
    989extern unsigned int  rds_sysctl_ping_enable;
    990extern unsigned long rds_sysctl_trace_flags;
    991extern unsigned int  rds_sysctl_trace_level;
    992
    993/* threads.c */
    994int rds_threads_init(void);
    995void rds_threads_exit(void);
    996extern struct workqueue_struct *rds_wq;
    997void rds_queue_reconnect(struct rds_conn_path *cp);
    998void rds_connect_worker(struct work_struct *);
    999void rds_shutdown_worker(struct work_struct *);
   1000void rds_send_worker(struct work_struct *);
   1001void rds_recv_worker(struct work_struct *);
   1002void rds_connect_path_complete(struct rds_conn_path *conn, int curr);
   1003void rds_connect_complete(struct rds_connection *conn);
   1004int rds_addr_cmp(const struct in6_addr *a1, const struct in6_addr *a2);
   1005
   1006/* transport.c */
   1007void rds_trans_register(struct rds_transport *trans);
   1008void rds_trans_unregister(struct rds_transport *trans);
   1009struct rds_transport *rds_trans_get_preferred(struct net *net,
   1010					      const struct in6_addr *addr,
   1011					      __u32 scope_id);
   1012void rds_trans_put(struct rds_transport *trans);
   1013unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
   1014				       unsigned int avail);
   1015struct rds_transport *rds_trans_get(int t_type);
   1016int rds_trans_init(void);
   1017void rds_trans_exit(void);
   1018
   1019#endif