From f213c05272100f385912372fff678d0af4d7f8ad Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Mon, 23 May 2016 15:20:49 +0300 Subject: IB/uverbs: Add WQ support User space applications which use RSS functionality need to create a work queue object (WQ). The lifetime of such an object is: * Create a WQ * Modify the WQ from reset to init state. * Use the WQ (by downstream patches). * Destroy the WQ. These commands are added to the uverbs API. Signed-off-by: Yishai Hadas Signed-off-by: Matan Barak Reviewed-by: Sagi Grimberg Signed-off-by: Doug Ledford --- include/uapi/rdma/ib_user_verbs.h | 41 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index b6543d73d20a..c9470e511e03 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -95,6 +95,9 @@ enum { IB_USER_VERBS_EX_CMD_CREATE_QP = IB_USER_VERBS_CMD_CREATE_QP, IB_USER_VERBS_EX_CMD_CREATE_FLOW = IB_USER_VERBS_CMD_THRESHOLD, IB_USER_VERBS_EX_CMD_DESTROY_FLOW, + IB_USER_VERBS_EX_CMD_CREATE_WQ, + IB_USER_VERBS_EX_CMD_MODIFY_WQ, + IB_USER_VERBS_EX_CMD_DESTROY_WQ, }; /* @@ -946,4 +949,42 @@ struct ib_uverbs_destroy_srq_resp { __u32 events_reported; }; +struct ib_uverbs_ex_create_wq { + __u32 comp_mask; + __u32 wq_type; + __u64 user_handle; + __u32 pd_handle; + __u32 cq_handle; + __u32 max_wr; + __u32 max_sge; +}; + +struct ib_uverbs_ex_create_wq_resp { + __u32 comp_mask; + __u32 response_length; + __u32 wq_handle; + __u32 max_wr; + __u32 max_sge; + __u32 wqn; +}; + +struct ib_uverbs_ex_destroy_wq { + __u32 comp_mask; + __u32 wq_handle; +}; + +struct ib_uverbs_ex_destroy_wq_resp { + __u32 comp_mask; + __u32 response_length; + __u32 events_reported; + __u32 reserved; +}; + +struct ib_uverbs_ex_modify_wq { + __u32 attr_mask; + __u32 wq_handle; + __u32 wq_state; + __u32 curr_wq_state; +}; + #endif /* IB_USER_VERBS_H */ -- cgit v1.2.3-71-gd317 From de019a94049d579608a5511f8c50652faf125182 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Mon, 23 May 2016 15:20:52 +0300 Subject: IB/uverbs: Introduce RWQ Indirection table User applications that want to spread traffic on several WQs, need to create an indirection table, by using already created WQs. Adding uverbs API in order to create and destroy this table. Signed-off-by: Yishai Hadas Signed-off-by: Matan Barak Reviewed-by: Sagi Grimberg Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs.h | 3 + drivers/infiniband/core/uverbs_cmd.c | 210 ++++++++++++++++++++++++++++++++++ drivers/infiniband/core/uverbs_main.c | 13 +++ include/rdma/ib_verbs.h | 1 + include/uapi/rdma/ib_user_verbs.h | 26 +++++ 5 files changed, 253 insertions(+) (limited to 'include/uapi') diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 74776c6531f4..6c2292397cd6 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -186,6 +186,7 @@ extern struct idr ib_uverbs_srq_idr; extern struct idr ib_uverbs_xrcd_idr; extern struct idr ib_uverbs_rule_idr; extern struct idr ib_uverbs_wq_idr; +extern struct idr ib_uverbs_rwq_ind_tbl_idr; void idr_remove_uobj(struct idr *idp, struct ib_uobject *uobj); @@ -284,5 +285,7 @@ IB_UVERBS_DECLARE_EX_CMD(create_qp); IB_UVERBS_DECLARE_EX_CMD(create_wq); IB_UVERBS_DECLARE_EX_CMD(modify_wq); IB_UVERBS_DECLARE_EX_CMD(destroy_wq); +IB_UVERBS_DECLARE_EX_CMD(create_rwq_ind_table); +IB_UVERBS_DECLARE_EX_CMD(destroy_rwq_ind_table); #endif /* UVERBS_H */ diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 22e617391657..327a56cccc27 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -58,6 +58,7 @@ static struct uverbs_lock_class srq_lock_class = { .name = "SRQ-uobj" }; static struct uverbs_lock_class xrcd_lock_class = { .name = "XRCD-uobj" }; static struct uverbs_lock_class rule_lock_class = { .name = "RULE-uobj" }; static struct uverbs_lock_class wq_lock_class = { .name = "WQ-uobj" }; +static struct uverbs_lock_class rwq_ind_table_lock_class = { .name = "IND_TBL-uobj" }; /* * The ib_uobject locking scheme is as follows: @@ -338,6 +339,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, INIT_LIST_HEAD(&ucontext->srq_list); INIT_LIST_HEAD(&ucontext->ah_list); INIT_LIST_HEAD(&ucontext->wq_list); + INIT_LIST_HEAD(&ucontext->rwq_ind_tbl_list); INIT_LIST_HEAD(&ucontext->xrcd_list); INIT_LIST_HEAD(&ucontext->rule_list); rcu_read_lock(); @@ -3299,6 +3301,214 @@ int ib_uverbs_ex_modify_wq(struct ib_uverbs_file *file, return ret; } +int ib_uverbs_ex_create_rwq_ind_table(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ + struct ib_uverbs_ex_create_rwq_ind_table cmd = {}; + struct ib_uverbs_ex_create_rwq_ind_table_resp resp = {}; + struct ib_uobject *uobj; + int err = 0; + struct ib_rwq_ind_table_init_attr init_attr = {}; + struct ib_rwq_ind_table *rwq_ind_tbl; + struct ib_wq **wqs = NULL; + u32 *wqs_handles = NULL; + struct ib_wq *wq = NULL; + int i, j, num_read_wqs; + u32 num_wq_handles; + u32 expected_in_size; + size_t required_cmd_sz_header; + size_t required_resp_len; + + required_cmd_sz_header = offsetof(typeof(cmd), log_ind_tbl_size) + sizeof(cmd.log_ind_tbl_size); + required_resp_len = offsetof(typeof(resp), ind_tbl_num) + sizeof(resp.ind_tbl_num); + + if (ucore->inlen < required_cmd_sz_header) + return -EINVAL; + + if (ucore->outlen < required_resp_len) + return -ENOSPC; + + err = ib_copy_from_udata(&cmd, ucore, required_cmd_sz_header); + if (err) + return err; + + ucore->inbuf += required_cmd_sz_header; + ucore->inlen -= required_cmd_sz_header; + + if (cmd.comp_mask) + return -EOPNOTSUPP; + + if (cmd.log_ind_tbl_size > IB_USER_VERBS_MAX_LOG_IND_TBL_SIZE) + return -EINVAL; + + num_wq_handles = 1 << cmd.log_ind_tbl_size; + expected_in_size = num_wq_handles * sizeof(__u32); + if (num_wq_handles == 1) + /* input size for wq handles is u64 aligned */ + expected_in_size += sizeof(__u32); + + if (ucore->inlen < expected_in_size) + return -EINVAL; + + if (ucore->inlen > expected_in_size && + !ib_is_udata_cleared(ucore, expected_in_size, + ucore->inlen - expected_in_size)) + return -EOPNOTSUPP; + + wqs_handles = kcalloc(num_wq_handles, sizeof(*wqs_handles), + GFP_KERNEL); + if (!wqs_handles) + return -ENOMEM; + + err = ib_copy_from_udata(wqs_handles, ucore, + num_wq_handles * sizeof(__u32)); + if (err) + goto err_free; + + wqs = kcalloc(num_wq_handles, sizeof(*wqs), GFP_KERNEL); + if (!wqs) { + err = -ENOMEM; + goto err_free; + } + + for (num_read_wqs = 0; num_read_wqs < num_wq_handles; + num_read_wqs++) { + wq = idr_read_wq(wqs_handles[num_read_wqs], file->ucontext); + if (!wq) { + err = -EINVAL; + goto put_wqs; + } + + wqs[num_read_wqs] = wq; + } + + uobj = kmalloc(sizeof(*uobj), GFP_KERNEL); + if (!uobj) { + err = -ENOMEM; + goto put_wqs; + } + + init_uobj(uobj, 0, file->ucontext, &rwq_ind_table_lock_class); + down_write(&uobj->mutex); + init_attr.log_ind_tbl_size = cmd.log_ind_tbl_size; + init_attr.ind_tbl = wqs; + rwq_ind_tbl = ib_dev->create_rwq_ind_table(ib_dev, &init_attr, uhw); + + if (IS_ERR(rwq_ind_tbl)) { + err = PTR_ERR(rwq_ind_tbl); + goto err_uobj; + } + + rwq_ind_tbl->ind_tbl = wqs; + rwq_ind_tbl->log_ind_tbl_size = init_attr.log_ind_tbl_size; + rwq_ind_tbl->uobject = uobj; + uobj->object = rwq_ind_tbl; + rwq_ind_tbl->device = ib_dev; + atomic_set(&rwq_ind_tbl->usecnt, 0); + + for (i = 0; i < num_wq_handles; i++) + atomic_inc(&wqs[i]->usecnt); + + err = idr_add_uobj(&ib_uverbs_rwq_ind_tbl_idr, uobj); + if (err) + goto destroy_ind_tbl; + + resp.ind_tbl_handle = uobj->id; + resp.ind_tbl_num = rwq_ind_tbl->ind_tbl_num; + resp.response_length = required_resp_len; + + err = ib_copy_to_udata(ucore, + &resp, resp.response_length); + if (err) + goto err_copy; + + kfree(wqs_handles); + + for (j = 0; j < num_read_wqs; j++) + put_wq_read(wqs[j]); + + mutex_lock(&file->mutex); + list_add_tail(&uobj->list, &file->ucontext->rwq_ind_tbl_list); + mutex_unlock(&file->mutex); + + uobj->live = 1; + + up_write(&uobj->mutex); + return 0; + +err_copy: + idr_remove_uobj(&ib_uverbs_rwq_ind_tbl_idr, uobj); +destroy_ind_tbl: + ib_destroy_rwq_ind_table(rwq_ind_tbl); +err_uobj: + put_uobj_write(uobj); +put_wqs: + for (j = 0; j < num_read_wqs; j++) + put_wq_read(wqs[j]); +err_free: + kfree(wqs_handles); + kfree(wqs); + return err; +} + +int ib_uverbs_ex_destroy_rwq_ind_table(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ + struct ib_uverbs_ex_destroy_rwq_ind_table cmd = {}; + struct ib_rwq_ind_table *rwq_ind_tbl; + struct ib_uobject *uobj; + int ret; + struct ib_wq **ind_tbl; + size_t required_cmd_sz; + + required_cmd_sz = offsetof(typeof(cmd), ind_tbl_handle) + sizeof(cmd.ind_tbl_handle); + + if (ucore->inlen < required_cmd_sz) + return -EINVAL; + + if (ucore->inlen > sizeof(cmd) && + !ib_is_udata_cleared(ucore, sizeof(cmd), + ucore->inlen - sizeof(cmd))) + return -EOPNOTSUPP; + + ret = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen)); + if (ret) + return ret; + + if (cmd.comp_mask) + return -EOPNOTSUPP; + + uobj = idr_write_uobj(&ib_uverbs_rwq_ind_tbl_idr, cmd.ind_tbl_handle, + file->ucontext); + if (!uobj) + return -EINVAL; + rwq_ind_tbl = uobj->object; + ind_tbl = rwq_ind_tbl->ind_tbl; + + ret = ib_destroy_rwq_ind_table(rwq_ind_tbl); + if (!ret) + uobj->live = 0; + + put_uobj_write(uobj); + + if (ret) + return ret; + + idr_remove_uobj(&ib_uverbs_rwq_ind_tbl_idr, uobj); + + mutex_lock(&file->mutex); + list_del(&uobj->list); + mutex_unlock(&file->mutex); + + put_uobj(uobj); + kfree(ind_tbl); + return ret; +} + int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, struct ib_device *ib_dev, struct ib_udata *ucore, diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 91cb36f66ea7..426e0ac203fc 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -77,6 +77,7 @@ DEFINE_IDR(ib_uverbs_srq_idr); DEFINE_IDR(ib_uverbs_xrcd_idr); DEFINE_IDR(ib_uverbs_rule_idr); DEFINE_IDR(ib_uverbs_wq_idr); +DEFINE_IDR(ib_uverbs_rwq_ind_tbl_idr); static DEFINE_SPINLOCK(map_lock); static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES); @@ -134,6 +135,8 @@ static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file, [IB_USER_VERBS_EX_CMD_CREATE_WQ] = ib_uverbs_ex_create_wq, [IB_USER_VERBS_EX_CMD_MODIFY_WQ] = ib_uverbs_ex_modify_wq, [IB_USER_VERBS_EX_CMD_DESTROY_WQ] = ib_uverbs_ex_destroy_wq, + [IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL] = ib_uverbs_ex_create_rwq_ind_table, + [IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL] = ib_uverbs_ex_destroy_rwq_ind_table, }; static void ib_uverbs_add_one(struct ib_device *device); @@ -269,6 +272,16 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, kfree(uqp); } + list_for_each_entry_safe(uobj, tmp, &context->rwq_ind_tbl_list, list) { + struct ib_rwq_ind_table *rwq_ind_tbl = uobj->object; + struct ib_wq **ind_tbl = rwq_ind_tbl->ind_tbl; + + idr_remove_uobj(&ib_uverbs_rwq_ind_tbl_idr, uobj); + ib_destroy_rwq_ind_table(rwq_ind_tbl); + kfree(ind_tbl); + kfree(uobj); + } + list_for_each_entry_safe(uobj, tmp, &context->wq_list, list) { struct ib_wq *wq = uobj->object; struct ib_uwq_object *uwq = diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index fa2e0184dcc5..e305c9a36663 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1326,6 +1326,7 @@ struct ib_ucontext { struct list_head xrcd_list; struct list_head rule_list; struct list_head wq_list; + struct list_head rwq_ind_tbl_list; int closing; struct pid *tgid; diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index c9470e511e03..2cf7c95860d8 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -98,6 +98,8 @@ enum { IB_USER_VERBS_EX_CMD_CREATE_WQ, IB_USER_VERBS_EX_CMD_MODIFY_WQ, IB_USER_VERBS_EX_CMD_DESTROY_WQ, + IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL, + IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL }; /* @@ -987,4 +989,28 @@ struct ib_uverbs_ex_modify_wq { __u32 curr_wq_state; }; +/* Prevent memory allocation rather than max expected size */ +#define IB_USER_VERBS_MAX_LOG_IND_TBL_SIZE 0x0d +struct ib_uverbs_ex_create_rwq_ind_table { + __u32 comp_mask; + __u32 log_ind_tbl_size; + /* Following are the wq handles according to log_ind_tbl_size + * wq_handle1 + * wq_handle2 + */ + __u32 wq_handles[0]; +}; + +struct ib_uverbs_ex_create_rwq_ind_table_resp { + __u32 comp_mask; + __u32 response_length; + __u32 ind_tbl_handle; + __u32 ind_tbl_num; +}; + +struct ib_uverbs_ex_destroy_rwq_ind_table { + __u32 comp_mask; + __u32 ind_tbl_handle; +}; + #endif /* IB_USER_VERBS_H */ -- cgit v1.2.3-71-gd317 From c70285f880e88cb4f73effb722065a182ba5936f Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Mon, 23 May 2016 15:20:55 +0300 Subject: IB/uverbs: Extend create QP to get RWQ indirection table User applications that want to spread incoming traffic between several WQs should create a QP which contains an indirection table. When such a QP is created other receive side parameters are not valid and should not be given. Its send side is optional and assumed active based on max_send_wr capability value. Extend create QP to work accordingly. Signed-off-by: Yishai Hadas Signed-off-by: Matan Barak Reviewed-by: Sagi Grimberg Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs_cmd.c | 75 ++++++++++++++++++++++++++++++------ include/uapi/rdma/ib_user_verbs.h | 10 +++++ 2 files changed, 73 insertions(+), 12 deletions(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 327a56cccc27..65ab2097cd81 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -255,6 +255,17 @@ static void put_wq_read(struct ib_wq *wq) put_uobj_read(wq->uobject); } +static struct ib_rwq_ind_table *idr_read_rwq_indirection_table(int ind_table_handle, + struct ib_ucontext *context) +{ + return idr_read_obj(&ib_uverbs_rwq_ind_tbl_idr, ind_table_handle, context, 0); +} + +static void put_rwq_indirection_table_read(struct ib_rwq_ind_table *ind_table) +{ + put_uobj_read(ind_table->uobject); +} + static struct ib_qp *idr_write_qp(int qp_handle, struct ib_ucontext *context) { struct ib_uobject *uobj; @@ -1761,9 +1772,11 @@ static int create_qp(struct ib_uverbs_file *file, struct ib_srq *srq = NULL; struct ib_qp *qp; char *buf; - struct ib_qp_init_attr attr; + struct ib_qp_init_attr attr = {}; struct ib_uverbs_ex_create_qp_resp resp; int ret; + struct ib_rwq_ind_table *ind_tbl = NULL; + bool has_sq = true; if (cmd->qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW)) return -EPERM; @@ -1775,6 +1788,32 @@ static int create_qp(struct ib_uverbs_file *file, init_uobj(&obj->uevent.uobject, cmd->user_handle, file->ucontext, &qp_lock_class); down_write(&obj->uevent.uobject.mutex); + if (cmd_sz >= offsetof(typeof(*cmd), rwq_ind_tbl_handle) + + sizeof(cmd->rwq_ind_tbl_handle) && + (cmd->comp_mask & IB_UVERBS_CREATE_QP_MASK_IND_TABLE)) { + ind_tbl = idr_read_rwq_indirection_table(cmd->rwq_ind_tbl_handle, + file->ucontext); + if (!ind_tbl) { + ret = -EINVAL; + goto err_put; + } + + attr.rwq_ind_tbl = ind_tbl; + } + + if ((cmd_sz >= offsetof(typeof(*cmd), reserved1) + + sizeof(cmd->reserved1)) && cmd->reserved1) { + ret = -EOPNOTSUPP; + goto err_put; + } + + if (ind_tbl && (cmd->max_recv_wr || cmd->max_recv_sge || cmd->is_srq)) { + ret = -EINVAL; + goto err_put; + } + + if (ind_tbl && !cmd->max_send_wr) + has_sq = false; if (cmd->qp_type == IB_QPT_XRC_TGT) { xrcd = idr_read_xrcd(cmd->pd_handle, file->ucontext, @@ -1798,20 +1837,24 @@ static int create_qp(struct ib_uverbs_file *file, } } - if (cmd->recv_cq_handle != cmd->send_cq_handle) { - rcq = idr_read_cq(cmd->recv_cq_handle, - file->ucontext, 0); - if (!rcq) { - ret = -EINVAL; - goto err_put; + if (!ind_tbl) { + if (cmd->recv_cq_handle != cmd->send_cq_handle) { + rcq = idr_read_cq(cmd->recv_cq_handle, + file->ucontext, 0); + if (!rcq) { + ret = -EINVAL; + goto err_put; + } } } } - scq = idr_read_cq(cmd->send_cq_handle, file->ucontext, !!rcq); - rcq = rcq ?: scq; + if (has_sq) + scq = idr_read_cq(cmd->send_cq_handle, file->ucontext, !!rcq); + if (!ind_tbl) + rcq = rcq ?: scq; pd = idr_read_pd(cmd->pd_handle, file->ucontext); - if (!pd || !scq) { + if (!pd || (!scq && has_sq)) { ret = -EINVAL; goto err_put; } @@ -1878,16 +1921,20 @@ static int create_qp(struct ib_uverbs_file *file, qp->send_cq = attr.send_cq; qp->recv_cq = attr.recv_cq; qp->srq = attr.srq; + qp->rwq_ind_tbl = ind_tbl; qp->event_handler = attr.event_handler; qp->qp_context = attr.qp_context; qp->qp_type = attr.qp_type; atomic_set(&qp->usecnt, 0); atomic_inc(&pd->usecnt); - atomic_inc(&attr.send_cq->usecnt); + if (attr.send_cq) + atomic_inc(&attr.send_cq->usecnt); if (attr.recv_cq) atomic_inc(&attr.recv_cq->usecnt); if (attr.srq) atomic_inc(&attr.srq->usecnt); + if (ind_tbl) + atomic_inc(&ind_tbl->usecnt); } qp->uobject = &obj->uevent.uobject; @@ -1927,6 +1974,8 @@ static int create_qp(struct ib_uverbs_file *file, put_cq_read(rcq); if (srq) put_srq_read(srq); + if (ind_tbl) + put_rwq_indirection_table_read(ind_tbl); mutex_lock(&file->mutex); list_add_tail(&obj->uevent.uobject.list, &file->ucontext->qp_list); @@ -1954,6 +2003,8 @@ err_put: put_cq_read(rcq); if (srq) put_srq_read(srq); + if (ind_tbl) + put_rwq_indirection_table_read(ind_tbl); put_uobj_write(&obj->uevent.uobject); return ret; @@ -2047,7 +2098,7 @@ int ib_uverbs_ex_create_qp(struct ib_uverbs_file *file, if (err) return err; - if (cmd.comp_mask) + if (cmd.comp_mask & ~IB_UVERBS_CREATE_QP_SUP_COMP_MASK) return -EINVAL; if (cmd.reserved) diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index 2cf7c95860d8..2c8bca8cd2c2 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -523,6 +523,14 @@ struct ib_uverbs_create_qp { __u64 driver_data[0]; }; +enum ib_uverbs_create_qp_mask { + IB_UVERBS_CREATE_QP_MASK_IND_TABLE = 1UL << 0, +}; + +enum { + IB_UVERBS_CREATE_QP_SUP_COMP_MASK = IB_UVERBS_CREATE_QP_MASK_IND_TABLE, +}; + struct ib_uverbs_ex_create_qp { __u64 user_handle; __u32 pd_handle; @@ -540,6 +548,8 @@ struct ib_uverbs_ex_create_qp { __u8 reserved; __u32 comp_mask; __u32 create_flags; + __u32 rwq_ind_tbl_handle; + __u32 reserved1; }; struct ib_uverbs_open_qp { -- cgit v1.2.3-71-gd317 From 4c2aae712cb024f9d30a1fa62e3ba2ff785c6a3e Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Fri, 17 Jun 2016 15:14:50 +0300 Subject: IB/core: Add IPv6 support to flow steering Add IPv6 flow specification support. Signed-off-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs.h | 1 + drivers/infiniband/core/uverbs_cmd.c | 9 +++++++++ include/rdma/ib_verbs.h | 14 ++++++++++++++ include/uapi/rdma/ib_user_verbs.h | 18 ++++++++++++++++++ 4 files changed, 42 insertions(+) (limited to 'include/uapi') diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 6c2292397cd6..b7f3b8d359b9 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -226,6 +226,7 @@ struct ib_uverbs_flow_spec { struct ib_uverbs_flow_spec_eth eth; struct ib_uverbs_flow_spec_ipv4 ipv4; struct ib_uverbs_flow_spec_tcp_udp tcp_udp; + struct ib_uverbs_flow_spec_ipv6 ipv6; }; }; diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 65ab2097cd81..f6647318138d 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -3105,6 +3105,15 @@ static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec, memcpy(&ib_spec->ipv4.mask, &kern_spec->ipv4.mask, sizeof(struct ib_flow_ipv4_filter)); break; + case IB_FLOW_SPEC_IPV6: + ib_spec->ipv6.size = sizeof(struct ib_flow_spec_ipv6); + if (ib_spec->ipv6.size != kern_spec->ipv6.size) + return -EINVAL; + memcpy(&ib_spec->ipv6.val, &kern_spec->ipv6.val, + sizeof(struct ib_flow_ipv6_filter)); + memcpy(&ib_spec->ipv6.mask, &kern_spec->ipv6.mask, + sizeof(struct ib_flow_ipv6_filter)); + break; case IB_FLOW_SPEC_TCP: case IB_FLOW_SPEC_UDP: ib_spec->tcp_udp.size = sizeof(struct ib_flow_spec_tcp_udp); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 9b2fafe5eb38..9bbca6887f7f 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1569,6 +1569,7 @@ enum ib_flow_spec_type { IB_FLOW_SPEC_IB = 0x22, /* L3 header*/ IB_FLOW_SPEC_IPV4 = 0x30, + IB_FLOW_SPEC_IPV6 = 0x31, /* L4 headers*/ IB_FLOW_SPEC_TCP = 0x40, IB_FLOW_SPEC_UDP = 0x41 @@ -1630,6 +1631,18 @@ struct ib_flow_spec_ipv4 { struct ib_flow_ipv4_filter mask; }; +struct ib_flow_ipv6_filter { + u8 src_ip[16]; + u8 dst_ip[16]; +}; + +struct ib_flow_spec_ipv6 { + enum ib_flow_spec_type type; + u16 size; + struct ib_flow_ipv6_filter val; + struct ib_flow_ipv6_filter mask; +}; + struct ib_flow_tcp_udp_filter { __be16 dst_port; __be16 src_port; @@ -1651,6 +1664,7 @@ union ib_flow_spec { struct ib_flow_spec_ib ib; struct ib_flow_spec_ipv4 ipv4; struct ib_flow_spec_tcp_udp tcp_udp; + struct ib_flow_spec_ipv6 ipv6; }; struct ib_flow_attr { diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index 2c8bca8cd2c2..7f035f4b53b0 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -867,6 +867,24 @@ struct ib_uverbs_flow_spec_tcp_udp { struct ib_uverbs_flow_tcp_udp_filter mask; }; +struct ib_uverbs_flow_ipv6_filter { + __u8 src_ip[16]; + __u8 dst_ip[16]; +}; + +struct ib_uverbs_flow_spec_ipv6 { + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; + struct ib_uverbs_flow_ipv6_filter val; + struct ib_uverbs_flow_ipv6_filter mask; +}; + struct ib_uverbs_flow_attr { __u32 type; __u16 size; -- cgit v1.2.3-71-gd317 From b810253bd9342f863a86ec7dfff4a5a7a0394d2f Mon Sep 17 00:00:00 2001 From: Philippe Bergheaud Date: Thu, 23 Jun 2016 15:03:53 +0200 Subject: cxl: Add mechanism for delivering AFU driver specific events This adds an afu_driver_ops structure with fetch_event() and event_delivered() callbacks. An AFU driver such as cxlflash can fill this out and associate it with a context to enable passing custom AFU specific events to userspace. This also adds a new kernel API function cxl_context_pending_events(), that the AFU driver can use to notify the cxl driver that new specific events are ready to be delivered, and wake up anyone waiting on the context wait queue. The current count of AFU driver specific events is stored in the field afu_driver_events of the context structure. The cxl driver checks the afu_driver_events count during poll, select, read, etc. calls to check if an AFU driver specific event is pending, and calls fetch_event() to obtain and deliver that event. This way, the cxl driver takes care of all the usual locking semantics around these calls and handles all the generic cxl events, so that the AFU driver only needs to worry about it's own events. fetch_event() return a struct cxl_event_afu_driver_reserved, allocated by the AFU driver, and filled in with the specific event information and size. Total event size (header + data) should not be greater than CXL_READ_MIN_SIZE (4K). Th cxl driver prepends an appropriate cxl event header, copies the event to userspace, and finally calls event_delivered() to return the status of the operation to the AFU driver. The event is identified by the context and cxl_event_afu_driver_reserved pointers. Since AFU drivers provide their own means for userspace to obtain the AFU file descriptor (i.e. cxlflash uses an ioctl on their scsi file descriptor to obtain the AFU file descriptor) and the generic cxl driver will never use this event, the ABI of the event is up to each individual AFU driver. Signed-off-by: Philippe Bergheaud Signed-off-by: Michael Ellerman --- drivers/misc/cxl/Kconfig | 5 ++++ drivers/misc/cxl/api.c | 17 +++++++++++++ drivers/misc/cxl/cxl.h | 7 +++++- drivers/misc/cxl/file.c | 64 ++++++++++++++++++++++++++++++++++++++++++------ include/misc/cxl.h | 48 ++++++++++++++++++++++++++++++++++++ include/uapi/misc/cxl.h | 17 +++++++++++++ 6 files changed, 149 insertions(+), 9 deletions(-) (limited to 'include/uapi') diff --git a/drivers/misc/cxl/Kconfig b/drivers/misc/cxl/Kconfig index 8756d06e2bb8..560412cd4c98 100644 --- a/drivers/misc/cxl/Kconfig +++ b/drivers/misc/cxl/Kconfig @@ -15,12 +15,17 @@ config CXL_EEH bool default n +config CXL_AFU_DRIVER_OPS + bool + default n + config CXL tristate "Support for IBM Coherent Accelerators (CXL)" depends on PPC_POWERNV && PCI_MSI && EEH select CXL_BASE select CXL_KERNEL_API select CXL_EEH + select CXL_AFU_DRIVER_OPS default m help Select this option to enable driver support for IBM Coherent diff --git a/drivers/misc/cxl/api.c b/drivers/misc/cxl/api.c index 99081b821f12..f11dc0e5fdd6 100644 --- a/drivers/misc/cxl/api.c +++ b/drivers/misc/cxl/api.c @@ -333,6 +333,23 @@ struct cxl_context *cxl_fops_get_context(struct file *file) } EXPORT_SYMBOL_GPL(cxl_fops_get_context); +void cxl_set_driver_ops(struct cxl_context *ctx, + struct cxl_afu_driver_ops *ops) +{ + WARN_ON(!ops->fetch_event || !ops->event_delivered); + atomic_set(&ctx->afu_driver_events, 0); + ctx->afu_driver_ops = ops; +} +EXPORT_SYMBOL_GPL(cxl_set_driver_ops); + +void cxl_context_events_pending(struct cxl_context *ctx, + unsigned int new_events) +{ + atomic_add(new_events, &ctx->afu_driver_events); + wake_up_all(&ctx->wq); +} +EXPORT_SYMBOL_GPL(cxl_context_events_pending); + int cxl_start_work(struct cxl_context *ctx, struct cxl_ioctl_start_work *work) { diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h index ce2b9d513069..422ee53868a8 100644 --- a/drivers/misc/cxl/cxl.h +++ b/drivers/misc/cxl/cxl.h @@ -24,6 +24,7 @@ #include #include +#include #include extern uint cxl_verbose; @@ -34,7 +35,7 @@ extern uint cxl_verbose; * Bump version each time a user API change is made, whether it is * backwards compatible ot not. */ -#define CXL_API_VERSION 2 +#define CXL_API_VERSION 3 #define CXL_API_VERSION_COMPATIBLE 1 /* @@ -528,6 +529,10 @@ struct cxl_context { bool pending_fault; bool pending_afu_err; + /* Used by AFU drivers for driver specific event delivery */ + struct cxl_afu_driver_ops *afu_driver_ops; + atomic_t afu_driver_events; + struct rcu_head rcu; }; diff --git a/drivers/misc/cxl/file.c b/drivers/misc/cxl/file.c index eec468f1612f..5fb9894b157f 100644 --- a/drivers/misc/cxl/file.c +++ b/drivers/misc/cxl/file.c @@ -293,6 +293,17 @@ int afu_mmap(struct file *file, struct vm_area_struct *vm) return cxl_context_iomap(ctx, vm); } +static inline bool ctx_event_pending(struct cxl_context *ctx) +{ + if (ctx->pending_irq || ctx->pending_fault || ctx->pending_afu_err) + return true; + + if (ctx->afu_driver_ops && atomic_read(&ctx->afu_driver_events)) + return true; + + return false; +} + unsigned int afu_poll(struct file *file, struct poll_table_struct *poll) { struct cxl_context *ctx = file->private_data; @@ -305,8 +316,7 @@ unsigned int afu_poll(struct file *file, struct poll_table_struct *poll) pr_devel("afu_poll wait done pe: %i\n", ctx->pe); spin_lock_irqsave(&ctx->lock, flags); - if (ctx->pending_irq || ctx->pending_fault || - ctx->pending_afu_err) + if (ctx_event_pending(ctx)) mask |= POLLIN | POLLRDNORM; else if (ctx->status == CLOSED) /* Only error on closed when there are no futher events pending @@ -319,16 +329,46 @@ unsigned int afu_poll(struct file *file, struct poll_table_struct *poll) return mask; } -static inline int ctx_event_pending(struct cxl_context *ctx) +static ssize_t afu_driver_event_copy(struct cxl_context *ctx, + char __user *buf, + struct cxl_event *event, + struct cxl_event_afu_driver_reserved *pl) { - return (ctx->pending_irq || ctx->pending_fault || - ctx->pending_afu_err || (ctx->status == CLOSED)); + /* Check event */ + if (!pl) { + ctx->afu_driver_ops->event_delivered(ctx, pl, -EINVAL); + return -EFAULT; + } + + /* Check event size */ + event->header.size += pl->data_size; + if (event->header.size > CXL_READ_MIN_SIZE) { + ctx->afu_driver_ops->event_delivered(ctx, pl, -EINVAL); + return -EFAULT; + } + + /* Copy event header */ + if (copy_to_user(buf, event, sizeof(struct cxl_event_header))) { + ctx->afu_driver_ops->event_delivered(ctx, pl, -EFAULT); + return -EFAULT; + } + + /* Copy event data */ + buf += sizeof(struct cxl_event_header); + if (copy_to_user(buf, &pl->data, pl->data_size)) { + ctx->afu_driver_ops->event_delivered(ctx, pl, -EFAULT); + return -EFAULT; + } + + ctx->afu_driver_ops->event_delivered(ctx, pl, 0); /* Success */ + return event->header.size; } ssize_t afu_read(struct file *file, char __user *buf, size_t count, loff_t *off) { struct cxl_context *ctx = file->private_data; + struct cxl_event_afu_driver_reserved *pl = NULL; struct cxl_event event; unsigned long flags; int rc; @@ -344,7 +384,7 @@ ssize_t afu_read(struct file *file, char __user *buf, size_t count, for (;;) { prepare_to_wait(&ctx->wq, &wait, TASK_INTERRUPTIBLE); - if (ctx_event_pending(ctx)) + if (ctx_event_pending(ctx) || (ctx->status == CLOSED)) break; if (!cxl_ops->link_ok(ctx->afu->adapter, ctx->afu)) { @@ -374,7 +414,12 @@ ssize_t afu_read(struct file *file, char __user *buf, size_t count, memset(&event, 0, sizeof(event)); event.header.process_element = ctx->pe; event.header.size = sizeof(struct cxl_event_header); - if (ctx->pending_irq) { + if (ctx->afu_driver_ops && atomic_read(&ctx->afu_driver_events)) { + pr_devel("afu_read delivering AFU driver specific event\n"); + pl = ctx->afu_driver_ops->fetch_event(ctx); + atomic_dec(&ctx->afu_driver_events); + event.header.type = CXL_EVENT_AFU_DRIVER; + } else if (ctx->pending_irq) { pr_devel("afu_read delivering AFU interrupt\n"); event.header.size += sizeof(struct cxl_event_afu_interrupt); event.header.type = CXL_EVENT_AFU_INTERRUPT; @@ -404,6 +449,9 @@ ssize_t afu_read(struct file *file, char __user *buf, size_t count, spin_unlock_irqrestore(&ctx->lock, flags); + if (event.header.type == CXL_EVENT_AFU_DRIVER) + return afu_driver_event_copy(ctx, buf, &event, pl); + if (copy_to_user(buf, &event, event.header.size)) return -EFAULT; return event.header.size; @@ -558,7 +606,7 @@ int __init cxl_file_init(void) * If these change we really need to update API. Either change some * flags or update API version number CXL_API_VERSION. */ - BUILD_BUG_ON(CXL_API_VERSION != 2); + BUILD_BUG_ON(CXL_API_VERSION != 3); BUILD_BUG_ON(sizeof(struct cxl_ioctl_start_work) != 64); BUILD_BUG_ON(sizeof(struct cxl_event_header) != 8); BUILD_BUG_ON(sizeof(struct cxl_event_afu_interrupt) != 8); diff --git a/include/misc/cxl.h b/include/misc/cxl.h index 56560c5781b4..17419f61e611 100644 --- a/include/misc/cxl.h +++ b/include/misc/cxl.h @@ -220,4 +220,52 @@ void cxl_perst_reloads_same_image(struct cxl_afu *afu, */ ssize_t cxl_read_adapter_vpd(struct pci_dev *dev, void *buf, size_t count); +/* + * AFU driver ops allow an AFU driver to create their own events to pass to + * userspace through the file descriptor as a simpler alternative to overriding + * the read() and poll() calls that works with the generic cxl events. These + * events are given priority over the generic cxl events, so they will be + * delivered first if multiple types of events are pending. + * + * The AFU driver must call cxl_context_events_pending() to notify the cxl + * driver that new events are ready to be delivered for a specific context. + * cxl_context_events_pending() will adjust the current count of AFU driver + * events for this context, and wake up anyone waiting on the context wait + * queue. + * + * The cxl driver will then call fetch_event() to get a structure defining + * the size and address of the driver specific event data. The cxl driver + * will build a cxl header with type and process_element fields filled in, + * and header.size set to sizeof(struct cxl_event_header) + data_size. + * The total size of the event is limited to CXL_READ_MIN_SIZE (4K). + * + * fetch_event() is called with a spin lock held, so it must not sleep. + * + * The cxl driver will then deliver the event to userspace, and finally + * call event_delivered() to return the status of the operation, identified + * by cxl context and AFU driver event data pointers. + * 0 Success + * -EFAULT copy_to_user() has failed + * -EINVAL Event data pointer is NULL, or event size is greater than + * CXL_READ_MIN_SIZE. + */ +struct cxl_afu_driver_ops { + struct cxl_event_afu_driver_reserved *(*fetch_event) ( + struct cxl_context *ctx); + void (*event_delivered) (struct cxl_context *ctx, + struct cxl_event_afu_driver_reserved *event, + int rc); +}; + +/* + * Associate the above driver ops with a specific context. + * Reset the current count of AFU driver events. + */ +void cxl_set_driver_ops(struct cxl_context *ctx, + struct cxl_afu_driver_ops *ops); + +/* Notify cxl driver that new events are ready to be delivered for context */ +void cxl_context_events_pending(struct cxl_context *ctx, + unsigned int new_events); + #endif /* _MISC_CXL_H */ diff --git a/include/uapi/misc/cxl.h b/include/uapi/misc/cxl.h index 8cd334f99ddc..cbae529b7ce0 100644 --- a/include/uapi/misc/cxl.h +++ b/include/uapi/misc/cxl.h @@ -93,6 +93,7 @@ enum cxl_event_type { CXL_EVENT_AFU_INTERRUPT = 1, CXL_EVENT_DATA_STORAGE = 2, CXL_EVENT_AFU_ERROR = 3, + CXL_EVENT_AFU_DRIVER = 4, }; struct cxl_event_header { @@ -124,12 +125,28 @@ struct cxl_event_afu_error { __u64 error; }; +struct cxl_event_afu_driver_reserved { + /* + * Defines the buffer passed to the cxl driver by the AFU driver. + * + * This is not ABI since the event header.size passed to the user for + * existing events is set in the read call to sizeof(cxl_event_header) + * + sizeof(whatever event is being dispatched) and the user is already + * required to use a 4K buffer on the read call. + * + * Of course the contents will be ABI, but that's up the AFU driver. + */ + size_t data_size; + u8 data[]; +}; + struct cxl_event { struct cxl_event_header header; union { struct cxl_event_afu_interrupt irq; struct cxl_event_data_storage fault; struct cxl_event_afu_error afu_error; + struct cxl_event_afu_driver_reserved afu_driver_event; }; }; -- cgit v1.2.3-71-gd317 From 37f501afed23fa1126017255495d5be5e97c9d6d Mon Sep 17 00:00:00 2001 From: "arun.siluvery@linux.intel.com" Date: Fri, 1 Jul 2016 11:43:02 +0100 Subject: drm/i915/bxt: Export pooled eu info to userspace Pooled EU is a bxt only feature and kernel changes are already merged. This feature is not yet exposed to userspace as the support was not yet available. Beignet team expressed interest and added patches to use this. Since we now have a user and patches to use them, expose them from the kernel side as well. v2: fix compile error [1] https://lists.freedesktop.org/archives/beignet/2016-June/007698.html [2] https://lists.freedesktop.org/archives/beignet/2016-June/007699.html Cc: Winiarski, Michal Cc: Zou, Nanhai Cc: Yang, Rong R Cc: Tim Gore Cc: Jeff McGee Signed-off-by: Arun Siluvery Acked-by: Chris Wilson Signed-off-by: Tvrtko Ursulin Link: http://patchwork.freedesktop.org/patch/msgid/1467369782-25992-1-git-send-email-arun.siluvery@linux.intel.com Acked-by: Jani Nikula --- drivers/gpu/drm/i915/i915_drv.c | 6 ++++++ include/uapi/drm/i915_drm.h | 2 ++ 2 files changed, 8 insertions(+) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c index c580e24095b0..8a2674013aef 100644 --- a/drivers/gpu/drm/i915/i915_drv.c +++ b/drivers/gpu/drm/i915/i915_drv.c @@ -365,6 +365,12 @@ static int i915_getparam(struct drm_device *dev, void *data, case I915_PARAM_HAS_EXEC_SOFTPIN: value = 1; break; + case I915_PARAM_HAS_POOLED_EU: + value = HAS_POOLED_EU(dev); + break; + case I915_PARAM_MIN_EU_IN_POOL: + value = INTEL_INFO(dev)->min_eu_in_pool; + break; default: DRM_DEBUG("Unknown parameter %d\n", param->param); return -EINVAL; diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index c17d63d8b543..a642bbc7777d 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -361,6 +361,8 @@ typedef struct drm_i915_irq_wait { #define I915_PARAM_HAS_GPU_RESET 35 #define I915_PARAM_HAS_RESOURCE_STREAMER 36 #define I915_PARAM_HAS_EXEC_SOFTPIN 37 +#define I915_PARAM_HAS_POOLED_EU 38 +#define I915_PARAM_MIN_EU_IN_POOL 39 typedef struct drm_i915_getparam { __s32 param; -- cgit v1.2.3-71-gd317 From bc3d674462e5df5f2b33adbfcaad9edff8b827f4 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 4 Jul 2016 08:08:39 +0100 Subject: drm/i915: Allow userspace to request no-error-capture upon GPU hangs igt likes to inject GPU hangs into its command streams. However, as we expect these hangs, we don't actually want them recorded in the dmesg output or stored in the i915_error_state (usually). To accommodate this allow userspace to set a flag on the context that any hang emanating from that context will not be recorded. We still do the error capture (otherwise how do we find the guilty context and know its intent?) as part of the reason for random GPU hang injection is to exercise the race conditions between the error capture and normal execution. v2: Split out the request->ringbuf error capture changes. v3: Move the flag defines next to the intel_context->flags definition Signed-off-by: Chris Wilson Acked-by: Daniel Vetter Reviewed-by: Dave Gordon Link: http://patchwork.freedesktop.org/patch/msgid/1467616119-4093-9-git-send-email-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_drv.h | 4 +++- drivers/gpu/drm/i915/i915_gem_context.c | 13 +++++++++++++ drivers/gpu/drm/i915/i915_gpu_error.c | 20 ++++++++++++-------- include/uapi/drm/i915_drm.h | 1 + 4 files changed, 29 insertions(+), 9 deletions(-) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 488891853cb5..251a08d8808d 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -475,6 +475,7 @@ struct drm_i915_error_state { struct timeval time; char error_msg[128]; + bool simulated; int iommu; u32 reset_count; u32 suspend_count; @@ -875,9 +876,10 @@ struct i915_gem_context { /* Unique identifier for this context, used by the hw for tracking */ unsigned long flags; +#define CONTEXT_NO_ZEROMAP BIT(0) +#define CONTEXT_NO_ERROR_CAPTURE BIT(1) unsigned hw_id; u32 user_handle; -#define CONTEXT_NO_ZEROMAP (1<<0) u32 ggtt_alignment; diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c index 3a6594b70900..8e952b1a31b3 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.c +++ b/drivers/gpu/drm/i915/i915_gem_context.c @@ -1026,6 +1026,9 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data, else args->value = to_i915(dev)->ggtt.base.total; break; + case I915_CONTEXT_PARAM_NO_ERROR_CAPTURE: + args->value = !!(ctx->flags & CONTEXT_NO_ERROR_CAPTURE); + break; default: ret = -EINVAL; break; @@ -1071,6 +1074,16 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data, ctx->flags |= args->value ? CONTEXT_NO_ZEROMAP : 0; } break; + case I915_CONTEXT_PARAM_NO_ERROR_CAPTURE: + if (args->size) { + ret = -EINVAL; + } else { + if (args->value) + ctx->flags |= CONTEXT_NO_ERROR_CAPTURE; + else + ctx->flags &= ~CONTEXT_NO_ERROR_CAPTURE; + } + break; default: ret = -EINVAL; break; diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 1be63590a7fe..c6e05cccbedf 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -1093,9 +1093,8 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv, struct i915_address_space *vm; struct intel_ringbuffer *rb; - vm = request->ctx && request->ctx->ppgtt ? - &request->ctx->ppgtt->base : - &ggtt->base; + vm = request->ctx->ppgtt ? + &request->ctx->ppgtt->base : &ggtt->base; /* We need to copy these to an anonymous buffer * as the simplest method to avoid being overwritten @@ -1123,6 +1122,9 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv, rcu_read_unlock(); } + error->simulated |= + request->ctx->flags & CONTEXT_NO_ERROR_CAPTURE; + rb = request->ringbuf; error->ring[i].cpu_ring_head = rb->head; error->ring[i].cpu_ring_tail = rb->tail; @@ -1422,12 +1424,14 @@ void i915_capture_error_state(struct drm_i915_private *dev_priv, i915_error_capture_msg(dev_priv, error, engine_mask, error_msg); DRM_INFO("%s\n", error->error_msg); - spin_lock_irqsave(&dev_priv->gpu_error.lock, flags); - if (dev_priv->gpu_error.first_error == NULL) { - dev_priv->gpu_error.first_error = error; - error = NULL; + if (!error->simulated) { + spin_lock_irqsave(&dev_priv->gpu_error.lock, flags); + if (!dev_priv->gpu_error.first_error) { + dev_priv->gpu_error.first_error = error; + error = NULL; + } + spin_unlock_irqrestore(&dev_priv->gpu_error.lock, flags); } - spin_unlock_irqrestore(&dev_priv->gpu_error.lock, flags); if (error) { i915_error_state_free(&error->ref); diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index a642bbc7777d..d7e81a3886fd 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -1173,6 +1173,7 @@ struct drm_i915_gem_context_param { #define I915_CONTEXT_PARAM_BAN_PERIOD 0x1 #define I915_CONTEXT_PARAM_NO_ZEROMAP 0x2 #define I915_CONTEXT_PARAM_GTT_SIZE 0x3 +#define I915_CONTEXT_PARAM_NO_ERROR_CAPTURE 0x4 __u64 value; }; -- cgit v1.2.3-71-gd317 From 000cab9a61ea9e8dc42144e39a6eb8333a402b86 Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Sun, 12 Jun 2016 15:44:44 +0800 Subject: drm/amdgpu: factor out the AMDGPU_INFO_FW_VERSION case branch into amdgpu_firmware_info MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The new amdgpu_firmware_info function will be used on amdgpu firmware version debugfs. Suggested-by: Christian König Signed-off-by: Huang Rui Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 116 ++++++++++++++++++-------------- include/uapi/drm/amdgpu_drm.h | 32 ++++----- 2 files changed, 81 insertions(+), 67 deletions(-) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c index d851ea15059f..56c857f6e7ca 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c @@ -142,6 +142,65 @@ out: return r; } +static int amdgpu_firmware_info(struct drm_amdgpu_info_firmware *fw_info, + struct drm_amdgpu_query_fw *query_fw, + struct amdgpu_device *adev) +{ + switch (query_fw->fw_type) { + case AMDGPU_INFO_FW_VCE: + fw_info->ver = adev->vce.fw_version; + fw_info->feature = adev->vce.fb_version; + break; + case AMDGPU_INFO_FW_UVD: + fw_info->ver = adev->uvd.fw_version; + fw_info->feature = 0; + break; + case AMDGPU_INFO_FW_GMC: + fw_info->ver = adev->mc.fw_version; + fw_info->feature = 0; + break; + case AMDGPU_INFO_FW_GFX_ME: + fw_info->ver = adev->gfx.me_fw_version; + fw_info->feature = adev->gfx.me_feature_version; + break; + case AMDGPU_INFO_FW_GFX_PFP: + fw_info->ver = adev->gfx.pfp_fw_version; + fw_info->feature = adev->gfx.pfp_feature_version; + break; + case AMDGPU_INFO_FW_GFX_CE: + fw_info->ver = adev->gfx.ce_fw_version; + fw_info->feature = adev->gfx.ce_feature_version; + break; + case AMDGPU_INFO_FW_GFX_RLC: + fw_info->ver = adev->gfx.rlc_fw_version; + fw_info->feature = adev->gfx.rlc_feature_version; + break; + case AMDGPU_INFO_FW_GFX_MEC: + if (query_fw->index == 0) { + fw_info->ver = adev->gfx.mec_fw_version; + fw_info->feature = adev->gfx.mec_feature_version; + } else if (query_fw->index == 1) { + fw_info->ver = adev->gfx.mec2_fw_version; + fw_info->feature = adev->gfx.mec2_feature_version; + } else + return -EINVAL; + break; + case AMDGPU_INFO_FW_SMC: + fw_info->ver = adev->pm.fw_version; + fw_info->feature = 0; + break; + case AMDGPU_INFO_FW_SDMA: + if (query_fw->index >= adev->sdma.num_instances) + return -EINVAL; + fw_info->ver = adev->sdma.instance[query_fw->index].fw_version; + fw_info->feature = adev->sdma.instance[query_fw->index].feature_version; + break; + default: + return -EINVAL; + } + return 0; +} + /* * Userspace get information ioctl */ @@ -292,63 +351,16 @@ static int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file return copy_to_user(out, &ui64, min(size, 8u)) ? -EFAULT : 0; case AMDGPU_INFO_FW_VERSION: { struct drm_amdgpu_info_firmware fw_info; + int ret; /* We only support one instance of each IP block right now. */ if (info->query_fw.ip_instance != 0) return -EINVAL; - switch (info->query_fw.fw_type) { - case AMDGPU_INFO_FW_VCE: - fw_info.ver = adev->vce.fw_version; - fw_info.feature = adev->vce.fb_version; - break; - case AMDGPU_INFO_FW_UVD: - fw_info.ver = adev->uvd.fw_version; - fw_info.feature = 0; - break; - case AMDGPU_INFO_FW_GMC: - fw_info.ver = adev->mc.fw_version; - fw_info.feature = 0; - break; - case AMDGPU_INFO_FW_GFX_ME: - fw_info.ver = adev->gfx.me_fw_version; - fw_info.feature = adev->gfx.me_feature_version; - break; - case AMDGPU_INFO_FW_GFX_PFP: - fw_info.ver = adev->gfx.pfp_fw_version; - fw_info.feature = adev->gfx.pfp_feature_version; - break; - case AMDGPU_INFO_FW_GFX_CE: - fw_info.ver = adev->gfx.ce_fw_version; - fw_info.feature = adev->gfx.ce_feature_version; - break; - case AMDGPU_INFO_FW_GFX_RLC: - fw_info.ver = adev->gfx.rlc_fw_version; - fw_info.feature = adev->gfx.rlc_feature_version; - break; - case AMDGPU_INFO_FW_GFX_MEC: - if (info->query_fw.index == 0) { - fw_info.ver = adev->gfx.mec_fw_version; - fw_info.feature = adev->gfx.mec_feature_version; - } else if (info->query_fw.index == 1) { - fw_info.ver = adev->gfx.mec2_fw_version; - fw_info.feature = adev->gfx.mec2_feature_version; - } else - return -EINVAL; - break; - case AMDGPU_INFO_FW_SMC: - fw_info.ver = adev->pm.fw_version; - fw_info.feature = 0; - break; - case AMDGPU_INFO_FW_SDMA: - if (info->query_fw.index >= adev->sdma.num_instances) - return -EINVAL; - fw_info.ver = adev->sdma.instance[info->query_fw.index].fw_version; - fw_info.feature = adev->sdma.instance[info->query_fw.index].feature_version; - break; - default: - return -EINVAL; - } + ret = amdgpu_firmware_info(&fw_info, &info->query_fw, adev); + if (ret) + return ret; + return copy_to_user(out, &fw_info, min((size_t)size, sizeof(fw_info))) ? -EFAULT : 0; } diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h index cdecf87576e8..462246aa200e 100644 --- a/include/uapi/drm/amdgpu_drm.h +++ b/include/uapi/drm/amdgpu_drm.h @@ -487,6 +487,22 @@ struct drm_amdgpu_cs_chunk_data { #define AMDGPU_INFO_MMR_SH_INDEX_SHIFT 8 #define AMDGPU_INFO_MMR_SH_INDEX_MASK 0xff +struct drm_amdgpu_query_fw { + /** AMDGPU_INFO_FW_* */ + __u32 fw_type; + /** + * Index of the IP if there are more IPs of + * the same type. + */ + __u32 ip_instance; + /** + * Index of the engine. Whether this is used depends + * on the firmware type. (e.g. MEC, SDMA) + */ + __u32 index; + __u32 _pad; +}; + /* Input structure for the INFO ioctl */ struct drm_amdgpu_info { /* Where the return value will be stored */ @@ -522,21 +538,7 @@ struct drm_amdgpu_info { __u32 flags; } read_mmr_reg; - struct { - /** AMDGPU_INFO_FW_* */ - __u32 fw_type; - /** - * Index of the IP if there are more IPs of - * the same type. - */ - __u32 ip_instance; - /** - * Index of the engine. Whether this is used depends - * on the firmware type. (e.g. MEC, SDMA) - */ - __u32 index; - __u32 _pad; - } query_fw; + struct drm_amdgpu_query_fw query_fw; }; }; -- cgit v1.2.3-71-gd317 From 3713131345fbea291cbd859d248e06ed77815962 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Tue, 12 Jul 2016 22:09:27 +0200 Subject: KVM: x86: add KVM_CAP_X2APIC_API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit KVM_CAP_X2APIC_API is a capability for features related to x2APIC enablement. KVM_X2APIC_API_32BIT_FORMAT feature can be enabled to extend APIC ID in get/set ioctl and MSI addresses to 32 bits. Both are needed to support x2APIC. The feature has to be enableable and disabled by default, because get/set ioctl shifted and truncated APIC ID to 8 bits by using a non-standard protocol inspired by xAPIC and the change is not backward-compatible. Changes to MSI addresses follow the format used by interrupt remapping unit. The upper address word, that used to be 0, contains upper 24 bits of the LAPIC address in its upper 24 bits. Lower 8 bits are reserved as 0. Using the upper address word is not backward-compatible either as we didn't check that userspace zeroed the word. Reserved bits are still not explicitly checked, but non-zero data will affect LAPIC addresses, which will cause a bug. Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 41 +++++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/kvm_host.h | 4 +++- arch/x86/kvm/irq_comm.c | 29 ++++++++++++++++++++++----- arch/x86/kvm/lapic.c | 13 +++++++++---- arch/x86/kvm/vmx.c | 2 +- arch/x86/kvm/x86.c | 15 ++++++++++++++ include/trace/events/kvm.h | 5 +++-- include/uapi/linux/kvm.h | 3 +++ 8 files changed, 99 insertions(+), 13 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 09efa9eb3926..e34e51fa28b0 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1482,6 +1482,11 @@ struct kvm_irq_routing_msi { __u32 pad; }; +On x86, address_hi is ignored unless the KVM_X2APIC_API_USE_32BIT_IDS +feature of KVM_CAP_X2APIC_API capability is enabled. If it is enabled, +address_hi bits 31-8 provide bits 31-8 of the destination id. Bits 7-0 of +address_hi must be zero. + struct kvm_irq_routing_s390_adapter { __u64 ind_addr; __u64 summary_addr; @@ -1583,6 +1588,17 @@ struct kvm_lapic_state { Reads the Local APIC registers and copies them into the input argument. The data format and layout are the same as documented in the architecture manual. +If KVM_X2APIC_API_USE_32BIT_IDS feature of KVM_CAP_X2APIC_API is +enabled, then the format of APIC_ID register depends on the APIC mode +(reported by MSR_IA32_APICBASE) of its VCPU. x2APIC stores APIC ID in +the APIC_ID register (bytes 32-35). xAPIC only allows an 8-bit APIC ID +which is stored in bits 31-24 of the APIC register, or equivalently in +byte 35 of struct kvm_lapic_state's regs field. KVM_GET_LAPIC must then +be called after MSR_IA32_APICBASE has been set with KVM_SET_MSR. + +If KVM_X2APIC_API_USE_32BIT_IDS feature is disabled, struct kvm_lapic_state +always uses xAPIC format. + 4.58 KVM_SET_LAPIC @@ -1600,6 +1616,10 @@ struct kvm_lapic_state { Copies the input argument into the Local APIC registers. The data format and layout are the same as documented in the architecture manual. +The format of the APIC ID register (bytes 32-35 of struct kvm_lapic_state's +regs field) depends on the state of the KVM_CAP_X2APIC_API capability. +See the note in KVM_GET_LAPIC. + 4.59 KVM_IOEVENTFD @@ -2180,6 +2200,10 @@ struct kvm_msi { No flags are defined so far. The corresponding field must be 0. +On x86, address_hi is ignored unless the KVM_CAP_X2APIC_API capability is +enabled. If it is enabled, address_hi bits 31-8 provide bits 31-8 of the +destination id. Bits 7-0 of address_hi must be zero. + 4.71 KVM_CREATE_PIT2 @@ -3811,6 +3835,23 @@ Allows use of runtime-instrumentation introduced with zEC12 processor. Will return -EINVAL if the machine does not support runtime-instrumentation. Will return -EBUSY if a VCPU has already been created. +7.7 KVM_CAP_X2APIC_API + +Architectures: x86 +Parameters: args[0] - features that should be enabled +Returns: 0 on success, -EINVAL when args[0] contains invalid features + +Valid feature flags in args[0] are + +#define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) + +Enabling KVM_X2APIC_API_USE_32BIT_IDS changes the behavior of +KVM_SET_GSI_ROUTING, KVM_SIGNAL_MSI, KVM_SET_LAPIC, and KVM_GET_LAPIC, +allowing the use of 32-bit APIC IDs. See KVM_CAP_X2APIC_API in their +respective sections. + + + 8. Other capabilities. ---------------------- diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a2832cc3cb81..7c00ba3242d7 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -782,6 +782,8 @@ struct kvm_arch { u32 ldr_mode; struct page *avic_logical_id_table_page; struct page *avic_physical_id_table_page; + + bool x2apic_format; }; struct kvm_vm_stat { @@ -1364,7 +1366,7 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu); bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, struct kvm_vcpu **dest_vcpu); -void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, +void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, struct kvm_lapic_irq *irq); static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 889563d50c55..25810b144b58 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -110,13 +110,17 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, return r; } -void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, +void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, struct kvm_lapic_irq *irq) { - trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data); + trace_kvm_msi_set_irq(e->msi.address_lo | (kvm->arch.x2apic_format ? + (u64)e->msi.address_hi << 32 : 0), + e->msi.data); irq->dest_id = (e->msi.address_lo & MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT; + if (kvm->arch.x2apic_format) + irq->dest_id |= MSI_ADDR_EXT_DEST_ID(e->msi.address_hi); irq->vector = (e->msi.data & MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT; irq->dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo; @@ -129,15 +133,24 @@ void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, } EXPORT_SYMBOL_GPL(kvm_set_msi_irq); +static inline bool kvm_msi_route_invalid(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e) +{ + return kvm->arch.x2apic_format && (e->msi.address_hi & 0xff); +} + int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, bool line_status) { struct kvm_lapic_irq irq; + if (kvm_msi_route_invalid(kvm, e)) + return -EINVAL; + if (!level) return -1; - kvm_set_msi_irq(e, &irq); + kvm_set_msi_irq(kvm, e, &irq); return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL); } @@ -153,7 +166,10 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, if (unlikely(e->type != KVM_IRQ_ROUTING_MSI)) return -EWOULDBLOCK; - kvm_set_msi_irq(e, &irq); + if (kvm_msi_route_invalid(kvm, e)) + return -EINVAL; + + kvm_set_msi_irq(kvm, e, &irq); if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL)) return r; @@ -286,6 +302,9 @@ int kvm_set_routing_entry(struct kvm *kvm, e->msi.address_lo = ue->u.msi.address_lo; e->msi.address_hi = ue->u.msi.address_hi; e->msi.data = ue->u.msi.data; + + if (kvm_msi_route_invalid(kvm, e)) + goto out; break; case KVM_IRQ_ROUTING_HV_SINT: e->set = kvm_hv_set_sint; @@ -394,7 +413,7 @@ void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, if (entry->type != KVM_IRQ_ROUTING_MSI) continue; - kvm_set_msi_irq(entry, &irq); + kvm_set_msi_irq(vcpu->kvm, entry, &irq); if (irq.level && kvm_apic_match_dest(vcpu, NULL, 0, irq.dest_id, irq.dest_mode)) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 3c2a8c113054..d27a7829a4ce 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1991,10 +1991,15 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu, if (apic_x2apic_mode(vcpu->arch.apic)) { u32 *id = (u32 *)(s->regs + APIC_ID); - if (set) - *id >>= 24; - else - *id <<= 24; + if (vcpu->kvm->arch.x2apic_format) { + if (*id != vcpu->vcpu_id) + return -EINVAL; + } else { + if (set) + *id >>= 24; + else + *id <<= 24; + } } return 0; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7bdd6b1a2373..b61cdadf8623 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11104,7 +11104,7 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, * We will support full lowest-priority interrupt later. */ - kvm_set_msi_irq(e, &irq); + kvm_set_msi_irq(kvm, e, &irq); if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) { /* * Make sure the IRTE is in remapped mode if diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b6e402d16e0c..d86f563a6896 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -90,6 +90,8 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU +#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS) + static void update_cr8_intercept(struct kvm_vcpu *vcpu); static void process_nmi(struct kvm_vcpu *vcpu); static void enter_smm(struct kvm_vcpu *vcpu); @@ -2625,6 +2627,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_TSC_CONTROL: r = kvm_has_tsc_control; break; + case KVM_CAP_X2APIC_API: + r = KVM_X2APIC_API_VALID_FLAGS; + break; default: r = 0; break; @@ -3799,6 +3804,16 @@ split_irqchip_unlock: mutex_unlock(&kvm->lock); break; } + case KVM_CAP_X2APIC_API: + r = -EINVAL; + if (cap->args[0] & ~KVM_X2APIC_API_VALID_FLAGS) + break; + + if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS) + kvm->arch.x2apic_format = true; + + r = 0; + break; default: r = -EINVAL; break; diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index f28292d73ddb..8ade3eb6c640 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -151,8 +151,9 @@ TRACE_EVENT(kvm_msi_set_irq, __entry->data = data; ), - TP_printk("dst %u vec %u (%s|%s|%s%s)", - (u8)(__entry->address >> 12), (u8)__entry->data, + TP_printk("dst %llx vec %u (%s|%s|%s%s)", + (u8)(__entry->address >> 12) | ((__entry->address >> 32) & 0xffffff00), + (u8)__entry->data, __print_symbolic((__entry->data >> 8 & 0x7), kvm_deliver_mode), (__entry->address & (1<<2)) ? "logical" : "physical", (__entry->data & (1<<15)) ? "level" : "edge", diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 05ebf475104c..f704403e19a0 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -866,6 +866,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_ARM_PMU_V3 126 #define KVM_CAP_VCPU_ATTRIBUTES 127 #define KVM_CAP_MAX_VCPU_ID 128 +#define KVM_CAP_X2APIC_API 129 #ifdef KVM_CAP_IRQ_ROUTING @@ -1313,4 +1314,6 @@ struct kvm_assigned_msix_entry { __u16 padding[3]; }; +#define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) + #endif /* __LINUX_KVM_H */ -- cgit v1.2.3-71-gd317 From c519265f2aa348b2f1b9ecf8fbe20bb7c0fb102e Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Tue, 12 Jul 2016 22:09:28 +0200 Subject: KVM: x86: add a flag to disable KVM x2apic broadcast quirk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK as a feature flag to KVM_CAP_X2APIC_API. The quirk made KVM interpret 0xff as a broadcast even in x2APIC mode. The enableable capability is needed in order to support standard x2APIC and remain backward compatible. Signed-off-by: Radim Krčmář [Expand kvm_apic_mda comment. - Paolo] Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 6 +++++ arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/lapic.c | 53 +++++++++++++++++++++++++++++---------- arch/x86/kvm/x86.c | 5 +++- include/uapi/linux/kvm.h | 1 + 5 files changed, 52 insertions(+), 14 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index e34e51fa28b0..c4d2fb0e28de 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3844,12 +3844,18 @@ Returns: 0 on success, -EINVAL when args[0] contains invalid features Valid feature flags in args[0] are #define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) +#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) Enabling KVM_X2APIC_API_USE_32BIT_IDS changes the behavior of KVM_SET_GSI_ROUTING, KVM_SIGNAL_MSI, KVM_SET_LAPIC, and KVM_GET_LAPIC, allowing the use of 32-bit APIC IDs. See KVM_CAP_X2APIC_API in their respective sections. +KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK must be enabled for x2APIC to work +in logical mode or with more than 255 VCPUs. Otherwise, KVM treats 0xff +as a broadcast even in x2APIC mode in order to support physical x2APIC +without interrupt remapping. This is undesirable in logical mode, +where 0xff represents CPUs 0-7 in cluster 0. 8. Other capabilities. diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7c00ba3242d7..074b5c760327 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -784,6 +784,7 @@ struct kvm_arch { struct page *avic_physical_id_table_page; bool x2apic_format; + bool x2apic_broadcast_quirk_disabled; }; struct kvm_vm_stat { diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index d27a7829a4ce..a16e0bb95d28 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -616,17 +616,30 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) } } -/* KVM APIC implementation has two quirks - * - dest always begins at 0 while xAPIC MDA has offset 24, - * - IOxAPIC messages have to be delivered (directly) to x2APIC. +/* The KVM local APIC implementation has two quirks: + * + * - the xAPIC MDA stores the destination at bits 24-31, while this + * is not true of struct kvm_lapic_irq's dest_id field. This is + * just a quirk in the API and is not problematic. + * + * - in-kernel IOAPIC messages have to be delivered directly to + * x2APIC, because the kernel does not support interrupt remapping. + * In order to support broadcast without interrupt remapping, x2APIC + * rewrites the destination of non-IPI messages from APIC_BROADCAST + * to X2APIC_BROADCAST. + * + * The broadcast quirk can be disabled with KVM_CAP_X2APIC_API. This is + * important when userspace wants to use x2APIC-format MSIs, because + * APIC_BROADCAST (0xff) is a legal route for "cluster 0, CPUs 0-7". */ -static u32 kvm_apic_mda(unsigned int dest_id, struct kvm_lapic *source, - struct kvm_lapic *target) +static u32 kvm_apic_mda(struct kvm_vcpu *vcpu, unsigned int dest_id, + struct kvm_lapic *source, struct kvm_lapic *target) { bool ipi = source != NULL; bool x2apic_mda = apic_x2apic_mode(ipi ? source : target); - if (!ipi && dest_id == APIC_BROADCAST && x2apic_mda) + if (!vcpu->kvm->arch.x2apic_broadcast_quirk_disabled && + !ipi && dest_id == APIC_BROADCAST && x2apic_mda) return X2APIC_BROADCAST; return x2apic_mda ? dest_id : SET_APIC_DEST_FIELD(dest_id); @@ -636,7 +649,7 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int short_hand, unsigned int dest, int dest_mode) { struct kvm_lapic *target = vcpu->arch.apic; - u32 mda = kvm_apic_mda(dest, source, target); + u32 mda = kvm_apic_mda(vcpu, dest, source, target); apic_debug("target %p, source %p, dest 0x%x, " "dest_mode 0x%x, short_hand 0x%x\n", @@ -688,6 +701,25 @@ static void kvm_apic_disabled_lapic_found(struct kvm *kvm) } } +static bool kvm_apic_is_broadcast_dest(struct kvm *kvm, struct kvm_lapic **src, + struct kvm_lapic_irq *irq, struct kvm_apic_map *map) +{ + if (kvm->arch.x2apic_broadcast_quirk_disabled) { + if ((irq->dest_id == APIC_BROADCAST && + map->mode != KVM_APIC_MODE_X2APIC)) + return true; + if (irq->dest_id == X2APIC_BROADCAST) + return true; + } else { + bool x2apic_ipi = src && *src && apic_x2apic_mode(*src); + if (irq->dest_id == (x2apic_ipi ? + X2APIC_BROADCAST : APIC_BROADCAST)) + return true; + } + + return false; +} + /* Return true if the interrupt can be handled by using *bitmap as index mask * for valid destinations in *dst array. * Return false if kvm_apic_map_get_dest_lapic did nothing useful. @@ -701,7 +733,6 @@ static inline bool kvm_apic_map_get_dest_lapic(struct kvm *kvm, unsigned long *bitmap) { int i, lowest; - bool x2apic_ipi; if (irq->shorthand == APIC_DEST_SELF && src) { *dst = src; @@ -710,11 +741,7 @@ static inline bool kvm_apic_map_get_dest_lapic(struct kvm *kvm, } else if (irq->shorthand) return false; - x2apic_ipi = src && *src && apic_x2apic_mode(*src); - if (irq->dest_id == (x2apic_ipi ? X2APIC_BROADCAST : APIC_BROADCAST)) - return false; - - if (!map) + if (!map || kvm_apic_is_broadcast_dest(kvm, src, irq, map)) return false; if (irq->dest_mode == APIC_DEST_PHYSICAL) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d86f563a6896..f0d23622bc4e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -90,7 +90,8 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU -#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS) +#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \ + KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) static void update_cr8_intercept(struct kvm_vcpu *vcpu); static void process_nmi(struct kvm_vcpu *vcpu); @@ -3811,6 +3812,8 @@ split_irqchip_unlock: if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS) kvm->arch.x2apic_format = true; + if (cap->args[0] & KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) + kvm->arch.x2apic_broadcast_quirk_disabled = true; r = 0; break; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index f704403e19a0..4f8030e5b05d 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1315,5 +1315,6 @@ struct kvm_assigned_msix_entry { }; #define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) +#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) #endif /* __LINUX_KVM_H */ -- cgit v1.2.3-71-gd317 From af713795c59fea36161a7debf97dbc10bf652cf7 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Fri, 1 Jul 2016 13:10:38 -0700 Subject: drm/vc4: Add a getparam ioctl for getting the V3D identity regs. As I extend the driver to support different V3D revisions, userspace needs to know what version it's targeting. This is most easily detected using the V3D identity registers. v2: Make sure V3D is runtime PM on when reading the registers. v3: Switch to a 64-bit param value (suggested by Rob Clark in review) Signed-off-by: Eric Anholt Acked-by: Daniel Vetter (v2) Reviewed-by: Rob Clark (v3, over irc) --- drivers/gpu/drm/vc4/vc4_drv.c | 42 ++++++++++++++++++++++++++++++++++++++++++ include/uapi/drm/vc4_drm.h | 12 ++++++++++++ 2 files changed, 54 insertions(+) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/vc4/vc4_drv.c b/drivers/gpu/drm/vc4/vc4_drv.c index 2f30214ee810..047d7a265ceb 100644 --- a/drivers/gpu/drm/vc4/vc4_drv.c +++ b/drivers/gpu/drm/vc4/vc4_drv.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "drm_fb_cma_helper.h" #include "uapi/drm/vc4_drm.h" @@ -43,6 +44,46 @@ void __iomem *vc4_ioremap_regs(struct platform_device *dev, int index) return map; } +static int vc4_get_param_ioctl(struct drm_device *dev, void *data, + struct drm_file *file_priv) +{ + struct vc4_dev *vc4 = to_vc4_dev(dev); + struct drm_vc4_get_param *args = data; + int ret; + + if (args->pad != 0) + return -EINVAL; + + switch (args->param) { + case DRM_VC4_PARAM_V3D_IDENT0: + ret = pm_runtime_get_sync(&vc4->v3d->pdev->dev); + if (ret) + return ret; + args->value = V3D_READ(V3D_IDENT0); + pm_runtime_put(&vc4->v3d->pdev->dev); + break; + case DRM_VC4_PARAM_V3D_IDENT1: + ret = pm_runtime_get_sync(&vc4->v3d->pdev->dev); + if (ret) + return ret; + args->value = V3D_READ(V3D_IDENT1); + pm_runtime_put(&vc4->v3d->pdev->dev); + break; + case DRM_VC4_PARAM_V3D_IDENT2: + ret = pm_runtime_get_sync(&vc4->v3d->pdev->dev); + if (ret) + return ret; + args->value = V3D_READ(V3D_IDENT2); + pm_runtime_put(&vc4->v3d->pdev->dev); + break; + default: + DRM_DEBUG("Unknown parameter %d\n", args->param); + return -EINVAL; + } + + return 0; +} + static void vc4_lastclose(struct drm_device *dev) { struct vc4_dev *vc4 = to_vc4_dev(dev); @@ -74,6 +115,7 @@ static const struct drm_ioctl_desc vc4_drm_ioctls[] = { DRM_IOCTL_DEF_DRV(VC4_CREATE_SHADER_BO, vc4_create_shader_bo_ioctl, DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(VC4_GET_HANG_STATE, vc4_get_hang_state_ioctl, DRM_ROOT_ONLY), + DRM_IOCTL_DEF_DRV(VC4_GET_PARAM, vc4_get_param_ioctl, DRM_RENDER_ALLOW), }; static struct drm_driver vc4_drm_driver = { diff --git a/include/uapi/drm/vc4_drm.h b/include/uapi/drm/vc4_drm.h index af12e8a184c8..1143e954048d 100644 --- a/include/uapi/drm/vc4_drm.h +++ b/include/uapi/drm/vc4_drm.h @@ -37,6 +37,7 @@ extern "C" { #define DRM_VC4_MMAP_BO 0x04 #define DRM_VC4_CREATE_SHADER_BO 0x05 #define DRM_VC4_GET_HANG_STATE 0x06 +#define DRM_VC4_GET_PARAM 0x07 #define DRM_IOCTL_VC4_SUBMIT_CL DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_SUBMIT_CL, struct drm_vc4_submit_cl) #define DRM_IOCTL_VC4_WAIT_SEQNO DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_WAIT_SEQNO, struct drm_vc4_wait_seqno) @@ -45,6 +46,7 @@ extern "C" { #define DRM_IOCTL_VC4_MMAP_BO DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_MMAP_BO, struct drm_vc4_mmap_bo) #define DRM_IOCTL_VC4_CREATE_SHADER_BO DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_CREATE_SHADER_BO, struct drm_vc4_create_shader_bo) #define DRM_IOCTL_VC4_GET_HANG_STATE DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_GET_HANG_STATE, struct drm_vc4_get_hang_state) +#define DRM_IOCTL_VC4_GET_PARAM DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_GET_PARAM, struct drm_vc4_get_param) struct drm_vc4_submit_rcl_surface { __u32 hindex; /* Handle index, or ~0 if not present. */ @@ -280,6 +282,16 @@ struct drm_vc4_get_hang_state { __u32 pad[16]; }; +#define DRM_VC4_PARAM_V3D_IDENT0 0 +#define DRM_VC4_PARAM_V3D_IDENT1 1 +#define DRM_VC4_PARAM_V3D_IDENT2 2 + +struct drm_vc4_get_param { + __u32 param; + __u32 pad; + __u64 value; +}; + #if defined(__cplusplus) } #endif -- cgit v1.2.3-71-gd317 From 7363cee5b467c31dc3af2ac98df0634bb8bbc668 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Sat, 2 Jul 2016 14:14:27 -0700 Subject: drm/vc4: Add a getparam to signal support for branches. Userspace needs to know if it can create shaders that do branching. Otherwise, for backwards compatibility with old kernels it needs to lower if statements to conditional assignments. Signed-off-by: Eric Anholt --- drivers/gpu/drm/vc4/vc4_drv.c | 3 +++ include/uapi/drm/vc4_drm.h | 1 + 2 files changed, 4 insertions(+) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/vc4/vc4_drv.c b/drivers/gpu/drm/vc4/vc4_drv.c index 047d7a265ceb..9435894822d5 100644 --- a/drivers/gpu/drm/vc4/vc4_drv.c +++ b/drivers/gpu/drm/vc4/vc4_drv.c @@ -76,6 +76,9 @@ static int vc4_get_param_ioctl(struct drm_device *dev, void *data, args->value = V3D_READ(V3D_IDENT2); pm_runtime_put(&vc4->v3d->pdev->dev); break; + case DRM_VC4_PARAM_SUPPORTS_BRANCHES: + args->value = true; + break; default: DRM_DEBUG("Unknown parameter %d\n", args->param); return -EINVAL; diff --git a/include/uapi/drm/vc4_drm.h b/include/uapi/drm/vc4_drm.h index 1143e954048d..ad7edc3edf7c 100644 --- a/include/uapi/drm/vc4_drm.h +++ b/include/uapi/drm/vc4_drm.h @@ -285,6 +285,7 @@ struct drm_vc4_get_hang_state { #define DRM_VC4_PARAM_V3D_IDENT0 0 #define DRM_VC4_PARAM_V3D_IDENT1 1 #define DRM_VC4_PARAM_V3D_IDENT2 2 +#define DRM_VC4_PARAM_SUPPORTS_BRANCHES 3 struct drm_vc4_get_param { __u32 param; -- cgit v1.2.3-71-gd317 From 4cd33c48ea25ba17e9d0383fe914c3e58b48f7dd Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Tue, 17 May 2016 15:44:49 -0400 Subject: drm/msm: add madvise ioctl Doesn't do anything too interesting until we wire up shrinker. Pretty much lifted from i915. Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/msm_drv.c | 39 +++++++++++++++++++++++++++++++++++++++ drivers/gpu/drm/msm/msm_drv.h | 1 + drivers/gpu/drm/msm/msm_gem.c | 35 +++++++++++++++++++++++++++++++++-- drivers/gpu/drm/msm/msm_gem.h | 5 +++++ include/uapi/drm/msm_drm.h | 25 ++++++++++++++++++++++++- 5 files changed, 102 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c index 435a8f90c290..00881f3ed32e 100644 --- a/drivers/gpu/drm/msm/msm_drv.c +++ b/drivers/gpu/drm/msm/msm_drv.c @@ -690,6 +690,44 @@ static int msm_ioctl_wait_fence(struct drm_device *dev, void *data, return msm_wait_fence(priv->gpu->fctx, args->fence, &timeout, true); } +static int msm_ioctl_gem_madvise(struct drm_device *dev, void *data, + struct drm_file *file) +{ + struct drm_msm_gem_madvise *args = data; + struct drm_gem_object *obj; + int ret; + + switch (args->madv) { + case MSM_MADV_DONTNEED: + case MSM_MADV_WILLNEED: + break; + default: + return -EINVAL; + } + + ret = mutex_lock_interruptible(&dev->struct_mutex); + if (ret) + return ret; + + obj = drm_gem_object_lookup(file, args->handle); + if (!obj) { + ret = -ENOENT; + goto unlock; + } + + ret = msm_gem_madvise(obj, args->madv); + if (ret >= 0) { + args->retained = ret; + ret = 0; + } + + drm_gem_object_unreference(obj); + +unlock: + mutex_unlock(&dev->struct_mutex); + return ret; +} + static const struct drm_ioctl_desc msm_ioctls[] = { DRM_IOCTL_DEF_DRV(MSM_GET_PARAM, msm_ioctl_get_param, DRM_AUTH|DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(MSM_GEM_NEW, msm_ioctl_gem_new, DRM_AUTH|DRM_RENDER_ALLOW), @@ -698,6 +736,7 @@ static const struct drm_ioctl_desc msm_ioctls[] = { DRM_IOCTL_DEF_DRV(MSM_GEM_CPU_FINI, msm_ioctl_gem_cpu_fini, DRM_AUTH|DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(MSM_GEM_SUBMIT, msm_ioctl_gem_submit, DRM_AUTH|DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(MSM_WAIT_FENCE, msm_ioctl_wait_fence, DRM_AUTH|DRM_RENDER_ALLOW), + DRM_IOCTL_DEF_DRV(MSM_GEM_MADVISE, msm_ioctl_gem_madvise, DRM_AUTH|DRM_RENDER_ALLOW), }; static const struct vm_operations_struct vm_ops = { diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h index be01e3895178..a49d7fd67ec3 100644 --- a/drivers/gpu/drm/msm/msm_drv.h +++ b/drivers/gpu/drm/msm/msm_drv.h @@ -195,6 +195,7 @@ int msm_gem_prime_pin(struct drm_gem_object *obj); void msm_gem_prime_unpin(struct drm_gem_object *obj); void *msm_gem_vaddr_locked(struct drm_gem_object *obj); void *msm_gem_vaddr(struct drm_gem_object *obj); +int msm_gem_madvise(struct drm_gem_object *obj, unsigned madv); int msm_gem_sync_object(struct drm_gem_object *obj, struct msm_fence_context *fctx, bool exclusive); void msm_gem_move_to_active(struct drm_gem_object *obj, diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index 69836f5685b1..c40db08647d1 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -413,6 +413,21 @@ void *msm_gem_vaddr(struct drm_gem_object *obj) return ret; } +/* Update madvise status, returns true if not purged, else + * false or -errno. + */ +int msm_gem_madvise(struct drm_gem_object *obj, unsigned madv) +{ + struct msm_gem_object *msm_obj = to_msm_bo(obj); + + WARN_ON(!mutex_is_locked(&obj->dev->struct_mutex)); + + if (msm_obj->madv != __MSM_MADV_PURGED) + msm_obj->madv = madv; + + return (msm_obj->madv != __MSM_MADV_PURGED); +} + /* must be called before _move_to_active().. */ int msm_gem_sync_object(struct drm_gem_object *obj, struct msm_fence_context *fctx, bool exclusive) @@ -464,6 +479,7 @@ void msm_gem_move_to_active(struct drm_gem_object *obj, struct msm_gpu *gpu, bool exclusive, struct fence *fence) { struct msm_gem_object *msm_obj = to_msm_bo(obj); + WARN_ON(msm_obj->madv != MSM_MADV_WILLNEED); msm_obj->gpu = gpu; if (exclusive) reservation_object_add_excl_fence(msm_obj->resv, fence); @@ -532,13 +548,27 @@ void msm_gem_describe(struct drm_gem_object *obj, struct seq_file *m) struct reservation_object_list *fobj; struct fence *fence; uint64_t off = drm_vma_node_start(&obj->vma_node); + const char *madv; WARN_ON(!mutex_is_locked(&obj->dev->struct_mutex)); - seq_printf(m, "%08x: %c %2d (%2d) %08llx %p %zu\n", + switch (msm_obj->madv) { + case __MSM_MADV_PURGED: + madv = " purged"; + break; + case MSM_MADV_DONTNEED: + madv = " purgeable"; + break; + case MSM_MADV_WILLNEED: + default: + madv = ""; + break; + } + + seq_printf(m, "%08x: %c %2d (%2d) %08llx %p %zu%s\n", msm_obj->flags, is_active(msm_obj) ? 'A' : 'I', obj->name, obj->refcount.refcount.counter, - off, msm_obj->vaddr, obj->size); + off, msm_obj->vaddr, obj->size, madv); rcu_read_lock(); fobj = rcu_dereference(robj->fence); @@ -688,6 +718,7 @@ static int msm_gem_new_impl(struct drm_device *dev, msm_obj->vram_node = (void *)&msm_obj[1]; msm_obj->flags = flags; + msm_obj->madv = MSM_MADV_WILLNEED; if (resv) { msm_obj->resv = resv; diff --git a/drivers/gpu/drm/msm/msm_gem.h b/drivers/gpu/drm/msm/msm_gem.h index 9facd4b6ffd9..fa8e1f16f18e 100644 --- a/drivers/gpu/drm/msm/msm_gem.h +++ b/drivers/gpu/drm/msm/msm_gem.h @@ -29,6 +29,11 @@ struct msm_gem_object { uint32_t flags; + /** + * Advice: are the backing pages purgeable? + */ + uint8_t madv; + /* And object is either: * inactive - on priv->inactive_list * active - on one one of the gpu's active_list.. well, at diff --git a/include/uapi/drm/msm_drm.h b/include/uapi/drm/msm_drm.h index bf19d2cd9078..49f778de8e06 100644 --- a/include/uapi/drm/msm_drm.h +++ b/include/uapi/drm/msm_drm.h @@ -201,6 +201,27 @@ struct drm_msm_wait_fence { struct drm_msm_timespec timeout; /* in */ }; +/* madvise provides a way to tell the kernel in case a buffers contents + * can be discarded under memory pressure, which is useful for userspace + * bo cache where we want to optimistically hold on to buffer allocate + * and potential mmap, but allow the pages to be discarded under memory + * pressure. + * + * Typical usage would involve madvise(DONTNEED) when buffer enters BO + * cache, and madvise(WILLNEED) if trying to recycle buffer from BO cache. + * In the WILLNEED case, 'retained' indicates to userspace whether the + * backing pages still exist. + */ +#define MSM_MADV_WILLNEED 0 /* backing pages are needed, status returned in 'retained' */ +#define MSM_MADV_DONTNEED 1 /* backing pages not needed */ +#define __MSM_MADV_PURGED 2 /* internal state */ + +struct drm_msm_gem_madvise { + __u32 handle; /* in, GEM handle */ + __u32 madv; /* in, MSM_MADV_x */ + __u32 retained; /* out, whether backing store still exists */ +}; + #define DRM_MSM_GET_PARAM 0x00 /* placeholder: #define DRM_MSM_SET_PARAM 0x01 @@ -211,7 +232,8 @@ struct drm_msm_wait_fence { #define DRM_MSM_GEM_CPU_FINI 0x05 #define DRM_MSM_GEM_SUBMIT 0x06 #define DRM_MSM_WAIT_FENCE 0x07 -#define DRM_MSM_NUM_IOCTLS 0x08 +#define DRM_MSM_GEM_MADVISE 0x08 +#define DRM_MSM_NUM_IOCTLS 0x09 #define DRM_IOCTL_MSM_GET_PARAM DRM_IOWR(DRM_COMMAND_BASE + DRM_MSM_GET_PARAM, struct drm_msm_param) #define DRM_IOCTL_MSM_GEM_NEW DRM_IOWR(DRM_COMMAND_BASE + DRM_MSM_GEM_NEW, struct drm_msm_gem_new) @@ -220,6 +242,7 @@ struct drm_msm_wait_fence { #define DRM_IOCTL_MSM_GEM_CPU_FINI DRM_IOW (DRM_COMMAND_BASE + DRM_MSM_GEM_CPU_FINI, struct drm_msm_gem_cpu_fini) #define DRM_IOCTL_MSM_GEM_SUBMIT DRM_IOWR(DRM_COMMAND_BASE + DRM_MSM_GEM_SUBMIT, struct drm_msm_gem_submit) #define DRM_IOCTL_MSM_WAIT_FENCE DRM_IOW (DRM_COMMAND_BASE + DRM_MSM_WAIT_FENCE, struct drm_msm_wait_fence) +#define DRM_IOCTL_MSM_GEM_MADVISE DRM_IOWR(DRM_COMMAND_BASE + DRM_MSM_GEM_MADVISE, struct drm_msm_gem_madvise) #if defined(__cplusplus) } -- cgit v1.2.3-71-gd317 From 4077798484459a2eced2050045099a466ecb618a Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 15 Jul 2016 09:31:11 +0100 Subject: drm/vgem: Attach sw fences to exported vGEM dma-buf (ioctl) vGEM buffers are useful for passing data between software clients and hardware renders. By allowing the user to create and attach fences to the exported vGEM buffers (on the dma-buf), the user can implement a deferred renderer and queue hardware operations like flipping and then signal the buffer readiness (i.e. this allows the user to schedule operations out-of-order, but have them complete in-order). This also makes it much easier to write tightly controlled testcases for dma-buf fencing and signaling between hardware drivers. v2: Don't pretend the fences exist in an ordered timeline, but allocate a separate fence-context for each fence so that the fences are unordered. v3: Make the debug output more interesting, and show the signaled status. v4: Automatically signal the fence to prevent userspace from indefinitely hanging drivers. Testcase: igt/vgem_basic/dmabuf-fence Testcase: igt/vgem_slow/nohang Signed-off-by: Chris Wilson Cc: Sean Paul Cc: Zach Reizner Cc: Gustavo Padovan Cc: Daniel Vetter Acked-by: Zach Reizner Signed-off-by: Daniel Vetter Link: http://patchwork.freedesktop.org/patch/msgid/1468571471-12610-1-git-send-email-chris@chris-wilson.co.uk --- drivers/gpu/drm/vgem/Makefile | 2 +- drivers/gpu/drm/vgem/vgem_drv.c | 34 +++++ drivers/gpu/drm/vgem/vgem_drv.h | 16 +++ drivers/gpu/drm/vgem/vgem_fence.c | 283 ++++++++++++++++++++++++++++++++++++++ include/uapi/drm/vgem_drm.h | 62 +++++++++ 5 files changed, 396 insertions(+), 1 deletion(-) create mode 100644 drivers/gpu/drm/vgem/vgem_fence.c create mode 100644 include/uapi/drm/vgem_drm.h (limited to 'include/uapi') diff --git a/drivers/gpu/drm/vgem/Makefile b/drivers/gpu/drm/vgem/Makefile index 3f4c7b842028..bfcdea1330e6 100644 --- a/drivers/gpu/drm/vgem/Makefile +++ b/drivers/gpu/drm/vgem/Makefile @@ -1,4 +1,4 @@ ccflags-y := -Iinclude/drm -vgem-y := vgem_drv.o +vgem-y := vgem_drv.o vgem_fence.o obj-$(CONFIG_DRM_VGEM) += vgem.o diff --git a/drivers/gpu/drm/vgem/vgem_drv.c b/drivers/gpu/drm/vgem/vgem_drv.c index 29c2aab3c1a7..c15bafb06665 100644 --- a/drivers/gpu/drm/vgem/vgem_drv.c +++ b/drivers/gpu/drm/vgem/vgem_drv.c @@ -83,6 +83,34 @@ static const struct vm_operations_struct vgem_gem_vm_ops = { .close = drm_gem_vm_close, }; +static int vgem_open(struct drm_device *dev, struct drm_file *file) +{ + struct vgem_file *vfile; + int ret; + + vfile = kzalloc(sizeof(*vfile), GFP_KERNEL); + if (!vfile) + return -ENOMEM; + + file->driver_priv = vfile; + + ret = vgem_fence_open(vfile); + if (ret) { + kfree(vfile); + return ret; + } + + return 0; +} + +static void vgem_preclose(struct drm_device *dev, struct drm_file *file) +{ + struct vgem_file *vfile = file->driver_priv; + + vgem_fence_close(vfile); + kfree(vfile); +} + /* ioctls */ static struct drm_gem_object *vgem_gem_create(struct drm_device *dev, @@ -164,6 +192,8 @@ unref: } static struct drm_ioctl_desc vgem_ioctls[] = { + DRM_IOCTL_DEF_DRV(VGEM_FENCE_ATTACH, vgem_fence_attach_ioctl, DRM_AUTH|DRM_RENDER_ALLOW), + DRM_IOCTL_DEF_DRV(VGEM_FENCE_SIGNAL, vgem_fence_signal_ioctl, DRM_AUTH|DRM_RENDER_ALLOW), }; static int vgem_mmap(struct file *filp, struct vm_area_struct *vma) @@ -271,9 +301,12 @@ static int vgem_prime_mmap(struct drm_gem_object *obj, static struct drm_driver vgem_driver = { .driver_features = DRIVER_GEM | DRIVER_PRIME, + .open = vgem_open, + .preclose = vgem_preclose, .gem_free_object_unlocked = vgem_gem_free_object, .gem_vm_ops = &vgem_gem_vm_ops, .ioctls = vgem_ioctls, + .num_ioctls = ARRAY_SIZE(vgem_ioctls), .fops = &vgem_driver_fops, .dumb_create = vgem_gem_dumb_create, @@ -328,5 +361,6 @@ module_init(vgem_init); module_exit(vgem_exit); MODULE_AUTHOR("Red Hat, Inc."); +MODULE_AUTHOR("Intel Corporation"); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL and additional rights"); diff --git a/drivers/gpu/drm/vgem/vgem_drv.h b/drivers/gpu/drm/vgem/vgem_drv.h index 988cbaae7588..1f8798ad329c 100644 --- a/drivers/gpu/drm/vgem/vgem_drv.h +++ b/drivers/gpu/drm/vgem/vgem_drv.h @@ -32,9 +32,25 @@ #include #include +#include + +struct vgem_file { + struct idr fence_idr; + struct mutex fence_mutex; +}; + #define to_vgem_bo(x) container_of(x, struct drm_vgem_gem_object, base) struct drm_vgem_gem_object { struct drm_gem_object base; }; +int vgem_fence_open(struct vgem_file *file); +int vgem_fence_attach_ioctl(struct drm_device *dev, + void *data, + struct drm_file *file); +int vgem_fence_signal_ioctl(struct drm_device *dev, + void *data, + struct drm_file *file); +void vgem_fence_close(struct vgem_file *file); + #endif diff --git a/drivers/gpu/drm/vgem/vgem_fence.c b/drivers/gpu/drm/vgem/vgem_fence.c new file mode 100644 index 000000000000..e77b52208699 --- /dev/null +++ b/drivers/gpu/drm/vgem/vgem_fence.c @@ -0,0 +1,283 @@ +/* + * Copyright 2016 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software") + * to deal in the software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * them Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTIBILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER + * IN AN ACTION OF CONTRACT, TORT, OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include +#include + +#include "vgem_drv.h" + +#define VGEM_FENCE_TIMEOUT (10*HZ) + +struct vgem_fence { + struct fence base; + struct spinlock lock; + struct timer_list timer; +}; + +static const char *vgem_fence_get_driver_name(struct fence *fence) +{ + return "vgem"; +} + +static const char *vgem_fence_get_timeline_name(struct fence *fence) +{ + return "unbound"; +} + +static bool vgem_fence_signaled(struct fence *fence) +{ + return false; +} + +static bool vgem_fence_enable_signaling(struct fence *fence) +{ + return true; +} + +static void vgem_fence_release(struct fence *base) +{ + struct vgem_fence *fence = container_of(base, typeof(*fence), base); + + del_timer_sync(&fence->timer); + fence_free(&fence->base); +} + +static void vgem_fence_value_str(struct fence *fence, char *str, int size) +{ + snprintf(str, size, "%u", fence->seqno); +} + +static void vgem_fence_timeline_value_str(struct fence *fence, char *str, + int size) +{ + snprintf(str, size, "%u", fence_is_signaled(fence) ? fence->seqno : 0); +} + +const struct fence_ops vgem_fence_ops = { + .get_driver_name = vgem_fence_get_driver_name, + .get_timeline_name = vgem_fence_get_timeline_name, + .enable_signaling = vgem_fence_enable_signaling, + .signaled = vgem_fence_signaled, + .wait = fence_default_wait, + .release = vgem_fence_release, + + .fence_value_str = vgem_fence_value_str, + .timeline_value_str = vgem_fence_timeline_value_str, +}; + +static void vgem_fence_timeout(unsigned long data) +{ + struct vgem_fence *fence = (struct vgem_fence *)data; + + fence_signal(&fence->base); +} + +static struct fence *vgem_fence_create(struct vgem_file *vfile, + unsigned int flags) +{ + struct vgem_fence *fence; + + fence = kzalloc(sizeof(*fence), GFP_KERNEL); + if (!fence) + return NULL; + + spin_lock_init(&fence->lock); + fence_init(&fence->base, &vgem_fence_ops, &fence->lock, + fence_context_alloc(1), 1); + + setup_timer(&fence->timer, vgem_fence_timeout, (unsigned long)fence); + + /* We force the fence to expire within 10s to prevent driver hangs */ + mod_timer(&fence->timer, VGEM_FENCE_TIMEOUT); + + return &fence->base; +} + +static int attach_dmabuf(struct drm_device *dev, + struct drm_gem_object *obj) +{ + struct dma_buf *dmabuf; + + if (obj->dma_buf) + return 0; + + dmabuf = dev->driver->gem_prime_export(dev, obj, 0); + if (IS_ERR(dmabuf)) + return PTR_ERR(dmabuf); + + obj->dma_buf = dmabuf; + drm_gem_object_reference(obj); + return 0; +} + +/* + * vgem_fence_attach_ioctl (DRM_IOCTL_VGEM_FENCE_ATTACH): + * + * Create and attach a fence to the vGEM handle. This fence is then exposed + * via the dma-buf reservation object and visible to consumers of the exported + * dma-buf. If the flags contain VGEM_FENCE_WRITE, the fence indicates the + * vGEM buffer is being written to by the client and is exposed as an exclusive + * fence, otherwise the fence indicates the client is current reading from the + * buffer and all future writes should wait for the client to signal its + * completion. Note that if a conflicting fence is already on the dma-buf (i.e. + * an exclusive fence when adding a read, or any fence when adding a write), + * -EBUSY is reported. Serialisation between operations should be handled + * by waiting upon the dma-buf. + * + * This returns the handle for the new fence that must be signaled within 10 + * seconds (or otherwise it will automatically expire). See + * vgem_fence_signal_ioctl (DRM_IOCTL_VGEM_FENCE_SIGNAL). + * + * If the vGEM handle does not exist, vgem_fence_attach_ioctl returns -ENOENT. + */ +int vgem_fence_attach_ioctl(struct drm_device *dev, + void *data, + struct drm_file *file) +{ + struct drm_vgem_fence_attach *arg = data; + struct vgem_file *vfile = file->driver_priv; + struct reservation_object *resv; + struct drm_gem_object *obj; + struct fence *fence; + int ret; + + if (arg->flags & ~VGEM_FENCE_WRITE) + return -EINVAL; + + if (arg->pad) + return -EINVAL; + + obj = drm_gem_object_lookup(file, arg->handle); + if (!obj) + return -ENOENT; + + ret = attach_dmabuf(dev, obj); + if (ret) + goto err; + + fence = vgem_fence_create(vfile, arg->flags); + if (!fence) { + ret = -ENOMEM; + goto err; + } + + /* Check for a conflicting fence */ + resv = obj->dma_buf->resv; + if (!reservation_object_test_signaled_rcu(resv, + arg->flags & VGEM_FENCE_WRITE)) { + ret = -EBUSY; + goto err_fence; + } + + /* Expose the fence via the dma-buf */ + ret = 0; + mutex_lock(&resv->lock.base); + if (arg->flags & VGEM_FENCE_WRITE) + reservation_object_add_excl_fence(resv, fence); + else if ((ret = reservation_object_reserve_shared(resv)) == 0) + reservation_object_add_shared_fence(resv, fence); + mutex_unlock(&resv->lock.base); + + /* Record the fence in our idr for later signaling */ + if (ret == 0) { + mutex_lock(&vfile->fence_mutex); + ret = idr_alloc(&vfile->fence_idr, fence, 1, 0, GFP_KERNEL); + mutex_unlock(&vfile->fence_mutex); + if (ret > 0) { + arg->out_fence = ret; + ret = 0; + } + } +err_fence: + if (ret) { + fence_signal(fence); + fence_put(fence); + } +err: + drm_gem_object_unreference_unlocked(obj); + return ret; +} + +/* + * vgem_fence_signal_ioctl (DRM_IOCTL_VGEM_FENCE_SIGNAL): + * + * Signal and consume a fence ealier attached to a vGEM handle using + * vgem_fence_attach_ioctl (DRM_IOCTL_VGEM_FENCE_ATTACH). + * + * All fences must be signaled within 10s of attachment or otherwise they + * will automatically expire (and a vgem_fence_signal_ioctl returns -ETIMEDOUT). + * + * Signaling a fence indicates to all consumers of the dma-buf that the + * client has completed the operation associated with the fence, and that the + * buffer is then ready for consumption. + * + * If the fence does not exist (or has already been signaled by the client), + * vgem_fence_signal_ioctl returns -ENOENT. + */ +int vgem_fence_signal_ioctl(struct drm_device *dev, + void *data, + struct drm_file *file) +{ + struct vgem_file *vfile = file->driver_priv; + struct drm_vgem_fence_signal *arg = data; + struct fence *fence; + int ret; + + if (arg->flags) + return -EINVAL; + + mutex_lock(&vfile->fence_mutex); + fence = idr_replace(&vfile->fence_idr, NULL, arg->fence); + mutex_unlock(&vfile->fence_mutex); + if (!fence) + return -ENOENT; + if (IS_ERR(fence)) + return PTR_ERR(fence); + + if (fence_is_signaled(fence)) + ret = -ETIMEDOUT; + + fence_signal(fence); + fence_put(fence); + return ret; +} + +int vgem_fence_open(struct vgem_file *vfile) +{ + mutex_init(&vfile->fence_mutex); + idr_init(&vfile->fence_idr); + + return 0; +} + +static int __vgem_fence_idr_fini(int id, void *p, void *data) +{ + fence_signal(p); + fence_put(p); + return 0; +} + +void vgem_fence_close(struct vgem_file *vfile) +{ + idr_for_each(&vfile->fence_idr, __vgem_fence_idr_fini, vfile); + idr_destroy(&vfile->fence_idr); +} diff --git a/include/uapi/drm/vgem_drm.h b/include/uapi/drm/vgem_drm.h new file mode 100644 index 000000000000..bf66f5db6da8 --- /dev/null +++ b/include/uapi/drm/vgem_drm.h @@ -0,0 +1,62 @@ +/* + * Copyright 2016 Intel Corporation + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef _UAPI_VGEM_DRM_H_ +#define _UAPI_VGEM_DRM_H_ + +#include "drm.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +/* Please note that modifications to all structs defined here are + * subject to backwards-compatibility constraints. + */ +#define DRM_VGEM_FENCE_ATTACH 0x1 +#define DRM_VGEM_FENCE_SIGNAL 0x2 + +#define DRM_IOCTL_VGEM_FENCE_ATTACH DRM_IOWR( DRM_COMMAND_BASE + DRM_VGEM_FENCE_ATTACH, struct drm_vgem_fence_attach) +#define DRM_IOCTL_VGEM_FENCE_SIGNAL DRM_IOW( DRM_COMMAND_BASE + DRM_VGEM_FENCE_SIGNAL, struct drm_vgem_fence_signal) + +struct drm_vgem_fence_attach { + __u32 handle; + __u32 flags; +#define VGEM_FENCE_WRITE 0x1 + __u32 out_fence; + __u32 pad; +}; + +struct drm_vgem_fence_signal { + __u32 fence; + __u32 flags; +}; + +#if defined(__cplusplus) +} +#endif + +#endif /* _UAPI_VGEM_DRM_H_ */ -- cgit v1.2.3-71-gd317 From 6502a34cfd6695929086187f63fe670cc3050e68 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 21 Jun 2016 14:19:51 +0200 Subject: KVM: s390: allow user space to handle instr 0x0000 We will use illegal instruction 0x0000 for handling 2 byte sw breakpoints from user space. As it can be enabled dynamically via a capability, let's move setting of ICTL_OPEREXC to the post creation step, so we avoid any races when enabling that capability just while adding new cpus. Acked-by: Janosch Frank Reviewed-by: Cornelia Huck Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- Documentation/virtual/kvm/api.txt | 13 +++++++++++++ arch/s390/include/asm/kvm_host.h | 2 ++ arch/s390/kvm/intercept.c | 3 +++ arch/s390/kvm/kvm-s390.c | 26 ++++++++++++++++++++++++-- include/uapi/linux/kvm.h | 1 + 5 files changed, 43 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index c4d2fb0e28de..299306db5d84 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3857,6 +3857,19 @@ as a broadcast even in x2APIC mode in order to support physical x2APIC without interrupt remapping. This is undesirable in logical mode, where 0xff represents CPUs 0-7 in cluster 0. +7.8 KVM_CAP_S390_USER_INSTR0 + +Architectures: s390 +Parameters: none + +With this capability enabled, all illegal instructions 0x0000 (2 bytes) will +be intercepted and forwarded to user space. User space can use this +mechanism e.g. to realize 2-byte software breakpoints. The kernel will +not inject an operating exception for these instructions, user space has +to take care of that. + +This capability can be enabled dynamically even if VCPUs were already +created and are running. 8. Other capabilities. ---------------------- diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 946fc86202fd..183b01727de4 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -43,6 +43,7 @@ /* s390-specific vcpu->requests bit members */ #define KVM_REQ_ENABLE_IBS 8 #define KVM_REQ_DISABLE_IBS 9 +#define KVM_REQ_ICPT_OPEREXC 10 #define SIGP_CTRL_C 0x80 #define SIGP_CTRL_SCN_MASK 0x3f @@ -666,6 +667,7 @@ struct kvm_arch{ int user_cpu_state_ctrl; int user_sigp; int user_stsi; + int user_instr0; struct s390_io_adapter *adapters[MAX_S390_IO_ADAPTERS]; wait_queue_head_t ipte_wq; int ipte_lock_count; diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 850be47c4cc9..7a2f1551bc39 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -359,6 +359,9 @@ static int handle_operexc(struct kvm_vcpu *vcpu) test_kvm_facility(vcpu->kvm, 74)) return handle_sthyi(vcpu); + if (vcpu->arch.sie_block->ipa == 0 && vcpu->kvm->arch.user_instr0) + return -EOPNOTSUPP; + return kvm_s390_inject_program_int(vcpu, PGM_OPERATION); } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index d42428c11794..63ac7c1641a7 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -364,6 +364,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_USER_STSI: case KVM_CAP_S390_SKEYS: case KVM_CAP_S390_IRQ_STATE: + case KVM_CAP_S390_USER_INSTR0: r = 1; break; case KVM_CAP_S390_MEM_OP: @@ -456,6 +457,16 @@ out: return r; } +static void icpt_operexc_on_all_vcpus(struct kvm *kvm) +{ + unsigned int i; + struct kvm_vcpu *vcpu; + + kvm_for_each_vcpu(i, vcpu, kvm) { + kvm_s390_sync_request(KVM_REQ_ICPT_OPEREXC, vcpu); + } +} + static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) { int r; @@ -507,6 +518,12 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) kvm->arch.user_stsi = 1; r = 0; break; + case KVM_CAP_S390_USER_INSTR0: + VM_EVENT(kvm, 3, "%s", "ENABLE: CAP_S390_USER_INSTR0"); + kvm->arch.user_instr0 = 1; + icpt_operexc_on_all_vcpus(kvm); + r = 0; + break; default: r = -EINVAL; break; @@ -1836,6 +1853,8 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) vcpu->arch.gmap = vcpu->kvm->arch.gmap; sca_add_vcpu(vcpu); } + if (test_kvm_facility(vcpu->kvm, 74) || vcpu->kvm->arch.user_instr0) + vcpu->arch.sie_block->ictl |= ICTL_OPEREXC; /* make vcpu_load load the right gmap on the first trigger */ vcpu->arch.enabled_gmap = vcpu->arch.gmap; } @@ -1923,8 +1942,6 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) } vcpu->arch.sie_block->riccbd = (unsigned long) &vcpu->run->s.regs.riccb; vcpu->arch.sie_block->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE; - if (test_kvm_facility(vcpu->kvm, 74)) - vcpu->arch.sie_block->ictl |= ICTL_OPEREXC; if (vcpu->kvm->arch.use_cmma) { rc = kvm_s390_vcpu_setup_cmma(vcpu); @@ -2369,6 +2386,11 @@ retry: goto retry; } + if (kvm_check_request(KVM_REQ_ICPT_OPEREXC, vcpu)) { + vcpu->arch.sie_block->ictl |= ICTL_OPEREXC; + goto retry; + } + /* nothing to do, just clear the request */ clear_bit(KVM_REQ_UNHALT, &vcpu->requests); diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 4f8030e5b05d..70941f4ab6d8 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -867,6 +867,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_VCPU_ATTRIBUTES 127 #define KVM_CAP_MAX_VCPU_ID 128 #define KVM_CAP_X2APIC_API 129 +#define KVM_CAP_S390_USER_INSTR0 130 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3-71-gd317 From 2b8ddd9337ee0d001b22507f95596648a1a90992 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 15 Jul 2016 12:43:24 +0100 Subject: KVM: Extend struct kvm_msi to hold a 32-bit device ID The ARM GICv3 ITS MSI controller requires a device ID to be able to assign the proper interrupt vector. On real hardware, this ID is sampled from the bus. To be able to emulate an ITS controller, extend the KVM MSI interface to let userspace provide such a device ID. For PCI devices, the device ID is simply the 16-bit bus-device-function triplet, which should be easily available to the userland tool. Also there is a new KVM capability which advertises whether the current VM requires a device ID to be set along with the MSI data. This flag is still reported as not available everywhere, later we will enable it when ITS emulation is used. Signed-off-by: Andre Przywara Reviewed-by: Eric Auger Reviewed-by: Marc Zyngier Acked-by: Christoffer Dall Acked-by: Paolo Bonzini Tested-by: Eric Auger Signed-off-by: Marc Zyngier --- Documentation/virtual/kvm/api.txt | 12 ++++++++++-- include/uapi/linux/kvm.h | 5 ++++- 2 files changed, 14 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 09efa9eb3926..65513119fee8 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2175,10 +2175,18 @@ struct kvm_msi { __u32 address_hi; __u32 data; __u32 flags; - __u8 pad[16]; + __u32 devid; + __u8 pad[12]; }; -No flags are defined so far. The corresponding field must be 0. +flags: KVM_MSI_VALID_DEVID: devid contains a valid value +devid: If KVM_MSI_VALID_DEVID is set, contains a unique device identifier + for the device that wrote the MSI message. + For PCI, this is usually a BFD identifier in the lower 16 bits. + +The per-VM KVM_CAP_MSI_DEVID capability advertises the need to provide +the device ID. If this capability is not set, userland cannot rely on +the kernel to allow the KVM_MSI_VALID_DEVID flag being set. 4.71 KVM_CREATE_PIT2 diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 05ebf475104c..7de96f5bb92c 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -866,6 +866,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_ARM_PMU_V3 126 #define KVM_CAP_VCPU_ATTRIBUTES 127 #define KVM_CAP_MAX_VCPU_ID 128 +#define KVM_CAP_MSI_DEVID 129 #ifdef KVM_CAP_IRQ_ROUTING @@ -1024,12 +1025,14 @@ struct kvm_one_reg { __u64 addr; }; +#define KVM_MSI_VALID_DEVID (1U << 0) struct kvm_msi { __u32 address_lo; __u32 address_hi; __u32 data; __u32 flags; - __u8 pad[16]; + __u32 devid; + __u8 pad[12]; }; struct kvm_arm_device_addr { -- cgit v1.2.3-71-gd317 From 1085fdc68c6097244627a02a56bd2d8fe58a1a9c Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 15 Jul 2016 12:43:31 +0100 Subject: KVM: arm64: vgic-its: Introduce new KVM ITS device Introduce a new KVM device that represents an ARM Interrupt Translation Service (ITS) controller. Since there can be multiple of this per guest, we can't piggy back on the existing GICv3 distributor device, but create a new type of KVM device. On the KVM_CREATE_DEVICE ioctl we allocate and initialize the ITS data structure and store the pointer in the kvm_device data. Upon an explicit init ioctl from userland (after having setup the MMIO address) we register the handlers with the kvm_io_bus framework. Any reference to an ITS thus has to go via this interface. Signed-off-by: Andre Przywara Reviewed-by: Marc Zyngier Tested-by: Eric Auger Signed-off-by: Marc Zyngier --- Documentation/virtual/kvm/devices/arm-vgic.txt | 25 +++-- arch/arm/kvm/arm.c | 1 + arch/arm64/include/uapi/asm/kvm.h | 2 + include/kvm/arm_vgic.h | 3 + include/uapi/linux/kvm.h | 2 + virt/kvm/arm/vgic/vgic-its.c | 135 +++++++++++++++++++++++++ virt/kvm/arm/vgic/vgic-kvm-device.c | 4 +- virt/kvm/arm/vgic/vgic-mmio-v3.c | 2 +- virt/kvm/arm/vgic/vgic.h | 3 + 9 files changed, 168 insertions(+), 9 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/virtual/kvm/devices/arm-vgic.txt b/Documentation/virtual/kvm/devices/arm-vgic.txt index 59541d49e15c..89182f80cc7f 100644 --- a/Documentation/virtual/kvm/devices/arm-vgic.txt +++ b/Documentation/virtual/kvm/devices/arm-vgic.txt @@ -4,16 +4,22 @@ ARM Virtual Generic Interrupt Controller (VGIC) Device types supported: KVM_DEV_TYPE_ARM_VGIC_V2 ARM Generic Interrupt Controller v2.0 KVM_DEV_TYPE_ARM_VGIC_V3 ARM Generic Interrupt Controller v3.0 + KVM_DEV_TYPE_ARM_VGIC_ITS ARM Interrupt Translation Service Controller -Only one VGIC instance may be instantiated through either this API or the -legacy KVM_CREATE_IRQCHIP api. The created VGIC will act as the VM interrupt -controller, requiring emulated user-space devices to inject interrupts to the -VGIC instead of directly to CPUs. +Only one VGIC instance of the V2/V3 types above may be instantiated through +either this API or the legacy KVM_CREATE_IRQCHIP api. The created VGIC will +act as the VM interrupt controller, requiring emulated user-space devices to +inject interrupts to the VGIC instead of directly to CPUs. Creating a guest GICv3 device requires a host GICv3 as well. GICv3 implementations with hardware compatibility support allow a guest GICv2 as well. +Creating a virtual ITS controller requires a host GICv3 (but does not depend +on having physical ITS controllers). +There can be multiple ITS controllers per guest, each of them has to have +a separate, non-overlapping MMIO region. + Groups: KVM_DEV_ARM_VGIC_GRP_ADDR Attributes: @@ -39,6 +45,13 @@ Groups: Only valid for KVM_DEV_TYPE_ARM_VGIC_V3. This address needs to be 64K aligned. + KVM_VGIC_V3_ADDR_TYPE_ITS (rw, 64-bit) + Base address in the guest physical address space of the GICv3 ITS + control register frame. The ITS allows MSI(-X) interrupts to be + injected into guests. This extension is optional. If the kernel + does not support the ITS, the call returns -ENODEV. + Only valid for KVM_DEV_TYPE_ARM_VGIC_ITS. + This address needs to be 64K aligned and the region covers 128K. KVM_DEV_ARM_VGIC_GRP_DIST_REGS Attributes: @@ -109,8 +122,8 @@ Groups: KVM_DEV_ARM_VGIC_GRP_CTRL Attributes: KVM_DEV_ARM_VGIC_CTRL_INIT - request the initialization of the VGIC, no additional parameter in - kvm_device_attr.addr. + request the initialization of the VGIC or ITS, no additional parameter + in kvm_device_attr.addr. Errors: -ENXIO: VGIC not properly configured as required prior to calling this attribute diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 972075cc111c..fb4661cf896e 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index f209ea151dca..3051f86a9b5f 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -87,9 +87,11 @@ struct kvm_regs { /* Supported VGICv3 address types */ #define KVM_VGIC_V3_ADDR_TYPE_DIST 2 #define KVM_VGIC_V3_ADDR_TYPE_REDIST 3 +#define KVM_VGIC_ITS_ADDR_TYPE 4 #define KVM_VGIC_V3_DIST_SIZE SZ_64K #define KVM_VGIC_V3_REDIST_SIZE (2 * SZ_64K) +#define KVM_VGIC_V3_ITS_SIZE (2 * SZ_64K) #define KVM_ARM_VCPU_POWER_OFF 0 /* CPU is started in OFF state */ #define KVM_ARM_VCPU_EL1_32BIT 1 /* CPU running a 32bit VM */ diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 685f33975ce4..8609faced83e 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -134,6 +134,7 @@ struct vgic_its { gpa_t vgic_its_base; bool enabled; + bool initialized; struct vgic_io_device iodev; }; @@ -167,6 +168,8 @@ struct vgic_dist { struct vgic_io_device dist_iodev; + bool has_its; + /* * Contains the attributes and gpa of the LPI configuration table. * Since we report GICR_TYPER.CommonLPIAff as 0b00, we can share diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 7de96f5bb92c..d8c4c324cfae 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1077,6 +1077,8 @@ enum kvm_device_type { #define KVM_DEV_TYPE_FLIC KVM_DEV_TYPE_FLIC KVM_DEV_TYPE_ARM_VGIC_V3, #define KVM_DEV_TYPE_ARM_VGIC_V3 KVM_DEV_TYPE_ARM_VGIC_V3 + KVM_DEV_TYPE_ARM_VGIC_ITS, +#define KVM_DEV_TYPE_ARM_VGIC_ITS KVM_DEV_TYPE_ARM_VGIC_ITS KVM_DEV_TYPE_MAX, }; diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c index 4654d6edf6a6..6b47b3674690 100644 --- a/virt/kvm/arm/vgic/vgic-its.c +++ b/virt/kvm/arm/vgic/vgic-its.c @@ -21,6 +21,7 @@ #include #include #include +#include #include @@ -84,6 +85,9 @@ static int vgic_its_init_its(struct kvm *kvm, struct vgic_its *its) struct vgic_io_device *iodev = &its->iodev; int ret; + if (its->initialized) + return 0; + if (IS_VGIC_ADDR_UNDEF(its->vgic_its_base)) return -ENXIO; @@ -99,5 +103,136 @@ static int vgic_its_init_its(struct kvm *kvm, struct vgic_its *its) KVM_VGIC_V3_ITS_SIZE, &iodev->dev); mutex_unlock(&kvm->slots_lock); + if (!ret) + its->initialized = true; + return ret; } + +static int vgic_its_create(struct kvm_device *dev, u32 type) +{ + struct vgic_its *its; + + if (type != KVM_DEV_TYPE_ARM_VGIC_ITS) + return -ENODEV; + + its = kzalloc(sizeof(struct vgic_its), GFP_KERNEL); + if (!its) + return -ENOMEM; + + its->vgic_its_base = VGIC_ADDR_UNDEF; + + dev->kvm->arch.vgic.has_its = true; + its->initialized = false; + its->enabled = false; + + dev->private = its; + + return 0; +} + +static void vgic_its_destroy(struct kvm_device *kvm_dev) +{ + struct vgic_its *its = kvm_dev->private; + + kfree(its); +} + +static int vgic_its_has_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_ADDR: + switch (attr->attr) { + case KVM_VGIC_ITS_ADDR_TYPE: + return 0; + } + break; + case KVM_DEV_ARM_VGIC_GRP_CTRL: + switch (attr->attr) { + case KVM_DEV_ARM_VGIC_CTRL_INIT: + return 0; + } + break; + } + return -ENXIO; +} + +static int vgic_its_set_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + struct vgic_its *its = dev->private; + int ret; + + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_ADDR: { + u64 __user *uaddr = (u64 __user *)(long)attr->addr; + unsigned long type = (unsigned long)attr->attr; + u64 addr; + + if (type != KVM_VGIC_ITS_ADDR_TYPE) + return -ENODEV; + + if (its->initialized) + return -EBUSY; + + if (copy_from_user(&addr, uaddr, sizeof(addr))) + return -EFAULT; + + ret = vgic_check_ioaddr(dev->kvm, &its->vgic_its_base, + addr, SZ_64K); + if (ret) + return ret; + + its->vgic_its_base = addr; + + return 0; + } + case KVM_DEV_ARM_VGIC_GRP_CTRL: + switch (attr->attr) { + case KVM_DEV_ARM_VGIC_CTRL_INIT: + return vgic_its_init_its(dev->kvm, its); + } + break; + } + return -ENXIO; +} + +static int vgic_its_get_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_ADDR: { + struct vgic_its *its = dev->private; + u64 addr = its->vgic_its_base; + u64 __user *uaddr = (u64 __user *)(long)attr->addr; + unsigned long type = (unsigned long)attr->attr; + + if (type != KVM_VGIC_ITS_ADDR_TYPE) + return -ENODEV; + + if (copy_to_user(uaddr, &addr, sizeof(addr))) + return -EFAULT; + break; + default: + return -ENXIO; + } + } + + return 0; +} + +static struct kvm_device_ops kvm_arm_vgic_its_ops = { + .name = "kvm-arm-vgic-its", + .create = vgic_its_create, + .destroy = vgic_its_destroy, + .set_attr = vgic_its_set_attr, + .get_attr = vgic_its_get_attr, + .has_attr = vgic_its_has_attr, +}; + +int kvm_vgic_register_its_device(void) +{ + return kvm_register_device_ops(&kvm_arm_vgic_its_ops, + KVM_DEV_TYPE_ARM_VGIC_ITS); +} diff --git a/virt/kvm/arm/vgic/vgic-kvm-device.c b/virt/kvm/arm/vgic/vgic-kvm-device.c index 2f24f13c6c90..561d2ba96a4f 100644 --- a/virt/kvm/arm/vgic/vgic-kvm-device.c +++ b/virt/kvm/arm/vgic/vgic-kvm-device.c @@ -21,8 +21,8 @@ /* common helpers */ -static int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr, - phys_addr_t addr, phys_addr_t alignment) +int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr, + phys_addr_t addr, phys_addr_t alignment) { if (addr & ~KVM_PHYS_MASK) return -E2BIG; diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c index b92b7d6cabe6..a5c35050c786 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v3.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c @@ -49,7 +49,7 @@ bool vgic_has_its(struct kvm *kvm) if (dist->vgic_model != KVM_DEV_TYPE_ARM_VGIC_V3) return false; - return false; + return dist->has_its; } static unsigned long vgic_mmio_read_v3_misc(struct kvm_vcpu *vcpu, diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h index 31807c166d2a..8192a293f119 100644 --- a/virt/kvm/arm/vgic/vgic.h +++ b/virt/kvm/arm/vgic/vgic.h @@ -42,6 +42,9 @@ void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq); bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq); void vgic_kick_vcpus(struct kvm *kvm); +int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr, + phys_addr_t addr, phys_addr_t alignment); + void vgic_v2_process_maintenance(struct kvm_vcpu *vcpu); void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu); void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr); -- cgit v1.2.3-71-gd317 From 76a10b86785c5e3fc49bcee355502d035b07e47a Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Fri, 22 Jul 2016 16:20:37 +0000 Subject: KVM: api: Pass the devid in the msi routing entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On ARM, the MSI msg (address and data) comes along with out-of-band device ID information. The device ID encodes the device that writes the MSI msg. Let's convey the device id in kvm_irq_routing_msi and use KVM_MSI_VALID_DEVID flag value in kvm_irq_routing_entry to indicate the msi devid is populated. Signed-off-by: Eric Auger Reviewed-by: Andre Przywara Acked-by: Radim Krčmář Signed-off-by: Marc Zyngier --- Documentation/virtual/kvm/api.txt | 19 +++++++++++++++++-- include/uapi/linux/kvm.h | 5 ++++- 2 files changed, 21 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 07049eadb124..415cde1647e9 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1468,7 +1468,11 @@ struct kvm_irq_routing_entry { #define KVM_IRQ_ROUTING_S390_ADAPTER 3 #define KVM_IRQ_ROUTING_HV_SINT 4 -No flags are specified so far, the corresponding field must be set to zero. +flags: +- KVM_MSI_VALID_DEVID: used along with KVM_IRQ_ROUTING_MSI + routing entry type, specifies that the devid field contains + a valid value. +- zero otherwise struct kvm_irq_routing_irqchip { __u32 irqchip; @@ -1479,9 +1483,20 @@ struct kvm_irq_routing_msi { __u32 address_lo; __u32 address_hi; __u32 data; - __u32 pad; + union { + __u32 pad; + __u32 devid; + }; }; +devid: If KVM_MSI_VALID_DEVID is set, contains a unique device identifier + for the device that wrote the MSI message. + For PCI, this is usually a BFD identifier in the lower 16 bits. + +The per-VM KVM_CAP_MSI_DEVID capability advertises the requirement to +provide the device ID. If this capability is not set, userland cannot +rely on the kernel to allow the KVM_MSI_VALID_DEVID flag being set. + struct kvm_irq_routing_s390_adapter { __u64 ind_addr; __u64 summary_addr; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index d8c4c324cfae..eb2220895c6e 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -879,7 +879,10 @@ struct kvm_irq_routing_msi { __u32 address_lo; __u32 address_hi; __u32 data; - __u32 pad; + union { + __u32 pad; + __u32 devid; + }; }; struct kvm_irq_routing_s390_adapter { -- cgit v1.2.3-71-gd317 From 7af7c616fa2f1ce6c0d806b89898d2df098b4bd8 Mon Sep 17 00:00:00 2001 From: Hans van Kranenburg Date: Sun, 3 Jul 2016 23:23:06 +0200 Subject: Btrfs: use the correct struct for BTRFS_IOC_LOGICAL_INO BTRFS_IOC_LOGICAL_INO takes a btrfs_ioctl_logical_ino_args as argument, not a btrfs_ioctl_ino_path_args. The lines were probably copy/pasted when the code was written. Since btrfs_ioctl_logical_ino_args and btrfs_ioctl_ino_path_args have the same size, the actual IOCTL definition here does not change. But, it makes the code less confusing for the reader. Signed-off-by: Hans van Kranenburg Signed-off-by: David Sterba --- include/uapi/linux/btrfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 2bdd1e3e7007..ac5eacd3055b 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -798,7 +798,7 @@ static inline char *btrfs_err_str(enum btrfs_err_code err_code) #define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \ struct btrfs_ioctl_ino_path_args) #define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ - struct btrfs_ioctl_ino_path_args) + struct btrfs_ioctl_logical_ino_args) #define BTRFS_IOC_SET_RECEIVED_SUBVOL _IOWR(BTRFS_IOCTL_MAGIC, 37, \ struct btrfs_ioctl_received_subvol_args) #define BTRFS_IOC_SEND _IOW(BTRFS_IOCTL_MAGIC, 38, struct btrfs_ioctl_send_args) -- cgit v1.2.3-71-gd317 From 0dc696bcf2e86f48a23fb95ca2f40c8708241e7e Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 28 Jul 2016 10:57:30 +0800 Subject: elf: Add powerpc specific core note sections This patch adds twelve ELF core note sections for powerpc architecture for various registers and register sets which need to be accessed from ptrace interface and then gdb. These additions include special purpose registers like TAR, PPR, DSCR, TM running and checkpointed state for various register sets, EBB related register set, performance monitor register set etc. Addition of these new ELF core note sections extends the existing ELF ABI on powerpc arch without affecting it in any manner. Signed-off-by: Anshuman Khandual Signed-off-by: Simon Guo Signed-off-by: Michael Ellerman --- include/uapi/linux/elf.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index cb4a72f888d5..1be3c5f6183b 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -381,6 +381,19 @@ typedef struct elf64_shdr { #define NT_PPC_VMX 0x100 /* PowerPC Altivec/VMX registers */ #define NT_PPC_SPE 0x101 /* PowerPC SPE/EVR registers */ #define NT_PPC_VSX 0x102 /* PowerPC VSX registers */ +#define NT_PPC_TAR 0x103 /* Target Address Register */ +#define NT_PPC_PPR 0x104 /* Program Priority Register */ +#define NT_PPC_DSCR 0x105 /* Data Stream Control Register */ +#define NT_PPC_EBB 0x106 /* Event Based Branch Registers */ +#define NT_PPC_PMU 0x107 /* Performance Monitor Registers */ +#define NT_PPC_TM_CGPR 0x108 /* TM checkpointed GPR Registers */ +#define NT_PPC_TM_CFPR 0x109 /* TM checkpointed FPR Registers */ +#define NT_PPC_TM_CVMX 0x10a /* TM checkpointed VMX Registers */ +#define NT_PPC_TM_CVSX 0x10b /* TM checkpointed VSX Registers */ +#define NT_PPC_TM_SPR 0x10c /* TM Special Purpose Registers */ +#define NT_PPC_TM_CTAR 0x10d /* TM checkpointed Target Address Register */ +#define NT_PPC_TM_CPPR 0x10e /* TM checkpointed Program Priority Register */ +#define NT_PPC_TM_CDSCR 0x10f /* TM checkpointed Data Stream Control Register */ #define NT_386_TLS 0x200 /* i386 TLS slots (struct user_desc) */ #define NT_386_IOPERM 0x201 /* x86 io permission bitmap (1=deny) */ #define NT_X86_XSTATE 0x202 /* x86 extended state using xsave */ -- cgit v1.2.3-71-gd317 From 23528bb21ee2c9b27f3feddd77a2a3351a8df148 Mon Sep 17 00:00:00 2001 From: Sam Bobroff Date: Wed, 20 Jul 2016 13:41:36 +1000 Subject: KVM: PPC: Introduce KVM_CAP_PPC_HTM Introduce a new KVM capability, KVM_CAP_PPC_HTM, that can be queried to determine if a PowerPC KVM guest should use HTM (Hardware Transactional Memory). This will be used by QEMU to populate the pa-features bits in the guest's device tree. Signed-off-by: Sam Bobroff Reviewed-by: David Gibson Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/powerpc.c | 4 ++++ include/uapi/linux/kvm.h | 1 + 2 files changed, 5 insertions(+) (limited to 'include/uapi') diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 1ac036e45ed4..6ce40dd6fe51 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -588,6 +588,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = 1; break; #endif + case KVM_CAP_PPC_HTM: + r = cpu_has_feature(CPU_FTR_TM_COMP) && + is_kvmppc_hv_enabled(kvm); + break; default: r = 0; break; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 8f2756c263d4..e98bb4cce639 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -869,6 +869,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_X2APIC_API 129 #define KVM_CAP_S390_USER_INSTR0 130 #define KVM_CAP_MSI_DEVID 131 +#define KVM_CAP_PPC_HTM 132 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3-71-gd317 From 1a937693993ff10d7e80cca6ddd55f3000aa6376 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 18 Apr 2016 12:58:14 +0300 Subject: virtio: new feature to detect IOMMU device quirk The interaction between virtio and IOMMUs is messy. On most systems with virtio, physical addresses match bus addresses, and it doesn't particularly matter which one we use to program the device. On some systems, including Xen and any system with a physical device that speaks virtio behind a physical IOMMU, we must program the IOMMU for virtio DMA to work at all. On other systems, including SPARC and PPC64, virtio-pci devices are enumerated as though they are behind an IOMMU, but the virtio host ignores the IOMMU, so we must either pretend that the IOMMU isn't there or somehow map everything as the identity. Add a feature bit to detect that quirk: VIRTIO_F_IOMMU_PLATFORM. Any device with this feature bit set to 0 needs a quirk and has to be passed physical addresses (as opposed to bus addresses) even though the device is behind an IOMMU. Note: it has to be a per-device quirk because for example, there could be a mix of passed-through and virtual virtio devices. As another example, some devices could be implemented by an out of process hypervisor backend (in case of qemu vhost, or vhost-user) and so support for an IOMMU needs to be coded up separately. It would be cleanest to handle this in IOMMU core code, but that needs per-device DMA ops. While we are waiting for that to be implemented, use a work-around in virtio core. Note: a "noiommu" feature is a quirk - add a wrapper to make that clear. Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_ring.c | 15 ++++++++++++++- include/linux/virtio_config.h | 13 +++++++++++++ include/uapi/linux/virtio_config.h | 10 +++++++++- 3 files changed, 36 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index ca6bfddaacad..114a0c88afb8 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -117,7 +117,10 @@ struct vring_virtqueue { #define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq) /* - * The interaction between virtio and a possible IOMMU is a mess. + * Modern virtio devices have feature bits to specify whether they need a + * quirk and bypass the IOMMU. If not there, just use the DMA API. + * + * If there, the interaction between virtio and DMA API is messy. * * On most systems with virtio, physical addresses match bus addresses, * and it doesn't particularly matter whether we use the DMA API. @@ -133,10 +136,18 @@ struct vring_virtqueue { * * For the time being, we preserve historic behavior and bypass the DMA * API. + * + * TODO: install a per-device DMA ops structure that does the right thing + * taking into account all the above quirks, and use the DMA API + * unconditionally on data path. */ static bool vring_use_dma_api(struct virtio_device *vdev) { + if (!virtio_has_iommu_quirk(vdev)) + return true; + + /* Otherwise, we are left to guess. */ /* * In theory, it's possible to have a buggy QEMU-supposed * emulated Q35 IOMMU and Xen enabled at the same time. On @@ -1099,6 +1110,8 @@ void vring_transport_features(struct virtio_device *vdev) break; case VIRTIO_F_VERSION_1: break; + case VIRTIO_F_IOMMU_PLATFORM: + break; default: /* We don't understand this bit. */ __virtio_clear_bit(vdev, i); diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 6e6cb0c9d7cb..26c155bb639b 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -149,6 +149,19 @@ static inline bool virtio_has_feature(const struct virtio_device *vdev, return __virtio_test_bit(vdev, fbit); } +/** + * virtio_has_iommu_quirk - determine whether this device has the iommu quirk + * @vdev: the device + */ +static inline bool virtio_has_iommu_quirk(const struct virtio_device *vdev) +{ + /* + * Note the reverse polarity of the quirk feature (compared to most + * other features), this is for compatibility with legacy systems. + */ + return !virtio_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM); +} + static inline struct virtqueue *virtio_find_single_vq(struct virtio_device *vdev, vq_callback_t *c, const char *n) diff --git a/include/uapi/linux/virtio_config.h b/include/uapi/linux/virtio_config.h index 4cb65bbfa654..308e2096291f 100644 --- a/include/uapi/linux/virtio_config.h +++ b/include/uapi/linux/virtio_config.h @@ -49,7 +49,7 @@ * transport being used (eg. virtio_ring), the rest are per-device feature * bits. */ #define VIRTIO_TRANSPORT_F_START 28 -#define VIRTIO_TRANSPORT_F_END 33 +#define VIRTIO_TRANSPORT_F_END 34 #ifndef VIRTIO_CONFIG_NO_LEGACY /* Do we get callbacks when the ring is completely used, even if we've @@ -63,4 +63,12 @@ /* v1.0 compliant. */ #define VIRTIO_F_VERSION_1 32 +/* + * If clear - device has the IOMMU bypass quirk feature. + * If set - use platform tools to detect the IOMMU. + * + * Note the reverse polarity (compared to most other features), + * this is for compatibility with legacy systems. + */ +#define VIRTIO_F_IOMMU_PLATFORM 33 #endif /* _UAPI_LINUX_VIRTIO_CONFIG_H */ -- cgit v1.2.3-71-gd317 From 06a8fc78367d070720af960dcecec917d3ae5f3b Mon Sep 17 00:00:00 2001 From: Asias He Date: Thu, 28 Jul 2016 15:36:32 +0100 Subject: VSOCK: Introduce virtio_vsock_common.ko This module contains the common code and header files for the following virtio_transporto and vhost_vsock kernel modules. Signed-off-by: Asias He Signed-off-by: Claudio Imbrenda Signed-off-by: Stefan Hajnoczi Signed-off-by: Michael S. Tsirkin --- MAINTAINERS | 10 + include/linux/virtio_vsock.h | 154 ++++ include/net/af_vsock.h | 2 + .../trace/events/vsock_virtio_transport_common.h | 144 +++ include/uapi/linux/Kbuild | 1 + include/uapi/linux/virtio_ids.h | 1 + include/uapi/linux/virtio_vsock.h | 94 ++ net/vmw_vsock/virtio_transport_common.c | 992 +++++++++++++++++++++ 8 files changed, 1398 insertions(+) create mode 100644 include/linux/virtio_vsock.h create mode 100644 include/trace/events/vsock_virtio_transport_common.h create mode 100644 include/uapi/linux/virtio_vsock.h create mode 100644 net/vmw_vsock/virtio_transport_common.c (limited to 'include/uapi') diff --git a/MAINTAINERS b/MAINTAINERS index 8c20323d1277..b49ffb86ebf0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12138,6 +12138,16 @@ S: Maintained F: drivers/media/v4l2-core/videobuf2-* F: include/media/videobuf2-* +VIRTIO AND VHOST VSOCK DRIVER +M: Stefan Hajnoczi +L: kvm@vger.kernel.org +L: virtualization@lists.linux-foundation.org +L: netdev@vger.kernel.org +S: Maintained +F: include/linux/virtio_vsock.h +F: include/uapi/linux/virtio_vsock.h +F: net/vmw_vsock/virtio_transport_common.c + VIRTUAL SERIO DEVICE DRIVER M: Stephen Chandler Paul S: Maintained diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h new file mode 100644 index 000000000000..9638bfeb0d1f --- /dev/null +++ b/include/linux/virtio_vsock.h @@ -0,0 +1,154 @@ +#ifndef _LINUX_VIRTIO_VSOCK_H +#define _LINUX_VIRTIO_VSOCK_H + +#include +#include +#include +#include + +#define VIRTIO_VSOCK_DEFAULT_MIN_BUF_SIZE 128 +#define VIRTIO_VSOCK_DEFAULT_BUF_SIZE (1024 * 256) +#define VIRTIO_VSOCK_DEFAULT_MAX_BUF_SIZE (1024 * 256) +#define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE (1024 * 4) +#define VIRTIO_VSOCK_MAX_BUF_SIZE 0xFFFFFFFFUL +#define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE (1024 * 64) + +enum { + VSOCK_VQ_RX = 0, /* for host to guest data */ + VSOCK_VQ_TX = 1, /* for guest to host data */ + VSOCK_VQ_EVENT = 2, + VSOCK_VQ_MAX = 3, +}; + +/* Per-socket state (accessed via vsk->trans) */ +struct virtio_vsock_sock { + struct vsock_sock *vsk; + + /* Protected by lock_sock(sk_vsock(trans->vsk)) */ + u32 buf_size; + u32 buf_size_min; + u32 buf_size_max; + + spinlock_t tx_lock; + spinlock_t rx_lock; + + /* Protected by tx_lock */ + u32 tx_cnt; + u32 buf_alloc; + u32 peer_fwd_cnt; + u32 peer_buf_alloc; + + /* Protected by rx_lock */ + u32 fwd_cnt; + u32 rx_bytes; + struct list_head rx_queue; +}; + +struct virtio_vsock_pkt { + struct virtio_vsock_hdr hdr; + struct work_struct work; + struct list_head list; + void *buf; + u32 len; + u32 off; + bool reply; +}; + +struct virtio_vsock_pkt_info { + u32 remote_cid, remote_port; + struct msghdr *msg; + u32 pkt_len; + u16 type; + u16 op; + u32 flags; + bool reply; +}; + +struct virtio_transport { + /* This must be the first field */ + struct vsock_transport transport; + + /* Takes ownership of the packet */ + int (*send_pkt)(struct virtio_vsock_pkt *pkt); +}; + +ssize_t +virtio_transport_stream_dequeue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len, + int type); +int +virtio_transport_dgram_dequeue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len, int flags); + +s64 virtio_transport_stream_has_data(struct vsock_sock *vsk); +s64 virtio_transport_stream_has_space(struct vsock_sock *vsk); + +int virtio_transport_do_socket_init(struct vsock_sock *vsk, + struct vsock_sock *psk); +u64 virtio_transport_get_buffer_size(struct vsock_sock *vsk); +u64 virtio_transport_get_min_buffer_size(struct vsock_sock *vsk); +u64 virtio_transport_get_max_buffer_size(struct vsock_sock *vsk); +void virtio_transport_set_buffer_size(struct vsock_sock *vsk, u64 val); +void virtio_transport_set_min_buffer_size(struct vsock_sock *vsk, u64 val); +void virtio_transport_set_max_buffer_size(struct vsock_sock *vs, u64 val); +int +virtio_transport_notify_poll_in(struct vsock_sock *vsk, + size_t target, + bool *data_ready_now); +int +virtio_transport_notify_poll_out(struct vsock_sock *vsk, + size_t target, + bool *space_available_now); + +int virtio_transport_notify_recv_init(struct vsock_sock *vsk, + size_t target, struct vsock_transport_recv_notify_data *data); +int virtio_transport_notify_recv_pre_block(struct vsock_sock *vsk, + size_t target, struct vsock_transport_recv_notify_data *data); +int virtio_transport_notify_recv_pre_dequeue(struct vsock_sock *vsk, + size_t target, struct vsock_transport_recv_notify_data *data); +int virtio_transport_notify_recv_post_dequeue(struct vsock_sock *vsk, + size_t target, ssize_t copied, bool data_read, + struct vsock_transport_recv_notify_data *data); +int virtio_transport_notify_send_init(struct vsock_sock *vsk, + struct vsock_transport_send_notify_data *data); +int virtio_transport_notify_send_pre_block(struct vsock_sock *vsk, + struct vsock_transport_send_notify_data *data); +int virtio_transport_notify_send_pre_enqueue(struct vsock_sock *vsk, + struct vsock_transport_send_notify_data *data); +int virtio_transport_notify_send_post_enqueue(struct vsock_sock *vsk, + ssize_t written, struct vsock_transport_send_notify_data *data); + +u64 virtio_transport_stream_rcvhiwat(struct vsock_sock *vsk); +bool virtio_transport_stream_is_active(struct vsock_sock *vsk); +bool virtio_transport_stream_allow(u32 cid, u32 port); +int virtio_transport_dgram_bind(struct vsock_sock *vsk, + struct sockaddr_vm *addr); +bool virtio_transport_dgram_allow(u32 cid, u32 port); + +int virtio_transport_connect(struct vsock_sock *vsk); + +int virtio_transport_shutdown(struct vsock_sock *vsk, int mode); + +void virtio_transport_release(struct vsock_sock *vsk); + +ssize_t +virtio_transport_stream_enqueue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len); +int +virtio_transport_dgram_enqueue(struct vsock_sock *vsk, + struct sockaddr_vm *remote_addr, + struct msghdr *msg, + size_t len); + +void virtio_transport_destruct(struct vsock_sock *vsk); + +void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt); +void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt); +void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct virtio_vsock_pkt *pkt); +u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 wanted); +void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit); + +#endif /* _LINUX_VIRTIO_VSOCK_H */ diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index 3af0b224f754..f2758964ce6f 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -63,6 +63,8 @@ struct vsock_sock { struct list_head accept_queue; bool rejected; struct delayed_work dwork; + struct delayed_work close_work; + bool close_work_scheduled; u32 peer_shutdown; bool sent_request; bool ignore_connecting_rst; diff --git a/include/trace/events/vsock_virtio_transport_common.h b/include/trace/events/vsock_virtio_transport_common.h new file mode 100644 index 000000000000..b7f1d6278280 --- /dev/null +++ b/include/trace/events/vsock_virtio_transport_common.h @@ -0,0 +1,144 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM vsock + +#if !defined(_TRACE_VSOCK_VIRTIO_TRANSPORT_COMMON_H) || \ + defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_VSOCK_VIRTIO_TRANSPORT_COMMON_H + +#include + +TRACE_DEFINE_ENUM(VIRTIO_VSOCK_TYPE_STREAM); + +#define show_type(val) \ + __print_symbolic(val, { VIRTIO_VSOCK_TYPE_STREAM, "STREAM" }) + +TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_INVALID); +TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_REQUEST); +TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_RESPONSE); +TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_RST); +TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_SHUTDOWN); +TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_RW); +TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_CREDIT_UPDATE); +TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_CREDIT_REQUEST); + +#define show_op(val) \ + __print_symbolic(val, \ + { VIRTIO_VSOCK_OP_INVALID, "INVALID" }, \ + { VIRTIO_VSOCK_OP_REQUEST, "REQUEST" }, \ + { VIRTIO_VSOCK_OP_RESPONSE, "RESPONSE" }, \ + { VIRTIO_VSOCK_OP_RST, "RST" }, \ + { VIRTIO_VSOCK_OP_SHUTDOWN, "SHUTDOWN" }, \ + { VIRTIO_VSOCK_OP_RW, "RW" }, \ + { VIRTIO_VSOCK_OP_CREDIT_UPDATE, "CREDIT_UPDATE" }, \ + { VIRTIO_VSOCK_OP_CREDIT_REQUEST, "CREDIT_REQUEST" }) + +TRACE_EVENT(virtio_transport_alloc_pkt, + TP_PROTO( + __u32 src_cid, __u32 src_port, + __u32 dst_cid, __u32 dst_port, + __u32 len, + __u16 type, + __u16 op, + __u32 flags + ), + TP_ARGS( + src_cid, src_port, + dst_cid, dst_port, + len, + type, + op, + flags + ), + TP_STRUCT__entry( + __field(__u32, src_cid) + __field(__u32, src_port) + __field(__u32, dst_cid) + __field(__u32, dst_port) + __field(__u32, len) + __field(__u16, type) + __field(__u16, op) + __field(__u32, flags) + ), + TP_fast_assign( + __entry->src_cid = src_cid; + __entry->src_port = src_port; + __entry->dst_cid = dst_cid; + __entry->dst_port = dst_port; + __entry->len = len; + __entry->type = type; + __entry->op = op; + __entry->flags = flags; + ), + TP_printk("%u:%u -> %u:%u len=%u type=%s op=%s flags=%#x", + __entry->src_cid, __entry->src_port, + __entry->dst_cid, __entry->dst_port, + __entry->len, + show_type(__entry->type), + show_op(__entry->op), + __entry->flags) +); + +TRACE_EVENT(virtio_transport_recv_pkt, + TP_PROTO( + __u32 src_cid, __u32 src_port, + __u32 dst_cid, __u32 dst_port, + __u32 len, + __u16 type, + __u16 op, + __u32 flags, + __u32 buf_alloc, + __u32 fwd_cnt + ), + TP_ARGS( + src_cid, src_port, + dst_cid, dst_port, + len, + type, + op, + flags, + buf_alloc, + fwd_cnt + ), + TP_STRUCT__entry( + __field(__u32, src_cid) + __field(__u32, src_port) + __field(__u32, dst_cid) + __field(__u32, dst_port) + __field(__u32, len) + __field(__u16, type) + __field(__u16, op) + __field(__u32, flags) + __field(__u32, buf_alloc) + __field(__u32, fwd_cnt) + ), + TP_fast_assign( + __entry->src_cid = src_cid; + __entry->src_port = src_port; + __entry->dst_cid = dst_cid; + __entry->dst_port = dst_port; + __entry->len = len; + __entry->type = type; + __entry->op = op; + __entry->flags = flags; + __entry->buf_alloc = buf_alloc; + __entry->fwd_cnt = fwd_cnt; + ), + TP_printk("%u:%u -> %u:%u len=%u type=%s op=%s flags=%#x " + "buf_alloc=%u fwd_cnt=%u", + __entry->src_cid, __entry->src_port, + __entry->dst_cid, __entry->dst_port, + __entry->len, + show_type(__entry->type), + show_op(__entry->op), + __entry->flags, + __entry->buf_alloc, + __entry->fwd_cnt) +); + +#endif /* _TRACE_VSOCK_VIRTIO_TRANSPORT_COMMON_H */ + +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE vsock_virtio_transport_common + +/* This part must be outside protection */ +#include diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index ec10cfef166a..3cf0116d9c2b 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -453,6 +453,7 @@ header-y += virtio_ring.h header-y += virtio_rng.h header-y += virtio_scsi.h header-y += virtio_types.h +header-y += virtio_vsock.h header-y += vm_sockets.h header-y += vt.h header-y += wait.h diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h index 77925f587b15..3228d582234a 100644 --- a/include/uapi/linux/virtio_ids.h +++ b/include/uapi/linux/virtio_ids.h @@ -41,5 +41,6 @@ #define VIRTIO_ID_CAIF 12 /* Virtio caif */ #define VIRTIO_ID_GPU 16 /* virtio GPU */ #define VIRTIO_ID_INPUT 18 /* virtio input */ +#define VIRTIO_ID_VSOCK 19 /* virtio vsock transport */ #endif /* _LINUX_VIRTIO_IDS_H */ diff --git a/include/uapi/linux/virtio_vsock.h b/include/uapi/linux/virtio_vsock.h new file mode 100644 index 000000000000..6b011c19b50f --- /dev/null +++ b/include/uapi/linux/virtio_vsock.h @@ -0,0 +1,94 @@ +/* + * This header, excluding the #ifdef __KERNEL__ part, is BSD licensed so + * anyone can use the definitions to implement compatible drivers/servers: + * + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of IBM nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Copyright (C) Red Hat, Inc., 2013-2015 + * Copyright (C) Asias He , 2013 + * Copyright (C) Stefan Hajnoczi , 2015 + */ + +#ifndef _UAPI_LINUX_VIRTIO_VSOCK_H +#define _UAPI_LINUX_VIRTIO_VOSCK_H + +#include +#include +#include + +struct virtio_vsock_config { + __le64 guest_cid; +} __attribute__((packed)); + +enum virtio_vsock_event_id { + VIRTIO_VSOCK_EVENT_TRANSPORT_RESET = 0, +}; + +struct virtio_vsock_event { + __le32 id; +} __attribute__((packed)); + +struct virtio_vsock_hdr { + __le64 src_cid; + __le64 dst_cid; + __le32 src_port; + __le32 dst_port; + __le32 len; + __le16 type; /* enum virtio_vsock_type */ + __le16 op; /* enum virtio_vsock_op */ + __le32 flags; + __le32 buf_alloc; + __le32 fwd_cnt; +} __attribute__((packed)); + +enum virtio_vsock_type { + VIRTIO_VSOCK_TYPE_STREAM = 1, +}; + +enum virtio_vsock_op { + VIRTIO_VSOCK_OP_INVALID = 0, + + /* Connect operations */ + VIRTIO_VSOCK_OP_REQUEST = 1, + VIRTIO_VSOCK_OP_RESPONSE = 2, + VIRTIO_VSOCK_OP_RST = 3, + VIRTIO_VSOCK_OP_SHUTDOWN = 4, + + /* To send payload */ + VIRTIO_VSOCK_OP_RW = 5, + + /* Tell the peer our credit info */ + VIRTIO_VSOCK_OP_CREDIT_UPDATE = 6, + /* Request the peer to send the credit info to us */ + VIRTIO_VSOCK_OP_CREDIT_REQUEST = 7, +}; + +/* VIRTIO_VSOCK_OP_SHUTDOWN flags values */ +enum virtio_vsock_shutdown { + VIRTIO_VSOCK_SHUTDOWN_RCV = 1, + VIRTIO_VSOCK_SHUTDOWN_SEND = 2, +}; + +#endif /* _UAPI_LINUX_VIRTIO_VSOCK_H */ diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c new file mode 100644 index 000000000000..a53b3a16b4f1 --- /dev/null +++ b/net/vmw_vsock/virtio_transport_common.c @@ -0,0 +1,992 @@ +/* + * common code for virtio vsock + * + * Copyright (C) 2013-2015 Red Hat, Inc. + * Author: Asias He + * Stefan Hajnoczi + * + * This work is licensed under the terms of the GNU GPL, version 2. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define CREATE_TRACE_POINTS +#include + +/* How long to wait for graceful shutdown of a connection */ +#define VSOCK_CLOSE_TIMEOUT (8 * HZ) + +static const struct virtio_transport *virtio_transport_get_ops(void) +{ + const struct vsock_transport *t = vsock_core_get_transport(); + + return container_of(t, struct virtio_transport, transport); +} + +struct virtio_vsock_pkt * +virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info, + size_t len, + u32 src_cid, + u32 src_port, + u32 dst_cid, + u32 dst_port) +{ + struct virtio_vsock_pkt *pkt; + int err; + + pkt = kzalloc(sizeof(*pkt), GFP_KERNEL); + if (!pkt) + return NULL; + + pkt->hdr.type = cpu_to_le16(info->type); + pkt->hdr.op = cpu_to_le16(info->op); + pkt->hdr.src_cid = cpu_to_le64(src_cid); + pkt->hdr.dst_cid = cpu_to_le64(dst_cid); + pkt->hdr.src_port = cpu_to_le32(src_port); + pkt->hdr.dst_port = cpu_to_le32(dst_port); + pkt->hdr.flags = cpu_to_le32(info->flags); + pkt->len = len; + pkt->hdr.len = cpu_to_le32(len); + pkt->reply = info->reply; + + if (info->msg && len > 0) { + pkt->buf = kmalloc(len, GFP_KERNEL); + if (!pkt->buf) + goto out_pkt; + err = memcpy_from_msg(pkt->buf, info->msg, len); + if (err) + goto out; + } + + trace_virtio_transport_alloc_pkt(src_cid, src_port, + dst_cid, dst_port, + len, + info->type, + info->op, + info->flags); + + return pkt; + +out: + kfree(pkt->buf); +out_pkt: + kfree(pkt); + return NULL; +} +EXPORT_SYMBOL_GPL(virtio_transport_alloc_pkt); + +static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, + struct virtio_vsock_pkt_info *info) +{ + u32 src_cid, src_port, dst_cid, dst_port; + struct virtio_vsock_sock *vvs; + struct virtio_vsock_pkt *pkt; + u32 pkt_len = info->pkt_len; + + src_cid = vm_sockets_get_local_cid(); + src_port = vsk->local_addr.svm_port; + if (!info->remote_cid) { + dst_cid = vsk->remote_addr.svm_cid; + dst_port = vsk->remote_addr.svm_port; + } else { + dst_cid = info->remote_cid; + dst_port = info->remote_port; + } + + vvs = vsk->trans; + + /* we can send less than pkt_len bytes */ + if (pkt_len > VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE) + pkt_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE; + + /* virtio_transport_get_credit might return less than pkt_len credit */ + pkt_len = virtio_transport_get_credit(vvs, pkt_len); + + /* Do not send zero length OP_RW pkt */ + if (pkt_len == 0 && info->op == VIRTIO_VSOCK_OP_RW) + return pkt_len; + + pkt = virtio_transport_alloc_pkt(info, pkt_len, + src_cid, src_port, + dst_cid, dst_port); + if (!pkt) { + virtio_transport_put_credit(vvs, pkt_len); + return -ENOMEM; + } + + virtio_transport_inc_tx_pkt(vvs, pkt); + + return virtio_transport_get_ops()->send_pkt(pkt); +} + +static void virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs, + struct virtio_vsock_pkt *pkt) +{ + vvs->rx_bytes += pkt->len; +} + +static void virtio_transport_dec_rx_pkt(struct virtio_vsock_sock *vvs, + struct virtio_vsock_pkt *pkt) +{ + vvs->rx_bytes -= pkt->len; + vvs->fwd_cnt += pkt->len; +} + +void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct virtio_vsock_pkt *pkt) +{ + spin_lock_bh(&vvs->tx_lock); + pkt->hdr.fwd_cnt = cpu_to_le32(vvs->fwd_cnt); + pkt->hdr.buf_alloc = cpu_to_le32(vvs->buf_alloc); + spin_unlock_bh(&vvs->tx_lock); +} +EXPORT_SYMBOL_GPL(virtio_transport_inc_tx_pkt); + +u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 credit) +{ + u32 ret; + + spin_lock_bh(&vvs->tx_lock); + ret = vvs->peer_buf_alloc - (vvs->tx_cnt - vvs->peer_fwd_cnt); + if (ret > credit) + ret = credit; + vvs->tx_cnt += ret; + spin_unlock_bh(&vvs->tx_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(virtio_transport_get_credit); + +void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit) +{ + spin_lock_bh(&vvs->tx_lock); + vvs->tx_cnt -= credit; + spin_unlock_bh(&vvs->tx_lock); +} +EXPORT_SYMBOL_GPL(virtio_transport_put_credit); + +static int virtio_transport_send_credit_update(struct vsock_sock *vsk, + int type, + struct virtio_vsock_hdr *hdr) +{ + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_CREDIT_UPDATE, + .type = type, + }; + + return virtio_transport_send_pkt_info(vsk, &info); +} + +static ssize_t +virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + struct virtio_vsock_pkt *pkt; + size_t bytes, total = 0; + int err = -EFAULT; + + spin_lock_bh(&vvs->rx_lock); + while (total < len && !list_empty(&vvs->rx_queue)) { + pkt = list_first_entry(&vvs->rx_queue, + struct virtio_vsock_pkt, list); + + bytes = len - total; + if (bytes > pkt->len - pkt->off) + bytes = pkt->len - pkt->off; + + /* sk_lock is held by caller so no one else can dequeue. + * Unlock rx_lock since memcpy_to_msg() may sleep. + */ + spin_unlock_bh(&vvs->rx_lock); + + err = memcpy_to_msg(msg, pkt->buf + pkt->off, bytes); + if (err) + goto out; + + spin_lock_bh(&vvs->rx_lock); + + total += bytes; + pkt->off += bytes; + if (pkt->off == pkt->len) { + virtio_transport_dec_rx_pkt(vvs, pkt); + list_del(&pkt->list); + virtio_transport_free_pkt(pkt); + } + } + spin_unlock_bh(&vvs->rx_lock); + + /* Send a credit pkt to peer */ + virtio_transport_send_credit_update(vsk, VIRTIO_VSOCK_TYPE_STREAM, + NULL); + + return total; + +out: + if (total) + err = total; + return err; +} + +ssize_t +virtio_transport_stream_dequeue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len, int flags) +{ + if (flags & MSG_PEEK) + return -EOPNOTSUPP; + + return virtio_transport_stream_do_dequeue(vsk, msg, len); +} +EXPORT_SYMBOL_GPL(virtio_transport_stream_dequeue); + +int +virtio_transport_dgram_dequeue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len, int flags) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL_GPL(virtio_transport_dgram_dequeue); + +s64 virtio_transport_stream_has_data(struct vsock_sock *vsk) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + s64 bytes; + + spin_lock_bh(&vvs->rx_lock); + bytes = vvs->rx_bytes; + spin_unlock_bh(&vvs->rx_lock); + + return bytes; +} +EXPORT_SYMBOL_GPL(virtio_transport_stream_has_data); + +static s64 virtio_transport_has_space(struct vsock_sock *vsk) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + s64 bytes; + + bytes = vvs->peer_buf_alloc - (vvs->tx_cnt - vvs->peer_fwd_cnt); + if (bytes < 0) + bytes = 0; + + return bytes; +} + +s64 virtio_transport_stream_has_space(struct vsock_sock *vsk) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + s64 bytes; + + spin_lock_bh(&vvs->tx_lock); + bytes = virtio_transport_has_space(vsk); + spin_unlock_bh(&vvs->tx_lock); + + return bytes; +} +EXPORT_SYMBOL_GPL(virtio_transport_stream_has_space); + +int virtio_transport_do_socket_init(struct vsock_sock *vsk, + struct vsock_sock *psk) +{ + struct virtio_vsock_sock *vvs; + + vvs = kzalloc(sizeof(*vvs), GFP_KERNEL); + if (!vvs) + return -ENOMEM; + + vsk->trans = vvs; + vvs->vsk = vsk; + if (psk) { + struct virtio_vsock_sock *ptrans = psk->trans; + + vvs->buf_size = ptrans->buf_size; + vvs->buf_size_min = ptrans->buf_size_min; + vvs->buf_size_max = ptrans->buf_size_max; + vvs->peer_buf_alloc = ptrans->peer_buf_alloc; + } else { + vvs->buf_size = VIRTIO_VSOCK_DEFAULT_BUF_SIZE; + vvs->buf_size_min = VIRTIO_VSOCK_DEFAULT_MIN_BUF_SIZE; + vvs->buf_size_max = VIRTIO_VSOCK_DEFAULT_MAX_BUF_SIZE; + } + + vvs->buf_alloc = vvs->buf_size; + + spin_lock_init(&vvs->rx_lock); + spin_lock_init(&vvs->tx_lock); + INIT_LIST_HEAD(&vvs->rx_queue); + + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_do_socket_init); + +u64 virtio_transport_get_buffer_size(struct vsock_sock *vsk) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + + return vvs->buf_size; +} +EXPORT_SYMBOL_GPL(virtio_transport_get_buffer_size); + +u64 virtio_transport_get_min_buffer_size(struct vsock_sock *vsk) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + + return vvs->buf_size_min; +} +EXPORT_SYMBOL_GPL(virtio_transport_get_min_buffer_size); + +u64 virtio_transport_get_max_buffer_size(struct vsock_sock *vsk) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + + return vvs->buf_size_max; +} +EXPORT_SYMBOL_GPL(virtio_transport_get_max_buffer_size); + +void virtio_transport_set_buffer_size(struct vsock_sock *vsk, u64 val) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + + if (val > VIRTIO_VSOCK_MAX_BUF_SIZE) + val = VIRTIO_VSOCK_MAX_BUF_SIZE; + if (val < vvs->buf_size_min) + vvs->buf_size_min = val; + if (val > vvs->buf_size_max) + vvs->buf_size_max = val; + vvs->buf_size = val; + vvs->buf_alloc = val; +} +EXPORT_SYMBOL_GPL(virtio_transport_set_buffer_size); + +void virtio_transport_set_min_buffer_size(struct vsock_sock *vsk, u64 val) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + + if (val > VIRTIO_VSOCK_MAX_BUF_SIZE) + val = VIRTIO_VSOCK_MAX_BUF_SIZE; + if (val > vvs->buf_size) + vvs->buf_size = val; + vvs->buf_size_min = val; +} +EXPORT_SYMBOL_GPL(virtio_transport_set_min_buffer_size); + +void virtio_transport_set_max_buffer_size(struct vsock_sock *vsk, u64 val) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + + if (val > VIRTIO_VSOCK_MAX_BUF_SIZE) + val = VIRTIO_VSOCK_MAX_BUF_SIZE; + if (val < vvs->buf_size) + vvs->buf_size = val; + vvs->buf_size_max = val; +} +EXPORT_SYMBOL_GPL(virtio_transport_set_max_buffer_size); + +int +virtio_transport_notify_poll_in(struct vsock_sock *vsk, + size_t target, + bool *data_ready_now) +{ + if (vsock_stream_has_data(vsk)) + *data_ready_now = true; + else + *data_ready_now = false; + + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_poll_in); + +int +virtio_transport_notify_poll_out(struct vsock_sock *vsk, + size_t target, + bool *space_avail_now) +{ + s64 free_space; + + free_space = vsock_stream_has_space(vsk); + if (free_space > 0) + *space_avail_now = true; + else if (free_space == 0) + *space_avail_now = false; + + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_poll_out); + +int virtio_transport_notify_recv_init(struct vsock_sock *vsk, + size_t target, struct vsock_transport_recv_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_init); + +int virtio_transport_notify_recv_pre_block(struct vsock_sock *vsk, + size_t target, struct vsock_transport_recv_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_pre_block); + +int virtio_transport_notify_recv_pre_dequeue(struct vsock_sock *vsk, + size_t target, struct vsock_transport_recv_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_pre_dequeue); + +int virtio_transport_notify_recv_post_dequeue(struct vsock_sock *vsk, + size_t target, ssize_t copied, bool data_read, + struct vsock_transport_recv_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_post_dequeue); + +int virtio_transport_notify_send_init(struct vsock_sock *vsk, + struct vsock_transport_send_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_send_init); + +int virtio_transport_notify_send_pre_block(struct vsock_sock *vsk, + struct vsock_transport_send_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_send_pre_block); + +int virtio_transport_notify_send_pre_enqueue(struct vsock_sock *vsk, + struct vsock_transport_send_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_send_pre_enqueue); + +int virtio_transport_notify_send_post_enqueue(struct vsock_sock *vsk, + ssize_t written, struct vsock_transport_send_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_send_post_enqueue); + +u64 virtio_transport_stream_rcvhiwat(struct vsock_sock *vsk) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + + return vvs->buf_size; +} +EXPORT_SYMBOL_GPL(virtio_transport_stream_rcvhiwat); + +bool virtio_transport_stream_is_active(struct vsock_sock *vsk) +{ + return true; +} +EXPORT_SYMBOL_GPL(virtio_transport_stream_is_active); + +bool virtio_transport_stream_allow(u32 cid, u32 port) +{ + return true; +} +EXPORT_SYMBOL_GPL(virtio_transport_stream_allow); + +int virtio_transport_dgram_bind(struct vsock_sock *vsk, + struct sockaddr_vm *addr) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL_GPL(virtio_transport_dgram_bind); + +bool virtio_transport_dgram_allow(u32 cid, u32 port) +{ + return false; +} +EXPORT_SYMBOL_GPL(virtio_transport_dgram_allow); + +int virtio_transport_connect(struct vsock_sock *vsk) +{ + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_REQUEST, + .type = VIRTIO_VSOCK_TYPE_STREAM, + }; + + return virtio_transport_send_pkt_info(vsk, &info); +} +EXPORT_SYMBOL_GPL(virtio_transport_connect); + +int virtio_transport_shutdown(struct vsock_sock *vsk, int mode) +{ + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_SHUTDOWN, + .type = VIRTIO_VSOCK_TYPE_STREAM, + .flags = (mode & RCV_SHUTDOWN ? + VIRTIO_VSOCK_SHUTDOWN_RCV : 0) | + (mode & SEND_SHUTDOWN ? + VIRTIO_VSOCK_SHUTDOWN_SEND : 0), + }; + + return virtio_transport_send_pkt_info(vsk, &info); +} +EXPORT_SYMBOL_GPL(virtio_transport_shutdown); + +int +virtio_transport_dgram_enqueue(struct vsock_sock *vsk, + struct sockaddr_vm *remote_addr, + struct msghdr *msg, + size_t dgram_len) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL_GPL(virtio_transport_dgram_enqueue); + +ssize_t +virtio_transport_stream_enqueue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len) +{ + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_RW, + .type = VIRTIO_VSOCK_TYPE_STREAM, + .msg = msg, + .pkt_len = len, + }; + + return virtio_transport_send_pkt_info(vsk, &info); +} +EXPORT_SYMBOL_GPL(virtio_transport_stream_enqueue); + +void virtio_transport_destruct(struct vsock_sock *vsk) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + + kfree(vvs); +} +EXPORT_SYMBOL_GPL(virtio_transport_destruct); + +static int virtio_transport_reset(struct vsock_sock *vsk, + struct virtio_vsock_pkt *pkt) +{ + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_RST, + .type = VIRTIO_VSOCK_TYPE_STREAM, + .reply = !!pkt, + }; + + /* Send RST only if the original pkt is not a RST pkt */ + if (pkt && le16_to_cpu(pkt->hdr.op) == VIRTIO_VSOCK_OP_RST) + return 0; + + return virtio_transport_send_pkt_info(vsk, &info); +} + +/* Normally packets are associated with a socket. There may be no socket if an + * attempt was made to connect to a socket that does not exist. + */ +static int virtio_transport_reset_no_sock(struct virtio_vsock_pkt *pkt) +{ + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_RST, + .type = le16_to_cpu(pkt->hdr.type), + .reply = true, + }; + + /* Send RST only if the original pkt is not a RST pkt */ + if (le16_to_cpu(pkt->hdr.op) == VIRTIO_VSOCK_OP_RST) + return 0; + + pkt = virtio_transport_alloc_pkt(&info, 0, + le32_to_cpu(pkt->hdr.dst_cid), + le32_to_cpu(pkt->hdr.dst_port), + le32_to_cpu(pkt->hdr.src_cid), + le32_to_cpu(pkt->hdr.src_port)); + if (!pkt) + return -ENOMEM; + + return virtio_transport_get_ops()->send_pkt(pkt); +} + +static void virtio_transport_wait_close(struct sock *sk, long timeout) +{ + if (timeout) { + DEFINE_WAIT(wait); + + do { + prepare_to_wait(sk_sleep(sk), &wait, + TASK_INTERRUPTIBLE); + if (sk_wait_event(sk, &timeout, + sock_flag(sk, SOCK_DONE))) + break; + } while (!signal_pending(current) && timeout); + + finish_wait(sk_sleep(sk), &wait); + } +} + +static void virtio_transport_do_close(struct vsock_sock *vsk, + bool cancel_timeout) +{ + struct sock *sk = sk_vsock(vsk); + + sock_set_flag(sk, SOCK_DONE); + vsk->peer_shutdown = SHUTDOWN_MASK; + if (vsock_stream_has_data(vsk) <= 0) + sk->sk_state = SS_DISCONNECTING; + sk->sk_state_change(sk); + + if (vsk->close_work_scheduled && + (!cancel_timeout || cancel_delayed_work(&vsk->close_work))) { + vsk->close_work_scheduled = false; + + vsock_remove_sock(vsk); + + /* Release refcnt obtained when we scheduled the timeout */ + sock_put(sk); + } +} + +static void virtio_transport_close_timeout(struct work_struct *work) +{ + struct vsock_sock *vsk = + container_of(work, struct vsock_sock, close_work.work); + struct sock *sk = sk_vsock(vsk); + + sock_hold(sk); + lock_sock(sk); + + if (!sock_flag(sk, SOCK_DONE)) { + (void)virtio_transport_reset(vsk, NULL); + + virtio_transport_do_close(vsk, false); + } + + vsk->close_work_scheduled = false; + + release_sock(sk); + sock_put(sk); +} + +/* User context, vsk->sk is locked */ +static bool virtio_transport_close(struct vsock_sock *vsk) +{ + struct sock *sk = &vsk->sk; + + if (!(sk->sk_state == SS_CONNECTED || + sk->sk_state == SS_DISCONNECTING)) + return true; + + /* Already received SHUTDOWN from peer, reply with RST */ + if ((vsk->peer_shutdown & SHUTDOWN_MASK) == SHUTDOWN_MASK) { + (void)virtio_transport_reset(vsk, NULL); + return true; + } + + if ((sk->sk_shutdown & SHUTDOWN_MASK) != SHUTDOWN_MASK) + (void)virtio_transport_shutdown(vsk, SHUTDOWN_MASK); + + if (sock_flag(sk, SOCK_LINGER) && !(current->flags & PF_EXITING)) + virtio_transport_wait_close(sk, sk->sk_lingertime); + + if (sock_flag(sk, SOCK_DONE)) { + return true; + } + + sock_hold(sk); + INIT_DELAYED_WORK(&vsk->close_work, + virtio_transport_close_timeout); + vsk->close_work_scheduled = true; + schedule_delayed_work(&vsk->close_work, VSOCK_CLOSE_TIMEOUT); + return false; +} + +void virtio_transport_release(struct vsock_sock *vsk) +{ + struct sock *sk = &vsk->sk; + bool remove_sock = true; + + lock_sock(sk); + if (sk->sk_type == SOCK_STREAM) + remove_sock = virtio_transport_close(vsk); + release_sock(sk); + + if (remove_sock) + vsock_remove_sock(vsk); +} +EXPORT_SYMBOL_GPL(virtio_transport_release); + +static int +virtio_transport_recv_connecting(struct sock *sk, + struct virtio_vsock_pkt *pkt) +{ + struct vsock_sock *vsk = vsock_sk(sk); + int err; + int skerr; + + switch (le16_to_cpu(pkt->hdr.op)) { + case VIRTIO_VSOCK_OP_RESPONSE: + sk->sk_state = SS_CONNECTED; + sk->sk_socket->state = SS_CONNECTED; + vsock_insert_connected(vsk); + sk->sk_state_change(sk); + break; + case VIRTIO_VSOCK_OP_INVALID: + break; + case VIRTIO_VSOCK_OP_RST: + skerr = ECONNRESET; + err = 0; + goto destroy; + default: + skerr = EPROTO; + err = -EINVAL; + goto destroy; + } + return 0; + +destroy: + virtio_transport_reset(vsk, pkt); + sk->sk_state = SS_UNCONNECTED; + sk->sk_err = skerr; + sk->sk_error_report(sk); + return err; +} + +static int +virtio_transport_recv_connected(struct sock *sk, + struct virtio_vsock_pkt *pkt) +{ + struct vsock_sock *vsk = vsock_sk(sk); + struct virtio_vsock_sock *vvs = vsk->trans; + int err = 0; + + switch (le16_to_cpu(pkt->hdr.op)) { + case VIRTIO_VSOCK_OP_RW: + pkt->len = le32_to_cpu(pkt->hdr.len); + pkt->off = 0; + + spin_lock_bh(&vvs->rx_lock); + virtio_transport_inc_rx_pkt(vvs, pkt); + list_add_tail(&pkt->list, &vvs->rx_queue); + spin_unlock_bh(&vvs->rx_lock); + + sk->sk_data_ready(sk); + return err; + case VIRTIO_VSOCK_OP_CREDIT_UPDATE: + sk->sk_write_space(sk); + break; + case VIRTIO_VSOCK_OP_SHUTDOWN: + if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SHUTDOWN_RCV) + vsk->peer_shutdown |= RCV_SHUTDOWN; + if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SHUTDOWN_SEND) + vsk->peer_shutdown |= SEND_SHUTDOWN; + if (vsk->peer_shutdown == SHUTDOWN_MASK && + vsock_stream_has_data(vsk) <= 0) + sk->sk_state = SS_DISCONNECTING; + if (le32_to_cpu(pkt->hdr.flags)) + sk->sk_state_change(sk); + break; + case VIRTIO_VSOCK_OP_RST: + virtio_transport_do_close(vsk, true); + break; + default: + err = -EINVAL; + break; + } + + virtio_transport_free_pkt(pkt); + return err; +} + +static void +virtio_transport_recv_disconnecting(struct sock *sk, + struct virtio_vsock_pkt *pkt) +{ + struct vsock_sock *vsk = vsock_sk(sk); + + if (le16_to_cpu(pkt->hdr.op) == VIRTIO_VSOCK_OP_RST) + virtio_transport_do_close(vsk, true); +} + +static int +virtio_transport_send_response(struct vsock_sock *vsk, + struct virtio_vsock_pkt *pkt) +{ + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_RESPONSE, + .type = VIRTIO_VSOCK_TYPE_STREAM, + .remote_cid = le32_to_cpu(pkt->hdr.src_cid), + .remote_port = le32_to_cpu(pkt->hdr.src_port), + .reply = true, + }; + + return virtio_transport_send_pkt_info(vsk, &info); +} + +/* Handle server socket */ +static int +virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt) +{ + struct vsock_sock *vsk = vsock_sk(sk); + struct vsock_sock *vchild; + struct sock *child; + + if (le16_to_cpu(pkt->hdr.op) != VIRTIO_VSOCK_OP_REQUEST) { + virtio_transport_reset(vsk, pkt); + return -EINVAL; + } + + if (sk_acceptq_is_full(sk)) { + virtio_transport_reset(vsk, pkt); + return -ENOMEM; + } + + child = __vsock_create(sock_net(sk), NULL, sk, GFP_KERNEL, + sk->sk_type, 0); + if (!child) { + virtio_transport_reset(vsk, pkt); + return -ENOMEM; + } + + sk->sk_ack_backlog++; + + lock_sock_nested(child, SINGLE_DEPTH_NESTING); + + child->sk_state = SS_CONNECTED; + + vchild = vsock_sk(child); + vsock_addr_init(&vchild->local_addr, le32_to_cpu(pkt->hdr.dst_cid), + le32_to_cpu(pkt->hdr.dst_port)); + vsock_addr_init(&vchild->remote_addr, le32_to_cpu(pkt->hdr.src_cid), + le32_to_cpu(pkt->hdr.src_port)); + + vsock_insert_connected(vchild); + vsock_enqueue_accept(sk, child); + virtio_transport_send_response(vchild, pkt); + + release_sock(child); + + sk->sk_data_ready(sk); + return 0; +} + +static bool virtio_transport_space_update(struct sock *sk, + struct virtio_vsock_pkt *pkt) +{ + struct vsock_sock *vsk = vsock_sk(sk); + struct virtio_vsock_sock *vvs = vsk->trans; + bool space_available; + + /* buf_alloc and fwd_cnt is always included in the hdr */ + spin_lock_bh(&vvs->tx_lock); + vvs->peer_buf_alloc = le32_to_cpu(pkt->hdr.buf_alloc); + vvs->peer_fwd_cnt = le32_to_cpu(pkt->hdr.fwd_cnt); + space_available = virtio_transport_has_space(vsk); + spin_unlock_bh(&vvs->tx_lock); + return space_available; +} + +/* We are under the virtio-vsock's vsock->rx_lock or vhost-vsock's vq->mutex + * lock. + */ +void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt) +{ + struct sockaddr_vm src, dst; + struct vsock_sock *vsk; + struct sock *sk; + bool space_available; + + vsock_addr_init(&src, le32_to_cpu(pkt->hdr.src_cid), + le32_to_cpu(pkt->hdr.src_port)); + vsock_addr_init(&dst, le32_to_cpu(pkt->hdr.dst_cid), + le32_to_cpu(pkt->hdr.dst_port)); + + trace_virtio_transport_recv_pkt(src.svm_cid, src.svm_port, + dst.svm_cid, dst.svm_port, + le32_to_cpu(pkt->hdr.len), + le16_to_cpu(pkt->hdr.type), + le16_to_cpu(pkt->hdr.op), + le32_to_cpu(pkt->hdr.flags), + le32_to_cpu(pkt->hdr.buf_alloc), + le32_to_cpu(pkt->hdr.fwd_cnt)); + + if (le16_to_cpu(pkt->hdr.type) != VIRTIO_VSOCK_TYPE_STREAM) { + (void)virtio_transport_reset_no_sock(pkt); + goto free_pkt; + } + + /* The socket must be in connected or bound table + * otherwise send reset back + */ + sk = vsock_find_connected_socket(&src, &dst); + if (!sk) { + sk = vsock_find_bound_socket(&dst); + if (!sk) { + (void)virtio_transport_reset_no_sock(pkt); + goto free_pkt; + } + } + + vsk = vsock_sk(sk); + + space_available = virtio_transport_space_update(sk, pkt); + + lock_sock(sk); + + /* Update CID in case it has changed after a transport reset event */ + vsk->local_addr.svm_cid = dst.svm_cid; + + if (space_available) + sk->sk_write_space(sk); + + switch (sk->sk_state) { + case VSOCK_SS_LISTEN: + virtio_transport_recv_listen(sk, pkt); + virtio_transport_free_pkt(pkt); + break; + case SS_CONNECTING: + virtio_transport_recv_connecting(sk, pkt); + virtio_transport_free_pkt(pkt); + break; + case SS_CONNECTED: + virtio_transport_recv_connected(sk, pkt); + break; + case SS_DISCONNECTING: + virtio_transport_recv_disconnecting(sk, pkt); + virtio_transport_free_pkt(pkt); + break; + default: + virtio_transport_free_pkt(pkt); + break; + } + release_sock(sk); + + /* Release refcnt obtained when we fetched this socket out of the + * bound or connected list. + */ + sock_put(sk); + return; + +free_pkt: + virtio_transport_free_pkt(pkt); +} +EXPORT_SYMBOL_GPL(virtio_transport_recv_pkt); + +void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt) +{ + kfree(pkt->buf); + kfree(pkt); +} +EXPORT_SYMBOL_GPL(virtio_transport_free_pkt); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Asias He"); +MODULE_DESCRIPTION("common code for virtio vsock"); -- cgit v1.2.3-71-gd317 From 433fc58e6bf2c8bd97e57153ed28e64fd78207b8 Mon Sep 17 00:00:00 2001 From: Asias He Date: Thu, 28 Jul 2016 15:36:34 +0100 Subject: VSOCK: Introduce vhost_vsock.ko VM sockets vhost transport implementation. This driver runs on the host. Signed-off-by: Asias He Signed-off-by: Stefan Hajnoczi Signed-off-by: Michael S. Tsirkin --- MAINTAINERS | 2 + drivers/vhost/vsock.c | 722 +++++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/vhost.h | 5 + 3 files changed, 729 insertions(+) create mode 100644 drivers/vhost/vsock.c (limited to 'include/uapi') diff --git a/MAINTAINERS b/MAINTAINERS index 7302663ae4c3..12c79e5dc27e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12148,6 +12148,8 @@ F: include/linux/virtio_vsock.h F: include/uapi/linux/virtio_vsock.h F: net/vmw_vsock/virtio_transport_common.c F: net/vmw_vsock/virtio_transport.c +F: drivers/vhost/vsock.c +F: drivers/vhost/vsock.h VIRTUAL SERIO DEVICE DRIVER M: Stephen Chandler Paul diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c new file mode 100644 index 000000000000..028ca16c2d36 --- /dev/null +++ b/drivers/vhost/vsock.c @@ -0,0 +1,722 @@ +/* + * vhost transport for vsock + * + * Copyright (C) 2013-2015 Red Hat, Inc. + * Author: Asias He + * Stefan Hajnoczi + * + * This work is licensed under the terms of the GNU GPL, version 2. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "vhost.h" + +#define VHOST_VSOCK_DEFAULT_HOST_CID 2 + +enum { + VHOST_VSOCK_FEATURES = VHOST_FEATURES, +}; + +/* Used to track all the vhost_vsock instances on the system. */ +static DEFINE_SPINLOCK(vhost_vsock_lock); +static LIST_HEAD(vhost_vsock_list); + +struct vhost_vsock { + struct vhost_dev dev; + struct vhost_virtqueue vqs[2]; + + /* Link to global vhost_vsock_list, protected by vhost_vsock_lock */ + struct list_head list; + + struct vhost_work send_pkt_work; + spinlock_t send_pkt_list_lock; + struct list_head send_pkt_list; /* host->guest pending packets */ + + atomic_t queued_replies; + + u32 guest_cid; +}; + +static u32 vhost_transport_get_local_cid(void) +{ + return VHOST_VSOCK_DEFAULT_HOST_CID; +} + +static struct vhost_vsock *vhost_vsock_get(u32 guest_cid) +{ + struct vhost_vsock *vsock; + + spin_lock_bh(&vhost_vsock_lock); + list_for_each_entry(vsock, &vhost_vsock_list, list) { + u32 other_cid = vsock->guest_cid; + + /* Skip instances that have no CID yet */ + if (other_cid == 0) + continue; + + if (other_cid == guest_cid) { + spin_unlock_bh(&vhost_vsock_lock); + return vsock; + } + } + spin_unlock_bh(&vhost_vsock_lock); + + return NULL; +} + +static void +vhost_transport_do_send_pkt(struct vhost_vsock *vsock, + struct vhost_virtqueue *vq) +{ + struct vhost_virtqueue *tx_vq = &vsock->vqs[VSOCK_VQ_TX]; + bool added = false; + bool restart_tx = false; + + mutex_lock(&vq->mutex); + + if (!vq->private_data) + goto out; + + /* Avoid further vmexits, we're already processing the virtqueue */ + vhost_disable_notify(&vsock->dev, vq); + + for (;;) { + struct virtio_vsock_pkt *pkt; + struct iov_iter iov_iter; + unsigned out, in; + size_t nbytes; + size_t len; + int head; + + spin_lock_bh(&vsock->send_pkt_list_lock); + if (list_empty(&vsock->send_pkt_list)) { + spin_unlock_bh(&vsock->send_pkt_list_lock); + vhost_enable_notify(&vsock->dev, vq); + break; + } + + pkt = list_first_entry(&vsock->send_pkt_list, + struct virtio_vsock_pkt, list); + list_del_init(&pkt->list); + spin_unlock_bh(&vsock->send_pkt_list_lock); + + head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov), + &out, &in, NULL, NULL); + if (head < 0) { + spin_lock_bh(&vsock->send_pkt_list_lock); + list_add(&pkt->list, &vsock->send_pkt_list); + spin_unlock_bh(&vsock->send_pkt_list_lock); + break; + } + + if (head == vq->num) { + spin_lock_bh(&vsock->send_pkt_list_lock); + list_add(&pkt->list, &vsock->send_pkt_list); + spin_unlock_bh(&vsock->send_pkt_list_lock); + + /* We cannot finish yet if more buffers snuck in while + * re-enabling notify. + */ + if (unlikely(vhost_enable_notify(&vsock->dev, vq))) { + vhost_disable_notify(&vsock->dev, vq); + continue; + } + break; + } + + if (out) { + virtio_transport_free_pkt(pkt); + vq_err(vq, "Expected 0 output buffers, got %u\n", out); + break; + } + + len = iov_length(&vq->iov[out], in); + iov_iter_init(&iov_iter, READ, &vq->iov[out], in, len); + + nbytes = copy_to_iter(&pkt->hdr, sizeof(pkt->hdr), &iov_iter); + if (nbytes != sizeof(pkt->hdr)) { + virtio_transport_free_pkt(pkt); + vq_err(vq, "Faulted on copying pkt hdr\n"); + break; + } + + nbytes = copy_to_iter(pkt->buf, pkt->len, &iov_iter); + if (nbytes != pkt->len) { + virtio_transport_free_pkt(pkt); + vq_err(vq, "Faulted on copying pkt buf\n"); + break; + } + + vhost_add_used(vq, head, sizeof(pkt->hdr) + pkt->len); + added = true; + + if (pkt->reply) { + int val; + + val = atomic_dec_return(&vsock->queued_replies); + + /* Do we have resources to resume tx processing? */ + if (val + 1 == tx_vq->num) + restart_tx = true; + } + + virtio_transport_free_pkt(pkt); + } + if (added) + vhost_signal(&vsock->dev, vq); + +out: + mutex_unlock(&vq->mutex); + + if (restart_tx) + vhost_poll_queue(&tx_vq->poll); +} + +static void vhost_transport_send_pkt_work(struct vhost_work *work) +{ + struct vhost_virtqueue *vq; + struct vhost_vsock *vsock; + + vsock = container_of(work, struct vhost_vsock, send_pkt_work); + vq = &vsock->vqs[VSOCK_VQ_RX]; + + vhost_transport_do_send_pkt(vsock, vq); +} + +static int +vhost_transport_send_pkt(struct virtio_vsock_pkt *pkt) +{ + struct vhost_vsock *vsock; + struct vhost_virtqueue *vq; + int len = pkt->len; + + /* Find the vhost_vsock according to guest context id */ + vsock = vhost_vsock_get(le64_to_cpu(pkt->hdr.dst_cid)); + if (!vsock) { + virtio_transport_free_pkt(pkt); + return -ENODEV; + } + + vq = &vsock->vqs[VSOCK_VQ_RX]; + + if (pkt->reply) + atomic_inc(&vsock->queued_replies); + + spin_lock_bh(&vsock->send_pkt_list_lock); + list_add_tail(&pkt->list, &vsock->send_pkt_list); + spin_unlock_bh(&vsock->send_pkt_list_lock); + + vhost_work_queue(&vsock->dev, &vsock->send_pkt_work); + return len; +} + +static struct virtio_vsock_pkt * +vhost_vsock_alloc_pkt(struct vhost_virtqueue *vq, + unsigned int out, unsigned int in) +{ + struct virtio_vsock_pkt *pkt; + struct iov_iter iov_iter; + size_t nbytes; + size_t len; + + if (in != 0) { + vq_err(vq, "Expected 0 input buffers, got %u\n", in); + return NULL; + } + + pkt = kzalloc(sizeof(*pkt), GFP_KERNEL); + if (!pkt) + return NULL; + + len = iov_length(vq->iov, out); + iov_iter_init(&iov_iter, WRITE, vq->iov, out, len); + + nbytes = copy_from_iter(&pkt->hdr, sizeof(pkt->hdr), &iov_iter); + if (nbytes != sizeof(pkt->hdr)) { + vq_err(vq, "Expected %zu bytes for pkt->hdr, got %zu bytes\n", + sizeof(pkt->hdr), nbytes); + kfree(pkt); + return NULL; + } + + if (le16_to_cpu(pkt->hdr.type) == VIRTIO_VSOCK_TYPE_STREAM) + pkt->len = le32_to_cpu(pkt->hdr.len); + + /* No payload */ + if (!pkt->len) + return pkt; + + /* The pkt is too big */ + if (pkt->len > VIRTIO_VSOCK_MAX_PKT_BUF_SIZE) { + kfree(pkt); + return NULL; + } + + pkt->buf = kmalloc(pkt->len, GFP_KERNEL); + if (!pkt->buf) { + kfree(pkt); + return NULL; + } + + nbytes = copy_from_iter(pkt->buf, pkt->len, &iov_iter); + if (nbytes != pkt->len) { + vq_err(vq, "Expected %u byte payload, got %zu bytes\n", + pkt->len, nbytes); + virtio_transport_free_pkt(pkt); + return NULL; + } + + return pkt; +} + +/* Is there space left for replies to rx packets? */ +static bool vhost_vsock_more_replies(struct vhost_vsock *vsock) +{ + struct vhost_virtqueue *vq = &vsock->vqs[VSOCK_VQ_TX]; + int val; + + smp_rmb(); /* paired with atomic_inc() and atomic_dec_return() */ + val = atomic_read(&vsock->queued_replies); + + return val < vq->num; +} + +static void vhost_vsock_handle_tx_kick(struct vhost_work *work) +{ + struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, + poll.work); + struct vhost_vsock *vsock = container_of(vq->dev, struct vhost_vsock, + dev); + struct virtio_vsock_pkt *pkt; + int head; + unsigned int out, in; + bool added = false; + + mutex_lock(&vq->mutex); + + if (!vq->private_data) + goto out; + + vhost_disable_notify(&vsock->dev, vq); + for (;;) { + if (!vhost_vsock_more_replies(vsock)) { + /* Stop tx until the device processes already + * pending replies. Leave tx virtqueue + * callbacks disabled. + */ + goto no_more_replies; + } + + head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov), + &out, &in, NULL, NULL); + if (head < 0) + break; + + if (head == vq->num) { + if (unlikely(vhost_enable_notify(&vsock->dev, vq))) { + vhost_disable_notify(&vsock->dev, vq); + continue; + } + break; + } + + pkt = vhost_vsock_alloc_pkt(vq, out, in); + if (!pkt) { + vq_err(vq, "Faulted on pkt\n"); + continue; + } + + /* Only accept correctly addressed packets */ + if (le64_to_cpu(pkt->hdr.src_cid) == vsock->guest_cid) + virtio_transport_recv_pkt(pkt); + else + virtio_transport_free_pkt(pkt); + + vhost_add_used(vq, head, sizeof(pkt->hdr) + pkt->len); + added = true; + } + +no_more_replies: + if (added) + vhost_signal(&vsock->dev, vq); + +out: + mutex_unlock(&vq->mutex); +} + +static void vhost_vsock_handle_rx_kick(struct vhost_work *work) +{ + struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, + poll.work); + struct vhost_vsock *vsock = container_of(vq->dev, struct vhost_vsock, + dev); + + vhost_transport_do_send_pkt(vsock, vq); +} + +static int vhost_vsock_start(struct vhost_vsock *vsock) +{ + size_t i; + int ret; + + mutex_lock(&vsock->dev.mutex); + + ret = vhost_dev_check_owner(&vsock->dev); + if (ret) + goto err; + + for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) { + struct vhost_virtqueue *vq = &vsock->vqs[i]; + + mutex_lock(&vq->mutex); + + if (!vhost_vq_access_ok(vq)) { + ret = -EFAULT; + mutex_unlock(&vq->mutex); + goto err_vq; + } + + if (!vq->private_data) { + vq->private_data = vsock; + vhost_vq_init_access(vq); + } + + mutex_unlock(&vq->mutex); + } + + mutex_unlock(&vsock->dev.mutex); + return 0; + +err_vq: + for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) { + struct vhost_virtqueue *vq = &vsock->vqs[i]; + + mutex_lock(&vq->mutex); + vq->private_data = NULL; + mutex_unlock(&vq->mutex); + } +err: + mutex_unlock(&vsock->dev.mutex); + return ret; +} + +static int vhost_vsock_stop(struct vhost_vsock *vsock) +{ + size_t i; + int ret; + + mutex_lock(&vsock->dev.mutex); + + ret = vhost_dev_check_owner(&vsock->dev); + if (ret) + goto err; + + for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) { + struct vhost_virtqueue *vq = &vsock->vqs[i]; + + mutex_lock(&vq->mutex); + vq->private_data = NULL; + mutex_unlock(&vq->mutex); + } + +err: + mutex_unlock(&vsock->dev.mutex); + return ret; +} + +static void vhost_vsock_free(struct vhost_vsock *vsock) +{ + if (is_vmalloc_addr(vsock)) + vfree(vsock); + else + kfree(vsock); +} + +static int vhost_vsock_dev_open(struct inode *inode, struct file *file) +{ + struct vhost_virtqueue **vqs; + struct vhost_vsock *vsock; + int ret; + + /* This struct is large and allocation could fail, fall back to vmalloc + * if there is no other way. + */ + vsock = kzalloc(sizeof(*vsock), GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); + if (!vsock) { + vsock = vmalloc(sizeof(*vsock)); + if (!vsock) + return -ENOMEM; + } + + vqs = kmalloc_array(ARRAY_SIZE(vsock->vqs), sizeof(*vqs), GFP_KERNEL); + if (!vqs) { + ret = -ENOMEM; + goto out; + } + + atomic_set(&vsock->queued_replies, 0); + + vqs[VSOCK_VQ_TX] = &vsock->vqs[VSOCK_VQ_TX]; + vqs[VSOCK_VQ_RX] = &vsock->vqs[VSOCK_VQ_RX]; + vsock->vqs[VSOCK_VQ_TX].handle_kick = vhost_vsock_handle_tx_kick; + vsock->vqs[VSOCK_VQ_RX].handle_kick = vhost_vsock_handle_rx_kick; + + vhost_dev_init(&vsock->dev, vqs, ARRAY_SIZE(vsock->vqs)); + + file->private_data = vsock; + spin_lock_init(&vsock->send_pkt_list_lock); + INIT_LIST_HEAD(&vsock->send_pkt_list); + vhost_work_init(&vsock->send_pkt_work, vhost_transport_send_pkt_work); + + spin_lock_bh(&vhost_vsock_lock); + list_add_tail(&vsock->list, &vhost_vsock_list); + spin_unlock_bh(&vhost_vsock_lock); + return 0; + +out: + vhost_vsock_free(vsock); + return ret; +} + +static void vhost_vsock_flush(struct vhost_vsock *vsock) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) + if (vsock->vqs[i].handle_kick) + vhost_poll_flush(&vsock->vqs[i].poll); + vhost_work_flush(&vsock->dev, &vsock->send_pkt_work); +} + +static void vhost_vsock_reset_orphans(struct sock *sk) +{ + struct vsock_sock *vsk = vsock_sk(sk); + + /* vmci_transport.c doesn't take sk_lock here either. At least we're + * under vsock_table_lock so the sock cannot disappear while we're + * executing. + */ + + if (!vhost_vsock_get(vsk->local_addr.svm_cid)) { + sock_set_flag(sk, SOCK_DONE); + vsk->peer_shutdown = SHUTDOWN_MASK; + sk->sk_state = SS_UNCONNECTED; + sk->sk_err = ECONNRESET; + sk->sk_error_report(sk); + } +} + +static int vhost_vsock_dev_release(struct inode *inode, struct file *file) +{ + struct vhost_vsock *vsock = file->private_data; + + spin_lock_bh(&vhost_vsock_lock); + list_del(&vsock->list); + spin_unlock_bh(&vhost_vsock_lock); + + /* Iterating over all connections for all CIDs to find orphans is + * inefficient. Room for improvement here. */ + vsock_for_each_connected_socket(vhost_vsock_reset_orphans); + + vhost_vsock_stop(vsock); + vhost_vsock_flush(vsock); + vhost_dev_stop(&vsock->dev); + + spin_lock_bh(&vsock->send_pkt_list_lock); + while (!list_empty(&vsock->send_pkt_list)) { + struct virtio_vsock_pkt *pkt; + + pkt = list_first_entry(&vsock->send_pkt_list, + struct virtio_vsock_pkt, list); + list_del_init(&pkt->list); + virtio_transport_free_pkt(pkt); + } + spin_unlock_bh(&vsock->send_pkt_list_lock); + + vhost_dev_cleanup(&vsock->dev, false); + kfree(vsock->dev.vqs); + vhost_vsock_free(vsock); + return 0; +} + +static int vhost_vsock_set_cid(struct vhost_vsock *vsock, u64 guest_cid) +{ + struct vhost_vsock *other; + + /* Refuse reserved CIDs */ + if (guest_cid <= VMADDR_CID_HOST || + guest_cid == U32_MAX) + return -EINVAL; + + /* 64-bit CIDs are not yet supported */ + if (guest_cid > U32_MAX) + return -EINVAL; + + /* Refuse if CID is already in use */ + other = vhost_vsock_get(guest_cid); + if (other && other != vsock) + return -EADDRINUSE; + + spin_lock_bh(&vhost_vsock_lock); + vsock->guest_cid = guest_cid; + spin_unlock_bh(&vhost_vsock_lock); + + return 0; +} + +static int vhost_vsock_set_features(struct vhost_vsock *vsock, u64 features) +{ + struct vhost_virtqueue *vq; + int i; + + if (features & ~VHOST_VSOCK_FEATURES) + return -EOPNOTSUPP; + + mutex_lock(&vsock->dev.mutex); + if ((features & (1 << VHOST_F_LOG_ALL)) && + !vhost_log_access_ok(&vsock->dev)) { + mutex_unlock(&vsock->dev.mutex); + return -EFAULT; + } + + for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) { + vq = &vsock->vqs[i]; + mutex_lock(&vq->mutex); + vq->acked_features = features; + mutex_unlock(&vq->mutex); + } + mutex_unlock(&vsock->dev.mutex); + return 0; +} + +static long vhost_vsock_dev_ioctl(struct file *f, unsigned int ioctl, + unsigned long arg) +{ + struct vhost_vsock *vsock = f->private_data; + void __user *argp = (void __user *)arg; + u64 guest_cid; + u64 features; + int start; + int r; + + switch (ioctl) { + case VHOST_VSOCK_SET_GUEST_CID: + if (copy_from_user(&guest_cid, argp, sizeof(guest_cid))) + return -EFAULT; + return vhost_vsock_set_cid(vsock, guest_cid); + case VHOST_VSOCK_SET_RUNNING: + if (copy_from_user(&start, argp, sizeof(start))) + return -EFAULT; + if (start) + return vhost_vsock_start(vsock); + else + return vhost_vsock_stop(vsock); + case VHOST_GET_FEATURES: + features = VHOST_VSOCK_FEATURES; + if (copy_to_user(argp, &features, sizeof(features))) + return -EFAULT; + return 0; + case VHOST_SET_FEATURES: + if (copy_from_user(&features, argp, sizeof(features))) + return -EFAULT; + return vhost_vsock_set_features(vsock, features); + default: + mutex_lock(&vsock->dev.mutex); + r = vhost_dev_ioctl(&vsock->dev, ioctl, argp); + if (r == -ENOIOCTLCMD) + r = vhost_vring_ioctl(&vsock->dev, ioctl, argp); + else + vhost_vsock_flush(vsock); + mutex_unlock(&vsock->dev.mutex); + return r; + } +} + +static const struct file_operations vhost_vsock_fops = { + .owner = THIS_MODULE, + .open = vhost_vsock_dev_open, + .release = vhost_vsock_dev_release, + .llseek = noop_llseek, + .unlocked_ioctl = vhost_vsock_dev_ioctl, +}; + +static struct miscdevice vhost_vsock_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = "vhost-vsock", + .fops = &vhost_vsock_fops, +}; + +static struct virtio_transport vhost_transport = { + .transport = { + .get_local_cid = vhost_transport_get_local_cid, + + .init = virtio_transport_do_socket_init, + .destruct = virtio_transport_destruct, + .release = virtio_transport_release, + .connect = virtio_transport_connect, + .shutdown = virtio_transport_shutdown, + + .dgram_enqueue = virtio_transport_dgram_enqueue, + .dgram_dequeue = virtio_transport_dgram_dequeue, + .dgram_bind = virtio_transport_dgram_bind, + .dgram_allow = virtio_transport_dgram_allow, + + .stream_enqueue = virtio_transport_stream_enqueue, + .stream_dequeue = virtio_transport_stream_dequeue, + .stream_has_data = virtio_transport_stream_has_data, + .stream_has_space = virtio_transport_stream_has_space, + .stream_rcvhiwat = virtio_transport_stream_rcvhiwat, + .stream_is_active = virtio_transport_stream_is_active, + .stream_allow = virtio_transport_stream_allow, + + .notify_poll_in = virtio_transport_notify_poll_in, + .notify_poll_out = virtio_transport_notify_poll_out, + .notify_recv_init = virtio_transport_notify_recv_init, + .notify_recv_pre_block = virtio_transport_notify_recv_pre_block, + .notify_recv_pre_dequeue = virtio_transport_notify_recv_pre_dequeue, + .notify_recv_post_dequeue = virtio_transport_notify_recv_post_dequeue, + .notify_send_init = virtio_transport_notify_send_init, + .notify_send_pre_block = virtio_transport_notify_send_pre_block, + .notify_send_pre_enqueue = virtio_transport_notify_send_pre_enqueue, + .notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue, + + .set_buffer_size = virtio_transport_set_buffer_size, + .set_min_buffer_size = virtio_transport_set_min_buffer_size, + .set_max_buffer_size = virtio_transport_set_max_buffer_size, + .get_buffer_size = virtio_transport_get_buffer_size, + .get_min_buffer_size = virtio_transport_get_min_buffer_size, + .get_max_buffer_size = virtio_transport_get_max_buffer_size, + }, + + .send_pkt = vhost_transport_send_pkt, +}; + +static int __init vhost_vsock_init(void) +{ + int ret; + + ret = vsock_core_init(&vhost_transport.transport); + if (ret < 0) + return ret; + return misc_register(&vhost_vsock_misc); +}; + +static void __exit vhost_vsock_exit(void) +{ + misc_deregister(&vhost_vsock_misc); + vsock_core_exit(); +}; + +module_init(vhost_vsock_init); +module_exit(vhost_vsock_exit); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Asias He"); +MODULE_DESCRIPTION("vhost transport for vsock "); diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index 61a8777178c6..c4400b267716 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -175,4 +175,9 @@ struct vhost_scsi_target { #define VHOST_SCSI_SET_EVENTS_MISSED _IOW(VHOST_VIRTIO, 0x43, __u32) #define VHOST_SCSI_GET_EVENTS_MISSED _IOW(VHOST_VIRTIO, 0x44, __u32) +/* VHOST_VSOCK specific defines */ + +#define VHOST_VSOCK_SET_GUEST_CID _IOW(VHOST_VIRTIO, 0x60, __u64) +#define VHOST_VSOCK_SET_RUNNING _IOW(VHOST_VIRTIO, 0x61, int) + #endif -- cgit v1.2.3-71-gd317 From 6b1e6cc7855b09a0a9bfa1d9f30172ba366f161c Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 23 Jun 2016 02:04:32 -0400 Subject: vhost: new device IOTLB API This patch tries to implement an device IOTLB for vhost. This could be used with userspace(qemu) implementation of DMA remapping to emulate an IOMMU for the guest. The idea is simple, cache the translation in a software device IOTLB (which is implemented as an interval tree) in vhost and use vhost_net file descriptor for reporting IOTLB miss and IOTLB update/invalidation. When vhost meets an IOTLB miss, the fault address, size and access can be read from the file. After userspace finishes the translation, it writes the translated address to the vhost_net file to update the device IOTLB. When device IOTLB is enabled by setting VIRTIO_F_IOMMU_PLATFORM all vq addresses set by ioctl are treated as iova instead of virtual address and the accessing can only be done through IOTLB instead of direct userspace memory access. Before each round or vq processing, all vq metadata is prefetched in device IOTLB to make sure no translation fault happens during vq processing. In most cases, virtqueues are contiguous even in virtual address space. The IOTLB translation for virtqueue itself may make it a little slower. We might add fast path cache on top of this patch. Signed-off-by: Jason Wang [mst: use virtio feature bit: VHOST_F_DEVICE_IOTLB -> VIRTIO_F_IOMMU_PLATFORM ] [mst: fix build warnings ] Signed-off-by: Michael S. Tsirkin [ weiyj.lk: missing unlock on error ] Signed-off-by: Wei Yongjun --- drivers/vhost/net.c | 59 ++++- drivers/vhost/vhost.c | 636 ++++++++++++++++++++++++++++++++++++++++++--- drivers/vhost/vhost.h | 32 ++- include/uapi/linux/vhost.h | 28 ++ 4 files changed, 705 insertions(+), 50 deletions(-) (limited to 'include/uapi') diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index a6b270aff9ef..0965f869dc57 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -61,7 +61,8 @@ MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;" enum { VHOST_NET_FEATURES = VHOST_FEATURES | (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) | - (1ULL << VIRTIO_NET_F_MRG_RXBUF) + (1ULL << VIRTIO_NET_F_MRG_RXBUF) | + (1ULL << VIRTIO_F_IOMMU_PLATFORM) }; enum { @@ -308,7 +309,7 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net, { unsigned long uninitialized_var(endtime); int r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov), - out_num, in_num, NULL, NULL); + out_num, in_num, NULL, NULL); if (r == vq->num && vq->busyloop_timeout) { preempt_disable(); @@ -318,7 +319,7 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net, cpu_relax_lowlatency(); preempt_enable(); r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov), - out_num, in_num, NULL, NULL); + out_num, in_num, NULL, NULL); } return r; @@ -351,6 +352,9 @@ static void handle_tx(struct vhost_net *net) if (!sock) goto out; + if (!vq_iotlb_prefetch(vq)) + goto out; + vhost_disable_notify(&net->dev, vq); hdr_size = nvq->vhost_hlen; @@ -612,6 +616,10 @@ static void handle_rx(struct vhost_net *net) sock = vq->private_data; if (!sock) goto out; + + if (!vq_iotlb_prefetch(vq)) + goto out; + vhost_disable_notify(&net->dev, vq); vhost_hlen = nvq->vhost_hlen; @@ -1080,10 +1088,14 @@ static int vhost_net_set_features(struct vhost_net *n, u64 features) } mutex_lock(&n->dev.mutex); if ((features & (1 << VHOST_F_LOG_ALL)) && - !vhost_log_access_ok(&n->dev)) { - mutex_unlock(&n->dev.mutex); - return -EFAULT; + !vhost_log_access_ok(&n->dev)) + goto out_unlock; + + if ((features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))) { + if (vhost_init_device_iotlb(&n->dev, true)) + goto out_unlock; } + for (i = 0; i < VHOST_NET_VQ_MAX; ++i) { mutex_lock(&n->vqs[i].vq.mutex); n->vqs[i].vq.acked_features = features; @@ -1093,6 +1105,10 @@ static int vhost_net_set_features(struct vhost_net *n, u64 features) } mutex_unlock(&n->dev.mutex); return 0; + +out_unlock: + mutex_unlock(&n->dev.mutex); + return -EFAULT; } static long vhost_net_set_owner(struct vhost_net *n) @@ -1166,9 +1182,40 @@ static long vhost_net_compat_ioctl(struct file *f, unsigned int ioctl, } #endif +static ssize_t vhost_net_chr_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct file *file = iocb->ki_filp; + struct vhost_net *n = file->private_data; + struct vhost_dev *dev = &n->dev; + int noblock = file->f_flags & O_NONBLOCK; + + return vhost_chr_read_iter(dev, to, noblock); +} + +static ssize_t vhost_net_chr_write_iter(struct kiocb *iocb, + struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct vhost_net *n = file->private_data; + struct vhost_dev *dev = &n->dev; + + return vhost_chr_write_iter(dev, from); +} + +static unsigned int vhost_net_chr_poll(struct file *file, poll_table *wait) +{ + struct vhost_net *n = file->private_data; + struct vhost_dev *dev = &n->dev; + + return vhost_chr_poll(file, dev, wait); +} + static const struct file_operations vhost_net_fops = { .owner = THIS_MODULE, .release = vhost_net_release, + .read_iter = vhost_net_chr_read_iter, + .write_iter = vhost_net_chr_write_iter, + .poll = vhost_net_chr_poll, .unlocked_ioctl = vhost_net_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = vhost_net_compat_ioctl, diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 8071f3638db9..d02c1614921f 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -35,6 +35,10 @@ static ushort max_mem_regions = 64; module_param(max_mem_regions, ushort, 0444); MODULE_PARM_DESC(max_mem_regions, "Maximum number of memory regions in memory map. (default: 64)"); +static int max_iotlb_entries = 2048; +module_param(max_iotlb_entries, int, 0444); +MODULE_PARM_DESC(max_iotlb_entries, + "Maximum number of iotlb entries. (default: 2048)"); enum { VHOST_MEMORY_F_LOG = 0x1, @@ -306,6 +310,7 @@ static void vhost_vq_reset(struct vhost_dev *dev, vhost_disable_cross_endian(vq); vq->busyloop_timeout = 0; vq->umem = NULL; + vq->iotlb = NULL; } static int vhost_worker(void *data) @@ -400,9 +405,14 @@ void vhost_dev_init(struct vhost_dev *dev, dev->log_ctx = NULL; dev->log_file = NULL; dev->umem = NULL; + dev->iotlb = NULL; dev->mm = NULL; dev->worker = NULL; init_llist_head(&dev->work_list); + init_waitqueue_head(&dev->wait); + INIT_LIST_HEAD(&dev->read_list); + INIT_LIST_HEAD(&dev->pending_list); + spin_lock_init(&dev->iotlb_lock); for (i = 0; i < dev->nvqs; ++i) { @@ -550,6 +560,15 @@ void vhost_dev_stop(struct vhost_dev *dev) } EXPORT_SYMBOL_GPL(vhost_dev_stop); +static void vhost_umem_free(struct vhost_umem *umem, + struct vhost_umem_node *node) +{ + vhost_umem_interval_tree_remove(node, &umem->umem_tree); + list_del(&node->link); + kfree(node); + umem->numem--; +} + static void vhost_umem_clean(struct vhost_umem *umem) { struct vhost_umem_node *node, *tmp; @@ -557,14 +576,31 @@ static void vhost_umem_clean(struct vhost_umem *umem) if (!umem) return; - list_for_each_entry_safe(node, tmp, &umem->umem_list, link) { - vhost_umem_interval_tree_remove(node, &umem->umem_tree); - list_del(&node->link); - kvfree(node); - } + list_for_each_entry_safe(node, tmp, &umem->umem_list, link) + vhost_umem_free(umem, node); + kvfree(umem); } +static void vhost_clear_msg(struct vhost_dev *dev) +{ + struct vhost_msg_node *node, *n; + + spin_lock(&dev->iotlb_lock); + + list_for_each_entry_safe(node, n, &dev->read_list, node) { + list_del(&node->node); + kfree(node); + } + + list_for_each_entry_safe(node, n, &dev->pending_list, node) { + list_del(&node->node); + kfree(node); + } + + spin_unlock(&dev->iotlb_lock); +} + /* Caller should have device mutex if and only if locked is set */ void vhost_dev_cleanup(struct vhost_dev *dev, bool locked) { @@ -593,6 +629,10 @@ void vhost_dev_cleanup(struct vhost_dev *dev, bool locked) /* No one will access memory at this point */ vhost_umem_clean(dev->umem); dev->umem = NULL; + vhost_umem_clean(dev->iotlb); + dev->iotlb = NULL; + vhost_clear_msg(dev); + wake_up_interruptible_poll(&dev->wait, POLLIN | POLLRDNORM); WARN_ON(!llist_empty(&dev->work_list)); if (dev->worker) { kthread_stop(dev->worker); @@ -668,28 +708,381 @@ static int memory_access_ok(struct vhost_dev *d, struct vhost_umem *umem, return 1; } -#define vhost_put_user(vq, x, ptr) __put_user(x, ptr) +static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len, + struct iovec iov[], int iov_size, int access); static int vhost_copy_to_user(struct vhost_virtqueue *vq, void *to, const void *from, unsigned size) { - return __copy_to_user(to, from, size); -} + int ret; -#define vhost_get_user(vq, x, ptr) __get_user(x, ptr) + if (!vq->iotlb) + return __copy_to_user(to, from, size); + else { + /* This function should be called after iotlb + * prefetch, which means we're sure that all vq + * could be access through iotlb. So -EAGAIN should + * not happen in this case. + */ + /* TODO: more fast path */ + struct iov_iter t; + ret = translate_desc(vq, (u64)(uintptr_t)to, size, vq->iotlb_iov, + ARRAY_SIZE(vq->iotlb_iov), + VHOST_ACCESS_WO); + if (ret < 0) + goto out; + iov_iter_init(&t, WRITE, vq->iotlb_iov, ret, size); + ret = copy_to_iter(from, size, &t); + if (ret == size) + ret = 0; + } +out: + return ret; +} static int vhost_copy_from_user(struct vhost_virtqueue *vq, void *to, void *from, unsigned size) { - return __copy_from_user(to, from, size); + int ret; + + if (!vq->iotlb) + return __copy_from_user(to, from, size); + else { + /* This function should be called after iotlb + * prefetch, which means we're sure that vq + * could be access through iotlb. So -EAGAIN should + * not happen in this case. + */ + /* TODO: more fast path */ + struct iov_iter f; + ret = translate_desc(vq, (u64)(uintptr_t)from, size, vq->iotlb_iov, + ARRAY_SIZE(vq->iotlb_iov), + VHOST_ACCESS_RO); + if (ret < 0) { + vq_err(vq, "IOTLB translation failure: uaddr " + "%p size 0x%llx\n", from, + (unsigned long long) size); + goto out; + } + iov_iter_init(&f, READ, vq->iotlb_iov, ret, size); + ret = copy_from_iter(to, size, &f); + if (ret == size) + ret = 0; + } + +out: + return ret; +} + +static void __user *__vhost_get_user(struct vhost_virtqueue *vq, + void *addr, unsigned size) +{ + int ret; + + /* This function should be called after iotlb + * prefetch, which means we're sure that vq + * could be access through iotlb. So -EAGAIN should + * not happen in this case. + */ + /* TODO: more fast path */ + ret = translate_desc(vq, (u64)(uintptr_t)addr, size, vq->iotlb_iov, + ARRAY_SIZE(vq->iotlb_iov), + VHOST_ACCESS_RO); + if (ret < 0) { + vq_err(vq, "IOTLB translation failure: uaddr " + "%p size 0x%llx\n", addr, + (unsigned long long) size); + return NULL; + } + + if (ret != 1 || vq->iotlb_iov[0].iov_len != size) { + vq_err(vq, "Non atomic userspace memory access: uaddr " + "%p size 0x%llx\n", addr, + (unsigned long long) size); + return NULL; + } + + return vq->iotlb_iov[0].iov_base; +} + +#define vhost_put_user(vq, x, ptr) \ +({ \ + int ret = -EFAULT; \ + if (!vq->iotlb) { \ + ret = __put_user(x, ptr); \ + } else { \ + __typeof__(ptr) to = \ + (__typeof__(ptr)) __vhost_get_user(vq, ptr, sizeof(*ptr)); \ + if (to != NULL) \ + ret = __put_user(x, to); \ + else \ + ret = -EFAULT; \ + } \ + ret; \ +}) + +#define vhost_get_user(vq, x, ptr) \ +({ \ + int ret; \ + if (!vq->iotlb) { \ + ret = __get_user(x, ptr); \ + } else { \ + __typeof__(ptr) from = \ + (__typeof__(ptr)) __vhost_get_user(vq, ptr, sizeof(*ptr)); \ + if (from != NULL) \ + ret = __get_user(x, from); \ + else \ + ret = -EFAULT; \ + } \ + ret; \ +}) + +static void vhost_dev_lock_vqs(struct vhost_dev *d) +{ + int i = 0; + for (i = 0; i < d->nvqs; ++i) + mutex_lock(&d->vqs[i]->mutex); +} + +static void vhost_dev_unlock_vqs(struct vhost_dev *d) +{ + int i = 0; + for (i = 0; i < d->nvqs; ++i) + mutex_unlock(&d->vqs[i]->mutex); +} + +static int vhost_new_umem_range(struct vhost_umem *umem, + u64 start, u64 size, u64 end, + u64 userspace_addr, int perm) +{ + struct vhost_umem_node *tmp, *node = kmalloc(sizeof(*node), GFP_ATOMIC); + + if (!node) + return -ENOMEM; + + if (umem->numem == max_iotlb_entries) { + tmp = list_first_entry(&umem->umem_list, typeof(*tmp), link); + vhost_umem_free(umem, tmp); + } + + node->start = start; + node->size = size; + node->last = end; + node->userspace_addr = userspace_addr; + node->perm = perm; + INIT_LIST_HEAD(&node->link); + list_add_tail(&node->link, &umem->umem_list); + vhost_umem_interval_tree_insert(node, &umem->umem_tree); + umem->numem++; + + return 0; +} + +static void vhost_del_umem_range(struct vhost_umem *umem, + u64 start, u64 end) +{ + struct vhost_umem_node *node; + + while ((node = vhost_umem_interval_tree_iter_first(&umem->umem_tree, + start, end))) + vhost_umem_free(umem, node); +} + +static void vhost_iotlb_notify_vq(struct vhost_dev *d, + struct vhost_iotlb_msg *msg) +{ + struct vhost_msg_node *node, *n; + + spin_lock(&d->iotlb_lock); + + list_for_each_entry_safe(node, n, &d->pending_list, node) { + struct vhost_iotlb_msg *vq_msg = &node->msg.iotlb; + if (msg->iova <= vq_msg->iova && + msg->iova + msg->size - 1 > vq_msg->iova && + vq_msg->type == VHOST_IOTLB_MISS) { + vhost_poll_queue(&node->vq->poll); + list_del(&node->node); + kfree(node); + } + } + + spin_unlock(&d->iotlb_lock); +} + +static int umem_access_ok(u64 uaddr, u64 size, int access) +{ + unsigned long a = uaddr; + + if ((access & VHOST_ACCESS_RO) && + !access_ok(VERIFY_READ, (void __user *)a, size)) + return -EFAULT; + if ((access & VHOST_ACCESS_WO) && + !access_ok(VERIFY_WRITE, (void __user *)a, size)) + return -EFAULT; + return 0; +} + +int vhost_process_iotlb_msg(struct vhost_dev *dev, + struct vhost_iotlb_msg *msg) +{ + int ret = 0; + + vhost_dev_lock_vqs(dev); + switch (msg->type) { + case VHOST_IOTLB_UPDATE: + if (!dev->iotlb) { + ret = -EFAULT; + break; + } + if (umem_access_ok(msg->uaddr, msg->size, msg->perm)) { + ret = -EFAULT; + break; + } + if (vhost_new_umem_range(dev->iotlb, msg->iova, msg->size, + msg->iova + msg->size - 1, + msg->uaddr, msg->perm)) { + ret = -ENOMEM; + break; + } + vhost_iotlb_notify_vq(dev, msg); + break; + case VHOST_IOTLB_INVALIDATE: + vhost_del_umem_range(dev->iotlb, msg->iova, + msg->iova + msg->size - 1); + break; + default: + ret = -EINVAL; + break; + } + + vhost_dev_unlock_vqs(dev); + return ret; +} +ssize_t vhost_chr_write_iter(struct vhost_dev *dev, + struct iov_iter *from) +{ + struct vhost_msg_node node; + unsigned size = sizeof(struct vhost_msg); + size_t ret; + int err; + + if (iov_iter_count(from) < size) + return 0; + ret = copy_from_iter(&node.msg, size, from); + if (ret != size) + goto done; + + switch (node.msg.type) { + case VHOST_IOTLB_MSG: + err = vhost_process_iotlb_msg(dev, &node.msg.iotlb); + if (err) + ret = err; + break; + default: + ret = -EINVAL; + break; + } + +done: + return ret; +} +EXPORT_SYMBOL(vhost_chr_write_iter); + +unsigned int vhost_chr_poll(struct file *file, struct vhost_dev *dev, + poll_table *wait) +{ + unsigned int mask = 0; + + poll_wait(file, &dev->wait, wait); + + if (!list_empty(&dev->read_list)) + mask |= POLLIN | POLLRDNORM; + + return mask; +} +EXPORT_SYMBOL(vhost_chr_poll); + +ssize_t vhost_chr_read_iter(struct vhost_dev *dev, struct iov_iter *to, + int noblock) +{ + DEFINE_WAIT(wait); + struct vhost_msg_node *node; + ssize_t ret = 0; + unsigned size = sizeof(struct vhost_msg); + + if (iov_iter_count(to) < size) + return 0; + + while (1) { + if (!noblock) + prepare_to_wait(&dev->wait, &wait, + TASK_INTERRUPTIBLE); + + node = vhost_dequeue_msg(dev, &dev->read_list); + if (node) + break; + if (noblock) { + ret = -EAGAIN; + break; + } + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + if (!dev->iotlb) { + ret = -EBADFD; + break; + } + + schedule(); + } + + if (!noblock) + finish_wait(&dev->wait, &wait); + + if (node) { + ret = copy_to_iter(&node->msg, size, to); + + if (ret != size || node->msg.type != VHOST_IOTLB_MISS) { + kfree(node); + return ret; + } + + vhost_enqueue_msg(dev, &dev->pending_list, node); + } + + return ret; +} +EXPORT_SYMBOL_GPL(vhost_chr_read_iter); + +static int vhost_iotlb_miss(struct vhost_virtqueue *vq, u64 iova, int access) +{ + struct vhost_dev *dev = vq->dev; + struct vhost_msg_node *node; + struct vhost_iotlb_msg *msg; + + node = vhost_new_msg(vq, VHOST_IOTLB_MISS); + if (!node) + return -ENOMEM; + + msg = &node->msg.iotlb; + msg->type = VHOST_IOTLB_MISS; + msg->iova = iova; + msg->perm = access; + + vhost_enqueue_msg(dev, &dev->read_list, node); + + return 0; } static int vq_access_ok(struct vhost_virtqueue *vq, unsigned int num, struct vring_desc __user *desc, struct vring_avail __user *avail, struct vring_used __user *used) + { size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0; + return access_ok(VERIFY_READ, desc, num * sizeof *desc) && access_ok(VERIFY_READ, avail, sizeof *avail + num * sizeof *avail->ring + s) && @@ -697,6 +1090,54 @@ static int vq_access_ok(struct vhost_virtqueue *vq, unsigned int num, sizeof *used + num * sizeof *used->ring + s); } +static int iotlb_access_ok(struct vhost_virtqueue *vq, + int access, u64 addr, u64 len) +{ + const struct vhost_umem_node *node; + struct vhost_umem *umem = vq->iotlb; + u64 s = 0, size; + + while (len > s) { + node = vhost_umem_interval_tree_iter_first(&umem->umem_tree, + addr, + addr + len - 1); + if (node == NULL || node->start > addr) { + vhost_iotlb_miss(vq, addr, access); + return false; + } else if (!(node->perm & access)) { + /* Report the possible access violation by + * request another translation from userspace. + */ + return false; + } + + size = node->size - addr + node->start; + s += size; + addr += size; + } + + return true; +} + +int vq_iotlb_prefetch(struct vhost_virtqueue *vq) +{ + size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0; + unsigned int num = vq->num; + + if (!vq->iotlb) + return 1; + + return iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->desc, + num * sizeof *vq->desc) && + iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->avail, + sizeof *vq->avail + + num * sizeof *vq->avail->ring + s) && + iotlb_access_ok(vq, VHOST_ACCESS_WO, (u64)(uintptr_t)vq->used, + sizeof *vq->used + + num * sizeof *vq->used->ring + s); +} +EXPORT_SYMBOL_GPL(vq_iotlb_prefetch); + /* Can we log writes? */ /* Caller should have device mutex but not vq mutex */ int vhost_log_access_ok(struct vhost_dev *dev) @@ -723,16 +1164,35 @@ static int vq_log_access_ok(struct vhost_virtqueue *vq, /* Caller should have vq mutex and device mutex */ int vhost_vq_access_ok(struct vhost_virtqueue *vq) { + if (vq->iotlb) { + /* When device IOTLB was used, the access validation + * will be validated during prefetching. + */ + return 1; + } return vq_access_ok(vq, vq->num, vq->desc, vq->avail, vq->used) && vq_log_access_ok(vq, vq->log_base); } EXPORT_SYMBOL_GPL(vhost_vq_access_ok); +static struct vhost_umem *vhost_umem_alloc(void) +{ + struct vhost_umem *umem = vhost_kvzalloc(sizeof(*umem)); + + if (!umem) + return NULL; + + umem->umem_tree = RB_ROOT; + umem->numem = 0; + INIT_LIST_HEAD(&umem->umem_list); + + return umem; +} + static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m) { struct vhost_memory mem, *newmem; struct vhost_memory_region *region; - struct vhost_umem_node *node; struct vhost_umem *newumem, *oldumem; unsigned long size = offsetof(struct vhost_memory, regions); int i; @@ -754,28 +1214,23 @@ static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m) return -EFAULT; } - newumem = vhost_kvzalloc(sizeof(*newumem)); + newumem = vhost_umem_alloc(); if (!newumem) { kvfree(newmem); return -ENOMEM; } - newumem->umem_tree = RB_ROOT; - INIT_LIST_HEAD(&newumem->umem_list); - for (region = newmem->regions; region < newmem->regions + mem.nregions; region++) { - node = vhost_kvzalloc(sizeof(*node)); - if (!node) + if (vhost_new_umem_range(newumem, + region->guest_phys_addr, + region->memory_size, + region->guest_phys_addr + + region->memory_size - 1, + region->userspace_addr, + VHOST_ACCESS_RW)) goto err; - node->start = region->guest_phys_addr; - node->size = region->memory_size; - node->last = node->start + node->size - 1; - node->userspace_addr = region->userspace_addr; - INIT_LIST_HEAD(&node->link); - list_add_tail(&node->link, &newumem->umem_list); - vhost_umem_interval_tree_insert(node, &newumem->umem_tree); } if (!memory_access_ok(d, newumem, 0)) @@ -1019,6 +1474,30 @@ long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, void __user *argp) } EXPORT_SYMBOL_GPL(vhost_vring_ioctl); +int vhost_init_device_iotlb(struct vhost_dev *d, bool enabled) +{ + struct vhost_umem *niotlb, *oiotlb; + int i; + + niotlb = vhost_umem_alloc(); + if (!niotlb) + return -ENOMEM; + + oiotlb = d->iotlb; + d->iotlb = niotlb; + + for (i = 0; i < d->nvqs; ++i) { + mutex_lock(&d->vqs[i]->mutex); + d->vqs[i]->iotlb = niotlb; + mutex_unlock(&d->vqs[i]->mutex); + } + + vhost_umem_clean(oiotlb); + + return 0; +} +EXPORT_SYMBOL_GPL(vhost_init_device_iotlb); + /* Caller must have device mutex */ long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp) { @@ -1233,15 +1712,20 @@ int vhost_vq_init_access(struct vhost_virtqueue *vq) if (r) goto err; vq->signalled_used_valid = false; - if (!access_ok(VERIFY_READ, &vq->used->idx, sizeof vq->used->idx)) { + if (!vq->iotlb && + !access_ok(VERIFY_READ, &vq->used->idx, sizeof vq->used->idx)) { r = -EFAULT; goto err; } r = vhost_get_user(vq, last_used_idx, &vq->used->idx); - if (r) + if (r) { + vq_err(vq, "Can't access used idx at %p\n", + &vq->used->idx); goto err; + } vq->last_used_idx = vhost16_to_cpu(vq, last_used_idx); return 0; + err: vq->is_le = is_le; return r; @@ -1249,10 +1733,11 @@ err: EXPORT_SYMBOL_GPL(vhost_vq_init_access); static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len, - struct iovec iov[], int iov_size) + struct iovec iov[], int iov_size, int access) { const struct vhost_umem_node *node; - struct vhost_umem *umem = vq->umem; + struct vhost_dev *dev = vq->dev; + struct vhost_umem *umem = dev->iotlb ? dev->iotlb : dev->umem; struct iovec *_iov; u64 s = 0; int ret = 0; @@ -1263,12 +1748,21 @@ static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len, ret = -ENOBUFS; break; } + node = vhost_umem_interval_tree_iter_first(&umem->umem_tree, addr, addr + len - 1); if (node == NULL || node->start > addr) { - ret = -EFAULT; + if (umem != dev->iotlb) { + ret = -EFAULT; + break; + } + ret = -EAGAIN; + break; + } else if (!(node->perm & access)) { + ret = -EPERM; break; } + _iov = iov + ret; size = node->size - addr + node->start; _iov->iov_len = min((u64)len - s, size); @@ -1279,6 +1773,8 @@ static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len, ++ret; } + if (ret == -EAGAIN) + vhost_iotlb_miss(vq, addr, access); return ret; } @@ -1313,7 +1809,7 @@ static int get_indirect(struct vhost_virtqueue *vq, unsigned int i = 0, count, found = 0; u32 len = vhost32_to_cpu(vq, indirect->len); struct iov_iter from; - int ret; + int ret, access; /* Sanity check */ if (unlikely(len % sizeof desc)) { @@ -1325,9 +1821,10 @@ static int get_indirect(struct vhost_virtqueue *vq, } ret = translate_desc(vq, vhost64_to_cpu(vq, indirect->addr), len, vq->indirect, - UIO_MAXIOV); + UIO_MAXIOV, VHOST_ACCESS_RO); if (unlikely(ret < 0)) { - vq_err(vq, "Translation failure %d in indirect.\n", ret); + if (ret != -EAGAIN) + vq_err(vq, "Translation failure %d in indirect.\n", ret); return ret; } iov_iter_init(&from, READ, vq->indirect, ret, len); @@ -1365,16 +1862,22 @@ static int get_indirect(struct vhost_virtqueue *vq, return -EINVAL; } + if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE)) + access = VHOST_ACCESS_WO; + else + access = VHOST_ACCESS_RO; + ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr), vhost32_to_cpu(vq, desc.len), iov + iov_count, - iov_size - iov_count); + iov_size - iov_count, access); if (unlikely(ret < 0)) { - vq_err(vq, "Translation failure %d indirect idx %d\n", - ret, i); + if (ret != -EAGAIN) + vq_err(vq, "Translation failure %d indirect idx %d\n", + ret, i); return ret; } /* If this is an input descriptor, increment that count. */ - if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE)) { + if (access == VHOST_ACCESS_WO) { *in_num += ret; if (unlikely(log)) { log[*log_num].addr = vhost64_to_cpu(vq, desc.addr); @@ -1413,7 +1916,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, u16 last_avail_idx; __virtio16 avail_idx; __virtio16 ring_head; - int ret; + int ret, access; /* Check it isn't doing very strange things with descriptor numbers. */ last_avail_idx = vq->last_avail_idx; @@ -1487,22 +1990,28 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, out_num, in_num, log, log_num, &desc); if (unlikely(ret < 0)) { - vq_err(vq, "Failure detected " - "in indirect descriptor at idx %d\n", i); + if (ret != -EAGAIN) + vq_err(vq, "Failure detected " + "in indirect descriptor at idx %d\n", i); return ret; } continue; } + if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE)) + access = VHOST_ACCESS_WO; + else + access = VHOST_ACCESS_RO; ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr), vhost32_to_cpu(vq, desc.len), iov + iov_count, - iov_size - iov_count); + iov_size - iov_count, access); if (unlikely(ret < 0)) { - vq_err(vq, "Translation failure %d descriptor idx %d\n", - ret, i); + if (ret != -EAGAIN) + vq_err(vq, "Translation failure %d descriptor idx %d\n", + ret, i); return ret; } - if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE)) { + if (access == VHOST_ACCESS_WO) { /* If this is an input descriptor, * increment that count. */ *in_num += ret; @@ -1768,6 +2277,47 @@ void vhost_disable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq) } EXPORT_SYMBOL_GPL(vhost_disable_notify); +/* Create a new message. */ +struct vhost_msg_node *vhost_new_msg(struct vhost_virtqueue *vq, int type) +{ + struct vhost_msg_node *node = kmalloc(sizeof *node, GFP_KERNEL); + if (!node) + return NULL; + node->vq = vq; + node->msg.type = type; + return node; +} +EXPORT_SYMBOL_GPL(vhost_new_msg); + +void vhost_enqueue_msg(struct vhost_dev *dev, struct list_head *head, + struct vhost_msg_node *node) +{ + spin_lock(&dev->iotlb_lock); + list_add_tail(&node->node, head); + spin_unlock(&dev->iotlb_lock); + + wake_up_interruptible_poll(&dev->wait, POLLIN | POLLRDNORM); +} +EXPORT_SYMBOL_GPL(vhost_enqueue_msg); + +struct vhost_msg_node *vhost_dequeue_msg(struct vhost_dev *dev, + struct list_head *head) +{ + struct vhost_msg_node *node = NULL; + + spin_lock(&dev->iotlb_lock); + if (!list_empty(head)) { + node = list_first_entry(head, struct vhost_msg_node, + node); + list_del(&node->node); + } + spin_unlock(&dev->iotlb_lock); + + return node; +} +EXPORT_SYMBOL_GPL(vhost_dequeue_msg); + + static int __init vhost_init(void) { return 0; diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index eaaf6df72218..78f3c5fc02e4 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -65,13 +65,15 @@ struct vhost_umem_node { __u64 last; __u64 size; __u64 userspace_addr; - __u64 flags_padding; + __u32 perm; + __u32 flags_padding; __u64 __subtree_last; }; struct vhost_umem { struct rb_root umem_tree; struct list_head umem_list; + int numem; }; /* The virtqueue structure describes a queue attached to a device. */ @@ -119,10 +121,12 @@ struct vhost_virtqueue { u64 log_addr; struct iovec iov[UIO_MAXIOV]; + struct iovec iotlb_iov[64]; struct iovec *indirect; struct vring_used_elem *heads; /* Protected by virtqueue mutex. */ struct vhost_umem *umem; + struct vhost_umem *iotlb; void *private_data; u64 acked_features; /* Log write descriptors */ @@ -139,6 +143,12 @@ struct vhost_virtqueue { u32 busyloop_timeout; }; +struct vhost_msg_node { + struct vhost_msg msg; + struct vhost_virtqueue *vq; + struct list_head node; +}; + struct vhost_dev { struct mm_struct *mm; struct mutex mutex; @@ -149,6 +159,11 @@ struct vhost_dev { struct llist_head work_list; struct task_struct *worker; struct vhost_umem *umem; + struct vhost_umem *iotlb; + spinlock_t iotlb_lock; + struct list_head read_list; + struct list_head pending_list; + wait_queue_head_t wait; }; void vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue **vqs, int nvqs); @@ -185,6 +200,21 @@ bool vhost_enable_notify(struct vhost_dev *, struct vhost_virtqueue *); int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log, unsigned int log_num, u64 len); +int vq_iotlb_prefetch(struct vhost_virtqueue *vq); + +struct vhost_msg_node *vhost_new_msg(struct vhost_virtqueue *vq, int type); +void vhost_enqueue_msg(struct vhost_dev *dev, + struct list_head *head, + struct vhost_msg_node *node); +struct vhost_msg_node *vhost_dequeue_msg(struct vhost_dev *dev, + struct list_head *head); +unsigned int vhost_chr_poll(struct file *file, struct vhost_dev *dev, + poll_table *wait); +ssize_t vhost_chr_read_iter(struct vhost_dev *dev, struct iov_iter *to, + int noblock); +ssize_t vhost_chr_write_iter(struct vhost_dev *dev, + struct iov_iter *from); +int vhost_init_device_iotlb(struct vhost_dev *d, bool enabled); #define vq_err(vq, fmt, ...) do { \ pr_debug(pr_fmt(fmt), ##__VA_ARGS__); \ diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index c4400b267716..56b7ab584cc0 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -47,6 +47,32 @@ struct vhost_vring_addr { __u64 log_guest_addr; }; +/* no alignment requirement */ +struct vhost_iotlb_msg { + __u64 iova; + __u64 size; + __u64 uaddr; +#define VHOST_ACCESS_RO 0x1 +#define VHOST_ACCESS_WO 0x2 +#define VHOST_ACCESS_RW 0x3 + __u8 perm; +#define VHOST_IOTLB_MISS 1 +#define VHOST_IOTLB_UPDATE 2 +#define VHOST_IOTLB_INVALIDATE 3 +#define VHOST_IOTLB_ACCESS_FAIL 4 + __u8 type; +}; + +#define VHOST_IOTLB_MSG 0x1 + +struct vhost_msg { + int type; + union { + struct vhost_iotlb_msg iotlb; + __u8 padding[64]; + }; +}; + struct vhost_memory_region { __u64 guest_phys_addr; __u64 memory_size; /* bytes */ @@ -146,6 +172,8 @@ struct vhost_memory { #define VHOST_F_LOG_ALL 26 /* vhost-net should add virtio_net_hdr for RX, and strip for TX packets. */ #define VHOST_NET_F_VIRTIO_NET_HDR 27 +/* Vhost have device IOTLB */ +#define VHOST_F_DEVICE_IOTLB 63 /* VHOST_SCSI specific definitions */ -- cgit v1.2.3-71-gd317 From db3f60012482756f46cc4d7d9ad7d793ae30360c Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 2 Aug 2016 14:03:36 -0700 Subject: uapi: move forward declarations of internal structures Don't user forward declarations of internal kernel structures in headers exported to userspace. Move "struct completion;". Move "struct task_struct;". Link: http://lkml.kernel.org/r/20160713215808.GA22486@p183.telecom.by Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/capability.h | 1 + include/linux/sysctl.h | 1 + include/uapi/linux/capability.h | 2 -- include/uapi/linux/sysctl.h | 2 -- 4 files changed, 2 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/capability.h b/include/linux/capability.h index 5f3c63dde2d5..dbc21c719ce6 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -38,6 +38,7 @@ struct cpu_vfs_cap_data { struct file; struct inode; struct dentry; +struct task_struct; struct user_namespace; extern const kernel_cap_t __cap_empty_set; diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index fa7bc29925c9..697e160c78d0 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -28,6 +28,7 @@ #include /* For the /proc/sys support */ +struct completion; struct ctl_table; struct nsproxy; struct ctl_table_root; diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h index 12c37a197d24..49bc06295398 100644 --- a/include/uapi/linux/capability.h +++ b/include/uapi/linux/capability.h @@ -15,8 +15,6 @@ #include -struct task_struct; - /* User-level do most of the mapping between kernel and user capabilities based on the version tag given by the kernel. The kernel might be somewhat backwards compatible, but don't bet on diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h index 0956373b56db..d2b12152e358 100644 --- a/include/uapi/linux/sysctl.h +++ b/include/uapi/linux/sysctl.h @@ -26,8 +26,6 @@ #include #include -struct completion; - #define CTL_MAXNAME 10 /* how many path components do we allow in a call to sysctl? In other words, what is the largest acceptable value for the nlen -- cgit v1.2.3-71-gd317 From e63e88bc53bac7e4c3f592f8126c51a7569be673 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Tue, 2 Aug 2016 14:05:30 -0700 Subject: nilfs2: move ioctl interface and disk layout to uapi separately The header file "include/linux/nilfs2_fs.h" is composed of parts for ioctl and disk format, and both are intended to be shared with user space programs. This moves them to the uapi directory "include/uapi/linux" splitting the file to "nilfs2_api.h" and "nilfs2_ondisk.h". The following minor changes are accompanied by this migration: - nilfs_direct_node struct in nilfs2/direct.h is converged to nilfs2_ondisk.h because it's an on-disk structure. - inline functions nilfs_rec_len_from_disk() and nilfs_rec_len_to_disk() are moved to nilfs2/dir.c. Link: http://lkml.kernel.org/r/1465825507-3407-4-git-send-email-konishi.ryusuke@lab.ntt.co.jp Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/nilfs2.txt | 3 +- Documentation/ioctl/ioctl-number.txt | 2 +- MAINTAINERS | 3 +- fs/nilfs2/bmap.h | 2 +- fs/nilfs2/btree.h | 2 +- fs/nilfs2/cpfile.c | 1 - fs/nilfs2/cpfile.h | 3 +- fs/nilfs2/dat.h | 1 + fs/nilfs2/dir.c | 22 + fs/nilfs2/direct.h | 10 - fs/nilfs2/ifile.h | 1 - fs/nilfs2/ioctl.c | 1 - fs/nilfs2/nilfs.h | 3 +- fs/nilfs2/segment.h | 1 - fs/nilfs2/sufile.c | 1 - fs/nilfs2/sufile.h | 1 - include/linux/nilfs2_fs.h | 934 ----------------------------------- include/uapi/linux/nilfs2_api.h | 292 +++++++++++ include/uapi/linux/nilfs2_ondisk.h | 650 ++++++++++++++++++++++++ 19 files changed, 976 insertions(+), 957 deletions(-) delete mode 100644 include/linux/nilfs2_fs.h create mode 100644 include/uapi/linux/nilfs2_api.h create mode 100644 include/uapi/linux/nilfs2_ondisk.h (limited to 'include/uapi') diff --git a/Documentation/filesystems/nilfs2.txt b/Documentation/filesystems/nilfs2.txt index 5b21ef76f751..c0727dc36271 100644 --- a/Documentation/filesystems/nilfs2.txt +++ b/Documentation/filesystems/nilfs2.txt @@ -267,7 +267,8 @@ among NILFS2 files can be depicted as follows: `-- file (ino=yy) ( regular file, directory, or symlink ) -For detail on the format of each file, please see include/linux/nilfs2_fs.h. +For detail on the format of each file, please see nilfs2_ondisk.h +located at include/uapi/linux directory. There are no patents or other intellectual property that we protect with regard to the design of NILFS2. It is allowed to replicate the diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt index 56af5e43e9c0..81c7f2bb7daf 100644 --- a/Documentation/ioctl/ioctl-number.txt +++ b/Documentation/ioctl/ioctl-number.txt @@ -248,7 +248,7 @@ Code Seq#(hex) Include File Comments 'm' 00 drivers/scsi/megaraid/megaraid_ioctl.h conflict! 'm' 00-1F net/irda/irmod.h conflict! 'n' 00-7F linux/ncp_fs.h and fs/ncpfs/ioctl.c -'n' 80-8F linux/nilfs2_fs.h NILFS2 +'n' 80-8F uapi/linux/nilfs2_api.h NILFS2 'n' E0-FF linux/matroxfb.h matroxfb 'o' 00-1F fs/ocfs2/ocfs2_fs.h OCFS2 'o' 00-03 mtd/ubi-user.h conflict! (OCFS2 and UBI overlaps) diff --git a/MAINTAINERS b/MAINTAINERS index bb51bbbc9e1d..e9eacacf0f08 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8258,8 +8258,9 @@ T: git git://github.com/konis/nilfs2.git S: Supported F: Documentation/filesystems/nilfs2.txt F: fs/nilfs2/ -F: include/linux/nilfs2_fs.h F: include/trace/events/nilfs2.h +F: include/uapi/linux/nilfs2_api.h +F: include/uapi/linux/nilfs2_ondisk.h NINJA SCSI-3 / NINJA SCSI-32Bi (16bit/CardBus) PCMCIA SCSI HOST ADAPTER DRIVER M: YOKOTA Hiroshi diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h index b6a4c8f93ac8..2b6ffbe5997a 100644 --- a/fs/nilfs2/bmap.h +++ b/fs/nilfs2/bmap.h @@ -22,7 +22,7 @@ #include #include #include -#include +#include /* nilfs_binfo, nilfs_inode, etc */ #include "alloc.h" #include "dat.h" diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h index df1a25faa83b..2184e47fa4bf 100644 --- a/fs/nilfs2/btree.h +++ b/fs/nilfs2/btree.h @@ -22,7 +22,7 @@ #include #include #include -#include +#include /* nilfs_btree_node */ #include "btnode.h" #include "bmap.h" diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index 19d9f4ae8347..a15a1601e931 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -21,7 +21,6 @@ #include #include #include -#include #include "mdt.h" #include "cpfile.h" diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h index 0249744ae234..6eca972f9673 100644 --- a/fs/nilfs2/cpfile.h +++ b/fs/nilfs2/cpfile.h @@ -21,7 +21,8 @@ #include #include -#include +#include /* nilfs_cpstat */ +#include /* nilfs_inode, nilfs_checkpoint */ int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int, diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h index abbfdabcabea..57dc6cf466d0 100644 --- a/fs/nilfs2/dat.h +++ b/fs/nilfs2/dat.h @@ -22,6 +22,7 @@ #include #include #include +#include /* nilfs_inode, nilfs_checkpoint */ struct nilfs_palloc_req; diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 746956d2937a..908ebbf0ac7e 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -42,6 +42,28 @@ #include "nilfs.h" #include "page.h" +static inline unsigned int nilfs_rec_len_from_disk(__le16 dlen) +{ + unsigned int len = le16_to_cpu(dlen); + +#if (PAGE_SIZE >= 65536) + if (len == NILFS_MAX_REC_LEN) + return 1 << 16; +#endif + return len; +} + +static inline __le16 nilfs_rec_len_to_disk(unsigned int len) +{ +#if (PAGE_SIZE >= 65536) + if (len == (1 << 16)) + return cpu_to_le16(NILFS_MAX_REC_LEN); + + BUG_ON(len > (1 << 16)); +#endif + return cpu_to_le16(len); +} + /* * nilfs uses block-sized chunks. Arguably, sector-sized ones would be * more robust, but we have what we have diff --git a/fs/nilfs2/direct.h b/fs/nilfs2/direct.h index 3015a6e78724..cfe85e848bba 100644 --- a/fs/nilfs2/direct.h +++ b/fs/nilfs2/direct.h @@ -24,16 +24,6 @@ #include "bmap.h" -/** - * struct nilfs_direct_node - direct node - * @dn_flags: flags - * @dn_pad: padding - */ -struct nilfs_direct_node { - __u8 dn_flags; - __u8 pad[7]; -}; - #define NILFS_DIRECT_NBLOCKS (NILFS_BMAP_SIZE / sizeof(__le64) - 1) #define NILFS_DIRECT_KEY_MIN 0 #define NILFS_DIRECT_KEY_MAX (NILFS_DIRECT_NBLOCKS - 1) diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h index 23ad2f091e76..188b94fe0ec5 100644 --- a/fs/nilfs2/ifile.h +++ b/fs/nilfs2/ifile.h @@ -23,7 +23,6 @@ #include #include -#include #include "mdt.h" #include "alloc.h" diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 827283fe9525..f1d7989459fd 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -25,7 +25,6 @@ #include /* compat_ptr() */ #include /* mnt_want_write_file(), mnt_drop_write_file() */ #include -#include #include "nilfs.h" #include "segment.h" #include "bmap.h" diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 2ba8a146af1f..33f8c8fc96e8 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -23,7 +23,8 @@ #include #include #include -#include +#include +#include #include "the_nilfs.h" #include "bmap.h" diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h index 6565c10b7b76..1060949d7dd2 100644 --- a/fs/nilfs2/segment.h +++ b/fs/nilfs2/segment.h @@ -23,7 +23,6 @@ #include #include #include -#include #include "nilfs.h" struct nilfs_root; diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 12d11de93602..1541a1e9221a 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -22,7 +22,6 @@ #include #include #include -#include #include "mdt.h" #include "sufile.h" diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h index 46e89872294c..158a9190c8ec 100644 --- a/fs/nilfs2/sufile.h +++ b/fs/nilfs2/sufile.h @@ -21,7 +21,6 @@ #include #include -#include #include "mdt.h" diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h deleted file mode 100644 index 5988dd57ba66..000000000000 --- a/include/linux/nilfs2_fs.h +++ /dev/null @@ -1,934 +0,0 @@ -/* - * nilfs2_fs.h - NILFS2 on-disk structures and common declarations. - * - * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * Written by Koji Sato and Ryusuke Konishi. - */ -/* - * linux/include/linux/ext2_fs.h - * - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * - * from - * - * linux/include/linux/minix_fs.h - * - * Copyright (C) 1991, 1992 Linus Torvalds - */ - -#ifndef _LINUX_NILFS_FS_H -#define _LINUX_NILFS_FS_H - -#include -#include -#include -#include - - -#define NILFS_INODE_BMAP_SIZE 7 -/** - * struct nilfs_inode - structure of an inode on disk - * @i_blocks: blocks count - * @i_size: size in bytes - * @i_ctime: creation time (seconds) - * @i_mtime: modification time (seconds) - * @i_ctime_nsec: creation time (nano seconds) - * @i_mtime_nsec: modification time (nano seconds) - * @i_uid: user id - * @i_gid: group id - * @i_mode: file mode - * @i_links_count: links count - * @i_flags: file flags - * @i_bmap: block mapping - * @i_xattr: extended attributes - * @i_generation: file generation (for NFS) - * @i_pad: padding - */ -struct nilfs_inode { - __le64 i_blocks; - __le64 i_size; - __le64 i_ctime; - __le64 i_mtime; - __le32 i_ctime_nsec; - __le32 i_mtime_nsec; - __le32 i_uid; - __le32 i_gid; - __le16 i_mode; - __le16 i_links_count; - __le32 i_flags; - __le64 i_bmap[NILFS_INODE_BMAP_SIZE]; -#define i_device_code i_bmap[0] - __le64 i_xattr; - __le32 i_generation; - __le32 i_pad; -}; - -#define NILFS_MIN_INODE_SIZE 128 - -/** - * struct nilfs_super_root - structure of super root - * @sr_sum: check sum - * @sr_bytes: byte count of the structure - * @sr_flags: flags (reserved) - * @sr_nongc_ctime: write time of the last segment not for cleaner operation - * @sr_dat: DAT file inode - * @sr_cpfile: checkpoint file inode - * @sr_sufile: segment usage file inode - */ -struct nilfs_super_root { - __le32 sr_sum; - __le16 sr_bytes; - __le16 sr_flags; - __le64 sr_nongc_ctime; - struct nilfs_inode sr_dat; - struct nilfs_inode sr_cpfile; - struct nilfs_inode sr_sufile; -}; - -#define NILFS_SR_MDT_OFFSET(inode_size, i) \ - ((unsigned long)&((struct nilfs_super_root *)0)->sr_dat + \ - (inode_size) * (i)) -#define NILFS_SR_DAT_OFFSET(inode_size) NILFS_SR_MDT_OFFSET(inode_size, 0) -#define NILFS_SR_CPFILE_OFFSET(inode_size) NILFS_SR_MDT_OFFSET(inode_size, 1) -#define NILFS_SR_SUFILE_OFFSET(inode_size) NILFS_SR_MDT_OFFSET(inode_size, 2) -#define NILFS_SR_BYTES(inode_size) NILFS_SR_MDT_OFFSET(inode_size, 3) - -/* - * Maximal mount counts - */ -#define NILFS_DFL_MAX_MNT_COUNT 50 /* 50 mounts */ - -/* - * File system states (sbp->s_state, nilfs->ns_mount_state) - */ -#define NILFS_VALID_FS 0x0001 /* Unmounted cleanly */ -#define NILFS_ERROR_FS 0x0002 /* Errors detected */ -#define NILFS_RESIZE_FS 0x0004 /* Resize required */ - -/* - * Mount flags (sbi->s_mount_opt) - */ -#define NILFS_MOUNT_ERROR_MODE 0x0070 /* Error mode mask */ -#define NILFS_MOUNT_ERRORS_CONT 0x0010 /* Continue on errors */ -#define NILFS_MOUNT_ERRORS_RO 0x0020 /* Remount fs ro on errors */ -#define NILFS_MOUNT_ERRORS_PANIC 0x0040 /* Panic on errors */ -#define NILFS_MOUNT_BARRIER 0x1000 /* Use block barriers */ -#define NILFS_MOUNT_STRICT_ORDER 0x2000 /* - * Apply strict in-order - * semantics also for data - */ -#define NILFS_MOUNT_NORECOVERY 0x4000 /* - * Disable write access during - * mount-time recovery - */ -#define NILFS_MOUNT_DISCARD 0x8000 /* Issue DISCARD requests */ - - -/** - * struct nilfs_super_block - structure of super block on disk - */ -struct nilfs_super_block { -/*00*/ __le32 s_rev_level; /* Revision level */ - __le16 s_minor_rev_level; /* minor revision level */ - __le16 s_magic; /* Magic signature */ - - __le16 s_bytes; /* - * Bytes count of CRC calculation - * for this structure. s_reserved - * is excluded. - */ - __le16 s_flags; /* flags */ - __le32 s_crc_seed; /* Seed value of CRC calculation */ -/*10*/ __le32 s_sum; /* Check sum of super block */ - - __le32 s_log_block_size; /* - * Block size represented as follows - * blocksize = - * 1 << (s_log_block_size + 10) - */ - __le64 s_nsegments; /* Number of segments in filesystem */ -/*20*/ __le64 s_dev_size; /* block device size in bytes */ - __le64 s_first_data_block; /* 1st seg disk block number */ -/*30*/ __le32 s_blocks_per_segment; /* number of blocks per full segment */ - __le32 s_r_segments_percentage; /* Reserved segments percentage */ - - __le64 s_last_cno; /* Last checkpoint number */ -/*40*/ __le64 s_last_pseg; /* disk block addr pseg written last */ - __le64 s_last_seq; /* seq. number of seg written last */ -/*50*/ __le64 s_free_blocks_count; /* Free blocks count */ - - __le64 s_ctime; /* - * Creation time (execution time of - * newfs) - */ -/*60*/ __le64 s_mtime; /* Mount time */ - __le64 s_wtime; /* Write time */ -/*70*/ __le16 s_mnt_count; /* Mount count */ - __le16 s_max_mnt_count; /* Maximal mount count */ - __le16 s_state; /* File system state */ - __le16 s_errors; /* Behaviour when detecting errors */ - __le64 s_lastcheck; /* time of last check */ - -/*80*/ __le32 s_checkinterval; /* max. time between checks */ - __le32 s_creator_os; /* OS */ - __le16 s_def_resuid; /* Default uid for reserved blocks */ - __le16 s_def_resgid; /* Default gid for reserved blocks */ - __le32 s_first_ino; /* First non-reserved inode */ - -/*90*/ __le16 s_inode_size; /* Size of an inode */ - __le16 s_dat_entry_size; /* Size of a dat entry */ - __le16 s_checkpoint_size; /* Size of a checkpoint */ - __le16 s_segment_usage_size; /* Size of a segment usage */ - -/*98*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */ -/*A8*/ char s_volume_name[80]; /* volume name */ - -/*F8*/ __le32 s_c_interval; /* Commit interval of segment */ - __le32 s_c_block_max; /* - * Threshold of data amount for - * the segment construction - */ -/*100*/ __le64 s_feature_compat; /* Compatible feature set */ - __le64 s_feature_compat_ro; /* Read-only compatible feature set */ - __le64 s_feature_incompat; /* Incompatible feature set */ - __u32 s_reserved[186]; /* padding to the end of the block */ -}; - -/* - * Codes for operating systems - */ -#define NILFS_OS_LINUX 0 -/* Codes from 1 to 4 are reserved to keep compatibility with ext2 creator-OS */ - -/* - * Revision levels - */ -#define NILFS_CURRENT_REV 2 /* current major revision */ -#define NILFS_MINOR_REV 0 /* minor revision */ -#define NILFS_MIN_SUPP_REV 2 /* minimum supported revision */ - -/* - * Feature set definitions - * - * If there is a bit set in the incompatible feature set that the kernel - * doesn't know about, it should refuse to mount the filesystem. - */ -#define NILFS_FEATURE_COMPAT_RO_BLOCK_COUNT 0x00000001ULL - -#define NILFS_FEATURE_COMPAT_SUPP 0ULL -#define NILFS_FEATURE_COMPAT_RO_SUPP NILFS_FEATURE_COMPAT_RO_BLOCK_COUNT -#define NILFS_FEATURE_INCOMPAT_SUPP 0ULL - -/* - * Bytes count of super_block for CRC-calculation - */ -#define NILFS_SB_BYTES \ - ((long)&((struct nilfs_super_block *)0)->s_reserved) - -/* - * Special inode number - */ -#define NILFS_ROOT_INO 2 /* Root file inode */ -#define NILFS_DAT_INO 3 /* DAT file */ -#define NILFS_CPFILE_INO 4 /* checkpoint file */ -#define NILFS_SUFILE_INO 5 /* segment usage file */ -#define NILFS_IFILE_INO 6 /* ifile */ -#define NILFS_ATIME_INO 7 /* Atime file (reserved) */ -#define NILFS_XATTR_INO 8 /* Xattribute file (reserved) */ -#define NILFS_SKETCH_INO 10 /* Sketch file */ -#define NILFS_USER_INO 11 /* Fisrt user's file inode number */ - -#define NILFS_SB_OFFSET_BYTES 1024 /* byte offset of nilfs superblock */ - -#define NILFS_SEG_MIN_BLOCKS 16 /* - * Minimum number of blocks in - * a full segment - */ -#define NILFS_PSEG_MIN_BLOCKS 2 /* - * Minimum number of blocks in - * a partial segment - */ -#define NILFS_MIN_NRSVSEGS 8 /* - * Minimum number of reserved - * segments - */ - -/* - * We call DAT, cpfile, and sufile root metadata files. Inodes of - * these files are written in super root block instead of ifile, and - * garbage collector doesn't keep any past versions of these files. - */ -#define NILFS_ROOT_METADATA_FILE(ino) \ - ((ino) >= NILFS_DAT_INO && (ino) <= NILFS_SUFILE_INO) - -/* - * bytes offset of secondary super block - */ -#define NILFS_SB2_OFFSET_BYTES(devsize) ((((devsize) >> 12) - 1) << 12) - -/* - * Maximal count of links to a file - */ -#define NILFS_LINK_MAX 32000 - -/* - * Structure of a directory entry - * (Same as ext2) - */ - -#define NILFS_NAME_LEN 255 - -/* - * Block size limitations - */ -#define NILFS_MIN_BLOCK_SIZE 1024 -#define NILFS_MAX_BLOCK_SIZE 65536 - -/* - * The new version of the directory entry. Since V0 structures are - * stored in intel byte order, and the name_len field could never be - * bigger than 255 chars, it's safe to reclaim the extra byte for the - * file_type field. - */ -struct nilfs_dir_entry { - __le64 inode; /* Inode number */ - __le16 rec_len; /* Directory entry length */ - __u8 name_len; /* Name length */ - __u8 file_type; /* Dir entry type (file, dir, etc) */ - char name[NILFS_NAME_LEN]; /* File name */ - char pad; -}; - -/* - * NILFS directory file types. Only the low 3 bits are used. The - * other bits are reserved for now. - */ -enum { - NILFS_FT_UNKNOWN, - NILFS_FT_REG_FILE, - NILFS_FT_DIR, - NILFS_FT_CHRDEV, - NILFS_FT_BLKDEV, - NILFS_FT_FIFO, - NILFS_FT_SOCK, - NILFS_FT_SYMLINK, - NILFS_FT_MAX -}; - -/* - * NILFS_DIR_PAD defines the directory entries boundaries - * - * NOTE: It must be a multiple of 8 - */ -#define NILFS_DIR_PAD 8 -#define NILFS_DIR_ROUND (NILFS_DIR_PAD - 1) -#define NILFS_DIR_REC_LEN(name_len) (((name_len) + 12 + NILFS_DIR_ROUND) & \ - ~NILFS_DIR_ROUND) -#define NILFS_MAX_REC_LEN ((1<<16)-1) - -static inline unsigned int nilfs_rec_len_from_disk(__le16 dlen) -{ - unsigned int len = le16_to_cpu(dlen); - -#if !defined(__KERNEL__) || (PAGE_SIZE >= 65536) - if (len == NILFS_MAX_REC_LEN) - return 1 << 16; -#endif - return len; -} - -static inline __le16 nilfs_rec_len_to_disk(unsigned int len) -{ -#if !defined(__KERNEL__) || (PAGE_SIZE >= 65536) - if (len == (1 << 16)) - return cpu_to_le16(NILFS_MAX_REC_LEN); - else if (len > (1 << 16)) - BUG(); -#endif - return cpu_to_le16(len); -} - -/** - * struct nilfs_finfo - file information - * @fi_ino: inode number - * @fi_cno: checkpoint number - * @fi_nblocks: number of blocks (including intermediate blocks) - * @fi_ndatablk: number of file data blocks - */ -struct nilfs_finfo { - __le64 fi_ino; - __le64 fi_cno; - __le32 fi_nblocks; - __le32 fi_ndatablk; - /* array of virtual block numbers */ -}; - -/** - * struct nilfs_binfo_v - information for the block to which a virtual block number is assigned - * @bi_vblocknr: virtual block number - * @bi_blkoff: block offset - */ -struct nilfs_binfo_v { - __le64 bi_vblocknr; - __le64 bi_blkoff; -}; - -/** - * struct nilfs_binfo_dat - information for the block which belongs to the DAT file - * @bi_blkoff: block offset - * @bi_level: level - * @bi_pad: padding - */ -struct nilfs_binfo_dat { - __le64 bi_blkoff; - __u8 bi_level; - __u8 bi_pad[7]; -}; - -/** - * union nilfs_binfo: block information - * @bi_v: nilfs_binfo_v structure - * @bi_dat: nilfs_binfo_dat structure - */ -union nilfs_binfo { - struct nilfs_binfo_v bi_v; - struct nilfs_binfo_dat bi_dat; -}; - -/** - * struct nilfs_segment_summary - segment summary header - * @ss_datasum: checksum of data - * @ss_sumsum: checksum of segment summary - * @ss_magic: magic number - * @ss_bytes: size of this structure in bytes - * @ss_flags: flags - * @ss_seq: sequence number - * @ss_create: creation timestamp - * @ss_next: next segment - * @ss_nblocks: number of blocks - * @ss_nfinfo: number of finfo structures - * @ss_sumbytes: total size of segment summary in bytes - * @ss_pad: padding - * @ss_cno: checkpoint number - */ -struct nilfs_segment_summary { - __le32 ss_datasum; - __le32 ss_sumsum; - __le32 ss_magic; - __le16 ss_bytes; - __le16 ss_flags; - __le64 ss_seq; - __le64 ss_create; - __le64 ss_next; - __le32 ss_nblocks; - __le32 ss_nfinfo; - __le32 ss_sumbytes; - __le32 ss_pad; - __le64 ss_cno; - /* array of finfo structures */ -}; - -#define NILFS_SEGSUM_MAGIC 0x1eaffa11 /* segment summary magic number */ - -/* - * Segment summary flags - */ -#define NILFS_SS_LOGBGN 0x0001 /* begins a logical segment */ -#define NILFS_SS_LOGEND 0x0002 /* ends a logical segment */ -#define NILFS_SS_SR 0x0004 /* has super root */ -#define NILFS_SS_SYNDT 0x0008 /* includes data only updates */ -#define NILFS_SS_GC 0x0010 /* segment written for cleaner operation */ - -/** - * struct nilfs_btree_node - B-tree node - * @bn_flags: flags - * @bn_level: level - * @bn_nchildren: number of children - * @bn_pad: padding - */ -struct nilfs_btree_node { - __u8 bn_flags; - __u8 bn_level; - __le16 bn_nchildren; - __le32 bn_pad; -}; - -/* flags */ -#define NILFS_BTREE_NODE_ROOT 0x01 - -/* level */ -#define NILFS_BTREE_LEVEL_DATA 0 -#define NILFS_BTREE_LEVEL_NODE_MIN (NILFS_BTREE_LEVEL_DATA + 1) -#define NILFS_BTREE_LEVEL_MAX 14 /* Max level (exclusive) */ - -/** - * struct nilfs_palloc_group_desc - block group descriptor - * @pg_nfrees: number of free entries in block group - */ -struct nilfs_palloc_group_desc { - __le32 pg_nfrees; -}; - -/** - * struct nilfs_dat_entry - disk address translation entry - * @de_blocknr: block number - * @de_start: start checkpoint number - * @de_end: end checkpoint number - * @de_rsv: reserved for future use - */ -struct nilfs_dat_entry { - __le64 de_blocknr; - __le64 de_start; - __le64 de_end; - __le64 de_rsv; -}; - -#define NILFS_MIN_DAT_ENTRY_SIZE 32 - -/** - * struct nilfs_snapshot_list - snapshot list - * @ssl_next: next checkpoint number on snapshot list - * @ssl_prev: previous checkpoint number on snapshot list - */ -struct nilfs_snapshot_list { - __le64 ssl_next; - __le64 ssl_prev; -}; - -/** - * struct nilfs_checkpoint - checkpoint structure - * @cp_flags: flags - * @cp_checkpoints_count: checkpoints count in a block - * @cp_snapshot_list: snapshot list - * @cp_cno: checkpoint number - * @cp_create: creation timestamp - * @cp_nblk_inc: number of blocks incremented by this checkpoint - * @cp_inodes_count: inodes count - * @cp_blocks_count: blocks count - * @cp_ifile_inode: inode of ifile - */ -struct nilfs_checkpoint { - __le32 cp_flags; - __le32 cp_checkpoints_count; - struct nilfs_snapshot_list cp_snapshot_list; - __le64 cp_cno; - __le64 cp_create; - __le64 cp_nblk_inc; - __le64 cp_inodes_count; - __le64 cp_blocks_count; - - /* - * Do not change the byte offset of ifile inode. - * To keep the compatibility of the disk format, - * additional fields should be added behind cp_ifile_inode. - */ - struct nilfs_inode cp_ifile_inode; -}; - -#define NILFS_MIN_CHECKPOINT_SIZE (64 + NILFS_MIN_INODE_SIZE) - -/* checkpoint flags */ -enum { - NILFS_CHECKPOINT_SNAPSHOT, - NILFS_CHECKPOINT_INVALID, - NILFS_CHECKPOINT_SKETCH, - NILFS_CHECKPOINT_MINOR, -}; - -#define NILFS_CHECKPOINT_FNS(flag, name) \ -static inline void \ -nilfs_checkpoint_set_##name(struct nilfs_checkpoint *cp) \ -{ \ - cp->cp_flags = cpu_to_le32(le32_to_cpu(cp->cp_flags) | \ - (1UL << NILFS_CHECKPOINT_##flag)); \ -} \ -static inline void \ -nilfs_checkpoint_clear_##name(struct nilfs_checkpoint *cp) \ -{ \ - cp->cp_flags = cpu_to_le32(le32_to_cpu(cp->cp_flags) & \ - ~(1UL << NILFS_CHECKPOINT_##flag)); \ -} \ -static inline int \ -nilfs_checkpoint_##name(const struct nilfs_checkpoint *cp) \ -{ \ - return !!(le32_to_cpu(cp->cp_flags) & \ - (1UL << NILFS_CHECKPOINT_##flag)); \ -} - -NILFS_CHECKPOINT_FNS(SNAPSHOT, snapshot) -NILFS_CHECKPOINT_FNS(INVALID, invalid) -NILFS_CHECKPOINT_FNS(MINOR, minor) - -/** - * struct nilfs_cpinfo - checkpoint information - * @ci_flags: flags - * @ci_pad: padding - * @ci_cno: checkpoint number - * @ci_create: creation timestamp - * @ci_nblk_inc: number of blocks incremented by this checkpoint - * @ci_inodes_count: inodes count - * @ci_blocks_count: blocks count - * @ci_next: next checkpoint number in snapshot list - */ -struct nilfs_cpinfo { - __u32 ci_flags; - __u32 ci_pad; - __u64 ci_cno; - __u64 ci_create; - __u64 ci_nblk_inc; - __u64 ci_inodes_count; - __u64 ci_blocks_count; - __u64 ci_next; -}; - -#define NILFS_CPINFO_FNS(flag, name) \ -static inline int \ -nilfs_cpinfo_##name(const struct nilfs_cpinfo *cpinfo) \ -{ \ - return !!(cpinfo->ci_flags & (1UL << NILFS_CHECKPOINT_##flag)); \ -} - -NILFS_CPINFO_FNS(SNAPSHOT, snapshot) -NILFS_CPINFO_FNS(INVALID, invalid) -NILFS_CPINFO_FNS(MINOR, minor) - - -/** - * struct nilfs_cpfile_header - checkpoint file header - * @ch_ncheckpoints: number of checkpoints - * @ch_nsnapshots: number of snapshots - * @ch_snapshot_list: snapshot list - */ -struct nilfs_cpfile_header { - __le64 ch_ncheckpoints; - __le64 ch_nsnapshots; - struct nilfs_snapshot_list ch_snapshot_list; -}; - -#define NILFS_CPFILE_FIRST_CHECKPOINT_OFFSET \ - ((sizeof(struct nilfs_cpfile_header) + \ - sizeof(struct nilfs_checkpoint) - 1) / \ - sizeof(struct nilfs_checkpoint)) - -/** - * struct nilfs_segment_usage - segment usage - * @su_lastmod: last modified timestamp - * @su_nblocks: number of blocks in segment - * @su_flags: flags - */ -struct nilfs_segment_usage { - __le64 su_lastmod; - __le32 su_nblocks; - __le32 su_flags; -}; - -#define NILFS_MIN_SEGMENT_USAGE_SIZE 16 - -/* segment usage flag */ -enum { - NILFS_SEGMENT_USAGE_ACTIVE, - NILFS_SEGMENT_USAGE_DIRTY, - NILFS_SEGMENT_USAGE_ERROR, - - /* ... */ -}; - -#define NILFS_SEGMENT_USAGE_FNS(flag, name) \ -static inline void \ -nilfs_segment_usage_set_##name(struct nilfs_segment_usage *su) \ -{ \ - su->su_flags = cpu_to_le32(le32_to_cpu(su->su_flags) | \ - (1UL << NILFS_SEGMENT_USAGE_##flag));\ -} \ -static inline void \ -nilfs_segment_usage_clear_##name(struct nilfs_segment_usage *su) \ -{ \ - su->su_flags = \ - cpu_to_le32(le32_to_cpu(su->su_flags) & \ - ~(1UL << NILFS_SEGMENT_USAGE_##flag)); \ -} \ -static inline int \ -nilfs_segment_usage_##name(const struct nilfs_segment_usage *su) \ -{ \ - return !!(le32_to_cpu(su->su_flags) & \ - (1UL << NILFS_SEGMENT_USAGE_##flag)); \ -} - -NILFS_SEGMENT_USAGE_FNS(ACTIVE, active) -NILFS_SEGMENT_USAGE_FNS(DIRTY, dirty) -NILFS_SEGMENT_USAGE_FNS(ERROR, error) - -static inline void -nilfs_segment_usage_set_clean(struct nilfs_segment_usage *su) -{ - su->su_lastmod = cpu_to_le64(0); - su->su_nblocks = cpu_to_le32(0); - su->su_flags = cpu_to_le32(0); -} - -static inline int -nilfs_segment_usage_clean(const struct nilfs_segment_usage *su) -{ - return !le32_to_cpu(su->su_flags); -} - -/** - * struct nilfs_sufile_header - segment usage file header - * @sh_ncleansegs: number of clean segments - * @sh_ndirtysegs: number of dirty segments - * @sh_last_alloc: last allocated segment number - */ -struct nilfs_sufile_header { - __le64 sh_ncleansegs; - __le64 sh_ndirtysegs; - __le64 sh_last_alloc; - /* ... */ -}; - -#define NILFS_SUFILE_FIRST_SEGMENT_USAGE_OFFSET \ - ((sizeof(struct nilfs_sufile_header) + \ - sizeof(struct nilfs_segment_usage) - 1) / \ - sizeof(struct nilfs_segment_usage)) - -/** - * nilfs_suinfo - segment usage information - * @sui_lastmod: timestamp of last modification - * @sui_nblocks: number of written blocks in segment - * @sui_flags: segment usage flags - */ -struct nilfs_suinfo { - __u64 sui_lastmod; - __u32 sui_nblocks; - __u32 sui_flags; -}; - -#define NILFS_SUINFO_FNS(flag, name) \ -static inline int \ -nilfs_suinfo_##name(const struct nilfs_suinfo *si) \ -{ \ - return si->sui_flags & (1UL << NILFS_SEGMENT_USAGE_##flag); \ -} - -NILFS_SUINFO_FNS(ACTIVE, active) -NILFS_SUINFO_FNS(DIRTY, dirty) -NILFS_SUINFO_FNS(ERROR, error) - -static inline int nilfs_suinfo_clean(const struct nilfs_suinfo *si) -{ - return !si->sui_flags; -} - -/* ioctl */ -/** - * nilfs_suinfo_update - segment usage information update - * @sup_segnum: segment number - * @sup_flags: flags for which fields are active in sup_sui - * @sup_reserved: reserved necessary for alignment - * @sup_sui: segment usage information - */ -struct nilfs_suinfo_update { - __u64 sup_segnum; - __u32 sup_flags; - __u32 sup_reserved; - struct nilfs_suinfo sup_sui; -}; - -enum { - NILFS_SUINFO_UPDATE_LASTMOD, - NILFS_SUINFO_UPDATE_NBLOCKS, - NILFS_SUINFO_UPDATE_FLAGS, - __NR_NILFS_SUINFO_UPDATE_FIELDS, -}; - -#define NILFS_SUINFO_UPDATE_FNS(flag, name) \ -static inline void \ -nilfs_suinfo_update_set_##name(struct nilfs_suinfo_update *sup) \ -{ \ - sup->sup_flags |= 1UL << NILFS_SUINFO_UPDATE_##flag; \ -} \ -static inline void \ -nilfs_suinfo_update_clear_##name(struct nilfs_suinfo_update *sup) \ -{ \ - sup->sup_flags &= ~(1UL << NILFS_SUINFO_UPDATE_##flag); \ -} \ -static inline int \ -nilfs_suinfo_update_##name(const struct nilfs_suinfo_update *sup) \ -{ \ - return !!(sup->sup_flags & (1UL << NILFS_SUINFO_UPDATE_##flag));\ -} - -NILFS_SUINFO_UPDATE_FNS(LASTMOD, lastmod) -NILFS_SUINFO_UPDATE_FNS(NBLOCKS, nblocks) -NILFS_SUINFO_UPDATE_FNS(FLAGS, flags) - -enum { - NILFS_CHECKPOINT, - NILFS_SNAPSHOT, -}; - -/** - * struct nilfs_cpmode - change checkpoint mode structure - * @cm_cno: checkpoint number - * @cm_mode: mode of checkpoint - * @cm_pad: padding - */ -struct nilfs_cpmode { - __u64 cm_cno; - __u32 cm_mode; - __u32 cm_pad; -}; - -/** - * struct nilfs_argv - argument vector - * @v_base: pointer on data array from userspace - * @v_nmembs: number of members in data array - * @v_size: size of data array in bytes - * @v_flags: flags - * @v_index: start number of target data items - */ -struct nilfs_argv { - __u64 v_base; - __u32 v_nmembs; /* number of members */ - __u16 v_size; /* size of members */ - __u16 v_flags; - __u64 v_index; -}; - -/** - * struct nilfs_period - period of checkpoint numbers - * @p_start: start checkpoint number (inclusive) - * @p_end: end checkpoint number (exclusive) - */ -struct nilfs_period { - __u64 p_start; - __u64 p_end; -}; - -/** - * struct nilfs_cpstat - checkpoint statistics - * @cs_cno: checkpoint number - * @cs_ncps: number of checkpoints - * @cs_nsss: number of snapshots - */ -struct nilfs_cpstat { - __u64 cs_cno; - __u64 cs_ncps; - __u64 cs_nsss; -}; - -/** - * struct nilfs_sustat - segment usage statistics - * @ss_nsegs: number of segments - * @ss_ncleansegs: number of clean segments - * @ss_ndirtysegs: number of dirty segments - * @ss_ctime: creation time of the last segment - * @ss_nongc_ctime: creation time of the last segment not for GC - * @ss_prot_seq: least sequence number of segments which must not be reclaimed - */ -struct nilfs_sustat { - __u64 ss_nsegs; - __u64 ss_ncleansegs; - __u64 ss_ndirtysegs; - __u64 ss_ctime; - __u64 ss_nongc_ctime; - __u64 ss_prot_seq; -}; - -/** - * struct nilfs_vinfo - virtual block number information - * @vi_vblocknr: virtual block number - * @vi_start: start checkpoint number (inclusive) - * @vi_end: end checkpoint number (exclusive) - * @vi_blocknr: disk block number - */ -struct nilfs_vinfo { - __u64 vi_vblocknr; - __u64 vi_start; - __u64 vi_end; - __u64 vi_blocknr; -}; - -/** - * struct nilfs_vdesc - descriptor of virtual block number - * @vd_ino: inode number - * @vd_cno: checkpoint number - * @vd_vblocknr: virtual block number - * @vd_period: period of checkpoint numbers - * @vd_blocknr: disk block number - * @vd_offset: logical block offset inside a file - * @vd_flags: flags (data or node block) - * @vd_pad: padding - */ -struct nilfs_vdesc { - __u64 vd_ino; - __u64 vd_cno; - __u64 vd_vblocknr; - struct nilfs_period vd_period; - __u64 vd_blocknr; - __u64 vd_offset; - __u32 vd_flags; - __u32 vd_pad; -}; - -/** - * struct nilfs_bdesc - descriptor of disk block number - * @bd_ino: inode number - * @bd_oblocknr: disk block address (for skipping dead blocks) - * @bd_blocknr: disk block address - * @bd_offset: logical block offset inside a file - * @bd_level: level in the b-tree organization - * @bd_pad: padding - */ -struct nilfs_bdesc { - __u64 bd_ino; - __u64 bd_oblocknr; - __u64 bd_blocknr; - __u64 bd_offset; - __u32 bd_level; - __u32 bd_pad; -}; - -#define NILFS_IOCTL_IDENT 'n' - -#define NILFS_IOCTL_CHANGE_CPMODE \ - _IOW(NILFS_IOCTL_IDENT, 0x80, struct nilfs_cpmode) -#define NILFS_IOCTL_DELETE_CHECKPOINT \ - _IOW(NILFS_IOCTL_IDENT, 0x81, __u64) -#define NILFS_IOCTL_GET_CPINFO \ - _IOR(NILFS_IOCTL_IDENT, 0x82, struct nilfs_argv) -#define NILFS_IOCTL_GET_CPSTAT \ - _IOR(NILFS_IOCTL_IDENT, 0x83, struct nilfs_cpstat) -#define NILFS_IOCTL_GET_SUINFO \ - _IOR(NILFS_IOCTL_IDENT, 0x84, struct nilfs_argv) -#define NILFS_IOCTL_GET_SUSTAT \ - _IOR(NILFS_IOCTL_IDENT, 0x85, struct nilfs_sustat) -#define NILFS_IOCTL_GET_VINFO \ - _IOWR(NILFS_IOCTL_IDENT, 0x86, struct nilfs_argv) -#define NILFS_IOCTL_GET_BDESCS \ - _IOWR(NILFS_IOCTL_IDENT, 0x87, struct nilfs_argv) -#define NILFS_IOCTL_CLEAN_SEGMENTS \ - _IOW(NILFS_IOCTL_IDENT, 0x88, struct nilfs_argv[5]) -#define NILFS_IOCTL_SYNC \ - _IOR(NILFS_IOCTL_IDENT, 0x8A, __u64) -#define NILFS_IOCTL_RESIZE \ - _IOW(NILFS_IOCTL_IDENT, 0x8B, __u64) -#define NILFS_IOCTL_SET_ALLOC_RANGE \ - _IOW(NILFS_IOCTL_IDENT, 0x8C, __u64[2]) -#define NILFS_IOCTL_SET_SUINFO \ - _IOW(NILFS_IOCTL_IDENT, 0x8D, struct nilfs_argv) - -#endif /* _LINUX_NILFS_FS_H */ diff --git a/include/uapi/linux/nilfs2_api.h b/include/uapi/linux/nilfs2_api.h new file mode 100644 index 000000000000..ef4c1de89b11 --- /dev/null +++ b/include/uapi/linux/nilfs2_api.h @@ -0,0 +1,292 @@ +/* + * nilfs2_api.h - NILFS2 user space API + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + */ + +#ifndef _LINUX_NILFS2_API_H +#define _LINUX_NILFS2_API_H + +#include +#include + +/** + * struct nilfs_cpinfo - checkpoint information + * @ci_flags: flags + * @ci_pad: padding + * @ci_cno: checkpoint number + * @ci_create: creation timestamp + * @ci_nblk_inc: number of blocks incremented by this checkpoint + * @ci_inodes_count: inodes count + * @ci_blocks_count: blocks count + * @ci_next: next checkpoint number in snapshot list + */ +struct nilfs_cpinfo { + __u32 ci_flags; + __u32 ci_pad; + __u64 ci_cno; + __u64 ci_create; + __u64 ci_nblk_inc; + __u64 ci_inodes_count; + __u64 ci_blocks_count; + __u64 ci_next; +}; + +/* checkpoint flags */ +enum { + NILFS_CPINFO_SNAPSHOT, + NILFS_CPINFO_INVALID, + NILFS_CPINFO_SKETCH, + NILFS_CPINFO_MINOR, +}; + +#define NILFS_CPINFO_FNS(flag, name) \ +static inline int \ +nilfs_cpinfo_##name(const struct nilfs_cpinfo *cpinfo) \ +{ \ + return !!(cpinfo->ci_flags & (1UL << NILFS_CPINFO_##flag)); \ +} + +NILFS_CPINFO_FNS(SNAPSHOT, snapshot) +NILFS_CPINFO_FNS(INVALID, invalid) +NILFS_CPINFO_FNS(MINOR, minor) + +/** + * nilfs_suinfo - segment usage information + * @sui_lastmod: timestamp of last modification + * @sui_nblocks: number of written blocks in segment + * @sui_flags: segment usage flags + */ +struct nilfs_suinfo { + __u64 sui_lastmod; + __u32 sui_nblocks; + __u32 sui_flags; +}; + +/* segment usage flags */ +enum { + NILFS_SUINFO_ACTIVE, + NILFS_SUINFO_DIRTY, + NILFS_SUINFO_ERROR, +}; + +#define NILFS_SUINFO_FNS(flag, name) \ +static inline int \ +nilfs_suinfo_##name(const struct nilfs_suinfo *si) \ +{ \ + return si->sui_flags & (1UL << NILFS_SUINFO_##flag); \ +} + +NILFS_SUINFO_FNS(ACTIVE, active) +NILFS_SUINFO_FNS(DIRTY, dirty) +NILFS_SUINFO_FNS(ERROR, error) + +static inline int nilfs_suinfo_clean(const struct nilfs_suinfo *si) +{ + return !si->sui_flags; +} + +/** + * nilfs_suinfo_update - segment usage information update + * @sup_segnum: segment number + * @sup_flags: flags for which fields are active in sup_sui + * @sup_reserved: reserved necessary for alignment + * @sup_sui: segment usage information + */ +struct nilfs_suinfo_update { + __u64 sup_segnum; + __u32 sup_flags; + __u32 sup_reserved; + struct nilfs_suinfo sup_sui; +}; + +enum { + NILFS_SUINFO_UPDATE_LASTMOD, + NILFS_SUINFO_UPDATE_NBLOCKS, + NILFS_SUINFO_UPDATE_FLAGS, + __NR_NILFS_SUINFO_UPDATE_FIELDS, +}; + +#define NILFS_SUINFO_UPDATE_FNS(flag, name) \ +static inline void \ +nilfs_suinfo_update_set_##name(struct nilfs_suinfo_update *sup) \ +{ \ + sup->sup_flags |= 1UL << NILFS_SUINFO_UPDATE_##flag; \ +} \ +static inline void \ +nilfs_suinfo_update_clear_##name(struct nilfs_suinfo_update *sup) \ +{ \ + sup->sup_flags &= ~(1UL << NILFS_SUINFO_UPDATE_##flag); \ +} \ +static inline int \ +nilfs_suinfo_update_##name(const struct nilfs_suinfo_update *sup) \ +{ \ + return !!(sup->sup_flags & (1UL << NILFS_SUINFO_UPDATE_##flag));\ +} + +NILFS_SUINFO_UPDATE_FNS(LASTMOD, lastmod) +NILFS_SUINFO_UPDATE_FNS(NBLOCKS, nblocks) +NILFS_SUINFO_UPDATE_FNS(FLAGS, flags) + +enum { + NILFS_CHECKPOINT, + NILFS_SNAPSHOT, +}; + +/** + * struct nilfs_cpmode - change checkpoint mode structure + * @cm_cno: checkpoint number + * @cm_mode: mode of checkpoint + * @cm_pad: padding + */ +struct nilfs_cpmode { + __u64 cm_cno; + __u32 cm_mode; + __u32 cm_pad; +}; + +/** + * struct nilfs_argv - argument vector + * @v_base: pointer on data array from userspace + * @v_nmembs: number of members in data array + * @v_size: size of data array in bytes + * @v_flags: flags + * @v_index: start number of target data items + */ +struct nilfs_argv { + __u64 v_base; + __u32 v_nmembs; /* number of members */ + __u16 v_size; /* size of members */ + __u16 v_flags; + __u64 v_index; +}; + +/** + * struct nilfs_period - period of checkpoint numbers + * @p_start: start checkpoint number (inclusive) + * @p_end: end checkpoint number (exclusive) + */ +struct nilfs_period { + __u64 p_start; + __u64 p_end; +}; + +/** + * struct nilfs_cpstat - checkpoint statistics + * @cs_cno: checkpoint number + * @cs_ncps: number of checkpoints + * @cs_nsss: number of snapshots + */ +struct nilfs_cpstat { + __u64 cs_cno; + __u64 cs_ncps; + __u64 cs_nsss; +}; + +/** + * struct nilfs_sustat - segment usage statistics + * @ss_nsegs: number of segments + * @ss_ncleansegs: number of clean segments + * @ss_ndirtysegs: number of dirty segments + * @ss_ctime: creation time of the last segment + * @ss_nongc_ctime: creation time of the last segment not for GC + * @ss_prot_seq: least sequence number of segments which must not be reclaimed + */ +struct nilfs_sustat { + __u64 ss_nsegs; + __u64 ss_ncleansegs; + __u64 ss_ndirtysegs; + __u64 ss_ctime; + __u64 ss_nongc_ctime; + __u64 ss_prot_seq; +}; + +/** + * struct nilfs_vinfo - virtual block number information + * @vi_vblocknr: virtual block number + * @vi_start: start checkpoint number (inclusive) + * @vi_end: end checkpoint number (exclusive) + * @vi_blocknr: disk block number + */ +struct nilfs_vinfo { + __u64 vi_vblocknr; + __u64 vi_start; + __u64 vi_end; + __u64 vi_blocknr; +}; + +/** + * struct nilfs_vdesc - descriptor of virtual block number + * @vd_ino: inode number + * @vd_cno: checkpoint number + * @vd_vblocknr: virtual block number + * @vd_period: period of checkpoint numbers + * @vd_blocknr: disk block number + * @vd_offset: logical block offset inside a file + * @vd_flags: flags (data or node block) + * @vd_pad: padding + */ +struct nilfs_vdesc { + __u64 vd_ino; + __u64 vd_cno; + __u64 vd_vblocknr; + struct nilfs_period vd_period; + __u64 vd_blocknr; + __u64 vd_offset; + __u32 vd_flags; + __u32 vd_pad; +}; + +/** + * struct nilfs_bdesc - descriptor of disk block number + * @bd_ino: inode number + * @bd_oblocknr: disk block address (for skipping dead blocks) + * @bd_blocknr: disk block address + * @bd_offset: logical block offset inside a file + * @bd_level: level in the b-tree organization + * @bd_pad: padding + */ +struct nilfs_bdesc { + __u64 bd_ino; + __u64 bd_oblocknr; + __u64 bd_blocknr; + __u64 bd_offset; + __u32 bd_level; + __u32 bd_pad; +}; + +#define NILFS_IOCTL_IDENT 'n' + +#define NILFS_IOCTL_CHANGE_CPMODE \ + _IOW(NILFS_IOCTL_IDENT, 0x80, struct nilfs_cpmode) +#define NILFS_IOCTL_DELETE_CHECKPOINT \ + _IOW(NILFS_IOCTL_IDENT, 0x81, __u64) +#define NILFS_IOCTL_GET_CPINFO \ + _IOR(NILFS_IOCTL_IDENT, 0x82, struct nilfs_argv) +#define NILFS_IOCTL_GET_CPSTAT \ + _IOR(NILFS_IOCTL_IDENT, 0x83, struct nilfs_cpstat) +#define NILFS_IOCTL_GET_SUINFO \ + _IOR(NILFS_IOCTL_IDENT, 0x84, struct nilfs_argv) +#define NILFS_IOCTL_GET_SUSTAT \ + _IOR(NILFS_IOCTL_IDENT, 0x85, struct nilfs_sustat) +#define NILFS_IOCTL_GET_VINFO \ + _IOWR(NILFS_IOCTL_IDENT, 0x86, struct nilfs_argv) +#define NILFS_IOCTL_GET_BDESCS \ + _IOWR(NILFS_IOCTL_IDENT, 0x87, struct nilfs_argv) +#define NILFS_IOCTL_CLEAN_SEGMENTS \ + _IOW(NILFS_IOCTL_IDENT, 0x88, struct nilfs_argv[5]) +#define NILFS_IOCTL_SYNC \ + _IOR(NILFS_IOCTL_IDENT, 0x8A, __u64) +#define NILFS_IOCTL_RESIZE \ + _IOW(NILFS_IOCTL_IDENT, 0x8B, __u64) +#define NILFS_IOCTL_SET_ALLOC_RANGE \ + _IOW(NILFS_IOCTL_IDENT, 0x8C, __u64[2]) +#define NILFS_IOCTL_SET_SUINFO \ + _IOW(NILFS_IOCTL_IDENT, 0x8D, struct nilfs_argv) + +#endif /* _LINUX_NILFS2_API_H */ diff --git a/include/uapi/linux/nilfs2_ondisk.h b/include/uapi/linux/nilfs2_ondisk.h new file mode 100644 index 000000000000..2a8a3addb675 --- /dev/null +++ b/include/uapi/linux/nilfs2_ondisk.h @@ -0,0 +1,650 @@ +/* + * nilfs2_ondisk.h - NILFS2 on-disk structures + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + */ +/* + * linux/include/linux/ext2_fs.h + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/include/linux/minix_fs.h + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#ifndef _LINUX_NILFS2_ONDISK_H +#define _LINUX_NILFS2_ONDISK_H + +#include +#include + + +#define NILFS_INODE_BMAP_SIZE 7 + +/** + * struct nilfs_inode - structure of an inode on disk + * @i_blocks: blocks count + * @i_size: size in bytes + * @i_ctime: creation time (seconds) + * @i_mtime: modification time (seconds) + * @i_ctime_nsec: creation time (nano seconds) + * @i_mtime_nsec: modification time (nano seconds) + * @i_uid: user id + * @i_gid: group id + * @i_mode: file mode + * @i_links_count: links count + * @i_flags: file flags + * @i_bmap: block mapping + * @i_xattr: extended attributes + * @i_generation: file generation (for NFS) + * @i_pad: padding + */ +struct nilfs_inode { + __le64 i_blocks; + __le64 i_size; + __le64 i_ctime; + __le64 i_mtime; + __le32 i_ctime_nsec; + __le32 i_mtime_nsec; + __le32 i_uid; + __le32 i_gid; + __le16 i_mode; + __le16 i_links_count; + __le32 i_flags; + __le64 i_bmap[NILFS_INODE_BMAP_SIZE]; +#define i_device_code i_bmap[0] + __le64 i_xattr; + __le32 i_generation; + __le32 i_pad; +}; + +#define NILFS_MIN_INODE_SIZE 128 + +/** + * struct nilfs_super_root - structure of super root + * @sr_sum: check sum + * @sr_bytes: byte count of the structure + * @sr_flags: flags (reserved) + * @sr_nongc_ctime: write time of the last segment not for cleaner operation + * @sr_dat: DAT file inode + * @sr_cpfile: checkpoint file inode + * @sr_sufile: segment usage file inode + */ +struct nilfs_super_root { + __le32 sr_sum; + __le16 sr_bytes; + __le16 sr_flags; + __le64 sr_nongc_ctime; + struct nilfs_inode sr_dat; + struct nilfs_inode sr_cpfile; + struct nilfs_inode sr_sufile; +}; + +#define NILFS_SR_MDT_OFFSET(inode_size, i) \ + ((unsigned long)&((struct nilfs_super_root *)0)->sr_dat + \ + (inode_size) * (i)) +#define NILFS_SR_DAT_OFFSET(inode_size) NILFS_SR_MDT_OFFSET(inode_size, 0) +#define NILFS_SR_CPFILE_OFFSET(inode_size) NILFS_SR_MDT_OFFSET(inode_size, 1) +#define NILFS_SR_SUFILE_OFFSET(inode_size) NILFS_SR_MDT_OFFSET(inode_size, 2) +#define NILFS_SR_BYTES(inode_size) NILFS_SR_MDT_OFFSET(inode_size, 3) + +/* + * Maximal mount counts + */ +#define NILFS_DFL_MAX_MNT_COUNT 50 /* 50 mounts */ + +/* + * File system states (sbp->s_state, nilfs->ns_mount_state) + */ +#define NILFS_VALID_FS 0x0001 /* Unmounted cleanly */ +#define NILFS_ERROR_FS 0x0002 /* Errors detected */ +#define NILFS_RESIZE_FS 0x0004 /* Resize required */ + +/* + * Mount flags (sbi->s_mount_opt) + */ +#define NILFS_MOUNT_ERROR_MODE 0x0070 /* Error mode mask */ +#define NILFS_MOUNT_ERRORS_CONT 0x0010 /* Continue on errors */ +#define NILFS_MOUNT_ERRORS_RO 0x0020 /* Remount fs ro on errors */ +#define NILFS_MOUNT_ERRORS_PANIC 0x0040 /* Panic on errors */ +#define NILFS_MOUNT_BARRIER 0x1000 /* Use block barriers */ +#define NILFS_MOUNT_STRICT_ORDER 0x2000 /* + * Apply strict in-order + * semantics also for data + */ +#define NILFS_MOUNT_NORECOVERY 0x4000 /* + * Disable write access during + * mount-time recovery + */ +#define NILFS_MOUNT_DISCARD 0x8000 /* Issue DISCARD requests */ + + +/** + * struct nilfs_super_block - structure of super block on disk + */ +struct nilfs_super_block { +/*00*/ __le32 s_rev_level; /* Revision level */ + __le16 s_minor_rev_level; /* minor revision level */ + __le16 s_magic; /* Magic signature */ + + __le16 s_bytes; /* + * Bytes count of CRC calculation + * for this structure. s_reserved + * is excluded. + */ + __le16 s_flags; /* flags */ + __le32 s_crc_seed; /* Seed value of CRC calculation */ +/*10*/ __le32 s_sum; /* Check sum of super block */ + + __le32 s_log_block_size; /* + * Block size represented as follows + * blocksize = + * 1 << (s_log_block_size + 10) + */ + __le64 s_nsegments; /* Number of segments in filesystem */ +/*20*/ __le64 s_dev_size; /* block device size in bytes */ + __le64 s_first_data_block; /* 1st seg disk block number */ +/*30*/ __le32 s_blocks_per_segment; /* number of blocks per full segment */ + __le32 s_r_segments_percentage; /* Reserved segments percentage */ + + __le64 s_last_cno; /* Last checkpoint number */ +/*40*/ __le64 s_last_pseg; /* disk block addr pseg written last */ + __le64 s_last_seq; /* seq. number of seg written last */ +/*50*/ __le64 s_free_blocks_count; /* Free blocks count */ + + __le64 s_ctime; /* + * Creation time (execution time of + * newfs) + */ +/*60*/ __le64 s_mtime; /* Mount time */ + __le64 s_wtime; /* Write time */ +/*70*/ __le16 s_mnt_count; /* Mount count */ + __le16 s_max_mnt_count; /* Maximal mount count */ + __le16 s_state; /* File system state */ + __le16 s_errors; /* Behaviour when detecting errors */ + __le64 s_lastcheck; /* time of last check */ + +/*80*/ __le32 s_checkinterval; /* max. time between checks */ + __le32 s_creator_os; /* OS */ + __le16 s_def_resuid; /* Default uid for reserved blocks */ + __le16 s_def_resgid; /* Default gid for reserved blocks */ + __le32 s_first_ino; /* First non-reserved inode */ + +/*90*/ __le16 s_inode_size; /* Size of an inode */ + __le16 s_dat_entry_size; /* Size of a dat entry */ + __le16 s_checkpoint_size; /* Size of a checkpoint */ + __le16 s_segment_usage_size; /* Size of a segment usage */ + +/*98*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */ +/*A8*/ char s_volume_name[80]; /* volume name */ + +/*F8*/ __le32 s_c_interval; /* Commit interval of segment */ + __le32 s_c_block_max; /* + * Threshold of data amount for + * the segment construction + */ +/*100*/ __le64 s_feature_compat; /* Compatible feature set */ + __le64 s_feature_compat_ro; /* Read-only compatible feature set */ + __le64 s_feature_incompat; /* Incompatible feature set */ + __u32 s_reserved[186]; /* padding to the end of the block */ +}; + +/* + * Codes for operating systems + */ +#define NILFS_OS_LINUX 0 +/* Codes from 1 to 4 are reserved to keep compatibility with ext2 creator-OS */ + +/* + * Revision levels + */ +#define NILFS_CURRENT_REV 2 /* current major revision */ +#define NILFS_MINOR_REV 0 /* minor revision */ +#define NILFS_MIN_SUPP_REV 2 /* minimum supported revision */ + +/* + * Feature set definitions + * + * If there is a bit set in the incompatible feature set that the kernel + * doesn't know about, it should refuse to mount the filesystem. + */ +#define NILFS_FEATURE_COMPAT_RO_BLOCK_COUNT 0x00000001ULL + +#define NILFS_FEATURE_COMPAT_SUPP 0ULL +#define NILFS_FEATURE_COMPAT_RO_SUPP NILFS_FEATURE_COMPAT_RO_BLOCK_COUNT +#define NILFS_FEATURE_INCOMPAT_SUPP 0ULL + +/* + * Bytes count of super_block for CRC-calculation + */ +#define NILFS_SB_BYTES \ + ((long)&((struct nilfs_super_block *)0)->s_reserved) + +/* + * Special inode number + */ +#define NILFS_ROOT_INO 2 /* Root file inode */ +#define NILFS_DAT_INO 3 /* DAT file */ +#define NILFS_CPFILE_INO 4 /* checkpoint file */ +#define NILFS_SUFILE_INO 5 /* segment usage file */ +#define NILFS_IFILE_INO 6 /* ifile */ +#define NILFS_ATIME_INO 7 /* Atime file (reserved) */ +#define NILFS_XATTR_INO 8 /* Xattribute file (reserved) */ +#define NILFS_SKETCH_INO 10 /* Sketch file */ +#define NILFS_USER_INO 11 /* Fisrt user's file inode number */ + +#define NILFS_SB_OFFSET_BYTES 1024 /* byte offset of nilfs superblock */ + +#define NILFS_SEG_MIN_BLOCKS 16 /* + * Minimum number of blocks in + * a full segment + */ +#define NILFS_PSEG_MIN_BLOCKS 2 /* + * Minimum number of blocks in + * a partial segment + */ +#define NILFS_MIN_NRSVSEGS 8 /* + * Minimum number of reserved + * segments + */ + +/* + * We call DAT, cpfile, and sufile root metadata files. Inodes of + * these files are written in super root block instead of ifile, and + * garbage collector doesn't keep any past versions of these files. + */ +#define NILFS_ROOT_METADATA_FILE(ino) \ + ((ino) >= NILFS_DAT_INO && (ino) <= NILFS_SUFILE_INO) + +/* + * bytes offset of secondary super block + */ +#define NILFS_SB2_OFFSET_BYTES(devsize) ((((devsize) >> 12) - 1) << 12) + +/* + * Maximal count of links to a file + */ +#define NILFS_LINK_MAX 32000 + +/* + * Structure of a directory entry + * (Same as ext2) + */ + +#define NILFS_NAME_LEN 255 + +/* + * Block size limitations + */ +#define NILFS_MIN_BLOCK_SIZE 1024 +#define NILFS_MAX_BLOCK_SIZE 65536 + +/* + * The new version of the directory entry. Since V0 structures are + * stored in intel byte order, and the name_len field could never be + * bigger than 255 chars, it's safe to reclaim the extra byte for the + * file_type field. + */ +struct nilfs_dir_entry { + __le64 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __u8 name_len; /* Name length */ + __u8 file_type; /* Dir entry type (file, dir, etc) */ + char name[NILFS_NAME_LEN]; /* File name */ + char pad; +}; + +/* + * NILFS directory file types. Only the low 3 bits are used. The + * other bits are reserved for now. + */ +enum { + NILFS_FT_UNKNOWN, + NILFS_FT_REG_FILE, + NILFS_FT_DIR, + NILFS_FT_CHRDEV, + NILFS_FT_BLKDEV, + NILFS_FT_FIFO, + NILFS_FT_SOCK, + NILFS_FT_SYMLINK, + NILFS_FT_MAX +}; + +/* + * NILFS_DIR_PAD defines the directory entries boundaries + * + * NOTE: It must be a multiple of 8 + */ +#define NILFS_DIR_PAD 8 +#define NILFS_DIR_ROUND (NILFS_DIR_PAD - 1) +#define NILFS_DIR_REC_LEN(name_len) (((name_len) + 12 + NILFS_DIR_ROUND) & \ + ~NILFS_DIR_ROUND) +#define NILFS_MAX_REC_LEN ((1 << 16) - 1) + +/** + * struct nilfs_finfo - file information + * @fi_ino: inode number + * @fi_cno: checkpoint number + * @fi_nblocks: number of blocks (including intermediate blocks) + * @fi_ndatablk: number of file data blocks + */ +struct nilfs_finfo { + __le64 fi_ino; + __le64 fi_cno; + __le32 fi_nblocks; + __le32 fi_ndatablk; +}; + +/** + * struct nilfs_binfo_v - information on a data block (except DAT) + * @bi_vblocknr: virtual block number + * @bi_blkoff: block offset + */ +struct nilfs_binfo_v { + __le64 bi_vblocknr; + __le64 bi_blkoff; +}; + +/** + * struct nilfs_binfo_dat - information on a DAT node block + * @bi_blkoff: block offset + * @bi_level: level + * @bi_pad: padding + */ +struct nilfs_binfo_dat { + __le64 bi_blkoff; + __u8 bi_level; + __u8 bi_pad[7]; +}; + +/** + * union nilfs_binfo: block information + * @bi_v: nilfs_binfo_v structure + * @bi_dat: nilfs_binfo_dat structure + */ +union nilfs_binfo { + struct nilfs_binfo_v bi_v; + struct nilfs_binfo_dat bi_dat; +}; + +/** + * struct nilfs_segment_summary - segment summary header + * @ss_datasum: checksum of data + * @ss_sumsum: checksum of segment summary + * @ss_magic: magic number + * @ss_bytes: size of this structure in bytes + * @ss_flags: flags + * @ss_seq: sequence number + * @ss_create: creation timestamp + * @ss_next: next segment + * @ss_nblocks: number of blocks + * @ss_nfinfo: number of finfo structures + * @ss_sumbytes: total size of segment summary in bytes + * @ss_pad: padding + * @ss_cno: checkpoint number + */ +struct nilfs_segment_summary { + __le32 ss_datasum; + __le32 ss_sumsum; + __le32 ss_magic; + __le16 ss_bytes; + __le16 ss_flags; + __le64 ss_seq; + __le64 ss_create; + __le64 ss_next; + __le32 ss_nblocks; + __le32 ss_nfinfo; + __le32 ss_sumbytes; + __le32 ss_pad; + __le64 ss_cno; + /* array of finfo structures */ +}; + +#define NILFS_SEGSUM_MAGIC 0x1eaffa11 /* segment summary magic number */ + +/* + * Segment summary flags + */ +#define NILFS_SS_LOGBGN 0x0001 /* begins a logical segment */ +#define NILFS_SS_LOGEND 0x0002 /* ends a logical segment */ +#define NILFS_SS_SR 0x0004 /* has super root */ +#define NILFS_SS_SYNDT 0x0008 /* includes data only updates */ +#define NILFS_SS_GC 0x0010 /* segment written for cleaner operation */ + +/** + * struct nilfs_btree_node - header of B-tree node block + * @bn_flags: flags + * @bn_level: level + * @bn_nchildren: number of children + * @bn_pad: padding + */ +struct nilfs_btree_node { + __u8 bn_flags; + __u8 bn_level; + __le16 bn_nchildren; + __le32 bn_pad; +}; + +/* flags */ +#define NILFS_BTREE_NODE_ROOT 0x01 + +/* level */ +#define NILFS_BTREE_LEVEL_DATA 0 +#define NILFS_BTREE_LEVEL_NODE_MIN (NILFS_BTREE_LEVEL_DATA + 1) +#define NILFS_BTREE_LEVEL_MAX 14 /* Max level (exclusive) */ + +/** + * struct nilfs_direct_node - header of built-in bmap array + * @dn_flags: flags + * @dn_pad: padding + */ +struct nilfs_direct_node { + __u8 dn_flags; + __u8 pad[7]; +}; + +/** + * struct nilfs_palloc_group_desc - block group descriptor + * @pg_nfrees: number of free entries in block group + */ +struct nilfs_palloc_group_desc { + __le32 pg_nfrees; +}; + +/** + * struct nilfs_dat_entry - disk address translation entry + * @de_blocknr: block number + * @de_start: start checkpoint number + * @de_end: end checkpoint number + * @de_rsv: reserved for future use + */ +struct nilfs_dat_entry { + __le64 de_blocknr; + __le64 de_start; + __le64 de_end; + __le64 de_rsv; +}; + +#define NILFS_MIN_DAT_ENTRY_SIZE 32 + +/** + * struct nilfs_snapshot_list - snapshot list + * @ssl_next: next checkpoint number on snapshot list + * @ssl_prev: previous checkpoint number on snapshot list + */ +struct nilfs_snapshot_list { + __le64 ssl_next; + __le64 ssl_prev; +}; + +/** + * struct nilfs_checkpoint - checkpoint structure + * @cp_flags: flags + * @cp_checkpoints_count: checkpoints count in a block + * @cp_snapshot_list: snapshot list + * @cp_cno: checkpoint number + * @cp_create: creation timestamp + * @cp_nblk_inc: number of blocks incremented by this checkpoint + * @cp_inodes_count: inodes count + * @cp_blocks_count: blocks count + * @cp_ifile_inode: inode of ifile + */ +struct nilfs_checkpoint { + __le32 cp_flags; + __le32 cp_checkpoints_count; + struct nilfs_snapshot_list cp_snapshot_list; + __le64 cp_cno; + __le64 cp_create; + __le64 cp_nblk_inc; + __le64 cp_inodes_count; + __le64 cp_blocks_count; + + /* + * Do not change the byte offset of ifile inode. + * To keep the compatibility of the disk format, + * additional fields should be added behind cp_ifile_inode. + */ + struct nilfs_inode cp_ifile_inode; +}; + +#define NILFS_MIN_CHECKPOINT_SIZE (64 + NILFS_MIN_INODE_SIZE) + +/* checkpoint flags */ +enum { + NILFS_CHECKPOINT_SNAPSHOT, + NILFS_CHECKPOINT_INVALID, + NILFS_CHECKPOINT_SKETCH, + NILFS_CHECKPOINT_MINOR, +}; + +#define NILFS_CHECKPOINT_FNS(flag, name) \ +static inline void \ +nilfs_checkpoint_set_##name(struct nilfs_checkpoint *cp) \ +{ \ + cp->cp_flags = cpu_to_le32(le32_to_cpu(cp->cp_flags) | \ + (1UL << NILFS_CHECKPOINT_##flag)); \ +} \ +static inline void \ +nilfs_checkpoint_clear_##name(struct nilfs_checkpoint *cp) \ +{ \ + cp->cp_flags = cpu_to_le32(le32_to_cpu(cp->cp_flags) & \ + ~(1UL << NILFS_CHECKPOINT_##flag)); \ +} \ +static inline int \ +nilfs_checkpoint_##name(const struct nilfs_checkpoint *cp) \ +{ \ + return !!(le32_to_cpu(cp->cp_flags) & \ + (1UL << NILFS_CHECKPOINT_##flag)); \ +} + +NILFS_CHECKPOINT_FNS(SNAPSHOT, snapshot) +NILFS_CHECKPOINT_FNS(INVALID, invalid) +NILFS_CHECKPOINT_FNS(MINOR, minor) + +/** + * struct nilfs_cpfile_header - checkpoint file header + * @ch_ncheckpoints: number of checkpoints + * @ch_nsnapshots: number of snapshots + * @ch_snapshot_list: snapshot list + */ +struct nilfs_cpfile_header { + __le64 ch_ncheckpoints; + __le64 ch_nsnapshots; + struct nilfs_snapshot_list ch_snapshot_list; +}; + +#define NILFS_CPFILE_FIRST_CHECKPOINT_OFFSET \ + ((sizeof(struct nilfs_cpfile_header) + \ + sizeof(struct nilfs_checkpoint) - 1) / \ + sizeof(struct nilfs_checkpoint)) + +/** + * struct nilfs_segment_usage - segment usage + * @su_lastmod: last modified timestamp + * @su_nblocks: number of blocks in segment + * @su_flags: flags + */ +struct nilfs_segment_usage { + __le64 su_lastmod; + __le32 su_nblocks; + __le32 su_flags; +}; + +#define NILFS_MIN_SEGMENT_USAGE_SIZE 16 + +/* segment usage flag */ +enum { + NILFS_SEGMENT_USAGE_ACTIVE, + NILFS_SEGMENT_USAGE_DIRTY, + NILFS_SEGMENT_USAGE_ERROR, +}; + +#define NILFS_SEGMENT_USAGE_FNS(flag, name) \ +static inline void \ +nilfs_segment_usage_set_##name(struct nilfs_segment_usage *su) \ +{ \ + su->su_flags = cpu_to_le32(le32_to_cpu(su->su_flags) | \ + (1UL << NILFS_SEGMENT_USAGE_##flag));\ +} \ +static inline void \ +nilfs_segment_usage_clear_##name(struct nilfs_segment_usage *su) \ +{ \ + su->su_flags = \ + cpu_to_le32(le32_to_cpu(su->su_flags) & \ + ~(1UL << NILFS_SEGMENT_USAGE_##flag)); \ +} \ +static inline int \ +nilfs_segment_usage_##name(const struct nilfs_segment_usage *su) \ +{ \ + return !!(le32_to_cpu(su->su_flags) & \ + (1UL << NILFS_SEGMENT_USAGE_##flag)); \ +} + +NILFS_SEGMENT_USAGE_FNS(ACTIVE, active) +NILFS_SEGMENT_USAGE_FNS(DIRTY, dirty) +NILFS_SEGMENT_USAGE_FNS(ERROR, error) + +static inline void +nilfs_segment_usage_set_clean(struct nilfs_segment_usage *su) +{ + su->su_lastmod = cpu_to_le64(0); + su->su_nblocks = cpu_to_le32(0); + su->su_flags = cpu_to_le32(0); +} + +static inline int +nilfs_segment_usage_clean(const struct nilfs_segment_usage *su) +{ + return !le32_to_cpu(su->su_flags); +} + +/** + * struct nilfs_sufile_header - segment usage file header + * @sh_ncleansegs: number of clean segments + * @sh_ndirtysegs: number of dirty segments + * @sh_last_alloc: last allocated segment number + */ +struct nilfs_sufile_header { + __le64 sh_ncleansegs; + __le64 sh_ndirtysegs; + __le64 sh_last_alloc; + /* ... */ +}; + +#define NILFS_SUFILE_FIRST_SEGMENT_USAGE_OFFSET \ + ((sizeof(struct nilfs_sufile_header) + \ + sizeof(struct nilfs_segment_usage) - 1) / \ + sizeof(struct nilfs_segment_usage)) + +#endif /* _LINUX_NILFS2_ONDISK_H */ -- cgit v1.2.3-71-gd317 From b6e8d4aa1110306378af0f3472a6b85a1f039a16 Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Tue, 2 Aug 2016 14:06:25 -0700 Subject: rapidio: add RapidIO channelized messaging driver Add channelized messaging driver to support native RapidIO messaging exchange between multiple senders/recipients on devices that use kernel RapidIO subsystem services. This device driver is the result of collaboration within the RapidIO.org Software Task Group (STG) between Texas Instruments, Prodrive Technologies, Nokia Networks, BAE and IDT. Additional input was received from other members of RapidIO.org. The objective was to create a character mode driver interface which exposes messaging capabilities of RapidIO endpoint devices (mports) directly to applications, in a manner that allows the numerous and varied RapidIO implementations to interoperate. This char mode device driver allows user-space applications to setup messaging communication channels using single shared RapidIO messaging mailbox. By default this driver uses RapidIO MBOX_1 (MBOX_0 is reserved for use by RIONET Ethernet emulation driver). [weiyj.lk@gmail.com: rapidio/rio_cm: fix return value check in riocm_init()] Link: http://lkml.kernel.org/r/1469198221-21970-1-git-send-email-alexandre.bounine@idt.com Link: http://lkml.kernel.org/r/1468952862-18056-1-git-send-email-alexandre.bounine@idt.com Signed-off-by: Alexandre Bounine Tested-by: Barry Wood Cc: Matt Porter Cc: Aurelien Jacquiot Cc: Andre van Herk Cc: Barry Wood Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/rapidio/rio_cm.txt | 119 ++ drivers/rapidio/Kconfig | 9 + drivers/rapidio/Makefile | 1 + drivers/rapidio/rio_cm.c | 2366 ++++++++++++++++++++++++++++++++++++++ include/uapi/linux/Kbuild | 1 + include/uapi/linux/rio_cm_cdev.h | 78 ++ 6 files changed, 2574 insertions(+) create mode 100644 Documentation/rapidio/rio_cm.txt create mode 100644 drivers/rapidio/rio_cm.c create mode 100644 include/uapi/linux/rio_cm_cdev.h (limited to 'include/uapi') diff --git a/Documentation/rapidio/rio_cm.txt b/Documentation/rapidio/rio_cm.txt new file mode 100644 index 000000000000..27aa401f1126 --- /dev/null +++ b/Documentation/rapidio/rio_cm.txt @@ -0,0 +1,119 @@ +RapidIO subsystem Channelized Messaging character device driver (rio_cm.c) +========================================================================== + +Version History: +---------------- + 1.0.0 - Initial driver release. + +========================================================================== + +I. Overview + +This device driver is the result of collaboration within the RapidIO.org +Software Task Group (STG) between Texas Instruments, Prodrive Technologies, +Nokia Networks, BAE and IDT. Additional input was received from other members +of RapidIO.org. + +The objective was to create a character mode driver interface which exposes +messaging capabilities of RapidIO endpoint devices (mports) directly +to applications, in a manner that allows the numerous and varied RapidIO +implementations to interoperate. + +This driver (RIO_CM) provides to user-space applications shared access to +RapidIO mailbox messaging resources. + +RapidIO specification (Part 2) defines that endpoint devices may have up to four +messaging mailboxes in case of multi-packet message (up to 4KB) and +up to 64 mailboxes if single-packet messages (up to 256 B) are used. In addition +to protocol definition limitations, a particular hardware implementation can +have reduced number of messaging mailboxes. RapidIO aware applications must +therefore share the messaging resources of a RapidIO endpoint. + +Main purpose of this device driver is to provide RapidIO mailbox messaging +capability to large number of user-space processes by introducing socket-like +operations using a single messaging mailbox. This allows applications to +use the limited RapidIO messaging hardware resources efficiently. + +Most of device driver's operations are supported through 'ioctl' system calls. + +When loaded this device driver creates a single file system node named rio_cm +in /dev directory common for all registered RapidIO mport devices. + +Following ioctl commands are available to user-space applications: + +- RIO_CM_MPORT_GET_LIST : Returns to caller list of local mport devices that + support messaging operations (number of entries up to RIO_MAX_MPORTS). + Each list entry is combination of mport's index in the system and RapidIO + destination ID assigned to the port. +- RIO_CM_EP_GET_LIST_SIZE : Returns number of messaging capable remote endpoints + in a RapidIO network associated with the specified mport device. +- RIO_CM_EP_GET_LIST : Returns list of RapidIO destination IDs for messaging + capable remote endpoints (peers) available in a RapidIO network associated + with the specified mport device. +- RIO_CM_CHAN_CREATE : Creates RapidIO message exchange channel data structure + with channel ID assigned automatically or as requested by a caller. +- RIO_CM_CHAN_BIND : Binds the specified channel data structure to the specified + mport device. +- RIO_CM_CHAN_LISTEN : Enables listening for connection requests on the specified + channel. +- RIO_CM_CHAN_ACCEPT : Accepts a connection request from peer on the specified + channel. If wait timeout for this request is specified by a caller it is + a blocking call. If timeout set to 0 this is non-blocking call - ioctl + handler checks for a pending connection request and if one is not available + exits with -EGAIN error status immediately. +- RIO_CM_CHAN_CONNECT : Sends a connection request to a remote peer/channel. +- RIO_CM_CHAN_SEND : Sends a data message through the specified channel. + The handler for this request assumes that message buffer specified by + a caller includes the reserved space for a packet header required by + this driver. +- RIO_CM_CHAN_RECEIVE : Receives a data message through a connected channel. + If the channel does not have an incoming message ready to return this ioctl + handler will wait for new message until timeout specified by a caller + expires. If timeout value is set to 0, ioctl handler uses a default value + defined by MAX_SCHEDULE_TIMEOUT. +- RIO_CM_CHAN_CLOSE : Closes a specified channel and frees associated buffers. + If the specified channel is in the CONNECTED state, sends close notification + to the remote peer. + +The ioctl command codes and corresponding data structures intended for use by +user-space applications are defined in 'include/uapi/linux/rio_cm_cdev.h'. + +II. Hardware Compatibility + +This device driver uses standard interfaces defined by kernel RapidIO subsystem +and therefore it can be used with any mport device driver registered by RapidIO +subsystem with limitations set by available mport HW implementation of messaging +mailboxes. + +III. Module parameters + +- 'dbg_level' - This parameter allows to control amount of debug information + generated by this device driver. This parameter is formed by set of + bit masks that correspond to the specific functional block. + For mask definitions see 'drivers/rapidio/devices/rio_cm.c' + This parameter can be changed dynamically. + Use CONFIG_RAPIDIO_DEBUG=y to enable debug output at the top level. + +- 'cmbox' - Number of RapidIO mailbox to use (default value is 1). + This parameter allows to set messaging mailbox number that will be used + within entire RapidIO network. It can be used when default mailbox is + used by other device drivers or is not supported by some nodes in the + RapidIO network. + +- 'chstart' - Start channel number for dynamic assignment. Default value - 256. + Allows to exclude channel numbers below this parameter from dynamic + allocation to avoid conflicts with software components that use + reserved predefined channel numbers. + +IV. Known problems + + None. + +V. User-space Applications and API Library + +Messaging API library and applications that use this device driver are available +from RapidIO.org. + +VI. TODO List + +- Add support for system notification messages (reserved channel 0). diff --git a/drivers/rapidio/Kconfig b/drivers/rapidio/Kconfig index b5a10d3c92c7..d6d2f20c4597 100644 --- a/drivers/rapidio/Kconfig +++ b/drivers/rapidio/Kconfig @@ -67,6 +67,15 @@ config RAPIDIO_ENUM_BASIC endchoice +config RAPIDIO_CHMAN + tristate "RapidIO Channelized Messaging driver" + depends on RAPIDIO + help + This option includes RapidIO channelized messaging driver which + provides socket-like interface to allow sharing of single RapidIO + messaging mailbox between multiple user-space applications. + See "Documentation/rapidio/rio_cm.txt" for driver description. + config RAPIDIO_MPORT_CDEV tristate "RapidIO /dev mport device driver" depends on RAPIDIO diff --git a/drivers/rapidio/Makefile b/drivers/rapidio/Makefile index 6271ada6993f..74dcea45ad49 100644 --- a/drivers/rapidio/Makefile +++ b/drivers/rapidio/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_RAPIDIO) += rapidio.o rapidio-y := rio.o rio-access.o rio-driver.o rio-sysfs.o obj-$(CONFIG_RAPIDIO_ENUM_BASIC) += rio-scan.o +obj-$(CONFIG_RAPIDIO_CHMAN) += rio_cm.o obj-$(CONFIG_RAPIDIO) += switches/ obj-$(CONFIG_RAPIDIO) += devices/ diff --git a/drivers/rapidio/rio_cm.c b/drivers/rapidio/rio_cm.c new file mode 100644 index 000000000000..cecc15a880de --- /dev/null +++ b/drivers/rapidio/rio_cm.c @@ -0,0 +1,2366 @@ +/* + * rio_cm - RapidIO Channelized Messaging Driver + * + * Copyright 2013-2016 Integrated Device Technology, Inc. + * Copyright (c) 2015, Prodrive Technologies + * Copyright (c) 2015, RapidIO Trade Association + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * THIS PROGRAM IS DISTRIBUTED IN THE HOPE THAT IT WILL BE USEFUL, + * BUT WITHOUT ANY WARRANTY; WITHOUT EVEN THE IMPLIED WARRANTY OF + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. SEE THE + * GNU GENERAL PUBLIC LICENSE FOR MORE DETAILS. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DRV_NAME "rio_cm" +#define DRV_VERSION "1.0.0" +#define DRV_AUTHOR "Alexandre Bounine " +#define DRV_DESC "RapidIO Channelized Messaging Driver" +#define DEV_NAME "rio_cm" + +/* Debug output filtering masks */ +enum { + DBG_NONE = 0, + DBG_INIT = BIT(0), /* driver init */ + DBG_EXIT = BIT(1), /* driver exit */ + DBG_MPORT = BIT(2), /* mport add/remove */ + DBG_RDEV = BIT(3), /* RapidIO device add/remove */ + DBG_CHOP = BIT(4), /* channel operations */ + DBG_WAIT = BIT(5), /* waiting for events */ + DBG_TX = BIT(6), /* message TX */ + DBG_TX_EVENT = BIT(7), /* message TX event */ + DBG_RX_DATA = BIT(8), /* inbound data messages */ + DBG_RX_CMD = BIT(9), /* inbound REQ/ACK/NACK messages */ + DBG_ALL = ~0, +}; + +#ifdef DEBUG +#define riocm_debug(level, fmt, arg...) \ + do { \ + if (DBG_##level & dbg_level) \ + pr_debug(DRV_NAME ": %s " fmt "\n", \ + __func__, ##arg); \ + } while (0) +#else +#define riocm_debug(level, fmt, arg...) \ + no_printk(KERN_DEBUG pr_fmt(DRV_NAME fmt "\n"), ##arg) +#endif + +#define riocm_warn(fmt, arg...) \ + pr_warn(DRV_NAME ": %s WARNING " fmt "\n", __func__, ##arg) + +#define riocm_error(fmt, arg...) \ + pr_err(DRV_NAME ": %s ERROR " fmt "\n", __func__, ##arg) + + +static int cmbox = 1; +module_param(cmbox, int, S_IRUGO); +MODULE_PARM_DESC(cmbox, "RapidIO Mailbox number (default 1)"); + +static int chstart = 256; +module_param(chstart, int, S_IRUGO); +MODULE_PARM_DESC(chstart, + "Start channel number for dynamic allocation (default 256)"); + +#ifdef DEBUG +static u32 dbg_level = DBG_NONE; +module_param(dbg_level, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(dbg_level, "Debugging output level (default 0 = none)"); +#endif + +MODULE_AUTHOR(DRV_AUTHOR); +MODULE_DESCRIPTION(DRV_DESC); +MODULE_LICENSE("GPL"); +MODULE_VERSION(DRV_VERSION); + +#define RIOCM_TX_RING_SIZE 128 +#define RIOCM_RX_RING_SIZE 128 +#define RIOCM_CONNECT_TO 3 /* connect response TO (in sec) */ + +#define RIOCM_MAX_CHNUM 0xffff /* Use full range of u16 field */ +#define RIOCM_CHNUM_AUTO 0 +#define RIOCM_MAX_EP_COUNT 0x10000 /* Max number of endpoints */ + +enum rio_cm_state { + RIO_CM_IDLE, + RIO_CM_CONNECT, + RIO_CM_CONNECTED, + RIO_CM_DISCONNECT, + RIO_CM_CHAN_BOUND, + RIO_CM_LISTEN, + RIO_CM_DESTROYING, +}; + +enum rio_cm_pkt_type { + RIO_CM_SYS = 0xaa, + RIO_CM_CHAN = 0x55, +}; + +enum rio_cm_chop { + CM_CONN_REQ, + CM_CONN_ACK, + CM_CONN_CLOSE, + CM_DATA_MSG, +}; + +struct rio_ch_base_bhdr { + u32 src_id; + u32 dst_id; +#define RIO_HDR_LETTER_MASK 0xffff0000 +#define RIO_HDR_MBOX_MASK 0x0000ffff + u8 src_mbox; + u8 dst_mbox; + u8 type; +} __attribute__((__packed__)); + +struct rio_ch_chan_hdr { + struct rio_ch_base_bhdr bhdr; + u8 ch_op; + u16 dst_ch; + u16 src_ch; + u16 msg_len; + u16 rsrvd; +} __attribute__((__packed__)); + +struct tx_req { + struct list_head node; + struct rio_dev *rdev; + void *buffer; + size_t len; +}; + +struct cm_dev { + struct list_head list; + struct rio_mport *mport; + void *rx_buf[RIOCM_RX_RING_SIZE]; + int rx_slots; + struct mutex rx_lock; + + void *tx_buf[RIOCM_TX_RING_SIZE]; + int tx_slot; + int tx_cnt; + int tx_ack_slot; + struct list_head tx_reqs; + spinlock_t tx_lock; + + struct list_head peers; + u32 npeers; + struct workqueue_struct *rx_wq; + struct work_struct rx_work; +}; + +struct chan_rx_ring { + void *buf[RIOCM_RX_RING_SIZE]; + int head; + int tail; + int count; + + /* Tracking RX buffers reported to upper level */ + void *inuse[RIOCM_RX_RING_SIZE]; + int inuse_cnt; +}; + +struct rio_channel { + u16 id; /* local channel ID */ + struct kref ref; /* channel refcount */ + struct file *filp; + struct cm_dev *cmdev; /* associated CM device object */ + struct rio_dev *rdev; /* remote RapidIO device */ + enum rio_cm_state state; + int error; + spinlock_t lock; + void *context; + u32 loc_destid; /* local destID */ + u32 rem_destid; /* remote destID */ + u16 rem_channel; /* remote channel ID */ + struct list_head accept_queue; + struct list_head ch_node; + struct completion comp; + struct completion comp_close; + struct chan_rx_ring rx_ring; +}; + +struct cm_peer { + struct list_head node; + struct rio_dev *rdev; +}; + +struct rio_cm_work { + struct work_struct work; + struct cm_dev *cm; + void *data; +}; + +struct conn_req { + struct list_head node; + u32 destid; /* requester destID */ + u16 chan; /* requester channel ID */ + struct cm_dev *cmdev; +}; + +/* + * A channel_dev structure represents a CM_CDEV + * @cdev Character device + * @dev Associated device object + */ +struct channel_dev { + struct cdev cdev; + struct device *dev; +}; + +static struct rio_channel *riocm_ch_alloc(u16 ch_num); +static void riocm_ch_free(struct kref *ref); +static int riocm_post_send(struct cm_dev *cm, struct rio_dev *rdev, + void *buffer, size_t len); +static int riocm_ch_close(struct rio_channel *ch); + +static DEFINE_SPINLOCK(idr_lock); +static DEFINE_IDR(ch_idr); + +static LIST_HEAD(cm_dev_list); +static DECLARE_RWSEM(rdev_sem); + +static struct class *dev_class; +static unsigned int dev_major; +static unsigned int dev_minor_base; +static dev_t dev_number; +static struct channel_dev riocm_cdev; + +#define is_msg_capable(src_ops, dst_ops) \ + ((src_ops & RIO_SRC_OPS_DATA_MSG) && \ + (dst_ops & RIO_DST_OPS_DATA_MSG)) +#define dev_cm_capable(dev) \ + is_msg_capable(dev->src_ops, dev->dst_ops) + +static int riocm_cmp(struct rio_channel *ch, enum rio_cm_state cmp) +{ + int ret; + + spin_lock_bh(&ch->lock); + ret = (ch->state == cmp); + spin_unlock_bh(&ch->lock); + return ret; +} + +static int riocm_cmp_exch(struct rio_channel *ch, + enum rio_cm_state cmp, enum rio_cm_state exch) +{ + int ret; + + spin_lock_bh(&ch->lock); + ret = (ch->state == cmp); + if (ret) + ch->state = exch; + spin_unlock_bh(&ch->lock); + return ret; +} + +static enum rio_cm_state riocm_exch(struct rio_channel *ch, + enum rio_cm_state exch) +{ + enum rio_cm_state old; + + spin_lock_bh(&ch->lock); + old = ch->state; + ch->state = exch; + spin_unlock_bh(&ch->lock); + return old; +} + +static struct rio_channel *riocm_get_channel(u16 nr) +{ + struct rio_channel *ch; + + spin_lock_bh(&idr_lock); + ch = idr_find(&ch_idr, nr); + if (ch) + kref_get(&ch->ref); + spin_unlock_bh(&idr_lock); + return ch; +} + +static void riocm_put_channel(struct rio_channel *ch) +{ + kref_put(&ch->ref, riocm_ch_free); +} + +static void *riocm_rx_get_msg(struct cm_dev *cm) +{ + void *msg; + int i; + + msg = rio_get_inb_message(cm->mport, cmbox); + if (msg) { + for (i = 0; i < RIOCM_RX_RING_SIZE; i++) { + if (cm->rx_buf[i] == msg) { + cm->rx_buf[i] = NULL; + cm->rx_slots++; + break; + } + } + + if (i == RIOCM_RX_RING_SIZE) + riocm_warn("no record for buffer 0x%p", msg); + } + + return msg; +} + +/* + * riocm_rx_fill - fills a ring of receive buffers for given cm device + * @cm: cm_dev object + * @nent: max number of entries to fill + * + * Returns: none + */ +static void riocm_rx_fill(struct cm_dev *cm, int nent) +{ + int i; + + if (cm->rx_slots == 0) + return; + + for (i = 0; i < RIOCM_RX_RING_SIZE && cm->rx_slots && nent; i++) { + if (cm->rx_buf[i] == NULL) { + cm->rx_buf[i] = kmalloc(RIO_MAX_MSG_SIZE, GFP_KERNEL); + if (cm->rx_buf[i] == NULL) + break; + rio_add_inb_buffer(cm->mport, cmbox, cm->rx_buf[i]); + cm->rx_slots--; + nent--; + } + } +} + +/* + * riocm_rx_free - frees all receive buffers associated with given cm device + * @cm: cm_dev object + * + * Returns: none + */ +static void riocm_rx_free(struct cm_dev *cm) +{ + int i; + + for (i = 0; i < RIOCM_RX_RING_SIZE; i++) { + if (cm->rx_buf[i] != NULL) { + kfree(cm->rx_buf[i]); + cm->rx_buf[i] = NULL; + } + } +} + +/* + * riocm_req_handler - connection request handler + * @cm: cm_dev object + * @req_data: pointer to the request packet + * + * Returns: 0 if success, or + * -EINVAL if channel is not in correct state, + * -ENODEV if cannot find a channel with specified ID, + * -ENOMEM if unable to allocate memory to store the request + */ +static int riocm_req_handler(struct cm_dev *cm, void *req_data) +{ + struct rio_channel *ch; + struct conn_req *req; + struct rio_ch_chan_hdr *hh = req_data; + u16 chnum; + + chnum = ntohs(hh->dst_ch); + + ch = riocm_get_channel(chnum); + + if (!ch) + return -ENODEV; + + if (ch->state != RIO_CM_LISTEN) { + riocm_debug(RX_CMD, "channel %d is not in listen state", chnum); + riocm_put_channel(ch); + return -EINVAL; + } + + req = kzalloc(sizeof(*req), GFP_KERNEL); + if (!req) { + riocm_put_channel(ch); + return -ENOMEM; + } + + req->destid = ntohl(hh->bhdr.src_id); + req->chan = ntohs(hh->src_ch); + req->cmdev = cm; + + spin_lock_bh(&ch->lock); + list_add_tail(&req->node, &ch->accept_queue); + spin_unlock_bh(&ch->lock); + complete(&ch->comp); + riocm_put_channel(ch); + + return 0; +} + +/* + * riocm_resp_handler - response to connection request handler + * @resp_data: pointer to the response packet + * + * Returns: 0 if success, or + * -EINVAL if channel is not in correct state, + * -ENODEV if cannot find a channel with specified ID, + */ +static int riocm_resp_handler(void *resp_data) +{ + struct rio_channel *ch; + struct rio_ch_chan_hdr *hh = resp_data; + u16 chnum; + + chnum = ntohs(hh->dst_ch); + ch = riocm_get_channel(chnum); + if (!ch) + return -ENODEV; + + if (ch->state != RIO_CM_CONNECT) { + riocm_put_channel(ch); + return -EINVAL; + } + + riocm_exch(ch, RIO_CM_CONNECTED); + ch->rem_channel = ntohs(hh->src_ch); + complete(&ch->comp); + riocm_put_channel(ch); + + return 0; +} + +/* + * riocm_close_handler - channel close request handler + * @req_data: pointer to the request packet + * + * Returns: 0 if success, or + * -ENODEV if cannot find a channel with specified ID, + * + error codes returned by riocm_ch_close. + */ +static int riocm_close_handler(void *data) +{ + struct rio_channel *ch; + struct rio_ch_chan_hdr *hh = data; + int ret; + + riocm_debug(RX_CMD, "for ch=%d", ntohs(hh->dst_ch)); + + spin_lock_bh(&idr_lock); + ch = idr_find(&ch_idr, ntohs(hh->dst_ch)); + if (!ch) { + spin_unlock_bh(&idr_lock); + return -ENODEV; + } + idr_remove(&ch_idr, ch->id); + spin_unlock_bh(&idr_lock); + + riocm_exch(ch, RIO_CM_DISCONNECT); + + ret = riocm_ch_close(ch); + if (ret) + riocm_debug(RX_CMD, "riocm_ch_close() returned %d", ret); + + return 0; +} + +/* + * rio_cm_handler - function that services request (non-data) packets + * @cm: cm_dev object + * @data: pointer to the packet + */ +static void rio_cm_handler(struct cm_dev *cm, void *data) +{ + struct rio_ch_chan_hdr *hdr; + + if (!rio_mport_is_running(cm->mport)) + goto out; + + hdr = data; + + riocm_debug(RX_CMD, "OP=%x for ch=%d from %d", + hdr->ch_op, ntohs(hdr->dst_ch), ntohs(hdr->src_ch)); + + switch (hdr->ch_op) { + case CM_CONN_REQ: + riocm_req_handler(cm, data); + break; + case CM_CONN_ACK: + riocm_resp_handler(data); + break; + case CM_CONN_CLOSE: + riocm_close_handler(data); + break; + default: + riocm_error("Invalid packet header"); + break; + } +out: + kfree(data); +} + +/* + * rio_rx_data_handler - received data packet handler + * @cm: cm_dev object + * @buf: data packet + * + * Returns: 0 if success, or + * -ENODEV if cannot find a channel with specified ID, + * -EIO if channel is not in CONNECTED state, + * -ENOMEM if channel RX queue is full (packet discarded) + */ +static int rio_rx_data_handler(struct cm_dev *cm, void *buf) +{ + struct rio_ch_chan_hdr *hdr; + struct rio_channel *ch; + + hdr = buf; + + riocm_debug(RX_DATA, "for ch=%d", ntohs(hdr->dst_ch)); + + ch = riocm_get_channel(ntohs(hdr->dst_ch)); + if (!ch) { + /* Discard data message for non-existing channel */ + kfree(buf); + return -ENODEV; + } + + /* Place pointer to the buffer into channel's RX queue */ + spin_lock(&ch->lock); + + if (ch->state != RIO_CM_CONNECTED) { + /* Channel is not ready to receive data, discard a packet */ + riocm_debug(RX_DATA, "ch=%d is in wrong state=%d", + ch->id, ch->state); + spin_unlock(&ch->lock); + kfree(buf); + riocm_put_channel(ch); + return -EIO; + } + + if (ch->rx_ring.count == RIOCM_RX_RING_SIZE) { + /* If RX ring is full, discard a packet */ + riocm_debug(RX_DATA, "ch=%d is full", ch->id); + spin_unlock(&ch->lock); + kfree(buf); + riocm_put_channel(ch); + return -ENOMEM; + } + + ch->rx_ring.buf[ch->rx_ring.head] = buf; + ch->rx_ring.head++; + ch->rx_ring.count++; + ch->rx_ring.head %= RIOCM_RX_RING_SIZE; + + complete(&ch->comp); + + spin_unlock(&ch->lock); + riocm_put_channel(ch); + + return 0; +} + +/* + * rio_ibmsg_handler - inbound message packet handler + */ +static void rio_ibmsg_handler(struct work_struct *work) +{ + struct cm_dev *cm = container_of(work, struct cm_dev, rx_work); + void *data; + struct rio_ch_chan_hdr *hdr; + + if (!rio_mport_is_running(cm->mport)) + return; + + while (1) { + mutex_lock(&cm->rx_lock); + data = riocm_rx_get_msg(cm); + if (data) + riocm_rx_fill(cm, 1); + mutex_unlock(&cm->rx_lock); + + if (data == NULL) + break; + + hdr = data; + + if (hdr->bhdr.type != RIO_CM_CHAN) { + /* For now simply discard packets other than channel */ + riocm_error("Unsupported TYPE code (0x%x). Msg dropped", + hdr->bhdr.type); + kfree(data); + continue; + } + + /* Process a channel message */ + if (hdr->ch_op == CM_DATA_MSG) + rio_rx_data_handler(cm, data); + else + rio_cm_handler(cm, data); + } +} + +static void riocm_inb_msg_event(struct rio_mport *mport, void *dev_id, + int mbox, int slot) +{ + struct cm_dev *cm = dev_id; + + if (rio_mport_is_running(cm->mport) && !work_pending(&cm->rx_work)) + queue_work(cm->rx_wq, &cm->rx_work); +} + +/* + * rio_txcq_handler - TX completion handler + * @cm: cm_dev object + * @slot: TX queue slot + * + * TX completion handler also ensures that pending request packets are placed + * into transmit queue as soon as a free slot becomes available. This is done + * to give higher priority to request packets during high intensity data flow. + */ +static void rio_txcq_handler(struct cm_dev *cm, int slot) +{ + int ack_slot; + + /* ATTN: Add TX completion notification if/when direct buffer + * transfer is implemented. At this moment only correct tracking + * of tx_count is important. + */ + riocm_debug(TX_EVENT, "for mport_%d slot %d tx_cnt %d", + cm->mport->id, slot, cm->tx_cnt); + + spin_lock(&cm->tx_lock); + ack_slot = cm->tx_ack_slot; + + if (ack_slot == slot) + riocm_debug(TX_EVENT, "slot == ack_slot"); + + while (cm->tx_cnt && ((ack_slot != slot) || + (cm->tx_cnt == RIOCM_TX_RING_SIZE))) { + + cm->tx_buf[ack_slot] = NULL; + ++ack_slot; + ack_slot &= (RIOCM_TX_RING_SIZE - 1); + cm->tx_cnt--; + } + + if (cm->tx_cnt < 0 || cm->tx_cnt > RIOCM_TX_RING_SIZE) + riocm_error("tx_cnt %d out of sync", cm->tx_cnt); + + WARN_ON((cm->tx_cnt < 0) || (cm->tx_cnt > RIOCM_TX_RING_SIZE)); + + cm->tx_ack_slot = ack_slot; + + /* + * If there are pending requests, insert them into transmit queue + */ + if (!list_empty(&cm->tx_reqs) && (cm->tx_cnt < RIOCM_TX_RING_SIZE)) { + struct tx_req *req, *_req; + int rc; + + list_for_each_entry_safe(req, _req, &cm->tx_reqs, node) { + list_del(&req->node); + cm->tx_buf[cm->tx_slot] = req->buffer; + rc = rio_add_outb_message(cm->mport, req->rdev, cmbox, + req->buffer, req->len); + kfree(req->buffer); + kfree(req); + + ++cm->tx_cnt; + ++cm->tx_slot; + cm->tx_slot &= (RIOCM_TX_RING_SIZE - 1); + if (cm->tx_cnt == RIOCM_TX_RING_SIZE) + break; + } + } + + spin_unlock(&cm->tx_lock); +} + +static void riocm_outb_msg_event(struct rio_mport *mport, void *dev_id, + int mbox, int slot) +{ + struct cm_dev *cm = dev_id; + + if (cm && rio_mport_is_running(cm->mport)) + rio_txcq_handler(cm, slot); +} + +static int riocm_queue_req(struct cm_dev *cm, struct rio_dev *rdev, + void *buffer, size_t len) +{ + unsigned long flags; + struct tx_req *treq; + + treq = kzalloc(sizeof(*treq), GFP_KERNEL); + if (treq == NULL) + return -ENOMEM; + + treq->rdev = rdev; + treq->buffer = buffer; + treq->len = len; + + spin_lock_irqsave(&cm->tx_lock, flags); + list_add_tail(&treq->node, &cm->tx_reqs); + spin_unlock_irqrestore(&cm->tx_lock, flags); + return 0; +} + +/* + * riocm_post_send - helper function that places packet into msg TX queue + * @cm: cm_dev object + * @rdev: target RapidIO device object (required by outbound msg interface) + * @buffer: pointer to a packet buffer to send + * @len: length of data to transfer + * @req: request priority flag + * + * Returns: 0 if success, or error code otherwise. + */ +static int riocm_post_send(struct cm_dev *cm, struct rio_dev *rdev, + void *buffer, size_t len) +{ + int rc; + unsigned long flags; + + spin_lock_irqsave(&cm->tx_lock, flags); + + if (cm->mport == NULL) { + rc = -ENODEV; + goto err_out; + } + + if (cm->tx_cnt == RIOCM_TX_RING_SIZE) { + riocm_debug(TX, "Tx Queue is full"); + rc = -EBUSY; + goto err_out; + } + + cm->tx_buf[cm->tx_slot] = buffer; + rc = rio_add_outb_message(cm->mport, rdev, cmbox, buffer, len); + + riocm_debug(TX, "Add buf@%p destid=%x tx_slot=%d tx_cnt=%d", + buffer, rdev->destid, cm->tx_slot, cm->tx_cnt); + + ++cm->tx_cnt; + ++cm->tx_slot; + cm->tx_slot &= (RIOCM_TX_RING_SIZE - 1); + +err_out: + spin_unlock_irqrestore(&cm->tx_lock, flags); + return rc; +} + +/* + * riocm_ch_send - sends a data packet to a remote device + * @ch_id: local channel ID + * @buf: pointer to a data buffer to send (including CM header) + * @len: length of data to transfer (including CM header) + * + * ATTN: ASSUMES THAT THE HEADER SPACE IS RESERVED PART OF THE DATA PACKET + * + * Returns: 0 if success, or + * -EINVAL if one or more input parameters is/are not valid, + * -ENODEV if cannot find a channel with specified ID, + * -EAGAIN if a channel is not in CONNECTED state, + * + error codes returned by HW send routine. + */ +static int riocm_ch_send(u16 ch_id, void *buf, int len) +{ + struct rio_channel *ch; + struct rio_ch_chan_hdr *hdr; + int ret; + + if (buf == NULL || ch_id == 0 || len == 0 || len > RIO_MAX_MSG_SIZE) + return -EINVAL; + + ch = riocm_get_channel(ch_id); + if (!ch) { + riocm_error("%s(%d) ch_%d not found", current->comm, + task_pid_nr(current), ch_id); + return -ENODEV; + } + + if (!riocm_cmp(ch, RIO_CM_CONNECTED)) { + ret = -EAGAIN; + goto err_out; + } + + /* + * Fill buffer header section with corresponding channel data + */ + hdr = buf; + + hdr->bhdr.src_id = htonl(ch->loc_destid); + hdr->bhdr.dst_id = htonl(ch->rem_destid); + hdr->bhdr.src_mbox = cmbox; + hdr->bhdr.dst_mbox = cmbox; + hdr->bhdr.type = RIO_CM_CHAN; + hdr->ch_op = CM_DATA_MSG; + hdr->dst_ch = htons(ch->rem_channel); + hdr->src_ch = htons(ch->id); + hdr->msg_len = htons((u16)len); + + /* ATTN: the function call below relies on the fact that underlying + * HW-specific add_outb_message() routine copies TX data into its own + * internal transfer buffer (true for all RIONET compatible mport + * drivers). Must be reviewed if mport driver uses the buffer directly. + */ + + ret = riocm_post_send(ch->cmdev, ch->rdev, buf, len); + if (ret) + riocm_debug(TX, "ch %d send_err=%d", ch->id, ret); +err_out: + riocm_put_channel(ch); + return ret; +} + +static int riocm_ch_free_rxbuf(struct rio_channel *ch, void *buf) +{ + int i, ret = -EINVAL; + + spin_lock_bh(&ch->lock); + + for (i = 0; i < RIOCM_RX_RING_SIZE; i++) { + if (ch->rx_ring.inuse[i] == buf) { + ch->rx_ring.inuse[i] = NULL; + ch->rx_ring.inuse_cnt--; + ret = 0; + break; + } + } + + spin_unlock_bh(&ch->lock); + + if (!ret) + kfree(buf); + + return ret; +} + +/* + * riocm_ch_receive - fetch a data packet received for the specified channel + * @ch: local channel ID + * @buf: pointer to a packet buffer + * @timeout: timeout to wait for incoming packet (in jiffies) + * + * Returns: 0 and valid buffer pointer if success, or NULL pointer and one of: + * -EAGAIN if a channel is not in CONNECTED state, + * -ENOMEM if in-use tracking queue is full, + * -ETIME if wait timeout expired, + * -EINTR if wait was interrupted. + */ +static int riocm_ch_receive(struct rio_channel *ch, void **buf, long timeout) +{ + void *rxmsg = NULL; + int i, ret = 0; + long wret; + + if (!riocm_cmp(ch, RIO_CM_CONNECTED)) { + ret = -EAGAIN; + goto out; + } + + if (ch->rx_ring.inuse_cnt == RIOCM_RX_RING_SIZE) { + /* If we do not have entries to track buffers given to upper + * layer, reject request. + */ + ret = -ENOMEM; + goto out; + } + + wret = wait_for_completion_interruptible_timeout(&ch->comp, timeout); + + riocm_debug(WAIT, "wait on %d returned %ld", ch->id, wret); + + if (!wret) + ret = -ETIME; + else if (wret == -ERESTARTSYS) + ret = -EINTR; + else + ret = riocm_cmp(ch, RIO_CM_CONNECTED) ? 0 : -ECONNRESET; + + if (ret) + goto out; + + spin_lock_bh(&ch->lock); + + rxmsg = ch->rx_ring.buf[ch->rx_ring.tail]; + ch->rx_ring.buf[ch->rx_ring.tail] = NULL; + ch->rx_ring.count--; + ch->rx_ring.tail++; + ch->rx_ring.tail %= RIOCM_RX_RING_SIZE; + ret = -ENOMEM; + + for (i = 0; i < RIOCM_RX_RING_SIZE; i++) { + if (ch->rx_ring.inuse[i] == NULL) { + ch->rx_ring.inuse[i] = rxmsg; + ch->rx_ring.inuse_cnt++; + ret = 0; + break; + } + } + + if (ret) { + /* We have no entry to store pending message: drop it */ + kfree(rxmsg); + rxmsg = NULL; + } + + spin_unlock_bh(&ch->lock); +out: + *buf = rxmsg; + return ret; +} + +/* + * riocm_ch_connect - sends a connect request to a remote device + * @loc_ch: local channel ID + * @cm: CM device to send connect request + * @peer: target RapidIO device + * @rem_ch: remote channel ID + * + * Returns: 0 if success, or + * -EINVAL if the channel is not in IDLE state, + * -EAGAIN if no connection request available immediately, + * -ETIME if ACK response timeout expired, + * -EINTR if wait for response was interrupted. + */ +static int riocm_ch_connect(u16 loc_ch, struct cm_dev *cm, + struct cm_peer *peer, u16 rem_ch) +{ + struct rio_channel *ch = NULL; + struct rio_ch_chan_hdr *hdr; + int ret; + long wret; + + ch = riocm_get_channel(loc_ch); + if (!ch) + return -ENODEV; + + if (!riocm_cmp_exch(ch, RIO_CM_IDLE, RIO_CM_CONNECT)) { + ret = -EINVAL; + goto conn_done; + } + + ch->cmdev = cm; + ch->rdev = peer->rdev; + ch->context = NULL; + ch->loc_destid = cm->mport->host_deviceid; + ch->rem_channel = rem_ch; + + /* + * Send connect request to the remote RapidIO device + */ + + hdr = kzalloc(sizeof(*hdr), GFP_KERNEL); + if (hdr == NULL) { + ret = -ENOMEM; + goto conn_done; + } + + hdr->bhdr.src_id = htonl(ch->loc_destid); + hdr->bhdr.dst_id = htonl(peer->rdev->destid); + hdr->bhdr.src_mbox = cmbox; + hdr->bhdr.dst_mbox = cmbox; + hdr->bhdr.type = RIO_CM_CHAN; + hdr->ch_op = CM_CONN_REQ; + hdr->dst_ch = htons(rem_ch); + hdr->src_ch = htons(loc_ch); + + /* ATTN: the function call below relies on the fact that underlying + * HW-specific add_outb_message() routine copies TX data into its + * internal transfer buffer. Must be reviewed if mport driver uses + * this buffer directly. + */ + ret = riocm_post_send(cm, peer->rdev, hdr, sizeof(*hdr)); + + if (ret != -EBUSY) { + kfree(hdr); + } else { + ret = riocm_queue_req(cm, peer->rdev, hdr, sizeof(*hdr)); + if (ret) + kfree(hdr); + } + + if (ret) { + riocm_cmp_exch(ch, RIO_CM_CONNECT, RIO_CM_IDLE); + goto conn_done; + } + + /* Wait for connect response from the remote device */ + wret = wait_for_completion_interruptible_timeout(&ch->comp, + RIOCM_CONNECT_TO * HZ); + riocm_debug(WAIT, "wait on %d returns %ld", ch->id, wret); + + if (!wret) + ret = -ETIME; + else if (wret == -ERESTARTSYS) + ret = -EINTR; + else + ret = riocm_cmp(ch, RIO_CM_CONNECTED) ? 0 : -1; + +conn_done: + riocm_put_channel(ch); + return ret; +} + +static int riocm_send_ack(struct rio_channel *ch) +{ + struct rio_ch_chan_hdr *hdr; + int ret; + + hdr = kzalloc(sizeof(*hdr), GFP_KERNEL); + if (hdr == NULL) + return -ENOMEM; + + hdr->bhdr.src_id = htonl(ch->loc_destid); + hdr->bhdr.dst_id = htonl(ch->rem_destid); + hdr->dst_ch = htons(ch->rem_channel); + hdr->src_ch = htons(ch->id); + hdr->bhdr.src_mbox = cmbox; + hdr->bhdr.dst_mbox = cmbox; + hdr->bhdr.type = RIO_CM_CHAN; + hdr->ch_op = CM_CONN_ACK; + + /* ATTN: the function call below relies on the fact that underlying + * add_outb_message() routine copies TX data into its internal transfer + * buffer. Review if switching to direct buffer version. + */ + ret = riocm_post_send(ch->cmdev, ch->rdev, hdr, sizeof(*hdr)); + + if (ret == -EBUSY && !riocm_queue_req(ch->cmdev, + ch->rdev, hdr, sizeof(*hdr))) + return 0; + kfree(hdr); + + if (ret) + riocm_error("send ACK to ch_%d on %s failed (ret=%d)", + ch->id, rio_name(ch->rdev), ret); + return ret; +} + +/* + * riocm_ch_accept - accept incoming connection request + * @ch_id: channel ID + * @new_ch_id: local mport device + * @timeout: wait timeout (if 0 non-blocking call, do not wait if connection + * request is not available). + * + * Returns: pointer to new channel struct if success, or error-valued pointer: + * -ENODEV - cannot find specified channel or mport, + * -EINVAL - the channel is not in IDLE state, + * -EAGAIN - no connection request available immediately (timeout=0), + * -ENOMEM - unable to allocate new channel, + * -ETIME - wait timeout expired, + * -EINTR - wait was interrupted. + */ +static struct rio_channel *riocm_ch_accept(u16 ch_id, u16 *new_ch_id, + long timeout) +{ + struct rio_channel *ch = NULL; + struct rio_channel *new_ch = NULL; + struct conn_req *req; + struct cm_peer *peer; + int found = 0; + int err = 0; + long wret; + + ch = riocm_get_channel(ch_id); + if (!ch) + return ERR_PTR(-EINVAL); + + if (!riocm_cmp(ch, RIO_CM_LISTEN)) { + err = -EINVAL; + goto err_put; + } + + /* Don't sleep if this is a non blocking call */ + if (!timeout) { + if (!try_wait_for_completion(&ch->comp)) { + err = -EAGAIN; + goto err_put; + } + } else { + riocm_debug(WAIT, "on %d", ch->id); + + wret = wait_for_completion_interruptible_timeout(&ch->comp, + timeout); + if (!wret) { + err = -ETIME; + goto err_put; + } else if (wret == -ERESTARTSYS) { + err = -EINTR; + goto err_put; + } + } + + spin_lock_bh(&ch->lock); + + if (ch->state != RIO_CM_LISTEN) { + err = -ECANCELED; + } else if (list_empty(&ch->accept_queue)) { + riocm_debug(WAIT, "on %d accept_queue is empty on completion", + ch->id); + err = -EIO; + } + + spin_unlock_bh(&ch->lock); + + if (err) { + riocm_debug(WAIT, "on %d returns %d", ch->id, err); + goto err_put; + } + + /* Create new channel for this connection */ + new_ch = riocm_ch_alloc(RIOCM_CHNUM_AUTO); + + if (IS_ERR(new_ch)) { + riocm_error("failed to get channel for new req (%ld)", + PTR_ERR(new_ch)); + err = -ENOMEM; + goto err_put; + } + + spin_lock_bh(&ch->lock); + + req = list_first_entry(&ch->accept_queue, struct conn_req, node); + list_del(&req->node); + new_ch->cmdev = ch->cmdev; + new_ch->loc_destid = ch->loc_destid; + new_ch->rem_destid = req->destid; + new_ch->rem_channel = req->chan; + + spin_unlock_bh(&ch->lock); + riocm_put_channel(ch); + kfree(req); + + down_read(&rdev_sem); + /* Find requester's device object */ + list_for_each_entry(peer, &new_ch->cmdev->peers, node) { + if (peer->rdev->destid == new_ch->rem_destid) { + riocm_debug(RX_CMD, "found matching device(%s)", + rio_name(peer->rdev)); + found = 1; + break; + } + } + up_read(&rdev_sem); + + if (!found) { + /* If peer device object not found, simply ignore the request */ + err = -ENODEV; + goto err_nodev; + } + + new_ch->rdev = peer->rdev; + new_ch->state = RIO_CM_CONNECTED; + spin_lock_init(&new_ch->lock); + + /* Acknowledge the connection request. */ + riocm_send_ack(new_ch); + + *new_ch_id = new_ch->id; + return new_ch; +err_put: + riocm_put_channel(ch); +err_nodev: + if (new_ch) { + spin_lock_bh(&idr_lock); + idr_remove(&ch_idr, new_ch->id); + spin_unlock_bh(&idr_lock); + riocm_put_channel(new_ch); + } + *new_ch_id = 0; + return ERR_PTR(err); +} + +/* + * riocm_ch_listen - puts a channel into LISTEN state + * @ch_id: channel ID + * + * Returns: 0 if success, or + * -EINVAL if the specified channel does not exists or + * is not in CHAN_BOUND state. + */ +static int riocm_ch_listen(u16 ch_id) +{ + struct rio_channel *ch = NULL; + int ret = 0; + + riocm_debug(CHOP, "(ch_%d)", ch_id); + + ch = riocm_get_channel(ch_id); + if (!ch || !riocm_cmp_exch(ch, RIO_CM_CHAN_BOUND, RIO_CM_LISTEN)) + ret = -EINVAL; + riocm_put_channel(ch); + return ret; +} + +/* + * riocm_ch_bind - associate a channel object and an mport device + * @ch_id: channel ID + * @mport_id: local mport device ID + * @context: pointer to the additional caller's context + * + * Returns: 0 if success, or + * -ENODEV if cannot find specified mport, + * -EINVAL if the specified channel does not exist or + * is not in IDLE state. + */ +static int riocm_ch_bind(u16 ch_id, u8 mport_id, void *context) +{ + struct rio_channel *ch = NULL; + struct cm_dev *cm; + int rc = -ENODEV; + + riocm_debug(CHOP, "ch_%d to mport_%d", ch_id, mport_id); + + /* Find matching cm_dev object */ + down_read(&rdev_sem); + list_for_each_entry(cm, &cm_dev_list, list) { + if ((cm->mport->id == mport_id) && + rio_mport_is_running(cm->mport)) { + rc = 0; + break; + } + } + + if (rc) + goto exit; + + ch = riocm_get_channel(ch_id); + if (!ch) { + rc = -EINVAL; + goto exit; + } + + spin_lock_bh(&ch->lock); + if (ch->state != RIO_CM_IDLE) { + spin_unlock_bh(&ch->lock); + rc = -EINVAL; + goto err_put; + } + + ch->cmdev = cm; + ch->loc_destid = cm->mport->host_deviceid; + ch->context = context; + ch->state = RIO_CM_CHAN_BOUND; + spin_unlock_bh(&ch->lock); +err_put: + riocm_put_channel(ch); +exit: + up_read(&rdev_sem); + return rc; +} + +/* + * riocm_ch_alloc - channel object allocation helper routine + * @ch_num: channel ID (1 ... RIOCM_MAX_CHNUM, 0 = automatic) + * + * Return value: pointer to newly created channel object, + * or error-valued pointer + */ +static struct rio_channel *riocm_ch_alloc(u16 ch_num) +{ + int id; + int start, end; + struct rio_channel *ch; + + ch = kzalloc(sizeof(*ch), GFP_KERNEL); + if (!ch) + return ERR_PTR(-ENOMEM); + + if (ch_num) { + /* If requested, try to obtain the specified channel ID */ + start = ch_num; + end = ch_num + 1; + } else { + /* Obtain channel ID from the dynamic allocation range */ + start = chstart; + end = RIOCM_MAX_CHNUM + 1; + } + + idr_preload(GFP_KERNEL); + spin_lock_bh(&idr_lock); + id = idr_alloc_cyclic(&ch_idr, ch, start, end, GFP_NOWAIT); + spin_unlock_bh(&idr_lock); + idr_preload_end(); + + if (id < 0) { + kfree(ch); + return ERR_PTR(id == -ENOSPC ? -EBUSY : id); + } + + ch->id = (u16)id; + ch->state = RIO_CM_IDLE; + spin_lock_init(&ch->lock); + INIT_LIST_HEAD(&ch->accept_queue); + INIT_LIST_HEAD(&ch->ch_node); + init_completion(&ch->comp); + init_completion(&ch->comp_close); + kref_init(&ch->ref); + ch->rx_ring.head = 0; + ch->rx_ring.tail = 0; + ch->rx_ring.count = 0; + ch->rx_ring.inuse_cnt = 0; + + return ch; +} + +/* + * riocm_ch_create - creates a new channel object and allocates ID for it + * @ch_num: channel ID (1 ... RIOCM_MAX_CHNUM, 0 = automatic) + * + * Allocates and initializes a new channel object. If the parameter ch_num > 0 + * and is within the valid range, riocm_ch_create tries to allocate the + * specified ID for the new channel. If ch_num = 0, channel ID will be assigned + * automatically from the range (chstart ... RIOCM_MAX_CHNUM). + * Module parameter 'chstart' defines start of an ID range available for dynamic + * allocation. Range below 'chstart' is reserved for pre-defined ID numbers. + * Available channel numbers are limited by 16-bit size of channel numbers used + * in the packet header. + * + * Return value: PTR to rio_channel structure if successful (with channel number + * updated via pointer) or error-valued pointer if error. + */ +static struct rio_channel *riocm_ch_create(u16 *ch_num) +{ + struct rio_channel *ch = NULL; + + ch = riocm_ch_alloc(*ch_num); + + if (IS_ERR(ch)) + riocm_debug(CHOP, "Failed to allocate channel %d (err=%ld)", + *ch_num, PTR_ERR(ch)); + else + *ch_num = ch->id; + + return ch; +} + +/* + * riocm_ch_free - channel object release routine + * @ref: pointer to a channel's kref structure + */ +static void riocm_ch_free(struct kref *ref) +{ + struct rio_channel *ch = container_of(ref, struct rio_channel, ref); + int i; + + riocm_debug(CHOP, "(ch_%d)", ch->id); + + if (ch->rx_ring.inuse_cnt) { + for (i = 0; + i < RIOCM_RX_RING_SIZE && ch->rx_ring.inuse_cnt; i++) { + if (ch->rx_ring.inuse[i] != NULL) { + kfree(ch->rx_ring.inuse[i]); + ch->rx_ring.inuse_cnt--; + } + } + } + + if (ch->rx_ring.count) + for (i = 0; i < RIOCM_RX_RING_SIZE && ch->rx_ring.count; i++) { + if (ch->rx_ring.buf[i] != NULL) { + kfree(ch->rx_ring.buf[i]); + ch->rx_ring.count--; + } + } + + complete(&ch->comp_close); +} + +static int riocm_send_close(struct rio_channel *ch) +{ + struct rio_ch_chan_hdr *hdr; + int ret; + + /* + * Send CH_CLOSE notification to the remote RapidIO device + */ + + hdr = kzalloc(sizeof(*hdr), GFP_KERNEL); + if (hdr == NULL) + return -ENOMEM; + + hdr->bhdr.src_id = htonl(ch->loc_destid); + hdr->bhdr.dst_id = htonl(ch->rem_destid); + hdr->bhdr.src_mbox = cmbox; + hdr->bhdr.dst_mbox = cmbox; + hdr->bhdr.type = RIO_CM_CHAN; + hdr->ch_op = CM_CONN_CLOSE; + hdr->dst_ch = htons(ch->rem_channel); + hdr->src_ch = htons(ch->id); + + /* ATTN: the function call below relies on the fact that underlying + * add_outb_message() routine copies TX data into its internal transfer + * buffer. Needs to be reviewed if switched to direct buffer mode. + */ + ret = riocm_post_send(ch->cmdev, ch->rdev, hdr, sizeof(*hdr)); + + if (ret == -EBUSY && !riocm_queue_req(ch->cmdev, ch->rdev, + hdr, sizeof(*hdr))) + return 0; + kfree(hdr); + + if (ret) + riocm_error("ch(%d) send CLOSE failed (ret=%d)", ch->id, ret); + + return ret; +} + +/* + * riocm_ch_close - closes a channel object with specified ID (by local request) + * @ch: channel to be closed + */ +static int riocm_ch_close(struct rio_channel *ch) +{ + unsigned long tmo = msecs_to_jiffies(3000); + enum rio_cm_state state; + long wret; + int ret = 0; + + riocm_debug(CHOP, "ch_%d by %s(%d)", + ch->id, current->comm, task_pid_nr(current)); + + state = riocm_exch(ch, RIO_CM_DESTROYING); + if (state == RIO_CM_CONNECTED) + riocm_send_close(ch); + + complete_all(&ch->comp); + + riocm_put_channel(ch); + wret = wait_for_completion_interruptible_timeout(&ch->comp_close, tmo); + + riocm_debug(WAIT, "wait on %d returns %ld", ch->id, wret); + + if (wret == 0) { + /* Timeout on wait occurred */ + riocm_debug(CHOP, "%s(%d) timed out waiting for ch %d", + current->comm, task_pid_nr(current), ch->id); + ret = -ETIMEDOUT; + } else if (wret == -ERESTARTSYS) { + /* Wait_for_completion was interrupted by a signal */ + riocm_debug(CHOP, "%s(%d) wait for ch %d was interrupted", + current->comm, task_pid_nr(current), ch->id); + ret = -EINTR; + } + + if (!ret) { + riocm_debug(CHOP, "ch_%d resources released", ch->id); + kfree(ch); + } else { + riocm_debug(CHOP, "failed to release ch_%d resources", ch->id); + } + + return ret; +} + +/* + * riocm_cdev_open() - Open character device + */ +static int riocm_cdev_open(struct inode *inode, struct file *filp) +{ + riocm_debug(INIT, "by %s(%d) filp=%p ", + current->comm, task_pid_nr(current), filp); + + if (list_empty(&cm_dev_list)) + return -ENODEV; + + return 0; +} + +/* + * riocm_cdev_release() - Release character device + */ +static int riocm_cdev_release(struct inode *inode, struct file *filp) +{ + struct rio_channel *ch, *_c; + unsigned int i; + LIST_HEAD(list); + + riocm_debug(EXIT, "by %s(%d) filp=%p", + current->comm, task_pid_nr(current), filp); + + /* Check if there are channels associated with this file descriptor */ + spin_lock_bh(&idr_lock); + idr_for_each_entry(&ch_idr, ch, i) { + if (ch && ch->filp == filp) { + riocm_debug(EXIT, "ch_%d not released by %s(%d)", + ch->id, current->comm, + task_pid_nr(current)); + idr_remove(&ch_idr, ch->id); + list_add(&ch->ch_node, &list); + } + } + spin_unlock_bh(&idr_lock); + + if (!list_empty(&list)) { + list_for_each_entry_safe(ch, _c, &list, ch_node) { + list_del(&ch->ch_node); + riocm_ch_close(ch); + } + } + + return 0; +} + +/* + * cm_ep_get_list_size() - Reports number of endpoints in the network + */ +static int cm_ep_get_list_size(void __user *arg) +{ + u32 __user *p = arg; + u32 mport_id; + u32 count = 0; + struct cm_dev *cm; + + if (get_user(mport_id, p)) + return -EFAULT; + if (mport_id >= RIO_MAX_MPORTS) + return -EINVAL; + + /* Find a matching cm_dev object */ + down_read(&rdev_sem); + list_for_each_entry(cm, &cm_dev_list, list) { + if (cm->mport->id == mport_id) { + count = cm->npeers; + up_read(&rdev_sem); + if (copy_to_user(arg, &count, sizeof(u32))) + return -EFAULT; + return 0; + } + } + up_read(&rdev_sem); + + return -ENODEV; +} + +/* + * cm_ep_get_list() - Returns list of attached endpoints + */ +static int cm_ep_get_list(void __user *arg) +{ + struct cm_dev *cm; + struct cm_peer *peer; + u32 info[2]; + void *buf; + u32 nent; + u32 *entry_ptr; + u32 i = 0; + int ret = 0; + + if (copy_from_user(&info, arg, sizeof(info))) + return -EFAULT; + + if (info[1] >= RIO_MAX_MPORTS || info[0] > RIOCM_MAX_EP_COUNT) + return -EINVAL; + + /* Find a matching cm_dev object */ + down_read(&rdev_sem); + list_for_each_entry(cm, &cm_dev_list, list) + if (cm->mport->id == (u8)info[1]) + goto found; + + up_read(&rdev_sem); + return -ENODEV; + +found: + nent = min(info[0], cm->npeers); + buf = kcalloc(nent + 2, sizeof(u32), GFP_KERNEL); + if (!buf) { + up_read(&rdev_sem); + return -ENOMEM; + } + + entry_ptr = (u32 *)((uintptr_t)buf + 2*sizeof(u32)); + + list_for_each_entry(peer, &cm->peers, node) { + *entry_ptr = (u32)peer->rdev->destid; + entry_ptr++; + if (++i == nent) + break; + } + up_read(&rdev_sem); + + ((u32 *)buf)[0] = i; /* report an updated number of entries */ + ((u32 *)buf)[1] = info[1]; /* put back an mport ID */ + if (copy_to_user(arg, buf, sizeof(u32) * (info[0] + 2))) + ret = -EFAULT; + + kfree(buf); + return ret; +} + +/* + * cm_mport_get_list() - Returns list of available local mport devices + */ +static int cm_mport_get_list(void __user *arg) +{ + int ret = 0; + u32 entries; + void *buf; + struct cm_dev *cm; + u32 *entry_ptr; + int count = 0; + + if (copy_from_user(&entries, arg, sizeof(entries))) + return -EFAULT; + if (entries == 0 || entries > RIO_MAX_MPORTS) + return -EINVAL; + buf = kcalloc(entries + 1, sizeof(u32), GFP_KERNEL); + if (!buf) + return -ENOMEM; + + /* Scan all registered cm_dev objects */ + entry_ptr = (u32 *)((uintptr_t)buf + sizeof(u32)); + down_read(&rdev_sem); + list_for_each_entry(cm, &cm_dev_list, list) { + if (count++ < entries) { + *entry_ptr = (cm->mport->id << 16) | + cm->mport->host_deviceid; + entry_ptr++; + } + } + up_read(&rdev_sem); + + *((u32 *)buf) = count; /* report a real number of entries */ + if (copy_to_user(arg, buf, sizeof(u32) * (count + 1))) + ret = -EFAULT; + + kfree(buf); + return ret; +} + +/* + * cm_chan_create() - Create a message exchange channel + */ +static int cm_chan_create(struct file *filp, void __user *arg) +{ + u16 __user *p = arg; + u16 ch_num; + struct rio_channel *ch; + + if (get_user(ch_num, p)) + return -EFAULT; + + riocm_debug(CHOP, "ch_%d requested by %s(%d)", + ch_num, current->comm, task_pid_nr(current)); + ch = riocm_ch_create(&ch_num); + if (IS_ERR(ch)) + return PTR_ERR(ch); + + ch->filp = filp; + riocm_debug(CHOP, "ch_%d created by %s(%d)", + ch_num, current->comm, task_pid_nr(current)); + return put_user(ch_num, p); +} + +/* + * cm_chan_close() - Close channel + * @filp: Pointer to file object + * @arg: Channel to close + */ +static int cm_chan_close(struct file *filp, void __user *arg) +{ + u16 __user *p = arg; + u16 ch_num; + struct rio_channel *ch; + + if (get_user(ch_num, p)) + return -EFAULT; + + riocm_debug(CHOP, "ch_%d by %s(%d)", + ch_num, current->comm, task_pid_nr(current)); + + spin_lock_bh(&idr_lock); + ch = idr_find(&ch_idr, ch_num); + if (!ch) { + spin_unlock_bh(&idr_lock); + return 0; + } + if (ch->filp != filp) { + spin_unlock_bh(&idr_lock); + return -EINVAL; + } + idr_remove(&ch_idr, ch->id); + spin_unlock_bh(&idr_lock); + + return riocm_ch_close(ch); +} + +/* + * cm_chan_bind() - Bind channel + * @arg: Channel number + */ +static int cm_chan_bind(void __user *arg) +{ + struct rio_cm_channel chan; + + if (copy_from_user(&chan, arg, sizeof(chan))) + return -EFAULT; + if (chan.mport_id >= RIO_MAX_MPORTS) + return -EINVAL; + + return riocm_ch_bind(chan.id, chan.mport_id, NULL); +} + +/* + * cm_chan_listen() - Listen on channel + * @arg: Channel number + */ +static int cm_chan_listen(void __user *arg) +{ + u16 __user *p = arg; + u16 ch_num; + + if (get_user(ch_num, p)) + return -EFAULT; + + return riocm_ch_listen(ch_num); +} + +/* + * cm_chan_accept() - Accept incoming connection + * @filp: Pointer to file object + * @arg: Channel number + */ +static int cm_chan_accept(struct file *filp, void __user *arg) +{ + struct rio_cm_accept param; + long accept_to; + struct rio_channel *ch; + + if (copy_from_user(¶m, arg, sizeof(param))) + return -EFAULT; + + riocm_debug(CHOP, "on ch_%d by %s(%d)", + param.ch_num, current->comm, task_pid_nr(current)); + + accept_to = param.wait_to ? + msecs_to_jiffies(param.wait_to) : 0; + + ch = riocm_ch_accept(param.ch_num, ¶m.ch_num, accept_to); + if (IS_ERR(ch)) + return PTR_ERR(ch); + ch->filp = filp; + + riocm_debug(CHOP, "new ch_%d for %s(%d)", + ch->id, current->comm, task_pid_nr(current)); + + if (copy_to_user(arg, ¶m, sizeof(param))) + return -EFAULT; + return 0; +} + +/* + * cm_chan_connect() - Connect on channel + * @arg: Channel information + */ +static int cm_chan_connect(void __user *arg) +{ + struct rio_cm_channel chan; + struct cm_dev *cm; + struct cm_peer *peer; + int ret = -ENODEV; + + if (copy_from_user(&chan, arg, sizeof(chan))) + return -EFAULT; + if (chan.mport_id >= RIO_MAX_MPORTS) + return -EINVAL; + + down_read(&rdev_sem); + + /* Find matching cm_dev object */ + list_for_each_entry(cm, &cm_dev_list, list) { + if (cm->mport->id == chan.mport_id) { + ret = 0; + break; + } + } + + if (ret) + goto err_out; + + if (chan.remote_destid >= RIO_ANY_DESTID(cm->mport->sys_size)) { + ret = -EINVAL; + goto err_out; + } + + /* Find corresponding RapidIO endpoint device object */ + ret = -ENODEV; + + list_for_each_entry(peer, &cm->peers, node) { + if (peer->rdev->destid == chan.remote_destid) { + ret = 0; + break; + } + } + + if (ret) + goto err_out; + + up_read(&rdev_sem); + + return riocm_ch_connect(chan.id, cm, peer, chan.remote_channel); +err_out: + up_read(&rdev_sem); + return ret; +} + +/* + * cm_chan_msg_send() - Send a message through channel + * @arg: Outbound message information + */ +static int cm_chan_msg_send(void __user *arg) +{ + struct rio_cm_msg msg; + void *buf; + int ret = 0; + + if (copy_from_user(&msg, arg, sizeof(msg))) + return -EFAULT; + if (msg.size > RIO_MAX_MSG_SIZE) + return -EINVAL; + + buf = kmalloc(msg.size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + if (copy_from_user(buf, (void __user *)(uintptr_t)msg.msg, msg.size)) { + ret = -EFAULT; + goto out; + } + + ret = riocm_ch_send(msg.ch_num, buf, msg.size); +out: + kfree(buf); + return ret; +} + +/* + * cm_chan_msg_rcv() - Receive a message through channel + * @arg: Inbound message information + */ +static int cm_chan_msg_rcv(void __user *arg) +{ + struct rio_cm_msg msg; + struct rio_channel *ch; + void *buf; + long rxto; + int ret = 0, msg_size; + + if (copy_from_user(&msg, arg, sizeof(msg))) + return -EFAULT; + + if (msg.ch_num == 0 || msg.size == 0) + return -EINVAL; + + ch = riocm_get_channel(msg.ch_num); + if (!ch) + return -ENODEV; + + rxto = msg.rxto ? msecs_to_jiffies(msg.rxto) : MAX_SCHEDULE_TIMEOUT; + + ret = riocm_ch_receive(ch, &buf, rxto); + if (ret) + goto out; + + msg_size = min(msg.size, (u16)(RIO_MAX_MSG_SIZE)); + + if (copy_to_user((void __user *)(uintptr_t)msg.msg, buf, msg_size)) + ret = -EFAULT; + + riocm_ch_free_rxbuf(ch, buf); +out: + riocm_put_channel(ch); + return ret; +} + +/* + * riocm_cdev_ioctl() - IOCTL requests handler + */ +static long +riocm_cdev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case RIO_CM_EP_GET_LIST_SIZE: + return cm_ep_get_list_size((void __user *)arg); + case RIO_CM_EP_GET_LIST: + return cm_ep_get_list((void __user *)arg); + case RIO_CM_CHAN_CREATE: + return cm_chan_create(filp, (void __user *)arg); + case RIO_CM_CHAN_CLOSE: + return cm_chan_close(filp, (void __user *)arg); + case RIO_CM_CHAN_BIND: + return cm_chan_bind((void __user *)arg); + case RIO_CM_CHAN_LISTEN: + return cm_chan_listen((void __user *)arg); + case RIO_CM_CHAN_ACCEPT: + return cm_chan_accept(filp, (void __user *)arg); + case RIO_CM_CHAN_CONNECT: + return cm_chan_connect((void __user *)arg); + case RIO_CM_CHAN_SEND: + return cm_chan_msg_send((void __user *)arg); + case RIO_CM_CHAN_RECEIVE: + return cm_chan_msg_rcv((void __user *)arg); + case RIO_CM_MPORT_GET_LIST: + return cm_mport_get_list((void __user *)arg); + default: + break; + } + + return -EINVAL; +} + +static const struct file_operations riocm_cdev_fops = { + .owner = THIS_MODULE, + .open = riocm_cdev_open, + .release = riocm_cdev_release, + .unlocked_ioctl = riocm_cdev_ioctl, +}; + +/* + * riocm_add_dev - add new remote RapidIO device into channel management core + * @dev: device object associated with RapidIO device + * @sif: subsystem interface + * + * Adds the specified RapidIO device (if applicable) into peers list of + * the corresponding channel management device (cm_dev). + */ +static int riocm_add_dev(struct device *dev, struct subsys_interface *sif) +{ + struct cm_peer *peer; + struct rio_dev *rdev = to_rio_dev(dev); + struct cm_dev *cm; + + /* Check if the remote device has capabilities required to support CM */ + if (!dev_cm_capable(rdev)) + return 0; + + riocm_debug(RDEV, "(%s)", rio_name(rdev)); + + peer = kmalloc(sizeof(*peer), GFP_KERNEL); + if (!peer) + return -ENOMEM; + + /* Find a corresponding cm_dev object */ + down_write(&rdev_sem); + list_for_each_entry(cm, &cm_dev_list, list) { + if (cm->mport == rdev->net->hport) + goto found; + } + + up_write(&rdev_sem); + kfree(peer); + return -ENODEV; + +found: + peer->rdev = rdev; + list_add_tail(&peer->node, &cm->peers); + cm->npeers++; + + up_write(&rdev_sem); + return 0; +} + +/* + * riocm_remove_dev - remove remote RapidIO device from channel management core + * @dev: device object associated with RapidIO device + * @sif: subsystem interface + * + * Removes the specified RapidIO device (if applicable) from peers list of + * the corresponding channel management device (cm_dev). + */ +static void riocm_remove_dev(struct device *dev, struct subsys_interface *sif) +{ + struct rio_dev *rdev = to_rio_dev(dev); + struct cm_dev *cm; + struct cm_peer *peer; + struct rio_channel *ch, *_c; + unsigned int i; + bool found = false; + LIST_HEAD(list); + + /* Check if the remote device has capabilities required to support CM */ + if (!dev_cm_capable(rdev)) + return; + + riocm_debug(RDEV, "(%s)", rio_name(rdev)); + + /* Find matching cm_dev object */ + down_write(&rdev_sem); + list_for_each_entry(cm, &cm_dev_list, list) { + if (cm->mport == rdev->net->hport) { + found = true; + break; + } + } + + if (!found) { + up_write(&rdev_sem); + return; + } + + /* Remove remote device from the list of peers */ + found = false; + list_for_each_entry(peer, &cm->peers, node) { + if (peer->rdev == rdev) { + riocm_debug(RDEV, "removing peer %s", rio_name(rdev)); + found = true; + list_del(&peer->node); + cm->npeers--; + kfree(peer); + break; + } + } + + up_write(&rdev_sem); + + if (!found) + return; + + /* + * Release channels associated with this peer + */ + + spin_lock_bh(&idr_lock); + idr_for_each_entry(&ch_idr, ch, i) { + if (ch && ch->rdev == rdev) { + if (atomic_read(&rdev->state) != RIO_DEVICE_SHUTDOWN) + riocm_exch(ch, RIO_CM_DISCONNECT); + idr_remove(&ch_idr, ch->id); + list_add(&ch->ch_node, &list); + } + } + spin_unlock_bh(&idr_lock); + + if (!list_empty(&list)) { + list_for_each_entry_safe(ch, _c, &list, ch_node) { + list_del(&ch->ch_node); + riocm_ch_close(ch); + } + } +} + +/* + * riocm_cdev_add() - Create rio_cm char device + * @devno: device number assigned to device (MAJ + MIN) + */ +static int riocm_cdev_add(dev_t devno) +{ + int ret; + + cdev_init(&riocm_cdev.cdev, &riocm_cdev_fops); + riocm_cdev.cdev.owner = THIS_MODULE; + ret = cdev_add(&riocm_cdev.cdev, devno, 1); + if (ret < 0) { + riocm_error("Cannot register a device with error %d", ret); + return ret; + } + + riocm_cdev.dev = device_create(dev_class, NULL, devno, NULL, DEV_NAME); + if (IS_ERR(riocm_cdev.dev)) { + cdev_del(&riocm_cdev.cdev); + return PTR_ERR(riocm_cdev.dev); + } + + riocm_debug(MPORT, "Added %s cdev(%d:%d)", + DEV_NAME, MAJOR(devno), MINOR(devno)); + + return 0; +} + +/* + * riocm_add_mport - add new local mport device into channel management core + * @dev: device object associated with mport + * @class_intf: class interface + * + * When a new mport device is added, CM immediately reserves inbound and + * outbound RapidIO mailboxes that will be used. + */ +static int riocm_add_mport(struct device *dev, + struct class_interface *class_intf) +{ + int rc; + int i; + struct cm_dev *cm; + struct rio_mport *mport = to_rio_mport(dev); + + riocm_debug(MPORT, "add mport %s", mport->name); + + cm = kzalloc(sizeof(*cm), GFP_KERNEL); + if (!cm) + return -ENOMEM; + + cm->mport = mport; + + rc = rio_request_outb_mbox(mport, cm, cmbox, + RIOCM_TX_RING_SIZE, riocm_outb_msg_event); + if (rc) { + riocm_error("failed to allocate OBMBOX_%d on %s", + cmbox, mport->name); + kfree(cm); + return -ENODEV; + } + + rc = rio_request_inb_mbox(mport, cm, cmbox, + RIOCM_RX_RING_SIZE, riocm_inb_msg_event); + if (rc) { + riocm_error("failed to allocate IBMBOX_%d on %s", + cmbox, mport->name); + rio_release_outb_mbox(mport, cmbox); + kfree(cm); + return -ENODEV; + } + + /* + * Allocate and register inbound messaging buffers to be ready + * to receive channel and system management requests + */ + for (i = 0; i < RIOCM_RX_RING_SIZE; i++) + cm->rx_buf[i] = NULL; + + cm->rx_slots = RIOCM_RX_RING_SIZE; + mutex_init(&cm->rx_lock); + riocm_rx_fill(cm, RIOCM_RX_RING_SIZE); + cm->rx_wq = create_workqueue(DRV_NAME "/rxq"); + INIT_WORK(&cm->rx_work, rio_ibmsg_handler); + + cm->tx_slot = 0; + cm->tx_cnt = 0; + cm->tx_ack_slot = 0; + spin_lock_init(&cm->tx_lock); + + INIT_LIST_HEAD(&cm->peers); + cm->npeers = 0; + INIT_LIST_HEAD(&cm->tx_reqs); + + down_write(&rdev_sem); + list_add_tail(&cm->list, &cm_dev_list); + up_write(&rdev_sem); + + return 0; +} + +/* + * riocm_remove_mport - remove local mport device from channel management core + * @dev: device object associated with mport + * @class_intf: class interface + * + * Removes a local mport device from the list of registered devices that provide + * channel management services. Returns an error if the specified mport is not + * registered with the CM core. + */ +static void riocm_remove_mport(struct device *dev, + struct class_interface *class_intf) +{ + struct rio_mport *mport = to_rio_mport(dev); + struct cm_dev *cm; + struct cm_peer *peer, *temp; + struct rio_channel *ch, *_c; + unsigned int i; + bool found = false; + LIST_HEAD(list); + + riocm_debug(MPORT, "%s", mport->name); + + /* Find a matching cm_dev object */ + down_write(&rdev_sem); + list_for_each_entry(cm, &cm_dev_list, list) { + if (cm->mport == mport) { + list_del(&cm->list); + found = true; + break; + } + } + up_write(&rdev_sem); + if (!found) + return; + + flush_workqueue(cm->rx_wq); + destroy_workqueue(cm->rx_wq); + + /* Release channels bound to this mport */ + spin_lock_bh(&idr_lock); + idr_for_each_entry(&ch_idr, ch, i) { + if (ch->cmdev == cm) { + riocm_debug(RDEV, "%s drop ch_%d", + mport->name, ch->id); + idr_remove(&ch_idr, ch->id); + list_add(&ch->ch_node, &list); + } + } + spin_unlock_bh(&idr_lock); + + if (!list_empty(&list)) { + list_for_each_entry_safe(ch, _c, &list, ch_node) { + list_del(&ch->ch_node); + riocm_ch_close(ch); + } + } + + rio_release_inb_mbox(mport, cmbox); + rio_release_outb_mbox(mport, cmbox); + + /* Remove and free peer entries */ + if (!list_empty(&cm->peers)) + riocm_debug(RDEV, "ATTN: peer list not empty"); + list_for_each_entry_safe(peer, temp, &cm->peers, node) { + riocm_debug(RDEV, "removing peer %s", rio_name(peer->rdev)); + list_del(&peer->node); + kfree(peer); + } + + riocm_rx_free(cm); + kfree(cm); + riocm_debug(MPORT, "%s done", mport->name); +} + +static int rio_cm_shutdown(struct notifier_block *nb, unsigned long code, + void *unused) +{ + struct rio_channel *ch; + unsigned int i; + + riocm_debug(EXIT, "."); + + spin_lock_bh(&idr_lock); + idr_for_each_entry(&ch_idr, ch, i) { + riocm_debug(EXIT, "close ch %d", ch->id); + if (ch->state == RIO_CM_CONNECTED) + riocm_send_close(ch); + } + spin_unlock_bh(&idr_lock); + + return NOTIFY_DONE; +} + +/* + * riocm_interface handles addition/removal of remote RapidIO devices + */ +static struct subsys_interface riocm_interface = { + .name = "rio_cm", + .subsys = &rio_bus_type, + .add_dev = riocm_add_dev, + .remove_dev = riocm_remove_dev, +}; + +/* + * rio_mport_interface handles addition/removal local mport devices + */ +static struct class_interface rio_mport_interface __refdata = { + .class = &rio_mport_class, + .add_dev = riocm_add_mport, + .remove_dev = riocm_remove_mport, +}; + +static struct notifier_block rio_cm_notifier = { + .notifier_call = rio_cm_shutdown, +}; + +static int __init riocm_init(void) +{ + int ret; + + /* Create device class needed by udev */ + dev_class = class_create(THIS_MODULE, DRV_NAME); + if (IS_ERR(dev_class)) { + riocm_error("Cannot create " DRV_NAME " class"); + return PTR_ERR(dev_class); + } + + ret = alloc_chrdev_region(&dev_number, 0, 1, DRV_NAME); + if (ret) { + class_destroy(dev_class); + return ret; + } + + dev_major = MAJOR(dev_number); + dev_minor_base = MINOR(dev_number); + riocm_debug(INIT, "Registered class with %d major", dev_major); + + /* + * Register as rapidio_port class interface to get notifications about + * mport additions and removals. + */ + ret = class_interface_register(&rio_mport_interface); + if (ret) { + riocm_error("class_interface_register error: %d", ret); + goto err_reg; + } + + /* + * Register as RapidIO bus interface to get notifications about + * addition/removal of remote RapidIO devices. + */ + ret = subsys_interface_register(&riocm_interface); + if (ret) { + riocm_error("subsys_interface_register error: %d", ret); + goto err_cl; + } + + ret = register_reboot_notifier(&rio_cm_notifier); + if (ret) { + riocm_error("failed to register reboot notifier (err=%d)", ret); + goto err_sif; + } + + ret = riocm_cdev_add(dev_number); + if (ret) { + unregister_reboot_notifier(&rio_cm_notifier); + ret = -ENODEV; + goto err_sif; + } + + return 0; +err_sif: + subsys_interface_unregister(&riocm_interface); +err_cl: + class_interface_unregister(&rio_mport_interface); +err_reg: + unregister_chrdev_region(dev_number, 1); + class_destroy(dev_class); + return ret; +} + +static void __exit riocm_exit(void) +{ + riocm_debug(EXIT, "enter"); + unregister_reboot_notifier(&rio_cm_notifier); + subsys_interface_unregister(&riocm_interface); + class_interface_unregister(&rio_mport_interface); + idr_destroy(&ch_idr); + + device_unregister(riocm_cdev.dev); + cdev_del(&(riocm_cdev.cdev)); + + class_destroy(dev_class); + unregister_chrdev_region(dev_number, 1); +} + +late_initcall(riocm_init); +module_exit(riocm_exit); diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index 6d4e92ccdc91..c44747c0796a 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -357,6 +357,7 @@ header-y += reiserfs_fs.h header-y += reiserfs_xattr.h header-y += resource.h header-y += rfkill.h +header-y += rio_cm_cdev.h header-y += rio_mport_cdev.h header-y += romfs_fs.h header-y += rose.h diff --git a/include/uapi/linux/rio_cm_cdev.h b/include/uapi/linux/rio_cm_cdev.h new file mode 100644 index 000000000000..6edb900d318d --- /dev/null +++ b/include/uapi/linux/rio_cm_cdev.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2015, Integrated Device Technology Inc. + * Copyright (c) 2015, Prodrive Technologies + * Copyright (c) 2015, RapidIO Trade Association + * All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License(GPL) Version 2, or the BSD-3 Clause license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _RIO_CM_CDEV_H_ +#define _RIO_CM_CDEV_H_ + +#include + +struct rio_cm_channel { + __u16 id; + __u16 remote_channel; + __u16 remote_destid; + __u8 mport_id; +}; + +struct rio_cm_msg { + __u16 ch_num; + __u16 size; + __u32 rxto; /* receive timeout in mSec. 0 = blocking */ + __u64 msg; +}; + +struct rio_cm_accept { + __u16 ch_num; + __u16 pad0; + __u32 wait_to; /* accept timeout in mSec. 0 = blocking */ +}; + +/* RapidIO Channelized Messaging Driver IOCTLs */ +#define RIO_CM_IOC_MAGIC 'c' + +#define RIO_CM_EP_GET_LIST_SIZE _IOWR(RIO_CM_IOC_MAGIC, 1, __u32) +#define RIO_CM_EP_GET_LIST _IOWR(RIO_CM_IOC_MAGIC, 2, __u32) +#define RIO_CM_CHAN_CREATE _IOWR(RIO_CM_IOC_MAGIC, 3, __u16) +#define RIO_CM_CHAN_CLOSE _IOW(RIO_CM_IOC_MAGIC, 4, __u16) +#define RIO_CM_CHAN_BIND _IOW(RIO_CM_IOC_MAGIC, 5, struct rio_cm_channel) +#define RIO_CM_CHAN_LISTEN _IOW(RIO_CM_IOC_MAGIC, 6, __u16) +#define RIO_CM_CHAN_ACCEPT _IOWR(RIO_CM_IOC_MAGIC, 7, struct rio_cm_accept) +#define RIO_CM_CHAN_CONNECT _IOW(RIO_CM_IOC_MAGIC, 8, struct rio_cm_channel) +#define RIO_CM_CHAN_SEND _IOW(RIO_CM_IOC_MAGIC, 9, struct rio_cm_msg) +#define RIO_CM_CHAN_RECEIVE _IOWR(RIO_CM_IOC_MAGIC, 10, struct rio_cm_msg) +#define RIO_CM_MPORT_GET_LIST _IOWR(RIO_CM_IOC_MAGIC, 11, __u32) + +#endif /* _RIO_CM_CDEV_H_ */ -- cgit v1.2.3-71-gd317 From c49298026908a8ce9dcf01ed68734ad171cef98b Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Wed, 27 Jul 2016 21:08:42 -0400 Subject: IB/hfi1: Allow for non-double word multiple message sizes for user SDMA The driver pads non-double word multiple message sizes but it doesn't account for this padding when the packet length is calculated. Also, the data length is miscalculated for message sizes less than 4 bytes due to the bit representation in LRH. And there's a check for non-double word multiple message sizes that prevents these messages from being sent. This patch fixes length miscalculations and enables the functionality to send non-double word multiple message sizes. Reviewed-by: Harish Chegondi Signed-off-by: Sebastian Sanchez Signed-off-by: Ira Weiny Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/user_sdma.c | 31 ++++++++++++++++++++++--------- include/uapi/rdma/hfi/hfi1_user.h | 2 +- 2 files changed, 23 insertions(+), 10 deletions(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c index d16ed52a2cb1..1e266c95056a 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.c +++ b/drivers/infiniband/hw/hfi1/user_sdma.c @@ -793,14 +793,21 @@ static inline u32 compute_data_length(struct user_sdma_request *req, * The size of the data of the first packet is in the header * template. However, it includes the header and ICRC, which need * to be subtracted. + * The minimum representable packet data length in a header is 4 bytes, + * therefore, when the data length request is less than 4 bytes, there's + * only one packet, and the packet data length is equal to that of the + * request data length. * The size of the remaining packets is the minimum of the frag * size (MTU) or remaining data in the request. */ u32 len; if (!req->seqnum) { - len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) - - (sizeof(tx->hdr) - 4)); + if (req->data_len < sizeof(u32)) + len = req->data_len; + else + len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) - + (sizeof(tx->hdr) - 4)); } else if (req_opcode(req->info.ctrl) == EXPECTED) { u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) * PAGE_SIZE; @@ -830,6 +837,13 @@ static inline u32 compute_data_length(struct user_sdma_request *req, return len; } +static inline u32 pad_len(u32 len) +{ + if (len & (sizeof(u32) - 1)) + len += sizeof(u32) - (len & (sizeof(u32) - 1)); + return len; +} + static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len) { /* (Size of complete header - size of PBC) + 4B ICRC + data length */ @@ -921,7 +935,8 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts) if (test_bit(SDMA_REQ_HAVE_AHG, &req->flags)) { if (!req->seqnum) { u16 pbclen = le16_to_cpu(req->hdr.pbc[0]); - u32 lrhlen = get_lrh_len(req->hdr, datalen); + u32 lrhlen = get_lrh_len(req->hdr, + pad_len(datalen)); /* * Copy the request header into the tx header * because the HW needs a cacheline-aligned @@ -1219,16 +1234,14 @@ static int check_header_template(struct user_sdma_request *req, /* * Perform safety checks for any type of packet: * - transfer size is multiple of 64bytes - * - packet length is multiple of 4bytes - * - entire request length is multiple of 4bytes + * - packet length is multiple of 4 bytes * - packet length is not larger than MTU size * * These checks are only done for the first packet of the * transfer since the header is "given" to us by user space. * For the remainder of the packets we compute the values. */ - if (req->info.fragsize % PIO_BLOCK_SIZE || - lrhlen & 0x3 || req->data_len & 0x3 || + if (req->info.fragsize % PIO_BLOCK_SIZE || lrhlen & 0x3 || lrhlen > get_lrh_len(*hdr, req->info.fragsize)) return -EINVAL; @@ -1290,7 +1303,7 @@ static int set_txreq_header(struct user_sdma_request *req, struct hfi1_pkt_header *hdr = &tx->hdr; u16 pbclen; int ret; - u32 tidval = 0, lrhlen = get_lrh_len(*hdr, datalen); + u32 tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen)); /* Copy the header template to the request before modification */ memcpy(hdr, &req->hdr, sizeof(*hdr)); @@ -1401,7 +1414,7 @@ static int set_txreq_header_ahg(struct user_sdma_request *req, struct hfi1_user_sdma_pkt_q *pq = req->pq; struct hfi1_pkt_header *hdr = &req->hdr; u16 pbclen = le16_to_cpu(hdr->pbc[0]); - u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, len); + u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(len)); if (PBC2LRH(pbclen) != lrhlen) { /* PBC.PbcLengthDWs */ diff --git a/include/uapi/rdma/hfi/hfi1_user.h b/include/uapi/rdma/hfi/hfi1_user.h index 98bebf8bef55..d15e7289d835 100644 --- a/include/uapi/rdma/hfi/hfi1_user.h +++ b/include/uapi/rdma/hfi/hfi1_user.h @@ -75,7 +75,7 @@ * may not be implemented; the user code must deal with this if it * cares, or it must abort after initialization reports the difference. */ -#define HFI1_USER_SWMINOR 1 +#define HFI1_USER_SWMINOR 2 /* * We will encode the major/minor inside a single 32bit version number. -- cgit v1.2.3-71-gd317 From 444d13ff10fb13bc3e64859c3cf9ce43dcfeb075 Mon Sep 17 00:00:00 2001 From: Jessica Yu Date: Wed, 27 Jul 2016 12:06:21 +0930 Subject: modules: add ro_after_init support Add ro_after_init support for modules by adding a new page-aligned section in the module layout (after rodata) for ro_after_init data and enabling RO protection for that section after module init runs. Signed-off-by: Jessica Yu Acked-by: Kees Cook Signed-off-by: Rusty Russell --- include/linux/module.h | 6 +++-- include/uapi/linux/elf.h | 1 + kernel/livepatch/core.c | 2 +- kernel/module.c | 66 +++++++++++++++++++++++++++++++++++++++--------- 4 files changed, 60 insertions(+), 15 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/module.h b/include/linux/module.h index f95ed243a4de..0c3207d26ac0 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -298,6 +298,8 @@ struct module_layout { unsigned int text_size; /* Size of RO section of the module (text+rodata) */ unsigned int ro_size; + /* Size of RO after init section */ + unsigned int ro_after_init_size; #ifdef CONFIG_MODULES_TREE_LOOKUP struct mod_tree_node mtn; @@ -765,12 +767,12 @@ extern int module_sysfs_initialized; #ifdef CONFIG_DEBUG_SET_MODULE_RONX extern void set_all_modules_text_rw(void); extern void set_all_modules_text_ro(void); -extern void module_enable_ro(const struct module *mod); +extern void module_enable_ro(const struct module *mod, bool after_init); extern void module_disable_ro(const struct module *mod); #else static inline void set_all_modules_text_rw(void) { } static inline void set_all_modules_text_ro(void) { } -static inline void module_enable_ro(const struct module *mod) { } +static inline void module_enable_ro(const struct module *mod, bool after_init) { } static inline void module_disable_ro(const struct module *mod) { } #endif diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index cb4a72f888d5..70b172ba41ce 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -286,6 +286,7 @@ typedef struct elf64_phdr { #define SHF_ALLOC 0x2 #define SHF_EXECINSTR 0x4 #define SHF_RELA_LIVEPATCH 0x00100000 +#define SHF_RO_AFTER_INIT 0x00200000 #define SHF_MASKPROC 0xf0000000 /* special section indexes */ diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 5c2bc1052691..8bbe50704621 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -309,7 +309,7 @@ static int klp_write_object_relocations(struct module *pmod, break; } - module_enable_ro(pmod); + module_enable_ro(pmod, true); return ret; } diff --git a/kernel/module.c b/kernel/module.c index c91c2fdca2e6..205a71a97852 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1857,10 +1857,11 @@ static void mod_sysfs_teardown(struct module *mod) * from modification and any data from execution. * * General layout of module is: - * [text] [read-only-data] [writable data] - * text_size -----^ ^ ^ - * ro_size ------------------------| | - * size -------------------------------------------| + * [text] [read-only-data] [ro-after-init] [writable data] + * text_size -----^ ^ ^ ^ + * ro_size ------------------------| | | + * ro_after_init_size -----------------------------| | + * size -----------------------------------------------------------| * * These values are always page-aligned (as is base) */ @@ -1883,14 +1884,24 @@ static void frob_rodata(const struct module_layout *layout, (layout->ro_size - layout->text_size) >> PAGE_SHIFT); } +static void frob_ro_after_init(const struct module_layout *layout, + int (*set_memory)(unsigned long start, int num_pages)) +{ + BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->ro_after_init_size & (PAGE_SIZE-1)); + set_memory((unsigned long)layout->base + layout->ro_size, + (layout->ro_after_init_size - layout->ro_size) >> PAGE_SHIFT); +} + static void frob_writable_data(const struct module_layout *layout, int (*set_memory)(unsigned long start, int num_pages)) { BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1)); - BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->ro_after_init_size & (PAGE_SIZE-1)); BUG_ON((unsigned long)layout->size & (PAGE_SIZE-1)); - set_memory((unsigned long)layout->base + layout->ro_size, - (layout->size - layout->ro_size) >> PAGE_SHIFT); + set_memory((unsigned long)layout->base + layout->ro_after_init_size, + (layout->size - layout->ro_after_init_size) >> PAGE_SHIFT); } /* livepatching wants to disable read-only so it can frob module. */ @@ -1898,21 +1909,26 @@ void module_disable_ro(const struct module *mod) { frob_text(&mod->core_layout, set_memory_rw); frob_rodata(&mod->core_layout, set_memory_rw); + frob_ro_after_init(&mod->core_layout, set_memory_rw); frob_text(&mod->init_layout, set_memory_rw); frob_rodata(&mod->init_layout, set_memory_rw); } -void module_enable_ro(const struct module *mod) +void module_enable_ro(const struct module *mod, bool after_init) { frob_text(&mod->core_layout, set_memory_ro); frob_rodata(&mod->core_layout, set_memory_ro); frob_text(&mod->init_layout, set_memory_ro); frob_rodata(&mod->init_layout, set_memory_ro); + + if (after_init) + frob_ro_after_init(&mod->core_layout, set_memory_ro); } static void module_enable_nx(const struct module *mod) { frob_rodata(&mod->core_layout, set_memory_nx); + frob_ro_after_init(&mod->core_layout, set_memory_nx); frob_writable_data(&mod->core_layout, set_memory_nx); frob_rodata(&mod->init_layout, set_memory_nx); frob_writable_data(&mod->init_layout, set_memory_nx); @@ -1921,6 +1937,7 @@ static void module_enable_nx(const struct module *mod) static void module_disable_nx(const struct module *mod) { frob_rodata(&mod->core_layout, set_memory_x); + frob_ro_after_init(&mod->core_layout, set_memory_x); frob_writable_data(&mod->core_layout, set_memory_x); frob_rodata(&mod->init_layout, set_memory_x); frob_writable_data(&mod->init_layout, set_memory_x); @@ -1963,6 +1980,8 @@ static void disable_ro_nx(const struct module_layout *layout) frob_text(layout, set_memory_rw); frob_rodata(layout, set_memory_rw); frob_rodata(layout, set_memory_x); + frob_ro_after_init(layout, set_memory_rw); + frob_ro_after_init(layout, set_memory_x); frob_writable_data(layout, set_memory_x); } @@ -2305,6 +2324,7 @@ static void layout_sections(struct module *mod, struct load_info *info) * finder in the two loops below */ { SHF_EXECINSTR | SHF_ALLOC, ARCH_SHF_SMALL }, { SHF_ALLOC, SHF_WRITE | ARCH_SHF_SMALL }, + { SHF_RO_AFTER_INIT | SHF_ALLOC, ARCH_SHF_SMALL }, { SHF_WRITE | SHF_ALLOC, ARCH_SHF_SMALL }, { ARCH_SHF_SMALL | SHF_ALLOC, 0 } }; @@ -2336,7 +2356,11 @@ static void layout_sections(struct module *mod, struct load_info *info) mod->core_layout.size = debug_align(mod->core_layout.size); mod->core_layout.ro_size = mod->core_layout.size; break; - case 3: /* whole core */ + case 2: /* RO after init */ + mod->core_layout.size = debug_align(mod->core_layout.size); + mod->core_layout.ro_after_init_size = mod->core_layout.size; + break; + case 4: /* whole core */ mod->core_layout.size = debug_align(mod->core_layout.size); break; } @@ -2366,7 +2390,14 @@ static void layout_sections(struct module *mod, struct load_info *info) mod->init_layout.size = debug_align(mod->init_layout.size); mod->init_layout.ro_size = mod->init_layout.size; break; - case 3: /* whole init */ + case 2: + /* + * RO after init doesn't apply to init_layout (only + * core_layout), so it just takes the value of ro_size. + */ + mod->init_layout.ro_after_init_size = mod->init_layout.ro_size; + break; + case 4: /* whole init */ mod->init_layout.size = debug_align(mod->init_layout.size); break; } @@ -3193,6 +3224,7 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) { /* Module within temporary copy. */ struct module *mod; + unsigned int ndx; int err; mod = setup_load_info(info, flags); @@ -3215,6 +3247,15 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) /* We will do a special allocation for per-cpu sections later. */ info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC; + /* + * Mark ro_after_init section with SHF_RO_AFTER_INIT so that + * layout_sections() can put it in the right place. + * Note: ro_after_init sections also have SHF_{WRITE,ALLOC} set. + */ + ndx = find_sec(info, ".data..ro_after_init"); + if (ndx) + info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT; + /* Determine total sizes, and put offsets in sh_entsize. For now this is done generically; there doesn't appear to be any special cases for the architectures. */ @@ -3381,12 +3422,14 @@ static noinline int do_init_module(struct module *mod) /* Switch to core kallsyms now init is done: kallsyms may be walking! */ rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms); #endif + module_enable_ro(mod, true); mod_tree_remove_init(mod); disable_ro_nx(&mod->init_layout); module_arch_freeing_init(mod); mod->init_layout.base = NULL; mod->init_layout.size = 0; mod->init_layout.ro_size = 0; + mod->init_layout.ro_after_init_size = 0; mod->init_layout.text_size = 0; /* * We want to free module_init, but be aware that kallsyms may be @@ -3478,8 +3521,7 @@ static int complete_formation(struct module *mod, struct load_info *info) /* This relies on module_mutex for list integrity. */ module_bug_finalize(info->hdr, info->sechdrs, mod); - /* Set RO and NX regions */ - module_enable_ro(mod); + module_enable_ro(mod, false); module_enable_nx(mod); /* Mark state as coming so strong_try_module_get() ignores us, -- cgit v1.2.3-71-gd317 From ab15c95a17b3fe8c0e01bb7ce1dd0b657598eb61 Mon Sep 17 00:00:00 2001 From: Alex Vesker Date: Wed, 6 Jul 2016 16:36:35 +0300 Subject: IB/core: Support for CMA multicast join flags Added UCMA and CMA support for multicast join flags. Flags are passed using UCMA CM join command previously reserved fields. Currently supporting two join flags indicating two different multicast JoinStates: 1. Full Member: The initiator creates the Multicast group(MCG) if it wasn't previously created, can send Multicast messages to the group and receive messages from the MCG. 2. Send Only Full Member: The initiator creates the Multicast group(MCG) if it wasn't previously created, can send Multicast messages to the group but doesn't receive any messages from the MCG. IB: Send Only Full Member requires a query of ClassPortInfo to determine if SM/SA supports this option. If SM/SA doesn't support Send-Only there will be no join request sent and an error will be returned. ETH: When Send Only Full Member is requested no IGMP join will be sent. Signed-off-by: Alex Vesker Reviewed by: Hal Rosenstock Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/cma.c | 100 +++++++++++++++++++++++++++++++++--- drivers/infiniband/core/multicast.c | 12 ----- drivers/infiniband/core/ucma.c | 18 +++++-- include/rdma/ib_sa.h | 13 +++++ include/rdma/rdma_cm.h | 4 +- include/uapi/rdma/rdma_user_cm.h | 9 +++- 6 files changed, 131 insertions(+), 25 deletions(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index f0c91ba3178a..0451307bea18 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -68,6 +68,7 @@ MODULE_DESCRIPTION("Generic RDMA CM Agent"); MODULE_LICENSE("Dual BSD/GPL"); #define CMA_CM_RESPONSE_TIMEOUT 20 +#define CMA_QUERY_CLASSPORT_INFO_TIMEOUT 3000 #define CMA_MAX_CM_RETRIES 15 #define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24) #define CMA_IBOE_PACKET_LIFETIME 18 @@ -162,6 +163,14 @@ struct rdma_bind_list { unsigned short port; }; +struct class_port_info_context { + struct ib_class_port_info *class_port_info; + struct ib_device *device; + struct completion done; + struct ib_sa_query *sa_query; + u8 port_num; +}; + static int cma_ps_alloc(struct net *net, enum rdma_port_space ps, struct rdma_bind_list *bind_list, int snum) { @@ -306,6 +315,7 @@ struct cma_multicast { struct sockaddr_storage addr; struct kref mcref; bool igmp_joined; + u8 join_state; }; struct cma_work { @@ -3754,10 +3764,63 @@ static void cma_set_mgid(struct rdma_id_private *id_priv, } } +static void cma_query_sa_classport_info_cb(int status, + struct ib_class_port_info *rec, + void *context) +{ + struct class_port_info_context *cb_ctx = context; + + WARN_ON(!context); + + if (status || !rec) { + pr_debug("RDMA CM: %s port %u failed query ClassPortInfo status: %d\n", + cb_ctx->device->name, cb_ctx->port_num, status); + goto out; + } + + memcpy(cb_ctx->class_port_info, rec, sizeof(struct ib_class_port_info)); + +out: + complete(&cb_ctx->done); +} + +static int cma_query_sa_classport_info(struct ib_device *device, u8 port_num, + struct ib_class_port_info *class_port_info) +{ + struct class_port_info_context *cb_ctx; + int ret; + + cb_ctx = kmalloc(sizeof(*cb_ctx), GFP_KERNEL); + if (!cb_ctx) + return -ENOMEM; + + cb_ctx->device = device; + cb_ctx->class_port_info = class_port_info; + cb_ctx->port_num = port_num; + init_completion(&cb_ctx->done); + + ret = ib_sa_classport_info_rec_query(&sa_client, device, port_num, + CMA_QUERY_CLASSPORT_INFO_TIMEOUT, + GFP_KERNEL, cma_query_sa_classport_info_cb, + cb_ctx, &cb_ctx->sa_query); + if (ret < 0) { + pr_err("RDMA CM: %s port %u failed to send ClassPortInfo query, ret: %d\n", + device->name, port_num, ret); + goto out; + } + + wait_for_completion(&cb_ctx->done); + +out: + kfree(cb_ctx); + return ret; +} + static int cma_join_ib_multicast(struct rdma_id_private *id_priv, struct cma_multicast *mc) { struct ib_sa_mcmember_rec rec; + struct ib_class_port_info class_port_info; struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; ib_sa_comp_mask comp_mask; int ret; @@ -3776,7 +3839,24 @@ static int cma_join_ib_multicast(struct rdma_id_private *id_priv, rec.qkey = cpu_to_be32(id_priv->qkey); rdma_addr_get_sgid(dev_addr, &rec.port_gid); rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr)); - rec.join_state = 1; + rec.join_state = mc->join_state; + + if (rec.join_state == BIT(SENDONLY_FULLMEMBER_JOIN)) { + ret = cma_query_sa_classport_info(id_priv->id.device, + id_priv->id.port_num, + &class_port_info); + + if (ret) + return ret; + + if (!(ib_get_cpi_capmask2(&class_port_info) & + IB_SA_CAP_MASK2_SENDONLY_FULL_MEM_SUPPORT)) { + pr_warn("RDMA CM: %s port %u Unable to multicast join\n" + "RDMA CM: SM doesn't support Send Only Full Member option\n", + id_priv->id.device->name, id_priv->id.port_num); + return -EOPNOTSUPP; + } + } comp_mask = IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID | IB_SA_MCMEMBER_REC_PKEY | IB_SA_MCMEMBER_REC_JOIN_STATE | @@ -3845,6 +3925,9 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, struct sockaddr *addr = (struct sockaddr *)&mc->addr; struct net_device *ndev = NULL; enum ib_gid_type gid_type; + bool send_only; + + send_only = mc->join_state == BIT(SENDONLY_FULLMEMBER_JOIN); if (cma_zero_addr((struct sockaddr *)&mc->addr)) return -EINVAL; @@ -3878,12 +3961,14 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, gid_type = id_priv->cma_dev->default_gid_type[id_priv->id.port_num - rdma_start_port(id_priv->cma_dev->device)]; if (addr->sa_family == AF_INET) { - if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) - err = cma_igmp_send(ndev, &mc->multicast.ib->rec.mgid, - true); - if (!err) { - mc->igmp_joined = true; + if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) { mc->multicast.ib->rec.hop_limit = IPV6_DEFAULT_HOPLIMIT; + if (!send_only) { + err = cma_igmp_send(ndev, &mc->multicast.ib->rec.mgid, + true); + if (!err) + mc->igmp_joined = true; + } } } else { if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) @@ -3913,7 +3998,7 @@ out1: } int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, - void *context) + u8 join_state, void *context) { struct rdma_id_private *id_priv; struct cma_multicast *mc; @@ -3932,6 +4017,7 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, mc->context = context; mc->id_priv = id_priv; mc->igmp_joined = false; + mc->join_state = join_state; spin_lock(&id_priv->lock); list_add(&mc->list, &id_priv->mc_list); spin_unlock(&id_priv->lock); diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c index a83ec28a147b..3a3c5d73bbfc 100644 --- a/drivers/infiniband/core/multicast.c +++ b/drivers/infiniband/core/multicast.c @@ -93,18 +93,6 @@ enum { struct mcast_member; -/* -* There are 4 types of join states: -* FullMember, NonMember, SendOnlyNonMember, SendOnlyFullMember. -*/ -enum { - FULLMEMBER_JOIN, - NONMEMBER_JOIN, - SENDONLY_NONMEBER_JOIN, - SENDONLY_FULLMEMBER_JOIN, - NUM_JOIN_MEMBERSHIP_TYPES, -}; - struct mcast_group { struct ib_sa_mcmember_rec rec; struct rb_node node; diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index c0f3826abb30..2825ece91d3c 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -106,6 +106,7 @@ struct ucma_multicast { int events_reported; u64 uid; + u8 join_state; struct list_head list; struct sockaddr_storage addr; }; @@ -1317,12 +1318,20 @@ static ssize_t ucma_process_join(struct ucma_file *file, struct ucma_multicast *mc; struct sockaddr *addr; int ret; + u8 join_state; if (out_len < sizeof(resp)) return -ENOSPC; addr = (struct sockaddr *) &cmd->addr; - if (cmd->reserved || !cmd->addr_size || (cmd->addr_size != rdma_addr_size(addr))) + if (!cmd->addr_size || (cmd->addr_size != rdma_addr_size(addr))) + return -EINVAL; + + if (cmd->join_flags == RDMA_MC_JOIN_FLAG_FULLMEMBER) + join_state = BIT(FULLMEMBER_JOIN); + else if (cmd->join_flags == RDMA_MC_JOIN_FLAG_SENDONLY_FULLMEMBER) + join_state = BIT(SENDONLY_FULLMEMBER_JOIN); + else return -EINVAL; ctx = ucma_get_ctx(file, cmd->id); @@ -1335,10 +1344,11 @@ static ssize_t ucma_process_join(struct ucma_file *file, ret = -ENOMEM; goto err1; } - + mc->join_state = join_state; mc->uid = cmd->uid; memcpy(&mc->addr, addr, cmd->addr_size); - ret = rdma_join_multicast(ctx->cm_id, (struct sockaddr *) &mc->addr, mc); + ret = rdma_join_multicast(ctx->cm_id, (struct sockaddr *)&mc->addr, + join_state, mc); if (ret) goto err2; @@ -1382,7 +1392,7 @@ static ssize_t ucma_join_ip_multicast(struct ucma_file *file, join_cmd.uid = cmd.uid; join_cmd.id = cmd.id; join_cmd.addr_size = rdma_addr_size((struct sockaddr *) &cmd.addr); - join_cmd.reserved = 0; + join_cmd.join_flags = RDMA_MC_JOIN_FLAG_FULLMEMBER; memcpy(&join_cmd.addr, &cmd.addr, join_cmd.addr_size); return ucma_process_join(file, &join_cmd, out_len); diff --git a/include/rdma/ib_sa.h b/include/rdma/ib_sa.h index 384041669489..5ee7aab95eb8 100644 --- a/include/rdma/ib_sa.h +++ b/include/rdma/ib_sa.h @@ -94,6 +94,19 @@ enum ib_sa_selector { IB_SA_BEST = 3 }; +/* + * There are 4 types of join states: + * FullMember, NonMember, SendOnlyNonMember, SendOnlyFullMember. + * The order corresponds to JoinState bits in MCMemberRecord. + */ +enum ib_sa_mc_join_states { + FULLMEMBER_JOIN, + NONMEMBER_JOIN, + SENDONLY_NONMEBER_JOIN, + SENDONLY_FULLMEMBER_JOIN, + NUM_JOIN_MEMBERSHIP_TYPES, +}; + #define IB_SA_CAP_MASK2_SENDONLY_FULL_MEM_SUPPORT BIT(12) /* diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h index afe44fde72a5..81fb1d15e8bb 100644 --- a/include/rdma/rdma_cm.h +++ b/include/rdma/rdma_cm.h @@ -333,11 +333,13 @@ int rdma_disconnect(struct rdma_cm_id *id); * address. * @id: Communication identifier associated with the request. * @addr: Multicast address identifying the group to join. + * @join_state: Multicast JoinState bitmap requested by port. + * Bitmap is based on IB_SA_MCMEMBER_REC_JOIN_STATE bits. * @context: User-defined context associated with the join request, returned * to the user through the private_data pointer in multicast events. */ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, - void *context); + u8 join_state, void *context); /** * rdma_leave_multicast - Leave the multicast group specified by the given diff --git a/include/uapi/rdma/rdma_user_cm.h b/include/uapi/rdma/rdma_user_cm.h index 3066718eb120..01923d463673 100644 --- a/include/uapi/rdma/rdma_user_cm.h +++ b/include/uapi/rdma/rdma_user_cm.h @@ -244,12 +244,19 @@ struct rdma_ucm_join_ip_mcast { __u32 id; }; +/* Multicast join flags */ +enum { + RDMA_MC_JOIN_FLAG_FULLMEMBER, + RDMA_MC_JOIN_FLAG_SENDONLY_FULLMEMBER, + RDMA_MC_JOIN_FLAG_RESERVED, +}; + struct rdma_ucm_join_mcast { __u64 response; /* rdma_ucma_create_id_resp */ __u64 uid; __u32 id; __u16 addr_size; - __u16 reserved; + __u16 join_flags; struct sockaddr_storage addr; }; -- cgit v1.2.3-71-gd317 From 8700e3e7c4857d28ebaa824509934556da0b3e76 Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Thu, 16 Jun 2016 16:45:23 +0300 Subject: Soft RoCE driver Soft RoCE (RXE) - The software RoCE driver ib_rxe implements the RDMA transport and registers to the RDMA core device as a kernel verbs provider. It also implements the packet IO layer. On the other hand ib_rxe registers to the Linux netdev stack as a udp encapsulating protocol, in that case RDMA, for sending and receiving packets over any Ethernet device. This yields a RDMA transport over the UDP/Ethernet network layer forming a RoCEv2 compatible device. The configuration procedure of the Soft RoCE drivers requires binding to any existing Ethernet network device. This is done with /sys interface. A userspace Soft RoCE library (librxe) provides user applications the ability to run with Soft RoCE devices. The use of rxe verbs ins user space requires the inclusion of librxe as a device specifics plug-in to libibverbs. librxe is packaged separately. Architecture: +-----------------------------------------------------------+ | Application | +-----------------------------------------------------------+ +-----------------------------------+ | libibverbs | User +-----------------------------------+ +----------------+ +----------------+ | librxe | | HW RoCE lib | +----------------+ +----------------+ +---------------------------------------------------------------+ +--------------+ +------------+ | Sockets | | RDMA ULP | +--------------+ +------------+ +--------------+ +---------------------+ | TCP/IP | | ib_core | +--------------+ +---------------------+ +------------+ +----------------+ Kernel | ib_rxe | | HW RoCE driver | +------------+ +----------------+ +------------------------------------+ | NIC driver | +------------------------------------+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +-----------------------------------------------------------+ | Application | +-----------------------------------------------------------+ +-----------------------------------+ | libibverbs | User +-----------------------------------+ +----------------+ +----------------+ | librxe | | HW RoCE lib | +----------------+ +----------------+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +--------------+ +------------+ | Sockets | | RDMA ULP | +--------------+ +------------+ +--------------+ +---------------------+ | TCP/IP | | ib_core | +--------------+ +---------------------+ +------------+ +----------------+ Kernel | ib_rxe | | HW RoCE driver | +------------+ +----------------+ +------------------------------------+ | NIC driver | +------------------------------------+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Soft RoCE resources: [1[ https://github.com/SoftRoCE/librxe-dev librxe - source code in Github [2] https://github.com/SoftRoCE/rxe-dev/wiki/rxe-dev:-Home - Soft RoCE Wiki page [3] https://github.com/SoftRoCE/librxe-dev - Soft RoCE userspace library Signed-off-by: Kamal Heib Signed-off-by: Amir Vadai Signed-off-by: Moni Shoua Reviewed-by: Haggai Eran Signed-off-by: Doug Ledford --- MAINTAINERS | 9 + drivers/infiniband/Kconfig | 1 + drivers/infiniband/sw/Makefile | 1 + drivers/infiniband/sw/rxe/Kconfig | 24 + drivers/infiniband/sw/rxe/Makefile | 24 + drivers/infiniband/sw/rxe/rxe.c | 386 +++++++++ drivers/infiniband/sw/rxe/rxe.h | 77 ++ drivers/infiniband/sw/rxe/rxe_av.c | 98 +++ drivers/infiniband/sw/rxe/rxe_comp.c | 734 +++++++++++++++++ drivers/infiniband/sw/rxe/rxe_cq.c | 165 ++++ drivers/infiniband/sw/rxe/rxe_dma.c | 166 ++++ drivers/infiniband/sw/rxe/rxe_hdr.h | 952 ++++++++++++++++++++++ drivers/infiniband/sw/rxe/rxe_icrc.c | 96 +++ drivers/infiniband/sw/rxe/rxe_loc.h | 286 +++++++ drivers/infiniband/sw/rxe/rxe_mcast.c | 190 +++++ drivers/infiniband/sw/rxe/rxe_mmap.c | 173 ++++ drivers/infiniband/sw/rxe/rxe_mr.c | 643 +++++++++++++++ drivers/infiniband/sw/rxe/rxe_net.c | 708 ++++++++++++++++ drivers/infiniband/sw/rxe/rxe_net.h | 53 ++ drivers/infiniband/sw/rxe/rxe_opcode.c | 961 ++++++++++++++++++++++ drivers/infiniband/sw/rxe/rxe_opcode.h | 129 +++ drivers/infiniband/sw/rxe/rxe_param.h | 172 ++++ drivers/infiniband/sw/rxe/rxe_pool.c | 502 ++++++++++++ drivers/infiniband/sw/rxe/rxe_pool.h | 163 ++++ drivers/infiniband/sw/rxe/rxe_qp.c | 851 ++++++++++++++++++++ drivers/infiniband/sw/rxe/rxe_queue.c | 217 +++++ drivers/infiniband/sw/rxe/rxe_queue.h | 178 ++++ drivers/infiniband/sw/rxe/rxe_recv.c | 420 ++++++++++ drivers/infiniband/sw/rxe/rxe_req.c | 726 +++++++++++++++++ drivers/infiniband/sw/rxe/rxe_resp.c | 1380 ++++++++++++++++++++++++++++++++ drivers/infiniband/sw/rxe/rxe_srq.c | 193 +++++ drivers/infiniband/sw/rxe/rxe_sysfs.c | 157 ++++ drivers/infiniband/sw/rxe/rxe_task.c | 154 ++++ drivers/infiniband/sw/rxe/rxe_task.h | 95 +++ drivers/infiniband/sw/rxe/rxe_verbs.c | 1330 ++++++++++++++++++++++++++++++ drivers/infiniband/sw/rxe/rxe_verbs.h | 480 +++++++++++ include/uapi/rdma/Kbuild | 1 + include/uapi/rdma/rdma_user_rxe.h | 144 ++++ 38 files changed, 13039 insertions(+) create mode 100644 drivers/infiniband/sw/rxe/Kconfig create mode 100644 drivers/infiniband/sw/rxe/Makefile create mode 100644 drivers/infiniband/sw/rxe/rxe.c create mode 100644 drivers/infiniband/sw/rxe/rxe.h create mode 100644 drivers/infiniband/sw/rxe/rxe_av.c create mode 100644 drivers/infiniband/sw/rxe/rxe_comp.c create mode 100644 drivers/infiniband/sw/rxe/rxe_cq.c create mode 100644 drivers/infiniband/sw/rxe/rxe_dma.c create mode 100644 drivers/infiniband/sw/rxe/rxe_hdr.h create mode 100644 drivers/infiniband/sw/rxe/rxe_icrc.c create mode 100644 drivers/infiniband/sw/rxe/rxe_loc.h create mode 100644 drivers/infiniband/sw/rxe/rxe_mcast.c create mode 100644 drivers/infiniband/sw/rxe/rxe_mmap.c create mode 100644 drivers/infiniband/sw/rxe/rxe_mr.c create mode 100644 drivers/infiniband/sw/rxe/rxe_net.c create mode 100644 drivers/infiniband/sw/rxe/rxe_net.h create mode 100644 drivers/infiniband/sw/rxe/rxe_opcode.c create mode 100644 drivers/infiniband/sw/rxe/rxe_opcode.h create mode 100644 drivers/infiniband/sw/rxe/rxe_param.h create mode 100644 drivers/infiniband/sw/rxe/rxe_pool.c create mode 100644 drivers/infiniband/sw/rxe/rxe_pool.h create mode 100644 drivers/infiniband/sw/rxe/rxe_qp.c create mode 100644 drivers/infiniband/sw/rxe/rxe_queue.c create mode 100644 drivers/infiniband/sw/rxe/rxe_queue.h create mode 100644 drivers/infiniband/sw/rxe/rxe_recv.c create mode 100644 drivers/infiniband/sw/rxe/rxe_req.c create mode 100644 drivers/infiniband/sw/rxe/rxe_resp.c create mode 100644 drivers/infiniband/sw/rxe/rxe_srq.c create mode 100644 drivers/infiniband/sw/rxe/rxe_sysfs.c create mode 100644 drivers/infiniband/sw/rxe/rxe_task.c create mode 100644 drivers/infiniband/sw/rxe/rxe_task.h create mode 100644 drivers/infiniband/sw/rxe/rxe_verbs.c create mode 100644 drivers/infiniband/sw/rxe/rxe_verbs.h create mode 100644 include/uapi/rdma/rdma_user_rxe.h (limited to 'include/uapi') diff --git a/MAINTAINERS b/MAINTAINERS index e1b090f86e0d..4daa6712eac1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7444,6 +7444,15 @@ W: http://www.mellanox.com Q: http://patchwork.ozlabs.org/project/netdev/list/ F: drivers/net/ethernet/mellanox/mlxsw/ +SOFT-ROCE DRIVER (rxe) +M: Moni Shoua +L: linux-rdma@vger.kernel.org +S: Supported +W: https://github.com/SoftRoCE/rxe-dev/wiki/rxe-dev:-Home +Q: http://patchwork.kernel.org/project/linux-rdma/list/ +F: drivers/infiniband/hw/rxe/ +F: include/uapi/rdma/rdma_user_rxe.h + MEMBARRIER SUPPORT M: Mathieu Desnoyers M: "Paul E. McKenney" diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index 2137adfbd8c3..e9b7dc037ff8 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -84,6 +84,7 @@ source "drivers/infiniband/ulp/iser/Kconfig" source "drivers/infiniband/ulp/isert/Kconfig" source "drivers/infiniband/sw/rdmavt/Kconfig" +source "drivers/infiniband/sw/rxe/Kconfig" source "drivers/infiniband/hw/hfi1/Kconfig" diff --git a/drivers/infiniband/sw/Makefile b/drivers/infiniband/sw/Makefile index 988b6a0101a4..8b095b27db87 100644 --- a/drivers/infiniband/sw/Makefile +++ b/drivers/infiniband/sw/Makefile @@ -1 +1,2 @@ obj-$(CONFIG_INFINIBAND_RDMAVT) += rdmavt/ +obj-$(CONFIG_RDMA_RXE) += rxe/ diff --git a/drivers/infiniband/sw/rxe/Kconfig b/drivers/infiniband/sw/rxe/Kconfig new file mode 100644 index 000000000000..1e4e628fe7b0 --- /dev/null +++ b/drivers/infiniband/sw/rxe/Kconfig @@ -0,0 +1,24 @@ +config RDMA_RXE + tristate "Software RDMA over Ethernet (RoCE) driver" + depends on INET && PCI && INFINIBAND + depends on NET_UDP_TUNNEL + ---help--- + This driver implements the InfiniBand RDMA transport over + the Linux network stack. It enables a system with a + standard Ethernet adapter to interoperate with a RoCE + adapter or with another system running the RXE driver. + Documentation on InfiniBand and RoCE can be downloaded at + www.infinibandta.org and www.openfabrics.org. (See also + siw which is a similar software driver for iWARP.) + + The driver is split into two layers, one interfaces with the + Linux RDMA stack and implements a kernel or user space + verbs API. The user space verbs API requires a support + library named librxe which is loaded by the generic user + space verbs API, libibverbs. The other layer interfaces + with the Linux network stack at layer 3. + + To configure and work with soft-RoCE driver please use the + following wiki page under "configure Soft-RoCE (RXE)" section: + + https://github.com/SoftRoCE/rxe-dev/wiki/rxe-dev:-Home diff --git a/drivers/infiniband/sw/rxe/Makefile b/drivers/infiniband/sw/rxe/Makefile new file mode 100644 index 000000000000..3b3fb9d1c470 --- /dev/null +++ b/drivers/infiniband/sw/rxe/Makefile @@ -0,0 +1,24 @@ +obj-$(CONFIG_RDMA_RXE) += rdma_rxe.o + +rdma_rxe-y := \ + rxe.o \ + rxe_comp.o \ + rxe_req.o \ + rxe_resp.o \ + rxe_recv.o \ + rxe_pool.o \ + rxe_queue.o \ + rxe_verbs.o \ + rxe_av.o \ + rxe_srq.o \ + rxe_qp.o \ + rxe_cq.o \ + rxe_mr.o \ + rxe_dma.o \ + rxe_opcode.o \ + rxe_mmap.o \ + rxe_icrc.o \ + rxe_mcast.o \ + rxe_task.o \ + rxe_net.o \ + rxe_sysfs.o diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c new file mode 100644 index 000000000000..55f0e8f0ca79 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe.c @@ -0,0 +1,386 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "rxe.h" +#include "rxe_loc.h" + +MODULE_AUTHOR("Bob Pearson, Frank Zago, John Groves, Kamal Heib"); +MODULE_DESCRIPTION("Soft RDMA transport"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION("0.2"); + +/* free resources for all ports on a device */ +static void rxe_cleanup_ports(struct rxe_dev *rxe) +{ + kfree(rxe->port.pkey_tbl); + rxe->port.pkey_tbl = NULL; + +} + +/* free resources for a rxe device all objects created for this device must + * have been destroyed + */ +static void rxe_cleanup(struct rxe_dev *rxe) +{ + rxe_pool_cleanup(&rxe->uc_pool); + rxe_pool_cleanup(&rxe->pd_pool); + rxe_pool_cleanup(&rxe->ah_pool); + rxe_pool_cleanup(&rxe->srq_pool); + rxe_pool_cleanup(&rxe->qp_pool); + rxe_pool_cleanup(&rxe->cq_pool); + rxe_pool_cleanup(&rxe->mr_pool); + rxe_pool_cleanup(&rxe->mw_pool); + rxe_pool_cleanup(&rxe->mc_grp_pool); + rxe_pool_cleanup(&rxe->mc_elem_pool); + + rxe_cleanup_ports(rxe); +} + +/* called when all references have been dropped */ +void rxe_release(struct kref *kref) +{ + struct rxe_dev *rxe = container_of(kref, struct rxe_dev, ref_cnt); + + rxe_cleanup(rxe); + ib_dealloc_device(&rxe->ib_dev); +} + +void rxe_dev_put(struct rxe_dev *rxe) +{ + kref_put(&rxe->ref_cnt, rxe_release); +} +EXPORT_SYMBOL_GPL(rxe_dev_put); + +/* initialize rxe device parameters */ +static int rxe_init_device_param(struct rxe_dev *rxe) +{ + rxe->max_inline_data = RXE_MAX_INLINE_DATA; + + rxe->attr.fw_ver = RXE_FW_VER; + rxe->attr.max_mr_size = RXE_MAX_MR_SIZE; + rxe->attr.page_size_cap = RXE_PAGE_SIZE_CAP; + rxe->attr.vendor_id = RXE_VENDOR_ID; + rxe->attr.vendor_part_id = RXE_VENDOR_PART_ID; + rxe->attr.hw_ver = RXE_HW_VER; + rxe->attr.max_qp = RXE_MAX_QP; + rxe->attr.max_qp_wr = RXE_MAX_QP_WR; + rxe->attr.device_cap_flags = RXE_DEVICE_CAP_FLAGS; + rxe->attr.max_sge = RXE_MAX_SGE; + rxe->attr.max_sge_rd = RXE_MAX_SGE_RD; + rxe->attr.max_cq = RXE_MAX_CQ; + rxe->attr.max_cqe = (1 << RXE_MAX_LOG_CQE) - 1; + rxe->attr.max_mr = RXE_MAX_MR; + rxe->attr.max_pd = RXE_MAX_PD; + rxe->attr.max_qp_rd_atom = RXE_MAX_QP_RD_ATOM; + rxe->attr.max_ee_rd_atom = RXE_MAX_EE_RD_ATOM; + rxe->attr.max_res_rd_atom = RXE_MAX_RES_RD_ATOM; + rxe->attr.max_qp_init_rd_atom = RXE_MAX_QP_INIT_RD_ATOM; + rxe->attr.max_ee_init_rd_atom = RXE_MAX_EE_INIT_RD_ATOM; + rxe->attr.atomic_cap = RXE_ATOMIC_CAP; + rxe->attr.max_ee = RXE_MAX_EE; + rxe->attr.max_rdd = RXE_MAX_RDD; + rxe->attr.max_mw = RXE_MAX_MW; + rxe->attr.max_raw_ipv6_qp = RXE_MAX_RAW_IPV6_QP; + rxe->attr.max_raw_ethy_qp = RXE_MAX_RAW_ETHY_QP; + rxe->attr.max_mcast_grp = RXE_MAX_MCAST_GRP; + rxe->attr.max_mcast_qp_attach = RXE_MAX_MCAST_QP_ATTACH; + rxe->attr.max_total_mcast_qp_attach = RXE_MAX_TOT_MCAST_QP_ATTACH; + rxe->attr.max_ah = RXE_MAX_AH; + rxe->attr.max_fmr = RXE_MAX_FMR; + rxe->attr.max_map_per_fmr = RXE_MAX_MAP_PER_FMR; + rxe->attr.max_srq = RXE_MAX_SRQ; + rxe->attr.max_srq_wr = RXE_MAX_SRQ_WR; + rxe->attr.max_srq_sge = RXE_MAX_SRQ_SGE; + rxe->attr.max_fast_reg_page_list_len = RXE_MAX_FMR_PAGE_LIST_LEN; + rxe->attr.max_pkeys = RXE_MAX_PKEYS; + rxe->attr.local_ca_ack_delay = RXE_LOCAL_CA_ACK_DELAY; + + rxe->max_ucontext = RXE_MAX_UCONTEXT; + + return 0; +} + +/* initialize port attributes */ +static int rxe_init_port_param(struct rxe_port *port) +{ + port->attr.state = RXE_PORT_STATE; + port->attr.max_mtu = RXE_PORT_MAX_MTU; + port->attr.active_mtu = RXE_PORT_ACTIVE_MTU; + port->attr.gid_tbl_len = RXE_PORT_GID_TBL_LEN; + port->attr.port_cap_flags = RXE_PORT_PORT_CAP_FLAGS; + port->attr.max_msg_sz = RXE_PORT_MAX_MSG_SZ; + port->attr.bad_pkey_cntr = RXE_PORT_BAD_PKEY_CNTR; + port->attr.qkey_viol_cntr = RXE_PORT_QKEY_VIOL_CNTR; + port->attr.pkey_tbl_len = RXE_PORT_PKEY_TBL_LEN; + port->attr.lid = RXE_PORT_LID; + port->attr.sm_lid = RXE_PORT_SM_LID; + port->attr.lmc = RXE_PORT_LMC; + port->attr.max_vl_num = RXE_PORT_MAX_VL_NUM; + port->attr.sm_sl = RXE_PORT_SM_SL; + port->attr.subnet_timeout = RXE_PORT_SUBNET_TIMEOUT; + port->attr.init_type_reply = RXE_PORT_INIT_TYPE_REPLY; + port->attr.active_width = RXE_PORT_ACTIVE_WIDTH; + port->attr.active_speed = RXE_PORT_ACTIVE_SPEED; + port->attr.phys_state = RXE_PORT_PHYS_STATE; + port->mtu_cap = + ib_mtu_enum_to_int(RXE_PORT_ACTIVE_MTU); + port->subnet_prefix = cpu_to_be64(RXE_PORT_SUBNET_PREFIX); + + return 0; +} + +/* initialize port state, note IB convention that HCA ports are always + * numbered from 1 + */ +static int rxe_init_ports(struct rxe_dev *rxe) +{ + struct rxe_port *port = &rxe->port; + + rxe_init_port_param(port); + + if (!port->attr.pkey_tbl_len || !port->attr.gid_tbl_len) + return -EINVAL; + + port->pkey_tbl = kcalloc(port->attr.pkey_tbl_len, + sizeof(*port->pkey_tbl), GFP_KERNEL); + + if (!port->pkey_tbl) + return -ENOMEM; + + port->pkey_tbl[0] = 0xffff; + port->port_guid = rxe->ifc_ops->port_guid(rxe); + + spin_lock_init(&port->port_lock); + + return 0; +} + +/* init pools of managed objects */ +static int rxe_init_pools(struct rxe_dev *rxe) +{ + int err; + + err = rxe_pool_init(rxe, &rxe->uc_pool, RXE_TYPE_UC, + rxe->max_ucontext); + if (err) + goto err1; + + err = rxe_pool_init(rxe, &rxe->pd_pool, RXE_TYPE_PD, + rxe->attr.max_pd); + if (err) + goto err2; + + err = rxe_pool_init(rxe, &rxe->ah_pool, RXE_TYPE_AH, + rxe->attr.max_ah); + if (err) + goto err3; + + err = rxe_pool_init(rxe, &rxe->srq_pool, RXE_TYPE_SRQ, + rxe->attr.max_srq); + if (err) + goto err4; + + err = rxe_pool_init(rxe, &rxe->qp_pool, RXE_TYPE_QP, + rxe->attr.max_qp); + if (err) + goto err5; + + err = rxe_pool_init(rxe, &rxe->cq_pool, RXE_TYPE_CQ, + rxe->attr.max_cq); + if (err) + goto err6; + + err = rxe_pool_init(rxe, &rxe->mr_pool, RXE_TYPE_MR, + rxe->attr.max_mr); + if (err) + goto err7; + + err = rxe_pool_init(rxe, &rxe->mw_pool, RXE_TYPE_MW, + rxe->attr.max_mw); + if (err) + goto err8; + + err = rxe_pool_init(rxe, &rxe->mc_grp_pool, RXE_TYPE_MC_GRP, + rxe->attr.max_mcast_grp); + if (err) + goto err9; + + err = rxe_pool_init(rxe, &rxe->mc_elem_pool, RXE_TYPE_MC_ELEM, + rxe->attr.max_total_mcast_qp_attach); + if (err) + goto err10; + + return 0; + +err10: + rxe_pool_cleanup(&rxe->mc_grp_pool); +err9: + rxe_pool_cleanup(&rxe->mw_pool); +err8: + rxe_pool_cleanup(&rxe->mr_pool); +err7: + rxe_pool_cleanup(&rxe->cq_pool); +err6: + rxe_pool_cleanup(&rxe->qp_pool); +err5: + rxe_pool_cleanup(&rxe->srq_pool); +err4: + rxe_pool_cleanup(&rxe->ah_pool); +err3: + rxe_pool_cleanup(&rxe->pd_pool); +err2: + rxe_pool_cleanup(&rxe->uc_pool); +err1: + return err; +} + +/* initialize rxe device state */ +static int rxe_init(struct rxe_dev *rxe) +{ + int err; + + /* init default device parameters */ + rxe_init_device_param(rxe); + + err = rxe_init_ports(rxe); + if (err) + goto err1; + + err = rxe_init_pools(rxe); + if (err) + goto err2; + + /* init pending mmap list */ + spin_lock_init(&rxe->mmap_offset_lock); + spin_lock_init(&rxe->pending_lock); + INIT_LIST_HEAD(&rxe->pending_mmaps); + INIT_LIST_HEAD(&rxe->list); + + mutex_init(&rxe->usdev_lock); + + return 0; + +err2: + rxe_cleanup_ports(rxe); +err1: + return err; +} + +int rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu) +{ + struct rxe_port *port = &rxe->port; + enum ib_mtu mtu; + + mtu = eth_mtu_int_to_enum(ndev_mtu); + + /* Make sure that new MTU in range */ + mtu = mtu ? min_t(enum ib_mtu, mtu, RXE_PORT_MAX_MTU) : IB_MTU_256; + + port->attr.active_mtu = mtu; + port->mtu_cap = ib_mtu_enum_to_int(mtu); + + return 0; +} +EXPORT_SYMBOL(rxe_set_mtu); + +/* called by ifc layer to create new rxe device. + * The caller should allocate memory for rxe by calling ib_alloc_device. + */ +int rxe_add(struct rxe_dev *rxe, unsigned int mtu) +{ + int err; + + kref_init(&rxe->ref_cnt); + + err = rxe_init(rxe); + if (err) + goto err1; + + err = rxe_set_mtu(rxe, mtu); + if (err) + goto err1; + + err = rxe_register_device(rxe); + if (err) + goto err1; + + return 0; + +err1: + rxe_dev_put(rxe); + return err; +} +EXPORT_SYMBOL(rxe_add); + +/* called by the ifc layer to remove a device */ +void rxe_remove(struct rxe_dev *rxe) +{ + rxe_unregister_device(rxe); + + rxe_dev_put(rxe); +} +EXPORT_SYMBOL(rxe_remove); + +static int __init rxe_module_init(void) +{ + int err; + + /* initialize slab caches for managed objects */ + err = rxe_cache_init(); + if (err) { + pr_err("rxe: unable to init object pools\n"); + return err; + } + + err = rxe_net_init(); + if (err) { + pr_err("rxe: unable to init\n"); + rxe_cache_exit(); + return err; + } + pr_info("rxe: loaded\n"); + + return 0; +} + +static void __exit rxe_module_exit(void) +{ + rxe_remove_all(); + rxe_net_exit(); + rxe_cache_exit(); + + pr_info("rxe: unloaded\n"); +} + +module_init(rxe_module_init); +module_exit(rxe_module_exit); diff --git a/drivers/infiniband/sw/rxe/rxe.h b/drivers/infiniband/sw/rxe/rxe.h new file mode 100644 index 000000000000..12c71c549f97 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef RXE_H +#define RXE_H + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "rxe_net.h" +#include "rxe_opcode.h" +#include "rxe_hdr.h" +#include "rxe_param.h" +#include "rxe_verbs.h" + +#define RXE_UVERBS_ABI_VERSION (1) + +#define IB_PHYS_STATE_LINK_UP (5) +#define IB_PHYS_STATE_LINK_DOWN (3) + +#define RXE_ROCE_V2_SPORT (0xc000) + +int rxe_set_mtu(struct rxe_dev *rxe, unsigned int dev_mtu); + +int rxe_add(struct rxe_dev *rxe, unsigned int mtu); +void rxe_remove(struct rxe_dev *rxe); +void rxe_remove_all(void); + +int rxe_rcv(struct sk_buff *skb); + +void rxe_dev_put(struct rxe_dev *rxe); +struct rxe_dev *net_to_rxe(struct net_device *ndev); +struct rxe_dev *get_rxe_by_name(const char* name); + +void rxe_port_up(struct rxe_dev *rxe); +void rxe_port_down(struct rxe_dev *rxe); + +#endif /* RXE_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_av.c b/drivers/infiniband/sw/rxe/rxe_av.c new file mode 100644 index 000000000000..5c9474212d4e --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_av.c @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "rxe.h" +#include "rxe_loc.h" + +int rxe_av_chk_attr(struct rxe_dev *rxe, struct ib_ah_attr *attr) +{ + struct rxe_port *port; + + if (attr->port_num != 1) { + pr_info("rxe: invalid port_num = %d\n", attr->port_num); + return -EINVAL; + } + + port = &rxe->port; + + if (attr->ah_flags & IB_AH_GRH) { + if (attr->grh.sgid_index > port->attr.gid_tbl_len) { + pr_info("rxe: invalid sgid index = %d\n", + attr->grh.sgid_index); + return -EINVAL; + } + } + + return 0; +} + +int rxe_av_from_attr(struct rxe_dev *rxe, u8 port_num, + struct rxe_av *av, struct ib_ah_attr *attr) +{ + memset(av, 0, sizeof(*av)); + memcpy(&av->grh, &attr->grh, sizeof(attr->grh)); + av->port_num = port_num; + return 0; +} + +int rxe_av_to_attr(struct rxe_dev *rxe, struct rxe_av *av, + struct ib_ah_attr *attr) +{ + memcpy(&attr->grh, &av->grh, sizeof(av->grh)); + attr->port_num = av->port_num; + return 0; +} + +int rxe_av_fill_ip_info(struct rxe_dev *rxe, + struct rxe_av *av, + struct ib_ah_attr *attr, + struct ib_gid_attr *sgid_attr, + union ib_gid *sgid) +{ + rdma_gid2ip(&av->sgid_addr._sockaddr, sgid); + rdma_gid2ip(&av->dgid_addr._sockaddr, &attr->grh.dgid); + av->network_type = ib_gid_to_network_type(sgid_attr->gid_type, sgid); + + return 0; +} + +struct rxe_av *rxe_get_av(struct rxe_pkt_info *pkt) +{ + if (!pkt || !pkt->qp) + return NULL; + + if (qp_type(pkt->qp) == IB_QPT_RC || qp_type(pkt->qp) == IB_QPT_UC) + return &pkt->qp->pri_av; + + return (pkt->wqe) ? &pkt->wqe->av : NULL; +} diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c new file mode 100644 index 000000000000..36f67de44095 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -0,0 +1,734 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include "rxe.h" +#include "rxe_loc.h" +#include "rxe_queue.h" +#include "rxe_task.h" + +enum comp_state { + COMPST_GET_ACK, + COMPST_GET_WQE, + COMPST_COMP_WQE, + COMPST_COMP_ACK, + COMPST_CHECK_PSN, + COMPST_CHECK_ACK, + COMPST_READ, + COMPST_ATOMIC, + COMPST_WRITE_SEND, + COMPST_UPDATE_COMP, + COMPST_ERROR_RETRY, + COMPST_RNR_RETRY, + COMPST_ERROR, + COMPST_EXIT, /* We have an issue, and we want to rerun the completer */ + COMPST_DONE, /* The completer finished successflly */ +}; + +static char *comp_state_name[] = { + [COMPST_GET_ACK] = "GET ACK", + [COMPST_GET_WQE] = "GET WQE", + [COMPST_COMP_WQE] = "COMP WQE", + [COMPST_COMP_ACK] = "COMP ACK", + [COMPST_CHECK_PSN] = "CHECK PSN", + [COMPST_CHECK_ACK] = "CHECK ACK", + [COMPST_READ] = "READ", + [COMPST_ATOMIC] = "ATOMIC", + [COMPST_WRITE_SEND] = "WRITE/SEND", + [COMPST_UPDATE_COMP] = "UPDATE COMP", + [COMPST_ERROR_RETRY] = "ERROR RETRY", + [COMPST_RNR_RETRY] = "RNR RETRY", + [COMPST_ERROR] = "ERROR", + [COMPST_EXIT] = "EXIT", + [COMPST_DONE] = "DONE", +}; + +static unsigned long rnrnak_usec[32] = { + [IB_RNR_TIMER_655_36] = 655360, + [IB_RNR_TIMER_000_01] = 10, + [IB_RNR_TIMER_000_02] = 20, + [IB_RNR_TIMER_000_03] = 30, + [IB_RNR_TIMER_000_04] = 40, + [IB_RNR_TIMER_000_06] = 60, + [IB_RNR_TIMER_000_08] = 80, + [IB_RNR_TIMER_000_12] = 120, + [IB_RNR_TIMER_000_16] = 160, + [IB_RNR_TIMER_000_24] = 240, + [IB_RNR_TIMER_000_32] = 320, + [IB_RNR_TIMER_000_48] = 480, + [IB_RNR_TIMER_000_64] = 640, + [IB_RNR_TIMER_000_96] = 960, + [IB_RNR_TIMER_001_28] = 1280, + [IB_RNR_TIMER_001_92] = 1920, + [IB_RNR_TIMER_002_56] = 2560, + [IB_RNR_TIMER_003_84] = 3840, + [IB_RNR_TIMER_005_12] = 5120, + [IB_RNR_TIMER_007_68] = 7680, + [IB_RNR_TIMER_010_24] = 10240, + [IB_RNR_TIMER_015_36] = 15360, + [IB_RNR_TIMER_020_48] = 20480, + [IB_RNR_TIMER_030_72] = 30720, + [IB_RNR_TIMER_040_96] = 40960, + [IB_RNR_TIMER_061_44] = 61410, + [IB_RNR_TIMER_081_92] = 81920, + [IB_RNR_TIMER_122_88] = 122880, + [IB_RNR_TIMER_163_84] = 163840, + [IB_RNR_TIMER_245_76] = 245760, + [IB_RNR_TIMER_327_68] = 327680, + [IB_RNR_TIMER_491_52] = 491520, +}; + +static inline unsigned long rnrnak_jiffies(u8 timeout) +{ + return max_t(unsigned long, + usecs_to_jiffies(rnrnak_usec[timeout]), 1); +} + +static enum ib_wc_opcode wr_to_wc_opcode(enum ib_wr_opcode opcode) +{ + switch (opcode) { + case IB_WR_RDMA_WRITE: return IB_WC_RDMA_WRITE; + case IB_WR_RDMA_WRITE_WITH_IMM: return IB_WC_RDMA_WRITE; + case IB_WR_SEND: return IB_WC_SEND; + case IB_WR_SEND_WITH_IMM: return IB_WC_SEND; + case IB_WR_RDMA_READ: return IB_WC_RDMA_READ; + case IB_WR_ATOMIC_CMP_AND_SWP: return IB_WC_COMP_SWAP; + case IB_WR_ATOMIC_FETCH_AND_ADD: return IB_WC_FETCH_ADD; + case IB_WR_LSO: return IB_WC_LSO; + case IB_WR_SEND_WITH_INV: return IB_WC_SEND; + case IB_WR_RDMA_READ_WITH_INV: return IB_WC_RDMA_READ; + case IB_WR_LOCAL_INV: return IB_WC_LOCAL_INV; + case IB_WR_REG_MR: return IB_WC_REG_MR; + + default: + return 0xff; + } +} + +void retransmit_timer(unsigned long data) +{ + struct rxe_qp *qp = (struct rxe_qp *)data; + + if (qp->valid) { + qp->comp.timeout = 1; + rxe_run_task(&qp->comp.task, 1); + } +} + +void rxe_comp_queue_pkt(struct rxe_dev *rxe, struct rxe_qp *qp, + struct sk_buff *skb) +{ + int must_sched; + + skb_queue_tail(&qp->resp_pkts, skb); + + must_sched = skb_queue_len(&qp->resp_pkts) > 1; + rxe_run_task(&qp->comp.task, must_sched); +} + +static inline enum comp_state get_wqe(struct rxe_qp *qp, + struct rxe_pkt_info *pkt, + struct rxe_send_wqe **wqe_p) +{ + struct rxe_send_wqe *wqe; + + /* we come here whether or not we found a response packet to see if + * there are any posted WQEs + */ + wqe = queue_head(qp->sq.queue); + *wqe_p = wqe; + + /* no WQE or requester has not started it yet */ + if (!wqe || wqe->state == wqe_state_posted) + return pkt ? COMPST_DONE : COMPST_EXIT; + + /* WQE does not require an ack */ + if (wqe->state == wqe_state_done) + return COMPST_COMP_WQE; + + /* WQE caused an error */ + if (wqe->state == wqe_state_error) + return COMPST_ERROR; + + /* we have a WQE, if we also have an ack check its PSN */ + return pkt ? COMPST_CHECK_PSN : COMPST_EXIT; +} + +static inline void reset_retry_counters(struct rxe_qp *qp) +{ + qp->comp.retry_cnt = qp->attr.retry_cnt; + qp->comp.rnr_retry = qp->attr.rnr_retry; +} + +static inline enum comp_state check_psn(struct rxe_qp *qp, + struct rxe_pkt_info *pkt, + struct rxe_send_wqe *wqe) +{ + s32 diff; + + /* check to see if response is past the oldest WQE. if it is, complete + * send/write or error read/atomic + */ + diff = psn_compare(pkt->psn, wqe->last_psn); + if (diff > 0) { + if (wqe->state == wqe_state_pending) { + if (wqe->mask & WR_ATOMIC_OR_READ_MASK) + return COMPST_ERROR_RETRY; + + reset_retry_counters(qp); + return COMPST_COMP_WQE; + } else { + return COMPST_DONE; + } + } + + /* compare response packet to expected response */ + diff = psn_compare(pkt->psn, qp->comp.psn); + if (diff < 0) { + /* response is most likely a retried packet if it matches an + * uncompleted WQE go complete it else ignore it + */ + if (pkt->psn == wqe->last_psn) + return COMPST_COMP_ACK; + else + return COMPST_DONE; + } else if ((diff > 0) && (wqe->mask & WR_ATOMIC_OR_READ_MASK)) { + return COMPST_ERROR_RETRY; + } else { + return COMPST_CHECK_ACK; + } +} + +static inline enum comp_state check_ack(struct rxe_qp *qp, + struct rxe_pkt_info *pkt, + struct rxe_send_wqe *wqe) +{ + unsigned int mask = pkt->mask; + u8 syn; + + /* Check the sequence only */ + switch (qp->comp.opcode) { + case -1: + /* Will catch all *_ONLY cases. */ + if (!(mask & RXE_START_MASK)) + return COMPST_ERROR; + + break; + + case IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST: + case IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE: + if (pkt->opcode != IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE && + pkt->opcode != IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST) { + return COMPST_ERROR; + } + break; + default: + WARN_ON(1); + } + + /* Check operation validity. */ + switch (pkt->opcode) { + case IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST: + case IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST: + case IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY: + syn = aeth_syn(pkt); + + if ((syn & AETH_TYPE_MASK) != AETH_ACK) + return COMPST_ERROR; + + /* Fall through (IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE + * doesn't have an AETH) + */ + case IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE: + if (wqe->wr.opcode != IB_WR_RDMA_READ && + wqe->wr.opcode != IB_WR_RDMA_READ_WITH_INV) { + return COMPST_ERROR; + } + reset_retry_counters(qp); + return COMPST_READ; + + case IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE: + syn = aeth_syn(pkt); + + if ((syn & AETH_TYPE_MASK) != AETH_ACK) + return COMPST_ERROR; + + if (wqe->wr.opcode != IB_WR_ATOMIC_CMP_AND_SWP && + wqe->wr.opcode != IB_WR_ATOMIC_FETCH_AND_ADD) + return COMPST_ERROR; + reset_retry_counters(qp); + return COMPST_ATOMIC; + + case IB_OPCODE_RC_ACKNOWLEDGE: + syn = aeth_syn(pkt); + switch (syn & AETH_TYPE_MASK) { + case AETH_ACK: + reset_retry_counters(qp); + return COMPST_WRITE_SEND; + + case AETH_RNR_NAK: + return COMPST_RNR_RETRY; + + case AETH_NAK: + switch (syn) { + case AETH_NAK_PSN_SEQ_ERROR: + /* a nak implicitly acks all packets with psns + * before + */ + if (psn_compare(pkt->psn, qp->comp.psn) > 0) { + qp->comp.psn = pkt->psn; + if (qp->req.wait_psn) { + qp->req.wait_psn = 0; + rxe_run_task(&qp->req.task, 1); + } + } + return COMPST_ERROR_RETRY; + + case AETH_NAK_INVALID_REQ: + wqe->status = IB_WC_REM_INV_REQ_ERR; + return COMPST_ERROR; + + case AETH_NAK_REM_ACC_ERR: + wqe->status = IB_WC_REM_ACCESS_ERR; + return COMPST_ERROR; + + case AETH_NAK_REM_OP_ERR: + wqe->status = IB_WC_REM_OP_ERR; + return COMPST_ERROR; + + default: + pr_warn("unexpected nak %x\n", syn); + wqe->status = IB_WC_REM_OP_ERR; + return COMPST_ERROR; + } + + default: + return COMPST_ERROR; + } + break; + + default: + pr_warn("unexpected opcode\n"); + } + + return COMPST_ERROR; +} + +static inline enum comp_state do_read(struct rxe_qp *qp, + struct rxe_pkt_info *pkt, + struct rxe_send_wqe *wqe) +{ + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + int ret; + + ret = copy_data(rxe, qp->pd, IB_ACCESS_LOCAL_WRITE, + &wqe->dma, payload_addr(pkt), + payload_size(pkt), to_mem_obj, NULL); + if (ret) + return COMPST_ERROR; + + if (wqe->dma.resid == 0 && (pkt->mask & RXE_END_MASK)) + return COMPST_COMP_ACK; + else + return COMPST_UPDATE_COMP; +} + +static inline enum comp_state do_atomic(struct rxe_qp *qp, + struct rxe_pkt_info *pkt, + struct rxe_send_wqe *wqe) +{ + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + int ret; + + u64 atomic_orig = atmack_orig(pkt); + + ret = copy_data(rxe, qp->pd, IB_ACCESS_LOCAL_WRITE, + &wqe->dma, &atomic_orig, + sizeof(u64), to_mem_obj, NULL); + if (ret) + return COMPST_ERROR; + else + return COMPST_COMP_ACK; +} + +static void make_send_cqe(struct rxe_qp *qp, struct rxe_send_wqe *wqe, + struct rxe_cqe *cqe) +{ + memset(cqe, 0, sizeof(*cqe)); + + if (!qp->is_user) { + struct ib_wc *wc = &cqe->ibwc; + + wc->wr_id = wqe->wr.wr_id; + wc->status = wqe->status; + wc->opcode = wr_to_wc_opcode(wqe->wr.opcode); + if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM || + wqe->wr.opcode == IB_WR_SEND_WITH_IMM) + wc->wc_flags = IB_WC_WITH_IMM; + wc->byte_len = wqe->dma.length; + wc->qp = &qp->ibqp; + } else { + struct ib_uverbs_wc *uwc = &cqe->uibwc; + + uwc->wr_id = wqe->wr.wr_id; + uwc->status = wqe->status; + uwc->opcode = wr_to_wc_opcode(wqe->wr.opcode); + if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM || + wqe->wr.opcode == IB_WR_SEND_WITH_IMM) + uwc->wc_flags = IB_WC_WITH_IMM; + uwc->byte_len = wqe->dma.length; + uwc->qp_num = qp->ibqp.qp_num; + } +} + +static void do_complete(struct rxe_qp *qp, struct rxe_send_wqe *wqe) +{ + struct rxe_cqe cqe; + + if ((qp->sq_sig_type == IB_SIGNAL_ALL_WR) || + (wqe->wr.send_flags & IB_SEND_SIGNALED) || + (qp->req.state == QP_STATE_ERROR)) { + make_send_cqe(qp, wqe, &cqe); + rxe_cq_post(qp->scq, &cqe, 0); + } + + advance_consumer(qp->sq.queue); + + /* + * we completed something so let req run again + * if it is trying to fence + */ + if (qp->req.wait_fence) { + qp->req.wait_fence = 0; + rxe_run_task(&qp->req.task, 1); + } +} + +static inline enum comp_state complete_ack(struct rxe_qp *qp, + struct rxe_pkt_info *pkt, + struct rxe_send_wqe *wqe) +{ + unsigned long flags; + + if (wqe->has_rd_atomic) { + wqe->has_rd_atomic = 0; + atomic_inc(&qp->req.rd_atomic); + if (qp->req.need_rd_atomic) { + qp->comp.timeout_retry = 0; + qp->req.need_rd_atomic = 0; + rxe_run_task(&qp->req.task, 1); + } + } + + if (unlikely(qp->req.state == QP_STATE_DRAIN)) { + /* state_lock used by requester & completer */ + spin_lock_irqsave(&qp->state_lock, flags); + if ((qp->req.state == QP_STATE_DRAIN) && + (qp->comp.psn == qp->req.psn)) { + qp->req.state = QP_STATE_DRAINED; + spin_unlock_irqrestore(&qp->state_lock, flags); + + if (qp->ibqp.event_handler) { + struct ib_event ev; + + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_SQ_DRAINED; + qp->ibqp.event_handler(&ev, + qp->ibqp.qp_context); + } + } else { + spin_unlock_irqrestore(&qp->state_lock, flags); + } + } + + do_complete(qp, wqe); + + if (psn_compare(pkt->psn, qp->comp.psn) >= 0) + return COMPST_UPDATE_COMP; + else + return COMPST_DONE; +} + +static inline enum comp_state complete_wqe(struct rxe_qp *qp, + struct rxe_pkt_info *pkt, + struct rxe_send_wqe *wqe) +{ + qp->comp.opcode = -1; + + if (pkt) { + if (psn_compare(pkt->psn, qp->comp.psn) >= 0) + qp->comp.psn = (pkt->psn + 1) & BTH_PSN_MASK; + + if (qp->req.wait_psn) { + qp->req.wait_psn = 0; + rxe_run_task(&qp->req.task, 1); + } + } + + do_complete(qp, wqe); + + return COMPST_GET_WQE; +} + +int rxe_completer(void *arg) +{ + struct rxe_qp *qp = (struct rxe_qp *)arg; + struct rxe_send_wqe *wqe = wqe; + struct sk_buff *skb = NULL; + struct rxe_pkt_info *pkt = NULL; + enum comp_state state; + + if (!qp->valid) { + while ((skb = skb_dequeue(&qp->resp_pkts))) { + rxe_drop_ref(qp); + kfree_skb(skb); + } + skb = NULL; + pkt = NULL; + + while (queue_head(qp->sq.queue)) + advance_consumer(qp->sq.queue); + + goto exit; + } + + if (qp->req.state == QP_STATE_ERROR) { + while ((skb = skb_dequeue(&qp->resp_pkts))) { + rxe_drop_ref(qp); + kfree_skb(skb); + } + skb = NULL; + pkt = NULL; + + while ((wqe = queue_head(qp->sq.queue))) { + wqe->status = IB_WC_WR_FLUSH_ERR; + do_complete(qp, wqe); + } + + goto exit; + } + + if (qp->req.state == QP_STATE_RESET) { + while ((skb = skb_dequeue(&qp->resp_pkts))) { + rxe_drop_ref(qp); + kfree_skb(skb); + } + skb = NULL; + pkt = NULL; + + while (queue_head(qp->sq.queue)) + advance_consumer(qp->sq.queue); + + goto exit; + } + + if (qp->comp.timeout) { + qp->comp.timeout_retry = 1; + qp->comp.timeout = 0; + } else { + qp->comp.timeout_retry = 0; + } + + if (qp->req.need_retry) + goto exit; + + state = COMPST_GET_ACK; + + while (1) { + pr_debug("state = %s\n", comp_state_name[state]); + switch (state) { + case COMPST_GET_ACK: + skb = skb_dequeue(&qp->resp_pkts); + if (skb) { + pkt = SKB_TO_PKT(skb); + qp->comp.timeout_retry = 0; + } + state = COMPST_GET_WQE; + break; + + case COMPST_GET_WQE: + state = get_wqe(qp, pkt, &wqe); + break; + + case COMPST_CHECK_PSN: + state = check_psn(qp, pkt, wqe); + break; + + case COMPST_CHECK_ACK: + state = check_ack(qp, pkt, wqe); + break; + + case COMPST_READ: + state = do_read(qp, pkt, wqe); + break; + + case COMPST_ATOMIC: + state = do_atomic(qp, pkt, wqe); + break; + + case COMPST_WRITE_SEND: + if (wqe->state == wqe_state_pending && + wqe->last_psn == pkt->psn) + state = COMPST_COMP_ACK; + else + state = COMPST_UPDATE_COMP; + break; + + case COMPST_COMP_ACK: + state = complete_ack(qp, pkt, wqe); + break; + + case COMPST_COMP_WQE: + state = complete_wqe(qp, pkt, wqe); + break; + + case COMPST_UPDATE_COMP: + if (pkt->mask & RXE_END_MASK) + qp->comp.opcode = -1; + else + qp->comp.opcode = pkt->opcode; + + if (psn_compare(pkt->psn, qp->comp.psn) >= 0) + qp->comp.psn = (pkt->psn + 1) & BTH_PSN_MASK; + + if (qp->req.wait_psn) { + qp->req.wait_psn = 0; + rxe_run_task(&qp->req.task, 1); + } + + state = COMPST_DONE; + break; + + case COMPST_DONE: + if (pkt) { + rxe_drop_ref(pkt->qp); + kfree_skb(skb); + } + goto done; + + case COMPST_EXIT: + if (qp->comp.timeout_retry && wqe) { + state = COMPST_ERROR_RETRY; + break; + } + + /* re reset the timeout counter if + * (1) QP is type RC + * (2) the QP is alive + * (3) there is a packet sent by the requester that + * might be acked (we still might get spurious + * timeouts but try to keep them as few as possible) + * (4) the timeout parameter is set + */ + if ((qp_type(qp) == IB_QPT_RC) && + (qp->req.state == QP_STATE_READY) && + (psn_compare(qp->req.psn, qp->comp.psn) > 0) && + qp->qp_timeout_jiffies) + mod_timer(&qp->retrans_timer, + jiffies + qp->qp_timeout_jiffies); + goto exit; + + case COMPST_ERROR_RETRY: + /* we come here if the retry timer fired and we did + * not receive a response packet. try to retry the send + * queue if that makes sense and the limits have not + * been exceeded. remember that some timeouts are + * spurious since we do not reset the timer but kick + * it down the road or let it expire + */ + + /* there is nothing to retry in this case */ + if (!wqe || (wqe->state == wqe_state_posted)) + goto exit; + + if (qp->comp.retry_cnt > 0) { + if (qp->comp.retry_cnt != 7) + qp->comp.retry_cnt--; + + /* no point in retrying if we have already + * seen the last ack that the requester could + * have caused + */ + if (psn_compare(qp->req.psn, + qp->comp.psn) > 0) { + /* tell the requester to retry the + * send send queue next time around + */ + qp->req.need_retry = 1; + rxe_run_task(&qp->req.task, 1); + } + goto exit; + } else { + wqe->status = IB_WC_RETRY_EXC_ERR; + state = COMPST_ERROR; + } + break; + + case COMPST_RNR_RETRY: + if (qp->comp.rnr_retry > 0) { + if (qp->comp.rnr_retry != 7) + qp->comp.rnr_retry--; + + qp->req.need_retry = 1; + pr_debug("set rnr nak timer\n"); + mod_timer(&qp->rnr_nak_timer, + jiffies + rnrnak_jiffies(aeth_syn(pkt) + & ~AETH_TYPE_MASK)); + goto exit; + } else { + wqe->status = IB_WC_RNR_RETRY_EXC_ERR; + state = COMPST_ERROR; + } + break; + + case COMPST_ERROR: + do_complete(qp, wqe); + rxe_qp_error(qp); + goto exit; + } + } + +exit: + /* we come here if we are done with processing and want the task to + * exit from the loop calling us + */ + return -EAGAIN; + +done: + /* we come here if we have processed a packet we want the task to call + * us again to see if there is anything else to do + */ + return 0; +} diff --git a/drivers/infiniband/sw/rxe/rxe_cq.c b/drivers/infiniband/sw/rxe/rxe_cq.c new file mode 100644 index 000000000000..e5e6a5e7dee9 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_cq.c @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "rxe.h" +#include "rxe_loc.h" +#include "rxe_queue.h" + +int rxe_cq_chk_attr(struct rxe_dev *rxe, struct rxe_cq *cq, + int cqe, int comp_vector, struct ib_udata *udata) +{ + int count; + + if (cqe <= 0) { + pr_warn("cqe(%d) <= 0\n", cqe); + goto err1; + } + + if (cqe > rxe->attr.max_cqe) { + pr_warn("cqe(%d) > max_cqe(%d)\n", + cqe, rxe->attr.max_cqe); + goto err1; + } + + if (cq) { + count = queue_count(cq->queue); + if (cqe < count) { + pr_warn("cqe(%d) < current # elements in queue (%d)", + cqe, count); + goto err1; + } + } + + return 0; + +err1: + return -EINVAL; +} + +static void rxe_send_complete(unsigned long data) +{ + struct rxe_cq *cq = (struct rxe_cq *)data; + + cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); +} + +int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe, + int comp_vector, struct ib_ucontext *context, + struct ib_udata *udata) +{ + int err; + + cq->queue = rxe_queue_init(rxe, &cqe, + sizeof(struct rxe_cqe)); + if (!cq->queue) { + pr_warn("unable to create cq\n"); + return -ENOMEM; + } + + err = do_mmap_info(rxe, udata, false, context, cq->queue->buf, + cq->queue->buf_size, &cq->queue->ip); + if (err) { + kvfree(cq->queue->buf); + kfree(cq->queue); + return err; + } + + if (udata) + cq->is_user = 1; + + tasklet_init(&cq->comp_task, rxe_send_complete, (unsigned long)cq); + + spin_lock_init(&cq->cq_lock); + cq->ibcq.cqe = cqe; + return 0; +} + +int rxe_cq_resize_queue(struct rxe_cq *cq, int cqe, struct ib_udata *udata) +{ + int err; + + err = rxe_queue_resize(cq->queue, (unsigned int *)&cqe, + sizeof(struct rxe_cqe), + cq->queue->ip ? cq->queue->ip->context : NULL, + udata, NULL, &cq->cq_lock); + if (!err) + cq->ibcq.cqe = cqe; + + return err; +} + +int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited) +{ + struct ib_event ev; + unsigned long flags; + + spin_lock_irqsave(&cq->cq_lock, flags); + + if (unlikely(queue_full(cq->queue))) { + spin_unlock_irqrestore(&cq->cq_lock, flags); + if (cq->ibcq.event_handler) { + ev.device = cq->ibcq.device; + ev.element.cq = &cq->ibcq; + ev.event = IB_EVENT_CQ_ERR; + cq->ibcq.event_handler(&ev, cq->ibcq.cq_context); + } + + return -EBUSY; + } + + memcpy(producer_addr(cq->queue), cqe, sizeof(*cqe)); + + /* make sure all changes to the CQ are written before we update the + * producer pointer + */ + smp_wmb(); + + advance_producer(cq->queue); + spin_unlock_irqrestore(&cq->cq_lock, flags); + + if ((cq->notify == IB_CQ_NEXT_COMP) || + (cq->notify == IB_CQ_SOLICITED && solicited)) { + cq->notify = 0; + tasklet_schedule(&cq->comp_task); + } + + return 0; +} + +void rxe_cq_cleanup(void *arg) +{ + struct rxe_cq *cq = arg; + + if (cq->queue) + rxe_queue_cleanup(cq->queue); +} diff --git a/drivers/infiniband/sw/rxe/rxe_dma.c b/drivers/infiniband/sw/rxe/rxe_dma.c new file mode 100644 index 000000000000..7634c1a81b2b --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_dma.c @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "rxe.h" +#include "rxe_loc.h" + +#define DMA_BAD_ADDER ((u64)0) + +static int rxe_mapping_error(struct ib_device *dev, u64 dma_addr) +{ + return dma_addr == DMA_BAD_ADDER; +} + +static u64 rxe_dma_map_single(struct ib_device *dev, + void *cpu_addr, size_t size, + enum dma_data_direction direction) +{ + WARN_ON(!valid_dma_direction(direction)); + return (uintptr_t)cpu_addr; +} + +static void rxe_dma_unmap_single(struct ib_device *dev, + u64 addr, size_t size, + enum dma_data_direction direction) +{ + WARN_ON(!valid_dma_direction(direction)); +} + +static u64 rxe_dma_map_page(struct ib_device *dev, + struct page *page, + unsigned long offset, + size_t size, enum dma_data_direction direction) +{ + u64 addr; + + WARN_ON(!valid_dma_direction(direction)); + + if (offset + size > PAGE_SIZE) { + addr = DMA_BAD_ADDER; + goto done; + } + + addr = (uintptr_t)page_address(page); + if (addr) + addr += offset; + +done: + return addr; +} + +static void rxe_dma_unmap_page(struct ib_device *dev, + u64 addr, size_t size, + enum dma_data_direction direction) +{ + WARN_ON(!valid_dma_direction(direction)); +} + +static int rxe_map_sg(struct ib_device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction direction) +{ + struct scatterlist *sg; + u64 addr; + int i; + int ret = nents; + + WARN_ON(!valid_dma_direction(direction)); + + for_each_sg(sgl, sg, nents, i) { + addr = (uintptr_t)page_address(sg_page(sg)); + if (!addr) { + ret = 0; + break; + } + sg->dma_address = addr + sg->offset; +#ifdef CONFIG_NEED_SG_DMA_LENGTH + sg->dma_length = sg->length; +#endif + } + + return ret; +} + +static void rxe_unmap_sg(struct ib_device *dev, + struct scatterlist *sg, int nents, + enum dma_data_direction direction) +{ + WARN_ON(!valid_dma_direction(direction)); +} + +static void rxe_sync_single_for_cpu(struct ib_device *dev, + u64 addr, + size_t size, enum dma_data_direction dir) +{ +} + +static void rxe_sync_single_for_device(struct ib_device *dev, + u64 addr, + size_t size, enum dma_data_direction dir) +{ +} + +static void *rxe_dma_alloc_coherent(struct ib_device *dev, size_t size, + u64 *dma_handle, gfp_t flag) +{ + struct page *p; + void *addr = NULL; + + p = alloc_pages(flag, get_order(size)); + if (p) + addr = page_address(p); + + if (dma_handle) + *dma_handle = (uintptr_t)addr; + + return addr; +} + +static void rxe_dma_free_coherent(struct ib_device *dev, size_t size, + void *cpu_addr, u64 dma_handle) +{ + free_pages((unsigned long)cpu_addr, get_order(size)); +} + +struct ib_dma_mapping_ops rxe_dma_mapping_ops = { + .mapping_error = rxe_mapping_error, + .map_single = rxe_dma_map_single, + .unmap_single = rxe_dma_unmap_single, + .map_page = rxe_dma_map_page, + .unmap_page = rxe_dma_unmap_page, + .map_sg = rxe_map_sg, + .unmap_sg = rxe_unmap_sg, + .sync_single_for_cpu = rxe_sync_single_for_cpu, + .sync_single_for_device = rxe_sync_single_for_device, + .alloc_coherent = rxe_dma_alloc_coherent, + .free_coherent = rxe_dma_free_coherent +}; diff --git a/drivers/infiniband/sw/rxe/rxe_hdr.h b/drivers/infiniband/sw/rxe/rxe_hdr.h new file mode 100644 index 000000000000..d57b5e956ceb --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_hdr.h @@ -0,0 +1,952 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef RXE_HDR_H +#define RXE_HDR_H + +/* extracted information about a packet carried in an sk_buff struct fits in + * the skbuff cb array. Must be at most 48 bytes. stored in control block of + * sk_buff for received packets. + */ +struct rxe_pkt_info { + struct rxe_dev *rxe; /* device that owns packet */ + struct rxe_qp *qp; /* qp that owns packet */ + struct rxe_send_wqe *wqe; /* send wqe */ + u8 *hdr; /* points to bth */ + u32 mask; /* useful info about pkt */ + u32 psn; /* bth psn of packet */ + u16 pkey_index; /* partition of pkt */ + u16 paylen; /* length of bth - icrc */ + u8 port_num; /* port pkt received on */ + u8 opcode; /* bth opcode of packet */ + u8 offset; /* bth offset from pkt->hdr */ +}; + +/* Macros should be used only for received skb */ +#define SKB_TO_PKT(skb) ((struct rxe_pkt_info *)(skb)->cb) +#define PKT_TO_SKB(pkt) container_of((void *)(pkt), struct sk_buff, cb) + +/* + * IBA header types and methods + * + * Some of these are for reference and completeness only since + * rxe does not currently support RD transport + * most of this could be moved into IB core. ib_pack.h has + * part of this but is incomplete + * + * Header specific routines to insert/extract values to/from headers + * the routines that are named __hhh_(set_)fff() take a pointer to a + * hhh header and get(set) the fff field. The routines named + * hhh_(set_)fff take a packet info struct and find the + * header and field based on the opcode in the packet. + * Conversion to/from network byte order from cpu order is also done. + */ + +#define RXE_ICRC_SIZE (4) +#define RXE_MAX_HDR_LENGTH (80) + +/****************************************************************************** + * Base Transport Header + ******************************************************************************/ +struct rxe_bth { + u8 opcode; + u8 flags; + __be16 pkey; + __be32 qpn; + __be32 apsn; +}; + +#define BTH_TVER (0) +#define BTH_DEF_PKEY (0xffff) + +#define BTH_SE_MASK (0x80) +#define BTH_MIG_MASK (0x40) +#define BTH_PAD_MASK (0x30) +#define BTH_TVER_MASK (0x0f) +#define BTH_FECN_MASK (0x80000000) +#define BTH_BECN_MASK (0x40000000) +#define BTH_RESV6A_MASK (0x3f000000) +#define BTH_QPN_MASK (0x00ffffff) +#define BTH_ACK_MASK (0x80000000) +#define BTH_RESV7_MASK (0x7f000000) +#define BTH_PSN_MASK (0x00ffffff) + +static inline u8 __bth_opcode(void *arg) +{ + struct rxe_bth *bth = arg; + + return bth->opcode; +} + +static inline void __bth_set_opcode(void *arg, u8 opcode) +{ + struct rxe_bth *bth = arg; + + bth->opcode = opcode; +} + +static inline u8 __bth_se(void *arg) +{ + struct rxe_bth *bth = arg; + + return 0 != (BTH_SE_MASK & bth->flags); +} + +static inline void __bth_set_se(void *arg, int se) +{ + struct rxe_bth *bth = arg; + + if (se) + bth->flags |= BTH_SE_MASK; + else + bth->flags &= ~BTH_SE_MASK; +} + +static inline u8 __bth_mig(void *arg) +{ + struct rxe_bth *bth = arg; + + return 0 != (BTH_MIG_MASK & bth->flags); +} + +static inline void __bth_set_mig(void *arg, u8 mig) +{ + struct rxe_bth *bth = arg; + + if (mig) + bth->flags |= BTH_MIG_MASK; + else + bth->flags &= ~BTH_MIG_MASK; +} + +static inline u8 __bth_pad(void *arg) +{ + struct rxe_bth *bth = arg; + + return (BTH_PAD_MASK & bth->flags) >> 4; +} + +static inline void __bth_set_pad(void *arg, u8 pad) +{ + struct rxe_bth *bth = arg; + + bth->flags = (BTH_PAD_MASK & (pad << 4)) | + (~BTH_PAD_MASK & bth->flags); +} + +static inline u8 __bth_tver(void *arg) +{ + struct rxe_bth *bth = arg; + + return BTH_TVER_MASK & bth->flags; +} + +static inline void __bth_set_tver(void *arg, u8 tver) +{ + struct rxe_bth *bth = arg; + + bth->flags = (BTH_TVER_MASK & tver) | + (~BTH_TVER_MASK & bth->flags); +} + +static inline u16 __bth_pkey(void *arg) +{ + struct rxe_bth *bth = arg; + + return be16_to_cpu(bth->pkey); +} + +static inline void __bth_set_pkey(void *arg, u16 pkey) +{ + struct rxe_bth *bth = arg; + + bth->pkey = cpu_to_be16(pkey); +} + +static inline u32 __bth_qpn(void *arg) +{ + struct rxe_bth *bth = arg; + + return BTH_QPN_MASK & be32_to_cpu(bth->qpn); +} + +static inline void __bth_set_qpn(void *arg, u32 qpn) +{ + struct rxe_bth *bth = arg; + u32 resvqpn = be32_to_cpu(bth->qpn); + + bth->qpn = cpu_to_be32((BTH_QPN_MASK & qpn) | + (~BTH_QPN_MASK & resvqpn)); +} + +static inline int __bth_fecn(void *arg) +{ + struct rxe_bth *bth = arg; + + return 0 != (cpu_to_be32(BTH_FECN_MASK) & bth->qpn); +} + +static inline void __bth_set_fecn(void *arg, int fecn) +{ + struct rxe_bth *bth = arg; + + if (fecn) + bth->qpn |= cpu_to_be32(BTH_FECN_MASK); + else + bth->qpn &= ~cpu_to_be32(BTH_FECN_MASK); +} + +static inline int __bth_becn(void *arg) +{ + struct rxe_bth *bth = arg; + + return 0 != (cpu_to_be32(BTH_BECN_MASK) & bth->qpn); +} + +static inline void __bth_set_becn(void *arg, int becn) +{ + struct rxe_bth *bth = arg; + + if (becn) + bth->qpn |= cpu_to_be32(BTH_BECN_MASK); + else + bth->qpn &= ~cpu_to_be32(BTH_BECN_MASK); +} + +static inline u8 __bth_resv6a(void *arg) +{ + struct rxe_bth *bth = arg; + + return (BTH_RESV6A_MASK & be32_to_cpu(bth->qpn)) >> 24; +} + +static inline void __bth_set_resv6a(void *arg) +{ + struct rxe_bth *bth = arg; + + bth->qpn = cpu_to_be32(~BTH_RESV6A_MASK); +} + +static inline int __bth_ack(void *arg) +{ + struct rxe_bth *bth = arg; + + return 0 != (cpu_to_be32(BTH_ACK_MASK) & bth->apsn); +} + +static inline void __bth_set_ack(void *arg, int ack) +{ + struct rxe_bth *bth = arg; + + if (ack) + bth->apsn |= cpu_to_be32(BTH_ACK_MASK); + else + bth->apsn &= ~cpu_to_be32(BTH_ACK_MASK); +} + +static inline void __bth_set_resv7(void *arg) +{ + struct rxe_bth *bth = arg; + + bth->apsn &= ~cpu_to_be32(BTH_RESV7_MASK); +} + +static inline u32 __bth_psn(void *arg) +{ + struct rxe_bth *bth = arg; + + return BTH_PSN_MASK & be32_to_cpu(bth->apsn); +} + +static inline void __bth_set_psn(void *arg, u32 psn) +{ + struct rxe_bth *bth = arg; + u32 apsn = be32_to_cpu(bth->apsn); + + bth->apsn = cpu_to_be32((BTH_PSN_MASK & psn) | + (~BTH_PSN_MASK & apsn)); +} + +static inline u8 bth_opcode(struct rxe_pkt_info *pkt) +{ + return __bth_opcode(pkt->hdr + pkt->offset); +} + +static inline void bth_set_opcode(struct rxe_pkt_info *pkt, u8 opcode) +{ + __bth_set_opcode(pkt->hdr + pkt->offset, opcode); +} + +static inline u8 bth_se(struct rxe_pkt_info *pkt) +{ + return __bth_se(pkt->hdr + pkt->offset); +} + +static inline void bth_set_se(struct rxe_pkt_info *pkt, int se) +{ + __bth_set_se(pkt->hdr + pkt->offset, se); +} + +static inline u8 bth_mig(struct rxe_pkt_info *pkt) +{ + return __bth_mig(pkt->hdr + pkt->offset); +} + +static inline void bth_set_mig(struct rxe_pkt_info *pkt, u8 mig) +{ + __bth_set_mig(pkt->hdr + pkt->offset, mig); +} + +static inline u8 bth_pad(struct rxe_pkt_info *pkt) +{ + return __bth_pad(pkt->hdr + pkt->offset); +} + +static inline void bth_set_pad(struct rxe_pkt_info *pkt, u8 pad) +{ + __bth_set_pad(pkt->hdr + pkt->offset, pad); +} + +static inline u8 bth_tver(struct rxe_pkt_info *pkt) +{ + return __bth_tver(pkt->hdr + pkt->offset); +} + +static inline void bth_set_tver(struct rxe_pkt_info *pkt, u8 tver) +{ + __bth_set_tver(pkt->hdr + pkt->offset, tver); +} + +static inline u16 bth_pkey(struct rxe_pkt_info *pkt) +{ + return __bth_pkey(pkt->hdr + pkt->offset); +} + +static inline void bth_set_pkey(struct rxe_pkt_info *pkt, u16 pkey) +{ + __bth_set_pkey(pkt->hdr + pkt->offset, pkey); +} + +static inline u32 bth_qpn(struct rxe_pkt_info *pkt) +{ + return __bth_qpn(pkt->hdr + pkt->offset); +} + +static inline void bth_set_qpn(struct rxe_pkt_info *pkt, u32 qpn) +{ + __bth_set_qpn(pkt->hdr + pkt->offset, qpn); +} + +static inline int bth_fecn(struct rxe_pkt_info *pkt) +{ + return __bth_fecn(pkt->hdr + pkt->offset); +} + +static inline void bth_set_fecn(struct rxe_pkt_info *pkt, int fecn) +{ + __bth_set_fecn(pkt->hdr + pkt->offset, fecn); +} + +static inline int bth_becn(struct rxe_pkt_info *pkt) +{ + return __bth_becn(pkt->hdr + pkt->offset); +} + +static inline void bth_set_becn(struct rxe_pkt_info *pkt, int becn) +{ + __bth_set_becn(pkt->hdr + pkt->offset, becn); +} + +static inline u8 bth_resv6a(struct rxe_pkt_info *pkt) +{ + return __bth_resv6a(pkt->hdr + pkt->offset); +} + +static inline void bth_set_resv6a(struct rxe_pkt_info *pkt) +{ + __bth_set_resv6a(pkt->hdr + pkt->offset); +} + +static inline int bth_ack(struct rxe_pkt_info *pkt) +{ + return __bth_ack(pkt->hdr + pkt->offset); +} + +static inline void bth_set_ack(struct rxe_pkt_info *pkt, int ack) +{ + __bth_set_ack(pkt->hdr + pkt->offset, ack); +} + +static inline void bth_set_resv7(struct rxe_pkt_info *pkt) +{ + __bth_set_resv7(pkt->hdr + pkt->offset); +} + +static inline u32 bth_psn(struct rxe_pkt_info *pkt) +{ + return __bth_psn(pkt->hdr + pkt->offset); +} + +static inline void bth_set_psn(struct rxe_pkt_info *pkt, u32 psn) +{ + __bth_set_psn(pkt->hdr + pkt->offset, psn); +} + +static inline void bth_init(struct rxe_pkt_info *pkt, u8 opcode, int se, + int mig, int pad, u16 pkey, u32 qpn, int ack_req, + u32 psn) +{ + struct rxe_bth *bth = (struct rxe_bth *)(pkt->hdr + pkt->offset); + + bth->opcode = opcode; + bth->flags = (pad << 4) & BTH_PAD_MASK; + if (se) + bth->flags |= BTH_SE_MASK; + if (mig) + bth->flags |= BTH_MIG_MASK; + bth->pkey = cpu_to_be16(pkey); + bth->qpn = cpu_to_be32(qpn & BTH_QPN_MASK); + psn &= BTH_PSN_MASK; + if (ack_req) + psn |= BTH_ACK_MASK; + bth->apsn = cpu_to_be32(psn); +} + +/****************************************************************************** + * Reliable Datagram Extended Transport Header + ******************************************************************************/ +struct rxe_rdeth { + __be32 een; +}; + +#define RDETH_EEN_MASK (0x00ffffff) + +static inline u8 __rdeth_een(void *arg) +{ + struct rxe_rdeth *rdeth = arg; + + return RDETH_EEN_MASK & be32_to_cpu(rdeth->een); +} + +static inline void __rdeth_set_een(void *arg, u32 een) +{ + struct rxe_rdeth *rdeth = arg; + + rdeth->een = cpu_to_be32(RDETH_EEN_MASK & een); +} + +static inline u8 rdeth_een(struct rxe_pkt_info *pkt) +{ + return __rdeth_een(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_RDETH]); +} + +static inline void rdeth_set_een(struct rxe_pkt_info *pkt, u32 een) +{ + __rdeth_set_een(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_RDETH], een); +} + +/****************************************************************************** + * Datagram Extended Transport Header + ******************************************************************************/ +struct rxe_deth { + __be32 qkey; + __be32 sqp; +}; + +#define GSI_QKEY (0x80010000) +#define DETH_SQP_MASK (0x00ffffff) + +static inline u32 __deth_qkey(void *arg) +{ + struct rxe_deth *deth = arg; + + return be32_to_cpu(deth->qkey); +} + +static inline void __deth_set_qkey(void *arg, u32 qkey) +{ + struct rxe_deth *deth = arg; + + deth->qkey = cpu_to_be32(qkey); +} + +static inline u32 __deth_sqp(void *arg) +{ + struct rxe_deth *deth = arg; + + return DETH_SQP_MASK & be32_to_cpu(deth->sqp); +} + +static inline void __deth_set_sqp(void *arg, u32 sqp) +{ + struct rxe_deth *deth = arg; + + deth->sqp = cpu_to_be32(DETH_SQP_MASK & sqp); +} + +static inline u32 deth_qkey(struct rxe_pkt_info *pkt) +{ + return __deth_qkey(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_DETH]); +} + +static inline void deth_set_qkey(struct rxe_pkt_info *pkt, u32 qkey) +{ + __deth_set_qkey(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_DETH], qkey); +} + +static inline u32 deth_sqp(struct rxe_pkt_info *pkt) +{ + return __deth_sqp(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_DETH]); +} + +static inline void deth_set_sqp(struct rxe_pkt_info *pkt, u32 sqp) +{ + __deth_set_sqp(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_DETH], sqp); +} + +/****************************************************************************** + * RDMA Extended Transport Header + ******************************************************************************/ +struct rxe_reth { + __be64 va; + __be32 rkey; + __be32 len; +}; + +static inline u64 __reth_va(void *arg) +{ + struct rxe_reth *reth = arg; + + return be64_to_cpu(reth->va); +} + +static inline void __reth_set_va(void *arg, u64 va) +{ + struct rxe_reth *reth = arg; + + reth->va = cpu_to_be64(va); +} + +static inline u32 __reth_rkey(void *arg) +{ + struct rxe_reth *reth = arg; + + return be32_to_cpu(reth->rkey); +} + +static inline void __reth_set_rkey(void *arg, u32 rkey) +{ + struct rxe_reth *reth = arg; + + reth->rkey = cpu_to_be32(rkey); +} + +static inline u32 __reth_len(void *arg) +{ + struct rxe_reth *reth = arg; + + return be32_to_cpu(reth->len); +} + +static inline void __reth_set_len(void *arg, u32 len) +{ + struct rxe_reth *reth = arg; + + reth->len = cpu_to_be32(len); +} + +static inline u64 reth_va(struct rxe_pkt_info *pkt) +{ + return __reth_va(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_RETH]); +} + +static inline void reth_set_va(struct rxe_pkt_info *pkt, u64 va) +{ + __reth_set_va(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_RETH], va); +} + +static inline u32 reth_rkey(struct rxe_pkt_info *pkt) +{ + return __reth_rkey(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_RETH]); +} + +static inline void reth_set_rkey(struct rxe_pkt_info *pkt, u32 rkey) +{ + __reth_set_rkey(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_RETH], rkey); +} + +static inline u32 reth_len(struct rxe_pkt_info *pkt) +{ + return __reth_len(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_RETH]); +} + +static inline void reth_set_len(struct rxe_pkt_info *pkt, u32 len) +{ + __reth_set_len(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_RETH], len); +} + +/****************************************************************************** + * Atomic Extended Transport Header + ******************************************************************************/ +struct rxe_atmeth { + __be64 va; + __be32 rkey; + __be64 swap_add; + __be64 comp; +} __attribute__((__packed__)); + +static inline u64 __atmeth_va(void *arg) +{ + struct rxe_atmeth *atmeth = arg; + + return be64_to_cpu(atmeth->va); +} + +static inline void __atmeth_set_va(void *arg, u64 va) +{ + struct rxe_atmeth *atmeth = arg; + + atmeth->va = cpu_to_be64(va); +} + +static inline u32 __atmeth_rkey(void *arg) +{ + struct rxe_atmeth *atmeth = arg; + + return be32_to_cpu(atmeth->rkey); +} + +static inline void __atmeth_set_rkey(void *arg, u32 rkey) +{ + struct rxe_atmeth *atmeth = arg; + + atmeth->rkey = cpu_to_be32(rkey); +} + +static inline u64 __atmeth_swap_add(void *arg) +{ + struct rxe_atmeth *atmeth = arg; + + return be64_to_cpu(atmeth->swap_add); +} + +static inline void __atmeth_set_swap_add(void *arg, u64 swap_add) +{ + struct rxe_atmeth *atmeth = arg; + + atmeth->swap_add = cpu_to_be64(swap_add); +} + +static inline u64 __atmeth_comp(void *arg) +{ + struct rxe_atmeth *atmeth = arg; + + return be64_to_cpu(atmeth->comp); +} + +static inline void __atmeth_set_comp(void *arg, u64 comp) +{ + struct rxe_atmeth *atmeth = arg; + + atmeth->comp = cpu_to_be64(comp); +} + +static inline u64 atmeth_va(struct rxe_pkt_info *pkt) +{ + return __atmeth_va(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_ATMETH]); +} + +static inline void atmeth_set_va(struct rxe_pkt_info *pkt, u64 va) +{ + __atmeth_set_va(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_ATMETH], va); +} + +static inline u32 atmeth_rkey(struct rxe_pkt_info *pkt) +{ + return __atmeth_rkey(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_ATMETH]); +} + +static inline void atmeth_set_rkey(struct rxe_pkt_info *pkt, u32 rkey) +{ + __atmeth_set_rkey(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_ATMETH], rkey); +} + +static inline u64 atmeth_swap_add(struct rxe_pkt_info *pkt) +{ + return __atmeth_swap_add(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_ATMETH]); +} + +static inline void atmeth_set_swap_add(struct rxe_pkt_info *pkt, u64 swap_add) +{ + __atmeth_set_swap_add(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_ATMETH], swap_add); +} + +static inline u64 atmeth_comp(struct rxe_pkt_info *pkt) +{ + return __atmeth_comp(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_ATMETH]); +} + +static inline void atmeth_set_comp(struct rxe_pkt_info *pkt, u64 comp) +{ + __atmeth_set_comp(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_ATMETH], comp); +} + +/****************************************************************************** + * Ack Extended Transport Header + ******************************************************************************/ +struct rxe_aeth { + __be32 smsn; +}; + +#define AETH_SYN_MASK (0xff000000) +#define AETH_MSN_MASK (0x00ffffff) + +enum aeth_syndrome { + AETH_TYPE_MASK = 0xe0, + AETH_ACK = 0x00, + AETH_RNR_NAK = 0x20, + AETH_RSVD = 0x40, + AETH_NAK = 0x60, + AETH_ACK_UNLIMITED = 0x1f, + AETH_NAK_PSN_SEQ_ERROR = 0x60, + AETH_NAK_INVALID_REQ = 0x61, + AETH_NAK_REM_ACC_ERR = 0x62, + AETH_NAK_REM_OP_ERR = 0x63, + AETH_NAK_INV_RD_REQ = 0x64, +}; + +static inline u8 __aeth_syn(void *arg) +{ + struct rxe_aeth *aeth = arg; + + return (AETH_SYN_MASK & be32_to_cpu(aeth->smsn)) >> 24; +} + +static inline void __aeth_set_syn(void *arg, u8 syn) +{ + struct rxe_aeth *aeth = arg; + u32 smsn = be32_to_cpu(aeth->smsn); + + aeth->smsn = cpu_to_be32((AETH_SYN_MASK & (syn << 24)) | + (~AETH_SYN_MASK & smsn)); +} + +static inline u32 __aeth_msn(void *arg) +{ + struct rxe_aeth *aeth = arg; + + return AETH_MSN_MASK & be32_to_cpu(aeth->smsn); +} + +static inline void __aeth_set_msn(void *arg, u32 msn) +{ + struct rxe_aeth *aeth = arg; + u32 smsn = be32_to_cpu(aeth->smsn); + + aeth->smsn = cpu_to_be32((AETH_MSN_MASK & msn) | + (~AETH_MSN_MASK & smsn)); +} + +static inline u8 aeth_syn(struct rxe_pkt_info *pkt) +{ + return __aeth_syn(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_AETH]); +} + +static inline void aeth_set_syn(struct rxe_pkt_info *pkt, u8 syn) +{ + __aeth_set_syn(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_AETH], syn); +} + +static inline u32 aeth_msn(struct rxe_pkt_info *pkt) +{ + return __aeth_msn(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_AETH]); +} + +static inline void aeth_set_msn(struct rxe_pkt_info *pkt, u32 msn) +{ + __aeth_set_msn(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_AETH], msn); +} + +/****************************************************************************** + * Atomic Ack Extended Transport Header + ******************************************************************************/ +struct rxe_atmack { + __be64 orig; +}; + +static inline u64 __atmack_orig(void *arg) +{ + struct rxe_atmack *atmack = arg; + + return be64_to_cpu(atmack->orig); +} + +static inline void __atmack_set_orig(void *arg, u64 orig) +{ + struct rxe_atmack *atmack = arg; + + atmack->orig = cpu_to_be64(orig); +} + +static inline u64 atmack_orig(struct rxe_pkt_info *pkt) +{ + return __atmack_orig(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_ATMACK]); +} + +static inline void atmack_set_orig(struct rxe_pkt_info *pkt, u64 orig) +{ + __atmack_set_orig(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_ATMACK], orig); +} + +/****************************************************************************** + * Immediate Extended Transport Header + ******************************************************************************/ +struct rxe_immdt { + __be32 imm; +}; + +static inline __be32 __immdt_imm(void *arg) +{ + struct rxe_immdt *immdt = arg; + + return immdt->imm; +} + +static inline void __immdt_set_imm(void *arg, __be32 imm) +{ + struct rxe_immdt *immdt = arg; + + immdt->imm = imm; +} + +static inline __be32 immdt_imm(struct rxe_pkt_info *pkt) +{ + return __immdt_imm(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_IMMDT]); +} + +static inline void immdt_set_imm(struct rxe_pkt_info *pkt, __be32 imm) +{ + __immdt_set_imm(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_IMMDT], imm); +} + +/****************************************************************************** + * Invalidate Extended Transport Header + ******************************************************************************/ +struct rxe_ieth { + __be32 rkey; +}; + +static inline u32 __ieth_rkey(void *arg) +{ + struct rxe_ieth *ieth = arg; + + return be32_to_cpu(ieth->rkey); +} + +static inline void __ieth_set_rkey(void *arg, u32 rkey) +{ + struct rxe_ieth *ieth = arg; + + ieth->rkey = cpu_to_be32(rkey); +} + +static inline u32 ieth_rkey(struct rxe_pkt_info *pkt) +{ + return __ieth_rkey(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_IETH]); +} + +static inline void ieth_set_rkey(struct rxe_pkt_info *pkt, u32 rkey) +{ + __ieth_set_rkey(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_IETH], rkey); +} + +enum rxe_hdr_length { + RXE_BTH_BYTES = sizeof(struct rxe_bth), + RXE_DETH_BYTES = sizeof(struct rxe_deth), + RXE_IMMDT_BYTES = sizeof(struct rxe_immdt), + RXE_RETH_BYTES = sizeof(struct rxe_reth), + RXE_AETH_BYTES = sizeof(struct rxe_aeth), + RXE_ATMACK_BYTES = sizeof(struct rxe_atmack), + RXE_ATMETH_BYTES = sizeof(struct rxe_atmeth), + RXE_IETH_BYTES = sizeof(struct rxe_ieth), + RXE_RDETH_BYTES = sizeof(struct rxe_rdeth), +}; + +static inline size_t header_size(struct rxe_pkt_info *pkt) +{ + return pkt->offset + rxe_opcode[pkt->opcode].length; +} + +static inline void *payload_addr(struct rxe_pkt_info *pkt) +{ + return pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_PAYLOAD]; +} + +static inline size_t payload_size(struct rxe_pkt_info *pkt) +{ + return pkt->paylen - rxe_opcode[pkt->opcode].offset[RXE_PAYLOAD] + - bth_pad(pkt) - RXE_ICRC_SIZE; +} + +#endif /* RXE_HDR_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_icrc.c b/drivers/infiniband/sw/rxe/rxe_icrc.c new file mode 100644 index 000000000000..413b56b23a06 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_icrc.c @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "rxe.h" +#include "rxe_loc.h" + +/* Compute a partial ICRC for all the IB transport headers. */ +u32 rxe_icrc_hdr(struct rxe_pkt_info *pkt, struct sk_buff *skb) +{ + unsigned int bth_offset = 0; + struct iphdr *ip4h = NULL; + struct ipv6hdr *ip6h = NULL; + struct udphdr *udph; + struct rxe_bth *bth; + int crc; + int length; + int hdr_size = sizeof(struct udphdr) + + (skb->protocol == htons(ETH_P_IP) ? + sizeof(struct iphdr) : sizeof(struct ipv6hdr)); + /* pseudo header buffer size is calculate using ipv6 header size since + * it is bigger than ipv4 + */ + u8 pshdr[sizeof(struct udphdr) + + sizeof(struct ipv6hdr) + + RXE_BTH_BYTES]; + + /* This seed is the result of computing a CRC with a seed of + * 0xfffffff and 8 bytes of 0xff representing a masked LRH. + */ + crc = 0xdebb20e3; + + if (skb->protocol == htons(ETH_P_IP)) { /* IPv4 */ + memcpy(pshdr, ip_hdr(skb), hdr_size); + ip4h = (struct iphdr *)pshdr; + udph = (struct udphdr *)(ip4h + 1); + + ip4h->ttl = 0xff; + ip4h->check = CSUM_MANGLED_0; + ip4h->tos = 0xff; + } else { /* IPv6 */ + memcpy(pshdr, ipv6_hdr(skb), hdr_size); + ip6h = (struct ipv6hdr *)pshdr; + udph = (struct udphdr *)(ip6h + 1); + + memset(ip6h->flow_lbl, 0xff, sizeof(ip6h->flow_lbl)); + ip6h->priority = 0xf; + ip6h->hop_limit = 0xff; + } + udph->check = CSUM_MANGLED_0; + + bth_offset += hdr_size; + + memcpy(&pshdr[bth_offset], pkt->hdr, RXE_BTH_BYTES); + bth = (struct rxe_bth *)&pshdr[bth_offset]; + + /* exclude bth.resv8a */ + bth->qpn |= cpu_to_be32(~BTH_QPN_MASK); + + length = hdr_size + RXE_BTH_BYTES; + crc = crc32_le(crc, pshdr, length); + + /* And finish to compute the CRC on the remainder of the headers. */ + crc = crc32_le(crc, pkt->hdr + RXE_BTH_BYTES, + rxe_opcode[pkt->opcode].length - RXE_BTH_BYTES); + return crc; +} diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h new file mode 100644 index 000000000000..4a5484ef604f --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -0,0 +1,286 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef RXE_LOC_H +#define RXE_LOC_H + +/* rxe_av.c */ + +int rxe_av_chk_attr(struct rxe_dev *rxe, struct ib_ah_attr *attr); + +int rxe_av_from_attr(struct rxe_dev *rxe, u8 port_num, + struct rxe_av *av, struct ib_ah_attr *attr); + +int rxe_av_to_attr(struct rxe_dev *rxe, struct rxe_av *av, + struct ib_ah_attr *attr); + +int rxe_av_fill_ip_info(struct rxe_dev *rxe, + struct rxe_av *av, + struct ib_ah_attr *attr, + struct ib_gid_attr *sgid_attr, + union ib_gid *sgid); + +struct rxe_av *rxe_get_av(struct rxe_pkt_info *pkt); + +/* rxe_cq.c */ +int rxe_cq_chk_attr(struct rxe_dev *rxe, struct rxe_cq *cq, + int cqe, int comp_vector, struct ib_udata *udata); + +int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe, + int comp_vector, struct ib_ucontext *context, + struct ib_udata *udata); + +int rxe_cq_resize_queue(struct rxe_cq *cq, int new_cqe, struct ib_udata *udata); + +int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited); + +void rxe_cq_cleanup(void *arg); + +/* rxe_mcast.c */ +int rxe_mcast_get_grp(struct rxe_dev *rxe, union ib_gid *mgid, + struct rxe_mc_grp **grp_p); + +int rxe_mcast_add_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp, + struct rxe_mc_grp *grp); + +int rxe_mcast_drop_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp, + union ib_gid *mgid); + +void rxe_drop_all_mcast_groups(struct rxe_qp *qp); + +void rxe_mc_cleanup(void *arg); + +/* rxe_mmap.c */ +struct rxe_mmap_info { + struct list_head pending_mmaps; + struct ib_ucontext *context; + struct kref ref; + void *obj; + + struct mminfo info; +}; + +void rxe_mmap_release(struct kref *ref); + +struct rxe_mmap_info *rxe_create_mmap_info(struct rxe_dev *dev, + u32 size, + struct ib_ucontext *context, + void *obj); + +int rxe_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); + +/* rxe_mr.c */ +enum copy_direction { + to_mem_obj, + from_mem_obj, +}; + +int rxe_mem_init_dma(struct rxe_dev *rxe, struct rxe_pd *pd, + int access, struct rxe_mem *mem); + +int rxe_mem_init_user(struct rxe_dev *rxe, struct rxe_pd *pd, u64 start, + u64 length, u64 iova, int access, struct ib_udata *udata, + struct rxe_mem *mr); + +int rxe_mem_init_fast(struct rxe_dev *rxe, struct rxe_pd *pd, + int max_pages, struct rxe_mem *mem); + +int rxe_mem_copy(struct rxe_mem *mem, u64 iova, void *addr, + int length, enum copy_direction dir, u32 *crcp); + +int copy_data(struct rxe_dev *rxe, struct rxe_pd *pd, int access, + struct rxe_dma_info *dma, void *addr, int length, + enum copy_direction dir, u32 *crcp); + +void *iova_to_vaddr(struct rxe_mem *mem, u64 iova, int length); + +enum lookup_type { + lookup_local, + lookup_remote, +}; + +struct rxe_mem *lookup_mem(struct rxe_pd *pd, int access, u32 key, + enum lookup_type type); + +int mem_check_range(struct rxe_mem *mem, u64 iova, size_t length); + +int rxe_mem_map_pages(struct rxe_dev *rxe, struct rxe_mem *mem, + u64 *page, int num_pages, u64 iova); + +void rxe_mem_cleanup(void *arg); + +int advance_dma_data(struct rxe_dma_info *dma, unsigned int length); + +/* rxe_qp.c */ +int rxe_qp_chk_init(struct rxe_dev *rxe, struct ib_qp_init_attr *init); + +int rxe_qp_from_init(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_pd *pd, + struct ib_qp_init_attr *init, struct ib_udata *udata, + struct ib_pd *ibpd); + +int rxe_qp_to_init(struct rxe_qp *qp, struct ib_qp_init_attr *init); + +int rxe_qp_chk_attr(struct rxe_dev *rxe, struct rxe_qp *qp, + struct ib_qp_attr *attr, int mask); + +int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, + int mask, struct ib_udata *udata); + +int rxe_qp_to_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask); + +void rxe_qp_error(struct rxe_qp *qp); + +void rxe_qp_destroy(struct rxe_qp *qp); + +void rxe_qp_cleanup(void *arg); + +static inline int qp_num(struct rxe_qp *qp) +{ + return qp->ibqp.qp_num; +} + +static inline enum ib_qp_type qp_type(struct rxe_qp *qp) +{ + return qp->ibqp.qp_type; +} + +static inline enum ib_qp_state qp_state(struct rxe_qp *qp) +{ + return qp->attr.qp_state; +} + +static inline int qp_mtu(struct rxe_qp *qp) +{ + if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) + return qp->attr.path_mtu; + else + return RXE_PORT_MAX_MTU; +} + +static inline int rcv_wqe_size(int max_sge) +{ + return sizeof(struct rxe_recv_wqe) + + max_sge * sizeof(struct ib_sge); +} + +void free_rd_atomic_resource(struct rxe_qp *qp, struct resp_res *res); + +static inline void rxe_advance_resp_resource(struct rxe_qp *qp) +{ + qp->resp.res_head++; + if (unlikely(qp->resp.res_head == qp->attr.max_rd_atomic)) + qp->resp.res_head = 0; +} + +void retransmit_timer(unsigned long data); +void rnr_nak_timer(unsigned long data); + +void dump_qp(struct rxe_qp *qp); + +/* rxe_srq.c */ +#define IB_SRQ_INIT_MASK (~IB_SRQ_LIMIT) + +int rxe_srq_chk_attr(struct rxe_dev *rxe, struct rxe_srq *srq, + struct ib_srq_attr *attr, enum ib_srq_attr_mask mask); + +int rxe_srq_from_init(struct rxe_dev *rxe, struct rxe_srq *srq, + struct ib_srq_init_attr *init, + struct ib_ucontext *context, struct ib_udata *udata); + +int rxe_srq_from_attr(struct rxe_dev *rxe, struct rxe_srq *srq, + struct ib_srq_attr *attr, enum ib_srq_attr_mask mask, + struct ib_udata *udata); + +extern struct ib_dma_mapping_ops rxe_dma_mapping_ops; + +void rxe_release(struct kref *kref); + +int rxe_completer(void *arg); +int rxe_requester(void *arg); +int rxe_responder(void *arg); + +u32 rxe_icrc_hdr(struct rxe_pkt_info *pkt, struct sk_buff *skb); + +void rxe_resp_queue_pkt(struct rxe_dev *rxe, + struct rxe_qp *qp, struct sk_buff *skb); + +void rxe_comp_queue_pkt(struct rxe_dev *rxe, + struct rxe_qp *qp, struct sk_buff *skb); + +static inline unsigned wr_opcode_mask(int opcode, struct rxe_qp *qp) +{ + return rxe_wr_opcode_info[opcode].mask[qp->ibqp.qp_type]; +} + +static inline int rxe_xmit_packet(struct rxe_dev *rxe, struct rxe_qp *qp, + struct rxe_pkt_info *pkt, struct sk_buff *skb) +{ + int err; + int is_request = pkt->mask & RXE_REQ_MASK; + + if ((is_request && (qp->req.state != QP_STATE_READY)) || + (!is_request && (qp->resp.state != QP_STATE_READY))) { + pr_info("Packet dropped. QP is not in ready state\n"); + goto drop; + } + + if (pkt->mask & RXE_LOOPBACK_MASK) { + memcpy(SKB_TO_PKT(skb), pkt, sizeof(*pkt)); + err = rxe->ifc_ops->loopback(skb); + } else { + err = rxe->ifc_ops->send(rxe, pkt, skb); + } + + if (err) { + rxe->xmit_errors++; + return err; + } + + atomic_inc(&qp->skb_out); + + if ((qp_type(qp) != IB_QPT_RC) && + (pkt->mask & RXE_END_MASK)) { + pkt->wqe->state = wqe_state_done; + rxe_run_task(&qp->comp.task, 1); + } + + goto done; + +drop: + kfree_skb(skb); + err = 0; +done: + return err; +} + +#endif /* RXE_LOC_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_mcast.c b/drivers/infiniband/sw/rxe/rxe_mcast.c new file mode 100644 index 000000000000..fa95544ca7e0 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_mcast.c @@ -0,0 +1,190 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "rxe.h" +#include "rxe_loc.h" + +int rxe_mcast_get_grp(struct rxe_dev *rxe, union ib_gid *mgid, + struct rxe_mc_grp **grp_p) +{ + int err; + struct rxe_mc_grp *grp; + + if (rxe->attr.max_mcast_qp_attach == 0) { + err = -EINVAL; + goto err1; + } + + grp = rxe_pool_get_key(&rxe->mc_grp_pool, mgid); + if (grp) + goto done; + + grp = rxe_alloc(&rxe->mc_grp_pool); + if (!grp) { + err = -ENOMEM; + goto err1; + } + + INIT_LIST_HEAD(&grp->qp_list); + spin_lock_init(&grp->mcg_lock); + grp->rxe = rxe; + + rxe_add_key(grp, mgid); + + err = rxe->ifc_ops->mcast_add(rxe, mgid); + if (err) + goto err2; + +done: + *grp_p = grp; + return 0; + +err2: + rxe_drop_ref(grp); +err1: + return err; +} + +int rxe_mcast_add_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp, + struct rxe_mc_grp *grp) +{ + int err; + struct rxe_mc_elem *elem; + + /* check to see of the qp is already a member of the group */ + spin_lock_bh(&qp->grp_lock); + spin_lock_bh(&grp->mcg_lock); + list_for_each_entry(elem, &grp->qp_list, qp_list) { + if (elem->qp == qp) { + err = 0; + goto out; + } + } + + if (grp->num_qp >= rxe->attr.max_mcast_qp_attach) { + err = -ENOMEM; + goto out; + } + + elem = rxe_alloc(&rxe->mc_elem_pool); + if (!elem) { + err = -ENOMEM; + goto out; + } + + /* each qp holds a ref on the grp */ + rxe_add_ref(grp); + + grp->num_qp++; + elem->qp = qp; + elem->grp = grp; + + list_add(&elem->qp_list, &grp->qp_list); + list_add(&elem->grp_list, &qp->grp_list); + + err = 0; +out: + spin_unlock_bh(&grp->mcg_lock); + spin_unlock_bh(&qp->grp_lock); + return err; +} + +int rxe_mcast_drop_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp, + union ib_gid *mgid) +{ + struct rxe_mc_grp *grp; + struct rxe_mc_elem *elem, *tmp; + + grp = rxe_pool_get_key(&rxe->mc_grp_pool, mgid); + if (!grp) + goto err1; + + spin_lock_bh(&qp->grp_lock); + spin_lock_bh(&grp->mcg_lock); + + list_for_each_entry_safe(elem, tmp, &grp->qp_list, qp_list) { + if (elem->qp == qp) { + list_del(&elem->qp_list); + list_del(&elem->grp_list); + grp->num_qp--; + + spin_unlock_bh(&grp->mcg_lock); + spin_unlock_bh(&qp->grp_lock); + rxe_drop_ref(elem); + rxe_drop_ref(grp); /* ref held by QP */ + rxe_drop_ref(grp); /* ref from get_key */ + return 0; + } + } + + spin_unlock_bh(&grp->mcg_lock); + spin_unlock_bh(&qp->grp_lock); + rxe_drop_ref(grp); /* ref from get_key */ +err1: + return -EINVAL; +} + +void rxe_drop_all_mcast_groups(struct rxe_qp *qp) +{ + struct rxe_mc_grp *grp; + struct rxe_mc_elem *elem; + + while (1) { + spin_lock_bh(&qp->grp_lock); + if (list_empty(&qp->grp_list)) { + spin_unlock_bh(&qp->grp_lock); + break; + } + elem = list_first_entry(&qp->grp_list, struct rxe_mc_elem, + grp_list); + list_del(&elem->grp_list); + spin_unlock_bh(&qp->grp_lock); + + grp = elem->grp; + spin_lock_bh(&grp->mcg_lock); + list_del(&elem->qp_list); + grp->num_qp--; + spin_unlock_bh(&grp->mcg_lock); + rxe_drop_ref(grp); + rxe_drop_ref(elem); + } +} + +void rxe_mc_cleanup(void *arg) +{ + struct rxe_mc_grp *grp = arg; + struct rxe_dev *rxe = grp->rxe; + + rxe_drop_key(grp); + rxe->ifc_ops->mcast_delete(rxe, &grp->mgid); +} diff --git a/drivers/infiniband/sw/rxe/rxe_mmap.c b/drivers/infiniband/sw/rxe/rxe_mmap.c new file mode 100644 index 000000000000..54b3c7c99eff --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_mmap.c @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include + +#include "rxe.h" +#include "rxe_loc.h" +#include "rxe_queue.h" + +void rxe_mmap_release(struct kref *ref) +{ + struct rxe_mmap_info *ip = container_of(ref, + struct rxe_mmap_info, ref); + struct rxe_dev *rxe = to_rdev(ip->context->device); + + spin_lock_bh(&rxe->pending_lock); + + if (!list_empty(&ip->pending_mmaps)) + list_del(&ip->pending_mmaps); + + spin_unlock_bh(&rxe->pending_lock); + + vfree(ip->obj); /* buf */ + kfree(ip); +} + +/* + * open and close keep track of how many times the memory region is mapped, + * to avoid releasing it. + */ +static void rxe_vma_open(struct vm_area_struct *vma) +{ + struct rxe_mmap_info *ip = vma->vm_private_data; + + kref_get(&ip->ref); +} + +static void rxe_vma_close(struct vm_area_struct *vma) +{ + struct rxe_mmap_info *ip = vma->vm_private_data; + + kref_put(&ip->ref, rxe_mmap_release); +} + +static struct vm_operations_struct rxe_vm_ops = { + .open = rxe_vma_open, + .close = rxe_vma_close, +}; + +/** + * rxe_mmap - create a new mmap region + * @context: the IB user context of the process making the mmap() call + * @vma: the VMA to be initialized + * Return zero if the mmap is OK. Otherwise, return an errno. + */ +int rxe_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) +{ + struct rxe_dev *rxe = to_rdev(context->device); + unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; + unsigned long size = vma->vm_end - vma->vm_start; + struct rxe_mmap_info *ip, *pp; + int ret; + + /* + * Search the device's list of objects waiting for a mmap call. + * Normally, this list is very short since a call to create a + * CQ, QP, or SRQ is soon followed by a call to mmap(). + */ + spin_lock_bh(&rxe->pending_lock); + list_for_each_entry_safe(ip, pp, &rxe->pending_mmaps, pending_mmaps) { + if (context != ip->context || (__u64)offset != ip->info.offset) + continue; + + /* Don't allow a mmap larger than the object. */ + if (size > ip->info.size) { + pr_err("mmap region is larger than the object!\n"); + spin_unlock_bh(&rxe->pending_lock); + ret = -EINVAL; + goto done; + } + + goto found_it; + } + pr_warn("unable to find pending mmap info\n"); + spin_unlock_bh(&rxe->pending_lock); + ret = -EINVAL; + goto done; + +found_it: + list_del_init(&ip->pending_mmaps); + spin_unlock_bh(&rxe->pending_lock); + + ret = remap_vmalloc_range(vma, ip->obj, 0); + if (ret) { + pr_err("rxe: err %d from remap_vmalloc_range\n", ret); + goto done; + } + + vma->vm_ops = &rxe_vm_ops; + vma->vm_private_data = ip; + rxe_vma_open(vma); +done: + return ret; +} + +/* + * Allocate information for rxe_mmap + */ +struct rxe_mmap_info *rxe_create_mmap_info(struct rxe_dev *rxe, + u32 size, + struct ib_ucontext *context, + void *obj) +{ + struct rxe_mmap_info *ip; + + ip = kmalloc(sizeof(*ip), GFP_KERNEL); + if (!ip) + return NULL; + + size = PAGE_ALIGN(size); + + spin_lock_bh(&rxe->mmap_offset_lock); + + if (rxe->mmap_offset == 0) + rxe->mmap_offset = PAGE_SIZE; + + ip->info.offset = rxe->mmap_offset; + rxe->mmap_offset += size; + + spin_unlock_bh(&rxe->mmap_offset_lock); + + INIT_LIST_HEAD(&ip->pending_mmaps); + ip->info.size = size; + ip->context = context; + ip->obj = obj; + kref_init(&ip->ref); + + return ip; +} diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c new file mode 100644 index 000000000000..f3dab6574504 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -0,0 +1,643 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "rxe.h" +#include "rxe_loc.h" + +/* + * lfsr (linear feedback shift register) with period 255 + */ +static u8 rxe_get_key(void) +{ + static unsigned key = 1; + + key = key << 1; + + key |= (0 != (key & 0x100)) ^ (0 != (key & 0x10)) + ^ (0 != (key & 0x80)) ^ (0 != (key & 0x40)); + + key &= 0xff; + + return key; +} + +int mem_check_range(struct rxe_mem *mem, u64 iova, size_t length) +{ + switch (mem->type) { + case RXE_MEM_TYPE_DMA: + return 0; + + case RXE_MEM_TYPE_MR: + case RXE_MEM_TYPE_FMR: + return ((iova < mem->iova) || + ((iova + length) > (mem->iova + mem->length))) ? + -EFAULT : 0; + + default: + return -EFAULT; + } +} + +#define IB_ACCESS_REMOTE (IB_ACCESS_REMOTE_READ \ + | IB_ACCESS_REMOTE_WRITE \ + | IB_ACCESS_REMOTE_ATOMIC) + +static void rxe_mem_init(int access, struct rxe_mem *mem) +{ + u32 lkey = mem->pelem.index << 8 | rxe_get_key(); + u32 rkey = (access & IB_ACCESS_REMOTE) ? lkey : 0; + + if (mem->pelem.pool->type == RXE_TYPE_MR) { + mem->ibmr.lkey = lkey; + mem->ibmr.rkey = rkey; + } + + mem->lkey = lkey; + mem->rkey = rkey; + mem->state = RXE_MEM_STATE_INVALID; + mem->type = RXE_MEM_TYPE_NONE; + mem->map_shift = ilog2(RXE_BUF_PER_MAP); +} + +void rxe_mem_cleanup(void *arg) +{ + struct rxe_mem *mem = arg; + int i; + + if (mem->umem) + ib_umem_release(mem->umem); + + if (mem->map) { + for (i = 0; i < mem->num_map; i++) + kfree(mem->map[i]); + + kfree(mem->map); + } +} + +static int rxe_mem_alloc(struct rxe_dev *rxe, struct rxe_mem *mem, int num_buf) +{ + int i; + int num_map; + struct rxe_map **map = mem->map; + + num_map = (num_buf + RXE_BUF_PER_MAP - 1) / RXE_BUF_PER_MAP; + + mem->map = kmalloc_array(num_map, sizeof(*map), GFP_KERNEL); + if (!mem->map) + goto err1; + + for (i = 0; i < num_map; i++) { + mem->map[i] = kmalloc(sizeof(**map), GFP_KERNEL); + if (!mem->map[i]) + goto err2; + } + + WARN_ON(!is_power_of_2(RXE_BUF_PER_MAP)); + + mem->map_shift = ilog2(RXE_BUF_PER_MAP); + mem->map_mask = RXE_BUF_PER_MAP - 1; + + mem->num_buf = num_buf; + mem->num_map = num_map; + mem->max_buf = num_map * RXE_BUF_PER_MAP; + + return 0; + +err2: + for (i--; i >= 0; i--) + kfree(mem->map[i]); + + kfree(mem->map); +err1: + return -ENOMEM; +} + +int rxe_mem_init_dma(struct rxe_dev *rxe, struct rxe_pd *pd, + int access, struct rxe_mem *mem) +{ + rxe_mem_init(access, mem); + + mem->pd = pd; + mem->access = access; + mem->state = RXE_MEM_STATE_VALID; + mem->type = RXE_MEM_TYPE_DMA; + + return 0; +} + +int rxe_mem_init_user(struct rxe_dev *rxe, struct rxe_pd *pd, u64 start, + u64 length, u64 iova, int access, struct ib_udata *udata, + struct rxe_mem *mem) +{ + int entry; + struct rxe_map **map; + struct rxe_phys_buf *buf = NULL; + struct ib_umem *umem; + struct scatterlist *sg; + int num_buf; + void *vaddr; + int err; + + umem = ib_umem_get(pd->ibpd.uobject->context, start, length, access, 0); + if (IS_ERR(umem)) { + pr_warn("err %d from rxe_umem_get\n", + (int)PTR_ERR(umem)); + err = -EINVAL; + goto err1; + } + + mem->umem = umem; + num_buf = umem->nmap; + + rxe_mem_init(access, mem); + + err = rxe_mem_alloc(rxe, mem, num_buf); + if (err) { + pr_warn("err %d from rxe_mem_alloc\n", err); + ib_umem_release(umem); + goto err1; + } + + WARN_ON(!is_power_of_2(umem->page_size)); + + mem->page_shift = ilog2(umem->page_size); + mem->page_mask = umem->page_size - 1; + + num_buf = 0; + map = mem->map; + if (length > 0) { + buf = map[0]->buf; + + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + vaddr = page_address(sg_page(sg)); + if (!vaddr) { + pr_warn("null vaddr\n"); + err = -ENOMEM; + goto err1; + } + + buf->addr = (uintptr_t)vaddr; + buf->size = umem->page_size; + num_buf++; + buf++; + + if (num_buf >= RXE_BUF_PER_MAP) { + map++; + buf = map[0]->buf; + num_buf = 0; + } + } + } + + mem->pd = pd; + mem->umem = umem; + mem->access = access; + mem->length = length; + mem->iova = iova; + mem->va = start; + mem->offset = ib_umem_offset(umem); + mem->state = RXE_MEM_STATE_VALID; + mem->type = RXE_MEM_TYPE_MR; + + return 0; + +err1: + return err; +} + +int rxe_mem_init_fast(struct rxe_dev *rxe, struct rxe_pd *pd, + int max_pages, struct rxe_mem *mem) +{ + int err; + + rxe_mem_init(0, mem); + + /* In fastreg, we also set the rkey */ + mem->ibmr.rkey = mem->ibmr.lkey; + + err = rxe_mem_alloc(rxe, mem, max_pages); + if (err) + goto err1; + + mem->pd = pd; + mem->max_buf = max_pages; + mem->state = RXE_MEM_STATE_FREE; + mem->type = RXE_MEM_TYPE_MR; + + return 0; + +err1: + return err; +} + +static void lookup_iova( + struct rxe_mem *mem, + u64 iova, + int *m_out, + int *n_out, + size_t *offset_out) +{ + size_t offset = iova - mem->iova + mem->offset; + int map_index; + int buf_index; + u64 length; + + if (likely(mem->page_shift)) { + *offset_out = offset & mem->page_mask; + offset >>= mem->page_shift; + *n_out = offset & mem->map_mask; + *m_out = offset >> mem->map_shift; + } else { + map_index = 0; + buf_index = 0; + + length = mem->map[map_index]->buf[buf_index].size; + + while (offset >= length) { + offset -= length; + buf_index++; + + if (buf_index == RXE_BUF_PER_MAP) { + map_index++; + buf_index = 0; + } + length = mem->map[map_index]->buf[buf_index].size; + } + + *m_out = map_index; + *n_out = buf_index; + *offset_out = offset; + } +} + +void *iova_to_vaddr(struct rxe_mem *mem, u64 iova, int length) +{ + size_t offset; + int m, n; + void *addr; + + if (mem->state != RXE_MEM_STATE_VALID) { + pr_warn("mem not in valid state\n"); + addr = NULL; + goto out; + } + + if (!mem->map) { + addr = (void *)(uintptr_t)iova; + goto out; + } + + if (mem_check_range(mem, iova, length)) { + pr_warn("range violation\n"); + addr = NULL; + goto out; + } + + lookup_iova(mem, iova, &m, &n, &offset); + + if (offset + length > mem->map[m]->buf[n].size) { + pr_warn("crosses page boundary\n"); + addr = NULL; + goto out; + } + + addr = (void *)(uintptr_t)mem->map[m]->buf[n].addr + offset; + +out: + return addr; +} + +/* copy data from a range (vaddr, vaddr+length-1) to or from + * a mem object starting at iova. Compute incremental value of + * crc32 if crcp is not zero. caller must hold a reference to mem + */ +int rxe_mem_copy(struct rxe_mem *mem, u64 iova, void *addr, int length, + enum copy_direction dir, u32 *crcp) +{ + int err; + int bytes; + u8 *va; + struct rxe_map **map; + struct rxe_phys_buf *buf; + int m; + int i; + size_t offset; + u32 crc = crcp ? (*crcp) : 0; + + if (mem->type == RXE_MEM_TYPE_DMA) { + u8 *src, *dest; + + src = (dir == to_mem_obj) ? + addr : ((void *)(uintptr_t)iova); + + dest = (dir == to_mem_obj) ? + ((void *)(uintptr_t)iova) : addr; + + if (crcp) + *crcp = crc32_le(*crcp, src, length); + + memcpy(dest, src, length); + + return 0; + } + + WARN_ON(!mem->map); + + err = mem_check_range(mem, iova, length); + if (err) { + err = -EFAULT; + goto err1; + } + + lookup_iova(mem, iova, &m, &i, &offset); + + map = mem->map + m; + buf = map[0]->buf + i; + + while (length > 0) { + u8 *src, *dest; + + va = (u8 *)(uintptr_t)buf->addr + offset; + src = (dir == to_mem_obj) ? addr : va; + dest = (dir == to_mem_obj) ? va : addr; + + bytes = buf->size - offset; + + if (bytes > length) + bytes = length; + + if (crcp) + crc = crc32_le(crc, src, bytes); + + memcpy(dest, src, bytes); + + length -= bytes; + addr += bytes; + + offset = 0; + buf++; + i++; + + if (i == RXE_BUF_PER_MAP) { + i = 0; + map++; + buf = map[0]->buf; + } + } + + if (crcp) + *crcp = crc; + + return 0; + +err1: + return err; +} + +/* copy data in or out of a wqe, i.e. sg list + * under the control of a dma descriptor + */ +int copy_data( + struct rxe_dev *rxe, + struct rxe_pd *pd, + int access, + struct rxe_dma_info *dma, + void *addr, + int length, + enum copy_direction dir, + u32 *crcp) +{ + int bytes; + struct rxe_sge *sge = &dma->sge[dma->cur_sge]; + int offset = dma->sge_offset; + int resid = dma->resid; + struct rxe_mem *mem = NULL; + u64 iova; + int err; + + if (length == 0) + return 0; + + if (length > resid) { + err = -EINVAL; + goto err2; + } + + if (sge->length && (offset < sge->length)) { + mem = lookup_mem(pd, access, sge->lkey, lookup_local); + if (!mem) { + err = -EINVAL; + goto err1; + } + } + + while (length > 0) { + bytes = length; + + if (offset >= sge->length) { + if (mem) { + rxe_drop_ref(mem); + mem = NULL; + } + sge++; + dma->cur_sge++; + offset = 0; + + if (dma->cur_sge >= dma->num_sge) { + err = -ENOSPC; + goto err2; + } + + if (sge->length) { + mem = lookup_mem(pd, access, sge->lkey, + lookup_local); + if (!mem) { + err = -EINVAL; + goto err1; + } + } else { + continue; + } + } + + if (bytes > sge->length - offset) + bytes = sge->length - offset; + + if (bytes > 0) { + iova = sge->addr + offset; + + err = rxe_mem_copy(mem, iova, addr, bytes, dir, crcp); + if (err) + goto err2; + + offset += bytes; + resid -= bytes; + length -= bytes; + addr += bytes; + } + } + + dma->sge_offset = offset; + dma->resid = resid; + + if (mem) + rxe_drop_ref(mem); + + return 0; + +err2: + if (mem) + rxe_drop_ref(mem); +err1: + return err; +} + +int advance_dma_data(struct rxe_dma_info *dma, unsigned int length) +{ + struct rxe_sge *sge = &dma->sge[dma->cur_sge]; + int offset = dma->sge_offset; + int resid = dma->resid; + + while (length) { + unsigned int bytes; + + if (offset >= sge->length) { + sge++; + dma->cur_sge++; + offset = 0; + if (dma->cur_sge >= dma->num_sge) + return -ENOSPC; + } + + bytes = length; + + if (bytes > sge->length - offset) + bytes = sge->length - offset; + + offset += bytes; + resid -= bytes; + length -= bytes; + } + + dma->sge_offset = offset; + dma->resid = resid; + + return 0; +} + +/* (1) find the mem (mr or mw) corresponding to lkey/rkey + * depending on lookup_type + * (2) verify that the (qp) pd matches the mem pd + * (3) verify that the mem can support the requested access + * (4) verify that mem state is valid + */ +struct rxe_mem *lookup_mem(struct rxe_pd *pd, int access, u32 key, + enum lookup_type type) +{ + struct rxe_mem *mem; + struct rxe_dev *rxe = to_rdev(pd->ibpd.device); + int index = key >> 8; + + if (index >= RXE_MIN_MR_INDEX && index <= RXE_MAX_MR_INDEX) { + mem = rxe_pool_get_index(&rxe->mr_pool, index); + if (!mem) + goto err1; + } else { + goto err1; + } + + if ((type == lookup_local && mem->lkey != key) || + (type == lookup_remote && mem->rkey != key)) + goto err2; + + if (mem->pd != pd) + goto err2; + + if (access && !(access & mem->access)) + goto err2; + + if (mem->state != RXE_MEM_STATE_VALID) + goto err2; + + return mem; + +err2: + rxe_drop_ref(mem); +err1: + return NULL; +} + +int rxe_mem_map_pages(struct rxe_dev *rxe, struct rxe_mem *mem, + u64 *page, int num_pages, u64 iova) +{ + int i; + int num_buf; + int err; + struct rxe_map **map; + struct rxe_phys_buf *buf; + int page_size; + + if (num_pages > mem->max_buf) { + err = -EINVAL; + goto err1; + } + + num_buf = 0; + page_size = 1 << mem->page_shift; + map = mem->map; + buf = map[0]->buf; + + for (i = 0; i < num_pages; i++) { + buf->addr = *page++; + buf->size = page_size; + buf++; + num_buf++; + + if (num_buf == RXE_BUF_PER_MAP) { + map++; + buf = map[0]->buf; + num_buf = 0; + } + } + + mem->iova = iova; + mem->va = iova; + mem->length = num_pages << mem->page_shift; + mem->state = RXE_MEM_STATE_VALID; + + return 0; + +err1: + return err; +} diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c new file mode 100644 index 000000000000..0b8d2ea8b41d --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -0,0 +1,708 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rxe.h" +#include "rxe_net.h" +#include "rxe_loc.h" + +static LIST_HEAD(rxe_dev_list); +static spinlock_t dev_list_lock; /* spinlock for device list */ + +struct rxe_dev *net_to_rxe(struct net_device *ndev) +{ + struct rxe_dev *rxe; + struct rxe_dev *found = NULL; + + spin_lock_bh(&dev_list_lock); + list_for_each_entry(rxe, &rxe_dev_list, list) { + if (rxe->ndev == ndev) { + found = rxe; + break; + } + } + spin_unlock_bh(&dev_list_lock); + + return found; +} + +struct rxe_dev *get_rxe_by_name(const char* name) +{ + struct rxe_dev *rxe; + struct rxe_dev *found = NULL; + + spin_lock_bh(&dev_list_lock); + list_for_each_entry(rxe, &rxe_dev_list, list) { + if (!strcmp(name, rxe->ib_dev.name)) { + found = rxe; + break; + } + } + spin_unlock_bh(&dev_list_lock); + return found; +} + + +struct rxe_recv_sockets recv_sockets; + +static __be64 rxe_mac_to_eui64(struct net_device *ndev) +{ + unsigned char *mac_addr = ndev->dev_addr; + __be64 eui64; + unsigned char *dst = (unsigned char *)&eui64; + + dst[0] = mac_addr[0] ^ 2; + dst[1] = mac_addr[1]; + dst[2] = mac_addr[2]; + dst[3] = 0xff; + dst[4] = 0xfe; + dst[5] = mac_addr[3]; + dst[6] = mac_addr[4]; + dst[7] = mac_addr[5]; + + return eui64; +} + +static __be64 node_guid(struct rxe_dev *rxe) +{ + return rxe_mac_to_eui64(rxe->ndev); +} + +static __be64 port_guid(struct rxe_dev *rxe) +{ + return rxe_mac_to_eui64(rxe->ndev); +} + +static struct device *dma_device(struct rxe_dev *rxe) +{ + struct net_device *ndev; + + ndev = rxe->ndev; + + if (ndev->priv_flags & IFF_802_1Q_VLAN) + ndev = vlan_dev_real_dev(ndev); + + return ndev->dev.parent; +} + +static int mcast_add(struct rxe_dev *rxe, union ib_gid *mgid) +{ + int err; + unsigned char ll_addr[ETH_ALEN]; + + ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr); + err = dev_mc_add(rxe->ndev, ll_addr); + + return err; +} + +static int mcast_delete(struct rxe_dev *rxe, union ib_gid *mgid) +{ + int err; + unsigned char ll_addr[ETH_ALEN]; + + ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr); + err = dev_mc_del(rxe->ndev, ll_addr); + + return err; +} + +static struct dst_entry *rxe_find_route4(struct net_device *ndev, + struct in_addr *saddr, + struct in_addr *daddr) +{ + struct rtable *rt; + struct flowi4 fl = { { 0 } }; + + memset(&fl, 0, sizeof(fl)); + fl.flowi4_oif = ndev->ifindex; + memcpy(&fl.saddr, saddr, sizeof(*saddr)); + memcpy(&fl.daddr, daddr, sizeof(*daddr)); + fl.flowi4_proto = IPPROTO_UDP; + + rt = ip_route_output_key(&init_net, &fl); + if (IS_ERR(rt)) { + pr_err_ratelimited("no route to %pI4\n", &daddr->s_addr); + return NULL; + } + + return &rt->dst; +} + +#if IS_ENABLED(CONFIG_IPV6) +static struct dst_entry *rxe_find_route6(struct net_device *ndev, + struct in6_addr *saddr, + struct in6_addr *daddr) +{ + struct dst_entry *ndst; + struct flowi6 fl6 = { { 0 } }; + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_oif = ndev->ifindex; + memcpy(&fl6.saddr, saddr, sizeof(*saddr)); + memcpy(&fl6.daddr, daddr, sizeof(*daddr)); + fl6.flowi6_proto = IPPROTO_UDP; + + if (unlikely(ipv6_stub->ipv6_dst_lookup(sock_net(recv_sockets.sk6->sk), + recv_sockets.sk6->sk, &ndst, &fl6))) { + pr_err_ratelimited("no route to %pI6\n", daddr); + goto put; + } + + if (unlikely(ndst->error)) { + pr_err("no route to %pI6\n", daddr); + goto put; + } + + return ndst; +put: + dst_release(ndst); + return NULL; +} + +#else + +static struct dst_entry *rxe_find_route6(struct net_device *ndev, + struct in6_addr *saddr, + struct in6_addr *daddr) +{ + return NULL; +} + +#endif + +static int rxe_udp_encap_recv(struct sock *sk, struct sk_buff *skb) +{ + struct udphdr *udph; + struct net_device *ndev = skb->dev; + struct rxe_dev *rxe = net_to_rxe(ndev); + struct rxe_pkt_info *pkt = SKB_TO_PKT(skb); + + if (!rxe) + goto drop; + + if (skb_linearize(skb)) { + pr_err("skb_linearize failed\n"); + goto drop; + } + + udph = udp_hdr(skb); + pkt->rxe = rxe; + pkt->port_num = 1; + pkt->hdr = (u8 *)(udph + 1); + pkt->mask = RXE_GRH_MASK; + pkt->paylen = be16_to_cpu(udph->len) - sizeof(*udph); + + return rxe_rcv(skb); +drop: + kfree_skb(skb); + return 0; +} + +static struct socket *rxe_setup_udp_tunnel(struct net *net, __be16 port, + bool ipv6) +{ + int err; + struct socket *sock; + struct udp_port_cfg udp_cfg; + struct udp_tunnel_sock_cfg tnl_cfg; + + memset(&udp_cfg, 0, sizeof(udp_cfg)); + + if (ipv6) { + udp_cfg.family = AF_INET6; + udp_cfg.ipv6_v6only = 1; + } else { + udp_cfg.family = AF_INET; + } + + udp_cfg.local_udp_port = port; + + /* Create UDP socket */ + err = udp_sock_create(net, &udp_cfg, &sock); + if (err < 0) { + pr_err("failed to create udp socket. err = %d\n", err); + return ERR_PTR(err); + } + + tnl_cfg.sk_user_data = NULL; + tnl_cfg.encap_type = 1; + tnl_cfg.encap_rcv = rxe_udp_encap_recv; + tnl_cfg.encap_destroy = NULL; + + /* Setup UDP tunnel */ + setup_udp_tunnel_sock(net, sock, &tnl_cfg); + + return sock; +} + +static void rxe_release_udp_tunnel(struct socket *sk) +{ + udp_tunnel_sock_release(sk); +} + +static void prepare_udp_hdr(struct sk_buff *skb, __be16 src_port, + __be16 dst_port) +{ + struct udphdr *udph; + + __skb_push(skb, sizeof(*udph)); + skb_reset_transport_header(skb); + udph = udp_hdr(skb); + + udph->dest = dst_port; + udph->source = src_port; + udph->len = htons(skb->len); + udph->check = 0; +} + +static void prepare_ipv4_hdr(struct dst_entry *dst, struct sk_buff *skb, + __be32 saddr, __be32 daddr, __u8 proto, + __u8 tos, __u8 ttl, __be16 df, bool xnet) +{ + struct iphdr *iph; + + skb_scrub_packet(skb, xnet); + + skb_clear_hash(skb); + skb_dst_set(skb, dst); + memset(IPCB(skb), 0, sizeof(*IPCB(skb))); + + skb_push(skb, sizeof(struct iphdr)); + skb_reset_network_header(skb); + + iph = ip_hdr(skb); + + iph->version = IPVERSION; + iph->ihl = sizeof(struct iphdr) >> 2; + iph->frag_off = df; + iph->protocol = proto; + iph->tos = tos; + iph->daddr = daddr; + iph->saddr = saddr; + iph->ttl = ttl; + __ip_select_ident(dev_net(dst->dev), iph, + skb_shinfo(skb)->gso_segs ?: 1); + iph->tot_len = htons(skb->len); + ip_send_check(iph); +} + +static void prepare_ipv6_hdr(struct dst_entry *dst, struct sk_buff *skb, + struct in6_addr *saddr, struct in6_addr *daddr, + __u8 proto, __u8 prio, __u8 ttl) +{ + struct ipv6hdr *ip6h; + + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED + | IPSKB_REROUTED); + skb_dst_set(skb, dst); + + __skb_push(skb, sizeof(*ip6h)); + skb_reset_network_header(skb); + ip6h = ipv6_hdr(skb); + ip6_flow_hdr(ip6h, prio, htonl(0)); + ip6h->payload_len = htons(skb->len); + ip6h->nexthdr = proto; + ip6h->hop_limit = ttl; + ip6h->daddr = *daddr; + ip6h->saddr = *saddr; + ip6h->payload_len = htons(skb->len - sizeof(*ip6h)); +} + +static int prepare4(struct rxe_dev *rxe, struct sk_buff *skb, struct rxe_av *av) +{ + struct dst_entry *dst; + bool xnet = false; + __be16 df = htons(IP_DF); + struct in_addr *saddr = &av->sgid_addr._sockaddr_in.sin_addr; + struct in_addr *daddr = &av->dgid_addr._sockaddr_in.sin_addr; + struct rxe_pkt_info *pkt = SKB_TO_PKT(skb); + + dst = rxe_find_route4(rxe->ndev, saddr, daddr); + if (!dst) { + pr_err("Host not reachable\n"); + return -EHOSTUNREACH; + } + + if (!memcmp(saddr, daddr, sizeof(*daddr))) + pkt->mask |= RXE_LOOPBACK_MASK; + + prepare_udp_hdr(skb, htons(RXE_ROCE_V2_SPORT), + htons(ROCE_V2_UDP_DPORT)); + + prepare_ipv4_hdr(dst, skb, saddr->s_addr, daddr->s_addr, IPPROTO_UDP, + av->grh.traffic_class, av->grh.hop_limit, df, xnet); + return 0; +} + +static int prepare6(struct rxe_dev *rxe, struct sk_buff *skb, struct rxe_av *av) +{ + struct dst_entry *dst; + struct in6_addr *saddr = &av->sgid_addr._sockaddr_in6.sin6_addr; + struct in6_addr *daddr = &av->dgid_addr._sockaddr_in6.sin6_addr; + struct rxe_pkt_info *pkt = SKB_TO_PKT(skb); + + dst = rxe_find_route6(rxe->ndev, saddr, daddr); + if (!dst) { + pr_err("Host not reachable\n"); + return -EHOSTUNREACH; + } + + if (!memcmp(saddr, daddr, sizeof(*daddr))) + pkt->mask |= RXE_LOOPBACK_MASK; + + prepare_udp_hdr(skb, htons(RXE_ROCE_V2_SPORT), + htons(ROCE_V2_UDP_DPORT)); + + prepare_ipv6_hdr(dst, skb, saddr, daddr, IPPROTO_UDP, + av->grh.traffic_class, + av->grh.hop_limit); + return 0; +} + +static int prepare(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, + struct sk_buff *skb, u32 *crc) +{ + int err = 0; + struct rxe_av *av = rxe_get_av(pkt); + + if (av->network_type == RDMA_NETWORK_IPV4) + err = prepare4(rxe, skb, av); + else if (av->network_type == RDMA_NETWORK_IPV6) + err = prepare6(rxe, skb, av); + + *crc = rxe_icrc_hdr(pkt, skb); + + return err; +} + +static void rxe_skb_tx_dtor(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + struct rxe_qp *qp = sk->sk_user_data; + int skb_out = atomic_dec_return(&qp->skb_out); + + if (unlikely(qp->need_req_skb && + skb_out < RXE_INFLIGHT_SKBS_PER_QP_LOW)) + rxe_run_task(&qp->req.task, 1); +} + +static int send(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, + struct sk_buff *skb) +{ + struct sk_buff *nskb; + struct rxe_av *av; + int err; + + av = rxe_get_av(pkt); + + nskb = skb_clone(skb, GFP_ATOMIC); + if (!nskb) + return -ENOMEM; + + nskb->destructor = rxe_skb_tx_dtor; + nskb->sk = pkt->qp->sk->sk; + + if (av->network_type == RDMA_NETWORK_IPV4) { + err = ip_local_out(dev_net(skb_dst(skb)->dev), nskb->sk, nskb); + } else if (av->network_type == RDMA_NETWORK_IPV6) { + err = ip6_local_out(dev_net(skb_dst(skb)->dev), nskb->sk, nskb); + } else { + pr_err("Unknown layer 3 protocol: %d\n", av->network_type); + kfree_skb(nskb); + return -EINVAL; + } + + if (unlikely(net_xmit_eval(err))) { + pr_debug("error sending packet: %d\n", err); + return -EAGAIN; + } + + kfree_skb(skb); + + return 0; +} + +static int loopback(struct sk_buff *skb) +{ + return rxe_rcv(skb); +} + +static inline int addr_same(struct rxe_dev *rxe, struct rxe_av *av) +{ + return rxe->port.port_guid == av->grh.dgid.global.interface_id; +} + +static struct sk_buff *init_packet(struct rxe_dev *rxe, struct rxe_av *av, + int paylen, struct rxe_pkt_info *pkt) +{ + unsigned int hdr_len; + struct sk_buff *skb; + + if (av->network_type == RDMA_NETWORK_IPV4) + hdr_len = ETH_HLEN + sizeof(struct udphdr) + + sizeof(struct iphdr); + else + hdr_len = ETH_HLEN + sizeof(struct udphdr) + + sizeof(struct ipv6hdr); + + skb = alloc_skb(paylen + hdr_len + LL_RESERVED_SPACE(rxe->ndev), + GFP_ATOMIC); + if (unlikely(!skb)) + return NULL; + + skb_reserve(skb, hdr_len + LL_RESERVED_SPACE(rxe->ndev)); + + skb->dev = rxe->ndev; + if (av->network_type == RDMA_NETWORK_IPV4) + skb->protocol = htons(ETH_P_IP); + else + skb->protocol = htons(ETH_P_IPV6); + + pkt->rxe = rxe; + pkt->port_num = 1; + pkt->hdr = skb_put(skb, paylen); + pkt->mask |= RXE_GRH_MASK; + + memset(pkt->hdr, 0, paylen); + + return skb; +} + +/* + * this is required by rxe_cfg to match rxe devices in + * /sys/class/infiniband up with their underlying ethernet devices + */ +static char *parent_name(struct rxe_dev *rxe, unsigned int port_num) +{ + return rxe->ndev->name; +} + +static enum rdma_link_layer link_layer(struct rxe_dev *rxe, + unsigned int port_num) +{ + return IB_LINK_LAYER_ETHERNET; +} + +static struct rxe_ifc_ops ifc_ops = { + .node_guid = node_guid, + .port_guid = port_guid, + .dma_device = dma_device, + .mcast_add = mcast_add, + .mcast_delete = mcast_delete, + .prepare = prepare, + .send = send, + .loopback = loopback, + .init_packet = init_packet, + .parent_name = parent_name, + .link_layer = link_layer, +}; + +struct rxe_dev *rxe_net_add(struct net_device *ndev) +{ + int err; + struct rxe_dev *rxe = NULL; + + rxe = (struct rxe_dev *)ib_alloc_device(sizeof(*rxe)); + if (!rxe) + return NULL; + + rxe->ifc_ops = &ifc_ops; + rxe->ndev = ndev; + + err = rxe_add(rxe, ndev->mtu); + if (err) { + ib_dealloc_device(&rxe->ib_dev); + return NULL; + } + + spin_lock_bh(&dev_list_lock); + list_add_tail(&rxe_dev_list, &rxe->list); + spin_unlock_bh(&dev_list_lock); + return rxe; +} + +void rxe_remove_all(void) +{ + spin_lock_bh(&dev_list_lock); + while (!list_empty(&rxe_dev_list)) { + struct rxe_dev *rxe = + list_first_entry(&rxe_dev_list, struct rxe_dev, list); + + list_del(&rxe->list); + spin_unlock_bh(&dev_list_lock); + rxe_remove(rxe); + spin_lock_bh(&dev_list_lock); + } + spin_unlock_bh(&dev_list_lock); +} +EXPORT_SYMBOL(rxe_remove_all); + +static void rxe_port_event(struct rxe_dev *rxe, + enum ib_event_type event) +{ + struct ib_event ev; + + ev.device = &rxe->ib_dev; + ev.element.port_num = 1; + ev.event = event; + + ib_dispatch_event(&ev); +} + +/* Caller must hold net_info_lock */ +void rxe_port_up(struct rxe_dev *rxe) +{ + struct rxe_port *port; + + port = &rxe->port; + port->attr.state = IB_PORT_ACTIVE; + port->attr.phys_state = IB_PHYS_STATE_LINK_UP; + + rxe_port_event(rxe, IB_EVENT_PORT_ACTIVE); + pr_info("rxe: set %s active\n", rxe->ib_dev.name); + return; +} + +/* Caller must hold net_info_lock */ +void rxe_port_down(struct rxe_dev *rxe) +{ + struct rxe_port *port; + + port = &rxe->port; + port->attr.state = IB_PORT_DOWN; + port->attr.phys_state = IB_PHYS_STATE_LINK_DOWN; + + rxe_port_event(rxe, IB_EVENT_PORT_ERR); + pr_info("rxe: set %s down\n", rxe->ib_dev.name); + return; +} + +static int rxe_notify(struct notifier_block *not_blk, + unsigned long event, + void *arg) +{ + struct net_device *ndev = netdev_notifier_info_to_dev(arg); + struct rxe_dev *rxe = net_to_rxe(ndev); + + if (!rxe) + goto out; + + switch (event) { + case NETDEV_UNREGISTER: + list_del(&rxe->list); + rxe_remove(rxe); + break; + case NETDEV_UP: + rxe_port_up(rxe); + break; + case NETDEV_DOWN: + rxe_port_down(rxe); + break; + case NETDEV_CHANGEMTU: + pr_info("rxe: %s changed mtu to %d\n", ndev->name, ndev->mtu); + rxe_set_mtu(rxe, ndev->mtu); + break; + case NETDEV_REBOOT: + case NETDEV_CHANGE: + case NETDEV_GOING_DOWN: + case NETDEV_CHANGEADDR: + case NETDEV_CHANGENAME: + case NETDEV_FEAT_CHANGE: + default: + pr_info("rxe: ignoring netdev event = %ld for %s\n", + event, ndev->name); + break; + } +out: + return NOTIFY_OK; +} + +static struct notifier_block rxe_net_notifier = { + .notifier_call = rxe_notify, +}; + +int rxe_net_init(void) +{ + int err; + + spin_lock_init(&dev_list_lock); + + recv_sockets.sk6 = rxe_setup_udp_tunnel(&init_net, + htons(ROCE_V2_UDP_DPORT), true); + if (IS_ERR(recv_sockets.sk6)) { + recv_sockets.sk6 = NULL; + pr_err("rxe: Failed to create IPv6 UDP tunnel\n"); + return -1; + } + + recv_sockets.sk4 = rxe_setup_udp_tunnel(&init_net, + htons(ROCE_V2_UDP_DPORT), false); + if (IS_ERR(recv_sockets.sk4)) { + rxe_release_udp_tunnel(recv_sockets.sk6); + recv_sockets.sk4 = NULL; + recv_sockets.sk6 = NULL; + pr_err("rxe: Failed to create IPv4 UDP tunnel\n"); + return -1; + } + + err = register_netdevice_notifier(&rxe_net_notifier); + if (err) { + rxe_release_udp_tunnel(recv_sockets.sk6); + rxe_release_udp_tunnel(recv_sockets.sk4); + pr_err("rxe: Failed to rigister netdev notifier\n"); + } + + return err; +} + +void rxe_net_exit(void) +{ + if (recv_sockets.sk6) + rxe_release_udp_tunnel(recv_sockets.sk6); + + if (recv_sockets.sk4) + rxe_release_udp_tunnel(recv_sockets.sk4); + + unregister_netdevice_notifier(&rxe_net_notifier); +} diff --git a/drivers/infiniband/sw/rxe/rxe_net.h b/drivers/infiniband/sw/rxe/rxe_net.h new file mode 100644 index 000000000000..7b06f76d16cc --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_net.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef RXE_NET_H +#define RXE_NET_H + +#include +#include +#include + +struct rxe_recv_sockets { + struct socket *sk4; + struct socket *sk6; +}; + +extern struct rxe_recv_sockets recv_sockets; + +struct rxe_dev *rxe_net_add(struct net_device *ndev); + +int rxe_net_init(void); +void rxe_net_exit(void); + +#endif /* RXE_NET_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_opcode.c b/drivers/infiniband/sw/rxe/rxe_opcode.c new file mode 100644 index 000000000000..61927c165b59 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_opcode.c @@ -0,0 +1,961 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include "rxe_opcode.h" +#include "rxe_hdr.h" + +/* useful information about work request opcodes and pkt opcodes in + * table form + */ +struct rxe_wr_opcode_info rxe_wr_opcode_info[] = { + [IB_WR_RDMA_WRITE] = { + .name = "IB_WR_RDMA_WRITE", + .mask = { + [IB_QPT_RC] = WR_INLINE_MASK | WR_WRITE_MASK, + [IB_QPT_UC] = WR_INLINE_MASK | WR_WRITE_MASK, + }, + }, + [IB_WR_RDMA_WRITE_WITH_IMM] = { + .name = "IB_WR_RDMA_WRITE_WITH_IMM", + .mask = { + [IB_QPT_RC] = WR_INLINE_MASK | WR_WRITE_MASK, + [IB_QPT_UC] = WR_INLINE_MASK | WR_WRITE_MASK, + }, + }, + [IB_WR_SEND] = { + .name = "IB_WR_SEND", + .mask = { + [IB_QPT_SMI] = WR_INLINE_MASK | WR_SEND_MASK, + [IB_QPT_GSI] = WR_INLINE_MASK | WR_SEND_MASK, + [IB_QPT_RC] = WR_INLINE_MASK | WR_SEND_MASK, + [IB_QPT_UC] = WR_INLINE_MASK | WR_SEND_MASK, + [IB_QPT_UD] = WR_INLINE_MASK | WR_SEND_MASK, + }, + }, + [IB_WR_SEND_WITH_IMM] = { + .name = "IB_WR_SEND_WITH_IMM", + .mask = { + [IB_QPT_SMI] = WR_INLINE_MASK | WR_SEND_MASK, + [IB_QPT_GSI] = WR_INLINE_MASK | WR_SEND_MASK, + [IB_QPT_RC] = WR_INLINE_MASK | WR_SEND_MASK, + [IB_QPT_UC] = WR_INLINE_MASK | WR_SEND_MASK, + [IB_QPT_UD] = WR_INLINE_MASK | WR_SEND_MASK, + }, + }, + [IB_WR_RDMA_READ] = { + .name = "IB_WR_RDMA_READ", + .mask = { + [IB_QPT_RC] = WR_READ_MASK, + }, + }, + [IB_WR_ATOMIC_CMP_AND_SWP] = { + .name = "IB_WR_ATOMIC_CMP_AND_SWP", + .mask = { + [IB_QPT_RC] = WR_ATOMIC_MASK, + }, + }, + [IB_WR_ATOMIC_FETCH_AND_ADD] = { + .name = "IB_WR_ATOMIC_FETCH_AND_ADD", + .mask = { + [IB_QPT_RC] = WR_ATOMIC_MASK, + }, + }, + [IB_WR_LSO] = { + .name = "IB_WR_LSO", + .mask = { + /* not supported */ + }, + }, + [IB_WR_SEND_WITH_INV] = { + .name = "IB_WR_SEND_WITH_INV", + .mask = { + [IB_QPT_RC] = WR_INLINE_MASK | WR_SEND_MASK, + [IB_QPT_UC] = WR_INLINE_MASK | WR_SEND_MASK, + [IB_QPT_UD] = WR_INLINE_MASK | WR_SEND_MASK, + }, + }, + [IB_WR_RDMA_READ_WITH_INV] = { + .name = "IB_WR_RDMA_READ_WITH_INV", + .mask = { + [IB_QPT_RC] = WR_READ_MASK, + }, + }, + [IB_WR_LOCAL_INV] = { + .name = "IB_WR_LOCAL_INV", + .mask = { + [IB_QPT_RC] = WR_REG_MASK, + }, + }, + [IB_WR_REG_MR] = { + .name = "IB_WR_REG_MR", + .mask = { + [IB_QPT_RC] = WR_REG_MASK, + }, + }, +}; + +struct rxe_opcode_info rxe_opcode[RXE_NUM_OPCODE] = { + [IB_OPCODE_RC_SEND_FIRST] = { + .name = "IB_OPCODE_RC_SEND_FIRST", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_RWR_MASK + | RXE_SEND_MASK | RXE_START_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_RC_SEND_MIDDLE] = { + .name = "IB_OPCODE_RC_SEND_MIDDLE]", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_SEND_MASK + | RXE_MIDDLE_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_RC_SEND_LAST] = { + .name = "IB_OPCODE_RC_SEND_LAST", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_COMP_MASK + | RXE_SEND_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE", + .mask = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_COMP_MASK | RXE_SEND_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_IMMDT] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_RC_SEND_ONLY] = { + .name = "IB_OPCODE_RC_SEND_ONLY", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_COMP_MASK + | RXE_RWR_MASK | RXE_SEND_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE", + .mask = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_COMP_MASK | RXE_RWR_MASK | RXE_SEND_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_IMMDT] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_WRITE_FIRST] = { + .name = "IB_OPCODE_RC_RDMA_WRITE_FIRST", + .mask = RXE_RETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_WRITE_MASK | RXE_START_MASK, + .length = RXE_BTH_BYTES + RXE_RETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RETH_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_WRITE_MIDDLE] = { + .name = "IB_OPCODE_RC_RDMA_WRITE_MIDDLE", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_WRITE_MASK + | RXE_MIDDLE_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_WRITE_LAST] = { + .name = "IB_OPCODE_RC_RDMA_WRITE_LAST", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_WRITE_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE", + .mask = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_WRITE_MASK | RXE_COMP_MASK | RXE_RWR_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_IMMDT] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_WRITE_ONLY] = { + .name = "IB_OPCODE_RC_RDMA_WRITE_ONLY", + .mask = RXE_RETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_WRITE_MASK | RXE_START_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_RETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RETH_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE", + .mask = RXE_RETH_MASK | RXE_IMMDT_MASK | RXE_PAYLOAD_MASK + | RXE_REQ_MASK | RXE_WRITE_MASK + | RXE_COMP_MASK | RXE_RWR_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_RETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RETH] = RXE_BTH_BYTES, + [RXE_IMMDT] = RXE_BTH_BYTES + + RXE_RETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RETH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_READ_REQUEST] = { + .name = "IB_OPCODE_RC_RDMA_READ_REQUEST", + .mask = RXE_RETH_MASK | RXE_REQ_MASK | RXE_READ_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_RETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RETH_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST] = { + .name = "IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST", + .mask = RXE_AETH_MASK | RXE_PAYLOAD_MASK | RXE_ACK_MASK + | RXE_START_MASK, + .length = RXE_BTH_BYTES + RXE_AETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_AETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_AETH_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE] = { + .name = "IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE", + .mask = RXE_PAYLOAD_MASK | RXE_ACK_MASK | RXE_MIDDLE_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST] = { + .name = "IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST", + .mask = RXE_AETH_MASK | RXE_PAYLOAD_MASK | RXE_ACK_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_AETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_AETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_AETH_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY] = { + .name = "IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY", + .mask = RXE_AETH_MASK | RXE_PAYLOAD_MASK | RXE_ACK_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_AETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_AETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_AETH_BYTES, + } + }, + [IB_OPCODE_RC_ACKNOWLEDGE] = { + .name = "IB_OPCODE_RC_ACKNOWLEDGE", + .mask = RXE_AETH_MASK | RXE_ACK_MASK | RXE_START_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_AETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_AETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_AETH_BYTES, + } + }, + [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE] = { + .name = "IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE", + .mask = RXE_AETH_MASK | RXE_ATMACK_MASK | RXE_ACK_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_ATMACK_BYTES + RXE_AETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_AETH] = RXE_BTH_BYTES, + [RXE_ATMACK] = RXE_BTH_BYTES + + RXE_AETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_ATMACK_BYTES + RXE_AETH_BYTES, + } + }, + [IB_OPCODE_RC_COMPARE_SWAP] = { + .name = "IB_OPCODE_RC_COMPARE_SWAP", + .mask = RXE_ATMETH_MASK | RXE_REQ_MASK | RXE_ATOMIC_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_ATMETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_ATMETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_ATMETH_BYTES, + } + }, + [IB_OPCODE_RC_FETCH_ADD] = { + .name = "IB_OPCODE_RC_FETCH_ADD", + .mask = RXE_ATMETH_MASK | RXE_REQ_MASK | RXE_ATOMIC_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_ATMETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_ATMETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_ATMETH_BYTES, + } + }, + [IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE] = { + .name = "IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE", + .mask = RXE_IETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_COMP_MASK | RXE_SEND_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_IETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_IETH_BYTES, + } + }, + [IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE] = { + .name = "IB_OPCODE_RC_SEND_ONLY_INV", + .mask = RXE_IETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_COMP_MASK | RXE_RWR_MASK | RXE_SEND_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_IETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_IETH_BYTES, + } + }, + + /* UC */ + [IB_OPCODE_UC_SEND_FIRST] = { + .name = "IB_OPCODE_UC_SEND_FIRST", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_RWR_MASK + | RXE_SEND_MASK | RXE_START_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_UC_SEND_MIDDLE] = { + .name = "IB_OPCODE_UC_SEND_MIDDLE", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_SEND_MASK + | RXE_MIDDLE_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_UC_SEND_LAST] = { + .name = "IB_OPCODE_UC_SEND_LAST", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_COMP_MASK + | RXE_SEND_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE", + .mask = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_COMP_MASK | RXE_SEND_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_IMMDT] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_UC_SEND_ONLY] = { + .name = "IB_OPCODE_UC_SEND_ONLY", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_COMP_MASK + | RXE_RWR_MASK | RXE_SEND_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE", + .mask = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_COMP_MASK | RXE_RWR_MASK | RXE_SEND_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_IMMDT] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_UC_RDMA_WRITE_FIRST] = { + .name = "IB_OPCODE_UC_RDMA_WRITE_FIRST", + .mask = RXE_RETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_WRITE_MASK | RXE_START_MASK, + .length = RXE_BTH_BYTES + RXE_RETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RETH_BYTES, + } + }, + [IB_OPCODE_UC_RDMA_WRITE_MIDDLE] = { + .name = "IB_OPCODE_UC_RDMA_WRITE_MIDDLE", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_WRITE_MASK + | RXE_MIDDLE_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_UC_RDMA_WRITE_LAST] = { + .name = "IB_OPCODE_UC_RDMA_WRITE_LAST", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_WRITE_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE", + .mask = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_WRITE_MASK | RXE_COMP_MASK | RXE_RWR_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_IMMDT] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_UC_RDMA_WRITE_ONLY] = { + .name = "IB_OPCODE_UC_RDMA_WRITE_ONLY", + .mask = RXE_RETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_WRITE_MASK | RXE_START_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_RETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RETH_BYTES, + } + }, + [IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE", + .mask = RXE_RETH_MASK | RXE_IMMDT_MASK | RXE_PAYLOAD_MASK + | RXE_REQ_MASK | RXE_WRITE_MASK + | RXE_COMP_MASK | RXE_RWR_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_RETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RETH] = RXE_BTH_BYTES, + [RXE_IMMDT] = RXE_BTH_BYTES + + RXE_RETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RETH_BYTES + + RXE_IMMDT_BYTES, + } + }, + + /* RD */ + [IB_OPCODE_RD_SEND_FIRST] = { + .name = "IB_OPCODE_RD_SEND_FIRST", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK + | RXE_REQ_MASK | RXE_RWR_MASK | RXE_SEND_MASK + | RXE_START_MASK, + .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + } + }, + [IB_OPCODE_RD_SEND_MIDDLE] = { + .name = "IB_OPCODE_RD_SEND_MIDDLE", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK + | RXE_REQ_MASK | RXE_SEND_MASK + | RXE_MIDDLE_MASK, + .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + } + }, + [IB_OPCODE_RD_SEND_LAST] = { + .name = "IB_OPCODE_RD_SEND_LAST", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK + | RXE_REQ_MASK | RXE_COMP_MASK | RXE_SEND_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + } + }, + [IB_OPCODE_RD_SEND_LAST_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_RD_SEND_LAST_WITH_IMMEDIATE", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_IMMDT_MASK + | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_COMP_MASK | RXE_SEND_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_DETH_BYTES + + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_IMMDT] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_RD_SEND_ONLY] = { + .name = "IB_OPCODE_RD_SEND_ONLY", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK + | RXE_REQ_MASK | RXE_COMP_MASK | RXE_RWR_MASK + | RXE_SEND_MASK | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + } + }, + [IB_OPCODE_RD_SEND_ONLY_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_RD_SEND_ONLY_WITH_IMMEDIATE", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_IMMDT_MASK + | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_COMP_MASK | RXE_RWR_MASK | RXE_SEND_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_DETH_BYTES + + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_IMMDT] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_WRITE_FIRST] = { + .name = "IB_OPCODE_RD_RDMA_WRITE_FIRST", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_RETH_MASK + | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_WRITE_MASK | RXE_START_MASK, + .length = RXE_BTH_BYTES + RXE_RETH_BYTES + RXE_DETH_BYTES + + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_RETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES + + RXE_RETH_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_WRITE_MIDDLE] = { + .name = "IB_OPCODE_RD_RDMA_WRITE_MIDDLE", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK + | RXE_REQ_MASK | RXE_WRITE_MASK + | RXE_MIDDLE_MASK, + .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_WRITE_LAST] = { + .name = "IB_OPCODE_RD_RDMA_WRITE_LAST", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK + | RXE_REQ_MASK | RXE_WRITE_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_WRITE_LAST_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_RD_RDMA_WRITE_LAST_WITH_IMMEDIATE", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_IMMDT_MASK + | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_WRITE_MASK | RXE_COMP_MASK | RXE_RWR_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_DETH_BYTES + + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_IMMDT] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_WRITE_ONLY] = { + .name = "IB_OPCODE_RD_RDMA_WRITE_ONLY", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_RETH_MASK + | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_WRITE_MASK | RXE_START_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_RETH_BYTES + RXE_DETH_BYTES + + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_RETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES + + RXE_RETH_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_RD_RDMA_WRITE_ONLY_WITH_IMMEDIATE", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_RETH_MASK + | RXE_IMMDT_MASK | RXE_PAYLOAD_MASK + | RXE_REQ_MASK | RXE_WRITE_MASK + | RXE_COMP_MASK | RXE_RWR_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_RETH_BYTES + + RXE_DETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_RETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + [RXE_IMMDT] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES + + RXE_RETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES + + RXE_RETH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_READ_REQUEST] = { + .name = "IB_OPCODE_RD_RDMA_READ_REQUEST", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_RETH_MASK + | RXE_REQ_MASK | RXE_READ_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_RETH_BYTES + RXE_DETH_BYTES + + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_RETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RETH_BYTES + + RXE_DETH_BYTES + + RXE_RDETH_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_READ_RESPONSE_FIRST] = { + .name = "IB_OPCODE_RD_RDMA_READ_RESPONSE_FIRST", + .mask = RXE_RDETH_MASK | RXE_AETH_MASK + | RXE_PAYLOAD_MASK | RXE_ACK_MASK + | RXE_START_MASK, + .length = RXE_BTH_BYTES + RXE_AETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_AETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_AETH_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_READ_RESPONSE_MIDDLE] = { + .name = "IB_OPCODE_RD_RDMA_READ_RESPONSE_MIDDLE", + .mask = RXE_RDETH_MASK | RXE_PAYLOAD_MASK | RXE_ACK_MASK + | RXE_MIDDLE_MASK, + .length = RXE_BTH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_READ_RESPONSE_LAST] = { + .name = "IB_OPCODE_RD_RDMA_READ_RESPONSE_LAST", + .mask = RXE_RDETH_MASK | RXE_AETH_MASK | RXE_PAYLOAD_MASK + | RXE_ACK_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_AETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_AETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_AETH_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_READ_RESPONSE_ONLY] = { + .name = "IB_OPCODE_RD_RDMA_READ_RESPONSE_ONLY", + .mask = RXE_RDETH_MASK | RXE_AETH_MASK | RXE_PAYLOAD_MASK + | RXE_ACK_MASK | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_AETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_AETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_AETH_BYTES, + } + }, + [IB_OPCODE_RD_ACKNOWLEDGE] = { + .name = "IB_OPCODE_RD_ACKNOWLEDGE", + .mask = RXE_RDETH_MASK | RXE_AETH_MASK | RXE_ACK_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_AETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_AETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + } + }, + [IB_OPCODE_RD_ATOMIC_ACKNOWLEDGE] = { + .name = "IB_OPCODE_RD_ATOMIC_ACKNOWLEDGE", + .mask = RXE_RDETH_MASK | RXE_AETH_MASK | RXE_ATMACK_MASK + | RXE_ACK_MASK | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_ATMACK_BYTES + RXE_AETH_BYTES + + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_AETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_ATMACK] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_AETH_BYTES, + } + }, + [IB_OPCODE_RD_COMPARE_SWAP] = { + .name = "RD_COMPARE_SWAP", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_ATMETH_MASK + | RXE_REQ_MASK | RXE_ATOMIC_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_ATMETH_BYTES + RXE_DETH_BYTES + + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_ATMETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + + RXE_ATMETH_BYTES + + RXE_DETH_BYTES + + + RXE_RDETH_BYTES, + } + }, + [IB_OPCODE_RD_FETCH_ADD] = { + .name = "IB_OPCODE_RD_FETCH_ADD", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_ATMETH_MASK + | RXE_REQ_MASK | RXE_ATOMIC_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_ATMETH_BYTES + RXE_DETH_BYTES + + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_ATMETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + + RXE_ATMETH_BYTES + + RXE_DETH_BYTES + + + RXE_RDETH_BYTES, + } + }, + + /* UD */ + [IB_OPCODE_UD_SEND_ONLY] = { + .name = "IB_OPCODE_UD_SEND_ONLY", + .mask = RXE_DETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_COMP_MASK | RXE_RWR_MASK | RXE_SEND_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_DETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_DETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_DETH_BYTES, + } + }, + [IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE", + .mask = RXE_DETH_MASK | RXE_IMMDT_MASK | RXE_PAYLOAD_MASK + | RXE_REQ_MASK | RXE_COMP_MASK | RXE_RWR_MASK + | RXE_SEND_MASK | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_DETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_DETH] = RXE_BTH_BYTES, + [RXE_IMMDT] = RXE_BTH_BYTES + + RXE_DETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_DETH_BYTES + + RXE_IMMDT_BYTES, + } + }, + +}; diff --git a/drivers/infiniband/sw/rxe/rxe_opcode.h b/drivers/infiniband/sw/rxe/rxe_opcode.h new file mode 100644 index 000000000000..307604e9c78d --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_opcode.h @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef RXE_OPCODE_H +#define RXE_OPCODE_H + +/* + * contains header bit mask definitions and header lengths + * declaration of the rxe_opcode_info struct and + * rxe_wr_opcode_info struct + */ + +enum rxe_wr_mask { + WR_INLINE_MASK = BIT(0), + WR_ATOMIC_MASK = BIT(1), + WR_SEND_MASK = BIT(2), + WR_READ_MASK = BIT(3), + WR_WRITE_MASK = BIT(4), + WR_LOCAL_MASK = BIT(5), + WR_REG_MASK = BIT(6), + + WR_READ_OR_WRITE_MASK = WR_READ_MASK | WR_WRITE_MASK, + WR_READ_WRITE_OR_SEND_MASK = WR_READ_OR_WRITE_MASK | WR_SEND_MASK, + WR_WRITE_OR_SEND_MASK = WR_WRITE_MASK | WR_SEND_MASK, + WR_ATOMIC_OR_READ_MASK = WR_ATOMIC_MASK | WR_READ_MASK, +}; + +#define WR_MAX_QPT (8) + +struct rxe_wr_opcode_info { + char *name; + enum rxe_wr_mask mask[WR_MAX_QPT]; +}; + +extern struct rxe_wr_opcode_info rxe_wr_opcode_info[]; + +enum rxe_hdr_type { + RXE_LRH, + RXE_GRH, + RXE_BTH, + RXE_RETH, + RXE_AETH, + RXE_ATMETH, + RXE_ATMACK, + RXE_IETH, + RXE_RDETH, + RXE_DETH, + RXE_IMMDT, + RXE_PAYLOAD, + NUM_HDR_TYPES +}; + +enum rxe_hdr_mask { + RXE_LRH_MASK = BIT(RXE_LRH), + RXE_GRH_MASK = BIT(RXE_GRH), + RXE_BTH_MASK = BIT(RXE_BTH), + RXE_IMMDT_MASK = BIT(RXE_IMMDT), + RXE_RETH_MASK = BIT(RXE_RETH), + RXE_AETH_MASK = BIT(RXE_AETH), + RXE_ATMETH_MASK = BIT(RXE_ATMETH), + RXE_ATMACK_MASK = BIT(RXE_ATMACK), + RXE_IETH_MASK = BIT(RXE_IETH), + RXE_RDETH_MASK = BIT(RXE_RDETH), + RXE_DETH_MASK = BIT(RXE_DETH), + RXE_PAYLOAD_MASK = BIT(RXE_PAYLOAD), + + RXE_REQ_MASK = BIT(NUM_HDR_TYPES + 0), + RXE_ACK_MASK = BIT(NUM_HDR_TYPES + 1), + RXE_SEND_MASK = BIT(NUM_HDR_TYPES + 2), + RXE_WRITE_MASK = BIT(NUM_HDR_TYPES + 3), + RXE_READ_MASK = BIT(NUM_HDR_TYPES + 4), + RXE_ATOMIC_MASK = BIT(NUM_HDR_TYPES + 5), + + RXE_RWR_MASK = BIT(NUM_HDR_TYPES + 6), + RXE_COMP_MASK = BIT(NUM_HDR_TYPES + 7), + + RXE_START_MASK = BIT(NUM_HDR_TYPES + 8), + RXE_MIDDLE_MASK = BIT(NUM_HDR_TYPES + 9), + RXE_END_MASK = BIT(NUM_HDR_TYPES + 10), + + RXE_LOOPBACK_MASK = BIT(NUM_HDR_TYPES + 12), + + RXE_READ_OR_ATOMIC = (RXE_READ_MASK | RXE_ATOMIC_MASK), + RXE_WRITE_OR_SEND = (RXE_WRITE_MASK | RXE_SEND_MASK), +}; + +#define OPCODE_NONE (-1) +#define RXE_NUM_OPCODE 256 + +struct rxe_opcode_info { + char *name; + enum rxe_hdr_mask mask; + int length; + int offset[NUM_HDR_TYPES]; +}; + +extern struct rxe_opcode_info rxe_opcode[RXE_NUM_OPCODE]; + +#endif /* RXE_OPCODE_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_param.h b/drivers/infiniband/sw/rxe/rxe_param.h new file mode 100644 index 000000000000..f459c43a77c8 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_param.h @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef RXE_PARAM_H +#define RXE_PARAM_H + +static inline enum ib_mtu rxe_mtu_int_to_enum(int mtu) +{ + if (mtu < 256) + return 0; + else if (mtu < 512) + return IB_MTU_256; + else if (mtu < 1024) + return IB_MTU_512; + else if (mtu < 2048) + return IB_MTU_1024; + else if (mtu < 4096) + return IB_MTU_2048; + else + return IB_MTU_4096; +} + +/* Find the IB mtu for a given network MTU. */ +static inline enum ib_mtu eth_mtu_int_to_enum(int mtu) +{ + mtu -= RXE_MAX_HDR_LENGTH; + + return rxe_mtu_int_to_enum(mtu); +} + +/* default/initial rxe device parameter settings */ +enum rxe_device_param { + RXE_FW_VER = 0, + RXE_MAX_MR_SIZE = -1ull, + RXE_PAGE_SIZE_CAP = 0xfffff000, + RXE_VENDOR_ID = 0, + RXE_VENDOR_PART_ID = 0, + RXE_HW_VER = 0, + RXE_MAX_QP = 0x10000, + RXE_MAX_QP_WR = 0x4000, + RXE_MAX_INLINE_DATA = 400, + RXE_DEVICE_CAP_FLAGS = IB_DEVICE_BAD_PKEY_CNTR + | IB_DEVICE_BAD_QKEY_CNTR + | IB_DEVICE_AUTO_PATH_MIG + | IB_DEVICE_CHANGE_PHY_PORT + | IB_DEVICE_UD_AV_PORT_ENFORCE + | IB_DEVICE_PORT_ACTIVE_EVENT + | IB_DEVICE_SYS_IMAGE_GUID + | IB_DEVICE_RC_RNR_NAK_GEN + | IB_DEVICE_SRQ_RESIZE + | IB_DEVICE_MEM_MGT_EXTENSIONS, + RXE_MAX_SGE = 32, + RXE_MAX_SGE_RD = 32, + RXE_MAX_CQ = 16384, + RXE_MAX_LOG_CQE = 13, + RXE_MAX_MR = 2 * 1024, + RXE_MAX_PD = 0x7ffc, + RXE_MAX_QP_RD_ATOM = 128, + RXE_MAX_EE_RD_ATOM = 0, + RXE_MAX_RES_RD_ATOM = 0x3f000, + RXE_MAX_QP_INIT_RD_ATOM = 128, + RXE_MAX_EE_INIT_RD_ATOM = 0, + RXE_ATOMIC_CAP = 1, + RXE_MAX_EE = 0, + RXE_MAX_RDD = 0, + RXE_MAX_MW = 0, + RXE_MAX_RAW_IPV6_QP = 0, + RXE_MAX_RAW_ETHY_QP = 0, + RXE_MAX_MCAST_GRP = 8192, + RXE_MAX_MCAST_QP_ATTACH = 56, + RXE_MAX_TOT_MCAST_QP_ATTACH = 0x70000, + RXE_MAX_AH = 100, + RXE_MAX_FMR = 0, + RXE_MAX_MAP_PER_FMR = 0, + RXE_MAX_SRQ = 960, + RXE_MAX_SRQ_WR = 0x4000, + RXE_MIN_SRQ_WR = 1, + RXE_MAX_SRQ_SGE = 27, + RXE_MIN_SRQ_SGE = 1, + RXE_MAX_FMR_PAGE_LIST_LEN = 512, + RXE_MAX_PKEYS = 64, + RXE_LOCAL_CA_ACK_DELAY = 15, + + RXE_MAX_UCONTEXT = 512, + + RXE_NUM_PORT = 1, + RXE_NUM_COMP_VECTORS = 1, + + RXE_MIN_QP_INDEX = 16, + RXE_MAX_QP_INDEX = 0x00020000, + + RXE_MIN_SRQ_INDEX = 0x00020001, + RXE_MAX_SRQ_INDEX = 0x00040000, + + RXE_MIN_MR_INDEX = 0x00000001, + RXE_MAX_MR_INDEX = 0x00040000, + RXE_MIN_MW_INDEX = 0x00040001, + RXE_MAX_MW_INDEX = 0x00060000, + RXE_MAX_PKT_PER_ACK = 64, + + RXE_MAX_UNACKED_PSNS = 128, + + /* Max inflight SKBs per queue pair */ + RXE_INFLIGHT_SKBS_PER_QP_HIGH = 64, + RXE_INFLIGHT_SKBS_PER_QP_LOW = 16, + + /* Delay before calling arbiter timer */ + RXE_NSEC_ARB_TIMER_DELAY = 200, +}; + +/* default/initial rxe port parameters */ +enum rxe_port_param { + RXE_PORT_STATE = IB_PORT_DOWN, + RXE_PORT_MAX_MTU = IB_MTU_4096, + RXE_PORT_ACTIVE_MTU = IB_MTU_256, + RXE_PORT_GID_TBL_LEN = 1024, + RXE_PORT_PORT_CAP_FLAGS = RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP, + RXE_PORT_MAX_MSG_SZ = 0x800000, + RXE_PORT_BAD_PKEY_CNTR = 0, + RXE_PORT_QKEY_VIOL_CNTR = 0, + RXE_PORT_LID = 0, + RXE_PORT_SM_LID = 0, + RXE_PORT_SM_SL = 0, + RXE_PORT_LMC = 0, + RXE_PORT_MAX_VL_NUM = 1, + RXE_PORT_SUBNET_TIMEOUT = 0, + RXE_PORT_INIT_TYPE_REPLY = 0, + RXE_PORT_ACTIVE_WIDTH = IB_WIDTH_1X, + RXE_PORT_ACTIVE_SPEED = 1, + RXE_PORT_PKEY_TBL_LEN = 64, + RXE_PORT_PHYS_STATE = 2, + RXE_PORT_SUBNET_PREFIX = 0xfe80000000000000ULL, +}; + +/* default/initial port info parameters */ +enum rxe_port_info_param { + RXE_PORT_INFO_VL_CAP = 4, /* 1-8 */ + RXE_PORT_INFO_MTU_CAP = 5, /* 4096 */ + RXE_PORT_INFO_OPER_VL = 1, /* 1 */ +}; + +#endif /* RXE_PARAM_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c new file mode 100644 index 000000000000..6bac0717c540 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_pool.c @@ -0,0 +1,502 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "rxe.h" +#include "rxe_loc.h" + +/* info about object pools + * note that mr and mw share a single index space + * so that one can map an lkey to the correct type of object + */ +struct rxe_type_info rxe_type_info[RXE_NUM_TYPES] = { + [RXE_TYPE_UC] = { + .name = "rxe-uc", + .size = sizeof(struct rxe_ucontext), + }, + [RXE_TYPE_PD] = { + .name = "rxe-pd", + .size = sizeof(struct rxe_pd), + }, + [RXE_TYPE_AH] = { + .name = "rxe-ah", + .size = sizeof(struct rxe_ah), + .flags = RXE_POOL_ATOMIC, + }, + [RXE_TYPE_SRQ] = { + .name = "rxe-srq", + .size = sizeof(struct rxe_srq), + .flags = RXE_POOL_INDEX, + .min_index = RXE_MIN_SRQ_INDEX, + .max_index = RXE_MAX_SRQ_INDEX, + }, + [RXE_TYPE_QP] = { + .name = "rxe-qp", + .size = sizeof(struct rxe_qp), + .cleanup = rxe_qp_cleanup, + .flags = RXE_POOL_INDEX, + .min_index = RXE_MIN_QP_INDEX, + .max_index = RXE_MAX_QP_INDEX, + }, + [RXE_TYPE_CQ] = { + .name = "rxe-cq", + .size = sizeof(struct rxe_cq), + .cleanup = rxe_cq_cleanup, + }, + [RXE_TYPE_MR] = { + .name = "rxe-mr", + .size = sizeof(struct rxe_mem), + .cleanup = rxe_mem_cleanup, + .flags = RXE_POOL_INDEX, + .max_index = RXE_MAX_MR_INDEX, + .min_index = RXE_MIN_MR_INDEX, + }, + [RXE_TYPE_MW] = { + .name = "rxe-mw", + .size = sizeof(struct rxe_mem), + .flags = RXE_POOL_INDEX, + .max_index = RXE_MAX_MW_INDEX, + .min_index = RXE_MIN_MW_INDEX, + }, + [RXE_TYPE_MC_GRP] = { + .name = "rxe-mc_grp", + .size = sizeof(struct rxe_mc_grp), + .cleanup = rxe_mc_cleanup, + .flags = RXE_POOL_KEY, + .key_offset = offsetof(struct rxe_mc_grp, mgid), + .key_size = sizeof(union ib_gid), + }, + [RXE_TYPE_MC_ELEM] = { + .name = "rxe-mc_elem", + .size = sizeof(struct rxe_mc_elem), + .flags = RXE_POOL_ATOMIC, + }, +}; + +static inline char *pool_name(struct rxe_pool *pool) +{ + return rxe_type_info[pool->type].name; +} + +static inline struct kmem_cache *pool_cache(struct rxe_pool *pool) +{ + return rxe_type_info[pool->type].cache; +} + +static inline enum rxe_elem_type rxe_type(void *arg) +{ + struct rxe_pool_entry *elem = arg; + + return elem->pool->type; +} + +int rxe_cache_init(void) +{ + int err; + int i; + size_t size; + struct rxe_type_info *type; + + for (i = 0; i < RXE_NUM_TYPES; i++) { + type = &rxe_type_info[i]; + size = ALIGN(type->size, RXE_POOL_ALIGN); + type->cache = kmem_cache_create(type->name, size, + RXE_POOL_ALIGN, + RXE_POOL_CACHE_FLAGS, NULL); + if (!type->cache) { + pr_err("Unable to init kmem cache for %s\n", + type->name); + err = -ENOMEM; + goto err1; + } + } + + return 0; + +err1: + while (--i >= 0) { + kmem_cache_destroy(type->cache); + type->cache = NULL; + } + + return err; +} + +void rxe_cache_exit(void) +{ + int i; + struct rxe_type_info *type; + + for (i = 0; i < RXE_NUM_TYPES; i++) { + type = &rxe_type_info[i]; + kmem_cache_destroy(type->cache); + type->cache = NULL; + } +} + +static int rxe_pool_init_index(struct rxe_pool *pool, u32 max, u32 min) +{ + int err = 0; + size_t size; + + if ((max - min + 1) < pool->max_elem) { + pr_warn("not enough indices for max_elem\n"); + err = -EINVAL; + goto out; + } + + pool->max_index = max; + pool->min_index = min; + + size = BITS_TO_LONGS(max - min + 1) * sizeof(long); + pool->table = kmalloc(size, GFP_KERNEL); + if (!pool->table) { + pr_warn("no memory for bit table\n"); + err = -ENOMEM; + goto out; + } + + pool->table_size = size; + bitmap_zero(pool->table, max - min + 1); + +out: + return err; +} + +int rxe_pool_init( + struct rxe_dev *rxe, + struct rxe_pool *pool, + enum rxe_elem_type type, + unsigned max_elem) +{ + int err = 0; + size_t size = rxe_type_info[type].size; + + memset(pool, 0, sizeof(*pool)); + + pool->rxe = rxe; + pool->type = type; + pool->max_elem = max_elem; + pool->elem_size = ALIGN(size, RXE_POOL_ALIGN); + pool->flags = rxe_type_info[type].flags; + pool->tree = RB_ROOT; + pool->cleanup = rxe_type_info[type].cleanup; + + atomic_set(&pool->num_elem, 0); + + kref_init(&pool->ref_cnt); + + spin_lock_init(&pool->pool_lock); + + if (rxe_type_info[type].flags & RXE_POOL_INDEX) { + err = rxe_pool_init_index(pool, + rxe_type_info[type].max_index, + rxe_type_info[type].min_index); + if (err) + goto out; + } + + if (rxe_type_info[type].flags & RXE_POOL_KEY) { + pool->key_offset = rxe_type_info[type].key_offset; + pool->key_size = rxe_type_info[type].key_size; + } + + pool->state = rxe_pool_valid; + +out: + return err; +} + +static void rxe_pool_release(struct kref *kref) +{ + struct rxe_pool *pool = container_of(kref, struct rxe_pool, ref_cnt); + + pool->state = rxe_pool_invalid; + kfree(pool->table); +} + +static void rxe_pool_put(struct rxe_pool *pool) +{ + kref_put(&pool->ref_cnt, rxe_pool_release); +} + +int rxe_pool_cleanup(struct rxe_pool *pool) +{ + unsigned long flags; + + spin_lock_irqsave(&pool->pool_lock, flags); + pool->state = rxe_pool_invalid; + if (atomic_read(&pool->num_elem) > 0) + pr_warn("%s pool destroyed with unfree'd elem\n", + pool_name(pool)); + spin_unlock_irqrestore(&pool->pool_lock, flags); + + rxe_pool_put(pool); + + return 0; +} + +static u32 alloc_index(struct rxe_pool *pool) +{ + u32 index; + u32 range = pool->max_index - pool->min_index + 1; + + index = find_next_zero_bit(pool->table, range, pool->last); + if (index >= range) + index = find_first_zero_bit(pool->table, range); + + set_bit(index, pool->table); + pool->last = index; + return index + pool->min_index; +} + +static void insert_index(struct rxe_pool *pool, struct rxe_pool_entry *new) +{ + struct rb_node **link = &pool->tree.rb_node; + struct rb_node *parent = NULL; + struct rxe_pool_entry *elem; + + while (*link) { + parent = *link; + elem = rb_entry(parent, struct rxe_pool_entry, node); + + if (elem->index == new->index) { + pr_warn("element already exists!\n"); + goto out; + } + + if (elem->index > new->index) + link = &(*link)->rb_left; + else + link = &(*link)->rb_right; + } + + rb_link_node(&new->node, parent, link); + rb_insert_color(&new->node, &pool->tree); +out: + return; +} + +static void insert_key(struct rxe_pool *pool, struct rxe_pool_entry *new) +{ + struct rb_node **link = &pool->tree.rb_node; + struct rb_node *parent = NULL; + struct rxe_pool_entry *elem; + int cmp; + + while (*link) { + parent = *link; + elem = rb_entry(parent, struct rxe_pool_entry, node); + + cmp = memcmp((u8 *)elem + pool->key_offset, + (u8 *)new + pool->key_offset, pool->key_size); + + if (cmp == 0) { + pr_warn("key already exists!\n"); + goto out; + } + + if (cmp > 0) + link = &(*link)->rb_left; + else + link = &(*link)->rb_right; + } + + rb_link_node(&new->node, parent, link); + rb_insert_color(&new->node, &pool->tree); +out: + return; +} + +void rxe_add_key(void *arg, void *key) +{ + struct rxe_pool_entry *elem = arg; + struct rxe_pool *pool = elem->pool; + unsigned long flags; + + spin_lock_irqsave(&pool->pool_lock, flags); + memcpy((u8 *)elem + pool->key_offset, key, pool->key_size); + insert_key(pool, elem); + spin_unlock_irqrestore(&pool->pool_lock, flags); +} + +void rxe_drop_key(void *arg) +{ + struct rxe_pool_entry *elem = arg; + struct rxe_pool *pool = elem->pool; + unsigned long flags; + + spin_lock_irqsave(&pool->pool_lock, flags); + rb_erase(&elem->node, &pool->tree); + spin_unlock_irqrestore(&pool->pool_lock, flags); +} + +void rxe_add_index(void *arg) +{ + struct rxe_pool_entry *elem = arg; + struct rxe_pool *pool = elem->pool; + unsigned long flags; + + spin_lock_irqsave(&pool->pool_lock, flags); + elem->index = alloc_index(pool); + insert_index(pool, elem); + spin_unlock_irqrestore(&pool->pool_lock, flags); +} + +void rxe_drop_index(void *arg) +{ + struct rxe_pool_entry *elem = arg; + struct rxe_pool *pool = elem->pool; + unsigned long flags; + + spin_lock_irqsave(&pool->pool_lock, flags); + clear_bit(elem->index - pool->min_index, pool->table); + rb_erase(&elem->node, &pool->tree); + spin_unlock_irqrestore(&pool->pool_lock, flags); +} + +void *rxe_alloc(struct rxe_pool *pool) +{ + struct rxe_pool_entry *elem; + unsigned long flags; + + might_sleep_if(!(pool->flags & RXE_POOL_ATOMIC)); + + spin_lock_irqsave(&pool->pool_lock, flags); + if (pool->state != rxe_pool_valid) { + spin_unlock_irqrestore(&pool->pool_lock, flags); + return NULL; + } + kref_get(&pool->ref_cnt); + spin_unlock_irqrestore(&pool->pool_lock, flags); + + kref_get(&pool->rxe->ref_cnt); + + if (atomic_inc_return(&pool->num_elem) > pool->max_elem) { + atomic_dec(&pool->num_elem); + rxe_dev_put(pool->rxe); + rxe_pool_put(pool); + return NULL; + } + + elem = kmem_cache_zalloc(pool_cache(pool), + (pool->flags & RXE_POOL_ATOMIC) ? + GFP_ATOMIC : GFP_KERNEL); + + elem->pool = pool; + kref_init(&elem->ref_cnt); + + return elem; +} + +void rxe_elem_release(struct kref *kref) +{ + struct rxe_pool_entry *elem = + container_of(kref, struct rxe_pool_entry, ref_cnt); + struct rxe_pool *pool = elem->pool; + + if (pool->cleanup) + pool->cleanup(elem); + + kmem_cache_free(pool_cache(pool), elem); + atomic_dec(&pool->num_elem); + rxe_dev_put(pool->rxe); + rxe_pool_put(pool); +} + +void *rxe_pool_get_index(struct rxe_pool *pool, u32 index) +{ + struct rb_node *node = NULL; + struct rxe_pool_entry *elem = NULL; + unsigned long flags; + + spin_lock_irqsave(&pool->pool_lock, flags); + + if (pool->state != rxe_pool_valid) + goto out; + + node = pool->tree.rb_node; + + while (node) { + elem = rb_entry(node, struct rxe_pool_entry, node); + + if (elem->index > index) + node = node->rb_left; + else if (elem->index < index) + node = node->rb_right; + else + break; + } + + if (node) + kref_get(&elem->ref_cnt); + +out: + spin_unlock_irqrestore(&pool->pool_lock, flags); + return node ? (void *)elem : NULL; +} + +void *rxe_pool_get_key(struct rxe_pool *pool, void *key) +{ + struct rb_node *node = NULL; + struct rxe_pool_entry *elem = NULL; + int cmp; + unsigned long flags; + + spin_lock_irqsave(&pool->pool_lock, flags); + + if (pool->state != rxe_pool_valid) + goto out; + + node = pool->tree.rb_node; + + while (node) { + elem = rb_entry(node, struct rxe_pool_entry, node); + + cmp = memcmp((u8 *)elem + pool->key_offset, + key, pool->key_size); + + if (cmp > 0) + node = node->rb_left; + else if (cmp < 0) + node = node->rb_right; + else + break; + } + + if (node) + kref_get(&elem->ref_cnt); + +out: + spin_unlock_irqrestore(&pool->pool_lock, flags); + return node ? ((void *)elem) : NULL; +} diff --git a/drivers/infiniband/sw/rxe/rxe_pool.h b/drivers/infiniband/sw/rxe/rxe_pool.h new file mode 100644 index 000000000000..4d04830adcae --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_pool.h @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef RXE_POOL_H +#define RXE_POOL_H + +#define RXE_POOL_ALIGN (16) +#define RXE_POOL_CACHE_FLAGS (0) + +enum rxe_pool_flags { + RXE_POOL_ATOMIC = BIT(0), + RXE_POOL_INDEX = BIT(1), + RXE_POOL_KEY = BIT(2), +}; + +enum rxe_elem_type { + RXE_TYPE_UC, + RXE_TYPE_PD, + RXE_TYPE_AH, + RXE_TYPE_SRQ, + RXE_TYPE_QP, + RXE_TYPE_CQ, + RXE_TYPE_MR, + RXE_TYPE_MW, + RXE_TYPE_MC_GRP, + RXE_TYPE_MC_ELEM, + RXE_NUM_TYPES, /* keep me last */ +}; + +struct rxe_type_info { + char *name; + size_t size; + void (*cleanup)(void *obj); + enum rxe_pool_flags flags; + u32 max_index; + u32 min_index; + size_t key_offset; + size_t key_size; + struct kmem_cache *cache; +}; + +extern struct rxe_type_info rxe_type_info[]; + +enum rxe_pool_state { + rxe_pool_invalid, + rxe_pool_valid, +}; + +struct rxe_pool_entry { + struct rxe_pool *pool; + struct kref ref_cnt; + struct list_head list; + + /* only used if indexed or keyed */ + struct rb_node node; + u32 index; +}; + +struct rxe_pool { + struct rxe_dev *rxe; + spinlock_t pool_lock; /* pool spinlock */ + size_t elem_size; + struct kref ref_cnt; + void (*cleanup)(void *obj); + enum rxe_pool_state state; + enum rxe_pool_flags flags; + enum rxe_elem_type type; + + unsigned int max_elem; + atomic_t num_elem; + + /* only used if indexed or keyed */ + struct rb_root tree; + unsigned long *table; + size_t table_size; + u32 max_index; + u32 min_index; + u32 last; + size_t key_offset; + size_t key_size; +}; + +/* initialize slab caches for managed objects */ +int rxe_cache_init(void); + +/* cleanup slab caches for managed objects */ +void rxe_cache_exit(void); + +/* initialize a pool of objects with given limit on + * number of elements. gets parameters from rxe_type_info + * pool elements will be allocated out of a slab cache + */ +int rxe_pool_init(struct rxe_dev *rxe, struct rxe_pool *pool, + enum rxe_elem_type type, u32 max_elem); + +/* free resources from object pool */ +int rxe_pool_cleanup(struct rxe_pool *pool); + +/* allocate an object from pool */ +void *rxe_alloc(struct rxe_pool *pool); + +/* assign an index to an indexed object and insert object into + * pool's rb tree + */ +void rxe_add_index(void *elem); + +/* drop an index and remove object from rb tree */ +void rxe_drop_index(void *elem); + +/* assign a key to a keyed object and insert object into + * pool's rb tree + */ +void rxe_add_key(void *elem, void *key); + +/* remove elem from rb tree */ +void rxe_drop_key(void *elem); + +/* lookup an indexed object from index. takes a reference on object */ +void *rxe_pool_get_index(struct rxe_pool *pool, u32 index); + +/* lookup keyed object from key. takes a reference on the object */ +void *rxe_pool_get_key(struct rxe_pool *pool, void *key); + +/* cleanup an object when all references are dropped */ +void rxe_elem_release(struct kref *kref); + +/* take a reference on an object */ +#define rxe_add_ref(elem) kref_get(&(elem)->pelem.ref_cnt) + +/* drop a reference on an object */ +#define rxe_drop_ref(elem) kref_put(&(elem)->pelem.ref_cnt, rxe_elem_release) + +#endif /* RXE_POOL_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c new file mode 100644 index 000000000000..22ba24f2a2c1 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -0,0 +1,851 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include "rxe.h" +#include "rxe_loc.h" +#include "rxe_queue.h" +#include "rxe_task.h" + +char *rxe_qp_state_name[] = { + [QP_STATE_RESET] = "RESET", + [QP_STATE_INIT] = "INIT", + [QP_STATE_READY] = "READY", + [QP_STATE_DRAIN] = "DRAIN", + [QP_STATE_DRAINED] = "DRAINED", + [QP_STATE_ERROR] = "ERROR", +}; + +static int rxe_qp_chk_cap(struct rxe_dev *rxe, struct ib_qp_cap *cap, + int has_srq) +{ + if (cap->max_send_wr > rxe->attr.max_qp_wr) { + pr_warn("invalid send wr = %d > %d\n", + cap->max_send_wr, rxe->attr.max_qp_wr); + goto err1; + } + + if (cap->max_send_sge > rxe->attr.max_sge) { + pr_warn("invalid send sge = %d > %d\n", + cap->max_send_sge, rxe->attr.max_sge); + goto err1; + } + + if (!has_srq) { + if (cap->max_recv_wr > rxe->attr.max_qp_wr) { + pr_warn("invalid recv wr = %d > %d\n", + cap->max_recv_wr, rxe->attr.max_qp_wr); + goto err1; + } + + if (cap->max_recv_sge > rxe->attr.max_sge) { + pr_warn("invalid recv sge = %d > %d\n", + cap->max_recv_sge, rxe->attr.max_sge); + goto err1; + } + } + + if (cap->max_inline_data > rxe->max_inline_data) { + pr_warn("invalid max inline data = %d > %d\n", + cap->max_inline_data, rxe->max_inline_data); + goto err1; + } + + return 0; + +err1: + return -EINVAL; +} + +int rxe_qp_chk_init(struct rxe_dev *rxe, struct ib_qp_init_attr *init) +{ + struct ib_qp_cap *cap = &init->cap; + struct rxe_port *port; + int port_num = init->port_num; + + if (!init->recv_cq || !init->send_cq) { + pr_warn("missing cq\n"); + goto err1; + } + + if (rxe_qp_chk_cap(rxe, cap, !!init->srq)) + goto err1; + + if (init->qp_type == IB_QPT_SMI || init->qp_type == IB_QPT_GSI) { + if (port_num != 1) { + pr_warn("invalid port = %d\n", port_num); + goto err1; + } + + port = &rxe->port; + + if (init->qp_type == IB_QPT_SMI && port->qp_smi_index) { + pr_warn("SMI QP exists for port %d\n", port_num); + goto err1; + } + + if (init->qp_type == IB_QPT_GSI && port->qp_gsi_index) { + pr_warn("GSI QP exists for port %d\n", port_num); + goto err1; + } + } + + return 0; + +err1: + return -EINVAL; +} + +static int alloc_rd_atomic_resources(struct rxe_qp *qp, unsigned int n) +{ + qp->resp.res_head = 0; + qp->resp.res_tail = 0; + qp->resp.resources = kcalloc(n, sizeof(struct resp_res), GFP_KERNEL); + + if (!qp->resp.resources) + return -ENOMEM; + + return 0; +} + +static void free_rd_atomic_resources(struct rxe_qp *qp) +{ + if (qp->resp.resources) { + int i; + + for (i = 0; i < qp->attr.max_rd_atomic; i++) { + struct resp_res *res = &qp->resp.resources[i]; + + free_rd_atomic_resource(qp, res); + } + kfree(qp->resp.resources); + qp->resp.resources = NULL; + } +} + +void free_rd_atomic_resource(struct rxe_qp *qp, struct resp_res *res) +{ + if (res->type == RXE_ATOMIC_MASK) { + rxe_drop_ref(qp); + kfree_skb(res->atomic.skb); + } else if (res->type == RXE_READ_MASK) { + if (res->read.mr) + rxe_drop_ref(res->read.mr); + } + res->type = 0; +} + +static void cleanup_rd_atomic_resources(struct rxe_qp *qp) +{ + int i; + struct resp_res *res; + + if (qp->resp.resources) { + for (i = 0; i < qp->attr.max_rd_atomic; i++) { + res = &qp->resp.resources[i]; + free_rd_atomic_resource(qp, res); + } + } +} + +static void rxe_qp_init_misc(struct rxe_dev *rxe, struct rxe_qp *qp, + struct ib_qp_init_attr *init) +{ + struct rxe_port *port; + u32 qpn; + + qp->sq_sig_type = init->sq_sig_type; + qp->attr.path_mtu = 1; + qp->mtu = ib_mtu_enum_to_int(qp->attr.path_mtu); + + qpn = qp->pelem.index; + port = &rxe->port; + + switch (init->qp_type) { + case IB_QPT_SMI: + qp->ibqp.qp_num = 0; + port->qp_smi_index = qpn; + qp->attr.port_num = init->port_num; + break; + + case IB_QPT_GSI: + qp->ibqp.qp_num = 1; + port->qp_gsi_index = qpn; + qp->attr.port_num = init->port_num; + break; + + default: + qp->ibqp.qp_num = qpn; + break; + } + + INIT_LIST_HEAD(&qp->grp_list); + + skb_queue_head_init(&qp->send_pkts); + + spin_lock_init(&qp->grp_lock); + spin_lock_init(&qp->state_lock); + + atomic_set(&qp->ssn, 0); + atomic_set(&qp->skb_out, 0); +} + +static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp, + struct ib_qp_init_attr *init, + struct ib_ucontext *context, struct ib_udata *udata) +{ + int err; + int wqe_size; + + err = sock_create_kern(&init_net, AF_INET, SOCK_DGRAM, 0, &qp->sk); + if (err < 0) + return err; + qp->sk->sk->sk_user_data = qp; + + qp->sq.max_wr = init->cap.max_send_wr; + qp->sq.max_sge = init->cap.max_send_sge; + qp->sq.max_inline = init->cap.max_inline_data; + + wqe_size = max_t(int, sizeof(struct rxe_send_wqe) + + qp->sq.max_sge * sizeof(struct ib_sge), + sizeof(struct rxe_send_wqe) + + qp->sq.max_inline); + + qp->sq.queue = rxe_queue_init(rxe, + &qp->sq.max_wr, + wqe_size); + if (!qp->sq.queue) + return -ENOMEM; + + err = do_mmap_info(rxe, udata, true, + context, qp->sq.queue->buf, + qp->sq.queue->buf_size, &qp->sq.queue->ip); + + if (err) { + kvfree(qp->sq.queue->buf); + kfree(qp->sq.queue); + return err; + } + + qp->req.wqe_index = producer_index(qp->sq.queue); + qp->req.state = QP_STATE_RESET; + qp->req.opcode = -1; + qp->comp.opcode = -1; + + spin_lock_init(&qp->sq.sq_lock); + skb_queue_head_init(&qp->req_pkts); + + rxe_init_task(rxe, &qp->req.task, qp, + rxe_requester, "req"); + rxe_init_task(rxe, &qp->comp.task, qp, + rxe_completer, "comp"); + + init_timer(&qp->rnr_nak_timer); + qp->rnr_nak_timer.function = rnr_nak_timer; + qp->rnr_nak_timer.data = (unsigned long)qp; + + init_timer(&qp->retrans_timer); + qp->retrans_timer.function = retransmit_timer; + qp->retrans_timer.data = (unsigned long)qp; + qp->qp_timeout_jiffies = 0; /* Can't be set for UD/UC in modify_qp */ + + return 0; +} + +static int rxe_qp_init_resp(struct rxe_dev *rxe, struct rxe_qp *qp, + struct ib_qp_init_attr *init, + struct ib_ucontext *context, struct ib_udata *udata) +{ + int err; + int wqe_size; + + if (!qp->srq) { + qp->rq.max_wr = init->cap.max_recv_wr; + qp->rq.max_sge = init->cap.max_recv_sge; + + wqe_size = rcv_wqe_size(qp->rq.max_sge); + + pr_debug("max_wr = %d, max_sge = %d, wqe_size = %d\n", + qp->rq.max_wr, qp->rq.max_sge, wqe_size); + + qp->rq.queue = rxe_queue_init(rxe, + &qp->rq.max_wr, + wqe_size); + if (!qp->rq.queue) + return -ENOMEM; + + err = do_mmap_info(rxe, udata, false, context, + qp->rq.queue->buf, + qp->rq.queue->buf_size, + &qp->rq.queue->ip); + if (err) { + kvfree(qp->rq.queue->buf); + kfree(qp->rq.queue); + return err; + } + } + + spin_lock_init(&qp->rq.producer_lock); + spin_lock_init(&qp->rq.consumer_lock); + + skb_queue_head_init(&qp->resp_pkts); + + rxe_init_task(rxe, &qp->resp.task, qp, + rxe_responder, "resp"); + + qp->resp.opcode = OPCODE_NONE; + qp->resp.msn = 0; + qp->resp.state = QP_STATE_RESET; + + return 0; +} + +/* called by the create qp verb */ +int rxe_qp_from_init(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_pd *pd, + struct ib_qp_init_attr *init, struct ib_udata *udata, + struct ib_pd *ibpd) +{ + int err; + struct rxe_cq *rcq = to_rcq(init->recv_cq); + struct rxe_cq *scq = to_rcq(init->send_cq); + struct rxe_srq *srq = init->srq ? to_rsrq(init->srq) : NULL; + struct ib_ucontext *context = udata ? ibpd->uobject->context : NULL; + + rxe_add_ref(pd); + rxe_add_ref(rcq); + rxe_add_ref(scq); + if (srq) + rxe_add_ref(srq); + + qp->pd = pd; + qp->rcq = rcq; + qp->scq = scq; + qp->srq = srq; + + rxe_qp_init_misc(rxe, qp, init); + + err = rxe_qp_init_req(rxe, qp, init, context, udata); + if (err) + goto err1; + + err = rxe_qp_init_resp(rxe, qp, init, context, udata); + if (err) + goto err2; + + qp->attr.qp_state = IB_QPS_RESET; + qp->valid = 1; + + return 0; + +err2: + rxe_queue_cleanup(qp->sq.queue); +err1: + if (srq) + rxe_drop_ref(srq); + rxe_drop_ref(scq); + rxe_drop_ref(rcq); + rxe_drop_ref(pd); + + return err; +} + +/* called by the query qp verb */ +int rxe_qp_to_init(struct rxe_qp *qp, struct ib_qp_init_attr *init) +{ + init->event_handler = qp->ibqp.event_handler; + init->qp_context = qp->ibqp.qp_context; + init->send_cq = qp->ibqp.send_cq; + init->recv_cq = qp->ibqp.recv_cq; + init->srq = qp->ibqp.srq; + + init->cap.max_send_wr = qp->sq.max_wr; + init->cap.max_send_sge = qp->sq.max_sge; + init->cap.max_inline_data = qp->sq.max_inline; + + if (!qp->srq) { + init->cap.max_recv_wr = qp->rq.max_wr; + init->cap.max_recv_sge = qp->rq.max_sge; + } + + init->sq_sig_type = qp->sq_sig_type; + + init->qp_type = qp->ibqp.qp_type; + init->port_num = 1; + + return 0; +} + +/* called by the modify qp verb, this routine checks all the parameters before + * making any changes + */ +int rxe_qp_chk_attr(struct rxe_dev *rxe, struct rxe_qp *qp, + struct ib_qp_attr *attr, int mask) +{ + enum ib_qp_state cur_state = (mask & IB_QP_CUR_STATE) ? + attr->cur_qp_state : qp->attr.qp_state; + enum ib_qp_state new_state = (mask & IB_QP_STATE) ? + attr->qp_state : cur_state; + + if (!ib_modify_qp_is_ok(cur_state, new_state, qp_type(qp), mask, + IB_LINK_LAYER_ETHERNET)) { + pr_warn("invalid mask or state for qp\n"); + goto err1; + } + + if (mask & IB_QP_STATE) { + if (cur_state == IB_QPS_SQD) { + if (qp->req.state == QP_STATE_DRAIN && + new_state != IB_QPS_ERR) + goto err1; + } + } + + if (mask & IB_QP_PORT) { + if (attr->port_num != 1) { + pr_warn("invalid port %d\n", attr->port_num); + goto err1; + } + } + + if (mask & IB_QP_CAP && rxe_qp_chk_cap(rxe, &attr->cap, !!qp->srq)) + goto err1; + + if (mask & IB_QP_AV && rxe_av_chk_attr(rxe, &attr->ah_attr)) + goto err1; + + if (mask & IB_QP_ALT_PATH) { + if (rxe_av_chk_attr(rxe, &attr->alt_ah_attr)) + goto err1; + if (attr->alt_port_num != 1) { + pr_warn("invalid alt port %d\n", attr->alt_port_num); + goto err1; + } + if (attr->alt_timeout > 31) { + pr_warn("invalid QP alt timeout %d > 31\n", + attr->alt_timeout); + goto err1; + } + } + + if (mask & IB_QP_PATH_MTU) { + struct rxe_port *port = &rxe->port; + + enum ib_mtu max_mtu = port->attr.max_mtu; + enum ib_mtu mtu = attr->path_mtu; + + if (mtu > max_mtu) { + pr_debug("invalid mtu (%d) > (%d)\n", + ib_mtu_enum_to_int(mtu), + ib_mtu_enum_to_int(max_mtu)); + goto err1; + } + } + + if (mask & IB_QP_MAX_QP_RD_ATOMIC) { + if (attr->max_rd_atomic > rxe->attr.max_qp_rd_atom) { + pr_warn("invalid max_rd_atomic %d > %d\n", + attr->max_rd_atomic, + rxe->attr.max_qp_rd_atom); + goto err1; + } + } + + if (mask & IB_QP_TIMEOUT) { + if (attr->timeout > 31) { + pr_warn("invalid QP timeout %d > 31\n", + attr->timeout); + goto err1; + } + } + + return 0; + +err1: + return -EINVAL; +} + +/* move the qp to the reset state */ +static void rxe_qp_reset(struct rxe_qp *qp) +{ + /* stop tasks from running */ + rxe_disable_task(&qp->resp.task); + + /* stop request/comp */ + if (qp->sq.queue) { + if (qp_type(qp) == IB_QPT_RC) + rxe_disable_task(&qp->comp.task); + rxe_disable_task(&qp->req.task); + } + + /* move qp to the reset state */ + qp->req.state = QP_STATE_RESET; + qp->resp.state = QP_STATE_RESET; + + /* let state machines reset themselves drain work and packet queues + * etc. + */ + __rxe_do_task(&qp->resp.task); + + if (qp->sq.queue) { + __rxe_do_task(&qp->comp.task); + __rxe_do_task(&qp->req.task); + } + + /* cleanup attributes */ + atomic_set(&qp->ssn, 0); + qp->req.opcode = -1; + qp->req.need_retry = 0; + qp->req.noack_pkts = 0; + qp->resp.msn = 0; + qp->resp.opcode = -1; + qp->resp.drop_msg = 0; + qp->resp.goto_error = 0; + qp->resp.sent_psn_nak = 0; + + if (qp->resp.mr) { + rxe_drop_ref(qp->resp.mr); + qp->resp.mr = NULL; + } + + cleanup_rd_atomic_resources(qp); + + /* reenable tasks */ + rxe_enable_task(&qp->resp.task); + + if (qp->sq.queue) { + if (qp_type(qp) == IB_QPT_RC) + rxe_enable_task(&qp->comp.task); + + rxe_enable_task(&qp->req.task); + } +} + +/* drain the send queue */ +static void rxe_qp_drain(struct rxe_qp *qp) +{ + if (qp->sq.queue) { + if (qp->req.state != QP_STATE_DRAINED) { + qp->req.state = QP_STATE_DRAIN; + if (qp_type(qp) == IB_QPT_RC) + rxe_run_task(&qp->comp.task, 1); + else + __rxe_do_task(&qp->comp.task); + rxe_run_task(&qp->req.task, 1); + } + } +} + +/* move the qp to the error state */ +void rxe_qp_error(struct rxe_qp *qp) +{ + qp->req.state = QP_STATE_ERROR; + qp->resp.state = QP_STATE_ERROR; + + /* drain work and packet queues */ + rxe_run_task(&qp->resp.task, 1); + + if (qp_type(qp) == IB_QPT_RC) + rxe_run_task(&qp->comp.task, 1); + else + __rxe_do_task(&qp->comp.task); + rxe_run_task(&qp->req.task, 1); +} + +/* called by the modify qp verb */ +int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask, + struct ib_udata *udata) +{ + int err; + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + union ib_gid sgid; + struct ib_gid_attr sgid_attr; + + if (mask & IB_QP_MAX_QP_RD_ATOMIC) { + int max_rd_atomic = __roundup_pow_of_two(attr->max_rd_atomic); + + free_rd_atomic_resources(qp); + + err = alloc_rd_atomic_resources(qp, max_rd_atomic); + if (err) + return err; + + qp->attr.max_rd_atomic = max_rd_atomic; + atomic_set(&qp->req.rd_atomic, max_rd_atomic); + } + + if (mask & IB_QP_CUR_STATE) + qp->attr.cur_qp_state = attr->qp_state; + + if (mask & IB_QP_EN_SQD_ASYNC_NOTIFY) + qp->attr.en_sqd_async_notify = attr->en_sqd_async_notify; + + if (mask & IB_QP_ACCESS_FLAGS) + qp->attr.qp_access_flags = attr->qp_access_flags; + + if (mask & IB_QP_PKEY_INDEX) + qp->attr.pkey_index = attr->pkey_index; + + if (mask & IB_QP_PORT) + qp->attr.port_num = attr->port_num; + + if (mask & IB_QP_QKEY) + qp->attr.qkey = attr->qkey; + + if (mask & IB_QP_AV) { + ib_get_cached_gid(&rxe->ib_dev, 1, + attr->ah_attr.grh.sgid_index, &sgid, + &sgid_attr); + rxe_av_from_attr(rxe, attr->port_num, &qp->pri_av, + &attr->ah_attr); + rxe_av_fill_ip_info(rxe, &qp->pri_av, &attr->ah_attr, + &sgid_attr, &sgid); + if (sgid_attr.ndev) + dev_put(sgid_attr.ndev); + } + + if (mask & IB_QP_ALT_PATH) { + ib_get_cached_gid(&rxe->ib_dev, 1, + attr->alt_ah_attr.grh.sgid_index, &sgid, + &sgid_attr); + + rxe_av_from_attr(rxe, attr->alt_port_num, &qp->alt_av, + &attr->alt_ah_attr); + rxe_av_fill_ip_info(rxe, &qp->alt_av, &attr->alt_ah_attr, + &sgid_attr, &sgid); + if (sgid_attr.ndev) + dev_put(sgid_attr.ndev); + + qp->attr.alt_port_num = attr->alt_port_num; + qp->attr.alt_pkey_index = attr->alt_pkey_index; + qp->attr.alt_timeout = attr->alt_timeout; + } + + if (mask & IB_QP_PATH_MTU) { + qp->attr.path_mtu = attr->path_mtu; + qp->mtu = ib_mtu_enum_to_int(attr->path_mtu); + } + + if (mask & IB_QP_TIMEOUT) { + qp->attr.timeout = attr->timeout; + if (attr->timeout == 0) { + qp->qp_timeout_jiffies = 0; + } else { + /* According to the spec, timeout = 4.096 * 2 ^ attr->timeout [us] */ + int j = nsecs_to_jiffies(4096ULL << attr->timeout); + + qp->qp_timeout_jiffies = j ? j : 1; + } + } + + if (mask & IB_QP_RETRY_CNT) { + qp->attr.retry_cnt = attr->retry_cnt; + qp->comp.retry_cnt = attr->retry_cnt; + pr_debug("set retry count = %d\n", attr->retry_cnt); + } + + if (mask & IB_QP_RNR_RETRY) { + qp->attr.rnr_retry = attr->rnr_retry; + qp->comp.rnr_retry = attr->rnr_retry; + pr_debug("set rnr retry count = %d\n", attr->rnr_retry); + } + + if (mask & IB_QP_RQ_PSN) { + qp->attr.rq_psn = (attr->rq_psn & BTH_PSN_MASK); + qp->resp.psn = qp->attr.rq_psn; + pr_debug("set resp psn = 0x%x\n", qp->resp.psn); + } + + if (mask & IB_QP_MIN_RNR_TIMER) { + qp->attr.min_rnr_timer = attr->min_rnr_timer; + pr_debug("set min rnr timer = 0x%x\n", + attr->min_rnr_timer); + } + + if (mask & IB_QP_SQ_PSN) { + qp->attr.sq_psn = (attr->sq_psn & BTH_PSN_MASK); + qp->req.psn = qp->attr.sq_psn; + qp->comp.psn = qp->attr.sq_psn; + pr_debug("set req psn = 0x%x\n", qp->req.psn); + } + + if (mask & IB_QP_MAX_DEST_RD_ATOMIC) { + qp->attr.max_dest_rd_atomic = + __roundup_pow_of_two(attr->max_dest_rd_atomic); + } + + if (mask & IB_QP_PATH_MIG_STATE) + qp->attr.path_mig_state = attr->path_mig_state; + + if (mask & IB_QP_DEST_QPN) + qp->attr.dest_qp_num = attr->dest_qp_num; + + if (mask & IB_QP_STATE) { + qp->attr.qp_state = attr->qp_state; + + switch (attr->qp_state) { + case IB_QPS_RESET: + pr_debug("qp state -> RESET\n"); + rxe_qp_reset(qp); + break; + + case IB_QPS_INIT: + pr_debug("qp state -> INIT\n"); + qp->req.state = QP_STATE_INIT; + qp->resp.state = QP_STATE_INIT; + break; + + case IB_QPS_RTR: + pr_debug("qp state -> RTR\n"); + qp->resp.state = QP_STATE_READY; + break; + + case IB_QPS_RTS: + pr_debug("qp state -> RTS\n"); + qp->req.state = QP_STATE_READY; + break; + + case IB_QPS_SQD: + pr_debug("qp state -> SQD\n"); + rxe_qp_drain(qp); + break; + + case IB_QPS_SQE: + pr_warn("qp state -> SQE !!?\n"); + /* Not possible from modify_qp. */ + break; + + case IB_QPS_ERR: + pr_debug("qp state -> ERR\n"); + rxe_qp_error(qp); + break; + } + } + + return 0; +} + +/* called by the query qp verb */ +int rxe_qp_to_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask) +{ + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + + *attr = qp->attr; + + attr->rq_psn = qp->resp.psn; + attr->sq_psn = qp->req.psn; + + attr->cap.max_send_wr = qp->sq.max_wr; + attr->cap.max_send_sge = qp->sq.max_sge; + attr->cap.max_inline_data = qp->sq.max_inline; + + if (!qp->srq) { + attr->cap.max_recv_wr = qp->rq.max_wr; + attr->cap.max_recv_sge = qp->rq.max_sge; + } + + rxe_av_to_attr(rxe, &qp->pri_av, &attr->ah_attr); + rxe_av_to_attr(rxe, &qp->alt_av, &attr->alt_ah_attr); + + if (qp->req.state == QP_STATE_DRAIN) { + attr->sq_draining = 1; + /* applications that get this state + * typically spin on it. yield the + * processor + */ + cond_resched(); + } else { + attr->sq_draining = 0; + } + + pr_debug("attr->sq_draining = %d\n", attr->sq_draining); + + return 0; +} + +/* called by the destroy qp verb */ +void rxe_qp_destroy(struct rxe_qp *qp) +{ + qp->valid = 0; + qp->qp_timeout_jiffies = 0; + rxe_cleanup_task(&qp->resp.task); + + del_timer_sync(&qp->retrans_timer); + del_timer_sync(&qp->rnr_nak_timer); + + rxe_cleanup_task(&qp->req.task); + if (qp_type(qp) == IB_QPT_RC) + rxe_cleanup_task(&qp->comp.task); + + /* flush out any receive wr's or pending requests */ + __rxe_do_task(&qp->req.task); + if (qp->sq.queue) { + __rxe_do_task(&qp->comp.task); + __rxe_do_task(&qp->req.task); + } +} + +/* called when the last reference to the qp is dropped */ +void rxe_qp_cleanup(void *arg) +{ + struct rxe_qp *qp = arg; + + rxe_drop_all_mcast_groups(qp); + + if (qp->sq.queue) + rxe_queue_cleanup(qp->sq.queue); + + if (qp->srq) + rxe_drop_ref(qp->srq); + + if (qp->rq.queue) + rxe_queue_cleanup(qp->rq.queue); + + if (qp->scq) + rxe_drop_ref(qp->scq); + if (qp->rcq) + rxe_drop_ref(qp->rcq); + if (qp->pd) + rxe_drop_ref(qp->pd); + + if (qp->resp.mr) { + rxe_drop_ref(qp->resp.mr); + qp->resp.mr = NULL; + } + + free_rd_atomic_resources(qp); + + kernel_sock_shutdown(qp->sk, SHUT_RDWR); +} diff --git a/drivers/infiniband/sw/rxe/rxe_queue.c b/drivers/infiniband/sw/rxe/rxe_queue.c new file mode 100644 index 000000000000..08274254eb88 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_queue.c @@ -0,0 +1,217 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must retailuce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include "rxe.h" +#include "rxe_loc.h" +#include "rxe_queue.h" + +int do_mmap_info(struct rxe_dev *rxe, + struct ib_udata *udata, + bool is_req, + struct ib_ucontext *context, + struct rxe_queue_buf *buf, + size_t buf_size, + struct rxe_mmap_info **ip_p) +{ + int err; + u32 len, offset; + struct rxe_mmap_info *ip = NULL; + + if (udata) { + if (is_req) { + len = udata->outlen - sizeof(struct mminfo); + offset = sizeof(struct mminfo); + } else { + len = udata->outlen; + offset = 0; + } + + if (len < sizeof(ip->info)) + goto err1; + + ip = rxe_create_mmap_info(rxe, buf_size, context, buf); + if (!ip) + goto err1; + + err = copy_to_user(udata->outbuf + offset, &ip->info, + sizeof(ip->info)); + if (err) + goto err2; + + spin_lock_bh(&rxe->pending_lock); + list_add(&ip->pending_mmaps, &rxe->pending_mmaps); + spin_unlock_bh(&rxe->pending_lock); + } + + *ip_p = ip; + + return 0; + +err2: + kfree(ip); +err1: + return -EINVAL; +} + +struct rxe_queue *rxe_queue_init(struct rxe_dev *rxe, + int *num_elem, + unsigned int elem_size) +{ + struct rxe_queue *q; + size_t buf_size; + unsigned int num_slots; + + /* num_elem == 0 is allowed, but uninteresting */ + if (*num_elem < 0) + goto err1; + + q = kmalloc(sizeof(*q), GFP_KERNEL); + if (!q) + goto err1; + + q->rxe = rxe; + + /* used in resize, only need to copy used part of queue */ + q->elem_size = elem_size; + + /* pad element up to at least a cacheline and always a power of 2 */ + if (elem_size < cache_line_size()) + elem_size = cache_line_size(); + elem_size = roundup_pow_of_two(elem_size); + + q->log2_elem_size = order_base_2(elem_size); + + num_slots = *num_elem + 1; + num_slots = roundup_pow_of_two(num_slots); + q->index_mask = num_slots - 1; + + buf_size = sizeof(struct rxe_queue_buf) + num_slots * elem_size; + + q->buf = vmalloc_user(buf_size); + if (!q->buf) + goto err2; + + q->buf->log2_elem_size = q->log2_elem_size; + q->buf->index_mask = q->index_mask; + + q->buf_size = buf_size; + + *num_elem = num_slots - 1; + return q; + +err2: + kfree(q); +err1: + return NULL; +} + +/* copies elements from original q to new q and then swaps the contents of the + * two q headers. This is so that if anyone is holding a pointer to q it will + * still work + */ +static int resize_finish(struct rxe_queue *q, struct rxe_queue *new_q, + unsigned int num_elem) +{ + if (!queue_empty(q) && (num_elem < queue_count(q))) + return -EINVAL; + + while (!queue_empty(q)) { + memcpy(producer_addr(new_q), consumer_addr(q), + new_q->elem_size); + advance_producer(new_q); + advance_consumer(q); + } + + swap(*q, *new_q); + + return 0; +} + +int rxe_queue_resize(struct rxe_queue *q, + unsigned int *num_elem_p, + unsigned int elem_size, + struct ib_ucontext *context, + struct ib_udata *udata, + spinlock_t *producer_lock, + spinlock_t *consumer_lock) +{ + struct rxe_queue *new_q; + unsigned int num_elem = *num_elem_p; + int err; + unsigned long flags = 0, flags1; + + new_q = rxe_queue_init(q->rxe, &num_elem, elem_size); + if (!new_q) + return -ENOMEM; + + err = do_mmap_info(new_q->rxe, udata, false, context, new_q->buf, + new_q->buf_size, &new_q->ip); + if (err) { + vfree(new_q->buf); + kfree(new_q); + goto err1; + } + + spin_lock_irqsave(consumer_lock, flags1); + + if (producer_lock) { + spin_lock_irqsave(producer_lock, flags); + err = resize_finish(q, new_q, num_elem); + spin_unlock_irqrestore(producer_lock, flags); + } else { + err = resize_finish(q, new_q, num_elem); + } + + spin_unlock_irqrestore(consumer_lock, flags1); + + rxe_queue_cleanup(new_q); /* new/old dep on err */ + if (err) + goto err1; + + *num_elem_p = num_elem; + return 0; + +err1: + return err; +} + +void rxe_queue_cleanup(struct rxe_queue *q) +{ + if (q->ip) + kref_put(&q->ip->ref, rxe_mmap_release); + else + vfree(q->buf); + + kfree(q); +} diff --git a/drivers/infiniband/sw/rxe/rxe_queue.h b/drivers/infiniband/sw/rxe/rxe_queue.h new file mode 100644 index 000000000000..239fd609c31e --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_queue.h @@ -0,0 +1,178 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef RXE_QUEUE_H +#define RXE_QUEUE_H + +/* implements a simple circular buffer that can optionally be + * shared between user space and the kernel and can be resized + + * the requested element size is rounded up to a power of 2 + * and the number of elements in the buffer is also rounded + * up to a power of 2. Since the queue is empty when the + * producer and consumer indices match the maximum capacity + * of the queue is one less than the number of element slots + */ + +/* this data structure is shared between user space and kernel + * space for those cases where the queue is shared. It contains + * the producer and consumer indices. Is also contains a copy + * of the queue size parameters for user space to use but the + * kernel must use the parameters in the rxe_queue struct + * this MUST MATCH the corresponding librxe struct + * for performance reasons arrange to have producer and consumer + * pointers in separate cache lines + * the kernel should always mask the indices to avoid accessing + * memory outside of the data area + */ +struct rxe_queue_buf { + __u32 log2_elem_size; + __u32 index_mask; + __u32 pad_1[30]; + __u32 producer_index; + __u32 pad_2[31]; + __u32 consumer_index; + __u32 pad_3[31]; + __u8 data[0]; +}; + +struct rxe_queue { + struct rxe_dev *rxe; + struct rxe_queue_buf *buf; + struct rxe_mmap_info *ip; + size_t buf_size; + size_t elem_size; + unsigned int log2_elem_size; + unsigned int index_mask; +}; + +int do_mmap_info(struct rxe_dev *rxe, + struct ib_udata *udata, + bool is_req, + struct ib_ucontext *context, + struct rxe_queue_buf *buf, + size_t buf_size, + struct rxe_mmap_info **ip_p); + +struct rxe_queue *rxe_queue_init(struct rxe_dev *rxe, + int *num_elem, + unsigned int elem_size); + +int rxe_queue_resize(struct rxe_queue *q, + unsigned int *num_elem_p, + unsigned int elem_size, + struct ib_ucontext *context, + struct ib_udata *udata, + /* Protect producers while resizing queue */ + spinlock_t *producer_lock, + /* Protect consumers while resizing queue */ + spinlock_t *consumer_lock); + +void rxe_queue_cleanup(struct rxe_queue *queue); + +static inline int next_index(struct rxe_queue *q, int index) +{ + return (index + 1) & q->buf->index_mask; +} + +static inline int queue_empty(struct rxe_queue *q) +{ + return ((q->buf->producer_index - q->buf->consumer_index) + & q->index_mask) == 0; +} + +static inline int queue_full(struct rxe_queue *q) +{ + return ((q->buf->producer_index + 1 - q->buf->consumer_index) + & q->index_mask) == 0; +} + +static inline void advance_producer(struct rxe_queue *q) +{ + q->buf->producer_index = (q->buf->producer_index + 1) + & q->index_mask; +} + +static inline void advance_consumer(struct rxe_queue *q) +{ + q->buf->consumer_index = (q->buf->consumer_index + 1) + & q->index_mask; +} + +static inline void *producer_addr(struct rxe_queue *q) +{ + return q->buf->data + ((q->buf->producer_index & q->index_mask) + << q->log2_elem_size); +} + +static inline void *consumer_addr(struct rxe_queue *q) +{ + return q->buf->data + ((q->buf->consumer_index & q->index_mask) + << q->log2_elem_size); +} + +static inline unsigned int producer_index(struct rxe_queue *q) +{ + return q->buf->producer_index; +} + +static inline unsigned int consumer_index(struct rxe_queue *q) +{ + return q->buf->consumer_index; +} + +static inline void *addr_from_index(struct rxe_queue *q, unsigned int index) +{ + return q->buf->data + ((index & q->index_mask) + << q->buf->log2_elem_size); +} + +static inline unsigned int index_from_addr(const struct rxe_queue *q, + const void *addr) +{ + return (((u8 *)addr - q->buf->data) >> q->log2_elem_size) + & q->index_mask; +} + +static inline unsigned int queue_count(const struct rxe_queue *q) +{ + return (q->buf->producer_index - q->buf->consumer_index) + & q->index_mask; +} + +static inline void *queue_head(struct rxe_queue *q) +{ + return queue_empty(q) ? NULL : consumer_addr(q); +} + +#endif /* RXE_QUEUE_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_recv.c b/drivers/infiniband/sw/rxe/rxe_recv.c new file mode 100644 index 000000000000..3d464c23e08b --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_recv.c @@ -0,0 +1,420 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include "rxe.h" +#include "rxe_loc.h" + +static int check_type_state(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, + struct rxe_qp *qp) +{ + if (unlikely(!qp->valid)) + goto err1; + + switch (qp_type(qp)) { + case IB_QPT_RC: + if (unlikely((pkt->opcode & IB_OPCODE_RC) != 0)) { + pr_warn_ratelimited("bad qp type\n"); + goto err1; + } + break; + case IB_QPT_UC: + if (unlikely(!(pkt->opcode & IB_OPCODE_UC))) { + pr_warn_ratelimited("bad qp type\n"); + goto err1; + } + break; + case IB_QPT_UD: + case IB_QPT_SMI: + case IB_QPT_GSI: + if (unlikely(!(pkt->opcode & IB_OPCODE_UD))) { + pr_warn_ratelimited("bad qp type\n"); + goto err1; + } + break; + default: + pr_warn_ratelimited("unsupported qp type\n"); + goto err1; + } + + if (pkt->mask & RXE_REQ_MASK) { + if (unlikely(qp->resp.state != QP_STATE_READY)) + goto err1; + } else if (unlikely(qp->req.state < QP_STATE_READY || + qp->req.state > QP_STATE_DRAINED)) { + goto err1; + } + + return 0; + +err1: + return -EINVAL; +} + +static void set_bad_pkey_cntr(struct rxe_port *port) +{ + spin_lock_bh(&port->port_lock); + port->attr.bad_pkey_cntr = min((u32)0xffff, + port->attr.bad_pkey_cntr + 1); + spin_unlock_bh(&port->port_lock); +} + +static void set_qkey_viol_cntr(struct rxe_port *port) +{ + spin_lock_bh(&port->port_lock); + port->attr.qkey_viol_cntr = min((u32)0xffff, + port->attr.qkey_viol_cntr + 1); + spin_unlock_bh(&port->port_lock); +} + +static int check_keys(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, + u32 qpn, struct rxe_qp *qp) +{ + int i; + int found_pkey = 0; + struct rxe_port *port = &rxe->port; + u16 pkey = bth_pkey(pkt); + + pkt->pkey_index = 0; + + if (qpn == 1) { + for (i = 0; i < port->attr.pkey_tbl_len; i++) { + if (pkey_match(pkey, port->pkey_tbl[i])) { + pkt->pkey_index = i; + found_pkey = 1; + break; + } + } + + if (!found_pkey) { + pr_warn_ratelimited("bad pkey = 0x%x\n", pkey); + set_bad_pkey_cntr(port); + goto err1; + } + } else if (qpn != 0) { + if (unlikely(!pkey_match(pkey, + port->pkey_tbl[qp->attr.pkey_index] + ))) { + pr_warn_ratelimited("bad pkey = 0x%0x\n", pkey); + set_bad_pkey_cntr(port); + goto err1; + } + pkt->pkey_index = qp->attr.pkey_index; + } + + if ((qp_type(qp) == IB_QPT_UD || qp_type(qp) == IB_QPT_GSI) && + qpn != 0 && pkt->mask) { + u32 qkey = (qpn == 1) ? GSI_QKEY : qp->attr.qkey; + + if (unlikely(deth_qkey(pkt) != qkey)) { + pr_warn_ratelimited("bad qkey, got 0x%x expected 0x%x for qpn 0x%x\n", + deth_qkey(pkt), qkey, qpn); + set_qkey_viol_cntr(port); + goto err1; + } + } + + return 0; + +err1: + return -EINVAL; +} + +static int check_addr(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, + struct rxe_qp *qp) +{ + struct sk_buff *skb = PKT_TO_SKB(pkt); + + if (qp_type(qp) != IB_QPT_RC && qp_type(qp) != IB_QPT_UC) + goto done; + + if (unlikely(pkt->port_num != qp->attr.port_num)) { + pr_warn_ratelimited("port %d != qp port %d\n", + pkt->port_num, qp->attr.port_num); + goto err1; + } + + if (skb->protocol == htons(ETH_P_IP)) { + struct in_addr *saddr = + &qp->pri_av.sgid_addr._sockaddr_in.sin_addr; + struct in_addr *daddr = + &qp->pri_av.dgid_addr._sockaddr_in.sin_addr; + + if (ip_hdr(skb)->daddr != saddr->s_addr) { + pr_warn_ratelimited("dst addr %pI4 != qp source addr %pI4\n", + &ip_hdr(skb)->daddr, + &saddr->s_addr); + goto err1; + } + + if (ip_hdr(skb)->saddr != daddr->s_addr) { + pr_warn_ratelimited("source addr %pI4 != qp dst addr %pI4\n", + &ip_hdr(skb)->saddr, + &daddr->s_addr); + goto err1; + } + + } else if (skb->protocol == htons(ETH_P_IPV6)) { + struct in6_addr *saddr = + &qp->pri_av.sgid_addr._sockaddr_in6.sin6_addr; + struct in6_addr *daddr = + &qp->pri_av.dgid_addr._sockaddr_in6.sin6_addr; + + if (memcmp(&ipv6_hdr(skb)->daddr, saddr, sizeof(*saddr))) { + pr_warn_ratelimited("dst addr %pI6 != qp source addr %pI6\n", + &ipv6_hdr(skb)->daddr, saddr); + goto err1; + } + + if (memcmp(&ipv6_hdr(skb)->saddr, daddr, sizeof(*daddr))) { + pr_warn_ratelimited("source addr %pI6 != qp dst addr %pI6\n", + &ipv6_hdr(skb)->saddr, daddr); + goto err1; + } + } + +done: + return 0; + +err1: + return -EINVAL; +} + +static int hdr_check(struct rxe_pkt_info *pkt) +{ + struct rxe_dev *rxe = pkt->rxe; + struct rxe_port *port = &rxe->port; + struct rxe_qp *qp = NULL; + u32 qpn = bth_qpn(pkt); + int index; + int err; + + if (unlikely(bth_tver(pkt) != BTH_TVER)) { + pr_warn_ratelimited("bad tver\n"); + goto err1; + } + + if (qpn != IB_MULTICAST_QPN) { + index = (qpn == 0) ? port->qp_smi_index : + ((qpn == 1) ? port->qp_gsi_index : qpn); + qp = rxe_pool_get_index(&rxe->qp_pool, index); + if (unlikely(!qp)) { + pr_warn_ratelimited("no qp matches qpn 0x%x\n", qpn); + goto err1; + } + + err = check_type_state(rxe, pkt, qp); + if (unlikely(err)) + goto err2; + + err = check_addr(rxe, pkt, qp); + if (unlikely(err)) + goto err2; + + err = check_keys(rxe, pkt, qpn, qp); + if (unlikely(err)) + goto err2; + } else { + if (unlikely((pkt->mask & RXE_GRH_MASK) == 0)) { + pr_warn_ratelimited("no grh for mcast qpn\n"); + goto err1; + } + } + + pkt->qp = qp; + return 0; + +err2: + if (qp) + rxe_drop_ref(qp); +err1: + return -EINVAL; +} + +static inline void rxe_rcv_pkt(struct rxe_dev *rxe, + struct rxe_pkt_info *pkt, + struct sk_buff *skb) +{ + if (pkt->mask & RXE_REQ_MASK) + rxe_resp_queue_pkt(rxe, pkt->qp, skb); + else + rxe_comp_queue_pkt(rxe, pkt->qp, skb); +} + +static void rxe_rcv_mcast_pkt(struct rxe_dev *rxe, struct sk_buff *skb) +{ + struct rxe_pkt_info *pkt = SKB_TO_PKT(skb); + struct rxe_mc_grp *mcg; + struct sk_buff *skb_copy; + struct rxe_mc_elem *mce; + struct rxe_qp *qp; + union ib_gid dgid; + int err; + + if (skb->protocol == htons(ETH_P_IP)) + ipv6_addr_set_v4mapped(ip_hdr(skb)->daddr, + (struct in6_addr *)&dgid); + else if (skb->protocol == htons(ETH_P_IPV6)) + memcpy(&dgid, &ipv6_hdr(skb)->daddr, sizeof(dgid)); + + /* lookup mcast group corresponding to mgid, takes a ref */ + mcg = rxe_pool_get_key(&rxe->mc_grp_pool, &dgid); + if (!mcg) + goto err1; /* mcast group not registered */ + + spin_lock_bh(&mcg->mcg_lock); + + list_for_each_entry(mce, &mcg->qp_list, qp_list) { + qp = mce->qp; + pkt = SKB_TO_PKT(skb); + + /* validate qp for incoming packet */ + err = check_type_state(rxe, pkt, qp); + if (err) + continue; + + err = check_keys(rxe, pkt, bth_qpn(pkt), qp); + if (err) + continue; + + /* if *not* the last qp in the list + * make a copy of the skb to post to the next qp + */ + skb_copy = (mce->qp_list.next != &mcg->qp_list) ? + skb_clone(skb, GFP_KERNEL) : NULL; + + pkt->qp = qp; + rxe_add_ref(qp); + rxe_rcv_pkt(rxe, pkt, skb); + + skb = skb_copy; + if (!skb) + break; + } + + spin_unlock_bh(&mcg->mcg_lock); + + rxe_drop_ref(mcg); /* drop ref from rxe_pool_get_key. */ + +err1: + if (skb) + kfree_skb(skb); +} + +static int rxe_match_dgid(struct rxe_dev *rxe, struct sk_buff *skb) +{ + union ib_gid dgid; + union ib_gid *pdgid; + u16 index; + + if (skb->protocol == htons(ETH_P_IP)) { + ipv6_addr_set_v4mapped(ip_hdr(skb)->daddr, + (struct in6_addr *)&dgid); + pdgid = &dgid; + } else { + pdgid = (union ib_gid *)&ipv6_hdr(skb)->daddr; + } + + return ib_find_cached_gid_by_port(&rxe->ib_dev, pdgid, + IB_GID_TYPE_ROCE_UDP_ENCAP, + 1, rxe->ndev, &index); +} + +/* rxe_rcv is called from the interface driver */ +int rxe_rcv(struct sk_buff *skb) +{ + int err; + struct rxe_pkt_info *pkt = SKB_TO_PKT(skb); + struct rxe_dev *rxe = pkt->rxe; + __be32 *icrcp; + u32 calc_icrc, pack_icrc; + + pkt->offset = 0; + + if (unlikely(skb->len < pkt->offset + RXE_BTH_BYTES)) + goto drop; + + if (unlikely(rxe_match_dgid(rxe, skb) < 0)) { + pr_warn_ratelimited("failed matching dgid\n"); + goto drop; + } + + pkt->opcode = bth_opcode(pkt); + pkt->psn = bth_psn(pkt); + pkt->qp = NULL; + pkt->mask |= rxe_opcode[pkt->opcode].mask; + + if (unlikely(skb->len < header_size(pkt))) + goto drop; + + err = hdr_check(pkt); + if (unlikely(err)) + goto drop; + + /* Verify ICRC */ + icrcp = (__be32 *)(pkt->hdr + pkt->paylen - RXE_ICRC_SIZE); + pack_icrc = be32_to_cpu(*icrcp); + + calc_icrc = rxe_icrc_hdr(pkt, skb); + calc_icrc = crc32_le(calc_icrc, (u8 *)payload_addr(pkt), payload_size(pkt)); + calc_icrc = cpu_to_be32(~calc_icrc); + if (unlikely(calc_icrc != pack_icrc)) { + char saddr[sizeof(struct in6_addr)]; + + if (skb->protocol == htons(ETH_P_IPV6)) + sprintf(saddr, "%pI6", &ipv6_hdr(skb)->saddr); + else if (skb->protocol == htons(ETH_P_IP)) + sprintf(saddr, "%pI4", &ip_hdr(skb)->saddr); + else + sprintf(saddr, "unknown"); + + pr_warn_ratelimited("bad ICRC from %s\n", saddr); + goto drop; + } + + if (unlikely(bth_qpn(pkt) == IB_MULTICAST_QPN)) + rxe_rcv_mcast_pkt(rxe, skb); + else + rxe_rcv_pkt(rxe, pkt, skb); + + return 0; + +drop: + if (pkt->qp) + rxe_drop_ref(pkt->qp); + + kfree_skb(skb); + return 0; +} +EXPORT_SYMBOL(rxe_rcv); diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c new file mode 100644 index 000000000000..33b2d9d77021 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -0,0 +1,726 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include "rxe.h" +#include "rxe_loc.h" +#include "rxe_queue.h" + +static int next_opcode(struct rxe_qp *qp, struct rxe_send_wqe *wqe, + unsigned opcode); + +static inline void retry_first_write_send(struct rxe_qp *qp, + struct rxe_send_wqe *wqe, + unsigned mask, int npsn) +{ + int i; + + for (i = 0; i < npsn; i++) { + int to_send = (wqe->dma.resid > qp->mtu) ? + qp->mtu : wqe->dma.resid; + + qp->req.opcode = next_opcode(qp, wqe, + wqe->wr.opcode); + + if (wqe->wr.send_flags & IB_SEND_INLINE) { + wqe->dma.resid -= to_send; + wqe->dma.sge_offset += to_send; + } else { + advance_dma_data(&wqe->dma, to_send); + } + if (mask & WR_WRITE_MASK) + wqe->iova += qp->mtu; + } +} + +static void req_retry(struct rxe_qp *qp) +{ + struct rxe_send_wqe *wqe; + unsigned int wqe_index; + unsigned int mask; + int npsn; + int first = 1; + + wqe = queue_head(qp->sq.queue); + npsn = (qp->comp.psn - wqe->first_psn) & BTH_PSN_MASK; + + qp->req.wqe_index = consumer_index(qp->sq.queue); + qp->req.psn = qp->comp.psn; + qp->req.opcode = -1; + + for (wqe_index = consumer_index(qp->sq.queue); + wqe_index != producer_index(qp->sq.queue); + wqe_index = next_index(qp->sq.queue, wqe_index)) { + wqe = addr_from_index(qp->sq.queue, wqe_index); + mask = wr_opcode_mask(wqe->wr.opcode, qp); + + if (wqe->state == wqe_state_posted) + break; + + if (wqe->state == wqe_state_done) + continue; + + wqe->iova = (mask & WR_ATOMIC_MASK) ? + wqe->wr.wr.atomic.remote_addr : + (mask & WR_READ_OR_WRITE_MASK) ? + wqe->wr.wr.rdma.remote_addr : + 0; + + if (!first || (mask & WR_READ_MASK) == 0) { + wqe->dma.resid = wqe->dma.length; + wqe->dma.cur_sge = 0; + wqe->dma.sge_offset = 0; + } + + if (first) { + first = 0; + + if (mask & WR_WRITE_OR_SEND_MASK) + retry_first_write_send(qp, wqe, mask, npsn); + + if (mask & WR_READ_MASK) + wqe->iova += npsn * qp->mtu; + } + + wqe->state = wqe_state_posted; + } +} + +void rnr_nak_timer(unsigned long data) +{ + struct rxe_qp *qp = (struct rxe_qp *)data; + + pr_debug("rnr nak timer fired\n"); + rxe_run_task(&qp->req.task, 1); +} + +static struct rxe_send_wqe *req_next_wqe(struct rxe_qp *qp) +{ + struct rxe_send_wqe *wqe = queue_head(qp->sq.queue); + unsigned long flags; + + if (unlikely(qp->req.state == QP_STATE_DRAIN)) { + /* check to see if we are drained; + * state_lock used by requester and completer + */ + spin_lock_irqsave(&qp->state_lock, flags); + do { + if (qp->req.state != QP_STATE_DRAIN) { + /* comp just finished */ + spin_unlock_irqrestore(&qp->state_lock, + flags); + break; + } + + if (wqe && ((qp->req.wqe_index != + consumer_index(qp->sq.queue)) || + (wqe->state != wqe_state_posted))) { + /* comp not done yet */ + spin_unlock_irqrestore(&qp->state_lock, + flags); + break; + } + + qp->req.state = QP_STATE_DRAINED; + spin_unlock_irqrestore(&qp->state_lock, flags); + + if (qp->ibqp.event_handler) { + struct ib_event ev; + + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_SQ_DRAINED; + qp->ibqp.event_handler(&ev, + qp->ibqp.qp_context); + } + } while (0); + } + + if (qp->req.wqe_index == producer_index(qp->sq.queue)) + return NULL; + + wqe = addr_from_index(qp->sq.queue, qp->req.wqe_index); + + if (unlikely((qp->req.state == QP_STATE_DRAIN || + qp->req.state == QP_STATE_DRAINED) && + (wqe->state != wqe_state_processing))) + return NULL; + + if (unlikely((wqe->wr.send_flags & IB_SEND_FENCE) && + (qp->req.wqe_index != consumer_index(qp->sq.queue)))) { + qp->req.wait_fence = 1; + return NULL; + } + + wqe->mask = wr_opcode_mask(wqe->wr.opcode, qp); + return wqe; +} + +static int next_opcode_rc(struct rxe_qp *qp, unsigned opcode, int fits) +{ + switch (opcode) { + case IB_WR_RDMA_WRITE: + if (qp->req.opcode == IB_OPCODE_RC_RDMA_WRITE_FIRST || + qp->req.opcode == IB_OPCODE_RC_RDMA_WRITE_MIDDLE) + return fits ? + IB_OPCODE_RC_RDMA_WRITE_LAST : + IB_OPCODE_RC_RDMA_WRITE_MIDDLE; + else + return fits ? + IB_OPCODE_RC_RDMA_WRITE_ONLY : + IB_OPCODE_RC_RDMA_WRITE_FIRST; + + case IB_WR_RDMA_WRITE_WITH_IMM: + if (qp->req.opcode == IB_OPCODE_RC_RDMA_WRITE_FIRST || + qp->req.opcode == IB_OPCODE_RC_RDMA_WRITE_MIDDLE) + return fits ? + IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE : + IB_OPCODE_RC_RDMA_WRITE_MIDDLE; + else + return fits ? + IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE : + IB_OPCODE_RC_RDMA_WRITE_FIRST; + + case IB_WR_SEND: + if (qp->req.opcode == IB_OPCODE_RC_SEND_FIRST || + qp->req.opcode == IB_OPCODE_RC_SEND_MIDDLE) + return fits ? + IB_OPCODE_RC_SEND_LAST : + IB_OPCODE_RC_SEND_MIDDLE; + else + return fits ? + IB_OPCODE_RC_SEND_ONLY : + IB_OPCODE_RC_SEND_FIRST; + + case IB_WR_SEND_WITH_IMM: + if (qp->req.opcode == IB_OPCODE_RC_SEND_FIRST || + qp->req.opcode == IB_OPCODE_RC_SEND_MIDDLE) + return fits ? + IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE : + IB_OPCODE_RC_SEND_MIDDLE; + else + return fits ? + IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE : + IB_OPCODE_RC_SEND_FIRST; + + case IB_WR_RDMA_READ: + return IB_OPCODE_RC_RDMA_READ_REQUEST; + + case IB_WR_ATOMIC_CMP_AND_SWP: + return IB_OPCODE_RC_COMPARE_SWAP; + + case IB_WR_ATOMIC_FETCH_AND_ADD: + return IB_OPCODE_RC_FETCH_ADD; + + case IB_WR_SEND_WITH_INV: + if (qp->req.opcode == IB_OPCODE_RC_SEND_FIRST || + qp->req.opcode == IB_OPCODE_RC_SEND_MIDDLE) + return fits ? IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE : + IB_OPCODE_RC_SEND_MIDDLE; + else + return fits ? IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE : + IB_OPCODE_RC_SEND_FIRST; + case IB_WR_REG_MR: + case IB_WR_LOCAL_INV: + return opcode; + } + + return -EINVAL; +} + +static int next_opcode_uc(struct rxe_qp *qp, unsigned opcode, int fits) +{ + switch (opcode) { + case IB_WR_RDMA_WRITE: + if (qp->req.opcode == IB_OPCODE_UC_RDMA_WRITE_FIRST || + qp->req.opcode == IB_OPCODE_UC_RDMA_WRITE_MIDDLE) + return fits ? + IB_OPCODE_UC_RDMA_WRITE_LAST : + IB_OPCODE_UC_RDMA_WRITE_MIDDLE; + else + return fits ? + IB_OPCODE_UC_RDMA_WRITE_ONLY : + IB_OPCODE_UC_RDMA_WRITE_FIRST; + + case IB_WR_RDMA_WRITE_WITH_IMM: + if (qp->req.opcode == IB_OPCODE_UC_RDMA_WRITE_FIRST || + qp->req.opcode == IB_OPCODE_UC_RDMA_WRITE_MIDDLE) + return fits ? + IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE : + IB_OPCODE_UC_RDMA_WRITE_MIDDLE; + else + return fits ? + IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE : + IB_OPCODE_UC_RDMA_WRITE_FIRST; + + case IB_WR_SEND: + if (qp->req.opcode == IB_OPCODE_UC_SEND_FIRST || + qp->req.opcode == IB_OPCODE_UC_SEND_MIDDLE) + return fits ? + IB_OPCODE_UC_SEND_LAST : + IB_OPCODE_UC_SEND_MIDDLE; + else + return fits ? + IB_OPCODE_UC_SEND_ONLY : + IB_OPCODE_UC_SEND_FIRST; + + case IB_WR_SEND_WITH_IMM: + if (qp->req.opcode == IB_OPCODE_UC_SEND_FIRST || + qp->req.opcode == IB_OPCODE_UC_SEND_MIDDLE) + return fits ? + IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE : + IB_OPCODE_UC_SEND_MIDDLE; + else + return fits ? + IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE : + IB_OPCODE_UC_SEND_FIRST; + } + + return -EINVAL; +} + +static int next_opcode(struct rxe_qp *qp, struct rxe_send_wqe *wqe, + unsigned opcode) +{ + int fits = (wqe->dma.resid <= qp->mtu); + + switch (qp_type(qp)) { + case IB_QPT_RC: + return next_opcode_rc(qp, opcode, fits); + + case IB_QPT_UC: + return next_opcode_uc(qp, opcode, fits); + + case IB_QPT_SMI: + case IB_QPT_UD: + case IB_QPT_GSI: + switch (opcode) { + case IB_WR_SEND: + return IB_OPCODE_UD_SEND_ONLY; + + case IB_WR_SEND_WITH_IMM: + return IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE; + } + break; + + default: + break; + } + + return -EINVAL; +} + +static inline int check_init_depth(struct rxe_qp *qp, struct rxe_send_wqe *wqe) +{ + int depth; + + if (wqe->has_rd_atomic) + return 0; + + qp->req.need_rd_atomic = 1; + depth = atomic_dec_return(&qp->req.rd_atomic); + + if (depth >= 0) { + qp->req.need_rd_atomic = 0; + wqe->has_rd_atomic = 1; + return 0; + } + + atomic_inc(&qp->req.rd_atomic); + return -EAGAIN; +} + +static inline int get_mtu(struct rxe_qp *qp, struct rxe_send_wqe *wqe) +{ + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + struct rxe_port *port; + struct rxe_av *av; + + if ((qp_type(qp) == IB_QPT_RC) || (qp_type(qp) == IB_QPT_UC)) + return qp->mtu; + + av = &wqe->av; + port = &rxe->port; + + return port->mtu_cap; +} + +static struct sk_buff *init_req_packet(struct rxe_qp *qp, + struct rxe_send_wqe *wqe, + int opcode, int payload, + struct rxe_pkt_info *pkt) +{ + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + struct rxe_port *port = &rxe->port; + struct sk_buff *skb; + struct rxe_send_wr *ibwr = &wqe->wr; + struct rxe_av *av; + int pad = (-payload) & 0x3; + int paylen; + int solicited; + u16 pkey; + u32 qp_num; + int ack_req; + + /* length from start of bth to end of icrc */ + paylen = rxe_opcode[opcode].length + payload + pad + RXE_ICRC_SIZE; + + /* pkt->hdr, rxe, port_num and mask are initialized in ifc + * layer + */ + pkt->opcode = opcode; + pkt->qp = qp; + pkt->psn = qp->req.psn; + pkt->mask = rxe_opcode[opcode].mask; + pkt->paylen = paylen; + pkt->offset = 0; + pkt->wqe = wqe; + + /* init skb */ + av = rxe_get_av(pkt); + skb = rxe->ifc_ops->init_packet(rxe, av, paylen, pkt); + if (unlikely(!skb)) + return NULL; + + /* init bth */ + solicited = (ibwr->send_flags & IB_SEND_SOLICITED) && + (pkt->mask & RXE_END_MASK) && + ((pkt->mask & (RXE_SEND_MASK)) || + (pkt->mask & (RXE_WRITE_MASK | RXE_IMMDT_MASK)) == + (RXE_WRITE_MASK | RXE_IMMDT_MASK)); + + pkey = (qp_type(qp) == IB_QPT_GSI) ? + port->pkey_tbl[ibwr->wr.ud.pkey_index] : + port->pkey_tbl[qp->attr.pkey_index]; + + qp_num = (pkt->mask & RXE_DETH_MASK) ? ibwr->wr.ud.remote_qpn : + qp->attr.dest_qp_num; + + ack_req = ((pkt->mask & RXE_END_MASK) || + (qp->req.noack_pkts++ > RXE_MAX_PKT_PER_ACK)); + if (ack_req) + qp->req.noack_pkts = 0; + + bth_init(pkt, pkt->opcode, solicited, 0, pad, pkey, qp_num, + ack_req, pkt->psn); + + /* init optional headers */ + if (pkt->mask & RXE_RETH_MASK) { + reth_set_rkey(pkt, ibwr->wr.rdma.rkey); + reth_set_va(pkt, wqe->iova); + reth_set_len(pkt, wqe->dma.length); + } + + if (pkt->mask & RXE_IMMDT_MASK) + immdt_set_imm(pkt, ibwr->ex.imm_data); + + if (pkt->mask & RXE_IETH_MASK) + ieth_set_rkey(pkt, ibwr->ex.invalidate_rkey); + + if (pkt->mask & RXE_ATMETH_MASK) { + atmeth_set_va(pkt, wqe->iova); + if (opcode == IB_OPCODE_RC_COMPARE_SWAP || + opcode == IB_OPCODE_RD_COMPARE_SWAP) { + atmeth_set_swap_add(pkt, ibwr->wr.atomic.swap); + atmeth_set_comp(pkt, ibwr->wr.atomic.compare_add); + } else { + atmeth_set_swap_add(pkt, ibwr->wr.atomic.compare_add); + } + atmeth_set_rkey(pkt, ibwr->wr.atomic.rkey); + } + + if (pkt->mask & RXE_DETH_MASK) { + if (qp->ibqp.qp_num == 1) + deth_set_qkey(pkt, GSI_QKEY); + else + deth_set_qkey(pkt, ibwr->wr.ud.remote_qkey); + deth_set_sqp(pkt, qp->ibqp.qp_num); + } + + return skb; +} + +static int fill_packet(struct rxe_qp *qp, struct rxe_send_wqe *wqe, + struct rxe_pkt_info *pkt, struct sk_buff *skb, + int paylen) +{ + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + u32 crc = 0; + u32 *p; + int err; + + err = rxe->ifc_ops->prepare(rxe, pkt, skb, &crc); + if (err) + return err; + + if (pkt->mask & RXE_WRITE_OR_SEND) { + if (wqe->wr.send_flags & IB_SEND_INLINE) { + u8 *tmp = &wqe->dma.inline_data[wqe->dma.sge_offset]; + + crc = crc32_le(crc, tmp, paylen); + + memcpy(payload_addr(pkt), tmp, paylen); + + wqe->dma.resid -= paylen; + wqe->dma.sge_offset += paylen; + } else { + err = copy_data(rxe, qp->pd, 0, &wqe->dma, + payload_addr(pkt), paylen, + from_mem_obj, + &crc); + if (err) + return err; + } + } + p = payload_addr(pkt) + paylen + bth_pad(pkt); + + *p = ~crc; + + return 0; +} + +static void update_wqe_state(struct rxe_qp *qp, + struct rxe_send_wqe *wqe, + struct rxe_pkt_info *pkt, + enum wqe_state *prev_state) +{ + enum wqe_state prev_state_ = wqe->state; + + if (pkt->mask & RXE_END_MASK) { + if (qp_type(qp) == IB_QPT_RC) + wqe->state = wqe_state_pending; + } else { + wqe->state = wqe_state_processing; + } + + *prev_state = prev_state_; +} + +static void update_state(struct rxe_qp *qp, struct rxe_send_wqe *wqe, + struct rxe_pkt_info *pkt, int payload) +{ + /* number of packets left to send including current one */ + int num_pkt = (wqe->dma.resid + payload + qp->mtu - 1) / qp->mtu; + + /* handle zero length packet case */ + if (num_pkt == 0) + num_pkt = 1; + + if (pkt->mask & RXE_START_MASK) { + wqe->first_psn = qp->req.psn; + wqe->last_psn = (qp->req.psn + num_pkt - 1) & BTH_PSN_MASK; + } + + if (pkt->mask & RXE_READ_MASK) + qp->req.psn = (wqe->first_psn + num_pkt) & BTH_PSN_MASK; + else + qp->req.psn = (qp->req.psn + 1) & BTH_PSN_MASK; + + qp->req.opcode = pkt->opcode; + + + if (pkt->mask & RXE_END_MASK) + qp->req.wqe_index = next_index(qp->sq.queue, qp->req.wqe_index); + + qp->need_req_skb = 0; + + if (qp->qp_timeout_jiffies && !timer_pending(&qp->retrans_timer)) + mod_timer(&qp->retrans_timer, + jiffies + qp->qp_timeout_jiffies); +} + +int rxe_requester(void *arg) +{ + struct rxe_qp *qp = (struct rxe_qp *)arg; + struct rxe_pkt_info pkt; + struct sk_buff *skb; + struct rxe_send_wqe *wqe; + unsigned mask; + int payload; + int mtu; + int opcode; + int ret; + enum wqe_state prev_state; + +next_wqe: + if (unlikely(!qp->valid || qp->req.state == QP_STATE_ERROR)) + goto exit; + + if (unlikely(qp->req.state == QP_STATE_RESET)) { + qp->req.wqe_index = consumer_index(qp->sq.queue); + qp->req.opcode = -1; + qp->req.need_rd_atomic = 0; + qp->req.wait_psn = 0; + qp->req.need_retry = 0; + goto exit; + } + + if (unlikely(qp->req.need_retry)) { + req_retry(qp); + qp->req.need_retry = 0; + } + + wqe = req_next_wqe(qp); + if (unlikely(!wqe)) + goto exit; + + if (wqe->mask & WR_REG_MASK) { + if (wqe->wr.opcode == IB_WR_LOCAL_INV) { + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + struct rxe_mem *rmr; + + rmr = rxe_pool_get_index(&rxe->mr_pool, + wqe->wr.ex.invalidate_rkey >> 8); + if (!rmr) { + pr_err("No mr for key %#x\n", wqe->wr.ex.invalidate_rkey); + wqe->state = wqe_state_error; + wqe->status = IB_WC_MW_BIND_ERR; + goto exit; + } + rmr->state = RXE_MEM_STATE_FREE; + wqe->state = wqe_state_done; + wqe->status = IB_WC_SUCCESS; + } else if (wqe->wr.opcode == IB_WR_REG_MR) { + struct rxe_mem *rmr = to_rmr(wqe->wr.wr.reg.mr); + + rmr->state = RXE_MEM_STATE_VALID; + rmr->access = wqe->wr.wr.reg.access; + rmr->lkey = wqe->wr.wr.reg.key; + rmr->rkey = wqe->wr.wr.reg.key; + wqe->state = wqe_state_done; + wqe->status = IB_WC_SUCCESS; + } else { + goto exit; + } + qp->req.wqe_index = next_index(qp->sq.queue, + qp->req.wqe_index); + goto next_wqe; + } + + if (unlikely(qp_type(qp) == IB_QPT_RC && + qp->req.psn > (qp->comp.psn + RXE_MAX_UNACKED_PSNS))) { + qp->req.wait_psn = 1; + goto exit; + } + + /* Limit the number of inflight SKBs per QP */ + if (unlikely(atomic_read(&qp->skb_out) > + RXE_INFLIGHT_SKBS_PER_QP_HIGH)) { + qp->need_req_skb = 1; + goto exit; + } + + opcode = next_opcode(qp, wqe, wqe->wr.opcode); + if (unlikely(opcode < 0)) { + wqe->status = IB_WC_LOC_QP_OP_ERR; + goto exit; + } + + mask = rxe_opcode[opcode].mask; + if (unlikely(mask & RXE_READ_OR_ATOMIC)) { + if (check_init_depth(qp, wqe)) + goto exit; + } + + mtu = get_mtu(qp, wqe); + payload = (mask & RXE_WRITE_OR_SEND) ? wqe->dma.resid : 0; + if (payload > mtu) { + if (qp_type(qp) == IB_QPT_UD) { + /* C10-93.1.1: If the total sum of all the buffer lengths specified for a + * UD message exceeds the MTU of the port as returned by QueryHCA, the CI + * shall not emit any packets for this message. Further, the CI shall not + * generate an error due to this condition. + */ + + /* fake a successful UD send */ + wqe->first_psn = qp->req.psn; + wqe->last_psn = qp->req.psn; + qp->req.psn = (qp->req.psn + 1) & BTH_PSN_MASK; + qp->req.opcode = IB_OPCODE_UD_SEND_ONLY; + qp->req.wqe_index = next_index(qp->sq.queue, + qp->req.wqe_index); + wqe->state = wqe_state_done; + wqe->status = IB_WC_SUCCESS; + goto complete; + } + payload = mtu; + } + + skb = init_req_packet(qp, wqe, opcode, payload, &pkt); + if (unlikely(!skb)) { + pr_err("Failed allocating skb\n"); + goto err; + } + + if (fill_packet(qp, wqe, &pkt, skb, payload)) { + pr_debug("Error during fill packet\n"); + goto err; + } + + update_wqe_state(qp, wqe, &pkt, &prev_state); + ret = rxe_xmit_packet(to_rdev(qp->ibqp.device), qp, &pkt, skb); + if (ret) { + qp->need_req_skb = 1; + kfree_skb(skb); + + wqe->state = prev_state; + + if (ret == -EAGAIN) { + rxe_run_task(&qp->req.task, 1); + goto exit; + } + + goto err; + } + + update_state(qp, wqe, &pkt, payload); + + goto next_wqe; + +err: + kfree_skb(skb); + wqe->status = IB_WC_LOC_PROT_ERR; + wqe->state = wqe_state_error; + +complete: + if (qp_type(qp) != IB_QPT_RC) { + while (rxe_completer(qp) == 0) + ; + } + + return 0; + +exit: + return -EAGAIN; +} diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c new file mode 100644 index 000000000000..ebb03b46e2ad --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -0,0 +1,1380 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include "rxe.h" +#include "rxe_loc.h" +#include "rxe_queue.h" + +enum resp_states { + RESPST_NONE, + RESPST_GET_REQ, + RESPST_CHK_PSN, + RESPST_CHK_OP_SEQ, + RESPST_CHK_OP_VALID, + RESPST_CHK_RESOURCE, + RESPST_CHK_LENGTH, + RESPST_CHK_RKEY, + RESPST_EXECUTE, + RESPST_READ_REPLY, + RESPST_COMPLETE, + RESPST_ACKNOWLEDGE, + RESPST_CLEANUP, + RESPST_DUPLICATE_REQUEST, + RESPST_ERR_MALFORMED_WQE, + RESPST_ERR_UNSUPPORTED_OPCODE, + RESPST_ERR_MISALIGNED_ATOMIC, + RESPST_ERR_PSN_OUT_OF_SEQ, + RESPST_ERR_MISSING_OPCODE_FIRST, + RESPST_ERR_MISSING_OPCODE_LAST_C, + RESPST_ERR_MISSING_OPCODE_LAST_D1E, + RESPST_ERR_TOO_MANY_RDMA_ATM_REQ, + RESPST_ERR_RNR, + RESPST_ERR_RKEY_VIOLATION, + RESPST_ERR_LENGTH, + RESPST_ERR_CQ_OVERFLOW, + RESPST_ERROR, + RESPST_RESET, + RESPST_DONE, + RESPST_EXIT, +}; + +static char *resp_state_name[] = { + [RESPST_NONE] = "NONE", + [RESPST_GET_REQ] = "GET_REQ", + [RESPST_CHK_PSN] = "CHK_PSN", + [RESPST_CHK_OP_SEQ] = "CHK_OP_SEQ", + [RESPST_CHK_OP_VALID] = "CHK_OP_VALID", + [RESPST_CHK_RESOURCE] = "CHK_RESOURCE", + [RESPST_CHK_LENGTH] = "CHK_LENGTH", + [RESPST_CHK_RKEY] = "CHK_RKEY", + [RESPST_EXECUTE] = "EXECUTE", + [RESPST_READ_REPLY] = "READ_REPLY", + [RESPST_COMPLETE] = "COMPLETE", + [RESPST_ACKNOWLEDGE] = "ACKNOWLEDGE", + [RESPST_CLEANUP] = "CLEANUP", + [RESPST_DUPLICATE_REQUEST] = "DUPLICATE_REQUEST", + [RESPST_ERR_MALFORMED_WQE] = "ERR_MALFORMED_WQE", + [RESPST_ERR_UNSUPPORTED_OPCODE] = "ERR_UNSUPPORTED_OPCODE", + [RESPST_ERR_MISALIGNED_ATOMIC] = "ERR_MISALIGNED_ATOMIC", + [RESPST_ERR_PSN_OUT_OF_SEQ] = "ERR_PSN_OUT_OF_SEQ", + [RESPST_ERR_MISSING_OPCODE_FIRST] = "ERR_MISSING_OPCODE_FIRST", + [RESPST_ERR_MISSING_OPCODE_LAST_C] = "ERR_MISSING_OPCODE_LAST_C", + [RESPST_ERR_MISSING_OPCODE_LAST_D1E] = "ERR_MISSING_OPCODE_LAST_D1E", + [RESPST_ERR_TOO_MANY_RDMA_ATM_REQ] = "ERR_TOO_MANY_RDMA_ATM_REQ", + [RESPST_ERR_RNR] = "ERR_RNR", + [RESPST_ERR_RKEY_VIOLATION] = "ERR_RKEY_VIOLATION", + [RESPST_ERR_LENGTH] = "ERR_LENGTH", + [RESPST_ERR_CQ_OVERFLOW] = "ERR_CQ_OVERFLOW", + [RESPST_ERROR] = "ERROR", + [RESPST_RESET] = "RESET", + [RESPST_DONE] = "DONE", + [RESPST_EXIT] = "EXIT", +}; + +/* rxe_recv calls here to add a request packet to the input queue */ +void rxe_resp_queue_pkt(struct rxe_dev *rxe, struct rxe_qp *qp, + struct sk_buff *skb) +{ + int must_sched; + struct rxe_pkt_info *pkt = SKB_TO_PKT(skb); + + skb_queue_tail(&qp->req_pkts, skb); + + must_sched = (pkt->opcode == IB_OPCODE_RC_RDMA_READ_REQUEST) || + (skb_queue_len(&qp->req_pkts) > 1); + + rxe_run_task(&qp->resp.task, must_sched); +} + +static inline enum resp_states get_req(struct rxe_qp *qp, + struct rxe_pkt_info **pkt_p) +{ + struct sk_buff *skb; + + if (qp->resp.state == QP_STATE_ERROR) { + skb = skb_dequeue(&qp->req_pkts); + if (skb) { + /* drain request packet queue */ + rxe_drop_ref(qp); + kfree_skb(skb); + return RESPST_GET_REQ; + } + + /* go drain recv wr queue */ + return RESPST_CHK_RESOURCE; + } + + skb = skb_peek(&qp->req_pkts); + if (!skb) + return RESPST_EXIT; + + *pkt_p = SKB_TO_PKT(skb); + + return (qp->resp.res) ? RESPST_READ_REPLY : RESPST_CHK_PSN; +} + +static enum resp_states check_psn(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + int diff = psn_compare(pkt->psn, qp->resp.psn); + + switch (qp_type(qp)) { + case IB_QPT_RC: + if (diff > 0) { + if (qp->resp.sent_psn_nak) + return RESPST_CLEANUP; + + qp->resp.sent_psn_nak = 1; + return RESPST_ERR_PSN_OUT_OF_SEQ; + + } else if (diff < 0) { + return RESPST_DUPLICATE_REQUEST; + } + + if (qp->resp.sent_psn_nak) + qp->resp.sent_psn_nak = 0; + + break; + + case IB_QPT_UC: + if (qp->resp.drop_msg || diff != 0) { + if (pkt->mask & RXE_START_MASK) { + qp->resp.drop_msg = 0; + return RESPST_CHK_OP_SEQ; + } + + qp->resp.drop_msg = 1; + return RESPST_CLEANUP; + } + break; + default: + break; + } + + return RESPST_CHK_OP_SEQ; +} + +static enum resp_states check_op_seq(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + switch (qp_type(qp)) { + case IB_QPT_RC: + switch (qp->resp.opcode) { + case IB_OPCODE_RC_SEND_FIRST: + case IB_OPCODE_RC_SEND_MIDDLE: + switch (pkt->opcode) { + case IB_OPCODE_RC_SEND_MIDDLE: + case IB_OPCODE_RC_SEND_LAST: + case IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE: + case IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE: + return RESPST_CHK_OP_VALID; + default: + return RESPST_ERR_MISSING_OPCODE_LAST_C; + } + + case IB_OPCODE_RC_RDMA_WRITE_FIRST: + case IB_OPCODE_RC_RDMA_WRITE_MIDDLE: + switch (pkt->opcode) { + case IB_OPCODE_RC_RDMA_WRITE_MIDDLE: + case IB_OPCODE_RC_RDMA_WRITE_LAST: + case IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE: + return RESPST_CHK_OP_VALID; + default: + return RESPST_ERR_MISSING_OPCODE_LAST_C; + } + + default: + switch (pkt->opcode) { + case IB_OPCODE_RC_SEND_MIDDLE: + case IB_OPCODE_RC_SEND_LAST: + case IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE: + case IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE: + case IB_OPCODE_RC_RDMA_WRITE_MIDDLE: + case IB_OPCODE_RC_RDMA_WRITE_LAST: + case IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE: + return RESPST_ERR_MISSING_OPCODE_FIRST; + default: + return RESPST_CHK_OP_VALID; + } + } + break; + + case IB_QPT_UC: + switch (qp->resp.opcode) { + case IB_OPCODE_UC_SEND_FIRST: + case IB_OPCODE_UC_SEND_MIDDLE: + switch (pkt->opcode) { + case IB_OPCODE_UC_SEND_MIDDLE: + case IB_OPCODE_UC_SEND_LAST: + case IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE: + return RESPST_CHK_OP_VALID; + default: + return RESPST_ERR_MISSING_OPCODE_LAST_D1E; + } + + case IB_OPCODE_UC_RDMA_WRITE_FIRST: + case IB_OPCODE_UC_RDMA_WRITE_MIDDLE: + switch (pkt->opcode) { + case IB_OPCODE_UC_RDMA_WRITE_MIDDLE: + case IB_OPCODE_UC_RDMA_WRITE_LAST: + case IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE: + return RESPST_CHK_OP_VALID; + default: + return RESPST_ERR_MISSING_OPCODE_LAST_D1E; + } + + default: + switch (pkt->opcode) { + case IB_OPCODE_UC_SEND_MIDDLE: + case IB_OPCODE_UC_SEND_LAST: + case IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE: + case IB_OPCODE_UC_RDMA_WRITE_MIDDLE: + case IB_OPCODE_UC_RDMA_WRITE_LAST: + case IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE: + qp->resp.drop_msg = 1; + return RESPST_CLEANUP; + default: + return RESPST_CHK_OP_VALID; + } + } + break; + + default: + return RESPST_CHK_OP_VALID; + } +} + +static enum resp_states check_op_valid(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + switch (qp_type(qp)) { + case IB_QPT_RC: + if (((pkt->mask & RXE_READ_MASK) && + !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_READ)) || + ((pkt->mask & RXE_WRITE_MASK) && + !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_WRITE)) || + ((pkt->mask & RXE_ATOMIC_MASK) && + !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) { + return RESPST_ERR_UNSUPPORTED_OPCODE; + } + + break; + + case IB_QPT_UC: + if ((pkt->mask & RXE_WRITE_MASK) && + !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_WRITE)) { + qp->resp.drop_msg = 1; + return RESPST_CLEANUP; + } + + break; + + case IB_QPT_UD: + case IB_QPT_SMI: + case IB_QPT_GSI: + break; + + default: + WARN_ON(1); + break; + } + + return RESPST_CHK_RESOURCE; +} + +static enum resp_states get_srq_wqe(struct rxe_qp *qp) +{ + struct rxe_srq *srq = qp->srq; + struct rxe_queue *q = srq->rq.queue; + struct rxe_recv_wqe *wqe; + struct ib_event ev; + + if (srq->error) + return RESPST_ERR_RNR; + + spin_lock_bh(&srq->rq.consumer_lock); + + wqe = queue_head(q); + if (!wqe) { + spin_unlock_bh(&srq->rq.consumer_lock); + return RESPST_ERR_RNR; + } + + /* note kernel and user space recv wqes have same size */ + memcpy(&qp->resp.srq_wqe, wqe, sizeof(qp->resp.srq_wqe)); + + qp->resp.wqe = &qp->resp.srq_wqe.wqe; + advance_consumer(q); + + if (srq->limit && srq->ibsrq.event_handler && + (queue_count(q) < srq->limit)) { + srq->limit = 0; + goto event; + } + + spin_unlock_bh(&srq->rq.consumer_lock); + return RESPST_CHK_LENGTH; + +event: + spin_unlock_bh(&srq->rq.consumer_lock); + ev.device = qp->ibqp.device; + ev.element.srq = qp->ibqp.srq; + ev.event = IB_EVENT_SRQ_LIMIT_REACHED; + srq->ibsrq.event_handler(&ev, srq->ibsrq.srq_context); + return RESPST_CHK_LENGTH; +} + +static enum resp_states check_resource(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + struct rxe_srq *srq = qp->srq; + + if (qp->resp.state == QP_STATE_ERROR) { + if (qp->resp.wqe) { + qp->resp.status = IB_WC_WR_FLUSH_ERR; + return RESPST_COMPLETE; + } else if (!srq) { + qp->resp.wqe = queue_head(qp->rq.queue); + if (qp->resp.wqe) { + qp->resp.status = IB_WC_WR_FLUSH_ERR; + return RESPST_COMPLETE; + } else { + return RESPST_EXIT; + } + } else { + return RESPST_EXIT; + } + } + + if (pkt->mask & RXE_READ_OR_ATOMIC) { + /* it is the requesters job to not send + * too many read/atomic ops, we just + * recycle the responder resource queue + */ + if (likely(qp->attr.max_rd_atomic > 0)) + return RESPST_CHK_LENGTH; + else + return RESPST_ERR_TOO_MANY_RDMA_ATM_REQ; + } + + if (pkt->mask & RXE_RWR_MASK) { + if (srq) + return get_srq_wqe(qp); + + qp->resp.wqe = queue_head(qp->rq.queue); + return (qp->resp.wqe) ? RESPST_CHK_LENGTH : RESPST_ERR_RNR; + } + + return RESPST_CHK_LENGTH; +} + +static enum resp_states check_length(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + switch (qp_type(qp)) { + case IB_QPT_RC: + return RESPST_CHK_RKEY; + + case IB_QPT_UC: + return RESPST_CHK_RKEY; + + default: + return RESPST_CHK_RKEY; + } +} + +static enum resp_states check_rkey(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + struct rxe_mem *mem; + u64 va; + u32 rkey; + u32 resid; + u32 pktlen; + int mtu = qp->mtu; + enum resp_states state; + int access; + + if (pkt->mask & (RXE_READ_MASK | RXE_WRITE_MASK)) { + if (pkt->mask & RXE_RETH_MASK) { + qp->resp.va = reth_va(pkt); + qp->resp.rkey = reth_rkey(pkt); + qp->resp.resid = reth_len(pkt); + } + access = (pkt->mask & RXE_READ_MASK) ? IB_ACCESS_REMOTE_READ + : IB_ACCESS_REMOTE_WRITE; + } else if (pkt->mask & RXE_ATOMIC_MASK) { + qp->resp.va = atmeth_va(pkt); + qp->resp.rkey = atmeth_rkey(pkt); + qp->resp.resid = sizeof(u64); + access = IB_ACCESS_REMOTE_ATOMIC; + } else { + return RESPST_EXECUTE; + } + + va = qp->resp.va; + rkey = qp->resp.rkey; + resid = qp->resp.resid; + pktlen = payload_size(pkt); + + mem = lookup_mem(qp->pd, access, rkey, lookup_remote); + if (!mem) { + state = RESPST_ERR_RKEY_VIOLATION; + goto err1; + } + + if (unlikely(mem->state == RXE_MEM_STATE_FREE)) { + state = RESPST_ERR_RKEY_VIOLATION; + goto err1; + } + + if (mem_check_range(mem, va, resid)) { + state = RESPST_ERR_RKEY_VIOLATION; + goto err2; + } + + if (pkt->mask & RXE_WRITE_MASK) { + if (resid > mtu) { + if (pktlen != mtu || bth_pad(pkt)) { + state = RESPST_ERR_LENGTH; + goto err2; + } + + resid = mtu; + } else { + if (pktlen != resid) { + state = RESPST_ERR_LENGTH; + goto err2; + } + if ((bth_pad(pkt) != (0x3 & (-resid)))) { + /* This case may not be exactly that + * but nothing else fits. + */ + state = RESPST_ERR_LENGTH; + goto err2; + } + } + } + + WARN_ON(qp->resp.mr); + + qp->resp.mr = mem; + return RESPST_EXECUTE; + +err2: + rxe_drop_ref(mem); +err1: + return state; +} + +static enum resp_states send_data_in(struct rxe_qp *qp, void *data_addr, + int data_len) +{ + int err; + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + + err = copy_data(rxe, qp->pd, IB_ACCESS_LOCAL_WRITE, &qp->resp.wqe->dma, + data_addr, data_len, to_mem_obj, NULL); + if (unlikely(err)) + return (err == -ENOSPC) ? RESPST_ERR_LENGTH + : RESPST_ERR_MALFORMED_WQE; + + return RESPST_NONE; +} + +static enum resp_states write_data_in(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + enum resp_states rc = RESPST_NONE; + int err; + int data_len = payload_size(pkt); + + err = rxe_mem_copy(qp->resp.mr, qp->resp.va, payload_addr(pkt), + data_len, to_mem_obj, NULL); + if (err) { + rc = RESPST_ERR_RKEY_VIOLATION; + goto out; + } + + qp->resp.va += data_len; + qp->resp.resid -= data_len; + +out: + return rc; +} + +/* Guarantee atomicity of atomic operations at the machine level. */ +static DEFINE_SPINLOCK(atomic_ops_lock); + +static enum resp_states process_atomic(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + u64 iova = atmeth_va(pkt); + u64 *vaddr; + enum resp_states ret; + struct rxe_mem *mr = qp->resp.mr; + + if (mr->state != RXE_MEM_STATE_VALID) { + ret = RESPST_ERR_RKEY_VIOLATION; + goto out; + } + + vaddr = iova_to_vaddr(mr, iova, sizeof(u64)); + + /* check vaddr is 8 bytes aligned. */ + if (!vaddr || (uintptr_t)vaddr & 7) { + ret = RESPST_ERR_MISALIGNED_ATOMIC; + goto out; + } + + spin_lock_bh(&atomic_ops_lock); + + qp->resp.atomic_orig = *vaddr; + + if (pkt->opcode == IB_OPCODE_RC_COMPARE_SWAP || + pkt->opcode == IB_OPCODE_RD_COMPARE_SWAP) { + if (*vaddr == atmeth_comp(pkt)) + *vaddr = atmeth_swap_add(pkt); + } else { + *vaddr += atmeth_swap_add(pkt); + } + + spin_unlock_bh(&atomic_ops_lock); + + ret = RESPST_NONE; +out: + return ret; +} + +static struct sk_buff *prepare_ack_packet(struct rxe_qp *qp, + struct rxe_pkt_info *pkt, + struct rxe_pkt_info *ack, + int opcode, + int payload, + u32 psn, + u8 syndrome, + u32 *crcp) +{ + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + struct sk_buff *skb; + u32 crc = 0; + u32 *p; + int paylen; + int pad; + int err; + + /* + * allocate packet + */ + pad = (-payload) & 0x3; + paylen = rxe_opcode[opcode].length + payload + pad + RXE_ICRC_SIZE; + + skb = rxe->ifc_ops->init_packet(rxe, &qp->pri_av, paylen, ack); + if (!skb) + return NULL; + + ack->qp = qp; + ack->opcode = opcode; + ack->mask = rxe_opcode[opcode].mask; + ack->offset = pkt->offset; + ack->paylen = paylen; + + /* fill in bth using the request packet headers */ + memcpy(ack->hdr, pkt->hdr, pkt->offset + RXE_BTH_BYTES); + + bth_set_opcode(ack, opcode); + bth_set_qpn(ack, qp->attr.dest_qp_num); + bth_set_pad(ack, pad); + bth_set_se(ack, 0); + bth_set_psn(ack, psn); + bth_set_ack(ack, 0); + ack->psn = psn; + + if (ack->mask & RXE_AETH_MASK) { + aeth_set_syn(ack, syndrome); + aeth_set_msn(ack, qp->resp.msn); + } + + if (ack->mask & RXE_ATMACK_MASK) + atmack_set_orig(ack, qp->resp.atomic_orig); + + err = rxe->ifc_ops->prepare(rxe, ack, skb, &crc); + if (err) { + kfree_skb(skb); + return NULL; + } + + if (crcp) { + /* CRC computation will be continued by the caller */ + *crcp = crc; + } else { + p = payload_addr(ack) + payload + bth_pad(ack); + *p = ~crc; + } + + return skb; +} + +/* RDMA read response. If res is not NULL, then we have a current RDMA request + * being processed or replayed. + */ +static enum resp_states read_reply(struct rxe_qp *qp, + struct rxe_pkt_info *req_pkt) +{ + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + struct rxe_pkt_info ack_pkt; + struct sk_buff *skb; + int mtu = qp->mtu; + enum resp_states state; + int payload; + int opcode; + int err; + struct resp_res *res = qp->resp.res; + u32 icrc; + u32 *p; + + if (!res) { + /* This is the first time we process that request. Get a + * resource + */ + res = &qp->resp.resources[qp->resp.res_head]; + + free_rd_atomic_resource(qp, res); + rxe_advance_resp_resource(qp); + + res->type = RXE_READ_MASK; + + res->read.va = qp->resp.va; + res->read.va_org = qp->resp.va; + + res->first_psn = req_pkt->psn; + res->last_psn = req_pkt->psn + + (reth_len(req_pkt) + mtu - 1) / + mtu - 1; + res->cur_psn = req_pkt->psn; + + res->read.resid = qp->resp.resid; + res->read.length = qp->resp.resid; + res->read.rkey = qp->resp.rkey; + + /* note res inherits the reference to mr from qp */ + res->read.mr = qp->resp.mr; + qp->resp.mr = NULL; + + qp->resp.res = res; + res->state = rdatm_res_state_new; + } + + if (res->state == rdatm_res_state_new) { + if (res->read.resid <= mtu) + opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY; + else + opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST; + } else { + if (res->read.resid > mtu) + opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE; + else + opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST; + } + + res->state = rdatm_res_state_next; + + payload = min_t(int, res->read.resid, mtu); + + skb = prepare_ack_packet(qp, req_pkt, &ack_pkt, opcode, payload, + res->cur_psn, AETH_ACK_UNLIMITED, &icrc); + if (!skb) + return RESPST_ERR_RNR; + + err = rxe_mem_copy(res->read.mr, res->read.va, payload_addr(&ack_pkt), + payload, from_mem_obj, &icrc); + if (err) + pr_err("Failed copying memory\n"); + + p = payload_addr(&ack_pkt) + payload + bth_pad(&ack_pkt); + *p = ~icrc; + + err = rxe_xmit_packet(rxe, qp, &ack_pkt, skb); + if (err) { + pr_err("Failed sending RDMA reply.\n"); + kfree_skb(skb); + return RESPST_ERR_RNR; + } + + res->read.va += payload; + res->read.resid -= payload; + res->cur_psn = (res->cur_psn + 1) & BTH_PSN_MASK; + + if (res->read.resid > 0) { + state = RESPST_DONE; + } else { + qp->resp.res = NULL; + qp->resp.opcode = -1; + qp->resp.psn = res->cur_psn; + state = RESPST_CLEANUP; + } + + return state; +} + +/* Executes a new request. A retried request never reach that function (send + * and writes are discarded, and reads and atomics are retried elsewhere. + */ +static enum resp_states execute(struct rxe_qp *qp, struct rxe_pkt_info *pkt) +{ + enum resp_states err; + + if (pkt->mask & RXE_SEND_MASK) { + if (qp_type(qp) == IB_QPT_UD || + qp_type(qp) == IB_QPT_SMI || + qp_type(qp) == IB_QPT_GSI) { + union rdma_network_hdr hdr; + struct sk_buff *skb = PKT_TO_SKB(pkt); + + memset(&hdr, 0, sizeof(hdr)); + if (skb->protocol == htons(ETH_P_IP)) + memcpy(&hdr.roce4grh, ip_hdr(skb), sizeof(hdr.roce4grh)); + else if (skb->protocol == htons(ETH_P_IPV6)) + memcpy(&hdr.ibgrh, ipv6_hdr(skb), sizeof(hdr.ibgrh)); + + err = send_data_in(qp, &hdr, sizeof(hdr)); + if (err) + return err; + } + err = send_data_in(qp, payload_addr(pkt), payload_size(pkt)); + if (err) + return err; + } else if (pkt->mask & RXE_WRITE_MASK) { + err = write_data_in(qp, pkt); + if (err) + return err; + } else if (pkt->mask & RXE_READ_MASK) { + /* For RDMA Read we can increment the msn now. See C9-148. */ + qp->resp.msn++; + return RESPST_READ_REPLY; + } else if (pkt->mask & RXE_ATOMIC_MASK) { + err = process_atomic(qp, pkt); + if (err) + return err; + } else + /* Unreachable */ + WARN_ON(1); + + /* We successfully processed this new request. */ + qp->resp.msn++; + + /* next expected psn, read handles this separately */ + qp->resp.psn = (pkt->psn + 1) & BTH_PSN_MASK; + + qp->resp.opcode = pkt->opcode; + qp->resp.status = IB_WC_SUCCESS; + + if (pkt->mask & RXE_COMP_MASK) + return RESPST_COMPLETE; + else if (qp_type(qp) == IB_QPT_RC) + return RESPST_ACKNOWLEDGE; + else + return RESPST_CLEANUP; +} + +static enum resp_states do_complete(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + struct rxe_cqe cqe; + struct ib_wc *wc = &cqe.ibwc; + struct ib_uverbs_wc *uwc = &cqe.uibwc; + struct rxe_recv_wqe *wqe = qp->resp.wqe; + + if (unlikely(!wqe)) + return RESPST_CLEANUP; + + memset(&cqe, 0, sizeof(cqe)); + + wc->wr_id = wqe->wr_id; + wc->status = qp->resp.status; + wc->qp = &qp->ibqp; + + /* fields after status are not required for errors */ + if (wc->status == IB_WC_SUCCESS) { + wc->opcode = (pkt->mask & RXE_IMMDT_MASK && + pkt->mask & RXE_WRITE_MASK) ? + IB_WC_RECV_RDMA_WITH_IMM : IB_WC_RECV; + wc->vendor_err = 0; + wc->byte_len = wqe->dma.length - wqe->dma.resid; + + /* fields after byte_len are different between kernel and user + * space + */ + if (qp->rcq->is_user) { + uwc->wc_flags = IB_WC_GRH; + + if (pkt->mask & RXE_IMMDT_MASK) { + uwc->wc_flags |= IB_WC_WITH_IMM; + uwc->ex.imm_data = + (__u32 __force)immdt_imm(pkt); + } + + if (pkt->mask & RXE_IETH_MASK) { + uwc->wc_flags |= IB_WC_WITH_INVALIDATE; + uwc->ex.invalidate_rkey = ieth_rkey(pkt); + } + + uwc->qp_num = qp->ibqp.qp_num; + + if (pkt->mask & RXE_DETH_MASK) + uwc->src_qp = deth_sqp(pkt); + + uwc->port_num = qp->attr.port_num; + } else { + struct sk_buff *skb = PKT_TO_SKB(pkt); + + wc->wc_flags = IB_WC_GRH | IB_WC_WITH_NETWORK_HDR_TYPE; + if (skb->protocol == htons(ETH_P_IP)) + wc->network_hdr_type = RDMA_NETWORK_IPV4; + else + wc->network_hdr_type = RDMA_NETWORK_IPV6; + + if (pkt->mask & RXE_IMMDT_MASK) { + wc->wc_flags |= IB_WC_WITH_IMM; + wc->ex.imm_data = immdt_imm(pkt); + } + + if (pkt->mask & RXE_IETH_MASK) { + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + struct rxe_mem *rmr; + + wc->wc_flags |= IB_WC_WITH_INVALIDATE; + wc->ex.invalidate_rkey = ieth_rkey(pkt); + + rmr = rxe_pool_get_index(&rxe->mr_pool, + wc->ex.invalidate_rkey >> 8); + if (unlikely(!rmr)) { + pr_err("Bad rkey %#x invalidation\n", wc->ex.invalidate_rkey); + return RESPST_ERROR; + } + rmr->state = RXE_MEM_STATE_FREE; + } + + wc->qp = &qp->ibqp; + + if (pkt->mask & RXE_DETH_MASK) + wc->src_qp = deth_sqp(pkt); + + wc->port_num = qp->attr.port_num; + } + } + + /* have copy for srq and reference for !srq */ + if (!qp->srq) + advance_consumer(qp->rq.queue); + + qp->resp.wqe = NULL; + + if (rxe_cq_post(qp->rcq, &cqe, pkt ? bth_se(pkt) : 1)) + return RESPST_ERR_CQ_OVERFLOW; + + if (qp->resp.state == QP_STATE_ERROR) + return RESPST_CHK_RESOURCE; + + if (!pkt) + return RESPST_DONE; + else if (qp_type(qp) == IB_QPT_RC) + return RESPST_ACKNOWLEDGE; + else + return RESPST_CLEANUP; +} + +static int send_ack(struct rxe_qp *qp, struct rxe_pkt_info *pkt, + u8 syndrome, u32 psn) +{ + int err = 0; + struct rxe_pkt_info ack_pkt; + struct sk_buff *skb; + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + + skb = prepare_ack_packet(qp, pkt, &ack_pkt, IB_OPCODE_RC_ACKNOWLEDGE, + 0, psn, syndrome, NULL); + if (!skb) { + err = -ENOMEM; + goto err1; + } + + err = rxe_xmit_packet(rxe, qp, &ack_pkt, skb); + if (err) { + pr_err_ratelimited("Failed sending ack\n"); + kfree_skb(skb); + } + +err1: + return err; +} + +static int send_atomic_ack(struct rxe_qp *qp, struct rxe_pkt_info *pkt, + u8 syndrome) +{ + int rc = 0; + struct rxe_pkt_info ack_pkt; + struct sk_buff *skb; + struct sk_buff *skb_copy; + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + struct resp_res *res; + + skb = prepare_ack_packet(qp, pkt, &ack_pkt, + IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE, 0, pkt->psn, + syndrome, NULL); + if (!skb) { + rc = -ENOMEM; + goto out; + } + + skb_copy = skb_clone(skb, GFP_ATOMIC); + if (skb_copy) + rxe_add_ref(qp); /* for the new SKB */ + else { + pr_warn("Could not clone atomic response\n"); + rc = -ENOMEM; + goto out; + } + + res = &qp->resp.resources[qp->resp.res_head]; + free_rd_atomic_resource(qp, res); + rxe_advance_resp_resource(qp); + + res->type = RXE_ATOMIC_MASK; + res->atomic.skb = skb; + res->first_psn = qp->resp.psn; + res->last_psn = qp->resp.psn; + res->cur_psn = qp->resp.psn; + + rc = rxe_xmit_packet(rxe, qp, &ack_pkt, skb_copy); + if (rc) { + pr_err_ratelimited("Failed sending ack\n"); + rxe_drop_ref(qp); + kfree_skb(skb_copy); + } + +out: + return rc; +} + +static enum resp_states acknowledge(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + if (qp_type(qp) != IB_QPT_RC) + return RESPST_CLEANUP; + + if (qp->resp.aeth_syndrome != AETH_ACK_UNLIMITED) + send_ack(qp, pkt, qp->resp.aeth_syndrome, pkt->psn); + else if (pkt->mask & RXE_ATOMIC_MASK) + send_atomic_ack(qp, pkt, AETH_ACK_UNLIMITED); + else if (bth_ack(pkt)) + send_ack(qp, pkt, AETH_ACK_UNLIMITED, pkt->psn); + + return RESPST_CLEANUP; +} + +static enum resp_states cleanup(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + struct sk_buff *skb; + + if (pkt) { + skb = skb_dequeue(&qp->req_pkts); + rxe_drop_ref(qp); + kfree_skb(skb); + } + + if (qp->resp.mr) { + rxe_drop_ref(qp->resp.mr); + qp->resp.mr = NULL; + } + + return RESPST_DONE; +} + +static struct resp_res *find_resource(struct rxe_qp *qp, u32 psn) +{ + int i; + + for (i = 0; i < qp->attr.max_rd_atomic; i++) { + struct resp_res *res = &qp->resp.resources[i]; + + if (res->type == 0) + continue; + + if (psn_compare(psn, res->first_psn) >= 0 && + psn_compare(psn, res->last_psn) <= 0) { + return res; + } + } + + return NULL; +} + +static enum resp_states duplicate_request(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + enum resp_states rc; + + if (pkt->mask & RXE_SEND_MASK || + pkt->mask & RXE_WRITE_MASK) { + /* SEND. Ack again and cleanup. C9-105. */ + if (bth_ack(pkt)) + send_ack(qp, pkt, AETH_ACK_UNLIMITED, qp->resp.psn - 1); + rc = RESPST_CLEANUP; + goto out; + } else if (pkt->mask & RXE_READ_MASK) { + struct resp_res *res; + + res = find_resource(qp, pkt->psn); + if (!res) { + /* Resource not found. Class D error. Drop the + * request. + */ + rc = RESPST_CLEANUP; + goto out; + } else { + /* Ensure this new request is the same as the previous + * one or a subset of it. + */ + u64 iova = reth_va(pkt); + u32 resid = reth_len(pkt); + + if (iova < res->read.va_org || + resid > res->read.length || + (iova + resid) > (res->read.va_org + + res->read.length)) { + rc = RESPST_CLEANUP; + goto out; + } + + if (reth_rkey(pkt) != res->read.rkey) { + rc = RESPST_CLEANUP; + goto out; + } + + res->cur_psn = pkt->psn; + res->state = (pkt->psn == res->first_psn) ? + rdatm_res_state_new : + rdatm_res_state_replay; + + /* Reset the resource, except length. */ + res->read.va_org = iova; + res->read.va = iova; + res->read.resid = resid; + + /* Replay the RDMA read reply. */ + qp->resp.res = res; + rc = RESPST_READ_REPLY; + goto out; + } + } else { + struct resp_res *res; + + /* Find the operation in our list of responder resources. */ + res = find_resource(qp, pkt->psn); + if (res) { + struct sk_buff *skb_copy; + + skb_copy = skb_clone(res->atomic.skb, GFP_ATOMIC); + if (skb_copy) { + rxe_add_ref(qp); /* for the new SKB */ + } else { + pr_warn("Couldn't clone atomic resp\n"); + rc = RESPST_CLEANUP; + goto out; + } + bth_set_psn(SKB_TO_PKT(skb_copy), + qp->resp.psn - 1); + /* Resend the result. */ + rc = rxe_xmit_packet(to_rdev(qp->ibqp.device), qp, + pkt, skb_copy); + if (rc) { + pr_err("Failed resending result. This flow is not handled - skb ignored\n"); + kfree_skb(skb_copy); + rc = RESPST_CLEANUP; + goto out; + } + } + + /* Resource not found. Class D error. Drop the request. */ + rc = RESPST_CLEANUP; + goto out; + } +out: + return rc; +} + +/* Process a class A or C. Both are treated the same in this implementation. */ +static void do_class_ac_error(struct rxe_qp *qp, u8 syndrome, + enum ib_wc_status status) +{ + qp->resp.aeth_syndrome = syndrome; + qp->resp.status = status; + + /* indicate that we should go through the ERROR state */ + qp->resp.goto_error = 1; +} + +static enum resp_states do_class_d1e_error(struct rxe_qp *qp) +{ + /* UC */ + if (qp->srq) { + /* Class E */ + qp->resp.drop_msg = 1; + if (qp->resp.wqe) { + qp->resp.status = IB_WC_REM_INV_REQ_ERR; + return RESPST_COMPLETE; + } else { + return RESPST_CLEANUP; + } + } else { + /* Class D1. This packet may be the start of a + * new message and could be valid. The previous + * message is invalid and ignored. reset the + * recv wr to its original state + */ + if (qp->resp.wqe) { + qp->resp.wqe->dma.resid = qp->resp.wqe->dma.length; + qp->resp.wqe->dma.cur_sge = 0; + qp->resp.wqe->dma.sge_offset = 0; + qp->resp.opcode = -1; + } + + if (qp->resp.mr) { + rxe_drop_ref(qp->resp.mr); + qp->resp.mr = NULL; + } + + return RESPST_CLEANUP; + } +} + +int rxe_responder(void *arg) +{ + struct rxe_qp *qp = (struct rxe_qp *)arg; + enum resp_states state; + struct rxe_pkt_info *pkt = NULL; + int ret = 0; + + qp->resp.aeth_syndrome = AETH_ACK_UNLIMITED; + + if (!qp->valid) { + ret = -EINVAL; + goto done; + } + + switch (qp->resp.state) { + case QP_STATE_RESET: + state = RESPST_RESET; + break; + + default: + state = RESPST_GET_REQ; + break; + } + + while (1) { + pr_debug("state = %s\n", resp_state_name[state]); + switch (state) { + case RESPST_GET_REQ: + state = get_req(qp, &pkt); + break; + case RESPST_CHK_PSN: + state = check_psn(qp, pkt); + break; + case RESPST_CHK_OP_SEQ: + state = check_op_seq(qp, pkt); + break; + case RESPST_CHK_OP_VALID: + state = check_op_valid(qp, pkt); + break; + case RESPST_CHK_RESOURCE: + state = check_resource(qp, pkt); + break; + case RESPST_CHK_LENGTH: + state = check_length(qp, pkt); + break; + case RESPST_CHK_RKEY: + state = check_rkey(qp, pkt); + break; + case RESPST_EXECUTE: + state = execute(qp, pkt); + break; + case RESPST_COMPLETE: + state = do_complete(qp, pkt); + break; + case RESPST_READ_REPLY: + state = read_reply(qp, pkt); + break; + case RESPST_ACKNOWLEDGE: + state = acknowledge(qp, pkt); + break; + case RESPST_CLEANUP: + state = cleanup(qp, pkt); + break; + case RESPST_DUPLICATE_REQUEST: + state = duplicate_request(qp, pkt); + break; + case RESPST_ERR_PSN_OUT_OF_SEQ: + /* RC only - Class B. Drop packet. */ + send_ack(qp, pkt, AETH_NAK_PSN_SEQ_ERROR, qp->resp.psn); + state = RESPST_CLEANUP; + break; + + case RESPST_ERR_TOO_MANY_RDMA_ATM_REQ: + case RESPST_ERR_MISSING_OPCODE_FIRST: + case RESPST_ERR_MISSING_OPCODE_LAST_C: + case RESPST_ERR_UNSUPPORTED_OPCODE: + case RESPST_ERR_MISALIGNED_ATOMIC: + /* RC Only - Class C. */ + do_class_ac_error(qp, AETH_NAK_INVALID_REQ, + IB_WC_REM_INV_REQ_ERR); + state = RESPST_COMPLETE; + break; + + case RESPST_ERR_MISSING_OPCODE_LAST_D1E: + state = do_class_d1e_error(qp); + break; + case RESPST_ERR_RNR: + if (qp_type(qp) == IB_QPT_RC) { + /* RC - class B */ + send_ack(qp, pkt, AETH_RNR_NAK | + (~AETH_TYPE_MASK & + qp->attr.min_rnr_timer), + pkt->psn); + } else { + /* UD/UC - class D */ + qp->resp.drop_msg = 1; + } + state = RESPST_CLEANUP; + break; + + case RESPST_ERR_RKEY_VIOLATION: + if (qp_type(qp) == IB_QPT_RC) { + /* Class C */ + do_class_ac_error(qp, AETH_NAK_REM_ACC_ERR, + IB_WC_REM_ACCESS_ERR); + state = RESPST_COMPLETE; + } else { + qp->resp.drop_msg = 1; + if (qp->srq) { + /* UC/SRQ Class D */ + qp->resp.status = IB_WC_REM_ACCESS_ERR; + state = RESPST_COMPLETE; + } else { + /* UC/non-SRQ Class E. */ + state = RESPST_CLEANUP; + } + } + break; + + case RESPST_ERR_LENGTH: + if (qp_type(qp) == IB_QPT_RC) { + /* Class C */ + do_class_ac_error(qp, AETH_NAK_INVALID_REQ, + IB_WC_REM_INV_REQ_ERR); + state = RESPST_COMPLETE; + } else if (qp->srq) { + /* UC/UD - class E */ + qp->resp.status = IB_WC_REM_INV_REQ_ERR; + state = RESPST_COMPLETE; + } else { + /* UC/UD - class D */ + qp->resp.drop_msg = 1; + state = RESPST_CLEANUP; + } + break; + + case RESPST_ERR_MALFORMED_WQE: + /* All, Class A. */ + do_class_ac_error(qp, AETH_NAK_REM_OP_ERR, + IB_WC_LOC_QP_OP_ERR); + state = RESPST_COMPLETE; + break; + + case RESPST_ERR_CQ_OVERFLOW: + /* All - Class G */ + state = RESPST_ERROR; + break; + + case RESPST_DONE: + if (qp->resp.goto_error) { + state = RESPST_ERROR; + break; + } + + goto done; + + case RESPST_EXIT: + if (qp->resp.goto_error) { + state = RESPST_ERROR; + break; + } + + goto exit; + + case RESPST_RESET: { + struct sk_buff *skb; + + while ((skb = skb_dequeue(&qp->req_pkts))) { + rxe_drop_ref(qp); + kfree_skb(skb); + } + + while (!qp->srq && qp->rq.queue && + queue_head(qp->rq.queue)) + advance_consumer(qp->rq.queue); + + qp->resp.wqe = NULL; + goto exit; + } + + case RESPST_ERROR: + qp->resp.goto_error = 0; + pr_warn("qp#%d moved to error state\n", qp_num(qp)); + rxe_qp_error(qp); + goto exit; + + default: + WARN_ON(1); + } + } + +exit: + ret = -EAGAIN; +done: + return ret; +} diff --git a/drivers/infiniband/sw/rxe/rxe_srq.c b/drivers/infiniband/sw/rxe/rxe_srq.c new file mode 100644 index 000000000000..2a6e3cd2d4e8 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_srq.c @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "rxe.h" +#include "rxe_loc.h" +#include "rxe_queue.h" + +int rxe_srq_chk_attr(struct rxe_dev *rxe, struct rxe_srq *srq, + struct ib_srq_attr *attr, enum ib_srq_attr_mask mask) +{ + if (srq && srq->error) { + pr_warn("srq in error state\n"); + goto err1; + } + + if (mask & IB_SRQ_MAX_WR) { + if (attr->max_wr > rxe->attr.max_srq_wr) { + pr_warn("max_wr(%d) > max_srq_wr(%d)\n", + attr->max_wr, rxe->attr.max_srq_wr); + goto err1; + } + + if (attr->max_wr <= 0) { + pr_warn("max_wr(%d) <= 0\n", attr->max_wr); + goto err1; + } + + if (srq && srq->limit && (attr->max_wr < srq->limit)) { + pr_warn("max_wr (%d) < srq->limit (%d)\n", + attr->max_wr, srq->limit); + goto err1; + } + + if (attr->max_wr < RXE_MIN_SRQ_WR) + attr->max_wr = RXE_MIN_SRQ_WR; + } + + if (mask & IB_SRQ_LIMIT) { + if (attr->srq_limit > rxe->attr.max_srq_wr) { + pr_warn("srq_limit(%d) > max_srq_wr(%d)\n", + attr->srq_limit, rxe->attr.max_srq_wr); + goto err1; + } + + if (srq && (attr->srq_limit > srq->rq.queue->buf->index_mask)) { + pr_warn("srq_limit (%d) > cur limit(%d)\n", + attr->srq_limit, + srq->rq.queue->buf->index_mask); + goto err1; + } + } + + if (mask == IB_SRQ_INIT_MASK) { + if (attr->max_sge > rxe->attr.max_srq_sge) { + pr_warn("max_sge(%d) > max_srq_sge(%d)\n", + attr->max_sge, rxe->attr.max_srq_sge); + goto err1; + } + + if (attr->max_sge < RXE_MIN_SRQ_SGE) + attr->max_sge = RXE_MIN_SRQ_SGE; + } + + return 0; + +err1: + return -EINVAL; +} + +int rxe_srq_from_init(struct rxe_dev *rxe, struct rxe_srq *srq, + struct ib_srq_init_attr *init, + struct ib_ucontext *context, struct ib_udata *udata) +{ + int err; + int srq_wqe_size; + struct rxe_queue *q; + + srq->ibsrq.event_handler = init->event_handler; + srq->ibsrq.srq_context = init->srq_context; + srq->limit = init->attr.srq_limit; + srq->srq_num = srq->pelem.index; + srq->rq.max_wr = init->attr.max_wr; + srq->rq.max_sge = init->attr.max_sge; + + srq_wqe_size = rcv_wqe_size(srq->rq.max_sge); + + spin_lock_init(&srq->rq.producer_lock); + spin_lock_init(&srq->rq.consumer_lock); + + q = rxe_queue_init(rxe, &srq->rq.max_wr, + srq_wqe_size); + if (!q) { + pr_warn("unable to allocate queue for srq\n"); + return -ENOMEM; + } + + srq->rq.queue = q; + + err = do_mmap_info(rxe, udata, false, context, q->buf, + q->buf_size, &q->ip); + if (err) + return err; + + if (udata && udata->outlen >= sizeof(struct mminfo) + sizeof(u32)) { + if (copy_to_user(udata->outbuf + sizeof(struct mminfo), + &srq->srq_num, sizeof(u32))) + return -EFAULT; + } + return 0; +} + +int rxe_srq_from_attr(struct rxe_dev *rxe, struct rxe_srq *srq, + struct ib_srq_attr *attr, enum ib_srq_attr_mask mask, + struct ib_udata *udata) +{ + int err; + struct rxe_queue *q = srq->rq.queue; + struct mminfo mi = { .offset = 1, .size = 0}; + + if (mask & IB_SRQ_MAX_WR) { + /* Check that we can write the mminfo struct to user space */ + if (udata && udata->inlen >= sizeof(__u64)) { + __u64 mi_addr; + + /* Get address of user space mminfo struct */ + err = ib_copy_from_udata(&mi_addr, udata, + sizeof(mi_addr)); + if (err) + goto err1; + + udata->outbuf = (void __user *)(unsigned long)mi_addr; + udata->outlen = sizeof(mi); + + if (!access_ok(VERIFY_WRITE, + (void __user *)udata->outbuf, + udata->outlen)) { + err = -EFAULT; + goto err1; + } + } + + err = rxe_queue_resize(q, (unsigned int *)&attr->max_wr, + rcv_wqe_size(srq->rq.max_sge), + srq->rq.queue->ip ? + srq->rq.queue->ip->context : + NULL, + udata, &srq->rq.producer_lock, + &srq->rq.consumer_lock); + if (err) + goto err2; + } + + if (mask & IB_SRQ_LIMIT) + srq->limit = attr->srq_limit; + + return 0; + +err2: + rxe_queue_cleanup(q); + srq->rq.queue = NULL; +err1: + return err; +} diff --git a/drivers/infiniband/sw/rxe/rxe_sysfs.c b/drivers/infiniband/sw/rxe/rxe_sysfs.c new file mode 100644 index 000000000000..cf8e77800046 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_sysfs.c @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "rxe.h" +#include "rxe_net.h" + +/* Copy argument and remove trailing CR. Return the new length. */ +static int sanitize_arg(const char *val, char *intf, int intf_len) +{ + int len; + + if (!val) + return 0; + + /* Remove newline. */ + for (len = 0; len < intf_len - 1 && val[len] && val[len] != '\n'; len++) + intf[len] = val[len]; + intf[len] = 0; + + if (len == 0 || (val[len] != 0 && val[len] != '\n')) + return 0; + + return len; +} + +static void rxe_set_port_state(struct net_device *ndev) +{ + struct rxe_dev *rxe = net_to_rxe(ndev); + bool is_up = netif_running(ndev) && netif_carrier_ok(ndev); + + if (!rxe) + goto out; + + if (is_up) + rxe_port_up(rxe); + else + rxe_port_down(rxe); /* down for unknown state */ +out: + return; +} + +static int rxe_param_set_add(const char *val, const struct kernel_param *kp) +{ + int len; + int err = 0; + char intf[32]; + struct net_device *ndev = NULL; + struct rxe_dev *rxe; + + len = sanitize_arg(val, intf, sizeof(intf)); + if (!len) { + pr_err("rxe: add: invalid interface name\n"); + err = -EINVAL; + goto err; + } + + ndev = dev_get_by_name(&init_net, intf); + if (!ndev) { + pr_err("interface %s not found\n", intf); + err = -EINVAL; + goto err; + } + + if (net_to_rxe(ndev)) { + pr_err("rxe: already configured on %s\n", intf); + err = -EINVAL; + goto err; + } + + rxe = rxe_net_add(ndev); + if (!rxe) { + pr_err("rxe: failed to add %s\n", intf); + err = -EINVAL; + goto err; + } + + rxe_set_port_state(ndev); + pr_info("rxe: added %s to %s\n", rxe->ib_dev.name, intf); +err: + if (ndev) + dev_put(ndev); + return err; +} + +static int rxe_param_set_remove(const char *val, const struct kernel_param *kp) +{ + int len; + char intf[32]; + struct rxe_dev *rxe; + + len = sanitize_arg(val, intf, sizeof(intf)); + if (!len) { + pr_err("rxe: add: invalid interface name\n"); + return -EINVAL; + } + + if (strncmp("all", intf, len) == 0) { + pr_info("rxe_sys: remove all"); + rxe_remove_all(); + return 0; + } + + rxe = get_rxe_by_name(intf); + + if (!rxe) { + pr_err("rxe: not configured on %s\n", intf); + return -EINVAL; + } + + list_del(&rxe->list); + rxe_remove(rxe); + + return 0; +} + +static const struct kernel_param_ops rxe_add_ops = { + .set = rxe_param_set_add, +}; + +static const struct kernel_param_ops rxe_remove_ops = { + .set = rxe_param_set_remove, +}; + +module_param_cb(add, &rxe_add_ops, NULL, 0200); +MODULE_PARM_DESC(add, "Create RXE device over network interface"); +module_param_cb(remove, &rxe_remove_ops, NULL, 0200); +MODULE_PARM_DESC(remove, "Remove RXE device over network interface"); diff --git a/drivers/infiniband/sw/rxe/rxe_task.c b/drivers/infiniband/sw/rxe/rxe_task.c new file mode 100644 index 000000000000..1e19bf828a6e --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_task.c @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include "rxe_task.h" + +int __rxe_do_task(struct rxe_task *task) + +{ + int ret; + + while ((ret = task->func(task->arg)) == 0) + ; + + task->ret = ret; + + return ret; +} + +/* + * this locking is due to a potential race where + * a second caller finds the task already running + * but looks just after the last call to func + */ +void rxe_do_task(unsigned long data) +{ + int cont; + int ret; + unsigned long flags; + struct rxe_task *task = (struct rxe_task *)data; + + spin_lock_irqsave(&task->state_lock, flags); + switch (task->state) { + case TASK_STATE_START: + task->state = TASK_STATE_BUSY; + spin_unlock_irqrestore(&task->state_lock, flags); + break; + + case TASK_STATE_BUSY: + task->state = TASK_STATE_ARMED; + /* fall through to */ + case TASK_STATE_ARMED: + spin_unlock_irqrestore(&task->state_lock, flags); + return; + + default: + spin_unlock_irqrestore(&task->state_lock, flags); + pr_warn("bad state = %d in rxe_do_task\n", task->state); + return; + } + + do { + cont = 0; + ret = task->func(task->arg); + + spin_lock_irqsave(&task->state_lock, flags); + switch (task->state) { + case TASK_STATE_BUSY: + if (ret) + task->state = TASK_STATE_START; + else + cont = 1; + break; + + /* soneone tried to run the task since the last time we called + * func, so we will call one more time regardless of the + * return value + */ + case TASK_STATE_ARMED: + task->state = TASK_STATE_BUSY; + cont = 1; + break; + + default: + pr_warn("bad state = %d in rxe_do_task\n", + task->state); + } + spin_unlock_irqrestore(&task->state_lock, flags); + } while (cont); + + task->ret = ret; +} + +int rxe_init_task(void *obj, struct rxe_task *task, + void *arg, int (*func)(void *), char *name) +{ + task->obj = obj; + task->arg = arg; + task->func = func; + snprintf(task->name, sizeof(task->name), "%s", name); + + tasklet_init(&task->tasklet, rxe_do_task, (unsigned long)task); + + task->state = TASK_STATE_START; + spin_lock_init(&task->state_lock); + + return 0; +} + +void rxe_cleanup_task(struct rxe_task *task) +{ + tasklet_kill(&task->tasklet); +} + +void rxe_run_task(struct rxe_task *task, int sched) +{ + if (sched) + tasklet_schedule(&task->tasklet); + else + rxe_do_task((unsigned long)task); +} + +void rxe_disable_task(struct rxe_task *task) +{ + tasklet_disable(&task->tasklet); +} + +void rxe_enable_task(struct rxe_task *task) +{ + tasklet_enable(&task->tasklet); +} diff --git a/drivers/infiniband/sw/rxe/rxe_task.h b/drivers/infiniband/sw/rxe/rxe_task.h new file mode 100644 index 000000000000..d14aa6daed05 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_task.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef RXE_TASK_H +#define RXE_TASK_H + +enum { + TASK_STATE_START = 0, + TASK_STATE_BUSY = 1, + TASK_STATE_ARMED = 2, +}; + +/* + * data structure to describe a 'task' which is a short + * function that returns 0 as long as it needs to be + * called again. + */ +struct rxe_task { + void *obj; + struct tasklet_struct tasklet; + int state; + spinlock_t state_lock; /* spinlock for task state */ + void *arg; + int (*func)(void *arg); + int ret; + char name[16]; +}; + +/* + * init rxe_task structure + * arg => parameter to pass to fcn + * fcn => function to call until it returns != 0 + */ +int rxe_init_task(void *obj, struct rxe_task *task, + void *arg, int (*func)(void *), char *name); + +/* cleanup task */ +void rxe_cleanup_task(struct rxe_task *task); + +/* + * raw call to func in loop without any checking + * can call when tasklets are disabled + */ +int __rxe_do_task(struct rxe_task *task); + +/* + * common function called by any of the main tasklets + * If there is any chance that there is additional + * work to do someone must reschedule the task before + * leaving + */ +void rxe_do_task(unsigned long data); + +/* run a task, else schedule it to run as a tasklet, The decision + * to run or schedule tasklet is based on the parameter sched. + */ +void rxe_run_task(struct rxe_task *task, int sched); + +/* keep a task from scheduling */ +void rxe_disable_task(struct rxe_task *task); + +/* allow task to run */ +void rxe_enable_task(struct rxe_task *task); + +#endif /* RXE_TASK_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c new file mode 100644 index 000000000000..4552be960c6a --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -0,0 +1,1330 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "rxe.h" +#include "rxe_loc.h" +#include "rxe_queue.h" + +static int rxe_query_device(struct ib_device *dev, + struct ib_device_attr *attr, + struct ib_udata *uhw) +{ + struct rxe_dev *rxe = to_rdev(dev); + + if (uhw->inlen || uhw->outlen) + return -EINVAL; + + *attr = rxe->attr; + return 0; +} + +static void rxe_eth_speed_to_ib_speed(int speed, u8 *active_speed, + u8 *active_width) +{ + if (speed <= 1000) { + *active_width = IB_WIDTH_1X; + *active_speed = IB_SPEED_SDR; + } else if (speed <= 10000) { + *active_width = IB_WIDTH_1X; + *active_speed = IB_SPEED_FDR10; + } else if (speed <= 20000) { + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_DDR; + } else if (speed <= 30000) { + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_QDR; + } else if (speed <= 40000) { + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_FDR10; + } else { + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_EDR; + } +} + +static int rxe_query_port(struct ib_device *dev, + u8 port_num, struct ib_port_attr *attr) +{ + struct rxe_dev *rxe = to_rdev(dev); + struct rxe_port *port; + u32 speed; + + if (unlikely(port_num != 1)) { + pr_warn("invalid port_number %d\n", port_num); + goto err1; + } + + port = &rxe->port; + + *attr = port->attr; + + mutex_lock(&rxe->usdev_lock); + if (rxe->ndev->ethtool_ops->get_link_ksettings) { + struct ethtool_link_ksettings ks; + + rxe->ndev->ethtool_ops->get_link_ksettings(rxe->ndev, &ks); + speed = ks.base.speed; + } else if (rxe->ndev->ethtool_ops->get_settings) { + struct ethtool_cmd cmd; + + rxe->ndev->ethtool_ops->get_settings(rxe->ndev, &cmd); + speed = cmd.speed; + } else { + pr_warn("%s speed is unknown, defaulting to 1000\n", rxe->ndev->name); + speed = 1000; + } + rxe_eth_speed_to_ib_speed(speed, &attr->active_speed, &attr->active_width); + mutex_unlock(&rxe->usdev_lock); + + return 0; + +err1: + return -EINVAL; +} + +static int rxe_query_gid(struct ib_device *device, + u8 port_num, int index, union ib_gid *gid) +{ + int ret; + + if (index > RXE_PORT_GID_TBL_LEN) + return -EINVAL; + + ret = ib_get_cached_gid(device, port_num, index, gid, NULL); + if (ret == -EAGAIN) { + memcpy(gid, &zgid, sizeof(*gid)); + return 0; + } + + return ret; +} + +static int rxe_add_gid(struct ib_device *device, u8 port_num, unsigned int + index, const union ib_gid *gid, + const struct ib_gid_attr *attr, void **context) +{ + if (index >= RXE_PORT_GID_TBL_LEN) + return -EINVAL; + return 0; +} + +static int rxe_del_gid(struct ib_device *device, u8 port_num, unsigned int + index, void **context) +{ + if (index >= RXE_PORT_GID_TBL_LEN) + return -EINVAL; + return 0; +} + +static struct net_device *rxe_get_netdev(struct ib_device *device, + u8 port_num) +{ + struct rxe_dev *rxe = to_rdev(device); + + if (rxe->ndev) { + dev_hold(rxe->ndev); + return rxe->ndev; + } + + return NULL; +} + +static int rxe_query_pkey(struct ib_device *device, + u8 port_num, u16 index, u16 *pkey) +{ + struct rxe_dev *rxe = to_rdev(device); + struct rxe_port *port; + + if (unlikely(port_num != 1)) { + dev_warn(device->dma_device, "invalid port_num = %d\n", + port_num); + goto err1; + } + + port = &rxe->port; + + if (unlikely(index >= port->attr.pkey_tbl_len)) { + dev_warn(device->dma_device, "invalid index = %d\n", + index); + goto err1; + } + + *pkey = port->pkey_tbl[index]; + return 0; + +err1: + return -EINVAL; +} + +static int rxe_modify_device(struct ib_device *dev, + int mask, struct ib_device_modify *attr) +{ + struct rxe_dev *rxe = to_rdev(dev); + + if (mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID) + rxe->attr.sys_image_guid = cpu_to_be64(attr->sys_image_guid); + + if (mask & IB_DEVICE_MODIFY_NODE_DESC) { + memcpy(rxe->ib_dev.node_desc, + attr->node_desc, sizeof(rxe->ib_dev.node_desc)); + } + + return 0; +} + +static int rxe_modify_port(struct ib_device *dev, + u8 port_num, int mask, struct ib_port_modify *attr) +{ + struct rxe_dev *rxe = to_rdev(dev); + struct rxe_port *port; + + if (unlikely(port_num != 1)) { + pr_warn("invalid port_num = %d\n", port_num); + goto err1; + } + + port = &rxe->port; + + port->attr.port_cap_flags |= attr->set_port_cap_mask; + port->attr.port_cap_flags &= ~attr->clr_port_cap_mask; + + if (mask & IB_PORT_RESET_QKEY_CNTR) + port->attr.qkey_viol_cntr = 0; + + return 0; + +err1: + return -EINVAL; +} + +static enum rdma_link_layer rxe_get_link_layer(struct ib_device *dev, + u8 port_num) +{ + struct rxe_dev *rxe = to_rdev(dev); + + return rxe->ifc_ops->link_layer(rxe, port_num); +} + +static struct ib_ucontext *rxe_alloc_ucontext(struct ib_device *dev, + struct ib_udata *udata) +{ + struct rxe_dev *rxe = to_rdev(dev); + struct rxe_ucontext *uc; + + uc = rxe_alloc(&rxe->uc_pool); + return uc ? &uc->ibuc : ERR_PTR(-ENOMEM); +} + +static int rxe_dealloc_ucontext(struct ib_ucontext *ibuc) +{ + struct rxe_ucontext *uc = to_ruc(ibuc); + + rxe_drop_ref(uc); + return 0; +} + +static int rxe_port_immutable(struct ib_device *dev, u8 port_num, + struct ib_port_immutable *immutable) +{ + int err; + struct ib_port_attr attr; + + err = rxe_query_port(dev, port_num, &attr); + if (err) + return err; + + immutable->pkey_tbl_len = attr.pkey_tbl_len; + immutable->gid_tbl_len = attr.gid_tbl_len; + immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP; + immutable->max_mad_size = IB_MGMT_MAD_SIZE; + + return 0; +} + +static struct ib_pd *rxe_alloc_pd(struct ib_device *dev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct rxe_dev *rxe = to_rdev(dev); + struct rxe_pd *pd; + + pd = rxe_alloc(&rxe->pd_pool); + return pd ? &pd->ibpd : ERR_PTR(-ENOMEM); +} + +static int rxe_dealloc_pd(struct ib_pd *ibpd) +{ + struct rxe_pd *pd = to_rpd(ibpd); + + rxe_drop_ref(pd); + return 0; +} + +static int rxe_init_av(struct rxe_dev *rxe, struct ib_ah_attr *attr, + struct rxe_av *av) +{ + int err; + union ib_gid sgid; + struct ib_gid_attr sgid_attr; + + err = ib_get_cached_gid(&rxe->ib_dev, attr->port_num, + attr->grh.sgid_index, &sgid, + &sgid_attr); + if (err) { + pr_err("Failed to query sgid. err = %d\n", err); + return err; + } + + err = rxe_av_from_attr(rxe, attr->port_num, av, attr); + if (!err) + err = rxe_av_fill_ip_info(rxe, av, attr, &sgid_attr, &sgid); + + if (sgid_attr.ndev) + dev_put(sgid_attr.ndev); + return err; +} + +static struct ib_ah *rxe_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr) +{ + int err; + struct rxe_dev *rxe = to_rdev(ibpd->device); + struct rxe_pd *pd = to_rpd(ibpd); + struct rxe_ah *ah; + + err = rxe_av_chk_attr(rxe, attr); + if (err) + goto err1; + + ah = rxe_alloc(&rxe->ah_pool); + if (!ah) { + err = -ENOMEM; + goto err1; + } + + rxe_add_ref(pd); + ah->pd = pd; + + err = rxe_init_av(rxe, attr, &ah->av); + if (err) + goto err2; + + return &ah->ibah; + +err2: + rxe_drop_ref(pd); + rxe_drop_ref(ah); +err1: + return ERR_PTR(err); +} + +static int rxe_modify_ah(struct ib_ah *ibah, struct ib_ah_attr *attr) +{ + int err; + struct rxe_dev *rxe = to_rdev(ibah->device); + struct rxe_ah *ah = to_rah(ibah); + + err = rxe_av_chk_attr(rxe, attr); + if (err) + return err; + + err = rxe_init_av(rxe, attr, &ah->av); + if (err) + return err; + + return 0; +} + +static int rxe_query_ah(struct ib_ah *ibah, struct ib_ah_attr *attr) +{ + struct rxe_dev *rxe = to_rdev(ibah->device); + struct rxe_ah *ah = to_rah(ibah); + + rxe_av_to_attr(rxe, &ah->av, attr); + return 0; +} + +static int rxe_destroy_ah(struct ib_ah *ibah) +{ + struct rxe_ah *ah = to_rah(ibah); + + rxe_drop_ref(ah->pd); + rxe_drop_ref(ah); + return 0; +} + +static int post_one_recv(struct rxe_rq *rq, struct ib_recv_wr *ibwr) +{ + int err; + int i; + u32 length; + struct rxe_recv_wqe *recv_wqe; + int num_sge = ibwr->num_sge; + + if (unlikely(queue_full(rq->queue))) { + err = -ENOMEM; + goto err1; + } + + if (unlikely(num_sge > rq->max_sge)) { + err = -EINVAL; + goto err1; + } + + length = 0; + for (i = 0; i < num_sge; i++) + length += ibwr->sg_list[i].length; + + recv_wqe = producer_addr(rq->queue); + recv_wqe->wr_id = ibwr->wr_id; + recv_wqe->num_sge = num_sge; + + memcpy(recv_wqe->dma.sge, ibwr->sg_list, + num_sge * sizeof(struct ib_sge)); + + recv_wqe->dma.length = length; + recv_wqe->dma.resid = length; + recv_wqe->dma.num_sge = num_sge; + recv_wqe->dma.cur_sge = 0; + recv_wqe->dma.sge_offset = 0; + + /* make sure all changes to the work queue are written before we + * update the producer pointer + */ + smp_wmb(); + + advance_producer(rq->queue); + return 0; + +err1: + return err; +} + +static struct ib_srq *rxe_create_srq(struct ib_pd *ibpd, + struct ib_srq_init_attr *init, + struct ib_udata *udata) +{ + int err; + struct rxe_dev *rxe = to_rdev(ibpd->device); + struct rxe_pd *pd = to_rpd(ibpd); + struct rxe_srq *srq; + struct ib_ucontext *context = udata ? ibpd->uobject->context : NULL; + + err = rxe_srq_chk_attr(rxe, NULL, &init->attr, IB_SRQ_INIT_MASK); + if (err) + goto err1; + + srq = rxe_alloc(&rxe->srq_pool); + if (!srq) { + err = -ENOMEM; + goto err1; + } + + rxe_add_index(srq); + rxe_add_ref(pd); + srq->pd = pd; + + err = rxe_srq_from_init(rxe, srq, init, context, udata); + if (err) + goto err2; + + return &srq->ibsrq; + +err2: + rxe_drop_ref(pd); + rxe_drop_index(srq); + rxe_drop_ref(srq); +err1: + return ERR_PTR(err); +} + +static int rxe_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask mask, + struct ib_udata *udata) +{ + int err; + struct rxe_srq *srq = to_rsrq(ibsrq); + struct rxe_dev *rxe = to_rdev(ibsrq->device); + + err = rxe_srq_chk_attr(rxe, srq, attr, mask); + if (err) + goto err1; + + err = rxe_srq_from_attr(rxe, srq, attr, mask, udata); + if (err) + goto err1; + + return 0; + +err1: + return err; +} + +static int rxe_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr) +{ + struct rxe_srq *srq = to_rsrq(ibsrq); + + if (srq->error) + return -EINVAL; + + attr->max_wr = srq->rq.queue->buf->index_mask; + attr->max_sge = srq->rq.max_sge; + attr->srq_limit = srq->limit; + return 0; +} + +static int rxe_destroy_srq(struct ib_srq *ibsrq) +{ + struct rxe_srq *srq = to_rsrq(ibsrq); + + if (srq->rq.queue) + rxe_queue_cleanup(srq->rq.queue); + + rxe_drop_ref(srq->pd); + rxe_drop_index(srq); + rxe_drop_ref(srq); + + return 0; +} + +static int rxe_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + int err = 0; + unsigned long flags; + struct rxe_srq *srq = to_rsrq(ibsrq); + + spin_lock_irqsave(&srq->rq.producer_lock, flags); + + while (wr) { + err = post_one_recv(&srq->rq, wr); + if (unlikely(err)) + break; + wr = wr->next; + } + + spin_unlock_irqrestore(&srq->rq.producer_lock, flags); + + if (err) + *bad_wr = wr; + + return err; +} + +static struct ib_qp *rxe_create_qp(struct ib_pd *ibpd, + struct ib_qp_init_attr *init, + struct ib_udata *udata) +{ + int err; + struct rxe_dev *rxe = to_rdev(ibpd->device); + struct rxe_pd *pd = to_rpd(ibpd); + struct rxe_qp *qp; + + err = rxe_qp_chk_init(rxe, init); + if (err) + goto err1; + + qp = rxe_alloc(&rxe->qp_pool); + if (!qp) { + err = -ENOMEM; + goto err1; + } + + if (udata) { + if (udata->inlen) { + err = -EINVAL; + goto err1; + } + qp->is_user = 1; + } + + rxe_add_index(qp); + + err = rxe_qp_from_init(rxe, qp, pd, init, udata, ibpd); + if (err) + goto err2; + + return &qp->ibqp; + +err2: + rxe_drop_index(qp); + rxe_drop_ref(qp); +err1: + return ERR_PTR(err); +} + +static int rxe_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int mask, struct ib_udata *udata) +{ + int err; + struct rxe_dev *rxe = to_rdev(ibqp->device); + struct rxe_qp *qp = to_rqp(ibqp); + + err = rxe_qp_chk_attr(rxe, qp, attr, mask); + if (err) + goto err1; + + err = rxe_qp_from_attr(qp, attr, mask, udata); + if (err) + goto err1; + + return 0; + +err1: + return err; +} + +static int rxe_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int mask, struct ib_qp_init_attr *init) +{ + struct rxe_qp *qp = to_rqp(ibqp); + + rxe_qp_to_init(qp, init); + rxe_qp_to_attr(qp, attr, mask); + + return 0; +} + +static int rxe_destroy_qp(struct ib_qp *ibqp) +{ + struct rxe_qp *qp = to_rqp(ibqp); + + rxe_qp_destroy(qp); + rxe_drop_index(qp); + rxe_drop_ref(qp); + return 0; +} + +static int validate_send_wr(struct rxe_qp *qp, struct ib_send_wr *ibwr, + unsigned int mask, unsigned int length) +{ + int num_sge = ibwr->num_sge; + struct rxe_sq *sq = &qp->sq; + + if (unlikely(num_sge > sq->max_sge)) + goto err1; + + if (unlikely(mask & WR_ATOMIC_MASK)) { + if (length < 8) + goto err1; + + if (atomic_wr(ibwr)->remote_addr & 0x7) + goto err1; + } + + if (unlikely((ibwr->send_flags & IB_SEND_INLINE) && + (length > sq->max_inline))) + goto err1; + + return 0; + +err1: + return -EINVAL; +} + +static void init_send_wr(struct rxe_qp *qp, struct rxe_send_wr *wr, + struct ib_send_wr *ibwr) +{ + wr->wr_id = ibwr->wr_id; + wr->num_sge = ibwr->num_sge; + wr->opcode = ibwr->opcode; + wr->send_flags = ibwr->send_flags; + + if (qp_type(qp) == IB_QPT_UD || + qp_type(qp) == IB_QPT_SMI || + qp_type(qp) == IB_QPT_GSI) { + wr->wr.ud.remote_qpn = ud_wr(ibwr)->remote_qpn; + wr->wr.ud.remote_qkey = ud_wr(ibwr)->remote_qkey; + if (qp_type(qp) == IB_QPT_GSI) + wr->wr.ud.pkey_index = ud_wr(ibwr)->pkey_index; + if (wr->opcode == IB_WR_SEND_WITH_IMM) + wr->ex.imm_data = ibwr->ex.imm_data; + } else { + switch (wr->opcode) { + case IB_WR_RDMA_WRITE_WITH_IMM: + wr->ex.imm_data = ibwr->ex.imm_data; + case IB_WR_RDMA_READ: + case IB_WR_RDMA_WRITE: + wr->wr.rdma.remote_addr = rdma_wr(ibwr)->remote_addr; + wr->wr.rdma.rkey = rdma_wr(ibwr)->rkey; + break; + case IB_WR_SEND_WITH_IMM: + wr->ex.imm_data = ibwr->ex.imm_data; + break; + case IB_WR_SEND_WITH_INV: + wr->ex.invalidate_rkey = ibwr->ex.invalidate_rkey; + break; + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + wr->wr.atomic.remote_addr = + atomic_wr(ibwr)->remote_addr; + wr->wr.atomic.compare_add = + atomic_wr(ibwr)->compare_add; + wr->wr.atomic.swap = atomic_wr(ibwr)->swap; + wr->wr.atomic.rkey = atomic_wr(ibwr)->rkey; + break; + case IB_WR_LOCAL_INV: + wr->ex.invalidate_rkey = ibwr->ex.invalidate_rkey; + break; + case IB_WR_REG_MR: + wr->wr.reg.mr = reg_wr(ibwr)->mr; + wr->wr.reg.key = reg_wr(ibwr)->key; + wr->wr.reg.access = reg_wr(ibwr)->access; + break; + default: + break; + } + } +} + +static int init_send_wqe(struct rxe_qp *qp, struct ib_send_wr *ibwr, + unsigned int mask, unsigned int length, + struct rxe_send_wqe *wqe) +{ + int num_sge = ibwr->num_sge; + struct ib_sge *sge; + int i; + u8 *p; + + init_send_wr(qp, &wqe->wr, ibwr); + + if (qp_type(qp) == IB_QPT_UD || + qp_type(qp) == IB_QPT_SMI || + qp_type(qp) == IB_QPT_GSI) + memcpy(&wqe->av, &to_rah(ud_wr(ibwr)->ah)->av, sizeof(wqe->av)); + + if (unlikely(ibwr->send_flags & IB_SEND_INLINE)) { + p = wqe->dma.inline_data; + + sge = ibwr->sg_list; + for (i = 0; i < num_sge; i++, sge++) { + if (qp->is_user && copy_from_user(p, (__user void *) + (uintptr_t)sge->addr, sge->length)) + return -EFAULT; + + else if (!qp->is_user) + memcpy(p, (void *)(uintptr_t)sge->addr, + sge->length); + + p += sge->length; + } + } else if (mask & WR_REG_MASK) { + wqe->mask = mask; + wqe->state = wqe_state_posted; + return 0; + } else + memcpy(wqe->dma.sge, ibwr->sg_list, + num_sge * sizeof(struct ib_sge)); + + wqe->iova = (mask & WR_ATOMIC_MASK) ? + atomic_wr(ibwr)->remote_addr : + rdma_wr(ibwr)->remote_addr; + wqe->mask = mask; + wqe->dma.length = length; + wqe->dma.resid = length; + wqe->dma.num_sge = num_sge; + wqe->dma.cur_sge = 0; + wqe->dma.sge_offset = 0; + wqe->state = wqe_state_posted; + wqe->ssn = atomic_add_return(1, &qp->ssn); + + return 0; +} + +static int post_one_send(struct rxe_qp *qp, struct ib_send_wr *ibwr, + unsigned mask, u32 length) +{ + int err; + struct rxe_sq *sq = &qp->sq; + struct rxe_send_wqe *send_wqe; + unsigned long flags; + + err = validate_send_wr(qp, ibwr, mask, length); + if (err) + return err; + + spin_lock_irqsave(&qp->sq.sq_lock, flags); + + if (unlikely(queue_full(sq->queue))) { + err = -ENOMEM; + goto err1; + } + + send_wqe = producer_addr(sq->queue); + + err = init_send_wqe(qp, ibwr, mask, length, send_wqe); + if (unlikely(err)) + goto err1; + + /* + * make sure all changes to the work queue are + * written before we update the producer pointer + */ + smp_wmb(); + + advance_producer(sq->queue); + spin_unlock_irqrestore(&qp->sq.sq_lock, flags); + + return 0; + +err1: + spin_unlock_irqrestore(&qp->sq.sq_lock, flags); + return err; +} + +static int rxe_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + int err = 0; + struct rxe_qp *qp = to_rqp(ibqp); + unsigned int mask; + unsigned int length = 0; + int i; + int must_sched; + + if (unlikely(!qp->valid)) { + *bad_wr = wr; + return -EINVAL; + } + + if (unlikely(qp->req.state < QP_STATE_READY)) { + *bad_wr = wr; + return -EINVAL; + } + + while (wr) { + mask = wr_opcode_mask(wr->opcode, qp); + if (unlikely(!mask)) { + err = -EINVAL; + *bad_wr = wr; + break; + } + + if (unlikely((wr->send_flags & IB_SEND_INLINE) && + !(mask & WR_INLINE_MASK))) { + err = -EINVAL; + *bad_wr = wr; + break; + } + + length = 0; + for (i = 0; i < wr->num_sge; i++) + length += wr->sg_list[i].length; + + err = post_one_send(qp, wr, mask, length); + + if (err) { + *bad_wr = wr; + break; + } + wr = wr->next; + } + + /* + * Must sched in case of GSI QP because ib_send_mad() hold irq lock, + * and the requester call ip_local_out_sk() that takes spin_lock_bh. + */ + must_sched = (qp_type(qp) == IB_QPT_GSI) || + (queue_count(qp->sq.queue) > 1); + + rxe_run_task(&qp->req.task, must_sched); + + return err; +} + +static int rxe_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + int err = 0; + struct rxe_qp *qp = to_rqp(ibqp); + struct rxe_rq *rq = &qp->rq; + unsigned long flags; + + if (unlikely((qp_state(qp) < IB_QPS_INIT) || !qp->valid)) { + *bad_wr = wr; + err = -EINVAL; + goto err1; + } + + if (unlikely(qp->srq)) { + *bad_wr = wr; + err = -EINVAL; + goto err1; + } + + spin_lock_irqsave(&rq->producer_lock, flags); + + while (wr) { + err = post_one_recv(rq, wr); + if (unlikely(err)) { + *bad_wr = wr; + break; + } + wr = wr->next; + } + + spin_unlock_irqrestore(&rq->producer_lock, flags); + +err1: + return err; +} + +static struct ib_cq *rxe_create_cq(struct ib_device *dev, + const struct ib_cq_init_attr *attr, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + int err; + struct rxe_dev *rxe = to_rdev(dev); + struct rxe_cq *cq; + + if (attr->flags) + return ERR_PTR(-EINVAL); + + err = rxe_cq_chk_attr(rxe, NULL, attr->cqe, attr->comp_vector, udata); + if (err) + goto err1; + + cq = rxe_alloc(&rxe->cq_pool); + if (!cq) { + err = -ENOMEM; + goto err1; + } + + err = rxe_cq_from_init(rxe, cq, attr->cqe, attr->comp_vector, + context, udata); + if (err) + goto err2; + + return &cq->ibcq; + +err2: + rxe_drop_ref(cq); +err1: + return ERR_PTR(err); +} + +static int rxe_destroy_cq(struct ib_cq *ibcq) +{ + struct rxe_cq *cq = to_rcq(ibcq); + + rxe_drop_ref(cq); + return 0; +} + +static int rxe_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) +{ + int err; + struct rxe_cq *cq = to_rcq(ibcq); + struct rxe_dev *rxe = to_rdev(ibcq->device); + + err = rxe_cq_chk_attr(rxe, cq, cqe, 0, udata); + if (err) + goto err1; + + err = rxe_cq_resize_queue(cq, cqe, udata); + if (err) + goto err1; + + return 0; + +err1: + return err; +} + +static int rxe_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) +{ + int i; + struct rxe_cq *cq = to_rcq(ibcq); + struct rxe_cqe *cqe; + unsigned long flags; + + spin_lock_irqsave(&cq->cq_lock, flags); + for (i = 0; i < num_entries; i++) { + cqe = queue_head(cq->queue); + if (!cqe) + break; + + memcpy(wc++, &cqe->ibwc, sizeof(*wc)); + advance_consumer(cq->queue); + } + spin_unlock_irqrestore(&cq->cq_lock, flags); + + return i; +} + +static int rxe_peek_cq(struct ib_cq *ibcq, int wc_cnt) +{ + struct rxe_cq *cq = to_rcq(ibcq); + int count = queue_count(cq->queue); + + return (count > wc_cnt) ? wc_cnt : count; +} + +static int rxe_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) +{ + struct rxe_cq *cq = to_rcq(ibcq); + + if (cq->notify != IB_CQ_NEXT_COMP) + cq->notify = flags & IB_CQ_SOLICITED_MASK; + + return 0; +} + +static struct ib_mr *rxe_get_dma_mr(struct ib_pd *ibpd, int access) +{ + struct rxe_dev *rxe = to_rdev(ibpd->device); + struct rxe_pd *pd = to_rpd(ibpd); + struct rxe_mem *mr; + int err; + + mr = rxe_alloc(&rxe->mr_pool); + if (!mr) { + err = -ENOMEM; + goto err1; + } + + rxe_add_index(mr); + + rxe_add_ref(pd); + + err = rxe_mem_init_dma(rxe, pd, access, mr); + if (err) + goto err2; + + return &mr->ibmr; + +err2: + rxe_drop_ref(pd); + rxe_drop_index(mr); + rxe_drop_ref(mr); +err1: + return ERR_PTR(err); +} + +static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, + u64 start, + u64 length, + u64 iova, + int access, struct ib_udata *udata) +{ + int err; + struct rxe_dev *rxe = to_rdev(ibpd->device); + struct rxe_pd *pd = to_rpd(ibpd); + struct rxe_mem *mr; + + mr = rxe_alloc(&rxe->mr_pool); + if (!mr) { + err = -ENOMEM; + goto err2; + } + + rxe_add_index(mr); + + rxe_add_ref(pd); + + err = rxe_mem_init_user(rxe, pd, start, length, iova, + access, udata, mr); + if (err) + goto err3; + + return &mr->ibmr; + +err3: + rxe_drop_ref(pd); + rxe_drop_index(mr); + rxe_drop_ref(mr); +err2: + return ERR_PTR(err); +} + +static int rxe_dereg_mr(struct ib_mr *ibmr) +{ + struct rxe_mem *mr = to_rmr(ibmr); + + mr->state = RXE_MEM_STATE_ZOMBIE; + rxe_drop_ref(mr->pd); + rxe_drop_index(mr); + rxe_drop_ref(mr); + return 0; +} + +static struct ib_mr *rxe_alloc_mr(struct ib_pd *ibpd, + enum ib_mr_type mr_type, + u32 max_num_sg) +{ + struct rxe_dev *rxe = to_rdev(ibpd->device); + struct rxe_pd *pd = to_rpd(ibpd); + struct rxe_mem *mr; + int err; + + if (mr_type != IB_MR_TYPE_MEM_REG) + return ERR_PTR(-EINVAL); + + mr = rxe_alloc(&rxe->mr_pool); + if (!mr) { + err = -ENOMEM; + goto err1; + } + + rxe_add_index(mr); + + rxe_add_ref(pd); + + err = rxe_mem_init_fast(rxe, pd, max_num_sg, mr); + if (err) + goto err2; + + return &mr->ibmr; + +err2: + rxe_drop_ref(pd); + rxe_drop_index(mr); + rxe_drop_ref(mr); +err1: + return ERR_PTR(err); +} + +static int rxe_set_page(struct ib_mr *ibmr, u64 addr) +{ + struct rxe_mem *mr = to_rmr(ibmr); + struct rxe_map *map; + struct rxe_phys_buf *buf; + + if (unlikely(mr->nbuf == mr->num_buf)) + return -ENOMEM; + + map = mr->map[mr->nbuf / RXE_BUF_PER_MAP]; + buf = &map->buf[mr->nbuf % RXE_BUF_PER_MAP]; + + buf->addr = addr; + buf->size = ibmr->page_size; + mr->nbuf++; + + return 0; +} + +static int rxe_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset) +{ + struct rxe_mem *mr = to_rmr(ibmr); + int n; + + mr->nbuf = 0; + + n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, rxe_set_page); + + mr->va = ibmr->iova; + mr->iova = ibmr->iova; + mr->length = ibmr->length; + mr->page_shift = ilog2(ibmr->page_size); + mr->page_mask = ibmr->page_size - 1; + mr->offset = mr->iova & mr->page_mask; + + return n; +} + +static int rxe_attach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid) +{ + int err; + struct rxe_dev *rxe = to_rdev(ibqp->device); + struct rxe_qp *qp = to_rqp(ibqp); + struct rxe_mc_grp *grp; + + /* takes a ref on grp if successful */ + err = rxe_mcast_get_grp(rxe, mgid, &grp); + if (err) + return err; + + err = rxe_mcast_add_grp_elem(rxe, qp, grp); + + rxe_drop_ref(grp); + return err; +} + +static int rxe_detach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid) +{ + struct rxe_dev *rxe = to_rdev(ibqp->device); + struct rxe_qp *qp = to_rqp(ibqp); + + return rxe_mcast_drop_grp_elem(rxe, qp, mgid); +} + +static ssize_t rxe_show_parent(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct rxe_dev *rxe = container_of(device, struct rxe_dev, + ib_dev.dev); + char *name; + + name = rxe->ifc_ops->parent_name(rxe, 1); + return snprintf(buf, 16, "%s\n", name); +} + +static DEVICE_ATTR(parent, S_IRUGO, rxe_show_parent, NULL); + +static struct device_attribute *rxe_dev_attributes[] = { + &dev_attr_parent, +}; + +int rxe_register_device(struct rxe_dev *rxe) +{ + int err; + int i; + struct ib_device *dev = &rxe->ib_dev; + + strlcpy(dev->name, "rxe%d", IB_DEVICE_NAME_MAX); + strlcpy(dev->node_desc, "rxe", sizeof(dev->node_desc)); + + dev->owner = THIS_MODULE; + dev->node_type = RDMA_NODE_IB_CA; + dev->phys_port_cnt = 1; + dev->num_comp_vectors = RXE_NUM_COMP_VECTORS; + dev->dma_device = rxe->ifc_ops->dma_device(rxe); + dev->local_dma_lkey = 0; + dev->node_guid = rxe->ifc_ops->node_guid(rxe); + dev->dma_ops = &rxe_dma_mapping_ops; + + dev->uverbs_abi_ver = RXE_UVERBS_ABI_VERSION; + dev->uverbs_cmd_mask = BIT_ULL(IB_USER_VERBS_CMD_GET_CONTEXT) + | BIT_ULL(IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) + | BIT_ULL(IB_USER_VERBS_CMD_QUERY_DEVICE) + | BIT_ULL(IB_USER_VERBS_CMD_QUERY_PORT) + | BIT_ULL(IB_USER_VERBS_CMD_ALLOC_PD) + | BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_PD) + | BIT_ULL(IB_USER_VERBS_CMD_CREATE_SRQ) + | BIT_ULL(IB_USER_VERBS_CMD_MODIFY_SRQ) + | BIT_ULL(IB_USER_VERBS_CMD_QUERY_SRQ) + | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_SRQ) + | BIT_ULL(IB_USER_VERBS_CMD_POST_SRQ_RECV) + | BIT_ULL(IB_USER_VERBS_CMD_CREATE_QP) + | BIT_ULL(IB_USER_VERBS_CMD_MODIFY_QP) + | BIT_ULL(IB_USER_VERBS_CMD_QUERY_QP) + | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_QP) + | BIT_ULL(IB_USER_VERBS_CMD_POST_SEND) + | BIT_ULL(IB_USER_VERBS_CMD_POST_RECV) + | BIT_ULL(IB_USER_VERBS_CMD_CREATE_CQ) + | BIT_ULL(IB_USER_VERBS_CMD_RESIZE_CQ) + | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_CQ) + | BIT_ULL(IB_USER_VERBS_CMD_POLL_CQ) + | BIT_ULL(IB_USER_VERBS_CMD_PEEK_CQ) + | BIT_ULL(IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) + | BIT_ULL(IB_USER_VERBS_CMD_REG_MR) + | BIT_ULL(IB_USER_VERBS_CMD_DEREG_MR) + | BIT_ULL(IB_USER_VERBS_CMD_CREATE_AH) + | BIT_ULL(IB_USER_VERBS_CMD_MODIFY_AH) + | BIT_ULL(IB_USER_VERBS_CMD_QUERY_AH) + | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_AH) + | BIT_ULL(IB_USER_VERBS_CMD_ATTACH_MCAST) + | BIT_ULL(IB_USER_VERBS_CMD_DETACH_MCAST) + ; + + dev->query_device = rxe_query_device; + dev->modify_device = rxe_modify_device; + dev->query_port = rxe_query_port; + dev->modify_port = rxe_modify_port; + dev->get_link_layer = rxe_get_link_layer; + dev->query_gid = rxe_query_gid; + dev->get_netdev = rxe_get_netdev; + dev->add_gid = rxe_add_gid; + dev->del_gid = rxe_del_gid; + dev->query_pkey = rxe_query_pkey; + dev->alloc_ucontext = rxe_alloc_ucontext; + dev->dealloc_ucontext = rxe_dealloc_ucontext; + dev->mmap = rxe_mmap; + dev->get_port_immutable = rxe_port_immutable; + dev->alloc_pd = rxe_alloc_pd; + dev->dealloc_pd = rxe_dealloc_pd; + dev->create_ah = rxe_create_ah; + dev->modify_ah = rxe_modify_ah; + dev->query_ah = rxe_query_ah; + dev->destroy_ah = rxe_destroy_ah; + dev->create_srq = rxe_create_srq; + dev->modify_srq = rxe_modify_srq; + dev->query_srq = rxe_query_srq; + dev->destroy_srq = rxe_destroy_srq; + dev->post_srq_recv = rxe_post_srq_recv; + dev->create_qp = rxe_create_qp; + dev->modify_qp = rxe_modify_qp; + dev->query_qp = rxe_query_qp; + dev->destroy_qp = rxe_destroy_qp; + dev->post_send = rxe_post_send; + dev->post_recv = rxe_post_recv; + dev->create_cq = rxe_create_cq; + dev->destroy_cq = rxe_destroy_cq; + dev->resize_cq = rxe_resize_cq; + dev->poll_cq = rxe_poll_cq; + dev->peek_cq = rxe_peek_cq; + dev->req_notify_cq = rxe_req_notify_cq; + dev->get_dma_mr = rxe_get_dma_mr; + dev->reg_user_mr = rxe_reg_user_mr; + dev->dereg_mr = rxe_dereg_mr; + dev->alloc_mr = rxe_alloc_mr; + dev->map_mr_sg = rxe_map_mr_sg; + dev->attach_mcast = rxe_attach_mcast; + dev->detach_mcast = rxe_detach_mcast; + + err = ib_register_device(dev, NULL); + if (err) { + pr_warn("rxe_register_device failed, err = %d\n", err); + goto err1; + } + + for (i = 0; i < ARRAY_SIZE(rxe_dev_attributes); ++i) { + err = device_create_file(&dev->dev, rxe_dev_attributes[i]); + if (err) { + pr_warn("device_create_file failed, i = %d, err = %d\n", + i, err); + goto err2; + } + } + + return 0; + +err2: + ib_unregister_device(dev); +err1: + return err; +} + +int rxe_unregister_device(struct rxe_dev *rxe) +{ + int i; + struct ib_device *dev = &rxe->ib_dev; + + for (i = 0; i < ARRAY_SIZE(rxe_dev_attributes); ++i) + device_remove_file(&dev->dev, rxe_dev_attributes[i]); + + ib_unregister_device(dev); + + return 0; +} diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h new file mode 100644 index 000000000000..cac1d52a08f0 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -0,0 +1,480 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef RXE_VERBS_H +#define RXE_VERBS_H + +#include +#include +#include "rxe_pool.h" +#include "rxe_task.h" + +static inline int pkey_match(u16 key1, u16 key2) +{ + return (((key1 & 0x7fff) != 0) && + ((key1 & 0x7fff) == (key2 & 0x7fff)) && + ((key1 & 0x8000) || (key2 & 0x8000))) ? 1 : 0; +} + +/* Return >0 if psn_a > psn_b + * 0 if psn_a == psn_b + * <0 if psn_a < psn_b + */ +static inline int psn_compare(u32 psn_a, u32 psn_b) +{ + s32 diff; + + diff = (psn_a - psn_b) << 8; + return diff; +} + +struct rxe_ucontext { + struct rxe_pool_entry pelem; + struct ib_ucontext ibuc; +}; + +struct rxe_pd { + struct rxe_pool_entry pelem; + struct ib_pd ibpd; +}; + +struct rxe_ah { + struct rxe_pool_entry pelem; + struct ib_ah ibah; + struct rxe_pd *pd; + struct rxe_av av; +}; + +struct rxe_cqe { + union { + struct ib_wc ibwc; + struct ib_uverbs_wc uibwc; + }; +}; + +struct rxe_cq { + struct rxe_pool_entry pelem; + struct ib_cq ibcq; + struct rxe_queue *queue; + spinlock_t cq_lock; + u8 notify; + int is_user; + struct tasklet_struct comp_task; +}; + +enum wqe_state { + wqe_state_posted, + wqe_state_processing, + wqe_state_pending, + wqe_state_done, + wqe_state_error, +}; + +struct rxe_sq { + int max_wr; + int max_sge; + int max_inline; + spinlock_t sq_lock; /* guard queue */ + struct rxe_queue *queue; +}; + +struct rxe_rq { + int max_wr; + int max_sge; + spinlock_t producer_lock; /* guard queue producer */ + spinlock_t consumer_lock; /* guard queue consumer */ + struct rxe_queue *queue; +}; + +struct rxe_srq { + struct rxe_pool_entry pelem; + struct ib_srq ibsrq; + struct rxe_pd *pd; + struct rxe_rq rq; + u32 srq_num; + + int limit; + int error; +}; + +enum rxe_qp_state { + QP_STATE_RESET, + QP_STATE_INIT, + QP_STATE_READY, + QP_STATE_DRAIN, /* req only */ + QP_STATE_DRAINED, /* req only */ + QP_STATE_ERROR +}; + +extern char *rxe_qp_state_name[]; + +struct rxe_req_info { + enum rxe_qp_state state; + int wqe_index; + u32 psn; + int opcode; + atomic_t rd_atomic; + int wait_fence; + int need_rd_atomic; + int wait_psn; + int need_retry; + int noack_pkts; + struct rxe_task task; +}; + +struct rxe_comp_info { + u32 psn; + int opcode; + int timeout; + int timeout_retry; + u32 retry_cnt; + u32 rnr_retry; + struct rxe_task task; +}; + +enum rdatm_res_state { + rdatm_res_state_next, + rdatm_res_state_new, + rdatm_res_state_replay, +}; + +struct resp_res { + int type; + u32 first_psn; + u32 last_psn; + u32 cur_psn; + enum rdatm_res_state state; + + union { + struct { + struct sk_buff *skb; + } atomic; + struct { + struct rxe_mem *mr; + u64 va_org; + u32 rkey; + u32 length; + u64 va; + u32 resid; + } read; + }; +}; + +struct rxe_resp_info { + enum rxe_qp_state state; + u32 msn; + u32 psn; + int opcode; + int drop_msg; + int goto_error; + int sent_psn_nak; + enum ib_wc_status status; + u8 aeth_syndrome; + + /* Receive only */ + struct rxe_recv_wqe *wqe; + + /* RDMA read / atomic only */ + u64 va; + struct rxe_mem *mr; + u32 resid; + u32 rkey; + u64 atomic_orig; + + /* SRQ only */ + struct { + struct rxe_recv_wqe wqe; + struct ib_sge sge[RXE_MAX_SGE]; + } srq_wqe; + + /* Responder resources. It's a circular list where the oldest + * resource is dropped first. + */ + struct resp_res *resources; + unsigned int res_head; + unsigned int res_tail; + struct resp_res *res; + struct rxe_task task; +}; + +struct rxe_qp { + struct rxe_pool_entry pelem; + struct ib_qp ibqp; + struct ib_qp_attr attr; + unsigned int valid; + unsigned int mtu; + int is_user; + + struct rxe_pd *pd; + struct rxe_srq *srq; + struct rxe_cq *scq; + struct rxe_cq *rcq; + + enum ib_sig_type sq_sig_type; + + struct rxe_sq sq; + struct rxe_rq rq; + + struct socket *sk; + + struct rxe_av pri_av; + struct rxe_av alt_av; + + /* list of mcast groups qp has joined (for cleanup) */ + struct list_head grp_list; + spinlock_t grp_lock; /* guard grp_list */ + + struct sk_buff_head req_pkts; + struct sk_buff_head resp_pkts; + struct sk_buff_head send_pkts; + + struct rxe_req_info req; + struct rxe_comp_info comp; + struct rxe_resp_info resp; + + atomic_t ssn; + atomic_t skb_out; + int need_req_skb; + + /* Timer for retranmitting packet when ACKs have been lost. RC + * only. The requester sets it when it is not already + * started. The responder resets it whenever an ack is + * received. + */ + struct timer_list retrans_timer; + u64 qp_timeout_jiffies; + + /* Timer for handling RNR NAKS. */ + struct timer_list rnr_nak_timer; + + spinlock_t state_lock; /* guard requester and completer */ +}; + +enum rxe_mem_state { + RXE_MEM_STATE_ZOMBIE, + RXE_MEM_STATE_INVALID, + RXE_MEM_STATE_FREE, + RXE_MEM_STATE_VALID, +}; + +enum rxe_mem_type { + RXE_MEM_TYPE_NONE, + RXE_MEM_TYPE_DMA, + RXE_MEM_TYPE_MR, + RXE_MEM_TYPE_FMR, + RXE_MEM_TYPE_MW, +}; + +#define RXE_BUF_PER_MAP (PAGE_SIZE / sizeof(struct rxe_phys_buf)) + +struct rxe_phys_buf { + u64 addr; + u64 size; +}; + +struct rxe_map { + struct rxe_phys_buf buf[RXE_BUF_PER_MAP]; +}; + +struct rxe_mem { + struct rxe_pool_entry pelem; + union { + struct ib_mr ibmr; + struct ib_mw ibmw; + }; + + struct rxe_pd *pd; + struct ib_umem *umem; + + u32 lkey; + u32 rkey; + + enum rxe_mem_state state; + enum rxe_mem_type type; + u64 va; + u64 iova; + size_t length; + u32 offset; + int access; + + int page_shift; + int page_mask; + int map_shift; + int map_mask; + + u32 num_buf; + u32 nbuf; + + u32 max_buf; + u32 num_map; + + struct rxe_map **map; +}; + +struct rxe_mc_grp { + struct rxe_pool_entry pelem; + spinlock_t mcg_lock; /* guard group */ + struct rxe_dev *rxe; + struct list_head qp_list; + union ib_gid mgid; + int num_qp; + u32 qkey; + u16 pkey; +}; + +struct rxe_mc_elem { + struct rxe_pool_entry pelem; + struct list_head qp_list; + struct list_head grp_list; + struct rxe_qp *qp; + struct rxe_mc_grp *grp; +}; + +struct rxe_port { + struct ib_port_attr attr; + u16 *pkey_tbl; + __be64 port_guid; + __be64 subnet_prefix; + spinlock_t port_lock; /* guard port */ + unsigned int mtu_cap; + /* special QPs */ + u32 qp_smi_index; + u32 qp_gsi_index; +}; + +/* callbacks from rdma_rxe to network interface layer */ +struct rxe_ifc_ops { + void (*release)(struct rxe_dev *rxe); + __be64 (*node_guid)(struct rxe_dev *rxe); + __be64 (*port_guid)(struct rxe_dev *rxe); + struct device *(*dma_device)(struct rxe_dev *rxe); + int (*mcast_add)(struct rxe_dev *rxe, union ib_gid *mgid); + int (*mcast_delete)(struct rxe_dev *rxe, union ib_gid *mgid); + int (*prepare)(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, + struct sk_buff *skb, u32 *crc); + int (*send)(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, + struct sk_buff *skb); + int (*loopback)(struct sk_buff *skb); + struct sk_buff *(*init_packet)(struct rxe_dev *rxe, struct rxe_av *av, + int paylen, struct rxe_pkt_info *pkt); + char *(*parent_name)(struct rxe_dev *rxe, unsigned int port_num); + enum rdma_link_layer (*link_layer)(struct rxe_dev *rxe, + unsigned int port_num); +}; + +struct rxe_dev { + struct ib_device ib_dev; + struct ib_device_attr attr; + int max_ucontext; + int max_inline_data; + struct kref ref_cnt; + struct mutex usdev_lock; + + struct rxe_ifc_ops *ifc_ops; + + struct net_device *ndev; + + int xmit_errors; + + struct rxe_pool uc_pool; + struct rxe_pool pd_pool; + struct rxe_pool ah_pool; + struct rxe_pool srq_pool; + struct rxe_pool qp_pool; + struct rxe_pool cq_pool; + struct rxe_pool mr_pool; + struct rxe_pool mw_pool; + struct rxe_pool mc_grp_pool; + struct rxe_pool mc_elem_pool; + + spinlock_t pending_lock; /* guard pending_mmaps */ + struct list_head pending_mmaps; + + spinlock_t mmap_offset_lock; /* guard mmap_offset */ + int mmap_offset; + + struct rxe_port port; + struct list_head list; +}; + +static inline struct rxe_dev *to_rdev(struct ib_device *dev) +{ + return dev ? container_of(dev, struct rxe_dev, ib_dev) : NULL; +} + +static inline struct rxe_ucontext *to_ruc(struct ib_ucontext *uc) +{ + return uc ? container_of(uc, struct rxe_ucontext, ibuc) : NULL; +} + +static inline struct rxe_pd *to_rpd(struct ib_pd *pd) +{ + return pd ? container_of(pd, struct rxe_pd, ibpd) : NULL; +} + +static inline struct rxe_ah *to_rah(struct ib_ah *ah) +{ + return ah ? container_of(ah, struct rxe_ah, ibah) : NULL; +} + +static inline struct rxe_srq *to_rsrq(struct ib_srq *srq) +{ + return srq ? container_of(srq, struct rxe_srq, ibsrq) : NULL; +} + +static inline struct rxe_qp *to_rqp(struct ib_qp *qp) +{ + return qp ? container_of(qp, struct rxe_qp, ibqp) : NULL; +} + +static inline struct rxe_cq *to_rcq(struct ib_cq *cq) +{ + return cq ? container_of(cq, struct rxe_cq, ibcq) : NULL; +} + +static inline struct rxe_mem *to_rmr(struct ib_mr *mr) +{ + return mr ? container_of(mr, struct rxe_mem, ibmr) : NULL; +} + +static inline struct rxe_mem *to_rmw(struct ib_mw *mw) +{ + return mw ? container_of(mw, struct rxe_mem, ibmw) : NULL; +} + +int rxe_register_device(struct rxe_dev *rxe); +int rxe_unregister_device(struct rxe_dev *rxe); + +void rxe_mc_cleanup(void *arg); + +#endif /* RXE_VERBS_H */ diff --git a/include/uapi/rdma/Kbuild b/include/uapi/rdma/Kbuild index 231901b08f6c..4edb0f2b4f9f 100644 --- a/include/uapi/rdma/Kbuild +++ b/include/uapi/rdma/Kbuild @@ -6,3 +6,4 @@ header-y += ib_user_verbs.h header-y += rdma_netlink.h header-y += rdma_user_cm.h header-y += hfi/ +header-y += rdma_user_rxe.h diff --git a/include/uapi/rdma/rdma_user_rxe.h b/include/uapi/rdma/rdma_user_rxe.h new file mode 100644 index 000000000000..1de99cfdaf7d --- /dev/null +++ b/include/uapi/rdma/rdma_user_rxe.h @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef RDMA_USER_RXE_H +#define RDMA_USER_RXE_H + +#include + +union rxe_gid { + __u8 raw[16]; + struct { + __be64 subnet_prefix; + __be64 interface_id; + } global; +}; + +struct rxe_global_route { + union rxe_gid dgid; + __u32 flow_label; + __u8 sgid_index; + __u8 hop_limit; + __u8 traffic_class; +}; + +struct rxe_av { + __u8 port_num; + __u8 network_type; + struct rxe_global_route grh; + union { + struct sockaddr _sockaddr; + struct sockaddr_in _sockaddr_in; + struct sockaddr_in6 _sockaddr_in6; + } sgid_addr, dgid_addr; +}; + +struct rxe_send_wr { + __u64 wr_id; + __u32 num_sge; + __u32 opcode; + __u32 send_flags; + union { + __be32 imm_data; + __u32 invalidate_rkey; + } ex; + union { + struct { + __u64 remote_addr; + __u32 rkey; + } rdma; + struct { + __u64 remote_addr; + __u64 compare_add; + __u64 swap; + __u32 rkey; + } atomic; + struct { + __u32 remote_qpn; + __u32 remote_qkey; + __u16 pkey_index; + } ud; + struct { + struct ib_mr *mr; + __u32 key; + int access; + } reg; + } wr; +}; + +struct rxe_sge { + __u64 addr; + __u32 length; + __u32 lkey; +}; + +struct mminfo { + __u64 offset; + __u32 size; + __u32 pad; +}; + +struct rxe_dma_info { + __u32 length; + __u32 resid; + __u32 cur_sge; + __u32 num_sge; + __u32 sge_offset; + union { + __u8 inline_data[0]; + struct rxe_sge sge[0]; + }; +}; + +struct rxe_send_wqe { + struct rxe_send_wr wr; + struct rxe_av av; + __u32 status; + __u32 state; + __u64 iova; + __u32 mask; + __u32 first_psn; + __u32 last_psn; + __u32 ack_length; + __u32 ssn; + __u32 has_rd_atomic; + struct rxe_dma_info dma; +}; + +struct rxe_recv_wqe { + __u64 wr_id; + __u32 num_sge; + __u32 padding; + struct rxe_dma_info dma; +}; + +#endif /* RDMA_USER_RXE_H */ -- cgit v1.2.3-71-gd317