cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

file_ops.c (43207B)


      1// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
      2/*
      3 * Copyright(c) 2020 Cornelis Networks, Inc.
      4 * Copyright(c) 2015-2020 Intel Corporation.
      5 */
      6
      7#include <linux/poll.h>
      8#include <linux/cdev.h>
      9#include <linux/vmalloc.h>
     10#include <linux/io.h>
     11#include <linux/sched/mm.h>
     12#include <linux/bitmap.h>
     13
     14#include <rdma/ib.h>
     15
     16#include "hfi.h"
     17#include "pio.h"
     18#include "device.h"
     19#include "common.h"
     20#include "trace.h"
     21#include "mmu_rb.h"
     22#include "user_sdma.h"
     23#include "user_exp_rcv.h"
     24#include "aspm.h"
     25
     26#undef pr_fmt
     27#define pr_fmt(fmt) DRIVER_NAME ": " fmt
     28
     29#define SEND_CTXT_HALT_TIMEOUT 1000 /* msecs */
     30
     31/*
     32 * File operation functions
     33 */
     34static int hfi1_file_open(struct inode *inode, struct file *fp);
     35static int hfi1_file_close(struct inode *inode, struct file *fp);
     36static ssize_t hfi1_write_iter(struct kiocb *kiocb, struct iov_iter *from);
     37static __poll_t hfi1_poll(struct file *fp, struct poll_table_struct *pt);
     38static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma);
     39
     40static u64 kvirt_to_phys(void *addr);
     41static int assign_ctxt(struct hfi1_filedata *fd, unsigned long arg, u32 len);
     42static void init_subctxts(struct hfi1_ctxtdata *uctxt,
     43			  const struct hfi1_user_info *uinfo);
     44static int init_user_ctxt(struct hfi1_filedata *fd,
     45			  struct hfi1_ctxtdata *uctxt);
     46static void user_init(struct hfi1_ctxtdata *uctxt);
     47static int get_ctxt_info(struct hfi1_filedata *fd, unsigned long arg, u32 len);
     48static int get_base_info(struct hfi1_filedata *fd, unsigned long arg, u32 len);
     49static int user_exp_rcv_setup(struct hfi1_filedata *fd, unsigned long arg,
     50			      u32 len);
     51static int user_exp_rcv_clear(struct hfi1_filedata *fd, unsigned long arg,
     52			      u32 len);
     53static int user_exp_rcv_invalid(struct hfi1_filedata *fd, unsigned long arg,
     54				u32 len);
     55static int setup_base_ctxt(struct hfi1_filedata *fd,
     56			   struct hfi1_ctxtdata *uctxt);
     57static int setup_subctxt(struct hfi1_ctxtdata *uctxt);
     58
     59static int find_sub_ctxt(struct hfi1_filedata *fd,
     60			 const struct hfi1_user_info *uinfo);
     61static int allocate_ctxt(struct hfi1_filedata *fd, struct hfi1_devdata *dd,
     62			 struct hfi1_user_info *uinfo,
     63			 struct hfi1_ctxtdata **cd);
     64static void deallocate_ctxt(struct hfi1_ctxtdata *uctxt);
     65static __poll_t poll_urgent(struct file *fp, struct poll_table_struct *pt);
     66static __poll_t poll_next(struct file *fp, struct poll_table_struct *pt);
     67static int user_event_ack(struct hfi1_ctxtdata *uctxt, u16 subctxt,
     68			  unsigned long arg);
     69static int set_ctxt_pkey(struct hfi1_ctxtdata *uctxt, unsigned long arg);
     70static int ctxt_reset(struct hfi1_ctxtdata *uctxt);
     71static int manage_rcvq(struct hfi1_ctxtdata *uctxt, u16 subctxt,
     72		       unsigned long arg);
     73static vm_fault_t vma_fault(struct vm_fault *vmf);
     74static long hfi1_file_ioctl(struct file *fp, unsigned int cmd,
     75			    unsigned long arg);
     76
     77static const struct file_operations hfi1_file_ops = {
     78	.owner = THIS_MODULE,
     79	.write_iter = hfi1_write_iter,
     80	.open = hfi1_file_open,
     81	.release = hfi1_file_close,
     82	.unlocked_ioctl = hfi1_file_ioctl,
     83	.poll = hfi1_poll,
     84	.mmap = hfi1_file_mmap,
     85	.llseek = noop_llseek,
     86};
     87
     88static const struct vm_operations_struct vm_ops = {
     89	.fault = vma_fault,
     90};
     91
     92/*
      93 * Types of memory mapped into user processes' address space
     94 */
     95enum mmap_types {
     96	PIO_BUFS = 1,
     97	PIO_BUFS_SOP,
     98	PIO_CRED,
     99	RCV_HDRQ,
    100	RCV_EGRBUF,
    101	UREGS,
    102	EVENTS,
    103	STATUS,
    104	RTAIL,
    105	SUBCTXT_UREGS,
    106	SUBCTXT_RCV_HDRQ,
    107	SUBCTXT_EGRBUF,
    108	SDMA_COMP
    109};
    110
    111/*
    112 * Masks and offsets defining the mmap tokens
    113 */
    114#define HFI1_MMAP_OFFSET_MASK   0xfffULL
    115#define HFI1_MMAP_OFFSET_SHIFT  0
    116#define HFI1_MMAP_SUBCTXT_MASK  0xfULL
    117#define HFI1_MMAP_SUBCTXT_SHIFT 12
    118#define HFI1_MMAP_CTXT_MASK     0xffULL
    119#define HFI1_MMAP_CTXT_SHIFT    16
    120#define HFI1_MMAP_TYPE_MASK     0xfULL
    121#define HFI1_MMAP_TYPE_SHIFT    24
    122#define HFI1_MMAP_MAGIC_MASK    0xffffffffULL
    123#define HFI1_MMAP_MAGIC_SHIFT   32
    124
    125#define HFI1_MMAP_MAGIC         0xdabbad00
    126
    127#define HFI1_MMAP_TOKEN_SET(field, val)	\
    128	(((val) & HFI1_MMAP_##field##_MASK) << HFI1_MMAP_##field##_SHIFT)
    129#define HFI1_MMAP_TOKEN_GET(field, token) \
    130	(((token) >> HFI1_MMAP_##field##_SHIFT) & HFI1_MMAP_##field##_MASK)
    131#define HFI1_MMAP_TOKEN(type, ctxt, subctxt, addr)   \
    132	(HFI1_MMAP_TOKEN_SET(MAGIC, HFI1_MMAP_MAGIC) | \
    133	HFI1_MMAP_TOKEN_SET(TYPE, type) | \
    134	HFI1_MMAP_TOKEN_SET(CTXT, ctxt) | \
    135	HFI1_MMAP_TOKEN_SET(SUBCTXT, subctxt) | \
    136	HFI1_MMAP_TOKEN_SET(OFFSET, (offset_in_page(addr))))
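
/*
 * Illustrative sketch (not part of the original file): how a token built by
 * HFI1_MMAP_TOKEN() is laid out, and how the GET macros take it apart.  The
 * helper name below is hypothetical and exists only as an example.
 *
 *   bits 32..63  MAGIC    (0xdabbad00)
 *   bits 24..27  TYPE     (enum mmap_types)
 *   bits 16..23  CTXT
 *   bits 12..15  SUBCTXT
 *   bits  0..11  OFFSET   (offset of addr within its page)
 */
static inline void __maybe_unused example_decode_mmap_token(u64 token, u8 *type,
							     u16 *ctxt,
							     u8 *subctxt,
							     u16 *offset)
{
	*type = HFI1_MMAP_TOKEN_GET(TYPE, token);
	*ctxt = HFI1_MMAP_TOKEN_GET(CTXT, token);
	*subctxt = HFI1_MMAP_TOKEN_GET(SUBCTXT, token);
	*offset = HFI1_MMAP_TOKEN_GET(OFFSET, token);
}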
    137
    138#define dbg(fmt, ...)				\
    139	pr_info(fmt, ##__VA_ARGS__)
    140
    141static inline int is_valid_mmap(u64 token)
    142{
    143	return (HFI1_MMAP_TOKEN_GET(MAGIC, token) == HFI1_MMAP_MAGIC);
    144}
    145
    146static int hfi1_file_open(struct inode *inode, struct file *fp)
    147{
    148	struct hfi1_filedata *fd;
    149	struct hfi1_devdata *dd = container_of(inode->i_cdev,
    150					       struct hfi1_devdata,
    151					       user_cdev);
    152
    153	if (!((dd->flags & HFI1_PRESENT) && dd->kregbase1))
    154		return -EINVAL;
    155
    156	if (!refcount_inc_not_zero(&dd->user_refcount))
    157		return -ENXIO;
    158
    159	/* The real work is performed later in assign_ctxt() */
    160
    161	fd = kzalloc(sizeof(*fd), GFP_KERNEL);
    162
    163	if (!fd || init_srcu_struct(&fd->pq_srcu))
    164		goto nomem;
    165	spin_lock_init(&fd->pq_rcu_lock);
    166	spin_lock_init(&fd->tid_lock);
    167	spin_lock_init(&fd->invalid_lock);
    168	fd->rec_cpu_num = -1; /* no cpu affinity by default */
    169	fd->dd = dd;
    170	fp->private_data = fd;
    171	return 0;
    172nomem:
    173	kfree(fd);
    174	fp->private_data = NULL;
    175	if (refcount_dec_and_test(&dd->user_refcount))
    176		complete(&dd->user_comp);
    177	return -ENOMEM;
    178}
    179
    180static long hfi1_file_ioctl(struct file *fp, unsigned int cmd,
    181			    unsigned long arg)
    182{
    183	struct hfi1_filedata *fd = fp->private_data;
    184	struct hfi1_ctxtdata *uctxt = fd->uctxt;
    185	int ret = 0;
    186	int uval = 0;
    187
    188	hfi1_cdbg(IOCTL, "IOCTL recv: 0x%x", cmd);
    189	if (cmd != HFI1_IOCTL_ASSIGN_CTXT &&
    190	    cmd != HFI1_IOCTL_GET_VERS &&
    191	    !uctxt)
    192		return -EINVAL;
    193
    194	switch (cmd) {
    195	case HFI1_IOCTL_ASSIGN_CTXT:
    196		ret = assign_ctxt(fd, arg, _IOC_SIZE(cmd));
    197		break;
    198
    199	case HFI1_IOCTL_CTXT_INFO:
    200		ret = get_ctxt_info(fd, arg, _IOC_SIZE(cmd));
    201		break;
    202
    203	case HFI1_IOCTL_USER_INFO:
    204		ret = get_base_info(fd, arg, _IOC_SIZE(cmd));
    205		break;
    206
    207	case HFI1_IOCTL_CREDIT_UPD:
    208		if (uctxt)
    209			sc_return_credits(uctxt->sc);
    210		break;
    211
    212	case HFI1_IOCTL_TID_UPDATE:
    213		ret = user_exp_rcv_setup(fd, arg, _IOC_SIZE(cmd));
    214		break;
    215
    216	case HFI1_IOCTL_TID_FREE:
    217		ret = user_exp_rcv_clear(fd, arg, _IOC_SIZE(cmd));
    218		break;
    219
    220	case HFI1_IOCTL_TID_INVAL_READ:
    221		ret = user_exp_rcv_invalid(fd, arg, _IOC_SIZE(cmd));
    222		break;
    223
    224	case HFI1_IOCTL_RECV_CTRL:
    225		ret = manage_rcvq(uctxt, fd->subctxt, arg);
    226		break;
    227
    228	case HFI1_IOCTL_POLL_TYPE:
    229		if (get_user(uval, (int __user *)arg))
    230			return -EFAULT;
    231		uctxt->poll_type = (typeof(uctxt->poll_type))uval;
    232		break;
    233
    234	case HFI1_IOCTL_ACK_EVENT:
    235		ret = user_event_ack(uctxt, fd->subctxt, arg);
    236		break;
    237
    238	case HFI1_IOCTL_SET_PKEY:
    239		ret = set_ctxt_pkey(uctxt, arg);
    240		break;
    241
    242	case HFI1_IOCTL_CTXT_RESET:
    243		ret = ctxt_reset(uctxt);
    244		break;
    245
    246	case HFI1_IOCTL_GET_VERS:
    247		uval = HFI1_USER_SWVERSION;
    248		if (put_user(uval, (int __user *)arg))
    249			return -EFAULT;
    250		break;
    251
    252	default:
    253		return -EINVAL;
    254	}
    255
    256	return ret;
    257}
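
/*
 * Hypothetical user-space sketch (not part of this file): querying the
 * driver's software version via HFI1_IOCTL_GET_VERS.  The device path follows
 * the "%s_%d" name created in user_add() below; the uapi header location is
 * an assumption.
 */
#if 0	/* illustration only, never compiled */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <rdma/hfi/hfi1_user.h>

int example_get_vers(void)
{
	int vers;
	int fd = open("/dev/hfi1_0", O_RDWR);

	if (fd < 0)
		return -1;
	if (ioctl(fd, HFI1_IOCTL_GET_VERS, &vers) < 0) {
		close(fd);
		return -1;
	}
	printf("user sw version %d.%d\n", vers >> 16, vers & 0xffff);
	close(fd);
	return 0;
}
#endif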
    258
    259static ssize_t hfi1_write_iter(struct kiocb *kiocb, struct iov_iter *from)
    260{
    261	struct hfi1_filedata *fd = kiocb->ki_filp->private_data;
    262	struct hfi1_user_sdma_pkt_q *pq;
    263	struct hfi1_user_sdma_comp_q *cq = fd->cq;
    264	int done = 0, reqs = 0;
    265	unsigned long dim = from->nr_segs;
    266	int idx;
    267
    268	if (!HFI1_CAP_IS_KSET(SDMA))
    269		return -EINVAL;
    270	idx = srcu_read_lock(&fd->pq_srcu);
    271	pq = srcu_dereference(fd->pq, &fd->pq_srcu);
    272	if (!cq || !pq) {
    273		srcu_read_unlock(&fd->pq_srcu, idx);
    274		return -EIO;
    275	}
    276
    277	if (!iter_is_iovec(from) || !dim) {
    278		srcu_read_unlock(&fd->pq_srcu, idx);
    279		return -EINVAL;
    280	}
    281
    282	trace_hfi1_sdma_request(fd->dd, fd->uctxt->ctxt, fd->subctxt, dim);
    283
    284	if (atomic_read(&pq->n_reqs) == pq->n_max_reqs) {
    285		srcu_read_unlock(&fd->pq_srcu, idx);
    286		return -ENOSPC;
    287	}
    288
    289	while (dim) {
    290		int ret;
    291		unsigned long count = 0;
    292
    293		ret = hfi1_user_sdma_process_request(
    294			fd, (struct iovec *)(from->iov + done),
    295			dim, &count);
    296		if (ret) {
    297			reqs = ret;
    298			break;
    299		}
    300		dim -= count;
    301		done += count;
    302		reqs++;
    303	}
    304
    305	srcu_read_unlock(&fd->pq_srcu, idx);
    306	return reqs;
    307}
    308
    309static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma)
    310{
    311	struct hfi1_filedata *fd = fp->private_data;
    312	struct hfi1_ctxtdata *uctxt = fd->uctxt;
    313	struct hfi1_devdata *dd;
    314	unsigned long flags;
    315	u64 token = vma->vm_pgoff << PAGE_SHIFT,
    316		memaddr = 0;
    317	void *memvirt = NULL;
    318	u8 subctxt, mapio = 0, vmf = 0, type;
    319	ssize_t memlen = 0;
    320	int ret = 0;
    321	u16 ctxt;
    322
    323	if (!is_valid_mmap(token) || !uctxt ||
    324	    !(vma->vm_flags & VM_SHARED)) {
    325		ret = -EINVAL;
    326		goto done;
    327	}
    328	dd = uctxt->dd;
    329	ctxt = HFI1_MMAP_TOKEN_GET(CTXT, token);
    330	subctxt = HFI1_MMAP_TOKEN_GET(SUBCTXT, token);
    331	type = HFI1_MMAP_TOKEN_GET(TYPE, token);
    332	if (ctxt != uctxt->ctxt || subctxt != fd->subctxt) {
    333		ret = -EINVAL;
    334		goto done;
    335	}
    336
    337	flags = vma->vm_flags;
    338
    339	switch (type) {
    340	case PIO_BUFS:
    341	case PIO_BUFS_SOP:
    342		memaddr = ((dd->physaddr + TXE_PIO_SEND) +
    343				/* chip pio base */
    344			   (uctxt->sc->hw_context * BIT(16))) +
    345				/* 64K PIO space / ctxt */
    346			(type == PIO_BUFS_SOP ?
    347				(TXE_PIO_SIZE / 2) : 0); /* sop? */
    348		/*
    349		 * Map only the amount allocated to the context, not the
     350		 * context's entire available PIO space.
    351		 */
    352		memlen = PAGE_ALIGN(uctxt->sc->credits * PIO_BLOCK_SIZE);
    353		flags &= ~VM_MAYREAD;
    354		flags |= VM_DONTCOPY | VM_DONTEXPAND;
    355		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
    356		mapio = 1;
    357		break;
    358	case PIO_CRED:
    359		if (flags & VM_WRITE) {
    360			ret = -EPERM;
    361			goto done;
    362		}
    363		/*
    364		 * The credit return location for this context could be on the
    365		 * second or third page allocated for credit returns (if number
    366		 * of enabled contexts > 64 and 128 respectively).
    367		 */
    368		memvirt = dd->cr_base[uctxt->numa_id].va;
    369		memaddr = virt_to_phys(memvirt) +
    370			(((u64)uctxt->sc->hw_free -
    371			  (u64)dd->cr_base[uctxt->numa_id].va) & PAGE_MASK);
    372		memlen = PAGE_SIZE;
    373		flags &= ~VM_MAYWRITE;
    374		flags |= VM_DONTCOPY | VM_DONTEXPAND;
    375		/*
    376		 * The driver has already allocated memory for credit
    377		 * returns and programmed it into the chip. Has that
    378		 * memory been flagged as non-cached?
    379		 */
    380		/* vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); */
    381		mapio = 1;
    382		break;
    383	case RCV_HDRQ:
    384		memlen = rcvhdrq_size(uctxt);
    385		memvirt = uctxt->rcvhdrq;
    386		break;
    387	case RCV_EGRBUF: {
    388		unsigned long addr;
    389		int i;
    390		/*
     391		 * The RcvEgr buffers need to be handled differently
    392		 * as multiple non-contiguous pages need to be mapped
    393		 * into the user process.
    394		 */
    395		memlen = uctxt->egrbufs.size;
    396		if ((vma->vm_end - vma->vm_start) != memlen) {
    397			dd_dev_err(dd, "Eager buffer map size invalid (%lu != %lu)\n",
    398				   (vma->vm_end - vma->vm_start), memlen);
    399			ret = -EINVAL;
    400			goto done;
    401		}
    402		if (vma->vm_flags & VM_WRITE) {
    403			ret = -EPERM;
    404			goto done;
    405		}
    406		vma->vm_flags &= ~VM_MAYWRITE;
    407		addr = vma->vm_start;
    408		for (i = 0 ; i < uctxt->egrbufs.numbufs; i++) {
    409			memlen = uctxt->egrbufs.buffers[i].len;
    410			memvirt = uctxt->egrbufs.buffers[i].addr;
    411			ret = remap_pfn_range(
    412				vma, addr,
    413				/*
    414				 * virt_to_pfn() does the same, but
    415				 * it's not available on x86_64
    416				 * when CONFIG_MMU is enabled.
    417				 */
    418				PFN_DOWN(__pa(memvirt)),
    419				memlen,
    420				vma->vm_page_prot);
    421			if (ret < 0)
    422				goto done;
    423			addr += memlen;
    424		}
    425		ret = 0;
    426		goto done;
    427	}
    428	case UREGS:
    429		/*
    430		 * Map only the page that contains this context's user
    431		 * registers.
    432		 */
    433		memaddr = (unsigned long)
    434			(dd->physaddr + RXE_PER_CONTEXT_USER)
    435			+ (uctxt->ctxt * RXE_PER_CONTEXT_SIZE);
    436		/*
    437		 * TidFlow table is on the same page as the rest of the
    438		 * user registers.
    439		 */
    440		memlen = PAGE_SIZE;
    441		flags |= VM_DONTCOPY | VM_DONTEXPAND;
    442		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
    443		mapio = 1;
    444		break;
    445	case EVENTS:
    446		/*
    447		 * Use the page where this context's flags are. User level
     448		 * knows where its own bitmap is within the page.
    449		 */
    450		memaddr = (unsigned long)
    451			(dd->events + uctxt_offset(uctxt)) & PAGE_MASK;
    452		memlen = PAGE_SIZE;
    453		/*
    454		 * v3.7 removes VM_RESERVED but the effect is kept by
    455		 * using VM_IO.
    456		 */
    457		flags |= VM_IO | VM_DONTEXPAND;
    458		vmf = 1;
    459		break;
    460	case STATUS:
    461		if (flags & VM_WRITE) {
    462			ret = -EPERM;
    463			goto done;
    464		}
    465		memaddr = kvirt_to_phys((void *)dd->status);
    466		memlen = PAGE_SIZE;
    467		flags |= VM_IO | VM_DONTEXPAND;
    468		break;
    469	case RTAIL:
    470		if (!HFI1_CAP_IS_USET(DMA_RTAIL)) {
    471			/*
    472			 * If the memory allocation failed, the context alloc
    473			 * also would have failed, so we would never get here
    474			 */
    475			ret = -EINVAL;
    476			goto done;
    477		}
    478		if ((flags & VM_WRITE) || !hfi1_rcvhdrtail_kvaddr(uctxt)) {
    479			ret = -EPERM;
    480			goto done;
    481		}
    482		memlen = PAGE_SIZE;
    483		memvirt = (void *)hfi1_rcvhdrtail_kvaddr(uctxt);
    484		flags &= ~VM_MAYWRITE;
    485		break;
    486	case SUBCTXT_UREGS:
    487		memaddr = (u64)uctxt->subctxt_uregbase;
    488		memlen = PAGE_SIZE;
    489		flags |= VM_IO | VM_DONTEXPAND;
    490		vmf = 1;
    491		break;
    492	case SUBCTXT_RCV_HDRQ:
    493		memaddr = (u64)uctxt->subctxt_rcvhdr_base;
    494		memlen = rcvhdrq_size(uctxt) * uctxt->subctxt_cnt;
    495		flags |= VM_IO | VM_DONTEXPAND;
    496		vmf = 1;
    497		break;
    498	case SUBCTXT_EGRBUF:
    499		memaddr = (u64)uctxt->subctxt_rcvegrbuf;
    500		memlen = uctxt->egrbufs.size * uctxt->subctxt_cnt;
    501		flags |= VM_IO | VM_DONTEXPAND;
    502		flags &= ~VM_MAYWRITE;
    503		vmf = 1;
    504		break;
    505	case SDMA_COMP: {
    506		struct hfi1_user_sdma_comp_q *cq = fd->cq;
    507
    508		if (!cq) {
    509			ret = -EFAULT;
    510			goto done;
    511		}
    512		memaddr = (u64)cq->comps;
    513		memlen = PAGE_ALIGN(sizeof(*cq->comps) * cq->nentries);
    514		flags |= VM_IO | VM_DONTEXPAND;
    515		vmf = 1;
    516		break;
    517	}
    518	default:
    519		ret = -EINVAL;
    520		break;
    521	}
    522
    523	if ((vma->vm_end - vma->vm_start) != memlen) {
    524		hfi1_cdbg(PROC, "%u:%u Memory size mismatch %lu:%lu",
    525			  uctxt->ctxt, fd->subctxt,
    526			  (vma->vm_end - vma->vm_start), memlen);
    527		ret = -EINVAL;
    528		goto done;
    529	}
    530
    531	vma->vm_flags = flags;
    532	hfi1_cdbg(PROC,
    533		  "%u:%u type:%u io/vf:%d/%d, addr:0x%llx, len:%lu(%lu), flags:0x%lx\n",
    534		    ctxt, subctxt, type, mapio, vmf, memaddr, memlen,
    535		    vma->vm_end - vma->vm_start, vma->vm_flags);
    536	if (vmf) {
    537		vma->vm_pgoff = PFN_DOWN(memaddr);
    538		vma->vm_ops = &vm_ops;
    539		ret = 0;
    540	} else if (mapio) {
    541		ret = io_remap_pfn_range(vma, vma->vm_start,
    542					 PFN_DOWN(memaddr),
    543					 memlen,
    544					 vma->vm_page_prot);
    545	} else if (memvirt) {
    546		ret = remap_pfn_range(vma, vma->vm_start,
    547				      PFN_DOWN(__pa(memvirt)),
    548				      memlen,
    549				      vma->vm_page_prot);
    550	} else {
    551		ret = remap_pfn_range(vma, vma->vm_start,
    552				      PFN_DOWN(memaddr),
    553				      memlen,
    554				      vma->vm_page_prot);
    555	}
    556done:
    557	return ret;
    558}
    559
    560/*
     561 * Local (non-chip) user memory is not mapped right away, but only as it is
    562 * accessed by the user-level code.
    563 */
    564static vm_fault_t vma_fault(struct vm_fault *vmf)
    565{
    566	struct page *page;
    567
    568	page = vmalloc_to_page((void *)(vmf->pgoff << PAGE_SHIFT));
    569	if (!page)
    570		return VM_FAULT_SIGBUS;
    571
    572	get_page(page);
    573	vmf->page = page;
    574
    575	return 0;
    576}
    577
    578static __poll_t hfi1_poll(struct file *fp, struct poll_table_struct *pt)
    579{
    580	struct hfi1_ctxtdata *uctxt;
    581	__poll_t pollflag;
    582
    583	uctxt = ((struct hfi1_filedata *)fp->private_data)->uctxt;
    584	if (!uctxt)
    585		pollflag = EPOLLERR;
    586	else if (uctxt->poll_type == HFI1_POLL_TYPE_URGENT)
    587		pollflag = poll_urgent(fp, pt);
    588	else  if (uctxt->poll_type == HFI1_POLL_TYPE_ANYRCV)
    589		pollflag = poll_next(fp, pt);
    590	else /* invalid */
    591		pollflag = EPOLLERR;
    592
    593	return pollflag;
    594}
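
/*
 * Hypothetical user-space sketch (not part of this file): selecting a poll
 * type with HFI1_IOCTL_POLL_TYPE and then waiting on the fd.  With
 * HFI1_POLL_TYPE_ANYRCV, poll_next() below reports EPOLLIN once the receive
 * header queue is non-empty.
 */
#if 0	/* illustration only, never compiled */
	struct pollfd pfd = { .fd = fd, .events = POLLIN };
	int type = HFI1_POLL_TYPE_ANYRCV;

	ioctl(fd, HFI1_IOCTL_POLL_TYPE, &type);
	poll(&pfd, 1, -1);
#endif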
    595
    596static int hfi1_file_close(struct inode *inode, struct file *fp)
    597{
    598	struct hfi1_filedata *fdata = fp->private_data;
    599	struct hfi1_ctxtdata *uctxt = fdata->uctxt;
    600	struct hfi1_devdata *dd = container_of(inode->i_cdev,
    601					       struct hfi1_devdata,
    602					       user_cdev);
    603	unsigned long flags, *ev;
    604
    605	fp->private_data = NULL;
    606
    607	if (!uctxt)
    608		goto done;
    609
    610	hfi1_cdbg(PROC, "closing ctxt %u:%u", uctxt->ctxt, fdata->subctxt);
    611
    612	flush_wc();
    613	/* drain user sdma queue */
    614	hfi1_user_sdma_free_queues(fdata, uctxt);
    615
    616	/* release the cpu */
    617	hfi1_put_proc_affinity(fdata->rec_cpu_num);
    618
    619	/* clean up rcv side */
    620	hfi1_user_exp_rcv_free(fdata);
    621
    622	/*
    623	 * fdata->uctxt is used in the above cleanup.  It is not ready to be
    624	 * removed until here.
    625	 */
    626	fdata->uctxt = NULL;
    627	hfi1_rcd_put(uctxt);
    628
    629	/*
    630	 * Clear any left over, unhandled events so the next process that
    631	 * gets this context doesn't get confused.
    632	 */
    633	ev = dd->events + uctxt_offset(uctxt) + fdata->subctxt;
    634	*ev = 0;
    635
    636	spin_lock_irqsave(&dd->uctxt_lock, flags);
    637	__clear_bit(fdata->subctxt, uctxt->in_use_ctxts);
    638	if (!bitmap_empty(uctxt->in_use_ctxts, HFI1_MAX_SHARED_CTXTS)) {
    639		spin_unlock_irqrestore(&dd->uctxt_lock, flags);
    640		goto done;
    641	}
    642	spin_unlock_irqrestore(&dd->uctxt_lock, flags);
    643
    644	/*
    645	 * Disable receive context and interrupt available, reset all
    646	 * RcvCtxtCtrl bits to default values.
    647	 */
    648	hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS |
    649		     HFI1_RCVCTRL_TIDFLOW_DIS |
    650		     HFI1_RCVCTRL_INTRAVAIL_DIS |
    651		     HFI1_RCVCTRL_TAILUPD_DIS |
    652		     HFI1_RCVCTRL_ONE_PKT_EGR_DIS |
    653		     HFI1_RCVCTRL_NO_RHQ_DROP_DIS |
    654		     HFI1_RCVCTRL_NO_EGR_DROP_DIS |
    655		     HFI1_RCVCTRL_URGENT_DIS, uctxt);
    656	/* Clear the context's J_KEY */
    657	hfi1_clear_ctxt_jkey(dd, uctxt);
    658	/*
    659	 * If a send context is allocated, reset context integrity
    660	 * checks to default and disable the send context.
    661	 */
    662	if (uctxt->sc) {
    663		sc_disable(uctxt->sc);
    664		set_pio_integrity(uctxt->sc);
    665	}
    666
    667	hfi1_free_ctxt_rcv_groups(uctxt);
    668	hfi1_clear_ctxt_pkey(dd, uctxt);
    669
    670	uctxt->event_flags = 0;
    671
    672	deallocate_ctxt(uctxt);
    673done:
    674
    675	if (refcount_dec_and_test(&dd->user_refcount))
    676		complete(&dd->user_comp);
    677
    678	cleanup_srcu_struct(&fdata->pq_srcu);
    679	kfree(fdata);
    680	return 0;
    681}
    682
    683/*
    684 * Convert kernel *virtual* addresses to physical addresses.
     685 * This is used for vmalloc'ed addresses.
    686 */
    687static u64 kvirt_to_phys(void *addr)
    688{
    689	struct page *page;
    690	u64 paddr = 0;
    691
    692	page = vmalloc_to_page(addr);
    693	if (page)
    694		paddr = page_to_pfn(page) << PAGE_SHIFT;
    695
    696	return paddr;
    697}
    698
    699/**
    700 * complete_subctxt - complete sub-context info
    701 * @fd: valid filedata pointer
    702 *
    703 * Sub-context info can only be set up after the base context
    704 * has been completed.  This is indicated by the clearing of the
     705 * HFI1_CTXT_BASE_UNINIT bit.
    706 *
    707 * Wait for the bit to be cleared, and then complete the subcontext
    708 * initialization.
    709 *
    710 */
    711static int complete_subctxt(struct hfi1_filedata *fd)
    712{
    713	int ret;
    714	unsigned long flags;
    715
    716	/*
    717	 * sub-context info can only be set up after the base context
    718	 * has been completed.
    719	 */
    720	ret = wait_event_interruptible(
    721		fd->uctxt->wait,
    722		!test_bit(HFI1_CTXT_BASE_UNINIT, &fd->uctxt->event_flags));
    723
    724	if (test_bit(HFI1_CTXT_BASE_FAILED, &fd->uctxt->event_flags))
    725		ret = -ENOMEM;
    726
    727	/* Finish the sub-context init */
    728	if (!ret) {
    729		fd->rec_cpu_num = hfi1_get_proc_affinity(fd->uctxt->numa_id);
    730		ret = init_user_ctxt(fd, fd->uctxt);
    731	}
    732
    733	if (ret) {
    734		spin_lock_irqsave(&fd->dd->uctxt_lock, flags);
    735		__clear_bit(fd->subctxt, fd->uctxt->in_use_ctxts);
    736		spin_unlock_irqrestore(&fd->dd->uctxt_lock, flags);
    737		hfi1_rcd_put(fd->uctxt);
    738		fd->uctxt = NULL;
    739	}
    740
    741	return ret;
    742}
    743
    744static int assign_ctxt(struct hfi1_filedata *fd, unsigned long arg, u32 len)
    745{
    746	int ret;
    747	unsigned int swmajor;
    748	struct hfi1_ctxtdata *uctxt = NULL;
    749	struct hfi1_user_info uinfo;
    750
    751	if (fd->uctxt)
    752		return -EINVAL;
    753
    754	if (sizeof(uinfo) != len)
    755		return -EINVAL;
    756
    757	if (copy_from_user(&uinfo, (void __user *)arg, sizeof(uinfo)))
    758		return -EFAULT;
    759
    760	swmajor = uinfo.userversion >> 16;
    761	if (swmajor != HFI1_USER_SWMAJOR)
    762		return -ENODEV;
    763
    764	if (uinfo.subctxt_cnt > HFI1_MAX_SHARED_CTXTS)
    765		return -EINVAL;
    766
    767	/*
    768	 * Acquire the mutex to protect against multiple creations of what
    769	 * could be a shared base context.
    770	 */
    771	mutex_lock(&hfi1_mutex);
    772	/*
    773	 * Get a sub context if available  (fd->uctxt will be set).
    774	 * ret < 0 error, 0 no context, 1 sub-context found
    775	 */
    776	ret = find_sub_ctxt(fd, &uinfo);
    777
    778	/*
    779	 * Allocate a base context if context sharing is not required or a
    780	 * sub context wasn't found.
    781	 */
    782	if (!ret)
    783		ret = allocate_ctxt(fd, fd->dd, &uinfo, &uctxt);
    784
    785	mutex_unlock(&hfi1_mutex);
    786
    787	/* Depending on the context type, finish the appropriate init */
    788	switch (ret) {
    789	case 0:
    790		ret = setup_base_ctxt(fd, uctxt);
    791		if (ret)
    792			deallocate_ctxt(uctxt);
    793		break;
    794	case 1:
    795		ret = complete_subctxt(fd);
    796		break;
    797	default:
    798		break;
    799	}
    800
    801	return ret;
    802}
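
/*
 * Hypothetical user-space sketch (not part of this file): the ioctl call that
 * drives assign_ctxt() above.  Field names come from the uses of
 * struct hfi1_user_info in this file; everything else is illustrative.
 */
#if 0	/* illustration only, never compiled */
	struct hfi1_user_info uinfo = {
		.userversion = HFI1_USER_SWVERSION, /* swmajor in upper 16 bits */
		.subctxt_cnt = 0,		    /* no context sharing */
	};

	if (ioctl(fd, HFI1_IOCTL_ASSIGN_CTXT, &uinfo) < 0)
		return -1;	/* e.g. EBUSY (no free ctxts), ENODEV (swmajor) */
#endif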
    803
    804/**
    805 * match_ctxt - match context
    806 * @fd: valid filedata pointer
    807 * @uinfo: user info to compare base context with
    808 * @uctxt: context to compare uinfo to.
    809 *
    810 * Compare the given context with the given information to see if it
    811 * can be used for a sub context.
    812 */
    813static int match_ctxt(struct hfi1_filedata *fd,
    814		      const struct hfi1_user_info *uinfo,
    815		      struct hfi1_ctxtdata *uctxt)
    816{
    817	struct hfi1_devdata *dd = fd->dd;
    818	unsigned long flags;
    819	u16 subctxt;
    820
    821	/* Skip dynamically allocated kernel contexts */
    822	if (uctxt->sc && (uctxt->sc->type == SC_KERNEL))
    823		return 0;
    824
    825	/* Skip ctxt if it doesn't match the requested one */
    826	if (memcmp(uctxt->uuid, uinfo->uuid, sizeof(uctxt->uuid)) ||
    827	    uctxt->jkey != generate_jkey(current_uid()) ||
    828	    uctxt->subctxt_id != uinfo->subctxt_id ||
    829	    uctxt->subctxt_cnt != uinfo->subctxt_cnt)
    830		return 0;
    831
    832	/* Verify the sharing process matches the base */
    833	if (uctxt->userversion != uinfo->userversion)
    834		return -EINVAL;
    835
    836	/* Find an unused sub context */
    837	spin_lock_irqsave(&dd->uctxt_lock, flags);
    838	if (bitmap_empty(uctxt->in_use_ctxts, HFI1_MAX_SHARED_CTXTS)) {
    839		/* context is being closed, do not use */
    840		spin_unlock_irqrestore(&dd->uctxt_lock, flags);
    841		return 0;
    842	}
    843
    844	subctxt = find_first_zero_bit(uctxt->in_use_ctxts,
    845				      HFI1_MAX_SHARED_CTXTS);
    846	if (subctxt >= uctxt->subctxt_cnt) {
    847		spin_unlock_irqrestore(&dd->uctxt_lock, flags);
    848		return -EBUSY;
    849	}
    850
    851	fd->subctxt = subctxt;
    852	__set_bit(fd->subctxt, uctxt->in_use_ctxts);
    853	spin_unlock_irqrestore(&dd->uctxt_lock, flags);
    854
    855	fd->uctxt = uctxt;
    856	hfi1_rcd_get(uctxt);
    857
    858	return 1;
    859}
    860
    861/**
     862 * find_sub_ctxt - find sub-context
    863 * @fd: valid filedata pointer
    864 * @uinfo: matching info to use to find a possible context to share.
    865 *
    866 * The hfi1_mutex must be held when this function is called.  It is
    867 * necessary to ensure serialized creation of shared contexts.
    868 *
    869 * Return:
    870 *    0      No sub-context found
    871 *    1      Subcontext found and allocated
    872 *    errno  EINVAL (incorrect parameters)
    873 *           EBUSY (all sub contexts in use)
    874 */
    875static int find_sub_ctxt(struct hfi1_filedata *fd,
    876			 const struct hfi1_user_info *uinfo)
    877{
    878	struct hfi1_ctxtdata *uctxt;
    879	struct hfi1_devdata *dd = fd->dd;
    880	u16 i;
    881	int ret;
    882
    883	if (!uinfo->subctxt_cnt)
    884		return 0;
    885
    886	for (i = dd->first_dyn_alloc_ctxt; i < dd->num_rcv_contexts; i++) {
    887		uctxt = hfi1_rcd_get_by_index(dd, i);
    888		if (uctxt) {
    889			ret = match_ctxt(fd, uinfo, uctxt);
    890			hfi1_rcd_put(uctxt);
     891			/* a non-zero value (error or match) ends the search */
    892			if (ret)
    893				return ret;
    894		}
    895	}
    896
    897	return 0;
    898}
    899
    900static int allocate_ctxt(struct hfi1_filedata *fd, struct hfi1_devdata *dd,
    901			 struct hfi1_user_info *uinfo,
    902			 struct hfi1_ctxtdata **rcd)
    903{
    904	struct hfi1_ctxtdata *uctxt;
    905	int ret, numa;
    906
    907	if (dd->flags & HFI1_FROZEN) {
    908		/*
     909		 * Pick an error that is distinct from all other errors
     910		 * that are returned so the user process knows that
     911		 * it tried to allocate while the SPC was frozen.  It
     912		 * should be able to retry with success in a short
     913		 * while.
    914		 */
    915		return -EIO;
    916	}
    917
    918	if (!dd->freectxts)
    919		return -EBUSY;
    920
    921	/*
    922	 * If we don't have a NUMA node requested, preference is towards
    923	 * device NUMA node.
    924	 */
    925	fd->rec_cpu_num = hfi1_get_proc_affinity(dd->node);
    926	if (fd->rec_cpu_num != -1)
    927		numa = cpu_to_node(fd->rec_cpu_num);
    928	else
    929		numa = numa_node_id();
    930	ret = hfi1_create_ctxtdata(dd->pport, numa, &uctxt);
    931	if (ret < 0) {
    932		dd_dev_err(dd, "user ctxtdata allocation failed\n");
    933		return ret;
    934	}
    935	hfi1_cdbg(PROC, "[%u:%u] pid %u assigned to CPU %d (NUMA %u)",
    936		  uctxt->ctxt, fd->subctxt, current->pid, fd->rec_cpu_num,
    937		  uctxt->numa_id);
    938
    939	/*
    940	 * Allocate and enable a PIO send context.
    941	 */
    942	uctxt->sc = sc_alloc(dd, SC_USER, uctxt->rcvhdrqentsize, dd->node);
    943	if (!uctxt->sc) {
    944		ret = -ENOMEM;
    945		goto ctxdata_free;
    946	}
    947	hfi1_cdbg(PROC, "allocated send context %u(%u)\n", uctxt->sc->sw_index,
    948		  uctxt->sc->hw_context);
    949	ret = sc_enable(uctxt->sc);
    950	if (ret)
    951		goto ctxdata_free;
    952
    953	/*
    954	 * Setup sub context information if the user-level has requested
    955	 * sub contexts.
    956	 * This has to be done here so the rest of the sub-contexts find the
    957	 * proper base context.
    958	 * NOTE: _set_bit() can be used here because the context creation is
    959	 * protected by the mutex (rather than the spin_lock), and will be the
    960	 * very first instance of this context.
    961	 */
    962	__set_bit(0, uctxt->in_use_ctxts);
    963	if (uinfo->subctxt_cnt)
    964		init_subctxts(uctxt, uinfo);
    965	uctxt->userversion = uinfo->userversion;
    966	uctxt->flags = hfi1_cap_mask; /* save current flag state */
    967	init_waitqueue_head(&uctxt->wait);
    968	strlcpy(uctxt->comm, current->comm, sizeof(uctxt->comm));
    969	memcpy(uctxt->uuid, uinfo->uuid, sizeof(uctxt->uuid));
    970	uctxt->jkey = generate_jkey(current_uid());
    971	hfi1_stats.sps_ctxts++;
    972	/*
    973	 * Disable ASPM when there are open user/PSM contexts to avoid
    974	 * issues with ASPM L1 exit latency
    975	 */
    976	if (dd->freectxts-- == dd->num_user_contexts)
    977		aspm_disable_all(dd);
    978
    979	*rcd = uctxt;
    980
    981	return 0;
    982
    983ctxdata_free:
    984	hfi1_free_ctxt(uctxt);
    985	return ret;
    986}
    987
    988static void deallocate_ctxt(struct hfi1_ctxtdata *uctxt)
    989{
    990	mutex_lock(&hfi1_mutex);
    991	hfi1_stats.sps_ctxts--;
    992	if (++uctxt->dd->freectxts == uctxt->dd->num_user_contexts)
    993		aspm_enable_all(uctxt->dd);
    994	mutex_unlock(&hfi1_mutex);
    995
    996	hfi1_free_ctxt(uctxt);
    997}
    998
    999static void init_subctxts(struct hfi1_ctxtdata *uctxt,
   1000			  const struct hfi1_user_info *uinfo)
   1001{
   1002	uctxt->subctxt_cnt = uinfo->subctxt_cnt;
   1003	uctxt->subctxt_id = uinfo->subctxt_id;
   1004	set_bit(HFI1_CTXT_BASE_UNINIT, &uctxt->event_flags);
   1005}
   1006
   1007static int setup_subctxt(struct hfi1_ctxtdata *uctxt)
   1008{
   1009	int ret = 0;
   1010	u16 num_subctxts = uctxt->subctxt_cnt;
   1011
   1012	uctxt->subctxt_uregbase = vmalloc_user(PAGE_SIZE);
   1013	if (!uctxt->subctxt_uregbase)
   1014		return -ENOMEM;
   1015
   1016	/* We can take the size of the RcvHdr Queue from the master */
   1017	uctxt->subctxt_rcvhdr_base = vmalloc_user(rcvhdrq_size(uctxt) *
   1018						  num_subctxts);
   1019	if (!uctxt->subctxt_rcvhdr_base) {
   1020		ret = -ENOMEM;
   1021		goto bail_ureg;
   1022	}
   1023
   1024	uctxt->subctxt_rcvegrbuf = vmalloc_user(uctxt->egrbufs.size *
   1025						num_subctxts);
   1026	if (!uctxt->subctxt_rcvegrbuf) {
   1027		ret = -ENOMEM;
   1028		goto bail_rhdr;
   1029	}
   1030
   1031	return 0;
   1032
   1033bail_rhdr:
   1034	vfree(uctxt->subctxt_rcvhdr_base);
   1035	uctxt->subctxt_rcvhdr_base = NULL;
   1036bail_ureg:
   1037	vfree(uctxt->subctxt_uregbase);
   1038	uctxt->subctxt_uregbase = NULL;
   1039
   1040	return ret;
   1041}
   1042
   1043static void user_init(struct hfi1_ctxtdata *uctxt)
   1044{
   1045	unsigned int rcvctrl_ops = 0;
   1046
   1047	/* initialize poll variables... */
   1048	uctxt->urgent = 0;
   1049	uctxt->urgent_poll = 0;
   1050
   1051	/*
   1052	 * Now enable the ctxt for receive.
    1053	 * Chips that are set to DMA the tail register to memory do so
    1054	 * when the tail changes (and when the update bit transitions from
    1055	 * 0 to 1), so for those chips we turn the feature off and then
    1056	 * back on.  This will (very briefly) affect any other open ctxts,
    1057	 * but the duration is very short, and therefore isn't an issue.  We
   1058	 * explicitly set the in-memory tail copy to 0 beforehand, so we
   1059	 * don't have to wait to be sure the DMA update has happened
   1060	 * (chip resets head/tail to 0 on transition to enable).
   1061	 */
   1062	if (hfi1_rcvhdrtail_kvaddr(uctxt))
   1063		clear_rcvhdrtail(uctxt);
   1064
   1065	/* Setup J_KEY before enabling the context */
   1066	hfi1_set_ctxt_jkey(uctxt->dd, uctxt, uctxt->jkey);
   1067
   1068	rcvctrl_ops = HFI1_RCVCTRL_CTXT_ENB;
   1069	rcvctrl_ops |= HFI1_RCVCTRL_URGENT_ENB;
   1070	if (HFI1_CAP_UGET_MASK(uctxt->flags, HDRSUPP))
   1071		rcvctrl_ops |= HFI1_RCVCTRL_TIDFLOW_ENB;
   1072	/*
   1073	 * Ignore the bit in the flags for now until proper
    1074	 * support for multiple packets per rcv array entry is
   1075	 * added.
   1076	 */
   1077	if (!HFI1_CAP_UGET_MASK(uctxt->flags, MULTI_PKT_EGR))
   1078		rcvctrl_ops |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB;
   1079	if (HFI1_CAP_UGET_MASK(uctxt->flags, NODROP_EGR_FULL))
   1080		rcvctrl_ops |= HFI1_RCVCTRL_NO_EGR_DROP_ENB;
   1081	if (HFI1_CAP_UGET_MASK(uctxt->flags, NODROP_RHQ_FULL))
   1082		rcvctrl_ops |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB;
   1083	/*
   1084	 * The RcvCtxtCtrl.TailUpd bit has to be explicitly written.
   1085	 * We can't rely on the correct value to be set from prior
   1086	 * uses of the chip or ctxt. Therefore, add the rcvctrl op
   1087	 * for both cases.
   1088	 */
   1089	if (HFI1_CAP_UGET_MASK(uctxt->flags, DMA_RTAIL))
   1090		rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_ENB;
   1091	else
   1092		rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_DIS;
   1093	hfi1_rcvctrl(uctxt->dd, rcvctrl_ops, uctxt);
   1094}
   1095
   1096static int get_ctxt_info(struct hfi1_filedata *fd, unsigned long arg, u32 len)
   1097{
   1098	struct hfi1_ctxt_info cinfo;
   1099	struct hfi1_ctxtdata *uctxt = fd->uctxt;
   1100
   1101	if (sizeof(cinfo) != len)
   1102		return -EINVAL;
   1103
   1104	memset(&cinfo, 0, sizeof(cinfo));
   1105	cinfo.runtime_flags = (((uctxt->flags >> HFI1_CAP_MISC_SHIFT) &
   1106				HFI1_CAP_MISC_MASK) << HFI1_CAP_USER_SHIFT) |
   1107			HFI1_CAP_UGET_MASK(uctxt->flags, MASK) |
   1108			HFI1_CAP_KGET_MASK(uctxt->flags, K2U);
   1109	/* adjust flag if this fd is not able to cache */
   1110	if (!fd->use_mn)
   1111		cinfo.runtime_flags |= HFI1_CAP_TID_UNMAP; /* no caching */
   1112
   1113	cinfo.num_active = hfi1_count_active_units();
   1114	cinfo.unit = uctxt->dd->unit;
   1115	cinfo.ctxt = uctxt->ctxt;
   1116	cinfo.subctxt = fd->subctxt;
   1117	cinfo.rcvtids = roundup(uctxt->egrbufs.alloced,
   1118				uctxt->dd->rcv_entries.group_size) +
   1119		uctxt->expected_count;
   1120	cinfo.credits = uctxt->sc->credits;
   1121	cinfo.numa_node = uctxt->numa_id;
   1122	cinfo.rec_cpu = fd->rec_cpu_num;
   1123	cinfo.send_ctxt = uctxt->sc->hw_context;
   1124
   1125	cinfo.egrtids = uctxt->egrbufs.alloced;
   1126	cinfo.rcvhdrq_cnt = get_hdrq_cnt(uctxt);
   1127	cinfo.rcvhdrq_entsize = get_hdrqentsize(uctxt) << 2;
   1128	cinfo.sdma_ring_size = fd->cq->nentries;
   1129	cinfo.rcvegr_size = uctxt->egrbufs.rcvtid_size;
   1130
   1131	trace_hfi1_ctxt_info(uctxt->dd, uctxt->ctxt, fd->subctxt, &cinfo);
   1132	if (copy_to_user((void __user *)arg, &cinfo, len))
   1133		return -EFAULT;
   1134
   1135	return 0;
   1136}
   1137
   1138static int init_user_ctxt(struct hfi1_filedata *fd,
   1139			  struct hfi1_ctxtdata *uctxt)
   1140{
   1141	int ret;
   1142
   1143	ret = hfi1_user_sdma_alloc_queues(uctxt, fd);
   1144	if (ret)
   1145		return ret;
   1146
   1147	ret = hfi1_user_exp_rcv_init(fd, uctxt);
   1148	if (ret)
   1149		hfi1_user_sdma_free_queues(fd, uctxt);
   1150
   1151	return ret;
   1152}
   1153
   1154static int setup_base_ctxt(struct hfi1_filedata *fd,
   1155			   struct hfi1_ctxtdata *uctxt)
   1156{
   1157	struct hfi1_devdata *dd = uctxt->dd;
   1158	int ret = 0;
   1159
   1160	hfi1_init_ctxt(uctxt->sc);
   1161
   1162	/* Now allocate the RcvHdr queue and eager buffers. */
   1163	ret = hfi1_create_rcvhdrq(dd, uctxt);
   1164	if (ret)
   1165		goto done;
   1166
   1167	ret = hfi1_setup_eagerbufs(uctxt);
   1168	if (ret)
   1169		goto done;
   1170
   1171	/* If sub-contexts are enabled, do the appropriate setup */
   1172	if (uctxt->subctxt_cnt)
   1173		ret = setup_subctxt(uctxt);
   1174	if (ret)
   1175		goto done;
   1176
   1177	ret = hfi1_alloc_ctxt_rcv_groups(uctxt);
   1178	if (ret)
   1179		goto done;
   1180
   1181	ret = init_user_ctxt(fd, uctxt);
   1182	if (ret)
   1183		goto done;
   1184
   1185	user_init(uctxt);
   1186
   1187	/* Now that the context is set up, the fd can get a reference. */
   1188	fd->uctxt = uctxt;
   1189	hfi1_rcd_get(uctxt);
   1190
   1191done:
   1192	if (uctxt->subctxt_cnt) {
   1193		/*
   1194		 * On error, set the failed bit so sub-contexts will clean up
   1195		 * correctly.
   1196		 */
   1197		if (ret)
   1198			set_bit(HFI1_CTXT_BASE_FAILED, &uctxt->event_flags);
   1199
   1200		/*
   1201		 * Base context is done (successfully or not), notify anybody
   1202		 * using a sub-context that is waiting for this completion.
   1203		 */
   1204		clear_bit(HFI1_CTXT_BASE_UNINIT, &uctxt->event_flags);
   1205		wake_up(&uctxt->wait);
   1206	}
   1207
   1208	return ret;
   1209}
   1210
   1211static int get_base_info(struct hfi1_filedata *fd, unsigned long arg, u32 len)
   1212{
   1213	struct hfi1_base_info binfo;
   1214	struct hfi1_ctxtdata *uctxt = fd->uctxt;
   1215	struct hfi1_devdata *dd = uctxt->dd;
   1216	unsigned offset;
   1217
   1218	trace_hfi1_uctxtdata(uctxt->dd, uctxt, fd->subctxt);
   1219
   1220	if (sizeof(binfo) != len)
   1221		return -EINVAL;
   1222
   1223	memset(&binfo, 0, sizeof(binfo));
   1224	binfo.hw_version = dd->revision;
   1225	binfo.sw_version = HFI1_USER_SWVERSION;
   1226	binfo.bthqp = RVT_KDETH_QP_PREFIX;
   1227	binfo.jkey = uctxt->jkey;
   1228	/*
   1229	 * If more than 64 contexts are enabled the allocated credit
   1230	 * return will span two or three contiguous pages. Since we only
   1231	 * map the page containing the context's credit return address,
   1232	 * we need to calculate the offset in the proper page.
   1233	 */
   1234	offset = ((u64)uctxt->sc->hw_free -
   1235		  (u64)dd->cr_base[uctxt->numa_id].va) % PAGE_SIZE;
   1236	binfo.sc_credits_addr = HFI1_MMAP_TOKEN(PIO_CRED, uctxt->ctxt,
   1237						fd->subctxt, offset);
   1238	binfo.pio_bufbase = HFI1_MMAP_TOKEN(PIO_BUFS, uctxt->ctxt,
   1239					    fd->subctxt,
   1240					    uctxt->sc->base_addr);
   1241	binfo.pio_bufbase_sop = HFI1_MMAP_TOKEN(PIO_BUFS_SOP,
   1242						uctxt->ctxt,
   1243						fd->subctxt,
   1244						uctxt->sc->base_addr);
   1245	binfo.rcvhdr_bufbase = HFI1_MMAP_TOKEN(RCV_HDRQ, uctxt->ctxt,
   1246					       fd->subctxt,
   1247					       uctxt->rcvhdrq);
   1248	binfo.rcvegr_bufbase = HFI1_MMAP_TOKEN(RCV_EGRBUF, uctxt->ctxt,
   1249					       fd->subctxt,
   1250					       uctxt->egrbufs.rcvtids[0].dma);
   1251	binfo.sdma_comp_bufbase = HFI1_MMAP_TOKEN(SDMA_COMP, uctxt->ctxt,
   1252						  fd->subctxt, 0);
   1253	/*
   1254	 * user regs are at
   1255	 * (RXE_PER_CONTEXT_USER + (ctxt * RXE_PER_CONTEXT_SIZE))
   1256	 */
   1257	binfo.user_regbase = HFI1_MMAP_TOKEN(UREGS, uctxt->ctxt,
   1258					     fd->subctxt, 0);
   1259	offset = offset_in_page((uctxt_offset(uctxt) + fd->subctxt) *
   1260				sizeof(*dd->events));
   1261	binfo.events_bufbase = HFI1_MMAP_TOKEN(EVENTS, uctxt->ctxt,
   1262					       fd->subctxt,
   1263					       offset);
   1264	binfo.status_bufbase = HFI1_MMAP_TOKEN(STATUS, uctxt->ctxt,
   1265					       fd->subctxt,
   1266					       dd->status);
   1267	if (HFI1_CAP_IS_USET(DMA_RTAIL))
   1268		binfo.rcvhdrtail_base = HFI1_MMAP_TOKEN(RTAIL, uctxt->ctxt,
   1269							fd->subctxt, 0);
   1270	if (uctxt->subctxt_cnt) {
   1271		binfo.subctxt_uregbase = HFI1_MMAP_TOKEN(SUBCTXT_UREGS,
   1272							 uctxt->ctxt,
   1273							 fd->subctxt, 0);
   1274		binfo.subctxt_rcvhdrbuf = HFI1_MMAP_TOKEN(SUBCTXT_RCV_HDRQ,
   1275							  uctxt->ctxt,
   1276							  fd->subctxt, 0);
   1277		binfo.subctxt_rcvegrbuf = HFI1_MMAP_TOKEN(SUBCTXT_EGRBUF,
   1278							  uctxt->ctxt,
   1279							  fd->subctxt, 0);
   1280	}
   1281
   1282	if (copy_to_user((void __user *)arg, &binfo, len))
   1283		return -EFAULT;
   1284
   1285	return 0;
   1286}
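
/*
 * Hypothetical user-space sketch (not part of this file): the token values
 * returned in struct hfi1_base_info are used directly as mmap() offsets and
 * are decoded by hfi1_file_mmap() above.  Length and prot follow the UREGS
 * case (one non-cached page, read/write, MAP_SHARED required).
 */
#if 0	/* illustration only, never compiled */
	volatile void *uregs = mmap(NULL, sysconf(_SC_PAGESIZE),
				    PROT_READ | PROT_WRITE, MAP_SHARED,
				    fd, binfo.user_regbase);
#endif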
   1287
   1288/**
   1289 * user_exp_rcv_setup - Set up the given tid rcv list
   1290 * @fd: file data of the current driver instance
    1291 * @arg: ioctl argument for user space information
   1292 * @len: length of data structure associated with ioctl command
   1293 *
   1294 * Wrapper to validate ioctl information before doing _rcv_setup.
   1295 *
   1296 */
   1297static int user_exp_rcv_setup(struct hfi1_filedata *fd, unsigned long arg,
   1298			      u32 len)
   1299{
   1300	int ret;
   1301	unsigned long addr;
   1302	struct hfi1_tid_info tinfo;
   1303
   1304	if (sizeof(tinfo) != len)
   1305		return -EINVAL;
   1306
   1307	if (copy_from_user(&tinfo, (void __user *)arg, (sizeof(tinfo))))
   1308		return -EFAULT;
   1309
   1310	ret = hfi1_user_exp_rcv_setup(fd, &tinfo);
   1311	if (!ret) {
   1312		/*
   1313		 * Copy the number of tidlist entries we used
   1314		 * and the length of the buffer we registered.
   1315		 */
   1316		addr = arg + offsetof(struct hfi1_tid_info, tidcnt);
   1317		if (copy_to_user((void __user *)addr, &tinfo.tidcnt,
   1318				 sizeof(tinfo.tidcnt)))
   1319			return -EFAULT;
   1320
   1321		addr = arg + offsetof(struct hfi1_tid_info, length);
   1322		if (copy_to_user((void __user *)addr, &tinfo.length,
   1323				 sizeof(tinfo.length)))
   1324			ret = -EFAULT;
   1325	}
   1326
   1327	return ret;
   1328}
   1329
   1330/**
   1331 * user_exp_rcv_clear - Clear the given tid rcv list
   1332 * @fd: file data of the current driver instance
    1333 * @arg: ioctl argument for user space information
   1334 * @len: length of data structure associated with ioctl command
   1335 *
   1336 * The hfi1_user_exp_rcv_clear() can be called from the error path.  Because
   1337 * of this, we need to use this wrapper to copy the user space information
   1338 * before doing the clear.
   1339 */
   1340static int user_exp_rcv_clear(struct hfi1_filedata *fd, unsigned long arg,
   1341			      u32 len)
   1342{
   1343	int ret;
   1344	unsigned long addr;
   1345	struct hfi1_tid_info tinfo;
   1346
   1347	if (sizeof(tinfo) != len)
   1348		return -EINVAL;
   1349
   1350	if (copy_from_user(&tinfo, (void __user *)arg, (sizeof(tinfo))))
   1351		return -EFAULT;
   1352
   1353	ret = hfi1_user_exp_rcv_clear(fd, &tinfo);
   1354	if (!ret) {
   1355		addr = arg + offsetof(struct hfi1_tid_info, tidcnt);
   1356		if (copy_to_user((void __user *)addr, &tinfo.tidcnt,
   1357				 sizeof(tinfo.tidcnt)))
   1358			return -EFAULT;
   1359	}
   1360
   1361	return ret;
   1362}
   1363
   1364/**
   1365 * user_exp_rcv_invalid - Invalidate the given tid rcv list
   1366 * @fd: file data of the current driver instance
    1367 * @arg: ioctl argument for user space information
   1368 * @len: length of data structure associated with ioctl command
   1369 *
   1370 * Wrapper to validate ioctl information before doing _rcv_invalid.
   1371 *
   1372 */
   1373static int user_exp_rcv_invalid(struct hfi1_filedata *fd, unsigned long arg,
   1374				u32 len)
   1375{
   1376	int ret;
   1377	unsigned long addr;
   1378	struct hfi1_tid_info tinfo;
   1379
   1380	if (sizeof(tinfo) != len)
   1381		return -EINVAL;
   1382
   1383	if (!fd->invalid_tids)
   1384		return -EINVAL;
   1385
   1386	if (copy_from_user(&tinfo, (void __user *)arg, (sizeof(tinfo))))
   1387		return -EFAULT;
   1388
   1389	ret = hfi1_user_exp_rcv_invalid(fd, &tinfo);
   1390	if (ret)
   1391		return ret;
   1392
   1393	addr = arg + offsetof(struct hfi1_tid_info, tidcnt);
   1394	if (copy_to_user((void __user *)addr, &tinfo.tidcnt,
   1395			 sizeof(tinfo.tidcnt)))
   1396		ret = -EFAULT;
   1397
   1398	return ret;
   1399}
   1400
   1401static __poll_t poll_urgent(struct file *fp,
   1402				struct poll_table_struct *pt)
   1403{
   1404	struct hfi1_filedata *fd = fp->private_data;
   1405	struct hfi1_ctxtdata *uctxt = fd->uctxt;
   1406	struct hfi1_devdata *dd = uctxt->dd;
   1407	__poll_t pollflag;
   1408
   1409	poll_wait(fp, &uctxt->wait, pt);
   1410
   1411	spin_lock_irq(&dd->uctxt_lock);
   1412	if (uctxt->urgent != uctxt->urgent_poll) {
   1413		pollflag = EPOLLIN | EPOLLRDNORM;
   1414		uctxt->urgent_poll = uctxt->urgent;
   1415	} else {
   1416		pollflag = 0;
   1417		set_bit(HFI1_CTXT_WAITING_URG, &uctxt->event_flags);
   1418	}
   1419	spin_unlock_irq(&dd->uctxt_lock);
   1420
   1421	return pollflag;
   1422}
   1423
   1424static __poll_t poll_next(struct file *fp,
   1425			      struct poll_table_struct *pt)
   1426{
   1427	struct hfi1_filedata *fd = fp->private_data;
   1428	struct hfi1_ctxtdata *uctxt = fd->uctxt;
   1429	struct hfi1_devdata *dd = uctxt->dd;
   1430	__poll_t pollflag;
   1431
   1432	poll_wait(fp, &uctxt->wait, pt);
   1433
   1434	spin_lock_irq(&dd->uctxt_lock);
   1435	if (hdrqempty(uctxt)) {
   1436		set_bit(HFI1_CTXT_WAITING_RCV, &uctxt->event_flags);
   1437		hfi1_rcvctrl(dd, HFI1_RCVCTRL_INTRAVAIL_ENB, uctxt);
   1438		pollflag = 0;
   1439	} else {
   1440		pollflag = EPOLLIN | EPOLLRDNORM;
   1441	}
   1442	spin_unlock_irq(&dd->uctxt_lock);
   1443
   1444	return pollflag;
   1445}
   1446
   1447/*
   1448 * Find all user contexts in use, and set the specified bit in their
   1449 * event mask.
    1450 * See also find_ctxt() for a similar use that is specific to send buffers.
   1451 */
   1452int hfi1_set_uevent_bits(struct hfi1_pportdata *ppd, const int evtbit)
   1453{
   1454	struct hfi1_ctxtdata *uctxt;
   1455	struct hfi1_devdata *dd = ppd->dd;
   1456	u16 ctxt;
   1457
   1458	if (!dd->events)
   1459		return -EINVAL;
   1460
   1461	for (ctxt = dd->first_dyn_alloc_ctxt; ctxt < dd->num_rcv_contexts;
   1462	     ctxt++) {
   1463		uctxt = hfi1_rcd_get_by_index(dd, ctxt);
   1464		if (uctxt) {
   1465			unsigned long *evs;
   1466			int i;
   1467			/*
    1468			 * subctxt_cnt is 0 if the context is not shared, so do
    1469			 * the base entry first, then the remaining subctxts, if any
   1470			 */
   1471			evs = dd->events + uctxt_offset(uctxt);
   1472			set_bit(evtbit, evs);
   1473			for (i = 1; i < uctxt->subctxt_cnt; i++)
   1474				set_bit(evtbit, evs + i);
   1475			hfi1_rcd_put(uctxt);
   1476		}
   1477	}
   1478
   1479	return 0;
   1480}
   1481
   1482/**
   1483 * manage_rcvq - manage a context's receive queue
   1484 * @uctxt: the context
   1485 * @subctxt: the sub-context
   1486 * @arg: start/stop action to carry out
   1487 *
    1488 * start_stop == 0 disables receive on the context, for use in queue
    1489 * overflow conditions.  start_stop == 1 re-enables, to be used to
    1490 * re-init the software copy of the head register.
   1491 */
   1492static int manage_rcvq(struct hfi1_ctxtdata *uctxt, u16 subctxt,
   1493		       unsigned long arg)
   1494{
   1495	struct hfi1_devdata *dd = uctxt->dd;
   1496	unsigned int rcvctrl_op;
   1497	int start_stop;
   1498
   1499	if (subctxt)
   1500		return 0;
   1501
   1502	if (get_user(start_stop, (int __user *)arg))
   1503		return -EFAULT;
   1504
   1505	/* atomically clear receive enable ctxt. */
   1506	if (start_stop) {
   1507		/*
   1508		 * On enable, force in-memory copy of the tail register to
   1509		 * 0, so that protocol code doesn't have to worry about
   1510		 * whether or not the chip has yet updated the in-memory
   1511		 * copy or not on return from the system call. The chip
    1512		 * always resets its tail register back to 0 on a
   1513		 * transition from disabled to enabled.
   1514		 */
   1515		if (hfi1_rcvhdrtail_kvaddr(uctxt))
   1516			clear_rcvhdrtail(uctxt);
   1517		rcvctrl_op = HFI1_RCVCTRL_CTXT_ENB;
   1518	} else {
   1519		rcvctrl_op = HFI1_RCVCTRL_CTXT_DIS;
   1520	}
   1521	hfi1_rcvctrl(dd, rcvctrl_op, uctxt);
   1522	/* always; new head should be equal to new tail; see above */
   1523
   1524	return 0;
   1525}
   1526
   1527/*
    1528 * Clear the event notifier events for this context.
    1529 * The user process then performs actions appropriate to the bit having
    1530 * been set, if desired, and checks again in the future.
   1531 */
   1532static int user_event_ack(struct hfi1_ctxtdata *uctxt, u16 subctxt,
   1533			  unsigned long arg)
   1534{
   1535	int i;
   1536	struct hfi1_devdata *dd = uctxt->dd;
   1537	unsigned long *evs;
   1538	unsigned long events;
   1539
   1540	if (!dd->events)
   1541		return 0;
   1542
   1543	if (get_user(events, (unsigned long __user *)arg))
   1544		return -EFAULT;
   1545
   1546	evs = dd->events + uctxt_offset(uctxt) + subctxt;
   1547
   1548	for (i = 0; i <= _HFI1_MAX_EVENT_BIT; i++) {
   1549		if (!test_bit(i, &events))
   1550			continue;
   1551		clear_bit(i, evs);
   1552	}
   1553	return 0;
   1554}
   1555
   1556static int set_ctxt_pkey(struct hfi1_ctxtdata *uctxt, unsigned long arg)
   1557{
   1558	int i;
   1559	struct hfi1_pportdata *ppd = uctxt->ppd;
   1560	struct hfi1_devdata *dd = uctxt->dd;
   1561	u16 pkey;
   1562
   1563	if (!HFI1_CAP_IS_USET(PKEY_CHECK))
   1564		return -EPERM;
   1565
   1566	if (get_user(pkey, (u16 __user *)arg))
   1567		return -EFAULT;
   1568
   1569	if (pkey == LIM_MGMT_P_KEY || pkey == FULL_MGMT_P_KEY)
   1570		return -EINVAL;
   1571
   1572	for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++)
   1573		if (pkey == ppd->pkeys[i])
   1574			return hfi1_set_ctxt_pkey(dd, uctxt, pkey);
   1575
   1576	return -ENOENT;
   1577}
   1578
   1579/**
   1580 * ctxt_reset - Reset the user context
   1581 * @uctxt: valid user context
   1582 */
   1583static int ctxt_reset(struct hfi1_ctxtdata *uctxt)
   1584{
   1585	struct send_context *sc;
   1586	struct hfi1_devdata *dd;
   1587	int ret = 0;
   1588
   1589	if (!uctxt || !uctxt->dd || !uctxt->sc)
   1590		return -EINVAL;
   1591
   1592	/*
   1593	 * There is no protection here. User level has to guarantee that
   1594	 * no one will be writing to the send context while it is being
   1595	 * re-initialized.  If user level breaks that guarantee, it will
    1596	 * break its own context and no one else's.
   1597	 */
   1598	dd = uctxt->dd;
   1599	sc = uctxt->sc;
   1600
   1601	/*
   1602	 * Wait until the interrupt handler has marked the context as
   1603	 * halted or frozen. Report error if we time out.
   1604	 */
   1605	wait_event_interruptible_timeout(
   1606		sc->halt_wait, (sc->flags & SCF_HALTED),
   1607		msecs_to_jiffies(SEND_CTXT_HALT_TIMEOUT));
   1608	if (!(sc->flags & SCF_HALTED))
   1609		return -ENOLCK;
   1610
   1611	/*
   1612	 * If the send context was halted due to a Freeze, wait until the
   1613	 * device has been "unfrozen" before resetting the context.
   1614	 */
   1615	if (sc->flags & SCF_FROZEN) {
   1616		wait_event_interruptible_timeout(
   1617			dd->event_queue,
   1618			!(READ_ONCE(dd->flags) & HFI1_FROZEN),
   1619			msecs_to_jiffies(SEND_CTXT_HALT_TIMEOUT));
   1620		if (dd->flags & HFI1_FROZEN)
   1621			return -ENOLCK;
   1622
   1623		if (dd->flags & HFI1_FORCED_FREEZE)
   1624			/*
    1625			 * Don't allow context reset if we are in a
    1626			 * forced freeze
   1627			 */
   1628			return -ENODEV;
   1629
   1630		sc_disable(sc);
   1631		ret = sc_enable(sc);
   1632		hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_ENB, uctxt);
   1633	} else {
   1634		ret = sc_restart(sc);
   1635	}
   1636	if (!ret)
   1637		sc_return_credits(sc);
   1638
   1639	return ret;
   1640}
   1641
   1642static void user_remove(struct hfi1_devdata *dd)
   1643{
   1644
   1645	hfi1_cdev_cleanup(&dd->user_cdev, &dd->user_device);
   1646}
   1647
   1648static int user_add(struct hfi1_devdata *dd)
   1649{
   1650	char name[10];
   1651	int ret;
   1652
   1653	snprintf(name, sizeof(name), "%s_%d", class_name(), dd->unit);
   1654	ret = hfi1_cdev_init(dd->unit, name, &hfi1_file_ops,
   1655			     &dd->user_cdev, &dd->user_device,
   1656			     true, &dd->verbs_dev.rdi.ibdev.dev.kobj);
   1657	if (ret)
   1658		user_remove(dd);
   1659
   1660	return ret;
   1661}
   1662
   1663/*
   1664 * Create per-unit files in /dev
   1665 */
   1666int hfi1_device_create(struct hfi1_devdata *dd)
   1667{
   1668	return user_add(dd);
   1669}
   1670
   1671/*
   1672 * Remove per-unit files in /dev
    1673 * Returns void; the core kernel returns no errors for this.
   1674 */
   1675void hfi1_device_remove(struct hfi1_devdata *dd)
   1676{
   1677	user_remove(dd);
   1678}