cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

user_exp_rcv.c (26022B)


// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
/*
 * Copyright(c) 2020 Cornelis Networks, Inc.
 * Copyright(c) 2015-2018 Intel Corporation.
 */
#include <asm/page.h>
#include <linux/string.h>

#include "mmu_rb.h"
#include "user_exp_rcv.h"
#include "trace.h"

static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
			    struct exp_tid_set *set,
			    struct hfi1_filedata *fd);
static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages);
static int set_rcvarray_entry(struct hfi1_filedata *fd,
			      struct tid_user_buf *tbuf,
			      u32 rcventry, struct tid_group *grp,
			      u16 pageidx, unsigned int npages);
static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
				    struct tid_rb_node *tnode);
static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
			      const struct mmu_notifier_range *range,
			      unsigned long cur_seq);
static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *,
			    struct tid_group *grp,
			    unsigned int start, u16 count,
			    u32 *tidlist, unsigned int *tididx,
			    unsigned int *pmapped);
static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
			      struct tid_group **grp);
static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node);

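/*
 * Note on the notifier ops below: tid_rb_invalidate() runs when the
 * user's address space changes under a pinned TID buffer, queueing the
 * affected entries on the invalid_tids list so user space can reclaim
 * them via hfi1_user_exp_rcv_invalid().
 */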
static const struct mmu_interval_notifier_ops tid_mn_ops = {
	.invalidate = tid_rb_invalidate,
};

/*
 * Initialize context and file private data needed for Expected
 * receive caching. This needs to be done after the context has
 * been configured with the eager/expected RcvEntry counts.
 */
int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd,
			   struct hfi1_ctxtdata *uctxt)
{
	int ret = 0;

	fd->entry_to_rb = kcalloc(uctxt->expected_count,
				  sizeof(struct rb_node *),
				  GFP_KERNEL);
	if (!fd->entry_to_rb)
		return -ENOMEM;

	if (!HFI1_CAP_UGET_MASK(uctxt->flags, TID_UNMAP)) {
		fd->invalid_tid_idx = 0;
		fd->invalid_tids = kcalloc(uctxt->expected_count,
					   sizeof(*fd->invalid_tids),
					   GFP_KERNEL);
		if (!fd->invalid_tids) {
			kfree(fd->entry_to_rb);
			fd->entry_to_rb = NULL;
			return -ENOMEM;
		}
		fd->use_mn = true;
	}

	/*
	 * PSM does not have a good way to separate, count, and
	 * effectively enforce a limit on RcvArray entries used by
	 * subctxts (when context sharing is used) when TID caching
	 * is enabled. To help with that, we calculate a per-process
	 * RcvArray entry share and enforce that.
	 * If TID caching is not in use, PSM deals with usage on its
	 * own. In that case, we allow any subctxt to take all of the
	 * entries.
	 *
	 * Make sure that we set the tid counts only after successful
	 * init.
	 */
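	/*
	 * Worked example (illustrative numbers): with expected_count = 2048
	 * and subctxt_cnt = 3, the base share is 2048 / 3 = 682 with a
	 * remainder of 2, so subctxts 0 and 1 each get one extra entry:
	 * tid_limit becomes 683, 683 and 682 respectively.
	 */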
	spin_lock(&fd->tid_lock);
	if (uctxt->subctxt_cnt && fd->use_mn) {
		u16 remainder;

		fd->tid_limit = uctxt->expected_count / uctxt->subctxt_cnt;
		remainder = uctxt->expected_count % uctxt->subctxt_cnt;
		if (remainder && fd->subctxt < remainder)
			fd->tid_limit++;
	} else {
		fd->tid_limit = uctxt->expected_count;
	}
	spin_unlock(&fd->tid_lock);

	return ret;
}

void hfi1_user_exp_rcv_free(struct hfi1_filedata *fd)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;

	mutex_lock(&uctxt->exp_mutex);
	if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list))
		unlock_exp_tids(uctxt, &uctxt->tid_full_list, fd);
	if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list))
		unlock_exp_tids(uctxt, &uctxt->tid_used_list, fd);
	mutex_unlock(&uctxt->exp_mutex);

	kfree(fd->invalid_tids);
	fd->invalid_tids = NULL;

	kfree(fd->entry_to_rb);
	fd->entry_to_rb = NULL;
}

/*
 * Release pinned receive buffer pages.
 *
 * @mapped: true if the pages have been DMA mapped, false otherwise.
 * @idx: Index of the first page to unpin.
 * @npages: Number of pages to unpin.
 *
 * If the pages have been DMA mapped (indicated by the mapped parameter),
 * their info will be passed via a struct tid_rb_node. If they haven't been
 * mapped, their info will be passed via a struct tid_user_buf.
 */
static void unpin_rcv_pages(struct hfi1_filedata *fd,
			    struct tid_user_buf *tidbuf,
			    struct tid_rb_node *node,
			    unsigned int idx,
			    unsigned int npages,
			    bool mapped)
{
	struct page **pages;
	struct hfi1_devdata *dd = fd->uctxt->dd;
	struct mm_struct *mm;

	if (mapped) {
		dma_unmap_single(&dd->pcidev->dev, node->dma_addr,
				 node->npages * PAGE_SIZE, DMA_FROM_DEVICE);
		pages = &node->pages[idx];
		mm = mm_from_tid_node(node);
	} else {
		pages = &tidbuf->pages[idx];
		mm = current->mm;
	}
	hfi1_release_user_pages(mm, pages, npages, mapped);
	fd->tid_n_pinned -= npages;
}

/*
 * Pin receive buffer pages.
 */
static int pin_rcv_pages(struct hfi1_filedata *fd, struct tid_user_buf *tidbuf)
{
	int pinned;
	unsigned int npages;
	unsigned long vaddr = tidbuf->vaddr;
	struct page **pages = NULL;
	struct hfi1_devdata *dd = fd->uctxt->dd;

	/* Get the number of pages the user buffer spans */
	npages = num_user_pages(vaddr, tidbuf->length);
	if (!npages)
		return -EINVAL;

	if (npages > fd->uctxt->expected_count) {
		dd_dev_err(dd, "Expected buffer too big\n");
		return -EINVAL;
	}

	/* Allocate the array of struct page pointers needed for pinning */
	pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	/*
	 * Pin all the pages of the user buffer. If we can't pin all the
	 * pages, accept the amount pinned so far and program only that.
	 * User space knows how to deal with partially programmed buffers.
	 */
	if (!hfi1_can_pin_pages(dd, current->mm, fd->tid_n_pinned, npages)) {
		kfree(pages);
		return -ENOMEM;
	}

	pinned = hfi1_acquire_user_pages(current->mm, vaddr, npages, true, pages);
	if (pinned <= 0) {
		kfree(pages);
		return pinned;
	}
	tidbuf->pages = pages;
	tidbuf->npages = npages;
	fd->tid_n_pinned += pinned;
	return pinned;
}
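
/*
 * Note: a positive return value may be smaller than npages; the caller
 * (hfi1_user_exp_rcv_setup()) programs only what was pinned and unpins
 * any leftover pages once programming is done.
 */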

/*
 * RcvArray entry allocation for Expected Receives is done by the
 * following algorithm:
 *
 * The context keeps 3 lists of groups of RcvArray entries:
 *   1. List of empty groups - tid_group_list
 *      This list is created during user context creation and
 *      contains elements which describe sets (of 8) of empty
 *      RcvArray entries.
 *   2. List of partially used groups - tid_used_list
 *      This list contains sets of RcvArray entries which are
 *      not completely used up. Another mapping request could
 *      use some or all of the remaining entries.
 *   3. List of full groups - tid_full_list
 *      This is the list where sets that are completely used
 *      up go.
 *
 * An attempt to optimize the usage of RcvArray entries is
 * made by finding all sets of physically contiguous pages in a
 * user's buffer.
 * These physically contiguous sets are further split into
 * sizes supported by the receive engine of the HFI. The
 * resulting sets of pages are stored in struct tid_pageset,
 * which describes the sets as:
 *    * .count - number of pages in this set
 *    * .idx - starting index into struct page ** array
 *                    of this set
 *
 * From this point on, the algorithm deals with the page sets
 * described above. The number of pagesets is divided by the
 * RcvArray group size to produce the number of full groups
 * needed.
 *
 * Groups from the 3 lists are manipulated using the following
 * rules (see also the worked example below):
 *   1. For each set of 8 pagesets, a complete group from
 *      tid_group_list is taken, programmed, and moved to
 *      the tid_full_list list.
 *   2. For all remaining pagesets:
 *      2.1 If the tid_used_list is empty and the tid_group_list
 *          is empty, stop processing pagesets and return only
 *          what has been programmed up to this point.
 *      2.2 If the tid_used_list is empty and the tid_group_list
 *          is not empty, move a group from tid_group_list to
 *          tid_used_list.
 *      2.3 For each group in tid_used_list, program as much as
 *          can fit into the group. If the group becomes fully
 *          used, move it to tid_full_list.
 */
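/*
 * Worked example (illustrative): with a group size of 8, a buffer that
 * splits into 21 pagesets yields 21 / 8 = 2 complete groups. Two groups
 * are popped from tid_group_list, fully programmed, and moved to
 * tid_full_list (rule 1). The remaining 5 pagesets are then packed into
 * groups from tid_used_list, pulling a fresh group from tid_group_list
 * when no partially used group is available (rules 2.1-2.3).
 */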
int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd,
			    struct hfi1_tid_info *tinfo)
{
	int ret = 0, need_group = 0, pinned;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;
	unsigned int ngroups, pageidx = 0, pageset_count,
		tididx = 0, mapped, mapped_pages = 0;
	u32 *tidlist = NULL;
	struct tid_user_buf *tidbuf;

	if (!PAGE_ALIGNED(tinfo->vaddr))
		return -EINVAL;

	tidbuf = kzalloc(sizeof(*tidbuf), GFP_KERNEL);
	if (!tidbuf)
		return -ENOMEM;

	tidbuf->vaddr = tinfo->vaddr;
	tidbuf->length = tinfo->length;
	tidbuf->psets = kcalloc(uctxt->expected_count, sizeof(*tidbuf->psets),
				GFP_KERNEL);
	if (!tidbuf->psets) {
		kfree(tidbuf);
		return -ENOMEM;
	}

	pinned = pin_rcv_pages(fd, tidbuf);
	if (pinned <= 0) {
		kfree(tidbuf->psets);
		kfree(tidbuf);
		return pinned;
	}

	/* Find sets of physically contiguous pages */
	tidbuf->n_psets = find_phys_blocks(tidbuf, pinned);

	/*
	 * We don't need to access this under a lock since tid_used is per
	 * process and the same process cannot be in hfi1_user_exp_rcv_clear()
	 * and hfi1_user_exp_rcv_setup() at the same time.
	 */
	spin_lock(&fd->tid_lock);
	if (fd->tid_used + tidbuf->n_psets > fd->tid_limit)
		pageset_count = fd->tid_limit - fd->tid_used;
	else
		pageset_count = tidbuf->n_psets;
	spin_unlock(&fd->tid_lock);

	if (!pageset_count)
		goto bail;

	ngroups = pageset_count / dd->rcv_entries.group_size;
	tidlist = kcalloc(pageset_count, sizeof(*tidlist), GFP_KERNEL);
	if (!tidlist) {
		ret = -ENOMEM;
		goto nomem;
	}

	tididx = 0;

	/*
	 * From this point on, we are going to be using shared (between master
	 * and subcontexts) context resources. We need to take the lock.
	 */
	mutex_lock(&uctxt->exp_mutex);
	/*
	 * The first step is to program the RcvArray entries which are complete
	 * groups.
	 */
	while (ngroups && uctxt->tid_group_list.count) {
		struct tid_group *grp =
			tid_group_pop(&uctxt->tid_group_list);

		ret = program_rcvarray(fd, tidbuf, grp,
				       pageidx, dd->rcv_entries.group_size,
				       tidlist, &tididx, &mapped);
		/*
		 * If there was a failure to program the RcvArray
		 * entries for the entire group, reset the grp fields
		 * and add the grp back to the free group list.
		 */
		if (ret <= 0) {
			tid_group_add_tail(grp, &uctxt->tid_group_list);
			hfi1_cdbg(TID,
				  "Failed to program RcvArray group %d", ret);
			goto unlock;
		}

		tid_group_add_tail(grp, &uctxt->tid_full_list);
		ngroups--;
		pageidx += ret;
		mapped_pages += mapped;
	}

	while (pageidx < pageset_count) {
		struct tid_group *grp, *ptr;
		/*
		 * If we don't have any partially used tid groups, check
		 * if we have empty groups. If so, take one from there and
		 * put in the partially used list.
		 */
		if (!uctxt->tid_used_list.count || need_group) {
			if (!uctxt->tid_group_list.count)
				goto unlock;

			grp = tid_group_pop(&uctxt->tid_group_list);
			tid_group_add_tail(grp, &uctxt->tid_used_list);
			need_group = 0;
		}
		/*
		 * There is an optimization opportunity here - instead of
		 * fitting as many page sets as we can, check for a group
		 * later on in the list that could fit all of them.
		 */
		list_for_each_entry_safe(grp, ptr, &uctxt->tid_used_list.list,
					 list) {
			unsigned use = min_t(unsigned, pageset_count - pageidx,
					     grp->size - grp->used);

			ret = program_rcvarray(fd, tidbuf, grp,
					       pageidx, use, tidlist,
					       &tididx, &mapped);
			if (ret < 0) {
				hfi1_cdbg(TID,
					  "Failed to program RcvArray entries %d",
					  ret);
				goto unlock;
			} else if (ret > 0) {
				if (grp->used == grp->size)
					tid_group_move(grp,
						       &uctxt->tid_used_list,
						       &uctxt->tid_full_list);
				pageidx += ret;
				mapped_pages += mapped;
				need_group = 0;
				/* Check if we are done so we break out early */
				if (pageidx >= pageset_count)
					break;
			} else if (WARN_ON(ret == 0)) {
				/*
				 * If ret is 0, we did not program any entries
				 * into this group, which can only happen if
				 * we've screwed up the accounting somewhere.
				 * Warn and try to continue.
				 */
				need_group = 1;
			}
		}
	}
unlock:
	mutex_unlock(&uctxt->exp_mutex);
nomem:
	hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx,
		  mapped_pages, ret);
	if (tididx) {
		spin_lock(&fd->tid_lock);
		fd->tid_used += tididx;
		spin_unlock(&fd->tid_lock);
		tinfo->tidcnt = tididx;
		tinfo->length = mapped_pages * PAGE_SIZE;

		if (copy_to_user(u64_to_user_ptr(tinfo->tidlist),
				 tidlist, sizeof(tidlist[0]) * tididx)) {
			/*
			 * On failure to copy to the user level, we need to undo
			 * everything done so far so we don't leak resources.
			 */
			tinfo->tidlist = (unsigned long)&tidlist;
			hfi1_user_exp_rcv_clear(fd, tinfo);
			tinfo->tidlist = 0;
			ret = -EFAULT;
			goto bail;
		}
	}

	/*
	 * If not everything was mapped (due to insufficient RcvArray entries,
	 * for example), unpin all unmapped pages so we can pin them next time.
	 */
	if (mapped_pages != pinned)
		unpin_rcv_pages(fd, tidbuf, NULL, mapped_pages,
				(pinned - mapped_pages), false);
bail:
	kfree(tidbuf->psets);
	kfree(tidlist);
	kfree(tidbuf->pages);
	kfree(tidbuf);
	return ret > 0 ? 0 : ret;
}
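
/*
 * Note: success returns 0 even when only part of the buffer could be
 * programmed; tinfo->tidcnt and tinfo->length tell user space how much
 * was actually set up.
 */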

int hfi1_user_exp_rcv_clear(struct hfi1_filedata *fd,
			    struct hfi1_tid_info *tinfo)
{
	int ret = 0;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	u32 *tidinfo;
	unsigned tididx;

	if (unlikely(tinfo->tidcnt > fd->tid_used))
		return -EINVAL;

	tidinfo = memdup_user(u64_to_user_ptr(tinfo->tidlist),
			      sizeof(tidinfo[0]) * tinfo->tidcnt);
	if (IS_ERR(tidinfo))
		return PTR_ERR(tidinfo);

	mutex_lock(&uctxt->exp_mutex);
	for (tididx = 0; tididx < tinfo->tidcnt; tididx++) {
		ret = unprogram_rcvarray(fd, tidinfo[tididx], NULL);
		if (ret) {
			hfi1_cdbg(TID, "Failed to unprogram rcv array %d",
				  ret);
			break;
		}
	}
	spin_lock(&fd->tid_lock);
	fd->tid_used -= tididx;
	spin_unlock(&fd->tid_lock);
	tinfo->tidcnt = tididx;
	mutex_unlock(&uctxt->exp_mutex);

	kfree(tidinfo);
	return ret;
}

int hfi1_user_exp_rcv_invalid(struct hfi1_filedata *fd,
			      struct hfi1_tid_info *tinfo)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	unsigned long *ev = uctxt->dd->events +
		(uctxt_offset(uctxt) + fd->subctxt);
	u32 *array;
	int ret = 0;

	/*
	 * copy_to_user() can sleep, which will leave the invalid_lock
	 * locked and cause the MMU notifier to be blocked on the lock
	 * for a long time.
	 * Copy the data to a local buffer so we can release the lock.
	 */
	array = kcalloc(uctxt->expected_count, sizeof(*array), GFP_KERNEL);
	if (!array)
		return -ENOMEM;

	spin_lock(&fd->invalid_lock);
	if (fd->invalid_tid_idx) {
		memcpy(array, fd->invalid_tids, sizeof(*array) *
		       fd->invalid_tid_idx);
		memset(fd->invalid_tids, 0, sizeof(*fd->invalid_tids) *
		       fd->invalid_tid_idx);
		tinfo->tidcnt = fd->invalid_tid_idx;
		fd->invalid_tid_idx = 0;
		/*
		 * Reset the user flag while still holding the lock.
		 * Otherwise, PSM can miss events.
		 */
		clear_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
	} else {
		tinfo->tidcnt = 0;
	}
	spin_unlock(&fd->invalid_lock);

	if (tinfo->tidcnt) {
		if (copy_to_user((void __user *)tinfo->tidlist,
				 array, sizeof(*array) * tinfo->tidcnt))
			ret = -EFAULT;
	}
	kfree(array);

	return ret;
}

static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages)
{
	unsigned pagecount, pageidx, setcount = 0, i;
	unsigned long pfn, this_pfn;
	struct page **pages = tidbuf->pages;
	struct tid_pageset *list = tidbuf->psets;

	if (!npages)
		return 0;

	/*
	 * Look for sets of physically contiguous pages in the user buffer.
	 * This will allow us to optimize Expected RcvArray entry usage by
	 * using the bigger supported sizes.
	 */
	pfn = page_to_pfn(pages[0]);
	for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) {
		this_pfn = i < npages ? page_to_pfn(pages[i]) : 0;

		/*
		 * If the pfn's are not sequential, pages are not physically
		 * contiguous.
		 */
		if (this_pfn != ++pfn) {
			/*
			 * At this point we have to loop over the set of
			 * physically contiguous pages and break them down into
			 * sizes supported by the HW.
			 * There are two main constraints:
			 *     1. The max buffer size is MAX_EXPECTED_BUFFER.
			 *        If the total set size is bigger than that,
			 *        program only a MAX_EXPECTED_BUFFER chunk.
			 *     2. The buffer size has to be a power of two. If
			 *        it is not, round down to the closest power of
			 *        two and program that size.
			 */
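			/*
			 * Illustrative example (assuming 4 KiB pages and a
			 * larger MAX_EXPECTED_BUFFER): a run of 7 contiguous
			 * pages is 28 KiB, not a power of two, so the loop
			 * below emits pagesets of 4, 2 and 1 pages, each a
			 * power-of-two buffer size.
			 */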
			while (pagecount) {
				int maxpages = pagecount;
				u32 bufsize = pagecount * PAGE_SIZE;

				if (bufsize > MAX_EXPECTED_BUFFER)
					maxpages =
						MAX_EXPECTED_BUFFER >>
						PAGE_SHIFT;
				else if (!is_power_of_2(bufsize))
					maxpages =
						rounddown_pow_of_two(bufsize) >>
						PAGE_SHIFT;

				list[setcount].idx = pageidx;
				list[setcount].count = maxpages;
				pagecount -= maxpages;
				pageidx += maxpages;
				setcount++;
			}
			pageidx = i;
			pagecount = 1;
			pfn = this_pfn;
		} else {
			pagecount++;
		}
	}
	return setcount;
}

/**
 * program_rcvarray() - program an RcvArray group with receive buffers
 * @fd: filedata pointer
 * @tbuf: pointer to struct tid_user_buf that has the user buffer starting
 *	  virtual address, buffer length, page pointers, pagesets (array of
 *	  struct tid_pageset holding information on physically contiguous
 *	  chunks from the user buffer), and other fields.
 * @grp: RcvArray group
 * @start: starting index into sets array
 * @count: number of struct tid_pageset's to program
 * @tidlist: the array of u32 elements where the information about the
 *           programmed RcvArray entries is to be encoded.
 * @tididx: starting offset into tidlist
 * @pmapped: (output parameter) number of pages programmed into the RcvArray
 *           entries.
 *
 * This function will program up to 'count' RcvArray entries from the
 * group 'grp'. To make best use of write-combining writes, the function will
 * perform writes to the unused RcvArray entries which will be ignored by the
 * HW. Each RcvArray entry will be programmed with a physically contiguous
 * buffer chunk from the user's virtual buffer.
 *
 * Return:
 * -EINVAL if the requested count is larger than the size of the group,
 * -ENOMEM or -EFAULT on error from set_rcvarray_entry(), or
 * number of RcvArray entries programmed.
 */
static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *tbuf,
			    struct tid_group *grp,
			    unsigned int start, u16 count,
			    u32 *tidlist, unsigned int *tididx,
			    unsigned int *pmapped)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;
	u16 idx;
	u32 tidinfo = 0, rcventry, useidx = 0;
	int mapped = 0;

	/* Count should never be larger than the group size */
	if (count > grp->size)
		return -EINVAL;

	/* Find the first unused entry in the group */
	for (idx = 0; idx < grp->size; idx++) {
		if (!(grp->map & (1 << idx))) {
			useidx = idx;
			break;
		}
		rcv_array_wc_fill(dd, grp->base + idx);
	}

	idx = 0;
	while (idx < count) {
		u16 npages, pageidx, setidx = start + idx;
		int ret = 0;

		/*
		 * If this entry in the group is used, move to the next one.
		 * If we go past the end of the group, exit the loop.
		 */
		if (useidx >= grp->size) {
			break;
		} else if (grp->map & (1 << useidx)) {
			rcv_array_wc_fill(dd, grp->base + useidx);
			useidx++;
			continue;
		}

		rcventry = grp->base + useidx;
		npages = tbuf->psets[setidx].count;
		pageidx = tbuf->psets[setidx].idx;

		ret = set_rcvarray_entry(fd, tbuf,
					 rcventry, grp, pageidx,
					 npages);
		if (ret)
			return ret;
		mapped += npages;

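		/*
		 * Encode the entry for user space: rcventry2tidinfo() packs
		 * the entry's offset from expected_base, and EXP_TID_SET()
		 * adds the pageset length, producing the TID info word that
		 * a later hfi1_user_exp_rcv_clear() call hands back.
		 */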
		tidinfo = rcventry2tidinfo(rcventry - uctxt->expected_base) |
			EXP_TID_SET(LEN, npages);
		tidlist[(*tididx)++] = tidinfo;
		grp->used++;
		grp->map |= 1 << useidx++;
		idx++;
	}

	/* Fill the rest of the group with "blank" writes */
	for (; useidx < grp->size; useidx++)
		rcv_array_wc_fill(dd, grp->base + useidx);
	*pmapped = mapped;
	return idx;
}

static int set_rcvarray_entry(struct hfi1_filedata *fd,
			      struct tid_user_buf *tbuf,
			      u32 rcventry, struct tid_group *grp,
			      u16 pageidx, unsigned int npages)
{
	int ret;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct tid_rb_node *node;
	struct hfi1_devdata *dd = uctxt->dd;
	dma_addr_t phys;
	struct page **pages = tbuf->pages + pageidx;

	/*
	 * Allocate the node first so we can handle a potential
	 * failure before we've programmed anything.
	 */
	node = kzalloc(struct_size(node, pages, npages), GFP_KERNEL);
	if (!node)
		return -ENOMEM;

	phys = dma_map_single(&dd->pcidev->dev, __va(page_to_phys(pages[0])),
			      npages * PAGE_SIZE, DMA_FROM_DEVICE);
	if (dma_mapping_error(&dd->pcidev->dev, phys)) {
		dd_dev_err(dd, "Failed to DMA map Exp Rcv pages 0x%llx\n",
			   phys);
		kfree(node);
		return -EFAULT;
	}

	node->fdata = fd;
	node->phys = page_to_phys(pages[0]);
	node->npages = npages;
	node->rcventry = rcventry;
	node->dma_addr = phys;
	node->grp = grp;
	node->freed = false;
	memcpy(node->pages, pages, flex_array_size(node, pages, npages));

	if (fd->use_mn) {
		ret = mmu_interval_notifier_insert(
			&node->notifier, current->mm,
			tbuf->vaddr + (pageidx * PAGE_SIZE), npages * PAGE_SIZE,
			&tid_mn_ops);
		if (ret)
			goto out_unmap;
		/*
		 * FIXME: This is in the wrong order, the notifier should be
		 * established before the pages are pinned by pin_rcv_pages.
		 */
		mmu_interval_read_begin(&node->notifier);
	}
	fd->entry_to_rb[node->rcventry - uctxt->expected_base] = node;

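	/*
	 * Pagesets are sized to powers of two by find_phys_blocks(), so
	 * ilog2(npages) + 1 below is the power-of-two size order handed to
	 * hfi1_put_tid() when programming the hardware entry.
	 */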
	hfi1_put_tid(dd, rcventry, PT_EXPECTED, phys, ilog2(npages) + 1);
	trace_hfi1_exp_tid_reg(uctxt->ctxt, fd->subctxt, rcventry, npages,
			       node->notifier.interval_tree.start, node->phys,
			       phys);
	return 0;

out_unmap:
	hfi1_cdbg(TID, "Failed to insert RB node %u 0x%lx, 0x%lx %d",
		  node->rcventry, node->notifier.interval_tree.start,
		  node->phys, ret);
	dma_unmap_single(&dd->pcidev->dev, phys, npages * PAGE_SIZE,
			 DMA_FROM_DEVICE);
	kfree(node);
	return -EFAULT;
}

static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
			      struct tid_group **grp)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;
	struct tid_rb_node *node;
	u8 tidctrl = EXP_TID_GET(tidinfo, CTRL);
	u32 tididx = EXP_TID_GET(tidinfo, IDX) << 1, rcventry;

	if (tididx >= uctxt->expected_count) {
		dd_dev_err(dd, "Invalid RcvArray entry (%u) index for ctxt %u\n",
			   tididx, uctxt->ctxt);
		return -EINVAL;
	}

	if (tidctrl == 0x3)
		return -EINVAL;

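	/*
	 * tididx (IDX << 1) addresses a pair of RcvArray entries and tidctrl
	 * is 1 or 2 at this point (0x3 was rejected above), so tidctrl - 1
	 * selects the even or odd entry of that pair.
	 */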
	rcventry = tididx + (tidctrl - 1);

	node = fd->entry_to_rb[rcventry];
	if (!node || node->rcventry != (uctxt->expected_base + rcventry))
		return -EBADF;

	if (grp)
		*grp = node->grp;

	if (fd->use_mn)
		mmu_interval_notifier_remove(&node->notifier);
	cacheless_tid_rb_remove(fd, node);

	return 0;
}

static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;

	trace_hfi1_exp_tid_unreg(uctxt->ctxt, fd->subctxt, node->rcventry,
				 node->npages,
				 node->notifier.interval_tree.start, node->phys,
				 node->dma_addr);

	/*
	 * Make sure device has seen the write before we unpin the
	 * pages.
	 */
	hfi1_put_tid(dd, node->rcventry, PT_INVALID_FLUSH, 0, 0);

	unpin_rcv_pages(fd, NULL, node, 0, node->npages, true);

	node->grp->used--;
	node->grp->map &= ~(1 << (node->rcventry - node->grp->base));

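	/*
	 * The group was either full or partially used before this entry was
	 * freed: dropping from size to size - 1 used entries moves it from
	 * tid_full_list back to tid_used_list, and dropping to zero returns
	 * it to the free tid_group_list.
	 */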
	if (node->grp->used == node->grp->size - 1)
		tid_group_move(node->grp, &uctxt->tid_full_list,
			       &uctxt->tid_used_list);
	else if (!node->grp->used)
		tid_group_move(node->grp, &uctxt->tid_used_list,
			       &uctxt->tid_group_list);
	kfree(node);
}

/*
 * As a simple helper for hfi1_user_exp_rcv_free, this function deals with
 * clearing nodes in the non-cached case.
 */
static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
			    struct exp_tid_set *set,
			    struct hfi1_filedata *fd)
{
	struct tid_group *grp, *ptr;
	int i;

	list_for_each_entry_safe(grp, ptr, &set->list, list) {
		list_del_init(&grp->list);

		for (i = 0; i < grp->size; i++) {
			if (grp->map & (1 << i)) {
				u16 rcventry = grp->base + i;
				struct tid_rb_node *node;

				node = fd->entry_to_rb[rcventry -
							  uctxt->expected_base];
				if (!node || node->rcventry != rcventry)
					continue;

				if (fd->use_mn)
					mmu_interval_notifier_remove(
						&node->notifier);
				cacheless_tid_rb_remove(fd, node);
			}
		}
	}
}

static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
			      const struct mmu_notifier_range *range,
			      unsigned long cur_seq)
{
	struct tid_rb_node *node =
		container_of(mni, struct tid_rb_node, notifier);
	struct hfi1_filedata *fdata = node->fdata;
	struct hfi1_ctxtdata *uctxt = fdata->uctxt;

	if (node->freed)
		return true;

	trace_hfi1_exp_tid_inval(uctxt->ctxt, fdata->subctxt,
				 node->notifier.interval_tree.start,
				 node->rcventry, node->npages, node->dma_addr);
	node->freed = true;

	spin_lock(&fdata->invalid_lock);
	if (fdata->invalid_tid_idx < uctxt->expected_count) {
		fdata->invalid_tids[fdata->invalid_tid_idx] =
			rcventry2tidinfo(node->rcventry - uctxt->expected_base);
		fdata->invalid_tids[fdata->invalid_tid_idx] |=
			EXP_TID_SET(LEN, node->npages);
		if (!fdata->invalid_tid_idx) {
			unsigned long *ev;

			/*
			 * hfi1_set_uevent_bits() sets a user event flag
			 * for all processes. Because calling into the
			 * driver to process TID cache invalidations is
			 * expensive and TID cache invalidations are
			 * handled on a per-process basis, we can
			 * optimize this to set the flag only for the
			 * process in question.
			 */
			ev = uctxt->dd->events +
				(uctxt_offset(uctxt) + fdata->subctxt);
			set_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
		}
		fdata->invalid_tid_idx++;
	}
	spin_unlock(&fdata->invalid_lock);
	return true;
}

static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
				    struct tid_rb_node *tnode)
{
	u32 base = fdata->uctxt->expected_base;

	fdata->entry_to_rb[tnode->rcventry - base] = NULL;
	clear_tid_node(fdata, tnode);
}