cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

mr.c (19126B)


/*
 * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
 * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/slab.h>
#include <rdma/ib_user_verbs.h>

#include "mlx4_ib.h"

static u32 convert_access(int acc)
{
	return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX4_PERM_ATOMIC       : 0) |
	       (acc & IB_ACCESS_REMOTE_WRITE  ? MLX4_PERM_REMOTE_WRITE : 0) |
	       (acc & IB_ACCESS_REMOTE_READ   ? MLX4_PERM_REMOTE_READ  : 0) |
	       (acc & IB_ACCESS_LOCAL_WRITE   ? MLX4_PERM_LOCAL_WRITE  : 0) |
	       (acc & IB_ACCESS_MW_BIND       ? MLX4_PERM_BIND_MW      : 0) |
	       MLX4_PERM_LOCAL_READ;
}
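
/*
 * Example for convert_access() (illustrative, derived from the mapping
 * above): convert_access(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ)
 * returns MLX4_PERM_LOCAL_WRITE | MLX4_PERM_REMOTE_READ |
 * MLX4_PERM_LOCAL_READ; local read permission is always included,
 * whatever IB access flags were requested.
 */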

static enum mlx4_mw_type to_mlx4_type(enum ib_mw_type type)
{
	switch (type) {
	case IB_MW_TYPE_1:	return MLX4_MW_TYPE_1;
	case IB_MW_TYPE_2:	return MLX4_MW_TYPE_2;
	default:		return -1;
	}
}

struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc)
{
	struct mlx4_ib_mr *mr;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	err = mlx4_mr_alloc(to_mdev(pd->device)->dev, to_mpd(pd)->pdn, 0,
			    ~0ull, convert_access(acc), 0, 0, &mr->mmr);
	if (err)
		goto err_free;

	err = mlx4_mr_enable(to_mdev(pd->device)->dev, &mr->mmr);
	if (err)
		goto err_mr;

	mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key;
	mr->umem = NULL;

	return &mr->ibmr;

err_mr:
	(void) mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr);

err_free:
	kfree(mr);

	return ERR_PTR(err);
}

enum {
	MLX4_MAX_MTT_SHIFT = 31
};

static int mlx4_ib_umem_write_mtt_block(struct mlx4_ib_dev *dev,
					struct mlx4_mtt *mtt,
					u64 mtt_size, u64 mtt_shift, u64 len,
					u64 cur_start_addr, u64 *pages,
					int *start_index, int *npages)
{
	u64 cur_end_addr = cur_start_addr + len;
	u64 cur_end_addr_aligned = 0;
	u64 mtt_entries;
	int err = 0;
	int k;

	len += (cur_start_addr & (mtt_size - 1ULL));
	cur_end_addr_aligned = round_up(cur_end_addr, mtt_size);
	len += (cur_end_addr_aligned - cur_end_addr);
	if (len & (mtt_size - 1ULL)) {
		pr_warn("write_block: len %llx is not aligned to mtt_size %llx\n",
			len, mtt_size);
		return -EINVAL;
	}

	mtt_entries = (len >> mtt_shift);

	/*
	 * Align the MTT start address to the mtt_size.
	 * Required to handle cases when the MR starts in the middle of an MTT
	 * record. Was not required in old code since the physical addresses
	 * provided by the dma subsystem were page aligned, which was also the
	 * MTT size.
	 */
	cur_start_addr = round_down(cur_start_addr, mtt_size);
	/* A new block is started ... */
	for (k = 0; k < mtt_entries; ++k) {
		pages[*npages] = cur_start_addr + (mtt_size * k);
		(*npages)++;
		/*
		 * Be friendly to mlx4_write_mtt() and pass it chunks of
		 * appropriate size.
		 */
		if (*npages == PAGE_SIZE / sizeof(u64)) {
			err = mlx4_write_mtt(dev->dev, mtt, *start_index,
					     *npages, pages);
			if (err)
				return err;

			(*start_index) += *npages;
			*npages = 0;
		}
	}

	return 0;
}
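
/*
 * Note (illustrative): on a system with 4 KiB pages, the scratch buffer
 * passed to mlx4_ib_umem_write_mtt_block() holds PAGE_SIZE / sizeof(u64)
 * = 512 MTT entries, so the function flushes to mlx4_write_mtt() in
 * chunks of at most 512 entries.
 */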

static inline u64 alignment_of(u64 ptr)
{
	return ilog2(ptr & (~(ptr - 1)));
}
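
/*
 * Example for alignment_of() (illustrative): it returns the index of the
 * least significant set bit, i.e. the log2 of the largest power-of-two
 * alignment of ptr. For instance, alignment_of(0x5000) == 12 (4 KiB
 * aligned) and alignment_of(0x6000) == 13 (8 KiB aligned).
 */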

static int mlx4_ib_umem_calc_block_mtt(u64 next_block_start,
				       u64 current_block_end,
				       u64 block_shift)
{
	/* Check whether the new block is as well aligned as the previous
	 * block. The block address must have zeros in its low block_shift
	 * bits.
	 */
	if ((next_block_start & ((1ULL << block_shift) - 1ULL)) != 0)
		/*
		 * It is not as well aligned as the previous block - reduce
		 * the mtt size accordingly. Here we take the lowest bit
		 * which is 1.
		 */
		block_shift = alignment_of(next_block_start);

	/*
	 * Check whether the end of the previous block is as well aligned as
	 * the start of the block.
	 */
	if (((current_block_end) & ((1ULL << block_shift) - 1ULL)) != 0)
		/*
		 * It is not as well aligned as the start of the block -
		 * reduce the mtt size accordingly.
		 */
		block_shift = alignment_of(current_block_end);

	return block_shift;
}

int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt,
			   struct ib_umem *umem)
{
	u64 *pages;
	u64 len = 0;
	int err = 0;
	u64 mtt_size;
	u64 cur_start_addr = 0;
	u64 mtt_shift;
	int start_index = 0;
	int npages = 0;
	struct scatterlist *sg;
	int i;

	pages = (u64 *) __get_free_page(GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	mtt_shift = mtt->page_shift;
	mtt_size = 1ULL << mtt_shift;

	for_each_sgtable_dma_sg(&umem->sgt_append.sgt, sg, i) {
		if (cur_start_addr + len == sg_dma_address(sg)) {
			/* still the same block */
			len += sg_dma_len(sg);
			continue;
		}
		/*
		 * A new block is started ...
		 * If len is misaligned, write an extra mtt entry to cover the
		 * misaligned area (round up the division)
		 */
		err = mlx4_ib_umem_write_mtt_block(dev, mtt, mtt_size,
						   mtt_shift, len,
						   cur_start_addr,
						   pages, &start_index,
						   &npages);
		if (err)
			goto out;

		cur_start_addr = sg_dma_address(sg);
		len = sg_dma_len(sg);
	}

	/* Handle the last block */
	if (len > 0) {
		/*
		 * If len is misaligned, write an extra mtt entry to cover
		 * the misaligned area (round up the division)
		 */
		err = mlx4_ib_umem_write_mtt_block(dev, mtt, mtt_size,
						   mtt_shift, len,
						   cur_start_addr, pages,
						   &start_index, &npages);
		if (err)
			goto out;
	}

	if (npages)
		err = mlx4_write_mtt(dev->dev, mtt, start_index, npages, pages);

out:
	free_page((unsigned long) pages);
	return err;
}

/*
 * Calculate the optimal mtt size based on contiguous pages.
 * The function also returns the number of pages that are not aligned to the
 * calculated mtt_size, to be added to the total number of pages. For that we
 * check the first chunk length and the last chunk length and, if they are not
 * aligned to mtt_size, increment the non_aligned_pages number. All chunks in
 * the middle are already handled as part of the mtt shift calculation for
 * both their start and end addresses.
 */
int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem, u64 start_va,
				       int *num_of_mtts)
{
	u64 block_shift = MLX4_MAX_MTT_SHIFT;
	u64 min_shift = PAGE_SHIFT;
	u64 last_block_aligned_end = 0;
	u64 current_block_start = 0;
	u64 first_block_start = 0;
	u64 current_block_len = 0;
	u64 last_block_end = 0;
	struct scatterlist *sg;
	u64 current_block_end;
	u64 misalignment_bits;
	u64 next_block_start;
	u64 total_len = 0;
	int i;

	*num_of_mtts = ib_umem_num_dma_blocks(umem, PAGE_SIZE);

	for_each_sgtable_dma_sg(&umem->sgt_append.sgt, sg, i) {
		/*
		 * Initialization - save the first chunk start as the
		 * current_block_start - block means contiguous pages.
		 */
		if (current_block_len == 0 && current_block_start == 0) {
			current_block_start = sg_dma_address(sg);
			first_block_start = current_block_start;
			/*
			 * Find the bits that are different between the physical
			 * address and the virtual address for the start of the
			 * MR.
			 * umem_get aligned the start_va to a page boundary.
			 * Therefore, we need to align the start va to the same
			 * boundary.
			 * misalignment_bits is needed to handle the case of a
			 * single memory region. In this case, the rest of the
			 * logic will not reduce the block size.  If we use a
			 * block size which is bigger than the alignment of the
			 * misalignment bits, we might use the virtual page
			 * number instead of the physical page number, resulting
			 * in access to the wrong data.
			 */
			misalignment_bits =
				(start_va & (~(((u64)(PAGE_SIZE)) - 1ULL))) ^
				current_block_start;
			block_shift = min(alignment_of(misalignment_bits),
					  block_shift);
		}

		/*
		 * Go over the scatter entries and check if they continue the
		 * previous scatter entry.
		 */
		next_block_start = sg_dma_address(sg);
		current_block_end = current_block_start + current_block_len;
		/* If we have a split (non-contig.) between two blocks */
		if (current_block_end != next_block_start) {
			block_shift = mlx4_ib_umem_calc_block_mtt
					(next_block_start,
					 current_block_end,
					 block_shift);

			/*
			 * If we reached the minimum shift for 4k page we stop
			 * the loop.
			 */
			if (block_shift <= min_shift)
				goto end;

			/*
			 * If not saved yet we are in the first block - save the
			 * length of the first block to calculate the
			 * non_aligned_pages number at the end.
			 */
			total_len += current_block_len;

			/* Start a new block */
			current_block_start = next_block_start;
			current_block_len = sg_dma_len(sg);
			continue;
		}
		/* The scatter entry is another part of the current block,
		 * increase the block size.
		 * An entry in the scatter list can be larger than 4k (page)
		 * because the dma mapping may merge some blocks together.
		 */
		current_block_len += sg_dma_len(sg);
	}

	/* Account for the last block in the total len */
	total_len += current_block_len;
	/* Add to the first block the misalignment that it suffers from. */
	total_len += (first_block_start & ((1ULL << block_shift) - 1ULL));
	last_block_end = current_block_start + current_block_len;
	last_block_aligned_end = round_up(last_block_end, 1ULL << block_shift);
	total_len += (last_block_aligned_end - last_block_end);

	if (total_len & ((1ULL << block_shift) - 1ULL))
		pr_warn("misaligned total length detected (%llu, %llu)!",
			total_len, block_shift);

	*num_of_mtts = total_len >> block_shift;
end:
	if (block_shift < min_shift) {
		/*
		 * If the shift is less than the minimum, warn and return the
		 * minimum shift.
		 */
		pr_warn("umem_calc_optimal_mtt_size - unexpected shift %lld\n", block_shift);

		block_shift = min_shift;
	}
	return block_shift;
}
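
/*
 * Worked example (illustrative): if the umem consists of two DMA chunks,
 * one of 1 MiB at 0x10000000 and one of 1 MiB at 0x10200000, and the
 * page-aligned start_va differs from 0x10000000 only in bits above bit
 * 20, the discontinuity reduces block_shift to
 * alignment_of(0x10100000) == 20, so the MR is described by two 1 MiB
 * MTT entries (*num_of_mtts == 2) instead of 512 4 KiB entries.
 */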

static struct ib_umem *mlx4_get_umem_mr(struct ib_device *device, u64 start,
					u64 length, int access_flags)
{
	/*
	 * Force registering the memory as writable if the underlying pages
	 * are writable.  This is so rereg can change the access permissions
	 * from readable to writable without having to run through
	 * ib_umem_get again.
	 */
	if (!ib_access_writable(access_flags)) {
		unsigned long untagged_start = untagged_addr(start);
		struct vm_area_struct *vma;

		mmap_read_lock(current->mm);
		/*
		 * FIXME: Ideally this would iterate over all the vmas that
		 * cover the memory, but for now it requires a single vma to
		 * entirely cover the MR to support RO mappings.
		 */
		vma = find_vma(current->mm, untagged_start);
		if (vma && vma->vm_end >= untagged_start + length &&
		    vma->vm_start <= untagged_start) {
			if (vma->vm_flags & VM_WRITE)
				access_flags |= IB_ACCESS_LOCAL_WRITE;
		} else {
			access_flags |= IB_ACCESS_LOCAL_WRITE;
		}

		mmap_read_unlock(current->mm);
	}

	return ib_umem_get(device, start, length, access_flags);
}

struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
				  u64 virt_addr, int access_flags,
				  struct ib_udata *udata)
{
	struct mlx4_ib_dev *dev = to_mdev(pd->device);
	struct mlx4_ib_mr *mr;
	int shift;
	int err;
	int n;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	mr->umem = mlx4_get_umem_mr(pd->device, start, length, access_flags);
	if (IS_ERR(mr->umem)) {
		err = PTR_ERR(mr->umem);
		goto err_free;
	}

	shift = mlx4_ib_umem_calc_optimal_mtt_size(mr->umem, start, &n);

	err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, virt_addr, length,
			    convert_access(access_flags), n, shift, &mr->mmr);
	if (err)
		goto err_umem;

	err = mlx4_ib_umem_write_mtt(dev, &mr->mmr.mtt, mr->umem);
	if (err)
		goto err_mr;

	err = mlx4_mr_enable(dev->dev, &mr->mmr);
	if (err)
		goto err_mr;

	mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key;
	mr->ibmr.length = length;
	mr->ibmr.page_size = 1U << shift;

	return &mr->ibmr;

err_mr:
	(void) mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr);

err_umem:
	ib_umem_release(mr->umem);

err_free:
	kfree(mr);

	return ERR_PTR(err);
}

struct ib_mr *mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags, u64 start,
				    u64 length, u64 virt_addr,
				    int mr_access_flags, struct ib_pd *pd,
				    struct ib_udata *udata)
{
	struct mlx4_ib_dev *dev = to_mdev(mr->device);
	struct mlx4_ib_mr *mmr = to_mmr(mr);
	struct mlx4_mpt_entry *mpt_entry;
	struct mlx4_mpt_entry **pmpt_entry = &mpt_entry;
	int err;

	/* Since we synchronize this call and mlx4_ib_dereg_mr via uverbs,
	 * we assume that the calls can't run concurrently. Otherwise, a
	 * race exists.
	 */
	err = mlx4_mr_hw_get_mpt(dev->dev, &mmr->mmr, &pmpt_entry);
	if (err)
		return ERR_PTR(err);

	if (flags & IB_MR_REREG_PD) {
		err = mlx4_mr_hw_change_pd(dev->dev, *pmpt_entry,
					   to_mpd(pd)->pdn);

		if (err)
			goto release_mpt_entry;
	}

	if (flags & IB_MR_REREG_ACCESS) {
		if (ib_access_writable(mr_access_flags) &&
		    !mmr->umem->writable) {
			err = -EPERM;
			goto release_mpt_entry;
		}

		err = mlx4_mr_hw_change_access(dev->dev, *pmpt_entry,
					       convert_access(mr_access_flags));

		if (err)
			goto release_mpt_entry;
	}

	if (flags & IB_MR_REREG_TRANS) {
		int shift;
		int n;

		mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr);
		ib_umem_release(mmr->umem);
		mmr->umem = mlx4_get_umem_mr(mr->device, start, length,
					     mr_access_flags);
		if (IS_ERR(mmr->umem)) {
			err = PTR_ERR(mmr->umem);
			/* Prevent mlx4_ib_dereg_mr from freeing an invalid pointer */
			mmr->umem = NULL;
			goto release_mpt_entry;
		}
		n = ib_umem_num_dma_blocks(mmr->umem, PAGE_SIZE);
		shift = PAGE_SHIFT;

		err = mlx4_mr_rereg_mem_write(dev->dev, &mmr->mmr,
					      virt_addr, length, n, shift,
					      *pmpt_entry);
		if (err) {
			ib_umem_release(mmr->umem);
			goto release_mpt_entry;
		}
		mmr->mmr.iova       = virt_addr;
		mmr->mmr.size       = length;

		err = mlx4_ib_umem_write_mtt(dev, &mmr->mmr.mtt, mmr->umem);
		if (err) {
			mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr);
			ib_umem_release(mmr->umem);
			goto release_mpt_entry;
		}
	}

	/* If we couldn't transfer the MR to the HCA, just remember to
	 * return a failure. But dereg_mr will free the resources.
	 */
	err = mlx4_mr_hw_write_mpt(dev->dev, &mmr->mmr, pmpt_entry);
	if (!err && flags & IB_MR_REREG_ACCESS)
		mmr->mmr.access = mr_access_flags;

release_mpt_entry:
	mlx4_mr_hw_put_mpt(dev->dev, pmpt_entry);
	if (err)
		return ERR_PTR(err);
	return NULL;
}

static int
mlx4_alloc_priv_pages(struct ib_device *device,
		      struct mlx4_ib_mr *mr,
		      int max_pages)
{
	int ret;

	/* Ensure that size is aligned to DMA cacheline
	 * requirements.
	 * max_pages is limited to MLX4_MAX_FAST_REG_PAGES
	 * so page_map_size will never cross PAGE_SIZE.
	 */
	mr->page_map_size = roundup(max_pages * sizeof(u64),
				    MLX4_MR_PAGES_ALIGN);

	/* Prevent cross page boundary allocation. */
	mr->pages = (__be64 *)get_zeroed_page(GFP_KERNEL);
	if (!mr->pages)
		return -ENOMEM;

	mr->page_map = dma_map_single(device->dev.parent, mr->pages,
				      mr->page_map_size, DMA_TO_DEVICE);

	if (dma_mapping_error(device->dev.parent, mr->page_map)) {
		ret = -ENOMEM;
		goto err;
	}

	return 0;

err:
	free_page((unsigned long)mr->pages);
	return ret;
}

static void
mlx4_free_priv_pages(struct mlx4_ib_mr *mr)
{
	if (mr->pages) {
		struct ib_device *device = mr->ibmr.device;

		dma_unmap_single(device->dev.parent, mr->page_map,
				 mr->page_map_size, DMA_TO_DEVICE);
		free_page((unsigned long)mr->pages);
		mr->pages = NULL;
	}
}

int mlx4_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
{
	struct mlx4_ib_mr *mr = to_mmr(ibmr);
	int ret;

	mlx4_free_priv_pages(mr);

	ret = mlx4_mr_free(to_mdev(ibmr->device)->dev, &mr->mmr);
	if (ret)
		return ret;
	if (mr->umem)
		ib_umem_release(mr->umem);
	kfree(mr);

	return 0;
}

int mlx4_ib_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata)
{
	struct mlx4_ib_dev *dev = to_mdev(ibmw->device);
	struct mlx4_ib_mw *mw = to_mmw(ibmw);
	int err;

	err = mlx4_mw_alloc(dev->dev, to_mpd(ibmw->pd)->pdn,
			    to_mlx4_type(ibmw->type), &mw->mmw);
	if (err)
		return err;

	err = mlx4_mw_enable(dev->dev, &mw->mmw);
	if (err)
		goto err_mw;

	ibmw->rkey = mw->mmw.key;
	return 0;

err_mw:
	mlx4_mw_free(dev->dev, &mw->mmw);
	return err;
}

int mlx4_ib_dealloc_mw(struct ib_mw *ibmw)
{
	struct mlx4_ib_mw *mw = to_mmw(ibmw);

	mlx4_mw_free(to_mdev(ibmw->device)->dev, &mw->mmw);
	return 0;
}

struct ib_mr *mlx4_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
			       u32 max_num_sg)
{
	struct mlx4_ib_dev *dev = to_mdev(pd->device);
	struct mlx4_ib_mr *mr;
	int err;

	if (mr_type != IB_MR_TYPE_MEM_REG ||
	    max_num_sg > MLX4_MAX_FAST_REG_PAGES)
		return ERR_PTR(-EINVAL);

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, 0, 0, 0,
			    max_num_sg, 0, &mr->mmr);
	if (err)
		goto err_free;

	err = mlx4_alloc_priv_pages(pd->device, mr, max_num_sg);
	if (err)
		goto err_free_mr;

	mr->max_pages = max_num_sg;
	err = mlx4_mr_enable(dev->dev, &mr->mmr);
	if (err)
		goto err_free_pl;

	mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key;
	mr->umem = NULL;

	return &mr->ibmr;

err_free_pl:
	mr->ibmr.device = pd->device;
	mlx4_free_priv_pages(mr);
err_free_mr:
	(void) mlx4_mr_free(dev->dev, &mr->mmr);
err_free:
	kfree(mr);
	return ERR_PTR(err);
}

static int mlx4_set_page(struct ib_mr *ibmr, u64 addr)
{
	struct mlx4_ib_mr *mr = to_mmr(ibmr);

	if (unlikely(mr->npages == mr->max_pages))
		return -ENOMEM;

	mr->pages[mr->npages++] = cpu_to_be64(addr | MLX4_MTT_FLAG_PRESENT);

	return 0;
}

int mlx4_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
		      unsigned int *sg_offset)
{
	struct mlx4_ib_mr *mr = to_mmr(ibmr);
	int rc;

	mr->npages = 0;

	ib_dma_sync_single_for_cpu(ibmr->device, mr->page_map,
				   mr->page_map_size, DMA_TO_DEVICE);

	rc = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, mlx4_set_page);

	ib_dma_sync_single_for_device(ibmr->device, mr->page_map,
				      mr->page_map_size, DMA_TO_DEVICE);

	return rc;
}
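
/*
 * Usage sketch (illustrative, not part of this file): a kernel ULP would
 * typically drive the two functions above through the generic verbs
 * layer, roughly as follows, assuming a hypothetical pd, DMA-mapped sg
 * list and nents:
 *
 *	struct ib_mr *mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, nents);
 *	int n = ib_map_mr_sg(mr, sg, nents, NULL, PAGE_SIZE);
 *
 * ib_map_mr_sg() collects n page addresses via mlx4_set_page(); the MR
 * is then registered with the HCA by posting an IB_WR_REG_MR work
 * request on a QP.
 */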