cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

memory.c (80791B)


      1// SPDX-License-Identifier: GPL-2.0
      2
      3/*
      4 * Copyright 2016-2021 HabanaLabs, Ltd.
      5 * All Rights Reserved.
      6 */
      7
      8#include <uapi/misc/habanalabs.h>
      9#include "habanalabs.h"
     10#include "../include/hw_ip/mmu/mmu_general.h"
     11
     12#include <linux/uaccess.h>
     13#include <linux/slab.h>
     14#include <linux/vmalloc.h>
     15#include <linux/pci-p2pdma.h>
     16
     17MODULE_IMPORT_NS(DMA_BUF);
     18
     19#define HL_MMU_DEBUG	0
     20
     21/* use small pages for supporting non-pow2 (32M/40M/48M) DRAM phys page sizes */
     22#define DRAM_POOL_PAGE_SIZE SZ_8M
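/*
 * For example, with an 8M pool page size a non-power-of-2 device page such
 * as 48M is simply built from six 8M pool chunks, which is why the pool
 * granularity is kept small here.
 */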
     23
     24static int allocate_timestamps_buffers(struct hl_fpriv *hpriv,
     25			struct hl_mem_in *args, u64 *handle);
     26
     27static int set_alloc_page_size(struct hl_device *hdev, struct hl_mem_in *args, u32 *page_size)
     28{
     29	struct asic_fixed_properties *prop = &hdev->asic_prop;
     30	u32 psize;
     31
      32	/*
      33	 * For ASICs that support a user-set allocation page size, honor the
      34	 * user's choice only if it is not 0 (0 means use the default page size).
      35	 */
     36	if (prop->supports_user_set_page_size && args->alloc.page_size) {
     37		psize = args->alloc.page_size;
     38
     39		if (!hdev->asic_funcs->is_valid_dram_page_size(psize)) {
     40			dev_err(hdev->dev, "user page size (%#x) is not valid\n", psize);
     41			return -EINVAL;
     42		}
     43	} else {
     44		psize = prop->device_mem_alloc_default_page_size;
     45	}
     46
     47	*page_size = psize;
     48
     49	return 0;
     50}
     51
     52/*
     53 * The va ranges in context object contain a list with the available chunks of
     54 * device virtual memory.
     55 * There is one range for host allocations and one for DRAM allocations.
     56 *
      57 * On initialization each range contains one chunk covering all of its
      58 * available virtual range, which is half of the total device virtual range.
     59 *
     60 * On each mapping of physical pages, a suitable virtual range chunk (with a
     61 * minimum size) is selected from the list. If the chunk size equals the
     62 * requested size, the chunk is returned. Otherwise, the chunk is split into
     63 * two chunks - one to return as result and a remainder to stay in the list.
     64 *
      65 * On each unmapping of a virtual address, the relevant virtual chunk is
      66 * returned to the list. If its edges match the edges of the adjacent
      67 * chunks (meaning a contiguous chunk can be created), the chunks are
      68 * merged.
     69 *
      70 * On finish, the list is checked to contain a single chunk covering the
      71 * relevant virtual range (which is half of the device total virtual range).
      72 * If not (meaning not all mappings were unmapped), a warning is printed.
     73 */
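
/*
 * Worked example (hypothetical addresses, 0x1000-byte page size): starting
 * from a single free chunk [0x100000, 0x1fffff], mapping 0x4000 bytes splits
 * it into the returned block [0x100000, 0x103fff] and a remainder chunk
 * [0x104000, 0x1fffff] that stays in the list. On unmap, [0x100000, 0x103fff]
 * is added back; since its end + 1 equals the start of the remainder chunk,
 * the two are merged into the original [0x100000, 0x1fffff] chunk.
 */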
     74
     75/*
     76 * alloc_device_memory() - allocate device memory.
     77 * @ctx: pointer to the context structure.
     78 * @args: host parameters containing the requested size.
     79 * @ret_handle: result handle.
     80 *
     81 * This function does the following:
     82 * - Allocate the requested size rounded up to 'dram_page_size' pages.
     83 * - Return unique handle for later map/unmap/free.
     84 */
     85static int alloc_device_memory(struct hl_ctx *ctx, struct hl_mem_in *args,
     86				u32 *ret_handle)
     87{
     88	struct hl_device *hdev = ctx->hdev;
     89	struct hl_vm *vm = &hdev->vm;
     90	struct hl_vm_phys_pg_pack *phys_pg_pack;
     91	u64 paddr = 0, total_size, num_pgs, i;
     92	u32 num_curr_pgs, page_size;
     93	bool contiguous;
     94	int handle, rc;
     95
     96	num_curr_pgs = 0;
     97
     98	rc = set_alloc_page_size(hdev, args, &page_size);
     99	if (rc)
    100		return rc;
    101
    102	num_pgs = DIV_ROUND_UP_ULL(args->alloc.mem_size, page_size);
    103	total_size = num_pgs * page_size;
    104
    105	if (!total_size) {
    106		dev_err(hdev->dev, "Cannot allocate 0 bytes\n");
    107		return -EINVAL;
    108	}
    109
    110	contiguous = args->flags & HL_MEM_CONTIGUOUS;
    111
    112	if (contiguous) {
    113		if (is_power_of_2(page_size))
    114			paddr = (uintptr_t) gen_pool_dma_alloc_align(vm->dram_pg_pool,
    115								     total_size, NULL, page_size);
    116		else
    117			paddr = gen_pool_alloc(vm->dram_pg_pool, total_size);
    118		if (!paddr) {
    119			dev_err(hdev->dev,
    120				"Cannot allocate %llu contiguous pages with total size of %llu\n",
    121				num_pgs, total_size);
    122			return -ENOMEM;
    123		}
    124	}
    125
    126	phys_pg_pack = kzalloc(sizeof(*phys_pg_pack), GFP_KERNEL);
    127	if (!phys_pg_pack) {
    128		rc = -ENOMEM;
    129		goto pages_pack_err;
    130	}
    131
    132	phys_pg_pack->vm_type = VM_TYPE_PHYS_PACK;
    133	phys_pg_pack->asid = ctx->asid;
    134	phys_pg_pack->npages = num_pgs;
    135	phys_pg_pack->page_size = page_size;
    136	phys_pg_pack->total_size = total_size;
    137	phys_pg_pack->flags = args->flags;
    138	phys_pg_pack->contiguous = contiguous;
    139
    140	phys_pg_pack->pages = kvmalloc_array(num_pgs, sizeof(u64), GFP_KERNEL);
    141	if (ZERO_OR_NULL_PTR(phys_pg_pack->pages)) {
    142		rc = -ENOMEM;
    143		goto pages_arr_err;
    144	}
    145
    146	if (phys_pg_pack->contiguous) {
    147		for (i = 0 ; i < num_pgs ; i++)
    148			phys_pg_pack->pages[i] = paddr + i * page_size;
    149	} else {
    150		for (i = 0 ; i < num_pgs ; i++) {
    151			if (is_power_of_2(page_size))
    152				phys_pg_pack->pages[i] =
    153					(uintptr_t)gen_pool_dma_alloc_align(vm->dram_pg_pool,
    154									    page_size, NULL,
    155									    page_size);
    156			else
    157				phys_pg_pack->pages[i] = gen_pool_alloc(vm->dram_pg_pool,
    158									page_size);
    159
    160			if (!phys_pg_pack->pages[i]) {
    161				dev_err(hdev->dev,
    162					"Cannot allocate device memory (out of memory)\n");
    163				rc = -ENOMEM;
    164				goto page_err;
    165			}
    166
    167			num_curr_pgs++;
    168		}
    169	}
    170
    171	spin_lock(&vm->idr_lock);
    172	handle = idr_alloc(&vm->phys_pg_pack_handles, phys_pg_pack, 1, 0,
    173				GFP_ATOMIC);
    174	spin_unlock(&vm->idr_lock);
    175
    176	if (handle < 0) {
    177		dev_err(hdev->dev, "Failed to get handle for page\n");
    178		rc = -EFAULT;
    179		goto idr_err;
    180	}
    181
    182	for (i = 0 ; i < num_pgs ; i++)
    183		kref_get(&vm->dram_pg_pool_refcount);
    184
    185	phys_pg_pack->handle = handle;
    186
    187	atomic64_add(phys_pg_pack->total_size, &ctx->dram_phys_mem);
    188	atomic64_add(phys_pg_pack->total_size, &hdev->dram_used_mem);
    189
    190	*ret_handle = handle;
    191
    192	return 0;
    193
    194idr_err:
    195page_err:
    196	if (!phys_pg_pack->contiguous)
    197		for (i = 0 ; i < num_curr_pgs ; i++)
    198			gen_pool_free(vm->dram_pg_pool, phys_pg_pack->pages[i],
    199					page_size);
    200
    201	kvfree(phys_pg_pack->pages);
    202pages_arr_err:
    203	kfree(phys_pg_pack);
    204pages_pack_err:
    205	if (contiguous)
    206		gen_pool_free(vm->dram_pg_pool, paddr, total_size);
    207
    208	return rc;
    209}
    210
    211/**
    212 * dma_map_host_va() - DMA mapping of the given host virtual address.
    213 * @hdev: habanalabs device structure.
    214 * @addr: the host virtual address of the memory area.
    215 * @size: the size of the memory area.
    216 * @p_userptr: pointer to result userptr structure.
    217 *
    218 * This function does the following:
    219 * - Allocate userptr structure.
    220 * - Pin the given host memory using the userptr structure.
    221 * - Perform DMA mapping to have the DMA addresses of the pages.
    222 */
    223static int dma_map_host_va(struct hl_device *hdev, u64 addr, u64 size,
    224				struct hl_userptr **p_userptr)
    225{
    226	struct hl_userptr *userptr;
    227	int rc;
    228
    229	userptr = kzalloc(sizeof(*userptr), GFP_KERNEL);
    230	if (!userptr) {
    231		rc = -ENOMEM;
    232		goto userptr_err;
    233	}
    234
    235	rc = hl_pin_host_memory(hdev, addr, size, userptr);
    236	if (rc) {
    237		dev_err(hdev->dev, "Failed to pin host memory\n");
    238		goto pin_err;
    239	}
    240
    241	userptr->dma_mapped = true;
    242	userptr->dir = DMA_BIDIRECTIONAL;
    243	userptr->vm_type = VM_TYPE_USERPTR;
    244
    245	*p_userptr = userptr;
    246
    247	rc = hdev->asic_funcs->asic_dma_map_sgtable(hdev, userptr->sgt, DMA_BIDIRECTIONAL);
    248	if (rc) {
    249		dev_err(hdev->dev, "failed to map sgt with DMA region\n");
    250		goto dma_map_err;
    251	}
    252
    253	return 0;
    254
    255dma_map_err:
    256	hl_unpin_host_memory(hdev, userptr);
    257pin_err:
    258	kfree(userptr);
    259userptr_err:
    260
    261	return rc;
    262}
    263
    264/**
    265 * dma_unmap_host_va() - DMA unmapping of the given host virtual address.
    266 * @hdev: habanalabs device structure.
    267 * @userptr: userptr to free.
    268 *
    269 * This function does the following:
    270 * - Unpins the physical pages.
    271 * - Frees the userptr structure.
    272 */
    273static void dma_unmap_host_va(struct hl_device *hdev,
    274				struct hl_userptr *userptr)
    275{
    276	hl_unpin_host_memory(hdev, userptr);
    277	kfree(userptr);
    278}
    279
    280/**
    281 * dram_pg_pool_do_release() - free DRAM pages pool
    282 * @ref: pointer to reference object.
    283 *
    284 * This function does the following:
    285 * - Frees the idr structure of physical pages handles.
    286 * - Frees the generic pool of DRAM physical pages.
    287 */
    288static void dram_pg_pool_do_release(struct kref *ref)
    289{
    290	struct hl_vm *vm = container_of(ref, struct hl_vm,
    291			dram_pg_pool_refcount);
    292
    293	/*
    294	 * free the idr here as only here we know for sure that there are no
    295	 * allocated physical pages and hence there are no handles in use
    296	 */
    297	idr_destroy(&vm->phys_pg_pack_handles);
    298	gen_pool_destroy(vm->dram_pg_pool);
    299}
    300
    301/**
    302 * free_phys_pg_pack() - free physical page pack.
    303 * @hdev: habanalabs device structure.
    304 * @phys_pg_pack: physical page pack to free.
    305 *
    306 * This function does the following:
    307 * - For DRAM memory only
    308 *   - iterate over the pack, scrub and free each physical block structure by
    309 *     returning it to the general pool.
    310 *     In case of error during scrubbing, initiate hard reset.
    311 *     Once hard reset is triggered, scrubbing is bypassed while freeing the
    312 *     memory continues.
    313 * - Free the hl_vm_phys_pg_pack structure.
    314 */
    315static int free_phys_pg_pack(struct hl_device *hdev,
    316				struct hl_vm_phys_pg_pack *phys_pg_pack)
    317{
    318	struct hl_vm *vm = &hdev->vm;
    319	u64 i;
    320	int rc = 0;
    321
    322	if (phys_pg_pack->created_from_userptr)
    323		goto end;
    324
    325	if (phys_pg_pack->contiguous) {
    326		if (hdev->memory_scrub && !hdev->disabled) {
    327			rc = hdev->asic_funcs->scrub_device_mem(hdev,
    328					phys_pg_pack->pages[0],
    329					phys_pg_pack->total_size);
    330			if (rc)
    331				dev_err(hdev->dev,
    332					"Failed to scrub contiguous device memory\n");
    333		}
    334
    335		gen_pool_free(vm->dram_pg_pool, phys_pg_pack->pages[0],
    336			phys_pg_pack->total_size);
    337
    338		for (i = 0; i < phys_pg_pack->npages ; i++)
    339			kref_put(&vm->dram_pg_pool_refcount,
    340				dram_pg_pool_do_release);
    341	} else {
    342		for (i = 0 ; i < phys_pg_pack->npages ; i++) {
    343			if (hdev->memory_scrub && !hdev->disabled && rc == 0) {
    344				rc = hdev->asic_funcs->scrub_device_mem(
    345						hdev,
    346						phys_pg_pack->pages[i],
    347						phys_pg_pack->page_size);
    348				if (rc)
    349					dev_err(hdev->dev,
    350						"Failed to scrub device memory\n");
    351			}
    352			gen_pool_free(vm->dram_pg_pool,
    353				phys_pg_pack->pages[i],
    354				phys_pg_pack->page_size);
    355			kref_put(&vm->dram_pg_pool_refcount,
    356				dram_pg_pool_do_release);
    357		}
    358	}
    359
    360	if (rc && !hdev->disabled)
    361		hl_device_reset(hdev, HL_DRV_RESET_HARD);
    362
    363end:
    364	kvfree(phys_pg_pack->pages);
    365	kfree(phys_pg_pack);
    366
    367	return rc;
    368}
    369
    370/**
    371 * free_device_memory() - free device memory.
    372 * @ctx: pointer to the context structure.
     373 * @args: host parameters containing the memory handle to free.
    374 *
    375 * This function does the following:
    376 * - Free the device memory related to the given handle.
    377 */
    378static int free_device_memory(struct hl_ctx *ctx, struct hl_mem_in *args)
    379{
    380	struct hl_device *hdev = ctx->hdev;
    381	struct hl_vm *vm = &hdev->vm;
    382	struct hl_vm_phys_pg_pack *phys_pg_pack;
    383	u32 handle = args->free.handle;
    384
    385	spin_lock(&vm->idr_lock);
    386	phys_pg_pack = idr_find(&vm->phys_pg_pack_handles, handle);
    387	if (phys_pg_pack) {
    388		if (atomic_read(&phys_pg_pack->mapping_cnt) > 0) {
    389			dev_err(hdev->dev, "handle %u is mapped, cannot free\n",
    390				handle);
    391			spin_unlock(&vm->idr_lock);
    392			return -EINVAL;
    393		}
    394
    395		if (phys_pg_pack->exporting_cnt) {
    396			dev_dbg(hdev->dev, "handle %u is exported, cannot free\n", handle);
    397			spin_unlock(&vm->idr_lock);
    398			return -EINVAL;
    399		}
    400
    401		/*
    402		 * must remove from idr before the freeing of the physical
    403		 * pages as the refcount of the pool is also the trigger of the
    404		 * idr destroy
    405		 */
    406		idr_remove(&vm->phys_pg_pack_handles, handle);
    407		spin_unlock(&vm->idr_lock);
    408
    409		atomic64_sub(phys_pg_pack->total_size, &ctx->dram_phys_mem);
    410		atomic64_sub(phys_pg_pack->total_size, &hdev->dram_used_mem);
    411
    412		return free_phys_pg_pack(hdev, phys_pg_pack);
    413	} else {
    414		spin_unlock(&vm->idr_lock);
    415		dev_err(hdev->dev,
    416			"free device memory failed, no match for handle %u\n",
    417			handle);
    418		return -EINVAL;
    419	}
    420
    421	return 0;
    422}
    423
    424/**
    425 * clear_va_list_locked() - free virtual addresses list.
    426 * @hdev: habanalabs device structure.
    427 * @va_list: list of virtual addresses to free.
    428 *
    429 * This function does the following:
    430 * - Iterate over the list and free each virtual addresses block.
    431 *
    432 * This function should be called only when va_list lock is taken.
    433 */
    434static void clear_va_list_locked(struct hl_device *hdev,
    435		struct list_head *va_list)
    436{
    437	struct hl_vm_va_block *va_block, *tmp;
    438
    439	list_for_each_entry_safe(va_block, tmp, va_list, node) {
    440		list_del(&va_block->node);
    441		kfree(va_block);
    442	}
    443}
    444
    445/**
    446 * print_va_list_locked() - print virtual addresses list.
    447 * @hdev: habanalabs device structure.
    448 * @va_list: list of virtual addresses to print.
    449 *
    450 * This function does the following:
    451 * - Iterate over the list and print each virtual addresses block.
    452 *
    453 * This function should be called only when va_list lock is taken.
    454 */
    455static void print_va_list_locked(struct hl_device *hdev,
    456		struct list_head *va_list)
    457{
    458#if HL_MMU_DEBUG
    459	struct hl_vm_va_block *va_block;
    460
    461	dev_dbg(hdev->dev, "print va list:\n");
    462
    463	list_for_each_entry(va_block, va_list, node)
    464		dev_dbg(hdev->dev,
    465			"va block, start: 0x%llx, end: 0x%llx, size: %llu\n",
    466			va_block->start, va_block->end, va_block->size);
    467#endif
    468}
    469
    470/**
    471 * merge_va_blocks_locked() - merge a virtual block if possible.
    472 * @hdev: pointer to the habanalabs device structure.
    473 * @va_list: pointer to the virtual addresses block list.
    474 * @va_block: virtual block to merge with adjacent blocks.
    475 *
    476 * This function does the following:
     477 * - Merge the given block with the adjacent blocks if their virtual ranges
     478 *   create a contiguous virtual range.
     479 *
     480 * This function should be called only when va_list lock is taken.
    481 */
    482static void merge_va_blocks_locked(struct hl_device *hdev,
    483		struct list_head *va_list, struct hl_vm_va_block *va_block)
    484{
    485	struct hl_vm_va_block *prev, *next;
    486
    487	prev = list_prev_entry(va_block, node);
    488	if (&prev->node != va_list && prev->end + 1 == va_block->start) {
    489		prev->end = va_block->end;
    490		prev->size = prev->end - prev->start;
    491		list_del(&va_block->node);
    492		kfree(va_block);
    493		va_block = prev;
    494	}
    495
    496	next = list_next_entry(va_block, node);
    497	if (&next->node != va_list && va_block->end + 1 == next->start) {
    498		next->start = va_block->start;
    499		next->size = next->end - next->start;
    500		list_del(&va_block->node);
    501		kfree(va_block);
    502	}
    503}
    504
    505/**
    506 * add_va_block_locked() - add a virtual block to the virtual addresses list.
    507 * @hdev: pointer to the habanalabs device structure.
    508 * @va_list: pointer to the virtual addresses block list.
    509 * @start: start virtual address.
    510 * @end: end virtual address.
    511 *
    512 * This function does the following:
    513 * - Add the given block to the virtual blocks list and merge with other blocks
    514 *   if a contiguous virtual block can be created.
    515 *
     516 * This function should be called only when va_list lock is taken.
    517 */
    518static int add_va_block_locked(struct hl_device *hdev,
    519		struct list_head *va_list, u64 start, u64 end)
    520{
    521	struct hl_vm_va_block *va_block, *res = NULL;
    522	u64 size = end - start + 1;
    523
    524	print_va_list_locked(hdev, va_list);
    525
    526	list_for_each_entry(va_block, va_list, node) {
    527		/* TODO: remove upon matureness */
    528		if (hl_mem_area_crosses_range(start, size, va_block->start,
    529				va_block->end)) {
    530			dev_err(hdev->dev,
    531				"block crossing ranges at start 0x%llx, end 0x%llx\n",
    532				va_block->start, va_block->end);
    533			return -EINVAL;
    534		}
    535
    536		if (va_block->end < start)
    537			res = va_block;
    538	}
    539
    540	va_block = kmalloc(sizeof(*va_block), GFP_KERNEL);
    541	if (!va_block)
    542		return -ENOMEM;
    543
    544	va_block->start = start;
    545	va_block->end = end;
    546	va_block->size = size;
    547
    548	if (!res)
    549		list_add(&va_block->node, va_list);
    550	else
    551		list_add(&va_block->node, &res->node);
    552
    553	merge_va_blocks_locked(hdev, va_list, va_block);
    554
    555	print_va_list_locked(hdev, va_list);
    556
    557	return 0;
    558}
    559
    560/**
    561 * add_va_block() - wrapper for add_va_block_locked.
    562 * @hdev: pointer to the habanalabs device structure.
    563 * @va_range: pointer to the virtual addresses range object.
    564 * @start: start virtual address.
    565 * @end: end virtual address.
    566 *
    567 * This function does the following:
    568 * - Takes the list lock and calls add_va_block_locked.
    569 */
    570static inline int add_va_block(struct hl_device *hdev,
    571		struct hl_va_range *va_range, u64 start, u64 end)
    572{
    573	int rc;
    574
    575	mutex_lock(&va_range->lock);
    576	rc = add_va_block_locked(hdev, &va_range->list, start, end);
    577	mutex_unlock(&va_range->lock);
    578
    579	return rc;
    580}
    581
    582/**
     583 * is_hint_crossing_range() - check if a hint address crosses the specified reserved range.
    584 * @range_type: virtual space range type.
    585 * @start_addr: start virtual address.
    586 * @size: block size.
    587 * @prop: asic properties structure to retrieve reserved ranges from.
    588 */
    589static inline bool is_hint_crossing_range(enum hl_va_range_type range_type,
    590		u64 start_addr, u32 size, struct asic_fixed_properties *prop) {
    591	bool range_cross;
    592
    593	if (range_type == HL_VA_RANGE_TYPE_DRAM)
    594		range_cross =
    595			hl_mem_area_crosses_range(start_addr, size,
    596			prop->hints_dram_reserved_va_range.start_addr,
    597			prop->hints_dram_reserved_va_range.end_addr);
    598	else if (range_type == HL_VA_RANGE_TYPE_HOST)
    599		range_cross =
    600			hl_mem_area_crosses_range(start_addr,	size,
    601			prop->hints_host_reserved_va_range.start_addr,
    602			prop->hints_host_reserved_va_range.end_addr);
    603	else
    604		range_cross =
    605			hl_mem_area_crosses_range(start_addr, size,
    606			prop->hints_host_hpage_reserved_va_range.start_addr,
    607			prop->hints_host_hpage_reserved_va_range.end_addr);
    608
    609	return range_cross;
    610}
    611
    612/**
    613 * get_va_block() - get a virtual block for the given size and alignment.
    614 *
    615 * @hdev: pointer to the habanalabs device structure.
    616 * @va_range: pointer to the virtual addresses range.
    617 * @size: requested block size.
    618 * @hint_addr: hint for requested address by the user.
    619 * @va_block_align: required alignment of the virtual block start address.
    620 * @range_type: va range type (host, dram)
    621 * @flags: additional memory flags, currently only uses HL_MEM_FORCE_HINT
    622 *
    623 * This function does the following:
    624 * - Iterate on the virtual block list to find a suitable virtual block for the
    625 *   given size, hint address and alignment.
    626 * - Reserve the requested block and update the list.
    627 * - Return the start address of the virtual block.
    628 */
    629static u64 get_va_block(struct hl_device *hdev,
    630				struct hl_va_range *va_range,
    631				u64 size, u64 hint_addr, u32 va_block_align,
    632				enum hl_va_range_type range_type,
    633				u32 flags)
    634{
    635	struct hl_vm_va_block *va_block, *new_va_block = NULL;
    636	struct asic_fixed_properties *prop = &hdev->asic_prop;
    637	u64 tmp_hint_addr, valid_start, valid_size, prev_start, prev_end,
    638		align_mask, reserved_valid_start = 0, reserved_valid_size = 0,
    639		dram_hint_mask = prop->dram_hints_align_mask;
    640	bool add_prev = false;
    641	bool is_align_pow_2  = is_power_of_2(va_range->page_size);
    642	bool is_hint_dram_addr = hl_is_dram_va(hdev, hint_addr);
    643	bool force_hint = flags & HL_MEM_FORCE_HINT;
    644
    645	if (is_align_pow_2)
    646		align_mask = ~((u64)va_block_align - 1);
    647	else
    648		/*
    649		 * with non-power-of-2 range we work only with page granularity
    650		 * and the start address is page aligned,
    651		 * so no need for alignment checking.
    652		 */
    653		size = DIV_ROUND_UP_ULL(size, va_range->page_size) *
    654							va_range->page_size;
    655
    656	tmp_hint_addr = hint_addr & ~dram_hint_mask;
    657
    658	/* Check if we need to ignore hint address */
    659	if ((is_align_pow_2 && (hint_addr & (va_block_align - 1))) ||
    660		(!is_align_pow_2 && is_hint_dram_addr &&
    661			do_div(tmp_hint_addr, va_range->page_size))) {
    662
    663		if (force_hint) {
    664			/* Hint must be respected, so here we just fail */
    665			dev_err(hdev->dev,
    666				"Hint address 0x%llx is not page aligned - cannot be respected\n",
    667				hint_addr);
    668			return 0;
    669		}
    670
    671		dev_dbg(hdev->dev,
    672			"Hint address 0x%llx will be ignored because it is not aligned\n",
    673			hint_addr);
    674		hint_addr = 0;
    675	}
    676
    677	mutex_lock(&va_range->lock);
    678
    679	print_va_list_locked(hdev, &va_range->list);
    680
    681	list_for_each_entry(va_block, &va_range->list, node) {
    682		/* Calc the first possible aligned addr */
    683		valid_start = va_block->start;
    684
    685		if (is_align_pow_2 && (valid_start & (va_block_align - 1))) {
    686			valid_start &= align_mask;
    687			valid_start += va_block_align;
    688			if (valid_start > va_block->end)
    689				continue;
    690		}
    691
    692		valid_size = va_block->end - valid_start + 1;
    693		if (valid_size < size)
    694			continue;
    695
    696		/*
     697		 * If the hint address is 0 and the hints_range_reservation
     698		 * property is enabled, avoid allocating va blocks from the
     699		 * range reserved for hint addresses.
    700		 */
    701		if (prop->hints_range_reservation && !hint_addr)
    702			if (is_hint_crossing_range(range_type, valid_start,
    703					size, prop))
    704				continue;
    705
    706		/* Pick the minimal length block which has the required size */
    707		if (!new_va_block || (valid_size < reserved_valid_size)) {
    708			new_va_block = va_block;
    709			reserved_valid_start = valid_start;
    710			reserved_valid_size = valid_size;
    711		}
    712
    713		if (hint_addr && hint_addr >= valid_start &&
    714					(hint_addr + size) <= va_block->end) {
    715			new_va_block = va_block;
    716			reserved_valid_start = hint_addr;
    717			reserved_valid_size = valid_size;
    718			break;
    719		}
    720	}
    721
    722	if (!new_va_block) {
    723		dev_err(hdev->dev, "no available va block for size %llu\n",
    724								size);
    725		goto out;
    726	}
    727
    728	if (force_hint && reserved_valid_start != hint_addr) {
     729		/* The hint address must be respected. If we are here, it means
     730		 * we could not respect it.
    731		 */
    732		dev_err(hdev->dev,
    733			"Hint address 0x%llx could not be respected\n",
    734			hint_addr);
    735		reserved_valid_start = 0;
    736		goto out;
    737	}
    738
    739	/*
    740	 * Check if there is some leftover range due to reserving the new
    741	 * va block, then return it to the main virtual addresses list.
    742	 */
    743	if (reserved_valid_start > new_va_block->start) {
    744		prev_start = new_va_block->start;
    745		prev_end = reserved_valid_start - 1;
    746
    747		new_va_block->start = reserved_valid_start;
    748		new_va_block->size = reserved_valid_size;
    749
    750		add_prev = true;
    751	}
    752
    753	if (new_va_block->size > size) {
    754		new_va_block->start += size;
    755		new_va_block->size = new_va_block->end - new_va_block->start + 1;
    756	} else {
    757		list_del(&new_va_block->node);
    758		kfree(new_va_block);
    759	}
    760
    761	if (add_prev)
    762		add_va_block_locked(hdev, &va_range->list, prev_start,
    763				prev_end);
    764
    765	print_va_list_locked(hdev, &va_range->list);
    766out:
    767	mutex_unlock(&va_range->lock);
    768
    769	return reserved_valid_start;
    770}
    771
    772/*
    773 * hl_reserve_va_block() - reserve a virtual block of a given size.
    774 * @hdev: pointer to the habanalabs device structure.
    775 * @ctx: current context
    776 * @type: virtual addresses range type.
    777 * @size: requested block size.
    778 * @alignment: required alignment in bytes of the virtual block start address,
    779 *             0 means no alignment.
    780 *
    781 * This function does the following:
    782 * - Iterate on the virtual block list to find a suitable virtual block for the
    783 *   given size and alignment.
    784 * - Reserve the requested block and update the list.
    785 * - Return the start address of the virtual block.
    786 */
    787u64 hl_reserve_va_block(struct hl_device *hdev, struct hl_ctx *ctx,
    788		enum hl_va_range_type type, u32 size, u32 alignment)
    789{
    790	return get_va_block(hdev, ctx->va_range[type], size, 0,
    791			max(alignment, ctx->va_range[type]->page_size),
    792			type, 0);
    793}
    794
    795/**
    796 * hl_get_va_range_type() - get va_range type for the given address and size.
    797 * @ctx: context to fetch va_range from.
    798 * @address: the start address of the area we want to validate.
    799 * @size: the size in bytes of the area we want to validate.
    800 * @type: returned va_range type.
    801 *
     802 * Return: 0 if the area is inside a valid range, -EINVAL otherwise.
    803 */
    804static int hl_get_va_range_type(struct hl_ctx *ctx, u64 address, u64 size,
    805			enum hl_va_range_type *type)
    806{
    807	int i;
    808
    809	for (i = 0 ; i < HL_VA_RANGE_TYPE_MAX; i++) {
    810		if (hl_mem_area_inside_range(address, size,
    811				ctx->va_range[i]->start_addr,
    812				ctx->va_range[i]->end_addr)) {
    813			*type = i;
    814			return 0;
    815		}
    816	}
    817
    818	return -EINVAL;
    819}
    820
    821/**
    822 * hl_unreserve_va_block() - wrapper for add_va_block to unreserve a va block.
    823 * @hdev: pointer to the habanalabs device structure
    824 * @ctx: pointer to the context structure.
    825 * @start_addr: start virtual address.
    826 * @size: number of bytes to unreserve.
    827 *
    828 * This function does the following:
    829 * - Takes the list lock and calls add_va_block_locked.
    830 */
    831int hl_unreserve_va_block(struct hl_device *hdev, struct hl_ctx *ctx,
    832		u64 start_addr, u64 size)
    833{
    834	enum hl_va_range_type type;
    835	int rc;
    836
    837	rc = hl_get_va_range_type(ctx, start_addr, size, &type);
    838	if (rc) {
    839		dev_err(hdev->dev,
    840			"cannot find va_range for va %#llx size %llu",
    841			start_addr, size);
    842		return rc;
    843	}
    844
    845	rc = add_va_block(hdev, ctx->va_range[type], start_addr,
    846						start_addr + size - 1);
    847	if (rc)
    848		dev_warn(hdev->dev,
    849			"add va block failed for vaddr: 0x%llx\n", start_addr);
    850
    851	return rc;
    852}
    853
    854/**
    855 * init_phys_pg_pack_from_userptr() - initialize physical page pack from host
    856 *                                    memory
    857 * @ctx: pointer to the context structure.
    858 * @userptr: userptr to initialize from.
    859 * @pphys_pg_pack: result pointer.
    860 * @force_regular_page: tell the function to ignore huge page optimization,
    861 *                      even if possible. Needed for cases where the device VA
    862 *                      is allocated before we know the composition of the
    863 *                      physical pages
    864 *
    865 * This function does the following:
    866 * - Pin the physical pages related to the given virtual block.
    867 * - Create a physical page pack from the physical pages related to the given
    868 *   virtual block.
    869 */
    870static int init_phys_pg_pack_from_userptr(struct hl_ctx *ctx,
    871				struct hl_userptr *userptr,
    872				struct hl_vm_phys_pg_pack **pphys_pg_pack,
    873				bool force_regular_page)
    874{
    875	u32 npages, page_size = PAGE_SIZE,
    876		huge_page_size = ctx->hdev->asic_prop.pmmu_huge.page_size;
    877	u32 pgs_in_huge_page = huge_page_size >> __ffs(page_size);
    878	struct hl_vm_phys_pg_pack *phys_pg_pack;
    879	bool first = true, is_huge_page_opt;
    880	u64 page_mask, total_npages;
    881	struct scatterlist *sg;
    882	dma_addr_t dma_addr;
    883	int rc, i, j;
    884
    885	phys_pg_pack = kzalloc(sizeof(*phys_pg_pack), GFP_KERNEL);
    886	if (!phys_pg_pack)
    887		return -ENOMEM;
    888
    889	phys_pg_pack->vm_type = userptr->vm_type;
    890	phys_pg_pack->created_from_userptr = true;
    891	phys_pg_pack->asid = ctx->asid;
    892	atomic_set(&phys_pg_pack->mapping_cnt, 1);
    893
    894	is_huge_page_opt = (force_regular_page ? false : true);
    895
     896	/* Only if all dma_addrs are aligned to 2MB and their
     897	 * sizes are at least 2MB can we use huge page mapping.
    898	 * We limit the 2MB optimization to this condition,
    899	 * since later on we acquire the related VA range as one
    900	 * consecutive block.
    901	 */
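	/*
	 * For example, assuming a 4KB host PAGE_SIZE and a 2MB pmmu huge page,
	 * pgs_in_huge_page is 512, so the optimization survives only if every
	 * SG entry spans a multiple of 512 pages and its dma_addr is 2MB
	 * aligned.
	 */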
    902	total_npages = 0;
    903	for_each_sgtable_dma_sg(userptr->sgt, sg, i) {
    904		npages = hl_get_sg_info(sg, &dma_addr);
    905
    906		total_npages += npages;
    907
    908		if ((npages % pgs_in_huge_page) ||
    909					(dma_addr & (huge_page_size - 1)))
    910			is_huge_page_opt = false;
    911	}
    912
    913	if (is_huge_page_opt) {
    914		page_size = huge_page_size;
    915		do_div(total_npages, pgs_in_huge_page);
    916	}
    917
    918	page_mask = ~(((u64) page_size) - 1);
    919
    920	phys_pg_pack->pages = kvmalloc_array(total_npages, sizeof(u64),
    921						GFP_KERNEL);
    922	if (ZERO_OR_NULL_PTR(phys_pg_pack->pages)) {
    923		rc = -ENOMEM;
    924		goto page_pack_arr_mem_err;
    925	}
    926
    927	phys_pg_pack->npages = total_npages;
    928	phys_pg_pack->page_size = page_size;
    929	phys_pg_pack->total_size = total_npages * page_size;
    930
    931	j = 0;
    932	for_each_sgtable_dma_sg(userptr->sgt, sg, i) {
    933		npages = hl_get_sg_info(sg, &dma_addr);
    934
    935		/* align down to physical page size and save the offset */
    936		if (first) {
    937			first = false;
    938			phys_pg_pack->offset = dma_addr & (page_size - 1);
    939			dma_addr &= page_mask;
    940		}
    941
    942		while (npages) {
    943			phys_pg_pack->pages[j++] = dma_addr;
    944			dma_addr += page_size;
    945
    946			if (is_huge_page_opt)
    947				npages -= pgs_in_huge_page;
    948			else
    949				npages--;
    950		}
    951	}
    952
    953	*pphys_pg_pack = phys_pg_pack;
    954
    955	return 0;
    956
    957page_pack_arr_mem_err:
    958	kfree(phys_pg_pack);
    959
    960	return rc;
    961}
    962
    963/**
     964 * map_phys_pg_pack() - maps the physical page pack.
    965 * @ctx: pointer to the context structure.
    966 * @vaddr: start address of the virtual area to map from.
    967 * @phys_pg_pack: the pack of physical pages to map to.
    968 *
    969 * This function does the following:
     970 * - Maps each chunk of virtual memory to the matching physical chunk.
     971 * - Rolls back already-mapped pages if a mapping fails.
     972 * - Returns 0 on success, error code otherwise.
    973 */
    974static int map_phys_pg_pack(struct hl_ctx *ctx, u64 vaddr,
    975				struct hl_vm_phys_pg_pack *phys_pg_pack)
    976{
    977	struct hl_device *hdev = ctx->hdev;
    978	u64 next_vaddr = vaddr, paddr, mapped_pg_cnt = 0, i;
    979	u32 page_size = phys_pg_pack->page_size;
    980	int rc = 0;
    981	bool is_host_addr;
    982
    983	for (i = 0 ; i < phys_pg_pack->npages ; i++) {
    984		paddr = phys_pg_pack->pages[i];
    985
    986		rc = hl_mmu_map_page(ctx, next_vaddr, paddr, page_size,
    987				(i + 1) == phys_pg_pack->npages);
    988		if (rc) {
    989			dev_err(hdev->dev,
     990				"map failed for handle %u, npages: %llu, mapped: %llu\n",
    991				phys_pg_pack->handle, phys_pg_pack->npages,
    992				mapped_pg_cnt);
    993			goto err;
    994		}
    995
    996		mapped_pg_cnt++;
    997		next_vaddr += page_size;
    998	}
    999
   1000	return 0;
   1001
   1002err:
   1003	is_host_addr = !hl_is_dram_va(hdev, vaddr);
   1004
   1005	next_vaddr = vaddr;
   1006	for (i = 0 ; i < mapped_pg_cnt ; i++) {
   1007		if (hl_mmu_unmap_page(ctx, next_vaddr, page_size,
   1008					(i + 1) == mapped_pg_cnt))
   1009			dev_warn_ratelimited(hdev->dev,
   1010				"failed to unmap handle %u, va: 0x%llx, pa: 0x%llx, page size: %u\n",
   1011					phys_pg_pack->handle, next_vaddr,
   1012					phys_pg_pack->pages[i], page_size);
   1013
   1014		next_vaddr += page_size;
   1015
   1016		/*
   1017		 * unmapping on Palladium can be really long, so avoid a CPU
   1018		 * soft lockup bug by sleeping a little between unmapping pages
   1019		 *
    1020		 * In addition, the number of host pages could be huge,
    1021		 * because the page size could be 4KB, so when unmapping host
    1022		 * pages sleep every 32K pages to avoid a soft lockup.
   1023		 */
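		/*
		 * (i & 0x7FFF) == 0 is true once every 0x8000 iterations, i.e.
		 * the sleep fires every 32768 unmapped host pages.
		 */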
   1024		if (hdev->pldm || (is_host_addr && (i & 0x7FFF) == 0))
   1025			usleep_range(50, 200);
   1026	}
   1027
   1028	return rc;
   1029}
   1030
   1031/**
   1032 * unmap_phys_pg_pack() - unmaps the physical page pack.
   1033 * @ctx: pointer to the context structure.
   1034 * @vaddr: start address of the virtual area to unmap.
   1035 * @phys_pg_pack: the pack of physical pages to unmap.
   1036 */
   1037static void unmap_phys_pg_pack(struct hl_ctx *ctx, u64 vaddr,
   1038				struct hl_vm_phys_pg_pack *phys_pg_pack)
   1039{
   1040	struct hl_device *hdev = ctx->hdev;
   1041	u64 next_vaddr, i;
   1042	bool is_host_addr;
   1043	u32 page_size;
   1044
   1045	is_host_addr = !hl_is_dram_va(hdev, vaddr);
   1046	page_size = phys_pg_pack->page_size;
   1047	next_vaddr = vaddr;
   1048
   1049	for (i = 0 ; i < phys_pg_pack->npages ; i++, next_vaddr += page_size) {
   1050		if (hl_mmu_unmap_page(ctx, next_vaddr, page_size,
   1051				       (i + 1) == phys_pg_pack->npages))
   1052			dev_warn_ratelimited(hdev->dev,
   1053			"unmap failed for vaddr: 0x%llx\n", next_vaddr);
   1054
   1055		/*
   1056		 * unmapping on Palladium can be really long, so avoid a CPU
   1057		 * soft lockup bug by sleeping a little between unmapping pages
   1058		 *
    1059		 * In addition, the number of host pages could be huge,
    1060		 * because the page size could be 4KB, so when unmapping host
    1061		 * pages sleep every 32K pages to avoid a soft lockup.
   1062		 */
   1063		if (hdev->pldm || (is_host_addr && (i & 0x7FFF) == 0))
   1064			usleep_range(50, 200);
   1065	}
   1066}
   1067
   1068static int get_paddr_from_handle(struct hl_ctx *ctx, struct hl_mem_in *args,
   1069					u64 *paddr)
   1070{
   1071	struct hl_device *hdev = ctx->hdev;
   1072	struct hl_vm *vm = &hdev->vm;
   1073	struct hl_vm_phys_pg_pack *phys_pg_pack;
   1074	u32 handle;
   1075
   1076	handle = lower_32_bits(args->map_device.handle);
   1077	spin_lock(&vm->idr_lock);
   1078	phys_pg_pack = idr_find(&vm->phys_pg_pack_handles, handle);
   1079	if (!phys_pg_pack) {
   1080		spin_unlock(&vm->idr_lock);
   1081		dev_err(hdev->dev, "no match for handle %u\n", handle);
   1082		return -EINVAL;
   1083	}
   1084
   1085	*paddr = phys_pg_pack->pages[0];
   1086
   1087	spin_unlock(&vm->idr_lock);
   1088
   1089	return 0;
   1090}
   1091
   1092/**
   1093 * map_device_va() - map the given memory.
   1094 * @ctx: pointer to the context structure.
   1095 * @args: host parameters with handle/host virtual address.
   1096 * @device_addr: pointer to result device virtual address.
   1097 *
   1098 * This function does the following:
   1099 * - If given a physical device memory handle, map to a device virtual block
   1100 *   and return the start address of this block.
   1101 * - If given a host virtual address and size, find the related physical pages,
    1102 *   map a device virtual block to these pages and return the start address of
   1103 *   this block.
   1104 */
   1105static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args, u64 *device_addr)
   1106{
   1107	struct hl_vm_phys_pg_pack *phys_pg_pack;
   1108	enum hl_va_range_type va_range_type = 0;
   1109	struct hl_device *hdev = ctx->hdev;
   1110	struct hl_userptr *userptr = NULL;
   1111	u32 handle = 0, va_block_align;
   1112	struct hl_vm_hash_node *hnode;
   1113	struct hl_vm *vm = &hdev->vm;
   1114	struct hl_va_range *va_range;
   1115	bool is_userptr, do_prefetch;
   1116	u64 ret_vaddr, hint_addr;
   1117	enum vm_type *vm_type;
   1118	int rc;
   1119
   1120	/* set map flags */
   1121	is_userptr = args->flags & HL_MEM_USERPTR;
   1122	do_prefetch = hdev->supports_mmu_prefetch && (args->flags & HL_MEM_PREFETCH);
   1123
   1124	/* Assume failure */
   1125	*device_addr = 0;
   1126
   1127	if (is_userptr) {
   1128		u64 addr = args->map_host.host_virt_addr,
   1129			size = args->map_host.mem_size;
   1130		u32 page_size = hdev->asic_prop.pmmu.page_size,
   1131			huge_page_size = hdev->asic_prop.pmmu_huge.page_size;
   1132
   1133		rc = dma_map_host_va(hdev, addr, size, &userptr);
   1134		if (rc) {
   1135			dev_err(hdev->dev, "failed to get userptr from va\n");
   1136			return rc;
   1137		}
   1138
   1139		rc = init_phys_pg_pack_from_userptr(ctx, userptr,
   1140				&phys_pg_pack, false);
   1141		if (rc) {
   1142			dev_err(hdev->dev,
   1143				"unable to init page pack for vaddr 0x%llx\n",
   1144				addr);
   1145			goto init_page_pack_err;
   1146		}
   1147
   1148		vm_type = (enum vm_type *) userptr;
   1149		hint_addr = args->map_host.hint_addr;
   1150		handle = phys_pg_pack->handle;
   1151
   1152		/* get required alignment */
   1153		if (phys_pg_pack->page_size == page_size) {
   1154			va_range = ctx->va_range[HL_VA_RANGE_TYPE_HOST];
   1155			va_range_type = HL_VA_RANGE_TYPE_HOST;
   1156			/*
   1157			 * huge page alignment may be needed in case of regular
   1158			 * page mapping, depending on the host VA alignment
   1159			 */
   1160			if (addr & (huge_page_size - 1))
   1161				va_block_align = page_size;
   1162			else
   1163				va_block_align = huge_page_size;
   1164		} else {
   1165			/*
   1166			 * huge page alignment is needed in case of huge page
   1167			 * mapping
   1168			 */
   1169			va_range = ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE];
   1170			va_range_type = HL_VA_RANGE_TYPE_HOST_HUGE;
   1171			va_block_align = huge_page_size;
   1172		}
   1173	} else {
   1174		handle = lower_32_bits(args->map_device.handle);
   1175
   1176		spin_lock(&vm->idr_lock);
   1177		phys_pg_pack = idr_find(&vm->phys_pg_pack_handles, handle);
   1178		if (!phys_pg_pack) {
   1179			spin_unlock(&vm->idr_lock);
   1180			dev_err(hdev->dev,
   1181				"no match for handle %u\n", handle);
   1182			return -EINVAL;
   1183		}
   1184
   1185		/* increment now to avoid freeing device memory while mapping */
   1186		atomic_inc(&phys_pg_pack->mapping_cnt);
   1187
   1188		spin_unlock(&vm->idr_lock);
   1189
   1190		vm_type = (enum vm_type *) phys_pg_pack;
   1191
   1192		hint_addr = args->map_device.hint_addr;
   1193
   1194		/* DRAM VA alignment is the same as the MMU page size */
   1195		va_range = ctx->va_range[HL_VA_RANGE_TYPE_DRAM];
   1196		va_range_type = HL_VA_RANGE_TYPE_DRAM;
   1197		va_block_align = hdev->asic_prop.dmmu.page_size;
   1198	}
   1199
   1200	/*
   1201	 * relevant for mapping device physical memory only, as host memory is
   1202	 * implicitly shared
   1203	 */
   1204	if (!is_userptr && !(phys_pg_pack->flags & HL_MEM_SHARED) &&
   1205			phys_pg_pack->asid != ctx->asid) {
   1206		dev_err(hdev->dev,
   1207			"Failed to map memory, handle %u is not shared\n",
   1208			handle);
   1209		rc = -EPERM;
   1210		goto shared_err;
   1211	}
   1212
   1213	hnode = kzalloc(sizeof(*hnode), GFP_KERNEL);
   1214	if (!hnode) {
   1215		rc = -ENOMEM;
   1216		goto hnode_err;
   1217	}
   1218
   1219	if (hint_addr && phys_pg_pack->offset) {
   1220		if (args->flags & HL_MEM_FORCE_HINT) {
   1221			/* Fail if hint must be respected but it can't be */
   1222			dev_err(hdev->dev,
   1223				"Hint address 0x%llx cannot be respected because source memory is not aligned 0x%x\n",
   1224				hint_addr, phys_pg_pack->offset);
   1225			rc = -EINVAL;
   1226			goto va_block_err;
   1227		}
   1228		dev_dbg(hdev->dev,
   1229			"Hint address 0x%llx will be ignored because source memory is not aligned 0x%x\n",
   1230			hint_addr, phys_pg_pack->offset);
   1231	}
   1232
   1233	ret_vaddr = get_va_block(hdev, va_range, phys_pg_pack->total_size,
   1234					hint_addr, va_block_align,
   1235					va_range_type, args->flags);
   1236	if (!ret_vaddr) {
   1237		dev_err(hdev->dev, "no available va block for handle %u\n",
   1238				handle);
   1239		rc = -ENOMEM;
   1240		goto va_block_err;
   1241	}
   1242
   1243	mutex_lock(&ctx->mmu_lock);
   1244
   1245	rc = map_phys_pg_pack(ctx, ret_vaddr, phys_pg_pack);
   1246	if (rc) {
   1247		dev_err(hdev->dev, "mapping page pack failed for handle %u\n", handle);
   1248		goto map_err;
   1249	}
   1250
   1251	rc = hl_mmu_invalidate_cache_range(hdev, false, *vm_type | MMU_OP_SKIP_LOW_CACHE_INV,
   1252				ctx->asid, ret_vaddr, phys_pg_pack->total_size);
   1253	if (rc)
   1254		goto map_err;
   1255
   1256	mutex_unlock(&ctx->mmu_lock);
   1257
   1258	/*
    1259	 * Prefetch is done upon the user's request. It is performed in a WQ and so can
    1260	 * be outside the MMU lock. The operation itself is already protected by the MMU lock.
   1261	 */
   1262	if (do_prefetch) {
   1263		rc = hl_mmu_prefetch_cache_range(ctx, *vm_type, ctx->asid, ret_vaddr,
   1264							phys_pg_pack->total_size);
   1265		if (rc)
   1266			goto map_err;
   1267	}
   1268
   1269	ret_vaddr += phys_pg_pack->offset;
   1270
   1271	hnode->ptr = vm_type;
   1272	hnode->vaddr = ret_vaddr;
   1273
   1274	mutex_lock(&ctx->mem_hash_lock);
   1275	hash_add(ctx->mem_hash, &hnode->node, ret_vaddr);
   1276	mutex_unlock(&ctx->mem_hash_lock);
   1277
   1278	*device_addr = ret_vaddr;
   1279
   1280	if (is_userptr)
   1281		rc = free_phys_pg_pack(hdev, phys_pg_pack);
   1282
   1283	return rc;
   1284
   1285map_err:
   1286	mutex_unlock(&ctx->mmu_lock);
   1287
   1288	if (add_va_block(hdev, va_range, ret_vaddr,
   1289				ret_vaddr + phys_pg_pack->total_size - 1))
   1290		dev_warn(hdev->dev,
   1291			"release va block failed for handle 0x%x, vaddr: 0x%llx\n",
   1292				handle, ret_vaddr);
   1293
   1294va_block_err:
   1295	kfree(hnode);
   1296hnode_err:
   1297shared_err:
   1298	atomic_dec(&phys_pg_pack->mapping_cnt);
   1299	if (is_userptr)
   1300		free_phys_pg_pack(hdev, phys_pg_pack);
   1301init_page_pack_err:
   1302	if (is_userptr)
   1303		dma_unmap_host_va(hdev, userptr);
   1304
   1305	return rc;
   1306}
   1307
   1308/**
   1309 * unmap_device_va() - unmap the given device virtual address.
   1310 * @ctx: pointer to the context structure.
   1311 * @args: host parameters with device virtual address to unmap.
   1312 * @ctx_free: true if in context free flow, false otherwise.
   1313 *
   1314 * This function does the following:
   1315 * - unmap the physical pages related to the given virtual address.
   1316 * - return the device virtual block to the virtual block list.
   1317 */
   1318static int unmap_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
   1319				bool ctx_free)
   1320{
   1321	struct hl_vm_phys_pg_pack *phys_pg_pack = NULL;
   1322	u64 vaddr = args->unmap.device_virt_addr;
   1323	struct hl_vm_hash_node *hnode = NULL;
   1324	struct asic_fixed_properties *prop;
   1325	struct hl_device *hdev = ctx->hdev;
   1326	struct hl_userptr *userptr = NULL;
   1327	struct hl_va_range *va_range;
   1328	enum vm_type *vm_type;
   1329	bool is_userptr;
   1330	int rc = 0;
   1331
   1332	prop = &hdev->asic_prop;
   1333
   1334	/* protect from double entrance */
   1335	mutex_lock(&ctx->mem_hash_lock);
   1336	hash_for_each_possible(ctx->mem_hash, hnode, node, (unsigned long)vaddr)
   1337		if (vaddr == hnode->vaddr)
   1338			break;
   1339
   1340	if (!hnode) {
   1341		mutex_unlock(&ctx->mem_hash_lock);
   1342		dev_err(hdev->dev,
   1343			"unmap failed, no mem hnode for vaddr 0x%llx\n",
   1344			vaddr);
   1345		return -EINVAL;
   1346	}
   1347
   1348	hash_del(&hnode->node);
   1349	mutex_unlock(&ctx->mem_hash_lock);
   1350
   1351	vm_type = hnode->ptr;
   1352
   1353	if (*vm_type == VM_TYPE_USERPTR) {
   1354		is_userptr = true;
   1355		userptr = hnode->ptr;
   1356
   1357		rc = init_phys_pg_pack_from_userptr(ctx, userptr, &phys_pg_pack,
   1358							false);
   1359		if (rc) {
   1360			dev_err(hdev->dev,
   1361				"unable to init page pack for vaddr 0x%llx\n",
   1362				vaddr);
   1363			goto vm_type_err;
   1364		}
   1365
   1366		if (phys_pg_pack->page_size ==
   1367					hdev->asic_prop.pmmu.page_size)
   1368			va_range = ctx->va_range[HL_VA_RANGE_TYPE_HOST];
   1369		else
   1370			va_range = ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE];
   1371	} else if (*vm_type == VM_TYPE_PHYS_PACK) {
   1372		is_userptr = false;
   1373		va_range = ctx->va_range[HL_VA_RANGE_TYPE_DRAM];
   1374		phys_pg_pack = hnode->ptr;
   1375	} else {
   1376		dev_warn(hdev->dev,
   1377			"unmap failed, unknown vm desc for vaddr 0x%llx\n",
   1378				vaddr);
   1379		rc = -EFAULT;
   1380		goto vm_type_err;
   1381	}
   1382
   1383	if (atomic_read(&phys_pg_pack->mapping_cnt) == 0) {
   1384		dev_err(hdev->dev, "vaddr 0x%llx is not mapped\n", vaddr);
   1385		rc = -EINVAL;
   1386		goto mapping_cnt_err;
   1387	}
   1388
   1389	if (!is_userptr && !is_power_of_2(phys_pg_pack->page_size))
   1390		vaddr = prop->dram_base_address +
   1391			DIV_ROUND_DOWN_ULL(vaddr - prop->dram_base_address,
   1392						phys_pg_pack->page_size) *
   1393							phys_pg_pack->page_size;
   1394	else
   1395		vaddr &= ~(((u64) phys_pg_pack->page_size) - 1);
   1396
   1397	mutex_lock(&ctx->mmu_lock);
   1398
   1399	unmap_phys_pg_pack(ctx, vaddr, phys_pg_pack);
   1400
   1401	/*
   1402	 * During context free this function is called in a loop to clean all
   1403	 * the context mappings. Hence the cache invalidation can be called once
   1404	 * at the loop end rather than for each iteration
   1405	 */
   1406	if (!ctx_free)
   1407		rc = hl_mmu_invalidate_cache_range(hdev, true, *vm_type, ctx->asid, vaddr,
   1408							phys_pg_pack->total_size);
   1409
   1410	mutex_unlock(&ctx->mmu_lock);
   1411
   1412	/*
   1413	 * If the context is closing we don't need to check for the MMU cache
   1414	 * invalidation return code and update the VA free list as in this flow
   1415	 * we invalidate the MMU cache outside of this unmap function and the VA
   1416	 * free list will be freed anyway.
   1417	 */
   1418	if (!ctx_free) {
   1419		int tmp_rc;
   1420
   1421		tmp_rc = add_va_block(hdev, va_range, vaddr,
   1422					vaddr + phys_pg_pack->total_size - 1);
   1423		if (tmp_rc) {
   1424			dev_warn(hdev->dev,
   1425					"add va block failed for vaddr: 0x%llx\n",
   1426					vaddr);
   1427			if (!rc)
   1428				rc = tmp_rc;
   1429		}
   1430	}
   1431
   1432	atomic_dec(&phys_pg_pack->mapping_cnt);
   1433	kfree(hnode);
   1434
   1435	if (is_userptr) {
   1436		free_phys_pg_pack(hdev, phys_pg_pack);
   1437		dma_unmap_host_va(hdev, userptr);
   1438	}
   1439
   1440	return rc;
   1441
   1442mapping_cnt_err:
   1443	if (is_userptr)
   1444		free_phys_pg_pack(hdev, phys_pg_pack);
   1445vm_type_err:
   1446	mutex_lock(&ctx->mem_hash_lock);
   1447	hash_add(ctx->mem_hash, &hnode->node, vaddr);
   1448	mutex_unlock(&ctx->mem_hash_lock);
   1449
   1450	return rc;
   1451}
   1452
   1453static int map_block(struct hl_device *hdev, u64 address, u64 *handle,
   1454			u32 *size)
   1455{
   1456	u32 block_id = 0;
   1457	int rc;
   1458
   1459	rc = hdev->asic_funcs->get_hw_block_id(hdev, address, size, &block_id);
   1460
   1461	*handle = block_id | HL_MMAP_TYPE_BLOCK;
   1462	*handle <<= PAGE_SHIFT;
   1463
   1464	return rc;
   1465}
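
/*
 * The handle returned by map_block() above is shifted left by PAGE_SHIFT so
 * that user space can pass it back as an mmap() offset, which the kernel
 * expresses in whole pages; HL_MMAP_TYPE_BLOCK tags it as a HW block mapping,
 * and the block id is later recovered from vma->vm_pgoff (see
 * hl_hw_block_mmap() below).
 */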
   1466
   1467static void hw_block_vm_close(struct vm_area_struct *vma)
   1468{
   1469	struct hl_vm_hw_block_list_node *lnode =
   1470		(struct hl_vm_hw_block_list_node *) vma->vm_private_data;
   1471	struct hl_ctx *ctx = lnode->ctx;
   1472
   1473	mutex_lock(&ctx->hw_block_list_lock);
   1474	list_del(&lnode->node);
   1475	mutex_unlock(&ctx->hw_block_list_lock);
   1476	hl_ctx_put(ctx);
   1477	kfree(lnode);
   1478	vma->vm_private_data = NULL;
   1479}
   1480
   1481static const struct vm_operations_struct hw_block_vm_ops = {
   1482	.close = hw_block_vm_close
   1483};
   1484
   1485/**
   1486 * hl_hw_block_mmap() - mmap a hw block to user.
   1487 * @hpriv: pointer to the private data of the fd
   1488 * @vma: pointer to vm_area_struct of the process
   1489 *
   1490 * Driver increments context reference for every HW block mapped in order
   1491 * to prevent user from closing FD without unmapping first
   1492 */
   1493int hl_hw_block_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma)
   1494{
   1495	struct hl_vm_hw_block_list_node *lnode;
   1496	struct hl_device *hdev = hpriv->hdev;
   1497	struct hl_ctx *ctx = hpriv->ctx;
   1498	u32 block_id, block_size;
   1499	int rc;
   1500
   1501	/* We use the page offset to hold the block id and thus we need to clear
   1502	 * it before doing the mmap itself
   1503	 */
   1504	block_id = vma->vm_pgoff;
   1505	vma->vm_pgoff = 0;
   1506
   1507	/* Driver only allows mapping of a complete HW block */
   1508	block_size = vma->vm_end - vma->vm_start;
   1509
   1510	if (!access_ok((void __user *) (uintptr_t) vma->vm_start, block_size)) {
   1511		dev_err(hdev->dev,
   1512			"user pointer is invalid - 0x%lx\n",
   1513			vma->vm_start);
   1514
   1515		return -EINVAL;
   1516	}
   1517
   1518	lnode = kzalloc(sizeof(*lnode), GFP_KERNEL);
   1519	if (!lnode)
   1520		return -ENOMEM;
   1521
   1522	vma->vm_ops = &hw_block_vm_ops;
   1523	vma->vm_private_data = lnode;
   1524
   1525	hl_ctx_get(ctx);
   1526
   1527	rc = hdev->asic_funcs->hw_block_mmap(hdev, vma, block_id, block_size);
   1528	if (rc) {
   1529		hl_ctx_put(ctx);
   1530		kfree(lnode);
   1531		return rc;
   1532	}
   1533
   1534	lnode->ctx = ctx;
   1535	lnode->vaddr = vma->vm_start;
   1536	lnode->size = block_size;
   1537	lnode->id = block_id;
   1538
   1539	mutex_lock(&ctx->hw_block_list_lock);
   1540	list_add_tail(&lnode->node, &ctx->hw_block_mem_list);
   1541	mutex_unlock(&ctx->hw_block_list_lock);
   1542
   1543	vma->vm_pgoff = block_id;
   1544
   1545	return 0;
   1546}
   1547
   1548static int set_dma_sg(struct scatterlist *sg, u64 bar_address, u64 chunk_size,
   1549			struct device *dev, enum dma_data_direction dir)
   1550{
   1551	dma_addr_t addr;
   1552	int rc;
   1553
   1554	addr = dma_map_resource(dev, bar_address, chunk_size, dir,
   1555				DMA_ATTR_SKIP_CPU_SYNC);
   1556	rc = dma_mapping_error(dev, addr);
   1557	if (rc)
   1558		return rc;
   1559
   1560	sg_set_page(sg, NULL, chunk_size, 0);
   1561	sg_dma_address(sg) = addr;
   1562	sg_dma_len(sg) = chunk_size;
   1563
   1564	return 0;
   1565}
   1566
   1567static struct sg_table *alloc_sgt_from_device_pages(struct hl_device *hdev, u64 *pages, u64 npages,
   1568						u64 page_size, struct device *dev,
   1569						enum dma_data_direction dir)
   1570{
   1571	u64 chunk_size, bar_address, dma_max_seg_size;
   1572	struct asic_fixed_properties *prop;
   1573	int rc, i, j, nents, cur_page;
   1574	struct scatterlist *sg;
   1575	struct sg_table *sgt;
   1576
   1577	prop = &hdev->asic_prop;
   1578
   1579	dma_max_seg_size = dma_get_max_seg_size(dev);
   1580
   1581	/* We would like to align the max segment size to PAGE_SIZE, so the
   1582	 * SGL will contain aligned addresses that can be easily mapped to
   1583	 * an MMU
   1584	 */
   1585	dma_max_seg_size = ALIGN_DOWN(dma_max_seg_size, PAGE_SIZE);
   1586	if (dma_max_seg_size < PAGE_SIZE) {
   1587		dev_err_ratelimited(hdev->dev,
   1588				"dma_max_seg_size %llu can't be smaller than PAGE_SIZE\n",
   1589				dma_max_seg_size);
   1590		return ERR_PTR(-EINVAL);
   1591	}
   1592
   1593	sgt = kzalloc(sizeof(*sgt), GFP_KERNEL);
   1594	if (!sgt)
   1595		return ERR_PTR(-ENOMEM);
   1596
   1597	/* If the size of each page is larger than the dma max segment size,
   1598	 * then we can't combine pages and the number of entries in the SGL
   1599	 * will just be the
   1600	 * <number of pages> * <chunks of max segment size in each page>
   1601	 */
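	/*
	 * For example (hypothetical sizes): with a 32MB device page size and a
	 * 4MB dma_max_seg_size, each page contributes
	 * DIV_ROUND_UP_ULL(32MB, 4MB) = 8 entries, so nents = npages * 8.
	 */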
   1602	if (page_size > dma_max_seg_size)
   1603		nents = npages * DIV_ROUND_UP_ULL(page_size, dma_max_seg_size);
   1604	else
   1605		/* Get number of non-contiguous chunks */
   1606		for (i = 1, nents = 1, chunk_size = page_size ; i < npages ; i++) {
   1607			if (pages[i - 1] + page_size != pages[i] ||
   1608					chunk_size + page_size > dma_max_seg_size) {
   1609				nents++;
   1610				chunk_size = page_size;
   1611				continue;
   1612			}
   1613
   1614			chunk_size += page_size;
   1615		}
   1616
   1617	rc = sg_alloc_table(sgt, nents, GFP_KERNEL | __GFP_ZERO);
   1618	if (rc)
   1619		goto error_free;
   1620
   1621	cur_page = 0;
   1622
   1623	if (page_size > dma_max_seg_size) {
   1624		u64 size_left, cur_device_address = 0;
   1625
   1626		size_left = page_size;
   1627
   1628		/* Need to split each page into the number of chunks of
   1629		 * dma_max_seg_size
   1630		 */
   1631		for_each_sgtable_dma_sg(sgt, sg, i) {
   1632			if (size_left == page_size)
   1633				cur_device_address =
   1634					pages[cur_page] - prop->dram_base_address;
   1635			else
   1636				cur_device_address += dma_max_seg_size;
   1637
   1638			chunk_size = min(size_left, dma_max_seg_size);
   1639
   1640			bar_address = hdev->dram_pci_bar_start + cur_device_address;
   1641
   1642			rc = set_dma_sg(sg, bar_address, chunk_size, dev, dir);
   1643			if (rc)
   1644				goto error_unmap;
   1645
   1646			if (size_left > dma_max_seg_size) {
   1647				size_left -= dma_max_seg_size;
   1648			} else {
   1649				cur_page++;
   1650				size_left = page_size;
   1651			}
   1652		}
   1653	} else {
   1654		/* Merge pages and put them into the scatterlist */
   1655		for_each_sgtable_dma_sg(sgt, sg, i) {
   1656			chunk_size = page_size;
   1657			for (j = cur_page + 1 ; j < npages ; j++) {
   1658				if (pages[j - 1] + page_size != pages[j] ||
   1659						chunk_size + page_size > dma_max_seg_size)
   1660					break;
   1661
   1662				chunk_size += page_size;
   1663			}
   1664
   1665			bar_address = hdev->dram_pci_bar_start +
   1666					(pages[cur_page] - prop->dram_base_address);
   1667
   1668			rc = set_dma_sg(sg, bar_address, chunk_size, dev, dir);
   1669			if (rc)
   1670				goto error_unmap;
   1671
   1672			cur_page = j;
   1673		}
   1674	}
   1675
   1676	/* Because we are not going to include a CPU list, set orig_nents to 0
   1677	 * to give other users a chance to detect this, so that they use only
   1678	 * nents (the length of the DMA list) when going over the sgl instead
   1679	 * of the non-existent CPU entries.
   1680	 */
   1681	sgt->orig_nents = 0;
   1682
   1683	return sgt;
   1684
   1685error_unmap:
   1686	for_each_sgtable_dma_sg(sgt, sg, i) {
   1687		if (!sg_dma_len(sg))
   1688			continue;
   1689
   1690		dma_unmap_resource(dev, sg_dma_address(sg),
   1691					sg_dma_len(sg), dir,
   1692					DMA_ATTR_SKIP_CPU_SYNC);
   1693	}
   1694
   1695	sg_free_table(sgt);
   1696
   1697error_free:
   1698	kfree(sgt);
   1699	return ERR_PTR(rc);
   1700}
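
/*
 * Illustrative sketch (not part of the driver flow): count how many SG
 * entries the merge loop in alloc_sgt_from_device_pages() produces for a
 * given device page list, assuming page_size <= dma_max_seg_size. The
 * function name is made up for illustration only. For example, four
 * physically contiguous 2MB pages with a 4MB max segment size yield two
 * entries.
 */
static int __maybe_unused hl_sketch_count_merged_nents(const u64 *pages,
				u64 npages, u64 page_size, u64 dma_max_seg_size)
{
	u64 chunk_size = page_size;
	int nents = 1;
	u64 i;

	if (!npages)
		return 0;

	for (i = 1 ; i < npages ; i++) {
		/* a physical gap or an oversized chunk starts a new entry */
		if (pages[i - 1] + page_size != pages[i] ||
				chunk_size + page_size > dma_max_seg_size) {
			nents++;
			chunk_size = page_size;
			continue;
		}

		chunk_size += page_size;
	}

	return nents;
}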
   1701
   1702static int hl_dmabuf_attach(struct dma_buf *dmabuf,
   1703				struct dma_buf_attachment *attachment)
   1704{
   1705	struct hl_dmabuf_priv *hl_dmabuf;
   1706	struct hl_device *hdev;
   1707	int rc;
   1708
   1709	hl_dmabuf = dmabuf->priv;
   1710	hdev = hl_dmabuf->ctx->hdev;
   1711
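	/*
	 * Check whether the importer's device can reach our PCI device via
	 * peer-to-peer DMA; if it cannot, clear peer2peer so that mapping
	 * attempts are rejected in hl_map_dmabuf() below.
	 */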
   1712	rc = pci_p2pdma_distance_many(hdev->pdev, &attachment->dev, 1, true);
   1713
   1714	if (rc < 0)
   1715		attachment->peer2peer = false;
   1716	return 0;
   1717}
   1718
   1719static struct sg_table *hl_map_dmabuf(struct dma_buf_attachment *attachment,
   1720					enum dma_data_direction dir)
   1721{
   1722	struct dma_buf *dma_buf = attachment->dmabuf;
   1723	struct hl_vm_phys_pg_pack *phys_pg_pack;
   1724	struct hl_dmabuf_priv *hl_dmabuf;
   1725	struct hl_device *hdev;
   1726	struct sg_table *sgt;
   1727
   1728	hl_dmabuf = dma_buf->priv;
   1729	hdev = hl_dmabuf->ctx->hdev;
   1730	phys_pg_pack = hl_dmabuf->phys_pg_pack;
   1731
   1732	if (!attachment->peer2peer) {
   1733		dev_dbg(hdev->dev, "Failed to map dmabuf because p2p is disabled\n");
   1734		return ERR_PTR(-EPERM);
   1735	}
   1736
   1737	if (phys_pg_pack)
   1738		sgt = alloc_sgt_from_device_pages(hdev,
   1739						phys_pg_pack->pages,
   1740						phys_pg_pack->npages,
   1741						phys_pg_pack->page_size,
   1742						attachment->dev,
   1743						dir);
   1744	else
   1745		sgt = alloc_sgt_from_device_pages(hdev,
   1746						&hl_dmabuf->device_address,
   1747						1,
   1748						hl_dmabuf->dmabuf->size,
   1749						attachment->dev,
   1750						dir);
   1751
   1752	if (IS_ERR(sgt))
   1753		dev_err(hdev->dev, "failed (%ld) to initialize sgt for dmabuf\n", PTR_ERR(sgt));
   1754
   1755	return sgt;
   1756}
   1757
   1758static void hl_unmap_dmabuf(struct dma_buf_attachment *attachment,
   1759				  struct sg_table *sgt,
   1760				  enum dma_data_direction dir)
   1761{
   1762	struct scatterlist *sg;
   1763	int i;
   1764
   1765	/* The memory behind the dma-buf has *always* resided on the device itself, i.e. it lives
   1766	 * only in the 'device' domain (after all, it maps a PCI bar address which points to the
   1767	 * device memory).
   1768	 *
   1769	 * Therefore, it was never in the 'CPU' domain and hence, there is no need to perform
   1770	 * a sync of the memory to the CPU's cache, as it never resided inside that cache.
   1771	 */
   1772	for_each_sgtable_dma_sg(sgt, sg, i)
   1773		dma_unmap_resource(attachment->dev, sg_dma_address(sg),
   1774					sg_dma_len(sg), dir,
   1775					DMA_ATTR_SKIP_CPU_SYNC);
   1776
   1777	/* Need to restore orig_nents because sg_free_table uses that field */
   1778	sgt->orig_nents = sgt->nents;
   1779	sg_free_table(sgt);
   1780	kfree(sgt);
   1781}
   1782
   1783static void hl_release_dmabuf(struct dma_buf *dmabuf)
   1784{
   1785	struct hl_dmabuf_priv *hl_dmabuf = dmabuf->priv;
   1786	struct hl_ctx *ctx = hl_dmabuf->ctx;
   1787	struct hl_device *hdev = ctx->hdev;
   1788	struct hl_vm *vm = &hdev->vm;
   1789
   1790	if (hl_dmabuf->phys_pg_pack) {
   1791		spin_lock(&vm->idr_lock);
   1792		hl_dmabuf->phys_pg_pack->exporting_cnt--;
   1793		spin_unlock(&vm->idr_lock);
   1794	}
   1795
   1796	hl_ctx_put(hl_dmabuf->ctx);
   1797
   1798	kfree(hl_dmabuf);
   1799}
   1800
   1801static const struct dma_buf_ops habanalabs_dmabuf_ops = {
   1802	.attach = hl_dmabuf_attach,
   1803	.map_dma_buf = hl_map_dmabuf,
   1804	.unmap_dma_buf = hl_unmap_dmabuf,
   1805	.release = hl_release_dmabuf,
   1806};
   1807
   1808static int export_dmabuf_common(struct hl_ctx *ctx,
   1809				struct hl_dmabuf_priv *hl_dmabuf,
   1810				u64 total_size, int flags, int *dmabuf_fd)
   1811{
   1812	DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
   1813	struct hl_device *hdev = ctx->hdev;
   1814	int rc, fd;
   1815
   1816	exp_info.ops = &habanalabs_dmabuf_ops;
   1817	exp_info.size = total_size;
   1818	exp_info.flags = flags;
   1819	exp_info.priv = hl_dmabuf;
   1820
   1821	hl_dmabuf->dmabuf = dma_buf_export(&exp_info);
   1822	if (IS_ERR(hl_dmabuf->dmabuf)) {
   1823		dev_err(hdev->dev, "failed to export dma-buf\n");
   1824		return PTR_ERR(hl_dmabuf->dmabuf);
   1825	}
   1826
   1827	fd = dma_buf_fd(hl_dmabuf->dmabuf, flags);
   1828	if (fd < 0) {
   1829		dev_err(hdev->dev, "failed to get a file descriptor for a dma-buf\n");
   1830		rc = fd;
   1831		goto err_dma_buf_put;
   1832	}
   1833
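	/*
	 * Hold a context reference for as long as the dma-buf exists; it is
	 * dropped in hl_release_dmabuf() when the last reference to the
	 * dma-buf is put.
	 */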
   1834	hl_dmabuf->ctx = ctx;
   1835	hl_ctx_get(hl_dmabuf->ctx);
   1836
   1837	*dmabuf_fd = fd;
   1838
   1839	return 0;
   1840
   1841err_dma_buf_put:
   1842	dma_buf_put(hl_dmabuf->dmabuf);
   1843	return rc;
   1844}
   1845
   1846/**
   1847 * export_dmabuf_from_addr() - export a dma-buf object for the given memory
   1848 *                             address and size.
   1849 * @ctx: pointer to the context structure.
   1850 * @device_addr:  device memory physical address.
   1851 * @size: size of device memory.
   1852 * @flags: DMA-BUF file/FD flags.
   1853 * @dmabuf_fd: pointer to result FD that represents the dma-buf object.
   1854 *
   1855 * Create and export a dma-buf object for an existing memory allocation inside
   1856 * the device memory, and return a FD which is associated with the dma-buf
   1857 * object.
   1858 *
   1859 * Return: 0 on success, non-zero for failure.
   1860 */
   1861static int export_dmabuf_from_addr(struct hl_ctx *ctx, u64 device_addr,
   1862					u64 size, int flags, int *dmabuf_fd)
   1863{
   1864	struct hl_dmabuf_priv *hl_dmabuf;
   1865	struct hl_device *hdev = ctx->hdev;
   1866	struct asic_fixed_properties *prop;
   1867	u64 bar_address;
   1868	int rc;
   1869
   1870	prop = &hdev->asic_prop;
   1871
   1872	if (!IS_ALIGNED(device_addr, PAGE_SIZE)) {
   1873		dev_dbg(hdev->dev,
   1874			"exported device memory address 0x%llx should be aligned to 0x%lx\n",
   1875			device_addr, PAGE_SIZE);
   1876		return -EINVAL;
   1877	}
   1878
   1879	if (size < PAGE_SIZE) {
   1880		dev_dbg(hdev->dev,
   1881			"exported device memory size %llu should be equal to or greater than %lu\n",
   1882			size, PAGE_SIZE);
   1883		return -EINVAL;
   1884	}
   1885
   1886	if (device_addr < prop->dram_user_base_address ||
   1887				device_addr + size > prop->dram_end_address ||
   1888				device_addr + size < device_addr) {
   1889		dev_dbg(hdev->dev,
   1890			"DRAM memory range 0x%llx (+0x%llx) is outside of DRAM boundaries\n",
   1891			device_addr, size);
   1892		return -EINVAL;
   1893	}
   1894
   1895	bar_address = hdev->dram_pci_bar_start +
   1896			(device_addr - prop->dram_base_address);
   1897
   1898	if (bar_address + size >
   1899			hdev->dram_pci_bar_start + prop->dram_pci_bar_size ||
   1900			bar_address + size < bar_address) {
   1901		dev_dbg(hdev->dev,
   1902			"DRAM memory range 0x%llx (+0x%llx) is outside of PCI BAR boundaries\n",
   1903			device_addr, size);
   1904		return -EINVAL;
   1905	}
   1906
   1907	hl_dmabuf = kzalloc(sizeof(*hl_dmabuf), GFP_KERNEL);
   1908	if (!hl_dmabuf)
   1909		return -ENOMEM;
   1910
   1911	hl_dmabuf->device_address = device_addr;
   1912
   1913	rc = export_dmabuf_common(ctx, hl_dmabuf, size, flags, dmabuf_fd);
   1914	if (rc)
   1915		goto err_free_dmabuf_wrapper;
   1916
   1917	return 0;
   1918
   1919err_free_dmabuf_wrapper:
   1920	kfree(hl_dmabuf);
   1921	return rc;
   1922}
   1923
   1924/**
   1925 * export_dmabuf_from_handle() - export a dma-buf object for the given memory
   1926 *                               handle.
   1927 * @ctx: pointer to the context structure.
   1928 * @handle: device memory allocation handle.
   1929 * @flags: DMA-BUF file/FD flags.
   1930 * @dmabuf_fd: pointer to result FD that represents the dma-buf object.
   1931 *
   1932 * Create and export a dma-buf object for an existing memory allocation inside
   1933 * the device memory, and return a FD which is associated with the dma-buf
   1934 * object.
   1935 *
   1936 * Return: 0 on success, non-zero for failure.
   1937 */
   1938static int export_dmabuf_from_handle(struct hl_ctx *ctx, u64 handle, int flags,
   1939					int *dmabuf_fd)
   1940{
   1941	struct hl_vm_phys_pg_pack *phys_pg_pack;
   1942	struct hl_dmabuf_priv *hl_dmabuf;
   1943	struct hl_device *hdev = ctx->hdev;
   1944	struct asic_fixed_properties *prop;
   1945	struct hl_vm *vm = &hdev->vm;
   1946	u64 bar_address;
   1947	int rc, i;
   1948
   1949	prop = &hdev->asic_prop;
   1950
   1951	if (upper_32_bits(handle)) {
   1952		dev_dbg(hdev->dev, "no match for handle 0x%llx\n", handle);
   1953		return -EINVAL;
   1954	}
   1955
   1956	spin_lock(&vm->idr_lock);
   1957
   1958	phys_pg_pack = idr_find(&vm->phys_pg_pack_handles, (u32) handle);
   1959	if (!phys_pg_pack) {
   1960		spin_unlock(&vm->idr_lock);
   1961		dev_dbg(hdev->dev, "no match for handle 0x%x\n", (u32) handle);
   1962		return -EINVAL;
   1963	}
   1964
   1965	/* increment now to avoid freeing device memory while exporting */
   1966	phys_pg_pack->exporting_cnt++;
   1967
   1968	spin_unlock(&vm->idr_lock);
   1969
   1970	if (phys_pg_pack->vm_type != VM_TYPE_PHYS_PACK) {
   1971		dev_dbg(hdev->dev, "handle 0x%llx does not represent DRAM memory\n", handle);
   1972		rc = -EINVAL;
   1973		goto err_dec_exporting_cnt;
   1974	}
   1975
   1976	for (i = 0 ; i < phys_pg_pack->npages ; i++) {
   1977
   1978		bar_address = hdev->dram_pci_bar_start +
   1979						(phys_pg_pack->pages[i] -
   1980						prop->dram_base_address);
   1981
   1982		if (bar_address + phys_pg_pack->page_size >
   1983			hdev->dram_pci_bar_start + prop->dram_pci_bar_size ||
   1984			bar_address + phys_pg_pack->page_size < bar_address) {
   1985
   1986			dev_dbg(hdev->dev,
   1987				"DRAM memory range 0x%llx (+0x%x) is outside of PCI BAR boundaries\n",
   1988				phys_pg_pack->pages[i],
   1989				phys_pg_pack->page_size);
   1990
   1991			rc = -EINVAL;
   1992			goto err_dec_exporting_cnt;
   1993		}
   1994	}
   1995
   1996	hl_dmabuf = kzalloc(sizeof(*hl_dmabuf), GFP_KERNEL);
   1997	if (!hl_dmabuf) {
   1998		rc = -ENOMEM;
   1999		goto err_dec_exporting_cnt;
   2000	}
   2001
   2002	hl_dmabuf->phys_pg_pack = phys_pg_pack;
   2003
   2004	rc = export_dmabuf_common(ctx, hl_dmabuf, phys_pg_pack->total_size,
   2005				flags, dmabuf_fd);
   2006	if (rc)
   2007		goto err_free_dmabuf_wrapper;
   2008
   2009	return 0;
   2010
   2011err_free_dmabuf_wrapper:
   2012	kfree(hl_dmabuf);
   2013
   2014err_dec_exporting_cnt:
   2015	spin_lock(&vm->idr_lock);
   2016	phys_pg_pack->exporting_cnt--;
   2017	spin_unlock(&vm->idr_lock);
   2018
   2019	return rc;
   2020}
   2021
   2022static int mem_ioctl_no_mmu(struct hl_fpriv *hpriv, union hl_mem_args *args)
   2023{
   2024	struct hl_device *hdev = hpriv->hdev;
   2025	u64 block_handle, device_addr = 0;
   2026	struct hl_ctx *ctx = hpriv->ctx;
   2027	u32 handle = 0, block_size;
   2028	int rc;
   2029
   2030	switch (args->in.op) {
   2031	case HL_MEM_OP_ALLOC:
   2032		if (args->in.alloc.mem_size == 0) {
   2033			dev_err(hdev->dev, "alloc size must be larger than 0\n");
   2034			rc = -EINVAL;
   2035			goto out;
   2036		}
   2037
   2038		/* Force contiguous as there are no real MMU
   2039		 * translations to overcome physical memory gaps
   2040		 */
   2041		args->in.flags |= HL_MEM_CONTIGUOUS;
   2042		rc = alloc_device_memory(ctx, &args->in, &handle);
   2043
   2044		memset(args, 0, sizeof(*args));
   2045		args->out.handle = (__u64) handle;
   2046		break;
   2047
   2048	case HL_MEM_OP_FREE:
   2049		rc = free_device_memory(ctx, &args->in);
   2050		break;
   2051
   2052	case HL_MEM_OP_MAP:
   2053		if (args->in.flags & HL_MEM_USERPTR) {
   2054			dev_err(hdev->dev, "Failed to map host memory when MMU is disabled\n");
   2055			rc = -EPERM;
   2056		} else {
   2057			rc = get_paddr_from_handle(ctx, &args->in, &device_addr);
   2058			memset(args, 0, sizeof(*args));
   2059			args->out.device_virt_addr = device_addr;
   2060		}
   2061
   2062		break;
   2063
   2064	case HL_MEM_OP_UNMAP:
   2065		rc = 0;
   2066		break;
   2067
   2068	case HL_MEM_OP_MAP_BLOCK:
   2069		rc = map_block(hdev, args->in.map_block.block_addr, &block_handle, &block_size);
   2070		args->out.block_handle = block_handle;
   2071		args->out.block_size = block_size;
   2072		break;
   2073
   2074	case HL_MEM_OP_EXPORT_DMABUF_FD:
   2075		dev_err(hdev->dev, "Failed to export dma-buf object when MMU is disabled\n");
   2076		rc = -EPERM;
   2077		break;
   2078
   2079	case HL_MEM_OP_TS_ALLOC:
   2080		rc = allocate_timestamps_buffers(hpriv, &args->in, &args->out.handle);
   2081		break;
   2082	default:
   2083		dev_err(hdev->dev, "Unknown opcode for memory IOCTL\n");
   2084		rc = -EINVAL;
   2085		break;
   2086	}
   2087
   2088out:
   2089	return rc;
   2090}
   2091
   2092static void ts_buff_release(struct hl_mmap_mem_buf *buf)
   2093{
   2094	struct hl_ts_buff *ts_buff = buf->private;
   2095
   2096	vfree(ts_buff->kernel_buff_address);
   2097	vfree(ts_buff->user_buff_address);
   2098	kfree(ts_buff);
   2099}
   2100
   2101static int hl_ts_mmap(struct hl_mmap_mem_buf *buf, struct vm_area_struct *vma, void *args)
   2102{
   2103	struct hl_ts_buff *ts_buff = buf->private;
   2104
   2105	vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP | VM_DONTCOPY | VM_NORESERVE;
   2106	return remap_vmalloc_range(vma, ts_buff->user_buff_address, 0);
   2107}
   2108
   2109static int hl_ts_alloc_buf(struct hl_mmap_mem_buf *buf, gfp_t gfp, void *args)
   2110{
   2111	struct hl_ts_buff *ts_buff = NULL;
   2112	u32 size, num_elements;
   2113	void *p;
   2114
   2115	num_elements = *(u32 *)args;
   2116
   2117	ts_buff = kzalloc(sizeof(*ts_buff), GFP_KERNEL);
   2118	if (!ts_buff)
   2119		return -ENOMEM;
   2120
   2121	/* Allocate the user buffer */
   2122	size = num_elements * sizeof(u64);
   2123	p = vmalloc_user(size);
   2124	if (!p)
   2125		goto free_mem;
   2126
   2127	ts_buff->user_buff_address = p;
   2128	buf->mappable_size = size;
   2129
   2130	/* Allocate the internal kernel buffer */
   2131	size = num_elements * sizeof(struct hl_user_pending_interrupt);
   2132	p = vmalloc(size);
   2133	if (!p)
   2134		goto free_user_buff;
   2135
   2136	ts_buff->kernel_buff_address = p;
   2137	ts_buff->kernel_buff_size = size;
   2138
   2139	buf->private = ts_buff;
   2140
   2141	return 0;
   2142
   2143free_user_buff:
   2144	vfree(ts_buff->user_buff_address);
   2145free_mem:
   2146	kfree(ts_buff);
   2147	return -ENOMEM;
   2148}
   2149
   2150static struct hl_mmap_mem_buf_behavior hl_ts_behavior = {
   2151	.topic = "TS",
   2152	.mem_id = HL_MMAP_TYPE_TS_BUFF,
   2153	.mmap = hl_ts_mmap,
   2154	.alloc = hl_ts_alloc_buf,
   2155	.release = ts_buff_release,
   2156};
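
/*
 * Note on the behavior above: hl_mmap_mem_buf_alloc() (called from
 * allocate_timestamps_buffers() below) invokes .alloc to build the user and
 * kernel buffers, .mmap to expose the vmalloc'ed user buffer to user space,
 * and .release to free both buffers once the last reference is dropped.
 */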
   2157
   2158/**
   2159 * allocate_timestamps_buffers() - allocate timestamps buffers
   2160 * @hpriv: pointer to the private data of the fd
   2161 * @args: ioctl input
   2162 * @handle: user timestamp buffer handle as an output
   2163 *
   2164 * This function allocates a timestamps buffer that will later be mapped to the
   2165 * user so that it can read the timestamps. In addition, it allocates an extra
   2166 * buffer for registration management: since registration must not fail due to
   2167 * an out-of-memory situation, a pool of user interrupt nodes is prepared here
   2168 * and nodes are picked from that pool during registration instead of being
   2169 * allocated dynamically. A node is also added to the mapping hash, which is
   2170 * used to map the user timestamps buffer to the internal kernel one.
   2171 */
   2172static int allocate_timestamps_buffers(struct hl_fpriv *hpriv, struct hl_mem_in *args, u64 *handle)
   2173{
   2174	struct hl_mem_mgr *mmg = &hpriv->mem_mgr;
   2175	struct hl_mmap_mem_buf *buf;
   2176
   2177	if (args->num_of_elements > TS_MAX_ELEMENTS_NUM) {
   2178		dev_err(mmg->dev, "Num of elements exceeds Max allowed number (0x%x > 0x%x)\n",
   2179				args->num_of_elements, TS_MAX_ELEMENTS_NUM);
   2180		return -EINVAL;
   2181	}
   2182
   2183	buf = hl_mmap_mem_buf_alloc(mmg, &hl_ts_behavior, GFP_KERNEL, &args->num_of_elements);
   2184	if (!buf)
   2185		return -ENOMEM;
   2186
   2187	*handle = buf->handle;
   2188
   2189	return 0;
   2190}
   2191
   2192int hl_mem_ioctl(struct hl_fpriv *hpriv, void *data)
   2193{
   2194	enum hl_device_status status;
   2195	union hl_mem_args *args = data;
   2196	struct hl_device *hdev = hpriv->hdev;
   2197	struct hl_ctx *ctx = hpriv->ctx;
   2198	u64 block_handle, device_addr = 0;
   2199	u32 handle = 0, block_size;
   2200	int rc, dmabuf_fd = -EBADF;
   2201
   2202	if (!hl_device_operational(hdev, &status)) {
   2203		dev_warn_ratelimited(hdev->dev,
   2204			"Device is %s. Can't execute MEMORY IOCTL\n",
   2205			hdev->status[status]);
   2206		return -EBUSY;
   2207	}
   2208
   2209	if (!hdev->mmu_enable)
   2210		return mem_ioctl_no_mmu(hpriv, args);
   2211
   2212	switch (args->in.op) {
   2213	case HL_MEM_OP_ALLOC:
   2214		if (args->in.alloc.mem_size == 0) {
   2215			dev_err(hdev->dev,
   2216				"alloc size must be larger than 0\n");
   2217			rc = -EINVAL;
   2218			goto out;
   2219		}
   2220
   2221		/* If DRAM does not support virtual memory the driver won't
   2222		 * handle the allocation/freeing of that memory. However, for
   2223		 * system administration/monitoring purposes, the driver will
   2224		 * keep track of the amount of DRAM memory that is allocated
   2225		 * and freed by the user. Because this code totally relies on
   2226		 * the user's input, the driver can't ensure the validity
   2227		 * of this accounting.
   2228		 */
   2229		if (!hdev->asic_prop.dram_supports_virtual_memory) {
   2230			atomic64_add(args->in.alloc.mem_size,
   2231					&ctx->dram_phys_mem);
   2232			atomic64_add(args->in.alloc.mem_size,
   2233					&hdev->dram_used_mem);
   2234
   2235			dev_dbg(hdev->dev, "DRAM alloc is not supported\n");
   2236			rc = 0;
   2237
   2238			memset(args, 0, sizeof(*args));
   2239			args->out.handle = 0;
   2240			goto out;
   2241		}
   2242
   2243		rc = alloc_device_memory(ctx, &args->in, &handle);
   2244
   2245		memset(args, 0, sizeof(*args));
   2246		args->out.handle = (__u64) handle;
   2247		break;
   2248
   2249	case HL_MEM_OP_FREE:
   2250		/* If DRAM does not support virtual memory the driver won't
   2251		 * handle the allocation/freeing of that memory. However, for
   2252		 * system administration/monitoring purposes, the driver will
   2253		 * keep track of the amount of DRAM memory that is allocated
   2254		 * and freed by the user. Because this code totally relies on
   2255		 * the user's input, the driver can't ensure the validity
   2256		 * of this accounting.
   2257		 */
   2258		if (!hdev->asic_prop.dram_supports_virtual_memory) {
   2259			atomic64_sub(args->in.alloc.mem_size,
   2260					&ctx->dram_phys_mem);
   2261			atomic64_sub(args->in.alloc.mem_size,
   2262					&hdev->dram_used_mem);
   2263
   2264			dev_dbg(hdev->dev, "DRAM alloc is not supported\n");
   2265			rc = 0;
   2266
   2267			goto out;
   2268		}
   2269
   2270		rc = free_device_memory(ctx, &args->in);
   2271		break;
   2272
   2273	case HL_MEM_OP_MAP:
   2274		rc = map_device_va(ctx, &args->in, &device_addr);
   2275
   2276		memset(args, 0, sizeof(*args));
   2277		args->out.device_virt_addr = device_addr;
   2278		break;
   2279
   2280	case HL_MEM_OP_UNMAP:
   2281		rc = unmap_device_va(ctx, &args->in, false);
   2282		break;
   2283
   2284	case HL_MEM_OP_MAP_BLOCK:
   2285		rc = map_block(hdev, args->in.map_block.block_addr,
   2286				&block_handle, &block_size);
   2287		args->out.block_handle = block_handle;
   2288		args->out.block_size = block_size;
   2289		break;
   2290
   2291	case HL_MEM_OP_EXPORT_DMABUF_FD:
   2292		if (hdev->asic_prop.dram_supports_virtual_memory)
   2293			rc = export_dmabuf_from_handle(ctx,
   2294					args->in.export_dmabuf_fd.handle,
   2295					args->in.flags,
   2296					&dmabuf_fd);
   2297		else
   2298			rc = export_dmabuf_from_addr(ctx,
   2299					args->in.export_dmabuf_fd.handle,
   2300					args->in.export_dmabuf_fd.mem_size,
   2301					args->in.flags,
   2302					&dmabuf_fd);
   2303		memset(args, 0, sizeof(*args));
   2304		args->out.fd = dmabuf_fd;
   2305		break;
   2306
   2307	case HL_MEM_OP_TS_ALLOC:
   2308		rc = allocate_timestamps_buffers(hpriv, &args->in, &args->out.handle);
   2309		break;
   2310	default:
   2311		dev_err(hdev->dev, "Unknown opcode for memory IOCTL\n");
   2312		rc = -EINVAL;
   2313		break;
   2314	}
   2315
   2316out:
   2317	return rc;
   2318}
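
/*
 * Minimal user-space usage sketch (illustrative only) for the
 * HL_MEM_OP_EXPORT_DMABUF_FD path handled above, assuming the definitions
 * from the uapi header and an already opened device file descriptor dev_fd:
 *
 *	union hl_mem_args args = {0};
 *
 *	args.in.op = HL_MEM_OP_EXPORT_DMABUF_FD;
 *	args.in.export_dmabuf_fd.handle = handle_or_addr;
 *	args.in.export_dmabuf_fd.mem_size = size;	(address-based path only)
 *	args.in.flags = O_RDWR | O_CLOEXEC;
 *
 *	if (!ioctl(dev_fd, HL_IOCTL_MEMORY, &args))
 *		dmabuf_fd = args.out.fd;
 *
 * Whether 'handle_or_addr' is treated as an allocation handle or a physical
 * DRAM address depends on dram_supports_virtual_memory, as in the switch
 * above. HL_IOCTL_MEMORY is assumed to be the memory ioctl number from the
 * uapi header.
 */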
   2319
   2320static int get_user_memory(struct hl_device *hdev, u64 addr, u64 size,
   2321				u32 npages, u64 start, u32 offset,
   2322				struct hl_userptr *userptr)
   2323{
   2324	int rc;
   2325
   2326	if (!access_ok((void __user *) (uintptr_t) addr, size)) {
   2327		dev_err(hdev->dev, "user pointer is invalid - 0x%llx\n", addr);
   2328		return -EFAULT;
   2329	}
   2330
   2331	userptr->pages = kvmalloc_array(npages, sizeof(*userptr->pages),
   2332					GFP_KERNEL);
   2333	if (!userptr->pages)
   2334		return -ENOMEM;
   2335
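	/*
	 * FOLL_LONGTERM is used because the pages may stay pinned for the
	 * whole lifetime of the mapping; they are unpinned again in
	 * hl_unpin_host_memory().
	 */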
   2336	rc = pin_user_pages_fast(start, npages,
   2337				 FOLL_FORCE | FOLL_WRITE | FOLL_LONGTERM,
   2338				 userptr->pages);
   2339
   2340	if (rc != npages) {
   2341		dev_err(hdev->dev,
   2342			"Failed (%d) to pin host memory with user ptr 0x%llx, size 0x%llx, npages %d\n",
   2343			rc, addr, size, npages);
   2344		if (rc < 0)
   2345			goto destroy_pages;
   2346		npages = rc;
   2347		rc = -EFAULT;
   2348		goto put_pages;
   2349	}
   2350	userptr->npages = npages;
   2351
   2352	rc = sg_alloc_table_from_pages(userptr->sgt,
   2353				       userptr->pages,
   2354				       npages, offset, size, GFP_KERNEL);
   2355	if (rc < 0) {
   2356		dev_err(hdev->dev, "failed to create SG table from pages\n");
   2357		goto put_pages;
   2358	}
   2359
   2360	return 0;
   2361
   2362put_pages:
   2363	unpin_user_pages(userptr->pages, npages);
   2364destroy_pages:
   2365	kvfree(userptr->pages);
   2366	return rc;
   2367}
   2368
   2369/**
   2370 * hl_pin_host_memory() - pins a chunk of host memory.
   2371 * @hdev: pointer to the habanalabs device structure.
   2372 * @addr: the host virtual address of the memory area.
   2373 * @size: the size of the memory area.
   2374 * @userptr: pointer to hl_userptr structure.
   2375 *
   2376 * This function does the following:
   2377 * - Pins the physical pages.
   2378 * - Creates an SG list from those pages.
   2379 */
   2380int hl_pin_host_memory(struct hl_device *hdev, u64 addr, u64 size,
   2381					struct hl_userptr *userptr)
   2382{
   2383	u64 start, end;
   2384	u32 npages, offset;
   2385	int rc;
   2386
   2387	if (!size) {
   2388		dev_err(hdev->dev, "size to pin is invalid - %llu\n", size);
   2389		return -EINVAL;
   2390	}
   2391
   2392	/*
   2393	 * If the combination of the address and size requested for this memory
   2394	 * region causes an integer overflow, return error.
   2395	 */
   2396	if (((addr + size) < addr) ||
   2397			PAGE_ALIGN(addr + size) < (addr + size)) {
   2398		dev_err(hdev->dev,
   2399			"user pointer 0x%llx + %llu causes integer overflow\n",
   2400			addr, size);
   2401		return -EINVAL;
   2402	}
   2403
   2404	userptr->pid = current->pid;
   2405	userptr->sgt = kzalloc(sizeof(*userptr->sgt), GFP_KERNEL);
   2406	if (!userptr->sgt)
   2407		return -ENOMEM;
   2408
   2409	start = addr & PAGE_MASK;
   2410	offset = addr & ~PAGE_MASK;
   2411	end = PAGE_ALIGN(addr + size);
   2412	npages = (end - start) >> PAGE_SHIFT;
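	/*
	 * Example (illustrative, 4KB PAGE_SIZE): addr 0x1234 with size 0x3000
	 * gives start 0x1000, offset 0x234, end 0x5000 and npages 4 - the
	 * partially covered first and last pages are pinned in full.
	 */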
   2413
   2414	userptr->size = size;
   2415	userptr->addr = addr;
   2416	userptr->dma_mapped = false;
   2417	INIT_LIST_HEAD(&userptr->job_node);
   2418
   2419	rc = get_user_memory(hdev, addr, size, npages, start, offset,
   2420				userptr);
   2421	if (rc) {
   2422		dev_err(hdev->dev,
   2423			"failed to get user memory for address 0x%llx\n",
   2424			addr);
   2425		goto free_sgt;
   2426	}
   2427
   2428	hl_debugfs_add_userptr(hdev, userptr);
   2429
   2430	return 0;
   2431
   2432free_sgt:
   2433	kfree(userptr->sgt);
   2434	return rc;
   2435}
   2436
   2437/*
   2438 * hl_unpin_host_memory - unpins a chunk of host memory.
   2439 * @hdev: pointer to the habanalabs device structure
   2440 * @userptr: pointer to hl_userptr structure
   2441 *
   2442 * This function does the following:
   2443 * - Unpins the physical pages related to the host memory
   2444 * - Frees the SG list
   2445 */
   2446void hl_unpin_host_memory(struct hl_device *hdev, struct hl_userptr *userptr)
   2447{
   2448	hl_debugfs_remove_userptr(hdev, userptr);
   2449
   2450	if (userptr->dma_mapped)
   2451		hdev->asic_funcs->hl_dma_unmap_sgtable(hdev, userptr->sgt, userptr->dir);
   2452
   2453	unpin_user_pages_dirty_lock(userptr->pages, userptr->npages, true);
   2454	kvfree(userptr->pages);
   2455
   2456	list_del(&userptr->job_node);
   2457
   2458	sg_free_table(userptr->sgt);
   2459	kfree(userptr->sgt);
   2460}
   2461
   2462/**
   2463 * hl_userptr_delete_list() - clear userptr list.
   2464 * @hdev: pointer to the habanalabs device structure.
   2465 * @userptr_list: pointer to the list to clear.
   2466 *
   2467 * This function does the following:
   2468 * - Iterates over the list and unpins the host memory and frees the userptr
   2469 *   structure.
   2470 */
   2471void hl_userptr_delete_list(struct hl_device *hdev,
   2472				struct list_head *userptr_list)
   2473{
   2474	struct hl_userptr *userptr, *tmp;
   2475
   2476	list_for_each_entry_safe(userptr, tmp, userptr_list, job_node) {
   2477		hl_unpin_host_memory(hdev, userptr);
   2478		kfree(userptr);
   2479	}
   2480
   2481	INIT_LIST_HEAD(userptr_list);
   2482}
   2483
   2484/**
   2485 * hl_userptr_is_pinned() - returns whether the given userptr is pinned.
   2486 * @hdev: pointer to the habanalabs device structure.
   2487 * @addr: user address to check.
   2488 * @size: user block size to check.
   2489 * @userptr_list: pointer to the list to search.
   2490 * @userptr: pointer to userptr to check.
   2491 *
   2492 * This function does the following:
   2493 * - Iterates over the list and checks whether the given userptr is in it,
   2494 *   i.e. whether it is pinned. If so, returns true, otherwise returns false.
   2495 */
   2496bool hl_userptr_is_pinned(struct hl_device *hdev, u64 addr,
   2497				u32 size, struct list_head *userptr_list,
   2498				struct hl_userptr **userptr)
   2499{
   2500	list_for_each_entry((*userptr), userptr_list, job_node) {
   2501		if ((addr == (*userptr)->addr) && (size == (*userptr)->size))
   2502			return true;
   2503	}
   2504
   2505	return false;
   2506}
   2507
   2508/**
   2509 * va_range_init() - initialize virtual addresses range.
   2510 * @hdev: pointer to the habanalabs device structure.
   2511 * @va_range: pointer to the virtual addresses range to initialize.
   2512 * @start: range start address.
   2513 * @end: range end address.
   2514 * @page_size: page size for this va_range.
   2515 *
   2516 * This function does the following:
   2517 * - Initializes the virtual addresses list of the given range with the given
   2518 *   addresses.
   2519 */
   2520static int va_range_init(struct hl_device *hdev, struct hl_va_range *va_range,
   2521				u64 start, u64 end, u32 page_size)
   2522{
   2523	int rc;
   2524
   2525	INIT_LIST_HEAD(&va_range->list);
   2526
   2527	/*
   2528	 * PAGE_SIZE alignment
   2529	 * It is the caller's responsibility to align the addresses if the
   2530	 * page size is not a power of 2.
   2531	 */
   2532
   2533	if (is_power_of_2(page_size)) {
   2534		if (start & (PAGE_SIZE - 1)) {
   2535			start &= PAGE_MASK;
   2536			start += PAGE_SIZE;
   2537		}
   2538
   2539		/*
   2540		 * The end of the range is inclusive, hence we need to align it
   2541		 * to the end of the last full page in the range. For example if
   2542		 * end = 0x3ff5 with page size 0x1000, we need to align it to
   2543		 * 0x2fff. The remainig 0xff5 bytes do not form a full page.
   2544		 * 0x2fff. The remaining 0xff6 bytes do not form a full page.
   2545		if ((end + 1) & (PAGE_SIZE - 1))
   2546			end = ((end + 1) & PAGE_MASK) - 1;
   2547	}
   2548
   2549	if (start >= end) {
   2550		dev_err(hdev->dev, "too small vm range for va list\n");
   2551		return -EFAULT;
   2552	}
   2553
   2554	rc = add_va_block(hdev, va_range, start, end);
   2555
   2556	if (rc) {
   2557		dev_err(hdev->dev, "Failed to init host va list\n");
   2558		return rc;
   2559	}
   2560
   2561	va_range->start_addr = start;
   2562	va_range->end_addr = end;
   2563	va_range->page_size = page_size;
   2564
   2565	return 0;
   2566}
   2567
   2568/**
   2569 * va_range_fini() - clear a virtual addresses range.
   2570 * @hdev: pointer to the habanalabs structure.
   2571 * @va_range: pointer to virtual addresses range.
   2572 *
   2573 * This function does the following:
   2574 * - Frees the virtual addresses block list and its lock.
   2575 */
   2576static void va_range_fini(struct hl_device *hdev, struct hl_va_range *va_range)
   2577{
   2578	mutex_lock(&va_range->lock);
   2579	clear_va_list_locked(hdev, &va_range->list);
   2580	mutex_unlock(&va_range->lock);
   2581
   2582	mutex_destroy(&va_range->lock);
   2583	kfree(va_range);
   2584}
   2585
   2586/**
   2587 * vm_ctx_init_with_ranges() - initialize virtual memory for context.
   2588 * @ctx: pointer to the habanalabs context structure.
   2589 * @host_range_start: host virtual addresses range start.
   2590 * @host_range_end: host virtual addresses range end.
   2591 * @host_page_size: host page size.
   2592 * @host_huge_range_start: host virtual addresses range start for memory
   2593 *                         allocated with huge pages.
   2594 * @host_huge_range_end: host virtual addresses range end for memory allocated
   2595 *                        with huge pages.
   2596 * @host_huge_page_size: host huge page size.
   2597 * @dram_range_start: dram virtual addresses range start.
   2598 * @dram_range_end: dram virtual addresses range end.
   2599 * @dram_page_size: dram page size.
   2600 *
   2601 * This function initializes the following:
   2602 * - MMU for context.
   2603 * - Virtual address to area descriptor hashtable.
   2604 * - Virtual block list of available virtual memory.
   2605 */
   2606static int vm_ctx_init_with_ranges(struct hl_ctx *ctx,
   2607					u64 host_range_start,
   2608					u64 host_range_end,
   2609					u32 host_page_size,
   2610					u64 host_huge_range_start,
   2611					u64 host_huge_range_end,
   2612					u32 host_huge_page_size,
   2613					u64 dram_range_start,
   2614					u64 dram_range_end,
   2615					u32 dram_page_size)
   2616{
   2617	struct hl_device *hdev = ctx->hdev;
   2618	int i, rc;
   2619
   2620	for (i = 0 ; i < HL_VA_RANGE_TYPE_MAX ; i++) {
   2621		ctx->va_range[i] =
   2622			kzalloc(sizeof(struct hl_va_range), GFP_KERNEL);
   2623		if (!ctx->va_range[i]) {
   2624			rc = -ENOMEM;
   2625			goto free_va_range;
   2626		}
   2627	}
   2628
   2629	rc = hl_mmu_ctx_init(ctx);
   2630	if (rc) {
   2631		dev_err(hdev->dev, "failed to init context %d\n", ctx->asid);
   2632		goto free_va_range;
   2633	}
   2634
   2635	mutex_init(&ctx->mem_hash_lock);
   2636	hash_init(ctx->mem_hash);
   2637
   2638	mutex_init(&ctx->va_range[HL_VA_RANGE_TYPE_HOST]->lock);
   2639
   2640	rc = va_range_init(hdev, ctx->va_range[HL_VA_RANGE_TYPE_HOST],
   2641			host_range_start, host_range_end, host_page_size);
   2642	if (rc) {
   2643		dev_err(hdev->dev, "failed to init host vm range\n");
   2644		goto mmu_ctx_fini;
   2645	}
   2646
   2647	if (hdev->pmmu_huge_range) {
   2648		mutex_init(&ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]->lock);
   2649
   2650		rc = va_range_init(hdev,
   2651			ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE],
   2652			host_huge_range_start, host_huge_range_end,
   2653			host_huge_page_size);
   2654		if (rc) {
   2655			dev_err(hdev->dev,
   2656				"failed to init host huge vm range\n");
   2657			goto clear_host_va_range;
   2658		}
   2659	} else {
   2660		kfree(ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]);
   2661		ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE] =
   2662				ctx->va_range[HL_VA_RANGE_TYPE_HOST];
   2663	}
   2664
   2665	mutex_init(&ctx->va_range[HL_VA_RANGE_TYPE_DRAM]->lock);
   2666
   2667	rc = va_range_init(hdev, ctx->va_range[HL_VA_RANGE_TYPE_DRAM],
   2668			dram_range_start, dram_range_end, dram_page_size);
   2669	if (rc) {
   2670		dev_err(hdev->dev, "failed to init dram vm range\n");
   2671		goto clear_host_huge_va_range;
   2672	}
   2673
   2674	hl_debugfs_add_ctx_mem_hash(hdev, ctx);
   2675
   2676	return 0;
   2677
   2678clear_host_huge_va_range:
   2679	mutex_destroy(&ctx->va_range[HL_VA_RANGE_TYPE_DRAM]->lock);
   2680
   2681	if (hdev->pmmu_huge_range) {
   2682		mutex_lock(&ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]->lock);
   2683		clear_va_list_locked(hdev,
   2684			&ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]->list);
   2685		mutex_unlock(&ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]->lock);
   2686	}
   2687clear_host_va_range:
   2688	if (hdev->pmmu_huge_range)
   2689		mutex_destroy(&ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]->lock);
   2690	mutex_lock(&ctx->va_range[HL_VA_RANGE_TYPE_HOST]->lock);
   2691	clear_va_list_locked(hdev, &ctx->va_range[HL_VA_RANGE_TYPE_HOST]->list);
   2692	mutex_unlock(&ctx->va_range[HL_VA_RANGE_TYPE_HOST]->lock);
   2693mmu_ctx_fini:
   2694	mutex_destroy(&ctx->va_range[HL_VA_RANGE_TYPE_HOST]->lock);
   2695	mutex_destroy(&ctx->mem_hash_lock);
   2696	hl_mmu_ctx_fini(ctx);
   2697free_va_range:
   2698	for (i = 0 ; i < HL_VA_RANGE_TYPE_MAX ; i++)
   2699		kfree(ctx->va_range[i]);
   2700
   2701	return rc;
   2702}
   2703
   2704int hl_vm_ctx_init(struct hl_ctx *ctx)
   2705{
   2706	struct asic_fixed_properties *prop = &ctx->hdev->asic_prop;
   2707	u64 host_range_start, host_range_end, host_huge_range_start,
   2708		host_huge_range_end, dram_range_start, dram_range_end;
   2709	u32 host_page_size, host_huge_page_size, dram_page_size;
   2710
   2711	atomic64_set(&ctx->dram_phys_mem, 0);
   2712
   2713	/*
   2714	 * - If MMU is enabled, init the ranges as usual.
   2715	 * - If MMU is disabled, in case of host mapping, the returned address
   2716	 *   is the given one.
   2717	 *   In case of DRAM mapping, the returned address is the physical
   2718	 *   address of the memory related to the given handle.
   2719	 */
   2720	if (!ctx->hdev->mmu_enable)
   2721		return 0;
   2722
   2723	dram_range_start = prop->dmmu.start_addr;
   2724	dram_range_end = prop->dmmu.end_addr - 1;
   2725	dram_page_size = prop->dram_page_size ?
   2726				prop->dram_page_size : prop->dmmu.page_size;
   2727	host_range_start = prop->pmmu.start_addr;
   2728	host_range_end = prop->pmmu.end_addr - 1;
   2729	host_page_size = prop->pmmu.page_size;
   2730	host_huge_range_start = prop->pmmu_huge.start_addr;
   2731	host_huge_range_end = prop->pmmu_huge.end_addr - 1;
   2732	host_huge_page_size = prop->pmmu_huge.page_size;
   2733
   2734	return vm_ctx_init_with_ranges(ctx, host_range_start, host_range_end,
   2735			host_page_size, host_huge_range_start,
   2736			host_huge_range_end, host_huge_page_size,
   2737			dram_range_start, dram_range_end, dram_page_size);
   2738}
   2739
   2740/**
   2741 * hl_vm_ctx_fini() - virtual memory teardown of context.
   2742 * @ctx: pointer to the habanalabs context structure.
   2743 *
   2744 * This function performs teardown of the following:
   2745 * - Virtual block list of available virtual memory.
   2746 * - Virtual address to area descriptor hashtable.
   2747 * - MMU for context.
   2748 *
   2749 * In addition this function does the following:
   2750 * - Unmaps the existing hashtable nodes if the hashtable is not empty. The
   2751 *   hashtable should be empty as no valid mappings should exist at this
   2752 *   point.
   2753 * - Frees any existing physical page list from the idr which relates to the
   2754 *   current context asid.
   2755 * - This function checks the virtual block list for correctness. At this point
   2756 *   the list should contain one element which describes the whole virtual
   2757 *   memory range of the context. Otherwise, a warning is printed.
   2758 */
   2759void hl_vm_ctx_fini(struct hl_ctx *ctx)
   2760{
   2761	struct hl_vm_phys_pg_pack *phys_pg_list, *tmp_phys_node;
   2762	struct hl_device *hdev = ctx->hdev;
   2763	struct hl_vm_hash_node *hnode;
   2764	struct hl_vm *vm = &hdev->vm;
   2765	struct hlist_node *tmp_node;
   2766	struct list_head free_list;
   2767	struct hl_mem_in args;
   2768	int i;
   2769
   2770	if (!hdev->mmu_enable)
   2771		return;
   2772
   2773	hl_debugfs_remove_ctx_mem_hash(hdev, ctx);
   2774
   2775	/*
   2776	 * Clearly something went wrong on hard reset so no point in printing
   2777	 * another side effect error
   2778	 */
   2779	if (!hdev->reset_info.hard_reset_pending && !hash_empty(ctx->mem_hash))
   2780		dev_dbg(hdev->dev,
   2781			"user released device without removing its memory mappings\n");
   2782
   2783	hash_for_each_safe(ctx->mem_hash, i, tmp_node, hnode, node) {
   2784		dev_dbg(hdev->dev,
   2785			"hl_mem_hash_node of vaddr 0x%llx of asid %d is still alive\n",
   2786			hnode->vaddr, ctx->asid);
   2787		args.unmap.device_virt_addr = hnode->vaddr;
   2788		unmap_device_va(ctx, &args, true);
   2789	}
   2790
   2791	mutex_lock(&ctx->mmu_lock);
   2792
   2793	/* invalidate the cache once after the unmapping loop */
   2794	hl_mmu_invalidate_cache(hdev, true, MMU_OP_USERPTR);
   2795	hl_mmu_invalidate_cache(hdev, true, MMU_OP_PHYS_PACK);
   2796
   2797	mutex_unlock(&ctx->mmu_lock);
   2798
   2799	INIT_LIST_HEAD(&free_list);
   2800
   2801	spin_lock(&vm->idr_lock);
   2802	idr_for_each_entry(&vm->phys_pg_pack_handles, phys_pg_list, i)
   2803		if (phys_pg_list->asid == ctx->asid) {
   2804			dev_dbg(hdev->dev,
   2805				"page list 0x%px of asid %d is still alive\n",
   2806				phys_pg_list, ctx->asid);
   2807
   2808			atomic64_sub(phys_pg_list->total_size, &hdev->dram_used_mem);
   2809			idr_remove(&vm->phys_pg_pack_handles, i);
   2810			list_add(&phys_pg_list->node, &free_list);
   2811		}
   2812	spin_unlock(&vm->idr_lock);
   2813
   2814	list_for_each_entry_safe(phys_pg_list, tmp_phys_node, &free_list, node)
   2815		free_phys_pg_pack(hdev, phys_pg_list);
   2816
   2817	va_range_fini(hdev, ctx->va_range[HL_VA_RANGE_TYPE_DRAM]);
   2818	va_range_fini(hdev, ctx->va_range[HL_VA_RANGE_TYPE_HOST]);
   2819
   2820	if (hdev->pmmu_huge_range)
   2821		va_range_fini(hdev, ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]);
   2822
   2823	mutex_destroy(&ctx->mem_hash_lock);
   2824	hl_mmu_ctx_fini(ctx);
   2825
   2826	/* In this case we need to clear the global accounting of DRAM usage
   2827	 * because the user notifies us on allocations. Once the user is gone,
   2828	 * all DRAM is available again.
   2829	 */
   2830	if (ctx->asid != HL_KERNEL_ASID_ID &&
   2831			!hdev->asic_prop.dram_supports_virtual_memory)
   2832		atomic64_set(&hdev->dram_used_mem, 0);
   2833}
   2834
   2835/**
   2836 * hl_vm_init() - initialize virtual memory module.
   2837 * @hdev: pointer to the habanalabs device structure.
   2838 *
   2839 * This function initializes the following:
   2840 * - MMU module.
   2841 * - DRAM physical pages pool, with the DRAM page size as its granularity.
   2842 * - Idr for device memory allocation handles.
   2843 */
   2844int hl_vm_init(struct hl_device *hdev)
   2845{
   2846	struct asic_fixed_properties *prop = &hdev->asic_prop;
   2847	struct hl_vm *vm = &hdev->vm;
   2848	int rc;
   2849
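	/*
	 * The pool's minimum allocation order is taken from the DRAM page size
	 * when it is a power of 2; otherwise DRAM_POOL_PAGE_SIZE is used as the
	 * granule, so that non-power-of-2 page sizes can still be served from
	 * power-of-2 chunks.
	 */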
   2850	if (is_power_of_2(prop->dram_page_size))
   2851		vm->dram_pg_pool =
   2852			gen_pool_create(__ffs(prop->dram_page_size), -1);
   2853	else
   2854		vm->dram_pg_pool =
   2855			gen_pool_create(__ffs(DRAM_POOL_PAGE_SIZE), -1);
   2856
   2857	if (!vm->dram_pg_pool) {
   2858		dev_err(hdev->dev, "Failed to create dram page pool\n");
   2859		return -ENOMEM;
   2860	}
   2861
   2862	kref_init(&vm->dram_pg_pool_refcount);
   2863
   2864	rc = gen_pool_add(vm->dram_pg_pool, prop->dram_user_base_address,
   2865			prop->dram_end_address - prop->dram_user_base_address,
   2866			-1);
   2867
   2868	if (rc) {
   2869		dev_err(hdev->dev,
   2870			"Failed to add memory to dram page pool %d\n", rc);
   2871		goto pool_add_err;
   2872	}
   2873
   2874	spin_lock_init(&vm->idr_lock);
   2875	idr_init(&vm->phys_pg_pack_handles);
   2876
   2877	atomic64_set(&hdev->dram_used_mem, 0);
   2878
   2879	vm->init_done = true;
   2880
   2881	return 0;
   2882
   2883pool_add_err:
   2884	gen_pool_destroy(vm->dram_pg_pool);
   2885
   2886	return rc;
   2887}
   2888
   2889/**
   2890 * hl_vm_fini() - virtual memory module teardown.
   2891 * @hdev: pointer to the habanalabs device structure.
   2892 *
   2893 * This function performs teardown of the following:
   2894 * - Idr for device memory allocation handles.
   2895 * - DRAM physical pages pool, with the DRAM page size as its granularity.
   2896 * - MMU module.
   2897 */
   2898void hl_vm_fini(struct hl_device *hdev)
   2899{
   2900	struct hl_vm *vm = &hdev->vm;
   2901
   2902	if (!vm->init_done)
   2903		return;
   2904
   2905	/*
   2906	 * At this point all the contexts should be freed and hence no DRAM
   2907	 * memory should be in use. Hence the DRAM pool should be freed here.
   2908	 */
   2909	if (kref_put(&vm->dram_pg_pool_refcount, dram_pg_pool_do_release) != 1)
   2910		dev_warn(hdev->dev, "dram_pg_pool was not destroyed on %s\n",
   2911				__func__);
   2912
   2913	vm->init_done = false;
   2914}
   2915
   2916/**
   2917 * hl_hw_block_mem_init() - HW block memory initialization.
   2918 * @ctx: pointer to the habanalabs context structure.
   2919 *
   2920 * This function initializes the HW block virtual mapped addresses list and
   2921 * its lock.
   2922 */
   2923void hl_hw_block_mem_init(struct hl_ctx *ctx)
   2924{
   2925	mutex_init(&ctx->hw_block_list_lock);
   2926	INIT_LIST_HEAD(&ctx->hw_block_mem_list);
   2927}
   2928
   2929/**
   2930 * hl_hw_block_mem_fini() - HW block memory teardown.
   2931 * @ctx: pointer to the habanalabs context structure.
   2932 *
   2933 * This function clears the HW block virtual mapped addresses list and destroys
   2934 * its lock.
   2935 */
   2936void hl_hw_block_mem_fini(struct hl_ctx *ctx)
   2937{
   2938	struct hl_vm_hw_block_list_node *lnode, *tmp;
   2939
   2940	if (!list_empty(&ctx->hw_block_mem_list))
   2941		dev_crit(ctx->hdev->dev, "HW block mem list isn't empty\n");
   2942
   2943	list_for_each_entry_safe(lnode, tmp, &ctx->hw_block_mem_list, node) {
   2944		list_del(&lnode->node);
   2945		kfree(lnode);
   2946	}
   2947
   2948	mutex_destroy(&ctx->hw_block_list_lock);
   2949}