amdgpu_amdkfd_gpuvm.c - cachepc-linux - Fork of AMDESE/linux with modifications for CachePC side-channel attack

	cachepc-linux Fork of AMDESE/linux with modifications for CachePC side-channel attack
	git clone https://git.sinitax.com/sinitax/cachepc-linux
	Log \| Files \| Refs \| README \| LICENSE \| sfeed.txt
amdgpu_amdkfd_gpuvm.c (72759B)
      1/*
      2 * Copyright 2014-2018 Advanced Micro Devices, Inc.
      3 *
      4 * Permission is hereby granted, free of charge, to any person obtaining a
      5 * copy of this software and associated documentation files (the "Software"),
      6 * to deal in the Software without restriction, including without limitation
      7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8 * and/or sell copies of the Software, and to permit persons to whom the
      9 * Software is furnished to do so, subject to the following conditions:
     10 *
     11 * The above copyright notice and this permission notice shall be included in
     12 * all copies or substantial portions of the Software.
     13 *
     14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
     18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
     19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
     20 * OTHER DEALINGS IN THE SOFTWARE.
     21 */
     22#include <linux/dma-buf.h>
     23#include <linux/list.h>
     24#include <linux/pagemap.h>
     25#include <linux/sched/mm.h>
     26#include <linux/sched/task.h>
     27
     28#include "amdgpu_object.h"
     29#include "amdgpu_gem.h"
     30#include "amdgpu_vm.h"
     31#include "amdgpu_amdkfd.h"
     32#include "amdgpu_dma_buf.h"
     33#include <uapi/linux/kfd_ioctl.h>
     34#include "amdgpu_xgmi.h"
     35
/* Userptr restore delay (the _MS suffix indicates milliseconds), just
 * long enough to allow consecutive VM changes to accumulate before the
 * restore runs.
 */
#define AMDGPU_USERPTR_RESTORE_DELAY_MS 1
     40
/* Impose limit on how much memory KFD can use.
 *
 * The two limits are filled in by amdgpu_amdkfd_gpuvm_init_mem_limits().
 * The usage counters are signed; unreserve_mem_limit() warns if one of
 * them drops below zero.
 */
static struct {
	uint64_t max_system_mem_limit;	/* upper bound for system_mem_used */
	uint64_t max_ttm_mem_limit;	/* upper bound for ttm_mem_used */
	int64_t system_mem_used;	/* system memory currently accounted */
	int64_t ttm_mem_used;		/* TTM memory currently accounted */
	spinlock_t mem_limit_lock;	/* held while checking/updating usage */
} kfd_mem_limit;
     49
     50static const char * const domain_bit_to_string[] = {
     51		"CPU",
     52		"GTT",
     53		"VRAM",
     54		"GDS",
     55		"GWS",
     56		"OA"
     57};
     58
     59#define domain_string(domain) domain_bit_to_string[ffs(domain)-1]
     60
/* Forward declaration of the file-local userptr restore work handler so
 * it can be referenced before its definition.
 */
static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work);
     62
     63static bool kfd_mem_is_attached(struct amdgpu_vm *avm,
     64		struct kgd_mem *mem)
     65{
     66	struct kfd_mem_attachment *entry;
     67
     68	list_for_each_entry(entry, &mem->attachments, list)
     69		if (entry->bo_va->base.vm == avm)
     70			return true;
     71
     72	return false;
     73}
     74
     75/* Set memory usage limits. Current, limits are
     76 *  System (TTM + userptr) memory - 15/16th System RAM
     77 *  TTM memory - 3/8th System RAM
     78 */
     79void amdgpu_amdkfd_gpuvm_init_mem_limits(void)
     80{
     81	struct sysinfo si;
     82	uint64_t mem;
     83
     84	si_meminfo(&si);
     85	mem = si.freeram - si.freehigh;
     86	mem *= si.mem_unit;
     87
     88	spin_lock_init(&kfd_mem_limit.mem_limit_lock);
     89	kfd_mem_limit.max_system_mem_limit = mem - (mem >> 4);
     90	kfd_mem_limit.max_ttm_mem_limit = (mem >> 1) - (mem >> 3);
     91	pr_debug("Kernel memory limit %lluM, TTM limit %lluM\n",
     92		(kfd_mem_limit.max_system_mem_limit >> 20),
     93		(kfd_mem_limit.max_ttm_mem_limit >> 20));
     94}
     95
     96void amdgpu_amdkfd_reserve_system_mem(uint64_t size)
     97{
     98	kfd_mem_limit.system_mem_used += size;
     99}
    100
    101/* Estimate page table size needed to represent a given memory size
    102 *
    103 * With 4KB pages, we need one 8 byte PTE for each 4KB of memory
    104 * (factor 512, >> 9). With 2MB pages, we need one 8 byte PTE for 2MB
    105 * of memory (factor 256K, >> 18). ROCm user mode tries to optimize
    106 * for 2MB pages for TLB efficiency. However, small allocations and
    107 * fragmented system memory still need some 4KB pages. We choose a
    108 * compromise that should work in most cases without reserving too
    109 * much memory for page tables unnecessarily (factor 16K, >> 14).
    110 */
    111#define ESTIMATE_PT_SIZE(mem_size) ((mem_size) >> 14)
    112
    113static size_t amdgpu_amdkfd_acc_size(uint64_t size)
    114{
    115	size >>= PAGE_SHIFT;
    116	size *= sizeof(dma_addr_t) + sizeof(void *);
    117
    118	return __roundup_pow_of_two(sizeof(struct amdgpu_bo)) +
    119		__roundup_pow_of_two(sizeof(struct ttm_tt)) +
    120		PAGE_ALIGN(size);
    121}
    122
    123/**
    124 * amdgpu_amdkfd_reserve_mem_limit() - Decrease available memory by size
    125 * of buffer including any reserved for control structures
    126 *
    127 * @adev: Device to which allocated BO belongs to
    128 * @size: Size of buffer, in bytes, encapsulated by B0. This should be
    129 * equivalent to amdgpu_bo_size(BO)
    130 * @alloc_flag: Flag used in allocating a BO as noted above
    131 *
    132 * Return: returns -ENOMEM in case of error, ZERO otherwise
    133 */
    134static int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
    135		uint64_t size, u32 alloc_flag)
    136{
    137	uint64_t reserved_for_pt =
    138		ESTIMATE_PT_SIZE(amdgpu_amdkfd_total_mem_size);
    139	size_t acc_size, system_mem_needed, ttm_mem_needed, vram_needed;
    140	int ret = 0;
    141
    142	acc_size = amdgpu_amdkfd_acc_size(size);
    143
    144	vram_needed = 0;
    145	if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_GTT) {
    146		system_mem_needed = acc_size + size;
    147		ttm_mem_needed = acc_size + size;
    148	} else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
    149		system_mem_needed = acc_size;
    150		ttm_mem_needed = acc_size;
    151		vram_needed = size;
    152	} else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) {
    153		system_mem_needed = acc_size + size;
    154		ttm_mem_needed = acc_size;
    155	} else if (alloc_flag &
    156		   (KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL |
    157		    KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP)) {
    158		system_mem_needed = acc_size;
    159		ttm_mem_needed = acc_size;
    160	} else {
    161		pr_err("%s: Invalid BO type %#x\n", __func__, alloc_flag);
    162		return -ENOMEM;
    163	}
    164
    165	spin_lock(&kfd_mem_limit.mem_limit_lock);
    166
    167	if (kfd_mem_limit.system_mem_used + system_mem_needed >
    168	    kfd_mem_limit.max_system_mem_limit)
    169		pr_debug("Set no_system_mem_limit=1 if using shared memory\n");
    170
    171	if ((kfd_mem_limit.system_mem_used + system_mem_needed >
    172	     kfd_mem_limit.max_system_mem_limit && !no_system_mem_limit) ||
    173	    (kfd_mem_limit.ttm_mem_used + ttm_mem_needed >
    174	     kfd_mem_limit.max_ttm_mem_limit) ||
    175	    (adev->kfd.vram_used + vram_needed >
    176	     adev->gmc.real_vram_size - reserved_for_pt)) {
    177		ret = -ENOMEM;
    178		goto release;
    179	}
    180
    181	/* Update memory accounting by decreasing available system
    182	 * memory, TTM memory and GPU memory as computed above
    183	 */
    184	adev->kfd.vram_used += vram_needed;
    185	kfd_mem_limit.system_mem_used += system_mem_needed;
    186	kfd_mem_limit.ttm_mem_used += ttm_mem_needed;
    187
    188release:
    189	spin_unlock(&kfd_mem_limit.mem_limit_lock);
    190	return ret;
    191}
    192
    193static void unreserve_mem_limit(struct amdgpu_device *adev,
    194		uint64_t size, u32 alloc_flag)
    195{
    196	size_t acc_size;
    197
    198	acc_size = amdgpu_amdkfd_acc_size(size);
    199
    200	spin_lock(&kfd_mem_limit.mem_limit_lock);
    201
    202	if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_GTT) {
    203		kfd_mem_limit.system_mem_used -= (acc_size + size);
    204		kfd_mem_limit.ttm_mem_used -= (acc_size + size);
    205	} else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
    206		kfd_mem_limit.system_mem_used -= acc_size;
    207		kfd_mem_limit.ttm_mem_used -= acc_size;
    208		adev->kfd.vram_used -= size;
    209	} else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) {
    210		kfd_mem_limit.system_mem_used -= (acc_size + size);
    211		kfd_mem_limit.ttm_mem_used -= acc_size;
    212	} else if (alloc_flag &
    213		   (KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL |
    214		    KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP)) {
    215		kfd_mem_limit.system_mem_used -= acc_size;
    216		kfd_mem_limit.ttm_mem_used -= acc_size;
    217	} else {
    218		pr_err("%s: Invalid BO type %#x\n", __func__, alloc_flag);
    219		goto release;
    220	}
    221
    222	WARN_ONCE(adev->kfd.vram_used < 0,
    223		  "KFD VRAM memory accounting unbalanced");
    224	WARN_ONCE(kfd_mem_limit.ttm_mem_used < 0,
    225		  "KFD TTM memory accounting unbalanced");
    226	WARN_ONCE(kfd_mem_limit.system_mem_used < 0,
    227		  "KFD system memory accounting unbalanced");
    228
    229release:
    230	spin_unlock(&kfd_mem_limit.mem_limit_lock);
    231}
    232
    233void amdgpu_amdkfd_release_notify(struct amdgpu_bo *bo)
    234{
    235	struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
    236	u32 alloc_flags = bo->kfd_bo->alloc_flags;
    237	u64 size = amdgpu_bo_size(bo);
    238
    239	unreserve_mem_limit(adev, size, alloc_flags);
    240
    241	kfree(bo->kfd_bo);
    242}
    243
    244/* amdgpu_amdkfd_remove_eviction_fence - Removes eviction fence from BO's
    245 *  reservation object.
    246 *
    247 * @bo: [IN] Remove eviction fence(s) from this BO
    248 * @ef: [IN] This eviction fence is removed if it
    249 *  is present in the shared list.
    250 *
    251 * NOTE: Must be called with BO reserved i.e. bo->tbo.resv->lock held.
    252 */
    253static int amdgpu_amdkfd_remove_eviction_fence(struct amdgpu_bo *bo,
    254					struct amdgpu_amdkfd_fence *ef)
    255{
    256	struct dma_fence *replacement;
    257
    258	if (!ef)
    259		return -EINVAL;
    260
    261	/* TODO: Instead of block before we should use the fence of the page
    262	 * table update and TLB flush here directly.
    263	 */
    264	replacement = dma_fence_get_stub();
    265	dma_resv_replace_fences(bo->tbo.base.resv, ef->base.context,
    266				replacement, DMA_RESV_USAGE_READ);
    267	dma_fence_put(replacement);
    268	return 0;
    269}
    270
    271int amdgpu_amdkfd_remove_fence_on_pt_pd_bos(struct amdgpu_bo *bo)
    272{
    273	struct amdgpu_bo *root = bo;
    274	struct amdgpu_vm_bo_base *vm_bo;
    275	struct amdgpu_vm *vm;
    276	struct amdkfd_process_info *info;
    277	struct amdgpu_amdkfd_fence *ef;
    278	int ret;
    279
    280	/* we can always get vm_bo from root PD bo.*/
    281	while (root->parent)
    282		root = root->parent;
    283
    284	vm_bo = root->vm_bo;
    285	if (!vm_bo)
    286		return 0;
    287
    288	vm = vm_bo->vm;
    289	if (!vm)
    290		return 0;
    291
    292	info = vm->process_info;
    293	if (!info || !info->eviction_fence)
    294		return 0;
    295
    296	ef = container_of(dma_fence_get(&info->eviction_fence->base),
    297			struct amdgpu_amdkfd_fence, base);
    298
    299	BUG_ON(!dma_resv_trylock(bo->tbo.base.resv));
    300	ret = amdgpu_amdkfd_remove_eviction_fence(bo, ef);
    301	dma_resv_unlock(bo->tbo.base.resv);
    302
    303	dma_fence_put(&ef->base);
    304	return ret;
    305}
    306
    307static int amdgpu_amdkfd_bo_validate(struct amdgpu_bo *bo, uint32_t domain,
    308				     bool wait)
    309{
    310	struct ttm_operation_ctx ctx = { false, false };
    311	int ret;
    312
    313	if (WARN(amdgpu_ttm_tt_get_usermm(bo->tbo.ttm),
    314		 "Called with userptr BO"))
    315		return -EINVAL;
    316
    317	amdgpu_bo_placement_from_domain(bo, domain);
    318
    319	ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
    320	if (ret)
    321		goto validate_fail;
    322	if (wait)
    323		amdgpu_bo_sync_wait(bo, AMDGPU_FENCE_OWNER_KFD, false);
    324
    325validate_fail:
    326	return ret;
    327}
    328
    329static int amdgpu_amdkfd_validate_vm_bo(void *_unused, struct amdgpu_bo *bo)
    330{
    331	return amdgpu_amdkfd_bo_validate(bo, bo->allowed_domains, false);
    332}
    333
    334/* vm_validate_pt_pd_bos - Validate page table and directory BOs
    335 *
    336 * Page directories are not updated here because huge page handling
    337 * during page table updates can invalidate page directory entries
    338 * again. Page directories are only updated after updating page
    339 * tables.
    340 */
    341static int vm_validate_pt_pd_bos(struct amdgpu_vm *vm)
    342{
    343	struct amdgpu_bo *pd = vm->root.bo;
    344	struct amdgpu_device *adev = amdgpu_ttm_adev(pd->tbo.bdev);
    345	int ret;
    346
    347	ret = amdgpu_vm_validate_pt_bos(adev, vm, amdgpu_amdkfd_validate_vm_bo, NULL);
    348	if (ret) {
    349		pr_err("failed to validate PT BOs\n");
    350		return ret;
    351	}
    352
    353	ret = amdgpu_amdkfd_validate_vm_bo(NULL, pd);
    354	if (ret) {
    355		pr_err("failed to validate PD\n");
    356		return ret;
    357	}
    358
    359	vm->pd_phys_addr = amdgpu_gmc_pd_addr(vm->root.bo);
    360
    361	if (vm->use_cpu_for_update) {
    362		ret = amdgpu_bo_kmap(pd, NULL);
    363		if (ret) {
    364			pr_err("failed to kmap PD, ret=%d\n", ret);
    365			return ret;
    366		}
    367	}
    368
    369	return 0;
    370}
    371
    372static int vm_update_pds(struct amdgpu_vm *vm, struct amdgpu_sync *sync)
    373{
    374	struct amdgpu_bo *pd = vm->root.bo;
    375	struct amdgpu_device *adev = amdgpu_ttm_adev(pd->tbo.bdev);
    376	int ret;
    377
    378	ret = amdgpu_vm_update_pdes(adev, vm, false);
    379	if (ret)
    380		return ret;
    381
    382	return amdgpu_sync_fence(sync, vm->last_update);
    383}
    384
    385static uint64_t get_pte_flags(struct amdgpu_device *adev, struct kgd_mem *mem)
    386{
    387	struct amdgpu_device *bo_adev = amdgpu_ttm_adev(mem->bo->tbo.bdev);
    388	bool coherent = mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_COHERENT;
    389	bool uncached = mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED;
    390	uint32_t mapping_flags;
    391	uint64_t pte_flags;
    392	bool snoop = false;
    393
    394	mapping_flags = AMDGPU_VM_PAGE_READABLE;
    395	if (mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE)
    396		mapping_flags |= AMDGPU_VM_PAGE_WRITEABLE;
    397	if (mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE)
    398		mapping_flags |= AMDGPU_VM_PAGE_EXECUTABLE;
    399
    400	switch (adev->asic_type) {
    401	case CHIP_ARCTURUS:
    402		if (mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
    403			if (bo_adev == adev)
    404				mapping_flags |= coherent ?
    405					AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW;
    406			else
    407				mapping_flags |= coherent ?
    408					AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
    409		} else {
    410			mapping_flags |= coherent ?
    411				AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
    412		}
    413		break;
    414	case CHIP_ALDEBARAN:
    415		if (coherent && uncached) {
    416			if (adev->gmc.xgmi.connected_to_cpu ||
    417				!(mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM))
    418				snoop = true;
    419			mapping_flags |= AMDGPU_VM_MTYPE_UC;
    420		} else if (mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
    421			if (bo_adev == adev) {
    422				mapping_flags |= coherent ?
    423					AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW;
    424				if (adev->gmc.xgmi.connected_to_cpu)
    425					snoop = true;
    426			} else {
    427				mapping_flags |= coherent ?
    428					AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
    429				if (amdgpu_xgmi_same_hive(adev, bo_adev))
    430					snoop = true;
    431			}
    432		} else {
    433			snoop = true;
    434			mapping_flags |= coherent ?
    435				AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
    436		}
    437		break;
    438	default:
    439		mapping_flags |= coherent ?
    440			AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
    441	}
    442
    443	pte_flags = amdgpu_gem_va_map_flags(adev, mapping_flags);
    444	pte_flags |= snoop ? AMDGPU_PTE_SNOOPED : 0;
    445
    446	return pte_flags;
    447}
    448
    449static int
    450kfd_mem_dmamap_userptr(struct kgd_mem *mem,
    451		       struct kfd_mem_attachment *attachment)
    452{
    453	enum dma_data_direction direction =
    454		mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE ?
    455		DMA_BIDIRECTIONAL : DMA_TO_DEVICE;
    456	struct ttm_operation_ctx ctx = {.interruptible = true};
    457	struct amdgpu_bo *bo = attachment->bo_va->base.bo;
    458	struct amdgpu_device *adev = attachment->adev;
    459	struct ttm_tt *src_ttm = mem->bo->tbo.ttm;
    460	struct ttm_tt *ttm = bo->tbo.ttm;
    461	int ret;
    462
    463	ttm->sg = kmalloc(sizeof(*ttm->sg), GFP_KERNEL);
    464	if (unlikely(!ttm->sg))
    465		return -ENOMEM;
    466
    467	if (WARN_ON(ttm->num_pages != src_ttm->num_pages))
    468		return -EINVAL;
    469
    470	/* Same sequence as in amdgpu_ttm_tt_pin_userptr */
    471	ret = sg_alloc_table_from_pages(ttm->sg, src_ttm->pages,
    472					ttm->num_pages, 0,
    473					(u64)ttm->num_pages << PAGE_SHIFT,
    474					GFP_KERNEL);
    475	if (unlikely(ret))
    476		goto free_sg;
    477
    478	ret = dma_map_sgtable(adev->dev, ttm->sg, direction, 0);
    479	if (unlikely(ret))
    480		goto release_sg;
    481
    482	drm_prime_sg_to_dma_addr_array(ttm->sg, ttm->dma_address,
    483				       ttm->num_pages);
    484
    485	amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_GTT);
    486	ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
    487	if (ret)
    488		goto unmap_sg;
    489
    490	return 0;
    491
    492unmap_sg:
    493	dma_unmap_sgtable(adev->dev, ttm->sg, direction, 0);
    494release_sg:
    495	pr_err("DMA map userptr failed: %d\n", ret);
    496	sg_free_table(ttm->sg);
    497free_sg:
    498	kfree(ttm->sg);
    499	ttm->sg = NULL;
    500	return ret;
    501}
    502
    503static int
    504kfd_mem_dmamap_dmabuf(struct kfd_mem_attachment *attachment)
    505{
    506	struct ttm_operation_ctx ctx = {.interruptible = true};
    507	struct amdgpu_bo *bo = attachment->bo_va->base.bo;
    508
    509	amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_GTT);
    510	return ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
    511}
    512
    513static int
    514kfd_mem_dmamap_attachment(struct kgd_mem *mem,
    515			  struct kfd_mem_attachment *attachment)
    516{
    517	switch (attachment->type) {
    518	case KFD_MEM_ATT_SHARED:
    519		return 0;
    520	case KFD_MEM_ATT_USERPTR:
    521		return kfd_mem_dmamap_userptr(mem, attachment);
    522	case KFD_MEM_ATT_DMABUF:
    523		return kfd_mem_dmamap_dmabuf(attachment);
    524	default:
    525		WARN_ON_ONCE(1);
    526	}
    527	return -EINVAL;
    528}
    529
    530static void
    531kfd_mem_dmaunmap_userptr(struct kgd_mem *mem,
    532			 struct kfd_mem_attachment *attachment)
    533{
    534	enum dma_data_direction direction =
    535		mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE ?
    536		DMA_BIDIRECTIONAL : DMA_TO_DEVICE;
    537	struct ttm_operation_ctx ctx = {.interruptible = false};
    538	struct amdgpu_bo *bo = attachment->bo_va->base.bo;
    539	struct amdgpu_device *adev = attachment->adev;
    540	struct ttm_tt *ttm = bo->tbo.ttm;
    541
    542	if (unlikely(!ttm->sg))
    543		return;
    544
    545	amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_CPU);
    546	ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
    547
    548	dma_unmap_sgtable(adev->dev, ttm->sg, direction, 0);
    549	sg_free_table(ttm->sg);
    550	kfree(ttm->sg);
    551	ttm->sg = NULL;
    552}
    553
    554static void
    555kfd_mem_dmaunmap_dmabuf(struct kfd_mem_attachment *attachment)
    556{
    557	struct ttm_operation_ctx ctx = {.interruptible = true};
    558	struct amdgpu_bo *bo = attachment->bo_va->base.bo;
    559
    560	amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_CPU);
    561	ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
    562}
    563
    564static void
    565kfd_mem_dmaunmap_attachment(struct kgd_mem *mem,
    566			    struct kfd_mem_attachment *attachment)
    567{
    568	switch (attachment->type) {
    569	case KFD_MEM_ATT_SHARED:
    570		break;
    571	case KFD_MEM_ATT_USERPTR:
    572		kfd_mem_dmaunmap_userptr(mem, attachment);
    573		break;
    574	case KFD_MEM_ATT_DMABUF:
    575		kfd_mem_dmaunmap_dmabuf(attachment);
    576		break;
    577	default:
    578		WARN_ON_ONCE(1);
    579	}
    580}
    581
    582static int
    583kfd_mem_attach_userptr(struct amdgpu_device *adev, struct kgd_mem *mem,
    584		       struct amdgpu_bo **bo)
    585{
    586	unsigned long bo_size = mem->bo->tbo.base.size;
    587	struct drm_gem_object *gobj;
    588	int ret;
    589
    590	ret = amdgpu_bo_reserve(mem->bo, false);
    591	if (ret)
    592		return ret;
    593
    594	ret = amdgpu_gem_object_create(adev, bo_size, 1,
    595				       AMDGPU_GEM_DOMAIN_CPU,
    596				       AMDGPU_GEM_CREATE_PREEMPTIBLE,
    597				       ttm_bo_type_sg, mem->bo->tbo.base.resv,
    598				       &gobj);
    599	amdgpu_bo_unreserve(mem->bo);
    600	if (ret)
    601		return ret;
    602
    603	*bo = gem_to_amdgpu_bo(gobj);
    604	(*bo)->parent = amdgpu_bo_ref(mem->bo);
    605
    606	return 0;
    607}
    608
    609static int
    610kfd_mem_attach_dmabuf(struct amdgpu_device *adev, struct kgd_mem *mem,
    611		      struct amdgpu_bo **bo)
    612{
    613	struct drm_gem_object *gobj;
    614	int ret;
    615
    616	if (!mem->dmabuf) {
    617		mem->dmabuf = amdgpu_gem_prime_export(&mem->bo->tbo.base,
    618			mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE ?
    619				DRM_RDWR : 0);
    620		if (IS_ERR(mem->dmabuf)) {
    621			ret = PTR_ERR(mem->dmabuf);
    622			mem->dmabuf = NULL;
    623			return ret;
    624		}
    625	}
    626
    627	gobj = amdgpu_gem_prime_import(adev_to_drm(adev), mem->dmabuf);
    628	if (IS_ERR(gobj))
    629		return PTR_ERR(gobj);
    630
    631	*bo = gem_to_amdgpu_bo(gobj);
    632	(*bo)->flags |= AMDGPU_GEM_CREATE_PREEMPTIBLE;
    633	(*bo)->parent = amdgpu_bo_ref(mem->bo);
    634
    635	return 0;
    636}
    637
    638/* kfd_mem_attach - Add a BO to a VM
    639 *
    640 * Everything that needs to bo done only once when a BO is first added
    641 * to a VM. It can later be mapped and unmapped many times without
    642 * repeating these steps.
    643 *
    644 * 0. Create BO for DMA mapping, if needed
    645 * 1. Allocate and initialize BO VA entry data structure
    646 * 2. Add BO to the VM
    647 * 3. Determine ASIC-specific PTE flags
    648 * 4. Alloc page tables and directories if needed
    649 * 4a.  Validate new page tables and directories
    650 */
    651static int kfd_mem_attach(struct amdgpu_device *adev, struct kgd_mem *mem,
    652		struct amdgpu_vm *vm, bool is_aql)
    653{
    654	struct amdgpu_device *bo_adev = amdgpu_ttm_adev(mem->bo->tbo.bdev);
    655	unsigned long bo_size = mem->bo->tbo.base.size;
    656	uint64_t va = mem->va;
    657	struct kfd_mem_attachment *attachment[2] = {NULL, NULL};
    658	struct amdgpu_bo *bo[2] = {NULL, NULL};
    659	int i, ret;
    660
    661	if (!va) {
    662		pr_err("Invalid VA when adding BO to VM\n");
    663		return -EINVAL;
    664	}
    665
    666	for (i = 0; i <= is_aql; i++) {
    667		attachment[i] = kzalloc(sizeof(*attachment[i]), GFP_KERNEL);
    668		if (unlikely(!attachment[i])) {
    669			ret = -ENOMEM;
    670			goto unwind;
    671		}
    672
    673		pr_debug("\t add VA 0x%llx - 0x%llx to vm %p\n", va,
    674			 va + bo_size, vm);
    675
    676		if (adev == bo_adev ||
    677		   (amdgpu_ttm_tt_get_usermm(mem->bo->tbo.ttm) && adev->ram_is_direct_mapped) ||
    678		   (mem->domain == AMDGPU_GEM_DOMAIN_VRAM && amdgpu_xgmi_same_hive(adev, bo_adev))) {
    679			/* Mappings on the local GPU, or VRAM mappings in the
    680			 * local hive, or userptr mapping IOMMU direct map mode
    681			 * share the original BO
    682			 */
    683			attachment[i]->type = KFD_MEM_ATT_SHARED;
    684			bo[i] = mem->bo;
    685			drm_gem_object_get(&bo[i]->tbo.base);
    686		} else if (i > 0) {
    687			/* Multiple mappings on the same GPU share the BO */
    688			attachment[i]->type = KFD_MEM_ATT_SHARED;
    689			bo[i] = bo[0];
    690			drm_gem_object_get(&bo[i]->tbo.base);
    691		} else if (amdgpu_ttm_tt_get_usermm(mem->bo->tbo.ttm)) {
    692			/* Create an SG BO to DMA-map userptrs on other GPUs */
    693			attachment[i]->type = KFD_MEM_ATT_USERPTR;
    694			ret = kfd_mem_attach_userptr(adev, mem, &bo[i]);
    695			if (ret)
    696				goto unwind;
    697		} else if (mem->domain == AMDGPU_GEM_DOMAIN_GTT &&
    698			   mem->bo->tbo.type != ttm_bo_type_sg) {
    699			/* GTT BOs use DMA-mapping ability of dynamic-attach
    700			 * DMA bufs. TODO: The same should work for VRAM on
    701			 * large-BAR GPUs.
    702			 */
    703			attachment[i]->type = KFD_MEM_ATT_DMABUF;
    704			ret = kfd_mem_attach_dmabuf(adev, mem, &bo[i]);
    705			if (ret)
    706				goto unwind;
    707		} else {
    708			/* FIXME: Need to DMA-map other BO types:
    709			 * large-BAR VRAM, doorbells, MMIO remap
    710			 */
    711			attachment[i]->type = KFD_MEM_ATT_SHARED;
    712			bo[i] = mem->bo;
    713			drm_gem_object_get(&bo[i]->tbo.base);
    714		}
    715
    716		/* Add BO to VM internal data structures */
    717		ret = amdgpu_bo_reserve(bo[i], false);
    718		if (ret) {
    719			pr_debug("Unable to reserve BO during memory attach");
    720			goto unwind;
    721		}
    722		attachment[i]->bo_va = amdgpu_vm_bo_add(adev, vm, bo[i]);
    723		amdgpu_bo_unreserve(bo[i]);
    724		if (unlikely(!attachment[i]->bo_va)) {
    725			ret = -ENOMEM;
    726			pr_err("Failed to add BO object to VM. ret == %d\n",
    727			       ret);
    728			goto unwind;
    729		}
    730		attachment[i]->va = va;
    731		attachment[i]->pte_flags = get_pte_flags(adev, mem);
    732		attachment[i]->adev = adev;
    733		list_add(&attachment[i]->list, &mem->attachments);
    734
    735		va += bo_size;
    736	}
    737
    738	return 0;
    739
    740unwind:
    741	for (; i >= 0; i--) {
    742		if (!attachment[i])
    743			continue;
    744		if (attachment[i]->bo_va) {
    745			amdgpu_bo_reserve(bo[i], true);
    746			amdgpu_vm_bo_del(adev, attachment[i]->bo_va);
    747			amdgpu_bo_unreserve(bo[i]);
    748			list_del(&attachment[i]->list);
    749		}
    750		if (bo[i])
    751			drm_gem_object_put(&bo[i]->tbo.base);
    752		kfree(attachment[i]);
    753	}
    754	return ret;
    755}
    756
    757static void kfd_mem_detach(struct kfd_mem_attachment *attachment)
    758{
    759	struct amdgpu_bo *bo = attachment->bo_va->base.bo;
    760
    761	pr_debug("\t remove VA 0x%llx in entry %p\n",
    762			attachment->va, attachment);
    763	amdgpu_vm_bo_del(attachment->adev, attachment->bo_va);
    764	drm_gem_object_put(&bo->tbo.base);
    765	list_del(&attachment->list);
    766	kfree(attachment);
    767}
    768
    769static void add_kgd_mem_to_kfd_bo_list(struct kgd_mem *mem,
    770				struct amdkfd_process_info *process_info,
    771				bool userptr)
    772{
    773	struct ttm_validate_buffer *entry = &mem->validate_list;
    774	struct amdgpu_bo *bo = mem->bo;
    775
    776	INIT_LIST_HEAD(&entry->head);
    777	entry->num_shared = 1;
    778	entry->bo = &bo->tbo;
    779	mutex_lock(&process_info->lock);
    780	if (userptr)
    781		list_add_tail(&entry->head, &process_info->userptr_valid_list);
    782	else
    783		list_add_tail(&entry->head, &process_info->kfd_bo_list);
    784	mutex_unlock(&process_info->lock);
    785}
    786
    787static void remove_kgd_mem_from_kfd_bo_list(struct kgd_mem *mem,
    788		struct amdkfd_process_info *process_info)
    789{
    790	struct ttm_validate_buffer *bo_list_entry;
    791
    792	bo_list_entry = &mem->validate_list;
    793	mutex_lock(&process_info->lock);
    794	list_del(&bo_list_entry->head);
    795	mutex_unlock(&process_info->lock);
    796}
    797
    798/* Initializes user pages. It registers the MMU notifier and validates
    799 * the userptr BO in the GTT domain.
    800 *
    801 * The BO must already be on the userptr_valid_list. Otherwise an
    802 * eviction and restore may happen that leaves the new BO unmapped
    803 * with the user mode queues running.
    804 *
    805 * Takes the process_info->lock to protect against concurrent restore
    806 * workers.
    807 *
    808 * Returns 0 for success, negative errno for errors.
    809 */
    810static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr,
    811			   bool criu_resume)
    812{
    813	struct amdkfd_process_info *process_info = mem->process_info;
    814	struct amdgpu_bo *bo = mem->bo;
    815	struct ttm_operation_ctx ctx = { true, false };
    816	int ret = 0;
    817
    818	mutex_lock(&process_info->lock);
    819
    820	ret = amdgpu_ttm_tt_set_userptr(&bo->tbo, user_addr, 0);
    821	if (ret) {
    822		pr_err("%s: Failed to set userptr: %d\n", __func__, ret);
    823		goto out;
    824	}
    825
    826	ret = amdgpu_mn_register(bo, user_addr);
    827	if (ret) {
    828		pr_err("%s: Failed to register MMU notifier: %d\n",
    829		       __func__, ret);
    830		goto out;
    831	}
    832
    833	if (criu_resume) {
    834		/*
    835		 * During a CRIU restore operation, the userptr buffer objects
    836		 * will be validated in the restore_userptr_work worker at a
    837		 * later stage when it is scheduled by another ioctl called by
    838		 * CRIU master process for the target pid for restore.
    839		 */
    840		atomic_inc(&mem->invalid);
    841		mutex_unlock(&process_info->lock);
    842		return 0;
    843	}
    844
    845	ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
    846	if (ret) {
    847		pr_err("%s: Failed to get user pages: %d\n", __func__, ret);
    848		goto unregister_out;
    849	}
    850
    851	ret = amdgpu_bo_reserve(bo, true);
    852	if (ret) {
    853		pr_err("%s: Failed to reserve BO\n", __func__);
    854		goto release_out;
    855	}
    856	amdgpu_bo_placement_from_domain(bo, mem->domain);
    857	ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
    858	if (ret)
    859		pr_err("%s: failed to validate BO\n", __func__);
    860	amdgpu_bo_unreserve(bo);
    861
    862release_out:
    863	amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
    864unregister_out:
    865	if (ret)
    866		amdgpu_mn_unregister(bo);
    867out:
    868	mutex_unlock(&process_info->lock);
    869	return ret;
    870}
    871
    872/* Reserving a BO and its page table BOs must happen atomically to
    873 * avoid deadlocks. Some operations update multiple VMs at once. Track
    874 * all the reservation info in a context structure. Optionally a sync
    875 * object can track VM updates.
    876 */
    877struct bo_vm_reservation_context {
    878	struct amdgpu_bo_list_entry kfd_bo; /* BO list entry for the KFD BO */
    879	unsigned int n_vms;		    /* Number of VMs reserved	    */
    880	struct amdgpu_bo_list_entry *vm_pd; /* Array of VM BO list entries  */
    881	struct ww_acquire_ctx ticket;	    /* Reservation ticket	    */
    882	struct list_head list, duplicates;  /* BO lists			    */
    883	struct amdgpu_sync *sync;	    /* Pointer to sync object	    */
    884	bool reserved;			    /* Whether BOs are reserved	    */
    885};
    886
    887enum bo_vm_match {
    888	BO_VM_NOT_MAPPED = 0,	/* Match VMs where a BO is not mapped */
    889	BO_VM_MAPPED,		/* Match VMs where a BO is mapped     */
    890	BO_VM_ALL,		/* Match all VMs a BO was added to    */
    891};
    892
    893/**
    894 * reserve_bo_and_vm - reserve a BO and a VM unconditionally.
    895 * @mem: KFD BO structure.
    896 * @vm: the VM to reserve.
    897 * @ctx: the struct that will be used in unreserve_bo_and_vms().
    898 */
    899static int reserve_bo_and_vm(struct kgd_mem *mem,
    900			      struct amdgpu_vm *vm,
    901			      struct bo_vm_reservation_context *ctx)
    902{
    903	struct amdgpu_bo *bo = mem->bo;
    904	int ret;
    905
    906	WARN_ON(!vm);
    907
    908	ctx->reserved = false;
    909	ctx->n_vms = 1;
    910	ctx->sync = &mem->sync;
    911
    912	INIT_LIST_HEAD(&ctx->list);
    913	INIT_LIST_HEAD(&ctx->duplicates);
    914
    915	ctx->vm_pd = kcalloc(ctx->n_vms, sizeof(*ctx->vm_pd), GFP_KERNEL);
    916	if (!ctx->vm_pd)
    917		return -ENOMEM;
    918
    919	ctx->kfd_bo.priority = 0;
    920	ctx->kfd_bo.tv.bo = &bo->tbo;
    921	ctx->kfd_bo.tv.num_shared = 1;
    922	list_add(&ctx->kfd_bo.tv.head, &ctx->list);
    923
    924	amdgpu_vm_get_pd_bo(vm, &ctx->list, &ctx->vm_pd[0]);
    925
    926	ret = ttm_eu_reserve_buffers(&ctx->ticket, &ctx->list,
    927				     false, &ctx->duplicates);
    928	if (ret) {
    929		pr_err("Failed to reserve buffers in ttm.\n");
    930		kfree(ctx->vm_pd);
    931		ctx->vm_pd = NULL;
    932		return ret;
    933	}
    934
    935	ctx->reserved = true;
    936	return 0;
    937}
    938
    939/**
    940 * reserve_bo_and_cond_vms - reserve a BO and some VMs conditionally
    941 * @mem: KFD BO structure.
    942 * @vm: the VM to reserve. If NULL, then all VMs associated with the BO
    943 * is used. Otherwise, a single VM associated with the BO.
    944 * @map_type: the mapping status that will be used to filter the VMs.
    945 * @ctx: the struct that will be used in unreserve_bo_and_vms().
    946 *
    947 * Returns 0 for success, negative for failure.
    948 */
    949static int reserve_bo_and_cond_vms(struct kgd_mem *mem,
    950				struct amdgpu_vm *vm, enum bo_vm_match map_type,
    951				struct bo_vm_reservation_context *ctx)
    952{
    953	struct amdgpu_bo *bo = mem->bo;
    954	struct kfd_mem_attachment *entry;
    955	unsigned int i;
    956	int ret;
    957
    958	ctx->reserved = false;
    959	ctx->n_vms = 0;
    960	ctx->vm_pd = NULL;
    961	ctx->sync = &mem->sync;
    962
    963	INIT_LIST_HEAD(&ctx->list);
    964	INIT_LIST_HEAD(&ctx->duplicates);
    965
    966	list_for_each_entry(entry, &mem->attachments, list) {
    967		if ((vm && vm != entry->bo_va->base.vm) ||
    968			(entry->is_mapped != map_type
    969			&& map_type != BO_VM_ALL))
    970			continue;
    971
    972		ctx->n_vms++;
    973	}
    974
    975	if (ctx->n_vms != 0) {
    976		ctx->vm_pd = kcalloc(ctx->n_vms, sizeof(*ctx->vm_pd),
    977				     GFP_KERNEL);
    978		if (!ctx->vm_pd)
    979			return -ENOMEM;
    980	}
    981
    982	ctx->kfd_bo.priority = 0;
    983	ctx->kfd_bo.tv.bo = &bo->tbo;
    984	ctx->kfd_bo.tv.num_shared = 1;
    985	list_add(&ctx->kfd_bo.tv.head, &ctx->list);
    986
    987	i = 0;
    988	list_for_each_entry(entry, &mem->attachments, list) {
    989		if ((vm && vm != entry->bo_va->base.vm) ||
    990			(entry->is_mapped != map_type
    991			&& map_type != BO_VM_ALL))
    992			continue;
    993
    994		amdgpu_vm_get_pd_bo(entry->bo_va->base.vm, &ctx->list,
    995				&ctx->vm_pd[i]);
    996		i++;
    997	}
    998
    999	ret = ttm_eu_reserve_buffers(&ctx->ticket, &ctx->list,
   1000				     false, &ctx->duplicates);
   1001	if (ret) {
   1002		pr_err("Failed to reserve buffers in ttm.\n");
   1003		kfree(ctx->vm_pd);
   1004		ctx->vm_pd = NULL;
   1005		return ret;
   1006	}
   1007
   1008	ctx->reserved = true;
   1009	return 0;
   1010}
   1011
   1012/**
   1013 * unreserve_bo_and_vms - Unreserve BO and VMs from a reservation context
   1014 * @ctx: Reservation context to unreserve
   1015 * @wait: Optionally wait for a sync object representing pending VM updates
   1016 * @intr: Whether the wait is interruptible
   1017 *
   1018 * Also frees any resources allocated in
   1019 * reserve_bo_and_(cond_)vm(s). Returns the status from
   1020 * amdgpu_sync_wait.
   1021 */
   1022static int unreserve_bo_and_vms(struct bo_vm_reservation_context *ctx,
   1023				 bool wait, bool intr)
   1024{
   1025	int ret = 0;
   1026
   1027	if (wait)
   1028		ret = amdgpu_sync_wait(ctx->sync, intr);
   1029
   1030	if (ctx->reserved)
   1031		ttm_eu_backoff_reservation(&ctx->ticket, &ctx->list);
   1032	kfree(ctx->vm_pd);
   1033
   1034	ctx->sync = NULL;
   1035
   1036	ctx->reserved = false;
   1037	ctx->vm_pd = NULL;
   1038
   1039	return ret;
   1040}
   1041
   1042static void unmap_bo_from_gpuvm(struct kgd_mem *mem,
   1043				struct kfd_mem_attachment *entry,
   1044				struct amdgpu_sync *sync)
   1045{
   1046	struct amdgpu_bo_va *bo_va = entry->bo_va;
   1047	struct amdgpu_device *adev = entry->adev;
   1048	struct amdgpu_vm *vm = bo_va->base.vm;
   1049
   1050	amdgpu_vm_bo_unmap(adev, bo_va, entry->va);
   1051
   1052	amdgpu_vm_clear_freed(adev, vm, &bo_va->last_pt_update);
   1053
   1054	amdgpu_sync_fence(sync, bo_va->last_pt_update);
   1055
   1056	kfd_mem_dmaunmap_attachment(mem, entry);
   1057}
   1058
   1059static int update_gpuvm_pte(struct kgd_mem *mem,
   1060			    struct kfd_mem_attachment *entry,
   1061			    struct amdgpu_sync *sync)
   1062{
   1063	struct amdgpu_bo_va *bo_va = entry->bo_va;
   1064	struct amdgpu_device *adev = entry->adev;
   1065	int ret;
   1066
   1067	ret = kfd_mem_dmamap_attachment(mem, entry);
   1068	if (ret)
   1069		return ret;
   1070
   1071	/* Update the page tables  */
   1072	ret = amdgpu_vm_bo_update(adev, bo_va, false);
   1073	if (ret) {
   1074		pr_err("amdgpu_vm_bo_update failed\n");
   1075		return ret;
   1076	}
   1077
   1078	return amdgpu_sync_fence(sync, bo_va->last_pt_update);
   1079}
   1080
   1081static int map_bo_to_gpuvm(struct kgd_mem *mem,
   1082			   struct kfd_mem_attachment *entry,
   1083			   struct amdgpu_sync *sync,
   1084			   bool no_update_pte)
   1085{
   1086	int ret;
   1087
   1088	/* Set virtual address for the allocation */
   1089	ret = amdgpu_vm_bo_map(entry->adev, entry->bo_va, entry->va, 0,
   1090			       amdgpu_bo_size(entry->bo_va->base.bo),
   1091			       entry->pte_flags);
   1092	if (ret) {
   1093		pr_err("Failed to map VA 0x%llx in vm. ret %d\n",
   1094				entry->va, ret);
   1095		return ret;
   1096	}
   1097
   1098	if (no_update_pte)
   1099		return 0;
   1100
   1101	ret = update_gpuvm_pte(mem, entry, sync);
   1102	if (ret) {
   1103		pr_err("update_gpuvm_pte() failed\n");
   1104		goto update_gpuvm_pte_failed;
   1105	}
   1106
   1107	return 0;
   1108
   1109update_gpuvm_pte_failed:
   1110	unmap_bo_from_gpuvm(mem, entry, sync);
   1111	return ret;
   1112}
   1113
   1114static struct sg_table *create_doorbell_sg(uint64_t addr, uint32_t size)
   1115{
   1116	struct sg_table *sg = kmalloc(sizeof(*sg), GFP_KERNEL);
   1117
   1118	if (!sg)
   1119		return NULL;
   1120	if (sg_alloc_table(sg, 1, GFP_KERNEL)) {
   1121		kfree(sg);
   1122		return NULL;
   1123	}
   1124	sg->sgl->dma_address = addr;
   1125	sg->sgl->length = size;
   1126#ifdef CONFIG_NEED_SG_DMA_LENGTH
   1127	sg->sgl->dma_length = size;
   1128#endif
   1129	return sg;
   1130}
   1131
   1132static int process_validate_vms(struct amdkfd_process_info *process_info)
   1133{
   1134	struct amdgpu_vm *peer_vm;
   1135	int ret;
   1136
   1137	list_for_each_entry(peer_vm, &process_info->vm_list_head,
   1138			    vm_list_node) {
   1139		ret = vm_validate_pt_pd_bos(peer_vm);
   1140		if (ret)
   1141			return ret;
   1142	}
   1143
   1144	return 0;
   1145}
   1146
   1147static int process_sync_pds_resv(struct amdkfd_process_info *process_info,
   1148				 struct amdgpu_sync *sync)
   1149{
   1150	struct amdgpu_vm *peer_vm;
   1151	int ret;
   1152
   1153	list_for_each_entry(peer_vm, &process_info->vm_list_head,
   1154			    vm_list_node) {
   1155		struct amdgpu_bo *pd = peer_vm->root.bo;
   1156
   1157		ret = amdgpu_sync_resv(NULL, sync, pd->tbo.base.resv,
   1158				       AMDGPU_SYNC_NE_OWNER,
   1159				       AMDGPU_FENCE_OWNER_KFD);
   1160		if (ret)
   1161			return ret;
   1162	}
   1163
   1164	return 0;
   1165}
   1166
   1167static int process_update_pds(struct amdkfd_process_info *process_info,
   1168			      struct amdgpu_sync *sync)
   1169{
   1170	struct amdgpu_vm *peer_vm;
   1171	int ret;
   1172
   1173	list_for_each_entry(peer_vm, &process_info->vm_list_head,
   1174			    vm_list_node) {
   1175		ret = vm_update_pds(peer_vm, sync);
   1176		if (ret)
   1177			return ret;
   1178	}
   1179
   1180	return 0;
   1181}
   1182
   1183static int init_kfd_vm(struct amdgpu_vm *vm, void **process_info,
   1184		       struct dma_fence **ef)
   1185{
   1186	struct amdkfd_process_info *info = NULL;
   1187	int ret;
   1188
   1189	if (!*process_info) {
   1190		info = kzalloc(sizeof(*info), GFP_KERNEL);
   1191		if (!info)
   1192			return -ENOMEM;
   1193
   1194		mutex_init(&info->lock);
   1195		INIT_LIST_HEAD(&info->vm_list_head);
   1196		INIT_LIST_HEAD(&info->kfd_bo_list);
   1197		INIT_LIST_HEAD(&info->userptr_valid_list);
   1198		INIT_LIST_HEAD(&info->userptr_inval_list);
   1199
   1200		info->eviction_fence =
   1201			amdgpu_amdkfd_fence_create(dma_fence_context_alloc(1),
   1202						   current->mm,
   1203						   NULL);
   1204		if (!info->eviction_fence) {
   1205			pr_err("Failed to create eviction fence\n");
   1206			ret = -ENOMEM;
   1207			goto create_evict_fence_fail;
   1208		}
   1209
   1210		info->pid = get_task_pid(current->group_leader, PIDTYPE_PID);
   1211		atomic_set(&info->evicted_bos, 0);
   1212		INIT_DELAYED_WORK(&info->restore_userptr_work,
   1213				  amdgpu_amdkfd_restore_userptr_worker);
   1214
   1215		*process_info = info;
   1216		*ef = dma_fence_get(&info->eviction_fence->base);
   1217	}
   1218
   1219	vm->process_info = *process_info;
   1220
   1221	/* Validate page directory and attach eviction fence */
   1222	ret = amdgpu_bo_reserve(vm->root.bo, true);
   1223	if (ret)
   1224		goto reserve_pd_fail;
   1225	ret = vm_validate_pt_pd_bos(vm);
   1226	if (ret) {
   1227		pr_err("validate_pt_pd_bos() failed\n");
   1228		goto validate_pd_fail;
   1229	}
   1230	ret = amdgpu_bo_sync_wait(vm->root.bo,
   1231				  AMDGPU_FENCE_OWNER_KFD, false);
   1232	if (ret)
   1233		goto wait_pd_fail;
   1234	ret = dma_resv_reserve_fences(vm->root.bo->tbo.base.resv, 1);
   1235	if (ret)
   1236		goto reserve_shared_fail;
   1237	amdgpu_bo_fence(vm->root.bo,
   1238			&vm->process_info->eviction_fence->base, true);
   1239	amdgpu_bo_unreserve(vm->root.bo);
   1240
   1241	/* Update process info */
   1242	mutex_lock(&vm->process_info->lock);
   1243	list_add_tail(&vm->vm_list_node,
   1244			&(vm->process_info->vm_list_head));
   1245	vm->process_info->n_vms++;
   1246	mutex_unlock(&vm->process_info->lock);
   1247
   1248	return 0;
   1249
   1250reserve_shared_fail:
   1251wait_pd_fail:
   1252validate_pd_fail:
   1253	amdgpu_bo_unreserve(vm->root.bo);
   1254reserve_pd_fail:
   1255	vm->process_info = NULL;
   1256	if (info) {
   1257		/* Two fence references: one in info and one in *ef */
   1258		dma_fence_put(&info->eviction_fence->base);
   1259		dma_fence_put(*ef);
   1260		*ef = NULL;
   1261		*process_info = NULL;
   1262		put_pid(info->pid);
   1263create_evict_fence_fail:
   1264		mutex_destroy(&info->lock);
   1265		kfree(info);
   1266	}
   1267	return ret;
   1268}
   1269
   1270/**
   1271 * amdgpu_amdkfd_gpuvm_pin_bo() - Pins a BO using following criteria
   1272 * @bo: Handle of buffer object being pinned
   1273 * @domain: Domain into which BO should be pinned
   1274 *
   1275 *   - USERPTR BOs are UNPINNABLE and will return error
   1276 *   - All other BO types (GTT, VRAM, MMIO and DOORBELL) will have their
   1277 *     PIN count incremented. It is valid to PIN a BO multiple times
   1278 *
   1279 * Return: ZERO if successful in pinning, Non-Zero in case of error.
   1280 */
   1281static int amdgpu_amdkfd_gpuvm_pin_bo(struct amdgpu_bo *bo, u32 domain)
   1282{
   1283	int ret = 0;
   1284
   1285	ret = amdgpu_bo_reserve(bo, false);
   1286	if (unlikely(ret))
   1287		return ret;
   1288
   1289	ret = amdgpu_bo_pin_restricted(bo, domain, 0, 0);
   1290	if (ret)
   1291		pr_err("Error in Pinning BO to domain: %d\n", domain);
   1292
   1293	amdgpu_bo_sync_wait(bo, AMDGPU_FENCE_OWNER_KFD, false);
   1294	amdgpu_bo_unreserve(bo);
   1295
   1296	return ret;
   1297}
   1298
   1299/**
   1300 * amdgpu_amdkfd_gpuvm_unpin_bo() - Unpins BO using following criteria
   1301 * @bo: Handle of buffer object being unpinned
   1302 *
   1303 *   - Is a illegal request for USERPTR BOs and is ignored
   1304 *   - All other BO types (GTT, VRAM, MMIO and DOORBELL) will have their
   1305 *     PIN count decremented. Calls to UNPIN must balance calls to PIN
   1306 */
   1307static void amdgpu_amdkfd_gpuvm_unpin_bo(struct amdgpu_bo *bo)
   1308{
   1309	int ret = 0;
   1310
   1311	ret = amdgpu_bo_reserve(bo, false);
   1312	if (unlikely(ret))
   1313		return;
   1314
   1315	amdgpu_bo_unpin(bo);
   1316	amdgpu_bo_unreserve(bo);
   1317}
   1318
   1319int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct amdgpu_device *adev,
   1320					   struct file *filp, u32 pasid,
   1321					   void **process_info,
   1322					   struct dma_fence **ef)
   1323{
   1324	struct amdgpu_fpriv *drv_priv;
   1325	struct amdgpu_vm *avm;
   1326	int ret;
   1327
   1328	ret = amdgpu_file_to_fpriv(filp, &drv_priv);
   1329	if (ret)
   1330		return ret;
   1331	avm = &drv_priv->vm;
   1332
   1333	/* Already a compute VM? */
   1334	if (avm->process_info)
   1335		return -EINVAL;
   1336
   1337	/* Free the original amdgpu allocated pasid,
   1338	 * will be replaced with kfd allocated pasid.
   1339	 */
   1340	if (avm->pasid) {
   1341		amdgpu_pasid_free(avm->pasid);
   1342		amdgpu_vm_set_pasid(adev, avm, 0);
   1343	}
   1344
   1345	/* Convert VM into a compute VM */
   1346	ret = amdgpu_vm_make_compute(adev, avm);
   1347	if (ret)
   1348		return ret;
   1349
   1350	ret = amdgpu_vm_set_pasid(adev, avm, pasid);
   1351	if (ret)
   1352		return ret;
   1353	/* Initialize KFD part of the VM and process info */
   1354	ret = init_kfd_vm(avm, process_info, ef);
   1355	if (ret)
   1356		return ret;
   1357
   1358	amdgpu_vm_set_task_info(avm);
   1359
   1360	return 0;
   1361}
   1362
   1363void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev,
   1364				    struct amdgpu_vm *vm)
   1365{
   1366	struct amdkfd_process_info *process_info = vm->process_info;
   1367	struct amdgpu_bo *pd = vm->root.bo;
   1368
   1369	if (!process_info)
   1370		return;
   1371
   1372	/* Release eviction fence from PD */
   1373	amdgpu_bo_reserve(pd, false);
   1374	amdgpu_bo_fence(pd, NULL, false);
   1375	amdgpu_bo_unreserve(pd);
   1376
   1377	/* Update process info */
   1378	mutex_lock(&process_info->lock);
   1379	process_info->n_vms--;
   1380	list_del(&vm->vm_list_node);
   1381	mutex_unlock(&process_info->lock);
   1382
   1383	vm->process_info = NULL;
   1384
   1385	/* Release per-process resources when last compute VM is destroyed */
   1386	if (!process_info->n_vms) {
   1387		WARN_ON(!list_empty(&process_info->kfd_bo_list));
   1388		WARN_ON(!list_empty(&process_info->userptr_valid_list));
   1389		WARN_ON(!list_empty(&process_info->userptr_inval_list));
   1390
   1391		dma_fence_put(&process_info->eviction_fence->base);
   1392		cancel_delayed_work_sync(&process_info->restore_userptr_work);
   1393		put_pid(process_info->pid);
   1394		mutex_destroy(&process_info->lock);
   1395		kfree(process_info);
   1396	}
   1397}
   1398
   1399void amdgpu_amdkfd_gpuvm_release_process_vm(struct amdgpu_device *adev,
   1400					    void *drm_priv)
   1401{
   1402	struct amdgpu_vm *avm;
   1403
   1404	if (WARN_ON(!adev || !drm_priv))
   1405		return;
   1406
   1407	avm = drm_priv_to_vm(drm_priv);
   1408
   1409	pr_debug("Releasing process vm %p\n", avm);
   1410
   1411	/* The original pasid of amdgpu vm has already been
   1412	 * released during making a amdgpu vm to a compute vm
   1413	 * The current pasid is managed by kfd and will be
   1414	 * released on kfd process destroy. Set amdgpu pasid
   1415	 * to 0 to avoid duplicate release.
   1416	 */
   1417	amdgpu_vm_release_compute(adev, avm);
   1418}
   1419
   1420uint64_t amdgpu_amdkfd_gpuvm_get_process_page_dir(void *drm_priv)
   1421{
   1422	struct amdgpu_vm *avm = drm_priv_to_vm(drm_priv);
   1423	struct amdgpu_bo *pd = avm->root.bo;
   1424	struct amdgpu_device *adev = amdgpu_ttm_adev(pd->tbo.bdev);
   1425
   1426	if (adev->asic_type < CHIP_VEGA10)
   1427		return avm->pd_phys_addr >> AMDGPU_GPU_PAGE_SHIFT;
   1428	return avm->pd_phys_addr;
   1429}
   1430
   1431void amdgpu_amdkfd_block_mmu_notifications(void *p)
   1432{
   1433	struct amdkfd_process_info *pinfo = (struct amdkfd_process_info *)p;
   1434
   1435	mutex_lock(&pinfo->lock);
   1436	WRITE_ONCE(pinfo->block_mmu_notifications, true);
   1437	mutex_unlock(&pinfo->lock);
   1438}
   1439
   1440int amdgpu_amdkfd_criu_resume(void *p)
   1441{
   1442	int ret = 0;
   1443	struct amdkfd_process_info *pinfo = (struct amdkfd_process_info *)p;
   1444
   1445	mutex_lock(&pinfo->lock);
   1446	pr_debug("scheduling work\n");
   1447	atomic_inc(&pinfo->evicted_bos);
   1448	if (!READ_ONCE(pinfo->block_mmu_notifications)) {
   1449		ret = -EINVAL;
   1450		goto out_unlock;
   1451	}
   1452	WRITE_ONCE(pinfo->block_mmu_notifications, false);
   1453	schedule_delayed_work(&pinfo->restore_userptr_work, 0);
   1454
   1455out_unlock:
   1456	mutex_unlock(&pinfo->lock);
   1457	return ret;
   1458}
   1459
   1460int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
   1461		struct amdgpu_device *adev, uint64_t va, uint64_t size,
   1462		void *drm_priv, struct kgd_mem **mem,
   1463		uint64_t *offset, uint32_t flags, bool criu_resume)
   1464{
   1465	struct amdgpu_vm *avm = drm_priv_to_vm(drm_priv);
   1466	enum ttm_bo_type bo_type = ttm_bo_type_device;
   1467	struct sg_table *sg = NULL;
   1468	uint64_t user_addr = 0;
   1469	struct amdgpu_bo *bo;
   1470	struct drm_gem_object *gobj = NULL;
   1471	u32 domain, alloc_domain;
   1472	u64 alloc_flags;
   1473	int ret;
   1474
   1475	/*
   1476	 * Check on which domain to allocate BO
   1477	 */
   1478	if (flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
   1479		domain = alloc_domain = AMDGPU_GEM_DOMAIN_VRAM;
   1480		alloc_flags = AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE;
   1481		alloc_flags |= (flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) ?
   1482			AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED : 0;
   1483	} else if (flags & KFD_IOC_ALLOC_MEM_FLAGS_GTT) {
   1484		domain = alloc_domain = AMDGPU_GEM_DOMAIN_GTT;
   1485		alloc_flags = 0;
   1486	} else {
   1487		domain = AMDGPU_GEM_DOMAIN_GTT;
   1488		alloc_domain = AMDGPU_GEM_DOMAIN_CPU;
   1489		alloc_flags = AMDGPU_GEM_CREATE_PREEMPTIBLE;
   1490
   1491		if (flags & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) {
   1492			if (!offset || !*offset)
   1493				return -EINVAL;
   1494			user_addr = untagged_addr(*offset);
   1495		} else if (flags & (KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL |
   1496				    KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP)) {
   1497			bo_type = ttm_bo_type_sg;
   1498			if (size > UINT_MAX)
   1499				return -EINVAL;
   1500			sg = create_doorbell_sg(*offset, size);
   1501			if (!sg)
   1502				return -ENOMEM;
   1503		} else {
   1504			return -EINVAL;
   1505		}
   1506	}
   1507
   1508	*mem = kzalloc(sizeof(struct kgd_mem), GFP_KERNEL);
   1509	if (!*mem) {
   1510		ret = -ENOMEM;
   1511		goto err;
   1512	}
   1513	INIT_LIST_HEAD(&(*mem)->attachments);
   1514	mutex_init(&(*mem)->lock);
   1515	(*mem)->aql_queue = !!(flags & KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM);
   1516
   1517	/* Workaround for AQL queue wraparound bug. Map the same
   1518	 * memory twice. That means we only actually allocate half
   1519	 * the memory.
   1520	 */
   1521	if ((*mem)->aql_queue)
   1522		size = size >> 1;
   1523
   1524	(*mem)->alloc_flags = flags;
   1525
   1526	amdgpu_sync_create(&(*mem)->sync);
   1527
   1528	ret = amdgpu_amdkfd_reserve_mem_limit(adev, size, flags);
   1529	if (ret) {
   1530		pr_debug("Insufficient memory\n");
   1531		goto err_reserve_limit;
   1532	}
   1533
   1534	pr_debug("\tcreate BO VA 0x%llx size 0x%llx domain %s\n",
   1535			va, size, domain_string(alloc_domain));
   1536
   1537	ret = amdgpu_gem_object_create(adev, size, 1, alloc_domain, alloc_flags,
   1538				       bo_type, NULL, &gobj);
   1539	if (ret) {
   1540		pr_debug("Failed to create BO on domain %s. ret %d\n",
   1541			 domain_string(alloc_domain), ret);
   1542		goto err_bo_create;
   1543	}
   1544	ret = drm_vma_node_allow(&gobj->vma_node, drm_priv);
   1545	if (ret) {
   1546		pr_debug("Failed to allow vma node access. ret %d\n", ret);
   1547		goto err_node_allow;
   1548	}
   1549	bo = gem_to_amdgpu_bo(gobj);
   1550	if (bo_type == ttm_bo_type_sg) {
   1551		bo->tbo.sg = sg;
   1552		bo->tbo.ttm->sg = sg;
   1553	}
   1554	bo->kfd_bo = *mem;
   1555	(*mem)->bo = bo;
   1556	if (user_addr)
   1557		bo->flags |= AMDGPU_AMDKFD_CREATE_USERPTR_BO;
   1558
   1559	(*mem)->va = va;
   1560	(*mem)->domain = domain;
   1561	(*mem)->mapped_to_gpu_memory = 0;
   1562	(*mem)->process_info = avm->process_info;
   1563	add_kgd_mem_to_kfd_bo_list(*mem, avm->process_info, user_addr);
   1564
   1565	if (user_addr) {
   1566		pr_debug("creating userptr BO for user_addr = %llu\n", user_addr);
   1567		ret = init_user_pages(*mem, user_addr, criu_resume);
   1568		if (ret)
   1569			goto allocate_init_user_pages_failed;
   1570	} else  if (flags & (KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL |
   1571				KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP)) {
   1572		ret = amdgpu_amdkfd_gpuvm_pin_bo(bo, AMDGPU_GEM_DOMAIN_GTT);
   1573		if (ret) {
   1574			pr_err("Pinning MMIO/DOORBELL BO during ALLOC FAILED\n");
   1575			goto err_pin_bo;
   1576		}
   1577		bo->allowed_domains = AMDGPU_GEM_DOMAIN_GTT;
   1578		bo->preferred_domains = AMDGPU_GEM_DOMAIN_GTT;
   1579	}
   1580
   1581	if (offset)
   1582		*offset = amdgpu_bo_mmap_offset(bo);
   1583
   1584	return 0;
   1585
   1586allocate_init_user_pages_failed:
   1587err_pin_bo:
   1588	remove_kgd_mem_from_kfd_bo_list(*mem, avm->process_info);
   1589	drm_vma_node_revoke(&gobj->vma_node, drm_priv);
   1590err_node_allow:
   1591	/* Don't unreserve system mem limit twice */
   1592	goto err_reserve_limit;
   1593err_bo_create:
   1594	unreserve_mem_limit(adev, size, flags);
   1595err_reserve_limit:
   1596	mutex_destroy(&(*mem)->lock);
   1597	if (gobj)
   1598		drm_gem_object_put(gobj);
   1599	else
   1600		kfree(*mem);
   1601err:
   1602	if (sg) {
   1603		sg_free_table(sg);
   1604		kfree(sg);
   1605	}
   1606	return ret;
   1607}
   1608
   1609int amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
   1610		struct amdgpu_device *adev, struct kgd_mem *mem, void *drm_priv,
   1611		uint64_t *size)
   1612{
   1613	struct amdkfd_process_info *process_info = mem->process_info;
   1614	unsigned long bo_size = mem->bo->tbo.base.size;
   1615	struct kfd_mem_attachment *entry, *tmp;
   1616	struct bo_vm_reservation_context ctx;
   1617	struct ttm_validate_buffer *bo_list_entry;
   1618	unsigned int mapped_to_gpu_memory;
   1619	int ret;
   1620	bool is_imported = false;
   1621
   1622	mutex_lock(&mem->lock);
   1623
   1624	/* Unpin MMIO/DOORBELL BO's that were pinned during allocation */
   1625	if (mem->alloc_flags &
   1626	    (KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL |
   1627	     KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP)) {
   1628		amdgpu_amdkfd_gpuvm_unpin_bo(mem->bo);
   1629	}
   1630
   1631	mapped_to_gpu_memory = mem->mapped_to_gpu_memory;
   1632	is_imported = mem->is_imported;
   1633	mutex_unlock(&mem->lock);
   1634	/* lock is not needed after this, since mem is unused and will
   1635	 * be freed anyway
   1636	 */
   1637
   1638	if (mapped_to_gpu_memory > 0) {
   1639		pr_debug("BO VA 0x%llx size 0x%lx is still mapped.\n",
   1640				mem->va, bo_size);
   1641		return -EBUSY;
   1642	}
   1643
   1644	/* Make sure restore workers don't access the BO any more */
   1645	bo_list_entry = &mem->validate_list;
   1646	mutex_lock(&process_info->lock);
   1647	list_del(&bo_list_entry->head);
   1648	mutex_unlock(&process_info->lock);
   1649
   1650	/* No more MMU notifiers */
   1651	amdgpu_mn_unregister(mem->bo);
   1652
   1653	ret = reserve_bo_and_cond_vms(mem, NULL, BO_VM_ALL, &ctx);
   1654	if (unlikely(ret))
   1655		return ret;
   1656
   1657	/* The eviction fence should be removed by the last unmap.
   1658	 * TODO: Log an error condition if the bo still has the eviction fence
   1659	 * attached
   1660	 */
   1661	amdgpu_amdkfd_remove_eviction_fence(mem->bo,
   1662					process_info->eviction_fence);
   1663	pr_debug("Release VA 0x%llx - 0x%llx\n", mem->va,
   1664		mem->va + bo_size * (1 + mem->aql_queue));
   1665
   1666	/* Remove from VM internal data structures */
   1667	list_for_each_entry_safe(entry, tmp, &mem->attachments, list)
   1668		kfd_mem_detach(entry);
   1669
   1670	ret = unreserve_bo_and_vms(&ctx, false, false);
   1671
   1672	/* Free the sync object */
   1673	amdgpu_sync_free(&mem->sync);
   1674
   1675	/* If the SG is not NULL, it's one we created for a doorbell or mmio
   1676	 * remap BO. We need to free it.
   1677	 */
   1678	if (mem->bo->tbo.sg) {
   1679		sg_free_table(mem->bo->tbo.sg);
   1680		kfree(mem->bo->tbo.sg);
   1681	}
   1682
   1683	/* Update the size of the BO being freed if it was allocated from
   1684	 * VRAM and is not imported.
   1685	 */
   1686	if (size) {
   1687		if ((mem->bo->preferred_domains == AMDGPU_GEM_DOMAIN_VRAM) &&
   1688		    (!is_imported))
   1689			*size = bo_size;
   1690		else
   1691			*size = 0;
   1692	}
   1693
   1694	/* Free the BO*/
   1695	drm_vma_node_revoke(&mem->bo->tbo.base.vma_node, drm_priv);
   1696	if (mem->dmabuf)
   1697		dma_buf_put(mem->dmabuf);
   1698	mutex_destroy(&mem->lock);
   1699
   1700	/* If this releases the last reference, it will end up calling
   1701	 * amdgpu_amdkfd_release_notify and kfree the mem struct. That's why
   1702	 * this needs to be the last call here.
   1703	 */
   1704	drm_gem_object_put(&mem->bo->tbo.base);
   1705
   1706	return ret;
   1707}
   1708
   1709int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
   1710		struct amdgpu_device *adev, struct kgd_mem *mem,
   1711		void *drm_priv)
   1712{
   1713	struct amdgpu_vm *avm = drm_priv_to_vm(drm_priv);
   1714	int ret;
   1715	struct amdgpu_bo *bo;
   1716	uint32_t domain;
   1717	struct kfd_mem_attachment *entry;
   1718	struct bo_vm_reservation_context ctx;
   1719	unsigned long bo_size;
   1720	bool is_invalid_userptr = false;
   1721
   1722	bo = mem->bo;
   1723	if (!bo) {
   1724		pr_err("Invalid BO when mapping memory to GPU\n");
   1725		return -EINVAL;
   1726	}
   1727
   1728	/* Make sure restore is not running concurrently. Since we
   1729	 * don't map invalid userptr BOs, we rely on the next restore
   1730	 * worker to do the mapping
   1731	 */
   1732	mutex_lock(&mem->process_info->lock);
   1733
   1734	/* Lock mmap-sem. If we find an invalid userptr BO, we can be
   1735	 * sure that the MMU notifier is no longer running
   1736	 * concurrently and the queues are actually stopped
   1737	 */
   1738	if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm)) {
   1739		mmap_write_lock(current->mm);
   1740		is_invalid_userptr = atomic_read(&mem->invalid);
   1741		mmap_write_unlock(current->mm);
   1742	}
   1743
   1744	mutex_lock(&mem->lock);
   1745
   1746	domain = mem->domain;
   1747	bo_size = bo->tbo.base.size;
   1748
   1749	pr_debug("Map VA 0x%llx - 0x%llx to vm %p domain %s\n",
   1750			mem->va,
   1751			mem->va + bo_size * (1 + mem->aql_queue),
   1752			avm, domain_string(domain));
   1753
   1754	if (!kfd_mem_is_attached(avm, mem)) {
   1755		ret = kfd_mem_attach(adev, mem, avm, mem->aql_queue);
   1756		if (ret)
   1757			goto out;
   1758	}
   1759
   1760	ret = reserve_bo_and_vm(mem, avm, &ctx);
   1761	if (unlikely(ret))
   1762		goto out;
   1763
   1764	/* Userptr can be marked as "not invalid", but not actually be
   1765	 * validated yet (still in the system domain). In that case
   1766	 * the queues are still stopped and we can leave mapping for
   1767	 * the next restore worker
   1768	 */
   1769	if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm) &&
   1770	    bo->tbo.resource->mem_type == TTM_PL_SYSTEM)
   1771		is_invalid_userptr = true;
   1772
   1773	ret = vm_validate_pt_pd_bos(avm);
   1774	if (unlikely(ret))
   1775		goto out_unreserve;
   1776
   1777	if (mem->mapped_to_gpu_memory == 0 &&
   1778	    !amdgpu_ttm_tt_get_usermm(bo->tbo.ttm)) {
   1779		/* Validate BO only once. The eviction fence gets added to BO
   1780		 * the first time it is mapped. Validate will wait for all
   1781		 * background evictions to complete.
   1782		 */
   1783		ret = amdgpu_amdkfd_bo_validate(bo, domain, true);
   1784		if (ret) {
   1785			pr_debug("Validate failed\n");
   1786			goto out_unreserve;
   1787		}
   1788	}
   1789
   1790	list_for_each_entry(entry, &mem->attachments, list) {
   1791		if (entry->bo_va->base.vm != avm || entry->is_mapped)
   1792			continue;
   1793
   1794		pr_debug("\t map VA 0x%llx - 0x%llx in entry %p\n",
   1795			 entry->va, entry->va + bo_size, entry);
   1796
   1797		ret = map_bo_to_gpuvm(mem, entry, ctx.sync,
   1798				      is_invalid_userptr);
   1799		if (ret) {
   1800			pr_err("Failed to map bo to gpuvm\n");
   1801			goto out_unreserve;
   1802		}
   1803
   1804		ret = vm_update_pds(avm, ctx.sync);
   1805		if (ret) {
   1806			pr_err("Failed to update page directories\n");
   1807			goto out_unreserve;
   1808		}
   1809
   1810		entry->is_mapped = true;
   1811		mem->mapped_to_gpu_memory++;
   1812		pr_debug("\t INC mapping count %d\n",
   1813			 mem->mapped_to_gpu_memory);
   1814	}
   1815
   1816	if (!amdgpu_ttm_tt_get_usermm(bo->tbo.ttm) && !bo->tbo.pin_count)
   1817		amdgpu_bo_fence(bo,
   1818				&avm->process_info->eviction_fence->base,
   1819				true);
   1820	ret = unreserve_bo_and_vms(&ctx, false, false);
   1821
   1822	goto out;
   1823
   1824out_unreserve:
   1825	unreserve_bo_and_vms(&ctx, false, false);
   1826out:
   1827	mutex_unlock(&mem->process_info->lock);
   1828	mutex_unlock(&mem->lock);
   1829	return ret;
   1830}
   1831
   1832int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
   1833		struct amdgpu_device *adev, struct kgd_mem *mem, void *drm_priv)
   1834{
   1835	struct amdgpu_vm *avm = drm_priv_to_vm(drm_priv);
   1836	struct amdkfd_process_info *process_info = avm->process_info;
   1837	unsigned long bo_size = mem->bo->tbo.base.size;
   1838	struct kfd_mem_attachment *entry;
   1839	struct bo_vm_reservation_context ctx;
   1840	int ret;
   1841
   1842	mutex_lock(&mem->lock);
   1843
   1844	ret = reserve_bo_and_cond_vms(mem, avm, BO_VM_MAPPED, &ctx);
   1845	if (unlikely(ret))
   1846		goto out;
   1847	/* If no VMs were reserved, it means the BO wasn't actually mapped */
   1848	if (ctx.n_vms == 0) {
   1849		ret = -EINVAL;
   1850		goto unreserve_out;
   1851	}
   1852
   1853	ret = vm_validate_pt_pd_bos(avm);
   1854	if (unlikely(ret))
   1855		goto unreserve_out;
   1856
   1857	pr_debug("Unmap VA 0x%llx - 0x%llx from vm %p\n",
   1858		mem->va,
   1859		mem->va + bo_size * (1 + mem->aql_queue),
   1860		avm);
   1861
   1862	list_for_each_entry(entry, &mem->attachments, list) {
   1863		if (entry->bo_va->base.vm != avm || !entry->is_mapped)
   1864			continue;
   1865
   1866		pr_debug("\t unmap VA 0x%llx - 0x%llx from entry %p\n",
   1867			 entry->va, entry->va + bo_size, entry);
   1868
   1869		unmap_bo_from_gpuvm(mem, entry, ctx.sync);
   1870		entry->is_mapped = false;
   1871
   1872		mem->mapped_to_gpu_memory--;
   1873		pr_debug("\t DEC mapping count %d\n",
   1874			 mem->mapped_to_gpu_memory);
   1875	}
   1876
   1877	/* If BO is unmapped from all VMs, unfence it. It can be evicted if
   1878	 * required.
   1879	 */
   1880	if (mem->mapped_to_gpu_memory == 0 &&
   1881	    !amdgpu_ttm_tt_get_usermm(mem->bo->tbo.ttm) &&
   1882	    !mem->bo->tbo.pin_count)
   1883		amdgpu_amdkfd_remove_eviction_fence(mem->bo,
   1884						process_info->eviction_fence);
   1885
   1886unreserve_out:
   1887	unreserve_bo_and_vms(&ctx, false, false);
   1888out:
   1889	mutex_unlock(&mem->lock);
   1890	return ret;
   1891}
   1892
   1893int amdgpu_amdkfd_gpuvm_sync_memory(
   1894		struct amdgpu_device *adev, struct kgd_mem *mem, bool intr)
   1895{
   1896	struct amdgpu_sync sync;
   1897	int ret;
   1898
   1899	amdgpu_sync_create(&sync);
   1900
   1901	mutex_lock(&mem->lock);
   1902	amdgpu_sync_clone(&mem->sync, &sync);
   1903	mutex_unlock(&mem->lock);
   1904
   1905	ret = amdgpu_sync_wait(&sync, intr);
   1906	amdgpu_sync_free(&sync);
   1907	return ret;
   1908}
   1909
   1910int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct amdgpu_device *adev,
   1911		struct kgd_mem *mem, void **kptr, uint64_t *size)
   1912{
   1913	int ret;
   1914	struct amdgpu_bo *bo = mem->bo;
   1915
   1916	if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm)) {
   1917		pr_err("userptr can't be mapped to kernel\n");
   1918		return -EINVAL;
   1919	}
   1920
   1921	mutex_lock(&mem->process_info->lock);
   1922
   1923	ret = amdgpu_bo_reserve(bo, true);
   1924	if (ret) {
   1925		pr_err("Failed to reserve bo. ret %d\n", ret);
   1926		goto bo_reserve_failed;
   1927	}
   1928
   1929	ret = amdgpu_bo_pin(bo, AMDGPU_GEM_DOMAIN_GTT);
   1930	if (ret) {
   1931		pr_err("Failed to pin bo. ret %d\n", ret);
   1932		goto pin_failed;
   1933	}
   1934
   1935	ret = amdgpu_bo_kmap(bo, kptr);
   1936	if (ret) {
   1937		pr_err("Failed to map bo to kernel. ret %d\n", ret);
   1938		goto kmap_failed;
   1939	}
   1940
   1941	amdgpu_amdkfd_remove_eviction_fence(
   1942		bo, mem->process_info->eviction_fence);
   1943
   1944	if (size)
   1945		*size = amdgpu_bo_size(bo);
   1946
   1947	amdgpu_bo_unreserve(bo);
   1948
   1949	mutex_unlock(&mem->process_info->lock);
   1950	return 0;
   1951
   1952kmap_failed:
   1953	amdgpu_bo_unpin(bo);
   1954pin_failed:
   1955	amdgpu_bo_unreserve(bo);
   1956bo_reserve_failed:
   1957	mutex_unlock(&mem->process_info->lock);
   1958
   1959	return ret;
   1960}
   1961
   1962void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct amdgpu_device *adev,
   1963						  struct kgd_mem *mem)
   1964{
   1965	struct amdgpu_bo *bo = mem->bo;
   1966
   1967	amdgpu_bo_reserve(bo, true);
   1968	amdgpu_bo_kunmap(bo);
   1969	amdgpu_bo_unpin(bo);
   1970	amdgpu_bo_unreserve(bo);
   1971}
   1972
   1973int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct amdgpu_device *adev,
   1974					  struct kfd_vm_fault_info *mem)
   1975{
   1976	if (atomic_read(&adev->gmc.vm_fault_info_updated) == 1) {
   1977		*mem = *adev->gmc.vm_fault_info;
   1978		mb();
   1979		atomic_set(&adev->gmc.vm_fault_info_updated, 0);
   1980	}
   1981	return 0;
   1982}
   1983
   1984int amdgpu_amdkfd_gpuvm_import_dmabuf(struct amdgpu_device *adev,
   1985				      struct dma_buf *dma_buf,
   1986				      uint64_t va, void *drm_priv,
   1987				      struct kgd_mem **mem, uint64_t *size,
   1988				      uint64_t *mmap_offset)
   1989{
   1990	struct amdgpu_vm *avm = drm_priv_to_vm(drm_priv);
   1991	struct drm_gem_object *obj;
   1992	struct amdgpu_bo *bo;
   1993	int ret;
   1994
   1995	if (dma_buf->ops != &amdgpu_dmabuf_ops)
   1996		/* Can't handle non-graphics buffers */
   1997		return -EINVAL;
   1998
   1999	obj = dma_buf->priv;
   2000	if (drm_to_adev(obj->dev) != adev)
   2001		/* Can't handle buffers from other devices */
   2002		return -EINVAL;
   2003
   2004	bo = gem_to_amdgpu_bo(obj);
   2005	if (!(bo->preferred_domains & (AMDGPU_GEM_DOMAIN_VRAM |
   2006				    AMDGPU_GEM_DOMAIN_GTT)))
   2007		/* Only VRAM and GTT BOs are supported */
   2008		return -EINVAL;
   2009
   2010	*mem = kzalloc(sizeof(struct kgd_mem), GFP_KERNEL);
   2011	if (!*mem)
   2012		return -ENOMEM;
   2013
   2014	ret = drm_vma_node_allow(&obj->vma_node, drm_priv);
   2015	if (ret) {
   2016		kfree(mem);
   2017		return ret;
   2018	}
   2019
   2020	if (size)
   2021		*size = amdgpu_bo_size(bo);
   2022
   2023	if (mmap_offset)
   2024		*mmap_offset = amdgpu_bo_mmap_offset(bo);
   2025
   2026	INIT_LIST_HEAD(&(*mem)->attachments);
   2027	mutex_init(&(*mem)->lock);
   2028
   2029	(*mem)->alloc_flags =
   2030		((bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM) ?
   2031		KFD_IOC_ALLOC_MEM_FLAGS_VRAM : KFD_IOC_ALLOC_MEM_FLAGS_GTT)
   2032		| KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE
   2033		| KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE;
   2034
   2035	drm_gem_object_get(&bo->tbo.base);
   2036	(*mem)->bo = bo;
   2037	(*mem)->va = va;
   2038	(*mem)->domain = (bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM) ?
   2039		AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT;
   2040	(*mem)->mapped_to_gpu_memory = 0;
   2041	(*mem)->process_info = avm->process_info;
   2042	add_kgd_mem_to_kfd_bo_list(*mem, avm->process_info, false);
   2043	amdgpu_sync_create(&(*mem)->sync);
   2044	(*mem)->is_imported = true;
   2045
   2046	return 0;
   2047}
   2048
   2049/* Evict a userptr BO by stopping the queues if necessary
   2050 *
   2051 * Runs in MMU notifier, may be in RECLAIM_FS context. This means it
   2052 * cannot do any memory allocations, and cannot take any locks that
   2053 * are held elsewhere while allocating memory. Therefore this is as
   2054 * simple as possible, using atomic counters.
   2055 *
   2056 * It doesn't do anything to the BO itself. The real work happens in
   2057 * restore, where we get updated page addresses. This function only
   2058 * ensures that GPU access to the BO is stopped.
   2059 */
   2060int amdgpu_amdkfd_evict_userptr(struct kgd_mem *mem,
   2061				struct mm_struct *mm)
   2062{
   2063	struct amdkfd_process_info *process_info = mem->process_info;
   2064	int evicted_bos;
   2065	int r = 0;
   2066
   2067	/* Do not process MMU notifications until stage-4 IOCTL is received */
   2068	if (READ_ONCE(process_info->block_mmu_notifications))
   2069		return 0;
   2070
   2071	atomic_inc(&mem->invalid);
   2072	evicted_bos = atomic_inc_return(&process_info->evicted_bos);
   2073	if (evicted_bos == 1) {
   2074		/* First eviction, stop the queues */
   2075		r = kgd2kfd_quiesce_mm(mm);
   2076		if (r)
   2077			pr_err("Failed to quiesce KFD\n");
   2078		schedule_delayed_work(&process_info->restore_userptr_work,
   2079			msecs_to_jiffies(AMDGPU_USERPTR_RESTORE_DELAY_MS));
   2080	}
   2081
   2082	return r;
   2083}
   2084
   2085/* Update invalid userptr BOs
   2086 *
   2087 * Moves invalidated (evicted) userptr BOs from userptr_valid_list to
   2088 * userptr_inval_list and updates user pages for all BOs that have
   2089 * been invalidated since their last update.
   2090 */
   2091static int update_invalid_user_pages(struct amdkfd_process_info *process_info,
   2092				     struct mm_struct *mm)
   2093{
   2094	struct kgd_mem *mem, *tmp_mem;
   2095	struct amdgpu_bo *bo;
   2096	struct ttm_operation_ctx ctx = { false, false };
   2097	int invalid, ret;
   2098
   2099	/* Move all invalidated BOs to the userptr_inval_list and
   2100	 * release their user pages by migration to the CPU domain
   2101	 */
   2102	list_for_each_entry_safe(mem, tmp_mem,
   2103				 &process_info->userptr_valid_list,
   2104				 validate_list.head) {
   2105		if (!atomic_read(&mem->invalid))
   2106			continue; /* BO is still valid */
   2107
   2108		bo = mem->bo;
   2109
   2110		if (amdgpu_bo_reserve(bo, true))
   2111			return -EAGAIN;
   2112		amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_CPU);
   2113		ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
   2114		amdgpu_bo_unreserve(bo);
   2115		if (ret) {
   2116			pr_err("%s: Failed to invalidate userptr BO\n",
   2117			       __func__);
   2118			return -EAGAIN;
   2119		}
   2120
   2121		list_move_tail(&mem->validate_list.head,
   2122			       &process_info->userptr_inval_list);
   2123	}
   2124
   2125	if (list_empty(&process_info->userptr_inval_list))
   2126		return 0; /* All evicted userptr BOs were freed */
   2127
   2128	/* Go through userptr_inval_list and update any invalid user_pages */
   2129	list_for_each_entry(mem, &process_info->userptr_inval_list,
   2130			    validate_list.head) {
   2131		invalid = atomic_read(&mem->invalid);
   2132		if (!invalid)
   2133			/* BO hasn't been invalidated since the last
   2134			 * revalidation attempt. Keep its BO list.
   2135			 */
   2136			continue;
   2137
   2138		bo = mem->bo;
   2139
   2140		/* Get updated user pages */
   2141		ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
   2142		if (ret) {
   2143			pr_debug("Failed %d to get user pages\n", ret);
   2144
   2145			/* Return -EFAULT bad address error as success. It will
   2146			 * fail later with a VM fault if the GPU tries to access
   2147			 * it. Better than hanging indefinitely with stalled
   2148			 * user mode queues.
   2149			 *
   2150			 * Return other error -EBUSY or -ENOMEM to retry restore
   2151			 */
   2152			if (ret != -EFAULT)
   2153				return ret;
   2154		} else {
   2155
   2156			/*
   2157			 * FIXME: Cannot ignore the return code, must hold
   2158			 * notifier_lock
   2159			 */
   2160			amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
   2161		}
   2162
   2163		/* Mark the BO as valid unless it was invalidated
   2164		 * again concurrently.
   2165		 */
   2166		if (atomic_cmpxchg(&mem->invalid, invalid, 0) != invalid)
   2167			return -EAGAIN;
   2168	}
   2169
   2170	return 0;
   2171}
   2172
   2173/* Validate invalid userptr BOs
   2174 *
   2175 * Validates BOs on the userptr_inval_list, and moves them back to the
   2176 * userptr_valid_list. Also updates GPUVM page tables with new page
   2177 * addresses and waits for the page table updates to complete.
   2178 */
   2179static int validate_invalid_user_pages(struct amdkfd_process_info *process_info)
   2180{
   2181	struct amdgpu_bo_list_entry *pd_bo_list_entries;
   2182	struct list_head resv_list, duplicates;
   2183	struct ww_acquire_ctx ticket;
   2184	struct amdgpu_sync sync;
   2185
   2186	struct amdgpu_vm *peer_vm;
   2187	struct kgd_mem *mem, *tmp_mem;
   2188	struct amdgpu_bo *bo;
   2189	struct ttm_operation_ctx ctx = { false, false };
   2190	int i, ret;
   2191
   2192	pd_bo_list_entries = kcalloc(process_info->n_vms,
   2193				     sizeof(struct amdgpu_bo_list_entry),
   2194				     GFP_KERNEL);
   2195	if (!pd_bo_list_entries) {
   2196		pr_err("%s: Failed to allocate PD BO list entries\n", __func__);
   2197		ret = -ENOMEM;
   2198		goto out_no_mem;
   2199	}
   2200
   2201	INIT_LIST_HEAD(&resv_list);
   2202	INIT_LIST_HEAD(&duplicates);
   2203
   2204	/* Get all the page directory BOs that need to be reserved */
   2205	i = 0;
   2206	list_for_each_entry(peer_vm, &process_info->vm_list_head,
   2207			    vm_list_node)
   2208		amdgpu_vm_get_pd_bo(peer_vm, &resv_list,
   2209				    &pd_bo_list_entries[i++]);
   2210	/* Add the userptr_inval_list entries to resv_list */
   2211	list_for_each_entry(mem, &process_info->userptr_inval_list,
   2212			    validate_list.head) {
   2213		list_add_tail(&mem->resv_list.head, &resv_list);
   2214		mem->resv_list.bo = mem->validate_list.bo;
   2215		mem->resv_list.num_shared = mem->validate_list.num_shared;
   2216	}
   2217
   2218	/* Reserve all BOs and page tables for validation */
   2219	ret = ttm_eu_reserve_buffers(&ticket, &resv_list, false, &duplicates);
   2220	WARN(!list_empty(&duplicates), "Duplicates should be empty");
   2221	if (ret)
   2222		goto out_free;
   2223
   2224	amdgpu_sync_create(&sync);
   2225
   2226	ret = process_validate_vms(process_info);
   2227	if (ret)
   2228		goto unreserve_out;
   2229
   2230	/* Validate BOs and update GPUVM page tables */
   2231	list_for_each_entry_safe(mem, tmp_mem,
   2232				 &process_info->userptr_inval_list,
   2233				 validate_list.head) {
   2234		struct kfd_mem_attachment *attachment;
   2235
   2236		bo = mem->bo;
   2237
   2238		/* Validate the BO if we got user pages */
   2239		if (bo->tbo.ttm->pages[0]) {
   2240			amdgpu_bo_placement_from_domain(bo, mem->domain);
   2241			ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
   2242			if (ret) {
   2243				pr_err("%s: failed to validate BO\n", __func__);
   2244				goto unreserve_out;
   2245			}
   2246		}
   2247
   2248		list_move_tail(&mem->validate_list.head,
   2249			       &process_info->userptr_valid_list);
   2250
   2251		/* Update mapping. If the BO was not validated
   2252		 * (because we couldn't get user pages), this will
   2253		 * clear the page table entries, which will result in
   2254		 * VM faults if the GPU tries to access the invalid
   2255		 * memory.
   2256		 */
   2257		list_for_each_entry(attachment, &mem->attachments, list) {
   2258			if (!attachment->is_mapped)
   2259				continue;
   2260
   2261			kfd_mem_dmaunmap_attachment(mem, attachment);
   2262			ret = update_gpuvm_pte(mem, attachment, &sync);
   2263			if (ret) {
   2264				pr_err("%s: update PTE failed\n", __func__);
   2265				/* make sure this gets validated again */
   2266				atomic_inc(&mem->invalid);
   2267				goto unreserve_out;
   2268			}
   2269		}
   2270	}
   2271
   2272	/* Update page directories */
   2273	ret = process_update_pds(process_info, &sync);
   2274
   2275unreserve_out:
   2276	ttm_eu_backoff_reservation(&ticket, &resv_list);
   2277	amdgpu_sync_wait(&sync, false);
   2278	amdgpu_sync_free(&sync);
   2279out_free:
   2280	kfree(pd_bo_list_entries);
   2281out_no_mem:
   2282
   2283	return ret;
   2284}
   2285
   2286/* Worker callback to restore evicted userptr BOs
   2287 *
   2288 * Tries to update and validate all userptr BOs. If successful and no
   2289 * concurrent evictions happened, the queues are restarted. Otherwise,
   2290 * reschedule for another attempt later.
   2291 */
   2292static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work)
   2293{
   2294	struct delayed_work *dwork = to_delayed_work(work);
   2295	struct amdkfd_process_info *process_info =
   2296		container_of(dwork, struct amdkfd_process_info,
   2297			     restore_userptr_work);
   2298	struct task_struct *usertask;
   2299	struct mm_struct *mm;
   2300	int evicted_bos;
   2301
   2302	evicted_bos = atomic_read(&process_info->evicted_bos);
   2303	if (!evicted_bos)
   2304		return;
   2305
   2306	/* Reference task and mm in case of concurrent process termination */
   2307	usertask = get_pid_task(process_info->pid, PIDTYPE_PID);
   2308	if (!usertask)
   2309		return;
   2310	mm = get_task_mm(usertask);
   2311	if (!mm) {
   2312		put_task_struct(usertask);
   2313		return;
   2314	}
   2315
   2316	mutex_lock(&process_info->lock);
   2317
   2318	if (update_invalid_user_pages(process_info, mm))
   2319		goto unlock_out;
   2320	/* userptr_inval_list can be empty if all evicted userptr BOs
   2321	 * have been freed. In that case there is nothing to validate
   2322	 * and we can just restart the queues.
   2323	 */
   2324	if (!list_empty(&process_info->userptr_inval_list)) {
   2325		if (atomic_read(&process_info->evicted_bos) != evicted_bos)
   2326			goto unlock_out; /* Concurrent eviction, try again */
   2327
   2328		if (validate_invalid_user_pages(process_info))
   2329			goto unlock_out;
   2330	}
   2331	/* Final check for concurrent evicton and atomic update. If
   2332	 * another eviction happens after successful update, it will
   2333	 * be a first eviction that calls quiesce_mm. The eviction
   2334	 * reference counting inside KFD will handle this case.
   2335	 */
   2336	if (atomic_cmpxchg(&process_info->evicted_bos, evicted_bos, 0) !=
   2337	    evicted_bos)
   2338		goto unlock_out;
   2339	evicted_bos = 0;
   2340	if (kgd2kfd_resume_mm(mm)) {
   2341		pr_err("%s: Failed to resume KFD\n", __func__);
   2342		/* No recovery from this failure. Probably the CP is
   2343		 * hanging. No point trying again.
   2344		 */
   2345	}
   2346
   2347unlock_out:
   2348	mutex_unlock(&process_info->lock);
   2349	mmput(mm);
   2350	put_task_struct(usertask);
   2351
   2352	/* If validation failed, reschedule another attempt */
   2353	if (evicted_bos)
   2354		schedule_delayed_work(&process_info->restore_userptr_work,
   2355			msecs_to_jiffies(AMDGPU_USERPTR_RESTORE_DELAY_MS));
   2356}
   2357
/** amdgpu_amdkfd_gpuvm_restore_process_bos - Restore all BOs for the given
 *   KFD process identified by info
 *
 * @info: amdkfd_process_info of the KFD process
 * @ef: on success, receives a new reference to the newly created eviction
 *      fence; it is not written on failure
 *
 * After memory eviction, restore thread calls this function. The function
 * should be called when the Process is still valid. BO restore involves,
 * in the order performed below -
 *
 * 1.  Get the PD BO list entries from all the VMs and add them to ctx.list.
 * 2.  Add all BOs on kfd_bo_list to ctx.list.
 * 3.  Reserve all the BOs on ctx.list.
 * 4.  Validate PD and PT BOs and sync to the PD BO reservations.
 * 5.  Validate all KFD BOs on kfd_bo_list (falling back to GTT if the
 *     preferred domain fails) and update the PTEs of mapped attachments.
 * 6.  Update the page directories and wait for all updates to finish.
 * 7.  Release old eviction fence and create new one.
 * 8.  Add the new fence to all non-pinned KFD BOs and to the root PD BOs.
 * 9.  Unreserve all BOs.
 *
 * Return: 0 on success, -ENOMEM if the PD list or the new eviction fence
 * cannot be allocated, or the error of the failing step.
 */
int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence **ef)
{
	struct amdgpu_bo_list_entry *pd_bo_list;
	struct amdkfd_process_info *process_info = info;
	struct amdgpu_vm *peer_vm;
	struct kgd_mem *mem;
	struct bo_vm_reservation_context ctx;
	struct amdgpu_amdkfd_fence *new_fence;
	int ret = 0, i;
	struct list_head duplicate_save;
	struct amdgpu_sync sync_obj;
	/* Byte counters used only for the debug message further down */
	unsigned long failed_size = 0;
	unsigned long total_size = 0;

	INIT_LIST_HEAD(&duplicate_save);
	INIT_LIST_HEAD(&ctx.list);
	INIT_LIST_HEAD(&ctx.duplicates);

	/* One list entry per VM of the process */
	pd_bo_list = kcalloc(process_info->n_vms,
			     sizeof(struct amdgpu_bo_list_entry),
			     GFP_KERNEL);
	if (!pd_bo_list)
		return -ENOMEM;

	i = 0;
	/* process_info->lock is held until the function returns */
	mutex_lock(&process_info->lock);
	list_for_each_entry(peer_vm, &process_info->vm_list_head,
			vm_list_node)
		amdgpu_vm_get_pd_bo(peer_vm, &ctx.list, &pd_bo_list[i++]);

	/* Reserve all BOs and page tables/directory. Add all BOs from
	 * kfd_bo_list to ctx.list
	 */
	list_for_each_entry(mem, &process_info->kfd_bo_list,
			    validate_list.head) {

		list_add_tail(&mem->resv_list.head, &ctx.list);
		mem->resv_list.bo = mem->validate_list.bo;
		mem->resv_list.num_shared = mem->validate_list.num_shared;
	}

	/* Duplicates are collected in duplicate_save, not ctx.duplicates */
	ret = ttm_eu_reserve_buffers(&ctx.ticket, &ctx.list,
				     false, &duplicate_save);
	if (ret) {
		pr_debug("Memory eviction: TTM Reserve Failed. Try again\n");
		goto ttm_reserve_fail;
	}

	/* From here on, every exit path must free sync_obj */
	amdgpu_sync_create(&sync_obj);

	/* Validate PDs and PTs */
	ret = process_validate_vms(process_info);
	if (ret)
		goto validate_map_fail;

	ret = process_sync_pds_resv(process_info, &sync_obj);
	if (ret) {
		pr_debug("Memory eviction: Failed to sync to PD BO moving fence. Try again\n");
		goto validate_map_fail;
	}

	/* Validate BOs and map them to GPUVM (update VM page tables). */
	list_for_each_entry(mem, &process_info->kfd_bo_list,
			    validate_list.head) {

		struct amdgpu_bo *bo = mem->bo;
		uint32_t domain = mem->domain;
		struct kfd_mem_attachment *attachment;
		struct dma_resv_iter cursor;
		struct dma_fence *fence;

		total_size += amdgpu_bo_size(bo);

		/* Try the BO's own domain first and fall back to GTT; only
		 * a failure of the fallback aborts the restore.
		 */
		ret = amdgpu_amdkfd_bo_validate(bo, domain, false);
		if (ret) {
			pr_debug("Memory eviction: Validate BOs failed\n");
			failed_size += amdgpu_bo_size(bo);
			ret = amdgpu_amdkfd_bo_validate(bo,
						AMDGPU_GEM_DOMAIN_GTT, false);
			if (ret) {
				pr_debug("Memory eviction: Try again\n");
				goto validate_map_fail;
			}
		}
		/* Collect the BO's DMA_RESV_USAGE_KERNEL fences in sync_obj */
		dma_resv_for_each_fence(&cursor, bo->tbo.base.resv,
					DMA_RESV_USAGE_KERNEL, fence) {
			ret = amdgpu_sync_fence(&sync_obj, fence);
			if (ret) {
				pr_debug("Memory eviction: Sync BO fence failed. Try again\n");
				goto validate_map_fail;
			}
		}
		/* Refresh the PTEs of every attachment that is mapped */
		list_for_each_entry(attachment, &mem->attachments, list) {
			if (!attachment->is_mapped)
				continue;

			kfd_mem_dmaunmap_attachment(mem, attachment);
			ret = update_gpuvm_pte(mem, attachment, &sync_obj);
			if (ret) {
				pr_debug("Memory eviction: update PTE failed. Try again\n");
				goto validate_map_fail;
			}
		}
	}

	/* failed_size counts the BOs that needed the GTT fallback */
	if (failed_size)
		pr_debug("0x%lx/0x%lx in system\n", failed_size, total_size);

	/* Update page directories */
	ret = process_update_pds(process_info, &sync_obj);
	if (ret) {
		pr_debug("Memory eviction: update PDs failed. Try again\n");
		goto validate_map_fail;
	}

	/* Wait for validate and PT updates to finish. The return value of
	 * the wait is not checked.
	 */
	amdgpu_sync_wait(&sync_obj, false);

	/* Release old eviction fence and create new one, because fence only
	 * goes from unsignaled to signaled, fence cannot be reused.
	 * Use context and mm from the old fence.
	 */
	new_fence = amdgpu_amdkfd_fence_create(
				process_info->eviction_fence->base.context,
				process_info->eviction_fence->mm,
				NULL);
	if (!new_fence) {
		pr_err("Failed to create eviction fence\n");
		ret = -ENOMEM;
		goto validate_map_fail;
	}
	dma_fence_put(&process_info->eviction_fence->base);
	process_info->eviction_fence = new_fence;
	/* The caller gets its own reference to the new fence */
	*ef = dma_fence_get(&new_fence->base);

	/* Attach new eviction fence to all BOs except pinned ones */
	list_for_each_entry(mem, &process_info->kfd_bo_list,
		validate_list.head) {
		if (mem->bo->tbo.pin_count)
			continue;

		amdgpu_bo_fence(mem->bo,
			&process_info->eviction_fence->base, true);
	}
	/* Attach eviction fence to PD / PT BOs */
	list_for_each_entry(peer_vm, &process_info->vm_list_head,
			    vm_list_node) {
		struct amdgpu_bo *bo = peer_vm->root.bo;

		amdgpu_bo_fence(bo, &process_info->eviction_fence->base, true);
	}

	/* The success path falls through into the common cleanup below */
validate_map_fail:
	ttm_eu_backoff_reservation(&ctx.ticket, &ctx.list);
	amdgpu_sync_free(&sync_obj);
ttm_reserve_fail:
	mutex_unlock(&process_info->lock);
	kfree(pd_bo_list);
	return ret;
}
   2536
   2537int amdgpu_amdkfd_add_gws_to_process(void *info, void *gws, struct kgd_mem **mem)
   2538{
   2539	struct amdkfd_process_info *process_info = (struct amdkfd_process_info *)info;
   2540	struct amdgpu_bo *gws_bo = (struct amdgpu_bo *)gws;
   2541	int ret;
   2542
   2543	if (!info || !gws)
   2544		return -EINVAL;
   2545
   2546	*mem = kzalloc(sizeof(struct kgd_mem), GFP_KERNEL);
   2547	if (!*mem)
   2548		return -ENOMEM;
   2549
   2550	mutex_init(&(*mem)->lock);
   2551	INIT_LIST_HEAD(&(*mem)->attachments);
   2552	(*mem)->bo = amdgpu_bo_ref(gws_bo);
   2553	(*mem)->domain = AMDGPU_GEM_DOMAIN_GWS;
   2554	(*mem)->process_info = process_info;
   2555	add_kgd_mem_to_kfd_bo_list(*mem, process_info, false);
   2556	amdgpu_sync_create(&(*mem)->sync);
   2557
   2558
   2559	/* Validate gws bo the first time it is added to process */
   2560	mutex_lock(&(*mem)->process_info->lock);
   2561	ret = amdgpu_bo_reserve(gws_bo, false);
   2562	if (unlikely(ret)) {
   2563		pr_err("Reserve gws bo failed %d\n", ret);
   2564		goto bo_reservation_failure;
   2565	}
   2566
   2567	ret = amdgpu_amdkfd_bo_validate(gws_bo, AMDGPU_GEM_DOMAIN_GWS, true);
   2568	if (ret) {
   2569		pr_err("GWS BO validate failed %d\n", ret);
   2570		goto bo_validation_failure;
   2571	}
   2572	/* GWS resource is shared b/t amdgpu and amdkfd
   2573	 * Add process eviction fence to bo so they can
   2574	 * evict each other.
   2575	 */
   2576	ret = dma_resv_reserve_fences(gws_bo->tbo.base.resv, 1);
   2577	if (ret)
   2578		goto reserve_shared_fail;
   2579	amdgpu_bo_fence(gws_bo, &process_info->eviction_fence->base, true);
   2580	amdgpu_bo_unreserve(gws_bo);
   2581	mutex_unlock(&(*mem)->process_info->lock);
   2582
   2583	return ret;
   2584
   2585reserve_shared_fail:
   2586bo_validation_failure:
   2587	amdgpu_bo_unreserve(gws_bo);
   2588bo_reservation_failure:
   2589	mutex_unlock(&(*mem)->process_info->lock);
   2590	amdgpu_sync_free(&(*mem)->sync);
   2591	remove_kgd_mem_from_kfd_bo_list(*mem, process_info);
   2592	amdgpu_bo_unref(&gws_bo);
   2593	mutex_destroy(&(*mem)->lock);
   2594	kfree(*mem);
   2595	*mem = NULL;
   2596	return ret;
   2597}
   2598
   2599int amdgpu_amdkfd_remove_gws_from_process(void *info, void *mem)
   2600{
   2601	int ret;
   2602	struct amdkfd_process_info *process_info = (struct amdkfd_process_info *)info;
   2603	struct kgd_mem *kgd_mem = (struct kgd_mem *)mem;
   2604	struct amdgpu_bo *gws_bo = kgd_mem->bo;
   2605
   2606	/* Remove BO from process's validate list so restore worker won't touch
   2607	 * it anymore
   2608	 */
   2609	remove_kgd_mem_from_kfd_bo_list(kgd_mem, process_info);
   2610
   2611	ret = amdgpu_bo_reserve(gws_bo, false);
   2612	if (unlikely(ret)) {
   2613		pr_err("Reserve gws bo failed %d\n", ret);
   2614		//TODO add BO back to validate_list?
   2615		return ret;
   2616	}
   2617	amdgpu_amdkfd_remove_eviction_fence(gws_bo,
   2618			process_info->eviction_fence);
   2619	amdgpu_bo_unreserve(gws_bo);
   2620	amdgpu_sync_free(&kgd_mem->sync);
   2621	amdgpu_bo_unref(&gws_bo);
   2622	mutex_destroy(&kgd_mem->lock);
   2623	kfree(mem);
   2624	return 0;
   2625}
   2626
   2627/* Returns GPU-specific tiling mode information */
   2628int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev,
   2629				struct tile_config *config)
   2630{
   2631	config->gb_addr_config = adev->gfx.config.gb_addr_config;
   2632	config->tile_config_ptr = adev->gfx.config.tile_mode_array;
   2633	config->num_tile_configs =
   2634			ARRAY_SIZE(adev->gfx.config.tile_mode_array);
   2635	config->macro_tile_config_ptr =
   2636			adev->gfx.config.macrotile_mode_array;
   2637	config->num_macro_tile_configs =
   2638			ARRAY_SIZE(adev->gfx.config.macrotile_mode_array);
   2639
   2640	/* Those values are not set from GFX9 onwards */
   2641	config->num_banks = adev->gfx.config.num_banks;
   2642	config->num_ranks = adev->gfx.config.num_ranks;
   2643
   2644	return 0;
   2645}
   2646
   2647bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem)
   2648{
   2649	struct kfd_mem_attachment *entry;
   2650
   2651	list_for_each_entry(entry, &mem->attachments, list) {
   2652		if (entry->is_mapped && entry->adev == adev)
   2653			return true;
   2654	}
   2655	return false;
   2656}