cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

ne_misc_dev.c (48943B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * Copyright 2020-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
      4 */
      5
      6/**
      7 * DOC: Enclave lifetime management driver for Nitro Enclaves (NE).
      8 * Nitro is a hypervisor that has been developed by Amazon.
      9 */
     10
     11#include <linux/anon_inodes.h>
     12#include <linux/capability.h>
     13#include <linux/cpu.h>
     14#include <linux/device.h>
     15#include <linux/file.h>
     16#include <linux/hugetlb.h>
     17#include <linux/limits.h>
     18#include <linux/list.h>
     19#include <linux/miscdevice.h>
     20#include <linux/mm.h>
     21#include <linux/mman.h>
     22#include <linux/module.h>
     23#include <linux/mutex.h>
     24#include <linux/nitro_enclaves.h>
     25#include <linux/pci.h>
     26#include <linux/poll.h>
     27#include <linux/range.h>
     28#include <linux/slab.h>
     29#include <linux/types.h>
     30#include <uapi/linux/vm_sockets.h>
     31
     32#include "ne_misc_dev.h"
     33#include "ne_pci_dev.h"
     34
     35/**
     36 * NE_CPUS_SIZE - Size for max 128 CPUs, for now, in a cpu-list string, comma
     37 *		  separated. The NE CPU pool includes CPUs from a single NUMA
     38 *		  node.
     39 */
     40#define NE_CPUS_SIZE		(512)
     41
     42/**
     43 * NE_EIF_LOAD_OFFSET - The offset where to copy the Enclave Image Format (EIF)
     44 *			image in enclave memory.
     45 */
     46#define NE_EIF_LOAD_OFFSET	(8 * 1024UL * 1024UL)
     47
     48/**
     49 * NE_MIN_ENCLAVE_MEM_SIZE - The minimum memory size an enclave can be launched
     50 *			     with.
     51 */
     52#define NE_MIN_ENCLAVE_MEM_SIZE	(64 * 1024UL * 1024UL)
     53
     54/**
     55 * NE_MIN_MEM_REGION_SIZE - The minimum size of an enclave memory region.
     56 */
     57#define NE_MIN_MEM_REGION_SIZE	(2 * 1024UL * 1024UL)
     58
     59/**
     60 * NE_PARENT_VM_CID - The CID for the vsock device of the primary / parent VM.
     61 */
     62#define NE_PARENT_VM_CID	(3)
     63
     64static long ne_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
     65
     66static const struct file_operations ne_fops = {
     67	.owner		= THIS_MODULE,
     68	.llseek		= noop_llseek,
     69	.unlocked_ioctl	= ne_ioctl,
     70};
     71
     72static struct miscdevice ne_misc_dev = {
     73	.minor	= MISC_DYNAMIC_MINOR,
     74	.name	= "nitro_enclaves",
     75	.fops	= &ne_fops,
     76	.mode	= 0660,
     77};
     78
     79struct ne_devs ne_devs = {
     80	.ne_misc_dev	= &ne_misc_dev,
     81};
     82
     83/*
     84 * TODO: Update logic to create new sysfs entries instead of using
      85 * a kernel parameter, e.g. if multiple sysfs files are needed.
     86 */
     87static int ne_set_kernel_param(const char *val, const struct kernel_param *kp);
     88
     89static const struct kernel_param_ops ne_cpu_pool_ops = {
     90	.get	= param_get_string,
     91	.set	= ne_set_kernel_param,
     92};
     93
     94static char ne_cpus[NE_CPUS_SIZE];
     95static struct kparam_string ne_cpus_arg = {
     96	.maxlen	= sizeof(ne_cpus),
     97	.string	= ne_cpus,
     98};
     99
    100module_param_cb(ne_cpus, &ne_cpu_pool_ops, &ne_cpus_arg, 0644);
    101/* https://www.kernel.org/doc/html/latest/admin-guide/kernel-parameters.html#cpu-lists */
    102MODULE_PARM_DESC(ne_cpus, "<cpu-list> - CPU pool used for Nitro Enclaves");
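
/*
 * Example usage (illustrative): the NE CPU pool is typically configured from
 * user space before any enclave is created, either on the kernel command line
 * (nitro_enclaves.ne_cpus=<cpu-list>) or at runtime through the module
 * parameter file, e.g.:
 *
 *	echo "2-5" > /sys/module/nitro_enclaves/parameters/ne_cpus
 *
 * The "2-5" value is only an example; the list must cover full CPU cores from
 * a single NUMA node and must not include CPU 0 or its siblings, as enforced
 * by ne_setup_cpu_pool() below.
 */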
    103
    104/**
    105 * struct ne_cpu_pool - CPU pool used for Nitro Enclaves.
    106 * @avail_threads_per_core:	Available full CPU cores to be dedicated to
    107 *				enclave(s). The cpumasks from the array, indexed
    108 *				by core id, contain all the threads from the
    109 *				available cores, that are not set for created
    110 *				enclave(s). The full CPU cores are part of the
    111 *				NE CPU pool.
    112 * @mutex:			Mutex for the access to the NE CPU pool.
    113 * @nr_parent_vm_cores :	The size of the available threads per core array.
    114 *				The total number of CPU cores available on the
    115 *				primary / parent VM.
    116 * @nr_threads_per_core:	The number of threads that a full CPU core has.
    117 * @numa_node:			NUMA node of the CPUs in the pool.
    118 */
    119struct ne_cpu_pool {
    120	cpumask_var_t	*avail_threads_per_core;
    121	struct mutex	mutex;
    122	unsigned int	nr_parent_vm_cores;
    123	unsigned int	nr_threads_per_core;
    124	int		numa_node;
    125};
    126
    127static struct ne_cpu_pool ne_cpu_pool;
    128
    129/**
    130 * struct ne_phys_contig_mem_regions - Contiguous physical memory regions.
     131 * @num:	The number of regions currently in use.
    132 * @regions:	The array of physical memory regions.
    133 */
    134struct ne_phys_contig_mem_regions {
    135	unsigned long num;
    136	struct range  *regions;
    137};
    138
    139/**
    140 * ne_check_enclaves_created() - Verify if at least one enclave has been created.
    141 * @void:	No parameters provided.
    142 *
    143 * Context: Process context.
    144 * Return:
    145 * * True if at least one enclave is created.
    146 * * False otherwise.
    147 */
    148static bool ne_check_enclaves_created(void)
    149{
    150	struct ne_pci_dev *ne_pci_dev = ne_devs.ne_pci_dev;
    151	bool ret = false;
    152
    153	if (!ne_pci_dev)
    154		return ret;
    155
    156	mutex_lock(&ne_pci_dev->enclaves_list_mutex);
    157
    158	if (!list_empty(&ne_pci_dev->enclaves_list))
    159		ret = true;
    160
    161	mutex_unlock(&ne_pci_dev->enclaves_list_mutex);
    162
    163	return ret;
    164}
    165
    166/**
    167 * ne_setup_cpu_pool() - Set the NE CPU pool after handling sanity checks such
    168 *			 as not sharing CPU cores with the primary / parent VM
    169 *			 or not using CPU 0, which should remain available for
    170 *			 the primary / parent VM. Offline the CPUs from the
    171 *			 pool after the checks passed.
    172 * @ne_cpu_list:	The CPU list used for setting NE CPU pool.
    173 *
    174 * Context: Process context.
    175 * Return:
    176 * * 0 on success.
    177 * * Negative return value on failure.
    178 */
    179static int ne_setup_cpu_pool(const char *ne_cpu_list)
    180{
    181	int core_id = -1;
    182	unsigned int cpu = 0;
    183	cpumask_var_t cpu_pool;
    184	unsigned int cpu_sibling = 0;
    185	unsigned int i = 0;
    186	int numa_node = -1;
    187	int rc = -EINVAL;
    188
    189	if (!zalloc_cpumask_var(&cpu_pool, GFP_KERNEL))
    190		return -ENOMEM;
    191
    192	mutex_lock(&ne_cpu_pool.mutex);
    193
    194	rc = cpulist_parse(ne_cpu_list, cpu_pool);
    195	if (rc < 0) {
    196		pr_err("%s: Error in cpulist parse [rc=%d]\n", ne_misc_dev.name, rc);
    197
    198		goto free_pool_cpumask;
    199	}
    200
    201	cpu = cpumask_any(cpu_pool);
    202	if (cpu >= nr_cpu_ids) {
    203		pr_err("%s: No CPUs available in CPU pool\n", ne_misc_dev.name);
    204
    205		rc = -EINVAL;
    206
    207		goto free_pool_cpumask;
    208	}
    209
    210	/*
    211	 * Check if the CPUs are online, to further get info about them
    212	 * e.g. numa node, core id, siblings.
    213	 */
    214	for_each_cpu(cpu, cpu_pool)
    215		if (cpu_is_offline(cpu)) {
    216			pr_err("%s: CPU %d is offline, has to be online to get its metadata\n",
    217			       ne_misc_dev.name, cpu);
    218
    219			rc = -EINVAL;
    220
    221			goto free_pool_cpumask;
    222		}
    223
    224	/*
    225	 * Check if the CPUs from the NE CPU pool are from the same NUMA node.
    226	 */
    227	for_each_cpu(cpu, cpu_pool)
    228		if (numa_node < 0) {
    229			numa_node = cpu_to_node(cpu);
    230			if (numa_node < 0) {
    231				pr_err("%s: Invalid NUMA node %d\n",
    232				       ne_misc_dev.name, numa_node);
    233
    234				rc = -EINVAL;
    235
    236				goto free_pool_cpumask;
    237			}
    238		} else {
    239			if (numa_node != cpu_to_node(cpu)) {
    240				pr_err("%s: CPUs with different NUMA nodes\n",
    241				       ne_misc_dev.name);
    242
    243				rc = -EINVAL;
    244
    245				goto free_pool_cpumask;
    246			}
    247		}
    248
    249	/*
     250	 * Check if CPU 0 and its siblings are included in the provided CPU pool.
    251	 * They should remain available for the primary / parent VM.
    252	 */
    253	if (cpumask_test_cpu(0, cpu_pool)) {
    254		pr_err("%s: CPU 0 has to remain available\n", ne_misc_dev.name);
    255
    256		rc = -EINVAL;
    257
    258		goto free_pool_cpumask;
    259	}
    260
    261	for_each_cpu(cpu_sibling, topology_sibling_cpumask(0)) {
    262		if (cpumask_test_cpu(cpu_sibling, cpu_pool)) {
    263			pr_err("%s: CPU sibling %d for CPU 0 is in CPU pool\n",
    264			       ne_misc_dev.name, cpu_sibling);
    265
    266			rc = -EINVAL;
    267
    268			goto free_pool_cpumask;
    269		}
    270	}
    271
    272	/*
    273	 * Check if CPU siblings are included in the provided CPU pool. The
    274	 * expectation is that full CPU cores are made available in the CPU pool
    275	 * for enclaves.
    276	 */
    277	for_each_cpu(cpu, cpu_pool) {
    278		for_each_cpu(cpu_sibling, topology_sibling_cpumask(cpu)) {
    279			if (!cpumask_test_cpu(cpu_sibling, cpu_pool)) {
    280				pr_err("%s: CPU %d is not in CPU pool\n",
    281				       ne_misc_dev.name, cpu_sibling);
    282
    283				rc = -EINVAL;
    284
    285				goto free_pool_cpumask;
    286			}
    287		}
    288	}
    289
    290	/* Calculate the number of threads from a full CPU core. */
    291	cpu = cpumask_any(cpu_pool);
    292	for_each_cpu(cpu_sibling, topology_sibling_cpumask(cpu))
    293		ne_cpu_pool.nr_threads_per_core++;
    294
    295	ne_cpu_pool.nr_parent_vm_cores = nr_cpu_ids / ne_cpu_pool.nr_threads_per_core;
    296
    297	ne_cpu_pool.avail_threads_per_core = kcalloc(ne_cpu_pool.nr_parent_vm_cores,
    298						     sizeof(*ne_cpu_pool.avail_threads_per_core),
    299						     GFP_KERNEL);
    300	if (!ne_cpu_pool.avail_threads_per_core) {
    301		rc = -ENOMEM;
    302
    303		goto free_pool_cpumask;
    304	}
    305
    306	for (i = 0; i < ne_cpu_pool.nr_parent_vm_cores; i++)
    307		if (!zalloc_cpumask_var(&ne_cpu_pool.avail_threads_per_core[i], GFP_KERNEL)) {
    308			rc = -ENOMEM;
    309
    310			goto free_cores_cpumask;
    311		}
    312
    313	/*
    314	 * Split the NE CPU pool in threads per core to keep the CPU topology
    315	 * after offlining the CPUs.
    316	 */
    317	for_each_cpu(cpu, cpu_pool) {
    318		core_id = topology_core_id(cpu);
    319		if (core_id < 0 || core_id >= ne_cpu_pool.nr_parent_vm_cores) {
     320			pr_err("%s: Invalid core id %d for CPU %d\n",
    321			       ne_misc_dev.name, core_id, cpu);
    322
    323			rc = -EINVAL;
    324
    325			goto clear_cpumask;
    326		}
    327
    328		cpumask_set_cpu(cpu, ne_cpu_pool.avail_threads_per_core[core_id]);
    329	}
    330
    331	/*
    332	 * CPUs that are given to enclave(s) should not be considered online
    333	 * by Linux anymore, as the hypervisor will degrade them to floating.
    334	 * The physical CPUs (full cores) are carved out of the primary / parent
     335	 * VM and given to the enclave VM. The same number of vCPUs would run
     336	 * on fewer pCPUs for the primary / parent VM.
     337	 *
     338	 * We offline them here so that performance is not degraded and the
     339	 * correct topology is exposed to Linux and user space.
    340	 */
    341	for_each_cpu(cpu, cpu_pool) {
    342		rc = remove_cpu(cpu);
    343		if (rc != 0) {
    344			pr_err("%s: CPU %d is not offlined [rc=%d]\n",
    345			       ne_misc_dev.name, cpu, rc);
    346
    347			goto online_cpus;
    348		}
    349	}
    350
    351	free_cpumask_var(cpu_pool);
    352
    353	ne_cpu_pool.numa_node = numa_node;
    354
    355	mutex_unlock(&ne_cpu_pool.mutex);
    356
    357	return 0;
    358
    359online_cpus:
    360	for_each_cpu(cpu, cpu_pool)
    361		add_cpu(cpu);
    362clear_cpumask:
    363	for (i = 0; i < ne_cpu_pool.nr_parent_vm_cores; i++)
    364		cpumask_clear(ne_cpu_pool.avail_threads_per_core[i]);
    365free_cores_cpumask:
    366	for (i = 0; i < ne_cpu_pool.nr_parent_vm_cores; i++)
    367		free_cpumask_var(ne_cpu_pool.avail_threads_per_core[i]);
    368	kfree(ne_cpu_pool.avail_threads_per_core);
    369free_pool_cpumask:
    370	free_cpumask_var(cpu_pool);
    371	ne_cpu_pool.nr_parent_vm_cores = 0;
    372	ne_cpu_pool.nr_threads_per_core = 0;
    373	ne_cpu_pool.numa_node = -1;
    374	mutex_unlock(&ne_cpu_pool.mutex);
    375
    376	return rc;
    377}
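
/*
 * Worked example (illustrative, assuming 2 threads per core and sibling pairs
 * (0,4), (1,5), (2,6), (3,7) on an 8-CPU parent VM): ne_cpus="2-3,6-7" passes
 * the checks above, since it contains the full cores 2 and 3 and leaves CPU 0
 * and its sibling untouched. The pool ends up with nr_threads_per_core = 2,
 * avail_threads_per_core[2] = {2,6}, avail_threads_per_core[3] = {3,7}, and
 * CPUs 2, 3, 6 and 7 are offlined in the primary / parent VM.
 */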
    378
    379/**
    380 * ne_teardown_cpu_pool() - Online the CPUs from the NE CPU pool and cleanup the
    381 *			    CPU pool.
    382 * @void:	No parameters provided.
    383 *
    384 * Context: Process context.
    385 */
    386static void ne_teardown_cpu_pool(void)
    387{
    388	unsigned int cpu = 0;
    389	unsigned int i = 0;
    390	int rc = -EINVAL;
    391
    392	mutex_lock(&ne_cpu_pool.mutex);
    393
    394	if (!ne_cpu_pool.nr_parent_vm_cores) {
    395		mutex_unlock(&ne_cpu_pool.mutex);
    396
    397		return;
    398	}
    399
    400	for (i = 0; i < ne_cpu_pool.nr_parent_vm_cores; i++) {
    401		for_each_cpu(cpu, ne_cpu_pool.avail_threads_per_core[i]) {
    402			rc = add_cpu(cpu);
    403			if (rc != 0)
    404				pr_err("%s: CPU %d is not onlined [rc=%d]\n",
    405				       ne_misc_dev.name, cpu, rc);
    406		}
    407
    408		cpumask_clear(ne_cpu_pool.avail_threads_per_core[i]);
    409
    410		free_cpumask_var(ne_cpu_pool.avail_threads_per_core[i]);
    411	}
    412
    413	kfree(ne_cpu_pool.avail_threads_per_core);
    414	ne_cpu_pool.nr_parent_vm_cores = 0;
    415	ne_cpu_pool.nr_threads_per_core = 0;
    416	ne_cpu_pool.numa_node = -1;
    417
    418	mutex_unlock(&ne_cpu_pool.mutex);
    419}
    420
    421/**
    422 * ne_set_kernel_param() - Set the NE CPU pool value via the NE kernel parameter.
    423 * @val:	NE CPU pool string value.
    424 * @kp :	NE kernel parameter associated with the NE CPU pool.
    425 *
    426 * Context: Process context.
    427 * Return:
    428 * * 0 on success.
    429 * * Negative return value on failure.
    430 */
    431static int ne_set_kernel_param(const char *val, const struct kernel_param *kp)
    432{
    433	char error_val[] = "";
    434	int rc = -EINVAL;
    435
    436	if (!capable(CAP_SYS_ADMIN))
    437		return -EPERM;
    438
    439	if (ne_check_enclaves_created()) {
    440		pr_err("%s: The CPU pool is used by enclave(s)\n", ne_misc_dev.name);
    441
    442		return -EPERM;
    443	}
    444
    445	ne_teardown_cpu_pool();
    446
    447	rc = ne_setup_cpu_pool(val);
    448	if (rc < 0) {
    449		pr_err("%s: Error in setup CPU pool [rc=%d]\n", ne_misc_dev.name, rc);
    450
    451		param_set_copystring(error_val, kp);
    452
    453		return rc;
    454	}
    455
    456	rc = param_set_copystring(val, kp);
    457	if (rc < 0) {
    458		pr_err("%s: Error in param set copystring [rc=%d]\n", ne_misc_dev.name, rc);
    459
    460		ne_teardown_cpu_pool();
    461
    462		param_set_copystring(error_val, kp);
    463
    464		return rc;
    465	}
    466
    467	return 0;
    468}
    469
    470/**
    471 * ne_donated_cpu() - Check if the provided CPU is already used by the enclave.
    472 * @ne_enclave :	Private data associated with the current enclave.
    473 * @cpu:		CPU to check if already used.
    474 *
    475 * Context: Process context. This function is called with the ne_enclave mutex held.
    476 * Return:
    477 * * True if the provided CPU is already used by the enclave.
    478 * * False otherwise.
    479 */
    480static bool ne_donated_cpu(struct ne_enclave *ne_enclave, unsigned int cpu)
    481{
    482	if (cpumask_test_cpu(cpu, ne_enclave->vcpu_ids))
    483		return true;
    484
    485	return false;
    486}
    487
    488/**
    489 * ne_get_unused_core_from_cpu_pool() - Get the id of a full core from the
    490 *					NE CPU pool.
    491 * @void:	No parameters provided.
    492 *
    493 * Context: Process context. This function is called with the ne_enclave and
    494 *	    ne_cpu_pool mutexes held.
    495 * Return:
    496 * * Core id.
    497 * * -1 if no CPU core available in the pool.
    498 */
    499static int ne_get_unused_core_from_cpu_pool(void)
    500{
    501	int core_id = -1;
    502	unsigned int i = 0;
    503
    504	for (i = 0; i < ne_cpu_pool.nr_parent_vm_cores; i++)
    505		if (!cpumask_empty(ne_cpu_pool.avail_threads_per_core[i])) {
    506			core_id = i;
    507
    508			break;
    509		}
    510
    511	return core_id;
    512}
    513
    514/**
    515 * ne_set_enclave_threads_per_core() - Set the threads of the provided core in
    516 *				       the enclave data structure.
    517 * @ne_enclave :	Private data associated with the current enclave.
    518 * @core_id:		Core id to get its threads from the NE CPU pool.
    519 * @vcpu_id:		vCPU id part of the provided core.
    520 *
    521 * Context: Process context. This function is called with the ne_enclave and
    522 *	    ne_cpu_pool mutexes held.
    523 * Return:
    524 * * 0 on success.
    525 * * Negative return value on failure.
    526 */
    527static int ne_set_enclave_threads_per_core(struct ne_enclave *ne_enclave,
    528					   int core_id, u32 vcpu_id)
    529{
    530	unsigned int cpu = 0;
    531
    532	if (core_id < 0 && vcpu_id == 0) {
    533		dev_err_ratelimited(ne_misc_dev.this_device,
    534				    "No CPUs available in NE CPU pool\n");
    535
    536		return -NE_ERR_NO_CPUS_AVAIL_IN_POOL;
    537	}
    538
    539	if (core_id < 0) {
    540		dev_err_ratelimited(ne_misc_dev.this_device,
    541				    "CPU %d is not in NE CPU pool\n", vcpu_id);
    542
    543		return -NE_ERR_VCPU_NOT_IN_CPU_POOL;
    544	}
    545
    546	if (core_id >= ne_enclave->nr_parent_vm_cores) {
    547		dev_err_ratelimited(ne_misc_dev.this_device,
    548				    "Invalid core id %d - ne_enclave\n", core_id);
    549
    550		return -NE_ERR_VCPU_INVALID_CPU_CORE;
    551	}
    552
    553	for_each_cpu(cpu, ne_cpu_pool.avail_threads_per_core[core_id])
    554		cpumask_set_cpu(cpu, ne_enclave->threads_per_core[core_id]);
    555
    556	cpumask_clear(ne_cpu_pool.avail_threads_per_core[core_id]);
    557
    558	return 0;
    559}
    560
    561/**
    562 * ne_get_cpu_from_cpu_pool() - Get a CPU from the NE CPU pool, either from the
    563 *				remaining sibling(s) of a CPU core or the first
    564 *				sibling of a new CPU core.
    565 * @ne_enclave :	Private data associated with the current enclave.
    566 * @vcpu_id:		vCPU to get from the NE CPU pool.
    567 *
    568 * Context: Process context. This function is called with the ne_enclave mutex held.
    569 * Return:
    570 * * 0 on success.
    571 * * Negative return value on failure.
    572 */
    573static int ne_get_cpu_from_cpu_pool(struct ne_enclave *ne_enclave, u32 *vcpu_id)
    574{
    575	int core_id = -1;
    576	unsigned int cpu = 0;
    577	unsigned int i = 0;
    578	int rc = -EINVAL;
    579
    580	/*
     581	 * If a thread of a core has previously been allocated to this enclave,
     582	 * first check the remaining sibling(s) for new CPU allocations, so that
     583	 * full CPU cores are used for the enclave.
    584	 */
    585	for (i = 0; i < ne_enclave->nr_parent_vm_cores; i++)
    586		for_each_cpu(cpu, ne_enclave->threads_per_core[i])
    587			if (!ne_donated_cpu(ne_enclave, cpu)) {
    588				*vcpu_id = cpu;
    589
    590				return 0;
    591			}
    592
    593	mutex_lock(&ne_cpu_pool.mutex);
    594
    595	/*
    596	 * If no remaining siblings, get a core from the NE CPU pool and keep
    597	 * track of all the threads in the enclave threads per core data structure.
    598	 */
    599	core_id = ne_get_unused_core_from_cpu_pool();
    600
    601	rc = ne_set_enclave_threads_per_core(ne_enclave, core_id, *vcpu_id);
    602	if (rc < 0)
    603		goto unlock_mutex;
    604
    605	*vcpu_id = cpumask_any(ne_enclave->threads_per_core[core_id]);
    606
    607	rc = 0;
    608
    609unlock_mutex:
    610	mutex_unlock(&ne_cpu_pool.mutex);
    611
    612	return rc;
    613}
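
/*
 * Illustrative allocation order: if the enclave already holds core 2 (threads
 * {2,6} in the example topology above) but only vCPU 2 has been donated so
 * far, the next NE_ADD_VCPU request with vcpu_id 0 returns its sibling, 6.
 * Only when every thread of the already held cores is donated is a fresh core
 * taken from the NE CPU pool.
 */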
    614
    615/**
    616 * ne_get_vcpu_core_from_cpu_pool() - Get from the NE CPU pool the id of the
    617 *				      core associated with the provided vCPU.
    618 * @vcpu_id:	Provided vCPU id to get its associated core id.
    619 *
    620 * Context: Process context. This function is called with the ne_enclave and
    621 *	    ne_cpu_pool mutexes held.
    622 * Return:
    623 * * Core id.
    624 * * -1 if the provided vCPU is not in the pool.
    625 */
    626static int ne_get_vcpu_core_from_cpu_pool(u32 vcpu_id)
    627{
    628	int core_id = -1;
    629	unsigned int i = 0;
    630
    631	for (i = 0; i < ne_cpu_pool.nr_parent_vm_cores; i++)
    632		if (cpumask_test_cpu(vcpu_id, ne_cpu_pool.avail_threads_per_core[i])) {
    633			core_id = i;
    634
    635			break;
    636	}
     637		}
    638	return core_id;
    639}
    640
    641/**
    642 * ne_check_cpu_in_cpu_pool() - Check if the given vCPU is in the available CPUs
    643 *				from the pool.
    644 * @ne_enclave :	Private data associated with the current enclave.
    645 * @vcpu_id:		ID of the vCPU to check if available in the NE CPU pool.
    646 *
    647 * Context: Process context. This function is called with the ne_enclave mutex held.
    648 * Return:
    649 * * 0 on success.
    650 * * Negative return value on failure.
    651 */
    652static int ne_check_cpu_in_cpu_pool(struct ne_enclave *ne_enclave, u32 vcpu_id)
    653{
    654	int core_id = -1;
    655	unsigned int i = 0;
    656	int rc = -EINVAL;
    657
    658	if (ne_donated_cpu(ne_enclave, vcpu_id)) {
    659		dev_err_ratelimited(ne_misc_dev.this_device,
    660				    "CPU %d already used\n", vcpu_id);
    661
    662		return -NE_ERR_VCPU_ALREADY_USED;
    663	}
    664
    665	/*
     666	 * If a thread of a core has previously been allocated to this enclave,
     667	 * but not the full core, first check the remaining sibling(s).
    668	 */
    669	for (i = 0; i < ne_enclave->nr_parent_vm_cores; i++)
    670		if (cpumask_test_cpu(vcpu_id, ne_enclave->threads_per_core[i]))
    671			return 0;
    672
    673	mutex_lock(&ne_cpu_pool.mutex);
    674
    675	/*
    676	 * If no remaining siblings, get from the NE CPU pool the core
    677	 * associated with the vCPU and keep track of all the threads in the
    678	 * enclave threads per core data structure.
    679	 */
    680	core_id = ne_get_vcpu_core_from_cpu_pool(vcpu_id);
    681
    682	rc = ne_set_enclave_threads_per_core(ne_enclave, core_id, vcpu_id);
    683	if (rc < 0)
    684		goto unlock_mutex;
    685
    686	rc = 0;
    687
    688unlock_mutex:
    689	mutex_unlock(&ne_cpu_pool.mutex);
    690
    691	return rc;
    692}
    693
    694/**
    695 * ne_add_vcpu_ioctl() - Add a vCPU to the slot associated with the current
    696 *			 enclave.
    697 * @ne_enclave :	Private data associated with the current enclave.
    698 * @vcpu_id:		ID of the CPU to be associated with the given slot,
    699 *			apic id on x86.
    700 *
    701 * Context: Process context. This function is called with the ne_enclave mutex held.
    702 * Return:
    703 * * 0 on success.
    704 * * Negative return value on failure.
    705 */
    706static int ne_add_vcpu_ioctl(struct ne_enclave *ne_enclave, u32 vcpu_id)
    707{
    708	struct ne_pci_dev_cmd_reply cmd_reply = {};
    709	struct pci_dev *pdev = ne_devs.ne_pci_dev->pdev;
    710	int rc = -EINVAL;
    711	struct slot_add_vcpu_req slot_add_vcpu_req = {};
    712
    713	if (ne_enclave->mm != current->mm)
    714		return -EIO;
    715
    716	slot_add_vcpu_req.slot_uid = ne_enclave->slot_uid;
    717	slot_add_vcpu_req.vcpu_id = vcpu_id;
    718
    719	rc = ne_do_request(pdev, SLOT_ADD_VCPU,
    720			   &slot_add_vcpu_req, sizeof(slot_add_vcpu_req),
    721			   &cmd_reply, sizeof(cmd_reply));
    722	if (rc < 0) {
    723		dev_err_ratelimited(ne_misc_dev.this_device,
    724				    "Error in slot add vCPU [rc=%d]\n", rc);
    725
    726		return rc;
    727	}
    728
    729	cpumask_set_cpu(vcpu_id, ne_enclave->vcpu_ids);
    730
    731	ne_enclave->nr_vcpus++;
    732
    733	return 0;
    734}
    735
    736/**
    737 * ne_sanity_check_user_mem_region() - Sanity check the user space memory
    738 *				       region received during the set user
    739 *				       memory region ioctl call.
    740 * @ne_enclave :	Private data associated with the current enclave.
    741 * @mem_region :	User space memory region to be sanity checked.
    742 *
    743 * Context: Process context. This function is called with the ne_enclave mutex held.
    744 * Return:
    745 * * 0 on success.
    746 * * Negative return value on failure.
    747 */
    748static int ne_sanity_check_user_mem_region(struct ne_enclave *ne_enclave,
    749					   struct ne_user_memory_region mem_region)
    750{
    751	struct ne_mem_region *ne_mem_region = NULL;
    752
    753	if (ne_enclave->mm != current->mm)
    754		return -EIO;
    755
    756	if (mem_region.memory_size & (NE_MIN_MEM_REGION_SIZE - 1)) {
    757		dev_err_ratelimited(ne_misc_dev.this_device,
    758				    "User space memory size is not multiple of 2 MiB\n");
    759
    760		return -NE_ERR_INVALID_MEM_REGION_SIZE;
    761	}
    762
    763	if (!IS_ALIGNED(mem_region.userspace_addr, NE_MIN_MEM_REGION_SIZE)) {
    764		dev_err_ratelimited(ne_misc_dev.this_device,
    765				    "User space address is not 2 MiB aligned\n");
    766
    767		return -NE_ERR_UNALIGNED_MEM_REGION_ADDR;
    768	}
    769
    770	if ((mem_region.userspace_addr & (NE_MIN_MEM_REGION_SIZE - 1)) ||
    771	    !access_ok((void __user *)(unsigned long)mem_region.userspace_addr,
    772		       mem_region.memory_size)) {
    773		dev_err_ratelimited(ne_misc_dev.this_device,
    774				    "Invalid user space address range\n");
    775
    776		return -NE_ERR_INVALID_MEM_REGION_ADDR;
    777	}
    778
    779	list_for_each_entry(ne_mem_region, &ne_enclave->mem_regions_list,
    780			    mem_region_list_entry) {
    781		u64 memory_size = ne_mem_region->memory_size;
    782		u64 userspace_addr = ne_mem_region->userspace_addr;
    783
    784		if ((userspace_addr <= mem_region.userspace_addr &&
    785		     mem_region.userspace_addr < (userspace_addr + memory_size)) ||
    786		    (mem_region.userspace_addr <= userspace_addr &&
    787		    (mem_region.userspace_addr + mem_region.memory_size) > userspace_addr)) {
    788			dev_err_ratelimited(ne_misc_dev.this_device,
    789					    "User space memory region already used\n");
    790
    791			return -NE_ERR_MEM_REGION_ALREADY_USED;
    792		}
    793	}
    794
    795	return 0;
    796}
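
/*
 * Overlap example (illustrative addresses): with an existing region covering
 * the user space range [0x7f0000000000, 0x7f0000400000), a new request for
 * [0x7f0000200000, 0x7f0000600000) is rejected with
 * NE_ERR_MEM_REGION_ALREADY_USED, while [0x7f0000400000, 0x7f0000800000)
 * passes the overlap check, since the two ranges are only adjacent.
 */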
    797
    798/**
    799 * ne_sanity_check_user_mem_region_page() - Sanity check a page from the user space
    800 *					    memory region received during the set
    801 *					    user memory region ioctl call.
    802 * @ne_enclave :	Private data associated with the current enclave.
    803 * @mem_region_page:	Page from the user space memory region to be sanity checked.
    804 *
    805 * Context: Process context. This function is called with the ne_enclave mutex held.
    806 * Return:
    807 * * 0 on success.
    808 * * Negative return value on failure.
    809 */
    810static int ne_sanity_check_user_mem_region_page(struct ne_enclave *ne_enclave,
    811						struct page *mem_region_page)
    812{
    813	if (!PageHuge(mem_region_page)) {
    814		dev_err_ratelimited(ne_misc_dev.this_device,
    815				    "Not a hugetlbfs page\n");
    816
    817		return -NE_ERR_MEM_NOT_HUGE_PAGE;
    818	}
    819
    820	if (page_size(mem_region_page) & (NE_MIN_MEM_REGION_SIZE - 1)) {
    821		dev_err_ratelimited(ne_misc_dev.this_device,
    822				    "Page size not multiple of 2 MiB\n");
    823
    824		return -NE_ERR_INVALID_PAGE_SIZE;
    825	}
    826
    827	if (ne_enclave->numa_node != page_to_nid(mem_region_page)) {
    828		dev_err_ratelimited(ne_misc_dev.this_device,
    829				    "Page is not from NUMA node %d\n",
    830				    ne_enclave->numa_node);
    831
    832		return -NE_ERR_MEM_DIFFERENT_NUMA_NODE;
    833	}
    834
    835	return 0;
    836}
    837
    838/**
    839 * ne_sanity_check_phys_mem_region() - Sanity check the start address and the size
    840 *                                     of a physical memory region.
    841 * @phys_mem_region_paddr : Physical start address of the region to be sanity checked.
    842 * @phys_mem_region_size  : Length of the region to be sanity checked.
    843 *
    844 * Context: Process context. This function is called with the ne_enclave mutex held.
    845 * Return:
    846 * * 0 on success.
    847 * * Negative return value on failure.
    848 */
    849static int ne_sanity_check_phys_mem_region(u64 phys_mem_region_paddr,
    850					   u64 phys_mem_region_size)
    851{
    852	if (phys_mem_region_size & (NE_MIN_MEM_REGION_SIZE - 1)) {
    853		dev_err_ratelimited(ne_misc_dev.this_device,
    854				    "Physical mem region size is not multiple of 2 MiB\n");
    855
    856		return -EINVAL;
    857	}
    858
    859	if (!IS_ALIGNED(phys_mem_region_paddr, NE_MIN_MEM_REGION_SIZE)) {
    860		dev_err_ratelimited(ne_misc_dev.this_device,
    861				    "Physical mem region address is not 2 MiB aligned\n");
    862
    863		return -EINVAL;
    864	}
    865
    866	return 0;
    867}
    868
    869/**
    870 * ne_merge_phys_contig_memory_regions() - Add a memory region and merge the adjacent
    871 *                                         regions if they are physically contiguous.
    872 * @phys_contig_regions : Private data associated with the contiguous physical memory regions.
    873 * @page_paddr :          Physical start address of the region to be added.
    874 * @page_size :           Length of the region to be added.
    875 *
    876 * Context: Process context. This function is called with the ne_enclave mutex held.
    877 * Return:
    878 * * 0 on success.
    879 * * Negative return value on failure.
    880 */
    881static int
    882ne_merge_phys_contig_memory_regions(struct ne_phys_contig_mem_regions *phys_contig_regions,
    883				    u64 page_paddr, u64 page_size)
    884{
    885	unsigned long num = phys_contig_regions->num;
    886	int rc = 0;
    887
    888	rc = ne_sanity_check_phys_mem_region(page_paddr, page_size);
    889	if (rc < 0)
    890		return rc;
    891
    892	/* Physically contiguous, just merge */
    893	if (num && (phys_contig_regions->regions[num - 1].end + 1) == page_paddr) {
    894		phys_contig_regions->regions[num - 1].end += page_size;
    895	} else {
    896		phys_contig_regions->regions[num].start = page_paddr;
    897		phys_contig_regions->regions[num].end = page_paddr + page_size - 1;
    898		phys_contig_regions->num++;
    899	}
    900
    901	return 0;
    902}
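
/*
 * Merging example (illustrative): for three 2 MiB pages at physical addresses
 * 0x200000, 0x400000 and 0xa00000, the first two are merged into the single
 * region [0x200000, 0x5fffff] because they are physically adjacent, while the
 * third one starts a new region [0xa00000, 0xbfffff], so
 * phys_contig_regions->num ends up as 2.
 */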
    903
    904/**
    905 * ne_set_user_memory_region_ioctl() - Add user space memory region to the slot
    906 *				       associated with the current enclave.
    907 * @ne_enclave :	Private data associated with the current enclave.
    908 * @mem_region :	User space memory region to be associated with the given slot.
    909 *
    910 * Context: Process context. This function is called with the ne_enclave mutex held.
    911 * Return:
    912 * * 0 on success.
    913 * * Negative return value on failure.
    914 */
    915static int ne_set_user_memory_region_ioctl(struct ne_enclave *ne_enclave,
    916					   struct ne_user_memory_region mem_region)
    917{
    918	long gup_rc = 0;
    919	unsigned long i = 0;
    920	unsigned long max_nr_pages = 0;
    921	unsigned long memory_size = 0;
    922	struct ne_mem_region *ne_mem_region = NULL;
    923	struct pci_dev *pdev = ne_devs.ne_pci_dev->pdev;
    924	struct ne_phys_contig_mem_regions phys_contig_mem_regions = {};
    925	int rc = -EINVAL;
    926
    927	rc = ne_sanity_check_user_mem_region(ne_enclave, mem_region);
    928	if (rc < 0)
    929		return rc;
    930
    931	ne_mem_region = kzalloc(sizeof(*ne_mem_region), GFP_KERNEL);
    932	if (!ne_mem_region)
    933		return -ENOMEM;
    934
    935	max_nr_pages = mem_region.memory_size / NE_MIN_MEM_REGION_SIZE;
    936
    937	ne_mem_region->pages = kcalloc(max_nr_pages, sizeof(*ne_mem_region->pages),
    938				       GFP_KERNEL);
    939	if (!ne_mem_region->pages) {
    940		rc = -ENOMEM;
    941
    942		goto free_mem_region;
    943	}
    944
    945	phys_contig_mem_regions.regions = kcalloc(max_nr_pages,
    946						  sizeof(*phys_contig_mem_regions.regions),
    947						  GFP_KERNEL);
    948	if (!phys_contig_mem_regions.regions) {
    949		rc = -ENOMEM;
    950
    951		goto free_mem_region;
    952	}
    953
    954	do {
    955		i = ne_mem_region->nr_pages;
    956
    957		if (i == max_nr_pages) {
    958			dev_err_ratelimited(ne_misc_dev.this_device,
    959					    "Reached max nr of pages in the pages data struct\n");
    960
    961			rc = -ENOMEM;
    962
    963			goto put_pages;
    964		}
    965
    966		gup_rc = get_user_pages_unlocked(mem_region.userspace_addr + memory_size, 1,
    967						 ne_mem_region->pages + i, FOLL_GET);
    968
    969		if (gup_rc < 0) {
    970			rc = gup_rc;
    971
    972			dev_err_ratelimited(ne_misc_dev.this_device,
    973					    "Error in get user pages [rc=%d]\n", rc);
    974
    975			goto put_pages;
    976		}
    977
    978		rc = ne_sanity_check_user_mem_region_page(ne_enclave, ne_mem_region->pages[i]);
    979		if (rc < 0)
    980			goto put_pages;
    981
    982		rc = ne_merge_phys_contig_memory_regions(&phys_contig_mem_regions,
    983							 page_to_phys(ne_mem_region->pages[i]),
    984							 page_size(ne_mem_region->pages[i]));
    985		if (rc < 0)
    986			goto put_pages;
    987
    988		memory_size += page_size(ne_mem_region->pages[i]);
    989
    990		ne_mem_region->nr_pages++;
    991	} while (memory_size < mem_region.memory_size);
    992
    993	if ((ne_enclave->nr_mem_regions + phys_contig_mem_regions.num) >
    994	    ne_enclave->max_mem_regions) {
    995		dev_err_ratelimited(ne_misc_dev.this_device,
    996				    "Reached max memory regions %lld\n",
    997				    ne_enclave->max_mem_regions);
    998
    999		rc = -NE_ERR_MEM_MAX_REGIONS;
   1000
   1001		goto put_pages;
   1002	}
   1003
   1004	for (i = 0; i < phys_contig_mem_regions.num; i++) {
   1005		u64 phys_region_addr = phys_contig_mem_regions.regions[i].start;
   1006		u64 phys_region_size = range_len(&phys_contig_mem_regions.regions[i]);
   1007
   1008		rc = ne_sanity_check_phys_mem_region(phys_region_addr, phys_region_size);
   1009		if (rc < 0)
   1010			goto put_pages;
   1011	}
   1012
   1013	ne_mem_region->memory_size = mem_region.memory_size;
   1014	ne_mem_region->userspace_addr = mem_region.userspace_addr;
   1015
   1016	list_add(&ne_mem_region->mem_region_list_entry, &ne_enclave->mem_regions_list);
   1017
   1018	for (i = 0; i < phys_contig_mem_regions.num; i++) {
   1019		struct ne_pci_dev_cmd_reply cmd_reply = {};
   1020		struct slot_add_mem_req slot_add_mem_req = {};
   1021
   1022		slot_add_mem_req.slot_uid = ne_enclave->slot_uid;
   1023		slot_add_mem_req.paddr = phys_contig_mem_regions.regions[i].start;
   1024		slot_add_mem_req.size = range_len(&phys_contig_mem_regions.regions[i]);
   1025
   1026		rc = ne_do_request(pdev, SLOT_ADD_MEM,
   1027				   &slot_add_mem_req, sizeof(slot_add_mem_req),
   1028				   &cmd_reply, sizeof(cmd_reply));
   1029		if (rc < 0) {
   1030			dev_err_ratelimited(ne_misc_dev.this_device,
   1031					    "Error in slot add mem [rc=%d]\n", rc);
   1032
   1033			kfree(phys_contig_mem_regions.regions);
   1034
   1035			/*
    1036			 * Exit here without putting the pages, as memory regions
    1037			 * may have already been added.
   1038			 */
   1039			return rc;
   1040		}
   1041
   1042		ne_enclave->mem_size += slot_add_mem_req.size;
   1043		ne_enclave->nr_mem_regions++;
   1044	}
   1045
   1046	kfree(phys_contig_mem_regions.regions);
   1047
   1048	return 0;
   1049
   1050put_pages:
   1051	for (i = 0; i < ne_mem_region->nr_pages; i++)
   1052		put_page(ne_mem_region->pages[i]);
   1053free_mem_region:
   1054	kfree(phys_contig_mem_regions.regions);
   1055	kfree(ne_mem_region->pages);
   1056	kfree(ne_mem_region);
   1057
   1058	return rc;
   1059}
   1060
   1061/**
   1062 * ne_start_enclave_ioctl() - Trigger enclave start after the enclave resources,
   1063 *			      such as memory and CPU, have been set.
   1064 * @ne_enclave :		Private data associated with the current enclave.
   1065 * @enclave_start_info :	Enclave info that includes enclave cid and flags.
   1066 *
   1067 * Context: Process context. This function is called with the ne_enclave mutex held.
   1068 * Return:
   1069 * * 0 on success.
   1070 * * Negative return value on failure.
   1071 */
   1072static int ne_start_enclave_ioctl(struct ne_enclave *ne_enclave,
   1073				  struct ne_enclave_start_info *enclave_start_info)
   1074{
   1075	struct ne_pci_dev_cmd_reply cmd_reply = {};
   1076	unsigned int cpu = 0;
   1077	struct enclave_start_req enclave_start_req = {};
   1078	unsigned int i = 0;
   1079	struct pci_dev *pdev = ne_devs.ne_pci_dev->pdev;
   1080	int rc = -EINVAL;
   1081
   1082	if (!ne_enclave->nr_mem_regions) {
   1083		dev_err_ratelimited(ne_misc_dev.this_device,
   1084				    "Enclave has no mem regions\n");
   1085
   1086		return -NE_ERR_NO_MEM_REGIONS_ADDED;
   1087	}
   1088
   1089	if (ne_enclave->mem_size < NE_MIN_ENCLAVE_MEM_SIZE) {
   1090		dev_err_ratelimited(ne_misc_dev.this_device,
   1091				    "Enclave memory is less than %ld\n",
   1092				    NE_MIN_ENCLAVE_MEM_SIZE);
   1093
   1094		return -NE_ERR_ENCLAVE_MEM_MIN_SIZE;
   1095	}
   1096
   1097	if (!ne_enclave->nr_vcpus) {
   1098		dev_err_ratelimited(ne_misc_dev.this_device,
   1099				    "Enclave has no vCPUs\n");
   1100
   1101		return -NE_ERR_NO_VCPUS_ADDED;
   1102	}
   1103
   1104	for (i = 0; i < ne_enclave->nr_parent_vm_cores; i++)
   1105		for_each_cpu(cpu, ne_enclave->threads_per_core[i])
   1106			if (!cpumask_test_cpu(cpu, ne_enclave->vcpu_ids)) {
   1107				dev_err_ratelimited(ne_misc_dev.this_device,
   1108						    "Full CPU cores not used\n");
   1109
   1110				return -NE_ERR_FULL_CORES_NOT_USED;
   1111			}
   1112
   1113	enclave_start_req.enclave_cid = enclave_start_info->enclave_cid;
   1114	enclave_start_req.flags = enclave_start_info->flags;
   1115	enclave_start_req.slot_uid = ne_enclave->slot_uid;
   1116
   1117	rc = ne_do_request(pdev, ENCLAVE_START,
   1118			   &enclave_start_req, sizeof(enclave_start_req),
   1119			   &cmd_reply, sizeof(cmd_reply));
   1120	if (rc < 0) {
   1121		dev_err_ratelimited(ne_misc_dev.this_device,
   1122				    "Error in enclave start [rc=%d]\n", rc);
   1123
   1124		return rc;
   1125	}
   1126
   1127	ne_enclave->state = NE_STATE_RUNNING;
   1128
   1129	enclave_start_info->enclave_cid = cmd_reply.enclave_cid;
   1130
   1131	return 0;
   1132}
   1133
   1134/**
   1135 * ne_enclave_ioctl() - Ioctl function provided by the enclave file.
   1136 * @file:	File associated with this ioctl function.
   1137 * @cmd:	The command that is set for the ioctl call.
   1138 * @arg:	The argument that is provided for the ioctl call.
   1139 *
   1140 * Context: Process context.
   1141 * Return:
   1142 * * 0 on success.
   1143 * * Negative return value on failure.
   1144 */
   1145static long ne_enclave_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
   1146{
   1147	struct ne_enclave *ne_enclave = file->private_data;
   1148
   1149	switch (cmd) {
   1150	case NE_ADD_VCPU: {
   1151		int rc = -EINVAL;
   1152		u32 vcpu_id = 0;
   1153
   1154		if (copy_from_user(&vcpu_id, (void __user *)arg, sizeof(vcpu_id)))
   1155			return -EFAULT;
   1156
   1157		mutex_lock(&ne_enclave->enclave_info_mutex);
   1158
   1159		if (ne_enclave->state != NE_STATE_INIT) {
   1160			dev_err_ratelimited(ne_misc_dev.this_device,
   1161					    "Enclave is not in init state\n");
   1162
   1163			mutex_unlock(&ne_enclave->enclave_info_mutex);
   1164
   1165			return -NE_ERR_NOT_IN_INIT_STATE;
   1166		}
   1167
   1168		if (vcpu_id >= (ne_enclave->nr_parent_vm_cores *
   1169		    ne_enclave->nr_threads_per_core)) {
   1170			dev_err_ratelimited(ne_misc_dev.this_device,
   1171					    "vCPU id higher than max CPU id\n");
   1172
   1173			mutex_unlock(&ne_enclave->enclave_info_mutex);
   1174
   1175			return -NE_ERR_INVALID_VCPU;
   1176		}
   1177
   1178		if (!vcpu_id) {
   1179			/* Use the CPU pool for choosing a CPU for the enclave. */
   1180			rc = ne_get_cpu_from_cpu_pool(ne_enclave, &vcpu_id);
   1181			if (rc < 0) {
   1182				dev_err_ratelimited(ne_misc_dev.this_device,
   1183						    "Error in get CPU from pool [rc=%d]\n",
   1184						    rc);
   1185
   1186				mutex_unlock(&ne_enclave->enclave_info_mutex);
   1187
   1188				return rc;
   1189			}
   1190		} else {
   1191			/* Check if the provided vCPU is available in the NE CPU pool. */
   1192			rc = ne_check_cpu_in_cpu_pool(ne_enclave, vcpu_id);
   1193			if (rc < 0) {
   1194				dev_err_ratelimited(ne_misc_dev.this_device,
   1195						    "Error in check CPU %d in pool [rc=%d]\n",
   1196						    vcpu_id, rc);
   1197
   1198				mutex_unlock(&ne_enclave->enclave_info_mutex);
   1199
   1200				return rc;
   1201			}
   1202		}
   1203
   1204		rc = ne_add_vcpu_ioctl(ne_enclave, vcpu_id);
   1205		if (rc < 0) {
   1206			mutex_unlock(&ne_enclave->enclave_info_mutex);
   1207
   1208			return rc;
   1209		}
   1210
   1211		mutex_unlock(&ne_enclave->enclave_info_mutex);
   1212
   1213		if (copy_to_user((void __user *)arg, &vcpu_id, sizeof(vcpu_id)))
   1214			return -EFAULT;
   1215
   1216		return 0;
   1217	}
   1218
   1219	case NE_GET_IMAGE_LOAD_INFO: {
   1220		struct ne_image_load_info image_load_info = {};
   1221
   1222		if (copy_from_user(&image_load_info, (void __user *)arg, sizeof(image_load_info)))
   1223			return -EFAULT;
   1224
   1225		mutex_lock(&ne_enclave->enclave_info_mutex);
   1226
   1227		if (ne_enclave->state != NE_STATE_INIT) {
   1228			dev_err_ratelimited(ne_misc_dev.this_device,
   1229					    "Enclave is not in init state\n");
   1230
   1231			mutex_unlock(&ne_enclave->enclave_info_mutex);
   1232
   1233			return -NE_ERR_NOT_IN_INIT_STATE;
   1234		}
   1235
   1236		mutex_unlock(&ne_enclave->enclave_info_mutex);
   1237
   1238		if (!image_load_info.flags ||
   1239		    image_load_info.flags >= NE_IMAGE_LOAD_MAX_FLAG_VAL) {
   1240			dev_err_ratelimited(ne_misc_dev.this_device,
   1241					    "Incorrect flag in enclave image load info\n");
   1242
   1243			return -NE_ERR_INVALID_FLAG_VALUE;
   1244		}
   1245
   1246		if (image_load_info.flags == NE_EIF_IMAGE)
   1247			image_load_info.memory_offset = NE_EIF_LOAD_OFFSET;
   1248
   1249		if (copy_to_user((void __user *)arg, &image_load_info, sizeof(image_load_info)))
   1250			return -EFAULT;
   1251
   1252		return 0;
   1253	}
   1254
   1255	case NE_SET_USER_MEMORY_REGION: {
   1256		struct ne_user_memory_region mem_region = {};
   1257		int rc = -EINVAL;
   1258
   1259		if (copy_from_user(&mem_region, (void __user *)arg, sizeof(mem_region)))
   1260			return -EFAULT;
   1261
   1262		if (mem_region.flags >= NE_MEMORY_REGION_MAX_FLAG_VAL) {
   1263			dev_err_ratelimited(ne_misc_dev.this_device,
   1264					    "Incorrect flag for user memory region\n");
   1265
   1266			return -NE_ERR_INVALID_FLAG_VALUE;
   1267		}
   1268
   1269		mutex_lock(&ne_enclave->enclave_info_mutex);
   1270
   1271		if (ne_enclave->state != NE_STATE_INIT) {
   1272			dev_err_ratelimited(ne_misc_dev.this_device,
   1273					    "Enclave is not in init state\n");
   1274
   1275			mutex_unlock(&ne_enclave->enclave_info_mutex);
   1276
   1277			return -NE_ERR_NOT_IN_INIT_STATE;
   1278		}
   1279
   1280		rc = ne_set_user_memory_region_ioctl(ne_enclave, mem_region);
   1281		if (rc < 0) {
   1282			mutex_unlock(&ne_enclave->enclave_info_mutex);
   1283
   1284			return rc;
   1285		}
   1286
   1287		mutex_unlock(&ne_enclave->enclave_info_mutex);
   1288
   1289		return 0;
   1290	}
   1291
   1292	case NE_START_ENCLAVE: {
   1293		struct ne_enclave_start_info enclave_start_info = {};
   1294		int rc = -EINVAL;
   1295
   1296		if (copy_from_user(&enclave_start_info, (void __user *)arg,
   1297				   sizeof(enclave_start_info)))
   1298			return -EFAULT;
   1299
   1300		if (enclave_start_info.flags >= NE_ENCLAVE_START_MAX_FLAG_VAL) {
   1301			dev_err_ratelimited(ne_misc_dev.this_device,
   1302					    "Incorrect flag in enclave start info\n");
   1303
   1304			return -NE_ERR_INVALID_FLAG_VALUE;
   1305		}
   1306
   1307		/*
   1308		 * Do not use well-known CIDs - 0, 1, 2 - for enclaves.
   1309		 * VMADDR_CID_ANY = -1U
   1310		 * VMADDR_CID_HYPERVISOR = 0
   1311		 * VMADDR_CID_LOCAL = 1
   1312		 * VMADDR_CID_HOST = 2
   1313		 * Note: 0 is used as a placeholder to auto-generate an enclave CID.
   1314		 * http://man7.org/linux/man-pages/man7/vsock.7.html
   1315		 */
   1316		if (enclave_start_info.enclave_cid > 0 &&
   1317		    enclave_start_info.enclave_cid <= VMADDR_CID_HOST) {
   1318			dev_err_ratelimited(ne_misc_dev.this_device,
   1319					    "Well-known CID value, not to be used for enclaves\n");
   1320
   1321			return -NE_ERR_INVALID_ENCLAVE_CID;
   1322		}
   1323
   1324		if (enclave_start_info.enclave_cid == U32_MAX) {
   1325			dev_err_ratelimited(ne_misc_dev.this_device,
   1326					    "Well-known CID value, not to be used for enclaves\n");
   1327
   1328			return -NE_ERR_INVALID_ENCLAVE_CID;
   1329		}
   1330
   1331		/*
   1332		 * Do not use the CID of the primary / parent VM for enclaves.
   1333		 */
   1334		if (enclave_start_info.enclave_cid == NE_PARENT_VM_CID) {
   1335			dev_err_ratelimited(ne_misc_dev.this_device,
   1336					    "CID of the parent VM, not to be used for enclaves\n");
   1337
   1338			return -NE_ERR_INVALID_ENCLAVE_CID;
   1339		}
   1340
   1341		/* 64-bit CIDs are not yet supported for the vsock device. */
   1342		if (enclave_start_info.enclave_cid > U32_MAX) {
   1343			dev_err_ratelimited(ne_misc_dev.this_device,
   1344					    "64-bit CIDs not yet supported for the vsock device\n");
   1345
   1346			return -NE_ERR_INVALID_ENCLAVE_CID;
   1347		}
   1348
   1349		mutex_lock(&ne_enclave->enclave_info_mutex);
   1350
   1351		if (ne_enclave->state != NE_STATE_INIT) {
   1352			dev_err_ratelimited(ne_misc_dev.this_device,
   1353					    "Enclave is not in init state\n");
   1354
   1355			mutex_unlock(&ne_enclave->enclave_info_mutex);
   1356
   1357			return -NE_ERR_NOT_IN_INIT_STATE;
   1358		}
   1359
   1360		rc = ne_start_enclave_ioctl(ne_enclave, &enclave_start_info);
   1361		if (rc < 0) {
   1362			mutex_unlock(&ne_enclave->enclave_info_mutex);
   1363
   1364			return rc;
   1365		}
   1366
   1367		mutex_unlock(&ne_enclave->enclave_info_mutex);
   1368
   1369		if (copy_to_user((void __user *)arg, &enclave_start_info,
   1370				 sizeof(enclave_start_info)))
   1371			return -EFAULT;
   1372
   1373		return 0;
   1374	}
   1375
   1376	default:
   1377		return -ENOTTY;
   1378	}
   1379
   1380	return 0;
   1381}
   1382
   1383/**
   1384 * ne_enclave_remove_all_mem_region_entries() - Remove all memory region entries
   1385 *						from the enclave data structure.
   1386 * @ne_enclave :	Private data associated with the current enclave.
   1387 *
   1388 * Context: Process context. This function is called with the ne_enclave mutex held.
   1389 */
   1390static void ne_enclave_remove_all_mem_region_entries(struct ne_enclave *ne_enclave)
   1391{
   1392	unsigned long i = 0;
   1393	struct ne_mem_region *ne_mem_region = NULL;
   1394	struct ne_mem_region *ne_mem_region_tmp = NULL;
   1395
   1396	list_for_each_entry_safe(ne_mem_region, ne_mem_region_tmp,
   1397				 &ne_enclave->mem_regions_list,
   1398				 mem_region_list_entry) {
   1399		list_del(&ne_mem_region->mem_region_list_entry);
   1400
   1401		for (i = 0; i < ne_mem_region->nr_pages; i++)
   1402			put_page(ne_mem_region->pages[i]);
   1403
   1404		kfree(ne_mem_region->pages);
   1405
   1406		kfree(ne_mem_region);
   1407	}
   1408}
   1409
   1410/**
   1411 * ne_enclave_remove_all_vcpu_id_entries() - Remove all vCPU id entries from
   1412 *					     the enclave data structure.
   1413 * @ne_enclave :	Private data associated with the current enclave.
   1414 *
   1415 * Context: Process context. This function is called with the ne_enclave mutex held.
   1416 */
   1417static void ne_enclave_remove_all_vcpu_id_entries(struct ne_enclave *ne_enclave)
   1418{
   1419	unsigned int cpu = 0;
   1420	unsigned int i = 0;
   1421
   1422	mutex_lock(&ne_cpu_pool.mutex);
   1423
   1424	for (i = 0; i < ne_enclave->nr_parent_vm_cores; i++) {
   1425		for_each_cpu(cpu, ne_enclave->threads_per_core[i])
   1426			/* Update the available NE CPU pool. */
   1427			cpumask_set_cpu(cpu, ne_cpu_pool.avail_threads_per_core[i]);
   1428
   1429		free_cpumask_var(ne_enclave->threads_per_core[i]);
   1430	}
   1431
   1432	mutex_unlock(&ne_cpu_pool.mutex);
   1433
   1434	kfree(ne_enclave->threads_per_core);
   1435
   1436	free_cpumask_var(ne_enclave->vcpu_ids);
   1437}
   1438
   1439/**
   1440 * ne_pci_dev_remove_enclave_entry() - Remove the enclave entry from the data
   1441 *				       structure that is part of the NE PCI
   1442 *				       device private data.
   1443 * @ne_enclave :	Private data associated with the current enclave.
   1444 * @ne_pci_dev :	Private data associated with the PCI device.
   1445 *
   1446 * Context: Process context. This function is called with the ne_pci_dev enclave
   1447 *	    mutex held.
   1448 */
   1449static void ne_pci_dev_remove_enclave_entry(struct ne_enclave *ne_enclave,
   1450					    struct ne_pci_dev *ne_pci_dev)
   1451{
   1452	struct ne_enclave *ne_enclave_entry = NULL;
   1453	struct ne_enclave *ne_enclave_entry_tmp = NULL;
   1454
   1455	list_for_each_entry_safe(ne_enclave_entry, ne_enclave_entry_tmp,
   1456				 &ne_pci_dev->enclaves_list, enclave_list_entry) {
   1457		if (ne_enclave_entry->slot_uid == ne_enclave->slot_uid) {
   1458			list_del(&ne_enclave_entry->enclave_list_entry);
   1459
   1460			break;
   1461		}
   1462	}
   1463}
   1464
   1465/**
   1466 * ne_enclave_release() - Release function provided by the enclave file.
   1467 * @inode:	Inode associated with this file release function.
   1468 * @file:	File associated with this release function.
   1469 *
   1470 * Context: Process context.
   1471 * Return:
   1472 * * 0 on success.
   1473 * * Negative return value on failure.
   1474 */
   1475static int ne_enclave_release(struct inode *inode, struct file *file)
   1476{
   1477	struct ne_pci_dev_cmd_reply cmd_reply = {};
   1478	struct enclave_stop_req enclave_stop_request = {};
   1479	struct ne_enclave *ne_enclave = file->private_data;
   1480	struct ne_pci_dev *ne_pci_dev = ne_devs.ne_pci_dev;
   1481	struct pci_dev *pdev = ne_pci_dev->pdev;
   1482	int rc = -EINVAL;
   1483	struct slot_free_req slot_free_req = {};
   1484
   1485	if (!ne_enclave)
   1486		return 0;
   1487
   1488	/*
   1489	 * Early exit in case there is an error in the enclave creation logic
   1490	 * and fput() is called on the cleanup path.
   1491	 */
   1492	if (!ne_enclave->slot_uid)
   1493		return 0;
   1494
   1495	/*
   1496	 * Acquire the enclave list mutex before the enclave mutex
   1497	 * in order to avoid deadlocks with @ref ne_event_work_handler.
   1498	 */
   1499	mutex_lock(&ne_pci_dev->enclaves_list_mutex);
   1500	mutex_lock(&ne_enclave->enclave_info_mutex);
   1501
   1502	if (ne_enclave->state != NE_STATE_INIT && ne_enclave->state != NE_STATE_STOPPED) {
   1503		enclave_stop_request.slot_uid = ne_enclave->slot_uid;
   1504
   1505		rc = ne_do_request(pdev, ENCLAVE_STOP,
   1506				   &enclave_stop_request, sizeof(enclave_stop_request),
   1507				   &cmd_reply, sizeof(cmd_reply));
   1508		if (rc < 0) {
   1509			dev_err_ratelimited(ne_misc_dev.this_device,
   1510					    "Error in enclave stop [rc=%d]\n", rc);
   1511
   1512			goto unlock_mutex;
   1513		}
   1514
   1515		memset(&cmd_reply, 0, sizeof(cmd_reply));
   1516	}
   1517
   1518	slot_free_req.slot_uid = ne_enclave->slot_uid;
   1519
   1520	rc = ne_do_request(pdev, SLOT_FREE,
   1521			   &slot_free_req, sizeof(slot_free_req),
   1522			   &cmd_reply, sizeof(cmd_reply));
   1523	if (rc < 0) {
   1524		dev_err_ratelimited(ne_misc_dev.this_device,
   1525				    "Error in slot free [rc=%d]\n", rc);
   1526
   1527		goto unlock_mutex;
   1528	}
   1529
   1530	ne_pci_dev_remove_enclave_entry(ne_enclave, ne_pci_dev);
   1531	ne_enclave_remove_all_mem_region_entries(ne_enclave);
   1532	ne_enclave_remove_all_vcpu_id_entries(ne_enclave);
   1533
   1534	mutex_unlock(&ne_enclave->enclave_info_mutex);
   1535	mutex_unlock(&ne_pci_dev->enclaves_list_mutex);
   1536
   1537	kfree(ne_enclave);
   1538
   1539	return 0;
   1540
   1541unlock_mutex:
   1542	mutex_unlock(&ne_enclave->enclave_info_mutex);
   1543	mutex_unlock(&ne_pci_dev->enclaves_list_mutex);
   1544
   1545	return rc;
   1546}
   1547
   1548/**
   1549 * ne_enclave_poll() - Poll functionality used for enclave out-of-band events.
   1550 * @file:	File associated with this poll function.
   1551 * @wait:	Poll table data structure.
   1552 *
   1553 * Context: Process context.
   1554 * Return:
   1555 * * Poll mask.
   1556 */
   1557static __poll_t ne_enclave_poll(struct file *file, poll_table *wait)
   1558{
   1559	__poll_t mask = 0;
   1560	struct ne_enclave *ne_enclave = file->private_data;
   1561
   1562	poll_wait(file, &ne_enclave->eventq, wait);
   1563
   1564	if (ne_enclave->has_event)
   1565		mask |= EPOLLHUP;
   1566
   1567	return mask;
   1568}
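
/*
 * Illustrative user-space side (a rough sketch, handler name hypothetical):
 * the enclave fd returned by NE_CREATE_VM can be polled, and POLLHUP in the
 * returned events indicates an out-of-band enclave exit, e.g.:
 *
 *	struct pollfd fds = { .fd = enclave_fd, .events = 0 };
 *
 *	if (poll(&fds, 1, -1) > 0 && (fds.revents & POLLHUP))
 *		handle_enclave_exit();
 */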
   1569
   1570static const struct file_operations ne_enclave_fops = {
   1571	.owner		= THIS_MODULE,
   1572	.llseek		= noop_llseek,
   1573	.poll		= ne_enclave_poll,
   1574	.unlocked_ioctl	= ne_enclave_ioctl,
   1575	.release	= ne_enclave_release,
   1576};
   1577
   1578/**
   1579 * ne_create_vm_ioctl() - Alloc slot to be associated with an enclave. Create
   1580 *			  enclave file descriptor to be further used for enclave
   1581 *			  resources handling e.g. memory regions and CPUs.
   1582 * @ne_pci_dev :	Private data associated with the PCI device.
   1583 * @slot_uid:		User pointer to store the generated unique slot id
   1584 *			associated with an enclave to.
   1585 *
   1586 * Context: Process context. This function is called with the ne_pci_dev enclave
   1587 *	    mutex held.
   1588 * Return:
   1589 * * Enclave fd on success.
   1590 * * Negative return value on failure.
   1591 */
   1592static int ne_create_vm_ioctl(struct ne_pci_dev *ne_pci_dev, u64 __user *slot_uid)
   1593{
   1594	struct ne_pci_dev_cmd_reply cmd_reply = {};
   1595	int enclave_fd = -1;
   1596	struct file *enclave_file = NULL;
   1597	unsigned int i = 0;
   1598	struct ne_enclave *ne_enclave = NULL;
   1599	struct pci_dev *pdev = ne_pci_dev->pdev;
   1600	int rc = -EINVAL;
   1601	struct slot_alloc_req slot_alloc_req = {};
   1602
   1603	mutex_lock(&ne_cpu_pool.mutex);
   1604
   1605	for (i = 0; i < ne_cpu_pool.nr_parent_vm_cores; i++)
   1606		if (!cpumask_empty(ne_cpu_pool.avail_threads_per_core[i]))
   1607			break;
   1608
   1609	if (i == ne_cpu_pool.nr_parent_vm_cores) {
   1610		dev_err_ratelimited(ne_misc_dev.this_device,
   1611				    "No CPUs available in CPU pool\n");
   1612
   1613		mutex_unlock(&ne_cpu_pool.mutex);
   1614
   1615		return -NE_ERR_NO_CPUS_AVAIL_IN_POOL;
   1616	}
   1617
   1618	mutex_unlock(&ne_cpu_pool.mutex);
   1619
   1620	ne_enclave = kzalloc(sizeof(*ne_enclave), GFP_KERNEL);
   1621	if (!ne_enclave)
   1622		return -ENOMEM;
   1623
   1624	mutex_lock(&ne_cpu_pool.mutex);
   1625
   1626	ne_enclave->nr_parent_vm_cores = ne_cpu_pool.nr_parent_vm_cores;
   1627	ne_enclave->nr_threads_per_core = ne_cpu_pool.nr_threads_per_core;
   1628	ne_enclave->numa_node = ne_cpu_pool.numa_node;
   1629
   1630	mutex_unlock(&ne_cpu_pool.mutex);
   1631
   1632	ne_enclave->threads_per_core = kcalloc(ne_enclave->nr_parent_vm_cores,
   1633					       sizeof(*ne_enclave->threads_per_core),
   1634					       GFP_KERNEL);
   1635	if (!ne_enclave->threads_per_core) {
   1636		rc = -ENOMEM;
   1637
   1638		goto free_ne_enclave;
   1639	}
   1640
   1641	for (i = 0; i < ne_enclave->nr_parent_vm_cores; i++)
   1642		if (!zalloc_cpumask_var(&ne_enclave->threads_per_core[i], GFP_KERNEL)) {
   1643			rc = -ENOMEM;
   1644
   1645			goto free_cpumask;
   1646		}
   1647
   1648	if (!zalloc_cpumask_var(&ne_enclave->vcpu_ids, GFP_KERNEL)) {
   1649		rc = -ENOMEM;
   1650
   1651		goto free_cpumask;
   1652	}
   1653
   1654	enclave_fd = get_unused_fd_flags(O_CLOEXEC);
   1655	if (enclave_fd < 0) {
   1656		rc = enclave_fd;
   1657
   1658		dev_err_ratelimited(ne_misc_dev.this_device,
   1659				    "Error in getting unused fd [rc=%d]\n", rc);
   1660
   1661		goto free_cpumask;
   1662	}
   1663
   1664	enclave_file = anon_inode_getfile("ne-vm", &ne_enclave_fops, ne_enclave, O_RDWR);
   1665	if (IS_ERR(enclave_file)) {
   1666		rc = PTR_ERR(enclave_file);
   1667
   1668		dev_err_ratelimited(ne_misc_dev.this_device,
   1669				    "Error in anon inode get file [rc=%d]\n", rc);
   1670
   1671		goto put_fd;
   1672	}
   1673
   1674	rc = ne_do_request(pdev, SLOT_ALLOC,
   1675			   &slot_alloc_req, sizeof(slot_alloc_req),
   1676			   &cmd_reply, sizeof(cmd_reply));
   1677	if (rc < 0) {
   1678		dev_err_ratelimited(ne_misc_dev.this_device,
   1679				    "Error in slot alloc [rc=%d]\n", rc);
   1680
   1681		goto put_file;
   1682	}
   1683
   1684	init_waitqueue_head(&ne_enclave->eventq);
   1685	ne_enclave->has_event = false;
   1686	mutex_init(&ne_enclave->enclave_info_mutex);
   1687	ne_enclave->max_mem_regions = cmd_reply.mem_regions;
   1688	INIT_LIST_HEAD(&ne_enclave->mem_regions_list);
   1689	ne_enclave->mm = current->mm;
   1690	ne_enclave->slot_uid = cmd_reply.slot_uid;
   1691	ne_enclave->state = NE_STATE_INIT;
   1692
   1693	list_add(&ne_enclave->enclave_list_entry, &ne_pci_dev->enclaves_list);
   1694
   1695	if (copy_to_user(slot_uid, &ne_enclave->slot_uid, sizeof(ne_enclave->slot_uid))) {
   1696		/*
   1697		 * As we're holding the only reference to 'enclave_file', fput()
   1698		 * will call ne_enclave_release() which will do a proper cleanup
   1699		 * of all so far allocated resources, leaving only the unused fd
   1700		 * for us to free.
   1701		 */
   1702		fput(enclave_file);
   1703		put_unused_fd(enclave_fd);
   1704
   1705		return -EFAULT;
   1706	}
   1707
   1708	fd_install(enclave_fd, enclave_file);
   1709
   1710	return enclave_fd;
   1711
   1712put_file:
   1713	fput(enclave_file);
   1714put_fd:
   1715	put_unused_fd(enclave_fd);
   1716free_cpumask:
   1717	free_cpumask_var(ne_enclave->vcpu_ids);
   1718	for (i = 0; i < ne_enclave->nr_parent_vm_cores; i++)
   1719		free_cpumask_var(ne_enclave->threads_per_core[i]);
   1720	kfree(ne_enclave->threads_per_core);
   1721free_ne_enclave:
   1722	kfree(ne_enclave);
   1723
   1724	return rc;
   1725}
   1726
   1727/**
   1728 * ne_ioctl() - Ioctl function provided by the NE misc device.
   1729 * @file:	File associated with this ioctl function.
   1730 * @cmd:	The command that is set for the ioctl call.
   1731 * @arg:	The argument that is provided for the ioctl call.
   1732 *
   1733 * Context: Process context.
   1734 * Return:
   1735 * * Ioctl result (e.g. enclave file descriptor) on success.
   1736 * * Negative return value on failure.
   1737 */
   1738static long ne_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
   1739{
   1740	switch (cmd) {
   1741	case NE_CREATE_VM: {
   1742		int enclave_fd = -1;
   1743		struct ne_pci_dev *ne_pci_dev = ne_devs.ne_pci_dev;
   1744		u64 __user *slot_uid = (void __user *)arg;
   1745
   1746		mutex_lock(&ne_pci_dev->enclaves_list_mutex);
   1747		enclave_fd = ne_create_vm_ioctl(ne_pci_dev, slot_uid);
   1748		mutex_unlock(&ne_pci_dev->enclaves_list_mutex);
   1749
   1750		return enclave_fd;
   1751	}
   1752
   1753	default:
   1754		return -ENOTTY;
   1755	}
   1756
   1757	return 0;
   1758}
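
/*
 * Illustrative end-to-end flow from user space (a rough sketch; struct, flag
 * and ioctl names are from include/uapi/linux/nitro_enclaves.h, everything
 * else, e.g. buffer setup and error handling, is assumed or omitted):
 *
 *	int ne_fd = open("/dev/nitro_enclaves", O_RDWR | O_CLOEXEC);
 *	__u64 slot_uid = 0;
 *	int enclave_fd = ioctl(ne_fd, NE_CREATE_VM, &slot_uid);
 *
 *	struct ne_image_load_info load_info = { .flags = NE_EIF_IMAGE };
 *	ioctl(enclave_fd, NE_GET_IMAGE_LOAD_INFO, &load_info);
 *	(copy the EIF image at load_info.memory_offset inside enclave memory)
 *
 *	for each hugetlbfs-backed buffer mmap()ed for the enclave:
 *	struct ne_user_memory_region region = {
 *		.flags = NE_DEFAULT_MEMORY_REGION,
 *		.memory_size = buf_size,
 *		.userspace_addr = (__u64)buf,
 *	};
 *	ioctl(enclave_fd, NE_SET_USER_MEMORY_REGION, &region);
 *
 *	for each vCPU (0 lets the driver pick one from the NE CPU pool):
 *	__u32 vcpu_id = 0;
 *	ioctl(enclave_fd, NE_ADD_VCPU, &vcpu_id);
 *
 *	struct ne_enclave_start_info start_info = { .flags = 0 };
 *	ioctl(enclave_fd, NE_START_ENCLAVE, &start_info);
 *	(start_info.enclave_cid now holds the CID to use over vsock)
 *
 * Each ioctl can fail with one of the NE_ERR_* codes checked in this file.
 */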
   1759
   1760#if defined(CONFIG_NITRO_ENCLAVES_MISC_DEV_TEST)
   1761#include "ne_misc_dev_test.c"
   1762
   1763static inline int ne_misc_dev_test_init(void)
   1764{
   1765	return __kunit_test_suites_init(ne_misc_dev_test_suites);
   1766}
   1767
   1768static inline void ne_misc_dev_test_exit(void)
   1769{
   1770	__kunit_test_suites_exit(ne_misc_dev_test_suites);
   1771}
   1772#else
   1773static inline int ne_misc_dev_test_init(void)
   1774{
   1775	return 0;
   1776}
   1777
   1778static inline void ne_misc_dev_test_exit(void)
   1779{
   1780}
   1781#endif
   1782
   1783static int __init ne_init(void)
   1784{
   1785	int rc = 0;
   1786
   1787	rc = ne_misc_dev_test_init();
   1788	if (rc < 0)
   1789		return rc;
   1790
   1791	mutex_init(&ne_cpu_pool.mutex);
   1792
   1793	return pci_register_driver(&ne_pci_driver);
   1794}
   1795
   1796static void __exit ne_exit(void)
   1797{
   1798	pci_unregister_driver(&ne_pci_driver);
   1799
   1800	ne_teardown_cpu_pool();
   1801
   1802	ne_misc_dev_test_exit();
   1803}
   1804
   1805module_init(ne_init);
   1806module_exit(ne_exit);
   1807
   1808MODULE_AUTHOR("Amazon.com, Inc. or its affiliates");
   1809MODULE_DESCRIPTION("Nitro Enclaves Driver");
   1810MODULE_LICENSE("GPL v2");