cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

encl.c (25854B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*  Copyright(c) 2016-20 Intel Corporation. */
      3
      4#include <linux/lockdep.h>
      5#include <linux/mm.h>
      6#include <linux/mman.h>
      7#include <linux/shmem_fs.h>
      8#include <linux/suspend.h>
      9#include <linux/sched/mm.h>
     10#include <asm/sgx.h>
     11#include "encl.h"
     12#include "encls.h"
     13#include "sgx.h"
     14
     15#define PCMDS_PER_PAGE (PAGE_SIZE / sizeof(struct sgx_pcmd))
     16/*
     17 * 32 PCMD entries share a PCMD page. PCMD_FIRST_MASK is used to
     18 * determine the page index associated with the first PCMD entry
     19 * within a PCMD page.
     20 */
     21#define PCMD_FIRST_MASK GENMASK(4, 0)
     22
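/*
 * Worked example of the PCMD grouping (illustrative; assumes the
 * architectural 128-byte struct sgx_pcmd and 4096-byte pages):
 *
 *   PCMDS_PER_PAGE  = 4096 / 128    = 32
 *   PCMD_FIRST_MASK = GENMASK(4, 0) = 0x1f
 *
 * An enclave page with page index 53 therefore shares its PCMD page with
 * the pages at indices 32..63, and the enclave page owning the first PCMD
 * entry of that group sits at encl->base + 32 * PAGE_SIZE.  This is the
 * "pcmd_first_page" address passed to reclaimer_writing_to_pcmd() below.
 */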
     23/**
     24 * reclaimer_writing_to_pcmd() - Query if any enclave page associated with
     25 *                               a PCMD page is in process of being reclaimed.
     26 * @encl:        Enclave to which PCMD page belongs
     27 * @start_addr:  Address of enclave page using first entry within the PCMD page
     28 *
     29 * When an enclave page is reclaimed some Paging Crypto MetaData (PCMD) is
     30 * stored. The PCMD data of a reclaimed enclave page contains enough
     31 * information for the processor to verify the page at the time
     32 * it is loaded back into the Enclave Page Cache (EPC).
     33 *
     34 * The backing storage to which enclave pages are reclaimed is laid out as
     35 * follows:
     36 * Encrypted enclave pages:SECS page:PCMD pages
     37 *
     38 * Each PCMD page contains the PCMD metadata of
     39 * PAGE_SIZE/sizeof(struct sgx_pcmd) enclave pages.
     40 *
     41 * A PCMD page can only be truncated if it is (a) empty, and (b) not in the
      42 * process of getting data (and thus soon being non-empty). (b) is tested by
      43 * checking whether any enclave page sharing the PCMD page is in the process
      44 * of being reclaimed.
     45 *
     46 * The reclaimer sets the SGX_ENCL_PAGE_BEING_RECLAIMED flag when it
     47 * intends to reclaim that enclave page - it means that the PCMD page
     48 * associated with that enclave page is about to get some data and thus
     49 * even if the PCMD page is empty, it should not be truncated.
     50 *
     51 * Context: Enclave mutex (&sgx_encl->lock) must be held.
     52 * Return: 1 if the reclaimer is about to write to the PCMD page
     53 *         0 if the reclaimer has no intention to write to the PCMD page
     54 */
     55static int reclaimer_writing_to_pcmd(struct sgx_encl *encl,
     56				     unsigned long start_addr)
     57{
     58	int reclaimed = 0;
     59	int i;
     60
     61	/*
     62	 * PCMD_FIRST_MASK is based on number of PCMD entries within
     63	 * PCMD page being 32.
     64	 */
     65	BUILD_BUG_ON(PCMDS_PER_PAGE != 32);
     66
     67	for (i = 0; i < PCMDS_PER_PAGE; i++) {
     68		struct sgx_encl_page *entry;
     69		unsigned long addr;
     70
     71		addr = start_addr + i * PAGE_SIZE;
     72
     73		/*
     74		 * Stop when reaching the SECS page - it does not
     75		 * have a page_array entry and its reclaim is
     76		 * started and completed with enclave mutex held so
     77		 * it does not use the SGX_ENCL_PAGE_BEING_RECLAIMED
     78		 * flag.
     79		 */
     80		if (addr == encl->base + encl->size)
     81			break;
     82
     83		entry = xa_load(&encl->page_array, PFN_DOWN(addr));
     84		if (!entry)
     85			continue;
     86
     87		/*
      88		 * The VA page slot ID uses the same bit as the flag, so it is
      89		 * important to ensure the page is not already in backing store.
     90		 */
     91		if (entry->epc_page &&
     92		    (entry->desc & SGX_ENCL_PAGE_BEING_RECLAIMED)) {
     93			reclaimed = 1;
     94			break;
     95		}
     96	}
     97
     98	return reclaimed;
     99}
    100
    101/*
     102 * Calculate byte offset of a PCMD struct associated with an enclave page. PCMDs
    103 * follow right after the EPC data in the backing storage. In addition to the
    104 * visible enclave pages, there's one extra page slot for SECS, before PCMD
    105 * structs.
    106 */
    107static inline pgoff_t sgx_encl_get_backing_page_pcmd_offset(struct sgx_encl *encl,
    108							    unsigned long page_index)
    109{
    110	pgoff_t epc_end_off = encl->size + sizeof(struct sgx_secs);
    111
    112	return epc_end_off + page_index * sizeof(struct sgx_pcmd);
    113}
    114
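/*
 * Worked example for sgx_encl_get_backing_page_pcmd_offset() above
 * (illustrative; the enclave size is hypothetical, sizeof(struct sgx_secs)
 * is one page and sizeof(struct sgx_pcmd) is 128 bytes):
 *
 * For an enclave with encl->size = 0x4000 the backing file looks like
 *
 *   [0x0000, 0x3fff]  encrypted enclave pages
 *   [0x4000, 0x4fff]  SECS page
 *   [0x5000, ...   ]  PCMD entries, 128 bytes each
 *
 * so sgx_encl_get_backing_page_pcmd_offset(encl, 2) returns
 * 0x4000 + 0x1000 + 2 * 128 = 0x5100: the PCMD of enclave page 2 lives in
 * backing page PFN_DOWN(0x5100) = 5 at offset 0x100 within that page, which
 * is the b.pcmd_offset used by __sgx_encl_eldu() below.
 */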
    115/*
    116 * Free a page from the backing storage in the given page index.
    117 */
    118static inline void sgx_encl_truncate_backing_page(struct sgx_encl *encl, unsigned long page_index)
    119{
    120	struct inode *inode = file_inode(encl->backing);
    121
    122	shmem_truncate_range(inode, PFN_PHYS(page_index), PFN_PHYS(page_index) + PAGE_SIZE - 1);
    123}
    124
    125/*
    126 * ELDU: Load an EPC page as unblocked. For more info, see "OS Management of EPC
    127 * Pages" in the SDM.
    128 */
    129static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
    130			   struct sgx_epc_page *epc_page,
    131			   struct sgx_epc_page *secs_page)
    132{
    133	unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK;
    134	struct sgx_encl *encl = encl_page->encl;
    135	pgoff_t page_index, page_pcmd_off;
    136	unsigned long pcmd_first_page;
    137	struct sgx_pageinfo pginfo;
    138	struct sgx_backing b;
    139	bool pcmd_page_empty;
    140	u8 *pcmd_page;
    141	int ret;
    142
    143	if (secs_page)
    144		page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base);
    145	else
    146		page_index = PFN_DOWN(encl->size);
    147
    148	/*
    149	 * Address of enclave page using the first entry within the PCMD page.
    150	 */
    151	pcmd_first_page = PFN_PHYS(page_index & ~PCMD_FIRST_MASK) + encl->base;
    152
    153	page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index);
    154
    155	ret = sgx_encl_lookup_backing(encl, page_index, &b);
    156	if (ret)
    157		return ret;
    158
    159	pginfo.addr = encl_page->desc & PAGE_MASK;
    160	pginfo.contents = (unsigned long)kmap_atomic(b.contents);
    161	pcmd_page = kmap_atomic(b.pcmd);
    162	pginfo.metadata = (unsigned long)pcmd_page + b.pcmd_offset;
    163
    164	if (secs_page)
    165		pginfo.secs = (u64)sgx_get_epc_virt_addr(secs_page);
    166	else
    167		pginfo.secs = 0;
    168
    169	ret = __eldu(&pginfo, sgx_get_epc_virt_addr(epc_page),
    170		     sgx_get_epc_virt_addr(encl_page->va_page->epc_page) + va_offset);
    171	if (ret) {
    172		if (encls_failed(ret))
    173			ENCLS_WARN(ret, "ELDU");
    174
    175		ret = -EFAULT;
    176	}
    177
    178	memset(pcmd_page + b.pcmd_offset, 0, sizeof(struct sgx_pcmd));
    179	set_page_dirty(b.pcmd);
    180
    181	/*
    182	 * The area for the PCMD in the page was zeroed above.  Check if the
     183	 * whole page is now empty, meaning that all PCMDs have been zeroed:
    184	 */
    185	pcmd_page_empty = !memchr_inv(pcmd_page, 0, PAGE_SIZE);
    186
    187	kunmap_atomic(pcmd_page);
    188	kunmap_atomic((void *)(unsigned long)pginfo.contents);
    189
    190	get_page(b.pcmd);
    191	sgx_encl_put_backing(&b);
    192
    193	sgx_encl_truncate_backing_page(encl, page_index);
    194
    195	if (pcmd_page_empty && !reclaimer_writing_to_pcmd(encl, pcmd_first_page)) {
    196		sgx_encl_truncate_backing_page(encl, PFN_DOWN(page_pcmd_off));
    197		pcmd_page = kmap_atomic(b.pcmd);
    198		if (memchr_inv(pcmd_page, 0, PAGE_SIZE))
    199			pr_warn("PCMD page not empty after truncate.\n");
    200		kunmap_atomic(pcmd_page);
    201	}
    202
    203	put_page(b.pcmd);
    204
    205	return ret;
    206}
    207
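/*
 * Example of the truncation policy in __sgx_encl_eldu() above (illustrative):
 * after ELDU of the enclave page at page_index 53, its PCMD entry is zeroed
 * and the backing page holding the encrypted contents (index 53) is truncated
 * right away.  The shared PCMD backing page at PFN_DOWN(page_pcmd_off) is
 * truncated only once all 32 PCMD entries in it are zero and
 * reclaimer_writing_to_pcmd() confirms that none of the sibling pages
 * (indices 32..63 in this example) is about to be written back by the
 * reclaimer.
 */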
    208static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page,
    209					  struct sgx_epc_page *secs_page)
    210{
    211
    212	unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK;
    213	struct sgx_encl *encl = encl_page->encl;
    214	struct sgx_epc_page *epc_page;
    215	int ret;
    216
    217	epc_page = sgx_alloc_epc_page(encl_page, false);
    218	if (IS_ERR(epc_page))
    219		return epc_page;
    220
    221	ret = __sgx_encl_eldu(encl_page, epc_page, secs_page);
    222	if (ret) {
    223		sgx_encl_free_epc_page(epc_page);
    224		return ERR_PTR(ret);
    225	}
    226
    227	sgx_free_va_slot(encl_page->va_page, va_offset);
    228	list_move(&encl_page->va_page->list, &encl->va_pages);
    229	encl_page->desc &= ~SGX_ENCL_PAGE_VA_OFFSET_MASK;
    230	encl_page->epc_page = epc_page;
    231
    232	return epc_page;
    233}
    234
    235static struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl,
    236						unsigned long addr,
    237						unsigned long vm_flags)
    238{
    239	unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC);
    240	struct sgx_epc_page *epc_page;
    241	struct sgx_encl_page *entry;
    242
    243	entry = xa_load(&encl->page_array, PFN_DOWN(addr));
    244	if (!entry)
    245		return ERR_PTR(-EFAULT);
    246
    247	/*
     248	 * Verify that the faulted page has build time permissions equal to or
     249	 * higher than the VMA permissions (i.e. the subset of {VM_READ,
     250	 * VM_WRITE, VM_EXEC} in vma->vm_flags).
    251	 */
    252	if ((entry->vm_max_prot_bits & vm_prot_bits) != vm_prot_bits)
    253		return ERR_PTR(-EFAULT);
    254
    255	/* Entry successfully located. */
    256	if (entry->epc_page) {
    257		if (entry->desc & SGX_ENCL_PAGE_BEING_RECLAIMED)
    258			return ERR_PTR(-EBUSY);
    259
    260		return entry;
    261	}
    262
    263	if (!(encl->secs.epc_page)) {
    264		epc_page = sgx_encl_eldu(&encl->secs, NULL);
    265		if (IS_ERR(epc_page))
    266			return ERR_CAST(epc_page);
    267	}
    268
    269	epc_page = sgx_encl_eldu(entry, encl->secs.epc_page);
    270	if (IS_ERR(epc_page))
    271		return ERR_CAST(epc_page);
    272
    273	encl->secs_child_cnt++;
    274	sgx_mark_page_reclaimable(entry->epc_page);
    275
    276	return entry;
    277}
    278
    279static vm_fault_t sgx_vma_fault(struct vm_fault *vmf)
    280{
    281	unsigned long addr = (unsigned long)vmf->address;
    282	struct vm_area_struct *vma = vmf->vma;
    283	struct sgx_encl_page *entry;
    284	unsigned long phys_addr;
    285	struct sgx_encl *encl;
    286	vm_fault_t ret;
    287
    288	encl = vma->vm_private_data;
    289
    290	/*
    291	 * It's very unlikely but possible that allocating memory for the
    292	 * mm_list entry of a forked process failed in sgx_vma_open(). When
    293	 * this happens, vm_private_data is set to NULL.
    294	 */
    295	if (unlikely(!encl))
    296		return VM_FAULT_SIGBUS;
    297
    298	mutex_lock(&encl->lock);
    299
    300	entry = sgx_encl_load_page(encl, addr, vma->vm_flags);
    301	if (IS_ERR(entry)) {
    302		mutex_unlock(&encl->lock);
    303
    304		if (PTR_ERR(entry) == -EBUSY)
    305			return VM_FAULT_NOPAGE;
    306
    307		return VM_FAULT_SIGBUS;
    308	}
    309
    310	phys_addr = sgx_get_epc_phys_addr(entry->epc_page);
    311
    312	ret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr));
    313	if (ret != VM_FAULT_NOPAGE) {
    314		mutex_unlock(&encl->lock);
    315
    316		return VM_FAULT_SIGBUS;
    317	}
    318
    319	sgx_encl_test_and_clear_young(vma->vm_mm, entry);
    320	mutex_unlock(&encl->lock);
    321
    322	return VM_FAULT_NOPAGE;
    323}
    324
    325static void sgx_vma_open(struct vm_area_struct *vma)
    326{
    327	struct sgx_encl *encl = vma->vm_private_data;
    328
    329	/*
    330	 * It's possible but unlikely that vm_private_data is NULL. This can
    331	 * happen in a grandchild of a process, when sgx_encl_mm_add() had
    332	 * failed to allocate memory in this callback.
    333	 */
    334	if (unlikely(!encl))
    335		return;
    336
    337	if (sgx_encl_mm_add(encl, vma->vm_mm))
    338		vma->vm_private_data = NULL;
    339}
    340
    341
    342/**
    343 * sgx_encl_may_map() - Check if a requested VMA mapping is allowed
    344 * @encl:		an enclave pointer
    345 * @start:		lower bound of the address range, inclusive
    346 * @end:		upper bound of the address range, exclusive
    347 * @vm_flags:		VMA flags
    348 *
     349 * Iterate through the enclave pages contained within [@start, @end) to verify
     350 * that the requested permissions (the subset of {VM_READ, VM_WRITE, VM_EXEC}
     351 * in @vm_flags) do not exceed the build time permissions of any enclave page
     352 * within the given address range.
     353 *
     354 * An enclave creator must declare the strongest permissions that will be
     355 * needed for each enclave page. This ensures that every mapping has
     356 * permissions identical to or weaker than those declared at build time.
    357 *
    358 * Return: 0 on success, -EACCES otherwise
    359 */
    360int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start,
    361		     unsigned long end, unsigned long vm_flags)
    362{
    363	unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC);
    364	struct sgx_encl_page *page;
    365	unsigned long count = 0;
    366	int ret = 0;
    367
    368	XA_STATE(xas, &encl->page_array, PFN_DOWN(start));
    369
    370	/*
    371	 * Disallow READ_IMPLIES_EXEC tasks as their VMA permissions might
    372	 * conflict with the enclave page permissions.
    373	 */
    374	if (current->personality & READ_IMPLIES_EXEC)
    375		return -EACCES;
    376
    377	mutex_lock(&encl->lock);
    378	xas_lock(&xas);
    379	xas_for_each(&xas, page, PFN_DOWN(end - 1)) {
    380		if (~page->vm_max_prot_bits & vm_prot_bits) {
    381			ret = -EACCES;
    382			break;
    383		}
    384
    385		/* Reschedule on every XA_CHECK_SCHED iteration. */
    386		if (!(++count % XA_CHECK_SCHED)) {
    387			xas_pause(&xas);
    388			xas_unlock(&xas);
    389			mutex_unlock(&encl->lock);
    390
    391			cond_resched();
    392
    393			mutex_lock(&encl->lock);
    394			xas_lock(&xas);
    395		}
    396	}
    397	xas_unlock(&xas);
    398	mutex_unlock(&encl->lock);
    399
    400	return ret;
    401}
    402
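/*
 * Minimal sketch of the rule enforced by sgx_encl_may_map() above.  This is
 * not part of the original file; the function name and the chosen values are
 * illustrative only.  A page whose strongest build-time permissions are
 * read/write must not be mapped executable: the requested VM_EXEC bit is
 * missing from vm_max_prot_bits, so "~max & requested" becomes non-zero.
 */
static inline int sgx_may_map_example(void)
{
	unsigned long vm_max_prot_bits = VM_READ | VM_WRITE;	/* strongest permissions declared at build time */
	unsigned long vm_prot_bits = VM_READ | VM_EXEC;		/* subset of vm_flags being requested */

	/* Non-zero means a requested bit is missing from the build-time set. */
	if (~vm_max_prot_bits & vm_prot_bits)
		return -EACCES;

	return 0;
}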
    403static int sgx_vma_mprotect(struct vm_area_struct *vma, unsigned long start,
    404			    unsigned long end, unsigned long newflags)
    405{
    406	return sgx_encl_may_map(vma->vm_private_data, start, end, newflags);
    407}
    408
    409static int sgx_encl_debug_read(struct sgx_encl *encl, struct sgx_encl_page *page,
    410			       unsigned long addr, void *data)
    411{
    412	unsigned long offset = addr & ~PAGE_MASK;
    413	int ret;
    414
    415
    416	ret = __edbgrd(sgx_get_epc_virt_addr(page->epc_page) + offset, data);
    417	if (ret)
    418		return -EIO;
    419
    420	return 0;
    421}
    422
    423static int sgx_encl_debug_write(struct sgx_encl *encl, struct sgx_encl_page *page,
    424				unsigned long addr, void *data)
    425{
    426	unsigned long offset = addr & ~PAGE_MASK;
    427	int ret;
    428
    429	ret = __edbgwr(sgx_get_epc_virt_addr(page->epc_page) + offset, data);
    430	if (ret)
    431		return -EIO;
    432
    433	return 0;
    434}
    435
    436/*
    437 * Load an enclave page to EPC if required, and take encl->lock.
    438 */
    439static struct sgx_encl_page *sgx_encl_reserve_page(struct sgx_encl *encl,
    440						   unsigned long addr,
    441						   unsigned long vm_flags)
    442{
    443	struct sgx_encl_page *entry;
    444
    445	for ( ; ; ) {
    446		mutex_lock(&encl->lock);
    447
    448		entry = sgx_encl_load_page(encl, addr, vm_flags);
    449		if (PTR_ERR(entry) != -EBUSY)
    450			break;
    451
    452		mutex_unlock(&encl->lock);
    453	}
    454
    455	if (IS_ERR(entry))
    456		mutex_unlock(&encl->lock);
    457
    458	return entry;
    459}
    460
    461static int sgx_vma_access(struct vm_area_struct *vma, unsigned long addr,
    462			  void *buf, int len, int write)
    463{
    464	struct sgx_encl *encl = vma->vm_private_data;
    465	struct sgx_encl_page *entry = NULL;
    466	char data[sizeof(unsigned long)];
    467	unsigned long align;
    468	int offset;
    469	int cnt;
    470	int ret = 0;
    471	int i;
    472
    473	/*
     474	 * If the process was forked, the VMA is still there but vm_private_data
     475	 * is set to NULL.
    476	 */
    477	if (!encl)
    478		return -EFAULT;
    479
    480	if (!test_bit(SGX_ENCL_DEBUG, &encl->flags))
    481		return -EFAULT;
    482
    483	for (i = 0; i < len; i += cnt) {
    484		entry = sgx_encl_reserve_page(encl, (addr + i) & PAGE_MASK,
    485					      vma->vm_flags);
    486		if (IS_ERR(entry)) {
    487			ret = PTR_ERR(entry);
    488			break;
    489		}
    490
    491		align = ALIGN_DOWN(addr + i, sizeof(unsigned long));
    492		offset = (addr + i) & (sizeof(unsigned long) - 1);
    493		cnt = sizeof(unsigned long) - offset;
    494		cnt = min(cnt, len - i);
    495
    496		ret = sgx_encl_debug_read(encl, entry, align, data);
    497		if (ret)
    498			goto out;
    499
    500		if (write) {
    501			memcpy(data + offset, buf + i, cnt);
    502			ret = sgx_encl_debug_write(encl, entry, align, data);
    503			if (ret)
    504				goto out;
    505		} else {
    506			memcpy(buf + i, data + offset, cnt);
    507		}
    508
    509out:
    510		mutex_unlock(&encl->lock);
    511
    512		if (ret)
    513			break;
    514	}
    515
    516	return ret < 0 ? ret : i;
    517}
    518
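/*
 * Worked example of the 8-byte access window in sgx_vma_access() above
 * (illustrative): for a ptrace() read at addr + i = 0x1003 with 16 bytes
 * left, the loop uses
 *
 *   align  = ALIGN_DOWN(0x1003, 8) = 0x1000
 *   offset = 0x1003 & 7            = 3
 *   cnt    = min(8 - 3, 16)        = 5
 *
 * so EDBGRD fetches the aligned quadword at 0x1000 into data[] and only
 * data[3..7] is copied to the user buffer; writes read-modify-write the
 * same quadword through EDBGWR.
 */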
    519const struct vm_operations_struct sgx_vm_ops = {
    520	.fault = sgx_vma_fault,
    521	.mprotect = sgx_vma_mprotect,
    522	.open = sgx_vma_open,
    523	.access = sgx_vma_access,
    524};
    525
    526/**
    527 * sgx_encl_release - Destroy an enclave instance
    528 * @ref:	address of a kref inside &sgx_encl
    529 *
    530 * Used together with kref_put(). Frees all the resources associated with the
    531 * enclave and the instance itself.
    532 */
    533void sgx_encl_release(struct kref *ref)
    534{
    535	struct sgx_encl *encl = container_of(ref, struct sgx_encl, refcount);
    536	struct sgx_va_page *va_page;
    537	struct sgx_encl_page *entry;
    538	unsigned long index;
    539
    540	xa_for_each(&encl->page_array, index, entry) {
    541		if (entry->epc_page) {
    542			/*
    543			 * The page and its radix tree entry cannot be freed
    544			 * if the page is being held by the reclaimer.
    545			 */
    546			if (sgx_unmark_page_reclaimable(entry->epc_page))
    547				continue;
    548
    549			sgx_encl_free_epc_page(entry->epc_page);
    550			encl->secs_child_cnt--;
    551			entry->epc_page = NULL;
    552		}
    553
    554		kfree(entry);
    555		/* Invoke scheduler to prevent soft lockups. */
    556		cond_resched();
    557	}
    558
    559	xa_destroy(&encl->page_array);
    560
    561	if (!encl->secs_child_cnt && encl->secs.epc_page) {
    562		sgx_encl_free_epc_page(encl->secs.epc_page);
    563		encl->secs.epc_page = NULL;
    564	}
    565
    566	while (!list_empty(&encl->va_pages)) {
    567		va_page = list_first_entry(&encl->va_pages, struct sgx_va_page,
    568					   list);
    569		list_del(&va_page->list);
    570		sgx_encl_free_epc_page(va_page->epc_page);
    571		kfree(va_page);
    572	}
    573
    574	if (encl->backing)
    575		fput(encl->backing);
    576
    577	cleanup_srcu_struct(&encl->srcu);
    578
    579	WARN_ON_ONCE(!list_empty(&encl->mm_list));
    580
     581	/* Detect EPC page leaks. */
    582	WARN_ON_ONCE(encl->secs_child_cnt);
    583	WARN_ON_ONCE(encl->secs.epc_page);
    584
    585	kfree(encl);
    586}
    587
    588/*
    589 * 'mm' is exiting and no longer needs mmu notifications.
    590 */
    591static void sgx_mmu_notifier_release(struct mmu_notifier *mn,
    592				     struct mm_struct *mm)
    593{
    594	struct sgx_encl_mm *encl_mm = container_of(mn, struct sgx_encl_mm, mmu_notifier);
    595	struct sgx_encl_mm *tmp = NULL;
    596
    597	/*
    598	 * The enclave itself can remove encl_mm.  Note, objects can't be moved
    599	 * off an RCU protected list, but deletion is ok.
    600	 */
    601	spin_lock(&encl_mm->encl->mm_lock);
    602	list_for_each_entry(tmp, &encl_mm->encl->mm_list, list) {
    603		if (tmp == encl_mm) {
    604			list_del_rcu(&encl_mm->list);
    605			break;
    606		}
    607	}
    608	spin_unlock(&encl_mm->encl->mm_lock);
    609
    610	if (tmp == encl_mm) {
    611		synchronize_srcu(&encl_mm->encl->srcu);
    612		mmu_notifier_put(mn);
    613	}
    614}
    615
    616static void sgx_mmu_notifier_free(struct mmu_notifier *mn)
    617{
    618	struct sgx_encl_mm *encl_mm = container_of(mn, struct sgx_encl_mm, mmu_notifier);
    619
    620	/* 'encl_mm' is going away, put encl_mm->encl reference: */
    621	kref_put(&encl_mm->encl->refcount, sgx_encl_release);
    622
    623	kfree(encl_mm);
    624}
    625
    626static const struct mmu_notifier_ops sgx_mmu_notifier_ops = {
    627	.release		= sgx_mmu_notifier_release,
    628	.free_notifier		= sgx_mmu_notifier_free,
    629};
    630
    631static struct sgx_encl_mm *sgx_encl_find_mm(struct sgx_encl *encl,
    632					    struct mm_struct *mm)
    633{
    634	struct sgx_encl_mm *encl_mm = NULL;
    635	struct sgx_encl_mm *tmp;
    636	int idx;
    637
    638	idx = srcu_read_lock(&encl->srcu);
    639
    640	list_for_each_entry_rcu(tmp, &encl->mm_list, list) {
    641		if (tmp->mm == mm) {
    642			encl_mm = tmp;
    643			break;
    644		}
    645	}
    646
    647	srcu_read_unlock(&encl->srcu, idx);
    648
    649	return encl_mm;
    650}
    651
    652int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm)
    653{
    654	struct sgx_encl_mm *encl_mm;
    655	int ret;
    656
    657	/*
    658	 * Even though a single enclave may be mapped into an mm more than once,
    659	 * each 'mm' only appears once on encl->mm_list. This is guaranteed by
     660	 * holding the mm's mmap lock for write before an mm can be added to or
     661	 * removed from encl->mm_list.
    662	 */
    663	mmap_assert_write_locked(mm);
    664
    665	/*
    666	 * It's possible that an entry already exists in the mm_list, because it
    667	 * is removed only on VFS release or process exit.
    668	 */
    669	if (sgx_encl_find_mm(encl, mm))
    670		return 0;
    671
    672	encl_mm = kzalloc(sizeof(*encl_mm), GFP_KERNEL);
    673	if (!encl_mm)
    674		return -ENOMEM;
    675
    676	/* Grab a refcount for the encl_mm->encl reference: */
    677	kref_get(&encl->refcount);
    678	encl_mm->encl = encl;
    679	encl_mm->mm = mm;
    680	encl_mm->mmu_notifier.ops = &sgx_mmu_notifier_ops;
    681
    682	ret = __mmu_notifier_register(&encl_mm->mmu_notifier, mm);
    683	if (ret) {
    684		kfree(encl_mm);
    685		return ret;
    686	}
    687
    688	spin_lock(&encl->mm_lock);
    689	list_add_rcu(&encl_mm->list, &encl->mm_list);
    690	/* Pairs with smp_rmb() in sgx_reclaimer_block(). */
    691	smp_wmb();
    692	encl->mm_list_version++;
    693	spin_unlock(&encl->mm_lock);
    694
    695	return 0;
    696}
    697
    698static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl,
    699					      pgoff_t index)
    700{
    701	struct inode *inode = encl->backing->f_path.dentry->d_inode;
    702	struct address_space *mapping = inode->i_mapping;
    703	gfp_t gfpmask = mapping_gfp_mask(mapping);
    704
    705	return shmem_read_mapping_page_gfp(mapping, index, gfpmask);
    706}
    707
    708/**
    709 * sgx_encl_get_backing() - Pin the backing storage
    710 * @encl:	an enclave pointer
    711 * @page_index:	enclave page index
    712 * @backing:	data for accessing backing storage for the page
    713 *
    714 * Pin the backing storage pages for storing the encrypted contents and Paging
    715 * Crypto MetaData (PCMD) of an enclave page.
    716 *
    717 * Return:
    718 *   0 on success,
    719 *   -errno otherwise.
    720 */
    721static int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
    722			 struct sgx_backing *backing)
    723{
    724	pgoff_t page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index);
    725	struct page *contents;
    726	struct page *pcmd;
    727
    728	contents = sgx_encl_get_backing_page(encl, page_index);
    729	if (IS_ERR(contents))
    730		return PTR_ERR(contents);
    731
    732	pcmd = sgx_encl_get_backing_page(encl, PFN_DOWN(page_pcmd_off));
    733	if (IS_ERR(pcmd)) {
    734		put_page(contents);
    735		return PTR_ERR(pcmd);
    736	}
    737
    738	backing->page_index = page_index;
    739	backing->contents = contents;
    740	backing->pcmd = pcmd;
    741	backing->pcmd_offset = page_pcmd_off & (PAGE_SIZE - 1);
    742
    743	return 0;
    744}
    745
    746/*
    747 * When called from ksgxd, returns the mem_cgroup of a struct mm stored
    748 * in the enclave's mm_list. When not called from ksgxd, just returns
    749 * the mem_cgroup of the current task.
    750 */
    751static struct mem_cgroup *sgx_encl_get_mem_cgroup(struct sgx_encl *encl)
    752{
    753	struct mem_cgroup *memcg = NULL;
    754	struct sgx_encl_mm *encl_mm;
    755	int idx;
    756
    757	/*
    758	 * If called from normal task context, return the mem_cgroup
    759	 * of the current task's mm. The remainder of the handling is for
    760	 * ksgxd.
    761	 */
    762	if (!current_is_ksgxd())
    763		return get_mem_cgroup_from_mm(current->mm);
    764
    765	/*
    766	 * Search the enclave's mm_list to find an mm associated with
    767	 * this enclave to charge the allocation to.
    768	 */
    769	idx = srcu_read_lock(&encl->srcu);
    770
    771	list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
    772		if (!mmget_not_zero(encl_mm->mm))
    773			continue;
    774
    775		memcg = get_mem_cgroup_from_mm(encl_mm->mm);
    776
    777		mmput_async(encl_mm->mm);
    778
    779		break;
    780	}
    781
    782	srcu_read_unlock(&encl->srcu, idx);
    783
    784	/*
    785	 * In the rare case that there isn't an mm associated with
    786	 * the enclave, set memcg to the current active mem_cgroup.
    787	 * This will be the root mem_cgroup if there is no active
    788	 * mem_cgroup.
    789	 */
    790	if (!memcg)
    791		return get_mem_cgroup_from_mm(NULL);
    792
    793	return memcg;
    794}
    795
    796/**
    797 * sgx_encl_alloc_backing() - allocate a new backing storage page
    798 * @encl:	an enclave pointer
    799 * @page_index:	enclave page index
    800 * @backing:	data for accessing backing storage for the page
    801 *
    802 * When called from ksgxd, sets the active memcg from one of the
    803 * mms in the enclave's mm_list prior to any backing page allocation,
    804 * in order to ensure that shmem page allocations are charged to the
    805 * enclave.
    806 *
    807 * Return:
    808 *   0 on success,
    809 *   -errno otherwise.
    810 */
    811int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index,
    812			   struct sgx_backing *backing)
    813{
    814	struct mem_cgroup *encl_memcg = sgx_encl_get_mem_cgroup(encl);
    815	struct mem_cgroup *memcg = set_active_memcg(encl_memcg);
    816	int ret;
    817
    818	ret = sgx_encl_get_backing(encl, page_index, backing);
    819
    820	set_active_memcg(memcg);
    821	mem_cgroup_put(encl_memcg);
    822
    823	return ret;
    824}
    825
    826/**
    827 * sgx_encl_lookup_backing() - retrieve an existing backing storage page
    828 * @encl:	an enclave pointer
    829 * @page_index:	enclave page index
    830 * @backing:	data for accessing backing storage for the page
    831 *
    832 * Retrieve a backing page for loading data back into an EPC page with ELDU.
    833 * It is the caller's responsibility to ensure that it is appropriate to use
    834 * sgx_encl_lookup_backing() rather than sgx_encl_alloc_backing(). If lookup is
    835 * not used correctly, this will cause an allocation which is not accounted for.
    836 *
    837 * Return:
    838 *   0 on success,
    839 *   -errno otherwise.
    840 */
    841int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index,
    842			   struct sgx_backing *backing)
    843{
    844	return sgx_encl_get_backing(encl, page_index, backing);
    845}
    846
    847/**
    848 * sgx_encl_put_backing() - Unpin the backing storage
    849 * @backing:	data for accessing backing storage for the page
    850 */
    851void sgx_encl_put_backing(struct sgx_backing *backing)
    852{
    853	put_page(backing->pcmd);
    854	put_page(backing->contents);
    855}
    856
    857static int sgx_encl_test_and_clear_young_cb(pte_t *ptep, unsigned long addr,
    858					    void *data)
    859{
    860	pte_t pte;
    861	int ret;
    862
    863	ret = pte_young(*ptep);
    864	if (ret) {
    865		pte = pte_mkold(*ptep);
    866		set_pte_at((struct mm_struct *)data, addr, ptep, pte);
    867	}
    868
    869	return ret;
    870}
    871
    872/**
    873 * sgx_encl_test_and_clear_young() - Test and reset the accessed bit
    874 * @mm:		mm_struct that is checked
    875 * @page:	enclave page to be tested for recent access
    876 *
    877 * Checks the Access (A) bit from the PTE corresponding to the enclave page and
    878 * clears it.
    879 *
    880 * Return: 1 if the page has been recently accessed and 0 if not.
    881 */
    882int sgx_encl_test_and_clear_young(struct mm_struct *mm,
    883				  struct sgx_encl_page *page)
    884{
    885	unsigned long addr = page->desc & PAGE_MASK;
    886	struct sgx_encl *encl = page->encl;
    887	struct vm_area_struct *vma;
    888	int ret;
    889
    890	ret = sgx_encl_find(mm, addr, &vma);
    891	if (ret)
    892		return 0;
    893
    894	if (encl != vma->vm_private_data)
    895		return 0;
    896
    897	ret = apply_to_page_range(vma->vm_mm, addr, PAGE_SIZE,
    898				  sgx_encl_test_and_clear_young_cb, vma->vm_mm);
    899	if (ret < 0)
    900		return 0;
    901
    902	return ret;
    903}
    904
    905/**
    906 * sgx_alloc_va_page() - Allocate a Version Array (VA) page
    907 *
    908 * Allocate a free EPC page and convert it to a Version Array (VA) page.
    909 *
    910 * Return:
    911 *   a VA page,
    912 *   -errno otherwise
    913 */
    914struct sgx_epc_page *sgx_alloc_va_page(void)
    915{
    916	struct sgx_epc_page *epc_page;
    917	int ret;
    918
    919	epc_page = sgx_alloc_epc_page(NULL, true);
    920	if (IS_ERR(epc_page))
    921		return ERR_CAST(epc_page);
    922
    923	ret = __epa(sgx_get_epc_virt_addr(epc_page));
    924	if (ret) {
    925		WARN_ONCE(1, "EPA returned %d (0x%x)", ret, ret);
    926		sgx_encl_free_epc_page(epc_page);
    927		return ERR_PTR(-EFAULT);
    928	}
    929
    930	return epc_page;
    931}
    932
    933/**
    934 * sgx_alloc_va_slot - allocate a VA slot
    935 * @va_page:	a &struct sgx_va_page instance
    936 *
    937 * Allocates a slot from a &struct sgx_va_page instance.
    938 *
    939 * Return: offset of the slot inside the VA page
    940 */
    941unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page)
    942{
    943	int slot = find_first_zero_bit(va_page->slots, SGX_VA_SLOT_COUNT);
    944
    945	if (slot < SGX_VA_SLOT_COUNT)
    946		set_bit(slot, va_page->slots);
    947
    948	return slot << 3;
    949}
    950
    951/**
    952 * sgx_free_va_slot - free a VA slot
    953 * @va_page:	a &struct sgx_va_page instance
    954 * @offset:	offset of the slot inside the VA page
    955 *
    956 * Frees a slot from a &struct sgx_va_page instance.
    957 */
    958void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset)
    959{
    960	clear_bit(offset >> 3, va_page->slots);
    961}
    962
    963/**
    964 * sgx_va_page_full - is the VA page full?
    965 * @va_page:	a &struct sgx_va_page instance
    966 *
    967 * Return: true if all slots have been taken
    968 */
    969bool sgx_va_page_full(struct sgx_va_page *va_page)
    970{
    971	int slot = find_first_zero_bit(va_page->slots, SGX_VA_SLOT_COUNT);
    972
    973	return slot == SGX_VA_SLOT_COUNT;
    974}
    975
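/*
 * Note on the slot encoding used by the VA helpers above (illustrative):
 * each VA slot stores an 8-byte version nonce, so a 4 KiB VA page holds
 * SGX_VA_SLOT_COUNT = 512 slots.  sgx_alloc_va_slot() returns the byte
 * offset slot * 8, which is what __sgx_encl_eldu() adds to the VA page's
 * virtual address, and sgx_free_va_slot() maps the offset back via
 * offset / 8.
 */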
    976/**
    977 * sgx_encl_free_epc_page - free an EPC page assigned to an enclave
    978 * @page:	EPC page to be freed
    979 *
     980 * Free an EPC page assigned to an enclave. It runs EREMOVE on the page and,
     981 * only upon success, puts the page back on the free page list.  Otherwise, it
     982 * emits a WARNING to indicate that the page is leaked.
    983 */
    984void sgx_encl_free_epc_page(struct sgx_epc_page *page)
    985{
    986	int ret;
    987
    988	WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED);
    989
    990	ret = __eremove(sgx_get_epc_virt_addr(page));
    991	if (WARN_ONCE(ret, EREMOVE_ERROR_MESSAGE, ret, ret))
    992		return;
    993
    994	sgx_free_epc_page(page);
    995}