cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

xstate.c (49958B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 * xsave/xrstor support.
      4 *
      5 * Author: Suresh Siddha <suresh.b.siddha@intel.com>
      6 */
      7#include <linux/bitops.h>
      8#include <linux/compat.h>
      9#include <linux/cpu.h>
     10#include <linux/mman.h>
     11#include <linux/nospec.h>
     12#include <linux/pkeys.h>
     13#include <linux/seq_file.h>
     14#include <linux/proc_fs.h>
     15#include <linux/vmalloc.h>
     16
     17#include <asm/fpu/api.h>
     18#include <asm/fpu/regset.h>
     19#include <asm/fpu/signal.h>
     20#include <asm/fpu/xcr.h>
     21
     22#include <asm/tlbflush.h>
     23#include <asm/prctl.h>
     24#include <asm/elf.h>
     25
     26#include "context.h"
     27#include "internal.h"
     28#include "legacy.h"
     29#include "xstate.h"
     30
     31#define for_each_extended_xfeature(bit, mask)				\
     32	(bit) = FIRST_EXTENDED_XFEATURE;				\
     33	for_each_set_bit_from(bit, (unsigned long *)&(mask), 8 * sizeof(mask))
     34
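/*
 * Editorial example (not part of the upstream file): a minimal sketch of
 * how for_each_extended_xfeature() is used.  It visits only the set bits
 * of 'mask' starting at FIRST_EXTENDED_XFEATURE, skipping the legacy
 * FP/SSE bits.  The function name and pr_info() message are hypothetical.
 */
#if 0
static void example_walk_extended_features(u64 mask)
{
	int i;

	for_each_extended_xfeature(i, mask)
		pr_info("x86/fpu: extended xfeature %d is set\n", i);
}
#endif
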
     35/*
     36 * Although we spell it out in here, the Processor Trace
     37 * xfeature is completely unused.  We use other mechanisms
     38 * to save/restore PT state in Linux.
     39 */
     40static const char *xfeature_names[] =
     41{
     42	"x87 floating point registers"	,
     43	"SSE registers"			,
     44	"AVX registers"			,
     45	"MPX bounds registers"		,
     46	"MPX CSR"			,
     47	"AVX-512 opmask"		,
     48	"AVX-512 Hi256"			,
     49	"AVX-512 ZMM_Hi256"		,
     50	"Processor Trace (unused)"	,
     51	"Protection Keys User registers",
     52	"PASID state",
     53	"unknown xstate feature"	,
     54	"unknown xstate feature"	,
     55	"unknown xstate feature"	,
     56	"unknown xstate feature"	,
     57	"unknown xstate feature"	,
     58	"unknown xstate feature"	,
     59	"AMX Tile config"		,
     60	"AMX Tile data"			,
     61	"unknown xstate feature"	,
     62};
     63
     64static unsigned short xsave_cpuid_features[] __initdata = {
     65	[XFEATURE_FP]				= X86_FEATURE_FPU,
     66	[XFEATURE_SSE]				= X86_FEATURE_XMM,
     67	[XFEATURE_YMM]				= X86_FEATURE_AVX,
     68	[XFEATURE_BNDREGS]			= X86_FEATURE_MPX,
     69	[XFEATURE_BNDCSR]			= X86_FEATURE_MPX,
     70	[XFEATURE_OPMASK]			= X86_FEATURE_AVX512F,
     71	[XFEATURE_ZMM_Hi256]			= X86_FEATURE_AVX512F,
     72	[XFEATURE_Hi16_ZMM]			= X86_FEATURE_AVX512F,
     73	[XFEATURE_PT_UNIMPLEMENTED_SO_FAR]	= X86_FEATURE_INTEL_PT,
     74	[XFEATURE_PKRU]				= X86_FEATURE_PKU,
     75	[XFEATURE_PASID]			= X86_FEATURE_ENQCMD,
     76	[XFEATURE_XTILE_CFG]			= X86_FEATURE_AMX_TILE,
     77	[XFEATURE_XTILE_DATA]			= X86_FEATURE_AMX_TILE,
     78};
     79
     80static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init =
     81	{ [ 0 ... XFEATURE_MAX - 1] = -1};
     82static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init =
     83	{ [ 0 ... XFEATURE_MAX - 1] = -1};
     84static unsigned int xstate_flags[XFEATURE_MAX] __ro_after_init;
     85
     86#define XSTATE_FLAG_SUPERVISOR	BIT(0)
     87#define XSTATE_FLAG_ALIGNED64	BIT(1)
     88
     89/*
     90 * Return whether the system supports a given xfeature.
     91 *
     92 * Also return the name of the (most advanced) feature that the caller requested:
     93 */
     94int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
     95{
     96	u64 xfeatures_missing = xfeatures_needed & ~fpu_kernel_cfg.max_features;
     97
     98	if (unlikely(feature_name)) {
     99		long xfeature_idx, max_idx;
    100		u64 xfeatures_print;
    101		/*
     102		 * We use fls() here so we can print the most advanced
     103		 * feature that was requested but is missing. If a driver
     104		 * asks about "XFEATURE_MASK_SSE | XFEATURE_MASK_YMM" we'll print the
     105		 * missing AVX feature - this is the most informative message
     106		 * for users:
    107		 */
    108		if (xfeatures_missing)
    109			xfeatures_print = xfeatures_missing;
    110		else
    111			xfeatures_print = xfeatures_needed;
    112
    113		xfeature_idx = fls64(xfeatures_print)-1;
    114		max_idx = ARRAY_SIZE(xfeature_names)-1;
    115		xfeature_idx = min(xfeature_idx, max_idx);
    116
    117		*feature_name = xfeature_names[xfeature_idx];
    118	}
    119
    120	if (xfeatures_missing)
    121		return 0;
    122
    123	return 1;
    124}
    125EXPORT_SYMBOL_GPL(cpu_has_xfeatures);
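/*
 * Editorial example (not upstream): a hedged sketch of how a caller might
 * use cpu_has_xfeatures() to probe for AVX support and report the most
 * advanced missing feature.  The wrapper name is hypothetical; the mask
 * macros and the API are the ones defined above.
 */
#if 0
static bool example_have_avx(void)
{
	const char *name;

	if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, &name)) {
		pr_info("example: xfeature '%s' not supported\n", name);
		return false;
	}
	return true;
}
#endif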
    126
    127static bool xfeature_is_aligned64(int xfeature_nr)
    128{
    129	return xstate_flags[xfeature_nr] & XSTATE_FLAG_ALIGNED64;
    130}
    131
    132static bool xfeature_is_supervisor(int xfeature_nr)
    133{
    134	return xstate_flags[xfeature_nr] & XSTATE_FLAG_SUPERVISOR;
    135}
    136
    137static unsigned int xfeature_get_offset(u64 xcomp_bv, int xfeature)
    138{
    139	unsigned int offs, i;
    140
    141	/*
    142	 * Non-compacted format and legacy features use the cached fixed
    143	 * offsets.
    144	 */
    145	if (!cpu_feature_enabled(X86_FEATURE_XCOMPACTED) ||
    146	    xfeature <= XFEATURE_SSE)
    147		return xstate_offsets[xfeature];
    148
    149	/*
    150	 * Compacted format offsets depend on the actual content of the
    151	 * compacted xsave area which is determined by the xcomp_bv header
    152	 * field.
    153	 */
    154	offs = FXSAVE_SIZE + XSAVE_HDR_SIZE;
    155	for_each_extended_xfeature(i, xcomp_bv) {
    156		if (xfeature_is_aligned64(i))
    157			offs = ALIGN(offs, 64);
    158		if (i == xfeature)
    159			break;
    160		offs += xstate_sizes[i];
    161	}
    162	return offs;
    163}
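/*
 * Editorial worked example (not upstream), assuming a hypothetical
 * xcomp_bv with only YMM and OPMASK set and xstate_sizes[XFEATURE_YMM]
 * == 256:
 *
 *   offs(OPMASK) = FXSAVE_SIZE + XSAVE_HDR_SIZE	(512 + 64 = 576)
 *		  + xstate_sizes[XFEATURE_YMM]		(576 + 256 = 832)
 *
 * plus a 64-byte ALIGN() step before any component whose
 * XSTATE_FLAG_ALIGNED64 flag is set.  Components absent from xcomp_bv
 * contribute no space, which is the point of the compacted format.
 */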
    164
    165/*
    166 * Enable the extended processor state save/restore feature.
    167 * Called once per CPU onlining.
    168 */
    169void fpu__init_cpu_xstate(void)
    170{
    171	if (!boot_cpu_has(X86_FEATURE_XSAVE) || !fpu_kernel_cfg.max_features)
    172		return;
    173
    174	cr4_set_bits(X86_CR4_OSXSAVE);
    175
    176	/*
    177	 * Must happen after CR4 setup and before xsetbv() to allow KVM
    178	 * lazy passthrough.  Write independent of the dynamic state static
    179	 * key as that does not work on the boot CPU. This also ensures
    180	 * that any stale state is wiped out from XFD.
    181	 */
    182	if (cpu_feature_enabled(X86_FEATURE_XFD))
    183		wrmsrl(MSR_IA32_XFD, init_fpstate.xfd);
    184
    185	/*
    186	 * XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features
    187	 * managed by XSAVE{C, OPT, S} and XRSTOR{S}.  Only XSAVE user
    188	 * states can be set here.
    189	 */
    190	xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features);
    191
    192	/*
    193	 * MSR_IA32_XSS sets supervisor states managed by XSAVES.
    194	 */
    195	if (boot_cpu_has(X86_FEATURE_XSAVES)) {
    196		wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() |
    197				     xfeatures_mask_independent());
    198	}
    199}
    200
    201static bool xfeature_enabled(enum xfeature xfeature)
    202{
    203	return fpu_kernel_cfg.max_features & BIT_ULL(xfeature);
    204}
    205
    206/*
    207 * Record the offsets and sizes of various xstates contained
    208 * in the XSAVE state memory layout.
    209 */
    210static void __init setup_xstate_cache(void)
    211{
    212	u32 eax, ebx, ecx, edx, i;
    213	/* start at the beginning of the "extended state" */
    214	unsigned int last_good_offset = offsetof(struct xregs_state,
    215						 extended_state_area);
    216	/*
    217	 * The FP xstates and SSE xstates are legacy states. They are always
    218	 * in the fixed offsets in the xsave area in either compacted form
    219	 * or standard form.
    220	 */
    221	xstate_offsets[XFEATURE_FP]	= 0;
    222	xstate_sizes[XFEATURE_FP]	= offsetof(struct fxregs_state,
    223						   xmm_space);
    224
    225	xstate_offsets[XFEATURE_SSE]	= xstate_sizes[XFEATURE_FP];
    226	xstate_sizes[XFEATURE_SSE]	= sizeof_field(struct fxregs_state,
    227						       xmm_space);
    228
    229	for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
    230		cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
    231
    232		xstate_sizes[i] = eax;
    233		xstate_flags[i] = ecx;
    234
    235		/*
    236		 * If an xfeature is supervisor state, the offset in EBX is
     237		 * invalid, so leave it at -1.
    238		 */
    239		if (xfeature_is_supervisor(i))
    240			continue;
    241
    242		xstate_offsets[i] = ebx;
    243
    244		/*
    245		 * In our xstate size checks, we assume that the highest-numbered
    246		 * xstate feature has the highest offset in the buffer.  Ensure
    247		 * it does.
    248		 */
    249		WARN_ONCE(last_good_offset > xstate_offsets[i],
    250			  "x86/fpu: misordered xstate at %d\n", last_good_offset);
    251
    252		last_good_offset = xstate_offsets[i];
    253	}
    254}
    255
    256static void __init print_xstate_feature(u64 xstate_mask)
    257{
    258	const char *feature_name;
    259
    260	if (cpu_has_xfeatures(xstate_mask, &feature_name))
    261		pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", xstate_mask, feature_name);
    262}
    263
    264/*
    265 * Print out all the supported xstate features:
    266 */
    267static void __init print_xstate_features(void)
    268{
    269	print_xstate_feature(XFEATURE_MASK_FP);
    270	print_xstate_feature(XFEATURE_MASK_SSE);
    271	print_xstate_feature(XFEATURE_MASK_YMM);
    272	print_xstate_feature(XFEATURE_MASK_BNDREGS);
    273	print_xstate_feature(XFEATURE_MASK_BNDCSR);
    274	print_xstate_feature(XFEATURE_MASK_OPMASK);
    275	print_xstate_feature(XFEATURE_MASK_ZMM_Hi256);
    276	print_xstate_feature(XFEATURE_MASK_Hi16_ZMM);
    277	print_xstate_feature(XFEATURE_MASK_PKRU);
    278	print_xstate_feature(XFEATURE_MASK_PASID);
    279	print_xstate_feature(XFEATURE_MASK_XTILE_CFG);
    280	print_xstate_feature(XFEATURE_MASK_XTILE_DATA);
    281}
    282
    283/*
    284 * This check is important because it is easy to get XSTATE_*
    285 * confused with XSTATE_BIT_*.
    286 */
    287#define CHECK_XFEATURE(nr) do {		\
    288	WARN_ON(nr < FIRST_EXTENDED_XFEATURE);	\
    289	WARN_ON(nr >= XFEATURE_MAX);	\
    290} while (0)
    291
    292/*
    293 * Print out xstate component offsets and sizes
    294 */
    295static void __init print_xstate_offset_size(void)
    296{
    297	int i;
    298
    299	for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
    300		pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n",
    301			i, xfeature_get_offset(fpu_kernel_cfg.max_features, i),
    302			i, xstate_sizes[i]);
    303	}
    304}
    305
    306/*
    307 * This function is called only during boot time when x86 caps are not set
    308 * up and alternative can not be used yet.
    309 */
    310static __init void os_xrstor_booting(struct xregs_state *xstate)
    311{
    312	u64 mask = fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSTATE;
    313	u32 lmask = mask;
    314	u32 hmask = mask >> 32;
    315	int err;
    316
    317	if (cpu_feature_enabled(X86_FEATURE_XSAVES))
    318		XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
    319	else
    320		XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
    321
    322	/*
    323	 * We should never fault when copying from a kernel buffer, and the FPU
    324	 * state we set at boot time should be valid.
    325	 */
    326	WARN_ON_FPU(err);
    327}
    328
    329/*
    330 * All supported features have either init state all zeros or are
    331 * handled in setup_init_fpu() individually. This is an explicit
    332 * feature list and does not use XFEATURE_MASK*SUPPORTED to catch
    333 * newly added supported features at build time and make people
    334 * actually look at the init state for the new feature.
    335 */
    336#define XFEATURES_INIT_FPSTATE_HANDLED		\
    337	(XFEATURE_MASK_FP |			\
    338	 XFEATURE_MASK_SSE |			\
    339	 XFEATURE_MASK_YMM |			\
    340	 XFEATURE_MASK_OPMASK |			\
    341	 XFEATURE_MASK_ZMM_Hi256 |		\
    342	 XFEATURE_MASK_Hi16_ZMM	 |		\
    343	 XFEATURE_MASK_PKRU |			\
    344	 XFEATURE_MASK_BNDREGS |		\
    345	 XFEATURE_MASK_BNDCSR |			\
    346	 XFEATURE_MASK_PASID |			\
    347	 XFEATURE_MASK_XTILE)
    348
    349/*
    350 * setup the xstate image representing the init state
    351 */
    352static void __init setup_init_fpu_buf(void)
    353{
    354	BUILD_BUG_ON((XFEATURE_MASK_USER_SUPPORTED |
    355		      XFEATURE_MASK_SUPERVISOR_SUPPORTED) !=
    356		     XFEATURES_INIT_FPSTATE_HANDLED);
    357
    358	if (!boot_cpu_has(X86_FEATURE_XSAVE))
    359		return;
    360
    361	print_xstate_features();
    362
    363	xstate_init_xcomp_bv(&init_fpstate.regs.xsave, fpu_kernel_cfg.max_features);
    364
    365	/*
    366	 * Init all the features state with header.xfeatures being 0x0
    367	 */
    368	os_xrstor_booting(&init_fpstate.regs.xsave);
    369
    370	/*
    371	 * All components are now in init state. Read the state back so
    372	 * that init_fpstate contains all non-zero init state. This only
    373	 * works with XSAVE, but not with XSAVEOPT and XSAVEC/S because
    374	 * those use the init optimization which skips writing data for
    375	 * components in init state.
    376	 *
    377	 * XSAVE could be used, but that would require to reshuffle the
    378	 * data when XSAVEC/S is available because XSAVEC/S uses xstate
    379	 * compaction. But doing so is a pointless exercise because most
    380	 * components have an all zeros init state except for the legacy
    381	 * ones (FP and SSE). Those can be saved with FXSAVE into the
    382	 * legacy area. Adding new features requires to ensure that init
    383	 * state is all zeroes or if not to add the necessary handling
    384	 * here.
    385	 */
    386	fxsave(&init_fpstate.regs.fxsave);
    387}
    388
    389int xfeature_size(int xfeature_nr)
    390{
    391	u32 eax, ebx, ecx, edx;
    392
    393	CHECK_XFEATURE(xfeature_nr);
    394	cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
    395	return eax;
    396}
    397
    398/* Validate an xstate header supplied by userspace (ptrace or sigreturn) */
    399static int validate_user_xstate_header(const struct xstate_header *hdr,
    400				       struct fpstate *fpstate)
    401{
    402	/* No unknown or supervisor features may be set */
    403	if (hdr->xfeatures & ~fpstate->user_xfeatures)
    404		return -EINVAL;
    405
    406	/* Userspace must use the uncompacted format */
    407	if (hdr->xcomp_bv)
    408		return -EINVAL;
    409
    410	/*
    411	 * If 'reserved' is shrunken to add a new field, make sure to validate
    412	 * that new field here!
    413	 */
    414	BUILD_BUG_ON(sizeof(hdr->reserved) != 48);
    415
    416	/* No reserved bits may be set */
    417	if (memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved)))
    418		return -EINVAL;
    419
    420	return 0;
    421}
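/*
 * Editorial example (not upstream): a minimal sketch of a header that the
 * check above accepts - user features only, xcomp_bv zero (standard
 * format), reserved bytes all zero.  The helper name and the chosen
 * feature bits are hypothetical.
 */
#if 0
static void example_fill_valid_uabi_header(struct xstate_header *hdr)
{
	memset(hdr, 0, sizeof(*hdr));			/* reserved[] must stay zero */
	hdr->xfeatures = XFEATURE_MASK_FP | XFEATURE_MASK_SSE;
	hdr->xcomp_bv  = 0;				/* userspace must not use compaction */
}
#endif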
    422
    423static void __init __xstate_dump_leaves(void)
    424{
    425	int i;
    426	u32 eax, ebx, ecx, edx;
    427	static int should_dump = 1;
    428
    429	if (!should_dump)
    430		return;
    431	should_dump = 0;
    432	/*
    433	 * Dump out a few leaves past the ones that we support
    434	 * just in case there are some goodies up there
    435	 */
    436	for (i = 0; i < XFEATURE_MAX + 10; i++) {
    437		cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
    438		pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n",
    439			XSTATE_CPUID, i, eax, ebx, ecx, edx);
    440	}
    441}
    442
    443#define XSTATE_WARN_ON(x) do {							\
    444	if (WARN_ONCE(x, "XSAVE consistency problem, dumping leaves")) {	\
    445		__xstate_dump_leaves();						\
    446	}									\
    447} while (0)
    448
    449#define XCHECK_SZ(sz, nr, nr_macro, __struct) do {			\
    450	if ((nr == nr_macro) &&						\
    451	    WARN_ONCE(sz != sizeof(__struct),				\
    452		"%s: struct is %zu bytes, cpu state %d bytes\n",	\
    453		__stringify(nr_macro), sizeof(__struct), sz)) {		\
    454		__xstate_dump_leaves();					\
    455	}								\
    456} while (0)
    457
    458/**
    459 * check_xtile_data_against_struct - Check tile data state size.
    460 *
    461 * Calculate the state size by multiplying the single tile size which is
     462 * recorded in a C struct, and the number of tiles that the CPU reports.
    463 * Compare the provided size with the calculation.
    464 *
    465 * @size:	The tile data state size
    466 *
    467 * Returns:	0 on success, -EINVAL on mismatch.
    468 */
    469static int __init check_xtile_data_against_struct(int size)
    470{
    471	u32 max_palid, palid, state_size;
    472	u32 eax, ebx, ecx, edx;
    473	u16 max_tile;
    474
    475	/*
    476	 * Check the maximum palette id:
    477	 *   eax: the highest numbered palette subleaf.
    478	 */
    479	cpuid_count(TILE_CPUID, 0, &max_palid, &ebx, &ecx, &edx);
    480
    481	/*
    482	 * Cross-check each tile size and find the maximum number of
    483	 * supported tiles.
    484	 */
    485	for (palid = 1, max_tile = 0; palid <= max_palid; palid++) {
    486		u16 tile_size, max;
    487
    488		/*
    489		 * Check the tile size info:
     490		 *   eax[31:16]:  bytes per tile
    491		 *   ebx[31:16]:  the max names (or max number of tiles)
    492		 */
    493		cpuid_count(TILE_CPUID, palid, &eax, &ebx, &edx, &edx);
    494		tile_size = eax >> 16;
    495		max = ebx >> 16;
    496
    497		if (tile_size != sizeof(struct xtile_data)) {
    498			pr_err("%s: struct is %zu bytes, cpu xtile %d bytes\n",
    499			       __stringify(XFEATURE_XTILE_DATA),
    500			       sizeof(struct xtile_data), tile_size);
    501			__xstate_dump_leaves();
    502			return -EINVAL;
    503		}
    504
    505		if (max > max_tile)
    506			max_tile = max;
    507	}
    508
    509	state_size = sizeof(struct xtile_data) * max_tile;
    510	if (size != state_size) {
    511		pr_err("%s: calculated size is %u bytes, cpu state %d bytes\n",
    512		       __stringify(XFEATURE_XTILE_DATA), state_size, size);
    513		__xstate_dump_leaves();
    514		return -EINVAL;
    515	}
    516	return 0;
    517}
    518
    519/*
    520 * We have a C struct for each 'xstate'.  We need to ensure
    521 * that our software representation matches what the CPU
    522 * tells us about the state's size.
    523 */
    524static bool __init check_xstate_against_struct(int nr)
    525{
    526	/*
    527	 * Ask the CPU for the size of the state.
    528	 */
    529	int sz = xfeature_size(nr);
    530	/*
    531	 * Match each CPU state with the corresponding software
    532	 * structure.
    533	 */
    534	XCHECK_SZ(sz, nr, XFEATURE_YMM,       struct ymmh_struct);
    535	XCHECK_SZ(sz, nr, XFEATURE_BNDREGS,   struct mpx_bndreg_state);
    536	XCHECK_SZ(sz, nr, XFEATURE_BNDCSR,    struct mpx_bndcsr_state);
    537	XCHECK_SZ(sz, nr, XFEATURE_OPMASK,    struct avx_512_opmask_state);
    538	XCHECK_SZ(sz, nr, XFEATURE_ZMM_Hi256, struct avx_512_zmm_uppers_state);
    539	XCHECK_SZ(sz, nr, XFEATURE_Hi16_ZMM,  struct avx_512_hi16_state);
    540	XCHECK_SZ(sz, nr, XFEATURE_PKRU,      struct pkru_state);
    541	XCHECK_SZ(sz, nr, XFEATURE_PASID,     struct ia32_pasid_state);
    542	XCHECK_SZ(sz, nr, XFEATURE_XTILE_CFG, struct xtile_cfg);
    543
    544	/* The tile data size varies between implementations. */
    545	if (nr == XFEATURE_XTILE_DATA)
    546		check_xtile_data_against_struct(sz);
    547
    548	/*
     549	 * Make *SURE* to add any feature numbers below if
    550	 * there are "holes" in the xsave state component
    551	 * numbers.
    552	 */
    553	if ((nr < XFEATURE_YMM) ||
    554	    (nr >= XFEATURE_MAX) ||
    555	    (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR) ||
    556	    ((nr >= XFEATURE_RSRVD_COMP_11) && (nr <= XFEATURE_RSRVD_COMP_16))) {
    557		WARN_ONCE(1, "no structure for xstate: %d\n", nr);
    558		XSTATE_WARN_ON(1);
    559		return false;
    560	}
    561	return true;
    562}
    563
    564static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted)
    565{
    566	unsigned int topmost = fls64(xfeatures) -  1;
    567	unsigned int offset = xstate_offsets[topmost];
    568
    569	if (topmost <= XFEATURE_SSE)
    570		return sizeof(struct xregs_state);
    571
    572	if (compacted)
    573		offset = xfeature_get_offset(xfeatures, topmost);
    574	return offset + xstate_sizes[topmost];
    575}
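/*
 * Editorial worked example (not upstream), with hypothetical numbers: if
 * the topmost enabled feature is XFEATURE_PKRU with a standard-format
 * offset of 2688 and a size of 8, the non-compacted buffer size is
 * 2688 + 8 = 2696 bytes.  For the compacted case the offset term is
 * recomputed by xfeature_get_offset() against the actual feature set.
 */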
    576
    577/*
    578 * This essentially double-checks what the cpu told us about
    579 * how large the XSAVE buffer needs to be.  We are recalculating
    580 * it to be safe.
    581 *
    582 * Independent XSAVE features allocate their own buffers and are not
    583 * covered by these checks. Only the size of the buffer for task->fpu
    584 * is checked here.
    585 */
    586static bool __init paranoid_xstate_size_valid(unsigned int kernel_size)
    587{
    588	bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
    589	bool xsaves = cpu_feature_enabled(X86_FEATURE_XSAVES);
    590	unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
    591	int i;
    592
    593	for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
    594		if (!check_xstate_against_struct(i))
    595			return false;
    596		/*
    597		 * Supervisor state components can be managed only by
    598		 * XSAVES.
    599		 */
    600		if (!xsaves && xfeature_is_supervisor(i)) {
    601			XSTATE_WARN_ON(1);
    602			return false;
    603		}
    604	}
    605	size = xstate_calculate_size(fpu_kernel_cfg.max_features, compacted);
    606	XSTATE_WARN_ON(size != kernel_size);
    607	return size == kernel_size;
    608}
    609
    610/*
    611 * Get total size of enabled xstates in XCR0 | IA32_XSS.
    612 *
    613 * Note the SDM's wording here.  "sub-function 0" only enumerates
    614 * the size of the *user* states.  If we use it to size a buffer
    615 * that we use 'XSAVES' on, we could potentially overflow the
    616 * buffer because 'XSAVES' saves system states too.
    617 *
    618 * This also takes compaction into account. So this works for
    619 * XSAVEC as well.
    620 */
    621static unsigned int __init get_compacted_size(void)
    622{
    623	unsigned int eax, ebx, ecx, edx;
    624	/*
    625	 * - CPUID function 0DH, sub-function 1:
    626	 *    EBX enumerates the size (in bytes) required by
    627	 *    the XSAVES instruction for an XSAVE area
    628	 *    containing all the state components
    629	 *    corresponding to bits currently set in
    630	 *    XCR0 | IA32_XSS.
    631	 *
    632	 * When XSAVES is not available but XSAVEC is (virt), then there
    633	 * are no supervisor states, but XSAVEC still uses compacted
    634	 * format.
    635	 */
    636	cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
    637	return ebx;
    638}
    639
    640/*
    641 * Get the total size of the enabled xstates without the independent supervisor
    642 * features.
    643 */
    644static unsigned int __init get_xsave_compacted_size(void)
    645{
    646	u64 mask = xfeatures_mask_independent();
    647	unsigned int size;
    648
    649	if (!mask)
    650		return get_compacted_size();
    651
    652	/* Disable independent features. */
    653	wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor());
    654
    655	/*
    656	 * Ask the hardware what size is required of the buffer.
    657	 * This is the size required for the task->fpu buffer.
    658	 */
    659	size = get_compacted_size();
    660
    661	/* Re-enable independent features so XSAVES will work on them again. */
    662	wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask);
    663
    664	return size;
    665}
    666
    667static unsigned int __init get_xsave_size_user(void)
    668{
    669	unsigned int eax, ebx, ecx, edx;
    670	/*
    671	 * - CPUID function 0DH, sub-function 0:
    672	 *    EBX enumerates the size (in bytes) required by
    673	 *    the XSAVE instruction for an XSAVE area
    674	 *    containing all the *user* state components
    675	 *    corresponding to bits currently set in XCR0.
    676	 */
    677	cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
    678	return ebx;
    679}
    680
    681/*
    682 * Will the runtime-enumerated 'xstate_size' fit in the init
    683 * task's statically-allocated buffer?
    684 */
    685static bool __init is_supported_xstate_size(unsigned int test_xstate_size)
    686{
    687	if (test_xstate_size <= sizeof(init_fpstate.regs))
    688		return true;
    689
    690	pr_warn("x86/fpu: xstate buffer too small (%zu < %d), disabling xsave\n",
    691			sizeof(init_fpstate.regs), test_xstate_size);
    692	return false;
    693}
    694
    695static int __init init_xstate_size(void)
    696{
    697	/* Recompute the context size for enabled features: */
    698	unsigned int user_size, kernel_size, kernel_default_size;
    699	bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
    700
    701	/* Uncompacted user space size */
    702	user_size = get_xsave_size_user();
    703
    704	/*
    705	 * XSAVES kernel size includes supervisor states and uses compacted
    706	 * format. XSAVEC uses compacted format, but does not save
    707	 * supervisor states.
    708	 *
    709	 * XSAVE[OPT] do not support supervisor states so kernel and user
    710	 * size is identical.
    711	 */
    712	if (compacted)
    713		kernel_size = get_xsave_compacted_size();
    714	else
    715		kernel_size = user_size;
    716
    717	kernel_default_size =
    718		xstate_calculate_size(fpu_kernel_cfg.default_features, compacted);
    719
    720	/* Ensure we have the space to store all default enabled features. */
    721	if (!is_supported_xstate_size(kernel_default_size))
    722		return -EINVAL;
    723
    724	if (!paranoid_xstate_size_valid(kernel_size))
    725		return -EINVAL;
    726
    727	fpu_kernel_cfg.max_size = kernel_size;
    728	fpu_user_cfg.max_size = user_size;
    729
    730	fpu_kernel_cfg.default_size = kernel_default_size;
    731	fpu_user_cfg.default_size =
    732		xstate_calculate_size(fpu_user_cfg.default_features, false);
    733
    734	return 0;
    735}
    736
    737/*
    738 * We enabled the XSAVE hardware, but something went wrong and
    739 * we can not use it.  Disable it.
    740 */
    741static void __init fpu__init_disable_system_xstate(unsigned int legacy_size)
    742{
    743	fpu_kernel_cfg.max_features = 0;
    744	cr4_clear_bits(X86_CR4_OSXSAVE);
    745	setup_clear_cpu_cap(X86_FEATURE_XSAVE);
    746
    747	/* Restore the legacy size.*/
    748	fpu_kernel_cfg.max_size = legacy_size;
    749	fpu_kernel_cfg.default_size = legacy_size;
    750	fpu_user_cfg.max_size = legacy_size;
    751	fpu_user_cfg.default_size = legacy_size;
    752
    753	/*
    754	 * Prevent enabling the static branch which enables writes to the
    755	 * XFD MSR.
    756	 */
    757	init_fpstate.xfd = 0;
    758
    759	fpstate_reset(&current->thread.fpu);
    760}
    761
    762/*
    763 * Enable and initialize the xsave feature.
    764 * Called once per system bootup.
    765 */
    766void __init fpu__init_system_xstate(unsigned int legacy_size)
    767{
    768	unsigned int eax, ebx, ecx, edx;
    769	u64 xfeatures;
    770	int err;
    771	int i;
    772
    773	if (!boot_cpu_has(X86_FEATURE_FPU)) {
    774		pr_info("x86/fpu: No FPU detected\n");
    775		return;
    776	}
    777
    778	if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
    779		pr_info("x86/fpu: x87 FPU will use %s\n",
    780			boot_cpu_has(X86_FEATURE_FXSR) ? "FXSAVE" : "FSAVE");
    781		return;
    782	}
    783
    784	if (boot_cpu_data.cpuid_level < XSTATE_CPUID) {
    785		WARN_ON_FPU(1);
    786		return;
    787	}
    788
    789	/*
    790	 * Find user xstates supported by the processor.
    791	 */
    792	cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
    793	fpu_kernel_cfg.max_features = eax + ((u64)edx << 32);
    794
    795	/*
    796	 * Find supervisor xstates supported by the processor.
    797	 */
    798	cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
    799	fpu_kernel_cfg.max_features |= ecx + ((u64)edx << 32);
    800
    801	if ((fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
    802		/*
    803		 * This indicates that something really unexpected happened
    804		 * with the enumeration.  Disable XSAVE and try to continue
    805		 * booting without it.  This is too early to BUG().
    806		 */
    807		pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n",
    808		       fpu_kernel_cfg.max_features);
    809		goto out_disable;
    810	}
    811
    812	/*
    813	 * Clear XSAVE features that are disabled in the normal CPUID.
    814	 */
    815	for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) {
    816		unsigned short cid = xsave_cpuid_features[i];
    817
    818		/* Careful: X86_FEATURE_FPU is 0! */
    819		if ((i != XFEATURE_FP && !cid) || !boot_cpu_has(cid))
    820			fpu_kernel_cfg.max_features &= ~BIT_ULL(i);
    821	}
    822
    823	if (!cpu_feature_enabled(X86_FEATURE_XFD))
    824		fpu_kernel_cfg.max_features &= ~XFEATURE_MASK_USER_DYNAMIC;
    825
    826	if (!cpu_feature_enabled(X86_FEATURE_XSAVES))
    827		fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;
    828	else
    829		fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED |
    830					XFEATURE_MASK_SUPERVISOR_SUPPORTED;
    831
    832	fpu_user_cfg.max_features = fpu_kernel_cfg.max_features;
    833	fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;
    834
    835	/* Clean out dynamic features from default */
    836	fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features;
    837	fpu_kernel_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC;
    838
    839	fpu_user_cfg.default_features = fpu_user_cfg.max_features;
    840	fpu_user_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC;
    841
    842	/* Store it for paranoia check at the end */
    843	xfeatures = fpu_kernel_cfg.max_features;
    844
    845	/*
     846	 * Initialize the default XFD state in init_fpstate and enable the
    847	 * dynamic sizing mechanism if dynamic states are available.  The
    848	 * static key cannot be enabled here because this runs before
    849	 * jump_label_init(). This is delayed to an initcall.
    850	 */
    851	init_fpstate.xfd = fpu_user_cfg.max_features & XFEATURE_MASK_USER_DYNAMIC;
    852
    853	/* Set up compaction feature bit */
    854	if (cpu_feature_enabled(X86_FEATURE_XSAVEC) ||
    855	    cpu_feature_enabled(X86_FEATURE_XSAVES))
    856		setup_force_cpu_cap(X86_FEATURE_XCOMPACTED);
    857
    858	/* Enable xstate instructions to be able to continue with initialization: */
    859	fpu__init_cpu_xstate();
    860
    861	/* Cache size, offset and flags for initialization */
    862	setup_xstate_cache();
    863
    864	err = init_xstate_size();
    865	if (err)
    866		goto out_disable;
    867
    868	/* Reset the state for the current task */
    869	fpstate_reset(&current->thread.fpu);
    870
    871	/*
    872	 * Update info used for ptrace frames; use standard-format size and no
    873	 * supervisor xstates:
    874	 */
    875	update_regset_xstate_info(fpu_user_cfg.max_size,
    876				  fpu_user_cfg.max_features);
    877
    878	setup_init_fpu_buf();
    879
    880	/*
    881	 * Paranoia check whether something in the setup modified the
    882	 * xfeatures mask.
    883	 */
    884	if (xfeatures != fpu_kernel_cfg.max_features) {
    885		pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init, disabling XSAVE\n",
    886		       xfeatures, fpu_kernel_cfg.max_features);
    887		goto out_disable;
    888	}
    889
    890	print_xstate_offset_size();
    891	pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n",
    892		fpu_kernel_cfg.max_features,
    893		fpu_kernel_cfg.max_size,
    894		boot_cpu_has(X86_FEATURE_XCOMPACTED) ? "compacted" : "standard");
    895	return;
    896
    897out_disable:
    898	/* something went wrong, try to boot without any XSAVE support */
    899	fpu__init_disable_system_xstate(legacy_size);
    900}
    901
    902/*
    903 * Restore minimal FPU state after suspend:
    904 */
    905void fpu__resume_cpu(void)
    906{
    907	/*
    908	 * Restore XCR0 on xsave capable CPUs:
    909	 */
    910	if (cpu_feature_enabled(X86_FEATURE_XSAVE))
    911		xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features);
    912
    913	/*
    914	 * Restore IA32_XSS. The same CPUID bit enumerates support
    915	 * of XSAVES and MSR_IA32_XSS.
    916	 */
    917	if (cpu_feature_enabled(X86_FEATURE_XSAVES)) {
    918		wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()  |
    919				     xfeatures_mask_independent());
    920	}
    921
    922	if (fpu_state_size_dynamic())
    923		wrmsrl(MSR_IA32_XFD, current->thread.fpu.fpstate->xfd);
    924}
    925
    926/*
    927 * Given an xstate feature nr, calculate where in the xsave
    928 * buffer the state is.  Callers should ensure that the buffer
    929 * is valid.
    930 */
    931static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
    932{
    933	u64 xcomp_bv = xsave->header.xcomp_bv;
    934
    935	if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
    936		return NULL;
    937
    938	if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED)) {
    939		if (WARN_ON_ONCE(!(xcomp_bv & BIT_ULL(xfeature_nr))))
    940			return NULL;
    941	}
    942
    943	return (void *)xsave + xfeature_get_offset(xcomp_bv, xfeature_nr);
    944}
    945
    946/*
    947 * Given the xsave area and a state inside, this function returns the
    948 * address of the state.
    949 *
    950 * This is the API that is called to get xstate address in either
    951 * standard format or compacted format of xsave area.
    952 *
    953 * Note that if there is no data for the field in the xsave buffer
    954 * this will return NULL.
    955 *
    956 * Inputs:
    957 *	xstate: the thread's storage area for all FPU data
    958 *	xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP,
    959 *	XFEATURE_SSE, etc...)
    960 * Output:
    961 *	address of the state in the xsave area, or NULL if the
    962 *	field is not present in the xsave buffer.
    963 */
    964void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
    965{
    966	/*
    967	 * Do we even *have* xsave state?
    968	 */
    969	if (!boot_cpu_has(X86_FEATURE_XSAVE))
    970		return NULL;
    971
    972	/*
    973	 * We should not ever be requesting features that we
    974	 * have not enabled.
    975	 */
    976	if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
    977		return NULL;
    978
    979	/*
    980	 * This assumes the last 'xsave*' instruction to
    981	 * have requested that 'xfeature_nr' be saved.
     982	 * If it did not, we might be seeing an old value
    983	 * of the field in the buffer.
    984	 *
    985	 * This can happen because the last 'xsave' did not
    986	 * request that this feature be saved (unlikely)
    987	 * or because the "init optimization" caused it
    988	 * to not be saved.
    989	 */
    990	if (!(xsave->header.xfeatures & BIT_ULL(xfeature_nr)))
    991		return NULL;
    992
    993	return __raw_xsave_addr(xsave, xfeature_nr);
    994}
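/*
 * Editorial example (not upstream): a hedged sketch of reading PKRU out of
 * an xsave buffer via get_xsave_addr().  A NULL return means the component
 * is either not enabled or still in its init state.  The wrapper name is
 * hypothetical.
 */
#if 0
static u32 example_read_pkru(struct xregs_state *xsave)
{
	struct pkru_state *pk = get_xsave_addr(xsave, XFEATURE_PKRU);

	return pk ? pk->pkru : 0;	/* init state: treat as all-zero PKRU */
}
#endif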
    995
    996#ifdef CONFIG_ARCH_HAS_PKEYS
    997
    998/*
    999 * This will go out and modify PKRU register to set the access
   1000 * rights for @pkey to @init_val.
   1001 */
   1002int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
   1003			      unsigned long init_val)
   1004{
   1005	u32 old_pkru, new_pkru_bits = 0;
   1006	int pkey_shift;
   1007
   1008	/*
   1009	 * This check implies XSAVE support.  OSPKE only gets
   1010	 * set if we enable XSAVE and we enable PKU in XCR0.
   1011	 */
   1012	if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
   1013		return -EINVAL;
   1014
   1015	/*
   1016	 * This code should only be called with valid 'pkey'
   1017	 * values originating from in-kernel users.  Complain
   1018	 * if a bad value is observed.
   1019	 */
   1020	if (WARN_ON_ONCE(pkey >= arch_max_pkey()))
   1021		return -EINVAL;
   1022
   1023	/* Set the bits we need in PKRU:  */
   1024	if (init_val & PKEY_DISABLE_ACCESS)
   1025		new_pkru_bits |= PKRU_AD_BIT;
   1026	if (init_val & PKEY_DISABLE_WRITE)
   1027		new_pkru_bits |= PKRU_WD_BIT;
   1028
   1029	/* Shift the bits in to the correct place in PKRU for pkey: */
   1030	pkey_shift = pkey * PKRU_BITS_PER_PKEY;
   1031	new_pkru_bits <<= pkey_shift;
   1032
   1033	/* Get old PKRU and mask off any old bits in place: */
   1034	old_pkru = read_pkru();
   1035	old_pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift);
   1036
   1037	/* Write old part along with new part: */
   1038	write_pkru(old_pkru | new_pkru_bits);
   1039
   1040	return 0;
   1041}
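/*
 * Editorial worked example (not upstream): for a hypothetical pkey == 2
 * with init_val == PKEY_DISABLE_WRITE, pkey_shift is 2 * PKRU_BITS_PER_PKEY
 * == 4, so only PKRU_WD_BIT << 4 (bit 5) is set for that key, while the
 * AD/WD bits of all other keys are preserved from the old PKRU value.
 */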
    1042#endif /* CONFIG_ARCH_HAS_PKEYS */
   1043
   1044static void copy_feature(bool from_xstate, struct membuf *to, void *xstate,
   1045			 void *init_xstate, unsigned int size)
   1046{
   1047	membuf_write(to, from_xstate ? xstate : init_xstate, size);
   1048}
   1049
   1050/**
   1051 * __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
   1052 * @to:		membuf descriptor
   1053 * @fpstate:	The fpstate buffer from which to copy
   1054 * @pkru_val:	The PKRU value to store in the PKRU component
   1055 * @copy_mode:	The requested copy mode
   1056 *
   1057 * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
   1058 * format, i.e. from the kernel internal hardware dependent storage format
   1059 * to the requested @mode. UABI XSTATE is always uncompacted!
   1060 *
   1061 * It supports partial copy but @to.pos always starts from zero.
   1062 */
   1063void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
   1064			       u32 pkru_val, enum xstate_copy_mode copy_mode)
   1065{
   1066	const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr);
   1067	struct xregs_state *xinit = &init_fpstate.regs.xsave;
   1068	struct xregs_state *xsave = &fpstate->regs.xsave;
   1069	struct xstate_header header;
   1070	unsigned int zerofrom;
   1071	u64 mask;
   1072	int i;
   1073
   1074	memset(&header, 0, sizeof(header));
   1075	header.xfeatures = xsave->header.xfeatures;
   1076
   1077	/* Mask out the feature bits depending on copy mode */
   1078	switch (copy_mode) {
   1079	case XSTATE_COPY_FP:
   1080		header.xfeatures &= XFEATURE_MASK_FP;
   1081		break;
   1082
   1083	case XSTATE_COPY_FX:
   1084		header.xfeatures &= XFEATURE_MASK_FP | XFEATURE_MASK_SSE;
   1085		break;
   1086
   1087	case XSTATE_COPY_XSAVE:
   1088		header.xfeatures &= fpstate->user_xfeatures;
   1089		break;
   1090	}
   1091
   1092	/* Copy FP state up to MXCSR */
   1093	copy_feature(header.xfeatures & XFEATURE_MASK_FP, &to, &xsave->i387,
   1094		     &xinit->i387, off_mxcsr);
   1095
   1096	/* Copy MXCSR when SSE or YMM are set in the feature mask */
   1097	copy_feature(header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM),
   1098		     &to, &xsave->i387.mxcsr, &xinit->i387.mxcsr,
   1099		     MXCSR_AND_FLAGS_SIZE);
   1100
   1101	/* Copy the remaining FP state */
   1102	copy_feature(header.xfeatures & XFEATURE_MASK_FP,
   1103		     &to, &xsave->i387.st_space, &xinit->i387.st_space,
   1104		     sizeof(xsave->i387.st_space));
   1105
   1106	/* Copy the SSE state - shared with YMM, but independently managed */
   1107	copy_feature(header.xfeatures & XFEATURE_MASK_SSE,
   1108		     &to, &xsave->i387.xmm_space, &xinit->i387.xmm_space,
   1109		     sizeof(xsave->i387.xmm_space));
   1110
   1111	if (copy_mode != XSTATE_COPY_XSAVE)
   1112		goto out;
   1113
   1114	/* Zero the padding area */
   1115	membuf_zero(&to, sizeof(xsave->i387.padding));
   1116
   1117	/* Copy xsave->i387.sw_reserved */
   1118	membuf_write(&to, xstate_fx_sw_bytes, sizeof(xsave->i387.sw_reserved));
   1119
   1120	/* Copy the user space relevant state of @xsave->header */
   1121	membuf_write(&to, &header, sizeof(header));
   1122
   1123	zerofrom = offsetof(struct xregs_state, extended_state_area);
   1124
   1125	/*
   1126	 * The ptrace buffer is in non-compacted XSAVE format.  In
   1127	 * non-compacted format disabled features still occupy state space,
   1128	 * but there is no state to copy from in the compacted
   1129	 * init_fpstate. The gap tracking will zero these states.
   1130	 */
   1131	mask = fpstate->user_xfeatures;
   1132
   1133	for_each_extended_xfeature(i, mask) {
   1134		/*
   1135		 * If there was a feature or alignment gap, zero the space
   1136		 * in the destination buffer.
   1137		 */
   1138		if (zerofrom < xstate_offsets[i])
   1139			membuf_zero(&to, xstate_offsets[i] - zerofrom);
   1140
   1141		if (i == XFEATURE_PKRU) {
   1142			struct pkru_state pkru = {0};
   1143			/*
   1144			 * PKRU is not necessarily up to date in the
   1145			 * XSAVE buffer. Use the provided value.
   1146			 */
   1147			pkru.pkru = pkru_val;
   1148			membuf_write(&to, &pkru, sizeof(pkru));
   1149		} else {
   1150			copy_feature(header.xfeatures & BIT_ULL(i), &to,
   1151				     __raw_xsave_addr(xsave, i),
   1152				     __raw_xsave_addr(xinit, i),
   1153				     xstate_sizes[i]);
   1154		}
   1155		/*
   1156		 * Keep track of the last copied state in the non-compacted
   1157		 * target buffer for gap zeroing.
   1158		 */
   1159		zerofrom = xstate_offsets[i] + xstate_sizes[i];
   1160	}
   1161
   1162out:
   1163	if (to.left)
   1164		membuf_zero(&to, to.left);
   1165}
   1166
   1167/**
   1168 * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
   1169 * @to:		membuf descriptor
   1170 * @tsk:	The task from which to copy the saved xstate
   1171 * @copy_mode:	The requested copy mode
   1172 *
   1173 * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
   1174 * format, i.e. from the kernel internal hardware dependent storage format
    1175 * to the requested @copy_mode. UABI XSTATE is always uncompacted!
   1176 *
   1177 * It supports partial copy but @to.pos always starts from zero.
   1178 */
   1179void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
   1180			     enum xstate_copy_mode copy_mode)
   1181{
   1182	__copy_xstate_to_uabi_buf(to, tsk->thread.fpu.fpstate,
   1183				  tsk->thread.pkru, copy_mode);
   1184}
   1185
   1186static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size,
   1187			    const void *kbuf, const void __user *ubuf)
   1188{
   1189	if (kbuf) {
   1190		memcpy(dst, kbuf + offset, size);
   1191	} else {
   1192		if (copy_from_user(dst, ubuf + offset, size))
   1193			return -EFAULT;
   1194	}
   1195	return 0;
   1196}
   1197
   1198
   1199static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf,
   1200			       const void __user *ubuf)
   1201{
   1202	struct xregs_state *xsave = &fpstate->regs.xsave;
   1203	unsigned int offset, size;
   1204	struct xstate_header hdr;
   1205	u64 mask;
   1206	int i;
   1207
   1208	offset = offsetof(struct xregs_state, header);
   1209	if (copy_from_buffer(&hdr, offset, sizeof(hdr), kbuf, ubuf))
   1210		return -EFAULT;
   1211
   1212	if (validate_user_xstate_header(&hdr, fpstate))
   1213		return -EINVAL;
   1214
   1215	/* Validate MXCSR when any of the related features is in use */
   1216	mask = XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM;
   1217	if (hdr.xfeatures & mask) {
   1218		u32 mxcsr[2];
   1219
   1220		offset = offsetof(struct fxregs_state, mxcsr);
   1221		if (copy_from_buffer(mxcsr, offset, sizeof(mxcsr), kbuf, ubuf))
   1222			return -EFAULT;
   1223
   1224		/* Reserved bits in MXCSR must be zero. */
   1225		if (mxcsr[0] & ~mxcsr_feature_mask)
   1226			return -EINVAL;
   1227
   1228		/* SSE and YMM require MXCSR even when FP is not in use. */
   1229		if (!(hdr.xfeatures & XFEATURE_MASK_FP)) {
   1230			xsave->i387.mxcsr = mxcsr[0];
   1231			xsave->i387.mxcsr_mask = mxcsr[1];
   1232		}
   1233	}
   1234
   1235	for (i = 0; i < XFEATURE_MAX; i++) {
   1236		mask = BIT_ULL(i);
   1237
   1238		if (hdr.xfeatures & mask) {
   1239			void *dst = __raw_xsave_addr(xsave, i);
   1240
   1241			offset = xstate_offsets[i];
   1242			size = xstate_sizes[i];
   1243
   1244			if (copy_from_buffer(dst, offset, size, kbuf, ubuf))
   1245				return -EFAULT;
   1246		}
   1247	}
   1248
   1249	/*
   1250	 * The state that came in from userspace was user-state only.
   1251	 * Mask all the user states out of 'xfeatures':
   1252	 */
   1253	xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL;
   1254
   1255	/*
   1256	 * Add back in the features that came in from userspace:
   1257	 */
   1258	xsave->header.xfeatures |= hdr.xfeatures;
   1259
   1260	return 0;
   1261}
   1262
   1263/*
   1264 * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S]
   1265 * format and copy to the target thread. Used by ptrace and KVM.
   1266 */
   1267int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf)
   1268{
   1269	return copy_uabi_to_xstate(fpstate, kbuf, NULL);
   1270}
   1271
   1272/*
   1273 * Convert from a sigreturn standard-format user-space buffer to kernel
   1274 * XSAVE[S] format and copy to the target thread. This is called from the
   1275 * sigreturn() and rt_sigreturn() system calls.
   1276 */
   1277int copy_sigframe_from_user_to_xstate(struct fpstate *fpstate,
   1278				      const void __user *ubuf)
   1279{
   1280	return copy_uabi_to_xstate(fpstate, NULL, ubuf);
   1281}
   1282
   1283static bool validate_independent_components(u64 mask)
   1284{
   1285	u64 xchk;
   1286
   1287	if (WARN_ON_FPU(!cpu_feature_enabled(X86_FEATURE_XSAVES)))
   1288		return false;
   1289
   1290	xchk = ~xfeatures_mask_independent();
   1291
   1292	if (WARN_ON_ONCE(!mask || mask & xchk))
   1293		return false;
   1294
   1295	return true;
   1296}
   1297
   1298/**
   1299 * xsaves - Save selected components to a kernel xstate buffer
   1300 * @xstate:	Pointer to the buffer
   1301 * @mask:	Feature mask to select the components to save
   1302 *
   1303 * The @xstate buffer must be 64 byte aligned and correctly initialized as
   1304 * XSAVES does not write the full xstate header. Before first use the
   1305 * buffer should be zeroed otherwise a consecutive XRSTORS from that buffer
   1306 * can #GP.
   1307 *
   1308 * The feature mask must be a subset of the independent features.
   1309 */
   1310void xsaves(struct xregs_state *xstate, u64 mask)
   1311{
   1312	int err;
   1313
   1314	if (!validate_independent_components(mask))
   1315		return;
   1316
   1317	XSTATE_OP(XSAVES, xstate, (u32)mask, (u32)(mask >> 32), err);
   1318	WARN_ON_ONCE(err);
   1319}
   1320
   1321/**
   1322 * xrstors - Restore selected components from a kernel xstate buffer
   1323 * @xstate:	Pointer to the buffer
   1324 * @mask:	Feature mask to select the components to restore
   1325 *
   1326 * The @xstate buffer must be 64 byte aligned and correctly initialized
   1327 * otherwise XRSTORS from that buffer can #GP.
   1328 *
   1329 * Proper usage is to restore the state which was saved with
   1330 * xsaves() into @xstate.
   1331 *
   1332 * The feature mask must be a subset of the independent features.
   1333 */
   1334void xrstors(struct xregs_state *xstate, u64 mask)
   1335{
   1336	int err;
   1337
   1338	if (!validate_independent_components(mask))
   1339		return;
   1340
   1341	XSTATE_OP(XRSTORS, xstate, (u32)mask, (u32)(mask >> 32), err);
   1342	WARN_ON_ONCE(err);
   1343}
   1344
   1345#if IS_ENABLED(CONFIG_KVM)
   1346void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature)
   1347{
   1348	void *addr = get_xsave_addr(&fps->regs.xsave, xfeature);
   1349
   1350	if (addr)
   1351		memset(addr, 0, xstate_sizes[xfeature]);
   1352}
   1353EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component);
   1354#endif
   1355
   1356#ifdef CONFIG_X86_64
   1357
   1358#ifdef CONFIG_X86_DEBUG_FPU
   1359/*
   1360 * Ensure that a subsequent XSAVE* or XRSTOR* instruction with RFBM=@mask
   1361 * can safely operate on the @fpstate buffer.
   1362 */
   1363static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor)
   1364{
   1365	u64 xfd = __this_cpu_read(xfd_state);
   1366
   1367	if (fpstate->xfd == xfd)
   1368		return true;
   1369
   1370	 /*
   1371	  * The XFD MSR does not match fpstate->xfd. That's invalid when
   1372	  * the passed in fpstate is current's fpstate.
   1373	  */
   1374	if (fpstate->xfd == current->thread.fpu.fpstate->xfd)
   1375		return false;
   1376
   1377	/*
   1378	 * XRSTOR(S) from init_fpstate are always correct as it will just
   1379	 * bring all components into init state and not read from the
   1380	 * buffer. XSAVE(S) raises #PF after init.
   1381	 */
   1382	if (fpstate == &init_fpstate)
   1383		return rstor;
   1384
   1385	/*
   1386	 * XSAVE(S): clone(), fpu_swap_kvm_fpu()
   1387	 * XRSTORS(S): fpu_swap_kvm_fpu()
   1388	 */
   1389
   1390	/*
   1391	 * No XSAVE/XRSTOR instructions (except XSAVE itself) touch
   1392	 * the buffer area for XFD-disabled state components.
   1393	 */
   1394	mask &= ~xfd;
   1395
   1396	/*
   1397	 * Remove features which are valid in fpstate. They
   1398	 * have space allocated in fpstate.
   1399	 */
   1400	mask &= ~fpstate->xfeatures;
   1401
   1402	/*
   1403	 * Any remaining state components in 'mask' might be written
    1404	 * by XSAVE/XRSTOR. Fail validation if any are found.
   1405	 */
   1406	return !mask;
   1407}
   1408
   1409void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor)
   1410{
   1411	WARN_ON_ONCE(!xstate_op_valid(fpstate, mask, rstor));
   1412}
   1413#endif /* CONFIG_X86_DEBUG_FPU */
   1414
   1415static int __init xfd_update_static_branch(void)
   1416{
   1417	/*
   1418	 * If init_fpstate.xfd has bits set then dynamic features are
   1419	 * available and the dynamic sizing must be enabled.
   1420	 */
   1421	if (init_fpstate.xfd)
   1422		static_branch_enable(&__fpu_state_size_dynamic);
   1423	return 0;
   1424}
   1425arch_initcall(xfd_update_static_branch)
   1426
   1427void fpstate_free(struct fpu *fpu)
   1428{
   1429	if (fpu->fpstate && fpu->fpstate != &fpu->__fpstate)
   1430		vfree(fpu->fpstate);
   1431}
   1432
   1433/**
   1434 * fpstate_realloc - Reallocate struct fpstate for the requested new features
   1435 *
   1436 * @xfeatures:	A bitmap of xstate features which extend the enabled features
   1437 *		of that task
   1438 * @ksize:	The required size for the kernel buffer
   1439 * @usize:	The required size for user space buffers
   1440 * @guest_fpu:	Pointer to a guest FPU container. NULL for host allocations
   1441 *
   1442 * Note vs. vmalloc(): If the task with a vzalloc()-allocated buffer
   1443 * terminates quickly, vfree()-induced IPIs may be a concern, but tasks
   1444 * with large states are likely to live longer.
   1445 *
   1446 * Returns: 0 on success, -ENOMEM on allocation error.
   1447 */
   1448static int fpstate_realloc(u64 xfeatures, unsigned int ksize,
   1449			   unsigned int usize, struct fpu_guest *guest_fpu)
   1450{
   1451	struct fpu *fpu = &current->thread.fpu;
   1452	struct fpstate *curfps, *newfps = NULL;
   1453	unsigned int fpsize;
   1454	bool in_use;
   1455
   1456	fpsize = ksize + ALIGN(offsetof(struct fpstate, regs), 64);
   1457
   1458	newfps = vzalloc(fpsize);
   1459	if (!newfps)
   1460		return -ENOMEM;
   1461	newfps->size = ksize;
   1462	newfps->user_size = usize;
   1463	newfps->is_valloc = true;
   1464
   1465	/*
   1466	 * When a guest FPU is supplied, use @guest_fpu->fpstate
    1467	 * as the reference, independent of whether it is in use or not.
   1468	 */
   1469	curfps = guest_fpu ? guest_fpu->fpstate : fpu->fpstate;
   1470
   1471	/* Determine whether @curfps is the active fpstate */
   1472	in_use = fpu->fpstate == curfps;
   1473
   1474	if (guest_fpu) {
   1475		newfps->is_guest = true;
   1476		newfps->is_confidential = curfps->is_confidential;
   1477		newfps->in_use = curfps->in_use;
   1478		guest_fpu->xfeatures |= xfeatures;
   1479		guest_fpu->uabi_size = usize;
   1480	}
   1481
   1482	fpregs_lock();
   1483	/*
   1484	 * If @curfps is in use, ensure that the current state is in the
   1485	 * registers before swapping fpstate as that might invalidate it
   1486	 * due to layout changes.
   1487	 */
   1488	if (in_use && test_thread_flag(TIF_NEED_FPU_LOAD))
   1489		fpregs_restore_userregs();
   1490
   1491	newfps->xfeatures = curfps->xfeatures | xfeatures;
   1492
   1493	if (!guest_fpu)
   1494		newfps->user_xfeatures = curfps->user_xfeatures | xfeatures;
   1495
   1496	newfps->xfd = curfps->xfd & ~xfeatures;
   1497
   1498	/* Do the final updates within the locked region */
   1499	xstate_init_xcomp_bv(&newfps->regs.xsave, newfps->xfeatures);
   1500
   1501	if (guest_fpu) {
   1502		guest_fpu->fpstate = newfps;
   1503		/* If curfps is active, update the FPU fpstate pointer */
   1504		if (in_use)
   1505			fpu->fpstate = newfps;
   1506	} else {
   1507		fpu->fpstate = newfps;
   1508	}
   1509
   1510	if (in_use)
   1511		xfd_update_state(fpu->fpstate);
   1512	fpregs_unlock();
   1513
   1514	/* Only free valloc'ed state */
   1515	if (curfps && curfps->is_valloc)
   1516		vfree(curfps);
   1517
   1518	return 0;
   1519}
   1520
   1521static int validate_sigaltstack(unsigned int usize)
   1522{
   1523	struct task_struct *thread, *leader = current->group_leader;
   1524	unsigned long framesize = get_sigframe_size();
   1525
   1526	lockdep_assert_held(&current->sighand->siglock);
   1527
   1528	/* get_sigframe_size() is based on fpu_user_cfg.max_size */
   1529	framesize -= fpu_user_cfg.max_size;
   1530	framesize += usize;
   1531	for_each_thread(leader, thread) {
   1532		if (thread->sas_ss_size && thread->sas_ss_size < framesize)
   1533			return -ENOSPC;
   1534	}
   1535	return 0;
   1536}
   1537
   1538static int __xstate_request_perm(u64 permitted, u64 requested, bool guest)
   1539{
   1540	/*
   1541	 * This deliberately does not exclude !XSAVES as we still might
   1542	 * decide to optionally context switch XCR0 or talk the silicon
    1543	 * vendors into extending XFD for the pre-AMX states, especially
   1544	 * AVX512.
   1545	 */
   1546	bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
   1547	struct fpu *fpu = &current->group_leader->thread.fpu;
   1548	struct fpu_state_perm *perm;
   1549	unsigned int ksize, usize;
   1550	u64 mask;
   1551	int ret = 0;
   1552
   1553	/* Check whether fully enabled */
   1554	if ((permitted & requested) == requested)
   1555		return 0;
   1556
   1557	/* Calculate the resulting kernel state size */
   1558	mask = permitted | requested;
   1559	/* Take supervisor states into account on the host */
   1560	if (!guest)
   1561		mask |= xfeatures_mask_supervisor();
   1562	ksize = xstate_calculate_size(mask, compacted);
   1563
   1564	/* Calculate the resulting user state size */
   1565	mask &= XFEATURE_MASK_USER_SUPPORTED;
   1566	usize = xstate_calculate_size(mask, false);
   1567
   1568	if (!guest) {
   1569		ret = validate_sigaltstack(usize);
   1570		if (ret)
   1571			return ret;
   1572	}
   1573
   1574	perm = guest ? &fpu->guest_perm : &fpu->perm;
   1575	/* Pairs with the READ_ONCE() in xstate_get_group_perm() */
   1576	WRITE_ONCE(perm->__state_perm, mask);
   1577	/* Protected by sighand lock */
   1578	perm->__state_size = ksize;
   1579	perm->__user_state_size = usize;
   1580	return ret;
   1581}
   1582
   1583/*
   1584 * Permissions array to map facilities with more than one component
   1585 */
   1586static const u64 xstate_prctl_req[XFEATURE_MAX] = {
   1587	[XFEATURE_XTILE_DATA] = XFEATURE_MASK_XTILE_DATA,
   1588};
   1589
   1590static int xstate_request_perm(unsigned long idx, bool guest)
   1591{
   1592	u64 permitted, requested;
   1593	int ret;
   1594
   1595	if (idx >= XFEATURE_MAX)
   1596		return -EINVAL;
   1597
   1598	/*
   1599	 * Look up the facility mask which can require more than
   1600	 * one xstate component.
   1601	 */
   1602	idx = array_index_nospec(idx, ARRAY_SIZE(xstate_prctl_req));
   1603	requested = xstate_prctl_req[idx];
   1604	if (!requested)
   1605		return -EOPNOTSUPP;
   1606
   1607	if ((fpu_user_cfg.max_features & requested) != requested)
   1608		return -EOPNOTSUPP;
   1609
   1610	/* Lockless quick check */
   1611	permitted = xstate_get_group_perm(guest);
   1612	if ((permitted & requested) == requested)
   1613		return 0;
   1614
   1615	/* Protect against concurrent modifications */
   1616	spin_lock_irq(&current->sighand->siglock);
   1617	permitted = xstate_get_group_perm(guest);
   1618
   1619	/* First vCPU allocation locks the permissions. */
   1620	if (guest && (permitted & FPU_GUEST_PERM_LOCKED))
   1621		ret = -EBUSY;
   1622	else
   1623		ret = __xstate_request_perm(permitted, requested, guest);
   1624	spin_unlock_irq(&current->sighand->siglock);
   1625	return ret;
   1626}
   1627
   1628int __xfd_enable_feature(u64 xfd_err, struct fpu_guest *guest_fpu)
   1629{
   1630	u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC;
   1631	struct fpu_state_perm *perm;
   1632	unsigned int ksize, usize;
   1633	struct fpu *fpu;
   1634
   1635	if (!xfd_event) {
   1636		if (!guest_fpu)
   1637			pr_err_once("XFD: Invalid xfd error: %016llx\n", xfd_err);
   1638		return 0;
   1639	}
   1640
   1641	/* Protect against concurrent modifications */
   1642	spin_lock_irq(&current->sighand->siglock);
   1643
   1644	/* If not permitted let it die */
   1645	if ((xstate_get_group_perm(!!guest_fpu) & xfd_event) != xfd_event) {
   1646		spin_unlock_irq(&current->sighand->siglock);
   1647		return -EPERM;
   1648	}
   1649
   1650	fpu = &current->group_leader->thread.fpu;
   1651	perm = guest_fpu ? &fpu->guest_perm : &fpu->perm;
   1652	ksize = perm->__state_size;
   1653	usize = perm->__user_state_size;
   1654
   1655	/*
   1656	 * The feature is permitted. State size is sufficient.  Dropping
   1657	 * the lock is safe here even if more features are added from
    1658	 * another task; the retrieved buffer sizes are valid for the
   1659	 * currently requested feature(s).
   1660	 */
   1661	spin_unlock_irq(&current->sighand->siglock);
   1662
   1663	/*
   1664	 * Try to allocate a new fpstate. If that fails there is no way
   1665	 * out.
   1666	 */
   1667	if (fpstate_realloc(xfd_event, ksize, usize, guest_fpu))
   1668		return -EFAULT;
   1669	return 0;
   1670}
   1671
   1672int xfd_enable_feature(u64 xfd_err)
   1673{
   1674	return __xfd_enable_feature(xfd_err, NULL);
   1675}
   1676
   1677#else /* CONFIG_X86_64 */
   1678static inline int xstate_request_perm(unsigned long idx, bool guest)
   1679{
   1680	return -EPERM;
   1681}
   1682#endif  /* !CONFIG_X86_64 */
   1683
   1684u64 xstate_get_guest_group_perm(void)
   1685{
   1686	return xstate_get_group_perm(true);
   1687}
   1688EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm);
   1689
   1690/**
   1691 * fpu_xstate_prctl - xstate permission operations
   1693 * @option:	A subfunction of arch_prctl()
   1694 * @arg2:	option argument
   1695 * Return:	0 if successful; otherwise, an error code
   1696 *
   1697 * Option arguments:
   1698 *
   1699 * ARCH_GET_XCOMP_SUPP: Pointer to user space u64 to store the info
   1700 * ARCH_GET_XCOMP_PERM: Pointer to user space u64 to store the info
   1701 * ARCH_REQ_XCOMP_PERM: Facility number requested
   1702 *
   1703 * For facilities which require more than one XSTATE component, the request
   1704 * must be the highest state component number related to that facility,
   1705 * e.g. for AMX which requires XFEATURE_XTILE_CFG(17) and
   1706 * XFEATURE_XTILE_DATA(18) this would be XFEATURE_XTILE_DATA(18).
   1707 */
   1708long fpu_xstate_prctl(int option, unsigned long arg2)
   1709{
   1710	u64 __user *uptr = (u64 __user *)arg2;
   1711	u64 permitted, supported;
   1712	unsigned long idx = arg2;
   1713	bool guest = false;
   1714
   1715	switch (option) {
   1716	case ARCH_GET_XCOMP_SUPP:
   1717		supported = fpu_user_cfg.max_features |	fpu_user_cfg.legacy_features;
   1718		return put_user(supported, uptr);
   1719
   1720	case ARCH_GET_XCOMP_PERM:
   1721		/*
    1722		 * Lockless snapshot as it can also change right after
    1723		 * dropping the lock.
   1724		 */
   1725		permitted = xstate_get_host_group_perm();
   1726		permitted &= XFEATURE_MASK_USER_SUPPORTED;
   1727		return put_user(permitted, uptr);
   1728
   1729	case ARCH_GET_XCOMP_GUEST_PERM:
   1730		permitted = xstate_get_guest_group_perm();
   1731		permitted &= XFEATURE_MASK_USER_SUPPORTED;
   1732		return put_user(permitted, uptr);
   1733
   1734	case ARCH_REQ_XCOMP_GUEST_PERM:
   1735		guest = true;
   1736		fallthrough;
   1737
   1738	case ARCH_REQ_XCOMP_PERM:
   1739		if (!IS_ENABLED(CONFIG_X86_64))
   1740			return -EOPNOTSUPP;
   1741
   1742		return xstate_request_perm(idx, guest);
   1743
   1744	default:
   1745		return -EINVAL;
   1746	}
   1747}
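/*
 * Editorial example (not upstream): a hedged userspace sketch of requesting
 * AMX permission before first use, per the kernel-doc above.  It assumes an
 * x86-64 glibc environment; the helper name is hypothetical and error
 * handling is minimal.  18 is XFEATURE_XTILE_DATA, the highest component of
 * the AMX facility.
 */
#if 0
#include <asm/prctl.h>		/* ARCH_REQ_XCOMP_PERM, ARCH_GET_XCOMP_PERM */
#include <sys/syscall.h>
#include <unistd.h>

static int example_request_amx_permission(void)
{
	return syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, 18);
}
#endif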
   1748
   1749#ifdef CONFIG_PROC_PID_ARCH_STATUS
   1750/*
    1751 * Report the amount of time elapsed in milliseconds since the last AVX512
   1752 * use in the task.
   1753 */
   1754static void avx512_status(struct seq_file *m, struct task_struct *task)
   1755{
   1756	unsigned long timestamp = READ_ONCE(task->thread.fpu.avx512_timestamp);
   1757	long delta;
   1758
   1759	if (!timestamp) {
   1760		/*
   1761		 * Report -1 if no AVX512 usage
   1762		 */
   1763		delta = -1;
   1764	} else {
   1765		delta = (long)(jiffies - timestamp);
   1766		/*
   1767		 * Cap to LONG_MAX if time difference > LONG_MAX
   1768		 */
   1769		if (delta < 0)
   1770			delta = LONG_MAX;
   1771		delta = jiffies_to_msecs(delta);
   1772	}
   1773
   1774	seq_put_decimal_ll(m, "AVX512_elapsed_ms:\t", delta);
   1775	seq_putc(m, '\n');
   1776}
   1777
   1778/*
   1779 * Report architecture specific information
   1780 */
   1781int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns,
   1782			struct pid *pid, struct task_struct *task)
   1783{
   1784	/*
    1785	 * Report AVX512 state if the processor and build option support it.
   1786	 */
   1787	if (cpu_feature_enabled(X86_FEATURE_AVX512F))
   1788		avx512_status(m, task);
   1789
   1790	return 0;
   1791}
   1792#endif /* CONFIG_PROC_PID_ARCH_STATUS */