cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

process_64.c (22402B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 *  Copyright (C) 1995  Linus Torvalds
      4 *
      5 *  Pentium III FXSR, SSE support
      6 *	Gareth Hughes <gareth@valinux.com>, May 2000
      7 *
      8 *  X86-64 port
      9 *	Andi Kleen.
     10 *
     11 *	CPU hotplug support - ashok.raj@intel.com
     12 */
     13
     14/*
     15 * This file handles the architecture-dependent parts of process handling.
     16 */
     17
     18#include <linux/cpu.h>
     19#include <linux/errno.h>
     20#include <linux/sched.h>
     21#include <linux/sched/task.h>
     22#include <linux/sched/task_stack.h>
     23#include <linux/fs.h>
     24#include <linux/kernel.h>
     25#include <linux/mm.h>
     26#include <linux/elfcore.h>
     27#include <linux/smp.h>
     28#include <linux/slab.h>
     29#include <linux/user.h>
     30#include <linux/interrupt.h>
     31#include <linux/delay.h>
     32#include <linux/export.h>
     33#include <linux/ptrace.h>
     34#include <linux/notifier.h>
     35#include <linux/kprobes.h>
     36#include <linux/kdebug.h>
     37#include <linux/prctl.h>
     38#include <linux/uaccess.h>
     39#include <linux/io.h>
     40#include <linux/ftrace.h>
     41#include <linux/syscalls.h>
     42
     43#include <asm/processor.h>
     44#include <asm/pkru.h>
     45#include <asm/fpu/sched.h>
     46#include <asm/mmu_context.h>
     47#include <asm/prctl.h>
     48#include <asm/desc.h>
     49#include <asm/proto.h>
     50#include <asm/ia32.h>
     51#include <asm/debugreg.h>
     52#include <asm/switch_to.h>
     53#include <asm/xen/hypervisor.h>
     54#include <asm/vdso.h>
     55#include <asm/resctrl.h>
     56#include <asm/unistd.h>
     57#include <asm/fsgsbase.h>
     58#ifdef CONFIG_IA32_EMULATION
     59/* Not included via unistd.h */
     60#include <asm/unistd_32_ia32.h>
     61#endif
     62
     63#include "process.h"
     64
     65/* Also prints some state that isn't saved in pt_regs */
     66void __show_regs(struct pt_regs *regs, enum show_regs_mode mode,
     67		 const char *log_lvl)
     68{
     69	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
     70	unsigned long d0, d1, d2, d3, d6, d7;
     71	unsigned int fsindex, gsindex;
     72	unsigned int ds, es;
     73
     74	show_iret_regs(regs, log_lvl);
     75
     76	if (regs->orig_ax != -1)
     77		pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
     78	else
     79		pr_cont("\n");
     80
     81	printk("%sRAX: %016lx RBX: %016lx RCX: %016lx\n",
     82	       log_lvl, regs->ax, regs->bx, regs->cx);
     83	printk("%sRDX: %016lx RSI: %016lx RDI: %016lx\n",
     84	       log_lvl, regs->dx, regs->si, regs->di);
     85	printk("%sRBP: %016lx R08: %016lx R09: %016lx\n",
     86	       log_lvl, regs->bp, regs->r8, regs->r9);
     87	printk("%sR10: %016lx R11: %016lx R12: %016lx\n",
     88	       log_lvl, regs->r10, regs->r11, regs->r12);
     89	printk("%sR13: %016lx R14: %016lx R15: %016lx\n",
     90	       log_lvl, regs->r13, regs->r14, regs->r15);
     91
     92	if (mode == SHOW_REGS_SHORT)
     93		return;
     94
     95	if (mode == SHOW_REGS_USER) {
     96		rdmsrl(MSR_FS_BASE, fs);
     97		rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
     98		printk("%sFS:  %016lx GS:  %016lx\n",
     99		       log_lvl, fs, shadowgs);
    100		return;
    101	}
    102
    103	asm("movl %%ds,%0" : "=r" (ds));
    104	asm("movl %%es,%0" : "=r" (es));
    105	asm("movl %%fs,%0" : "=r" (fsindex));
    106	asm("movl %%gs,%0" : "=r" (gsindex));
    107
    108	rdmsrl(MSR_FS_BASE, fs);
    109	rdmsrl(MSR_GS_BASE, gs);
    110	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
    111
    112	cr0 = read_cr0();
    113	cr2 = read_cr2();
    114	cr3 = __read_cr3();
    115	cr4 = __read_cr4();
    116
    117	printk("%sFS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
    118	       log_lvl, fs, fsindex, gs, gsindex, shadowgs);
    119	printk("%sCS:  %04lx DS: %04x ES: %04x CR0: %016lx\n",
    120		log_lvl, regs->cs, ds, es, cr0);
    121	printk("%sCR2: %016lx CR3: %016lx CR4: %016lx\n",
    122		log_lvl, cr2, cr3, cr4);
    123
    124	get_debugreg(d0, 0);
    125	get_debugreg(d1, 1);
    126	get_debugreg(d2, 2);
    127	get_debugreg(d3, 3);
    128	get_debugreg(d6, 6);
    129	get_debugreg(d7, 7);
    130
    131	/* Only print out debug registers if they are in their non-default state. */
    132	if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
    133	    (d6 == DR6_RESERVED) && (d7 == 0x400))) {
    134		printk("%sDR0: %016lx DR1: %016lx DR2: %016lx\n",
    135		       log_lvl, d0, d1, d2);
    136		printk("%sDR3: %016lx DR6: %016lx DR7: %016lx\n",
    137		       log_lvl, d3, d6, d7);
    138	}
    139
    140	if (cpu_feature_enabled(X86_FEATURE_OSPKE))
    141		printk("%sPKRU: %08x\n", log_lvl, read_pkru());
    142}
    143
    144void release_thread(struct task_struct *dead_task)
    145{
    146	WARN_ON(dead_task->mm);
    147}
    148
    149enum which_selector {
    150	FS,
    151	GS
    152};
    153
    154/*
     155 * Out of line to be protected from kprobes and tracing. If this were
     156 * traced or probed, then any access to a per-CPU variable would happen
     157 * with the wrong GS.
    158 *
    159 * It is not used on Xen paravirt. When paravirt support is needed, it
    160 * needs to be renamed with native_ prefix.
    161 */
    162static noinstr unsigned long __rdgsbase_inactive(void)
    163{
    164	unsigned long gsbase;
    165
    166	lockdep_assert_irqs_disabled();
    167
    168	if (!static_cpu_has(X86_FEATURE_XENPV)) {
    169		native_swapgs();
    170		gsbase = rdgsbase();
    171		native_swapgs();
    172	} else {
    173		instrumentation_begin();
    174		rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
    175		instrumentation_end();
    176	}
    177
    178	return gsbase;
    179}
    180
    181/*
     182 * Out of line to be protected from kprobes and tracing. If this were
     183 * traced or probed, then any access to a per-CPU variable would happen
     184 * with the wrong GS.
    185 *
    186 * It is not used on Xen paravirt. When paravirt support is needed, it
    187 * needs to be renamed with native_ prefix.
    188 */
    189static noinstr void __wrgsbase_inactive(unsigned long gsbase)
    190{
    191	lockdep_assert_irqs_disabled();
    192
    193	if (!static_cpu_has(X86_FEATURE_XENPV)) {
    194		native_swapgs();
    195		wrgsbase(gsbase);
    196		native_swapgs();
    197	} else {
    198		instrumentation_begin();
    199		wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
    200		instrumentation_end();
    201	}
    202}
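
/*
 * Both helpers above rely on the same trick: SWAPGS exchanges the active
 * GS base with MSR_KERNEL_GS_BASE, so bracketing RDGSBASE/WRGSBASE with a
 * SWAPGS pair reads or writes the inactive (user) GS base without paying
 * for an MSR access.
 */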
    203
    204/*
    205 * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
    206 * not available.  The goal is to be reasonably fast on non-FSGSBASE systems.
    207 * It's forcibly inlined because it'll generate better code and this function
    208 * is hot.
    209 */
    210static __always_inline void save_base_legacy(struct task_struct *prev_p,
    211					     unsigned short selector,
    212					     enum which_selector which)
    213{
    214	if (likely(selector == 0)) {
    215		/*
    216		 * On Intel (without X86_BUG_NULL_SEG), the segment base could
    217		 * be the pre-existing saved base or it could be zero.  On AMD
    218		 * (with X86_BUG_NULL_SEG), the segment base could be almost
    219		 * anything.
    220		 *
    221		 * This branch is very hot (it's hit twice on almost every
    222		 * context switch between 64-bit programs), and avoiding
    223		 * the RDMSR helps a lot, so we just assume that whatever
    224		 * value is already saved is correct.  This matches historical
    225		 * Linux behavior, so it won't break existing applications.
    226		 *
    227		 * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we
    228		 * report that the base is zero, it needs to actually be zero:
    229		 * see the corresponding logic in load_seg_legacy.
    230		 */
    231	} else {
    232		/*
    233		 * If the selector is 1, 2, or 3, then the base is zero on
    234		 * !X86_BUG_NULL_SEG CPUs and could be anything on
    235		 * X86_BUG_NULL_SEG CPUs.  In the latter case, Linux
    236		 * has never attempted to preserve the base across context
    237		 * switches.
    238		 *
    239		 * If selector > 3, then it refers to a real segment, and
    240		 * saving the base isn't necessary.
    241		 */
    242		if (which == FS)
    243			prev_p->thread.fsbase = 0;
    244		else
    245			prev_p->thread.gsbase = 0;
    246	}
    247}
    248
    249static __always_inline void save_fsgs(struct task_struct *task)
    250{
    251	savesegment(fs, task->thread.fsindex);
    252	savesegment(gs, task->thread.gsindex);
    253	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
    254		/*
    255		 * If FSGSBASE is enabled, we can't make any useful guesses
    256		 * about the base, and user code expects us to save the current
    257		 * value.  Fortunately, reading the base directly is efficient.
    258		 */
    259		task->thread.fsbase = rdfsbase();
    260		task->thread.gsbase = __rdgsbase_inactive();
    261	} else {
    262		save_base_legacy(task, task->thread.fsindex, FS);
    263		save_base_legacy(task, task->thread.gsindex, GS);
    264	}
    265}
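
/*
 * For reference, a minimal userspace sketch (not part of this file) of the
 * same FSGSBASE fast path: when the kernel advertises HWCAP2_FSGSBASE, user
 * code may read the bases directly via the compiler intrinsics (build with
 * -mfsgsbase).  The fallback #define below is an assumption for older libc
 * headers that lack the constant.
 *
 *	#include <sys/auxv.h>
 *	#include <immintrin.h>
 *	#include <stdio.h>
 *
 *	#ifndef HWCAP2_FSGSBASE
 *	#define HWCAP2_FSGSBASE (1 << 1)
 *	#endif
 *
 *	int main(void)
 *	{
 *		if (!(getauxval(AT_HWCAP2) & HWCAP2_FSGSBASE))
 *			return 1;	// kernel did not enable CR4.FSGSBASE
 *		printf("fsbase=%#lx gsbase=%#lx\n",
 *		       (unsigned long)_readfsbase_u64(),
 *		       (unsigned long)_readgsbase_u64());
 *		return 0;
 *	}
 */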
    266
    267/*
     268 * While a process is running, current->thread.fsbase and current->thread.gsbase
    269 * may not match the corresponding CPU registers (see save_base_legacy()).
    270 */
    271void current_save_fsgs(void)
    272{
    273	unsigned long flags;
    274
    275	/* Interrupts need to be off for FSGSBASE */
    276	local_irq_save(flags);
    277	save_fsgs(current);
    278	local_irq_restore(flags);
    279}
    280#if IS_ENABLED(CONFIG_KVM)
    281EXPORT_SYMBOL_GPL(current_save_fsgs);
    282#endif
    283
    284static __always_inline void loadseg(enum which_selector which,
    285				    unsigned short sel)
    286{
    287	if (which == FS)
    288		loadsegment(fs, sel);
    289	else
    290		load_gs_index(sel);
    291}
    292
    293static __always_inline void load_seg_legacy(unsigned short prev_index,
    294					    unsigned long prev_base,
    295					    unsigned short next_index,
    296					    unsigned long next_base,
    297					    enum which_selector which)
    298{
    299	if (likely(next_index <= 3)) {
    300		/*
    301		 * The next task is using 64-bit TLS, is not using this
    302		 * segment at all, or is having fun with arcane CPU features.
    303		 */
    304		if (next_base == 0) {
    305			/*
    306			 * Nasty case: on AMD CPUs, we need to forcibly zero
    307			 * the base.
    308			 */
    309			if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
    310				loadseg(which, __USER_DS);
    311				loadseg(which, next_index);
    312			} else {
    313				/*
    314				 * We could try to exhaustively detect cases
    315				 * under which we can skip the segment load,
    316				 * but there's really only one case that matters
    317				 * for performance: if both the previous and
    318				 * next states are fully zeroed, we can skip
    319				 * the load.
    320				 *
    321				 * (This assumes that prev_base == 0 has no
    322				 * false positives.  This is the case on
    323				 * Intel-style CPUs.)
    324				 */
    325				if (likely(prev_index | next_index | prev_base))
    326					loadseg(which, next_index);
    327			}
    328		} else {
    329			if (prev_index != next_index)
    330				loadseg(which, next_index);
    331			wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE,
    332			       next_base);
    333		}
    334	} else {
    335		/*
    336		 * The next task is using a real segment.  Loading the selector
    337		 * is sufficient.
    338		 */
    339		loadseg(which, next_index);
    340	}
    341}
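
/*
 * To summarize the legacy (non-FSGSBASE) restore above, with prev's state
 * recorded by save_base_legacy():
 *
 *  - next_index <= 3 and next_base == 0: reload the selector only if any of
 *    the previous or next state was nonzero; on X86_BUG_NULL_SEG CPUs go
 *    through __USER_DS first so the stale base really gets cleared.
 *  - next_index <= 3 and next_base != 0: reload the selector if it changed,
 *    then write the base MSR.
 *  - next_index > 3: a real GDT/LDT segment; loading the selector also loads
 *    its base.
 */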
    342
    343/*
    344 * Store prev's PKRU value and load next's PKRU value if they differ. PKRU
    345 * is not XSTATE managed on context switch because that would require a
    346 * lookup in the task's FPU xsave buffer and require to keep that updated
    347 * in various places.
    348 */
    349static __always_inline void x86_pkru_load(struct thread_struct *prev,
    350					  struct thread_struct *next)
    351{
    352	if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
    353		return;
    354
    355	/* Stash the prev task's value: */
    356	prev->pkru = rdpkru();
    357
    358	/*
    359	 * PKRU writes are slightly expensive.  Avoid them when not
    360	 * strictly necessary:
    361	 */
    362	if (prev->pkru != next->pkru)
    363		wrpkru(next->pkru);
    364}
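
/*
 * PKRU packs two bits per protection key (access-disable and write-disable
 * for keys 0-15), so the single 32-bit compare above is enough to decide
 * whether the next task needs a WRPKRU at all.
 */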
    365
    366static __always_inline void x86_fsgsbase_load(struct thread_struct *prev,
    367					      struct thread_struct *next)
    368{
    369	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
    370		/* Update the FS and GS selectors if they could have changed. */
    371		if (unlikely(prev->fsindex || next->fsindex))
    372			loadseg(FS, next->fsindex);
    373		if (unlikely(prev->gsindex || next->gsindex))
    374			loadseg(GS, next->gsindex);
    375
    376		/* Update the bases. */
    377		wrfsbase(next->fsbase);
    378		__wrgsbase_inactive(next->gsbase);
    379	} else {
    380		load_seg_legacy(prev->fsindex, prev->fsbase,
    381				next->fsindex, next->fsbase, FS);
    382		load_seg_legacy(prev->gsindex, prev->gsbase,
    383				next->gsindex, next->gsbase, GS);
    384	}
    385}
    386
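/*
 * Look up the base that @selector would give @task: the selector's index
 * (bits 15:3) picks a descriptor from either the task's GDT TLS slots or
 * its mm's LDT, depending on the TI bit.
 */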
    387unsigned long x86_fsgsbase_read_task(struct task_struct *task,
    388				     unsigned short selector)
    389{
    390	unsigned short idx = selector >> 3;
    391	unsigned long base;
    392
    393	if (likely((selector & SEGMENT_TI_MASK) == 0)) {
    394		if (unlikely(idx >= GDT_ENTRIES))
    395			return 0;
    396
    397		/*
    398		 * There are no user segments in the GDT with nonzero bases
    399		 * other than the TLS segments.
    400		 */
    401		if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
    402			return 0;
    403
    404		idx -= GDT_ENTRY_TLS_MIN;
    405		base = get_desc_base(&task->thread.tls_array[idx]);
    406	} else {
    407#ifdef CONFIG_MODIFY_LDT_SYSCALL
    408		struct ldt_struct *ldt;
    409
    410		/*
    411		 * If performance here mattered, we could protect the LDT
    412		 * with RCU.  This is a slow path, though, so we can just
    413		 * take the mutex.
    414		 */
    415		mutex_lock(&task->mm->context.lock);
    416		ldt = task->mm->context.ldt;
    417		if (unlikely(!ldt || idx >= ldt->nr_entries))
    418			base = 0;
    419		else
    420			base = get_desc_base(ldt->entries + idx);
    421		mutex_unlock(&task->mm->context.lock);
    422#else
    423		base = 0;
    424#endif
    425	}
    426
    427	return base;
    428}
    429
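/*
 * Read the inactive (user) GS base on the current CPU.  With FSGSBASE this
 * uses the SWAPGS dance in __rdgsbase_inactive(); otherwise the value lives
 * in MSR_KERNEL_GS_BASE.  x86_gsbase_write_cpu_inactive() below is the
 * symmetric write side.
 */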
    430unsigned long x86_gsbase_read_cpu_inactive(void)
    431{
    432	unsigned long gsbase;
    433
    434	if (boot_cpu_has(X86_FEATURE_FSGSBASE)) {
    435		unsigned long flags;
    436
    437		local_irq_save(flags);
    438		gsbase = __rdgsbase_inactive();
    439		local_irq_restore(flags);
    440	} else {
    441		rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
    442	}
    443
    444	return gsbase;
    445}
    446
    447void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
    448{
    449	if (boot_cpu_has(X86_FEATURE_FSGSBASE)) {
    450		unsigned long flags;
    451
    452		local_irq_save(flags);
    453		__wrgsbase_inactive(gsbase);
    454		local_irq_restore(flags);
    455	} else {
    456		wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
    457	}
    458}
    459
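/*
 * x86_fsbase_read_task()/x86_gsbase_read_task() pick the cheapest correct
 * source for the base: the live CPU state for current, the saved
 * thread.fsbase/gsbase when FSGSBASE is available or the selector is zero,
 * and a descriptor-table lookup otherwise.
 */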
    460unsigned long x86_fsbase_read_task(struct task_struct *task)
    461{
    462	unsigned long fsbase;
    463
    464	if (task == current)
    465		fsbase = x86_fsbase_read_cpu();
    466	else if (boot_cpu_has(X86_FEATURE_FSGSBASE) ||
    467		 (task->thread.fsindex == 0))
    468		fsbase = task->thread.fsbase;
    469	else
    470		fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex);
    471
    472	return fsbase;
    473}
    474
    475unsigned long x86_gsbase_read_task(struct task_struct *task)
    476{
    477	unsigned long gsbase;
    478
    479	if (task == current)
    480		gsbase = x86_gsbase_read_cpu_inactive();
    481	else if (boot_cpu_has(X86_FEATURE_FSGSBASE) ||
    482		 (task->thread.gsindex == 0))
    483		gsbase = task->thread.gsbase;
    484	else
    485		gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex);
    486
    487	return gsbase;
    488}
    489
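/*
 * The *_write_task() helpers only update the saved thread state, which is
 * sufficient for a stopped (e.g. ptraced) task; writing the base of the
 * running task must go through the CPU instead, hence the WARN_ON_ONCE().
 */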
    490void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase)
    491{
    492	WARN_ON_ONCE(task == current);
    493
    494	task->thread.fsbase = fsbase;
    495}
    496
    497void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase)
    498{
    499	WARN_ON_ONCE(task == current);
    500
    501	task->thread.gsbase = gsbase;
    502}
    503
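/*
 * Set up the user register state for a fresh execve(): flat user segments,
 * the new instruction and stack pointers, and IF set so interrupts are
 * enabled.  The values take effect on the next return to user mode.
 */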
    504static void
    505start_thread_common(struct pt_regs *regs, unsigned long new_ip,
    506		    unsigned long new_sp,
    507		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
    508{
    509	WARN_ON_ONCE(regs != current_pt_regs());
    510
    511	if (static_cpu_has(X86_BUG_NULL_SEG)) {
    512		/* Loading zero below won't clear the base. */
    513		loadsegment(fs, __USER_DS);
    514		load_gs_index(__USER_DS);
    515	}
    516
    517	loadsegment(fs, 0);
    518	loadsegment(es, _ds);
    519	loadsegment(ds, _ds);
    520	load_gs_index(0);
    521
    522	regs->ip		= new_ip;
    523	regs->sp		= new_sp;
    524	regs->cs		= _cs;
    525	regs->ss		= _ss;
    526	regs->flags		= X86_EFLAGS_IF;
    527}
    528
    529void
    530start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
    531{
    532	start_thread_common(regs, new_ip, new_sp,
    533			    __USER_CS, __USER_DS, 0);
    534}
    535EXPORT_SYMBOL_GPL(start_thread);
    536
    537#ifdef CONFIG_COMPAT
    538void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp, bool x32)
    539{
    540	start_thread_common(regs, new_ip, new_sp,
    541			    x32 ? __USER_CS : __USER32_CS,
    542			    __USER_DS, __USER_DS);
    543}
    544#endif
    545
    546/*
    547 *	switch_to(x,y) should switch tasks from x to y.
    548 *
    549 * This could still be optimized:
    550 * - fold all the options into a flag word and test it with a single test.
    551 * - could test fs/gs bitsliced
    552 *
    553 * Kprobes not supported here. Set the probe on schedule instead.
     554 * Function graph tracer is not supported either.
    555 */
    556__visible __notrace_funcgraph struct task_struct *
    557__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
    558{
    559	struct thread_struct *prev = &prev_p->thread;
    560	struct thread_struct *next = &next_p->thread;
    561	struct fpu *prev_fpu = &prev->fpu;
    562	int cpu = smp_processor_id();
    563
    564	WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
    565		     this_cpu_read(hardirq_stack_inuse));
    566
    567	if (!test_thread_flag(TIF_NEED_FPU_LOAD))
    568		switch_fpu_prepare(prev_fpu, cpu);
    569
    570	/* We must save %fs and %gs before load_TLS() because
    571	 * %fs and %gs may be cleared by load_TLS().
    572	 *
    573	 * (e.g. xen_load_tls())
    574	 */
    575	save_fsgs(prev_p);
    576
    577	/*
    578	 * Load TLS before restoring any segments so that segment loads
    579	 * reference the correct GDT entries.
    580	 */
    581	load_TLS(next, cpu);
    582
    583	/*
    584	 * Leave lazy mode, flushing any hypercalls made here.  This
    585	 * must be done after loading TLS entries in the GDT but before
    586	 * loading segments that might reference them.
    587	 */
    588	arch_end_context_switch(next_p);
    589
    590	/* Switch DS and ES.
    591	 *
    592	 * Reading them only returns the selectors, but writing them (if
    593	 * nonzero) loads the full descriptor from the GDT or LDT.  The
    594	 * LDT for next is loaded in switch_mm, and the GDT is loaded
    595	 * above.
    596	 *
    597	 * We therefore need to write new values to the segment
    598	 * registers on every context switch unless both the new and old
    599	 * values are zero.
    600	 *
    601	 * Note that we don't need to do anything for CS and SS, as
    602	 * those are saved and restored as part of pt_regs.
    603	 */
    604	savesegment(es, prev->es);
    605	if (unlikely(next->es | prev->es))
    606		loadsegment(es, next->es);
    607
    608	savesegment(ds, prev->ds);
    609	if (unlikely(next->ds | prev->ds))
    610		loadsegment(ds, next->ds);
    611
    612	x86_fsgsbase_load(prev, next);
    613
    614	x86_pkru_load(prev, next);
    615
    616	/*
    617	 * Switch the PDA and FPU contexts.
    618	 */
    619	this_cpu_write(current_task, next_p);
    620	this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));
    621
    622	switch_fpu_finish();
    623
    624	/* Reload sp0. */
    625	update_task_stack(next_p);
    626
    627	switch_to_extra(prev_p, next_p);
    628
    629	if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
    630		/*
    631		 * AMD CPUs have a misfeature: SYSRET sets the SS selector but
    632		 * does not update the cached descriptor.  As a result, if we
    633		 * do SYSRET while SS is NULL, we'll end up in user mode with
    634		 * SS apparently equal to __USER_DS but actually unusable.
    635		 *
    636		 * The straightforward workaround would be to fix it up just
    637		 * before SYSRET, but that would slow down the system call
    638		 * fast paths.  Instead, we ensure that SS is never NULL in
    639		 * system call context.  We do this by replacing NULL SS
    640		 * selectors at every context switch.  SYSCALL sets up a valid
    641		 * SS, so the only way to get NULL is to re-enter the kernel
    642		 * from CPL 3 through an interrupt.  Since that can't happen
    643		 * in the same task as a running syscall, we are guaranteed to
    644		 * context switch between every interrupt vector entry and a
    645		 * subsequent SYSRET.
    646		 *
    647		 * We read SS first because SS reads are much faster than
    648		 * writes.  Out of caution, we force SS to __KERNEL_DS even if
    649		 * it previously had a different non-NULL value.
    650		 */
    651		unsigned short ss_sel;
    652		savesegment(ss, ss_sel);
    653		if (ss_sel != __KERNEL_DS)
    654			loadsegment(ss, __KERNEL_DS);
    655	}
    656
    657	/* Load the Intel cache allocation PQR MSR. */
    658	resctrl_sched_in();
    659
    660	return prev_p;
    661}
    662
    663void set_personality_64bit(void)
    664{
    665	/* inherit personality from parent */
    666
    667	/* Make sure to be in 64bit mode */
    668	clear_thread_flag(TIF_ADDR32);
    669	/* Pretend that this comes from a 64bit execve */
    670	task_pt_regs(current)->orig_ax = __NR_execve;
    671	current_thread_info()->status &= ~TS_COMPAT;
    672	if (current->mm)
    673		current->mm->context.flags = MM_CONTEXT_HAS_VSYSCALL;
    674
    675	/* TBD: overwrites user setup. Should have two bits.
    676	   But 64bit processes have always behaved this way,
    677	   so it's not too bad. The main problem is just that
    678	   32bit children are affected again. */
    679	current->personality &= ~READ_IMPLIES_EXEC;
    680}
    681
    682static void __set_personality_x32(void)
    683{
    684#ifdef CONFIG_X86_X32_ABI
    685	if (current->mm)
    686		current->mm->context.flags = 0;
    687
    688	current->personality &= ~READ_IMPLIES_EXEC;
    689	/*
    690	 * in_32bit_syscall() uses the presence of the x32 syscall bit
    691	 * flag to determine compat status.  The x86 mmap() code relies on
     692	 * the syscall bitness, so set the x32 syscall bit right here to make
    693	 * in_32bit_syscall() work during exec().
    694	 *
     695	 * Pretend to come from an x32 execve.
    696	 */
    697	task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT;
    698	current_thread_info()->status &= ~TS_COMPAT;
    699#endif
    700}
    701
    702static void __set_personality_ia32(void)
    703{
    704#ifdef CONFIG_IA32_EMULATION
    705	if (current->mm) {
    706		/*
    707		 * uprobes applied to this MM need to know this and
    708		 * cannot use user_64bit_mode() at that time.
    709		 */
    710		current->mm->context.flags = MM_CONTEXT_UPROBE_IA32;
    711	}
    712
    713	current->personality |= force_personality32;
    714	/* Prepare the first "return" to user space */
    715	task_pt_regs(current)->orig_ax = __NR_ia32_execve;
    716	current_thread_info()->status |= TS_COMPAT;
    717#endif
    718}
    719
    720void set_personality_ia32(bool x32)
    721{
    722	/* Make sure to be in 32bit mode */
    723	set_thread_flag(TIF_ADDR32);
    724
    725	if (x32)
    726		__set_personality_x32();
    727	else
    728		__set_personality_ia32();
    729}
    730EXPORT_SYMBOL_GPL(set_personality_ia32);
    731
    732#ifdef CONFIG_CHECKPOINT_RESTORE
    733static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
    734{
    735	int ret;
    736
    737	ret = map_vdso_once(image, addr);
    738	if (ret)
    739		return ret;
    740
    741	return (long)image->size;
    742}
    743#endif
    744
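/*
 * A minimal userspace sketch (not part of this file) of driving the FS/GS
 * options below.  ARCH_SET_GS and friends come from <asm/prctl.h>; the raw
 * syscall is used because a libc wrapper for arch_prctl() may not be
 * available.
 *
 *	#include <asm/prctl.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static unsigned long tls_block[64];
 *
 *	int main(void)
 *	{
 *		unsigned long base = 0;
 *
 *		syscall(SYS_arch_prctl, ARCH_SET_GS, (unsigned long)tls_block);
 *		syscall(SYS_arch_prctl, ARCH_GET_GS, &base);
 *		return base != (unsigned long)tls_block;
 *	}
 */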
    745long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
    746{
    747	int ret = 0;
    748
    749	switch (option) {
    750	case ARCH_SET_GS: {
    751		if (unlikely(arg2 >= TASK_SIZE_MAX))
    752			return -EPERM;
    753
    754		preempt_disable();
    755		/*
    756		 * ARCH_SET_GS has always overwritten the index
    757		 * and the base. Zero is the most sensible value
    758		 * to put in the index, and is the only value that
    759		 * makes any sense if FSGSBASE is unavailable.
    760		 */
    761		if (task == current) {
    762			loadseg(GS, 0);
    763			x86_gsbase_write_cpu_inactive(arg2);
    764
    765			/*
    766			 * On non-FSGSBASE systems, save_base_legacy() expects
    767			 * that we also fill in thread.gsbase.
    768			 */
    769			task->thread.gsbase = arg2;
    770
    771		} else {
    772			task->thread.gsindex = 0;
    773			x86_gsbase_write_task(task, arg2);
    774		}
    775		preempt_enable();
    776		break;
    777	}
    778	case ARCH_SET_FS: {
    779		/*
    780		 * Not strictly needed for %fs, but do it for symmetry
    781		 * with %gs
    782		 */
    783		if (unlikely(arg2 >= TASK_SIZE_MAX))
    784			return -EPERM;
    785
    786		preempt_disable();
    787		/*
    788		 * Set the selector to 0 for the same reason
    789		 * as %gs above.
    790		 */
    791		if (task == current) {
    792			loadseg(FS, 0);
    793			x86_fsbase_write_cpu(arg2);
    794
    795			/*
    796			 * On non-FSGSBASE systems, save_base_legacy() expects
    797			 * that we also fill in thread.fsbase.
    798			 */
    799			task->thread.fsbase = arg2;
    800		} else {
    801			task->thread.fsindex = 0;
    802			x86_fsbase_write_task(task, arg2);
    803		}
    804		preempt_enable();
    805		break;
    806	}
    807	case ARCH_GET_FS: {
    808		unsigned long base = x86_fsbase_read_task(task);
    809
    810		ret = put_user(base, (unsigned long __user *)arg2);
    811		break;
    812	}
    813	case ARCH_GET_GS: {
    814		unsigned long base = x86_gsbase_read_task(task);
    815
    816		ret = put_user(base, (unsigned long __user *)arg2);
    817		break;
    818	}
    819
    820#ifdef CONFIG_CHECKPOINT_RESTORE
    821# ifdef CONFIG_X86_X32_ABI
    822	case ARCH_MAP_VDSO_X32:
    823		return prctl_map_vdso(&vdso_image_x32, arg2);
    824# endif
    825# if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
    826	case ARCH_MAP_VDSO_32:
    827		return prctl_map_vdso(&vdso_image_32, arg2);
    828# endif
    829	case ARCH_MAP_VDSO_64:
    830		return prctl_map_vdso(&vdso_image_64, arg2);
    831#endif
    832
    833	default:
    834		ret = -EINVAL;
    835		break;
    836	}
    837
    838	return ret;
    839}
    840
    841SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
    842{
    843	long ret;
    844
    845	ret = do_arch_prctl_64(current, option, arg2);
    846	if (ret == -EINVAL)
    847		ret = do_arch_prctl_common(option, arg2);
    848
    849	return ret;
    850}
    851
    852#ifdef CONFIG_IA32_EMULATION
    853COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
    854{
    855	return do_arch_prctl_common(option, arg2);
    856}
    857#endif
    858
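/*
 * Report the task's saved user stack pointer (this is what shows up as
 * kstkesp in /proc/<pid>/stat, for example).
 */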
    859unsigned long KSTK_ESP(struct task_struct *task)
    860{
    861	return task_pt_regs(task)->sp;
    862}