cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

common.c (8518B)


// SPDX-License-Identifier: GPL-2.0-only
/*
 * common.c - C code for kernel entry and exit
 * Copyright (c) 2015 Andrew Lutomirski
 *
 * Based on asm and ptrace code by many authors.  The code here originated
 * in ptrace.c and signal.c.
 */

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <linux/entry-common.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/errno.h>
#include <linux/ptrace.h>
#include <linux/export.h>
#include <linux/nospec.h>
#include <linux/syscalls.h>
#include <linux/uaccess.h>

#ifdef CONFIG_XEN_PV
#include <xen/xen-ops.h>
#include <xen/events.h>
#endif

#include <asm/desc.h>
#include <asm/traps.h>
#include <asm/vdso.h>
#include <asm/cpufeature.h>
#include <asm/fpu/api.h>
#include <asm/nospec-branch.h>
#include <asm/io_bitmap.h>
#include <asm/syscall.h>
#include <asm/irq_stack.h>

#ifdef CONFIG_X86_64

static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
{
	/*
	 * Convert negative numbers to very high and thus out of range
	 * numbers for comparisons.
	 */
	unsigned int unr = nr;

	if (likely(unr < NR_syscalls)) {
		unr = array_index_nospec(unr, NR_syscalls);
		regs->ax = sys_call_table[unr](regs);
		return true;
	}
	return false;
}
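
/*
 * Worked example of the check above: a skipped syscall (nr == -1, as set
 * by ptrace or seccomp) becomes 0xffffffff as an unsigned int, fails the
 * unr < NR_syscalls test and never indexes the table.  array_index_nospec()
 * additionally clamps the index under speculative execution, so a
 * mispredicted bounds check cannot read past the end of sys_call_table.
 */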

static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
{
	/*
	 * Adjust the starting offset of the table, and convert numbers
	 * < __X32_SYSCALL_BIT to very high and thus out of range
	 * numbers for comparisons.
	 */
	unsigned int xnr = nr - __X32_SYSCALL_BIT;

	if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) {
		xnr = array_index_nospec(xnr, X32_NR_syscalls);
		regs->ax = x32_sys_call_table[xnr](regs);
		return true;
	}
	return false;
}
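
/*
 * Note: x32 system calls are numbered starting at __X32_SYSCALL_BIT
 * (0x40000000), so subtracting it yields a plain index into
 * x32_sys_call_table, while native 64-bit numbers wrap around to huge
 * values and fail the range check above.
 */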

__visible noinstr void do_syscall_64(struct pt_regs *regs, int nr)
{
	add_random_kstack_offset();
	nr = syscall_enter_from_user_mode(regs, nr);

	instrumentation_begin();

	if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) {
		/* Invalid system call, but still a system call. */
		regs->ax = __x64_sys_ni_syscall(regs);
	}

	instrumentation_end();
	syscall_exit_to_user_mode(regs);
}
#endif
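
/*
 * For reference: do_syscall_64() is reached from entry_SYSCALL_64 in
 * entry_64.S.  The syscall number arrives in %rax (preserved as
 * regs->orig_ax), the arguments in %rdi, %rsi, %rdx, %r10, %r8 and %r9
 * per the x86-64 syscall ABI, and regs->ax carries the return value back
 * to user space.
 */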

#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
static __always_inline int syscall_32_enter(struct pt_regs *regs)
{
	if (IS_ENABLED(CONFIG_IA32_EMULATION))
		current_thread_info()->status |= TS_COMPAT;

	return (int)regs->orig_ax;
}

/*
 * Invoke a 32-bit syscall.  Called with IRQs on in CONTEXT_KERNEL.
 */
static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr)
{
	/*
	 * Convert negative numbers to very high and thus out of range
	 * numbers for comparisons.
	 */
	unsigned int unr = nr;

	if (likely(unr < IA32_NR_syscalls)) {
		unr = array_index_nospec(unr, IA32_NR_syscalls);
		regs->ax = ia32_sys_call_table[unr](regs);
	} else if (nr != -1) {
		regs->ax = __ia32_sys_ni_syscall(regs);
	}
}

/* Handles int $0x80 */
__visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
{
	int nr = syscall_32_enter(regs);

	add_random_kstack_offset();
	/*
	 * Subtlety here: if ptrace pokes something larger than 2^31-1 into
	 * orig_ax, the int return value truncates it. This matches
	 * the semantics of syscall_get_nr().
	 */
	nr = syscall_enter_from_user_mode(regs, nr);
	instrumentation_begin();

	do_syscall_32_irqs_on(regs, nr);

	instrumentation_end();
	syscall_exit_to_user_mode(regs);
}
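
/*
 * Background for the EBP fetch below: the vDSO's __kernel_vsyscall
 * trampoline pushes the caller's %ebp (the sixth syscall argument in the
 * 32-bit ABI) onto the user stack and repurposes %ebp to carry the user
 * stack pointer, which SYSENTER does not preserve (see do_SYSENTER_32()
 * below).  The saved %ebp is therefore re-read from user memory so the
 * syscall sees all six arguments.
 */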

static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
{
	int nr = syscall_32_enter(regs);
	int res;

	add_random_kstack_offset();
	/*
	 * This cannot use syscall_enter_from_user_mode() as it has to
	 * fetch EBP before invoking any of the syscall entry work
	 * functions.
	 */
	syscall_enter_from_user_mode_prepare(regs);

	instrumentation_begin();
	/* Fetch EBP from where the vDSO stashed it. */
	if (IS_ENABLED(CONFIG_X86_64)) {
		/*
		 * Micro-optimization: the pointer we're following is
		 * explicitly 32 bits, so it can't be out of range.
		 */
		res = __get_user(*(u32 *)&regs->bp,
			 (u32 __user __force *)(unsigned long)(u32)regs->sp);
	} else {
		res = get_user(*(u32 *)&regs->bp,
		       (u32 __user __force *)(unsigned long)(u32)regs->sp);
	}

	if (res) {
		/* User code screwed up. */
		regs->ax = -EFAULT;

		local_irq_disable();
		instrumentation_end();
		irqentry_exit_to_user_mode(regs);
		return false;
	}

	nr = syscall_enter_from_user_mode_work(regs, nr);

	/* Now this is just like a normal syscall. */
	do_syscall_32_irqs_on(regs, nr);

	instrumentation_end();
	syscall_exit_to_user_mode(regs);
	return true;
}

/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
__visible noinstr long do_fast_syscall_32(struct pt_regs *regs)
{
	/*
	 * Called using the internal vDSO SYSENTER/SYSCALL32 calling
	 * convention.  Adjust regs so it looks like we entered using int80.
	 */
	unsigned long landing_pad = (unsigned long)current->mm->context.vdso +
					vdso_image_32.sym_int80_landing_pad;

	/*
	 * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward
	 * so that 'regs->ip -= 2' lands back on an int $0x80 instruction.
	 * Fix it up.
	 */
	regs->ip = landing_pad;

	/* Invoke the syscall. If it failed, keep it simple: use IRET. */
	if (!__do_fast_syscall_32(regs))
		return 0;

#ifdef CONFIG_X86_64
	/*
	 * Opportunistic SYSRETL: if possible, try to return using SYSRETL.
	 * SYSRETL is available on all 64-bit CPUs, so we don't need to
	 * bother with SYSEXIT.
	 *
	 * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
	 * because the ECX fixup above will ensure that this is essentially
	 * never the case.
	 */
	return regs->cs == __USER32_CS && regs->ss == __USER_DS &&
		regs->ip == landing_pad &&
		(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0;
#else
	/*
	 * Opportunistic SYSEXIT: if possible, try to return using SYSEXIT.
	 *
	 * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
	 * because the ECX fixup above will ensure that this is essentially
	 * never the case.
	 *
	 * We don't allow syscalls at all from VM86 mode, but we still
	 * need to check VM, because we might be returning from sys_vm86.
	 */
	return static_cpu_has(X86_FEATURE_SEP) &&
		regs->cs == __USER_CS && regs->ss == __USER_DS &&
		regs->ip == landing_pad &&
		(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0;
#endif
}

/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
__visible noinstr long do_SYSENTER_32(struct pt_regs *regs)
{
	/* SYSENTER loses RSP, but the vDSO saved it in RBP. */
	regs->sp = regs->bp;

	/* SYSENTER clobbers EFLAGS.IF.  Assume it was set in usermode. */
	regs->flags |= X86_EFLAGS_IF;

	return do_fast_syscall_32(regs);
}
#endif

SYSCALL_DEFINE0(ni_syscall)
{
	return -ENOSYS;
}

#ifdef CONFIG_XEN_PV
#ifndef CONFIG_PREEMPTION
/*
 * Some hypercalls issued by the toolstack can take many 10s of
 * seconds. Allow tasks running hypercalls via the privcmd driver to
 * be voluntarily preempted even if full kernel preemption is
 * disabled.
 *
 * Such preemptible hypercalls are bracketed by
 * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end()
 * calls.
 */
DEFINE_PER_CPU(bool, xen_in_preemptible_hcall);
EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall);

/*
 * In case of scheduling the flag must be cleared and restored after
 * returning from schedule as the task might move to a different CPU.
 */
static __always_inline bool get_and_clear_inhcall(void)
{
	bool inhcall = __this_cpu_read(xen_in_preemptible_hcall);

	__this_cpu_write(xen_in_preemptible_hcall, false);
	return inhcall;
}

static __always_inline void restore_inhcall(bool inhcall)
{
	__this_cpu_write(xen_in_preemptible_hcall, inhcall);
}
#else
static __always_inline bool get_and_clear_inhcall(void) { return false; }
static __always_inline void restore_inhcall(bool inhcall) { }
#endif
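
/*
 * Illustrative sketch of the bracketing described above, roughly what the
 * privcmd driver does around a long-running hypercall:
 *
 *	xen_preemptible_hcall_begin();
 *	ret = privcmd_call(hcall, a1, a2, a3, a4, a5);
 *	xen_preemptible_hcall_end();
 *
 * While the per-CPU flag is set, xen_pv_evtchn_do_upcall() below may call
 * irqentry_exit_cond_resched() and voluntarily reschedule the task.
 */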

static void __xen_pv_evtchn_do_upcall(struct pt_regs *regs)
{
	struct pt_regs *old_regs = set_irq_regs(regs);

	inc_irq_stat(irq_hv_callback_count);

	xen_hvm_evtchn_do_upcall();

	set_irq_regs(old_regs);
}

__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
{
	irqentry_state_t state = irqentry_enter(regs);
	bool inhcall;

	instrumentation_begin();
	run_sysvec_on_irqstack_cond(__xen_pv_evtchn_do_upcall, regs);

	inhcall = get_and_clear_inhcall();
	if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) {
		irqentry_exit_cond_resched();
		instrumentation_end();
		restore_inhcall(inhcall);
	} else {
		instrumentation_end();
		irqentry_exit(regs, state);
	}
}
#endif /* CONFIG_XEN_PV */