cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

alternative.c (41130B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2#define pr_fmt(fmt) "SMP alternatives: " fmt
      3
      4#include <linux/module.h>
      5#include <linux/sched.h>
      6#include <linux/perf_event.h>
      7#include <linux/mutex.h>
      8#include <linux/list.h>
      9#include <linux/stringify.h>
     10#include <linux/highmem.h>
     11#include <linux/mm.h>
     12#include <linux/vmalloc.h>
     13#include <linux/memory.h>
     14#include <linux/stop_machine.h>
     15#include <linux/slab.h>
     16#include <linux/kdebug.h>
     17#include <linux/kprobes.h>
     18#include <linux/mmu_context.h>
     19#include <linux/bsearch.h>
     20#include <linux/sync_core.h>
     21#include <asm/text-patching.h>
     22#include <asm/alternative.h>
     23#include <asm/sections.h>
     24#include <asm/mce.h>
     25#include <asm/nmi.h>
     26#include <asm/cacheflush.h>
     27#include <asm/tlbflush.h>
     28#include <asm/insn.h>
     29#include <asm/io.h>
     30#include <asm/fixmap.h>
     31#include <asm/paravirt.h>
     32#include <asm/asm-prototypes.h>
     33
     34int __read_mostly alternatives_patched;
     35
     36EXPORT_SYMBOL_GPL(alternatives_patched);
     37
     38#define MAX_PATCH_LEN (255-1)
     39
     40static int __initdata_or_module debug_alternative;
     41
     42static int __init debug_alt(char *str)
     43{
     44	debug_alternative = 1;
     45	return 1;
     46}
     47__setup("debug-alternative", debug_alt);
     48
     49static int noreplace_smp;
     50
     51static int __init setup_noreplace_smp(char *str)
     52{
     53	noreplace_smp = 1;
     54	return 1;
     55}
     56__setup("noreplace-smp", setup_noreplace_smp);
     57
     58#define DPRINTK(fmt, args...)						\
     59do {									\
     60	if (debug_alternative)						\
     61		printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args);		\
     62} while (0)
     63
     64#define DUMP_BYTES(buf, len, fmt, args...)				\
     65do {									\
     66	if (unlikely(debug_alternative)) {				\
     67		int j;							\
     68									\
     69		if (!(len))						\
     70			break;						\
     71									\
     72		printk(KERN_DEBUG pr_fmt(fmt), ##args);			\
     73		for (j = 0; j < (len) - 1; j++)				\
     74			printk(KERN_CONT "%02hhx ", buf[j]);		\
     75		printk(KERN_CONT "%02hhx\n", buf[j]);			\
     76	}								\
     77} while (0)
     78
     79static const unsigned char x86nops[] =
     80{
     81	BYTES_NOP1,
     82	BYTES_NOP2,
     83	BYTES_NOP3,
     84	BYTES_NOP4,
     85	BYTES_NOP5,
     86	BYTES_NOP6,
     87	BYTES_NOP7,
     88	BYTES_NOP8,
     89};
     90
     91const unsigned char * const x86_nops[ASM_NOP_MAX+1] =
     92{
     93	NULL,
     94	x86nops,
     95	x86nops + 1,
     96	x86nops + 1 + 2,
     97	x86nops + 1 + 2 + 3,
     98	x86nops + 1 + 2 + 3 + 4,
     99	x86nops + 1 + 2 + 3 + 4 + 5,
    100	x86nops + 1 + 2 + 3 + 4 + 5 + 6,
    101	x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
    102};
    103
    104/* Use this to add nops to a buffer, then text_poke the whole buffer. */
    105static void __init_or_module add_nops(void *insns, unsigned int len)
    106{
    107	while (len > 0) {
    108		unsigned int noplen = len;
    109		if (noplen > ASM_NOP_MAX)
    110			noplen = ASM_NOP_MAX;
    111		memcpy(insns, x86_nops[noplen], noplen);
    112		insns += noplen;
    113		len -= noplen;
    114	}
    115}
    116
    117extern s32 __retpoline_sites[], __retpoline_sites_end[];
    118extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[];
    119extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
    120extern s32 __smp_locks[], __smp_locks_end[];
    121void text_poke_early(void *addr, const void *opcode, size_t len);
    122
    123/*
    124 * Are we looking at a near JMP with a 1 or 4-byte displacement.
    125 */
    126static inline bool is_jmp(const u8 opcode)
    127{
    128	return opcode == 0xeb || opcode == 0xe9;
    129}
    130
    131static void __init_or_module
    132recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insn_buff)
    133{
    134	u8 *next_rip, *tgt_rip;
    135	s32 n_dspl, o_dspl;
    136	int repl_len;
    137
    138	if (a->replacementlen != 5)
    139		return;
    140
    141	o_dspl = *(s32 *)(insn_buff + 1);
    142
    143	/* next_rip of the replacement JMP */
    144	next_rip = repl_insn + a->replacementlen;
    145	/* target rip of the replacement JMP */
    146	tgt_rip  = next_rip + o_dspl;
    147	n_dspl = tgt_rip - orig_insn;
    148
    149	DPRINTK("target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl);
    150
    151	if (tgt_rip - orig_insn >= 0) {
    152		if (n_dspl - 2 <= 127)
    153			goto two_byte_jmp;
    154		else
    155			goto five_byte_jmp;
    156	/* negative offset */
    157	} else {
    158		if (((n_dspl - 2) & 0xff) == (n_dspl - 2))
    159			goto two_byte_jmp;
    160		else
    161			goto five_byte_jmp;
    162	}
    163
    164two_byte_jmp:
    165	n_dspl -= 2;
    166
    167	insn_buff[0] = 0xeb;
    168	insn_buff[1] = (s8)n_dspl;
    169	add_nops(insn_buff + 2, 3);
    170
    171	repl_len = 2;
    172	goto done;
    173
    174five_byte_jmp:
    175	n_dspl -= 5;
    176
    177	insn_buff[0] = 0xe9;
    178	*(s32 *)&insn_buff[1] = n_dspl;
    179
    180	repl_len = 5;
    181
    182done:
    183
    184	DPRINTK("final displ: 0x%08x, JMP 0x%lx",
    185		n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
    186}
    187
    188/*
    189 * optimize_nops_range() - Optimize a sequence of single byte NOPs (0x90)
    190 *
    191 * @instr: instruction byte stream
    192 * @instrlen: length of the above
    193 * @off: offset within @instr where the first NOP has been detected
    194 *
    195 * Return: number of NOPs found (and replaced).
    196 */
    197static __always_inline int optimize_nops_range(u8 *instr, u8 instrlen, int off)
    198{
    199	unsigned long flags;
    200	int i = off, nnops;
    201
    202	while (i < instrlen) {
    203		if (instr[i] != 0x90)
    204			break;
    205
    206		i++;
    207	}
    208
    209	nnops = i - off;
    210
    211	if (nnops <= 1)
    212		return nnops;
    213
    214	local_irq_save(flags);
    215	add_nops(instr + off, nnops);
    216	local_irq_restore(flags);
    217
    218	DUMP_BYTES(instr, instrlen, "%px: [%d:%d) optimized NOPs: ", instr, off, i);
    219
    220	return nnops;
    221}
    222
    223/*
    224 * "noinline" to cause control flow change and thus invalidate I$ and
    225 * cause refetch after modification.
    226 */
    227static void __init_or_module noinline optimize_nops(u8 *instr, size_t len)
    228{
    229	struct insn insn;
    230	int i = 0;
    231
    232	/*
    233	 * Jump over the non-NOP insns and optimize single-byte NOPs into bigger
    234	 * ones.
    235	 */
    236	for (;;) {
    237		if (insn_decode_kernel(&insn, &instr[i]))
    238			return;
    239
    240		/*
    241		 * See if this and any potentially following NOPs can be
    242		 * optimized.
    243		 */
    244		if (insn.length == 1 && insn.opcode.bytes[0] == 0x90)
    245			i += optimize_nops_range(instr, len, i);
    246		else
    247			i += insn.length;
    248
    249		if (i >= len)
    250			return;
    251	}
    252}
    253
    254/*
    255 * Replace instructions with better alternatives for this CPU type. This runs
    256 * before SMP is initialized to avoid SMP problems with self modifying code.
    257 * This implies that asymmetric systems where APs have less capabilities than
    258 * the boot processor are not handled. Tough. Make sure you disable such
    259 * features by hand.
    260 *
    261 * Marked "noinline" to cause control flow change and thus insn cache
    262 * to refetch changed I$ lines.
    263 */
    264void __init_or_module noinline apply_alternatives(struct alt_instr *start,
    265						  struct alt_instr *end)
    266{
    267	struct alt_instr *a;
    268	u8 *instr, *replacement;
    269	u8 insn_buff[MAX_PATCH_LEN];
    270
    271	DPRINTK("alt table %px, -> %px", start, end);
    272	/*
    273	 * The scan order should be from start to end. A later scanned
    274	 * alternative code can overwrite previously scanned alternative code.
    275	 * Some kernel functions (e.g. memcpy, memset, etc) use this order to
    276	 * patch code.
    277	 *
    278	 * So be careful if you want to change the scan order to any other
    279	 * order.
    280	 */
    281	for (a = start; a < end; a++) {
    282		int insn_buff_sz = 0;
    283		/* Mask away "NOT" flag bit for feature to test. */
    284		u16 feature = a->cpuid & ~ALTINSTR_FLAG_INV;
    285
    286		instr = (u8 *)&a->instr_offset + a->instr_offset;
    287		replacement = (u8 *)&a->repl_offset + a->repl_offset;
    288		BUG_ON(a->instrlen > sizeof(insn_buff));
    289		BUG_ON(feature >= (NCAPINTS + NBUGINTS) * 32);
    290
    291		/*
    292		 * Patch if either:
    293		 * - feature is present
    294		 * - feature not present but ALTINSTR_FLAG_INV is set to mean,
    295		 *   patch if feature is *NOT* present.
    296		 */
    297		if (!boot_cpu_has(feature) == !(a->cpuid & ALTINSTR_FLAG_INV))
    298			goto next;
    299
    300		DPRINTK("feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)",
    301			(a->cpuid & ALTINSTR_FLAG_INV) ? "!" : "",
    302			feature >> 5,
    303			feature & 0x1f,
    304			instr, instr, a->instrlen,
    305			replacement, a->replacementlen);
    306
    307		DUMP_BYTES(instr, a->instrlen, "%px:   old_insn: ", instr);
    308		DUMP_BYTES(replacement, a->replacementlen, "%px:   rpl_insn: ", replacement);
    309
    310		memcpy(insn_buff, replacement, a->replacementlen);
    311		insn_buff_sz = a->replacementlen;
    312
    313		/*
    314		 * 0xe8 is a relative jump; fix the offset.
    315		 *
    316		 * Instruction length is checked before the opcode to avoid
    317		 * accessing uninitialized bytes for zero-length replacements.
    318		 */
    319		if (a->replacementlen == 5 && *insn_buff == 0xe8) {
    320			*(s32 *)(insn_buff + 1) += replacement - instr;
    321			DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
    322				*(s32 *)(insn_buff + 1),
    323				(unsigned long)instr + *(s32 *)(insn_buff + 1) + 5);
    324		}
    325
    326		if (a->replacementlen && is_jmp(replacement[0]))
    327			recompute_jump(a, instr, replacement, insn_buff);
    328
    329		for (; insn_buff_sz < a->instrlen; insn_buff_sz++)
    330			insn_buff[insn_buff_sz] = 0x90;
    331
    332		DUMP_BYTES(insn_buff, insn_buff_sz, "%px: final_insn: ", instr);
    333
    334		text_poke_early(instr, insn_buff, insn_buff_sz);
    335
    336next:
    337		optimize_nops(instr, a->instrlen);
    338	}
    339}
    340
    341#if defined(CONFIG_RETPOLINE) && defined(CONFIG_OBJTOOL)
    342
    343/*
    344 * CALL/JMP *%\reg
    345 */
    346static int emit_indirect(int op, int reg, u8 *bytes)
    347{
    348	int i = 0;
    349	u8 modrm;
    350
    351	switch (op) {
    352	case CALL_INSN_OPCODE:
    353		modrm = 0x10; /* Reg = 2; CALL r/m */
    354		break;
    355
    356	case JMP32_INSN_OPCODE:
    357		modrm = 0x20; /* Reg = 4; JMP r/m */
    358		break;
    359
    360	default:
    361		WARN_ON_ONCE(1);
    362		return -1;
    363	}
    364
    365	if (reg >= 8) {
    366		bytes[i++] = 0x41; /* REX.B prefix */
    367		reg -= 8;
    368	}
    369
    370	modrm |= 0xc0; /* Mod = 3 */
    371	modrm += reg;
    372
    373	bytes[i++] = 0xff; /* opcode */
    374	bytes[i++] = modrm;
    375
    376	return i;
    377}
    378
    379/*
    380 * Rewrite the compiler generated retpoline thunk calls.
    381 *
    382 * For spectre_v2=off (!X86_FEATURE_RETPOLINE), rewrite them into immediate
    383 * indirect instructions, avoiding the extra indirection.
    384 *
    385 * For example, convert:
    386 *
    387 *   CALL __x86_indirect_thunk_\reg
    388 *
    389 * into:
    390 *
    391 *   CALL *%\reg
    392 *
    393 * It also tries to inline spectre_v2=retpoline,lfence when size permits.
    394 */
    395static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
    396{
    397	retpoline_thunk_t *target;
    398	int reg, ret, i = 0;
    399	u8 op, cc;
    400
    401	target = addr + insn->length + insn->immediate.value;
    402	reg = target - __x86_indirect_thunk_array;
    403
    404	if (WARN_ON_ONCE(reg & ~0xf))
    405		return -1;
    406
    407	/* If anyone ever does: CALL/JMP *%rsp, we're in deep trouble. */
    408	BUG_ON(reg == 4);
    409
    410	if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) &&
    411	    !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE))
    412		return -1;
    413
    414	op = insn->opcode.bytes[0];
    415
    416	/*
    417	 * Convert:
    418	 *
    419	 *   Jcc.d32 __x86_indirect_thunk_\reg
    420	 *
    421	 * into:
    422	 *
    423	 *   Jncc.d8 1f
    424	 *   [ LFENCE ]
    425	 *   JMP *%\reg
    426	 *   [ NOP ]
    427	 * 1:
    428	 */
    429	/* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */
    430	if (op == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80) {
    431		cc = insn->opcode.bytes[1] & 0xf;
    432		cc ^= 1; /* invert condition */
    433
    434		bytes[i++] = 0x70 + cc;        /* Jcc.d8 */
    435		bytes[i++] = insn->length - 2; /* sizeof(Jcc.d8) == 2 */
    436
    437		/* Continue as if: JMP.d32 __x86_indirect_thunk_\reg */
    438		op = JMP32_INSN_OPCODE;
    439	}
    440
    441	/*
    442	 * For RETPOLINE_LFENCE: prepend the indirect CALL/JMP with an LFENCE.
    443	 */
    444	if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
    445		bytes[i++] = 0x0f;
    446		bytes[i++] = 0xae;
    447		bytes[i++] = 0xe8; /* LFENCE */
    448	}
    449
    450	ret = emit_indirect(op, reg, bytes + i);
    451	if (ret < 0)
    452		return ret;
    453	i += ret;
    454
    455	for (; i < insn->length;)
    456		bytes[i++] = BYTES_NOP1;
    457
    458	return i;
    459}
    460
    461/*
    462 * Generated by 'objtool --retpoline'.
    463 */
    464void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
    465{
    466	s32 *s;
    467
    468	for (s = start; s < end; s++) {
    469		void *addr = (void *)s + *s;
    470		struct insn insn;
    471		int len, ret;
    472		u8 bytes[16];
    473		u8 op1, op2;
    474
    475		ret = insn_decode_kernel(&insn, addr);
    476		if (WARN_ON_ONCE(ret < 0))
    477			continue;
    478
    479		op1 = insn.opcode.bytes[0];
    480		op2 = insn.opcode.bytes[1];
    481
    482		switch (op1) {
    483		case CALL_INSN_OPCODE:
    484		case JMP32_INSN_OPCODE:
    485			break;
    486
    487		case 0x0f: /* escape */
    488			if (op2 >= 0x80 && op2 <= 0x8f)
    489				break;
    490			fallthrough;
    491		default:
    492			WARN_ON_ONCE(1);
    493			continue;
    494		}
    495
    496		DPRINTK("retpoline at: %pS (%px) len: %d to: %pS",
    497			addr, addr, insn.length,
    498			addr + insn.length + insn.immediate.value);
    499
    500		len = patch_retpoline(addr, &insn, bytes);
    501		if (len == insn.length) {
    502			optimize_nops(bytes, len);
    503			DUMP_BYTES(((u8*)addr),  len, "%px: orig: ", addr);
    504			DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr);
    505			text_poke_early(addr, bytes, len);
    506		}
    507	}
    508}
    509
    510#else /* !CONFIG_RETPOLINE || !CONFIG_OBJTOOL */
    511
    512void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { }
    513
    514#endif /* CONFIG_RETPOLINE && CONFIG_OBJTOOL */
    515
    516#ifdef CONFIG_X86_KERNEL_IBT
    517
    518/*
    519 * Generated by: objtool --ibt
    520 */
    521void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end)
    522{
    523	s32 *s;
    524
    525	for (s = start; s < end; s++) {
    526		u32 endbr, poison = gen_endbr_poison();
    527		void *addr = (void *)s + *s;
    528
    529		if (WARN_ON_ONCE(get_kernel_nofault(endbr, addr)))
    530			continue;
    531
    532		if (WARN_ON_ONCE(!is_endbr(endbr)))
    533			continue;
    534
    535		DPRINTK("ENDBR at: %pS (%px)", addr, addr);
    536
    537		/*
    538		 * When we have IBT, the lack of ENDBR will trigger #CP
    539		 */
    540		DUMP_BYTES(((u8*)addr), 4, "%px: orig: ", addr);
    541		DUMP_BYTES(((u8*)&poison), 4, "%px: repl: ", addr);
    542		text_poke_early(addr, &poison, 4);
    543	}
    544}
    545
    546#else
    547
    548void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end) { }
    549
    550#endif /* CONFIG_X86_KERNEL_IBT */
    551
    552#ifdef CONFIG_SMP
    553static void alternatives_smp_lock(const s32 *start, const s32 *end,
    554				  u8 *text, u8 *text_end)
    555{
    556	const s32 *poff;
    557
    558	for (poff = start; poff < end; poff++) {
    559		u8 *ptr = (u8 *)poff + *poff;
    560
    561		if (!*poff || ptr < text || ptr >= text_end)
    562			continue;
    563		/* turn DS segment override prefix into lock prefix */
    564		if (*ptr == 0x3e)
    565			text_poke(ptr, ((unsigned char []){0xf0}), 1);
    566	}
    567}
    568
    569static void alternatives_smp_unlock(const s32 *start, const s32 *end,
    570				    u8 *text, u8 *text_end)
    571{
    572	const s32 *poff;
    573
    574	for (poff = start; poff < end; poff++) {
    575		u8 *ptr = (u8 *)poff + *poff;
    576
    577		if (!*poff || ptr < text || ptr >= text_end)
    578			continue;
    579		/* turn lock prefix into DS segment override prefix */
    580		if (*ptr == 0xf0)
    581			text_poke(ptr, ((unsigned char []){0x3E}), 1);
    582	}
    583}
    584
    585struct smp_alt_module {
    586	/* what is this ??? */
    587	struct module	*mod;
    588	char		*name;
    589
    590	/* ptrs to lock prefixes */
    591	const s32	*locks;
    592	const s32	*locks_end;
    593
    594	/* .text segment, needed to avoid patching init code ;) */
    595	u8		*text;
    596	u8		*text_end;
    597
    598	struct list_head next;
    599};
    600static LIST_HEAD(smp_alt_modules);
    601static bool uniproc_patched = false;	/* protected by text_mutex */
    602
    603void __init_or_module alternatives_smp_module_add(struct module *mod,
    604						  char *name,
    605						  void *locks, void *locks_end,
    606						  void *text,  void *text_end)
    607{
    608	struct smp_alt_module *smp;
    609
    610	mutex_lock(&text_mutex);
    611	if (!uniproc_patched)
    612		goto unlock;
    613
    614	if (num_possible_cpus() == 1)
    615		/* Don't bother remembering, we'll never have to undo it. */
    616		goto smp_unlock;
    617
    618	smp = kzalloc(sizeof(*smp), GFP_KERNEL);
    619	if (NULL == smp)
    620		/* we'll run the (safe but slow) SMP code then ... */
    621		goto unlock;
    622
    623	smp->mod	= mod;
    624	smp->name	= name;
    625	smp->locks	= locks;
    626	smp->locks_end	= locks_end;
    627	smp->text	= text;
    628	smp->text_end	= text_end;
    629	DPRINTK("locks %p -> %p, text %p -> %p, name %s\n",
    630		smp->locks, smp->locks_end,
    631		smp->text, smp->text_end, smp->name);
    632
    633	list_add_tail(&smp->next, &smp_alt_modules);
    634smp_unlock:
    635	alternatives_smp_unlock(locks, locks_end, text, text_end);
    636unlock:
    637	mutex_unlock(&text_mutex);
    638}
    639
    640void __init_or_module alternatives_smp_module_del(struct module *mod)
    641{
    642	struct smp_alt_module *item;
    643
    644	mutex_lock(&text_mutex);
    645	list_for_each_entry(item, &smp_alt_modules, next) {
    646		if (mod != item->mod)
    647			continue;
    648		list_del(&item->next);
    649		kfree(item);
    650		break;
    651	}
    652	mutex_unlock(&text_mutex);
    653}
    654
    655void alternatives_enable_smp(void)
    656{
    657	struct smp_alt_module *mod;
    658
    659	/* Why bother if there are no other CPUs? */
    660	BUG_ON(num_possible_cpus() == 1);
    661
    662	mutex_lock(&text_mutex);
    663
    664	if (uniproc_patched) {
    665		pr_info("switching to SMP code\n");
    666		BUG_ON(num_online_cpus() != 1);
    667		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
    668		clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
    669		list_for_each_entry(mod, &smp_alt_modules, next)
    670			alternatives_smp_lock(mod->locks, mod->locks_end,
    671					      mod->text, mod->text_end);
    672		uniproc_patched = false;
    673	}
    674	mutex_unlock(&text_mutex);
    675}
    676
    677/*
    678 * Return 1 if the address range is reserved for SMP-alternatives.
    679 * Must hold text_mutex.
    680 */
    681int alternatives_text_reserved(void *start, void *end)
    682{
    683	struct smp_alt_module *mod;
    684	const s32 *poff;
    685	u8 *text_start = start;
    686	u8 *text_end = end;
    687
    688	lockdep_assert_held(&text_mutex);
    689
    690	list_for_each_entry(mod, &smp_alt_modules, next) {
    691		if (mod->text > text_end || mod->text_end < text_start)
    692			continue;
    693		for (poff = mod->locks; poff < mod->locks_end; poff++) {
    694			const u8 *ptr = (const u8 *)poff + *poff;
    695
    696			if (text_start <= ptr && text_end > ptr)
    697				return 1;
    698		}
    699	}
    700
    701	return 0;
    702}
    703#endif /* CONFIG_SMP */
    704
    705#ifdef CONFIG_PARAVIRT
    706void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
    707				     struct paravirt_patch_site *end)
    708{
    709	struct paravirt_patch_site *p;
    710	char insn_buff[MAX_PATCH_LEN];
    711
    712	for (p = start; p < end; p++) {
    713		unsigned int used;
    714
    715		BUG_ON(p->len > MAX_PATCH_LEN);
    716		/* prep the buffer with the original instructions */
    717		memcpy(insn_buff, p->instr, p->len);
    718		used = paravirt_patch(p->type, insn_buff, (unsigned long)p->instr, p->len);
    719
    720		BUG_ON(used > p->len);
    721
    722		/* Pad the rest with nops */
    723		add_nops(insn_buff + used, p->len - used);
    724		text_poke_early(p->instr, insn_buff, p->len);
    725	}
    726}
    727extern struct paravirt_patch_site __start_parainstructions[],
    728	__stop_parainstructions[];
    729#endif	/* CONFIG_PARAVIRT */
    730
    731/*
    732 * Self-test for the INT3 based CALL emulation code.
    733 *
    734 * This exercises int3_emulate_call() to make sure INT3 pt_regs are set up
    735 * properly and that there is a stack gap between the INT3 frame and the
    736 * previous context. Without this gap doing a virtual PUSH on the interrupted
    737 * stack would corrupt the INT3 IRET frame.
    738 *
    739 * See entry_{32,64}.S for more details.
    740 */
    741
    742/*
    743 * We define the int3_magic() function in assembly to control the calling
    744 * convention such that we can 'call' it from assembly.
    745 */
    746
/*
 * int3_magic() - target of the emulated CALL in the INT3 self-test.
 *
 * Stores 1 to the unsigned int addressed by the register named by _ASM_ARG1
 * and returns. NOTE(review): _ASM_ARG1 is presumably the first argument
 * register of the calling convention -- confirm against its definition.
 */
extern void int3_magic(unsigned int *ptr); /* defined in asm */

asm (
/* Emitted into .init.text. */
"	.pushsection	.init.text, \"ax\", @progbits\n"
"	.type		int3_magic, @function\n"
"int3_magic:\n"
	ANNOTATE_NOENDBR
"	movl	$1, (%" _ASM_ARG1 ")\n"
	ASM_RET
"	.size		int3_magic, .-int3_magic\n"
"	.popsection\n"
);
    759
    760extern void int3_selftest_ip(void); /* defined in asm below */
    761
    762static int __init
    763int3_exception_notify(struct notifier_block *self, unsigned long val, void *data)
    764{
    765	unsigned long selftest = (unsigned long)&int3_selftest_ip;
    766	struct die_args *args = data;
    767	struct pt_regs *regs = args->regs;
    768
    769	OPTIMIZER_HIDE_VAR(selftest);
    770
    771	if (!regs || user_mode(regs))
    772		return NOTIFY_DONE;
    773
    774	if (val != DIE_INT3)
    775		return NOTIFY_DONE;
    776
    777	if (regs->ip - INT3_INSN_SIZE != selftest)
    778		return NOTIFY_DONE;
    779
    780	int3_emulate_call(regs, (unsigned long)&int3_magic);
    781	return NOTIFY_STOP;
    782}
    783
    784/* Must be noinline to ensure uniqueness of int3_selftest_ip. */
    785static noinline void __init int3_selftest(void)
    786{
    787	static __initdata struct notifier_block int3_exception_nb = {
    788		.notifier_call	= int3_exception_notify,
    789		.priority	= INT_MAX-1, /* last */
    790	};
    791	unsigned int val = 0;
    792
    793	BUG_ON(register_die_notifier(&int3_exception_nb));
    794
    795	/*
    796	 * Basically: int3_magic(&val); but really complicated :-)
    797	 *
    798	 * INT3 padded with NOP to CALL_INSN_SIZE. The int3_exception_nb
    799	 * notifier above will emulate CALL for us.
    800	 */
    801	asm volatile ("int3_selftest_ip:\n\t"
    802		      ANNOTATE_NOENDBR
    803		      "    int3; nop; nop; nop; nop\n\t"
    804		      : ASM_CALL_CONSTRAINT
    805		      : __ASM_SEL_RAW(a, D) (&val)
    806		      : "memory");
    807
    808	BUG_ON(val != 1);
    809
    810	unregister_die_notifier(&int3_exception_nb);
    811}
    812
    813void __init alternative_instructions(void)
    814{
    815	int3_selftest();
    816
    817	/*
    818	 * The patching is not fully atomic, so try to avoid local
    819	 * interruptions that might execute the to be patched code.
    820	 * Other CPUs are not running.
    821	 */
    822	stop_nmi();
    823
    824	/*
    825	 * Don't stop machine check exceptions while patching.
    826	 * MCEs only happen when something got corrupted and in this
    827	 * case we must do something about the corruption.
    828	 * Ignoring it is worse than an unlikely patching race.
    829	 * Also machine checks tend to be broadcast and if one CPU
    830	 * goes into machine check the others follow quickly, so we don't
    831	 * expect a machine check to cause undue problems during to code
    832	 * patching.
    833	 */
    834
    835	/*
    836	 * Paravirt patching and alternative patching can be combined to
    837	 * replace a function call with a short direct code sequence (e.g.
    838	 * by setting a constant return value instead of doing that in an
    839	 * external function).
    840	 * In order to make this work the following sequence is required:
    841	 * 1. set (artificial) features depending on used paravirt
    842	 *    functions which can later influence alternative patching
    843	 * 2. apply paravirt patching (generally replacing an indirect
    844	 *    function call with a direct one)
    845	 * 3. apply alternative patching (e.g. replacing a direct function
    846	 *    call with a custom code sequence)
    847	 * Doing paravirt patching after alternative patching would clobber
    848	 * the optimization of the custom code with a function call again.
    849	 */
    850	paravirt_set_cap();
    851
    852	/*
    853	 * First patch paravirt functions, such that we overwrite the indirect
    854	 * call with the direct call.
    855	 */
    856	apply_paravirt(__parainstructions, __parainstructions_end);
    857
    858	/*
    859	 * Rewrite the retpolines, must be done before alternatives since
    860	 * those can rewrite the retpoline thunks.
    861	 */
    862	apply_retpolines(__retpoline_sites, __retpoline_sites_end);
    863
    864	/*
    865	 * Then patch alternatives, such that those paravirt calls that are in
    866	 * alternatives can be overwritten by their immediate fragments.
    867	 */
    868	apply_alternatives(__alt_instructions, __alt_instructions_end);
    869
    870	apply_ibt_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end);
    871
    872#ifdef CONFIG_SMP
    873	/* Patch to UP if other cpus not imminent. */
    874	if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) {
    875		uniproc_patched = true;
    876		alternatives_smp_module_add(NULL, "core kernel",
    877					    __smp_locks, __smp_locks_end,
    878					    _text, _etext);
    879	}
    880
    881	if (!uniproc_patched || num_possible_cpus() == 1) {
    882		free_init_pages("SMP alternatives",
    883				(unsigned long)__smp_locks,
    884				(unsigned long)__smp_locks_end);
    885	}
    886#endif
    887
    888	restart_nmi();
    889	alternatives_patched = 1;
    890}
    891
    892/**
    893 * text_poke_early - Update instructions on a live kernel at boot time
    894 * @addr: address to modify
    895 * @opcode: source of the copy
    896 * @len: length to copy
    897 *
    898 * When you use this code to patch more than one byte of an instruction
    899 * you need to make sure that other CPUs cannot execute this code in parallel.
    900 * Also no thread must be currently preempted in the middle of these
    901 * instructions. And on the local CPU you need to be protected against NMI or
    902 * MCE handlers seeing an inconsistent instruction while you patch.
    903 */
    904void __init_or_module text_poke_early(void *addr, const void *opcode,
    905				      size_t len)
    906{
    907	unsigned long flags;
    908
    909	if (boot_cpu_has(X86_FEATURE_NX) &&
    910	    is_module_text_address((unsigned long)addr)) {
    911		/*
    912		 * Modules text is marked initially as non-executable, so the
    913		 * code cannot be running and speculative code-fetches are
    914		 * prevented. Just change the code.
    915		 */
    916		memcpy(addr, opcode, len);
    917	} else {
    918		local_irq_save(flags);
    919		memcpy(addr, opcode, len);
    920		local_irq_restore(flags);
    921		sync_core();
    922
    923		/*
    924		 * Could also do a CLFLUSH here to speed up CPU recovery; but
    925		 * that causes hangs on some VIA CPUs.
    926		 */
    927	}
    928}
    929
    930typedef struct {
    931	struct mm_struct *mm;
    932} temp_mm_state_t;
    933
    934/*
    935 * Using a temporary mm allows to set temporary mappings that are not accessible
    936 * by other CPUs. Such mappings are needed to perform sensitive memory writes
    937 * that override the kernel memory protections (e.g., W^X), without exposing the
    938 * temporary page-table mappings that are required for these write operations to
    939 * other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the
    940 * mapping is torn down.
    941 *
    942 * Context: The temporary mm needs to be used exclusively by a single core. To
    943 *          harden security IRQs must be disabled while the temporary mm is
    944 *          loaded, thereby preventing interrupt handler bugs from overriding
    945 *          the kernel memory protection.
    946 */
    947static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
    948{
    949	temp_mm_state_t temp_state;
    950
    951	lockdep_assert_irqs_disabled();
    952
    953	/*
    954	 * Make sure not to be in TLB lazy mode, as otherwise we'll end up
    955	 * with a stale address space WITHOUT being in lazy mode after
    956	 * restoring the previous mm.
    957	 */
    958	if (this_cpu_read(cpu_tlbstate_shared.is_lazy))
    959		leave_mm(smp_processor_id());
    960
    961	temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm);
    962	switch_mm_irqs_off(NULL, mm, current);
    963
    964	/*
    965	 * If breakpoints are enabled, disable them while the temporary mm is
    966	 * used. Userspace might set up watchpoints on addresses that are used
    967	 * in the temporary mm, which would lead to wrong signals being sent or
    968	 * crashes.
    969	 *
    970	 * Note that breakpoints are not disabled selectively, which also causes
    971	 * kernel breakpoints (e.g., perf's) to be disabled. This might be
    972	 * undesirable, but still seems reasonable as the code that runs in the
    973	 * temporary mm should be short.
    974	 */
    975	if (hw_breakpoint_active())
    976		hw_breakpoint_disable();
    977
    978	return temp_state;
    979}
    980
    981static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
    982{
    983	lockdep_assert_irqs_disabled();
    984	switch_mm_irqs_off(NULL, prev_state.mm, current);
    985
    986	/*
    987	 * Restore the breakpoints if they were disabled before the temporary mm
    988	 * was loaded.
    989	 */
    990	if (hw_breakpoint_active())
    991		hw_breakpoint_restore();
    992}
    993
    994__ro_after_init struct mm_struct *poking_mm;
    995__ro_after_init unsigned long poking_addr;
    996
    997static void text_poke_memcpy(void *dst, const void *src, size_t len)
    998{
    999	memcpy(dst, src, len);
   1000}
   1001
   1002static void text_poke_memset(void *dst, const void *src, size_t len)
   1003{
   1004	int c = *(const int *)src;
   1005
   1006	memset(dst, c, len);
   1007}
   1008
   1009typedef void text_poke_f(void *dst, const void *src, size_t len);
   1010
   1011static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t len)
   1012{
   1013	bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
   1014	struct page *pages[2] = {NULL};
   1015	temp_mm_state_t prev;
   1016	unsigned long flags;
   1017	pte_t pte, *ptep;
   1018	spinlock_t *ptl;
   1019	pgprot_t pgprot;
   1020
   1021	/*
   1022	 * While boot memory allocator is running we cannot use struct pages as
   1023	 * they are not yet initialized. There is no way to recover.
   1024	 */
   1025	BUG_ON(!after_bootmem);
   1026
   1027	if (!core_kernel_text((unsigned long)addr)) {
   1028		pages[0] = vmalloc_to_page(addr);
   1029		if (cross_page_boundary)
   1030			pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
   1031	} else {
   1032		pages[0] = virt_to_page(addr);
   1033		WARN_ON(!PageReserved(pages[0]));
   1034		if (cross_page_boundary)
   1035			pages[1] = virt_to_page(addr + PAGE_SIZE);
   1036	}
   1037	/*
   1038	 * If something went wrong, crash and burn since recovery paths are not
   1039	 * implemented.
   1040	 */
   1041	BUG_ON(!pages[0] || (cross_page_boundary && !pages[1]));
   1042
   1043	/*
   1044	 * Map the page without the global bit, as TLB flushing is done with
   1045	 * flush_tlb_mm_range(), which is intended for non-global PTEs.
   1046	 */
   1047	pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL);
   1048
   1049	/*
   1050	 * The lock is not really needed, but this allows to avoid open-coding.
   1051	 */
   1052	ptep = get_locked_pte(poking_mm, poking_addr, &ptl);
   1053
   1054	/*
   1055	 * This must not fail; preallocated in poking_init().
   1056	 */
   1057	VM_BUG_ON(!ptep);
   1058
   1059	local_irq_save(flags);
   1060
   1061	pte = mk_pte(pages[0], pgprot);
   1062	set_pte_at(poking_mm, poking_addr, ptep, pte);
   1063
   1064	if (cross_page_boundary) {
   1065		pte = mk_pte(pages[1], pgprot);
   1066		set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte);
   1067	}
   1068
   1069	/*
   1070	 * Loading the temporary mm behaves as a compiler barrier, which
   1071	 * guarantees that the PTE will be set at the time memcpy() is done.
   1072	 */
   1073	prev = use_temporary_mm(poking_mm);
   1074
   1075	kasan_disable_current();
   1076	func((u8 *)poking_addr + offset_in_page(addr), src, len);
   1077	kasan_enable_current();
   1078
   1079	/*
   1080	 * Ensure that the PTE is only cleared after the instructions of memcpy
   1081	 * were issued by using a compiler barrier.
   1082	 */
   1083	barrier();
   1084
   1085	pte_clear(poking_mm, poking_addr, ptep);
   1086	if (cross_page_boundary)
   1087		pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1);
   1088
   1089	/*
   1090	 * Loading the previous page-table hierarchy requires a serializing
   1091	 * instruction that already allows the core to see the updated version.
   1092	 * Xen-PV is assumed to serialize execution in a similar manner.
   1093	 */
   1094	unuse_temporary_mm(prev);
   1095
   1096	/*
   1097	 * Flushing the TLB might involve IPIs, which would require enabled
   1098	 * IRQs, but not if the mm is not used, as it is in this point.
   1099	 */
   1100	flush_tlb_mm_range(poking_mm, poking_addr, poking_addr +
   1101			   (cross_page_boundary ? 2 : 1) * PAGE_SIZE,
   1102			   PAGE_SHIFT, false);
   1103
   1104	if (func == text_poke_memcpy) {
   1105		/*
   1106		 * If the text does not match what we just wrote then something is
   1107		 * fundamentally screwy; there's nothing we can really do about that.
   1108		 */
   1109		BUG_ON(memcmp(addr, src, len));
   1110	}
   1111
   1112	local_irq_restore(flags);
   1113	pte_unmap_unlock(ptep, ptl);
   1114	return addr;
   1115}
   1116
   1117/**
   1118 * text_poke - Update instructions on a live kernel
   1119 * @addr: address to modify
   1120 * @opcode: source of the copy
   1121 * @len: length to copy
   1122 *
   1123 * Only atomic text poke/set should be allowed when not doing early patching.
   1124 * It means the size must be writable atomically and the address must be aligned
   1125 * in a way that permits an atomic write. It also makes sure we fit on a single
   1126 * page.
   1127 *
   1128 * Note that the caller must ensure that if the modified code is part of a
   1129 * module, the module would not be removed during poking. This can be achieved
   1130 * by registering a module notifier, and ordering module removal and patching
   1131 * trough a mutex.
   1132 */
   1133void *text_poke(void *addr, const void *opcode, size_t len)
   1134{
   1135	lockdep_assert_held(&text_mutex);
   1136
   1137	return __text_poke(text_poke_memcpy, addr, opcode, len);
   1138}
   1139
   1140/**
   1141 * text_poke_kgdb - Update instructions on a live kernel by kgdb
   1142 * @addr: address to modify
   1143 * @opcode: source of the copy
   1144 * @len: length to copy
   1145 *
   1146 * Only atomic text poke/set should be allowed when not doing early patching.
   1147 * It means the size must be writable atomically and the address must be aligned
   1148 * in a way that permits an atomic write. It also makes sure we fit on a single
   1149 * page.
   1150 *
   1151 * Context: should only be used by kgdb, which ensures no other core is running,
   1152 *	    despite the fact it does not hold the text_mutex.
   1153 */
   1154void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
   1155{
   1156	return __text_poke(text_poke_memcpy, addr, opcode, len);
   1157}
   1158
   1159/**
   1160 * text_poke_copy - Copy instructions into (an unused part of) RX memory
   1161 * @addr: address to modify
   1162 * @opcode: source of the copy
   1163 * @len: length to copy, could be more than 2x PAGE_SIZE
   1164 *
   1165 * Not safe against concurrent execution; useful for JITs to dump
   1166 * new code blocks into unused regions of RX memory. Can be used in
   1167 * conjunction with synchronize_rcu_tasks() to wait for existing
   1168 * execution to quiesce after having made sure no existing functions
   1169 * pointers are live.
   1170 */
   1171void *text_poke_copy(void *addr, const void *opcode, size_t len)
   1172{
   1173	unsigned long start = (unsigned long)addr;
   1174	size_t patched = 0;
   1175
   1176	if (WARN_ON_ONCE(core_kernel_text(start)))
   1177		return NULL;
   1178
   1179	mutex_lock(&text_mutex);
   1180	while (patched < len) {
   1181		unsigned long ptr = start + patched;
   1182		size_t s;
   1183
   1184		s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);
   1185
   1186		__text_poke(text_poke_memcpy, (void *)ptr, opcode + patched, s);
   1187		patched += s;
   1188	}
   1189	mutex_unlock(&text_mutex);
   1190	return addr;
   1191}
   1192
   1193/**
   1194 * text_poke_set - memset into (an unused part of) RX memory
   1195 * @addr: address to modify
   1196 * @c: the byte to fill the area with
   1197 * @len: length to copy, could be more than 2x PAGE_SIZE
   1198 *
   1199 * This is useful to overwrite unused regions of RX memory with illegal
   1200 * instructions.
   1201 */
   1202void *text_poke_set(void *addr, int c, size_t len)
   1203{
   1204	unsigned long start = (unsigned long)addr;
   1205	size_t patched = 0;
   1206
   1207	if (WARN_ON_ONCE(core_kernel_text(start)))
   1208		return NULL;
   1209
   1210	mutex_lock(&text_mutex);
   1211	while (patched < len) {
   1212		unsigned long ptr = start + patched;
   1213		size_t s;
   1214
   1215		s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);
   1216
   1217		__text_poke(text_poke_memset, (void *)ptr, (void *)&c, s);
   1218		patched += s;
   1219	}
   1220	mutex_unlock(&text_mutex);
   1221	return addr;
   1222}
   1223
   1224static void do_sync_core(void *info)
   1225{
   1226	sync_core();
   1227}
   1228
   1229void text_poke_sync(void)
   1230{
   1231	on_each_cpu(do_sync_core, NULL, 1);
   1232}
   1233
   1234struct text_poke_loc {
   1235	/* addr := _stext + rel_addr */
   1236	s32 rel_addr;
   1237	s32 disp;
   1238	u8 len;
   1239	u8 opcode;
   1240	const u8 text[POKE_MAX_OPCODE_SIZE];
   1241	/* see text_poke_bp_batch() */
   1242	u8 old;
   1243};
   1244
   1245struct bp_patching_desc {
   1246	struct text_poke_loc *vec;
   1247	int nr_entries;
   1248	atomic_t refs;
   1249};
   1250
   1251static struct bp_patching_desc *bp_desc;
   1252
   1253static __always_inline
   1254struct bp_patching_desc *try_get_desc(struct bp_patching_desc **descp)
   1255{
   1256	/* rcu_dereference */
   1257	struct bp_patching_desc *desc = __READ_ONCE(*descp);
   1258
   1259	if (!desc || !arch_atomic_inc_not_zero(&desc->refs))
   1260		return NULL;
   1261
   1262	return desc;
   1263}
   1264
   1265static __always_inline void put_desc(struct bp_patching_desc *desc)
   1266{
   1267	smp_mb__before_atomic();
   1268	arch_atomic_dec(&desc->refs);
   1269}
   1270
   1271static __always_inline void *text_poke_addr(struct text_poke_loc *tp)
   1272{
   1273	return _stext + tp->rel_addr;
   1274}
   1275
   1276static __always_inline int patch_cmp(const void *key, const void *elt)
   1277{
   1278	struct text_poke_loc *tp = (struct text_poke_loc *) elt;
   1279
   1280	if (key < text_poke_addr(tp))
   1281		return -1;
   1282	if (key > text_poke_addr(tp))
   1283		return 1;
   1284	return 0;
   1285}
   1286
   1287noinstr int poke_int3_handler(struct pt_regs *regs)
   1288{
   1289	struct bp_patching_desc *desc;
   1290	struct text_poke_loc *tp;
   1291	int ret = 0;
   1292	void *ip;
   1293
   1294	if (user_mode(regs))
   1295		return 0;
   1296
   1297	/*
   1298	 * Having observed our INT3 instruction, we now must observe
   1299	 * bp_desc:
   1300	 *
   1301	 *	bp_desc = desc			INT3
   1302	 *	WMB				RMB
   1303	 *	write INT3			if (desc)
   1304	 */
   1305	smp_rmb();
   1306
   1307	desc = try_get_desc(&bp_desc);
   1308	if (!desc)
   1309		return 0;
   1310
   1311	/*
   1312	 * Discount the INT3. See text_poke_bp_batch().
   1313	 */
   1314	ip = (void *) regs->ip - INT3_INSN_SIZE;
   1315
   1316	/*
   1317	 * Skip the binary search if there is a single member in the vector.
   1318	 */
   1319	if (unlikely(desc->nr_entries > 1)) {
   1320		tp = __inline_bsearch(ip, desc->vec, desc->nr_entries,
   1321				      sizeof(struct text_poke_loc),
   1322				      patch_cmp);
   1323		if (!tp)
   1324			goto out_put;
   1325	} else {
   1326		tp = desc->vec;
   1327		if (text_poke_addr(tp) != ip)
   1328			goto out_put;
   1329	}
   1330
   1331	ip += tp->len;
   1332
   1333	switch (tp->opcode) {
   1334	case INT3_INSN_OPCODE:
   1335		/*
   1336		 * Someone poked an explicit INT3, they'll want to handle it,
   1337		 * do not consume.
   1338		 */
   1339		goto out_put;
   1340
   1341	case RET_INSN_OPCODE:
   1342		int3_emulate_ret(regs);
   1343		break;
   1344
   1345	case CALL_INSN_OPCODE:
   1346		int3_emulate_call(regs, (long)ip + tp->disp);
   1347		break;
   1348
   1349	case JMP32_INSN_OPCODE:
   1350	case JMP8_INSN_OPCODE:
   1351		int3_emulate_jmp(regs, (long)ip + tp->disp);
   1352		break;
   1353
   1354	default:
   1355		BUG();
   1356	}
   1357
   1358	ret = 1;
   1359
   1360out_put:
   1361	put_desc(desc);
   1362	return ret;
   1363}
   1364
   1365#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc))
   1366static struct text_poke_loc tp_vec[TP_VEC_MAX];
   1367static int tp_vec_nr;
   1368
   1369/**
   1370 * text_poke_bp_batch() -- update instructions on live kernel on SMP
   1371 * @tp:			vector of instructions to patch
   1372 * @nr_entries:		number of entries in the vector
   1373 *
   1374 * Modify multi-byte instruction by using int3 breakpoint on SMP.
   1375 * We completely avoid stop_machine() here, and achieve the
   1376 * synchronization using int3 breakpoint.
   1377 *
   1378 * The way it is done:
   1379 *	- For each entry in the vector:
   1380 *		- add a int3 trap to the address that will be patched
   1381 *	- sync cores
   1382 *	- For each entry in the vector:
   1383 *		- update all but the first byte of the patched range
   1384 *	- sync cores
   1385 *	- For each entry in the vector:
   1386 *		- replace the first byte (int3) by the first byte of
   1387 *		  replacing opcode
   1388 *	- sync cores
   1389 */
   1390static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
   1391{
   1392	struct bp_patching_desc desc = {
   1393		.vec = tp,
   1394		.nr_entries = nr_entries,
   1395		.refs = ATOMIC_INIT(1),
   1396	};
   1397	unsigned char int3 = INT3_INSN_OPCODE;
   1398	unsigned int i;
   1399	int do_sync;
   1400
   1401	lockdep_assert_held(&text_mutex);
   1402
   1403	smp_store_release(&bp_desc, &desc); /* rcu_assign_pointer */
   1404
   1405	/*
   1406	 * Corresponding read barrier in int3 notifier for making sure the
   1407	 * nr_entries and handler are correctly ordered wrt. patching.
   1408	 */
   1409	smp_wmb();
   1410
   1411	/*
   1412	 * First step: add a int3 trap to the address that will be patched.
   1413	 */
   1414	for (i = 0; i < nr_entries; i++) {
   1415		tp[i].old = *(u8 *)text_poke_addr(&tp[i]);
   1416		text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE);
   1417	}
   1418
   1419	text_poke_sync();
   1420
   1421	/*
   1422	 * Second step: update all but the first byte of the patched range.
   1423	 */
   1424	for (do_sync = 0, i = 0; i < nr_entries; i++) {
   1425		u8 old[POKE_MAX_OPCODE_SIZE] = { tp[i].old, };
   1426		int len = tp[i].len;
   1427
   1428		if (len - INT3_INSN_SIZE > 0) {
   1429			memcpy(old + INT3_INSN_SIZE,
   1430			       text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
   1431			       len - INT3_INSN_SIZE);
   1432			text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
   1433				  (const char *)tp[i].text + INT3_INSN_SIZE,
   1434				  len - INT3_INSN_SIZE);
   1435			do_sync++;
   1436		}
   1437
   1438		/*
   1439		 * Emit a perf event to record the text poke, primarily to
   1440		 * support Intel PT decoding which must walk the executable code
   1441		 * to reconstruct the trace. The flow up to here is:
   1442		 *   - write INT3 byte
   1443		 *   - IPI-SYNC
   1444		 *   - write instruction tail
   1445		 * At this point the actual control flow will be through the
   1446		 * INT3 and handler and not hit the old or new instruction.
   1447		 * Intel PT outputs FUP/TIP packets for the INT3, so the flow
   1448		 * can still be decoded. Subsequently:
   1449		 *   - emit RECORD_TEXT_POKE with the new instruction
   1450		 *   - IPI-SYNC
   1451		 *   - write first byte
   1452		 *   - IPI-SYNC
   1453		 * So before the text poke event timestamp, the decoder will see
   1454		 * either the old instruction flow or FUP/TIP of INT3. After the
   1455		 * text poke event timestamp, the decoder will see either the
   1456		 * new instruction flow or FUP/TIP of INT3. Thus decoders can
   1457		 * use the timestamp as the point at which to modify the
   1458		 * executable code.
   1459		 * The old instruction is recorded so that the event can be
   1460		 * processed forwards or backwards.
   1461		 */
   1462		perf_event_text_poke(text_poke_addr(&tp[i]), old, len,
   1463				     tp[i].text, len);
   1464	}
   1465
   1466	if (do_sync) {
   1467		/*
   1468		 * According to Intel, this core syncing is very likely
   1469		 * not necessary and we'd be safe even without it. But
   1470		 * better safe than sorry (plus there's not only Intel).
   1471		 */
   1472		text_poke_sync();
   1473	}
   1474
   1475	/*
   1476	 * Third step: replace the first byte (int3) by the first byte of
   1477	 * replacing opcode.
   1478	 */
   1479	for (do_sync = 0, i = 0; i < nr_entries; i++) {
   1480		if (tp[i].text[0] == INT3_INSN_OPCODE)
   1481			continue;
   1482
   1483		text_poke(text_poke_addr(&tp[i]), tp[i].text, INT3_INSN_SIZE);
   1484		do_sync++;
   1485	}
   1486
   1487	if (do_sync)
   1488		text_poke_sync();
   1489
   1490	/*
   1491	 * Remove and synchronize_rcu(), except we have a very primitive
   1492	 * refcount based completion.
   1493	 */
   1494	WRITE_ONCE(bp_desc, NULL); /* RCU_INIT_POINTER */
   1495	if (!atomic_dec_and_test(&desc.refs))
   1496		atomic_cond_read_acquire(&desc.refs, !VAL);
   1497}
   1498
   1499static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
   1500			       const void *opcode, size_t len, const void *emulate)
   1501{
   1502	struct insn insn;
   1503	int ret, i;
   1504
   1505	memcpy((void *)tp->text, opcode, len);
   1506	if (!emulate)
   1507		emulate = opcode;
   1508
   1509	ret = insn_decode_kernel(&insn, emulate);
   1510	BUG_ON(ret < 0);
   1511
   1512	tp->rel_addr = addr - (void *)_stext;
   1513	tp->len = len;
   1514	tp->opcode = insn.opcode.bytes[0];
   1515
   1516	switch (tp->opcode) {
   1517	case RET_INSN_OPCODE:
   1518	case JMP32_INSN_OPCODE:
   1519	case JMP8_INSN_OPCODE:
   1520		/*
   1521		 * Control flow instructions without implied execution of the
   1522		 * next instruction can be padded with INT3.
   1523		 */
   1524		for (i = insn.length; i < len; i++)
   1525			BUG_ON(tp->text[i] != INT3_INSN_OPCODE);
   1526		break;
   1527
   1528	default:
   1529		BUG_ON(len != insn.length);
   1530	};
   1531
   1532
   1533	switch (tp->opcode) {
   1534	case INT3_INSN_OPCODE:
   1535	case RET_INSN_OPCODE:
   1536		break;
   1537
   1538	case CALL_INSN_OPCODE:
   1539	case JMP32_INSN_OPCODE:
   1540	case JMP8_INSN_OPCODE:
   1541		tp->disp = insn.immediate.value;
   1542		break;
   1543
   1544	default: /* assume NOP */
   1545		switch (len) {
   1546		case 2: /* NOP2 -- emulate as JMP8+0 */
   1547			BUG_ON(memcmp(emulate, x86_nops[len], len));
   1548			tp->opcode = JMP8_INSN_OPCODE;
   1549			tp->disp = 0;
   1550			break;
   1551
   1552		case 5: /* NOP5 -- emulate as JMP32+0 */
   1553			BUG_ON(memcmp(emulate, x86_nops[len], len));
   1554			tp->opcode = JMP32_INSN_OPCODE;
   1555			tp->disp = 0;
   1556			break;
   1557
   1558		default: /* unknown instruction */
   1559			BUG();
   1560		}
   1561		break;
   1562	}
   1563}
   1564
   1565/*
   1566 * We hard rely on the tp_vec being ordered; ensure this is so by flushing
   1567 * early if needed.
   1568 */
   1569static bool tp_order_fail(void *addr)
   1570{
   1571	struct text_poke_loc *tp;
   1572
   1573	if (!tp_vec_nr)
   1574		return false;
   1575
   1576	if (!addr) /* force */
   1577		return true;
   1578
   1579	tp = &tp_vec[tp_vec_nr - 1];
   1580	if ((unsigned long)text_poke_addr(tp) > (unsigned long)addr)
   1581		return true;
   1582
   1583	return false;
   1584}
   1585
   1586static void text_poke_flush(void *addr)
   1587{
   1588	if (tp_vec_nr == TP_VEC_MAX || tp_order_fail(addr)) {
   1589		text_poke_bp_batch(tp_vec, tp_vec_nr);
   1590		tp_vec_nr = 0;
   1591	}
   1592}
   1593
   1594void text_poke_finish(void)
   1595{
   1596	text_poke_flush(NULL);
   1597}
   1598
   1599void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate)
   1600{
   1601	struct text_poke_loc *tp;
   1602
   1603	if (unlikely(system_state == SYSTEM_BOOTING)) {
   1604		text_poke_early(addr, opcode, len);
   1605		return;
   1606	}
   1607
   1608	text_poke_flush(addr);
   1609
   1610	tp = &tp_vec[tp_vec_nr++];
   1611	text_poke_loc_init(tp, addr, opcode, len, emulate);
   1612}
   1613
   1614/**
   1615 * text_poke_bp() -- update instructions on live kernel on SMP
   1616 * @addr:	address to patch
   1617 * @opcode:	opcode of new instruction
   1618 * @len:	length to copy
   1619 * @emulate:	instruction to be emulated
   1620 *
   1621 * Update a single instruction with the vector in the stack, avoiding
   1622 * dynamically allocated memory. This function should be used when it is
   1623 * not possible to allocate memory.
   1624 */
   1625void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
   1626{
   1627	struct text_poke_loc tp;
   1628
   1629	if (unlikely(system_state == SYSTEM_BOOTING)) {
   1630		text_poke_early(addr, opcode, len);
   1631		return;
   1632	}
   1633
   1634	text_poke_loc_init(&tp, addr, opcode, len, emulate);
   1635	text_poke_bp_batch(&tp, 1);
   1636}