opt.c (15273B)
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Kernel Probes Jump Optimization (Optprobes)
 *
 * Copyright (C) IBM Corporation, 2002, 2004
 * Copyright (C) Hitachi Ltd., 2012
 */
#include <linux/kprobes.h>
#include <linux/perf_event.h>
#include <linux/ptrace.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/hardirq.h>
#include <linux/preempt.h>
#include <linux/extable.h>
#include <linux/kdebug.h>
#include <linux/kallsyms.h>
#include <linux/ftrace.h>
#include <linux/objtool.h>
#include <linux/pgtable.h>
#include <linux/static_call.h>

#include <asm/text-patching.h>
#include <asm/cacheflush.h>
#include <asm/desc.h>
#include <linux/uaccess.h>
#include <asm/alternative.h>
#include <asm/insn.h>
#include <asm/debugreg.h>
#include <asm/set_memory.h>
#include <asm/sections.h>
#include <asm/nospec-branch.h>

#include "common.h"

unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr)
{
	struct optimized_kprobe *op;
	struct kprobe *kp;
	long offs;
	int i;

	for (i = 0; i < JMP32_INSN_SIZE; i++) {
		kp = get_kprobe((void *)addr - i);
		/* This function only handles jump-optimized kprobe */
		if (kp && kprobe_optimized(kp)) {
			op = container_of(kp, struct optimized_kprobe, kp);
			/* If op->list is not empty, op is under optimizing */
			if (list_empty(&op->list))
				goto found;
		}
	}

	return addr;
found:
	/*
	 * If the kprobe is jump-optimized, the original bytes may have been
	 * overwritten by the jump destination address. In this case, the
	 * original bytes must be recovered from the op->optinsn.copied_insn
	 * buffer.
	 */
	if (copy_from_kernel_nofault(buf, (void *)addr,
				     MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
		return 0UL;

	if (addr == (unsigned long)kp->addr) {
		buf[0] = kp->opcode;
		memcpy(buf + 1, op->optinsn.copied_insn, DISP32_SIZE);
	} else {
		offs = addr - (unsigned long)kp->addr - 1;
		memcpy(buf, op->optinsn.copied_insn + offs, DISP32_SIZE - offs);
	}

	return (unsigned long)buf;
}

static void synthesize_clac(kprobe_opcode_t *addr)
{
	/*
	 * Can't be static_cpu_has() due to how objtool treats this feature bit.
	 * This isn't a fast path anyway.
	 */
	if (!boot_cpu_has(X86_FEATURE_SMAP))
		return;

	/* Replace the NOP3 with CLAC */
	addr[0] = 0x0f;
	addr[1] = 0x01;
	addr[2] = 0xca;
}

/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */
static void synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val)
{
#ifdef CONFIG_X86_64
	*addr++ = 0x48;
	*addr++ = 0xbf;
#else
	*addr++ = 0xb8;
#endif
	*(unsigned long *)addr = val;
}

asm (
	".pushsection .rodata\n"
	"optprobe_template_func:\n"
	".global optprobe_template_entry\n"
	"optprobe_template_entry:\n"
#ifdef CONFIG_X86_64
	"	pushq $" __stringify(__KERNEL_DS) "\n"
	/* Save the 'sp - 8', this will be fixed later. */
	"	pushq %rsp\n"
	"	pushfq\n"
	".global optprobe_template_clac\n"
	"optprobe_template_clac:\n"
	ASM_NOP3
	SAVE_REGS_STRING
	"	movq %rsp, %rsi\n"
	".global optprobe_template_val\n"
	"optprobe_template_val:\n"
	ASM_NOP5
	ASM_NOP5
	".global optprobe_template_call\n"
	"optprobe_template_call:\n"
	ASM_NOP5
	/* Copy 'regs->flags' into 'regs->ss'. */
	"	movq 18*8(%rsp), %rdx\n"
	"	movq %rdx, 20*8(%rsp)\n"
	RESTORE_REGS_STRING
	/* Skip 'regs->flags' and 'regs->sp'. */
	"	addq $16, %rsp\n"
	/* And pop flags register from 'regs->ss'. */
	"	popfq\n"
#else /* CONFIG_X86_32 */
	"	pushl %ss\n"
	/* Save the 'sp - 4', this will be fixed later. */
	"	pushl %esp\n"
	"	pushfl\n"
	".global optprobe_template_clac\n"
	"optprobe_template_clac:\n"
	ASM_NOP3
	SAVE_REGS_STRING
	"	movl %esp, %edx\n"
	".global optprobe_template_val\n"
	"optprobe_template_val:\n"
	ASM_NOP5
	".global optprobe_template_call\n"
	"optprobe_template_call:\n"
	ASM_NOP5
	/* Copy 'regs->flags' into 'regs->ss'. */
	"	movl 14*4(%esp), %edx\n"
	"	movl %edx, 16*4(%esp)\n"
	RESTORE_REGS_STRING
	/* Skip 'regs->flags' and 'regs->sp'. */
	"	addl $8, %esp\n"
	/* And pop flags register from 'regs->ss'. */
	"	popfl\n"
#endif
	".global optprobe_template_end\n"
	"optprobe_template_end:\n"
	".popsection\n");

void optprobe_template_func(void);
STACK_FRAME_NON_STANDARD(optprobe_template_func);

#define TMPL_CLAC_IDX \
	((long)optprobe_template_clac - (long)optprobe_template_entry)
#define TMPL_MOVE_IDX \
	((long)optprobe_template_val - (long)optprobe_template_entry)
#define TMPL_CALL_IDX \
	((long)optprobe_template_call - (long)optprobe_template_entry)
#define TMPL_END_IDX \
	((long)optprobe_template_end - (long)optprobe_template_entry)

/* Optimized kprobe call back function: called from optinsn */
static void
optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs)
{
	/* This is possible if op is under delayed unoptimizing */
	if (kprobe_disabled(&op->kp))
		return;

	preempt_disable();
	if (kprobe_running()) {
		kprobes_inc_nmissed_count(&op->kp);
	} else {
		struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
		/* Adjust stack pointer */
		regs->sp += sizeof(long);
		/* Save skipped registers */
		regs->cs = __KERNEL_CS;
#ifdef CONFIG_X86_32
		regs->gs = 0;
#endif
		regs->ip = (unsigned long)op->kp.addr + INT3_INSN_SIZE;
		regs->orig_ax = ~0UL;

		__this_cpu_write(current_kprobe, &op->kp);
		kcb->kprobe_status = KPROBE_HIT_ACTIVE;
		opt_pre_handler(&op->kp, regs);
		__this_cpu_write(current_kprobe, NULL);
	}
	preempt_enable();
}
NOKPROBE_SYMBOL(optimized_callback);

static int copy_optimized_instructions(u8 *dest, u8 *src, u8 *real)
{
	struct insn insn;
	int len = 0, ret;

	while (len < JMP32_INSN_SIZE) {
		ret = __copy_instruction(dest + len, src + len, real + len, &insn);
		if (!ret || !can_boost(&insn, src + len))
			return -EINVAL;
		len += ret;
	}
	/* Check whether the address range is reserved */
	if (ftrace_text_reserved(src, src + len - 1) ||
	    alternatives_text_reserved(src, src + len - 1) ||
	    jump_label_text_reserved(src, src + len - 1) ||
	    static_call_text_reserved(src, src + len - 1))
		return -EBUSY;

	return len;
}

/* Check whether insn is indirect jump */
static int __insn_is_indirect_jump(struct insn *insn)
{
	return ((insn->opcode.bytes[0] == 0xff &&
		(X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */
		insn->opcode.bytes[0] == 0xea);	/* Segment based jump */
}

/* Check whether insn jumps into specified address range */
static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)
{
	unsigned long target = 0;

	switch (insn->opcode.bytes[0]) {
	case 0xe0:	/* loopne */
	case 0xe1:	/* loope */
	case 0xe2:	/* loop */
	case 0xe3:	/* jcxz */
	case 0xe9:	/* near relative jump */
	case 0xeb:	/* short relative jump */
		break;
	case 0x0f:
		if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */
			break;
		return 0;
	default:
		if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */
			break;
		return 0;
	}
	target = (unsigned long)insn->next_byte + insn->immediate.value;

	return (start <= target && target <= start + len);
}

static int insn_is_indirect_jump(struct insn *insn)
{
	int ret = __insn_is_indirect_jump(insn);

#ifdef CONFIG_RETPOLINE
	/*
	 * Jump to x86_indirect_thunk_* is treated as an indirect jump.
	 * Note that even with CONFIG_RETPOLINE=y, the kernel compiled with
	 * older gcc may use indirect jump. So we add this check instead of
	 * replace indirect-jump check.
	 */
	if (!ret)
		ret = insn_jump_into_range(insn,
				(unsigned long)__indirect_thunk_start,
				(unsigned long)__indirect_thunk_end -
				(unsigned long)__indirect_thunk_start);
#endif
	return ret;
}

static bool is_padding_int3(unsigned long addr, unsigned long eaddr)
{
	unsigned char ops;

	for (; addr < eaddr; addr++) {
		if (get_kernel_nofault(ops, (void *)addr) < 0 ||
		    ops != INT3_INSN_OPCODE)
			return false;
	}

	return true;
}

/* Decode the whole function to ensure no instruction jumps into the target */
static int can_optimize(unsigned long paddr)
{
	unsigned long addr, size = 0, offset = 0;
	struct insn insn;
	kprobe_opcode_t buf[MAX_INSN_SIZE];

	/* Lookup symbol including addr */
	if (!kallsyms_lookup_size_offset(paddr, &size, &offset))
		return 0;

	/*
	 * Do not optimize in the entry code due to the unstable
	 * stack handling and register setup.
	 */
	if (((paddr >= (unsigned long)__entry_text_start) &&
	     (paddr < (unsigned long)__entry_text_end)))
		return 0;

	/* Check there is enough space for a relative jump. */
	if (size - offset < JMP32_INSN_SIZE)
		return 0;

	/* Decode instructions */
	addr = paddr - offset;
	while (addr < paddr - offset + size) { /* Decode until function end */
		unsigned long recovered_insn;
		int ret;

		if (search_exception_tables(addr))
			/*
			 * Since some fixup code will jump into this function,
			 * we can't optimize kprobes in this function.
			 */
			return 0;
		recovered_insn = recover_probed_instruction(buf, addr);
		if (!recovered_insn)
			return 0;

		ret = insn_decode_kernel(&insn, (void *)recovered_insn);
		if (ret < 0)
			return 0;

		/*
		 * In the case of detecting an unknown breakpoint, this could
		 * be a padding INT3 between functions. Let's check that all
		 * the rest of the bytes are also INT3.
		 */
		if (insn.opcode.bytes[0] == INT3_INSN_OPCODE)
			return is_padding_int3(addr, paddr - offset + size) ? 1 : 0;

		/* Recover address */
		insn.kaddr = (void *)addr;
		insn.next_byte = (void *)(addr + insn.length);
		/* Check that no instruction jumps into the target */
		if (insn_is_indirect_jump(&insn) ||
		    insn_jump_into_range(&insn, paddr + INT3_INSN_SIZE,
					 DISP32_SIZE))
			return 0;
		addr += insn.length;
	}

	return 1;
}

/* Check optimized_kprobe can actually be optimized. */
int arch_check_optimized_kprobe(struct optimized_kprobe *op)
{
	int i;
	struct kprobe *p;

	for (i = 1; i < op->optinsn.size; i++) {
		p = get_kprobe(op->kp.addr + i);
		if (p && !kprobe_disabled(p))
			return -EEXIST;
	}

	return 0;
}

/* Check whether the addr is within the optimized instructions. */
int arch_within_optimized_kprobe(struct optimized_kprobe *op,
				 kprobe_opcode_t *addr)
{
	return (op->kp.addr <= addr &&
		op->kp.addr + op->optinsn.size > addr);
}

/* Free optimized instruction slot */
static
void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
{
	u8 *slot = op->optinsn.insn;
	if (slot) {
		int len = TMPL_END_IDX + op->optinsn.size + JMP32_INSN_SIZE;

		/* Record the perf event before freeing the slot */
		if (dirty)
			perf_event_text_poke(slot, slot, len, NULL, 0);

		free_optinsn_slot(slot, dirty);
		op->optinsn.insn = NULL;
		op->optinsn.size = 0;
	}
}

void arch_remove_optimized_kprobe(struct optimized_kprobe *op)
{
	__arch_remove_optimized_kprobe(op, 1);
}

/*
 * Copy the target instructions that will be replaced.
 * Target instructions MUST be relocatable (checked inside).
 * This is called when a new aggr(opt)probe is allocated or reused.
 */
int arch_prepare_optimized_kprobe(struct optimized_kprobe *op,
				  struct kprobe *__unused)
{
	u8 *buf = NULL, *slot;
	int ret, len;
	long rel;

	if (!can_optimize((unsigned long)op->kp.addr))
		return -EILSEQ;

	buf = kzalloc(MAX_OPTINSN_SIZE, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	op->optinsn.insn = slot = get_optinsn_slot();
	if (!slot) {
		ret = -ENOMEM;
		goto out;
	}

	/*
	 * Verify if the address gap is in 2GB range, because this uses
	 * a relative jump.
	 */
	rel = (long)slot - (long)op->kp.addr + JMP32_INSN_SIZE;
	if (abs(rel) > 0x7fffffff) {
		ret = -ERANGE;
		goto err;
	}

	/* Copy arch-dep-instance from template */
	memcpy(buf, optprobe_template_entry, TMPL_END_IDX);

	/* Copy instructions into the out-of-line buffer */
	ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr,
					  slot + TMPL_END_IDX);
	if (ret < 0)
		goto err;
	op->optinsn.size = ret;
	len = TMPL_END_IDX + op->optinsn.size;

	synthesize_clac(buf + TMPL_CLAC_IDX);

	/* Set probe information */
	synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op);

	/* Set probe function call */
	synthesize_relcall(buf + TMPL_CALL_IDX,
			   slot + TMPL_CALL_IDX, optimized_callback);

	/* Set returning jmp instruction at the tail of out-of-line buffer */
	synthesize_reljump(buf + len, slot + len,
			   (u8 *)op->kp.addr + op->optinsn.size);
	len += JMP32_INSN_SIZE;

	/*
	 * Note len = TMPL_END_IDX + op->optinsn.size + JMP32_INSN_SIZE is also
	 * used in __arch_remove_optimized_kprobe().
	 */

	/* We have to use text_poke() for instruction buffer because it is RO */
	perf_event_text_poke(slot, NULL, 0, buf, len);
	text_poke(slot, buf, len);

	ret = 0;
out:
	kfree(buf);
	return ret;

err:
	__arch_remove_optimized_kprobe(op, 0);
	goto out;
}

/*
 * Replace breakpoints (INT3) with relative jumps (JMP.d32).
 * The caller must hold kprobe_mutex and text_mutex.
 *
 * The caller will have installed a regular kprobe and after that issued
 * synchronize_rcu_tasks(); this ensures that the instruction(s) that live in
 * the 4 bytes after the INT3 are unused and can now be overwritten.
 */
void arch_optimize_kprobes(struct list_head *oplist)
{
	struct optimized_kprobe *op, *tmp;
	u8 insn_buff[JMP32_INSN_SIZE];

	list_for_each_entry_safe(op, tmp, oplist, list) {
		s32 rel = (s32)((long)op->optinsn.insn -
			((long)op->kp.addr + JMP32_INSN_SIZE));

		WARN_ON(kprobe_disabled(&op->kp));

		/* Backup instructions which will be replaced by jump address */
		memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_INSN_SIZE,
		       DISP32_SIZE);

		insn_buff[0] = JMP32_INSN_OPCODE;
		*(s32 *)(&insn_buff[1]) = rel;

		text_poke_bp(op->kp.addr, insn_buff, JMP32_INSN_SIZE, NULL);

		list_del_init(&op->list);
	}
}

/*
 * Replace a relative jump (JMP.d32) with a breakpoint (INT3).
 *
 * After that, we can restore the 4 bytes after the INT3 to undo what
 * arch_optimize_kprobes() scribbled. This is safe since those bytes will be
 * unused once the INT3 lands.
 */
void arch_unoptimize_kprobe(struct optimized_kprobe *op)
{
	u8 new[JMP32_INSN_SIZE] = { INT3_INSN_OPCODE, };
	u8 old[JMP32_INSN_SIZE];
	u8 *addr = op->kp.addr;

	memcpy(old, op->kp.addr, JMP32_INSN_SIZE);
	memcpy(new + INT3_INSN_SIZE,
	       op->optinsn.copied_insn,
	       JMP32_INSN_SIZE - INT3_INSN_SIZE);

	text_poke(addr, new, INT3_INSN_SIZE);
	text_poke_sync();
	text_poke(addr + INT3_INSN_SIZE,
		  new + INT3_INSN_SIZE,
		  JMP32_INSN_SIZE - INT3_INSN_SIZE);
	text_poke_sync();

	perf_event_text_poke(op->kp.addr, old, JMP32_INSN_SIZE, new, JMP32_INSN_SIZE);
}

/*
 * Recover original instructions and breakpoints from relative jumps.
 * The caller must hold kprobe_mutex.
 */
extern void arch_unoptimize_kprobes(struct list_head *oplist,
				    struct list_head *done_list)
{
	struct optimized_kprobe *op, *tmp;

	list_for_each_entry_safe(op, tmp, oplist, list) {
		arch_unoptimize_kprobe(op);
		list_move(&op->list, done_list);
	}
}

int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter)
{
	struct optimized_kprobe *op;

	if (p->flags & KPROBE_FLAG_OPTIMIZED) {
		/* This kprobe is really able to run the optimized path. */
		op = container_of(p, struct optimized_kprobe, kp);
		/* Detour through copied instructions */
		regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX;
		if (!reenter)
			reset_current_kprobe();
		return 1;
	}
	return 0;
}
NOKPROBE_SYMBOL(setup_detour_execution);