From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Fri, 1 Aug 2008 19:43:15 -0400
Subject: [x86_64] kprobe: kprobe-booster and return probe-booster
Message-id: 48939F93.5060102@redhat.com
O-Subject: [RHEL5.3 PATCH] BZ438725: support kprobe-booster and return probe-booster on x86-64
Bugzilla: 438725
RH-Acked-by: Dave Anderson <anderson@redhat.com>

Hello,

Bugzilla:
---------
https://bugzilla.redhat.com/show_bug.cgi?id=438725

Description:
------------
I backported the kprobe-booster and kretprobe-booster patches for
x86-64 to 2.6.18-101.el5. This patch includes the three patches listed
in the "Upstream status" section.

I modified those patches in the following three points:
- If CONFIG_PREEMPT is set, disable the booster without checking
  CONFIG_PM.
- free_insn_slot() doesn't check whether the probe is boostable or not.
  (These two changes are needed because the original patches depend on
  '[PATCH] kprobes: enable booster on the preemptible kernel', which was
  merged into 2.6.20 and has not been backported to the RHEL5 kernel.)
- Since the original patch adds a boostable flag to struct
  arch_specific_insn, which is part of struct kprobe, it breaks kABI.
  Thus, I added the following modifications to preserve kABI:
  - Extend MAX_INSN_SIZE by 1 byte to make space for the boostable flag
    in the instruction slot. (kprobes internally allocates an
    instruction slot (buffer) of MAX_INSN_SIZE bytes for each kprobe,
    so no kABI is changed.)
  - Define __MAX_INSN_SIZE as the original MAX_INSN_SIZE (=15).
  - Use __MAX_INSN_SIZE instead of MAX_INSN_SIZE in
    arch/x86_64/kernel/kprobes.c.
  - Add __set_boostable()/__get_boostable() to encapsulate access to
    the boostable flag, which lives in the last byte of the
    instruction slot.
  - Use 0,1,2 (instead of -1,0,1) for the boostable flag, since the
    element type of the instruction slot is unsigned char (u8).

A short illustrative sketch of this flag/jump encoding follows the
patch below.

Brew:
-----
This patch set was built on all platforms.
http://brewweb.devel.redhat.com/brew/buildinfo?buildID=73971

Testing:
--------
I tested this patch with the systemtap testsuite (0.6.2-1.el5), and the
test results were the same as with the original kernel (2.6.18-95.el5).
I also tested it with my kprobe-benchmark test module to ensure that
kprobes were actually boosted.

Upstream status:
----------------
0b0122faf4833548072d23f3c3063c23bc289746 // fix resume_execution() on x86-64
aa470140e86e45723cf8387292edbce9106ddc1f // kprobe-booster on x86-64
da07ab0375897bb9e108b28129df140ecd3ee94e // kretprobe-booster on x86-64

Thank you,

diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c
index ad9d2f4..8a5b93f 100644
--- a/arch/x86_64/kernel/kprobes.c
+++ b/arch/x86_64/kernel/kprobes.c
@@ -29,6 +29,8 @@
  *		Fixed to handle %rip-relative addressing mode correctly.
  * 2005-May	Rusty Lynch <rusty.lynch@intel.com>
  *		Added function return probes functionality
+ * 2007-Dec	Masami Hiramatsu <mhiramat@redhat.com> added kprobe-booster
+ *		and kretprobe-booster for x86-64
  */

 #include <linux/kprobes.h>
@@ -57,6 +59,122 @@ struct kretprobe_blackpoint kretprobe_blacklist[] = {
 const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);

 /*
+ * boostable = 0: This instruction type is not boostable.
+ * boostable = 1: This instruction type is boostable.
+ * boostable = 2: This instruction has been boosted: we have
+ * added a relative jump after the instruction copy in insn,
+ * so no single-step and fixup are needed (unless there's
+ * a post_handler or break_handler).
+ */
+static int __get_boostable(struct kprobe *p)
+{
+	return p->ainsn.insn[BOOSTABLE_FLAG];
+}
+static int __set_boostable(struct kprobe *p, int flag)
+{
+	return p->ainsn.insn[BOOSTABLE_FLAG] = (kprobe_opcode_t)flag;
+}
+
+/* Insert a jump instruction at address 'from', which jumps to address 'to'. */
+static __always_inline void set_jmp_op(void *from, void *to)
+{
+	struct __arch_jmp_op {
+		char op;
+		s32 raddr;
+	} __attribute__((packed)) * jop;
+	jop = (struct __arch_jmp_op *)from;
+	jop->raddr = (s32)((long)(to) - ((long)(from) + 5));
+	jop->op = RELATIVEJUMP_INSTRUCTION;
+}
+
+/*
+ * returns non-zero if opcode is boostable
+ * RIP relative instructions are adjusted at copying time
+ */
+static __always_inline int can_boost(kprobe_opcode_t *opcodes)
+{
+#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
+	(((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) |   \
+	  (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) |   \
+	  (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) |   \
+	  (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf))    \
+	 << (row % 64))
+	/*
+	 * Undefined/reserved opcodes, conditional jumps, Opcode Extension
+	 * Groups, and some special opcodes cannot be boosted.
+	 */
+	static const unsigned long twobyte_is_boostable[256 / 64] = {
+		/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f      */
+		/*      ----------------------------------------------      */
+		W(0x00, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0)| /* 00 */
+		W(0x10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)| /* 10 */
+		W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)| /* 20 */
+		W(0x30, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), /* 30 */
+		W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)| /* 40 */
+		W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)| /* 50 */
+		W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1)| /* 60 */
+		W(0x70, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1), /* 70 */
+		W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)| /* 80 */
+		W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)| /* 90 */
+		W(0xa0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1)| /* a0 */
+		W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1), /* b0 */
+		W(0xc0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1)| /* c0 */
+		W(0xd0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1)| /* d0 */
+		W(0xe0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1)| /* e0 */
+		W(0xf0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0)  /* f0 */
+		/*      ----------------------------------------------      */
+		/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f      */
+	};
+#undef W
+	kprobe_opcode_t opcode;
+	kprobe_opcode_t *orig_opcodes = opcodes;
+
+retry:
+	if (opcodes - orig_opcodes > __MAX_INSN_SIZE - 1)
+		return 0;
+	opcode = *(opcodes++);
+
+	/* 2nd-byte opcode */
+	if (opcode == 0x0f) {
+		if (opcodes - orig_opcodes > __MAX_INSN_SIZE - 1)
+			return 0;
+		return test_bit(*opcodes, twobyte_is_boostable);
+	}
+
+	switch (opcode & 0xf0) {
+	case 0x40:
+		goto retry; /* REX prefix is boostable */
+	case 0x60:
+		if (0x63 < opcode && opcode < 0x67)
+			goto retry; /* prefixes */
+		/* can't boost Address-size override and bound */
+		return (opcode != 0x62 && opcode != 0x67);
+	case 0x70:
+		return 0; /* can't boost conditional jump */
+	case 0xc0:
+		/* can't boost software interrupts */
+		return (0xc1 < opcode && opcode < 0xcc) || opcode == 0xcf;
+	case 0xd0:
+		/* can boost AA* and XLAT */
+		return (opcode == 0xd4 || opcode == 0xd5 || opcode == 0xd7);
+	case 0xe0:
+		/* can boost in/out and absolute jmps */
+		return ((opcode & 0x04) || opcode == 0xea);
+	case 0xf0:
+		if ((opcode & 0x0c) == 0 && opcode != 0xf1)
+			goto retry; /* lock/rep(ne) prefix */
+		/* clear and set flags are boostable */
+		return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe));
+	default:
+		/* segment override prefixes are boostable */
+		if (opcode == 0x26 || opcode == 0x36 || opcode == 0x3e)
+			goto retry; /* prefixes */
+		/* CS override prefix and call are not boostable */
+		return (opcode != 0x2e && opcode != 0x9a);
+	}
+}
+
+/*
  * returns non-zero if opcode modifies the interrupt flag.
  */
 static __always_inline int is_IF_modifier(kprobe_opcode_t *insn)
@@ -87,7 +205,7 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p)

 /*
  * Determine if the instruction uses the %rip-relative addressing mode.
- * If it does, return the address of the 32-bit displacement word.
+ * If it does, Return the address of the 32-bit displacement word.
  * If not, return null.
  */
 static s32 __kprobes *is_riprel(u8 *insn)
@@ -191,7 +309,7 @@ static s32 __kprobes *is_riprel(u8 *insn)
 static void __kprobes arch_copy_kprobe(struct kprobe *p)
 {
 	s32 *ripdisp;
-	memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE);
+	memcpy(p->ainsn.insn, p->addr, __MAX_INSN_SIZE);
 	ripdisp = is_riprel(p->ainsn.insn);
 	if (ripdisp) {
 		/*
@@ -211,6 +329,11 @@ static void __kprobes arch_copy_kprobe(struct kprobe *p)
 		BUG_ON((s64) (s32) disp != disp); /* Sanity check.  */
 		*ripdisp = disp;
 	}
+	if (can_boost(p->addr)) {
+		__set_boostable(p, 1);
+	} else {
+		__set_boostable(p, 0);
+	}
 	p->opcode = *p->addr;
 }

@@ -386,6 +509,15 @@ int __kprobes kprobe_handler(struct pt_regs *regs)
 		return 1;

 ss_probe:
+#if !defined(CONFIG_PREEMPT)
+	if (__get_boostable(p) == 2 && !p->post_handler) {
+		/* Boost up -- we can execute copied instructions directly */
+		reset_current_kprobe();
+		regs->rip = (unsigned long)p->ainsn.insn;
+		preempt_enable_no_resched();
+		return 1;
+	}
+#endif
 	prepare_singlestep(p, regs);
 	kcb->kprobe_status = KPROBE_HIT_SS;
 	return 1;
@@ -396,21 +528,65 @@ no_kprobe:
 }

 /*
- * For function-return probes, init_kprobes() establishes a probepoint
- * here. When a retprobed function returns, this probe is hit and
- * trampoline_probe_handler() runs, calling the kretprobe's handler.
+ * When a retprobed function returns, this code saves registers and
+ * calls trampoline_handler(), which calls the kretprobe's handler.
  */
- void kretprobe_trampoline_holder(void)
+ void __kprobes kretprobe_trampoline_holder(void)
  {
 	asm volatile (	".global kretprobe_trampoline\n"
-			"kretprobe_trampoline: \n"
-			"nop\n");
+			"kretprobe_trampoline: \n"
+			/* We don't bother saving the ss register */
+			"	pushq %rsp\n"
+			"	pushfq\n"
+			/*
+			 * Skip cs, ip, orig_ax.
+			 * trampoline_handler() will plug in these values
+			 */
+			"	subq $24, %rsp\n"
+			"	pushq %rdi\n"
+			"	pushq %rsi\n"
+			"	pushq %rdx\n"
+			"	pushq %rcx\n"
+			"	pushq %rax\n"
+			"	pushq %r8\n"
+			"	pushq %r9\n"
+			"	pushq %r10\n"
+			"	pushq %r11\n"
+			"	pushq %rbx\n"
+			"	pushq %rbp\n"
+			"	pushq %r12\n"
+			"	pushq %r13\n"
+			"	pushq %r14\n"
+			"	pushq %r15\n"
+			"	movq %rsp, %rdi\n"
+			"	call trampoline_handler\n"
+			/* Replace saved sp with true return address. */
*/ + " movq %rax, 152(%rsp)\n" + " popq %r15\n" + " popq %r14\n" + " popq %r13\n" + " popq %r12\n" + " popq %rbp\n" + " popq %rbx\n" + " popq %r11\n" + " popq %r10\n" + " popq %r9\n" + " popq %r8\n" + " popq %rax\n" + " popq %rcx\n" + " popq %rdx\n" + " popq %rsi\n" + " popq %rdi\n" + /* Skip orig_ax, ip, cs */ + " addq $24, %rsp\n" + " popfq\n" + " ret\n"); } /* - * Called when we hit the probe point at kretprobe_trampoline + * Called from kretprobe_trampoline */ -int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) +fastcall void * __kprobes trampoline_handler(struct pt_regs *regs) { struct kretprobe_instance *ri = NULL; struct hlist_head *head; @@ -420,7 +596,10 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) spin_lock_irqsave(&kretprobe_lock, flags); head = kretprobe_inst_table_head(current); - + /* fixup rt_regs */ + regs->cs = __KERNEL_CS; + regs->rip = trampoline_address; + regs->orig_rax = 0xffffffffffffffff; /* * It is possible to have multiple instances associated with a given * task either because an multiple functions in the call path @@ -439,8 +618,12 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) /* another task is sharing our hash bucket */ continue; - if (ri->rp && ri->rp->handler) + if (ri->rp && ri->rp->handler) { + __get_cpu_var(current_kprobe) = &ri->rp->kp; + get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE; ri->rp->handler(ri, regs); + __get_cpu_var(current_kprobe) = NULL; + } orig_ret_address = (unsigned long)ri->ret_addr; recycle_rp_inst(ri); @@ -455,18 +638,10 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) } BUG_ON(!orig_ret_address || (orig_ret_address == trampoline_address)); - regs->rip = orig_ret_address; - reset_current_kprobe(); spin_unlock_irqrestore(&kretprobe_lock, flags); - preempt_enable_no_resched(); - /* - * By returning a non-zero value, we are telling - * kprobe_handler() that we don't want the post_handler - * to run (and have re-enabled preemption) - */ - return 1; + return (void *)orig_ret_address; } /* @@ -490,12 +665,16 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) * 2) If the single-stepped instruction was a call, the return address * that is atop the stack is the address following the copied instruction. * We need to make it the address following the original instruction. + * + * If this is the first time we've single-stepped the instruction at + * this probepoint, and the instruction is boostable, boost it: add a + * jump instruction after the copied instruction, that jumps to the next + * instruction after the probepoint. 
  */
 static void __kprobes resume_execution(struct kprobe *p,
 		struct pt_regs *regs, struct kprobe_ctlblk *kcb)
 {
 	unsigned long *tos = (unsigned long *)regs->rsp;
-	unsigned long next_rip = 0;
 	unsigned long copy_rip = (unsigned long)p->ainsn.insn;
 	unsigned long orig_rip = (unsigned long)p->addr;
 	kprobe_opcode_t *insn = p->ainsn.insn;
@@ -504,46 +683,59 @@ static void __kprobes resume_execution(struct kprobe *p,
 	if (*insn >= 0x40 && *insn <= 0x4f)
 		insn++;

+	regs->eflags &= ~TF_MASK;
 	switch (*insn) {
-	case 0x9c:		/* pushfl */
+	case 0x9c:	/* pushfl */
 		*tos &= ~(TF_MASK | IF_MASK);
 		*tos |= kcb->kprobe_old_rflags;
 		break;
-	case 0xc3:		/* ret/lret */
-	case 0xcb:
-	case 0xc2:
+	case 0xc2:	/* iret/ret/lret */
+	case 0xc3:
 	case 0xca:
-		regs->eflags &= ~TF_MASK;
-		/* rip is already adjusted, no more changes required*/
-		return;
-	case 0xe8:		/* call relative - Fix return addr */
+	case 0xcb:
+	case 0xcf:
+	case 0xea:	/* jmp absolute -- ip is correct */
+		/* ip is already adjusted, no more changes required */
+		__set_boostable(p, 2);
+		goto no_change;
+	case 0xe8:	/* call relative - Fix return addr */
 		*tos = orig_rip + (*tos - copy_rip);
 		break;
 	case 0xff:
 		if ((insn[1] & 0x30) == 0x10) {
 			/* call absolute, indirect */
-			/* Fix return addr; rip is correct. */
-			next_rip = regs->rip;
+			/* Fix return addr; ip is correct. */
+			/* not boostable */
 			*tos = orig_rip + (*tos - copy_rip);
+			goto no_change;
 		} else if (((insn[1] & 0x31) == 0x20) ||	/* jmp near, absolute indirect */
 			   ((insn[1] & 0x31) == 0x21)) {	/* jmp far, absolute indirect */
-			/* rip is correct. */
-			next_rip = regs->rip;
+			/* ip is correct. And this is boostable */
+			__set_boostable(p, 2);
+			goto no_change;
 		}
-		break;
-	case 0xea:		/* jmp absolute -- rip is correct */
-		next_rip = regs->rip;
-		break;
 	default:
 		break;
 	}
-
-	regs->eflags &= ~TF_MASK;
-	if (next_rip) {
-		regs->rip = next_rip;
-	} else {
-		regs->rip = orig_rip + (regs->rip - copy_rip);
+	if (__get_boostable(p) == 1) {
+		if ((regs->rip > copy_rip) &&
+		    (regs->rip - copy_rip) + 5 < __MAX_INSN_SIZE) {
+			/*
+			 * This instruction can be executed directly if it
+			 * jumps back to the correct address.
+			 */
+			set_jmp_op((void *)regs->rip,
+				   (void *)orig_rip + (regs->rip - copy_rip));
+			__set_boostable(p, 2);
+		} else {
+			__set_boostable(p, 0);
+		}
 	}
+
+	regs->rip = orig_rip + (regs->rip - copy_rip);
+no_change:
+
+	return;
 }

 int __kprobes post_kprobe_handler(struct pt_regs *regs)
@@ -746,12 +938,7 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
 	return 0;
 }

-static struct kprobe trampoline_p = {
-	.addr = (kprobe_opcode_t *) &kretprobe_trampoline,
-	.pre_handler = trampoline_probe_handler
-};
-
 int __init arch_init_kprobes(void)
 {
-	return register_kprobe(&trampoline_p);
+	return 0;
 }
diff --git a/include/asm-x86_64/kprobes.h b/include/asm-x86_64/kprobes.h
index 63dfa21..77a2a72 100644
--- a/include/asm-x86_64/kprobes.h
+++ b/include/asm-x86_64/kprobes.h
@@ -34,7 +34,11 @@ struct kprobe;
 typedef u8 kprobe_opcode_t;
 #define BREAKPOINT_INSTRUCTION	0xcc
-#define MAX_INSN_SIZE 15
+#define RELATIVEJUMP_INSTRUCTION 0xe9
+/* Use the last byte of the insn slot as a flag, to preserve kABI compatibility */
+#define __MAX_INSN_SIZE 15
+#define MAX_INSN_SIZE (__MAX_INSN_SIZE + 1)
+#define BOOSTABLE_FLAG (MAX_INSN_SIZE - 1)
 #define MAX_STACK_SIZE 64
 #define MIN_STACK_SIZE(ADDR) (((MAX_STACK_SIZE) < \
	(((unsigned long)current_thread_info()) + THREAD_SIZE - (ADDR))) \
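
Appendix (illustrative, not part of the patch):
-----------------------------------------------
For reviewers who want to see the kABI trick in isolation, here is a
minimal userspace sketch of the two mechanisms described above: keeping
the boostable flag in the spare 16th byte of the instruction slot, and
the rel32 arithmetic that set_jmp_op() uses to plant the 0xe9 jump
after the copied instruction. The names fake_kprobe and emit_jmp are
mine, for illustration only; the macros mirror the ones added to
include/asm-x86_64/kprobes.h.

/* Illustrative userspace sketch only -- not kernel code. */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define __MAX_INSN_SIZE 15                  /* the original MAX_INSN_SIZE */
#define MAX_INSN_SIZE (__MAX_INSN_SIZE + 1) /* +1 spare byte for the flag */
#define BOOSTABLE_FLAG (MAX_INSN_SIZE - 1)  /* index of the spare byte */
#define RELATIVEJUMP_INSTRUCTION 0xe9

typedef uint8_t kprobe_opcode_t;

struct fake_kprobe {                        /* stand-in for the insn slot */
	kprobe_opcode_t insn[MAX_INSN_SIZE];/* copied insn + flag byte */
};

static int get_boostable(struct fake_kprobe *p)
{
	return p->insn[BOOSTABLE_FLAG];
}

static void set_boostable(struct fake_kprobe *p, int flag)
{
	/* 0, 1 and 2 all fit in the u8 slot byte, hence no -1 state */
	p->insn[BOOSTABLE_FLAG] = (kprobe_opcode_t)flag;
}

/*
 * Same displacement math as set_jmp_op(): the rel32 operand of a
 * 5-byte 0xe9 jump is counted from the end of the jump instruction.
 */
static void emit_jmp(uint8_t *from, uint8_t *to)
{
	int32_t raddr = (int32_t)((long)to - ((long)from + 5));

	from[0] = RELATIVEJUMP_INSTRUCTION;
	memcpy(from + 1, &raddr, sizeof(raddr));
}

int main(void)
{
	struct fake_kprobe p = { { 0 } };
	uint8_t buf[32] = { 0 };
	int32_t rel;

	set_boostable(&p, 2); /* "boosted", as resume_execution() sets it */
	printf("flag byte %d of the slot = %d\n",
	       BOOSTABLE_FLAG, get_boostable(&p));

	/* pretend the copied insn ends at buf+3; jump back to buf+20 */
	emit_jmp(buf + 3, buf + 20);
	memcpy(&rel, buf + 4, sizeof(rel));
	printf("opcode=0x%02x rel32=%d\n", buf[3], rel); /* 0xe9, 12 */
	return 0;
}

Because the flag rides inside the buffer that get_insn_slot() already
hands out per probe, struct kprobe and struct arch_specific_insn keep
their old layout, which is the whole point of the kABI workaround.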