On Fri, 2009-02-06 at 15:49 -0500, Masami Hiramatsu wrote: > Hi Jim, > > I'm also interested in the instruction decoder. > If you don't mind, could we share the API specification? > I'd like to port djprobe on it.
I'm enclosing the little x86 instruction-analysis prototype I hacked together (insn_x86.*), along with a copy of systemtap's runtime/uprobes2/uprobes_x86.c, which I modified to use it. But again, we haven't really settled on an API. For example, my x86 prototype doesn't collect all the info that kvm needs. We're thinking that adapting some existing code (like kvm in the x86 case) might be more palatable to LKML. Jim > > Thanks! > > Jim Keniston wrote: > > Hi, Roland. Back in a conference call in December, we discussed > > approaches to refactoring utrace-related code such as uprobes, to > > make some of the services provided there more generally available. > > In particular, you suggested an "instruction analysis" service that > > various subsystems could exploit -- kprobes and uprobes/ubp at first, > > and eventually perhaps gdb, perfmon, kvm, ftrace, and djprobes. > > ... > > Srikar Dronamraju and I are exploring two different approaches to an > > x86 instruction-parsing service. Since x86 kvm seems to have one of > > the most systematic and thorough approaches, Srikar is prototyping a > > generalization of kvm's x86_decode_insn() to make it support kprobes, > > and eventually uprobes. (Note that kvm does NOT appear to be a good > > starting place on powerpc and s390.) Approaching from the minimalist > > side, I've implemented an x86 instruction-parsing API with just enough > > smarts (so far) to support kprobes and uprobes. > > > > We'd be interested to know whether these efforts are consistent > > with what you have in mind. > > > > See more details below. > > > > Jim ...
/* * x86 instruction analysis * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * * Copyright (C) IBM Corporation, 2002, 2004, 2009 */ #include <linux/string.h> // #include <asm/insn.h> #include "insn_x86.h" /** * insn_init() - initialize struct insn * @insn: &struct insn to be initialized * @kaddr: address (in kernel memory) of instruction (or copy thereof) * @x86_64: true for 64-bit kernel or 64-bit app */ void insn_init(struct insn *insn, const u8 *kaddr, bool x86_64) { memset(insn, 0, sizeof(*insn)); insn->kaddr = kaddr; insn->next_byte = kaddr; insn->x86_64 = x86_64; } EXPORT_SYMBOL_GPL(insn_init); /** * insn_get_prefixes - scan x86 instruction prefix bytes * @insn: &struct insn containing instruction * * Populates the @insn->prefixes bitmap, and updates @insn->next_byte * to point to the (first) opcode. No effect if @insn->prefixes.got * is already true. */ void insn_get_prefixes(struct insn *insn) { u32 pfx; struct insn_field *prefixes = &insn->prefixes; if (prefixes->got) return; for (;; insn->next_byte++, prefixes->nbytes++) { u8 b = *(insn->next_byte); #ifdef CONFIG_X86_64 if ((b & 0xf0) == 0x40 && insn->x86_64) { prefixes->value |= X86_PFX_REX; prefixes->value |= (b & 0x0f) * X86_PFX_REX_BASE; /* REX prefix is always last. 
*/ break; } #endif switch (b) { case 0x26: pfx = X86_PFX_ES; break; case 0x2E: pfx = X86_PFX_CS; break; case 0x36: pfx = X86_PFX_SS; break; case 0x3E: pfx = X86_PFX_DS; break; case 0x64: pfx = X86_PFX_FS; break; case 0x65: pfx = X86_PFX_GS; break; case 0x66: pfx = X86_PFX_OPNDSZ; break; case 0x67: pfx = X86_PFX_ADDRSZ; break; case 0xF0: pfx = X86_PFX_LOCK; break; case 0xF2: pfx = X86_PFX_REPNE; break; case 0xF3: pfx = X86_PFX_REPE; break; default: pfx = 0x0; break; } if (!pfx) break; prefixes->value |= pfx; } prefixes->got = true; } EXPORT_SYMBOL_GPL(insn_get_prefixes); /** * insn_get_opcode - collect opcode(s) * @insn: &struct insn containing instruction * * Populates @insn->opcode1 (and @insn->opcode2, if it's a 2-byte opcode) * and updates @insn->next_byte to point past the opcode byte(s). * If necessary, first collects any preceding (prefix) bytes. * Sets @insn->opcode.value = opcode1. No effect if @insn->opcode.got * is already true. */ void insn_get_opcode(struct insn *insn) { struct insn_field *opcode = &insn->opcode; if (opcode->got) return; if (!insn->prefixes.got) insn_get_prefixes(insn); insn->opcode1 = *insn->next_byte++; if (insn->opcode1 == 0x0f) { insn->opcode2 = *insn->next_byte++; opcode->nbytes = 2; } else opcode->nbytes = 1; opcode->value = insn->opcode1; opcode->got = true; } EXPORT_SYMBOL_GPL(insn_get_opcode); const u32 onebyte_has_modrm[256 / 32] = { /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ /* ----------------------------------------------- */ W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 00 */ W(0x10, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 10 */ W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 20 */ W(0x30, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 30 */ W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */ W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */ W(0x60, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0) | /* 60 */ W(0x70, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0) , /* 70 */ W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ W(0x90, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 90 */ W(0xa0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* a0 */ W(0xb0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* b0 */ W(0xc0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* c0 */ W(0xd0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ W(0xe0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* e0 */ W(0xf0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) /* f0 */ /* ----------------------------------------------- */ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ }; const u32 twobyte_has_modrm[256 / 32] = { /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ /* ----------------------------------------------- */ W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1) | /* 0f */ W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0) , /* 1f */ W(0x20, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 2f */ W(0x30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 3f */ W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 4f */ W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 5f */ W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 6f */ W(0x70, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1) , /* 7f */ W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 8f */ W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 9f */ W(0xa0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) | /* af */ W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* bf */ W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* cf */ W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* df */ W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* ef */ W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* ff */ /* ----------------------------------------------- */ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ }; /** * insn_get_modrm - collect ModRM byte, if 
any * @insn: &struct insn containing instruction * * Populates @insn->modrm and updates @insn->next_byte to point past the * ModRM byte, if any. If necessary, first collects the preceding bytes * (prefixes and opcode(s)). No effect if @insn->modrm.got is already true. */ void insn_get_modrm(struct insn *insn) { struct insn_field *modrm = &insn->modrm; if (modrm->got) return; if (!insn->opcode.got) insn_get_opcode(insn); if (insn->opcode.nbytes == 2) modrm->nbytes = test_bit(insn->opcode2, (const unsigned long*) twobyte_has_modrm); else modrm->nbytes = test_bit(insn->opcode1, (const unsigned long*) onebyte_has_modrm); if (modrm->nbytes) modrm->value = *(insn->next_byte++); modrm->got = true; } EXPORT_SYMBOL_GPL(insn_get_modrm); #ifdef CONFIG_X86_64 /** * insn_rip_relative() - Does instruction use RIP-relative addressing mode? * @insn: &struct insn containing instruction * * If necessary, first collects the instruction up to and including the * ModRM byte. No effect if @insn->x86_64 is false. */ bool insn_rip_relative(struct insn *insn) { struct insn_field *modrm = &insn->modrm; if (!insn->x86_64) return false; if (!modrm->got) insn_get_modrm(insn); /* * For rip-relative instructions, the mod field (top 2 bits) * is zero and the r/m field (bottom 3 bits) is 0x5. */ return (insn_field_exists(modrm) && (modrm->value & 0xc7) == 0x5); } EXPORT_SYMBOL_GPL(insn_rip_relative); #endif
#ifndef _ASM_X86_INSN_H #define _ASM_X86_INSN_H /* * x86 instruction analysis * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * * Copyright (C) IBM Corporation, 2002, 2004, 2009 */ #include <linux/types.h> #undef W #define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\ (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \ (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \ (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \ << (row % 32)) /* legacy instruction prefixes */ #define X86_PFX_OPNDSZ 0x1 /* 0x66 */ #define X86_PFX_ADDRSZ 0x2 /* 0x67 */ #define X86_PFX_CS 0x4 /* 0x2E */ #define X86_PFX_DS 0x8 /* 0x3E */ #define X86_PFX_ES 0x10 /* 0x26 */ #define X86_PFX_FS 0x20 /* 0x64 */ #define X86_PFX_GS 0x40 /* 0x65 */ #define X86_PFX_SS 0x80 /* 0x36 */ #define X86_PFX_LOCK 0x100 /* 0xF0 */ #define X86_PFX_REPE 0x200 /* 0xF3 */ #define X86_PFX_REPNE 0x400 /* 0xF2 */ /* REX prefix */ #define X86_PFX_REX 0x800 /* 0x4X */ /* REX prefix dissected */ #define X86_PFX_REX_BASE 0x1000 #define X86_PFX_REXB 0x1000 /* 0x41 bit */ #define X86_PFX_REXX 0x2000 /* 0x42 bit */ #define X86_PFX_REXR 0x4000 /* 0x44 bit */ #define X86_PFX_REXW 0x8000 /* 0x48 bit */ struct insn_field { s32 value; bool got; /* 
true if we've run insn_get_xxx() for this field */ u8 nbytes; }; struct insn { u8 opcode1, opcode2; struct insn_field prefixes; /* prefixes.value is a bitmap */ struct insn_field opcode; /* opcode.value == opcode1 */ struct insn_field modrm; struct insn_field sib; struct insn_field displacement; struct insn_field immediate; const u8 *kaddr; /* kernel address of insn (copy) to analyze */ const u8 *next_byte; bool x86_64; }; extern void insn_init(struct insn *insn, const u8 *kaddr, bool x86_64); extern void insn_get_prefixes(struct insn *insn); extern void insn_get_opcode(struct insn *insn); extern void insn_get_modrm(struct insn *insn); #ifdef CONFIG_X86_64 extern bool insn_rip_relative(struct insn *insn); #else bool insn_rip_relative(struct insn *insn) { return false; } #endif static inline bool insn_field_exists(const struct insn_field *field) { return (field->nbytes > 0); } static inline u8 insn_extract_reg(int modrm) { return (modrm >> 3) & 0x7; } #endif /* _ASM_X86_INSN_H */
/*
 * Userspace Probes (UProbes)
 * uprobes.c
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2006-2008
 */

/*
 * In versions of uprobes built in the SystemTap runtime, this file
 * is #included at the end of uprobes.c.
 */

#include <asm/uaccess.h>
#include "insn_x86.h"

#ifdef CONFIG_X86_32
#define is_32bit_app(tsk) 1
#else
#define is_32bit_app(tsk) (test_tsk_thread_flag(tsk, TIF_IA32))
#endif

/*
 * Good-instruction tables: one bit per 1-byte opcode, 1 = probeable.
 *
 * NOTE: W() (see insn_x86.h) shifts each row by (row % 32), i.e. it
 * packs exactly two 16-opcode rows into each 32-bit element, so these
 * tables must be u32[256 / 32].  (Declared as u64[256 / 64], rows
 * 0x20-0x3f would alias rows 0x00-0x1f, bits 32-63 of each element
 * would never be set, and test_bit() would wrongly reject every valid
 * opcode in the upper half of each element.)
 */
static const u32 good_insns_64[256 / 32] = {
	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f        */
	/*      -------------------------------                       */
	W(0x00, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0)| /* 00 */
	W(0x10, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0), /* 10 */
	W(0x20, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0)| /* 20 */
	W(0x30, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0), /* 30 */
	W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)| /* 40 */
	W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), /* 50 */
	W(0x60, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0)| /* 60 */
	W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), /* 70 */
	W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)| /* 80 */
	W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1), /* 90 */
	W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)| /* a0 */
	W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), /* b0 */
	W(0xc0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0)| /* c0 */
	W(0xd0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1), /* d0 */
	W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0)| /* e0 */
	W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1)  /* f0 */
	/*      -------------------------------                       */
	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f        */
};

/* Good-instruction tables for 32-bit apps -- copied from i386 uprobes */
static const u32 good_insns_32[256 / 32] = {
	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f        */
	/*      -------------------------------                       */
	W(0x00, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0)| /* 00 */
	W(0x10, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0), /* 10 */
	W(0x20, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1)| /* 20 */
	W(0x30, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1), /* 30 */
	W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)| /* 40 */
	W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), /* 50 */
	W(0x60, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0)| /* 60 */
	W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), /* 70 */
	W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)| /* 80 */
	W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1), /* 90 */
	W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)| /* a0 */
	W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), /* b0 */
	W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0)| /* c0 */
	W(0xd0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1), /* d0 */
	W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0)| /* e0 */
	W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1)  /* f0 */
	/*      -------------------------------                       */
	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f        */
};

/* Using this for both 64-bit and 32-bit apps */
static const u32 good_2byte_insns[256 / 32] = {
	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f        */
	/*      -------------------------------                       */
	W(0x00, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1)| /* 00 */
	W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1), /* 10 */
	W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1)| /* 20 */
	W(0x30, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), /* 30 */
	W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)| /* 40 */
	W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), /* 50 */
	W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)| /* 60 */
	W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1), /* 70 */
	W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)| /* 80 */
	W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), /* 90 */
	W(0xa0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1)| /* a0 */
	W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1), /* b0 */
	W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)| /* c0 */
	W(0xd0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), /* d0 */
	W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)| /* e0 */
	W(0xf0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0)  /* f0 */
	/*      -------------------------------                       */
	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f        */
};

/*
 * opcodes we'll probably never support:
 * 6c-6d, e4-e5, ec-ed - in
 * 6e-6f, e6-e7, ee-ef - out
 * cc, cd - int3, int
 * cf - iret
 * d6 - illegal instruction
 * f1 - int1/icebp
 * f4 - hlt
 * fa, fb - cli, sti
 * 0f - lar, lsl, syscall, clts, sysret, sysenter, sysexit, invd, wbinvd, ud2
 *
 * invalid opcodes in 64-bit mode:
 * 06, 0e, 16, 1e, 27, 2f, 37, 3f, 60-62, c4-c5, d4-d5
 *
 * 63 - we support this opcode in x86_64 but not in i386.
 * opcodes we may need to refine support for:
 * 0f - 2-byte instructions: For many of these instructions, the validity
 * depends on the prefix and/or the reg field.  On such instructions, we
 * just consider the opcode combination valid if it corresponds to any
 * valid instruction.
 * 8f - Group 1 - only reg = 0 is OK
 * c6-c7 - Group 11 - only reg = 0 is OK
 * d9-df - fpu insns with some illegal encodings
 * f2, f3 - repnz, repz prefixes.  These are also the first byte for
 * certain floating-point instructions, such as addsd.
 * fe - Group 4 - only reg = 0 or 1 is OK
 * ff - Group 5 - only reg = 0-6 is OK
 *
 * others -- Do we need to support these?
 * 0f - (floating-point?) prefetch instructions
 * 07, 17, 1f - pop es, pop ss, pop ds
 * 26, 2e, 36, 3e - es:, cs:, ss:, ds: segment prefixes --
 * but 64 and 65 (fs: and gs:) seem to be used, so we support them
 * 67 - addr16 prefix
 * 9b - wait/fwait
 * ce - into
 * f0 - lock prefix
 */
#define UP_BAD_PREFIXES \
	(X86_PFX_ES | X86_PFX_CS | X86_PFX_SS | X86_PFX_DS | X86_PFX_LOCK)

/*
 * TODO:
 * - Where necessary, examine the modrm byte and allow only valid instructions
 * in the different Groups and fpu instructions.
 * - Note: If we go past the first byte, do we need to verify that
 * subsequent bytes were actually there, rather than off the last page?
 * - Be clearer about which instructions we'll never probe.
 */

/* Tell the user we can't probe the 1-byte opcode @op in @mode-bit apps. */
static void report_bad_1byte_opcode(int mode, uprobe_opcode_t op)
{
	printk(KERN_ERR "In %d-bit apps, "
		"uprobes does not currently support probing "
		"instructions whose first byte is 0x%2.2x\n", mode, op);
}

/* Tell the user we can't probe the 2-byte opcode 0x0f @op. */
static void report_bad_2byte_opcode(uprobe_opcode_t op)
{
	printk(KERN_ERR "uprobes does not currently support probing "
		"instructions with the 2-byte opcode 0x0f 0x%2.2x\n", op);
}

/*
 * Decode ppt->insn as a 32-bit-mode instruction and verify that it's
 * one we know how to probe.  Returns 0 if probeable, -EPERM otherwise.
 */
static int validate_insn_32bits(struct uprobe_probept *ppt, struct insn *insn)
{
	insn_init(insn, ppt->insn, false);

	/* Skip good instruction prefixes; reject "bad" ones. */
	insn_get_opcode(insn);
	if (insn->prefixes.value & UP_BAD_PREFIXES) {
		report_bad_1byte_opcode(32, insn->opcode1);
		return -EPERM;
	}
	if (test_bit(insn->opcode1, (unsigned long*)good_insns_32))
		return 0;
	if (insn->opcode.nbytes == 2) {
		if (test_bit(insn->opcode2,
				(unsigned long*)good_2byte_insns))
			return 0;
		report_bad_2byte_opcode(insn->opcode2);
	} else
		report_bad_1byte_opcode(32, insn->opcode1);
	return -EPERM;
}

/*
 * Decode ppt->insn as a 64-bit-mode instruction and verify that it's
 * one we know how to probe.  Returns 0 if probeable, -EPERM otherwise.
 */
static int validate_insn_64bits(struct uprobe_probept *ppt, struct insn *insn)
{
	insn_init(insn, ppt->insn, true);

	/* Skip good instruction prefixes; reject "bad" ones. */
	insn_get_opcode(insn);
	if (insn->prefixes.value & UP_BAD_PREFIXES) {
		report_bad_1byte_opcode(64, insn->opcode1);
		return -EPERM;
	}
	if (test_bit(insn->opcode1, (unsigned long*)good_insns_64))
		return 0;
	if (insn->opcode.nbytes == 2) {
		if (test_bit(insn->opcode2,
				(unsigned long*)good_2byte_insns))
			return 0;
		report_bad_2byte_opcode(insn->opcode2);
	} else
		report_bad_1byte_opcode(64, insn->opcode1);
	return -EPERM;
}

#ifdef CONFIG_X86_64
static int handle_riprel_insn(struct uprobe_probept *ppt, struct insn *insn);
#endif

/*
 * Validate the instruction at the probepoint for the mode (32/64-bit)
 * of @tsk, and (on x86_64) munge rip-relative instructions.
 */
static int arch_validate_probed_insn(struct uprobe_probept *ppt,
						struct task_struct *tsk)
{
	int ret;
	struct insn insn;

#ifdef CONFIG_X86_64
	ppt->arch_info.flags = 0x0;
	ppt->arch_info.rip_target_address = 0x0;
#endif
	if (is_32bit_app(tsk))
		return validate_insn_32bits(ppt, &insn);
	if ((ret = validate_insn_64bits(ppt, &insn)) != 0)
		return ret;
#ifdef CONFIG_X86_64
	(void) handle_riprel_insn(ppt, &insn);
#endif
	return 0;
}

#ifdef CONFIG_X86_64
/*
 * Returns 0 if the indicated instruction has no immediate operand
 * and/or can't use rip-relative addressing.  Otherwise returns
 * the size of the immediate operand in the instruction.  (Note that
 * for instructions such as "movq $7,xxxx(%rip)" the immediate-operand
 * field is 4 bytes, even though 8 bytes are stored.)
 */
static int immediate_operand_size(struct insn *insn)
{
	u8 reg = insn_extract_reg(insn->modrm.value);
	bool operand_size_prefix =
		((insn->prefixes.value & X86_PFX_OPNDSZ) != 0x0);

	BUG_ON(!insn_field_exists(&insn->modrm));
	switch (insn->opcode1) {
	case 0x6b:	/* imul immed,mem,reg */
	case 0x80:	/* Group 1 */
	case 0x83:	/* Group 1 */
	case 0xc0:	/* Group 2 */
	case 0xc1:	/* Group 2 */
	case 0xc6:	/* Group 11 */
		return 1;
	case 0x69:	/* imul immed,mem,reg */
	case 0x81:	/* Group 1 */
	case 0xc7:	/* Group 11 */
		return (operand_size_prefix ? 2 : 4);
	case 0xf6:	/* Group 3, reg field == 0 or 1 */
		return (reg > 1 ? 0 : 1);
	case 0xf7:	/* Group 3, reg field == 0 or 1 */
		if (reg > 1)
			return 0;
		return (operand_size_prefix ? 2 : 4);
	case 0x0f:	/* 2-byte opcodes */
		switch (insn->opcode2) {
		/*
		 * Note: 0x71-73 (Groups 12-14) have immediate operands,
		 * but not memory operands.
		 */
		case 0x70:	/* pshuf* immed,mem,reg */
		case 0xa4:	/* shld immed,reg,mem */
		case 0xac:	/* shrd immed,reg,mem */
		case 0xc2:	/* cmpps or cmppd */
		case 0xc4:	/* pinsrw */
		case 0xc5:	/* pextrw */
		case 0xc6:	/* shufps or shufpd */
		case 0x0f:	/* 3DNow extensions */
			return 1;
		default:
			return 0;
		}
	}
	return 0;
}

/*
 * If pp->insn doesn't use rip-relative addressing, return 0.  Otherwise,
 * rewrite the instruction so that it accesses its memory operand
 * indirectly through a scratch register.  Set flags and rip_target_address
 * in ppt->arch_info accordingly.  (The contents of the scratch register
 * will be saved before we single-step the modified instruction, and
 * restored afterward.)  Return 1.
 *
 * We do this because a rip-relative instruction can access only a
 * relatively small area (+/- 2 GB from the instruction), and the SSOL
 * area typically lies beyond that area.  At least for instructions
 * that store to memory, we can't single-step the original instruction
 * and "fix things up" later, because the misdirected store could be
 * disastrous.
 *
 * Some useful facts about rip-relative instructions:
 * - There's always a modrm byte.
 * - There's never a SIB byte.
 * - The offset is always 4 bytes.
 */
static int handle_riprel_insn(struct uprobe_probept *ppt, struct insn *insn)
{
	u8 *cursor;
	u8 reg;
	int immed_size, instruction_size;

	if (!insn_rip_relative(insn))
		return 0;

	/*
	 * We have a rip-relative instruction.  Point cursor at the
	 * modrm byte.  The next 4 bytes are the offset.  Beyond the
	 * offset, for some instructions, is the immediate operand.
	 */
	cursor = ppt->insn + insn->prefixes.nbytes + insn->opcode.nbytes;
	immed_size = immediate_operand_size(insn);
	instruction_size = insn->prefixes.nbytes + insn->opcode.nbytes
		+ 1		/* modrm byte */
		+ 4		/* offset */
		+ immed_size;	/* immediate field */

#undef DEBUG_UPROBES_RIP
#ifdef DEBUG_UPROBES_RIP
	{
		int i;
		BUG_ON(instruction_size > 15);
		printk(KERN_INFO "Munging rip-relative insn:");
		for (i = 0; i < instruction_size; i++)
			printk(" %2.2x", ppt->insn[i]);
		printk("\n");
	}
#endif

	/*
	 * Convert from rip-relative addressing to indirect addressing
	 * via a scratch register.  Change the r/m field from 0x5 (%rip)
	 * to 0x0 (%rax) or 0x1 (%rcx), and squeeze out the offset field.
	 */
	reg = insn_extract_reg(insn->modrm.value);
	if (reg == 0) {
		/*
		 * The register operand (if any) is either the A register
		 * (%rax, %eax, etc.) or (if the 0x4 bit is set in the
		 * REX prefix) %r8.  In any case, we know the C register
		 * is NOT the register operand, so we use %rcx (register
		 * #1) for the scratch register.
		 */
		ppt->arch_info.flags = UPFIX_RIP_RCX;
		/* Change modrm from 00 000 101 to 00 000 001. */
		*cursor = 0x1;
	} else {
		/* Use %rax (register #0) for the scratch register. */
		ppt->arch_info.flags = UPFIX_RIP_RAX;
		/* Change modrm from 00 xxx 101 to 00 xxx 000 */
		*cursor = (reg << 3);
	}

	/* Target address = address of next instruction + (signed) offset */
	cursor++;
	ppt->arch_info.rip_target_address = (long) ppt->vaddr
		+ instruction_size + *((s32*)cursor);

	/* Squeeze out the 4-byte offset, moving the immediate (if any) up. */
	if (immed_size)
		memmove(cursor, cursor+4, immed_size);

#ifdef DEBUG_UPROBES_RIP
	{
		int i;
		printk(KERN_INFO "Munged rip-relative insn: ");
		for (i = 0; i < instruction_size-4; i++)
			printk(" %2.2x", ppt->insn[i]);
		printk("\n");
		printk(KERN_INFO "Target address = %#lx\n",
			ppt->arch_info.rip_target_address);
	}
#endif
	return 1;
}
#endif

/*
 * Get an instruction slot from the process's SSOL area, containing the
 * instruction at ppt's probepoint.  Point the rip at that slot, in
 * preparation for single-stepping out of line.
 *
 * If we're emulating a rip-relative instruction, save the contents
 * of the scratch register and store the target address in that register.
 */
static void uprobe_pre_ssout(struct uprobe_task *utask,
		struct uprobe_probept *ppt, struct pt_regs *regs)
{
	struct uprobe_ssol_slot *slot;

	slot = uprobe_get_insn_slot(ppt);
	if (!slot) {
		utask->doomed = 1;
		return;
	}
	regs->ip = (long)slot->insn;
	utask->singlestep_addr = regs->ip;
#ifdef CONFIG_X86_64
	if (ppt->arch_info.flags == UPFIX_RIP_RAX) {
		utask->arch_info.saved_scratch_register = regs->ax;
		regs->ax = ppt->arch_info.rip_target_address;
	} else if (ppt->arch_info.flags == UPFIX_RIP_RCX) {
		utask->arch_info.saved_scratch_register = regs->cx;
		regs->cx = ppt->arch_info.rip_target_address;
	}
#endif
}

/*
 * Called by uprobe_post_ssout() to adjust the return address
 * pushed by a call instruction executed out of line.
 */
static void adjust_ret_addr(unsigned long rsp, long correction,
		struct uprobe_task *utask)
{
	unsigned long nleft;

	if (is_32bit_app(current)) {
		s32 ra;
		nleft = copy_from_user(&ra, (const void __user *) rsp, 4);
		if (unlikely(nleft != 0))
			goto fail;
		ra += (s32) correction;
		nleft = copy_to_user((void __user *) rsp, &ra, 4);
		if (unlikely(nleft != 0))
			goto fail;
	} else {
		s64 ra;
		nleft = copy_from_user(&ra, (const void __user *) rsp, 8);
		if (unlikely(nleft != 0))
			goto fail;
		ra += correction;
		nleft = copy_to_user((void __user *) rsp, &ra, 8);
		if (unlikely(nleft != 0))
			goto fail;
	}
	return;

fail:
	printk(KERN_ERR "uprobes: Failed to adjust return address after"
		" single-stepping call instruction;"
		" pid=%d, rsp=%#lx\n", current->pid, rsp);
	utask->doomed = 1;
}

/*
 * Called after single-stepping.  ppt->vaddr is the address of the
 * instruction whose first byte has been replaced by the "int3"
 * instruction.  To avoid the SMP problems that can occur when we
 * temporarily put back the original opcode to single-step, we
 * single-stepped a copy of the instruction.  The address of this
 * copy is utask->singlestep_addr.
 *
 * This function prepares to return from the post-single-step
 * trap.  We have to fix things up as follows:
 *
 * 0) Typically, the new rip is relative to the copied instruction.  We
 * need to make it relative to the original instruction.  Exceptions are
 * return instructions and absolute or indirect jump or call instructions.
 *
 * 1) If the single-stepped instruction was a call, the return address
 * that is atop the stack is the address following the copied instruction.
 * We need to make it the address following the original instruction.
 *
 * 2) If the original instruction was a rip-relative instruction such as
 * "movl %edx,0xnnnn(%rip)", we have instead executed an equivalent
 * instruction using a scratch register -- e.g., "movl %edx,(%rax)".
 * We need to restore the contents of the scratch register and adjust
 * the rip, keeping in mind that the instruction we executed is 4 bytes
 * shorter than the original instruction (since we squeezed out the offset
 * field).
 */
static void uprobe_post_ssout(struct uprobe_task *utask,
		struct uprobe_probept *ppt, struct pt_regs *regs)
{
	unsigned long next_ip = 0;
	unsigned long copy_ip = utask->singlestep_addr;
	unsigned long orig_ip = ppt->vaddr;
	long correction = (long) (orig_ip - copy_ip);
	uprobe_opcode_t *insn = ppt->insn;
#ifdef CONFIG_X86_64
	unsigned long flags = ppt->arch_info.flags;
#endif

	up_read(&ppt->slot->rwsem);
#ifdef CONFIG_X86_64
	if (flags & (UPFIX_RIP_RAX | UPFIX_RIP_RCX)) {
		if (flags & UPFIX_RIP_RAX)
			regs->ax = utask->arch_info.saved_scratch_register;
		else
			regs->cx = utask->arch_info.saved_scratch_register;
		/*
		 * The original instruction includes a displacement, and so
		 * is 4 bytes longer than what we've just single-stepped.
		 * Fall through to handle stuff like "jmpq *...(%rip)" and
		 * "callq *...(%rip)".
		 */
		correction += 4;
	}
#endif
	/*
	 * TODO: Move all this instruction parsing to
	 * arch_validate_probed_insn(), and store what we learn in
	 * ppt->arch_info.flags.
	 *
	 * We don't bother skipping prefixes here because none of the
	 * instructions that require special treatment (other than
	 * rip-relative instructions, handled above) involve prefixes.
	 */
	switch (*insn) {
	case 0xc3:	/* ret/lret */
	case 0xcb:
	case 0xc2:
	case 0xca:
		/* rip is correct */
		next_ip = regs->ip;
		break;
	case 0xe8:	/* call relative - Fix return addr */
		adjust_ret_addr(regs->sp, correction, utask);
		break;
	case 0x9a:	/* call absolute - Fix return addr */
		adjust_ret_addr(regs->sp, correction, utask);
		next_ip = regs->ip;
		break;
	case 0xff:
		if ((insn[1] & 0x30) == 0x10) {
			/* call absolute, indirect */
			/* Fix return addr; rip is correct. */
			next_ip = regs->ip;
			adjust_ret_addr(regs->sp, correction, utask);
		} else if ((insn[1] & 0x31) == 0x20 ||
				/* jmp near, absolute indirect */
			   (insn[1] & 0x31) == 0x21) {
				/* jmp far, absolute indirect */
			/*
			 * rip is correct.
			 * NOTE(review): these 0x31 masks look odd -- the
			 * ModRM reg field is bits 5:3, so the usual test
			 * would be (insn[1] & 0x38) == 0x20 / 0x28.
			 * Preserved as-is; confirm against the original
			 * i386 uprobes code before changing.
			 */
			next_ip = regs->ip;
		}
		break;
	case 0xea:	/* jmp absolute -- rip is correct */
		next_ip = regs->ip;
		break;
	default:
		break;
	}

	if (next_ip)
		regs->ip = next_ip;
	else
		regs->ip += correction;
}

/*
 * Replace the return address with the trampoline address.  Returns
 * the original return address.
 */
static unsigned long arch_hijack_uret_addr(unsigned long trampoline_address,
		struct pt_regs *regs, struct uprobe_task *utask)
{
	int nleft;
	unsigned long orig_ret_addr = 0;  /* clear high bits for 32-bit apps */
	size_t rasize;

	if (is_32bit_app(current))
		rasize = 4;
	else
		rasize = 8;
	nleft = copy_from_user(&orig_ret_addr,
			(const void __user *) regs->sp, rasize);
	if (unlikely(nleft != 0))
		return 0;
	if (orig_ret_addr == trampoline_address)
		/*
		 * There's another uretprobe on this function, and it was
		 * processed first, so the return address has already
		 * been hijacked.
		 */
		return orig_ret_addr;

	nleft = copy_to_user((void __user *) regs->sp,
			&trampoline_address, rasize);
	if (unlikely(nleft != 0)) {
		if (nleft != rasize) {
			printk(KERN_ERR "uretprobe_entry_handler: "
				"return address partially clobbered -- "
				"pid=%d, %%sp=%#lx, %%ip=%#lx\n",
				current->pid, regs->sp, regs->ip);
			utask->doomed = 1;
		} // else nothing written, so no harm
		return 0;
	}
	return orig_ret_addr;
}

#include "insn_x86.c"