The branch main has been updated by markj:

URL: https://cgit.FreeBSD.org/src/commit/?id=f0bc4ed144fc59b6f72d90c46b91ca803d3b29ce

commit f0bc4ed144fc59b6f72d90c46b91ca803d3b29ce
Author:     Christos Margiolis <[email protected]>
AuthorDate: 2022-10-11 15:28:17 +0000
Commit:     Mark Johnston <[email protected]>
CommitDate: 2022-10-11 22:19:08 +0000

    kinst: Initial revision
    
    This is a new DTrace provider which allows arbitrary kernel instructions
    to be traced.  Currently it is implemented only for amd64.
    
    kinst probes are created on demand by libdtrace, and there is a probe
    for each kernel instruction.  Probes are named
    kinst:<module>:<function>:<offset>, where "offset" is the offset of the
    target instruction relative to the beginning of the function.  Omitting
    "offset" causes all instructions in the function to be traced.
    
    kinst works similarly to FBT in that it places a breakpoint on the
    target instruction and hooks into the kernel breakpoint handler.
    Because kinst has to be able to trace arbitrary instructions, it does
    not emulate most of them in software but rather causes the traced thread
    to execute a copy of the instruction before returning to the original
    code.
    
    The provider is quite low-level and as-is will be useful mostly only to
    kernel developers.  However, it provides a great deal of visibility into
    kernel code execution and could be used as a building block for
    higher-level tooling which can in some sense translate between C sources
    and generated machine code.  In particular, the "regs" variable recently
    added to D allows the CPU's register file to be accessed from kinst
    probes.
    
    kinst is experimental and should not be used on production systems for
    now.
    
    In collaboration with:  markj
    Sponsored by:           Google, Inc. (GSoC 2022)
    MFC after:              3 months
    Differential Revision:  https://reviews.freebsd.org/D36851
---
 sys/cddl/dev/dtrace/dtrace_cddl.h    |   2 +
 sys/cddl/dev/kinst/amd64/kinst_isa.c | 550 +++++++++++++++++++++++++++++++++++
 sys/cddl/dev/kinst/amd64/kinst_isa.h |  45 +++
 sys/cddl/dev/kinst/kinst.c           | 233 +++++++++++++++
 sys/cddl/dev/kinst/kinst.h           |  71 +++++
 sys/cddl/dev/kinst/trampoline.c      | 303 +++++++++++++++++++
 sys/modules/dtrace/Makefile          |   1 +
 sys/modules/dtrace/kinst/Makefile    |  17 ++
 8 files changed, 1222 insertions(+)

diff --git a/sys/cddl/dev/dtrace/dtrace_cddl.h b/sys/cddl/dev/dtrace/dtrace_cddl.h
index 08b6f80d4bae..b2397d621355 100644
--- a/sys/cddl/dev/dtrace/dtrace_cddl.h
+++ b/sys/cddl/dev/dtrace/dtrace_cddl.h
@@ -88,6 +88,7 @@ typedef struct kdtrace_thread {
        void            *td_systrace_args; /* syscall probe arguments. */
        uint64_t        td_fasttrap_tp_gen; /* Tracepoint hash table gen. */
        struct trapframe *td_dtrace_trapframe; /* Trap frame from invop. */
+       void            *td_kinst;
 } kdtrace_thread_t;
 
 /*
@@ -117,6 +118,7 @@ typedef struct kdtrace_thread {
 #define        t_dtrace_systrace_args  td_dtrace->td_systrace_args
 #define        t_fasttrap_tp_gen       td_dtrace->td_fasttrap_tp_gen
 #define        t_dtrace_trapframe      td_dtrace->td_dtrace_trapframe
+#define        t_kinst         td_dtrace->td_kinst
 #define        p_dtrace_helpers        p_dtrace->p_dtrace_helpers
 #define        p_dtrace_count  p_dtrace->p_dtrace_count
 #define        p_dtrace_probes p_dtrace->p_dtrace_probes
diff --git a/sys/cddl/dev/kinst/amd64/kinst_isa.c b/sys/cddl/dev/kinst/amd64/kinst_isa.c
new file mode 100644
index 000000000000..6d8d5d521617
--- /dev/null
+++ b/sys/cddl/dev/kinst/amd64/kinst_isa.c
@@ -0,0 +1,550 @@
+/*
+ * SPDX-License-Identifier: CDDL 1.0
+ *
+ * Copyright 2022 Christos Margiolis <[email protected]>
+ * Copyright 2022 Mark Johnston <[email protected]>
+ */
+
+#include <sys/param.h>
+
+#include <machine/cpufunc.h>
+#include <machine/md_var.h>
+
+#include <sys/dtrace.h>
+#include <cddl/dev/dtrace/dtrace_cddl.h>
+#include <dis_tables.h>
+
+#include "kinst.h"
+
+/* Single-byte opcodes referenced by the provider. */
+#define KINST_PUSHL_RBP                0x55
+#define KINST_STI              0xfb
+#define KINST_POPF             0x9d
+
+/* Field extractors for a ModR/M byte. */
+#define KINST_MODRM_MOD(b)     (((b) & 0xc0) >> 6)
+#define KINST_MODRM_REG(b)     (((b) & 0x38) >> 3)
+#define KINST_MODRM_RM(b)      ((b) & 0x07)
+
+/* Field extractors for a SIB byte. */
+#define KINST_SIB_SCALE(s)     (((s) & 0xc0) >> 6)
+#define KINST_SIB_INDEX(s)     (((s) & 0x38) >> 3)
+#define KINST_SIB_BASE(s)      (((s) & 0x07) >> 0)
+
+/* Bit extractors for a REX prefix byte. */
+#define KINST_REX_W(r)         (((r) & 0x08) >> 3)
+#define KINST_REX_R(r)         (((r) & 0x04) >> 2)
+#define KINST_REX_X(r)         (((r) & 0x02) >> 1)
+#define KINST_REX_B(r)         (((r) & 0x01) >> 0)
+
+/* Values for kinst_probe_md.flags. */
+#define KINST_F_CALL           0x0001  /* instruction is a "call" */
+#define KINST_F_DIRECT_CALL    0x0002  /* instruction is a direct call */
+#define KINST_F_RIPREL         0x0004  /* instruction is position-dependent */
+#define KINST_F_JMP            0x0008  /* instruction is a jmp */
+#define KINST_F_MOD_DIRECT     0x0010  /* operand is not a memory address */
+
+/*
+ * Translate a 4-bit register number (a ModR/M or SIB register field combined
+ * with its REX extension bit) into the index of that register's slot when a
+ * trapframe is viewed as an array of register_t.
+ *
+ * Any value outside 0..15 is a programming error and panics.
+ */
+static int
+kinst_regoff(int reg)
+{
+#define        KINST_TF_SLOT(r)                        \
+       (offsetof(struct trapframe, tf_ ## r) / sizeof(register_t))
+       /* Indexed by register number; entry 4 (%rsp) means SIB when mod != 3. */
+       static const int tf_slots[] = {
+               KINST_TF_SLOT(rax), KINST_TF_SLOT(rcx),
+               KINST_TF_SLOT(rdx), KINST_TF_SLOT(rbx),
+               KINST_TF_SLOT(rsp), KINST_TF_SLOT(rbp),
+               KINST_TF_SLOT(rsi), KINST_TF_SLOT(rdi),
+               /* Entries 8-15 are reachable only with a REX extension bit. */
+               KINST_TF_SLOT(r8), KINST_TF_SLOT(r9),
+               KINST_TF_SLOT(r10), KINST_TF_SLOT(r11),
+               KINST_TF_SLOT(r12), KINST_TF_SLOT(r13),
+               KINST_TF_SLOT(r14), KINST_TF_SLOT(r15),
+       };
+#undef KINST_TF_SLOT
+
+       if (reg < 0 || reg >= (int)(sizeof(tf_slots) / sizeof(tf_slots[0])))
+               panic("%s: unhandled register index %d", __func__, reg);
+       return (tf_slots[reg]);
+}
+
+/*
+ * Fetch a register's saved value from the trapframe.  A register number of -1
+ * denotes "no register" and yields 0, so absent base/index operands drop out
+ * of address arithmetic.
+ */
+static uint64_t
+kinst_regval(struct trapframe *frame, int reg)
+{
+       register_t *slots;
+
+       if (reg != -1) {
+               slots = (register_t *)frame;
+               return (slots[kinst_regoff(reg)]);
+       }
+       return (0);
+}
+
+/*
+ * Compute the 32-bit displacement from "dst" to the address
+ * kp_patchpoint + kp_md.disp, truncated to 32 bits.
+ */
+static uint32_t
+kinst_riprel_disp(struct kinst_probe *kp, void *dst)
+{
+       intptr_t from, to;
+
+       to = (intptr_t)kp->kp_patchpoint + kp->kp_md.disp;
+       from = (intptr_t)dst;
+       return ((uint32_t)(to - from));
+}
+
+/*
+ * Fill the trampoline "tramp" for probe "kp": first the instruction template
+ * (with its displacement re-based onto the trampoline address when the
+ * instruction is position-dependent), then a jump back to the instruction
+ * following the traced one.
+ */
+static void
+kinst_trampoline_populate(struct kinst_probe *kp, uint8_t *tramp)
+{
+       /*
+        * "jmp *0(%rip)" (six bytes).  It is position-independent and reads
+        * its absolute eight-byte target from the bytes immediately after it.
+        */
+       static const uint8_t jmp_rip0[] = {
+               0xff, 0x25, 0x00, 0x00, 0x00, 0x00
+       };
+       struct kinst_probe_md *md;
+       uint8_t *jmp, *resume;
+       uint32_t reldisp;
+
+       md = &kp->kp_md;
+
+       memcpy(tramp, md->template, md->tinstlen);
+       if ((md->flags & KINST_F_RIPREL) != 0) {
+               reldisp = kinst_riprel_disp(kp, tramp);
+               memcpy(&tramp[md->dispoff], &reldisp, sizeof(uint32_t));
+       }
+
+       jmp = &tramp[md->tinstlen];
+       memcpy(jmp, jmp_rip0, sizeof(jmp_rip0));
+
+       /* Resume at the instruction after the one that was traced. */
+       resume = kp->kp_patchpoint + md->instlen;
+       memcpy(jmp + sizeof(jmp_rip0), &resume, sizeof(uintptr_t));
+}
+
+/*
+ * Breakpoint hook for kinst probes (installed with dtrace_invop_add()).
+ *
+ * "addr" is the address of the breakpoint that fired, "frame" the trapframe
+ * of the interrupted thread, and "scratch" the address of a stack slot into
+ * which a call's return address may be stored.
+ *
+ * Returns 0 if no kinst probe is installed at "addr".  Otherwise the probe is
+ * fired and the function returns DTRACE_INVOP_CALL after emulating a call
+ * instruction, or DTRACE_INVOP_NOP after redirecting execution (to the
+ * thread's trampoline, or back to the restored original instruction).
+ */
+int
+kinst_invop(uintptr_t addr, struct trapframe *frame, uintptr_t scratch)
+{
+       solaris_cpu_t *cpu;
+       uintptr_t *stack, retaddr;
+       struct kinst_probe *kp;
+       struct kinst_probe_md *kpmd;
+       uint8_t *tramp;
+
+       stack = (uintptr_t *)frame->tf_rsp;
+       cpu = &solaris_cpu[curcpu];
+
+       /* Find the probe whose patchpoint is the faulting address. */
+       LIST_FOREACH(kp, KINST_GETPROBE(addr), kp_hashnext) {
+               if ((uintptr_t)kp->kp_patchpoint == addr)
+                       break;
+       }
+       if (kp == NULL)
+               return (0);
+
+       /* Read the word at the top of the stack under CPU_DTRACE_NOFAULT. */
+       DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+       cpu->cpu_dtrace_caller = stack[0];
+       DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | CPU_DTRACE_BADADDR);
+       dtrace_probe(kp->kp_id, 0, 0, 0, 0, 0);
+       cpu->cpu_dtrace_caller = 0;
+
+       kpmd = &kp->kp_md;
+       if ((kpmd->flags & KINST_F_CALL) != 0) {
+               /*
+                * dtrace_invop_start() reserves space on the stack to
+                * store the return address of the call instruction.
+                */
+               retaddr = (uintptr_t)(kp->kp_patchpoint + kpmd->instlen);
+               *(uintptr_t *)scratch = retaddr;
+
+               if ((kpmd->flags & KINST_F_DIRECT_CALL) != 0) {
+                       /* Target is relative to the next instruction. */
+                       frame->tf_rip = (uintptr_t)(kp->kp_patchpoint +
+                           kpmd->disp + kpmd->instlen);
+               } else {
+                       register_t rval;
+
+                       /*
+                        * Both operand registers being absent is taken to
+                        * mean that the operand is %rip-relative.
+                        */
+                       if (kpmd->reg1 == -1 && kpmd->reg2 == -1) {
+                               /* rip-relative */
+                               rval = frame->tf_rip - 1 + kpmd->instlen;
+                       } else {
+                               /* indirect */
+                               rval = kinst_regval(frame, kpmd->reg1) +
+                                   (kinst_regval(frame, kpmd->reg2) <<
+                                   kpmd->scale);
+                       }
+
+                       /*
+                        * A register operand holds the target itself; a
+                        * memory operand holds the address of the target.
+                        */
+                       if ((kpmd->flags & KINST_F_MOD_DIRECT) != 0) {
+                               frame->tf_rip = rval + kpmd->disp;
+                       } else {
+                               frame->tf_rip =
+                                   *(uintptr_t *)(rval + kpmd->disp);
+                       }
+               }
+               return (DTRACE_INVOP_CALL);
+       } else {
+               tramp = curthread->t_kinst;
+               if (tramp == NULL) {
+                       /*
+                        * A trampoline allocation failed, so this probe is
+                        * effectively disabled.  Restore the original
+                        * instruction.
+                        *
+                        * We can't safely print anything here, but the
+                        * trampoline allocator should have left a breadcrumb in
+                        * the dmesg.
+                        */
+                       kinst_patch_tracepoint(kp, kp->kp_savedval);
+                       frame->tf_rip = (register_t)kp->kp_patchpoint;
+               } else {
+                       /* Execute the copied instruction from the trampoline. */
+                       kinst_trampoline_populate(kp, tramp);
+                       frame->tf_rip = (register_t)tramp;
+               }
+               return (DTRACE_INVOP_NOP);
+       }
+}
+
+/*
+ * Store "val" (a single byte, see kinst_patchval_t) at the probe's
+ * patchpoint.  The store is bracketed by intr_disable()/intr_restore() and
+ * disable_wp()/restore_wp().
+ * NOTE(review): presumably disable_wp() lifts kernel write protection so
+ * that the text page can be modified -- confirm against <machine/cpufunc.h>.
+ */
+void
+kinst_patch_tracepoint(struct kinst_probe *kp, kinst_patchval_t val)
+{
+       register_t reg;
+       int oldwp;
+
+       reg = intr_disable();
+       oldwp = disable_wp();
+       *kp->kp_patchpoint = val;
+       restore_wp(oldwp);
+       intr_restore(reg);
+}
+
+/*
+ * Record an 8-bit displacement, sign-extended to 64 bits, in the probe.
+ */
+static void
+kinst_set_disp8(struct kinst_probe *kp, uint8_t byte)
+{
+       int8_t disp8;
+
+       disp8 = (int8_t)byte;
+       kp->kp_md.disp = (int64_t)disp8;
+}
+
+/*
+ * Record the 32-bit displacement stored at "bytes", sign-extended to 64 bits,
+ * in the probe.  The value is copied out with memcpy() since "bytes" need not
+ * be aligned.
+ */
+static void
+kinst_set_disp32(struct kinst_probe *kp, uint8_t *bytes)
+{
+       struct kinst_probe_md *md;
+       int32_t raw;
+
+       md = &kp->kp_md;
+       memcpy(&raw, bytes, sizeof(raw));
+       md->disp = (int64_t)raw;
+}
+
+/*
+ * Byte-fetch callback for the disassembler: "p" points to the instruction
+ * cursor; return the byte under the cursor and advance it by one.
+ */
+static int
+kinst_dis_get_byte(void *p)
+{
+       uint8_t **cursor;
+       uint8_t byte;
+
+       cursor = p;
+       byte = **cursor;
+       *cursor += 1;
+
+       return (byte);
+}
+
+/*
+ * Set up all of the state needed to faithfully execute a probed instruction.
+ *
+ * In the simple case, we copy the instruction unmodified to a per-thread
+ * trampoline, wherein it is followed by a jump back to the original code.
+ * - Instructions can have %rip as an operand:
+ *   - with %rip-relative addressing encoded in ModR/M, or
+ *   - implicitly as a part of the instruction definition (jmp, call).
+ * - Call instructions (which may be %rip-relative) need to push the correct
+ *   return address onto the stack.
+ *
+ * Call instructions are simple enough to be emulated in software, so we simply
+ * do not use the trampoline mechanism in that case.  kinst_invop() will
+ * compute the branch target using the address info computed here (register
+ * operands and displacement).
+ *
+ * %rip-relative operands encoded using the ModR/M byte always use a 32-bit
+ * displacement; when populating the trampoline the displacement is adjusted to
+ * be relative to the trampoline address.  Trampolines are always allocated
+ * above KERNBASE for this reason.
+ *
+ * For other %rip-relative operands (just jumps) we take the same approach.
+ * Instructions which specify an 8-bit displacement must be rewritten to use a
+ * 32-bit displacement.
+ *
+ * "instr" points to the cursor over the code being decoded; the disassembler
+ * advances it as it consumes bytes.  Returns 0 on success, or EINVAL if the
+ * instruction cannot be disassembled or cannot be relocated or emulated.
+ */
+static int
+kinst_instr_dissect(struct kinst_probe *kp, uint8_t **instr)
+{
+       struct kinst_probe_md *kpmd;
+       dis86_t d86;
+       uint8_t *bytes, modrm, rex;
+       int dispoff, i, ilen, opcidx;
+
+       kpmd = &kp->kp_md;
+
+       d86.d86_data = instr;
+       d86.d86_get_byte = kinst_dis_get_byte;
+       d86.d86_check_func = NULL;
+       if (dtrace_disx86(&d86, SIZE64) != 0) {
+               KINST_LOG("failed to disassemble instruction at: %p", *instr);
+               return (EINVAL);
+       }
+       bytes = d86.d86_bytes;
+       kpmd->instlen = kpmd->tinstlen = d86.d86_len;
+
+       /*
+        * Skip over prefixes, save REX.
+        */
+       rex = 0;
+       for (i = 0; i < kpmd->instlen; i++) {
+               switch (bytes[i]) {
+               case 0xf0 ... 0xf3:
+                       /* group 1 */
+                       continue;
+               case 0x26:
+               case 0x2e:
+               case 0x36:
+               case 0x3e:
+               case 0x64:
+               case 0x65:
+                       /* group 2 */
+                       continue;
+               case 0x66:
+                       /* group 3 */
+                       continue;
+               case 0x67:
+                       /* group 4 */
+                       continue;
+               case 0x40 ... 0x4f:
+                       /* REX */
+                       rex = bytes[i];
+                       continue;
+               }
+               break;
+       }
+       KASSERT(i < kpmd->instlen,
+           ("%s: failed to disassemble instruction at %p", __func__, bytes));
+       opcidx = i;
+
+       /*
+        * Identify instructions of interest by opcode: calls and jumps.
+        * Extract displacements.
+        */
+       dispoff = -1;
+       switch (bytes[opcidx]) {
+       case 0x0f:
+               switch (bytes[opcidx + 1]) {
+               case 0x80 ... 0x8f:
+                       /* conditional jmp near */
+                       kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL;
+                       dispoff = opcidx + 2;
+                       kinst_set_disp32(kp, &bytes[dispoff]);
+                       break;
+               }
+               break;
+       case 0xe3:
+               /*
+                * There is no straightforward way to translate this instruction
+                * to use a 32-bit displacement.  Fortunately, it is rarely
+                * used.
+                */
+               return (EINVAL);
+       case 0x70 ... 0x7f:
+               /* conditional jmp short */
+               kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL;
+               dispoff = opcidx + 1;
+               kinst_set_disp8(kp, bytes[dispoff]);
+               break;
+       case 0xe9:
+               /* unconditional jmp near */
+               kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL;
+               dispoff = opcidx + 1;
+               kinst_set_disp32(kp, &bytes[dispoff]);
+               break;
+       case 0xeb:
+               /* unconditional jmp short */
+               kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL;
+               dispoff = opcidx + 1;
+               kinst_set_disp8(kp, bytes[dispoff]);
+               break;
+       case 0xe8:
+       case 0x9a:
+               /* direct call */
+               kpmd->flags |= KINST_F_CALL | KINST_F_DIRECT_CALL;
+               dispoff = opcidx + 1;
+               kinst_set_disp32(kp, &bytes[dispoff]);
+               break;
+       case 0xff:
+               KASSERT(d86.d86_got_modrm,
+                   ("no ModR/M byte for instr at %p", *instr - kpmd->instlen));
+               switch (KINST_MODRM_REG(bytes[d86.d86_rmindex])) {
+               case 0x02:
+               case 0x03:
+                       /* indirect call */
+                       kpmd->flags |= KINST_F_CALL;
+                       break;
+               case 0x04:
+               case 0x05:
+                       /* indirect jump */
+                       kpmd->flags |= KINST_F_JMP;
+                       break;
+               }
+       }
+
+       /*
+        * If there's a ModR/M byte, we need to check it to see if the operand
+        * is %rip-relative, and rewrite the displacement if so.  If not, we
+        * might still have to extract operand info if this is a call
+        * instruction.
+        */
+       if (d86.d86_got_modrm) {
+               uint8_t mod, rm, sib;
+
+               kpmd->reg1 = kpmd->reg2 = -1;
+
+               modrm = bytes[d86.d86_rmindex];
+               mod = KINST_MODRM_MOD(modrm);
+               rm = KINST_MODRM_RM(modrm);
+               if (mod == 0 && rm == 5) {
+                       kpmd->flags |= KINST_F_RIPREL;
+                       dispoff = d86.d86_rmindex + 1;
+                       kinst_set_disp32(kp, &bytes[dispoff]);
+               } else if ((kpmd->flags & KINST_F_CALL) != 0) {
+                       bool havesib;
+
+                       havesib = (mod != 3 && rm == 4);
+                       dispoff = d86.d86_rmindex + (havesib ? 2 : 1);
+                       if (mod == 1)
+                               kinst_set_disp8(kp, bytes[dispoff]);
+                       else if (mod == 2)
+                               kinst_set_disp32(kp, &bytes[dispoff]);
+                       else if (mod == 3)
+                               kpmd->flags |= KINST_F_MOD_DIRECT;
+
+                       if (havesib) {
+                               bool nobase;
+
+                               sib = bytes[d86.d86_rmindex + 1];
+
+                               /*
+                                * A base field of 5 means "no base register,
+                                * disp32 follows" only when mod == 0.  With
+                                * mod == 1 or 2 it selects %rbp/%r13 like any
+                                * other base register.
+                                */
+                               nobase = (mod == 0 && KINST_SIB_BASE(sib) == 5);
+                               if (nobase) {
+                                       kinst_set_disp32(kp, &bytes[dispoff]);
+                               } else {
+                                       kpmd->reg1 = KINST_SIB_BASE(sib) |
+                                           (KINST_REX_B(rex) << 3);
+                               }
+
+                               /*
+                                * An index field of 4 without REX.X means "no
+                                * index register"; it does not select %rsp.
+                                */
+                               if (KINST_SIB_INDEX(sib) != 4 ||
+                                   KINST_REX_X(rex) != 0) {
+                                       kpmd->scale = KINST_SIB_SCALE(sib);
+                                       kpmd->reg2 = KINST_SIB_INDEX(sib) |
+                                           (KINST_REX_X(rex) << 3);
+                               } else if (nobase) {
+                                       /*
+                                        * Absolute disp32 operand with neither
+                                        * base nor index.  kinst_invop() treats
+                                        * reg1 == reg2 == -1 as %rip-relative,
+                                        * so this form cannot be emulated.
+                                        */
+                                       return (EINVAL);
+                               }
+                       } else {
+                               kpmd->reg1 = rm | (KINST_REX_B(rex) << 3);
+                       }
+               }
+       }
+
+       /*
+        * Calls are emulated in software; once operands are decoded we have
+        * nothing else to do.
+        */
+       if ((kpmd->flags & KINST_F_CALL) != 0)
+               return (0);
+
+       /*
+        * Allocate and populate an instruction trampoline template.
+        *
+        * Position-independent instructions can simply be copied, but
+        * position-dependent instructions require some surgery: jump
+        * instructions with an 8-bit displacement need to be converted to use a
+        * 32-bit displacement, and the adjusted displacement needs to be
+        * computed.
+        */
+       ilen = kpmd->instlen;
+       if ((kpmd->flags & KINST_F_RIPREL) != 0) {
+               if ((kpmd->flags & KINST_F_JMP) == 0 ||
+                   bytes[opcidx] == 0x0f ||
+                   bytes[opcidx] == 0xe9 ||
+                   bytes[opcidx] == 0xff) {
+                       /*
+                        * Copy everything but the 32-bit displacement, which
+                        * is filled in when the trampoline is populated.
+                        */
+                       memcpy(kpmd->template, bytes, dispoff);
+                       memcpy(&kpmd->template[dispoff + 4],
+                           &bytes[dispoff + 4], ilen - (dispoff + 4));
+                       kpmd->dispoff = dispoff;
+               } else if (bytes[opcidx] == 0xeb) {
+                       /*
+                        * "jmp rel8" (2 bytes) becomes "jmp rel32" (5 bytes);
+                        * any prefix bytes are carried over in front of it.
+                        */
+                       if (opcidx + 5 > (int)sizeof(kpmd->template))
+                               return (EINVAL);
+                       memcpy(kpmd->template, bytes, opcidx);
+                       kpmd->template[opcidx] = 0xe9;
+                       kpmd->dispoff = opcidx + 1;
+
+                       /* The instruction grows by 3 bytes. */
+                       kpmd->tinstlen = opcidx + 5;
+                       kpmd->disp -= 3;
+               } else if (bytes[opcidx] >= 0x70 && bytes[opcidx] <= 0x7f)  {
+                       /*
+                        * "jcc rel8" (2 bytes) becomes "0x0f jcc rel32"
+                        * (6 bytes); any prefix bytes are carried over.
+                        */
+                       if (opcidx + 6 > (int)sizeof(kpmd->template))
+                               return (EINVAL);
+                       memcpy(kpmd->template, bytes, opcidx);
+                       kpmd->template[opcidx] = 0x0f;
+                       kpmd->template[opcidx + 1] = bytes[opcidx] + 0x10;
+                       kpmd->dispoff = opcidx + 2;
+
+                       /* The instruction grows by 4 bytes. */
+                       kpmd->tinstlen = opcidx + 6;
+                       kpmd->disp -= 4;
+               } else {
+                       panic("unhandled opcode %#x", bytes[opcidx]);
+               }
+       } else {
+               memcpy(kpmd->template, bytes, ilen);
+       }
+
+       return (0);
+}
+
+/*
+ * Symbol callback: create kinst probes for the function described by
+ * "symval" if it matches the probe description passed in "opaque" (a
+ * dtrace_kinst_probedesc_t).
+ *
+ * A probe is created for the instruction at offset kpd_off, or for every
+ * instruction in the function when kpd_off is -1.  Functions whose name does
+ * not match, "trap_check", empty functions and functions that do not start
+ * with "push %rbp" are skipped.
+ *
+ * Returns 0 on success or when nothing needed to be done, ENOMEM if more
+ * than KINST_PROBETAB_MAX probes would be created by this call, or the error
+ * returned by kinst_instr_dissect().
+ */
+int
+kinst_make_probe(linker_file_t lf, int symindx, linker_symval_t *symval,
+    void *opaque)
+{
+       struct kinst_probe *kp;
+       dtrace_kinst_probedesc_t *pd;
+       const char *func;
+       int error, n, off, size;
+       uint8_t *instr, *limit;
+
+       pd = opaque;
+       func = symval->name;
+       if (strcmp(func, pd->kpd_func) != 0 || strcmp(func, "trap_check") == 0)
+               return (0);
+
+       instr = (uint8_t *)symval->value;
+       limit = (uint8_t *)symval->value + symval->size;
+       if (instr >= limit)
+               return (0);
+
+       /*
+        * Ignore functions not beginning with the usual function prologue.
+        * These might correspond to assembly routines with which we should not
+        * meddle.
+        */
+       if (*instr != KINST_PUSHL_RBP)
+               return (0);
+
+       n = 0;
+       while (instr < limit) {
+               off = (int)(instr - (uint8_t *)symval->value);
+               if (pd->kpd_off != -1 && off != pd->kpd_off) {
+                       /*
+                        * Not the requested instruction; step over it.  Stop
+                        * scanning if it cannot be sized, as a non-positive
+                        * step would keep the cursor from advancing.
+                        */
+                       size = dtrace_instr_size(instr);
+                       if (size <= 0)
+                               break;
+                       instr += size;
+                       continue;
+               }
+
+               /*
+                * Prevent separate dtrace(1) instances from creating copies of
+                * the same probe.
+                */
+               LIST_FOREACH(kp, KINST_GETPROBE(instr), kp_hashnext) {
+                       if (strcmp(kp->kp_func, func) == 0 &&
+                           strtol(kp->kp_name, NULL, 10) == off)
+                               return (0);
+               }
+               if (++n > KINST_PROBETAB_MAX) {
+                       KINST_LOG("probe list full: %d entries", n);
+                       return (ENOMEM);
+               }
+               kp = malloc(sizeof(struct kinst_probe), M_KINST,
+                   M_WAITOK | M_ZERO);
+               kp->kp_func = func;
+               snprintf(kp->kp_name, sizeof(kp->kp_name), "%d", off);
+               kp->kp_savedval = *instr;
+               kp->kp_patchval = KINST_PATCHVAL;
+               kp->kp_patchpoint = instr;
+
+               /* On success this also advances "instr" to the next one. */
+               error = kinst_instr_dissect(kp, &instr);
+               if (error != 0) {
+                       /* The probe was never published; don't leak it. */
+                       free(kp, M_KINST);
+                       return (error);
+               }
+
+               kinst_probe_create(kp, lf);
+       }
+
+       return (0);
+}
diff --git a/sys/cddl/dev/kinst/amd64/kinst_isa.h b/sys/cddl/dev/kinst/amd64/kinst_isa.h
new file mode 100644
index 000000000000..4c6387b8cb50
--- /dev/null
+++ b/sys/cddl/dev/kinst/amd64/kinst_isa.h
@@ -0,0 +1,45 @@
+/*
+ * SPDX-License-Identifier: CDDL 1.0
+ *
+ * Copyright 2022 Christos Margiolis <[email protected]>
+ * Copyright 2022 Mark Johnston <[email protected]>
+ */
+
+#ifndef _KINST_ISA_H_
+#define _KINST_ISA_H_
+
+#include <sys/types.h>
+
+/* Breakpoint opcode written over the first byte of a traced instruction. */
+#define KINST_PATCHVAL         0xcc
+
+/*
+ * Each trampoline is 32 bytes long and contains [instruction, jmp]. Since we
+ * have 2 instructions stored in the trampoline, and each of them can take up
+ * to 16 bytes, 32 bytes is enough to cover even the worst case scenario.
+ *
+ * The instruction copy comes from the 16-byte template in struct
+ * kinst_probe_md.
+ */
+#define        KINST_TRAMP_SIZE        32
+#define        KINST_TRAMPCHUNK_SIZE   PAGE_SIZE
+
+/*
+ * Fill the trampolines with breakpoint instructions so that the kernel will
+ * crash cleanly if things somehow go wrong.
+ */
+#define        KINST_TRAMP_INIT(t, s)  memset((t), KINST_PATCHVAL, (s))
+
+/* Type of the value stored at a probe's patchpoint (one byte on amd64). */
+typedef uint8_t kinst_patchval_t;
+
+/*
+ * Machine-dependent per-probe state.  It is filled in when the probed
+ * instruction is decoded and consulted when the probe fires.
+ */
+struct kinst_probe_md {
+       int                     flags;          /* KINST_F_* bits */
+       int                     instlen;        /* original instr len */
+       int                     tinstlen;       /* trampoline instr len */
+       uint8_t                 template[16];   /* copied into thread tramps */
+       int                     dispoff;        /* offset of rip displacement */
+
+       /* operands to "call" instruction branch target */
+       int                     reg1;           /* base register, -1 if none */
+       int                     reg2;           /* index register, -1 if none */
+       int                     scale;          /* left shift applied to reg2 */
+       int64_t                 disp;           /* sign-extended displacement */
+};
+
+#endif /* _KINST_ISA_H_ */
diff --git a/sys/cddl/dev/kinst/kinst.c b/sys/cddl/dev/kinst/kinst.c
new file mode 100644
index 000000000000..a7d04e927fa7
--- /dev/null
+++ b/sys/cddl/dev/kinst/kinst.c
@@ -0,0 +1,233 @@
+/*
+ * SPDX-License-Identifier: CDDL 1.0
+ *
+ * Copyright 2022 Christos Margiolis <[email protected]>
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/kernel.h>
+#include <sys/linker.h>
+#include <sys/module.h>
+
+#include <sys/dtrace.h>
+
+#include "kinst.h"
+
+MALLOC_DEFINE(M_KINST, "kinst", "Kernel Instruction Tracing");
+
+static d_open_t                kinst_open;
+static d_close_t       kinst_close;
+static d_ioctl_t       kinst_ioctl;
+
+static void    kinst_provide_module(void *, modctl_t *);
+static void    kinst_getargdesc(void *, dtrace_id_t, void *,
+                   dtrace_argdesc_t *);
+static void    kinst_destroy(void *, dtrace_id_t, void *);
+static void    kinst_enable(void *, dtrace_id_t, void *);
+static void    kinst_disable(void *, dtrace_id_t, void *);
+static int     kinst_load(void *);
+static int     kinst_unload(void *);
+static int     kinst_modevent(module_t, int, void *);
+
+/*
+ * Stability attributes passed to dtrace_register().
+ * NOTE(review): the five rows presumably follow the dtrace_pattr_t member
+ * order (provider, module, function, name, arguments) -- confirm against
+ * <sys/dtrace.h>.
+ */
+static dtrace_pattr_t kinst_attr = {
+{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
+{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
+{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
+{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
+{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
+};
+
+/*
+ * Provider operations vector passed to dtrace_register().  Operations that
+ * kinst does not implement are left NULL.
+ */
+static dtrace_pops_t kinst_pops = {
+       .dtps_provide           = NULL,
+       .dtps_provide_module    = kinst_provide_module,
+       .dtps_enable            = kinst_enable,
+       .dtps_disable           = kinst_disable,
+       .dtps_suspend           = NULL,
+       .dtps_resume            = NULL,
+       .dtps_getargdesc        = kinst_getargdesc,
+       .dtps_getargval         = NULL,
+       .dtps_usermode          = NULL,
+       .dtps_destroy           = kinst_destroy
+};
+
+/*
+ * Character device switch for the "dtrace/kinst" device node, through which
+ * probe creation is requested (see kinst_ioctl()).
+ */
+static struct cdevsw kinst_cdevsw = {
+       .d_name                 = "kinst",
+       .d_version              = D_VERSION,
+       .d_flags                = D_TRACKCLOSE,
+       .d_open                 = kinst_open,
+       .d_close                = kinst_close,
+       .d_ioctl                = kinst_ioctl,
+};
+
+/* Provider handle returned by dtrace_register(). */
+static dtrace_provider_id_t    kinst_id;
+/* Probe hash table: KINST_PROBETAB_MAX buckets keyed by patchpoint address. */
+struct kinst_probe_list        *kinst_probetab;
+/* Device node created by kinst_load(). */
+static struct cdev     *kinst_cdev;
+
+/*
+ * Register "kp" with the DTrace framework under the names
+ * kinst:<lf->filename>:<kp_func>:<kp_name>, and publish it in the probe hash
+ * table so that it can be found by patchpoint address.
+ */
+void
+kinst_probe_create(struct kinst_probe *kp, linker_file_t lf)
+{
+       struct kinst_probe_list *bucket;
+
+       kp->kp_id = dtrace_probe_create(kinst_id, lf->filename, kp->kp_func,
+           kp->kp_name, 3, kp);
+
+       bucket = KINST_GETPROBE(kp->kp_patchpoint);
+       LIST_INSERT_HEAD(bucket, kp, kp_hashnext);
+}
+
+/*
+ * Device open handler.  Opening the device requires no setup, so this always
+ * succeeds.
+ */
+static int
+kinst_open(struct cdev *dev __unused, int oflags __unused,
+    int devtype __unused, struct thread *td __unused)
+{
+       return (0);
+}
+
+/*
+ * Device close handler: ask the framework to condense this provider's probes
+ * via dtrace_condense().  Always succeeds.
+ */
+static int
+kinst_close(struct cdev *dev __unused, int fflag __unused,
+    int devtype __unused, struct thread *td __unused)
+{
+       dtrace_condense(kinst_id);
+       return (0);
+}
+
+/*
+ * linker_file_foreach() callback: "arg" is the probe description.  Files
+ * whose name does not match a non-empty kpd_mod are skipped with a return
+ * value of 0.
+ */
+static int
+kinst_linker_file_cb(linker_file_t lf, void *arg)
+{
+       dtrace_kinst_probedesc_t *pd;
+
+       pd = arg;
+       if (pd->kpd_mod[0] != '\0' && strcmp(pd->kpd_mod, lf->filename) != 0)
+               return (0);
+
+       /*
+        * Invoke kinst_make_probe() once for each function symbol in the
+        * module "lf".
+        */
+       return (linker_file_function_listall(lf, kinst_make_probe, arg));
+}
+
+/*
+ * Device ioctl handler.  The only supported command is KINSTIOC_MAKEPROBE,
+ * whose argument is a dtrace_kinst_probedesc_t naming the function (and
+ * optionally the module and instruction offset) to create probes for.
+ *
+ * Returns ENOTTY for any other command, otherwise the result of walking the
+ * loaded linker files.
+ */
+static int
+kinst_ioctl(struct cdev *dev __unused, u_long cmd, caddr_t addr,
+    int flags __unused, struct thread *td __unused)
+{
+       dtrace_kinst_probedesc_t *desc;
+
+       if (cmd != KINSTIOC_MAKEPROBE)
+               return (ENOTTY);
+
+       /* Make sure the caller-supplied names are NUL-terminated. */
+       desc = (dtrace_kinst_probedesc_t *)addr;
+       desc->kpd_func[sizeof(desc->kpd_func) - 1] = '\0';
+       desc->kpd_mod[sizeof(desc->kpd_mod) - 1] = '\0';
+
+       /* Loop over all functions in the kernel and loaded modules. */
+       return (linker_file_foreach(kinst_linker_file_cb, desc));
+}
+
+/*
+ * dtps_provide_module hook.  It does nothing: kinst probes are created on
+ * request through the KINSTIOC_MAKEPROBE ioctl.
+ */
+static void
+kinst_provide_module(void *arg, modctl_t *lf)
+{
+}
+
+/*
+ * dtps_getargdesc hook: answer every query by setting the descriptor's index
+ * to DTRACE_ARGNONE.
+ */
+static void
+kinst_getargdesc(void *arg, dtrace_id_t id, void *parg, dtrace_argdesc_t *desc)
+{
+       desc->dtargd_ndx = DTRACE_ARGNONE;
+}
+
+/*
+ * dtps_destroy hook: unlink the probe ("parg") from the probe hash table and
+ * release its memory.
+ */
+static void
+kinst_destroy(void *arg, dtrace_id_t id, void *parg)
+{
+       struct kinst_probe *probe;
+
+       probe = parg;
+       LIST_REMOVE(probe, kp_hashnext);
+       free(probe, M_KINST);
+}
+
+/*
+ * dtps_enable hook: arm the probe ("parg") by writing its patch value over
+ * the traced instruction.
+ */
+static void
+kinst_enable(void *arg, dtrace_id_t id, void *parg)
+{
+       struct kinst_probe *probe;
+
+       probe = parg;
+       kinst_patch_tracepoint(probe, probe->kp_patchval);
+}
+
+/*
+ * dtps_disable hook: disarm the probe ("parg") by writing the saved original
+ * byte back over the patchpoint.
+ */
+static void
+kinst_disable(void *arg, dtrace_id_t id, void *parg)
+{
+       struct kinst_probe *probe;
+
+       probe = parg;
+       kinst_patch_tracepoint(probe, probe->kp_savedval);
+}
+
+/*
+ * SYSINIT hook: bring up the provider.  Initializes the trampoline
+ * allocator, registers the provider, allocates the probe hash table, creates
+ * the "dtrace/kinst" device node and installs the breakpoint hook.
+ *
+ * Returns 0 on success, or the error from kinst_trampoline_init() or
+ * dtrace_register().
+ */
+static int
+kinst_load(void *dummy)
+{
+       int error;
+
+       error = kinst_trampoline_init();
+       if (error != 0)
+               return (error);
+
+       error = dtrace_register("kinst", &kinst_attr, DTRACE_PRIV_USER, NULL,
+           &kinst_pops, NULL, &kinst_id);
+       if (error != 0) {
+               /* Undo the trampoline setup done above. */
+               kinst_trampoline_deinit();
+               return (error);
+       }
+       kinst_probetab = malloc(KINST_PROBETAB_MAX *
+           sizeof(struct kinst_probe_list), M_KINST, M_WAITOK | M_ZERO);
+       for (int i = 0; i < KINST_PROBETAB_MAX; i++)
+               LIST_INIT(&kinst_probetab[i]);
+       kinst_cdev = make_dev(&kinst_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600,
+           "dtrace/kinst");
+       dtrace_invop_add(kinst_invop);
+       return (0);
+}
+
+/*
+ * SYSUNINIT hook: tear down the provider.
+ *
+ * The provider is unregistered before the probe hash table is freed.
+ * kinst_destroy() is the provider's dtps_destroy hook and unlinks probes with
+ * LIST_REMOVE(), which writes into the bucket head when the probe is first
+ * in its bucket; the table must therefore outlive probe destruction.
+ *
+ * Returns the result of dtrace_unregister().
+ */
+static int
+kinst_unload(void *dummy)
+{
+       int error;
+
+       /* Remove the device node first so no new probes can be requested. */
+       destroy_dev(kinst_cdev);
+
+       error = dtrace_unregister(kinst_id);
+
+       dtrace_invop_remove(kinst_invop);
+       kinst_trampoline_deinit();
+       free(kinst_probetab, M_KINST);
+
+       return (error);
+}
+
+/*
+ * Module event handler.  Loading logs a warning that the provider is
+ * experimental; unload and shutdown are accepted silently; any other event
+ * is rejected with EOPNOTSUPP.
+ */
+static int
+kinst_modevent(module_t mod __unused, int type, void *data __unused)
+{
+       if (type == MOD_LOAD) {
+               KINST_LOG(
+                   "kinst: This provider is experimental, exercise caution");
+               return (0);
+       }
+       if (type == MOD_UNLOAD || type == MOD_SHUTDOWN)
+               return (0);
+
+       return (EOPNOTSUPP);
+}
+
+/*
+ * Provider setup and teardown run as SYSINIT/SYSUNINIT hooks at
+ * SI_SUB_DTRACE_PROVIDER; the module event handler itself only logs.
+ */
+SYSINIT(kinst_load, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, kinst_load, NULL);
+SYSUNINIT(kinst_unload, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, kinst_unload,
+    NULL);
+
+DEV_MODULE(kinst, kinst_modevent, NULL);
+MODULE_VERSION(kinst, 1);
+/* kinst requires the dtrace and opensolaris modules. */
+MODULE_DEPEND(kinst, dtrace, 1, 1, 1);
+MODULE_DEPEND(kinst, opensolaris, 1, 1, 1);
diff --git a/sys/cddl/dev/kinst/kinst.h b/sys/cddl/dev/kinst/kinst.h
new file mode 100644
index 000000000000..ea1a5b50004f
--- /dev/null
+++ b/sys/cddl/dev/kinst/kinst.h
@@ -0,0 +1,71 @@
+/*
+ * SPDX-License-Identifier: CDDL 1.0
+ *
+ * Copyright 2022 Christos Margiolis <[email protected]>
+ */
+
+#ifndef _KINST_H_
+#define _KINST_H_
+
+#include <sys/dtrace.h>
+
+/*
+ * Argument of the KINSTIOC_MAKEPROBE ioctl: describes the instruction(s) for
+ * which probes should be created.
+ */
+typedef struct {
+       /* Function name; must match the symbol name exactly. */
+       char    kpd_func[DTRACE_FUNCNAMELEN];
+       /* Module (linker file) name; an empty string matches every module. */
+       char    kpd_mod[DTRACE_MODNAMELEN];
+       /* Instruction offset within the function; -1 selects all of them. */
+       int     kpd_off;
+} dtrace_kinst_probedesc_t;
+
+#define KINSTIOC_MAKEPROBE     _IOW('k', 1, dtrace_kinst_probedesc_t)
+
+#ifdef _KERNEL
+
+#include <sys/queue.h>
+
+#include "kinst_isa.h"
+
+/*
+ * One kinst probe, i.e. one traced instruction.
+ */
+struct kinst_probe {
+       /* Linkage in the kinst_probetab bucket for kp_patchpoint. */
+       LIST_ENTRY(kinst_probe) kp_hashnext;
+       /* Name of the enclosing function (not copied). */
+       const char              *kp_func;
+       /* Probe name: the instruction's offset in the function, in decimal. */
+       char                    kp_name[16];
+       /* Identifier returned by dtrace_probe_create(). */
+       dtrace_id_t             kp_id;
+       /* Value written to the patchpoint when the probe is enabled. */
+       kinst_patchval_t        kp_patchval;
+       /* Original value at the patchpoint, restored on disable. */
+       kinst_patchval_t        kp_savedval;
+       /* Address of the traced instruction. */
+       kinst_patchval_t        *kp_patchpoint;
+
+       /* Machine-dependent decode state. */
+       struct kinst_probe_md   kp_md;
+};
+
+/* Bucket type of the probe hash table. */
+LIST_HEAD(kinst_probe_list, kinst_probe);
+
+/* Probe hash table, allocated with KINST_PROBETAB_MAX buckets. */
+extern struct kinst_probe_list *kinst_probetab;
+
+/*
+ * KINST_PROBETAB_MAX must be a power of two: KINST_ADDR2NDX() selects a
+ * bucket by masking the low bits of an address, and KINST_GETPROBE() returns
+ * a pointer to that bucket.
+ */
+#define KINST_PROBETAB_MAX     0x8000  /* 32k */
+#define KINST_ADDR2NDX(addr)   (((uintptr_t)(addr)) & (KINST_PROBETAB_MAX - 1))
+#define KINST_GETPROBE(i)      (&kinst_probetab[KINST_ADDR2NDX(i)])
+
+struct linker_file;
+struct linker_symval;
+
+int    kinst_invop(uintptr_t, struct trapframe *, uintptr_t);
+int    kinst_make_probe(struct linker_file *, int, struct linker_symval *,
+           void *);
+void   kinst_patch_tracepoint(struct kinst_probe *, kinst_patchval_t);
+void   kinst_probe_create(struct kinst_probe *, struct linker_file *);
+
+int    kinst_trampoline_init(void);
+int    kinst_trampoline_deinit(void);
+uint8_t        *kinst_trampoline_alloc(int);
+void   kinst_trampoline_dealloc(uint8_t *);
+
+#ifdef MALLOC_DECLARE
+MALLOC_DECLARE(M_KINST);
*** 354 LINES SKIPPED ***

Reply via email to