We copy 7 bytes at eip for userspace's instruction decode; we have to
carefully handle the case where eip is at the end of a page.  We can't
leave this to userspace since the kernel has all the page table decode
logic.

The decode logic moves to userspace, basically unchanged.

Signed-off-by: Rusty Russell <ru...@rustcorp.com.au>
---
 drivers/lguest/x86/core.c | 133 +++++++++++++----------------------------
 tools/lguest/lguest.c     | 149 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 192 insertions(+), 90 deletions(-)

diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index f7a16b4ea456..42e87bf14113 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -314,95 +314,52 @@ void lguest_arch_run_guest(struct lg_cpu *cpu)
  * usually attached to a PC.
  *
  * When the Guest uses one of these instructions, we get a trap (General
- * Protection Fault) and come here.  We see if it's one of those troublesome
- * instructions and skip over it.  We return true if we did.
+ * Protection Fault) and come here.  We queue this to be sent out to the
+ * Launcher to handle.
  */
-static int emulate_insn(struct lg_cpu *cpu)
-{
-       u8 insn;
-       unsigned int insnlen = 0, in = 0, small_operand = 0;
-       /*
-        * The eip contains the *virtual* address of the Guest's instruction:
-        * walk the Guest's page tables to find the "physical" address.
-        */
-       unsigned long physaddr = guest_pa(cpu, cpu->regs->eip);
-
-       /*
-        * This must be the Guest kernel trying to do something, not userspace!
-        * The bottom two bits of the CS segment register are the privilege
-        * level.
-        */
-       if ((cpu->regs->cs & 3) != GUEST_PL)
-               return 0;
 
-       /* Decoding x86 instructions is icky. */
-       insn = lgread(cpu, physaddr, u8);
-
-       /*
-        * Around 2.6.33, the kernel started using an emulation for the
-        * cmpxchg8b instruction in early boot on many configurations.  This
-        * code isn't paravirtualized, and it tries to disable interrupts.
-        * Ignore it, which will Mostly Work.
-        */
-       if (insn == 0xfa) {
-               /* "cli", or Clear Interrupt Enable instruction.  Skip it. */
-               cpu->regs->eip++;
-               return 1;
+/*
+ * The eip contains the *virtual* address of the Guest's instruction:
+ * we copy the instruction here so the Launcher doesn't have to walk
+ * the page tables to decode it.  We handle the case (eg. in a kernel
+ * module) where the instruction is over two pages, and the pages are
+ * virtually but not physically contiguous.
+ *
+ * The longest possible x86 instruction is 15 bytes, but we don't handle
+ * anything that strange.
+ */
+static void copy_from_guest(struct lg_cpu *cpu,
+                           void *dst, unsigned long vaddr, size_t len)
+{
+       size_t to_page_end = PAGE_SIZE - (vaddr % PAGE_SIZE);
+       unsigned long paddr;
+
+       BUG_ON(len > PAGE_SIZE);
+
+       /* If it goes over a page, copy in two parts. */
+       if (len > to_page_end) {
+               /* But make sure the next page is mapped! */
+               if (__guest_pa(cpu, vaddr + to_page_end, &paddr))
+                       copy_from_guest(cpu, dst + to_page_end,
+                                       vaddr + to_page_end,
+                                       len - to_page_end);
+               else
+                       /* Otherwise fill with zeroes. */
+                       memset(dst + to_page_end, 0, len - to_page_end);
+               len = to_page_end;
        }
 
-       /*
-        * 0x66 is an "operand prefix".  It means a 16, not 32 bit in/out.
-        */
-       if (insn == 0x66) {
-               small_operand = 1;
-               /* The instruction is 1 byte so far, read the next byte. */
-               insnlen = 1;
-               insn = lgread(cpu, physaddr + insnlen, u8);
-       }
+       /* This will kill the guest if it isn't mapped, but that
+        * shouldn't happen. */
+       __lgread(cpu, dst, guest_pa(cpu, vaddr), len);
+}
 
-       /*
-        * We can ignore the lower bit for the moment and decode the 4 opcodes
-        * we need to emulate.
-        */
-       switch (insn & 0xFE) {
-       case 0xE4: /* in     <next byte>,%al */
-               insnlen += 2;
-               in = 1;
-               break;
-       case 0xEC: /* in     (%dx),%al */
-               insnlen += 1;
-               in = 1;
-               break;
-       case 0xE6: /* out    %al,<next byte> */
-               insnlen += 2;
-               break;
-       case 0xEE: /* out    %al,(%dx) */
-               insnlen += 1;
-               break;
-       default:
-               /* OK, we don't know what this is, can't emulate. */
-               return 0;
-       }
 
-       /*
-        * If it was an "IN" instruction, they expect the result to be read
-        * into %eax, so we change %eax.  We always return all-ones, which
-        * traditionally means "there's nothing there".
-        */
-       if (in) {
-               /* Lower bit tells means it's a 32/16 bit access */
-               if (insn & 0x1) {
-                       if (small_operand)
-                               cpu->regs->eax |= 0xFFFF;
-                       else
-                               cpu->regs->eax = 0xFFFFFFFF;
-               } else
-                       cpu->regs->eax |= 0xFF;
-       }
-       /* Finally, we've "done" the instruction, so move past it. */
-       cpu->regs->eip += insnlen;
-       /* Success! */
-       return 1;
+static void setup_emulate_insn(struct lg_cpu *cpu)
+{
+       cpu->pending.trap = 13;
+       copy_from_guest(cpu, cpu->pending.insn, cpu->regs->eip,
+                       sizeof(cpu->pending.insn));
 }
 
 /*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */
@@ -410,14 +367,10 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu)
 {
        switch (cpu->regs->trapnum) {
        case 13: /* We've intercepted a General Protection Fault. */
-               /*
-                * Check if this was one of those annoying IN or OUT
-                * instructions which we need to emulate.  If so, we just go
-                * back into the Guest after we've done it.
-                */
+               /* Hand to Launcher to emulate those pesky IN and OUT insns */
                if (cpu->regs->errcode == 0) {
-                       if (emulate_insn(cpu))
-                               return;
+                       setup_emulate_insn(cpu);
+                       return;
                }
                break;
        case 14: /* We've intercepted a Page Fault. */
diff --git a/tools/lguest/lguest.c b/tools/lguest/lguest.c
index 0e754d04876d..b2217657f62c 100644
--- a/tools/lguest/lguest.c
+++ b/tools/lguest/lguest.c
@@ -41,6 +41,7 @@
 #include <signal.h>
 #include <pwd.h>
 #include <grp.h>
+#include <sys/user.h>
 
 #ifndef VIRTIO_F_ANY_LAYOUT
 #define VIRTIO_F_ANY_LAYOUT            27
@@ -1143,6 +1144,150 @@ static void handle_output(unsigned long addr)
              strnlen(from_guest_phys(addr), guest_limit - addr));
 }
 
+/*L:216
+ * This is where we emulate a handful of Guest instructions.  It's ugly
+ * and we used to do it in the kernel but it grew over time.
+ */
+
+/*
+ * We use the ptrace syscall's pt_regs struct to talk about registers
+ * to lguest: these macros convert the names to the offsets.
+ */
+#define getreg(name) getreg_off(offsetof(struct user_regs_struct, name))
+#define setreg(name, val) \
+       setreg_off(offsetof(struct user_regs_struct, name), (val))
+
+static u32 getreg_off(size_t offset)
+{
+       u32 r;
+       unsigned long args[] = { LHREQ_GETREG, offset };
+
+       if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0)
+               err(1, "Getting register %u", offset);
+       if (pread(lguest_fd, &r, sizeof(r), cpu_id) != sizeof(r))
+               err(1, "Reading register %u", offset);
+
+       return r;
+}
+
+static void setreg_off(size_t offset, u32 val)
+{
+       unsigned long args[] = { LHREQ_SETREG, offset, val };
+
+       if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0)
+               err(1, "Setting register %u", offset);
+}
+
+static void emulate_insn(const u8 insn[])
+{
+       unsigned long args[] = { LHREQ_TRAP, 13 };
+       unsigned int insnlen = 0, in = 0, small_operand = 0, byte_access;
+       unsigned int eax, port, mask;
+       /*
+        * We always return all-ones on IO port reads, which traditionally
+        * means "there's nothing there".
+        */
+       u32 val = 0xFFFFFFFF;
+
+       /*
+        * This must be the Guest kernel trying to do something, not userspace!
+        * The bottom two bits of the CS segment register are the privilege
+        * level.
+        */
+       if ((getreg(xcs) & 3) != 0x1)
+               goto no_emulate;
+
+       /* Decoding x86 instructions is icky. */
+
+       /*
+        * Around 2.6.33, the kernel started using an emulation for the
+        * cmpxchg8b instruction in early boot on many configurations.  This
+        * code isn't paravirtualized, and it tries to disable interrupts.
+        * Ignore it, which will Mostly Work.
+        */
+       if (insn[insnlen] == 0xfa) {
+               /* "cli", or Clear Interrupt Enable instruction.  Skip it. */
+               insnlen = 1;
+               goto skip_insn;
+       }
+
+       /*
+        * 0x66 is an "operand prefix".  It means a 16, not 32 bit in/out.
+        */
+       if (insn[insnlen] == 0x66) {
+               small_operand = 1;
+               /* The instruction is 1 byte so far, read the next byte. */
+               insnlen = 1;
+       }
+
+       /* If the lower bit isn't set, it's a single byte access */
+       byte_access = !(insn[insnlen] & 1);
+
+       /*
+        * Now we can ignore the lower bit and decode the 4 opcodes
+        * we need to emulate.
+        */
+       switch (insn[insnlen] & 0xFE) {
+       case 0xE4: /* in     <next byte>,%al */
+               port = insn[insnlen+1];
+               insnlen += 2;
+               in = 1;
+               break;
+       case 0xEC: /* in     (%dx),%al */
+               port = getreg(edx) & 0xFFFF;
+               insnlen += 1;
+               in = 1;
+               break;
+       case 0xE6: /* out    %al,<next byte> */
+               port = insn[insnlen+1];
+               insnlen += 2;
+               break;
+       case 0xEE: /* out    %al,(%dx) */
+               port = getreg(edx) & 0xFFFF;
+               insnlen += 1;
+               break;
+       default:
+               /* OK, we don't know what this is, can't emulate. */
+               goto no_emulate;
+       }
+
+       /* Set a mask of the 1, 2 or 4 bytes, depending on size of IO */
+       if (byte_access)
+               mask = 0xFF;
+       else if (small_operand)
+               mask = 0xFFFF;
+       else
+               mask = 0xFFFFFFFF;
+
+       /*
+        * If it was an "IN" instruction, they expect the result to be read
+        * into %eax, so we change %eax.
+        */
+       eax = getreg(eax);
+
+       if (in) {
+               /* Clear the bits we're about to read */
+               eax &= ~mask;
+               /* Copy bits in from val. */
+               eax |= val & mask;
+               /* Now update the register. */
+               setreg(eax, eax);
+       }
+
+       verbose("IO %s of %x to %u: %#08x\n",
+               in ? "IN" : "OUT", mask, port, eax);
+skip_insn:
+       /* Finally, we've "done" the instruction, so move past it. */
+       setreg(eip, getreg(eip) + insnlen);
+       return;
+
+no_emulate:
+       /* Inject trap into Guest. */
+       if (write(lguest_fd, args, sizeof(args)) < 0)
+               err(1, "Reinjecting trap 13 for fault at %#x", getreg(eip));
+}
+
+
 /*L:190
  * Device Setup
  *
@@ -1832,6 +1977,10 @@ static void __attribute__((noreturn)) run_guest(void)
                                verbose("Notify on address %#08x\n",
                                        notify.addr);
                                handle_output(notify.addr);
+                       } else if (notify.trap == 13) {
+                               verbose("Emulating instruction at %#x\n",
+                                       getreg(eip));
+                               emulate_insn(notify.insn);
                        } else
                                errx(1, "Unknown trap %i addr %#08x\n",
                                     notify.trap, notify.addr);
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to