simple direct-threaded bytecode interpreter

Kragen Javier Sitaker Fri, 09 Nov 2007 19:36:36 -0800

// Simple direct-token-threaded (in the Forth sense) interpreter. -*- asm -*-
// by Kragen Javier Sitaker; dedicated to the public domain, i.e. I
// relinquish whatever exclusive rights copyright law gives me with
// regard to this work.
// Major parts taken from Richard W.M. Jones's public-domain JONESFORTH 42
// by Richard W.M. Jones <[EMAIL PROTECTED]> http://annexia.org/forth



// This program just outputs "hello, world, hello" under Linux.

// to compile:   
// gcc -m32 -nostdlib -static -o sdtbi simple-direct-threaded-bytecode.S


// The normal Forth representation of a function is as an array of
// pointers to the other functions it calls, in sequence; a few of
// those other functions may move the interpreter pointer around in
// that array, or snarf up a constant that's stored in the array, or
// stuff like that, but for the most part, the functions just get
// called in sequence.  This is called "threaded code" and it's fairly
// compact, especially on 16-bit systems where the pointers are only
// two bytes.

// A traditional approach taken by Forth implementations to reduce
// code size even further is called "token threading".  Rather than
// making arrays of 16-bit or 32-bit pointers, they make lists of
// 8-bit indices into an array of pointers.  This has two advantages:

// 1. the indices are one fourth the size of a list of 32-bit pointers;
// 2. it is possible to save these lists of indices somewhere outside
//    of memory and continue to use them even after making some
//    changes to the code, as long as the same indices in the table
//    have the same meanings.  So, for example, you could write some
//    boot firmware in this "bytecode".

// It also has some disadvantages:
// 1. You run out of space in the table.  Even a fairly minimal full
//    Forth system contains close to 256 subroutines.  You can
//    mitigate this by packing, say, two 12-bit pointers every three
//    bytes, or maybe by having a special bytecode that looks up the
//    next byte in an extended table.
// 2. It's slower and makes the machine-code part of the program take
//    more space.  The traditional LODSW; JMP AX version of $NEXT from
//    the eForth Model, which fetches and executes the next execution
//    token in the threaded list, is three bytes and two instructions;
//    my 'next' here is 12 bytes and four instructions, which is big
//    enough that I jump to it (5 bytes) rather than making an assembler
//    macro.  Which blows your branch target buffers to pieces.  Oh well.

// Anyway, so this is an example program built using this technique.
// It doesn't implement Forthlike stacks or interpreted subroutines or
// the ability to define new subroutines; it just runs a bunch of
// machine-code subroutines one after the other.

// include system library header so we know __NR_exit = 1 and __NR_write = 4
#include <asm/unistd.h>
        
        .section .rodata        // stuff goes in ELF's read-only data section
token_table:                    // table to define the "bytecode" instructions
        // These are pointers to machine-code subroutines
        //   0      1      2      3      4      5
        .int exit0, print, hello, world, comma, newline
instructions:
        // And here is the actual "program" in that bytecode.
        .byte 2, 1, 4, 1, 3, 1, 4, 1, 2, 1, 5, 1, 0
                
        .text                   // the following stuff goes in the text segment
        .globl _start           // declare _start as a global symbol 
                                // (otherwise ld won't be able to find it)
_start:                         // this is the entry point for ELF I guess
        mov $instructions, %esi // %esi is the interpreter pointer register
        jmp next                // and now we start the interpreter.
                                // (somewhat silly since we could just
                                // fall through..)
        
// The rest of the program consists just of the definitions of the
// six things in the token_table.
next:
        xor %eax, %eax          // set %eax to 0
        lodsb                   // load %al from where %esi points
                                // (%esi is the interpreter pointer)
        lea token_table(,%eax,4), %eax  // eax := token_table + eax * 4bytes
        jmp *(%eax)             // now %eax points at an address in the token
                                // table; so we fetch that address from the
                                // token table and jump to it.
        
        // This is no longer the fashionable way to make system calls
        // in Linux.  Now you're supposed to use SYSENTER on newer
        // CPUs, and rather than have you figure out which one to use,
        // the kernel mmaps a chunk of code called a VDSO into your
        // memory space at a random address and tells you where to
        // find it using the ELF auxiliary vector.  Then you're
        // supposed to invoke the dynamic linker or something to parse
        // the ELF executable mysteriously manifested in this way by
        // the kernel, and then resolve an undefined symbol in libc
        // into calls to it.  See "What is linux-gate.so.1?"
        // http://www.trilithium.com/johan/2005/08/linux-gate/
        // "The Linux kernel: System Calls" by Andries Brouwer, 2003-02-01
        // http://www.win.tue.nl/%7Eaeb/linux/lk/lk-4.html
        // "About ELF Auxiliary Vectors" by Manu Garg
        // http://manugarg.googlepages.com/aboutelfauxiliaryvectors

        // But the old int $0x80 approach still works, thank goodness,
        // because all of that is *way* more than these three
        // instructions.

exit0:         
        mov $0, %ebx            // first argument to system call: 0
        mov $__NR_exit, %eax    // system call to call: exit
        int $0x80               // make a system call
        
print:  
        mov $1, %ebx  // stdout
        // we expect address in %ecx and length in %edx here
        mov $__NR_write, %eax
        int $0x80
        jmp next

        // The rest of the code exists just to poke string addresses
        // and lengths into %ecx and %edx so print can print them.
        // (I probably should have just written a gas macro...)
hello:
        mov $hello_string, %ecx
        mov $5, %edx
        jmp next
        .section .rodata
hello_string:
        .ascii "hello"
        .text

world:
        mov $world_string, %ecx
        mov $5, %edx
        jmp next
        .section .rodata
world_string:
        .ascii "world"
        .text

comma:
        mov $comma_string, %ecx
        mov $2, %edx
        jmp next
        .section .rodata
comma_string:
        .ascii ", "
        .text

newline:
        mov $newline_string, %ecx
        mov $1, %edx
        jmp next
        .section .rodata
newline_string:
        .ascii "\n"
        .text

simple direct-threaded bytecode interpreter

Reply via email to