// Simple direct-token-threaded (in the Forth sense) interpreter. -*- asm -*- // by Kragen Javier Sitaker; dedicated to the public domain, i.e. I // relinquish whatever exclusive rights copyright law gives me with // regard to this work. // Major parts taken from Richard W.M. Jones's public-domain JONESFORTH 42 // by Richard W.M. Jones <[EMAIL PROTECTED]> http://annexia.org/forth
// This program just outputs "hello, world, hello" under Linux. // to compile: // gcc -m32 -nostdlib -static -o sdtbi simple-direct-threaded-bytecode.S // The normal Forth representation of a function is as an array of // pointers to the other functions it calls, in sequence; a few of // those other functions may move the interpreter pointer around in // that array, or snarf up a constant that's stored in the array, or // stuff like that, but for the most part, the functions just get // called in sequence. This is called "threaded code" and it's fairly // compact, especially on 16-bit systems where the pointers are only // two bytes. // A traditional approach taken by Forth implementations to reduce // code size even further is called "token threading". Rather than // making arrays of 16-bit or 32-bit pointers, they make lists of // 8-bit indices into an array of pointers. This has two advantages: // 1. the indices are one fourth the size of a list of 32-bit pointers; // 2. it is possible to save these lists of indices somewhere outside // of memory and continue to use them even after making some // changes to the code, as long as the same indices in the table // have the same meanings. So, for example, you could write some // boot firmware in this "bytecode". // It also has some disadvantages: // 1. You run out of space in the table. Even a fairly minimal full // Forth system contains close to 256 subroutines. You can // mitigate this by packing, say, two 12-bit pointers every three // bytes, or maybe by having a special bytecode that looks up the // next byte in an extended table. // 2. It's slower and makes the machine-code part of the program take // more space. The traditional LODSW; JMP AX version of $NEXT from // the eForth Model, which fetches and executes the next execution // token in the threaded list, is three bytes and two instructions; // my 'next' here is 12 bytes and four instructions, which is big // enough that I jump to it (5 bytes) rather than making an assembler // macro. Which blows your branch target buffers to pieces. Oh well. // Anyway, so this is an example program built using this technique. // It doesn't implement Forthlike stacks or interpreted subroutines or // the ability to define new subroutines; it just runs a bunch of // machine-code subroutines one after the other. // include system library header so we know __NR_exit = 1 and __NR_write = 4 #include <asm/unistd.h> .section .rodata // stuff goes in ELF's read-only data section token_table: // table to define the "bytecode" instructions // These are pointers to machine-code subroutines // 0 1 2 3 4 5 .int exit0, print, hello, world, comma, newline instructions: // And here is the actual "program" in that bytecode. .byte 2, 1, 4, 1, 3, 1, 4, 1, 2, 1, 5, 1, 0 .text // the following stuff goes in the text segment .globl _start // declare _start as a global symbol // (otherwise ld won't be able to find it) _start: // this is the entry point for ELF I guess mov $instructions, %esi // %esi is the interpreter pointer register jmp next // and now we start the interpreter. // (somewhat silly since we could just // fall through..) // The rest of the program consists just of the definitions of the // six things in the token_table. next: xor %eax, %eax // set %eax to 0 lodsb // load %al from where %esi points // (%esi is the interpreter pointer) lea token_table(,%eax,4), %eax // eax := token_table + eax * 4bytes jmp *(%eax) // now %eax points at an address in the token // table; so we fetch that address from the // token table and jump to it. // This is no longer the fashionable way to make system calls // in Linux. Now you're supposed to use SYSENTER on newer // CPUs, and rather than have you figure out which one to use, // the kernel mmaps a chunk of code called a VDSO into your // memory space at a random address and tells you where to // find it using the ELF auxiliary vector. Then you're // supposed to invoke the dynamic linker or something to parse // the ELF executable mysteriously manifested in this way by // the kernel, and then resolve an undefined symbol in libc // into calls to it. See "What is linux-gate.so.1?" // http://www.trilithium.com/johan/2005/08/linux-gate/ // "The Linux kernel: System Calls" by Andries Brouwer, 2003-02-01 // http://www.win.tue.nl/%7Eaeb/linux/lk/lk-4.html // "About ELF Auxiliary Vectors" by Manu Garg // http://manugarg.googlepages.com/aboutelfauxiliaryvectors // But the old int $0x80 approach still works, thank goodness, // because all of that is *way* more than these three // instructions. exit0: mov $0, %ebx // first argument to system call: 0 mov $__NR_exit, %eax // system call to call: exit int $0x80 // make a system call print: mov $1, %ebx // stdout // we expect address in %ecx and length in %edx here mov $__NR_write, %eax int $0x80 jmp next // The rest of the code exists just to poke string addresses // and lengths into %ecx and %edx so print can print them. // (I probably should have just written a gas macro...) hello: mov $hello_string, %ecx mov $5, %edx jmp next .section .rodata hello_string: .ascii "hello" .text world: mov $world_string, %ecx mov $5, %edx jmp next .section .rodata world_string: .ascii "world" .text comma: mov $comma_string, %ecx mov $2, %edx jmp next .section .rodata comma_string: .ascii ", " .text newline: mov $newline_string, %ecx mov $1, %edx jmp next .section .rodata newline_string: .ascii "\n" .text