// Simple direct-token-threaded (in the Forth sense) interpreter. -*- asm -*-
// by Kragen Javier Sitaker; dedicated to the public domain, i.e. I
// relinquish whatever exclusive rights copyright law gives me with
// regard to this work.
// Major parts taken from Richard W.M. Jones's public-domain JONESFORTH 42
// by Richard W.M. Jones <[EMAIL PROTECTED]> http://annexia.org/forth
// This program just outputs "hello, world, hello" under Linux.
// to compile:
// gcc -m32 -nostdlib -static -o sdtbi simple-direct-threaded-bytecode.S
// The normal Forth representation of a function is as an array of
// pointers to the other functions it calls, in sequence; a few of
// those other functions may move the interpreter pointer around in
// that array, or snarf up a constant that's stored in the array, or
// stuff like that, but for the most part, the functions just get
// called in sequence. This is called "threaded code" and it's fairly
// compact, especially on 16-bit systems where the pointers are only
// two bytes.
// A traditional approach taken by Forth implementations to reduce
// code size even further is called "token threading". Rather than
// making arrays of 16-bit or 32-bit pointers, they make lists of
// 8-bit indices into an array of pointers. This has two advantages:
// 1. the indices are one fourth the size of a list of 32-bit pointers;
// 2. it is possible to save these lists of indices somewhere outside
// of memory and continue to use them even after making some
// changes to the code, as long as the same indices in the table
// have the same meanings. So, for example, you could write some
// boot firmware in this "bytecode".
// It also has some disadvantages:
// 1. You run out of space in the table. Even a fairly minimal full
// Forth system contains close to 256 subroutines. You can
// mitigate this by packing, say, two 12-bit pointers every three
// bytes, or maybe by having a special bytecode that looks up the
// next byte in an extended table.
// 2. It's slower and makes the machine-code part of the program take
// more space. The traditional LODSW; JMP AX version of $NEXT from
// the eForth Model, which fetches and executes the next execution
// token in the threaded list, is three bytes and two instructions;
// my 'next' here is 12 bytes and four instructions, which is big
// enough that I jump to it (5 bytes) rather than making an assembler
// macro. Which blows your branch target buffers to pieces. Oh well.
// Anyway, so this is an example program built using this technique.
// It doesn't implement Forthlike stacks or interpreted subroutines or
// the ability to define new subroutines; it just runs a bunch of
// machine-code subroutines one after the other.
// include system library header so we know __NR_exit = 1 and __NR_write = 4
#include <asm/unistd.h>
.section .rodata // stuff goes in ELF's read-only data section
token_table: // table to define the "bytecode" instructions
// These are pointers to machine-code subroutines
// 0 1 2 3 4 5
.int exit0, print, hello, world, comma, newline
instructions:
// And here is the actual "program" in that bytecode.
.byte 2, 1, 4, 1, 3, 1, 4, 1, 2, 1, 5, 1, 0
.text // the following stuff goes in the text segment
.globl _start // declare _start as a global symbol
// (otherwise ld won't be able to find it)
_start: // this is the entry point for ELF I guess
mov $instructions, %esi // %esi is the interpreter pointer register
jmp next // and now we start the interpreter.
// (somewhat silly since we could just
// fall through..)
// The rest of the program consists just of the definitions of the
// six things in the token_table.
next:
xor %eax, %eax // set %eax to 0
lodsb // load %al from where %esi points
// (%esi is the interpreter pointer)
lea token_table(,%eax,4), %eax // eax := token_table + eax * 4bytes
jmp *(%eax) // now %eax points at an address in the token
// table; so we fetch that address from the
// token table and jump to it.
// This is no longer the fashionable way to make system calls
// in Linux. Now you're supposed to use SYSENTER on newer
// CPUs, and rather than have you figure out which one to use,
// the kernel mmaps a chunk of code called a VDSO into your
// memory space at a random address and tells you where to
// find it using the ELF auxiliary vector. Then you're
// supposed to invoke the dynamic linker or something to parse
// the ELF executable mysteriously manifested in this way by
// the kernel, and then resolve an undefined symbol in libc
// into calls to it. See "What is linux-gate.so.1?"
// http://www.trilithium.com/johan/2005/08/linux-gate/
// "The Linux kernel: System Calls" by Andries Brouwer, 2003-02-01
// http://www.win.tue.nl/%7Eaeb/linux/lk/lk-4.html
// "About ELF Auxiliary Vectors" by Manu Garg
// http://manugarg.googlepages.com/aboutelfauxiliaryvectors
// But the old int $0x80 approach still works, thank goodness,
// because all of that is *way* more than these three
// instructions.
exit0:
mov $0, %ebx // first argument to system call: 0
mov $__NR_exit, %eax // system call to call: exit
int $0x80 // make a system call
print:
mov $1, %ebx // stdout
// we expect address in %ecx and length in %edx here
mov $__NR_write, %eax
int $0x80
jmp next
// The rest of the code exists just to poke string addresses
// and lengths into %ecx and %edx so print can print them.
// (I probably should have just written a gas macro...)
hello:
mov $hello_string, %ecx
mov $5, %edx
jmp next
.section .rodata
hello_string:
.ascii "hello"
.text
world:
mov $world_string, %ecx
mov $5, %edx
jmp next
.section .rodata
world_string:
.ascii "world"
.text
comma:
mov $comma_string, %ecx
mov $2, %edx
jmp next
.section .rodata
comma_string:
.ascii ", "
.text
newline:
mov $newline_string, %ecx
mov $1, %edx
jmp next
.section .rodata
newline_string:
.ascii "\n"
.text