On Tue, 9 Oct 2001, Dan Sugalski wrote:
> At 01:20 PM 10/9/2001 +0200, Paolo Molaro wrote:
> >On 10/07/01 Bryan C. Warnock wrote:
> > > while (*pc) {
> > > switch (*pc) {
> > > }
> > > }
> If anyone wants an official ruling...
>
> DO_OP can contain any code you like as long as it:
>
> *) Some version ultimately compiles everywhere
> *) Allows opcodes above a certain point (probably 256 or 512) to be
> overridden at runtime
>
> Computed gotos, switches, function table dispatches, generated machine
> code, or some other form of coding madness are all OK. I don't care which
> way a particular platform goes as long as it doesn't constrain another
> platform's ability to go another way.
Hehe. Ok, I delved into the particulars of gcc and found:
void
runloop(int**code) {
static void* array[] = { &&op_end, &&op1, &&op2 };
start:
goto *array[*code];
op1:
foo; goto start;
op2:
foo; goto start;
op_end:
return;
} // end code
The key was &&lbl which returns the physical address of the a jumpable
label. This is highly gcc-specific, but given that we can use
#ifdef's I don't see a problem with it. :)
I even wrote an ugly version that avoided that array indirection.
What it did was "convert" the code-base to change the op-codes into
physical addresses. This obviously has some negative ramifications,
but is definately doable. By removing that extra indirection I got a
couple percent extra improvement (12%). I can't imagine you'll get much
faster than that (except for an inlining jit or perl6->C compiler).
The only problem was that the &&lbl's didn't want to be non-local, so
I had to perform some evil magic.
The unpolished code is attached with a cheesy instruction set which
was just designed to prevent the optimizer from optimizing away my
op-codes.
Here are the benchmarks on a 466MHZ dual proc celeron:
w/ code=500,000,000 and loop_cnt = 10
(cache-bound)
gcc speeds.c
-----
switcher time delta=45
caller time delta=47
jumper time delta=43
code compile time delta=4
fast jumper time delta=34
gcc -O speeds.c
-----
switcher time delta=30
caller time delta=41
jumper time delta=28
code compile time delta=3
fast jumper time delta=26
gcc -O2 speeds.c
-------
switcher time delta=30
caller time delta=41
jumper time delta=28
code compile time delta=3
fast jumper time delta=25
gcc -O3 speeds.c
--------
switcher time delta=29
caller time delta=41
jumper time delta=28
code compile time delta=2
fast jumper time delta=26
==========
==========
w/ code=1000, loop_cnt = 5,000,000
gcc -O3 speeds.c
------
switcher time delta=195
caller time delta=268
jumper time delta=185
code compile time delta=0
fast jumper time delta=154
# Note a marginal improvement from switch -> jump, yet an even bigger
improvement in jump to fast-jump.
It's unlikely to have a 50M (x4B+) code-base. But it's possible to
have an inner loop that has 500,000 iterations.
-Michael
#include<time.h>
#include<stdio.h>
#include<stdlib.h>
#include <unistd.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <sys/types.h>
//#define SIZE 50000000
#define SIZE 1000
int codep[SIZE];
void
dump_mappings()
{
char strbuf[1024];
int pid = getpid();
sprintf( strbuf, "cat /proc/%i/maps", pid );
system( strbuf );
sprintf( strbuf, "ls -l /proc/%i/fd" );
system( strbuf );
printf("press enter to continue");
scanf("%c",strbuf);
}
int x,y,z;
void
switcher( int*code)
{
while(1)
switch(*code) {
case 0: goto DONE;
break;
case 1: x++;
code += 1;
break;
case 2: y++;
code += 1;
break;
case 3: z++;
code += 1;
break;
case 4: x = y + z;
code += 1;
break;
}
DONE:
} // end switcher
int* end(int*code) { return 0; }
int* o1(int*code) { x++; return code+1; }
int* o2(int*code) { y++; return code+1; }
int* o3(int*code) { z++; return code+1; }
int* o4(int*code) { x = y + z; return code+1; }
typedef int* (*op_handler_t)(int*code);
op_handler_t op_codes[] = {
end, o1, o2, o3, o4
};
void
caller( int*code)
{
while(*code)
code = op_codes[ *code ](code);
} // end caller
void
gotoer( int*code)
{
static int* array[] = { &&lend, &&lo1, &&lo2, &&lo3, &&lo4 };
start:
/*
if ( !code ) {
puts("null code");
exit(-1);
}
if ( *code > 3 ) {
puts("Invalid idx");
exit(-1);
}
*/
goto *array[ *code ];
lo1:
x++;
code += 1;
goto start;
lo2:
y++;
code += 1;
goto start;
lo3:
z++;
code += 1;
goto start;
lo4:
z++;
code += 1;
goto start;
lend:
return;
} // end gotoer
int*
convert( int size, int*code, int*map )
{
int* retval, *base;
int idx =0;
//dump_mappings();
retval = base = (int*)malloc(size * sizeof(int));
if ( !retval ) {
puts("out of memory");
exit(-1);
}
//dump_mappings();
//printf("retval=%p, codep=%p\n", retval, codep );
/*
if (!retval) {
puts("couldn't allocate retval");
exit(-1);
}
*/
code[size - 1] = 0; // to make sure
while( *code ) {
*retval++ =
map[ *code++ ];
//idx++;
}
//printf("size=%i, idx=%i\n", size, idx );
//exit(0);
*retval++ = map[ 0 ];
return base;
} // end convert
int**
fgotoer( int* code, int f_fake )
{
static int* array[] = { &&fend, &&fo1, &&fo2, &&fo3, &&fo4 };
if ( f_fake ) {
return (int**)array;
}
fstart:
goto **code;
fo1:
x++;
code += 1;
//puts("o1");
goto fstart;
fo2:
y++;
code += 1;
//puts("o2");
goto fstart;
fo3:
z++;
code += 1;
//puts("o3");
goto fstart;
fo4:
z++;
code += 1;
//puts("o4");
goto fstart;
fend:
//puts("end");
return;
} // end fgotoer
int
main()
{
time_t start_time, stop_time;
int* code_ref = codep;
int i;
int** compiled_code;
//int cnt = 10;
int cnt = 500000;
for( i = 0; i < SIZE ; i++ )
*code_ref++ = ( i % 3 ) + 1;
codep[ SIZE - 1 ] = 0;
time(&start_time);
for( i = 0; i < cnt; i++ )
switcher( codep );
time(&stop_time);
printf("switcher time delta=%i\n", stop_time - start_time );
start_time = stop_time;
for( i = 0; i < cnt; i++ )
caller( codep );
time(&stop_time);
printf("caller time delta=%i\n", stop_time - start_time );
start_time = stop_time;
for( i = 0; i < cnt; i++ )
gotoer( codep );
time(&stop_time);
printf("jumper time delta=%i\n", stop_time - start_time );
start_time = stop_time;
compiled_code = (int**)convert( SIZE, codep, (int*)fgotoer(0, 1) );
time(&stop_time);
printf("code compile time delta=%i\n", stop_time - start_time );
start_time = stop_time;
for( i = 0; i < cnt; i++ )
fgotoer( (int*)compiled_code, 0 );
time(&stop_time);
printf("code jumper time delta=%i\n", stop_time - start_time );
start_time = stop_time;
return 0;
}