I wanted to see how much faster I could get in C than in Python.  The
answer so far is only about five times.

// A simple little wave-mechanics display thing in SDL, by Kragen
// Javier Sitaker, 2007-12-07 and 08.

// Originally I wrote it in Python with PyGame (49 lines of code), but
// I was disappointed with how slowly it ran, about 17fps on my
// PIII-Coppermine-700 at 320x240.  So I thought I'd try writing it in
// C to see how much faster it was, and much to my surprise, it was
// only something like 50% faster at 26fps --- although the size
// penalty was not as bad as I thought, as my first C version is only
// about 70 lines of code.  And it was fast to write!  It took under
// an hour.  Let's hear it for SDL!

// But it seemed like the sqrt and sin functions were taking up a lot
// of the run time, and any floating-point math seemed to be kind of
// costly, so I wrote this "all-integer" version.  In places, the
// integers are scaled by 256 so as to be able to represent fractional
// values, and there are tables that are initialized (to integers)
// using floating-point math.  (I thought floating-point was supposed
// to be fast now, but I guess my 1999 CPU hasn't gotten the news
// yet.)  This quadrupled the speed to 105fps.

// This version has a couple of interfering waves, one of which has a
// moving center, and so it only gets 61fps instead of 105fps at
// 320x240 on my laptop.  But that's still faster than my screen
// refreshes.

// Unfortunately the moving center doesn't produce a Doppler effect as
// it ought to.  That will require a different approach.

// I compiled with gcc -Wall -O5 -fomit-frame-pointer -g -lSDL on gcc
// 4.1.2.

#include <SDL/SDL.h>
#include <stdio.h>
#include <sys/time.h>
#include <time.h>
#include <assert.h>
#include <math.h>

// returns current time in 256ths of a second
int getnow_scaled() {
  struct timeval now;
  gettimeofday(&now, 0);
  return (now.tv_sec << 8) + now.tv_usec * 256 / 1000000;
}

int make_grayscale_component(Uint32 mask, float level) {
  return (int)(mask * level) & mask;
}

#define max_grayscale_value 256
short grayscale_palette[max_grayscale_value];
void init_grayscale_palette(struct SDL_PixelFormat *format) {
  int ii;
  assert(sizeof(grayscale_palette[0]) == format->BytesPerPixel);
  for (ii = 0; ii < max_grayscale_value; ii++) {
    float level = ii / 256.0;
    grayscale_palette[ii] = 
      make_grayscale_component(format->Rmask, level) +
      make_grayscale_component(format->Gmask, level) +
      make_grayscale_component(format->Bmask, level);
  }
}

// The idea here is that sqrtx256[x/256] == 256 * sqrt(x), which is
// equivalent to saying that sqrtx256[x] == 256 * 16 * sqrt(x), so we
// can do exact lookups for small numbers and linear interpolation for
// large ones.

// The max we'll currently encounter is 640*640 + 480*480 = 640 000,
// which is 256 * 2500.
#define max_sqrtx256 2500
int sqrtx256[max_sqrtx256];
void init_sqrtx256() {
  int ii;
  for (ii = 0; ii < max_sqrtx256; ii++)
    sqrtx256[ii] = (int)(0.5 + 256 * 16 * sqrt(ii));
}

// returns an approximation of 256 * sqrt(sqr)
int inline interp_sqrt(int sqr) {
  // This line eats 10% of performance:
  if (sqr < max_sqrtx256) return sqrtx256[sqr] >> 4;
  // the rest of this routine compiles inline to 10 instructions: mov
  // and sar mov mov sub imul sar lea jmp.
  int high = sqr >> 8; // we hope high < max_sqrtx256-1
  int below = sqrtx256[high];
  int above = sqrtx256[high+1];
  int spread = above - below;
  int correction = ((sqr & 0xff) * spread) >> 8;
  return correction + below;
}

// 4 * pi * 256, accurate to six places; I'd take things modulo 
// 2 * pi * 256 but rounding that to an integer introduces 100x as
// much error
#define fourpi 3217

// rtable: the idea is that you index into rtable with a value that
// represents an angle, in radians, scaled by 256, which in this case
// comes from the distance from the center of some wave minus the
// current time; and you get back a value scaled to the range [0, 256)
// representing (1 + sin(theta))/2.
unsigned char rtable[fourpi];
void init_rtable() {
  int ii;
  for (ii = 0; ii < fourpi; ii++) {
    int sin_val = 128 + 128 * sin(ii / 256.0) + 0.5;
    if (sin_val > 255) sin_val = 255;
    if (sin_val < 0) sin_val = 0;
    rtable[ii] = sin_val;
  }
}

int xorigin=0, yorigin=100;
void redraw_world(SDL_Surface *screen) {
  int xx, yy, w = screen->w, h = screen->h;
  short *pix;
  int now_scaled = getnow_scaled();
  assert(sizeof(*pix) == screen->format->BytesPerPixel);
  SDL_LockSurface(screen);
  pix = screen->pixels;
  for (yy = 0; yy < h; yy++) {
    for (xx = 0; xx < w; xx++) {
      int rx = xx - w/2, ry = yy - h/2;
      unsigned rscaled = interp_sqrt(rx*rx + ry*ry)/(w/64) - now_scaled;
      int dx = xx - xorigin, dy = yy - yorigin;
      unsigned rscaled2= interp_sqrt(dx*dx + dy*dy)/(w/32) - now_scaled * 4;
      *pix++ = grayscale_palette[(rtable[rscaled % fourpi] + 
                                  rtable[rscaled2% fourpi])/2];
    }
  }
  SDL_UnlockSurface(screen);
}

int lastframe;                        // global so main() can initialize it
#define movement_scaling 16
void update_moving_center(SDL_Surface *screen) {
  int now = getnow_scaled();
  int dt = now - lastframe;           // delta time since last frame
  int dx, dy;
  static int dx_err = 0, dy_err = 0;  // keep track of fractional pixels
  static int xdir = 1, ydir = 1;
  lastframe = now;                    // we only wanted lastframe to get dt
  dx = xdir * dt + dx_err;            // now calculate desired pixel movements
  dy = ydir * dt + dy_err;

  xorigin += dx / movement_scaling;   // apply them
  yorigin += dy / movement_scaling;
  dx_err = dx % movement_scaling;     // save up fractional pixels for later
  dy_err = dy % movement_scaling;

  if (xorigin > screen->w) xdir = -1; // bounce if need be; it's OK to be 
  if (xorigin < 0) xdir = 1;          // a little bit off the screen
  if (yorigin > screen->h) ydir = -1;
  if (yorigin < 0) ydir = 1;
}

int main(int argc, char **argv) {
  int frames = 0;
  int start, end;
  SDL_Surface *screen;
  SDL_Init(SDL_INIT_EVERYTHING);
  screen = SDL_SetVideoMode(320, 240, 16, SDL_FULLSCREEN);
  //screen = SDL_SetVideoMode(640, 480, 16, SDL_FULLSCREEN);
  if (!screen) abort();
  init_grayscale_palette(screen->format);
  init_sqrtx256();
  //memcpy(sqrtx256, init_sqrtx256, 1024);  just for fun
  init_rtable();
  lastframe = start = getnow_scaled();
  for (;;) {
    SDL_Event ev;
    if (SDL_PollEvent(&ev)) {
      if (ev.type == SDL_MOUSEBUTTONDOWN || ev.type == SDL_QUIT) break;
    } else {
      redraw_world(screen);
      SDL_Flip(screen);
      frames++;
      update_moving_center(screen);
    }
  }
  end = getnow_scaled();
  SDL_Quit();
  printf("%.2f seconds, %.2f fps\n", (end - start)/256.0, 
         256.0 * frames / (end - start));
  return 0;
}

Reply via email to