On Sat, Mar 22, 2008 at 6:34 PM, Charles R Harris <[EMAIL PROTECTED]>
wrote:

I've attached a double version.  Compile with
gcc -msse2 -mfpmath=sse -O2 vec_bench_dbl.c -o vec_bench_dbl

Chuck
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <emmintrin.h>

int sizes[6] = {100, 1000, 10000, 100000, 1000000, 10000000};
int nsizes = 6;
int repeats = 300;

double
accurate_time()
{
  struct timeval t;
  gettimeofday(&t,NULL);
  return (double)t.tv_sec + t.tv_usec*0.000001;
}

void
rand_vector(int sz, double* vec)
{
  int i;
  for (i=0; i<sz; i++) {
    vec[i] = (double)rand()/RAND_MAX;
  }
}

typedef void (*vec_func)(double*, double*, double*, int);

double
time_func(vec_func func, int sz)
{
  int i;
  double t1, t2;
  double *v1, *v2, *v3;

  v1 = (double*)malloc(sizeof(double)*sz);
  v2 = (double*)malloc(sizeof(double)*sz);
  v3 = (double*)malloc(sizeof(double)*sz);
  rand_vector(sz, v1);
  rand_vector(sz, v2);

  t1 = accurate_time();
  for (i=0; i<repeats; i++) {
    func(v1, v2, v3, sz);
  }
  t2 = accurate_time();

  free(v1);
  free(v2);
  free(v3);

  return (t2-t1)/repeats;
}

void
multiply_double_simple(double* a, double* b, double* c, int sz)
{
  int i;
  for (i=0; i<sz; i++) {
    c[i] = a[i]*b[i];
  }
}

void
multiply_double_intrin(double* a, double* b, double* c, int sz)
{
  __m128d a_, b_, c_;
  int i;
  for (i = 0; i<(sz & -2); i+=2) {
    a_ = _mm_loadu_pd(a + i);
    b_ = _mm_loadu_pd(b + i);
    c_ = _mm_mul_pd(a_, b_);
    _mm_storeu_pd(c + i, c_);
  }
  for ( ; i < sz; i++) {
    a_ = _mm_load_sd(a + i);
    b_ = _mm_load_sd(b + i);
    c_ = _mm_mul_sd(a_, b_);
    _mm_store_sd(c + i, c_);
  }
}

/*
void
multiply_float_inline(float* a, float* b, float* c, int sz)
{
#ifndef __SSE__
#error Use -msse
#else
  __asm__ __volatile__(
   "mov %[sz], %%eax\n\t"
   "test $-4, %%eax\n\t"
   "jz .L_11_%=\n\t"
   ".L_10_%=:\n\t"

   "movlps 0(%[a]), %%xmm0\n\t" // xmm0 = a
   "movhps 8(%[a]), %%xmm0\n\t"

   "movlps 0(%[b]), %%xmm1\n\t" // xmm1 = b
   "movhps 8(%[b]), %%xmm1\n\t"

   "mulps %%xmm0, %%xmm1\n\t" // xmm1 = a*b

   "movlps %%xmm1, 0(%[c])\n\t"
   "movhps %%xmm1, 8(%[c])\n\t"

   "add $16, %[a]\n\t"
   "add $16, %[b]\n\t"
   "add $16, %[c]\n\t"

   "sub $4, %%eax\n\t"
   "test $-4, %%eax\n\t"
   "jnz .L_10_%=\n\t"
   ".L_11_%=:\n\t"

   "test $-1, %%eax\n\t"
   "jz .L_13_%=\n\t"
   ".L_12_%=:\n\t"

   "movss 0(%[a]), %%xmm0\n\t"
   "movss 0(%[b]), %%xmm1\n\t"
   "mulss %%xmm0, %%xmm1\n\t"
   "movss %%xmm1, 0(%[c])\n\t"
   
   "add $4, %[a]\n\t"
   "add $4, %[b]\n\t"
   "add $4, %[c]\n\t"

   "sub $1, %%eax\n\t"
   "test $-1, %%eax\n\t"
   "jnz .L_12_%=\n\t"
   ".L_13_%=:\n\t"
  : // Outputs
   [a] "=r" (a),
   [b] "=r" (b),
   [c] "=r" (c)
  : // Inputs
   "0" (a),
   "1" (b),
   "2" (c),
   [sz] "m" (sz)
  : "%eax", "%xmm0", "%xmm1"
  );
#endif
}
*/

vec_func methods[3] = {multiply_double_simple, multiply_double_intrin};
char *method_names[3] = {"Simple", "Intrin"};
int nmethods = 2;

void
test_methods(void)
{
  int test_size = 10003; // Not divisible by 4
  int i, j;
  double *v1, *v2, *v3, *v4;
  v1 = (double*)malloc(test_size*sizeof(double));
  v2 = (double*)malloc(test_size*sizeof(double));
  v3 = (double*)malloc(test_size*sizeof(double));
  v4 = (double*)malloc(test_size*sizeof(double));

  rand_vector(test_size, v1);
  rand_vector(test_size, v2);

  methods[0](v1, v2, v3, test_size);

  for (i=1; i<nmethods; i++) {
    methods[i](v1, v2, v4, test_size);
    for (j=0; j<test_size; j++) {
      assert(v4[j] == v3[j]);
    }
  }

  free(v1);
  free(v2);
  free(v3);
  free(v4);
}

int
main(void)
{
  int i, j;
  double dt;
  double dt_simp;

  printf("Testing methods...\n");
  test_methods();
  printf("All OK\n\n");

  printf("%20s", "Problem size");
  for (i=0; i<nmethods; i++) {
    printf("%20s", method_names[i]);
  }
  printf("\n");

  for (i=0; i<nsizes; i++) {
    printf("%20d", sizes[i]);
    dt_simp = time_func(methods[0], sizes[i])*1000.0;
    printf("  %7.4fms (%5.1f%%)", dt_simp, 100.0);
    for (j=1; j<nmethods; j++) {
      dt = time_func(methods[j], sizes[i])*1000.0;
      printf("  %7.4fms (%5.1f%%)", dt, (100.0*dt)/dt_simp);
    }
    printf("\n");
  }
}
_______________________________________________
Numpy-discussion mailing list
Numpy-discussion@scipy.org
http://projects.scipy.org/mailman/listinfo/numpy-discussion

Reply via email to