OK, i've written a simple benchmark which implements an elementwise
multiply (A=B*C) in three different ways (standard C, intrinsics, hand
coded assembly). On the face of things the results seem to indicate
that the vectorization works best on medium sized inputs. If people
could post the results of running the benchmark on their machines
(takes ~1min) along with the output of gcc --version and their chip
model, that wd be v useful.

It should be compiled with: gcc -msse -O2 vec_bench.c -o vec_bench

Here's two:

CPU: Core Duo T2500 @ 2GHz
gcc --version: gcc (GCC) 4.1.2 (Ubuntu 4.1.2-0ubuntu4)
        Problem size              Simple              Intrin              Inline
                 100   0.0003ms (100.0%)   0.0002ms ( 67.7%)   0.0002ms ( 50.6%)
                1000   0.0030ms (100.0%)   0.0021ms ( 69.2%)   0.0015ms ( 50.6%)
               10000   0.0370ms (100.0%)   0.0267ms ( 72.0%)   0.0279ms ( 75.4%)
              100000   0.2258ms (100.0%)   0.1469ms ( 65.0%)   0.1273ms ( 56.4%)
             1000000   4.5690ms (100.0%)   4.4616ms ( 97.6%)   4.4185ms ( 96.7%)
            10000000  47.0022ms (100.0%)  45.4100ms ( 96.6%)  44.4437ms ( 94.6%)

CPU: Intel Xeon E5345 @ 2.33Ghz
gcc --version: gcc (GCC) 4.1.2 20070925 (Red Hat 4.1.2-33)
        Problem size              Simple              Intrin              Inline
                 100   0.0001ms (100.0%)   0.0001ms ( 69.2%)   0.0001ms ( 77.4%)
                1000   0.0010ms (100.0%)   0.0008ms ( 78.1%)   0.0009ms ( 86.6%)
               10000   0.0108ms (100.0%)   0.0088ms ( 81.2%)   0.0086ms ( 79.6%)
              100000   0.1131ms (100.0%)   0.0897ms ( 79.3%)   0.0872ms ( 77.1%)
             1000000   5.2103ms (100.0%)   3.9153ms ( 75.1%)   3.8328ms ( 73.6%)
            10000000  54.1815ms (100.0%)  51.8286ms ( 95.7%)  51.4366ms ( 94.9%)

James
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <xmmintrin.h>

int sizes[6] = {100, 1000, 10000, 100000, 1000000, 10000000};
int nsizes = 6;
int repeats = 300;

double
accurate_time()
{
  struct timeval t;
  gettimeofday(&t,NULL);
  return (double)t.tv_sec + t.tv_usec*0.000001;
}

void
rand_vector(int sz, float* vec)
{
  int i;
  for (i=0; i<sz; i++) {
    vec[i] = (float)rand()/RAND_MAX;
  }
}

typedef void (*vec_func)(float*, float*, float*, int);

double
time_func(vec_func func, int sz)
{
  int i;
  double t1, t2;
  float *v1, *v2, *v3;

  v1 = (float*)malloc(sizeof(float)*sz);
  v2 = (float*)malloc(sizeof(float)*sz);
  v3 = (float*)malloc(sizeof(float)*sz);
  rand_vector(sz, v1);
  rand_vector(sz, v2);

  t1 = accurate_time();
  for (i=0; i<repeats; i++) {
    func(v1, v2, v3, sz);
  }
  t2 = accurate_time();
 
  free(v1);
  free(v2);
  free(v3);

  return (t2-t1)/repeats;
}

void
multiply_float_simple(float* a, float* b, float* c, int sz)
{
  int i;
  for (i=0; i<sz; i++) {
    c[i] = a[i]*b[i];
  }
}

void
multiply_float_intrin(float* a, float* b, float* c, int sz)
{
  __m128 a_, b_, c_;
  int i;
  for (i = 0; i<(sz & -4); i+=4) {
    a_ = _mm_loadu_ps(a + i);
    b_ = _mm_loadu_ps(b + i);
    c_ = _mm_mul_ps(a_, b_);
    _mm_storeu_ps(c + i, c_);
  }
  for ( ; i < sz; i++) {
    a_ = _mm_load_ss(a + i);
    b_ = _mm_load_ss(b + i);
    c_ = _mm_mul_ss(a_, b_);
    _mm_store_ss(c + i, c_);
  }
}

void
multiply_float_inline(float* a, float* b, float* c, int sz)
{
#ifndef __SSE__
#error Use -msse
#else
  __asm__ __volatile__(
   "mov %[sz], %%eax\n\t"
   "test $-4, %%eax\n\t"
   "jz .L_11_%=\n\t"
   ".L_10_%=:\n\t"

   "movlps 0(%[a]), %%xmm0\n\t" // xmm0 = a
   "movhps 8(%[a]), %%xmm0\n\t"

   "movlps 0(%[b]), %%xmm1\n\t" // xmm1 = b
   "movhps 8(%[b]), %%xmm1\n\t"

   "mulps %%xmm0, %%xmm1\n\t" // xmm1 = a*b

   "movlps %%xmm1, 0(%[c])\n\t"
   "movhps %%xmm1, 8(%[c])\n\t"

   "add $16, %[a]\n\t"
   "add $16, %[b]\n\t"
   "add $16, %[c]\n\t"

   "sub $4, %%eax\n\t"
   "test $-4, %%eax\n\t"
   "jnz .L_10_%=\n\t"
   ".L_11_%=:\n\t"

   "test $-1, %%eax\n\t"
   "jz .L_13_%=\n\t"
   ".L_12_%=:\n\t"

   "movss 0(%[a]), %%xmm0\n\t"
   "movss 0(%[b]), %%xmm1\n\t"
   "mulss %%xmm0, %%xmm1\n\t"
   "movss %%xmm1, 0(%[c])\n\t"
   
   "add $4, %[a]\n\t"
   "add $4, %[b]\n\t"
   "add $4, %[c]\n\t"

   "sub $1, %%eax\n\t"
   "test $-1, %%eax\n\t"
   "jnz .L_12_%=\n\t"
   ".L_13_%=:\n\t"
  : // Outputs
   [a] "=r" (a),
   [b] "=r" (b),
   [c] "=r" (c)
  : // Inputs
   "0" (a),
   "1" (b),
   "2" (c),
   [sz] "m" (sz)
  : "%eax", "%xmm0", "%xmm1"
  );
#endif
}

vec_func methods[3] = {multiply_float_simple, multiply_float_intrin, multiply_float_inline};
char *method_names[3] = {"Simple", "Intrin", "Inline"};
int nmethods = 3;

void
test_methods(void)
{
  int test_size = 10003; // Not divisible by 4
  int i, j;
  float *v1, *v2, *v3, *v4;
  v1 = (float*)malloc(test_size*sizeof(float));
  v2 = (float*)malloc(test_size*sizeof(float));
  v3 = (float*)malloc(test_size*sizeof(float));
  v4 = (float*)malloc(test_size*sizeof(float));

  rand_vector(test_size, v1);
  rand_vector(test_size, v2);

  methods[0](v1, v2, v3, test_size);

  for (i=1; i<nmethods; i++) {
    methods[i](v1, v2, v4, test_size);
    for (j=0; j<test_size; j++) {
      assert(v4[j] == v3[j]);
    }
  }

  free(v1);
  free(v2);
  free(v3);
  free(v4);
}

int
main(void)
{
  int i, j;
  double dt;
  double dt_simp;

  printf("Testing methods...\n");
  test_methods();
  printf("All OK\n\n");

  printf("%20s", "Problem size");
  for (i=0; i<nmethods; i++) {
    printf("%20s", method_names[i]);
  }
  printf("\n");

  for (i=0; i<nsizes; i++) {
    printf("%20d", sizes[i]);
    dt_simp = time_func(methods[0], sizes[i])*1000.0;
    printf("  %7.4fms (%5.1f%%)", dt_simp, 100.0);
    for (j=1; j<nmethods; j++) {
      dt = time_func(methods[j], sizes[i])*1000.0;
      printf("  %7.4fms (%5.1f%%)", dt, (100.0*dt)/dt_simp);
    }
    printf("\n");
  }
}
_______________________________________________
Numpy-discussion mailing list
Numpy-discussion@scipy.org
http://projects.scipy.org/mailman/listinfo/numpy-discussion

Reply via email to