On Sat, Mar 22, 2008 at 6:34 PM, Charles R Harris <[EMAIL PROTECTED]> wrote:
I've attached a double version. Compile with gcc -msse2 -mfpmath=sse -O2 vec_bench_dbl.c -o vec_bench_dbl Chuck
#include <assert.h> #include <stdio.h> #include <stdlib.h> #include <math.h> #include <emmintrin.h> int sizes[6] = {100, 1000, 10000, 100000, 1000000, 10000000}; int nsizes = 6; int repeats = 300; double accurate_time() { struct timeval t; gettimeofday(&t,NULL); return (double)t.tv_sec + t.tv_usec*0.000001; } void rand_vector(int sz, double* vec) { int i; for (i=0; i<sz; i++) { vec[i] = (double)rand()/RAND_MAX; } } typedef void (*vec_func)(double*, double*, double*, int); double time_func(vec_func func, int sz) { int i; double t1, t2; double *v1, *v2, *v3; v1 = (double*)malloc(sizeof(double)*sz); v2 = (double*)malloc(sizeof(double)*sz); v3 = (double*)malloc(sizeof(double)*sz); rand_vector(sz, v1); rand_vector(sz, v2); t1 = accurate_time(); for (i=0; i<repeats; i++) { func(v1, v2, v3, sz); } t2 = accurate_time(); free(v1); free(v2); free(v3); return (t2-t1)/repeats; } void multiply_double_simple(double* a, double* b, double* c, int sz) { int i; for (i=0; i<sz; i++) { c[i] = a[i]*b[i]; } } void multiply_double_intrin(double* a, double* b, double* c, int sz) { __m128d a_, b_, c_; int i; for (i = 0; i<(sz & -2); i+=2) { a_ = _mm_loadu_pd(a + i); b_ = _mm_loadu_pd(b + i); c_ = _mm_mul_pd(a_, b_); _mm_storeu_pd(c + i, c_); } for ( ; i < sz; i++) { a_ = _mm_load_sd(a + i); b_ = _mm_load_sd(b + i); c_ = _mm_mul_sd(a_, b_); _mm_store_sd(c + i, c_); } } /* void multiply_float_inline(float* a, float* b, float* c, int sz) { #ifndef __SSE__ #error Use -msse #else __asm__ __volatile__( "mov %[sz], %%eax\n\t" "test $-4, %%eax\n\t" "jz .L_11_%=\n\t" ".L_10_%=:\n\t" "movlps 0(%[a]), %%xmm0\n\t" // xmm0 = a "movhps 8(%[a]), %%xmm0\n\t" "movlps 0(%[b]), %%xmm1\n\t" // xmm1 = b "movhps 8(%[b]), %%xmm1\n\t" "mulps %%xmm0, %%xmm1\n\t" // xmm1 = a*b "movlps %%xmm1, 0(%[c])\n\t" "movhps %%xmm1, 8(%[c])\n\t" "add $16, %[a]\n\t" "add $16, %[b]\n\t" "add $16, %[c]\n\t" "sub $4, %%eax\n\t" "test $-4, %%eax\n\t" "jnz .L_10_%=\n\t" ".L_11_%=:\n\t" "test $-1, %%eax\n\t" "jz .L_13_%=\n\t" ".L_12_%=:\n\t" "movss 0(%[a]), %%xmm0\n\t" "movss 0(%[b]), %%xmm1\n\t" "mulss %%xmm0, %%xmm1\n\t" "movss %%xmm1, 0(%[c])\n\t" "add $4, %[a]\n\t" "add $4, %[b]\n\t" "add $4, %[c]\n\t" "sub $1, %%eax\n\t" "test $-1, %%eax\n\t" "jnz .L_12_%=\n\t" ".L_13_%=:\n\t" : // Outputs [a] "=r" (a), [b] "=r" (b), [c] "=r" (c) : // Inputs "0" (a), "1" (b), "2" (c), [sz] "m" (sz) : "%eax", "%xmm0", "%xmm1" ); #endif } */ vec_func methods[3] = {multiply_double_simple, multiply_double_intrin}; char *method_names[3] = {"Simple", "Intrin"}; int nmethods = 2; void test_methods(void) { int test_size = 10003; // Not divisible by 4 int i, j; double *v1, *v2, *v3, *v4; v1 = (double*)malloc(test_size*sizeof(double)); v2 = (double*)malloc(test_size*sizeof(double)); v3 = (double*)malloc(test_size*sizeof(double)); v4 = (double*)malloc(test_size*sizeof(double)); rand_vector(test_size, v1); rand_vector(test_size, v2); methods[0](v1, v2, v3, test_size); for (i=1; i<nmethods; i++) { methods[i](v1, v2, v4, test_size); for (j=0; j<test_size; j++) { assert(v4[j] == v3[j]); } } free(v1); free(v2); free(v3); free(v4); } int main(void) { int i, j; double dt; double dt_simp; printf("Testing methods...\n"); test_methods(); printf("All OK\n\n"); printf("%20s", "Problem size"); for (i=0; i<nmethods; i++) { printf("%20s", method_names[i]); } printf("\n"); for (i=0; i<nsizes; i++) { printf("%20d", sizes[i]); dt_simp = time_func(methods[0], sizes[i])*1000.0; printf(" %7.4fms (%5.1f%%)", dt_simp, 100.0); for (j=1; j<nmethods; j++) { dt = time_func(methods[j], sizes[i])*1000.0; printf(" %7.4fms (%5.1f%%)", dt, (100.0*dt)/dt_simp); } printf("\n"); } }
_______________________________________________ Numpy-discussion mailing list Numpy-discussion@scipy.org http://projects.scipy.org/mailman/listinfo/numpy-discussion