OK, i've written a simple benchmark which implements an elementwise multiply (A=B*C) in three different ways (standard C, intrinsics, hand coded assembly). On the face of things the results seem to indicate that the vectorization works best on medium sized inputs. If people could post the results of running the benchmark on their machines (takes ~1min) along with the output of gcc --version and their chip model, that wd be v useful.
It should be compiled with: gcc -msse -O2 vec_bench.c -o vec_bench Here's two: CPU: Core Duo T2500 @ 2GHz gcc --version: gcc (GCC) 4.1.2 (Ubuntu 4.1.2-0ubuntu4) Problem size Simple Intrin Inline 100 0.0003ms (100.0%) 0.0002ms ( 67.7%) 0.0002ms ( 50.6%) 1000 0.0030ms (100.0%) 0.0021ms ( 69.2%) 0.0015ms ( 50.6%) 10000 0.0370ms (100.0%) 0.0267ms ( 72.0%) 0.0279ms ( 75.4%) 100000 0.2258ms (100.0%) 0.1469ms ( 65.0%) 0.1273ms ( 56.4%) 1000000 4.5690ms (100.0%) 4.4616ms ( 97.6%) 4.4185ms ( 96.7%) 10000000 47.0022ms (100.0%) 45.4100ms ( 96.6%) 44.4437ms ( 94.6%) CPU: Intel Xeon E5345 @ 2.33Ghz gcc --version: gcc (GCC) 4.1.2 20070925 (Red Hat 4.1.2-33) Problem size Simple Intrin Inline 100 0.0001ms (100.0%) 0.0001ms ( 69.2%) 0.0001ms ( 77.4%) 1000 0.0010ms (100.0%) 0.0008ms ( 78.1%) 0.0009ms ( 86.6%) 10000 0.0108ms (100.0%) 0.0088ms ( 81.2%) 0.0086ms ( 79.6%) 100000 0.1131ms (100.0%) 0.0897ms ( 79.3%) 0.0872ms ( 77.1%) 1000000 5.2103ms (100.0%) 3.9153ms ( 75.1%) 3.8328ms ( 73.6%) 10000000 54.1815ms (100.0%) 51.8286ms ( 95.7%) 51.4366ms ( 94.9%) James
#include <assert.h> #include <stdio.h> #include <stdlib.h> #include <math.h> #include <xmmintrin.h> int sizes[6] = {100, 1000, 10000, 100000, 1000000, 10000000}; int nsizes = 6; int repeats = 300; double accurate_time() { struct timeval t; gettimeofday(&t,NULL); return (double)t.tv_sec + t.tv_usec*0.000001; } void rand_vector(int sz, float* vec) { int i; for (i=0; i<sz; i++) { vec[i] = (float)rand()/RAND_MAX; } } typedef void (*vec_func)(float*, float*, float*, int); double time_func(vec_func func, int sz) { int i; double t1, t2; float *v1, *v2, *v3; v1 = (float*)malloc(sizeof(float)*sz); v2 = (float*)malloc(sizeof(float)*sz); v3 = (float*)malloc(sizeof(float)*sz); rand_vector(sz, v1); rand_vector(sz, v2); t1 = accurate_time(); for (i=0; i<repeats; i++) { func(v1, v2, v3, sz); } t2 = accurate_time(); free(v1); free(v2); free(v3); return (t2-t1)/repeats; } void multiply_float_simple(float* a, float* b, float* c, int sz) { int i; for (i=0; i<sz; i++) { c[i] = a[i]*b[i]; } } void multiply_float_intrin(float* a, float* b, float* c, int sz) { __m128 a_, b_, c_; int i; for (i = 0; i<(sz & -4); i+=4) { a_ = _mm_loadu_ps(a + i); b_ = _mm_loadu_ps(b + i); c_ = _mm_mul_ps(a_, b_); _mm_storeu_ps(c + i, c_); } for ( ; i < sz; i++) { a_ = _mm_load_ss(a + i); b_ = _mm_load_ss(b + i); c_ = _mm_mul_ss(a_, b_); _mm_store_ss(c + i, c_); } } void multiply_float_inline(float* a, float* b, float* c, int sz) { #ifndef __SSE__ #error Use -msse #else __asm__ __volatile__( "mov %[sz], %%eax\n\t" "test $-4, %%eax\n\t" "jz .L_11_%=\n\t" ".L_10_%=:\n\t" "movlps 0(%[a]), %%xmm0\n\t" // xmm0 = a "movhps 8(%[a]), %%xmm0\n\t" "movlps 0(%[b]), %%xmm1\n\t" // xmm1 = b "movhps 8(%[b]), %%xmm1\n\t" "mulps %%xmm0, %%xmm1\n\t" // xmm1 = a*b "movlps %%xmm1, 0(%[c])\n\t" "movhps %%xmm1, 8(%[c])\n\t" "add $16, %[a]\n\t" "add $16, %[b]\n\t" "add $16, %[c]\n\t" "sub $4, %%eax\n\t" "test $-4, %%eax\n\t" "jnz .L_10_%=\n\t" ".L_11_%=:\n\t" "test $-1, %%eax\n\t" "jz .L_13_%=\n\t" ".L_12_%=:\n\t" "movss 0(%[a]), %%xmm0\n\t" "movss 0(%[b]), %%xmm1\n\t" "mulss %%xmm0, %%xmm1\n\t" "movss %%xmm1, 0(%[c])\n\t" "add $4, %[a]\n\t" "add $4, %[b]\n\t" "add $4, %[c]\n\t" "sub $1, %%eax\n\t" "test $-1, %%eax\n\t" "jnz .L_12_%=\n\t" ".L_13_%=:\n\t" : // Outputs [a] "=r" (a), [b] "=r" (b), [c] "=r" (c) : // Inputs "0" (a), "1" (b), "2" (c), [sz] "m" (sz) : "%eax", "%xmm0", "%xmm1" ); #endif } vec_func methods[3] = {multiply_float_simple, multiply_float_intrin, multiply_float_inline}; char *method_names[3] = {"Simple", "Intrin", "Inline"}; int nmethods = 3; void test_methods(void) { int test_size = 10003; // Not divisible by 4 int i, j; float *v1, *v2, *v3, *v4; v1 = (float*)malloc(test_size*sizeof(float)); v2 = (float*)malloc(test_size*sizeof(float)); v3 = (float*)malloc(test_size*sizeof(float)); v4 = (float*)malloc(test_size*sizeof(float)); rand_vector(test_size, v1); rand_vector(test_size, v2); methods[0](v1, v2, v3, test_size); for (i=1; i<nmethods; i++) { methods[i](v1, v2, v4, test_size); for (j=0; j<test_size; j++) { assert(v4[j] == v3[j]); } } free(v1); free(v2); free(v3); free(v4); } int main(void) { int i, j; double dt; double dt_simp; printf("Testing methods...\n"); test_methods(); printf("All OK\n\n"); printf("%20s", "Problem size"); for (i=0; i<nmethods; i++) { printf("%20s", method_names[i]); } printf("\n"); for (i=0; i<nsizes; i++) { printf("%20d", sizes[i]); dt_simp = time_func(methods[0], sizes[i])*1000.0; printf(" %7.4fms (%5.1f%%)", dt_simp, 100.0); for (j=1; j<nmethods; j++) { dt = time_func(methods[j], sizes[i])*1000.0; printf(" %7.4fms (%5.1f%%)", dt, (100.0*dt)/dt_simp); } printf("\n"); } }
_______________________________________________ Numpy-discussion mailing list Numpy-discussion@scipy.org http://projects.scipy.org/mailman/listinfo/numpy-discussion