kai: > I was evaluating using D for some numerical stuff.
For that evaluation you probably have to use the LDC compiler, that is able to optimize better. > void main (string[] args) > { > double [] foo = new double [cast(int)1e6]; > for (int i=0;i<1e3;i++) > { > for (int j=0;j<1e6-1;j++) > { > foo[j]=foo[j]+foo[j+1]; > } > } > } Using floating point for indexes and lengths is not a good practice. In D large numbers are written like 1_000_000. Use -release too. > Any ideas? Am I somehow not hitting a vital compiler optimization? DMD compiler doesn't perform many optimizations, especially on floating point computations. But the bigger problem in your code is that you are performing operations on NaNs (that's the default initalization of FP values in D), and operations on NaNs are usually quite slower. Your code in C: #include "stdio.h" #include "stdlib.h" #define N 1000000 int main() { double *foo = calloc(N, sizeof(double)); // malloc suffices here int i, j; for (j = 0; j < N; j++) foo[j] = 1.0; for (i = 0; i < 1000; i++) for (j = 0; j < N-1; j++) foo[j] = foo[j] + foo[j + 1]; printf("%f", foo[N-1]); return 0; } /* gcc -O3 -s -Wall test.c -o test Timings, outer loop=1_000 times: 7.72 ------------------ gcc -Wall -O3 -fomit-frame-pointer -msse3 -march=native test.c -o test (Running on a VirtualBox) Timings, outer loop=1_000 times: 7.69 s Just the inner loop: .L7: fldl 8(%edx) fadd %st, %st(1) fxch %st(1) fstpl (%edx) addl $8, %edx cmpl %ecx, %edx jne .L7 */ -------------------- Your code in D1: version (Tango) import tango.stdc.stdio: printf; else import std.c.stdio: printf; void main() { const int N = 1_000_000; double[] foo = new double[N]; foo[] = 1.0; for (int i = 0; i < 1_000; i++) for (int j = 0; j < N-1; j++) foo[j] = foo[j] + foo[j + 1]; printf("%f", foo[N-1]); } /* dmd -O -release -inline test.d (Not running on a VirtualBox) Timings, outer loop=1_000 times: 9.35 s Just the inner loop: L34: fld qword ptr 8[EDX*8][ECX] fadd qword ptr [EDX*8][ECX] fstp qword ptr [EDX*8][ECX] inc EDX cmp EDX,0F423Fh jb L34 ----------------------- ldc -O3 -release -inline test.d (Running on a VirtualBox) Timings, outer loop=1_000 times: 7.87 s Just the inner loop: .LBB1_2: movsd (%eax,%ecx,8), %xmm0 addsd 8(%eax,%ecx,8), %xmm0 movsd %xmm0, (%eax,%ecx,8) incl %ecx cmpl $999999, %ecx jne .LBB1_2 ----------------------- ldc -unroll-allow-partial -O3 -release -inline test.d (Running on a VirtualBox) Timings, outer loop=1_000 times: 7.75 s Just the inner loop: .LBB1_2: movsd (%eax,%ecx,8), %xmm0 addsd 8(%eax,%ecx,8), %xmm0 movsd %xmm0, (%eax,%ecx,8) movsd 8(%eax,%ecx,8), %xmm0 addsd 16(%eax,%ecx,8), %xmm0 movsd %xmm0, 8(%eax,%ecx,8) movsd 16(%eax,%ecx,8), %xmm0 addsd 24(%eax,%ecx,8), %xmm0 movsd %xmm0, 16(%eax,%ecx,8) movsd 24(%eax,%ecx,8), %xmm0 addsd 32(%eax,%ecx,8), %xmm0 movsd %xmm0, 24(%eax,%ecx,8) movsd 32(%eax,%ecx,8), %xmm0 addsd 40(%eax,%ecx,8), %xmm0 movsd %xmm0, 32(%eax,%ecx,8) movsd 40(%eax,%ecx,8), %xmm0 addsd 48(%eax,%ecx,8), %xmm0 movsd %xmm0, 40(%eax,%ecx,8) movsd 48(%eax,%ecx,8), %xmm0 addsd 56(%eax,%ecx,8), %xmm0 movsd %xmm0, 48(%eax,%ecx,8) movsd 56(%eax,%ecx,8), %xmm0 addsd 64(%eax,%ecx,8), %xmm0 movsd %xmm0, 56(%eax,%ecx,8) movsd 64(%eax,%ecx,8), %xmm0 addsd 72(%eax,%ecx,8), %xmm0 movsd %xmm0, 64(%eax,%ecx,8) addl $9, %ecx cmpl $999999, %ecx jne .LBB1_2 */ As you see the code generated by ldc is about as good as the one generated by gcc. There are of course other ways to optimize this code... Bye, bearophile