Moritz Warning: > I think this might be interesting; it's also using D.
I was about to post this :-) The little benchmark: http://attractivechaos.wordpress.com/2011/04/25/my-programming-language-benchmarks-plb/ http://attractivechaos.github.com/plb/ (The matrix mul code is not a realistic example of code because people use library code to do this operation.) Original C code: double **mm_mul(int n, double *const *a, double *const *b) { int i, j, k; double **m, **c; m = mm_init(n); c = mm_init(n); for (i = 0; i < n; ++i) // transpose for (j = 0; j < n; ++j) c[i][j] = b[j][i]; for (i = 0; i < n; ++i) { double *p = a[i], *q = m[i]; for (j = 0; j < n; ++j) { double t = 0.0, *r = c[j]; for (k = 0; k < n; ++k) t += p[k] * r[k]; q[j] = t; } } mm_destroy(n, c); return m; } The original D1 code of the matrix mul, coming from the Java version: double[][] matmul(double[][] a, double[][] b) { int m = a.length, n = a[0].length, p = b[0].length; double[][] x, c; x.length = m; c.length = p; for (int i = 0; i < m; ++i) x[i].length = p; for (int i = 0; i < p; ++i) c[i].length = n; for (int i = 0; i < n; ++i) // transpose for (int j = 0; j < p; ++j) c[j][i] = b[i][j]; for (int i = 0; i < m; ++i) for (int j = 0; j < p; ++j) { double s = 0.0; for (int k = 0; k < n; ++k) s += a[i][k] * c[j][k]; x[i][j] = s; } return x; } Improved D2 code: http://codepad.org/aifQYhjI pure nothrow double[][] matMul(in double[][] a, in double[][] b) { immutable int m = a.length, n = a[0].length, p = b[0].length; // transpose auto c = new double[][](p, n); foreach (i, brow; b) foreach (j, bx; brow) c[j][i] = bx; auto x = new double[][](m, p); foreach (i, arow; a) foreach (j, crow; c) { // x[i][j] = std.numeric.dotProduct(arow, crow); // right way double s = 0.0; foreach (k, arowk; arow) s += arowk * crow[k]; x[i][j] = s; } return x; } Lua code: function matrix.mul(a, b, n) local y = matrix.new(n, n) local c = matrix.T(b, n) for i = 1, n - 1 do for j = 1, n - 1 do local sum = 0 for k = 0, n - 1 do sum = sum + a[i][k] * c[j][k] end y[i][j] = sum end end return y end -------------------- Timings, seconds: ICC-12.0.3 C 1.8 GCC-4.3.2 C 2.3 GDC-0.24 D 8.8 (DMD gives similar timings) Java@JRE-1.6.0_14 Java 2.9 LuaJIT-2.0.0-beta6 (JIT) Lua 2.7 Improved D version: about 4.4 seconds Inner loop GCC: L25: fldl (%edx,%eax,8) fmull (%ecx,%eax,8) addl $1, %eax cmpl %ebx, %eax faddp %st, %st(1) jne L25 Inner loop DMD, original version: L153: mov EDX,4[EBX] mov EAX,[EBX] mov EAX,024h[ESP] fld qword ptr [ESI*8][EDX] mov EDX,028h[ESP] mov EAX,[EDI*8][EDX] mov EDX,4[EDI*8][EDX] fmul qword ptr [ESI*8][EDX] inc ESI cmp ESI,EBP fadd qword ptr 034h[ESP] fstp qword ptr 034h[ESP] jl L153 Inner loop DMD, improved: L172: fld qword ptr [EBX*8][ECX] fmul qword ptr [EBX*8][ESI] inc EBX cmp EBX,04Ch[ESP] fadd qword ptr 054h[ESP] fstp qword ptr 054h[ESP] jb L172 The asm produced by DMD perfoms too many memory accesses in the inner loop. Is it possible to improve the DMD backend to remove this problem? Bye, bearophile