On Wednesday, 15 June 2016 at 09:32:21 UTC, Andrea Fontana wrote:
Then I think the slice.byElement.array is the right solution.

The problem with that is that it slows down the code. I compared matrix multiplication between R and D's cblas adaptor and ndslice.

n = 4000
Matrices: A, B
Sizes: both n by n
Engine: both call openblas

R Elapsed Time: 2.709 s
D's cblas and ndslice: 3.593 s

The R code:

n = 4000; A = matrix(runif(n*n), nr = n); B = matrix(runif(n*n), nr = n)
system.time(C <- A%*%B)

The D code:

import std.stdio : writeln;
import std.experimental.ndslice;
import std.random : Random, uniform;
import std.conv : to;
import std.array : array;
import cblas;
import std.datetime : StopWatch;


T[] runif(T)(ulong len, T min, T max){
        T[] arr = new T[len];
        Random gen;
        for(ulong i = 0; i < len; ++i)
                arr[i] = uniform(min, max, gen);
        return arr;
}

// Random matrix
auto rmat(T)(ulong nrow, ulong ncol, T min, T max){
        return runif(nrow*ncol, min, max).sliced(nrow, ncol);
}

auto matrix_mult(T)(Slice!(2, T*) a, Slice!(2, T*) b){
        int M = to!int(a.shape[0]);
        int K = to!int(a.shape[1]);
        int N = to!int(b.shape[1]);
        int n_el = to!int(a.elementsCount);
        T[] A = a.byElement.array;
        T[] B = b.byElement.array;
        T[] C = new T[M*N];
gemm(Order.ColMajor, Transpose.NoTrans, Transpose.NoTrans, M, N, K, 1., A.ptr, K, B.ptr, N, 0, C.ptr, N);
        return C.sliced(M, N);
}


void main()
{
        int n = 4000;
        auto A = rmat(n, n, 0., 1.);
        auto B = rmat(n, n, 0., 1. );
        StopWatch sw;
        sw.start();
        auto C = matrix_mult(A, B);
        sw.stop();
        writeln("Time taken: \n\t", sw.peek().msecs, " [ms]");
}

In my system monitor I can see the copy phase in the D process as as single core process. There should be a way to do go from ndslice to T[] without copying. Using a foreach loop is even slower

Reply via email to