Thanks Tim for the quick reply.
I rewrote the same four cases as functions, ran them once for the JIT 
compilation and now the results are consistent (and pretty good) - I also 
kept the code from the old case a' x b' at the end and you can see how much 
slower it is on my computer (but now I understand why).
This is the output and below is my code (again two 1000x1000 matrixes and 
20 iterations):

JIT Compilation:
  0.225234 seconds (584.87 k allocations: 27.644 MB)
  0.043811 seconds (64.50 k allocations: 3.020 MB)
  0.003492 seconds (4.09 k allocations: 219.771 KB)
  0.002444 seconds (3.30 k allocations: 173.763 KB)
Full run:
  0.363003 seconds (104 allocations: 152.590 MB, 1.78% gc time)
a x b 1000x1000x1000 0.383239 s 104373.502648 MFLOPS
  0.355820 seconds (18.49 k allocations: 161.055 MB, 2.97% gc time)
a' x b 1000x1000x1000 0.358574 s 111553.011766 MFLOPS
  0.333026 seconds (109 allocations: 160.220 MB, 3.34% gc time)
a x b' 1000x1000x1000 0.336589 s 118839.346176 MFLOPS
  0.371250 seconds (114 allocations: 167.850 MB, 13.26% gc time)
a' x b' 1000x1000x1000 0.373455 s 107107.937773 MFLOPS
old a' x b' 1000x1000x1000 0.308466 s 6483.691371 MFLOPS



Thanks again.
Franco

#!/bin/julia -q

using ArgParse

s = ArgParseSettings()

@add_arg_table s begin
    "-m"
    "-k"
    "-n"
    "-t"
end

parsed_args = parse_args(s)
m = parse(Int, parsed_args["m"])
k = parse(Int, parsed_args["k"])
n = parse(Int, parsed_args["n"])
t = parse(Int, parsed_args["t"])

a = 2 * rand(m, k) - 1
b = 2 * rand(k, n) - 1

function axb(a::Array{Float64,2}, b::Array{Float64,2}, t::Int64)
    for i = 1:t
        c = a * b
    end
end

function atxb(a::Array{Float64,2}, b::Array{Float64,2}, t::Int64)
    for i = 1:t
        c = a' * b
    end
end

function axbt(a::Array{Float64,2}, b::Array{Float64,2}, t::Int64)
    for i = 1:t
        c = a * b'
    end
end

function atxbt(a::Array{Float64,2}, b::Array{Float64,2}, t::Int64)
    for i = 1:t
        c = a' * b'
    end
end


println("JIT Compilation:")
@time axb(reshape([1.0], 1, 1), reshape([1.0], 1, 1), 1);
@time atxb(reshape([1.0], 1, 1), reshape([1.0], 1, 1), 1);
@time axbt(reshape([1.0], 1, 1), reshape([1.0], 1, 1), 1);
@time atxbt(reshape([1.0], 1, 1), reshape([1.0], 1, 1), 1);

println("Full run:")

tstart = time()
@time axb(a, b, t)
tend = time()
duration = tend - tstart
mflops = (2 * m * n * k) / (duration / t) * 1.0e-6
@printf("a x b\t%dx%dx%d\t%f s\t%f MFLOPS\n", m, n, k, duration, mflops)

tstart = time()
@time atxb(a', b, t)
tend = time()
duration = tend - tstart
mflops = (2 * m * n * k) / (duration / t) * 1.0e-6
@printf("a' x b\t%dx%dx%d\t%f s\t%f MFLOPS\n", m, n, k, duration, mflops)

tstart = time()
@time axbt(a, b', t)
tend = time()
duration = tend - tstart
mflops = (2 * m * n * k) / (duration / t) * 1.0e-6
@printf("a x b'\t%dx%dx%d\t%f s\t%f MFLOPS\n", m, n, k, duration, mflops)

tstart = time()
@time atxbt(a', b', t)
tend = time()
duration = tend - tstart
mflops = (2 * m * n * k) / (duration / t) * 1.0e-6
@printf("a' x b'\t%dx%dx%d\t%f s\t%f MFLOPS\n", m, n, k, duration, mflops)

# old version of a' x b'
start = time()
for i = 1:t
    c = a' * b'
end
tend = time()
duration = (tend - tstart) / t
mflops = (2 * m * n * k) / duration * 1.0e-6
@printf("old a' x b'\t%dx%dx%d\t%f s\t%f MFLOPS\n", m, n, k, duration, 
mflops)



Reply via email to