The following code is my current version of train_one():
# Run one stochastic-gradient step of the linear classifier `c` on the sample (x, y).
#
# Arguments:
#   c              - classifier; `predict!` refreshes `c.outputs`, and `c.weights` is
#                    updated in place.
#   x              - input values; the first `c.n` entries are read by linear index.
#   y              - index into `c.outputs` whose entry is decremented by 1
#                    (presumably the target class of the sample -- confirm with callers).
#   input_gradient - accumulator: α * c.weights * c.outputs is ADDED to its first
#                    `c.n` entries, using the weights from before this step's update.
#   α              - step size (default 0.025).
#
# Returns `nothing`. Throws `DimensionMismatch` when an array is too small for
# `c.n` inputs / `c.k` outputs; nothing is updated in that case.
function train_one(c :: LinearClassifier, x :: Array{Float64}, y :: Int64,
                   input_gradient :: Array{Float64}, α :: Float64 = 0.025)
    predict!(c, x)
    # Deliberately left bounds-checked: `y` is supplied by the caller.
    c.outputs[y] -= 1

    # Fetch the fields once, and only after predict!, which may rebind c.outputs.
    n = c.n
    k = c.k
    weights = c.weights
    outputs = c.outputs

    # Validate every extent up front so the loops below can safely skip
    # per-element bounds checks.
    if size(weights, 1) < n || size(weights, 2) < k || length(outputs) < k
        throw(DimensionMismatch(
            "classifier arrays are smaller than n = $n inputs by k = $k outputs"))
    end
    if length(x) < n || length(input_gradient) < n
        throw(DimensionMismatch(
            "x and input_gradient must each hold at least n = $n elements"))
    end

    # input_gradient += α * weights * outputs
    # (same as BLAS.gemv!('N', α, c.weights, c.outputs, 1.0, input_gradient)).
    # This must run before the weight update so it sees the pre-update weights.
    @inbounds for i in 1:k
        scale = α * outputs[i]
        @simd for j in 1:n
            input_gradient[j] += scale * weights[j, i]
        end
    end

    # weights -= α * x * outputs'
    # (same as BLAS.ger!(-α, vec(x), c.outputs, c.weights)).
    @inbounds for i in 1:k
        scale = α * outputs[i]
        @simd for j in 1:n
            weights[j, i] -= scale * x[j]
        end
    end

    return nothing
end
在 2015年2月19日星期四 UTC+8下午10:51:20,Zhixuan Yang写道:
>
> Hello everyone,
>
> Recently I've been working on my first Julia project, a word embedding training
> program similar to Google's word2vec <https://code.google.com/p/word2vec/>
> (the code
> of word2vec is indeed very high-quality, but I want to add more features,
> so I decided to write a new one). Thanks to Julia's expressiveness, it took
> me less than 2 days to write the entire program. But it runs really slowly,
> about 100x slower than the C code of word2vec (the algorithm is the same).
> I've been trying to optimize my code for several days (adding type
> annotations, using BLAS to do computation, eliminating memory allocations
> ...), but it is still 30x slower than the C code.
>
> The critical part of my program is the following function (it also
> consumes most of the time according to the profiling result):
>
> function train_one(c :: LinearClassifier, x :: Array{Float64}, y :: Int64;
> α :: Float64 = 0.025, input_gradient :: Union(Nothing, Array{Float64}) =
> nothing)
> predict!(c, x)
> c.outputs[y] -= 1
>
> if input_gradient != nothing
> # input_gradient = ( c.weights * outputs' )'
> BLAS.gemv!('N', α, c.weights, c.outputs, 1.0, input_gradient)
> end
>
> # c.weights -= α * x' * outputs;
> BLAS.ger!(-α, vec(x), c.outputs, c.weights)
> end
>
> function predict!(c :: LinearClassifier, x :: Array{Float64})
> c.outputs = vec(softmax(x * c.weights))
> end
>
> type LinearClassifier
> k :: Int64 # number of outputs
> n :: Int64 # number of inputs
> weights :: Array{Float64, 2} # n * k weight matrix
>
> outputs :: Vector{Float64}
> end
>
> The entire program can be found here
> <https://github.com/yangzhixuan/embed>. Could you please check my code
> and tell me what I can do to get performance comparable to C?
>
> Regards.
> Yang Zhixuan
>