Hello

I've come across a baffling error. I have a custom CUDA kernel to calculate 
squared row norms of a matrix. It works fine on the host computer:

julia> d_M = residual_shared(Y_init,A_init,S_init,k,sig)
CUDArt.CudaArray{Float32,2}(CUDArt.CudaPtr{Float32}(Ptr{Float32} @
0x0000000b037a0000),(4000,2500),0)

julia> sum(cudakernels.sqrownorms(d_M))
5.149127f6

However, when I try to run the same code on a remote machine, the variable 
`d_M' gets calculated properly but the kernel call crashes. The custom kernel 
launch code looks like:

"""
    sqrownorms(d_M)

Launch the "sqrownorms" kernel on the device matrix `d_M` and return the
result as a host array with one entry per row of `d_M`.

The kernel is looked up in `ptxdict` under the key
`(device, "sqrownorms", eltype(d_M))`. The temporary device buffer is
released before returning, including when the launch or the copy back to
the host throws.
"""
function sqrownorms{T}(d_M::CUDArt.CudaArray{T,2})
    elty = eltype(d_M)
    n1, n2 = size(d_M)
    # Device-side output buffer: one entry per row, initialised to ones.
    d_dots = CudaArray(ones(elty, n1))
    try
        dev = device(d_dots)
        dotf = ptxdict[(dev, "sqrownorms", elty)]
        # Enough blocks of `maxBlock` threads to cover all n1 rows.
        numblox = Int(ceil(n1/maxBlock))
        CUDArt.launch(dotf, numblox, maxBlock, (d_M, n1, n2, d_dots))
        # Errors raised by the kernel may only surface here, at the copy back.
        return to_host(d_dots)
    finally
        # Always release the device buffer so a failed launch/copy does not
        # leak device memory.
        free(d_dots)
    end
end

Running the body of this function on a remote worker causes the following 
crash message. (Running the function itself produces only an unhelpful 
process exited arrgh! error).

julia> sow(reps[5], :d_M, :(residual_shared(Y,A_init,S_init,1,sig)))

julia> p2 = quote 
           elty = eltype(d_M)
           n1, n2 = size(d_M)
           d_dots = CudaArray(map(elty, ones(n1)))
           dev = device(d_dots)
           dotf = cudakernels.ptxdict[(dev, "sqrownorms", elty)]
           numblox = Int(ceil(n1/cudakernels.maxBlock))
           CUDArt.launch(dotf, numblox, cudakernels.maxBlock, (d_M, n1, n2, 
d_dots))
           dots = to_host(d_dots)
           free(d_dots)
           dots
       end;

julia> reap(reps[5], p2)  #this is a remote call fetch of the eval of the 
`p2' block in global scope
ERROR: On worker 38:
"an illegal memory access was encountered"
 [inlined code] from essentials.jl:111
 in checkerror at /home/mcp50/.julia/v0.5/CUDArt/src/libcudart-6.5.jl:16
 [inlined code] from /home/mcp50/.julia/v0.5/CUDArt/src/stream.jl:11
 in cudaMemcpyAsync at /home/mcp50/.julia/v0.5/CUDArt/src/../gen-6.5/
gen_libcudart.jl:396
 in copy! at /home/mcp50/.julia/v0.5/CUDArt/src/arrays.jl:152
 in to_host at /home/mcp50/.julia/v0.5/CUDArt/src/arrays.jl:148
 in anonymous at multi.jl:892
 in run_work_thunk at multi.jl:645
 [inlined code] from multi.jl:892
 in anonymous at task.jl:59
 in remotecall_fetch at multi.jl:731
 [inlined code] from multi.jl:368
 in remotecall_fetch at multi.jl:734
 in anonymous at task.jl:443
 in sync_end at ./task.jl:409
 [inlined code] from task.jl:418
 in reap at /home/mcp50/.julia/v0.5/ClusterUtils/src/ClusterUtils.jl:203

Any thoughts much appreciated - I'm not sure where to go with this now.

Matthew


Reply via email to