Hello,
Yet another stupid question. Most probably I have missed something
obvious, but anyway - can someone explain why I get NaNs in the
output of the program listed below? Surprisingly, the bug disappears if
I pass '1' instead of '-1' as the third parameter to the function (or remove
the 'int' parameters completely and leave only the two pointers). The same
kernel in pure CUDA works fine. It looks like memory corruption, but I can't
figure out where it happens...
Output:
$ python test.py
6.65991e-37
nan
Code:
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import pycuda.gpuarray as gpuarray
import numpy
# CUDA source of a minimal reproducer kernel. Each thread copies a local
# array of 16 zeros into shared memory, reads it back, and writes it to
# `out`. The `in`, `dir` and `S` parameters are never read by the kernel
# body; they exist only to reproduce the reported argument-dependent result.
# NOTE: comments inside the string must use C syntax (`//`); a leading `#`
# would be handed to the C preprocessor as a directive.
src = """
__global__ void test(float *in, float *out, int dir, int S)
{
__shared__ float sMem[2048];
size_t lMemLoad;
float a[16] = {0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0};
int lId = threadIdx.x;
// (0..15) * 128 + (0..7) + (0..15)*8 < 2048 always,
// so I think I do not breach shared memory borders...
lMemLoad = (lId % 16) * 128 + (lId / 16);
for(int k = 0; k < 16; k++)
sMem[lMemLoad + k * 8] = a[k];
__syncthreads();
for(int k = 0; k < 16; k++)
a[k] = sMem[lMemLoad + k * 8];
__syncthreads();
for(int k = 0; k < 16; k++)
out[lMemLoad + k * 8] = a[k];
}
"""
mod = SourceModule(src)
func = mod.get_function('test')
# Host buffer of 2048 float32 ones. It is uploaded as the kernel's `in`
# argument and supplies the shape/dtype of the output buffer, but its
# contents are never read by the kernel.
data = numpy.ones(2048).astype(numpy.float32)
a_gpu = gpuarray.to_gpu(data)
# Output buffer with the same shape and dtype as `data`.
# NOTE(review): presumably allocated without initialisation - confirm
# against the PyCUDA GPUArray documentation.
b_gpu = gpuarray.GPUArray(data.shape, dtype=data.dtype)
# Argument format "PPii" mirrors the kernel signature
# (float *, float *, int, int); the launch uses one block of 128 threads,
# which together with 16 stores per thread covers all 2048 output elements.
func.prepare("PPii", block=(128,1,1))
# expected results (all zeros)
func.prepared_call((1,1), a_gpu.gpudata, b_gpu.gpudata, 1, 1)
res = b_gpu.get()
print(numpy.sum(res))
# unexpected result (NaN)
func.prepared_call((1,1), a_gpu.gpudata, b_gpu.gpudata, -1, 1)
res = b_gpu.get()
print(numpy.sum(res))
_______________________________________________
PyCUDA mailing list
[email protected]
http://host304.hostmonster.com/mailman/listinfo/pycuda_tiker.net