Hello, yet another stupid question. Most probably I missed something obvious, but anyway — can someone explain why I get some NaNs in the output of the program listed below? Surprisingly, the bug disappears if I pass '1' instead of '-1' as the third parameter to the function (or remove the 'int' parameters completely and leave only the two pointers). The same kernel in pure CUDA works fine. It looks like memory corruption, but I can't figure out where it happens...
Output: $ python test.py 6.65991e-37 nan Code: import pycuda.driver as cuda import pycuda.autoinit from pycuda.compiler import SourceModule import pycuda.gpuarray as gpuarray import numpy src = """ __global__ void test(float *in, float *out, int dir, int S) { __shared__ float sMem[2048]; size_t lMemLoad; float a[16] = {0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0}; int lId = threadIdx.x; # (0..15) * 128 + (0..7) + (0..15)*8 < 2048 always, # so I think I do not breach shared memory borders... lMemLoad = (lId % 16) * 128 + (lId / 16); for(int k = 0; k < 16; k++) sMem[lMemLoad + k * 8] = a[k]; __syncthreads(); for(int k = 0; k < 16; k++) a[k] = sMem[lMemLoad + k * 8]; __syncthreads(); for(int k = 0; k < 16; k++) out[lMemLoad + k * 8] = a[k]; } """ mod = SourceModule(src) func = mod.get_function('test') data = numpy.ones(2048).astype(numpy.float32) # unused a_gpu = gpuarray.to_gpu(data) b_gpu = gpuarray.GPUArray(data.shape, dtype=data.dtype) func.prepare("PPii", block=(128,1,1)) # expected results (all zeros) func.prepared_call((1,1), a_gpu.gpudata, b_gpu.gpudata, 1, 1) res = b_gpu.get() print numpy.sum(res) # unexpected result (NaN) func.prepared_call((1,1), a_gpu.gpudata, b_gpu.gpudata, -1, 1) res = b_gpu.get() print numpy.sum(res) _______________________________________________ PyCUDA mailing list pyc...@host304.hostmonster.com http://host304.hostmonster.com/mailman/listinfo/pycuda_tiker.net