Good day,
I'm new to both CUDA and PyCUDA. I'm trying to write a binary
erosion/dilation accelerator module for my project, but my GPU kernels are slower
than scipy.ndimage's functions.
I don't know if I'm doing something wrong (as I said, I'm new), or if the NVIDIA
NVS 140M in my notebook is just not fast enough.
It would be great if someone with a more powerful card could try it, or
maybe some guru :) could have a look at my sources?
Source is attached.
If I get it to work, I'll share it with everyone :)
Anyway, CUDA and PyCUDA are great work!
Thanks
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
import pycuda.driver as drv
import numpy, scipy.ndimage
import time
# Benchmark: 3x3 binary dilation on the GPU (PyCUDA) versus
# scipy.ndimage.binary_dilation on the CPU, followed by a consistency check.

# Image dimensions.  The same values are substituted into the kernel source
# below, so the device-side array extents cannot drift from the host array.
WIDTH = 2000
HEIGHT = 2400

# Thread-block shape used for the launch.  It is 16 threads wide so that each
# row of a block covers 16 consecutive uint32 elements.
# NOTE(review): on compute-capability 1.x devices this is expected to allow
# coalesced global-memory access, which the previous 20-wide block did not;
# confirm by timing on the target card.
BLOCK = (16, 16, 1)

# One thread per output pixel.  V(dx,dy) reads the neighbour at offset
# (dx,dy) and yields 0 for neighbours outside the image, so the border is
# treated as background.  A pixel is set when any of its 3x3 neighbours is set.
kernel_source = """
#define W %(width)d
#define H %(height)d
#define V(dx,dy) ( (x+dx < 0 || x+dx >= W || y+dy < 0 || y+dy >= H) ? 0 : a[y+dy][x+dx] )
__global__ void dilation3x3(unsigned int dest[H][W], unsigned int a[H][W])
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if(x < W && y < H)
dest[y][x] = (V(-1,-1)+V(0,-1)+V(1,-1)+V(-1,0)+V(0,0)+V(1,0)+V(-1,1)+V(0,1)+V(1,1))>0;
}
""" % {"width": WIDTH, "height": HEIGHT}
mod = drv.SourceModule(kernel_source)
print("Compiled")
dilation3x3 = mod.get_function("dilation3x3")

# Random binary test image: roughly 10% of the pixels are set.
img = numpy.random.rand(HEIGHT, WIDTH)
img = numpy.where(img > 0.9, 1, 0).astype(numpy.uint32)
g_array = gpuarray.to_gpu(img)

# Ceiling division so the grid covers the whole image even when the image
# size is not a multiple of the block size; the kernel's own bounds check
# discards the surplus threads.
grid = ((WIDTH + BLOCK[0] - 1) // BLOCK[0],
        (HEIGHT + BLOCK[1] - 1) // BLOCK[1])

# The timed region covers output allocation, the kernel launch and the
# device-to-host copy; the upload of img happened before the timer started.
ts = time.time()
dest_gpu = gpuarray.empty_like(g_array)
dilation3x3(dest_gpu, g_array, block=BLOCK, grid=grid)
dest_host = dest_gpu.get()
print("Tgpu= %s" % (time.time() - ts))

# CPU reference using the same full 3x3 structuring element.
elem = numpy.ones((3, 3))
ts = time.time()
dest_cpu = scipy.ndimage.binary_dilation(img, iterations=1, structure=elem)
print("Tcpu= %s" % (time.time() - ts))

print(numpy.average(dest_host))
print(numpy.average(dest_cpu))
# Compare in a signed type: dest_host is uint32 and dest_cpu is bool, so a
# plain subtraction would wrap 0 - 1 around to 4294967295 instead of
# reporting a difference of 1.  Prints 0 when both results agree.
print(numpy.abs(dest_host.astype(numpy.int64) - dest_cpu.astype(numpy.int64)).max())
_______________________________________________
PyCUDA mailing list
[email protected]
http://tiker.net/mailman/listinfo/pycuda_tiker.net