Good day,

I'm new to both CUDA and PyCUDA. I'm trying to write a binary
erosion/dilation accelerator module for my project, but it is slower
than scipy.ndimage's functions.

I don't know if I'm doing something wrong (as I said, I'm new), or if the
NVIDIA NVS 140M in my notebook is just not fast enough.

It would be great if someone with a more powerful card could try it, or
maybe some guru :) could have a look at my source?

Source is attached.

If I get it to work, I'll share it with everyone :)

Anyway, CUDA and PyCUDA are great work!

Thanks

import pycuda.gpuarray as gpuarray
import pycuda.autoinit
import pycuda.driver as drv
import numpy, scipy.ndimage
import time

# Compile the CUDA source below into a module holding one kernel,
# `dilation3x3`, a binary dilation over a 3x3 neighbourhood.
#
# What the kernel source does:
#   * The image size is fixed at compile time by the W (2000) and H (2400)
#     defines; both arguments are declared as unsigned int [H][W] arrays.
#   * V(dx,dy) reads the neighbour at offset (dx,dy) from the thread's own
#     (x,y), yielding 0 when that neighbour lies outside the W x H image.
#   * Each thread derives one (x,y) from its block/thread indices and, if
#     that position is inside the image, writes 1 to dest[y][x] when the sum
#     of the nine neighbourhood values is greater than zero, otherwise 0.
#
# NOTE(review): newer PyCUDA releases expose SourceModule from
# pycuda.compiler rather than pycuda.driver -- confirm against the installed
# version.
mod = drv.SourceModule("""
#define W 2000
#define H 2400
#define V(dx,dy)	( (x+dx < 0 || x+dx >= W || y+dy < 0 || y+dy >= H) ? 0 : a[y+dy][x+dx] )
__global__ void dilation3x3(unsigned int dest[H][W], unsigned int a[H][W])
{
	const int x = blockIdx.x * blockDim.x + threadIdx.x;
	const int y = blockIdx.y * blockDim.y + threadIdx.y;
	if(x < W && y < H)
		dest[y][x] = (V(-1,-1)+V(0,-1)+V(1,-1)+V(-1,0)+V(0,0)+V(1,0)+V(-1,1)+V(0,1)+V(1,1))>0;
}
""")

# Progress marker: reaching this line means SourceModule returned.
print "Compiled"

# Look up the kernel by name so it can be launched from Python.
dilation3x3 = mod.get_function("dilation3x3")

# Build a random binary test image of shape (2400, 2000), i.e. H rows by
# W columns as hard-coded in the kernel source. A pixel is 1 where the random
# value exceeds 0.9 and 0 elsewhere; the uint32 cast matches the kernel's
# `unsigned int` element type.
img = numpy.random.rand(2400,2000)
img = numpy.where(img > 0.9,1,0).astype(numpy.uint32)
# Upload the input image to the GPU. This happens before any timer is
# started, so the upload is not part of the reported GPU time.
g_array = gpuarray.to_gpu(img)
					
# Time the GPU path. The measured interval covers everything between the two
# time.time() calls: allocating the output buffer, launching the kernel, and
# fetching the result with .get(). The input upload (gpuarray.to_gpu) happened
# earlier and is not included.
ts = time.time()
dest_gpu = gpuarray.empty_like(g_array)
# block=(20,24,1) with grid=(100,100) gives 20*100 = 2000 threads in x and
# 24*100 = 2400 threads in y, i.e. one thread per pixel of the W x H image.
dilation3x3(	dest_gpu, g_array,
				block=(20,24,1), grid=(100,100))
# Copy the result back into a host array.
# NOTE(review): presumably this call also waits for the kernel to finish,
# which is what makes the wall-clock measurement meaningful -- confirm.
dest_host = dest_gpu.get()
print "Tgpu=",time.time()-ts

# 3x3 all-ones structuring element, matching the nine neighbours that the
# GPU kernel sums.
elem = numpy.ones((3,3))

# Time the CPU reference: a single iteration of scipy's binary dilation on
# the same host image.
ts = time.time()
dest_cpu = scipy.ndimage.binary_dilation(img, iterations=1, structure=elem)

print "Tcpu=",time.time()-ts

# Sanity check: print the mean of each result, then the largest element of
# the element-wise difference (0 when the two results agree everywhere).
print numpy.average(dest_host)
print numpy.average(dest_cpu)
# NOTE(review): dest_host is uint32; if the subtraction is carried out in an
# unsigned type, a pixel set only in dest_cpu would wrap around to a huge
# value instead of -1. Any non-zero output still signals a mismatch -- confirm
# the dtype of dest_cpu before relying on the magnitude.
print (dest_host-dest_cpu).max()
_______________________________________________
PyCUDA mailing list
[email protected]
http://tiker.net/mailman/listinfo/pycuda_tiker.net

Reply via email to