Hello Everyone,

I recently saw (on this list) the conversation about comparing GPU speed to
CPU speed. The following code snippet was discussed:



import pycuda.driver as drv
import pycuda.tools
import pycuda.autoinit
import numpy
import numpy.linalg as la
from pycuda.compiler import SourceModule

# Benchmark: time a CUDA kernel that repeatedly applies sin() to every element
# of a float32 array, reading and writing the array in place on each iteration.

# Launch geometry: `blocks` thread blocks of `block_size` threads each, with
# one array element per thread (nbr_values = blocks * block_size).
blocks = 64
block_size = 512
nbr_values = blocks * block_size
# Loop count passed to the kernel as its n_iter argument.
n_iter = 100000

#############

# GPU SECTION


# The CUDA source below is compiled at run time by SourceModule.  In the
# kernel, each thread computes its own index i from its block and thread ids,
# runs a[i] = sin(a[i]) n_iter times directly on the array `a`, and finally
# copies a[i] into dest[i].
mod = SourceModule("""

__global__ void addone(float *dest, float *a, int n_iter)

{

 const int i = blockDim.x*blockIdx.x + threadIdx.x;

 for(int n = 0; n < n_iter; n++) {

   a[i] = sin(a[i]);

 }

 dest[i] = a[i];

}

""")

# Python-callable handle to the compiled kernel.
addone = mod.get_function("addone")

# Input data: nbr_values float32 elements.
a = numpy.ones(nbr_values).astype(numpy.float32)
a += 1 # a is now an array of 2s


# Host-side output buffer with the same shape and dtype as `a`.
dest = numpy.zeros_like(a)

# Events bracketing the kernel call; the elapsed time between them is the
# reported "GPU time".
start = drv.Event()
end = drv.Event()
start.record()

# drv.In / drv.Out wrap the host arrays as kernel arguments (per the PyCUDA
# docs they are copied to / from the device around the call).
# NOTE(review): those transfers appear to happen inside the timed interval,
# since both events are recorded around this call -- confirm if only kernel
# time is wanted.
addone(drv.Out(dest), drv.In(a), numpy.int32(n_iter), grid=(blocks,1),
block=(block_size,1,1))

# Stop the timer and wait until the end event has actually been reached.

end.record()
end.synchronize()
# time_till() is documented by PyCUDA as returning milliseconds; scale to seconds.
secs = start.time_till(end)*1e-3
print "GPU time:", secs
print "GPU result starts with...", dest[:3]



It was then suggested that this could be made even faster with the use of
shared memory. As I am currently trying to better understand shared memory,
I tried to do just that:




import pycuda.driver as drv
import pycuda.tools
import pycuda.autoinit
import numpy
import numpy.linalg as la
from pycuda.compiler import SourceModule

blocks = 64
block_size = 512
nbr_values = blocks * block_size
n_iter = 100000

#############

# GPU SECTION


mod = SourceModule("""

__global__ void addone(float *dest, float *a, int n_iter)

{



 const int i = blockDim.x*blockIdx.x + threadIdx.x;



 __shared__ float A[512];

 A[threadIdx.x] = a[i];



 syncthreads();



 for(int n = 0; n < n_iter; n++) {



   A[threadIdx.x] = sin(A[threadIdx.x]);



 }



 syncthreads();

 dest[i] = A[threadIdx.x];

}

""")

addone = mod.get_function("addone")

a = numpy.ones(nbr_values).astype(numpy.float32)
a += 1 # a is now an array of 2s


dest = numpy.zeros_like(a)

start = drv.Event()
end = drv.Event()
start.record()

addone(drv.Out(dest), drv.In(a), numpy.int32(n_iter), grid=(blocks,1),
block=(block_size,1,1))

#stop timer

end.record()
end.synchronize()
secs = start.time_till(end)*1e-3
print "GPU time:", secs
print "GPU result starts with...", dest[:3]




It seems simple enough, but for some reason the shared memory version is
slower. Not by much, but it is definitely slower.

Am I doing something wrong, or is the time to transfer to shared memory just
greater than the time saved by using shared memory?


Thank you for your time!

    ---Chris Heuser
_______________________________________________
PyCUDA mailing list
[email protected]
http://host304.hostmonster.com/mailman/listinfo/pycuda_tiker.net

Reply via email to