Hello, when you use the erf function with PyCUDA, execution takes almost 2x longer than with nvcc. The exp function takes about 30% longer. I use these functions heavily in my PyCUDA program, and in that case the program runs 3x slower than when built with nvcc. I have attached two small example programs that can be run straight away. Regards, Michiel.
add_loop.cu
Description: Binary data
import numpy
import pycuda.driver as cuda import pycuda.autoinit from pycuda.compiler import SourceModule N = 10000000 NrOfBlocks = 64 NrOfThreads = 64 def getSource(): source = SourceModule(""" #define N 10000000 __global__ void add( float* a, float* b, float* c ) { int tid = blockIdx.x * blockDim.x + threadIdx.x; while (tid < N) { // c[tid] = a[tid] + b[tid]; // c[tid] = erf(a[tid]) + erf(b[tid]); c[tid] = exp(a[tid]) + exp(b[tid]); tid += gridDim.x * blockDim.x; } } """) return source def main(): print "N:", N a = numpy.arange(N).astype(numpy.float32) b = 2*a.copy() c = numpy.zeros(N).astype(numpy.float32) mod = getSource() a_gpu = cuda.mem_alloc(a.size * a.dtype.itemsize) b_gpu = cuda.mem_alloc(b.size * b.dtype.itemsize) c_gpu = cuda.mem_alloc(c.size * c.dtype.itemsize) cuda.memcpy_htod(a_gpu, a) cuda.memcpy_htod(b_gpu, b) cudaStart = cuda.Event() cudaStop = cuda.Event() cudaStart.record() func = mod.get_function("add") func( a_gpu, b_gpu, c_gpu, block=(NrOfThreads,1,1), grid=(NrOfBlocks,1) ) cudaStop.record() cudaStop.synchronize() cuda.memcpy_dtoh(c, c_gpu) cudaMSecs = cudaStart.time_till(cudaStop) print "Time (ms):", cudaMSecs if __name__ == "__main__": main()
_______________________________________________ PyCUDA mailing list PyCUDA@tiker.net http://lists.tiker.net/listinfo/pycuda