Hello,
 
When you use the erf function with PyCUDA, execution takes almost 2x longer
than with the same kernel compiled directly by nvcc; the exp function takes about 30% longer.
My program uses these functions a lot, and in that case the whole program runs
about 3x slower with PyCUDA than with nvcc.
 
I attached two small example programs that can be run straight away: add_loop.cu for plain nvcc, and the PyCUDA script inlined below.
 
Regards,
Michiel.
 

Attachment: add_loop.cu (binary data)
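The add_loop.cu attachment is archived only as binary data, so here is a minimal sketch of what a standalone nvcc version of the same benchmark could look like. This is an assumption that it mirrors the PyCUDA kernel below (same grid-stride exp kernel, 64 blocks x 64 threads, CUDA event timing), not the file that was actually attached:

/* Hypothetical sketch of add_loop.cu -- not the original attachment.
   Mirrors the PyCUDA script below: grid-stride exp kernel, 64x64 launch,
   timing with CUDA events. */
#include <cstdio>
#include <cstdlib>

#define N             10000000
#define NR_OF_BLOCKS  64
#define NR_OF_THREADS 64

__global__ void add(float* a, float* b, float* c)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    while (tid < N)
    {
//        c[tid] = a[tid] + b[tid];
//        c[tid] = erf(a[tid]) + erf(b[tid]);
        c[tid] = exp(a[tid]) + exp(b[tid]);

        tid += gridDim.x * blockDim.x;
    }
}

int main(void)
{
    size_t bytes = N * sizeof(float);

    /* host data: a = [0, 1, ..., N-1], b = 2*a, c receives the result */
    float *a = (float*)malloc(bytes);
    float *b = (float*)malloc(bytes);
    float *c = (float*)malloc(bytes);
    for (int i = 0; i < N; ++i) { a[i] = (float)i; b[i] = 2.0f * i; }

    /* device buffers and input copies */
    float *a_gpu, *b_gpu, *c_gpu;
    cudaMalloc((void**)&a_gpu, bytes);
    cudaMalloc((void**)&b_gpu, bytes);
    cudaMalloc((void**)&c_gpu, bytes);
    cudaMemcpy(a_gpu, a, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(b_gpu, b, bytes, cudaMemcpyHostToDevice);

    /* time the kernel with CUDA events */
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    add<<<NR_OF_BLOCKS, NR_OF_THREADS>>>(a_gpu, b_gpu, c_gpu);

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);

    /* copy the result back after the timed region */
    cudaMemcpy(c, c_gpu, bytes, cudaMemcpyDeviceToHost);

    float msecs = 0.0f;
    cudaEventElapsedTime(&msecs, start, stop);
    printf("N: %d\n", N);
    printf("Time (ms): %f\n", msecs);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(a_gpu); cudaFree(b_gpu); cudaFree(c_gpu);
    free(a); free(b); free(c);
    return 0;
}

Something like nvcc -O3 add_loop.cu -o add_loop would build it; the flags actually used for the original comparison are not stated, and they matter for this kind of timing. The PyCUDA version follows: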

import numpy

import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule

N = 10000000
NrOfBlocks = 64
NrOfThreads = 64


def getSource():
    # Build the CUDA kernel: a grid-stride loop over N elements.  Switch
    # between the plain add, erf and exp variants by (un)commenting the
    # corresponding lines inside the loop.
    source = SourceModule("""
        #define N           10000000
        
        __global__ void add( float* a, float* b, float* c ) 
        {
            int tid = blockIdx.x * blockDim.x + threadIdx.x;
            while (tid < N) 
            {
        //        c[tid] = a[tid] + b[tid];
        //        c[tid] = erf(a[tid]) + erf(b[tid]);
                c[tid] = exp(a[tid]) + exp(b[tid]);
                
                tid += gridDim.x * blockDim.x;
            }
        }
    """)
        
    return source

def main():

    print "N:", N

    # host data: a = [0, 1, ..., N-1], b = 2*a, c receives the result
    a = numpy.arange(N).astype(numpy.float32)
    b = 2 * a
    c = numpy.zeros(N).astype(numpy.float32)
    
    mod = getSource()

    # allocate device buffers and copy the inputs to the GPU
    a_gpu = cuda.mem_alloc(a.size * a.dtype.itemsize)
    b_gpu = cuda.mem_alloc(b.size * b.dtype.itemsize)
    c_gpu = cuda.mem_alloc(c.size * c.dtype.itemsize)

    cuda.memcpy_htod(a_gpu, a)
    cuda.memcpy_htod(b_gpu, b)

    # time the kernel with CUDA events
    cudaStart = cuda.Event()
    cudaStop = cuda.Event()
    cudaStart.record()

    func = mod.get_function("add")
    func( a_gpu, b_gpu, c_gpu, block=(NrOfThreads,1,1), grid=(NrOfBlocks,1) )

    cudaStop.record()
    cudaStop.synchronize()

    cuda.memcpy_dtoh(c, c_gpu)

    cudaMSecs = cudaStart.time_till(cudaStop)
    print "Time (ms):", cudaMSecs



if __name__ == "__main__":
    main()