Hello,
I have built a benchmark for my custom dot kernel, pasted below. It only
outperforms the CPU dot for big sizes, which is what I expected. When
ITERS goes up to 10000, there is a drastic overhead. Is this function-call
overhead? Moreover, for big sizes some of the outputs don't match.
Thanks in advance for some insights,
RL
#### the benchmark ####
import random
from time import clock, time

import numpy
import numpy as np

import pycuda.autoinit
import pycuda.driver as drv
from pycuda import gpuarray
from pycuda.compiler import SourceModule
# Problem dimensions: a is a length-SIZE float32 vector, b is a
# SIZE x (SIZE+1) float32 matrix in column-major (Fortran) order, so
# each matrix column is contiguous in memory.
SIZE = 512
ITERS = 1000

# a = [0, 1, ..., SIZE-1]
a = numpy.arange(SIZE, dtype=numpy.float32)
# Every row of b is [0, 1, ..., SIZE]; stored column-major.
b = numpy.asfortranarray(
    numpy.tile(numpy.arange(SIZE + 1), (SIZE, 1)).astype(numpy.float32))

# One thread block of SIZE threads per output element, one block per
# column of b.
BLOCK_SIZE = (SIZE, 1, 1)
GRID_SIZE = (SIZE + 1, 1)
# Compile the CUDA kernel.  One thread block computes one element of the
# output: dest[bx] = dot(a, column bx of b).  Each of the BLOCK_SIZE
# threads writes one product a[tid] * b[bx*blockDim.x + tid] into shared
# memory (the linear index walks down a column because b is column-major),
# then thread 0 alone serially sums the partial products.
# NOTE(review): the serial reduction by thread 0 leaves BLOCK_SIZE-1
# threads idle; a tree reduction would be the usual fix and is likely a
# large part of the kernel's cost.
# NOTE(review): the sum is accumulated in float32 in a different order
# than numpy's dot, so small mismatches at large sizes are expected from
# rounding alone — presumably the reported mismatches; verify by
# comparing both against a float64 reference.
mod = SourceModule("""
#define BLOCK_SIZE %i
__global__ void vecmatdot(float *dest, float *a, float *b)
{
__shared__ float partials[BLOCK_SIZE];
int bx = blockIdx.x;
int linear_thr_idx = bx * blockDim.x + threadIdx.x;
partials[threadIdx.x] = a[threadIdx.x] * b[linear_thr_idx];
__syncthreads();
if( threadIdx.x == 0){
float sum = 0.0f;
for(int i = 0 ; i < BLOCK_SIZE; i++)
sum += partials[i];
dest[bx] = sum;
}
}
"""%(SIZE,))
# Python-callable handle to the compiled kernel.
vecmatdot = mod.get_function("vecmatdot")
# --- CPU reference benchmark: average wall-clock time of numpy's dot ---
# Use time() (wall clock) instead of clock(): clock() measures CPU time
# on Unix (and was removed in Python 3.8), so it is non-portable and
# unsuitable for timing work that may wait outside the CPU.  The last
# computed result is kept in `bl` for the later GPU comparison.
sum_ = 0.0
for i in range(ITERS):
    s = time()
    bl = np.dot(a, b)
    sum_ += time() - s
# Parenthesized print: a single-argument print works identically under
# both Python 2 and Python 3.
print("%f seconds" % (sum_ / ITERS,))
# --- GPU benchmark ---
gpu_a = gpuarray.to_gpu(a)
gpu_b = gpuarray.to_gpu(b)
# Output buffer: one float32 per column of b (same shape/dtype as a row).
gpudest = gpuarray.to_gpu(numpy.zeros_like(b[0]))

# Kernel launches are asynchronous: the Python call returns as soon as
# the launch is queued, so timing the call alone measures only the
# launch/queueing overhead (which is what dominates when ITERS is
# large).  Synchronize inside the timed region so each measurement
# covers the kernel's actual execution.
sum_ = 0.0
for i in range(ITERS):
    s = time()
    vecmatdot(
        gpudest, gpu_a, gpu_b,
        block=BLOCK_SIZE, grid=GRID_SIZE)
    drv.Context.synchronize()
    sum_ += time() - s
print("%f seconds" % (sum_ / ITERS,))

# Spot-check against the CPU result.  Exact equality is not expected:
# both sides accumulate in float32 but in different orders, so large
# sizes can show small rounding differences.
print((gpudest.get() - bl)[::11])
#### the end ####
_______________________________________________
PyCUDA mailing list
[email protected]
http://lists.tiker.net/listinfo/pycuda