I have several thousand matrices for which I need to calculate their dot
products. So it seems PyCUDA should do the trick (I hope). I am
running into an issue with block sizes.

Here is my code

#!/usr/bin/env python
import sys,time
from string import Template
import numpy as np
from pycuda import driver, compiler, gpuarray, tools
from pycuda.compiler import SourceModule
import pycuda.autoinit


def main():
    """Multiply two random size x size float32 matrices on the GPU with
    PyCUDA and compare the result and timing against numpy's CPU dot.

    Prints the GPU and CPU wall-clock times, then whether the two
    results agree within an absolute tolerance.
    """
    size = 4
    # NOTE: one thread per output element is launched below, so
    # size * size must not exceed the 1024-threads-per-block limit,
    # i.e. size <= 32 for this naive kernel.
    d = {'size': size}

    # Naive kernel: each thread computes one element of c by walking a
    # full row of `a` and a full column of `b`.  $size is substituted
    # into the CUDA source before compilation.
    src = Template("""
__global__ void MatrixMulKernel(float *a, float *b, float *c)
{
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    float Pvalue = 0;

    for (int k = 0; k < $size; ++k) {
        float Aelement = a[ty * $size + k];
        float Belement = b[k * $size + tx];
        Pvalue += Aelement * Belement;
    }
    c[ty * $size + tx] = Pvalue;
}
""")

    a_cpu = np.random.randn(size, size).astype(np.float32)
    b_cpu = np.random.randn(size, size).astype(np.float32)

    a_gpu = gpuarray.to_gpu(a_cpu)
    b_gpu = gpuarray.to_gpu(b_cpu)
    c_gpu = gpuarray.empty((size, size), np.float32)

    mod = compiler.SourceModule(src.substitute(d))
    mm = mod.get_function("MatrixMulKernel")

    # BUG FIX: the original launched a hard-coded (16, 16, 1) block while
    # the kernel indexes c[ty * size + tx]; with size=4 that is 256
    # threads writing into a 16-element array -- out-of-bounds writes.
    # Launch exactly one thread per output element instead.
    start = time.time()  # start BEFORE the launch: kernel calls are async
    mm(a_gpu, b_gpu, c_gpu,
       block=(size, size, 1),
       )
    gpu_ans = c_gpu.get()  # .get() synchronizes and copies the result back
    stop = time.time()
    print("Gpu", stop - start)

    start = time.time()
    cpu_ans = np.dot(a_cpu, b_cpu)
    stop = time.time()
    print("Cpu", stop - start)

    # float32 accumulation on the GPU rounds differently from numpy's
    # dot (different summation order / possible FMA), so compare with a
    # tolerance rather than exact equality.
    print(np.allclose(gpu_ans, cpu_ans, atol=1e-03))


A couple of issues:
When I increase the size of the matrix, the result seems to get less
accurate than the CPU dot product, so I have to use np.allclose to compare.
Also, what is the optimal block size I should be using?

_______________________________________________
PyCUDA mailing list
[email protected]
http://lists.tiker.net/listinfo/pycuda

Reply via email to