Hi,

   I have run into a problem while converting a (py)cuda program to
(py)opencl. Basically, it seems that only some local_size values work
reliably, and I fail to understand why.
   I have written a small program which tests the different platforms
available with different local_size values, up to max_work_group_size.

   The program compares the OpenCL result to the numpy calculation and
reports whether the test passed or failed. Surprisingly, the test fails
for local sizes > 1 on the CPU (using AMD's platform), with no problem
on the GPU (nVidia).

   I even added a reqd_work_group_size kernel attribute, but it does not
make any difference.

   Any idea?

     Vincent
import pyopencl as cl
from numpy import *
import time

CL_CODE="""
__kernel __attribute__((reqd_work_group_size(%(warp_size)d, 1, 1)))
void Fhkl(__global float *va, __global float *vb, __global float *vc, const long na, const long nb)
{
   #define BLOCKSIZE %(warp_size)d
   // Block index
   int bx = get_group_id(0);
   int by = get_group_id(1);

   // Thread index
   int tx = get_local_id(0);
   //int ty = get_local_id(1);
    
   const unsigned long ix=tx+(bx+by*get_num_groups(0))*BLOCKSIZE;
   __local float a[BLOCKSIZE];
   float b=vb[ix];
   float s=0;
   for (unsigned int i=0;i<nb;i+=BLOCKSIZE)
   {
      a[tx]=va[i+tx];
      barrier(CLK_LOCAL_MEM_FENCE);
      for(unsigned int i=0;i<BLOCKSIZE;i++)
      {
         s+=native_cos(a[i]*b);
      }
   }
   vc[ix]=s;
}"""

# for the sake of simplicity, use sizes that do not need zero-padding
na=2**10
nb=2**10
a=random.uniform(0,10,na).astype(float32)
b=random.uniform(0,10,nb).astype(float32)
c=b*0
gold=c*0
for i in xrange(len(gold)):
  gold[i]=cos(a*b[i]).sum()
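
# Aside: the same reference can be computed without the Python loop via
# numpy broadcasting (this assumes the temporary (nb, na) array fits in
# memory, which it easily does here); gold_bcast is just an extra check:
gold_bcast = cos(b[:,newaxis]*a[newaxis,:]).sum(axis=1)
assert abs(gold_bcast-gold).max() < .01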

devices=[]
for p in cl.get_platforms():
  devices+=p.get_devices()

for d0 in devices:
  ctx = cl.Context([d0])
  queue = cl.CommandQueue(ctx)
  mf = cl.mem_flags
  a_ = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,hostbuf=a, size=a.nbytes)
  b_ = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,hostbuf=b, size=b.nbytes)
  c_ = cl.Buffer(ctx, mf.WRITE_ONLY, size=c.nbytes)
  
  warp_size=1
  while warp_size<=d0.max_work_group_size:
    kernel_params={"warp_size":warp_size}
    options = "-cl-mad-enable -cl-fast-relaxed-math"
    prg = cl.Program(ctx, CL_CODE % kernel_params,).build(options=options)
    t0=time.time()
    prg.Fhkl(queue, (nb,1), (warp_size,1), a_, b_, c_, int64(na), int64(nb))
    cl.enqueue_copy(queue, c, c_).wait()
    dt=time.time()-t0
    s="PASS"
    if abs(c-gold).max()>.01:s="FAIL"
    print "%40s:%50s, warp_size=%4d, maxdiff=%8.4f -%5s-, dt=%7.5fs, Gflops=%8.4f"%(d0.platform.name,d0.name,warp_size,abs(c-gold).max(),s,dt,na*nb/dt/1e9)
    warp_size*=2
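
  # Aside, one more check (a sketch): the runtime may allow a smaller
  # work-group size for this particular kernel than the device-wide
  # max_work_group_size; query it here for the last program built above.
  wg_max = prg.Fhkl.get_work_group_info(
             cl.kernel_work_group_info.WORK_GROUP_SIZE, d0)
  print "  kernel work-group size limit on %s: %d"%(d0.name, wg_max)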