I have not done the profiling using OpenCL tools, only the built-in Python 
timer.  I will give this a try shortly and report back.

Someone else asked for a code sample, the main loop is presented below (If it 
needs some sort of formatting tags I don't know how to use them, I apologize in 
advance. :-)  Be warned: I am a physicist, and I program like one.  The code is 
supposed to move an electron randomly through a material grid and calculate the 
energy deposited in the grid. Thanks in advance. 

Joe

/*************************/
/*
 * GAtomicAdd: atomically add `operand` to the float at `source` in
 * __global memory.
 *
 * OpenCL 1.x has no native floating-point atomic add, so this emulates one
 * with a compare-and-swap loop on the 32-bit integer image of the float
 * (atomic_cmpxchg operates only on integer types; the union reinterprets
 * the bits without conversion).
 *
 * Improvement over the previous version: atomic_cmpxchg returns the word
 * that was actually in memory, so on a failed swap we reuse that returned
 * value as the next expected value instead of re-reading *source -- one
 * non-atomic global load in total instead of one per retry, and the retry
 * always works from the authoritative value.
 */
inline void GAtomicAdd(volatile __global float *source, const float operand) {
    union { unsigned int u; float f; } expected, desired;
    unsigned int seen;

    expected.f = *source;   /* initial guess; may already be stale */
    for (;;) {
        desired.f = expected.f + operand;
        seen = atomic_cmpxchg((volatile __global unsigned int *)source,
                              expected.u, desired.u);
        if (seen == expected.u)
            break;          /* swap succeeded: our add is published */
        expected.u = seen;  /* lost the race: retry from the value we saw */
    }
}

/***************************************************************************
 * cleMC -- main Monte Carlo calculation engine.
 *
 * Each work-item transports one electron (or photon) history through a 3D
 * material grid and atomically deposits the energy it loses into the dose
 * arrays.
 *
 * Parameters (as used by this code):
 *   x, y, z        grid coordinate arrays; spacing taken from elements 0/1
 *   dose           collisional-dose accumulator, [indz+indx*NZ+indy*NZ*NCOL]
 *   dens           material density grid, same indexing as dose
 *   pdose          radiative ("photon") dose accumulator, same indexing
 *   nelec          requested electron count.  NOTE(review): unused in this
 *                  version -- the per-work-item electron loop referred to
 *                  by the comments below is not present here.
 *   A              scattering coefficient: theta = sqrt(fac*dl)*A*PI/180
 *   alp, En0       spread and mean of the Gaussian initial-energy draw
 *   bnc            step length in units of the smallest grid spacing
 *   nvert, vertx, verty   aperture polygon tested by pnpoly()
 *   NROW,NCOL,NZ   grid dimensions (y, x, z counts per the d_wher calls)
 *   sstp, strt     beam field extent; entry points span +-(sstp-strt)/2
 *   xmin..zmax     grid bounds; a history ends when it leaves them
 *   phper          fraction of histories treated as photons
 *   ranluxcltab    per-work-item ranluxcl RNG states (downloaded at entry,
 *                  uploaded at exit so the sequence continues next launch)
 *
 * Helpers pnpoly, ScsdaSe, d_wher, scalSeSc/Sr/Sy and SeSc/SeSr/SeSy are
 * defined in the header incs2f.h per the original author's comments.
 ***************************************************************************/
__kernel void cleMC( __global const float *x, __global const float *y, __global 
const float *z, __global float * dose, \
   __global const float * dens,__global float * pdose, int nelec ,    \
   const float A, \
   const float alp, const float En0, const float bnc,\
   const int nvert,__global const float *vertx, __global const float *verty, 
const int NROW,const int NCOL,const int NZ, \
   const float sstp,const float strt,const float xmin, const float xmax, const 
float ymin, \
   const float ymax, const float zmin, const float zmax,const float 
phper,__global ranluxcl_state_t *ranluxcltab){
  /* NOTE(review): rnd and i below are declared but never used. */
  float minSe=0.01f,e0,delE,rnd;
  float4 rlx;
  /* vssd=90: presumably a source distance along y (used to aim the step
     vector below) -- confirm units with the Python caller. */
  float ex, ey, ez, edx, edy, edz,vssd=90.0f;
  float theta,the1,the2,the3,urn,dlo,dl,rn,r1,fac,afac=1.2e-3f,cfac,sfc,sfr,sfy;
  float dx,dy,dz;
  float vec[3],tdl;
  //curandState localstate;
  int indx,indy,indz,notin;
  int ph=0,lid,i,init;  /* ph==1 -> treat this history as a photon */
  /* ranluxcl RNG: ranluxclstate holds this work-item's generator state;
     download it from the global table. */
  ranluxcl_state_t ranluxclstate;
  ranluxcl_download_seed(&ranluxclstate, ranluxcltab);  
  /* Smallest grid spacing.  Per the original note (10-10-13) all three
     spacings are equal when called from Python, so this is belt-and-braces. */
  dx=fabs(x[1]-x[0]);dy=fabs(y[1]-y[0]);dz=fabs(z[1]-z[0]);
  if(dx<=dy && dx<=dz){
    dlo=dx;
  }else if(dy<=dx && dy<=dz){
    dlo=dy;
  }else if(dz<=dx && dz<=dy){
    dlo=dz;
  }
  /* Minimum transportable energy; refined below from the CSDA range table
     (ScsdaSe, incs2f.h). */
  minSe=0.01f;
  /* Unique global thread id -- OpenCL equivalent of the CUDA line kept for
     reference.  NOTE(review): init and lid are computed but never used. */
  //i=threadIdx.x+blockIdx.x*blockDim.x;
  init=get_local_id(0)+get_group_id(0)*get_local_size(0);
  lid=get_local_id(0);  
  /* --- one electron history.  (The loop generating `nelec` electrons that
     the original comment mentions is not present in this excerpt.) --- */
  /* Initial energy: normal distribution with mean En0 and spread alp. */
    rlx=ranluxcl32norm(&ranluxclstate);
     e0=rlx.x*alp+En0;
    /* notin: nonzero if the electron is in the grid and not blocked by the
       electron cutout. */
    notin=1;
    /* Random start position in the beam's-eye view: uniform over
       +-(sstp-strt)/2, plus +-0.005 jitter. */
    rlx=ranluxcl32(&ranluxclstate);
    ex=rlx.x*(sstp-strt)-(sstp-strt)/2.0f;
    ex+=rlx.y*0.01f-0.005f;
    ez=rlx.z*(sstp-strt)-(sstp-strt)/2.0f;
    ez+=rlx.w*0.01f-0.005f;
    /* Point-in-polygon test (pnpoly, incs2f.h): is the electron inside the
       aperture of the block?  If not, kill it by zeroing its energy. */
    notin=pnpoly(nvert,vertx,verty,ex,ez);
    if (notin==0) e0=0.0f;
    /* Start at ymin (original note: testing having it at iso).
       dl: step length = bnc grid cells. */
    ey=ymin;rn=bnc;dl=rn*dlo;
    /* Minimum energy whose CSDA range still crosses one step, clamped to
       the table minimum of 0.01. */
    minSe=ScsdaSe(dl);
    if(minSe<0.01f)minSe=0.01f;
    urn=sqrt(ex*ex+ez*ez+vssd*vssd);
    /* Step (pseudo-velocity) vector pointing from the source through the
       entry point; its magnitude is dl. */
    edy=dl*vssd/urn;edx=dl*ex/urn;edz=dl*ez/urn;
    /* phper (from Python) is the photon percentage needed to match the
       machine measurements: with probability phper this history becomes a
       photon instead of an electron. */
   rlx=ranluxcl32(&ranluxclstate);
    if(rlx.x<phper)ph=1;
      /* Transport loop: step until the particle drops below minSe or
         leaves the grid. */
      while(e0>minSe){
      /* Exit if the particle has wandered off the grid. */
      if(ez>zmax || ez<zmin||ex>xmax||ex<xmin||ey>ymax||ey<ymin)break;
      /* Locate the current voxel (d_wher: index lookup, incs2f.h). */
      indx=d_wher(x,ex,NCOL);indy=d_wher(y,ey,NROW);
      indz=d_wher(z,ez,NZ);
      /* Density of the material in this voxel. */
      fac=dens[indz+indx*NZ+indy*NZ*NCOL];
      /* Scale the water stopping powers etc. to the current material. */
      sfc=scalSeSc(e0,fac);
      sfr=scalSeSr(e0,fac);
      sfy=scalSeSy(e0,fac);
      /* ---- scattering: pick a new direction for the next step ---- */
      tdl=0.0f;
      vec[0]=edx;vec[1]=edy;vec[2]=edz;
      /* Energy-deposition density: photons use air density (afac),
         electrons use the local material density. */
      if(ph==1){cfac=afac;}else{cfac=fac;}
      /* Multiple-scattering angle, growing with sqrt(density*step). */
      theta=sqrt(fac*dl)*(A*PI/180.0f);
      /* In air density the scattering angle is 3.75x larger --
         experimentally determined. */
      if(fac==afac){theta*=3.75f;}
      rlx=ranluxcl32(&ranluxclstate);
      urn=rlx.x*2.0f-1.0f;
      /* Rotate about the direction with the largest component, randomly
         choosing one of the two perpendicular planes.
         NOTE(review): the comparisons use the signed components, not
         fabs(); for negative components the "largest" test can pick the
         wrong axis -- confirm whether fabs() was intended.
         NOTE(review): the two angles per branch (e.g. the2/the3) are both
         theta*urn with the same urn, so they are always identical. */
      if(edx >= edy && edx >= edz){
         the2=(theta*urn);
         the3=(theta*urn);
         urn=rlx.y;
         if(urn<=0.5f){ 
             edx=(vec[0]*native_cos(the2)+vec[2]*native_sin(the2));
             edz=(vec[2]*native_cos(the2)-vec[0]*native_sin(the2));
         }else{
             edx=(vec[0]*native_cos(the3)-vec[1]*native_sin(the3));
             edy=(vec[1]*native_cos(the3)+vec[0]*native_sin(the3));
         }
 }else if(edy >= edx && edy>=edz){
         the1=theta*urn;
         the3=theta*urn;
         urn=rlx.y;
         if(urn<=0.5f){
             edy=(vec[1]*native_cos(the1)-vec[2]*native_sin(the1));
             edz=(vec[2]*native_cos(the1)+vec[1]*native_sin(the1));
         }else{
             edx=(vec[0]*native_cos(the3)-vec[1]*native_sin(the3));
             edy=(vec[1]*native_cos(the3)+vec[0]*native_sin(the3));
         }
 }else if(edz>=edx && edz>=edy){
         the2=theta*urn;
         the1=theta*urn;
         urn=rlx.y;
         if(urn<=0.5f){
             edy=(vec[1]*native_cos(the1)-vec[2]*native_sin(the1));
             edz=(vec[2]*native_cos(the1)+vec[1]*native_sin(the1));
         }else{
             edx=(vec[0]*native_cos(the2)+vec[2]*native_sin(the2));
             edz=(vec[2]*native_cos(the2)-vec[0]*native_sin(the2));
         }
 }
      /* Move the electron to the next position. */
      ex+=edx;ey+=edy;ez+=edz;
      /* Path length actually travelled this step. */
      tdl=sqrt(edx*edx+edy*edy+edz*edz);
      /* Radiative vs collisional interaction: radiative with probability
         sfy*SeSy(e0). */
      r1=(sfy*SeSy(e0));
      rlx=ranluxcl32(&ranluxclstate);
      if(rlx.x<r1){
        /* Radiative loss: SeSr is the radiative stopping power, scaled to
           the material (sfr) and the deposition density (cfac); clamp so
           the electron cannot lose more energy than it has. */
        delE=cfac*sfr*SeSr(e0)*tdl;
        if(delE>e0)delE=e0;
        e0-=delE;
        /* Deposit the lost energy (divided by cfac) into the photon dose
           array; GAtomicAdd because voxels are shared between threads. */
        urn=delE/cfac;
        //urn*=10000.0f;
        GAtomicAdd(&pdose[indz+indx*NZ+indy*NZ*NCOL],urn);
        //use fixed point to see if it helps speed, it doesn't so removed
        //atomic_add(&pdose[indz+indx*NZ+indy*NZ*NCOL],(int)urn);
        if(e0<minSe || delE<0.0f)break;
      } else {
        /* Collisional loss: SeSc is the collisional stopping power, scaled
           the same way; same clamp as above. */
        delE=sfc*SeSc(e0)*tdl*cfac;
        if(delE>e0) delE=e0;
        e0-=delE;
        /* Deposit the lost energy (divided by cfac) into the dose array
           atomically. */
        urn=delE/cfac;
        //urn*=10000.0f;
        GAtomicAdd(&dose[indz+indx*NZ+indy*NZ*NCOL],urn);
        //atomic_add(&dose[indz+indx*NZ+indy*NZ*NCOL],(int)urn);
        if(e0<minSe || delE<0.0f)break;
      }
    }
  /* Save this work-item's RNG state for the next kernel invocation. */
  ranluxcl_synchronize(&ranluxclstate);
  ranluxcl_upload_seed(&ranluxclstate, ranluxcltab);
}
/*************************/


________________________________________
From: Andreas Kloeckner [[email protected]]
Sent: Tuesday, December 17, 2013 10:56 PM
To: Joe Haywood
Cc: [email protected]
Subject: RE: openCL your thoughts

Dear Joe,

please keep requests such as this to the mailing list. Thanks. I've cc'd
them on my reply.

Joe Haywood <[email protected]> writes:
> I was hoping to pick your brain a little more.  After rewriting my
> original Python/Cuda/C++ program to Python/PyOpenCL I have done some
> speed comparisons.  I cannot get the pyopencl version to run as fast
> as the original.  I have an NVidia GT 430 for testing.  Running the
> original code, the program takes ~10 seconds to complete.  Running the
> opencl version takes ~24 seconds to complete (not including build
> time).  Both programs produce the same results, within the uncertainty
> I expect from a Monte Carlo code.
>
> The differences between the two are, the CUDA code uses the CURAND
> libray for random numbers, whereas the OPENCL code uses ranlux from
> pyopencl-ranlux.cl.  The CUDA code is compiled as a callable library
> using nvcc with optimizations like -O3 -fast-math -mtune=native etc
> and called in Python using the weave library.  The Opencl kernel is
> compiled using the -cl-mad-enable -cl-unsafe-math etc. compile
> options.  In the CUDA code I have rewritten some of the functions to
> use the faster math like
> "theta=__fmul_rn(__fsqrt_rn(__fmul_rn(fac,dl)),__fdividef(__fmul_rn(A,PI),180.0f));"

Ranluxcl supports a 'luxury' setting that influences the speed of the
generator. This knob trades off speed against quality of random numbers.

> I have tried moving enqueue_copy commands around, not reinitializing
> the ranlux generator, etc but I cannot speed up the opencl version
> anymore. Is there something I am missing in Pyopencl that would help
> with this?

Have you tried measuring (using OpenCL event-based profiling) what is
actually taking time?

Andreas
_______________________________________________
PyOpenCL mailing list
[email protected]
http://lists.tiker.net/listinfo/pyopencl

Reply via email to