I have not done the profiling using OpenCL tools, only the built-in Python
timer. I will give this a try shortly and report back.
Someone else asked for a code sample; the main loop is presented below. (If it
needs some sort of formatting tags, I don't know how to use them — I apologize
in advance :-) Warning: I am a physicist, and I program like one. The code is
supposed to move an electron randomly through a material grid and calculate the
energy deposited in the grid. Thanks in advance.
Joe
/*************************/
/* Lock-free floating-point accumulation into global memory.
 *
 * OpenCL 1.x has no native atomic add for float, so we spin on the 32-bit
 * integer compare-and-swap: reinterpret the float's bit pattern as an
 * unsigned int, attempt the swap, and retry whenever another work-item
 * updated *source between our read and our CAS.
 *
 * source  — global float to accumulate into (read-modify-write target)
 * operand — value added to *source
 */
inline void GAtomicAdd(volatile __global float *source, const float operand) {
    union bits32 {
        unsigned int u;   /* integer view, for atomic_cmpxchg   */
        float        f;   /* float view, for the actual add     */
    };
    union bits32 expected;
    union bits32 desired;
    for (;;) {
        expected.f = *source;               /* snapshot current value       */
        desired.f  = expected.f + operand;  /* value we want to store       */
        /* CAS succeeds only if nobody changed *source since the snapshot. */
        if (atomic_cmpxchg((volatile __global unsigned int *)source,
                           expected.u, desired.u) == expected.u)
            break;
    }
}
/* cleMC — main Monte Carlo calculation engine.
 *
 * One work-item transports one primary particle (an electron, or — with
 * probability `phper` — a photon) through a 3-D density grid, depositing
 * energy into `dose` (collisional losses) and `pdose` (radiative losses)
 * via the float CAS atomic GAtomicAdd. Transport continues until the
 * particle's energy drops below a range-table cutoff or it leaves the grid.
 *
 * Relies on helpers defined elsewhere (incs2f.h per the original comments):
 * pnpoly, ScsdaSe, d_wher, scalSeSc/Sr/Sy, SeSc/SeSr/SeSy, and the ranluxcl
 * random-number generator whose per-work-item state lives in `ranluxcltab`.
 *
 * NOTE(review): `nelec` (requested electron count) is not used in the
 * visible body — presumably the host enqueues one work-item per electron;
 * confirm against the Python caller.
 */
__kernel void cleMC( __global const float *x, __global const float *y, __global
const float *z, __global float * dose, \
__global const float * dens,__global float * pdose, int nelec , \
const float A, \
const float alp, const float En0, const float bnc,\
const int nvert,__global const float *vertx, __global const float *verty,
const int NROW,const int NCOL,const int NZ, \
const float sstp,const float strt,const float xmin, const float xmax, const
float ymin, \
const float ymax, const float zmin, const float zmax,const float
phper,__global ranluxcl_state_t *ranluxcltab){
/* NOTE(review): `rnd` is declared but never used below. */
float minSe=0.01f,e0,delE,rnd;
float4 rlx;
/* vssd: virtual source-to-surface distance (90 cm) — TODO confirm units. */
float ex, ey, ez, edx, edy, edz,vssd=90.0f;
float theta,the1,the2,the3,urn,dlo,dl,rn,r1,fac,afac=1.2e-3f,cfac,sfc,sfr,sfy;
float dx,dy,dz;
float vec[3],tdl;
//curandState localstate;  (leftover from the CUDA version — unused here)
int indx,indy,indz,notin;
/* NOTE(review): `lid`, `i`, and `init` are set/declared but never read
   afterwards — likely leftovers from the CUDA port. */
int ph=0,lid,i,init;
//random numbers stuff
//ranluxclstate stores the state of the generator.
ranluxcl_state_t ranluxclstate;
//Download this work-item's RNG state into the local struct.
ranluxcl_download_seed(&ranluxclstate, ranluxcltab);
/*based on the initial conditions will choose which direction is the
smallest; not strictly necessary as of 10-10-13 since all grid spacings are
equal as supplied from Python*/
dx=fabs(x[1]-x[0]);dy=fabs(y[1]-y[0]);dz=fabs(z[1]-z[0]);
if(dx<=dy && dx<=dz){
dlo=dx;
}else if(dy<=dx && dy<=dz){
dlo=dy;
}else if(dz<=dx && dz<=dy){
dlo=dz;
}
/*minimum energy an electron can have, from the range tables, that will just
cross the volume; ScsdaSe is defined in the header file incs2f.h, minSe is
the minimum energy from the tables*/
minSe=0.01f;
/*unique thread number (OpenCL equivalent of the CUDA expression below)*/
//i=threadIdx.x+blockIdx.x*blockDim.x;
init=get_local_id(0)+get_group_id(0)*get_local_size(0);
lid=get_local_id(0);
/*begin electron setup — each work-item simulates one primary so that,
collectively, the requested number from Python is satisfied*/
/*choose an energy from a normal distribution of spread alp and mean En0*/
rlx=ranluxcl32norm(&ranluxclstate);
e0=rlx.x*alp+En0;
/*notin flags whether the electron is inside the grid aperture rather than
being blocked by the electron cutout*/
notin=1;
/*choose random positions on the grid in the beam's-eye view; the second
draw adds ±0.005 jitter within a cell*/
rlx=ranluxcl32(&ranluxclstate);
ex=rlx.x*(sstp-strt)-(sstp-strt)/2.0f;
ex+=rlx.y*0.01f-0.005f;
ez=rlx.z*(sstp-strt)-(sstp-strt)/2.0f;
ez+=rlx.w*0.01f-0.005f;
/*see incs2f.h for pnpoly: is the electron in the grid and inside the
aperture of the block?*/
notin=pnpoly(nvert,vertx,verty,ex,ez);
/*blocked by the cutout — zero the energy so the transport loop never runs*/
if (notin==0) e0=0.0f;
//ey is ymin, but testing to have it at iso
ey=ymin;rn=bnc;dl=rn*dlo;
/*cutoff: energy whose CSDA range equals one step length, floored at 10 keV*/
minSe=ScsdaSe(dl);
if(minSe<0.01f)minSe=0.01f;
urn=sqrt(ex*ex+ez*ez+vssd*vssd);
/*step-size pseudo-velocity vector, used to march the electron through the
grid along the line from the virtual source through (ex, ez)*/
edy=dl*vssd/urn;edx=dl*ex/urn;edz=dl*ez/urn;
//is this a photon?
/*phper (from Python) is the photon percentage needed to match the machine
measurements; if the random number is below that percentage, treat this
history as a photon instead of an electron*/
rlx=ranluxcl32(&ranluxclstate);
if(rlx.x<phper)ph=1;
/*---- transport loop: step until energy falls below cutoff ----*/
while(e0>minSe){
/*if the particle has wandered off the grid, exit the loop*/
if(ez>zmax || ez<zmin||ex>xmax||ex<xmin||ey>ymax||ey<ymin)break;
/*find which array indices the particle is in*/
indx=d_wher(x,ex,NCOL);indy=d_wher(y,ey,NROW);
indz=d_wher(z,ez,NZ);
//look up the density of the voxel we are currently in
fac=dens[indz+indx*NZ+indy*NZ*NCOL];
/*scale the water stopping powers etc. to the current material*/
sfc=scalSeSc(e0,fac);
sfr=scalSeSr(e0,fac);
sfy=scalSeSy(e0,fac);
//scattering first
/*scatter the particle for the next iteration*/
tdl=0.0f;
vec[0]=edx;vec[1]=edy;vec[2]=edz;
/*photons use the air-density factor afac; electrons use the local density*/
if(ph==1){cfac=afac;}else{cfac=fac;}
/*scattering angle ~ sqrt(density * step); A converts degrees to the tuned
scale (original CUDA: theta=sqrt(fac*dl)*(A*PI/180))*/
theta=sqrt(fac*dl)*(A*PI/180.0f);
/*for photon or air density the scattering angle is larger by 3.75 times,
experimentally determined.
NOTE(review): float equality test against afac — only matches voxels whose
density is exactly 1.2e-3f; confirm that is intended.*/
if(fac==afac){theta*=3.75f;}
rlx=ranluxcl32(&ranluxclstate);
urn=rlx.x*2.0f-1.0f;
/*find the direction with the largest magnitude and rotate around it,
randomly choosing one of the other two axes.
NOTE(review): in each branch both candidate angles are theta*urn with the
SAME urn, so they are identical/correlated — if independent angles were
intended, a second random draw is needed. Also note edx/edy/edz can be
negative; comparing them without fabs() may not pick the largest
magnitude — verify.*/
if(edx >= edy && edx >= edz){
the2=(theta*urn);
the3=(theta*urn);
urn=rlx.y;
if(urn<=0.5f){
edx=(vec[0]*native_cos(the2)+vec[2]*native_sin(the2));
edz=(vec[2]*native_cos(the2)-vec[0]*native_sin(the2));
}else{
edx=(vec[0]*native_cos(the3)-vec[1]*native_sin(the3));
edy=(vec[1]*native_cos(the3)+vec[0]*native_sin(the3));
}
}else if(edy >= edx && edy>=edz){
the1=theta*urn;
the3=theta*urn;
urn=rlx.y;
if(urn<=0.5f){
edy=(vec[1]*native_cos(the1)-vec[2]*native_sin(the1));
edz=(vec[2]*native_cos(the1)+vec[1]*native_sin(the1));
}else{
edx=(vec[0]*native_cos(the3)-vec[1]*native_sin(the3));
edy=(vec[1]*native_cos(the3)+vec[0]*native_sin(the3));
}
}else if(edz>=edx && edz>=edy){
the2=theta*urn;
the1=theta*urn;
urn=rlx.y;
if(urn<=0.5f){
edy=(vec[1]*native_cos(the1)-vec[2]*native_sin(the1));
edz=(vec[2]*native_cos(the1)+vec[1]*native_sin(the1));
}else{
edx=(vec[0]*native_cos(the2)+vec[2]*native_sin(the2));
edz=(vec[2]*native_cos(the2)-vec[0]*native_sin(the2));
}
}
/*move the particle to the next position*/
ex+=edx;ey+=edy;ez+=edz;
/*total distance moved this step*/
tdl=sqrt(edx*edx+edy*edy+edz*edz);
/*choose radiative interaction or collisional interaction*/
r1=(sfy*SeSy(e0));
rlx=ranluxcl32(&ranluxclstate);
if(rlx.x<r1){
//energy lost in a radiative interaction (SeSr).
//NOTE(review): the original CUDA comment says fac but the code uses cfac
//(air factor for photons) — confirm which is intended.
delE=cfac*sfr*SeSr(e0)*tdl;
if(delE>e0)delE=e0;
e0-=delE;
//add the lost energy to the photon dose array (divide out the density
//factor so the array accumulates dose rather than energy)
urn=delE/cfac;
//urn*=10000.0f;
GAtomicAdd(&pdose[indz+indx*NZ+indy*NZ*NCOL],urn);
//used fixed point to see if it helps speed; it doesn't, so removed
//atomic_add(&pdose[indz+indx*NZ+indy*NZ*NCOL],(int)urn);
if(e0<minSe || delE<0.0f)break;
} else {
//energy lost in a collisional interaction (SeSc)
//CUDA for delE=sfc*SeSc(e0)*tdl*fac;
delE=sfc*SeSc(e0)*tdl*cfac;
if(delE>e0) delE=e0;
e0-=delE;
//atomic dose calc
//add the lost energy to the dose array
urn=delE/cfac;
//urn*=10000.0f;
GAtomicAdd(&dose[indz+indx*NZ+indy*NZ*NCOL],urn);
//atomic_add(&dose[indz+indx*NZ+indy*NZ*NCOL],(int)urn);
if(e0<minSe || delE<0.0f)break;
}
}
/*save the local random-number state for the next call to the kernel*/
ranluxcl_synchronize(&ranluxclstate);
ranluxcl_upload_seed(&ranluxclstate, ranluxcltab);
}
/*************************/
________________________________________
From: Andreas Kloeckner [[email protected]]
Sent: Tuesday, December 17, 2013 10:56 PM
To: Joe Haywood
Cc: [email protected]
Subject: RE: openCL your thoughts
Dear Joe,
please keep requests such as this to the mailing list. Thanks. I've cc'd
them on my reply.
Joe Haywood <[email protected]> writes:
> I was hoping to pick your brain a little more. After rewriting my
> original Python/Cuda/C++ program to Python/PyOpenCL I have done some
> speed comparisons. I cannot get the pyopencl version to run as fast
> as the original. I have an NVidia GT 430 for testing. Running the
> original code, the program takes ~10 seconds to complete. Running the
> opencl version takes ~24 seconds to complete (not including build
> time). Both programs produce the same results, within the uncertainty
> I expect from a Monte Carlo code.
>
> The differences between the two are, the CUDA code uses the CURAND
> library for random numbers, whereas the OPENCL code uses ranlux from
> pyopencl-ranlux.cl. The CUDA code is compiled as a callable library
> using nvcc with optimizations like -O3 -fast-math -mtune=native etc
> and called in Python using the weave library. The Opencl kernel is
> compiled using the -cl-mad-enable -cl-unsafe-math etc. compile
> options. In the CUDA code I have rewritten some of the functions to
> use the faster math like
> "theta=__fmul_rn(__fsqrt_rn(__fmul_rn(fac,dl)),__fdividef(__fmul_rn(A,PI),180.0f));"
Ranluxcl supports a 'luxury' setting that influences the speed of the
generator. This knob trades off speed against quality of random numbers.
> I have tried moving enqueue_copy commands around, not reinitializing
> the ranlux generator, etc but I cannot speed up the opencl version
> anymore. Is there something I am missing in Pyopencl that would help
> with this?
Have you tried measuring (using OpenCL event-based profiling) what is
actually taking time?
Andreas
_______________________________________________
PyOpenCL mailing list
[email protected]
http://lists.tiker.net/listinfo/pyopencl