Hello,
I'm trying to use PyCUDA for numerical calculations and I have some
problems with resourses. I want to integrate some function using Sympson
method(scalar multiplication of vector of Sympson coefficients and
vector of function).
When I use simple function everything seems to be ok, but for some
reason more copmlex function rises an error:
"pycuda._driver.LaunchError: cuLaunchGrid failed: launch out of resources"
Traceback (most recent call last):
File "scalar_prod_complex_1_test.py", line 198, in <module>
func(dest,a_gpu,b_gpu,f_gpu,W,S_h_quant, block =
(thread_strides, 1, 1), grid = (block_size,1))
File
"/usr/local/lib/python2.6/dist-packages/pycuda-2011.1-py2.6-linux-x86_64.egg/pycuda/driver.py",
line 187, in function_call
func.launch_grid(*grid)
pycuda._driver.LaunchError: cuLaunchGrid failed: launch out of resources
My program is attached to the mail.
P.S. I'm new in CUDA so it's possible that I've just made some stupid
mistake=)
Best regards,
Mikhail
#Import pycuda
from pycuda.compiler import SourceModule
import pycuda.driver as cuda
import pycuda.autoinit
import pycuda.tools as tools
import pycuda.gpuarray as gpuarray
#Import math
import matplotlib
matplotlib.use("Agg")
import numpy
from pylab import *
from time import clock
cr1 = clock()
#define number of treads and blocks
thread_strides = 512
block_size = 1*32
total_size = thread_strides*block_size
dtype = numpy.float32
##########################
### Create arrays ###
##########################
N = total_size
f = logspace(log(10.0),log(1570.0), N, base = e).astype(numpy.float32)
result = zeros(N,dtype = numpy.float32)
koeff = zeros(N,dtype = numpy.float32)
koeff[0] = 1
koeff[1] = 4
fi = 2
while fi < N-2:
koeff[fi] = 2.0
fi+=1
koeff[fi] = 4.0
fi+=1
#print fi
#print f[fi]
koeff[fi] = 1
koeff *= f**(-4.0/3.0)
koeff *= log(f[fi]/f[0])/(3.0*fi)
koeff = numpy.array(koeff).astype(numpy.float32)
cr = clock()
##########################
### Arrays to GPU ####
##########################
a_gpu = gpuarray.to_gpu(koeff)
b_gpu = gpuarray.to_gpu(result)
f_gpu = gpuarray.to_gpu(f)
dest = cuda.mem_alloc(32*total_size)
W = cuda.mem_alloc(32*total_size)
S_h_quant = cuda.mem_alloc(32*total_size)
##########################
### C++ code ###
##########################
src = SourceModule("""
#include <pycuda-complex.hpp>
#define BLOCK_SIZE 512
#define _USE_MATH_DEFINES
///////////////////////
// Define constants ///
///////////////////////
#define b (1.0)
#define phi (1.0)
#define G (2700.0)
#define nu (0.9)
#define M (1.0)
#define H_BAR (1.0)
#define L (4000.0)
#define J (pow(2.0 * M_PI * 100.0,3))
///////////////////////
// Define operations //
///////////////////////
#define delta (G * sin(b))
#define g (G * cos(b))
#define W(i) (2.0*M_PI*f[i])
#define REDUCE(b, c) (b + c)
///////////////////////////
// Calculation functions //
///////////////////////////
__device__ float S_h_quant(int i,float *f)
/*************************
* Complex function *
*************************/
{float R_xx = -1/(M*W(i)*W(i));
float R_FF = M*J*delta*(G*G-W(i)*W(i))/((G*G - W(i)*W(i))*(G*G - W(i)*W(i))+4*g*g*W(i)*W(i));
float S_x_SN=(H_BAR / (2.0*M*J*g*nu)) * (W(i)*W(i)*W(i)*W(i) + 2.0*G*W(i)*G*W(i)*cos(2.0 * b) + G*G*G*G) / (G*G * cos(b + phi)*cos(b + phi) + W(i)*W(i)*cos(phi)*cos(phi));
float S_xF=H_BAR*(G*G*sin(b+phi)*cos(b+phi)+W(i)*W(i)*sin(phi)*cos(phi))/(G*G*cos(b + phi)*cos(b+phi)+W(i)*W(i)*cos(phi)*cos(phi));
float S_F_RPN = 2.0*H_BAR*M*J*g*(G*G+W(i)*W(i))/(W(i)*W(i)*W(i)*W(i)+2.0*G*G*W(i)*W(i)*cos(2.0*b)+ G*G*G*G);
float Conjugate= R_xx*R_FF+1;
float Real_part= R_xx*Conjugate*S_xF;
return (4.0/(L*L))*((R_xx*R_FF+1)*(R_xx*R_FF+1)*S_x_SN+2.0*Real_part+R_xx*R_xx*S_F_RPN);}
/*************************
* Simple function *
*************************/
//return ((4*H_BAR/(M*L*L*W(i)*W(i)))*((2.0*pow(G,4))/(W(i)*W(i)*(G*G + W(i)*W(i))) + (W(i)*W(i)*(G*G + W(i)*W(i)))/(2.0*pow(G,4))));}
//////////////////////////
// Additional functions //
//////////////////////////
__device__ float read_and_map(int i, float *a, float *be, float *f)
{float ram = a[i]/(be[i] + S_h_quant(i, f));
return ram;}
///////////////////
// Main function //
///////////////////
__global__ void adding(float *dest, float *a, float *be, float *f,float *W,float *S_h_quant )
{ __shared__ float redResult[BLOCK_SIZE];
unsigned int n = 1*32*512;
unsigned int tid = threadIdx.x;
unsigned int seq_count = n*BLOCK_SIZE*tid + n - 1;
unsigned int i = blockIdx.x*BLOCK_SIZE*seq_count + tid;
float r_a_m = 0;
float acc = 0;
for (unsigned s = 0; s < seq_count; ++s)
{
if (i >= n)
break;
r_a_m = read_and_map(i,a,be,f);
acc = REDUCE(acc, r_a_m);
i += BLOCK_SIZE;
}
redResult[tid] = acc;
__syncthreads();
#if (BLOCK_SIZE >= 512)
if (tid < 256) { redResult[tid] = REDUCE(redResult[tid], redResult[tid + 256]); }
__syncthreads();
#endif
#if (BLOCK_SIZE >= 256)
if (tid < 128) { redResult[tid] = REDUCE(redResult[tid], redResult[tid + 128]); }
__syncthreads();
#endif
#if (BLOCK_SIZE >= 128)
if (tid < 64) { redResult[tid] = REDUCE(redResult[tid], redResult[tid + 64]); }
__syncthreads();
#endif
if (tid < 32)
{
if (BLOCK_SIZE >= 64) redResult[tid] = REDUCE(redResult[tid], redResult[tid + 32]);
if (BLOCK_SIZE >= 32) redResult[tid] = REDUCE(redResult[tid], redResult[tid + 16]);
if (BLOCK_SIZE >= 16) redResult[tid] = REDUCE(redResult[tid], redResult[tid + 8]);
if (BLOCK_SIZE >= 8) redResult[tid] = REDUCE(redResult[tid], redResult[tid + 4]);
if (BLOCK_SIZE >= 4) redResult[tid] = REDUCE(redResult[tid], redResult[tid + 2]);
if (BLOCK_SIZE >= 2) redResult[tid] = REDUCE(redResult[tid], redResult[tid + 1]);
}
if (tid == 0) dest[blockIdx.x] = redResult[0];
}
""")
############################
### Function call ###
############################
func = src.get_function("adding")
func(dest,a_gpu,b_gpu,f_gpu,W,S_h_quant, block = (thread_strides, 1, 1), grid = (block_size,1))
c = cuda.from_device(dest, 1, numpy.float32)
print c
print clock()-cr
print clock()-cr1
_______________________________________________
PyCUDA mailing list
PyCUDA@tiker.net
http://lists.tiker.net/listinfo/pycuda