Hello,

I'm trying to use PyCUDA for numerical calculations and I have some problems with resources. I want to integrate a function using Simpson's method (a scalar product of the vector of Simpson coefficients with the vector of function values). When I use a simple function everything seems to be OK, but for some reason a more complex function raises an error: "pycuda._driver.LaunchError: cuLaunchGrid failed: launch out of resources"

   Traceback (most recent call last):
     File "scalar_prod_complex_1_test.py", line 198, in <module>
       func(dest,a_gpu,b_gpu,f_gpu,W,S_h_quant, block = (thread_strides, 1, 1), grid = (block_size,1))
     File "/usr/local/lib/python2.6/dist-packages/pycuda-2011.1-py2.6-linux-x86_64.egg/pycuda/driver.py", line 187, in function_call
       func.launch_grid(*grid)
   pycuda._driver.LaunchError: cuLaunchGrid failed: launch out of resources
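
For reference, the idea is just a dot product of the Simpson weights with the function samples. Here is a minimal CPU sketch of the same thing in plain numpy (a toy example made up for this mail, integrating sin(x) over [0, pi], where the exact value is 2):

import numpy

# Simpson weights 1, 4, 2, 4, ..., 2, 4, 1 scaled by h/3; Simpson's rule
# needs an odd number of sample points (an even number of intervals)
N = 1025
x = numpy.linspace(0.0, numpy.pi, N)
w = numpy.ones(N)
w[1:-1:2] = 4.0
w[2:-1:2] = 2.0
w *= (x[1] - x[0])/3.0
print numpy.dot(w, numpy.sin(x))   # prints ~2.0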

My program is attached below.

P.S. I'm new to CUDA, so it's possible that I've just made some stupid mistake=)

Best regards,
Mikhail
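
# --- attached: scalar_prod_complex_1_test.py ---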
#Import pycuda
from pycuda.compiler import SourceModule
import pycuda.driver as cuda
import pycuda.autoinit
import pycuda.tools as tools
import pycuda.gpuarray as gpuarray
#Import math and plotting
import matplotlib
matplotlib.use("Agg")
import numpy
from pylab import *
from time import clock

cr1 = clock()

#define number of threads and blocks
thread_strides = 512
block_size = 1*32
total_size = thread_strides*block_size
dtype = numpy.float32
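# (Hedged sanity check, not part of the original attachment: "launch out of
# resources" with 512 threads per block usually means registers per thread
# times threads per block exceeds the device's register file, so it is worth
# printing the device limits up front.)
print "max threads/block: ", pycuda.autoinit.device.get_attribute(cuda.device_attribute.MAX_THREADS_PER_BLOCK)
print "max registers/block:", pycuda.autoinit.device.get_attribute(cuda.device_attribute.MAX_REGISTERS_PER_BLOCK)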

##########################
###   Create arrays    ###
##########################
N = total_size
f = logspace(log(10.0),log(1570.0), N, base = e).astype(numpy.float32)
result = zeros(N,dtype = numpy.float32)
koeff = zeros(N,dtype = numpy.float32)
# Simpson weights 1, 4, 2, 4, ..., 2, 4, 1 over the first N-1 points
# (that is N-2 intervals, an even number as Simpson's rule requires);
# the last point keeps weight 0
koeff[0] = 1
koeff[1] = 4
fi = 2

while fi < N-2:
    koeff[fi] = 2.0
    fi += 1
    koeff[fi] = 4.0
    fi += 1
koeff[fi] = 1
# f**(-4/3) weighting of the integrand, then the h/3 factor for the
# uniform step h in log-frequency (fi == N-2 here)
koeff *= f**(-4.0/3.0)
koeff *= log(f[fi]/f[0])/(3.0*fi)
koeff = koeff.astype(numpy.float32)

cr = clock()
##########################
###   Arrays to GPU   ####
##########################

a_gpu = gpuarray.to_gpu(koeff)
b_gpu = gpuarray.to_gpu(result)
f_gpu = gpuarray.to_gpu(f)
nbytes = numpy.dtype(dtype).itemsize * total_size  # float32 is 4 bytes per element, not 32
dest = cuda.mem_alloc(nbytes)
W = cuda.mem_alloc(nbytes)
S_h_quant = cuda.mem_alloc(nbytes)
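# (Hedged aside, not part of the original attachment: gpuarray can allocate
# these buffers without manual byte arithmetic; dest_alt is a hypothetical
# per-block partial-sum buffer equivalent to dest.)
dest_alt = gpuarray.zeros(block_size, dtype)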

##########################
###   C++ code         ###
##########################
src = SourceModule("""
#include <pycuda-complex.hpp>

#define BLOCK_SIZE 512
#define _USE_MATH_DEFINES


///////////////////////
// Define constants ///
///////////////////////

#define b (1.0)
#define phi (1.0)
#define G (2700.0)
#define nu (0.9)
#define M (1.0)
#define H_BAR (1.0)
#define L (4000.0)
#define J (pow(2.0 * M_PI * 100.0,3))

///////////////////////
// Define operations //
///////////////////////
#define delta (G * sin(b))
#define g (G * cos(b))
#define W(i) (2.0*M_PI*f[i]) 

#define REDUCE(b, c) (b + c)


///////////////////////////
// Calculation functions //
///////////////////////////

__device__ float S_h_quant(int i, float *f)
/*************************
*    Complex function    *
*************************/
{
	float R_xx = -1/(M*W(i)*W(i));
	float R_FF = M*J*delta*(G*G - W(i)*W(i))/((G*G - W(i)*W(i))*(G*G - W(i)*W(i)) + 4*g*g*W(i)*W(i));
	float S_x_SN = (H_BAR/(2.0*M*J*g*nu))*(W(i)*W(i)*W(i)*W(i) + 2.0*G*W(i)*G*W(i)*cos(2.0*b) + G*G*G*G)/(G*G*cos(b + phi)*cos(b + phi) + W(i)*W(i)*cos(phi)*cos(phi));
	float S_xF = H_BAR*(G*G*sin(b + phi)*cos(b + phi) + W(i)*W(i)*sin(phi)*cos(phi))/(G*G*cos(b + phi)*cos(b + phi) + W(i)*W(i)*cos(phi)*cos(phi));
	float S_F_RPN = 2.0*H_BAR*M*J*g*(G*G + W(i)*W(i))/(W(i)*W(i)*W(i)*W(i) + 2.0*G*G*W(i)*W(i)*cos(2.0*b) + G*G*G*G);
	float Conjugate = R_xx*R_FF + 1;
	float Real_part = R_xx*Conjugate*S_xF;
	return (4.0/(L*L))*((R_xx*R_FF + 1)*(R_xx*R_FF + 1)*S_x_SN + 2.0*Real_part + R_xx*R_xx*S_F_RPN);
}

/*************************
*    Simple function    *
*************************/

	//return ((4*H_BAR/(M*L*L*W(i)*W(i)))*((2.0*pow(G,4))/(W(i)*W(i)*(G*G + W(i)*W(i)))  + (W(i)*W(i)*(G*G + W(i)*W(i)))/(2.0*pow(G,4))));}

//////////////////////////
// Additional functions //
//////////////////////////

__device__ float read_and_map(int i, float *a, float *be, float *f)
{
	// the quantity being summed: a[i]/(be[i] + S_h_quant(f_i))
	return a[i]/(be[i] + S_h_quant(i, f));
}

///////////////////
// Main function //
///////////////////

__global__ void adding(float *dest, float *a, float *be, float *f,float *W,float *S_h_quant )
    {   __shared__ float redResult[BLOCK_SIZE];
        
        // n must match total_size on the host (grid of 32 blocks x BLOCK_SIZE threads)
        unsigned int n = 32*512;
        unsigned int tid = threadIdx.x;
        // each thread accumulates seq_count samples, strided by BLOCK_SIZE
        unsigned int seq_count = n/(BLOCK_SIZE*gridDim.x);
        unsigned int i = blockIdx.x*BLOCK_SIZE*seq_count + tid;
        float r_a_m = 0;
        float acc = 0;

        for (unsigned s = 0; s < seq_count; ++s)
        {
            if (i >= n)
                break;
            r_a_m = read_and_map(i, a, be, f);
            acc = REDUCE(acc, r_a_m);
            i += BLOCK_SIZE;
        }
  
          redResult[tid] = acc;
  
          __syncthreads();
  
          #if (BLOCK_SIZE >= 512) 
            if (tid < 256) { redResult[tid] = REDUCE(redResult[tid], redResult[tid + 256]); }
            __syncthreads();
          #endif
  
          #if (BLOCK_SIZE >= 256) 
            if (tid < 128) { redResult[tid] = REDUCE(redResult[tid], redResult[tid + 128]); } 
            __syncthreads(); 
          #endif
  
          #if (BLOCK_SIZE >= 128) 
            if (tid < 64) { redResult[tid] = REDUCE(redResult[tid], redResult[tid + 64]); } 
            __syncthreads(); 
          #endif
  
          if (tid < 32)
          {
            // one warp executes in lockstep, so no __syncthreads() is needed
            // here, but the shared-memory pointer must be volatile so the
            // partial results are re-read from shared memory at each step
            volatile float *v = redResult;
            if (BLOCK_SIZE >= 64) v[tid] = REDUCE(v[tid], v[tid + 32]);
            if (BLOCK_SIZE >= 32) v[tid] = REDUCE(v[tid], v[tid + 16]);
            if (BLOCK_SIZE >= 16) v[tid] = REDUCE(v[tid], v[tid + 8]);
            if (BLOCK_SIZE >= 8)  v[tid] = REDUCE(v[tid], v[tid + 4]);
            if (BLOCK_SIZE >= 4)  v[tid] = REDUCE(v[tid], v[tid + 2]);
            if (BLOCK_SIZE >= 2)  v[tid] = REDUCE(v[tid], v[tid + 1]);
          }
  
          if (tid == 0) dest[blockIdx.x] = redResult[0];
    }

""")
############################
###   Function call      ###
############################

func = src.get_function("adding")
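# (Hedged diagnostic, not part of the original attachment: the compiled
# kernel's resource usage is exposed on the Function object; if num_regs
# times 512 threads exceeds the device's registers per block, the launch
# fails exactly like this. Shrinking the block, or capping registers via
# SourceModule(..., options=["-maxrregcount=N"]), are the usual fixes.)
print "registers/thread:", func.num_regs
print "shared bytes:    ", func.shared_size_bytes
print "local bytes:     ", func.local_size_bytes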
func(dest, a_gpu, b_gpu, f_gpu, W, S_h_quant, block=(thread_strides, 1, 1), grid=(block_size, 1))
# dest holds one partial sum per block; finish the reduction on the host
partial = cuda.from_device(dest, block_size, numpy.float32)
c = partial.sum()


print c
print clock()-cr
print clock()-cr1

