Hi Folks:

I'm working on simulating a simple neural network model on the GPU.
Because the network is so small, I expect the gains to come from
running many independent simulations at once, one per thread, rather
than from parallelizing a single simulation.
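
To make the thread layout concrete, here's a toy sketch of the kind of
thing I mean (the model update and all the names here are made up for
illustration; this is not my actual network):

import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule

mod = SourceModule("""
// one independent simulation per thread
__global__ void run_many(float *init, float *out, unsigned int n_steps)
{
  unsigned int sim = blockIdx.x * blockDim.x + threadIdx.x;
  float state = init[sim];               // per-simulation initial state
  for (unsigned int t = 0; t < n_steps; t++)
    state = 0.9f * state + 0.1f;         // stand-in for the real model update
  out[sim] = state;                      // each thread owns one output slot
}
""")
run_many = mod.get_function("run_many")

n_sims = 256
init = np.random.rand(n_sims).astype(np.float32)
out = np.empty_like(init)
run_many(cuda.In(init), cuda.Out(out), np.uint32(1000),
         block=(n_sims, 1, 1))
print out[:5]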

I'd like to pass the kernel a struct whose arrays hold the network's
parameters and initialization data, plus a place to put results.  This
is purely to keep the code clean (otherwise I'd be passing handfuls of
separate parameters to the kernel).  Passing the parameters separately
works fine, but passing the struct fails with "launch failed" errors
at various stages (sometimes when allocating memory, sometimes when
trying to read results back off the device).
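
For reference, the separate-argument version that does work looks
roughly like this (reconstructed from memory for this email, so the
names are illustrative rather than my real code):

import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule

mod = SourceModule("""
__global__ void add_one(unsigned int n, float *A, float *B)
{
  for (unsigned int i = 0; i < n; i++)
    A[i] = B[i] + 1;
}
""")
add_one = mod.get_function("add_one")

n = 10
A = np.zeros(n, dtype=np.float32)
B = np.ones(n, dtype=np.float32)

# PyCUDA copies A back to the host after the launch because of cuda.InOut.
add_one(np.uint32(n), cuda.InOut(A), cuda.In(B), block=(1, 1, 1))
print A   # expect all 2.0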

I've included a simplified example below.  I realize the class that
handles talking to the C struct is a bit crazy, but if it worked I
could clean it up into a more general class.

Is there any clue as to what is wrong, or is there a better way to
accomplish what I'm trying to do?  I'm pretty new to PyCUDA and CUDA,
so I won't be offended at all if you suggest a drastically different
approach or point out a ridiculous error I'm making ;)

Thanks,
Per

PS: I'm using a git clone of PyCUDA from about a week ago and CUDA 2.1
on a GTX 285.

struct_test.py (also attached, but included inline in case attachments
aren't allowed):
------------------

import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule

import numpy as np

mod = SourceModule(
    """

struct results
{
  unsigned int n;
  float *A;
  float *B;
};

__global__ void struct_test(results *res)
{
  unsigned int i;
  for (i=0; i<res->n; i++)
  {
    res->A[i] = res->B[i] + 1;
  }
}

    """)


cu_struct = mod.get_function("struct_test")

class Results(object):
    def __init__(self, n=10):
        self._cptr = None
        self.n = np.uint32(n)
        self.A = np.zeros(self.n,dtype=np.float32)
        self.B = np.ones(self.n,dtype=np.float32)
    def send_to_gpu(self):
        if self._cptr is None:
            self._cptr = cuda.mem_alloc(self.nbytes())
        cuda.memcpy_htod(self._cptr, self.pack())
    def get_from_gpu(self):
        if self._cptr is not None:
            # read the raw bytes back and unpack them field by field
            raw = np.zeros(self.nbytes(), dtype=np.uint8)
            cuda.memcpy_dtoh(raw, self._cptr)
            ind = np.array([0, self.n.nbytes])
            self.n = np.fromstring(raw[ind[0]:ind[1]].tostring(),
                                   dtype=self.n.dtype).reshape(self.n.shape)
            ind[0] += self.n.nbytes
            ind[1] += self.A.nbytes
            self.A = np.fromstring(raw[ind[0]:ind[1]].tostring(),
                                   dtype=self.A.dtype).reshape(self.A.shape)
            ind[0] += self.A.nbytes
            ind[1] += self.B.nbytes
            self.B = np.fromstring(raw[ind[0]:ind[1]].tostring(),
                                   dtype=self.B.dtype).reshape(self.B.shape)
    def pack(self):
        # concatenate the raw bytes of n, A, and B in struct order
        return self.n.tostring() + self.A.tostring() + self.B.tostring()
    def nbytes(self):
        return self.n.nbytes + self.A.nbytes + self.B.nbytes

res = Results(10)
res.send_to_gpu()
cu_struct(res._cptr, block=(1,1,1))
res.get_from_gpu()

print res.A
print res.B
print res.n