Hello.
I attach patch bringing new CURAND 4 features to PyCUDA.
Because there are more Sobol generator types, I decided
to introduce inheritance: _SobolRandomNumberGeneratorBase
and _ScrambledSobolRandomNumberGeneratorBase both
inherit from _RandomNumberGeneratorBase and are inherited
by proper Sobol{32,64}RandomNumberGenerator classes.
This leaves repetition in Sobol64 and ScrambledSobol64
concerning methods accepting 64-bit arguments, but
I do not have an idea how to solve it without complicating the code. I have not yet added curand_log_normal kernels - unlike other kernels they require additional parameters. I wanted to focus on bringing new types to PyCUDA first, and only then try to extend classes by adding new kernels. Best regards. -- Tomasz Rybak <[email protected]> GPG/PGP key ID: 2AD5 9860 Fingerprint A481 824E 7DD3 9C0E C40A 488E C654 FB33 2AD5 9860 http://member.acm.org/~tomaszrybak
diff --git a/pycuda/curandom.py b/pycuda/curandom.py
index 4a4bd7e..f236ad6 100644
--- a/pycuda/curandom.py
+++ b/pycuda/curandom.py
@@ -258,6 +258,10 @@ if get_curand_version() >= (3, 2, 0):
direction_vector_set = _curand.direction_vector_set
_get_direction_vectors = _curand._get_direction_vectors
+if get_curand_version() >= (4, 0, 0):
+ _get_scramble_constants32 = _curand._get_scramble_constants32
+ _get_scramble_constants64 = _curand._get_scramble_constants64
+
# {{{ Base class
gen_template = """
@@ -279,14 +283,14 @@ extern "C"
%(generators)s
-__global__ void skip_ahead(%(state_type)s *s, const int n, const int skip)
+__global__ void skip_ahead(%(state_type)s *s, const int n, const unsigned int skip)
{
const int idx = blockIdx.x*blockDim.x+threadIdx.x;
if (idx < n)
skipahead(skip, &s[idx]);
}
-__global__ void skip_ahead_array(%(state_type)s *s, const int n, const int *skip)
+__global__ void skip_ahead_array(%(state_type)s *s, const int n, const unsigned int *skip)
{
const int idx = blockIdx.x*blockDim.x+threadIdx.x;
if (idx < n)
@@ -529,20 +533,158 @@ if get_curand_version() >= (3, 2, 0):
# }}}
-# {{{ Sobol32 RNG
+# {{{ Sobol RNG
def generate_direction_vectors(count, direction=None):
- if direction is None:
- direction = direction_vector_set.VECTOR_32
-
- result = np.empty((count, 32), dtype=np.int32)
+ if get_curand_version() >= (4, 0, 0):
+ if direction == direction_vector_set.VECTOR_64 or \
+ direction == direction_vector_set.SCRAMBLED_VECTOR_64:
+ result = np.empty((count, 64), dtype=np.uint64)
+ else:
+ result = np.empty((count, 32), dtype=np.uint32)
+ else:
+ result = np.empty((count, 32), dtype=np.uint32)
_get_direction_vectors(direction, result, count)
return pycuda.gpuarray.to_gpu(result)
+if get_curand_version() >= (4, 0, 0):
+ def generate_scramble_constants32(count):
+ result = np.empty((count, ), dtype=np.uint32)
+ _get_scramble_constants32(result, count)
+ return pycuda.gpuarray.to_gpu(result)
+
+ def generate_scramble_constants64(count):
+ result = np.empty((count, ), dtype=np.uint64)
+ _get_scramble_constants64(result, count)
+ return pycuda.gpuarray.to_gpu(result)
+
+class _SobolRandomNumberGeneratorBase(_RandomNumberGeneratorBase):
+ """
+ Class surrounding CURAND kernels from CUDA 3.2.
+ It allows for generating quasi-random numbers with uniform
+ and normal probability function of type int, float, and double.
+ """
+
+ has_box_muller = False
+
+ def __init__(self, dir_vector, dir_vector_dtype, dir_vector_size,
+ dir_vector_set, offset, state_type, sobol_random_source):
+ super(_SobolRandomNumberGeneratorBase, self).__init__(state_type,
+ sobol_random_source)
+
+ if dir_vector is None:
+ dir_vector = generate_direction_vectors(
+ self.block_count * self.generators_per_block, dir_vector_set)
+
+ if not (isinstance(dir_vector, pycuda.gpuarray.GPUArray)
+ and dir_vector.dtype == dir_vector_dtype
+ and dir_vector.shape == (self.block_count * self.generators_per_block, dir_vector_size)):
+ raise TypeError("seed must be GPUArray of integers of right length")
+
+ p = self.module.get_function("prepare")
+ p.prepare("PiPi", block=(self.generators_per_block, 1, 1))
+
+ from pycuda.characterize import has_stack
+ has_stack = has_stack()
+
+ if has_stack:
+ prev_stack_size = drv.Context.get_limit(drv.limit.STACK_SIZE)
+
+ try:
+ if has_stack:
+ drv.Context.set_limit(drv.limit.STACK_SIZE, 1<<14) # 16k
+ try:
+
+ dev = drv.Context.get_device()
+ if dev.compute_capability() >= (2, 0):
+ p.prepared_call((self.block_count, 1), self.state,
+ self.block_count * self.generators_per_block, dir_vector.gpudata, offset)
+ else:
+ p.prepared_call((2 * self.block_count, 1), self.state,
+ self.block_count * self.generators_per_block // 2, dir_vector.gpudata, offset)
+ except drv.LaunchError:
+ raise ValueError("Initialisation failed. Decrease number of threads.")
+
+ finally:
+ if has_stack:
+ drv.Context.set_limit(drv.limit.STACK_SIZE, prev_stack_size)
+
+ def _kernels(self):
+ return (_RandomNumberGeneratorBase._kernels(self)
+ + [self.module.get_function("prepare")])
+
+class _ScrambledSobolRandomNumberGeneratorBase(_RandomNumberGeneratorBase):
+ """
+ Class surrounding CURAND kernels from CUDA 4.0.
+ It allows for generating quasi-random numbers with uniform
+ and normal probability function of type int, float, and double.
+ """
+
+ has_box_muller = False
+
+ def __init__(self, dir_vector, dir_vector_dtype, dir_vector_size,
+ dir_vector_set, scramble_vector, scramble_vector_function,
+ offset, state_type, sobol_random_source):
+ super(_ScrambledSobolRandomNumberGeneratorBase, self).__init__(state_type,
+ sobol_random_source)
+
+ if dir_vector is None:
+ dir_vector = generate_direction_vectors(
+ self.block_count * self.generators_per_block,
+ dir_vector_set)
+
+ if scramble_vector is None:
+ scramble_vector = scramble_vector_function(
+ self.block_count * self.generators_per_block)
+
+ if not (isinstance(dir_vector, pycuda.gpuarray.GPUArray)
+ and dir_vector.dtype == dir_vector_dtype
+ and dir_vector.shape == (self.block_count * self.generators_per_block, dir_vector_size)):
+ raise TypeError("seed must be GPUArray of integers of right length")
+
+ if not (isinstance(scramble_vector, pycuda.gpuarray.GPUArray)
+ and scramble_vector.dtype == dir_vector_dtype
+ and scramble_vector.shape == (self.block_count * self.generators_per_block, )):
+ raise TypeError("scramble must be GPUArray of integers of right length")
+
+ p = self.module.get_function("prepare")
+ p.prepare("PiPPi", block=(self.generators_per_block, 1, 1))
+
+ from pycuda.characterize import has_stack
+ has_stack = has_stack()
+
+ if has_stack:
+ prev_stack_size = drv.Context.get_limit(drv.limit.STACK_SIZE)
+
+ try:
+ if has_stack:
+ drv.Context.set_limit(drv.limit.STACK_SIZE, 1<<14) # 16k
+ try:
+
+ dev = drv.Context.get_device()
+ if dev.compute_capability() >= (2, 0):
+ p.prepared_call((self.block_count, 1), self.state,
+ self.block_count * self.generators_per_block,
+ dir_vector.gpudata, scramble_vector.gpudata, offset)
+ else:
+ p.prepared_call((2 * self.block_count, 1), self.state,
+ self.block_count * self.generators_per_block // 2,
+ dir_vector.gpudata, scramble_vector.gpudata, offset)
+ except drv.LaunchError:
+ raise ValueError("Initialisation failed. Decrease number of threads.")
+
+ finally:
+ if has_stack:
+ drv.Context.set_limit(drv.limit.STACK_SIZE, prev_stack_size)
+
+ def _kernels(self):
+ return (_RandomNumberGeneratorBase._kernels(self)
+ + [self.module.get_function("prepare")])
+
sobol32_random_source = """
extern "C" {
-__global__ void prepare(curandStateSobol32 *s, const int n, curandDirectionVectors32_t *v,
- const unsigned int o)
+__global__ void prepare(curandStateSobol32 *s, const int n,
+ curandDirectionVectors32_t *v, const unsigned int o)
{
const int id = blockIdx.x*blockDim.x+threadIdx.x;
if (id < n)
@@ -552,60 +694,150 @@ __global__ void prepare(curandStateSobol32 *s, const int n, curandDirectionVecto
"""
if get_curand_version() >= (3, 2, 0):
- class Sobol32RandomNumberGenerator(_RandomNumberGeneratorBase):
+ class Sobol32RandomNumberGenerator(_SobolRandomNumberGeneratorBase):
"""
Class surrounding CURAND kernels from CUDA 3.2.
It allows for generating quasi-random numbers with uniform
and normal probability function of type int, float, and double.
"""
- has_box_muller = False
-
def __init__(self, dir_vector=None, offset=0):
- super(Sobol32RandomNumberGenerator, self).__init__('curandStateSobol32',
- sobol32_random_source)
+ super(Sobol32RandomNumberGenerator, self).__init__(dir_vector,
+ np.uint32, 32, direction_vector_set.VECTOR_32, offset,
+ 'curandStateSobol32', sobol32_random_source)
- if dir_vector is None:
- dir_vector = generate_direction_vectors(
- self.block_count * self.generators_per_block)
+scrambledsobol32_random_source = """
+extern "C" {
+__global__ void prepare(curandStateScrambledSobol32 *s, const int n,
+ curandDirectionVectors32_t *v, unsigned int *scramble, const unsigned int o)
+{
+ const int id = blockIdx.x*blockDim.x+threadIdx.x;
+ if (id < n)
+ curand_init(v[id], scramble[id], o, &s[id]);
+}
+}
+"""
- if not (isinstance(dir_vector, pycuda.gpuarray.GPUArray)
- and dir_vector.dtype == np.int32
- and dir_vector.shape == (self.block_count * self.generators_per_block, 32)):
- raise TypeError("seed must be GPUArray of integers of right length")
+if get_curand_version() >= (4, 0, 0):
+ class ScrambledSobol32RandomNumberGenerator(_ScrambledSobolRandomNumberGeneratorBase):
+ """
+ Class surrounding CURAND kernels from CUDA 4.0.
+ It allows for generating quasi-random numbers with uniform
+ and normal probability function of type int, float, and double.
+ """
- p = self.module.get_function("prepare")
- p.prepare("PiPi", block=(self.generators_per_block, 1, 1))
+ def __init__(self, dir_vector=None, scramble_vector=None, offset=0):
+ super(ScrambledSobol32RandomNumberGenerator, self).__init__(dir_vector,
+ np.uint32, 32, direction_vector_set.SCRAMBLED_VECTOR_32,
+ scramble_vector, generate_scramble_constants32, offset,
+ 'curandStateScrambledSobol32', scrambledsobol32_random_source)
- from pycuda.characterize import has_stack
- has_stack = has_stack()
+sobol64_random_source = """
+extern "C" {
+__global__ void prepare(curandStateSobol64 *s, const int n, curandDirectionVectors64_t *v,
+ const unsigned int o)
+{
+ const int id = blockIdx.x*blockDim.x+threadIdx.x;
+ if (id < n)
+ curand_init(v[id], o, &s[id]);
+}
+}
+"""
- if has_stack:
- prev_stack_size = drv.Context.get_limit(drv.limit.STACK_SIZE)
+random_skip_ahead64_source = """
+extern "C" {
+__global__ void skip_ahead64(%(state_type)s *s, const int n, const unsigned long long skip)
+{
+ const int idx = blockIdx.x*blockDim.x+threadIdx.x;
+ if (idx < n)
+ skipahead(skip, &s[idx]);
+}
- try:
- if has_stack:
- drv.Context.set_limit(drv.limit.STACK_SIZE, 1<<14) # 16k
- try:
+__global__ void skip_ahead_array64(%(state_type)s *s, const int n, const unsigned long long *skip)
+{
+ const int idx = blockIdx.x*blockDim.x+threadIdx.x;
+ if (idx < n)
+ skipahead(skip[idx], &s[idx]);
+}
+}
+"""
- dev = drv.Context.get_device()
- if dev.compute_capability() >= (2, 0):
- p.prepared_call((self.block_count, 1), self.state,
- self.block_count * self.generators_per_block, dir_vector.gpudata, offset)
- else:
- p.prepared_call((2 * self.block_count, 1), self.state,
- self.block_count * self.generators_per_block // 2, dir_vector.gpudata, offset)
- except drv.LaunchError:
- raise ValueError("Initialisation failed. Decrease number of threads.")
+if get_curand_version() >= (4, 0, 0):
+ class Sobol64RandomNumberGenerator(_SobolRandomNumberGeneratorBase):
+ """
+ Class surrounding CURAND kernels from CUDA 4.0.
+ It allows for generating quasi-random numbers with uniform
+ and normal probability function of type int, float, and double.
+ """
- finally:
- if has_stack:
- drv.Context.set_limit(drv.limit.STACK_SIZE, prev_stack_size)
+ def __init__(self, dir_vector=None, offset=0):
+ super(Sobol64RandomNumberGenerator, self).__init__(dir_vector,
+ np.uint64, 64, direction_vector_set.VECTOR_64, offset,
+ 'curandStateSobol64', sobol64_random_source+random_skip_ahead64_source)
+
+ self.skip_ahead64 = self.module.get_function("skip_ahead64")
+        self.skip_ahead64.prepare("PiQ")
+ self.skip_ahead_array64 = self.module.get_function("skip_ahead_array64")
+ self.skip_ahead_array64.prepare("PiP")
def _kernels(self):
- return (_RandomNumberGeneratorBase._kernels(self)
- + [self.module.get_function("prepare")])
+ return (_SobolRandomNumberGeneratorBase._kernels(self)
+ + [self.module.get_function("skip_ahead64"), self.module.get_function("skip_ahead_array64")])
+
+ def call_skip_ahead64(self, i, stream=None):
+ self.skip_ahead64.prepared_async_call(
+ (self.block_count, 1), (self.generators_per_block, 1, 1), stream,
+ self.state, self.generators_per_block, i)
+ def call_skip_ahead_array64(self, i, stream=None):
+ self.skip_ahead_array64.prepared_async_call(
+ (self.block_count, 1), (self.generators_per_block, 1, 1), stream,
+ self.state, self.generators_per_block, i.gpudata)
+
+scrambledsobol64_random_source = """
+extern "C" {
+__global__ void prepare(curandStateScrambledSobol64 *s, const int n,
+ curandDirectionVectors64_t *v, unsigned long long *scramble, const unsigned int o)
+{
+ const int id = blockIdx.x*blockDim.x+threadIdx.x;
+ if (id < n)
+ curand_init(v[id], scramble[id], o, &s[id]);
+}
+}
+"""
+
+if get_curand_version() >= (4, 0, 0):
+ class ScrambledSobol64RandomNumberGenerator(_ScrambledSobolRandomNumberGeneratorBase):
+ """
+ Class surrounding CURAND kernels from CUDA 4.0.
+ It allows for generating quasi-random numbers with uniform
+ and normal probability function of type int, float, and double.
+ """
+
+ def __init__(self, dir_vector=None, scramble_vector=None, offset=0):
+ super(ScrambledSobol64RandomNumberGenerator, self).__init__(dir_vector,
+ np.uint64, 64, direction_vector_set.SCRAMBLED_VECTOR_64,
+ scramble_vector, generate_scramble_constants64, offset,
+            'curandStateScrambledSobol64', scrambledsobol64_random_source+random_skip_ahead64_source)
+
+ self.skip_ahead64 = self.module.get_function("skip_ahead64")
+        self.skip_ahead64.prepare("PiQ")
+ self.skip_ahead_array64 = self.module.get_function("skip_ahead_array64")
+ self.skip_ahead_array64.prepare("PiP")
+
+ def _kernels(self):
+ return (_ScrambledSobolRandomNumberGeneratorBase._kernels(self)
+ + [self.module.get_function("skip_ahead64"), self.module.get_function("skip_ahead_array64")])
+
+ def call_skip_ahead64(self, i, stream=None):
+ self.skip_ahead64.prepared_async_call(
+ (self.block_count, 1), (self.generators_per_block, 1, 1), stream,
+ self.state, self.generators_per_block, i)
+
+ def call_skip_ahead_array64(self, i, stream=None):
+ self.skip_ahead_array64.prepared_async_call(
+ (self.block_count, 1), (self.generators_per_block, 1, 1), stream,
+ self.state, self.generators_per_block, i.gpudata)
# }}}
# }}}
diff --git a/src/cpp/curand.hpp b/src/cpp/curand.hpp
index 9783239..b3d015d 100644
--- a/src/cpp/curand.hpp
+++ b/src/cpp/curand.hpp
@@ -52,19 +52,94 @@ namespace pycuda { namespace curandom {
if (PyObject_AsWriteBuffer(dst.ptr(), &buf, &len))
throw py::error_already_set();
- if (CURAND_DIRECTION_VECTORS_32_JOEKUO6 == set) {
+ if (CURAND_DIRECTION_VECTORS_32_JOEKUO6 == set
+#if CUDAPP_CUDA_VERSION >= 4000
+ || CURAND_SCRAMBLED_DIRECTION_VECTORS_32_JOEKUO6 == set
+#endif
+ ) {
curandDirectionVectors32_t *vectors;
CURAND_CALL_GUARDED(curandGetDirectionVectors32, (&vectors, set));
while (count > 0) {
int size = ((count > 20000) ? 20000 : count)*sizeof(curandDirectionVectors32_t);
- memcpy((int *)buf+n*20000*sizeof(curandDirectionVectors32_t)/sizeof(unsigned int), vectors, size);
+ memcpy((unsigned int *)buf+n*20000*sizeof(curandDirectionVectors32_t)/sizeof(unsigned int), vectors, size);
count -= size/sizeof(curandDirectionVectors32_t);
n++;
}
}
+#if CUDAPP_CUDA_VERSION >= 4000
+ if (CURAND_DIRECTION_VECTORS_64_JOEKUO6 == set
+ || CURAND_SCRAMBLED_DIRECTION_VECTORS_64_JOEKUO6 == set) {
+ curandDirectionVectors64_t *vectors;
+ CURAND_CALL_GUARDED(curandGetDirectionVectors64, (&vectors, set));
+ while (count > 0) {
+ int size = ((count > 20000) ? 20000 : count)*sizeof(curandDirectionVectors64_t);
+ memcpy((unsigned long long *)buf+n*20000*sizeof(curandDirectionVectors64_t)/sizeof(unsigned long long), vectors, size);
+ count -= size/sizeof(curandDirectionVectors64_t);
+ n++;
+ }
+ }
+#endif
+ }
+#endif
+
+#if CUDAPP_CUDA_VERSION >= 4000
+ void py_curand_get_scramble_constants32(py::object dst, int count)
+ {
+ void *buf;
+ PYCUDA_BUFFER_SIZE_T len;
+ int n = 0;
+
+ if (PyObject_AsWriteBuffer(dst.ptr(), &buf, &len))
+ throw py::error_already_set();
+ unsigned int *vectors;
+ CURAND_CALL_GUARDED(curandGetScrambleConstants32, (&vectors));
+// Documentation does not mention number of dimensions
+// Assuming the same as in getDirectionVectors*
+ while (count > 0) {
+ int size = ((count > 20000) ? 20000 : count)*sizeof(unsigned int);
+ memcpy((unsigned int *)buf+n*20000, vectors, size);
+ count -= size/sizeof(unsigned int);
+ n++;
+ }
+ }
+
+ void py_curand_get_scramble_constants64(py::object dst, int count)
+ {
+ void *buf;
+ PYCUDA_BUFFER_SIZE_T len;
+ int n = 0;
+
+ if (PyObject_AsWriteBuffer(dst.ptr(), &buf, &len))
+ throw py::error_already_set();
+ unsigned long long *vectors;
+ CURAND_CALL_GUARDED(curandGetScrambleConstants64, (&vectors));
+// Documentation does not mention number of dimensions
+// Assuming the same as in getDirectionVectors*
+ while (count > 0) {
+ int size = ((count > 20000) ? 20000 : count)*sizeof(unsigned long long);
+ memcpy((unsigned long long *)buf+n*20000, vectors, size);
+ count -= size/sizeof(unsigned long long);
+ n++;
+ }
}
#endif
+// TODO: add more methods in classes (log)
+// log_normal(state, mean, stddev)
+// log_normal(Scrambled64)
+// log_normal_double(Scrambled64)
+// log_normal(Sobol64)
+// log_normal_double(Sobol64)
+// log_normal(Scrambled32)
+// log_normal_double(Scrambled32)
+// log_normal(Sobol32)
+// log_normal_double(Sobol32)
+//
+// log_normal(XORWOW)
+// log_normal_double(XORWOW)
+// log_normal2(XORWOW)
+// log_normal2_double(XORWOW)
+//
} }
#endif
diff --git a/src/wrapper/wrap_curand.cpp b/src/wrapper/wrap_curand.cpp
index 4ca1e42..681c339 100644
--- a/src/wrapper/wrap_curand.cpp
+++ b/src/wrapper/wrap_curand.cpp
@@ -19,6 +19,11 @@ void pycuda_expose_curand()
#if CUDAPP_CUDA_VERSION >= 3020
py::enum_<curandDirectionVectorSet_t>("direction_vector_set")
.value("VECTOR_32", CURAND_DIRECTION_VECTORS_32_JOEKUO6)
+#if CUDAPP_CUDA_VERSION >= 4000
+ .value("SCRAMBLED_VECTOR_32", CURAND_SCRAMBLED_DIRECTION_VECTORS_32_JOEKUO6)
+    .value("VECTOR_64", CURAND_DIRECTION_VECTORS_64_JOEKUO6)
+ .value("SCRAMBLED_VECTOR_64", CURAND_SCRAMBLED_DIRECTION_VECTORS_64_JOEKUO6)
+#endif
;
#endif
@@ -28,4 +33,11 @@ void pycuda_expose_curand()
py::def("_get_direction_vectors", py_curand_get_direction_vectors,
(arg("set"), arg("dst"), arg("count")));
#endif
+
+#if CUDAPP_CUDA_VERSION >= 4000
+ py::def("_get_scramble_constants32", py_curand_get_scramble_constants32,
+ (arg("dst"), arg("count")));
+ py::def("_get_scramble_constants64", py_curand_get_scramble_constants64,
+ (arg("dst"), arg("count")));
+#endif
}
signature.asc
Description: This is a digitally signed message part
_______________________________________________ PyCUDA mailing list [email protected] http://lists.tiker.net/listinfo/pycuda
