Hello.
I attach patch bringing new CURAND 4 features to PyCUDA.
Because there are more Sobol generator types, I decided
to introduce inheritance: _SobolRandomNumberGeneratorBase
and _ScrambledSobolRandomNumberGeneratorBase both
inherit from _RandomNumberGeneratorBase and are inherited
by proper Sobol{32,64}RandomNumberGenerator classes.
This leaves repetition in Sobol64 and ScrambledSobol64
concerning methods accepting 64-bit arguments, but
I do not have an idea how to solve it without complicating the code. I have not yet added curand_log_normal kernels - unlike other kernels they require additional parameters. I wanted to focus on bringing new types to PyCUDA first, and only then try to extend classes by adding new kernels. Best regards. -- Tomasz Rybak <[email protected]> GPG/PGP key ID: 2AD5 9860 Fingerprint A481 824E 7DD3 9C0E C40A 488E C654 FB33 2AD5 9860 http://member.acm.org/~tomaszrybak
diff --git a/pycuda/curandom.py b/pycuda/curandom.py
index 4a4bd7e..f236ad6 100644
--- a/pycuda/curandom.py
+++ b/pycuda/curandom.py
@@ -258,6 +258,10 @@ if get_curand_version() >= (3, 2, 0):
direction_vector_set = _curand.direction_vector_set
_get_direction_vectors = _curand._get_direction_vectors
+if get_curand_version() >= (4, 0, 0):
+ _get_scramble_constants32 = _curand._get_scramble_constants32
+ _get_scramble_constants64 = _curand._get_scramble_constants64
+
# {{{ Base class
gen_template = """
@@ -279,14 +283,14 @@ extern "C"
%(generators)s
-__global__ void skip_ahead(%(state_type)s *s, const int n, const int skip)
+__global__ void skip_ahead(%(state_type)s *s, const int n, const unsigned int skip)
{
const int idx = blockIdx.x*blockDim.x+threadIdx.x;
if (idx < n)
skipahead(skip, &s[idx]);
}
-__global__ void skip_ahead_array(%(state_type)s *s, const int n, const int *skip)
+__global__ void skip_ahead_array(%(state_type)s *s, const int n, const unsigned int *skip)
{
const int idx = blockIdx.x*blockDim.x+threadIdx.x;
if (idx < n)
@@ -529,20 +533,158 @@ if get_curand_version() >= (3, 2, 0):
# }}}
-# {{{ Sobol32 RNG
+# {{{ Sobol RNG
def generate_direction_vectors(count, direction=None):
- if direction is None:
- direction = direction_vector_set.VECTOR_32
-
- result = np.empty((count, 32), dtype=np.int32)
+ if get_curand_version() >= (4, 0, 0):
+ if direction == direction_vector_set.VECTOR_64 or \
+ direction == direction_vector_set.SCRAMBLED_VECTOR_64:
+ result = np.empty((count, 64), dtype=np.uint64)
+ else:
+ result = np.empty((count, 32), dtype=np.uint32)
+ else:
+ result = np.empty((count, 32), dtype=np.uint32)
_get_direction_vectors(direction, result, count)
return pycuda.gpuarray.to_gpu(result)
+if get_curand_version() >= (4, 0, 0):
+ def generate_scramble_constants32(count):
+ result = np.empty((count, ), dtype=np.uint32)
+ _get_scramble_constants32(result, count)
+ return pycuda.gpuarray.to_gpu(result)
+
+ def generate_scramble_constants64(count):
+ result = np.empty((count, ), dtype=np.uint64)
+ _get_scramble_constants64(result, count)
+ return pycuda.gpuarray.to_gpu(result)
+
+class _SobolRandomNumberGeneratorBase(_RandomNumberGeneratorBase):
+ """
+ Class surrounding CURAND kernels from CUDA 3.2.
+ It allows for generating quasi-random numbers with uniform
+ and normal probability function of type int, float, and double.
+ """
+
+ has_box_muller = False
+
+ def __init__(self, dir_vector, dir_vector_dtype, dir_vector_size,
+ dir_vector_set, offset, state_type, sobol_random_source):
+ super(_SobolRandomNumberGeneratorBase, self).__init__(state_type,
+ sobol_random_source)
+
+ if dir_vector is None:
+ dir_vector = generate_direction_vectors(
+ self.block_count * self.generators_per_block, dir_vector_set)
+
+ if not (isinstance(dir_vector, pycuda.gpuarray.GPUArray)
+ and dir_vector.dtype == dir_vector_dtype
+ and dir_vector.shape == (self.block_count * self.generators_per_block, dir_vector_size)):
+ raise TypeError("seed must be GPUArray of integers of right length")
+
+ p = self.module.get_function("prepare")
+ p.prepare("PiPi", block=(self.generators_per_block, 1, 1))
+
+ from pycuda.characterize import has_stack
+ has_stack = has_stack()
+
+ if has_stack:
+ prev_stack_size = drv.Context.get_limit(drv.limit.STACK_SIZE)
+
+ try:
+ if has_stack:
+ drv.Context.set_limit(drv.limit.STACK_SIZE, 1<<14) # 16k
+ try:
+
+ dev = drv.Context.get_device()
+ if dev.compute_capability() >= (2, 0):
+ p.prepared_call((self.block_count, 1), self.state,
+ self.block_count * self.generators_per_block, dir_vector.gpudata, offset)
+ else:
+ p.prepared_call((2 * self.block_count, 1), self.state,
+ self.block_count * self.generators_per_block // 2, dir_vector.gpudata, offset)
+ except drv.LaunchError:
+ raise ValueError("Initialisation failed. Decrease number of threads.")
+
+ finally:
+ if has_stack:
+ drv.Context.set_limit(drv.limit.STACK_SIZE, prev_stack_size)
+
+ def _kernels(self):
+ return (_RandomNumberGeneratorBase._kernels(self)
+ + [self.module.get_function("prepare")])
+
+class _ScrambledSobolRandomNumberGeneratorBase(_RandomNumberGeneratorBase):
+ """
+ Class surrounding CURAND kernels from CUDA 4.0.
+ It allows for generating quasi-random numbers with uniform
+ and normal probability function of type int, float, and double.
+ """
+
+ has_box_muller = False
+
+ def __init__(self, dir_vector, dir_vector_dtype, dir_vector_size,
+ dir_vector_set, scramble_vector, scramble_vector_function,
+ offset, state_type, sobol_random_source):
+ super(_ScrambledSobolRandomNumberGeneratorBase, self).__init__(state_type,
+ sobol_random_source)
+
+ if dir_vector is None:
+ dir_vector = generate_direction_vectors(
+ self.block_count * self.generators_per_block,
+ dir_vector_set)
+
+ if scramble_vector is None:
+ scramble_vector = scramble_vector_function(
+ self.block_count * self.generators_per_block)
+
+ if not (isinstance(dir_vector, pycuda.gpuarray.GPUArray)
+ and dir_vector.dtype == dir_vector_dtype
+ and dir_vector.shape == (self.block_count * self.generators_per_block, dir_vector_size)):
+ raise TypeError("seed must be GPUArray of integers of right length")
+
+ if not (isinstance(scramble_vector, pycuda.gpuarray.GPUArray)
+ and scramble_vector.dtype == dir_vector_dtype
+ and scramble_vector.shape == (self.block_count * self.generators_per_block, )):
+ raise TypeError("scramble must be GPUArray of integers of right length")
+
+ p = self.module.get_function("prepare")
+ p.prepare("PiPPi", block=(self.generators_per_block, 1, 1))
+
+ from pycuda.characterize import has_stack
+ has_stack = has_stack()
+
+ if has_stack:
+ prev_stack_size = drv.Context.get_limit(drv.limit.STACK_SIZE)
+
+ try:
+ if has_stack:
+ drv.Context.set_limit(drv.limit.STACK_SIZE, 1<<14) # 16k
+ try:
+
+ dev = drv.Context.get_device()
+ if dev.compute_capability() >= (2, 0):
+ p.prepared_call((self.block_count, 1), self.state,
+ self.block_count * self.generators_per_block,
+ dir_vector.gpudata, scramble_vector.gpudata, offset)
+ else:
+ p.prepared_call((2 * self.block_count, 1), self.state,
+ self.block_count * self.generators_per_block // 2,
+ dir_vector.gpudata, scramble_vector.gpudata, offset)
+ except drv.LaunchError:
+ raise ValueError("Initialisation failed. Decrease number of threads.")
+
+ finally:
+ if has_stack:
+ drv.Context.set_limit(drv.limit.STACK_SIZE, prev_stack_size)
+
+ def _kernels(self):
+ return (_RandomNumberGeneratorBase._kernels(self)
+ + [self.module.get_function("prepare")])
+
sobol32_random_source = """
extern "C" {
-__global__ void prepare(curandStateSobol32 *s, const int n, curandDirectionVectors32_t *v,
- const unsigned int o)
+__global__ void prepare(curandStateSobol32 *s, const int n,
+ curandDirectionVectors32_t *v, const unsigned int o)
{
const int id = blockIdx.x*blockDim.x+threadIdx.x;
if (id < n)
@@ -552,60 +694,150 @@ __global__ void prepare(curandStateSobol32 *s, const int n, curandDirectionVecto
"""
if get_curand_version() >= (3, 2, 0):
- class Sobol32RandomNumberGenerator(_RandomNumberGeneratorBase):
+ class Sobol32RandomNumberGenerator(_SobolRandomNumberGeneratorBase):
"""
Class surrounding CURAND kernels from CUDA 3.2.
It allows for generating quasi-random numbers with uniform
and normal probability function of type int, float, and double.
"""
- has_box_muller = False
-
def __init__(self, dir_vector=None, offset=0):
- super(Sobol32RandomNumberGenerator, self).__init__('curandStateSobol32',
- sobol32_random_source)
+ super(Sobol32RandomNumberGenerator, self).__init__(dir_vector,
+ np.uint32, 32, direction_vector_set.VECTOR_32, offset,
+ 'curandStateSobol32', sobol32_random_source)
- if dir_vector is None:
- dir_vector = generate_direction_vectors(
- self.block_count * self.generators_per_block)
+scrambledsobol32_random_source = """
+extern "C" {
+__global__ void prepare(curandStateScrambledSobol32 *s, const int n,
+ curandDirectionVectors32_t *v, unsigned int *scramble, const unsigned int o)
+{
+ const int id = blockIdx.x*blockDim.x+threadIdx.x;
+ if (id < n)
+ curand_init(v[id], scramble[id], o, &s[id]);
+}
+}
+"""
- if not (isinstance(dir_vector, pycuda.gpuarray.GPUArray)
- and dir_vector.dtype == np.int32
- and dir_vector.shape == (self.block_count * self.generators_per_block, 32)):
- raise TypeError("seed must be GPUArray of integers of right length")
+if get_curand_version() >= (4, 0, 0):
+ class ScrambledSobol32RandomNumberGenerator(_ScrambledSobolRandomNumberGeneratorBase):
+ """
+ Class surrounding CURAND kernels from CUDA 4.0.
+ It allows for generating quasi-random numbers with uniform
+ and normal probability function of type int, float, and double.
+ """
- p = self.module.get_function("prepare")
- p.prepare("PiPi", block=(self.generators_per_block, 1, 1))
+ def __init__(self, dir_vector=None, scramble_vector=None, offset=0):
+ super(ScrambledSobol32RandomNumberGenerator, self).__init__(dir_vector,
+ np.uint32, 32, direction_vector_set.SCRAMBLED_VECTOR_32,
+ scramble_vector, generate_scramble_constants32, offset,
+ 'curandStateScrambledSobol32', scrambledsobol32_random_source)
- from pycuda.characterize import has_stack
- has_stack = has_stack()
+sobol64_random_source = """
+extern "C" {
+__global__ void prepare(curandStateSobol64 *s, const int n, curandDirectionVectors64_t *v,
+ const unsigned int o)
+{
+ const int id = blockIdx.x*blockDim.x+threadIdx.x;
+ if (id < n)
+ curand_init(v[id], o, &s[id]);
+}
+}
+"""
- if has_stack:
- prev_stack_size = drv.Context.get_limit(drv.limit.STACK_SIZE)
+random_skip_ahead64_source = """
+extern "C" {
+__global__ void skip_ahead64(%(state_type)s *s, const int n, const unsigned long long skip)
+{
+ const int idx = blockIdx.x*blockDim.x+threadIdx.x;
+ if (idx < n)
+ skipahead(skip, &s[idx]);
+}
- try:
- if has_stack:
- drv.Context.set_limit(drv.limit.STACK_SIZE, 1<<14) # 16k
- try:
+__global__ void skip_ahead_array64(%(state_type)s *s, const int n, const unsigned long long *skip)
+{
+ const int idx = blockIdx.x*blockDim.x+threadIdx.x;
+ if (idx < n)
+ skipahead(skip[idx], &s[idx]);
+}
+}
+"""
- dev = drv.Context.get_device()
- if dev.compute_capability() >= (2, 0):
- p.prepared_call((self.block_count, 1), self.state,
- self.block_count * self.generators_per_block, dir_vector.gpudata, offset)
- else:
- p.prepared_call((2 * self.block_count, 1), self.state,
- self.block_count * self.generators_per_block // 2, dir_vector.gpudata, offset)
- except drv.LaunchError:
- raise ValueError("Initialisation failed. Decrease number of threads.")
+if get_curand_version() >= (4, 0, 0):
+ class Sobol64RandomNumberGenerator(_SobolRandomNumberGeneratorBase):
+ """
+ Class surrounding CURAND kernels from CUDA 4.0.
+ It allows for generating quasi-random numbers with uniform
+ and normal probability function of type int, float, and double.
+ """
- finally:
- if has_stack:
- drv.Context.set_limit(drv.limit.STACK_SIZE, prev_stack_size)
+ def __init__(self, dir_vector=None, offset=0):
+ super(Sobol64RandomNumberGenerator, self).__init__(dir_vector,
+ np.uint64, 64, direction_vector_set.VECTOR_64, offset,
+ 'curandStateSobol64', sobol64_random_source+random_skip_ahead64_source)
+
+ self.skip_ahead64 = self.module.get_function("skip_ahead64")
+        self.skip_ahead64.prepare("PiQ")
+ self.skip_ahead_array64 = self.module.get_function("skip_ahead_array64")
+ self.skip_ahead_array64.prepare("PiP")
def _kernels(self):
- return (_RandomNumberGeneratorBase._kernels(self)
- + [self.module.get_function("prepare")])
+ return (_SobolRandomNumberGeneratorBase._kernels(self)
+ + [self.module.get_function("skip_ahead64"), self.module.get_function("skip_ahead_array64")])
+
+ def call_skip_ahead64(self, i, stream=None):
+ self.skip_ahead64.prepared_async_call(
+ (self.block_count, 1), (self.generators_per_block, 1, 1), stream,
+ self.state, self.generators_per_block, i)
+ def call_skip_ahead_array64(self, i, stream=None):
+ self.skip_ahead_array64.prepared_async_call(
+ (self.block_count, 1), (self.generators_per_block, 1, 1), stream,
+ self.state, self.generators_per_block, i.gpudata)
+
+scrambledsobol64_random_source = """
+extern "C" {
+__global__ void prepare(curandStateScrambledSobol64 *s, const int n,
+ curandDirectionVectors64_t *v, unsigned long long *scramble, const unsigned int o)
+{
+ const int id = blockIdx.x*blockDim.x+threadIdx.x;
+ if (id < n)
+ curand_init(v[id], scramble[id], o, &s[id]);
+}
+}
+"""
+
+if get_curand_version() >= (4, 0, 0):
+ class ScrambledSobol64RandomNumberGenerator(_ScrambledSobolRandomNumberGeneratorBase):
+ """
+ Class surrounding CURAND kernels from CUDA 4.0.
+ It allows for generating quasi-random numbers with uniform
+ and normal probability function of type int, float, and double.
+ """
+
+ def __init__(self, dir_vector=None, scramble_vector=None, offset=0):
+ super(ScrambledSobol64RandomNumberGenerator, self).__init__(dir_vector,
+ np.uint64, 64, direction_vector_set.SCRAMBLED_VECTOR_64,
+ scramble_vector, generate_scramble_constants64, offset,
+            'curandStateScrambledSobol64', scrambledsobol64_random_source+random_skip_ahead64_source)
+
+ self.skip_ahead64 = self.module.get_function("skip_ahead64")
+        self.skip_ahead64.prepare("PiQ")
+ self.skip_ahead_array64 = self.module.get_function("skip_ahead_array64")
+ self.skip_ahead_array64.prepare("PiP")
+
+ def _kernels(self):
+ return (_ScrambledSobolRandomNumberGeneratorBase._kernels(self)
+ + [self.module.get_function("skip_ahead64"), self.module.get_function("skip_ahead_array64")])
+
+ def call_skip_ahead64(self, i, stream=None):
+ self.skip_ahead64.prepared_async_call(
+ (self.block_count, 1), (self.generators_per_block, 1, 1), stream,
+ self.state, self.generators_per_block, i)
+
+ def call_skip_ahead_array64(self, i, stream=None):
+ self.skip_ahead_array64.prepared_async_call(
+ (self.block_count, 1), (self.generators_per_block, 1, 1), stream,
+ self.state, self.generators_per_block, i.gpudata)
# }}}
# }}}
diff --git a/src/cpp/curand.hpp b/src/cpp/curand.hpp
index 9783239..b3d015d 100644
--- a/src/cpp/curand.hpp
+++ b/src/cpp/curand.hpp
@@ -52,19 +52,94 @@ namespace pycuda { namespace curandom {
if (PyObject_AsWriteBuffer(dst.ptr(), &buf, &len))
throw py::error_already_set();
- if (CURAND_DIRECTION_VECTORS_32_JOEKUO6 == set) {
+ if (CURAND_DIRECTION_VECTORS_32_JOEKUO6 == set
+#if CUDAPP_CUDA_VERSION >= 4000
+ || CURAND_SCRAMBLED_DIRECTION_VECTORS_32_JOEKUO6 == set
+#endif
+ ) {
curandDirectionVectors32_t *vectors;
CURAND_CALL_GUARDED(curandGetDirectionVectors32, (&vectors, set));
while (count > 0) {
int size = ((count > 20000) ? 20000 : count)*sizeof(curandDirectionVectors32_t);
- memcpy((int *)buf+n*20000*sizeof(curandDirectionVectors32_t)/sizeof(unsigned int), vectors, size);
+ memcpy((unsigned int *)buf+n*20000*sizeof(curandDirectionVectors32_t)/sizeof(unsigned int), vectors, size);
count -= size/sizeof(curandDirectionVectors32_t);
n++;
}
}
+#if CUDAPP_CUDA_VERSION >= 4000
+ if (CURAND_DIRECTION_VECTORS_64_JOEKUO6 == set
+ || CURAND_SCRAMBLED_DIRECTION_VECTORS_64_JOEKUO6 == set) {
+ curandDirectionVectors64_t *vectors;
+ CURAND_CALL_GUARDED(curandGetDirectionVectors64, (&vectors, set));
+ while (count > 0) {
+ int size = ((count > 20000) ? 20000 : count)*sizeof(curandDirectionVectors64_t);
+ memcpy((unsigned long long *)buf+n*20000*sizeof(curandDirectionVectors64_t)/sizeof(unsigned long long), vectors, size);
+ count -= size/sizeof(curandDirectionVectors64_t);
+ n++;
+ }
+ }
+#endif
+ }
+#endif
+
+#if CUDAPP_CUDA_VERSION >= 4000
+ void py_curand_get_scramble_constants32(py::object dst, int count)
+ {
+ void *buf;
+ PYCUDA_BUFFER_SIZE_T len;
+ int n = 0;
+
+ if (PyObject_AsWriteBuffer(dst.ptr(), &buf, &len))
+ throw py::error_already_set();
+ unsigned int *vectors;
+ CURAND_CALL_GUARDED(curandGetScrambleConstants32, (&vectors));
+// Documentation does not mention number of dimensions
+// Assuming the same as in getDirectionVectors*
+ while (count > 0) {
+ int size = ((count > 20000) ? 20000 : count)*sizeof(unsigned int);
+ memcpy((unsigned int *)buf+n*20000, vectors, size);
+ count -= size/sizeof(unsigned int);
+ n++;
+ }
+ }
+
+ void py_curand_get_scramble_constants64(py::object dst, int count)
+ {
+ void *buf;
+ PYCUDA_BUFFER_SIZE_T len;
+ int n = 0;
+
+ if (PyObject_AsWriteBuffer(dst.ptr(), &buf, &len))
+ throw py::error_already_set();
+ unsigned long long *vectors;
+ CURAND_CALL_GUARDED(curandGetScrambleConstants64, (&vectors));
+// Documentation does not mention number of dimensions
+// Assuming the same as in getDirectionVectors*
+ while (count > 0) {
+ int size = ((count > 20000) ? 20000 : count)*sizeof(unsigned long long);
+ memcpy((unsigned long long *)buf+n*20000, vectors, size);
+ count -= size/sizeof(unsigned long long);
+ n++;
+ }
}
#endif
+// TODO: add more methods in classes (log)
+// log_normal(state, mean, stddev)
+// log_normal(Scrambled64)
+// log_normal_double(Scrambled64)
+// log_normal(Sobol64)
+// log_normal_double(Sobol64)
+// log_normal(Scrambled32)
+// log_normal_double(Scrambled32)
+// log_normal(Sobol32)
+// log_normal_double(Sobol32)
+//
+// log_normal(XORWOW)
+// log_normal_double(XORWOW)
+// log_normal2(XORWOW)
+// log_normal2_double(XORWOW)
+//
} }
#endif
diff --git a/src/wrapper/wrap_curand.cpp b/src/wrapper/wrap_curand.cpp
index 4ca1e42..681c339 100644
--- a/src/wrapper/wrap_curand.cpp
+++ b/src/wrapper/wrap_curand.cpp
@@ -19,6 +19,11 @@ void pycuda_expose_curand()
#if CUDAPP_CUDA_VERSION >= 3020
py::enum_<curandDirectionVectorSet_t>("direction_vector_set")
.value("VECTOR_32", CURAND_DIRECTION_VECTORS_32_JOEKUO6)
+#if CUDAPP_CUDA_VERSION >= 4000
+ .value("SCRAMBLED_VECTOR_32", CURAND_SCRAMBLED_DIRECTION_VECTORS_32_JOEKUO6)
+    .value("VECTOR_64", CURAND_DIRECTION_VECTORS_64_JOEKUO6)
+ .value("SCRAMBLED_VECTOR_64", CURAND_SCRAMBLED_DIRECTION_VECTORS_64_JOEKUO6)
+#endif
;
#endif
@@ -28,4 +33,11 @@ void pycuda_expose_curand()
py::def("_get_direction_vectors", py_curand_get_direction_vectors,
(arg("set"), arg("dst"), arg("count")));
#endif
+
+#if CUDAPP_CUDA_VERSION >= 4000
+ py::def("_get_scramble_constants32", py_curand_get_scramble_constants32,
+ (arg("dst"), arg("count")));
+ py::def("_get_scramble_constants64", py_curand_get_scramble_constants64,
+ (arg("dst"), arg("count")));
+#endif
}
signature.asc
Description: This is a digitally signed message part
_______________________________________________ PyCUDA mailing list [email protected] http://lists.tiker.net/listinfo/pycuda
