Hello.
I have been trying to finally fix problem with CURAND 4 wrapper.
I am sending patch adding classes wrapping generators added in CURAND 4,
i.e. Sobol64 and ScrambledSobol{32, 64}.
I have fixed warnings related to block size in prepare_call.I have also tried to fix problem with lack of resources when creating Sobol64 classes. I have decided to decrease number of threads in constructor for both 1.x and 2.x devices. Unfortunately currently I have access to ION (1.1 device), so I cannot test it on Fermi device - can someone test it on Tesla C20x0 or GeForce 4x0 or 5x0? If this patch is OK, then the only missing part of CURAND 4 will be log_* generators. Best regards. -- Tomasz Rybak <[email protected]> GPG/PGP key ID: 2AD5 9860 Fingerprint A481 824E 7DD3 9C0E C40A 488E C654 FB33 2AD5 9860 http://member.acm.org/~tomaszrybak
diff --git a/doc/source/array.rst b/doc/source/array.rst
index 431aa37..e2a3427 100644
--- a/doc/source/array.rst
+++ b/doc/source/array.rst
@@ -447,10 +447,31 @@ Quasirandom numbers are more expensive to generate.
Accepts array i of integer values, telling each generator how many
values to skip.
+ .. method call_skip_ahead_sequence(i, stream=None)
+
+ Forces all generators to skip i subsequences. Is equivalent to
+ generating i * :math:`2^67` values and discarding results,
+ but is much faster.
+
+ .. method call_skip_ahead_sequence_array(i, stream=None)
+
+ Accepts array i of integer values, telling each generator how many
+ subsequences to skip.
+
.. function:: generate_direction_vectors(count, direction=direction_vector_set.VECTOR_32)
Return an :class:`GPUArray` `count` filled with direction vectors
- used to initialize Sobol32 generators.
+ used to initialize Sobol generators.
+
+.. function:: generate_scramble_constants32(count)
+
+ Return a :class:`GPUArray` filled with `count' 32-bit unsigned integer
+ numbers used to initialize :class:`ScrambledSobol32RandomNumberGenerator`
+
+.. function:: generate_scramble_constants64(count)
+
+ Return a :class:`GPUArray` filled with `count' 64-bit unsigned integer
+ numbers used to initialize :class:`ScrambledSobol64RandomNumberGenerator`
.. class:: Sobol32RandomNumberGenerator(dir_vector=None, offset=0)
@@ -487,6 +508,116 @@ Quasirandom numbers are more expensive to generate.
Accepts array i of integer values, telling each generator how many
values to skip.
+.. class:: ScrambledSobol32RandomNumberGenerator(dir_vector=None, scramble_vector=None, offset=0)
+
+ :arg dir_vector: a :class:`GPUArray` of 32-element `uint32` vectors which
+ are used to initialize quasirandom generator; it must contain one vector
+ for each initialized generator
+ :arg scramble_vector: a :class:`GPUArray` of `uint32` elements which
+ are used to initialize quasirandom generator; it must contain one number
+ for each initialized generator
+ :arg offset: Starting index into the Sobol32 sequence, given direction
+ vector.
+
+ Provides quasirandom numbers. Generates
+ sequences with period of :math:`2^32`.
+
+ CUDA 4.0 and above.
+
+ .. versionadded:: 2011.1
+
+ .. method fill_uniform(data, stream=None)
+
+ Fills in :class:`GPUArray` *data* with uniformly distributed
+ quasirandom values.
+
+ .. method fill_normal(data, stream=None)
+
+ Fills in :class:`GPUArray` *data* with normally distributed
+ quasirandom values.
+
+ .. method call_skip_ahead(i, stream=None)
+
+ Forces all generators to skip i values. Is equivalent to generating
+ i values and discarding results, but is much faster.
+
+ .. method call_skip_ahead_array(i, stream=None)
+
+ Accepts array i of integer values, telling each generator how many
+ values to skip.
+
+.. class:: Sobol64RandomNumberGenerator(dir_vector=None, offset=0)
+
+ :arg dir_vector: a :class:`GPUArray` of 64-element `uint64` vectors which
+ are used to initialize quasirandom generator; it must contain one vector
+ for each initialized generator
+ :arg offset: Starting index into the Sobol64 sequence, given direction
+ vector.
+
+ Provides quasirandom numbers. Generates
+ sequences with period of :math:`2^64`.
+
+ CUDA 4.0 and above.
+
+ .. versionadded:: 2011.1
+
+ .. method fill_uniform(data, stream=None)
+
+ Fills in :class:`GPUArray` *data* with uniformly distributed
+ quasirandom values.
+
+ .. method fill_normal(data, stream=None)
+
+ Fills in :class:`GPUArray` *data* with normally distributed
+ quasirandom values.
+
+ .. method call_skip_ahead(i, stream=None)
+
+ Forces all generators to skip i values. Is equivalent to generating
+ i values and discarding results, but is much faster.
+
+ .. method call_skip_ahead_array(i, stream=None)
+
+ Accepts array i of integer values, telling each generator how many
+ values to skip.
+
+.. class:: ScrambledSobol64RandomNumberGenerator(dir_vector=None, scramble_vector=None, offset=0)
+
+ :arg dir_vector: a :class:`GPUArray` of 64-element `uint64` vectors which
+ are used to initialize quasirandom generator; it must contain one vector
+ for each initialized generator
+ :arg scramble_vector: a :class:`GPUArray` of `uint64` vectors which
+ are used to initialize quasirandom generator; it must contain one vector
+ for each initialized generator
+ :arg offset: Starting index into the ScrambledSobol64 sequence,
+ given direction vector.
+
+ Provides quasirandom numbers. Generates
+ sequences with period of :math:`2^64`.
+
+ CUDA 4.0 and above.
+
+ .. versionadded:: 2011.1
+
+ .. method fill_uniform(data, stream=None)
+
+ Fills in :class:`GPUArray` *data* with uniformly distributed
+ quasirandom values.
+
+ .. method fill_normal(data, stream=None)
+
+ Fills in :class:`GPUArray` *data* with normally distributed
+ quasirandom values.
+
+ .. method call_skip_ahead(i, stream=None)
+
+ Forces all generators to skip i values. Is equivalent to generating
+ i values and discarding results, but is much faster.
+
+ .. method call_skip_ahead_array(i, stream=None)
+
+ Accepts array i of integer values, telling each generator how many
+ values to skip.
Single-pass Custom Expression Evaluation
----------------------------------------
diff --git a/pycuda/curandom.py b/pycuda/curandom.py
index ed7b8dc..9b1c5eb 100644
--- a/pycuda/curandom.py
+++ b/pycuda/curandom.py
@@ -258,6 +258,10 @@ if get_curand_version() >= (3, 2, 0):
direction_vector_set = _curand.direction_vector_set
_get_direction_vectors = _curand._get_direction_vectors
+if get_curand_version() >= (4, 0, 0):
+ _get_scramble_constants32 = _curand._get_scramble_constants32
+ _get_scramble_constants64 = _curand._get_scramble_constants64
+
# {{{ Base class
gen_template = """
@@ -279,14 +283,19 @@ extern "C"
%(generators)s
-__global__ void skip_ahead(%(state_type)s *s, const int n, const int skip)
+}
+"""
+
+random_skip_ahead32_source = """
+extern "C" {
+__global__ void skip_ahead(%(state_type)s *s, const int n, const unsigned int skip)
{
const int idx = blockIdx.x*blockDim.x+threadIdx.x;
if (idx < n)
skipahead(skip, &s[idx]);
}
-__global__ void skip_ahead_array(%(state_type)s *s, const int n, const int *skip)
+__global__ void skip_ahead_array(%(state_type)s *s, const int n, const unsigned int *skip)
{
const int idx = blockIdx.x*blockDim.x+threadIdx.x;
if (idx < n)
@@ -295,6 +304,23 @@ __global__ void skip_ahead_array(%(state_type)s *s, const int n, const int *skip
}
"""
+random_skip_ahead64_source = """
+extern "C" {
+__global__ void skip_ahead(%(state_type)s *s, const int n, const unsigned long long skip)
+{
+ const int idx = blockIdx.x*blockDim.x+threadIdx.x;
+ if (idx < n)
+ skipahead(skip, &s[idx]);
+}
+
+__global__ void skip_ahead_array(%(state_type)s *s, const int n, const unsigned long long *skip)
+{
+ const int idx = blockIdx.x*blockDim.x+threadIdx.x;
+ if (idx < n)
+ skipahead(skip[idx], &s[idx]);
+}
+}
+"""
@@ -460,7 +486,7 @@ def seed_getter_unique(N):
xorwow_random_source = """
extern "C" {
-__global__ void prepare_with_seeds(curandState *s, const int n,
+__global__ void prepare_with_seeds(curandStateXORWOW *s, const int n,
const int *seed, const int offset)
{
const int id = blockIdx.x*blockDim.x+threadIdx.x;
@@ -468,6 +494,19 @@ __global__ void prepare_with_seeds(curandState *s, const int n,
curand_init(seed[id], threadIdx.x, offset, &s[id]);
}
+__global__ void skip_ahead_sequence(%(state_type)s *s, const int n, const unsigned int skip)
+{
+ const int idx = blockIdx.x*blockDim.x+threadIdx.x;
+ if (idx < n)
+ skipahead_sequence(skip, &s[idx]);
+}
+
+__global__ void skip_ahead_sequence_array(%(state_type)s *s, const int n, const unsigned int *skip)
+{
+ const int idx = blockIdx.x*blockDim.x+threadIdx.x;
+ if (idx < n)
+ skipahead_sequence(skip[idx], &s[idx]);
+}
}
"""
@@ -483,7 +522,7 @@ if get_curand_version() >= (3, 2, 0):
"""
super(XORWOWRandomNumberGenerator, self).__init__(
- 'curandStateXORWOW', xorwow_random_source)
+ 'curandStateXORWOW', xorwow_random_source+random_skip_ahead64_source)
if seed_getter is None:
seed = array.to_gpu(
@@ -501,6 +540,10 @@ if get_curand_version() >= (3, 2, 0):
p = self.module.get_function("prepare_with_seeds")
p.prepare("PiPi")
+ self.skip_ahead_sequence = self.module.get_function("skip_ahead_sequence")
+ self.skip_ahead_sequence.prepare("Pii")
+ self.skip_ahead_sequence_array = self.module.get_function("skip_ahead_sequence_array")
+ self.skip_ahead_sequence_array.prepare("PiP")
from pycuda.characterize import has_stack
has_stack = has_stack()
@@ -523,26 +566,164 @@ if get_curand_version() >= (3, 2, 0):
if has_stack:
drv.Context.set_limit(drv.limit.STACK_SIZE, prev_stack_size)
+ def call_skip_ahead_sequence(self, i, stream=None):
+ self.skip_ahead_sequence.prepared_async_call(
+ (self.block_count, 1), (self.generators_per_block, 1, 1), stream,
+ self.state, self.generators_per_block, i)
+
+ def call_skip_ahead_sequence_array(self, i, stream=None):
+ self.skip_ahead_sequence_array.prepared_async_call(
+ (self.block_count, 1), (self.generators_per_block, 1, 1), stream,
+ self.state, self.generators_per_block, i.gpudata)
+
def _kernels(self):
return (_RandomNumberGeneratorBase._kernels(self)
- + [self.module.get_function("prepare_with_seeds")])
+ + [self.module.get_function("prepare_with_seeds")]
+ + [self.module.get_function("skip_ahead_sequence"),
+ self.module.get_function("skip_ahead_sequence_array")])
# }}}
-# {{{ Sobol32 RNG
+# {{{ Sobol RNG
def generate_direction_vectors(count, direction=None):
- if direction is None:
- direction = direction_vector_set.VECTOR_32
-
- result = np.empty((count, 32), dtype=np.int32)
+ if get_curand_version() >= (4, 0, 0):
+ if direction == direction_vector_set.VECTOR_64 or \
+ direction == direction_vector_set.SCRAMBLED_VECTOR_64:
+ result = np.empty((count, 64), dtype=np.uint64)
+ else:
+ result = np.empty((count, 32), dtype=np.uint32)
+ else:
+ result = np.empty((count, 32), dtype=np.uint32)
_get_direction_vectors(direction, result, count)
return pycuda.gpuarray.to_gpu(result)
+if get_curand_version() >= (4, 0, 0):
+ def generate_scramble_constants32(count):
+ result = np.empty((count, ), dtype=np.uint32)
+ _get_scramble_constants32(result, count)
+ return pycuda.gpuarray.to_gpu(result)
+
+ def generate_scramble_constants64(count):
+ result = np.empty((count, ), dtype=np.uint64)
+ _get_scramble_constants64(result, count)
+ return pycuda.gpuarray.to_gpu(result)
+
+class _SobolRandomNumberGeneratorBase(_RandomNumberGeneratorBase):
+ """
+ Class surrounding CURAND kernels from CUDA 3.2.
+ It allows for generating quasi-random numbers with uniform
+ and normal probability function of type int, float, and double.
+ """
+
+ has_box_muller = False
+
+ def __init__(self, dir_vector, dir_vector_dtype, dir_vector_size,
+ dir_vector_set, offset, state_type, sobol_random_source):
+ super(_SobolRandomNumberGeneratorBase, self).__init__(state_type,
+ sobol_random_source)
+
+ if dir_vector is None:
+ dir_vector = generate_direction_vectors(
+ self.block_count * self.generators_per_block, dir_vector_set)
+
+ if not (isinstance(dir_vector, pycuda.gpuarray.GPUArray)
+ and dir_vector.dtype == dir_vector_dtype
+ and dir_vector.shape == (self.block_count * self.generators_per_block, dir_vector_size)):
+ raise TypeError("seed must be GPUArray of integers of right length")
+
+ p = self.module.get_function("prepare")
+ p.prepare("PiPi")
+
+ from pycuda.characterize import has_stack
+ has_stack = has_stack()
+
+ if has_stack:
+ prev_stack_size = drv.Context.get_limit(drv.limit.STACK_SIZE)
+
+ try:
+ if has_stack:
+ drv.Context.set_limit(drv.limit.STACK_SIZE, 1<<14) # 16k
+ try:
+ p.prepared_call((2 * self.block_count, 1), (self.generators_per_block // 2, 1, 1),
+ self.state, self.block_count * self.generators_per_block,
+ dir_vector.gpudata, offset)
+ except drv.LaunchError:
+ raise ValueError("Initialisation failed. Decrease number of threads.")
+
+ finally:
+ if has_stack:
+ drv.Context.set_limit(drv.limit.STACK_SIZE, prev_stack_size)
+
+ def _kernels(self):
+ return (_RandomNumberGeneratorBase._kernels(self)
+ + [self.module.get_function("prepare")])
+
+class _ScrambledSobolRandomNumberGeneratorBase(_RandomNumberGeneratorBase):
+ """
+ Class surrounding CURAND kernels from CUDA 4.0.
+ It allows for generating quasi-random numbers with uniform
+ and normal probability function of type int, float, and double.
+ """
+
+ has_box_muller = False
+
+ def __init__(self, dir_vector, dir_vector_dtype, dir_vector_size,
+ dir_vector_set, scramble_vector, scramble_vector_function,
+ offset, state_type, sobol_random_source):
+ super(_ScrambledSobolRandomNumberGeneratorBase, self).__init__(state_type,
+ sobol_random_source)
+
+ if dir_vector is None:
+ dir_vector = generate_direction_vectors(
+ self.block_count * self.generators_per_block,
+ dir_vector_set)
+
+ if scramble_vector is None:
+ scramble_vector = scramble_vector_function(
+ self.block_count * self.generators_per_block)
+
+ if not (isinstance(dir_vector, pycuda.gpuarray.GPUArray)
+ and dir_vector.dtype == dir_vector_dtype
+ and dir_vector.shape == (self.block_count * self.generators_per_block, dir_vector_size)):
+ raise TypeError("seed must be GPUArray of integers of right length")
+
+ if not (isinstance(scramble_vector, pycuda.gpuarray.GPUArray)
+ and scramble_vector.dtype == dir_vector_dtype
+ and scramble_vector.shape == (self.block_count * self.generators_per_block, )):
+ raise TypeError("scramble must be GPUArray of integers of right length")
+
+ p = self.module.get_function("prepare")
+ p.prepare("PiPPi")
+
+ from pycuda.characterize import has_stack
+ has_stack = has_stack()
+
+ if has_stack:
+ prev_stack_size = drv.Context.get_limit(drv.limit.STACK_SIZE)
+
+ try:
+ if has_stack:
+ drv.Context.set_limit(drv.limit.STACK_SIZE, 1<<14) # 16k
+ try:
+ p.prepared_call((2 * self.block_count, 1), (self.generators_per_block // 2, 1, 1),
+ self.state, self.block_count * self.generators_per_block,
+ dir_vector.gpudata, scramble_vector.gpudata, offset)
+ except drv.LaunchError:
+ raise ValueError("Initialisation failed. Decrease number of threads.")
+
+ finally:
+ if has_stack:
+ drv.Context.set_limit(drv.limit.STACK_SIZE, prev_stack_size)
+
+ def _kernels(self):
+ return (_RandomNumberGeneratorBase._kernels(self)
+ + [self.module.get_function("prepare")])
+
sobol32_random_source = """
extern "C" {
-__global__ void prepare(curandStateSobol32 *s, const int n, curandDirectionVectors32_t *v,
- const unsigned int o)
+__global__ void prepare(curandStateSobol32 *s, const int n,
+ curandDirectionVectors32_t *v, const unsigned int o)
{
const int id = blockIdx.x*blockDim.x+threadIdx.x;
if (id < n)
@@ -552,59 +733,95 @@ __global__ void prepare(curandStateSobol32 *s, const int n, curandDirectionVecto
"""
if get_curand_version() >= (3, 2, 0):
- class Sobol32RandomNumberGenerator(_RandomNumberGeneratorBase):
+ class Sobol32RandomNumberGenerator(_SobolRandomNumberGeneratorBase):
"""
Class surrounding CURAND kernels from CUDA 3.2.
It allows for generating quasi-random numbers with uniform
and normal probability function of type int, float, and double.
"""
- has_box_muller = False
-
def __init__(self, dir_vector=None, offset=0):
- super(Sobol32RandomNumberGenerator, self).__init__('curandStateSobol32',
- sobol32_random_source)
+ super(Sobol32RandomNumberGenerator, self).__init__(dir_vector,
+ np.uint32, 32, direction_vector_set.VECTOR_32, offset,
+ 'curandStateSobol32', sobol32_random_source+random_skip_ahead32_source)
- if dir_vector is None:
- dir_vector = generate_direction_vectors(
- self.block_count * self.generators_per_block)
+scrambledsobol32_random_source = """
+extern "C" {
+__global__ void prepare(curandStateScrambledSobol32 *s, const int n,
+ curandDirectionVectors32_t *v, unsigned int *scramble, const unsigned int o)
+{
+ const int id = blockIdx.x*blockDim.x+threadIdx.x;
+ if (id < n)
+ curand_init(v[id], scramble[id], o, &s[id]);
+}
+}
+"""
- if not (isinstance(dir_vector, pycuda.gpuarray.GPUArray)
- and dir_vector.dtype == np.int32
- and dir_vector.shape == (self.block_count * self.generators_per_block, 32)):
- raise TypeError("seed must be GPUArray of integers of right length")
+if get_curand_version() >= (4, 0, 0):
+ class ScrambledSobol32RandomNumberGenerator(_ScrambledSobolRandomNumberGeneratorBase):
+ """
+ Class surrounding CURAND kernels from CUDA 4.0.
+ It allows for generating quasi-random numbers with uniform
+ and normal probability function of type int, float, and double.
+ """
- p = self.module.get_function("prepare")
- p.prepare("PiPi", block=(self.generators_per_block, 1, 1))
+ def __init__(self, dir_vector=None, scramble_vector=None, offset=0):
+ super(ScrambledSobol32RandomNumberGenerator, self).__init__(dir_vector,
+ np.uint32, 32, direction_vector_set.SCRAMBLED_VECTOR_32,
+ scramble_vector, generate_scramble_constants32, offset,
+ 'curandStateScrambledSobol32', scrambledsobol32_random_source+random_skip_ahead32_source)
- from pycuda.characterize import has_stack
- has_stack = has_stack()
+sobol64_random_source = """
+extern "C" {
+__global__ void prepare(curandStateSobol64 *s, const int n, curandDirectionVectors64_t *v,
+ const unsigned int o)
+{
+ const int id = blockIdx.x*blockDim.x+threadIdx.x;
+ if (id < n)
+ curand_init(v[id], o, &s[id]);
+}
+}
+"""
- if has_stack:
- prev_stack_size = drv.Context.get_limit(drv.limit.STACK_SIZE)
- try:
- if has_stack:
- drv.Context.set_limit(drv.limit.STACK_SIZE, 1<<14) # 16k
- try:
+if get_curand_version() >= (4, 0, 0):
+ class Sobol64RandomNumberGenerator(_SobolRandomNumberGeneratorBase):
+ """
+ Class surrounding CURAND kernels from CUDA 4.0.
+ It allows for generating quasi-random numbers with uniform
+ and normal probability function of type int, float, and double.
+ """
- dev = drv.Context.get_device()
- if dev.compute_capability() >= (2, 0):
- p.prepared_call((self.block_count, 1), self.state,
- self.block_count * self.generators_per_block, dir_vector.gpudata, offset)
- else:
- p.prepared_call((2 * self.block_count, 1), self.state,
- self.block_count * self.generators_per_block // 2, dir_vector.gpudata, offset)
- except drv.LaunchError:
- raise ValueError("Initialisation failed. Decrease number of threads.")
+ def __init__(self, dir_vector=None, offset=0):
+ super(Sobol64RandomNumberGenerator, self).__init__(dir_vector,
+ np.uint64, 64, direction_vector_set.VECTOR_64, offset,
+ 'curandStateSobol64', sobol64_random_source+random_skip_ahead64_source)
- finally:
- if has_stack:
- drv.Context.set_limit(drv.limit.STACK_SIZE, prev_stack_size)
+scrambledsobol64_random_source = """
+extern "C" {
+__global__ void prepare(curandStateScrambledSobol64 *s, const int n,
+ curandDirectionVectors64_t *v, unsigned long long *scramble, const unsigned int o)
+{
+ const int id = blockIdx.x*blockDim.x+threadIdx.x;
+ if (id < n)
+ curand_init(v[id], scramble[id], o, &s[id]);
+}
+}
+"""
- def _kernels(self):
- return (_RandomNumberGeneratorBase._kernels(self)
- + [self.module.get_function("prepare")])
+if get_curand_version() >= (4, 0, 0):
+ class ScrambledSobol64RandomNumberGenerator(_ScrambledSobolRandomNumberGeneratorBase):
+ """
+ Class surrounding CURAND kernels from CUDA 4.0.
+ It allows for generating quasi-random numbers with uniform
+ and normal probability function of type int, float, and double.
+ """
+
+ def __init__(self, dir_vector=None, scramble_vector=None, offset=0):
+ super(ScrambledSobol64RandomNumberGenerator, self).__init__(dir_vector,
+ np.uint64, 64, direction_vector_set.SCRAMBLED_VECTOR_64,
+ scramble_vector, generate_scramble_constants64, offset,
+ 'curandStateScrambledSobol64', sobol64_random_source+random_skip_ahead64_source)
# }}}
diff --git a/src/cpp/curand.hpp b/src/cpp/curand.hpp
index 9783239..b3d015d 100644
--- a/src/cpp/curand.hpp
+++ b/src/cpp/curand.hpp
@@ -52,19 +52,94 @@ namespace pycuda { namespace curandom {
if (PyObject_AsWriteBuffer(dst.ptr(), &buf, &len))
throw py::error_already_set();
- if (CURAND_DIRECTION_VECTORS_32_JOEKUO6 == set) {
+ if (CURAND_DIRECTION_VECTORS_32_JOEKUO6 == set
+#if CUDAPP_CUDA_VERSION >= 4000
+ || CURAND_SCRAMBLED_DIRECTION_VECTORS_32_JOEKUO6 == set
+#endif
+ ) {
curandDirectionVectors32_t *vectors;
CURAND_CALL_GUARDED(curandGetDirectionVectors32, (&vectors, set));
while (count > 0) {
int size = ((count > 20000) ? 20000 : count)*sizeof(curandDirectionVectors32_t);
- memcpy((int *)buf+n*20000*sizeof(curandDirectionVectors32_t)/sizeof(unsigned int), vectors, size);
+ memcpy((unsigned int *)buf+n*20000*sizeof(curandDirectionVectors32_t)/sizeof(unsigned int), vectors, size);
count -= size/sizeof(curandDirectionVectors32_t);
n++;
}
}
+#if CUDAPP_CUDA_VERSION >= 4000
+ if (CURAND_DIRECTION_VECTORS_64_JOEKUO6 == set
+ || CURAND_SCRAMBLED_DIRECTION_VECTORS_64_JOEKUO6 == set) {
+ curandDirectionVectors64_t *vectors;
+ CURAND_CALL_GUARDED(curandGetDirectionVectors64, (&vectors, set));
+ while (count > 0) {
+ int size = ((count > 20000) ? 20000 : count)*sizeof(curandDirectionVectors64_t);
+ memcpy((unsigned long long *)buf+n*20000*sizeof(curandDirectionVectors64_t)/sizeof(unsigned long long), vectors, size);
+ count -= size/sizeof(curandDirectionVectors64_t);
+ n++;
+ }
+ }
+#endif
+ }
+#endif
+
+#if CUDAPP_CUDA_VERSION >= 4000
+ void py_curand_get_scramble_constants32(py::object dst, int count)
+ {
+ void *buf;
+ PYCUDA_BUFFER_SIZE_T len;
+ int n = 0;
+
+ if (PyObject_AsWriteBuffer(dst.ptr(), &buf, &len))
+ throw py::error_already_set();
+ unsigned int *vectors;
+ CURAND_CALL_GUARDED(curandGetScrambleConstants32, (&vectors));
+// Documentation does not mention number of dimensions
+// Assuming the same as in getDirectionVectors*
+ while (count > 0) {
+ int size = ((count > 20000) ? 20000 : count)*sizeof(unsigned int);
+ memcpy((unsigned int *)buf+n*20000, vectors, size);
+ count -= size/sizeof(unsigned int);
+ n++;
+ }
+ }
+
+ void py_curand_get_scramble_constants64(py::object dst, int count)
+ {
+ void *buf;
+ PYCUDA_BUFFER_SIZE_T len;
+ int n = 0;
+
+ if (PyObject_AsWriteBuffer(dst.ptr(), &buf, &len))
+ throw py::error_already_set();
+ unsigned long long *vectors;
+ CURAND_CALL_GUARDED(curandGetScrambleConstants64, (&vectors));
+// Documentation does not mention number of dimensions
+// Assuming the same as in getDirectionVectors*
+ while (count > 0) {
+ int size = ((count > 20000) ? 20000 : count)*sizeof(unsigned long long);
+ memcpy((unsigned long long *)buf+n*20000, vectors, size);
+ count -= size/sizeof(unsigned long long);
+ n++;
+ }
}
#endif
+// TODO: add more methods in classes (log)
+// log_normal(state, mean, stddev)
+// log_normal(Scrambled64)
+// log_normal_double(Scrambled64)
+// log_normal(Sobol64)
+// log_normal_double(Sobol64)
+// log_normal(Scrambled32)
+// log_normal_double(Scrambled32)
+// log_normal(Sobol32)
+// log_normal_double(Sobol32)
+//
+// log_normal(XORWOW)
+// log_normal_double(XORWOW)
+// log_normal2(XORWOW)
+// log_normal2_double(XORWOW)
+//
} }
#endif
diff --git a/src/wrapper/wrap_curand.cpp b/src/wrapper/wrap_curand.cpp
index 4ca1e42..8bb2b7e 100644
--- a/src/wrapper/wrap_curand.cpp
+++ b/src/wrapper/wrap_curand.cpp
@@ -19,6 +19,11 @@ void pycuda_expose_curand()
#if CUDAPP_CUDA_VERSION >= 3020
py::enum_<curandDirectionVectorSet_t>("direction_vector_set")
.value("VECTOR_32", CURAND_DIRECTION_VECTORS_32_JOEKUO6)
+#if CUDAPP_CUDA_VERSION >= 4000
+ .value("SCRAMBLED_VECTOR_32", CURAND_SCRAMBLED_DIRECTION_VECTORS_32_JOEKUO6)
+ .value("VECTOR_64", CURAND_DIRECTION_VECTORS_64_JOEKUO6)
+ .value("SCRAMBLED_VECTOR_64", CURAND_SCRAMBLED_DIRECTION_VECTORS_64_JOEKUO6)
+#endif
;
#endif
@@ -28,4 +33,11 @@ void pycuda_expose_curand()
py::def("_get_direction_vectors", py_curand_get_direction_vectors,
(arg("set"), arg("dst"), arg("count")));
#endif
+
+#if CUDAPP_CUDA_VERSION >= 4000
+ py::def("_get_scramble_constants32", py_curand_get_scramble_constants32,
+ (arg("dst"), arg("count")));
+ py::def("_get_scramble_constants64", py_curand_get_scramble_constants64,
+ (arg("dst"), arg("count")));
+#endif
}
diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py
index 5149ed8..8672fcf 100644
--- a/test/test_gpuarray.py
+++ b/test/test_gpuarray.py
@@ -259,20 +259,30 @@ class TestGPUArray:
from pytest import skip
skip("curand not installed")
-
- from pycuda.curandom import (
- XORWOWRandomNumberGenerator,
- Sobol32RandomNumberGenerator)
+ generator_types = []
+ if get_curand_version() >= (3, 2, 0):
+ from pycuda.curandom import (
+ XORWOWRandomNumberGenerator,
+ Sobol32RandomNumberGenerator)
+ generator_types.extend([
+ XORWOWRandomNumberGenerator,
+ Sobol32RandomNumberGenerator])
+ if get_curand_version() >= (4, 0, 0):
+ from pycuda.curandom import (
+ ScrambledSobol32RandomNumberGenerator,
+ Sobol64RandomNumberGenerator,
+ ScrambledSobol64RandomNumberGenerator)
+ generator_types.extend([
+ ScrambledSobol32RandomNumberGenerator,
+ Sobol64RandomNumberGenerator,
+ ScrambledSobol64RandomNumberGenerator])
if has_double_support():
dtypes = [np.float32, np.float64]
else:
dtypes = [np.float32]
- for gen_type in [
- XORWOWRandomNumberGenerator,
- #Sobol32RandomNumberGenerator
- ]:
+ for gen_type in generator_types:
gen = gen_type()
for dtype in dtypes:
signature.asc
Description: This is a digitally signed message part
_______________________________________________ PyCUDA mailing list [email protected] http://lists.tiker.net/listinfo/pycuda
