Hi Andreas, I've modified the patch to take the existing behaviour into account.
best Simon On Thu, Oct 16, 2014 at 7:15 AM, Andreas Kloeckner <[email protected]> wrote: > Hi Simon, > > Simon Perkins <[email protected]> writes: > > Here's the patch! > > The patch looks good. One minor complaint is that in absence of an > allocator kwarg, your patch changes existing behavior. Specifically, the > allocator that was previously used was the one of the array passed to > the reduction, and now it's pycuda.driver.mem_alloc. > > Andreas >
From a8dde56b9ec89f859ed164baf6f5cefff3eac56d Mon Sep 17 00:00:00 2001 From: Simon Perkins <[email protected]> Date: Wed, 15 Oct 2014 16:57:20 +0200 Subject: [PATCH] Add the ability to specify allocators for reduction kernels. --- pycuda/gpuarray.py | 16 ++++++++-------- pycuda/reduction.py | 8 ++++++-- test/test_gpuarray.py | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 60 insertions(+), 10 deletions(-) diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index 8cac81e..cbd65c0 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -1278,30 +1278,30 @@ maximum = _make_binary_minmax_func("max") # {{{ reductions -def sum(a, dtype=None, stream=None): +def sum(a, dtype=None, stream=None, allocator=None): from pycuda.reduction import get_sum_kernel krnl = get_sum_kernel(dtype, a.dtype) - return krnl(a, stream=stream) + return krnl(a, stream=stream, allocator=allocator) -def subset_sum(subset, a, dtype=None, stream=None): +def subset_sum(subset, a, dtype=None, stream=None, allocator=None): from pycuda.reduction import get_subset_sum_kernel krnl = get_subset_sum_kernel(dtype, subset.dtype, a.dtype) - return krnl(subset, a, stream=stream) + return krnl(subset, a, stream=stream, allocator=allocator) -def dot(a, b, dtype=None, stream=None): +def dot(a, b, dtype=None, stream=None, allocator=None): from pycuda.reduction import get_dot_kernel if dtype is None: dtype = _get_common_dtype(a, b) krnl = get_dot_kernel(dtype, a.dtype, b.dtype) - return krnl(a, b, stream=stream) + return krnl(a, b, stream=stream, allocator=allocator) -def subset_dot(subset, a, b, dtype=None, stream=None): +def subset_dot(subset, a, b, dtype=None, stream=None, allocator=None): from pycuda.reduction import get_subset_dot_kernel krnl = get_subset_dot_kernel(dtype, subset.dtype, a.dtype, b.dtype) - return krnl(subset, a, b, stream=stream) + return krnl(subset, a, b, stream=stream, allocator=allocator) def _make_minmax_kernel(what): diff --git a/pycuda/reduction.py b/pycuda/reduction.py index c958b26..8a48e34 100644 --- a/pycuda/reduction.py 
+++ b/pycuda/reduction.py @@ -256,6 +256,10 @@ class ReductionKernel: repr_vec = vectors[0] sz = repr_vec.size + + allocator = kwargs.get("allocator",None) + if allocator is None: + allocator = repr_vec.allocator if sz <= self.block_size*SMALL_SEQ_COUNT*MAX_BLOCK_COUNT: total_block_size = SMALL_SEQ_COUNT*self.block_size @@ -267,9 +271,9 @@ class ReductionKernel: seq_count = (sz + macroblock_size - 1) // macroblock_size if block_count == 1: - result = empty((), self.dtype_out, repr_vec.allocator) + result = empty((), self.dtype_out, allocator) else: - result = empty((block_count,), self.dtype_out, repr_vec.allocator) + result = empty((block_count,), self.dtype_out, allocator) kwargs = dict(shared_size=self.block_size*self.dtype_out.itemsize) diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py index 1c88a65..928badb 100644 --- a/test/test_gpuarray.py +++ b/test/test_gpuarray.py @@ -841,6 +841,52 @@ class TestGPUArray: assert minmax["cur_max"] == np.max(a) @mark_cuda_test + def test_sum_allocator(self): + import pycuda.tools + pool = pycuda.tools.DeviceMemoryPool() + + rng = np.random.randint(low=512,high=1024) + + a = gpuarray.arange(rng,dtype=np.int32) + b = gpuarray.sum(a) + c = gpuarray.sum(a, allocator=pool.allocate) + + # Test that we get the correct results + assert b.get() == rng*(rng-1)//2 + assert c.get() == rng*(rng-1)//2 + + # Test that result arrays were allocated with the appropriate allocator + assert b.allocator == a.allocator + assert c.allocator == pool.allocate + + @mark_cuda_test + def test_dot_allocator(self): + import pycuda.tools + pool = pycuda.tools.DeviceMemoryPool() + + a_cpu = np.random.randint(low=512,high=1024,size=1024) + b_cpu = np.random.randint(low=512,high=1024,size=1024) + + # Compute the result on the CPU + dot_cpu_1 = np.dot(a_cpu, b_cpu) + + a_gpu = gpuarray.to_gpu(a_cpu) + b_gpu = gpuarray.to_gpu(b_cpu) + + # Compute the result on the GPU using different allocators + dot_gpu_1 = gpuarray.dot(a_gpu, b_gpu) + dot_gpu_2 = 
gpuarray.dot(a_gpu, b_gpu, allocator=pool.allocate) + + # Test that we get the correct results + assert dot_cpu_1 == dot_gpu_1.get() + assert dot_cpu_1 == dot_gpu_2.get() + + # Test that result arrays were allocated with the appropriate allocator + assert dot_gpu_1.allocator == a_gpu.allocator + assert dot_gpu_2.allocator == pool.allocate + + + @mark_cuda_test def test_view_and_strides(self): from pycuda.curandom import rand as curand -- 1.9.1
_______________________________________________ PyCUDA mailing list [email protected] http://lists.tiker.net/listinfo/pycuda
