Hi Andreas, here's the patch!
best Simon On Fri, Oct 10, 2014 at 5:41 PM, Andreas Kloeckner <[email protected]> wrote: > Simon Perkins <[email protected]> writes: > > > Hi there > > > > Would it be possible to add an allocator keyword argument to > > ReductionKernel.__call__ and gpuarray.sum etc.? > > > > At the moment we have: > > > > krnl = ReductionKernel(...) > > result = krnl(a, stream) > > > > Now __call__() uses a.allocator to make device allocations, but unless a > > has been allocated using a DeviceMemoryPool, a device allocation and > > deallocation occurs for the returned value. Additionally, this serialises > > asynchronous stream calls. One possible work-around is: > > > > pool = pycuda.tools.DeviceMemoryPool() > > tmp_alloc = a.allocator > > a.allocator = pool.allocate > > result = krnl(a, stream) > > a.allocator = tmp_alloc > > I'd be happy to take a patch. > > Andreas >
From 63d010fa96eeb5ae016c6034c1eb60c2e0318f7f Mon Sep 17 00:00:00 2001 From: Simon Perkins <[email protected]> Date: Wed, 15 Oct 2014 16:57:20 +0200 Subject: [PATCH] Add the ability to specify allocators for reduction kernels. --- pycuda/gpuarray.py | 16 ++++++++-------- pycuda/reduction.py | 10 ++++++++-- test/test_gpuarray.py | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 62 insertions(+), 10 deletions(-) diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index 8cac81e..cbd65c0 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -1278,30 +1278,30 @@ maximum = _make_binary_minmax_func("max") # {{{ reductions -def sum(a, dtype=None, stream=None): +def sum(a, dtype=None, stream=None, allocator=None): from pycuda.reduction import get_sum_kernel krnl = get_sum_kernel(dtype, a.dtype) - return krnl(a, stream=stream) + return krnl(a, stream=stream, allocator=allocator) -def subset_sum(subset, a, dtype=None, stream=None): +def subset_sum(subset, a, dtype=None, stream=None, allocator=None): from pycuda.reduction import get_subset_sum_kernel krnl = get_subset_sum_kernel(dtype, subset.dtype, a.dtype) - return krnl(subset, a, stream=stream) + return krnl(subset, a, stream=stream, allocator=allocator) -def dot(a, b, dtype=None, stream=None): +def dot(a, b, dtype=None, stream=None, allocator=None): from pycuda.reduction import get_dot_kernel if dtype is None: dtype = _get_common_dtype(a, b) krnl = get_dot_kernel(dtype, a.dtype, b.dtype) - return krnl(a, b, stream=stream) + return krnl(a, b, stream=stream, allocator=allocator) -def subset_dot(subset, a, b, dtype=None, stream=None): +def subset_dot(subset, a, b, dtype=None, stream=None, allocator=None): from pycuda.reduction import get_subset_dot_kernel krnl = get_subset_dot_kernel(dtype, subset.dtype, a.dtype, b.dtype) - return krnl(subset, a, b, stream=stream) + return krnl(subset, a, b, stream=stream, allocator=allocator) def _make_minmax_kernel(what): diff --git a/pycuda/reduction.py b/pycuda/reduction.py index c958b26..163fa6e 100644 ---
a/pycuda/reduction.py +++ b/pycuda/reduction.py @@ -232,6 +232,12 @@ class ReductionKernel: stream = kwargs.get("stream") + allocator = kwargs.get("allocator", None) + + if allocator is None: + import pycuda.driver + allocator = pycuda.driver.mem_alloc + from gpuarray import empty f = s1_func @@ -267,9 +273,9 @@ class ReductionKernel: seq_count = (sz + macroblock_size - 1) // macroblock_size if block_count == 1: - result = empty((), self.dtype_out, repr_vec.allocator) + result = empty((), self.dtype_out, allocator) else: - result = empty((block_count,), self.dtype_out, repr_vec.allocator) + result = empty((block_count,), self.dtype_out, allocator) kwargs = dict(shared_size=self.block_size*self.dtype_out.itemsize) diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py index 1c88a65..e0e249b 100644 --- a/test/test_gpuarray.py +++ b/test/test_gpuarray.py @@ -841,6 +841,52 @@ class TestGPUArray: assert minmax["cur_max"] == np.max(a) @mark_cuda_test + def test_sum_allocator(self): + import pycuda.tools + pool = pycuda.tools.DeviceMemoryPool() + + rng = np.random.randint(low=512,high=1024) + + a = gpuarray.arange(rng,dtype=np.int32) + b = gpuarray.sum(a) + c = gpuarray.sum(a, allocator=pool.allocate) + + # Test that we get the correct results + assert b.get() == rng*(rng-1)//2 + assert c.get() == rng*(rng-1)//2 + + # Test that result arrays were allocated with the appropriate allocator + assert b.allocator == drv.mem_alloc + assert c.allocator == pool.allocate + + @mark_cuda_test + def test_dot_allocator(self): + import pycuda.tools + pool = pycuda.tools.DeviceMemoryPool() + + a_cpu = np.random.randint(low=512,high=1024,size=1024) + b_cpu = np.random.randint(low=512,high=1024,size=1024) + + # Compute the result on the CPU + dot_cpu_1 = np.dot(a_cpu, b_cpu) + + a_gpu = gpuarray.to_gpu(a_cpu) + b_gpu = gpuarray.to_gpu(b_cpu) + + # Compute the result on the GPU using different allocators + dot_gpu_1 = gpuarray.dot(a_gpu, b_gpu) + dot_gpu_2 = gpuarray.dot(a_gpu, b_gpu, 
allocator=pool.allocate) + + # Test that we get the correct results + assert dot_cpu_1 == dot_gpu_1.get() + assert dot_cpu_1 == dot_gpu_2.get() + + # Test that result arrays were allocated with the appropriate allocator + assert dot_gpu_1.allocator == drv.mem_alloc + assert dot_gpu_2.allocator == pool.allocate + + + @mark_cuda_test def test_view_and_strides(self): from pycuda.curandom import rand as curand -- 1.9.1
_______________________________________________ PyCUDA mailing list [email protected] http://lists.tiker.net/listinfo/pycuda
