Hi Andreas

Here's the patch!

best
  Simon

On Fri, Oct 10, 2014 at 5:41 PM, Andreas Kloeckner <[email protected]>
wrote:

> Simon Perkins <[email protected]> writes:
>
> > Hi there
> >
> > Would it be possible to add an allocator keyword argument to
> > ReductionKernel.__call__ and gpuarray.sum etc.?
> >
> > At the moment we have:
> >
> > krnl = ReductionKernel(...)
> > result = krnl(a, stream)
> >
> > Now  __call__() uses a.allocator to make device allocations, but unless a
> > has been allocated using a DeviceMemoryPool, a device allocation and
> > deallocation occurs for the returned value. Additionally, this serialises
> > asynchronous stream calls. One possible work-around is:
> >
> > pool = pycuda.tools.DeviceMemoryPool()
> > tmp_alloc = a.allocator
> > a.allocator = pool.allocate
> > result = krnl(a, stream)
> > a.allocator = tmp_alloc
>
> I'd be happy to take a patch.
>
> Andreas
>
From 63d010fa96eeb5ae016c6034c1eb60c2e0318f7f Mon Sep 17 00:00:00 2001
From: Simon Perkins <[email protected]>
Date: Wed, 15 Oct 2014 16:57:20 +0200
Subject: [PATCH] Add the ability to specify allocators for reduction kernels.

---
 pycuda/gpuarray.py    | 14 +++++++-------
 pycuda/reduction.py   | 10 ++++++++--
 test/test_gpuarray.py | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 61 insertions(+), 9 deletions(-)

diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py
index 8cac81e..cbd65c0 100644
--- a/pycuda/gpuarray.py
+++ b/pycuda/gpuarray.py
@@ -1278,30 +1278,30 @@ maximum = _make_binary_minmax_func("max")
 
 # {{{ reductions
 
-def sum(a, dtype=None, stream=None):
+def sum(a, dtype=None, stream=None, allocator=None):
     from pycuda.reduction import get_sum_kernel
     krnl = get_sum_kernel(dtype, a.dtype)
-    return krnl(a, stream=stream)
+    return krnl(a, stream=stream, allocator=allocator)
 
 
-def subset_sum(subset, a, dtype=None, stream=None):
+def subset_sum(subset, a, dtype=None, stream=None, allocator=None):
     from pycuda.reduction import get_subset_sum_kernel
     krnl = get_subset_sum_kernel(dtype, subset.dtype, a.dtype)
-    return krnl(subset, a, stream=stream)
+    return krnl(subset, a, stream=stream, allocator=allocator)
 
 
-def dot(a, b, dtype=None, stream=None):
+def dot(a, b, dtype=None, stream=None, allocator=None):
     from pycuda.reduction import get_dot_kernel
     if dtype is None:
         dtype = _get_common_dtype(a, b)
     krnl = get_dot_kernel(dtype, a.dtype, b.dtype)
-    return krnl(a, b, stream=stream)
+    return krnl(a, b, stream=stream, allocator=allocator)
 
 
-def subset_dot(subset, a, b, dtype=None, stream=None):
+def subset_dot(subset, a, b, dtype=None, stream=None, allocator=None):
     from pycuda.reduction import get_subset_dot_kernel
     krnl = get_subset_dot_kernel(dtype, subset.dtype, a.dtype, b.dtype)
-    return krnl(subset, a, b, stream=stream)
+    return krnl(subset, a, b, stream=stream, allocator=allocator)
 
 
 def _make_minmax_kernel(what):
diff --git a/pycuda/reduction.py b/pycuda/reduction.py
index c958b26..163fa6e 100644
--- a/pycuda/reduction.py
+++ b/pycuda/reduction.py
@@ -232,6 +232,8 @@ class ReductionKernel:
 
         stream = kwargs.get("stream")
 
+        allocator = kwargs.get("allocator", None)
+
         from gpuarray import empty
 
         f = s1_func
@@ -267,9 +269,9 @@
                 seq_count = (sz + macroblock_size - 1) // macroblock_size
 
             if block_count == 1:
-                result = empty((), self.dtype_out, repr_vec.allocator)
+                result = empty((), self.dtype_out, allocator or repr_vec.allocator)
             else:
-                result = empty((block_count,), self.dtype_out, repr_vec.allocator)
+                result = empty((block_count,), self.dtype_out, allocator or repr_vec.allocator)
 
             kwargs = dict(shared_size=self.block_size*self.dtype_out.itemsize)
 
diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py
index 1c88a65..e0e249b 100644
--- a/test/test_gpuarray.py
+++ b/test/test_gpuarray.py
@@ -841,6 +841,52 @@ class TestGPUArray:
         assert minmax["cur_max"] == np.max(a)
 
     @mark_cuda_test
+    def test_sum_allocator(self):
+        import pycuda.tools
+        pool = pycuda.tools.DeviceMemoryPool()
+
+        rng = np.random.randint(low=512,high=1024)
+
+        a = gpuarray.arange(rng,dtype=np.int32)
+        b = gpuarray.sum(a)
+        c = gpuarray.sum(a, allocator=pool.allocate)
+
+        # Test that we get the correct results
+        assert b.get() == rng*(rng-1)//2
+        assert c.get() == rng*(rng-1)//2
+
+        # Test that result arrays were allocated with the appropriate allocator
+        assert b.allocator == drv.mem_alloc
+        assert c.allocator == pool.allocate
+
+    @mark_cuda_test
+    def test_dot_allocator(self):
+        import pycuda.tools
+        pool = pycuda.tools.DeviceMemoryPool()
+
+        a_cpu = np.random.randint(low=512,high=1024,size=1024)
+        b_cpu = np.random.randint(low=512,high=1024,size=1024)
+
+        # Compute the result on the CPU
+        dot_cpu_1 = np.dot(a_cpu, b_cpu)
+
+        a_gpu = gpuarray.to_gpu(a_cpu)
+        b_gpu = gpuarray.to_gpu(b_cpu)
+
+        # Compute the result on the GPU using different allocators
+        dot_gpu_1 = gpuarray.dot(a_gpu, b_gpu)
+        dot_gpu_2 = gpuarray.dot(a_gpu, b_gpu, allocator=pool.allocate)
+
+        # Test that we get the correct results
+        assert dot_cpu_1 == dot_gpu_1.get()
+        assert dot_cpu_1 == dot_gpu_2.get()
+
+        # Test that result arrays were allocated with the appropriate allocator
+        assert dot_gpu_1.allocator == drv.mem_alloc
+        assert dot_gpu_2.allocator == pool.allocate
+
+
+    @mark_cuda_test
     def test_view_and_strides(self):
         from pycuda.curandom import rand as curand
 
-- 
1.9.1

_______________________________________________
PyCUDA mailing list
[email protected]
http://lists.tiker.net/listinfo/pycuda

Reply via email to