Hi Andreas

I've modified the patch to take the existing behaviour into account: when no
allocator kwarg is given, the reduction now falls back to the allocator of the
array passed in, so the default behaviour is unchanged.
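
For reference, here is a minimal usage sketch of the new keyword (the array
sizes and dtype are just illustrative; the pool usage mirrors the new tests):

    import numpy as np
    import pycuda.autoinit  # noqa -- creates a CUDA context
    import pycuda.gpuarray as gpuarray
    import pycuda.tools

    pool = pycuda.tools.DeviceMemoryPool()

    a = gpuarray.arange(1024, dtype=np.float32)
    b = gpuarray.arange(1024, dtype=np.float32)

    # No allocator kwarg: the result uses a's allocator, as before.
    s = gpuarray.sum(a)

    # Explicit allocator kwarg: the result is allocated from the pool.
    d = gpuarray.dot(a, b, allocator=pool.allocate)
    assert d.allocator == pool.allocate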

best
  Simon

On Thu, Oct 16, 2014 at 7:15 AM, Andreas Kloeckner <[email protected]>
wrote:

> Hi Simon,
>
> Simon Perkins <[email protected]> writes:
> > Here's the patch!
>
> The patch looks good. One minor complaint is that in the absence of an
> allocator kwarg, your patch changes existing behavior. Specifically, the
> allocator that was previously used was the one of the array passed to
> the reduction, and now it's pycuda.driver.mem_alloc.
>
> Andreas
>
From a8dde56b9ec89f859ed164baf6f5cefff3eac56d Mon Sep 17 00:00:00 2001
From: Simon Perkins <[email protected]>
Date: Wed, 15 Oct 2014 16:57:20 +0200
Subject: [PATCH] Add the ability to specify allocators for reduction kernels.

---
 pycuda/gpuarray.py    | 16 ++++++++--------
 pycuda/reduction.py   |  8 ++++++--
 test/test_gpuarray.py | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 60 insertions(+), 10 deletions(-)

diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py
index 8cac81e..cbd65c0 100644
--- a/pycuda/gpuarray.py
+++ b/pycuda/gpuarray.py
@@ -1278,30 +1278,30 @@ maximum = _make_binary_minmax_func("max")
 
 # {{{ reductions
 
-def sum(a, dtype=None, stream=None):
+def sum(a, dtype=None, stream=None, allocator=None):
     from pycuda.reduction import get_sum_kernel
     krnl = get_sum_kernel(dtype, a.dtype)
-    return krnl(a, stream=stream)
+    return krnl(a, stream=stream, allocator=allocator)
 
 
-def subset_sum(subset, a, dtype=None, stream=None):
+def subset_sum(subset, a, dtype=None, stream=None, allocator=None):
     from pycuda.reduction import get_subset_sum_kernel
     krnl = get_subset_sum_kernel(dtype, subset.dtype, a.dtype)
-    return krnl(subset, a, stream=stream)
+    return krnl(subset, a, stream=stream, allocator=allocator)
 
 
-def dot(a, b, dtype=None, stream=None):
+def dot(a, b, dtype=None, stream=None, allocator=None):
     from pycuda.reduction import get_dot_kernel
     if dtype is None:
         dtype = _get_common_dtype(a, b)
     krnl = get_dot_kernel(dtype, a.dtype, b.dtype)
-    return krnl(a, b, stream=stream)
+    return krnl(a, b, stream=stream, allocator=allocator)
 
 
-def subset_dot(subset, a, b, dtype=None, stream=None):
+def subset_dot(subset, a, b, dtype=None, stream=None, allocator=None):
     from pycuda.reduction import get_subset_dot_kernel
     krnl = get_subset_dot_kernel(dtype, subset.dtype, a.dtype, b.dtype)
-    return krnl(subset, a, b, stream=stream)
+    return krnl(subset, a, b, stream=stream, allocator=allocator)
 
 
 def _make_minmax_kernel(what):
diff --git a/pycuda/reduction.py b/pycuda/reduction.py
index c958b26..8a48e34 100644
--- a/pycuda/reduction.py
+++ b/pycuda/reduction.py
@@ -256,6 +256,10 @@ class ReductionKernel:
 
             repr_vec = vectors[0]
             sz = repr_vec.size
+
+            allocator = kwargs.get("allocator", None)
+            if allocator is None:
+                allocator = repr_vec.allocator
 
             if sz <= self.block_size*SMALL_SEQ_COUNT*MAX_BLOCK_COUNT:
                 total_block_size = SMALL_SEQ_COUNT*self.block_size
@@ -267,9 +271,9 @@ class ReductionKernel:
                 seq_count = (sz + macroblock_size - 1) // macroblock_size
 
             if block_count == 1:
-                result = empty((), self.dtype_out, repr_vec.allocator)
+                result = empty((), self.dtype_out, allocator)
             else:
-                result = empty((block_count,), self.dtype_out, repr_vec.allocator)
+                result = empty((block_count,), self.dtype_out, allocator)
 
             kwargs = dict(shared_size=self.block_size*self.dtype_out.itemsize)
 
diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py
index 1c88a65..928badb 100644
--- a/test/test_gpuarray.py
+++ b/test/test_gpuarray.py
@@ -841,6 +841,52 @@ class TestGPUArray:
         assert minmax["cur_max"] == np.max(a)
 
     @mark_cuda_test
+    def test_sum_allocator(self):
+        import pycuda.tools
+        pool = pycuda.tools.DeviceMemoryPool()
+
+        sz = np.random.randint(low=512, high=1024)
+
+        a = gpuarray.arange(sz, dtype=np.int32)
+        b = gpuarray.sum(a)
+        c = gpuarray.sum(a, allocator=pool.allocate)
+
+        # Test that we get the correct results
+        assert b.get() == sz*(sz-1)//2
+        assert c.get() == sz*(sz-1)//2
+
+        # Test that result arrays were allocated with the appropriate allocator
+        assert b.allocator == a.allocator
+        assert c.allocator == pool.allocate
+
+    @mark_cuda_test
+    def test_dot_allocator(self):
+        import pycuda.tools
+        pool = pycuda.tools.DeviceMemoryPool()
+
+        a_cpu = np.random.randint(low=512, high=1024, size=1024)
+        b_cpu = np.random.randint(low=512, high=1024, size=1024)
+
+        # Compute the result on the CPU
+        dot_cpu_1 = np.dot(a_cpu, b_cpu)
+
+        a_gpu = gpuarray.to_gpu(a_cpu)
+        b_gpu = gpuarray.to_gpu(b_cpu)
+
+        # Compute the result on the GPU using different allocators
+        dot_gpu_1 = gpuarray.dot(a_gpu, b_gpu)
+        dot_gpu_2 = gpuarray.dot(a_gpu, b_gpu, allocator=pool.allocate)
+
+        # Test that we get the correct results
+        assert dot_cpu_1 == dot_gpu_1.get()
+        assert dot_cpu_1 == dot_gpu_2.get()
+
+        # Test that result arrays were allocated with the appropriate allocator
+        assert dot_gpu_1.allocator == a_gpu.allocator
+        assert dot_gpu_2.allocator == pool.allocate
+
+
+    @mark_cuda_test
     def test_view_and_strides(self):
         from pycuda.curandom import rand as curand
 
-- 
1.9.1

_______________________________________________
PyCUDA mailing list
[email protected]
http://lists.tiker.net/listinfo/pycuda
