This is an automated email from the ASF dual-hosted git repository.

markd pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git
commit f10eb03821dbc30bcec731b2cc93125ec39ea3ff
Author: Mark Dokter <[email protected]>
AuthorDate: Wed Apr 14 23:59:51 2021 +0200

    [MINOR] Reduce memory footprint of reduceAll GPU operation
    
    The temporary buffer needs to hold at most num_blocks (the block count
    of the first reduction wave) items of size <data-type>, not N (the size
    of the input).
---
 src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCUDA.java | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCUDA.java
index dc5a1f0..f6e950a 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCUDA.java
@@ -933,10 +933,9 @@ public class LibMatrixCUDA {
 		int[] tmp = getKernelParamsForReduceAll(gCtx, n);
 		int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2];
 
-		Pointer tempOut = gCtx.allocate(instName, n*sizeOfDataType);
+		Pointer tempOut = gCtx.allocate(instName, (long) blocks * sizeOfDataType);
 
 		getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem),
 			in, tempOut, n);
-		//cudaDeviceSynchronize;
 		int s = blocks;
 		while (s > 1) {
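For context on why blocks (not n) bounds the allocation, below is a minimal,
self-contained CPU sketch of the multi-wave tree-reduction pattern that
reduceAll follows. It is a hypothetical illustration, not SystemDS code:
ReduceAllSketch, reduceWave, and THREADS are made-up stand-ins for the
launched CUDA kernel and its block size. Each wave emits one partial result
per block, so the first wave's block count is the largest the temporary
buffer ever needs to be.

// Hypothetical CPU illustration of the GPU tree-reduction pattern:
// each wave collapses THREADS-sized chunks into one partial sum per
// "block", so wave 1 writes `blocks` values and every later wave writes
// fewer. The temp buffer therefore needs `blocks` slots, not n.
public class ReduceAllSketch {
	static final int THREADS = 4; // stand-in for the CUDA block size

	public static double reduceAll(double[] in) {
		int n = in.length;
		int blocks = (n + THREADS - 1) / THREADS; // first-wave block count
		double[] tempOut = new double[blocks];    // `blocks` slots suffice, not n

		reduceWave(in, tempOut, n);               // wave 1: n -> blocks partials
		int s = blocks;
		while (s > 1) {                           // later waves reduce in place
			int next = (s + THREADS - 1) / THREADS;
			reduceWave(tempOut, tempOut, s);      // s -> next partials
			s = next;
		}
		return tempOut[0];
	}

	// Stand-in for the launched CUDA kernel: one partial sum per "block".
	// In-place use is safe here because block b reads from index b*THREADS
	// onward and writes to index b, which has already been read.
	static void reduceWave(double[] in, double[] out, int len) {
		int outLen = (len + THREADS - 1) / THREADS;
		for (int b = 0; b < outLen; b++) {
			double acc = 0;
			for (int i = b * THREADS; i < Math.min((b + 1) * THREADS, len); i++)
				acc += in[i];
			out[b] = acc;
		}
	}

	public static void main(String[] args) {
		double[] data = new double[1000];
		java.util.Arrays.fill(data, 1.0);
		System.out.println(reduceAll(data)); // prints 1000.0
	}
}

With THREADS = 4 and n = 1000, the waves write 250, 63, 16, 4, and finally
1 partial result, so a tempOut of length blocks = 250 covers every wave.
This mirrors the (long) blocks * sizeOfDataType allocation in the patch.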
