This is an automated email from the ASF dual-hosted git repository.

niketanpansare pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
     new 70bf610  [SYSTEMML-540] Optimized sparse-to-dense conversion on GPU and added a flag to disable forced memset0
70bf610 is described below

commit 70bf61093dc3814ccbec867de4e4753cb9f3e086
Author: Niketan Pansare <npan...@us.ibm.com>
AuthorDate: Thu Mar 28 22:44:24 2019 -0700

    [SYSTEMML-540] Optimized sparse-to-dense conversion on GPU and added a flag to disable forced memset0
    
    - Improved the performance of sparse-to-dense conversion of empty matrices.
    - Added a flag sysml.gpu.force.memSetZero that allows the user to disable forced memset0.
    - For now, this flag is turned on by default; after exhaustive testing, the default will be switched to off (a condensed sketch of the resulting behavior follows).
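
A minimal, self-contained sketch of the gating logic this commit introduces (editor's illustration with stand-in Java types and names; the real SystemML classes, cudaMemset calls, and statistics timers are in the diff below): allocate() zeroes a fresh buffer only when sysml.gpu.force.memSetZero is true, while callers that genuinely need zeroed memory, such as the sparse-to-dense path for an empty matrix, call postAllocateMemset0(), which zeroes only when allocate() skipped it. Either way, memory that must be zero is set exactly once.

    public class MemsetGatingSketch {
        // Mirrors the DMLConfig default: sysml.gpu.force.memSetZero=true for now.
        static boolean gpuForceMemsetZero = true;

        // Stand-in for cudaMemset(A, 0, size). Java arrays are zero-initialized,
        // unlike raw GPU allocations, so this is purely illustrative.
        static void memset0(byte[] buffer) {
            java.util.Arrays.fill(buffer, (byte) 0);
        }

        static byte[] allocate(int size) {
            byte[] buffer = new byte[size];
            if (gpuForceMemsetZero)
                memset0(buffer); // forced memset0 on every allocation
            return buffer;
        }

        static void postAllocateMemset0(byte[] buffer) {
            if (!gpuForceMemsetZero)
                memset0(buffer); // only needed when allocate() skipped the memset
        }

        public static void main(String[] args) {
            // Sparse-to-dense of an empty matrix: just allocate a zeroed dense
            // buffer; no column-major conversion or transpose kernels are needed.
            int rows = 4, cols = 3, sizeOfDataType = 8;
            byte[] dense = allocate(rows * cols * sizeOfDataType);
            postAllocateMemset0(dense);
            System.out.println("dense buffer bytes: " + dense.length);
        }
    }
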
---
 conf/SystemML-config.xml.template                  |  3 +++
 src/main/java/org/apache/sysml/conf/DMLConfig.java |  4 +++-
 .../instructions/gpu/context/CSRPointer.java       |  3 +++
 .../instructions/gpu/context/GPUMemoryManager.java | 20 ++++++++++++++++----
 .../instructions/gpu/context/GPUObject.java        | 22 ++++++++++++++++++----
 5 files changed, 43 insertions(+), 9 deletions(-)

diff --git a/conf/SystemML-config.xml.template b/conf/SystemML-config.xml.template
index 17cc2cc..cd0d311 100644
--- a/conf/SystemML-config.xml.template
+++ b/conf/SystemML-config.xml.template
@@ -121,4 +121,7 @@
    
   <!-- Should SystemML runtime force the lstm builtin functions to use the CuDNN kernels (default: true) -->
    <sysml.gpu.lstm.force.cudnn>true</sysml.gpu.lstm.force.cudnn>
+   
+   <!-- Should SystemML GPU memory manager force memSet(0) for the allocated arrays (default: true) -->
+   <sysml.gpu.force.memSetZero>true</sysml.gpu.force.memSetZero>
 </root>
\ No newline at end of file
diff --git a/src/main/java/org/apache/sysml/conf/DMLConfig.java b/src/main/java/org/apache/sysml/conf/DMLConfig.java
index 0b5ed78..e435c77 100644
--- a/src/main/java/org/apache/sysml/conf/DMLConfig.java
+++ b/src/main/java/org/apache/sysml/conf/DMLConfig.java
@@ -96,6 +96,7 @@ public class DMLConfig
        public static final String GPU_MEMORY_ALLOCATOR = "sysml.gpu.memory.allocator"; // String to specify the memory allocator to use. Supported values are: cuda, unified_memory
        public static final String FLOATING_POINT_PRECISION = "sysml.floating.point.precision"; // String to specify the datatype to use internally: supported values are double, single
        public static final String PRINT_GPU_MEMORY_INFO = "sysml.gpu.print.memoryInfo";
+       public static final String GPU_FORCE_MEMSET_ZERO = "sysml.gpu.force.memSetZero";
        public static final String EVICTION_SHADOW_BUFFERSIZE = "sysml.gpu.eviction.shadow.bufferSize";
        public static final String GPU_RECOMPUTE_ACTIVATIONS = "sysml.gpu.recompute.activations";
 
@@ -140,6 +141,7 @@ public class DMLConfig
                _defaultVals.put(NATIVE_BLAS_DIR,        "none" );
                _defaultVals.put(EXTRA_FINEGRAINED_STATS,"false" );
                _defaultVals.put(PRINT_GPU_MEMORY_INFO,  "false" );
+               _defaultVals.put(GPU_FORCE_MEMSET_ZERO,  "true" );
                _defaultVals.put(EVICTION_SHADOW_BUFFERSIZE,  "0.5" );
                _defaultVals.put(STATS_MAX_WRAP_LEN,     "30" );
                _defaultVals.put(GPU_MEMORY_UTILIZATION_FACTOR,      "0.9" );
@@ -431,7 +433,7 @@ public class DMLConfig
                                YARN_APPMASTER, YARN_APPMASTERMEM, YARN_MAPREDUCEMEM, 
                                CP_PARALLEL_OPS, CP_PARALLEL_IO, NATIVE_BLAS, NATIVE_BLAS_DIR,
                                COMPRESSED_LINALG, 
-                               CODEGEN, CODEGEN_COMPILER, CODEGEN_OPTIMIZER, CODEGEN_PLANCACHE, CODEGEN_LITERALS,
+                               CODEGEN, CODEGEN_COMPILER, CODEGEN_OPTIMIZER, CODEGEN_PLANCACHE, CODEGEN_LITERALS, GPU_FORCE_MEMSET_ZERO,
                                EXTRA_FINEGRAINED_STATS, STATS_MAX_WRAP_LEN, PRINT_GPU_MEMORY_INFO, CACHING_BUFFER_SIZE,
                                AVAILABLE_GPUS, SYNCHRONIZE_GPU, EAGER_CUDA_FREE, FLOATING_POINT_PRECISION, GPU_EVICTION_POLICY, EVICTION_SHADOW_BUFFERSIZE,
                                GPU_MEMORY_ALLOCATOR, GPU_MEMORY_UTILIZATION_FACTOR, GPU_RECOMPUTE_ACTIVATIONS, FORCE_LSTM_CUDNN
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
index b3ec497..d7bd295 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
@@ -303,6 +303,9 @@ public class CSRPointer {
                r.val = gCtx.allocate(null, getDataTypeSizeOf(nnz2));
                r.rowPtr = gCtx.allocate(null, getIntSizeOf(rows + 1));
                r.colInd = gCtx.allocate(null, getIntSizeOf(nnz2));
+               GPUMemoryManager.postAllocateMemset0(r.val, getDataTypeSizeOf(nnz2), null);
+               GPUMemoryManager.postAllocateMemset0(r.rowPtr, getIntSizeOf(rows + 1), null);
+               GPUMemoryManager.postAllocateMemset0(r.colInd, getIntSizeOf(nnz2), null);
                return r;
        }
 
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
index cf579ec..d15b953 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
@@ -57,6 +57,7 @@ public class GPUMemoryManager {
        private static final int [] DEBUG_MEMORY_LEAK_STACKTRACE_DEPTH = {5, 6, 7, 8, 9, 10, 11}; // Avoids printing too much text while debugging
        
        private final boolean PRINT_GPU_MEMORY_INFO = ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.PRINT_GPU_MEMORY_INFO);
+       public static boolean GPU_FORCE_MEMSET_ZERO = ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.GPU_FORCE_MEMSET_ZERO);
        
        protected final GPUMemoryAllocator allocator;
        /*****************************************************************************************/
@@ -141,6 +142,7 @@ public class GPUMemoryManager {
        private static final double WARN_UTILIZATION_FACTOR = 0.7;
        
        public GPUMemoryManager(GPUContext gpuCtx) {
+               GPU_FORCE_MEMSET_ZERO = ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.GPU_FORCE_MEMSET_ZERO);
                matrixMemoryManager = new GPUMatrixMemoryManager(this);
                lazyCudaFreeMemoryManager = new GPULazyCudaFreeMemoryManager(this);
                String allocatorType = ConfigurationManager.getDMLConfig().getTextValue(DMLConfig.GPU_MEMORY_ALLOCATOR);
@@ -361,12 +363,22 @@ public class GPUMemoryManager {
                                        + toString());
                }
                
-               long t0 = ConfigurationManager.isStatistics() ? System.nanoTime() : 0;
-               cudaMemset(A, 0, size);
-               addMiscTime(opcode, GPUStatistics.cudaMemSet0Time, GPUStatistics.cudaMemSet0Count, GPUInstruction.MISC_TIMER_SET_ZERO, t0);
+               if(GPU_FORCE_MEMSET_ZERO) {
+                       long t0 = ConfigurationManager.isStatistics() ? System.nanoTime() : 0;
+                       cudaMemset(A, 0, size);
+                       addMiscTime(opcode, GPUStatistics.cudaMemSet0Time, GPUStatistics.cudaMemSet0Count, GPUInstruction.MISC_TIMER_SET_ZERO, t0);
+               }
                return A;
        }
        
+       public static void postAllocateMemset0(Pointer A, long size, String opcode) {
+               if(!GPU_FORCE_MEMSET_ZERO) {
+                       long t0 = ConfigurationManager.isStatistics() ? System.nanoTime() : 0;
+                       cudaMemset(A, 0, size);
+                       addMiscTime(opcode, GPUStatistics.cudaMemSet0Time, GPUStatistics.cudaMemSet0Count, GPUInstruction.MISC_TIMER_SET_ZERO, t0);
+               }
+       }
+       
        private int worstCaseContiguousMemorySizeCompare(GPUObject o1, GPUObject o2) {
                long ret = o1.getWorstCaseContiguousMemorySize() - o2.getWorstCaseContiguousMemorySize();
                return ret < 0 ? -1 : (ret == 0 ? 0 : 1);
@@ -553,7 +565,7 @@ public class GPUMemoryManager {
         * @param instructionLevelTimer member of GPUInstruction
         * @param startTime start time
         */
-       private void addMiscTime(String opcode, LongAdder globalGPUTimer, LongAdder globalGPUCounter, String instructionLevelTimer, long startTime) {
+       private static void addMiscTime(String opcode, LongAdder globalGPUTimer, LongAdder globalGPUCounter, String instructionLevelTimer, long startTime) {
                if(ConfigurationManager.isStatistics()) {
                        long totalTime = System.nanoTime() - startTime;
                        globalGPUTimer.add(totalTime);
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
index 9d263aa..254c9d7 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
@@ -434,9 +434,15 @@ public class GPUObject {
                        start = System.nanoTime();
                if (getJcudaSparseMatrixPtr() == null || !isAllocated())
                        throw new DMLRuntimeException("Expected allocated sparse matrix before sparseToDense() call");
-
-               sparseToColumnMajorDense();
-               denseColumnMajorToRowMajor();
+               if(getJcudaSparseMatrixPtr().nnz == 0) {
+                       long size = ((long) mat.getNumRows()) * getDataTypeSizeOf(mat.getNumColumns());
+                       setDensePointer(allocate(size));
+                       GPUMemoryManager.postAllocateMemset0(getDensePointer(), size, instructionName);
+               }
+               else {
+                       sparseToColumnMajorDense();
+                       denseColumnMajorToRowMajor();
+               }
                if (ConfigurationManager.isStatistics())
                        end = System.nanoTime();
                if (instructionName != null && ConfigurationManager.isFinegrainedStatistics())
@@ -446,6 +452,10 @@ public class GPUObject {
                if (ConfigurationManager.isStatistics())
                        GPUStatistics.cudaSparseToDenseCount.add(1);
        }
+       
+       private static long getDataTypeSizeOf(long numElems) {
+               return numElems * ((long) LibMatrixCUDA.sizeOfDataType);
+       }
 
        /**
         * More efficient method to convert sparse to dense but returns dense in column major format
@@ -521,10 +531,14 @@ public class GPUObject {
                setDensePointer(allocate(size));
                // The "fill" kernel is called which treats the matrix "jcudaDensePtr" like a vector and fills it with value "v"
                // If the fill value is 0, no need to call the special kernel, the allocate memsets the allocated region to 0
-               if (v != 0)
+               if (v != 0) {
                        getGPUContext().getKernels()
                        .launchKernel("fill", ExecutionConfig.getConfigForSimpleVectorOperations(numElems),
                                        getDensePointer(), v, numElems);
+               }
+               else {
+                       GPUMemoryManager.postAllocateMemset0(getDensePointer(), size, null);
+               }
        }
 
        /**
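
A usage note (editor's addition, hedged): once the default changes, a user who wants to keep forced memset0, or who wants to opt out of it sooner, can override the new property in a copy of conf/SystemML-config.xml.template and pass that file to SystemML via its -config option; the property name below is taken from the diff above, and its shipped default remains true for now.

   <sysml.gpu.force.memSetZero>false</sysml.gpu.force.memSetZero>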
