Repository: incubator-systemml
Updated Branches:
  refs/heads/master ee33ec62d -> 4f9dcf9ad


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
index 0c0410c..b46985f 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
@@ -50,10 +50,10 @@ public class LibMatrixDNN {
        
        //library configurations and external contracts
        public static final boolean SUPPORTS_SPARSE_OUTPUTS = false; 
//operations able to handle sparse outputs 
-       private static final boolean DISPLAY_STATISTICS = false; //conv2d 
summaries in stats output
        private static final boolean ALLOW_MULTI_THREADED_OPS = true; //enable 
multi-threading in cp
        private static final int NUM_TASK_FACTOR = 2; //number of tasks is 
vcores scaled by this factor
-       
+       public static boolean DISPLAY_STATISTICS = false; //conv2d summaries in 
stats output
+
        private enum TaskType {
                MaxPooling_Forward, MaxPooling_Backward, 
                // Alternate approaches that we tried but the performance was 
unsatisfactory be included: direct, non-looped im2col

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/src/main/java/org/apache/sysml/utils/GPUStatistics.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/utils/GPUStatistics.java 
b/src/main/java/org/apache/sysml/utils/GPUStatistics.java
new file mode 100644
index 0000000..044901b
--- /dev/null
+++ b/src/main/java/org/apache/sysml/utils/GPUStatistics.java
@@ -0,0 +1,209 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.utils;
+
+import org.apache.sysml.api.DMLScript;
+
+import java.util.*;
+import java.util.concurrent.atomic.AtomicLong;
+
+/**
+ * Measures performance numbers when GPU mode is enabled
+ * Printed as part of {@link Statistics}.
+ */
+public class GPUStatistics {
+
+  // Whether or not extra per-instruction statistics will be recorded and 
shown for the GPU
+  public static boolean DISPLAY_STATISTICS = false;
+
+  private static int iNoOfExecutedGPUInst = 0;
+
+  public static long cudaInitTime = 0;
+  public static long cudaLibrariesInitTime = 0;
+  public static AtomicLong cudaSparseToDenseTime = new AtomicLong(0);          
// time spent in converting sparse matrix block to dense
+  public static AtomicLong cudaDenseToSparseTime = new AtomicLong(0);          
// time spent in converting dense matrix block to sparse
+  public static AtomicLong cudaSparseConversionTime = new AtomicLong(0);       
// time spent in converting between sparse block types
+  public static AtomicLong cudaSparseToDenseCount = new AtomicLong(0);
+  public static AtomicLong cudaDenseToSparseCount = new AtomicLong(0);
+  public static AtomicLong cudaSparseConversionCount = new AtomicLong(0);
+
+  public static AtomicLong cudaAllocTime = new AtomicLong(0);             // 
time spent in allocating memory on the GPU
+  public static AtomicLong cudaDeAllocTime = new AtomicLong(0);           // 
time spent in deallocating memory on the GPU
+  public static AtomicLong cudaToDevTime = new AtomicLong(0);             // 
time spent in copying data from host (CPU) to device (GPU) memory
+  public static AtomicLong cudaFromDevTime = new AtomicLong(0);           // 
time spent in copying data from device to host
+  public static AtomicLong cudaAllocCount = new AtomicLong(0);
+  public static AtomicLong cudaDeAllocCount = new AtomicLong(0);
+  public static AtomicLong cudaToDevCount = new AtomicLong(0);
+  public static AtomicLong cudaFromDevCount = new AtomicLong(0);
+  public static AtomicLong cudaEvictionCount = new AtomicLong(0);
+
+  // Per instruction miscellaneous timers.
+  // Used to record events in a CP Heavy Hitter instruction and
+  // provide a breakdown of how time was spent in that instruction
+  private static HashMap<String, HashMap<String, Long>> _cpInstMiscTime = new 
HashMap<String, HashMap<String, Long>> ();
+  private static HashMap<String, HashMap<String, Long>> _cpInstMiscCount = new 
HashMap<String, HashMap<String, Long>> ();
+
+  /**
+   * Resets the miscellaneous timers & counters
+   */
+  public static void resetMiscTimers(){
+    _cpInstMiscTime.clear();
+    _cpInstMiscCount.clear();
+  }
+
+  /**
+   * Resets all the cuda counters and timers, including the misc timers & 
counters
+   */
+  public static void reset(){
+    cudaInitTime = 0;
+    cudaLibrariesInitTime = 0;
+    cudaAllocTime.set(0);
+    cudaDeAllocTime.set(0);
+    cudaToDevTime.set(0);
+    cudaFromDevTime.set(0);
+    cudaAllocCount.set(0);
+    cudaDeAllocCount.set(0);
+    cudaToDevCount.set(0);
+    cudaFromDevCount.set(0);
+    cudaEvictionCount.set(0);
+    resetMiscTimers();
+  }
+
+
+  public static synchronized void setNoOfExecutedGPUInst(int numJobs) {
+    iNoOfExecutedGPUInst = numJobs;
+  }
+
+  public static synchronized void incrementNoOfExecutedGPUInst() {
+    iNoOfExecutedGPUInst ++;
+  }
+
+  public static synchronized int getNoOfExecutedGPUInst() {
+    return iNoOfExecutedGPUInst;
+  }
+
+  /**
+   * "Maintains" or adds time to miscellaneous timers per instruction/op, also 
increments associated count
+   * @param instructionName    name of the instruction/op
+   * @param miscTimer                          name of the miscellaneous timer
+   * @param timeNanos                          time in nano seconds
+   * @param incrementCount     how much to increment the count of the 
miscTimer by
+   */
+  public synchronized static void maintainCPMiscTimes( String instructionName, 
String miscTimer, long timeNanos, long incrementCount)
+  {
+    if (!DISPLAY_STATISTICS)
+      return;
+
+    HashMap<String, Long> miscTimesMap = _cpInstMiscTime.get(instructionName);
+    if (miscTimesMap == null) {
+      miscTimesMap = new HashMap<String, Long>();
+      _cpInstMiscTime.put(instructionName, miscTimesMap);
+    }
+    Long oldVal = miscTimesMap.get(miscTimer);
+    Long newVal = timeNanos + ((oldVal!=null) ? oldVal : 0);
+    miscTimesMap.put(miscTimer, newVal);
+
+    HashMap<String, Long> miscCountMap = _cpInstMiscCount.get(instructionName);
+    if (miscCountMap == null){
+      miscCountMap = new HashMap<String, Long>();
+      _cpInstMiscCount.put(instructionName, miscCountMap);
+    }
+    Long oldCnt = miscCountMap.get(miscTimer);
+    Long newCnt = incrementCount + ((oldCnt!=null) ? oldCnt : 0);
+    miscCountMap.put(miscTimer, newCnt);
+  }
+
+  /**
+   * "Maintains" or adds time to miscellaneous timers per instruction/op, also 
increments associated count by 1
+   * @param instructionName    name of the instruction/op
+   * @param miscTimer                          name of the miscellaneous timer
+   * @param timeNanos                          time in nano seconds
+   */
+  public synchronized static void maintainCPMiscTimes( String instructionName, 
String miscTimer, long timeNanos){
+    maintainCPMiscTimes(instructionName, miscTimer, timeNanos, 1);
+  }
+
+  /**
+   * Used to print misc timers (and their counts) for a given instruction/op
+   * @param instructionName name of the instruction/op
+   * @return  a formatted string of misc timers for a given instruction/op
+   */
+  public static String getStringForCPMiscTimesPerInstruction(String 
instructionName) {
+    StringBuffer sb = new StringBuffer();
+    HashMap<String, Long> miscTimerMap = _cpInstMiscTime.get(instructionName);
+    if (miscTimerMap != null) {
+      List<Map.Entry<String, Long>> sortedList = new 
ArrayList<Map.Entry<String, Long>>(miscTimerMap.entrySet());
+      // Sort the times to display by the most expensive first
+      Collections.sort(sortedList, new Comparator<Map.Entry<String, Long>>() {
+        @Override
+        public int compare(Map.Entry<String, Long> o1, Map.Entry<String, Long> 
o2) {
+          return (int) (o1.getValue() - o2.getValue());
+        }
+      });
+      Iterator<Map.Entry<String, Long>> miscTimeIter = sortedList.iterator();
+      HashMap<String, Long> miscCountMap = 
_cpInstMiscCount.get(instructionName);
+      while (miscTimeIter.hasNext()) {
+        Map.Entry<String, Long> e = miscTimeIter.next();
+        String miscTimerName = e.getKey();
+        Long miscTimerTime = e.getValue();
+        Long miscCount = miscCountMap.get(miscTimerName);
+        sb.append(miscTimerName + "[" + String.format("%.3f", (double) 
miscTimerTime / 1000000000.0) + "s," + miscCount + "]");
+        if (miscTimeIter.hasNext())
+          sb.append(", ");
+      }
+    }
+    return sb.toString();
+  }
+
+  /**
+   * Used to print out cuda timers & counters
+   * @return a formatted string of cuda timers & counters
+   */
+  public static String getStringForCudaTimers() {
+    StringBuffer sb = new StringBuffer();
+    sb.append("CUDA/CuLibraries init time:\t" + String.format("%.3f", 
cudaInitTime*1e-9) + "/"
+            + String.format("%.3f", cudaLibrariesInitTime*1e-9) + " sec.\n");
+    sb.append("Number of executed GPU inst:\t" + getNoOfExecutedGPUInst() + 
".\n");
+    sb.append("GPU mem tx time  (alloc/dealloc/toDev/fromDev):\t"
+            + String.format("%.3f", cudaAllocTime.get()*1e-9) + "/"
+            + String.format("%.3f", cudaDeAllocTime.get()*1e-9) + "/"
+            + String.format("%.3f", cudaToDevTime.get()*1e-9) + "/"
+            + String.format("%.3f", cudaFromDevTime.get()*1e-9)  + " sec.\n");
+    sb.append("GPU mem tx count (alloc/dealloc/toDev/fromDev/evict):\t"
+            + cudaAllocCount.get() + "/"
+            + cudaDeAllocCount.get() + "/"
+            + cudaSparseConversionCount.get() + "/"
+            + cudaToDevCount.get() + "/"
+            + cudaFromDevCount.get() + "/"
+            + cudaEvictionCount.get() + ".\n");
+    sb.append("GPU conversion time  (sparseConv/sp2dense/dense2sp):\t"
+            + String.format("%.3f", cudaSparseConversionTime.get()*1e-9) + "/"
+            + String.format("%.3f", cudaSparseToDenseTime.get()*1e-9) + "/"
+            + String.format("%.3f", cudaDenseToSparseTime.get()*1e-9) + " 
sec.\n");
+    sb.append("GPU conversion count (sparseConv/sp2dense/dense2sp):\t"
+            + cudaSparseConversionCount.get() + "/"
+            + cudaSparseToDenseCount.get() + "/"
+            + cudaDenseToSparseCount.get() + ".\n");
+
+    return sb.toString();
+  }
+
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/src/main/java/org/apache/sysml/utils/Statistics.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/utils/Statistics.java 
b/src/main/java/org/apache/sysml/utils/Statistics.java
index 08f0452..262950f 100644
--- a/src/main/java/org/apache/sysml/utils/Statistics.java
+++ b/src/main/java/org/apache/sysml/utils/Statistics.java
@@ -22,12 +22,8 @@ package org.apache.sysml.utils;
 import java.lang.management.CompilationMXBean;
 import java.lang.management.GarbageCollectorMXBean;
 import java.lang.management.ManagementFactory;
-import java.util.Arrays;
-import java.util.Comparator;
-import java.util.HashMap;
-import java.util.List;
+import java.util.*;
 import java.util.Map.Entry;
-import java.util.Set;
 import java.util.concurrent.atomic.AtomicLong;
 
 import org.apache.sysml.api.DMLScript;
@@ -47,7 +43,7 @@ import org.apache.sysml.runtime.matrix.data.LibMatrixDNN;
  * This class captures all statistics.
  */
 public class Statistics 
-{      
+{
        private static long compileStartTime = 0;
        private static long compileEndTime = 0;
        
@@ -62,7 +58,6 @@ public class Statistics
        private static int iNoOfExecutedSPInst = 0;
        private static int iNoOfCompiledSPInst = 0;
        
-       private static int iNoOfExecutedGPUInst = 0;
 
        //JVM stats
        private static long jitCompileTime = 0; //in milli sec
@@ -105,29 +100,11 @@ public class Statistics
        //heavy hitter counts and times 
        private static HashMap<String,Long> _cpInstTime   =  new 
HashMap<String, Long>();
        private static HashMap<String,Long> _cpInstCounts =  new 
HashMap<String, Long>();
-       
+
        private static AtomicLong lTotalUIPVar = new AtomicLong(0);
        private static AtomicLong lTotalLix = new AtomicLong(0);
        private static AtomicLong lTotalLixUIP = new AtomicLong(0);
-       
-       public static long cudaInitTime = 0;
-       public static long cudaLibrariesInitTime = 0;
-       public static AtomicLong cudaSparseToDenseTime = new AtomicLong(0);     
        // Measures time spent in converting sparse matrix block to dense
-       public static AtomicLong cudaSparseToDenseCount = new AtomicLong(0);
-       public static AtomicLong cudaDenseToSparseTime = new AtomicLong(0);     
        // Measures time spent in converting dense matrix block to sparse
-       public static AtomicLong cudaDenseToSparseCount = new AtomicLong(0);
-       public static AtomicLong cudaSparseConversionTime = new AtomicLong(0);  
// Measures time spent in converting between sparse block types
-       public static AtomicLong cudaSparseConversionCount = new AtomicLong(0);
-       public static AtomicLong cudaAllocTime = new AtomicLong(0);
-       public static AtomicLong cudaDeAllocTime = new AtomicLong(0);
-       public static AtomicLong cudaToDevTime = new AtomicLong(0);
-       public static AtomicLong cudaFromDevTime = new AtomicLong(0);
-       public static AtomicLong cudaAllocCount = new AtomicLong(0);
-       public static AtomicLong cudaDeAllocCount = new AtomicLong(0);
-       public static AtomicLong cudaToDevCount = new AtomicLong(0);
-       public static AtomicLong cudaFromDevCount = new AtomicLong(0);
-       public static AtomicLong cudaEvictionCount = new AtomicLong(0);
-       
+
        public static synchronized void setNoOfExecutedMRJobs(int 
iNoOfExecutedMRJobs) {
                Statistics.iNoOfExecutedMRJobs = iNoOfExecutedMRJobs;
        }
@@ -155,19 +132,7 @@ public class Statistics
        public static synchronized void incrementNoOfCompiledMRJobs() {
                iNoOfCompiledMRJobs ++;
        }
-       
-       
-       public static synchronized void setNoOfExecutedGPUInst(int numJobs) {
-               iNoOfExecutedGPUInst = numJobs;
-       }
-       
-       public static synchronized void incrementNoOfExecutedGPUInst() {
-               iNoOfExecutedGPUInst ++;
-       }
-       
-       public static synchronized int getNoOfExecutedGPUInst() {
-               return iNoOfExecutedGPUInst;
-       }
+
 
        public static synchronized void setNoOfExecutedSPInst(int numJobs) {
                iNoOfExecutedSPInst = numJobs;
@@ -249,7 +214,7 @@ public class Statistics
                }
                
                if( DMLScript.USE_ACCELERATOR )
-                       setNoOfExecutedGPUInst(0);
+                       GPUStatistics.setNoOfExecutedGPUInst(0);
        }
        
        public static synchronized void incrementJITCompileTime( long time ) {
@@ -432,18 +397,8 @@ public class Statistics
                resetJVMgcTime();
                resetJVMgcCount();
                resetCPHeavyHitters();
-               
-               cudaInitTime = 0;
-               cudaLibrariesInitTime = 0;
-               cudaAllocTime.set(0);
-               cudaDeAllocTime.set(0);
-               cudaToDevTime.set(0);
-               cudaFromDevTime.set(0);
-               cudaAllocCount.set(0);
-               cudaDeAllocCount.set(0);
-               cudaToDevCount.set(0);
-               cudaFromDevCount.set(0);
-               cudaEvictionCount.set(0);
+
+               GPUStatistics.reset();
                LibMatrixDNN.resetStatistics();
        }
 
@@ -523,18 +478,24 @@ public class Statistics
                
                return opcode;
        }
-       
-       public synchronized static void maintainCPHeavyHitters( String key, 
long timeNanos )
+
+       /**
+        * "Maintains" or adds time to per instruction/op timers, also 
increments associated count
+        * @param instructionName       name of the instruction/op
+        * @param timeNanos                             time in nano seconds
+        */
+       public synchronized static void maintainCPHeavyHitters( String 
instructionName, long timeNanos )
        {
-               Long oldVal = _cpInstTime.get(key);
+               Long oldVal = _cpInstTime.get(instructionName);
                Long newVal = timeNanos + ((oldVal!=null) ? oldVal : 0);
-               _cpInstTime.put(key, newVal);
+               _cpInstTime.put(instructionName, newVal);
 
-               Long oldCnt = _cpInstCounts.get(key);
+               Long oldCnt = _cpInstCounts.get(instructionName);
                Long newCnt = 1 + ((oldCnt!=null) ? oldCnt : 0);
-               _cpInstCounts.put(key, newCnt);
+               _cpInstCounts.put(instructionName, newCnt);
        }
-       
+
+
        public static Set<String> getCPHeavyHitterOpCodes() {
                return _cpInstTime.keySet();
        }
@@ -568,6 +529,11 @@ public class Statistics
                        sb.append(String.format("%.3f", 
((double)tmp[len-1-i].getValue())/1000000000));
                        sb.append(" sec \t");
                        sb.append(_cpInstCounts.get(key));
+                       sb.append("\t");
+                       // Add the miscellaneous timer info
+                       if (GPUStatistics.DISPLAY_STATISTICS) {
+                               
sb.append(GPUStatistics.getStringForCPMiscTimesPerInstruction(key));
+                       }
                        sb.append("\n");
                }
                
@@ -685,32 +651,9 @@ public class Statistics
                                sb.append("Number of compiled MR Jobs:\t" + 
getNoOfCompiledMRJobs() + ".\n");
                        sb.append("Number of executed MR Jobs:\t" + 
getNoOfExecutedMRJobs() + ".\n");   
                }
-               
-               if( DMLScript.USE_ACCELERATOR && DMLScript.STATISTICS ) {
-                       sb.append("CUDA/CuLibraries init time:\t" + 
String.format("%.3f", cudaInitTime*1e-9) + "/"
-                                       + String.format("%.3f", 
cudaLibrariesInitTime*1e-9) + " sec.\n");
-                       sb.append("Number of executed GPU inst:\t" + 
getNoOfExecutedGPUInst() + ".\n");
-                       sb.append("GPU mem tx time  
(alloc/dealloc/toDev/fromDev):\t"
-                                       + String.format("%.3f", 
cudaAllocTime.get()*1e-9) + "/"
-                                       + String.format("%.3f", 
cudaDeAllocTime.get()*1e-9) + "/"
-                                       + String.format("%.3f", 
cudaToDevTime.get()*1e-9) + "/"
-                                       + String.format("%.3f", 
cudaFromDevTime.get()*1e-9)  + " sec.\n");
-                       sb.append("GPU mem tx count 
(alloc/dealloc/toDev/fromDev/evict):\t"
-                                       + cudaAllocCount.get() + "/"
-                                       + cudaDeAllocCount.get() + "/"
-                                       + cudaSparseConversionCount.get() + "/"
-                                       + cudaToDevCount.get() + "/"
-                                       + cudaFromDevCount.get() + "/"
-                                       + cudaEvictionCount.get() + ".\n");
-                       sb.append("GPU conversion time  
(sparseConv/sp2dense/dense2sp):\t"
-                                       + String.format("%.3f", 
cudaSparseConversionTime.get()*1e-9) + "/"
-                                       + String.format("%.3f", 
cudaSparseToDenseTime.get()*1e-9) + "/"
-                                       + String.format("%.3f", 
cudaDenseToSparseTime.get()*1e-9) + " sec.\n");
-                       sb.append("GPU conversion count 
(sparseConv/sp2dense/dense2sp):\t"
-                                       + cudaSparseConversionCount.get() + "/"
-                                       + cudaSparseToDenseCount.get() + "/"
-                                       + cudaDenseToSparseCount.get() + ".\n");
-               }
+
+               if( DMLScript.USE_ACCELERATOR && DMLScript.STATISTICS)
+                       sb.append(GPUStatistics.getStringForCudaTimers());
                
                //show extended caching/compilation statistics
                if( DMLScript.STATISTICS ) 
@@ -750,6 +693,7 @@ public class Statistics
                                sb.append("ParFor result merge time:\t" + 
String.format("%.3f", ((double)getParforMergeTime())/1000) + " sec.\n");      
                                sb.append("ParFor total update in-place:\t" + 
lTotalUIPVar + "/" + lTotalLixUIP + "/" + lTotalLix + "\n");
                        }
+
                        sb.append("Total JIT compile time:\t\t" + 
((double)getJITCompileTime())/1000 + " sec.\n");
                        sb.append("Total JVM GC count:\t\t" + getJVMgcCount() + 
".\n");
                        sb.append("Total JVM GC time:\t\t" + 
((double)getJVMgcTime())/1000 + " sec.\n");

Reply via email to