Repository: incubator-systemml Updated Branches: refs/heads/master ee33ec62d -> 4f9dcf9ad
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java index 0c0410c..b46985f 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java @@ -50,10 +50,10 @@ public class LibMatrixDNN { //library configurations and external contracts public static final boolean SUPPORTS_SPARSE_OUTPUTS = false; //operations able to handle sparse outputs - private static final boolean DISPLAY_STATISTICS = false; //conv2d summaries in stats output private static final boolean ALLOW_MULTI_THREADED_OPS = true; //enable multi-threading in cp private static final int NUM_TASK_FACTOR = 2; //number of tasks is vcores scaled by this factor - + public static boolean DISPLAY_STATISTICS = false; //conv2d summaries in stats output + private enum TaskType { MaxPooling_Forward, MaxPooling_Backward, // Alternate approaches that we tried but the performance was unsatisfactory be included: direct, non-looped im2col http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/src/main/java/org/apache/sysml/utils/GPUStatistics.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/utils/GPUStatistics.java b/src/main/java/org/apache/sysml/utils/GPUStatistics.java new file mode 100644 index 0000000..044901b --- /dev/null +++ b/src/main/java/org/apache/sysml/utils/GPUStatistics.java @@ -0,0 +1,209 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysml.utils; + +import org.apache.sysml.api.DMLScript; + +import java.util.*; +import java.util.concurrent.atomic.AtomicLong; + +/** + * Measures performance numbers when GPU mode is enabled + * Printed as part of {@link Statistics}. + */ +public class GPUStatistics { + + // Whether or not extra per-instruction statistics will be recorded and shown for the GPU + public static boolean DISPLAY_STATISTICS = false; + + private static int iNoOfExecutedGPUInst = 0; + + public static long cudaInitTime = 0; + public static long cudaLibrariesInitTime = 0; + public static AtomicLong cudaSparseToDenseTime = new AtomicLong(0); // time spent in converting sparse matrix block to dense + public static AtomicLong cudaDenseToSparseTime = new AtomicLong(0); // time spent in converting dense matrix block to sparse + public static AtomicLong cudaSparseConversionTime = new AtomicLong(0); // time spent in converting between sparse block types + public static AtomicLong cudaSparseToDenseCount = new AtomicLong(0); + public static AtomicLong cudaDenseToSparseCount = new AtomicLong(0); + public static AtomicLong cudaSparseConversionCount = new AtomicLong(0); + + public static AtomicLong cudaAllocTime = new AtomicLong(0); // time spent in allocating memory on the GPU + public static AtomicLong cudaDeAllocTime = new AtomicLong(0); // time spent in deallocating memory on the GPU + public static AtomicLong cudaToDevTime = new AtomicLong(0); // time spent in copying data from host (CPU) to device (GPU) memory + public static AtomicLong cudaFromDevTime = new AtomicLong(0); // time spent in copying data from device to host + public static AtomicLong cudaAllocCount = new AtomicLong(0); + public static AtomicLong cudaDeAllocCount = new AtomicLong(0); + public static AtomicLong cudaToDevCount = new AtomicLong(0); + public static AtomicLong cudaFromDevCount = new AtomicLong(0); + public static AtomicLong cudaEvictionCount = new AtomicLong(0); + + // Per instruction miscellaneous timers. + // Used to record events in a CP Heavy Hitter instruction and + // provide a breakdown of how time was spent in that instruction + private static HashMap<String, HashMap<String, Long>> _cpInstMiscTime = new HashMap<String, HashMap<String, Long>> (); + private static HashMap<String, HashMap<String, Long>> _cpInstMiscCount = new HashMap<String, HashMap<String, Long>> (); + + /** + * Resets the miscellaneous timers & counters + */ + public static void resetMiscTimers(){ + _cpInstMiscTime.clear(); + _cpInstMiscCount.clear(); + } + + /** + * Resets all the cuda counters and timers, including the misc timers & counters + */ + public static void reset(){ + cudaInitTime = 0; + cudaLibrariesInitTime = 0; + cudaAllocTime.set(0); + cudaDeAllocTime.set(0); + cudaToDevTime.set(0); + cudaFromDevTime.set(0); + cudaAllocCount.set(0); + cudaDeAllocCount.set(0); + cudaToDevCount.set(0); + cudaFromDevCount.set(0); + cudaEvictionCount.set(0); + resetMiscTimers(); + } + + + public static synchronized void setNoOfExecutedGPUInst(int numJobs) { + iNoOfExecutedGPUInst = numJobs; + } + + public static synchronized void incrementNoOfExecutedGPUInst() { + iNoOfExecutedGPUInst ++; + } + + public static synchronized int getNoOfExecutedGPUInst() { + return iNoOfExecutedGPUInst; + } + + /** + * "Maintains" or adds time to miscellaneous timers per instruction/op, also increments associated count + * @param instructionName name of the instruction/op + * @param miscTimer name of the miscellaneous timer + * @param timeNanos time in nano seconds + * @param incrementCount how much to increment the count of the miscTimer by + */ + public synchronized static void maintainCPMiscTimes( String instructionName, String miscTimer, long timeNanos, long incrementCount) + { + if (!DISPLAY_STATISTICS) + return; + + HashMap<String, Long> miscTimesMap = _cpInstMiscTime.get(instructionName); + if (miscTimesMap == null) { + miscTimesMap = new HashMap<String, Long>(); + _cpInstMiscTime.put(instructionName, miscTimesMap); + } + Long oldVal = miscTimesMap.get(miscTimer); + Long newVal = timeNanos + ((oldVal!=null) ? oldVal : 0); + miscTimesMap.put(miscTimer, newVal); + + HashMap<String, Long> miscCountMap = _cpInstMiscCount.get(instructionName); + if (miscCountMap == null){ + miscCountMap = new HashMap<String, Long>(); + _cpInstMiscCount.put(instructionName, miscCountMap); + } + Long oldCnt = miscCountMap.get(miscTimer); + Long newCnt = incrementCount + ((oldCnt!=null) ? oldCnt : 0); + miscCountMap.put(miscTimer, newCnt); + } + + /** + * "Maintains" or adds time to miscellaneous timers per instruction/op, also increments associated count by 1 + * @param instructionName name of the instruction/op + * @param miscTimer name of the miscellaneous timer + * @param timeNanos time in nano seconds + */ + public synchronized static void maintainCPMiscTimes( String instructionName, String miscTimer, long timeNanos){ + maintainCPMiscTimes(instructionName, miscTimer, timeNanos, 1); + } + + /** + * Used to print misc timers (and their counts) for a given instruction/op + * @param instructionName name of the instruction/op + * @return a formatted string of misc timers for a given instruction/op + */ + public static String getStringForCPMiscTimesPerInstruction(String instructionName) { + StringBuffer sb = new StringBuffer(); + HashMap<String, Long> miscTimerMap = _cpInstMiscTime.get(instructionName); + if (miscTimerMap != null) { + List<Map.Entry<String, Long>> sortedList = new ArrayList<Map.Entry<String, Long>>(miscTimerMap.entrySet()); + // Sort the times to display by the most expensive first + Collections.sort(sortedList, new Comparator<Map.Entry<String, Long>>() { + @Override + public int compare(Map.Entry<String, Long> o1, Map.Entry<String, Long> o2) { + return (int) (o1.getValue() - o2.getValue()); + } + }); + Iterator<Map.Entry<String, Long>> miscTimeIter = sortedList.iterator(); + HashMap<String, Long> miscCountMap = _cpInstMiscCount.get(instructionName); + while (miscTimeIter.hasNext()) { + Map.Entry<String, Long> e = miscTimeIter.next(); + String miscTimerName = e.getKey(); + Long miscTimerTime = e.getValue(); + Long miscCount = miscCountMap.get(miscTimerName); + sb.append(miscTimerName + "[" + String.format("%.3f", (double) miscTimerTime / 1000000000.0) + "s," + miscCount + "]"); + if (miscTimeIter.hasNext()) + sb.append(", "); + } + } + return sb.toString(); + } + + /** + * Used to print out cuda timers & counters + * @return a formatted string of cuda timers & counters + */ + public static String getStringForCudaTimers() { + StringBuffer sb = new StringBuffer(); + sb.append("CUDA/CuLibraries init time:\t" + String.format("%.3f", cudaInitTime*1e-9) + "/" + + String.format("%.3f", cudaLibrariesInitTime*1e-9) + " sec.\n"); + sb.append("Number of executed GPU inst:\t" + getNoOfExecutedGPUInst() + ".\n"); + sb.append("GPU mem tx time (alloc/dealloc/toDev/fromDev):\t" + + String.format("%.3f", cudaAllocTime.get()*1e-9) + "/" + + String.format("%.3f", cudaDeAllocTime.get()*1e-9) + "/" + + String.format("%.3f", cudaToDevTime.get()*1e-9) + "/" + + String.format("%.3f", cudaFromDevTime.get()*1e-9) + " sec.\n"); + sb.append("GPU mem tx count (alloc/dealloc/toDev/fromDev/evict):\t" + + cudaAllocCount.get() + "/" + + cudaDeAllocCount.get() + "/" + + cudaSparseConversionCount.get() + "/" + + cudaToDevCount.get() + "/" + + cudaFromDevCount.get() + "/" + + cudaEvictionCount.get() + ".\n"); + sb.append("GPU conversion time (sparseConv/sp2dense/dense2sp):\t" + + String.format("%.3f", cudaSparseConversionTime.get()*1e-9) + "/" + + String.format("%.3f", cudaSparseToDenseTime.get()*1e-9) + "/" + + String.format("%.3f", cudaDenseToSparseTime.get()*1e-9) + " sec.\n"); + sb.append("GPU conversion count (sparseConv/sp2dense/dense2sp):\t" + + cudaSparseConversionCount.get() + "/" + + cudaSparseToDenseCount.get() + "/" + + cudaDenseToSparseCount.get() + ".\n"); + + return sb.toString(); + } + + +} http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/src/main/java/org/apache/sysml/utils/Statistics.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/utils/Statistics.java b/src/main/java/org/apache/sysml/utils/Statistics.java index 08f0452..262950f 100644 --- a/src/main/java/org/apache/sysml/utils/Statistics.java +++ b/src/main/java/org/apache/sysml/utils/Statistics.java @@ -22,12 +22,8 @@ package org.apache.sysml.utils; import java.lang.management.CompilationMXBean; import java.lang.management.GarbageCollectorMXBean; import java.lang.management.ManagementFactory; -import java.util.Arrays; -import java.util.Comparator; -import java.util.HashMap; -import java.util.List; +import java.util.*; import java.util.Map.Entry; -import java.util.Set; import java.util.concurrent.atomic.AtomicLong; import org.apache.sysml.api.DMLScript; @@ -47,7 +43,7 @@ import org.apache.sysml.runtime.matrix.data.LibMatrixDNN; * This class captures all statistics. */ public class Statistics -{ +{ private static long compileStartTime = 0; private static long compileEndTime = 0; @@ -62,7 +58,6 @@ public class Statistics private static int iNoOfExecutedSPInst = 0; private static int iNoOfCompiledSPInst = 0; - private static int iNoOfExecutedGPUInst = 0; //JVM stats private static long jitCompileTime = 0; //in milli sec @@ -105,29 +100,11 @@ public class Statistics //heavy hitter counts and times private static HashMap<String,Long> _cpInstTime = new HashMap<String, Long>(); private static HashMap<String,Long> _cpInstCounts = new HashMap<String, Long>(); - + private static AtomicLong lTotalUIPVar = new AtomicLong(0); private static AtomicLong lTotalLix = new AtomicLong(0); private static AtomicLong lTotalLixUIP = new AtomicLong(0); - - public static long cudaInitTime = 0; - public static long cudaLibrariesInitTime = 0; - public static AtomicLong cudaSparseToDenseTime = new AtomicLong(0); // Measures time spent in converting sparse matrix block to dense - public static AtomicLong cudaSparseToDenseCount = new AtomicLong(0); - public static AtomicLong cudaDenseToSparseTime = new AtomicLong(0); // Measures time spent in converting dense matrix block to sparse - public static AtomicLong cudaDenseToSparseCount = new AtomicLong(0); - public static AtomicLong cudaSparseConversionTime = new AtomicLong(0); // Measures time spent in converting between sparse block types - public static AtomicLong cudaSparseConversionCount = new AtomicLong(0); - public static AtomicLong cudaAllocTime = new AtomicLong(0); - public static AtomicLong cudaDeAllocTime = new AtomicLong(0); - public static AtomicLong cudaToDevTime = new AtomicLong(0); - public static AtomicLong cudaFromDevTime = new AtomicLong(0); - public static AtomicLong cudaAllocCount = new AtomicLong(0); - public static AtomicLong cudaDeAllocCount = new AtomicLong(0); - public static AtomicLong cudaToDevCount = new AtomicLong(0); - public static AtomicLong cudaFromDevCount = new AtomicLong(0); - public static AtomicLong cudaEvictionCount = new AtomicLong(0); - + public static synchronized void setNoOfExecutedMRJobs(int iNoOfExecutedMRJobs) { Statistics.iNoOfExecutedMRJobs = iNoOfExecutedMRJobs; } @@ -155,19 +132,7 @@ public class Statistics public static synchronized void incrementNoOfCompiledMRJobs() { iNoOfCompiledMRJobs ++; } - - - public static synchronized void setNoOfExecutedGPUInst(int numJobs) { - iNoOfExecutedGPUInst = numJobs; - } - - public static synchronized void incrementNoOfExecutedGPUInst() { - iNoOfExecutedGPUInst ++; - } - - public static synchronized int getNoOfExecutedGPUInst() { - return iNoOfExecutedGPUInst; - } + public static synchronized void setNoOfExecutedSPInst(int numJobs) { iNoOfExecutedSPInst = numJobs; @@ -249,7 +214,7 @@ public class Statistics } if( DMLScript.USE_ACCELERATOR ) - setNoOfExecutedGPUInst(0); + GPUStatistics.setNoOfExecutedGPUInst(0); } public static synchronized void incrementJITCompileTime( long time ) { @@ -432,18 +397,8 @@ public class Statistics resetJVMgcTime(); resetJVMgcCount(); resetCPHeavyHitters(); - - cudaInitTime = 0; - cudaLibrariesInitTime = 0; - cudaAllocTime.set(0); - cudaDeAllocTime.set(0); - cudaToDevTime.set(0); - cudaFromDevTime.set(0); - cudaAllocCount.set(0); - cudaDeAllocCount.set(0); - cudaToDevCount.set(0); - cudaFromDevCount.set(0); - cudaEvictionCount.set(0); + + GPUStatistics.reset(); LibMatrixDNN.resetStatistics(); } @@ -523,18 +478,24 @@ public class Statistics return opcode; } - - public synchronized static void maintainCPHeavyHitters( String key, long timeNanos ) + + /** + * "Maintains" or adds time to per instruction/op timers, also increments associated count + * @param instructionName name of the instruction/op + * @param timeNanos time in nano seconds + */ + public synchronized static void maintainCPHeavyHitters( String instructionName, long timeNanos ) { - Long oldVal = _cpInstTime.get(key); + Long oldVal = _cpInstTime.get(instructionName); Long newVal = timeNanos + ((oldVal!=null) ? oldVal : 0); - _cpInstTime.put(key, newVal); + _cpInstTime.put(instructionName, newVal); - Long oldCnt = _cpInstCounts.get(key); + Long oldCnt = _cpInstCounts.get(instructionName); Long newCnt = 1 + ((oldCnt!=null) ? oldCnt : 0); - _cpInstCounts.put(key, newCnt); + _cpInstCounts.put(instructionName, newCnt); } - + + public static Set<String> getCPHeavyHitterOpCodes() { return _cpInstTime.keySet(); } @@ -568,6 +529,11 @@ public class Statistics sb.append(String.format("%.3f", ((double)tmp[len-1-i].getValue())/1000000000)); sb.append(" sec \t"); sb.append(_cpInstCounts.get(key)); + sb.append("\t"); + // Add the miscellaneous timer info + if (GPUStatistics.DISPLAY_STATISTICS) { + sb.append(GPUStatistics.getStringForCPMiscTimesPerInstruction(key)); + } sb.append("\n"); } @@ -685,32 +651,9 @@ public class Statistics sb.append("Number of compiled MR Jobs:\t" + getNoOfCompiledMRJobs() + ".\n"); sb.append("Number of executed MR Jobs:\t" + getNoOfExecutedMRJobs() + ".\n"); } - - if( DMLScript.USE_ACCELERATOR && DMLScript.STATISTICS ) { - sb.append("CUDA/CuLibraries init time:\t" + String.format("%.3f", cudaInitTime*1e-9) + "/" - + String.format("%.3f", cudaLibrariesInitTime*1e-9) + " sec.\n"); - sb.append("Number of executed GPU inst:\t" + getNoOfExecutedGPUInst() + ".\n"); - sb.append("GPU mem tx time (alloc/dealloc/toDev/fromDev):\t" - + String.format("%.3f", cudaAllocTime.get()*1e-9) + "/" - + String.format("%.3f", cudaDeAllocTime.get()*1e-9) + "/" - + String.format("%.3f", cudaToDevTime.get()*1e-9) + "/" - + String.format("%.3f", cudaFromDevTime.get()*1e-9) + " sec.\n"); - sb.append("GPU mem tx count (alloc/dealloc/toDev/fromDev/evict):\t" - + cudaAllocCount.get() + "/" - + cudaDeAllocCount.get() + "/" - + cudaSparseConversionCount.get() + "/" - + cudaToDevCount.get() + "/" - + cudaFromDevCount.get() + "/" - + cudaEvictionCount.get() + ".\n"); - sb.append("GPU conversion time (sparseConv/sp2dense/dense2sp):\t" - + String.format("%.3f", cudaSparseConversionTime.get()*1e-9) + "/" - + String.format("%.3f", cudaSparseToDenseTime.get()*1e-9) + "/" - + String.format("%.3f", cudaDenseToSparseTime.get()*1e-9) + " sec.\n"); - sb.append("GPU conversion count (sparseConv/sp2dense/dense2sp):\t" - + cudaSparseConversionCount.get() + "/" - + cudaSparseToDenseCount.get() + "/" - + cudaDenseToSparseCount.get() + ".\n"); - } + + if( DMLScript.USE_ACCELERATOR && DMLScript.STATISTICS) + sb.append(GPUStatistics.getStringForCudaTimers()); //show extended caching/compilation statistics if( DMLScript.STATISTICS ) @@ -750,6 +693,7 @@ public class Statistics sb.append("ParFor result merge time:\t" + String.format("%.3f", ((double)getParforMergeTime())/1000) + " sec.\n"); sb.append("ParFor total update in-place:\t" + lTotalUIPVar + "/" + lTotalLixUIP + "/" + lTotalLix + "\n"); } + sb.append("Total JIT compile time:\t\t" + ((double)getJITCompileTime())/1000 + " sec.\n"); sb.append("Total JVM GC count:\t\t" + getJVMgcCount() + ".\n"); sb.append("Total JVM GC time:\t\t" + ((double)getJVMgcTime())/1000 + " sec.\n");
