[SYSTEMML-540] Include the memory requirement of each layer in the summary table of Caffe2DML
- This helps the user to estimate the batch size she should set for optimal performance. Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/881caa9b Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/881caa9b Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/881caa9b Branch: refs/heads/master Commit: 881caa9ba508b029f72f27d468bb33805704c7cb Parents: 8f4ecdc Author: Niketan Pansare <npan...@us.ibm.com> Authored: Wed Oct 25 15:40:21 2017 -0700 Committer: Niketan Pansare <npan...@us.ibm.com> Committed: Wed Oct 25 15:42:04 2017 -0700 ---------------------------------------------------------------------- docs/beginners-guide-caffe2dml.md | 37 +++++++++------- .../org/apache/sysml/api/dl/Caffe2DML.scala | 46 ++++++++++++++++++-- 2 files changed, 63 insertions(+), 20 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/881caa9b/docs/beginners-guide-caffe2dml.md ---------------------------------------------------------------------- diff --git a/docs/beginners-guide-caffe2dml.md b/docs/beginners-guide-caffe2dml.md index 4d6b7fd..8814283 100644 --- a/docs/beginners-guide-caffe2dml.md +++ b/docs/beginners-guide-caffe2dml.md @@ -64,22 +64,27 @@ lenet.summary() Output: ``` -+-----+---------------+--------------+------------+---------+-----------+---------+ -| Name| Type| Output| Weight| Bias| Top| Bottom| -+-----+---------------+--------------+------------+---------+-----------+---------+ -|mnist| Data| (, 1, 28, 28)| | |mnist,mnist| | -|conv1| Convolution|(, 32, 28, 28)| [32 X 25]| [32 X 1]| conv1| mnist| -|relu1| ReLU|(, 32, 28, 28)| | | relu1| conv1| -|pool1| Pooling|(, 32, 14, 14)| | | pool1| relu1| -|conv2| Convolution|(, 64, 14, 14)| [64 X 800]| [64 X 1]| conv2| pool1| -|relu2| ReLU|(, 64, 14, 14)| | | relu2| conv2| -|pool2| Pooling| (, 64, 7, 7)| | | pool2| relu2| -| ip1| InnerProduct| (, 512, 1, 1)|[3136 X 512]|[1 X 512]| ip1| pool2| -|relu3| ReLU| (, 512, 1, 1)| | | relu3| ip1| -|drop1| Dropout| (, 512, 1, 1)| | | drop1| relu3| -| ip2| InnerProduct| (, 10, 1, 1)| [512 X 10]| [1 X 10]| ip2| drop1| -| loss|SoftmaxWithLoss| (, 10, 1, 1)| | | loss|ip2,mnist| -+-----+---------------+--------------+------------+---------+-----------+---------+ ++-----+---------------+--------------+------------+---------+-----------+---------+--------------------+ +| Name| Type| Output| Weight| Bias| Top| Bottom|Memory* (train/test)| ++-----+---------------+--------------+------------+---------+-----------+---------+--------------------+ +|mnist| Data| (, 1, 28, 28)| | |mnist,mnist| | 1/0| +|conv1| Convolution|(, 32, 28, 28)| [32 X 25]| [32 X 1]| conv1| mnist| 25/12| +|relu1| ReLU|(, 32, 28, 28)| | | relu1| conv1| 25/12| +|pool1| Pooling|(, 32, 14, 14)| | | pool1| relu1| 6/3| +|conv2| Convolution|(, 64, 14, 14)| [64 X 800]| [64 X 1]| conv2| pool1| 38/7| +|relu2| ReLU|(, 64, 14, 14)| | | relu2| conv2| 12/6| +|pool2| Pooling| (, 64, 7, 7)| | | pool2| relu2| 3/2| +| ip1| InnerProduct| (, 512, 1, 1)|[3136 X 512]|[1 X 512]| ip1| pool2| 797/13| +|relu3| ReLU| (, 512, 1, 1)| | | relu3| ip1| 1/0| +|drop1| Dropout| (, 512, 1, 1)| | | drop1| relu3| 1/0| +| ip2| InnerProduct| (, 10, 1, 1)| [512 X 10]| [1 X 10]| ip2| drop1| 3/0| +| loss|SoftmaxWithLoss| (, 10, 1, 1)| | | loss|ip2,mnist| 0/0| ++-----+---------------+--------------+------------+---------+-----------+---------+--------------------+ + +Total number of layer outputs/errors/weights/bias/gradients: 5568768/5568768/1662752/618/106455680 +Total memory requirements for parameters* for train/test: 910/55 +[Advanced] Key network statistics to compute intermediate CP overhead batchSize/maxThreads/1-thread im2col*(sum, max)/1-thread reshape_col*(sum, max): 64/48/(1, 1)/(0, 0). +* => memory in megabytes assuming the parameters are in double precision and in dense format. ``` To train the above lenet model, we use the MNIST dataset. http://git-wip-us.apache.org/repos/asf/systemml/blob/881caa9b/src/main/scala/org/apache/sysml/api/dl/Caffe2DML.scala ---------------------------------------------------------------------- diff --git a/src/main/scala/org/apache/sysml/api/dl/Caffe2DML.scala b/src/main/scala/org/apache/sysml/api/dl/Caffe2DML.scala index 03b9a3b..56be5d6 100644 --- a/src/main/scala/org/apache/sysml/api/dl/Caffe2DML.scala +++ b/src/main/scala/org/apache/sysml/api/dl/Caffe2DML.scala @@ -50,6 +50,8 @@ import java.util.Random import org.apache.commons.logging.Log import org.apache.commons.logging.LogFactory import org.apache.sysml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer +import org.apache.sysml.hops.OptimizerUtils +import java.lang.Double /*************************************************************************************** DESIGN OF CAFFE2DML: @@ -306,10 +308,21 @@ class Caffe2DML(val sc: SparkContext, def getTrainAlgo(): String = if (inputs.containsKey("$train_algo")) inputs.get("$train_algo") else "minibatch" def getTestAlgo(): String = if (inputs.containsKey("$test_algo")) inputs.get("$test_algo") else "minibatch" + private def getMemInBytes(l:CaffeLayer, batchSize:Int, isTraining:Boolean):Long = { + val numLayerOutput = l.outputShape._1.toLong * l.outputShape._2.toLong * l.outputShape._3.toLong * batchSize + val numLayerError = numLayerOutput + val numLayerWeights = if(l.weightShape != null) l.weightShape()(0).toLong * l.weightShape()(1).toLong else 0 + val numLayerBias = if(l.biasShape != null)l.biasShape()(0).toLong * l.biasShape()(1).toLong else 0 + val numLayerGradients = (numLayerWeights + numLayerBias) * batchSize + if(isTraining) (numLayerOutput + numLayerError + numLayerWeights + numLayerBias + numLayerGradients)*Double.BYTES + else (numLayerOutput + numLayerWeights + numLayerBias)*Double.BYTES + } def summary(sparkSession: org.apache.spark.sql.SparkSession): Unit = { - val header = Seq("Name", "Type", "Output", "Weight", "Bias", "Top", "Bottom") - val entries = net.getLayers - .map(l => (l, net.getCaffeLayer(l))) + val layers = net.getLayers .map(l => (l, net.getCaffeLayer(l))) + val numDataLayers = layers.filter(l => l._2.isInstanceOf[Data]).length + val batchSize = if(numDataLayers == 1) layers.filter(l => l._2.isInstanceOf[Data]).map(l => l._2.param.getDataParam.getBatchSize).get(0) else -1 + val header = Seq("Name", "Type", "Output", "Weight", "Bias", "Top", "Bottom", "Memory* (train/test)") + val entries = layers .map(l => { val layer = l._2 (l._1, @@ -318,10 +331,35 @@ class Caffe2DML(val sc: SparkContext, if (layer.weightShape != null) "[" + layer.weightShape()(0) + " X " + layer.weightShape()(1) + "]" else "", if (layer.biasShape != null) "[" + layer.biasShape()(0) + " X " + layer.biasShape()(1) + "]" else "", layer.param.getTopList.mkString(","), - layer.param.getBottomList.mkString(",")) + layer.param.getBottomList.mkString(","), + OptimizerUtils.toMB(getMemInBytes(l._2, batchSize, true)) + "/" + OptimizerUtils.toMB(getMemInBytes(l._2, batchSize, false)) + ) }) import sparkSession.implicits._ sc.parallelize(entries).toDF(header: _*).show(net.getLayers.size) + + val numLayerOutput = layers.map(l => l._2.outputShape._1.toLong * l._2.outputShape._2.toLong * l._2.outputShape._3.toLong).sum * batchSize + val numLayerError = numLayerOutput + val numLayerWeights = layers.map(l => if(l._2.weightShape != null) l._2.weightShape()(0).toLong * l._2.weightShape()(1).toLong else 0).sum + val numLayerBias = layers.map(l => if(l._2.biasShape != null) l._2.biasShape()(0).toLong * l._2.biasShape()(1).toLong else 0).sum + val numLayerGradients = (numLayerWeights + numLayerBias) * batchSize + val convLayers = layers.filter(l => l._2.isInstanceOf[Convolution]).map(l => l._2.asInstanceOf[Convolution]) + val crspq = convLayers.map(l => l.numChannels.toLong*l.kernel_h.toLong*l.kernel_w.toLong*l.outputShape._2.toLong*l.outputShape._3.toLong) + val kpq = convLayers.map(l => l.outputShape._1.toLong*l.outputShape._2.toLong*l.outputShape._3.toLong) + + if(getTrainAlgo().equals("minibatch") && getTestAlgo().equals("minibatch")) { + System.out.println("Total number of layer outputs/errors/weights/bias/gradients: " + numLayerOutput + "/" + numLayerError + + "/" + numLayerWeights + "/" + numLayerBias + "/" + numLayerGradients) + System.out.println("Total memory requirements for parameters* for train/test: " + + OptimizerUtils.toMB(layers.map(l => getMemInBytes(l._2, batchSize, true)).sum) + "/" + + OptimizerUtils.toMB(layers.map(l => getMemInBytes(l._2, batchSize, false)).sum)) + System.out.println("[Advanced] Key network statistics to compute intermediate CP overhead " + + "batchSize/maxThreads/1-thread im2col*(sum, max)/1-thread reshape_col*(sum, max): " + + batchSize + "/" + OptimizerUtils.getConstrainedNumThreads(-1) + "/(" + + OptimizerUtils.toMB(crspq.sum*Double.BYTES) + ", " + OptimizerUtils.toMB(crspq.max*Double.BYTES) + ")/(" + + OptimizerUtils.toMB(kpq.sum*Double.BYTES) + ", " + OptimizerUtils.toMB(kpq.max*Double.BYTES) + ").") + } + System.out.println("* => memory in megabytes assuming the parameters are in double precision and in dense format.") } // ================================================================================================