Repository: incubator-systemml Updated Branches: refs/heads/master 59ff8b3d4 -> 7ed8e3f49
[SYSTEMML-1479] Integrated DeConvolution and Concat layer in Caffe2DML Closes #492. Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/7ed8e3f4 Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/7ed8e3f4 Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/7ed8e3f4 Branch: refs/heads/master Commit: 7ed8e3f4994b80f8e98f16e40f12667180fd6a02 Parents: 59ff8b3 Author: Niketan Pansare <npan...@us.ibm.com> Authored: Sat May 13 10:45:55 2017 -0700 Committer: Niketan Pansare <npan...@us.ibm.com> Committed: Sat May 13 10:45:55 2017 -0700 ---------------------------------------------------------------------- .../org/apache/sysml/api/dl/CaffeLayer.scala | 713 ++++++++++++++++++- .../org/apache/sysml/api/dl/CaffeNetwork.scala | 14 + .../org/apache/sysml/api/dl/DMLGenerator.scala | 14 +- 3 files changed, 713 insertions(+), 28 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/7ed8e3f4/src/main/scala/org/apache/sysml/api/dl/CaffeLayer.scala ---------------------------------------------------------------------- diff --git a/src/main/scala/org/apache/sysml/api/dl/CaffeLayer.scala b/src/main/scala/org/apache/sysml/api/dl/CaffeLayer.scala index 4faa203..0d1740e 100644 --- a/src/main/scala/org/apache/sysml/api/dl/CaffeLayer.scala +++ b/src/main/scala/org/apache/sysml/api/dl/CaffeLayer.scala @@ -76,12 +76,12 @@ trait CaffeLayer extends BaseDMLGenerator { if(computedDout == null) { val ret = net.getTopLayers(param.getName).map(l => net.getCaffeLayer(l)).toList if(ret.size == 0) throw new LanguageException("Expected atleast 1 top layer for " + param.getName) - else if(ret.size == 1) computedDout = ret(0).dX - else computedDout = sum(new StringBuilder, ret.map(_.dX).toList).toString() + else if(ret.size == 1) computedDout = ret(0).dX(id) + else computedDout = sum(new StringBuilder, ret.map(_.dX(id)).toList).toString() } computedDout } - val dX = "dOut" + id + def dX(bottomLayerID:Int) = "dOut" + id + "_" + bottomLayerID // -------------------------------------------------------------------------------------- // No need to override these methods in subclasses, instead classes that have weights and biases // should implement HasWeight and HasBias traits. @@ -99,8 +99,33 @@ trait CaffeLayer extends BaseDMLGenerator { def invokeForward(dmlScript:StringBuilder, returnVariables:List[String], arguments:String*):Unit = { invoke(dmlScript, sourceFileName + "::", returnVariables, "forward", arguments.toList) } + // ----------------------------------------------------------------------------------- + // All the layers (with the exception of Concat) call one of the below methods in the backward function. + // The preceding layer expects that 'dX(bottomLayerID) + outSuffix' is assigned. + // l1 <--- dX(1) <-----| + // |-- [current layer: dOut3 (computed by backward)] <---- "dOut" + id + outSuffix + // l2 <--- dX(2) <-----| + // The below functions perform two functions: + // 1. Compute backward: either call dml file's backward (for example: invokeBackward) or just propagate next layers errors (assignDoutToDX) + // 2. Then make sure that all the preceding layer get the errors using: + // bottomLayerIDs.map(bottomLayerID => dmlScript.append( dX(bottomLayerID) + outSuffix + " = " + "dOut" + id + outSuffix + "; ")) + + // The layers that have a corresponding dml script call this method. 
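For concreteness, here is a minimal sketch of the gradient-naming scheme described above, using hypothetical layer ids that do not come from this commit: a layer with id 3 whose bottom layers have ids 1 and 2 first computes its own gradient into "dOut3" + outSuffix, then copies it to dOut3_1 and dOut3_2, which is exactly what those bottom layers later read back through their dout getter via dX(id).

    // Minimal sketch of the gradient fan-out described above; ids and names are hypothetical.
    val id = 3                          // current layer
    val bottomLayerIDs = List(1, 2)     // ids of its bottom (preceding) layers
    val outSuffix = "_iter"
    def dX(bottomLayerID: Int) = "dOut" + id + "_" + bottomLayerID
    val dmlScript = new StringBuilder
    dmlScript.append("dOut" + id + outSuffix + " = some_layer::backward(dout, X)")   // 1. compute backward
    dmlScript.append("; ")
    bottomLayerIDs.foreach(b =>                                                       // 2. fan out to bottom layers
      dmlScript.append(dX(b) + outSuffix + " = " + "dOut" + id + outSuffix + "; "))
    // dmlScript now holds one generated DML line:
    //   dOut3_iter = some_layer::backward(dout, X); dOut3_1_iter = dOut3_iter; dOut3_2_iter = dOut3_iter;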
+ // Assumption: the first variable of resultVariables is always dX def invokeBackward(dmlScript:StringBuilder, outSuffix:String, resultVariables:List[String], arguments:String*):Unit = { - invoke(dmlScript, sourceFileName + "::", resultVariables.map(_ + outSuffix), "backward", arguments.toList) + invoke(dmlScript, sourceFileName + "::", resultVariables.map(_ + outSuffix), "backward", arguments.toList, false) + val bottomLayerIDs = net.getBottomLayers(param.getName).map(l => net.getCaffeLayer(l).id) + dmlScript.append("; ") + bottomLayerIDs.map(bottomLayerID => dmlScript.append( dX(bottomLayerID) + outSuffix + " = " + resultVariables(0) + outSuffix + "; ")) + dmlScript.append("\n") + } + // On-the-fly layers (such as Scale and Elementwise) call this function to propagate next layers errors to the previous layer + def assignDoutToDX(dmlScript:StringBuilder, outSuffix:String):Unit = { + dmlScript.append("dOut" + id + outSuffix + " = " + dout) + val bottomLayerIDs = net.getBottomLayers(param.getName).map(l => net.getCaffeLayer(l).id) + dmlScript.append("; ") + bottomLayerIDs.map(bottomLayerID => dmlScript.append( dX(bottomLayerID) + outSuffix + " = " + "dOut" + id + outSuffix + "; ")) + dmlScript.append("\n") } // -------------------------------------------------------------------------------------- } @@ -144,16 +169,135 @@ class Data(val param:LayerParameter, val id:Int, val net:CaffeNetwork, val numCh class BatchNorm(val param:LayerParameter, val id:Int, val net:CaffeNetwork) extends CaffeLayer with HasWeight with HasBias { // val scale = override def sourceFileName = "batch_norm2d" + /* + * Initialize the parameters of this layer. + * + * Note: This is just a convenience function, and parameters + * may be initialized manually if needed. + * + * Inputs: + * - C: Number of input channels (dimensionality of input depth). + * + * Outputs: + * - gamma: Scale parameters, of shape (C, 1). + * - beta: Shift parameters, of shape (C, 1). + * - ema_mean: Exponential moving average of the mean, of + * shape (C, 1). + * - ema_var: Exponential moving average of the variance, of + * shape (C, 1). + */ override def init(dmlScript:StringBuilder) = invokeInit(dmlScript, List[String](gamma, beta, ema_mean, ema_var), numChannels) var update_mean_var = true + /* + * Computes the forward pass for a 2D (spatial) batch normalization + * layer. The input data has N examples, each represented as a 3D + * volume unrolled into a single vector. + * + * A spatial batch normalization layer uses the per-channel sample + * mean and per-channel uncorrected sample variance during training + * to normalize each channel of the input data. Additionally, it + * introduces learnable parameters (gamma, beta) to control the + * amount of normalization. + * + * `y = ((x-mean) / sqrt(var+eps)) * gamma + beta` + * + * This implementation maintains exponential moving averages of the + * mean and variance during training for use during testing. + * + * Reference: + * - Batch Normalization: Accelerating Deep Network Training by + * Reducing Internal Covariate Shift, S. Ioffe & C. Szegedy, 2015 + * - https://arxiv.org/abs/1502.03167 + * + * Inputs: + * - X: Inputs, of shape (N, C*Hin*Win). + * - gamma: Scale parameters, of shape (C, 1). + * - beta: Shift parameters, of shape (C, 1). + * - C: Number of input channels (dimensionality of input depth). + * - Hin: Input height. + * - Win: Input width. + * - mode: 'train' or 'test' to indicate if the model is currently + * being trained or tested. 
During training, the current batch + * mean and variance will be used to normalize the inputs, while + * during testing, the exponential average of the mean and + * variance over all previous batches will be used. + * - ema_mean: Exponential moving average of the mean, of + * shape (C, 1). + * - ema_var: Exponential moving average of the variance, of + * shape (C, 1). + * - mu: Momentum value for moving averages. + * Typical values are in the range of [0.9, 0.999]. + * - epsilon: Smoothing term to avoid divide by zero errors. + * Typical values are in the range of [1e-5, 1e-3]. + * + * Outputs: + * - out: Outputs, of shape (N, C*Hin*Win). + * - ema_mean_upd: Updated exponential moving average of the mean, + * of shape (C, 1). + * - ema_var_upd: Updated exponential moving average of the variance, + * of shape (C, 1). + * - cache_mean: Cache of the batch mean, of shape (C, 1). + * Note: This is used for performance during training. + * - cache_var: Cache of the batch variance, of shape (C, 1). + * Note: This is used for performance during training. + * - cache_norm: Cache of the normalized inputs, of + * shape (C, N*Hin*Win). Note: This is used for performance + * during training. + */ def forward(dmlScript: StringBuilder, isPrediction: Boolean): Unit = { val mode = if(isPrediction) "\"test\"" else "\"train\"" invokeForward(dmlScript, List[String](out, withSuffix(ema_mean), withSuffix(ema_var), withSuffix(cache_mean), withSuffix(cache_var), withSuffix(cache_norm)), X, gamma, beta, numChannels, Hin, Win, mode, ema_mean, ema_var, ma_fraction, eps) } - + /* + * Computes the backward pass for a 2D (spatial) batch normalization + * layer. + * + * Inputs: + * - dout: Gradient wrt `out` from upstream, of shape (N, C*Hin*Win). + * - out: Outputs from the forward pass, of shape (N, C*Hin*Win). + * - ema_mean_upd: Updated exponential moving average of the mean + * from the forward pass, of shape (C, 1). + * - ema_var_upd: Updated exponential moving average of the variance + * from the forward pass, of shape (C, 1). + * - cache_mean: Cache of the batch mean from the forward pass, of + * shape (C, 1). Note: This is used for performance during + * training. + * - cache_var: Cache of the batch variance from the forward pass, + * of shape (C, 1). Note: This is used for performance during + * training. + * - cache_norm: Cache of the normalized inputs from the forward + * pass, of shape (C, N*Hin*Win). Note: This is used for + * performance during training. + * - X: Input data matrix to the forward pass, of + * shape (N, C*Hin*Win). + * - gamma: Scale parameters, of shape (C, 1). + * - beta: Shift parameters, of shape (C, 1). + * - C: Number of input channels (dimensionality of input depth). + * - Hin: Input height. + * - Win: Input width. + * - mode: 'train' or 'test' to indicate if the model is currently + * being trained or tested. During training, the current batch + * mean and variance will be used to normalize the inputs, while + * during testing, the exponential average of the mean and + * variance over all previous batches will be used. + * - ema_mean: Exponential moving average of the mean, of + * shape (C, 1). + * - ema_var: Exponential moving average of the variance, of + * shape (C, 1). + * - mu: Momentum value for moving averages. + * Typical values are in the range of [0.9, 0.999]. + * - epsilon: Smoothing term to avoid divide by zero errors. + * Typical values are in the range of [1e-5, 1e-3]. + * + * Outputs: + * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win). 
+ * - dgamma: Gradient wrt `W`, of shape (C, 1). + * - dbeta: Gradient wrt `b`, of shape (C, 1). + * + */ def backward(dmlScript: StringBuilder, outSuffix:String): Unit = { - invokeBackward(dmlScript, outSuffix, List[String](dX, dgamma, dbeta), dout, out, ema_mean, ema_var, cache_mean, cache_var, cache_norm, X, gamma, beta, numChannels, + invokeBackward(dmlScript, outSuffix, List[String]("dOut" + id, dgamma, dbeta), dout, out, ema_mean, ema_var, cache_mean, cache_var, cache_norm, X, gamma, beta, numChannels, Hin, Win, "\"train\"", ema_mean, ema_var, ma_fraction, eps) } @@ -190,8 +334,9 @@ class Scale(val param:LayerParameter, val id:Int, val net:CaffeNetwork) extends if(!param.getScaleParam.getBiasTerm) throw new LanguageException("Add \"scale_param { bias_term: true }\" to the layer " + param.getName) override def sourceFileName = null override def init(dmlScript: StringBuilder): Unit = {} + // TODO: Generalize this !! def forward(dmlScript: StringBuilder, isPrediction: Boolean): Unit = assign(dmlScript, out, X) - override def backward(dmlScript: StringBuilder, outSuffix:String): Unit = assign(dmlScript, dX + outSuffix, dout) + override def backward(dmlScript: StringBuilder, outSuffix:String): Unit = assignDoutToDX(dmlScript, outSuffix) } // ------------------------------------------------------------------ @@ -200,10 +345,10 @@ class Elementwise(val param:LayerParameter, val id:Int, val net:CaffeNetwork) ex override def init(dmlScript: StringBuilder): Unit = {} if(param.getEltwiseParam.hasOperation && param.getEltwiseParam.getOperation != EltwiseOp.SUM) throw new LanguageException("Currently only elementwise sum operation supported") - def forward(dmlScript: StringBuilder, isPrediction: Boolean): Unit = { + override def forward(dmlScript: StringBuilder, isPrediction: Boolean): Unit = { addAndAssign(dmlScript, out, param.getBottomList.map(b => net.getCaffeLayer(b).out).toList) } - override def backward(dmlScript: StringBuilder, outSuffix:String): Unit = assign(dmlScript, dX + outSuffix, dout) + override def backward(dmlScript: StringBuilder, outSuffix:String): Unit = assignDoutToDX(dmlScript, outSuffix) override def outputShape = { if(_out == null) _out = net.getCaffeLayer(net.getBottomLayers(param.getName).take(1).toSeq.get(0)).outputShape _out @@ -212,6 +357,117 @@ class Elementwise(val param:LayerParameter, val id:Int, val net:CaffeNetwork) ex } +class Concat(val param:LayerParameter, val id:Int, val net:CaffeNetwork) extends CaffeLayer { + override def sourceFileName = null + override def init(dmlScript: StringBuilder): Unit = {} + var _childLayers:List[CaffeLayer] = null + + // Utility function to create string of format: + // fn(fn(fn(_childLayers(0).out, _childLayers(1).out), _childLayers(2).out), ...) + // This is useful because we do not support multi-input cbind and rbind in DML. + def _getMultiFn(fn:String):String = { + if(_childLayers == null) _childLayers = net.getBottomLayers(param.getName).map(l => net.getCaffeLayer(l)).toList + var tmp = fn + "(" + _childLayers(0).out + ", " + _childLayers(1).out + ")" + for(i <- 2 until _childLayers.size) { + tmp = fn + "(" + tmp + ", " + _childLayers(i).out + ")" + } + tmp + } + + /* + * Computes the forward pass for a concatenation layer. + * + * Inputs: + * - n_i * c_i * h * w for each input blob i from 1 to K. + * + * Outputs: + * - out: Outputs, of shape + * - if axis = 0: (n_1 + n_2 + ... + n_K) * c_1 * h * w, and all input c_i should be the same. + * - if axis = 1: n_1 * (c_1 + c_2 + ... 
+ c_K) * h * w, and all input n_i should be the same. + */ + override def forward(dmlScript: StringBuilder, isPrediction: Boolean): Unit = { + if(param.getConcatParam.getAxis == 0) { + // rbind the inputs + assign(dmlScript, out, _getMultiFn("rbind")) + } + else if(param.getConcatParam.getAxis == 1) { + // cbind the inputs + assign(dmlScript, out, _getMultiFn("cbind")) + } + else { + throw new DMLRuntimeException("Incorrect axis parameter for the layer " + param.getName) + } + } + + def startIndex(outSuffix:String):String = "concat_start_index_" + outSuffix + def endIndex(outSuffix:String):String = "concat_start_index_" + outSuffix + def getConcatIndex(bottomLayerOut:String, outSuffix:String):String = + startIndex(outSuffix) + " = " + endIndex(outSuffix) + " + 1; " + + endIndex(outSuffix) + " = " + startIndex(outSuffix) + " + nrow(" + bottomLayerOut + "); " + + /* + * Computes the backward pass for a concatenation layer. + * + * The top gradients are deconcatenated back to the inputs. + * + */ + override def backward(dmlScript: StringBuilder, outSuffix:String): Unit = { + val bottomLayers = net.getBottomLayers(param.getName).map(l => net.getCaffeLayer(l)).toList + val dOutVar = "dOut" + id + outSuffix + // concat_end_index = 0 + dmlScript.append(dOutVar + " = " + dout + "; concat_end_index" + outSuffix + " = 0; ") + + val indexString = "concat_start_index" + outSuffix + " : concat_end_index" + outSuffix + val doutVarAssignment = if(param.getConcatParam.getAxis == 0) " = " + dOutVar + "[" + indexString + ", ]; " + else " = " + dOutVar + "[," + indexString + " ]; " + + // concat_start_index = concat_end_index + 1 + // concat_end_index = concat_start_index + $$ - 1 + val initializeIndexString = "concat_start_index" + outSuffix + " = concat_end_index" + outSuffix + " + 1; concat_end_index" + outSuffix + + " = concat_start_index" + outSuffix + " + $$ - 1; " + if(param.getConcatParam.getAxis == 0) { + bottomLayers.map(l => { + dmlScript.append(initializeIndexString.replaceAll("$$", nrow(l.out))) + // X1 = Z[concat_start_index:concat_end_index,] + .append( dX(l.id) + outSuffix + doutVarAssignment) + }) + } + else { + bottomLayers.map(l => { + dmlScript.append(initializeIndexString.replaceAll("$$", int_mult(l.outputShape._1, l.outputShape._2, l.outputShape._3) )) + // X1 = Z[concat_start_index:concat_end_index,] + .append( dX(l.id) + outSuffix + doutVarAssignment) + }) + } + dmlScript.append("\n") + } + def sumChannels():String = { + val channels = _childLayers.map(_.outputShape._1) + try { + channels.reduce((c1, c2) => (c1.toInt + c2.toInt).toString()) + } + catch { + case _:Throwable => sum(new StringBuilder, channels).toString + } + } + override def outputShape = { + if(_out == null) { + if(_childLayers == null) _childLayers = net.getBottomLayers(param.getName).map(l => net.getCaffeLayer(l)).toList + if(param.getConcatParam.getAxis == 0) { + _out = _childLayers(0).outputShape + } + else if(param.getConcatParam.getAxis == 1) { + _out = (sumChannels(), _childLayers(0).outputShape._2, _childLayers(0).outputShape._3) + } + else { + throw new DMLRuntimeException("Incorrect axis parameter for the layer " + param.getName) + } + } + _out + } + var _out:(String, String, String) = null +} + class SoftmaxWithLoss(val param:LayerParameter, val id:Int, val net:CaffeNetwork) extends CaffeLayer with IsLossLayer { // ------------------------------------------------- override def sourceFileName = "softmax" @@ -219,8 +475,13 @@ class SoftmaxWithLoss(val param:LayerParameter, val id:Int, val net:CaffeNetwork 
override def forward(dmlScript:StringBuilder, isPrediction:Boolean) = invokeForward(dmlScript, List[String](out), scores) override def backward(dmlScript:StringBuilder, outSuffix:String) = { - invoke(dmlScript, "cross_entropy_loss::", List[String]("dProbs" + outSuffix), "backward", out, "yb") - invoke(dmlScript.append("\t"), "softmax::", List[String](dX + outSuffix), "backward", "dProbs", scores) + invoke(dmlScript, "cross_entropy_loss::", List[String]("dProbs" + outSuffix), "backward", false, out, "yb") + dmlScript.append("; ") + invoke(dmlScript, "softmax::", List[String]("dOut" + id + outSuffix), "backward", false, "dProbs", scores) + val bottomLayerIDs = net.getBottomLayers(param.getName).map(l => net.getCaffeLayer(l).id) + dmlScript.append("; ") + bottomLayerIDs.map(bottomLayerID => dmlScript.append( dX(bottomLayerID) + outSuffix + " = " + "dOut" + id + outSuffix + "; ")) + dmlScript.append("\n") } override def computeLoss(dmlScript:StringBuilder, numTabs:Int) = { val tabBuilder = new StringBuilder @@ -249,11 +510,36 @@ class SoftmaxWithLoss(val param:LayerParameter, val id:Int, val net:CaffeNetwork } class ReLU(val param:LayerParameter, val id:Int, val net:CaffeNetwork) extends CaffeLayer { + // TODO: Leaky ReLU: negative_slope [default 0]: specifies whether to leak the negative part by multiplying it with the slope value rather than setting it to 0. // ------------------------------------------------- override def sourceFileName = "relu" override def init(dmlScript:StringBuilder) = { } + /* + * Computes the forward pass for a ReLU nonlinearity layer. + * + * Performs an element-wise evaluation of `f(input) = max(0, input)`. + * + * Inputs: + * - X: Inputs, of shape (any, any). + * + * Outputs: + * - out: Outputs, of same shape as `X`. + */ override def forward(dmlScript:StringBuilder, isPrediction:Boolean) = invokeForward(dmlScript, List[String](out), X) - override def backward(dmlScript:StringBuilder, outSuffix:String) = invokeBackward(dmlScript, outSuffix, List[String](dX), dout, X) + /* + * Computes the backward pass for a ReLU nonlinearity layer. + * + * Essentially performs a pass-through of the upstream gradient + * for cells > 0. + * + * Inputs: + * - dout: Gradient wrt `out` from upstream, of same shape as `X`. + * - X: Previous input data matrix, of shape (any, any). + * + * Outputs: + * - dX: Gradient wrt `X`, of same shape as `X`. + */ + override def backward(dmlScript:StringBuilder, outSuffix:String) = invokeBackward(dmlScript, outSuffix, List[String]("dOut" + id), dout, X) // ------------------------------------------------- } @@ -261,29 +547,114 @@ class Dropout(val param:LayerParameter, val id:Int, val net:CaffeNetwork) extend // ------------------------------------------------- override def sourceFileName = "dropout" override def init(dmlScript:StringBuilder) = { } + /* + * Computes the forward pass for an inverted dropout layer. + * + * Drops the inputs element-wise with a probability p, and divides + * by p to maintain the expected values of those inputs (which are + * the outputs of neurons) at test time. + * + * Inputs: + * - X: Inputs, of shape (any, any). + * - p: Probability of keeping a neuron output. + * - seed: [Optional: -1] Random number generator seed to allow for + * deterministic evaluation. Set to -1 for a random seed. + * + * Outputs: + * - out: Outputs, of same shape as `X`. + * - mask: Dropout mask used to compute the output. 
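For intuition, a minimal inverted-dropout sketch with plain Scala arrays (an illustration only, not the .dml script this layer actually invokes): each value is kept with probability p and the survivors are rescaled by 1/p so the expected activation stays unchanged at test time.

    import scala.util.Random

    // Keep each element with probability p, zero the rest, and rescale by 1/p.
    def dropoutForward(x: Array[Double], p: Double, seed: Long): (Array[Double], Array[Double]) = {
      val rng  = new Random(seed)
      val mask = x.map(_ => if (rng.nextDouble() < p) 1.0 else 0.0)
      val out  = x.zip(mask).map { case (v, m) => v * m / p }
      (out, mask)
    }

    // Backward: reapply the same mask and scaling to the upstream gradient.
    def dropoutBackward(dout: Array[Double], mask: Array[Double], p: Double): Array[Double] =
      dout.zip(mask).map { case (d, m) => d * m / p }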
+ */ override def forward(dmlScript:StringBuilder, isPrediction:Boolean) = if(!isPrediction) invokeForward(dmlScript, List[String](out, mask), X, p, seed) else assign(dmlScript, out, X) // Forward-pass not required to be performed during prediction for Dropout layer - override def backward(dmlScript:StringBuilder, outSuffix:String) = invokeBackward(dmlScript, outSuffix, List[String](dX), dout, X, p, mask) + /* + * Computes the backward pass for an inverted dropout layer. + * + * Applies the mask to the upstream gradient, and divides by p to + * maintain the expected values at test time. + * + * Inputs: + * - dout: Gradient wrt `out`, of same shape as `X`. + * - X: Inputs, of shape (any, any). + * - p: Probability of keeping a neuron output. + * - mask: Dropout mask used to compute the output. + * + * Outputs: + * - dX: Gradient wrt `X`, of same shape as `X`. + */ + override def backward(dmlScript:StringBuilder, outSuffix:String) = invokeBackward(dmlScript, outSuffix, + List[String]("dOut" + id), dout, X, p, mask) // ------------------------------------------------- def mask = "mask" + id - def p = param.getDropoutParam.getDropoutRatio.toString + // dropout ratio + def p = if(param.getDropoutParam.hasDropoutRatio()) param.getDropoutParam.getDropoutRatio.toString else "0.5" def seed = "-1" } class InnerProduct(val param:LayerParameter, val id:Int, val net:CaffeNetwork) extends CaffeLayer with HasWeight with HasBias { // ------------------------------------------------- + // TODO: bias_filler [default type: 'constant' value: 0]; bias_term [default true]: specifies whether to learn and apply a set of additive biases to the filter outputs override def sourceFileName = "affine" + /* + * Initialize the parameters of this layer. + * + * Note: This is just a convenience function, and parameters + * may be initialized manually if needed. + * + * We use the heuristic by He et al., which limits the magnification + * of inputs/gradients during forward/backward passes by scaling + * unit-Gaussian weights by a factor of sqrt(2/n), under the + * assumption of relu neurons. + * - http://arxiv.org/abs/1502.01852 + * + * Inputs: + * - D: Dimensionality of the input features (number of features). + * - M: Number of neurons in this layer. + * + * Outputs: + * - W: Weights, of shape (D, M). + * - b: Biases, of shape (1, M). + */ override def init(dmlScript:StringBuilder) = invokeInit(dmlScript, List[String](weight, bias), numFeatures, numNeurons) + /* + * Computes the forward pass for an affine (fully-connected) layer + * with M neurons. The input data has N examples, each with D + * features. + * + * Inputs: + * - X: Inputs, of shape (N, D). + * - W: Weights, of shape (D, M). + * - b: Biases, of shape (1, M). + * + * Outputs: + * - out: Outputs, of shape (N, M). + */ override def forward(dmlScript:StringBuilder, isPrediction:Boolean) = invokeForward(dmlScript, List[String](out), X, weight, bias) + /* + * Computes the backward pass for a fully-connected (affine) layer + * with M neurons. + * + * Inputs: + * - dout: Gradient wrt `out` from upstream, of shape (N, M). + * - X: Inputs, of shape (N, D). + * - W: Weights, of shape (D, M). + * - b: Biases, of shape (1, M). + * + * Outputs: + * - dX: Gradient wrt `X`, of shape (N, D). + * - dW: Gradient wrt `W`, of shape (D, M). + * - db: Gradient wrt `b`, of shape (1, M). 
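The gradients listed above follow the usual affine-layer math; a self-contained sketch with plain Scala arrays (illustration only, the layer itself delegates to the .dml script named by sourceFileName):

    // out = X %*% W + b ; dX = dout %*% t(W) ; dW = t(X) %*% dout ; db = colSums(dout)
    def matmul(a: Array[Array[Double]], b: Array[Array[Double]]): Array[Array[Double]] =
      a.map(row => b.transpose.map(col => row.zip(col).map { case (x, y) => x * y }.sum))

    def affineForward(x: Array[Array[Double]], w: Array[Array[Double]], b: Array[Double]) =
      matmul(x, w).map(_.zip(b).map { case (o, bi) => o + bi })

    def affineBackward(dout: Array[Array[Double]], x: Array[Array[Double]], w: Array[Array[Double]]) = {
      val dX = matmul(dout, w.transpose)   // (N, D)
      val dW = matmul(x.transpose, dout)   // (D, M)
      val db = dout.transpose.map(_.sum)   // colSums(dout), one value per neuron
      (dX, dW, db)
    }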
+ */ override def backward(dmlScript:StringBuilder, outSuffix:String) = - invokeBackward(dmlScript, outSuffix, List[String](dX, dWeight, dBias), dout, X, weight, bias) + invokeBackward(dmlScript, outSuffix, List[String]("dOut" + id, dWeight, dBias), dout, X, weight, bias) // ------------------------------------------------- + // num_output (c_o): the number of filters def numNeurons = param.getInnerProductParam.getNumOutput.toString def numFeatures = int_mult(bottomLayerOutputShape._1, bottomLayerOutputShape._2, bottomLayerOutputShape._3) + // n * c_o * 1 * 1 override def outputShape = ( param.getInnerProductParam.getNumOutput.toString, "1", "1" ) } @@ -291,11 +662,65 @@ class MaxPooling(val param:LayerParameter, val id:Int, val net:CaffeNetwork) ext // ------------------------------------------------- override def sourceFileName = "max_pool2d_builtin" override def init(dmlScript:StringBuilder) = {} + /* + * Computes the forward pass for a 2D spatial max pooling layer. + * The input data has N examples, each represented as a 3D volume + * unrolled into a single vector. + * + * This implementation uses a built-in operator for higher + * performance. + * + * Inputs: + * - X: Inputs, of shape (N, C*Hin*Win). + * - C: Number of input channels (dimensionality of input depth). + * - Hin: Input height. + * - Win: Input width. + * - Hf: Filter height. + * - Wf: Filter width. + * - strideh: Stride over height. + * - stridew: Stride over width. + * - padh: Padding for top and bottom sides. + * A typical value is 0. + * - padw: Padding for left and right sides. + * A typical value is 0. + * + * Outputs: + * - out: Outputs, of shape (N, C*Hout*Wout). + * - Hout: Output height. + * - Wout: Output width. + */ override def forward(dmlScript:StringBuilder, isPrediction:Boolean) = invokeForward(dmlScript, List[String](out, "ignoreHout_"+id, "ignoreWout_"+id), X, numChannels, Hin, Win, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w) + /* + * Computes the backward pass for a 2D spatial max pooling layer. + * The input data has N examples, each represented as a 3D volume + * unrolled into a single vector. + * + * Inputs: + * - dout: Gradient wrt `out` from upstream, of + * shape (N, C*Hout*Wout). + * - Hout: Output height. + * - Wout: Output width. + * - X: Inputs, of shape (N, C*Hin*Win). + * - C: Number of input channels (dimensionality of input depth). + * - Hin: Input height. + * - Win: Input width. + * - Hf: Filter height. + * - Wf: Filter width. + * - strideh: Stride over height. + * - stridew: Stride over width. + * - padh: Padding for top and bottom sides. + * A typical value is 0. + * - padw: Padding for left and right sides. + * A typical value is 0. + * + * Outputs: + * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win). + */ override def backward(dmlScript:StringBuilder, outSuffix:String) = - invokeBackward(dmlScript, outSuffix, List[String](dX), dout, Hout, Wout, X, numChannels, Hin, Win, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w) + invokeBackward(dmlScript, outSuffix, List[String]("dOut" + id), dout, Hout, Wout, X, numChannels, Hin, Win, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w) + // n * c * h_o * w_o, where h_o and w_o are computed in the same way as convolution. 
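Concretely, the pooling output dimensions follow the same floor formula as convolution, which is what the Hout/Wout helpers delegate to via ConvolutionUtils.getConv2dOutputMap; a small numeric sketch with made-up sizes:

    // Hout = (Hin + 2*padh - Hf) / strideh + 1, and likewise for Wout (integer division).
    def poolOutDim(in: Int, kernel: Int, stride: Int, pad: Int): Int =
      (in + 2 * pad - kernel) / stride + 1

    // e.g. a 2x2 max pool with stride 2 and no padding maps a 28x28 input to 14x14
    val hout = poolOutDim(28, kernel = 2, stride = 2, pad = 0)   // 14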
override def outputShape = ( numChannels, Hout, Wout ) // ------------------------------------------------- def Hin = bottomLayerOutputShape._2 @@ -304,29 +729,129 @@ class MaxPooling(val param:LayerParameter, val id:Int, val net:CaffeNetwork) ext def Wout = ConvolutionUtils.getConv2dOutputMap(bottomLayerOutputShape._3, kernel_w, stride_w, pad_w) def poolingParam = param.getPoolingParam def numChannels = bottomLayerOutputShape._1 + // kernel_size (or kernel_h and kernel_w): specifies height and width of each filter def kernel_h = if(poolingParam.hasKernelH) poolingParam.getKernelH.toString else poolingParam.getKernelSize.toString def kernel_w = if(poolingParam.hasKernelW) poolingParam.getKernelW.toString else poolingParam.getKernelSize.toString + // stride (or stride_h and stride_w) [default 1]: specifies the intervals at which to apply the filters to the input def stride_h = if(poolingParam.hasStrideH) poolingParam.getStrideH.toString - else poolingParam.getStride.toString + else if(poolingParam.hasStride) poolingParam.getStride.toString + else "1" def stride_w = if(poolingParam.hasStrideW) poolingParam.getStrideW.toString - else poolingParam.getStride.toString + else if(poolingParam.hasStride) poolingParam.getStride.toString + else "1" + // pad (or pad_h and pad_w) [default 0]: specifies the number of pixels to (implicitly) add to each side of the input def pad_h = if(poolingParam.hasPadH) poolingParam.getPadH.toString - else poolingParam.getPad.toString + else if(poolingParam.hasPad) poolingParam.getPad.toString + else "0" def pad_w = if(poolingParam.hasPadW) poolingParam.getPadW.toString - else poolingParam.getPad.toString + else if(poolingParam.hasPad) poolingParam.getPad.toString + else "0" } class Convolution(val param:LayerParameter, val id:Int, val net:CaffeNetwork) extends CaffeLayer with HasWeight with HasBias { // ------------------------------------------------- override def sourceFileName = "conv2d_builtin"; + /* + * Initialize the parameters of this layer. + * + * Note: This is just a convenience function, and parameters + * may be initialized manually if needed. + * + * We use the heuristic by He et al., which limits the magnification + * of inputs/gradients during forward/backward passes by scaling + * unit-Gaussian weights by a factor of sqrt(2/n), under the + * assumption of relu neurons. + * - http://arxiv.org/abs/1502.01852 + * + * Inputs: + * - F: Number of filters. + * - C: Number of input channels (dimensionality of depth). + * - Hf: Filter height. + * - Wf: Filter width. + * + * Outputs: + * - W: Weights, of shape (F, C*Hf*Wf). + * - b: Biases, of shape (F, 1). + */ override def init(dmlScript:StringBuilder) = invokeInit(dmlScript, List[String](weight, bias), numKernels, numChannels, kernel_h, kernel_w) + /* + * Computes the forward pass for a 2D spatial convolutional layer with + * F filters. The input data has N examples, each represented as a 3D + * volume unrolled into a single vector. + * + * This implementation uses a built-in operator for higher + * performance. + * + * Inputs: + * - X: Inputs, of shape (N, C*Hin*Win). + * - W: Weights, of shape (F, C*Hf*Wf). + * - b: Biases, of shape (F, 1). + * - C: Number of input channels (dimensionality of depth). + * - Hin: Input height. + * - Win: Input width. + * - Hf: Filter height. + * - Wf: Filter width. + * - strideh: Stride over height. + * - stridew: Stride over width. + * - padh: Padding for top and bottom sides. + * For same output height as input, set `padh = (Hf - 1) / 2`, + * assuming `strideh = 1`. 
+ * More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2` + * preserves the spatial dimensions of the input. + * - padw: Padding for left and right sides. + * For same output width as input, set `padw = (Wf - 1) / 2`, + * assuming `stridew = 1`. + * More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2` + * preserves the spatial dimensions of the input. + * + * Outputs: + * - out: Outputs, of shape (N, F*Hout*Wout). + * - Hout: Output height. + * - Wout: Output width. + */ override def forward(dmlScript:StringBuilder, isPrediction:Boolean) = invokeForward(dmlScript, List[String](out, "ignoreHout_"+id, "ignoreWout_"+id), X, weight, bias, numChannels, Hin, Win, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w) + /* + * Computes the backward pass for a 2D spatial convolutional layer + * with F filters. + * + * Inputs: + * - dout: Gradient wrt `out` from upstream, of + * shape (N, F*Hout*Wout). + * - Hout: Output height. + * - Wout: Output width. + * - X: Inputs, of shape (N, C*Hin*Win). + * - W: Weights, of shape (F, C*Hf*Wf). + * - b: Biases, of shape (F, 1). + * - C: Number of input channels (dimensionality of depth). + * - Hin: Input height. + * - Win: Input width. + * - Hf: Filter height. + * - Wf: Filter width. + * - strideh: Stride over height. + * - stridew: Stride over width. + * - padh: Padding for top and bottom sides. + * For same output height as input, set `padh = (Hf - 1) / 2`, + * assuming `strideh = 1`. + * More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2` + * preserves the spatial dimensions of the input. + * - padw: Padding for left and right sides. + * For same output width as input, set `padw = (Wf - 1) / 2`, + * assuming `stridew = 1`. + * More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2` + * preserves the spatial dimensions of the input. + * + * Outputs: + * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win). + * - dW: Gradient wrt `W`, of shape (F, C*Hf*Wf). + * - db: Gradient wrt `b`, of shape (F, 1). + */ override def backward(dmlScript:StringBuilder, outSuffix:String) = - invokeBackward(dmlScript, outSuffix, List[String](dX, dWeight, dBias), dout, Hout, Wout, X, weight, bias, numChannels, Hin, Win, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w) + invokeBackward(dmlScript, outSuffix, List[String]("dOut" + id, dWeight, dBias), dout, Hout, Wout, X, weight, bias, numChannels, Hin, Win, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w) + // n * c_o * h_o * w_o, where h_o = (h_i + 2 * pad_h - kernel_h) / stride_h + 1 and w_o likewise. 
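As a worked instance of the 'same'-padding note in the doc comments above (sizes are hypothetical):

    // pad = (Hin*(strideh-1) + Hf - strideh) / 2 preserves the spatial size when it divides evenly.
    def samePad(in: Int, kernel: Int, stride: Int): Int = (in * (stride - 1) + kernel - stride) / 2
    def convOutDim(in: Int, kernel: Int, stride: Int, pad: Int): Int =
      (in + 2 * pad - kernel) / stride + 1

    // e.g. a 3x3 filter with stride 1 on a 28x28 input needs pad 1 to stay at 28x28
    val pad  = samePad(28, kernel = 3, stride = 1)                 // 1
    val hout = convOutDim(28, kernel = 3, stride = 1, pad = pad)   // 28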
override def outputShape = ( numKernels, Hout, Wout ) // ------------------------------------------------- def numChannels = bottomLayerOutputShape._1 @@ -334,24 +859,162 @@ class Convolution(val param:LayerParameter, val id:Int, val net:CaffeNetwork) ex def Win = bottomLayerOutputShape._3 def Hout = ConvolutionUtils.getConv2dOutputMap(bottomLayerOutputShape._2, kernel_h, stride_h, pad_h) def Wout = ConvolutionUtils.getConv2dOutputMap(bottomLayerOutputShape._3, kernel_w, stride_w, pad_w) + // ------------------------------------------------- + def convParam = param.getConvolutionParam + // num_output (c_o): the number of filters + def numKernels = convParam.getNumOutput.toString + // kernel_size (or kernel_h and kernel_w): specifies height and width of each filter + def kernel_h = if(convParam.hasKernelH) convParam.getKernelH.toString + else if(convParam.getKernelSizeCount > 0) convParam.getKernelSize(0).toString + else throw new LanguageException("Incorrect kernel parameters") + def kernel_w = if(convParam.hasKernelW) convParam.getKernelW.toString + else if(convParam.getKernelSizeCount > 0) convParam.getKernelSize(0).toString + else throw new LanguageException("Incorrect kernel parameters") + // stride (or stride_h and stride_w) [default 1]: specifies the intervals at which to apply the filters to the input + def stride_h = if(convParam.hasStrideH) convParam.getStrideH.toString + else if(convParam.getStrideCount > 0) convParam.getStride(0).toString + else "1" + def stride_w = if(convParam.hasStrideW) convParam.getStrideW.toString + else if(convParam.getStrideCount > 0) convParam.getStride(0).toString + else "1" + // pad (or pad_h and pad_w) [default 0]: specifies the number of pixels to (implicitly) add to each side of the input + def pad_h = if(convParam.hasPadH) convParam.getPadH.toString + else if(convParam.getPadCount > 0) convParam.getPad(0).toString + else "0" + def pad_w = if(convParam.hasPadW) convParam.getPadW.toString + else if(convParam.getPadCount > 0) convParam.getPad(0).toString + else "0" +} + +class DeConvolution(val param:LayerParameter, val id:Int, val net:CaffeNetwork) extends CaffeLayer with HasWeight with HasBias { + override def sourceFileName: String = "conv2d_transpose" + /* + * Utility function to initialize the parameters of this layer. + * + * We use the heuristic by He et al., which limits the magnification + * of inputs/gradients during forward/backward passes by scaling + * unit-Gaussian weights by a factor of sqrt(2/n), under the + * assumption of relu neurons. + * - http://arxiv.org/abs/1502.01852 + * + * Inputs: + * - F: Number of filters. + * - C: Number of input channels (dimensionality of depth). + * - Hf: Filter height. + * - Wf: Filter width. + * + * Outputs: + * - W: Weights, of shape (F, C*Hf*Wf). + * - b: Biases, of shape (F, 1). + */ + override def init(dmlScript: StringBuilder): Unit = + invokeInit(dmlScript, List[String](weight, bias), numKernels, numChannels, kernel_h, kernel_w) + + /* + * Computes the forward pass for a 2D spatial transpose convolutional + * layer with F filters. The input data has N examples, each + * represented as a 3D tensor flattened into a single vector. + * + * Inputs: + * - X: Inputs, of shape (N, C*Hin*Win). + * - W: Weights, of shape (F, C*Hf*Wf). + * - b: Biases, of shape (F, 1). + * - C: Number of input channels (dimensionality of depth). + * - Hin: Input height. + * - Win: Input width. + * - Hf: Filter height. + * - Wf: Filter width. + * - strideh: Stride over height. + * - stridew: Stride over width. 
+ * - padh: Padding for top and bottom sides. + * - padw: Padding for left and right sides. + * - out_padh: extra padding for top side. This should + * lie in [0, strideh-1]. + * - out_padw: extra padding for right side. This should + * lie in [0, stridew-1]. + * + * Outputs: + * - out: Outputs, of shape (N, F*Hout*Wout). + * - Hout: Output height. + * - Wout: Output width. + */ + override def forward(dmlScript: StringBuilder,isPrediction: Boolean): Unit = + invokeForward(dmlScript, List[String](out, "ignoreHout_"+id, "ignoreWout_"+id), + X, weight, bias, numChannels, Hin, Win, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, "0", "0") + + /* + * Computes the backward pass for a 2D spatial transpose + * convolutional layer with F filters. + * + * Inputs: + * - dout: Gradient wrt `out` from upstream, of + * shape (N, F*Hout*Wout). + * - Hout: Output height. + * - Wout: Output width. + * - X: Inputs, of shape (N, C*Hin*Win). + * - W: Weights, of shape (F, C*Hf*Wf). + * - b: Biases, of shape (F, 1). + * - C: Number of input channels (dimensionality of depth). + * - Hin: Input height. + * - Win: Input width. + * - Hf: Filter height. + * - Wf: Filter width. + * - strideh: Stride over height. + * - stridew: Stride over width. + * - padh: Padding for top and bottom sides. + * - padw: Padding for left and right sides. + * + * Outputs: + * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win). + * - dW: Gradient wrt `W`, of shape (F, C*Hf*Wf). + * - db: Gradient wrt `b`, of shape (F, 1). + */ + override def backward(dmlScript:StringBuilder, outSuffix:String) = + invokeBackward(dmlScript, outSuffix, List[String]("dOut" + id, dWeight, dBias), + dout, Hout, Wout, X, weight, bias, numChannels, Hin, Win, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w) + // n * c_o * h_o * w_o, where h_o = (h_i + 2 * pad_h - kernel_h) / stride_h + 1 and w_o likewise. 
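For the transpose convolution, the Hout/Wout definitions just below compute the upsampled shape Hout = strideh * (Hin - 1) - 2*padh + Hf (out_padh is passed as 0 in forward above); a small numeric sketch with hypothetical sizes:

    // Hout = strideh * (Hin - 1) - 2*padh + Hf + out_padh, as in the Hout/Wout helpers below.
    def deconvOutDim(in: Int, kernel: Int, stride: Int, pad: Int, outPad: Int = 0): Int =
      stride * (in - 1) - 2 * pad + kernel + outPad

    // e.g. stride 2 with a 2x2 filter and no padding upsamples a 14x14 input to 28x28
    val hout = deconvOutDim(14, kernel = 2, stride = 2, pad = 0)   // 28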
+ override def outputShape = ( numChannels, Hout, Wout ) + // ------------------------------------------------- + def numChannels = bottomLayerOutputShape._1 + def Hin = bottomLayerOutputShape._2 + def Win = bottomLayerOutputShape._3 + // Hout = strideh * (Hin-1) - 2*padh + Hf + out_padh + def Hout:String = try { + (stride_h.toInt * (Hin.toInt-1) - 2*pad_h.toInt + kernel_h.toInt).toString() + } + catch { + case _:Throwable => stride_h + " * " + "(" + Hin + "-1) - 2*" + pad_h + " + " + kernel_h + } + // Wout = stridew * (Win-1) - 2*padw + Wf + out_padw + def Wout:String = try { + (stride_w.toInt * (Win.toInt-1) - 2*pad_w.toInt + kernel_w.toInt).toString() + } + catch { + case _:Throwable => stride_w + " * " + "(" + Win + "-1) - 2*" + pad_w + " + " + kernel_w + } + // ------------------------------------------------- def convParam = param.getConvolutionParam + // num_output (c_o): the number of filters def numKernels = convParam.getNumOutput.toString + // kernel_size (or kernel_h and kernel_w): specifies height and width of each filter def kernel_h = if(convParam.hasKernelH) convParam.getKernelH.toString else if(convParam.getKernelSizeCount > 0) convParam.getKernelSize(0).toString else throw new LanguageException("Incorrect kernel parameters") def kernel_w = if(convParam.hasKernelW) convParam.getKernelW.toString else if(convParam.getKernelSizeCount > 0) convParam.getKernelSize(0).toString else throw new LanguageException("Incorrect kernel parameters") + // stride (or stride_h and stride_w) [default 1]: specifies the intervals at which to apply the filters to the input def stride_h = if(convParam.hasStrideH) convParam.getStrideH.toString else if(convParam.getStrideCount > 0) convParam.getStride(0).toString - else throw new LanguageException("Incorrect stride parameters:" + convParam.getStrideH + " " + convParam.getStrideList + " " + param.getName) + else "1" def stride_w = if(convParam.hasStrideW) convParam.getStrideW.toString else if(convParam.getStrideCount > 0) convParam.getStride(0).toString - else throw new LanguageException("Incorrect stride parameters") + else "1" + // pad (or pad_h and pad_w) [default 0]: specifies the number of pixels to (implicitly) add to each side of the input def pad_h = if(convParam.hasPadH) convParam.getPadH.toString else if(convParam.getPadCount > 0) convParam.getPad(0).toString - else throw new LanguageException("Incorrect pad parameters") + else "0" def pad_w = if(convParam.hasPadW) convParam.getPadW.toString else if(convParam.getPadCount > 0) convParam.getPad(0).toString - else throw new LanguageException("Incorrect pad parameters") + else "0" } \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/7ed8e3f4/src/main/scala/org/apache/sysml/api/dl/CaffeNetwork.scala ---------------------------------------------------------------------- diff --git a/src/main/scala/org/apache/sysml/api/dl/CaffeNetwork.scala b/src/main/scala/org/apache/sysml/api/dl/CaffeNetwork.scala index 73490b4..c106cb7 100644 --- a/src/main/scala/org/apache/sysml/api/dl/CaffeNetwork.scala +++ b/src/main/scala/org/apache/sysml/api/dl/CaffeNetwork.scala @@ -125,6 +125,18 @@ class CaffeNetwork(netFilePath:String, val currentPhase:Phase, } else l }) + + // Condition 5: Deal with incorrect naming + // Example: layer { name: foo, bottom: arbitrary, top: bar } ... 
Rename the layer to bar + private def isIncorrectNamingLayer(l:LayerParameter): Boolean = l.getTopCount == 1 && !l.getTop(0).equalsIgnoreCase(l.getName) + _caffeLayerParams = _caffeLayerParams.map(l => { + if(isIncorrectNamingLayer(l)) { + val builder = l.toBuilder(); + builder.setName(l.getTop(0)) + builder.build() + } + else l + }) // -------------------------------------------------------------------------------- @@ -174,6 +186,8 @@ class CaffeNetwork(netFilePath:String, val currentPhase:Phase, case "batchnorm" => new BatchNorm(param, id, this) case "scale" => new Scale(param, id, this) case "eltwise" => new Elementwise(param, id, this) + case "concat" => new Concat(param, id, this) + case "deconvolution" => new DeConvolution(param, id, this) case _ => throw new LanguageException("Layer of type " + param.getType + " is not supported") } } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/7ed8e3f4/src/main/scala/org/apache/sysml/api/dl/DMLGenerator.scala ---------------------------------------------------------------------- diff --git a/src/main/scala/org/apache/sysml/api/dl/DMLGenerator.scala b/src/main/scala/org/apache/sysml/api/dl/DMLGenerator.scala index ec4269a..668d996 100644 --- a/src/main/scala/org/apache/sysml/api/dl/DMLGenerator.scala +++ b/src/main/scala/org/apache/sysml/api/dl/DMLGenerator.scala @@ -74,6 +74,9 @@ trait BaseDMLGenerator { dmlScript.append("\n") } def invoke(dmlScript:StringBuilder, namespace1:String, returnVariables:List[String], functionName:String, arguments:List[String]):Unit = { + invoke(dmlScript, namespace1, returnVariables, functionName, arguments, true) + } + def invoke(dmlScript:StringBuilder, namespace1:String, returnVariables:List[String], functionName:String, arguments:List[String], appendNewLine:Boolean):Unit = { if(returnVariables.length == 0) throw new DMLRuntimeException("User-defined functions should have atleast one return value") if(returnVariables.length > 1) dmlScript.append("[") dmlScript.append(returnVariables(0)) @@ -96,10 +99,15 @@ trait BaseDMLGenerator { } } } - dmlScript.append(")\n") + dmlScript.append(")") + if(appendNewLine) + dmlScript.append("\n") + } + def invoke(dmlScript:StringBuilder, namespace1:String, returnVariables:List[String], functionName:String, appendNewLine:Boolean, arguments:String*):Unit = { + invoke(dmlScript, namespace1, returnVariables, functionName, arguments.toList, appendNewLine) } def invoke(dmlScript:StringBuilder, namespace1:String, returnVariables:List[String], functionName:String, arguments:String*):Unit = { - invoke(dmlScript, namespace1, returnVariables, functionName, arguments.toList) + invoke(dmlScript, namespace1, returnVariables, functionName, arguments.toList, true) } def rightIndexing(dmlScript:StringBuilder, varName:String, rl:String, ru:String, cl:String, cu:String):StringBuilder = { dmlScript.append(varName).append("[") @@ -244,7 +252,7 @@ trait VisualizeDMLGenerator extends TabbedDMLGenerator { case "dweight" => l.dWeight case "dbias" => l.dBias case "output" => l.out - case "doutput" => l.dX + // case "doutput" => l.dX case _ => throw new DMLRuntimeException("Cannot visualize the variable of type:" + varType) } }
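Tying back to the Concat layer added in CaffeLayer.scala above: its backward pass splits the upstream gradient by maintaining running start/end indices. A hedged sketch of the same idea for axis = 0 with plain Scala arrays (the generated DML does this via dOut[concat_start_index : concat_end_index, ] slices; names here are illustrative only):

    // Split the concatenated gradient back into one block per bottom layer (row-wise, axis = 0).
    def deconcatRows(dout: Array[Array[Double]], rowsPerInput: Seq[Int]): Seq[Array[Array[Double]]] = {
      var end = 0
      rowsPerInput.map { n =>
        val start = end              // concat_start_index = concat_end_index + 1 in the DML
        end = start + n              // concat_end_index  = concat_start_index + nrow(input) - 1
        dout.slice(start, end)
      }
    }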