Repository: systemml
Updated Branches:
  refs/heads/master 47973a905 -> 6af1df01d
[SYSTEMML-540] Low-rank fully connected layer This layer has three advantages over the affine layer: - It has significantly lower memory requirement than affine layer making it ideal for devices such as GPUs. - It implicitly avoids overfitting by minimizing the number of parameters in the neural network. - It can exploit sparsity-aware fused operators. Closes #720. Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/6af1df01 Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/6af1df01 Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/6af1df01 Branch: refs/heads/master Commit: 6af1df01d5ebe4c37025e06f7d839a2b2ee7d34b Parents: 47973a9 Author: Niketan Pansare <[email protected]> Authored: Wed Jan 24 10:32:15 2018 -0800 Committer: Niketan Pansare <[email protected]> Committed: Wed Jan 24 10:32:15 2018 -0800 ---------------------------------------------------------------------- scripts/nn/layers/low_rank_affine.dml | 109 +++++++++++++++++++ scripts/nn/test/grad_check.dml | 100 +++++++++++++++++ scripts/nn/test/run_tests.dml | 1 + src/main/proto/caffe/caffe.proto | 2 + .../org/apache/sysml/api/dl/Caffe2DML.scala | 20 +++- .../org/apache/sysml/api/dl/CaffeLayer.scala | 33 ++++-- .../org/apache/sysml/api/dl/CaffeSolver.scala | 39 +++++++ .../org/apache/sysml/api/dl/DMLGenerator.scala | 1 + .../scala/org/apache/sysml/api/dl/Utils.scala | 1 + 9 files changed, 294 insertions(+), 12 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/6af1df01/scripts/nn/layers/low_rank_affine.dml ---------------------------------------------------------------------- diff --git a/scripts/nn/layers/low_rank_affine.dml b/scripts/nn/layers/low_rank_affine.dml new file mode 100644 index 0000000..32293f9 --- /dev/null +++ b/scripts/nn/layers/low_rank_affine.dml @@ -0,0 +1,109 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +/* + * Low-rank Affine (fully-connected) layer. + * + * This layer has three advantages over the affine layer: + * 1. It has significantly lower memory requirement than affine layer making it ideal for devices such as GPUs. + * 2. It implicitly avoids overfitting by minimizing the number of parameters in the neural network. + * 3. It can exploit sparsity-aware fused operators. + */ + +forward = function(matrix[double] X, matrix[double] U, matrix[double] V, matrix[double] b) + return (matrix[double] out) { + /* + * Computes the forward pass for a low-rank affine (fully-connected) layer + * with M neurons. The input data has N examples, each with D + * features. 
+ * + * Inputs: + * - X: Inputs, of shape (N, D). + * - U: LHS factor matrix for weights, of shape (D, R). + * - V: RHS factor matrix for weights, of shape (R, M). + * - b: Biases, of shape (1, M). + * + * Outputs: + * - out: Outputs, of shape (N, M). + */ + out = X %*% U %*% V + b +} + +backward = function(matrix[double] dout, matrix[double] X, + matrix[double] U, matrix[double] V, matrix[double] b) + return (matrix[double] dX, matrix[double] dU, matrix[double] dV, matrix[double] db) { + /* + * Computes the backward pass for a low-rank fully-connected (affine) layer + * with M neurons. + * + * Inputs: + * - dout: Gradient wrt `out` from upstream, of shape (N, M). + * - X: Inputs, of shape (N, D). + * - U: LHS factor matrix for weights, of shape (D, R). + * - V: RHS factor matrix for weights, of shape (R, M). + * - b: Biases, of shape (1, M). + * + * Outputs: + * - dX: Gradient wrt `X`, of shape (N, D). + * - dU: Gradient wrt `U`, of shape (D, R). + * - dV: Gradient wrt `V`, of shape (R, M). + * - db: Gradient wrt `b`, of shape (1, M). + */ + dX = dout %*% t(V) %*% t(U) + + # If out = Z %*% L, then dL = t(Z) %*% dout + # Substituting Z = X %*% U and L = V, we get + dV = t(U) %*% t(X) %*% dout + + dU = t(X) %*% dout %*% t(V) + + db = colSums(dout) +} + +init = function(int D, int M, int R) + return (matrix[double] U, matrix[double] V, matrix[double] b) { + /* + * Initialize the parameters of this layer. + * + * Note: This is just a convenience function, and parameters + * may be initialized manually if needed. + * + * We use the heuristic by He et al., which limits the magnification + * of inputs/gradients during forward/backward passes by scaling + * unit-Gaussian weights by a factor of sqrt(2/n), under the + * assumption of relu neurons. + * - http://arxiv.org/abs/1502.01852 + * + * Inputs: + * - D: Dimensionality of the input features (number of features). + * - M: Number of neurons in this layer. + * - R: Rank of U,V matrices such that R << min(D, M). + * + * Outputs: + * - U: LHS factor matrix for weights, of shape (D, R). + * - V: RHS factor matrix for weights, of shape (R, M). + * - b: Biases, of shape (1, M). + */ + U = rand(rows=D, cols=R, pdf="normal") * sqrt(2.0/D) + V = rand(rows=R, cols=M, pdf="normal") * sqrt(2.0/R) + b = matrix(0, rows=1, cols=M) +} + http://git-wip-us.apache.org/repos/asf/systemml/blob/6af1df01/scripts/nn/test/grad_check.dml ---------------------------------------------------------------------- diff --git a/scripts/nn/test/grad_check.dml b/scripts/nn/test/grad_check.dml index 41a8bc6..9c551b8 100644 --- a/scripts/nn/test/grad_check.dml +++ b/scripts/nn/test/grad_check.dml @@ -23,6 +23,7 @@ * Gradient checks for various architectures. */ source("nn/layers/affine.dml") as affine +source("nn/layers/low_rank_affine.dml") as low_rank_affine source("nn/layers/batch_norm1d.dml") as batch_norm1d source("nn/layers/batch_norm2d.dml") as batch_norm2d source("nn/layers/conv2d.dml") as conv2d @@ -133,6 +134,105 @@ affine = function() { } } +low_rank_affine = function() { + /* + * Gradient check for the low rank affine layer. 
+ */ + print("Grad checking the low rank affine layer with L2 loss.") + + # Generate data + N = 3 # num examples + D = 100 # num features + M = 10 # num neurons + R = 2 # rank + X = rand(rows=N, cols=D) + y = rand(rows=N, cols=M) + [U, V, b] = low_rank_affine::init(D, M, R) + + # Compute analytical gradients of loss wrt parameters + out = low_rank_affine::forward(X, U, V, b) + dout = l2_loss::backward(out, y) + [dX, dU, dV, db] = low_rank_affine::backward(dout, X, U, V, b) + + # Grad check + h = 1e-5 + print(" - Grad checking X.") + for (i in 1:nrow(X)) { + for (j in 1:ncol(X)) { + # Compute numerical derivative + old = as.scalar(X[i,j]) + X[i,j] = old - h + outmh = low_rank_affine::forward(X, U, V, b) + lossmh = l2_loss::forward(outmh, y) + X[i,j] = old + h + outph = low_rank_affine::forward(X, U, V, b) + lossph = l2_loss::forward(outph, y) + X[i,j] = old # reset + dX_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh) + } + } + + print(" - Grad checking U.") + for (i in 1:nrow(U)) { + for (j in 1:ncol(U)) { + # Compute numerical derivative + old = as.scalar(U[i,j]) + U[i,j] = old - h + outmh = low_rank_affine::forward(X, U, V, b) + lossmh = l2_loss::forward(outmh, y) + U[i,j] = old + h + outph = low_rank_affine::forward(X, U, V, b) + lossph = l2_loss::forward(outph, y) + U[i,j] = old # reset + dU_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(dU[i,j]), dU_num, lossph, lossmh) + } + } + + print(" - Grad checking V.") + for (i in 1:nrow(V)) { + for (j in 1:ncol(V)) { + # Compute numerical derivative + old = as.scalar(V[i,j]) + V[i,j] = old - h + outmh = low_rank_affine::forward(X, U, V, b) + lossmh = l2_loss::forward(outmh, y) + V[i,j] = old + h + outph = low_rank_affine::forward(X, U, V, b) + lossph = l2_loss::forward(outph, y) + V[i,j] = old # reset + dV_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(dV[i,j]), dV_num, lossph, lossmh) + } + } + + print(" - Grad checking b.") + for (i in 1:nrow(b)) { + for (j in 1:ncol(b)) { + # Compute numerical derivative + old = as.scalar(b[i,j]) + b[i,j] = old - h + outmh = low_rank_affine::forward(X, U, V, b) + lossmh = l2_loss::forward(outmh, y) + b[i,j] = old + h + outph = low_rank_affine::forward(X, U, V, b) + lossph = l2_loss::forward(outph, y) + b[i,j] = old # reset + db_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh) + } + } +} + batch_norm1d = function() { /* * Gradient check for the 1D batch normalization layer. 
http://git-wip-us.apache.org/repos/asf/systemml/blob/6af1df01/scripts/nn/test/run_tests.dml ---------------------------------------------------------------------- diff --git a/scripts/nn/test/run_tests.dml b/scripts/nn/test/run_tests.dml index 27d6a4a..f70701b 100644 --- a/scripts/nn/test/run_tests.dml +++ b/scripts/nn/test/run_tests.dml @@ -41,6 +41,7 @@ print("") # Core layers grad_check::affine() +grad_check::low_rank_affine() grad_check::batch_norm1d() grad_check::batch_norm2d() grad_check::conv2d() http://git-wip-us.apache.org/repos/asf/systemml/blob/6af1df01/src/main/proto/caffe/caffe.proto ---------------------------------------------------------------------- diff --git a/src/main/proto/caffe/caffe.proto b/src/main/proto/caffe/caffe.proto index 444b2f9..c5a5799 100644 --- a/src/main/proto/caffe/caffe.proto +++ b/src/main/proto/caffe/caffe.proto @@ -833,6 +833,8 @@ message InnerProductParameter { // of the weight matrix. The weight matrix itself is not going to be transposed // but rather the transfer flag of operations will be toggled accordingly. optional bool transpose = 6 [default = false]; + + optional int32 rank = 7 [default = 0]; // rank of U, V matrices } message InputParameter { http://git-wip-us.apache.org/repos/asf/systemml/blob/6af1df01/src/main/scala/org/apache/sysml/api/dl/Caffe2DML.scala ---------------------------------------------------------------------- diff --git a/src/main/scala/org/apache/sysml/api/dl/Caffe2DML.scala b/src/main/scala/org/apache/sysml/api/dl/Caffe2DML.scala index 5d17a4d..9f75008 100644 --- a/src/main/scala/org/apache/sysml/api/dl/Caffe2DML.scala +++ b/src/main/scala/org/apache/sysml/api/dl/Caffe2DML.scala @@ -313,7 +313,11 @@ class Caffe2DML(val sc: SparkContext, val numLayerInput = if(!l.isInstanceOf[Data]) l.bottomLayerOutputShape._1.toLong * l.bottomLayerOutputShape._2.toLong * l.bottomLayerOutputShape._3.toLong * batchSize else 0 val numLayerOutput = l.outputShape._1.toLong * l.outputShape._2.toLong * l.outputShape._3.toLong * batchSize val numLayerError = numLayerOutput - val numLayerWeights = if(l.weightShape != null) l.weightShape()(0).toLong * l.weightShape()(1).toLong else 0 + val numLayerWeights = if(l.weightShape != null) { + val nWt = l.weightShape()(0).toLong * l.weightShape()(1).toLong + if(l.extraWeightShape != null) l.extraWeightShape()(0).toLong * l.extraWeightShape()(1).toLong + nWt + else nWt + } else 0 val numLayerBias = if(l.biasShape != null)l.biasShape()(0).toLong * l.biasShape()(1).toLong else 0 val numLayerGradients = (numLayerWeights + numLayerBias) * batchSize if(isTraining) (numLayerInput + numLayerOutput + numLayerError + numLayerWeights + numLayerBias + numLayerGradients)*Double.BYTES @@ -337,7 +341,11 @@ class Caffe2DML(val sc: SparkContext, (l._1, layer.param.getType, "(, " + layer.outputShape._1 + ", " + layer.outputShape._2 + ", " + layer.outputShape._3 + ")", - if (layer.weightShape != null) "[" + layer.weightShape()(0) + " X " + layer.weightShape()(1) + "]" else "", + if (layer.weightShape != null) { + val wShapes = "[" + layer.weightShape()(0) + " X " + layer.weightShape()(1) + "]" + if (layer.extraWeightShape != null) wShapes + ", " + "[" + layer.extraWeightShape()(0) + " X " + layer.extraWeightShape()(1) + "]" + else wShapes + } else "", if (layer.biasShape != null) "[" + layer.biasShape()(0) + " X " + layer.biasShape()(1) + "]" else "", layer.param.getTopList.mkString(","), layer.param.getBottomList.mkString(","), @@ -421,6 +429,7 @@ class Caffe2DML(val sc: SparkContext, // Set input/output variables and 
execute the script val script = dml(trainingScript).in(inputs) net.getLayers.map(net.getCaffeLayer(_)).filter(_.weight != null).map(l => script.out(l.weight)) + net.getLayers.map(net.getCaffeLayer(_)).filter(_.extraWeight != null).map(l => script.out(l.extraWeight)) net.getLayers.map(net.getCaffeLayer(_)).filter(_.bias != null).map(l => script.out(l.bias)) setDebugFlags(false) @@ -667,6 +676,7 @@ class Caffe2DML(val sc: SparkContext, tabDMLScript.append("snapshot_dir= \"" + solverParam.getSnapshotPrefix + "\" + \"/iter_\" + iter + \"/\"\n") val allLayers = net.getLayers.map(net.getCaffeLayer(_)) allLayers.filter(_.weight != null).map(l => appendSnapshotWrite(l.weight, l.param.getName + "_weight.mtx")) + allLayers.filter(_.extraWeight != null).map(l => appendSnapshotWrite(l.extraWeight, l.param.getName + "_extra_weight.mtx")) allLayers.filter(_.bias != null).map(l => appendSnapshotWrite(l.bias, l.param.getName + "_bias.mtx")) } } @@ -689,6 +699,7 @@ class Caffe2DML(val sc: SparkContext, .map(layer => net.getCaffeLayer(layer)) .map(l => { if (l.shouldUpdateWeight) assign(tabDMLScript, l.dWeight + "_agg", matrix("0", parallel_batches, multiply(nrow(l.weight), ncol(l.weight)))) + if (l.shouldUpdateExtraWeight) assign(tabDMLScript, l.dExtraWeight + "_agg", matrix("0", parallel_batches, multiply(nrow(l.extraWeight), ncol(l.extraWeight)))) if (l.shouldUpdateBias) assign(tabDMLScript, l.dBias + "_agg", matrix("0", parallel_batches, multiply(nrow(l.bias), ncol(l.bias)))) }) } @@ -701,6 +712,7 @@ class Caffe2DML(val sc: SparkContext, .map(layer => net.getCaffeLayer(layer)) .map(l => { if (l.shouldUpdateWeight) assign(tabDMLScript, l.dWeight + "_agg[j,]", matrix(l.dWeight, "1", multiply(nrow(l.weight), ncol(l.weight))) + " * weighting") + if (l.shouldUpdateExtraWeight) assign(tabDMLScript, l.dExtraWeight + "_agg[j,]", matrix(l.dExtraWeight, "1", multiply(nrow(l.extraWeight), ncol(l.extraWeight))) + " * weighting") if (l.shouldUpdateWeight) assign(tabDMLScript, l.dBias + "_agg[j,]", matrix(l.dBias, "1", multiply(nrow(l.bias), ncol(l.bias))) + " * weighting") }) } @@ -710,6 +722,7 @@ class Caffe2DML(val sc: SparkContext, .map(layer => net.getCaffeLayer(layer)) .map(l => { if (l.shouldUpdateWeight) assign(tabDMLScript, l.dWeight, matrix(colSums(l.dWeight + "_agg"), nrow(l.weight), ncol(l.weight))) + if (l.shouldUpdateExtraWeight) assign(tabDMLScript, l.dExtraWeight, matrix(colSums(l.dExtraWeight + "_agg"), nrow(l.extraWeight), ncol(l.extraWeight))) if (l.shouldUpdateWeight) assign(tabDMLScript, l.dBias, matrix(colSums(l.dBias + "_agg"), nrow(l.bias), ncol(l.bias))) }) } @@ -744,7 +757,7 @@ class Caffe2DMLModel(val numClasses: String, val sc: SparkContext, val solver: C def modelVariables(): List[String] = { val allLayers = net.getLayers.map(net.getCaffeLayer(_)) - allLayers.filter(_.weight != null).map(_.weight) ++ allLayers.filter(_.bias != null).map(_.bias) + allLayers.filter(_.weight != null).map(_.weight) ++ allLayers.filter(_.extraWeight != null).map(_.extraWeight) ++ allLayers.filter(_.bias != null).map(_.bias) } // ================================================================================================ @@ -850,6 +863,7 @@ class Caffe2DMLModel(val numClasses: String, val sc: SparkContext, val solver: C if (estimator.mloutput != null) { // fit was called net.getLayers.map(net.getCaffeLayer(_)).filter(_.weight != null).map(l => script.in(l.weight, estimator.mloutput.getMatrix(l.weight))) + net.getLayers.map(net.getCaffeLayer(_)).filter(_.extraWeight != null).map(l => script.in(l.extraWeight, 
estimator.mloutput.getMatrix(l.extraWeight))) net.getLayers.map(net.getCaffeLayer(_)).filter(_.bias != null).map(l => script.in(l.bias, estimator.mloutput.getMatrix(l.bias))) } http://git-wip-us.apache.org/repos/asf/systemml/blob/6af1df01/src/main/scala/org/apache/sysml/api/dl/CaffeLayer.scala ---------------------------------------------------------------------- diff --git a/src/main/scala/org/apache/sysml/api/dl/CaffeLayer.scala b/src/main/scala/org/apache/sysml/api/dl/CaffeLayer.scala index 37b585f..dd8d137 100644 --- a/src/main/scala/org/apache/sysml/api/dl/CaffeLayer.scala +++ b/src/main/scala/org/apache/sysml/api/dl/CaffeLayer.scala @@ -90,14 +90,19 @@ trait CaffeLayer extends BaseDMLGenerator { // -------------------------------------------------------------------------------------- // No need to override these methods in subclasses, instead classes that have weights and biases // should implement HasWeight and HasBias traits. - def dWeight(): String = throw new DMLRuntimeException("dWeight is not implemented in super class") - def dBias(): String = throw new DMLRuntimeException("dBias is not implemented in super class") def weight(): String = null; def weightShape(): Array[Int]; + def dWeight(): String = throw new DMLRuntimeException("dWeight is not implemented in super class") + def shouldUpdateWeight(): Boolean = if (weight != null) true else false def bias(): String = null; def biasShape(): Array[Int]; - def shouldUpdateWeight(): Boolean = if (weight != null) true else false + def dBias(): String = throw new DMLRuntimeException("dBias is not implemented in super class") def shouldUpdateBias(): Boolean = if (bias != null) true else false + + def extraWeight(): String = null; + def extraWeightShape(): Array[Int] = null; + def dExtraWeight(): String = throw new DMLRuntimeException("dExtraWeight is not implemented in super class") + def shouldUpdateExtraWeight():Boolean = if(extraWeight != null) true else false // -------------------------------------------------------------------------------------- // Helper methods to simplify the code of subclasses def invokeInit(dmlScript: StringBuilder, returnVariables: List[String], arguments: String*): Unit = @@ -818,7 +823,8 @@ class Dropout(val param: LayerParameter, val id: Int, val net: CaffeNetwork) ext class InnerProduct(val param: LayerParameter, val id: Int, val net: CaffeNetwork) extends CaffeLayer with HasWeight with HasBias { // ------------------------------------------------- // TODO: bias_filler [default type: 'constant' value: 0]; bias_term [default true]: specifies whether to learn and apply a set of additive biases to the filter outputs - override def sourceFileName = "affine" + val isLowRank = param.getInnerProductParam.hasRank && param.getInnerProductParam.getRank > 0 + override def sourceFileName = if(isLowRank) "low_rank_affine" else "affine" /* * Initialize the parameters of this layer. * @@ -839,7 +845,10 @@ class InnerProduct(val param: LayerParameter, val id: Int, val net: CaffeNetwork * - W: Weights, of shape (D, M). * - b: Biases, of shape (1, M). 
*/ - override def init(dmlScript: StringBuilder) = invokeInit(dmlScript, List[String](weight, bias), numFeatures, numNeurons) + override def init(dmlScript: StringBuilder) = { + if(isLowRank) invokeInit(dmlScript, List[String](weight, extraWeight, bias), numFeatures, numNeurons, param.getInnerProductParam.getRank.toString) + else invokeInit(dmlScript, List[String](weight, bias), numFeatures, numNeurons) + } /* * Computes the forward pass for an affine (fully-connected) layer * with M neurons. The input data has N examples, each with D @@ -857,7 +866,8 @@ class InnerProduct(val param: LayerParameter, val id: Int, val net: CaffeNetwork if(debugLayer && caffe2dmlObj != null && !caffe2dmlObj.containsParfor) { dmlScript.append("assert(ncol(" + X + ") == nrow(" + weight + ") | ncol(" + weight + ") == ncol(" + bias + ")); ") } - invokeForward(dmlScript, List[String](out), X, weight, bias) + if(isLowRank) invokeForward(dmlScript, List[String](out), X, weight, extraWeight, bias) + else invokeForward(dmlScript, List[String](out), X, weight, bias) } /* @@ -875,8 +885,10 @@ class InnerProduct(val param: LayerParameter, val id: Int, val net: CaffeNetwork * - dW: Gradient wrt `W`, of shape (D, M). * - db: Gradient wrt `b`, of shape (1, M). */ - override def backward(dmlScript: StringBuilder, outSuffix: String) = - invokeBackward(dmlScript, outSuffix, List[String]("dOut" + id, dWeight, dBias), dout, X, weight, bias) + override def backward(dmlScript: StringBuilder, outSuffix: String) = { + if(isLowRank) invokeBackward(dmlScript, outSuffix, List[String]("dOut" + id, dWeight, dExtraWeight, dBias), dout, X, weight, extraWeight, bias) + else invokeBackward(dmlScript, outSuffix, List[String]("dOut" + id, dWeight, dBias), dout, X, weight, bias) + } // ------------------------------------------------- // num_output (c_o): the number of filters @@ -884,8 +896,11 @@ class InnerProduct(val param: LayerParameter, val id: Int, val net: CaffeNetwork def numFeatures = int_mult(bottomLayerOutputShape._1, bottomLayerOutputShape._2, bottomLayerOutputShape._3) // n * c_o * 1 * 1 override def outputShape = (param.getInnerProductParam.getNumOutput.toString, "1", "1") - override def weightShape(): Array[Int] = Array(numFeatures.toInt, numNeurons.toInt) + override def weightShape(): Array[Int] = if(isLowRank) Array(numFeatures.toInt, param.getInnerProductParam.getRank) else Array(numFeatures.toInt, numNeurons.toInt) override def biasShape(): Array[Int] = Array(1, numNeurons.toInt) + override def extraWeight(): String = if(isLowRank) weight + "_extra" else null + override def extraWeightShape(): Array[Int] = if(isLowRank) Array(param.getInnerProductParam.getRank, numNeurons.toInt) else null + override def dExtraWeight(): String = if(isLowRank) dWeight + "_extra" else null } http://git-wip-us.apache.org/repos/asf/systemml/blob/6af1df01/src/main/scala/org/apache/sysml/api/dl/CaffeSolver.scala ---------------------------------------------------------------------- diff --git a/src/main/scala/org/apache/sysml/api/dl/CaffeSolver.scala b/src/main/scala/org/apache/sysml/api/dl/CaffeSolver.scala index d0d738e..8559c60 100644 --- a/src/main/scala/org/apache/sysml/api/dl/CaffeSolver.scala +++ b/src/main/scala/org/apache/sysml/api/dl/CaffeSolver.scala @@ -69,6 +69,10 @@ trait CaffeSolver { dmlScript.append("\t").append(layer.dWeight + "_reg = " + regularizationSource + "::backward(" + layer.weight + ", " + newLambda + ")\n") dmlScript.append("\t").append(layer.dWeight + " = " + layer.dWeight + " + " + layer.dWeight + "_reg\n") + 
if(layer.shouldUpdateExtraWeight) { + dmlScript.append("\t").append(layer.dExtraWeight + "_reg = " + regularizationSource + "::backward(" + layer.extraWeight + ", " + newLambda + ")\n") + dmlScript.append("\t").append(layer.dExtraWeight + " = " + layer.dExtraWeight + " + " + layer.dExtraWeight + "_reg\n") + } } } } @@ -129,6 +133,7 @@ class SGD(regularizationType:String = "L2", lambda: Double = 5e-04, momentum: Do if (momentum == 0) { // Use sgd if (layer.shouldUpdateWeight) dmlScript.append("\t").append(layer.weight + " = sgd::update(" + commaSep(layer.weight, layer.dWeight, getWeightLr(layer)) + ")\n") + if (layer.shouldUpdateExtraWeight) dmlScript.append("\t").append(layer.extraWeight + " = sgd::update(" + commaSep(layer.extraWeight, layer.dExtraWeight, getWeightLr(layer)) + ")\n") if (layer.shouldUpdateBias) dmlScript.append("\t").append(layer.bias + " = sgd::update(" + commaSep(layer.bias, layer.dBias, getBiasLr(layer)) + ")\n") } else { // Use sgd_momentum @@ -139,6 +144,13 @@ class SGD(regularizationType:String = "L2", lambda: Double = 5e-04, momentum: Do "[" + commaSep(layer.weight, layer.weight + "_v") + "] " + "= sgd_momentum::update(" + commaSep(layer.weight, layer.dWeight, getWeightLr(layer), momentum.toString, layer.weight + "_v") + ")\n" ) + if (layer.shouldUpdateExtraWeight) + dmlScript + .append("\t") + .append( + "[" + commaSep(layer.extraWeight, layer.extraWeight + "_v") + "] " + + "= sgd_momentum::update(" + commaSep(layer.extraWeight, layer.dExtraWeight, getWeightLr(layer), momentum.toString, layer.extraWeight + "_v") + ")\n" + ) if (layer.shouldUpdateBias) dmlScript .append("\t") @@ -151,6 +163,7 @@ class SGD(regularizationType:String = "L2", lambda: Double = 5e-04, momentum: Do def init(dmlScript: StringBuilder, layer: CaffeLayer): Unit = if (momentum != 0) { if (layer.shouldUpdateWeight) dmlScript.append(layer.weight + "_v = sgd_momentum::init(" + layer.weight + ")\n") + if (layer.shouldUpdateExtraWeight) dmlScript.append(layer.extraWeight + "_v = sgd_momentum::init(" + layer.extraWeight + ")\n") if (layer.shouldUpdateBias) dmlScript.append(layer.bias + "_v = sgd_momentum::init(" + layer.bias + ")\n") } def sourceFileName: String = if (momentum == 0) "sgd" else "sgd_momentum" @@ -193,6 +206,13 @@ class AdaGrad(regularizationType:String = "L2", lambda: Double = 5e-04, epsilon: "[" + commaSep(layer.weight, layer.weight + "_cache") + "] " + "= adagrad::update(" + commaSep(layer.weight, layer.dWeight, getWeightLr(layer), epsilon.toString, layer.weight + "_cache") + ")\n" ) + if (layer.shouldUpdateExtraWeight) + dmlScript + .append("\t") + .append( + "[" + commaSep(layer.extraWeight, layer.extraWeight + "_cache") + "] " + + "= adagrad::update(" + commaSep(layer.extraWeight, layer.dExtraWeight, getWeightLr(layer), epsilon.toString, layer.extraWeight + "_cache") + ")\n" + ) if (layer.shouldUpdateBias) dmlScript .append("\t") @@ -203,6 +223,7 @@ class AdaGrad(regularizationType:String = "L2", lambda: Double = 5e-04, epsilon: } def init(dmlScript: StringBuilder, layer: CaffeLayer): Unit = { if (layer.shouldUpdateWeight) dmlScript.append(layer.weight + "_cache = adagrad::init(" + layer.weight + ")\n") + if (layer.shouldUpdateExtraWeight) dmlScript.append(layer.extraWeight + "_cache = adagrad::init(" + layer.extraWeight + ")\n") if (layer.shouldUpdateBias) dmlScript.append(layer.bias + "_cache = adagrad::init(" + layer.bias + ")\n") } def sourceFileName: String = "adagrad" @@ -257,6 +278,15 @@ class Adam(regularizationType:String = "L2", lambda: Double = 5e-04, momentum:Do 
momentum.toString, momentum2.toString, delta.toString, t, layer.weight + "_m", layer.weight + "_v") + ")\n" ) + if (layer.shouldUpdateExtraWeight) + dmlScript + .append("\t") + .append( + "[" + commaSep(layer.extraWeight, layer.extraWeight + "_m", layer.extraWeight + "_v") + "] " + + "= adam::update(" + commaSep(layer.extraWeight, layer.dExtraWeight, getWeightLr(layer), + momentum.toString, momentum2.toString, delta.toString, t, + layer.extraWeight + "_m", layer.extraWeight + "_v") + ")\n" + ) if (layer.shouldUpdateBias) dmlScript .append("\t") @@ -269,6 +299,7 @@ class Adam(regularizationType:String = "L2", lambda: Double = 5e-04, momentum:Do } def init(dmlScript: StringBuilder, layer: CaffeLayer): Unit = { if (layer.shouldUpdateWeight) dmlScript.append("[ " + layer.weight + "_m, " + layer.weight + "_v ] = adam::init(" + layer.weight + ")\n") + if (layer.shouldUpdateExtraWeight) dmlScript.append("[ " + layer.extraWeight + "_m, " + layer.extraWeight + "_v ] = adam::init(" + layer.extraWeight + ")\n") if (layer.shouldUpdateBias) dmlScript.append("[ " + layer.bias + "_m, " + layer.bias + "_v ] = adam::init(" + layer.bias + ")\n") } def sourceFileName: String = "adam" @@ -320,6 +351,13 @@ class Nesterov(regularizationType:String = "L2", lambda: Double = 5e-04, momentu "[" + commaSep(layer.weight, layer.weight + "_v") + "] " + "= " + fn + "(" + commaSep(layer.weight, layer.dWeight, getWeightLr(layer), momentum.toString, layer.weight + "_v") + lastParameter + ")\n" ) + if (layer.shouldUpdateExtraWeight) + dmlScript + .append("\t") + .append( + "[" + commaSep(layer.extraWeight, layer.extraWeight + "_v") + "] " + + "= " + fn + "(" + commaSep(layer.extraWeight, layer.dExtraWeight, getWeightLr(layer), momentum.toString, layer.extraWeight + "_v") + lastParameter + ")\n" + ) if (layer.shouldUpdateBias) dmlScript .append("\t") @@ -330,6 +368,7 @@ class Nesterov(regularizationType:String = "L2", lambda: Double = 5e-04, momentu } def init(dmlScript: StringBuilder, layer: CaffeLayer): Unit = { if (layer.shouldUpdateWeight) dmlScript.append(layer.weight + "_v = sgd_nesterov::init(" + layer.weight + ")\n") + if (layer.shouldUpdateExtraWeight) dmlScript.append(layer.extraWeight + "_v = sgd_nesterov::init(" + layer.extraWeight + ")\n") if (layer.shouldUpdateBias) dmlScript.append(layer.bias + "_v = sgd_nesterov::init(" + layer.bias + ")\n") } def sourceFileName: String = "sgd_nesterov" http://git-wip-us.apache.org/repos/asf/systemml/blob/6af1df01/src/main/scala/org/apache/sysml/api/dl/DMLGenerator.scala ---------------------------------------------------------------------- diff --git a/src/main/scala/org/apache/sysml/api/dl/DMLGenerator.scala b/src/main/scala/org/apache/sysml/api/dl/DMLGenerator.scala index 0231354..5d25116 100644 --- a/src/main/scala/org/apache/sysml/api/dl/DMLGenerator.scala +++ b/src/main/scala/org/apache/sysml/api/dl/DMLGenerator.scala @@ -313,6 +313,7 @@ trait DMLGenerator extends SourceDMLGenerator with NextBatchGenerator { tabDMLScript.append("# Load the weights. 
Note: keeping the initialization code in case the layer wants to initialize non-weights and non-bias\n") val allLayers = net.getLayers.filter(l => !layersToIgnore.contains(l)).map(net.getCaffeLayer(_)) allLayers.filter(_.weight != null).map(l => tabDMLScript.append(readWeight(l.weight, l.param.getName + "_weight.mtx"))) + allLayers.filter(_.extraWeight != null).map(l => tabDMLScript.append(readWeight(l.extraWeight, l.param.getName + "_extra_weight.mtx"))) allLayers.filter(_.bias != null).map(l => tabDMLScript.append(readWeight(l.bias, l.param.getName + "_bias.mtx"))) } net.getLayers.map(layer => solver.init(tabDMLScript, net.getCaffeLayer(layer))) http://git-wip-us.apache.org/repos/asf/systemml/blob/6af1df01/src/main/scala/org/apache/sysml/api/dl/Utils.scala ---------------------------------------------------------------------- diff --git a/src/main/scala/org/apache/sysml/api/dl/Utils.scala b/src/main/scala/org/apache/sysml/api/dl/Utils.scala index 5939cf1..e771ed1 100644 --- a/src/main/scala/org/apache/sysml/api/dl/Utils.scala +++ b/src/main/scala/org/apache/sysml/api/dl/Utils.scala @@ -171,6 +171,7 @@ object Utils { ml.execute(script) } + // TODO: Loading of extra weights is not supported def readCaffeNet(net: CaffeNetwork, netFilePath: String, weightsFilePath: String, inputVariables: java.util.HashMap[String, MatrixBlock]): NetParameter = { // Load network val reader: InputStreamReader = getInputStreamReader(netFilePath);
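[Editor's note, not part of the patch] As a quick illustration of the new layer's API as defined in scripts/nn/layers/low_rank_affine.dml, the minimal DML sketch below mirrors what the gradient check above exercises; the sizes N, D, M, and R are arbitrary example values.

  source("nn/layers/low_rank_affine.dml") as low_rank_affine

  N = 64    # number of examples (example value)
  D = 1024  # number of input features (example value)
  M = 512   # number of neurons (example value)
  R = 16    # rank; stores D*R + R*M = 24,576 weight values instead of D*M = 524,288 for a full affine layer

  X = rand(rows=N, cols=D)

  # Initialize the two weight factors and the bias
  [U, V, b] = low_rank_affine::init(D, M, R)

  # Forward pass: out = X %*% U %*% V + b, of shape (N, M)
  out = low_rank_affine::forward(X, U, V, b)
  print("sum(out) = " + sum(out))

  # Backward pass, given an upstream gradient dout of shape (N, M)
  dout = rand(rows=N, cols=M)
  [dX, dU, dV, db] = low_rank_affine::backward(dout, X, U, V, b)

On the Caffe2DML side, the low-rank path is selected per layer: when the new `rank` field of InnerProductParameter is set to a value greater than zero, the generated script sources low_rank_affine instead of affine, and the second factor is carried as an extra weight matrix (named weight + "_extra") with its own gradient, regularization, and solver state, as shown in the CaffeLayer and CaffeSolver changes above.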