Repository: systemml
Updated Branches:
  refs/heads/master 47973a905 -> 6af1df01d
[SYSTEMML-540] Low-rank fully connected layer This layer has three advantages over the affine layer: - It has significantly lower memory requirement than affine layer making it ideal for devices such as GPUs. - It implicitly avoids overfitting by minimizing the number of parameters in the neural network. - It can exploit sparsity-aware fused operators. Closes #720. Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/6af1df01 Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/6af1df01 Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/6af1df01 Branch: refs/heads/master Commit: 6af1df01d5ebe4c37025e06f7d839a2b2ee7d34b Parents: 47973a9 Author: Niketan Pansare <[email protected]> Authored: Wed Jan 24 10:32:15 2018 -0800 Committer: Niketan Pansare <[email protected]> Committed: Wed Jan 24 10:32:15 2018 -0800 ---------------------------------------------------------------------- scripts/nn/layers/low_rank_affine.dml | 109 +++++++++++++++++++ scripts/nn/test/grad_check.dml | 100 +++++++++++++++++ scripts/nn/test/run_tests.dml | 1 + src/main/proto/caffe/caffe.proto | 2 + .../org/apache/sysml/api/dl/Caffe2DML.scala | 20 +++- .../org/apache/sysml/api/dl/CaffeLayer.scala | 33 ++++-- .../org/apache/sysml/api/dl/CaffeSolver.scala | 39 +++++++ .../org/apache/sysml/api/dl/DMLGenerator.scala | 1 + .../scala/org/apache/sysml/api/dl/Utils.scala | 1 + 9 files changed, 294 insertions(+), 12 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/6af1df01/scripts/nn/layers/low_rank_affine.dml ---------------------------------------------------------------------- diff --git a/scripts/nn/layers/low_rank_affine.dml b/scripts/nn/layers/low_rank_affine.dml new file mode 100644 index 0000000..32293f9 --- /dev/null +++ b/scripts/nn/layers/low_rank_affine.dml @@ -0,0 +1,109 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +/* + * Low-rank Affine (fully-connected) layer. + * + * This layer has three advantages over the affine layer: + * 1. It has significantly lower memory requirement than affine layer making it ideal for devices such as GPUs. + * 2. It implicitly avoids overfitting by minimizing the number of parameters in the neural network. + * 3. It can exploit sparsity-aware fused operators. + */ + +forward = function(matrix[double] X, matrix[double] U, matrix[double] V, matrix[double] b) + return (matrix[double] out) { + /* + * Computes the forward pass for a low-rank affine (fully-connected) layer + * with M neurons. The input data has N examples, each with D + * features. 
+ * + * Inputs: + * - X: Inputs, of shape (N, D). + * - U: LHS factor matrix for weights, of shape (D, R). + * - V: RHS factor matrix for weights, of shape (R, M). + * - b: Biases, of shape (1, M). + * + * Outputs: + * - out: Outputs, of shape (N, M). + */ + out = X %*% U %*% V + b +} + +backward = function(matrix[double] dout, matrix[double] X, + matrix[double] U, matrix[double] V, matrix[double] b) + return (matrix[double] dX, matrix[double] dU, matrix[double] dV, matrix[double] db) { + /* + * Computes the backward pass for a low-rank fully-connected (affine) layer + * with M neurons. + * + * Inputs: + * - dout: Gradient wrt `out` from upstream, of shape (N, M). + * - X: Inputs, of shape (N, D). + * - U: LHS factor matrix for weights, of shape (D, R). + * - V: RHS factor matrix for weights, of shape (R, M). + * - b: Biases, of shape (1, M). + * + * Outputs: + * - dX: Gradient wrt `X`, of shape (N, D). + * - dU: Gradient wrt `U`, of shape (D, R). + * - dV: Gradient wrt `V`, of shape (R, M). + * - db: Gradient wrt `b`, of shape (1, M). + */ + dX = dout %*% t(V) %*% t(U) + + # If out = Z %*% L, then dL = t(Z) %*% dout + # Substituting Z = X %*% U and L = V, we get + dV = t(U) %*% t(X) %*% dout + + dU = t(X) %*% dout %*% t(V) + + db = colSums(dout) +} + +init = function(int D, int M, int R) + return (matrix[double] U, matrix[double] V, matrix[double] b) { + /* + * Initialize the parameters of this layer. + * + * Note: This is just a convenience function, and parameters + * may be initialized manually if needed. + * + * We use the heuristic by He et al., which limits the magnification + * of inputs/gradients during forward/backward passes by scaling + * unit-Gaussian weights by a factor of sqrt(2/n), under the + * assumption of relu neurons. + * - http://arxiv.org/abs/1502.01852 + * + * Inputs: + * - D: Dimensionality of the input features (number of features). + * - M: Number of neurons in this layer. + * - R: Rank of U,V matrices such that R << min(D, M). + * + * Outputs: + * - U: LHS factor matrix for weights, of shape (D, R). + * - V: RHS factor matrix for weights, of shape (R, M). + * - b: Biases, of shape (1, M). + */ + U = rand(rows=D, cols=R, pdf="normal") * sqrt(2.0/D) + V = rand(rows=R, cols=M, pdf="normal") * sqrt(2.0/R) + b = matrix(0, rows=1, cols=M) +} + http://git-wip-us.apache.org/repos/asf/systemml/blob/6af1df01/scripts/nn/test/grad_check.dml ---------------------------------------------------------------------- diff --git a/scripts/nn/test/grad_check.dml b/scripts/nn/test/grad_check.dml index 41a8bc6..9c551b8 100644 --- a/scripts/nn/test/grad_check.dml +++ b/scripts/nn/test/grad_check.dml @@ -23,6 +23,7 @@ * Gradient checks for various architectures. */ source("nn/layers/affine.dml") as affine +source("nn/layers/low_rank_affine.dml") as low_rank_affine source("nn/layers/batch_norm1d.dml") as batch_norm1d source("nn/layers/batch_norm2d.dml") as batch_norm2d source("nn/layers/conv2d.dml") as conv2d @@ -133,6 +134,105 @@ affine = function() { } } +low_rank_affine = function() { + /* + * Gradient check for the low rank affine layer. 
+ */ + print("Grad checking the low rank affine layer with L2 loss.") + + # Generate data + N = 3 # num examples + D = 100 # num features + M = 10 # num neurons + R = 2 # rank + X = rand(rows=N, cols=D) + y = rand(rows=N, cols=M) + [U, V, b] = low_rank_affine::init(D, M, R) + + # Compute analytical gradients of loss wrt parameters + out = low_rank_affine::forward(X, U, V, b) + dout = l2_loss::backward(out, y) + [dX, dU, dV, db] = low_rank_affine::backward(dout, X, U, V, b) + + # Grad check + h = 1e-5 + print(" - Grad checking X.") + for (i in 1:nrow(X)) { + for (j in 1:ncol(X)) { + # Compute numerical derivative + old = as.scalar(X[i,j]) + X[i,j] = old - h + outmh = low_rank_affine::forward(X, U, V, b) + lossmh = l2_loss::forward(outmh, y) + X[i,j] = old + h + outph = low_rank_affine::forward(X, U, V, b) + lossph = l2_loss::forward(outph, y) + X[i,j] = old # reset + dX_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh) + } + } + + print(" - Grad checking U.") + for (i in 1:nrow(U)) { + for (j in 1:ncol(U)) { + # Compute numerical derivative + old = as.scalar(U[i,j]) + U[i,j] = old - h + outmh = low_rank_affine::forward(X, U, V, b) + lossmh = l2_loss::forward(outmh, y) + U[i,j] = old + h + outph = low_rank_affine::forward(X, U, V, b) + lossph = l2_loss::forward(outph, y) + U[i,j] = old # reset + dU_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(dU[i,j]), dU_num, lossph, lossmh) + } + } + + print(" - Grad checking V.") + for (i in 1:nrow(V)) { + for (j in 1:ncol(V)) { + # Compute numerical derivative + old = as.scalar(V[i,j]) + V[i,j] = old - h + outmh = low_rank_affine::forward(X, U, V, b) + lossmh = l2_loss::forward(outmh, y) + V[i,j] = old + h + outph = low_rank_affine::forward(X, U, V, b) + lossph = l2_loss::forward(outph, y) + V[i,j] = old # reset + dV_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(dV[i,j]), dV_num, lossph, lossmh) + } + } + + print(" - Grad checking b.") + for (i in 1:nrow(b)) { + for (j in 1:ncol(b)) { + # Compute numerical derivative + old = as.scalar(b[i,j]) + b[i,j] = old - h + outmh = low_rank_affine::forward(X, U, V, b) + lossmh = l2_loss::forward(outmh, y) + b[i,j] = old + h + outph = low_rank_affine::forward(X, U, V, b) + lossph = l2_loss::forward(outph, y) + b[i,j] = old # reset + db_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh) + } + } +} + batch_norm1d = function() { /* * Gradient check for the 1D batch normalization layer. 
http://git-wip-us.apache.org/repos/asf/systemml/blob/6af1df01/scripts/nn/test/run_tests.dml ---------------------------------------------------------------------- diff --git a/scripts/nn/test/run_tests.dml b/scripts/nn/test/run_tests.dml index 27d6a4a..f70701b 100644 --- a/scripts/nn/test/run_tests.dml +++ b/scripts/nn/test/run_tests.dml @@ -41,6 +41,7 @@ print("") # Core layers grad_check::affine() +grad_check::low_rank_affine() grad_check::batch_norm1d() grad_check::batch_norm2d() grad_check::conv2d() http://git-wip-us.apache.org/repos/asf/systemml/blob/6af1df01/src/main/proto/caffe/caffe.proto ---------------------------------------------------------------------- diff --git a/src/main/proto/caffe/caffe.proto b/src/main/proto/caffe/caffe.proto index 444b2f9..c5a5799 100644 --- a/src/main/proto/caffe/caffe.proto +++ b/src/main/proto/caffe/caffe.proto @@ -833,6 +833,8 @@ message InnerProductParameter { // of the weight matrix. The weight matrix itself is not going to be transposed // but rather the transfer flag of operations will be toggled accordingly. optional bool transpose = 6 [default = false]; + + optional int32 rank = 7 [default = 0]; // rank of U, V matrices } message InputParameter { http://git-wip-us.apache.org/repos/asf/systemml/blob/6af1df01/src/main/scala/org/apache/sysml/api/dl/Caffe2DML.scala ---------------------------------------------------------------------- diff --git a/src/main/scala/org/apache/sysml/api/dl/Caffe2DML.scala b/src/main/scala/org/apache/sysml/api/dl/Caffe2DML.scala index 5d17a4d..9f75008 100644 --- a/src/main/scala/org/apache/sysml/api/dl/Caffe2DML.scala +++ b/src/main/scala/org/apache/sysml/api/dl/Caffe2DML.scala @@ -313,7 +313,11 @@ class Caffe2DML(val sc: SparkContext, val numLayerInput = if(!l.isInstanceOf[Data]) l.bottomLayerOutputShape._1.toLong * l.bottomLayerOutputShape._2.toLong * l.bottomLayerOutputShape._3.toLong * batchSize else 0 val numLayerOutput = l.outputShape._1.toLong * l.outputShape._2.toLong * l.outputShape._3.toLong * batchSize val numLayerError = numLayerOutput - val numLayerWeights = if(l.weightShape != null) l.weightShape()(0).toLong * l.weightShape()(1).toLong else 0 + val numLayerWeights = if(l.weightShape != null) { + val nWt = l.weightShape()(0).toLong * l.weightShape()(1).toLong + if(l.extraWeightShape != null) l.extraWeightShape()(0).toLong * l.extraWeightShape()(1).toLong + nWt + else nWt + } else 0 val numLayerBias = if(l.biasShape != null)l.biasShape()(0).toLong * l.biasShape()(1).toLong else 0 val numLayerGradients = (numLayerWeights + numLayerBias) * batchSize if(isTraining) (numLayerInput + numLayerOutput + numLayerError + numLayerWeights + numLayerBias + numLayerGradients)*Double.BYTES @@ -337,7 +341,11 @@ class Caffe2DML(val sc: SparkContext, (l._1, layer.param.getType, "(, " + layer.outputShape._1 + ", " + layer.outputShape._2 + ", " + layer.outputShape._3 + ")", - if (layer.weightShape != null) "[" + layer.weightShape()(0) + " X " + layer.weightShape()(1) + "]" else "", + if (layer.weightShape != null) { + val wShapes = "[" + layer.weightShape()(0) + " X " + layer.weightShape()(1) + "]" + if (layer.extraWeightShape != null) wShapes + ", " + "[" + layer.extraWeightShape()(0) + " X " + layer.extraWeightShape()(1) + "]" + else wShapes + } else "", if (layer.biasShape != null) "[" + layer.biasShape()(0) + " X " + layer.biasShape()(1) + "]" else "", layer.param.getTopList.mkString(","), layer.param.getBottomList.mkString(","), @@ -421,6 +429,7 @@ class Caffe2DML(val sc: SparkContext, // Set input/output variables and 
execute the script val script = dml(trainingScript).in(inputs) net.getLayers.map(net.getCaffeLayer(_)).filter(_.weight != null).map(l => script.out(l.weight)) + net.getLayers.map(net.getCaffeLayer(_)).filter(_.extraWeight != null).map(l => script.out(l.extraWeight)) net.getLayers.map(net.getCaffeLayer(_)).filter(_.bias != null).map(l => script.out(l.bias)) setDebugFlags(false) @@ -667,6 +676,7 @@ class Caffe2DML(val sc: SparkContext, tabDMLScript.append("snapshot_dir= \"" + solverParam.getSnapshotPrefix + "\" + \"/iter_\" + iter + \"/\"\n") val allLayers = net.getLayers.map(net.getCaffeLayer(_)) allLayers.filter(_.weight != null).map(l => appendSnapshotWrite(l.weight, l.param.getName + "_weight.mtx")) + allLayers.filter(_.extraWeight != null).map(l => appendSnapshotWrite(l.extraWeight, l.param.getName + "_extra_weight.mtx")) allLayers.filter(_.bias != null).map(l => appendSnapshotWrite(l.bias, l.param.getName + "_bias.mtx")) } } @@ -689,6 +699,7 @@ class Caffe2DML(val sc: SparkContext, .map(layer => net.getCaffeLayer(layer)) .map(l => { if (l.shouldUpdateWeight) assign(tabDMLScript, l.dWeight + "_agg", matrix("0", parallel_batches, multiply(nrow(l.weight), ncol(l.weight)))) + if (l.shouldUpdateExtraWeight) assign(tabDMLScript, l.dExtraWeight + "_agg", matrix("0", parallel_batches, multiply(nrow(l.extraWeight), ncol(l.extraWeight)))) if (l.shouldUpdateBias) assign(tabDMLScript, l.dBias + "_agg", matrix("0", parallel_batches, multiply(nrow(l.bias), ncol(l.bias)))) }) } @@ -701,6 +712,7 @@ class Caffe2DML(val sc: SparkContext, .map(layer => net.getCaffeLayer(layer)) .map(l => { if (l.shouldUpdateWeight) assign(tabDMLScript, l.dWeight + "_agg[j,]", matrix(l.dWeight, "1", multiply(nrow(l.weight), ncol(l.weight))) + " * weighting") + if (l.shouldUpdateExtraWeight) assign(tabDMLScript, l.dExtraWeight + "_agg[j,]", matrix(l.dExtraWeight, "1", multiply(nrow(l.extraWeight), ncol(l.extraWeight))) + " * weighting") if (l.shouldUpdateWeight) assign(tabDMLScript, l.dBias + "_agg[j,]", matrix(l.dBias, "1", multiply(nrow(l.bias), ncol(l.bias))) + " * weighting") }) } @@ -710,6 +722,7 @@ class Caffe2DML(val sc: SparkContext, .map(layer => net.getCaffeLayer(layer)) .map(l => { if (l.shouldUpdateWeight) assign(tabDMLScript, l.dWeight, matrix(colSums(l.dWeight + "_agg"), nrow(l.weight), ncol(l.weight))) + if (l.shouldUpdateExtraWeight) assign(tabDMLScript, l.dExtraWeight, matrix(colSums(l.dExtraWeight + "_agg"), nrow(l.extraWeight), ncol(l.extraWeight))) if (l.shouldUpdateWeight) assign(tabDMLScript, l.dBias, matrix(colSums(l.dBias + "_agg"), nrow(l.bias), ncol(l.bias))) }) } @@ -744,7 +757,7 @@ class Caffe2DMLModel(val numClasses: String, val sc: SparkContext, val solver: C def modelVariables(): List[String] = { val allLayers = net.getLayers.map(net.getCaffeLayer(_)) - allLayers.filter(_.weight != null).map(_.weight) ++ allLayers.filter(_.bias != null).map(_.bias) + allLayers.filter(_.weight != null).map(_.weight) ++ allLayers.filter(_.extraWeight != null).map(_.extraWeight) ++ allLayers.filter(_.bias != null).map(_.bias) } // ================================================================================================ @@ -850,6 +863,7 @@ class Caffe2DMLModel(val numClasses: String, val sc: SparkContext, val solver: C if (estimator.mloutput != null) { // fit was called net.getLayers.map(net.getCaffeLayer(_)).filter(_.weight != null).map(l => script.in(l.weight, estimator.mloutput.getMatrix(l.weight))) + net.getLayers.map(net.getCaffeLayer(_)).filter(_.extraWeight != null).map(l => script.in(l.extraWeight, 
estimator.mloutput.getMatrix(l.extraWeight))) net.getLayers.map(net.getCaffeLayer(_)).filter(_.bias != null).map(l => script.in(l.bias, estimator.mloutput.getMatrix(l.bias))) } http://git-wip-us.apache.org/repos/asf/systemml/blob/6af1df01/src/main/scala/org/apache/sysml/api/dl/CaffeLayer.scala ---------------------------------------------------------------------- diff --git a/src/main/scala/org/apache/sysml/api/dl/CaffeLayer.scala b/src/main/scala/org/apache/sysml/api/dl/CaffeLayer.scala index 37b585f..dd8d137 100644 --- a/src/main/scala/org/apache/sysml/api/dl/CaffeLayer.scala +++ b/src/main/scala/org/apache/sysml/api/dl/CaffeLayer.scala @@ -90,14 +90,19 @@ trait CaffeLayer extends BaseDMLGenerator { // -------------------------------------------------------------------------------------- // No need to override these methods in subclasses, instead classes that have weights and biases // should implement HasWeight and HasBias traits. - def dWeight(): String = throw new DMLRuntimeException("dWeight is not implemented in super class") - def dBias(): String = throw new DMLRuntimeException("dBias is not implemented in super class") def weight(): String = null; def weightShape(): Array[Int]; + def dWeight(): String = throw new DMLRuntimeException("dWeight is not implemented in super class") + def shouldUpdateWeight(): Boolean = if (weight != null) true else false def bias(): String = null; def biasShape(): Array[Int]; - def shouldUpdateWeight(): Boolean = if (weight != null) true else false + def dBias(): String = throw new DMLRuntimeException("dBias is not implemented in super class") def shouldUpdateBias(): Boolean = if (bias != null) true else false + + def extraWeight(): String = null; + def extraWeightShape(): Array[Int] = null; + def dExtraWeight(): String = throw new DMLRuntimeException("dExtraWeight is not implemented in super class") + def shouldUpdateExtraWeight():Boolean = if(extraWeight != null) true else false // -------------------------------------------------------------------------------------- // Helper methods to simplify the code of subclasses def invokeInit(dmlScript: StringBuilder, returnVariables: List[String], arguments: String*): Unit = @@ -818,7 +823,8 @@ class Dropout(val param: LayerParameter, val id: Int, val net: CaffeNetwork) ext class InnerProduct(val param: LayerParameter, val id: Int, val net: CaffeNetwork) extends CaffeLayer with HasWeight with HasBias { // ------------------------------------------------- // TODO: bias_filler [default type: 'constant' value: 0]; bias_term [default true]: specifies whether to learn and apply a set of additive biases to the filter outputs - override def sourceFileName = "affine" + val isLowRank = param.getInnerProductParam.hasRank && param.getInnerProductParam.getRank > 0 + override def sourceFileName = if(isLowRank) "low_rank_affine" else "affine" /* * Initialize the parameters of this layer. * @@ -839,7 +845,10 @@ class InnerProduct(val param: LayerParameter, val id: Int, val net: CaffeNetwork * - W: Weights, of shape (D, M). * - b: Biases, of shape (1, M). 
*/ - override def init(dmlScript: StringBuilder) = invokeInit(dmlScript, List[String](weight, bias), numFeatures, numNeurons) + override def init(dmlScript: StringBuilder) = { + if(isLowRank) invokeInit(dmlScript, List[String](weight, extraWeight, bias), numFeatures, numNeurons, param.getInnerProductParam.getRank.toString) + else invokeInit(dmlScript, List[String](weight, bias), numFeatures, numNeurons) + } /* * Computes the forward pass for an affine (fully-connected) layer * with M neurons. The input data has N examples, each with D @@ -857,7 +866,8 @@ class InnerProduct(val param: LayerParameter, val id: Int, val net: CaffeNetwork if(debugLayer && caffe2dmlObj != null && !caffe2dmlObj.containsParfor) { dmlScript.append("assert(ncol(" + X + ") == nrow(" + weight + ") | ncol(" + weight + ") == ncol(" + bias + ")); ") } - invokeForward(dmlScript, List[String](out), X, weight, bias) + if(isLowRank) invokeForward(dmlScript, List[String](out), X, weight, extraWeight, bias) + else invokeForward(dmlScript, List[String](out), X, weight, bias) } /* @@ -875,8 +885,10 @@ class InnerProduct(val param: LayerParameter, val id: Int, val net: CaffeNetwork * - dW: Gradient wrt `W`, of shape (D, M). * - db: Gradient wrt `b`, of shape (1, M). */ - override def backward(dmlScript: StringBuilder, outSuffix: String) = - invokeBackward(dmlScript, outSuffix, List[String]("dOut" + id, dWeight, dBias), dout, X, weight, bias) + override def backward(dmlScript: StringBuilder, outSuffix: String) = { + if(isLowRank) invokeBackward(dmlScript, outSuffix, List[String]("dOut" + id, dWeight, dExtraWeight, dBias), dout, X, weight, extraWeight, bias) + else invokeBackward(dmlScript, outSuffix, List[String]("dOut" + id, dWeight, dBias), dout, X, weight, bias) + } // ------------------------------------------------- // num_output (c_o): the number of filters @@ -884,8 +896,11 @@ class InnerProduct(val param: LayerParameter, val id: Int, val net: CaffeNetwork def numFeatures = int_mult(bottomLayerOutputShape._1, bottomLayerOutputShape._2, bottomLayerOutputShape._3) // n * c_o * 1 * 1 override def outputShape = (param.getInnerProductParam.getNumOutput.toString, "1", "1") - override def weightShape(): Array[Int] = Array(numFeatures.toInt, numNeurons.toInt) + override def weightShape(): Array[Int] = if(isLowRank) Array(numFeatures.toInt, param.getInnerProductParam.getRank) else Array(numFeatures.toInt, numNeurons.toInt) override def biasShape(): Array[Int] = Array(1, numNeurons.toInt) + override def extraWeight(): String = if(isLowRank) weight + "_extra" else null + override def extraWeightShape(): Array[Int] = if(isLowRank) Array(param.getInnerProductParam.getRank, numNeurons.toInt) else null + override def dExtraWeight(): String = if(isLowRank) dWeight + "_extra" else null } http://git-wip-us.apache.org/repos/asf/systemml/blob/6af1df01/src/main/scala/org/apache/sysml/api/dl/CaffeSolver.scala ---------------------------------------------------------------------- diff --git a/src/main/scala/org/apache/sysml/api/dl/CaffeSolver.scala b/src/main/scala/org/apache/sysml/api/dl/CaffeSolver.scala index d0d738e..8559c60 100644 --- a/src/main/scala/org/apache/sysml/api/dl/CaffeSolver.scala +++ b/src/main/scala/org/apache/sysml/api/dl/CaffeSolver.scala @@ -69,6 +69,10 @@ trait CaffeSolver { dmlScript.append("\t").append(layer.dWeight + "_reg = " + regularizationSource + "::backward(" + layer.weight + ", " + newLambda + ")\n") dmlScript.append("\t").append(layer.dWeight + " = " + layer.dWeight + " + " + layer.dWeight + "_reg\n") + 
if(layer.shouldUpdateExtraWeight) { + dmlScript.append("\t").append(layer.dExtraWeight + "_reg = " + regularizationSource + "::backward(" + layer.extraWeight + ", " + newLambda + ")\n") + dmlScript.append("\t").append(layer.dExtraWeight + " = " + layer.dExtraWeight + " + " + layer.dExtraWeight + "_reg\n") + } } } } @@ -129,6 +133,7 @@ class SGD(regularizationType:String = "L2", lambda: Double = 5e-04, momentum: Do if (momentum == 0) { // Use sgd if (layer.shouldUpdateWeight) dmlScript.append("\t").append(layer.weight + " = sgd::update(" + commaSep(layer.weight, layer.dWeight, getWeightLr(layer)) + ")\n") + if (layer.shouldUpdateExtraWeight) dmlScript.append("\t").append(layer.extraWeight + " = sgd::update(" + commaSep(layer.extraWeight, layer.dExtraWeight, getWeightLr(layer)) + ")\n") if (layer.shouldUpdateBias) dmlScript.append("\t").append(layer.bias + " = sgd::update(" + commaSep(layer.bias, layer.dBias, getBiasLr(layer)) + ")\n") } else { // Use sgd_momentum @@ -139,6 +144,13 @@ class SGD(regularizationType:String = "L2", lambda: Double = 5e-04, momentum: Do "[" + commaSep(layer.weight, layer.weight + "_v") + "] " + "= sgd_momentum::update(" + commaSep(layer.weight, layer.dWeight, getWeightLr(layer), momentum.toString, layer.weight + "_v") + ")\n" ) + if (layer.shouldUpdateExtraWeight) + dmlScript + .append("\t") + .append( + "[" + commaSep(layer.extraWeight, layer.extraWeight + "_v") + "] " + + "= sgd_momentum::update(" + commaSep(layer.extraWeight, layer.dExtraWeight, getWeightLr(layer), momentum.toString, layer.extraWeight + "_v") + ")\n" + ) if (layer.shouldUpdateBias) dmlScript .append("\t") @@ -151,6 +163,7 @@ class SGD(regularizationType:String = "L2", lambda: Double = 5e-04, momentum: Do def init(dmlScript: StringBuilder, layer: CaffeLayer): Unit = if (momentum != 0) { if (layer.shouldUpdateWeight) dmlScript.append(layer.weight + "_v = sgd_momentum::init(" + layer.weight + ")\n") + if (layer.shouldUpdateExtraWeight) dmlScript.append(layer.extraWeight + "_v = sgd_momentum::init(" + layer.extraWeight + ")\n") if (layer.shouldUpdateBias) dmlScript.append(layer.bias + "_v = sgd_momentum::init(" + layer.bias + ")\n") } def sourceFileName: String = if (momentum == 0) "sgd" else "sgd_momentum" @@ -193,6 +206,13 @@ class AdaGrad(regularizationType:String = "L2", lambda: Double = 5e-04, epsilon: "[" + commaSep(layer.weight, layer.weight + "_cache") + "] " + "= adagrad::update(" + commaSep(layer.weight, layer.dWeight, getWeightLr(layer), epsilon.toString, layer.weight + "_cache") + ")\n" ) + if (layer.shouldUpdateExtraWeight) + dmlScript + .append("\t") + .append( + "[" + commaSep(layer.extraWeight, layer.extraWeight + "_cache") + "] " + + "= adagrad::update(" + commaSep(layer.extraWeight, layer.dExtraWeight, getWeightLr(layer), epsilon.toString, layer.extraWeight + "_cache") + ")\n" + ) if (layer.shouldUpdateBias) dmlScript .append("\t") @@ -203,6 +223,7 @@ class AdaGrad(regularizationType:String = "L2", lambda: Double = 5e-04, epsilon: } def init(dmlScript: StringBuilder, layer: CaffeLayer): Unit = { if (layer.shouldUpdateWeight) dmlScript.append(layer.weight + "_cache = adagrad::init(" + layer.weight + ")\n") + if (layer.shouldUpdateExtraWeight) dmlScript.append(layer.extraWeight + "_cache = adagrad::init(" + layer.extraWeight + ")\n") if (layer.shouldUpdateBias) dmlScript.append(layer.bias + "_cache = adagrad::init(" + layer.bias + ")\n") } def sourceFileName: String = "adagrad" @@ -257,6 +278,15 @@ class Adam(regularizationType:String = "L2", lambda: Double = 5e-04, momentum:Do 
momentum.toString, momentum2.toString, delta.toString, t, layer.weight + "_m", layer.weight + "_v") + ")\n" ) + if (layer.shouldUpdateExtraWeight) + dmlScript + .append("\t") + .append( + "[" + commaSep(layer.extraWeight, layer.extraWeight + "_m", layer.extraWeight + "_v") + "] " + + "= adam::update(" + commaSep(layer.extraWeight, layer.dExtraWeight, getWeightLr(layer), + momentum.toString, momentum2.toString, delta.toString, t, + layer.extraWeight + "_m", layer.extraWeight + "_v") + ")\n" + ) if (layer.shouldUpdateBias) dmlScript .append("\t") @@ -269,6 +299,7 @@ class Adam(regularizationType:String = "L2", lambda: Double = 5e-04, momentum:Do } def init(dmlScript: StringBuilder, layer: CaffeLayer): Unit = { if (layer.shouldUpdateWeight) dmlScript.append("[ " + layer.weight + "_m, " + layer.weight + "_v ] = adam::init(" + layer.weight + ")\n") + if (layer.shouldUpdateExtraWeight) dmlScript.append("[ " + layer.extraWeight + "_m, " + layer.extraWeight + "_v ] = adam::init(" + layer.extraWeight + ")\n") if (layer.shouldUpdateBias) dmlScript.append("[ " + layer.bias + "_m, " + layer.bias + "_v ] = adam::init(" + layer.bias + ")\n") } def sourceFileName: String = "adam" @@ -320,6 +351,13 @@ class Nesterov(regularizationType:String = "L2", lambda: Double = 5e-04, momentu "[" + commaSep(layer.weight, layer.weight + "_v") + "] " + "= " + fn + "(" + commaSep(layer.weight, layer.dWeight, getWeightLr(layer), momentum.toString, layer.weight + "_v") + lastParameter + ")\n" ) + if (layer.shouldUpdateExtraWeight) + dmlScript + .append("\t") + .append( + "[" + commaSep(layer.extraWeight, layer.extraWeight + "_v") + "] " + + "= " + fn + "(" + commaSep(layer.extraWeight, layer.dExtraWeight, getWeightLr(layer), momentum.toString, layer.extraWeight + "_v") + lastParameter + ")\n" + ) if (layer.shouldUpdateBias) dmlScript .append("\t") @@ -330,6 +368,7 @@ class Nesterov(regularizationType:String = "L2", lambda: Double = 5e-04, momentu } def init(dmlScript: StringBuilder, layer: CaffeLayer): Unit = { if (layer.shouldUpdateWeight) dmlScript.append(layer.weight + "_v = sgd_nesterov::init(" + layer.weight + ")\n") + if (layer.shouldUpdateExtraWeight) dmlScript.append(layer.extraWeight + "_v = sgd_nesterov::init(" + layer.extraWeight + ")\n") if (layer.shouldUpdateBias) dmlScript.append(layer.bias + "_v = sgd_nesterov::init(" + layer.bias + ")\n") } def sourceFileName: String = "sgd_nesterov" http://git-wip-us.apache.org/repos/asf/systemml/blob/6af1df01/src/main/scala/org/apache/sysml/api/dl/DMLGenerator.scala ---------------------------------------------------------------------- diff --git a/src/main/scala/org/apache/sysml/api/dl/DMLGenerator.scala b/src/main/scala/org/apache/sysml/api/dl/DMLGenerator.scala index 0231354..5d25116 100644 --- a/src/main/scala/org/apache/sysml/api/dl/DMLGenerator.scala +++ b/src/main/scala/org/apache/sysml/api/dl/DMLGenerator.scala @@ -313,6 +313,7 @@ trait DMLGenerator extends SourceDMLGenerator with NextBatchGenerator { tabDMLScript.append("# Load the weights. 
Note: keeping the initialization code in case the layer wants to initialize non-weights and non-bias\n") val allLayers = net.getLayers.filter(l => !layersToIgnore.contains(l)).map(net.getCaffeLayer(_)) allLayers.filter(_.weight != null).map(l => tabDMLScript.append(readWeight(l.weight, l.param.getName + "_weight.mtx"))) + allLayers.filter(_.extraWeight != null).map(l => tabDMLScript.append(readWeight(l.extraWeight, l.param.getName + "_extra_weight.mtx"))) allLayers.filter(_.bias != null).map(l => tabDMLScript.append(readWeight(l.bias, l.param.getName + "_bias.mtx"))) } net.getLayers.map(layer => solver.init(tabDMLScript, net.getCaffeLayer(layer))) http://git-wip-us.apache.org/repos/asf/systemml/blob/6af1df01/src/main/scala/org/apache/sysml/api/dl/Utils.scala ---------------------------------------------------------------------- diff --git a/src/main/scala/org/apache/sysml/api/dl/Utils.scala b/src/main/scala/org/apache/sysml/api/dl/Utils.scala index 5939cf1..e771ed1 100644 --- a/src/main/scala/org/apache/sysml/api/dl/Utils.scala +++ b/src/main/scala/org/apache/sysml/api/dl/Utils.scala @@ -171,6 +171,7 @@ object Utils { ml.execute(script) } + // TODO: Loading of extra weights is not supported def readCaffeNet(net: CaffeNetwork, netFilePath: String, weightsFilePath: String, inputVariables: java.util.HashMap[String, MatrixBlock]): NetParameter = { // Load network val reader: InputStreamReader = getInputStreamReader(netFilePath);
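[Editor's note, not part of the patch] As a quick illustration of the new layer's API as defined in scripts/nn/layers/low_rank_affine.dml, the minimal DML sketch below mirrors what the gradient check above exercises; the sizes N, D, M, and R are arbitrary example values.

  source("nn/layers/low_rank_affine.dml") as low_rank_affine

  N = 64    # number of examples (example value)
  D = 1024  # number of input features (example value)
  M = 512   # number of neurons (example value)
  R = 16    # rank; stores D*R + R*M = 24,576 weight values instead of D*M = 524,288 for a full affine layer

  X = rand(rows=N, cols=D)

  # Initialize the two weight factors and the bias
  [U, V, b] = low_rank_affine::init(D, M, R)

  # Forward pass: out = X %*% U %*% V + b, of shape (N, M)
  out = low_rank_affine::forward(X, U, V, b)
  print("sum(out) = " + sum(out))

  # Backward pass, given an upstream gradient dout of shape (N, M)
  dout = rand(rows=N, cols=M)
  [dX, dU, dV, db] = low_rank_affine::backward(dout, X, U, V, b)

On the Caffe2DML side, the low-rank path is selected per layer: when the new `rank` field of InnerProductParameter is set to a value greater than zero, the generated script sources low_rank_affine instead of affine, and the second factor is carried as an extra weight matrix (named weight + "_extra") with its own gradient, regularization, and solver state, as shown in the CaffeLayer and CaffeSolver changes above.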