Repository: madlib
Updated Branches:
  refs/heads/master 5ab573bec -> 92bdf8cab


MLP: Simplify momentum and Nesterov updates

JIRA: MADLIB-1272

Momentum updates are complicated due to Nesterov requiring an initial
update before gradient calculations. There is, however, a different form
of the Nesterov update that can be cleanly performed after the regular
update, simplifying the code. This allows performing the gradient
calculations before any update - with or without Nesterov.

Closes #313


Project: http://git-wip-us.apache.org/repos/asf/madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/92bdf8ca
Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/92bdf8ca
Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/92bdf8ca

Branch: refs/heads/master
Commit: 92bdf8cab087472da1b2962f4ce51dc20255f6ba
Parents: 5ab573b
Author: Rahul Iyer <[email protected]>
Authored: Fri Aug 17 01:42:53 2018 -0700
Committer: Rahul Iyer <[email protected]>
Committed: Wed Aug 29 10:31:08 2018 -0700

----------------------------------------------------------------------
 src/modules/convex/task/mlp.hpp   | 53 +++++++++++++++++++++-------------
 src/modules/convex/type/model.hpp | 44 ++++++----------------------
 2 files changed, 42 insertions(+), 55 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/madlib/blob/92bdf8ca/src/modules/convex/task/mlp.hpp
----------------------------------------------------------------------
diff --git a/src/modules/convex/task/mlp.hpp b/src/modules/convex/task/mlp.hpp
index 3915ab1..b772549 100644
--- a/src/modules/convex/task/mlp.hpp
+++ b/src/modules/convex/task/mlp.hpp
@@ -158,9 +158,6 @@ MLP<Model, Tuple>::getLossAndUpdateModel(
         const double         &stepsize) {
 
     double total_loss = 0.;
-    // model is updated with the momentum step (i.e. velocity vector)
-    // if Nesterov Accelerated Gradient is enabled
-    model.nesterovUpdatePosition();
 
     // initialize gradient vector
     std::vector<Matrix> total_gradient_per_layer(model.num_layers);
@@ -188,22 +185,37 @@ MLP<Model, Tuple>::getLossAndUpdateModel(
         total_loss += getLoss(y_true, o.back(), model.is_classification);
     }
 
-    // convert gradient to a gradient update vector
-    //  1. normalize to per row update
-    //  2. discount by stepsize
-    //  3. add regularization
-    //  4. make negative
     for (Index k=0; k < model.num_layers; k++){
+        // convert gradient to a gradient update vector
+        //  1. normalize to per row update
+        //  2. discount by stepsize
+        //  3. add regularization
+        //  4. make negative for descent
         Matrix regularization = MLP<Model, Tuple>::lambda * model.u[k];
         regularization.row(0).setZero(); // Do not update bias
-        total_gradient_per_layer[k] = -stepsize * (total_gradient_per_layer[k] / static_cast<double>(num_rows_in_batch) +
-                                                  regularization);
-        model.updateVelocity(total_gradient_per_layer[k], k);
-        model.updatePosition(total_gradient_per_layer[k], k);
+        total_gradient_per_layer[k] = -stepsize *
+            (total_gradient_per_layer[k] / static_cast<double>(num_rows_in_batch) +
+             regularization);
+
+        // total_gradient_per_layer is now the update vector
+        if (model.momentum > 0){
+            model.velocity[k] = model.momentum * model.velocity[k] + total_gradient_per_layer[k];
+            if (model.is_nesterov){
+                // Below equation ensures that Nesterov updates are half step
+                // ahead of regular momentum updates i.e. next step's discounted
+                // velocity update is already added in the current step.
+                model.u[k] += model.momentum * model.velocity[k] + total_gradient_per_layer[k];
+            }
+            else{
+                model.u[k] += model.velocity[k];
+            }
+        } else {
+            // no momentum
+            model.u[k] += total_gradient_per_layer[k];
+        }
     }
 
     return total_loss;
-
 }
 
 
@@ -215,8 +227,6 @@ MLP<Model, Tuple>::gradientInPlace(
         const dependent_variable_type       &y_true,
         const double                        &stepsize)
 {
-    model.nesterovUpdatePosition();
-
     std::vector<ColumnVector> net, o, delta;
 
     feedForward(model, x, net, o);
@@ -225,15 +235,18 @@ MLP<Model, Tuple>::gradientInPlace(
     for (Index k=0; k < model.num_layers; k++){
         Matrix regularization = MLP<Model, Tuple>::lambda*model.u[k];
         regularization.row(0).setZero(); // Do not update bias
+
         if (model.momentum > 0){
             Matrix gradient = -stepsize * (o[k] * delta[k].transpose() + regularization);
-            model.updateVelocity(gradient, k);
-            model.updatePosition(gradient, k);
+            model.velocity[k] = model.momentum * model.velocity[k] + gradient;
+            if (model.is_nesterov)
+                model.u[k] += model.momentum * model.velocity[k] + gradient;
+            else
+                model.u[k] += model.velocity[k];
         }
         else {
-            // Updating model inline instead of using updatePosition because
-            // we suspect that updatePosition ends up creating a copy of the
-            // gradient even if it is passed by reference and hence making it slower.
+            // Updating model inline as a special case to avoid a copy of the
+            // gradient matrix to velocity.
             model.u[k] -= stepsize * (o[k] * delta[k].transpose() + regularization);
         }
     }

http://git-wip-us.apache.org/repos/asf/madlib/blob/92bdf8ca/src/modules/convex/type/model.hpp
----------------------------------------------------------------------
diff --git a/src/modules/convex/type/model.hpp b/src/modules/convex/type/model.hpp
index 440e384..8f74d47 100644
--- a/src/modules/convex/type/model.hpp
+++ b/src/modules/convex/type/model.hpp
@@ -152,12 +152,12 @@ struct MLPModel {
     }
 
     size_t rebind(const double *is_classification_in,
-                    const double *activation_in,
-                    const double *momentum_in,
-                    const double *is_nesterov_in,
-                    const double *data,
-                    const uint16_t &inNumberOfStages,
-                    const double *inNumbersOfUnits) {
+                  const double *activation_in,
+                  const double *momentum_in,
+                  const double *is_nesterov_in,
+                  const double *data,
+                  const uint16_t &inNumberOfStages,
+                  const double *inNumbersOfUnits) {
         const double *n = inNumbersOfUnits;
         size_t k;
 
@@ -178,7 +178,8 @@ struct MLPModel {
         }
         for (k = 0; k < num_layers; k ++) {
             velocity.push_back(MutableMappedMatrix());
-            velocity[k].rebind(const_cast<double *>(data + sizeOfU), n[k] + 1, n[k+1]);
+            velocity[k].rebind(const_cast<double *>(data + sizeOfU),
+                               n[k] + 1, n[k+1]);
             sizeOfU += (n[k] + 1) * (n[k+1]);
         }
         return sizeOfU;
@@ -187,7 +188,6 @@ struct MLPModel {
     void initialize(const uint16_t &inNumberOfStages,
                     const double *inNumbersOfUnits) {
         num_layers = inNumberOfStages;
-
         for (size_t k =0; k < num_layers; ++k){
             // Initalize according to Glorot and Bengio (2010)
             // See design doc for more info
@@ -197,31 +197,6 @@ struct MLPModel {
         }
     }
 
-    void updateVelocity(const Matrix &gradient, const Index layer_index){
-        if (momentum > 0.){
-            // if momentum is enabled
-            velocity[layer_index] = momentum * velocity[layer_index] + gradient;
-        }
-    }
-
-    void updatePosition(const Matrix &gradient, const Index layer_index){
-        if (momentum > 0 and not is_nesterov){
-            u[layer_index] += velocity[layer_index];
-        }
-        else {
-            // update is same for non momentum and nesterov
-            u[layer_index] += gradient;
-        }
-    }
-
-    void nesterovUpdatePosition(){
-        if (momentum > 0 and is_nesterov){
-            for (size_t k = 0; k < u.size(); k++){
-                u[k] += momentum * velocity[k];
-            }
-        }
-    }
-
     double norm() const {
         double norm = 0.;
         size_t k;
@@ -243,8 +218,7 @@ struct MLPModel {
      *  Some operator wrappers for u.
      */
     MLPModel& operator*=(const double &c) {
-        // Note that when scaling the model, you should
-        // not update the bias.
+        // Note that when scaling the model, don't update the bias.
         size_t k;
         for (k = 0; k < u.size(); k ++) {
            u[k] *= c;

Reply via email to