Repository: madlib Updated Branches: refs/heads/master 5ab573bec -> 92bdf8cab
MLP: Simplify momentum and Nesterov updates JIRA: MADLIB-1272 Momentum updates are complicated due to Nesterov requiring an initial update before gradient calculations. There is, however, a different form of the Nesterov update that can be cleanly performed after the regular update, simplifying the code. This allows performing the gradient calculations before any update - with or without Nesterov. Closes #313 Project: http://git-wip-us.apache.org/repos/asf/madlib/repo Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/92bdf8ca Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/92bdf8ca Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/92bdf8ca Branch: refs/heads/master Commit: 92bdf8cab087472da1b2962f4ce51dc20255f6ba Parents: 5ab573b Author: Rahul Iyer <[email protected]> Authored: Fri Aug 17 01:42:53 2018 -0700 Committer: Rahul Iyer <[email protected]> Committed: Wed Aug 29 10:31:08 2018 -0700 ---------------------------------------------------------------------- src/modules/convex/task/mlp.hpp | 53 +++++++++++++++++++++------------- src/modules/convex/type/model.hpp | 44 ++++++---------------------- 2 files changed, 42 insertions(+), 55 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/madlib/blob/92bdf8ca/src/modules/convex/task/mlp.hpp ---------------------------------------------------------------------- diff --git a/src/modules/convex/task/mlp.hpp b/src/modules/convex/task/mlp.hpp index 3915ab1..b772549 100644 --- a/src/modules/convex/task/mlp.hpp +++ b/src/modules/convex/task/mlp.hpp @@ -158,9 +158,6 @@ MLP<Model, Tuple>::getLossAndUpdateModel( const double &stepsize) { double total_loss = 0.; - // model is updated with the momentum step (i.e. velocity vector) - // if Nesterov Accelerated Gradient is enabled - model.nesterovUpdatePosition(); // initialize gradient vector std::vector<Matrix> total_gradient_per_layer(model.num_layers); @@ -188,22 +185,37 @@ MLP<Model, Tuple>::getLossAndUpdateModel( total_loss += getLoss(y_true, o.back(), model.is_classification); } - // convert gradient to a gradient update vector - // 1. normalize to per row update - // 2. discount by stepsize - // 3. add regularization - // 4. make negative for (Index k=0; k < model.num_layers; k++){ + // convert gradient to a gradient update vector + // 1. normalize to per row update + // 2. discount by stepsize + // 3. add regularization + // 4. make negative for descent Matrix regularization = MLP<Model, Tuple>::lambda * model.u[k]; regularization.row(0).setZero(); // Do not update bias - total_gradient_per_layer[k] = -stepsize * (total_gradient_per_layer[k] / static_cast<double>(num_rows_in_batch) + - regularization); - model.updateVelocity(total_gradient_per_layer[k], k); - model.updatePosition(total_gradient_per_layer[k], k); + total_gradient_per_layer[k] = -stepsize * + (total_gradient_per_layer[k] / static_cast<double>(num_rows_in_batch) + + regularization); + + // total_gradient_per_layer is now the update vector + if (model.momentum > 0){ + model.velocity[k] = model.momentum * model.velocity[k] + total_gradient_per_layer[k]; + if (model.is_nesterov){ + // Below equation ensures that Nesterov updates are half step + // ahead of regular momentum updates i.e. next step's discounted + // velocity update is already added in the current step. + model.u[k] += model.momentum * model.velocity[k] + total_gradient_per_layer[k]; + } + else{ + model.u[k] += model.velocity[k]; + } + } else { + // no momentum + model.u[k] += total_gradient_per_layer[k]; + } } return total_loss; - } @@ -215,8 +227,6 @@ MLP<Model, Tuple>::gradientInPlace( const dependent_variable_type &y_true, const double &stepsize) { - model.nesterovUpdatePosition(); - std::vector<ColumnVector> net, o, delta; feedForward(model, x, net, o); @@ -225,15 +235,18 @@ MLP<Model, Tuple>::gradientInPlace( for (Index k=0; k < model.num_layers; k++){ Matrix regularization = MLP<Model, Tuple>::lambda*model.u[k]; regularization.row(0).setZero(); // Do not update bias + if (model.momentum > 0){ Matrix gradient = -stepsize * (o[k] * delta[k].transpose() + regularization); - model.updateVelocity(gradient, k); - model.updatePosition(gradient, k); + model.velocity[k] = model.momentum * model.velocity[k] + gradient; + if (model.is_nesterov) + model.u[k] += model.momentum * model.velocity[k] + gradient; + else + model.u[k] += model.velocity[k]; } else { - // Updating model inline instead of using updatePosition because - // we suspect that updatePosition ends up creating a copy of the - // gradient even if it is passed by reference and hence making it slower. + // Updating model inline as a special case to avoid a copy of the + // gradient matrix to velocity. model.u[k] -= stepsize * (o[k] * delta[k].transpose() + regularization); } } http://git-wip-us.apache.org/repos/asf/madlib/blob/92bdf8ca/src/modules/convex/type/model.hpp ---------------------------------------------------------------------- diff --git a/src/modules/convex/type/model.hpp b/src/modules/convex/type/model.hpp index 440e384..8f74d47 100644 --- a/src/modules/convex/type/model.hpp +++ b/src/modules/convex/type/model.hpp @@ -152,12 +152,12 @@ struct MLPModel { } size_t rebind(const double *is_classification_in, - const double *activation_in, - const double *momentum_in, - const double *is_nesterov_in, - const double *data, - const uint16_t &inNumberOfStages, - const double *inNumbersOfUnits) { + const double *activation_in, + const double *momentum_in, + const double *is_nesterov_in, + const double *data, + const uint16_t &inNumberOfStages, + const double *inNumbersOfUnits) { const double *n = inNumbersOfUnits; size_t k; @@ -178,7 +178,8 @@ struct MLPModel { } for (k = 0; k < num_layers; k ++) { velocity.push_back(MutableMappedMatrix()); - velocity[k].rebind(const_cast<double *>(data + sizeOfU), n[k] + 1, n[k+1]); + velocity[k].rebind(const_cast<double *>(data + sizeOfU), + n[k] + 1, n[k+1]); sizeOfU += (n[k] + 1) * (n[k+1]); } return sizeOfU; @@ -187,7 +188,6 @@ struct MLPModel { void initialize(const uint16_t &inNumberOfStages, const double *inNumbersOfUnits) { num_layers = inNumberOfStages; - for (size_t k =0; k < num_layers; ++k){ // Initalize according to Glorot and Bengio (2010) // See design doc for more info @@ -197,31 +197,6 @@ struct MLPModel { } } - void updateVelocity(const Matrix &gradient, const Index layer_index){ - if (momentum > 0.){ - // if momentum is enabled - velocity[layer_index] = momentum * velocity[layer_index] + gradient; - } - } - - void updatePosition(const Matrix &gradient, const Index layer_index){ - if (momentum > 0 and not is_nesterov){ - u[layer_index] += velocity[layer_index]; - } - else { - // update is same for non momentum and nesterov - u[layer_index] += gradient; - } - } - - void nesterovUpdatePosition(){ - if (momentum > 0 and is_nesterov){ - for (size_t k = 0; k < u.size(); k++){ - u[k] += momentum * velocity[k]; - } - } - } - double norm() const { double norm = 0.; size_t k; @@ -243,8 +218,7 @@ struct MLPModel { * Some operator wrappers for u. */ MLPModel& operator*=(const double &c) { - // Note that when scaling the model, you should - // not update the bias. + // Note that when scaling the model, don't update the bias. size_t k; for (k = 0; k < u.size(); k ++) { u[k] *= c;
