[GitHub] madlib pull request #272: MLP: Add momentum and nesterov to gradient updates...
Github user njayaram2 commented on a diff in the pull request:

    https://github.com/apache/madlib/pull/272#discussion_r191575057

    --- Diff: src/ports/postgres/modules/convex/mlp_igd.py_in ---
    @@ -1781,7 +1799,7 @@ class MLPMinibatchPreProcessor:
                 summary_table_columns = summary_table_columns[0]

             required_columns = (self.DEPENDENT_VARNAME, self.INDEPENDENT_VARNAME,
    -                            self.CLASS_VALUES, self.GROUPING_COL)
    +                            self.CLASS_VALUES, self.GROUPING_COL, self.DEPENDENT_VARTYPE)
             if set(required_columns) <= set(summary_table_columns):
                 self.preprocessed_summary_dict = summary_table_columns
             else:
    --- End diff --

    Can you please update the optimizer params in online docs?

---
[GitHub] madlib pull request #272: MLP: Add momentum and nesterov to gradient updates...
Github user njayaram2 commented on a diff in the pull request:

    https://github.com/apache/madlib/pull/272#discussion_r191533727

    --- Diff: src/modules/convex/type/model.hpp ---
    @@ -126,45 +129,96 @@ struct MLPModel {
             for (k = 0; k < N; k ++) {
                 size += static_cast<uint32_t>((n[k] + 1) * (n[k+1]));
             }
    +        //TODO conditionally assign size based on momentum
    --- End diff --

    Is this TODO still valid?

---
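For context on the TODO quoted above, here is a hypothetical sketch, not the actual MADlib code, of what a momentum-conditional size computation could look like: when momentum is enabled, the flat model state also has to hold one velocity matrix per layer, so the computed size roughly doubles. The helper name modelArraySize and the uint32_t return type are illustrative assumptions.

    #include <cstdint>
    #include <vector>

    // Hypothetical sketch: n[k] is the number of units in layer k, so each of
    // the N weight matrices is (n[k] + 1) x n[k+1] including the bias row.
    // With momentum enabled, the same amount of space is reserved again for
    // the per-layer velocity matrices.
    static uint32_t modelArraySize(const std::vector<uint32_t> &n, double momentum) {
        uint32_t size = 0;
        const size_t N = n.size() - 1;          // number of weight matrices
        for (size_t k = 0; k < N; ++k) {
            size += static_cast<uint32_t>((n[k] + 1) * n[k + 1]);
        }
        return momentum > 0 ? 2 * size : size;  // weights only vs. weights + velocities
    }

---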
[GitHub] madlib pull request #272: MLP: Add momentum and nesterov to gradient updates...
Github user njayaram2 commented on a diff in the pull request:

    https://github.com/apache/madlib/pull/272#discussion_r191574947

    --- Diff: src/ports/postgres/modules/convex/mlp.sql_in ---
    @@ -1474,13 +1480,15 @@ CREATE AGGREGATE MADLIB_SCHEMA.mlp_minibatch_step(
         /* warm_start_coeff */    DOUBLE PRECISION[],
         /* lambda */              DOUBLE PRECISION,
         /* batch_size */          INTEGER,
    -    /* n_epochs */            INTEGER
    +    /* n_epochs */            INTEGER,
    +    /* momentum */            DOUBLE PRECISION,
    +    /* is_nesterov */         BOOLEAN
     )(
         STYPE=DOUBLE PRECISION[],
         SFUNC=MADLIB_SCHEMA.mlp_minibatch_transition,
         m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.mlp_minibatch_merge,')
         FINALFUNC=MADLIB_SCHEMA.mlp_minibatch_final,
    -    INITCOND='{0,0,0,0,0,0,0,0,0,0,0,0}'
    +    INITCOND='{0,0,0,0,0,0,0,0,0,0,0,0,0,0}'
     );
    -
    --- End diff --

    Can you please update the user docs with momentum and nesterov related optimizer params? Is it also a good time to update the MLP design doc?

---
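Regarding the design-doc question above, the update rules behind the new momentum and is_nesterov arguments can be stated compactly. A sketch in generic notation (step size $\eta$, momentum coefficient $\mu$, per-layer weights $u$, velocity $v$, loss $\ell$; the design doc's own symbols may differ):

    \[ v \leftarrow \mu\, v - \eta\, \nabla_u \ell(u), \qquad u \leftarrow u + v \]

for classical momentum, and

    \[ v \leftarrow \mu\, v - \eta\, \nabla_u \ell(u + \mu\, v), \qquad u \leftarrow u + v \]

for Nesterov accelerated gradient, where the gradient is evaluated at the look-ahead position $u + \mu v$.

---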
[GitHub] madlib pull request #272: MLP: Add momentum and nesterov to gradient updates...
Github user njayaram2 commented on a diff in the pull request:

    https://github.com/apache/madlib/pull/272#discussion_r191539168

    --- Diff: src/modules/convex/task/mlp.hpp ---
    @@ -197,6 +244,7 @@ MLP<Model, Tuple>::loss(
             const model_type                    &model,
             const independent_variables_type    &x,
             const dependent_variable_type       &y_true) {
    +
    --- End diff --

    The loss computation code in this function is duplicated in the `getLoss()` function. Can we call that function to remove the code duplication?

---
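To illustrate the de-duplication suggested above, a minimal sketch assuming Eigen, with a free-standing stand-in for the member function `getLoss()`; the names and signature are illustrative rather than the exact MADlib ones:

    #include <Eigen/Dense>

    using ColumnVector = Eigen::VectorXd;

    // Stand-in for getLoss(): cross-entropy for classification, half squared
    // error for regression, mirroring the inline loss code removed from
    // getLossAndUpdateModel().
    static double getLoss(const ColumnVector &y_true,
                          const ColumnVector &y_estimated,
                          bool is_classification) {
        if (is_classification) {
            const double clip = 1.e-10;
            // clip predictions away from 0 and 1 so the logs stay finite
            ColumnVector y = y_estimated.cwiseMax(clip).cwiseMin(1. - clip);
            return -(y_true.array() * y.array().log() +
                     (1. - y_true.array()) * (1. - y.array()).log()).sum();
        }
        return 0.5 * (y_estimated - y_true).squaredNorm();
    }

    // loss() would then only run the forward pass and delegate, e.g.:
    //     feedForward(model, x, net, o);
    //     return getLoss(y_true, o.back(), model.is_classification);

---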
[GitHub] madlib pull request #272: MLP: Add momentum and nesterov to gradient updates...
Github user njayaram2 commented on a diff in the pull request:

    https://github.com/apache/madlib/pull/272#discussion_r191537654

    --- Diff: src/modules/convex/task/mlp.hpp ---
    @@ -126,68 +157,84 @@ MLP<Model, Tuple>::getLossAndUpdateModel(
             const Matrix                        &y_true_batch,
             const double                        &stepsize) {
    -    uint16_t num_layers = static_cast<uint16_t>(model.u.size()); // assuming nu. of layers >= 1
    -    Index num_rows_in_batch = x_batch.rows();
         double total_loss = 0.;
    +    // model is updated with the momentum step (i.e. velocity vector)
    +    // if Nesterov Accelerated Gradient is enabled
    +    model.nesterovUpdatePosition();

    -    // gradient added over the batch
    -    std::vector<Matrix> total_gradient_per_layer(num_layers);
    -    for (Index k=0; k < num_layers; ++k)
    +    // initialize gradient vector
    +    std::vector<Matrix> total_gradient_per_layer(model.num_layers);
    +    for (Index k=0; k < model.num_layers; ++k) {
             total_gradient_per_layer[k] = Matrix::Zero(model.u[k].rows(), model.u[k].cols());
    +    }
    +
    +    std::vector<ColumnVector> net, o, delta;
    +    Index num_rows_in_batch = x_batch.rows();

         for (Index i=0; i < num_rows_in_batch; i++){
    +        // gradient and loss added over the batch
             ColumnVector x = x_batch.row(i);
             ColumnVector y_true = y_true_batch.row(i);
    -        std::vector<ColumnVector> net, o, delta;
             feedForward(model, x, net, o);
             backPropogate(y_true, o.back(), net, model, delta);

    -        for (Index k=0; k < num_layers; k++){
    +        // compute the gradient
    +        for (Index k=0; k < model.num_layers; k++){
                 total_gradient_per_layer[k] += o[k] * delta[k].transpose();
             }

    -        // loss computation
    -        ColumnVector y_estimated = o.back();
    -        if(model.is_classification){
    -            double clip = 1.e-10;
    -            y_estimated = y_estimated.cwiseMax(clip).cwiseMin(1.-clip);
    -            total_loss += - (y_true.array()*y_estimated.array().log()
    -                   + (-y_true.array()+1)*(-y_estimated.array()+1).log()).sum();
    -        }
    -        else{
    -            total_loss += 0.5 * (y_estimated - y_true).squaredNorm();
    -        }
    +        // compute the loss
    +        total_loss += getLoss(y_true, o.back(), model.is_classification);
         }
    -    for (Index k=0; k < num_layers; k++){
    +
    +    // convert gradient to a gradient update vector
    +    //  1. normalize to per row update
    +    //  2. discount by stepsize
    +    //  3. add regularization
    +    //  4. make negative
    +    for (Index k=0; k < model.num_layers; k++){
             Matrix regularization = MLP<Model, Tuple>::lambda * model.u[k];
             regularization.row(0).setZero(); // Do not update bias
    -        model.u[k] -=
    -            stepsize *
    -                (total_gradient_per_layer[k] / static_cast<double>(num_rows_in_batch) +
    -                 regularization);
    +        total_gradient_per_layer[k] = -stepsize * (total_gradient_per_layer[k] / static_cast<double>(num_rows_in_batch) +
    +                                                   regularization);
    +        model.updateVelocity(total_gradient_per_layer[k], k);
    +        model.updatePosition(total_gradient_per_layer[k], k);
         }
    +
         return total_loss;
    +
     }

     template <class Model, class Tuple>
     void
     MLP<Model, Tuple>::gradientInPlace(
             model_type                          &model,
             const independent_variables_type    &x,
             const dependent_variable_type       &y_true,
    -        const double                        &stepsize) {
    -    size_t N = model.u.size(); // assuming nu. of layers >= 1
    +        const double                        &stepsize)
    +{
    +    model.nesterovUpdatePosition();
    +
         std::vector<ColumnVector> net, o, delta;
         feedForward(model, x, net, o);
         backPropogate(y_true, o.back(), net, model, delta);

    -    for (size_t k=0; k < N; k++){
    +    for (Index k=0; k < model.num_layers; k++){
             Matrix regularization = MLP<Model, Tuple>::lambda*model.u[k];
             regularization.row(0).setZero(); // Do not update bias
    -        model.u[k] -= stepsize * (o[k] * delta[k].transpose() + regularization);
    +        if (model.momentum > 0){
    +            Matrix gradient = -stepsize * (o[k] * delta[k].transpose() + regularization);
    +            model.updateVelocity(gradient, k);
    +            model.updatePosition(gradient, k);
    +        }
    +        else {
    +            // Updating model inline instead of using updatePosition since
    +            // updatePosition creates a copy of the gradient making it slower.
    --- End diff --

    This comment is a little confusing. We do pass along the gradient by reference to `updatePosition()`. If the
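For readers following the momentum plumbing above, here is a hypothetical sketch of how velocity and position helpers of this kind typically fit together, under the sign convention used in the diff (the gradient handed to them is already negated and scaled by the stepsize). The member names mirror the calls above, but the bodies are illustrative assumptions, not the exact MADlib implementation.

    #include <Eigen/Dense>
    #include <vector>

    using Matrix = Eigen::MatrixXd;

    struct MomentumState {
        std::vector<Matrix> u;         // per-layer weights (bias row included)
        std::vector<Matrix> velocity;  // per-layer velocity, same shape as u[k]
        double momentum;               // mu in [0, 1); 0 disables momentum
        bool is_nesterov;

        // v_k <- mu * v_k + g_k, where g_k = -stepsize * (gradient + regularization)
        void updateVelocity(const Matrix &gradient, size_t k) {
            velocity[k] = momentum * velocity[k] + gradient;
        }

        // Classical momentum steps the weights by the updated velocity. With
        // Nesterov, the mu * v part of the step was already applied up front
        // by nesterovUpdatePosition(), so only the fresh gradient is added.
        void updatePosition(const Matrix &gradient, size_t k) {
            if (is_nesterov) {
                u[k] += gradient;
            } else {
                u[k] += velocity[k];
            }
        }

        // Nesterov look-ahead: move the weights to u + mu * v before
        // feedForward/backPropogate evaluate the gradient there.
        void nesterovUpdatePosition() {
            if (momentum > 0 && is_nesterov) {
                for (size_t k = 0; k < u.size(); ++k) {
                    u[k] += momentum * velocity[k];
                }
            }
        }
    };

With momentum set to zero this degenerates to plain SGD (the velocity just equals the gradient step), matching the inline update that the else branch of gradientInPlace keeps for speed.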