[GitHub] madlib pull request #272: MLP: Add momentum and nesterov to gradient updates...

2018-05-29 Thread njayaram2
Github user njayaram2 commented on a diff in the pull request:

https://github.com/apache/madlib/pull/272#discussion_r191575057
  
--- Diff: src/ports/postgres/modules/convex/mlp_igd.py_in ---
@@ -1781,7 +1799,7 @@ class MLPMinibatchPreProcessor:
 summary_table_columns = summary_table_columns[0]
 
         required_columns = (self.DEPENDENT_VARNAME, self.INDEPENDENT_VARNAME,
-                            self.CLASS_VALUES, self.GROUPING_COL)
+                            self.CLASS_VALUES, self.GROUPING_COL,
+                            self.DEPENDENT_VARTYPE)
 if set(required_columns) <= set(summary_table_columns):
 self.preprocessed_summary_dict = summary_table_columns
 else:
--- End diff --

Can you please update the optimizer params in the online docs?


---


[GitHub] madlib pull request #272: MLP: Add momentum and nesterov to gradient updates...

2018-05-29 Thread njayaram2
Github user njayaram2 commented on a diff in the pull request:

https://github.com/apache/madlib/pull/272#discussion_r191533727
  
--- Diff: src/modules/convex/type/model.hpp ---
@@ -126,45 +129,96 @@ struct MLPModel {
 for (k = 0; k < N; k ++) {
         size += static_cast<uint32_t>((n[k] + 1) * (n[k+1]));
 }
+//TODO conditionally assign size based on momentum
--- End diff --

Is this TODO still valid?
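
For context, a minimal sketch of one plausible reading of that TODO: only
reserve room for the per-layer velocity state when momentum is enabled,
instead of always enlarging the model allocation. The `arraySizeSketch`
name, the `momentum` argument, and the doubling of the coefficient count
are illustrative assumptions, not the PR's actual code:

    #include <cstdint>

    // Hypothetical sketch only: size the model buffer conditionally on
    // momentum, reserving one velocity entry per coefficient when momentum > 0.
    static uint32_t arraySizeSketch(const uint16_t N,   // number of layers
                                    const uint16_t *n,  // units per layer (length N+1)
                                    const double momentum) {
        uint32_t size = 0;
        for (uint16_t k = 0; k < N; k++) {
            // coefficients of layer k, +1 for the bias row
            size += static_cast<uint32_t>((n[k] + 1) * (n[k + 1]));
        }
        if (momentum > 0.) {
            size *= 2;  // assumed: velocity matrices mirror the coefficient layout
        }
        return size;
    }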


---


[GitHub] madlib pull request #272: MLP: Add momentum and nesterov to gradient updates...

2018-05-29 Thread njayaram2
Github user njayaram2 commented on a diff in the pull request:

https://github.com/apache/madlib/pull/272#discussion_r191574947
  
--- Diff: src/ports/postgres/modules/convex/mlp.sql_in ---
@@ -1474,13 +1480,15 @@ CREATE AGGREGATE MADLIB_SCHEMA.mlp_minibatch_step(
     /* warm_start_coeff */    DOUBLE PRECISION[],
     /* lambda */              DOUBLE PRECISION,
     /* batch_size */          INTEGER,
-    /* n_epochs */            INTEGER
+    /* n_epochs */            INTEGER,
+    /* momentum */            DOUBLE PRECISION,
+    /* is_nesterov */         BOOLEAN
 )(
 STYPE=DOUBLE PRECISION[],
 SFUNC=MADLIB_SCHEMA.mlp_minibatch_transition,
     m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.mlp_minibatch_merge,')
 FINALFUNC=MADLIB_SCHEMA.mlp_minibatch_final,
-INITCOND='{0,0,0,0,0,0,0,0,0,0,0,0}'
+INITCOND='{0,0,0,0,0,0,0,0,0,0,0,0,0,0}'
 );
 -
 
--- End diff --

Can you please update the user docs with the momentum- and nesterov-related
optimizer params? Is this also a good time to update the MLP design doc?


---


[GitHub] madlib pull request #272: MLP: Add momentum and nesterov to gradient updates...

2018-05-29 Thread njayaram2
Github user njayaram2 commented on a diff in the pull request:

https://github.com/apache/madlib/pull/272#discussion_r191539168
  
--- Diff: src/modules/convex/task/mlp.hpp ---
@@ -197,6 +244,7 @@ MLP<Model, Tuple>::loss(
         const model_type                    &model,
         const independent_variables_type    &x,
         const dependent_variable_type       &y_true) {
+
--- End diff --

The loss computation code in this function is duplicated in the `getLoss()`
function. Can we call that function to remove the code duplication?
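
A rough sketch of that suggestion, assuming `getLoss()` takes
`(y_true, y_estimated, is_classification)` as in the
`getLossAndUpdateModel()` diff elsewhere in this PR; the parameter names
and body here are an approximation, not the PR's exact code:

    // Sketch: loss() delegating to getLoss() instead of duplicating the
    // cross-entropy / squared-error branches.
    template <class Model, class Tuple>
    double
    MLP<Model, Tuple>::loss(
            const model_type                    &model,
            const independent_variables_type    &x,
            const dependent_variable_type       &y_true) {
        std::vector<ColumnVector> net, o;
        feedForward(model, x, net, o);
        // Reuse the shared helper for the actual loss value.
        return getLoss(y_true, o.back(), model.is_classification);
    }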


---


[GitHub] madlib pull request #272: MLP: Add momentum and nesterov to gradient updates...

2018-05-29 Thread njayaram2
Github user njayaram2 commented on a diff in the pull request:

https://github.com/apache/madlib/pull/272#discussion_r191537654
  
--- Diff: src/modules/convex/task/mlp.hpp ---
@@ -126,68 +157,84 @@ MLP<Model, Tuple>::getLossAndUpdateModel(
         const Matrix                        &y_true_batch,
         const double                        &stepsize) {
 
-    uint16_t num_layers = static_cast<uint16_t>(model.u.size()); // assuming nu. of layers >= 1
-    Index num_rows_in_batch = x_batch.rows();
     double total_loss = 0.;
+    // model is updated with the momentum step (i.e. velocity vector)
+    // if Nesterov Accelerated Gradient is enabled
+    model.nesterovUpdatePosition();
 
-    // gradient added over the batch
-    std::vector<Matrix> total_gradient_per_layer(num_layers);
-    for (Index k=0; k < num_layers; ++k)
+    // initialize gradient vector
+    std::vector<Matrix> total_gradient_per_layer(model.num_layers);
+    for (Index k=0; k < model.num_layers; ++k) {
         total_gradient_per_layer[k] = Matrix::Zero(model.u[k].rows(), model.u[k].cols());
+    }
 
+    std::vector<ColumnVector> net, o, delta;
+    Index num_rows_in_batch = x_batch.rows();
     for (Index i=0; i < num_rows_in_batch; i++){
+        // gradient and loss added over the batch
         ColumnVector x = x_batch.row(i);
         ColumnVector y_true = y_true_batch.row(i);
 
-        std::vector<ColumnVector> net, o, delta;
         feedForward(model, x, net, o);
         backPropogate(y_true, o.back(), net, model, delta);
 
-        for (Index k=0; k < num_layers; k++){
+        // compute the gradient
+        for (Index k=0; k < model.num_layers; k++){
             total_gradient_per_layer[k] += o[k] * delta[k].transpose();
         }
 
-        // loss computation
-        ColumnVector y_estimated = o.back();
-        if(model.is_classification){
-            double clip = 1.e-10;
-            y_estimated = y_estimated.cwiseMax(clip).cwiseMin(1.-clip);
-            total_loss += - (y_true.array()*y_estimated.array().log()
-                   + (-y_true.array()+1)*(-y_estimated.array()+1).log()).sum();
-        }
-        else{
-            total_loss += 0.5 * (y_estimated - y_true).squaredNorm();
-        }
+        // compute the loss
+        total_loss += getLoss(y_true, o.back(), model.is_classification);
     }
-    for (Index k=0; k < num_layers; k++){
+
+    // convert gradient to a gradient update vector
+    //  1. normalize to per row update
+    //  2. discount by stepsize
+    //  3. add regularization
+    //  4. make negative
+    for (Index k=0; k < model.num_layers; k++){
         Matrix regularization = MLP<Model, Tuple>::lambda * model.u[k];
         regularization.row(0).setZero(); // Do not update bias
-        model.u[k] -=
-            stepsize *
-            (total_gradient_per_layer[k] / static_cast<double>(num_rows_in_batch) +
-             regularization);
+        total_gradient_per_layer[k] = -stepsize * (total_gradient_per_layer[k] / static_cast<double>(num_rows_in_batch) +
+                                                   regularization);
+        model.updateVelocity(total_gradient_per_layer[k], k);
+        model.updatePosition(total_gradient_per_layer[k], k);
     }
+
     return total_loss;
+
 }
 
+
 template <class Model, class Tuple>
 void
 MLP<Model, Tuple>::gradientInPlace(
         model_type                          &model,
         const independent_variables_type    &x,
         const dependent_variable_type       &y_true,
-        const double                        &stepsize) {
-    size_t N = model.u.size(); // assuming nu. of layers >= 1
+        const double                        &stepsize)
+{
+    model.nesterovUpdatePosition();
+
     std::vector<ColumnVector> net, o, delta;
 
     feedForward(model, x, net, o);
     backPropogate(y_true, o.back(), net, model, delta);
 
-    for (size_t k=0; k < N; k++){
+    for (Index k=0; k < model.num_layers; k++){
         Matrix regularization = MLP<Model, Tuple>::lambda*model.u[k];
         regularization.row(0).setZero(); // Do not update bias
-        model.u[k] -= stepsize * (o[k] * delta[k].transpose() + regularization);
+        if (model.momentum > 0){
+            Matrix gradient = -stepsize * (o[k] * delta[k].transpose() + regularization);
+            model.updateVelocity(gradient, k);
+            model.updatePosition(gradient, k);
+        }
+        else {
+            // Updating model inline instead of using updatePosition since
+            // updatePosition creates a copy of the gradient making it slower.
--- End diff --

This comment is a little confusing. We do pass along the gradient by 
reference to `updatePosition()`. If the
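
For readers following the diff above, here is a minimal self-contained
sketch of the velocity/position bookkeeping it relies on. The field and
method names mirror the diff (`u`, `momentum`, `is_nesterov`,
`nesterovUpdatePosition`, `updateVelocity`, `updatePosition`), but the
bodies illustrate standard momentum / Nesterov accelerated gradient
updates and are an assumption, not the PR's exact implementation;
`Eigen::MatrixXd` stands in for MADlib's `Matrix` type, and `velocity`
is an assumed per-layer velocity buffer:

    #include <cstddef>
    #include <vector>
    #include <Eigen/Dense>

    using Matrix = Eigen::MatrixXd;

    // Illustrative sketch only, not the PR's exact code.
    struct MomentumSketch {
        std::vector<Matrix> u;         // per-layer coefficients
        std::vector<Matrix> velocity;  // per-layer velocity, same shapes as u
        double momentum = 0.9;
        bool is_nesterov = true;

        // Look-ahead step, taken once before the gradient pass when NAG is on.
        void nesterovUpdatePosition() {
            if (momentum > 0. && is_nesterov)
                for (std::size_t k = 0; k < u.size(); k++)
                    u[k] += momentum * velocity[k];
        }

        // `gradient` is already -stepsize * (grad + regularization), as in the diff.
        void updateVelocity(const Matrix &gradient, std::size_t k) {
            if (momentum > 0.)
                velocity[k] = momentum * velocity[k] + gradient;
        }

        void updatePosition(const Matrix &gradient, std::size_t k) {
            if (momentum > 0. && !is_nesterov)
                u[k] += velocity[k];  // classical momentum: move along the velocity
            else
                u[k] += gradient;     // NAG (look-ahead already applied) or plain SGD
        }
    };

With this convention, the mini-batch path in the diff calls
nesterovUpdatePosition() once before the gradient pass and then
updateVelocity()/updatePosition() per layer with the already-negated,
stepsize-scaled gradient, matching the call order shown above.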