fmcquillan99 commented on a change in pull request #564:
URL: https://github.com/apache/madlib/pull/564#discussion_r621288731



##########
File path: src/modules/convex/task/mlp.hpp
##########
@@ -218,6 +237,86 @@ MLP<Model, Tuple>::getLossAndUpdateModel(
     return total_loss;
 }
 
+template <class Model, class Tuple>
+double
+MLP<Model, Tuple>::getLossAndUpdateModelALR(
+        model_type              &model,
+        const Matrix            &x_batch,
+        const Matrix            &y_true_batch,
+        const double            &stepsize,
+        const int               &opt_code,
+        const double            &gamma,
+        std::vector<Matrix>     &sqrs,
+        const double            &beta1,
+        const double            &beta2,
+        std::vector<Matrix>     &vs,
+        const int               &t) {
+
+    double total_loss = 0.;
+    const double EPS_STABLE = 1.e-18;
+
+    // initialize gradient vector
+    std::vector<Matrix> total_gradient_per_layer(model.num_layers);
+    Matrix g, v_bias_corr, sqr_bias_corr;
+    for (Index k=0; k < model.num_layers; ++k) {
+        total_gradient_per_layer[k] = Matrix::Zero(model.u[k].rows(),
+                                                   model.u[k].cols());
+    }
+
+    std::vector<ColumnVector> net, o, delta;
+    Index num_rows_in_batch = x_batch.rows();
+
+    for (Index i=0; i < num_rows_in_batch; i++){
+        // gradient and loss added over the batch
+        ColumnVector x = x_batch.row(i);
+        ColumnVector y_true = y_true_batch.row(i);
+
+        feedForward(model, x, net, o);
+        backPropogate(y_true, o.back(), net, model, delta);
+
+        // compute the gradient
+        for (Index k=0; k < model.num_layers; k++){
+                total_gradient_per_layer[k] += o[k] * delta[k].transpose();
+        }
+
+        // compute the loss
+        total_loss += getLoss(y_true, o.back(), model.is_classification);
+    }
+
+    for (Index k=0; k < model.num_layers; k++){
+        // convert gradient to a gradient update vector
+        //  1. normalize to per row update
+        //  2. discount by stepsize
+        //  3. add regularization
+        Matrix regularization = MLP<Model, Tuple>::lambda * model.u[k];
+        regularization.row(0).setZero(); // Do not update bias
+
+        g = total_gradient_per_layer[k] / static_cast<double>(num_rows_in_batch);
+        if (opt_code == IS_RMSPROP){
+
+            sqrs[k] = gamma * sqrs[k] + (1.0 - gamma) * square(g);
+            total_gradient_per_layer[k] = (-stepsize * g).array() /
+                                          (sqrs[k].array() + EPS_STABLE).sqrt();
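
For orientation, here is a standalone Eigen sketch (hypothetical names, not the MADlib code itself) of the two updates involved here: the RMSprop step visible in the hunk, which adds the stabilizer inside the square root, and a standard Adam step with bias correction for comparison. The Adam branch is not part of the quoted hunk, so the second function is only an assumption about how the beta1, beta2, vs and t parameters declared in the signature above are typically used.

    #include <Eigen/Dense>
    #include <cmath>

    // Standalone sketch only (hypothetical names, not the MADlib code above).

    // RMSprop: running average of squared gradients; eps is added inside the
    // square root, as with EPS_STABLE in the hunk.
    Eigen::MatrixXd rmspropStep(const Eigen::MatrixXd &g, Eigen::MatrixXd &sqr,
                                double stepsize, double gamma, double eps) {
        sqr = gamma * sqr + (1.0 - gamma) * g.array().square().matrix();
        return ((-stepsize * g).array() / (sqr.array() + eps).sqrt()).matrix();
    }

    // Adam (assumed usage, for comparison): bias-corrected first and second
    // moments; eps sits outside the square root in the usual formulation.
    Eigen::MatrixXd adamStep(const Eigen::MatrixXd &g, Eigen::MatrixXd &v,
                             Eigen::MatrixXd &sqr, int t,
                             double stepsize, double beta1, double beta2,
                             double eps) {
        v   = beta1 * v   + (1.0 - beta1) * g;
        sqr = beta2 * sqr + (1.0 - beta2) * g.array().square().matrix();
        Eigen::MatrixXd v_hat   = v   / (1.0 - std::pow(beta1, t));
        Eigen::MatrixXd sqr_hat = sqr / (1.0 - std::pow(beta2, t));
        return ((-stepsize * v_hat).array() / (sqr_hat.array().sqrt() + eps)).matrix();
    }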

Review comment:
   I think option 1 is better too; it is less confusing for the user. My Coursera notes from Andrew Ng say epsilon is mainly there to prevent division by zero (`/0`) and is less related to convergence, but Nikhil found a blog post that did relate epsilon to performance. At any rate, a common default value for both RMSprop and Adam is preferable in my opinion.

   Also, most other packages expose epsilon as a user-settable parameter, so I think we ought to as well, instead of hard-coding it.
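
   A sketch of what that could look like (the names and the default value are assumptions, not taken from this PR or the actual MADlib interface): epsilon passed down with the other optimizer parameters and given one shared default for both methods.

    // Sketch only: epsilon as a user-settable optimizer parameter with one
    // shared default, instead of the hard-coded EPS_STABLE constant.
    // Names and the default are assumptions; common library defaults are in
    // the 1e-8 to 1e-7 range.
    struct AdaptiveLROptions {
        double stepsize = 0.01;
        double gamma    = 0.9;    // RMSprop decay rate
        double beta1    = 0.9;    // Adam first-moment decay rate
        double beta2    = 0.999;  // Adam second-moment decay rate
        double epsilon  = 1.0e-8; // assumed shared default for both optimizers
    };
    // Both branches would then read options.epsilon where they currently use
    // EPS_STABLE, and the user-facing optimizer parameters could surface it
    // alongside stepsize, gamma, beta1 and beta2.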




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

