fmcquillan99 commented on a change in pull request #564:
URL: https://github.com/apache/madlib/pull/564#discussion_r621288731
##########
File path: src/modules/convex/task/mlp.hpp
##########
@@ -218,6 +237,86 @@ MLP<Model, Tuple>::getLossAndUpdateModel(
     return total_loss;
 }
 
+template <class Model, class Tuple>
+double
+MLP<Model, Tuple>::getLossAndUpdateModelALR(
+        model_type &model,
+        const Matrix &x_batch,
+        const Matrix &y_true_batch,
+        const double &stepsize,
+        const int &opt_code,
+        const double &gamma,
+        std::vector<Matrix> &sqrs,
+        const double &beta1,
+        const double &beta2,
+        std::vector<Matrix> &vs,
+        const int &t) {
+
+    double total_loss = 0.;
+    const double EPS_STABLE = 1.e-18;
+
+    // initialize gradient vector
+    std::vector<Matrix> total_gradient_per_layer(model.num_layers);
+    Matrix g, v_bias_corr, sqr_bias_corr;
+    for (Index k=0; k < model.num_layers; ++k) {
+        total_gradient_per_layer[k] = Matrix::Zero(model.u[k].rows(),
+                                                   model.u[k].cols());
+    }
+
+    std::vector<ColumnVector> net, o, delta;
+    Index num_rows_in_batch = x_batch.rows();
+
+    for (Index i=0; i < num_rows_in_batch; i++){
+        // gradient and loss added over the batch
+        ColumnVector x = x_batch.row(i);
+        ColumnVector y_true = y_true_batch.row(i);
+
+        feedForward(model, x, net, o);
+        backPropogate(y_true, o.back(), net, model, delta);
+
+        // compute the gradient
+        for (Index k=0; k < model.num_layers; k++){
+            total_gradient_per_layer[k] += o[k] * delta[k].transpose();
+        }
+
+        // compute the loss
+        total_loss += getLoss(y_true, o.back(), model.is_classification);
+    }
+
+    for (Index k=0; k < model.num_layers; k++){
+        // convert gradient to a gradient update vector
+        //  1. normalize to per row update
+        //  2. discount by stepsize
+        //  3. add regularization
+        Matrix regularization = MLP<Model, Tuple>::lambda * model.u[k];
+        regularization.row(0).setZero(); // Do not update bias
+
+        g = total_gradient_per_layer[k] / static_cast<double>(num_rows_in_batch);
+        if (opt_code == IS_RMSPROP){
+
+            sqrs[k] = gamma * sqrs[k] + (1.0 - gamma) * square(g);
+            total_gradient_per_layer[k] = (-stepsize * g).array() /
+                                          (sqrs[k].array() + EPS_STABLE).sqrt();

Review comment:
   I think option 1 is better too; it is less confusing for the user. My Coursera notes from Andrew Ng say that epsilon is mainly there to prevent a `/0` and has little to do with convergence, but Nikhil found a blog post that did relate epsilon to performance. In any case, a common default value for both RMSprop and Adam is preferable in my opinion. Also, most other packages expose epsilon as a user-settable parameter rather than hard-coding it, so I think we should as well.
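   As a point of reference (not the PR's actual implementation), here is a minimal, standalone sketch of what it could look like to take epsilon as a caller-supplied parameter with one shared default for both optimizers, instead of the hard-coded `EPS_STABLE`. The function names `rmspropUpdate`/`adamUpdate`, the `1e-7` default, and the plain `Eigen::MatrixXd` types are illustrative assumptions; the epsilon placement follows the common textbook forms (inside the square root for RMSprop, as in the diff above, and outside it for the bias-corrected Adam step).

```cpp
// Sketch only: epsilon exposed as a user-settable parameter with a shared
// default for RMSprop and Adam, rather than a hard-coded constant.
#include <cmath>
#include <Eigen/Dense>

using Eigen::MatrixXd;

// RMSprop: decaying average of squared gradients; eps keeps the denominator
// away from zero (placed inside the sqrt, matching the diff above).
MatrixXd rmspropUpdate(const MatrixXd &g, MatrixXd &sqr,
                       double stepsize, double gamma,
                       double eps = 1e-7) {          // hypothetical shared default
    sqr = gamma * sqr + (1.0 - gamma) * g.array().square().matrix();
    return ((-stepsize * g).array() / (sqr.array() + eps).sqrt()).matrix();
}

// Adam: bias-corrected first and second moments; the same user-facing eps is
// reused so both optimizers share one default (textbook Adam adds eps outside
// the sqrt).
MatrixXd adamUpdate(const MatrixXd &g, MatrixXd &v, MatrixXd &sqr,
                    double stepsize, double beta1, double beta2, int t,
                    double eps = 1e-7) {             // hypothetical shared default
    v = beta1 * v + (1.0 - beta1) * g;
    sqr = beta2 * sqr + (1.0 - beta2) * g.array().square().matrix();
    const MatrixXd v_hat = v / (1.0 - std::pow(beta1, t));
    const MatrixXd sqr_hat = sqr / (1.0 - std::pow(beta2, t));
    return (-stepsize * v_hat.array() / (sqr_hat.array().sqrt() + eps)).matrix();
}

int main() {
    MatrixXd g = MatrixXd::Random(3, 3);   // stand-in for a per-layer gradient
    MatrixXd sqr_rms = MatrixXd::Zero(3, 3);
    MatrixXd v = MatrixXd::Zero(3, 3), sqr_adam = MatrixXd::Zero(3, 3);
    MatrixXd step_rms  = rmspropUpdate(g, sqr_rms, 0.01, 0.9);             // default eps
    MatrixXd step_adam = adamUpdate(g, v, sqr_adam, 0.01, 0.9, 0.999, 1);  // default eps
    return 0;
}
```

   In the task code itself this would presumably translate to an extra epsilon argument on `getLossAndUpdateModelALR`, plumbed through from the user-facing optimizer parameters, with the same default applied to both the `IS_RMSPROP` branch and the Adam branch.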