Github user njayaram2 commented on a diff in the pull request:
https://github.com/apache/madlib/pull/272#discussion_r191537654
--- Diff: src/modules/convex/task/mlp.hpp ---
@@ -126,68 +157,84 @@ MLP<Model, Tuple>::getLossAndUpdateModel(
const Matrix &y_true_batch,
const double &stepsize) {
- uint16_t num_layers = static_cast<uint16_t>(model.u.size()); // assuming nu. of layers >= 1
- Index num_rows_in_batch = x_batch.rows();
double total_loss = 0.;
+ // model is updated with the momentum step (i.e. velocity vector)
+ // if Nesterov Accelerated Gradient is enabled
+ model.nesterovUpdatePosition();
- // gradient added over the batch
- std::vector<Matrix> total_gradient_per_layer(num_layers);
- for (Index k=0; k < num_layers; ++k)
+ // initialize gradient vector
+ std::vector<Matrix> total_gradient_per_layer(model.num_layers);
+ for (Index k=0; k < model.num_layers; ++k) {
total_gradient_per_layer[k] = Matrix::Zero(model.u[k].rows(),
model.u[k].cols());
+ }
+ std::vector<ColumnVector> net, o, delta;
+ Index num_rows_in_batch = x_batch.rows();
for (Index i=0; i < num_rows_in_batch; i++){
+ // gradient and loss added over the batch
ColumnVector x = x_batch.row(i);
ColumnVector y_true = y_true_batch.row(i);
- std::vector<ColumnVector> net, o, delta;
feedForward(model, x, net, o);
backPropogate(y_true, o.back(), net, model, delta);
- for (Index k=0; k < num_layers; k++){
+ // compute the gradient
+ for (Index k=0; k < model.num_layers; k++){
total_gradient_per_layer[k] += o[k] * delta[k].transpose();
}
- // loss computation
- ColumnVector y_estimated = o.back();
- if(model.is_classification){
- double clip = 1.e-10;
- y_estimated = y_estimated.cwiseMax(clip).cwiseMin(1.-clip);
- total_loss += - (y_true.array()*y_estimated.array().log()
- + (-y_true.array()+1)*(-y_estimated.array()+1).log()).sum();
- }
- else{
- total_loss += 0.5 * (y_estimated - y_true).squaredNorm();
- }
+ // compute the loss
+ total_loss += getLoss(y_true, o.back(), model.is_classification);
}
- for (Index k=0; k < num_layers; k++){
+
+ // convert gradient to a gradient update vector
+ // 1. normalize to per row update
+ // 2. discount by stepsize
+ // 3. add regularization
+ // 4. make negative
+ for (Index k=0; k < model.num_layers; k++){
Matrix regularization = MLP<Model, Tuple>::lambda * model.u[k];
regularization.row(0).setZero(); // Do not update bias
- model.u[k] -=
- stepsize *
- (total_gradient_per_layer[k] / static_cast<double>(num_rows_in_batch) +
- regularization);
+ total_gradient_per_layer[k] = -stepsize * (total_gradient_per_layer[k] / static_cast<double>(num_rows_in_batch) +
+ regularization);
+ model.updateVelocity(total_gradient_per_layer[k], k);
+ model.updatePosition(total_gradient_per_layer[k], k);
}
+
return total_loss;
+
}
+
template <class Model, class Tuple>
void
MLP<Model, Tuple>::gradientInPlace(
model_type &model,
const independent_variables_type &x,
const dependent_variable_type &y_true,
- const double &stepsize) {
- size_t N = model.u.size(); // assuming nu. of layers >= 1
+ const double &stepsize)
+{
+ model.nesterovUpdatePosition();
+
std::vector<ColumnVector> net, o, delta;
feedForward(model, x, net, o);
backPropogate(y_true, o.back(), net, model, delta);
- for (size_t k=0; k < N; k++){
+ for (Index k=0; k < model.num_layers; k++){
Matrix regularization = MLP<Model, Tuple>::lambda*model.u[k];
regularization.row(0).setZero(); // Do not update bias
- model.u[k] -= stepsize * (o[k] * delta[k].transpose() + regularization);
+ if (model.momentum > 0){
+ Matrix gradient = -stepsize * (o[k] * delta[k].transpose() + regularization);
+ model.updateVelocity(gradient, k);
+ model.updatePosition(gradient, k);
+ }
+ else {
+ // Updating model inline instead of using updatePosition since
+ // updatePosition creates a copy of the gradient making it slower.
--- End diff ---
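For context, here is a minimal sketch of the velocity/position update order that the `nesterovUpdatePosition()`, `updateVelocity()` and `updatePosition()` calls in the diff appear to follow. The struct, member names and signatures below are illustrative assumptions rather than the MADlib API, and `update` stands for the already negated, stepsize-scaled gradient that the diff accumulates in `total_gradient_per_layer[k]`:

```cpp
#include <Eigen/Dense>
#include <cstddef>
#include <vector>

using Matrix = Eigen::MatrixXd;

// Toy model holding per-layer weights and velocities (illustrative only).
struct ToyMLP {
    std::vector<Matrix> u;          // layer weight matrices
    std::vector<Matrix> velocity;   // one velocity matrix per layer
    double momentum = 0.9;
    bool is_nesterov = true;

    // NAG look-ahead: shift the weights by the momentum step before the
    // batch gradient is computed on them.
    void nesterovUpdatePosition() {
        if (momentum <= 0. || !is_nesterov) return;
        for (std::size_t k = 0; k < u.size(); ++k)
            u[k] += momentum * velocity[k];
    }

    // `update` is the already negated, stepsize-scaled gradient.
    void updateVelocity(const Matrix &update, std::size_t k) {
        velocity[k] = momentum * velocity[k] + update;
    }

    void updatePosition(const Matrix &update, std::size_t k) {
        if (momentum > 0. && is_nesterov)
            u[k] += update;        // look-ahead shift was already applied
        else if (momentum > 0.)
            u[k] += velocity[k];   // classic momentum step
        else
            u[k] += update;        // plain SGD step
    }
};
```

In this sketch the weights already sit at the look-ahead point when `feedForward()`/`backPropogate()` run, so adding the scaled gradient in `updatePosition()` lands on the standard Nesterov update.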
The inline-update comment in the last hunk is a little confusing: we pass the gradient by reference to `updatePosition()`, so it should not be copied there. If the underlying reason for updating the model inline instead of calling the function is something else, could you please update the comment to reflect that?
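For illustration only (the signature below is an assumed one, not the actual MADlib helper): if `updatePosition()` takes the gradient by const reference, the call itself copies nothing; the extra cost in the momentum branch would come from materializing the named `gradient` temporary before the call, which the inline `model.u[k] -= ...` expression avoids.

```cpp
#include <Eigen/Dense>

using Matrix = Eigen::MatrixXd;

// Assumed helper: const-reference parameter, so no copy at the call site.
void updatePosition(Matrix &u_k, const Matrix &gradient) {
    u_k += gradient;
}

// Helper path: one full-size temporary (`gradient`) is built, then applied.
void viaHelper(Matrix &u_k, const Matrix &outer, const Matrix &reg, double stepsize) {
    Matrix gradient = -stepsize * (outer + reg);
    updatePosition(u_k, gradient);
}

// Inline path: Eigen evaluates the expression coefficient-wise into u_k,
// without allocating a separate full-size gradient matrix first.
void inlinePath(Matrix &u_k, const Matrix &outer, const Matrix &reg, double stepsize) {
    u_k -= stepsize * (outer + reg);
}
```

If that temporary is the real cost, the code comment could say so explicitly instead of attributing a copy to `updatePosition()`.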
---