GitHub user njayaram2 commented on a diff in the pull request:

    https://github.com/apache/madlib/pull/272#discussion_r191537654
  
    --- Diff: src/modules/convex/task/mlp.hpp ---
    @@ -126,68 +157,84 @@ MLP<Model, Tuple>::getLossAndUpdateModel(
             const Matrix         &y_true_batch,
             const double         &stepsize) {
     
    -    uint16_t num_layers = static_cast<uint16_t>(model.u.size()); // assuming nu. of layers >= 1
    -    Index num_rows_in_batch = x_batch.rows();
         double total_loss = 0.;
    +    // model is updated with the momentum step (i.e. velocity vector)
    +    // if Nesterov Accelerated Gradient is enabled
    +    model.nesterovUpdatePosition();
     
    -    // gradient added over the batch
    -    std::vector<Matrix> total_gradient_per_layer(num_layers);
    -    for (Index k=0; k < num_layers; ++k)
    +    // initialize gradient vector
    +    std::vector<Matrix> total_gradient_per_layer(model.num_layers);
    +    for (Index k=0; k < model.num_layers; ++k) {
             total_gradient_per_layer[k] = Matrix::Zero(model.u[k].rows(),
                                                        model.u[k].cols());
    +    }
     
    +    std::vector<ColumnVector> net, o, delta;
    +    Index num_rows_in_batch = x_batch.rows();
         for (Index i=0; i < num_rows_in_batch; i++){
    +        // gradient and loss added over the batch
             ColumnVector x = x_batch.row(i);
             ColumnVector y_true = y_true_batch.row(i);
     
    -        std::vector<ColumnVector> net, o, delta;
             feedForward(model, x, net, o);
             backPropogate(y_true, o.back(), net, model, delta);
     
    -        for (Index k=0; k < num_layers; k++){
    +        // compute the gradient
    +        for (Index k=0; k < model.num_layers; k++){
                     total_gradient_per_layer[k] += o[k] * delta[k].transpose();
             }
     
    -        // loss computation
    -        ColumnVector y_estimated = o.back();
    -        if(model.is_classification){
    -            double clip = 1.e-10;
    -            y_estimated = y_estimated.cwiseMax(clip).cwiseMin(1.-clip);
    -            total_loss += - (y_true.array()*y_estimated.array().log()
    -                   + (-y_true.array()+1)*(-y_estimated.array()+1).log()).sum();
    -        }
    -        else{
    -            total_loss += 0.5 * (y_estimated - y_true).squaredNorm();
    -        }
    +        // compute the loss
    +        total_loss += getLoss(y_true, o.back(), model.is_classification);
         }
    -    for (Index k=0; k < num_layers; k++){
    +
    +    // convert gradient to a gradient update vector
    +    //  1. normalize to per row update
    +    //  2. discount by stepsize
    +    //  3. add regularization
    +    //  4. make negative
    +    for (Index k=0; k < model.num_layers; k++){
             Matrix regularization = MLP<Model, Tuple>::lambda * model.u[k];
             regularization.row(0).setZero(); // Do not update bias
    -        model.u[k] -=
    -            stepsize *
    -            (total_gradient_per_layer[k] / static_cast<double>(num_rows_in_batch) +
    -             regularization);
    +        total_gradient_per_layer[k] = -stepsize * (total_gradient_per_layer[k] / static_cast<double>(num_rows_in_batch) +
    +                                                  regularization);
    +        model.updateVelocity(total_gradient_per_layer[k], k);
    +        model.updatePosition(total_gradient_per_layer[k], k);
         }
    +
         return total_loss;
    +
     }
     
    +
     template <class Model, class Tuple>
     void
     MLP<Model, Tuple>::gradientInPlace(
             model_type                          &model,
             const independent_variables_type    &x,
             const dependent_variable_type       &y_true,
    -        const double                        &stepsize) {
    -    size_t N = model.u.size(); // assuming nu. of layers >= 1
    +        const double                        &stepsize)
    +{
    +    model.nesterovUpdatePosition();
    +
         std::vector<ColumnVector> net, o, delta;
     
         feedForward(model, x, net, o);
         backPropogate(y_true, o.back(), net, model, delta);
     
    -    for (size_t k=0; k < N; k++){
    +    for (Index k=0; k < model.num_layers; k++){
             Matrix regularization = MLP<Model, Tuple>::lambda*model.u[k];
             regularization.row(0).setZero(); // Do not update bias
    -        model.u[k] -= stepsize * (o[k] * delta[k].transpose() + regularization);
    +        if (model.momentum > 0){
    +            Matrix gradient = -stepsize * (o[k] * delta[k].transpose() + regularization);
    +            model.updateVelocity(gradient, k);
    +            model.updatePosition(gradient, k);
    +        }
    +        else {
    +            // Updating model inline instead of using updatePosition since
    +            // updatePosition creates a copy of the gradient making it slower.
    --- End diff ---
    
    This comment is a little confusing: we do pass the gradient to
    `updatePosition()` by reference, so the call itself should not copy it. If
    the underlying reason for updating the model inline instead of calling the
    function is something else, could you please update the comment to reflect
    that?
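
    For reference, here is a minimal sketch of the pass-by-reference point. The
    `ToyModel` struct, its members, and the `updatePosition()` signature below
    are illustrative assumptions, not the actual MADlib code:

    ```cpp
    // Illustrative sketch only -- not the MADlib model class. It shows that
    // taking the gradient by const reference avoids a copy at the call site;
    // any extra copy would have to come from inside updatePosition() itself.
    #include <cstddef>
    #include <vector>
    #include <Eigen/Dense>

    using Matrix = Eigen::MatrixXd;

    struct ToyModel {
        std::vector<Matrix> u;  // one weight matrix per layer

        // No copy of `gradient` is made here: it binds by const reference and
        // u[k] is updated in place by Eigen's operator+=.
        void updatePosition(const Matrix &gradient, std::size_t k) {
            u[k] += gradient;
        }
    };

    int main() {
        ToyModel model;
        model.u.push_back(Matrix::Zero(3, 2));

        const Matrix gradient = Matrix::Constant(3, 2, -0.1);

        // Calling the helper...
        model.updatePosition(gradient, 0);

        // ...performs the same in-place update as the "inline" form used in
        // gradientInPlace():
        model.u[0] += gradient;

        return 0;
    }
    ```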

