[ 
https://issues.apache.org/jira/browse/SINGA-249?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15522446#comment-15522446
 ] 

hacker99 edited comment on SINGA-249 at 9/26/16 8:29 AM:
---------------------------------------------------------

Thank you very much for the reply!

I just looked at the Caffe code for the BP algorithm, which follows
http://ufldl.stanford.edu/wiki/index.php/Backpropagation_Algorithm . The code at
lines #62-#63 shows that when computing the gradient for one layer, Caffe uses
the input of the current layer and the gradient of the next layer, but in SINGA
it looks to me as if the input of the current layer is combined with the
gradient of the final layer. Is there anything I am missing about BP in SINGA?
Looking forward to your reply.
 
(code in 
https://github.com/apache/incubator-singa/blob/master/src/model/layer/convolution.cc
162     CopyDataToFrom(&grad_b, grad, grad_b.Size(), 0, b * grad_b.Size());
163     dw += Mult(grad_b, col_data.T());
)
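
For reference, the weight-gradient formula on the UFLDL page above (in its
notation, where a^{(l)} is the activation of layer l and \delta^{(l+1)} is the
error term propagated back from the layer above) is

  \nabla_{W^{(l)}} J(W,b;x,y) = \delta^{(l+1)} (a^{(l)})^{T}

so by "gradient of the next layer" I mean this \delta^{(l+1)} term.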


https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cpp
 42 template <typename Dtype>
 43 void ConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 44       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 45   const Dtype* weight = this->blobs_[0]->cpu_data();
 46   Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff();
 47   for (int i = 0; i < top.size(); ++i) {
 48     const Dtype* top_diff = top[i]->cpu_diff();
 49     const Dtype* bottom_data = bottom[i]->cpu_data();
 50     Dtype* bottom_diff = bottom[i]->mutable_cpu_diff();
 51     // Bias gradient, if necessary.
 52     if (this->bias_term_ && this->param_propagate_down_[1]) {
 53       Dtype* bias_diff = this->blobs_[1]->mutable_cpu_diff();
 54       for (int n = 0; n < this->num_; ++n) {
 55         this->backward_cpu_bias(bias_diff, top_diff + n * this->top_dim_);
 56       }
 57     }
 58     if (this->param_propagate_down_[0] || propagate_down[i]) {
 59       for (int n = 0; n < this->num_; ++n) {
 60         // gradient w.r.t. weight. Note that we will accumulate diffs.
 61         if (this->param_propagate_down_[0]) {
 62           this->weight_cpu_gemm(bottom_data + n * this->bottom_dim_,
 63               top_diff + n * this->top_dim_, weight_diff);
 64         }
 65         // gradient w.r.t. bottom data, if necessary.
 66         if (propagate_down[i]) {
 67           this->backward_cpu_gemm(top_diff + n * this->top_dim_, weight,
 68               bottom_diff + n * this->bottom_dim_);
 69         }
 70       }
 71     }
 72   }
 73 }
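
To make sure I am describing the same computation, here is a minimal,
self-contained toy sketch of the im2col view of convolution (my own
illustration, not SINGA or Caffe code; one sample, one channel, one filter,
stride 1, no padding; the names x, xcol, dy, dw and all sizes are made up).
The weight gradient comes out as dW = dY * Xcol^T, which is how I read both
Caffe's weight_cpu_gemm call at lines #62-#63 and SINGA's
dw += Mult(grad_b, col_data.T()):

// Toy sketch of im2col-based convolution and its weight gradient.
#include <cstdio>
#include <vector>

int main() {
  const int H = 3, W = 3;       // input height/width
  const int K = 2;              // kernel size
  const int OH = H - K + 1;     // output height = 2
  const int OW = W - K + 1;     // output width  = 2

  // Input image x (H x W) and flattened filter w (K*K), row-major.
  std::vector<float> x = {1, 2, 3,
                          4, 5, 6,
                          7, 8, 9};
  std::vector<float> w = {1, 0,
                          0, -1};

  // im2col: every output position becomes one column of length K*K,
  // so Xcol has shape (K*K) x (OH*OW).
  std::vector<float> xcol(K * K * OH * OW);
  for (int oh = 0; oh < OH; ++oh)
    for (int ow = 0; ow < OW; ++ow)
      for (int kh = 0; kh < K; ++kh)
        for (int kw = 0; kw < K; ++kw) {
          int row = kh * K + kw;    // position inside the flattened kernel
          int col = oh * OW + ow;   // output position
          xcol[row * (OH * OW) + col] = x[(oh + kh) * W + (ow + kw)];
        }

  // Forward pass: y = w * Xcol, shape 1 x (OH*OW).
  std::vector<float> y(OH * OW, 0.f);
  for (int col = 0; col < OH * OW; ++col)
    for (int row = 0; row < K * K; ++row)
      y[col] += w[row] * xcol[row * (OH * OW) + col];

  // Pretend the next layer handed us dL/dy (Caffe's top_diff, SINGA's grad_b).
  std::vector<float> dy = {0.1f, 0.2f, 0.3f, 0.4f};

  // Weight gradient: dW = dY * Xcol^T, shape 1 x (K*K).
  // Per sample this corresponds to  dw += Mult(grad_b, col_data.T()).
  std::vector<float> dw(K * K, 0.f);
  for (int row = 0; row < K * K; ++row)
    for (int col = 0; col < OH * OW; ++col)
      dw[row] += dy[col] * xcol[row * (OH * OW) + col];

  for (int i = 0; i < OH * OW; ++i) printf("y[%d] = %g\n", i, y[i]);
  for (int i = 0; i < K * K; ++i) printf("dw[%d] = %g\n", i, dw[i]);
  return 0;
}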







> Convolution BP
> --------------
>
>                 Key: SINGA-249
>                 URL: https://issues.apache.org/jira/browse/SINGA-249
>             Project: Singa
>          Issue Type: Wish
>         Environment: ubuntu 14.04, singa 1.0
>            Reporter: hacker99
>
> I'm curious about how the gradient in the back-propagation algorithm is 
> calculated, e.g. for the Convolution layer. Can anyone explain the details of 
> the formula and of its implementation in the code? I would be very grateful 
> for any documents, or simply an explanation of why dw += Mult(grad_b, col_data.T())?
> #code from src/model/layer/convolution.cc
> const std::pair<Tensor, vector<Tensor>> Convolution::Backward(
>     int flag, const Tensor &grad) {
>   CHECK_EQ(grad.device()->lang(), kCpp);
>   CHECK_EQ(grad.nDim(), 4u);
>   CHECK(!buf_.empty());
>   Tensor src_data = buf_.top();
>   buf_.pop();
>   vector<Tensor> param_grad;
>   Tensor dx;
>   Tensor db, dw;
>   dx.ResetLike(src_data);
>   db.ResetLike(bias_);
>   dw.ResetLike(weight_);
>   dw.SetValue(0.0f);
>   size_t batchsize = grad.shape(0);
>   size_t imagesize = src_data.Size() / batchsize;
>   if (bias_term_) {
>     Tensor tmp1 =
>         Reshape(grad, Shape{batchsize * num_filters_,
>                             grad.Size() / (batchsize * num_filters_)});
>     Tensor tmp2(Shape{batchsize * num_filters_});
>     SumColumns(tmp1, &tmp2);
>     Tensor tmp3 = Reshape(tmp2, Shape{batchsize, num_filters_});
>     SumRows(tmp3, &db);
>   }
>   auto in_data = src_data.data<float>();
>   Tensor col_data(Shape{col_height_, col_width_});
>   float *data_col = new float[col_height_ * col_width_];
>   float *dx_b = new float[imagesize];
>   for (size_t b = 0; b < batchsize; b++) {
>     Im2col(in_data + b * imagesize, channels_, height_, width_, kernel_h_,
>            kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data_col);
>     col_data.CopyDataFromHostPtr(data_col, col_height_ * col_width_);
>     Tensor grad_b(Shape{num_filters_, conv_height_ * conv_width_});
>     CopyDataToFrom(&grad_b, grad, grad_b.Size(), 0, b * grad_b.Size());
>     dw += Mult(grad_b, col_data.T());
>     Tensor dcol_b = Mult(weight_.T(), grad_b);
>     auto dcol_data = dcol_b.data<float>();
>     Col2im(dcol_data, channels_, height_, width_, kernel_h_, kernel_w_, pad_h_,
>            pad_w_, stride_h_, stride_w_, dx_b);
>     dx.CopyDataFromHostPtr(dx_b, imagesize, b * imagesize);
>   }
>   param_grad.push_back(dw);
>   param_grad.push_back(db);
>   delete[] data_col;
>   delete[] dx_b;
>   return std::make_pair(dx, param_grad);
> }
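
As a side note on the shapes involved, assuming I am reading convolution.cc
correctly: grad_b is num_filters_ x (conv_height_ * conv_width_), col_data is
col_height_ x col_width_ with col_height_ = channels_ * kernel_h_ * kernel_w_
and col_width_ = conv_height_ * conv_width_, so Mult(grad_b, col_data.T()) has
shape num_filters_ x col_height_, which matches the shape of weight_ and dw.
That is why I believe the SINGA snippet and the Caffe lines above should be
computing the same per-sample weight gradient.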



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)
