http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/af1bf509/include/singa/neuralnet/output_layer/csv.h
----------------------------------------------------------------------
diff --git a/include/singa/neuralnet/output_layer/csv.h 
b/include/singa/neuralnet/output_layer/csv.h
deleted file mode 100644
index 439db8c..0000000
--- a/include/singa/neuralnet/output_layer/csv.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_NEURALNET_OUTPUT_LAYER_CSV_H_
-#define SINGA_NEURALNET_OUTPUT_LAYER_CSV_H_
-
-#include <vector>
-#include "singa/neuralnet/layer.h"
-#include "singa/io/store.h"
-
-namespace singa {
-/**
- * Output data (and label) for its source layer.
- */
-class CSVOutputLayer : public OutputLayer {
- public:
-  ~CSVOutputLayer() { delete store_; }
-  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) 
override;
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
-
- private:
-  int inst_ = 0;
-  io::Store* store_ = nullptr;
-};
-}  // namespace singa
-#endif  // SINGA_NEURALNET_OUTPUT_LAYER_CSV_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/af1bf509/include/singa/neuralnet/output_layer/record.h
----------------------------------------------------------------------
diff --git a/include/singa/neuralnet/output_layer/record.h 
b/include/singa/neuralnet/output_layer/record.h
deleted file mode 100644
index cd652d5..0000000
--- a/include/singa/neuralnet/output_layer/record.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_NEURALNET_OUTPUT_LAYER_RECORD_H_
-#define SINGA_NEURALNET_OUTPUT_LAYER_RECORD_H_
-
-#include <vector>
-#include "singa/neuralnet/layer.h"
-#include "singa/io/store.h"
-
-namespace singa {
-
-class RecordOutputLayer : public OutputLayer {
- public:
-  ~RecordOutputLayer() { delete store_; }
-  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) 
override;
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
-
- private:
-  int inst_ = 0;  //!< instance No.
-  io::Store* store_ = nullptr;
-};
-}  // namespace singa
-#endif  // SINGA_NEURALNET_OUTPUT_LAYER_RECORD_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/af1bf509/src/neuralnet/connection_layer/bridge.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/connection_layer/bridge.cc 
b/src/neuralnet/connection_layer/bridge.cc
index 1ad4b0c..200a3f9 100644
--- a/src/neuralnet/connection_layer/bridge.cc
+++ b/src/neuralnet/connection_layer/bridge.cc
@@ -19,6 +19,7 @@
 *
 *************************************************************/
 
+<<<<<<< HEAD
 #include "singa/neuralnet/connection_layer/bridge.h"
 #include "singa/comm/msg.h"
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/af1bf509/src/neuralnet/connection_layer/concate.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/connection_layer/concate.cc 
b/src/neuralnet/connection_layer/concate.cc
index f9d6416..13a7ea7 100644
--- a/src/neuralnet/connection_layer/concate.cc
+++ b/src/neuralnet/connection_layer/concate.cc
@@ -19,12 +19,10 @@
 *
 *************************************************************/
 
-#include "singa/neuralnet/connection_layer/concate.h"
+#include "singa/neuralnet/connection_layer.h"
 
 namespace singa {
 
-using std::vector;
-
 void ConcateLayer::Setup(const LayerProto& conf,
                          const vector<Layer*>& srclayers) {
   CHECK_GT(srclayers.size(), 1);

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/af1bf509/src/neuralnet/connection_layer/slice.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/connection_layer/slice.cc 
b/src/neuralnet/connection_layer/slice.cc
index c69f797..a607a95 100644
--- a/src/neuralnet/connection_layer/slice.cc
+++ b/src/neuralnet/connection_layer/slice.cc
@@ -19,7 +19,7 @@
 *
 *************************************************************/
 
-#include "singa/neuralnet/connection_layer/slice.h"
+#include "singa/neuralnet/connection_layer.h"
 
 namespace singa {
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/af1bf509/src/neuralnet/connection_layer/split.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/connection_layer/split.cc 
b/src/neuralnet/connection_layer/split.cc
index 36b391c..7ee24fa 100644
--- a/src/neuralnet/connection_layer/split.cc
+++ b/src/neuralnet/connection_layer/split.cc
@@ -19,7 +19,7 @@
 *
 *************************************************************/
 
-#include "singa/neuralnet/connection_layer/split.h"
+#include "singa/neuralnet/connection_layer.h"
 
 namespace singa {
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/af1bf509/src/neuralnet/input_layer/csv.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/input_layer/csv.cc b/src/neuralnet/input_layer/csv.cc
index 297d05f..53cabff 100644
--- a/src/neuralnet/input_layer/csv.cc
+++ b/src/neuralnet/input_layer/csv.cc
@@ -19,7 +19,7 @@
 *
 *************************************************************/
 
-#include "singa/neuralnet/input_layer/csv.h"
+#include "singa/neuralnet/input_layer.h"
 #include "singa/utils/tokenizer.h"
 
 namespace singa {

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/af1bf509/src/neuralnet/input_layer/deprecated.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/input_layer/deprecated.cc 
b/src/neuralnet/input_layer/deprecated.cc
index b0baf1d..1760d4b 100644
--- a/src/neuralnet/input_layer/deprecated.cc
+++ b/src/neuralnet/input_layer/deprecated.cc
@@ -19,7 +19,7 @@
 *
 *************************************************************/
 
-#include "singa/neuralnet/input_layer/deprecated.h"
+#include "singa/neuralnet/input_layer.h"
 #include "mshadow/tensor.h"
 namespace singa {
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/af1bf509/src/neuralnet/input_layer/image_preprocess.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/input_layer/image_preprocess.cc 
b/src/neuralnet/input_layer/image_preprocess.cc
index 110605b..576b096 100644
--- a/src/neuralnet/input_layer/image_preprocess.cc
+++ b/src/neuralnet/input_layer/image_preprocess.cc
@@ -19,7 +19,7 @@
 *
 *************************************************************/
 
-#include "singa/neuralnet/input_layer/image_preprocess.h"
+#include "singa/neuralnet/input_layer.h"
 #include "singa/utils/image_transform.h"
 namespace singa {
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/af1bf509/src/neuralnet/input_layer/prefetch.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/input_layer/prefetch.cc 
b/src/neuralnet/input_layer/prefetch.cc
index 1348a37..9c7f2d9 100644
--- a/src/neuralnet/input_layer/prefetch.cc
+++ b/src/neuralnet/input_layer/prefetch.cc
@@ -19,7 +19,7 @@
 *
 *************************************************************/
 
-#include "singa/neuralnet/input_layer/prefetch.h"
+#include "singa/neuralnet/input_layer.h"
 namespace singa {
 
 using std::vector;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/af1bf509/src/neuralnet/input_layer/record.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/input_layer/record.cc 
b/src/neuralnet/input_layer/record.cc
index 1983e7b..2fb71c6 100644
--- a/src/neuralnet/input_layer/record.cc
+++ b/src/neuralnet/input_layer/record.cc
@@ -19,7 +19,7 @@
 *
 *************************************************************/
 
-#include "singa/neuralnet/input_layer/record.h"
+#include "singa/neuralnet/input_layer.h"
 namespace singa {
 
 using std::string;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/af1bf509/src/neuralnet/input_layer/store.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/input_layer/store.cc 
b/src/neuralnet/input_layer/store.cc
index 0355c13..b1b296e 100644
--- a/src/neuralnet/input_layer/store.cc
+++ b/src/neuralnet/input_layer/store.cc
@@ -19,7 +19,7 @@
 *
 *************************************************************/
 
-#include "singa/neuralnet/input_layer/store.h"
+#include "singa/neuralnet/input_layer.h"
 namespace singa {
 
 StoreInputLayer::~StoreInputLayer() {

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/af1bf509/src/neuralnet/loss_layer/euclidean.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/loss_layer/euclidean.cc 
b/src/neuralnet/loss_layer/euclidean.cc
index daab15c..71e5bae 100644
--- a/src/neuralnet/loss_layer/euclidean.cc
+++ b/src/neuralnet/loss_layer/euclidean.cc
@@ -19,7 +19,7 @@
 *
 *************************************************************/
 
-#include "singa/neuralnet/loss_layer/euclidean.h"
+#include "singa/neuralnet/loss_layer.h"
 
 #include <glog/logging.h>
 #include "mshadow/tensor.h"

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/af1bf509/src/neuralnet/loss_layer/softmax.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/loss_layer/softmax.cc 
b/src/neuralnet/loss_layer/softmax.cc
index 497c287..8c100ef 100644
--- a/src/neuralnet/loss_layer/softmax.cc
+++ b/src/neuralnet/loss_layer/softmax.cc
@@ -19,7 +19,7 @@
 *
 *************************************************************/
 
-#include "singa/neuralnet/loss_layer/softmax.h"
+#include "singa/neuralnet/loss_layer.h"
 
 #include <algorithm>
 #include <glog/logging.h>

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/af1bf509/src/neuralnet/neuron_layer/argsort.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuron_layer/argsort.cc 
b/src/neuralnet/neuron_layer/argsort.cc
index d0e7f36..d1775c0 100644
--- a/src/neuralnet/neuron_layer/argsort.cc
+++ b/src/neuralnet/neuron_layer/argsort.cc
@@ -19,7 +19,7 @@
 *
 *************************************************************/
 
-#include "singa/neuralnet/neuron_layer/argsort.h"
+#include "singa/neuralnet/output_layer.h"
 #include <algorithm>
 
 namespace singa {

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/af1bf509/src/neuralnet/neuron_layer/convolution.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuron_layer/convolution.cc 
b/src/neuralnet/neuron_layer/convolution.cc
index fa74dcf..4247bee 100644
--- a/src/neuralnet/neuron_layer/convolution.cc
+++ b/src/neuralnet/neuron_layer/convolution.cc
@@ -19,7 +19,7 @@
 *
 *************************************************************/
 
-#include "singa/neuralnet/neuron_layer/convolution.h"
+#include "singa/neuralnet/neuron_layer.h"
 
 #include <glog/logging.h>
 #include "singa/utils/singleton.h"
@@ -37,14 +37,40 @@ void ConvolutionLayer::Setup(const LayerProto& conf,
   CHECK_EQ(srclayers.size(), 1);
   Layer::Setup(conf, srclayers);
   ConvolutionProto conv_conf = conf.convolution_conf();
-  kernel_ = conv_conf.kernel();
-  CHECK_GT(kernel_, 0) << "Filter size cannot be zero.";
-  pad_ = conv_conf.pad();
-  stride_ = conv_conf.stride();
+  if (conv_conf.has_kernel()) {
+    kernel_x_ = kernel_y_ = conv_conf.kernel();
+  } else {
+    CHECK(conv_conf.has_kernel_x());
+    CHECK(conv_conf.has_kernel_y());
+    kernel_x_ = conv_conf.kernel_x();
+    kernel_y_ = conv_conf.kernel_y();
+  }
+
+  if (conv_conf.has_pad()) {
+    pad_x_ = pad_y_ = conv_conf.pad();
+  } else {
+    CHECK(conv_conf.has_pad_x());
+    CHECK(conv_conf.has_pad_y());
+    pad_x_ = conv_conf.pad_x();
+    pad_y_ = conv_conf.pad_y();
+  }
+
+  if (conv_conf.has_stride()) {
+    stride_x_ = stride_y_ = conv_conf.stride();
+  } else {
+    CHECK(conv_conf.has_stride_x());
+    CHECK(conv_conf.has_stride_y());
+    stride_x_ = conv_conf.stride_x();
+    stride_y_ = conv_conf.stride_y();
+  }
+
   num_filters_ = conv_conf.num_filters();
+  // partition filters
   if (partition_dim() > 0)
     num_filters_ /= srclayers.at(0)->num_partitions();
+
   const vector<int>& srcshape = srclayers[0]->data(this).shape();
+  batchsize_ = srcshape[0];
   int dim = srcshape.size();
   CHECK_GT(dim, 2);
   width_ = srcshape[dim - 1];
@@ -53,10 +79,10 @@ void ConvolutionLayer::Setup(const LayerProto& conf,
     channels_ = srcshape[dim - 3];
   else if (dim > 2)
     channels_ = 1;
-  batchsize_ = srcshape[0];
-  conv_height_ = (height_ + 2 * pad_ - kernel_) / stride_ + 1;
-  conv_width_ = (width_ + 2 * pad_ - kernel_) / stride_ + 1;
-  col_height_ = channels_ * kernel_ * kernel_;
+
+  conv_height_ = (height_ + 2 * pad_y_ - kernel_y_) / stride_y_ + 1;
+  conv_width_ = (width_ + 2 * pad_x_ - kernel_x_) / stride_x_ + 1;
+  col_height_ = channels_ * kernel_x_ * kernel_y_;
   col_width_ = conv_height_ * conv_width_;
   vector<int> shape{batchsize_, num_filters_, conv_height_, conv_width_};
   data_.Reshape(shape);
@@ -64,11 +90,14 @@ void ConvolutionLayer::Setup(const LayerProto& conf,
   col_data_.Reshape(vector<int>{col_height_, col_width_});
   col_grad_.Reshape(vector<int>{col_height_, col_width_});
   weight_ = Param::Create(conf.param(0));
-  bias_ = Param::Create(conf.param(1));
   weight_->Setup(vector<int>{num_filters_, col_height_});
-  bias_->Setup(vector<int>{num_filters_});
+  if (conf.param_size() > 1) {
+    bias_ = Param::Create(conf.param(1));
+    bias_->Setup(vector<int>{num_filters_});
+  }
 }
 
+// TODO(wangwei) remove mshadow's functions
 void ConvolutionLayer::ComputeFeature(int flag,
     const vector<Layer*>& srclayers) {
   auto src = Tensor4(srclayers[0]->mutable_data(this));
@@ -78,9 +107,9 @@ void ConvolutionLayer::ComputeFeature(int flag,
   auto bias = Tensor1(bias_->mutable_data());
   for (int n = 0; n < batchsize_; n++) {
     if (pad_ > 0)
-      col = expr::unpack_patch2col(pad(src[n], pad_), kernel_, stride_);
+      col = expr::unpack_patch2col(pad(src[n], pad_x_), kernel_x_, stride_x_);
     else
-      col = expr::unpack_patch2col(src[n], kernel_, stride_);
+      col = expr::unpack_patch2col(src[n], kernel_x_, stride_x_);
     data[n] = dot(weight, col);
   }
   data += expr::broadcast<1>(bias, data.shape);
@@ -107,13 +136,13 @@ void ConvolutionLayer::ComputeGradient(int flag,
   Shape<2> imgshp = Shape2(height_, width_);
   for (int n = 0; n < batchsize_; n++) {
     if (pad_ > 0)
-      col = expr::unpack_patch2col(pad(src[n], pad_), kernel_, stride_);
+      col = expr::unpack_patch2col(pad(src[n], pad_x_), kernel_x_, stride_x_);
     else
-      col = expr::unpack_patch2col(src[n], kernel_, stride_);
+      col = expr::unpack_patch2col(src[n], kernel_x_, stride_x_);
     gweight += dot(grad[n], col.T());
     if (gsrcblob != nullptr) {
       gcol = dot(weight.T(), grad[n]);
-      gsrc[n] = crop(expr::pack_col2patch(gcol, padshp, kernel_, stride_),
+      gsrc[n] = crop(expr::pack_col2patch(gcol, padshp, kernel_x_, stride_x_),
           imgshp);
     }
   }
@@ -130,7 +159,7 @@ void CConvolutionLayer::ComputeFeature(int flag,
 
   for (int n = 0; n < batchsize_; n++) {
     Im2col(src[n].dptr, channels_, height_, width_,
-        kernel_, kernel_, pad_, pad_, stride_, stride_, col.dptr);
+        kernel_x_, kernel_y_, pad_x_, pad_y_, stride_x_, stride_y_, col.dptr);
     data[n] = dot(weight, col);
   }
   data += expr::broadcast<1>(bias, data.shape);
@@ -154,12 +183,13 @@ void CConvolutionLayer::ComputeGradient(int flag,
   gbias = expr::sumall_except_dim<1>(grad);
   for (int n = 0; n < batchsize_; n++) {
     Im2col(src[n].dptr, channels_, height_, width_,
-        kernel_, kernel_, pad_, pad_, stride_, stride_, col.dptr);
+        kernel_x_, kernel_y_, pad_x_, pad_y_, stride_x_, stride_y_, col.dptr);
     gweight += dot(grad[n], col.T());
     if (gsrcblob != nullptr) {
       gcol = dot(weight.T(), grad[n]);
       Col2im(gcol.dptr, channels_, height_, width_,
-          kernel_, kernel_, pad_, pad_, stride_, stride_, gsrc[n].dptr);
+          kernel_x_, kernel_y_, pad_x_, pad_y_, stride_x_, stride_y_,
+          gsrc[n].dptr);
     }
   }
 }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/af1bf509/src/neuralnet/neuron_layer/cudnn_activation.cu
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuron_layer/cudnn_activation.cu 
b/src/neuralnet/neuron_layer/cudnn_activation.cu
new file mode 100644
index 0000000..f77a8a8
--- /dev/null
+++ b/src/neuralnet/neuron_layer/cudnn_activation.cu
@@ -0,0 +1,100 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "singa/neuralnet/neuron_layer.h"
+
+namespace singa {
+
+void CudnnActivationLayer::InitCudnn() {
+  CudnnLayer::InitCudnn();
+
+  // TODO(wangwei) make the mode case insensitive
+  if (layer_conf_.activation_conf().mode() == "sigmoid")
+    mode_ = CUDNN_ACTIVATION_SIGMOID;
+  else if (layer_conf_.activation_conf().mode() == "tanh")
+    mode_ = CUDNN_ACTIVATION_TANH;
+  else if (layer_conf_.activation_conf().mode() == "relu")
+    mode_ = CUDNN_ACTIVATION_RELU;
+  else {
+    LOG(FATAL) << "Unkown activation: " << 
layer_conf_.activation_conf().mode();
+  }
+
+  const auto& shape = data_.shape();
+  CHECK_GT(shape.size(), 0);
+  // size of each dimension
+  int* sdim= new int[shape.size()];
+  int* stride = new int[shape.size()];
+  stride[shape.size() -1] = 1;
+  int i = shape.size() - 1;
+  sdim[i] = shape[i];
+  stride[i] = 1;
+  for (--i; i >= 0; i--) {
+    sdim[i] = shape[i];
+    stride[i] = shape[i + 1] * stride[i + 1];
+  }
+  CHECK_CUDNN(cudnnSetTensorNdDescriptor(src_desc_,
+        CUDNN_DATA_FLOAT,
+        shape.size(),
+        sdim,
+        stride));
+  CHECK_CUDNN(cudnnSetTensorNdDescriptor(my_desc_,
+        CUDNN_DATA_FLOAT,
+        shape.size(),
+        sdim,
+        stride));
+  delete[] sdim;
+  delete[] stride;
+}
+
+void CudnnActivationLayer::ComputeFeature(int flag,
+    const vector<Layer*>& srclayers) {
+  if (!has_init_cudnn_)
+    InitCudnn();
+  float alpha = 1.0f, beta = 0.0f;
+  // currently only consider single src layer
+  CHECK_EQ(srclayers.size(), 1);
+  CHECK_CUDNN(cudnnActivationForward(handle_,
+        mode_,
+        &alpha,
+        src_desc_,
+        srclayers[0]->data(this).gpu_data(),
+        &beta,
+        my_desc_,
+        data_.mutable_gpu_data()));
+}
+
+void CudnnActivationLayer::ComputeGradient(int flag,
+    const vector<Layer*>& srclayers) {
+  float alpha = 1.0f, beta = 0.0f;
+  CHECK_CUDNN(cudnnActivationBackward(handle_,
+        mode_,
+        &alpha,
+        my_desc_,
+        data_.gpu_data(),
+        my_desc_,
+        grad_.gpu_data(),
+        src_desc_,
+        srclayers[0]->data(this).gpu_data(),
+        &beta,
+        src_desc_,
+        srclayers[0].mutable_grad(this)->mutable_gpu_data()));
+}
+}  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/af1bf509/src/neuralnet/neuron_layer/cudnn_convolution.cu
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuron_layer/cudnn_convolution.cu 
b/src/neuralnet/neuron_layer/cudnn_convolution.cu
new file mode 100644
index 0000000..debe4c3
--- /dev/null
+++ b/src/neuralnet/neuron_layer/cudnn_convolution.cu
@@ -0,0 +1,205 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "singa/neuralnet/neuron_layer.h"
+
+namespace singa {
+
+CudnnConvLayer::~CudnnConvLayer() {
+  if (has_init_cudnn_) {
+    CHECK_CUDNN(cudnnDestroyTensorDescriptor(bias_desc_));
+    CHECK_CUDNN(cudnnDestroyFilterDescriptor(filter_desc_));
+    CHECK_CUDNN(cudnnDestroyConvolutionDescriptor(conv_desc_));
+  }
+}
+
+void CudnnConvLayer::InitCudnn() {
+  CudnnLayer::InitCudnn();
+  // convert MB to bytes
+  workspace_byte_limit_ = 
layer_conf_.convolution_conf().workspace_byte_limit() << 20;
+
+  CHECK_CUDNN(cudnnCreateTensorDescriptor(&bias_desc_));
+  CHECK_CUDNN(cudnnCreateFilterDescriptor(&filter_desc_));
+  CHECK_CUDNN(cudnnCreateConvolutionDescriptor(&conv_desc_));
+
+  CHECK_CUDNN(cudnnSetConvolution2dDescriptor(conv_desc_,
+        pad_y_,
+        pad_x_,
+        stride_y_,
+        stride_x_,
+        1,
+        1,
+        CUDNN_CROSS_CORRELATION));
+  CHECK_CUDNN(cudnnSetFilter4dDescriptor(filter_desc_,
+        CUDNN_DATA_FLOAT,
+        num_filters_,
+        channels_,
+        kernel_y_,
+        kernel_x_));
+  if (bias_) {
+    CHECK_CUDNN(cudnnSetTensor4dDescriptor(bias_desc_,
+          CUDNN_TENSOR_NCHW,
+          CUDNN_DATA_FLOAT,
+          1,
+          num_filters_,
+          1,
+          1));
+  }
+  CHECK_CUDNN(cudnnSetTensor4dDescriptor(src_desc_,
+        CUDNN_TENSOR_NCHW,
+        CUDNN_DATA_FLOAT,
+        batchsize_,
+        channels_,
+        height_,
+        width_));
+  CHECK_CUDNN(cudnnSetTensor4dDescriptor(data_desc_,
+        CUDNN_TENSOR_NCHW,
+        CUDNN_DATA_FLOAT,
+        batchsize_,
+        num_filters_,
+        conv_height_,
+        conv_width_));
+
+  CHECK_CUDNN(cudnnGetConvolutionForwardAlgorithm(handle_,
+        src_desc_,
+        filter_desc_,
+        conv_desc_,
+        my_desc_,
+        CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
+        workspace_byte_limit_,
+        &fp_alg_));
+
+  CHECK_CUDNN(cudnnGetConvolutionBackwardFilterAlgorithm(handle_,
+        src_desc_,
+        my_desc_,
+        conv_desc_,
+        filter_desc_,
+        CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST,
+        workspace_byte_limit_,
+        &bp_filter_alg_));
+  CHECK_CUDNN(cudnnGetConvolutionBackwardDataAlgorithm(handle_,
+        filter_desc_,
+        my_desc_,
+        conv_desc_,
+        src_desc_,
+        CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST,
+        workspace_byte_limit_,
+        &bp_data_alg_));
+
+  size_t fp_byte, bp_data_byte, bp_filter_byte;
+  CHECK_CUDNN(cudnnGetConvolutionForwardWorkspaceSize(handle_,
+        src_desc_,
+        filter_desc_,
+        conv_desc_,
+        my_desc_,
+        fp_alg_,
+        &fp_byte));
+  CHECK_CUDNN(cudnnGetConvolutionBackwardDataWorkspaceSize(handle_,
+        filter_desc_,
+        my_desc_,
+        conv_desc_,
+        src_desc_,
+        bp_data_alg_,
+        &bp_data_byte));
+  CHECK_CUDNN(cudnnGetConvolutionBackwardFilterWorkspaceSize(handle_,
+        src_desc_,
+        my_desc_,
+        conv_desc_,
+        filter_desc_,
+        bp_filter_alg_,
+        &bp_filter_byte));
+  workspace_count_ = std::max(std::max(fp_byte, bp_data_byte), bp_filter_byte)
+    / sizeof(float) + 1;
+}
+
+void CudnnConvLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) 
{
+  if (!has_init_cudnn_)
+    InitCudnn();
+  float alpha = 1.f, beta = 0.f;
+  Blob<float> workspace(vector<int>{workspace_count_});
+  CUDNN_CHECK(cudnnConvolutionForward(handle_,
+        &alpha,
+        src_desc_,
+        srclayers[0]->data(this).gpu_data(),
+        filter_desc_,
+        weight_->data().gpu_data(),
+        conv_desc_,
+        fp_alg_,
+        workspace.mutable_gpu_data(),
+        workspace_count_ * sizeof(float),
+        &beta,
+        data_desc_,
+        data_.mutable_gpu_data()));
+
+  if (bias_) {
+    beta = 1.f;
+    CUDNN_CHECK(cudnnAddTensor(handle_,
+          CUDNN_ADD_SAME_C,
+          &alpha,
+          bias_desc_,
+          bias_->data().gpu_data(),
+          &beta,
+          data_desc_,
+          data_.mutable_gpu_data()));
+  }
+}
+
+void CudnnConvLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers)
+{
+  float alpha = 1.f, beta = 0.f;
+  Blob<float> workspace(vector<int>{workspace_count_});
+  if (bias_) {
+    CHECK_CUDNN(cudnnConvolutionBackwardBias(handle_,
+          &alpha,
+          data_desc_,
+          grad_.gpu_data(),
+          &beta,
+          bias_desc_,
+          bias_->mutable_grad()->mutable_gpu_data()));
+  }
+  CHECK_CUDNN(cudnnConvolutionBackwardFilter_v3(handle_,
+        &alpha,
+        src_desc_,
+        srclayers[0]->data(this).gpu_data(),
+        my_desc_,
+        grad_.gpu_data(),
+        conv_desc_,
+        bp_filter_alg_,
+        workspace.mutable_gpu_data(),
+        workspace_count_ * sizeof(float),
+        &beta,
+        filter_desc_,
+        weight_->mutable_grad()->mutable_gpu_data()));
+  CHECK_CUDNN(cudnnConvolutionBackwardData_v3(handle_,
+        &alpha,
+        filter_desc_,
+        weight_->data().gpu_data(),
+        my_desc_,
+        grad_.gpu_data(),
+        conv_desc_,
+        bp_data_alg_,
+        workspace.mutable_gpu_data(),
+        workspace_count_ * sizeof(float),
+        &beta,
+        src_desc_,
+        srclayers[0]->mutable_grad(this)->mutable_gpu_data()));
+}
+}  /* singa */

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/af1bf509/src/neuralnet/neuron_layer/cudnn_lrn.cu
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuron_layer/cudnn_lrn.cu 
b/src/neuralnet/neuron_layer/cudnn_lrn.cu
new file mode 100644
index 0000000..7cb111a
--- /dev/null
+++ b/src/neuralnet/neuron_layer/cudnn_lrn.cu
@@ -0,0 +1,95 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "singa/neuralnet/neuron_layer.h"
+
+namespace singa {
+CudnnLRNLayer::~CudnnLRNLayer() {
+  if (!init_cudnn_) {
+    cudnnDestroyLRNDescriptor(norm_desc_);
+  }
+}
+
+void CudnnLRNLayer::Setup(const LayerProto& proto,
+    const vector<Layer*>& srclayers) {
+  LRNLayer::Setup(proto, srclayers);
+  mode_ = CUDNN_LRN_CROSS_CHANNEL_DIM1;
+}
+
+void CudnnLRNLayer::InitCudnn() {
+  CudnnLayer::InitCudnn();
+  CHECK_EQ(cudnnCreateLRNDescriptor(&norm_desc_), CUDNN_STATUS_SUCCESS);
+  CHECK_EQ(cudnnSetLRNDescriptor(norm_desc_,
+        lsize_,
+        alpha_,
+        beta_,
+        knorm_), CUDNN_STATUS_SUCCESS);
+  CHECK_EQ(cudnnCreateTensorDescriptor(&src_desc_), CUDNN_STATUS_SUCCESS);
+  CHECK_EQ(cudnnSetTensor4dDescriptor(src_desc_,
+      CUDNN_TENSOR_NCHW,
+      CUDNN_DATA_FLOAT,
+      batchsize_,
+      channels_,
+      height_,
+      width_), CUDNN_STATUS_SUCCESS);
+  CHECK_EQ(cudnnCreateTensorDescriptor(&my_desc_), CUDNN_STATUS_SUCCESS);
+  CHECK_EQ(cudnnSetTensor4dDescriptor(my_desc_,
+      CUDNN_TENSOR_NCHW,
+      CUDNN_DATA_FLOAT,
+      batchsize_,
+      channels_,
+      height_,
+      width_), CUDNN_STATUS_SUCCESS);
+}
+void CudnnLRNLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
+  if (init_cudnn_) {
+    InitCudnn();
+    init_cudnn_ = false;
+  }
+  CHECK_EQ(cudnnLRNCrossChannelForward(handle_,
+      norm_desc_,
+      mode_,
+      &alpha,
+      src_desc_,
+      srclayers[0]->data(this).gpu_data(),
+      &beta,
+      my_desc_,
+      data_.mutable_gpu_data()), CUDNN_STATUS_SUCCESS);
+}
+void CudnnLRNLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
+  CHECK_EQ(cudnnLRNCrossChannelBackward(handle_,
+        norm_desc_,
+        mode_,
+        &alpha,
+        my_desc_, // ???
+        data_.gpu_data(),
+        my_desc_,
+        grad_.gpu_data(),
+        src_desc_,
+        srclayers[0]->data(this).gpu_data(),
+        &beta,
+        src_desc_,
+        srclayers[0]->mutable_grad(this)->mutable_gpu_data()),
+      CUDNN_STATUS_SUCCESS);
+}
+
+
+} /* singa */

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/af1bf509/src/neuralnet/neuron_layer/cudnn_pooling.cu
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuron_layer/cudnn_pooling.cu 
b/src/neuralnet/neuron_layer/cudnn_pooling.cu
new file mode 100644
index 0000000..619998f
--- /dev/null
+++ b/src/neuralnet/neuron_layer/cudnn_pooling.cu
@@ -0,0 +1,95 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "singa/neuralnet/neuron_layer.h"
+
+namespace singa {
+
+// Release the cuDNN pooling descriptor; skip if InitCudnn() never ran,
+// since pool_desc_ was never created in that case.
+CuDNNPoolLayer::~CuDNNPoolLayer() {
+  if (has_init_cudnn_) {
+    CHECK_EQ(cudnnDestroyPoolingDescriptor(pool_desc_), CUDNN_STATUS_SUCCESS);
+  }
+}
+
+// Configure the cuDNN descriptors for pooling: src_desc_ describes the
+// input tensor, my_desc_ the pooled output, and pool_desc_ the pooling
+// window / padding / stride.
+void CuDNNPoolLayer::InitCudnn() {
+  // Base-class setup; presumably creates handle_ and the tensor
+  // descriptors -- confirm against CudnnLayer::InitCudnn().
+  CudnnLayer::InitCudnn();
+  CHECK_CUDNN(cudnnCreatePoolingDescriptor(&pool_desc_));
+  // Input: NCHW float, batchsize_ x channels_ x height_ x width_.
+  CHECK_CUDNN(cudnnSetTensor4dDescriptor(src_desc_,
+        CUDNN_TENSOR_NCHW,
+        CUDNN_DATA_FLOAT,
+        batchsize_,
+        channels_,
+        height_,
+        width_));
+  // Output: same batch and channel counts, pooled spatial dimensions.
+  CHECK_CUDNN(cudnnSetTensor4dDescriptor(my_desc_,
+        CUDNN_TENSOR_NCHW,
+        CUDNN_DATA_FLOAT,
+        batchsize_,
+        channels_,
+        pooled_height_,
+        pooled_width_));
+  // Default to average pooling that excludes padded cells from the
+  // divisor; switch to max pooling when the proto requests it.
+  auto pool_method = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
+  if (pool_ == PoolingProto_PoolMethod_MAX)
+    pool_method = CUDNN_POOLING_MAX;
+  CHECK_CUDNN(cudnnSetPooling2dDescriptor(pool_desc_,
+        pool_method,
+        kernel_y_,
+        kernel_x_,
+        pad_y_,
+        pad_x_,
+        stride_y_,
+        stride_x_));
+}
+
+// Forward pooling on the GPU: data_ = pool(src).
+void CuDNNPoolLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers)
+{
+  if (!has_init_cudnn_)
+    InitCudnn();
+  float alpha = 1.0f, beta = 0.0f;
+  // currently only consider single src layer
+  CHECK_EQ(srclayers.size(), 1);
+  CHECK_CUDNN(cudnnPoolingForward(handle_,
+        pool_desc_,
+        &alpha,
+        src_desc_,
+        srclayers[0]->data(this).gpu_data(),
+        &beta,
+        // fixed: was data_desc_, which InitCudnn() never configures; the
+        // output descriptor set up there is my_desc_.
+        my_desc_,
+        data_.mutable_gpu_data()));
+}
+
+// Backward pooling: src grad = d pool / d src, scaled by grad_.
+// cudnnPoolingBackward(handle, poolDesc, alpha, yDesc, y, dyDesc, dy,
+//     xDesc, x, beta, dxDesc, dx)
+void CuDNNPoolLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers)
+{
+  float alpha = 1.0f, beta = 0.0f;
+  CHECK_CUDNN(cudnnPoolingBackward(handle_,
+        pool_desc_,
+        &alpha,
+        my_desc_,
+        data_.gpu_data(),   // y: this layer's output
+        my_desc_,
+        grad_.gpu_data(),   // dy: gradient w.r.t. output
+        src_desc_,
+        srclayers[0]->data(this).gpu_data(),   // x: the input
+        &beta,
+        src_desc_,
+        // fixed: mutable_grad() returns a pointer (the LRN layer in this
+        // commit uses '->' for the identical call), so '.' was a typo.
+        srclayers[0]->mutable_grad(this)->mutable_gpu_data()));
+}
+}  /* singa */
+

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/af1bf509/src/neuralnet/neuron_layer/cudnn_softmax.cu
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuron_layer/cudnn_softmax.cu 
b/src/neuralnet/neuron_layer/cudnn_softmax.cu
new file mode 100644
index 0000000..f7e8abe
--- /dev/null
+++ b/src/neuralnet/neuron_layer/cudnn_softmax.cu
@@ -0,0 +1,75 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "singa/neuralnet/neuron_layer.h"
+
+namespace singa {
+
+// Configure the input/output tensor descriptors for softmax.  The tensor
+// is laid out as batchsize x num_softmax x count, so softmax over the
+// channel dimension normalizes each instance's groups independently.
+void CudnnSoftmaxLayer::InitCudnn() {
+  CudnnLayer::InitCudnn();
+  CHECK_CUDNN(cudnnSetTensor4dDescriptor(src_desc_,
+        CUDNN_TENSOR_NCHW,
+        CUDNN_DATA_FLOAT,
+        batchsize_,
+        num_softmax_per_instance_,
+        count_per_softmax_,
+        1));
+  // fixed: was CHECK_EQ with a single argument (missing the expected
+  // CUDNN_STATUS_SUCCESS); use CHECK_CUDNN like the call above.
+  CHECK_CUDNN(cudnnSetTensor4dDescriptor(my_desc_,
+        CUDNN_TENSOR_NCHW,
+        CUDNN_DATA_FLOAT,
+        batchsize_,
+        num_softmax_per_instance_,
+        count_per_softmax_,
+        1));
+}
+
+// Forward softmax over the channel dimension: data_ = softmax(src).
+void CudnnSoftmaxLayer::ComputeFeature(int flag,
+    const vector<Layer*>& srclayers) {
+  if (!has_init_cudnn_)
+    InitCudnn();
+  float alpha = 1.0f, beta = 0.0f;
+  // fixed: the cuDNN API symbol is cudnnSoftmaxForward (lower-case 'c');
+  // CudnnSoftmaxForward does not exist.
+  CHECK_CUDNN(cudnnSoftmaxForward(handle_,
+        CUDNN_SOFTMAX_ACCURATE,
+        CUDNN_SOFTMAX_MODE_CHANNEL,
+        &alpha,
+        src_desc_,
+        srclayers[0]->data(this).gpu_data(),
+        &beta,
+        my_desc_,
+        data_.mutable_gpu_data()));
+}
+
+// Backward softmax: src grad from (y, dy).
+// cudnnSoftmaxBackward(handle, algo, mode, alpha, yDesc, y, dyDesc, dy,
+//     beta, dxDesc, dx)
+void CudnnSoftmaxLayer::ComputeGradient(int flag,
+    const vector<Layer*>& srclayers) {
+  float alpha = 1.f, beta = 0.f;
+  // fixed: the original called (misspelled) *Forward* but passed the
+  // 11-argument *Backward* parameter list; the backward pass needs
+  // cudnnSoftmaxBackward.
+  CHECK_CUDNN(cudnnSoftmaxBackward(handle_,
+        CUDNN_SOFTMAX_ACCURATE,
+        CUDNN_SOFTMAX_MODE_CHANNEL,
+        &alpha,
+        my_desc_,
+        data_.gpu_data(),   // y: the softmax output
+        my_desc_,
+        grad_.gpu_data(),   // dy: gradient w.r.t. the output
+        &beta,
+        src_desc_,
+        // fixed: mutable_grad() returns a pointer, so '->' not '.'
+        srclayers[0]->mutable_grad(this)->mutable_gpu_data()));
+}
+}  /* singa */

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/af1bf509/src/neuralnet/neuron_layer/dropout.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuron_layer/dropout.cc 
b/src/neuralnet/neuron_layer/dropout.cc
index 59d9bc5..6158a6c 100644
--- a/src/neuralnet/neuron_layer/dropout.cc
+++ b/src/neuralnet/neuron_layer/dropout.cc
@@ -19,7 +19,7 @@
 *
 *************************************************************/
 
-#include "singa/neuralnet/neuron_layer/dropout.h"
+#include "singa/neuralnet/neuron_layer.h"
 
 #include <glog/logging.h>
 #include "singa/utils/singleton.h"

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/af1bf509/src/neuralnet/neuron_layer/inner_product.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuron_layer/inner_product.cc 
b/src/neuralnet/neuron_layer/inner_product.cc
index 3f38d9c..25b71df 100644
--- a/src/neuralnet/neuron_layer/inner_product.cc
+++ b/src/neuralnet/neuron_layer/inner_product.cc
@@ -19,7 +19,7 @@
 *
 *************************************************************/
 
-#include "singa/neuralnet/neuron_layer/inner_product.h"
+#include "singa/neuralnet/neuron_layer.h"
 
 #include <glog/logging.h>
 #include "singa/utils/singleton.h"

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/af1bf509/src/neuralnet/neuron_layer/lrn.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuron_layer/lrn.cc 
b/src/neuralnet/neuron_layer/lrn.cc
index f1c302f..178b2bf 100644
--- a/src/neuralnet/neuron_layer/lrn.cc
+++ b/src/neuralnet/neuron_layer/lrn.cc
@@ -19,7 +19,7 @@
 *
 *************************************************************/
 
-#include "singa/neuralnet/neuron_layer/lrn.h"
+#include "singa/neuralnet/neuron_layer.h"
 
 #include <glog/logging.h>
 #include "singa/utils/singleton.h"

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/af1bf509/src/neuralnet/neuron_layer/pooling.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuron_layer/pooling.cc 
b/src/neuralnet/neuron_layer/pooling.cc
index a3c5e6f..fd475d8 100644
--- a/src/neuralnet/neuron_layer/pooling.cc
+++ b/src/neuralnet/neuron_layer/pooling.cc
@@ -19,7 +19,7 @@
 *
 *************************************************************/
 
-#include "singa/neuralnet/neuron_layer/pooling.h"
+#include "singa/neuralnet/neuron_layer.h"
 
 #include <glog/logging.h>
 #include "singa/utils/singleton.h"

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/af1bf509/src/neuralnet/neuron_layer/rbm.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuron_layer/rbm.cc 
b/src/neuralnet/neuron_layer/rbm.cc
index fc15e79..53a1bd9 100644
--- a/src/neuralnet/neuron_layer/rbm.cc
+++ b/src/neuralnet/neuron_layer/rbm.cc
@@ -19,7 +19,7 @@
 *
 *************************************************************/
 
-#include "singa/neuralnet/neuron_layer/rbm.h"
+#include "singa/neuralnet/neuron_layer.h"
 
 #include <glog/logging.h>
 #include "singa/utils/singleton.h"

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/af1bf509/src/neuralnet/neuron_layer/relu.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuron_layer/relu.cc 
b/src/neuralnet/neuron_layer/relu.cc
index 247d077..1e030a0 100644
--- a/src/neuralnet/neuron_layer/relu.cc
+++ b/src/neuralnet/neuron_layer/relu.cc
@@ -19,7 +19,7 @@
 *
 *************************************************************/
 
-#include "singa/neuralnet/neuron_layer/relu.h"
+#include "singa/neuralnet/neuron_layer.h"
 
 #include <glog/logging.h>
 #include "singa/utils/singleton.h"

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/af1bf509/src/neuralnet/neuron_layer/sigmoid.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuron_layer/sigmoid.cc 
b/src/neuralnet/neuron_layer/sigmoid.cc
index 3ddefd8..c449b36 100644
--- a/src/neuralnet/neuron_layer/sigmoid.cc
+++ b/src/neuralnet/neuron_layer/sigmoid.cc
@@ -19,7 +19,7 @@
 *
 *************************************************************/
 
-#include "singa/neuralnet/neuron_layer/sigmoid.h"
+#include "singa/neuralnet/neuron_layer.h"
 
 #include <glog/logging.h>
 #include "singa/utils/singleton.h"

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/af1bf509/src/neuralnet/neuron_layer/softmax.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuron_layer/softmax.cc 
b/src/neuralnet/neuron_layer/softmax.cc
index be2f337..83bbc5a 100644
--- a/src/neuralnet/neuron_layer/softmax.cc
+++ b/src/neuralnet/neuron_layer/softmax.cc
@@ -19,7 +19,7 @@
 *
 *************************************************************/
 
-#include "singa/neuralnet/neuron_layer/softmax.h"
+#include "singa/neuralnet/neuron_layer.h"
 
 namespace singa {
 
@@ -37,12 +37,17 @@ void SoftmaxLayer::Setup(const LayerProto& proto,
     const vector<Layer*>& srclayers) {
   CHECK_EQ(srclayers.size(), 1);
   NeuronLayer::Setup(proto, srclayers);
-  data_.Reshape(srclayers[0]->data(this).shape());
+  const auto& srcdata = srclayers[0]->data(this);
+  // Read all dimensions from the source blob: data_ has not been reshaped
+  // yet, so data_.shape()[0] (as originally written) is not valid here.
+  batchsize_ = srcdata.shape()[0];
+  // fixed: accessor must match the SoftmaxProto field name declared in
+  // job.proto (num_softmax_per_instance), not softmax_dim.
+  num_softmax_per_instance_ = proto.softmax_conf().num_softmax_per_instance();
+  // fixed: the count expression was missing its receiver (".count()").
+  count_per_softmax_ = srcdata.count() / batchsize_ / num_softmax_per_instance_;
+  data_.Reshape(vector<int>{batchsize_, num_softmax_per_instance_,
+      count_per_softmax_});
+  grad_.ReshapeLike(data_);
 }
 
 void SoftmaxLayer::ComputeFeature(int flag,
     const vector<Layer*>& srclayers) {
-  int batchsize = data_.shape()[0];
   int dim = data_.count() / batchsize;
   Shape<2> s = Shape2(batchsize, dim);
   Tensor<cpu, 2> prob(data_.mutable_cpu_data(), s);

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/af1bf509/src/neuralnet/neuron_layer/stanh.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuron_layer/stanh.cc 
b/src/neuralnet/neuron_layer/stanh.cc
index 3036850..70b9cd1 100644
--- a/src/neuralnet/neuron_layer/stanh.cc
+++ b/src/neuralnet/neuron_layer/stanh.cc
@@ -19,10 +19,7 @@
 *
 *************************************************************/
 
-#include "singa/neuralnet/neuron_layer/stanh.h"
-
-#include <glog/logging.h>
-#include "singa/utils/singleton.h"
+#include "singa/neuralnet/neuron_layer.h"
 
 namespace singa {
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/af1bf509/src/neuralnet/output_layer/csv.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/output_layer/csv.cc 
b/src/neuralnet/output_layer/csv.cc
index 856b95f..d2512da 100644
--- a/src/neuralnet/output_layer/csv.cc
+++ b/src/neuralnet/output_layer/csv.cc
@@ -18,7 +18,7 @@
 * under the License.
 *
 *************************************************************/
-#include "singa/neuralnet/output_layer/csv.h"
+#include "singa/neuralnet/output_layer.h"
 
 namespace singa {
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/af1bf509/src/neuralnet/output_layer/record.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/output_layer/record.cc 
b/src/neuralnet/output_layer/record.cc
index 727f539..f7b3e01 100644
--- a/src/neuralnet/output_layer/record.cc
+++ b/src/neuralnet/output_layer/record.cc
@@ -18,7 +18,7 @@
 * under the License.
 *
 *************************************************************/
-#include "singa/neuralnet/output_layer/record.h"
+#include "singa/neuralnet/output_layer.h"
 #include "singa/proto/common.pb.h"
 namespace singa {
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/af1bf509/src/proto/job.proto
----------------------------------------------------------------------
diff --git a/src/proto/job.proto b/src/proto/job.proto
index ae461f8..ca5e546 100644
--- a/src/proto/job.proto
+++ b/src/proto/job.proto
@@ -349,8 +349,9 @@ message ConvolutionProto {
   optional int32 pad = 30 [default = 0];
   // the stride
   optional int32 stride = 31 [default = 1];
-  // whether to have bias terms
-  optional bool bias_term = 32 [default = true];
+
+  // cudnn workspace size in MB
+  optional int32 workspace_byte_limit = 33 [default = 512];
 }
 
 message DataProto {
@@ -443,6 +444,16 @@ message ReLUProto {
   optional float negative_slope = 1 [default = 0];
 }
 
+message SliceProto {
+  required int32 slice_dim = 1;
+}
+
+message SoftmaxProto {
+  // Can be used to do softmax over each channel of one image by setting it to
+  // be the size of the second dimension (the first dimension is batchsize).
+  optional int32 num_softmax_per_instance = 1 [default = 1];
+}
+
 message RMSPropProto {
  // history=history*rho_+(1-rho_)*(grad*grad_scale);
   required float rho = 1;


Reply via email to