Repository: incubator-singa
Updated Branches:
  refs/heads/dev 245f60e68 -> 493192233


SINGA-213 Implement Encoder and Decoder for CSV

TextEncoder encodes a Tensor into a 1D string in which the values are separated by commas.

TextDecoder decodes a string of data into Tensors (data and an [optional] label).


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/7444f0ac
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/7444f0ac
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/7444f0ac

Branch: refs/heads/dev
Commit: 7444f0acc42f47e596967ac4f4640b30872959c9
Parents: 245f60e
Author: jixin <[email protected]>
Authored: Mon Jul 11 12:16:55 2016 +0800
Committer: jixin <[email protected]>
Committed: Mon Jul 11 12:17:04 2016 +0800

----------------------------------------------------------------------
 .gitignore                          |  1 +
 include/singa/io/decoder.h          | 20 ++++++++++-
 include/singa/io/encoder.h          | 16 +++++++--
 src/io/textfile_decoder.cc          | 54 ++++++++++++++++++++++++++++
 src/io/textfile_encoder.cc          | 43 +++++++++++++++++++++++
 src/proto/io.proto                  |  2 +-
 test/singa/test_textfile_decoder.cc | 60 ++++++++++++++++++++++++++++++++
 7 files changed, 192 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/7444f0ac/.gitignore
----------------------------------------------------------------------
diff --git a/.gitignore b/.gitignore
index 221b3d7..887c409 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,6 +7,7 @@
 *.log
 *.pb.h
 *.pb.cc
+*.cxx
 build/
 thirdparty/*
 !thirdparty/install.sh

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/7444f0ac/include/singa/io/decoder.h
----------------------------------------------------------------------
diff --git a/include/singa/io/decoder.h b/include/singa/io/decoder.h
index dad2231..7d47d6a 100644
--- a/include/singa/io/decoder.h
+++ b/include/singa/io/decoder.h
@@ -40,17 +40,35 @@ class Decoder {
 #ifdef USE_OPENCV
 /// Decode the string as an ImageRecord object and convert it into a image
 /// tensor (dtype is kFloat32) and a label tensor (dtype is kInt).
-class Proto2JPGDecoder : public Decoder {
+class JPGDecoder : public Decoder {
  public:
   void Setup(const DecoderConf& conf) override {
     image_dim_order_ = conf.image_dim_order();
   }
   std::vector<Tensor> Decode(std::string value) override;
 
+  const std::string image_dim_order() const { return image_dim_order_; }
+
  private:
   /// Indicate the dimension order for the output image tensor.
   std::string image_dim_order_ = "CHW";
 };
 #endif
+
+/// Decode a string of comma-separated text into a data tensor (dtype is
+/// kFloat32) and, when has_label is set, a label tensor (dtype is kInt).
+class TextDecoder : public Decoder {
+ public:
+  /// Read the has_label option from the decoder configuration.
+  void Setup(const DecoderConf& conf) override {
+    has_label_ = conf.has_label();
+  }
+  /// Parse 'value' and return the data tensor, followed by the label tensor
+  /// when has_label is set.
+  std::vector<Tensor> Decode(std::string value) override;
+
+  /// Whether Decode() reads a leading label and emits a label tensor.
+  bool has_label() const { return has_label_; }
+
+ private:
+  /// Indicate whether each input string starts with a label.
+  bool has_label_ = true;
+};
 } // namespace singa
 #endif // SINGA_IO_DECODER_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/7444f0ac/include/singa/io/encoder.h
----------------------------------------------------------------------
diff --git a/include/singa/io/encoder.h b/include/singa/io/encoder.h
index 5c1f242..90cea0e 100644
--- a/include/singa/io/encoder.h
+++ b/include/singa/io/encoder.h
@@ -41,7 +41,7 @@ class Encoder {
 
 #ifdef USE_OPENCV
 /// Convert an image and its label into an ImageRecord (protobuf message).
-class JPG2ProtoEncoder : public Encoder {
+class JPGEncoder : public Encoder {
  public:
   void Setup(const EncoderConf& conf) override {
     image_dim_order_ = conf.image_dim_order();
@@ -52,10 +52,22 @@ class JPG2ProtoEncoder : public Encoder {
   /// The label tensor's data type is kInt.
   std::string Encode(vector<Tensor>& data) override;
 
+  const std::string image_dim_order() const { return image_dim_order_; }
+
  private:
   /// Indicate the input image tensor's dimension order.
-  std::string image_dim_order_ = "HWC";
+  std::string image_dim_order_ = "CHW";
 };
 #endif  // USE_OPENCV
+
+/// Convert a set of tensors parsed from a csv file into a comma-separated
+/// string.
+class TextEncoder : public Encoder {
+ public:
+  /// This encoder reads nothing from the configuration.
+  void Setup(const EncoderConf& conf) override {}
+  /// 'data' has one or two tensors: the text vector (1D) and, optionally, one
+  /// for the label. The text tensor's data type is kFloat32.
+  /// The label tensor's data type is kInt. When exactly two tensors are given
+  /// the label is written first, followed by the text values.
+  std::string Encode(vector<Tensor>& data) override;
+};
 } // namespace singa
 #endif  // SINGA_IO_ENCODER_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/7444f0ac/src/io/textfile_decoder.cc
----------------------------------------------------------------------
diff --git a/src/io/textfile_decoder.cc b/src/io/textfile_decoder.cc
new file mode 100644
index 0000000..221fb9f
--- /dev/null
+++ b/src/io/textfile_decoder.cc
@@ -0,0 +1,54 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "singa/io/decoder.h"
+#include <string>
+#include <sstream>
+
+#define MAXSIZE 4096
+
+namespace singa {
+
+/// Decode one comma-separated record into tensors.
+///
+/// When has_label_ is set, a leading integer is read as the label before the
+/// remaining text is split on ','. Every field that starts with a parsable
+/// float becomes one element of the data tensor; other fields are skipped.
+///
+/// @param value the record text, e.g. "2,1.23,4.5".
+/// @return the 1D kFloat32 data tensor, followed by a 1-element kInt label
+///         tensor when has_label_ is set.
+std::vector<Tensor> TextDecoder::Decode(std::string value) {
+  std::stringstream ss(value);
+  int l = 0;
+  if (has_label_) ss >> l;
+
+  // Fixed-size host staging buffer. A std::vector releases its storage on
+  // return; the raw new[] used here before was never deleted, leaking
+  // MAXSIZE floats on every call.
+  std::vector<float> d(MAXSIZE);
+  size_t size = 0;
+  std::string str;
+  while (std::getline(ss, str, ',')) {
+    float temp;
+    // Fields that do not parse (such as the empty one left between the label
+    // and the first ',') are ignored.
+    if (std::stringstream(str) >> temp) {
+      // Guard the buffer bound before writing.
+      CHECK_LE(size, MAXSIZE - 1);
+      d[size++] = temp;
+    }
+  }
+
+  std::vector<Tensor> output;
+  Tensor data(Shape{size}, kFloat32);
+  data.CopyDataFromHostPtr(d.data(), size);
+  output.push_back(data);
+  if (has_label_) {
+    Tensor label(Shape{1}, kInt);
+    label.CopyDataFromHostPtr(&l, 1);
+    output.push_back(label);
+  }
+  return output;
+}
+}  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/7444f0ac/src/io/textfile_encoder.cc
----------------------------------------------------------------------
diff --git a/src/io/textfile_encoder.cc b/src/io/textfile_encoder.cc
new file mode 100644
index 0000000..72401ee
--- /dev/null
+++ b/src/io/textfile_encoder.cc
@@ -0,0 +1,43 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "singa/io/encoder.h"
+#include <sstream>
+
+namespace singa {
+
+/// Encode tensors as one comma-separated record.
+///
+/// @param data data[0] is the value tensor, read as float. When 'data' holds
+///        exactly two tensors, the first int element of data[1] is the label.
+/// @return "label,v0,v1,..." when a label is given, otherwise "v0,v1,...".
+std::string TextEncoder::Encode(vector<Tensor>& data) {
+  CHECK_GE(data.size(), 1);
+  const size_t size = data[0].Size();
+  const float* value = data[0].data<float>();
+  std::ostringstream buff;
+  if (data.size() == 2) {
+    // Write the label as an integer. Converting it to float first (as was
+    // done before) printed labels >= 1e6 in scientific notation, which
+    // TextDecoder cannot read back as an int.
+    buff << data[1].data<int>()[0] << ',';
+  }
+  // NOTE: values are written with the stream's default float formatting.
+  for (size_t i = 0; i < size; i++) {
+    if (i > 0) buff << ',';
+    buff << value[i];
+  }
+  return buff.str();
+}
+}  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/7444f0ac/src/proto/io.proto
----------------------------------------------------------------------
diff --git a/src/proto/io.proto b/src/proto/io.proto
index 788b235..96280e5 100644
--- a/src/proto/io.proto
+++ b/src/proto/io.proto
@@ -27,9 +27,9 @@ message EncoderConf {
 message DecoderConf {
   optional string type = 1 [default = "proto2jpg"];
   optional string image_dim_order = 2 [default = "CHW"];
+  optional bool has_label = 3 [default = true];
 }
 
-
 message ImageRecord {
   repeated int32 shape = 1;
   repeated int32 label = 2;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/7444f0ac/test/singa/test_textfile_decoder.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_textfile_decoder.cc 
b/test/singa/test_textfile_decoder.cc
new file mode 100644
index 0000000..bb31b88
--- /dev/null
+++ b/test/singa/test_textfile_decoder.cc
@@ -0,0 +1,60 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "singa/io/encoder.h"
+#include "singa/io/decoder.h"
+#include "gtest/gtest.h"
+#include <sstream>
+#include <algorithm>
+
+using singa::Shape;
+using singa::Tensor;
+// Round trip: encode {data, label} into CSV text, check the text, then decode
+// it back and compare against the inputs.
+TEST(TextDecoder, Decode) {
+  singa::TextEncoder encoder;
+  singa::TextDecoder decoder;
+
+  singa::DecoderConf decoder_conf;
+  decoder_conf.set_has_label(true);
+  decoder.Setup(decoder_conf);
+  EXPECT_TRUE(decoder.has_label());
+
+  float in_data[] = {1.23f, 4.5f, 5.1f, 3.33f, 0.44f};
+  std::string in_str = "2, 1.23, 4.5, 5.1, 3.33, 0.44";
+  int in_label = 2;
+  size_t size = 5;
+
+  std::vector<Tensor> input;
+  Tensor data(Shape{size}, singa::kFloat32), label(Shape{1}, singa::kInt);
+  data.CopyDataFromHostPtr<float>(in_data, size);
+  label.CopyDataFromHostPtr<int>(&in_label, 1);
+  input.push_back(data);
+  input.push_back(label);
+
+  std::string value = encoder.Encode(input);
+  // The encoder emits no spaces, so strip them from the expected text.
+  in_str.erase(std::remove(in_str.begin(), in_str.end(), ' '), in_str.end());
+  EXPECT_EQ(in_str, value);
+
+  std::vector<Tensor> output = decoder.Decode(value);
+  // Fail here if a tensor is missing rather than letting at() throw.
+  ASSERT_EQ(2u, output.size());
+  const auto* out_data = output.at(0).data<float>();
+  const auto* out_label = output.at(1).data<int>();
+  // Values went through text, so compare with a floating-point tolerance.
+  for (size_t i = 0; i < size; i++) EXPECT_FLOAT_EQ(in_data[i], out_data[i]);
+  EXPECT_EQ(in_label, out_label[0]);
+}

Reply via email to