Repository: incubator-singa Updated Branches: refs/heads/dev 245f60e68 -> 493192233
SINGA-213 Implement Encoder and Decoder for CSV TextEncoder encodes a Tensor into a 1D string where the values are separated by commas. TextDecoder decodes a string of data into Tensors (data and [optional] label). Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/7444f0ac Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/7444f0ac Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/7444f0ac Branch: refs/heads/dev Commit: 7444f0acc42f47e596967ac4f4640b30872959c9 Parents: 245f60e Author: jixin <[email protected]> Authored: Mon Jul 11 12:16:55 2016 +0800 Committer: jixin <[email protected]> Committed: Mon Jul 11 12:17:04 2016 +0800 ---------------------------------------------------------------------- .gitignore | 1 + include/singa/io/decoder.h | 20 ++++++++++- include/singa/io/encoder.h | 16 +++++++-- src/io/textfile_decoder.cc | 54 ++++++++++++++++++++++++++++ src/io/textfile_encoder.cc | 43 +++++++++++++++++++++++ src/proto/io.proto | 2 +- test/singa/test_textfile_decoder.cc | 60 ++++++++++++++++++++++++++++++++ 7 files changed, 192 insertions(+), 4 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/7444f0ac/.gitignore ---------------------------------------------------------------------- diff --git a/.gitignore b/.gitignore index 221b3d7..887c409 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ *.log *.pb.h *.pb.cc +*.cxx build/ thirdparty/* !thirdparty/install.sh http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/7444f0ac/include/singa/io/decoder.h ---------------------------------------------------------------------- diff --git a/include/singa/io/decoder.h b/include/singa/io/decoder.h index dad2231..7d47d6a 100644 --- a/include/singa/io/decoder.h +++ b/include/singa/io/decoder.h @@ -40,17 +40,35 @@ class Decoder { #ifdef USE_OPENCV /// Decode the 
string as an ImageRecord object and convert it into a image /// tensor (dtype is kFloat32) and a label tensor (dtype is kInt). -class Proto2JPGDecoder : public Decoder { +class JPGDecoder : public Decoder { public: void Setup(const DecoderConf& conf) override { image_dim_order_ = conf.image_dim_order(); } std::vector<Tensor> Decode(std::string value) override; + const std::string image_dim_order() const { return image_dim_order_; } + private: /// Indicate the dimension order for the output image tensor. std::string image_dim_order_ = "CHW"; }; #endif + +/// Decode the string and convert it into a text +/// tensor (dtype is kFloat32) and a label tensor (dtype is kInt). +class TextDecoder : public Decoder { + public: + void Setup(const DecoderConf& conf) override { + has_label_ = conf.has_label(); + } + std::vector<Tensor> Decode(std::string value) override; + + const bool has_label() const { return has_label_; } + + private: + /// Indicate the dimension order for the output image tensor. + bool has_label_ = true; +}; } // namespace singa #endif // SINGA_IO_DECODER_H_ http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/7444f0ac/include/singa/io/encoder.h ---------------------------------------------------------------------- diff --git a/include/singa/io/encoder.h b/include/singa/io/encoder.h index 5c1f242..90cea0e 100644 --- a/include/singa/io/encoder.h +++ b/include/singa/io/encoder.h @@ -41,7 +41,7 @@ class Encoder { #ifdef USE_OPENCV /// Convert an image and its label into an ImageRecord (protobuf message). -class JPG2ProtoEncoder : public Encoder { +class JPGEncoder : public Encoder { public: void Setup(const EncoderConf& conf) override { image_dim_order_ = conf.image_dim_order(); @@ -52,10 +52,22 @@ class JPG2ProtoEncoder : public Encoder { /// The label tensor's data type is kInt. 
std::string Encode(vector<Tensor>& data) override; + const std::string image_dim_order() const { return image_dim_order_; } + private: /// Indicate the input image tensor's dimension order. - std::string image_dim_order_ = "HWC"; + std::string image_dim_order_ = "CHW"; }; #endif // USE_OPENCV + +/// Convert a set of tensors parsed from csv file into strings +class TextEncoder : public Encoder { + public: + void Setup(const EncoderConf& conf) override {} + /// 'data' has two tesors, one for the text vector (1D) and one for the + /// label. The text tensor's data type is kFloat. + /// The label tensor's data type is kInt. + std::string Encode(vector<Tensor>& data) override; +}; } // namespace singa #endif // SINGA_IO_ENCODER_H_ http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/7444f0ac/src/io/textfile_decoder.cc ---------------------------------------------------------------------- diff --git a/src/io/textfile_decoder.cc b/src/io/textfile_decoder.cc new file mode 100644 index 0000000..221fb9f --- /dev/null +++ b/src/io/textfile_decoder.cc @@ -0,0 +1,54 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "singa/io/decoder.h" +#include <string> +#include <sstream> + +#define MAXSIZE 4096 + +namespace singa { + +std::vector<Tensor> TextDecoder::Decode(std::string value) { + std::vector<Tensor> output; + std::stringstream ss; + ss.str(value); + int l = 0; + if (has_label_ == true) ss >> l; + std::string str; + float* d = new float[MAXSIZE]; + size_t size = 0; + while(std::getline(ss, str, ',')) { + float temp; + if (std::stringstream(str) >> temp) { + CHECK_LE(size, MAXSIZE-1); + d[size++] = temp; + } + } + + Tensor data(Shape{size}, kFloat32); + data.CopyDataFromHostPtr(d, size); + output.push_back(data); + if (has_label_ == true) { + Tensor label(Shape{1}, kInt); + label.CopyDataFromHostPtr(&l, 1); + output.push_back(label); + } + return output; +} +} // namespace singa http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/7444f0ac/src/io/textfile_encoder.cc ---------------------------------------------------------------------- diff --git a/src/io/textfile_encoder.cc b/src/io/textfile_encoder.cc new file mode 100644 index 0000000..72401ee --- /dev/null +++ b/src/io/textfile_encoder.cc @@ -0,0 +1,43 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "singa/io/encoder.h" +#include <sstream> + +namespace singa { + +std::string TextEncoder::Encode(vector<Tensor>& data) { + CHECK_GE(data.size(), 1); + size_t size = data[0].Size(); + const float* value = data[0].data<float>(); + std::string des = ""; + if (data.size() == 2) { + const float label = (const float)data[1].data<int>()[0]; + std::ostringstream buff; + buff << label; + des += buff.str() + ','; + } + for (size_t i = 0; i < size; i++) { + std::ostringstream buff; + buff << value[i]; + if (i == size - 1) des += buff.str(); + else des += buff.str() + ','; + } + return des; +} +} // namespace singa http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/7444f0ac/src/proto/io.proto ---------------------------------------------------------------------- diff --git a/src/proto/io.proto b/src/proto/io.proto index 788b235..96280e5 100644 --- a/src/proto/io.proto +++ b/src/proto/io.proto @@ -27,9 +27,9 @@ message EncoderConf { message DecoderConf { optional string type = 1 [default = "proto2jpg"]; optional string image_dim_order = 2 [default = "CHW"]; + optional bool has_label = 3 [default = true]; } - message ImageRecord { repeated int32 shape = 1; repeated int32 label = 2; http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/7444f0ac/test/singa/test_textfile_decoder.cc ---------------------------------------------------------------------- diff --git a/test/singa/test_textfile_decoder.cc b/test/singa/test_textfile_decoder.cc new file mode 100644 index 0000000..bb31b88 --- /dev/null +++ b/test/singa/test_textfile_decoder.cc @@ -0,0 +1,60 @@ +/************************************************************ +* +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. 
The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. See the License for the +* specific language governing permissions and limitations +* under the License. +* +*************************************************************/ + +#include "singa/io/encoder.h" +#include "singa/io/decoder.h" +#include "gtest/gtest.h" +#include <sstream> +#include <algorithm> + +using singa::Shape; +using singa::Tensor; +TEST(TextDecoder, Decode) { + singa::TextEncoder encoder; + singa::TextDecoder decoder; + + singa::DecoderConf decoder_conf; + decoder_conf.set_has_label(true); + decoder.Setup(decoder_conf); + EXPECT_EQ(true, decoder.has_label()); + + float in_data[] = {1.23, 4.5, 5.1, 3.33, 0.44}; + std::string in_str = "2, 1.23, 4.5, 5.1, 3.33, 0.44"; + int in_label = 2; + size_t size = 5; + + std::vector<Tensor> input; + Tensor data(Shape{size}, singa::kFloat32), label(Shape{1}, singa::kInt); + data.CopyDataFromHostPtr<float>(in_data, size); + label.CopyDataFromHostPtr<int>(&in_label, 1); + input.push_back(data); + input.push_back(label); + + std::string value = encoder.Encode(input); + in_str.erase(std::remove(in_str.begin(), in_str.end(), ' '), in_str.end()); + EXPECT_EQ(in_str, value); + + std::vector<Tensor> output = decoder.Decode(value); + const auto* out_data = output.at(0).data<float>(); + const auto* out_label = output.at(1).data<int>(); + for (size_t i = 0; i < size; i++) EXPECT_EQ(in_data[i], out_data[i]); + EXPECT_EQ(in_label, out_label[0]); +}
