Repository: orc Updated Branches: refs/heads/master ec95303b0 -> 90f138b06
ORC-183: Add a method in Type to build type

Added static ORC_UNIQUE_PTR<Type> Type::buildTypeFromString(const std::string& input)

Fixes #115

Signed-off-by: Owen O'Malley <[email protected]>


Project: http://git-wip-us.apache.org/repos/asf/orc/repo
Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/90f138b0
Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/90f138b0
Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/90f138b0

Branch: refs/heads/master
Commit: 90f138b06053b18b86b09edcb33bf8dc25d7f659
Parents: ec95303
Author: Gang Wu <[email protected]>
Authored: Thu Apr 27 21:31:35 2017 -0700
Committer: Owen O'Malley <[email protected]>
Committed: Wed May 10 13:02:22 2017 -0700

----------------------------------------------------------------------
 c++/include/orc/Type.hh |   5 +
 c++/src/TypeImpl.cc     | 245 +++++++++++++++++++++++++++++++++++++++++++
 c++/src/TypeImpl.hh     |  67 ++++++++++++
 c++/test/TestType.cc    |  39 +++++++
 4 files changed, 356 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/orc/blob/90f138b0/c++/include/orc/Type.hh
----------------------------------------------------------------------
diff --git a/c++/include/orc/Type.hh b/c++/include/orc/Type.hh
index 25b8f53..68f5ec2 100644
--- a/c++/include/orc/Type.hh
+++ b/c++/include/orc/Type.hh
@@ -82,6 +82,11 @@ namespace orc {
      * @return a reference to the union type
      */
     virtual Type* addUnionChild(ORC_UNIQUE_PTR<Type> fieldType) = 0;
+
+    /**
+     * Build a Type object from string text representation.
+     */
+    static ORC_UNIQUE_PTR<Type> buildTypeFromString(const std::string& input);
   };
 
   const int64_t DEFAULT_DECIMAL_SCALE = 18;

http://git-wip-us.apache.org/repos/asf/orc/blob/90f138b0/c++/src/TypeImpl.cc
----------------------------------------------------------------------
diff --git a/c++/src/TypeImpl.cc b/c++/src/TypeImpl.cc
index fdf66a0..6074f94 100644
--- a/c++/src/TypeImpl.cc
+++ b/c++/src/TypeImpl.cc
@@ -491,4 +491,249 @@ namespace orc {
     return std::unique_ptr<Type>(result);
   }
 
+  namespace {
+    typedef std::vector<std::pair<std::string, Type*> > ParsedTypes;
+
+    /**
+     * Delete every type still owned by a parseType() result and empty the
+     * list. parseType() hands back raw owning pointers, so error paths
+     * must call this before throwing or the parsed types are leaked.
+     */
+    void deleteParsedTypes(ParsedTypes& types) {
+      for (size_t i = 0; i < types.size(); ++i) {
+        delete types[i].second;
+      }
+      types.clear();
+    }
+
+    /**
+     * Bounds-checked character access: returns '\0' instead of reading
+     * past the end of the string.
+     */
+    char charAt(const std::string& str, size_t pos) {
+      return pos < str.size() ? str[pos] : '\0';
+    }
+  }
+
+  /**
+   * Parse a complete type description such as "struct<a:int,b:string>".
+   * Throws std::logic_error unless the text describes exactly one type.
+   */
+  ORC_UNIQUE_PTR<Type> Type::buildTypeFromString(const std::string& input) {
+    std::vector<std::pair<std::string, Type*> > res =
+      TypeImpl::parseType(input, 0, input.size());
+    if (res.size() != 1) {
+      deleteParsedTypes(res);
+      throw std::logic_error("Invalid type string.");
+    }
+    return ORC_UNIQUE_PTR<Type>(res[0].second);
+  }
+
+  Type* TypeImpl::parseArrayType(const std::string &input,
+                                 size_t start,
+                                 size_t end) {
+    // held by a smart pointer so it is freed if parsing throws
+    ORC_UNIQUE_PTR<TypeImpl> arrayType(new TypeImpl(LIST));
+    std::vector<std::pair<std::string, Type*> > v =
+      TypeImpl::parseType(input, start, end);
+    if (v.size() != 1) {
+      deleteParsedTypes(v);
+      throw std::logic_error("Array type must contain exactly one sub type.");
+    }
+    arrayType->addChildType(ORC_UNIQUE_PTR<Type>(v[0].second));
+    return arrayType.release();
+  }
+
+  Type* TypeImpl::parseMapType(const std::string &input,
+                               size_t start,
+                               size_t end) {
+    ORC_UNIQUE_PTR<TypeImpl> mapType(new TypeImpl(MAP));
+    std::vector<std::pair<std::string, Type*> > v =
+      TypeImpl::parseType(input, start, end);
+    if (v.size() != 2) {
+      deleteParsedTypes(v);
+      throw std::logic_error(
+        "Map type must contain exactly two sub types.");
+    }
+    mapType->addChildType(ORC_UNIQUE_PTR<Type>(v[0].second));
+    mapType->addChildType(ORC_UNIQUE_PTR<Type>(v[1].second));
+    return mapType.release();
+  }
+
+  Type* TypeImpl::parseStructType(const std::string &input,
+                                  size_t start,
+                                  size_t end) {
+    ORC_UNIQUE_PTR<TypeImpl> structType(new TypeImpl(STRUCT));
+    std::vector<std::pair<std::string, Type*> > v =
+      TypeImpl::parseType(input, start, end);
+    if (v.size() == 0) {
+      throw std::logic_error(
+        "Struct type must contain at least one sub type.");
+    }
+    for (size_t i = 0; i < v.size(); ++i) {
+      structType->addStructField(v[i].first, ORC_UNIQUE_PTR<Type>(v[i].second));
+    }
+    return structType.release();
+  }
+
+  Type* TypeImpl::parseUnionType(const std::string &input,
+                                 size_t start,
+                                 size_t end) {
+    ORC_UNIQUE_PTR<TypeImpl> unionType(new TypeImpl(UNION));
+    std::vector<std::pair<std::string, Type*> > v =
+      TypeImpl::parseType(input, start, end);
+    if (v.size() == 0) {
+      throw std::logic_error("Union type must contain at least one sub type.");
+    }
+    for (size_t i = 0; i < v.size(); ++i) {
+      unionType->addChildType(ORC_UNIQUE_PTR<Type>(v[i].second));
+    }
+    return unionType.release();
+  }
+
+  Type* TypeImpl::parseDecimalType(const std::string &input,
+                                   size_t start,
+                                   size_t end) {
+    size_t sep = input.find(',', start);
+    // test for npos first so that sep + 1 is never evaluated on it
+    if (sep == std::string::npos || sep + 1 >= end) {
+      throw std::logic_error("Decimal type must specify precision and scale.");
+    }
+    uint64_t precision =
+      static_cast<uint64_t>(atoi(input.substr(start, sep - start).c_str()));
+    uint64_t scale =
+      static_cast<uint64_t>(atoi(input.substr(sep + 1, end - sep - 1).c_str()));
+    return new TypeImpl(DECIMAL, precision, scale);
+  }
+
+  Type* TypeImpl::parseCategory(std::string category,
+                                const std::string &input,
+                                size_t start,
+                                size_t end) {
+    if (category == "boolean") {
+      return new TypeImpl(BOOLEAN);
+    } else if (category == "tinyint") {
+      return new TypeImpl(BYTE);
+    } else if (category == "smallint") {
+      return new TypeImpl(SHORT);
+    } else if (category == "int") {
+      return new TypeImpl(INT);
+    } else if (category == "bigint") {
+      return new TypeImpl(LONG);
+    } else if (category == "float") {
+      return new TypeImpl(FLOAT);
+    } else if (category == "double") {
+      return new TypeImpl(DOUBLE);
+    } else if (category == "string") {
+      return new TypeImpl(STRING);
+    } else if (category == "binary") {
+      return new TypeImpl(BINARY);
+    } else if (category == "timestamp") {
+      return new TypeImpl(TIMESTAMP);
+    } else if (category == "array") {
+      return parseArrayType(input, start, end);
+    } else if (category == "map") {
+      return parseMapType(input, start, end);
+    } else if (category == "struct") {
+      return parseStructType(input, start, end);
+    } else if (category == "uniontype") {
+      return parseUnionType(input, start, end);
+    } else if (category == "decimal") {
+      return parseDecimalType(input, start, end);
+    } else if (category == "date") {
+      return new TypeImpl(DATE);
+    } else if (category == "varchar") {
+      uint64_t maxLength = static_cast<uint64_t>(
+        atoi(input.substr(start, end - start).c_str()));
+      return new TypeImpl(VARCHAR, maxLength);
+    } else if (category == "char") {
+      uint64_t maxLength = static_cast<uint64_t>(
+        atoi(input.substr(start, end - start).c_str()));
+      return new TypeImpl(CHAR, maxLength);
+    } else {
+      throw std::logic_error("Unknown type " + category);
+    }
+  }
+
+  std::vector<std::pair<std::string, Type *> > TypeImpl::parseType(
+      const std::string &input,
+      size_t start,
+      size_t end) {
+    std::string types = input.substr(start, end - start);
+    std::vector<std::pair<std::string, Type *> > res;
+    size_t pos = 0;
+
+    try {
+      while (pos < types.size()) {
+        size_t endPos = pos;
+        // <cctype> functions need a value representable as unsigned char
+        while (endPos < types.size() &&
+               isalnum(static_cast<unsigned char>(types[endPos]))) {
+          ++endPos;
+        }
+
+        std::string fieldName;
+        if (charAt(types, endPos) == ':') {
+          fieldName = types.substr(pos, endPos - pos);
+          pos = ++endPos;
+          while (endPos < types.size() &&
+                 isalpha(static_cast<unsigned char>(types[endPos]))) {
+            ++endPos;
+          }
+        }
+
+        size_t nextPos = endPos + 1;
+        const char delimiter = charAt(types, endPos);
+        if (delimiter == '<') {
+          int count = 1;
+          while (nextPos < types.size()) {
+            if (types[nextPos] == '<') {
+              ++count;
+            } else if (types[nextPos] == '>') {
+              --count;
+            }
+            if (count == 0) {
+              break;
+            }
+            ++nextPos;
+          }
+          if (nextPos == types.size()) {
+            throw std::logic_error("Invalid type string. Cannot find closing >");
+          }
+        } else if (delimiter == '(') {
+          while (nextPos < types.size() && types[nextPos] != ')') {
+            ++nextPos;
+          }
+          if (nextPos == types.size()) {
+            throw std::logic_error("Invalid type string. Cannot find closing )");
+          }
+        } else if (delimiter != ',' && delimiter != '\0') {
+          throw std::logic_error("Unrecognized character.");
+        }
+
+        std::string category = types.substr(pos, endPos - pos);
+        // keep ownership until the pointer is safely stored in res
+        ORC_UNIQUE_PTR<Type> type(
+          parseCategory(category, types, endPos + 1, nextPos));
+        res.push_back(std::make_pair(fieldName, type.get()));
+        type.release();
+
+        // a primitive type at the very end leaves nextPos one past the
+        // terminator, so this read must be bounds-checked
+        const char closing = charAt(types, nextPos);
+        if (closing == ')' || closing == '>') {
+          pos = nextPos + 2;
+        } else {
+          pos = nextPos;
+        }
+      }
+    } catch (...) {
+      // release everything parsed so far before reporting the error
+      deleteParsedTypes(res);
+      throw;
+    }
+
+    return res;
+  }
+
 }

http://git-wip-us.apache.org/repos/asf/orc/blob/90f138b0/c++/src/TypeImpl.hh
----------------------------------------------------------------------
diff --git a/c++/src/TypeImpl.hh b/c++/src/TypeImpl.hh
index e2866e4..3c3f739 100644
--- a/c++/src/TypeImpl.hh
+++ b/c++/src/TypeImpl.hh
@@ -98,6 +98,11 @@ namespace orc {
      */
     void addChildType(std::unique_ptr<Type> childType);
 
+    static std::vector<std::pair<std::string, Type *> > parseType(
+      const std::string &input,
+      size_t start,
+      size_t end);
+
   private:
     /**
      * Assign ids to this node and its children giving this
@@ -110,6 +115,68 @@ namespace orc {
      * Ensure that ids are assigned to all of the nodes.
      */
     void ensureIdAssigned() const;
+
+    /**
+     * Parse array type from string
+     * @param input the input string of an array type
+     * @param start start position of the input string
+     * @param end end position of the input string
+     */
+    static Type* parseArrayType(const std::string &input,
+                                size_t start,
+                                size_t end);
+
+    /**
+     * Parse map type from string
+     * @param input the input string of a map type
+     * @param start start position of the input string
+     * @param end end position of the input string
+     */
+    static Type* parseMapType(const std::string &input,
+                              size_t start,
+                              size_t end);
+
+    /**
+     * Parse struct type from string
+     * @param input the input string of a struct type
+     * @param start start position of the input string
+     * @param end end position of the input string
+     */
+    static Type* parseStructType(const std::string &input,
+                                 size_t start,
+                                 size_t end);
+
+    /**
+     * Parse union type from string
+     * @param input the input string of an union type
+     * @param start start position of the input string
+     * @param end end position of the input string
+     */
+    static Type* parseUnionType(const std::string &input,
+                                size_t start,
+                                size_t end);
+
+    /**
+     * Parse decimal type from string
+     * @param input the input string of a decimal type
+     * @param start start position of the input string
+     * @param end end position of the input string
+     */
+    static Type* parseDecimalType(const std::string &input,
+                                  size_t start,
+                                  size_t end);
+
+    /**
+     * Parse type for a category
+     * @param category type name
+     * @param input the input string of the category
+     * @param start start position of the input string
+     * @param end end position of the input string
+     */
+    static Type* parseCategory(std::string category,
+                               const std::string &input,
+                               size_t start,
+                               size_t end);
   };
 
   std::unique_ptr<Type> convertType(const proto::Type& type,

http://git-wip-us.apache.org/repos/asf/orc/blob/90f138b0/c++/test/TestType.cc
----------------------------------------------------------------------
diff --git a/c++/test/TestType.cc b/c++/test/TestType.cc
index 3c595d0..8ce9313 100644
--- a/c++/test/TestType.cc
+++ b/c++/test/TestType.cc
@@ -274,4 +274,43 @@ namespace orc {
     EXPECT_EQ(13, cutType->getSubtype(1)->getColumnId());
     EXPECT_EQ(13, cutType->getSubtype(1)->getMaximumColumnId());
   }
+
+  TEST(TestType, buildTypeFromString) {
+    std::string typeStr = "struct<a:int,b:string,c:decimal(10,2),d:varchar(5)>";
+    ORC_UNIQUE_PTR<Type> type = Type::buildTypeFromString(typeStr);
+    EXPECT_EQ(typeStr, type->toString());
+
+    typeStr = "map<boolean,float>";
+    type = Type::buildTypeFromString(typeStr);
+    EXPECT_EQ(typeStr, type->toString());
+
+    typeStr = "uniontype<bigint,binary,timestamp>";
+    type = Type::buildTypeFromString(typeStr);
+    EXPECT_EQ(typeStr, type->toString());
+
+    typeStr = "struct<a:bigint,b:struct<a:binary,b:timestamp>>";
+    type = Type::buildTypeFromString(typeStr);
+    EXPECT_EQ(typeStr, type->toString());
+
+    typeStr =
+      "struct<a:bigint,b:struct<a:binary,b:timestamp>,c:map<double,tinyint>>";
+    type = Type::buildTypeFromString(typeStr);
+    EXPECT_EQ(typeStr, type->toString());
+
+    // a bare primitive type ends exactly at the end of the input
+    typeStr = "int";
+    type = Type::buildTypeFromString(typeStr);
+    EXPECT_EQ(typeStr, type->toString());
+
+    typeStr = "array<string>";
+    type = Type::buildTypeFromString(typeStr);
+    EXPECT_EQ(typeStr, type->toString());
+  }
+
+  TEST(TestType, buildTypeFromStringRejectsInvalidInput) {
+    EXPECT_THROW(Type::buildTypeFromString(""), std::logic_error);
+    EXPECT_THROW(Type::buildTypeFromString("foo"), std::logic_error);
+    EXPECT_THROW(Type::buildTypeFromString("map<int>"), std::logic_error);
+    EXPECT_THROW(Type::buildTypeFromString("struct<a:int"), std::logic_error);
+  }
 }
