This is an automated email from the ASF dual-hosted git repository.
hongze pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git
The following commit(s) were added to refs/heads/main by this push:
new a9fa77224 [VL] Substrait-to-Velox: Support nested complex type
signature parsing
a9fa77224 is described below
commit a9fa7722405ddf72cc1aaba026aafebaf771748e
Author: Hongze Zhang <[email protected]>
AuthorDate: Thu May 9 14:02:36 2024 +0800
[VL] Substrait-to-Velox: Support nested complex type signature parsing
---
cpp/velox/substrait/VeloxSubstraitSignature.cc | 103 +++++++++++++++++--------
cpp/velox/tests/VeloxSubstraitSignatureTest.cc | 7 ++
2 files changed, 78 insertions(+), 32 deletions(-)
diff --git a/cpp/velox/substrait/VeloxSubstraitSignature.cc
b/cpp/velox/substrait/VeloxSubstraitSignature.cc
index 34e0df6de..ee7c5f513 100644
--- a/cpp/velox/substrait/VeloxSubstraitSignature.cc
+++ b/cpp/velox/substrait/VeloxSubstraitSignature.cc
@@ -72,6 +72,48 @@ std::string
VeloxSubstraitSignature::toSubstraitSignature(const TypePtr& type) {
}
}
+namespace {
+using index = std::string::size_type;
+
+index findEnclosingPos(std::string text, index from, char left, char right) {
+ VELOX_CHECK(left != right)
+ VELOX_CHECK(text.at(from) == left)
+ int32_t stackedLeftChars = 0;
+ for (index idx = from; idx < text.size(); idx++) {
+ const char ch = text.at(idx);
+ if (ch == left) {
+ stackedLeftChars++;
+ }
+ if (ch == right) {
+ stackedLeftChars--;
+ }
+ if (stackedLeftChars == 0) {
+ return idx;
+ }
+ }
+ VELOX_FAIL("Unable to find enclose character from text: " + text)
+}
+
+index findSansNesting(std::string text, index from, char target, char left,
char right) {
+ VELOX_CHECK(left != right)
+ VELOX_CHECK(target != left && target != right)
+ int32_t stackedLeftChars = 0;
+ for (index idx = from; idx < text.size(); idx++) {
+ const char ch = text.at(idx);
+ if (ch == left) {
+ stackedLeftChars++;
+ }
+ if (ch == right) {
+ stackedLeftChars--;
+ }
+ if (ch == target && stackedLeftChars == 0) {
+ return idx;
+ }
+ }
+ return std::string::npos;
+}
+} // namespace
+
TypePtr VeloxSubstraitSignature::fromSubstraitSignature(const std::string&
signature) {
if (signature == "bool") {
return BOOLEAN();
@@ -123,7 +165,7 @@ TypePtr
VeloxSubstraitSignature::fromSubstraitSignature(const std::string& signa
auto parseNestedTypeSignature = [&](const std::string& signature) ->
std::vector<TypePtr> {
auto start = signature.find_first_of('<');
- auto end = signature.find_last_of('>');
+ auto end = findEnclosingPos(signature, start, '<', '>');
VELOX_CHECK(
end - start > 1,
"Native validation failed due to: more information is needed to create
nested type for {}",
@@ -132,30 +174,25 @@ TypePtr
VeloxSubstraitSignature::fromSubstraitSignature(const std::string& signa
std::string childrenTypes = signature.substr(start + 1, end - start - 1);
// Split the types with delimiter.
- std::string delimiter = ",";
- std::size_t pos;
+ const char delimiter = ',';
std::vector<TypePtr> types;
- while ((pos = childrenTypes.find(delimiter)) != std::string::npos) {
- auto typeStr = childrenTypes.substr(0, pos);
- std::size_t endPos = pos;
- if (startWith(typeStr, "dec") || startWith(typeStr, "struct") ||
startWith(typeStr, "map") ||
- startWith(typeStr, "list")) {
- endPos = childrenTypes.find(">") + 1;
- if (endPos > pos) {
- typeStr += childrenTypes.substr(pos, endPos - pos);
- } else {
- // For nested case, the end '>' could missing,
- // so the last position is treated as end.
- typeStr += childrenTypes.substr(pos);
- endPos = childrenTypes.size();
- }
+ size_t typeStart = 0;
+ while (true) {
+ if (typeStart == childrenTypes.size()) {
+ break;
+ }
+ VELOX_CHECK(typeStart < childrenTypes.size())
+ const size_t typeEnd = findSansNesting(childrenTypes, typeStart,
delimiter, '<', '>');
+ if (typeEnd == std::string::npos) {
+ std::string typeStr = childrenTypes.substr(typeStart);
+ types.emplace_back(fromSubstraitSignature(typeStr));
+ break;
}
+ std::string typeStr = childrenTypes.substr(typeStart, typeEnd -
typeStart);
types.emplace_back(fromSubstraitSignature(typeStr));
- childrenTypes.erase(0, endPos + delimiter.length());
- }
- if (childrenTypes.size() > 0 && !startWith(childrenTypes, ">")) {
- types.emplace_back(fromSubstraitSignature(childrenTypes));
+ typeStart = typeEnd + 1;
}
+
return types;
};
@@ -172,6 +209,10 @@ TypePtr
VeloxSubstraitSignature::fromSubstraitSignature(const std::string& signa
if (startWith(signature, "struct")) {
// Struct type name is in the format of struct<T1,T2,...,Tn>.
auto types = parseNestedTypeSignature(signature);
+ if (types.empty()) {
+ VELOX_UNSUPPORTED(
+ "VeloxSubstraitSignature::fromSubstraitSignature: Unrecognizable
struct type signature {}.", signature);
+ }
std::vector<std::string> names(types.size());
for (int i = 0; i < types.size(); i++) {
names[i] = "";
@@ -183,22 +224,20 @@ TypePtr
VeloxSubstraitSignature::fromSubstraitSignature(const std::string& signa
// Map type name is in the format of map<T1,T2>.
auto types = parseNestedTypeSignature(signature);
if (types.size() != 2) {
- VELOX_UNSUPPORTED("Substrait type signature conversion to Velox type not
supported for {}.", signature);
+ VELOX_UNSUPPORTED(
+ "VeloxSubstraitSignature::fromSubstraitSignature: Unrecognizable map
type signature {}.", signature);
}
return MAP(std::move(types)[0], std::move(types)[1]);
}
if (startWith(signature, "list")) {
- auto listStart = signature.find_first_of('<');
- auto listEnd = signature.find_last_of('>');
- VELOX_CHECK(
- listEnd - listStart > 1,
- "Native validation failed due to: more information is needed to create
ListType: {}",
- signature);
-
- auto elementTypeStr = signature.substr(listStart + 1, listEnd - listStart
- 1);
- auto elementType = fromSubstraitSignature(elementTypeStr);
- return ARRAY(elementType);
+ // Array type name is in the format of list<T>.
+ auto types = parseNestedTypeSignature(signature);
+ if (types.size() != 1) {
+ VELOX_UNSUPPORTED(
+ "VeloxSubstraitSignature::fromSubstraitSignature: Unrecognizable
list type signature {}.", signature);
+ }
+ return ARRAY(std::move(types)[0]);
}
VELOX_UNSUPPORTED("Substrait type signature conversion to Velox type not
supported for {}.", signature);
diff --git a/cpp/velox/tests/VeloxSubstraitSignatureTest.cc
b/cpp/velox/tests/VeloxSubstraitSignatureTest.cc
index d6db661f7..cb62f9764 100644
--- a/cpp/velox/tests/VeloxSubstraitSignatureTest.cc
+++ b/cpp/velox/tests/VeloxSubstraitSignatureTest.cc
@@ -138,6 +138,13 @@ TEST_F(VeloxSubstraitSignatureTest,
fromSubstraitSignature) {
ASSERT_EQ(type->childAt(0)->childAt(0)->childAt(1)->kind(),
TypeKind::VARCHAR);
type = fromSubstraitSignature("struct<struct<struct<i8,dec<19,2>>>>");
ASSERT_EQ(type->childAt(0)->childAt(0)->childAt(1)->kind(),
TypeKind::HUGEINT);
+ type =
fromSubstraitSignature("struct<fp32,struct<struct<i8,dec<19,2>,list<i32>,map<i16,i32>>>>");
+ ASSERT_EQ(type->childAt(0)->kind(), TypeKind::REAL);
+ ASSERT_EQ(type->childAt(1)->childAt(0)->childAt(0)->kind(),
TypeKind::TINYINT);
+ ASSERT_EQ(type->childAt(1)->childAt(0)->childAt(1)->kind(),
TypeKind::HUGEINT);
+ ASSERT_EQ(type->childAt(1)->childAt(0)->childAt(2)->childAt(0)->kind(),
TypeKind::INTEGER);
+ ASSERT_EQ(type->childAt(1)->childAt(0)->childAt(3)->childAt(0)->kind(),
TypeKind::SMALLINT);
+ ASSERT_EQ(type->childAt(1)->childAt(0)->childAt(3)->childAt(1)->kind(),
TypeKind::INTEGER);
ASSERT_ANY_THROW(fromSubstraitSignature("other")->kind());
// Map type test.
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]