This is an automated email from the ASF dual-hosted git repository.

hongze pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git


The following commit(s) were added to refs/heads/main by this push:
     new a9fa77224 [VL] Substrait-to-Velox: Support nested complex type 
signature parsing
a9fa77224 is described below

commit a9fa7722405ddf72cc1aaba026aafebaf771748e
Author: Hongze Zhang <[email protected]>
AuthorDate: Thu May 9 14:02:36 2024 +0800

    [VL] Substrait-to-Velox: Support nested complex type signature parsing
---
 cpp/velox/substrait/VeloxSubstraitSignature.cc | 103 +++++++++++++++++--------
 cpp/velox/tests/VeloxSubstraitSignatureTest.cc |   7 ++
 2 files changed, 78 insertions(+), 32 deletions(-)

diff --git a/cpp/velox/substrait/VeloxSubstraitSignature.cc 
b/cpp/velox/substrait/VeloxSubstraitSignature.cc
index 34e0df6de..ee7c5f513 100644
--- a/cpp/velox/substrait/VeloxSubstraitSignature.cc
+++ b/cpp/velox/substrait/VeloxSubstraitSignature.cc
@@ -72,6 +72,48 @@ std::string 
VeloxSubstraitSignature::toSubstraitSignature(const TypePtr& type) {
   }
 }
 
+namespace {
+using index = std::string::size_type;
+
+index findEnclosingPos(std::string text, index from, char left, char right) {
+  VELOX_CHECK(left != right)
+  VELOX_CHECK(text.at(from) == left)
+  int32_t stackedLeftChars = 0;
+  for (index idx = from; idx < text.size(); idx++) {
+    const char ch = text.at(idx);
+    if (ch == left) {
+      stackedLeftChars++;
+    }
+    if (ch == right) {
+      stackedLeftChars--;
+    }
+    if (stackedLeftChars == 0) {
+      return idx;
+    }
+  }
+  VELOX_FAIL("Unable to find enclose character from text: " + text)
+}
+
+index findSansNesting(std::string text, index from, char target, char left, 
char right) {
+  VELOX_CHECK(left != right)
+  VELOX_CHECK(target != left && target != right)
+  int32_t stackedLeftChars = 0;
+  for (index idx = from; idx < text.size(); idx++) {
+    const char ch = text.at(idx);
+    if (ch == left) {
+      stackedLeftChars++;
+    }
+    if (ch == right) {
+      stackedLeftChars--;
+    }
+    if (ch == target && stackedLeftChars == 0) {
+      return idx;
+    }
+  }
+  return std::string::npos;
+}
+} // namespace
+
 TypePtr VeloxSubstraitSignature::fromSubstraitSignature(const std::string& 
signature) {
   if (signature == "bool") {
     return BOOLEAN();
@@ -123,7 +165,7 @@ TypePtr 
VeloxSubstraitSignature::fromSubstraitSignature(const std::string& signa
 
   auto parseNestedTypeSignature = [&](const std::string& signature) -> 
std::vector<TypePtr> {
     auto start = signature.find_first_of('<');
-    auto end = signature.find_last_of('>');
+    auto end = findEnclosingPos(signature, start, '<', '>');
     VELOX_CHECK(
         end - start > 1,
         "Native validation failed due to: more information is needed to create 
nested type for {}",
@@ -132,30 +174,25 @@ TypePtr 
VeloxSubstraitSignature::fromSubstraitSignature(const std::string& signa
     std::string childrenTypes = signature.substr(start + 1, end - start - 1);
 
     // Split the types with delimiter.
-    std::string delimiter = ",";
-    std::size_t pos;
+    const char delimiter = ',';
     std::vector<TypePtr> types;
-    while ((pos = childrenTypes.find(delimiter)) != std::string::npos) {
-      auto typeStr = childrenTypes.substr(0, pos);
-      std::size_t endPos = pos;
-      if (startWith(typeStr, "dec") || startWith(typeStr, "struct") || 
startWith(typeStr, "map") ||
-          startWith(typeStr, "list")) {
-        endPos = childrenTypes.find(">") + 1;
-        if (endPos > pos) {
-          typeStr += childrenTypes.substr(pos, endPos - pos);
-        } else {
-          // For nested case, the end '>' could missing,
-          // so the last position is treated as end.
-          typeStr += childrenTypes.substr(pos);
-          endPos = childrenTypes.size();
-        }
+    size_t typeStart = 0;
+    while (true) {
+      if (typeStart == childrenTypes.size()) {
+        break;
+      }
+      VELOX_CHECK(typeStart < childrenTypes.size())
+      const size_t typeEnd = findSansNesting(childrenTypes, typeStart, 
delimiter, '<', '>');
+      if (typeEnd == std::string::npos) {
+        std::string typeStr = childrenTypes.substr(typeStart);
+        types.emplace_back(fromSubstraitSignature(typeStr));
+        break;
       }
+      std::string typeStr = childrenTypes.substr(typeStart, typeEnd - 
typeStart);
       types.emplace_back(fromSubstraitSignature(typeStr));
-      childrenTypes.erase(0, endPos + delimiter.length());
-    }
-    if (childrenTypes.size() > 0 && !startWith(childrenTypes, ">")) {
-      types.emplace_back(fromSubstraitSignature(childrenTypes));
+      typeStart = typeEnd + 1;
     }
+
     return types;
   };
 
@@ -172,6 +209,10 @@ TypePtr 
VeloxSubstraitSignature::fromSubstraitSignature(const std::string& signa
   if (startWith(signature, "struct")) {
     // Struct type name is in the format of struct<T1,T2,...,Tn>.
     auto types = parseNestedTypeSignature(signature);
+    if (types.empty()) {
+      VELOX_UNSUPPORTED(
+          "VeloxSubstraitSignature::fromSubstraitSignature: Unrecognizable 
struct type signature {}.", signature);
+    }
     std::vector<std::string> names(types.size());
     for (int i = 0; i < types.size(); i++) {
       names[i] = "";
@@ -183,22 +224,20 @@ TypePtr 
VeloxSubstraitSignature::fromSubstraitSignature(const std::string& signa
     // Map type name is in the format of map<T1,T2>.
     auto types = parseNestedTypeSignature(signature);
     if (types.size() != 2) {
-      VELOX_UNSUPPORTED("Substrait type signature conversion to Velox type not 
supported for {}.", signature);
+      VELOX_UNSUPPORTED(
+          "VeloxSubstraitSignature::fromSubstraitSignature: Unrecognizable map 
type signature {}.", signature);
     }
     return MAP(std::move(types)[0], std::move(types)[1]);
   }
 
   if (startWith(signature, "list")) {
-    auto listStart = signature.find_first_of('<');
-    auto listEnd = signature.find_last_of('>');
-    VELOX_CHECK(
-        listEnd - listStart > 1,
-        "Native validation failed due to: more information is needed to create 
ListType: {}",
-        signature);
-
-    auto elementTypeStr = signature.substr(listStart + 1, listEnd - listStart 
- 1);
-    auto elementType = fromSubstraitSignature(elementTypeStr);
-    return ARRAY(elementType);
+    // Array type name is in the format of list<T>.
+    auto types = parseNestedTypeSignature(signature);
+    if (types.size() != 1) {
+      VELOX_UNSUPPORTED(
+          "VeloxSubstraitSignature::fromSubstraitSignature: Unrecognizable 
list type signature {}.", signature);
+    }
+    return ARRAY(std::move(types)[0]);
   }
 
   VELOX_UNSUPPORTED("Substrait type signature conversion to Velox type not 
supported for {}.", signature);
diff --git a/cpp/velox/tests/VeloxSubstraitSignatureTest.cc 
b/cpp/velox/tests/VeloxSubstraitSignatureTest.cc
index d6db661f7..cb62f9764 100644
--- a/cpp/velox/tests/VeloxSubstraitSignatureTest.cc
+++ b/cpp/velox/tests/VeloxSubstraitSignatureTest.cc
@@ -138,6 +138,13 @@ TEST_F(VeloxSubstraitSignatureTest, 
fromSubstraitSignature) {
   ASSERT_EQ(type->childAt(0)->childAt(0)->childAt(1)->kind(), 
TypeKind::VARCHAR);
   type = fromSubstraitSignature("struct<struct<struct<i8,dec<19,2>>>>");
   ASSERT_EQ(type->childAt(0)->childAt(0)->childAt(1)->kind(), 
TypeKind::HUGEINT);
+  type = 
fromSubstraitSignature("struct<fp32,struct<struct<i8,dec<19,2>,list<i32>,map<i16,i32>>>>");
+  ASSERT_EQ(type->childAt(0)->kind(), TypeKind::REAL);
+  ASSERT_EQ(type->childAt(1)->childAt(0)->childAt(0)->kind(), 
TypeKind::TINYINT);
+  ASSERT_EQ(type->childAt(1)->childAt(0)->childAt(1)->kind(), 
TypeKind::HUGEINT);
+  ASSERT_EQ(type->childAt(1)->childAt(0)->childAt(2)->childAt(0)->kind(), 
TypeKind::INTEGER);
+  ASSERT_EQ(type->childAt(1)->childAt(0)->childAt(3)->childAt(0)->kind(), 
TypeKind::SMALLINT);
+  ASSERT_EQ(type->childAt(1)->childAt(0)->childAt(3)->childAt(1)->kind(), 
TypeKind::INTEGER);
   ASSERT_ANY_THROW(fromSubstraitSignature("other")->kind());
 
   // Map type test.


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to