This is an automated email from the ASF dual-hosted git repository.

rui pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git


The following commit(s) were added to refs/heads/main by this push:
     new cca659ac2 [VL] Fix field name parsing in Subfield (#7330)
cca659ac2 is described below

commit cca659ac2889c983566443473ab0e4558bb9cd2b
Author: Rui Mo <[email protected]>
AuthorDate: Wed Sep 25 10:08:44 2024 +0800

    [VL] Fix field name parsing in Subfield (#7330)
---
 .../gluten/execution/MiscOperatorSuite.scala       | 24 +++++++
 cpp/velox/substrait/SubstraitToVeloxPlan.cc        | 73 +++++++++++-----------
 2 files changed, 59 insertions(+), 38 deletions(-)

diff --git 
a/backends-velox/src/test/scala/org/apache/gluten/execution/MiscOperatorSuite.scala
 
b/backends-velox/src/test/scala/org/apache/gluten/execution/MiscOperatorSuite.scala
index 0432c0da9..12e770327 100644
--- 
a/backends-velox/src/test/scala/org/apache/gluten/execution/MiscOperatorSuite.scala
+++ 
b/backends-velox/src/test/scala/org/apache/gluten/execution/MiscOperatorSuite.scala
@@ -66,6 +66,30 @@ class MiscOperatorSuite extends 
VeloxWholeStageTransformerSuite with AdaptiveSpa
       .set(GlutenConfig.NATIVE_ARROW_READER_ENABLED.key, "true")
   }
 
+  test("field names contain non-ASCII characters") {
+    withTempPath {
+      path =>
+        // scalastyle:off nonascii
+        Seq((1, 2, 3, 4)).toDF("товары", "овары", "国ⅵ", 
"中文").write.parquet(path.getCanonicalPath)
+        // scalastyle:on
+        
spark.read.parquet(path.getCanonicalPath).createOrReplaceTempView("view")
+        runQueryAndCompare("select * from view") {
+          checkGlutenOperatorMatch[FileSourceScanExecTransformer]
+        }
+    }
+
+    withTempPath {
+      path =>
+        // scalastyle:off nonascii
+        spark.range(10).toDF("中文").write.parquet(path.getCanonicalPath)
+        
spark.read.parquet(path.getCanonicalPath).filter("`中文`>1").createOrReplaceTempView("view")
+        // scalastyle:on
+        runQueryAndCompare("select * from view") {
+          checkGlutenOperatorMatch[FileSourceScanExecTransformer]
+        }
+    }
+  }
+
   test("simple_select") {
     val df = runQueryAndCompare("select * from lineitem limit 1") { _ => }
     checkLengthAndPlan(df, 1)
diff --git a/cpp/velox/substrait/SubstraitToVeloxPlan.cc 
b/cpp/velox/substrait/SubstraitToVeloxPlan.cc
index 44a8ff7c1..01386115b 100644
--- a/cpp/velox/substrait/SubstraitToVeloxPlan.cc
+++ b/cpp/velox/substrait/SubstraitToVeloxPlan.cc
@@ -179,16 +179,11 @@ RowTypePtr getJoinOutputType(
   VELOX_FAIL("Output should include left or right columns.");
 }
 
-// Returns the field name separators used to create Subfield.
-std::shared_ptr<common::Separators> getSeparators() {
-  auto separators = std::make_shared<common::Separators>();
-  // ']', '.', '[', '*', '^' are not separators in Spark.
-  separators->closeBracket = '\0';
-  separators->dot = '\0';
-  separators->openBracket = '\0';
-  separators->wildCard = '\0';
-  separators->unicodeCaret = '\0';
-  return separators;
+// Returns the path vector used to create Subfield.
+std::vector<std::unique_ptr<common::Subfield::PathElement>> getPath(const 
std::string& field) {
+  std::vector<std::unique_ptr<common::Subfield::PathElement>> path;
+  path.push_back(std::make_unique<common::Subfield::NestedField>(field));
+  return path;
 }
 
 } // namespace
@@ -2040,9 +2035,9 @@ void 
SubstraitToVeloxPlanConverter::setInFilter<TypeKind::BIGINT>(
     values.emplace_back(value);
   }
   if (negated) {
-    filters[common::Subfield(inputName, getSeparators())] = 
common::createNegatedBigintValues(values, nullAllowed);
+    filters[common::Subfield(std::move(getPath(inputName)))] = 
common::createNegatedBigintValues(values, nullAllowed);
   } else {
-    filters[common::Subfield(inputName, getSeparators())] = 
common::createBigintValues(values, nullAllowed);
+    filters[common::Subfield(std::move(getPath(inputName)))] = 
common::createBigintValues(values, nullAllowed);
   }
 }
 
@@ -2062,9 +2057,9 @@ void 
SubstraitToVeloxPlanConverter::setInFilter<TypeKind::INTEGER>(
     values.emplace_back(value);
   }
   if (negated) {
-    filters[common::Subfield(inputName, getSeparators())] = 
common::createNegatedBigintValues(values, nullAllowed);
+    filters[common::Subfield(std::move(getPath(inputName)))] = 
common::createNegatedBigintValues(values, nullAllowed);
   } else {
-    filters[common::Subfield(inputName, getSeparators())] = 
common::createBigintValues(values, nullAllowed);
+    filters[common::Subfield(std::move(getPath(inputName)))] = 
common::createBigintValues(values, nullAllowed);
   }
 }
 
@@ -2084,9 +2079,9 @@ void 
SubstraitToVeloxPlanConverter::setInFilter<TypeKind::SMALLINT>(
     values.emplace_back(value);
   }
   if (negated) {
-    filters[common::Subfield(inputName, getSeparators())] = 
common::createNegatedBigintValues(values, nullAllowed);
+    filters[common::Subfield(std::move(getPath(inputName)))] = 
common::createNegatedBigintValues(values, nullAllowed);
   } else {
-    filters[common::Subfield(inputName, getSeparators())] = 
common::createBigintValues(values, nullAllowed);
+    filters[common::Subfield(std::move(getPath(inputName)))] = 
common::createBigintValues(values, nullAllowed);
   }
 }
 
@@ -2106,9 +2101,9 @@ void 
SubstraitToVeloxPlanConverter::setInFilter<TypeKind::TINYINT>(
     values.emplace_back(value);
   }
   if (negated) {
-    filters[common::Subfield(inputName, getSeparators())] = 
common::createNegatedBigintValues(values, nullAllowed);
+    filters[common::Subfield(std::move(getPath(inputName)))] = 
common::createNegatedBigintValues(values, nullAllowed);
   } else {
-    filters[common::Subfield(inputName, getSeparators())] = 
common::createBigintValues(values, nullAllowed);
+    filters[common::Subfield(std::move(getPath(inputName)))] = 
common::createBigintValues(values, nullAllowed);
   }
 }
 
@@ -2126,10 +2121,11 @@ void 
SubstraitToVeloxPlanConverter::setInFilter<TypeKind::VARCHAR>(
     values.emplace_back(value);
   }
   if (negated) {
-    filters[common::Subfield(inputName, getSeparators())] =
+    filters[common::Subfield(std::move(getPath(inputName)))] =
         std::make_unique<common::NegatedBytesValues>(values, nullAllowed);
   } else {
-    filters[common::Subfield(inputName, getSeparators())] = 
std::make_unique<common::BytesValues>(values, nullAllowed);
+    filters[common::Subfield(std::move(getPath(inputName)))] =
+        std::make_unique<common::BytesValues>(values, nullAllowed);
   }
 }
 
@@ -2142,7 +2138,7 @@ void SubstraitToVeloxPlanConverter::setSubfieldFilter(
   using MultiRangeType = typename RangeTraits<KIND>::MultiRangeType;
 
   if (colFilters.size() == 1) {
-    filters[common::Subfield(inputName, getSeparators())] = 
std::move(colFilters[0]);
+    filters[common::Subfield(std::move(getPath(inputName)))] = 
std::move(colFilters[0]);
   } else if (colFilters.size() > 1) {
     // BigintMultiRange should have been sorted
     if (colFilters[0]->kind() == common::FilterKind::kBigintRange) {
@@ -2152,10 +2148,10 @@ void SubstraitToVeloxPlanConverter::setSubfieldFilter(
       });
     }
     if constexpr (std::is_same_v<MultiRangeType, common::MultiRange>) {
-      filters[common::Subfield(inputName, getSeparators())] =
+      filters[common::Subfield(std::move(getPath(inputName)))] =
           std::make_unique<common::MultiRange>(std::move(colFilters), 
nullAllowed, true /*nanAllowed*/);
     } else {
-      filters[common::Subfield(inputName, getSeparators())] =
+      filters[common::Subfield(std::move(getPath(inputName)))] =
           std::make_unique<MultiRangeType>(std::move(colFilters), nullAllowed);
     }
   }
@@ -2184,7 +2180,7 @@ void 
SubstraitToVeloxPlanConverter::constructSubfieldFilters(
     // Handle bool type filters.
     // Not equal.
     if (filterInfo.notValue_) {
-      filters[common::Subfield(inputName, getSeparators())] =
+      filters[common::Subfield(std::move(getPath(inputName)))] =
           
std::make_unique<common::BoolValue>(!filterInfo.notValue_.value().value<bool>(),
 nullAllowed);
     } else if (filterInfo.notValues_.size() > 0) {
       std::set<bool> notValues;
@@ -2192,18 +2188,18 @@ void 
SubstraitToVeloxPlanConverter::constructSubfieldFilters(
         notValues.emplace(v.value<bool>());
       }
       if (notValues.size() == 1) {
-        filters[common::Subfield(inputName, getSeparators())] =
+        filters[common::Subfield(std::move(getPath(inputName)))] =
             std::make_unique<common::BoolValue>(!(*notValues.begin()), 
nullAllowed);
       } else {
         // if there are more than one distinct value in NOT IN list, the 
filter should be AlwaysFalse
-        filters[common::Subfield(inputName, getSeparators())] = 
std::make_unique<common::AlwaysFalse>();
+        filters[common::Subfield(std::move(getPath(inputName)))] = 
std::make_unique<common::AlwaysFalse>();
       }
     } else if (rangeSize == 0) {
       // IsNull/IsNotNull.
       if (!nullAllowed) {
-        filters[common::Subfield(inputName, getSeparators())] = 
std::make_unique<common::IsNotNull>();
+        filters[common::Subfield(std::move(getPath(inputName)))] = 
std::make_unique<common::IsNotNull>();
       } else if (isNull) {
-        filters[common::Subfield(inputName, getSeparators())] = 
std::make_unique<common::IsNull>();
+        filters[common::Subfield(std::move(getPath(inputName)))] = 
std::make_unique<common::IsNull>();
       } else {
         VELOX_NYI("Only IsNotNull and IsNull are supported in 
constructSubfieldFilters when no other filter ranges.");
       }
@@ -2212,7 +2208,8 @@ void 
SubstraitToVeloxPlanConverter::constructSubfieldFilters(
       // Equal.
       auto value = filterInfo.lowerBounds_[0].value().value<bool>();
       VELOX_CHECK(value == filterInfo.upperBounds_[0].value().value<bool>(), 
"invalid state of bool equal");
-      filters[common::Subfield(inputName, getSeparators())] = 
std::make_unique<common::BoolValue>(value, nullAllowed);
+      filters[common::Subfield(std::move(getPath(inputName)))] =
+          std::make_unique<common::BoolValue>(value, nullAllowed);
     }
   } else if constexpr (
       KIND == facebook::velox::TypeKind::ARRAY || KIND == 
facebook::velox::TypeKind::MAP ||
@@ -2220,9 +2217,9 @@ void 
SubstraitToVeloxPlanConverter::constructSubfieldFilters(
     // Only IsNotNull and IsNull are supported for complex types.
     VELOX_CHECK_EQ(rangeSize, 0, "Only IsNotNull and IsNull are supported for 
complex type.");
     if (!nullAllowed) {
-      filters[common::Subfield(inputName, getSeparators())] = 
std::make_unique<common::IsNotNull>();
+      filters[common::Subfield(std::move(getPath(inputName)))] = 
std::make_unique<common::IsNotNull>();
     } else if (isNull) {
-      filters[common::Subfield(inputName, getSeparators())] = 
std::make_unique<common::IsNull>();
+      filters[common::Subfield(std::move(getPath(inputName)))] = 
std::make_unique<common::IsNull>();
     } else {
       VELOX_NYI("Only IsNotNull and IsNull are supported for input type 
'{}'.", inputType->toString());
     }
@@ -2266,16 +2263,16 @@ void 
SubstraitToVeloxPlanConverter::constructSubfieldFilters(
       VELOX_CHECK(rangeSize == 0, "LowerBounds or upperBounds conditons cannot 
be supported after not-equal filter.");
       if constexpr (std::is_same_v<MultiRangeType, common::MultiRange>) {
         if (colFilters.size() == 1) {
-          filters[common::Subfield(inputName, getSeparators())] = 
std::move(colFilters.front());
+          filters[common::Subfield(std::move(getPath(inputName)))] = 
std::move(colFilters.front());
         } else {
-          filters[common::Subfield(inputName, getSeparators())] =
+          filters[common::Subfield(std::move(getPath(inputName)))] =
               std::make_unique<common::MultiRange>(std::move(colFilters), 
nullAllowed, true /*nanAllowed*/);
         }
       } else {
         if (colFilters.size() == 1) {
-          filters[common::Subfield(inputName, getSeparators())] = 
std::move(colFilters.front());
+          filters[common::Subfield(std::move(getPath(inputName)))] = 
std::move(colFilters.front());
         } else {
-          filters[common::Subfield(inputName, getSeparators())] =
+          filters[common::Subfield(std::move(getPath(inputName)))] =
               std::make_unique<MultiRangeType>(std::move(colFilters), 
nullAllowed);
         }
       }
@@ -2286,11 +2283,11 @@ void 
SubstraitToVeloxPlanConverter::constructSubfieldFilters(
     if (rangeSize == 0) {
       // handle is not null and is null exists at same time
       if (existIsNullAndIsNotNull) {
-        filters[common::Subfield(inputName, getSeparators())] = 
std::move(std::make_unique<common::AlwaysFalse>());
+        filters[common::Subfield(std::move(getPath(inputName)))] = 
std::move(std::make_unique<common::AlwaysFalse>());
       } else if (!nullAllowed) {
-        filters[common::Subfield(inputName, getSeparators())] = 
std::make_unique<common::IsNotNull>();
+        filters[common::Subfield(std::move(getPath(inputName)))] = 
std::make_unique<common::IsNotNull>();
       } else if (isNull) {
-        filters[common::Subfield(inputName, getSeparators())] = 
std::make_unique<common::IsNull>();
+        filters[common::Subfield(std::move(getPath(inputName)))] = 
std::make_unique<common::IsNull>();
       } else {
         VELOX_NYI("Only IsNotNull and IsNull are supported in 
constructSubfieldFilters when no other filter ranges.");
       }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to