(incubator-gluten) branch main updated: [GLUTEN-7652][VL] Support binary as string (#9325)

wangzhen Wed, 16 Apr 2025 03:40:36 -0700

This is an automated email from the ASF dual-hosted git repository.

wangzhen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git



The following commit(s) were added to refs/heads/main by this push:
     new 79db240513 [GLUTEN-7652][VL] Support binary as string (#9325)
79db240513 is described below

commit 79db240513bf4a9e3807f0743c680e83aa0da0cd
Author: Zhen Wang <[email protected]>
AuthorDate: Wed Apr 16 18:40:21 2025 +0800

    [GLUTEN-7652][VL] Support binary as string (#9325)
    
    * [GLUTEN-7652][VL] Set data columns for hive HiveTableHandle
    
    * fix format
    
    * fix
    
    * fix
    
    * fix
    
    * fix style
    
    * fix
    
    * fix style
    
    * address comment
    
    * replace to_binary with cast
---
 .../org/apache/gluten/execution/VeloxScanSuite.scala | 20 ++++++++++++++++++++
 cpp/velox/substrait/SubstraitToVeloxPlan.cc          | 16 +++++++++++-----
 cpp/velox/tests/Substrait2VeloxPlanConversionTest.cc | 12 ++++++++++--
 .../sql/hive/execution/GlutenHiveSQLQuerySuite.scala |  1 -
 4 files changed, 41 insertions(+), 8 deletions(-)

diff --git a/backends-velox/src/test/scala/org/apache/gluten/execution/VeloxScanSuite.scala b/backends-velox/src/test/scala/org/apache/gluten/execution/VeloxScanSuite.scala
index 525eca9407..f750ff65bd 100644
--- a/backends-velox/src/test/scala/org/apache/gluten/execution/VeloxScanSuite.scala
+++ b/backends-velox/src/test/scala/org/apache/gluten/execution/VeloxScanSuite.scala
@@ -22,6 +22,7 @@ import org.apache.gluten.config.GlutenConfig
 import org.apache.gluten.utils.VeloxFileSystemValidationJniWrapper
 
 import org.apache.spark.SparkConf
+import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.expressions.GreaterThan
 import org.apache.spark.sql.execution.ScalarSubquery
 import org.apache.spark.sql.internal.SQLConf
@@ -187,4 +188,23 @@ class VeloxScanSuite extends VeloxWholeStageTransformerSuite {
       }
     }
   }
+
+  test("test binary as string") {
+    withTempDir {
+      dir =>
+        val path = dir.getCanonicalPath
+        spark
+          .range(2)
+          .selectExpr("id as a", "cast(cast(id + 10 as string) as binary) as b")
+          .write
+          .mode("overwrite")
+          .parquet(path)
+
+        withTable("test") {
+          sql("create table test (a long, b string) using parquet options (path '" + path + "')")
+          val df = sql("select b from test group by b order by b")
+          checkAnswer(df, Seq(Row("10"), Row("11")))
+        }
+    }
+  }
 }
diff --git a/cpp/velox/substrait/SubstraitToVeloxPlan.cc b/cpp/velox/substrait/SubstraitToVeloxPlan.cc
index 909d00e5e8..1304cb511b 100644
--- a/cpp/velox/substrait/SubstraitToVeloxPlan.cc
+++ b/cpp/velox/substrait/SubstraitToVeloxPlan.cc
@@ -1292,18 +1292,24 @@ core::PlanNodePtr SubstraitToVeloxPlanConverter::toVeloxPlan(const ::substrait::
 
   // Velox requires Filter Pushdown must being enabled.
   bool filterPushdownEnabled = true;
+  auto names = colNameList;
+  auto types = veloxTypeList;
+  auto dataColumns = ROW(std::move(names), std::move(types));
   std::shared_ptr<connector::hive::HiveTableHandle> tableHandle;
   if (!readRel.has_filter()) {
     tableHandle = std::make_shared<connector::hive::HiveTableHandle>(
-        kHiveConnectorId, "hive_table", filterPushdownEnabled, common::SubfieldFilters{}, nullptr);
+        kHiveConnectorId, "hive_table", filterPushdownEnabled, common::SubfieldFilters{}, nullptr, dataColumns);
   } else {
     common::SubfieldFilters subfieldFilters;
-    auto names = colNameList;
-    auto types = veloxTypeList;
-    auto remainingFilter = exprConverter_->toVeloxExpr(readRel.filter(), ROW(std::move(names), std::move(types)));
+    auto remainingFilter = exprConverter_->toVeloxExpr(readRel.filter(), dataColumns);
 
     tableHandle = std::make_shared<connector::hive::HiveTableHandle>(
-        kHiveConnectorId, "hive_table", filterPushdownEnabled, std::move(subfieldFilters), remainingFilter);
+        kHiveConnectorId,
+        "hive_table",
+        filterPushdownEnabled,
+        std::move(subfieldFilters),
+        remainingFilter,
+        dataColumns);
   }
 
   // Get assignments and out names.
diff --git a/cpp/velox/tests/Substrait2VeloxPlanConversionTest.cc b/cpp/velox/tests/Substrait2VeloxPlanConversionTest.cc
index cccc619a86..42c1cb79b8 100644
--- a/cpp/velox/tests/Substrait2VeloxPlanConversionTest.cc
+++ b/cpp/velox/tests/Substrait2VeloxPlanConversionTest.cc
@@ -88,6 +88,14 @@ class Substrait2VeloxPlanConversionTest : public exec::test::HiveConnectorTestBa
 //
 //  Tested Velox operators: TableScan (Filter Pushdown), Project, Aggregate.
 TEST_F(Substrait2VeloxPlanConversionTest, q6) {
+  FLAGS_velox_exception_user_stacktrace_enabled = true;
+  FLAGS_velox_exception_system_stacktrace_enabled = true;
+  std::unordered_map<std::string, std::string> hiveConfig{
+      {"hive.orc.use-column-names", "true"}, {"hive.parquet.use-column-names", "true"}};
+  std::shared_ptr<const facebook::velox::config::ConfigBase> config{
+      std::make_shared<facebook::velox::config::ConfigBase>(std::move(hiveConfig))};
+  resetHiveConnector(config);
+
   // Generate the used ORC file.
   auto type =
       ROW({"l_orderkey",
@@ -257,7 +265,7 @@ TEST_F(Substrait2VeloxPlanConversionTest, ifthenTest) {
   // Convert to Velox PlanNode.
   auto planNode = planConverter_->toVeloxPlan(substraitPlan, std::vector<::substrait::ReadRel_LocalFiles>{split});
   ASSERT_EQ(
-      "-- Project[1][expressions: ] -> \n  -- TableScan[0][table: hive_table, remaining filter: (and(and(and(and(isnotnull(\"hd_vehicle_count\"),or(equalto(\"hd_buy_potential\",\">10000\"),equalto(\"hd_buy_potential\",\"unknown\"))),greaterthan(\"hd_vehicle_count\",0)),if(greaterthan(\"hd_vehicle_count\",0),greaterthan(divide(cast \"hd_dep_count\" as DOUBLE,cast \"hd_vehicle_count\" as DOUBLE),1.2))),isnotnull(\"hd_demo_sk\")))] -> n0_0:BIGINT, n0_1:VARCHAR, n0_2:BIGINT, n0_3:BIGINT\n",
+      "-- Project[1][expressions: ] -> \n  -- TableScan[0][table: hive_table, remaining filter: (and(and(and(and(isnotnull(\"hd_vehicle_count\"),or(equalto(\"hd_buy_potential\",\">10000\"),equalto(\"hd_buy_potential\",\"unknown\"))),greaterthan(\"hd_vehicle_count\",0)),if(greaterthan(\"hd_vehicle_count\",0),greaterthan(divide(cast \"hd_dep_count\" as DOUBLE,cast \"hd_vehicle_count\" as DOUBLE),1.2))),isnotnull(\"hd_demo_sk\"))), data columns: ROW<hd_demo_sk:BIGINT,hd_buy_potential:VARCHA [...]
       planNode->toString(true, true));
 }
 
@@ -273,7 +281,7 @@ TEST_F(Substrait2VeloxPlanConversionTest, filterUpper) {
   // Convert to Velox PlanNode.
   auto planNode = planConverter_->toVeloxPlan(substraitPlan, std::vector<::substrait::ReadRel_LocalFiles>{split});
   ASSERT_EQ(
-      "-- Project[1][expressions: ] -> \n  -- TableScan[0][table: hive_table, remaining filter: (and(isnotnull(\"key\"),lessthan(\"key\",3)))] -> n0_0:INTEGER\n",
+      "-- Project[1][expressions: ] -> \n  -- TableScan[0][table: hive_table, remaining filter: (and(isnotnull(\"key\"),lessthan(\"key\",3))), data columns: ROW<key:INTEGER>] -> n0_0:INTEGER\n",
       planNode->toString(true, true));
 }
 
diff --git a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
index b348f67193..c93aa6640d 100644
--- a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
+++ b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/hive/execution/GlutenHiveSQLQuerySuite.scala
@@ -47,5 +47,4 @@ class GlutenHiveSQLQuerySuite extends GlutenHiveSQLQuerySuiteBase {
       ignoreIfNotExists = true,
       purge = false)
   }
-
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(incubator-gluten) branch main updated: [GLUTEN-7652][VL] Support binary as string (#9325)

Reply via email to