This is an automated email from the ASF dual-hosted git repository.
philo pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git
The following commit(s) were added to refs/heads/main by this push:
new db0336a51 [CORE] Add a config to fall back all regexp expressions
(#5099)
db0336a51 is described below
commit db0336a5106e4a2b84fb9b549b6b0f5b923618cc
Author: PHILO-HE <[email protected]>
AuthorDate: Tue Mar 26 12:08:10 2024 +0800
[CORE] Add a config to fall back all regexp expressions (#5099)
---
docs/Configuration.md | 1 +
docs/velox-backend-limitations.md | 12 +-
.../org/apache/spark/sql/GlutenSQLTestsTrait.scala | 4 +-
.../org/apache/spark/sql/GlutenTestsTrait.scala | 2 +-
gluten-ut/pom.xml | 1 +
.../spark/sql/execution/GlutenSortSuite.scala | 4 +-
.../spark/sql/execution/GlutenSortSuite.scala | 4 +-
.../utils/clickhouse/ClickHouseTestSettings.scala | 1 -
.../utils/velox/VeloxTestSettings.scala | 3 +-
.../expressions/GlutenExpressionMappingSuite.scala | 49 -----
.../spark/sql/execution/GlutenSortSuite.scala | 4 +-
gluten-ut/test/pom.xml | 199 +++++++++++++++++++++
.../expressions/GlutenExpressionMappingSuite.scala | 97 ++++++++++
.../main/scala/io/glutenproject/GlutenConfig.scala | 18 +-
14 files changed, 334 insertions(+), 65 deletions(-)
diff --git a/docs/Configuration.md b/docs/Configuration.md
index 626000bc4..b68717d04 100644
--- a/docs/Configuration.md
+++ b/docs/Configuration.md
@@ -65,6 +65,7 @@ You can add these configurations into spark-defaults.conf to
enable or disable t
| spark.gluten.sql.broadcastNestedLoopJoinTransformerEnabled | Config to
enable BroadcastNestedLoopJoinExecTransformer.
[...]
| spark.gluten.sql.cacheWholeStageTransformerContext | When true,
`WholeStageTransformer` will cache the `WholeStageTransformerContext` when
executing. It is used to get substrait plan node and native plan string.
[...]
| spark.gluten.sql.injectNativePlanStringToExplain | When true,
Gluten will inject native plan tree to explain string inside
`WholeStageTransformerContext`.
[...]
+ | spark.gluten.sql.fallbackRegexpExpressions | When true,
Gluten will fall back all regexp expressions to avoid any incompatibility risk.
[...]
## Velox Parameters
diff --git a/docs/velox-backend-limitations.md
b/docs/velox-backend-limitations.md
index 7b03f3b2f..7f58fea88 100644
--- a/docs/velox-backend-limitations.md
+++ b/docs/velox-backend-limitations.md
@@ -25,9 +25,15 @@ Velox BloomFilter's serialization format is different from
Spark's. BloomFilter
#### Case Sensitive mode
Gluten only supports spark default case-insensitive mode. If case-sensitive
mode is enabled, user may get incorrect result.
-#### Lookaround pattern for regexp functions
-In velox, lookaround (lookahead/lookbehind) pattern is not supported in
RE2-based implementations for Spark functions,
-such as `rlike`, `regexp_extract`, etc.
+#### Regexp functions
+In Velox, regexp functions (`rlike`, `regexp_extract`, etc.) are implemented
based on RE2, while in Spark they are based on `java.util.regex`.
+* Lookaround (lookahead/lookbehind) pattern is not supported in RE2.
+* When matching white space with pattern "\\s", RE2 doesn't treat "\v" (or
"\x0b") as white space, but `java.util.regex` does.
+
There may be other unknown incompatible cases. If users cannot tolerate the
incompatibility risk, please enable the configuration property below.
+```
+spark.gluten.sql.fallbackRegexpExpressions
+```
#### FileSource format
Currently, Gluten only fully supports parquet file format and partially
support ORC. If other format is used, scan operator falls back to vanilla spark.
diff --git
a/gluten-ut/common/src/test/scala/org/apache/spark/sql/GlutenSQLTestsTrait.scala
b/gluten-ut/common/src/test/scala/org/apache/spark/sql/GlutenSQLTestsTrait.scala
index 5c6cc2e3f..8433bbf8d 100644
---
a/gluten-ut/common/src/test/scala/org/apache/spark/sql/GlutenSQLTestsTrait.scala
+++
b/gluten-ut/common/src/test/scala/org/apache/spark/sql/GlutenSQLTestsTrait.scala
@@ -76,11 +76,11 @@ trait GlutenSQLTestsTrait extends QueryTest with
GlutenSQLTestsBaseTrait {
assertEmptyMissingInput(analyzedDF)
- GlutenQueryTest.checkAnswer(analyzedDF, expectedAnswer)
+ GlutenQueryTestUtil.checkAnswer(analyzedDF, expectedAnswer)
}
}
-object GlutenQueryTest extends Assertions {
+object GlutenQueryTestUtil extends Assertions {
/**
* Runs the plan and makes sure the answer matches the expected result.
diff --git
a/gluten-ut/common/src/test/scala/org/apache/spark/sql/GlutenTestsTrait.scala
b/gluten-ut/common/src/test/scala/org/apache/spark/sql/GlutenTestsTrait.scala
index 488756560..60aad9b2b 100644
---
a/gluten-ut/common/src/test/scala/org/apache/spark/sql/GlutenTestsTrait.scala
+++
b/gluten-ut/common/src/test/scala/org/apache/spark/sql/GlutenTestsTrait.scala
@@ -22,7 +22,7 @@ import io.glutenproject.execution.ProjectExecTransformer
import io.glutenproject.test.TestStats
import io.glutenproject.utils.{BackendTestUtils, SystemParameters}
-import org.apache.spark.sql.GlutenQueryTest.isNaNOrInf
+import org.apache.spark.sql.GlutenQueryTestUtil.isNaNOrInf
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
import org.apache.spark.sql.catalyst.analysis.ResolveTimeZone
import org.apache.spark.sql.catalyst.expressions._
diff --git a/gluten-ut/pom.xml b/gluten-ut/pom.xml
index 1c8d49f15..f4f9b694a 100644
--- a/gluten-ut/pom.xml
+++ b/gluten-ut/pom.xml
@@ -26,6 +26,7 @@
<modules>
<module>common</module>
+ <module>test</module>
</modules>
<artifactId>gluten-ut</artifactId>
diff --git
a/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/execution/GlutenSortSuite.scala
b/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/execution/GlutenSortSuite.scala
index 8787537a2..86aa55aae 100644
---
a/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/execution/GlutenSortSuite.scala
+++
b/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/execution/GlutenSortSuite.scala
@@ -18,7 +18,7 @@ package org.apache.spark.sql.execution
import io.glutenproject.execution.SortExecTransformer
-import org.apache.spark.sql.{catalyst, GlutenQueryTest,
GlutenSQLTestsBaseTrait, Row}
+import org.apache.spark.sql.{catalyst, GlutenQueryTestUtil,
GlutenSQLTestsBaseTrait, Row}
import org.apache.spark.sql.catalyst.analysis.{Resolver, UnresolvedAttribute}
import org.apache.spark.sql.catalyst.expressions.{Length, SortOrder}
import org.apache.spark.sql.catalyst.plans.QueryPlan
@@ -55,7 +55,7 @@ class GlutenSortSuite extends SortSuite with
GlutenSQLTestsBaseTrait with Adapti
)
val df = input.toDF("a", "b", "c").orderBy(length($"a").desc, $"b".desc)
- GlutenQueryTest.checkAnswer(
+ GlutenQueryTestUtil.checkAnswer(
df,
Seq(
Row("Hello Bob", 10, 1.0),
diff --git
a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/execution/GlutenSortSuite.scala
b/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/execution/GlutenSortSuite.scala
index 8787537a2..86aa55aae 100644
---
a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/execution/GlutenSortSuite.scala
+++
b/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/execution/GlutenSortSuite.scala
@@ -18,7 +18,7 @@ package org.apache.spark.sql.execution
import io.glutenproject.execution.SortExecTransformer
-import org.apache.spark.sql.{catalyst, GlutenQueryTest,
GlutenSQLTestsBaseTrait, Row}
+import org.apache.spark.sql.{catalyst, GlutenQueryTestUtil,
GlutenSQLTestsBaseTrait, Row}
import org.apache.spark.sql.catalyst.analysis.{Resolver, UnresolvedAttribute}
import org.apache.spark.sql.catalyst.expressions.{Length, SortOrder}
import org.apache.spark.sql.catalyst.plans.QueryPlan
@@ -55,7 +55,7 @@ class GlutenSortSuite extends SortSuite with
GlutenSQLTestsBaseTrait with Adapti
)
val df = input.toDF("a", "b", "c").orderBy(length($"a").desc, $"b".desc)
- GlutenQueryTest.checkAnswer(
+ GlutenQueryTestUtil.checkAnswer(
df,
Seq(
Row("Hello Bob", 10, 1.0),
diff --git
a/gluten-ut/spark34/src/test/scala/io/glutenproject/utils/clickhouse/ClickHouseTestSettings.scala
b/gluten-ut/spark34/src/test/scala/io/glutenproject/utils/clickhouse/ClickHouseTestSettings.scala
index c2d22140c..ab204b365 100644
---
a/gluten-ut/spark34/src/test/scala/io/glutenproject/utils/clickhouse/ClickHouseTestSettings.scala
+++
b/gluten-ut/spark34/src/test/scala/io/glutenproject/utils/clickhouse/ClickHouseTestSettings.scala
@@ -1860,7 +1860,6 @@ class ClickHouseTestSettings extends BackendTestSettings {
"SELECT structFieldSimple.key, arrayFieldSimple[1] FROM tableWithSchema
a where int_Field=1")
.exclude("SELECT structFieldComplex.Value.`value_(2)` FROM
tableWithSchema")
enableSuite[SparkFunctionStatistics]
- enableSuite[GlutenExpressionMappingSuite]
enableSuite[GlutenSparkSessionExtensionSuite]
override def getSQLQueryTestSettings: SQLQueryTestSettings =
ClickHouseSQLQueryTestSettings
diff --git
a/gluten-ut/spark34/src/test/scala/io/glutenproject/utils/velox/VeloxTestSettings.scala
b/gluten-ut/spark34/src/test/scala/io/glutenproject/utils/velox/VeloxTestSettings.scala
index 182c896d2..1c37e787b 100644
---
a/gluten-ut/spark34/src/test/scala/io/glutenproject/utils/velox/VeloxTestSettings.scala
+++
b/gluten-ut/spark34/src/test/scala/io/glutenproject/utils/velox/VeloxTestSettings.scala
@@ -19,7 +19,7 @@ package io.glutenproject.utils.velox
import io.glutenproject.utils.{BackendTestSettings, SQLQueryTestSettings}
import org.apache.spark.sql._
-import
org.apache.spark.sql.catalyst.expressions.{GlutenArithmeticExpressionSuite,
GlutenBitwiseExpressionsSuite, GlutenCastSuite,
GlutenCollectionExpressionsSuite, GlutenComplexTypeSuite,
GlutenConditionalExpressionSuite, GlutenDateExpressionsSuite,
GlutenDecimalExpressionSuite, GlutenExpressionMappingSuite,
GlutenHashExpressionsSuite, GlutenIntervalExpressionsSuite,
GlutenLiteralExpressionSuite, GlutenMathExpressionsSuite,
GlutenMiscExpressionsSuite, GlutenNondeterministicSuite, Gluten [...]
+import
org.apache.spark.sql.catalyst.expressions.{GlutenArithmeticExpressionSuite,
GlutenBitwiseExpressionsSuite, GlutenCastSuite,
GlutenCollectionExpressionsSuite, GlutenComplexTypeSuite,
GlutenConditionalExpressionSuite, GlutenDateExpressionsSuite,
GlutenDecimalExpressionSuite, GlutenHashExpressionsSuite,
GlutenIntervalExpressionsSuite, GlutenLiteralExpressionSuite,
GlutenMathExpressionsSuite, GlutenMiscExpressionsSuite,
GlutenNondeterministicSuite, GlutenNullExpressionsSuite, GlutenPr [...]
import
org.apache.spark.sql.connector.{GlutenDataSourceV2DataFrameSessionCatalogSuite,
GlutenDataSourceV2DataFrameSuite, GlutenDataSourceV2FunctionSuite,
GlutenDataSourceV2SQLSessionCatalogSuite, GlutenDataSourceV2SQLSuiteV1Filter,
GlutenDataSourceV2SQLSuiteV2Filter, GlutenDataSourceV2Suite,
GlutenDeleteFromTableSuite, GlutenFileDataSourceV2FallBackSuite,
GlutenKeyGroupedPartitioningSuite, GlutenLocalScanSuite,
GlutenMetadataColumnSuite, GlutenSupportsCatalogOptionsSuite, GlutenTableCapa
[...]
import org.apache.spark.sql.errors.{GlutenQueryCompilationErrorsDSv2Suite,
GlutenQueryCompilationErrorsSuite, GlutenQueryExecutionErrorsSuite,
GlutenQueryParsingErrorsSuite}
import org.apache.spark.sql.execution.{FallbackStrategiesSuite,
GlutenBroadcastExchangeSuite, GlutenCoalesceShufflePartitionsSuite,
GlutenExchangeSuite, GlutenReplaceHashWithSortAggSuite,
GlutenReuseExchangeAndSubquerySuite, GlutenSameResultSuite, GlutenSortSuite,
GlutenSQLAggregateFunctionSuite, GlutenSQLWindowFunctionSuite,
GlutenTakeOrderedAndProjectSuite}
@@ -1138,7 +1138,6 @@ class VeloxTestSettings extends BackendTestSettings {
enableSuite[GlutenFallbackSuite]
enableSuite[GlutenHiveSQLQuerySuite]
enableSuite[GlutenCollapseProjectExecTransformerSuite]
- enableSuite[GlutenExpressionMappingSuite]
enableSuite[GlutenSparkSessionExtensionSuite]
override def getSQLQueryTestSettings: SQLQueryTestSettings =
VeloxSQLQueryTestSettings
diff --git
a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenExpressionMappingSuite.scala
b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenExpressionMappingSuite.scala
deleted file mode 100644
index 65709b0de..000000000
---
a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenExpressionMappingSuite.scala
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.spark.sql.catalyst.expressions
-
-import io.glutenproject.GlutenConfig
-import io.glutenproject.execution.ProjectExecTransformer
-import io.glutenproject.expression.ExpressionMappings
-
-import org.apache.spark.sql.{GlutenSQLTestsTrait, Row}
-import org.apache.spark.sql.execution.ProjectExec
-import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper
-
-class GlutenExpressionMappingSuite extends GlutenSQLTestsTrait with
AdaptiveSparkPlanHelper {
-
- testGluten("test expression blacklist") {
- val names = ExpressionMappings.expressionsMap.values.toSet
- assert(names.contains("regexp_replace"))
- assert(names.contains("regexp_extract"))
-
- withSQLConf(GlutenConfig.EXPRESSION_BLACK_LIST.key ->
"regexp_replace,regexp_extract,add") {
- val names = ExpressionMappings.expressionsMap.values.toSet
- assert(!names.contains("regexp_replace"))
- assert(!names.contains("regexp_extract"))
- assert(names.contains("regexp_extract_all"))
- assert(!names.contains("add"))
- spark.sql("CREATE TABLE t USING PARQUET AS SELECT 1 as c")
- withTable("t") {
- val df = spark.sql("SELECT c + 1 FROM t")
- checkAnswer(df, Row(2))
-
assert(find(df.queryExecution.executedPlan)(_.isInstanceOf[ProjectExecTransformer]).isEmpty)
-
assert(find(df.queryExecution.executedPlan)(_.isInstanceOf[ProjectExec]).isDefined)
- }
- }
- }
-}
diff --git
a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/execution/GlutenSortSuite.scala
b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/execution/GlutenSortSuite.scala
index 26f02e9a2..eaf5f66d9 100644
---
a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/execution/GlutenSortSuite.scala
+++
b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/execution/GlutenSortSuite.scala
@@ -18,7 +18,7 @@ package org.apache.spark.sql.execution
import io.glutenproject.execution.SortExecTransformer
-import org.apache.spark.sql.{catalyst, GlutenQueryTest,
GlutenSQLTestsBaseTrait, Row}
+import org.apache.spark.sql.{catalyst, GlutenQueryTestUtil,
GlutenSQLTestsBaseTrait, Row}
import org.apache.spark.sql.catalyst.analysis.{Resolver, UnresolvedAttribute}
import org.apache.spark.sql.catalyst.expressions.{Length, SortOrder}
import org.apache.spark.sql.catalyst.plans.QueryPlan
@@ -55,7 +55,7 @@ class GlutenSortSuite extends SortSuite with
GlutenSQLTestsBaseTrait with Adapti
)
val df = input.toDF("a", "b", "c").orderBy(length($"a").desc, $"b".desc)
- GlutenQueryTest.checkAnswer(
+ GlutenQueryTestUtil.checkAnswer(
df,
Seq(
Row("Hello Bob", 10, 1.0),
diff --git a/gluten-ut/test/pom.xml b/gluten-ut/test/pom.xml
new file mode 100644
index 000000000..63a6c7122
--- /dev/null
+++ b/gluten-ut/test/pom.xml
@@ -0,0 +1,199 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <parent>
+ <artifactId>gluten-ut</artifactId>
+ <groupId>io.glutenproject</groupId>
+ <version>1.2.0-SNAPSHOT</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>
+ <modelVersion>4.0.0</modelVersion>
+
+ <artifactId>gluten-ut-test</artifactId>
+ <packaging>jar</packaging>
+ <name>Gluten Unit Test</name>
+
+ <dependencies>
+ <dependency>
+ <groupId>io.glutenproject</groupId>
+ <artifactId>gluten-ut-common</artifactId>
+ <version>${project.version}</version>
+ <scope>compile</scope>
+ <type>test-jar</type>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.parquet</groupId>
+ <artifactId>parquet-column</artifactId>
+ <version>1.12.3</version>
+ <scope>test</scope>
+ <classifier>tests</classifier>
+ </dependency>
+ <dependency>
+ <groupId>io.glutenproject</groupId>
+ <artifactId>gluten-core</artifactId>
+ <version>${project.version}</version>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+
+ <profiles>
+ <profile>
+ <id>backends-clickhouse</id>
+ <activation>
+ <activeByDefault>false</activeByDefault>
+ </activation>
+ <dependencies>
+ <dependency>
+ <groupId>io.glutenproject</groupId>
+ <artifactId>backends-clickhouse</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.celeborn</groupId>
+
<artifactId>celeborn-client-spark-${spark.major.version}-shaded_${scala.binary.version}</artifactId>
+ <version>${celeborn.version}</version>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+ </profile>
+ <profile>
+ <id>backends-velox</id>
+ <activation>
+ <activeByDefault>false</activeByDefault>
+ </activation>
+ <dependencies>
+ <dependency>
+ <groupId>io.glutenproject</groupId>
+ <artifactId>backends-velox</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.arrow</groupId>
+ <artifactId>arrow-vector</artifactId>
+ <version>${arrow.version}</version>
+ <exclusions>
+ <exclusion>
+ <groupId>io.netty</groupId>
+ <artifactId>netty-common</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>io.netty</groupId>
+ <artifactId>netty-buffer</artifactId>
+ </exclusion>
+ </exclusions>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.arrow</groupId>
+ <artifactId>arrow-c-data</artifactId>
+ <version>${arrow.version}</version>
+ <scope>test</scope>
+ <exclusions>
+ <exclusion>
+ <groupId>org.apache.arrow</groupId>
+ <artifactId>arrow-vector</artifactId>
+ </exclusion>
+ <exclusion>
+ <artifactId>protobuf-java</artifactId>
+ <groupId>com.google.protobuf</groupId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.arrow</groupId>
+ <artifactId>arrow-memory-netty</artifactId>
+ <version>${arrow.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.arrow</groupId>
+ <artifactId>arrow-memory-core</artifactId>
+ <version>${arrow.version}</version>
+ <scope>test</scope>
+ <exclusions>
+ <exclusion>
+ <groupId>io.netty</groupId>
+ <artifactId>netty-common</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>io.netty</groupId>
+ <artifactId>netty-buffer</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ </dependencies>
+ <properties>
+ <clickhouse.lib.path></clickhouse.lib.path>
+ </properties>
+ </profile>
+ </profiles>
+
+ <build>
+
<outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
+
<testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-resources-plugin</artifactId>
+ </plugin>
+ <plugin>
+ <groupId>net.alchim31.maven</groupId>
+ <artifactId>scala-maven-plugin</artifactId>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-compiler-plugin</artifactId>
+ </plugin>
+ <plugin>
+ <groupId>org.scalastyle</groupId>
+ <artifactId>scalastyle-maven-plugin</artifactId>
+ </plugin>
+ <plugin>
+ <groupId>com.diffplug.spotless</groupId>
+ <artifactId>spotless-maven-plugin</artifactId>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-checkstyle-plugin</artifactId>
+ </plugin>
+ <plugin>
+ <groupId>org.scalatest</groupId>
+ <artifactId>scalatest-maven-plugin</artifactId>
+ <configuration>
+ <junitxml>.</junitxml>
+ </configuration>
+ <executions>
+ <execution>
+ <id>test</id>
+ <goals>
+ <goal>test</goal>
+ </goals>
+ <configuration>
+ <systemProperties>
+
<clickhouse.lib.path>${clickhouse.lib.path}</clickhouse.lib.path>
+ <tpcds.data.path>${tpcds.data.path}</tpcds.data.path>
+ </systemProperties>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-jar-plugin</artifactId>
+ <executions>
+ <execution>
+ <id>prepare-test-jar</id>
+ <phase>test-compile</phase>
+ <goals>
+ <goal>test-jar</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </build>
+</project>
diff --git
a/gluten-ut/test/src/test/scala/io/glutenproject/expressions/GlutenExpressionMappingSuite.scala
b/gluten-ut/test/src/test/scala/io/glutenproject/expressions/GlutenExpressionMappingSuite.scala
new file mode 100644
index 000000000..81e234c0e
--- /dev/null
+++
b/gluten-ut/test/src/test/scala/io/glutenproject/expressions/GlutenExpressionMappingSuite.scala
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.glutenproject.expressions
+
+import io.glutenproject.GlutenConfig
+import io.glutenproject.execution.ProjectExecTransformer
+import io.glutenproject.expression.ExpressionMappings
+import io.glutenproject.utils.{BackendTestUtils, SystemParameters}
+
+import org.apache.spark.SparkConf
+import org.apache.spark.sql.{GlutenQueryTest, Row}
+import org.apache.spark.sql.execution.ProjectExec
+import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper
+import org.apache.spark.sql.test.SharedSparkSession
+
+class GlutenExpressionMappingSuite
+ extends GlutenQueryTest
+ with SharedSparkSession
+ with AdaptiveSparkPlanHelper {
+
+ override protected def sparkConf: SparkConf = {
+ val conf = super.sparkConf
+ .set("spark.plugins", "io.glutenproject.GlutenPlugin")
+ .set("spark.default.parallelism", "1")
+ .set("spark.memory.offHeap.enabled", "true")
+ .set("spark.memory.offHeap.size", "1024MB")
+ .set("spark.ui.enabled", "false")
+ .set("spark.gluten.ui.enabled", "false")
+ if (BackendTestUtils.isCHBackendLoaded()) {
+ conf
+ .set("spark.gluten.sql.enable.native.validation", "false")
+ .set(GlutenConfig.GLUTEN_LIB_PATH,
SystemParameters.getClickHouseLibPath)
+ }
+ conf
+ }
+
+ test("test expression blacklist") {
+ val names = ExpressionMappings.expressionsMap.values.toSet
+ assert(names.contains("regexp_replace"))
+ assert(names.contains("regexp_extract"))
+
+ withSQLConf(GlutenConfig.EXPRESSION_BLACK_LIST.key ->
"regexp_replace,regexp_extract,add") {
+ val names = ExpressionMappings.expressionsMap.values.toSet
+ assert(!names.contains("regexp_replace"))
+ assert(!names.contains("regexp_extract"))
+ assert(names.contains("regexp_extract_all"))
+ assert(!names.contains("add"))
+ spark.sql("CREATE TABLE t USING PARQUET AS SELECT 1 as c")
+ withTable("t") {
+ val df = spark.sql("SELECT c + 1 FROM t")
+ checkAnswer(df, Row(2))
+
assert(find(df.queryExecution.executedPlan)(_.isInstanceOf[ProjectExecTransformer]).isEmpty)
+
assert(find(df.queryExecution.executedPlan)(_.isInstanceOf[ProjectExec]).isDefined)
+ }
+ }
+ }
+
+ test("test blacklisting regexp expressions") {
+ val names = ExpressionMappings.expressionsMap.values.toSet
+ assert(names.contains("rlike"))
+ assert(names.contains("regexp_replace"))
+ assert(names.contains("regexp_extract"))
+ assert(names.contains("regexp_extract_all"))
+ assert(names.contains("split"))
+
+ withSQLConf(
+ GlutenConfig.EXPRESSION_BLACK_LIST.key -> "",
+ GlutenConfig.FALLBACK_REGEXP_EXPRESSIONS.key -> "true") {
+ val names = ExpressionMappings.expressionsMap.values.toSet
+ assert(!names.contains("rlike"))
+ assert(!names.contains("regexp_replace"))
+ assert(!names.contains("regexp_extract"))
+ assert(!names.contains("regexp_extract_all"))
+ assert(!names.contains("split"))
+
+ spark.sql("CREATE TABLE t USING PARQUET AS SELECT 'abc100' as c")
+ withTable("t") {
+ val df = spark.sql("SELECT regexp_replace(c, '(\\d+)', 'something')
FROM t")
+
assert(find(df.queryExecution.executedPlan)(_.isInstanceOf[ProjectExecTransformer]).isEmpty)
+ }
+ }
+ }
+}
diff --git a/shims/common/src/main/scala/io/glutenproject/GlutenConfig.scala
b/shims/common/src/main/scala/io/glutenproject/GlutenConfig.scala
index 4119a09fc..a30f70baa 100644
--- a/shims/common/src/main/scala/io/glutenproject/GlutenConfig.scala
+++ b/shims/common/src/main/scala/io/glutenproject/GlutenConfig.scala
@@ -292,11 +292,18 @@ class GlutenConfig(conf: SQLConf) extends Logging {
def expressionBlacklist: Set[String] = {
val blacklist = conf.getConf(EXPRESSION_BLACK_LIST)
- if (blacklist.isDefined) {
+ val blacklistSet: Set[String] = if (blacklist.isDefined) {
blacklist.get.toLowerCase(Locale.ROOT).trim.split(",").toSet
} else {
Set.empty
}
+
+ if (conf.getConf(FALLBACK_REGEXP_EXPRESSIONS)) {
+ val regexpList =
"rlike,regexp_replace,regexp_extract,regexp_extract_all,split"
+ regexpList.trim.split(",").toSet ++ blacklistSet
+ } else {
+ blacklistSet
+ }
}
def printStackOnValidationFailure: Boolean =
@@ -1416,6 +1423,15 @@ object GlutenConfig {
.stringConf
.createOptional
+ val FALLBACK_REGEXP_EXPRESSIONS =
+ buildConf("spark.gluten.sql.fallbackRegexpExpressions")
+ .doc(
"If true, fall back all regexp expressions. There are a few
incompatible cases" +
" between RE2 (used by the native engine) and java.util.regex (used by
Spark). Users should" +
" enable this property if the incompatibility is intolerable.")
+ .booleanConf
+ .createWithDefault(false)
+
val FALLBACK_REPORTER_ENABLED =
buildConf("spark.gluten.sql.columnar.fallbackReporter")
.doc("When true, enable fallback reporter rule to print fallback reason")
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]