This is an automated email from the ASF dual-hosted git repository.
philo-he pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/gluten.git
The following commit(s) were added to refs/heads/main by this push:
new b4425283ec [MINOR][TEST] Remove SparkFunctionStatistics (#12136)
b4425283ec is described below
commit b4425283ec83ca9ae6a7d09c87cddaf249b98ef7
Author: Philo He <[email protected]>
AuthorDate: Tue May 26 16:09:06 2026 -0700
[MINOR][TEST] Remove SparkFunctionStatistics (#12136)
---
.../utils/clickhouse/ClickHouseTestSettings.scala | 2 -
.../sql/statistics/SparkFunctionStatistics.scala | 220 ---------------------
.../utils/clickhouse/ClickHouseTestSettings.scala | 2 -
.../sql/statistics/SparkFunctionStatistics.scala | 218 --------------------
.../sql/statistics/SparkFunctionStatistics.scala | 218 --------------------
.../sql/statistics/SparkFunctionStatistics.scala | 218 --------------------
.../sql/statistics/SparkFunctionStatistics.scala | 218 --------------------
7 files changed, 1096 deletions(-)
diff --git
a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
index 075e310ec7..01177248c0 100644
---
a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
+++
b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
@@ -40,7 +40,6 @@ import
org.apache.spark.sql.extension.GlutenSessionExtensionSuite
import org.apache.spark.sql.gluten.GlutenFallbackSuite
import org.apache.spark.sql.hive.execution.GlutenHiveSQLQueryCHSuite
import org.apache.spark.sql.sources._
-import org.apache.spark.sql.statistics.SparkFunctionStatistics
// Some settings' line length exceeds 100
// scalastyle:off line.size.limit
@@ -1875,7 +1874,6 @@ class ClickHouseTestSettings extends BackendTestSettings {
.exclude(
"SELECT structFieldSimple.key, arrayFieldSimple[1] FROM tableWithSchema
a where int_Field=1")
.exclude("SELECT structFieldComplex.Value.`value_(2)` FROM
tableWithSchema")
- enableSuite[SparkFunctionStatistics]
enableSuite[GlutenImplicitsTest]
.excludeGlutenTest("fallbackSummary with shuffle")
.excludeGlutenTest("fallbackSummary with cache")
diff --git
a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/statistics/SparkFunctionStatistics.scala
b/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/statistics/SparkFunctionStatistics.scala
deleted file mode 100644
index 5723cb1161..0000000000
---
a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/statistics/SparkFunctionStatistics.scala
+++ /dev/null
@@ -1,220 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.spark.sql.statistics
-
-import org.apache.gluten.config.GlutenConfig
-import org.apache.gluten.execution.GlutenPlan
-import org.apache.gluten.utils.BackendTestUtils
-
-import org.apache.spark.sql.{GlutenTestConstants, QueryTest, SparkSession}
-import org.apache.spark.sql.catalyst.analysis.FunctionRegistry
-import org.apache.spark.sql.catalyst.optimizer.{ConstantFolding,
ConvertToLocalRelation, NullPropagation}
-import org.apache.spark.sql.execution.{ProjectExec, SparkPlan}
-import org.apache.spark.sql.internal.SQLConf
-
-import scala.util.control.Breaks.{break, breakable}
-
-/**
- * TODO: There are some false positive & false negative cases for some
functions. For such
- * situation, we need to use a suitable test sql to do the check.
- */
-class SparkFunctionStatistics extends QueryTest {
-
- var spark: SparkSession = null
-
- protected def initializeSession(): Unit = {
- if (spark == null) {
- val sparkBuilder = SparkSession
- .builder()
- .appName("Gluten-UT")
- .master(s"local[2]")
- // Avoid static evaluation for literal input by spark catalyst.
- .config(
- SQLConf.OPTIMIZER_EXCLUDED_RULES.key,
- ConvertToLocalRelation.ruleName +
- "," + ConstantFolding.ruleName + "," + NullPropagation.ruleName)
- .config("spark.driver.memory", "1G")
- .config("spark.sql.adaptive.enabled", "true")
- .config("spark.sql.shuffle.partitions", "1")
- .config("spark.sql.files.maxPartitionBytes", "134217728")
- .config("spark.memory.offHeap.enabled", "true")
- .config("spark.memory.offHeap.size", "1024MB")
- .config("spark.plugins", "org.apache.gluten.GlutenPlugin")
- .config("spark.shuffle.manager",
"org.apache.spark.shuffle.sort.ColumnarShuffleManager")
- // Avoid the code size overflow error in Spark code generation.
- .config("spark.sql.codegen.wholeStage", "false")
-
- spark = if (BackendTestUtils.isCHBackendLoaded()) {
- sparkBuilder
- .config("spark.io.compression.codec", "LZ4")
- .config("spark.gluten.sql.columnar.backend.ch.worker.id", "1")
- .config(GlutenConfig.NATIVE_VALIDATION_ENABLED.key, "false")
- .config("spark.sql.files.openCostInBytes", "134217728")
- .config("spark.unsafe.exceptionOnMemoryLeak", "true")
- .getOrCreate()
- } else {
- sparkBuilder
- .config("spark.unsafe.exceptionOnMemoryLeak", "true")
- .getOrCreate()
- }
- }
- }
-
- def extractQuery(examples: String): Seq[String] = {
- examples
- .split("\n")
- .map(_.trim)
- .filter(!_.isEmpty)
- .filter(_.startsWith("> SELECT"))
- .map(_.replace("> SELECT", "SELECT"))
- }
-
- test(GlutenTestConstants.GLUTEN_TEST + "Run spark function statistics: ") {
- initializeSession
- val functionRegistry = spark.sessionState.functionRegistry
- val sparkBuiltInFunctions = functionRegistry.listFunction()
- // According to expressionsForTimestampNTZSupport in
FunctionRegistry.scala,
- // these functions are registered only for testing, not available for end
users.
- // Other functions like current_database is NOT necessarily offloaded to
native.
- val ignoreFunctions =
FunctionRegistry.expressionsForTimestampNTZSupport.keySet ++
- Seq(
- "get_fake_app_name",
- "current_catalog",
- "current_database",
- "spark_partition_id",
- "current_user",
- "current_timezone")
- val supportedFunctions = new java.util.ArrayList[String]()
- val unsupportedFunctions = new java.util.ArrayList[String]()
- val needInspectFunctions = new java.util.ArrayList[String]()
-
- for (func <- sparkBuiltInFunctions) {
- val exprInfo = functionRegistry.lookupFunction(func).get
- if (!ignoreFunctions.contains(exprInfo.getName)) {
- val examples = extractQuery(exprInfo.getExamples)
- if (examples.isEmpty) {
- needInspectFunctions.add(exprInfo.getName)
- // scalastyle:off println
- println("## Not found examples for " + exprInfo.getName)
- // scalastyle:on println
- }
- var isSupported: Boolean = true
- breakable {
- for (example <- examples) {
- var executedPlan: SparkPlan = null
- try {
- executedPlan = spark.sql(example).queryExecution.executedPlan
- } catch {
- case t: Throwable =>
- needInspectFunctions.add(exprInfo.getName)
- // scalastyle:off println
- println("-- Need inspect " + exprInfo.getName)
- println(exprInfo.getExamples)
- // scalastyle:on println
- break
- }
- val hasFallbackProject =
executedPlan.find(_.isInstanceOf[ProjectExec]).isDefined
- if (hasFallbackProject) {
- isSupported = false
- break
- }
- val hasGlutenPlan =
executedPlan.find(_.isInstanceOf[GlutenPlan]).isDefined
- if (!hasGlutenPlan) {
- isSupported = false
- break
- }
- break
- }
- }
- if (isSupported && !needInspectFunctions.contains(exprInfo.getName)) {
- supportedFunctions.add(exprInfo.getName)
- } else if (!isSupported) {
- unsupportedFunctions.add(exprInfo.getName)
- }
- }
- }
- // scalastyle:off println
- println("Overall functions: " + (sparkBuiltInFunctions.size -
ignoreFunctions.size))
- println("Supported functions: " + supportedFunctions.size())
- println("Unsupported functions: " + unsupportedFunctions.size())
- println("Need inspect functions: " + needInspectFunctions.size())
- // scalastyle:on println
- // For correction.
- val supportedCastAliasFunctions = Seq(
- "boolean",
- "tinyint",
- "smallint",
- "int",
- "bigint",
- "float",
- "double",
- "decimal",
- "date",
- "binary",
- "string")
- for (func <- supportedCastAliasFunctions) {
- if (needInspectFunctions.contains(func)) {
- needInspectFunctions.remove(func)
- supportedFunctions.add(func)
- }
- }
-
- // For wrongly recognized unsupported case.
- Seq("%", "ceil", "floor", "first", "first_value", "last", "last_value",
"hash", "mod").foreach(
- name => {
- if (unsupportedFunctions.remove(name)) {
- supportedFunctions.add(name)
- }
- })
- // For wrongly recognized supported case.
- Seq(
- "array_contains",
- "map_keys",
- "get_json_object",
- "element_at",
- "map_from_arrays",
- "contains",
- "startswith",
- "endswith",
- "map_contains_key",
- "map_values",
- "try_element_at",
- "struct",
- "array",
- "ilike",
- "sec",
- "csc"
- ).foreach(
- name => {
- if (supportedFunctions.remove(name)) {
- unsupportedFunctions.add(name)
- }
- })
- // Functions in needInspectFunctions were checked.
- unsupportedFunctions.addAll(needInspectFunctions)
- // scalastyle:off println
- println("---------------")
- println("Overall functions: " + (sparkBuiltInFunctions.size -
ignoreFunctions.size))
- println("Supported functions corrected: " + supportedFunctions.size())
- println("Unsupported functions corrected: " + unsupportedFunctions.size())
- println("Support list:")
- println(supportedFunctions)
- println("Not support list:")
- println(unsupportedFunctions)
- // scalastyle:on println
- }
-}
diff --git
a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
index 7b3fc556fb..485c3acd29 100644
---
a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
+++
b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
@@ -40,7 +40,6 @@ import
org.apache.spark.sql.extension.GlutenSessionExtensionSuite
import org.apache.spark.sql.gluten.GlutenFallbackSuite
import org.apache.spark.sql.hive.execution.GlutenHiveSQLQueryCHSuite
import org.apache.spark.sql.sources._
-import org.apache.spark.sql.statistics.SparkFunctionStatistics
// Some settings' line length exceeds 100
// scalastyle:off line.size.limit
@@ -1738,7 +1737,6 @@ class ClickHouseTestSettings extends BackendTestSettings {
.exclude(
"SELECT structFieldSimple.key, arrayFieldSimple[1] FROM tableWithSchema
a where int_Field=1")
.exclude("SELECT structFieldComplex.Value.`value_(2)` FROM
tableWithSchema")
- enableSuite[SparkFunctionStatistics]
enableSuite[GlutenSparkSessionExtensionSuite]
enableSuite[GlutenHiveSQLQueryCHSuite]
enableSuite[GlutenPercentileSuite]
diff --git
a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/statistics/SparkFunctionStatistics.scala
b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/statistics/SparkFunctionStatistics.scala
deleted file mode 100644
index 5bf53e66e6..0000000000
---
a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/statistics/SparkFunctionStatistics.scala
+++ /dev/null
@@ -1,218 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.spark.sql.statistics
-
-import org.apache.gluten.config.GlutenConfig
-import org.apache.gluten.execution.GlutenPlan
-import org.apache.gluten.utils.BackendTestUtils
-
-import org.apache.spark.sql.{GlutenTestConstants, QueryTest, SparkSession}
-import org.apache.spark.sql.catalyst.optimizer.{ConstantFolding,
ConvertToLocalRelation, NullPropagation}
-import org.apache.spark.sql.execution.{ProjectExec, SparkPlan}
-import org.apache.spark.sql.internal.SQLConf
-
-import scala.util.control.Breaks.{break, breakable}
-
-/**
- * TODO: There are some false positive & false negative cases for some
functions. For such
- * situation, we need to use a suitable test sql to do the check.
- */
-class SparkFunctionStatistics extends QueryTest {
-
- var spark: SparkSession = null
-
- protected def initializeSession(): Unit = {
- if (spark == null) {
- val sparkBuilder = SparkSession
- .builder()
- .appName("Gluten-UT")
- .master(s"local[2]")
- // Avoid static evaluation for literal input by spark catalyst.
- .config(
- SQLConf.OPTIMIZER_EXCLUDED_RULES.key,
- ConvertToLocalRelation.ruleName +
- "," + ConstantFolding.ruleName + "," + NullPropagation.ruleName)
- .config("spark.driver.memory", "1G")
- .config("spark.sql.adaptive.enabled", "true")
- .config("spark.sql.shuffle.partitions", "1")
- .config("spark.sql.files.maxPartitionBytes", "134217728")
- .config("spark.memory.offHeap.enabled", "true")
- .config("spark.memory.offHeap.size", "1024MB")
- .config("spark.plugins", "org.apache.gluten.GlutenPlugin")
- .config("spark.shuffle.manager",
"org.apache.spark.shuffle.sort.ColumnarShuffleManager")
- // Avoid the code size overflow error in Spark code generation.
- .config("spark.sql.codegen.wholeStage", "false")
-
- spark = if (BackendTestUtils.isCHBackendLoaded()) {
- sparkBuilder
- .config("spark.io.compression.codec", "LZ4")
- .config("spark.gluten.sql.columnar.backend.ch.worker.id", "1")
- .config(GlutenConfig.NATIVE_VALIDATION_ENABLED.key, "false")
- .config("spark.sql.files.openCostInBytes", "134217728")
- .config("spark.unsafe.exceptionOnMemoryLeak", "true")
- .getOrCreate()
- } else {
- sparkBuilder
- .config("spark.unsafe.exceptionOnMemoryLeak", "true")
- .getOrCreate()
- }
- }
- }
-
- def extractQuery(examples: String): Seq[String] = {
- examples
- .split("\n")
- .map(_.trim)
- .filter(!_.isEmpty)
- .filter(_.startsWith("> SELECT"))
- .map(_.replace("> SELECT", "SELECT"))
- }
-
- test(GlutenTestConstants.GLUTEN_TEST + "Run spark function statistics: ") {
- initializeSession
- val functionRegistry = spark.sessionState.functionRegistry
- val sparkBuiltInFunctions = functionRegistry.listFunction()
- // According to expressionsForTimestampNTZSupport in
FunctionRegistry.scala,
- // these functions are registered only for testing, not available for end
users.
- // Other functions like current_database is NOT necessarily offloaded to
native.
- val ignoreFunctions = Seq(
- "get_fake_app_name",
- "current_catalog",
- "current_database",
- "spark_partition_id",
- "current_user",
- "current_timezone")
- val supportedFunctions = new java.util.ArrayList[String]()
- val unsupportedFunctions = new java.util.ArrayList[String]()
- val needInspectFunctions = new java.util.ArrayList[String]()
-
- for (func <- sparkBuiltInFunctions) {
- val exprInfo = functionRegistry.lookupFunction(func).get
- if (!ignoreFunctions.contains(exprInfo.getName)) {
- val examples = extractQuery(exprInfo.getExamples)
- if (examples.isEmpty) {
- needInspectFunctions.add(exprInfo.getName)
- // scalastyle:off println
- println("## Not found examples for " + exprInfo.getName)
- // scalastyle:on println
- }
- var isSupported: Boolean = true
- breakable {
- for (example <- examples) {
- var executedPlan: SparkPlan = null
- try {
- executedPlan = spark.sql(example).queryExecution.executedPlan
- } catch {
- case t: Throwable =>
- needInspectFunctions.add(exprInfo.getName)
- // scalastyle:off println
- println("-- Need inspect " + exprInfo.getName)
- println(exprInfo.getExamples)
- // scalastyle:on println
- break
- }
- val hasFallbackProject =
executedPlan.find(_.isInstanceOf[ProjectExec]).isDefined
- if (hasFallbackProject) {
- isSupported = false
- break
- }
- val hasGlutenPlan =
executedPlan.find(_.isInstanceOf[GlutenPlan]).isDefined
- if (!hasGlutenPlan) {
- isSupported = false
- break
- }
- break
- }
- }
- if (isSupported && !needInspectFunctions.contains(exprInfo.getName)) {
- supportedFunctions.add(exprInfo.getName)
- } else if (!isSupported) {
- unsupportedFunctions.add(exprInfo.getName)
- }
- }
- }
- // scalastyle:off println
- println("Overall functions: " + (sparkBuiltInFunctions.size -
ignoreFunctions.size))
- println("Supported functions: " + supportedFunctions.size())
- println("Unsupported functions: " + unsupportedFunctions.size())
- println("Need inspect functions: " + needInspectFunctions.size())
- // scalastyle:on println
- // For correction.
- val supportedCastAliasFunctions = Seq(
- "boolean",
- "tinyint",
- "smallint",
- "int",
- "bigint",
- "float",
- "double",
- "decimal",
- "date",
- "binary",
- "string")
- for (func <- supportedCastAliasFunctions) {
- if (needInspectFunctions.contains(func)) {
- needInspectFunctions.remove(func)
- supportedFunctions.add(func)
- }
- }
-
- // For wrongly recognized unsupported case.
- Seq("%", "ceil", "floor", "first", "first_value", "last", "last_value",
"hash", "mod").foreach(
- name => {
- if (unsupportedFunctions.remove(name)) {
- supportedFunctions.add(name)
- }
- })
- // For wrongly recognized supported case.
- Seq(
- "array_contains",
- "map_keys",
- "get_json_object",
- "element_at",
- "map_from_arrays",
- "contains",
- "startswith",
- "endswith",
- "map_contains_key",
- "map_values",
- "try_element_at",
- "struct",
- "array",
- "ilike",
- "sec",
- "csc"
- ).foreach(
- name => {
- if (supportedFunctions.remove(name)) {
- unsupportedFunctions.add(name)
- }
- })
- // Functions in needInspectFunctions were checked.
- unsupportedFunctions.addAll(needInspectFunctions)
- // scalastyle:off println
- println("---------------")
- println("Overall functions: " + (sparkBuiltInFunctions.size -
ignoreFunctions.size))
- println("Supported functions corrected: " + supportedFunctions.size())
- println("Unsupported functions corrected: " + unsupportedFunctions.size())
- println("Support list:")
- println(supportedFunctions)
- println("Not support list:")
- println(unsupportedFunctions)
- // scalastyle:on println
- }
-}
diff --git
a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/statistics/SparkFunctionStatistics.scala
b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/statistics/SparkFunctionStatistics.scala
deleted file mode 100644
index 5bf53e66e6..0000000000
---
a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/statistics/SparkFunctionStatistics.scala
+++ /dev/null
@@ -1,218 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.spark.sql.statistics
-
-import org.apache.gluten.config.GlutenConfig
-import org.apache.gluten.execution.GlutenPlan
-import org.apache.gluten.utils.BackendTestUtils
-
-import org.apache.spark.sql.{GlutenTestConstants, QueryTest, SparkSession}
-import org.apache.spark.sql.catalyst.optimizer.{ConstantFolding,
ConvertToLocalRelation, NullPropagation}
-import org.apache.spark.sql.execution.{ProjectExec, SparkPlan}
-import org.apache.spark.sql.internal.SQLConf
-
-import scala.util.control.Breaks.{break, breakable}
-
-/**
- * TODO: There are some false positive & false negative cases for some
functions. For such
- * situation, we need to use a suitable test sql to do the check.
- */
-class SparkFunctionStatistics extends QueryTest {
-
- var spark: SparkSession = null
-
- protected def initializeSession(): Unit = {
- if (spark == null) {
- val sparkBuilder = SparkSession
- .builder()
- .appName("Gluten-UT")
- .master(s"local[2]")
- // Avoid static evaluation for literal input by spark catalyst.
- .config(
- SQLConf.OPTIMIZER_EXCLUDED_RULES.key,
- ConvertToLocalRelation.ruleName +
- "," + ConstantFolding.ruleName + "," + NullPropagation.ruleName)
- .config("spark.driver.memory", "1G")
- .config("spark.sql.adaptive.enabled", "true")
- .config("spark.sql.shuffle.partitions", "1")
- .config("spark.sql.files.maxPartitionBytes", "134217728")
- .config("spark.memory.offHeap.enabled", "true")
- .config("spark.memory.offHeap.size", "1024MB")
- .config("spark.plugins", "org.apache.gluten.GlutenPlugin")
- .config("spark.shuffle.manager",
"org.apache.spark.shuffle.sort.ColumnarShuffleManager")
- // Avoid the code size overflow error in Spark code generation.
- .config("spark.sql.codegen.wholeStage", "false")
-
- spark = if (BackendTestUtils.isCHBackendLoaded()) {
- sparkBuilder
- .config("spark.io.compression.codec", "LZ4")
- .config("spark.gluten.sql.columnar.backend.ch.worker.id", "1")
- .config(GlutenConfig.NATIVE_VALIDATION_ENABLED.key, "false")
- .config("spark.sql.files.openCostInBytes", "134217728")
- .config("spark.unsafe.exceptionOnMemoryLeak", "true")
- .getOrCreate()
- } else {
- sparkBuilder
- .config("spark.unsafe.exceptionOnMemoryLeak", "true")
- .getOrCreate()
- }
- }
- }
-
- def extractQuery(examples: String): Seq[String] = {
- examples
- .split("\n")
- .map(_.trim)
- .filter(!_.isEmpty)
- .filter(_.startsWith("> SELECT"))
- .map(_.replace("> SELECT", "SELECT"))
- }
-
- test(GlutenTestConstants.GLUTEN_TEST + "Run spark function statistics: ") {
- initializeSession
- val functionRegistry = spark.sessionState.functionRegistry
- val sparkBuiltInFunctions = functionRegistry.listFunction()
- // According to expressionsForTimestampNTZSupport in
FunctionRegistry.scala,
- // these functions are registered only for testing, not available for end
users.
- // Other functions like current_database is NOT necessarily offloaded to
native.
- val ignoreFunctions = Seq(
- "get_fake_app_name",
- "current_catalog",
- "current_database",
- "spark_partition_id",
- "current_user",
- "current_timezone")
- val supportedFunctions = new java.util.ArrayList[String]()
- val unsupportedFunctions = new java.util.ArrayList[String]()
- val needInspectFunctions = new java.util.ArrayList[String]()
-
- for (func <- sparkBuiltInFunctions) {
- val exprInfo = functionRegistry.lookupFunction(func).get
- if (!ignoreFunctions.contains(exprInfo.getName)) {
- val examples = extractQuery(exprInfo.getExamples)
- if (examples.isEmpty) {
- needInspectFunctions.add(exprInfo.getName)
- // scalastyle:off println
- println("## Not found examples for " + exprInfo.getName)
- // scalastyle:on println
- }
- var isSupported: Boolean = true
- breakable {
- for (example <- examples) {
- var executedPlan: SparkPlan = null
- try {
- executedPlan = spark.sql(example).queryExecution.executedPlan
- } catch {
- case t: Throwable =>
- needInspectFunctions.add(exprInfo.getName)
- // scalastyle:off println
- println("-- Need inspect " + exprInfo.getName)
- println(exprInfo.getExamples)
- // scalastyle:on println
- break
- }
- val hasFallbackProject =
executedPlan.find(_.isInstanceOf[ProjectExec]).isDefined
- if (hasFallbackProject) {
- isSupported = false
- break
- }
- val hasGlutenPlan =
executedPlan.find(_.isInstanceOf[GlutenPlan]).isDefined
- if (!hasGlutenPlan) {
- isSupported = false
- break
- }
- break
- }
- }
- if (isSupported && !needInspectFunctions.contains(exprInfo.getName)) {
- supportedFunctions.add(exprInfo.getName)
- } else if (!isSupported) {
- unsupportedFunctions.add(exprInfo.getName)
- }
- }
- }
- // scalastyle:off println
- println("Overall functions: " + (sparkBuiltInFunctions.size -
ignoreFunctions.size))
- println("Supported functions: " + supportedFunctions.size())
- println("Unsupported functions: " + unsupportedFunctions.size())
- println("Need inspect functions: " + needInspectFunctions.size())
- // scalastyle:on println
- // For correction.
- val supportedCastAliasFunctions = Seq(
- "boolean",
- "tinyint",
- "smallint",
- "int",
- "bigint",
- "float",
- "double",
- "decimal",
- "date",
- "binary",
- "string")
- for (func <- supportedCastAliasFunctions) {
- if (needInspectFunctions.contains(func)) {
- needInspectFunctions.remove(func)
- supportedFunctions.add(func)
- }
- }
-
- // For wrongly recognized unsupported case.
- Seq("%", "ceil", "floor", "first", "first_value", "last", "last_value",
"hash", "mod").foreach(
- name => {
- if (unsupportedFunctions.remove(name)) {
- supportedFunctions.add(name)
- }
- })
- // For wrongly recognized supported case.
- Seq(
- "array_contains",
- "map_keys",
- "get_json_object",
- "element_at",
- "map_from_arrays",
- "contains",
- "startswith",
- "endswith",
- "map_contains_key",
- "map_values",
- "try_element_at",
- "struct",
- "array",
- "ilike",
- "sec",
- "csc"
- ).foreach(
- name => {
- if (supportedFunctions.remove(name)) {
- unsupportedFunctions.add(name)
- }
- })
- // Functions in needInspectFunctions were checked.
- unsupportedFunctions.addAll(needInspectFunctions)
- // scalastyle:off println
- println("---------------")
- println("Overall functions: " + (sparkBuiltInFunctions.size -
ignoreFunctions.size))
- println("Supported functions corrected: " + supportedFunctions.size())
- println("Unsupported functions corrected: " + unsupportedFunctions.size())
- println("Support list:")
- println(supportedFunctions)
- println("Not support list:")
- println(unsupportedFunctions)
- // scalastyle:on println
- }
-}
diff --git
a/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/statistics/SparkFunctionStatistics.scala
b/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/statistics/SparkFunctionStatistics.scala
deleted file mode 100644
index 5bf53e66e6..0000000000
---
a/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/statistics/SparkFunctionStatistics.scala
+++ /dev/null
@@ -1,218 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.spark.sql.statistics
-
-import org.apache.gluten.config.GlutenConfig
-import org.apache.gluten.execution.GlutenPlan
-import org.apache.gluten.utils.BackendTestUtils
-
-import org.apache.spark.sql.{GlutenTestConstants, QueryTest, SparkSession}
-import org.apache.spark.sql.catalyst.optimizer.{ConstantFolding,
ConvertToLocalRelation, NullPropagation}
-import org.apache.spark.sql.execution.{ProjectExec, SparkPlan}
-import org.apache.spark.sql.internal.SQLConf
-
-import scala.util.control.Breaks.{break, breakable}
-
-/**
- * TODO: There are some false positive & false negative cases for some
functions. For such
- * situation, we need to use a suitable test sql to do the check.
- */
-class SparkFunctionStatistics extends QueryTest {
-
- var spark: SparkSession = null
-
- protected def initializeSession(): Unit = {
- if (spark == null) {
- val sparkBuilder = SparkSession
- .builder()
- .appName("Gluten-UT")
- .master(s"local[2]")
- // Avoid static evaluation for literal input by spark catalyst.
- .config(
- SQLConf.OPTIMIZER_EXCLUDED_RULES.key,
- ConvertToLocalRelation.ruleName +
- "," + ConstantFolding.ruleName + "," + NullPropagation.ruleName)
- .config("spark.driver.memory", "1G")
- .config("spark.sql.adaptive.enabled", "true")
- .config("spark.sql.shuffle.partitions", "1")
- .config("spark.sql.files.maxPartitionBytes", "134217728")
- .config("spark.memory.offHeap.enabled", "true")
- .config("spark.memory.offHeap.size", "1024MB")
- .config("spark.plugins", "org.apache.gluten.GlutenPlugin")
- .config("spark.shuffle.manager",
"org.apache.spark.shuffle.sort.ColumnarShuffleManager")
- // Avoid the code size overflow error in Spark code generation.
- .config("spark.sql.codegen.wholeStage", "false")
-
- spark = if (BackendTestUtils.isCHBackendLoaded()) {
- sparkBuilder
- .config("spark.io.compression.codec", "LZ4")
- .config("spark.gluten.sql.columnar.backend.ch.worker.id", "1")
- .config(GlutenConfig.NATIVE_VALIDATION_ENABLED.key, "false")
- .config("spark.sql.files.openCostInBytes", "134217728")
- .config("spark.unsafe.exceptionOnMemoryLeak", "true")
- .getOrCreate()
- } else {
- sparkBuilder
- .config("spark.unsafe.exceptionOnMemoryLeak", "true")
- .getOrCreate()
- }
- }
- }
-
- def extractQuery(examples: String): Seq[String] = {
- examples
- .split("\n")
- .map(_.trim)
- .filter(!_.isEmpty)
- .filter(_.startsWith("> SELECT"))
- .map(_.replace("> SELECT", "SELECT"))
- }
-
- test(GlutenTestConstants.GLUTEN_TEST + "Run spark function statistics: ") {
- initializeSession
- val functionRegistry = spark.sessionState.functionRegistry
- val sparkBuiltInFunctions = functionRegistry.listFunction()
- // According to expressionsForTimestampNTZSupport in
FunctionRegistry.scala,
- // these functions are registered only for testing, not available for end
users.
- // Other functions like current_database is NOT necessarily offloaded to
native.
- val ignoreFunctions = Seq(
- "get_fake_app_name",
- "current_catalog",
- "current_database",
- "spark_partition_id",
- "current_user",
- "current_timezone")
- val supportedFunctions = new java.util.ArrayList[String]()
- val unsupportedFunctions = new java.util.ArrayList[String]()
- val needInspectFunctions = new java.util.ArrayList[String]()
-
- for (func <- sparkBuiltInFunctions) {
- val exprInfo = functionRegistry.lookupFunction(func).get
- if (!ignoreFunctions.contains(exprInfo.getName)) {
- val examples = extractQuery(exprInfo.getExamples)
- if (examples.isEmpty) {
- needInspectFunctions.add(exprInfo.getName)
- // scalastyle:off println
- println("## Not found examples for " + exprInfo.getName)
- // scalastyle:on println
- }
- var isSupported: Boolean = true
- breakable {
- for (example <- examples) {
- var executedPlan: SparkPlan = null
- try {
- executedPlan = spark.sql(example).queryExecution.executedPlan
- } catch {
- case t: Throwable =>
- needInspectFunctions.add(exprInfo.getName)
- // scalastyle:off println
- println("-- Need inspect " + exprInfo.getName)
- println(exprInfo.getExamples)
- // scalastyle:on println
- break
- }
- val hasFallbackProject =
executedPlan.find(_.isInstanceOf[ProjectExec]).isDefined
- if (hasFallbackProject) {
- isSupported = false
- break
- }
- val hasGlutenPlan =
executedPlan.find(_.isInstanceOf[GlutenPlan]).isDefined
- if (!hasGlutenPlan) {
- isSupported = false
- break
- }
- break
- }
- }
- if (isSupported && !needInspectFunctions.contains(exprInfo.getName)) {
- supportedFunctions.add(exprInfo.getName)
- } else if (!isSupported) {
- unsupportedFunctions.add(exprInfo.getName)
- }
- }
- }
- // scalastyle:off println
- println("Overall functions: " + (sparkBuiltInFunctions.size -
ignoreFunctions.size))
- println("Supported functions: " + supportedFunctions.size())
- println("Unsupported functions: " + unsupportedFunctions.size())
- println("Need inspect functions: " + needInspectFunctions.size())
- // scalastyle:on println
- // For correction.
- val supportedCastAliasFunctions = Seq(
- "boolean",
- "tinyint",
- "smallint",
- "int",
- "bigint",
- "float",
- "double",
- "decimal",
- "date",
- "binary",
- "string")
- for (func <- supportedCastAliasFunctions) {
- if (needInspectFunctions.contains(func)) {
- needInspectFunctions.remove(func)
- supportedFunctions.add(func)
- }
- }
-
- // For wrongly recognized unsupported case.
- Seq("%", "ceil", "floor", "first", "first_value", "last", "last_value",
"hash", "mod").foreach(
- name => {
- if (unsupportedFunctions.remove(name)) {
- supportedFunctions.add(name)
- }
- })
- // For wrongly recognized supported case.
- Seq(
- "array_contains",
- "map_keys",
- "get_json_object",
- "element_at",
- "map_from_arrays",
- "contains",
- "startswith",
- "endswith",
- "map_contains_key",
- "map_values",
- "try_element_at",
- "struct",
- "array",
- "ilike",
- "sec",
- "csc"
- ).foreach(
- name => {
- if (supportedFunctions.remove(name)) {
- unsupportedFunctions.add(name)
- }
- })
- // Functions in needInspectFunctions were checked.
- unsupportedFunctions.addAll(needInspectFunctions)
- // scalastyle:off println
- println("---------------")
- println("Overall functions: " + (sparkBuiltInFunctions.size -
ignoreFunctions.size))
- println("Supported functions corrected: " + supportedFunctions.size())
- println("Unsupported functions corrected: " + unsupportedFunctions.size())
- println("Support list:")
- println(supportedFunctions)
- println("Not support list:")
- println(unsupportedFunctions)
- // scalastyle:on println
- }
-}
diff --git
a/gluten-ut/spark41/src/test/scala/org/apache/spark/sql/statistics/SparkFunctionStatistics.scala
b/gluten-ut/spark41/src/test/scala/org/apache/spark/sql/statistics/SparkFunctionStatistics.scala
deleted file mode 100644
index 5bf53e66e6..0000000000
---
a/gluten-ut/spark41/src/test/scala/org/apache/spark/sql/statistics/SparkFunctionStatistics.scala
+++ /dev/null
@@ -1,218 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.spark.sql.statistics
-
-import org.apache.gluten.config.GlutenConfig
-import org.apache.gluten.execution.GlutenPlan
-import org.apache.gluten.utils.BackendTestUtils
-
-import org.apache.spark.sql.{GlutenTestConstants, QueryTest, SparkSession}
-import org.apache.spark.sql.catalyst.optimizer.{ConstantFolding,
ConvertToLocalRelation, NullPropagation}
-import org.apache.spark.sql.execution.{ProjectExec, SparkPlan}
-import org.apache.spark.sql.internal.SQLConf
-
-import scala.util.control.Breaks.{break, breakable}
-
-/**
- * TODO: There are some false positive & false negative cases for some
functions. For such
- * situation, we need to use a suitable test sql to do the check.
- */
-class SparkFunctionStatistics extends QueryTest {
-
- var spark: SparkSession = null
-
- protected def initializeSession(): Unit = {
- if (spark == null) {
- val sparkBuilder = SparkSession
- .builder()
- .appName("Gluten-UT")
- .master(s"local[2]")
- // Avoid static evaluation for literal input by spark catalyst.
- .config(
- SQLConf.OPTIMIZER_EXCLUDED_RULES.key,
- ConvertToLocalRelation.ruleName +
- "," + ConstantFolding.ruleName + "," + NullPropagation.ruleName)
- .config("spark.driver.memory", "1G")
- .config("spark.sql.adaptive.enabled", "true")
- .config("spark.sql.shuffle.partitions", "1")
- .config("spark.sql.files.maxPartitionBytes", "134217728")
- .config("spark.memory.offHeap.enabled", "true")
- .config("spark.memory.offHeap.size", "1024MB")
- .config("spark.plugins", "org.apache.gluten.GlutenPlugin")
- .config("spark.shuffle.manager",
"org.apache.spark.shuffle.sort.ColumnarShuffleManager")
- // Avoid the code size overflow error in Spark code generation.
- .config("spark.sql.codegen.wholeStage", "false")
-
- spark = if (BackendTestUtils.isCHBackendLoaded()) {
- sparkBuilder
- .config("spark.io.compression.codec", "LZ4")
- .config("spark.gluten.sql.columnar.backend.ch.worker.id", "1")
- .config(GlutenConfig.NATIVE_VALIDATION_ENABLED.key, "false")
- .config("spark.sql.files.openCostInBytes", "134217728")
- .config("spark.unsafe.exceptionOnMemoryLeak", "true")
- .getOrCreate()
- } else {
- sparkBuilder
- .config("spark.unsafe.exceptionOnMemoryLeak", "true")
- .getOrCreate()
- }
- }
- }
-
- def extractQuery(examples: String): Seq[String] = {
- examples
- .split("\n")
- .map(_.trim)
- .filter(!_.isEmpty)
- .filter(_.startsWith("> SELECT"))
- .map(_.replace("> SELECT", "SELECT"))
- }
-
- test(GlutenTestConstants.GLUTEN_TEST + "Run spark function statistics: ") {
- initializeSession
- val functionRegistry = spark.sessionState.functionRegistry
- val sparkBuiltInFunctions = functionRegistry.listFunction()
- // According to expressionsForTimestampNTZSupport in
FunctionRegistry.scala,
- // these functions are registered only for testing, not available for end
users.
- // Other functions like current_database is NOT necessarily offloaded to
native.
- val ignoreFunctions = Seq(
- "get_fake_app_name",
- "current_catalog",
- "current_database",
- "spark_partition_id",
- "current_user",
- "current_timezone")
- val supportedFunctions = new java.util.ArrayList[String]()
- val unsupportedFunctions = new java.util.ArrayList[String]()
- val needInspectFunctions = new java.util.ArrayList[String]()
-
- for (func <- sparkBuiltInFunctions) {
- val exprInfo = functionRegistry.lookupFunction(func).get
- if (!ignoreFunctions.contains(exprInfo.getName)) {
- val examples = extractQuery(exprInfo.getExamples)
- if (examples.isEmpty) {
- needInspectFunctions.add(exprInfo.getName)
- // scalastyle:off println
- println("## Not found examples for " + exprInfo.getName)
- // scalastyle:on println
- }
- var isSupported: Boolean = true
- breakable {
- for (example <- examples) {
- var executedPlan: SparkPlan = null
- try {
- executedPlan = spark.sql(example).queryExecution.executedPlan
- } catch {
- case t: Throwable =>
- needInspectFunctions.add(exprInfo.getName)
- // scalastyle:off println
- println("-- Need inspect " + exprInfo.getName)
- println(exprInfo.getExamples)
- // scalastyle:on println
- break
- }
- val hasFallbackProject =
executedPlan.find(_.isInstanceOf[ProjectExec]).isDefined
- if (hasFallbackProject) {
- isSupported = false
- break
- }
- val hasGlutenPlan =
executedPlan.find(_.isInstanceOf[GlutenPlan]).isDefined
- if (!hasGlutenPlan) {
- isSupported = false
- break
- }
- break
- }
- }
- if (isSupported && !needInspectFunctions.contains(exprInfo.getName)) {
- supportedFunctions.add(exprInfo.getName)
- } else if (!isSupported) {
- unsupportedFunctions.add(exprInfo.getName)
- }
- }
- }
- // scalastyle:off println
- println("Overall functions: " + (sparkBuiltInFunctions.size -
ignoreFunctions.size))
- println("Supported functions: " + supportedFunctions.size())
- println("Unsupported functions: " + unsupportedFunctions.size())
- println("Need inspect functions: " + needInspectFunctions.size())
- // scalastyle:on println
- // For correction.
- val supportedCastAliasFunctions = Seq(
- "boolean",
- "tinyint",
- "smallint",
- "int",
- "bigint",
- "float",
- "double",
- "decimal",
- "date",
- "binary",
- "string")
- for (func <- supportedCastAliasFunctions) {
- if (needInspectFunctions.contains(func)) {
- needInspectFunctions.remove(func)
- supportedFunctions.add(func)
- }
- }
-
- // For wrongly recognized unsupported case.
- Seq("%", "ceil", "floor", "first", "first_value", "last", "last_value",
"hash", "mod").foreach(
- name => {
- if (unsupportedFunctions.remove(name)) {
- supportedFunctions.add(name)
- }
- })
- // For wrongly recognized supported case.
- Seq(
- "array_contains",
- "map_keys",
- "get_json_object",
- "element_at",
- "map_from_arrays",
- "contains",
- "startswith",
- "endswith",
- "map_contains_key",
- "map_values",
- "try_element_at",
- "struct",
- "array",
- "ilike",
- "sec",
- "csc"
- ).foreach(
- name => {
- if (supportedFunctions.remove(name)) {
- unsupportedFunctions.add(name)
- }
- })
- // Functions in needInspectFunctions were checked.
- unsupportedFunctions.addAll(needInspectFunctions)
- // scalastyle:off println
- println("---------------")
- println("Overall functions: " + (sparkBuiltInFunctions.size -
ignoreFunctions.size))
- println("Supported functions corrected: " + supportedFunctions.size())
- println("Unsupported functions corrected: " + unsupportedFunctions.size())
- println("Support list:")
- println(supportedFunctions)
- println("Not support list:")
- println(unsupportedFunctions)
- // scalastyle:on println
- }
-}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]