(gluten) branch main updated: [MINOR][TEST] Remove SparkFunctionStatistics (#12136)

philo Tue, 26 May 2026 16:10:24 -0700

This is an automated email from the ASF dual-hosted git repository.

philo-he pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/gluten.git



The following commit(s) were added to refs/heads/main by this push:
     new b4425283ec [MINOR][TEST] Remove SparkFunctionStatistics (#12136)
b4425283ec is described below

commit b4425283ec83ca9ae6a7d09c87cddaf249b98ef7
Author: Philo He <[email protected]>
AuthorDate: Tue May 26 16:09:06 2026 -0700

    [MINOR][TEST] Remove SparkFunctionStatistics (#12136)
---
 .../utils/clickhouse/ClickHouseTestSettings.scala  |   2 -
 .../sql/statistics/SparkFunctionStatistics.scala   | 220 ---------------------
 .../utils/clickhouse/ClickHouseTestSettings.scala  |   2 -
 .../sql/statistics/SparkFunctionStatistics.scala   | 218 --------------------
 .../sql/statistics/SparkFunctionStatistics.scala   | 218 --------------------
 .../sql/statistics/SparkFunctionStatistics.scala   | 218 --------------------
 .../sql/statistics/SparkFunctionStatistics.scala   | 218 --------------------
 7 files changed, 1096 deletions(-)

diff --git a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
index 075e310ec7..01177248c0 100644
--- a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
+++ b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
@@ -40,7 +40,6 @@ import org.apache.spark.sql.extension.GlutenSessionExtensionSuite
 import org.apache.spark.sql.gluten.GlutenFallbackSuite
 import org.apache.spark.sql.hive.execution.GlutenHiveSQLQueryCHSuite
 import org.apache.spark.sql.sources._
-import org.apache.spark.sql.statistics.SparkFunctionStatistics
 
 // Some settings' line length exceeds 100
 // scalastyle:off line.size.limit
@@ -1875,7 +1874,6 @@ class ClickHouseTestSettings extends BackendTestSettings {
     .exclude(
       "SELECT structFieldSimple.key, arrayFieldSimple[1] FROM tableWithSchema a where int_Field=1")
     .exclude("SELECT structFieldComplex.Value.`value_(2)` FROM tableWithSchema")
-  enableSuite[SparkFunctionStatistics]
   enableSuite[GlutenImplicitsTest]
     .excludeGlutenTest("fallbackSummary with shuffle")
     .excludeGlutenTest("fallbackSummary with cache")
diff --git a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/statistics/SparkFunctionStatistics.scala b/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/statistics/SparkFunctionStatistics.scala
deleted file mode 100644
index 5723cb1161..0000000000
--- a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/statistics/SparkFunctionStatistics.scala
+++ /dev/null
@@ -1,220 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.spark.sql.statistics
-
-import org.apache.gluten.config.GlutenConfig
-import org.apache.gluten.execution.GlutenPlan
-import org.apache.gluten.utils.BackendTestUtils
-
-import org.apache.spark.sql.{GlutenTestConstants, QueryTest, SparkSession}
-import org.apache.spark.sql.catalyst.analysis.FunctionRegistry
-import org.apache.spark.sql.catalyst.optimizer.{ConstantFolding, ConvertToLocalRelation, NullPropagation}
-import org.apache.spark.sql.execution.{ProjectExec, SparkPlan}
-import org.apache.spark.sql.internal.SQLConf
-
-import scala.util.control.Breaks.{break, breakable}
-
-/**
- * TODO: There are some false positive & false negative cases for some functions. For such
- * situation, we need to use a suitable test sql to do the check.
- */
-class SparkFunctionStatistics extends QueryTest {
-
-  var spark: SparkSession = null
-
-  protected def initializeSession(): Unit = {
-    if (spark == null) {
-      val sparkBuilder = SparkSession
-        .builder()
-        .appName("Gluten-UT")
-        .master(s"local[2]")
-        // Avoid static evaluation for literal input by spark catalyst.
-        .config(
-          SQLConf.OPTIMIZER_EXCLUDED_RULES.key,
-          ConvertToLocalRelation.ruleName +
-            "," + ConstantFolding.ruleName + "," + NullPropagation.ruleName)
-        .config("spark.driver.memory", "1G")
-        .config("spark.sql.adaptive.enabled", "true")
-        .config("spark.sql.shuffle.partitions", "1")
-        .config("spark.sql.files.maxPartitionBytes", "134217728")
-        .config("spark.memory.offHeap.enabled", "true")
-        .config("spark.memory.offHeap.size", "1024MB")
-        .config("spark.plugins", "org.apache.gluten.GlutenPlugin")
-        .config("spark.shuffle.manager", "org.apache.spark.shuffle.sort.ColumnarShuffleManager")
-        // Avoid the code size overflow error in Spark code generation.
-        .config("spark.sql.codegen.wholeStage", "false")
-
-      spark = if (BackendTestUtils.isCHBackendLoaded()) {
-        sparkBuilder
-          .config("spark.io.compression.codec", "LZ4")
-          .config("spark.gluten.sql.columnar.backend.ch.worker.id", "1")
-          .config(GlutenConfig.NATIVE_VALIDATION_ENABLED.key, "false")
-          .config("spark.sql.files.openCostInBytes", "134217728")
-          .config("spark.unsafe.exceptionOnMemoryLeak", "true")
-          .getOrCreate()
-      } else {
-        sparkBuilder
-          .config("spark.unsafe.exceptionOnMemoryLeak", "true")
-          .getOrCreate()
-      }
-    }
-  }
-
-  def extractQuery(examples: String): Seq[String] = {
-    examples
-      .split("\n")
-      .map(_.trim)
-      .filter(!_.isEmpty)
-      .filter(_.startsWith("> SELECT"))
-      .map(_.replace("> SELECT", "SELECT"))
-  }
-
-  test(GlutenTestConstants.GLUTEN_TEST + "Run spark function statistics: ") {
-    initializeSession
-    val functionRegistry = spark.sessionState.functionRegistry
-    val sparkBuiltInFunctions = functionRegistry.listFunction()
-    // According to expressionsForTimestampNTZSupport in FunctionRegistry.scala,
-    // these functions are registered only for testing, not available for end users.
-    // Other functions like current_database is NOT necessarily offloaded to native.
-    val ignoreFunctions = FunctionRegistry.expressionsForTimestampNTZSupport.keySet ++
-      Seq(
-        "get_fake_app_name",
-        "current_catalog",
-        "current_database",
-        "spark_partition_id",
-        "current_user",
-        "current_timezone")
-    val supportedFunctions = new java.util.ArrayList[String]()
-    val unsupportedFunctions = new java.util.ArrayList[String]()
-    val needInspectFunctions = new java.util.ArrayList[String]()
-
-    for (func <- sparkBuiltInFunctions) {
-      val exprInfo = functionRegistry.lookupFunction(func).get
-      if (!ignoreFunctions.contains(exprInfo.getName)) {
-        val examples = extractQuery(exprInfo.getExamples)
-        if (examples.isEmpty) {
-          needInspectFunctions.add(exprInfo.getName)
-          // scalastyle:off println
-          println("## Not found examples for " + exprInfo.getName)
-          // scalastyle:on println
-        }
-        var isSupported: Boolean = true
-        breakable {
-          for (example <- examples) {
-            var executedPlan: SparkPlan = null
-            try {
-              executedPlan = spark.sql(example).queryExecution.executedPlan
-            } catch {
-              case t: Throwable =>
-                needInspectFunctions.add(exprInfo.getName)
-                // scalastyle:off println
-                println("-- Need inspect " + exprInfo.getName)
-                println(exprInfo.getExamples)
-                // scalastyle:on println
-                break
-            }
-            val hasFallbackProject = executedPlan.find(_.isInstanceOf[ProjectExec]).isDefined
-            if (hasFallbackProject) {
-              isSupported = false
-              break
-            }
-            val hasGlutenPlan = executedPlan.find(_.isInstanceOf[GlutenPlan]).isDefined
-            if (!hasGlutenPlan) {
-              isSupported = false
-              break
-            }
-            break
-          }
-        }
-        if (isSupported && !needInspectFunctions.contains(exprInfo.getName)) {
-          supportedFunctions.add(exprInfo.getName)
-        } else if (!isSupported) {
-          unsupportedFunctions.add(exprInfo.getName)
-        }
-      }
-    }
-    // scalastyle:off println
-    println("Overall functions: " + (sparkBuiltInFunctions.size - ignoreFunctions.size))
-    println("Supported functions: " + supportedFunctions.size())
-    println("Unsupported functions: " + unsupportedFunctions.size())
-    println("Need inspect functions: " + needInspectFunctions.size())
-    // scalastyle:on println
-    // For correction.
-    val supportedCastAliasFunctions = Seq(
-      "boolean",
-      "tinyint",
-      "smallint",
-      "int",
-      "bigint",
-      "float",
-      "double",
-      "decimal",
-      "date",
-      "binary",
-      "string")
-    for (func <- supportedCastAliasFunctions) {
-      if (needInspectFunctions.contains(func)) {
-        needInspectFunctions.remove(func)
-        supportedFunctions.add(func)
-      }
-    }
-
-    // For wrongly recognized unsupported case.
-    Seq("%", "ceil", "floor", "first", "first_value", "last", "last_value", "hash", "mod").foreach(
-      name => {
-        if (unsupportedFunctions.remove(name)) {
-          supportedFunctions.add(name)
-        }
-      })
-    // For wrongly recognized supported case.
-    Seq(
-      "array_contains",
-      "map_keys",
-      "get_json_object",
-      "element_at",
-      "map_from_arrays",
-      "contains",
-      "startswith",
-      "endswith",
-      "map_contains_key",
-      "map_values",
-      "try_element_at",
-      "struct",
-      "array",
-      "ilike",
-      "sec",
-      "csc"
-    ).foreach(
-      name => {
-        if (supportedFunctions.remove(name)) {
-          unsupportedFunctions.add(name)
-        }
-      })
-    // Functions in needInspectFunctions were checked.
-    unsupportedFunctions.addAll(needInspectFunctions)
-    // scalastyle:off println
-    println("---------------")
-    println("Overall functions: " + (sparkBuiltInFunctions.size - ignoreFunctions.size))
-    println("Supported functions corrected: " + supportedFunctions.size())
-    println("Unsupported functions corrected: " + unsupportedFunctions.size())
-    println("Support list:")
-    println(supportedFunctions)
-    println("Not support list:")
-    println(unsupportedFunctions)
-    // scalastyle:on println
-  }
-}
diff --git a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
index 7b3fc556fb..485c3acd29 100644
--- a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
+++ b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala
@@ -40,7 +40,6 @@ import org.apache.spark.sql.extension.GlutenSessionExtensionSuite
 import org.apache.spark.sql.gluten.GlutenFallbackSuite
 import org.apache.spark.sql.hive.execution.GlutenHiveSQLQueryCHSuite
 import org.apache.spark.sql.sources._
-import org.apache.spark.sql.statistics.SparkFunctionStatistics
 
 // Some settings' line length exceeds 100
 // scalastyle:off line.size.limit
@@ -1738,7 +1737,6 @@ class ClickHouseTestSettings extends BackendTestSettings {
     .exclude(
       "SELECT structFieldSimple.key, arrayFieldSimple[1] FROM tableWithSchema a where int_Field=1")
     .exclude("SELECT structFieldComplex.Value.`value_(2)` FROM tableWithSchema")
-  enableSuite[SparkFunctionStatistics]
   enableSuite[GlutenSparkSessionExtensionSuite]
   enableSuite[GlutenHiveSQLQueryCHSuite]
   enableSuite[GlutenPercentileSuite]
diff --git a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/statistics/SparkFunctionStatistics.scala b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/statistics/SparkFunctionStatistics.scala
deleted file mode 100644
index 5bf53e66e6..0000000000
--- a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/statistics/SparkFunctionStatistics.scala
+++ /dev/null
@@ -1,218 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.spark.sql.statistics
-
-import org.apache.gluten.config.GlutenConfig
-import org.apache.gluten.execution.GlutenPlan
-import org.apache.gluten.utils.BackendTestUtils
-
-import org.apache.spark.sql.{GlutenTestConstants, QueryTest, SparkSession}
-import org.apache.spark.sql.catalyst.optimizer.{ConstantFolding, 
ConvertToLocalRelation, NullPropagation}
-import org.apache.spark.sql.execution.{ProjectExec, SparkPlan}
-import org.apache.spark.sql.internal.SQLConf
-
-import scala.util.control.Breaks.{break, breakable}
-
-/**
- * TODO: There are some false positive & false negative cases for some 
functions. For such
- * situation, we need to use a suitable test sql to do the check.
- */
-class SparkFunctionStatistics extends QueryTest {
-
-  var spark: SparkSession = null
-
-  protected def initializeSession(): Unit = {
-    if (spark == null) {
-      val sparkBuilder = SparkSession
-        .builder()
-        .appName("Gluten-UT")
-        .master(s"local[2]")
-        // Avoid static evaluation for literal input by spark catalyst.
-        .config(
-          SQLConf.OPTIMIZER_EXCLUDED_RULES.key,
-          ConvertToLocalRelation.ruleName +
-            "," + ConstantFolding.ruleName + "," + NullPropagation.ruleName)
-        .config("spark.driver.memory", "1G")
-        .config("spark.sql.adaptive.enabled", "true")
-        .config("spark.sql.shuffle.partitions", "1")
-        .config("spark.sql.files.maxPartitionBytes", "134217728")
-        .config("spark.memory.offHeap.enabled", "true")
-        .config("spark.memory.offHeap.size", "1024MB")
-        .config("spark.plugins", "org.apache.gluten.GlutenPlugin")
-        .config("spark.shuffle.manager", 
"org.apache.spark.shuffle.sort.ColumnarShuffleManager")
-        // Avoid the code size overflow error in Spark code generation.
-        .config("spark.sql.codegen.wholeStage", "false")
-
-      spark = if (BackendTestUtils.isCHBackendLoaded()) {
-        sparkBuilder
-          .config("spark.io.compression.codec", "LZ4")
-          .config("spark.gluten.sql.columnar.backend.ch.worker.id", "1")
-          .config(GlutenConfig.NATIVE_VALIDATION_ENABLED.key, "false")
-          .config("spark.sql.files.openCostInBytes", "134217728")
-          .config("spark.unsafe.exceptionOnMemoryLeak", "true")
-          .getOrCreate()
-      } else {
-        sparkBuilder
-          .config("spark.unsafe.exceptionOnMemoryLeak", "true")
-          .getOrCreate()
-      }
-    }
-  }
-
-  def extractQuery(examples: String): Seq[String] = {
-    examples
-      .split("\n")
-      .map(_.trim)
-      .filter(!_.isEmpty)
-      .filter(_.startsWith("> SELECT"))
-      .map(_.replace("> SELECT", "SELECT"))
-  }
-
-  test(GlutenTestConstants.GLUTEN_TEST + "Run spark function statistics: ") {
-    initializeSession
-    val functionRegistry = spark.sessionState.functionRegistry
-    val sparkBuiltInFunctions = functionRegistry.listFunction()
-    // According to expressionsForTimestampNTZSupport in 
FunctionRegistry.scala,
-    // these functions are registered only for testing, not available for end 
users.
-    // Other functions like current_database is NOT necessarily offloaded to 
native.
-    val ignoreFunctions = Seq(
-      "get_fake_app_name",
-      "current_catalog",
-      "current_database",
-      "spark_partition_id",
-      "current_user",
-      "current_timezone")
-    val supportedFunctions = new java.util.ArrayList[String]()
-    val unsupportedFunctions = new java.util.ArrayList[String]()
-    val needInspectFunctions = new java.util.ArrayList[String]()
-
-    for (func <- sparkBuiltInFunctions) {
-      val exprInfo = functionRegistry.lookupFunction(func).get
-      if (!ignoreFunctions.contains(exprInfo.getName)) {
-        val examples = extractQuery(exprInfo.getExamples)
-        if (examples.isEmpty) {
-          needInspectFunctions.add(exprInfo.getName)
-          // scalastyle:off println
-          println("## Not found examples for " + exprInfo.getName)
-          // scalastyle:on println
-        }
-        var isSupported: Boolean = true
-        breakable {
-          for (example <- examples) {
-            var executedPlan: SparkPlan = null
-            try {
-              executedPlan = spark.sql(example).queryExecution.executedPlan
-            } catch {
-              case t: Throwable =>
-                needInspectFunctions.add(exprInfo.getName)
-                // scalastyle:off println
-                println("-- Need inspect " + exprInfo.getName)
-                println(exprInfo.getExamples)
-                // scalastyle:on println
-                break
-            }
-            val hasFallbackProject = 
executedPlan.find(_.isInstanceOf[ProjectExec]).isDefined
-            if (hasFallbackProject) {
-              isSupported = false
-              break
-            }
-            val hasGlutenPlan = 
executedPlan.find(_.isInstanceOf[GlutenPlan]).isDefined
-            if (!hasGlutenPlan) {
-              isSupported = false
-              break
-            }
-            break
-          }
-        }
-        if (isSupported && !needInspectFunctions.contains(exprInfo.getName)) {
-          supportedFunctions.add(exprInfo.getName)
-        } else if (!isSupported) {
-          unsupportedFunctions.add(exprInfo.getName)
-        }
-      }
-    }
-    // scalastyle:off println
-    println("Overall functions: " + (sparkBuiltInFunctions.size - 
ignoreFunctions.size))
-    println("Supported functions: " + supportedFunctions.size())
-    println("Unsupported functions: " + unsupportedFunctions.size())
-    println("Need inspect functions: " + needInspectFunctions.size())
-    // scalastyle:on println
-    // For correction.
-    val supportedCastAliasFunctions = Seq(
-      "boolean",
-      "tinyint",
-      "smallint",
-      "int",
-      "bigint",
-      "float",
-      "double",
-      "decimal",
-      "date",
-      "binary",
-      "string")
-    for (func <- supportedCastAliasFunctions) {
-      if (needInspectFunctions.contains(func)) {
-        needInspectFunctions.remove(func)
-        supportedFunctions.add(func)
-      }
-    }
-
-    // For wrongly recognized unsupported case.
-    Seq("%", "ceil", "floor", "first", "first_value", "last", "last_value", 
"hash", "mod").foreach(
-      name => {
-        if (unsupportedFunctions.remove(name)) {
-          supportedFunctions.add(name)
-        }
-      })
-    // For wrongly recognized supported case.
-    Seq(
-      "array_contains",
-      "map_keys",
-      "get_json_object",
-      "element_at",
-      "map_from_arrays",
-      "contains",
-      "startswith",
-      "endswith",
-      "map_contains_key",
-      "map_values",
-      "try_element_at",
-      "struct",
-      "array",
-      "ilike",
-      "sec",
-      "csc"
-    ).foreach(
-      name => {
-        if (supportedFunctions.remove(name)) {
-          unsupportedFunctions.add(name)
-        }
-      })
-    // Functions in needInspectFunctions were checked.
-    unsupportedFunctions.addAll(needInspectFunctions)
-    // scalastyle:off println
-    println("---------------")
-    println("Overall functions: " + (sparkBuiltInFunctions.size - 
ignoreFunctions.size))
-    println("Supported functions corrected: " + supportedFunctions.size())
-    println("Unsupported functions corrected: " + unsupportedFunctions.size())
-    println("Support list:")
-    println(supportedFunctions)
-    println("Not support list:")
-    println(unsupportedFunctions)
-    // scalastyle:on println
-  }
-}
diff --git a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/statistics/SparkFunctionStatistics.scala b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/statistics/SparkFunctionStatistics.scala
deleted file mode 100644
index 5bf53e66e6..0000000000
--- a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/statistics/SparkFunctionStatistics.scala
+++ /dev/null
@@ -1,218 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.spark.sql.statistics
-
-import org.apache.gluten.config.GlutenConfig
-import org.apache.gluten.execution.GlutenPlan
-import org.apache.gluten.utils.BackendTestUtils
-
-import org.apache.spark.sql.{GlutenTestConstants, QueryTest, SparkSession}
-import org.apache.spark.sql.catalyst.optimizer.{ConstantFolding, 
ConvertToLocalRelation, NullPropagation}
-import org.apache.spark.sql.execution.{ProjectExec, SparkPlan}
-import org.apache.spark.sql.internal.SQLConf
-
-import scala.util.control.Breaks.{break, breakable}
-
-/**
- * TODO: There are some false positive & false negative cases for some 
functions. For such
- * situation, we need to use a suitable test sql to do the check.
- */
-class SparkFunctionStatistics extends QueryTest {
-
-  var spark: SparkSession = null
-
-  protected def initializeSession(): Unit = {
-    if (spark == null) {
-      val sparkBuilder = SparkSession
-        .builder()
-        .appName("Gluten-UT")
-        .master(s"local[2]")
-        // Avoid static evaluation for literal input by spark catalyst.
-        .config(
-          SQLConf.OPTIMIZER_EXCLUDED_RULES.key,
-          ConvertToLocalRelation.ruleName +
-            "," + ConstantFolding.ruleName + "," + NullPropagation.ruleName)
-        .config("spark.driver.memory", "1G")
-        .config("spark.sql.adaptive.enabled", "true")
-        .config("spark.sql.shuffle.partitions", "1")
-        .config("spark.sql.files.maxPartitionBytes", "134217728")
-        .config("spark.memory.offHeap.enabled", "true")
-        .config("spark.memory.offHeap.size", "1024MB")
-        .config("spark.plugins", "org.apache.gluten.GlutenPlugin")
-        .config("spark.shuffle.manager", 
"org.apache.spark.shuffle.sort.ColumnarShuffleManager")
-        // Avoid the code size overflow error in Spark code generation.
-        .config("spark.sql.codegen.wholeStage", "false")
-
-      spark = if (BackendTestUtils.isCHBackendLoaded()) {
-        sparkBuilder
-          .config("spark.io.compression.codec", "LZ4")
-          .config("spark.gluten.sql.columnar.backend.ch.worker.id", "1")
-          .config(GlutenConfig.NATIVE_VALIDATION_ENABLED.key, "false")
-          .config("spark.sql.files.openCostInBytes", "134217728")
-          .config("spark.unsafe.exceptionOnMemoryLeak", "true")
-          .getOrCreate()
-      } else {
-        sparkBuilder
-          .config("spark.unsafe.exceptionOnMemoryLeak", "true")
-          .getOrCreate()
-      }
-    }
-  }
-
-  def extractQuery(examples: String): Seq[String] = {
-    examples
-      .split("\n")
-      .map(_.trim)
-      .filter(!_.isEmpty)
-      .filter(_.startsWith("> SELECT"))
-      .map(_.replace("> SELECT", "SELECT"))
-  }
-
-  test(GlutenTestConstants.GLUTEN_TEST + "Run spark function statistics: ") {
-    initializeSession
-    val functionRegistry = spark.sessionState.functionRegistry
-    val sparkBuiltInFunctions = functionRegistry.listFunction()
-    // According to expressionsForTimestampNTZSupport in 
FunctionRegistry.scala,
-    // these functions are registered only for testing, not available for end 
users.
-    // Other functions like current_database is NOT necessarily offloaded to 
native.
-    val ignoreFunctions = Seq(
-      "get_fake_app_name",
-      "current_catalog",
-      "current_database",
-      "spark_partition_id",
-      "current_user",
-      "current_timezone")
-    val supportedFunctions = new java.util.ArrayList[String]()
-    val unsupportedFunctions = new java.util.ArrayList[String]()
-    val needInspectFunctions = new java.util.ArrayList[String]()
-
-    for (func <- sparkBuiltInFunctions) {
-      val exprInfo = functionRegistry.lookupFunction(func).get
-      if (!ignoreFunctions.contains(exprInfo.getName)) {
-        val examples = extractQuery(exprInfo.getExamples)
-        if (examples.isEmpty) {
-          needInspectFunctions.add(exprInfo.getName)
-          // scalastyle:off println
-          println("## Not found examples for " + exprInfo.getName)
-          // scalastyle:on println
-        }
-        var isSupported: Boolean = true
-        breakable {
-          for (example <- examples) {
-            var executedPlan: SparkPlan = null
-            try {
-              executedPlan = spark.sql(example).queryExecution.executedPlan
-            } catch {
-              case t: Throwable =>
-                needInspectFunctions.add(exprInfo.getName)
-                // scalastyle:off println
-                println("-- Need inspect " + exprInfo.getName)
-                println(exprInfo.getExamples)
-                // scalastyle:on println
-                break
-            }
-            val hasFallbackProject = 
executedPlan.find(_.isInstanceOf[ProjectExec]).isDefined
-            if (hasFallbackProject) {
-              isSupported = false
-              break
-            }
-            val hasGlutenPlan = 
executedPlan.find(_.isInstanceOf[GlutenPlan]).isDefined
-            if (!hasGlutenPlan) {
-              isSupported = false
-              break
-            }
-            break
-          }
-        }
-        if (isSupported && !needInspectFunctions.contains(exprInfo.getName)) {
-          supportedFunctions.add(exprInfo.getName)
-        } else if (!isSupported) {
-          unsupportedFunctions.add(exprInfo.getName)
-        }
-      }
-    }
-    // scalastyle:off println
-    println("Overall functions: " + (sparkBuiltInFunctions.size - 
ignoreFunctions.size))
-    println("Supported functions: " + supportedFunctions.size())
-    println("Unsupported functions: " + unsupportedFunctions.size())
-    println("Need inspect functions: " + needInspectFunctions.size())
-    // scalastyle:on println
-    // For correction.
-    val supportedCastAliasFunctions = Seq(
-      "boolean",
-      "tinyint",
-      "smallint",
-      "int",
-      "bigint",
-      "float",
-      "double",
-      "decimal",
-      "date",
-      "binary",
-      "string")
-    for (func <- supportedCastAliasFunctions) {
-      if (needInspectFunctions.contains(func)) {
-        needInspectFunctions.remove(func)
-        supportedFunctions.add(func)
-      }
-    }
-
-    // For wrongly recognized unsupported case.
-    Seq("%", "ceil", "floor", "first", "first_value", "last", "last_value", 
"hash", "mod").foreach(
-      name => {
-        if (unsupportedFunctions.remove(name)) {
-          supportedFunctions.add(name)
-        }
-      })
-    // For wrongly recognized supported case.
-    Seq(
-      "array_contains",
-      "map_keys",
-      "get_json_object",
-      "element_at",
-      "map_from_arrays",
-      "contains",
-      "startswith",
-      "endswith",
-      "map_contains_key",
-      "map_values",
-      "try_element_at",
-      "struct",
-      "array",
-      "ilike",
-      "sec",
-      "csc"
-    ).foreach(
-      name => {
-        if (supportedFunctions.remove(name)) {
-          unsupportedFunctions.add(name)
-        }
-      })
-    // Functions in needInspectFunctions were checked.
-    unsupportedFunctions.addAll(needInspectFunctions)
-    // scalastyle:off println
-    println("---------------")
-    println("Overall functions: " + (sparkBuiltInFunctions.size - 
ignoreFunctions.size))
-    println("Supported functions corrected: " + supportedFunctions.size())
-    println("Unsupported functions corrected: " + unsupportedFunctions.size())
-    println("Support list:")
-    println(supportedFunctions)
-    println("Not support list:")
-    println(unsupportedFunctions)
-    // scalastyle:on println
-  }
-}
diff --git a/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/statistics/SparkFunctionStatistics.scala b/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/statistics/SparkFunctionStatistics.scala
deleted file mode 100644
index 5bf53e66e6..0000000000
--- a/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/statistics/SparkFunctionStatistics.scala
+++ /dev/null
@@ -1,218 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.spark.sql.statistics
-
-import org.apache.gluten.config.GlutenConfig
-import org.apache.gluten.execution.GlutenPlan
-import org.apache.gluten.utils.BackendTestUtils
-
-import org.apache.spark.sql.{GlutenTestConstants, QueryTest, SparkSession}
-import org.apache.spark.sql.catalyst.optimizer.{ConstantFolding, ConvertToLocalRelation, NullPropagation}
-import org.apache.spark.sql.execution.{ProjectExec, SparkPlan}
-import org.apache.spark.sql.internal.SQLConf
-
-import scala.util.control.Breaks.{break, breakable}
-
-/**
- * TODO: There are some false positive & false negative cases for some functions. For such
- * situation, we need to use a suitable test sql to do the check.
- */
-class SparkFunctionStatistics extends QueryTest {
-
-  var spark: SparkSession = null
-
-  protected def initializeSession(): Unit = {
-    if (spark == null) {
-      val sparkBuilder = SparkSession
-        .builder()
-        .appName("Gluten-UT")
-        .master(s"local[2]")
-        // Avoid static evaluation for literal input by spark catalyst.
-        .config(
-          SQLConf.OPTIMIZER_EXCLUDED_RULES.key,
-          ConvertToLocalRelation.ruleName +
-            "," + ConstantFolding.ruleName + "," + NullPropagation.ruleName)
-        .config("spark.driver.memory", "1G")
-        .config("spark.sql.adaptive.enabled", "true")
-        .config("spark.sql.shuffle.partitions", "1")
-        .config("spark.sql.files.maxPartitionBytes", "134217728")
-        .config("spark.memory.offHeap.enabled", "true")
-        .config("spark.memory.offHeap.size", "1024MB")
-        .config("spark.plugins", "org.apache.gluten.GlutenPlugin")
-        .config("spark.shuffle.manager", "org.apache.spark.shuffle.sort.ColumnarShuffleManager")
-        // Avoid the code size overflow error in Spark code generation.
-        .config("spark.sql.codegen.wholeStage", "false")
-
-      spark = if (BackendTestUtils.isCHBackendLoaded()) {
-        sparkBuilder
-          .config("spark.io.compression.codec", "LZ4")
-          .config("spark.gluten.sql.columnar.backend.ch.worker.id", "1")
-          .config(GlutenConfig.NATIVE_VALIDATION_ENABLED.key, "false")
-          .config("spark.sql.files.openCostInBytes", "134217728")
-          .config("spark.unsafe.exceptionOnMemoryLeak", "true")
-          .getOrCreate()
-      } else {
-        sparkBuilder
-          .config("spark.unsafe.exceptionOnMemoryLeak", "true")
-          .getOrCreate()
-      }
-    }
-  }
-
-  def extractQuery(examples: String): Seq[String] = {
-    examples
-      .split("\n")
-      .map(_.trim)
-      .filter(!_.isEmpty)
-      .filter(_.startsWith("> SELECT"))
-      .map(_.replace("> SELECT", "SELECT"))
-  }
-
-  test(GlutenTestConstants.GLUTEN_TEST + "Run spark function statistics: ") {
-    initializeSession
-    val functionRegistry = spark.sessionState.functionRegistry
-    val sparkBuiltInFunctions = functionRegistry.listFunction()
-    // According to expressionsForTimestampNTZSupport in FunctionRegistry.scala,
-    // these functions are registered only for testing, not available for end users.
-    // Other functions like current_database is NOT necessarily offloaded to native.
-    val ignoreFunctions = Seq(
-      "get_fake_app_name",
-      "current_catalog",
-      "current_database",
-      "spark_partition_id",
-      "current_user",
-      "current_timezone")
-    val supportedFunctions = new java.util.ArrayList[String]()
-    val unsupportedFunctions = new java.util.ArrayList[String]()
-    val needInspectFunctions = new java.util.ArrayList[String]()
-
-    for (func <- sparkBuiltInFunctions) {
-      val exprInfo = functionRegistry.lookupFunction(func).get
-      if (!ignoreFunctions.contains(exprInfo.getName)) {
-        val examples = extractQuery(exprInfo.getExamples)
-        if (examples.isEmpty) {
-          needInspectFunctions.add(exprInfo.getName)
-          // scalastyle:off println
-          println("## Not found examples for " + exprInfo.getName)
-          // scalastyle:on println
-        }
-        var isSupported: Boolean = true
-        breakable {
-          for (example <- examples) {
-            var executedPlan: SparkPlan = null
-            try {
-              executedPlan = spark.sql(example).queryExecution.executedPlan
-            } catch {
-              case t: Throwable =>
-                needInspectFunctions.add(exprInfo.getName)
-                // scalastyle:off println
-                println("-- Need inspect " + exprInfo.getName)
-                println(exprInfo.getExamples)
-                // scalastyle:on println
-                break
-            }
-            val hasFallbackProject = executedPlan.find(_.isInstanceOf[ProjectExec]).isDefined
-            if (hasFallbackProject) {
-              isSupported = false
-              break
-            }
-            val hasGlutenPlan = executedPlan.find(_.isInstanceOf[GlutenPlan]).isDefined
-            if (!hasGlutenPlan) {
-              isSupported = false
-              break
-            }
-            break
-          }
-        }
-        if (isSupported && !needInspectFunctions.contains(exprInfo.getName)) {
-          supportedFunctions.add(exprInfo.getName)
-        } else if (!isSupported) {
-          unsupportedFunctions.add(exprInfo.getName)
-        }
-      }
-    }
-    // scalastyle:off println
-    println("Overall functions: " + (sparkBuiltInFunctions.size - ignoreFunctions.size))
-    println("Supported functions: " + supportedFunctions.size())
-    println("Unsupported functions: " + unsupportedFunctions.size())
-    println("Need inspect functions: " + needInspectFunctions.size())
-    // scalastyle:on println
-    // For correction.
-    val supportedCastAliasFunctions = Seq(
-      "boolean",
-      "tinyint",
-      "smallint",
-      "int",
-      "bigint",
-      "float",
-      "double",
-      "decimal",
-      "date",
-      "binary",
-      "string")
-    for (func <- supportedCastAliasFunctions) {
-      if (needInspectFunctions.contains(func)) {
-        needInspectFunctions.remove(func)
-        supportedFunctions.add(func)
-      }
-    }
-
-    // For wrongly recognized unsupported case.
-    Seq("%", "ceil", "floor", "first", "first_value", "last", "last_value", "hash", "mod").foreach(
-      name => {
-        if (unsupportedFunctions.remove(name)) {
-          supportedFunctions.add(name)
-        }
-      })
-    // For wrongly recognized supported case.
-    Seq(
-      "array_contains",
-      "map_keys",
-      "get_json_object",
-      "element_at",
-      "map_from_arrays",
-      "contains",
-      "startswith",
-      "endswith",
-      "map_contains_key",
-      "map_values",
-      "try_element_at",
-      "struct",
-      "array",
-      "ilike",
-      "sec",
-      "csc"
-    ).foreach(
-      name => {
-        if (supportedFunctions.remove(name)) {
-          unsupportedFunctions.add(name)
-        }
-      })
-    // Functions in needInspectFunctions were checked.
-    unsupportedFunctions.addAll(needInspectFunctions)
-    // scalastyle:off println
-    println("---------------")
-    println("Overall functions: " + (sparkBuiltInFunctions.size - ignoreFunctions.size))
-    println("Supported functions corrected: " + supportedFunctions.size())
-    println("Unsupported functions corrected: " + unsupportedFunctions.size())
-    println("Support list:")
-    println(supportedFunctions)
-    println("Not support list:")
-    println(unsupportedFunctions)
-    // scalastyle:on println
-  }
-}
diff --git a/gluten-ut/spark41/src/test/scala/org/apache/spark/sql/statistics/SparkFunctionStatistics.scala b/gluten-ut/spark41/src/test/scala/org/apache/spark/sql/statistics/SparkFunctionStatistics.scala
deleted file mode 100644
index 5bf53e66e6..0000000000
--- a/gluten-ut/spark41/src/test/scala/org/apache/spark/sql/statistics/SparkFunctionStatistics.scala
+++ /dev/null
@@ -1,218 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.spark.sql.statistics
-
-import org.apache.gluten.config.GlutenConfig
-import org.apache.gluten.execution.GlutenPlan
-import org.apache.gluten.utils.BackendTestUtils
-
-import org.apache.spark.sql.{GlutenTestConstants, QueryTest, SparkSession}
-import org.apache.spark.sql.catalyst.optimizer.{ConstantFolding, ConvertToLocalRelation, NullPropagation}
-import org.apache.spark.sql.execution.{ProjectExec, SparkPlan}
-import org.apache.spark.sql.internal.SQLConf
-
-import scala.util.control.Breaks.{break, breakable}
-
-/**
- * TODO: There are some false positive & false negative cases for some functions. For such
- * situation, we need to use a suitable test sql to do the check.
- */
-class SparkFunctionStatistics extends QueryTest {
-
-  var spark: SparkSession = null
-
-  protected def initializeSession(): Unit = {
-    if (spark == null) {
-      val sparkBuilder = SparkSession
-        .builder()
-        .appName("Gluten-UT")
-        .master(s"local[2]")
-        // Avoid static evaluation for literal input by spark catalyst.
-        .config(
-          SQLConf.OPTIMIZER_EXCLUDED_RULES.key,
-          ConvertToLocalRelation.ruleName +
-            "," + ConstantFolding.ruleName + "," + NullPropagation.ruleName)
-        .config("spark.driver.memory", "1G")
-        .config("spark.sql.adaptive.enabled", "true")
-        .config("spark.sql.shuffle.partitions", "1")
-        .config("spark.sql.files.maxPartitionBytes", "134217728")
-        .config("spark.memory.offHeap.enabled", "true")
-        .config("spark.memory.offHeap.size", "1024MB")
-        .config("spark.plugins", "org.apache.gluten.GlutenPlugin")
-        .config("spark.shuffle.manager", "org.apache.spark.shuffle.sort.ColumnarShuffleManager")
-        // Avoid the code size overflow error in Spark code generation.
-        .config("spark.sql.codegen.wholeStage", "false")
-
-      spark = if (BackendTestUtils.isCHBackendLoaded()) {
-        sparkBuilder
-          .config("spark.io.compression.codec", "LZ4")
-          .config("spark.gluten.sql.columnar.backend.ch.worker.id", "1")
-          .config(GlutenConfig.NATIVE_VALIDATION_ENABLED.key, "false")
-          .config("spark.sql.files.openCostInBytes", "134217728")
-          .config("spark.unsafe.exceptionOnMemoryLeak", "true")
-          .getOrCreate()
-      } else {
-        sparkBuilder
-          .config("spark.unsafe.exceptionOnMemoryLeak", "true")
-          .getOrCreate()
-      }
-    }
-  }
-
-  def extractQuery(examples: String): Seq[String] = {
-    examples
-      .split("\n")
-      .map(_.trim)
-      .filter(!_.isEmpty)
-      .filter(_.startsWith("> SELECT"))
-      .map(_.replace("> SELECT", "SELECT"))
-  }
-
-  test(GlutenTestConstants.GLUTEN_TEST + "Run spark function statistics: ") {
-    initializeSession
-    val functionRegistry = spark.sessionState.functionRegistry
-    val sparkBuiltInFunctions = functionRegistry.listFunction()
-    // According to expressionsForTimestampNTZSupport in FunctionRegistry.scala,
-    // these functions are registered only for testing, not available for end users.
-    // Other functions like current_database is NOT necessarily offloaded to native.
-    val ignoreFunctions = Seq(
-      "get_fake_app_name",
-      "current_catalog",
-      "current_database",
-      "spark_partition_id",
-      "current_user",
-      "current_timezone")
-    val supportedFunctions = new java.util.ArrayList[String]()
-    val unsupportedFunctions = new java.util.ArrayList[String]()
-    val needInspectFunctions = new java.util.ArrayList[String]()
-
-    for (func <- sparkBuiltInFunctions) {
-      val exprInfo = functionRegistry.lookupFunction(func).get
-      if (!ignoreFunctions.contains(exprInfo.getName)) {
-        val examples = extractQuery(exprInfo.getExamples)
-        if (examples.isEmpty) {
-          needInspectFunctions.add(exprInfo.getName)
-          // scalastyle:off println
-          println("## Not found examples for " + exprInfo.getName)
-          // scalastyle:on println
-        }
-        var isSupported: Boolean = true
-        breakable {
-          for (example <- examples) {
-            var executedPlan: SparkPlan = null
-            try {
-              executedPlan = spark.sql(example).queryExecution.executedPlan
-            } catch {
-              case t: Throwable =>
-                needInspectFunctions.add(exprInfo.getName)
-                // scalastyle:off println
-                println("-- Need inspect " + exprInfo.getName)
-                println(exprInfo.getExamples)
-                // scalastyle:on println
-                break
-            }
-            val hasFallbackProject = executedPlan.find(_.isInstanceOf[ProjectExec]).isDefined
-            if (hasFallbackProject) {
-              isSupported = false
-              break
-            }
-            val hasGlutenPlan = executedPlan.find(_.isInstanceOf[GlutenPlan]).isDefined
-            if (!hasGlutenPlan) {
-              isSupported = false
-              break
-            }
-            break
-          }
-        }
-        if (isSupported && !needInspectFunctions.contains(exprInfo.getName)) {
-          supportedFunctions.add(exprInfo.getName)
-        } else if (!isSupported) {
-          unsupportedFunctions.add(exprInfo.getName)
-        }
-      }
-    }
-    // scalastyle:off println
-    println("Overall functions: " + (sparkBuiltInFunctions.size - ignoreFunctions.size))
-    println("Supported functions: " + supportedFunctions.size())
-    println("Unsupported functions: " + unsupportedFunctions.size())
-    println("Need inspect functions: " + needInspectFunctions.size())
-    // scalastyle:on println
-    // For correction.
-    val supportedCastAliasFunctions = Seq(
-      "boolean",
-      "tinyint",
-      "smallint",
-      "int",
-      "bigint",
-      "float",
-      "double",
-      "decimal",
-      "date",
-      "binary",
-      "string")
-    for (func <- supportedCastAliasFunctions) {
-      if (needInspectFunctions.contains(func)) {
-        needInspectFunctions.remove(func)
-        supportedFunctions.add(func)
-      }
-    }
-
-    // For wrongly recognized unsupported case.
-    Seq("%", "ceil", "floor", "first", "first_value", "last", "last_value", "hash", "mod").foreach(
-      name => {
-        if (unsupportedFunctions.remove(name)) {
-          supportedFunctions.add(name)
-        }
-      })
-    // For wrongly recognized supported case.
-    Seq(
-      "array_contains",
-      "map_keys",
-      "get_json_object",
-      "element_at",
-      "map_from_arrays",
-      "contains",
-      "startswith",
-      "endswith",
-      "map_contains_key",
-      "map_values",
-      "try_element_at",
-      "struct",
-      "array",
-      "ilike",
-      "sec",
-      "csc"
-    ).foreach(
-      name => {
-        if (supportedFunctions.remove(name)) {
-          unsupportedFunctions.add(name)
-        }
-      })
-    // Functions in needInspectFunctions were checked.
-    unsupportedFunctions.addAll(needInspectFunctions)
-    // scalastyle:off println
-    println("---------------")
-    println("Overall functions: " + (sparkBuiltInFunctions.size - ignoreFunctions.size))
-    println("Supported functions corrected: " + supportedFunctions.size())
-    println("Unsupported functions corrected: " + unsupportedFunctions.size())
-    println("Support list:")
-    println(supportedFunctions)
-    println("Not support list:")
-    println(unsupportedFunctions)
-    // scalastyle:on println
-  }
-}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(gluten) branch main updated: [MINOR][TEST] Remove SparkFunctionStatistics (#12136)

Reply via email to