This is an automated email from the ASF dual-hosted git repository.

agrove pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-comet.git
The following commit(s) were added to refs/heads/main by this push:
     new 06ed88b55 feat: Change default value of `COMET_NATIVE_SCAN_IMPL` to `auto` (#1933)
06ed88b55 is described below

commit 06ed88b55d4e05e35e2f25760016436042624932
Author: Andy Grove <agr...@apache.org>
AuthorDate: Fri Jun 27 18:32:23 2025 -0600

    feat: Change default value of `COMET_NATIVE_SCAN_IMPL` to `auto` (#1933)
---
 .github/workflows/spark_sql_test.yml             | 43 +++++++++++-
 .github/workflows/spark_sql_test_native_auto.yml | 81 ----------------------
 .../main/scala/org/apache/comet/CometConf.scala  |  2 +-
 .../org/apache/comet/rules/CometScanRule.scala   |  6 +-
 .../org/apache/comet/CometExpressionSuite.scala  | 12 +++-
 5 files changed, 56 insertions(+), 88 deletions(-)

diff --git a/.github/workflows/spark_sql_test.yml b/.github/workflows/spark_sql_test.yml
index 0b19d4d2a..5422012dd 100644
--- a/.github/workflows/spark_sql_test.yml
+++ b/.github/workflows/spark_sql_test.yml
@@ -40,7 +40,7 @@ env:
   RUST_VERSION: stable
 
 jobs:
-  spark-sql-native-comet:
+  spark-sql-auto-scan:
     strategy:
       matrix:
         os: [ubuntu-24.04]
@@ -75,7 +75,46 @@ jobs:
         run: |
           cd apache-spark
           rm -rf /root/.m2/repository/org/apache/parquet # somehow parquet cache requires cleanups
-          ENABLE_COMET=true ENABLE_COMET_SHUFFLE=true build/sbt ${{ matrix.module.args1 }} "${{ matrix.module.args2 }}"
+          ENABLE_COMET=true ENABLE_COMET_SHUFFLE=true build/sbt -Dsbt.log.noformat=true ${{ matrix.module.args1 }} "${{ matrix.module.args2 }}"
         env:
           LC_ALL: "C.UTF-8"
 
+  spark-sql-native-native-comet:
+    strategy:
+      matrix:
+        os: [ ubuntu-24.04 ]
+        java-version: [ 11 ]
+        spark-version: [ { short: '3.4', full: '3.4.3' }, { short: '3.5', full: '3.5.6' } ]
+        module:
+          - { name: "catalyst", args1: "catalyst/test", args2: "" }
+          - { name: "sql/core-1", args1: "", args2: sql/testOnly * -- -l org.apache.spark.tags.ExtendedSQLTest -l org.apache.spark.tags.SlowSQLTest }
+          - { name: "sql/core-2", args1: "", args2: "sql/testOnly * -- -n org.apache.spark.tags.ExtendedSQLTest" }
+          - { name: "sql/core-3", args1: "", args2: "sql/testOnly * -- -n org.apache.spark.tags.SlowSQLTest" }
+          - { name: "sql/hive-1", args1: "", args2: "hive/testOnly * -- -l org.apache.spark.tags.ExtendedHiveTest -l org.apache.spark.tags.SlowHiveTest" }
+          - { name: "sql/hive-2", args1: "", args2: "hive/testOnly * -- -n org.apache.spark.tags.ExtendedHiveTest" }
+          - { name: "sql/hive-3", args1: "", args2: "hive/testOnly * -- -n org.apache.spark.tags.SlowHiveTest" }
+      fail-fast: false
+    name: spark-sql-native-comet-${{ matrix.module.name }}/${{ matrix.os }}/spark-${{ matrix.spark-version.full }}/java-${{ matrix.java-version }}
+    runs-on: ${{ matrix.os }}
+    container:
+      image: amd64/rust
+    steps:
+      - uses: actions/checkout@v4
+      - name: Setup Rust & Java toolchain
+        uses: ./.github/actions/setup-builder
+        with:
+          rust-version: ${{env.RUST_VERSION}}
+          jdk-version: ${{ matrix.java-version }}
+      - name: Setup Spark
+        uses: ./.github/actions/setup-spark-builder
+        with:
+          spark-version: ${{ matrix.spark-version.full }}
+          spark-short-version: ${{ matrix.spark-version.short }}
+      - name: Run Spark tests
+        run: |
+          cd apache-spark
+          rm -rf /root/.m2/repository/org/apache/parquet # somehow parquet cache requires cleanups
+          ENABLE_COMET=true ENABLE_COMET_SHUFFLE=true COMET_PARQUET_SCAN_IMPL=native_comet build/sbt -Dsbt.log.noformat=true ${{ matrix.module.args1 }} "${{ matrix.module.args2 }}"
+        env:
+          LC_ALL: "C.UTF-8"
+
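A note on the matrix above: the `args2` entries shard each suite by ScalaTest tag, where `-l` excludes tests carrying a tag and `-n` runs only tests carrying it, so the `sql/core-*` and `sql/hive-*` jobs partition the suites between them. A small self-contained illustration of the tagging side (the suite, tag name, and test bodies are invented for this sketch, not taken from Spark or Comet):

    import org.scalatest.Tag
    import org.scalatest.funsuite.AnyFunSuite

    // Illustration of ScalaTest tag filtering as used by the matrix above.
    // Running `testOnly * -- -n demo.Slow` executes only the tagged test;
    // `testOnly * -- -l demo.Slow` excludes it. Names are hypothetical.
    object SlowTag extends Tag("demo.Slow")

    class ShardingDemoSuite extends AnyFunSuite {
      test("fast path") {
        assert(1 + 1 == 2)
      }
      test("slow path", SlowTag) {
        assert(Seq.fill(1000)(1).sum == 1000)
      }
    }
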
diff --git a/.github/workflows/spark_sql_test_native_auto.yml b/.github/workflows/spark_sql_test_native_auto.yml
deleted file mode 100644
index 5c5e7118f..000000000
--- a/.github/workflows/spark_sql_test_native_auto.yml
+++ /dev/null
@@ -1,81 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-name: Spark SQL Tests (native_auto)
-
-concurrency:
-  group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
-  cancel-in-progress: true
-
-on:
-  push:
-    paths-ignore:
-      - "doc/**"
-      - "docs/**"
-      - "**.md"
-  pull_request:
-    paths-ignore:
-      - "doc/**"
-      - "docs/**"
-      - "**.md"
-  # manual trigger
-  # https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow
-  workflow_dispatch:
-
-env:
-  RUST_VERSION: stable
-
-jobs:
-  spark-sql-catalyst-native-auto:
-    strategy:
-      matrix:
-        os: [ubuntu-24.04]
-        java-version: [11]
-        spark-version: [{short: '3.4', full: '3.4.3'}, {short: '3.5', full: '3.5.6'}]
-        module:
-          - {name: "catalyst", args1: "catalyst/test", args2: ""}
-          - {name: "sql/core-1", args1: "", args2: sql/testOnly * -- -l org.apache.spark.tags.ExtendedSQLTest -l org.apache.spark.tags.SlowSQLTest}
-          - {name: "sql/core-2", args1: "", args2: "sql/testOnly * -- -n org.apache.spark.tags.ExtendedSQLTest"}
-          - {name: "sql/core-3", args1: "", args2: "sql/testOnly * -- -n org.apache.spark.tags.SlowSQLTest"}
-          - {name: "sql/hive-1", args1: "", args2: "hive/testOnly * -- -l org.apache.spark.tags.ExtendedHiveTest -l org.apache.spark.tags.SlowHiveTest"}
-          - {name: "sql/hive-2", args1: "", args2: "hive/testOnly * -- -n org.apache.spark.tags.ExtendedHiveTest"}
-          - {name: "sql/hive-3", args1: "", args2: "hive/testOnly * -- -n org.apache.spark.tags.SlowHiveTest"}
-      fail-fast: false
-    name: spark-sql-native-auto-${{ matrix.module.name }}/${{ matrix.os }}/spark-${{ matrix.spark-version.full }}/java-${{ matrix.java-version }}
-    runs-on: ${{ matrix.os }}
-    container:
-      image: amd64/rust
-    steps:
-      - uses: actions/checkout@v4
-      - name: Setup Rust & Java toolchain
-        uses: ./.github/actions/setup-builder
-        with:
-          rust-version: ${{env.RUST_VERSION}}
-          jdk-version: ${{ matrix.java-version }}
-      - name: Setup Spark
-        uses: ./.github/actions/setup-spark-builder
-        with:
-          spark-version: ${{ matrix.spark-version.full }}
-          spark-short-version: ${{ matrix.spark-version.short }}
-      - name: Run Spark tests
-        run: |
-          cd apache-spark
-          rm -rf /root/.m2/repository/org/apache/parquet # somehow parquet cache requires cleanups
-          ENABLE_COMET=true ENABLE_COMET_SHUFFLE=true COMET_PARQUET_SCAN_IMPL=auto build/sbt -Dsbt.log.noformat=true ${{ matrix.module.args1 }} "${{ matrix.module.args2 }}"
-        env:
-          LC_ALL: "C.UTF-8"
-
diff --git a/common/src/main/scala/org/apache/comet/CometConf.scala b/common/src/main/scala/org/apache/comet/CometConf.scala
index ee18fb950..4575a2fb7 100644
--- a/common/src/main/scala/org/apache/comet/CometConf.scala
+++ b/common/src/main/scala/org/apache/comet/CometConf.scala
@@ -103,7 +103,7 @@ object CometConf extends ShimCometConf {
       .checkValues(
         Set(SCAN_NATIVE_COMET, SCAN_NATIVE_DATAFUSION, SCAN_NATIVE_ICEBERG_COMPAT, SCAN_AUTO))
       .createWithDefault(sys.env
-        .getOrElse("COMET_PARQUET_SCAN_IMPL", SCAN_NATIVE_COMET)
+        .getOrElse("COMET_PARQUET_SCAN_IMPL", SCAN_AUTO)
         .toLowerCase(Locale.ROOT))
 
   val COMET_RESPECT_PARQUET_FILTER_PUSHDOWN: ConfigEntry[Boolean] =
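For context, a self-contained sketch of the pattern the one-line change above touches: a string setting validated against a fixed set of values, whose default can be overridden by the `COMET_PARQUET_SCAN_IMPL` environment variable. This is not Comet's `ConfigEntry` machinery; the `StringConf` type and the key below are invented for illustration.

    import java.util.Locale

    // Illustrative stand-in for a validated config entry; not Comet's
    // ConfigEntry implementation. The config key is hypothetical.
    final case class StringConf(key: String, allowed: Set[String], default: String) {
      def resolve(overrides: Map[String, String] = Map.empty): String = {
        val v = overrides.getOrElse(key, default).toLowerCase(Locale.ROOT)
        require(allowed.contains(v), s"$key must be one of ${allowed.mkString(", ")}, got: $v")
        v
      }
    }

    object NativeScanConfSketch {
      val entry: StringConf = StringConf(
        key = "comet.scan.impl.sketch", // hypothetical key, for illustration only
        allowed = Set("native_comet", "native_datafusion", "native_iceberg_compat", "auto"),
        default = sys.env.getOrElse("COMET_PARQUET_SCAN_IMPL", "auto").toLowerCase(Locale.ROOT))

      def main(args: Array[String]): Unit = {
        println(entry.resolve())                                      // "auto" unless the env var is set
        println(entry.resolve(Map(entry.key -> "native_datafusion"))) // an explicit override wins
      }
    }

This mirrors the resolution order the CI jobs above depend on: the environment variable wins when exported (the second job pins `native_comet`), and the new compiled-in default `auto` applies otherwise.
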
diff --git a/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala b/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala
index 5b2997756..592069fcc 100644
--- a/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala
+++ b/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala
@@ -34,7 +34,7 @@ import org.apache.spark.sql.execution.datasources.v2.parquet.ParquetScan
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types._
 
-import org.apache.comet.{CometConf, DataTypeSupport}
+import org.apache.comet.{CometConf, CometSparkSessionExtensions, DataTypeSupport}
 import org.apache.comet.CometConf._
 import org.apache.comet.CometSparkSessionExtensions.{isCometLoaded, isCometScanEnabled, withInfo, withInfos}
 import org.apache.comet.parquet.{CometParquetScan, SupportsComet}
@@ -261,6 +261,10 @@ case class CometScanRule(session: SparkSession) extends Rule[SparkPlan] {
 
     val fallbackReasons = new ListBuffer[String]()
 
+    if (CometSparkSessionExtensions.isSpark40Plus) {
+      fallbackReasons += s"$SCAN_NATIVE_ICEBERG_COMPAT is not implemented for Spark 4.0.0"
+    }
+
     // native_iceberg_compat only supports local filesystem and S3
     if (!scanExec.relation.inputFiles
         .forall(path => path.startsWith("file://") || path.startsWith("s3a://"))) {
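The new guard above feeds the same `fallbackReasons` buffer as the existing path check, so on Spark 4.0+ an `auto` scan quietly degrades rather than failing. A hedged sketch of that accumulate-reasons-then-decide shape, encoding only the constraints visible in this diff (this is not the actual `CometScanRule` selection logic):

    import scala.collection.mutable.ListBuffer

    // Hedged sketch, not Comet source: one way an `auto` mode can choose a
    // scan implementation while collecting readable fallback reasons. Only
    // constraints visible in this commit are encoded: native_iceberg_compat
    // handles only file:// and s3a:// paths and is not implemented for
    // Spark 4.0.0.
    object AutoScanSketch {
      val ScanNativeIcebergCompat = "native_iceberg_compat"
      val ScanNativeComet = "native_comet"

      def select(inputFiles: Seq[String], isSpark40Plus: Boolean): (String, List[String]) = {
        val fallbackReasons = new ListBuffer[String]()
        if (isSpark40Plus) {
          fallbackReasons += s"$ScanNativeIcebergCompat is not implemented for Spark 4.0.0"
        }
        if (!inputFiles.forall(p => p.startsWith("file://") || p.startsWith("s3a://"))) {
          fallbackReasons += s"$ScanNativeIcebergCompat only supports local filesystem and S3"
        }
        if (fallbackReasons.isEmpty) (ScanNativeIcebergCompat, Nil)
        else (ScanNativeComet, fallbackReasons.toList)
      }

      def main(args: Array[String]): Unit = {
        println(select(Seq("file:///tmp/t.parquet"), isSpark40Plus = false)) // (native_iceberg_compat,List())
        println(select(Seq("hdfs:///t.parquet"), isSpark40Plus = true))      // falls back with two reasons
      }
    }

The test changes below apply the matching guard from the other side: each struct test calls `assume` to skip itself when the scan impl resolves to `auto` on Spark 4.0+, since the plan would fall back to `native_comet` there and the Comet-operator assertion would fail.
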
diff --git a/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala b/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala
index 34e38895a..d4976f3cb 100644
--- a/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala
+++ b/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala
@@ -2211,6 +2211,8 @@ class CometExpressionSuite extends CometTestBase with AdaptiveSparkPlanHelper {
   }
 
   test("get_struct_field - select primitive fields") {
+    val scanImpl = CometConf.COMET_NATIVE_SCAN_IMPL.get()
+    assume(!(scanImpl == CometConf.SCAN_AUTO && CometSparkSessionExtensions.isSpark40Plus))
     withTempPath { dir =>
       // create input file with Comet disabled
       withSQLConf(CometConf.COMET_ENABLED.key -> "false") {
@@ -2225,7 +2227,7 @@ class CometExpressionSuite extends CometTestBase with AdaptiveSparkPlanHelper {
         val df = spark.read.parquet(dir.toString()).select("nested1.id")
         // Comet's original scan does not support structs.
         // The plan will have a Comet Scan only if scan impl is native_full or native_recordbatch
-        if (!CometConf.COMET_NATIVE_SCAN_IMPL.get().equals(CometConf.SCAN_NATIVE_COMET)) {
+        if (!scanImpl.equals(CometConf.SCAN_NATIVE_COMET)) {
           checkSparkAnswerAndOperator(df)
         } else {
           checkSparkAnswer(df)
@@ -2234,6 +2236,8 @@ class CometExpressionSuite extends CometTestBase with AdaptiveSparkPlanHelper {
   }
 
   test("get_struct_field - select subset of struct") {
+    val scanImpl = CometConf.COMET_NATIVE_SCAN_IMPL.get()
+    assume(!(scanImpl == CometConf.SCAN_AUTO && CometSparkSessionExtensions.isSpark40Plus))
     withTempPath { dir =>
       // create input file with Comet disabled
       withSQLConf(CometConf.COMET_ENABLED.key -> "false") {
@@ -2255,7 +2259,7 @@ class CometExpressionSuite extends CometTestBase with AdaptiveSparkPlanHelper {
         val df = spark.read.parquet(dir.toString())
         // Comet's original scan does not support structs.
         // The plan will have a Comet Scan only if scan impl is native_full or native_recordbatch
-        if (!CometConf.COMET_NATIVE_SCAN_IMPL.get().equals(CometConf.SCAN_NATIVE_COMET)) {
+        if (scanImpl != CometConf.SCAN_NATIVE_COMET) {
           checkSparkAnswerAndOperator(df.select("nested1.id"))
           checkSparkAnswerAndOperator(df.select("nested1.nested2"))
           checkSparkAnswerAndOperator(df.select("nested1.nested2.id"))
@@ -2270,6 +2274,8 @@ class CometExpressionSuite extends CometTestBase with AdaptiveSparkPlanHelper {
   }
 
   test("get_struct_field - read entire struct") {
+    val scanImpl = CometConf.COMET_NATIVE_SCAN_IMPL.get()
+    assume(!(scanImpl == CometConf.SCAN_AUTO && CometSparkSessionExtensions.isSpark40Plus))
     withTempPath { dir =>
       // create input file with Comet disabled
       withSQLConf(CometConf.COMET_ENABLED.key -> "false") {
@@ -2291,7 +2297,7 @@ class CometExpressionSuite extends CometTestBase with AdaptiveSparkPlanHelper {
         val df = spark.read.parquet(dir.toString()).select("nested1.id")
         // Comet's original scan does not support structs.
         // The plan will have a Comet Scan only if scan impl is native_full or native_recordbatch
-        if (!CometConf.COMET_NATIVE_SCAN_IMPL.get().equals(CometConf.SCAN_NATIVE_COMET)) {
+        if (scanImpl != CometConf.SCAN_NATIVE_COMET) {
          checkSparkAnswerAndOperator(df)
        } else {
          checkSparkAnswer(df)

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@datafusion.apache.org
For additional commands, e-mail: commits-h...@datafusion.apache.org