This is an automated email from the ASF dual-hosted git repository.
mbutrovich pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-comet.git
The following commit(s) were added to refs/heads/main by this push:
new 34daa54d2 fix: regressions in `CometToPrettyStringSuite` (#2384)
34daa54d2 is described below
commit 34daa54d2e611f1081efdfceeefb7c19146357eb
Author: hsiang-c <[email protected]>
AuthorDate: Sat Sep 20 22:38:07 2025 +0800
fix: regressions in `CometToPrettyStringSuite` (#2384)
* Introduce BinaryOutputStyle from Spark 4.0
* Allow casting from binary to string
* Pass binaryOutputStyle to query plan serde
* Take binaryOutputStyle in planner
* Implement Spark-style ToPrettyString
* Match file name w/ test name
* Test all 5 BinaryOutputStyle values in Spark 4.0
* Fix package: 'org.apache.sql' -> 'org.apache.spark.sql'
* Add CometToPrettyStringSuite back to CI
* Specify binaryOutputStyle for Spark 3.4
* Let Comet deal with non-pretty-string casting
* Enable binary to string casting test
* Attempt to fix the build; ToPrettyString is Spark 3.5+
* Remove resolved issue references
* Type casting only function
* Extract test setup logic to CometFuzzTestBase
* Move binary_output_style proto <-> enum mapping to core
* Move BinaryOutputStyle from cast.rs to lib.rs
* Remove incorrect comments
---
.github/workflows/pr_build_linux.yml | 3 +
.github/workflows/pr_build_macos.yml | 3 +
dev/ci/check-suites.py | 5 +-
docs/source/user-guide/latest/compatibility.md | 2 +-
native/Cargo.lock | 2 +
native/core/src/execution/planner.rs | 18 ++++-
native/proto/src/proto/expr.proto | 9 +++
native/spark-expr/Cargo.toml | 2 +
native/spark-expr/src/conversion_funcs/cast.rs | 89 ++++++++++++++++++++--
native/spark-expr/src/lib.rs | 9 +++
.../org/apache/comet/expressions/CometCast.scala | 3 +-
.../org/apache/comet/serde/QueryPlanSerde.scala | 1 +
.../org/apache/comet/shims/CometExprShim.scala | 4 +-
.../org/apache/comet/shims/CometExprShim.scala | 4 +-
.../org/apache/comet/shims/CometExprShim.scala | 13 +++-
.../scala/org/apache/comet/CometCastSuite.scala | 3 +-
.../apache/{ => spark}/sql/ShimCometTestBase.scala | 0
.../spark/sql/CometToPrettyStringSuite.scala | 49 +-----------
.../spark/sql/CometToPrettyStringSuite.scala | 64 ++++++++++++++++
19 files changed, 215 insertions(+), 68 deletions(-)
diff --git a/.github/workflows/pr_build_linux.yml b/.github/workflows/pr_build_linux.yml
index c0cbf8bbe..fe7df906d 100644
--- a/.github/workflows/pr_build_linux.yml
+++ b/.github/workflows/pr_build_linux.yml
@@ -149,6 +149,9 @@ jobs:
org.apache.comet.CometBitwiseExpressionSuite
org.apache.comet.CometMapExpressionSuite
org.apache.comet.objectstore.NativeConfigSuite
+ - name: "sql"
+ value: |
+            ${{ matrix.profile.maven_opts != 'Spark 3.4, JDK 11, Scala 2.12' && 'org.apache.spark.sql.CometToPrettyStringSuite' || ''}}
fail-fast: false
name: ${{ matrix.os }}/${{ matrix.profile.name }} [${{ matrix.suite.name }}]
runs-on: ${{ matrix.os }}
diff --git a/.github/workflows/pr_build_macos.yml b/.github/workflows/pr_build_macos.yml
index ea09de06f..1abe644f5 100644
--- a/.github/workflows/pr_build_macos.yml
+++ b/.github/workflows/pr_build_macos.yml
@@ -114,6 +114,9 @@ jobs:
org.apache.comet.CometBitwiseExpressionSuite
org.apache.comet.CometMapExpressionSuite
org.apache.comet.objectstore.NativeConfigSuite
+ - name: "sql"
+ value: |
+            ${{ matrix.profile.maven_opts != 'Spark 3.4, JDK 11, Scala 2.12' && 'org.apache.spark.sql.CometToPrettyStringSuite' || ''}}
fail-fast: false
name: ${{ matrix.os }}/${{ matrix.profile.name }} [${{ matrix.suite.name }}]
runs-on: ${{ matrix.os }}
diff --git a/dev/ci/check-suites.py b/dev/ci/check-suites.py
index 62bcd77b5..8d9acb2d5 100644
--- a/dev/ci/check-suites.py
+++ b/dev/ci/check-suites.py
@@ -36,10 +36,7 @@ if __name__ == "__main__":
"org.apache.comet.parquet.ParquetReadFromS3Suite", # manual test suite
"org.apache.spark.sql.comet.CometPlanStabilitySuite", # abstract
"org.apache.spark.sql.comet.ParquetDatetimeRebaseSuite", # abstract
- "org.apache.comet.exec.CometColumnarShuffleSuite", # abstract
- # TODO add CometToPrettyStringSuite to PR worklows
- # https://github.com/apache/datafusion-comet/issues/2307
- "org.apache.spark.sql.CometToPrettyStringSuite"
+ "org.apache.comet.exec.CometColumnarShuffleSuite" # abstract
]
for workflow_filename in [".github/workflows/pr_build_linux.yml", ".github/workflows/pr_build_macos.yml"]:
diff --git a/docs/source/user-guide/latest/compatibility.md b/docs/source/user-guide/latest/compatibility.md
index 175855655..dd059abbc 100644
--- a/docs/source/user-guide/latest/compatibility.md
+++ b/docs/source/user-guide/latest/compatibility.md
@@ -210,6 +210,7 @@ The following cast operations are generally compatible with Spark except for the
| string | long | |
| string | binary | |
| string | date | Only supports years between 262143 BC and 262142 AD |
+| binary | string | |
| date | string | |
| timestamp | long | |
| timestamp | string | |
@@ -233,7 +234,6 @@ The following cast operations are not compatible with Spark for all inputs and a
| string | double | Does not support inputs ending with 'd' or 'f'. Does not support 'inf'. Does not support ANSI mode. |
| string | decimal | Does not support inputs ending with 'd' or 'f'. Does not support 'inf'. Does not support ANSI mode. Returns 0.0 instead of null if input contains no digits |
| string | timestamp | Not all valid formats are supported |
-| binary | string | Only works for binary data representing valid UTF-8 strings |
<!--END:INCOMPAT_CAST_TABLE-->
### Unsupported Casts
diff --git a/native/Cargo.lock b/native/Cargo.lock
index f9d14fa94..b94458ddd 100644
--- a/native/Cargo.lock
+++ b/native/Cargo.lock
@@ -1581,11 +1581,13 @@ name = "datafusion-comet-spark-expr"
version = "0.11.0"
dependencies = [
"arrow",
+ "base64",
"chrono",
"chrono-tz",
"criterion",
"datafusion",
"futures",
+ "hex",
"num",
"rand",
"regex",
diff --git a/native/core/src/execution/planner.rs b/native/core/src/execution/planner.rs
index 1465d33ad..5aa6ece3b 100644
--- a/native/core/src/execution/planner.rs
+++ b/native/core/src/execution/planner.rs
@@ -62,8 +62,8 @@ use datafusion::{
prelude::SessionContext,
};
use datafusion_comet_spark_expr::{
-    create_comet_physical_fun, create_modulo_expr, create_negate_expr, BloomFilterAgg,
-    BloomFilterMightContain, EvalMode, SparkHour, SparkMinute, SparkSecond,
+    create_comet_physical_fun, create_modulo_expr, create_negate_expr, BinaryOutputStyle,
+    BloomFilterAgg, BloomFilterMightContain, EvalMode, SparkHour, SparkMinute, SparkSecond,
};
use crate::execution::operators::ExecutionError::GeneralError;
@@ -809,6 +809,8 @@ impl PhysicalPlanner {
                let mut spark_cast_options =
                    SparkCastOptions::new(EvalMode::Try, &expr.timezone, true);
let null_string = "NULL";
spark_cast_options.null_string = null_string.to_string();
+                spark_cast_options.binary_output_style =
+                    from_protobuf_binary_output_style(expr.binary_output_style).ok();
                let child = self.create_expr(expr.child.as_ref().unwrap(), input_schema)?;
let cast = Arc::new(Cast::new(
Arc::clone(&child),
@@ -2693,6 +2695,18 @@ fn create_case_expr(
}
}
+fn from_protobuf_binary_output_style(
+ value: i32,
+) -> Result<BinaryOutputStyle, prost::UnknownEnumValue> {
+ match spark_expression::BinaryOutputStyle::try_from(value)? {
+        spark_expression::BinaryOutputStyle::Utf8 => Ok(BinaryOutputStyle::Utf8),
+        spark_expression::BinaryOutputStyle::Basic => Ok(BinaryOutputStyle::Basic),
+        spark_expression::BinaryOutputStyle::Base64 => Ok(BinaryOutputStyle::Base64),
+        spark_expression::BinaryOutputStyle::Hex => Ok(BinaryOutputStyle::Hex),
+        spark_expression::BinaryOutputStyle::HexDiscrete => Ok(BinaryOutputStyle::HexDiscrete),
+ }
+}
+
fn literal_to_array_ref(
data_type: DataType,
list_literal: ListLiteral,
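
For reference, here is a minimal, self-contained sketch of the fallback semantics above. The enum and `from_proto_tag` below are hypothetical stand-ins: the real code maps the prost-generated spark_expression::BinaryOutputStyle (whose try_from fails with prost::UnknownEnumValue on out-of-range tags) onto the native enum.

    // Stand-in for the native BinaryOutputStyle enum.
    #[derive(Debug, Clone, Copy, PartialEq)]
    enum BinaryOutputStyle {
        Utf8,
        Basic,
        Base64,
        Hex,
        HexDiscrete,
    }

    // Mirrors from_protobuf_binary_output_style: valid proto tags map
    // one-to-one onto the native enum; anything else is an error.
    fn from_proto_tag(value: i32) -> Result<BinaryOutputStyle, i32> {
        match value {
            0 => Ok(BinaryOutputStyle::Utf8),
            1 => Ok(BinaryOutputStyle::Basic),
            2 => Ok(BinaryOutputStyle::Base64),
            3 => Ok(BinaryOutputStyle::Hex),
            4 => Ok(BinaryOutputStyle::HexDiscrete),
            other => Err(other),
        }
    }

    fn main() {
        // The planner calls .ok() on the result, so a known tag yields
        // Some(style) while an unknown tag degrades to None, and the cast
        // then falls back to the plain UTF-8 formatter.
        assert_eq!(from_proto_tag(4).ok(), Some(BinaryOutputStyle::HexDiscrete));
        assert_eq!(from_proto_tag(99).ok(), None);
    }
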
diff --git a/native/proto/src/proto/expr.proto b/native/proto/src/proto/expr.proto
index 04d9376ac..ade9860c8 100644
--- a/native/proto/src/proto/expr.proto
+++ b/native/proto/src/proto/expr.proto
@@ -269,9 +269,18 @@ message ToJson {
bool ignore_null_fields = 6;
}
+enum BinaryOutputStyle {
+ UTF8 = 0;
+ BASIC = 1;
+ BASE64 = 2;
+ HEX = 3;
+ HEX_DISCRETE = 4;
+}
+
message ToPrettyString {
Expr child = 1;
string timezone = 2;
+ BinaryOutputStyle binaryOutputStyle = 3;
}
message Hour {
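
For orientation, prost generates roughly the following Rust from this enum (an approximation, not the actual generated code, which also derives prost::Enumeration and an i32 try_from). Because proto3 enum fields decode to their zero value when unset, an absent binaryOutputStyle reads back as UTF8; this is why the Spark 3.4/3.5 shims below set HEX_DISCRETE explicitly instead of relying on a wire default.

    // Approximate shape of the prost-generated enum (for orientation only).
    #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
    #[repr(i32)]
    pub enum BinaryOutputStyle {
        Utf8 = 0, // proto3 zero value: what an unset field decodes to
        Basic = 1,
        Base64 = 2,
        Hex = 3,
        HexDiscrete = 4,
    }
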
diff --git a/native/spark-expr/Cargo.toml b/native/spark-expr/Cargo.toml
index 6ccecf7d2..961c7bec0 100644
--- a/native/spark-expr/Cargo.toml
+++ b/native/spark-expr/Cargo.toml
@@ -37,6 +37,8 @@ thiserror = { workspace = true }
futures = { workspace = true }
twox-hash = "2.1.2"
rand = { workspace = true }
+hex = "0.4.3"
+base64 = "0.22.1"
[dev-dependencies]
arrow = {workspace = true}
diff --git a/native/spark-expr/src/conversion_funcs/cast.rs b/native/spark-expr/src/conversion_funcs/cast.rs
index 8f33bf912..0c7b437a5 100644
--- a/native/spark-expr/src/conversion_funcs/cast.rs
+++ b/native/spark-expr/src/conversion_funcs/cast.rs
@@ -15,13 +15,15 @@
// specific language governing permissions and limitations
// under the License.
-use crate::timezone;
use crate::utils::array_with_timezone;
+use crate::{timezone, BinaryOutputStyle};
use crate::{EvalMode, SparkError, SparkResult};
use arrow::array::builder::StringBuilder;
-use arrow::array::{DictionaryArray, StringArray, StructArray};
+use arrow::array::{DictionaryArray, GenericByteArray, StringArray, StructArray};
use arrow::compute::can_cast_types;
-use arrow::datatypes::{ArrowDictionaryKeyType, ArrowNativeType, DataType, Schema};
+use arrow::datatypes::{
+    ArrowDictionaryKeyType, ArrowNativeType, DataType, GenericBinaryType, Schema,
+};
use arrow::{
array::{
cast::AsArray,
@@ -60,6 +62,8 @@ use std::{
sync::Arc,
};
+use base64::prelude::*;
+
static TIMESTAMP_FORMAT: Option<&str> = Some("%Y-%m-%d %H:%M:%S%.f");
const MICROS_PER_SECOND: i64 = 1000000;
@@ -260,11 +264,7 @@ fn can_cast_to_string(from_type: &DataType, options: &SparkCastOptions) -> bool
// scientific notation where Comet does not
true
}
- Binary => {
- // https://github.com/apache/datafusion-comet/issues/377
- // Only works for binary data representing valid UTF-8 strings
- options.allow_incompat
- }
+ Binary => true,
Struct(fields) => fields
.iter()
.all(|f| can_cast_to_string(f.data_type(), options)),
@@ -816,6 +816,8 @@ pub struct SparkCastOptions {
pub is_adapting_schema: bool,
/// String to use to represent null values
pub null_string: String,
+ /// SparkSQL's binaryOutputStyle
+ pub binary_output_style: Option<BinaryOutputStyle>,
}
impl SparkCastOptions {
@@ -827,6 +829,7 @@ impl SparkCastOptions {
allow_cast_unsigned_ints: false,
is_adapting_schema: false,
null_string: "null".to_string(),
+ binary_output_style: None,
}
}
@@ -838,6 +841,7 @@ impl SparkCastOptions {
allow_cast_unsigned_ints: false,
is_adapting_schema: false,
null_string: "null".to_string(),
+ binary_output_style: None,
}
}
}
@@ -1027,6 +1031,7 @@ fn cast_array(
{
Ok(cast_with_options(&array, to_type, &CAST_OPTIONS)?)
}
+        (Binary, Utf8) => Ok(cast_binary_to_string::<i32>(&array, cast_options)?),
_ if cast_options.is_adapting_schema
            || is_datafusion_spark_compatible(from_type, to_type, cast_options.allow_incompat) =>
{
@@ -1045,6 +1050,74 @@ fn cast_array(
Ok(spark_cast_postprocess(cast_result?, from_type, to_type))
}
+fn cast_binary_to_string<O: OffsetSizeTrait>(
+ array: &dyn Array,
+ spark_cast_options: &SparkCastOptions,
+) -> Result<ArrayRef, ArrowError> {
+ let input = array
+ .as_any()
+ .downcast_ref::<GenericByteArray<GenericBinaryType<O>>>()
+ .unwrap();
+
+    fn binary_formatter(value: &[u8], spark_cast_options: &SparkCastOptions) -> String {
+ match spark_cast_options.binary_output_style {
+ Some(s) => spark_binary_formatter(value, s),
+ None => cast_binary_formatter(value),
+ }
+ }
+
+ let output_array = input
+ .iter()
+ .map(|value| match value {
+            Some(value) => Ok(Some(binary_formatter(value, spark_cast_options))),
+ _ => Ok(None),
+ })
+ .collect::<Result<GenericStringArray<O>, ArrowError>>()?;
+ Ok(Arc::new(output_array))
+}
+
+/// This function mimics the [BinaryFormatter]: https://github.com/apache/spark/blob/v4.0.0/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ToStringBase.scala#L449-L468
+/// used by SparkSQL's ToPrettyString expression.
+/// The BinaryFormatter was [introduced]: https://issues.apache.org/jira/browse/SPARK-47911 in Spark 4.0.0
+/// Before Spark 4.0.0, the default is SPACE_DELIMITED_UPPERCASE_HEX
+fn spark_binary_formatter(value: &[u8], binary_output_style: BinaryOutputStyle) -> String {
+ match binary_output_style {
+ BinaryOutputStyle::Utf8 => String::from_utf8(value.to_vec()).unwrap(),
+ BinaryOutputStyle::Basic => {
+ format!(
+ "{:?}",
+ value
+ .iter()
+ .map(|v| i8::from_ne_bytes([*v]))
+ .collect::<Vec<i8>>()
+ )
+ }
+ BinaryOutputStyle::Base64 => BASE64_STANDARD_NO_PAD.encode(value),
+ BinaryOutputStyle::Hex => value
+ .iter()
+ .map(|v| hex::encode_upper([*v]))
+ .collect::<String>(),
+ BinaryOutputStyle::HexDiscrete => {
+ // Spark's default SPACE_DELIMITED_UPPERCASE_HEX
+ format!(
+ "[{}]",
+ value
+ .iter()
+ .map(|v| hex::encode_upper([*v]))
+ .collect::<Vec<String>>()
+ .join(" ")
+ )
+ }
+ }
+}
+
+fn cast_binary_formatter(value: &[u8]) -> String {
+ match String::from_utf8(value.to_vec()) {
+ Ok(value) => value,
+ Err(_) => unsafe { String::from_utf8_unchecked(value.to_vec()) },
+ }
+}
+
/// Determines if DataFusion supports the given cast in a way that is
/// compatible with Spark
fn is_datafusion_spark_compatible(
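
To make the five styles concrete, the sketch below evaluates the same expressions spark_binary_formatter uses, on the bytes b"abc" (assuming the hex 0.4 and base64 0.22 crates added in Cargo.toml above):

    use base64::prelude::*;

    fn main() {
        let value: &[u8] = b"abc";

        // Utf8: interpret the bytes directly as a UTF-8 string.
        assert_eq!(String::from_utf8(value.to_vec()).unwrap(), "abc");

        // Basic: Debug-format the bytes as signed 8-bit integers.
        let basic = format!(
            "{:?}",
            value.iter().map(|v| i8::from_ne_bytes([*v])).collect::<Vec<i8>>()
        );
        assert_eq!(basic, "[97, 98, 99]");

        // Base64: standard alphabet, no padding.
        assert_eq!(BASE64_STANDARD_NO_PAD.encode(value), "YWJj");

        // Hex: contiguous uppercase hex digits.
        let hex_str = value.iter().map(|v| hex::encode_upper([*v])).collect::<String>();
        assert_eq!(hex_str, "616263");

        // HexDiscrete: space-delimited uppercase hex in brackets.
        let discrete = format!(
            "[{}]",
            value
                .iter()
                .map(|v| hex::encode_upper([*v]))
                .collect::<Vec<String>>()
                .join(" ")
        );
        assert_eq!(discrete, "[61 62 63]");
    }

HexDiscrete matches Spark's pre-4.0 SPACE_DELIMITED_UPPERCASE_HEX output, which is why the Spark 3.4 and 3.5 shims pin HEX_DISCRETE.
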
diff --git a/native/spark-expr/src/lib.rs b/native/spark-expr/src/lib.rs
index 4b29b6177..af5677a9b 100644
--- a/native/spark-expr/src/lib.rs
+++ b/native/spark-expr/src/lib.rs
@@ -98,6 +98,15 @@ pub enum EvalMode {
Try,
}
+#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)]
+pub enum BinaryOutputStyle {
+ Utf8,
+ Basic,
+ Base64,
+ Hex,
+ HexDiscrete,
+}
+
pub(crate) fn arithmetic_overflow_error(from_type: &str) -> SparkError {
SparkError::ArithmeticOverflow {
from_type: from_type.to_string(),
diff --git a/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala b/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala
index 3ea488256..7db62130d 100644
--- a/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala
+++ b/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala
@@ -215,8 +215,7 @@ object CometCast extends CometExpressionSerde[Cast] with CometExprShim {
             "There can be formatting differences in some case due to Spark using " +
               "scientific notation where Comet does not"))
case DataTypes.BinaryType =>
- // https://github.com/apache/datafusion-comet/issues/377
-      Incompatible(Some("Only works for binary data representing valid UTF-8 strings"))
+ Compatible()
case StructType(fields) =>
for (field <- fields) {
        isSupported(field.dataType, DataTypes.StringType, timeZoneId, evalMode) match {
diff --git a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala
index 1e0e97862..258d275e5 100644
--- a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala
+++ b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala
@@ -716,6 +716,7 @@ object QueryPlanSerde extends Logging with CometExprShim {
.newBuilder()
.setChild(p)
.setTimezone(timezoneId.getOrElse("UTC"))
+ .setBinaryOutputStyle(binaryOutputStyle)
.build()
Some(
ExprOuterClass.Expr
diff --git a/spark/src/main/spark-3.4/org/apache/comet/shims/CometExprShim.scala b/spark/src/main/spark-3.4/org/apache/comet/shims/CometExprShim.scala
index b6ba91ad1..e9785ce5e 100644
--- a/spark/src/main/spark-3.4/org/apache/comet/shims/CometExprShim.scala
+++ b/spark/src/main/spark-3.4/org/apache/comet/shims/CometExprShim.scala
@@ -20,7 +20,7 @@ package org.apache.comet.shims
import org.apache.comet.expressions.CometEvalMode
import org.apache.comet.serde.CommonStringExprs
-import org.apache.comet.serde.ExprOuterClass.Expr
+import org.apache.comet.serde.ExprOuterClass.{BinaryOutputStyle, Expr}
import org.apache.spark.sql.catalyst.expressions._
/**
@@ -30,6 +30,8 @@ trait CometExprShim extends CommonStringExprs {
protected def evalMode(c: Cast): CometEvalMode.Value =
CometEvalModeUtil.fromSparkEvalMode(c.evalMode)
+    protected def binaryOutputStyle: BinaryOutputStyle = BinaryOutputStyle.HEX_DISCRETE
+
def versionSpecificExprToProtoInternal(
expr: Expression,
inputs: Seq[Attribute],
diff --git a/spark/src/main/spark-3.5/org/apache/comet/shims/CometExprShim.scala b/spark/src/main/spark-3.5/org/apache/comet/shims/CometExprShim.scala
index b6ba91ad1..bbabb389d 100644
--- a/spark/src/main/spark-3.5/org/apache/comet/shims/CometExprShim.scala
+++ b/spark/src/main/spark-3.5/org/apache/comet/shims/CometExprShim.scala
@@ -20,7 +20,7 @@ package org.apache.comet.shims
import org.apache.comet.expressions.CometEvalMode
import org.apache.comet.serde.CommonStringExprs
-import org.apache.comet.serde.ExprOuterClass.Expr
+import org.apache.comet.serde.ExprOuterClass.{BinaryOutputStyle, Expr}
import org.apache.spark.sql.catalyst.expressions._
/**
@@ -30,6 +30,8 @@ trait CometExprShim extends CommonStringExprs {
protected def evalMode(c: Cast): CometEvalMode.Value =
CometEvalModeUtil.fromSparkEvalMode(c.evalMode)
+    protected def binaryOutputStyle: BinaryOutputStyle = BinaryOutputStyle.HEX_DISCRETE
+
def versionSpecificExprToProtoInternal(
expr: Expression,
inputs: Seq[Attribute],
diff --git a/spark/src/main/spark-4.0/org/apache/comet/shims/CometExprShim.scala b/spark/src/main/spark-4.0/org/apache/comet/shims/CometExprShim.scala
index 5885c908e..09dcface6 100644
--- a/spark/src/main/spark-4.0/org/apache/comet/shims/CometExprShim.scala
+++ b/spark/src/main/spark-4.0/org/apache/comet/shims/CometExprShim.scala
@@ -20,9 +20,10 @@ package org.apache.comet.shims
import org.apache.comet.expressions.CometEvalMode
import org.apache.comet.serde.CommonStringExprs
-import org.apache.comet.serde.ExprOuterClass.Expr
+import org.apache.comet.serde.ExprOuterClass.{BinaryOutputStyle, Expr}
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.expressions.objects.StaticInvoke
+import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.internal.types.StringTypeWithCollation
import org.apache.spark.sql.types.{BinaryType, BooleanType, StringType}
@@ -33,6 +34,16 @@ trait CometExprShim extends CommonStringExprs {
protected def evalMode(c: Cast): CometEvalMode.Value =
CometEvalModeUtil.fromSparkEvalMode(c.evalMode)
+ protected def binaryOutputStyle: BinaryOutputStyle = {
+    SQLConf.get.getConf(SQLConf.BINARY_OUTPUT_STYLE).map(SQLConf.BinaryOutputStyle.withName) match {
+ case Some(SQLConf.BinaryOutputStyle.UTF8) => BinaryOutputStyle.UTF8
+ case Some(SQLConf.BinaryOutputStyle.BASIC) => BinaryOutputStyle.BASIC
+ case Some(SQLConf.BinaryOutputStyle.BASE64) => BinaryOutputStyle.BASE64
+ case Some(SQLConf.BinaryOutputStyle.HEX) => BinaryOutputStyle.HEX
+ case _ => BinaryOutputStyle.HEX_DISCRETE
+ }
+ }
+
def versionSpecificExprToProtoInternal(
expr: Expression,
inputs: Seq[Attribute],
diff --git a/spark/src/test/scala/org/apache/comet/CometCastSuite.scala b/spark/src/test/scala/org/apache/comet/CometCastSuite.scala
index 772cc064a..2667b4087 100644
--- a/spark/src/test/scala/org/apache/comet/CometCastSuite.scala
+++ b/spark/src/test/scala/org/apache/comet/CometCastSuite.scala
@@ -827,8 +827,7 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper {
// CAST from BinaryType
- ignore("cast BinaryType to StringType") {
- // https://github.com/apache/datafusion-comet/issues/377
+ test("cast BinaryType to StringType") {
castTest(generateBinary(), DataTypes.StringType)
}
diff --git a/spark/src/test/spark-3.4/org/apache/sql/ShimCometTestBase.scala b/spark/src/test/spark-3.4/org/apache/spark/sql/ShimCometTestBase.scala
similarity index 100%
rename from spark/src/test/spark-3.4/org/apache/sql/ShimCometTestBase.scala
rename to spark/src/test/spark-3.4/org/apache/spark/sql/ShimCometTestBase.scala
diff --git a/spark/src/test/spark-3.5/org/apache/spark/sql/CometToPrettyStringSuite.scala b/spark/src/test/spark-3.5/org/apache/spark/sql/CometToPrettyStringSuite.scala
index d030106c3..df2ba67b7 100644
--- a/spark/src/test/spark-3.5/org/apache/spark/sql/CometToPrettyStringSuite.scala
+++ b/spark/src/test/spark-3.5/org/apache/spark/sql/CometToPrettyStringSuite.scala
@@ -19,60 +19,17 @@
package org.apache.spark.sql
-import org.apache.comet.CometConf
+import org.apache.comet.CometFuzzTestBase
import org.apache.comet.expressions.{CometCast, CometEvalMode}
import org.apache.comet.serde.Compatible
-import org.apache.comet.testing.{DataGenOptions, ParquetGenerator}
-import org.apache.commons.io.FileUtils
+
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
import org.apache.spark.sql.catalyst.expressions.{Alias, ToPrettyString}
import org.apache.spark.sql.catalyst.plans.logical.Project
-import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.DataTypes
-import java.io.File
-import java.text.SimpleDateFormat
-import scala.util.Random
-
-class CometToPrettyStringSuite extends CometTestBase {
-
- private var filename: String = null
-
- /**
- * We use Asia/Kathmandu because it has a non-zero number of minutes as the offset, so is an
- * interesting edge case. Also, this timezone tends to be different from the default system
- * timezone.
- *
- * Represents UTC+5:45
- */
- private val defaultTimezone = "Asia/Kathmandu"
-
- override def beforeAll(): Unit = {
- super.beforeAll()
- val tempDir = System.getProperty("java.io.tmpdir")
-    filename = s"$tempDir/CometFuzzTestSuite_${System.currentTimeMillis()}.parquet"
- val random = new Random(42)
- withSQLConf(
- CometConf.COMET_ENABLED.key -> "false",
- SQLConf.SESSION_LOCAL_TIMEZONE.key -> defaultTimezone) {
- val options =
- DataGenOptions(
- generateArray = true,
- generateStruct = true,
- generateMap = true,
- generateNegativeZero = false,
- // override base date due to known issues with experimental scans
- baseDate =
-            new SimpleDateFormat("YYYY-MM-DD hh:mm:ss").parse("2024-05-25 12:34:56").getTime)
- ParquetGenerator.makeParquetFile(random, spark, filename, 1000, options)
- }
- }
-
- protected override def afterAll(): Unit = {
- super.afterAll()
- FileUtils.deleteDirectory(new File(filename))
- }
+class CometToPrettyStringSuite extends CometFuzzTestBase {
test("ToPrettyString") {
val df = spark.read.parquet(filename)
diff --git a/spark/src/test/spark-4.0/org/apache/spark/sql/CometToPrettyStringSuite.scala b/spark/src/test/spark-4.0/org/apache/spark/sql/CometToPrettyStringSuite.scala
new file mode 100644
index 000000000..469b49fb2
--- /dev/null
+++ b/spark/src/test/spark-4.0/org/apache/spark/sql/CometToPrettyStringSuite.scala
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql
+
+import org.apache.comet.CometFuzzTestBase
+import org.apache.comet.expressions.{CometCast, CometEvalMode}
+import org.apache.comet.serde.Compatible
+import org.apache.spark.sql.catalyst.TableIdentifier
+import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
+import org.apache.spark.sql.catalyst.expressions.{Alias, ToPrettyString}
+import org.apache.spark.sql.catalyst.plans.logical.Project
+import org.apache.spark.sql.classic.Dataset
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.internal.SQLConf.BinaryOutputStyle
+import org.apache.spark.sql.types.DataTypes
+
+class CometToPrettyStringSuite extends CometFuzzTestBase {
+
+ test("ToPrettyString") {
+ val style = List(
+ BinaryOutputStyle.UTF8,
+ BinaryOutputStyle.BASIC,
+ BinaryOutputStyle.BASE64,
+ BinaryOutputStyle.HEX,
+ BinaryOutputStyle.HEX_DISCRETE
+ )
+ style.foreach(s =>
+ withSQLConf(SQLConf.BINARY_OUTPUT_STYLE.key -> s.toString) {
+ val df = spark.read.parquet(filename)
+ df.createOrReplaceTempView("t1")
+      val table = spark.sessionState.catalog.lookupRelation(TableIdentifier("t1"))
+
+ for (field <- df.schema.fields) {
+ val col = field.name
+        val prettyExpr = Alias(ToPrettyString(UnresolvedAttribute(col)), s"pretty_$col")()
+ val plan = Project(Seq(prettyExpr), table)
+ val analyzed = spark.sessionState.analyzer.execute(plan)
+ val result: DataFrame = Dataset.ofRows(spark, analyzed)
+        CometCast.isSupported(field.dataType, DataTypes.StringType, Some(spark.sessionState.conf.sessionLocalTimeZone), CometEvalMode.TRY) match {
+ case _: Compatible => checkSparkAnswerAndOperator(result)
+ case _ => checkSparkAnswer(result)
+ }
+ }
+ }
+ )
+ }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]