This is an automated email from the ASF dual-hosted git repository.
agrove pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-comet.git
The following commit(s) were added to refs/heads/main by this push:
new 68f127b75 feat: add partial support for date_format expression (#3201)
68f127b75 is described below
commit 68f127b75373eafda882512f13d255cf4256c520
Author: Andy Grove <[email protected]>
AuthorDate: Sun Jan 18 21:24:55 2026 -0700
feat: add partial support for date_format expression (#3201)
---
docs/source/user-guide/latest/configs.md | 1 +
.../org/apache/comet/serde/QueryPlanSerde.scala | 1 +
.../scala/org/apache/comet/serde/datetime.scala | 104 +++++++++++++++++-
.../comet/CometTemporalExpressionSuite.scala | 120 ++++++++++++++++++++-
4 files changed, 223 insertions(+), 3 deletions(-)
diff --git a/docs/source/user-guide/latest/configs.md
b/docs/source/user-guide/latest/configs.md
index 53ed18910..5eea5c4e5 100644
--- a/docs/source/user-guide/latest/configs.md
+++ b/docs/source/user-guide/latest/configs.md
@@ -234,6 +234,7 @@ These settings can be used to determine which parts of the
plan are accelerated
| `spark.comet.expression.CreateArray.enabled` | Enable Comet acceleration for
`CreateArray` | true |
| `spark.comet.expression.CreateNamedStruct.enabled` | Enable Comet
acceleration for `CreateNamedStruct` | true |
| `spark.comet.expression.DateAdd.enabled` | Enable Comet acceleration for
`DateAdd` | true |
+| `spark.comet.expression.DateFormatClass.enabled` | Enable Comet acceleration
for `DateFormatClass` | true |
| `spark.comet.expression.DateSub.enabled` | Enable Comet acceleration for
`DateSub` | true |
| `spark.comet.expression.DayOfMonth.enabled` | Enable Comet acceleration for
`DayOfMonth` | true |
| `spark.comet.expression.DayOfWeek.enabled` | Enable Comet acceleration for
`DayOfWeek` | true |
diff --git a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala
b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala
index 2849aa6d3..3569559df 100644
--- a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala
+++ b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala
@@ -185,6 +185,7 @@ object QueryPlanSerde extends Logging with CometExprShim {
private val temporalExpressions: Map[Class[_ <: Expression],
CometExpressionSerde[_]] = Map(
classOf[DateAdd] -> CometDateAdd,
+ classOf[DateFormatClass] -> CometDateFormat,
classOf[DateSub] -> CometDateSub,
classOf[UnixDate] -> CometUnixDate,
classOf[FromUnixTime] -> CometFromUnixTime,
diff --git a/spark/src/main/scala/org/apache/comet/serde/datetime.scala
b/spark/src/main/scala/org/apache/comet/serde/datetime.scala
index 252f52478..b191e8721 100644
--- a/spark/src/main/scala/org/apache/comet/serde/datetime.scala
+++ b/spark/src/main/scala/org/apache/comet/serde/datetime.scala
@@ -21,8 +21,8 @@ package org.apache.comet.serde
import java.util.Locale
-import org.apache.spark.sql.catalyst.expressions.{Attribute, DateAdd, DateSub,
DayOfMonth, DayOfWeek, DayOfYear, GetDateField, Hour, Literal, Minute, Month,
Quarter, Second, TruncDate, TruncTimestamp, UnixDate, WeekDay, WeekOfYear, Year}
-import org.apache.spark.sql.types.{DateType, IntegerType}
+import org.apache.spark.sql.catalyst.expressions.{Attribute, DateAdd,
DateFormatClass, DateSub, DayOfMonth, DayOfWeek, DayOfYear, GetDateField, Hour,
Literal, Minute, Month, Quarter, Second, TruncDate, TruncTimestamp, UnixDate,
WeekDay, WeekOfYear, Year}
+import org.apache.spark.sql.types.{DateType, IntegerType, StringType}
import org.apache.spark.unsafe.types.UTF8String
import org.apache.comet.CometSparkSessionExtensions.withInfo
@@ -381,3 +381,103 @@ object CometTruncTimestamp extends
CometExpressionSerde[TruncTimestamp] {
}
}
}
+
/**
 * Serde for Spark's `DateFormatClass` (`date_format`), converted to DataFusion's `to_char`
 * scalar function.
 *
 * Spark interprets format strings as Java SimpleDateFormat patterns, while DataFusion's
 * `to_char` uses chrono strftime patterns. Only an explicit whitelist of patterns that can be
 * reliably mapped between the two systems is supported; anything else falls back to Spark.
 */
object CometDateFormat extends CometExpressionSerde[DateFormatClass] {

  /**
   * Whitelist mapping Spark SimpleDateFormat patterns to equivalent strftime patterns. Only
   * formats present in this map are supported.
   */
  val supportedFormats: Map[String, String] = Map(
    // Full date formats
    "yyyy-MM-dd" -> "%Y-%m-%d",
    "yyyy/MM/dd" -> "%Y/%m/%d",
    "yyyy-MM-dd HH:mm:ss" -> "%Y-%m-%d %H:%M:%S",
    "yyyy/MM/dd HH:mm:ss" -> "%Y/%m/%d %H:%M:%S",
    // Date components
    "yyyy" -> "%Y",
    "yy" -> "%y",
    "MM" -> "%m",
    "dd" -> "%d",
    // Time formats
    "HH:mm:ss" -> "%H:%M:%S",
    "HH:mm" -> "%H:%M",
    "HH" -> "%H",
    "mm" -> "%M",
    "ss" -> "%S",
    // Combined formats
    "yyyyMMdd" -> "%Y%m%d",
    "yyyyMM" -> "%Y%m",
    // Month and day names
    "EEEE" -> "%A",
    "EEE" -> "%a",
    "MMMM" -> "%B",
    "MMM" -> "%b",
    // 12-hour time
    "hh:mm:ss a" -> "%I:%M:%S %p",
    "hh:mm a" -> "%I:%M %p",
    "h:mm a" -> "%-I:%M %p",
    // ISO formats
    "yyyy-MM-dd'T'HH:mm:ss" -> "%Y-%m-%dT%H:%M:%S")

  /**
   * Extracts the literal Spark format string from the expression's format argument, if any.
   * Non-literal formats cannot be translated at plan time, so they yield None.
   */
  private def literalFormat(expr: DateFormatClass): Option[String] = expr.right match {
    case Literal(fmt: UTF8String, _) => Some(fmt.toString)
    case _ => None
  }

  override def getSupportLevel(expr: DateFormatClass): SupportLevel = {
    // Only UTC is known to produce results identical to Spark's timezone-aware formatting.
    // NOTE(review): a missing timeZoneId is treated as UTC here — confirm this matches how
    // the session timezone is resolved upstream.
    val timezone = expr.timeZoneId.getOrElse("UTC")
    val isUtc = timezone == "UTC" || timezone == "Etc/UTC"

    literalFormat(expr) match {
      case Some(format) if supportedFormats.contains(format) =>
        if (isUtc) {
          Compatible()
        } else {
          Incompatible(Some(s"Non-UTC timezone '$timezone' may produce different results"))
        }
      case Some(format) =>
        Unsupported(
          Some(
            s"Format '$format' is not supported. Supported formats: " +
              supportedFormats.keys.mkString(", ")))
      case None =>
        Unsupported(Some("Only literal format strings are supported"))
    }
  }

  override def convert(
      expr: DateFormatClass,
      inputs: Seq[Attribute],
      binding: Boolean): Option[ExprOuterClass.Expr] = {
    // Map the literal Spark format to its strftime equivalent; anything else falls back.
    // (Shares the extraction logic with getSupportLevel via literalFormat.)
    literalFormat(expr).flatMap(supportedFormats.get) match {
      case Some(format) =>
        val childExpr = exprToProtoInternal(expr.left, inputs, binding)
        val formatExpr = exprToProtoInternal(Literal(format), inputs, binding)
        val optExpr = scalarFunctionExprToProtoWithReturnType(
          "to_char",
          StringType,
          false,
          childExpr,
          formatExpr)
        optExprWithInfo(optExpr, expr, expr.left, expr.right)
      case None =>
        withInfo(expr, expr.left, expr.right)
        None
    }
  }
}
diff --git
a/spark/src/test/scala/org/apache/comet/CometTemporalExpressionSuite.scala
b/spark/src/test/scala/org/apache/comet/CometTemporalExpressionSuite.scala
index 3ab525ab6..35976ffa9 100644
--- a/spark/src/test/scala/org/apache/comet/CometTemporalExpressionSuite.scala
+++ b/spark/src/test/scala/org/apache/comet/CometTemporalExpressionSuite.scala
@@ -26,7 +26,7 @@ import
org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.{DataTypes, StructField, StructType}
-import org.apache.comet.serde.{CometTruncDate, CometTruncTimestamp}
+import org.apache.comet.serde.{CometDateFormat, CometTruncDate,
CometTruncTimestamp}
import org.apache.comet.testing.{DataGenOptions, FuzzDataGenerator}
class CometTemporalExpressionSuite extends CometTestBase with
AdaptiveSparkPlanHelper {
@@ -123,6 +123,124 @@ class CometTemporalExpressionSuite extends CometTestBase
with AdaptiveSparkPlanH
FuzzDataGenerator.generateDataFrame(r, spark, schema, 1000,
DataGenOptions())
}
+ test("date_format with timestamp column") {
+ // Filter out formats with embedded quotes that need special handling
+ val supportedFormats = CometDateFormat.supportedFormats.keys.toSeq
+ .filterNot(_.contains("'"))
+
+ createTimestampTestData.createOrReplaceTempView("tbl")
+
+ withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> "UTC") {
+ for (format <- supportedFormats) {
+ checkSparkAnswerAndOperator(s"SELECT c0, date_format(c0, '$format')
from tbl order by c0")
+ }
+ // Test ISO format with embedded quotes separately using double-quoted
string
+ checkSparkAnswerAndOperator(
+ "SELECT c0, date_format(c0, \"yyyy-MM-dd'T'HH:mm:ss\") from tbl order
by c0")
+ }
+ }
+
+ test("date_format with specific format strings") {
+ // Test specific format strings with explicit timestamp data
+ createTimestampTestData.createOrReplaceTempView("tbl")
+
+ withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> "UTC") {
+ // Date formats
+ checkSparkAnswerAndOperator("SELECT c0, date_format(c0, 'yyyy-MM-dd')
from tbl order by c0")
+ checkSparkAnswerAndOperator("SELECT c0, date_format(c0, 'yyyy/MM/dd')
from tbl order by c0")
+
+ // Time formats
+ checkSparkAnswerAndOperator("SELECT c0, date_format(c0, 'HH:mm:ss') from
tbl order by c0")
+ checkSparkAnswerAndOperator("SELECT c0, date_format(c0, 'HH:mm') from
tbl order by c0")
+
+ // Combined formats
+ checkSparkAnswerAndOperator(
+ "SELECT c0, date_format(c0, 'yyyy-MM-dd HH:mm:ss') from tbl order by
c0")
+
+ // Day/month names
+ checkSparkAnswerAndOperator("SELECT c0, date_format(c0, 'EEEE') from tbl
order by c0")
+ checkSparkAnswerAndOperator("SELECT c0, date_format(c0, 'MMMM') from tbl
order by c0")
+
+ // 12-hour time
+ checkSparkAnswerAndOperator("SELECT c0, date_format(c0, 'hh:mm:ss a')
from tbl order by c0")
+
+ // ISO format (use double single-quotes to escape the literal T)
+ checkSparkAnswerAndOperator(
+ "SELECT c0, date_format(c0, \"yyyy-MM-dd'T'HH:mm:ss\") from tbl order
by c0")
+ }
+ }
+
+ test("date_format with literal timestamp") {
+ // Test specific literal timestamp formats
+ // Disable constant folding to ensure Comet actually executes the
expression
+ withSQLConf(
+ SQLConf.SESSION_LOCAL_TIMEZONE.key -> "UTC",
+ SQLConf.OPTIMIZER_EXCLUDED_RULES.key ->
+ "org.apache.spark.sql.catalyst.optimizer.ConstantFolding") {
+ checkSparkAnswerAndOperator(
+ "SELECT date_format(TIMESTAMP '2024-03-15 14:30:45', 'yyyy-MM-dd')")
+ checkSparkAnswerAndOperator(
+ "SELECT date_format(TIMESTAMP '2024-03-15 14:30:45', 'yyyy-MM-dd
HH:mm:ss')")
+ checkSparkAnswerAndOperator(
+ "SELECT date_format(TIMESTAMP '2024-03-15 14:30:45', 'HH:mm:ss')")
+ checkSparkAnswerAndOperator("SELECT date_format(TIMESTAMP '2024-03-15
14:30:45', 'EEEE')")
+ checkSparkAnswerAndOperator(
+ "SELECT date_format(TIMESTAMP '2024-03-15 14:30:45', 'hh:mm:ss a')")
+ }
+ }
+
+ test("date_format with null") {
+ withSQLConf(
+ SQLConf.SESSION_LOCAL_TIMEZONE.key -> "UTC",
+ SQLConf.OPTIMIZER_EXCLUDED_RULES.key ->
+ "org.apache.spark.sql.catalyst.optimizer.ConstantFolding") {
+ checkSparkAnswerAndOperator("SELECT date_format(CAST(NULL AS TIMESTAMP),
'yyyy-MM-dd')")
+ }
+ }
+
+ test("date_format unsupported format falls back to Spark") {
+ createTimestampTestData.createOrReplaceTempView("tbl")
+
+ withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> "UTC") {
+ // Unsupported format string
+ checkSparkAnswerAndFallbackReason(
+ "SELECT c0, date_format(c0, 'yyyy-MM-dd EEEE') from tbl order by c0",
+ "Format 'yyyy-MM-dd EEEE' is not supported")
+ }
+ }
+
+ test("date_format with non-UTC timezone falls back to Spark") {
+ createTimestampTestData.createOrReplaceTempView("tbl")
+
+ val nonUtcTimezones =
+ Seq("America/New_York", "America/Los_Angeles", "Europe/London",
"Asia/Tokyo")
+
+ for (tz <- nonUtcTimezones) {
+ withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> tz) {
+ // Non-UTC timezones should fall back to Spark as Incompatible
+ checkSparkAnswerAndFallbackReason(
+ "SELECT c0, date_format(c0, 'yyyy-MM-dd HH:mm:ss') from tbl order by
c0",
+ s"Non-UTC timezone '$tz' may produce different results")
+ }
+ }
+ }
+
+ test("date_format with non-UTC timezone works when allowIncompatible is
enabled") {
+ createTimestampTestData.createOrReplaceTempView("tbl")
+
+ val nonUtcTimezones = Seq("America/New_York", "Europe/London",
"Asia/Tokyo")
+
+ for (tz <- nonUtcTimezones) {
+ withSQLConf(
+ SQLConf.SESSION_LOCAL_TIMEZONE.key -> tz,
+ "spark.comet.expr.DateFormatClass.allowIncompatible" -> "true") {
+ // With allowIncompatible enabled, Comet will execute the expression
+ // Results may differ from Spark but should not throw errors
+ checkSparkAnswer("SELECT c0, date_format(c0, 'yyyy-MM-dd') from tbl
order by c0")
+ }
+ }
+ }
+
test("unix_date") {
val r = new Random(42)
val schema = StructType(Seq(StructField("c0", DataTypes.DateType, true)))
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]