This is an automated email from the ASF dual-hosted git repository.
yao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new bd014b39820 [SPARK-42898][SQL] Mark that string/date casts do not need
time zone id
bd014b39820 is described below
commit bd014b3982045ccd251ec6899976dc9dc6a9e36b
Author: Robert (Bobby) Evans <[email protected]>
AuthorDate: Fri Jul 21 11:56:48 2023 +0800
[SPARK-42898][SQL] Mark that string/date casts do not need time zone id
This PR takes over https://github.com/apache/spark/pull/40524 as the
original author is busy with other work.
It also closes #40524
### What changes were proposed in this pull request?
This removes the need for a time zone id when casting from StringType ->
DateType and DateType -> StringType.
### Why are the changes needed?
It is mostly for consistency with what the code is actually doing. Marking
it as needing a time zone id has no real impact on the actual execution.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Pass GA. And `HivePartitionFilteringSuite` is updated to adapt to the
change.
Closes #42089 from pan3793/SPARK-42898.
Lead-authored-by: Robert (Bobby) Evans <[email protected]>
Co-authored-by: Cheng Pan <[email protected]>
Signed-off-by: Kent Yao <[email protected]>
---
.../spark/sql/catalyst/expressions/Cast.scala | 4 +-
.../hive/client/HivePartitionFilteringSuite.scala | 193 ++++++++++++++-------
2 files changed, 131 insertions(+), 66 deletions(-)
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
index 72df6c33ecd..8e58d546979 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
@@ -266,8 +266,8 @@ object Cast extends QueryErrorsBase {
* * Cast.castToTimestamp
*/
def needsTimeZone(from: DataType, to: DataType): Boolean = (from, to) match {
- case (StringType, TimestampType | DateType) => true
- case (TimestampType | DateType, StringType) => true
+ case (StringType, TimestampType) => true
+ case (TimestampType, StringType) => true
case (DateType, TimestampType) => true
case (TimestampType, DateType) => true
case (TimestampType, TimestampNTZType) => true
diff --git
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HivePartitionFilteringSuite.scala
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HivePartitionFilteringSuite.scala
index b96d28d22cc..48a6c6a2be6 100644
---
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HivePartitionFilteringSuite.scala
+++
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HivePartitionFilteringSuite.scala
@@ -17,7 +17,7 @@
package org.apache.spark.sql.hive.client
-import java.sql.Date
+import java.sql.{Date, Timestamp}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hive.conf.HiveConf
@@ -33,7 +33,7 @@ import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.SQLHelper
import org.apache.spark.sql.internal.SQLConf
-import org.apache.spark.sql.types.{BooleanType, DateType, IntegerType,
LongType, StringType, StructType}
+import org.apache.spark.sql.types.{BooleanType, DateType, IntegerType,
LongType, StringType, StructType, TimestampType}
import org.apache.spark.util.Utils
class HivePartitionFilteringSuite(version: String)
@@ -57,8 +57,10 @@ class HivePartitionFilteringSuite(version: String)
private val chunkValue = Seq("aa", "ab", "ba", "bb")
private val dateValue = Seq("2019-01-01", "2019-01-02", "2019-01-03") ++
defaultPartition
private val dateStrValue = Seq("2020-01-01", "2020-01-02", "2020-01-03",
"20200104", "20200105")
+ private val timestampStrValue = Seq("2021-01-01 00:00:00", "2021-01-02
00:00:00")
private val testPartitionCount =
- dsValue.size * hValue.size * chunkValue.size * dateValue.size *
dateStrValue.size
+ dsValue.size * hValue.size * chunkValue.size * dateValue.size *
dateStrValue.size *
+ timestampStrValue.size
private val storageFormat = CatalogStorageFormat(
locationUri = None,
@@ -78,13 +80,19 @@ class HivePartitionFilteringSuite(version: String)
hadoopConf.set("hive.metastore.warehouse.dir",
Utils.createTempDir().toURI().toString())
val client = buildClient(hadoopConf)
val tableSchema =
- new StructType().add("value", "int").add("ds", "int").add("h",
"int").add("chunk", "string")
- .add("d", "date").add("datestr", "string")
+ new StructType()
+ .add("value", "int")
+ .add("ds", "int")
+ .add("h", "int")
+ .add("chunk", "string")
+ .add("d", "date")
+ .add("datestr", "string")
+ .add("timestampstr", "string")
val table = CatalogTable(
identifier = TableIdentifier("test", Some("default")),
tableType = CatalogTableType.MANAGED,
schema = tableSchema,
- partitionColumnNames = Seq("ds", "h", "chunk", "d", "datestr"),
+ partitionColumnNames = Seq("ds", "h", "chunk", "d", "datestr",
"timestampstr"),
storage = storageFormat)
client.createTable(table, ignoreIfExists = false)
@@ -95,12 +103,14 @@ class HivePartitionFilteringSuite(version: String)
chunk <- chunkValue
date <- dateValue
dateStr <- dateStrValue
+ timestampStr <- timestampStrValue
} yield CatalogTablePartition(Map(
"ds" -> ds.toString,
"h" -> h.toString,
"chunk" -> chunk,
"d" -> date,
- "datestr" -> dateStr
+ "datestr" -> dateStr,
+ "timestampstr" -> timestampStr
), storageFormat)
assert(partitions.size == testPartitionCount)
@@ -152,7 +162,8 @@ class HivePartitionFilteringSuite(version: String)
hValue,
chunkValue,
dateValue,
- dateStrValue)
+ dateStrValue,
+ timestampStrValue)
}
test("getPartitionsByFilter: ds=20170101") {
@@ -162,7 +173,8 @@ class HivePartitionFilteringSuite(version: String)
hValue,
chunkValue,
dateValue,
- dateStrValue)
+ dateStrValue,
+ timestampStrValue)
}
test("getPartitionsByFilter: ds=(20170101 + 1) and h=0") {
@@ -174,7 +186,8 @@ class HivePartitionFilteringSuite(version: String)
0 to 0,
chunkValue,
dateValue,
- dateStrValue)
+ dateStrValue,
+ timestampStrValue)
}
test("getPartitionsByFilter: chunk='aa'") {
@@ -184,7 +197,8 @@ class HivePartitionFilteringSuite(version: String)
hValue,
"aa" :: Nil,
dateValue,
- dateStrValue)
+ dateStrValue,
+ timestampStrValue)
}
test("getPartitionsByFilter: cast(chunk as int)=1 (not a valid partition
predicate)") {
@@ -194,7 +208,8 @@ class HivePartitionFilteringSuite(version: String)
hValue,
chunkValue,
dateValue,
- dateStrValue)
+ dateStrValue,
+ timestampStrValue)
}
test("getPartitionsByFilter: cast(chunk as boolean)=true (not a valid
partition predicate)") {
@@ -204,7 +219,8 @@ class HivePartitionFilteringSuite(version: String)
hValue,
chunkValue,
dateValue,
- dateStrValue)
+ dateStrValue,
+ timestampStrValue)
}
test("getPartitionsByFilter: 20170101=ds") {
@@ -214,7 +230,8 @@ class HivePartitionFilteringSuite(version: String)
hValue,
chunkValue,
dateValue,
- dateStrValue)
+ dateStrValue,
+ timestampStrValue)
}
test("getPartitionsByFilter: ds=20170101 and h=2") {
@@ -224,7 +241,8 @@ class HivePartitionFilteringSuite(version: String)
2 to 2,
chunkValue,
dateValue,
- dateStrValue)
+ dateStrValue,
+ timestampStrValue)
}
test("getPartitionsByFilter: cast(ds as long)=20170101L and h=2") {
@@ -234,7 +252,8 @@ class HivePartitionFilteringSuite(version: String)
2 to 2,
chunkValue,
dateValue,
- dateStrValue)
+ dateStrValue,
+ timestampStrValue)
}
test("getPartitionsByFilter: ds=20170101 or ds=20170102") {
@@ -244,7 +263,8 @@ class HivePartitionFilteringSuite(version: String)
hValue,
chunkValue,
dateValue,
- dateStrValue)
+ dateStrValue,
+ timestampStrValue)
}
test("getPartitionsByFilter: ds in (20170102, 20170103) (using IN
expression)") {
@@ -254,7 +274,8 @@ class HivePartitionFilteringSuite(version: String)
hValue,
chunkValue,
dateValue,
- dateStrValue)
+ dateStrValue,
+ timestampStrValue)
}
test("getPartitionsByFilter: cast(ds as long) in (20170102L, 20170103L)
(using IN expression)") {
@@ -264,7 +285,8 @@ class HivePartitionFilteringSuite(version: String)
hValue,
chunkValue,
dateValue,
- dateStrValue)
+ dateStrValue,
+ timestampStrValue)
}
test("getPartitionsByFilter: ds in (20170102, 20170103) (using INSET
expression)") {
@@ -274,7 +296,8 @@ class HivePartitionFilteringSuite(version: String)
hValue,
chunkValue,
dateValue,
- dateStrValue, {
+ dateStrValue,
+ timestampStrValue, {
case expr @ In(v, list) if expr.inSetConvertible =>
InSet(v, list.map(_.eval(EmptyRow)).toSet)
})
@@ -288,7 +311,8 @@ class HivePartitionFilteringSuite(version: String)
hValue,
chunkValue,
dateValue,
- dateStrValue, {
+ dateStrValue,
+ timestampStrValue, {
case expr @ In(v, list) if expr.inSetConvertible =>
InSet(v, list.map(_.eval(EmptyRow)).toSet)
})
@@ -301,7 +325,8 @@ class HivePartitionFilteringSuite(version: String)
hValue,
"ab" :: "ba" :: Nil,
dateValue,
- dateStrValue)
+ dateStrValue,
+ timestampStrValue)
}
test("getPartitionsByFilter: chunk in ('ab', 'ba') (using INSET
expression)") {
@@ -311,31 +336,38 @@ class HivePartitionFilteringSuite(version: String)
hValue,
"ab" :: "ba" :: Nil,
dateValue,
- dateStrValue, {
+ dateStrValue,
+ timestampStrValue, {
case expr @ In(v, list) if expr.inSetConvertible =>
InSet(v, list.map(_.eval(EmptyRow)).toSet)
})
}
test("getPartitionsByFilter: (ds=20170101 and h>=2) or (ds=20170102 and
h<2)") {
- val day1 = (20170101 to 20170101, 2 to 4, chunkValue, dateValue,
dateStrValue)
- val day2 = (20170102 to 20170102, 0 to 1, chunkValue, dateValue,
dateStrValue)
+ val day1 = (20170101 to 20170101, 2 to 4,
+ chunkValue, dateValue, dateStrValue, timestampStrValue)
+ val day2 = (20170102 to 20170102, 0 to 1,
+ chunkValue, dateValue, dateStrValue, timestampStrValue)
testMetastorePartitionFiltering((attr("ds") === 20170101 && attr("h") >=
2) ||
(attr("ds") === 20170102 && attr("h") < 2), day1 :: day2 :: Nil)
}
test("getPartitionsByFilter: (ds=20170101 and h>=2) or (ds=20170102 and
h<(1+1))") {
- val day1 = (20170101 to 20170101, 2 to 4, chunkValue, dateValue,
dateStrValue)
+ val day1 = (20170101 to 20170101, 2 to 4,
+ chunkValue, dateValue, dateStrValue, timestampStrValue)
// Day 2 should include all hours because we can't build a filter for
h<(7+1)
- val day2 = (20170102 to 20170102, 0 to 4, chunkValue, dateValue,
dateStrValue)
+ val day2 = (20170102 to 20170102, 0 to 4,
+ chunkValue, dateValue, dateStrValue, timestampStrValue)
testMetastorePartitionFiltering((attr("ds") === 20170101 && attr("h") >=
2) ||
(attr("ds") === 20170102 && attr("h") < (Literal(1) + 1)), day1 ::
day2 :: Nil)
}
test("getPartitionsByFilter: " +
"chunk in ('ab', 'ba') and ((ds=20170101 and h>=2) or (ds=20170102 and
h<2))") {
- val day1 = (20170101 to 20170101, 2 to 4, Seq("ab", "ba"), dateValue,
dateStrValue)
- val day2 = (20170102 to 20170102, 0 to 1, Seq("ab", "ba"), dateValue,
dateStrValue)
+ val day1 = (20170101 to 20170101, 2 to 4, Seq("ab", "ba"),
+ dateValue, dateStrValue, timestampStrValue)
+ val day2 = (20170102 to 20170102, 0 to 1, Seq("ab", "ba"),
+ dateValue, dateStrValue, timestampStrValue)
testMetastorePartitionFiltering(attr("chunk").in("ab", "ba") &&
((attr("ds") === 20170101 && attr("h") >= 2) || (attr("ds") ===
20170102 && attr("h") < 2)),
day1 :: day2 :: Nil)
@@ -348,7 +380,8 @@ class HivePartitionFilteringSuite(version: String)
hValue,
Seq("bb"),
dateValue,
- dateStrValue)
+ dateStrValue,
+ timestampStrValue)
}
test("getPartitionsByFilter: chunk startsWith b") {
@@ -358,7 +391,8 @@ class HivePartitionFilteringSuite(version: String)
hValue,
Seq("ba", "bb"),
dateValue,
- dateStrValue)
+ dateStrValue,
+ timestampStrValue)
}
test("getPartitionsByFilter: chunk endsWith b") {
@@ -368,7 +402,8 @@ class HivePartitionFilteringSuite(version: String)
hValue,
Seq("ab", "bb"),
dateValue,
- dateStrValue)
+ dateStrValue,
+ timestampStrValue)
}
test("getPartitionsByFilter: chunk in ('ab', 'ba') and ((cast(ds as
string)>'20170102')") {
@@ -378,7 +413,8 @@ class HivePartitionFilteringSuite(version: String)
hValue,
Seq("ab", "ba"),
dateValue,
- dateStrValue)
+ dateStrValue,
+ timestampStrValue)
}
test("getPartitionsByFilter: ds<>20170101") {
@@ -388,7 +424,8 @@ class HivePartitionFilteringSuite(version: String)
hValue,
chunkValue,
dateValue,
- dateStrValue)
+ dateStrValue,
+ timestampStrValue)
}
test("getPartitionsByFilter: h<>0 and chunk<>ab and d<>2019-01-01") {
@@ -398,7 +435,8 @@ class HivePartitionFilteringSuite(version: String)
1 to 4,
Seq("aa", "ba", "bb"),
Seq("2019-01-02", "2019-01-03"),
- dateStrValue)
+ dateStrValue,
+ timestampStrValue)
}
test("getPartitionsByFilter: d=2019-01-01") {
@@ -408,7 +446,8 @@ class HivePartitionFilteringSuite(version: String)
hValue,
chunkValue,
Seq("2019-01-01"),
- dateStrValue)
+ dateStrValue,
+ timestampStrValue)
}
test("getPartitionsByFilter: d>2019-01-02") {
@@ -418,7 +457,8 @@ class HivePartitionFilteringSuite(version: String)
hValue,
chunkValue,
Seq("2019-01-03"),
- dateStrValue)
+ dateStrValue,
+ timestampStrValue)
}
test("getPartitionsByFilter: In(d, 2019-01-01, 2019-01-02)") {
@@ -429,7 +469,8 @@ class HivePartitionFilteringSuite(version: String)
hValue,
chunkValue,
Seq("2019-01-01", "2019-01-02"),
- dateStrValue)
+ dateStrValue,
+ timestampStrValue)
}
test("getPartitionsByFilter: InSet(d, 2019-01-01, 2019-01-02)") {
@@ -440,7 +481,8 @@ class HivePartitionFilteringSuite(version: String)
hValue,
chunkValue,
Seq("2019-01-01", "2019-01-02"),
- dateStrValue)
+ dateStrValue,
+ timestampStrValue)
}
test("getPartitionsByFilter: not in/inset string type") {
@@ -451,7 +493,8 @@ class HivePartitionFilteringSuite(version: String)
hValue,
result,
dateValue,
- dateStrValue
+ dateStrValue,
+ timestampStrValue
)
}
@@ -482,7 +525,8 @@ class HivePartitionFilteringSuite(version: String)
hValue,
chunkValue,
result,
- dateStrValue
+ dateStrValue,
+ timestampStrValue
)
}
@@ -520,7 +564,8 @@ class HivePartitionFilteringSuite(version: String)
hValue,
chunkValue,
dateValue,
- dateStrValue)
+ dateStrValue,
+ timestampStrValue)
}
test("getPartitionsByFilter: IS NULL / IS NOT NULL") {
@@ -532,7 +577,8 @@ class HivePartitionFilteringSuite(version: String)
hValue,
chunkValue,
dateValue,
- dateStrValue)
+ dateStrValue,
+ timestampStrValue)
}
}
@@ -544,7 +590,8 @@ class HivePartitionFilteringSuite(version: String)
hValue,
chunkValue,
Seq("2019-01-01"),
- dateStrValue)
+ dateStrValue,
+ timestampStrValue)
}
}
@@ -555,7 +602,8 @@ class HivePartitionFilteringSuite(version: String)
hValue,
chunkValue,
Seq("2019-01-02", "2019-01-03"),
- dateStrValue)
+ dateStrValue,
+ timestampStrValue)
}
test("getPartitionsByFilter: d =!= 2019-01-01 || IS NULL") {
@@ -565,7 +613,8 @@ class HivePartitionFilteringSuite(version: String)
hValue,
chunkValue,
dateValue,
- dateStrValue)
+ dateStrValue,
+ timestampStrValue)
}
test("getPartitionsByFilter: d <=> 2019-01-01") {
@@ -575,7 +624,8 @@ class HivePartitionFilteringSuite(version: String)
hValue,
chunkValue,
dateValue,
- dateStrValue)
+ dateStrValue,
+ timestampStrValue)
}
test("getPartitionsByFilter: d <=> null") {
@@ -585,7 +635,8 @@ class HivePartitionFilteringSuite(version: String)
hValue,
chunkValue,
dateValue,
- dateStrValue)
+ dateStrValue,
+ timestampStrValue)
}
test("SPARK-35437: getPartitionsByFilter: substr(chunk,0,1)=a") {
@@ -597,7 +648,8 @@ class HivePartitionFilteringSuite(version: String)
hValue,
t._2,
dateValue,
- dateStrValue)
+ dateStrValue,
+ timestampStrValue)
}
}
}
@@ -612,7 +664,8 @@ class HivePartitionFilteringSuite(version: String)
hValue,
chunkValue,
t._2,
- dateStrValue)
+ dateStrValue,
+ timestampStrValue)
}
}
}
@@ -626,7 +679,8 @@ class HivePartitionFilteringSuite(version: String)
hValue,
chunkValue,
dateValue,
- t._2)
+ t._2,
+ timestampStrValue)
}
}
}
@@ -638,7 +692,7 @@ class HivePartitionFilteringSuite(version: String)
Seq(attr("ds") === 20170101))
assert(filteredPartitions.size == 1 * hValue.size * chunkValue.size *
- dateValue.size * dateStrValue.size)
+ dateValue.size * dateStrValue.size * timestampStrValue.size)
}
}
@@ -654,7 +708,8 @@ class HivePartitionFilteringSuite(version: String)
hValue,
chunkValue,
dateValue,
- prunedPartition)
+ prunedPartition,
+ timestampStrValue)
}
}
@@ -662,12 +717,13 @@ class HivePartitionFilteringSuite(version: String)
Seq("true", "false").foreach { pruningFastFallbackEnabled =>
withSQLConf(pruningFastFallback -> pruningFastFallbackEnabled) {
testMetastorePartitionFiltering(
- attr("datestr").cast(DateType) === Date.valueOf("2020-01-01"),
+ attr("timestampstr").cast(TimestampType) ===
Timestamp.valueOf("2021-01-01 00:00:00"),
dsValue,
hValue,
chunkValue,
dateValue,
- dateStrValue)
+ dateStrValue,
+ timestampStrValue)
}
}
}
@@ -679,10 +735,12 @@ class HivePartitionFilteringSuite(version: String)
expectedH: Seq[Int],
expectedChunks: Seq[String],
expectedD: Seq[String],
- expectedDatestr: Seq[String]): Unit = {
+ expectedDatestr: Seq[String],
+ expectedTimestampstr: Seq[String]): Unit = {
testMetastorePartitionFiltering(
filterExpr,
- (expectedDs, expectedH, expectedChunks, expectedD, expectedDatestr) ::
Nil,
+ (expectedDs, expectedH, expectedChunks, expectedD, expectedDatestr,
+ expectedTimestampstr) :: Nil,
identity)
}
@@ -693,23 +751,26 @@ class HivePartitionFilteringSuite(version: String)
expectedChunks: Seq[String],
expectedD: Seq[String],
expectedDatestr: Seq[String],
+ expectedTimestampStr: Seq[String],
transform: Expression => Expression): Unit = {
testMetastorePartitionFiltering(
filterExpr,
- (expectedDs, expectedH, expectedChunks, expectedD, expectedDatestr) ::
Nil,
+ (expectedDs, expectedH, expectedChunks, expectedD, expectedDatestr,
+ expectedTimestampStr) :: Nil,
transform)
}
private def testMetastorePartitionFiltering(
filterExpr: Expression,
expectedPartitionCubes:
- Seq[(Seq[Int], Seq[Int], Seq[String], Seq[String], Seq[String])]):
Unit = {
+ Seq[(Seq[Int], Seq[Int], Seq[String], Seq[String], Seq[String],
Seq[String])]): Unit = {
testMetastorePartitionFiltering(filterExpr, expectedPartitionCubes,
identity)
}
private def testMetastorePartitionFiltering(
filterExpr: Expression,
- expectedPartitionCubes: Seq[(Seq[Int], Seq[Int], Seq[String],
Seq[String], Seq[String])],
+ expectedPartitionCubes: Seq[
+ (Seq[Int], Seq[Int], Seq[String], Seq[String], Seq[String],
Seq[String])],
transform: Expression => Expression): Unit = {
val filteredPartitions =
client.getPartitionsByFilter(client.getRawHiveTable("default", "test"),
Seq(
@@ -717,25 +778,29 @@ class HivePartitionFilteringSuite(version: String)
))
val expectedPartitionCount = expectedPartitionCubes.map {
- case (expectedDs, expectedH, expectedChunks, expectedD, expectedDatestr)
=>
+ case (expectedDs, expectedH, expectedChunks, expectedD, expectedDatestr,
+ expectedTimestampStr) =>
expectedDs.size * expectedH.size * expectedChunks.size *
- expectedD.size * expectedDatestr.size
+ expectedD.size * expectedDatestr.size * expectedTimestampStr.size
}.sum
val expectedPartitions = expectedPartitionCubes.map {
- case (expectedDs, expectedH, expectedChunks, expectedD, expectedDatestr)
=>
+ case (expectedDs, expectedH, expectedChunks, expectedD, expectedDatestr,
+ expectedTimestampStr) =>
for {
ds <- expectedDs
h <- expectedH
chunk <- expectedChunks
d <- expectedD
datestr <- expectedDatestr
+ timestampstr <- expectedTimestampStr
} yield Set(
"ds" -> ds.toString,
"h" -> h.toString,
"chunk" -> chunk,
"d" -> d,
- "datestr" -> datestr
+ "datestr" -> datestr,
+ "timestampstr" -> timestampstr
)
}.reduce(_ ++ _)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]