yaooqinn commented on a change in pull request #26942: [SPARK-30301][SQL] Fix
wrong results when datetimes as fields of complex types
URL: https://github.com/apache/spark/pull/26942#discussion_r397624096
##########
File path: sql/core/src/main/scala/org/apache/spark/sql/execution/HiveResult.scala
##########
@@ -56,78 +56,41 @@ object HiveResult {
// We need the types so we can output struct field names
val types = executedPlan.output.map(_.dataType)
// Reformat to match hive tab delimited output.
- result.map(_.zip(types).map(toHiveString)).map(_.mkString("\t"))
+ result.map(_.zip(types).map(e => toHiveString(e)))
+ .map(_.mkString("\t"))
}
- private val primitiveTypes = Seq(
- StringType,
- IntegerType,
- LongType,
- DoubleType,
- FloatType,
- BooleanType,
- ByteType,
- ShortType,
- DateType,
- TimestampType,
- BinaryType)
-
private lazy val zoneId =
DateTimeUtils.getZoneId(SQLConf.get.sessionLocalTimeZone)
private lazy val dateFormatter = DateFormatter(zoneId)
private lazy val timestampFormatter =
TimestampFormatter.getFractionFormatter(zoneId)
- /** Hive outputs fields of structs slightly differently than top level attributes. */
- private def toHiveStructString(a: (Any, DataType)): String = a match {
- case (struct: Row, StructType(fields)) =>
- struct.toSeq.zip(fields).map {
- case (v, t) => s""""${t.name}":${toHiveStructString((v, t.dataType))}"""
- }.mkString("{", ",", "}")
- case (seq: Seq[_], ArrayType(typ, _)) =>
- seq.map(v => (v, typ)).map(toHiveStructString).mkString("[", ",", "]")
- case (map: Map[_, _], MapType(kType, vType, _)) =>
- map.map {
- case (key, value) =>
- toHiveStructString((key, kType)) + ":" + toHiveStructString((value, vType))
- }.toSeq.sorted.mkString("{", ",", "}")
- case (null, _) => "null"
- case (s: String, StringType) => "\"" + s + "\""
- case (decimal, DecimalType()) => decimal.toString
- case (interval: CalendarInterval, CalendarIntervalType) =>
- SQLConf.get.intervalOutputStyle match {
- case SQL_STANDARD => toSqlStandardString(interval)
- case ISO_8601 => toIso8601String(interval)
- case MULTI_UNITS => toMultiUnitsString(interval)
- }
- case (other, tpe) if primitiveTypes contains tpe => other.toString
- }
-
/** Formats a datum (based on the given data type) and returns the string representation. */
- def toHiveString(a: (Any, DataType)): String = a match {
- case (struct: Row, StructType(fields)) =>
- struct.toSeq.zip(fields).map {
- case (v, t) => s""""${t.name}":${toHiveStructString((v, t.dataType))}"""
- }.mkString("{", ",", "}")
- case (seq: Seq[_], ArrayType(typ, _)) =>
- seq.map(v => (v, typ)).map(toHiveStructString).mkString("[", ",", "]")
- case (map: Map[_, _], MapType(kType, vType, _)) =>
- map.map {
- case (key, value) =>
- toHiveStructString((key, kType)) + ":" + toHiveStructString((value, vType))
- }.toSeq.sorted.mkString("{", ",", "}")
- case (null, _) => "NULL"
+ def toHiveString(a: (Any, DataType), nested: Boolean = false): String = a match {
+ case (null, _) => if (nested) "null" else "NULL"
+ case (b, BooleanType) => b.toString
case (d: Date, DateType) =>
dateFormatter.format(DateTimeUtils.fromJavaDate(d))
case (t: Timestamp, TimestampType) =>
- DateTimeUtils.timestampToString(timestampFormatter, DateTimeUtils.fromJavaTimestamp(t))
+ timestampFormatter.format(DateTimeUtils.fromJavaTimestamp(t))
case (bin: Array[Byte], BinaryType) => new String(bin, StandardCharsets.UTF_8)
Review comment:
```sql
hive> create table bt(k binary);
OK
Time taken: 0.637 seconds
hive> insert into bt values ("spark"), ("hello"), ("3"), ("."), ("0");
Query ID = root_20200325055555_1c227d63-47dd-4899-a879-0b6a98269908
Total jobs = 3
Launching Job 1 out of 3
Number of reduce tasks is set to 0 since there's no reduce operator
Starting Job = job_1585115622286_0001, Tracking URL =
http://quickstart.cloudera:8088/proxy/application_1585115622286_0001/
Kill Command = /usr/lib/hadoop/bin/hadoop job -kill job_1585115622286_0001
Hadoop job information for Stage-1: number of mappers: 1; number of
reducers: 0
2020-03-25 05:57:03,557 Stage-1 map = 0%, reduce = 0%
2020-03-25 05:57:09,766 Stage-1 map = 100%, reduce = 0%, Cumulative CPU
1.86 sec
MapReduce Total cumulative CPU time: 1 seconds 860 msec
Ended Job = job_1585115622286_0001
Stage-4 is selected by condition resolver.
Stage-3 is filtered out by condition resolver.
Stage-5 is filtered out by condition resolver.
Moving data to:
hdfs://quickstart.cloudera:8020/user/hive/warehouse/bt/.hive-staging_hive_2020-03-25_05-56-54_007_226069574359185383-1/-ext-10000
Loading data to table default.bt
Table default.bt stats: [numFiles=1, numRows=5, totalSize=33, rawDataSize=28]
MapReduce Jobs Launched:
Stage-Stage-1: Map: 1 Cumulative CPU: 1.86 sec HDFS Read: 3080 HDFS
Write: 99 SUCCESS
Total MapReduce CPU Time Spent: 1 seconds 860 msec
OK
Time taken: 17.185 seconds
hive> select k, array(k) from bt;
OK
spark [spark]
hello [hello]
3 [3]
. [.]
0 [0]
```
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
With regards,
Apache Git Services
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]