yaooqinn commented on a change in pull request #26942: [SPARK-30301][SQL] Fix
wrong results when datetimes as fields of complex types
URL: https://github.com/apache/spark/pull/26942#discussion_r397625334
##########
File path:
sql/core/src/main/scala/org/apache/spark/sql/execution/HiveResult.scala
##########
@@ -56,78 +56,41 @@ object HiveResult {
// We need the types so we can output struct field names
val types = executedPlan.output.map(_.dataType)
// Reformat to match hive tab delimited output.
- result.map(_.zip(types).map(toHiveString)).map(_.mkString("\t"))
+ result.map(_.zip(types).map(e => toHiveString(e)))
+ .map(_.mkString("\t"))
}
- private val primitiveTypes = Seq(
- StringType,
- IntegerType,
- LongType,
- DoubleType,
- FloatType,
- BooleanType,
- ByteType,
- ShortType,
- DateType,
- TimestampType,
- BinaryType)
-
private lazy val zoneId =
DateTimeUtils.getZoneId(SQLConf.get.sessionLocalTimeZone)
private lazy val dateFormatter = DateFormatter(zoneId)
private lazy val timestampFormatter =
TimestampFormatter.getFractionFormatter(zoneId)
- /** Hive outputs fields of structs slightly differently than top level
attributes. */
- private def toHiveStructString(a: (Any, DataType)): String = a match {
- case (struct: Row, StructType(fields)) =>
- struct.toSeq.zip(fields).map {
- case (v, t) => s""""${t.name}":${toHiveStructString((v,
t.dataType))}"""
- }.mkString("{", ",", "}")
- case (seq: Seq[_], ArrayType(typ, _)) =>
- seq.map(v => (v, typ)).map(toHiveStructString).mkString("[", ",", "]")
- case (map: Map[_, _], MapType(kType, vType, _)) =>
- map.map {
- case (key, value) =>
- toHiveStructString((key, kType)) + ":" + toHiveStructString((value,
vType))
- }.toSeq.sorted.mkString("{", ",", "}")
- case (null, _) => "null"
- case (s: String, StringType) => "\"" + s + "\""
- case (decimal, DecimalType()) => decimal.toString
- case (interval: CalendarInterval, CalendarIntervalType) =>
- SQLConf.get.intervalOutputStyle match {
- case SQL_STANDARD => toSqlStandardString(interval)
- case ISO_8601 => toIso8601String(interval)
- case MULTI_UNITS => toMultiUnitsString(interval)
- }
- case (other, tpe) if primitiveTypes contains tpe => other.toString
- }
-
/** Formats a datum (based on the given data type) and returns the string
representation. */
- def toHiveString(a: (Any, DataType)): String = a match {
- case (struct: Row, StructType(fields)) =>
- struct.toSeq.zip(fields).map {
- case (v, t) => s""""${t.name}":${toHiveStructString((v,
t.dataType))}"""
- }.mkString("{", ",", "}")
- case (seq: Seq[_], ArrayType(typ, _)) =>
- seq.map(v => (v, typ)).map(toHiveStructString).mkString("[", ",", "]")
- case (map: Map[_, _], MapType(kType, vType, _)) =>
- map.map {
- case (key, value) =>
- toHiveStructString((key, kType)) + ":" + toHiveStructString((value,
vType))
- }.toSeq.sorted.mkString("{", ",", "}")
- case (null, _) => "NULL"
+ def toHiveString(a: (Any, DataType), nested: Boolean = false): String = a
match {
+ case (null, _) => if (nested) "null" else "NULL"
+ case (b, BooleanType) => b.toString
case (d: Date, DateType) =>
dateFormatter.format(DateTimeUtils.fromJavaDate(d))
case (t: Timestamp, TimestampType) =>
- DateTimeUtils.timestampToString(timestampFormatter,
DateTimeUtils.fromJavaTimestamp(t))
+ timestampFormatter.format(DateTimeUtils.fromJavaTimestamp(t))
case (bin: Array[Byte], BinaryType) => new String(bin,
StandardCharsets.UTF_8)
Review comment:
```sql
[root@quickstart /]# beeline -u 'jdbc:hive2://localhost:10000'
2020-03-25 06:02:31,786 WARN [main] mapreduce.TableMapReduceUtil: The
hbase-prefix-tree module jar containing PrefixTreeCodec is not present.
Continuing without it.
scan complete in 4ms
Connecting to jdbc:hive2://localhost:10000
Connected to: Apache Hive (version 1.1.0-cdh5.7.0)
Driver: Hive JDBC (version 1.1.0-cdh5.7.0)
Transaction isolation: TRANSACTION_REPEATABLE_READ
Beeline version 1.1.0-cdh5.7.0 by Apache Hive
0: jdbc:hive2://localhost:10000> select k, array(k) from bt;
INFO : Compiling
command(queryId=hive_20200325060202_e14c040e-6556-4eb3-98c8-295e929d1a65):
select k, array(k) from bt
INFO : Semantic Analysis Completed
INFO : Returning Hive schema: Schema(fieldSchemas:[FieldSchema(name:k,
type:binary, comment:null), FieldSchema(name:_c1, type:array<binary>,
comment:null)], properties:null)
INFO : Completed compiling
command(queryId=hive_20200325060202_e14c040e-6556-4eb3-98c8-295e929d1a65); Time
taken: 1.108 seconds
INFO : Concurrency mode is disabled, not creating a lock manager
INFO : Executing
command(queryId=hive_20200325060202_e14c040e-6556-4eb3-98c8-295e929d1a65):
select k, array(k) from bt
INFO : Completed executing
command(queryId=hive_20200325060202_e14c040e-6556-4eb3-98c8-295e929d1a65); Time
taken: 0.005 seconds
INFO : OK
+--------+----------+--+
| k | _c1 |
+--------+----------+--+
| spark | [spark] |
| hello | [hello] |
| 3 | [3] |
| . | [.] |
| 0 | [0] |
+--------+----------+--+
```
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
With regards,
Apache Git Services
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org