yaooqinn commented on a change in pull request #26942: [SPARK-30301][SQL] Fix
wrong results when datetimes as fields of complex types
URL: https://github.com/apache/spark/pull/26942#discussion_r397624096
##########
File path: sql/core/src/main/scala/org/apache/spark/sql/execution/HiveResult.scala
##########
@@ -56,78 +56,41 @@ object HiveResult {
// We need the types so we can output struct field names
val types = executedPlan.output.map(_.dataType)
// Reformat to match hive tab delimited output.
- result.map(_.zip(types).map(toHiveString)).map(_.mkString("\t"))
+ result.map(_.zip(types).map(e => toHiveString(e)))
+ .map(_.mkString("\t"))
}
- private val primitiveTypes = Seq(
- StringType,
- IntegerType,
- LongType,
- DoubleType,
- FloatType,
- BooleanType,
- ByteType,
- ShortType,
- DateType,
- TimestampType,
- BinaryType)
-
private lazy val zoneId =
DateTimeUtils.getZoneId(SQLConf.get.sessionLocalTimeZone)
private lazy val dateFormatter = DateFormatter(zoneId)
private lazy val timestampFormatter =
TimestampFormatter.getFractionFormatter(zoneId)
- /** Hive outputs fields of structs slightly differently than top level attributes. */
- private def toHiveStructString(a: (Any, DataType)): String = a match {
- case (struct: Row, StructType(fields)) =>
- struct.toSeq.zip(fields).map {
- case (v, t) => s""""${t.name}":${toHiveStructString((v, t.dataType))}"""
- }.mkString("{", ",", "}")
- case (seq: Seq[_], ArrayType(typ, _)) =>
- seq.map(v => (v, typ)).map(toHiveStructString).mkString("[", ",", "]")
- case (map: Map[_, _], MapType(kType, vType, _)) =>
- map.map {
- case (key, value) =>
- toHiveStructString((key, kType)) + ":" + toHiveStructString((value, vType))
- }.toSeq.sorted.mkString("{", ",", "}")
- case (null, _) => "null"
- case (s: String, StringType) => "\"" + s + "\""
- case (decimal, DecimalType()) => decimal.toString
- case (interval: CalendarInterval, CalendarIntervalType) =>
- SQLConf.get.intervalOutputStyle match {
- case SQL_STANDARD => toSqlStandardString(interval)
- case ISO_8601 => toIso8601String(interval)
- case MULTI_UNITS => toMultiUnitsString(interval)
- }
- case (other, tpe) if primitiveTypes contains tpe => other.toString
- }
-
/** Formats a datum (based on the given data type) and returns the string representation. */
- def toHiveString(a: (Any, DataType)): String = a match {
- case (struct: Row, StructType(fields)) =>
- struct.toSeq.zip(fields).map {
- case (v, t) => s""""${t.name}":${toHiveStructString((v, t.dataType))}"""
- }.mkString("{", ",", "}")
- case (seq: Seq[_], ArrayType(typ, _)) =>
- seq.map(v => (v, typ)).map(toHiveStructString).mkString("[", ",", "]")
- case (map: Map[_, _], MapType(kType, vType, _)) =>
- map.map {
- case (key, value) =>
- toHiveStructString((key, kType)) + ":" + toHiveStructString((value, vType))
- }.toSeq.sorted.mkString("{", ",", "}")
- case (null, _) => "NULL"
+ def toHiveString(a: (Any, DataType), nested: Boolean = false): String = a match {
+ case (null, _) => if (nested) "null" else "NULL"
+ case (b, BooleanType) => b.toString
case (d: Date, DateType) =>
dateFormatter.format(DateTimeUtils.fromJavaDate(d))
case (t: Timestamp, TimestampType) =>
- DateTimeUtils.timestampToString(timestampFormatter, DateTimeUtils.fromJavaTimestamp(t))
+ timestampFormatter.format(DateTimeUtils.fromJavaTimestamp(t))
case (bin: Array[Byte], BinaryType) => new String(bin, StandardCharsets.UTF_8)
Review comment:
```sql
hive> create table bt(k binary);
OK
Time taken: 0.637 seconds
hive> insert into bt values ("spark"), ("hello"), ("3"), ("."), ("0");
Query ID = root_20200325055555_1c227d63-47dd-4899-a879-0b6a98269908
Total jobs = 3
Launching Job 1 out of 3
Number of reduce tasks is set to 0 since there's no reduce operator
Starting Job = job_1585115622286_0001, Tracking URL =
http://quickstart.cloudera:8088/proxy/application_1585115622286_0001/
Kill Command = /usr/lib/hadoop/bin/hadoop job -kill job_1585115622286_0001
Hadoop job information for Stage-1: number of mappers: 1; number of
reducers: 0
2020-03-25 05:57:03,557 Stage-1 map = 0%, reduce = 0%
2020-03-25 05:57:09,766 Stage-1 map = 100%, reduce = 0%, Cumulative CPU
1.86 sec
MapReduce Total cumulative CPU time: 1 seconds 860 msec
Ended Job = job_1585115622286_0001
Stage-4 is selected by condition resolver.
Stage-3 is filtered out by condition resolver.
Stage-5 is filtered out by condition resolver.
Moving data to:
hdfs://quickstart.cloudera:8020/user/hive/warehouse/bt/.hive-staging_hive_2020-03-25_05-56-54_007_226069574359185383-1/-ext-10000
Loading data to table default.bt
Table default.bt stats: [numFiles=1, numRows=5, totalSize=33, rawDataSize=28]
MapReduce Jobs Launched:
Stage-Stage-1: Map: 1 Cumulative CPU: 1.86 sec HDFS Read: 3080 HDFS
Write: 99 SUCCESS
Total MapReduce CPU Time Spent: 1 seconds 860 msec
OK
Time taken: 17.185 seconds
hive> select k, array(k) from bt;
OK
spark [spark]
hello [hello]
3 [3]
. [.]
0 [0]
```
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
With regards,
Apache Git Services
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]