Repository: spark
Updated Branches:
  refs/heads/branch-2.0 3500dbc9b -> 704215d30


[SPARK-17335][SQL] Fix ArrayType and MapType CatalogString.

## What changes were proposed in this pull request?
The `catalogString` for `ArrayType` and `MapType` currently calls the
`simpleString` method on its children. This is a problem when the child is a
struct, because the `struct.simpleString` implementation truncates the number
of fields it shows (25 at max). This breaks the generation of a proper
`catalogString`, and has been shown to cause errors while writing to Hive.

This PR fixes this by providing proper `catalogString` implementations for
`ArrayType` and `MapType`.

## How was this patch tested?
Added testing for `catalogString` to `DataTypeSuite`.

Author: Herman van Hovell <hvanhov...@databricks.com>

Closes #14938 from hvanhovell/SPARK-17335.

(cherry picked from commit c2a1576c230697f56f282b6388c79835377e0f2f)
Signed-off-by: Herman van Hovell <hvanhov...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/704215d3
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/704215d3
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/704215d3

Branch: refs/heads/branch-2.0
Commit: 704215d3055bad7957d1d6da1a1a526c0d27d37d
Parents: 3500dbc
Author: Herman van Hovell <hvanhov...@databricks.com>
Authored: Sat Sep 3 19:02:20 2016 +0200
Committer: Herman van Hovell <hvanhov...@databricks.com>
Committed: Sat Sep 3 19:02:35 2016 +0200

----------------------------------------------------------------------
 .../org/apache/spark/sql/types/ArrayType.scala  |   2 +
 .../org/apache/spark/sql/types/MapType.scala    |   2 +
 .../apache/spark/sql/types/DataTypeSuite.scala  |  30 ++++
 .../benchmarks/WideSchemaBenchmark-results.txt  | 174 +++++++++++--------
 4 files changed, 133 insertions(+), 75 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/704215d3/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala
index 520e344..82a03b0 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala
@@ -77,6 +77,8 @@ case class ArrayType(elementType: DataType, containsNull: Boolean) extends DataT
 
   override def simpleString: String = s"array<${elementType.simpleString}>"
 
+  override def catalogString: String = s"array<${elementType.catalogString}>"
+
   override def sql: String = s"ARRAY<${elementType.sql}>"
 
   override private[spark] def asNullable: ArrayType =

http://git-wip-us.apache.org/repos/asf/spark/blob/704215d3/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala
index 454ea40..1789609 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala
@@ -64,6 +64,8 @@ case class MapType(
 
   override def simpleString: String = s"map<${keyType.simpleString},${valueType.simpleString}>"
 
+  override def catalogString: String = s"map<${keyType.catalogString},${valueType.catalogString}>"
+
   override def sql: String = s"MAP<${keyType.sql}, ${valueType.sql}>"
 
   override private[spark] def asNullable: MapType =

http://git-wip-us.apache.org/repos/asf/spark/blob/704215d3/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala
index 6b85f12..569230a 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.sql.types
 
 import org.apache.spark.{SparkException, SparkFunSuite}
+import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
 
 class DataTypeSuite extends SparkFunSuite {
 
@@ -342,4 +343,33 @@ class DataTypeSuite extends SparkFunSuite {
       StructField("a", StringType, nullable = false) ::
       StructField("b", StringType, nullable = false) :: Nil),
     expected = false)
+
+  def checkCatalogString(dt: DataType): Unit = {
+    test(s"catalogString: $dt") {
+      val dt2 = CatalystSqlParser.parseDataType(dt.catalogString)
+      assert(dt === dt2)
+    }
+  }
+  def createStruct(n: Int): StructType = new StructType(Array.tabulate(n) {
+    i => StructField(s"col$i", IntegerType, nullable = true)
+  })
+
+  checkCatalogString(BooleanType)
+  checkCatalogString(ByteType)
+  checkCatalogString(ShortType)
+  checkCatalogString(IntegerType)
+  checkCatalogString(LongType)
+  checkCatalogString(FloatType)
+  checkCatalogString(DoubleType)
+  checkCatalogString(DecimalType(10, 5))
+  checkCatalogString(BinaryType)
+  checkCatalogString(StringType)
+  checkCatalogString(DateType)
+  checkCatalogString(TimestampType)
+  checkCatalogString(createStruct(4))
+  checkCatalogString(createStruct(40))
+  checkCatalogString(ArrayType(IntegerType))
+  checkCatalogString(ArrayType(createStruct(40)))
+  checkCatalogString(MapType(IntegerType, StringType))
+  checkCatalogString(MapType(IntegerType, createStruct(40)))
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/704215d3/sql/core/benchmarks/WideSchemaBenchmark-results.txt
----------------------------------------------------------------------
diff --git a/sql/core/benchmarks/WideSchemaBenchmark-results.txt 
b/sql/core/benchmarks/WideSchemaBenchmark-results.txt
index ea6a661..0b9f791 100644
--- a/sql/core/benchmarks/WideSchemaBenchmark-results.txt
+++ b/sql/core/benchmarks/WideSchemaBenchmark-results.txt
@@ -1,93 +1,117 @@
-OpenJDK 64-Bit Server VM 1.8.0_66-internal-b17 on Linux 4.2.0-36-generic
-Intel(R) Xeon(R) CPU E5-1650 v3 @ 3.50GHz
+Java HotSpot(TM) 64-Bit Server VM 1.8.0_92-b14 on Mac OS X 10.11.6
+Intel(R) Core(TM) i7-4980HQ CPU @ 2.80GHz
+
 parsing large select:                    Best/Avg Time(ms)    Rate(M/s)   Per 
Row(ns)   Relative
 
------------------------------------------------------------------------------------------------
-1 select expressions                             3 /    5          0.0     
2967064.0       1.0X
-100 select expressions                          11 /   12          0.0    
11369518.0       0.3X
-2500 select expressions                        243 /  250          0.0   
242561004.0       0.0X
+1 select expressions                             2 /    4          0.0     
2050147.0       1.0X
+100 select expressions                           6 /    7          0.0     
6123412.0       0.3X
+2500 select expressions                        135 /  141          0.0   
134623148.0       0.0X
+
+Java HotSpot(TM) 64-Bit Server VM 1.8.0_92-b14 on Mac OS X 10.11.6
+Intel(R) Core(TM) i7-4980HQ CPU @ 2.80GHz
 
-OpenJDK 64-Bit Server VM 1.8.0_66-internal-b17 on Linux 4.2.0-36-generic
-Intel(R) Xeon(R) CPU E5-1650 v3 @ 3.50GHz
 many column field r/w:                   Best/Avg Time(ms)    Rate(M/s)   Per 
Row(ns)   Relative
 
------------------------------------------------------------------------------------------------
-1 cols x 100000 rows (read in-mem)              28 /   40          3.6         
278.8       1.0X
-1 cols x 100000 rows (exec in-mem)              28 /   42          3.5         
284.0       1.0X
-1 cols x 100000 rows (read parquet)             23 /   35          4.4         
228.8       1.2X
-1 cols x 100000 rows (write parquet)           163 /  182          0.6        
1633.0       0.2X
-100 cols x 1000 rows (read in-mem)              27 /   39          3.7         
266.9       1.0X
-100 cols x 1000 rows (exec in-mem)              48 /   79          2.1         
481.7       0.6X
-100 cols x 1000 rows (read parquet)             25 /   36          3.9         
254.3       1.1X
-100 cols x 1000 rows (write parquet)           182 /  196          0.5        
1819.5       0.2X
-2500 cols x 40 rows (read in-mem)              280 /  315          0.4        
2797.1       0.1X
-2500 cols x 40 rows (exec in-mem)              606 /  638          0.2        
6064.3       0.0X
-2500 cols x 40 rows (read parquet)             836 /  843          0.1        
8356.4       0.0X
-2500 cols x 40 rows (write parquet)            490 /  522          0.2        
4900.6       0.1X
+1 cols x 100000 rows (read in-mem)              16 /   18          6.3         
158.6       1.0X
+1 cols x 100000 rows (exec in-mem)              17 /   19          6.0         
166.7       1.0X
+1 cols x 100000 rows (read parquet)             24 /   26          4.3         
235.1       0.7X
+1 cols x 100000 rows (write parquet)            81 /   85          1.2         
811.3       0.2X
+100 cols x 1000 rows (read in-mem)              17 /   19          6.0         
166.2       1.0X
+100 cols x 1000 rows (exec in-mem)              25 /   27          4.0         
249.2       0.6X
+100 cols x 1000 rows (read parquet)             23 /   25          4.4         
226.0       0.7X
+100 cols x 1000 rows (write parquet)            83 /   87          1.2         
831.0       0.2X
+2500 cols x 40 rows (read in-mem)              132 /  137          0.8        
1322.9       0.1X
+2500 cols x 40 rows (exec in-mem)              326 /  330          0.3        
3260.6       0.0X
+2500 cols x 40 rows (read parquet)             831 /  839          0.1        
8305.8       0.0X
+2500 cols x 40 rows (write parquet)            237 /  245          0.4        
2372.6       0.1X
+
+Java HotSpot(TM) 64-Bit Server VM 1.8.0_92-b14 on Mac OS X 10.11.6
+Intel(R) Core(TM) i7-4980HQ CPU @ 2.80GHz
 
-OpenJDK 64-Bit Server VM 1.8.0_66-internal-b17 on Linux 4.2.0-36-generic
-Intel(R) Xeon(R) CPU E5-1650 v3 @ 3.50GHz
 wide shallowly nested struct field r/w:  Best/Avg Time(ms)    Rate(M/s)   Per 
Row(ns)   Relative
 
------------------------------------------------------------------------------------------------
-1 wide x 100000 rows (read in-mem)              22 /   35          4.6         
216.0       1.0X
-1 wide x 100000 rows (exec in-mem)              40 /   63          2.5         
400.6       0.5X
-1 wide x 100000 rows (read parquet)             93 /  134          1.1         
933.9       0.2X
-1 wide x 100000 rows (write parquet)           133 /  174          0.7        
1334.3       0.2X
-100 wide x 1000 rows (read in-mem)              22 /   44          4.5         
223.3       1.0X
-100 wide x 1000 rows (exec in-mem)              88 /  138          1.1         
878.6       0.2X
-100 wide x 1000 rows (read parquet)            117 /  186          0.9        
1172.0       0.2X
-100 wide x 1000 rows (write parquet)           144 /  174          0.7        
1441.6       0.1X
-2500 wide x 40 rows (read in-mem)               36 /   57          2.8         
358.9       0.6X
-2500 wide x 40 rows (exec in-mem)             1466 / 1507          0.1       
14656.6       0.0X
-2500 wide x 40 rows (read parquet)             690 /  802          0.1        
6898.2       0.0X
-2500 wide x 40 rows (write parquet)            197 /  207          0.5        
1970.9       0.1X
+1 wide x 100000 rows (read in-mem)              15 /   17          6.6         
151.0       1.0X
+1 wide x 100000 rows (exec in-mem)              20 /   22          5.1         
196.6       0.8X
+1 wide x 100000 rows (read parquet)             59 /   63          1.7         
592.8       0.3X
+1 wide x 100000 rows (write parquet)            81 /   87          1.2         
814.6       0.2X
+100 wide x 1000 rows (read in-mem)              21 /   25          4.8         
208.7       0.7X
+100 wide x 1000 rows (exec in-mem)              72 /   81          1.4         
718.5       0.2X
+100 wide x 1000 rows (read parquet)             75 /   85          1.3         
752.6       0.2X
+100 wide x 1000 rows (write parquet)            88 /   95          1.1         
876.7       0.2X
+2500 wide x 40 rows (read in-mem)               28 /   34          3.5         
282.2       0.5X
+2500 wide x 40 rows (exec in-mem)             1269 / 1284          0.1       
12688.1       0.0X
+2500 wide x 40 rows (read parquet)             549 /  578          0.2        
5493.4       0.0X
+2500 wide x 40 rows (write parquet)             96 /  104          1.0         
959.1       0.2X
+
+Java HotSpot(TM) 64-Bit Server VM 1.8.0_92-b14 on Mac OS X 10.11.6
+Intel(R) Core(TM) i7-4980HQ CPU @ 2.80GHz
 
-OpenJDK 64-Bit Server VM 1.8.0_66-internal-b17 on Linux 4.2.0-36-generic
-Intel(R) Xeon(R) CPU E5-1650 v3 @ 3.50GHz
 deeply nested struct field r/w:          Best/Avg Time(ms)    Rate(M/s)   Per 
Row(ns)   Relative
 
------------------------------------------------------------------------------------------------
-1 deep x 100000 rows (read in-mem)              22 /   35          4.5         
223.9       1.0X
-1 deep x 100000 rows (exec in-mem)              28 /   52          3.6         
280.6       0.8X
-1 deep x 100000 rows (read parquet)             41 /   65          2.4         
410.5       0.5X
-1 deep x 100000 rows (write parquet)           163 /  173          0.6        
1634.5       0.1X
-100 deep x 1000 rows (read in-mem)              43 /   63          2.3         
425.9       0.5X
-100 deep x 1000 rows (exec in-mem)             232 /  280          0.4        
2321.7       0.1X
-100 deep x 1000 rows (read parquet)           1989 / 2281          0.1       
19886.6       0.0X
-100 deep x 1000 rows (write parquet)           144 /  184          0.7        
1442.6       0.2X
-250 deep x 400 rows (read in-mem)               68 /   95          1.5         
680.9       0.3X
-250 deep x 400 rows (exec in-mem)             1310 / 1403          0.1       
13096.4       0.0X
-250 deep x 400 rows (read parquet)          41477 / 41847          0.0      
414766.8       0.0X
-250 deep x 400 rows (write parquet)            243 /  272          0.4        
2433.1       0.1X
+1 deep x 100000 rows (read in-mem)              14 /   16          7.0         
143.8       1.0X
+1 deep x 100000 rows (exec in-mem)              17 /   19          5.9         
169.7       0.8X
+1 deep x 100000 rows (read parquet)             33 /   35          3.1         
327.0       0.4X
+1 deep x 100000 rows (write parquet)            79 /   84          1.3         
786.9       0.2X
+100 deep x 1000 rows (read in-mem)              21 /   24          4.7         
211.3       0.7X
+100 deep x 1000 rows (exec in-mem)             221 /  235          0.5        
2214.5       0.1X
+100 deep x 1000 rows (read parquet)           1928 / 1952          0.1       
19277.1       0.0X
+100 deep x 1000 rows (write parquet)            91 /   96          1.1         
909.5       0.2X
+250 deep x 400 rows (read in-mem)               57 /   61          1.8         
567.1       0.3X
+250 deep x 400 rows (exec in-mem)             1329 / 1385          0.1       
13291.8       0.0X
+250 deep x 400 rows (read parquet)          36563 / 36750          0.0      
365630.2       0.0X
+250 deep x 400 rows (write parquet)            126 /  130          0.8        
1262.0       0.1X
+
+Java HotSpot(TM) 64-Bit Server VM 1.8.0_92-b14 on Mac OS X 10.11.6
+Intel(R) Core(TM) i7-4980HQ CPU @ 2.80GHz
 
-OpenJDK 64-Bit Server VM 1.8.0_66-internal-b17 on Linux 4.2.0-36-generic
-Intel(R) Xeon(R) CPU E5-1650 v3 @ 3.50GHz
 bushy struct field r/w:                  Best/Avg Time(ms)    Rate(M/s)   Per 
Row(ns)   Relative
 
------------------------------------------------------------------------------------------------
-1 x 1 deep x 100000 rows (read in-mem)          23 /   36          4.4         
229.8       1.0X
-1 x 1 deep x 100000 rows (exec in-mem)          27 /   48          3.7         
269.6       0.9X
-1 x 1 deep x 100000 rows (read parquet)         25 /   33          4.0         
247.5       0.9X
-1 x 1 deep x 100000 rows (write parquet)        82 /  134          1.2         
821.1       0.3X
-128 x 8 deep x 1000 rows (read in-mem)          19 /   29          5.3         
189.5       1.2X
-128 x 8 deep x 1000 rows (exec in-mem)         144 /  165          0.7        
1440.4       0.2X
-128 x 8 deep x 1000 rows (read parquet)        117 /  159          0.9        
1174.4       0.2X
-128 x 8 deep x 1000 rows (write parquet)       135 /  162          0.7        
1349.0       0.2X
-1024 x 11 deep x 100 rows (read in-mem)         30 /   49          3.3         
304.4       0.8X
-1024 x 11 deep x 100 rows (exec in-mem)       1146 / 1183          0.1       
11457.6       0.0X
-1024 x 11 deep x 100 rows (read parquet)       712 /  758          0.1        
7119.5       0.0X
-1024 x 11 deep x 100 rows (write parquet)       104 /  143          1.0        
1037.3       0.2X
+1 x 1 deep x 100000 rows (read in-mem)          13 /   15          7.8         
127.7       1.0X
+1 x 1 deep x 100000 rows (exec in-mem)          15 /   17          6.6         
151.5       0.8X
+1 x 1 deep x 100000 rows (read parquet)         20 /   23          5.0         
198.3       0.6X
+1 x 1 deep x 100000 rows (write parquet)        77 /   82          1.3         
770.4       0.2X
+128 x 8 deep x 1000 rows (read in-mem)          12 /   14          8.2         
122.5       1.0X
+128 x 8 deep x 1000 rows (exec in-mem)         124 /  140          0.8        
1241.2       0.1X
+128 x 8 deep x 1000 rows (read parquet)         69 /   74          1.4         
693.9       0.2X
+128 x 8 deep x 1000 rows (write parquet)        78 /   83          1.3         
777.7       0.2X
+1024 x 11 deep x 100 rows (read in-mem)         25 /   29          4.1         
246.1       0.5X
+1024 x 11 deep x 100 rows (exec in-mem)       1197 / 1223          0.1       
11974.6       0.0X
+1024 x 11 deep x 100 rows (read parquet)       426 /  433          0.2        
4263.7       0.0X
+1024 x 11 deep x 100 rows (write parquet)        91 /   98          1.1        
 913.5       0.1X
+
+Java HotSpot(TM) 64-Bit Server VM 1.8.0_92-b14 on Mac OS X 10.11.6
+Intel(R) Core(TM) i7-4980HQ CPU @ 2.80GHz
 
-OpenJDK 64-Bit Server VM 1.8.0_66-internal-b17 on Linux 4.2.0-36-generic
-Intel(R) Xeon(R) CPU E5-1650 v3 @ 3.50GHz
 wide array field r/w:                    Best/Avg Time(ms)    Rate(M/s)   Per 
Row(ns)   Relative
 
------------------------------------------------------------------------------------------------
-1 wide x 100000 rows (read in-mem)              18 /   31          5.6         
179.3       1.0X
-1 wide x 100000 rows (exec in-mem)              31 /   47          3.2         
310.2       0.6X
-1 wide x 100000 rows (read parquet)             45 /   73          2.2         
445.1       0.4X
-1 wide x 100000 rows (write parquet)           109 /  140          0.9        
1085.9       0.2X
-100 wide x 1000 rows (read in-mem)              17 /   25          5.8         
172.7       1.0X
-100 wide x 1000 rows (exec in-mem)              18 /   22          5.4         
184.6       1.0X
-100 wide x 1000 rows (read parquet)             26 /   42          3.8         
261.8       0.7X
-100 wide x 1000 rows (write parquet)           150 /  164          0.7        
1499.4       0.1X
-2500 wide x 40 rows (read in-mem)               19 /   31          5.1         
194.7       0.9X
-2500 wide x 40 rows (exec in-mem)               19 /   24          5.3         
188.5       1.0X
-2500 wide x 40 rows (read parquet)              33 /   47          3.0         
334.4       0.5X
-2500 wide x 40 rows (write parquet)            153 /  164          0.7        
1528.2       0.1X
+1 wide x 100000 rows (read in-mem)              14 /   16          7.0         
143.2       1.0X
+1 wide x 100000 rows (exec in-mem)              17 /   19          5.9         
170.9       0.8X
+1 wide x 100000 rows (read parquet)             43 /   46          2.3         
434.1       0.3X
+1 wide x 100000 rows (write parquet)            78 /   83          1.3         
777.6       0.2X
+100 wide x 1000 rows (read in-mem)              11 /   13          9.0         
111.5       1.3X
+100 wide x 1000 rows (exec in-mem)              13 /   15          7.8         
128.3       1.1X
+100 wide x 1000 rows (read parquet)             24 /   27          4.1         
245.0       0.6X
+100 wide x 1000 rows (write parquet)            74 /   80          1.4         
740.5       0.2X
+2500 wide x 40 rows (read in-mem)               11 /   13          9.1         
109.5       1.3X
+2500 wide x 40 rows (exec in-mem)               13 /   15          7.7         
129.4       1.1X
+2500 wide x 40 rows (read parquet)              24 /   26          4.1         
241.3       0.6X
+2500 wide x 40 rows (write parquet)             75 /   81          1.3         
751.8       0.2X
+
+Java HotSpot(TM) 64-Bit Server VM 1.8.0_92-b14 on Mac OS X 10.11.6
+Intel(R) Core(TM) i7-4980HQ CPU @ 2.80GHz
+
+wide map field r/w:                      Best/Avg Time(ms)    Rate(M/s)   Per 
Row(ns)   Relative
+------------------------------------------------------------------------------------------------
+1 wide x 100000 rows (read in-mem)              16 /   18          6.2         
162.6       1.0X
+1 wide x 100000 rows (exec in-mem)              21 /   23          4.8         
208.2       0.8X
+1 wide x 100000 rows (read parquet)             54 /   59          1.8         
543.6       0.3X
+1 wide x 100000 rows (write parquet)            80 /   86          1.2         
804.5       0.2X
+100 wide x 1000 rows (read in-mem)              11 /   13          8.7         
114.5       1.4X
+100 wide x 1000 rows (exec in-mem)              14 /   16          7.0         
143.5       1.1X
+100 wide x 1000 rows (read parquet)             30 /   32          3.3         
300.4       0.5X
+100 wide x 1000 rows (write parquet)            75 /   80          1.3         
749.9       0.2X
+2500 wide x 40 rows (read in-mem)               13 /   15          7.8         
128.1       1.3X
+2500 wide x 40 rows (exec in-mem)               15 /   18          6.5         
153.6       1.1X
+2500 wide x 40 rows (read parquet)              30 /   33          3.3         
304.4       0.5X
+2500 wide x 40 rows (write parquet)             77 /   83          1.3         
768.5       0.2X
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to