Repository: spark Updated Branches: refs/heads/branch-2.0 1d5c7f452 -> 2b18bd4b9
[SPARK-18853][SQL] Project (UnaryNode) is way too aggressive in estimating statistics This patch reduces the default estimated number of elements for arrays and maps from 100 to 1. The issue with the 100 number is that when nested (e.g. an array of map), 100 * 100 would be used as the default size. This sounds like just an overestimation, which doesn't seem that bad (since it is usually better to overestimate than underestimate). However, due to the way we assume the size output for Project (new estimated column size / old estimated column size), this overestimation can become an underestimation. In this case it is actually generally safer to assume 1 default element. This should be covered by existing tests. Author: Reynold Xin <[email protected]> Closes #16274 from rxin/SPARK-18853. (cherry picked from commit 5d799473696a15fddd54ec71a93b6f8cb169810c) Signed-off-by: Herman van Hovell <[email protected]> (cherry picked from commit e8866f9fc62095b78421d461549f7eaf8e9070b3) Signed-off-by: Herman van Hovell <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2b18bd4b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2b18bd4b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2b18bd4b Branch: refs/heads/branch-2.0 Commit: 2b18bd4b99c0c85f28bd174751f52f940f78d2e8 Parents: 1d5c7f4 Author: Reynold Xin <[email protected]> Authored: Wed Dec 14 21:22:49 2016 +0100 Committer: Herman van Hovell <[email protected]> Committed: Wed Dec 14 21:25:07 2016 +0100 ---------------------------------------------------------------------- .../scala/org/apache/spark/sql/types/ArrayType.scala | 6 +++--- .../scala/org/apache/spark/sql/types/MapType.scala | 6 +++--- .../org/apache/spark/sql/types/DataTypeSuite.scala | 14 +++++++------- 3 files changed, 13 insertions(+), 13 deletions(-) ---------------------------------------------------------------------- 
http://git-wip-us.apache.org/repos/asf/spark/blob/2b18bd4b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala ---------------------------------------------------------------------- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala index 82a03b0..8c4df58 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala @@ -70,10 +70,10 @@ case class ArrayType(elementType: DataType, containsNull: Boolean) extends DataT ("containsNull" -> containsNull) /** - * The default size of a value of the ArrayType is 100 * the default size of the element type. - * (We assume that there are 100 elements). + * The default size of a value of the ArrayType is the default size of the element type. + * We assume that there is only 1 element on average in an array. See SPARK-18853. */ - override def defaultSize: Int = 100 * elementType.defaultSize + override def defaultSize: Int = 1 * elementType.defaultSize override def simpleString: String = s"array<${elementType.simpleString}>" http://git-wip-us.apache.org/repos/asf/spark/blob/2b18bd4b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala ---------------------------------------------------------------------- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala index 1789609..4d59920 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala @@ -57,10 +57,10 @@ case class MapType( /** * The default size of a value of the MapType is - * 100 * (the default size of the key type + the default size of the value type). - * (We assume that there are 100 elements). 
+ * (the default size of the key type + the default size of the value type). + * We assume that there is only 1 element on average in a map. See SPARK-18853. */ - override def defaultSize: Int = 100 * (keyType.defaultSize + valueType.defaultSize) + override def defaultSize: Int = 1 * (keyType.defaultSize + valueType.defaultSize) override def simpleString: String = s"map<${keyType.simpleString},${valueType.simpleString}>" http://git-wip-us.apache.org/repos/asf/spark/blob/2b18bd4b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala ---------------------------------------------------------------------- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala index 569230a..8c947ed 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala @@ -236,7 +236,7 @@ class DataTypeSuite extends SparkFunSuite { checkDataTypeJsonRepr(structType) def checkDefaultSize(dataType: DataType, expectedDefaultSize: Int): Unit = { - test(s"Check the default size of ${dataType}") { + test(s"Check the default size of $dataType") { assert(dataType.defaultSize === expectedDefaultSize) } } @@ -255,18 +255,18 @@ class DataTypeSuite extends SparkFunSuite { checkDefaultSize(TimestampType, 8) checkDefaultSize(StringType, 20) checkDefaultSize(BinaryType, 100) - checkDefaultSize(ArrayType(DoubleType, true), 800) - checkDefaultSize(ArrayType(StringType, false), 2000) - checkDefaultSize(MapType(IntegerType, StringType, true), 2400) - checkDefaultSize(MapType(IntegerType, ArrayType(DoubleType), false), 80400) - checkDefaultSize(structType, 812) + checkDefaultSize(ArrayType(DoubleType, true), 8) + checkDefaultSize(ArrayType(StringType, false), 20) + checkDefaultSize(MapType(IntegerType, StringType, true), 24) + checkDefaultSize(MapType(IntegerType, 
ArrayType(DoubleType), false), 12) + checkDefaultSize(structType, 20) def checkEqualsIgnoreCompatibleNullability( from: DataType, to: DataType, expected: Boolean): Unit = { val testName = - s"equalsIgnoreCompatibleNullability: (from: ${from}, to: ${to})" + s"equalsIgnoreCompatibleNullability: (from: $from, to: $to)" test(testName) { assert(DataType.equalsIgnoreCompatibleNullability(from, to) === expected) } --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
