This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new 8ab6ae3  [SPARK-30790] The dataType of map() should be map<null,null>
8ab6ae3 is described below

commit 8ab6ae3ede96adb093347470a5cbbf17fe8c04e9
Author:     iRakson <raksonrak...@gmail.com>
AuthorDate: Thu Feb 13 12:23:40 2020 +0800

    [SPARK-30790] The dataType of map() should be map<null,null>

    ### What changes were proposed in this pull request?
    Currently, `spark.sql("select map()")` returns an empty map whose data type is `map<string,string>`. After these changes, its data type is `map<null,null>`.

    ### Why are the changes needed?
    After the changes introduced in #27521, which made a no-argument `array()` default to `NullType`, `map()` should behave consistently.

    ### Does this PR introduce any user-facing change?
    Yes. `map()` now has the data type `map<null,null>` instead of `map<string,string>`.

    ### How was this patch tested?
    A unit test was added, and the migration guide was updated as well.

    Closes #27542 from iRakson/SPARK-30790.

    Authored-by: iRakson <raksonrak...@gmail.com>
    Signed-off-by: Wenchen Fan <wenc...@databricks.com>
    (cherry picked from commit 926e3a1efe9e142804fcbf52146b22700640ae1b)
    Signed-off-by: Wenchen Fan <wenc...@databricks.com>
---
 docs/sql-migration-guide.md                        |  2 +-
 .../catalyst/expressions/complexTypeCreator.scala  | 14 +++++++++---
 .../sql/catalyst/util/ArrayBasedMapBuilder.scala   |  5 ++---
 .../org/apache/spark/sql/internal/SQLConf.scala    | 10 ++++-----
 .../apache/spark/sql/DataFrameFunctionsSuite.scala | 25 +++++++++++++-------
 5 files changed, 36 insertions(+), 20 deletions(-)

diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md
index f98fab5..46b7416 100644
--- a/docs/sql-migration-guide.md
+++ b/docs/sql-migration-guide.md
@@ -216,7 +216,7 @@ license: |
 
   - Since Spark 3.0, the `size` function returns `NULL` for the `NULL` input. In Spark version 2.4 and earlier, this function gives `-1` for the same input. To restore the behavior before Spark 3.0, you can set `spark.sql.legacy.sizeOfNull` to `true`.
 
-  - Since Spark 3.0, when the `array` function is called without any parameters, it returns an empty array of `NullType`. In Spark version 2.4 and earlier, it returns an empty array of string type. To restore the behavior before Spark 3.0, you can set `spark.sql.legacy.arrayDefaultToStringType.enabled` to `true`.
+  - Since Spark 3.0, when the `array`/`map` function is called without any parameters, it returns an empty collection with `NullType` as element type. In Spark version 2.4 and earlier, it returns an empty collection with `StringType` as element type. To restore the behavior before Spark 3.0, you can set `spark.sql.legacy.createEmptyCollectionUsingStringType` to `true`.
 
   - Since Spark 3.0, the interval literal syntax does not allow multiple from-to units anymore. For example, `SELECT INTERVAL '1-1' YEAR TO MONTH '2-2' YEAR TO MONTH'` throws parser exception.
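For context, the user-visible difference can be checked with a short Scala snippet; this is a minimal sketch assuming a local build that includes this patch, and it mirrors the unit test added further below:

    import org.apache.spark.sql.SparkSession
    import org.apache.spark.sql.functions.map
    import org.apache.spark.sql.types.{MapType, NullType}

    val spark = SparkSession.builder().master("local[1]").getOrCreate()

    // With this patch, a no-argument map() is typed map<null,null>
    // instead of the Spark 2.4 default map<string,string>.
    val dt = spark.range(1).select(map()).schema.head.dataType.asInstanceOf[MapType]
    assert(dt.keyType == NullType && dt.valueType == NullType)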
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
index 7335e30..4bd85d3 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
@@ -46,7 +46,7 @@ case class CreateArray(children: Seq[Expression]) extends Expression {
   }
 
   private val defaultElementType: DataType = {
-    if (SQLConf.get.getConf(SQLConf.LEGACY_ARRAY_DEFAULT_TO_STRING)) {
+    if (SQLConf.get.getConf(SQLConf.LEGACY_CREATE_EMPTY_COLLECTION_USING_STRING_TYPE)) {
       StringType
     } else {
       NullType
@@ -145,6 +145,14 @@ case class CreateMap(children: Seq[Expression]) extends Expression {
   lazy val keys = children.indices.filter(_ % 2 == 0).map(children)
   lazy val values = children.indices.filter(_ % 2 != 0).map(children)
 
+  private val defaultElementType: DataType = {
+    if (SQLConf.get.getConf(SQLConf.LEGACY_CREATE_EMPTY_COLLECTION_USING_STRING_TYPE)) {
+      StringType
+    } else {
+      NullType
+    }
+  }
+
   override def foldable: Boolean = children.forall(_.foldable)
 
   override def checkInputDataTypes(): TypeCheckResult = {
@@ -167,9 +175,9 @@ case class CreateMap(children: Seq[Expression]) extends Expression {
   override lazy val dataType: MapType = {
     MapType(
       keyType = TypeCoercion.findCommonTypeDifferentOnlyInNullFlags(keys.map(_.dataType))
-        .getOrElse(StringType),
+        .getOrElse(defaultElementType),
       valueType = TypeCoercion.findCommonTypeDifferentOnlyInNullFlags(values.map(_.dataType))
-        .getOrElse(StringType),
+        .getOrElse(defaultElementType),
       valueContainsNull = values.exists(_.nullable))
   }
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapBuilder.scala
index 9893436..37d6530 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapBuilder.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapBuilder.scala
@@ -29,12 +29,11 @@ import org.apache.spark.unsafe.array.ByteArrayMethods
  */
 class ArrayBasedMapBuilder(keyType: DataType, valueType: DataType) extends Serializable {
   assert(!keyType.existsRecursively(_.isInstanceOf[MapType]), "key of map cannot be/contain map")
-  assert(keyType != NullType, "map key cannot be null type.")
 
   private lazy val keyToIndex = keyType match {
     // Binary type data is `byte[]`, which can't use `==` to check equality.
-    case _: AtomicType | _: CalendarIntervalType if !keyType.isInstanceOf[BinaryType] =>
-      new java.util.HashMap[Any, Int]()
+    case _: AtomicType | _: CalendarIntervalType | _: NullType
+      if !keyType.isInstanceOf[BinaryType] => new java.util.HashMap[Any, Int]()
     case _ =>
       // for complex types, use interpreted ordering to be able to compare unsafe data with safe
       // data, e.g. UnsafeRow vs GenericInternalRow.
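The heart of the fix is the `getOrElse(defaultElementType)` fallback above: with no keys or values to inspect, `findCommonTypeDifferentOnlyInNullFlags` returns `None`, and the element type now falls back to the configurable default rather than a hard-coded `StringType`. The `ArrayBasedMapBuilder` change complements this by accepting `NullType` as a key type for the builder's lookup `HashMap`, since an empty `map()` has no entries to deduplicate. The following self-contained sketch illustrates the fallback pattern with simplified stand-ins for Catalyst's types (not the actual Spark code):

    // Simplified stand-ins for Catalyst's DataType hierarchy, for illustration only.
    sealed trait DataType
    case object StringType extends DataType
    case object IntegerType extends DataType
    case object NullType extends DataType

    object TypeFallbackSketch {
      // Stand-in for TypeCoercion.findCommonTypeDifferentOnlyInNullFlags:
      // yields None for an empty input, i.e. when map() has no arguments.
      def findCommonType(types: Seq[DataType]): Option[DataType] =
        types.headOption.filter(head => types.forall(_ == head))

      // Mirrors the new defaultElementType: the legacy flag keeps the Spark 2.4
      // behavior (StringType); otherwise empty collections use NullType.
      def elementType(types: Seq[DataType], legacyStringDefault: Boolean): DataType =
        findCommonType(types).getOrElse(if (legacyStringDefault) StringType else NullType)

      def main(args: Array[String]): Unit = {
        assert(elementType(Nil, legacyStringDefault = false) == NullType)   // map()
        assert(elementType(Nil, legacyStringDefault = true) == StringType)  // legacy map()
        assert(elementType(Seq(IntegerType), legacyStringDefault = false) == IntegerType)
        println("fallback sketch OK")
      }
    }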
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index b79b767..442711d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -2007,12 +2007,12 @@ object SQLConf {
     .booleanConf
     .createWithDefault(false)
 
-  val LEGACY_ARRAY_DEFAULT_TO_STRING =
-    buildConf("spark.sql.legacy.arrayDefaultToStringType.enabled")
+  val LEGACY_CREATE_EMPTY_COLLECTION_USING_STRING_TYPE =
+    buildConf("spark.sql.legacy.createEmptyCollectionUsingStringType")
       .internal()
-      .doc("When set to true, it returns an empty array of string type when the `array` " +
-        "function is called without any parameters. Otherwise, it returns an empty " +
-        "array of `NullType`")
+      .doc("When set to true, Spark returns an empty collection with `StringType` as element " +
+        "type if the `array`/`map` function is called without any parameters. Otherwise, Spark " +
+        "returns an empty collection with `NullType` as element type.")
       .booleanConf
       .createWithDefault(false)
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
index 6012678..f7531ea 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
@@ -3499,13 +3499,6 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession {
     ).foreach(assertValuesDoNotChangeAfterCoalesceOrUnion(_))
   }
 
-  test("SPARK-21281 use string types by default if map have no argument") {
-    val ds = spark.range(1)
-    var expectedSchema = new StructType()
-      .add("x", MapType(StringType, StringType, valueContainsNull = false), nullable = false)
-    assert(ds.select(map().as("x")).schema == expectedSchema)
-  }
-
   test("SPARK-21281 fails if functions have no argument") {
     val df = Seq(1).toDF("a")
 
@@ -3563,7 +3556,8 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession {
   test("SPARK-29462: Empty array of NullType for array function with no arguments") {
     Seq((true, StringType), (false, NullType)).foreach {
       case (arrayDefaultToString, expectedType) =>
-        withSQLConf(SQLConf.LEGACY_ARRAY_DEFAULT_TO_STRING.key -> arrayDefaultToString.toString) {
+        withSQLConf(SQLConf.LEGACY_CREATE_EMPTY_COLLECTION_USING_STRING_TYPE.key ->
+          arrayDefaultToString.toString) {
           val schema = spark.range(1).select(array()).schema
           assert(schema.nonEmpty && schema.head.dataType.isInstanceOf[ArrayType])
           val actualType = schema.head.dataType.asInstanceOf[ArrayType].elementType
@@ -3571,6 +3565,21 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession {
     }
   }
 
+  test("SPARK-30790: Empty map with NullType as key/value type for map function with no argument") {
+    Seq((true, StringType), (false, NullType)).foreach {
+      case (mapDefaultToString, expectedType) =>
+        withSQLConf(SQLConf.LEGACY_CREATE_EMPTY_COLLECTION_USING_STRING_TYPE.key ->
+          mapDefaultToString.toString) {
+          val schema = spark.range(1).select(map()).schema
+          assert(schema.nonEmpty && schema.head.dataType.isInstanceOf[MapType])
+          val actualKeyType = schema.head.dataType.asInstanceOf[MapType].keyType
+          val actualValueType = schema.head.dataType.asInstanceOf[MapType].valueType
+          assert(actualKeyType === expectedType)
+          assert(actualValueType === expectedType)
+        }
+    }
+  }
 }
 
 object DataFrameFunctionsSuite {
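Because the renamed flag is a runtime SQL conf rather than a static one (the new test toggles it with withSQLConf), the Spark 2.4 default can be restored per session. A minimal sketch, assuming the patched build and using the conf name from the diff above:

    import org.apache.spark.sql.SparkSession
    import org.apache.spark.sql.functions.{array, map}
    import org.apache.spark.sql.types.{ArrayType, MapType, StringType}

    val spark = SparkSession.builder().master("local[1]").getOrCreate()

    // Restore the Spark 2.4 behavior: empty collections default to StringType.
    spark.conf.set("spark.sql.legacy.createEmptyCollectionUsingStringType", "true")

    val mapType = spark.range(1).select(map()).schema.head.dataType.asInstanceOf[MapType]
    assert(mapType.keyType == StringType && mapType.valueType == StringType)

    val arrType = spark.range(1).select(array()).schema.head.dataType.asInstanceOf[ArrayType]
    assert(arrType.elementType == StringType)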