Repository: spark Updated Branches: refs/heads/master 0ff67a1cf -> 59c184e02
[SPARK-17913][SQL] compare atomic and string type column may return confusing result ## What changes were proposed in this pull request? Spark SQL follows MySQL to do the implicit type conversion for binary comparison: http://dev.mysql.com/doc/refman/5.7/en/type-conversion.html However, this may return confusing result, e.g. `1 = 'true'` will return true, `19157170390056973L = '19157170390056971'` will return true. I think it's more reasonable to follow postgres in this case, i.e. cast string to the type of the other side, but return null if the string is not castable to keep hive compatibility. ## How was this patch tested? newly added tests. Author: Wenchen Fan <[email protected]> Closes #15880 from cloud-fan/compare. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/59c184e0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/59c184e0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/59c184e0 Branch: refs/heads/master Commit: 59c184e028d79286ef490a448ae7f2536d8753d6 Parents: 0ff67a1 Author: Wenchen Fan <[email protected]> Authored: Tue Jan 24 10:18:25 2017 -0800 Committer: gatorsmile <[email protected]> Committed: Tue Jan 24 10:18:25 2017 -0800 ---------------------------------------------------------------------- .../spark/sql/catalyst/analysis/TypeCoercion.scala | 11 +++++++---- .../org/apache/spark/sql/types/AbstractDataType.scala | 11 +++++++++++ .../spark/sql/catalyst/analysis/TypeCoercionSuite.scala | 12 ++++++++++++ .../scala/org/apache/spark/sql/DataFrameSuite.scala | 5 +++++ .../sql/execution/datasources/json/JsonSuite.scala | 2 +- .../test/resources/sqlgen/broadcast_join_subquery.sql | 2 +- .../src/test/resources/sqlgen/subquery_in_having_1.sql | 2 +- 7 files changed, 38 insertions(+), 7 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/59c184e0/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala ---------------------------------------------------------------------- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala index 5f72fa8..4177c2b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala @@ -338,10 +338,13 @@ object TypeCoercion { case p @ BinaryComparison(left @ NullType(), right @ StringType()) => p.makeCopy(Array(Literal.create(null, StringType), right)) - case p @ BinaryComparison(left @ StringType(), right) if right.dataType != StringType => - p.makeCopy(Array(Cast(left, DoubleType), right)) - case p @ BinaryComparison(left, right @ StringType()) if left.dataType != StringType => - p.makeCopy(Array(left, Cast(right, DoubleType))) + // When compare string with atomic type, case string to that type. + case p @ BinaryComparison(left @ StringType(), right @ AtomicType()) + if right.dataType != StringType => + p.makeCopy(Array(Cast(left, right.dataType), right)) + case p @ BinaryComparison(left @ AtomicType(), right @ StringType()) + if left.dataType != StringType => + p.makeCopy(Array(left, Cast(right, left.dataType))) case i @ In(a @ DateType(), b) if b.forall(_.dataType == StringType) => i.makeCopy(Array(Cast(a, StringType), b)) http://git-wip-us.apache.org/repos/asf/spark/blob/59c184e0/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala ---------------------------------------------------------------------- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala index 76dbb7c..da5775b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala @@ -129,6 +129,17 @@ protected[sql] abstract class AtomicType extends DataType { private[sql] val ordering: Ordering[InternalType] } +object AtomicType { + /** + * Enables matching against AtomicType for expressions: + * {{{ + * case Cast(child @ AtomicType(), StringType) => + * ... + * }}} + */ + def unapply(e: Expression): Boolean = e.dataType.isInstanceOf[AtomicType] +} + /** * Numeric data types. http://git-wip-us.apache.org/repos/asf/spark/blob/59c184e0/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionSuite.scala ---------------------------------------------------------------------- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionSuite.scala index dbb1e3e..110bd02 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionSuite.scala @@ -916,6 +916,18 @@ class TypeCoercionSuite extends PlanTest { ruleTest(rules, Divide(1L, nullLit), Divide(Cast(1L, DoubleType), Cast(nullLit, DoubleType))) ruleTest(rules, Divide(nullLit, 1L), Divide(Cast(nullLit, DoubleType), Cast(1L, DoubleType))) } + + test("binary comparison with string promotion") { + ruleTest(PromoteStrings, + GreaterThan(Literal("123"), Literal(1)), + GreaterThan(Cast(Literal("123"), IntegerType), Literal(1))) + ruleTest(PromoteStrings, + LessThan(Literal(true), Literal("123")), + LessThan(Literal(true), Cast(Literal("123"), BooleanType))) + ruleTest(PromoteStrings, + EqualTo(Literal(Array(1, 2)), Literal("123")), + EqualTo(Literal(Array(1, 2)), Literal("123"))) + } } http://git-wip-us.apache.org/repos/asf/spark/blob/59c184e0/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index 621a46a..cb7b979 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -1725,4 +1725,9 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { val df = spark.createDataFrame(spark.sparkContext.makeRDD(rows), schema) assert(df.filter($"array1" === $"array2").count() == 1) } + + test("SPARK-17913: compare long and string type column may return confusing result") { + val df = Seq(123L -> "123", 19157170390056973L -> "19157170390056971").toDF("i", "j") + checkAnswer(df.select($"i" === $"j"), Row(true) :: Row(false) :: Nil) + } } http://git-wip-us.apache.org/repos/asf/spark/blob/59c184e0/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala index a324183..161a409 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala @@ -448,7 +448,7 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { // Number and String conflict: resolve the type as number in this query. checkAnswer( - sql("select num_str + 1.2 from jsonTable where num_str > 14"), + sql("select num_str + 1.2 from jsonTable where num_str > 14d"), Row(92233720368547758071.2) ) http://git-wip-us.apache.org/repos/asf/spark/blob/59c184e0/sql/hive/src/test/resources/sqlgen/broadcast_join_subquery.sql ---------------------------------------------------------------------- diff --git a/sql/hive/src/test/resources/sqlgen/broadcast_join_subquery.sql b/sql/hive/src/test/resources/sqlgen/broadcast_join_subquery.sql index 3de4f8a..adddc41 100644 --- a/sql/hive/src/test/resources/sqlgen/broadcast_join_subquery.sql +++ b/sql/hive/src/test/resources/sqlgen/broadcast_join_subquery.sql @@ -5,4 +5,4 @@ FROM (SELECT x.key as key1, x.value as value1, y.key as key2, y.value as value2 JOIN srcpart z ON (subq.key1 = z.key and z.ds='2008-04-08' and z.hr=11) ORDER BY subq.key1, z.value -------------------------------------------------------------------------------- -SELECT `gen_attr_0` AS `key1`, `gen_attr_1` AS `value` FROM (SELECT `gen_attr_0`, `gen_attr_1` FROM (SELECT `gen_attr_5` AS `gen_attr_0`, `gen_attr_7` AS `gen_attr_6`, `gen_attr_9` AS `gen_attr_8`, `gen_attr_11` AS `gen_attr_10` FROM (SELECT `key` AS `gen_attr_5`, `value` AS `gen_attr_7` FROM `default`.`src1`) AS gen_subquery_0 INNER JOIN (SELECT `key` AS `gen_attr_9`, `value` AS `gen_attr_11` FROM `default`.`src`) AS gen_subquery_1 ON (`gen_attr_5` = `gen_attr_9`)) AS subq INNER JOIN (SELECT `key` AS `gen_attr_2`, `value` AS `gen_attr_1`, `ds` AS `gen_attr_3`, `hr` AS `gen_attr_4` FROM `default`.`srcpart`) AS gen_subquery_2 ON (((`gen_attr_0` = `gen_attr_2`) AND (`gen_attr_3` = '2008-04-08')) AND (CAST(`gen_attr_4` AS DOUBLE) = CAST(11 AS DOUBLE))) ORDER BY `gen_attr_0` ASC NULLS FIRST, `gen_attr_1` ASC NULLS FIRST) AS gen_subquery_3 +SELECT `gen_attr_0` AS `key1`, `gen_attr_1` AS `value` FROM (SELECT `gen_attr_0`, `gen_attr_1` FROM (SELECT `gen_attr_5` AS `gen_attr_0`, `gen_attr_7` AS `gen_attr_6`, `gen_attr_9` AS `gen_attr_8`, `gen_attr_11` AS `gen_attr_10` FROM (SELECT `key` AS `gen_attr_5`, `value` AS `gen_attr_7` FROM `default`.`src1`) AS gen_subquery_0 INNER JOIN (SELECT `key` AS `gen_attr_9`, `value` AS `gen_attr_11` FROM `default`.`src`) AS gen_subquery_1 ON (`gen_attr_5` = `gen_attr_9`)) AS subq INNER JOIN (SELECT `key` AS `gen_attr_2`, `value` AS `gen_attr_1`, `ds` AS `gen_attr_3`, `hr` AS `gen_attr_4` FROM `default`.`srcpart`) AS gen_subquery_2 ON (((`gen_attr_0` = `gen_attr_2`) AND (`gen_attr_3` = '2008-04-08')) AND (CAST(`gen_attr_4` AS INT) = 11)) ORDER BY `gen_attr_0` ASC NULLS FIRST, `gen_attr_1` ASC NULLS FIRST) AS gen_subquery_3 http://git-wip-us.apache.org/repos/asf/spark/blob/59c184e0/sql/hive/src/test/resources/sqlgen/subquery_in_having_1.sql ---------------------------------------------------------------------- diff --git a/sql/hive/src/test/resources/sqlgen/subquery_in_having_1.sql b/sql/hive/src/test/resources/sqlgen/subquery_in_having_1.sql index 2588214..55afb86 100644 --- a/sql/hive/src/test/resources/sqlgen/subquery_in_having_1.sql +++ b/sql/hive/src/test/resources/sqlgen/subquery_in_having_1.sql @@ -5,4 +5,4 @@ group by key having count(*) in (select count(*) from src s1 where s1.key = '90' group by s1.key) order by key -------------------------------------------------------------------------------- -SELECT `gen_attr_0` AS `key`, `gen_attr_1` AS `count(1)` FROM (SELECT `gen_attr_0`, `gen_attr_1` FROM (SELECT `gen_attr_0`, count(1) AS `gen_attr_1`, count(1) AS `gen_attr_2` FROM (SELECT `key` AS `gen_attr_0`, `value` AS `gen_attr_4` FROM `default`.`src`) AS gen_subquery_0 GROUP BY `gen_attr_0` HAVING (`gen_attr_2` IN (SELECT `gen_attr_5` AS `_c0` FROM (SELECT `gen_attr_3` AS `gen_attr_5` FROM (SELECT count(1) AS `gen_attr_3` FROM (SELECT `key` AS `gen_attr_6`, `value` AS `gen_attr_7` FROM `default`.`src`) AS gen_subquery_3 WHERE (CAST(`gen_attr_6` AS DOUBLE) = CAST('90' AS DOUBLE)) GROUP BY `gen_attr_6`) AS gen_subquery_2) AS gen_subquery_4))) AS gen_subquery_1 ORDER BY `gen_attr_0` ASC NULLS FIRST) AS src +SELECT `gen_attr_0` AS `key`, `gen_attr_1` AS `count(1)` FROM (SELECT `gen_attr_0`, `gen_attr_1` FROM (SELECT `gen_attr_0`, count(1) AS `gen_attr_1`, count(1) AS `gen_attr_2` FROM (SELECT `key` AS `gen_attr_0`, `value` AS `gen_attr_4` FROM `default`.`src`) AS gen_subquery_0 GROUP BY `gen_attr_0` HAVING (`gen_attr_2` IN (SELECT `gen_attr_5` AS `_c0` FROM (SELECT `gen_attr_3` AS `gen_attr_5` FROM (SELECT count(1) AS `gen_attr_3` FROM (SELECT `key` AS `gen_attr_6`, `value` AS `gen_attr_7` FROM `default`.`src`) AS gen_subquery_3 WHERE (`gen_attr_6` = CAST('90' AS INT)) GROUP BY `gen_attr_6`) AS gen_subquery_2) AS gen_subquery_4))) AS gen_subquery_1 ORDER BY `gen_attr_0` ASC NULLS FIRST) AS src --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
