Repository: spark
Updated Branches:
  refs/heads/master e66cabb02 -> 8148f19ca


[SPARK-22249][SQL] isin with empty list throws exception on cached DataFrame

## What changes were proposed in this pull request?

As pointed out in the JIRA, calling `isin` with an empty list on a cached DataFrame throws an exception. This PR fixes it.
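
A minimal reproduction sketch of the reported behaviour (assuming a SparkSession named `spark` with `spark.implicits._` imported, as in the test added below):

    import spark.implicits._

    val df = spark.range(10)
    df.filter($"id".isin()).count()   // 0: the uncached DataFrame handles the empty list

    df.cache()
    df.filter($"id".isin()).count()   // before this patch: throws, because the in-memory scan's
                                      // stats-based filter reduces over an empty list of literals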

## How was this patch tested?

Added UT.

Author: Marco Gaido <marcogaid...@gmail.com>

Closes #19494 from mgaido91/SPARK-22249.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8148f19c
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8148f19c
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8148f19c

Branch: refs/heads/master
Commit: 8148f19ca1f0e0375603cb4f180c1bad8b0b8042
Parents: e66cabb
Author: Marco Gaido <marcogaid...@gmail.com>
Authored: Tue Oct 17 09:41:23 2017 +0200
Committer: Sean Owen <so...@cloudera.com>
Committed: Tue Oct 17 09:41:23 2017 +0200

----------------------------------------------------------------------
 .../execution/columnar/InMemoryTableScanExec.scala   |  1 +
 .../columnar/InMemoryColumnarQuerySuite.scala        | 15 +++++++++++++++
 2 files changed, 16 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/8148f19c/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala
index af3636a..846ec03 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala
@@ -102,6 +102,7 @@ case class InMemoryTableScanExec(
     case IsNull(a: Attribute) => statsFor(a).nullCount > 0
     case IsNotNull(a: Attribute) => statsFor(a).count - statsFor(a).nullCount > 0
 
+    case In(_: AttributeReference, list: Seq[Expression]) if list.isEmpty => Literal.FalseLiteral
     case In(a: AttributeReference, list: Seq[Expression]) if list.forall(_.isInstanceOf[Literal]) =>
       list.map(l => statsFor(a).lowerBound <= l.asInstanceOf[Literal] &&
         l.asInstanceOf[Literal] <= statsFor(a).upperBound).reduce(_ || _)
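
For context on why the new guard is needed: the pre-existing `In` case maps each literal in the list to a lower/upper bound check and combines the results with `reduce(_ || _)`, which throws on an empty list; the added case short-circuits that path to `Literal.FalseLiteral`, since an empty IN list matches no rows. An illustrative plain-Scala sketch of the failure mode (not part of the patch):

    Seq.empty[Boolean].reduce(_ || _)            // throws java.lang.UnsupportedOperationException
    Seq.empty[Boolean].foldLeft(false)(_ || _)   // false -- the result the new guard encodes as Literal.FalseLiteral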

http://git-wip-us.apache.org/repos/asf/spark/blob/8148f19c/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala
index 8d411eb..75d17bc 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala
@@ -429,4 +429,19 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext {
       checkAnswer(agg_without_cache, agg_with_cache)
     }
   }
+
+  test("SPARK-22249: IN should work also with cached DataFrame") {
+    val df = spark.range(10).cache()
+    // with an empty list
+    assert(df.filter($"id".isin()).count() == 0)
+    // with a non-empty list
+    assert(df.filter($"id".isin(2)).count() == 1)
+    assert(df.filter($"id".isin(2, 3)).count() == 2)
+    df.unpersist()
+    val dfNulls = spark.range(10).selectExpr("null as id").cache()
+    // with null as value for the attribute
+    assert(dfNulls.filter($"id".isin()).count() == 0)
+    assert(dfNulls.filter($"id".isin(2, 3)).count() == 0)
+    dfNulls.unpersist()
+  }
 }

