GitHub user cloud-fan commented on a diff in the pull request:
https://github.com/apache/spark/pull/23176#discussion_r237382322
--- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala ---
@@ -367,11 +367,29 @@ case class InSet(child: Expression, hset: Set[Any]) extends UnaryExpression with
   }
   @transient lazy val set: Set[Any] = child.dataType match {
-    case _: AtomicType => hset
+    case t: AtomicType if !t.isInstanceOf[BinaryType] => hset
     case _: NullType => hset
     case _ =>
+      val ord = TypeUtils.getInterpretedOrdering(child.dataType)
+      val ordering = if (hasNull) {
+        new Ordering[Any] {
+          override def compare(x: Any, y: Any): Int = {
+            if (x == null && y == null) {
+              0
+            } else if (x == null) {
+              -1
+            } else if (y == null) {
+              1
+            } else {
+              ord.compare(x, y)
+            }
+          }
+        }
+      } else {
+        ord
+      }
       // for structs use interpreted ordering to be able to compare UnsafeRows with non-UnsafeRows
-      TreeSet.empty(TypeUtils.getInterpretedOrdering(child.dataType)) ++ hset
+      TreeSet.empty(ordering) ++ hset
--- End diff ---
shall we just filter out nulls when building the tree set?
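
For reference, a minimal self-contained sketch of that alternative, assuming the set is built once up front: drop nulls before building the TreeSet and remember separately whether a null was present, so the plain interpreted ordering never has to handle null. The `FilterNullsSketch` object and its helpers are hypothetical illustrations, not Spark code.

import scala.collection.immutable.TreeSet

object FilterNullsSketch {
  // Hypothetical helper: build the tree set from non-null values only and
  // record the presence of null as a separate flag.
  def buildSet(values: Set[Any], ord: Ordering[Any]): (Boolean, TreeSet[Any]) = {
    val hasNull = values.contains(null)
    // The TreeSet only ever sees non-null elements, so `ord` needs no null branch.
    (hasNull, TreeSet.empty(ord) ++ values.filterNot(_ == null))
  }

  // Hypothetical helper: answer membership of null from the flag, everything
  // else from the tree.
  def lookup(value: Any, hasNull: Boolean, set: TreeSet[Any]): Boolean = {
    if (value == null) hasNull else set.contains(value)
  }
}

With this shape the null-aware Ordering wrapper in the diff would be unnecessary, since the comparator is only ever invoked on non-null elements.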