Repository: crunch
Updated Branches:
  refs/heads/master 06688d55e -> 626853abb


CRUNCH-528 Improve Pair comparison

Resolves the issue of integer overflow when comparing hash codes
of values, as well as providing stable sorting of null values
and non-equal objects with hash collisions.

Description of the underlying problem:

If the hash code values are too large, then the values will wrap
when doing subtraction. This results in a comparison function
that is not transitive.

Among other things, this makes Joins using the in-memory
pipeline not work, since the in-memory shuffler uses a TreeMap
if the key type is Comparable. Since the key in a join is a Pair
of the original key and a join tag, the key is always comparable.
With a non-transitive comparison function, it is possible for
the two join tags of the original key to sort differently,
resulting in the two join tags not being adjacent for the
original key. This results either in either the cross product
erroneously producing no values in the case of an inner join,
since the two join tags are not adjacent, or null values
appearing when they should not in the case of an outer join.


Project: http://git-wip-us.apache.org/repos/asf/crunch/repo
Commit: http://git-wip-us.apache.org/repos/asf/crunch/commit/626853ab
Tree: http://git-wip-us.apache.org/repos/asf/crunch/tree/626853ab
Diff: http://git-wip-us.apache.org/repos/asf/crunch/diff/626853ab

Branch: refs/heads/master
Commit: 626853abb7f996107941875653b8292f462ba3b9
Parents: 06688d5
Author: Brandon Vargo <[email protected]>
Authored: Wed May 27 10:42:09 2015 -0600
Committer: Gabriel Reid <[email protected]>
Committed: Thu May 28 08:59:28 2015 +0200

----------------------------------------------------------------------
 .../src/main/java/org/apache/crunch/Pair.java   | 25 ++++++++--
 .../test/java/org/apache/crunch/PairTest.java   | 50 ++++++++++++++++++++
 2 files changed, 71 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/crunch/blob/626853ab/crunch-core/src/main/java/org/apache/crunch/Pair.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/Pair.java 
b/crunch-core/src/main/java/org/apache/crunch/Pair.java
index 462434c..176c4aa 100644
--- a/crunch-core/src/main/java/org/apache/crunch/Pair.java
+++ b/crunch-core/src/main/java/org/apache/crunch/Pair.java
@@ -17,10 +17,12 @@
  */
 package org.apache.crunch;
 
-import org.apache.commons.lang.builder.HashCodeBuilder;
-
 import java.io.Serializable;
 
+import com.google.common.collect.ComparisonChain;
+import com.google.common.collect.Ordering;
+import org.apache.commons.lang.builder.HashCodeBuilder;
+
 /**
  * A convenience class for two-element {@link Tuple}s.
  */
@@ -91,9 +93,24 @@ public class Pair<K, V> implements Tuple, Comparable<Pair<K, 
V>>, Serializable {
     if (lhs == rhs) {
       return 0;
     } else if (lhs != null && 
Comparable.class.isAssignableFrom(lhs.getClass())) {
-      return ((Comparable) lhs).compareTo(rhs);
+      return Ordering.natural().nullsLast().compare((Comparable) lhs, 
(Comparable) rhs);//(Comparable) lhs).compareTo(rhs);
+    }
+    if (lhs == null) {
+      return 1; // nulls last
     }
-    return (lhs == null ? 0 : lhs.hashCode()) - (rhs == null ? 0 : 
rhs.hashCode());
+    if (rhs == null) {
+      return -1; // nulls last
+    }
+    if (lhs.equals(rhs)) {
+      return 0;
+    }
+
+    // Now we compare based on hash code. We already know that the two sides 
are not equal, so
+    // if the hash codes are equal, we just use arbitrary (but consistent) 
ordering
+    return ComparisonChain.start()
+        .compare(lhs.hashCode(), rhs.hashCode())
+        .compare(lhs, rhs, Ordering.arbitrary())
+        .result();
   }
 
   @Override

http://git-wip-us.apache.org/repos/asf/crunch/blob/626853ab/crunch-core/src/test/java/org/apache/crunch/PairTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/PairTest.java 
b/crunch-core/src/test/java/org/apache/crunch/PairTest.java
index 106413c..4119201 100644
--- a/crunch-core/src/test/java/org/apache/crunch/PairTest.java
+++ b/crunch-core/src/test/java/org/apache/crunch/PairTest.java
@@ -62,5 +62,55 @@ public class PairTest {
     assertTrue(Pair.of(2, "a").compareTo(Pair.of(1, "a")) > 0);
     assertTrue(Pair.of("a", 2).compareTo(Pair.of("a", 1)) > 0);
     assertTrue(Pair.of(null, 17).compareTo(Pair.of(null, 29)) < 0);
+    assertTrue(Pair.of(Integer.MIN_VALUE, 0).compareTo(Pair.of((Integer)null, 
0)) < 0);
+    assertTrue(Pair.of(0, Integer.MIN_VALUE).compareTo(Pair.of(0, 
(Integer)null)) < 0);
+    assertTrue(Pair.of(2, "a").compareTo(Pair.of(1, "a")) > 0);
+    assertTrue(Pair.of(new HashValue(2), "a").compareTo(Pair.of(new 
HashValue(1), "a")) > 0);
+    assertTrue(Pair.of(new HashValue(Integer.MAX_VALUE), "a").compareTo(
+          Pair.of(new HashValue(Integer.MIN_VALUE), "a")) > 0);
+    assertTrue(Pair.of(new HashValue(0), "a").compareTo(Pair.of((HashValue) 
null, "a")) < 0);
+    // Two value whose hash code is the same but are not equal (via equals) 
should not compare as the same
+    assertTrue(Pair.of(new HashValue(0, 1), 0).compareTo(Pair.of(new 
HashValue(0, 2), 0)) != 0);
+  }
+
+  /**
+   * An object with a known hash value that is not comparable used for testing 
the comparison logic
+   * within Pair.
+   */
+  private static final class HashValue {
+    private final int value;
+    private final int additionalValue;  // Additional value which is used for 
equality but not hash code
+
+    public HashValue(int value) {
+      this(value, 0);
+    }
+
+    public HashValue(int value, int additionalValue) {
+      this.value = value;
+      this.additionalValue = additionalValue;
+    }
+
+    @Override
+    public boolean equals(Object o) {
+      if (this == o)
+        return true;
+      if (!(o instanceof HashValue))
+        return false;
+
+      HashValue other = (HashValue) o;
+
+      if (value != other.value)
+        return false;
+
+      if (additionalValue != other.additionalValue)
+        return false;
+
+      return true;
+    }
+
+    @Override
+    public int hashCode() {
+      return this.value;
+    }
   }
 }

Reply via email to