Repository: crunch Updated Branches: refs/heads/master 06688d55e -> 626853abb
CRUNCH-528 Improve Pair comparison Resolves the issue of integer overflow when comparing hash codes of values, as well as providing stable sorting of null values and non-equal objects with hash collisions. Description of the underlying problem: If the hash code values are too large, then the values will wrap when doing subtraction. This results in a comparison function that is not transitive. Among other things, this makes Joins using the in-memory pipeline not work, since the in-memory shuffler uses a TreeMap if the key type is Comparable. Since the key in a join is a Pair of the original key and a join tag, the key is always comparable. With a non-transitive comparison function, it is possible for the two join tags of the original key to sort differently, resulting in the two join tags not being adjacent for the original key. This results either in either the cross product erroneously producing no values in the case of an inner join, since the two join tags are not adjacent, or null values appearing when they should not in the case of an outer join. Project: http://git-wip-us.apache.org/repos/asf/crunch/repo Commit: http://git-wip-us.apache.org/repos/asf/crunch/commit/626853ab Tree: http://git-wip-us.apache.org/repos/asf/crunch/tree/626853ab Diff: http://git-wip-us.apache.org/repos/asf/crunch/diff/626853ab Branch: refs/heads/master Commit: 626853abb7f996107941875653b8292f462ba3b9 Parents: 06688d5 Author: Brandon Vargo <[email protected]> Authored: Wed May 27 10:42:09 2015 -0600 Committer: Gabriel Reid <[email protected]> Committed: Thu May 28 08:59:28 2015 +0200 ---------------------------------------------------------------------- .../src/main/java/org/apache/crunch/Pair.java | 25 ++++++++-- .../test/java/org/apache/crunch/PairTest.java | 50 ++++++++++++++++++++ 2 files changed, 71 insertions(+), 4 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/crunch/blob/626853ab/crunch-core/src/main/java/org/apache/crunch/Pair.java ---------------------------------------------------------------------- diff --git a/crunch-core/src/main/java/org/apache/crunch/Pair.java b/crunch-core/src/main/java/org/apache/crunch/Pair.java index 462434c..176c4aa 100644 --- a/crunch-core/src/main/java/org/apache/crunch/Pair.java +++ b/crunch-core/src/main/java/org/apache/crunch/Pair.java @@ -17,10 +17,12 @@ */ package org.apache.crunch; -import org.apache.commons.lang.builder.HashCodeBuilder; - import java.io.Serializable; +import com.google.common.collect.ComparisonChain; +import com.google.common.collect.Ordering; +import org.apache.commons.lang.builder.HashCodeBuilder; + /** * A convenience class for two-element {@link Tuple}s. */ @@ -91,9 +93,24 @@ public class Pair<K, V> implements Tuple, Comparable<Pair<K, V>>, Serializable { if (lhs == rhs) { return 0; } else if (lhs != null && Comparable.class.isAssignableFrom(lhs.getClass())) { - return ((Comparable) lhs).compareTo(rhs); + return Ordering.natural().nullsLast().compare((Comparable) lhs, (Comparable) rhs);//(Comparable) lhs).compareTo(rhs); + } + if (lhs == null) { + return 1; // nulls last } - return (lhs == null ? 0 : lhs.hashCode()) - (rhs == null ? 0 : rhs.hashCode()); + if (rhs == null) { + return -1; // nulls last + } + if (lhs.equals(rhs)) { + return 0; + } + + // Now we compare based on hash code. We already know that the two sides are not equal, so + // if the hash codes are equal, we just use arbitrary (but consistent) ordering + return ComparisonChain.start() + .compare(lhs.hashCode(), rhs.hashCode()) + .compare(lhs, rhs, Ordering.arbitrary()) + .result(); } @Override http://git-wip-us.apache.org/repos/asf/crunch/blob/626853ab/crunch-core/src/test/java/org/apache/crunch/PairTest.java ---------------------------------------------------------------------- diff --git a/crunch-core/src/test/java/org/apache/crunch/PairTest.java b/crunch-core/src/test/java/org/apache/crunch/PairTest.java index 106413c..4119201 100644 --- a/crunch-core/src/test/java/org/apache/crunch/PairTest.java +++ b/crunch-core/src/test/java/org/apache/crunch/PairTest.java @@ -62,5 +62,55 @@ public class PairTest { assertTrue(Pair.of(2, "a").compareTo(Pair.of(1, "a")) > 0); assertTrue(Pair.of("a", 2).compareTo(Pair.of("a", 1)) > 0); assertTrue(Pair.of(null, 17).compareTo(Pair.of(null, 29)) < 0); + assertTrue(Pair.of(Integer.MIN_VALUE, 0).compareTo(Pair.of((Integer)null, 0)) < 0); + assertTrue(Pair.of(0, Integer.MIN_VALUE).compareTo(Pair.of(0, (Integer)null)) < 0); + assertTrue(Pair.of(2, "a").compareTo(Pair.of(1, "a")) > 0); + assertTrue(Pair.of(new HashValue(2), "a").compareTo(Pair.of(new HashValue(1), "a")) > 0); + assertTrue(Pair.of(new HashValue(Integer.MAX_VALUE), "a").compareTo( + Pair.of(new HashValue(Integer.MIN_VALUE), "a")) > 0); + assertTrue(Pair.of(new HashValue(0), "a").compareTo(Pair.of((HashValue) null, "a")) < 0); + // Two value whose hash code is the same but are not equal (via equals) should not compare as the same + assertTrue(Pair.of(new HashValue(0, 1), 0).compareTo(Pair.of(new HashValue(0, 2), 0)) != 0); + } + + /** + * An object with a known hash value that is not comparable used for testing the comparison logic + * within Pair. + */ + private static final class HashValue { + private final int value; + private final int additionalValue; // Additional value which is used for equality but not hash code + + public HashValue(int value) { + this(value, 0); + } + + public HashValue(int value, int additionalValue) { + this.value = value; + this.additionalValue = additionalValue; + } + + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (!(o instanceof HashValue)) + return false; + + HashValue other = (HashValue) o; + + if (value != other.value) + return false; + + if (additionalValue != other.additionalValue) + return false; + + return true; + } + + @Override + public int hashCode() { + return this.value; + } } }
