Author: knoguchi Date: Thu Dec 21 21:39:21 2017 New Revision: 1818985 URL: http://svn.apache.org/viewvc?rev=1818985&view=rev Log: PIG-5300: hashCode for Bag needs to be order independent (knoguchi)
Modified: pig/trunk/CHANGES.txt pig/trunk/src/org/apache/pig/data/DataBag.java pig/trunk/src/org/apache/pig/data/DefaultAbstractBag.java pig/trunk/src/org/apache/pig/data/NonSpillableDataBag.java pig/trunk/src/org/apache/pig/data/SingleTupleBag.java pig/trunk/test/org/apache/pig/test/TestDataModel.java Modified: pig/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/pig/trunk/CHANGES.txt?rev=1818985&r1=1818984&r2=1818985&view=diff ============================================================================== --- pig/trunk/CHANGES.txt (original) +++ pig/trunk/CHANGES.txt Thu Dec 21 21:39:21 2017 @@ -59,6 +59,7 @@ PIG-5251: Bump joda-time to 2.9.9 (dbist OPTIMIZATIONS BUG FIXES +PIG-5300: hashCode for Bag needs to be order independent (knoguchi) PIG-5318: Unit test failures on Pig on Spark with Spark 2.2 (nkollar via szita) Modified: pig/trunk/src/org/apache/pig/data/DataBag.java URL: http://svn.apache.org/viewvc/pig/trunk/src/org/apache/pig/data/DataBag.java?rev=1818985&r1=1818984&r2=1818985&view=diff ============================================================================== --- pig/trunk/src/org/apache/pig/data/DataBag.java (original) +++ pig/trunk/src/org/apache/pig/data/DataBag.java Thu Dec 21 21:39:21 2017 @@ -126,6 +126,15 @@ public interface DataBag extends Spillab void clear(); /** + * Returns the hash code value for the bag. The hash code of a bag is + * defined to be the sum of the hash codes of each tuple. + * This ensures that b1.equals(b2) implies b1.hashCode() == b2.hashCode() + * + * @return the hash code value for this bag + */ + int hashCode(); + + /** * This is used by FuncEvalSpec.FakeDataBag. * @param stale Set stale state. */ Modified: pig/trunk/src/org/apache/pig/data/DefaultAbstractBag.java URL: http://svn.apache.org/viewvc/pig/trunk/src/org/apache/pig/data/DefaultAbstractBag.java?rev=1818985&r1=1818984&r2=1818985&view=diff ============================================================================== --- pig/trunk/src/org/apache/pig/data/DefaultAbstractBag.java (original) +++ pig/trunk/src/org/apache/pig/data/DefaultAbstractBag.java Thu Dec 21 21:39:21 2017 @@ -309,6 +309,9 @@ public abstract class DefaultAbstractBag @Override public boolean equals(Object other) { + if( other == null ) { + return false; + } return compareTo(other) == 0; } @@ -359,11 +362,10 @@ public abstract class DefaultAbstractBag @Override public int hashCode() { - int hash = 1; + int hash = 0; Iterator<Tuple> i = iterator(); while (i.hasNext()) { - // Use 37 because we want a prime, and tuple uses 31. - hash = 37 * hash + i.next().hashCode(); + hash += i.next().hashCode(); } return hash; } Modified: pig/trunk/src/org/apache/pig/data/NonSpillableDataBag.java URL: http://svn.apache.org/viewvc/pig/trunk/src/org/apache/pig/data/NonSpillableDataBag.java?rev=1818985&r1=1818984&r2=1818985&view=diff ============================================================================== --- pig/trunk/src/org/apache/pig/data/NonSpillableDataBag.java (original) +++ pig/trunk/src/org/apache/pig/data/NonSpillableDataBag.java Thu Dec 21 21:39:21 2017 @@ -198,11 +198,18 @@ public class NonSpillableDataBag impleme */ @Override public boolean equals(Object obj) { + if( obj == null) { + return false; + } return compareTo(obj) == 0; } public int hashCode() { - return mContents.hashCode(); + int hash = 0; + for( Tuple t : mContents ) { + hash += t.hashCode(); + } + return hash; } @SuppressWarnings("unchecked") Modified: pig/trunk/src/org/apache/pig/data/SingleTupleBag.java URL: http://svn.apache.org/viewvc/pig/trunk/src/org/apache/pig/data/SingleTupleBag.java?rev=1818985&r1=1818984&r2=1818985&view=diff ============================================================================== --- pig/trunk/src/org/apache/pig/data/SingleTupleBag.java (original) +++ pig/trunk/src/org/apache/pig/data/SingleTupleBag.java Thu Dec 21 21:39:21 2017 @@ -156,22 +156,37 @@ public class SingleTupleBag implements D } } - /* (non-Javadoc) - * @see java.lang.Comparable#compareTo(java.lang.Object) - */ @Override - public int compareTo(Object o) { - // TODO Auto-generated method stub - return 0; + public int compareTo(Object other) { + if (this == other) + return 0; + if (other instanceof DataBag) { + DataBag bOther = (DataBag) other; + if (bOther.size() != 1) { + if ( 1 > bOther.size()) return 1; + else return -1; + } + Iterator<Tuple> iter = bOther.iterator(); + return item.compareTo(iter.next()); + } else { + return DataType.compare(this, other); + } } - public boolean equals(Object o){ - // TODO: match to compareTo if it is updated - return true; + @Override + public boolean equals(Object other) { + if( other == null ) { + return false; + } + return compareTo(other) == 0; } + @Override public int hashCode() { - return 42; + if( item != null ) { + return item.hashCode(); + } + return 0; } class TBIterator implements Iterator<Tuple> { Modified: pig/trunk/test/org/apache/pig/test/TestDataModel.java URL: http://svn.apache.org/viewvc/pig/trunk/test/org/apache/pig/test/TestDataModel.java?rev=1818985&r1=1818984&r2=1818985&view=diff ============================================================================== --- pig/trunk/test/org/apache/pig/test/TestDataModel.java (original) +++ pig/trunk/test/org/apache/pig/test/TestDataModel.java Thu Dec 21 21:39:21 2017 @@ -17,6 +17,7 @@ package org.apache.pig.test; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotEquals; import static org.junit.Assert.assertNull; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; @@ -42,7 +43,15 @@ import org.apache.pig.data.BagFactory; import org.apache.pig.data.DataBag; import org.apache.pig.data.DataByteArray; import org.apache.pig.data.DataType; +import org.apache.pig.data.DefaultDataBag; +import org.apache.pig.data.DistinctDataBag; +import org.apache.pig.data.InternalCachedBag; +import org.apache.pig.data.InternalDistinctBag; import org.apache.pig.data.InternalMap; +import org.apache.pig.data.InternalSortedBag; +import org.apache.pig.data.NonSpillableDataBag; +import org.apache.pig.data.SingleTupleBag; +import org.apache.pig.data.SortedDataBag; import org.apache.pig.data.Tuple; import org.apache.pig.data.TupleFactory; import org.junit.Test; @@ -587,6 +596,77 @@ public class TestDataModel { } } + @Test + public void testBagHashCodeAndEquals() throws Exception { + DataBag [] ascBags = {new SortedDataBag(null), new DefaultDataBag(), new DistinctDataBag(), new InternalCachedBag(), new InternalDistinctBag(), new InternalSortedBag(), new NonSpillableDataBag()}; + DataBag [] dscBags = {new SortedDataBag(null), new DefaultDataBag(), new DistinctDataBag(), new InternalCachedBag(), new InternalDistinctBag(), new InternalSortedBag(), new NonSpillableDataBag()}; + Tuple input = giveMeOneOfEach(); + TupleFactory tf = TupleFactory.getInstance(); + for (int i = 0; i < ascBags.length; i++ ) { + for( int j = 0; j < input.size(); j++ ) { + ascBags[i].add(tf.newTuple(input.get(j))); + dscBags[i].add(tf.newTuple(input.get(input.size() - 1 - j))); + } + } + // making sure equals(null) does not throw NPE + for (DataBag bag : ascBags ) { + assertFalse(bag.equals(null)); + } + // Comparing against each other including itself + for (DataBag bag1 : ascBags ) { + for (DataBag bag2 : ascBags ) { + assertEquals("Comparing hashcode for " + bag1.getClass() + " and " + bag2.getClass() + " failed", bag1.hashCode(), bag2.hashCode()); + assertEquals("Comparing content of Bag for " + bag1.getClass() + " and " + bag2.getClass() + " failed", bag1, bag2); + } + } + // Comparing against each other again but now with different insertion + // order. (including from the same type of Bag class) + for (DataBag ascBag : ascBags) { + for (DataBag dscBag : dscBags) { + assertEquals("Comparing hashcode for " + ascBag.getClass() + " and " + dscBag.getClass() + " failed", ascBag.hashCode(), dscBag.hashCode()); + assertEquals("Comparing content of Bag for " + ascBag.getClass() + " and " + dscBag.getClass() + " failed", ascBag, dscBag); + } + } + } + + @Test + public void testBagHashCodeAndEqualsForSingleTuple() throws Exception { + TupleFactory tf = TupleFactory.getInstance(); + + // For each datatype, making sure bag hashcode and content match + for( Object input: giveMeOneOfEach() ) { + DataBag [] singleItemBags = {new SortedDataBag(null), new DefaultDataBag(), new DistinctDataBag(), new InternalCachedBag(), new InternalDistinctBag(), new InternalSortedBag(), new NonSpillableDataBag(), new SingleTupleBag(null)}; + DataBag [] twoItemBags = {new SortedDataBag(null), new DefaultDataBag(), new DistinctDataBag(), new InternalCachedBag(), new InternalDistinctBag(), new InternalSortedBag(), new NonSpillableDataBag()}; + + for( DataBag singleItemBag : singleItemBags ) { + singleItemBag.add(tf.newTuple(input)); + } + + for( DataBag singleItemBag1 : singleItemBags ) { + for( DataBag singleItemBag2 : singleItemBags ) { + assertEquals("Comparing hashcode for " + singleItemBag1.getClass() + + " and " + singleItemBag2.getClass() + " failed", + singleItemBag1.hashCode(), singleItemBag2.hashCode()); + assertEquals("Comparing content of Bag for " + singleItemBag1.getClass() + + " and " + singleItemBag2.getClass() + " failed", + singleItemBag1, singleItemBag2); + } + } + + // making sure no two-items-bag would match with single item bag + for( DataBag twoItemBag : twoItemBags ) { + twoItemBag.add(tf.newTuple(input)); + twoItemBag.add(tf.newTuple(new String("Unique string not to overlap with a tuple above"))); + } + + for( DataBag twoItemBag : twoItemBags ) { + for( DataBag singleItemBag : singleItemBags ) { + assertNotEquals(twoItemBag, singleItemBag); + } + } + } + } + private Tuple giveMeOneOfEach() throws Exception { TupleFactory tf = TupleFactory.getInstance();