Author: gates Date: Thu Oct 30 11:33:43 2008 New Revision: 709222 URL: http://svn.apache.org/viewvc?rev=709222&view=rev Log: PIG-511 Fixed flaws in builtin UDF diff pointed out by Crisitan Ivascu.
Modified: hadoop/pig/branches/types/src/org/apache/pig/builtin/DIFF.java hadoop/pig/branches/types/test/org/apache/pig/test/TestBuiltin.java Modified: hadoop/pig/branches/types/src/org/apache/pig/builtin/DIFF.java URL: http://svn.apache.org/viewvc/hadoop/pig/branches/types/src/org/apache/pig/builtin/DIFF.java?rev=709222&r1=709221&r2=709222&view=diff ============================================================================== --- hadoop/pig/branches/types/src/org/apache/pig/builtin/DIFF.java (original) +++ hadoop/pig/branches/types/src/org/apache/pig/builtin/DIFF.java Thu Oct 30 11:33:43 2008 @@ -18,7 +18,9 @@ package org.apache.pig.builtin; import java.io.IOException; +import java.util.HashSet; import java.util.Iterator; +import java.util.Set; import org.apache.pig.EvalFunc; import org.apache.pig.backend.executionengine.ExecException; @@ -33,8 +35,6 @@ * will emit any Tuples that are in on of the DataBags but not the other. If the * fields are values, it will emit tuples with values that do not match. * - * @author breed - * */ public class DIFF extends EvalFunc<DataBag> { TupleFactory mTupleFactory = TupleFactory.getInstance(); @@ -78,48 +78,20 @@ DataBag bag1, DataBag bag2, DataBag emitTo) { - // Create two distinct versions of the bag. This will speed up - // comparison, and provide us a sorted order so we don't have to do - // an n^2 lookup. - DataBag d1 = mBagFactory.newDistinctBag(); - DataBag d2 = mBagFactory.newDistinctBag(); - Iterator<Tuple> i1 = d1.iterator(); - Iterator<Tuple> i2 = d2.iterator(); - while (i1.hasNext()) d1.add(i1.next()); - while (i2.hasNext()) d2.add(i2.next()); - - i1 = d1.iterator(); - i2 = d2.iterator(); - - Tuple t1 = i1.next(); - Tuple t2 = i2.next(); - - while (i1.hasNext() && i2.hasNext()) { - int c = t1.compareTo(t2); - - if (c < 0) { - // place t1 in the result bag and advance i1 - emitTo.add(t1); - t1 = i1.next(); - } else if (c > 0) { - // place t2 in the result bag and advance i2 - emitTo.add(t2); - t2 = i2.next(); - } else if (c == 0) { - // put neither in the result bag, advance both iterators - t1 = i1.next(); - t2 = i2.next(); - } - } + // Build two hash tables and probe with first one, then the other. + // This does make the assumption that the distinct set of keys from + // each bag will fit in memory. + Set<Tuple> s1 = new HashSet<Tuple>(); + Iterator<Tuple> i1 = bag1.iterator(); + while (i1.hasNext()) s1.add(i1.next()); + + Set<Tuple> s2 = new HashSet<Tuple>(); + Iterator<Tuple> i2 = bag2.iterator(); + while (i2.hasNext()) s2.add(i2.next()); + + for (Tuple t : s1) if (!s2.contains(t)) emitTo.add(t); + for (Tuple t : s2) if (!s1.contains(t)) emitTo.add(t); - // One ran out, put all the rest of the other (if there are any) in - // the result bag. - while (i1.hasNext()) { - emitTo.add(i1.next()); - } - while (i2.hasNext()) { - emitTo.add(i2.next()); - } } Modified: hadoop/pig/branches/types/test/org/apache/pig/test/TestBuiltin.java URL: http://svn.apache.org/viewvc/hadoop/pig/branches/types/test/org/apache/pig/test/TestBuiltin.java?rev=709222&r1=709221&r2=709222&view=diff ============================================================================== --- hadoop/pig/branches/types/test/org/apache/pig/test/TestBuiltin.java (original) +++ hadoop/pig/branches/types/test/org/apache/pig/test/TestBuiltin.java Thu Oct 30 11:33:43 2008 @@ -19,6 +19,7 @@ import java.io.File; import java.io.PrintWriter; +import java.util.Arrays; import java.util.HashMap; import java.util.Iterator; import java.util.Map; @@ -890,6 +891,44 @@ assertTrue(f1.equals(f2)); } + + @Test + public void testDIFF() throws Exception { + // Test it in the case with two bags. + BagFactory bf = BagFactory.getInstance(); + TupleFactory tf = TupleFactory.getInstance(); + + DataBag b1 = bf.newDefaultBag(); + DataBag b2 = bf.newDefaultBag(); + for (int i = 0; i < 10; i++) b1.add(tf.newTuple(new Integer(i))); + for (int i = 0; i < 10; i += 2) b2.add(tf.newTuple(new Integer(i))); + Tuple t = tf.newTuple(2); + t.set(0, b1); + t.set(1, b2); + DIFF d = new DIFF(); + DataBag result = d.exec(t); + + assertEquals(5, result.size()); + Iterator<Tuple> i = result.iterator(); + int[] values = new int[5]; + for (int j = 0; j < 5; j++) values[j] = (Integer)i.next().get(0); + Arrays.sort(values); + for (int j = 1; j < 10; j += 2) assertEquals(j, values[j/2]); + + // Test it in the case of two objects that are equals + t = tf.newTuple(2); + t.set(0, new Integer(1)); + t.set(1, new Integer(1)); + result = d.exec(t); + assertEquals(0, result.size()); + + // Test it in the case of two objects that are not equal + t = tf.newTuple(2); + t.set(0, new Integer(1)); + t.set(1, new Integer(2)); + result = d.exec(t); + assertEquals(2, result.size()); + } private static String getInputType(String typeFor) { return allowedInput.get(typeFor);