Author: gates
Date: Thu Oct 30 11:33:43 2008
New Revision: 709222

URL: http://svn.apache.org/viewvc?rev=709222&view=rev
Log:
PIG-511 Fixed flaws in builtin UDF diff pointed out by Crisitan Ivascu.


Modified:
    hadoop/pig/branches/types/src/org/apache/pig/builtin/DIFF.java
    hadoop/pig/branches/types/test/org/apache/pig/test/TestBuiltin.java

Modified: hadoop/pig/branches/types/src/org/apache/pig/builtin/DIFF.java
URL: 
http://svn.apache.org/viewvc/hadoop/pig/branches/types/src/org/apache/pig/builtin/DIFF.java?rev=709222&r1=709221&r2=709222&view=diff
==============================================================================
--- hadoop/pig/branches/types/src/org/apache/pig/builtin/DIFF.java (original)
+++ hadoop/pig/branches/types/src/org/apache/pig/builtin/DIFF.java Thu Oct 30 
11:33:43 2008
@@ -18,7 +18,9 @@
 package org.apache.pig.builtin;
 
 import java.io.IOException;
+import java.util.HashSet;
 import java.util.Iterator;
+import java.util.Set;
 
 import org.apache.pig.EvalFunc;
 import org.apache.pig.backend.executionengine.ExecException;
@@ -33,8 +35,6 @@
  * will emit any Tuples that are in on of the DataBags but not the other. If 
the
  * fields are values, it will emit tuples with values that do not match.
  * 
- * @author breed
- *
  */
 public class DIFF extends EvalFunc<DataBag> {
     TupleFactory mTupleFactory = TupleFactory.getInstance();
@@ -78,48 +78,20 @@
             DataBag bag1,
             DataBag bag2,
             DataBag emitTo) {
-        // Create two distinct versions of the bag.  This will speed up
-        // comparison, and provide us a sorted order so we don't have to do
-        // an n^2 lookup.
-        DataBag d1 = mBagFactory.newDistinctBag();
-        DataBag d2 = mBagFactory.newDistinctBag();
-        Iterator<Tuple> i1 = d1.iterator();
-        Iterator<Tuple> i2 = d2.iterator();
-        while (i1.hasNext()) d1.add(i1.next());
-        while (i2.hasNext()) d2.add(i2.next());
-
-        i1 = d1.iterator();
-        i2 = d2.iterator();
-
-        Tuple t1 = i1.next();
-        Tuple t2 = i2.next();
-
-        while (i1.hasNext() && i2.hasNext()) {
-            int c = t1.compareTo(t2);
-
-            if (c < 0) {
-                // place t1 in the result bag and advance i1
-                emitTo.add(t1);
-                t1 = i1.next();
-            } else if (c > 0) {
-                // place t2 in the result bag and advance i2
-                emitTo.add(t2);
-                t2 = i2.next();
-            } else if (c == 0) {
-                // put neither in the result bag, advance both iterators
-                t1 = i1.next();
-                t2 = i2.next();
-            }
-        }
+        // Build two hash tables and probe with first one, then the other.
+        // This does make the assumption that the distinct set of keys from
+        // each bag will fit in memory.
+        Set<Tuple> s1 = new HashSet<Tuple>();
+        Iterator<Tuple> i1 = bag1.iterator();
+        while (i1.hasNext()) s1.add(i1.next());
+
+        Set<Tuple> s2 = new HashSet<Tuple>();
+        Iterator<Tuple> i2 = bag2.iterator();
+        while (i2.hasNext()) s2.add(i2.next());
+
+        for (Tuple t : s1) if (!s2.contains(t)) emitTo.add(t);
+        for (Tuple t : s2) if (!s1.contains(t)) emitTo.add(t);
 
-        // One ran out, put all the rest of the other (if there are any) in
-        // the result bag.
-        while (i1.hasNext()) {
-            emitTo.add(i1.next());
-        }
-        while (i2.hasNext()) {
-            emitTo.add(i2.next());
-        }
     }
     
     

Modified: hadoop/pig/branches/types/test/org/apache/pig/test/TestBuiltin.java
URL: 
http://svn.apache.org/viewvc/hadoop/pig/branches/types/test/org/apache/pig/test/TestBuiltin.java?rev=709222&r1=709221&r2=709222&view=diff
==============================================================================
--- hadoop/pig/branches/types/test/org/apache/pig/test/TestBuiltin.java 
(original)
+++ hadoop/pig/branches/types/test/org/apache/pig/test/TestBuiltin.java Thu Oct 
30 11:33:43 2008
@@ -19,6 +19,7 @@
 
 import java.io.File;
 import java.io.PrintWriter;
+import java.util.Arrays;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.Map;
@@ -890,6 +891,44 @@
         
         assertTrue(f1.equals(f2));        
     }
+
+    @Test
+    public void testDIFF() throws Exception {
+        // Test it in the case with two bags.
+        BagFactory bf = BagFactory.getInstance();
+        TupleFactory tf = TupleFactory.getInstance();
+
+        DataBag b1 = bf.newDefaultBag();
+        DataBag b2 = bf.newDefaultBag();
+        for (int i = 0; i < 10; i++) b1.add(tf.newTuple(new Integer(i)));
+        for (int i = 0; i < 10; i += 2) b2.add(tf.newTuple(new Integer(i)));
+        Tuple t = tf.newTuple(2);
+        t.set(0, b1);
+        t.set(1, b2);
+        DIFF d = new DIFF();
+        DataBag result = d.exec(t);
+
+        assertEquals(5, result.size());
+        Iterator<Tuple> i = result.iterator();
+        int[] values = new int[5];
+        for (int j = 0; j < 5; j++) values[j] = (Integer)i.next().get(0);
+        Arrays.sort(values);
+        for (int j = 1; j < 10; j += 2) assertEquals(j, values[j/2]);
+
+        // Test it in the case of two objects that are equals
+        t = tf.newTuple(2);
+        t.set(0, new Integer(1));
+        t.set(1, new Integer(1));
+        result = d.exec(t);
+        assertEquals(0, result.size());
+
+        // Test it in the case of two objects that are not equal
+        t = tf.newTuple(2);
+        t.set(0, new Integer(1));
+        t.set(1, new Integer(2));
+        result = d.exec(t);
+        assertEquals(2, result.size());
+    }
     
     private static String getInputType(String typeFor) {
         return allowedInput.get(typeFor);


Reply via email to