Author: rohini
Date: Mon Oct  8 22:26:48 2018
New Revision: 1843210

URL: http://svn.apache.org/viewvc?rev=1843210&view=rev
Log:
PIG-5357: BagFactory interface should support creating a distinct bag from a set (jtolar via rohini)

Modified:
    pig/trunk/CHANGES.txt
    pig/trunk/src/org/apache/pig/data/BagFactory.java
    pig/trunk/src/org/apache/pig/data/DefaultBagFactory.java
    pig/trunk/src/org/apache/pig/data/DistinctDataBag.java

Modified: pig/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/pig/trunk/CHANGES.txt?rev=1843210&r1=1843209&r2=1843210&view=diff
==============================================================================
--- pig/trunk/CHANGES.txt (original)
+++ pig/trunk/CHANGES.txt Mon Oct  8 22:26:48 2018
@@ -26,6 +26,8 @@ PIG-5282: Upgade to Java 8 (satishsaley
  
 IMPROVEMENTS
 
+PIG-5357: BagFactory interface should support creating a distinct bag from a set (jtolar via rohini)
+
 PIG-5354: Show fieldname and a line number for casting errors (knoguchi)
 
 PIG-5342: Add setting to turn off bloom join combiner (satishsaley via rohini)

Modified: pig/trunk/src/org/apache/pig/data/BagFactory.java
URL: http://svn.apache.org/viewvc/pig/trunk/src/org/apache/pig/data/BagFactory.java?rev=1843210&r1=1843209&r2=1843210&view=diff
==============================================================================
--- pig/trunk/src/org/apache/pig/data/BagFactory.java (original)
+++ pig/trunk/src/org/apache/pig/data/BagFactory.java Mon Oct  8 22:26:48 2018
@@ -23,6 +23,7 @@ import java.net.URL;
 import java.net.URLClassLoader;
 import java.util.Comparator;
 import java.util.List;
+import java.util.Set;
 
 import org.apache.pig.classification.InterfaceAudience;
 import org.apache.pig.classification.InterfaceStability;
@@ -127,6 +128,21 @@ public abstract class BagFactory {
     public abstract DataBag newDistinctBag();
 
     /**
+     * Get a distinct data bag.  Distinct bags guarantee that when an
+     * iterator is opened on the bag, no two tuples returned from the
+     * iterator will be equal.
+     * @param tuples distinct set of tuples
+     * @return distinct data bag
+     */
+    public DataBag newDistinctBag(Set<Tuple> tuples) {
+        DataBag bag = newDistinctBag();
+        for (Tuple t : tuples) {
+            bag.add(t);
+        }
+        return bag;
+    }
+
+    /**
      * Construct a new BagFactory
      */
     protected BagFactory() {

Modified: pig/trunk/src/org/apache/pig/data/DefaultBagFactory.java
URL: http://svn.apache.org/viewvc/pig/trunk/src/org/apache/pig/data/DefaultBagFactory.java?rev=1843210&r1=1843209&r2=1843210&view=diff
==============================================================================
--- pig/trunk/src/org/apache/pig/data/DefaultBagFactory.java (original)
+++ pig/trunk/src/org/apache/pig/data/DefaultBagFactory.java Mon Oct  8 22:26:48 2018
@@ -19,6 +19,7 @@ package org.apache.pig.data;
 
 import java.util.Comparator;
 import java.util.List;
+import java.util.Set;
 
 /**
  * Default implementation of BagFactory.
@@ -76,6 +77,21 @@ public class DefaultBagFactory extends B
         return b;
     }
 
+    /**
+     * Get a distinct data bag.
+     * @param tuples Distinct set of tuples used to initialize the bag.
+     * If null, an empty bag is returned.
+     */
+    @Override
+    public DataBag newDistinctBag(Set<Tuple> tuples) {
+        if (tuples == null) {
+            return newDistinctBag();
+        }
+
+        DataBag b = new DistinctDataBag(tuples);
+        return b;
+    }
+
     DefaultBagFactory() {
         super();
     }

Modified: pig/trunk/src/org/apache/pig/data/DistinctDataBag.java
URL: http://svn.apache.org/viewvc/pig/trunk/src/org/apache/pig/data/DistinctDataBag.java?rev=1843210&r1=1843209&r2=1843210&view=diff
==============================================================================
--- pig/trunk/src/org/apache/pig/data/DistinctDataBag.java (original)
+++ pig/trunk/src/org/apache/pig/data/DistinctDataBag.java Mon Oct  8 22:26:48 2018
@@ -32,6 +32,7 @@ import java.util.HashSet;
 import java.util.Iterator;
 import java.util.LinkedList;
 import java.util.ListIterator;
+import java.util.Set;
 import java.util.TreeSet;
 
 import org.apache.commons.logging.Log;
@@ -63,6 +64,13 @@ public class DistinctDataBag extends Def
         mContents = new HashSet<Tuple>();
     }
 
+    public DistinctDataBag(Set<Tuple> tuples) {
+        mContents = tuples;
+
+        mSize = mContents.size();
+        markSpillableIfNecessary();
+    }
+
     @Override
     public boolean isSorted() {
         return false;
@@ -227,7 +235,7 @@ public class DistinctDataBag extends Def
         DistinctDataBagIterator() {
             // If this is the first read, we need to sort the data.
             synchronized (mContents) {
-                if (mContents instanceof HashSet) {
+                if (mContents instanceof Set) {
                     preMerge();
                     // We're the first reader, we need to sort the data.
                     // This is in case it gets dumped under us.


Reply via email to