leerho commented on a change in pull request #319:
URL: 
https://github.com/apache/incubator-datasketches-java/pull/319#discussion_r433386568



##########
File path: src/main/java/org/apache/datasketches/tuple/AnotB.java
##########
@@ -22,45 +22,327 @@
 import static org.apache.datasketches.Util.MIN_LG_NOM_LONGS;
 import static org.apache.datasketches.Util.REBUILD_THRESHOLD;
 import static org.apache.datasketches.Util.ceilingPowerOf2;
+import static org.apache.datasketches.Util.simpleIntLog2;
 
 import java.lang.reflect.Array;
 import java.util.Arrays;
 
 import org.apache.datasketches.HashOperations;
+import org.apache.datasketches.theta.HashIterator;
 
 /**
- * Computes a set difference of two generic tuple sketches
+ * Computes a set difference, A-AND-NOT-B, of two generic tuple sketches
  * @param <S> Type of Summary
  */
 public final class AnotB<S extends Summary> {
-  private boolean isEmpty_ = true;
-  private long theta_ = Long.MAX_VALUE;
-  private long[] keys_;
-  private S[] summaries_;
-  private int count_;
+  private boolean empty_ = true;
+  private long thetaLong_ = Long.MAX_VALUE;
+  private long[] keys_ = null;   //always in compact form, not necessarily 
sorted
+  private S[] summaries_ = null; //always in compact form, not necessarily 
sorted
+  private int count_ = 0;
+
+  /**
+   * Sets the given Tuple sketch as the first argument <i>A</i>. This 
overwrites the internal state of
+   * this AnotB operator with the contents of the given sketch. This sets the 
stage for multiple
+   * following <i>notB</i> operations.
+   * @param skA The incoming sketch for the first argument, <i>A</i>.
+   */
+  public void setA(final Sketch<S> skA) {
+    if ((skA == null) || skA.isEmpty()) { return; }
+    //skA is not empty
+    empty_ = false;
+    thetaLong_ = skA.getThetaLong();
+    final CompactSketch<S> cskA = (skA instanceof CompactSketch)
+        ? (CompactSketch<S>)skA
+        : ((QuickSelectSketch<S>)skA).compact();
+    keys_ = cskA.keys_;
+    summaries_ = cskA.summaries_;
+    count_ = cskA.getRetainedEntries();
+  }
+
+  /**
+   * Performs an <i>AND NOT</i> operation with the existing internal state of 
this AnoB operator.
+   * @param skB The incoming Tuple sketch for the second (or following) 
argument <i>B</i>.
+   */
+  @SuppressWarnings("unchecked")
+  public void notB(final Sketch<S> skB) {
+    if (empty_) { return; }
+    if ((skB == null) || skB.isEmpty()) { return; }
+    //skB is not empty
+    final long thetaLongB = skB.getThetaLong();
+    thetaLong_ = Math.min(thetaLong_, thetaLongB);
+    //Build hashtable and removes keys of skB >= theta
+    final int countB = skB.getRetainedEntries();
+    final long[] hashTableKeysB = convertToHashTable(skB.keys_, countB, 
thetaLong_);
+
+    //build temporary arrays of skA
+    final long[] tmpKeysA = new long[count_];
+    final S[] tmpSummariesA =
+        (S[]) Array.newInstance(summaries_.getClass().getComponentType(), 
count_);
+
+    //search for non matches and build temp arrays
+    int nonMatches = 0;
+    for (int i = 0; i < count_; i++) {
+      final long key = keys_[i];
+      if ((key != 0) && (key < thetaLong_)) { //skips keys of A >= theta
+        final int index =
+            HashOperations.hashSearch(hashTableKeysB, 
simpleIntLog2(hashTableKeysB.length), key);
+        if (index == -1) {
+          tmpKeysA[nonMatches] = key;
+          tmpSummariesA[nonMatches] = summaries_[i];
+          nonMatches++;
+        }
+      }
+    }
+    keys_ = Arrays.copyOfRange(tmpKeysA, 0, nonMatches);
+    summaries_ = Arrays.copyOfRange(tmpSummariesA, 0, nonMatches);
+    count_ = nonMatches;
+  }
+
+  /**
+   * Performs an <i>AND NOT</i> operation with the existing internal state of 
this AnoB operator.
+   * @param skB The incoming Theta sketch for the second (or following) 
argument <i>B</i>.
+   */
+  @SuppressWarnings("unchecked")
+  public void notB(final org.apache.datasketches.theta.Sketch skB) {
+    if (empty_) { return; }
+    if ((skB == null) || skB.isEmpty()) { return; }
+    //skB is not empty
+    final long thetaLongB = skB.getThetaLong();
+    thetaLong_ = Math.min(thetaLong_, thetaLongB);
+    //Build hashtable and removes keys of skB >= theta
+    final int countB = skB.getRetainedEntries();
+    final long[] hashTableKeysB =
+        convertToHashTable(extractThetaHashArray(skB, countB), countB, 
thetaLong_);
+
+    //build temporary arrays of skA
+    final long[] tmpKeysA = new long[count_];
+    final S[] tmpSummariesA =
+        (S[]) Array.newInstance(summaries_.getClass().getComponentType(), 
count_);
+
+    //search for non matches and build temp arrays
+    int nonMatches = 0;
+    for (int i = 0; i < count_; i++) {
+      final long key = keys_[i];
+      if ((key != 0) && (key < thetaLong_)) { //skips keys of A >= theta

Review comment:
       This calls into question whether the check (key != 0) is even necessary. 
 In theory, probably not.  By the way, hash keys in the theta family are forced 
to be positive or zero by shifting the output of the hash function right by one 
bit (effectively making them 63 bits of entropy).  Actually this check, if it 
exists at all, should probably be (key > 0).
   
   So why do I put in checks like this?  Because I am conservative and worried 
about the other source of non-valid hashes, which is corruption of the sketch 
binaries as they are moved around in a system. Which, by the way, is far more 
likely than a valid hash being <= 0.  So this little check just insures that 
invalid hashes don't get further propagated.  
   
   




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to