davecromberge commented on a change in pull request #319:
URL:
https://github.com/apache/incubator-datasketches-java/pull/319#discussion_r433478030
##########
File path: src/main/java/org/apache/datasketches/tuple/AnotB.java
##########
@@ -22,45 +22,327 @@
import static org.apache.datasketches.Util.MIN_LG_NOM_LONGS;
import static org.apache.datasketches.Util.REBUILD_THRESHOLD;
import static org.apache.datasketches.Util.ceilingPowerOf2;
+import static org.apache.datasketches.Util.simpleIntLog2;
import java.lang.reflect.Array;
import java.util.Arrays;
import org.apache.datasketches.HashOperations;
+import org.apache.datasketches.theta.HashIterator;
/**
- * Computes a set difference of two generic tuple sketches
+ * Computes a set difference, A-AND-NOT-B, of two generic tuple sketches
* @param <S> Type of Summary
*/
public final class AnotB<S extends Summary> {
- private boolean isEmpty_ = true;
- private long theta_ = Long.MAX_VALUE;
- private long[] keys_;
- private S[] summaries_;
- private int count_;
+ private boolean empty_ = true;
+ private long thetaLong_ = Long.MAX_VALUE;
+ private long[] keys_ = null; //always in compact form, not necessarily
sorted
+ private S[] summaries_ = null; //always in compact form, not necessarily
sorted
+ private int count_ = 0;
+
+ /**
+ * Sets the given Tuple sketch as the first argument <i>A</i>. This
overwrites the internal state of
+ * this AnotB operator with the contents of the given sketch. This sets the
stage for multiple
+ * following <i>notB</i> operations.
+ * @param skA The incoming sketch for the first argument, <i>A</i>.
+ */
+ public void setA(final Sketch<S> skA) {
+ if ((skA == null) || skA.isEmpty()) { return; }
+ //skA is not empty
+ empty_ = false;
+ thetaLong_ = skA.getThetaLong();
+ final CompactSketch<S> cskA = (skA instanceof CompactSketch)
+ ? (CompactSketch<S>)skA
+ : ((QuickSelectSketch<S>)skA).compact();
+ keys_ = cskA.keys_;
+ summaries_ = cskA.summaries_;
+ count_ = cskA.getRetainedEntries();
+ }
+
+ /**
+ * Performs an <i>AND NOT</i> operation with the existing internal state of
this AnoB operator.
+ * @param skB The incoming Tuple sketch for the second (or following)
argument <i>B</i>.
+ */
+ @SuppressWarnings("unchecked")
+ public void notB(final Sketch<S> skB) {
+ if (empty_) { return; }
+ if ((skB == null) || skB.isEmpty()) { return; }
+ //skB is not empty
+ final long thetaLongB = skB.getThetaLong();
+ thetaLong_ = Math.min(thetaLong_, thetaLongB);
+ //Build hashtable and removes keys of skB >= theta
+ final int countB = skB.getRetainedEntries();
+ final long[] hashTableKeysB = convertToHashTable(skB.keys_, countB,
thetaLong_);
+
+ //build temporary arrays of skA
+ final long[] tmpKeysA = new long[count_];
+ final S[] tmpSummariesA =
+ (S[]) Array.newInstance(summaries_.getClass().getComponentType(),
count_);
+
+ //search for non matches and build temp arrays
+ int nonMatches = 0;
+ for (int i = 0; i < count_; i++) {
+ final long key = keys_[i];
+ if ((key != 0) && (key < thetaLong_)) { //skips keys of A >= theta
+ final int index =
+ HashOperations.hashSearch(hashTableKeysB,
simpleIntLog2(hashTableKeysB.length), key);
+ if (index == -1) {
+ tmpKeysA[nonMatches] = key;
+ tmpSummariesA[nonMatches] = summaries_[i];
+ nonMatches++;
+ }
+ }
+ }
+ keys_ = Arrays.copyOfRange(tmpKeysA, 0, nonMatches);
+ summaries_ = Arrays.copyOfRange(tmpSummariesA, 0, nonMatches);
+ count_ = nonMatches;
+ }
+
+ /**
+ * Performs an <i>AND NOT</i> operation with the existing internal state of
this AnoB operator.
+ * @param skB The incoming Theta sketch for the second (or following)
argument <i>B</i>.
+ */
+ @SuppressWarnings("unchecked")
+ public void notB(final org.apache.datasketches.theta.Sketch skB) {
+ if (empty_) { return; }
+ if ((skB == null) || skB.isEmpty()) { return; }
+ //skB is not empty
+ final long thetaLongB = skB.getThetaLong();
+ thetaLong_ = Math.min(thetaLong_, thetaLongB);
+ //Build hashtable and removes keys of skB >= theta
+ final int countB = skB.getRetainedEntries();
+ final long[] hashTableKeysB =
+ convertToHashTable(extractThetaHashArray(skB, countB), countB,
thetaLong_);
+
+ //build temporary arrays of skA
+ final long[] tmpKeysA = new long[count_];
+ final S[] tmpSummariesA =
+ (S[]) Array.newInstance(summaries_.getClass().getComponentType(),
count_);
+
+ //search for non matches and build temp arrays
+ int nonMatches = 0;
+ for (int i = 0; i < count_; i++) {
+ final long key = keys_[i];
+ if ((key != 0) && (key < thetaLong_)) { //skips keys of A >= theta
Review comment:
I hadn't considered the case where the contents of the sketch itself are
subject to corruption. I agree that it is valid to preserve the check in this
case.
It's very useful background information, and it's interesting to see the
actual probability explained in those terms and contrasted to the actual RSE of
the sketch. Thanks for the explanation!
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]