This is an automated email from the ASF dual-hosted git repository.

zabetak pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/calcite.git


The following commit(s) were added to refs/heads/master by this push:
     new 26b00ea  [CALCITE-4132] Estimate the number of distinct values more 
accurately (Liya Fan)
26b00ea is described below

commit 26b00ea80c273a4c219c9b170d6b406ca88a402e
Author: liyafan82 <[email protected]>
AuthorDate: Mon Jul 20 10:07:51 2020 +0800

    [CALCITE-4132] Estimate the number of distinct values more accurately (Liya 
Fan)
    
    Close apache/calcite#2077
---
 .../org/apache/calcite/rel/metadata/RelMdUtil.java | 55 +++++++++++++---------
 .../apache/calcite/rel/metadata/RelMdUtilTest.java | 54 +++++++++++++++++++++
 2 files changed, 88 insertions(+), 21 deletions(-)

diff --git a/core/src/main/java/org/apache/calcite/rel/metadata/RelMdUtil.java 
b/core/src/main/java/org/apache/calcite/rel/metadata/RelMdUtil.java
index 779e9b4..48e0301 100644
--- a/core/src/main/java/org/apache/calcite/rel/metadata/RelMdUtil.java
+++ b/core/src/main/java/org/apache/calcite/rel/metadata/RelMdUtil.java
@@ -289,9 +289,12 @@ public class RelMdUtil {
    * between 1 and 100, you'll most likely end up with fewer than 100 distinct
    * values, because you'll pick some values more than once.
    *
-   * @param domainSize  number of distinct values in the domain
-   * @param numSelected number selected from the domain
-   * @return number of distinct values for subset selected
+   * The implementation is an unbiased estimation of the number of distinct 
values
+   * by performing a number of selections (with replacement) from a universe 
set.
+   *
+   * @param domainSize size of the universe set.
+   * @param numSelected the number of selections.
+   * @return the expected number of distinct values.
    */
   public static Double numDistinctVals(
       Double domainSize,
@@ -305,24 +308,34 @@ public class RelMdUtil {
     double dSize = capInfinity(domainSize);
     double numSel = capInfinity(numSelected);
 
-    // The formula for this is:
-    // 1. Assume we pick 80 random values between 1 and 100.
-    // 2. The chance we skip any given value is .99 ^ 80
-    // 3. Thus on average we will skip .99 ^ 80 percent of the values
-    //    in the domain
-    // 4. Generalized, we skip ( (n-1)/n ) ^ k values where n is the
-    //    number of possible values and k is the number we are selecting
-    // 5. This can be rewritten via approximation (if you want to
-    //    know why approximation is called for here, ask Bill Keese):
-    //  ((n-1)/n) ^ k
-    //  = e ^ ln( ((n-1)/n) ^ k )
-    //  = e ^ (k * ln ((n-1)/n))
-    //  = e ^ (k * ln (1-1/n))
-    // ~= e ^ (k * (-1/n))  because ln(1+x) ~= x for small x
-    //  = e ^ (-k/n)
-    // 6. Flipping it from number skipped to number visited, we get:
-    double res =
-        (dSize > 0) ? ((1.0 - Math.exp(-1 * numSel / dSize)) * dSize) : 0;
+    // The formula is derived as follows:
+    //
+    // Suppose we have N distinct values, and we select n from them (with 
replacement).
+    // For any value i, we use C(i) = k to express the event that the value is 
selected exactly
+    // k times in the n selections.
+    //
+    // It can be seen that, for any one selection, the probability of the 
value being selected
+    // is 1/N. So the probability of being selected exactly k times is
+    //
+    // Pr{C(i) = k} = C(n, k) * (1 / N)^k * (1 - 1 / N)^(n - k),
+    // where C(n, k) = n! / [k! * (n - k)!]
+    //
+    // The probability that the value is never selected is
+    // Pr{C(i) = 0} = C(n, 0) * (1/N)^0 * (1 - 1 / N)^n = (1 - 1 / N)^n
+    //
+    // We define indicator random variable I(i), so that I(i) = 1 iff
+    // value i is selected in at least one of the selections. We have
+    // E[I(i)] = 1 * Pr{I(i) = 1} + 0 * Pr{I(i) = 0) = Pr{I(i) = 1}
+    // = Pr{C(i) > 0} = 1 - Pr{C(i) = 0} = 1 - (1 - 1 / N)^n
+    //
+    // The expected number of distinct values in the overall n selections is:
+    // E(I(1)] + E(I(2)] + ... + E(I(N)] = N * [1 - (1 - 1 / N)^n]
+
+    double res = 0;
+    if (dSize > 0) {
+      double expo = numSel * Math.log(1.0 - 1.0 / dSize);
+      res = (1.0 - Math.exp(expo)) * dSize;
+    }
 
     // fix the boundary cases
     if (res > dSize) {
diff --git 
a/core/src/test/java/org/apache/calcite/rel/metadata/RelMdUtilTest.java 
b/core/src/test/java/org/apache/calcite/rel/metadata/RelMdUtilTest.java
new file mode 100644
index 0000000..617b9c6
--- /dev/null
+++ b/core/src/test/java/org/apache/calcite/rel/metadata/RelMdUtilTest.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to you under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.calcite.rel.metadata;
+
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+/**
+ * Test cases for {@link RelMdUtil}.
+ */
+public class RelMdUtilTest {
+
+  @Test void testNumDistinctVals() {
+    // the first element must be distinct, the second one has half chance of 
being distinct
+    assertEquals(1.5, RelMdUtil.numDistinctVals(2.0, 2.0), 1e-5);
+
+    // when no selection is made, we get no distinct value
+    double domainSize = 100;
+    assertEquals(0, RelMdUtil.numDistinctVals(domainSize, 0.0), 1e-5);
+
+    // when we perform one selection, we always have 1 distinct value,
+    // regardless of the domain size
+    for (double dSize = 1; dSize < 100; dSize += 1) {
+      assertEquals(1.0, RelMdUtil.numDistinctVals(dSize, 1.0), 1e-5);
+    }
+
+    // when we select n objects from a set with n values
+    // we get no more than n distinct values
+    for (double dSize = 1; dSize < 100; dSize += 1) {
+      assertTrue(RelMdUtil.numDistinctVals(dSize, dSize) <= dSize);
+    }
+
+    // when the number of selections is large enough
+    // we get all distinct values, w.h.p.
+    assertEquals(domainSize, RelMdUtil.numDistinctVals(domainSize, domainSize 
* 100), 1e-5);
+  }
+
+}

Reply via email to