This is an automated email from the ASF dual-hosted git repository.
zabetak pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/calcite.git
The following commit(s) were added to refs/heads/master by this push:
new 26b00ea [CALCITE-4132] Estimate the number of distinct values more
accurately (Liya Fan)
26b00ea is described below
commit 26b00ea80c273a4c219c9b170d6b406ca88a402e
Author: liyafan82 <[email protected]>
AuthorDate: Mon Jul 20 10:07:51 2020 +0800
[CALCITE-4132] Estimate the number of distinct values more accurately (Liya
Fan)
Close apache/calcite#2077
---
.../org/apache/calcite/rel/metadata/RelMdUtil.java | 55 +++++++++++++---------
.../apache/calcite/rel/metadata/RelMdUtilTest.java | 54 +++++++++++++++++++++
2 files changed, 88 insertions(+), 21 deletions(-)
diff --git a/core/src/main/java/org/apache/calcite/rel/metadata/RelMdUtil.java
b/core/src/main/java/org/apache/calcite/rel/metadata/RelMdUtil.java
index 779e9b4..48e0301 100644
--- a/core/src/main/java/org/apache/calcite/rel/metadata/RelMdUtil.java
+++ b/core/src/main/java/org/apache/calcite/rel/metadata/RelMdUtil.java
@@ -289,9 +289,12 @@ public class RelMdUtil {
* between 1 and 100, you'll most likely end up with fewer than 100 distinct
* values, because you'll pick some values more than once.
*
- * @param domainSize number of distinct values in the domain
- * @param numSelected number selected from the domain
- * @return number of distinct values for subset selected
+ * The implementation is an unbiased estimation of the number of distinct
values
+ * by performing a number of selections (with replacement) from a universe
set.
+ *
+ * @param domainSize size of the universe set.
+ * @param numSelected the number of selections.
+ * @return the expected number of distinct values.
*/
public static Double numDistinctVals(
Double domainSize,
@@ -305,24 +308,34 @@ public class RelMdUtil {
double dSize = capInfinity(domainSize);
double numSel = capInfinity(numSelected);
- // The formula for this is:
- // 1. Assume we pick 80 random values between 1 and 100.
- // 2. The chance we skip any given value is .99 ^ 80
- // 3. Thus on average we will skip .99 ^ 80 percent of the values
- // in the domain
- // 4. Generalized, we skip ( (n-1)/n ) ^ k values where n is the
- // number of possible values and k is the number we are selecting
- // 5. This can be rewritten via approximation (if you want to
- // know why approximation is called for here, ask Bill Keese):
- // ((n-1)/n) ^ k
- // = e ^ ln( ((n-1)/n) ^ k )
- // = e ^ (k * ln ((n-1)/n))
- // = e ^ (k * ln (1-1/n))
- // ~= e ^ (k * (-1/n)) because ln(1+x) ~= x for small x
- // = e ^ (-k/n)
- // 6. Flipping it from number skipped to number visited, we get:
- double res =
- (dSize > 0) ? ((1.0 - Math.exp(-1 * numSel / dSize)) * dSize) : 0;
+ // The formula is derived as follows:
+ //
+ // Suppose we have N distinct values, and we select n from them (with
replacement).
+ // For any value i, we use C(i) = k to express the event that the value is
selected exactly
+ // k times in the n selections.
+ //
+ // It can be seen that, for any one selection, the probability of the
value being selected
+ // is 1/N. So the probability of being selected exactly k times is
+ //
+ // Pr{C(i) = k} = C(n, k) * (1 / N)^k * (1 - 1 / N)^(n - k),
+ // where C(n, k) = n! / [k! * (n - k)!]
+ //
+ // The probability that the value is never selected is
+ // Pr{C(i) = 0} = C(n, 0) * (1/N)^0 * (1 - 1 / N)^n = (1 - 1 / N)^n
+ //
+ // We define indicator random variable I(i), so that I(i) = 1 iff
+ // value i is selected in at least one of the selections. We have
+ // E[I(i)] = 1 * Pr{I(i) = 1} + 0 * Pr{I(i) = 0) = Pr{I(i) = 1}
+ // = Pr{C(i) > 0} = 1 - Pr{C(i) = 0} = 1 - (1 - 1 / N)^n
+ //
+ // The expected number of distinct values in the overall n selections is:
+ // E(I(1)] + E(I(2)] + ... + E(I(N)] = N * [1 - (1 - 1 / N)^n]
+
+ double res = 0;
+ if (dSize > 0) {
+ double expo = numSel * Math.log(1.0 - 1.0 / dSize);
+ res = (1.0 - Math.exp(expo)) * dSize;
+ }
// fix the boundary cases
if (res > dSize) {
diff --git
a/core/src/test/java/org/apache/calcite/rel/metadata/RelMdUtilTest.java
b/core/src/test/java/org/apache/calcite/rel/metadata/RelMdUtilTest.java
new file mode 100644
index 0000000..617b9c6
--- /dev/null
+++ b/core/src/test/java/org/apache/calcite/rel/metadata/RelMdUtilTest.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to you under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.calcite.rel.metadata;
+
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+/**
+ * Test cases for {@link RelMdUtil}.
+ */
+public class RelMdUtilTest {
+
+ @Test void testNumDistinctVals() {
+ // the first element must be distinct, the second one has half chance of
being distinct
+ assertEquals(1.5, RelMdUtil.numDistinctVals(2.0, 2.0), 1e-5);
+
+ // when no selection is made, we get no distinct value
+ double domainSize = 100;
+ assertEquals(0, RelMdUtil.numDistinctVals(domainSize, 0.0), 1e-5);
+
+ // when we perform one selection, we always have 1 distinct value,
+ // regardless of the domain size
+ for (double dSize = 1; dSize < 100; dSize += 1) {
+ assertEquals(1.0, RelMdUtil.numDistinctVals(dSize, 1.0), 1e-5);
+ }
+
+ // when we select n objects from a set with n values
+ // we get no more than n distinct values
+ for (double dSize = 1; dSize < 100; dSize += 1) {
+ assertTrue(RelMdUtil.numDistinctVals(dSize, dSize) <= dSize);
+ }
+
+ // when the number of selections is large enough
+ // we get all distinct values, w.h.p.
+ assertEquals(domainSize, RelMdUtil.numDistinctVals(domainSize, domainSize
* 100), 1e-5);
+ }
+
+}