This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git

commit 91887a285e2b924421dc3bd5e09870d5af2e5d3e
Author: Jibing-Li <[email protected]>
AuthorDate: Fri Apr 26 13:06:48 2024 +0800

    Implement HLL with 128 buckets to support statistics cache. (#34124)
---
 .../main/java/org/apache/doris/common/io/Hll.java  |  15 +-
 .../java/org/apache/doris/common/io/HllTest.java   |   4 +-
 .../org/apache/doris/statistics/util/Hll128.java   | 214 +++++++++++++++++++++
 .../apache/doris/statistics/util/Hll128Test.java   | 204 ++++++++++++++++++++
 4 files changed, 432 insertions(+), 5 deletions(-)

diff --git a/fe/fe-common/src/main/java/org/apache/doris/common/io/Hll.java 
b/fe/fe-common/src/main/java/org/apache/doris/common/io/Hll.java
index b00912598d1..f70df617498 100644
--- a/fe/fe-common/src/main/java/org/apache/doris/common/io/Hll.java
+++ b/fe/fe-common/src/main/java/org/apache/doris/common/io/Hll.java
@@ -46,7 +46,7 @@ public class Hll {
 
     public static final int HLL_COLUMN_PRECISION = 14;
     public static final int HLL_ZERO_COUNT_BITS = (64 - HLL_COLUMN_PRECISION);
-    public static final int HLL_EXPLICLIT_INT64_NUM = 160;
+    public static final int HLL_EXPLICIT_INT64_NUM = 160;
     public static final int HLL_SPARSE_THRESHOLD = 4096;
     public static final int HLL_REGISTERS_COUNT = 16 * 1024;
 
@@ -122,7 +122,7 @@ public class Hll {
                 type = HLL_DATA_EXPLICIT;
                 break;
             case HLL_DATA_EXPLICIT:
-                if (hashSet.size() < HLL_EXPLICLIT_INT64_NUM) {
+                if (hashSet.size() < HLL_EXPLICIT_INT64_NUM) {
                     hashSet.add(hashValue);
                     break;
                 }
@@ -157,7 +157,7 @@ public class Hll {
                 switch (other.type) { // CHECKSTYLE IGNORE THIS LINE: missing 
switch default
                     case HLL_DATA_EXPLICIT:
                         this.hashSet.addAll(other.hashSet);
-                        if (this.hashSet.size() > HLL_EXPLICLIT_INT64_NUM) {
+                        if (this.hashSet.size() > HLL_EXPLICIT_INT64_NUM) {
                             convertExplicitToRegister();
                             this.type = HLL_DATA_FULL;
                         }
@@ -393,4 +393,13 @@ public class Hll {
         return type;
     }
 
+    // For convert to statistics used Hll128
+    public byte[] getRegisters() {
+        return registers;
+    }
+
+    // For convert to statistics used Hll128
+    public Set<Long> getHashSet() {
+        return hashSet;
+    }
 }
diff --git a/fe/fe-common/src/test/java/org/apache/doris/common/io/HllTest.java 
b/fe/fe-common/src/test/java/org/apache/doris/common/io/HllTest.java
index fabc7c1f8da..94333f255a6 100644
--- a/fe/fe-common/src/test/java/org/apache/doris/common/io/HllTest.java
+++ b/fe/fe-common/src/test/java/org/apache/doris/common/io/HllTest.java
@@ -55,11 +55,11 @@ public class HllTest {
 
         // test explicit
         Hll explicitHll = new Hll();
-        for (int i = 0; i < Hll.HLL_EXPLICLIT_INT64_NUM; i++) {
+        for (int i = 0; i < Hll.HLL_EXPLICIT_INT64_NUM; i++) {
             explicitHll.updateWithHash(i);
         }
         Assert.assertTrue(explicitHll.getType() == Hll.HLL_DATA_EXPLICIT);
-        Assert.assertTrue(explicitHll.estimateCardinality() == 
Hll.HLL_EXPLICLIT_INT64_NUM);
+        Assert.assertTrue(explicitHll.estimateCardinality() == 
Hll.HLL_EXPLICIT_INT64_NUM);
 
         ByteArrayOutputStream explicitOutputStream = new 
ByteArrayOutputStream();
         DataOutput explicitOutput = new DataOutputStream(explicitOutputStream);
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/Hll128.java 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/Hll128.java
new file mode 100644
index 00000000000..094cff4b7d9
--- /dev/null
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/Hll128.java
@@ -0,0 +1,214 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.statistics.util;
+
+import org.apache.doris.common.io.Hll;
+
+import java.math.BigInteger;
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * This is an HLL implementation with 128 Buckets.
+ * Mainly used for statistics partition ndv cache.
+ * We can convert the org.apache.doris.common.io.Hll object with 16K buckets 
to Hll128 to reduce memory consumption.
+ */
+public class Hll128 {
+
+    public static final byte HLL_DATA_EMPTY = 0;
+    public static final byte HLL_DATA_EXPLICIT = 1;
+    public static final byte HLL_DATA_FULL = 3;
+
+    public static final int HLL_COLUMN_PRECISION = 7;
+    public static final int HLL_ZERO_COUNT_BITS = (64 - HLL_COLUMN_PRECISION);
+    public static final int HLL_EXPLICLIT_INT64_NUM = 160;
+    public static final int HLL_REGISTERS_COUNT = 128;
+
+    private int type;
+    private Set<Long> hashSet;
+    private byte[] registers;
+
+    public Hll128() {
+        type = Hll.HLL_DATA_EMPTY;
+        this.hashSet = new HashSet<>();
+    }
+
+    private void convertExplicitToRegister() {
+        assert this.type == HLL_DATA_EXPLICIT;
+        registers = new byte[HLL_REGISTERS_COUNT];
+        for (Long value : hashSet) {
+            updateRegisters(value);
+        }
+        hashSet.clear();
+    }
+
+    private void updateRegisters(long hashValue) {
+        int idx;
+        // hash value less than zero means we get a unsigned long
+        // so need to transfer to BigInter to mod
+        if (hashValue < 0) {
+            BigInteger unint64HashValue = new 
BigInteger(Long.toUnsignedString(hashValue));
+            unint64HashValue = unint64HashValue.mod(new 
BigInteger(Long.toUnsignedString(HLL_REGISTERS_COUNT)));
+            idx = unint64HashValue.intValue();
+        } else {
+            idx = (int) (hashValue % HLL_REGISTERS_COUNT);
+        }
+
+        hashValue >>>= HLL_COLUMN_PRECISION;
+        hashValue |= (1L << HLL_ZERO_COUNT_BITS);
+        byte firstOneBit = (byte) (Hll.getLongTailZeroNum(hashValue) + 1);
+        registers[idx] = registers[idx] > firstOneBit ? registers[idx] : 
firstOneBit;
+    }
+
+    private void mergeRegisters(byte[] other) {
+        for (int i = 0; i < HLL_REGISTERS_COUNT; i++) {
+            this.registers[i] = this.registers[i] > other[i] ? 
this.registers[i] : other[i];
+        }
+    }
+
+    public void update(long hashValue) {
+        switch (this.type) { // CHECKSTYLE IGNORE THIS LINE: missing switch 
default
+            case HLL_DATA_EMPTY:
+                hashSet.add(hashValue);
+                type = HLL_DATA_EXPLICIT;
+                break;
+            case HLL_DATA_EXPLICIT:
+                if (hashSet.size() < HLL_EXPLICLIT_INT64_NUM) {
+                    hashSet.add(hashValue);
+                    break;
+                }
+                convertExplicitToRegister();
+                type = HLL_DATA_FULL;
+            case HLL_DATA_FULL:  // CHECKSTYLE IGNORE THIS LINE: fall through
+                updateRegisters(hashValue);
+                break;
+        }
+    }
+
+    public void merge(Hll128 other) {
+        if (other.type == HLL_DATA_EMPTY) {
+            return;
+        }
+        switch (this.type) { // CHECKSTYLE IGNORE THIS LINE: missing switch 
default
+            case HLL_DATA_EMPTY:
+                this.type = other.type;
+                switch (other.type) { // CHECKSTYLE IGNORE THIS LINE: missing 
switch default
+                    case HLL_DATA_EXPLICIT:
+                        this.hashSet.addAll(other.hashSet);
+                        break;
+                    case HLL_DATA_FULL:
+                        this.registers = new byte[HLL_REGISTERS_COUNT];
+                        System.arraycopy(other.registers, 0, this.registers, 
0, HLL_REGISTERS_COUNT);
+                        break;
+                }
+                break;
+            case HLL_DATA_EXPLICIT:
+                switch (other.type) { // CHECKSTYLE IGNORE THIS LINE: missing 
switch default
+                    case HLL_DATA_EXPLICIT:
+                        this.hashSet.addAll(other.hashSet);
+                        if (this.hashSet.size() > HLL_EXPLICLIT_INT64_NUM) {
+                            convertExplicitToRegister();
+                            this.type = HLL_DATA_FULL;
+                        }
+                        break;
+                    case HLL_DATA_FULL:
+                        convertExplicitToRegister();
+                        mergeRegisters(other.registers);
+                        this.type = HLL_DATA_FULL;
+                        break;
+                }
+                break;
+            case HLL_DATA_FULL:
+                switch (other.type) { // CHECKSTYLE IGNORE THIS LINE: missing 
switch default
+                    case HLL_DATA_EXPLICIT:
+                        for (long value : other.hashSet) {
+                            update(value);
+                        }
+                        break;
+                    case HLL_DATA_FULL:
+                        mergeRegisters(other.registers);
+                        break;
+                }
+                break;
+        }
+    }
+
+    // use strictfp to force java follow IEEE 754 to deal float point strictly
+    public strictfp long estimateCardinality() {
+        if (type == HLL_DATA_EMPTY) {
+            return 0;
+        }
+        if (type == HLL_DATA_EXPLICIT) {
+            return hashSet.size();
+        }
+
+        int numStreams = HLL_REGISTERS_COUNT;
+        float alpha = 0.7213f / (1 + 1.079f / numStreams);
+        float harmonicMean = 0;
+        int numZeroRegisters = 0;
+
+        for (int i = 0; i < HLL_REGISTERS_COUNT; i++) {
+            harmonicMean += Math.pow(2.0f, -registers[i]);
+
+            if (registers[i] == 0) {
+                numZeroRegisters++;
+            }
+        }
+
+        harmonicMean = 1.0f / harmonicMean;
+        double estimate = alpha * numStreams * numStreams * harmonicMean;
+
+        if (estimate <= numStreams * 2.5 && numZeroRegisters != 0) {
+            estimate = numStreams * Math.log(((float) numStreams) / ((float) 
numZeroRegisters));
+        }
+
+        return (long) (estimate + 0.5);
+    }
+
+    public int getType() {
+        return type;
+    }
+
+    public static Hll128 fromHll(Hll hll) {
+        Hll128 hll128 = new Hll128();
+        if (hll == null || hll.getType() == Hll.HLL_DATA_EMPTY) {
+            return hll128;
+        }
+        if (hll.getType() == Hll.HLL_DATA_EXPLICIT) {
+            hll128.type = HLL_DATA_EXPLICIT;
+            hll128.hashSet.addAll(hll.getHashSet());
+            return hll128;
+        }
+
+        byte[] registers = hll.getRegisters();
+        byte[] registers128 = new byte[HLL_REGISTERS_COUNT];
+        int groupSize = Hll.HLL_REGISTERS_COUNT / HLL_REGISTERS_COUNT;
+        for (int i = 0; i < HLL_REGISTERS_COUNT; i++) {
+            for (int j = 0; j < groupSize; j++) {
+                registers128[i] = registers128[i] < registers[i * groupSize + 
j]
+                    ? registers[i * groupSize + j]
+                    : registers128[i];
+            }
+        }
+        hll128.registers = registers128;
+        hll128.type = HLL_DATA_FULL;
+        return hll128;
+    }
+
+}
+
diff --git 
a/fe/fe-core/src/test/java/org/apache/doris/statistics/util/Hll128Test.java 
b/fe/fe-core/src/test/java/org/apache/doris/statistics/util/Hll128Test.java
new file mode 100644
index 00000000000..d692b90ce31
--- /dev/null
+++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/util/Hll128Test.java
@@ -0,0 +1,204 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.statistics.util;
+
+import org.apache.doris.common.io.Hll;
+
+import org.apache.commons.codec.binary.StringUtils;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.io.IOException;
+
+public class Hll128Test {
+
+    @Test
+    public void basicTest() {
+        // test empty
+        Hll128 emptyHll = new Hll128();
+        Assert.assertEquals(Hll128.HLL_DATA_EMPTY, emptyHll.getType());
+        Assert.assertEquals(0, emptyHll.estimateCardinality());
+
+        // test explicit
+        Hll128 explicitHll = new Hll128();
+        for (int i = 0; i < Hll.HLL_EXPLICIT_INT64_NUM; i++) {
+            explicitHll.update(i);
+        }
+        Assert.assertEquals(Hll128.HLL_DATA_EXPLICIT, explicitHll.getType());
+        Assert.assertEquals(Hll.HLL_EXPLICIT_INT64_NUM, 
explicitHll.estimateCardinality());
+
+        // test full
+        Hll128 fullHll = new Hll128();
+        for (int i = 1; i <= Short.MAX_VALUE; i++) {
+            byte[] v = StringUtils.getBytesUtf8(String.valueOf(i));
+            fullHll.update(Hll.hash64(v, v.length, Hll.SEED));
+        }
+        Assert.assertEquals(Hll.HLL_DATA_FULL, fullHll.getType());
+        Assert.assertEquals(33141, fullHll.estimateCardinality());
+        Assert.assertTrue(fullHll.estimateCardinality() > Short.MAX_VALUE * (1 
- 0.1)
+                && fullHll.estimateCardinality() < Short.MAX_VALUE * (1 + 
0.1));
+
+    }
+
+    @Test
+    public void testFromHll() throws IOException {
+        // test empty
+        Hll emptyHll = new Hll();
+        Hll128 hll128 = Hll128.fromHll(emptyHll);
+        Assert.assertEquals(Hll128.HLL_DATA_EMPTY, hll128.getType());
+        Assert.assertEquals(0, hll128.estimateCardinality());
+
+        // test explicit
+        Hll explicitHll = new Hll();
+        for (int i = 0; i < Hll.HLL_EXPLICIT_INT64_NUM; i++) {
+            explicitHll.updateWithHash(i);
+        }
+        hll128 = Hll128.fromHll(explicitHll);
+        Assert.assertEquals(Hll128.HLL_DATA_EXPLICIT, hll128.getType());
+        Assert.assertEquals(Hll.HLL_EXPLICIT_INT64_NUM, 
hll128.estimateCardinality());
+
+        // test full
+        Hll fullHll = new Hll();
+        for (int i = 0; i < 10000; i++) {
+            fullHll.updateWithHash(i);
+        }
+        hll128 = Hll128.fromHll(fullHll);
+        Assert.assertEquals(Hll128.HLL_DATA_FULL, hll128.getType());
+        Assert.assertTrue(hll128.estimateCardinality() > 9000 && 
hll128.estimateCardinality() < 11000);
+    }
+
+    @Test
+    public void testMerge() throws IOException {
+        // test empty merge empty
+        Hll128 empty1 = new Hll128();
+        Hll128 empty2 = new Hll128();
+        empty1.merge(empty2);
+        Assert.assertEquals(Hll128.HLL_DATA_EMPTY, empty1.getType());
+        Assert.assertEquals(0, empty1.estimateCardinality());
+
+        // test empty merge explicit
+        Hll128 empty = new Hll128();
+        Hll128 explicit = new Hll128();
+        for (int i = 1; i < Hll.HLL_EXPLICIT_INT64_NUM; i++) {
+            byte[] v = StringUtils.getBytesUtf8(String.valueOf(i));
+            explicit.update(Hll.hash64(v, v.length, Hll.SEED));
+        }
+        empty.merge(explicit);
+        Assert.assertEquals(Hll128.HLL_DATA_EXPLICIT, empty.getType());
+        Assert.assertEquals(Hll.HLL_EXPLICIT_INT64_NUM - 1, 
empty.estimateCardinality());
+
+        // test empty merge full
+        empty = new Hll128();
+        Hll128 full = new Hll128();
+        for (int i = 1; i < 10000; i++) {
+            byte[] v = StringUtils.getBytesUtf8(String.valueOf(i));
+            full.update(Hll.hash64(v, v.length, Hll.SEED));
+        }
+        empty.merge(full);
+        Assert.assertEquals(Hll128.HLL_DATA_FULL, empty.getType());
+        Assert.assertTrue(empty.estimateCardinality() > 9000 && 
empty.estimateCardinality() < 11000);
+
+        // test explicit merge empty
+        empty = new Hll128();
+        explicit = new Hll128();
+        for (int i = 1; i < Hll.HLL_EXPLICIT_INT64_NUM; i++) {
+            byte[] v = StringUtils.getBytesUtf8(String.valueOf(i));
+            explicit.update(Hll.hash64(v, v.length, Hll.SEED));
+        }
+        explicit.merge(empty);
+        Assert.assertEquals(Hll128.HLL_DATA_EXPLICIT, explicit.getType());
+        Assert.assertEquals(Hll.HLL_EXPLICIT_INT64_NUM - 1, 
explicit.estimateCardinality());
+
+        // test explicit merge explicit
+        Hll128 explicit1 = new Hll128();
+        Hll128 explicit2 = new Hll128();
+        for (int i = 0; i < 10; i++) {
+            byte[] v = StringUtils.getBytesUtf8(String.valueOf(i));
+            explicit1.update(Hll.hash64(v, v.length, Hll.SEED));
+        }
+        for (int i = 0; i < 30; i++) {
+            byte[] v = StringUtils.getBytesUtf8(String.valueOf(i));
+            explicit2.update(Hll.hash64(v, v.length, Hll.SEED));
+        }
+        explicit1.merge(explicit2);
+        Assert.assertEquals(Hll128.HLL_DATA_EXPLICIT, explicit1.getType());
+        Assert.assertEquals(Hll128.HLL_DATA_EXPLICIT, explicit2.getType());
+        Assert.assertEquals(30, explicit1.estimateCardinality());
+
+        explicit2 = new Hll128();
+        for (int i = 10001; i < 10000 + Hll.HLL_EXPLICIT_INT64_NUM; i++) {
+            byte[] v = StringUtils.getBytesUtf8(String.valueOf(i));
+            explicit2.update(Hll.hash64(v, v.length, Hll.SEED));
+        }
+        Assert.assertEquals(Hll128.HLL_DATA_EXPLICIT, explicit1.getType());
+        Assert.assertEquals(Hll128.HLL_DATA_EXPLICIT, explicit2.getType());
+        explicit1.merge(explicit2);
+        Assert.assertEquals(Hll128.HLL_DATA_FULL, explicit1.getType());
+        Assert.assertTrue(explicit1.estimateCardinality() > 170 && 
explicit1.estimateCardinality() < 210);
+
+        // Test explicit merge full
+        explicit = new Hll128();
+        for (int i = 0; i < 10; i++) {
+            byte[] v = StringUtils.getBytesUtf8(String.valueOf(i));
+            explicit.update(Hll.hash64(v, v.length, Hll.SEED));
+        }
+        full = new Hll128();
+        for (int i = 1; i < 10000; i++) {
+            byte[] v = StringUtils.getBytesUtf8(String.valueOf(i));
+            full.update(Hll.hash64(v, v.length, Hll.SEED));
+        }
+        Assert.assertEquals(Hll128.HLL_DATA_FULL, full.getType());
+        explicit.merge(full);
+        Assert.assertEquals(Hll128.HLL_DATA_FULL, explicit.getType());
+        Assert.assertTrue(explicit.estimateCardinality() > 9000 && 
explicit.estimateCardinality() < 11000);
+
+        // Test full merge explicit
+        explicit = new Hll128();
+        for (int i = 0; i < 10; i++) {
+            byte[] v = StringUtils.getBytesUtf8(String.valueOf(i));
+            explicit.update(Hll.hash64(v, v.length, Hll.SEED));
+        }
+        full = new Hll128();
+        for (int i = 1; i < 10000; i++) {
+            byte[] v = StringUtils.getBytesUtf8(String.valueOf(i));
+            full.update(Hll.hash64(v, v.length, Hll.SEED));
+        }
+        Assert.assertEquals(Hll128.HLL_DATA_EXPLICIT, explicit.getType());
+        full.merge(explicit);
+        Assert.assertEquals(Hll128.HLL_DATA_FULL, full.getType());
+        Assert.assertTrue(full.estimateCardinality() > 9000 && 
full.estimateCardinality() < 11000);
+
+        // Test full merge full
+        Hll128 full1 = new Hll128();
+        Hll128 full2 = new Hll128();
+        for (int i = 1; i < 10000; i++) {
+            byte[] v = StringUtils.getBytesUtf8(String.valueOf(i));
+            full1.update(Hll.hash64(v, v.length, Hll.SEED));
+        }
+        for (int i = 5000; i < 15000; i++) {
+            byte[] v = StringUtils.getBytesUtf8(String.valueOf(i));
+            full2.update(Hll.hash64(v, v.length, Hll.SEED));
+        }
+        Assert.assertEquals(Hll128.HLL_DATA_FULL, full1.getType());
+        Assert.assertEquals(Hll128.HLL_DATA_FULL, full2.getType());
+        full1.merge(full2);
+        Assert.assertEquals(Hll128.HLL_DATA_FULL, full1.getType());
+        Assert.assertTrue(full1.estimateCardinality() > 13500 && 
full1.estimateCardinality() < 16500);
+    }
+
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to