This is an automated email from the ASF dual-hosted git repository. yiguolei pushed a commit to branch branch-2.1 in repository https://gitbox.apache.org/repos/asf/doris.git
commit 91887a285e2b924421dc3bd5e09870d5af2e5d3e Author: Jibing-Li <[email protected]> AuthorDate: Fri Apr 26 13:06:48 2024 +0800 Implement HLL with 128 buckets to support statistics cache. (#34124) --- .../main/java/org/apache/doris/common/io/Hll.java | 15 +- .../java/org/apache/doris/common/io/HllTest.java | 4 +- .../org/apache/doris/statistics/util/Hll128.java | 214 +++++++++++++++++++++ .../apache/doris/statistics/util/Hll128Test.java | 204 ++++++++++++++++++++ 4 files changed, 432 insertions(+), 5 deletions(-) diff --git a/fe/fe-common/src/main/java/org/apache/doris/common/io/Hll.java b/fe/fe-common/src/main/java/org/apache/doris/common/io/Hll.java index b00912598d1..f70df617498 100644 --- a/fe/fe-common/src/main/java/org/apache/doris/common/io/Hll.java +++ b/fe/fe-common/src/main/java/org/apache/doris/common/io/Hll.java @@ -46,7 +46,7 @@ public class Hll { public static final int HLL_COLUMN_PRECISION = 14; public static final int HLL_ZERO_COUNT_BITS = (64 - HLL_COLUMN_PRECISION); - public static final int HLL_EXPLICLIT_INT64_NUM = 160; + public static final int HLL_EXPLICIT_INT64_NUM = 160; public static final int HLL_SPARSE_THRESHOLD = 4096; public static final int HLL_REGISTERS_COUNT = 16 * 1024; @@ -122,7 +122,7 @@ public class Hll { type = HLL_DATA_EXPLICIT; break; case HLL_DATA_EXPLICIT: - if (hashSet.size() < HLL_EXPLICLIT_INT64_NUM) { + if (hashSet.size() < HLL_EXPLICIT_INT64_NUM) { hashSet.add(hashValue); break; } @@ -157,7 +157,7 @@ public class Hll { switch (other.type) { // CHECKSTYLE IGNORE THIS LINE: missing switch default case HLL_DATA_EXPLICIT: this.hashSet.addAll(other.hashSet); - if (this.hashSet.size() > HLL_EXPLICLIT_INT64_NUM) { + if (this.hashSet.size() > HLL_EXPLICIT_INT64_NUM) { convertExplicitToRegister(); this.type = HLL_DATA_FULL; } @@ -393,4 +393,13 @@ public class Hll { return type; } + // For convert to statistics used Hll128 + public byte[] getRegisters() { + return registers; + } + + // For convert to statistics used Hll128 + public Set<Long> getHashSet() { + return hashSet; + } } diff --git a/fe/fe-common/src/test/java/org/apache/doris/common/io/HllTest.java b/fe/fe-common/src/test/java/org/apache/doris/common/io/HllTest.java index fabc7c1f8da..94333f255a6 100644 --- a/fe/fe-common/src/test/java/org/apache/doris/common/io/HllTest.java +++ b/fe/fe-common/src/test/java/org/apache/doris/common/io/HllTest.java @@ -55,11 +55,11 @@ public class HllTest { // test explicit Hll explicitHll = new Hll(); - for (int i = 0; i < Hll.HLL_EXPLICLIT_INT64_NUM; i++) { + for (int i = 0; i < Hll.HLL_EXPLICIT_INT64_NUM; i++) { explicitHll.updateWithHash(i); } Assert.assertTrue(explicitHll.getType() == Hll.HLL_DATA_EXPLICIT); - Assert.assertTrue(explicitHll.estimateCardinality() == Hll.HLL_EXPLICLIT_INT64_NUM); + Assert.assertTrue(explicitHll.estimateCardinality() == Hll.HLL_EXPLICIT_INT64_NUM); ByteArrayOutputStream explicitOutputStream = new ByteArrayOutputStream(); DataOutput explicitOutput = new DataOutputStream(explicitOutputStream); diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/Hll128.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/Hll128.java new file mode 100644 index 00000000000..094cff4b7d9 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/Hll128.java @@ -0,0 +1,214 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.statistics.util; + +import org.apache.doris.common.io.Hll; + +import java.math.BigInteger; +import java.util.HashSet; +import java.util.Set; + +/** + * This is an HLL implementation with 128 Buckets. + * Mainly used for statistics partition ndv cache. + * We can convert the org.apache.doris.common.io.Hll object with 16K buckets to Hll128 to reduce memory consumption. + */ +public class Hll128 { + + public static final byte HLL_DATA_EMPTY = 0; + public static final byte HLL_DATA_EXPLICIT = 1; + public static final byte HLL_DATA_FULL = 3; + + public static final int HLL_COLUMN_PRECISION = 7; + public static final int HLL_ZERO_COUNT_BITS = (64 - HLL_COLUMN_PRECISION); + public static final int HLL_EXPLICLIT_INT64_NUM = 160; + public static final int HLL_REGISTERS_COUNT = 128; + + private int type; + private Set<Long> hashSet; + private byte[] registers; + + public Hll128() { + type = Hll.HLL_DATA_EMPTY; + this.hashSet = new HashSet<>(); + } + + private void convertExplicitToRegister() { + assert this.type == HLL_DATA_EXPLICIT; + registers = new byte[HLL_REGISTERS_COUNT]; + for (Long value : hashSet) { + updateRegisters(value); + } + hashSet.clear(); + } + + private void updateRegisters(long hashValue) { + int idx; + // hash value less than zero means we get a unsigned long + // so need to transfer to BigInter to mod + if (hashValue < 0) { + BigInteger unint64HashValue = new BigInteger(Long.toUnsignedString(hashValue)); + unint64HashValue = unint64HashValue.mod(new BigInteger(Long.toUnsignedString(HLL_REGISTERS_COUNT))); + idx = unint64HashValue.intValue(); + } else { + idx = (int) (hashValue % HLL_REGISTERS_COUNT); + } + + hashValue >>>= HLL_COLUMN_PRECISION; + hashValue |= (1L << HLL_ZERO_COUNT_BITS); + byte firstOneBit = (byte) (Hll.getLongTailZeroNum(hashValue) + 1); + registers[idx] = registers[idx] > firstOneBit ? registers[idx] : firstOneBit; + } + + private void mergeRegisters(byte[] other) { + for (int i = 0; i < HLL_REGISTERS_COUNT; i++) { + this.registers[i] = this.registers[i] > other[i] ? this.registers[i] : other[i]; + } + } + + public void update(long hashValue) { + switch (this.type) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case HLL_DATA_EMPTY: + hashSet.add(hashValue); + type = HLL_DATA_EXPLICIT; + break; + case HLL_DATA_EXPLICIT: + if (hashSet.size() < HLL_EXPLICLIT_INT64_NUM) { + hashSet.add(hashValue); + break; + } + convertExplicitToRegister(); + type = HLL_DATA_FULL; + case HLL_DATA_FULL: // CHECKSTYLE IGNORE THIS LINE: fall through + updateRegisters(hashValue); + break; + } + } + + public void merge(Hll128 other) { + if (other.type == HLL_DATA_EMPTY) { + return; + } + switch (this.type) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case HLL_DATA_EMPTY: + this.type = other.type; + switch (other.type) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case HLL_DATA_EXPLICIT: + this.hashSet.addAll(other.hashSet); + break; + case HLL_DATA_FULL: + this.registers = new byte[HLL_REGISTERS_COUNT]; + System.arraycopy(other.registers, 0, this.registers, 0, HLL_REGISTERS_COUNT); + break; + } + break; + case HLL_DATA_EXPLICIT: + switch (other.type) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case HLL_DATA_EXPLICIT: + this.hashSet.addAll(other.hashSet); + if (this.hashSet.size() > HLL_EXPLICLIT_INT64_NUM) { + convertExplicitToRegister(); + this.type = HLL_DATA_FULL; + } + break; + case HLL_DATA_FULL: + convertExplicitToRegister(); + mergeRegisters(other.registers); + this.type = HLL_DATA_FULL; + break; + } + break; + case HLL_DATA_FULL: + switch (other.type) { // CHECKSTYLE IGNORE THIS LINE: missing switch default + case HLL_DATA_EXPLICIT: + for (long value : other.hashSet) { + update(value); + } + break; + case HLL_DATA_FULL: + mergeRegisters(other.registers); + break; + } + break; + } + } + + // use strictfp to force java follow IEEE 754 to deal float point strictly + public strictfp long estimateCardinality() { + if (type == HLL_DATA_EMPTY) { + return 0; + } + if (type == HLL_DATA_EXPLICIT) { + return hashSet.size(); + } + + int numStreams = HLL_REGISTERS_COUNT; + float alpha = 0.7213f / (1 + 1.079f / numStreams); + float harmonicMean = 0; + int numZeroRegisters = 0; + + for (int i = 0; i < HLL_REGISTERS_COUNT; i++) { + harmonicMean += Math.pow(2.0f, -registers[i]); + + if (registers[i] == 0) { + numZeroRegisters++; + } + } + + harmonicMean = 1.0f / harmonicMean; + double estimate = alpha * numStreams * numStreams * harmonicMean; + + if (estimate <= numStreams * 2.5 && numZeroRegisters != 0) { + estimate = numStreams * Math.log(((float) numStreams) / ((float) numZeroRegisters)); + } + + return (long) (estimate + 0.5); + } + + public int getType() { + return type; + } + + public static Hll128 fromHll(Hll hll) { + Hll128 hll128 = new Hll128(); + if (hll == null || hll.getType() == Hll.HLL_DATA_EMPTY) { + return hll128; + } + if (hll.getType() == Hll.HLL_DATA_EXPLICIT) { + hll128.type = HLL_DATA_EXPLICIT; + hll128.hashSet.addAll(hll.getHashSet()); + return hll128; + } + + byte[] registers = hll.getRegisters(); + byte[] registers128 = new byte[HLL_REGISTERS_COUNT]; + int groupSize = Hll.HLL_REGISTERS_COUNT / HLL_REGISTERS_COUNT; + for (int i = 0; i < HLL_REGISTERS_COUNT; i++) { + for (int j = 0; j < groupSize; j++) { + registers128[i] = registers128[i] < registers[i * groupSize + j] + ? registers[i * groupSize + j] + : registers128[i]; + } + } + hll128.registers = registers128; + hll128.type = HLL_DATA_FULL; + return hll128; + } + +} + diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/util/Hll128Test.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/util/Hll128Test.java new file mode 100644 index 00000000000..d692b90ce31 --- /dev/null +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/util/Hll128Test.java @@ -0,0 +1,204 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.statistics.util; + +import org.apache.doris.common.io.Hll; + +import org.apache.commons.codec.binary.StringUtils; +import org.junit.Assert; +import org.junit.Test; + +import java.io.IOException; + +public class Hll128Test { + + @Test + public void basicTest() { + // test empty + Hll128 emptyHll = new Hll128(); + Assert.assertEquals(Hll128.HLL_DATA_EMPTY, emptyHll.getType()); + Assert.assertEquals(0, emptyHll.estimateCardinality()); + + // test explicit + Hll128 explicitHll = new Hll128(); + for (int i = 0; i < Hll.HLL_EXPLICIT_INT64_NUM; i++) { + explicitHll.update(i); + } + Assert.assertEquals(Hll128.HLL_DATA_EXPLICIT, explicitHll.getType()); + Assert.assertEquals(Hll.HLL_EXPLICIT_INT64_NUM, explicitHll.estimateCardinality()); + + // test full + Hll128 fullHll = new Hll128(); + for (int i = 1; i <= Short.MAX_VALUE; i++) { + byte[] v = StringUtils.getBytesUtf8(String.valueOf(i)); + fullHll.update(Hll.hash64(v, v.length, Hll.SEED)); + } + Assert.assertEquals(Hll.HLL_DATA_FULL, fullHll.getType()); + Assert.assertEquals(33141, fullHll.estimateCardinality()); + Assert.assertTrue(fullHll.estimateCardinality() > Short.MAX_VALUE * (1 - 0.1) + && fullHll.estimateCardinality() < Short.MAX_VALUE * (1 + 0.1)); + + } + + @Test + public void testFromHll() throws IOException { + // test empty + Hll emptyHll = new Hll(); + Hll128 hll128 = Hll128.fromHll(emptyHll); + Assert.assertEquals(Hll128.HLL_DATA_EMPTY, hll128.getType()); + Assert.assertEquals(0, hll128.estimateCardinality()); + + // test explicit + Hll explicitHll = new Hll(); + for (int i = 0; i < Hll.HLL_EXPLICIT_INT64_NUM; i++) { + explicitHll.updateWithHash(i); + } + hll128 = Hll128.fromHll(explicitHll); + Assert.assertEquals(Hll128.HLL_DATA_EXPLICIT, hll128.getType()); + Assert.assertEquals(Hll.HLL_EXPLICIT_INT64_NUM, hll128.estimateCardinality()); + + // test full + Hll fullHll = new Hll(); + for (int i = 0; i < 10000; i++) { + fullHll.updateWithHash(i); + } + hll128 = Hll128.fromHll(fullHll); + Assert.assertEquals(Hll128.HLL_DATA_FULL, hll128.getType()); + Assert.assertTrue(hll128.estimateCardinality() > 9000 && hll128.estimateCardinality() < 11000); + } + + @Test + public void testMerge() throws IOException { + // test empty merge empty + Hll128 empty1 = new Hll128(); + Hll128 empty2 = new Hll128(); + empty1.merge(empty2); + Assert.assertEquals(Hll128.HLL_DATA_EMPTY, empty1.getType()); + Assert.assertEquals(0, empty1.estimateCardinality()); + + // test empty merge explicit + Hll128 empty = new Hll128(); + Hll128 explicit = new Hll128(); + for (int i = 1; i < Hll.HLL_EXPLICIT_INT64_NUM; i++) { + byte[] v = StringUtils.getBytesUtf8(String.valueOf(i)); + explicit.update(Hll.hash64(v, v.length, Hll.SEED)); + } + empty.merge(explicit); + Assert.assertEquals(Hll128.HLL_DATA_EXPLICIT, empty.getType()); + Assert.assertEquals(Hll.HLL_EXPLICIT_INT64_NUM - 1, empty.estimateCardinality()); + + // test empty merge full + empty = new Hll128(); + Hll128 full = new Hll128(); + for (int i = 1; i < 10000; i++) { + byte[] v = StringUtils.getBytesUtf8(String.valueOf(i)); + full.update(Hll.hash64(v, v.length, Hll.SEED)); + } + empty.merge(full); + Assert.assertEquals(Hll128.HLL_DATA_FULL, empty.getType()); + Assert.assertTrue(empty.estimateCardinality() > 9000 && empty.estimateCardinality() < 11000); + + // test explicit merge empty + empty = new Hll128(); + explicit = new Hll128(); + for (int i = 1; i < Hll.HLL_EXPLICIT_INT64_NUM; i++) { + byte[] v = StringUtils.getBytesUtf8(String.valueOf(i)); + explicit.update(Hll.hash64(v, v.length, Hll.SEED)); + } + explicit.merge(empty); + Assert.assertEquals(Hll128.HLL_DATA_EXPLICIT, explicit.getType()); + Assert.assertEquals(Hll.HLL_EXPLICIT_INT64_NUM - 1, explicit.estimateCardinality()); + + // test explicit merge explicit + Hll128 explicit1 = new Hll128(); + Hll128 explicit2 = new Hll128(); + for (int i = 0; i < 10; i++) { + byte[] v = StringUtils.getBytesUtf8(String.valueOf(i)); + explicit1.update(Hll.hash64(v, v.length, Hll.SEED)); + } + for (int i = 0; i < 30; i++) { + byte[] v = StringUtils.getBytesUtf8(String.valueOf(i)); + explicit2.update(Hll.hash64(v, v.length, Hll.SEED)); + } + explicit1.merge(explicit2); + Assert.assertEquals(Hll128.HLL_DATA_EXPLICIT, explicit1.getType()); + Assert.assertEquals(Hll128.HLL_DATA_EXPLICIT, explicit2.getType()); + Assert.assertEquals(30, explicit1.estimateCardinality()); + + explicit2 = new Hll128(); + for (int i = 10001; i < 10000 + Hll.HLL_EXPLICIT_INT64_NUM; i++) { + byte[] v = StringUtils.getBytesUtf8(String.valueOf(i)); + explicit2.update(Hll.hash64(v, v.length, Hll.SEED)); + } + Assert.assertEquals(Hll128.HLL_DATA_EXPLICIT, explicit1.getType()); + Assert.assertEquals(Hll128.HLL_DATA_EXPLICIT, explicit2.getType()); + explicit1.merge(explicit2); + Assert.assertEquals(Hll128.HLL_DATA_FULL, explicit1.getType()); + Assert.assertTrue(explicit1.estimateCardinality() > 170 && explicit1.estimateCardinality() < 210); + + // Test explicit merge full + explicit = new Hll128(); + for (int i = 0; i < 10; i++) { + byte[] v = StringUtils.getBytesUtf8(String.valueOf(i)); + explicit.update(Hll.hash64(v, v.length, Hll.SEED)); + } + full = new Hll128(); + for (int i = 1; i < 10000; i++) { + byte[] v = StringUtils.getBytesUtf8(String.valueOf(i)); + full.update(Hll.hash64(v, v.length, Hll.SEED)); + } + Assert.assertEquals(Hll128.HLL_DATA_FULL, full.getType()); + explicit.merge(full); + Assert.assertEquals(Hll128.HLL_DATA_FULL, explicit.getType()); + Assert.assertTrue(explicit.estimateCardinality() > 9000 && explicit.estimateCardinality() < 11000); + + // Test full merge explicit + explicit = new Hll128(); + for (int i = 0; i < 10; i++) { + byte[] v = StringUtils.getBytesUtf8(String.valueOf(i)); + explicit.update(Hll.hash64(v, v.length, Hll.SEED)); + } + full = new Hll128(); + for (int i = 1; i < 10000; i++) { + byte[] v = StringUtils.getBytesUtf8(String.valueOf(i)); + full.update(Hll.hash64(v, v.length, Hll.SEED)); + } + Assert.assertEquals(Hll128.HLL_DATA_EXPLICIT, explicit.getType()); + full.merge(explicit); + Assert.assertEquals(Hll128.HLL_DATA_FULL, full.getType()); + Assert.assertTrue(full.estimateCardinality() > 9000 && full.estimateCardinality() < 11000); + + // Test full merge full + Hll128 full1 = new Hll128(); + Hll128 full2 = new Hll128(); + for (int i = 1; i < 10000; i++) { + byte[] v = StringUtils.getBytesUtf8(String.valueOf(i)); + full1.update(Hll.hash64(v, v.length, Hll.SEED)); + } + for (int i = 5000; i < 15000; i++) { + byte[] v = StringUtils.getBytesUtf8(String.valueOf(i)); + full2.update(Hll.hash64(v, v.length, Hll.SEED)); + } + Assert.assertEquals(Hll128.HLL_DATA_FULL, full1.getType()); + Assert.assertEquals(Hll128.HLL_DATA_FULL, full2.getType()); + full1.merge(full2); + Assert.assertEquals(Hll128.HLL_DATA_FULL, full1.getType()); + Assert.assertTrue(full1.estimateCardinality() > 13500 && full1.estimateCardinality() < 16500); + } + +} --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
