GEODE-970: Remove duplicate HLL classes HLL classes were duplicated in com.gemstone.gemfire.internal.redis.hll and com.gemstone.gemfire.cache.hdfs.internal.cardinality packages. They now live in com.gemstone.gemfire.internal.hll package.
Project: http://git-wip-us.apache.org/repos/asf/incubator-geode/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-geode/commit/e685fd85 Tree: http://git-wip-us.apache.org/repos/asf/incubator-geode/tree/e685fd85 Diff: http://git-wip-us.apache.org/repos/asf/incubator-geode/diff/e685fd85 Branch: refs/heads/develop Commit: e685fd85ac7e2607f70b47bfb448b1d91a56b103 Parents: 92be457 Author: Swapnil Bawaskar <sbawas...@pivotal.io> Authored: Wed Feb 17 13:28:13 2016 -0800 Committer: Swapnil Bawaskar <sbawas...@pivotal.io> Committed: Thu Feb 18 06:51:36 2016 -0800 ---------------------------------------------------------------------- .../cache/hdfs/internal/cardinality/Bits.java | 54 - .../cardinality/CardinalityMergeException.java | 42 - .../hdfs/internal/cardinality/HyperLogLog.java | 313 ----- .../hdfs/internal/cardinality/IBuilder.java | 41 - .../hdfs/internal/cardinality/ICardinality.java | 87 -- .../hdfs/internal/cardinality/MurmurHash.java | 261 ----- .../hdfs/internal/cardinality/RegisterSet.java | 136 --- .../hdfs/internal/hoplog/AbstractHoplog.java | 2 +- .../hdfs/internal/hoplog/HFileSortedOplog.java | 4 +- .../hoplog/HdfsSortedOplogOrganizer.java | 8 +- .../cache/hdfs/internal/hoplog/Hoplog.java | 3 +- .../internal/hoplog/SequenceFileHoplog.java | 2 +- .../com/gemstone/gemfire/internal/hll/Bits.java | 65 ++ .../internal/hll/CardinalityMergeException.java | 42 + .../gemfire/internal/hll/HyperLogLog.java | 362 ++++++ .../gemfire/internal/hll/HyperLogLogPlus.java | 1070 ++++++++++++++++++ .../gemstone/gemfire/internal/hll/IBuilder.java | 41 + .../gemfire/internal/hll/ICardinality.java | 89 ++ .../gemfire/internal/hll/MurmurHash.java | 261 +++++ .../gemfire/internal/hll/RegisterSet.java | 126 +++ .../gemfire/internal/redis/RegionProvider.java | 2 +- .../internal/redis/executor/hll/Bits.java | 65 -- .../executor/hll/CardinalityMergeException.java | 42 - .../redis/executor/hll/HyperLogLog.java | 360 ------ .../redis/executor/hll/HyperLogLogPlus.java | 1068 ----------------- .../internal/redis/executor/hll/IBuilder.java | 41 - .../redis/executor/hll/ICardinality.java | 89 -- .../internal/redis/executor/hll/MurmurHash.java | 234 ---- .../redis/executor/hll/PFAddExecutor.java | 1 + .../redis/executor/hll/PFCountExecutor.java | 2 + .../redis/executor/hll/PFMergeExecutor.java | 2 + .../redis/executor/hll/RegisterSet.java | 126 --- .../gemfire/redis/GemFireRedisServer.java | 2 +- 33 files changed, 2073 insertions(+), 2970 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/e685fd85/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/cardinality/Bits.java ---------------------------------------------------------------------- diff --git a/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/cardinality/Bits.java b/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/cardinality/Bits.java deleted file mode 100644 index 4fa3443..0000000 --- a/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/cardinality/Bits.java +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/* - * Copyright (C) 2011 Clearspring Technologies, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.gemstone.gemfire.cache.hdfs.internal.cardinality; - -import java.io.ByteArrayInputStream; -import java.io.DataInputStream; -import java.io.IOException; - -public class Bits -{ - - public static int[] getBits(byte[] mBytes) throws IOException - { - int bitSize = mBytes.length / 4; - int[] bits = new int[bitSize]; - DataInputStream dis = new DataInputStream(new ByteArrayInputStream(mBytes)); - for (int i = 0; i < bitSize; i++) - { - bits[i] = dis.readInt(); - } - return bits; - } - -} http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/e685fd85/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/cardinality/CardinalityMergeException.java ---------------------------------------------------------------------- diff --git a/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/cardinality/CardinalityMergeException.java b/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/cardinality/CardinalityMergeException.java deleted file mode 100644 index a63f8e6..0000000 --- a/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/cardinality/CardinalityMergeException.java +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/* - * Copyright (C) 2011 Clearspring Technologies, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.gemstone.gemfire.cache.hdfs.internal.cardinality; - -@SuppressWarnings("serial") -public abstract class CardinalityMergeException extends Exception -{ - public CardinalityMergeException(String message) - { - super(message); - } -} http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/e685fd85/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/cardinality/HyperLogLog.java ---------------------------------------------------------------------- diff --git a/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/cardinality/HyperLogLog.java b/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/cardinality/HyperLogLog.java deleted file mode 100644 index 6d945bc..0000000 --- a/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/cardinality/HyperLogLog.java +++ /dev/null @@ -1,313 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/* - * Copyright (C) 2012 Clearspring Technologies, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.gemstone.gemfire.cache.hdfs.internal.cardinality; - -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.DataInputStream; -import java.io.DataOutputStream; -import java.io.IOException; -import java.io.Serializable; - -/** - * Java implementation of HyperLogLog (HLL) algorithm from this paper: - * <p/> - * http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf - * <p/> - * HLL is an improved version of LogLog that is capable of estimating - * the cardinality of a set with accuracy = 1.04/sqrt(m) where - * m = 2^b. So we can control accuracy vs space usage by increasing - * or decreasing b. - * <p/> - * The main benefit of using HLL over LL is that it only requires 64% - * of the space that LL does to get the same accuracy. - * <p/> - * This implementation implements a single counter. If a large (millions) - * number of counters are required you may want to refer to: - * <p/> - * http://dsiutils.dsi.unimi.it/ - * <p/> - * It has a more complex implementation of HLL that supports multiple counters - * in a single object, drastically reducing the java overhead from creating - * a large number of objects. - * <p/> - * This implementation leveraged a javascript implementation that Yammer has - * been working on: - * <p/> - * https://github.com/yammer/probablyjs - * <p> - * Note that this implementation does not include the long range correction function - * defined in the original paper. Empirical evidence shows that the correction - * function causes more harm than good. - * </p> - * - * <p> - * Users have different motivations to use different types of hashing functions. - * Rather than try to keep up with all available hash functions and to remove - * the concern of causing future binary incompatibilities this class allows clients - * to offer the value in hashed int or long form. This way clients are free - * to change their hash function on their own time line. We recommend using Google's - * Guava Murmur3_128 implementation as it provides good performance and speed when - * high precision is required. In our tests the 32bit MurmurHash function included - * in this project is faster and produces better results than the 32 bit murmur3 - * implementation google provides. - * </p> - */ -public class HyperLogLog implements ICardinality -{ - private final RegisterSet registerSet; - private final int log2m; - private final double alphaMM; - - - /** - * Create a new HyperLogLog instance using the specified standard deviation. - * - * @param rsd - the relative standard deviation for the counter. - * smaller values create counters that require more space. - */ - public HyperLogLog(double rsd) - { - this(log2m(rsd)); - } - - private static int log2m(double rsd) - { - return (int) (Math.log((1.106 / rsd) * (1.106 / rsd)) / Math.log(2)); - } - - /** - * Create a new HyperLogLog instance. The log2m parameter defines the accuracy of - * the counter. The larger the log2m the better the accuracy. - * <p/> - * accuracy = 1.04/sqrt(2^log2m) - * - * @param log2m - the number of bits to use as the basis for the HLL instance - */ - public HyperLogLog(int log2m) - { - this(log2m, new RegisterSet((int) Math.pow(2, log2m))); - } - - /** - * Creates a new HyperLogLog instance using the given registers. Used for unmarshalling a serialized - * instance and for merging multiple counters together. - * - * @param registerSet - the initial values for the register set - */ - public HyperLogLog(int log2m, RegisterSet registerSet) - { - this.registerSet = registerSet; - this.log2m = log2m; - int m = (int) Math.pow(2, this.log2m); - - // See the paper. - switch (log2m) - { - case 4: - alphaMM = 0.673 * m * m; - break; - case 5: - alphaMM = 0.697 * m * m; - break; - case 6: - alphaMM = 0.709 * m * m; - break; - default: - alphaMM = (0.7213 / (1 + 1.079 / m)) * m * m; - } - } - - - @Override - public boolean offerHashed(long hashedValue) - { - // j becomes the binary address determined by the first b log2m of x - // j will be between 0 and 2^log2m - final int j = (int) (hashedValue >>> (Long.SIZE - log2m)); - final int r = Long.numberOfLeadingZeros((hashedValue << this.log2m) | (1 << (this.log2m - 1)) + 1) + 1; - return registerSet.updateIfGreater(j, r); - } - - @Override - public boolean offerHashed(int hashedValue) - { - // j becomes the binary address determined by the first b log2m of x - // j will be between 0 and 2^log2m - final int j = hashedValue >>> (Integer.SIZE - log2m); - final int r = Integer.numberOfLeadingZeros((hashedValue << this.log2m) | (1 << (this.log2m - 1)) + 1) + 1; - return registerSet.updateIfGreater(j, r); - } - - @Override - public boolean offer(Object o) - { - final int x = MurmurHash.hash(o); - return offerHashed(x); - } - - - @Override - public long cardinality() - { - double registerSum = 0; - int count = registerSet.count; - double zeros = 0.0; - for (int j = 0; j < registerSet.count; j++) - { - int val = registerSet.get(j); - registerSum += 1.0 / (1<<val); - if (val == 0) { - zeros++; - } - } - - double estimate = alphaMM * (1 / registerSum); - - if (estimate <= (5.0 / 2.0) * count) - { - // Small Range Estimate - return Math.round(count * Math.log(count / zeros)); - } - else - { - return Math.round(estimate); - } - } - - @Override - public int sizeof() - { - return registerSet.size * 4; - } - - @Override - public byte[] getBytes() throws IOException - { - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - DataOutputStream dos = new DataOutputStream(baos); - - dos.writeInt(log2m); - dos.writeInt(registerSet.size * 4); - for (int x : registerSet.bits()) - { - dos.writeInt(x); - } - - return baos.toByteArray(); - } - - /** Add all the elements of the other set to this set. - * - * This operation does not imply a loss of precision. - * - * @param other A compatible Hyperloglog instance (same log2m) - * @throws CardinalityMergeException if other is not compatible - */ - public void addAll(HyperLogLog other) throws CardinalityMergeException { - if (this.sizeof() != other.sizeof()) - { - throw new HyperLogLogMergeException("Cannot merge estimators of different sizes"); - } - - registerSet.merge(other.registerSet); - } - - @Override - public ICardinality merge(ICardinality... estimators) throws CardinalityMergeException - { - HyperLogLog merged = new HyperLogLog(log2m); - merged.addAll(this); - - if (estimators == null) - { - return merged; - } - - for (ICardinality estimator : estimators) - { - if (!(estimator instanceof HyperLogLog)) - { - throw new HyperLogLogMergeException("Cannot merge estimators of different class"); - } - HyperLogLog hll = (HyperLogLog) estimator; - merged.addAll(hll); - } - - return merged; - } - - public static class Builder implements IBuilder<ICardinality>, Serializable - { - private double rsd; - - public Builder(double rsd) - { - this.rsd = rsd; - } - - @Override - public HyperLogLog build() - { - return new HyperLogLog(rsd); - } - - @Override - public int sizeof() - { - int log2m = log2m(rsd); - int k = (int) Math.pow(2, log2m); - return RegisterSet.getBits(k) * 4; - } - - public static HyperLogLog build(byte[] bytes) throws IOException - { - ByteArrayInputStream bais = new ByteArrayInputStream(bytes); - DataInputStream oi = new DataInputStream(bais); - int log2m = oi.readInt(); - int size = oi.readInt(); - byte[] longArrayBytes = new byte[size]; - oi.readFully(longArrayBytes); - return new HyperLogLog(log2m, new RegisterSet((int) Math.pow(2, log2m), Bits.getBits(longArrayBytes))); - } - } - - @SuppressWarnings("serial") - protected static class HyperLogLogMergeException extends CardinalityMergeException - { - public HyperLogLogMergeException(String message) - { - super(message); - } - } -} http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/e685fd85/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/cardinality/IBuilder.java ---------------------------------------------------------------------- diff --git a/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/cardinality/IBuilder.java b/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/cardinality/IBuilder.java deleted file mode 100644 index 4321dc6..0000000 --- a/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/cardinality/IBuilder.java +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/* - * Copyright (C) 2011 Clearspring Technologies, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.gemstone.gemfire.cache.hdfs.internal.cardinality; - - -public interface IBuilder<T> -{ - T build(); - - int sizeof(); -} http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/e685fd85/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/cardinality/ICardinality.java ---------------------------------------------------------------------- diff --git a/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/cardinality/ICardinality.java b/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/cardinality/ICardinality.java deleted file mode 100644 index ae6d7d6..0000000 --- a/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/cardinality/ICardinality.java +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/* - * Copyright (C) 2011 Clearspring Technologies, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.gemstone.gemfire.cache.hdfs.internal.cardinality; - -import java.io.IOException; - - -public interface ICardinality -{ - /** - * @param o stream element - * @return false if the value returned by cardinality() is unaffected by the appearance of o in the stream. - */ - boolean offer(Object o); - - /** - * Offer the value as a hashed long value - * - * @param hashedLong - the hash of the item to offer to the estimator - * @return false if the value returned by cardinality() is unaffected by the appearance of hashedLong in the stream - */ - boolean offerHashed(long hashedLong); - - /** - * Offer the value as a hashed long value - * - * @param hashedInt - the hash of the item to offer to the estimator - * @return false if the value returned by cardinality() is unaffected by the appearance of hashedInt in the stream - */ - boolean offerHashed(int hashedInt); - - /** - * @return the number of unique elements in the stream or an estimate thereof - */ - long cardinality(); - - /** - * @return size in bytes needed for serialization - */ - int sizeof(); - - /** - * @throws IOException - */ - byte[] getBytes() throws IOException; - - /** - * Merges estimators to produce a new estimator for the combined streams - * of this estimator and those passed as arguments. - * - * Nor this estimator nor the one passed as parameters are modified. - * - * @param estimators Zero or more compatible estimators - * @throws CardinalityMergeException If at least one of the estimators is not compatible with this one - */ - ICardinality merge(ICardinality... estimators) throws CardinalityMergeException; -} http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/e685fd85/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/cardinality/MurmurHash.java ---------------------------------------------------------------------- diff --git a/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/cardinality/MurmurHash.java b/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/cardinality/MurmurHash.java deleted file mode 100644 index bc760a8..0000000 --- a/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/cardinality/MurmurHash.java +++ /dev/null @@ -1,261 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.gemstone.gemfire.cache.hdfs.internal.cardinality; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with this - * work for additional information regarding copyright ownership. The ASF - * licenses this file to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. - */ - -/** - * This is a very fast, non-cryptographic hash suitable for general hash-based - * lookup. See http://murmurhash.googlepages.com/ for more details. - * <p/> - * <p> - * The C version of MurmurHash 2.0 found at that site was ported to Java by - * Andrzej Bialecki (ab at getopt org). - * </p> - */ -public class MurmurHash -{ - public static int hash(Object o) - { - if (o == null) - { - return 0; - } - if (o instanceof Long) - { - return hashLong((Long) o); - } - if (o instanceof Integer) - { - return hashLong((Integer) o); - } - if (o instanceof Double) - { - return hashLong(Double.doubleToRawLongBits((Double) o)); - } - if (o instanceof Float) - { - return hashLong(Float.floatToRawIntBits((Float) o)); - } - if (o instanceof String) - { - return hash(((String) o).getBytes()); - } - if (o instanceof byte[]) - { - return hash((byte[]) o); - } - return hash(o.toString()); - } - - public static int hash(byte[] data) - { - return hash(data, 0, data.length, -1); - } - - public static int hash(byte[] data, int seed) - { - return hash(data, 0, data.length, seed); - } - - public static int hash(byte[] data, int offset, int length, int seed) - { - int m = 0x5bd1e995; - int r = 24; - - int h = seed ^ length; - - int len_4 = length >> 2; - - for (int i = 0; i < len_4; i++) - { - int i_4 = i << 2; - int k = data[offset + i_4 + 3]; - k = k << 8; - k = k | (data[offset + i_4 + 2] & 0xff); - k = k << 8; - k = k | (data[offset + i_4 + 1] & 0xff); - k = k << 8; - k = k | (data[offset + i_4 + 0] & 0xff); - k *= m; - k ^= k >>> r; - k *= m; - h *= m; - h ^= k; - } - - // avoid calculating modulo - int len_m = len_4 << 2; - int left = length - len_m; - - if (left != 0) - { - if (left >= 3) - { - h ^= (int) data[offset + length - 3] << 16; - } - if (left >= 2) - { - h ^= (int) data[offset + length - 2] << 8; - } - if (left >= 1) - { - h ^= (int) data[offset + length - 1]; - } - - h *= m; - } - - h ^= h >>> 13; - h *= m; - h ^= h >>> 15; - - return h; - } - - public static int hashLong(long data) - { - int m = 0x5bd1e995; - int r = 24; - - int h = 0; - - int k = (int) data * m; - k ^= k >>> r; - h ^= k * m; - - k = (int) (data >> 32) * m; - k ^= k >>> r; - h *= m; - h ^= k * m; - - h ^= h >>> 13; - h *= m; - h ^= h >>> 15; - - return h; - } - - public static long hash64(Object o) - { - if (o == null) - { - return 0l; - } - else if (o instanceof String) - { - final byte[] bytes = ((String) o).getBytes(); - return hash64(bytes, bytes.length); - } - else if (o instanceof byte[]) - { - final byte[] bytes = (byte[]) o; - return hash64(bytes, bytes.length); - } - return hash64(o.toString()); - } - - // 64 bit implementation copied from here: https://github.com/tnm/murmurhash-java - - /** - * Generates 64 bit hash from byte array with default seed value. - * - * @param data byte array to hash - * @param length length of the array to hash - * @return 64 bit hash of the given string - */ - public static long hash64(final byte[] data, int length) - { - return hash64(data, length, 0xe17a1465); - } - - - /** - * Generates 64 bit hash from byte array of the given length and seed. - * - * @param data byte array to hash - * @param length length of the array to hash - * @param seed initial seed value - * @return 64 bit hash of the given array - */ - public static long hash64(final byte[] data, int length, int seed) - { - final long m = 0xc6a4a7935bd1e995L; - final int r = 47; - - long h = (seed & 0xffffffffl) ^ (length * m); - - int length8 = length / 8; - - for (int i = 0; i < length8; i++) - { - final int i8 = i * 8; - long k = ((long) data[i8 + 0] & 0xff) + (((long) data[i8 + 1] & 0xff) << 8) - + (((long) data[i8 + 2] & 0xff) << 16) + (((long) data[i8 + 3] & 0xff) << 24) - + (((long) data[i8 + 4] & 0xff) << 32) + (((long) data[i8 + 5] & 0xff) << 40) - + (((long) data[i8 + 6] & 0xff) << 48) + (((long) data[i8 + 7] & 0xff) << 56); - - k *= m; - k ^= k >>> r; - k *= m; - - h ^= k; - h *= m; - } - - switch (length % 8) - { - case 7: - h ^= (long) (data[(length & ~7) + 6] & 0xff) << 48; - case 6: - h ^= (long) (data[(length & ~7) + 5] & 0xff) << 40; - case 5: - h ^= (long) (data[(length & ~7) + 4] & 0xff) << 32; - case 4: - h ^= (long) (data[(length & ~7) + 3] & 0xff) << 24; - case 3: - h ^= (long) (data[(length & ~7) + 2] & 0xff) << 16; - case 2: - h ^= (long) (data[(length & ~7) + 1] & 0xff) << 8; - case 1: - h ^= (long) (data[length & ~7] & 0xff); - h *= m; - } - ; - - h ^= h >>> r; - h *= m; - h ^= h >>> r; - - return h; - } -} http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/e685fd85/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/cardinality/RegisterSet.java ---------------------------------------------------------------------- diff --git a/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/cardinality/RegisterSet.java b/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/cardinality/RegisterSet.java deleted file mode 100644 index 6f1b919..0000000 --- a/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/cardinality/RegisterSet.java +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/* - * Copyright (C) 2012 Clearspring Technologies, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.gemstone.gemfire.cache.hdfs.internal.cardinality; - -public class RegisterSet -{ - public final static int LOG2_BITS_PER_WORD = 6; - public final static int REGISTER_SIZE = 5; - - public final int count; - public final int size; - - private final int[] M; - - public RegisterSet(int count) - { - this(count, null); - } - - public RegisterSet(int count, int[] initialValues) - { - this.count = count; - int bits = getBits(count); - - if (initialValues == null) - { - if (bits == 0) - { - this.M = new int[1]; - } - else if (bits % Integer.SIZE == 0) - { - this.M = new int[bits]; - } - else - { - this.M = new int[bits + 1]; - } - } - else - { - this.M = initialValues; - } - this.size = this.M.length; - } - - public static int getBits(int count) - { - return count / LOG2_BITS_PER_WORD; - } - - public void set(int position, int value) - { - int bucketPos = position / LOG2_BITS_PER_WORD; - int shift = REGISTER_SIZE * (position - (bucketPos * LOG2_BITS_PER_WORD)); - this.M[bucketPos] = (this.M[bucketPos] & ~(0x1f << shift)) | (value << shift); - } - - public int get(int position) - { - int bucketPos = position / LOG2_BITS_PER_WORD; - int shift = REGISTER_SIZE * (position - (bucketPos * LOG2_BITS_PER_WORD)); - return (this.M[bucketPos] & (0x1f << shift)) >>> shift; - } - - public boolean updateIfGreater(int position, int value) - { - int bucket = position / LOG2_BITS_PER_WORD; - int shift = REGISTER_SIZE * (position - (bucket * LOG2_BITS_PER_WORD)); - int mask = 0x1f << shift; - - // Use long to avoid sign issues with the left-most shift - long curVal = this.M[bucket] & mask; - long newVal = value << shift; - if (curVal < newVal) { - this.M[bucket] = (int)((this.M[bucket] & ~mask) | newVal); - return true; - } else { - return false; - } - } - - public void merge(RegisterSet that) - { - for (int bucket = 0; bucket < M.length; bucket++) - { - int word = 0; - for (int j = 0; j < LOG2_BITS_PER_WORD; j++) - { - int mask = 0x1f << (REGISTER_SIZE * j); - - int thisVal = (this.M[bucket] & mask); - int thatVal = (that.M[bucket] & mask); - word |= (thisVal < thatVal) ? thatVal : thisVal; - } - this.M[bucket] = word; - } - } - - public int[] bits() - { - int[] copy = new int[size]; - System.arraycopy(M, 0, copy, 0, M.length); - return copy; - } -} http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/e685fd85/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/hoplog/AbstractHoplog.java ---------------------------------------------------------------------- diff --git a/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/hoplog/AbstractHoplog.java b/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/hoplog/AbstractHoplog.java index 636dd91..d2fdbe7 100644 --- a/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/hoplog/AbstractHoplog.java +++ b/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/hoplog/AbstractHoplog.java @@ -19,6 +19,7 @@ package com.gemstone.gemfire.cache.hdfs.internal.hoplog; import java.io.IOException; import java.util.regex.Matcher; +import com.gemstone.gemfire.internal.hll.ICardinality; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; @@ -32,7 +33,6 @@ import org.apache.hadoop.io.compress.SnappyCodec; import com.gemstone.gemfire.cache.hdfs.HDFSIOException; import com.gemstone.gemfire.cache.hdfs.internal.HDFSStoreImpl; -import com.gemstone.gemfire.cache.hdfs.internal.cardinality.ICardinality; import com.gemstone.gemfire.cache.hdfs.internal.org.apache.hadoop.io.SequenceFile; import com.gemstone.gemfire.cache.hdfs.internal.org.apache.hadoop.io.SequenceFile.CompressionType; import com.gemstone.gemfire.cache.hdfs.internal.org.apache.hadoop.io.SequenceFile.Writer.Option; http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/e685fd85/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/hoplog/HFileSortedOplog.java ---------------------------------------------------------------------- diff --git a/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/hoplog/HFileSortedOplog.java b/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/hoplog/HFileSortedOplog.java index 8fc9c8e..5ba20d2 100644 --- a/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/hoplog/HFileSortedOplog.java +++ b/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/hoplog/HFileSortedOplog.java @@ -29,6 +29,8 @@ import java.util.Map.Entry; import java.util.NoSuchElementException; import java.util.concurrent.atomic.AtomicBoolean; +import com.gemstone.gemfire.internal.hll.HyperLogLog; +import com.gemstone.gemfire.internal.hll.ICardinality; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.ipc.RemoteException; @@ -37,8 +39,6 @@ import org.apache.hadoop.util.ShutdownHookManager; import com.gemstone.gemfire.cache.CacheClosedException; import com.gemstone.gemfire.cache.hdfs.HDFSIOException; import com.gemstone.gemfire.cache.hdfs.internal.HDFSStoreImpl; -import com.gemstone.gemfire.cache.hdfs.internal.cardinality.HyperLogLog; -import com.gemstone.gemfire.cache.hdfs.internal.cardinality.ICardinality; import com.gemstone.gemfire.internal.cache.persistence.soplog.DelegatingSerializedComparator; import com.gemstone.gemfire.internal.cache.persistence.soplog.HFileStoreStatistics; import com.gemstone.gemfire.internal.cache.persistence.soplog.SortedOplogStatistics; http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/e685fd85/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/hoplog/HdfsSortedOplogOrganizer.java ---------------------------------------------------------------------- diff --git a/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/hoplog/HdfsSortedOplogOrganizer.java b/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/hoplog/HdfsSortedOplogOrganizer.java index 9890b3b..61b925b 100644 --- a/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/hoplog/HdfsSortedOplogOrganizer.java +++ b/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/hoplog/HdfsSortedOplogOrganizer.java @@ -40,6 +40,10 @@ import java.util.concurrent.locks.ReentrantReadWriteLock; import java.util.regex.Matcher; import java.util.regex.Pattern; +import com.gemstone.gemfire.internal.hll.CardinalityMergeException; +import com.gemstone.gemfire.internal.hll.HyperLogLog; +import com.gemstone.gemfire.internal.hll.ICardinality; +import com.gemstone.gemfire.internal.hll.MurmurHash; import org.apache.commons.lang.NotImplementedException; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; @@ -54,10 +58,6 @@ import com.gemstone.gemfire.cache.hdfs.HDFSIOException; import com.gemstone.gemfire.cache.hdfs.HDFSStore; import com.gemstone.gemfire.cache.hdfs.internal.QueuedPersistentEvent; import com.gemstone.gemfire.cache.hdfs.internal.SortedHoplogPersistedEvent; -import com.gemstone.gemfire.cache.hdfs.internal.cardinality.CardinalityMergeException; -import com.gemstone.gemfire.cache.hdfs.internal.cardinality.HyperLogLog; -import com.gemstone.gemfire.cache.hdfs.internal.cardinality.ICardinality; -import com.gemstone.gemfire.cache.hdfs.internal.cardinality.MurmurHash; import com.gemstone.gemfire.cache.hdfs.internal.hoplog.HDFSCompactionManager.CompactionRequest; import com.gemstone.gemfire.cache.hdfs.internal.hoplog.HDFSRegionDirector.HdfsRegionManager; import com.gemstone.gemfire.cache.hdfs.internal.hoplog.Hoplog.HoplogReader; http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/e685fd85/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/hoplog/Hoplog.java ---------------------------------------------------------------------- diff --git a/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/hoplog/Hoplog.java b/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/hoplog/Hoplog.java index 113e49b..a5030ae 100644 --- a/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/hoplog/Hoplog.java +++ b/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/hoplog/Hoplog.java @@ -16,12 +16,13 @@ */ package com.gemstone.gemfire.cache.hdfs.internal.hoplog; +import com.gemstone.gemfire.internal.hll.ICardinality; + import java.io.Closeable; import java.io.IOException; import java.nio.ByteBuffer; import java.util.EnumMap; -import com.gemstone.gemfire.cache.hdfs.internal.cardinality.ICardinality; /** * Ordered sequence file http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/e685fd85/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/hoplog/SequenceFileHoplog.java ---------------------------------------------------------------------- diff --git a/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/hoplog/SequenceFileHoplog.java b/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/hoplog/SequenceFileHoplog.java index 04cbb05..fbb8f14 100644 --- a/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/hoplog/SequenceFileHoplog.java +++ b/gemfire-core/src/main/java/com/gemstone/gemfire/cache/hdfs/internal/hoplog/SequenceFileHoplog.java @@ -23,6 +23,7 @@ import java.io.IOException; import java.nio.ByteBuffer; import java.util.EnumMap; +import com.gemstone.gemfire.internal.hll.ICardinality; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; @@ -32,7 +33,6 @@ import org.apache.hadoop.io.IOUtils; import org.apache.hadoop.io.Text; import com.gemstone.gemfire.cache.hdfs.HDFSIOException; -import com.gemstone.gemfire.cache.hdfs.internal.cardinality.ICardinality; import com.gemstone.gemfire.cache.hdfs.internal.hoplog.HoplogSetReader.HoplogIterator; import com.gemstone.gemfire.cache.hdfs.internal.org.apache.hadoop.io.SequenceFile; import com.gemstone.gemfire.cache.hdfs.internal.org.apache.hadoop.io.SequenceFile.Reader; http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/e685fd85/gemfire-core/src/main/java/com/gemstone/gemfire/internal/hll/Bits.java ---------------------------------------------------------------------- diff --git a/gemfire-core/src/main/java/com/gemstone/gemfire/internal/hll/Bits.java b/gemfire-core/src/main/java/com/gemstone/gemfire/internal/hll/Bits.java new file mode 100755 index 0000000..bd0b33c --- /dev/null +++ b/gemfire-core/src/main/java/com/gemstone/gemfire/internal/hll/Bits.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.gemstone.gemfire.internal.hll; + +/* + * Copyright (C) 2011 Clearspring Technologies, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.ByteArrayInputStream; +import java.io.DataInput; +import java.io.DataInputStream; +import java.io.IOException; + +public class Bits { + + public static int[] getBits(byte[] mBytes) throws IOException { + int bitSize = mBytes.length / 4; + int[] bits = new int[bitSize]; + DataInputStream dis = new DataInputStream(new ByteArrayInputStream(mBytes)); + for (int i = 0; i < bitSize; i++) { + bits[i] = dis.readInt(); + } + return bits; + } + + /** + * This method might be better described as + * "byte array to int array" or "data input to int array" + */ + public static int[] getBits(DataInput dataIn, int byteLength) throws IOException { + int bitSize = byteLength / 4; + int[] bits = new int[bitSize]; + for (int i = 0; i < bitSize; i++) { + bits[i] = dataIn.readInt(); + } + return bits; + } + +} http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/e685fd85/gemfire-core/src/main/java/com/gemstone/gemfire/internal/hll/CardinalityMergeException.java ---------------------------------------------------------------------- diff --git a/gemfire-core/src/main/java/com/gemstone/gemfire/internal/hll/CardinalityMergeException.java b/gemfire-core/src/main/java/com/gemstone/gemfire/internal/hll/CardinalityMergeException.java new file mode 100755 index 0000000..5fd6b0b --- /dev/null +++ b/gemfire-core/src/main/java/com/gemstone/gemfire/internal/hll/CardinalityMergeException.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.gemstone.gemfire.internal.hll; + +/* + * Copyright (C) 2011 Clearspring Technologies, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +@SuppressWarnings("serial") +public abstract class CardinalityMergeException extends Exception { + + public CardinalityMergeException(String message) { + super(message); + } +} http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/e685fd85/gemfire-core/src/main/java/com/gemstone/gemfire/internal/hll/HyperLogLog.java ---------------------------------------------------------------------- diff --git a/gemfire-core/src/main/java/com/gemstone/gemfire/internal/hll/HyperLogLog.java b/gemfire-core/src/main/java/com/gemstone/gemfire/internal/hll/HyperLogLog.java new file mode 100755 index 0000000..fa5508d --- /dev/null +++ b/gemfire-core/src/main/java/com/gemstone/gemfire/internal/hll/HyperLogLog.java @@ -0,0 +1,362 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.gemstone.gemfire.internal.hll; + +/* + * Copyright (C) 2012 Clearspring Technologies, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import com.gemstone.gemfire.internal.redis.executor.hll.HllExecutor; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.DataInput; +import java.io.DataInputStream; +import java.io.DataOutput; +import java.io.DataOutputStream; +import java.io.Externalizable; +import java.io.IOException; +import java.io.ObjectInput; +import java.io.ObjectOutput; +import java.io.Serializable; + +/** + * Java implementation of HyperLogLog (HLL) algorithm from this paper: + * <p/> + * http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf + * <p/> + * HLL is an improved version of LogLog that is capable of estimating + * the cardinality of a set with accuracy = 1.04/sqrt(m) where + * m = 2^b. So we can control accuracy vs space usage by increasing + * or decreasing b. + * <p/> + * The main benefit of using HLL over LL is that it only requires 64% + * of the space that LL does to get the same accuracy. + * <p/> + * This implementation implements a single counter. If a large (millions) + * number of counters are required you may want to refer to: + * <p/> + * http://dsiutils.dsi.unimi.it/ + * <p/> + * It has a more complex implementation of HLL that supports multiple counters + * in a single object, drastically reducing the java overhead from creating + * a large number of objects. + * <p/> + * This implementation leveraged a javascript implementation that Yammer has + * been working on: + * <p/> + * https://github.com/yammer/probablyjs + * <p> + * Note that this implementation does not include the long range correction function + * defined in the original paper. Empirical evidence shows that the correction + * function causes more harm than good. + * </p> + * <p/> + * <p> + * Users have different motivations to use different types of hashing functions. + * Rather than try to keep up with all available hash functions and to remove + * the concern of causing future binary incompatibilities this class allows clients + * to offer the value in hashed int or long form. This way clients are free + * to change their hash function on their own time line. We recommend using Google's + * Guava Murmur3_128 implementation as it provides good performance and speed when + * high precision is required. In our tests the 32bit MurmurHash function included + * in this project is faster and produces better results than the 32 bit murmur3 + * implementation google provides. + * </p> + */ +public class HyperLogLog implements ICardinality, Serializable { + + private static final long serialVersionUID = -4661220245111112301L; + private final RegisterSet registerSet; + private final int log2m; + private final double alphaMM; + + + /** + * Create a new HyperLogLog instance using the specified standard deviation. + * + * @param rsd - the relative standard deviation for the counter. + * smaller values create counters that require more space. + */ + public HyperLogLog(double rsd) { + this(log2m(rsd)); + } + + private static int log2m(double rsd) { + return (int) (Math.log((1.106 / rsd) * (1.106 / rsd)) / Math.log(2)); + } + + /** + * Create a new HyperLogLog instance. The log2m parameter defines the accuracy of + * the counter. The larger the log2m the better the accuracy. + * <p/> + * accuracy = 1.04/sqrt(2^log2m) + * + * @param log2m - the number of bits to use as the basis for the HLL instance + */ + public HyperLogLog(int log2m) { + this(log2m, new RegisterSet(1 << log2m)); + } + + /** + * Creates a new HyperLogLog instance using the given registers. Used for unmarshalling a serialized + * instance and for merging multiple counters together. + * + * @param registerSet - the initial values for the register set + */ + @Deprecated + public HyperLogLog(int log2m, RegisterSet registerSet) { + if (log2m < 0 || log2m > 30) { + throw new IllegalArgumentException("log2m argument is " + + log2m + " and is outside the range [0, 30]"); + } + this.registerSet = registerSet; + this.log2m = log2m; + int m = 1 << this.log2m; + + alphaMM = getAlphaMM(log2m, m); + } + + @Override + public boolean offerHashed(long hashedValue) { + // j becomes the binary address determined by the first b log2m of x + // j will be between 0 and 2^log2m + final int j = (int) (hashedValue >>> (Long.SIZE - log2m)); + final int r = Long.numberOfLeadingZeros((hashedValue << this.log2m) | (1 << (this.log2m - 1)) + 1) + 1; + return registerSet.updateIfGreater(j, r); + } + + @Override + public boolean offerHashed(int hashedValue) { + // j becomes the binary address determined by the first b log2m of x + // j will be between 0 and 2^log2m + final int j = hashedValue >>> (Integer.SIZE - log2m); + final int r = Integer.numberOfLeadingZeros((hashedValue << this.log2m) | (1 << (this.log2m - 1)) + 1) + 1; + return registerSet.updateIfGreater(j, r); + } + + @Override + public boolean offer(Object o) { + final int x = MurmurHash.hash(o); + return offerHashed(x); + } + + + @Override + public long cardinality() { + double registerSum = 0; + int count = registerSet.count; + double zeros = 0.0; + for (int j = 0; j < registerSet.count; j++) { + int val = registerSet.get(j); + registerSum += 1.0 / (1 << val); + if (val == 0) { + zeros++; + } + } + + double estimate = alphaMM * (1 / registerSum); + + if (estimate <= (5.0 / 2.0) * count) { + // Small Range Estimate + return Math.round(linearCounting(count, zeros)); + } else { + return Math.round(estimate); + } + } + + @Override + public int sizeof() { + return registerSet.size * 4; + } + + @Override + public byte[] getBytes() throws IOException { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + DataOutput dos = new DataOutputStream(baos); + writeBytes(dos); + + return baos.toByteArray(); + } + + private void writeBytes(DataOutput serializedByteStream) throws IOException { + serializedByteStream.writeInt(log2m); + serializedByteStream.writeInt(registerSet.size * 4); + for (int x : registerSet.readOnlyBits()) { + serializedByteStream.writeInt(x); + } + } + + /** + * Add all the elements of the other set to this set. + * <p/> + * This operation does not imply a loss of precision. + * + * @param other A compatible Hyperloglog instance (same log2m) + * @throws CardinalityMergeException if other is not compatible + */ + public void addAll(HyperLogLog other) throws CardinalityMergeException { + if (this.sizeof() != other.sizeof()) { + throw new HyperLogLogMergeException("Cannot merge estimators of different sizes"); + } + + registerSet.merge(other.registerSet); + } + + @Override + public ICardinality merge(ICardinality... estimators) throws CardinalityMergeException { + HyperLogLog merged = new HyperLogLog(HllExecutor.DEFAULT_HLL_STD_DEV);//new HyperLogLog(log2m, new RegisterSet(this.registerSet.count)); + merged.addAll(this); + + if (estimators == null) { + return merged; + } + + for (ICardinality estimator : estimators) { + if (!(estimator instanceof HyperLogLog)) { + throw new HyperLogLogMergeException("Cannot merge estimators of different class"); + } + HyperLogLog hll = (HyperLogLog) estimator; + merged.addAll(hll); + } + + return merged; + } + + private Object writeReplace() { + return new SerializationHolder(this); + } + + /** + * This class exists to support Externalizable semantics for + * HyperLogLog objects without having to expose a public + * constructor, public write/read methods, or pretend final + * fields aren't final. + * + * In short, Externalizable allows you to skip some of the more + * verbose meta-data default Serializable gets you, but still + * includes the class name. In that sense, there is some cost + * to this holder object because it has a longer class name. I + * imagine people who care about optimizing for that have their + * own work-around for long class names in general, or just use + * a custom serialization framework. Therefore we make no attempt + * to optimize that here (eg. by raising this from an inner class + * and giving it an unhelpful name). + */ + private static class SerializationHolder implements Externalizable { + + HyperLogLog hyperLogLogHolder; + + public SerializationHolder(HyperLogLog hyperLogLogHolder) { + this.hyperLogLogHolder = hyperLogLogHolder; + } + + /** + * required for Externalizable + */ + @SuppressWarnings("unused") + public SerializationHolder() { + + } + + @Override + public void writeExternal(ObjectOutput out) throws IOException { + hyperLogLogHolder.writeBytes(out); + } + + @Override + public void readExternal(ObjectInput in) throws IOException, ClassNotFoundException { + hyperLogLogHolder = Builder.build(in); + } + + private Object readResolve() { + return hyperLogLogHolder; + } + } + + public static class Builder implements IBuilder<ICardinality>, Serializable { + + private static final long serialVersionUID = -979314356097156719L; + private double rsd; + + public Builder(double rsd) { + this.rsd = rsd; + } + + @Override + public HyperLogLog build() { + return new HyperLogLog(rsd); + } + + @Override + public int sizeof() { + int log2m = log2m(rsd); + int k = 1 << log2m; + return RegisterSet.getBits(k) * 4; + } + + public static HyperLogLog build(byte[] bytes) throws IOException { + ByteArrayInputStream bais = new ByteArrayInputStream(bytes); + return build(new DataInputStream(bais)); + } + + public static HyperLogLog build(DataInput serializedByteStream) throws IOException { + int log2m = serializedByteStream.readInt(); + int byteArraySize = serializedByteStream.readInt(); + return new HyperLogLog(log2m, + new RegisterSet(1 << log2m, Bits.getBits(serializedByteStream, byteArraySize))); + } + } + + @SuppressWarnings("serial") + protected static class HyperLogLogMergeException extends CardinalityMergeException { + + public HyperLogLogMergeException(String message) { + super(message); + } + } + + protected static double getAlphaMM(final int p, final int m) { + // See the paper. + switch (p) { + case 4: + return 0.673 * m * m; + case 5: + return 0.697 * m * m; + case 6: + return 0.709 * m * m; + default: + return (0.7213 / (1 + 1.079 / m)) * m * m; + } + } + + protected static double linearCounting(int m, double V) { + return m * Math.log(m / V); + } +}