Ali Alsuliman has uploaded a new change for review. https://asterix-gerrit.ics.uci.edu/3286
Change subject: [ASTERIXDB-2516][RT] Avoid rewriting numbers into storage when hashing ...................................................................... [ASTERIXDB-2516][RT] Avoid rewriting numbers into storage when hashing - user model changes: no - storage format changes: no - interface changes: no Details: When hashing numbers other than a double, the value is converted to double by writing the number again as a double in a buffer then hashing is performed. Eliminate the need to write to a buffer and read the number as a double and perform the hashing on the bits. Change-Id: Ibdf68d90240a3fcddc8cc2ae5ee4a7aa53070438 --- A asterixdb/asterix-app/src/test/java/org/apache/asterix/runtime/HashTest.java M asterixdb/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/hash/AMurmurHash3BinaryHashFunctionFamily.java M hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/accessors/MurmurHash3BinaryHash.java 3 files changed, 275 insertions(+), 25 deletions(-) git pull ssh://asterix-gerrit.ics.uci.edu:29418/asterixdb refs/changes/86/3286/1 diff --git a/asterixdb/asterix-app/src/test/java/org/apache/asterix/runtime/HashTest.java b/asterixdb/asterix-app/src/test/java/org/apache/asterix/runtime/HashTest.java new file mode 100644 index 0000000..1b793b1 --- /dev/null +++ b/asterixdb/asterix-app/src/test/java/org/apache/asterix/runtime/HashTest.java @@ -0,0 +1,211 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.asterix.runtime; + +import java.io.IOException; +import java.util.Random; + +import org.apache.asterix.dataflow.data.nontagged.hash.AMurmurHash3BinaryHashFunctionFamily; +import org.apache.asterix.formats.nontagged.SerializerDeserializerProvider; +import org.apache.asterix.om.base.AMutableDouble; +import org.apache.asterix.om.base.AMutableFloat; +import org.apache.asterix.om.base.AMutableInt16; +import org.apache.asterix.om.base.AMutableInt32; +import org.apache.asterix.om.base.AMutableInt64; +import org.apache.asterix.om.base.AMutableInt8; +import org.apache.asterix.om.types.BuiltinType; +import org.apache.asterix.om.types.hierachy.FloatToDoubleTypeConvertComputer; +import org.apache.asterix.om.types.hierachy.IntegerToDoubleTypeConvertComputer; +import org.apache.hyracks.api.dataflow.value.IBinaryHashFunction; +import org.apache.hyracks.api.dataflow.value.ISerializerDeserializer; +import org.apache.hyracks.data.std.accessors.MurmurHash3BinaryHash; +import org.apache.hyracks.data.std.util.ArrayBackedValueStorage; +import org.junit.Assert; +import org.junit.Test; + +public class HashTest { + + private static final Random RANDOM = new Random(); + + @SuppressWarnings("unchecked") // unchecked call from serializer + @Test + public void hashNumbers() throws IOException { + int seed = 0; + ISerializerDeserializer se = SerializerDeserializerProvider.INSTANCE.getSerializerDeserializer(BuiltinType.ANY); + IBinaryHashFunction hashFun = + AMurmurHash3BinaryHashFunctionFamily.createBinaryHashFunction(BuiltinType.ANY, seed); + ArrayBackedValueStorage numStorage = new ArrayBackedValueStorage(); + ArrayBackedValueStorage doubleStorage = new ArrayBackedValueStorage(); + AMutableInt8 mutableInt8 = new AMutableInt8((byte) 0); + AMutableInt16 mutableInt16 = new AMutableInt16((short) 0); + AMutableInt32 mutableInt32 = new AMutableInt32(0); + AMutableInt64 mutableInt64 = new AMutableInt64(0); + AMutableFloat mutableFloat = new AMutableFloat(0); + AMutableDouble mutableDouble = new AMutableDouble(0); + int h1, h2, intHash, longHash, floatHash; + for (int i = 0; i < 100; i++) { + double doubleValue = RANDOM.nextDouble(); + float floatValue = RANDOM.nextFloat(); + long longValue = RANDOM.nextLong(); + int intValue = RANDOM.nextInt(i + 1); + // byte + mutableInt8.setValue((byte) longValue); + assertSameHash(se, mutableInt8, numStorage, doubleStorage, hashFun, seed, false, longValue); + mutableInt8.setValue((byte) intValue); + assertSameHash(se, mutableInt8, numStorage, doubleStorage, hashFun, seed, false, intValue); + // short + mutableInt16.setValue((short) longValue); + assertSameHash(se, mutableInt16, numStorage, doubleStorage, hashFun, seed, false, longValue); + mutableInt16.setValue((short) intValue); + assertSameHash(se, mutableInt16, numStorage, doubleStorage, hashFun, seed, false, intValue); + // int + mutableInt32.setValue((int) longValue); + assertSameHash(se, mutableInt32, numStorage, doubleStorage, hashFun, seed, false, longValue); + mutableInt32.setValue(intValue); + intHash = assertSameHash(se, mutableInt32, numStorage, doubleStorage, hashFun, seed, false, intValue); + // long + mutableInt64.setValue(intValue); + h2 = assertSameHash(se, mutableInt64, numStorage, doubleStorage, hashFun, seed, false, intValue); + Assert.assertEquals(intHash, h2); + mutableInt64.setValue(longValue); + longHash = assertSameHash(se, mutableInt64, numStorage, doubleStorage, hashFun, seed, false, longValue); + // float + mutableFloat.setValue(intValue); + h2 = assertSameHash(se, mutableFloat, numStorage, doubleStorage, hashFun, seed, true, intValue); + Assert.assertEquals(intHash, h2); + mutableFloat.setValue(floatValue); + floatHash = assertSameHash(se, mutableFloat, numStorage, doubleStorage, hashFun, seed, true, floatValue); + // double + mutableDouble.setValue(intValue); + h2 = assertSameHashDouble(se, mutableDouble, numStorage, hashFun, seed, intValue); + Assert.assertEquals(intHash, h2); + mutableDouble.setValue(longValue); + h2 = assertSameHashDouble(se, mutableDouble, numStorage, hashFun, seed, longValue); + Assert.assertEquals(longHash, h2); + mutableDouble.setValue(floatValue); + h2 = assertSameHashDouble(se, mutableDouble, numStorage, hashFun, seed, floatValue); + Assert.assertEquals(floatHash, h2); + mutableDouble.setValue(doubleValue); + assertSameHashDouble(se, mutableDouble, numStorage, hashFun, seed, doubleValue); + } + mutableInt8.setValue(Byte.MAX_VALUE); + assertSameHash(se, mutableInt8, numStorage, doubleStorage, hashFun, seed, false, Byte.MAX_VALUE); + mutableInt8.setValue(Byte.MIN_VALUE); + assertSameHash(se, mutableInt8, numStorage, doubleStorage, hashFun, seed, false, Byte.MIN_VALUE); + + mutableInt16.setValue(Short.MAX_VALUE); + assertSameHash(se, mutableInt16, numStorage, doubleStorage, hashFun, seed, false, Short.MAX_VALUE); + mutableInt16.setValue(Short.MIN_VALUE); + assertSameHash(se, mutableInt16, numStorage, doubleStorage, hashFun, seed, false, Short.MIN_VALUE); + + mutableInt32.setValue(Integer.MAX_VALUE); + assertSameHash(se, mutableInt32, numStorage, doubleStorage, hashFun, seed, false, Integer.MAX_VALUE); + mutableInt32.setValue(Integer.MIN_VALUE); + assertSameHash(se, mutableInt32, numStorage, doubleStorage, hashFun, seed, false, Integer.MIN_VALUE); + + mutableInt64.setValue(Long.MAX_VALUE); + assertSameHash(se, mutableInt64, numStorage, doubleStorage, hashFun, seed, false, Long.MAX_VALUE); + mutableInt64.setValue(Long.MIN_VALUE); + assertSameHash(se, mutableInt64, numStorage, doubleStorage, hashFun, seed, false, Long.MIN_VALUE); + + mutableFloat.setValue(Float.MAX_VALUE); + assertSameHash(se, mutableFloat, numStorage, doubleStorage, hashFun, seed, true, Float.MAX_VALUE); + mutableFloat.setValue(-Float.MAX_VALUE); + assertSameHash(se, mutableFloat, numStorage, doubleStorage, hashFun, seed, true, -Float.MAX_VALUE); + mutableFloat.setValue(Float.MIN_VALUE); + assertSameHash(se, mutableFloat, numStorage, doubleStorage, hashFun, seed, true, Float.MIN_VALUE); + + mutableDouble.setValue(Double.MAX_VALUE); + assertSameHashDouble(se, mutableDouble, numStorage, hashFun, seed, Double.MAX_VALUE); + mutableDouble.setValue(-Double.MAX_VALUE); + assertSameHashDouble(se, mutableDouble, numStorage, hashFun, seed, -Double.MAX_VALUE); + mutableDouble.setValue(Double.MIN_VALUE); + assertSameHashDouble(se, mutableDouble, numStorage, hashFun, seed, Double.MIN_VALUE); + + mutableFloat.setValue(Float.NaN); + h1 = assertSameHash(se, mutableFloat, numStorage, doubleStorage, hashFun, seed, true, Float.NaN); + mutableDouble.setValue(Double.NaN); + h2 = assertSameHashDouble(se, mutableDouble, numStorage, hashFun, seed, Double.NaN); + Assert.assertEquals(h1, h2); + + mutableFloat.setValue(Float.POSITIVE_INFINITY); + h1 = assertSameHash(se, mutableFloat, numStorage, doubleStorage, hashFun, seed, true, Float.POSITIVE_INFINITY); + mutableDouble.setValue(Double.POSITIVE_INFINITY); + h2 = assertSameHashDouble(se, mutableDouble, numStorage, hashFun, seed, Double.POSITIVE_INFINITY); + Assert.assertEquals(h1, h2); + + mutableFloat.setValue(Float.NEGATIVE_INFINITY); + h1 = assertSameHash(se, mutableFloat, numStorage, doubleStorage, hashFun, seed, true, Float.NEGATIVE_INFINITY); + mutableDouble.setValue(Double.NEGATIVE_INFINITY); + h2 = assertSameHashDouble(se, mutableDouble, numStorage, hashFun, seed, Double.NEGATIVE_INFINITY); + Assert.assertEquals(h1, h2); + + mutableInt32.setValue(0); + h1 = assertSameHash(se, mutableInt32, numStorage, doubleStorage, hashFun, seed, false, 0); + mutableDouble.setValue(0.0); + h2 = assertSameHashDouble(se, mutableDouble, numStorage, hashFun, seed, 0.0); + Assert.assertEquals(h1, h2); + + mutableInt32.setValue(-0); + h1 = assertSameHash(se, mutableInt32, numStorage, doubleStorage, hashFun, seed, false, -0); + mutableDouble.setValue(-0.0); + h2 = assertSameHashDouble(se, mutableDouble, numStorage, hashFun, seed, -0.0); + Assert.assertNotEquals(h1, h2); + + mutableInt32.setValue(+0); + h1 = assertSameHash(se, mutableInt32, numStorage, doubleStorage, hashFun, seed, false, +0); + mutableDouble.setValue(+0.0); + h2 = assertSameHashDouble(se, mutableDouble, numStorage, hashFun, seed, +0.0); + Assert.assertEquals(h1, h2); + } + + @SuppressWarnings("unchecked") // unchecked call from serializer + private static int assertSameHash(ISerializerDeserializer se, Object num, ArrayBackedValueStorage numStorage, + ArrayBackedValueStorage doubleStorage, IBinaryHashFunction hashFun, int seed, boolean isFloat, double n) + throws IOException { + numStorage.reset(); + se.serialize(num, numStorage.getDataOutput()); + int actual = hashFun.hash(numStorage.getByteArray(), numStorage.getStartOffset(), numStorage.getLength()); + doubleStorage.reset(); + if (isFloat) { + FloatToDoubleTypeConvertComputer.getInstance().convertType(numStorage.getByteArray(), + numStorage.getStartOffset() + 1, numStorage.getLength() - 1, doubleStorage.getDataOutput()); + } else { + IntegerToDoubleTypeConvertComputer.getInstance().convertType(numStorage.getByteArray(), + numStorage.getStartOffset() + 1, numStorage.getLength() - 1, doubleStorage.getDataOutput()); + } + int expected = MurmurHash3BinaryHash.hash(doubleStorage.getByteArray(), doubleStorage.getStartOffset(), + doubleStorage.getLength(), seed); + Assert.assertEquals("Hash for number: " + n, expected, actual); + return actual; + } + + @SuppressWarnings("unchecked") // unchecked call from serializer + private static int assertSameHashDouble(ISerializerDeserializer se, Object num, ArrayBackedValueStorage numStorage, + IBinaryHashFunction hashFun, int seed, double n) throws IOException { + numStorage.reset(); + se.serialize(num, numStorage.getDataOutput()); + int actual = hashFun.hash(numStorage.getByteArray(), numStorage.getStartOffset(), numStorage.getLength()); + int expected = MurmurHash3BinaryHash.hash(numStorage.getByteArray(), numStorage.getStartOffset(), + numStorage.getLength(), seed); + Assert.assertEquals("Hash for number: " + n, expected, actual); + return actual; + } +} diff --git a/asterixdb/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/hash/AMurmurHash3BinaryHashFunctionFamily.java b/asterixdb/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/hash/AMurmurHash3BinaryHashFunctionFamily.java index 020fbd1..b2b5655 100644 --- a/asterixdb/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/hash/AMurmurHash3BinaryHashFunctionFamily.java +++ b/asterixdb/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/hash/AMurmurHash3BinaryHashFunctionFamily.java @@ -18,11 +18,11 @@ */ package org.apache.asterix.dataflow.data.nontagged.hash; +import static org.apache.asterix.om.types.ATypeTag.SERIALIZED_DOUBLE_TYPE_TAG; import static org.apache.asterix.om.util.container.ObjectFactories.RECORD_FACTORY; import static org.apache.asterix.om.util.container.ObjectFactories.STORAGE_FACTORY; import static org.apache.asterix.om.util.container.ObjectFactories.VOID_FACTORY; -import java.io.DataOutput; import java.io.IOException; import org.apache.asterix.dataflow.data.common.ListAccessorUtil; @@ -34,17 +34,19 @@ import org.apache.asterix.om.types.AbstractCollectionType; import org.apache.asterix.om.types.EnumDeserializer; import org.apache.asterix.om.types.IAType; -import org.apache.asterix.om.types.hierachy.FloatToDoubleTypeConvertComputer; -import org.apache.asterix.om.types.hierachy.IntegerToDoubleTypeConvertComputer; import org.apache.asterix.om.util.container.IObjectPool; import org.apache.asterix.om.util.container.ListObjectPool; import org.apache.hyracks.api.dataflow.value.IBinaryHashFunction; import org.apache.hyracks.api.dataflow.value.IBinaryHashFunctionFamily; -import org.apache.hyracks.api.exceptions.ErrorCode; import org.apache.hyracks.api.exceptions.HyracksDataException; import org.apache.hyracks.data.std.accessors.MurmurHash3BinaryHash; import org.apache.hyracks.data.std.api.IMutableValueStorage; import org.apache.hyracks.data.std.api.IPointable; +import org.apache.hyracks.data.std.primitive.DoublePointable; +import org.apache.hyracks.data.std.primitive.FloatPointable; +import org.apache.hyracks.data.std.primitive.IntegerPointable; +import org.apache.hyracks.data.std.primitive.LongPointable; +import org.apache.hyracks.data.std.primitive.ShortPointable; import org.apache.hyracks.data.std.util.ArrayBackedValueStorage; public class AMurmurHash3BinaryHashFunctionFamily implements IBinaryHashFunctionFamily { @@ -75,8 +77,6 @@ private static final class GenericHashFunction implements IBinaryHashFunction { - private final ArrayBackedValueStorage valueBuffer = new ArrayBackedValueStorage(); - private final DataOutput valueOut = valueBuffer.getDataOutput(); private final IObjectPool<IPointable, Void> voidPointableAllocator = new ListObjectPool<>(VOID_FACTORY); private final IObjectPool<IMutableValueStorage, Void> storageAllocator = new ListObjectPool<>(STORAGE_FACTORY); private final IObjectPool<SortedRecord, ARecordType> recordPool = new ListObjectPool<>(RECORD_FACTORY); @@ -95,31 +95,27 @@ private int hash(IAType type, byte[] bytes, int offset, int length) throws HyracksDataException { // if a numeric type is encountered, then we promote each numeric type to the DOUBLE type. - valueBuffer.reset(); ATypeTag sourceTag = EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(bytes[offset]); - + double value; switch (sourceTag) { case TINYINT: + value = bytes[offset + 1]; + return MurmurHash3BinaryHash.hash(Double.doubleToLongBits(value), SERIALIZED_DOUBLE_TYPE_TAG, seed); case SMALLINT: + value = ShortPointable.getShort(bytes, offset + 1); + return MurmurHash3BinaryHash.hash(Double.doubleToLongBits(value), SERIALIZED_DOUBLE_TYPE_TAG, seed); case INTEGER: + value = IntegerPointable.getInteger(bytes, offset + 1); + return MurmurHash3BinaryHash.hash(Double.doubleToLongBits(value), SERIALIZED_DOUBLE_TYPE_TAG, seed); case BIGINT: - try { - IntegerToDoubleTypeConvertComputer.getInstance().convertType(bytes, offset + 1, length - 1, - valueOut); - } catch (IOException e) { - throw HyracksDataException.create(ErrorCode.NUMERIC_PROMOTION_ERROR, e.getMessage()); - } - return MurmurHash3BinaryHash.hash(valueBuffer.getByteArray(), valueBuffer.getStartOffset(), - valueBuffer.getLength(), seed); + value = LongPointable.getLong(bytes, offset + 1); + return MurmurHash3BinaryHash.hash(Double.doubleToLongBits(value), SERIALIZED_DOUBLE_TYPE_TAG, seed); case FLOAT: - try { - FloatToDoubleTypeConvertComputer.getInstance().convertType(bytes, offset + 1, length - 1, - valueOut); - } catch (IOException e) { - throw HyracksDataException.create(ErrorCode.NUMERIC_PROMOTION_ERROR, e.getMessage()); - } - return MurmurHash3BinaryHash.hash(valueBuffer.getByteArray(), valueBuffer.getStartOffset(), - valueBuffer.getLength(), seed); + value = FloatPointable.getFloat(bytes, offset + 1); + return MurmurHash3BinaryHash.hash(Double.doubleToLongBits(value), SERIALIZED_DOUBLE_TYPE_TAG, seed); + case DOUBLE: + return MurmurHash3BinaryHash.hash(DoublePointable.getLongBits(bytes, offset + 1), + SERIALIZED_DOUBLE_TYPE_TAG, seed); case ARRAY: try { return hashArray(type, bytes, offset, length); @@ -128,7 +124,6 @@ } case OBJECT: return hashRecord(type, bytes, offset, length); - case DOUBLE: default: return MurmurHash3BinaryHash.hash(bytes, offset, length, seed); } diff --git a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/accessors/MurmurHash3BinaryHash.java b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/accessors/MurmurHash3BinaryHash.java index b720fbc..a02c0d4 100644 --- a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/accessors/MurmurHash3BinaryHash.java +++ b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/accessors/MurmurHash3BinaryHash.java @@ -73,4 +73,48 @@ h ^= (h >>> 16); return h; } + + /** + * Hashes a number represented by the "bits" argument. The "length" is equal to 9 since the hash calculation will + * include one extra byte together with they 8 bytes of the long. + * + * @param bits the bits of the number. + * @param beginningByte the extra byte to be included in the hash calculation. + * @param seed seed. + * @return hash of a number. + */ + public static int hash(long bits, byte beginningByte, int seed) { + int hash = seed; + int length = 9; + int remain = length; + byte firstByte = beginningByte; + int numShifts = 56; + while (remain >= 4) { + int k = (firstByte & 0xff) | (((byte) (bits >>> numShifts) & 0xff) << 8) + | (((byte) (bits >>> (numShifts - 8)) & 0xff) << 16) + | (((byte) (bits >>> (numShifts - 16)) & 0xff) << 24); + k *= C1; + k = Integer.rotateLeft(k, 15); + k *= C2; + hash ^= k; + hash = Integer.rotateLeft(hash, 13); + hash = hash * C3 + C4; + firstByte = (byte) (bits >>> 32); + numShifts = 24; + remain -= 4; + } + int k = 0; + k ^= ((byte) bits & 0xff); + k *= C1; + k = Integer.rotateLeft(k, 15); + k *= C2; + hash ^= k; + hash ^= length; + hash ^= (hash >>> 16); + hash *= C5; + hash ^= (hash >>> 13); + hash *= C6; + hash ^= (hash >>> 16); + return hash; + } } -- To view, visit https://asterix-gerrit.ics.uci.edu/3286 To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Ibdf68d90240a3fcddc8cc2ae5ee4a7aa53070438 Gerrit-PatchSet: 1 Gerrit-Project: asterixdb Gerrit-Branch: master Gerrit-Owner: Ali Alsuliman <[email protected]>
