[ https://issues.apache.org/jira/browse/DRILL-4237?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15205196#comment-15205196 ]
ASF GitHub Bot commented on DRILL-4237: --------------------------------------- Github user amansinha100 commented on a diff in the pull request: https://github.com/apache/drill/pull/430#discussion_r56901020 --- Diff: exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/MurmurHash3.java --- @@ -0,0 +1,264 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + + +package org.apache.drill.exec.expr.fn.impl; + +import io.netty.buffer.DrillBuf; +import io.netty.util.internal.PlatformDependent; + + +/** + * + * MurmurHash3 was written by Austin Appleby, and is placed in the public + * domain. + * See http://smhasher.googlecode.com/svn/trunk/MurmurHash3.cpp + * MurmurHash3_x64_128 + * MurmurHash3_x86_32 + */ +public final class MurmurHash3 extends DrillHash{ + + public final class LongPair { + public long val1; + public long val2; + } + + public static final long fmix64(long k) { + k ^= k >>> 33; + k *= 0xff51afd7ed558ccdL; + k ^= k >>> 33; + k *= 0xc4ceb9fe1a85ec53L; + k ^= k >>> 33; + return k; + } + + + + /* + Take 64 bit of murmur3_128's output + */ + public static long murmur3_64(long bStart, long bEnd, DrillBuf buffer, int seed) { + + long h1 = seed & 0x00000000FFFFFFFFL; + long h2 = seed & 0x00000000FFFFFFFFL; + + final long c1 = 0x87c37b91114253d5L; + final long c2 = 0x4cf5ad432745937fL; + long start = buffer.memoryAddress() + bStart; + long end = buffer.memoryAddress() + bEnd; + long length = end - start; + long roundedEnd = start + ( length & 0xFFFFFFF0); // round down to 16 byte block + for (long i=start; i<roundedEnd; i+=16) { + long k1 = getLongLittleEndian(i); + long k2 = getLongLittleEndian(i+8); + k1 *= c1; k1 = Long.rotateLeft(k1,31); k1 *= c2; h1 ^= k1; + h1 = Long.rotateLeft(h1,27); h1 += h2; h1 = h1*5+0x52dce729; + k2 *= c2; k2 = Long.rotateLeft(k2,33); k2 *= c1; h2 ^= k2; + h2 = Long.rotateLeft(h2,31); h2 += h1; h2 = h2*5+0x38495ab5; + } + + long k1 = 0; + long k2 = 0; + + // tail + switch ((int)length & 15) { + case 15: k2 = (PlatformDependent.getByte(roundedEnd+14) & 0xffL) << 48; + case 14: k2 |= (PlatformDependent.getByte(roundedEnd+13) & 0xffL) << 40; + case 13: k2 |= (PlatformDependent.getByte(roundedEnd+12) & 0xffL) << 32; + case 12: k2 |= (PlatformDependent.getByte(roundedEnd+11) & 0xffL) << 24; + case 11: k2 |= (PlatformDependent.getByte(roundedEnd+10) & 0xffL) << 16; + case 10: k2 |= (PlatformDependent.getByte(roundedEnd+ 9) & 0xffL) << 8; + case 9: k2 |= (PlatformDependent.getByte(roundedEnd+ 8) & 0xffL); + k2 *= c2; k2 = Long.rotateLeft(k2, 33); k2 *= c1; h2 ^= k2; + case 8: k1 = (long)PlatformDependent.getByte(roundedEnd+7) << 56; + case 7: k1 |= (PlatformDependent.getByte(roundedEnd+6) & 0xffL) << 48; + case 6: k1 |= (PlatformDependent.getByte(roundedEnd+5) & 0xffL) << 40; + case 5: k1 |= (PlatformDependent.getByte(roundedEnd+4) & 0xffL) << 32; + case 4: k1 |= (PlatformDependent.getByte(roundedEnd+3) & 0xffL) << 24; + case 3: k1 |= (PlatformDependent.getByte(roundedEnd+2) & 0xffL) << 16; + case 2: k1 |= (PlatformDependent.getByte(roundedEnd+1) & 0xffL) << 8; + case 1: k1 |= (PlatformDependent.getByte(roundedEnd ) & 0xffL); + k1 *= c1; k1 = Long.rotateLeft(k1,31); k1 *= c2; h1 ^= k1; + } + + h1 ^= length; h2 ^= length; + + h1 += h2; + h2 += h1; + + h1 = fmix64(h1); + h2 = fmix64(h2); + + h1 += h2; + h2 += h1; + // murmur3_128 should return 128 bit (h1,h2), now we return only 64bits, + return h1; + } + + public static long murmur3_64(long val, int seed) { + + long h1 = seed & 0x00000000FFFFFFFFL; + long h2 = seed & 0x00000000FFFFFFFFL; + + final long c1 = 0x87c37b91114253d5L; + final long c2 = 0x4cf5ad432745937fL; + + int length = 8; + long k1 = 0; + long k2 = 0; + k1 = val; + k1 *= c1; k1 = Long.rotateLeft(k1,31); k1 *= c2; h1 ^= k1; + + h1 ^= length; h2 ^= length; + + h1 += h2; + h2 += h1; + + h1 = fmix64(h1); + h2 = fmix64(h2); + + h1 += h2; + h2 += h1; + // murmur3_128 should return 128 bit (h1,h2), now we return only 64bits, + return h1; + + } + + public static int murmur3_32(int bStart, int bEnd, DrillBuf buffer, int seed) { + + final long c1 = 0xcc9e2d51L; + final long c2 = 0x1b873593L; + long start = buffer.memoryAddress() + bStart; + long end = buffer.memoryAddress() + bEnd; + long length = end - start; --- End diff -- Why not use (bEnd - bStart) which would be subtracting smaller numbers..(instead of adding memoryAddress first and then subtracting) > Skew in hash distribution > ------------------------- > > Key: DRILL-4237 > URL: https://issues.apache.org/jira/browse/DRILL-4237 > Project: Apache Drill > Issue Type: Bug > Components: Functions - Drill > Affects Versions: 1.4.0 > Reporter: Aman Sinha > Assignee: Chunhui Shi > > Apparently, the fix in DRILL-4119 did not fully resolve the data skew issue. > It worked fine on the smaller sample of the data set but on another sample of > the same data set, it still produces skewed values - see below the hash > values which are all odd numbers. > {noformat} > 0: jdbc:drill:zk=local> select columns[0], hash32(columns[0]) from `test.csv` > limit 10; > +-----------------------------------+--------------+ > | EXPR$0 | EXPR$1 | > +-----------------------------------+--------------+ > | f71aaddec3316ae18d43cb1467e88a41 | 1506011089 | > | 3f3a13bb45618542b5ac9d9536704d3a | 1105719049 | > | 6935afd0c693c67bba482cedb7a2919b | -18137557 | > | ca2a938d6d7e57bda40501578f98c2a8 | -1372666789 | > | fab7f08402c8836563b0a5c94dbf0aec | -1930778239 | > | 9eb4620dcb68a84d17209da279236431 | -970026001 | > | 16eed4a4e801b98550b4ff504242961e | 356133757 | > | a46f7935fea578ce61d8dd45bfbc2b3d | -94010449 | > | 7fdf5344536080c15deb2b5a2975a2b7 | -141361507 | > | b82560a06e2e51b461c9fe134a8211bd | -375376717 | > +-----------------------------------+--------------+ > {noformat} > This indicates an underlying issue with the XXHash64 java implementation, > which is Drill's implementation of the C version. One of the key difference > as pointed out by [~jnadeau] was the use of unsigned int64 in the C version > compared to the Java version which uses (signed) long. I created an XXHash > version using com.google.common.primitives.UnsignedLong. However, > UnsignedLong does not have bit-wise operations that are needed for XXHash > such as rotateLeft(), XOR etc. One could write wrappers for these but at > this point, the question is: should we think of an alternative hash function > ? > The alternative approach could be the murmur hash for numeric data types that > we were using earlier and the Mahout version of hash function for string > types > (https://github.com/apache/drill/blob/master/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/HashHelper.java#L28). > As a test, I reverted to this function and was getting good hash > distribution for the test data. > I could not find any performance comparisons of our perf tests (TPC-H or DS) > with the original and newer (XXHash) hash functions. If performance is > comparable, should we revert to the original function ? > As an aside, I would like to remove the hash64 versions of the functions > since these are not used anywhere. -- This message was sent by Atlassian JIRA (v6.3.4#6332)