[ 
https://issues.apache.org/jira/browse/DRILL-4237?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15205196#comment-15205196
 ] 

ASF GitHub Bot commented on DRILL-4237:
---------------------------------------

Github user amansinha100 commented on a diff in the pull request:

    https://github.com/apache/drill/pull/430#discussion_r56901020
  
    --- Diff: 
exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/MurmurHash3.java
 ---
    @@ -0,0 +1,264 @@
    +/**
    + * Licensed to the Apache Software Foundation (ASF) under one
    + * or more contributor license agreements.  See the NOTICE file
    + * distributed with this work for additional information
    + * regarding copyright ownership.  The ASF licenses this file
    + * to you under the Apache License, Version 2.0 (the
    + * "License"); you may not use this file except in compliance
    + * with the License.  You may obtain a copy of the License at
    + *
    + * http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +
    +
    +package org.apache.drill.exec.expr.fn.impl;
    +
    +import io.netty.buffer.DrillBuf;
    +import io.netty.util.internal.PlatformDependent;
    +
    +
    +/**
    + *
    + * MurmurHash3 was written by Austin Appleby, and is placed in the public
    + * domain.
    + * See http://smhasher.googlecode.com/svn/trunk/MurmurHash3.cpp
    + * MurmurHash3_x64_128
    + * MurmurHash3_x86_32
    + */
    +public final class MurmurHash3 extends DrillHash{
    +
    +  public final class LongPair {
    +    public long val1;
    +    public long val2;
    +  }
    +
    +  public static final long fmix64(long k) {
    +    k ^= k >>> 33;
    +    k *= 0xff51afd7ed558ccdL;
    +    k ^= k >>> 33;
    +    k *= 0xc4ceb9fe1a85ec53L;
    +    k ^= k >>> 33;
    +    return k;
    +  }
    +
    +
    +
    +  /*
    +  Take 64 bit of murmur3_128's output
    +   */
    +  public static long murmur3_64(long bStart, long bEnd, DrillBuf buffer, 
int seed) {
    +
    +    long h1 = seed & 0x00000000FFFFFFFFL;
    +    long h2 = seed & 0x00000000FFFFFFFFL;
    +
    +    final long c1 = 0x87c37b91114253d5L;
    +    final long c2 = 0x4cf5ad432745937fL;
    +    long start = buffer.memoryAddress() + bStart;
    +    long end = buffer.memoryAddress() + bEnd;
    +    long length = end - start;
    +    long roundedEnd = start + ( length & 0xFFFFFFF0);  // round down to 16 
byte block
    +    for (long i=start; i<roundedEnd; i+=16) {
    +      long k1 = getLongLittleEndian(i);
    +      long k2 = getLongLittleEndian(i+8);
    +      k1 *= c1; k1  = Long.rotateLeft(k1,31); k1 *= c2; h1 ^= k1;
    +      h1 = Long.rotateLeft(h1,27); h1 += h2; h1 = h1*5+0x52dce729;
    +      k2 *= c2; k2  = Long.rotateLeft(k2,33); k2 *= c1; h2 ^= k2;
    +      h2 = Long.rotateLeft(h2,31); h2 += h1; h2 = h2*5+0x38495ab5;
    +    }
    +
    +    long k1 = 0;
    +    long k2 = 0;
    +
    +    // tail
    +    switch ((int)length & 15) {
    +      case 15: k2  = (PlatformDependent.getByte(roundedEnd+14) & 0xffL) << 
48;
    +      case 14: k2 |= (PlatformDependent.getByte(roundedEnd+13) & 0xffL) << 
40;
    +      case 13: k2 |= (PlatformDependent.getByte(roundedEnd+12) & 0xffL) << 
32;
    +      case 12: k2 |= (PlatformDependent.getByte(roundedEnd+11) & 0xffL) << 
24;
    +      case 11: k2 |= (PlatformDependent.getByte(roundedEnd+10) & 0xffL) << 
16;
    +      case 10: k2 |= (PlatformDependent.getByte(roundedEnd+ 9) & 0xffL) << 
8;
    +      case  9: k2 |= (PlatformDependent.getByte(roundedEnd+ 8) & 0xffL);
    +        k2 *= c2; k2  = Long.rotateLeft(k2, 33); k2 *= c1; h2 ^= k2;
    +      case  8: k1  = (long)PlatformDependent.getByte(roundedEnd+7) << 56;
    +      case  7: k1 |= (PlatformDependent.getByte(roundedEnd+6) & 0xffL) << 
48;
    +      case  6: k1 |= (PlatformDependent.getByte(roundedEnd+5) & 0xffL) << 
40;
    +      case  5: k1 |= (PlatformDependent.getByte(roundedEnd+4) & 0xffL) << 
32;
    +      case  4: k1 |= (PlatformDependent.getByte(roundedEnd+3) & 0xffL) << 
24;
    +      case  3: k1 |= (PlatformDependent.getByte(roundedEnd+2) & 0xffL) << 
16;
    +      case  2: k1 |= (PlatformDependent.getByte(roundedEnd+1) & 0xffL) << 
8;
    +      case  1: k1 |= (PlatformDependent.getByte(roundedEnd ) & 0xffL);
    +        k1 *= c1; k1  = Long.rotateLeft(k1,31); k1 *= c2; h1 ^= k1;
    +    }
    +
    +    h1 ^= length; h2 ^= length;
    +
    +    h1 += h2;
    +    h2 += h1;
    +
    +    h1 = fmix64(h1);
    +    h2 = fmix64(h2);
    +
    +    h1 += h2;
    +    h2 += h1;
    +    // murmur3_128 should return 128 bit (h1,h2), now we return only 
64bits,
    +    return h1;
    +  }
    +
    +  public static long murmur3_64(long val, int seed) {
    +
    +    long h1 = seed & 0x00000000FFFFFFFFL;
    +    long h2 = seed & 0x00000000FFFFFFFFL;
    +
    +    final long c1 = 0x87c37b91114253d5L;
    +    final long c2 = 0x4cf5ad432745937fL;
    +
    +    int length = 8;
    +    long k1 = 0;
    +    long k2 = 0;
    +    k1 = val;
    +    k1 *= c1; k1  = Long.rotateLeft(k1,31); k1 *= c2; h1 ^= k1;
    +
    +    h1 ^= length; h2 ^= length;
    +
    +    h1 += h2;
    +    h2 += h1;
    +
    +    h1 = fmix64(h1);
    +    h2 = fmix64(h2);
    +
    +    h1 += h2;
    +    h2 += h1;
    +    // murmur3_128 should return 128 bit (h1,h2), now we return only 
64bits,
    +    return h1;
    +
    +  }
    +
    +  public static int murmur3_32(int bStart, int bEnd, DrillBuf buffer, int 
seed) {
    +
    +    final long c1 = 0xcc9e2d51L;
    +    final long c2 = 0x1b873593L;
    +    long start = buffer.memoryAddress() + bStart;
    +    long end = buffer.memoryAddress() + bEnd;
    +    long length = end - start;
    --- End diff --
    
    Why not use (bEnd - bStart) which would be subtracting smaller 
numbers..(instead of adding memoryAddress first and then subtracting)


> Skew in hash distribution
> -------------------------
>
>                 Key: DRILL-4237
>                 URL: https://issues.apache.org/jira/browse/DRILL-4237
>             Project: Apache Drill
>          Issue Type: Bug
>          Components: Functions - Drill
>    Affects Versions: 1.4.0
>            Reporter: Aman Sinha
>            Assignee: Chunhui Shi
>
> Apparently, the fix in DRILL-4119 did not fully resolve the data skew issue.  
> It worked fine on the smaller sample of the data set but on another sample of 
> the same data set, it still produces skewed values - see below the hash 
> values which are all odd numbers. 
> {noformat}
> 0: jdbc:drill:zk=local> select columns[0], hash32(columns[0]) from `test.csv` 
> limit 10;
> +-----------------------------------+--------------+
> |              EXPR$0               |    EXPR$1    |
> +-----------------------------------+--------------+
> | f71aaddec3316ae18d43cb1467e88a41  | 1506011089   |
> | 3f3a13bb45618542b5ac9d9536704d3a  | 1105719049   |
> | 6935afd0c693c67bba482cedb7a2919b  | -18137557    |
> | ca2a938d6d7e57bda40501578f98c2a8  | -1372666789  |
> | fab7f08402c8836563b0a5c94dbf0aec  | -1930778239  |
> | 9eb4620dcb68a84d17209da279236431  | -970026001   |
> | 16eed4a4e801b98550b4ff504242961e  | 356133757    |
> | a46f7935fea578ce61d8dd45bfbc2b3d  | -94010449    |
> | 7fdf5344536080c15deb2b5a2975a2b7  | -141361507   |
> | b82560a06e2e51b461c9fe134a8211bd  | -375376717   |
> +-----------------------------------+--------------+
> {noformat}
> This indicates an underlying issue with the XXHash64 java implementation, 
> which is Drill's implementation of the C version.  One of the key difference 
> as pointed out by [~jnadeau] was the use of unsigned int64 in the C version 
> compared to the Java version which uses (signed) long.  I created an XXHash 
> version using com.google.common.primitives.UnsignedLong.  However, 
> UnsignedLong does not have bit-wise operations that are needed for XXHash 
> such as rotateLeft(),  XOR etc.  One could write wrappers for these but at 
> this point, the question is: should we think of an alternative hash function 
> ? 
> The alternative approach could be the murmur hash for numeric data types that 
> we were using earlier and the Mahout version of hash function for string 
> types 
> (https://github.com/apache/drill/blob/master/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/HashHelper.java#L28).
>   As a test, I reverted to this function and was getting good hash 
> distribution for the test data. 
> I could not find any performance comparisons of our perf tests (TPC-H or DS) 
> with the original and newer (XXHash) hash functions.  If performance is 
> comparable, should we revert to the original function ?  
> As an aside, I would like to remove the hash64 versions of the functions 
> since these are not used anywhere. 



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

Reply via email to