Repository: hive Updated Branches: refs/heads/master df12aec50 -> ffb7e043e
HIVE-18866 : Semijoin and analyze: Implement a Long -> Hash64 vector fast-path (Gopal Vijayaraghavan, Sergey Shelukhin, reviewed by Prasanth Jayachandran) Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/5df1eb3f Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/5df1eb3f Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/5df1eb3f Branch: refs/heads/master Commit: 5df1eb3f16111bbf70ae8d17b915a4bffb67594b Parents: 43e2f96 Author: sergey <ser...@apache.org> Authored: Tue May 22 12:06:12 2018 -0700 Committer: sergey <ser...@apache.org> Committed: Tue May 22 12:14:11 2018 -0700 ---------------------------------------------------------------------- itests/hive-jmh/pom.xml | 5 + .../hive/benchmark/hash/Murmur3Bench.java | 107 +++++++++++++++++++ .../hadoop/hive/common/ndv/hll/HyperLogLog.java | 21 ++-- .../apache/hive/common/util/BloomKFilter.java | 4 +- .../org/apache/hive/common/util/Murmur3.java | 46 ++++++++ .../apache/hive/common/util/TestMurmur3.java | 29 ++++- 6 files changed, 193 insertions(+), 19 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/5df1eb3f/itests/hive-jmh/pom.xml ---------------------------------------------------------------------- diff --git a/itests/hive-jmh/pom.xml b/itests/hive-jmh/pom.xml index c0a6564..5eb3026 100644 --- a/itests/hive-jmh/pom.xml +++ b/itests/hive-jmh/pom.xml @@ -65,6 +65,11 @@ </dependency> <dependency> <groupId>org.apache.hive</groupId> + <artifactId>hive-storage-api</artifactId> + <version>2.7.0-SNAPSHOT</version> + </dependency> + <dependency> + <groupId>org.apache.hive</groupId> <artifactId>hive-exec</artifactId> <version>${project.version}</version> </dependency> http://git-wip-us.apache.org/repos/asf/hive/blob/5df1eb3f/itests/hive-jmh/src/main/java/org/apache/hive/benchmark/hash/Murmur3Bench.java ---------------------------------------------------------------------- diff --git a/itests/hive-jmh/src/main/java/org/apache/hive/benchmark/hash/Murmur3Bench.java b/itests/hive-jmh/src/main/java/org/apache/hive/benchmark/hash/Murmur3Bench.java new file mode 100644 index 0000000..cd85148 --- /dev/null +++ b/itests/hive-jmh/src/main/java/org/apache/hive/benchmark/hash/Murmur3Bench.java @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hive.benchmark.hash; + +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.util.concurrent.TimeUnit; + +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.expressions.FilterStringColLikeStringScalar; +import org.apache.hive.common.util.Murmur3; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.runner.Runner; +import org.openjdk.jmh.runner.RunnerException; +import org.openjdk.jmh.runner.options.Options; +import org.openjdk.jmh.runner.options.OptionsBuilder; + +/** + * This test measures the performance for vectorization. + * <p/> + * This test uses JMH framework for benchmarking. + * You may execute this benchmark tool using JMH command line in different ways: + * <p/> + * To use the settings shown in the main() function, use: + * $ java -cp target/benchmarks.jar org.apache.hive.benchmark.hash.Murmur3Bench + * <p/> + * To use the default settings used by JMH, use: + * $ java -jar target/benchmarks.jar org.apache.hive.benchmark.hash.Murmur3Bench + * <p/> + * To specify different parameters, use: + * - This command will use 10 warm-up iterations, 5 test iterations, and 2 forks. And it will + * display the Average Time (avgt) in Microseconds (us) + * - Benchmark mode. Available modes are: + * [Throughput/thrpt, AverageTime/avgt, SampleTime/sample, SingleShotTime/ss, All/all] + * - Output time unit. Available time units are: [m, s, ms, us, ns]. + * <p/> + * $ java -jar target/benchmarks.jar org.apache.hive.benchmark.hash.Murmur3Bench + * -wi 10 -i 5 -f 2 -bm avgt -tu us + */ +@State(Scope.Benchmark) +public class Murmur3Bench { + @BenchmarkMode(Mode.AverageTime) + @Fork(1) + @State(Scope.Thread) + @OutputTimeUnit(TimeUnit.MILLISECONDS) + public static class Hash64Bench { + + @Param({ "-1"}) //"123456789", "987654321", "1234", "4321", + long v; + + + + @Benchmark + @Warmup(iterations = 2, time = 2, timeUnit = TimeUnit.SECONDS) + @Measurement(iterations = 20, time = 2, timeUnit = TimeUnit.SECONDS) + public long longHash() { + long k = 0; + for (int i = 0; i < 4096; i++) { + k += Murmur3.hash64(v); + } + return k; + } + + @Benchmark + @Warmup(iterations = 2, time = 2, timeUnit = TimeUnit.SECONDS) + @Measurement(iterations = 20, time = 2, timeUnit = TimeUnit.SECONDS) + public long longBytesHash() { + ByteBuffer LONG_BUFFER = ByteBuffer.allocate(Long.BYTES); + long k = 0; + for (int i = 0; i < 4096; i++) { + LONG_BUFFER.putLong(0, v+i); + k += Murmur3.hash64(LONG_BUFFER.array()); + } + return k; + } + } + + public static void main(String[] args) throws RunnerException { + Options opt = new OptionsBuilder().include(".*" + Murmur3Bench.class.getSimpleName() + + ".*").build(); + new Runner(opt).run(); + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/hive/blob/5df1eb3f/standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HyperLogLog.java ---------------------------------------------------------------------- diff --git a/standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HyperLogLog.java b/standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HyperLogLog.java index 8bdb47b..07a93c6 100644 --- a/standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HyperLogLog.java +++ b/standalone-metastore/src/main/java/org/apache/hadoop/hive/common/ndv/hll/HyperLogLog.java @@ -62,9 +62,6 @@ public class HyperLogLog implements NumDistinctValueEstimator { private final static int DEFAULT_HASH_BITS = 64; private final static long HASH64_ZERO = Murmur3.hash64(new byte[] {0}); private final static long HASH64_ONE = Murmur3.hash64(new byte[] {1}); - private final static ByteBuffer SHORT_BUFFER = ByteBuffer.allocate(Short.BYTES); - private final static ByteBuffer INT_BUFFER = ByteBuffer.allocate(Integer.BYTES); - private final static ByteBuffer LONG_BUFFER = ByteBuffer.allocate(Long.BYTES); public enum EncodingType { SPARSE, DENSE @@ -212,33 +209,27 @@ public class HyperLogLog implements NumDistinctValueEstimator { } public void addShort(short val) { - SHORT_BUFFER.putShort(0, val); - add(Murmur3.hash64(SHORT_BUFFER.array())); + add(Murmur3.hash64(val)); } public void addInt(int val) { - INT_BUFFER.putInt(0, val); - add(Murmur3.hash64(INT_BUFFER.array())); + add(Murmur3.hash64(val)); } public void addLong(long val) { - LONG_BUFFER.putLong(0, val); - add(Murmur3.hash64(LONG_BUFFER.array())); + add(Murmur3.hash64(val)); } public void addFloat(float val) { - INT_BUFFER.putFloat(0, val); - add(Murmur3.hash64(INT_BUFFER.array())); + add(Murmur3.hash64(Float.floatToIntBits(val))); } public void addDouble(double val) { - LONG_BUFFER.putDouble(0, val); - add(Murmur3.hash64(LONG_BUFFER.array())); + add(Murmur3.hash64(Double.doubleToLongBits(val))); } public void addChar(char val) { - SHORT_BUFFER.putChar(0, val); - add(Murmur3.hash64(SHORT_BUFFER.array())); + add(Murmur3.hash64((short)val)); } /** http://git-wip-us.apache.org/repos/asf/hive/blob/5df1eb3f/storage-api/src/java/org/apache/hive/common/util/BloomKFilter.java ---------------------------------------------------------------------- diff --git a/storage-api/src/java/org/apache/hive/common/util/BloomKFilter.java b/storage-api/src/java/org/apache/hive/common/util/BloomKFilter.java index 6ccf5ab..5b1914d 100644 --- a/storage-api/src/java/org/apache/hive/common/util/BloomKFilter.java +++ b/storage-api/src/java/org/apache/hive/common/util/BloomKFilter.java @@ -156,7 +156,7 @@ public class BloomKFilter { public void addLong(long val) { // puts long in little endian order - addBytes(longToByteArrayLE(val)); + addHash(Murmur3.hash64(val)); } public void addFloat(float val) { @@ -239,7 +239,7 @@ public class BloomKFilter { } public boolean testLong(long val) { - return testBytes(longToByteArrayLE(val)); + return testHash(Murmur3.hash64(val)); } public boolean testFloat(float val) { http://git-wip-us.apache.org/repos/asf/hive/blob/5df1eb3f/storage-api/src/java/org/apache/hive/common/util/Murmur3.java ---------------------------------------------------------------------- diff --git a/storage-api/src/java/org/apache/hive/common/util/Murmur3.java b/storage-api/src/java/org/apache/hive/common/util/Murmur3.java index c896fa7..8aae28b 100644 --- a/storage-api/src/java/org/apache/hive/common/util/Murmur3.java +++ b/storage-api/src/java/org/apache/hive/common/util/Murmur3.java @@ -155,6 +155,52 @@ public class Murmur3 { return hash64(data, 0, data.length, DEFAULT_SEED); } + public static long hash64(long data) { + long hash = DEFAULT_SEED; + long k = Long.reverseBytes(data); + int length = Long.BYTES; + // mix functions + k *= C1; + k = Long.rotateLeft(k, R1); + k *= C2; + hash ^= k; + hash = Long.rotateLeft(hash, R2) * M + N1; + // finalization + hash ^= length; + hash = fmix64(hash); + return hash; + } + + public static long hash64(int data) { + long k1 = Integer.reverseBytes(data) & (-1L >>> 32); + int length = Integer.BYTES; + long hash = DEFAULT_SEED; + k1 *= C1; + k1 = Long.rotateLeft(k1, R1); + k1 *= C2; + hash ^= k1; + // finalization + hash ^= length; + hash = fmix64(hash); + return hash; + } + + public static long hash64(short data) { + long hash = DEFAULT_SEED; + long k1 = 0; + k1 ^= ((long) data & 0xff) << 8; + k1 ^= ((long)((data & 0xFF00) >> 8) & 0xff); + k1 *= C1; + k1 = Long.rotateLeft(k1, R1); + k1 *= C2; + hash ^= k1; + + // finalization + hash ^= Short.BYTES; + hash = fmix64(hash); + return hash; + } + public static long hash64(byte[] data, int offset, int length) { return hash64(data, offset, length, DEFAULT_SEED); } http://git-wip-us.apache.org/repos/asf/hive/blob/5df1eb3f/storage-api/src/test/org/apache/hive/common/util/TestMurmur3.java ---------------------------------------------------------------------- diff --git a/storage-api/src/test/org/apache/hive/common/util/TestMurmur3.java b/storage-api/src/test/org/apache/hive/common/util/TestMurmur3.java index f20366b..16955c1 100644 --- a/storage-api/src/test/org/apache/hive/common/util/TestMurmur3.java +++ b/storage-api/src/test/org/apache/hive/common/util/TestMurmur3.java @@ -18,7 +18,7 @@ package org.apache.hive.common.util; -import static org.junit.Assert.assertEquals; +import static org.junit.Assert.*; import org.apache.hive.common.util.Murmur3.IncrementalHash32; import com.google.common.hash.HashFunction; @@ -222,7 +222,32 @@ public class TestMurmur3 { assertEquals(gl2, m2); } } - + + @Test + public void test64() { + final int seed = 123, iters = 1000000; + ByteBuffer SHORT_BUFFER = ByteBuffer.allocate(Short.BYTES); + ByteBuffer INT_BUFFER = ByteBuffer.allocate(Integer.BYTES); + ByteBuffer LONG_BUFFER = ByteBuffer.allocate(Long.BYTES); + Random rdm = new Random(seed); + for (int i = 0; i < iters; ++i) { + long ln = rdm.nextLong(); + int in = rdm.nextInt(); + short sn = (short) (rdm.nextInt(2* Short.MAX_VALUE - 1) - Short.MAX_VALUE); + float fn = rdm.nextFloat(); + double dn = rdm.nextDouble(); + SHORT_BUFFER.putShort(0, sn); + assertEquals(Murmur3.hash64(SHORT_BUFFER.array()), Murmur3.hash64(sn)); + INT_BUFFER.putInt(0, in); + assertEquals(Murmur3.hash64(INT_BUFFER.array()), Murmur3.hash64(in)); + LONG_BUFFER.putLong(0, ln); + assertEquals(Murmur3.hash64(LONG_BUFFER.array()), Murmur3.hash64(ln)); + INT_BUFFER.putFloat(0, fn); + assertEquals(Murmur3.hash64(INT_BUFFER.array()), Murmur3.hash64(Float.floatToIntBits(fn))); + LONG_BUFFER.putDouble(0, dn); + assertEquals(Murmur3.hash64(LONG_BUFFER.array()), Murmur3.hash64(Double.doubleToLongBits(dn))); + } + } @Test public void testIncremental() {