Repository: orc Updated Branches: refs/heads/master 7118e968b -> 604dcc801
http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/storage-api/src/java/org/apache/hive/common/util/Murmur3.java ---------------------------------------------------------------------- diff --git a/java/storage-api/src/java/org/apache/hive/common/util/Murmur3.java b/java/storage-api/src/java/org/apache/hive/common/util/Murmur3.java deleted file mode 100644 index 88c3514..0000000 --- a/java/storage-api/src/java/org/apache/hive/common/util/Murmur3.java +++ /dev/null @@ -1,335 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hive.common.util; - -/** - * Murmur3 is successor to Murmur2 fast non-crytographic hash algorithms. - * - * Murmur3 32 and 128 bit variants. - * 32-bit Java port of https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp#94 - * 128-bit Java port of https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp#255 - * - * This is a public domain code with no copyrights. - * From homepage of MurmurHash (https://code.google.com/p/smhasher/), - * "All MurmurHash versions are public domain software, and the author disclaims all copyright - * to their code." - */ -public class Murmur3 { - // from 64-bit linear congruential generator - public static final long NULL_HASHCODE = 2862933555777941757L; - - // Constants for 32 bit variant - private static final int C1_32 = 0xcc9e2d51; - private static final int C2_32 = 0x1b873593; - private static final int R1_32 = 15; - private static final int R2_32 = 13; - private static final int M_32 = 5; - private static final int N_32 = 0xe6546b64; - - // Constants for 128 bit variant - private static final long C1 = 0x87c37b91114253d5L; - private static final long C2 = 0x4cf5ad432745937fL; - private static final int R1 = 31; - private static final int R2 = 27; - private static final int R3 = 33; - private static final int M = 5; - private static final int N1 = 0x52dce729; - private static final int N2 = 0x38495ab5; - - private static final int DEFAULT_SEED = 104729; - - /** - * Murmur3 32-bit variant. - * - * @param data - input byte array - * @return - hashcode - */ - public static int hash32(byte[] data) { - return hash32(data, data.length, DEFAULT_SEED); - } - - /** - * Murmur3 32-bit variant. - * - * @param data - input byte array - * @param length - length of array - * @param seed - seed. (default 0) - * @return - hashcode - */ - public static int hash32(byte[] data, int length, int seed) { - int hash = seed; - final int nblocks = length >> 2; - - // body - for (int i = 0; i < nblocks; i++) { - int i_4 = i << 2; - int k = (data[i_4] & 0xff) - | ((data[i_4 + 1] & 0xff) << 8) - | ((data[i_4 + 2] & 0xff) << 16) - | ((data[i_4 + 3] & 0xff) << 24); - - // mix functions - k *= C1_32; - k = Integer.rotateLeft(k, R1_32); - k *= C2_32; - hash ^= k; - hash = Integer.rotateLeft(hash, R2_32) * M_32 + N_32; - } - - // tail - int idx = nblocks << 2; - int k1 = 0; - switch (length - idx) { - case 3: - k1 ^= data[idx + 2] << 16; - case 2: - k1 ^= data[idx + 1] << 8; - case 1: - k1 ^= data[idx]; - - // mix functions - k1 *= C1_32; - k1 = Integer.rotateLeft(k1, R1_32); - k1 *= C2_32; - hash ^= k1; - } - - // finalization - hash ^= length; - hash ^= (hash >>> 16); - hash *= 0x85ebca6b; - hash ^= (hash >>> 13); - hash *= 0xc2b2ae35; - hash ^= (hash >>> 16); - - return hash; - } - - /** - * Murmur3 64-bit variant. This is essentially MSB 8 bytes of Murmur3 128-bit variant. - * - * @param data - input byte array - * @return - hashcode - */ - public static long hash64(byte[] data) { - return hash64(data, 0, data.length, DEFAULT_SEED); - } - - public static long hash64(byte[] data, int offset, int length) { - return hash64(data, offset, length, DEFAULT_SEED); - } - - /** - * Murmur3 64-bit variant. This is essentially MSB 8 bytes of Murmur3 128-bit variant. - * - * @param data - input byte array - * @param length - length of array - * @param seed - seed. (default is 0) - * @return - hashcode - */ - public static long hash64(byte[] data, int offset, int length, int seed) { - long hash = seed; - final int nblocks = length >> 3; - - // body - for (int i = 0; i < nblocks; i++) { - final int i8 = i << 3; - long k = ((long) data[offset + i8] & 0xff) - | (((long) data[offset + i8 + 1] & 0xff) << 8) - | (((long) data[offset + i8 + 2] & 0xff) << 16) - | (((long) data[offset + i8 + 3] & 0xff) << 24) - | (((long) data[offset + i8 + 4] & 0xff) << 32) - | (((long) data[offset + i8 + 5] & 0xff) << 40) - | (((long) data[offset + i8 + 6] & 0xff) << 48) - | (((long) data[offset + i8 + 7] & 0xff) << 56); - - // mix functions - k *= C1; - k = Long.rotateLeft(k, R1); - k *= C2; - hash ^= k; - hash = Long.rotateLeft(hash, R2) * M + N1; - } - - // tail - long k1 = 0; - int tailStart = nblocks << 3; - switch (length - tailStart) { - case 7: - k1 ^= ((long) data[offset + tailStart + 6] & 0xff) << 48; - case 6: - k1 ^= ((long) data[offset + tailStart + 5] & 0xff) << 40; - case 5: - k1 ^= ((long) data[offset + tailStart + 4] & 0xff) << 32; - case 4: - k1 ^= ((long) data[offset + tailStart + 3] & 0xff) << 24; - case 3: - k1 ^= ((long) data[offset + tailStart + 2] & 0xff) << 16; - case 2: - k1 ^= ((long) data[offset + tailStart + 1] & 0xff) << 8; - case 1: - k1 ^= ((long) data[offset + tailStart] & 0xff); - k1 *= C1; - k1 = Long.rotateLeft(k1, R1); - k1 *= C2; - hash ^= k1; - } - - // finalization - hash ^= length; - hash = fmix64(hash); - - return hash; - } - - /** - * Murmur3 128-bit variant. - * - * @param data - input byte array - * @return - hashcode (2 longs) - */ - public static long[] hash128(byte[] data) { - return hash128(data, 0, data.length, DEFAULT_SEED); - } - - /** - * Murmur3 128-bit variant. - * - * @param data - input byte array - * @param offset - the first element of array - * @param length - length of array - * @param seed - seed. (default is 0) - * @return - hashcode (2 longs) - */ - public static long[] hash128(byte[] data, int offset, int length, int seed) { - long h1 = seed; - long h2 = seed; - final int nblocks = length >> 4; - - // body - for (int i = 0; i < nblocks; i++) { - final int i16 = i << 4; - long k1 = ((long) data[offset + i16] & 0xff) - | (((long) data[offset + i16 + 1] & 0xff) << 8) - | (((long) data[offset + i16 + 2] & 0xff) << 16) - | (((long) data[offset + i16 + 3] & 0xff) << 24) - | (((long) data[offset + i16 + 4] & 0xff) << 32) - | (((long) data[offset + i16 + 5] & 0xff) << 40) - | (((long) data[offset + i16 + 6] & 0xff) << 48) - | (((long) data[offset + i16 + 7] & 0xff) << 56); - - long k2 = ((long) data[offset + i16 + 8] & 0xff) - | (((long) data[offset + i16 + 9] & 0xff) << 8) - | (((long) data[offset + i16 + 10] & 0xff) << 16) - | (((long) data[offset + i16 + 11] & 0xff) << 24) - | (((long) data[offset + i16 + 12] & 0xff) << 32) - | (((long) data[offset + i16 + 13] & 0xff) << 40) - | (((long) data[offset + i16 + 14] & 0xff) << 48) - | (((long) data[offset + i16 + 15] & 0xff) << 56); - - // mix functions for k1 - k1 *= C1; - k1 = Long.rotateLeft(k1, R1); - k1 *= C2; - h1 ^= k1; - h1 = Long.rotateLeft(h1, R2); - h1 += h2; - h1 = h1 * M + N1; - - // mix functions for k2 - k2 *= C2; - k2 = Long.rotateLeft(k2, R3); - k2 *= C1; - h2 ^= k2; - h2 = Long.rotateLeft(h2, R1); - h2 += h1; - h2 = h2 * M + N2; - } - - // tail - long k1 = 0; - long k2 = 0; - int tailStart = nblocks << 4; - switch (length - tailStart) { - case 15: - k2 ^= (long) (data[offset + tailStart + 14] & 0xff) << 48; - case 14: - k2 ^= (long) (data[offset + tailStart + 13] & 0xff) << 40; - case 13: - k2 ^= (long) (data[offset + tailStart + 12] & 0xff) << 32; - case 12: - k2 ^= (long) (data[offset + tailStart + 11] & 0xff) << 24; - case 11: - k2 ^= (long) (data[offset + tailStart + 10] & 0xff) << 16; - case 10: - k2 ^= (long) (data[offset + tailStart + 9] & 0xff) << 8; - case 9: - k2 ^= (long) (data[offset + tailStart + 8] & 0xff); - k2 *= C2; - k2 = Long.rotateLeft(k2, R3); - k2 *= C1; - h2 ^= k2; - - case 8: - k1 ^= (long) (data[offset + tailStart + 7] & 0xff) << 56; - case 7: - k1 ^= (long) (data[offset + tailStart + 6] & 0xff) << 48; - case 6: - k1 ^= (long) (data[offset + tailStart + 5] & 0xff) << 40; - case 5: - k1 ^= (long) (data[offset + tailStart + 4] & 0xff) << 32; - case 4: - k1 ^= (long) (data[offset + tailStart + 3] & 0xff) << 24; - case 3: - k1 ^= (long) (data[offset + tailStart + 2] & 0xff) << 16; - case 2: - k1 ^= (long) (data[offset + tailStart + 1] & 0xff) << 8; - case 1: - k1 ^= (long) (data[offset + tailStart] & 0xff); - k1 *= C1; - k1 = Long.rotateLeft(k1, R1); - k1 *= C2; - h1 ^= k1; - } - - // finalization - h1 ^= length; - h2 ^= length; - - h1 += h2; - h2 += h1; - - h1 = fmix64(h1); - h2 = fmix64(h2); - - h1 += h2; - h2 += h1; - - return new long[]{h1, h2}; - } - - private static long fmix64(long h) { - h ^= (h >>> 33); - h *= 0xff51afd7ed558ccdL; - h ^= (h >>> 33); - h *= 0xc4ceb9fe1a85ec53L; - h ^= (h >>> 33); - return h; - } -} http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/storage-api/src/java/org/apache/orc/util/Murmur3.java ---------------------------------------------------------------------- diff --git a/java/storage-api/src/java/org/apache/orc/util/Murmur3.java b/java/storage-api/src/java/org/apache/orc/util/Murmur3.java new file mode 100644 index 0000000..838681c --- /dev/null +++ b/java/storage-api/src/java/org/apache/orc/util/Murmur3.java @@ -0,0 +1,335 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.orc.util; + +/** + * Murmur3 is successor to Murmur2 fast non-crytographic hash algorithms. + * + * Murmur3 32 and 128 bit variants. + * 32-bit Java port of https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp#94 + * 128-bit Java port of https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp#255 + * + * This is a public domain code with no copyrights. + * From homepage of MurmurHash (https://code.google.com/p/smhasher/), + * "All MurmurHash versions are public domain software, and the author disclaims all copyright + * to their code." + */ +public class Murmur3 { + // from 64-bit linear congruential generator + public static final long NULL_HASHCODE = 2862933555777941757L; + + // Constants for 32 bit variant + private static final int C1_32 = 0xcc9e2d51; + private static final int C2_32 = 0x1b873593; + private static final int R1_32 = 15; + private static final int R2_32 = 13; + private static final int M_32 = 5; + private static final int N_32 = 0xe6546b64; + + // Constants for 128 bit variant + private static final long C1 = 0x87c37b91114253d5L; + private static final long C2 = 0x4cf5ad432745937fL; + private static final int R1 = 31; + private static final int R2 = 27; + private static final int R3 = 33; + private static final int M = 5; + private static final int N1 = 0x52dce729; + private static final int N2 = 0x38495ab5; + + private static final int DEFAULT_SEED = 104729; + + /** + * Murmur3 32-bit variant. + * + * @param data - input byte array + * @return - hashcode + */ + public static int hash32(byte[] data) { + return hash32(data, data.length, DEFAULT_SEED); + } + + /** + * Murmur3 32-bit variant. + * + * @param data - input byte array + * @param length - length of array + * @param seed - seed. (default 0) + * @return - hashcode + */ + public static int hash32(byte[] data, int length, int seed) { + int hash = seed; + final int nblocks = length >> 2; + + // body + for (int i = 0; i < nblocks; i++) { + int i_4 = i << 2; + int k = (data[i_4] & 0xff) + | ((data[i_4 + 1] & 0xff) << 8) + | ((data[i_4 + 2] & 0xff) << 16) + | ((data[i_4 + 3] & 0xff) << 24); + + // mix functions + k *= C1_32; + k = Integer.rotateLeft(k, R1_32); + k *= C2_32; + hash ^= k; + hash = Integer.rotateLeft(hash, R2_32) * M_32 + N_32; + } + + // tail + int idx = nblocks << 2; + int k1 = 0; + switch (length - idx) { + case 3: + k1 ^= data[idx + 2] << 16; + case 2: + k1 ^= data[idx + 1] << 8; + case 1: + k1 ^= data[idx]; + + // mix functions + k1 *= C1_32; + k1 = Integer.rotateLeft(k1, R1_32); + k1 *= C2_32; + hash ^= k1; + } + + // finalization + hash ^= length; + hash ^= (hash >>> 16); + hash *= 0x85ebca6b; + hash ^= (hash >>> 13); + hash *= 0xc2b2ae35; + hash ^= (hash >>> 16); + + return hash; + } + + /** + * Murmur3 64-bit variant. This is essentially MSB 8 bytes of Murmur3 128-bit variant. + * + * @param data - input byte array + * @return - hashcode + */ + public static long hash64(byte[] data) { + return hash64(data, 0, data.length, DEFAULT_SEED); + } + + public static long hash64(byte[] data, int offset, int length) { + return hash64(data, offset, length, DEFAULT_SEED); + } + + /** + * Murmur3 64-bit variant. This is essentially MSB 8 bytes of Murmur3 128-bit variant. + * + * @param data - input byte array + * @param length - length of array + * @param seed - seed. (default is 0) + * @return - hashcode + */ + public static long hash64(byte[] data, int offset, int length, int seed) { + long hash = seed; + final int nblocks = length >> 3; + + // body + for (int i = 0; i < nblocks; i++) { + final int i8 = i << 3; + long k = ((long) data[offset + i8] & 0xff) + | (((long) data[offset + i8 + 1] & 0xff) << 8) + | (((long) data[offset + i8 + 2] & 0xff) << 16) + | (((long) data[offset + i8 + 3] & 0xff) << 24) + | (((long) data[offset + i8 + 4] & 0xff) << 32) + | (((long) data[offset + i8 + 5] & 0xff) << 40) + | (((long) data[offset + i8 + 6] & 0xff) << 48) + | (((long) data[offset + i8 + 7] & 0xff) << 56); + + // mix functions + k *= C1; + k = Long.rotateLeft(k, R1); + k *= C2; + hash ^= k; + hash = Long.rotateLeft(hash, R2) * M + N1; + } + + // tail + long k1 = 0; + int tailStart = nblocks << 3; + switch (length - tailStart) { + case 7: + k1 ^= ((long) data[offset + tailStart + 6] & 0xff) << 48; + case 6: + k1 ^= ((long) data[offset + tailStart + 5] & 0xff) << 40; + case 5: + k1 ^= ((long) data[offset + tailStart + 4] & 0xff) << 32; + case 4: + k1 ^= ((long) data[offset + tailStart + 3] & 0xff) << 24; + case 3: + k1 ^= ((long) data[offset + tailStart + 2] & 0xff) << 16; + case 2: + k1 ^= ((long) data[offset + tailStart + 1] & 0xff) << 8; + case 1: + k1 ^= ((long) data[offset + tailStart] & 0xff); + k1 *= C1; + k1 = Long.rotateLeft(k1, R1); + k1 *= C2; + hash ^= k1; + } + + // finalization + hash ^= length; + hash = fmix64(hash); + + return hash; + } + + /** + * Murmur3 128-bit variant. + * + * @param data - input byte array + * @return - hashcode (2 longs) + */ + public static long[] hash128(byte[] data) { + return hash128(data, 0, data.length, DEFAULT_SEED); + } + + /** + * Murmur3 128-bit variant. + * + * @param data - input byte array + * @param offset - the first element of array + * @param length - length of array + * @param seed - seed. (default is 0) + * @return - hashcode (2 longs) + */ + public static long[] hash128(byte[] data, int offset, int length, int seed) { + long h1 = seed; + long h2 = seed; + final int nblocks = length >> 4; + + // body + for (int i = 0; i < nblocks; i++) { + final int i16 = i << 4; + long k1 = ((long) data[offset + i16] & 0xff) + | (((long) data[offset + i16 + 1] & 0xff) << 8) + | (((long) data[offset + i16 + 2] & 0xff) << 16) + | (((long) data[offset + i16 + 3] & 0xff) << 24) + | (((long) data[offset + i16 + 4] & 0xff) << 32) + | (((long) data[offset + i16 + 5] & 0xff) << 40) + | (((long) data[offset + i16 + 6] & 0xff) << 48) + | (((long) data[offset + i16 + 7] & 0xff) << 56); + + long k2 = ((long) data[offset + i16 + 8] & 0xff) + | (((long) data[offset + i16 + 9] & 0xff) << 8) + | (((long) data[offset + i16 + 10] & 0xff) << 16) + | (((long) data[offset + i16 + 11] & 0xff) << 24) + | (((long) data[offset + i16 + 12] & 0xff) << 32) + | (((long) data[offset + i16 + 13] & 0xff) << 40) + | (((long) data[offset + i16 + 14] & 0xff) << 48) + | (((long) data[offset + i16 + 15] & 0xff) << 56); + + // mix functions for k1 + k1 *= C1; + k1 = Long.rotateLeft(k1, R1); + k1 *= C2; + h1 ^= k1; + h1 = Long.rotateLeft(h1, R2); + h1 += h2; + h1 = h1 * M + N1; + + // mix functions for k2 + k2 *= C2; + k2 = Long.rotateLeft(k2, R3); + k2 *= C1; + h2 ^= k2; + h2 = Long.rotateLeft(h2, R1); + h2 += h1; + h2 = h2 * M + N2; + } + + // tail + long k1 = 0; + long k2 = 0; + int tailStart = nblocks << 4; + switch (length - tailStart) { + case 15: + k2 ^= (long) (data[offset + tailStart + 14] & 0xff) << 48; + case 14: + k2 ^= (long) (data[offset + tailStart + 13] & 0xff) << 40; + case 13: + k2 ^= (long) (data[offset + tailStart + 12] & 0xff) << 32; + case 12: + k2 ^= (long) (data[offset + tailStart + 11] & 0xff) << 24; + case 11: + k2 ^= (long) (data[offset + tailStart + 10] & 0xff) << 16; + case 10: + k2 ^= (long) (data[offset + tailStart + 9] & 0xff) << 8; + case 9: + k2 ^= (long) (data[offset + tailStart + 8] & 0xff); + k2 *= C2; + k2 = Long.rotateLeft(k2, R3); + k2 *= C1; + h2 ^= k2; + + case 8: + k1 ^= (long) (data[offset + tailStart + 7] & 0xff) << 56; + case 7: + k1 ^= (long) (data[offset + tailStart + 6] & 0xff) << 48; + case 6: + k1 ^= (long) (data[offset + tailStart + 5] & 0xff) << 40; + case 5: + k1 ^= (long) (data[offset + tailStart + 4] & 0xff) << 32; + case 4: + k1 ^= (long) (data[offset + tailStart + 3] & 0xff) << 24; + case 3: + k1 ^= (long) (data[offset + tailStart + 2] & 0xff) << 16; + case 2: + k1 ^= (long) (data[offset + tailStart + 1] & 0xff) << 8; + case 1: + k1 ^= (long) (data[offset + tailStart] & 0xff); + k1 *= C1; + k1 = Long.rotateLeft(k1, R1); + k1 *= C2; + h1 ^= k1; + } + + // finalization + h1 ^= length; + h2 ^= length; + + h1 += h2; + h2 += h1; + + h1 = fmix64(h1); + h2 = fmix64(h2); + + h1 += h2; + h2 += h1; + + return new long[]{h1, h2}; + } + + private static long fmix64(long h) { + h ^= (h >>> 33); + h *= 0xff51afd7ed558ccdL; + h ^= (h >>> 33); + h *= 0xc4ceb9fe1a85ec53L; + h ^= (h >>> 33); + return h; + } +} http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/storage-api/src/test/org/apache/hive/common/util/TestMurmur3.java ---------------------------------------------------------------------- diff --git a/java/storage-api/src/test/org/apache/hive/common/util/TestMurmur3.java b/java/storage-api/src/test/org/apache/hive/common/util/TestMurmur3.java deleted file mode 100644 index 5facc7c..0000000 --- a/java/storage-api/src/test/org/apache/hive/common/util/TestMurmur3.java +++ /dev/null @@ -1,224 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hive.common.util; - -import static org.junit.Assert.assertEquals; - -import com.google.common.hash.HashFunction; -import com.google.common.hash.Hashing; - -import org.junit.Test; - -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.util.Arrays; -import java.util.Random; - -/** - * Tests for Murmur3 variants. - */ -public class TestMurmur3 { - - @Test - public void testHashCodesM3_32_string() { - String key = "test"; - int seed = 123; - HashFunction hf = Hashing.murmur3_32(seed); - int hc1 = hf.hashBytes(key.getBytes()).asInt(); - int hc2 = Murmur3.hash32(key.getBytes(), key.getBytes().length, seed); - assertEquals(hc1, hc2); - - key = "testkey"; - hc1 = hf.hashBytes(key.getBytes()).asInt(); - hc2 = Murmur3.hash32(key.getBytes(), key.getBytes().length, seed); - assertEquals(hc1, hc2); - } - - @Test - public void testHashCodesM3_32_ints() { - int seed = 123; - Random rand = new Random(seed); - HashFunction hf = Hashing.murmur3_32(seed); - for (int i = 0; i < 1000; i++) { - int val = rand.nextInt(); - byte[] data = ByteBuffer.allocate(4).putInt(val).array(); - int hc1 = hf.hashBytes(data).asInt(); - int hc2 = Murmur3.hash32(data, data.length, seed); - assertEquals(hc1, hc2); - } - } - - @Test - public void testHashCodesM3_32_longs() { - int seed = 123; - Random rand = new Random(seed); - HashFunction hf = Hashing.murmur3_32(seed); - for (int i = 0; i < 1000; i++) { - long val = rand.nextLong(); - byte[] data = ByteBuffer.allocate(8).putLong(val).array(); - int hc1 = hf.hashBytes(data).asInt(); - int hc2 = Murmur3.hash32(data, data.length, seed); - assertEquals(hc1, hc2); - } - } - - @Test - public void testHashCodesM3_32_double() { - int seed = 123; - Random rand = new Random(seed); - HashFunction hf = Hashing.murmur3_32(seed); - for (int i = 0; i < 1000; i++) { - double val = rand.nextDouble(); - byte[] data = ByteBuffer.allocate(8).putDouble(val).array(); - int hc1 = hf.hashBytes(data).asInt(); - int hc2 = Murmur3.hash32(data, data.length, seed); - assertEquals(hc1, hc2); - } - } - - @Test - public void testHashCodesM3_128_string() { - String key = "test"; - int seed = 123; - HashFunction hf = Hashing.murmur3_128(seed); - // guava stores the hashcodes in little endian order - ByteBuffer buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN); - buf.put(hf.hashBytes(key.getBytes()).asBytes()); - buf.flip(); - long gl1 = buf.getLong(); - long gl2 = buf.getLong(8); - long[] hc = Murmur3.hash128(key.getBytes(), 0, key.getBytes().length, seed); - long m1 = hc[0]; - long m2 = hc[1]; - assertEquals(gl1, m1); - assertEquals(gl2, m2); - - key = "testkey128_testkey128"; - buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN); - buf.put(hf.hashBytes(key.getBytes()).asBytes()); - buf.flip(); - gl1 = buf.getLong(); - gl2 = buf.getLong(8); - byte[] keyBytes = key.getBytes(); - hc = Murmur3.hash128(keyBytes, 0, keyBytes.length, seed); - m1 = hc[0]; - m2 = hc[1]; - assertEquals(gl1, m1); - assertEquals(gl2, m2); - - byte[] offsetKeyBytes = new byte[keyBytes.length + 35]; - Arrays.fill(offsetKeyBytes, (byte) -1); - System.arraycopy(keyBytes, 0, offsetKeyBytes, 35, keyBytes.length); - hc = Murmur3.hash128(offsetKeyBytes, 35, keyBytes.length, seed); - assertEquals(gl1, hc[0]); - assertEquals(gl2, hc[1]); - } - - @Test - public void testHashCodeM3_64() { - byte[] origin = ("It was the best of times, it was the worst of times," + - " it was the age of wisdom, it was the age of foolishness," + - " it was the epoch of belief, it was the epoch of incredulity," + - " it was the season of Light, it was the season of Darkness," + - " it was the spring of hope, it was the winter of despair," + - " we had everything before us, we had nothing before us," + - " we were all going direct to Heaven," + - " we were all going direct the other way.").getBytes(); - long hash = Murmur3.hash64(origin, 0, origin.length); - assertEquals(305830725663368540L, hash); - - byte[] originOffset = new byte[origin.length + 150]; - Arrays.fill(originOffset, (byte) 123); - System.arraycopy(origin, 0, originOffset, 150, origin.length); - hash = Murmur3.hash64(originOffset, 150, origin.length); - assertEquals(305830725663368540L, hash); - } - - @Test - public void testHashCodesM3_128_ints() { - int seed = 123; - Random rand = new Random(seed); - HashFunction hf = Hashing.murmur3_128(seed); - for (int i = 0; i < 1000; i++) { - int val = rand.nextInt(); - byte[] data = ByteBuffer.allocate(4).putInt(val).array(); - // guava stores the hashcodes in little endian order - ByteBuffer buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN); - buf.put(hf.hashBytes(data).asBytes()); - buf.flip(); - long gl1 = buf.getLong(); - long gl2 = buf.getLong(8); - long[] hc = Murmur3.hash128(data, 0, data.length, seed); - long m1 = hc[0]; - long m2 = hc[1]; - assertEquals(gl1, m1); - assertEquals(gl2, m2); - - byte[] offsetData = new byte[data.length + 50]; - System.arraycopy(data, 0, offsetData, 50, data.length); - hc = Murmur3.hash128(offsetData, 50, data.length, seed); - assertEquals(gl1, hc[0]); - assertEquals(gl2, hc[1]); - } - } - - @Test - public void testHashCodesM3_128_longs() { - int seed = 123; - Random rand = new Random(seed); - HashFunction hf = Hashing.murmur3_128(seed); - for (int i = 0; i < 1000; i++) { - long val = rand.nextLong(); - byte[] data = ByteBuffer.allocate(8).putLong(val).array(); - // guava stores the hashcodes in little endian order - ByteBuffer buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN); - buf.put(hf.hashBytes(data).asBytes()); - buf.flip(); - long gl1 = buf.getLong(); - long gl2 = buf.getLong(8); - long[] hc = Murmur3.hash128(data, 0, data.length, seed); - long m1 = hc[0]; - long m2 = hc[1]; - assertEquals(gl1, m1); - assertEquals(gl2, m2); - } - } - - @Test - public void testHashCodesM3_128_double() { - int seed = 123; - Random rand = new Random(seed); - HashFunction hf = Hashing.murmur3_128(seed); - for (int i = 0; i < 1000; i++) { - double val = rand.nextDouble(); - byte[] data = ByteBuffer.allocate(8).putDouble(val).array(); - // guava stores the hashcodes in little endian order - ByteBuffer buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN); - buf.put(hf.hashBytes(data).asBytes()); - buf.flip(); - long gl1 = buf.getLong(); - long gl2 = buf.getLong(8); - long[] hc = Murmur3.hash128(data, 0, data.length, seed); - long m1 = hc[0]; - long m2 = hc[1]; - assertEquals(gl1, m1); - assertEquals(gl2, m2); - } - } -} http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/tools/src/java/org/apache/orc/tools/FileDump.java ---------------------------------------------------------------------- diff --git a/java/tools/src/java/org/apache/orc/tools/FileDump.java b/java/tools/src/java/org/apache/orc/tools/FileDump.java index 876070b..7206503 100644 --- a/java/tools/src/java/org/apache/orc/tools/FileDump.java +++ b/java/tools/src/java/org/apache/orc/tools/FileDump.java @@ -37,7 +37,8 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.PathFilter; import org.apache.hadoop.hdfs.DistributedFileSystem; -import org.apache.orc.BloomFilterIO; +import org.apache.orc.util.BloomFilter; +import org.apache.orc.util.BloomFilterIO; import org.apache.orc.ColumnStatistics; import org.apache.orc.CompressionKind; import org.apache.orc.OrcFile; @@ -383,7 +384,9 @@ public final class FileDump { StringBuilder buf = new StringBuilder(); String rowIdxString = getFormattedRowIndices(col, indices.getRowGroupIndex()); buf.append(rowIdxString); - String bloomFilString = getFormattedBloomFilters(col, indices.getBloomFilterIndex()); + String bloomFilString = getFormattedBloomFilters(col, indices, + reader.getWriterVersion(), + reader.getSchema().findSubtype(col).getCategory()); buf.append(bloomFilString); System.out.println(buf); } @@ -604,15 +607,18 @@ public final class FileDump { return -1; } - private static String getFormattedBloomFilters(int col, - OrcProto.BloomFilterIndex[] bloomFilterIndex) { + private static String getFormattedBloomFilters(int col, OrcIndex index, + OrcFile.WriterVersion version, + TypeDescription.Category type) { + OrcProto.BloomFilterIndex[] bloomFilterIndex = index.getBloomFilterIndex(); StringBuilder buf = new StringBuilder(); - BloomFilterIO stripeLevelBF = null; + BloomFilter stripeLevelBF = null; if (bloomFilterIndex != null && bloomFilterIndex[col] != null) { int idx = 0; buf.append("\n Bloom filters for column ").append(col).append(":"); for (OrcProto.BloomFilter bf : bloomFilterIndex[col].getBloomFilterList()) { - BloomFilterIO toMerge = new BloomFilterIO(bf); + BloomFilter toMerge = BloomFilterIO.deserialize( + index.getBloomFilterKinds()[col], version, type, bf); buf.append("\n Entry ").append(idx++).append(":").append(getBloomFilterStats(toMerge)); if (stripeLevelBF == null) { stripeLevelBF = toMerge; @@ -626,7 +632,7 @@ public final class FileDump { return buf.toString(); } - private static String getBloomFilterStats(BloomFilterIO bf) { + private static String getBloomFilterStats(BloomFilter bf) { StringBuilder sb = new StringBuilder(); int bitCount = bf.getBitSize(); int popCount = 0; http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java ---------------------------------------------------------------------- diff --git a/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java b/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java index e2048ea..aa3072c 100644 --- a/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java +++ b/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java @@ -20,18 +20,20 @@ package org.apache.orc.tools; import java.io.IOException; import java.util.ArrayList; import java.util.List; -import java.util.Set; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.orc.CompressionKind; +import org.apache.orc.OrcFile; import org.apache.orc.Reader; +import org.apache.orc.TypeDescription; import org.apache.orc.impl.AcidStats; import org.apache.orc.impl.OrcAcidUtils; import org.apache.orc.impl.RecordReaderImpl; +import org.apache.orc.util.BloomFilter; import org.codehaus.jettison.json.JSONArray; -import org.apache.orc.BloomFilterIO; +import org.apache.orc.util.BloomFilterIO; import org.apache.orc.BinaryColumnStatistics; import org.apache.orc.BooleanColumnStatistics; import org.apache.orc.ColumnStatistics; @@ -50,12 +52,16 @@ import org.codehaus.jettison.json.JSONException; import org.codehaus.jettison.json.JSONObject; import org.codehaus.jettison.json.JSONStringer; import org.codehaus.jettison.json.JSONWriter; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * File dump tool with json formatted output. */ public class JsonFileDump { + private static final Logger LOG = LoggerFactory.getLogger(JsonFileDump.class); + public static void printJsonMetaData(List<String> files, Configuration conf, List<Integer> rowIndexCols, boolean prettyPrint, boolean printTimeZone) @@ -185,7 +191,9 @@ public class JsonFileDump { writer.object(); writer.key("columnId").value(col); writeRowGroupIndexes(writer, col, indices.getRowGroupIndex()); - writeBloomFilterIndexes(writer, col, indices.getBloomFilterIndex()); + writeBloomFilterIndexes(writer, col, indices, + reader.getWriterVersion(), + reader.getSchema().findSubtype(col).getCategory()); writer.endObject(); } writer.endArray(); @@ -334,16 +342,21 @@ public class JsonFileDump { } private static void writeBloomFilterIndexes(JSONWriter writer, int col, - OrcProto.BloomFilterIndex[] bloomFilterIndex) throws JSONException { + OrcIndex index, + OrcFile.WriterVersion version, + TypeDescription.Category type + ) throws JSONException { - BloomFilterIO stripeLevelBF = null; + BloomFilter stripeLevelBF = null; + OrcProto.BloomFilterIndex[] bloomFilterIndex = index.getBloomFilterIndex(); if (bloomFilterIndex != null && bloomFilterIndex[col] != null) { int entryIx = 0; writer.key("bloomFilterIndexes").array(); for (OrcProto.BloomFilter bf : bloomFilterIndex[col].getBloomFilterList()) { writer.object(); writer.key("entryId").value(entryIx++); - BloomFilterIO toMerge = new BloomFilterIO(bf); + BloomFilter toMerge = BloomFilterIO.deserialize( + index.getBloomFilterKinds()[col], version, type, bf); writeBloomFilterStats(writer, toMerge); if (stripeLevelBF == null) { stripeLevelBF = toMerge; @@ -362,7 +375,7 @@ public class JsonFileDump { } } - private static void writeBloomFilterStats(JSONWriter writer, BloomFilterIO bf) + private static void writeBloomFilterStats(JSONWriter writer, BloomFilter bf) throws JSONException { int bitCount = bf.getBitSize(); int popCount = 0; http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/tools/src/test/org/apache/orc/tools/TestFileDump.java ---------------------------------------------------------------------- diff --git a/java/tools/src/test/org/apache/orc/tools/TestFileDump.java b/java/tools/src/test/org/apache/orc/tools/TestFileDump.java index 10cc87d..65ff404 100644 --- a/java/tools/src/test/org/apache/orc/tools/TestFileDump.java +++ b/java/tools/src/test/org/apache/orc/tools/TestFileDump.java @@ -445,8 +445,9 @@ public class TestFileDump { .compress(CompressionKind.ZLIB) .bufferSize(10000) .rowIndexStride(1000) - .bloomFilterColumns("l") - .bloomFilterFpp(0.01); + .bloomFilterColumns("l,s") + .bloomFilterFpp(0.01) + .bloomFilterVersion(OrcFile.BloomFilterVersion.ORIGINAL); VectorizedRowBatch batch = schema.createRowBatch(1000); Writer writer = OrcFile.createWriter(testFilePath, options); Random r1 = new Random(1); @@ -483,7 +484,6 @@ public class TestFileDump { System.out.flush(); System.setOut(origOut); - checkOutput(outputFilename, workDir + File.separator + outputFilename); } http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/tools/src/test/resources/log4j.properties ---------------------------------------------------------------------- diff --git a/java/tools/src/test/resources/log4j.properties b/java/tools/src/test/resources/log4j.properties new file mode 100644 index 0000000..8224baf --- /dev/null +++ b/java/tools/src/test/resources/log4j.properties @@ -0,0 +1,21 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +log4j.rootLogger=WARN,stdout + +log4j.appender.stdout=org.apache.log4j.ConsoleAppender +log4j.appender.stdout.Target = System.err +log4j.appender.stdout.layout=org.apache.log4j.PatternLayout +log4j.appender.stdout.layout.ConversionPattern=%p\t%d{ISO8601}\t%r\t%c\t[%t]\t%m%n + +# Suppress the warnings about native io not being available +log4j.logger.org.apache.hadoop.util.NativeCodeLoader=ERROR \ No newline at end of file http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/tools/src/test/resources/orc-file-dump-bloomfilter.out ---------------------------------------------------------------------- diff --git a/java/tools/src/test/resources/orc-file-dump-bloomfilter.out b/java/tools/src/test/resources/orc-file-dump-bloomfilter.out index 18fd2fb..b879bed 100644 --- a/java/tools/src/test/resources/orc-file-dump-bloomfilter.out +++ b/java/tools/src/test/resources/orc-file-dump-bloomfilter.out @@ -1,5 +1,5 @@ Structure for TestFileDump.testDump.orc -File Version: 0.12 with HIVE_13083 +File Version: 0.12 with ORC_101 Rows: 21000 Compression: ZLIB Compression size: 4096 @@ -39,17 +39,17 @@ File Statistics: Column 3: count: 21000 hasNull: false min: Darkness, max: worst sum: 81761 Stripes: - Stripe: offset: 3 data: 63786 rows: 5000 tail: 86 index: 951 + Stripe: offset: 3 data: 63786 rows: 5000 tail: 86 index: 743 Stream: column 0 section ROW_INDEX start: 3 length 17 Stream: column 1 section ROW_INDEX start: 20 length 166 Stream: column 2 section ROW_INDEX start: 186 length 169 Stream: column 3 section ROW_INDEX start: 355 length 87 - Stream: column 3 section BLOOM_FILTER start: 442 length 512 - Stream: column 1 section DATA start: 954 length 20035 - Stream: column 2 section DATA start: 20989 length 40050 - Stream: column 3 section DATA start: 61039 length 3543 - Stream: column 3 section LENGTH start: 64582 length 25 - Stream: column 3 section DICTIONARY_DATA start: 64607 length 133 + Stream: column 3 section BLOOM_FILTER_UTF8 start: 442 length 304 + Stream: column 1 section DATA start: 746 length 20035 + Stream: column 2 section DATA start: 20781 length 40050 + Stream: column 3 section DATA start: 60831 length 3543 + Stream: column 3 section LENGTH start: 64374 length 25 + Stream: column 3 section DICTIONARY_DATA start: 64399 length 133 Encoding column 0: DIRECT Encoding column 1: DIRECT_V2 Encoding column 2: DIRECT_V2 @@ -67,17 +67,17 @@ Stripes: Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7 Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7 Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7 - Stripe: offset: 64826 data: 63775 rows: 5000 tail: 86 index: 944 - Stream: column 0 section ROW_INDEX start: 64826 length 17 - Stream: column 1 section ROW_INDEX start: 64843 length 164 - Stream: column 2 section ROW_INDEX start: 65007 length 168 - Stream: column 3 section ROW_INDEX start: 65175 length 83 - Stream: column 3 section BLOOM_FILTER start: 65258 length 512 - Stream: column 1 section DATA start: 65770 length 20035 - Stream: column 2 section DATA start: 85805 length 40050 - Stream: column 3 section DATA start: 125855 length 3532 - Stream: column 3 section LENGTH start: 129387 length 25 - Stream: column 3 section DICTIONARY_DATA start: 129412 length 133 + Stripe: offset: 64618 data: 63775 rows: 5000 tail: 86 index: 736 + Stream: column 0 section ROW_INDEX start: 64618 length 17 + Stream: column 1 section ROW_INDEX start: 64635 length 164 + Stream: column 2 section ROW_INDEX start: 64799 length 168 + Stream: column 3 section ROW_INDEX start: 64967 length 83 + Stream: column 3 section BLOOM_FILTER_UTF8 start: 65050 length 304 + Stream: column 1 section DATA start: 65354 length 20035 + Stream: column 2 section DATA start: 85389 length 40050 + Stream: column 3 section DATA start: 125439 length 3532 + Stream: column 3 section LENGTH start: 128971 length 25 + Stream: column 3 section DICTIONARY_DATA start: 128996 length 133 Encoding column 0: DIRECT Encoding column 1: DIRECT_V2 Encoding column 2: DIRECT_V2 @@ -95,17 +95,17 @@ Stripes: Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7 Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7 Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7 - Stripe: offset: 129631 data: 63787 rows: 5000 tail: 86 index: 950 - Stream: column 0 section ROW_INDEX start: 129631 length 17 - Stream: column 1 section ROW_INDEX start: 129648 length 163 - Stream: column 2 section ROW_INDEX start: 129811 length 168 - Stream: column 3 section ROW_INDEX start: 129979 length 90 - Stream: column 3 section BLOOM_FILTER start: 130069 length 512 - Stream: column 1 section DATA start: 130581 length 20035 - Stream: column 2 section DATA start: 150616 length 40050 - Stream: column 3 section DATA start: 190666 length 3544 - Stream: column 3 section LENGTH start: 194210 length 25 - Stream: column 3 section DICTIONARY_DATA start: 194235 length 133 + Stripe: offset: 129215 data: 63787 rows: 5000 tail: 86 index: 742 + Stream: column 0 section ROW_INDEX start: 129215 length 17 + Stream: column 1 section ROW_INDEX start: 129232 length 163 + Stream: column 2 section ROW_INDEX start: 129395 length 168 + Stream: column 3 section ROW_INDEX start: 129563 length 90 + Stream: column 3 section BLOOM_FILTER_UTF8 start: 129653 length 304 + Stream: column 1 section DATA start: 129957 length 20035 + Stream: column 2 section DATA start: 149992 length 40050 + Stream: column 3 section DATA start: 190042 length 3544 + Stream: column 3 section LENGTH start: 193586 length 25 + Stream: column 3 section DICTIONARY_DATA start: 193611 length 133 Encoding column 0: DIRECT Encoding column 1: DIRECT_V2 Encoding column 2: DIRECT_V2 @@ -123,17 +123,17 @@ Stripes: Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7 Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7 Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7 - Stripe: offset: 194454 data: 63817 rows: 5000 tail: 86 index: 952 - Stream: column 0 section ROW_INDEX start: 194454 length 17 - Stream: column 1 section ROW_INDEX start: 194471 length 165 - Stream: column 2 section ROW_INDEX start: 194636 length 167 - Stream: column 3 section ROW_INDEX start: 194803 length 91 - Stream: column 3 section BLOOM_FILTER start: 194894 length 512 - Stream: column 1 section DATA start: 195406 length 20035 - Stream: column 2 section DATA start: 215441 length 40050 - Stream: column 3 section DATA start: 255491 length 3574 - Stream: column 3 section LENGTH start: 259065 length 25 - Stream: column 3 section DICTIONARY_DATA start: 259090 length 133 + Stripe: offset: 193830 data: 63817 rows: 5000 tail: 85 index: 744 + Stream: column 0 section ROW_INDEX start: 193830 length 17 + Stream: column 1 section ROW_INDEX start: 193847 length 165 + Stream: column 2 section ROW_INDEX start: 194012 length 167 + Stream: column 3 section ROW_INDEX start: 194179 length 91 + Stream: column 3 section BLOOM_FILTER_UTF8 start: 194270 length 304 + Stream: column 1 section DATA start: 194574 length 20035 + Stream: column 2 section DATA start: 214609 length 40050 + Stream: column 3 section DATA start: 254659 length 3574 + Stream: column 3 section LENGTH start: 258233 length 25 + Stream: column 3 section DICTIONARY_DATA start: 258258 length 133 Encoding column 0: DIRECT Encoding column 1: DIRECT_V2 Encoding column 2: DIRECT_V2 @@ -151,17 +151,17 @@ Stripes: Entry 3: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7 Entry 4: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7 Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7 - Stripe: offset: 259309 data: 12943 rows: 1000 tail: 78 index: 432 - Stream: column 0 section ROW_INDEX start: 259309 length 12 - Stream: column 1 section ROW_INDEX start: 259321 length 38 - Stream: column 2 section ROW_INDEX start: 259359 length 41 - Stream: column 3 section ROW_INDEX start: 259400 length 40 - Stream: column 3 section BLOOM_FILTER start: 259440 length 301 - Stream: column 1 section DATA start: 259741 length 4007 - Stream: column 2 section DATA start: 263748 length 8010 - Stream: column 3 section DATA start: 271758 length 768 - Stream: column 3 section LENGTH start: 272526 length 25 - Stream: column 3 section DICTIONARY_DATA start: 272551 length 133 + Stripe: offset: 258476 data: 12943 rows: 1000 tail: 78 index: 382 + Stream: column 0 section ROW_INDEX start: 258476 length 12 + Stream: column 1 section ROW_INDEX start: 258488 length 38 + Stream: column 2 section ROW_INDEX start: 258526 length 41 + Stream: column 3 section ROW_INDEX start: 258567 length 40 + Stream: column 3 section BLOOM_FILTER_UTF8 start: 258607 length 251 + Stream: column 1 section DATA start: 258858 length 4007 + Stream: column 2 section DATA start: 262865 length 8010 + Stream: column 3 section DATA start: 270875 length 768 + Stream: column 3 section LENGTH start: 271643 length 25 + Stream: column 3 section DICTIONARY_DATA start: 271668 length 133 Encoding column 0: DIRECT Encoding column 1: DIRECT_V2 Encoding column 2: DIRECT_V2 @@ -172,7 +172,7 @@ Stripes: Entry 0: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7 Stripe level merge: numHashFunctions: 4 bitCount: 6272 popCount: 138 loadFactor: 0.022 expectedFpp: 2.343647E-7 -File length: 273307 bytes +File length: 272427 bytes Padding length: 0 bytes Padding ratio: 0% ________________________________________________________________________________________________________________________ http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out ---------------------------------------------------------------------- diff --git a/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out b/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out index fa5cc2d..75cd5f4 100644 --- a/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out +++ b/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out @@ -1,5 +1,5 @@ Structure for TestFileDump.testDump.orc -File Version: 0.12 with HIVE_13083 +File Version: 0.12 with ORC_101 Rows: 21000 Compression: ZLIB Compression size: 4096 @@ -39,17 +39,20 @@ File Statistics: Column 3: count: 21000 hasNull: false min: Darkness, max: worst sum: 81761 Stripes: - Stripe: offset: 3 data: 63786 rows: 5000 tail: 85 index: 6974 + Stripe: offset: 3 data: 63786 rows: 5000 tail: 104 index: 14950 Stream: column 0 section ROW_INDEX start: 3 length 17 Stream: column 1 section ROW_INDEX start: 20 length 166 Stream: column 2 section ROW_INDEX start: 186 length 169 Stream: column 2 section BLOOM_FILTER start: 355 length 6535 - Stream: column 3 section ROW_INDEX start: 6890 length 87 - Stream: column 1 section DATA start: 6977 length 20035 - Stream: column 2 section DATA start: 27012 length 40050 - Stream: column 3 section DATA start: 67062 length 3543 - Stream: column 3 section LENGTH start: 70605 length 25 - Stream: column 3 section DICTIONARY_DATA start: 70630 length 133 + Stream: column 2 section BLOOM_FILTER_UTF8 start: 6890 length 6046 + Stream: column 3 section ROW_INDEX start: 12936 length 87 + Stream: column 3 section BLOOM_FILTER start: 13023 length 1038 + Stream: column 3 section BLOOM_FILTER_UTF8 start: 14061 length 892 + Stream: column 1 section DATA start: 14953 length 20035 + Stream: column 2 section DATA start: 34988 length 40050 + Stream: column 3 section DATA start: 75038 length 3543 + Stream: column 3 section LENGTH start: 78581 length 25 + Stream: column 3 section DICTIONARY_DATA start: 78606 length 133 Encoding column 0: DIRECT Encoding column 1: DIRECT_V2 Encoding column 2: DIRECT_V2 @@ -67,17 +70,20 @@ Stripes: Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4971 loadFactor: 0.5178 expectedFpp: 0.009981772 Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4949 loadFactor: 0.5155 expectedFpp: 0.009676614 Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9347 loadFactor: 0.9736 expectedFpp: 0.829482 - Stripe: offset: 70848 data: 63775 rows: 5000 tail: 85 index: 6965 - Stream: column 0 section ROW_INDEX start: 70848 length 17 - Stream: column 1 section ROW_INDEX start: 70865 length 164 - Stream: column 2 section ROW_INDEX start: 71029 length 168 - Stream: column 2 section BLOOM_FILTER start: 71197 length 6533 - Stream: column 3 section ROW_INDEX start: 77730 length 83 - Stream: column 1 section DATA start: 77813 length 20035 - Stream: column 2 section DATA start: 97848 length 40050 - Stream: column 3 section DATA start: 137898 length 3532 - Stream: column 3 section LENGTH start: 141430 length 25 - Stream: column 3 section DICTIONARY_DATA start: 141455 length 133 + Stripe: offset: 78843 data: 63775 rows: 5000 tail: 103 index: 14941 + Stream: column 0 section ROW_INDEX start: 78843 length 17 + Stream: column 1 section ROW_INDEX start: 78860 length 164 + Stream: column 2 section ROW_INDEX start: 79024 length 168 + Stream: column 2 section BLOOM_FILTER start: 79192 length 6533 + Stream: column 2 section BLOOM_FILTER_UTF8 start: 85725 length 6046 + Stream: column 3 section ROW_INDEX start: 91771 length 83 + Stream: column 3 section BLOOM_FILTER start: 91854 length 1038 + Stream: column 3 section BLOOM_FILTER_UTF8 start: 92892 length 892 + Stream: column 1 section DATA start: 93784 length 20035 + Stream: column 2 section DATA start: 113819 length 40050 + Stream: column 3 section DATA start: 153869 length 3532 + Stream: column 3 section LENGTH start: 157401 length 25 + Stream: column 3 section DICTIONARY_DATA start: 157426 length 133 Encoding column 0: DIRECT Encoding column 1: DIRECT_V2 Encoding column 2: DIRECT_V2 @@ -95,17 +101,20 @@ Stripes: Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4962 loadFactor: 0.5169 expectedFpp: 0.009855959 Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4966 loadFactor: 0.5173 expectedFpp: 0.009911705 Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9344 loadFactor: 0.9733 expectedFpp: 0.8276205 - Stripe: offset: 141673 data: 63787 rows: 5000 tail: 85 index: 6971 - Stream: column 0 section ROW_INDEX start: 141673 length 17 - Stream: column 1 section ROW_INDEX start: 141690 length 163 - Stream: column 2 section ROW_INDEX start: 141853 length 168 - Stream: column 2 section BLOOM_FILTER start: 142021 length 6533 - Stream: column 3 section ROW_INDEX start: 148554 length 90 - Stream: column 1 section DATA start: 148644 length 20035 - Stream: column 2 section DATA start: 168679 length 40050 - Stream: column 3 section DATA start: 208729 length 3544 - Stream: column 3 section LENGTH start: 212273 length 25 - Stream: column 3 section DICTIONARY_DATA start: 212298 length 133 + Stripe: offset: 157662 data: 63787 rows: 5000 tail: 104 index: 14947 + Stream: column 0 section ROW_INDEX start: 157662 length 17 + Stream: column 1 section ROW_INDEX start: 157679 length 163 + Stream: column 2 section ROW_INDEX start: 157842 length 168 + Stream: column 2 section BLOOM_FILTER start: 158010 length 6533 + Stream: column 2 section BLOOM_FILTER_UTF8 start: 164543 length 6046 + Stream: column 3 section ROW_INDEX start: 170589 length 90 + Stream: column 3 section BLOOM_FILTER start: 170679 length 1038 + Stream: column 3 section BLOOM_FILTER_UTF8 start: 171717 length 892 + Stream: column 1 section DATA start: 172609 length 20035 + Stream: column 2 section DATA start: 192644 length 40050 + Stream: column 3 section DATA start: 232694 length 3544 + Stream: column 3 section LENGTH start: 236238 length 25 + Stream: column 3 section DICTIONARY_DATA start: 236263 length 133 Encoding column 0: DIRECT Encoding column 1: DIRECT_V2 Encoding column 2: DIRECT_V2 @@ -123,17 +132,20 @@ Stripes: Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4943 loadFactor: 0.5149 expectedFpp: 0.009594797 Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4930 loadFactor: 0.5135 expectedFpp: 0.009419539 Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9333 loadFactor: 0.9722 expectedFpp: 0.82082444 - Stripe: offset: 212516 data: 63817 rows: 5000 tail: 85 index: 6964 - Stream: column 0 section ROW_INDEX start: 212516 length 17 - Stream: column 1 section ROW_INDEX start: 212533 length 165 - Stream: column 2 section ROW_INDEX start: 212698 length 167 - Stream: column 2 section BLOOM_FILTER start: 212865 length 6524 - Stream: column 3 section ROW_INDEX start: 219389 length 91 - Stream: column 1 section DATA start: 219480 length 20035 - Stream: column 2 section DATA start: 239515 length 40050 - Stream: column 3 section DATA start: 279565 length 3574 - Stream: column 3 section LENGTH start: 283139 length 25 - Stream: column 3 section DICTIONARY_DATA start: 283164 length 133 + Stripe: offset: 236500 data: 63817 rows: 5000 tail: 103 index: 14940 + Stream: column 0 section ROW_INDEX start: 236500 length 17 + Stream: column 1 section ROW_INDEX start: 236517 length 165 + Stream: column 2 section ROW_INDEX start: 236682 length 167 + Stream: column 2 section BLOOM_FILTER start: 236849 length 6524 + Stream: column 2 section BLOOM_FILTER_UTF8 start: 243373 length 6046 + Stream: column 3 section ROW_INDEX start: 249419 length 91 + Stream: column 3 section BLOOM_FILTER start: 249510 length 1038 + Stream: column 3 section BLOOM_FILTER_UTF8 start: 250548 length 892 + Stream: column 1 section DATA start: 251440 length 20035 + Stream: column 2 section DATA start: 271475 length 40050 + Stream: column 3 section DATA start: 311525 length 3574 + Stream: column 3 section LENGTH start: 315099 length 25 + Stream: column 3 section DICTIONARY_DATA start: 315124 length 133 Encoding column 0: DIRECT Encoding column 1: DIRECT_V2 Encoding column 2: DIRECT_V2 @@ -151,17 +163,20 @@ Stripes: Entry 3: numHashFunctions: 7 bitCount: 9600 popCount: 4941 loadFactor: 0.5147 expectedFpp: 0.009567649 Entry 4: numHashFunctions: 7 bitCount: 9600 popCount: 4993 loadFactor: 0.5201 expectedFpp: 0.010295142 Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 9353 loadFactor: 0.9743 expectedFpp: 0.8332165 - Stripe: offset: 283382 data: 12943 rows: 1000 tail: 78 index: 1468 - Stream: column 0 section ROW_INDEX start: 283382 length 12 - Stream: column 1 section ROW_INDEX start: 283394 length 38 - Stream: column 2 section ROW_INDEX start: 283432 length 41 - Stream: column 2 section BLOOM_FILTER start: 283473 length 1337 - Stream: column 3 section ROW_INDEX start: 284810 length 40 - Stream: column 1 section DATA start: 284850 length 4007 - Stream: column 2 section DATA start: 288857 length 8010 - Stream: column 3 section DATA start: 296867 length 768 - Stream: column 3 section LENGTH start: 297635 length 25 - Stream: column 3 section DICTIONARY_DATA start: 297660 length 133 + Stripe: offset: 315360 data: 12943 rows: 1000 tail: 96 index: 3542 + Stream: column 0 section ROW_INDEX start: 315360 length 12 + Stream: column 1 section ROW_INDEX start: 315372 length 38 + Stream: column 2 section ROW_INDEX start: 315410 length 41 + Stream: column 2 section BLOOM_FILTER start: 315451 length 1337 + Stream: column 2 section BLOOM_FILTER_UTF8 start: 316788 length 1211 + Stream: column 3 section ROW_INDEX start: 317999 length 40 + Stream: column 3 section BLOOM_FILTER start: 318039 length 472 + Stream: column 3 section BLOOM_FILTER_UTF8 start: 318511 length 391 + Stream: column 1 section DATA start: 318902 length 4007 + Stream: column 2 section DATA start: 322909 length 8010 + Stream: column 3 section DATA start: 330919 length 768 + Stream: column 3 section LENGTH start: 331687 length 25 + Stream: column 3 section DICTIONARY_DATA start: 331712 length 133 Encoding column 0: DIRECT Encoding column 1: DIRECT_V2 Encoding column 2: DIRECT_V2 @@ -172,7 +187,7 @@ Stripes: Entry 0: numHashFunctions: 7 bitCount: 9600 popCount: 4948 loadFactor: 0.5154 expectedFpp: 0.00966294 Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 4948 loadFactor: 0.5154 expectedFpp: 0.00966294 -File length: 298416 bytes +File length: 332489 bytes Padding length: 0 bytes Padding ratio: 0% ________________________________________________________________________________________________________________________ http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out ---------------------------------------------------------------------- diff --git a/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out b/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out index 17a964b..4b0822f 100644 --- a/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out +++ b/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out @@ -1,5 +1,5 @@ Structure for TestFileDump.testDump.orc -File Version: 0.12 with HIVE_13083 +File Version: 0.12 with ORC_101 Rows: 21000 Compression: ZLIB Compression size: 4096 http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/tools/src/test/resources/orc-file-dump.json ---------------------------------------------------------------------- diff --git a/java/tools/src/test/resources/orc-file-dump.json b/java/tools/src/test/resources/orc-file-dump.json index bf654a1..3dd0dc0 100644 --- a/java/tools/src/test/resources/orc-file-dump.json +++ b/java/tools/src/test/resources/orc-file-dump.json @@ -1,7 +1,7 @@ { "fileName": "TestFileDump.testDump.orc", "fileVersion": "0.12", - "writerVersion": "HIVE_13083", + "writerVersion": "ORC_101", "numberOfRows": 21000, "compression": "ZLIB", "compressionBufferSize": 4096, @@ -254,9 +254,9 @@ "stripeNumber": 1, "stripeInformation": { "offset": 3, - "indexLength": 970, + "indexLength": 762, "dataLength": 63770, - "footerLength": 90, + "footerLength": 89, "rowCount": 5000 }, "streams": [ @@ -286,44 +286,44 @@ }, { "columnId": 3, - "section": "BLOOM_FILTER", + "section": "BLOOM_FILTER_UTF8", "startOffset": 461, - "length": 512 + "length": 304 }, { "columnId": 1, "section": "DATA", - "startOffset": 973, + "startOffset": 765, "length": 20035 }, { "columnId": 2, "section": "DATA", - "startOffset": 21008, + "startOffset": 20800, "length": 40050 }, { "columnId": 3, "section": "PRESENT", - "startOffset": 61058, + "startOffset": 60850, "length": 17 }, { "columnId": 3, "section": "DATA", - "startOffset": 61075, + "startOffset": 60867, "length": 3510 }, { "columnId": 3, "section": "LENGTH", - "startOffset": 64585, + "startOffset": 64377, "length": 25 }, { "columnId": 3, "section": "DICTIONARY_DATA", - "startOffset": 64610, + "startOffset": 64402, "length": 133 } ], @@ -494,77 +494,77 @@ { "stripeNumber": 2, "stripeInformation": { - "offset": 64833, - "indexLength": 961, + "offset": 64624, + "indexLength": 753, "dataLength": 63763, - "footerLength": 88, + "footerLength": 87, "rowCount": 5000 }, "streams": [ { "columnId": 0, "section": "ROW_INDEX", - "startOffset": 64833, + "startOffset": 64624, "length": 17 }, { "columnId": 1, "section": "ROW_INDEX", - "startOffset": 64850, + "startOffset": 64641, "length": 166 }, { "columnId": 2, "section": "ROW_INDEX", - "startOffset": 65016, + "startOffset": 64807, "length": 166 }, { "columnId": 3, "section": "ROW_INDEX", - "startOffset": 65182, + "startOffset": 64973, "length": 100 }, { "columnId": 3, - "section": "BLOOM_FILTER", - "startOffset": 65282, - "length": 512 + "section": "BLOOM_FILTER_UTF8", + "startOffset": 65073, + "length": 304 }, { "columnId": 1, "section": "DATA", - "startOffset": 65794, + "startOffset": 65377, "length": 20035 }, { "columnId": 2, "section": "DATA", - "startOffset": 85829, + "startOffset": 85412, "length": 40050 }, { "columnId": 3, "section": "PRESENT", - "startOffset": 125879, + "startOffset": 125462, "length": 17 }, { "columnId": 3, "section": "DATA", - "startOffset": 125896, + "startOffset": 125479, "length": 3503 }, { "columnId": 3, "section": "LENGTH", - "startOffset": 129399, + "startOffset": 128982, "length": 25 }, { "columnId": 3, "section": "DICTIONARY_DATA", - "startOffset": 129424, + "startOffset": 129007, "length": 133 } ], @@ -735,77 +735,77 @@ { "stripeNumber": 3, "stripeInformation": { - "offset": 129645, - "indexLength": 962, + "offset": 129227, + "indexLength": 754, "dataLength": 63770, - "footerLength": 91, + "footerLength": 89, "rowCount": 5000 }, "streams": [ { "columnId": 0, "section": "ROW_INDEX", - "startOffset": 129645, + "startOffset": 129227, "length": 17 }, { "columnId": 1, "section": "ROW_INDEX", - "startOffset": 129662, + "startOffset": 129244, "length": 164 }, { "columnId": 2, "section": "ROW_INDEX", - "startOffset": 129826, + "startOffset": 129408, "length": 167 }, { "columnId": 3, "section": "ROW_INDEX", - "startOffset": 129993, + "startOffset": 129575, "length": 102 }, { "columnId": 3, - "section": "BLOOM_FILTER", - "startOffset": 130095, - "length": 512 + "section": "BLOOM_FILTER_UTF8", + "startOffset": 129677, + "length": 304 }, { "columnId": 1, "section": "DATA", - "startOffset": 130607, + "startOffset": 129981, "length": 20035 }, { "columnId": 2, "section": "DATA", - "startOffset": 150642, + "startOffset": 150016, "length": 40050 }, { "columnId": 3, "section": "PRESENT", - "startOffset": 190692, + "startOffset": 190066, "length": 17 }, { "columnId": 3, "section": "DATA", - "startOffset": 190709, + "startOffset": 190083, "length": 3510 }, { "columnId": 3, "section": "LENGTH", - "startOffset": 194219, + "startOffset": 193593, "length": 25 }, { "columnId": 3, "section": "DICTIONARY_DATA", - "startOffset": 194244, + "startOffset": 193618, "length": 133 } ], @@ -976,77 +976,77 @@ { "stripeNumber": 4, "stripeInformation": { - "offset": 194468, - "indexLength": 973, + "offset": 193840, + "indexLength": 765, "dataLength": 63756, - "footerLength": 91, + "footerLength": 89, "rowCount": 5000 }, "streams": [ { "columnId": 0, "section": "ROW_INDEX", - "startOffset": 194468, + "startOffset": 193840, "length": 17 }, { "columnId": 1, "section": "ROW_INDEX", - "startOffset": 194485, + "startOffset": 193857, "length": 166 }, { "columnId": 2, "section": "ROW_INDEX", - "startOffset": 194651, + "startOffset": 194023, "length": 171 }, { "columnId": 3, "section": "ROW_INDEX", - "startOffset": 194822, + "startOffset": 194194, "length": 107 }, { "columnId": 3, - "section": "BLOOM_FILTER", - "startOffset": 194929, - "length": 512 + "section": "BLOOM_FILTER_UTF8", + "startOffset": 194301, + "length": 304 }, { "columnId": 1, "section": "DATA", - "startOffset": 195441, + "startOffset": 194605, "length": 20035 }, { "columnId": 2, "section": "DATA", - "startOffset": 215476, + "startOffset": 214640, "length": 40050 }, { "columnId": 3, "section": "PRESENT", - "startOffset": 255526, + "startOffset": 254690, "length": 17 }, { "columnId": 3, "section": "DATA", - "startOffset": 255543, + "startOffset": 254707, "length": 3496 }, { "columnId": 3, "section": "LENGTH", - "startOffset": 259039, + "startOffset": 258203, "length": 25 }, { "columnId": 3, "section": "DICTIONARY_DATA", - "startOffset": 259064, + "startOffset": 258228, "length": 133 } ], @@ -1217,8 +1217,8 @@ { "stripeNumber": 5, "stripeInformation": { - "offset": 259288, - "indexLength": 433, + "offset": 258450, + "indexLength": 383, "dataLength": 12943, "footerLength": 83, "rowCount": 1000 @@ -1227,67 +1227,67 @@ { "columnId": 0, "section": "ROW_INDEX", - "startOffset": 259288, + "startOffset": 258450, "length": 12 }, { "columnId": 1, "section": "ROW_INDEX", - "startOffset": 259300, + "startOffset": 258462, "length": 38 }, { "columnId": 2, "section": "ROW_INDEX", - "startOffset": 259338, + "startOffset": 258500, "length": 41 }, { "columnId": 3, "section": "ROW_INDEX", - "startOffset": 259379, + "startOffset": 258541, "length": 41 }, { "columnId": 3, - "section": "BLOOM_FILTER", - "startOffset": 259420, - "length": 301 + "section": "BLOOM_FILTER_UTF8", + "startOffset": 258582, + "length": 251 }, { "columnId": 1, "section": "DATA", - "startOffset": 259721, + "startOffset": 258833, "length": 4007 }, { "columnId": 2, "section": "DATA", - "startOffset": 263728, + "startOffset": 262840, "length": 8010 }, { "columnId": 3, "section": "PRESENT", - "startOffset": 271738, + "startOffset": 270850, "length": 16 }, { "columnId": 3, "section": "DATA", - "startOffset": 271754, + "startOffset": 270866, "length": 752 }, { "columnId": 3, "section": "LENGTH", - "startOffset": 272506, + "startOffset": 271618, "length": 25 }, { "columnId": 3, "section": "DICTIONARY_DATA", - "startOffset": 272531, + "startOffset": 271643, "length": 133 } ], @@ -1348,7 +1348,7 @@ }] } ], - "fileLength": 273300, + "fileLength": 272409, "paddingLength": 0, "paddingRatio": 0, "status": "OK" http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/tools/src/test/resources/orc-file-dump.out ---------------------------------------------------------------------- diff --git a/java/tools/src/test/resources/orc-file-dump.out b/java/tools/src/test/resources/orc-file-dump.out index 70f7fbd..ae8195e 100644 --- a/java/tools/src/test/resources/orc-file-dump.out +++ b/java/tools/src/test/resources/orc-file-dump.out @@ -1,5 +1,5 @@ Structure for TestFileDump.testDump.orc -File Version: 0.12 with HIVE_13083 +File Version: 0.12 with ORC_101 Rows: 21000 Compression: ZLIB Compression size: 4096 http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/java/tools/src/test/resources/orc-file-has-null.out ---------------------------------------------------------------------- diff --git a/java/tools/src/test/resources/orc-file-has-null.out b/java/tools/src/test/resources/orc-file-has-null.out index df075d5..c02f803 100644 --- a/java/tools/src/test/resources/orc-file-has-null.out +++ b/java/tools/src/test/resources/orc-file-has-null.out @@ -1,5 +1,5 @@ Structure for TestFileDump.testDump.orc -File Version: 0.12 with HIVE_13083 +File Version: 0.12 with ORC_101 Rows: 20000 Compression: ZLIB Compression size: 4096 http://git-wip-us.apache.org/repos/asf/orc/blob/9d39cb80/proto/orc_proto.proto ---------------------------------------------------------------------- diff --git a/proto/orc_proto.proto b/proto/orc_proto.proto index dbc34ab..de6974e 100644 --- a/proto/orc_proto.proto +++ b/proto/orc_proto.proto @@ -91,6 +91,7 @@ message RowIndex { message BloomFilter { optional uint32 numHashFunctions = 1; repeated fixed64 bitset = 2; + optional bytes utf8bitset = 3; } message BloomFilterIndex { @@ -109,6 +110,7 @@ message Stream { SECONDARY = 5; ROW_INDEX = 6; BLOOM_FILTER = 7; + BLOOM_FILTER_UTF8 = 8; } optional Kind kind = 1; optional uint32 column = 2;