Repository: orc Updated Branches: refs/heads/master 3c30fe85b -> 3283d2381
http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/storage-api/src/java/org/apache/hive/common/util/BloomFilter.java ---------------------------------------------------------------------- diff --git a/java/storage-api/src/java/org/apache/hive/common/util/BloomFilter.java b/java/storage-api/src/java/org/apache/hive/common/util/BloomFilter.java new file mode 100644 index 0000000..bb0b8f2 --- /dev/null +++ b/java/storage-api/src/java/org/apache/hive/common/util/BloomFilter.java @@ -0,0 +1,309 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hive.common.util; + +import java.util.Arrays; +import java.util.List; + +import static com.google.common.base.Preconditions.checkArgument; + +/** + * BloomFilter is a probabilistic data structure for set membership check. BloomFilters are + * highly space efficient when compared to using a HashSet. Because of the probabilistic nature of + * bloom filter false positive (element not present in bloom filter but test() says true) are + * possible but false negatives are not possible (if element is present then test() will never + * say false). 
The false positive probability is configurable (default: 5%) depending on which + * storage requirement may increase or decrease. Lower the false positive probability greater + * is the space requirement. + * Bloom filters are sensitive to number of elements that will be inserted in the bloom filter. + * During the creation of bloom filter expected number of entries must be specified. If the number + * of insertions exceed the specified initial number of entries then false positive probability will + * increase accordingly. + * + * Internally, this implementation of bloom filter uses Murmur3 fast non-cryptographic hash + * algorithm. Although Murmur2 is slightly faster than Murmur3 in Java, it suffers from hash + * collisions for specific sequence of repeating bytes. Check the following link for more info + * https://code.google.com/p/smhasher/wiki/MurmurHash2Flaw + */ +public class BloomFilter { + public static final double DEFAULT_FPP = 0.05; + protected BitSet bitSet; + protected int numBits; + protected int numHashFunctions; + + public BloomFilter() { + } + + public BloomFilter(long expectedEntries) { + this(expectedEntries, DEFAULT_FPP); + } + + public BloomFilter(long expectedEntries, double fpp) { + checkArgument(expectedEntries > 0, "expectedEntries should be > 0"); + checkArgument(fpp > 0.0 && fpp < 1.0, "False positive probability should be > 0.0 & < 1.0"); + int nb = optimalNumOfBits(expectedEntries, fpp); + // make 'm' multiple of 64 + this.numBits = nb + (Long.SIZE - (nb % Long.SIZE)); + this.numHashFunctions = optimalNumOfHashFunctions(expectedEntries, numBits); + this.bitSet = new BitSet(numBits); + } + + /** + * A constructor to support rebuilding the BloomFilter from a serialized representation. 
+ * @param bits + * @param numBits + * @param numFuncs + */ + public BloomFilter(List<Long> bits, int numBits, int numFuncs) { + super(); + long[] copied = new long[bits.size()]; + for (int i = 0; i < bits.size(); i++) copied[i] = bits.get(i); + bitSet = new BitSet(copied); + this.numBits = numBits; + numHashFunctions = numFuncs; + } + + static int optimalNumOfHashFunctions(long n, long m) { + return Math.max(1, (int) Math.round((double) m / n * Math.log(2))); + } + + static int optimalNumOfBits(long n, double p) { + return (int) (-n * Math.log(p) / (Math.log(2) * Math.log(2))); + } + + public void add(byte[] val) { + if (val == null) { + addBytes(val, -1, -1); + } else { + addBytes(val, 0, val.length); + } + } + + public void addBytes(byte[] val, int offset, int length) { + // We use the trick mentioned in "Less Hashing, Same Performance: Building a Better Bloom Filter" + // by Kirsch et.al. From abstract 'only two hash functions are necessary to effectively + // implement a Bloom filter without any loss in the asymptotic false positive probability' + + // Lets split up 64-bit hashcode into two 32-bit hash codes and employ the technique mentioned + // in the above paper + long hash64 = val == null ? 
Murmur3.NULL_HASHCODE : + Murmur3.hash64(val, offset, length); + addHash(hash64); + } + + private void addHash(long hash64) { + int hash1 = (int) hash64; + int hash2 = (int) (hash64 >>> 32); + + for (int i = 1; i <= numHashFunctions; i++) { + int combinedHash = hash1 + (i * hash2); + // hashcode should be positive, flip all the bits if it's negative + if (combinedHash < 0) { + combinedHash = ~combinedHash; + } + int pos = combinedHash % numBits; + bitSet.set(pos); + } + } + + public void addString(String val) { + if (val == null) { + add(null); + } else { + add(val.getBytes()); + } + } + + public void addLong(long val) { + addHash(getLongHash(val)); + } + + public void addDouble(double val) { + addLong(Double.doubleToLongBits(val)); + } + + public boolean test(byte[] val) { + if (val == null) { + return testBytes(val, -1, -1); + } + return testBytes(val, 0, val.length); + } + + public boolean testBytes(byte[] val, int offset, int length) { + long hash64 = val == null ? Murmur3.NULL_HASHCODE : + Murmur3.hash64(val, offset, length); + return testHash(hash64); + } + + private boolean testHash(long hash64) { + int hash1 = (int) hash64; + int hash2 = (int) (hash64 >>> 32); + + for (int i = 1; i <= numHashFunctions; i++) { + int combinedHash = hash1 + (i * hash2); + // hashcode should be positive, flip all the bits if it's negative + if (combinedHash < 0) { + combinedHash = ~combinedHash; + } + int pos = combinedHash % numBits; + if (!bitSet.get(pos)) { + return false; + } + } + return true; + } + + public boolean testString(String val) { + if (val == null) { + return test(null); + } else { + return test(val.getBytes()); + } + } + + public boolean testLong(long val) { + return testHash(getLongHash(val)); + } + + // Thomas Wang's integer hash function + // http://web.archive.org/web/20071223173210/http://www.concentric.net/~Ttwang/tech/inthash.htm + private long getLongHash(long key) { + key = (~key) + (key << 21); // key = (key << 21) - key - 1; + key = key ^ (key >> 
24); + key = (key + (key << 3)) + (key << 8); // key * 265 + key = key ^ (key >> 14); + key = (key + (key << 2)) + (key << 4); // key * 21 + key = key ^ (key >> 28); + key = key + (key << 31); + return key; + } + + public boolean testDouble(double val) { + return testLong(Double.doubleToLongBits(val)); + } + + public long sizeInBytes() { + return getBitSize() / 8; + } + + public int getBitSize() { + return bitSet.getData().length * Long.SIZE; + } + + public int getNumHashFunctions() { + return numHashFunctions; + } + + public long[] getBitSet() { + return bitSet.getData(); + } + + @Override + public String toString() { + return "m: " + numBits + " k: " + numHashFunctions; + } + + /** + * Merge the specified bloom filter with current bloom filter. + * + * @param that - bloom filter to merge + */ + public void merge(BloomFilter that) { + if (this != that && this.numBits == that.numBits && this.numHashFunctions == that.numHashFunctions) { + this.bitSet.putAll(that.bitSet); + } else { + throw new IllegalArgumentException("BloomFilters are not compatible for merging." + + " this - " + this.toString() + " that - " + that.toString()); + } + } + + public void reset() { + this.bitSet.clear(); + } + + /** + * Bare metal bit set implementation. For performance reasons, this implementation does not check + * for index bounds nor expand the bit set size if the specified index is greater than the size. + */ + public class BitSet { + private final long[] data; + + public BitSet(long bits) { + this(new long[(int) Math.ceil((double) bits / (double) Long.SIZE)]); + } + + /** + * Deserialize long array as bit set. + * + * @param data - bit array + */ + public BitSet(long[] data) { + assert data.length > 0 : "data length is zero!"; + this.data = data; + } + + /** + * Sets the bit at specified index. + * + * @param index - position + */ + public void set(int index) { + data[index >>> 6] |= (1L << index); + } + + /** + * Returns true if the bit is set in the specified index. 
+ * + * @param index - position + * @return - value at the bit position + */ + public boolean get(int index) { + return (data[index >>> 6] & (1L << index)) != 0; + } + + /** + * Number of bits + */ + public long bitSize() { + return (long) data.length * Long.SIZE; + } + + public long[] getData() { + return data; + } + + /** + * Combines the two BitArrays using bitwise OR. + */ + public void putAll(BitSet array) { + assert data.length == array.data.length : + "BitArrays must be of equal length (" + data.length + "!= " + array.data.length + ")"; + for (int i = 0; i < data.length; i++) { + data[i] |= array.data[i]; + } + } + + /** + * Clear the bit set. + */ + public void clear() { + Arrays.fill(data, 0); + } + } +} http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/storage-api/src/java/org/apache/hive/common/util/IntervalDayTimeUtils.java ---------------------------------------------------------------------- diff --git a/java/storage-api/src/java/org/apache/hive/common/util/IntervalDayTimeUtils.java b/java/storage-api/src/java/org/apache/hive/common/util/IntervalDayTimeUtils.java new file mode 100644 index 0000000..727c1e6 --- /dev/null +++ b/java/storage-api/src/java/org/apache/hive/common/util/IntervalDayTimeUtils.java @@ -0,0 +1,77 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hive.common.util; + +import java.math.BigDecimal; +import java.text.SimpleDateFormat; + +import org.apache.hadoop.hive.common.type.HiveIntervalDayTime; + + +/** + * DateUtils. Thread-safe class + * + */ +public class IntervalDayTimeUtils { + + private static final ThreadLocal<SimpleDateFormat> dateFormatLocal = new ThreadLocal<SimpleDateFormat>() { + @Override + protected SimpleDateFormat initialValue() { + return new SimpleDateFormat("yyyy-MM-dd"); + } + }; + + public static SimpleDateFormat getDateFormat() { + return dateFormatLocal.get(); + } + + public static final int NANOS_PER_SEC = 1000000000; + public static final BigDecimal MAX_INT_BD = new BigDecimal(Integer.MAX_VALUE); + public static final BigDecimal NANOS_PER_SEC_BD = new BigDecimal(NANOS_PER_SEC); + + public static int parseNumericValueWithRange(String fieldName, + String strVal, int minValue, int maxValue) throws IllegalArgumentException { + int result = 0; + if (strVal != null) { + result = Integer.parseInt(strVal); + if (result < minValue || result > maxValue) { + throw new IllegalArgumentException(String.format("%s value %d outside range [%d, %d]", + fieldName, result, minValue, maxValue)); + } + } + return result; + } + + public static long getIntervalDayTimeTotalNanos(HiveIntervalDayTime intervalDayTime) { + return intervalDayTime.getTotalSeconds() * NANOS_PER_SEC + intervalDayTime.getNanos(); + } + + public static void setIntervalDayTimeTotalNanos(HiveIntervalDayTime intervalDayTime, + long totalNanos) { + intervalDayTime.set(totalNanos / NANOS_PER_SEC, (int) (totalNanos % NANOS_PER_SEC)); + } + + public static long getIntervalDayTimeTotalSecondsFromTotalNanos(long totalNanos) { + return totalNanos / NANOS_PER_SEC; + } + + public static int getIntervalDayTimeNanosFromTotalNanos(long totalNanos) { + return (int) (totalNanos % NANOS_PER_SEC); + } +} 
http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/storage-api/src/java/org/apache/hive/common/util/Murmur3.java ---------------------------------------------------------------------- diff --git a/java/storage-api/src/java/org/apache/hive/common/util/Murmur3.java b/java/storage-api/src/java/org/apache/hive/common/util/Murmur3.java new file mode 100644 index 0000000..88c3514 --- /dev/null +++ b/java/storage-api/src/java/org/apache/hive/common/util/Murmur3.java @@ -0,0 +1,335 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hive.common.util; + +/** + * Murmur3 is successor to Murmur2 fast non-crytographic hash algorithms. + * + * Murmur3 32 and 128 bit variants. + * 32-bit Java port of https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp#94 + * 128-bit Java port of https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp#255 + * + * This is a public domain code with no copyrights. + * From homepage of MurmurHash (https://code.google.com/p/smhasher/), + * "All MurmurHash versions are public domain software, and the author disclaims all copyright + * to their code." 
+ */ +public class Murmur3 { + // from 64-bit linear congruential generator + public static final long NULL_HASHCODE = 2862933555777941757L; + + // Constants for 32 bit variant + private static final int C1_32 = 0xcc9e2d51; + private static final int C2_32 = 0x1b873593; + private static final int R1_32 = 15; + private static final int R2_32 = 13; + private static final int M_32 = 5; + private static final int N_32 = 0xe6546b64; + + // Constants for 128 bit variant + private static final long C1 = 0x87c37b91114253d5L; + private static final long C2 = 0x4cf5ad432745937fL; + private static final int R1 = 31; + private static final int R2 = 27; + private static final int R3 = 33; + private static final int M = 5; + private static final int N1 = 0x52dce729; + private static final int N2 = 0x38495ab5; + + private static final int DEFAULT_SEED = 104729; + + /** + * Murmur3 32-bit variant. + * + * @param data - input byte array + * @return - hashcode + */ + public static int hash32(byte[] data) { + return hash32(data, data.length, DEFAULT_SEED); + } + + /** + * Murmur3 32-bit variant. + * + * @param data - input byte array + * @param length - length of array + * @param seed - seed. 
(default 0) + * @return - hashcode + */ + public static int hash32(byte[] data, int length, int seed) { + int hash = seed; + final int nblocks = length >> 2; + + // body + for (int i = 0; i < nblocks; i++) { + int i_4 = i << 2; + int k = (data[i_4] & 0xff) + | ((data[i_4 + 1] & 0xff) << 8) + | ((data[i_4 + 2] & 0xff) << 16) + | ((data[i_4 + 3] & 0xff) << 24); + + // mix functions + k *= C1_32; + k = Integer.rotateLeft(k, R1_32); + k *= C2_32; + hash ^= k; + hash = Integer.rotateLeft(hash, R2_32) * M_32 + N_32; + } + + // tail + int idx = nblocks << 2; + int k1 = 0; + switch (length - idx) { + case 3: + k1 ^= data[idx + 2] << 16; + case 2: + k1 ^= data[idx + 1] << 8; + case 1: + k1 ^= data[idx]; + + // mix functions + k1 *= C1_32; + k1 = Integer.rotateLeft(k1, R1_32); + k1 *= C2_32; + hash ^= k1; + } + + // finalization + hash ^= length; + hash ^= (hash >>> 16); + hash *= 0x85ebca6b; + hash ^= (hash >>> 13); + hash *= 0xc2b2ae35; + hash ^= (hash >>> 16); + + return hash; + } + + /** + * Murmur3 64-bit variant. This is essentially MSB 8 bytes of Murmur3 128-bit variant. + * + * @param data - input byte array + * @return - hashcode + */ + public static long hash64(byte[] data) { + return hash64(data, 0, data.length, DEFAULT_SEED); + } + + public static long hash64(byte[] data, int offset, int length) { + return hash64(data, offset, length, DEFAULT_SEED); + } + + /** + * Murmur3 64-bit variant. This is essentially MSB 8 bytes of Murmur3 128-bit variant. + * + * @param data - input byte array + * @param length - length of array + * @param seed - seed. 
(default is 0) + * @return - hashcode + */ + public static long hash64(byte[] data, int offset, int length, int seed) { + long hash = seed; + final int nblocks = length >> 3; + + // body + for (int i = 0; i < nblocks; i++) { + final int i8 = i << 3; + long k = ((long) data[offset + i8] & 0xff) + | (((long) data[offset + i8 + 1] & 0xff) << 8) + | (((long) data[offset + i8 + 2] & 0xff) << 16) + | (((long) data[offset + i8 + 3] & 0xff) << 24) + | (((long) data[offset + i8 + 4] & 0xff) << 32) + | (((long) data[offset + i8 + 5] & 0xff) << 40) + | (((long) data[offset + i8 + 6] & 0xff) << 48) + | (((long) data[offset + i8 + 7] & 0xff) << 56); + + // mix functions + k *= C1; + k = Long.rotateLeft(k, R1); + k *= C2; + hash ^= k; + hash = Long.rotateLeft(hash, R2) * M + N1; + } + + // tail + long k1 = 0; + int tailStart = nblocks << 3; + switch (length - tailStart) { + case 7: + k1 ^= ((long) data[offset + tailStart + 6] & 0xff) << 48; + case 6: + k1 ^= ((long) data[offset + tailStart + 5] & 0xff) << 40; + case 5: + k1 ^= ((long) data[offset + tailStart + 4] & 0xff) << 32; + case 4: + k1 ^= ((long) data[offset + tailStart + 3] & 0xff) << 24; + case 3: + k1 ^= ((long) data[offset + tailStart + 2] & 0xff) << 16; + case 2: + k1 ^= ((long) data[offset + tailStart + 1] & 0xff) << 8; + case 1: + k1 ^= ((long) data[offset + tailStart] & 0xff); + k1 *= C1; + k1 = Long.rotateLeft(k1, R1); + k1 *= C2; + hash ^= k1; + } + + // finalization + hash ^= length; + hash = fmix64(hash); + + return hash; + } + + /** + * Murmur3 128-bit variant. + * + * @param data - input byte array + * @return - hashcode (2 longs) + */ + public static long[] hash128(byte[] data) { + return hash128(data, 0, data.length, DEFAULT_SEED); + } + + /** + * Murmur3 128-bit variant. + * + * @param data - input byte array + * @param offset - the first element of array + * @param length - length of array + * @param seed - seed. 
(default is 0) + * @return - hashcode (2 longs) + */ + public static long[] hash128(byte[] data, int offset, int length, int seed) { + long h1 = seed; + long h2 = seed; + final int nblocks = length >> 4; + + // body + for (int i = 0; i < nblocks; i++) { + final int i16 = i << 4; + long k1 = ((long) data[offset + i16] & 0xff) + | (((long) data[offset + i16 + 1] & 0xff) << 8) + | (((long) data[offset + i16 + 2] & 0xff) << 16) + | (((long) data[offset + i16 + 3] & 0xff) << 24) + | (((long) data[offset + i16 + 4] & 0xff) << 32) + | (((long) data[offset + i16 + 5] & 0xff) << 40) + | (((long) data[offset + i16 + 6] & 0xff) << 48) + | (((long) data[offset + i16 + 7] & 0xff) << 56); + + long k2 = ((long) data[offset + i16 + 8] & 0xff) + | (((long) data[offset + i16 + 9] & 0xff) << 8) + | (((long) data[offset + i16 + 10] & 0xff) << 16) + | (((long) data[offset + i16 + 11] & 0xff) << 24) + | (((long) data[offset + i16 + 12] & 0xff) << 32) + | (((long) data[offset + i16 + 13] & 0xff) << 40) + | (((long) data[offset + i16 + 14] & 0xff) << 48) + | (((long) data[offset + i16 + 15] & 0xff) << 56); + + // mix functions for k1 + k1 *= C1; + k1 = Long.rotateLeft(k1, R1); + k1 *= C2; + h1 ^= k1; + h1 = Long.rotateLeft(h1, R2); + h1 += h2; + h1 = h1 * M + N1; + + // mix functions for k2 + k2 *= C2; + k2 = Long.rotateLeft(k2, R3); + k2 *= C1; + h2 ^= k2; + h2 = Long.rotateLeft(h2, R1); + h2 += h1; + h2 = h2 * M + N2; + } + + // tail + long k1 = 0; + long k2 = 0; + int tailStart = nblocks << 4; + switch (length - tailStart) { + case 15: + k2 ^= (long) (data[offset + tailStart + 14] & 0xff) << 48; + case 14: + k2 ^= (long) (data[offset + tailStart + 13] & 0xff) << 40; + case 13: + k2 ^= (long) (data[offset + tailStart + 12] & 0xff) << 32; + case 12: + k2 ^= (long) (data[offset + tailStart + 11] & 0xff) << 24; + case 11: + k2 ^= (long) (data[offset + tailStart + 10] & 0xff) << 16; + case 10: + k2 ^= (long) (data[offset + tailStart + 9] & 0xff) << 8; + case 9: + k2 ^= (long) (data[offset + 
tailStart + 8] & 0xff); + k2 *= C2; + k2 = Long.rotateLeft(k2, R3); + k2 *= C1; + h2 ^= k2; + + case 8: + k1 ^= (long) (data[offset + tailStart + 7] & 0xff) << 56; + case 7: + k1 ^= (long) (data[offset + tailStart + 6] & 0xff) << 48; + case 6: + k1 ^= (long) (data[offset + tailStart + 5] & 0xff) << 40; + case 5: + k1 ^= (long) (data[offset + tailStart + 4] & 0xff) << 32; + case 4: + k1 ^= (long) (data[offset + tailStart + 3] & 0xff) << 24; + case 3: + k1 ^= (long) (data[offset + tailStart + 2] & 0xff) << 16; + case 2: + k1 ^= (long) (data[offset + tailStart + 1] & 0xff) << 8; + case 1: + k1 ^= (long) (data[offset + tailStart] & 0xff); + k1 *= C1; + k1 = Long.rotateLeft(k1, R1); + k1 *= C2; + h1 ^= k1; + } + + // finalization + h1 ^= length; + h2 ^= length; + + h1 += h2; + h2 += h1; + + h1 = fmix64(h1); + h2 = fmix64(h2); + + h1 += h2; + h2 += h1; + + return new long[]{h1, h2}; + } + + private static long fmix64(long h) { + h ^= (h >>> 33); + h *= 0xff51afd7ed558ccdL; + h ^= (h >>> 33); + h *= 0xc4ceb9fe1a85ec53L; + h ^= (h >>> 33); + return h; + } +} http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestListColumnVector.java ---------------------------------------------------------------------- diff --git a/java/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestListColumnVector.java b/java/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestListColumnVector.java new file mode 100644 index 0000000..395d8f5 --- /dev/null +++ b/java/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestListColumnVector.java @@ -0,0 +1,200 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector; + +import org.junit.Test; + +import java.util.Arrays; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +/** + * Test for ListColumnVector + */ +public class TestListColumnVector { + + @Test + public void testFlatten() throws Exception { + LongColumnVector col1 = new LongColumnVector(10); + ListColumnVector vector = new ListColumnVector(10, col1); + vector.init(); + + // TEST - repeating NULL & no selection + col1.isRepeating = true; + vector.isRepeating = true; + vector.noNulls = false; + vector.isNull[0] = true; + vector.childCount = 0; + for(int i=0; i < 10; ++i) { + col1.vector[i] = i + 3; + vector.offsets[i] = i; + vector.lengths[i] = 10 + i; + } + vector.flatten(false, null, 10); + // make sure the vector was flattened + assertFalse(vector.isRepeating); + assertFalse(vector.noNulls); + // child isn't flattened, because parent is repeating null + assertTrue(col1.isRepeating); + assertTrue(col1.noNulls); + for(int i=0; i < 10; ++i) { + assertTrue("isNull at " + i, vector.isNull[i]); + } + for(int i=0; i < 10; ++i) { + StringBuilder buf = new StringBuilder(); + vector.stringifyValue(buf, i); + assertEquals("null", buf.toString()); + } + vector.unFlatten(); + assertTrue(col1.isRepeating); + assertTrue(vector.isRepeating); + + // TEST - repeating 
NULL & selection + Arrays.fill(vector.isNull, 1, 10, false); + int[] sel = new int[]{3, 5, 7}; + vector.flatten(true, sel, 3); + for(int i=1; i < 10; i++) { + assertEquals("failure at " + i, + i == 3 || i == 5 || i == 7, vector.isNull[i]); + } + vector.unFlatten(); + + // TEST - repeating non-NULL & no-selection + vector.noNulls = true; + vector.isRepeating = true; + vector.offsets[0] = 0; + vector.lengths[0] = 3; + vector.childCount = 3; + vector.flatten(false, null, 10); + // make sure the vector was flattened + assertFalse(vector.isRepeating); + assertFalse(vector.noNulls); + assertFalse(col1.isRepeating); + assertFalse(col1.noNulls); + for(int i=0; i < 10; ++i) { + assertEquals("offset at " + i, 0, vector.offsets[i]); + assertEquals("length at " + i, 3, vector.lengths[i]); + } + for(int i=0; i < 10; ++i) { + StringBuilder buf = new StringBuilder(); + vector.stringifyValue(buf, i); + assertEquals("[3, 3, 3]", buf.toString()); + } + vector.unFlatten(); + assertTrue(col1.isRepeating); + assertTrue(col1.noNulls); + assertTrue(vector.isRepeating); + assertTrue(vector.noNulls); + + // TEST - repeating non-NULL & selection + Arrays.fill(vector.offsets, 1, 10, -1); + Arrays.fill(vector.lengths, 1, 10, -1); + Arrays.fill(col1.vector, 1, 10, -1); + vector.flatten(true, sel, 3); + for(int i=1; i < 10; i++) { + if (i == 3 || i == 5 || i == 7) { + assertEquals("failure at " + i, 0, vector.offsets[i]); + assertEquals("failure at " + i, 3, vector.lengths[i]); + } else { + assertEquals("failure at " + i, -1, vector.offsets[i]); + assertEquals("failure at " + i, -1, vector.lengths[i]); + } + } + for(int i=0; i < 3; ++i) { + assertEquals("failure at " + i, 3, col1.vector[i]); + } + for(int i=3; i < 10; ++i) { + assertEquals("failure at " + i, -1, col1.vector[i]); + } + vector.unFlatten(); + + // TEST - reset + vector.reset(); + assertFalse(col1.isRepeating); + assertTrue(col1.noNulls); + assertFalse(vector.isRepeating); + assertTrue(vector.noNulls); + assertEquals(0, 
vector.childCount); + } + + @Test + public void testSet() throws Exception { + LongColumnVector input1 = new LongColumnVector(10); + ListColumnVector input = new ListColumnVector(10, input1); + input.init(); + LongColumnVector output1 = new LongColumnVector(30); + ListColumnVector output = new ListColumnVector(10, output1); + output.init(); + input.noNulls = false; + input.isNull[6] = true; + input.childCount = 11; + Arrays.fill(output1.vector, -1); + for(int i=0; i < 10; ++i) { + input1.vector[i] = 10 * i; + input.offsets[i] = i; + input.lengths[i] = 2; + output.offsets[i] = i + 2; + output.lengths[i] = 3; + } + output.childCount = 30; + + // copy a null + output.setElement(3, 6, input); + assertEquals(30, output.childCount); + StringBuilder buf = new StringBuilder(); + output.stringifyValue(buf, 3); + assertEquals("null", buf.toString()); + + // copy a value + output.setElement(3, 5, input); + assertEquals(30, output.offsets[3]); + assertEquals(2, output.lengths[3]); + assertEquals(32, output.childCount); + buf = new StringBuilder(); + output.stringifyValue(buf, 3); + assertEquals("[50, 60]", buf.toString()); + + // overwrite a value + output.setElement(3, 4, input); + assertEquals(34, output.childCount); + assertEquals(34, output1.vector.length); + assertEquals(50, output1.vector[30]); + assertEquals(60, output1.vector[31]); + buf = new StringBuilder(); + output.stringifyValue(buf, 3); + assertEquals("[40, 50]", buf.toString()); + + input.reset(); + assertEquals(false, input1.isRepeating); + assertEquals(true, input.noNulls); + output.reset(); + assertEquals(0, output.childCount); + + input.isRepeating = true; + input.offsets[0] = 0; + input.lengths[0] = 10; + output.setElement(2, 7, input); + assertEquals(10, output.childCount); + buf = new StringBuilder(); + output.stringifyValue(buf, 2); + assertEquals("[0, 10, 20, 30, 40, 50, 60, 70, 80, 90]", buf.toString()); + } +} 
http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestMapColumnVector.java ---------------------------------------------------------------------- diff --git a/java/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestMapColumnVector.java b/java/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestMapColumnVector.java new file mode 100644 index 0000000..c77c286 --- /dev/null +++ b/java/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestMapColumnVector.java @@ -0,0 +1,224 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
 */

package org.apache.hadoop.hive.ql.exec.vector;

import org.junit.Test;

import java.util.Arrays;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

/**
 * Test for MapColumnVector.
 *
 * Covers flatten()/unFlatten() under the four combinations of
 * (repeating null / repeating non-null) x (no selection / selection vector),
 * plus setElement() copy semantics and reset().
 */
public class TestMapColumnVector {

  @Test
  public void testFlatten() throws Exception {
    // Map of long keys (col1) to double values (col2), 10 rows.
    LongColumnVector col1 = new LongColumnVector(10);
    DoubleColumnVector col2 = new DoubleColumnVector(10);
    MapColumnVector vector = new MapColumnVector(10, col1, col2);
    vector.init();

    // TEST - repeating NULL & no selection
    col1.isRepeating = true;
    vector.isRepeating = true;
    vector.noNulls = false;
    vector.isNull[0] = true;
    vector.childCount = 0;
    for(int i=0; i < 10; ++i) {
      col1.vector[i] = i + 3;
      col2.vector[i] = i * 10;
      vector.offsets[i] = i;
      vector.lengths[i] = 10 + i;
    }
    vector.flatten(false, null, 10);
    // make sure the vector was flattened
    assertFalse(vector.isRepeating);
    assertFalse(vector.noNulls);
    // child isn't flattened, because parent is repeating null
    assertTrue(col1.isRepeating);
    assertTrue(col1.noNulls);
    // every row reads as null because row 0 (the repeated row) is null
    for(int i=0; i < 10; ++i) {
      assertTrue("isNull at " + i, vector.isNull[i]);
    }
    for(int i=0; i < 10; ++i) {
      StringBuilder buf = new StringBuilder();
      vector.stringifyValue(buf, i);
      assertEquals("null", buf.toString());
    }
    // unFlatten must restore the repeating state of parent and child
    vector.unFlatten();
    assertTrue(col1.isRepeating);
    assertTrue(vector.isRepeating);

    // TEST - repeating NULL & selection
    Arrays.fill(vector.isNull, 1, 10, false);
    int[] sel = new int[]{3, 5, 7};
    vector.flatten(true, sel, 3);
    // only the selected rows pick up the repeated null
    for(int i=1; i < 10; i++) {
      assertEquals("failure at " + i,
          i == 3 || i == 5 || i == 7, vector.isNull[i]);
    }
    vector.unFlatten();

    // TEST - repeating non-NULL & no-selection
    vector.noNulls = true;
    vector.isRepeating = true;
    vector.offsets[0] = 0;
    vector.lengths[0] = 3;
    vector.childCount = 3;
    vector.flatten(false, null, 10);
    // make sure the vector was flattened
    assertFalse(vector.isRepeating);
    assertFalse(vector.noNulls);
    assertFalse(col1.isRepeating);
    assertFalse(col1.noNulls);
    assertFalse(col2.isRepeating);
    assertFalse(col2.noNulls);
    // row 0's offset/length are propagated to every row
    for(int i=0; i < 10; ++i) {
      assertEquals("offset at " + i, 0, vector.offsets[i]);
      assertEquals("length at " + i, 3, vector.lengths[i]);
    }
    // keys all read the flattened repeated value col1.vector[0] (= 3);
    // values come from col2 rows 0..2
    for(int i=0; i < 10; ++i) {
      StringBuilder buf = new StringBuilder();
      vector.stringifyValue(buf, i);
      assertEquals("[{\"key\": 3, \"value\": 0.0}," +
          " {\"key\": 3, \"value\": 10.0}," +
          " {\"key\": 3, \"value\": 20.0}]", buf.toString());
    }
    vector.unFlatten();
    assertTrue(col1.isRepeating);
    assertTrue(col1.noNulls);
    assertTrue(vector.isRepeating);
    // col2 was never repeating, so unFlatten leaves it flat
    assertFalse(col2.isRepeating);
    assertTrue(col2.noNulls);
    assertTrue(vector.noNulls);

    // TEST - repeating non-NULL & selection
    Arrays.fill(vector.offsets, 1, 10, -1);
    Arrays.fill(vector.lengths, 1, 10, -1);
    Arrays.fill(col1.vector, 1, 10, -1);
    vector.flatten(true, sel, 3);
    // only the selected rows receive the repeated offset/length;
    // the -1 sentinels elsewhere must be untouched
    for(int i=1; i < 10; i++) {
      if (i == 3 || i == 5 || i == 7) {
        assertEquals("failure at " + i, 0, vector.offsets[i]);
        assertEquals("failure at " + i, 3, vector.lengths[i]);
      } else {
        assertEquals("failure at " + i, -1, vector.offsets[i]);
        assertEquals("failure at " + i, -1, vector.lengths[i]);
      }
    }
    // the key child is flattened only over its 3 referenced rows
    for(int i=0; i < 3; ++i) {
      assertEquals("failure at " + i, 3, col1.vector[i]);
    }
    for(int i=3; i < 10; ++i) {
      assertEquals("failure at " + i, -1, col1.vector[i]);
    }
    vector.unFlatten();

    // TEST - reset: clears repeating/null state on parent and children
    vector.reset();
    assertFalse(col1.isRepeating);
    assertTrue(col1.noNulls);
    assertFalse(col2.isRepeating);
    assertTrue(col2.noNulls);
    assertFalse(vector.isRepeating);
    assertTrue(vector.noNulls);
    assertEquals(0, vector.childCount);
  }

  @Test
  public void testSet() throws Exception {
    // source map: 10 rows, each of length 2 starting at its own row index
    LongColumnVector input1 = new LongColumnVector(10);
    DoubleColumnVector input2 = new DoubleColumnVector(10);
    MapColumnVector input = new MapColumnVector(10, input1, input2);
    input.init();
    // destination children sized 30 so copies append past the initial data
    LongColumnVector output1 = new LongColumnVector(30);
    DoubleColumnVector output2 = new DoubleColumnVector(30);
    MapColumnVector output = new MapColumnVector(10, output1, output2);
    output.init();
    input.noNulls = false;
    input.isNull[6] = true;
    input.childCount = 11;
    Arrays.fill(output1.vector, -1);
    for(int i=0; i < 10; ++i) {
      input1.vector[i] = 10 * i;
      input2.vector[i] = 100 * i;
      input.offsets[i] = i;
      input.lengths[i] = 2;
      output.offsets[i] = i + 2;
      output.lengths[i] = 3;
    }
    output.childCount = 30;

    // copy a null: no child entries are appended
    output.setElement(3, 6, input);
    assertEquals(30, output.childCount);
    StringBuilder buf = new StringBuilder();
    output.stringifyValue(buf, 3);
    assertEquals("null", buf.toString());

    // copy a value: entries are appended at the old childCount (30)
    output.setElement(3, 5, input);
    assertEquals(30, output.offsets[3]);
    assertEquals(2, output.lengths[3]);
    assertEquals(32, output.childCount);
    buf = new StringBuilder();
    output.stringifyValue(buf, 3);
    assertEquals("[{\"key\": 50, \"value\": 500.0}," +
        " {\"key\": 60, \"value\": 600.0}]", buf.toString());

    // overwrite a value: appends again (to 34) and grows the child vector
    output.setElement(3, 4, input);
    assertEquals(34, output.childCount);
    assertEquals(34, output1.vector.length);
    // the previously copied entries are still present at rows 30/31
    assertEquals(50, output1.vector[30]);
    assertEquals(60, output1.vector[31]);
    buf = new StringBuilder();
    output.stringifyValue(buf, 3);
    assertEquals("[{\"key\": 40, \"value\": 400.0}," +
        " {\"key\": 50, \"value\": 500.0}]", buf.toString());

    input.reset();
    assertEquals(false, input1.isRepeating);
    assertEquals(true, input.noNulls);
    output.reset();
    assertEquals(0, output.childCount);

    // copying from a repeating source uses row 0's offset/length
    input.isRepeating = true;
    input.offsets[0] = 0;
    input.lengths[0] = 10;
    output.setElement(2, 7, input);
    assertEquals(10, output.childCount);
    buf = new StringBuilder();
    output.stringifyValue(buf, 2);
    assertEquals("[{\"key\": 0, \"value\": 0.0}," +
        " {\"key\": 10, \"value\": 100.0}," +
        " {\"key\": 20, \"value\": 200.0}," +
        " {\"key\": 30, \"value\": 300.0}," +
        " {\"key\": 40, \"value\": 400.0}," +
        " {\"key\": 50, \"value\": 500.0}," +
        " {\"key\": 60, \"value\": 600.0}," +
        " {\"key\": 70, \"value\": 700.0}," +
        " {\"key\": 80, \"value\": 800.0}," +
        " {\"key\": 90, \"value\": 900.0}]", buf.toString());
  }
}
http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestStructColumnVector.java
----------------------------------------------------------------------
diff --git a/java/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestStructColumnVector.java b/java/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestStructColumnVector.java
new file mode 100644
index 0000000..41b4b65
--- /dev/null
+++ b/java/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestStructColumnVector.java
@@ -0,0 +1,95 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
+ */ + +package org.apache.hadoop.hive.ql.exec.vector; + +import org.junit.Test; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +/** + * Test for StructColumnVector + */ +public class TestStructColumnVector { + + @Test + public void testFlatten() throws Exception { + LongColumnVector col1 = new LongColumnVector(10); + LongColumnVector col2 = new LongColumnVector(10); + StructColumnVector vector = new StructColumnVector(10, col1, col2); + vector.init(); + col1.isRepeating = true; + for(int i=0; i < 10; ++i) { + col1.vector[i] = i; + col2.vector[i] = 2 * i; + } + vector.flatten(false, null, 10); + assertFalse(col1.isRepeating); + for(int i=0; i < 10; ++i) { + assertEquals("col1 at " + i, 0, col1.vector[i]); + assertEquals("col2 at " + i, 2 * i, col2.vector[i]); + } + vector.unFlatten(); + assertTrue(col1.isRepeating); + for(int i=0; i < 10; ++i) { + StringBuilder buf = new StringBuilder(); + vector.stringifyValue(buf, i); + assertEquals("[0, " + (2 * i) + "]", buf.toString()); + } + vector.reset(); + assertFalse(col1.isRepeating); + } + + @Test + public void testSet() throws Exception { + LongColumnVector input1 = new LongColumnVector(10); + LongColumnVector input2 = new LongColumnVector(10); + StructColumnVector input = new StructColumnVector(10, input1, input2); + input.init(); + LongColumnVector output1 = new LongColumnVector(10); + LongColumnVector output2 = new LongColumnVector(10); + StructColumnVector output = new StructColumnVector(10, output1, output2); + output.init(); + input1.isRepeating = true; + input2.noNulls = false; + input2.isNull[5] = true; + input.noNulls = false; + input.isNull[6] = true; + for(int i=0; i < 10; ++i) { + input1.vector[i] = i + 1; + input2.vector[i] = i + 2; + } + output.setElement(3, 6, input); + StringBuilder buf = new StringBuilder(); + output.stringifyValue(buf, 3); + assertEquals("null", buf.toString()); + output.setElement(3, 5, 
input); + buf = new StringBuilder(); + output.stringifyValue(buf, 3); + assertEquals("[1, null]", buf.toString()); + output.setElement(3, 4, input); + buf = new StringBuilder(); + output.stringifyValue(buf, 3); + assertEquals("[1, 6]", buf.toString()); + input.reset(); + assertEquals(false, input1.isRepeating); + assertEquals(true, input.noNulls); + } +} http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestTimestampColumnVector.java ---------------------------------------------------------------------- diff --git a/java/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestTimestampColumnVector.java b/java/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestTimestampColumnVector.java new file mode 100644 index 0000000..6e5d5c8 --- /dev/null +++ b/java/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestTimestampColumnVector.java @@ -0,0 +1,117 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.exec.vector; + +import org.junit.Test; + +import java.io.PrintWriter; +import java.math.BigDecimal; +import java.math.RoundingMode; +import java.sql.Timestamp; +import java.util.Date; +import java.util.Random; + +import org.apache.hadoop.hive.common.type.RandomTypeUtil; + +import static org.junit.Assert.*; + +/** + * Test for ListColumnVector + */ +public class TestTimestampColumnVector { + + private static int TEST_COUNT = 5000; + + private static int fake = 0; + + @Test + public void testSaveAndRetrieve() throws Exception { + + Random r = new Random(1234); + TimestampColumnVector timestampColVector = new TimestampColumnVector(); + Timestamp[] randTimestamps = new Timestamp[VectorizedRowBatch.DEFAULT_SIZE]; + + for (int i = 0; i < VectorizedRowBatch.DEFAULT_SIZE; i++) { + Timestamp randTimestamp = RandomTypeUtil.getRandTimestamp(r); + randTimestamps[i] = randTimestamp; + timestampColVector.set(i, randTimestamp); + } + for (int i = 0; i < VectorizedRowBatch.DEFAULT_SIZE; i++) { + Timestamp retrievedTimestamp = timestampColVector.asScratchTimestamp(i); + Timestamp randTimestamp = randTimestamps[i]; + if (!retrievedTimestamp.equals(randTimestamp)) { + assertTrue(false); + } + } + } + + @Test + public void testTimestampCompare() throws Exception { + Random r = new Random(1234); + TimestampColumnVector timestampColVector = new TimestampColumnVector(); + Timestamp[] randTimestamps = new Timestamp[VectorizedRowBatch.DEFAULT_SIZE]; + Timestamp[] candTimestamps = new Timestamp[VectorizedRowBatch.DEFAULT_SIZE]; + int[] compareToLeftRights = new int[VectorizedRowBatch.DEFAULT_SIZE]; + int[] compareToRightLefts = new int[VectorizedRowBatch.DEFAULT_SIZE]; + + for (int i = 0; i < VectorizedRowBatch.DEFAULT_SIZE; i++) { + Timestamp randTimestamp = RandomTypeUtil.getRandTimestamp(r); + randTimestamps[i] = randTimestamp; + timestampColVector.set(i, randTimestamp); + Timestamp candTimestamp = RandomTypeUtil.getRandTimestamp(r); + 
candTimestamps[i] = candTimestamp; + compareToLeftRights[i] = candTimestamp.compareTo(randTimestamp); + compareToRightLefts[i] = randTimestamp.compareTo(candTimestamp); + } + + for (int i = 0; i < VectorizedRowBatch.DEFAULT_SIZE; i++) { + Timestamp retrievedTimestamp = timestampColVector.asScratchTimestamp(i); + Timestamp randTimestamp = randTimestamps[i]; + if (!retrievedTimestamp.equals(randTimestamp)) { + assertTrue(false); + } + Timestamp candTimestamp = candTimestamps[i]; + int compareToLeftRight = timestampColVector.compareTo(candTimestamp, i); + if (compareToLeftRight != compareToLeftRights[i]) { + assertTrue(false); + } + int compareToRightLeft = timestampColVector.compareTo(i, candTimestamp); + if (compareToRightLeft != compareToRightLefts[i]) { + assertTrue(false); + } + } + } + + /* + @Test + public void testGenerate() throws Exception { + PrintWriter writer = new PrintWriter("/Users/you/timestamps.txt"); + Random r = new Random(18485); + for (int i = 0; i < 25; i++) { + Timestamp randTimestamp = RandomTypeUtil.getRandTimestamp(r); + writer.println(randTimestamp.toString()); + } + for (int i = 0; i < 25; i++) { + Timestamp randTimestamp = RandomTypeUtil.getRandTimestamp(r, 1965, 2025); + writer.println(randTimestamp.toString()); + } + writer.close(); + } + */ +} http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestUnionColumnVector.java ---------------------------------------------------------------------- diff --git a/java/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestUnionColumnVector.java b/java/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestUnionColumnVector.java new file mode 100644 index 0000000..c378cd4 --- /dev/null +++ b/java/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestUnionColumnVector.java @@ -0,0 +1,93 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector; + +import org.junit.Test; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +/** + * Test for StructColumnVector + */ +public class TestUnionColumnVector { + + @Test + public void testFlatten() throws Exception { + LongColumnVector col1 = new LongColumnVector(10); + LongColumnVector col2 = new LongColumnVector(10); + UnionColumnVector vector = new UnionColumnVector(10, col1, col2); + vector.init(); + col1.isRepeating = true; + for(int i=0; i < 10; ++i) { + vector.tags[i] = i % 2; + col1.vector[i] = i; + col2.vector[i] = 2 * i; + } + vector.flatten(false, null, 10); + assertFalse(col1.isRepeating); + for(int i=0; i < 10; ++i) { + assertEquals(i % 2, vector.tags[i]); + assertEquals("col1 at " + i, 0, col1.vector[i]); + assertEquals("col2 at " + i, 2 * i, col2.vector[i]); + } + vector.unFlatten(); + assertTrue(col1.isRepeating); + for(int i=0; i < 10; ++i) { + StringBuilder buf = new StringBuilder(); + vector.stringifyValue(buf, i); + assertEquals("{\"tag\": " + (i % 2) + ", \"value\": " + + (i % 2 == 0 ? 
0 : 2 * i) + "}", buf.toString()); + } + vector.reset(); + assertFalse(col1.isRepeating); + } + + @Test + public void testSet() throws Exception { + LongColumnVector input1 = new LongColumnVector(10); + LongColumnVector input2 = new LongColumnVector(10); + UnionColumnVector input = new UnionColumnVector(10, input1, input2); + input.init(); + LongColumnVector output1 = new LongColumnVector(10); + LongColumnVector output2 = new LongColumnVector(10); + UnionColumnVector output = new UnionColumnVector(10, output1, output2); + output.init(); + input1.isRepeating = true; + for(int i=0; i < 10; ++i) { + input.tags[i] = i % 2; + input1.vector[i] = i + 1; + input2.vector[i] = i + 2; + } + output.setElement(3, 4, input); + StringBuilder buf = new StringBuilder(); + output.stringifyValue(buf, 3); + assertEquals("{\"tag\": 0, \"value\": 1}", buf.toString()); + input.noNulls = false; + input.isNull[5] = true; + output.setElement(3, 5, input); + buf = new StringBuilder(); + output.stringifyValue(buf, 3); + assertEquals("null", buf.toString()); + input.reset(); + assertEquals(false, input1.isRepeating); + assertEquals(true, input.noNulls); + } +} http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/java/storage-api/src/test/org/apache/hive/common/util/TestMurmur3.java ---------------------------------------------------------------------- diff --git a/java/storage-api/src/test/org/apache/hive/common/util/TestMurmur3.java b/java/storage-api/src/test/org/apache/hive/common/util/TestMurmur3.java new file mode 100644 index 0000000..5facc7c --- /dev/null +++ b/java/storage-api/src/test/org/apache/hive/common/util/TestMurmur3.java @@ -0,0 +1,224 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hive.common.util; + +import static org.junit.Assert.assertEquals; + +import com.google.common.hash.HashFunction; +import com.google.common.hash.Hashing; + +import org.junit.Test; + +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.Arrays; +import java.util.Random; + +/** + * Tests for Murmur3 variants. + */ +public class TestMurmur3 { + + @Test + public void testHashCodesM3_32_string() { + String key = "test"; + int seed = 123; + HashFunction hf = Hashing.murmur3_32(seed); + int hc1 = hf.hashBytes(key.getBytes()).asInt(); + int hc2 = Murmur3.hash32(key.getBytes(), key.getBytes().length, seed); + assertEquals(hc1, hc2); + + key = "testkey"; + hc1 = hf.hashBytes(key.getBytes()).asInt(); + hc2 = Murmur3.hash32(key.getBytes(), key.getBytes().length, seed); + assertEquals(hc1, hc2); + } + + @Test + public void testHashCodesM3_32_ints() { + int seed = 123; + Random rand = new Random(seed); + HashFunction hf = Hashing.murmur3_32(seed); + for (int i = 0; i < 1000; i++) { + int val = rand.nextInt(); + byte[] data = ByteBuffer.allocate(4).putInt(val).array(); + int hc1 = hf.hashBytes(data).asInt(); + int hc2 = Murmur3.hash32(data, data.length, seed); + assertEquals(hc1, hc2); + } + } + + @Test + public void testHashCodesM3_32_longs() { + int seed = 123; + Random rand = new Random(seed); + HashFunction hf = Hashing.murmur3_32(seed); + for (int i = 
0; i < 1000; i++) { + long val = rand.nextLong(); + byte[] data = ByteBuffer.allocate(8).putLong(val).array(); + int hc1 = hf.hashBytes(data).asInt(); + int hc2 = Murmur3.hash32(data, data.length, seed); + assertEquals(hc1, hc2); + } + } + + @Test + public void testHashCodesM3_32_double() { + int seed = 123; + Random rand = new Random(seed); + HashFunction hf = Hashing.murmur3_32(seed); + for (int i = 0; i < 1000; i++) { + double val = rand.nextDouble(); + byte[] data = ByteBuffer.allocate(8).putDouble(val).array(); + int hc1 = hf.hashBytes(data).asInt(); + int hc2 = Murmur3.hash32(data, data.length, seed); + assertEquals(hc1, hc2); + } + } + + @Test + public void testHashCodesM3_128_string() { + String key = "test"; + int seed = 123; + HashFunction hf = Hashing.murmur3_128(seed); + // guava stores the hashcodes in little endian order + ByteBuffer buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN); + buf.put(hf.hashBytes(key.getBytes()).asBytes()); + buf.flip(); + long gl1 = buf.getLong(); + long gl2 = buf.getLong(8); + long[] hc = Murmur3.hash128(key.getBytes(), 0, key.getBytes().length, seed); + long m1 = hc[0]; + long m2 = hc[1]; + assertEquals(gl1, m1); + assertEquals(gl2, m2); + + key = "testkey128_testkey128"; + buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN); + buf.put(hf.hashBytes(key.getBytes()).asBytes()); + buf.flip(); + gl1 = buf.getLong(); + gl2 = buf.getLong(8); + byte[] keyBytes = key.getBytes(); + hc = Murmur3.hash128(keyBytes, 0, keyBytes.length, seed); + m1 = hc[0]; + m2 = hc[1]; + assertEquals(gl1, m1); + assertEquals(gl2, m2); + + byte[] offsetKeyBytes = new byte[keyBytes.length + 35]; + Arrays.fill(offsetKeyBytes, (byte) -1); + System.arraycopy(keyBytes, 0, offsetKeyBytes, 35, keyBytes.length); + hc = Murmur3.hash128(offsetKeyBytes, 35, keyBytes.length, seed); + assertEquals(gl1, hc[0]); + assertEquals(gl2, hc[1]); + } + + @Test + public void testHashCodeM3_64() { + byte[] origin = ("It was the best of times, it was the 
worst of times," + + " it was the age of wisdom, it was the age of foolishness," + + " it was the epoch of belief, it was the epoch of incredulity," + + " it was the season of Light, it was the season of Darkness," + + " it was the spring of hope, it was the winter of despair," + + " we had everything before us, we had nothing before us," + + " we were all going direct to Heaven," + + " we were all going direct the other way.").getBytes(); + long hash = Murmur3.hash64(origin, 0, origin.length); + assertEquals(305830725663368540L, hash); + + byte[] originOffset = new byte[origin.length + 150]; + Arrays.fill(originOffset, (byte) 123); + System.arraycopy(origin, 0, originOffset, 150, origin.length); + hash = Murmur3.hash64(originOffset, 150, origin.length); + assertEquals(305830725663368540L, hash); + } + + @Test + public void testHashCodesM3_128_ints() { + int seed = 123; + Random rand = new Random(seed); + HashFunction hf = Hashing.murmur3_128(seed); + for (int i = 0; i < 1000; i++) { + int val = rand.nextInt(); + byte[] data = ByteBuffer.allocate(4).putInt(val).array(); + // guava stores the hashcodes in little endian order + ByteBuffer buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN); + buf.put(hf.hashBytes(data).asBytes()); + buf.flip(); + long gl1 = buf.getLong(); + long gl2 = buf.getLong(8); + long[] hc = Murmur3.hash128(data, 0, data.length, seed); + long m1 = hc[0]; + long m2 = hc[1]; + assertEquals(gl1, m1); + assertEquals(gl2, m2); + + byte[] offsetData = new byte[data.length + 50]; + System.arraycopy(data, 0, offsetData, 50, data.length); + hc = Murmur3.hash128(offsetData, 50, data.length, seed); + assertEquals(gl1, hc[0]); + assertEquals(gl2, hc[1]); + } + } + + @Test + public void testHashCodesM3_128_longs() { + int seed = 123; + Random rand = new Random(seed); + HashFunction hf = Hashing.murmur3_128(seed); + for (int i = 0; i < 1000; i++) { + long val = rand.nextLong(); + byte[] data = ByteBuffer.allocate(8).putLong(val).array(); + // guava 
stores the hashcodes in little endian order + ByteBuffer buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN); + buf.put(hf.hashBytes(data).asBytes()); + buf.flip(); + long gl1 = buf.getLong(); + long gl2 = buf.getLong(8); + long[] hc = Murmur3.hash128(data, 0, data.length, seed); + long m1 = hc[0]; + long m2 = hc[1]; + assertEquals(gl1, m1); + assertEquals(gl2, m2); + } + } + + @Test + public void testHashCodesM3_128_double() { + int seed = 123; + Random rand = new Random(seed); + HashFunction hf = Hashing.murmur3_128(seed); + for (int i = 0; i < 1000; i++) { + double val = rand.nextDouble(); + byte[] data = ByteBuffer.allocate(8).putDouble(val).array(); + // guava stores the hashcodes in little endian order + ByteBuffer buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN); + buf.put(hf.hashBytes(data).asBytes()); + buf.flip(); + long gl1 = buf.getLong(); + long gl2 = buf.getLong(8); + long[] hc = Murmur3.hash128(data, 0, data.length, seed); + long m1 = hc[0]; + long m2 = hc[1]; + assertEquals(gl1, m1); + assertEquals(gl2, m2); + } + } +} http://git-wip-us.apache.org/repos/asf/orc/blob/3283d238/proto/orc_proto.proto ---------------------------------------------------------------------- diff --git a/proto/orc_proto.proto b/proto/orc_proto.proto index 2e00566..6b7e597 100644 --- a/proto/orc_proto.proto +++ b/proto/orc_proto.proto @@ -18,7 +18,7 @@ package orc.proto; -option java_package = "org.apache.hadoop.hive.ql.io.orc"; +option java_package = "org.apache.orc"; message IntegerStatistics { optional sint64 minimum = 1; @@ -215,6 +215,9 @@ message PostScript { // Version of the writer: // 0 (or missing) = original // 1 = HIVE-8732 fixed + // 2 = HIVE-4243 fixed + // 3 = HIVE-12055 fixed + // 4 = HIVE-13083 fixed optional uint32 writerVersion = 6; // Leave this last in the record optional string magic = 8000;
