pgaref commented on a change in pull request #651: URL: https://github.com/apache/orc/pull/651#discussion_r598636479
########## File path: java/core/src/java/org/apache/orc/impl/DictionaryUtils.java ########## @@ -0,0 +1,86 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.orc.impl; + +import java.io.IOException; +import java.io.OutputStream; + +import org.apache.hadoop.io.Text; + + +public class DictionaryUtils { + private DictionaryUtils() { Review comment: Remove empty constructor and add comment as class doc? ########## File path: java/core/src/java/org/apache/orc/impl/DictionaryUtils.java ########## @@ -0,0 +1,86 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.orc.impl; + +import java.io.IOException; +import java.io.OutputStream; + +import org.apache.hadoop.io.Text; + + +public class DictionaryUtils { + private DictionaryUtils() { + // Utility class does nothing in constructor + } + + public static void getTextInternal(Text result, int position, DynamicIntArray keyOffsets, DynamicByteArray byteArray) { Review comment: doc method please? ########## File path: java/core/src/java/org/apache/orc/impl/StringHashTableDictionary.java ########## @@ -0,0 +1,189 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.orc.impl; + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.hadoop.io.Text; + + +/** + * Using HashTable to represent a dictionary. 
The strings are stored as UTF-8 bytes + * and an offset for each entry. It is using chaining for collision resolution. + * + * This implementation is not thread-safe. It also assumes there's no reduction in the size of hash-table + * as it shouldn't happen in the use cases for this class. + */ +public class StringHashTableDictionary implements Dictionary { + + private final DynamicByteArray byteArray = new DynamicByteArray(); + // starting offset of key-in-byte in the byte array for the i-th key. + // Two things combined stores the key array. + private final DynamicIntArray keyOffsets; + + private final Text newKey = new Text(); + + private DynamicIntArray[] hashArray; + + private int capacity; + + private int threshold; + + private float loadFactor; + + private static float DEFAULT_LOAD_FACTOR = 0.75f; + + private static final int MAX_ARRAY_SIZE = Integer.MAX_VALUE - 8; + + public StringHashTableDictionary(int initialCapacity) { + this(initialCapacity, DEFAULT_LOAD_FACTOR); + } + + public StringHashTableDictionary(int initialCapacity, float loadFactor) { + this.capacity = initialCapacity; + this.loadFactor = loadFactor; + this.keyOffsets = new DynamicIntArray(initialCapacity); + this.hashArray = initHashArray(initialCapacity); + this.threshold = (int)Math.min(initialCapacity * loadFactor, MAX_ARRAY_SIZE + 1); + } + + private DynamicIntArray[] initHashArray(int capacity) { + DynamicIntArray[] bucket = new DynamicIntArray[capacity]; + for (int i = 0; i < capacity; i++) { + bucket[i] = new DynamicIntArray(); + } + return bucket; + } + + @Override + public void visit(Visitor visitor) + throws IOException { + traverse(visitor, new DictionaryUtils.VisitorContextImpl(this.byteArray, this.keyOffsets)); + } + + private void traverse(Visitor visitor, DictionaryUtils.VisitorContextImpl context) throws IOException { + for (DynamicIntArray intArray : hashArray) { + for (int i = 0; i < intArray.size() ; i ++) { + context.setPosition(intArray.get(i)); + visitor.visit(context); + 
} + } + } + + @Override + public void clear() { + byteArray.clear(); + keyOffsets.clear(); + Arrays.fill(hashArray, null); + } + + @Override + public void getText(Text result, int position) { + DictionaryUtils.getTextInternal(result, position, this.keyOffsets, this.byteArray); + } + + @Override + public int add(byte[] bytes, int offset, int length) { + resizeIfNeeded(); Review comment: resizeIfNeeded called twice here -- believe it's safe to remove this call ########## File path: java/core/src/test/org/apache/orc/impl/TestStringHashTableDictionary.java ########## @@ -0,0 +1,95 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.orc.impl; + +import org.apache.hadoop.io.Text; +import org.junit.Assert; +import org.junit.Test; + + +public class TestStringHashTableDictionary { + + /** + * A extension for {@link StringHashTableDictionary} for testing purpose by overwriting the hash function. + * + */ + private static class SimpleHashDictionary extends StringHashTableDictionary { + public SimpleHashDictionary(int initialCapacity) { + super(initialCapacity); + } + + /** + * Obtain the prefix for each string as the hash value. 
+ * All the string being used in this test suite will contains its hash value as the prefix for the string content. + * this way we know the order of the traverse() method. + */ + @Override + int getIndex(Text text) { Review comment: Shall we have a test where the actual HashFunction is used as well? ########## File path: java/core/src/test/org/apache/orc/TestStringDictionary.java ########## @@ -114,10 +117,8 @@ public void testTooManyDistinct() throws Exception { } } - @Test - public void testHalfDistinct() throws Exception { + public void testHalfDistinctHelper(Configuration conf) throws Exception { Review comment: Shall we make this a Parametrized test where @Parameter is going to be the DICT implementations -- other tests with dict encoding could also benefit from this ########## File path: java/core/src/java/org/apache/orc/impl/StringHashTableDictionary.java ########## @@ -0,0 +1,189 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.orc.impl; + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.hadoop.io.Text; + + +/** + * Using HashTable to represent a dictionary. The strings are stored as UTF-8 bytes + * and an offset for each entry. 
It is using chaining for collision resolution. + * + * This implementation is not thread-safe. It also assumes there's no reduction in the size of hash-table + * as it shouldn't happen in the use cases for this class. + */ +public class StringHashTableDictionary implements Dictionary { + + private final DynamicByteArray byteArray = new DynamicByteArray(); + // starting offset of key-in-byte in the byte array for the i-th key. + // Two things combined stores the key array. + private final DynamicIntArray keyOffsets; + + private final Text newKey = new Text(); + + private DynamicIntArray[] hashArray; + + private int capacity; + + private int threshold; + + private float loadFactor; + + private static float DEFAULT_LOAD_FACTOR = 0.75f; + + private static final int MAX_ARRAY_SIZE = Integer.MAX_VALUE - 8; Review comment: explain MAX_ARRAY_SIZE limit and how this relates to Hash MASK below ignoring the first bits ########## File path: java/core/src/java/org/apache/orc/impl/StringHashTableDictionary.java ########## @@ -0,0 +1,189 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.orc.impl; + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.hadoop.io.Text; + + +/** + * Using HashTable to represent a dictionary. The strings are stored as UTF-8 bytes + * and an offset for each entry. It is using chaining for collision resolution. + * + * This implementation is not thread-safe. It also assumes there's no reduction in the size of hash-table + * as it shouldn't happen in the use cases for this class. + */ +public class StringHashTableDictionary implements Dictionary { + + private final DynamicByteArray byteArray = new DynamicByteArray(); + // starting offset of key-in-byte in the byte array for the i-th key. + // Two things combined stores the key array. + private final DynamicIntArray keyOffsets; + + private final Text newKey = new Text(); + + private DynamicIntArray[] hashArray; + + private int capacity; + + private int threshold; + + private float loadFactor; + + private static float DEFAULT_LOAD_FACTOR = 0.75f; + + private static final int MAX_ARRAY_SIZE = Integer.MAX_VALUE - 8; + + public StringHashTableDictionary(int initialCapacity) { + this(initialCapacity, DEFAULT_LOAD_FACTOR); + } + + public StringHashTableDictionary(int initialCapacity, float loadFactor) { + this.capacity = initialCapacity; + this.loadFactor = loadFactor; + this.keyOffsets = new DynamicIntArray(initialCapacity); + this.hashArray = initHashArray(initialCapacity); + this.threshold = (int)Math.min(initialCapacity * loadFactor, MAX_ARRAY_SIZE + 1); + } + + private DynamicIntArray[] initHashArray(int capacity) { + DynamicIntArray[] bucket = new DynamicIntArray[capacity]; + for (int i = 0; i < capacity; i++) { + bucket[i] = new DynamicIntArray(); Review comment: Not sure each bucket should be initialized to default 8K ints -- are we expecting that many collisions? 
########## File path: java/core/src/java/org/apache/orc/impl/StringHashTableDictionary.java ########## @@ -0,0 +1,189 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.orc.impl; + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.hadoop.io.Text; + + +/** + * Using HashTable to represent a dictionary. The strings are stored as UTF-8 bytes + * and an offset for each entry. It is using chaining for collision resolution. + * + * This implementation is not thread-safe. It also assumes there's no reduction in the size of hash-table + * as it shouldn't happen in the use cases for this class. + */ +public class StringHashTableDictionary implements Dictionary { + + private final DynamicByteArray byteArray = new DynamicByteArray(); + // starting offset of key-in-byte in the byte array for the i-th key. + // Two things combined stores the key array. 
+ private final DynamicIntArray keyOffsets; + + private final Text newKey = new Text(); + + private DynamicIntArray[] hashArray; + + private int capacity; + + private int threshold; + + private float loadFactor; + + private static float DEFAULT_LOAD_FACTOR = 0.75f; + + private static final int MAX_ARRAY_SIZE = Integer.MAX_VALUE - 8; + + public StringHashTableDictionary(int initialCapacity) { + this(initialCapacity, DEFAULT_LOAD_FACTOR); + } + + public StringHashTableDictionary(int initialCapacity, float loadFactor) { + this.capacity = initialCapacity; + this.loadFactor = loadFactor; + this.keyOffsets = new DynamicIntArray(initialCapacity); + this.hashArray = initHashArray(initialCapacity); + this.threshold = (int)Math.min(initialCapacity * loadFactor, MAX_ARRAY_SIZE + 1); + } + + private DynamicIntArray[] initHashArray(int capacity) { + DynamicIntArray[] bucket = new DynamicIntArray[capacity]; + for (int i = 0; i < capacity; i++) { + bucket[i] = new DynamicIntArray(); + } + return bucket; + } + + @Override + public void visit(Visitor visitor) + throws IOException { + traverse(visitor, new DictionaryUtils.VisitorContextImpl(this.byteArray, this.keyOffsets)); + } + + private void traverse(Visitor visitor, DictionaryUtils.VisitorContextImpl context) throws IOException { + for (DynamicIntArray intArray : hashArray) { + for (int i = 0; i < intArray.size() ; i ++) { + context.setPosition(intArray.get(i)); + visitor.visit(context); + } + } + } + + @Override + public void clear() { + byteArray.clear(); + keyOffsets.clear(); + Arrays.fill(hashArray, null); + } + + @Override + public void getText(Text result, int position) { + DictionaryUtils.getTextInternal(result, position, this.keyOffsets, this.byteArray); + } + + @Override + public int add(byte[] bytes, int offset, int length) { + resizeIfNeeded(); + newKey.set(bytes, offset, length); + return add(newKey); + } + + public int add(Text text) { + resizeIfNeeded(); + + int index = getIndex(text); + DynamicIntArray 
candidateArray = hashArray[index]; + + newKey.set(text); + + Text tmpText = new Text(); + for (int i = 0; i < candidateArray.size(); i++) { + getText(tmpText, candidateArray.get(i)); + if (tmpText.equals(newKey)) { + return candidateArray.get(i); + } + } + + // if making it here, it means no match. + int len = newKey.getLength(); + int currIdx = keyOffsets.size(); + keyOffsets.add(byteArray.add(newKey.getBytes(), 0, len)); + candidateArray.add(currIdx); + return currIdx; + } + + private void resizeIfNeeded() { + if (keyOffsets.size() >= threshold) { + int oldCapacity = keyOffsets.size(); + int newCapacity = (oldCapacity << 1) + 1; + doResize(newCapacity); + this.threshold = (int)Math.min(newCapacity * loadFactor, MAX_ARRAY_SIZE + 1); + } + } + + @Override + public int size() { + return keyOffsets.size(); + } + + /** + * Compute the hash value and find the corresponding index. + * + */ + int getIndex(Text text) { + return (text.hashCode() & 0x7FFFFFFF) % capacity; + } + + // Resize the hash table, re-hash all the existing keys. + // byteArray and keyOffsetsArray don't have to be re-filled. + private void doResize(int newSize) { + DynamicIntArray[] resizedHashArray = new DynamicIntArray[newSize]; + for (int i = 0; i < newSize; i++) { + resizedHashArray[i] = new DynamicIntArray(); + } + + Text tmpText = new Text(); + for (int i = 0; i < capacity; i++) { + DynamicIntArray intArray = hashArray[i]; + int bucketSize = intArray.size(); + if (bucketSize > 0) { + for (int j = 0; j < bucketSize; j++) { + getText(tmpText, intArray.get(j)); + int newIndex = getIndex(tmpText); + resizedHashArray[newIndex].add(intArray.get(j)); + } + } + } + + Arrays.fill(hashArray, null); Review comment: we reassign hashArray below, is this really needed? ########## File path: java/core/src/java/org/apache/orc/impl/StringHashTableDictionary.java ########## @@ -0,0 +1,189 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.orc.impl; + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.hadoop.io.Text; + + +/** + * Using HashTable to represent a dictionary. The strings are stored as UTF-8 bytes + * and an offset for each entry. It is using chaining for collision resolution. + * + * This implementation is not thread-safe. It also assumes there's no reduction in the size of hash-table + * as it shouldn't happen in the use cases for this class. + */ +public class StringHashTableDictionary implements Dictionary { + + private final DynamicByteArray byteArray = new DynamicByteArray(); + // starting offset of key-in-byte in the byte array for the i-th key. + // Two things combined stores the key array. 
+ private final DynamicIntArray keyOffsets; + + private final Text newKey = new Text(); + + private DynamicIntArray[] hashArray; + + private int capacity; + + private int threshold; + + private float loadFactor; + + private static float DEFAULT_LOAD_FACTOR = 0.75f; + + private static final int MAX_ARRAY_SIZE = Integer.MAX_VALUE - 8; + + public StringHashTableDictionary(int initialCapacity) { + this(initialCapacity, DEFAULT_LOAD_FACTOR); + } + + public StringHashTableDictionary(int initialCapacity, float loadFactor) { + this.capacity = initialCapacity; + this.loadFactor = loadFactor; + this.keyOffsets = new DynamicIntArray(initialCapacity); + this.hashArray = initHashArray(initialCapacity); + this.threshold = (int)Math.min(initialCapacity * loadFactor, MAX_ARRAY_SIZE + 1); + } + + private DynamicIntArray[] initHashArray(int capacity) { + DynamicIntArray[] bucket = new DynamicIntArray[capacity]; + for (int i = 0; i < capacity; i++) { + bucket[i] = new DynamicIntArray(); + } + return bucket; + } + + @Override + public void visit(Visitor visitor) + throws IOException { + traverse(visitor, new DictionaryUtils.VisitorContextImpl(this.byteArray, this.keyOffsets)); + } + + private void traverse(Visitor visitor, DictionaryUtils.VisitorContextImpl context) throws IOException { + for (DynamicIntArray intArray : hashArray) { + for (int i = 0; i < intArray.size() ; i ++) { + context.setPosition(intArray.get(i)); + visitor.visit(context); + } + } + } + + @Override + public void clear() { + byteArray.clear(); + keyOffsets.clear(); + Arrays.fill(hashArray, null); + } + + @Override + public void getText(Text result, int position) { + DictionaryUtils.getTextInternal(result, position, this.keyOffsets, this.byteArray); + } + + @Override + public int add(byte[] bytes, int offset, int length) { + resizeIfNeeded(); + newKey.set(bytes, offset, length); + return add(newKey); + } + + public int add(Text text) { + resizeIfNeeded(); + + int index = getIndex(text); + DynamicIntArray 
candidateArray = hashArray[index]; + + newKey.set(text); + + Text tmpText = new Text(); + for (int i = 0; i < candidateArray.size(); i++) { + getText(tmpText, candidateArray.get(i)); + if (tmpText.equals(newKey)) { + return candidateArray.get(i); + } + } + + // if making it here, it means no match. + int len = newKey.getLength(); + int currIdx = keyOffsets.size(); + keyOffsets.add(byteArray.add(newKey.getBytes(), 0, len)); + candidateArray.add(currIdx); + return currIdx; + } + + private void resizeIfNeeded() { + if (keyOffsets.size() >= threshold) { + int oldCapacity = keyOffsets.size(); + int newCapacity = (oldCapacity << 1) + 1; + doResize(newCapacity); + this.threshold = (int)Math.min(newCapacity * loadFactor, MAX_ARRAY_SIZE + 1); + } + } + + @Override + public int size() { + return keyOffsets.size(); + } + + /** + * Compute the hash value and find the corresponding index. + * + */ + int getIndex(Text text) { + return (text.hashCode() & 0x7FFFFFFF) % capacity; + } + + // Resize the hash table, re-hash all the existing keys. + // byteArray and keyOffsetsArray don't have to be re-filled. + private void doResize(int newSize) { + DynamicIntArray[] resizedHashArray = new DynamicIntArray[newSize]; + for (int i = 0; i < newSize; i++) { + resizedHashArray[i] = new DynamicIntArray(); + } + + Text tmpText = new Text(); + for (int i = 0; i < capacity; i++) { + DynamicIntArray intArray = hashArray[i]; + int bucketSize = intArray.size(); + if (bucketSize > 0) { Review comment: unnecessary condition -- already handled by the loop ########## File path: java/core/src/java/org/apache/orc/impl/StringHashTableDictionary.java ########## @@ -0,0 +1,189 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.orc.impl; + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.hadoop.io.Text; + + +/** + * Using HashTable to represent a dictionary. The strings are stored as UTF-8 bytes + * and an offset for each entry. It is using chaining for collision resolution. + * + * This implementation is not thread-safe. It also assumes there's no reduction in the size of hash-table + * as it shouldn't happen in the use cases for this class. + */ +public class StringHashTableDictionary implements Dictionary { + + private final DynamicByteArray byteArray = new DynamicByteArray(); + // starting offset of key-in-byte in the byte array for the i-th key. + // Two things combined stores the key array. + private final DynamicIntArray keyOffsets; + + private final Text newKey = new Text(); + + private DynamicIntArray[] hashArray; Review comment: Lets clarify what each of hashArray, keyOffsets, byteArray represent. Seems like hashArray is storing String hash (int) to KeyOffset List -- lets clarify and make variables more representative. ########## File path: java/core/src/test/org/apache/orc/impl/TestStringHashTableDictionary.java ########## @@ -0,0 +1,95 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.orc.impl; + +import org.apache.hadoop.io.Text; +import org.junit.Assert; +import org.junit.Test; + + +public class TestStringHashTableDictionary { + + /** + * A extension for {@link StringHashTableDictionary} for testing purpose by overwriting the hash function. + * + */ + private static class SimpleHashDictionary extends StringHashTableDictionary { + public SimpleHashDictionary(int initialCapacity) { + super(initialCapacity); + } + + /** + * Obtain the prefix for each string as the hash value. + * All the string being used in this test suite will contains its hash value as the prefix for the string content. + * this way we know the order of the traverse() method. 
+ */ + @Override + int getIndex(Text text) { + String s = text.toString(); + int underscore = s.indexOf("_"); + return Integer.parseInt(text.toString().substring(0, underscore)); + } + } + + @Test + public void test1() + throws Exception { + SimpleHashDictionary hashTableDictionary = new SimpleHashDictionary(5); + // Non-resize trivial cases + Assert.assertEquals(0, hashTableDictionary.getSizeInBytes()); + Assert.assertEquals(0, hashTableDictionary.add(new Text("2_Alice"))); + Assert.assertEquals(1, hashTableDictionary.add(new Text("3_Bob"))); + Assert.assertEquals(0, hashTableDictionary.add(new Text("2_Alice"))); + Assert.assertEquals(1, hashTableDictionary.add(new Text("3_Bob"))); + Assert.assertEquals(2, hashTableDictionary.add(new Text("1_Cindy"))); + + Text text = new Text(); + hashTableDictionary.getText(text, 0); + Assert.assertEquals("2_Alice", text.toString()); + hashTableDictionary.getText(text, 1); + Assert.assertEquals("3_Bob", text.toString()); + hashTableDictionary.getText(text, 2); + Assert.assertEquals("1_Cindy", text.toString()); + + // entering the fourth and fifth element which triggers rehash + Assert.assertEquals(3, hashTableDictionary.add(new Text("0_David"))); + hashTableDictionary.getText(text, 3); + Assert.assertEquals("0_David", text.toString()); + Assert.assertEquals(4, hashTableDictionary.add(new Text("4_Eason"))); + hashTableDictionary.getText(text, 4); + Assert.assertEquals("4_Eason", text.toString()); + + // Re-ensure no all previously existed string still have correct encoded value + hashTableDictionary.getText(text, 0); + Assert.assertEquals("2_Alice", text.toString()); + hashTableDictionary.getText(text, 1); + Assert.assertEquals("3_Bob", text.toString()); + hashTableDictionary.getText(text, 2); + Assert.assertEquals("1_Cindy", text.toString()); + + + // The order of words are based on each string's prefix given their index in the hashArray will be based on that. 
+ TestStringRedBlackTree + .checkContents(hashTableDictionary, new int[]{3, 2, 0, 1, 4}, "0_David", "1_Cindy", "2_Alice", "3_Bob", + "4_Eason"); + Review comment: ensure that size of both HT and RB impl is the same? ########## File path: java/core/src/java/org/apache/orc/impl/StringHashTableDictionary.java ########## @@ -0,0 +1,189 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.orc.impl; + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.hadoop.io.Text; + + +/** + * Using HashTable to represent a dictionary. The strings are stored as UTF-8 bytes + * and an offset for each entry. It is using chaining for collision resolution. + * + * This implementation is not thread-safe. It also assumes there's no reduction in the size of hash-table + * as it shouldn't happen in the use cases for this class. + */ +public class StringHashTableDictionary implements Dictionary { + + private final DynamicByteArray byteArray = new DynamicByteArray(); + // starting offset of key-in-byte in the byte array for the i-th key. + // Two things combined stores the key array. 
+ private final DynamicIntArray keyOffsets; + + private final Text newKey = new Text(); + + private DynamicIntArray[] hashArray; + + private int capacity; + + private int threshold; + + private float loadFactor; + + private static float DEFAULT_LOAD_FACTOR = 0.75f; + + private static final int MAX_ARRAY_SIZE = Integer.MAX_VALUE - 8; + + public StringHashTableDictionary(int initialCapacity) { + this(initialCapacity, DEFAULT_LOAD_FACTOR); + } + + public StringHashTableDictionary(int initialCapacity, float loadFactor) { + this.capacity = initialCapacity; + this.loadFactor = loadFactor; + this.keyOffsets = new DynamicIntArray(initialCapacity); + this.hashArray = initHashArray(initialCapacity); + this.threshold = (int)Math.min(initialCapacity * loadFactor, MAX_ARRAY_SIZE + 1); + } + + private DynamicIntArray[] initHashArray(int capacity) { + DynamicIntArray[] bucket = new DynamicIntArray[capacity]; + for (int i = 0; i < capacity; i++) { + bucket[i] = new DynamicIntArray(); + } + return bucket; + } + + @Override + public void visit(Visitor visitor) + throws IOException { + traverse(visitor, new DictionaryUtils.VisitorContextImpl(this.byteArray, this.keyOffsets)); + } + + private void traverse(Visitor visitor, DictionaryUtils.VisitorContextImpl context) throws IOException { + for (DynamicIntArray intArray : hashArray) { + for (int i = 0; i < intArray.size() ; i ++) { + context.setPosition(intArray.get(i)); + visitor.visit(context); + } + } + } + + @Override + public void clear() { + byteArray.clear(); + keyOffsets.clear(); + Arrays.fill(hashArray, null); + } + + @Override + public void getText(Text result, int position) { + DictionaryUtils.getTextInternal(result, position, this.keyOffsets, this.byteArray); + } + + @Override + public int add(byte[] bytes, int offset, int length) { + resizeIfNeeded(); + newKey.set(bytes, offset, length); + return add(newKey); + } + + public int add(Text text) { + resizeIfNeeded(); + + int index = getIndex(text); + DynamicIntArray 
candidateArray = hashArray[index]; + + newKey.set(text); + + Text tmpText = new Text(); + for (int i = 0; i < candidateArray.size(); i++) { + getText(tmpText, candidateArray.get(i)); + if (tmpText.equals(newKey)) { + return candidateArray.get(i); + } + } + + // if making it here, it means no match. + int len = newKey.getLength(); + int currIdx = keyOffsets.size(); + keyOffsets.add(byteArray.add(newKey.getBytes(), 0, len)); + candidateArray.add(currIdx); + return currIdx; + } + + private void resizeIfNeeded() { + if (keyOffsets.size() >= threshold) { + int oldCapacity = keyOffsets.size(); + int newCapacity = (oldCapacity << 1) + 1; + doResize(newCapacity); + this.threshold = (int)Math.min(newCapacity * loadFactor, MAX_ARRAY_SIZE + 1); + } + } + + @Override + public int size() { + return keyOffsets.size(); + } + + /** + * Compute the hash value and find the corresponding index. + * + */ + int getIndex(Text text) { + return (text.hashCode() & 0x7FFFFFFF) % capacity; Review comment: lets explain masking ########## File path: java/core/src/java/org/apache/orc/impl/StringHashTableDictionary.java ########## @@ -0,0 +1,189 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.orc.impl; + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.hadoop.io.Text; + + +/** + * Using HashTable to represent a dictionary. The strings are stored as UTF-8 bytes + * and an offset for each entry. It is using chaining for collision resolution. + * + * This implementation is not thread-safe. It also assumes there's no reduction in the size of hash-table + * as it shouldn't happen in the use cases for this class. + */ +public class StringHashTableDictionary implements Dictionary { + + private final DynamicByteArray byteArray = new DynamicByteArray(); + // starting offset of key-in-byte in the byte array for the i-th key. + // Two things combined stores the key array. + private final DynamicIntArray keyOffsets; + + private final Text newKey = new Text(); + + private DynamicIntArray[] hashArray; + + private int capacity; + + private int threshold; + + private float loadFactor; + + private static float DEFAULT_LOAD_FACTOR = 0.75f; + + private static final int MAX_ARRAY_SIZE = Integer.MAX_VALUE - 8; + + public StringHashTableDictionary(int initialCapacity) { + this(initialCapacity, DEFAULT_LOAD_FACTOR); + } + + public StringHashTableDictionary(int initialCapacity, float loadFactor) { + this.capacity = initialCapacity; + this.loadFactor = loadFactor; + this.keyOffsets = new DynamicIntArray(initialCapacity); + this.hashArray = initHashArray(initialCapacity); + this.threshold = (int)Math.min(initialCapacity * loadFactor, MAX_ARRAY_SIZE + 1); + } + + private DynamicIntArray[] initHashArray(int capacity) { + DynamicIntArray[] bucket = new DynamicIntArray[capacity]; + for (int i = 0; i < capacity; i++) { + bucket[i] = new DynamicIntArray(); + } + return bucket; + } + + @Override + public void visit(Visitor visitor) + throws IOException { + traverse(visitor, new DictionaryUtils.VisitorContextImpl(this.byteArray, this.keyOffsets)); + } + + private void traverse(Visitor visitor, DictionaryUtils.VisitorContextImpl 
context) throws IOException { + for (DynamicIntArray intArray : hashArray) { + for (int i = 0; i < intArray.size() ; i ++) { + context.setPosition(intArray.get(i)); + visitor.visit(context); + } + } + } + + @Override + public void clear() { + byteArray.clear(); + keyOffsets.clear(); + Arrays.fill(hashArray, null); + } + + @Override + public void getText(Text result, int position) { + DictionaryUtils.getTextInternal(result, position, this.keyOffsets, this.byteArray); + } + + @Override + public int add(byte[] bytes, int offset, int length) { + resizeIfNeeded(); + newKey.set(bytes, offset, length); + return add(newKey); + } + + public int add(Text text) { + resizeIfNeeded(); + + int index = getIndex(text); + DynamicIntArray candidateArray = hashArray[index]; + + newKey.set(text); Review comment: why are we moving text to front here? ########## File path: java/core/src/test/org/apache/orc/impl/TestStringHashTableDictionary.java ########## @@ -0,0 +1,95 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.orc.impl; + +import org.apache.hadoop.io.Text; +import org.junit.Assert; +import org.junit.Test; + + +public class TestStringHashTableDictionary { + + /** + * A extension for {@link StringHashTableDictionary} for testing purpose by overwriting the hash function. + * + */ + private static class SimpleHashDictionary extends StringHashTableDictionary { + public SimpleHashDictionary(int initialCapacity) { + super(initialCapacity); + } + + /** + * Obtain the prefix for each string as the hash value. + * All the string being used in this test suite will contains its hash value as the prefix for the string content. + * this way we know the order of the traverse() method. + */ + @Override + int getIndex(Text text) { + String s = text.toString(); + int underscore = s.indexOf("_"); + return Integer.parseInt(text.toString().substring(0, underscore)); + } + } + + @Test + public void test1() + throws Exception { + SimpleHashDictionary hashTableDictionary = new SimpleHashDictionary(5); + // Non-resize trivial cases + Assert.assertEquals(0, hashTableDictionary.getSizeInBytes()); + Assert.assertEquals(0, hashTableDictionary.add(new Text("2_Alice"))); + Assert.assertEquals(1, hashTableDictionary.add(new Text("3_Bob"))); + Assert.assertEquals(0, hashTableDictionary.add(new Text("2_Alice"))); + Assert.assertEquals(1, hashTableDictionary.add(new Text("3_Bob"))); + Assert.assertEquals(2, hashTableDictionary.add(new Text("1_Cindy"))); + Review comment: check bytes after additions? ########## File path: java/core/src/test/org/apache/orc/impl/TestStringHashTableDictionary.java ########## @@ -0,0 +1,95 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.orc.impl; + +import org.apache.hadoop.io.Text; +import org.junit.Assert; +import org.junit.Test; + + +public class TestStringHashTableDictionary { + + /** + * A extension for {@link StringHashTableDictionary} for testing purpose by overwriting the hash function. + * + */ + private static class SimpleHashDictionary extends StringHashTableDictionary { + public SimpleHashDictionary(int initialCapacity) { + super(initialCapacity); + } + + /** + * Obtain the prefix for each string as the hash value. + * All the string being used in this test suite will contains its hash value as the prefix for the string content. + * this way we know the order of the traverse() method. 
+ */ + @Override + int getIndex(Text text) { + String s = text.toString(); + int underscore = s.indexOf("_"); + return Integer.parseInt(text.toString().substring(0, underscore)); + } + } + + @Test + public void test1() + throws Exception { + SimpleHashDictionary hashTableDictionary = new SimpleHashDictionary(5); + // Non-resize trivial cases + Assert.assertEquals(0, hashTableDictionary.getSizeInBytes()); + Assert.assertEquals(0, hashTableDictionary.add(new Text("2_Alice"))); + Assert.assertEquals(1, hashTableDictionary.add(new Text("3_Bob"))); + Assert.assertEquals(0, hashTableDictionary.add(new Text("2_Alice"))); + Assert.assertEquals(1, hashTableDictionary.add(new Text("3_Bob"))); + Assert.assertEquals(2, hashTableDictionary.add(new Text("1_Cindy"))); + + Text text = new Text(); + hashTableDictionary.getText(text, 0); + Assert.assertEquals("2_Alice", text.toString()); + hashTableDictionary.getText(text, 1); + Assert.assertEquals("3_Bob", text.toString()); + hashTableDictionary.getText(text, 2); + Assert.assertEquals("1_Cindy", text.toString()); + + // entering the fourth and fifth element which triggers rehash + Assert.assertEquals(3, hashTableDictionary.add(new Text("0_David"))); + hashTableDictionary.getText(text, 3); + Assert.assertEquals("0_David", text.toString()); + Assert.assertEquals(4, hashTableDictionary.add(new Text("4_Eason"))); + hashTableDictionary.getText(text, 4); + Assert.assertEquals("4_Eason", text.toString()); + + // Re-ensure no all previously existed string still have correct encoded value + hashTableDictionary.getText(text, 0); + Assert.assertEquals("2_Alice", text.toString()); + hashTableDictionary.getText(text, 1); + Assert.assertEquals("3_Bob", text.toString()); + hashTableDictionary.getText(text, 2); + Assert.assertEquals("1_Cindy", text.toString()); + + + // The order of words are based on each string's prefix given their index in the hashArray will be based on that. 
+ TestStringRedBlackTree Review comment: This is a bit confusing -- I would just copy the checkContents method here, or create a Utility class if you want to reuse it -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [email protected]
