This is an automated email from the ASF dual-hosted git repository. emkornfield pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push: new 07b550c ARROW-5814: [Java] Implement a <Object, int> HashMap for DictionaryEncoder 07b550c is described below commit 07b550c222a54f219038880c141a595a7507e73f Author: tianchen <niki...@alibaba-inc.com> AuthorDate: Tue Jul 2 21:13:04 2019 -0700 ARROW-5814: [Java] Implement a <Object, int> HashMap for DictionaryEncoder Related to [ARROW-5814](https://issues.apache.org/jira/browse/ARROW-5814). As a follow-up of https://github.com/apache/arrow/pull/4698. Implement a Map<Object, int> for DictionaryEncoder to reduce boxing/unboxing operations. Benchmark: DictionaryEncodeHashMapBenchmarks.testHashMap: avgt 5 31151.345 ± 1661.878 ns/op DictionaryEncodeHashMapBenchmarks.testDictionaryEncodeHashMap: avgt 5 15549.902 ± 771.647 ns/op Author: tianchen <niki...@alibaba-inc.com> Closes #4765 from tianchen92/map and squashes the following commits: 38ee5a4af <tianchen> add UT f62003337 <tianchen> add javadoc and change package 10596ad87 <tianchen> fix style 86eb350b3 <tianchen> add interface 98f4c5593 <tianchen> init --- .../DictionaryEncodeHashMapBenchmarks.java | 117 +++++++ .../vector/dictionary/DictionaryEncodeHashMap.java | 368 +++++++++++++++++++++ .../arrow/vector/dictionary/ObjectIntMap.java | 50 +++ .../arrow/vector/TestDictionaryEncodeHashMap.java | 123 +++++++ 4 files changed, 658 insertions(+) diff --git a/java/performance/src/test/java/org/apache/arrow/vector/dictionary/DictionaryEncodeHashMapBenchmarks.java b/java/performance/src/test/java/org/apache/arrow/vector/dictionary/DictionaryEncodeHashMapBenchmarks.java new file mode 100644 index 0000000..e97bff2 --- /dev/null +++ b/java/performance/src/test/java/org/apache/arrow/vector/dictionary/DictionaryEncodeHashMapBenchmarks.java @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.dictionary; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Random; +import java.util.concurrent.TimeUnit; + +import org.junit.Test; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.runner.Runner; +import org.openjdk.jmh.runner.RunnerException; +import org.openjdk.jmh.runner.options.Options; +import org.openjdk.jmh.runner.options.OptionsBuilder; + +/** + * Benchmarks for {@link DictionaryEncodeHashMap}. + */ +@State(Scope.Benchmark) +public class DictionaryEncodeHashMapBenchmarks { + private static final int SIZE = 1000; + + private static final int KEY_LENGTH = 10; + + private List<String> testData = new ArrayList<>(); + + private HashMap<String, Integer> hashMap = new HashMap(); + private DictionaryEncodeHashMap<String> dictionaryEncodeHashMap = new DictionaryEncodeHashMap(); + + /** + * Setup benchmarks. + */ + @Setup + public void prepare() { + for (int i = 0; i < SIZE; i++) { + testData.add(getRandomString(KEY_LENGTH)); + } + } + + private String getRandomString(int length) { + String str = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; + Random random = new Random(); + StringBuffer sb = new StringBuffer(); + for (int i = 0; i < length; i++) { + int number = random.nextInt(62); + sb.append(str.charAt(number)); + } + return sb.toString(); + } + + /** + * Test set/get int values for {@link HashMap}. + * @return useless. To avoid DCE by JIT. + */ + @Benchmark + @BenchmarkMode(Mode.AverageTime) + @OutputTimeUnit(TimeUnit.NANOSECONDS) + public int testHashMap() { + for (int i = 0; i < SIZE; i++) { + hashMap.put(testData.get(i), i); + } + for (int i = 0; i < SIZE; i++) { + hashMap.get(testData.get(i)); + } + return 0; + } + + /** + * Test set/get int values for {@link DictionaryEncodeHashMap}. + * @return useless. To avoid DCE by JIT. + */ + @Benchmark + @BenchmarkMode(Mode.AverageTime) + @OutputTimeUnit(TimeUnit.NANOSECONDS) + public int testDictionaryEncodeHashMap() { + for (int i = 0; i < SIZE; i++) { + dictionaryEncodeHashMap.put(testData.get(i), i); + } + for (int i = 0; i < SIZE; i++) { + dictionaryEncodeHashMap.get(testData.get(i)); + } + return 0; + } + + @Test + public void evaluate() throws RunnerException { + Options opt = new OptionsBuilder() + .include(DictionaryEncodeHashMapBenchmarks.class.getSimpleName()) + .forks(1) + .build(); + + new Runner(opt).run(); + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncodeHashMap.java b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncodeHashMap.java new file mode 100644 index 0000000..659a8d6 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncodeHashMap.java @@ -0,0 +1,368 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.dictionary; + +import java.util.Objects; + +/** + * Implementation of the {@link ObjectIntMap} interface, used for DictionaryEncoder. + * Note that value in this map is always not less than 0, and -1 represents a null value. + * @param <K> key type. + */ +public class DictionaryEncodeHashMap<K> implements ObjectIntMap<K> { + + /** + * Represents a null value in map. + */ + static final int NULL_VALUE = -1; + + /** + * The default initial capacity - MUST be a power of two. + */ + static final int DEFAULT_INITIAL_CAPACITY = 1 << 4; + + /** + * The maximum capacity, used if a higher value is implicitly specified + * by either of the constructors with arguments. + */ + static final int MAXIMUM_CAPACITY = 1 << 30; + + /** + * The load factor used when none specified in constructor. + */ + static final float DEFAULT_LOAD_FACTOR = 0.75f; + + static final Entry<?>[] EMPTY_TABLE = {}; + + /** + * The table, initialized on first use, and resized as + * necessary. When allocated, length is always a power of two. + */ + transient Entry<K>[] table = (Entry<K>[]) EMPTY_TABLE; + + /** + * The number of key-value mappings contained in this map. + */ + transient int size; + + /** + * The next size value at which to resize (capacity * load factor). + */ + int threshold; + + /** + * The load factor for the hash table. + */ + final float loadFactor; + + /** + * Constructs an empty map with the specified initial capacity and load factor. + */ + public DictionaryEncodeHashMap(int initialCapacity, float loadFactor) { + if (initialCapacity < 0) { + throw new IllegalArgumentException("Illegal initial capacity: " + + initialCapacity); + } + if (initialCapacity > MAXIMUM_CAPACITY) { + initialCapacity = MAXIMUM_CAPACITY; + } + if (loadFactor <= 0 || Float.isNaN(loadFactor)) { + throw new IllegalArgumentException("Illegal load factor: " + + loadFactor); + } + this.loadFactor = loadFactor; + this.threshold = initialCapacity; + } + + public DictionaryEncodeHashMap(int initialCapacity) { + this(initialCapacity, DEFAULT_LOAD_FACTOR); + } + + public DictionaryEncodeHashMap() { + this(DEFAULT_INITIAL_CAPACITY, DEFAULT_LOAD_FACTOR); + } + + /** + * Compute the capacity with given threshold and create init table. + */ + private void inflateTable(int threshold) { + int capacity = roundUpToPowerOf2(threshold); + this.threshold = (int) Math.min(capacity * loadFactor, MAXIMUM_CAPACITY + 1); + table = new Entry[capacity]; + } + + /** + * Computes key.hashCode() and spreads (XORs) higher bits of hash to lower. + */ + static final int hash(Object key) { + int h; + return (key == null) ? 0 : (h = key.hashCode()) ^ (h >>> 16); + } + + /** + * Computes the storage location in an array for the given hashCode. + */ + static int indexFor(int h, int length) { + return h & (length - 1); + } + + /** + * Returns a power of two size for the given size. + */ + static final int roundUpToPowerOf2(int size) { + int n = size - 1; + n |= n >>> 1; + n |= n >>> 2; + n |= n >>> 4; + n |= n >>> 8; + n |= n >>> 16; + return (n < 0) ? 1 : (n >= MAXIMUM_CAPACITY) ? MAXIMUM_CAPACITY : n + 1; + } + + /** + * Returns the value to which the specified key is mapped, + * or -1 if this map contains no mapping for the key. + */ + @Override + public int get(K key) { + int hash = hash(key); + int index = indexFor(hash, table.length); + for (Entry<K> e = table[index] ; e != null ; e = e.next) { + if ((e.hash == hash) && e.key.equals(key)) { + return e.value; + } + } + return NULL_VALUE; + } + + /** + * Associates the specified value with the specified key in this map. + * If the map previously contained a mapping for the key, the old + * value is replaced. + */ + @Override + public int put(K key, int value) { + if (table == EMPTY_TABLE) { + inflateTable(threshold); + } + + if (key == null) { + return putForNullKey(value); + } + + int hash = hash(key); + int i = indexFor(hash, table.length); + for (Entry<K> e = table[i]; e != null; e = e.next) { + Object k; + if (e.hash == hash && ((k = e.key) == key || key.equals(k))) { + int oldValue = e.value; + e.value = value; + return oldValue; + } + } + + addEntry(hash, key, value, i); + return NULL_VALUE; + } + + /** + * Removes the mapping for the specified key from this map if present. + * @param key key whose mapping is to be removed from the map + * @return the previous value associated with <tt>key</tt>, or + * -1 if there was no mapping for <tt>key</tt>. + */ + @Override + public int remove(K key) { + Entry<K> e = removeEntryForKey(key); + return (e == null ? NULL_VALUE : e.value); + } + + /** + * Create a new Entry at the specific position of table. + */ + void createEntry(int hash, K key, int value, int bucketIndex) { + Entry<K> e = table[bucketIndex]; + table[bucketIndex] = new Entry<>(hash, key, value, e); + size++; + } + + /** + * Put value when key is null. + */ + private int putForNullKey(int value) { + for (Entry<K> e = table[0]; e != null; e = e.next) { + if (e.key == null) { + int oldValue = e.value; + e.value = value; + return oldValue; + } + } + addEntry(0, null, value, 0); + return NULL_VALUE; + } + + /** + * Add Entry at the specified location of the table. + */ + void addEntry(int hash, K key, int value, int bucketIndex) { + if ((size >= threshold) && (null != table[bucketIndex])) { + resize(2 * table.length); + hash = (null != key) ? hash(key) : 0; + bucketIndex = indexFor(hash, table.length); + } + + createEntry(hash, key, value, bucketIndex); + } + + /** + * Resize table with given new capacity. + */ + void resize(int newCapacity) { + Entry[] oldTable = table; + int oldCapacity = oldTable.length; + if (oldCapacity == MAXIMUM_CAPACITY) { + threshold = Integer.MAX_VALUE; + return; + } + + Entry[] newTable = new Entry[newCapacity]; + transfer(newTable); + table = newTable; + threshold = (int)Math.min(newCapacity * loadFactor, MAXIMUM_CAPACITY + 1); + } + + /** + * Transfer entries into new table from old table. + * @param newTable new table + */ + void transfer(Entry[] newTable) { + int newCapacity = newTable.length; + for (Entry<K> e : table) { + while (null != e) { + Entry<K> next = e.next; + int i = indexFor(e.hash, newCapacity); + e.next = newTable[i]; + newTable[i] = e; + e = next; + } + } + } + + /** + * Remove entry associated with the given key. + */ + final Entry<K> removeEntryForKey(Object key) { + if (size == 0) { + return null; + } + int hash = (key == null) ? 0 : hash(key); + int i = indexFor(hash, table.length); + Entry<K> prev = table[i]; + Entry<K> e = prev; + + while (e != null) { + Entry<K> next = e.next; + Object k; + if (e.hash == hash && ((k = e.key) == key || (key != null && key.equals(k)))) { + size--; + if (prev == e) { + table[i] = next; + } else { + prev.next = next; + } + + return e; + } + prev = e; + e = next; + } + + return e; + } + + /** + * Returns the number of mappings in this Map. + */ + public int size() { + return size; + } + + /** + * Removes all elements from this map, leaving it empty. + */ + public void clear() { + size = 0; + for (int i = 0; i < table.length; i++) { + table[i] = null; + } + } + + /** + * Class to keep key-value data within hash map. + * @param <K> key type. + */ + static class Entry<K> { + final K key; + int value; + Entry<K> next; + int hash; + + Entry(int hash, K key, int value, Entry<K> next) { + this.key = key; + this.value = value; + this.hash = hash; + this.next = next; + } + + public final K getKey() { + return key; + } + + public final int getValue() { + return value; + } + + public final int setValue(int newValue) { + int oldValue = value; + value = newValue; + return oldValue; + } + + public final boolean equals(Object o) { + if (!(o instanceof DictionaryEncodeHashMap.Entry)) { + return false; + } + Entry e = (Entry) o; + if (Objects.equals(key, e.getKey())) { + if (value == e.getValue()) { + return true; + } + } + return false; + } + + public final int hashCode() { + return Objects.hashCode(key) ^ Objects.hashCode(value); + } + + public final String toString() { + return getKey() + "=" + getValue(); + } + } + +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/ObjectIntMap.java b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/ObjectIntMap.java new file mode 100644 index 0000000..582bb56 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/ObjectIntMap.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.dictionary; + +/** + * Specific hash map for int type value, reducing boxing/unboxing operations. + * @param <K> key type. + */ +public interface ObjectIntMap<K> { + + /** + * Associates the specified value with the specified key in this map. + * If the map previously contained a mapping for the key, the old + * value is replaced. + * @param key key with which the specified value is to be associated + * @param value value to be associated with the specified key + * @return the previous value associated with <tt>key</tt>, or + * -1 if there was no mapping for <tt>key</tt>. + */ + int put(K key, int value); + + /** + * Returns the value to which the specified key is mapped, + * or -1 if this map contains no mapping for the key. + */ + int get(K key); + + /** + * Removes the mapping for the specified key from this map if present. + * @param key key whose mapping is to be removed from the map + * @return the previous value associated with <tt>key</tt>, or + * -1 if there was no mapping for <tt>key</tt>. + */ + int remove(K key); +} diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryEncodeHashMap.java b/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryEncodeHashMap.java new file mode 100644 index 0000000..5f4e710 --- /dev/null +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryEncodeHashMap.java @@ -0,0 +1,123 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import static org.junit.Assert.assertEquals; + +import java.util.ArrayList; +import java.util.List; +import java.util.Random; + +import org.apache.arrow.vector.dictionary.DictionaryEncodeHashMap; + +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + + + +public class TestDictionaryEncodeHashMap { + + private List<String> testData = new ArrayList<>(); + + private static final int SIZE = 100; + + private static final int KEY_LENGTH = 5; + + private DictionaryEncodeHashMap<String> map = new DictionaryEncodeHashMap(); + + @Before + public void init() { + for (int i = 0; i < SIZE; i++) { + testData.add(generateUniqueKey(KEY_LENGTH)); + } + } + + @After + public void terminate() throws Exception { + testData.clear(); + } + + private String generateUniqueKey(int length) { + String str = "abcdefghijklmnopqrstuvwxyz"; + Random random = new Random(); + StringBuffer sb = new StringBuffer(); + for (int i = 0; i < length; i++) { + int number = random.nextInt(26); + sb.append(str.charAt(number)); + } + if (testData.contains(sb.toString())) { + return generateUniqueKey(length); + } + return sb.toString(); + } + + @Test + public void testPutAndGet() { + for (int i = 0; i < SIZE; i++) { + map.put(testData.get(i), i); + } + + for (int i = 0; i < SIZE; i++) { + assertEquals(i, map.get(testData.get(i))); + } + } + + @Test + public void testPutExistKey() { + for (int i = 0; i < SIZE; i++) { + map.put(testData.get(i), i); + } + map.put("test_key", 101); + assertEquals(101, map.get("test_key")); + map.put("test_key", 102); + assertEquals(102, map.get("test_key")); + } + + @Test + public void testGetNonExistKey() { + for (int i = 0; i < SIZE; i++) { + map.put(testData.get(i), i); + } + //remove if exists. + map.remove("test_key"); + assertEquals(-1, map.get("test_key")); + } + + @Test + public void testRemove() { + for (int i = 0; i < SIZE; i++) { + map.put(testData.get(i), i); + } + map.put("test_key", 10000); + assertEquals(SIZE + 1, map.size()); + assertEquals(10000, map.get("test_key")); + map.remove("test_key"); + assertEquals(SIZE, map.size()); + assertEquals(-1, map.get("test_key")); + } + + @Test + public void testSize() { + assertEquals(0, map.size()); + for (int i = 0; i < SIZE; i++) { + map.put(testData.get(i), i); + } + assertEquals(SIZE, map.size()); + } +}