This is an automated email from the ASF dual-hosted git repository.

emkornfield pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 4b971ee  ARROW-6185: [Java] Provide hash table based dictionary builder
4b971ee is described below

commit 4b971ee0948bc12ef9955f743882bd1ce3452231
Author: liyafan82 <fan_li...@foxmail.com>
AuthorDate: Thu Aug 15 20:19:45 2019 -0700

    ARROW-6185: [Java] Provide hash table based dictionary builder
    
    This is related ARROW-5862. We provide another type of dictionary builder 
based on hash table. Compared with a search based dictionary encoder, a hash 
table based encoder process each new element in O(1) time, but require extra 
memory space.
    
    Closes #5054 from liyafan82/fly_0809_hashbuild and squashes the following 
commits:
    
    77e24531e <liyafan82>  Provide hash table based dictionary builder
    
    Authored-by: liyafan82 <fan_li...@foxmail.com>
    Signed-off-by: Micah Kornfield <emkornfi...@gmail.com>
---
 .../HashTableBasedDictionaryBuilder.java           | 174 ++++++++++++++++++
 .../TestHashTableBasedDictionaryEncoder.java       | 203 +++++++++++++++++++++
 2 files changed, 377 insertions(+)

diff --git 
a/java/algorithm/src/main/java/org/apache/arrow/algorithm/dictionary/HashTableBasedDictionaryBuilder.java
 
b/java/algorithm/src/main/java/org/apache/arrow/algorithm/dictionary/HashTableBasedDictionaryBuilder.java
new file mode 100644
index 0000000..eff0f05
--- /dev/null
+++ 
b/java/algorithm/src/main/java/org/apache/arrow/algorithm/dictionary/HashTableBasedDictionaryBuilder.java
@@ -0,0 +1,174 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.algorithm.dictionary;
+
+import java.util.HashMap;
+
+import org.apache.arrow.memory.util.ArrowBufPointer;
+import org.apache.arrow.memory.util.hash.ArrowBufHasher;
+import org.apache.arrow.memory.util.hash.SimpleHasher;
+import org.apache.arrow.vector.ElementAddressableVector;
+
+/**
+ * A dictionary builder is intended for the scenario frequently encountered in 
practice:
+ * the dictionary is not known a priori, so it is generated dynamically.
+ * In particular, when a new value arrives, it is tested to check if it is 
already
+ * in the dictionary. If so, it is simply neglected, otherwise, it is added to 
the dictionary.
+ *
+ * <p>
+ * This class builds the dictionary based on a hash table.
+ * Each add operation can be finished in O(1) time,
+ * where n is the current dictionary size.
+ * </p>
+ * <p>
+ * The dictionary builder is intended to build a single dictionary.
+ * So it cannot be used for different dictionaries.
+ * </p>
+ * <p>Below gives the sample code for using the dictionary builder
+ * <pre>{@code
+ * HashTableBasedDictionaryBuilder dictionaryBuilder = ...
+ * ...
+ * dictionaryBuild.addValue(newValue);
+ * ...
+ * }</pre>
+ * </p>
+ * <p>
+ *   With the above code, the dictionary vector will be populated,
+ *   and it can be retrieved by the {@link 
HashTableBasedDictionaryBuilder#getDictionary()} method.
+ *   After that, dictionary encoding can proceed with the populated dictionary 
encoder.
+ * </p>
+ *
+ * @param <V> the dictionary vector type.
+ */
+public class HashTableBasedDictionaryBuilder<V extends 
ElementAddressableVector> {
+
+  /**
+   * The dictionary to be built.
+   */
+  private final V dictionary;
+
+  /**
+   * If null should be encoded.
+   */
+  private final boolean encodeNull;
+
+  /**
+   * The hash map for distinct dictionary entries.
+   * The key is the pointer to the dictionary element, whereas the value is 
the index in the dictionary.
+   */
+  private HashMap<ArrowBufPointer, Integer> hashMap = new HashMap<>();
+
+  /**
+   * The hasher used for calculating the hash code.
+   */
+  private final ArrowBufHasher hasher;
+
+  /**
+   * Next pointer to try to add to the hash table.
+   */
+  private ArrowBufPointer nextPointer;
+
+  /**
+   * Constructs a hash table based dictionary builder.
+   *
+   * @param dictionary the dictionary to populate.
+   */
+  public HashTableBasedDictionaryBuilder(V dictionary) {
+    this(dictionary, false);
+  }
+
+  /**
+   * Constructs a hash table based dictionary builder.
+   *
+   * @param dictionary the dictionary to populate.
+   * @param encodeNull if null values should be added to the dictionary.
+   */
+  public HashTableBasedDictionaryBuilder(V dictionary, boolean encodeNull) {
+    this(dictionary, encodeNull, SimpleHasher.INSTANCE);
+  }
+
+  /**
+   * Constructs a hash table based dictionary builder.
+   *
+   * @param dictionary the dictionary to populate.
+   * @param encodeNull if null values should be added to the dictionary.
+   * @param hasher     the hasher used to compute the hash code.
+   */
+  public HashTableBasedDictionaryBuilder(V dictionary, boolean encodeNull, 
ArrowBufHasher hasher) {
+    this.dictionary = dictionary;
+    this.encodeNull = encodeNull;
+    this.hasher = hasher;
+    this.nextPointer = new ArrowBufPointer(hasher);
+  }
+
+  /**
+   * Gets the dictionary built.
+   *
+   * @return the dictionary.
+   */
+  public V getDictionary() {
+    return dictionary;
+  }
+
+  /**
+   * Try to add all values from the target vector to the dictionary.
+   *
+   * @param targetVector the target vector containing values to probe.
+   * @return the number of values actually added to the dictionary.
+   */
+  public int addValues(V targetVector) {
+    int ret = 0;
+    for (int i = 0; i < targetVector.getValueCount(); i++) {
+      if (!encodeNull && targetVector.isNull(i)) {
+        continue;
+      }
+      if (addValue(targetVector, i)) {
+        ret += 1;
+      }
+    }
+    return ret;
+  }
+
+  /**
+   * Try to add an element from the target vector to the dictionary.
+   *
+   * @param targetVector the target vector containing new element.
+   * @param targetIndex  the index of the new element in the target vector.
+   * @return true if the element is added to the dictionary, and false 
otherwise.
+   */
+  public boolean addValue(V targetVector, int targetIndex) {
+    targetVector.getDataPointer(targetIndex, nextPointer);
+
+    if (!hashMap.containsKey(nextPointer)) {
+      // a new dictionary element is found
+
+      // insert it to the dictionary
+      int dictSize = dictionary.getValueCount();
+      dictionary.copyFromSafe(targetIndex, dictSize, targetVector);
+      dictionary.setValueCount(dictSize + 1);
+      dictionary.getDataPointer(dictSize, nextPointer);
+
+      // insert it to the hash map
+      hashMap.put(nextPointer, dictSize);
+      nextPointer = new ArrowBufPointer(hasher);
+
+      return true;
+    }
+    return false;
+  }
+}
diff --git 
a/java/algorithm/src/test/java/org/apache/arrow/algorithm/dictionary/TestHashTableBasedDictionaryEncoder.java
 
b/java/algorithm/src/test/java/org/apache/arrow/algorithm/dictionary/TestHashTableBasedDictionaryEncoder.java
new file mode 100644
index 0000000..bdad2ed
--- /dev/null
+++ 
b/java/algorithm/src/test/java/org/apache/arrow/algorithm/dictionary/TestHashTableBasedDictionaryEncoder.java
@@ -0,0 +1,203 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.algorithm.dictionary;
+
+import static junit.framework.TestCase.assertTrue;
+import static org.junit.Assert.assertNull;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.memory.RootAllocator;
+import org.apache.arrow.vector.IntVector;
+import org.apache.arrow.vector.VarCharVector;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Test cases for {@link HashTableBasedDictionaryBuilder}.
+ */
+public class TestHashTableBasedDictionaryEncoder {
+
+  private BufferAllocator allocator;
+
+  @Before
+  public void prepare() {
+    allocator = new RootAllocator(1024 * 1024);
+  }
+
+  @After
+  public void shutdown() {
+    allocator.close();
+  }
+
+  @Test
+  public void testBuildVariableWidthDictionaryWithNull() {
+    try (VarCharVector vec = new VarCharVector("", allocator);
+         VarCharVector dictionary = new VarCharVector("", allocator)) {
+
+      vec.allocateNew(100, 10);
+      vec.setValueCount(10);
+
+      dictionary.allocateNew();
+
+      // fill data
+      vec.set(0, "hello".getBytes());
+      vec.set(1, "abc".getBytes());
+      vec.setNull(2);
+      vec.set(3, "world".getBytes());
+      vec.set(4, "12".getBytes());
+      vec.set(5, "dictionary".getBytes());
+      vec.setNull(6);
+      vec.set(7, "hello".getBytes());
+      vec.set(8, "good".getBytes());
+      vec.set(9, "abc".getBytes());
+
+      HashTableBasedDictionaryBuilder<VarCharVector> dictionaryBuilder =
+              new HashTableBasedDictionaryBuilder<>(dictionary, true);
+
+      int result = dictionaryBuilder.addValues(vec);
+
+      assertEquals(7, result);
+      assertEquals(7, dictionary.getValueCount());
+
+      assertEquals("hello", new String(dictionary.get(0)));
+      assertEquals("abc", new String(dictionary.get(1)));
+      assertNull(dictionary.get(2));
+      assertEquals("world", new String(dictionary.get(3)));
+      assertEquals("12", new String(dictionary.get(4)));
+      assertEquals("dictionary", new String(dictionary.get(5)));
+      assertEquals("good", new String(dictionary.get(6)));
+    }
+  }
+
+  @Test
+  public void testBuildVariableWidthDictionaryWithoutNull() {
+    try (VarCharVector vec = new VarCharVector("", allocator);
+         VarCharVector dictionary = new VarCharVector("", allocator)) {
+
+      vec.allocateNew(100, 10);
+      vec.setValueCount(10);
+
+      dictionary.allocateNew();
+
+      // fill data
+      vec.set(0, "hello".getBytes());
+      vec.set(1, "abc".getBytes());
+      vec.setNull(2);
+      vec.set(3, "world".getBytes());
+      vec.set(4, "12".getBytes());
+      vec.set(5, "dictionary".getBytes());
+      vec.setNull(6);
+      vec.set(7, "hello".getBytes());
+      vec.set(8, "good".getBytes());
+      vec.set(9, "abc".getBytes());
+
+      HashTableBasedDictionaryBuilder<VarCharVector> dictionaryBuilder =
+              new HashTableBasedDictionaryBuilder<>(dictionary, false);
+
+      int result = dictionaryBuilder.addValues(vec);
+
+      assertEquals(6, result);
+      assertEquals(6, dictionary.getValueCount());
+
+      assertEquals("hello", new String(dictionary.get(0)));
+      assertEquals("abc", new String(dictionary.get(1)));
+      assertEquals("world", new String(dictionary.get(2)));
+      assertEquals("12", new String(dictionary.get(3)));
+      assertEquals("dictionary", new String(dictionary.get(4)));
+      assertEquals("good", new String(dictionary.get(5)));
+
+    }
+  }
+
+  @Test
+  public void testBuildFixedWidthDictionaryWithNull() {
+    try (IntVector vec = new IntVector("", allocator);
+         IntVector dictionary = new IntVector("", allocator)) {
+      vec.allocateNew(10);
+      vec.setValueCount(10);
+
+      dictionary.allocateNew();
+
+      // fill data
+      vec.set(0, 4);
+      vec.set(1, 8);
+      vec.set(2, 32);
+      vec.set(3, 8);
+      vec.set(4, 16);
+      vec.set(5, 32);
+      vec.setNull(6);
+      vec.set(7, 4);
+      vec.set(8, 4);
+      vec.setNull(9);
+
+      HashTableBasedDictionaryBuilder<IntVector> dictionaryBuilder =
+              new HashTableBasedDictionaryBuilder<>(dictionary, true);
+
+      int result = dictionaryBuilder.addValues(vec);
+
+      assertEquals(5, result);
+      assertEquals(5, dictionary.getValueCount());
+
+      assertEquals(4, dictionary.get(0));
+      assertEquals(8, dictionary.get(1));
+      assertEquals(32, dictionary.get(2));
+      assertEquals(16, dictionary.get(3));
+      assertTrue(dictionary.isNull(4));
+    }
+  }
+
+  @Test
+  public void testBuildFixedWidthDictionaryWithoutNull() {
+    try (IntVector vec = new IntVector("", allocator);
+         IntVector dictionary = new IntVector("", allocator)) {
+      vec.allocateNew(10);
+      vec.setValueCount(10);
+
+      dictionary.allocateNew();
+
+      // fill data
+      vec.set(0, 4);
+      vec.set(1, 8);
+      vec.set(2, 32);
+      vec.set(3, 8);
+      vec.set(4, 16);
+      vec.set(5, 32);
+      vec.setNull(6);
+      vec.set(7, 4);
+      vec.set(8, 4);
+      vec.setNull(9);
+
+      HashTableBasedDictionaryBuilder<IntVector> dictionaryBuilder =
+              new HashTableBasedDictionaryBuilder<>(dictionary, false);
+
+      int result = dictionaryBuilder.addValues(vec);
+
+      assertEquals(4, result);
+      assertEquals(4, dictionary.getValueCount());
+
+      assertEquals(4, dictionary.get(0));
+      assertEquals(8, dictionary.get(1));
+      assertEquals(32, dictionary.get(2));
+      assertEquals(16, dictionary.get(3));
+
+    }
+  }
+}

Reply via email to