This is an automated email from the ASF dual-hosted git repository.

emkornfield pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new e1559f2  ARROW-5902: [Java] Implement hash table and equals & hashCode 
API for dictionary encoding
e1559f2 is described below

commit e1559f21abfd385b56b9e5c18ba31c997ee581d7
Author: tianchen <niki...@alibaba-inc.com>
AuthorDate: Thu Jul 18 20:36:54 2019 -0700

    ARROW-5902: [Java] Implement hash table and equals & hashCode API for 
dictionary encoding
    
    As discussed in https://github.com/apache/arrow/pull/4792
    
    Implement a hash table to only store hash & index, meanwhile add check 
equal function in ValueVector API.
    
    Author: tianchen <niki...@alibaba-inc.com>
    
    Closes #4846 from tianchen92/hasher and squashes the following commits:
    
    2db730268 <tianchen> fix
    5facc2a51 <tianchen> resolve comments
    175192a38 <tianchen> fix test and style
    7a87526e3 <tianchen> implementation of equals and hashCode
    c89608b7a <tianchen> fix
    8f2e1a2f7 <tianchen> hash table prototype
---
 .../src/main/codegen/templates/UnionVector.java    |  51 +++-
 .../apache/arrow/vector/BaseFixedWidthVector.java  |  30 +++
 .../arrow/vector/BaseVariableWidthVector.java      |  30 +++
 .../java/org/apache/arrow/vector/ValueVector.java  |  14 ++
 .../org/apache/arrow/vector/VarCharVector.java     |   1 -
 .../java/org/apache/arrow/vector/ZeroVector.java   |  10 +
 .../arrow/vector/complex/FixedSizeListVector.java  |  32 +++
 .../apache/arrow/vector/complex/ListVector.java    |  43 ++++
 .../vector/complex/NonNullableStructVector.java    |  51 ++++
 .../apache/arrow/vector/complex/StructVector.java  |  10 +
 .../arrow/vector/dictionary/DictionaryEncoder.java |  26 +-
 .../vector/dictionary/DictionaryHashTable.java     | 268 +++++++++++++++++++++
 .../arrow/vector/util/ByteFunctionHelpers.java     |  42 ++++
 .../arrow/vector/types/pojo/TestExtensionType.java |  11 +
 14 files changed, 587 insertions(+), 32 deletions(-)

diff --git a/java/vector/src/main/codegen/templates/UnionVector.java 
b/java/vector/src/main/codegen/templates/UnionVector.java
index 04eed72..b05005d 100644
--- a/java/vector/src/main/codegen/templates/UnionVector.java
+++ b/java/vector/src/main/codegen/templates/UnionVector.java
@@ -17,6 +17,7 @@
 
 import io.netty.buffer.ArrowBuf;
 import org.apache.arrow.memory.ReferenceManager;
+import org.apache.arrow.vector.ValueVector;
 import org.apache.arrow.vector.types.pojo.FieldType;
 
 <@pp.dropOutputFile />
@@ -491,32 +492,35 @@ public class UnionVector implements FieldVector {
     return vectors.iterator();
   }
 
-
-    public Object getObject(int index) {
+    private ValueVector getVector(int index) {
       int type = typeBuffer.getByte(index * TYPE_WIDTH);
       switch (MinorType.values()[type]) {
-      case NULL:
-        return null;
+        case NULL:
+          return null;
       <#list vv.types as type>
         <#list type.minor as minor>
           <#assign name = minor.class?cap_first />
           <#assign fields = minor.fields!type.fields />
           <#assign uncappedName = name?uncap_first/>
           <#if !minor.typeParams?? >
-      case ${name?upper_case}:
-          return get${name}Vector().getObject(index);
+        case ${name?upper_case}:
+        return get${name}Vector();
           </#if>
         </#list>
       </#list>
-      case STRUCT:
-        return getStruct().getObject(index);
-      case LIST:
-        return getList().getObject(index);
-      default:
-        throw new UnsupportedOperationException("Cannot support type: " + 
MinorType.values()[type]);
+        case STRUCT:
+          return getStruct();
+        case LIST:
+          return getList();
+        default:
+          throw new UnsupportedOperationException("Cannot support type: " + 
MinorType.values()[type]);
       }
     }
 
+    public Object getObject(int index) {
+      return getVector(index).getObject(index);
+    }
+
     public byte[] get(int index) {
       return null;
     }
@@ -622,4 +626,27 @@ public class UnionVector implements FieldVector {
     private int getTypeBufferValueCapacity() {
       return typeBuffer.capacity() / TYPE_WIDTH;
     }
+
+    @Override
+    public int hashCode(int index) {
+      return getVector(index).hashCode(index);
+    }
+
+    @Override
+    public boolean equals(int index, ValueVector to, int toIndex) {
+      if (to == null) {
+        return false;
+      }
+      if (this.getClass() != to.getClass()) {
+        return false;
+      }
+      UnionVector that = (UnionVector) to;
+      ValueVector leftVector = getVector(index);
+      ValueVector rightVector = that.getVector(toIndex);
+
+      if (leftVector.getClass() != rightVector.getClass()) {
+        return false;
+      }
+      return leftVector.equals(index, rightVector, toIndex);
+    }
 }
diff --git 
a/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java 
b/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java
index 8feca75..b0d716a 100644
--- 
a/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java
+++ 
b/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java
@@ -25,6 +25,7 @@ import java.util.List;
 import org.apache.arrow.memory.BufferAllocator;
 import org.apache.arrow.vector.ipc.message.ArrowFieldNode;
 import org.apache.arrow.vector.types.pojo.Field;
+import org.apache.arrow.vector.util.ByteFunctionHelpers;
 import org.apache.arrow.vector.util.CallBack;
 import org.apache.arrow.vector.util.OversizedAllocationException;
 import org.apache.arrow.vector.util.TransferPair;
@@ -837,4 +838,33 @@ public abstract class BaseFixedWidthVector extends 
BaseValueVector
     handleSafe(thisIndex);
     copyFrom(fromIndex, thisIndex, from);
   }
+
+  @Override
+  public int hashCode(int index) {
+    int start = typeWidth * index;
+    int end = typeWidth * (index + 1);
+    return ByteFunctionHelpers.hash(this.getDataBuffer(), start, end);
+  }
+
+  @Override
+  public boolean equals(int index, ValueVector to, int toIndex) {
+    if (to == null) {
+      return false;
+    }
+    if (this.getClass() != to.getClass()) {
+      return false;
+    }
+
+    BaseFixedWidthVector that = (BaseFixedWidthVector) to;
+
+    int leftStart = typeWidth * index;
+    int leftEnd = typeWidth * (index + 1);
+
+    int rightStart = typeWidth * toIndex;
+    int rightEnd = typeWidth * (toIndex + 1);
+
+    int ret = ByteFunctionHelpers.equal(this.getDataBuffer(), leftStart, 
leftEnd,
+        that.getDataBuffer(), rightStart, rightEnd);
+    return ret == 1;
+  }
 }
diff --git 
a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java
 
b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java
index 5262f33..19fcc67 100644
--- 
a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java
+++ 
b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java
@@ -27,6 +27,7 @@ import org.apache.arrow.memory.BufferAllocator;
 import org.apache.arrow.memory.OutOfMemoryException;
 import org.apache.arrow.vector.ipc.message.ArrowFieldNode;
 import org.apache.arrow.vector.types.pojo.Field;
+import org.apache.arrow.vector.util.ByteFunctionHelpers;
 import org.apache.arrow.vector.util.CallBack;
 import org.apache.arrow.vector.util.OversizedAllocationException;
 import org.apache.arrow.vector.util.TransferPair;
@@ -1334,4 +1335,33 @@ public abstract class BaseVariableWidthVector extends 
BaseValueVector
     }
     lastSet = thisIndex;
   }
+
+  @Override
+  public int hashCode(int index) {
+    final int start = getStartOffset(index);
+    final int end = getStartOffset(index + 1);
+    return ByteFunctionHelpers.hash(this.getDataBuffer(), start, end);
+  }
+
+  @Override
+  public boolean equals(int index, ValueVector to, int toIndex) {
+    if (to == null) {
+      return false;
+    }
+    if (this.getClass() != to.getClass()) {
+      return false;
+    }
+
+    BaseVariableWidthVector that = (BaseVariableWidthVector) to;
+
+    final int leftStart = getStartOffset(index);
+    final int leftEnd = getStartOffset(index + 1);
+
+    final int rightStart = that.getStartOffset(toIndex);
+    final int rightEnd = that.getStartOffset(toIndex + 1);
+
+    int ret = ByteFunctionHelpers.equal(this.getDataBuffer(), leftStart, 
leftEnd,
+        that.getDataBuffer(), rightStart, rightEnd);
+    return ret == 1;
+  }
 }
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java 
b/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java
index 86a381a..795493a 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java
@@ -237,4 +237,18 @@ public interface ValueVector extends Closeable, 
Iterable<ValueVector> {
    * @return true if element is null
    */
   boolean isNull(int index);
+
+  /**
+   * Returns hashCode of element in index.
+   */
+  int hashCode(int index);
+
+  /**
+   * Check whether the element in index equals to the element in targetIndex 
from the target vector.
+   * @param index index to compare in this vector
+   * @param target target vector
+   * @param targetIndex index to compare in target vector
+   * @return true if equals, otherwise false.
+   */
+  boolean equals(int index, ValueVector target, int targetIndex);
 }
diff --git 
a/java/vector/src/main/java/org/apache/arrow/vector/VarCharVector.java 
b/java/vector/src/main/java/org/apache/arrow/vector/VarCharVector.java
index c012ce3..7a914d9 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/VarCharVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/VarCharVector.java
@@ -270,7 +270,6 @@ public class VarCharVector extends BaseVariableWidthVector {
     setSafe(index, text.getBytes(), 0, text.getLength());
   }
 
-
   /*----------------------------------------------------------------*
    |                                                                |
    |                      vector transfer                           |
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java 
b/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java
index 37784ed..b34aa4d 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java
@@ -244,4 +244,14 @@ public class ZeroVector implements FieldVector {
   public boolean isNull(int index) {
     return false;
   }
+
+  @Override
+  public int hashCode(int index) {
+    return 0;
+  }
+
+  @Override
+  public boolean equals(int index, ValueVector to, int toIndex) {
+    return false;
+  }
 }
diff --git 
a/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java
 
b/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java
index 1d2c8db..c665bef 100644
--- 
a/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java
+++ 
b/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java
@@ -45,6 +45,7 @@ import org.apache.arrow.vector.types.pojo.ArrowType;
 import org.apache.arrow.vector.types.pojo.DictionaryEncoding;
 import org.apache.arrow.vector.types.pojo.Field;
 import org.apache.arrow.vector.types.pojo.FieldType;
+import org.apache.arrow.vector.util.ByteFunctionHelpers;
 import org.apache.arrow.vector.util.CallBack;
 import org.apache.arrow.vector.util.JsonStringArrayList;
 import org.apache.arrow.vector.util.OversizedAllocationException;
@@ -502,6 +503,37 @@ public class FixedSizeListVector extends BaseValueVector 
implements FieldVector,
     return new TransferImpl((FixedSizeListVector) target);
   }
 
+  @Override
+  public int hashCode(int index) {
+    if (isSet(index) == 0) {
+      return 0;
+    }
+    int hash = 0;
+    for (int i = 0; i < listSize; i++) {
+      hash = ByteFunctionHelpers.comebineHash(hash, vector.hashCode(index * 
listSize + i));
+    }
+    return hash;
+  }
+
+  @Override
+  public boolean equals(int index, ValueVector to, int toIndex) {
+    if (to == null) {
+      return false;
+    }
+    if (this.getClass() != to.getClass()) {
+      return false;
+    }
+
+    FixedSizeListVector that = (FixedSizeListVector) to;
+
+    for (int i = 0; i < listSize; i++) {
+      if (!vector.equals(index * listSize + i, that, toIndex * listSize + i)) {
+        return false;
+      }
+    }
+    return true;
+  }
+
   private class TransferImpl implements TransferPair {
 
     FixedSizeListVector to;
diff --git 
a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java 
b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java
index 945f50f..43b43bd 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java
@@ -411,6 +411,49 @@ public class ListVector extends BaseRepeatedValueVector 
implements FieldVector,
     return offsetBuffer;
   }
 
+  @Override
+  public int hashCode(int index) {
+    if (isSet(index) == 0) {
+      return 0;
+    }
+    int hash = 0;
+    final int start = offsetBuffer.getInt(index * OFFSET_WIDTH);
+    final int end = offsetBuffer.getInt((index + 1) * OFFSET_WIDTH);
+    for (int i = start; i < end; i++) {
+      hash = 31 * vector.hashCode(i);
+    }
+    return hash;
+  }
+
+  @Override
+  public boolean equals(int index, ValueVector to, int toIndex) {
+    if (to == null) {
+      return false;
+    }
+    if (this.getClass() != to.getClass()) {
+      return false;
+    }
+
+    ListVector that = (ListVector) to;
+    final int leftStart = offsetBuffer.getInt(index * OFFSET_WIDTH);
+    final int leftEnd = offsetBuffer.getInt((index + 1) * OFFSET_WIDTH);
+
+    final int rightStart = that.offsetBuffer.getInt(toIndex * OFFSET_WIDTH);
+    final int rightEnd = that.offsetBuffer.getInt((toIndex + 1) * 
OFFSET_WIDTH);
+
+    if ((leftEnd - leftStart) != (rightEnd - rightStart)) {
+      return false;
+    }
+
+    for (int i = 0; i < (leftEnd - leftStart); i++) {
+      if (!vector.equals(leftStart + i, that.vector, rightStart + i)) {
+        return false;
+      }
+    }
+
+    return true;
+  }
+
   private class TransferImpl implements TransferPair {
 
     ListVector to;
diff --git 
a/java/vector/src/main/java/org/apache/arrow/vector/complex/NonNullableStructVector.java
 
b/java/vector/src/main/java/org/apache/arrow/vector/complex/NonNullableStructVector.java
index 1ca315a..1d9b871 100644
--- 
a/java/vector/src/main/java/org/apache/arrow/vector/complex/NonNullableStructVector.java
+++ 
b/java/vector/src/main/java/org/apache/arrow/vector/complex/NonNullableStructVector.java
@@ -287,6 +287,56 @@ public class NonNullableStructVector extends 
AbstractStructVector {
   }
 
   @Override
+  public int hashCode(int index) {
+    int hash = 0;
+    for (String child : getChildFieldNames()) {
+      ValueVector v = getChild(child);
+      if (v != null && index < v.getValueCount()) {
+        hash += 31 * hash + v.hashCode(index);
+      }
+    }
+    return hash;
+  }
+
+  @Override
+  public boolean equals(int index, ValueVector to, int toIndex) {
+    if (to == null) {
+      return false;
+    }
+    if (this.getClass() != to.getClass()) {
+      return false;
+    }
+    NonNullableStructVector that = (NonNullableStructVector) to;
+    List<ValueVector> leftChildrens = new ArrayList<>();
+    List<ValueVector> rightChildrens = new ArrayList<>();
+
+    for (String child : getChildFieldNames()) {
+      ValueVector v = getChild(child);
+      if (v != null) {
+        leftChildrens.add(v);
+      }
+    }
+
+    for (String child : that.getChildFieldNames()) {
+      ValueVector v = that.getChild(child);
+      if (v != null) {
+        rightChildrens.add(v);
+      }
+    }
+
+    if (leftChildrens.size() != rightChildrens.size()) {
+      return false;
+    }
+
+    for (int i = 0; i < leftChildrens.size(); i++) {
+      if (!leftChildrens.get(i).equals(index, rightChildrens.get(i), toIndex)) 
{
+        return false;
+      }
+    }
+    return true;
+  }
+
+  @Override
   public boolean isNull(int index) {
     return false;
   }
@@ -372,4 +422,5 @@ public class NonNullableStructVector extends 
AbstractStructVector {
   public List<FieldVector> getChildrenFromFields() {
     return getChildren();
   }
+
 }
diff --git 
a/java/vector/src/main/java/org/apache/arrow/vector/complex/StructVector.java 
b/java/vector/src/main/java/org/apache/arrow/vector/complex/StructVector.java
index a07e0d2..2ad9fb7 100644
--- 
a/java/vector/src/main/java/org/apache/arrow/vector/complex/StructVector.java
+++ 
b/java/vector/src/main/java/org/apache/arrow/vector/complex/StructVector.java
@@ -480,6 +480,15 @@ public class StructVector extends NonNullableStructVector 
implements FieldVector
   }
 
   @Override
+  public int hashCode(int index) {
+    if (isSet(index) == 0) {
+      return 0;
+    } else {
+      return super.hashCode(index);
+    }
+  }
+
+  @Override
   public void get(int index, ComplexHolder holder) {
     holder.isSet = isSet(index);
     if (holder.isSet == 0) {
@@ -546,4 +555,5 @@ public class StructVector extends NonNullableStructVector 
implements FieldVector
     super.setValueCount(valueCount);
     this.valueCount = valueCount;
   }
+
 }
diff --git 
a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java
 
b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java
index 623e4f4..ed69df3 100644
--- 
a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java
+++ 
b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java
@@ -17,7 +17,6 @@
 
 package org.apache.arrow.vector.dictionary;
 
-import org.apache.arrow.vector.BaseBinaryVector;
 import org.apache.arrow.vector.BaseIntVector;
 import org.apache.arrow.vector.FieldVector;
 import org.apache.arrow.vector.ValueVector;
@@ -44,15 +43,11 @@ public class DictionaryEncoder {
    */
   public static ValueVector encode(ValueVector vector, Dictionary dictionary) {
     validateType(vector.getMinorType());
-    // load dictionary values into a hashmap for lookup
-    DictionaryEncodeHashMap<Object> lookUps = new 
DictionaryEncodeHashMap<>(dictionary.getVector().getValueCount());
-
-    boolean binaryType = isBinaryType(vector.getMinorType());
+    // load dictionary indices into a hashmap for lookup
 
+    DictionaryHashTable hashTable = new 
DictionaryHashTable(dictionary.getVector());
     for (int i = 0; i < dictionary.getVector().getValueCount(); i++) {
-      Object key = binaryType ? ((BaseBinaryVector) 
dictionary.getVector()).getByteArrayWrapper(i) :
-          dictionary.getVector().getObject(i);
-      lookUps.put(key, i);
+      hashTable.put(i);
     }
 
     Field valueField = vector.getField();
@@ -73,12 +68,12 @@ public class DictionaryEncoder {
     int count = vector.getValueCount();
 
     for (int i = 0; i < count; i++) {
-      Object value = binaryType ? ((BaseBinaryVector) 
vector).getByteArrayWrapper(i) : vector.getObject(i);
-      if (value != null) { // if it's null leave it null
+      if (!vector.isNull(i)) { // if it's null leave it null
         // note: this may fail if value was not included in the dictionary
-        int encoded = lookUps.get(value);
+        //int encoded = lookUps.get(value);
+        int encoded = hashTable.getIndex(i, vector);
         if (encoded == -1) {
-          throw new IllegalArgumentException("Dictionary encoding not defined 
for value:" + value);
+          throw new IllegalArgumentException("Dictionary encoding not defined 
for value:" + vector.getObject(i));
         }
         indices.setWithPossibleTruncate(i, encoded);
       }
@@ -119,13 +114,6 @@ public class DictionaryEncoder {
     return decoded;
   }
 
-  private static boolean isBinaryType(MinorType type) {
-    if (type == MinorType.VARBINARY || type == MinorType.FIXEDSIZEBINARY) {
-      return true;
-    }
-    return false;
-  }
-
   private static void validateType(MinorType type) {
     if (type == MinorType.UNION) {
       throw new IllegalArgumentException("Dictionary encoding not implemented 
for current type: " + type);
diff --git 
a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryHashTable.java
 
b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryHashTable.java
new file mode 100644
index 0000000..bf0c788
--- /dev/null
+++ 
b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryHashTable.java
@@ -0,0 +1,268 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.vector.dictionary;
+
+import java.util.Objects;
+
+import org.apache.arrow.vector.ValueVector;
+
+/**
+ * HashTable used for Dictionary encoding. It holds two vectors (the vector to 
encode and dictionary vector)
+ * It stores the index in dictionary vector and for a given index in encode 
vector,
+ * it could return dictionary index.
+ */
+public class DictionaryHashTable {
+
+  /**
+   * Represents a null value in map.
+   */
+  static final int NULL_VALUE = -1;
+
+  /**
+   * The default initial capacity - MUST be a power of two.
+   */
+  static final int DEFAULT_INITIAL_CAPACITY = 1 << 4;
+
+  /**
+   * The maximum capacity, used if a higher value is implicitly specified
+   * by either of the constructors with arguments.
+   */
+  static final int MAXIMUM_CAPACITY = 1 << 30;
+
+  /**
+   * The load factor used when none specified in constructor.
+   */
+  static final float DEFAULT_LOAD_FACTOR = 0.75f;
+
+  static final DictionaryHashTable.Entry[] EMPTY_TABLE = {};
+
+  /**
+   * The table, initialized on first use, and resized as
+   * necessary. When allocated, length is always a power of two.
+   */
+  transient DictionaryHashTable.Entry[] table = (DictionaryHashTable.Entry[]) 
EMPTY_TABLE;
+
+  /**
+   * The number of key-value mappings contained in this map.
+   */
+  transient int size;
+
+  /**
+   * The next size value at which to resize (capacity * load factor).
+   */
+  int threshold;
+
+  /**
+   * The load factor for the hash table.
+   */
+  final float loadFactor;
+
+  private final ValueVector dictionary;
+
+  /**
+   * Constructs an empty map with the specified initial capacity and load 
factor.
+   */
+  public DictionaryHashTable(int initialCapacity, ValueVector dictionary) {
+    if (initialCapacity < 0) {
+      throw new IllegalArgumentException("Illegal initial capacity: " +
+          initialCapacity);
+    }
+    if (initialCapacity > MAXIMUM_CAPACITY) {
+      initialCapacity = MAXIMUM_CAPACITY;
+    }
+    this.loadFactor = DEFAULT_LOAD_FACTOR;
+    this.threshold = initialCapacity;
+
+    this.dictionary = dictionary;
+  }
+
+  public DictionaryHashTable(ValueVector dictionary) {
+    this(DEFAULT_INITIAL_CAPACITY, dictionary);
+  }
+
+  /**
+   * Compute the capacity with given threshold and create init table.
+   */
+  private void inflateTable(int threshold) {
+    int capacity = roundUpToPowerOf2(threshold);
+    this.threshold = (int) Math.min(capacity * loadFactor, MAXIMUM_CAPACITY + 
1);
+    table = new DictionaryHashTable.Entry[capacity];
+  }
+
+  /**
+   * Computes the storage location in an array for the given hashCode.
+   */
+  static int indexFor(int h, int length) {
+    return h & (length - 1);
+  }
+
+  /**
+   * Returns a power of two size for the given size.
+   */
+  static final int roundUpToPowerOf2(int size) {
+    int n = size - 1;
+    n |= n >>> 1;
+    n |= n >>> 2;
+    n |= n >>> 4;
+    n |= n >>> 8;
+    n |= n >>> 16;
+    return (n < 0) ? 1 : (n >= MAXIMUM_CAPACITY) ? MAXIMUM_CAPACITY : n + 1;
+  }
+
+  /**
+   * get the corresponding dictionary index with the given index in vector 
which to encode.
+   * @param indexInArray index in vector.
+   * @return dictionary vector index or -1 if no value equals.
+   */
+  public int getIndex(int indexInArray, ValueVector toEncode) {
+    int hash = toEncode.hashCode(indexInArray);
+    int index = indexFor(hash, table.length);
+    for (DictionaryHashTable.Entry e = table[index]; e != null ; e = e.next) {
+      if ((e.hash == hash)) {
+        int dictIndex = e.index;
+        if (dictionary.equals(dictIndex, toEncode, indexInArray)) {
+          return dictIndex;
+        }
+      }
+    }
+    return NULL_VALUE;
+  }
+
+  /**
+   * put the index of dictionary vector to build hash table.
+   */
+  public void put(int indexInDictionary) {
+    if (table == EMPTY_TABLE) {
+      inflateTable(threshold);
+    }
+
+    int hash = dictionary.hashCode(indexInDictionary);
+    int i = indexFor(hash, table.length);
+    for (DictionaryHashTable.Entry e = table[i]; e != null; e = e.next) {
+      if (e.hash == hash && e.index == indexInDictionary) {
+        //already has this index, return
+        return;
+      }
+    }
+
+    addEntry(hash, indexInDictionary, i);
+  }
+
+  /**
+   * Create a new Entry at the specific position of table.
+   */
+  void createEntry(int hash, int index, int bucketIndex) {
+    DictionaryHashTable.Entry e = table[bucketIndex];
+    table[bucketIndex] = new DictionaryHashTable.Entry(hash, index, e);
+    size++;
+  }
+
+  /**
+   * Add Entry at the specified location of the table.
+   */
+  void addEntry(int hash, int index, int bucketIndex) {
+    if ((size >= threshold) && (null != table[bucketIndex])) {
+      resize(2 * table.length);
+      bucketIndex = indexFor(hash, table.length);
+    }
+
+    createEntry(hash, index, bucketIndex);
+  }
+
+  /**
+   * Resize table with given new capacity.
+   */
+  void resize(int newCapacity) {
+    DictionaryHashTable.Entry[] oldTable = table;
+    int oldCapacity = oldTable.length;
+    if (oldCapacity == MAXIMUM_CAPACITY) {
+      threshold = Integer.MAX_VALUE;
+      return;
+    }
+
+    DictionaryHashTable.Entry[] newTable = new 
DictionaryHashTable.Entry[newCapacity];
+    transfer(newTable);
+    table = newTable;
+    threshold = (int)Math.min(newCapacity * loadFactor, MAXIMUM_CAPACITY + 1);
+  }
+
+  /**
+   * Transfer entries into new table from old table.
+   * @param newTable new table
+   */
+  void transfer(DictionaryHashTable.Entry[] newTable) {
+    int newCapacity = newTable.length;
+    for (DictionaryHashTable.Entry e : table) {
+      while (null != e) {
+        DictionaryHashTable.Entry next = e.next;
+        int i = indexFor(e.hash, newCapacity);
+        e.next = newTable[i];
+        newTable[i] = e;
+        e = next;
+      }
+    }
+  }
+
+  /**
+   * Returns the number of mappings in this Map.
+   */
+  public int size() {
+    return size;
+  }
+
+  /**
+   * Removes all elements from this map, leaving it empty.
+   */
+  public void clear() {
+    size = 0;
+    for (int i = 0; i < table.length; i++) {
+      table[i] = null;
+    }
+  }
+
+  /**
+   * Class to keep dictionary index data within hash table.
+   */
+  static class Entry {
+    //dictionary index
+    int index;
+    DictionaryHashTable.Entry next;
+    int hash;
+
+    Entry(int hash, int index, DictionaryHashTable.Entry next) {
+      this.index = index;
+      this.hash = hash;
+      this.next = next;
+    }
+
+    public final int getIndex() {
+      return this.index;
+    }
+
+    public final boolean equals(Object o) {
+      if (!(o instanceof DictionaryEncodeHashMap.Entry)) {
+        return false;
+      }
+      DictionaryHashTable.Entry e = (DictionaryHashTable.Entry) o;
+      if (Objects.equals(index, e.getIndex())) {
+        return true;
+      }
+      return false;
+    }
+  }
+}
diff --git 
a/java/vector/src/main/java/org/apache/arrow/vector/util/ByteFunctionHelpers.java
 
b/java/vector/src/main/java/org/apache/arrow/vector/util/ByteFunctionHelpers.java
index fba32b6..8dbdc49 100644
--- 
a/java/vector/src/main/java/org/apache/arrow/vector/util/ByteFunctionHelpers.java
+++ 
b/java/vector/src/main/java/org/apache/arrow/vector/util/ByteFunctionHelpers.java
@@ -49,6 +49,48 @@ public class ByteFunctionHelpers {
     return memEqual(left.memoryAddress(), lStart, lEnd, right.memoryAddress(), 
rStart, rEnd);
   }
 
+  /**
+   * Compute hashCode with the given {@link ArrowBuf} and start/end index.
+   */
+  public static final int hash(final ArrowBuf buf, int start, int end) {
+    long addr = buf.memoryAddress();
+    int len = end - start;
+    long pos = addr + start;
+
+    int hash = 0;
+
+    while (len > 7) {
+      long value = PlatformDependent.getLong(pos);
+      hash = comebineHash(hash, Long.hashCode(value));
+
+      pos += 8;
+      len -= 8;
+    }
+
+    while (len > 3) {
+      int value = PlatformDependent.getInt(pos);
+      hash = comebineHash(hash, value);
+
+      pos += 4;
+      len -= 4;
+    }
+
+    while (len-- != 0) {
+      byte value = PlatformDependent.getByte(pos);
+      hash = comebineHash(hash, value);
+      pos ++;
+    }
+
+    return hash;
+  }
+
+  /**
+   * Generate a new hashCode with the given current hashCode and new hashCode.
+   */
+  public static int comebineHash(int currentHash, int newHash) {
+    return currentHash * 31 + newHash;
+  }
+
   private static int memEqual(final long laddr, int lStart, int lEnd, final 
long raddr, int rStart,
                                     final int rEnd) {
 
diff --git 
a/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestExtensionType.java
 
b/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestExtensionType.java
index 20d270c..792bd29 100644
--- 
a/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestExtensionType.java
+++ 
b/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestExtensionType.java
@@ -34,6 +34,7 @@ import org.apache.arrow.memory.RootAllocator;
 import org.apache.arrow.vector.ExtensionTypeVector;
 import org.apache.arrow.vector.FieldVector;
 import org.apache.arrow.vector.FixedSizeBinaryVector;
+import org.apache.arrow.vector.ValueVector;
 import org.apache.arrow.vector.VectorSchemaRoot;
 import org.apache.arrow.vector.ipc.ArrowFileReader;
 import org.apache.arrow.vector.ipc.ArrowFileWriter;
@@ -204,6 +205,16 @@ public class TestExtensionType {
       return new UUID(bb.getLong(), bb.getLong());
     }
 
+    @Override
+    public int hashCode(int index) {
+      return getUnderlyingVector().hashCode(index);
+    }
+
+    @Override
+    public boolean equals(int index, ValueVector to, int toIndex) {
+      return getUnderlyingVector().equals(index, to, toIndex);
+    }
+
     public void set(int index, UUID uuid) {
       ByteBuffer bb = ByteBuffer.allocate(16);
       bb.putLong(uuid.getMostSignificantBits());

Reply via email to