This is an automated email from the ASF dual-hosted git repository. emkornfield pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push: new 2ee55bc ARROW-5997: [Java] Support dictionary encoding for Union type 2ee55bc is described below commit 2ee55bcaca98def0709189fc05674f1f23379d6c Author: tianchen <niki...@alibaba-inc.com> AuthorDate: Tue Jul 23 20:40:26 2019 -0700 ARROW-5997: [Java] Support dictionary encoding for Union type Related to [ARROW-5997](https://issues.apache.org/jira/browse/ARROW-5997). Currently, Union is the only type not supported by dictionary encoding. Over the last several weeks we refactored the encoding code, so now it's time to support the Union type. Author: tianchen <niki...@alibaba-inc.com> Closes #4917 from tianchen92/ARROW-5997 and squashes the following commits: 577b73ce5 <tianchen> fix e8a58896f <tianchen> ARROW-5997: Support dictionary encoding for Union type --- .../arrow/vector/dictionary/DictionaryEncoder.java | 11 +-- .../apache/arrow/vector/TestDictionaryVector.java | 78 ++++++++++++++++++++++ 2 files changed, 79 insertions(+), 10 deletions(-) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java index 9b16bb1..accf2f9 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java @@ -20,7 +20,6 @@ package org.apache.arrow.vector.dictionary; import org.apache.arrow.vector.BaseIntVector; import org.apache.arrow.vector.FieldVector; import org.apache.arrow.vector.ValueVector; -import org.apache.arrow.vector.types.Types.MinorType; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.FieldType; import org.apache.arrow.vector.util.TransferPair; @@ -42,9 +41,7 @@ public class DictionaryEncoder { * @return dictionary encoded vector */ public static ValueVector encode(ValueVector vector, Dictionary dictionary) { - 
validateType(vector.getMinorType()); - // load dictionary indices into a hashmap for lookup - + // load dictionary indices into a hash table for lookup DictionaryHashTable hashTable = new DictionaryHashTable(dictionary.getVector()); for (int i = 0; i < dictionary.getVector().getValueCount(); i++) { hashTable.put(i); @@ -114,10 +111,4 @@ public class DictionaryEncoder { decoded.setValueCount(count); return decoded; } - - private static void validateType(MinorType type) { - if (type == MinorType.UNION) { - throw new IllegalArgumentException("Dictionary encoding not implemented for current type: " + type); - } - } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java index 0d2bce9..e0bd218 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java @@ -28,10 +28,14 @@ import java.util.Arrays; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.complex.StructVector; +import org.apache.arrow.vector.complex.UnionVector; import org.apache.arrow.vector.complex.impl.NullableStructWriter; import org.apache.arrow.vector.complex.impl.UnionListWriter; import org.apache.arrow.vector.dictionary.Dictionary; import org.apache.arrow.vector.dictionary.DictionaryEncoder; +import org.apache.arrow.vector.holders.NullableIntHolder; +import org.apache.arrow.vector.holders.NullableUInt4Holder; +import org.apache.arrow.vector.types.Types; import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.DictionaryEncoding; import org.apache.arrow.vector.types.pojo.FieldType; @@ -328,4 +332,78 @@ public class TestDictionaryVector { } } } + + @Test + public void testEncodeUnion() { + // Create a new value vector + try (final UnionVector vector = new 
UnionVector("vector", allocator, null); + final UnionVector dictionaryVector = new UnionVector("dict", allocator, null);) { + + final NullableUInt4Holder uintHolder1 = new NullableUInt4Holder(); + uintHolder1.value = 10; + uintHolder1.isSet = 1; + + final NullableIntHolder intHolder1 = new NullableIntHolder(); + intHolder1.value = 10; + intHolder1.isSet = 1; + + final NullableIntHolder intHolder2 = new NullableIntHolder(); + intHolder2.value = 20; + intHolder2.isSet = 1; + + //write data + vector.setType(0, Types.MinorType.UINT4); + vector.setSafe(0, uintHolder1); + + vector.setType(1, Types.MinorType.INT); + vector.setSafe(1, intHolder1); + + vector.setType(2, Types.MinorType.INT); + vector.setSafe(2, intHolder1); + + vector.setType(3, Types.MinorType.INT); + vector.setSafe(3, intHolder2); + + vector.setType(4, Types.MinorType.INT); + vector.setSafe(4, intHolder2); + + vector.setValueCount(5); + + //write dictionary + dictionaryVector.setType(0, Types.MinorType.UINT4); + dictionaryVector.setSafe(0, uintHolder1); + + dictionaryVector.setType(1, Types.MinorType.INT); + dictionaryVector.setSafe(1, intHolder1); + + dictionaryVector.setType(2, Types.MinorType.INT); + dictionaryVector.setSafe(2, intHolder2); + + dictionaryVector.setValueCount(3); + + Dictionary dictionary = new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null)); + + try (final ValueVector encoded = DictionaryEncoder.encode(vector, dictionary)) { + // verify indices + assertEquals(IntVector.class, encoded.getClass()); + + IntVector index = ((IntVector)encoded); + assertEquals(5, index.getValueCount()); + assertEquals(0, index.get(0)); + assertEquals(1, index.get(1)); + assertEquals(1, index.get(2)); + assertEquals(2, index.get(3)); + assertEquals(2, index.get(4)); + + // now run through the decoder and verify we get the original back + try (ValueVector decoded = DictionaryEncoder.decode(encoded, dictionary)) { + assertEquals(vector.getClass(), decoded.getClass()); + 
assertEquals(vector.getValueCount(), decoded.getValueCount()); + for (int i = 0; i < 5; i++) { + assertEquals(vector.getObject(i), decoded.getObject(i)); + } + } + } + } + } }