Repository: arrow Updated Branches: refs/heads/master 75d1f613c -> 4956e90a7
ARROW-1407: Fix bug where DictionaryEncoder can only encode vector less than 4096 elements Author: Li Jin <ice.xell...@gmail.com> Closes #1024 from icexelloss/dict-bug-ARROW-1407 and squashes the following commits: b64258ce [Li Jin] Minor style change e73ae599 [Li Jin] ARROW-1407: Fix bug where DictionaryEncoder can only encode vector less than 4096 elements Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/4956e90a Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/4956e90a Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/4956e90a Branch: refs/heads/master Commit: 4956e90a7c08fdf5b40b5a71253fafa4aacde434 Parents: 75d1f61 Author: Li Jin <ice.xell...@gmail.com> Authored: Fri Sep 1 16:50:30 2017 -0400 Committer: Wes McKinney <wes.mckin...@twosigma.com> Committed: Fri Sep 1 16:50:30 2017 -0400 ---------------------------------------------------------------------- .../vector/dictionary/DictionaryEncoder.java | 2 +- .../arrow/vector/TestDictionaryVector.java | 48 ++++++++++++++++++++ 2 files changed, 49 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/4956e90a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java ---------------------------------------------------------------------- diff --git a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java index 7e20794..3b7dc4a 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java @@ -68,7 +68,7 @@ public class DictionaryEncoder { Method setter = null; for (Class<?> c : ImmutableList.of(int.class, long.class)) { try { - setter = mutator.getClass().getMethod("set", 
int.class, c); + setter = mutator.getClass().getMethod("setSafe", int.class, c); break; } catch (NoSuchMethodException e) { // ignore http://git-wip-us.apache.org/repos/asf/arrow/blob/4956e90a/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java ---------------------------------------------------------------------- diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java index f2db9ba..f8c16e7 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java @@ -39,6 +39,8 @@ public class TestDictionaryVector { byte[] one = "bar".getBytes(StandardCharsets.UTF_8); byte[] two = "baz".getBytes(StandardCharsets.UTF_8); + byte[][] data = new byte[][] {zero, one, two}; + @Before public void init() { allocator = new DirtyRootAllocator(Long.MAX_VALUE, (byte) 100); @@ -98,4 +100,50 @@ public class TestDictionaryVector { } } } + + @Test + public void testEncodeLargeVector() { + // Create a new value vector + try (final NullableVarCharVector vector = newNullableVarCharVector("foo", allocator); + final NullableVarCharVector dictionaryVector = newNullableVarCharVector("dict", allocator);) { + final NullableVarCharVector.Mutator m = vector.getMutator(); + vector.allocateNew(); + + int count = 10000; + + for (int i = 0; i < 10000; ++i) { + vector.getMutator().setSafe(i, data[i % 3], 0, data[i % 3].length); + } + vector.getMutator().setValueCount(count); + + dictionaryVector.allocateNew(512, 3); + dictionaryVector.getMutator().setSafe(0, zero, 0, zero.length); + dictionaryVector.getMutator().setSafe(1, one, 0, one.length); + dictionaryVector.getMutator().setSafe(2, two, 0, two.length); + dictionaryVector.getMutator().setValueCount(3); + + Dictionary dictionary = new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null)); + + + try 
(final ValueVector encoded = (FieldVector) DictionaryEncoder.encode(vector, dictionary)) { + // verify indices + assertEquals(NullableIntVector.class, encoded.getClass()); + + NullableIntVector.Accessor indexAccessor = ((NullableIntVector) encoded).getAccessor(); + assertEquals(count, indexAccessor.getValueCount()); + for (int i = 0; i < count; ++i) { + assertEquals(i % 3, indexAccessor.get(i)); + } + + // now run through the decoder and verify we get the original back + try (ValueVector decoded = DictionaryEncoder.decode(encoded, dictionary)) { + assertEquals(vector.getClass(), decoded.getClass()); + assertEquals(vector.getAccessor().getValueCount(), decoded.getAccessor().getValueCount()); + for (int i = 0; i < count; ++i) { + assertEquals(vector.getAccessor().getObject(i), decoded.getAccessor().getObject(i)); + } + } + } + } + } }