[
https://issues.apache.org/jira/browse/ARROW-2500?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16449781#comment-16449781
]
Emilio Lahr-Vivaz commented on ARROW-2500:
------------------------------------------
Note: this didn't seem to occur in 0.6.
> [Java] IPC Writers/readers are not always setting validity bits correctly
> -------------------------------------------------------------------------
>
> Key: ARROW-2500
> URL: https://issues.apache.org/jira/browse/ARROW-2500
> Project: Apache Arrow
> Issue Type: Bug
> Components: Java - Vectors
> Affects Versions: 0.8.0, 0.9.0
> Reporter: Emilio Lahr-Vivaz
> Priority: Major
>
> When writing multiple batches to a Stream/File Writer, the first validity bit
> can get garbled between writing and reading. I couldn't pinpoint the exact
> issue, but I was able to re-create it with a fairly simple unit test.
> In TestArrowStream.java:
> {code:java}
> @Test
> public void testReadWriteMultipleBatches() throws IOException {
> ByteArrayOutputStream os = new ByteArrayOutputStream();
> try (IntVector vector = new IntVector("foo", allocator);) {
> Schema schema = new
> Schema(Collections.singletonList(vector.getField()), null);
> try (VectorSchemaRoot root = new VectorSchemaRoot(schema,
> Collections.singletonList((FieldVector) vector), vector.getValueCount());
> ArrowStreamWriter writer = new ArrowStreamWriter(root, new
> MapDictionaryProvider(), Channels.newChannel(os));) {
> writer.start();
> vector.setNull(0);
> vector.setSafe(1, 1);
> vector.setSafe(2, 2);
> vector.setNull(3);
> vector.setSafe(4, 1);
> vector.setValueCount(5);
> root.setRowCount(5);
> writer.writeBatch();
> vector.setNull(0);
> vector.setSafe(1, 1);
> vector.setSafe(2, 2);
> vector.setValueCount(3);
> root.setRowCount(3);
> writer.writeBatch();
> }
> }
> ByteArrayInputStream in = new ByteArrayInputStream(os.toByteArray());
> try (ArrowStreamReader reader = new ArrowStreamReader(in, allocator);) {
> IntVector read = (IntVector)
> reader.getVectorSchemaRoot().getFieldVectors().get(0);
> reader.loadNextBatch();
> assertEquals(read.getValueCount(), 5);
> assertNull(read.getObject(0));
> assertEquals(read.getObject(1), Integer.valueOf(1));
> assertEquals(read.getObject(2), Integer.valueOf(2));
> assertNull(read.getObject(3));
> assertEquals(read.getObject(4), Integer.valueOf(1));
> reader.loadNextBatch();
> assertEquals(read.getValueCount(), 3);
> assertNull(read.getObject(0));
> assertEquals(read.getObject(1), Integer.valueOf(1));
> assertEquals(read.getObject(2), Integer.valueOf(2));
> }
> }
> {code}
> In TestArrowFile.java:
> {code:java}
> @Test
> public void testReadWriteMultipleBatches() throws IOException {
> File file = new File("target/mytest_nulls_multibatch.arrow");
> try (IntVector vector = new IntVector("foo", allocator);) {
> Schema schema = new
> Schema(Collections.singletonList(vector.getField()), null);
> try (FileOutputStream fileOutputStream = new FileOutputStream(file);
> VectorSchemaRoot root = new VectorSchemaRoot(schema,
> Collections.singletonList((FieldVector) vector), vector.getValueCount());
> ArrowFileWriter writer = new ArrowFileWriter(root, new
> MapDictionaryProvider(), fileOutputStream.getChannel());) {
> writer.start();
> vector.setNull(0);
> vector.setSafe(1, 1);
> vector.setSafe(2, 2);
> vector.setNull(3);
> vector.setSafe(4, 1);
> vector.setValueCount(5);
> root.setRowCount(5);
> writer.writeBatch();
> vector.setNull(0);
> vector.setSafe(1, 1);
> vector.setSafe(2, 2);
> vector.setValueCount(3);
> root.setRowCount(3);
> writer.writeBatch();
> }
> }
> try (FileInputStream fileInputStream = new FileInputStream(file);
> ArrowFileReader reader = new
> ArrowFileReader(fileInputStream.getChannel(), allocator);) {
> IntVector read = (IntVector)
> reader.getVectorSchemaRoot().getFieldVectors().get(0);
> reader.loadNextBatch();
> assertEquals(read.getValueCount(), 5);
> assertNull(read.getObject(0));
> assertEquals(read.getObject(1), Integer.valueOf(1));
> assertEquals(read.getObject(2), Integer.valueOf(2));
> assertNull(read.getObject(3));
> assertEquals(read.getObject(4), Integer.valueOf(1));
> reader.loadNextBatch();
> assertEquals(read.getValueCount(), 3);
> assertNull(read.getObject(0));
> assertEquals(read.getObject(1), Integer.valueOf(1));
> assertEquals(read.getObject(2), Integer.valueOf(2));
> }
> }
> {code}
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)