[ https://issues.apache.org/jira/browse/ARROW-2500?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Bryan Cutler updated ARROW-2500: -------------------------------- Priority: Critical (was: Major) > [Java] IPC Writers/readers are not always setting validity bits correctly > ------------------------------------------------------------------------- > > Key: ARROW-2500 > URL: https://issues.apache.org/jira/browse/ARROW-2500 > Project: Apache Arrow > Issue Type: Bug > Components: Java - Vectors > Affects Versions: 0.8.0, 0.9.0 > Reporter: Emilio Lahr-Vivaz > Priority: Critical > Labels: pull-request-available > Time Spent: 0.5h > Remaining Estimate: 0h > > When writing multiple batches to a Stream/File Writer, the first validity bit > can get garbled between writing and reading. I couldn't pinpoint the exact > issue, but I was able to re-create it with a fairly simple unit test. > in TestArrowStream.java: > {code:java} > @Test > public void testReadWriteMultipleBatches() throws IOException { > ByteArrayOutputStream os = new ByteArrayOutputStream(); > try (IntVector vector = new IntVector("foo", allocator);) { > Schema schema = new > Schema(Collections.singletonList(vector.getField()), null); > try (VectorSchemaRoot root = new VectorSchemaRoot(schema, > Collections.singletonList((FieldVector) vector), vector.getValueCount()); > ArrowStreamWriter writer = new ArrowStreamWriter(root, new > MapDictionaryProvider(), Channels.newChannel(os));) { > writer.start(); > vector.setNull(0); > vector.setSafe(1, 1); > vector.setSafe(2, 2); > vector.setNull(3); > vector.setSafe(4, 1); > vector.setValueCount(5); > root.setRowCount(5); > writer.writeBatch(); > vector.setNull(0); > vector.setSafe(1, 1); > vector.setSafe(2, 2); > vector.setValueCount(3); > root.setRowCount(3); > writer.writeBatch(); > } > } > ByteArrayInputStream in = new ByteArrayInputStream(os.toByteArray()); > try (ArrowStreamReader reader = new ArrowStreamReader(in, allocator);) { > IntVector read = (IntVector) > reader.getVectorSchemaRoot().getFieldVectors().get(0); > reader.loadNextBatch(); > assertEquals(read.getValueCount(), 5); > assertNull(read.getObject(0)); > assertEquals(read.getObject(1), Integer.valueOf(1)); > assertEquals(read.getObject(2), Integer.valueOf(2)); > assertNull(read.getObject(3)); > assertEquals(read.getObject(4), Integer.valueOf(1)); > reader.loadNextBatch(); > assertEquals(read.getValueCount(), 3); > assertNull(read.getObject(0)); > assertEquals(read.getObject(1), Integer.valueOf(1)); > assertEquals(read.getObject(2), Integer.valueOf(2)); > } > } > {code} > in TestArrowFile.java: > {code} > @Test > public void testReadWriteMultipleBatches() throws IOException { > File file = new File("target/mytest_nulls_multibatch.arrow"); > try (IntVector vector = new IntVector("foo", allocator);) { > Schema schema = new > Schema(Collections.singletonList(vector.getField()), null); > try (FileOutputStream fileOutputStream = new FileOutputStream(file); > VectorSchemaRoot root = new VectorSchemaRoot(schema, > Collections.singletonList((FieldVector) vector), vector.getValueCount()); > ArrowFileWriter writer = new ArrowFileWriter(root, new > MapDictionaryProvider(), fileOutputStream.getChannel());) { > writer.start(); > vector.setNull(0); > vector.setSafe(1, 1); > vector.setSafe(2, 2); > vector.setNull(3); > vector.setSafe(4, 1); > vector.setValueCount(5); > root.setRowCount(5); > writer.writeBatch(); > vector.setNull(0); > vector.setSafe(1, 1); > vector.setSafe(2, 2); > vector.setValueCount(3); > root.setRowCount(3); > writer.writeBatch(); > } > } > try (FileInputStream fileInputStream = new FileInputStream(file); > ArrowFileReader reader = new > ArrowFileReader(fileInputStream.getChannel(), allocator);) { > IntVector read = (IntVector) > reader.getVectorSchemaRoot().getFieldVectors().get(0); > reader.loadNextBatch(); > assertEquals(read.getValueCount(), 5); > assertNull(read.getObject(0)); > assertEquals(read.getObject(1), Integer.valueOf(1)); > assertEquals(read.getObject(2), Integer.valueOf(2)); > assertNull(read.getObject(3)); > assertEquals(read.getObject(4), Integer.valueOf(1)); > reader.loadNextBatch(); > assertEquals(read.getValueCount(), 3); > assertNull(read.getObject(0)); > assertEquals(read.getObject(1), Integer.valueOf(1)); > assertEquals(read.getObject(2), Integer.valueOf(2)); > } > } > {code} -- This message was sent by Atlassian JIRA (v7.6.3#76005)