lidavidm commented on a change in pull request #136: URL: https://github.com/apache/arrow-cookbook/pull/136#discussion_r809044110
########## File path: java/source/io.rst ########## @@ -0,0 +1,646 @@ +.. _arrow-io: + +======================== +Reading and writing data +======================== + +The `Arrow IPC format <https://arrow.apache.org/docs/java/ipc.html>`_ defines two types of binary formats +for serializing Arrow data: the streaming format and the file format (or random access format). Such files can +be directly memory-mapped when read. + +.. contents:: + +Writing +======= + +Both writing file and streaming formats use the same API. + +Writing Random Access Files +*************************** + +Write - Out to File +------------------- + +.. testcode:: + + import org.apache.arrow.memory.RootAllocator; + import org.apache.arrow.vector.VarCharVector; + import org.apache.arrow.vector.IntVector; + import org.apache.arrow.vector.types.pojo.Field; + import org.apache.arrow.vector.types.pojo.FieldType; + import org.apache.arrow.vector.types.pojo.ArrowType; + import org.apache.arrow.vector.types.pojo.Schema; + import org.apache.arrow.vector.VectorSchemaRoot; + import static java.util.Arrays.asList; + import org.apache.arrow.vector.ipc.ArrowFileWriter; + + import java.io.File; + import java.io.FileNotFoundException; + import java.io.FileOutputStream; + import java.io.IOException; + + try (RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)) { + Field name = new Field("name", FieldType.nullable(new ArrowType.Utf8()), null); + Field age = new Field("age", FieldType.nullable(new ArrowType.Int(32, true)), null); + Schema schemaPerson = new Schema(asList(name, age)); + try(VectorSchemaRoot vectorSchemaRoot = VectorSchemaRoot.create(schemaPerson, rootAllocator)){ + VarCharVector nameVector = (VarCharVector) vectorSchemaRoot.getVector("name"); + nameVector.allocateNew(3); + nameVector.set(0, "David".getBytes()); + nameVector.set(1, "Gladis".getBytes()); + nameVector.set(2, "Juan".getBytes()); + IntVector ageVector = (IntVector) vectorSchemaRoot.getVector("age"); + 
ageVector.allocateNew(3); + ageVector.set(0, 10); + ageVector.set(1, 20); + ageVector.set(2, 30); + vectorSchemaRoot.setRowCount(3); + File file = new File("randon_access_to_file.arrow"); + try (FileOutputStream fileOutputStream = new FileOutputStream(file); + ArrowFileWriter writer = new ArrowFileWriter(vectorSchemaRoot, null, fileOutputStream.getChannel()) + ) { + writer.start(); + for (int i = 0; i < 10; i++) { + // Generate data or modify the root or use a VectorLoader to get fresh data from somewhere else + writer.writeBatch(); + } + writer.end(); + System.out.println("Record batches written: " + writer.getRecordBlocks().size()); + } catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + +.. testoutput:: + + Record batches written: 10 + +Write - Out to Buffer +--------------------- + +.. testcode:: + + import org.apache.arrow.memory.RootAllocator; + import org.apache.arrow.vector.VarCharVector; + import org.apache.arrow.vector.IntVector; + import org.apache.arrow.vector.types.pojo.Field; + import org.apache.arrow.vector.types.pojo.FieldType; + import org.apache.arrow.vector.types.pojo.ArrowType; + import org.apache.arrow.vector.types.pojo.Schema; + import org.apache.arrow.vector.VectorSchemaRoot; + import static java.util.Arrays.asList; + import org.apache.arrow.vector.ipc.ArrowFileWriter; + + import java.io.ByteArrayOutputStream; + import java.io.FileNotFoundException; + import java.io.IOException; + import java.nio.channels.Channels; + + try (RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)) { + Field name = new Field("name", FieldType.nullable(new ArrowType.Utf8()), null); + Field age = new Field("age", FieldType.nullable(new ArrowType.Int(32, true)), null); + Schema schemaPerson = new Schema(asList(name, age)); + try(VectorSchemaRoot vectorSchemaRoot = VectorSchemaRoot.create(schemaPerson, rootAllocator)){ + VarCharVector nameVector = (VarCharVector) 
vectorSchemaRoot.getVector("name"); + nameVector.allocateNew(3); + nameVector.set(0, "David".getBytes()); + nameVector.set(1, "Gladis".getBytes()); + nameVector.set(2, "Juan".getBytes()); + IntVector ageVector = (IntVector) vectorSchemaRoot.getVector("age"); + ageVector.allocateNew(3); + ageVector.set(0, 10); + ageVector.set(1, 20); + ageVector.set(2, 30); + vectorSchemaRoot.setRowCount(3); + try (ByteArrayOutputStream out = new ByteArrayOutputStream(); + ArrowFileWriter writer = new ArrowFileWriter(vectorSchemaRoot, null, Channels.newChannel(out))) + { + writer.start(); + for (int i=0; i<10; i++){ + // Generate data or modify the root or use a VectorLoader to get fresh data from somewhere else + writer.writeBatch(); + } + System.out.println("Record batches written: " + writer.getRecordBlocks().size()); + } catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + +.. testoutput:: + + Record batches written: 10 + +Writing Streaming Format +************************ + +Write - Out to File +------------------- + +.. 
testcode:: + + import org.apache.arrow.memory.RootAllocator; + import org.apache.arrow.vector.VarCharVector; + import org.apache.arrow.vector.IntVector; + import org.apache.arrow.vector.types.pojo.Field; + import org.apache.arrow.vector.types.pojo.FieldType; + import org.apache.arrow.vector.types.pojo.ArrowType; + import org.apache.arrow.vector.types.pojo.Schema; + import org.apache.arrow.vector.VectorSchemaRoot; + import static java.util.Arrays.asList; + import org.apache.arrow.vector.ipc.ArrowStreamWriter; + import java.io.File; + import java.io.FileNotFoundException; + import java.io.FileOutputStream; + import java.io.IOException; + + try (RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)) { + // Create and populate data: + Field name = new Field("name", FieldType.nullable(new ArrowType.Utf8()), null); + Field age = new Field("age", FieldType.nullable(new ArrowType.Int(32, true)), null); + Schema schemaPerson = new Schema(asList(name, age)); + try(VectorSchemaRoot vectorSchemaRoot = VectorSchemaRoot.create(schemaPerson, rootAllocator)){ + VarCharVector nameVector = (VarCharVector) vectorSchemaRoot.getVector("name"); + nameVector.allocateNew(3); + nameVector.set(0, "David".getBytes()); + nameVector.set(1, "Gladis".getBytes()); + nameVector.set(2, "Juan".getBytes()); + IntVector ageVector = (IntVector) vectorSchemaRoot.getVector("age"); + ageVector.allocateNew(3); + ageVector.set(0, 10); + ageVector.set(1, 20); + ageVector.set(2, 30); + vectorSchemaRoot.setRowCount(3); + File file = new File("streaming_to_file.arrow"); + try (FileOutputStream fileOutputStream = new FileOutputStream(file); + ArrowStreamWriter writer = new ArrowStreamWriter(vectorSchemaRoot, null, fileOutputStream.getChannel()) + ){ + writer.start(); + for (int i=0; i<10; i++){ + // Generate data or modify the root or use a VectorLoader to get fresh data from somewhere else + writer.writeBatch(); + } + System.out.println(writer.bytesWritten()); + } catch (FileNotFoundException e) { + 
e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + +.. testoutput:: + + 2928 + +Write - Out to Buffer +--------------------- + +.. testcode:: + + import org.apache.arrow.memory.RootAllocator; + import org.apache.arrow.vector.VarCharVector; + import org.apache.arrow.vector.IntVector; + import org.apache.arrow.vector.ipc.ArrowStreamWriter; + import org.apache.arrow.vector.types.pojo.Field; + import org.apache.arrow.vector.types.pojo.FieldType; + import org.apache.arrow.vector.types.pojo.ArrowType; + import org.apache.arrow.vector.types.pojo.Schema; + import org.apache.arrow.vector.VectorSchemaRoot; + import static java.util.Arrays.asList; + + import java.io.ByteArrayOutputStream; + import java.io.FileNotFoundException; + import java.io.IOException; + import java.nio.channels.Channels; + + try (RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)) { + // Create and populate data: + Field name = new Field("name", FieldType.nullable(new ArrowType.Utf8()), null); + Field age = new Field("age", FieldType.nullable(new ArrowType.Int(32, true)), null); + Schema schemaPerson = new Schema(asList(name, age)); + try(VectorSchemaRoot vectorSchemaRoot = VectorSchemaRoot.create(schemaPerson, rootAllocator)){ + VarCharVector nameVector = (VarCharVector) vectorSchemaRoot.getVector("name"); + nameVector.allocateNew(3); + nameVector.set(0, "David".getBytes()); + nameVector.set(1, "Gladis".getBytes()); + nameVector.set(2, "Juan".getBytes()); + IntVector ageVector = (IntVector) vectorSchemaRoot.getVector("age"); + ageVector.allocateNew(3); + ageVector.set(0, 10); + ageVector.set(1, 20); + ageVector.set(2, 30); + vectorSchemaRoot.setRowCount(3); + try (ByteArrayOutputStream out = new ByteArrayOutputStream(); + ArrowStreamWriter writer = new ArrowStreamWriter(vectorSchemaRoot, null, Channels.newChannel(out)) + ){ + writer.start(); + for (int i=0; i<10; i++){ + // Generate data or modify the root or use a VectorLoader to get fresh data from 
somewhere else + writer.writeBatch(); + } + System.out.println(writer.bytesWritten()); + } catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + +.. testoutput:: + + 2928 + +Reading +======= + +Reading the random access format and streaming format both offer the same API, +with the difference that random access files also offer access to any record batch by index. + +Reading Random Access Files +*************************** + +Read - From File +---------------- + +.. testcode:: + + import org.apache.arrow.memory.RootAllocator; + import org.apache.arrow.vector.ipc.ArrowFileReader; + import org.apache.arrow.vector.ipc.message.ArrowBlock; + import org.apache.arrow.vector.VectorSchemaRoot; + import java.io.File; + import java.io.FileInputStream; + import java.io.FileNotFoundException; + import java.io.FileOutputStream; + import java.io.IOException; + + try(RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)){ + File file = new File("./thirdpartydeps/arrowfiles/data1.arrow"); + try (FileInputStream fileInputStream = new FileInputStream(file); + ArrowFileReader reader = new ArrowFileReader(fileInputStream.getChannel(), rootAllocator) + ){ + System.out.println("Record batches readed: " + reader.getRecordBlocks().size()); + for (ArrowBlock arrowBlock : reader.getRecordBlocks()) { + reader.loadRecordBatch(arrowBlock); + VectorSchemaRoot vectorSchemaRootRecover = reader.getVectorSchemaRoot(); + System.out.print(vectorSchemaRootRecover.contentToTSVString()); + } + } catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + } + +.. testoutput:: + + Record batches readed: 3 + name age + David 10 + Gladis 20 + Juan 30 + name age + Nidia 15 + Alexa 20 + Mara 15 + name age + Raul 34 + Jhon 29 + Thomy 33 + +Read - From Buffer +------------------ + +.. 
testcode:: + + import org.apache.arrow.memory.RootAllocator; + import org.apache.arrow.vector.ipc.ArrowFileReader; + import org.apache.arrow.vector.ipc.SeekableReadChannel; + import org.apache.arrow.vector.ipc.message.ArrowBlock; + import org.apache.arrow.vector.VectorSchemaRoot; + import org.apache.arrow.vector.util.ByteArrayReadableSeekableByteChannel; + + import java.io.IOException; + import java.nio.file.Files; + import java.nio.file.Path; + import java.nio.file.Paths; + + try(RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)) { + Path path = Paths.get("./thirdpartydeps/arrowfiles/data1.arrow"); + try (ArrowFileReader reader = new ArrowFileReader(new SeekableReadChannel(new ByteArrayReadableSeekableByteChannel(Files.readAllBytes(path))), rootAllocator)){ + System.out.println("Record batches readed: " + reader.getRecordBlocks().size()); + for (ArrowBlock arrowBlock : reader.getRecordBlocks()) { + reader.loadRecordBatch(arrowBlock); + VectorSchemaRoot vectorSchemaRootRecover = reader.getVectorSchemaRoot(); + System.out.print(vectorSchemaRootRecover.contentToTSVString()); + } + } catch (IOException e) { + e.printStackTrace(); + } + } + +.. testoutput:: + + Record batches readed: 3 + name age + David 10 + Gladis 20 + Juan 30 + name age + Nidia 15 + Alexa 20 + Mara 15 + name age + Raul 34 + Jhon 29 + Thomy 33 + +Reading Streaming Format +************************ + +Read - From File +---------------- + +.. 
testcode:: + + import org.apache.arrow.memory.RootAllocator; + import org.apache.arrow.vector.ipc.ArrowStreamReader; + import org.apache.arrow.vector.VectorSchemaRoot; + import java.io.File; + import java.io.FileInputStream; + import java.io.IOException; + + try(RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)) { + File file = new File("./thirdpartydeps/arrowfiles/data2.arrow"); + try (FileInputStream fileInputStreamForStream = new FileInputStream(file); + ArrowStreamReader reader = new ArrowStreamReader(fileInputStreamForStream, rootAllocator)) { + while (reader.loadNextBatch()) { + VectorSchemaRoot vectorSchemaRootRecover = reader.getVectorSchemaRoot(); + System.out.print(vectorSchemaRootRecover.contentToTSVString()); + } + } catch (IOException e) { + e.printStackTrace(); + } + } + +.. testoutput:: + + name age + David 10 + Gladis 20 + Juan 30 + name age + Nidia 15 + Alexa 20 + Mara 15 + name age + Raul 34 + Jhon 29 + Thomy 33 + +Read - From Buffer +------------------ + +.. testcode:: + + import org.apache.arrow.memory.RootAllocator; + import org.apache.arrow.vector.ipc.ArrowStreamReader; + + import java.io.ByteArrayInputStream; + import java.io.IOException; + import java.nio.file.Files; + import java.nio.file.Path; + import java.nio.file.Paths; + + try(RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)) { + Path path = Paths.get("./thirdpartydeps/arrowfiles/data2.arrow"); + try (ArrowStreamReader reader = new ArrowStreamReader(new ByteArrayInputStream(Files.readAllBytes(path)), rootAllocator)){ + while(reader.loadNextBatch()){ + System.out.print(reader.getVectorSchemaRoot().contentToTSVString()); + } + } catch (IOException e) { + e.printStackTrace(); + } + } + +.. 
testoutput:: + + name age + David 10 + Gladis 20 + Juan 30 + name age + Nidia 15 + Alexa 20 + Mara 15 + name age + Raul 34 + Jhon 29 + Thomy 33 + +Reading Parquet File +******************** + +Please check :ref:`arrow-dataset` + +Writing Fresh Data Review comment: The last example we could keep, but we should create the writer after the reader and use the reader's schema to create the writer's root. ########## File path: java/source/io.rst ########## @@ -0,0 +1,646 @@ +.. _arrow-io: + +======================== +Reading and writing data +======================== + +The `Arrow IPC format <https://arrow.apache.org/docs/java/ipc.html>`_ defines two types of binary formats +for serializing Arrow data: the streaming format and the file format (or random access format). Such files can +be directly memory-mapped when read. + +.. contents:: + +Writing +======= + +Both writing file and streaming formats use the same API. + +Writing Random Access Files +*************************** + +Write - Out to File +------------------- + +.. 
testcode:: + + import org.apache.arrow.memory.RootAllocator; + import org.apache.arrow.vector.VarCharVector; + import org.apache.arrow.vector.IntVector; + import org.apache.arrow.vector.types.pojo.Field; + import org.apache.arrow.vector.types.pojo.FieldType; + import org.apache.arrow.vector.types.pojo.ArrowType; + import org.apache.arrow.vector.types.pojo.Schema; + import org.apache.arrow.vector.VectorSchemaRoot; + import static java.util.Arrays.asList; + import org.apache.arrow.vector.ipc.ArrowFileWriter; + + import java.io.File; + import java.io.FileNotFoundException; + import java.io.FileOutputStream; + import java.io.IOException; + + try (RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)) { + Field name = new Field("name", FieldType.nullable(new ArrowType.Utf8()), null); + Field age = new Field("age", FieldType.nullable(new ArrowType.Int(32, true)), null); + Schema schemaPerson = new Schema(asList(name, age)); + try(VectorSchemaRoot vectorSchemaRoot = VectorSchemaRoot.create(schemaPerson, rootAllocator)){ + VarCharVector nameVector = (VarCharVector) vectorSchemaRoot.getVector("name"); + nameVector.allocateNew(3); + nameVector.set(0, "David".getBytes()); + nameVector.set(1, "Gladis".getBytes()); + nameVector.set(2, "Juan".getBytes()); + IntVector ageVector = (IntVector) vectorSchemaRoot.getVector("age"); + ageVector.allocateNew(3); + ageVector.set(0, 10); + ageVector.set(1, 20); + ageVector.set(2, 30); + vectorSchemaRoot.setRowCount(3); + File file = new File("randon_access_to_file.arrow"); + try (FileOutputStream fileOutputStream = new FileOutputStream(file); + ArrowFileWriter writer = new ArrowFileWriter(vectorSchemaRoot, null, fileOutputStream.getChannel()) + ) { + writer.start(); + for (int i = 0; i < 10; i++) { + // Generate data or modify the root or use a VectorLoader to get fresh data from somewhere else + writer.writeBatch(); + } + writer.end(); + System.out.println("Record batches written: " + writer.getRecordBlocks().size()); + } catch 
(FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + +.. testoutput:: + + Record batches written: 10 + +Write - Out to Buffer +--------------------- + +.. testcode:: + + import org.apache.arrow.memory.RootAllocator; + import org.apache.arrow.vector.VarCharVector; + import org.apache.arrow.vector.IntVector; + import org.apache.arrow.vector.types.pojo.Field; + import org.apache.arrow.vector.types.pojo.FieldType; + import org.apache.arrow.vector.types.pojo.ArrowType; + import org.apache.arrow.vector.types.pojo.Schema; + import org.apache.arrow.vector.VectorSchemaRoot; + import static java.util.Arrays.asList; + import org.apache.arrow.vector.ipc.ArrowFileWriter; + + import java.io.ByteArrayOutputStream; + import java.io.FileNotFoundException; + import java.io.IOException; + import java.nio.channels.Channels; + + try (RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)) { + Field name = new Field("name", FieldType.nullable(new ArrowType.Utf8()), null); + Field age = new Field("age", FieldType.nullable(new ArrowType.Int(32, true)), null); + Schema schemaPerson = new Schema(asList(name, age)); + try(VectorSchemaRoot vectorSchemaRoot = VectorSchemaRoot.create(schemaPerson, rootAllocator)){ + VarCharVector nameVector = (VarCharVector) vectorSchemaRoot.getVector("name"); + nameVector.allocateNew(3); + nameVector.set(0, "David".getBytes()); + nameVector.set(1, "Gladis".getBytes()); + nameVector.set(2, "Juan".getBytes()); + IntVector ageVector = (IntVector) vectorSchemaRoot.getVector("age"); + ageVector.allocateNew(3); + ageVector.set(0, 10); + ageVector.set(1, 20); + ageVector.set(2, 30); + vectorSchemaRoot.setRowCount(3); + try (ByteArrayOutputStream out = new ByteArrayOutputStream(); + ArrowFileWriter writer = new ArrowFileWriter(vectorSchemaRoot, null, Channels.newChannel(out))) + { + writer.start(); + for (int i=0; i<10; i++){ + // Generate data or modify the root or use a VectorLoader to get fresh 
data from somewhere else + writer.writeBatch(); + } + System.out.println("Record batches written: " + writer.getRecordBlocks().size()); + } catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + +.. testoutput:: + + Record batches written: 10 + +Writing Streaming Format +************************ + +Write - Out to File +------------------- + +.. testcode:: + + import org.apache.arrow.memory.RootAllocator; + import org.apache.arrow.vector.VarCharVector; + import org.apache.arrow.vector.IntVector; + import org.apache.arrow.vector.types.pojo.Field; + import org.apache.arrow.vector.types.pojo.FieldType; + import org.apache.arrow.vector.types.pojo.ArrowType; + import org.apache.arrow.vector.types.pojo.Schema; + import org.apache.arrow.vector.VectorSchemaRoot; + import static java.util.Arrays.asList; + import org.apache.arrow.vector.ipc.ArrowStreamWriter; + import java.io.File; + import java.io.FileNotFoundException; + import java.io.FileOutputStream; + import java.io.IOException; + + try (RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)) { + // Create and populate data: + Field name = new Field("name", FieldType.nullable(new ArrowType.Utf8()), null); + Field age = new Field("age", FieldType.nullable(new ArrowType.Int(32, true)), null); + Schema schemaPerson = new Schema(asList(name, age)); + try(VectorSchemaRoot vectorSchemaRoot = VectorSchemaRoot.create(schemaPerson, rootAllocator)){ + VarCharVector nameVector = (VarCharVector) vectorSchemaRoot.getVector("name"); + nameVector.allocateNew(3); + nameVector.set(0, "David".getBytes()); + nameVector.set(1, "Gladis".getBytes()); + nameVector.set(2, "Juan".getBytes()); + IntVector ageVector = (IntVector) vectorSchemaRoot.getVector("age"); + ageVector.allocateNew(3); + ageVector.set(0, 10); + ageVector.set(1, 20); + ageVector.set(2, 30); + vectorSchemaRoot.setRowCount(3); + File file = new File("streaming_to_file.arrow"); + try (FileOutputStream 
fileOutputStream = new FileOutputStream(file); + ArrowStreamWriter writer = new ArrowStreamWriter(vectorSchemaRoot, null, fileOutputStream.getChannel()) + ){ + writer.start(); + for (int i=0; i<10; i++){ + // Generate data or modify the root or use a VectorLoader to get fresh data from somewhere else + writer.writeBatch(); + } + System.out.println(writer.bytesWritten()); + } catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + +.. testoutput:: + + 2928 + +Write - Out to Buffer +--------------------- + +.. testcode:: + + import org.apache.arrow.memory.RootAllocator; + import org.apache.arrow.vector.VarCharVector; + import org.apache.arrow.vector.IntVector; + import org.apache.arrow.vector.ipc.ArrowStreamWriter; + import org.apache.arrow.vector.types.pojo.Field; + import org.apache.arrow.vector.types.pojo.FieldType; + import org.apache.arrow.vector.types.pojo.ArrowType; + import org.apache.arrow.vector.types.pojo.Schema; + import org.apache.arrow.vector.VectorSchemaRoot; + import static java.util.Arrays.asList; + + import java.io.ByteArrayOutputStream; + import java.io.FileNotFoundException; + import java.io.IOException; + import java.nio.channels.Channels; + + try (RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)) { + // Create and populate data: + Field name = new Field("name", FieldType.nullable(new ArrowType.Utf8()), null); + Field age = new Field("age", FieldType.nullable(new ArrowType.Int(32, true)), null); + Schema schemaPerson = new Schema(asList(name, age)); + try(VectorSchemaRoot vectorSchemaRoot = VectorSchemaRoot.create(schemaPerson, rootAllocator)){ + VarCharVector nameVector = (VarCharVector) vectorSchemaRoot.getVector("name"); + nameVector.allocateNew(3); + nameVector.set(0, "David".getBytes()); + nameVector.set(1, "Gladis".getBytes()); + nameVector.set(2, "Juan".getBytes()); + IntVector ageVector = (IntVector) vectorSchemaRoot.getVector("age"); + 
ageVector.allocateNew(3); + ageVector.set(0, 10); + ageVector.set(1, 20); + ageVector.set(2, 30); + vectorSchemaRoot.setRowCount(3); + try (ByteArrayOutputStream out = new ByteArrayOutputStream(); + ArrowStreamWriter writer = new ArrowStreamWriter(vectorSchemaRoot, null, Channels.newChannel(out)) + ){ + writer.start(); + for (int i=0; i<10; i++){ + // Generate data or modify the root or use a VectorLoader to get fresh data from somewhere else + writer.writeBatch(); + } + System.out.println(writer.bytesWritten()); + } catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + +.. testoutput:: + + 2928 + +Reading +======= + +Reading the random access format and streaming format both offer the same API, +with the difference that random access files also offer access to any record batch by index. + +Reading Random Access Files +*************************** + +Read - From File +---------------- + +.. testcode:: + + import org.apache.arrow.memory.RootAllocator; + import org.apache.arrow.vector.ipc.ArrowFileReader; + import org.apache.arrow.vector.ipc.message.ArrowBlock; + import org.apache.arrow.vector.VectorSchemaRoot; + import java.io.File; + import java.io.FileInputStream; + import java.io.FileNotFoundException; + import java.io.FileOutputStream; + import java.io.IOException; + + try(RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)){ + File file = new File("./thirdpartydeps/arrowfiles/data1.arrow"); + try (FileInputStream fileInputStream = new FileInputStream(file); + ArrowFileReader reader = new ArrowFileReader(fileInputStream.getChannel(), rootAllocator) + ){ + System.out.println("Record batches readed: " + reader.getRecordBlocks().size()); + for (ArrowBlock arrowBlock : reader.getRecordBlocks()) { + reader.loadRecordBatch(arrowBlock); + VectorSchemaRoot vectorSchemaRootRecover = reader.getVectorSchemaRoot(); + System.out.print(vectorSchemaRootRecover.contentToTSVString()); + } + } catch 
(FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + } + +.. testoutput:: + + Record batches readed: 3 + name age + David 10 + Gladis 20 + Juan 30 + name age + Nidia 15 + Alexa 20 + Mara 15 + name age + Raul 34 + Jhon 29 + Thomy 33 + +Read - From Buffer +------------------ + +.. testcode:: + + import org.apache.arrow.memory.RootAllocator; + import org.apache.arrow.vector.ipc.ArrowFileReader; + import org.apache.arrow.vector.ipc.SeekableReadChannel; + import org.apache.arrow.vector.ipc.message.ArrowBlock; + import org.apache.arrow.vector.VectorSchemaRoot; + import org.apache.arrow.vector.util.ByteArrayReadableSeekableByteChannel; + + import java.io.IOException; + import java.nio.file.Files; + import java.nio.file.Path; + import java.nio.file.Paths; + + try(RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)) { + Path path = Paths.get("./thirdpartydeps/arrowfiles/data1.arrow"); Review comment: Ok, but it seems data1 is a random access file and data2 is a streaming file? Can we rename them to reflect that? ########## File path: java/source/io.rst ########## @@ -0,0 +1,630 @@ +.. _arrow-io: + +======================== +Reading and writing data +======================== + +The `Arrow IPC format <https://arrow.apache.org/docs/java/ipc.html>`_ defines two types of binary formats +for serializing Arrow data: the streaming format and the file format (or random access format). Such files can +be directly memory-mapped when read. + +.. contents:: + +Writing +======= + +Both writing file and streaming formats use the same API. + +Writing Random Access Files +*************************** + +Write - Out to File +------------------- + +.. 
testcode:: + + import org.apache.arrow.memory.RootAllocator; + import org.apache.arrow.vector.VarCharVector; + import org.apache.arrow.vector.IntVector; + import org.apache.arrow.vector.types.pojo.Field; + import org.apache.arrow.vector.types.pojo.FieldType; + import org.apache.arrow.vector.types.pojo.ArrowType; + import org.apache.arrow.vector.types.pojo.Schema; + import org.apache.arrow.vector.VectorSchemaRoot; + import static java.util.Arrays.asList; + import org.apache.arrow.vector.ipc.ArrowFileWriter; + + import java.io.File; + import java.io.FileOutputStream; + import java.io.IOException; + + try (RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)) { + Field name = new Field("name", FieldType.nullable(new ArrowType.Utf8()), null); + Field age = new Field("age", FieldType.nullable(new ArrowType.Int(32, true)), null); + Schema schemaPerson = new Schema(asList(name, age)); + try(VectorSchemaRoot vectorSchemaRoot = VectorSchemaRoot.create(schemaPerson, rootAllocator)){ + VarCharVector nameVector = (VarCharVector) vectorSchemaRoot.getVector("name"); + nameVector.allocateNew(3); + nameVector.set(0, "David".getBytes()); + nameVector.set(1, "Gladis".getBytes()); + nameVector.set(2, "Juan".getBytes()); + IntVector ageVector = (IntVector) vectorSchemaRoot.getVector("age"); + ageVector.allocateNew(3); + ageVector.set(0, 10); + ageVector.set(1, 20); + ageVector.set(2, 30); + vectorSchemaRoot.setRowCount(3); + File file = new File("randon_access_to_file.arrow"); + try (FileOutputStream fileOutputStream = new FileOutputStream(file); + ArrowFileWriter writer = new ArrowFileWriter(vectorSchemaRoot, null, fileOutputStream.getChannel()) + ) { + writer.start(); + for (int i = 0; i < 10; i++) { + // All the data that is going to write comes from the root and as we were not changing root data we are writing the same data 10x times + writer.writeBatch(); + } + writer.end(); + System.out.println("Record batches written: " + writer.getRecordBlocks().size()); + } catch 
(IOException e) { + e.printStackTrace(); + } + } + } + +.. testoutput:: + + Record batches written: 10 + +Write - Out to Buffer +--------------------- + +.. testcode:: + + import org.apache.arrow.memory.RootAllocator; + import org.apache.arrow.vector.VarCharVector; + import org.apache.arrow.vector.IntVector; + import org.apache.arrow.vector.types.pojo.Field; + import org.apache.arrow.vector.types.pojo.FieldType; + import org.apache.arrow.vector.types.pojo.ArrowType; + import org.apache.arrow.vector.types.pojo.Schema; + import org.apache.arrow.vector.VectorSchemaRoot; + import static java.util.Arrays.asList; + import org.apache.arrow.vector.ipc.ArrowFileWriter; + + import java.io.ByteArrayOutputStream; + import java.io.IOException; + import java.nio.channels.Channels; + + try (RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)) { + Field name = new Field("name", FieldType.nullable(new ArrowType.Utf8()), null); + Field age = new Field("age", FieldType.nullable(new ArrowType.Int(32, true)), null); + Schema schemaPerson = new Schema(asList(name, age)); + try(VectorSchemaRoot vectorSchemaRoot = VectorSchemaRoot.create(schemaPerson, rootAllocator)){ + VarCharVector nameVector = (VarCharVector) vectorSchemaRoot.getVector("name"); + nameVector.allocateNew(3); + nameVector.set(0, "David".getBytes()); + nameVector.set(1, "Gladis".getBytes()); + nameVector.set(2, "Juan".getBytes()); + IntVector ageVector = (IntVector) vectorSchemaRoot.getVector("age"); + ageVector.allocateNew(3); + ageVector.set(0, 10); + ageVector.set(1, 20); + ageVector.set(2, 30); + vectorSchemaRoot.setRowCount(3); + try (ByteArrayOutputStream out = new ByteArrayOutputStream(); + ArrowFileWriter writer = new ArrowFileWriter(vectorSchemaRoot, null, Channels.newChannel(out))) + { + writer.start(); + for (int i=0; i<10; i++){ + writer.writeBatch(); + } + System.out.println("Record batches written: " + writer.getRecordBlocks().size()); + } catch (IOException e) { + e.printStackTrace(); + } + } + } 
+ +.. testoutput:: + + Record batches written: 10 + +Writing Streaming Format +************************ + +Write - Out to File +------------------- + +.. testcode:: + + import org.apache.arrow.memory.RootAllocator; + import org.apache.arrow.vector.VarCharVector; + import org.apache.arrow.vector.IntVector; + import org.apache.arrow.vector.types.pojo.Field; + import org.apache.arrow.vector.types.pojo.FieldType; + import org.apache.arrow.vector.types.pojo.ArrowType; + import org.apache.arrow.vector.types.pojo.Schema; + import org.apache.arrow.vector.VectorSchemaRoot; + import static java.util.Arrays.asList; + import org.apache.arrow.vector.ipc.ArrowStreamWriter; + import java.io.File; + import java.io.FileOutputStream; + import java.io.IOException; + + try (RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)) { + // Create and populate data: + Field name = new Field("name", FieldType.nullable(new ArrowType.Utf8()), null); + Field age = new Field("age", FieldType.nullable(new ArrowType.Int(32, true)), null); + Schema schemaPerson = new Schema(asList(name, age)); + try(VectorSchemaRoot vectorSchemaRoot = VectorSchemaRoot.create(schemaPerson, rootAllocator)){ + VarCharVector nameVector = (VarCharVector) vectorSchemaRoot.getVector("name"); + nameVector.allocateNew(3); + nameVector.set(0, "David".getBytes()); + nameVector.set(1, "Gladis".getBytes()); + nameVector.set(2, "Juan".getBytes()); + IntVector ageVector = (IntVector) vectorSchemaRoot.getVector("age"); + ageVector.allocateNew(3); + ageVector.set(0, 10); + ageVector.set(1, 20); + ageVector.set(2, 30); + vectorSchemaRoot.setRowCount(3); + File file = new File("streaming_to_file.arrow"); + try (FileOutputStream fileOutputStream = new FileOutputStream(file); + ArrowStreamWriter writer = new ArrowStreamWriter(vectorSchemaRoot, null, fileOutputStream.getChannel()) + ){ + writer.start(); + for (int i=0; i<10; i++){ + writer.writeBatch(); + } + System.out.println(writer.bytesWritten()); + } catch (IOException e) 
{ + e.printStackTrace(); + } + } + } + +.. testoutput:: + + 2928 + +Write - Out to Buffer +--------------------- + +.. testcode:: + + import org.apache.arrow.memory.RootAllocator; + import org.apache.arrow.vector.VarCharVector; + import org.apache.arrow.vector.IntVector; + import org.apache.arrow.vector.ipc.ArrowStreamWriter; + import org.apache.arrow.vector.types.pojo.Field; + import org.apache.arrow.vector.types.pojo.FieldType; + import org.apache.arrow.vector.types.pojo.ArrowType; + import org.apache.arrow.vector.types.pojo.Schema; + import org.apache.arrow.vector.VectorSchemaRoot; + import static java.util.Arrays.asList; + + import java.io.ByteArrayOutputStream; + import java.io.IOException; + import java.nio.channels.Channels; + + try (RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)) { + // Create and populate data: + Field name = new Field("name", FieldType.nullable(new ArrowType.Utf8()), null); + Field age = new Field("age", FieldType.nullable(new ArrowType.Int(32, true)), null); + Schema schemaPerson = new Schema(asList(name, age)); + try(VectorSchemaRoot vectorSchemaRoot = VectorSchemaRoot.create(schemaPerson, rootAllocator)){ + VarCharVector nameVector = (VarCharVector) vectorSchemaRoot.getVector("name"); + nameVector.allocateNew(3); + nameVector.set(0, "David".getBytes()); + nameVector.set(1, "Gladis".getBytes()); + nameVector.set(2, "Juan".getBytes()); + IntVector ageVector = (IntVector) vectorSchemaRoot.getVector("age"); + ageVector.allocateNew(3); + ageVector.set(0, 10); + ageVector.set(1, 20); + ageVector.set(2, 30); + vectorSchemaRoot.setRowCount(3); + try (ByteArrayOutputStream out = new ByteArrayOutputStream(); + ArrowStreamWriter writer = new ArrowStreamWriter(vectorSchemaRoot, null, Channels.newChannel(out)) + ){ + writer.start(); + for (int i=0; i<10; i++){ + writer.writeBatch(); + } + System.out.println(writer.bytesWritten()); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + +.. 
testoutput:: + + 2928 + +Reading +======= + +Reading the random access format and streaming format both offer the same API, +with the difference that random access files also offer access to any record batch by index. + +Reading Random Access Files +*************************** + +Read - From File +---------------- + +We are providing a path with auto generated arrow files for testing purposes, change that at your convenience. + +.. testcode:: + + import org.apache.arrow.memory.RootAllocator; + import org.apache.arrow.vector.ipc.ArrowFileReader; + import org.apache.arrow.vector.ipc.message.ArrowBlock; + import org.apache.arrow.vector.VectorSchemaRoot; + import java.io.File; + import java.io.FileInputStream; + import java.io.FileOutputStream; + import java.io.IOException; + + try(RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)){ + File file = new File("./thirdpartydeps/arrowfiles/data1.arrow"); + try (FileInputStream fileInputStream = new FileInputStream(file); + ArrowFileReader reader = new ArrowFileReader(fileInputStream.getChannel(), rootAllocator) + ){ + System.out.println("Record batches in file: " + reader.getRecordBlocks().size()); + for (ArrowBlock arrowBlock : reader.getRecordBlocks()) { + reader.loadRecordBatch(arrowBlock); + VectorSchemaRoot vectorSchemaRootRecover = reader.getVectorSchemaRoot(); + System.out.print(vectorSchemaRootRecover.contentToTSVString()); + } + } catch (IOException e) { + e.printStackTrace(); + } + } + +.. testoutput:: + + Record batches in file: 3 + name age + David 10 + Gladis 20 + Juan 30 + name age + Nidia 15 + Alexa 20 + Mara 15 + name age + Raul 34 + Jhon 29 + Thomy 33 + +Read - From Buffer +------------------ + +.. 
testcode:: + + import org.apache.arrow.memory.RootAllocator; + import org.apache.arrow.vector.ipc.ArrowFileReader; + import org.apache.arrow.vector.ipc.SeekableReadChannel; + import org.apache.arrow.vector.ipc.message.ArrowBlock; + import org.apache.arrow.vector.VectorSchemaRoot; + import org.apache.arrow.vector.util.ByteArrayReadableSeekableByteChannel; + + import java.io.IOException; + import java.nio.file.Files; + import java.nio.file.Path; + import java.nio.file.Paths; + + try(RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)) { + Path path = Paths.get("./thirdpartydeps/arrowfiles/data1.arrow"); + try (ArrowFileReader reader = new ArrowFileReader(new SeekableReadChannel(new ByteArrayReadableSeekableByteChannel(Files.readAllBytes(path))), rootAllocator)){ + System.out.println("Record batches in file: " + reader.getRecordBlocks().size()); + for (ArrowBlock arrowBlock : reader.getRecordBlocks()) { + reader.loadRecordBatch(arrowBlock); + VectorSchemaRoot vectorSchemaRootRecover = reader.getVectorSchemaRoot(); + System.out.print(vectorSchemaRootRecover.contentToTSVString()); + } + } catch (IOException e) { + e.printStackTrace(); + } + } + +.. testoutput:: + + Record batches in file: 3 + name age + David 10 + Gladis 20 + Juan 30 + name age + Nidia 15 + Alexa 20 + Mara 15 + name age + Raul 34 + Jhon 29 + Thomy 33 + +Reading Streaming Format +************************ + +Read - From File +---------------- + +.. 
testcode:: + + import org.apache.arrow.memory.RootAllocator; + import org.apache.arrow.vector.ipc.ArrowStreamReader; + import org.apache.arrow.vector.VectorSchemaRoot; + import java.io.File; + import java.io.FileInputStream; + import java.io.IOException; + + try(RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)) { + File file = new File("./thirdpartydeps/arrowfiles/data2.arrow"); + try (FileInputStream fileInputStreamForStream = new FileInputStream(file); + ArrowStreamReader reader = new ArrowStreamReader(fileInputStreamForStream, rootAllocator)) { + while (reader.loadNextBatch()) { + VectorSchemaRoot vectorSchemaRootRecover = reader.getVectorSchemaRoot(); + System.out.print(vectorSchemaRootRecover.contentToTSVString()); + } + } catch (IOException e) { + e.printStackTrace(); + } + } + +.. testoutput:: + + name age + David 10 + Gladis 20 + Juan 30 + name age + Nidia 15 + Alexa 20 + Mara 15 + name age + Raul 34 + Jhon 29 + Thomy 33 + +Read - From Buffer +------------------ + +.. testcode:: + + import org.apache.arrow.memory.RootAllocator; + import org.apache.arrow.vector.ipc.ArrowStreamReader; + + import java.io.ByteArrayInputStream; + import java.io.IOException; + import java.nio.file.Files; + import java.nio.file.Path; + import java.nio.file.Paths; + + try(RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)) { + Path path = Paths.get("./thirdpartydeps/arrowfiles/data2.arrow"); + try (ArrowStreamReader reader = new ArrowStreamReader(new ByteArrayInputStream(Files.readAllBytes(path)), rootAllocator)){ + while(reader.loadNextBatch()){ + System.out.print(reader.getVectorSchemaRoot().contentToTSVString()); + } + } catch (IOException e) { + e.printStackTrace(); + } + } + +.. 
testoutput:: + + name age + David 10 + Gladis 20 + Juan 30 + name age + Nidia 15 + Alexa 20 + Mara 15 + name age + Raul 34 + Jhon 29 + Thomy 33 + +Reading Parquet File +******************** + +Please check :doc:`JNI Dataset <./dataset>` + +Writing Fresh Data +================== + +Write Update Data Modifying The Root +************************************ + +.. testcode:: + + import org.apache.arrow.memory.RootAllocator; + import org.apache.arrow.vector.VarCharVector; + import org.apache.arrow.vector.IntVector; + import org.apache.arrow.vector.ipc.ArrowFileReader; + import org.apache.arrow.vector.ipc.message.ArrowBlock; + import org.apache.arrow.vector.types.pojo.Field; + import org.apache.arrow.vector.types.pojo.FieldType; + import org.apache.arrow.vector.types.pojo.ArrowType; + import org.apache.arrow.vector.types.pojo.Schema; + import org.apache.arrow.vector.VectorSchemaRoot; + import static java.util.Arrays.asList; + import org.apache.arrow.vector.ipc.ArrowFileWriter; + + import java.io.File; + import java.io.FileInputStream; + import java.io.FileOutputStream; + import java.io.IOException; + + try (RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)) { + Field name = new Field("name", FieldType.nullable(new ArrowType.Utf8()), null); + Field age = new Field("age", FieldType.nullable(new ArrowType.Int(32, true)), null); + Schema schemaPerson = new Schema(asList(name, age)); + try(VectorSchemaRoot vectorSchemaRoot = VectorSchemaRoot.create(schemaPerson, rootAllocator)){ + VarCharVector nameVector = (VarCharVector) vectorSchemaRoot.getVector("name"); + nameVector.allocateNew(3); + nameVector.set(0, "David".getBytes()); + nameVector.set(1, "Gladis".getBytes()); + nameVector.set(2, "Juan".getBytes()); + IntVector ageVector = (IntVector) vectorSchemaRoot.getVector("age"); + ageVector.allocateNew(3); + ageVector.set(0, 10); + ageVector.set(1, 20); + ageVector.set(2, 30); + vectorSchemaRoot.setRowCount(3); + File file = new 
File("randon_access_to_file_update_root.arrow"); + try (FileOutputStream fileOutputStream = new FileOutputStream(file); + ArrowFileWriter writer = new ArrowFileWriter(vectorSchemaRoot, null, fileOutputStream.getChannel()) + ) { + writer.start(); + writer.writeBatch(); + for (int i = 0; i < 2; i++) { + nameVector.set(0, ("New-"+(i+1)).getBytes()); + nameVector.set(1, ("New-"+(i+1)).getBytes()); + nameVector.set(2, ("New-"+(i+1)).getBytes()); + ageVector.set(0, (i+2)*10); + ageVector.set(1, (i+2)*20); + ageVector.set(2, (i+2)*30); + writer.writeBatch(); + } + } catch (IOException e) { + e.printStackTrace(); + } + try (FileInputStream fileInputStream = new FileInputStream(file); + ArrowFileReader reader = new ArrowFileReader(fileInputStream.getChannel(), rootAllocator) + ){ + System.out.println("Record batches in file: " + reader.getRecordBlocks().size()); + for (ArrowBlock arrowBlock : reader.getRecordBlocks()) { + reader.loadRecordBatch(arrowBlock); + VectorSchemaRoot vectorSchemaRootRecover = reader.getVectorSchemaRoot(); + System.out.print(vectorSchemaRootRecover.contentToTSVString()); + } + } catch (IOException e) { + e.printStackTrace(); + } + } + } + +.. testoutput:: + + Record batches in file: 3 + name age + David 10 + Gladis 20 + Juan 30 + name age + New-1 20 + New-1 40 + New-1 60 + name age + New-2 30 + New-2 60 + New-2 90 + + +Write Update Data With VectorLoader +*********************************** + +.. 
testcode:: + + import org.apache.arrow.memory.RootAllocator; + import org.apache.arrow.vector.VarCharVector; + import org.apache.arrow.vector.IntVector; + import org.apache.arrow.vector.VectorLoader; + import org.apache.arrow.vector.VectorUnloader; + import org.apache.arrow.vector.ipc.ArrowFileReader; + import org.apache.arrow.vector.ipc.message.ArrowBlock; + import org.apache.arrow.vector.ipc.message.ArrowRecordBatch; + import org.apache.arrow.vector.types.pojo.Field; + import org.apache.arrow.vector.types.pojo.FieldType; + import org.apache.arrow.vector.types.pojo.ArrowType; + import org.apache.arrow.vector.types.pojo.Schema; + import org.apache.arrow.vector.VectorSchemaRoot; + import static java.util.Arrays.asList; + import org.apache.arrow.vector.ipc.ArrowFileWriter; + + import java.io.File; + import java.io.FileInputStream; + import java.io.FileOutputStream; + import java.io.IOException; + + try (RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)) { + Field name = new Field("name", FieldType.nullable(new ArrowType.Utf8()), null); + Field age = new Field("age", FieldType.nullable(new ArrowType.Int(32, true)), null); + Schema schema = new Schema(asList(name, age)); + try (VectorSchemaRoot vectorSchemaRoot = VectorSchemaRoot.create(schema, rootAllocator)) { + VarCharVector nameVector = (VarCharVector) vectorSchemaRoot.getVector("name"); + nameVector.allocateNew(3); + nameVector.set(0, "Raul".getBytes()); + nameVector.set(1, "Johao".getBytes()); + nameVector.set(2, "Rafael".getBytes()); + IntVector ageVector = (IntVector) vectorSchemaRoot.getVector("age"); + ageVector.allocateNew(3); + ageVector.set(0, 10); + ageVector.set(1, 20); + ageVector.set(2, 30); + vectorSchemaRoot.setRowCount(3); Review comment: This data is never used right? Nor is the schema above? ########## File path: java/source/io.rst ########## @@ -0,0 +1,646 @@ +.. 
_arrow-io: + +======================== +Reading and writing data +======================== + +The `Arrow IPC format <https://arrow.apache.org/docs/java/ipc.html>`_ defines two types of binary formats +for serializing Arrow data: the streaming format and the file format (or random access format). Such files can +be directly memory-mapped when read. + +.. contents:: + +Writing +======= + +Both writing file and streaming formats use the same API. + +Writing Random Access Files +*************************** + +Write - Out to File +------------------- + +.. testcode:: + + import org.apache.arrow.memory.RootAllocator; + import org.apache.arrow.vector.VarCharVector; + import org.apache.arrow.vector.IntVector; + import org.apache.arrow.vector.types.pojo.Field; + import org.apache.arrow.vector.types.pojo.FieldType; + import org.apache.arrow.vector.types.pojo.ArrowType; + import org.apache.arrow.vector.types.pojo.Schema; + import org.apache.arrow.vector.VectorSchemaRoot; + import static java.util.Arrays.asList; + import org.apache.arrow.vector.ipc.ArrowFileWriter; + + import java.io.File; + import java.io.FileNotFoundException; + import java.io.FileOutputStream; + import java.io.IOException; + + try (RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)) { + Field name = new Field("name", FieldType.nullable(new ArrowType.Utf8()), null); + Field age = new Field("age", FieldType.nullable(new ArrowType.Int(32, true)), null); + Schema schemaPerson = new Schema(asList(name, age)); + try(VectorSchemaRoot vectorSchemaRoot = VectorSchemaRoot.create(schemaPerson, rootAllocator)){ + VarCharVector nameVector = (VarCharVector) vectorSchemaRoot.getVector("name"); + nameVector.allocateNew(3); + nameVector.set(0, "David".getBytes()); + nameVector.set(1, "Gladis".getBytes()); + nameVector.set(2, "Juan".getBytes()); + IntVector ageVector = (IntVector) vectorSchemaRoot.getVector("age"); + ageVector.allocateNew(3); + ageVector.set(0, 10); + ageVector.set(1, 20); + ageVector.set(2, 
30); + vectorSchemaRoot.setRowCount(3); + File file = new File("randon_access_to_file.arrow"); + try (FileOutputStream fileOutputStream = new FileOutputStream(file); + ArrowFileWriter writer = new ArrowFileWriter(vectorSchemaRoot, null, fileOutputStream.getChannel()) + ) { + writer.start(); + for (int i = 0; i < 10; i++) { + // Generate data or modify the root or use a VectorLoader to get fresh data from somewhere else + writer.writeBatch(); + } + writer.end(); + System.out.println("Record batches written: " + writer.getRecordBlocks().size()); + } catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + +.. testoutput:: + + Record batches written: 10 + +Write - Out to Buffer +--------------------- + +.. testcode:: + + import org.apache.arrow.memory.RootAllocator; + import org.apache.arrow.vector.VarCharVector; + import org.apache.arrow.vector.IntVector; + import org.apache.arrow.vector.types.pojo.Field; + import org.apache.arrow.vector.types.pojo.FieldType; + import org.apache.arrow.vector.types.pojo.ArrowType; + import org.apache.arrow.vector.types.pojo.Schema; + import org.apache.arrow.vector.VectorSchemaRoot; + import static java.util.Arrays.asList; + import org.apache.arrow.vector.ipc.ArrowFileWriter; + + import java.io.ByteArrayOutputStream; + import java.io.FileNotFoundException; + import java.io.IOException; + import java.nio.channels.Channels; + + try (RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)) { + Field name = new Field("name", FieldType.nullable(new ArrowType.Utf8()), null); + Field age = new Field("age", FieldType.nullable(new ArrowType.Int(32, true)), null); + Schema schemaPerson = new Schema(asList(name, age)); + try(VectorSchemaRoot vectorSchemaRoot = VectorSchemaRoot.create(schemaPerson, rootAllocator)){ + VarCharVector nameVector = (VarCharVector) vectorSchemaRoot.getVector("name"); + nameVector.allocateNew(3); + nameVector.set(0, "David".getBytes()); + 
nameVector.set(1, "Gladis".getBytes()); + nameVector.set(2, "Juan".getBytes()); + IntVector ageVector = (IntVector) vectorSchemaRoot.getVector("age"); + ageVector.allocateNew(3); + ageVector.set(0, 10); + ageVector.set(1, 20); + ageVector.set(2, 30); + vectorSchemaRoot.setRowCount(3); + try (ByteArrayOutputStream out = new ByteArrayOutputStream(); + ArrowFileWriter writer = new ArrowFileWriter(vectorSchemaRoot, null, Channels.newChannel(out))) + { + writer.start(); + for (int i=0; i<10; i++){ + // Generate data or modify the root or use a VectorLoader to get fresh data from somewhere else + writer.writeBatch(); + } + System.out.println("Record batches written: " + writer.getRecordBlocks().size()); + } catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + +.. testoutput:: + + Record batches written: 10 + +Writing Streaming Format +************************ + +Write - Out to File +------------------- + +.. testcode:: + + import org.apache.arrow.memory.RootAllocator; + import org.apache.arrow.vector.VarCharVector; + import org.apache.arrow.vector.IntVector; + import org.apache.arrow.vector.types.pojo.Field; + import org.apache.arrow.vector.types.pojo.FieldType; + import org.apache.arrow.vector.types.pojo.ArrowType; + import org.apache.arrow.vector.types.pojo.Schema; + import org.apache.arrow.vector.VectorSchemaRoot; + import static java.util.Arrays.asList; + import org.apache.arrow.vector.ipc.ArrowStreamWriter; + import java.io.File; + import java.io.FileNotFoundException; + import java.io.FileOutputStream; + import java.io.IOException; + + try (RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)) { + // Create and populate data: + Field name = new Field("name", FieldType.nullable(new ArrowType.Utf8()), null); + Field age = new Field("age", FieldType.nullable(new ArrowType.Int(32, true)), null); + Schema schemaPerson = new Schema(asList(name, age)); + try(VectorSchemaRoot vectorSchemaRoot = 
VectorSchemaRoot.create(schemaPerson, rootAllocator)){ + VarCharVector nameVector = (VarCharVector) vectorSchemaRoot.getVector("name"); + nameVector.allocateNew(3); + nameVector.set(0, "David".getBytes()); + nameVector.set(1, "Gladis".getBytes()); + nameVector.set(2, "Juan".getBytes()); + IntVector ageVector = (IntVector) vectorSchemaRoot.getVector("age"); + ageVector.allocateNew(3); + ageVector.set(0, 10); + ageVector.set(1, 20); + ageVector.set(2, 30); + vectorSchemaRoot.setRowCount(3); + File file = new File("streaming_to_file.arrow"); + try (FileOutputStream fileOutputStream = new FileOutputStream(file); + ArrowStreamWriter writer = new ArrowStreamWriter(vectorSchemaRoot, null, fileOutputStream.getChannel()) + ){ + writer.start(); + for (int i=0; i<10; i++){ + // Generate data or modify the root or use a VectorLoader to get fresh data from somewhere else + writer.writeBatch(); + } + System.out.println(writer.bytesWritten()); + } catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + +.. testoutput:: + + 2928 + +Write - Out to Buffer +--------------------- + +.. 
testcode:: + + import org.apache.arrow.memory.RootAllocator; + import org.apache.arrow.vector.VarCharVector; + import org.apache.arrow.vector.IntVector; + import org.apache.arrow.vector.ipc.ArrowStreamWriter; + import org.apache.arrow.vector.types.pojo.Field; + import org.apache.arrow.vector.types.pojo.FieldType; + import org.apache.arrow.vector.types.pojo.ArrowType; + import org.apache.arrow.vector.types.pojo.Schema; + import org.apache.arrow.vector.VectorSchemaRoot; + import static java.util.Arrays.asList; + + import java.io.ByteArrayOutputStream; + import java.io.FileNotFoundException; + import java.io.IOException; + import java.nio.channels.Channels; + + try (RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)) { + // Create and populate data: + Field name = new Field("name", FieldType.nullable(new ArrowType.Utf8()), null); + Field age = new Field("age", FieldType.nullable(new ArrowType.Int(32, true)), null); + Schema schemaPerson = new Schema(asList(name, age)); + try(VectorSchemaRoot vectorSchemaRoot = VectorSchemaRoot.create(schemaPerson, rootAllocator)){ + VarCharVector nameVector = (VarCharVector) vectorSchemaRoot.getVector("name"); + nameVector.allocateNew(3); + nameVector.set(0, "David".getBytes()); + nameVector.set(1, "Gladis".getBytes()); + nameVector.set(2, "Juan".getBytes()); + IntVector ageVector = (IntVector) vectorSchemaRoot.getVector("age"); + ageVector.allocateNew(3); + ageVector.set(0, 10); + ageVector.set(1, 20); + ageVector.set(2, 30); + vectorSchemaRoot.setRowCount(3); + try (ByteArrayOutputStream out = new ByteArrayOutputStream(); + ArrowStreamWriter writer = new ArrowStreamWriter(vectorSchemaRoot, null, Channels.newChannel(out)) + ){ + writer.start(); + for (int i=0; i<10; i++){ + // Generate data or modify the root or use a VectorLoader to get fresh data from somewhere else + writer.writeBatch(); + } + System.out.println(writer.bytesWritten()); + } catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException 
e) { + e.printStackTrace(); + } + } + } + +.. testoutput:: + + 2928 + +Reading +======= + +Reading the random access format and streaming format both offer the same API, +with the difference that random access files also offer access to any record batch by index. + +Reading Random Access Files +*************************** + +Read - From File +---------------- + +.. testcode:: + + import org.apache.arrow.memory.RootAllocator; + import org.apache.arrow.vector.ipc.ArrowFileReader; + import org.apache.arrow.vector.ipc.message.ArrowBlock; + import org.apache.arrow.vector.VectorSchemaRoot; + import java.io.File; + import java.io.FileInputStream; + import java.io.FileNotFoundException; + import java.io.FileOutputStream; + import java.io.IOException; + + try(RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)){ + File file = new File("./thirdpartydeps/arrowfiles/data1.arrow"); + try (FileInputStream fileInputStream = new FileInputStream(file); + ArrowFileReader reader = new ArrowFileReader(fileInputStream.getChannel(), rootAllocator) + ){ + System.out.println("Record batches read: " + reader.getRecordBlocks().size()); + for (ArrowBlock arrowBlock : reader.getRecordBlocks()) { + reader.loadRecordBatch(arrowBlock); + VectorSchemaRoot vectorSchemaRootRecover = reader.getVectorSchemaRoot(); + System.out.print(vectorSchemaRootRecover.contentToTSVString()); + } + } catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + } + +.. testoutput:: + + Record batches read: 3 + name age + David 10 + Gladis 20 + Juan 30 + name age + Nidia 15 + Alexa 20 + Mara 15 + name age + Raul 34 + Jhon 29 + Thomy 33 + +Read - From Buffer +------------------ + +.. 
testcode:: + + import org.apache.arrow.memory.RootAllocator; + import org.apache.arrow.vector.ipc.ArrowFileReader; + import org.apache.arrow.vector.ipc.SeekableReadChannel; + import org.apache.arrow.vector.ipc.message.ArrowBlock; + import org.apache.arrow.vector.VectorSchemaRoot; + import org.apache.arrow.vector.util.ByteArrayReadableSeekableByteChannel; + + import java.io.IOException; + import java.nio.file.Files; + import java.nio.file.Path; + import java.nio.file.Paths; + + try(RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)) { + Path path = Paths.get("./thirdpartydeps/arrowfiles/data1.arrow"); + try (ArrowFileReader reader = new ArrowFileReader(new SeekableReadChannel(new ByteArrayReadableSeekableByteChannel(Files.readAllBytes(path))), rootAllocator)){ + System.out.println("Record batches read: " + reader.getRecordBlocks().size()); + for (ArrowBlock arrowBlock : reader.getRecordBlocks()) { + reader.loadRecordBatch(arrowBlock); + VectorSchemaRoot vectorSchemaRootRecover = reader.getVectorSchemaRoot(); + System.out.print(vectorSchemaRootRecover.contentToTSVString()); + } + } catch (IOException e) { + e.printStackTrace(); + } + } + +.. testoutput:: + + Record batches read: 3 + name age + David 10 + Gladis 20 + Juan 30 + name age + Nidia 15 + Alexa 20 + Mara 15 + name age + Raul 34 + Jhon 29 + Thomy 33 + +Reading Streaming Format +************************ + +Read - From File +---------------- + +.. 
testcode:: + + import org.apache.arrow.memory.RootAllocator; + import org.apache.arrow.vector.ipc.ArrowStreamReader; + import org.apache.arrow.vector.VectorSchemaRoot; + import java.io.File; + import java.io.FileInputStream; + import java.io.IOException; + + try(RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)) { + File file = new File("./thirdpartydeps/arrowfiles/data2.arrow"); + try (FileInputStream fileInputStreamForStream = new FileInputStream(file); + ArrowStreamReader reader = new ArrowStreamReader(fileInputStreamForStream, rootAllocator)) { + while (reader.loadNextBatch()) { + VectorSchemaRoot vectorSchemaRootRecover = reader.getVectorSchemaRoot(); + System.out.print(vectorSchemaRootRecover.contentToTSVString()); + } + } catch (IOException e) { + e.printStackTrace(); + } + } + +.. testoutput:: + + name age + David 10 + Gladis 20 + Juan 30 + name age + Nidia 15 + Alexa 20 + Mara 15 + name age + Raul 34 + Jhon 29 + Thomy 33 + +Read - From Buffer +------------------ + +.. testcode:: + + import org.apache.arrow.memory.RootAllocator; + import org.apache.arrow.vector.ipc.ArrowStreamReader; + + import java.io.ByteArrayInputStream; + import java.io.IOException; + import java.nio.file.Files; + import java.nio.file.Path; + import java.nio.file.Paths; + + try(RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)) { + Path path = Paths.get("./thirdpartydeps/arrowfiles/data2.arrow"); + try (ArrowStreamReader reader = new ArrowStreamReader(new ByteArrayInputStream(Files.readAllBytes(path)), rootAllocator)){ + while(reader.loadNextBatch()){ + System.out.print(reader.getVectorSchemaRoot().contentToTSVString()); + } + } catch (IOException e) { + e.printStackTrace(); + } + } + +.. 
testoutput:: + + name age + David 10 + Gladis 20 + Juan 30 + name age + Nidia 15 + Alexa 20 + Mara 15 + name age + Raul 34 + Jhon 29 + Thomy 33 + +Reading Parquet File +******************** + +Please check :ref:`arrow-dataset` + +Writing Fresh Data Review comment: To me, the recipe feels very unnatural. For instance, why not just have this in the main recipe up above? Then we don't need two separate recipes. ``` // Delete the set() calls above writer.start(); for (int i = 0; i < 3; i++) { nameVector.set(0, ("New-"+(i+1)).getBytes()); nameVector.set(1, ("New-"+(i+1)).getBytes()); nameVector.set(2, ("New-"+(i+1)).getBytes()); ageVector.set(0, (i+2)*10); ageVector.set(1, (i+2)*20); ageVector.set(2, (i+2)*30); writer.writeBatch(); } ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
