This is an automated email from the ASF dual-hosted git repository.
lidavidm pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-cookbook.git
The following commit(s) were added to refs/heads/main by this push:
new 4396e96 ARROW-16170: [Java][Docs] Synch java code tutorial with java
cookbook (#183)
4396e96 is described below
commit 4396e9692386a8f170e55667422db780d65b02a5
Author: david dali susanibar arce <[email protected]>
AuthorDate: Tue Apr 12 13:58:34 2022 -0500
ARROW-16170: [Java][Docs] Synch java code tutorial with java cookbook (#183)
* Synch java code tutorial with java cookbook
* Apply suggestions from code review
Co-authored-by: David Li <[email protected]>
* Solving PR comments
* Apply suggestions from code review
Co-authored-by: David Li <[email protected]>
* Solving PR comments
Co-authored-by: David Li <[email protected]>
---
java/source/create.rst | 157 ++++++++++++++---------------------
java/source/data.rst | 213 ++++++++++++++++++++++++++++--------------------
java/source/dataset.rst | 183 ++++++++++++++++++++++-------------------
java/source/flight.rst | 7 +-
java/source/index.rst | 4 +-
java/source/io.rst | 157 +++++++++++++++++++----------------
java/source/schema.rst | 203 ++++++++++++++++++++++++---------------------
7 files changed, 488 insertions(+), 436 deletions(-)
diff --git a/java/source/create.rst b/java/source/create.rst
index 77113f5..f7680c9 100644
--- a/java/source/create.rst
+++ b/java/source/create.rst
@@ -4,33 +4,40 @@
Creating Arrow Objects
======================
-| A vector is the basic unit in the Arrow Java library. Vector by definition
is intended to be mutable, a Vector can be changed it is mutable.
+A vector is the basic unit in the Arrow Java library. Data types
+describe the types of values; ValueVectors are sequences of typed
+values. Vectors represent a one-dimensional sequence of values of
+the same type. They are mutable containers.
-| Vectors are provided by java arrow for the interface `FieldVector
<https://arrow.apache.org/docs/java/reference/org/apache/arrow/vector/FieldVector.html>`_
that extends `ValueVector <https://arrow.apache.org/docs/java/vector.html>`_.
+Vectors implement the interface `ValueVector`_. The Arrow libraries provide
+implementations of vectors for various data types.
.. contents::
Creating Vectors (arrays)
=========================
-Array of Int (32-bit integer value)
------------------------------------
+Array of Int
+------------
.. testcode::
- import org.apache.arrow.vector.IntVector;
- import org.apache.arrow.memory.RootAllocator;
-
- RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE);
-
- IntVector intVector = new IntVector("intVector", rootAllocator);
- intVector.allocateNew(3);
- intVector.set(0, 1);
- intVector.set(1, 2);
- intVector.set(2, 3);
- intVector.setValueCount(3);
-
- System.out.print(intVector);
+ import org.apache.arrow.memory.BufferAllocator;
+ import org.apache.arrow.memory.RootAllocator;
+ import org.apache.arrow.vector.IntVector;
+
+ try(
+ BufferAllocator allocator = new RootAllocator();
+ IntVector intVector = new IntVector("intVector", allocator)
+ ) {
+ intVector.allocateNew(3);
+ intVector.set(0, 1);
+ intVector.set(1, 2);
+ intVector.set(2, 3);
+ intVector.setValueCount(3);
+
+ System.out.print(intVector);
+ }
.. testoutput::
@@ -42,19 +49,22 @@ Array of Varchar
.. testcode::
- import org.apache.arrow.vector.VarCharVector;
+ import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
+ import org.apache.arrow.vector.VarCharVector;
- RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE);
-
- VarCharVector varVector = new VarCharVector("varVector", rootAllocator);
- varVector.allocateNew(3);
- varVector.set(0, "one".getBytes());
- varVector.set(1, "two".getBytes());
- varVector.set(2, "three".getBytes());
- varVector.setValueCount(3);
-
- System.out.print(varVector);
+ try(
+ BufferAllocator allocator = new RootAllocator();
+ VarCharVector varCharVector = new VarCharVector("varCharVector",
allocator);
+ ) {
+ varCharVector.allocateNew(3);
+ varCharVector.set(0, "one".getBytes());
+ varCharVector.set(1, "two".getBytes());
+ varCharVector.set(2, "three".getBytes());
+ varCharVector.setValueCount(3);
+
+ System.out.print(varCharVector);
+ }
.. testoutput::
@@ -65,81 +75,38 @@ Array of List
.. testcode::
+ import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.complex.impl.UnionListWriter;
import org.apache.arrow.vector.complex.ListVector;
- RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE);
- ListVector listVector = ListVector.empty("listVector", rootAllocator);
- UnionListWriter listWriter = listVector.getWriter();
- int[] data = new int[] { 1, 2, 3, 10, 20, 30, 100, 200, 300, 1000, 2000,
3000 };
- int tmp_index = 0;
- for(int i = 0; i < 4; i++) {
- listWriter.setPosition(i);
- listWriter.startList();
- for(int j = 0; j < 3; j++) {
- listWriter.writeInt(data[tmp_index]);
- tmp_index = tmp_index + 1;
+ try(
+ BufferAllocator allocator = new RootAllocator();
+ ListVector listVector = ListVector.empty("listVector", allocator);
+ UnionListWriter listWriter = listVector.getWriter()
+ ) {
+ int[] data = new int[] { 1, 2, 3, 10, 20, 30, 100, 200, 300, 1000,
2000, 3000 };
+ int tmp_index = 0;
+ for(int i = 0; i < 4; i++) {
+ listWriter.setPosition(i);
+ listWriter.startList();
+ for(int j = 0; j < 3; j++) {
+ listWriter.writeInt(data[tmp_index]);
+ tmp_index = tmp_index + 1;
+ }
+ listWriter.setValueCount(3);
+ listWriter.endList();
}
- listWriter.setValueCount(3);
- listWriter.endList();
- }
- listVector.setValueCount(4);
+ listVector.setValueCount(4);
- System.out.print(listVector);
+ System.out.print(listVector);
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
.. testoutput::
[[1,2,3], [10,20,30], [100,200,300], [1000,2000,3000]]
-Creating VectorSchemaRoot (Table)
-=================================
-
-A `VectorSchemaRoot
<https://arrow.apache.org/docs/java/vector_schema_root.html>`_
-is a container that can hold batches, batches flow through VectorSchemaRoot as
part of a pipeline.
-
-.. testcode::
-
- import org.apache.arrow.memory.RootAllocator;
- import org.apache.arrow.vector.VarCharVector;
- import org.apache.arrow.vector.IntVector;
- import org.apache.arrow.vector.types.pojo.Field;
- import org.apache.arrow.vector.types.pojo.FieldType;
- import org.apache.arrow.vector.types.pojo.ArrowType;
- import org.apache.arrow.vector.types.pojo.Schema;
- import org.apache.arrow.vector.VectorSchemaRoot;
- import static java.util.Arrays.asList;
-
- // create a column data type
- Field name = new Field("name", FieldType.nullable(new
ArrowType.Utf8()), null);
- Field age = new Field("age", FieldType.nullable(new ArrowType.Int(32,
true)), null);
-
- // create a definition
- Schema schemaPerson = new Schema(asList(name, age));
- RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE);
- VectorSchemaRoot vectorSchemaRoot =
VectorSchemaRoot.create(schemaPerson, rootAllocator);
-
- // getting field vectors
- VarCharVector nameVector = (VarCharVector)
vectorSchemaRoot.getVector("name");
- nameVector.allocateNew(3);
- nameVector.set(0, "david".getBytes());
- nameVector.set(1, "gladis".getBytes());
- nameVector.set(2, "juan".getBytes());
- nameVector.setValueCount(3);
- IntVector ageVector = (IntVector) vectorSchemaRoot.getVector("age");
- ageVector.allocateNew(3);
- ageVector.set(0, 10);
- ageVector.set(1, 20);
- ageVector.set(2, 30);
- ageVector.setValueCount(3);
-
- vectorSchemaRoot.setRowCount(3);
-
- System.out.print(vectorSchemaRoot.contentToTSVString());
-
-.. testoutput::
-
- name age
- david 10
- gladis 20
- juan 30
+.. _`FieldVector`:
https://arrow.apache.org/docs/java/reference/org/apache/arrow/vector/FieldVector.html
+.. _`ValueVector`: https://arrow.apache.org/docs/java/vector.html
\ No newline at end of file
diff --git a/java/source/data.rst b/java/source/data.rst
index ce69999..0833a90 100644
--- a/java/source/data.rst
+++ b/java/source/data.rst
@@ -11,23 +11,27 @@ Compare Vectors for Field Equality
.. testcode::
+ import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.vector.compare.TypeEqualsVisitor;
import org.apache.arrow.memory.RootAllocator;
- RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE);
- IntVector right = new IntVector("int", rootAllocator);
- right.allocateNew(3);
- right.set(0, 10);
- right.set(1, 20);
- right.set(2, 30);
- right.setValueCount(3);
- IntVector left1 = new IntVector("int", rootAllocator);
- IntVector left2 = new IntVector("int2", rootAllocator);
- TypeEqualsVisitor visitor = new TypeEqualsVisitor(right);
-
- System.out.println(visitor.equals(left1));
- System.out.println(visitor.equals(left2));
+ try(
+ BufferAllocator allocator = new RootAllocator();
+ IntVector right = new IntVector("int", allocator);
+ ) {
+ right.allocateNew(3);
+ right.set(0, 10);
+ right.set(1, 20);
+ right.set(2, 30);
+ right.setValueCount(3);
+ IntVector left1 = new IntVector("int", allocator);
+ IntVector left2 = new IntVector("int2", allocator);
+ TypeEqualsVisitor visitor = new TypeEqualsVisitor(right);
+
+ System.out.println(visitor.equals(left1));
+ System.out.println(visitor.equals(left2));
+ }
.. testoutput::
@@ -39,27 +43,33 @@ Compare Vectors Equality
.. testcode::
+ import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.IntVector;
- import org.apache.arrow.vector.compare.VectorEqualsVisitor;
import org.apache.arrow.memory.RootAllocator;
+ import org.apache.arrow.vector.compare.VectorEqualsVisitor;
- RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE);
- IntVector vector1 = new IntVector("vector1", rootAllocator);
- vector1.allocateNew(1);
- vector1.set(0, 10);
- vector1.setValueCount(1);
- IntVector vector2 = new IntVector("vector1", rootAllocator);
- vector2.allocateNew(1);
- vector2.set(0, 10);
- vector2.setValueCount(1);
- IntVector vector3 = new IntVector("vector1", rootAllocator);
- vector3.allocateNew(1);
- vector3.set(0, 20);
- vector3.setValueCount(1);
- VectorEqualsVisitor visitor = new VectorEqualsVisitor();
-
- System.out.println(visitor.vectorEquals(vector1, vector2));
- System.out.println(visitor.vectorEquals(vector1, vector3));
+ try(
+ BufferAllocator allocator = new RootAllocator();
+ IntVector vector1 = new IntVector("vector1", allocator);
+ IntVector vector2 = new IntVector("vector1", allocator);
+ IntVector vector3 = new IntVector("vector1", allocator)
+ ) {
+ vector1.allocateNew(1);
+ vector1.set(0, 10);
+ vector1.setValueCount(1);
+
+ vector2.allocateNew(1);
+ vector2.set(0, 10);
+ vector2.setValueCount(1);
+
+ vector3.allocateNew(1);
+ vector3.set(0, 20);
+ vector3.setValueCount(1);
+ VectorEqualsVisitor visitor = new VectorEqualsVisitor();
+
+ System.out.println(visitor.vectorEquals(vector1, vector2));
+ System.out.println(visitor.vectorEquals(vector1, vector3));
+ }
.. testoutput::
@@ -75,21 +85,25 @@ Comparing two values at the given indices in the vectors:
import org.apache.arrow.algorithm.sort.DefaultVectorComparators;
import org.apache.arrow.algorithm.sort.VectorValueComparator;
+ import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.VarCharVector;
import org.apache.arrow.memory.RootAllocator;
- RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE);
- VarCharVector vec = new VarCharVector("valueindexcomparator",
rootAllocator);
- vec.allocateNew(3);
- vec.setValueCount(3);
- vec.set(0, "ba".getBytes());
- vec.set(1, "abc".getBytes());
- vec.set(2, "aa".getBytes());
- VectorValueComparator<VarCharVector> valueComparator =
DefaultVectorComparators.createDefaultComparator(vec);
- valueComparator.attachVector(vec);
-
- System.out.println(valueComparator.compare(0, 1) > 0);
- System.out.println(valueComparator.compare(1, 2) < 0);
+ try(
+ BufferAllocator allocator = new RootAllocator();
+ VarCharVector vec = new VarCharVector("valueindexcomparator",
allocator);
+ ) {
+ vec.allocateNew(3);
+ vec.setValueCount(3);
+ vec.set(0, "ba".getBytes());
+ vec.set(1, "abc".getBytes());
+ vec.set(2, "aa".getBytes());
+ VectorValueComparator<VarCharVector> valueComparator =
DefaultVectorComparators.createDefaultComparator(vec);
+ valueComparator.attachVector(vec);
+
+ System.out.println(valueComparator.compare(0, 1) > 0);
+ System.out.println(valueComparator.compare(1, 2) < 0);
+ }
.. testoutput::
@@ -112,20 +126,24 @@ Algorithm:
org.apache.arrow.algorithm.search.VectorSearcher#linearSearch - O(n)
import org.apache.arrow.algorithm.search.VectorSearcher;
import org.apache.arrow.algorithm.sort.DefaultVectorComparators;
import org.apache.arrow.algorithm.sort.VectorValueComparator;
+ import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.memory.RootAllocator;
- RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE);
- IntVector linearSearchVector = new IntVector("linearSearchVector",
rootAllocator);
- linearSearchVector.allocateNew(10);
- linearSearchVector.setValueCount(10);
- for (int i = 0; i < 10; i++) {
- linearSearchVector.set(i, i);
+ try(
+ BufferAllocator allocator = new RootAllocator();
+ IntVector linearSearchVector = new IntVector("linearSearchVector",
allocator);
+ ) {
+ linearSearchVector.allocateNew(10);
+ linearSearchVector.setValueCount(10);
+ for (int i = 0; i < 10; i++) {
+ linearSearchVector.set(i, i);
+ }
+ VectorValueComparator<IntVector> comparatorInt =
DefaultVectorComparators.createDefaultComparator(linearSearchVector);
+ int result = VectorSearcher.linearSearch(linearSearchVector,
comparatorInt, linearSearchVector, 3);
+
+ System.out.println(result);
}
- VectorValueComparator<IntVector> comparatorInt =
DefaultVectorComparators.createDefaultComparator(linearSearchVector);
- int result = VectorSearcher.linearSearch(linearSearchVector,
comparatorInt, linearSearchVector, 3);
-
- System.out.println(result);
.. testoutput::
@@ -141,20 +159,24 @@ Algorithm:
org.apache.arrow.algorithm.search.VectorSearcher#binarySearch - O(log
import org.apache.arrow.algorithm.search.VectorSearcher;
import org.apache.arrow.algorithm.sort.DefaultVectorComparators;
import org.apache.arrow.algorithm.sort.VectorValueComparator;
+ import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.memory.RootAllocator;
- RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE);
- IntVector binarySearchVector = new IntVector("", rootAllocator);
- binarySearchVector.allocateNew(10);
- binarySearchVector.setValueCount(10);
- for (int i = 0; i < 10; i++) {
- binarySearchVector.set(i, i);
+ try(
+ BufferAllocator allocator = new RootAllocator();
+ IntVector binarySearchVector = new IntVector("", allocator);
+ ) {
+ binarySearchVector.allocateNew(10);
+ binarySearchVector.setValueCount(10);
+ for (int i = 0; i < 10; i++) {
+ binarySearchVector.set(i, i);
+ }
+ VectorValueComparator<IntVector> comparatorInt =
DefaultVectorComparators.createDefaultComparator(binarySearchVector);
+ int result = VectorSearcher.binarySearch(binarySearchVector,
comparatorInt, binarySearchVector, 3);
+
+ System.out.println(result);
}
- VectorValueComparator<IntVector> comparatorInt =
DefaultVectorComparators.createDefaultComparator(binarySearchVector);
- int result = VectorSearcher.binarySearch(binarySearchVector,
comparatorInt, binarySearchVector, 3);
-
- System.out.println(result);
.. testoutput::
@@ -174,21 +196,25 @@ Algorithm:
org.apache.arrow.algorithm.sort.FixedWidthInPlaceVectorSorter - O(nlo
import org.apache.arrow.algorithm.sort.DefaultVectorComparators;
import org.apache.arrow.algorithm.sort.FixedWidthInPlaceVectorSorter;
import org.apache.arrow.algorithm.sort.VectorValueComparator;
+ import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.memory.RootAllocator;
- RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE);
- IntVector intVectorNotSorted = new IntVector("intvectornotsorted",
rootAllocator);
- intVectorNotSorted.allocateNew(3);
- intVectorNotSorted.setValueCount(3);
- intVectorNotSorted.set(0, 10);
- intVectorNotSorted.set(1, 8);
- intVectorNotSorted.setNull(2);
- FixedWidthInPlaceVectorSorter<IntVector> sorter = new
FixedWidthInPlaceVectorSorter<IntVector>();
- VectorValueComparator<IntVector> comparator =
DefaultVectorComparators.createDefaultComparator(intVectorNotSorted);
- sorter.sortInPlace(intVectorNotSorted, comparator);
-
- System.out.println(intVectorNotSorted);
+ try(
+ BufferAllocator allocator = new RootAllocator();
+ IntVector intVectorNotSorted = new IntVector("intvectornotsorted",
allocator);
+ ) {
+ intVectorNotSorted.allocateNew(3);
+ intVectorNotSorted.setValueCount(3);
+ intVectorNotSorted.set(0, 10);
+ intVectorNotSorted.set(1, 8);
+ intVectorNotSorted.setNull(2);
+ FixedWidthInPlaceVectorSorter<IntVector> sorter = new
FixedWidthInPlaceVectorSorter<IntVector>();
+ VectorValueComparator<IntVector> comparator =
DefaultVectorComparators.createDefaultComparator(intVectorNotSorted);
+ sorter.sortInPlace(intVectorNotSorted, comparator);
+
+ System.out.println(intVectorNotSorted);
+ }
.. testoutput::
@@ -207,24 +233,31 @@ FixedWidthOutOfPlaceVectorSorter &
VariableWidthOutOfPlaceVectorSor
import org.apache.arrow.algorithm.sort.FixedWidthOutOfPlaceVectorSorter;
import org.apache.arrow.algorithm.sort.OutOfPlaceVectorSorter;
import org.apache.arrow.algorithm.sort.VectorValueComparator;
+ import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.memory.RootAllocator;
- RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE);
- IntVector intVectorNotSorted = new IntVector("intvectornotsorted",
rootAllocator);
- intVectorNotSorted.allocateNew(3);
- intVectorNotSorted.setValueCount(3);
- intVectorNotSorted.set(0, 10);
- intVectorNotSorted.set(1, 8);
- intVectorNotSorted.setNull(2);
- OutOfPlaceVectorSorter<IntVector> sorterOutOfPlaceSorter = new
FixedWidthOutOfPlaceVectorSorter<>();
- VectorValueComparator<IntVector> comparatorOutOfPlaceSorter =
DefaultVectorComparators.createDefaultComparator(intVectorNotSorted);
- IntVector intVectorSorted = (IntVector)
intVectorNotSorted.getField().getFieldType().createNewSingleVector("new-out-of-place-sorter",
rootAllocator, null);
- intVectorSorted.allocateNew(intVectorNotSorted.getValueCount());
- intVectorSorted.setValueCount(intVectorNotSorted.getValueCount());
- sorterOutOfPlaceSorter.sortOutOfPlace(intVectorNotSorted, intVectorSorted,
comparatorOutOfPlaceSorter);
-
- System.out.println(intVectorSorted);
+ try(
+ BufferAllocator allocator = new RootAllocator();
+ IntVector intVectorNotSorted = new IntVector("intvectornotsorted",
allocator);
+ IntVector intVectorSorted = (IntVector) intVectorNotSorted.getField()
+ .getFieldType().createNewSingleVector("new-out-of-place-sorter",
+ allocator, null);
+
+ ) {
+ intVectorNotSorted.allocateNew(3);
+ intVectorNotSorted.setValueCount(3);
+ intVectorNotSorted.set(0, 10);
+ intVectorNotSorted.set(1, 8);
+ intVectorNotSorted.setNull(2);
+ OutOfPlaceVectorSorter<IntVector> sorterOutOfPlaceSorter = new
FixedWidthOutOfPlaceVectorSorter<>();
+ VectorValueComparator<IntVector> comparatorOutOfPlaceSorter =
DefaultVectorComparators.createDefaultComparator(intVectorNotSorted);
+ intVectorSorted.allocateNew(intVectorNotSorted.getValueCount());
+ intVectorSorted.setValueCount(intVectorNotSorted.getValueCount());
+ sorterOutOfPlaceSorter.sortOutOfPlace(intVectorNotSorted,
intVectorSorted, comparatorOutOfPlaceSorter);
+
+ System.out.println(intVectorSorted);
+ }
.. testoutput::
diff --git a/java/source/dataset.rst b/java/source/dataset.rst
index ecf2bb3..0a631fa 100644
--- a/java/source/dataset.rst
+++ b/java/source/dataset.rst
@@ -22,19 +22,21 @@ We can construct a dataset with an auto-inferred schema.
import org.apache.arrow.dataset.scanner.Scanner;
import org.apache.arrow.dataset.source.Dataset;
import org.apache.arrow.dataset.source.DatasetFactory;
+ import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import java.util.stream.StreamSupport;
- try (RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)) {
- String uri = "file:" + System.getProperty("user.dir") +
"/thirdpartydeps/parquetfiles/data1.parquet";
- try (DatasetFactory datasetFactory = new
FileSystemDatasetFactory(rootAllocator, NativeMemoryPool.getDefault(),
FileFormat.PARQUET, uri)) {
- try(Dataset dataset = datasetFactory.finish()){
- ScanOptions options = new ScanOptions(/*batchSize*/ 100);
- try(Scanner scanner = dataset.newScan(options)){
- System.out.println(StreamSupport.stream(scanner.scan().spliterator(), false).count());
- }
- }
- }
+ String uri = "file:" + System.getProperty("user.dir") +
"/thirdpartydeps/parquetfiles/data1.parquet";
+ ScanOptions options = new ScanOptions(/*batchSize*/ 100);
+ try (
+ BufferAllocator allocator = new RootAllocator();
+ DatasetFactory datasetFactory = new
FileSystemDatasetFactory(allocator, NativeMemoryPool.getDefault(),
FileFormat.PARQUET, uri);
+ Dataset dataset = datasetFactory.finish();
+ Scanner scanner = dataset.newScan(options)
+ ) {
+ System.out.println(StreamSupport.stream(scanner.scan().spliterator(),
false).count());
+ } catch (Exception e) {
+ e.printStackTrace();
}
.. testoutput::
@@ -52,19 +54,21 @@ Let construct our dataset with predefined schema.
import org.apache.arrow.dataset.scanner.Scanner;
import org.apache.arrow.dataset.source.Dataset;
import org.apache.arrow.dataset.source.DatasetFactory;
+ import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import java.util.stream.StreamSupport;
String uri = "file:" + System.getProperty("user.dir") +
"/thirdpartydeps/parquetfiles/data1.parquet";
- try (RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)) {
- try (DatasetFactory datasetFactory = new
FileSystemDatasetFactory(rootAllocator, NativeMemoryPool.getDefault(),
FileFormat.PARQUET, uri)) {
- try(Dataset dataset =
datasetFactory.finish(datasetFactory.inspect())){
- ScanOptions options = new ScanOptions(/*batchSize*/ 100);
- try(Scanner scanner = dataset.newScan(options)){
- System.out.println(StreamSupport.stream(scanner.scan().spliterator(), false).count());
- }
- }
- }
+ ScanOptions options = new ScanOptions(/*batchSize*/ 100);
+ try (
+ BufferAllocator allocator = new RootAllocator();
+ DatasetFactory datasetFactory = new
FileSystemDatasetFactory(allocator, NativeMemoryPool.getDefault(),
FileFormat.PARQUET, uri);
+ Dataset dataset = datasetFactory.finish(datasetFactory.inspect());
+ Scanner scanner = dataset.newScan(options)
+ ) {
+ System.out.println(StreamSupport.stream(scanner.scan().spliterator(),
false).count());
+ } catch (Exception e) {
+ e.printStackTrace();
}
.. testoutput::
@@ -83,16 +87,20 @@ During Dataset Construction
import org.apache.arrow.dataset.file.FileSystemDatasetFactory;
import org.apache.arrow.dataset.jni.NativeMemoryPool;
import org.apache.arrow.dataset.source.DatasetFactory;
+ import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.types.pojo.Schema;
String uri = "file:" + System.getProperty("user.dir") +
"/thirdpartydeps/parquetfiles/data1.parquet";
- try(RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)){
- try(DatasetFactory datasetFactory = new
FileSystemDatasetFactory(rootAllocator, NativeMemoryPool.getDefault(),
FileFormat.PARQUET, uri)){
- Schema schema = datasetFactory.inspect();
-
- System.out.println(schema);
- }
+ try(
+ BufferAllocator allocator = new RootAllocator();
+ DatasetFactory datasetFactory = new
FileSystemDatasetFactory(allocator, NativeMemoryPool.getDefault(),
FileFormat.PARQUET, uri)
+ ){
+ Schema schema = datasetFactory.inspect();
+
+ System.out.println(schema);
+ } catch (Exception e) {
+ e.printStackTrace();
}
.. testoutput::
@@ -111,21 +119,23 @@ From a Dataset
import org.apache.arrow.dataset.scanner.Scanner;
import org.apache.arrow.dataset.source.Dataset;
import org.apache.arrow.dataset.source.DatasetFactory;
+ import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.types.pojo.Schema;
String uri = "file:" + System.getProperty("user.dir") +
"/thirdpartydeps/parquetfiles/data1.parquet";
- try(RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)){
- try(DatasetFactory datasetFactory = new
FileSystemDatasetFactory(rootAllocator, NativeMemoryPool.getDefault(),
FileFormat.PARQUET, uri)){
- ScanOptions options = new ScanOptions(/*batchSize*/ 1);
- try(Dataset dataset = datasetFactory.finish()){
- try(Scanner scanner = dataset.newScan(options)){
- Schema schema = scanner.schema();
-
- System.out.println(schema);
- }
- }
- }
+ ScanOptions options = new ScanOptions(/*batchSize*/ 1);
+ try(
+ BufferAllocator allocator = new RootAllocator();
+ DatasetFactory datasetFactory = new
FileSystemDatasetFactory(allocator, NativeMemoryPool.getDefault(),
FileFormat.PARQUET, uri);
+ Dataset dataset = datasetFactory.finish();
+ Scanner scanner = dataset.newScan(options)
+ ){
+ Schema schema = scanner.schema();
+
+ System.out.println(schema);
+ } catch (Exception e) {
+ e.printStackTrace();
}
.. testoutput::
@@ -149,28 +159,31 @@ Query Data Content For File
import org.apache.arrow.dataset.scanner.Scanner;
import org.apache.arrow.dataset.source.Dataset;
import org.apache.arrow.dataset.source.DatasetFactory;
+ import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.VectorLoader;
import org.apache.arrow.vector.VectorSchemaRoot;
-
import java.util.stream.Stream;
String uri = "file:" + System.getProperty("user.dir") +
"/thirdpartydeps/parquetfiles/data1.parquet";
- try(RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE);
- DatasetFactory datasetFactory = new
FileSystemDatasetFactory(rootAllocator, NativeMemoryPool.getDefault(),
FileFormat.PARQUET, uri);
- Dataset dataset = datasetFactory.finish()){
- ScanOptions options = new ScanOptions(/*batchSize*/ 100);
- try(Scanner scanner = dataset.newScan(options);
- VectorSchemaRoot vsr = VectorSchemaRoot.create(scanner.schema(),
rootAllocator)){
- scanner.scan().forEach(scanTask-> {
- VectorLoader loader = new VectorLoader(vsr);
- scanTask.execute().forEachRemaining(arrowRecordBatch -> {
- loader.load(arrowRecordBatch);
- System.out.print(vsr.contentToTSVString());
- arrowRecordBatch.close();
- });
+ ScanOptions options = new ScanOptions(/*batchSize*/ 100);
+ try(
+ BufferAllocator allocator = new RootAllocator();
+ DatasetFactory datasetFactory = new
FileSystemDatasetFactory(allocator, NativeMemoryPool.getDefault(),
FileFormat.PARQUET, uri);
+ Dataset dataset = datasetFactory.finish();
+ Scanner scanner = dataset.newScan(options);
+ VectorSchemaRoot vsr = VectorSchemaRoot.create(scanner.schema(),
allocator)
+ ){
+ scanner.scan().forEach(scanTask-> {
+ VectorLoader loader = new VectorLoader(vsr);
+ scanTask.execute().forEachRemaining(arrowRecordBatch -> {
+ loader.load(arrowRecordBatch);
+ System.out.print(vsr.contentToTSVString());
+ arrowRecordBatch.close();
});
- }
+ });
+ } catch (Exception e) {
+ e.printStackTrace();
}
.. testoutput::
@@ -194,29 +207,31 @@ Consider that we have these files: data1: 3 rows, data2:
3 rows and data3: 250 r
import org.apache.arrow.dataset.scanner.Scanner;
import org.apache.arrow.dataset.source.Dataset;
import org.apache.arrow.dataset.source.DatasetFactory;
+ import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.VectorLoader;
import org.apache.arrow.vector.VectorSchemaRoot;
-
import java.util.stream.Stream;
String uri = "file:" + System.getProperty("user.dir") +
"/thirdpartydeps/parquetfiles/";
- try(RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE);
- DatasetFactory datasetFactory = new
FileSystemDatasetFactory(rootAllocator, NativeMemoryPool.getDefault(),
FileFormat.PARQUET, uri);
- Dataset dataset = datasetFactory.finish()){
- ScanOptions options = new ScanOptions(/*batchSize*/ 100);
- try(Scanner scanner = dataset.newScan(options);
- VectorSchemaRoot vsr = VectorSchemaRoot.create(scanner.schema(),
rootAllocator)){
- scanner.scan().forEach(scanTask-> {
- VectorLoader loader = new VectorLoader(vsr);
- final int[] count = {1};
- scanTask.execute().forEachRemaining(arrowRecordBatch -> {
- loader.load(arrowRecordBatch);
- System.out.println("Batch: " + count[0]++ + ", RowCount: "
+ vsr.getRowCount());
- arrowRecordBatch.close();
- });
+ ScanOptions options = new ScanOptions(/*batchSize*/ 100);
+ try(BufferAllocator allocator = new RootAllocator();
+ DatasetFactory datasetFactory = new
FileSystemDatasetFactory(allocator, NativeMemoryPool.getDefault(),
FileFormat.PARQUET, uri);
+ Dataset dataset = datasetFactory.finish();
+ Scanner scanner = dataset.newScan(options);
+ VectorSchemaRoot vsr = VectorSchemaRoot.create(scanner.schema(),
allocator)
+ ){
+ scanner.scan().forEach(scanTask-> {
+ VectorLoader loader = new VectorLoader(vsr);
+ final int[] count = {1};
+ scanTask.execute().forEachRemaining(arrowRecordBatch -> {
+ loader.load(arrowRecordBatch);
+ System.out.println("Batch: " + count[0]++ + ", RowCount: " +
vsr.getRowCount());
+ arrowRecordBatch.close();
});
- }
+ });
+ } catch (Exception e) {
+ e.printStackTrace();
}
.. testoutput::
@@ -241,6 +256,7 @@ In case we need to project only certain columns we could
configure ScanOptions w
import org.apache.arrow.dataset.scanner.Scanner;
import org.apache.arrow.dataset.source.Dataset;
import org.apache.arrow.dataset.source.DatasetFactory;
+ import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.VectorLoader;
import org.apache.arrow.vector.VectorSchemaRoot;
@@ -248,22 +264,25 @@ In case we need to project only certain columns we could
configure ScanOptions w
import java.util.Optional;
String uri = "file:" + System.getProperty("user.dir") +
"/thirdpartydeps/parquetfiles/data1.parquet";
- try(RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE);
- DatasetFactory datasetFactory = new
FileSystemDatasetFactory(rootAllocator, NativeMemoryPool.getDefault(),
FileFormat.PARQUET, uri);
- Dataset dataset = datasetFactory.finish()){
- String[] projection = new String[] {"name"};
- ScanOptions options = new ScanOptions(/*batchSize*/ 100,
Optional.of(projection));
- try(Scanner scanner = dataset.newScan(options);
- VectorSchemaRoot vsr = VectorSchemaRoot.create(scanner.schema(),
rootAllocator)){
- scanner.scan().forEach(scanTask-> {
- VectorLoader loader = new VectorLoader(vsr);
- scanTask.execute().forEachRemaining(arrowRecordBatch -> {
- loader.load(arrowRecordBatch);
- System.out.print(vsr.contentToTSVString());
- arrowRecordBatch.close();
- });
+ String[] projection = new String[] {"name"};
+ ScanOptions options = new ScanOptions(/*batchSize*/ 100,
Optional.of(projection));
+ try(
+ BufferAllocator allocator = new RootAllocator();
+ DatasetFactory datasetFactory = new
FileSystemDatasetFactory(allocator, NativeMemoryPool.getDefault(),
FileFormat.PARQUET, uri);
+ Dataset dataset = datasetFactory.finish();
+ Scanner scanner = dataset.newScan(options);
+ VectorSchemaRoot vsr = VectorSchemaRoot.create(scanner.schema(),
allocator)
+ ){
+ scanner.scan().forEach(scanTask-> {
+ VectorLoader loader = new VectorLoader(vsr);
+ scanTask.execute().forEachRemaining(arrowRecordBatch -> {
+ loader.load(arrowRecordBatch);
+ System.out.print(vsr.contentToTSVString());
+ arrowRecordBatch.close();
});
- }
+ });
+ } catch (Exception e) {
+ e.printStackTrace();
}
.. testoutput::
diff --git a/java/source/flight.rst b/java/source/flight.rst
index 88a6eb7..bdaf81b 100644
--- a/java/source/flight.rst
+++ b/java/source/flight.rst
@@ -50,6 +50,7 @@ Flight Client and Server
import org.apache.arrow.flight.PutResult;
import org.apache.arrow.flight.Result;
import org.apache.arrow.flight.Ticket;
+ import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.VarCharVector;
import org.apache.arrow.vector.VectorLoader;
@@ -90,10 +91,10 @@ Flight Client and Server
}
}
class CookbookProducer extends NoOpFlightProducer {
- private final RootAllocator allocator;
+ private final BufferAllocator allocator;
private final Location location;
private final ConcurrentHashMap<FlightDescriptor, Dataset> datasets;
- public CookbookProducer(RootAllocator allocator, Location location) {
+ public CookbookProducer(BufferAllocator allocator, Location location) {
this.allocator = allocator;
this.location = location;
this.datasets = new ConcurrentHashMap<>();
@@ -175,7 +176,7 @@ Flight Client and Server
}
}
Location location = Location.forGrpcInsecure("0.0.0.0", 33333);
- try (RootAllocator allocator = new RootAllocator(Long.MAX_VALUE)){
+ try (BufferAllocator allocator = new RootAllocator()){
// Server
try(FlightServer flightServer = FlightServer.builder(allocator,
location,
new CookbookProducer(allocator, location)).build()) {
diff --git a/java/source/index.rst b/java/source/index.rst
index 78caa5c..d2d27fd 100644
--- a/java/source/index.rst
+++ b/java/source/index.rst
@@ -11,11 +11,11 @@ Welcome to java arrow's documentation!
:caption: Contents:
create
- io
schema
- data
+ io
flight
dataset
+ data
Indices and tables
==================
diff --git a/java/source/io.rst b/java/source/io.rst
index 43fa84d..ffa06a3 100644
--- a/java/source/io.rst
+++ b/java/source/io.rst
@@ -23,6 +23,7 @@ Write - Out to File
.. testcode::
+ import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.VarCharVector;
import org.apache.arrow.vector.IntVector;
@@ -33,16 +34,17 @@ Write - Out to File
import org.apache.arrow.vector.VectorSchemaRoot;
import static java.util.Arrays.asList;
import org.apache.arrow.vector.ipc.ArrowFileWriter;
-
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
- try (RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)) {
+ try (BufferAllocator allocator = new RootAllocator()) {
Field name = new Field("name", FieldType.nullable(new
ArrowType.Utf8()), null);
Field age = new Field("age", FieldType.nullable(new ArrowType.Int(32,
true)), null);
Schema schemaPerson = new Schema(asList(name, age));
- try(VectorSchemaRoot vectorSchemaRoot =
VectorSchemaRoot.create(schemaPerson, rootAllocator)){
+ try(
+ VectorSchemaRoot vectorSchemaRoot =
VectorSchemaRoot.create(schemaPerson, allocator)
+ ){
VarCharVector nameVector = (VarCharVector)
vectorSchemaRoot.getVector("name");
nameVector.allocateNew(3);
nameVector.set(0, "David".getBytes());
@@ -55,8 +57,9 @@ Write - Out to File
ageVector.set(2, 30);
vectorSchemaRoot.setRowCount(3);
File file = new File("randon_access_to_file.arrow");
- try (FileOutputStream fileOutputStream = new
FileOutputStream(file);
- ArrowFileWriter writer = new
ArrowFileWriter(vectorSchemaRoot, null, fileOutputStream.getChannel())
+ try (
+ FileOutputStream fileOutputStream = new FileOutputStream(file);
+ ArrowFileWriter writer = new ArrowFileWriter(vectorSchemaRoot,
null, fileOutputStream.getChannel())
) {
writer.start();
writer.writeBatch();
@@ -77,6 +80,7 @@ Write - Out to Buffer
.. testcode::
+ import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.VarCharVector;
import org.apache.arrow.vector.IntVector;
@@ -87,16 +91,17 @@ Write - Out to Buffer
import org.apache.arrow.vector.VectorSchemaRoot;
import static java.util.Arrays.asList;
import org.apache.arrow.vector.ipc.ArrowFileWriter;
-
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.channels.Channels;
- try (RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)) {
+ try (BufferAllocator allocator = new RootAllocator()) {
Field name = new Field("name", FieldType.nullable(new
ArrowType.Utf8()), null);
Field age = new Field("age", FieldType.nullable(new ArrowType.Int(32,
true)), null);
Schema schemaPerson = new Schema(asList(name, age));
- try(VectorSchemaRoot vectorSchemaRoot =
VectorSchemaRoot.create(schemaPerson, rootAllocator)){
+ try(
+ VectorSchemaRoot vectorSchemaRoot =
VectorSchemaRoot.create(schemaPerson, allocator)
+ ){
VarCharVector nameVector = (VarCharVector)
vectorSchemaRoot.getVector("name");
nameVector.allocateNew(3);
nameVector.set(0, "David".getBytes());
@@ -108,12 +113,15 @@ Write - Out to Buffer
ageVector.set(1, 20);
ageVector.set(2, 30);
vectorSchemaRoot.setRowCount(3);
- try (ByteArrayOutputStream out = new ByteArrayOutputStream();
- ArrowFileWriter writer = new
ArrowFileWriter(vectorSchemaRoot, null, Channels.newChannel(out)))
- {
+ try (
+ ByteArrayOutputStream out = new ByteArrayOutputStream();
+ ArrowFileWriter writer = new
ArrowFileWriter(vectorSchemaRoot, null, Channels.newChannel(out))
+ ) {
writer.start();
writer.writeBatch();
- System.out.println("Record batches written: " +
writer.getRecordBlocks().size() + ". Number of rows written: " +
vectorSchemaRoot.getRowCount());
+
+ System.out.println("Record batches written: " +
writer.getRecordBlocks().size() +
+ ". Number of rows written: " +
vectorSchemaRoot.getRowCount());
} catch (IOException e) {
e.printStackTrace();
}
@@ -132,26 +140,28 @@ Write - Out to File
.. testcode::
+ import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.VarCharVector;
import org.apache.arrow.vector.IntVector;
+ import org.apache.arrow.vector.ipc.ArrowStreamWriter;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.FieldType;
import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.Schema;
import org.apache.arrow.vector.VectorSchemaRoot;
import static java.util.Arrays.asList;
- import org.apache.arrow.vector.ipc.ArrowStreamWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
- try (RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)) {
- // Create and populate data:
+ try (BufferAllocator rootAllocator = new RootAllocator()) {
Field name = new Field("name", FieldType.nullable(new
ArrowType.Utf8()), null);
Field age = new Field("age", FieldType.nullable(new ArrowType.Int(32,
true)), null);
Schema schemaPerson = new Schema(asList(name, age));
- try(VectorSchemaRoot vectorSchemaRoot =
VectorSchemaRoot.create(schemaPerson, rootAllocator)){
+ try(
+ VectorSchemaRoot vectorSchemaRoot =
VectorSchemaRoot.create(schemaPerson, rootAllocator)
+ ){
VarCharVector nameVector = (VarCharVector)
vectorSchemaRoot.getVector("name");
nameVector.allocateNew(3);
nameVector.set(0, "David".getBytes());
@@ -164,8 +174,9 @@ Write - Out to File
ageVector.set(2, 30);
vectorSchemaRoot.setRowCount(3);
File file = new File("streaming_to_file.arrow");
- try (FileOutputStream fileOutputStream = new
FileOutputStream(file);
- ArrowStreamWriter writer = new
ArrowStreamWriter(vectorSchemaRoot, null, fileOutputStream.getChannel())
+ try (
+ FileOutputStream fileOutputStream = new FileOutputStream(file);
+ ArrowStreamWriter writer = new
ArrowStreamWriter(vectorSchemaRoot, null, fileOutputStream.getChannel())
){
writer.start();
writer.writeBatch();
@@ -185,6 +196,7 @@ Write - Out to Buffer
.. testcode::
+ import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.VarCharVector;
import org.apache.arrow.vector.IntVector;
@@ -195,17 +207,17 @@ Write - Out to Buffer
import org.apache.arrow.vector.types.pojo.Schema;
import org.apache.arrow.vector.VectorSchemaRoot;
import static java.util.Arrays.asList;
-
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.channels.Channels;
- try (RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)) {
- // Create and populate data:
+ try (BufferAllocator rootAllocator = new RootAllocator()) {
Field name = new Field("name", FieldType.nullable(new
ArrowType.Utf8()), null);
Field age = new Field("age", FieldType.nullable(new ArrowType.Int(32,
true)), null);
Schema schemaPerson = new Schema(asList(name, age));
- try(VectorSchemaRoot vectorSchemaRoot =
VectorSchemaRoot.create(schemaPerson, rootAllocator)){
+ try(
+ VectorSchemaRoot vectorSchemaRoot =
VectorSchemaRoot.create(schemaPerson, rootAllocator)
+ ){
VarCharVector nameVector = (VarCharVector)
vectorSchemaRoot.getVector("name");
nameVector.allocateNew(3);
nameVector.set(0, "David".getBytes());
@@ -217,8 +229,9 @@ Write - Out to Buffer
ageVector.set(1, 20);
ageVector.set(2, 30);
vectorSchemaRoot.setRowCount(3);
- try (ByteArrayOutputStream out = new ByteArrayOutputStream();
- ArrowStreamWriter writer = new
ArrowStreamWriter(vectorSchemaRoot, null, Channels.newChannel(out))
+ try (
+ ByteArrayOutputStream out = new ByteArrayOutputStream();
+ ArrowStreamWriter writer = new
ArrowStreamWriter(vectorSchemaRoot, null, Channels.newChannel(out))
){
writer.start();
writer.writeBatch();
@@ -249,29 +262,29 @@ We are providing a path with auto generated arrow files
for testing purposes, ch
.. testcode::
+ import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.ipc.ArrowFileReader;
import org.apache.arrow.vector.ipc.message.ArrowBlock;
import org.apache.arrow.vector.VectorSchemaRoot;
import java.io.File;
import java.io.FileInputStream;
- import java.io.FileOutputStream;
import java.io.IOException;
- try(RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)){
- File file = new
File("./thirdpartydeps/arrowfiles/random_access.arrow");
- try (FileInputStream fileInputStream = new FileInputStream(file);
- ArrowFileReader reader = new
ArrowFileReader(fileInputStream.getChannel(), rootAllocator)
- ){
- System.out.println("Record batches in file: " +
reader.getRecordBlocks().size());
- for (ArrowBlock arrowBlock : reader.getRecordBlocks()) {
- reader.loadRecordBatch(arrowBlock);
- VectorSchemaRoot vectorSchemaRootRecover =
reader.getVectorSchemaRoot();
- System.out.print(vectorSchemaRootRecover.contentToTSVString());
- }
- } catch (IOException e) {
- e.printStackTrace();
+ File file = new File("./thirdpartydeps/arrowfiles/random_access.arrow");
+ try(
+ BufferAllocator rootAllocator = new RootAllocator();
+ FileInputStream fileInputStream = new FileInputStream(file);
+ ArrowFileReader reader = new
ArrowFileReader(fileInputStream.getChannel(), rootAllocator)
+ ){
+ System.out.println("Record batches in file: " +
reader.getRecordBlocks().size());
+ for (ArrowBlock arrowBlock : reader.getRecordBlocks()) {
+ reader.loadRecordBatch(arrowBlock);
+ VectorSchemaRoot vectorSchemaRootRecover =
reader.getVectorSchemaRoot();
+ System.out.print(vectorSchemaRootRecover.contentToTSVString());
}
+ } catch (IOException e) {
+ e.printStackTrace();
}
.. testoutput::
@@ -295,30 +308,32 @@ Read - From Buffer
.. testcode::
+ import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.ipc.ArrowFileReader;
import org.apache.arrow.vector.ipc.SeekableReadChannel;
import org.apache.arrow.vector.ipc.message.ArrowBlock;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.arrow.vector.util.ByteArrayReadableSeekableByteChannel;
-
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
- try(RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)) {
- Path path =
Paths.get("./thirdpartydeps/arrowfiles/random_access.arrow");
- try (ArrowFileReader reader = new ArrowFileReader(new
SeekableReadChannel(new
ByteArrayReadableSeekableByteChannel(Files.readAllBytes(path))),
rootAllocator)){
- System.out.println("Record batches in file: " +
reader.getRecordBlocks().size());
- for (ArrowBlock arrowBlock : reader.getRecordBlocks()) {
- reader.loadRecordBatch(arrowBlock);
- VectorSchemaRoot vectorSchemaRootRecover =
reader.getVectorSchemaRoot();
- System.out.print(vectorSchemaRootRecover.contentToTSVString());
- }
- } catch (IOException e) {
- e.printStackTrace();
+ Path path = Paths.get("./thirdpartydeps/arrowfiles/random_access.arrow");
+ try(
+ BufferAllocator rootAllocator = new RootAllocator();
+ ArrowFileReader reader = new ArrowFileReader(new
SeekableReadChannel(new ByteArrayReadableSeekableByteChannel(
+ Files.readAllBytes(path))),
rootAllocator)
+ ) {
+ System.out.println("Record batches in file: " +
reader.getRecordBlocks().size());
+ for (ArrowBlock arrowBlock : reader.getRecordBlocks()) {
+ reader.loadRecordBatch(arrowBlock);
+ VectorSchemaRoot vectorSchemaRootRecover =
reader.getVectorSchemaRoot();
+ System.out.print(vectorSchemaRootRecover.contentToTSVString());
}
+ } catch (IOException e) {
+ e.printStackTrace();
}
.. testoutput::
@@ -345,6 +360,7 @@ Read - From File
.. testcode::
+ import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.ipc.ArrowStreamReader;
import org.apache.arrow.vector.VectorSchemaRoot;
@@ -352,17 +368,18 @@ Read - From File
import java.io.FileInputStream;
import java.io.IOException;
- try(RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)) {
- File file = new File("./thirdpartydeps/arrowfiles/streaming.arrow");
- try (FileInputStream fileInputStreamForStream = new
FileInputStream(file);
- ArrowStreamReader reader = new
ArrowStreamReader(fileInputStreamForStream, rootAllocator)) {
- while (reader.loadNextBatch()) {
- VectorSchemaRoot vectorSchemaRootRecover =
reader.getVectorSchemaRoot();
- System.out.print(vectorSchemaRootRecover.contentToTSVString());
- }
- } catch (IOException e) {
- e.printStackTrace();
+ File file = new File("./thirdpartydeps/arrowfiles/streaming.arrow");
+ try(
+ BufferAllocator rootAllocator = new RootAllocator();
+ FileInputStream fileInputStreamForStream = new FileInputStream(file);
+ ArrowStreamReader reader = new
ArrowStreamReader(fileInputStreamForStream, rootAllocator)
+ ) {
+ while (reader.loadNextBatch()) {
+ VectorSchemaRoot vectorSchemaRootRecover =
reader.getVectorSchemaRoot();
+ System.out.print(vectorSchemaRootRecover.contentToTSVString());
}
+ } catch (IOException e) {
+ e.printStackTrace();
}
.. testoutput::
@@ -385,24 +402,26 @@ Read - From Buffer
.. testcode::
+ import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.ipc.ArrowStreamReader;
-
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
- try(RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE)) {
- Path path = Paths.get("./thirdpartydeps/arrowfiles/streaming.arrow");
- try (ArrowStreamReader reader = new ArrowStreamReader(new
ByteArrayInputStream(Files.readAllBytes(path)), rootAllocator)){
- while(reader.loadNextBatch()){
-
System.out.print(reader.getVectorSchemaRoot().contentToTSVString());
- }
- } catch (IOException e) {
- e.printStackTrace();
+ Path path = Paths.get("./thirdpartydeps/arrowfiles/streaming.arrow");
+ try(
+ BufferAllocator rootAllocator = new RootAllocator();
+ ArrowStreamReader reader = new ArrowStreamReader(new
ByteArrayInputStream(
+ Files.readAllBytes(path)),
rootAllocator)
+ ) {
+ while(reader.loadNextBatch()){
+
System.out.print(reader.getVectorSchemaRoot().contentToTSVString());
}
+ } catch (IOException e) {
+ e.printStackTrace();
}
.. testoutput::
diff --git a/java/source/schema.rst b/java/source/schema.rst
index 5cc5cd8..a855e77 100644
--- a/java/source/schema.rst
+++ b/java/source/schema.rst
@@ -2,16 +2,16 @@
Working with Schema
===================
-Common definition of table has an schema. Java arrow is columnar oriented and it also has an schema representation.
-Consider that each name on the schema maps to a columns for a predefined data type
-
+Let's start talking about tabular data. Data often comes in the form of two-dimensional
+sets of heterogeneous data (such as database tables, CSV files...). Arrow provides
+several abstractions to handle such data conveniently and efficiently.
.. contents::
-Define Data Type
-================
+Creating Fields
+===============
-Definition of columnar fields for string (name), integer (age) and array (points):
+Fields are used to denote the particular columns of tabular data.
.. testcode::
@@ -58,10 +58,43 @@ Definition of columnar fields for string (name), integer
(age) and array (points
points: List<intCol: Int(32, true)>
-Define Metadata for Field
-=========================
+Creating the Schema
+===================
+
+A schema describes a sequence of columns in tabular data, and consists
+of a list of fields.
+
+.. testcode::
+
+ import org.apache.arrow.vector.types.pojo.Schema;
+ import org.apache.arrow.vector.types.pojo.ArrowType;
+ import org.apache.arrow.vector.types.pojo.Field;
+ import org.apache.arrow.vector.types.pojo.FieldType;
+ import java.util.ArrayList;
+ import java.util.List;
+ import static java.util.Arrays.asList;
+
+ Field name = new Field("name", FieldType.nullable(new ArrowType.Utf8()),
null);
+ Field document = new Field("document", new FieldType(true, new
ArrowType.Utf8(), null), null);
+ Field age = new Field("age", FieldType.nullable(new ArrowType.Int(32,
true)), null);
+ FieldType intType = new FieldType(true, new ArrowType.Int(32, true),
/*dictionary=*/null);
+ FieldType listType = new FieldType(true, new ArrowType.List(),
/*dictionary=*/null);
+ Field childField = new Field("intCol", intType, null);
+ List<Field> childFields = new ArrayList<>();
+ childFields.add(childField);
+ Field points = new Field("points", listType, childFields);
+ Schema schemaPerson = new Schema(asList(name, document, age, points));
-In case we need to add metadata to our definition we could use:
+ System.out.print(schemaPerson);
+
+.. testoutput::
+
+ Schema<name: Utf8, document: Utf8, age: Int(32, true), points:
List<intCol: Int(32, true)>>
+
+Adding Metadata to Fields and Schemas
+=====================================
+
+In case we need to add metadata to our Field we could use:
.. testcode::
@@ -69,7 +102,6 @@ In case we need to add metadata to our definition we could
use:
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.FieldType;
- // create a column data type + metadata
Map<String, String> metadata = new HashMap<>();
metadata.put("A", "Id card");
metadata.put("B", "Passport");
@@ -82,25 +114,23 @@ In case we need to add metadata to our definition we could
use:
{A=Id card, B=Passport, C=Visa}
-Create the Schema
-=================
-
-A schema is a list of Fields, where each Field is defined by name and type.
+In case we need to add metadata to our Schema we could use:
.. testcode::
import org.apache.arrow.vector.types.pojo.Schema;
- import static java.util.Arrays.asList;
+
import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.FieldType;
+ import java.util.ArrayList;
+ import java.util.HashMap;
+ import java.util.List;
+ import java.util.Map;
+ import static java.util.Arrays.asList;
Field name = new Field("name", FieldType.nullable(new ArrowType.Utf8()),
null);
- Map<String, String> metadata = new HashMap<>();
- metadata.put("A", "Id card");
- metadata.put("B", "Passport");
- metadata.put("C", "Visa");
- Field document = new Field("document", new FieldType(true, new
ArrowType.Utf8(), null, metadata), null);
+ Field document = new Field("document", new FieldType(true, new
ArrowType.Utf8(), null), null);
Field age = new Field("age", FieldType.nullable(new ArrowType.Int(32,
true)), null);
FieldType intType = new FieldType(true, new ArrowType.Int(32, true),
/*dictionary=*/null);
FieldType listType = new FieldType(true, new ArrowType.List(),
/*dictionary=*/null);
@@ -108,47 +138,42 @@ A schema is a list of Fields, where each Field is defined
by name and type.
List<Field> childFields = new ArrayList<>();
childFields.add(childField);
Field points = new Field("points", listType, childFields);
+ Map<String, String> metadataSchema = new HashMap<>();
+ metadataSchema.put("Key-1", "Value-1");
+ Schema schemaPerson = new Schema(asList(name, document, age, points),
metadataSchema);
- // create a definition
- Schema schemaPerson = new Schema(asList(name, document, age, points));
-
- System.out.print(schemaPerson)
+ System.out.print(schemaPerson);
.. testoutput::
- Schema<name: Utf8, document: Utf8, age: Int(32, true), points:
List<intCol: Int(32, true)>>
+ Schema<name: Utf8, document: Utf8, age: Int(32, true), points:
List<intCol: Int(32, true)>>(metadata: {Key-1=Value-1})
+
+Creating VectorSchemaRoot
+=========================
-Populate Data
-=============
+``VectorSchemaRoot`` is somewhat analogous to tables and record batches in the
+other Arrow implementations in that they all are 2D datasets, but the usage is different.
+
+Let's populate a ``VectorSchemaRoot`` with a small batch of records:
.. testcode::
+ import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
- import org.apache.arrow.vector.BitVectorHelper;
- import org.apache.arrow.vector.IntVector;
import org.apache.arrow.vector.VarCharVector;
- import org.apache.arrow.vector.complex.BaseRepeatedValueVector;
- import org.apache.arrow.vector.complex.ListVector;
- import org.apache.arrow.vector.types.Types.MinorType;
import org.apache.arrow.vector.VectorSchemaRoot;
+ import org.apache.arrow.vector.complex.ListVector;
+ import org.apache.arrow.vector.IntVector;
+ import org.apache.arrow.vector.complex.impl.UnionListWriter;
import org.apache.arrow.vector.types.pojo.Schema;
import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.FieldType;
-
import java.util.ArrayList;
- import java.util.HashMap;
import java.util.List;
- import java.util.Map;
-
import static java.util.Arrays.asList;
Field name = new Field("name", FieldType.nullable(new ArrowType.Utf8()),
null);
- Map<String, String> metadata = new HashMap<>();
- metadata.put("A", "Id card");
- metadata.put("B", "Passport");
- metadata.put("C", "Visa");
- Field document = new Field("document", new FieldType(true, new
ArrowType.Utf8(), null, metadata), null);
Field age = new Field("age", FieldType.nullable(new ArrowType.Int(32,
true)), null);
FieldType intType = new FieldType(true, new ArrowType.Int(32, true), null);
FieldType listType = new FieldType(true, new ArrowType.List(), null);
@@ -156,60 +181,48 @@ Populate Data
List<Field> childFields = new ArrayList<>();
childFields.add(childField);
Field points = new Field("points", listType, childFields);
-
- RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE);
- Schema schema = new Schema(asList(name, document, age, points));
- VectorSchemaRoot vectorSchemaRoot = VectorSchemaRoot.create(schema,
rootAllocator);
-
- VarCharVector nameVector = (VarCharVector)
vectorSchemaRoot.getVector("name");
- nameVector.allocateNew(3);
- nameVector.set(0, "David".getBytes());
- nameVector.set(1, "Gladis".getBytes());
- nameVector.set(2, "Juan".getBytes());
- nameVector.setValueCount(3);
- VarCharVector documentVector = (VarCharVector)
vectorSchemaRoot.getVector("name");
- documentVector.allocateNew(3);
- documentVector.set(0, "A".getBytes());
- documentVector.set(1, "B".getBytes());
- documentVector.set(2, "C".getBytes());
- documentVector.setValueCount(3);
- IntVector ageVector = (IntVector) vectorSchemaRoot.getVector("age");
- ageVector.allocateNew(3);
- ageVector.set(0, 10);
- ageVector.set(1, 20);
- ageVector.set(2, 30);
- ageVector.setValueCount(3);
- ListVector listVector = (ListVector) vectorSchemaRoot.getVector("points");
- listVector.allocateNew();
- MinorType type = MinorType.INT;
- listVector.addOrGetVector(FieldType.nullable(type.getType()));
- IntVector dataVector = (IntVector) listVector.getDataVector();
- dataVector.allocateNew();
- listVector.getOffsetBuffer().setInt(0, 0);
- BitVectorHelper.setBit(listVector.getValidityBuffer(), 0);
- dataVector.set(0, 1);
- dataVector.set(1, 2);
- dataVector.set(2, 3);
- listVector.getOffsetBuffer().setInt(1 *
BaseRepeatedValueVector.OFFSET_WIDTH, 3);
- BitVectorHelper.setBit(listVector.getValidityBuffer(), 1);
- dataVector.set(3, 9);
- dataVector.set(4, 8);
- listVector.getOffsetBuffer().setInt(2 *
BaseRepeatedValueVector.OFFSET_WIDTH, 5);
- BitVectorHelper.setBit(listVector.getValidityBuffer(), 2);
- dataVector.set(5, 10);
- dataVector.set(6, 20);
- dataVector.set(7, 30);
- listVector.getOffsetBuffer().setInt(3 *
BaseRepeatedValueVector.OFFSET_WIDTH, 8);
- listVector.setLastSet(2);
- listVector.setValueCount(3);
-
- vectorSchemaRoot.setRowCount(3);
-
- System.out.print(vectorSchemaRoot.contentToTSVString());
+ Schema schema = new Schema(asList(name, age, points));
+ try(
+ BufferAllocator allocator = new RootAllocator();
+ VectorSchemaRoot root = VectorSchemaRoot.create(schema, allocator)
+ ){
+ VarCharVector nameVector = (VarCharVector) root.getVector("name");
+ nameVector.allocateNew(3);
+ nameVector.set(0, "David".getBytes());
+ nameVector.set(1, "Gladis".getBytes());
+ nameVector.set(2, "Juan".getBytes());
+ nameVector.setValueCount(3);
+ IntVector ageVector = (IntVector) root.getVector("age");
+ ageVector.allocateNew(3);
+ ageVector.set(0, 10);
+ ageVector.set(1, 20);
+ ageVector.set(2, 30);
+ ageVector.setValueCount(3);
+ ListVector listVector = (ListVector) root.getVector("points");
+ UnionListWriter listWriter = listVector.getWriter();
+ int[] data = new int[] { 4, 8, 12, 10, 20, 30, 5, 10, 15 };
+ int tmp_index = 0;
+ for(int i = 0; i < 3; i++) {
+ listWriter.setPosition(i);
+ listWriter.startList();
+ for(int j = 0; j < 3; j++) {
+ listWriter.writeInt(data[tmp_index]);
+ tmp_index = tmp_index + 1;
+ }
+ listWriter.setValueCount(2);
+ listWriter.endList();
+ }
+ listVector.setValueCount(3);
+ root.setRowCount(3);
+
+ System.out.print(root.contentToTSVString());
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
.. testoutput::
- name document age points
- A null 10 [1,2,3]
- B null 20 [9,8]
- C null 30 [10,20,30]
+ name age points
+ David 10 [4,8,12]
+ Gladis 20 [10,20,30]
+ Juan 30 [5,10,15]
\ No newline at end of file