lidavidm commented on a change in pull request #138:
URL: https://github.com/apache/arrow-cookbook/pull/138#discussion_r809047986
##########
File path: java/source/dataset.rst
##########
@@ -0,0 +1,300 @@
+.. _arrow-dataset:
+
+=======
+Dataset
+=======
+
+* `Arrow Java Dataset <https://arrow.apache.org/docs/dev/java/dataset.html>`_: Java implementation of Arrow Datasets library.
+
+.. contents::
+
+Dataset
+=======

Review comment:
```suggestion
Constructing Datasets
=====================
```

##########
File path: java/source/dataset.rst
##########
+* `Arrow Java Dataset <https://arrow.apache.org/docs/dev/java/dataset.html>`_: Java implementation of Arrow Datasets library.

Review comment:
Maybe make it explicit that it's JNI bindings, not a separate implementation? Can we use Intersphinx to do this linking? (Example for Python: https://github.com/apache/arrow-cookbook/blob/main/python/source/flight.rst#authentication-with-userpassword)
##########
File path: java/source/dataset.rst
##########
+    String uri = "file:" + System.getProperty("user.dir") + "/thirdpartydeps/parquetfiles/data1.parquet";
+    try(RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE);
+        DatasetFactory datasetFactory = new FileSystemDatasetFactory(rootAllocator, NativeMemoryPool.getDefault(), FileFormat.PARQUET, uri);
+        Dataset dataset = datasetFactory.finish()){
+        ScanOptions options = new ScanOptions(100);

Review comment:
In all these recipes, can we provide comments for parameters like this?

##########
File path: java/source/dataset.rst
##########
+Let construct our dataset with auto-inferred schema.

Review comment:
```suggestion
We can construct a dataset with an auto-inferred schema.
```

##########
File path: java/source/dataset.rst
##########
+Let construct our dataset with predefined schema.

Review comment:
```suggestion
We can also explicitly discover the schema during dataset construction, and pass that to the factory. We could modify or replace the schema ourselves if needed here.
```
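For example, the reworded recipe could show that flow explicitly. The following is only a rough sketch built from the same classes the recipe already uses; the metadata tweak is a hypothetical illustration of "modify or replace the schema", not something the PR currently does:

```java
import java.util.HashMap;
import java.util.Map;

import org.apache.arrow.dataset.file.FileFormat;
import org.apache.arrow.dataset.file.FileSystemDatasetFactory;
import org.apache.arrow.dataset.jni.NativeMemoryPool;
import org.apache.arrow.dataset.source.Dataset;
import org.apache.arrow.dataset.source.DatasetFactory;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.types.pojo.Schema;

String uri = "file:" + System.getProperty("user.dir") + "/thirdpartydeps/parquetfiles/data1.parquet";
try (RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE);
     DatasetFactory datasetFactory = new FileSystemDatasetFactory(
         rootAllocator, NativeMemoryPool.getDefault(), FileFormat.PARQUET, uri)) {
    // Discover the schema before building the dataset...
    Schema inspected = datasetFactory.inspect();
    // ...optionally adjust it (attaching custom metadata here is purely illustrative)...
    Map<String, String> metadata = new HashMap<>();
    metadata.put("origin", "cookbook-example");
    Schema schema = new Schema(inspected.getFields(), metadata);
    // ...and pass the chosen schema to the factory when finishing the dataset.
    try (Dataset dataset = datasetFactory.finish(schema)) {
        System.out.println(schema);
    }
}
```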
##########
File path: java/source/dataset.rst
##########
+Query Data Content with Projection
+**********************************
+

Review comment:
We should add a short explanation here that we can choose which columns to read from the dataset.
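Something along these lines could accompany that explanation. This sketch assumes the ScanOptions overload that accepts the projected column names; the exact constructor signature may differ between Arrow Java versions:

```java
import java.util.Optional;

import org.apache.arrow.dataset.scanner.ScanOptions;

// Read only the "id" column; columns that are not projected are skipped entirely.
String[] projection = new String[] {"id"};
ScanOptions options = new ScanOptions(/*batchSize*/ 100, Optional.of(projection));
// The rest of the scan (newScan, loading batches) works exactly as in the recipes above.
```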
##########
File path: java/source/dataset.rst
##########
+        ScanOptions options = new ScanOptions(100);

Review comment:
```suggestion
        ScanOptions options = new ScanOptions(/*batchSize*/ 100);
```
##########
File path: java/source/dataset.rst
##########
+        ScanOptions options = new ScanOptions(100);

Review comment:
Also, something like 1024 might be more reasonable?
##########
File path: java/source/dataset.rst
##########
+Getting the Schema of a Dataset

Review comment:
This section doesn't make sense to have anymore.
##########
File path: java/source/dataset.rst
##########
+    try(Scanner scanner = dataset.newScan(options)){
+        Schema schema = scanner.schema();
+        List<ArrowRecordBatch> batches = StreamSupport.stream(scanner.scan().spliterator(), false).flatMap(t -> Streams.stream(t.execute())).collect(Collectors.toList());

Review comment:
Can we iterate instead of collecting all data up front?
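A minimal sketch of the streaming version, reusing the `dataset`, `options`, and `rootAllocator` variables from the recipe and assuming the `ScanTask.execute()` batch iterator from the same dataset API the recipe uses:

```java
import org.apache.arrow.dataset.scanner.ScanTask;
import org.apache.arrow.dataset.scanner.Scanner;
import org.apache.arrow.vector.VectorLoader;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.arrow.vector.ipc.message.ArrowRecordBatch;

try (Scanner scanner = dataset.newScan(options);
     VectorSchemaRoot vsr = VectorSchemaRoot.create(scanner.schema(), rootAllocator)) {
    VectorLoader loader = new VectorLoader(vsr);
    for (ScanTask task : scanner.scan()) {
        // Each task yields record batches lazily; close every batch once it has been loaded.
        try (ScanTask.BatchIterator batches = task.execute()) {
            while (batches.hasNext()) {
                try (ArrowRecordBatch batch = batches.next()) {
                    loader.load(batch);
                    System.out.print(vsr.contentToTSVString());
                }
            }
        }
    }
}
```

This keeps at most one record batch in memory at a time instead of materialising the whole scan result up front.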
##########
File path: java/source/dataset.rst
##########
+        List<ArrowRecordBatch> batches = StreamSupport.stream(scanner.scan().spliterator(), false).flatMap(t -> Streams.stream(t.execute())).collect(Collectors.toList());
+        try (VectorSchemaRoot vsr = VectorSchemaRoot.create(schema, rootAllocator)) {

Review comment:
Why not create the root in the same try-with-resources block as the scanner itself, and save an indentation level?
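For instance, the recipe's loop could be restructured roughly as below, keeping the rest of the logic unchanged (a sketch only, reusing the recipe's `dataset`, `options`, and `rootAllocator`; a later resource in a try-with-resources header may reference an earlier one):

```java
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;

import com.google.common.collect.Streams;
import org.apache.arrow.dataset.scanner.Scanner;
import org.apache.arrow.vector.VectorLoader;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.arrow.vector.ipc.message.ArrowRecordBatch;

try (Scanner scanner = dataset.newScan(options);
     VectorSchemaRoot vsr = VectorSchemaRoot.create(scanner.schema(), rootAllocator)) {
    VectorLoader loader = new VectorLoader(vsr);
    List<ArrowRecordBatch> batches = StreamSupport.stream(scanner.scan().spliterator(), false)
        .flatMap(t -> Streams.stream(t.execute()))
        .collect(Collectors.toList());
    for (ArrowRecordBatch batch : batches) {
        loader.load(batch);                        // populate the root from the current batch
        System.out.print(vsr.contentToTSVString());
        batch.close();                             // release the buffers backing the batch
    }
}
```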
