lidavidm commented on a change in pull request #138:
URL: https://github.com/apache/arrow-cookbook/pull/138#discussion_r800641186
########## File path: java/source/jni/dataset.rst ##########
@@ -0,0 +1,294 @@

=======
Dataset
=======

Current Java projects that use JNI are:

* `Arrow Java Dataset <https://arrow.apache.org/docs/dev/java/dataset.html>`_: Java implementation of the Arrow Dataset API/framework. JniLoader [arrow_dataset_jni]
* `Arrow Java C Data Interface <https://arrow.apache.org/docs/format/CDataInterface.html>`_: Java implementation of the C Data Interface. JniLoader [arrow_cdata_jni]

The bracketed names are the native libraries each module loads through its ``JniLoader``; a sketch of triggering that load explicitly follows below.
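Both modules extract and load their native library lazily, the first time a JNI-backed class is touched, so no explicit step is required. If you want to trigger the load up front anyway, here is a minimal sketch, assuming the Arrow 6.x ``org.apache.arrow.dataset.jni.JniLoader`` API (``get()``/``ensureLoaded()``):

.. code-block:: java

    import org.apache.arrow.dataset.jni.JniLoader;

    // Extracts libarrow_dataset_jni from the JAR into a temporary directory
    // and loads it; a no-op if the library is already loaded. Normally this
    // happens implicitly when NativeMemoryPool or FileSystemDatasetFactory
    // is first used.
    JniLoader.get().ensureLoaded();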
.. contents::

Schema
======

Let's read the schema of a Parquet file using the Arrow Dataset module (each file contains 3 rows).

Inspect Schema
**************

.. testcode::

    import org.apache.arrow.dataset.file.FileFormat;
    import org.apache.arrow.dataset.file.FileSystemDatasetFactory;
    import org.apache.arrow.dataset.jni.NativeMemoryPool;
    import org.apache.arrow.dataset.source.DatasetFactory;
    import org.apache.arrow.memory.RootAllocator;
    import org.apache.arrow.vector.types.pojo.Schema;
    import org.apache.arrow.util.AutoCloseables;

    String uri = "file:" + System.getProperty("user.dir") + "/thirdpartydeps/parquetfiles/data1.parquet";
    RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE);
    DatasetFactory datasetFactory = new FileSystemDatasetFactory(rootAllocator, NativeMemoryPool.getDefault(), FileFormat.PARQUET, uri);
    Schema schema = datasetFactory.inspect();
    AutoCloseables.close(datasetFactory);

    System.out.println(schema);

.. testoutput::

    Schema<id: Int(32, true), name: Utf8>(metadata: {parquet.avro.schema={"type":"record","name":"User","namespace":"org.apache.arrow.dataset","fields":[{"name":"id","type":["int","null"]},{"name":"name","type":["string","null"]}]}, writer.model.name=avro})

Infer Schema
************

.. testcode::

    import org.apache.arrow.dataset.file.FileFormat;
    import org.apache.arrow.dataset.file.FileSystemDatasetFactory;
    import org.apache.arrow.dataset.jni.NativeMemoryPool;
    import org.apache.arrow.dataset.scanner.ScanOptions;
    import org.apache.arrow.dataset.scanner.Scanner;
    import org.apache.arrow.dataset.source.Dataset;
    import org.apache.arrow.dataset.source.DatasetFactory;
    import org.apache.arrow.memory.RootAllocator;
    import org.apache.arrow.vector.types.pojo.Schema;
    import org.apache.arrow.util.AutoCloseables;

    String uri = "file:" + System.getProperty("user.dir") + "/thirdpartydeps/parquetfiles/data1.parquet";
    RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE);
    DatasetFactory datasetFactory = new FileSystemDatasetFactory(rootAllocator, NativeMemoryPool.getDefault(), FileFormat.PARQUET, uri);
    ScanOptions options = new ScanOptions(/*batchSize*/ 1);
    Dataset dataset = datasetFactory.finish();
    Scanner scanner = dataset.newScan(options);
    Schema schema = scanner.schema();
    AutoCloseables.close(datasetFactory, dataset, scanner);

    System.out.println(schema);

.. testoutput::

    Schema<id: Int(32, true), name: Utf8>(metadata: {parquet.avro.schema={"type":"record","name":"User","namespace":"org.apache.arrow.dataset","fields":[{"name":"id","type":["int","null"]},{"name":"name","type":["string","null"]}]}, writer.model.name=avro})
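The examples release native resources explicitly through ``AutoCloseables.close``, which implies that ``DatasetFactory``, ``Dataset`` and ``Scanner`` are ``AutoCloseable``. Under that assumption, try-with-resources gives the same cleanup and also runs when an exception is thrown; a minimal sketch of the schema inspection in that style:

.. code-block:: java

    import org.apache.arrow.dataset.file.FileFormat;
    import org.apache.arrow.dataset.file.FileSystemDatasetFactory;
    import org.apache.arrow.dataset.jni.NativeMemoryPool;
    import org.apache.arrow.memory.RootAllocator;

    String uri = "file:" + System.getProperty("user.dir") + "/thirdpartydeps/parquetfiles/data1.parquet";
    // Resources are closed in reverse declaration order, even if inspect()
    // throws, so neither the factory's native handle nor the allocator leaks.
    try (RootAllocator allocator = new RootAllocator(Long.MAX_VALUE);
         FileSystemDatasetFactory factory = new FileSystemDatasetFactory(allocator, NativeMemoryPool.getDefault(), FileFormat.PARQUET, uri)) {
        System.out.println(factory.inspect());
    }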
Query Parquet File
==================

Let's query a Parquet file using the Arrow Dataset module (each file contains 3 rows).

Query Data Size
***************

.. testcode::

    import com.google.common.collect.Streams;
    import org.apache.arrow.dataset.file.FileFormat;
    import org.apache.arrow.dataset.file.FileSystemDatasetFactory;
    import org.apache.arrow.dataset.jni.NativeMemoryPool;
    import org.apache.arrow.dataset.scanner.ScanOptions;
    import org.apache.arrow.dataset.scanner.Scanner;
    import org.apache.arrow.dataset.source.Dataset;
    import org.apache.arrow.dataset.source.DatasetFactory;
    import org.apache.arrow.memory.RootAllocator;
    import org.apache.arrow.util.AutoCloseables;
    import org.apache.arrow.vector.ipc.message.ArrowRecordBatch;

    import java.util.List;
    import java.util.stream.Collectors;
    import java.util.stream.StreamSupport;

    String uri = "file:" + System.getProperty("user.dir") + "/thirdpartydeps/parquetfiles/data1.parquet";
    RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE);
    DatasetFactory datasetFactory = new FileSystemDatasetFactory(rootAllocator, NativeMemoryPool.getDefault(), FileFormat.PARQUET, uri);
    ScanOptions options = new ScanOptions(/*batchSize*/ 1);
    Dataset dataset = datasetFactory.finish();
    Scanner scanner = dataset.newScan(options);
    List<ArrowRecordBatch> batches = StreamSupport.stream(scanner.scan().spliterator(), false).flatMap(t -> Streams.stream(t.execute())).collect(Collectors.toList());
    AutoCloseables.close(datasetFactory, dataset, scanner);

    System.out.println(batches.size()); // totalRows 3 / batchSize 1 = 3
    AutoCloseables.close(batches); // record batches own native buffers and must be closed too

.. testoutput::

    3

Query Data Content
******************

.. testcode::

    import com.google.common.collect.Streams;
    import org.apache.arrow.dataset.file.FileFormat;
    import org.apache.arrow.dataset.file.FileSystemDatasetFactory;
    import org.apache.arrow.dataset.jni.NativeMemoryPool;
    import org.apache.arrow.dataset.scanner.ScanOptions;
    import org.apache.arrow.dataset.scanner.Scanner;
    import org.apache.arrow.dataset.source.Dataset;
    import org.apache.arrow.dataset.source.DatasetFactory;
    import org.apache.arrow.memory.RootAllocator;
    import org.apache.arrow.util.AutoCloseables;
    import org.apache.arrow.vector.FieldVector;
    import org.apache.arrow.vector.VectorLoader;
    import org.apache.arrow.vector.VectorSchemaRoot;
    import org.apache.arrow.vector.ipc.message.ArrowRecordBatch;
    import org.apache.arrow.vector.types.pojo.Schema;

    import java.util.List;
    import java.util.stream.Collectors;
    import java.util.stream.StreamSupport;

    String uri = "file:" + System.getProperty("user.dir") + "/thirdpartydeps/parquetfiles/data1.parquet";
    RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE);
    DatasetFactory datasetFactory = new FileSystemDatasetFactory(rootAllocator, NativeMemoryPool.getDefault(), FileFormat.PARQUET, uri);
    ScanOptions options = new ScanOptions(/*batchSize*/ 1);
    Dataset dataset = datasetFactory.finish();
    Scanner scanner = dataset.newScan(options);
    Schema schema = scanner.schema();
    List<ArrowRecordBatch> batches = StreamSupport.stream(scanner.scan().spliterator(), false).flatMap(t -> Streams.stream(t.execute())).collect(Collectors.toList());
    int fieldCount = schema.getFields().size();
    try (VectorSchemaRoot vsr = VectorSchemaRoot.create(schema, rootAllocator)) {
        VectorLoader loader = new VectorLoader(vsr);
        for (ArrowRecordBatch batch : batches) {
            loader.load(batch);
            int batchRowCount = vsr.getRowCount();
            for (int i = 0; i < fieldCount; i++) {
                FieldVector vector = vsr.getVector(i);
                for (int j = 0; j < batchRowCount; j++) {
                    Object object = vector.getObject(j);
                    System.out.println(object);
                }
            }
        }
    }
    AutoCloseables.close(datasetFactory, dataset, scanner);
    AutoCloseables.close(batches);

.. testoutput::

    1
    David
    2
    Gladis
    3
    Juan

Query Data Content with Projection
**********************************

.. testcode::

    import com.google.common.collect.Streams;
    import org.apache.arrow.dataset.file.FileFormat;
    import org.apache.arrow.dataset.file.FileSystemDatasetFactory;
    import org.apache.arrow.dataset.jni.NativeMemoryPool;
    import org.apache.arrow.dataset.scanner.ScanOptions;
    import org.apache.arrow.dataset.scanner.Scanner;
    import org.apache.arrow.dataset.source.Dataset;
    import org.apache.arrow.dataset.source.DatasetFactory;
    import org.apache.arrow.memory.RootAllocator;
    import org.apache.arrow.util.AutoCloseables;
    import org.apache.arrow.vector.FieldVector;
    import org.apache.arrow.vector.VectorLoader;
    import org.apache.arrow.vector.VectorSchemaRoot;
    import org.apache.arrow.vector.ipc.message.ArrowRecordBatch;
    import org.apache.arrow.vector.types.pojo.Schema;

    import java.util.List;
    import java.util.Optional;
    import java.util.stream.Collectors;
    import java.util.stream.StreamSupport;

    String uri = "file:" + System.getProperty("user.dir") + "/thirdpartydeps/parquetfiles/data1.parquet";
    RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE);
    DatasetFactory datasetFactory = new FileSystemDatasetFactory(rootAllocator, NativeMemoryPool.getDefault(), FileFormat.PARQUET, uri);
    String[] projection = new String[] {"name"};
    ScanOptions options = new ScanOptions(/*batchSize*/ 1, Optional.of(projection));
    Dataset dataset = datasetFactory.finish();
    Scanner scanner = dataset.newScan(options);
    Schema schema = scanner.schema();
    List<ArrowRecordBatch> batches = StreamSupport.stream(scanner.scan().spliterator(), false).flatMap(t -> Streams.stream(t.execute())).collect(Collectors.toList());
    int fieldCount = schema.getFields().size();
    try (VectorSchemaRoot vsr = VectorSchemaRoot.create(schema, rootAllocator)) {
        VectorLoader loader = new VectorLoader(vsr);
        for (ArrowRecordBatch batch : batches) {
            loader.load(batch);
            int batchRowCount = vsr.getRowCount();
            for (int i = 0; i < fieldCount; i++) {
                FieldVector vector = vsr.getVector(i);
                for (int j = 0; j < batchRowCount; j++) {
                    Object object = vector.getObject(j);
                    System.out.println(object);
                }
            }
        }
    }
    AutoCloseables.close(datasetFactory, dataset, scanner);
    AutoCloseables.close(batches);

.. testoutput::

    David
    Gladis
    Juan
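When the goal is just to print a batch for inspection, the nested per-vector/per-row loops above can be collapsed with ``VectorSchemaRoot.contentToTSVString()``, which renders the root's current contents as a header line plus one tab-separated line per row. A sketch reusing ``vsr`` and ``batches`` from the examples above:

.. code-block:: java

    // Load each record batch into the VectorSchemaRoot and dump it as TSV.
    VectorLoader loader = new VectorLoader(vsr);
    for (ArrowRecordBatch batch : batches) {
        loader.load(batch);                         // populate vsr from the batch
        System.out.print(vsr.contentToTSVString()); // header + rows, tab-separated
    }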
JNI Library
===========

Shared Libraries / Dynamically Loaded Modules
*********************************************

Compare the JNI shared libraries bundled in two downloaded Arrow Dataset JARs: here the 6.0.0 JAR contains the macOS libraries (libarrow_dataset_jni.dylib), while the 6.0.1 JAR contains the Linux ones (libarrow_dataset_jni.so).

.. code-block:: shell

    (base) ➜ /tmp jar -tf ~/Downloads/arrow-dataset-6.0.0.jar | grep _jni.

    libarrow_dataset_jni.dylib
    libarrow_dataset_jni.600.0.0.dylib
    libarrow_dataset_jni.600.dylib

    (base) ➜ /tmp jar -tf ~/Downloads/arrow-dataset-6.0.1.jar | grep _jni.

    libarrow_dataset_jni.so
    libarrow_dataset_jni.so.600
    libarrow_dataset_jni.so.600.1.0

Library Dependencies
********************

These are the runtime dependencies of the JNI library, listed on Linux with ``objdump -p`` (``otool -L libarrow_dataset_jni.dylib`` gives the equivalent listing on macOS).

Shared Libraries:

.. code-block:: shell

    (base) ➜ /tmp objdump -p libarrow_dataset_jni.so | grep NEEDED
      NEEDED liblz4.so.1
      NEEDED libsnappy.so.1
      NEEDED libz.so.1
      NEEDED libzstd.so.1
      NEEDED libutf8proc.so.2
      NEEDED libre2.so.9
      NEEDED libthrift-0.13.0.so
      NEEDED libstdc++.so.6
      NEEDED libm.so.6
      NEEDED libgcc_s.so.1
      NEEDED libpthread.so.0
      NEEDED libc.so.6
      NEEDED ld-linux-x86-64.so.2

Review comment:

   Well then, it's mostly concerning that we're building JARs that sound practically useless (especially as it is not obvious exactly what dependencies/system requirements they have), and it would be much better if we could keep the cookbook organization instead of having to split things like this. But if testing on macOS is the only way, then so be it. Could we just run a single pipeline on macOS, or at least run all the non-JNI tests on both platforms, so we're not doing as much work in CI trying to split up the tests so carefully?

--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at: [email protected]
