lidavidm commented on code in PR #229: URL: https://github.com/apache/arrow-cookbook/pull/229#discussion_r938739902
########## java/source/io.rst: ########## @@ -461,6 +461,292 @@ Reading Parquet File Please check :doc:`Dataset <./dataset>` +Reading JDBC ResultSets +*********************** + +The `Arrow Java JDBC module <https://arrow.apache.org/docs/java/jdbc.html>`_ +converts JDBC ResultSets into Arrow VectorSchemaRoots. + +ResultSet to VectorSchemaRoot Conversion +---------------------------------------- + +The main class to help us to convert ResultSet to VectorSchemaRoot is +`JdbcToArrow <https://arrow.apache.org/docs/java/reference/org/apache/arrow/adapter/jdbc/JdbcToArrow.html>`_ + +.. testcode:: + + import org.apache.arrow.adapter.jdbc.ArrowVectorIterator; + import org.apache.arrow.adapter.jdbc.JdbcToArrow; + import org.apache.arrow.memory.BufferAllocator; + import org.apache.arrow.memory.RootAllocator; + import org.apache.arrow.vector.VectorSchemaRoot; + import org.apache.ibatis.jdbc.ScriptRunner; + + import java.io.BufferedReader; + import java.io.FileReader; + import java.io.IOException; + import java.sql.Connection; + import java.sql.DriverManager; + import java.sql.ResultSet; + import java.sql.SQLException; + + try (BufferAllocator allocator = new RootAllocator(); + Connection connection = DriverManager.getConnection( + "jdbc:h2:mem:h2-jdbc-adapter")) { + ScriptRunner runnerDDLDML = new ScriptRunner(connection); + runnerDDLDML.setLogWriter(null); + runnerDDLDML.runScript(new BufferedReader( + new FileReader("./thirdpartydeps/database/h2-ddl.sql"))); + runnerDDLDML.runScript(new BufferedReader( + new FileReader("./thirdpartydeps/database/h2-dml.sql"))); + try (ResultSet resultSet = connection.createStatement().executeQuery( + "SELECT int_field1, bool_field2, bigint_field5 FROM TABLE1"); + ArrowVectorIterator iterator = JdbcToArrow.sqlToArrowVectorIterator( + resultSet, allocator)) { + while (iterator.hasNext()) { + try (VectorSchemaRoot root = iterator.next()) { + System.out.print(root.contentToTSVString()); + } + } + } + + } catch (SQLException | IOException e) { + 
e.printStackTrace(); + } + +.. testoutput:: + + INT_FIELD1 BOOL_FIELD2 BIGINT_FIELD5 + 101 true 1000000000300 + 102 true 100000000030 + 103 true 10000000003 + +ResultSet with Array to VectorSchemaRoot Conversion Review Comment: ```suggestion Configuring Array subtypes ``` ########## java/source/io.rst: ########## @@ -461,6 +461,292 @@ Reading Parquet File Please check :doc:`Dataset <./dataset>` +Reading JDBC ResultSets Review Comment: Hmm, I feel like JDBC belongs as its own file, especially if we have a separate file for Avro? ########## java/source/io.rst: ########## @@ -461,6 +461,292 @@ Reading Parquet File Please check :doc:`Dataset <./dataset>` +Reading JDBC ResultSets +*********************** + +The `Arrow Java JDBC module <https://arrow.apache.org/docs/java/jdbc.html>`_ +converts JDBC ResultSets into Arrow VectorSchemaRoots. + +ResultSet to VectorSchemaRoot Conversion +---------------------------------------- + +The main class to help us to convert ResultSet to VectorSchemaRoot is +`JdbcToArrow <https://arrow.apache.org/docs/java/reference/org/apache/arrow/adapter/jdbc/JdbcToArrow.html>`_ + +.. 
testcode:: + + import org.apache.arrow.adapter.jdbc.ArrowVectorIterator; + import org.apache.arrow.adapter.jdbc.JdbcToArrow; + import org.apache.arrow.memory.BufferAllocator; + import org.apache.arrow.memory.RootAllocator; + import org.apache.arrow.vector.VectorSchemaRoot; + import org.apache.ibatis.jdbc.ScriptRunner; + + import java.io.BufferedReader; + import java.io.FileReader; + import java.io.IOException; + import java.sql.Connection; + import java.sql.DriverManager; + import java.sql.ResultSet; + import java.sql.SQLException; + + try (BufferAllocator allocator = new RootAllocator(); + Connection connection = DriverManager.getConnection( + "jdbc:h2:mem:h2-jdbc-adapter")) { + ScriptRunner runnerDDLDML = new ScriptRunner(connection); + runnerDDLDML.setLogWriter(null); + runnerDDLDML.runScript(new BufferedReader( + new FileReader("./thirdpartydeps/database/h2-ddl.sql"))); + runnerDDLDML.runScript(new BufferedReader( + new FileReader("./thirdpartydeps/database/h2-dml.sql"))); + try (ResultSet resultSet = connection.createStatement().executeQuery( + "SELECT int_field1, bool_field2, bigint_field5 FROM TABLE1"); + ArrowVectorIterator iterator = JdbcToArrow.sqlToArrowVectorIterator( + resultSet, allocator)) { + while (iterator.hasNext()) { + try (VectorSchemaRoot root = iterator.next()) { + System.out.print(root.contentToTSVString()); + } + } + } + + } catch (SQLException | IOException e) { + e.printStackTrace(); + } + +.. testoutput:: + + INT_FIELD1 BOOL_FIELD2 BIGINT_FIELD5 + 101 true 1000000000300 + 102 true 100000000030 + 103 true 10000000003 + +ResultSet with Array to VectorSchemaRoot Conversion +--------------------------------------------------- + +JdbcToArrow accepts configuration through `JdbcToArrowConfig +<https://arrow.apache.org/docs/java/reference/org/apache/arrow/adapter/jdbc/JdbcToArrowConfig.html>`_. +For example, the type of the elements of array columns can be specified by +``setArraySubTypeByColumnNameMap``. + +.. 
testcode:: + + import org.apache.arrow.adapter.jdbc.ArrowVectorIterator; + import org.apache.arrow.adapter.jdbc.JdbcFieldInfo; + import org.apache.arrow.adapter.jdbc.JdbcToArrow; + import org.apache.arrow.adapter.jdbc.JdbcToArrowConfig; + import org.apache.arrow.adapter.jdbc.JdbcToArrowConfigBuilder; + import org.apache.arrow.adapter.jdbc.JdbcToArrowUtils; + import org.apache.arrow.memory.BufferAllocator; + import org.apache.arrow.memory.RootAllocator; + import org.apache.arrow.vector.VectorSchemaRoot; + import org.apache.ibatis.jdbc.ScriptRunner; + + import java.io.BufferedReader; + import java.io.FileReader; + import java.io.IOException; + import java.sql.Connection; + import java.sql.DriverManager; + import java.sql.ResultSet; + import java.sql.SQLException; + import java.sql.Types; + import java.util.HashMap; + + try (BufferAllocator allocator = new RootAllocator(); + Connection connection = DriverManager.getConnection( + "jdbc:h2:mem:h2-jdbc-adapter")) { + ScriptRunner runnerDDLDML = new ScriptRunner(connection); + runnerDDLDML.setLogWriter(null); + runnerDDLDML.runScript(new BufferedReader( + new FileReader("./thirdpartydeps/database/h2-ddl.sql"))); + runnerDDLDML.runScript(new BufferedReader( + new FileReader("./thirdpartydeps/database/h2-dml.sql"))); + JdbcToArrowConfig config = new JdbcToArrowConfigBuilder(allocator, + JdbcToArrowUtils.getUtcCalendar()) + .setArraySubTypeByColumnNameMap( + new HashMap<>() {{ + put("LIST_FIELD19", + new JdbcFieldInfo(Types.INTEGER)); + }} + ) + .build(); + try (ResultSet resultSet = connection.createStatement().executeQuery( + "SELECT int_field1, bool_field2, bigint_field5, char_field16, list_field19 FROM TABLE1"); + ArrowVectorIterator iterator = JdbcToArrow.sqlToArrowVectorIterator( + resultSet, config)) { + while (iterator.hasNext()) { + try (VectorSchemaRoot root = iterator.next()) { + System.out.print(root.contentToTSVString()); + } + } + } + } catch (SQLException | IOException e) { + e.printStackTrace(); + } + +.. 
testoutput:: + + INT_FIELD1 BOOL_FIELD2 BIGINT_FIELD5 CHAR_FIELD16 LIST_FIELD19 + 101 true 1000000000300 some char text [1,2,3] + 102 true 100000000030 some char text [1,2] + 103 true 10000000003 some char text [1] + +ResultSet per Batches to VectorSchemaRoot Conversion +---------------------------------------------------- + +The maximum rowCount to read each time is configured by default in 1024. This +can be customized by setting values as needed by ``setTargetBatchSize``. + +.. testcode:: + + import org.apache.arrow.adapter.jdbc.ArrowVectorIterator; + import org.apache.arrow.adapter.jdbc.JdbcFieldInfo; + import org.apache.arrow.adapter.jdbc.JdbcToArrow; + import org.apache.arrow.adapter.jdbc.JdbcToArrowConfig; + import org.apache.arrow.adapter.jdbc.JdbcToArrowConfigBuilder; + import org.apache.arrow.adapter.jdbc.JdbcToArrowUtils; + import org.apache.arrow.memory.BufferAllocator; + import org.apache.arrow.memory.RootAllocator; + import org.apache.arrow.vector.VectorSchemaRoot; + import org.apache.ibatis.jdbc.ScriptRunner; + + import java.io.BufferedReader; + import java.io.FileReader; + import java.io.IOException; + import java.sql.Connection; + import java.sql.DriverManager; + import java.sql.ResultSet; + import java.sql.SQLException; + import java.sql.Types; + import java.util.HashMap; + + try (BufferAllocator allocator = new RootAllocator(); + Connection connection = DriverManager.getConnection( + "jdbc:h2:mem:h2-jdbc-adapter")) { + ScriptRunner runnerDDLDML = new ScriptRunner(connection); + runnerDDLDML.setLogWriter(null); + runnerDDLDML.runScript(new BufferedReader( + new FileReader("./thirdpartydeps/database/h2-ddl.sql"))); + runnerDDLDML.runScript(new BufferedReader( + new FileReader("./thirdpartydeps/database/h2-dml.sql"))); + JdbcToArrowConfig config = new JdbcToArrowConfigBuilder(allocator, + JdbcToArrowUtils.getUtcCalendar()) + .setTargetBatchSize(2) + .setArraySubTypeByColumnNameMap( + new HashMap<>() {{ + put("LIST_FIELD19", + new 
JdbcFieldInfo(Types.INTEGER)); + }} + ) + .build(); + try (ResultSet resultSet = connection.createStatement().executeQuery( + "SELECT int_field1, bool_field2, bigint_field5, char_field16, list_field19 FROM TABLE1"); + ArrowVectorIterator iterator = JdbcToArrow.sqlToArrowVectorIterator( + resultSet, config)) { + while (iterator.hasNext()) { + try (VectorSchemaRoot root = iterator.next()) { + System.out.print(root.contentToTSVString()); + } + } + } + } catch (SQLException | IOException e) { + e.printStackTrace(); + } + +.. testoutput:: + + INT_FIELD1 BOOL_FIELD2 BIGINT_FIELD5 CHAR_FIELD16 LIST_FIELD19 + 101 true 1000000000300 some char text [1,2,3] + 102 true 100000000030 some char text [1,2] + INT_FIELD1 BOOL_FIELD2 BIGINT_FIELD5 CHAR_FIELD16 LIST_FIELD19 + 103 true 10000000003 some char text [1] + +ResultSet with Precision/Scale to VectorSchemaRoot Conversion +------------------------------------------------------------- + +There is a configuration about precision & scale for column data type needed +(i.e. ``JdbcFieldInfo(Types.DECIMAL, /*precision*/ 20, /*scale*/ 7))``) but this +configuration required exact matching of every row to the established scale +for the column, and throws ``UnsupportedOperationException`` when there is a mismatch, +aborting ResultSet processing, + +In this example we have BigInt data type configured on H2 Database, this is +converted to (``/*scale*/0)`` by default, then we have a ``/*scale*/7`` configured on +our code, this will be the error message for these differences: ``Caused by: java.lang.UnsupportedOperationException: BigDecimal scale must equal that in the Arrow vector: 0 != 7`` +if not applying ``setBigDecimalRoundingMode`` Review Comment: ```suggestion In this example, we have a BigInt column. By default, the inferred scale is 0. We override the scale to 7 and then set a RoundingMode to convert values to the given scale. 
``` ########## java/source/io.rst: ########## @@ -461,6 +461,292 @@ Reading Parquet File Please check :doc:`Dataset <./dataset>` +Reading JDBC ResultSets +*********************** + +The `Arrow Java JDBC module <https://arrow.apache.org/docs/java/jdbc.html>`_ +converts JDBC ResultSets into Arrow VectorSchemaRoots. + +ResultSet to VectorSchemaRoot Conversion +---------------------------------------- + +The main class to help us to convert ResultSet to VectorSchemaRoot is +`JdbcToArrow <https://arrow.apache.org/docs/java/reference/org/apache/arrow/adapter/jdbc/JdbcToArrow.html>`_ + +.. testcode:: + + import org.apache.arrow.adapter.jdbc.ArrowVectorIterator; + import org.apache.arrow.adapter.jdbc.JdbcToArrow; + import org.apache.arrow.memory.BufferAllocator; + import org.apache.arrow.memory.RootAllocator; + import org.apache.arrow.vector.VectorSchemaRoot; + import org.apache.ibatis.jdbc.ScriptRunner; + + import java.io.BufferedReader; + import java.io.FileReader; + import java.io.IOException; + import java.sql.Connection; + import java.sql.DriverManager; + import java.sql.ResultSet; + import java.sql.SQLException; + + try (BufferAllocator allocator = new RootAllocator(); + Connection connection = DriverManager.getConnection( + "jdbc:h2:mem:h2-jdbc-adapter")) { + ScriptRunner runnerDDLDML = new ScriptRunner(connection); + runnerDDLDML.setLogWriter(null); + runnerDDLDML.runScript(new BufferedReader( + new FileReader("./thirdpartydeps/database/h2-ddl.sql"))); + runnerDDLDML.runScript(new BufferedReader( + new FileReader("./thirdpartydeps/database/h2-dml.sql"))); + try (ResultSet resultSet = connection.createStatement().executeQuery( + "SELECT int_field1, bool_field2, bigint_field5 FROM TABLE1"); + ArrowVectorIterator iterator = JdbcToArrow.sqlToArrowVectorIterator( + resultSet, allocator)) { + while (iterator.hasNext()) { + try (VectorSchemaRoot root = iterator.next()) { + System.out.print(root.contentToTSVString()); + } + } + } + + } catch (SQLException | IOException e) 
{ + e.printStackTrace(); + } + +.. testoutput:: + + INT_FIELD1 BOOL_FIELD2 BIGINT_FIELD5 + 101 true 1000000000300 + 102 true 100000000030 + 103 true 10000000003 + +ResultSet with Array to VectorSchemaRoot Conversion +--------------------------------------------------- + +JdbcToArrow accepts configuration through `JdbcToArrowConfig +<https://arrow.apache.org/docs/java/reference/org/apache/arrow/adapter/jdbc/JdbcToArrowConfig.html>`_. +For example, the type of the elements of array columns can be specified by +``setArraySubTypeByColumnNameMap``. + +.. testcode:: + + import org.apache.arrow.adapter.jdbc.ArrowVectorIterator; + import org.apache.arrow.adapter.jdbc.JdbcFieldInfo; + import org.apache.arrow.adapter.jdbc.JdbcToArrow; + import org.apache.arrow.adapter.jdbc.JdbcToArrowConfig; + import org.apache.arrow.adapter.jdbc.JdbcToArrowConfigBuilder; + import org.apache.arrow.adapter.jdbc.JdbcToArrowUtils; + import org.apache.arrow.memory.BufferAllocator; + import org.apache.arrow.memory.RootAllocator; + import org.apache.arrow.vector.VectorSchemaRoot; + import org.apache.ibatis.jdbc.ScriptRunner; + + import java.io.BufferedReader; + import java.io.FileReader; + import java.io.IOException; + import java.sql.Connection; + import java.sql.DriverManager; + import java.sql.ResultSet; + import java.sql.SQLException; + import java.sql.Types; + import java.util.HashMap; + + try (BufferAllocator allocator = new RootAllocator(); + Connection connection = DriverManager.getConnection( + "jdbc:h2:mem:h2-jdbc-adapter")) { + ScriptRunner runnerDDLDML = new ScriptRunner(connection); + runnerDDLDML.setLogWriter(null); + runnerDDLDML.runScript(new BufferedReader( + new FileReader("./thirdpartydeps/database/h2-ddl.sql"))); + runnerDDLDML.runScript(new BufferedReader( + new FileReader("./thirdpartydeps/database/h2-dml.sql"))); + JdbcToArrowConfig config = new JdbcToArrowConfigBuilder(allocator, + JdbcToArrowUtils.getUtcCalendar()) + .setArraySubTypeByColumnNameMap( + new HashMap<>() {{ + 
put("LIST_FIELD19", + new JdbcFieldInfo(Types.INTEGER)); + }} + ) + .build(); + try (ResultSet resultSet = connection.createStatement().executeQuery( + "SELECT int_field1, bool_field2, bigint_field5, char_field16, list_field19 FROM TABLE1"); + ArrowVectorIterator iterator = JdbcToArrow.sqlToArrowVectorIterator( + resultSet, config)) { + while (iterator.hasNext()) { + try (VectorSchemaRoot root = iterator.next()) { + System.out.print(root.contentToTSVString()); + } + } + } + } catch (SQLException | IOException e) { + e.printStackTrace(); + } + +.. testoutput:: + + INT_FIELD1 BOOL_FIELD2 BIGINT_FIELD5 CHAR_FIELD16 LIST_FIELD19 + 101 true 1000000000300 some char text [1,2,3] + 102 true 100000000030 some char text [1,2] + 103 true 10000000003 some char text [1] + +ResultSet per Batches to VectorSchemaRoot Conversion +---------------------------------------------------- + +The maximum rowCount to read each time is configured by default in 1024. This +can be customized by setting values as needed by ``setTargetBatchSize``. + +.. 
testcode:: + + import org.apache.arrow.adapter.jdbc.ArrowVectorIterator; + import org.apache.arrow.adapter.jdbc.JdbcFieldInfo; + import org.apache.arrow.adapter.jdbc.JdbcToArrow; + import org.apache.arrow.adapter.jdbc.JdbcToArrowConfig; + import org.apache.arrow.adapter.jdbc.JdbcToArrowConfigBuilder; + import org.apache.arrow.adapter.jdbc.JdbcToArrowUtils; + import org.apache.arrow.memory.BufferAllocator; + import org.apache.arrow.memory.RootAllocator; + import org.apache.arrow.vector.VectorSchemaRoot; + import org.apache.ibatis.jdbc.ScriptRunner; + + import java.io.BufferedReader; + import java.io.FileReader; + import java.io.IOException; + import java.sql.Connection; + import java.sql.DriverManager; + import java.sql.ResultSet; + import java.sql.SQLException; + import java.sql.Types; + import java.util.HashMap; + + try (BufferAllocator allocator = new RootAllocator(); + Connection connection = DriverManager.getConnection( + "jdbc:h2:mem:h2-jdbc-adapter")) { + ScriptRunner runnerDDLDML = new ScriptRunner(connection); + runnerDDLDML.setLogWriter(null); + runnerDDLDML.runScript(new BufferedReader( + new FileReader("./thirdpartydeps/database/h2-ddl.sql"))); + runnerDDLDML.runScript(new BufferedReader( + new FileReader("./thirdpartydeps/database/h2-dml.sql"))); + JdbcToArrowConfig config = new JdbcToArrowConfigBuilder(allocator, + JdbcToArrowUtils.getUtcCalendar()) + .setTargetBatchSize(2) + .setArraySubTypeByColumnNameMap( + new HashMap<>() {{ + put("LIST_FIELD19", + new JdbcFieldInfo(Types.INTEGER)); + }} + ) + .build(); + try (ResultSet resultSet = connection.createStatement().executeQuery( + "SELECT int_field1, bool_field2, bigint_field5, char_field16, list_field19 FROM TABLE1"); + ArrowVectorIterator iterator = JdbcToArrow.sqlToArrowVectorIterator( + resultSet, config)) { + while (iterator.hasNext()) { + try (VectorSchemaRoot root = iterator.next()) { + System.out.print(root.contentToTSVString()); + } + } + } + } catch (SQLException | IOException e) { + 
e.printStackTrace(); + } + +.. testoutput:: + + INT_FIELD1 BOOL_FIELD2 BIGINT_FIELD5 CHAR_FIELD16 LIST_FIELD19 + 101 true 1000000000300 some char text [1,2,3] + 102 true 100000000030 some char text [1,2] + INT_FIELD1 BOOL_FIELD2 BIGINT_FIELD5 CHAR_FIELD16 LIST_FIELD19 + 103 true 10000000003 some char text [1] + +ResultSet with Precision/Scale to VectorSchemaRoot Conversion +------------------------------------------------------------- + +There is a configuration about precision & scale for column data type needed +(i.e. ``JdbcFieldInfo(Types.DECIMAL, /*precision*/ 20, /*scale*/ 7))``) but this +configuration required exact matching of every row to the established scale +for the column, and throws ``UnsupportedOperationException`` when there is a mismatch, +aborting ResultSet processing, Review Comment: ```suggestion By default, the scale of any decimal values must exactly match the defined scale of the Arrow type of the column, or else an UnsupportedOperationException will be thrown with a message like ``BigDecimal scale must equal that in the Arrow vector`` during conversion. This can happen because Arrow infers the type from the ResultSet metadata, which is not accurate for all database drivers. The JDBC adapter lets you avoid this by either overriding the decimal scale, or by providing a RoundingMode via ``setBigDecimalRoundingMode`` to convert values to the expected scale. ``` Also, link to https://arrow.apache.org/docs/java/jdbc.html#type-mapping ########## java/source/io.rst: ########## @@ -461,6 +461,292 @@ Reading Parquet File Please check :doc:`Dataset <./dataset>` +Reading JDBC ResultSets +*********************** + +The `Arrow Java JDBC module <https://arrow.apache.org/docs/java/jdbc.html>`_ +converts JDBC ResultSets into Arrow VectorSchemaRoots. 
+ +ResultSet to VectorSchemaRoot Conversion +---------------------------------------- + +The main class to help us to convert ResultSet to VectorSchemaRoot is +`JdbcToArrow <https://arrow.apache.org/docs/java/reference/org/apache/arrow/adapter/jdbc/JdbcToArrow.html>`_ + +.. testcode:: + + import org.apache.arrow.adapter.jdbc.ArrowVectorIterator; + import org.apache.arrow.adapter.jdbc.JdbcToArrow; + import org.apache.arrow.memory.BufferAllocator; + import org.apache.arrow.memory.RootAllocator; + import org.apache.arrow.vector.VectorSchemaRoot; + import org.apache.ibatis.jdbc.ScriptRunner; + + import java.io.BufferedReader; + import java.io.FileReader; + import java.io.IOException; + import java.sql.Connection; + import java.sql.DriverManager; + import java.sql.ResultSet; + import java.sql.SQLException; + + try (BufferAllocator allocator = new RootAllocator(); + Connection connection = DriverManager.getConnection( + "jdbc:h2:mem:h2-jdbc-adapter")) { + ScriptRunner runnerDDLDML = new ScriptRunner(connection); + runnerDDLDML.setLogWriter(null); + runnerDDLDML.runScript(new BufferedReader( + new FileReader("./thirdpartydeps/database/h2-ddl.sql"))); + runnerDDLDML.runScript(new BufferedReader( + new FileReader("./thirdpartydeps/database/h2-dml.sql"))); + try (ResultSet resultSet = connection.createStatement().executeQuery( + "SELECT int_field1, bool_field2, bigint_field5 FROM TABLE1"); + ArrowVectorIterator iterator = JdbcToArrow.sqlToArrowVectorIterator( + resultSet, allocator)) { + while (iterator.hasNext()) { + try (VectorSchemaRoot root = iterator.next()) { + System.out.print(root.contentToTSVString()); + } + } + } + + } catch (SQLException | IOException e) { + e.printStackTrace(); + } + +.. 
testoutput:: + + INT_FIELD1 BOOL_FIELD2 BIGINT_FIELD5 + 101 true 1000000000300 + 102 true 100000000030 + 103 true 10000000003 + +ResultSet with Array to VectorSchemaRoot Conversion +--------------------------------------------------- + +JdbcToArrow accepts configuration through `JdbcToArrowConfig +<https://arrow.apache.org/docs/java/reference/org/apache/arrow/adapter/jdbc/JdbcToArrowConfig.html>`_. +For example, the type of the elements of array columns can be specified by +``setArraySubTypeByColumnNameMap``. + +.. testcode:: + + import org.apache.arrow.adapter.jdbc.ArrowVectorIterator; + import org.apache.arrow.adapter.jdbc.JdbcFieldInfo; + import org.apache.arrow.adapter.jdbc.JdbcToArrow; + import org.apache.arrow.adapter.jdbc.JdbcToArrowConfig; + import org.apache.arrow.adapter.jdbc.JdbcToArrowConfigBuilder; + import org.apache.arrow.adapter.jdbc.JdbcToArrowUtils; + import org.apache.arrow.memory.BufferAllocator; + import org.apache.arrow.memory.RootAllocator; + import org.apache.arrow.vector.VectorSchemaRoot; + import org.apache.ibatis.jdbc.ScriptRunner; + + import java.io.BufferedReader; + import java.io.FileReader; + import java.io.IOException; + import java.sql.Connection; + import java.sql.DriverManager; + import java.sql.ResultSet; + import java.sql.SQLException; + import java.sql.Types; + import java.util.HashMap; + + try (BufferAllocator allocator = new RootAllocator(); + Connection connection = DriverManager.getConnection( + "jdbc:h2:mem:h2-jdbc-adapter")) { + ScriptRunner runnerDDLDML = new ScriptRunner(connection); + runnerDDLDML.setLogWriter(null); + runnerDDLDML.runScript(new BufferedReader( + new FileReader("./thirdpartydeps/database/h2-ddl.sql"))); + runnerDDLDML.runScript(new BufferedReader( + new FileReader("./thirdpartydeps/database/h2-dml.sql"))); + JdbcToArrowConfig config = new JdbcToArrowConfigBuilder(allocator, + JdbcToArrowUtils.getUtcCalendar()) + .setArraySubTypeByColumnNameMap( + new HashMap<>() {{ + put("LIST_FIELD19", + new 
JdbcFieldInfo(Types.INTEGER)); + }} + ) + .build(); + try (ResultSet resultSet = connection.createStatement().executeQuery( + "SELECT int_field1, bool_field2, bigint_field5, char_field16, list_field19 FROM TABLE1"); + ArrowVectorIterator iterator = JdbcToArrow.sqlToArrowVectorIterator( + resultSet, config)) { + while (iterator.hasNext()) { + try (VectorSchemaRoot root = iterator.next()) { + System.out.print(root.contentToTSVString()); + } + } + } + } catch (SQLException | IOException e) { + e.printStackTrace(); + } + +.. testoutput:: + + INT_FIELD1 BOOL_FIELD2 BIGINT_FIELD5 CHAR_FIELD16 LIST_FIELD19 + 101 true 1000000000300 some char text [1,2,3] + 102 true 100000000030 some char text [1,2] + 103 true 10000000003 some char text [1] + +ResultSet per Batches to VectorSchemaRoot Conversion Review Comment: ```suggestion Configuring batch size ``` ########## java/source/io.rst: ########## @@ -461,6 +461,292 @@ Reading Parquet File Please check :doc:`Dataset <./dataset>` +Reading JDBC ResultSets +*********************** + +The `Arrow Java JDBC module <https://arrow.apache.org/docs/java/jdbc.html>`_ +converts JDBC ResultSets into Arrow VectorSchemaRoots. + +ResultSet to VectorSchemaRoot Conversion +---------------------------------------- + +The main class to help us to convert ResultSet to VectorSchemaRoot is +`JdbcToArrow <https://arrow.apache.org/docs/java/reference/org/apache/arrow/adapter/jdbc/JdbcToArrow.html>`_ + +.. 
testcode:: + + import org.apache.arrow.adapter.jdbc.ArrowVectorIterator; + import org.apache.arrow.adapter.jdbc.JdbcToArrow; + import org.apache.arrow.memory.BufferAllocator; + import org.apache.arrow.memory.RootAllocator; + import org.apache.arrow.vector.VectorSchemaRoot; + import org.apache.ibatis.jdbc.ScriptRunner; + + import java.io.BufferedReader; + import java.io.FileReader; + import java.io.IOException; + import java.sql.Connection; + import java.sql.DriverManager; + import java.sql.ResultSet; + import java.sql.SQLException; + + try (BufferAllocator allocator = new RootAllocator(); + Connection connection = DriverManager.getConnection( + "jdbc:h2:mem:h2-jdbc-adapter")) { + ScriptRunner runnerDDLDML = new ScriptRunner(connection); + runnerDDLDML.setLogWriter(null); + runnerDDLDML.runScript(new BufferedReader( + new FileReader("./thirdpartydeps/database/h2-ddl.sql"))); + runnerDDLDML.runScript(new BufferedReader( + new FileReader("./thirdpartydeps/database/h2-dml.sql"))); + try (ResultSet resultSet = connection.createStatement().executeQuery( + "SELECT int_field1, bool_field2, bigint_field5 FROM TABLE1"); + ArrowVectorIterator iterator = JdbcToArrow.sqlToArrowVectorIterator( + resultSet, allocator)) { + while (iterator.hasNext()) { + try (VectorSchemaRoot root = iterator.next()) { + System.out.print(root.contentToTSVString()); + } + } + } + Review Comment: ```suggestion ``` ########## java/source/io.rst: ########## @@ -461,6 +461,292 @@ Reading Parquet File Please check :doc:`Dataset <./dataset>` +Reading JDBC ResultSets Review Comment: Also, I think the header levels are mixed up again (so a separate file will make that clearer) ########## java/source/io.rst: ########## @@ -461,6 +461,292 @@ Reading Parquet File Please check :doc:`Dataset <./dataset>` +Reading JDBC ResultSets +*********************** + +The `Arrow Java JDBC module <https://arrow.apache.org/docs/java/jdbc.html>`_ +converts JDBC ResultSets into Arrow VectorSchemaRoots. 
+ +ResultSet to VectorSchemaRoot Conversion +---------------------------------------- + +The main class to help us to convert ResultSet to VectorSchemaRoot is +`JdbcToArrow <https://arrow.apache.org/docs/java/reference/org/apache/arrow/adapter/jdbc/JdbcToArrow.html>`_ + +.. testcode:: + + import org.apache.arrow.adapter.jdbc.ArrowVectorIterator; + import org.apache.arrow.adapter.jdbc.JdbcToArrow; + import org.apache.arrow.memory.BufferAllocator; + import org.apache.arrow.memory.RootAllocator; + import org.apache.arrow.vector.VectorSchemaRoot; + import org.apache.ibatis.jdbc.ScriptRunner; + + import java.io.BufferedReader; + import java.io.FileReader; + import java.io.IOException; + import java.sql.Connection; + import java.sql.DriverManager; + import java.sql.ResultSet; + import java.sql.SQLException; + + try (BufferAllocator allocator = new RootAllocator(); + Connection connection = DriverManager.getConnection( + "jdbc:h2:mem:h2-jdbc-adapter")) { + ScriptRunner runnerDDLDML = new ScriptRunner(connection); + runnerDDLDML.setLogWriter(null); + runnerDDLDML.runScript(new BufferedReader( + new FileReader("./thirdpartydeps/database/h2-ddl.sql"))); + runnerDDLDML.runScript(new BufferedReader( + new FileReader("./thirdpartydeps/database/h2-dml.sql"))); + try (ResultSet resultSet = connection.createStatement().executeQuery( + "SELECT int_field1, bool_field2, bigint_field5 FROM TABLE1"); + ArrowVectorIterator iterator = JdbcToArrow.sqlToArrowVectorIterator( + resultSet, allocator)) { + while (iterator.hasNext()) { + try (VectorSchemaRoot root = iterator.next()) { + System.out.print(root.contentToTSVString()); + } + } + } + + } catch (SQLException | IOException e) { + e.printStackTrace(); + } + +.. 
testoutput:: + + INT_FIELD1 BOOL_FIELD2 BIGINT_FIELD5 + 101 true 1000000000300 + 102 true 100000000030 + 103 true 10000000003 + +ResultSet with Array to VectorSchemaRoot Conversion +--------------------------------------------------- + +JdbcToArrow accepts configuration through `JdbcToArrowConfig +<https://arrow.apache.org/docs/java/reference/org/apache/arrow/adapter/jdbc/JdbcToArrowConfig.html>`_. +For example, the type of the elements of array columns can be specified by +``setArraySubTypeByColumnNameMap``. + +.. testcode:: + + import org.apache.arrow.adapter.jdbc.ArrowVectorIterator; + import org.apache.arrow.adapter.jdbc.JdbcFieldInfo; + import org.apache.arrow.adapter.jdbc.JdbcToArrow; + import org.apache.arrow.adapter.jdbc.JdbcToArrowConfig; + import org.apache.arrow.adapter.jdbc.JdbcToArrowConfigBuilder; + import org.apache.arrow.adapter.jdbc.JdbcToArrowUtils; + import org.apache.arrow.memory.BufferAllocator; + import org.apache.arrow.memory.RootAllocator; + import org.apache.arrow.vector.VectorSchemaRoot; + import org.apache.ibatis.jdbc.ScriptRunner; + + import java.io.BufferedReader; + import java.io.FileReader; + import java.io.IOException; + import java.sql.Connection; + import java.sql.DriverManager; + import java.sql.ResultSet; + import java.sql.SQLException; + import java.sql.Types; + import java.util.HashMap; + + try (BufferAllocator allocator = new RootAllocator(); + Connection connection = DriverManager.getConnection( + "jdbc:h2:mem:h2-jdbc-adapter")) { + ScriptRunner runnerDDLDML = new ScriptRunner(connection); + runnerDDLDML.setLogWriter(null); + runnerDDLDML.runScript(new BufferedReader( + new FileReader("./thirdpartydeps/database/h2-ddl.sql"))); + runnerDDLDML.runScript(new BufferedReader( + new FileReader("./thirdpartydeps/database/h2-dml.sql"))); + JdbcToArrowConfig config = new JdbcToArrowConfigBuilder(allocator, + JdbcToArrowUtils.getUtcCalendar()) + .setArraySubTypeByColumnNameMap( + new HashMap<>() {{ + put("LIST_FIELD19", + new 
JdbcFieldInfo(Types.INTEGER)); + }} + ) + .build(); + try (ResultSet resultSet = connection.createStatement().executeQuery( + "SELECT int_field1, bool_field2, bigint_field5, char_field16, list_field19 FROM TABLE1"); + ArrowVectorIterator iterator = JdbcToArrow.sqlToArrowVectorIterator( + resultSet, config)) { + while (iterator.hasNext()) { + try (VectorSchemaRoot root = iterator.next()) { + System.out.print(root.contentToTSVString()); + } + } + } + } catch (SQLException | IOException e) { + e.printStackTrace(); + } + +.. testoutput:: + + INT_FIELD1 BOOL_FIELD2 BIGINT_FIELD5 CHAR_FIELD16 LIST_FIELD19 + 101 true 1000000000300 some char text [1,2,3] + 102 true 100000000030 some char text [1,2] + 103 true 10000000003 some char text [1] + +ResultSet per Batches to VectorSchemaRoot Conversion +---------------------------------------------------- + +The maximum rowCount to read each time is configured by default in 1024. This +can be customized by setting values as needed by ``setTargetBatchSize``. + +.. 
testcode:: + + import org.apache.arrow.adapter.jdbc.ArrowVectorIterator; + import org.apache.arrow.adapter.jdbc.JdbcFieldInfo; + import org.apache.arrow.adapter.jdbc.JdbcToArrow; + import org.apache.arrow.adapter.jdbc.JdbcToArrowConfig; + import org.apache.arrow.adapter.jdbc.JdbcToArrowConfigBuilder; + import org.apache.arrow.adapter.jdbc.JdbcToArrowUtils; + import org.apache.arrow.memory.BufferAllocator; + import org.apache.arrow.memory.RootAllocator; + import org.apache.arrow.vector.VectorSchemaRoot; + import org.apache.ibatis.jdbc.ScriptRunner; + + import java.io.BufferedReader; + import java.io.FileReader; + import java.io.IOException; + import java.sql.Connection; + import java.sql.DriverManager; + import java.sql.ResultSet; + import java.sql.SQLException; + import java.sql.Types; + import java.util.HashMap; + + try (BufferAllocator allocator = new RootAllocator(); + Connection connection = DriverManager.getConnection( + "jdbc:h2:mem:h2-jdbc-adapter")) { + ScriptRunner runnerDDLDML = new ScriptRunner(connection); + runnerDDLDML.setLogWriter(null); + runnerDDLDML.runScript(new BufferedReader( + new FileReader("./thirdpartydeps/database/h2-ddl.sql"))); + runnerDDLDML.runScript(new BufferedReader( + new FileReader("./thirdpartydeps/database/h2-dml.sql"))); + JdbcToArrowConfig config = new JdbcToArrowConfigBuilder(allocator, + JdbcToArrowUtils.getUtcCalendar()) + .setTargetBatchSize(2) + .setArraySubTypeByColumnNameMap( + new HashMap<>() {{ + put("LIST_FIELD19", + new JdbcFieldInfo(Types.INTEGER)); + }} + ) + .build(); + try (ResultSet resultSet = connection.createStatement().executeQuery( + "SELECT int_field1, bool_field2, bigint_field5, char_field16, list_field19 FROM TABLE1"); + ArrowVectorIterator iterator = JdbcToArrow.sqlToArrowVectorIterator( + resultSet, config)) { + while (iterator.hasNext()) { + try (VectorSchemaRoot root = iterator.next()) { + System.out.print(root.contentToTSVString()); + } + } + } + } catch (SQLException | IOException e) { + 
e.printStackTrace(); + } + +.. testoutput:: + + INT_FIELD1 BOOL_FIELD2 BIGINT_FIELD5 CHAR_FIELD16 LIST_FIELD19 + 101 true 1000000000300 some char text [1,2,3] + 102 true 100000000030 some char text [1,2] + INT_FIELD1 BOOL_FIELD2 BIGINT_FIELD5 CHAR_FIELD16 LIST_FIELD19 + 103 true 10000000003 some char text [1] + +ResultSet with Precision/Scale to VectorSchemaRoot Conversion Review Comment: ```suggestion Configuring numeric (decimal) precision and scale ``` ########## java/ext/javadoctest.py: ########## @@ -37,6 +37,9 @@ def compile( "-q", "dependency:build-classpath", "-DincludeTypes=jar", + "-Dmaven.compiler.source="+os.environ.get('compiler_version', '8'), + "-Dmaven.compiler.target="+os.environ.get('compiler_version', '8'), + "-Darrow.version="+os.environ.get('arrow_version', '9.0.0'), Review Comment: I think we can remove all the changes to the build/CI config right? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
