This is an automated email from the ASF dual-hosted git repository.
lidavidm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new f1737f93a5 ARROW-17003: [Java][Docs] Document arrow-jdbc adapter
(#13543)
f1737f93a5 is described below
commit f1737f93a5f9f420740f5e948a114c95c47427b0
Author: David Li <[email protected]>
AuthorDate: Wed Jul 13 08:01:45 2022 -0400
ARROW-17003: [Java][Docs] Document arrow-jdbc adapter (#13543)
Add a basic documentation page for the arrow-jdbc adapter.
I would also like to add a Cookbook page, and then cross-link the two
pages, as a follow-up.
Authored-by: David Li <[email protected]>
Signed-off-by: David Li <[email protected]>
---
docs/Makefile | 11 +-
docs/source/conf.py | 2 +-
docs/source/developers/java/building.rst | 31 ++--
docs/source/java/index.rst | 1 +
docs/source/java/jdbc.rst | 174 +++++++++++++++++++++
.../adapter/jdbc/JdbcToArrowConfigBuilder.java | 35 +++++
6 files changed, 237 insertions(+), 17 deletions(-)
diff --git a/docs/Makefile b/docs/Makefile
index 2511d22e44..ded2b73899 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -256,14 +256,19 @@ dummy:
python:
$(SPHINXBUILD) -b html $(SPHINXOPTS) -c $(SOURCEDIR) $(SOURCEDIR)/python $(BUILDDIR)/html/python
@echo
- @echo "Build finished. The HTML files are in $(BUILLDIR)/html/python"
+ @echo "Build finished. The HTML files are in $(BUILDDIR)/html/python"
java_tutorial:
$(SPHINXBUILD) -b html $(SPHINXOPTS) -c $(SOURCEDIR) $(SOURCEDIR)/java $(BUILDDIR)/html/tutorial/java
@echo
- @echo "Build finished. The HTML files are in $(BUILLDIR)/html/tutorial/java"
+ @echo "Build finished. The HTML files are in $(BUILDDIR)/html/tutorial/java"
java_dev:
$(SPHINXBUILD) -b html $(SPHINXOPTS) -c $(SOURCEDIR) $(SOURCEDIR)/developers/java $(BUILDDIR)/html/developers/java
@echo
- @echo "Build finished. The HTML files are in $(BUILLDIR)/html/developers/java"
\ No newline at end of file
+ @echo "Build finished. The HTML files are in $(BUILDDIR)/html/developers/java"
+
+java:
+ $(SPHINXBUILD) -b html $(SPHINXOPTS) -c $(SOURCEDIR) $(SOURCEDIR)/java $(BUILDDIR)/html/java
+ @echo
+ @echo "Build finished. The HTML files are in $(BUILDDIR)/html/java"
diff --git a/docs/source/conf.py b/docs/source/conf.py
index ea583aa8ce..42994b0f3b 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -172,7 +172,7 @@ if "+" in release:
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
-language = None
+language = "en"
# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
diff --git a/docs/source/developers/java/building.rst b/docs/source/developers/java/building.rst
index e50142d285..b7b9cb9213 100644
--- a/docs/source/developers/java/building.rst
+++ b/docs/source/developers/java/building.rst
@@ -186,6 +186,9 @@ Arrow repository, and update the following settings:
Settings > Build, Execution, Deployment > Compiler > Java Compiler and
disable
"Use '--release' option for cross-compilation (Java 9 and later)". Otherwise
you will get an error like "package sun.misc does not exist".
+* You may want to disable error-prone entirely if it gives spurious
+ warnings (disable both error-prone profiles in the Maven tool window
+ and "Reload All Maven Projects").
* If using IntelliJ's Maven integration to build, you may need to change
``<fork>`` to ``false`` in the pom.xml files due to an `IntelliJ bug
<https://youtrack.jetbrains.com/issue/IDEA-278903>`__.
@@ -196,23 +199,25 @@ IntelliJ Maven integration instead of with IntelliJ
directly.
Common Errors
=============
-1. If the build cannot find dependencies, with errors like these:
- - Could NOT find Boost (missing: Boost_INCLUDE_DIR system filesystem)
- - Could NOT find Lz4 (missing: LZ4_LIB)
- - Could NOT find zstd (missing: ZSTD_LIB)
+* When working with the JNI code: if the C++ build cannot find dependencies, with errors like these:
- Download the dependencies at build time (More details in the `Dependency
Resolution`_):
+ .. code-block::
- .. code-block::
+ Could NOT find Boost (missing: Boost_INCLUDE_DIR system filesystem)
+ Could NOT find Lz4 (missing: LZ4_LIB)
+ Could NOT find zstd (missing: ZSTD_LIB)
- -Dre2_SOURCE=BUNDLED \
- -DBoost_SOURCE=BUNDLED \
- -Dutf8proc_SOURCE=BUNDLED \
- -DSnappy_SOURCE=BUNDLED \
- -DORC_SOURCE=BUNDLED \
- -DZLIB_SOURCE=BUNDLED
+ Specify that the dependencies should be downloaded at build time (more details at `Dependency Resolution`_):
+
+ .. code-block::
+
+ -Dre2_SOURCE=BUNDLED \
+ -DBoost_SOURCE=BUNDLED \
+ -Dutf8proc_SOURCE=BUNDLED \
+ -DSnappy_SOURCE=BUNDLED \
+ -DORC_SOURCE=BUNDLED \
+ -DZLIB_SOURCE=BUNDLED
.. _Archery: https://github.com/apache/arrow/blob/master/dev/archery/README.md
.. _Dependency Resolution: https://arrow.apache.org/docs/developers/cpp/building.html#individual-dependency-resolution
.. _C++ shared libraries: https://arrow.apache.org/docs/cpp/build_system.html
-.. _TestArrowBuf.java:
https://github.com/apache/arrow/blob/master/java/memory/memory-core/src/test/java/org/apache/arrow/memory/TestArrowBuf.java#L130:L147
diff --git a/docs/source/java/index.rst b/docs/source/java/index.rst
index b05988626b..ea08364858 100644
--- a/docs/source/java/index.rst
+++ b/docs/source/java/index.rst
@@ -35,4 +35,5 @@ on the Arrow format and other language bindings see the
:doc:`parent documentati
flight
dataset
cdata
+ jdbc
Reference (javadoc) <reference/index>
diff --git a/docs/source/java/jdbc.rst b/docs/source/java/jdbc.rst
new file mode 100644
index 0000000000..da63351601
--- /dev/null
+++ b/docs/source/java/jdbc.rst
@@ -0,0 +1,174 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+==================
+Arrow JDBC Adapter
+==================
+
+The Arrow JDBC Adapter helps applications exchange data between JDBC
+and Arrow. Currently, it supports reading JDBC ResultSets into Arrow
+VectorSchemaRoots.
+
+ResultSet to VectorSchemaRoot Conversion
+========================================
+
+This can be accessed via the JdbcToArrow class. The resulting
+ArrowVectorIterator will convert a ResultSet to Arrow data in batches
+of rows.
+
+.. code-block:: java
+
+ try (ArrowVectorIterator it = JdbcToArrow.sqlToArrowVectorIterator(resultSet, allocator)) {
+ while (it.hasNext()) {
+ VectorSchemaRoot root = it.next();
+ // Consume the root…
+ }
+ }
+
+The batch size (via ``setTargetBatchSize``), type mapping, and other
+options can be customized with a ``JdbcToArrowConfigBuilder``:
+
+.. code-block:: java
+
+ JdbcToArrowConfig config = new JdbcToArrowConfigBuilder(allocator, /*calendar=*/null)
+ .setReuseVectorSchemaRoot(reuseVectorSchemaRoot)
+ .setJdbcToArrowTypeConverter((jdbcFieldInfo -> {
+ switch (jdbcFieldInfo.getJdbcType()) {
+ case Types.BIGINT:
+ // Assume actual value range is SMALLINT
+ return new ArrowType.Int(16, true);
+ default:
+ return null;
+ }
+ }))
+ .build();
+ try (ArrowVectorIterator iter = JdbcToArrow.sqlToArrowVectorIterator(rs, config)) {
+ while (iter.hasNext()) {
+ VectorSchemaRoot root = iter.next();
+ // Consume the root…
+ }
+ }
+
+The JDBC type can be explicitly specified, which is useful since JDBC
+drivers can give spurious type information. For example, the Postgres
+driver has been observed to use Decimal types with scale and precision
+0; these cases can be handled by specifying the type explicitly before
+reading. Also, some JDBC drivers may return BigDecimal values with
+inconsistent scale. A RoundingMode can be set to handle these cases:
+
+.. code-block:: java
+
+ Map<Integer, JdbcFieldInfo> mapping = new HashMap<>();
+ mapping.put(1, new JdbcFieldInfo(Types.DECIMAL, 20, 7));
+ JdbcToArrowConfig config = new JdbcToArrowConfigBuilder(allocator, /*calendar=*/null)
+ .setBigDecimalRoundingMode(RoundingMode.UNNECESSARY)
+ .setExplicitTypesByColumnIndex(mapping)
+ .build();
+ try (ArrowVectorIterator iter = JdbcToArrow.sqlToArrowVectorIterator(rs, config)) {
+ while (iter.hasNext()) {
+ VectorSchemaRoot root = iter.next();
+ // Consume the root…
+ }
+ }
+
+Currently, it is not possible to define a custom type conversion for a
+supported or unsupported type.
+
+Type Mapping
+------------
+
+The JDBC to Arrow type mapping can be obtained at runtime from
+`JdbcToArrowUtils.getArrowTypeFromJdbcType`_.
+
+.. _JdbcToArrowUtils.getArrowTypeFromJdbcType: https://arrow.apache.org/docs/java/reference/org/apache/arrow/adapter/jdbc/JdbcToArrowUtils.html#getArrowTypeFromJdbcType-org.apache.arrow.adapter.jdbc.JdbcFieldInfo-java.util.Calendar-
+
++--------------------+--------------------+-------+
+| JDBC Type | Arrow Type | Notes |
++====================+====================+=======+
+| ARRAY | List | \(1) |
++--------------------+--------------------+-------+
+| BIGINT | Int64 | |
++--------------------+--------------------+-------+
+| BINARY | Binary | |
++--------------------+--------------------+-------+
+| BIT | Bool | |
++--------------------+--------------------+-------+
+| BLOB | Binary | |
++--------------------+--------------------+-------+
+| BOOLEAN | Bool | |
++--------------------+--------------------+-------+
+| CHAR | Utf8 | |
++--------------------+--------------------+-------+
+| CLOB | Utf8 | |
++--------------------+--------------------+-------+
+| DATE | Date32 | |
++--------------------+--------------------+-------+
+| DECIMAL | Decimal128 | \(2) |
++--------------------+--------------------+-------+
+| DOUBLE | Double | |
++--------------------+--------------------+-------+
+| FLOAT | Float | |
++--------------------+--------------------+-------+
+| INTEGER | Int32 | |
++--------------------+--------------------+-------+
+| LONGVARBINARY | Binary | |
++--------------------+--------------------+-------+
+| LONGNVARCHAR | Utf8 | |
++--------------------+--------------------+-------+
+| LONGVARCHAR | Utf8 | |
++--------------------+--------------------+-------+
+| NCHAR | Utf8 | |
++--------------------+--------------------+-------+
+| NULL | Null | |
++--------------------+--------------------+-------+
+| NUMERIC | Decimal128 | |
++--------------------+--------------------+-------+
+| NVARCHAR | Utf8 | |
++--------------------+--------------------+-------+
+| REAL | Float | |
++--------------------+--------------------+-------+
+| SMALLINT | Int16 | |
++--------------------+--------------------+-------+
+| STRUCT | Struct | \(3) |
++--------------------+--------------------+-------+
+| TIME | Time32[ms] | |
++--------------------+--------------------+-------+
+| TIMESTAMP | Timestamp[ms] | \(4) |
++--------------------+--------------------+-------+
+| TINYINT | Int8 | |
++--------------------+--------------------+-------+
+| VARBINARY | Binary | |
++--------------------+--------------------+-------+
+| VARCHAR | Utf8 | |
++--------------------+--------------------+-------+
+
+* \(1) The list value type must be explicitly configured and cannot be
+ inferred. Use `setArraySubTypeByColumnIndexMap`_ or
+ `setArraySubTypeByColumnNameMap`_.
+* \(2) By default, the scale of decimal values must match the scale in
+ the type exactly; precision is allowed to be any value greater than
+ or equal to the type precision. If there is a mismatch, by default, an
+ exception will be thrown. This can be configured by setting a
+ different RoundingMode with setBigDecimalRoundingMode.
+* \(3) Not fully supported: while the type conversion is defined, the
+ value conversion is not. See ARROW-17006_.
+* \(4) If a Calendar is provided, then the timestamp will have the
+ timezone of the calendar, else it will be a timestamp without
+ timezone.
+
+.. _setArraySubTypeByColumnIndexMap: https://arrow.apache.org/docs/java/reference/org/apache/arrow/adapter/jdbc/JdbcToArrowConfigBuilder.html#setArraySubTypeByColumnIndexMap-java.util.Map-
+.. _setArraySubTypeByColumnNameMap: https://arrow.apache.org/docs/java/reference/org/apache/arrow/adapter/jdbc/JdbcToArrowConfigBuilder.html#setArraySubTypeByColumnNameMap-java.util.Map-
+.. _ARROW-17006: https://issues.apache.org/jira/browse/ARROW-17006
diff --git a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrowConfigBuilder.java b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrowConfigBuilder.java
index 2d963cfac4..5618087669 100644
--- a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrowConfigBuilder.java
+++ b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrowConfigBuilder.java
@@ -170,32 +170,67 @@ public class JdbcToArrowConfigBuilder {
return this;
}
+ /**
+ * Sets the mapping of column-index-to-{@link JdbcFieldInfo} used for column types.
+ * <p>
+ * This can be useful to override type information from JDBC drivers that provide incomplete type info,
+ * e.g. DECIMAL with precision = scale = 0.
+ * <p>
+ * The column index is 1-based, to match the JDBC column index.
+ * @param map The mapping.
+ */
public JdbcToArrowConfigBuilder setExplicitTypesByColumnIndex(Map<Integer,
JdbcFieldInfo> map) {
this.explicitTypesByColumnIndex = map;
return this;
}
+ /**
+ * Sets the mapping of column-name-to-{@link JdbcFieldInfo} used for column types.
+ * <p>
+ * This can be useful to override type information from JDBC drivers that provide incomplete type info,
+ * e.g. DECIMAL with precision = scale = 0.
+ * @param map The mapping.
+ */
public JdbcToArrowConfigBuilder setExplicitTypesByColumnName(Map<String,
JdbcFieldInfo> map) {
this.explicitTypesByColumnName = map;
return this;
}
+ /**
+ * Set the target number of rows to convert at once.
+ * <p>
+ * Use {@link JdbcToArrowConfig#NO_LIMIT_BATCH_SIZE} to read all rows at once.
+ */
public JdbcToArrowConfigBuilder setTargetBatchSize(int targetBatchSize) {
this.targetBatchSize = targetBatchSize;
return this;
}
+ /**
+ * Set the function used to convert JDBC types to Arrow types.
+ * <p>
+ * Defaults to wrapping {@link JdbcToArrowUtils#getArrowTypeFromJdbcType(JdbcFieldInfo, Calendar)}.
+ */
public JdbcToArrowConfigBuilder setJdbcToArrowTypeConverter(
Function<JdbcFieldInfo, ArrowType> jdbcToArrowTypeConverter) {
this.jdbcToArrowTypeConverter = jdbcToArrowTypeConverter;
return this;
}
+ /**
+ * Set whether to use the same {@link org.apache.arrow.vector.VectorSchemaRoot} instance on each iteration,
+ * or to allocate a new one.
+ */
public JdbcToArrowConfigBuilder setReuseVectorSchemaRoot(boolean
reuseVectorSchemaRoot) {
this.reuseVectorSchemaRoot = reuseVectorSchemaRoot;
return this;
}
+ /**
+ * Set the rounding mode used when the scale of the actual value does not match the declared scale.
+ * <p>
+ * By default, an error is raised in such cases.
+ */
public JdbcToArrowConfigBuilder setBigDecimalRoundingMode(RoundingMode
bigDecimalRoundingMode) {
this.bigDecimalRoundingMode = bigDecimalRoundingMode;
return this;