This is an automated email from the ASF dual-hosted git repository.
william pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/main by this push:
new d8becfc25 ORC-1564: Add Java ORC configuration documentation
d8becfc25 is described below
commit d8becfc2552f71467794180c781f308179d42a21
Author: sychen <[email protected]>
AuthorDate: Mon Jan 1 22:04:32 2024 -0800
ORC-1564: Add Java ORC configuration documentation
### What changes were proposed in this pull request?
This PR adds a new documentation page, `core-java-config.md`, that lists the
ORC Java configuration properties defined in `OrcConf`, and fixes two small
typos in the `OrcConf` descriptions.
### Why are the changes needed?
ORC Java has many configuration options, but there is no page where users can
view them.
### How was this patch tested?
Closes #1713 from cxzl25/ORC-1564.
Authored-by: sychen <[email protected]>
Signed-off-by: William Hyun <[email protected]>
---
java/core/src/java/org/apache/orc/OrcConf.java | 4 +-
site/_data/docs.yml | 1 +
site/_docs/core-java-config.md | 399 +++++++++++++++++++++++++
3 files changed, 402 insertions(+), 2 deletions(-)
diff --git a/java/core/src/java/org/apache/orc/OrcConf.java
b/java/core/src/java/org/apache/orc/OrcConf.java
index 0ede3e7cc..900ab56fc 100644
--- a/java/core/src/java/org/apache/orc/OrcConf.java
+++ b/java/core/src/java/org/apache/orc/OrcConf.java
@@ -163,7 +163,7 @@ public enum OrcConf {
"Hive 2.1."),
FORCE_POSITIONAL_EVOLUTION_LEVEL("orc.force.positional.evolution.level",
"orc.force.positional.evolution.level", 1,
- "Require schema evolution to match the the defined no. of level columns
using position\n" +
+ "Require schema evolution to match the defined no. of level columns
using position\n" +
"rather than column names. This provides backwards compatibility
with Hive 2.1."),
ROWS_BETWEEN_CHECKS("orc.rows.between.memory.checks",
"orc.rows.between.memory.checks", 5000,
"How often should MemoryManager check the memory sizes? Measured in
rows\n" +
@@ -219,7 +219,7 @@ public enum OrcConf {
+ "optimization"),
ORC_MIN_DISK_SEEK_SIZE_TOLERANCE("orc.min.disk.seek.size.tolerance",
"orc.min.disk.seek.size.tolerance", 0.00,
- "Define the tolerance for for extra bytes read as a
result of "
+ "Define the tolerance for extra bytes read as a
result of "
+ "orc.min.disk.seek.size. If the "
+ "(bytesRead - bytesNeeded) / bytesNeeded is
greater than this "
+ "threshold then extra work is performed to drop
the extra bytes from "
diff --git a/site/_data/docs.yml b/site/_data/docs.yml
index b57e481b6..a3346b133 100644
--- a/site/_data/docs.yml
+++ b/site/_data/docs.yml
@@ -34,6 +34,7 @@
docs:
- core-java
- core-cpp
+ - core-java-config
- title: Tools
docs:
diff --git a/site/_docs/core-java-config.md b/site/_docs/core-java-config.md
new file mode 100644
index 000000000..38e0ed16a
--- /dev/null
+++ b/site/_docs/core-java-config.md
@@ -0,0 +1,399 @@
+---
+layout: docs
+title: ORC Java configuration
+permalink: /docs/core-java-config.html
+---
+## Configuration properties
+
+<table class="configtable">
+<tr>
+ <th>Key</th>
+ <th>Default</th>
+ <th>Notes</th>
+</tr>
+<tr>
+ <td><code>orc.stripe.size</code></td>
+ <td>67108864</td>
+ <td>
+ Define the default ORC stripe size, in bytes.
+ </td>
+</tr>
+<tr>
+ <td><code>orc.stripe.row.count</code></td>
+ <td>2147483647</td>
+ <td>
+ This value limits the row count in one stripe. The number of rows per
stripe can range within (0, "orc.stripe.row.count" + max(batchSize,
"orc.rows.between.memory.checks")).
+ </td>
+</tr>
+<tr>
+ <td><code>orc.block.size</code></td>
+ <td>268435456</td>
+ <td>
+ Define the default file system block size for ORC files.
+ </td>
+</tr>
+<tr>
+ <td><code>orc.create.index</code></td>
+ <td>true</td>
+ <td>
+ Whether the ORC writer should create indexes as part of the file.
+ </td>
+</tr>
+<tr>
+ <td><code>orc.row.index.stride</code></td>
+ <td>10000</td>
+ <td>
+ Define the default ORC index stride in number of rows. (Stride is the
number of rows an index entry represents.)
+ </td>
+</tr>
+<tr>
+ <td><code>orc.compress.size</code></td>
+ <td>262144</td>
+ <td>
+ Define the default ORC buffer size, in bytes.
+ </td>
+</tr>
+<tr>
+ <td><code>orc.base.delta.ratio</code></td>
+ <td>8</td>
+ <td>
+ The ratio of base writer and delta writer in terms of STRIPE_SIZE and
BUFFER_SIZE.
+ </td>
+</tr>
+<tr>
+ <td><code>orc.block.padding</code></td>
+ <td>true</td>
+ <td>
+ Define whether stripes should be padded to the HDFS block boundaries.
+ </td>
+</tr>
+<tr>
+ <td><code>orc.compress</code></td>
+ <td>ZLIB</td>
+ <td>
+ Define the default compression codec for ORC files.
+ </td>
+</tr>
+<tr>
+ <td><code>orc.write.format</code></td>
+ <td>0.12</td>
+ <td>
+ Define the version of the file to write. Possible values are 0.11 and
0.12. If this parameter is not defined, ORC will use the run length encoding
(RLE) introduced in Hive 0.12.
+ </td>
+</tr>
+<tr>
+ <td><code>orc.buffer.size.enforce</code></td>
+ <td>false</td>
+ <td>
+ Defines whether to enforce ORC compression buffer size.
+ </td>
+</tr>
+<tr>
+ <td><code>orc.encoding.strategy</code></td>
+ <td>SPEED</td>
+ <td>
+ Define the encoding strategy to use while writing data. Changing this will
only affect the lightweight encoding for integers. This flag will not change
the compression level of higher-level compression codecs (like ZLIB).
+ </td>
+</tr>
+<tr>
+ <td><code>orc.compression.strategy</code></td>
+ <td>SPEED</td>
+ <td>
+ Define the compression strategy to use while writing data. This changes
the compression level of higher-level compression codecs (like ZLIB).
+ </td>
+</tr>
+<tr>
+ <td><code>orc.block.padding.tolerance</code></td>
+ <td>0.05</td>
+ <td>
+ Define the tolerance for block padding as a decimal fraction of stripe
size (for example, the default value 0.05 is 5% of the stripe size). For the
defaults of a 64MB ORC stripe and 256MB HDFS blocks, the default block padding
tolerance of 5% will reserve a maximum of 3.2MB for padding within the 256MB
block. In that case, if the available size within the block is more than 3.2MB,
a new smaller stripe will be inserted to fit within that space. This will make
sure that no stripe written [...]
+ </td>
+</tr>
+<tr>
+ <td><code>orc.bloom.filter.fpp</code></td>
+ <td>0.01</td>
+ <td>
+ Define the default false positive probability for bloom filters.
+ </td>
+</tr>
+<tr>
+ <td><code>orc.use.zerocopy</code></td>
+ <td>false</td>
+ <td>
+ Use zerocopy reads with ORC. (This requires Hadoop 2.3 or later.)
+ </td>
+</tr>
+<tr>
+ <td><code>orc.skip.corrupt.data</code></td>
+ <td>false</td>
+ <td>
+ If the ORC reader encounters corrupt data, this value will be used to
determine whether to skip the corrupt data or throw an exception. The default
behavior is to throw an exception.
+ </td>
+</tr>
+<tr>
+ <td><code>orc.tolerate.missing.schema</code></td>
+ <td>true</td>
+ <td>
+ Writers earlier than HIVE-4243 may have inaccurate schema metadata. This
setting will enable best-effort schema evolution rather than rejecting
mismatched schemas.
+ </td>
+</tr>
+<tr>
+ <td><code>orc.memory.pool</code></td>
+ <td>0.5</td>
+ <td>
+ Maximum fraction of heap that can be used by ORC file writers.
+ </td>
+</tr>
+<tr>
+ <td><code>orc.dictionary.key.threshold</code></td>
+ <td>0.8</td>
+ <td>
+ If the number of distinct keys in a dictionary is greater than this
fraction of the total number of non-null rows, turn off dictionary encoding.
Use 1 to always use dictionary encoding.
+ </td>
+</tr>
+<tr>
+ <td><code>orc.dictionary.early.check</code></td>
+ <td>true</td>
+ <td>
+ If enabled, the dictionary check will happen after the first row index
stride (default 10000 rows); otherwise the dictionary check will happen before
writing the first stripe. In both cases, the decision whether to use a
dictionary will be retained thereafter.
+ </td>
+</tr>
+<tr>
+ <td><code>orc.dictionary.implementation</code></td>
+ <td>rbtree</td>
+ <td>
+ The implementation of the dictionary used for string-type column
encoding. The choices are: rbtree - use a red-black tree as the dictionary
implementation; hash - use a hash table as the dictionary implementation.
+ </td>
+</tr>
+<tr>
+ <td><code>orc.bloom.filter.columns</code></td>
+ <td></td>
+ <td>
+ List of columns to create bloom filters for when writing.
+ </td>
+</tr>
+<tr>
+ <td><code>orc.bloom.filter.write.version</code></td>
+ <td>utf8</td>
+ <td>
+ Which version of the bloom filters should be written. The choices are:
original - writes two versions of the bloom filters for use by
both old and new readers; utf8 - writes just the new bloom filters.
+ </td>
+</tr>
+<tr>
+ <td><code>orc.bloom.filter.ignore.non-utf8</code></td>
+ <td>false</td>
+ <td>
+ Whether the reader should ignore the obsolete non-UTF8 bloom filters.
+ </td>
+</tr>
+<tr>
+ <td><code>orc.max.file.length</code></td>
+ <td>9223372036854775807</td>
+ <td>
+ The maximum size of the file to read for finding the file tail. This is
primarily used for streaming ingest, to read intermediate footers while the
file is still open.
+ </td>
+</tr>
+<tr>
+ <td><code>orc.mapred.input.schema</code></td>
+ <td>null</td>
+ <td>
+ The schema that the user desires to read. The values are interpreted using
TypeDescription.fromString.
+ </td>
+</tr>
+<tr>
+ <td><code>orc.mapred.map.output.key.schema</code></td>
+ <td>null</td>
+ <td>
+ The schema of the MapReduce shuffle key. The values are interpreted using
TypeDescription.fromString.
+ </td>
+</tr>
+<tr>
+ <td><code>orc.mapred.map.output.value.schema</code></td>
+ <td>null</td>
+ <td>
+ The schema of the MapReduce shuffle value. The values are interpreted
using TypeDescription.fromString.
+ </td>
+</tr>
+<tr>
+ <td><code>orc.mapred.output.schema</code></td>
+ <td>null</td>
+ <td>
+ The schema that the user desires to write. The values are interpreted
using TypeDescription.fromString.
+ </td>
+</tr>
+<tr>
+ <td><code>orc.include.columns</code></td>
+ <td>null</td>
+ <td>
+ The comma-separated list of column ids that should be read, with 0 being
the first column, 1 being the next, and so on.
+ </td>
+</tr>
+<tr>
+ <td><code>orc.kryo.sarg</code></td>
+ <td>null</td>
+ <td>
+ The kryo and base64 encoded SearchArgument for predicate pushdown.
+ </td>
+</tr>
+<tr>
+ <td><code>orc.kryo.sarg.buffer</code></td>
+ <td>8192</td>
+ <td>
+ The kryo buffer size for SearchArgument for predicate pushdown.
+ </td>
+</tr>
+<tr>
+ <td><code>orc.sarg.column.names</code></td>
+ <td>null</td>
+ <td>
+ The list of column names for the SearchArgument.
+ </td>
+</tr>
+<tr>
+ <td><code>orc.force.positional.evolution</code></td>
+ <td>false</td>
+ <td>
+ Require schema evolution to match the top level columns using position
rather than column names. This provides backwards compatibility with Hive 2.1.
+ </td>
+</tr>
+<tr>
+ <td><code>orc.force.positional.evolution.level</code></td>
+ <td>1</td>
+ <td>
+ Require schema evolution to match the defined no. of level columns using
position rather than column names. This provides backwards compatibility with
Hive 2.1.
+ </td>
+</tr>
+<tr>
+ <td><code>orc.rows.between.memory.checks</code></td>
+ <td>5000</td>
+ <td>
+ How often should MemoryManager check the memory sizes? Measured in rows
added to all of the writers. The valid range is [1,10000] and it is primarily
meant for testing. Setting this too low may negatively affect performance. Use
orc.stripe.row.count instead if the value is larger than orc.stripe.row.count.
+ </td>
+</tr>
+<tr>
+ <td><code>orc.overwrite.output.file</code></td>
+ <td>false</td>
+ <td>
+ A boolean flag to enable overwriting of the output file if it already
exists.
+ </td>
+</tr>
+<tr>
+ <td><code>orc.schema.evolution.case.sensitive</code></td>
+ <td>true</td>
+ <td>
+ A boolean flag to determine if the comparison of field names in schema
evolution is case-sensitive.
+ </td>
+</tr>
+<tr>
+ <td><code>orc.sarg.to.filter</code></td>
+ <td>false</td>
+ <td>
+ A boolean flag to determine if a SArg is allowed to become a filter.
+ </td>
+</tr>
+<tr>
+ <td><code>orc.filter.use.selected</code></td>
+ <td>false</td>
+ <td>
+ A boolean flag to determine if the selected vector is supported by the
reading application. If false, the output of the ORC reader must have the
filter reapplied to avoid using unset values in the unselected rows. If unsure,
please leave this as false.
+ </td>
+</tr>
+<tr>
+ <td><code>orc.filter.plugin</code></td>
+ <td>false</td>
+ <td>
+ Enables the use of plugin filters during read. The plugin filters are
discovered against the service org.apache.orc.filter.PluginFilterService; if
multiple filters are found, they are combined using AND. The order of
application is non-deterministic, and the filter functionality should not
depend on the order of application.
+ </td>
+</tr>
+<tr>
+ <td><code>orc.filter.plugin.allowlist</code></td>
+ <td>*</td>
+ <td>
+ A list of comma-separated class names. If specified, it restricts the
PluginFilters to just these classes, as discovered by the PluginFilterService.
The default of * allows all discovered classes, and an empty string does not
allow any plugins to be applied.
+ </td>
+</tr>
+<tr>
+ <td><code>orc.write.variable.length.blocks</code></td>
+ <td>false</td>
+ <td>
+ A boolean flag indicating whether the ORC writer should write
variable-length HDFS blocks.
+ </td>
+</tr>
+<tr>
+ <td><code>orc.column.encoding.direct</code></td>
+ <td></td>
+ <td>
+ Comma-separated list of columns for which dictionary encoding is to be
skipped.
+ </td>
+</tr>
+<tr>
+ <td><code>orc.max.disk.range.chunk.limit</code></td>
+ <td>2147482623</td>
+ <td>
+ When reading stripes larger than 2GB, specify the maximum limit for the
chunk size.
+ </td>
+</tr>
+<tr>
+ <td><code>orc.min.disk.seek.size</code></td>
+ <td>0</td>
+ <td>
+ When determining contiguous reads, gaps within this size are read
contiguously rather than seeked over. The default value of zero disables this
optimization.
+ </td>
+</tr>
+<tr>
+ <td><code>orc.min.disk.seek.size.tolerance</code></td>
+ <td>0.0</td>
+ <td>
+ Define the tolerance for extra bytes read as a result of
orc.min.disk.seek.size. If the (bytesRead - bytesNeeded) / bytesNeeded is
greater than this threshold then extra work is performed to drop the extra
bytes from memory after the read.
+ </td>
+</tr>
+<tr>
+ <td><code>orc.encrypt</code></td>
+ <td>null</td>
+ <td>
+ The list of keys and columns to encrypt with.
+ </td>
+</tr>
+<tr>
+ <td><code>orc.mask</code></td>
+ <td>null</td>
+ <td>
+ The masks to apply to the encrypted columns.
+ </td>
+</tr>
+<tr>
+ <td><code>orc.key.provider</code></td>
+ <td>hadoop</td>
+ <td>
+ The kind of KeyProvider to use for encryption.
+ </td>
+</tr>
+<tr>
+ <td><code>orc.proleptic.gregorian</code></td>
+ <td>false</td>
+ <td>
+ Should we read and write dates & times using the proleptic Gregorian
calendar instead of the hybrid Julian-Gregorian calendar? Hive before 3.1 and
Spark before 3.0 used the hybrid calendar.
+ </td>
+</tr>
+<tr>
+ <td><code>orc.proleptic.gregorian.default</code></td>
+ <td>false</td>
+ <td>
+ This value controls whether pre-ORC 27 files use the hybrid or proleptic
calendar. Only Hive 3.1 and the C++ library wrote using the proleptic
calendar, so hybrid is the default.
+ </td>
+</tr>
+<tr>
+ <td><code>orc.row.batch.size</code></td>
+ <td>1024</td>
+ <td>
+ The number of rows to include in an ORC vectorized reader batch. The value
should be carefully chosen to minimize overhead and avoid OOMs when reading
data.
+ </td>
+</tr>
+<tr>
+ <td><code>orc.row.child.limit</code></td>
+ <td>32768</td>
+ <td>
+ The maximum number of child elements to buffer before the ORC row writer
writes the batch to the file.
+ </td>
+</tr>
+</table>
\ No newline at end of file
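
A minimal sketch of how the keys documented above can be used from Java: the
properties may be set on a Hadoop Configuration either directly by key or
through the corresponding OrcConf enum constants, and the Configuration is then
passed to the writer options. The class name, output path, schema, and chosen
values below are illustrative placeholders, not part of the patch.

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.orc.OrcConf;
import org.apache.orc.OrcFile;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;

public class OrcConfigExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();

    // Set a property directly by its key, as listed in the table above.
    conf.set("orc.compress", "ZLIB");

    // Or set properties through the OrcConf enum, which maps to the same keys.
    OrcConf.STRIPE_SIZE.setLong(conf, 64L * 1024 * 1024);   // orc.stripe.size
    OrcConf.ROW_INDEX_STRIDE.setLong(conf, 10000);          // orc.row.index.stride
    OrcConf.BLOOM_FILTER_COLUMNS.setString(conf, "id");     // orc.bloom.filter.columns

    // Illustrative schema and output path.
    TypeDescription schema =
        TypeDescription.fromString("struct<id:bigint,name:string>");
    Writer writer = OrcFile.createWriter(
        new Path("/tmp/example.orc"),
        OrcFile.writerOptions(conf).setSchema(schema));
    writer.close();
  }
}
```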