This is an automated email from the ASF dual-hosted git repository.
kerwin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-paimon.git
The following commit(s) were added to refs/heads/master by this push:
new b87905497 [doc] Introduce orc options documentation page (#3061)
b87905497 is described below
commit b879054979ecad67ea7e8a48cede5f1a7cc50ec2
Author: Jingsong Lee <[email protected]>
AuthorDate: Thu Mar 21 08:44:34 2024 +0800
[doc] Introduce orc options documentation page (#3061)
---
docs/content/maintenance/configurations.md | 6 +-
.../shortcodes/generated/core_configuration.html | 30 ---------
.../shortcodes/generated/orc_configuration.html | 60 ++++++++++++++++++
.../generated/spark_connector_configuration.html | 12 ++--
.../main/java/org/apache/paimon/CoreOptions.java | 40 ------------
.../configuration/ConfigOptionsDocGenerator.java | 1 +
.../java/org/apache/paimon/format/OrcOptions.java | 72 ++++++++++++++++++++++
.../apache/paimon/format/orc/OrcWriterFactory.java | 3 +-
.../format/orc/writer/OrcBulkWriterTest.java | 5 +-
9 files changed, 149 insertions(+), 80 deletions(-)
diff --git a/docs/content/maintenance/configurations.md b/docs/content/maintenance/configurations.md
index 87d59cd34..564b7c44c 100644
--- a/docs/content/maintenance/configurations.md
+++ b/docs/content/maintenance/configurations.md
@@ -74,7 +74,11 @@ Spark connector options for paimon.
{{< generated/spark_connector_configuration >}}
-## RocksDB Options
+### ORC Options
+
+{{< generated/orc_configuration >}}
+
+### RocksDB Options
The following options allow users to finely adjust RocksDB for better
performance. You can either specify them in table properties or in dynamic
table hints.
diff --git a/docs/layouts/shortcodes/generated/core_configuration.html b/docs/layouts/shortcodes/generated/core_configuration.html
index ecf92c2f1..3c2dbcaae 100644
--- a/docs/layouts/shortcodes/generated/core_configuration.html
+++ b/docs/layouts/shortcodes/generated/core_configuration.html
@@ -383,36 +383,6 @@ This config option does not affect the default filesystem metastore.</td>
<td>Integer</td>
<td>The number of sorted runs that trigger the stopping of writes,
the default value is 'num-sorted-run.compaction-trigger' + 1.</td>
</tr>
- <tr>
- <td><h5>orc.bloom.filter.columns</h5></td>
- <td style="word-wrap: break-word;">(none)</td>
- <td>String</td>
- <td>A comma-separated list of columns for which to create a bloom
filter when writing.</td>
- </tr>
- <tr>
- <td><h5>orc.bloom.filter.fpp</h5></td>
- <td style="word-wrap: break-word;">0.05</td>
- <td>Double</td>
- <td>Define the default false positive probability for bloom
filters.</td>
- </tr>
- <tr>
- <td><h5>orc.column.encoding.direct</h5></td>
- <td style="word-wrap: break-word;">(none)</td>
- <td>Integer</td>
- <td>Comma-separated list of fields for which dictionary encoding
is to be skipped in orc.</td>
- </tr>
- <tr>
- <td><h5>orc.dictionary.key.threshold</h5></td>
- <td style="word-wrap: break-word;">(none)</td>
- <td>Integer</td>
- <td>If the number of distinct keys in a dictionary is greater than
this fraction of the total number of non-null rows, turn off dictionary
encoding in orc. Use 1 to always use dictionary encoding.</td>
- </tr>
- <tr>
- <td><h5>orc.write.batch-size</h5></td>
- <td style="word-wrap: break-word;">1024</td>
- <td>Integer</td>
- <td>write batch size for orc.</td>
- </tr>
<tr>
<td><h5>page-size</h5></td>
<td style="word-wrap: break-word;">64 kb</td>
diff --git a/docs/layouts/shortcodes/generated/orc_configuration.html b/docs/layouts/shortcodes/generated/orc_configuration.html
new file mode 100644
index 000000000..92b1a9106
--- /dev/null
+++ b/docs/layouts/shortcodes/generated/orc_configuration.html
@@ -0,0 +1,60 @@
+{{/*
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements. See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership. The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied. See the License for the
+specific language governing permissions and limitations
+under the License.
+*/}}
+<table class="configuration table table-bordered">
+ <thead>
+ <tr>
+ <th class="text-left" style="width: 20%">Key</th>
+ <th class="text-left" style="width: 15%">Default</th>
+ <th class="text-left" style="width: 10%">Type</th>
+ <th class="text-left" style="width: 55%">Description</th>
+ </tr>
+ </thead>
+ <tbody>
+ <tr>
+ <td><h5>orc.column.encoding.direct</h5></td>
+ <td style="word-wrap: break-word;">(none)</td>
+ <td>Integer</td>
+ <td>Comma-separated list of fields for which dictionary encoding
is to be skipped in orc.</td>
+ </tr>
+ <tr>
+ <td><h5>orc.compress</h5></td>
+ <td style="word-wrap: break-word;">"lz4"</td>
+ <td>String</td>
+ <td>Define the compression codec for ORC file, if a higher
compression ratio is required, it is recommended to configure it as 'zstd', and
you can configure: orc.compression.zstd.level</td>
+ </tr>
+ <tr>
+ <td><h5>orc.compression.zstd.level</h5></td>
+ <td style="word-wrap: break-word;">1</td>
+ <td>Integer</td>
+ <td>Define the compression level to use with ZStandard codec while
writing data. The valid range is 1~22.</td>
+ </tr>
+ <tr>
+ <td><h5>orc.dictionary.key.threshold</h5></td>
+ <td style="word-wrap: break-word;">0.8</td>
+ <td>Double</td>
+ <td>If the number of distinct keys in a dictionary is greater than
this fraction of the total number of non-null rows, turn off dictionary
encoding in orc. Use 0 to always disable dictionary encoding. Use 1 to always
use dictionary encoding.</td>
+ </tr>
+ <tr>
+ <td><h5>orc.write.batch-size</h5></td>
+ <td style="word-wrap: break-word;">1024</td>
+ <td>Integer</td>
+ <td>write batch size for orc.</td>
+ </tr>
+ </tbody>
+</table>
diff --git a/docs/layouts/shortcodes/generated/spark_connector_configuration.html b/docs/layouts/shortcodes/generated/spark_connector_configuration.html
index 09d363aff..9e74cefbc 100644
--- a/docs/layouts/shortcodes/generated/spark_connector_configuration.html
+++ b/docs/layouts/shortcodes/generated/spark_connector_configuration.html
@@ -26,6 +26,12 @@ under the License.
</tr>
</thead>
<tbody>
+ <tr>
+ <td><h5>catalog.create-underlying-session-catalog</h5></td>
+ <td style="word-wrap: break-word;">false</td>
+ <td>Boolean</td>
+ <td>If true, create and use an underlying session catalog instead
of default session catalog when use SparkGenericCatalog.</td>
+ </tr>
<tr>
<td><h5>read.changelog</h5></td>
<td style="word-wrap: break-word;">false</td>
@@ -74,11 +80,5 @@ under the License.
<td>Boolean</td>
<td>If true, allow to merge data types if the two types meet the
rules for explicit casting.</td>
</tr>
- <tr>
- <td><h5>catalog.create-underlying-session-catalog</h5></td>
- <td style="word-wrap: break-word;">false</td>
- <td>Boolean</td>
- <td>If true, create and use an underlying session catalog instead
of default session catalog when use SparkGenericCatalog.</td>
- </tr>
</tbody>
</table>
diff --git a/paimon-common/src/main/java/org/apache/paimon/CoreOptions.java b/paimon-common/src/main/java/org/apache/paimon/CoreOptions.java
index 9deeb6324..05e931237 100644
--- a/paimon-common/src/main/java/org/apache/paimon/CoreOptions.java
+++ b/paimon-common/src/main/java/org/apache/paimon/CoreOptions.java
@@ -108,20 +108,6 @@ public class CoreOptions implements Serializable {
.withDescription(
"Specify the message format of data files,
currently orc, parquet and avro are supported.");
- public static final ConfigOption<String> ORC_BLOOM_FILTER_COLUMNS =
- key("orc.bloom.filter.columns")
- .stringType()
- .noDefaultValue()
- .withDescription(
- "A comma-separated list of columns for which to
create a bloom filter when writing.");
-
- public static final ConfigOption<Double> ORC_BLOOM_FILTER_FPP =
- key("orc.bloom.filter.fpp")
- .doubleType()
- .defaultValue(0.05)
- .withDescription(
- "Define the default false positive probability for
bloom filters.");
-
public static final ConfigOption<Map<String, String>>
FILE_COMPRESSION_PER_LEVEL =
key("file.compression.per.level")
.mapType()
@@ -745,12 +731,6 @@ public class CoreOptions implements Serializable {
.defaultValue(1024)
.withDescription("Read batch size for orc and parquet.");
- public static final ConfigOption<Integer> ORC_WRITE_BATCH_SIZE =
- key("orc.write.batch-size")
- .intType()
- .defaultValue(1024)
- .withDescription("write batch size for orc.");
-
public static final ConfigOption<String> CONSUMER_ID =
key("consumer-id")
.stringType()
@@ -1002,22 +982,6 @@ public class CoreOptions implements Serializable {
.noDefaultValue()
.withDescription("Turn off the dictionary encoding for all
fields in parquet.");
- public static final ConfigOption<Integer> ORC_COLUMN_ENCODING_DIRECT =
- key("orc.column.encoding.direct")
- .intType()
- .noDefaultValue()
- .withDescription(
- "Comma-separated list of fields for which
dictionary encoding is to be skipped in orc.");
-
- public static final ConfigOption<Integer> ORC_DICTIONARY_KEY_THRESHOLD =
- key("orc.dictionary.key.threshold")
- .intType()
- .noDefaultValue()
- .withDescription(
- "If the number of distinct keys in a dictionary is
greater than this "
- + "fraction of the total number of
non-null rows, turn off "
- + "dictionary encoding in orc. Use 1 to
always use dictionary encoding.");
-
public static final ConfigOption<String> SINK_WATERMARK_TIME_ZONE =
key("sink.watermark-time-zone")
.stringType()
@@ -1643,10 +1607,6 @@ public class CoreOptions implements Serializable {
return result;
}
- public int orcWriteBatch() {
- return options.getInteger(ORC_WRITE_BATCH_SIZE.key(),
ORC_WRITE_BATCH_SIZE.defaultValue());
- }
-
public boolean localMergeEnabled() {
return options.get(LOCAL_MERGE_BUFFER_SIZE) != null;
}
diff --git a/paimon-docs/src/main/java/org/apache/paimon/docs/configuration/ConfigOptionsDocGenerator.java b/paimon-docs/src/main/java/org/apache/paimon/docs/configuration/ConfigOptionsDocGenerator.java
index 37d8661d2..6f700724a 100644
--- a/paimon-docs/src/main/java/org/apache/paimon/docs/configuration/ConfigOptionsDocGenerator.java
+++ b/paimon-docs/src/main/java/org/apache/paimon/docs/configuration/ConfigOptionsDocGenerator.java
@@ -77,6 +77,7 @@ public class ConfigOptionsDocGenerator {
new OptionsClassLocation("paimon-core",
"org.apache.paimon.lookup"),
new OptionsClassLocation("paimon-core",
"org.apache.paimon.catalog"),
new OptionsClassLocation("paimon-core",
"org.apache.paimon.jdbc"),
+ new OptionsClassLocation("paimon-format",
"org.apache.paimon.format"),
new OptionsClassLocation(
"paimon-flink/paimon-flink-common",
"org.apache.paimon.flink"),
new OptionsClassLocation(
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/OrcOptions.java b/paimon-format/src/main/java/org/apache/paimon/format/OrcOptions.java
new file mode 100644
index 000000000..ef79ecd34
--- /dev/null
+++ b/paimon-format/src/main/java/org/apache/paimon/format/OrcOptions.java
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.format;
+
+import org.apache.paimon.options.ConfigOption;
+
+import org.apache.orc.OrcConf;
+
+import static org.apache.orc.OrcConf.COMPRESSION_ZSTD_LEVEL;
+import static org.apache.orc.OrcConf.DICTIONARY_KEY_SIZE_THRESHOLD;
+import static org.apache.orc.OrcConf.DIRECT_ENCODING_COLUMNS;
+import static org.apache.paimon.options.ConfigOptions.key;
+
+/** Options for orc format. */
+public class OrcOptions {
+
+ public static final ConfigOption<Integer> ORC_WRITE_BATCH_SIZE =
+ key("orc.write.batch-size")
+ .intType()
+ .defaultValue(1024)
+ .withDescription("write batch size for orc.");
+
+ public static final ConfigOption<String> ORC_COMPRESS =
+ key(OrcConf.COMPRESS.getAttribute())
+ .stringType()
+ .defaultValue("lz4")
+ .withDescription(
+ "Define the compression codec for ORC file, if a
higher compression ratio is required, "
+ + "it is recommended to configure it as
'zstd', and you can configure: "
+ + COMPRESSION_ZSTD_LEVEL.getAttribute());
+
+ public static final ConfigOption<Integer> ORC_COLUMN_ENCODING_DIRECT =
+ key(DIRECT_ENCODING_COLUMNS.getAttribute())
+ .intType()
+ .noDefaultValue()
+ .withDescription(
+ "Comma-separated list of fields for which
dictionary encoding is to be skipped in orc.");
+
+ public static final ConfigOption<Double> ORC_DICTIONARY_KEY_THRESHOLD =
+ key(DICTIONARY_KEY_SIZE_THRESHOLD.getAttribute())
+ .doubleType()
+ .defaultValue((Double)
DICTIONARY_KEY_SIZE_THRESHOLD.getDefaultValue())
+ .withDescription(
+ "If the number of distinct keys in a dictionary is
greater than this "
+ + "fraction of the total number of
non-null rows, turn off "
+ + "dictionary encoding in orc. Use 0 to
always disable dictionary encoding. "
+ + "Use 1 to always use dictionary
encoding.");
+
+ public static final ConfigOption<Integer> ORC_COMPRESSION_ZSTD_LEVEL =
+ key(COMPRESSION_ZSTD_LEVEL.getAttribute())
+ .intType()
+ .defaultValue((Integer)
COMPRESSION_ZSTD_LEVEL.getDefaultValue())
+ .withDescription(
+ "Define the compression level to use with
ZStandard codec while writing data. "
+ + "The valid range is 1~22.");
+}
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcWriterFactory.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcWriterFactory.java
index 63f555653..9a703bb18 100644
--- a/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcWriterFactory.java
+++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcWriterFactory.java
@@ -42,6 +42,7 @@ import java.util.Map;
import java.util.Properties;
import java.util.UUID;
+import static org.apache.paimon.format.OrcOptions.ORC_WRITE_BATCH_SIZE;
import static org.apache.paimon.utils.Preconditions.checkNotNull;
/**
@@ -128,7 +129,7 @@ public class OrcWriterFactory implements FormatWriterFactory {
vectorizer,
new WriterImpl(null, unusedPath, opts),
out,
- coreOptions.orcWriteBatch());
+ coreOptions.toConfiguration().get(ORC_WRITE_BATCH_SIZE));
}
@VisibleForTesting
diff --git a/paimon-format/src/test/java/org/apache/paimon/format/orc/writer/OrcBulkWriterTest.java b/paimon-format/src/test/java/org/apache/paimon/format/orc/writer/OrcBulkWriterTest.java
index 227d8a9d0..eccac13ea 100644
--- a/paimon-format/src/test/java/org/apache/paimon/format/orc/writer/OrcBulkWriterTest.java
+++ b/paimon-format/src/test/java/org/apache/paimon/format/orc/writer/OrcBulkWriterTest.java
@@ -18,7 +18,6 @@
package org.apache.paimon.format.orc.writer;
-import org.apache.paimon.CoreOptions;
import org.apache.paimon.format.FileFormat;
import org.apache.paimon.format.FormatWriter;
import org.apache.paimon.format.FormatWriterFactory;
@@ -37,12 +36,14 @@ import org.junit.jupiter.api.io.TempDir;
import java.io.IOException;
+import static org.apache.paimon.format.OrcOptions.ORC_WRITE_BATCH_SIZE;
+
class OrcBulkWriterTest {
@Test
void testRowBatch(@TempDir java.nio.file.Path tempDir) throws IOException {
Options options = new Options();
- options.set(CoreOptions.ORC_WRITE_BATCH_SIZE, 1);
+ options.set(ORC_WRITE_BATCH_SIZE, 1);
FileFormat orc = FileFormat.getFileFormat(options, "orc");
Assertions.assertThat(orc).isInstanceOf(OrcFileFormat.class);