This is an automated email from the ASF dual-hosted git repository.

kerwin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-paimon.git


The following commit(s) were added to refs/heads/master by this push:
     new b87905497 [doc] Introduce orc options documentation page (#3061)
b87905497 is described below

commit b879054979ecad67ea7e8a48cede5f1a7cc50ec2
Author: Jingsong Lee <[email protected]>
AuthorDate: Thu Mar 21 08:44:34 2024 +0800

    [doc] Introduce orc options documentation page (#3061)
---
 docs/content/maintenance/configurations.md         |  6 +-
 .../shortcodes/generated/core_configuration.html   | 30 ---------
 .../shortcodes/generated/orc_configuration.html    | 60 ++++++++++++++++++
 .../generated/spark_connector_configuration.html   | 12 ++--
 .../main/java/org/apache/paimon/CoreOptions.java   | 40 ------------
 .../configuration/ConfigOptionsDocGenerator.java   |  1 +
 .../java/org/apache/paimon/format/OrcOptions.java  | 72 ++++++++++++++++++++++
 .../apache/paimon/format/orc/OrcWriterFactory.java |  3 +-
 .../format/orc/writer/OrcBulkWriterTest.java       |  5 +-
 9 files changed, 149 insertions(+), 80 deletions(-)

diff --git a/docs/content/maintenance/configurations.md 
b/docs/content/maintenance/configurations.md
index 87d59cd34..564b7c44c 100644
--- a/docs/content/maintenance/configurations.md
+++ b/docs/content/maintenance/configurations.md
@@ -74,7 +74,11 @@ Spark connector options for paimon.
 
 {{< generated/spark_connector_configuration >}}
 
-## RocksDB Options
+### ORC Options
+
+{{< generated/orc_configuration >}}
+
+### RocksDB Options
 
 The following options allow users to finely adjust RocksDB for better 
performance. You can either specify them in table properties or in dynamic 
table hints.
 
diff --git a/docs/layouts/shortcodes/generated/core_configuration.html 
b/docs/layouts/shortcodes/generated/core_configuration.html
index ecf92c2f1..3c2dbcaae 100644
--- a/docs/layouts/shortcodes/generated/core_configuration.html
+++ b/docs/layouts/shortcodes/generated/core_configuration.html
@@ -383,36 +383,6 @@ This config option does not affect the default filesystem 
metastore.</td>
             <td>Integer</td>
             <td>The number of sorted runs that trigger the stopping of writes, 
the default value is 'num-sorted-run.compaction-trigger' + 1.</td>
         </tr>
-        <tr>
-            <td><h5>orc.bloom.filter.columns</h5></td>
-            <td style="word-wrap: break-word;">(none)</td>
-            <td>String</td>
-            <td>A comma-separated list of columns for which to create a bloom 
filter when writing.</td>
-        </tr>
-        <tr>
-            <td><h5>orc.bloom.filter.fpp</h5></td>
-            <td style="word-wrap: break-word;">0.05</td>
-            <td>Double</td>
-            <td>Define the default false positive probability for bloom 
filters.</td>
-        </tr>
-        <tr>
-            <td><h5>orc.column.encoding.direct</h5></td>
-            <td style="word-wrap: break-word;">(none)</td>
-            <td>Integer</td>
-            <td>Comma-separated list of fields for which dictionary encoding 
is to be skipped in orc.</td>
-        </tr>
-        <tr>
-            <td><h5>orc.dictionary.key.threshold</h5></td>
-            <td style="word-wrap: break-word;">(none)</td>
-            <td>Integer</td>
-            <td>If the number of distinct keys in a dictionary is greater than 
this fraction of the total number of non-null rows, turn off dictionary 
encoding in orc.  Use 1 to always use dictionary encoding.</td>
-        </tr>
-        <tr>
-            <td><h5>orc.write.batch-size</h5></td>
-            <td style="word-wrap: break-word;">1024</td>
-            <td>Integer</td>
-            <td>write batch size for orc.</td>
-        </tr>
         <tr>
             <td><h5>page-size</h5></td>
             <td style="word-wrap: break-word;">64 kb</td>
diff --git a/docs/layouts/shortcodes/generated/orc_configuration.html 
b/docs/layouts/shortcodes/generated/orc_configuration.html
new file mode 100644
index 000000000..92b1a9106
--- /dev/null
+++ b/docs/layouts/shortcodes/generated/orc_configuration.html
@@ -0,0 +1,60 @@
+{{/*
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+*/}}
+<table class="configuration table table-bordered">
+    <thead>
+        <tr>
+            <th class="text-left" style="width: 20%">Key</th>
+            <th class="text-left" style="width: 15%">Default</th>
+            <th class="text-left" style="width: 10%">Type</th>
+            <th class="text-left" style="width: 55%">Description</th>
+        </tr>
+    </thead>
+    <tbody>
+        <tr>
+            <td><h5>orc.column.encoding.direct</h5></td>
+            <td style="word-wrap: break-word;">(none)</td>
+            <td>Integer</td>
+            <td>Comma-separated list of fields for which dictionary encoding 
is to be skipped in orc.</td>
+        </tr>
+        <tr>
+            <td><h5>orc.compress</h5></td>
+            <td style="word-wrap: break-word;">"lz4"</td>
+            <td>String</td>
+            <td>Define the compression codec for ORC file, if a higher 
compression ratio is required, it is recommended to configure it as 'zstd', and 
you can configure: orc.compression.zstd.level</td>
+        </tr>
+        <tr>
+            <td><h5>orc.compression.zstd.level</h5></td>
+            <td style="word-wrap: break-word;">1</td>
+            <td>Integer</td>
+            <td>Define the compression level to use with ZStandard codec while 
writing data. The valid range is 1~22.</td>
+        </tr>
+        <tr>
+            <td><h5>orc.dictionary.key.threshold</h5></td>
+            <td style="word-wrap: break-word;">0.8</td>
+            <td>Double</td>
+            <td>If the number of distinct keys in a dictionary is greater than 
this fraction of the total number of non-null rows, turn off dictionary 
encoding in orc. Use 0 to always disable dictionary encoding. Use 1 to always 
use dictionary encoding.</td>
+        </tr>
+        <tr>
+            <td><h5>orc.write.batch-size</h5></td>
+            <td style="word-wrap: break-word;">1024</td>
+            <td>Integer</td>
+            <td>write batch size for orc.</td>
+        </tr>
+    </tbody>
+</table>
diff --git 
a/docs/layouts/shortcodes/generated/spark_connector_configuration.html 
b/docs/layouts/shortcodes/generated/spark_connector_configuration.html
index 09d363aff..9e74cefbc 100644
--- a/docs/layouts/shortcodes/generated/spark_connector_configuration.html
+++ b/docs/layouts/shortcodes/generated/spark_connector_configuration.html
@@ -26,6 +26,12 @@ under the License.
         </tr>
     </thead>
     <tbody>
+        <tr>
+            <td><h5>catalog.create-underlying-session-catalog</h5></td>
+            <td style="word-wrap: break-word;">false</td>
+            <td>Boolean</td>
+            <td>If true, create and use an underlying session catalog instead 
of default session catalog when use SparkGenericCatalog.</td>
+        </tr>
         <tr>
             <td><h5>read.changelog</h5></td>
             <td style="word-wrap: break-word;">false</td>
@@ -74,11 +80,5 @@ under the License.
             <td>Boolean</td>
             <td>If true, allow to merge data types if the two types meet the 
rules for explicit casting.</td>
         </tr>
-        <tr>
-            <td><h5>catalog.create-underlying-session-catalog</h5></td>
-            <td style="word-wrap: break-word;">false</td>
-            <td>Boolean</td>
-            <td>If true, create and use an underlying session catalog instead 
of default session catalog when use SparkGenericCatalog.</td>
-        </tr>
     </tbody>
 </table>
diff --git a/paimon-common/src/main/java/org/apache/paimon/CoreOptions.java 
b/paimon-common/src/main/java/org/apache/paimon/CoreOptions.java
index 9deeb6324..05e931237 100644
--- a/paimon-common/src/main/java/org/apache/paimon/CoreOptions.java
+++ b/paimon-common/src/main/java/org/apache/paimon/CoreOptions.java
@@ -108,20 +108,6 @@ public class CoreOptions implements Serializable {
                     .withDescription(
                             "Specify the message format of data files, 
currently orc, parquet and avro are supported.");
 
-    public static final ConfigOption<String> ORC_BLOOM_FILTER_COLUMNS =
-            key("orc.bloom.filter.columns")
-                    .stringType()
-                    .noDefaultValue()
-                    .withDescription(
-                            "A comma-separated list of columns for which to 
create a bloom filter when writing.");
-
-    public static final ConfigOption<Double> ORC_BLOOM_FILTER_FPP =
-            key("orc.bloom.filter.fpp")
-                    .doubleType()
-                    .defaultValue(0.05)
-                    .withDescription(
-                            "Define the default false positive probability for 
bloom filters.");
-
     public static final ConfigOption<Map<String, String>> 
FILE_COMPRESSION_PER_LEVEL =
             key("file.compression.per.level")
                     .mapType()
@@ -745,12 +731,6 @@ public class CoreOptions implements Serializable {
                     .defaultValue(1024)
                     .withDescription("Read batch size for orc and parquet.");
 
-    public static final ConfigOption<Integer> ORC_WRITE_BATCH_SIZE =
-            key("orc.write.batch-size")
-                    .intType()
-                    .defaultValue(1024)
-                    .withDescription("write batch size for orc.");
-
     public static final ConfigOption<String> CONSUMER_ID =
             key("consumer-id")
                     .stringType()
@@ -1002,22 +982,6 @@ public class CoreOptions implements Serializable {
                     .noDefaultValue()
                     .withDescription("Turn off the dictionary encoding for all 
fields in parquet.");
 
-    public static final ConfigOption<Integer> ORC_COLUMN_ENCODING_DIRECT =
-            key("orc.column.encoding.direct")
-                    .intType()
-                    .noDefaultValue()
-                    .withDescription(
-                            "Comma-separated list of fields for which 
dictionary encoding is to be skipped in orc.");
-
-    public static final ConfigOption<Integer> ORC_DICTIONARY_KEY_THRESHOLD =
-            key("orc.dictionary.key.threshold")
-                    .intType()
-                    .noDefaultValue()
-                    .withDescription(
-                            "If the number of distinct keys in a dictionary is 
greater than this "
-                                    + "fraction of the total number of 
non-null rows, turn off "
-                                    + "dictionary encoding in orc.  Use 1 to 
always use dictionary encoding.");
-
     public static final ConfigOption<String> SINK_WATERMARK_TIME_ZONE =
             key("sink.watermark-time-zone")
                     .stringType()
@@ -1643,10 +1607,6 @@ public class CoreOptions implements Serializable {
         return result;
     }
 
-    public int orcWriteBatch() {
-        return options.getInteger(ORC_WRITE_BATCH_SIZE.key(), 
ORC_WRITE_BATCH_SIZE.defaultValue());
-    }
-
     public boolean localMergeEnabled() {
         return options.get(LOCAL_MERGE_BUFFER_SIZE) != null;
     }
diff --git 
a/paimon-docs/src/main/java/org/apache/paimon/docs/configuration/ConfigOptionsDocGenerator.java
 
b/paimon-docs/src/main/java/org/apache/paimon/docs/configuration/ConfigOptionsDocGenerator.java
index 37d8661d2..6f700724a 100644
--- 
a/paimon-docs/src/main/java/org/apache/paimon/docs/configuration/ConfigOptionsDocGenerator.java
+++ 
b/paimon-docs/src/main/java/org/apache/paimon/docs/configuration/ConfigOptionsDocGenerator.java
@@ -77,6 +77,7 @@ public class ConfigOptionsDocGenerator {
                 new OptionsClassLocation("paimon-core", 
"org.apache.paimon.lookup"),
                 new OptionsClassLocation("paimon-core", 
"org.apache.paimon.catalog"),
                 new OptionsClassLocation("paimon-core", 
"org.apache.paimon.jdbc"),
+                new OptionsClassLocation("paimon-format", 
"org.apache.paimon.format"),
                 new OptionsClassLocation(
                         "paimon-flink/paimon-flink-common", 
"org.apache.paimon.flink"),
                 new OptionsClassLocation(
diff --git 
a/paimon-format/src/main/java/org/apache/paimon/format/OrcOptions.java 
b/paimon-format/src/main/java/org/apache/paimon/format/OrcOptions.java
new file mode 100644
index 000000000..ef79ecd34
--- /dev/null
+++ b/paimon-format/src/main/java/org/apache/paimon/format/OrcOptions.java
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.format;
+
+import org.apache.paimon.options.ConfigOption;
+
+import org.apache.orc.OrcConf;
+
+import static org.apache.orc.OrcConf.COMPRESSION_ZSTD_LEVEL;
+import static org.apache.orc.OrcConf.DICTIONARY_KEY_SIZE_THRESHOLD;
+import static org.apache.orc.OrcConf.DIRECT_ENCODING_COLUMNS;
+import static org.apache.paimon.options.ConfigOptions.key;
+
+/** Options for orc format. */
+public class OrcOptions {
+
+    public static final ConfigOption<Integer> ORC_WRITE_BATCH_SIZE =
+            key("orc.write.batch-size")
+                    .intType()
+                    .defaultValue(1024)
+                    .withDescription("write batch size for orc.");
+
+    public static final ConfigOption<String> ORC_COMPRESS =
+            key(OrcConf.COMPRESS.getAttribute())
+                    .stringType()
+                    .defaultValue("lz4")
+                    .withDescription(
+                            "Define the compression codec for ORC file, if a 
higher compression ratio is required, "
+                                    + "it is recommended to configure it as 
'zstd', and you can configure: "
+                                    + COMPRESSION_ZSTD_LEVEL.getAttribute());
+
+    public static final ConfigOption<Integer> ORC_COLUMN_ENCODING_DIRECT =
+            key(DIRECT_ENCODING_COLUMNS.getAttribute())
+                    .intType()
+                    .noDefaultValue()
+                    .withDescription(
+                            "Comma-separated list of fields for which 
dictionary encoding is to be skipped in orc.");
+
+    public static final ConfigOption<Double> ORC_DICTIONARY_KEY_THRESHOLD =
+            key(DICTIONARY_KEY_SIZE_THRESHOLD.getAttribute())
+                    .doubleType()
+                    .defaultValue((Double) 
DICTIONARY_KEY_SIZE_THRESHOLD.getDefaultValue())
+                    .withDescription(
+                            "If the number of distinct keys in a dictionary is 
greater than this "
+                                    + "fraction of the total number of 
non-null rows, turn off "
+                                    + "dictionary encoding in orc. Use 0 to 
always disable dictionary encoding. "
+                                    + "Use 1 to always use dictionary 
encoding.");
+
+    public static final ConfigOption<Integer> ORC_COMPRESSION_ZSTD_LEVEL =
+            key(COMPRESSION_ZSTD_LEVEL.getAttribute())
+                    .intType()
+                    .defaultValue((Integer) 
COMPRESSION_ZSTD_LEVEL.getDefaultValue())
+                    .withDescription(
+                            "Define the compression level to use with 
ZStandard codec while writing data. "
+                                    + "The valid range is 1~22.");
+}
diff --git 
a/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcWriterFactory.java
 
b/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcWriterFactory.java
index 63f555653..9a703bb18 100644
--- 
a/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcWriterFactory.java
+++ 
b/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcWriterFactory.java
@@ -42,6 +42,7 @@ import java.util.Map;
 import java.util.Properties;
 import java.util.UUID;
 
+import static org.apache.paimon.format.OrcOptions.ORC_WRITE_BATCH_SIZE;
 import static org.apache.paimon.utils.Preconditions.checkNotNull;
 
 /**
@@ -128,7 +129,7 @@ public class OrcWriterFactory implements 
FormatWriterFactory {
                 vectorizer,
                 new WriterImpl(null, unusedPath, opts),
                 out,
-                coreOptions.orcWriteBatch());
+                coreOptions.toConfiguration().get(ORC_WRITE_BATCH_SIZE));
     }
 
     @VisibleForTesting
diff --git 
a/paimon-format/src/test/java/org/apache/paimon/format/orc/writer/OrcBulkWriterTest.java
 
b/paimon-format/src/test/java/org/apache/paimon/format/orc/writer/OrcBulkWriterTest.java
index 227d8a9d0..eccac13ea 100644
--- 
a/paimon-format/src/test/java/org/apache/paimon/format/orc/writer/OrcBulkWriterTest.java
+++ 
b/paimon-format/src/test/java/org/apache/paimon/format/orc/writer/OrcBulkWriterTest.java
@@ -18,7 +18,6 @@
 
 package org.apache.paimon.format.orc.writer;
 
-import org.apache.paimon.CoreOptions;
 import org.apache.paimon.format.FileFormat;
 import org.apache.paimon.format.FormatWriter;
 import org.apache.paimon.format.FormatWriterFactory;
@@ -37,12 +36,14 @@ import org.junit.jupiter.api.io.TempDir;
 
 import java.io.IOException;
 
+import static org.apache.paimon.format.OrcOptions.ORC_WRITE_BATCH_SIZE;
+
 class OrcBulkWriterTest {
 
     @Test
     void testRowBatch(@TempDir java.nio.file.Path tempDir) throws IOException {
         Options options = new Options();
-        options.set(CoreOptions.ORC_WRITE_BATCH_SIZE, 1);
+        options.set(ORC_WRITE_BATCH_SIZE, 1);
         FileFormat orc = FileFormat.getFileFormat(options, "orc");
         Assertions.assertThat(orc).isInstanceOf(OrcFileFormat.class);
 

Reply via email to