This is an automated email from the ASF dual-hosted git repository.

lzljs3620320 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-paimon.git


The following commit(s) were added to refs/heads/master by this push:
     new 1c0135475 [doc] Document dictionary option (#1851)
1c0135475 is described below

commit 1c01354755193c182a14fb70de4ef9d18950dab8
Author: wgcn <[email protected]>
AuthorDate: Mon Aug 21 09:54:34 2023 +0800

    [doc] Document dictionary option (#1851)
---
 docs/content/maintenance/write-performance.md      |  3 +++
 .../shortcodes/generated/core_configuration.html   | 18 ++++++++++++++++++
 .../main/java/org/apache/paimon/CoreOptions.java   | 22 ++++++++++++++++++++++
 3 files changed, 43 insertions(+)

diff --git a/docs/content/maintenance/write-performance.md 
b/docs/content/maintenance/write-performance.md
index 77ce6e246..27d8c7bbd 100644
--- a/docs/content/maintenance/write-performance.md
+++ b/docs/content/maintenance/write-performance.md
@@ -220,6 +220,9 @@ There are three main places in Paimon writer that takes up 
memory:
 * Memory consumed when merging several sorted runs for compaction. Can be 
adjusted by the `num-sorted-run.compaction-trigger` option to change the number 
of sorted runs to be merged.
 * If the row is very large, reading too many lines of data at once will 
consume a lot of memory when making a compaction. Reducing the 
`read.batch-size` option can alleviate the impact of this case.
 * The memory consumed by writing columnar (ORC, Parquet, etc.) file. 
Decreasing the `orc.write.batch-size` option can reduce the consumption of 
memory for ORC format.
+* If files are automatically compacted in the write task, dictionaries for 
certain large columns can consume a significant amount of memory during compaction.
+  * To disable dictionary encoding for all fields in Parquet format, set 
`'parquet.enable.dictionary' = 'false'`.
+  * To disable dictionary encoding for all fields in ORC format, set 
`orc.dictionary.key.threshold='0'`. Additionally, set 
`orc.column.encoding.direct='field1,field2'` to disable dictionary encoding for 
specific columns.
 
 If your Flink job does not rely on state, please avoid using managed memory, 
which you can control with the following Flink parameter:
 ```shell
diff --git a/docs/layouts/shortcodes/generated/core_configuration.html 
b/docs/layouts/shortcodes/generated/core_configuration.html
index 383501f8f..a54885ef3 100644
--- a/docs/layouts/shortcodes/generated/core_configuration.html
+++ b/docs/layouts/shortcodes/generated/core_configuration.html
@@ -310,6 +310,18 @@ This config option does not affect the default filesystem 
metastore.</td>
             <td>Double</td>
             <td>Define the default false positive probability for bloom 
filters.</td>
         </tr>
+        <tr>
+            <td><h5>orc.column.encoding.direct</h5></td>
+            <td style="word-wrap: break-word;">(none)</td>
+            <td>Integer</td>
+            <td>Comma-separated list of fields for which dictionary encoding 
is to be skipped in orc.</td>
+        </tr>
+        <tr>
+            <td><h5>orc.dictionary.key.threshold</h5></td>
+            <td style="word-wrap: break-word;">(none)</td>
+            <td>Integer</td>
+            <td>If the number of distinct keys in a dictionary is greater than 
this fraction of the total number of non-null rows, turn off dictionary 
encoding in orc.  Use 1 to always use dictionary encoding.</td>
+        </tr>
         <tr>
             <td><h5>orc.write.batch-size</h5></td>
             <td style="word-wrap: break-word;">1024</td>
@@ -322,6 +334,12 @@ This config option does not affect the default filesystem 
metastore.</td>
             <td>MemorySize</td>
             <td>Memory page size.</td>
         </tr>
+        <tr>
+            <td><h5>parquet.enable.dictionary</h5></td>
+            <td style="word-wrap: break-word;">(none)</td>
+            <td>Integer</td>
+            <td>Turn off the dictionary encoding for all fields in 
parquet.</td>
+        </tr>
         <tr>
             <td><h5>partial-update.ignore-delete</h5></td>
             <td style="word-wrap: break-word;">false</td>
diff --git a/paimon-common/src/main/java/org/apache/paimon/CoreOptions.java 
b/paimon-common/src/main/java/org/apache/paimon/CoreOptions.java
index 006bd0687..9c516273a 100644
--- a/paimon-common/src/main/java/org/apache/paimon/CoreOptions.java
+++ b/paimon-common/src/main/java/org/apache/paimon/CoreOptions.java
@@ -823,6 +823,28 @@ public class CoreOptions implements Serializable {
                     .noDefaultValue()
                     .withDescription("The maximum number of tags to retain.");
 
+    public static final ConfigOption<Integer> PARQUET_ENABLE_DICTIONARY =
+            key("parquet.enable.dictionary")
+                    .intType()
+                    .noDefaultValue()
+                    .withDescription("Turn off the dictionary encoding for all 
fields in parquet.");
+
+    public static final ConfigOption<Integer> ORC_COLUMN_ENCODING_DIRECT =
+            key("orc.column.encoding.direct")
+                    .intType()
+                    .noDefaultValue()
+                    .withDescription(
+                            "Comma-separated list of fields for which 
dictionary encoding is to be skipped in orc.");
+
+    public static final ConfigOption<Integer> ORC_DICTIONARY_KEY_THRESHOLD =
+            key("orc.dictionary.key.threshold")
+                    .intType()
+                    .noDefaultValue()
+                    .withDescription(
+                            "If the number of distinct keys in a dictionary is 
greater than this "
+                                    + "fraction of the total number of 
non-null rows, turn off "
+                                    + "dictionary encoding in orc.  Use 1 to 
always use dictionary encoding.");
+
     public static final ConfigOption<String> SINK_WATERMARK_TIME_ZONE =
             key("sink.watermark-time-zone")
                     .stringType()

Reply via email to