kszucs commented on code in PR #45360:
URL: https://github.com/apache/arrow/pull/45360#discussion_r2025137614


##########
cpp/src/parquet/properties.h:
##########
@@ -275,10 +285,60 @@ class PARQUET_EXPORT WriterProperties {
           page_checksum_enabled_(properties.page_checksum_enabled()),
           size_statistics_level_(properties.size_statistics_level()),
           sorting_columns_(properties.sorting_columns()),
-          default_column_properties_(properties.default_column_properties()) {}
+          default_column_properties_(properties.default_column_properties()),
+          content_defined_chunking_enabled_(
+              properties.content_defined_chunking_enabled()),
+          content_defined_chunking_options_(
+              properties.content_defined_chunking_options()) {}
 
     virtual ~Builder() {}
 
+    /// \brief EXPERIMENTAL: Use content-defined page chunking for all columns.
+    ///
+    /// Optimize parquet files for content addressable storage (CAS) systems by writing
+    /// data pages according to content-defined chunk boundaries. This allows for more
+    /// efficient deduplication of data across files, hence more efficient network
+    /// transfers and storage. The chunking is based on a rolling hash algorithm that
+    /// identifies chunk boundaries based on the actual content of the data.
+    Builder* enable_content_defined_chunking() {
+      content_defined_chunking_enabled_ = true;
+      return this;
+    }
+
+    /// \brief EXPERIMENTAL: Disable content-defined page chunking for all columns.
+    Builder* disable_content_defined_chunking() {
+      content_defined_chunking_enabled_ = false;
+      return this;
+    }
+
+    /// \brief EXPERIMENTAL: Specify content-defined chunking options.
+    ///
+    /// \param min_chunk_size Minimum chunk size in bytes, default 256 KiB
+    /// The rolling hash will not be updated until this size is reached for each chunk.
+    /// Note that all data sent through the hash function is counted towards the chunk
+    /// size, including definition and repetition levels if present.
+    /// \param max_chunk_size Maximum chunk size in bytes, default 1024 KiB
+    /// The chunker will create a new chunk whenever the chunk size exceeds this value.
+    /// Note that the parquet writer has a related `pagesize` property that controls
+    /// the maximum size of a parquet data page after encoding. While setting
+    /// `pagesize` to a smaller value than `max_chunk_size` doesn't affect the
+    /// chunking effectiveness, it results in more small parquet data pages.
+    /// \param norm_factor Normalization factor to center the chunk size around the
+    /// average size more aggressively, default 0
+    /// Increasing the normalization factor increases the probability of finding a chunk,
+    /// improving the deduplication ratio, but also increasing the number of small chunks,
+    /// resulting in many small parquet data pages. The default value provides a good
+    /// balance between deduplication ratio and fragmentation. Use norm_factor=1 or
+    /// norm_factor=2 to reach a higher deduplication ratio at the expense of
+    /// fragmentation.
+    Builder* content_defined_chunking_options(
+        int64_t min_chunk_size, int64_t max_chunk_size,
+        int8_t norm_factor = kDefaultCdcOptions.norm_factor) {

Review Comment:
   Updated.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to