This is an automated email from the ASF dual-hosted git repository.

westonpace pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 7e248b10b4 GH-34280: [C++][Python] Clarify meaning of row_group_size and change default to 1Mi (#34281)
7e248b10b4 is described below

commit 7e248b10b428ad1275045b229323672afba6c5e4
Author: Weston Pace <[email protected]>
AuthorDate: Mon Feb 27 05:14:49 2023 -0800

    GH-34280: [C++][Python] Clarify meaning of row_group_size and change default to 1Mi (#34281)
    
    BREAKING CHANGE: Changes the default row group size when writing parquet files.
    * Closes: #34280
    
    Authored-by: Weston Pace <[email protected]>
    Signed-off-by: Weston Pace <[email protected]>
---
 cpp/src/parquet/properties.h   |  6 +++---
 python/pyarrow/parquet/core.py | 22 +++++++++++-----------
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h
index e1550203ca..572194f4ee 100644
--- a/cpp/src/parquet/properties.h
+++ b/cpp/src/parquet/properties.h
@@ -132,7 +132,7 @@ static constexpr int64_t kDefaultDataPageSize = 1024 * 1024;
 static constexpr bool DEFAULT_IS_DICTIONARY_ENABLED = true;
 static constexpr int64_t DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT = kDefaultDataPageSize;
 static constexpr int64_t DEFAULT_WRITE_BATCH_SIZE = 1024;
-static constexpr int64_t DEFAULT_MAX_ROW_GROUP_LENGTH = 64 * 1024 * 1024;
+static constexpr int64_t DEFAULT_MAX_ROW_GROUP_LENGTH = 1024 * 1024;
 static constexpr bool DEFAULT_ARE_STATISTICS_ENABLED = true;
 static constexpr int64_t DEFAULT_MAX_STATISTICS_SIZE = 4096;
 static constexpr Encoding::type DEFAULT_ENCODING = Encoding::PLAIN;
@@ -264,8 +264,8 @@ class PARQUET_EXPORT WriterProperties {
       return this;
     }
 
-    /// Specify the max row group length.
-    /// Default 64M.
+    /// Specify the max number of rows to put in a single row group.
+    /// Default 1Mi rows.
     Builder* max_row_group_length(int64_t max_row_group_length) {
       max_row_group_length_ = max_row_group_length;
       return this;
diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py
index 1e991e8f08..eb81b976c9 100644
--- a/python/pyarrow/parquet/core.py
+++ b/python/pyarrow/parquet/core.py
@@ -1038,9 +1038,9 @@ Examples
         ----------
         table_or_batch : {RecordBatch, Table}
         row_group_size : int, default None
-            Maximum size of each written row group. If None, the
-            row group size will be the minimum of the input
-            table or batch length and 64 * 1024 * 1024.
+            Maximum number of rows in each written row group. If None,
+            the row group size will be the minimum of the input
+            table or batch length and 1024 * 1024.
         """
         if isinstance(table_or_batch, pa.RecordBatch):
             self.write_batch(table_or_batch, row_group_size)
@@ -1057,9 +1057,9 @@ Examples
         ----------
         batch : RecordBatch
         row_group_size : int, default None
-            Maximum size of each written row group. If None, the
+            Maximum number of rows in each written row group. If None, the
             row group size will be the minimum of the RecordBatch
-            size and 64 * 1024 * 1024.
+            size and 1024 * 1024.
         """
         table = pa.Table.from_batches([batch], batch.schema)
         self.write_table(table, row_group_size)
@@ -1072,9 +1072,9 @@ Examples
         ----------
         table : Table
         row_group_size : int, default None
-            Maximum size of each written row group. If None, the
-            row group size will be the minimum of the Table size
-            and 64 * 1024 * 1024.
+            Maximum number of rows in each written row group. If None,
+            the row group size will be the minimum of the Table size
+            and 1024 * 1024.
 
         """
         if self.schema_changed:
@@ -3153,9 +3153,9 @@ Parameters
 table : pyarrow.Table
 where : string or pyarrow.NativeFile
 row_group_size : int
-    Maximum size of each written row group. If None, the
-    row group size will be the minimum of the Table size
-    and 64 * 1024 * 1024.
+    Maximum number of rows in each written row group. If None, the
+    row group size will be the minimum of the Table size and
+    1024 * 1024.
 {}
 **kwargs : optional
     Additional options for ParquetWriter

Reply via email to