This is an automated email from the ASF dual-hosted git repository.
westonpace pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 7e248b10b4 GH-34280: [C++][Python] Clarify meaning of row_group_size
and change default to 1Mi (#34281)
7e248b10b4 is described below
commit 7e248b10b428ad1275045b229323672afba6c5e4
Author: Weston Pace <[email protected]>
AuthorDate: Mon Feb 27 05:14:49 2023 -0800
GH-34280: [C++][Python] Clarify meaning of row_group_size and change
default to 1Mi (#34281)
BREAKING CHANGE: Changes the default row group size when writing parquet
files.
* Closes: #34280
Authored-by: Weston Pace <[email protected]>
Signed-off-by: Weston Pace <[email protected]>
---
cpp/src/parquet/properties.h | 6 +++---
python/pyarrow/parquet/core.py | 22 +++++++++++-----------
2 files changed, 14 insertions(+), 14 deletions(-)
diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h
index e1550203ca..572194f4ee 100644
--- a/cpp/src/parquet/properties.h
+++ b/cpp/src/parquet/properties.h
@@ -132,7 +132,7 @@ static constexpr int64_t kDefaultDataPageSize = 1024 * 1024;
static constexpr bool DEFAULT_IS_DICTIONARY_ENABLED = true;
static constexpr int64_t DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT =
kDefaultDataPageSize;
static constexpr int64_t DEFAULT_WRITE_BATCH_SIZE = 1024;
-static constexpr int64_t DEFAULT_MAX_ROW_GROUP_LENGTH = 64 * 1024 * 1024;
+static constexpr int64_t DEFAULT_MAX_ROW_GROUP_LENGTH = 1024 * 1024;
static constexpr bool DEFAULT_ARE_STATISTICS_ENABLED = true;
static constexpr int64_t DEFAULT_MAX_STATISTICS_SIZE = 4096;
static constexpr Encoding::type DEFAULT_ENCODING = Encoding::PLAIN;
@@ -264,8 +264,8 @@ class PARQUET_EXPORT WriterProperties {
return this;
}
- /// Specify the max row group length.
- /// Default 64M.
+ /// Specify the max number of rows to put in a single row group.
+ /// Default 1Mi rows.
Builder* max_row_group_length(int64_t max_row_group_length) {
max_row_group_length_ = max_row_group_length;
return this;
diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py
index 1e991e8f08..eb81b976c9 100644
--- a/python/pyarrow/parquet/core.py
+++ b/python/pyarrow/parquet/core.py
@@ -1038,9 +1038,9 @@ Examples
----------
table_or_batch : {RecordBatch, Table}
row_group_size : int, default None
- Maximum size of each written row group. If None, the
- row group size will be the minimum of the input
- table or batch length and 64 * 1024 * 1024.
+ Maximum number of rows in each written row group. If None,
+ the row group size will be the minimum of the input
+ table or batch length and 1024 * 1024.
"""
if isinstance(table_or_batch, pa.RecordBatch):
self.write_batch(table_or_batch, row_group_size)
@@ -1057,9 +1057,9 @@ Examples
----------
batch : RecordBatch
row_group_size : int, default None
- Maximum size of each written row group. If None, the
+ Maximum number of rows in each written row group. If None, the
row group size will be the minimum of the RecordBatch
- size and 64 * 1024 * 1024.
+ size and 1024 * 1024.
"""
table = pa.Table.from_batches([batch], batch.schema)
self.write_table(table, row_group_size)
@@ -1072,9 +1072,9 @@ Examples
----------
table : Table
row_group_size : int, default None
- Maximum size of each written row group. If None, the
- row group size will be the minimum of the Table size
- and 64 * 1024 * 1024.
+ Maximum number of rows in each written row group. If None,
+ the row group size will be the minimum of the Table size
+ and 1024 * 1024.
"""
if self.schema_changed:
@@ -3153,9 +3153,9 @@ Parameters
table : pyarrow.Table
where : string or pyarrow.NativeFile
row_group_size : int
- Maximum size of each written row group. If None, the
- row group size will be the minimum of the Table size
- and 64 * 1024 * 1024.
+ Maximum number of rows in each written row group. If None, the
+ row group size will be the minimum of the Table size and
+ 1024 * 1024.
{}
**kwargs : optional
Additional options for ParquetWriter