This is an automated email from the ASF dual-hosted git repository.
fokko pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-python.git
The following commit(s) were added to refs/heads/main by this push:
new 7a1fe28 Enable setting `{page,row-group}` limit (#390)
7a1fe28 is described below
commit 7a1fe28df38bfdc69166915dca49cfde660853bf
Author: Fokko Driesprong <[email protected]>
AuthorDate: Thu Feb 8 11:13:03 2024 +0100
Enable setting `{page,row-group}` limit (#390)
---
mkdocs/docs/configuration.md | 2 ++
pyiceberg/io/pyarrow.py | 12 +++++++++++-
pyiceberg/table/__init__.py | 3 +++
3 files changed, 16 insertions(+), 1 deletion(-)
diff --git a/mkdocs/docs/configuration.md b/mkdocs/docs/configuration.md
index ce17931..8acc0a9 100644
--- a/mkdocs/docs/configuration.md
+++ b/mkdocs/docs/configuration.md
@@ -57,7 +57,9 @@ Iceberg tables support table properties to configure table
behavior.
| `write.parquet.compression-codec` | `{uncompressed,zstd,gzip,snappy}` | zstd
| Sets the Parquet compression codec.
|
| `write.parquet.compression-level` | Integer | null
| Parquet compression level for the codec. If not set, it is up to PyIceberg
|
| `write.parquet.page-size-bytes` | Size in bytes | 1MB
| Set a target threshold for the approximate encoded size of data pages
within a column chunk |
+| `write.parquet.page-row-limit` | Number of rows |
20000 | Set a target threshold for the maximum number of rows in a data page
within a column chunk |
| `write.parquet.dict-size-bytes` | Size in bytes | 2MB
| Set the dictionary page size limit per row group
|
+| `write.parquet.row-group-limit` | Number of rows |
122880 | The Parquet row group limit
|
# FileIO
diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py
index ee8f63f..8657144 100644
--- a/pyiceberg/io/pyarrow.py
+++ b/pyiceberg/io/pyarrow.py
@@ -1730,9 +1730,14 @@ def write_file(table: Table, tasks: Iterator[WriteTask])
-> Iterator[DataFile]:
file_schema = schema_to_pyarrow(table.schema())
fo = table.io.new_output(file_path)
+ row_group_size = PropertyUtil.property_as_int(
+ properties=table.properties,
+ property_name=TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES,
+ default=TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES_DEFAULT,
+ )
with fo.create(overwrite=True) as fos:
with pq.ParquetWriter(fos, schema=file_schema,
**parquet_writer_kwargs) as writer:
- writer.write_table(task.df)
+ writer.write_table(task.df, row_group_size=row_group_size)
data_file = DataFile(
content=DataFileContent.DATA,
@@ -1795,4 +1800,9 @@ def _get_parquet_writer_kwargs(table_properties:
Properties) -> Dict[str, Any]:
property_name=TableProperties.PARQUET_DICT_SIZE_BYTES,
default=TableProperties.PARQUET_DICT_SIZE_BYTES_DEFAULT,
),
+ "write_batch_size": PropertyUtil.property_as_int(
+ properties=table_properties,
+ property_name=TableProperties.PARQUET_PAGE_ROW_LIMIT,
+ default=TableProperties.PARQUET_PAGE_ROW_LIMIT_DEFAULT,
+ ),
}
diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py
index 8670bc2..cc4bbf5 100644
--- a/pyiceberg/table/__init__.py
+++ b/pyiceberg/table/__init__.py
@@ -137,6 +137,9 @@ class TableProperties:
PARQUET_ROW_GROUP_SIZE_BYTES = "write.parquet.row-group-size-bytes"
PARQUET_ROW_GROUP_SIZE_BYTES_DEFAULT = 128 * 1024 * 1024 # 128 MB
+ PARQUET_ROW_GROUP_LIMIT = "write.parquet.row-group-limit"
+ PARQUET_ROW_GROUP_LIMIT_DEFAULT = 128 * 1024 * 1024 # number of rows, not bytes; NOTE(review): docs list 122880 as the default - confirm
+
PARQUET_PAGE_SIZE_BYTES = "write.parquet.page-size-bytes"
PARQUET_PAGE_SIZE_BYTES_DEFAULT = 1024 * 1024 # 1 MB