This is an automated email from the ASF dual-hosted git repository.
eladkal pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/airflow.git
The following commit(s) were added to refs/heads/main by this push:
new 681859c7bf Change default `parquet_row_group_size` in
`BaseSQLToGCSOperator` (#36817)
681859c7bf is described below
commit 681859c7bffabce0c294060d811db2fb16851816
Author: Renze Post <[email protected]>
AuthorDate: Thu Jan 18 15:20:12 2024 +0100
Change default `parquet_row_group_size` in `BaseSQLToGCSOperator` (#36817)
* Change default parquet_row_group_size in BaseSQLToGCSOperator
* Add change to changelog
* Added a better change description
* Remove unnecessary extra newline
* Applied suggested changes
Co-authored-by: Andrey Anshin <[email protected]>
* Applied suggested changes
Co-authored-by: Elad Kalif <[email protected]>
---------
Co-authored-by: Andrey Anshin <[email protected]>
Co-authored-by: Elad Kalif <[email protected]>
---
airflow/providers/google/CHANGELOG.rst | 10 ++++++++++
airflow/providers/google/cloud/transfers/sql_to_gcs.py | 4 ++--
2 files changed, 12 insertions(+), 2 deletions(-)
diff --git a/airflow/providers/google/CHANGELOG.rst
b/airflow/providers/google/CHANGELOG.rst
index 3187a397b4..9249209235 100644
--- a/airflow/providers/google/CHANGELOG.rst
+++ b/airflow/providers/google/CHANGELOG.rst
@@ -27,6 +27,16 @@
Changelog
---------
+.. note::
+ The default value of ``parquet_row_group_size`` in ``BaseSQLToGCSOperator``
has changed from 1 to
+ 100000, in order to have a default that provides better compression
efficiency and performance of
+ reading the data in the output Parquet files. In many cases, the previous
value of 1 resulted in
+ very large files, long task durations and out-of-memory issues. A default
value of 100000 may require
+ more memory to execute the operator, in which case users can override the
``parquet_row_group_size``
+ parameter in the operator. All operators that are derived from
``BaseSQLToGCSOperator`` are affected
+ when ``export_format`` is ``parquet``: ``MySQLToGCSOperator``,
``PrestoToGCSOperator``,
+ ``OracleToGCSOperator``, ``TrinoToGCSOperator``, ``MSSQLToGCSOperator`` and
``PostgresToGCSOperator``. Due to the above, we treat this change as a bug fix.
+
10.13.1
.......
diff --git a/airflow/providers/google/cloud/transfers/sql_to_gcs.py
b/airflow/providers/google/cloud/transfers/sql_to_gcs.py
index dcadaf7859..1529430c97 100644
--- a/airflow/providers/google/cloud/transfers/sql_to_gcs.py
+++ b/airflow/providers/google/cloud/transfers/sql_to_gcs.py
@@ -85,7 +85,7 @@ class BaseSQLToGCSOperator(BaseOperator):
:param parquet_row_group_size: The approximate number of rows in each row
group
when using parquet format. Using a large row group size can reduce the
file size
and improve the performance of reading the data, but it needs more
memory to
- execute the operator. (default: 1)
+ execute the operator. (default: 100000)
"""
template_fields: Sequence[str] = (
@@ -123,7 +123,7 @@ class BaseSQLToGCSOperator(BaseOperator):
exclude_columns: set | None = None,
partition_columns: list | None = None,
write_on_empty: bool = False,
- parquet_row_group_size: int = 1,
+ parquet_row_group_size: int = 100000,
**kwargs,
) -> None:
super().__init__(**kwargs)