This is an automated email from the ASF dual-hosted git repository.
pitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 2a89d03bbe GH-49923: [Parquet][Python] Inconsistent default values for
Parquet pre_buffer (#49924)
2a89d03bbe is described below
commit 2a89d03bbefd620b42126b8e00f8ae57e99cd638
Author: Rok Mihevc <[email protected]>
AuthorDate: Wed May 6 16:27:58 2026 +0200
GH-49923: [Parquet][Python] Inconsistent default values for Parquet
pre_buffer (#49924)
### Rationale for this change
The default value of `pre_buffer` is inconsistent across the Parquet reading
APIs. It should be made consistent by defaulting to `True` everywhere.
### What changes are included in this PR?
A couple of default parameter changes (`pre_buffer` now defaults to `True` in
`ParquetReader.open` and `ParquetFile`) and an updated `ParquetFile` docstring.
### Are these changes tested?
By CI.
### Are there any user-facing changes?
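Yes. `ParquetFile` and `ParquetReader.open` now default to `pre_buffer=True`,
so users who relied on the previous default will see a change in behavior
(pass `pre_buffer=False` to restore it). It's hard to estimate how disruptive
(if at all) this will be.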
* GitHub Issue: #49923
Authored-by: Rok Mihevc <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
python/pyarrow/_parquet.pyx | 4 ++--
python/pyarrow/parquet/core.py | 11 +++++++----
2 files changed, 9 insertions(+), 6 deletions(-)
diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index afd85da1ef..1448d90ec5 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -1565,7 +1565,7 @@ cdef class ParquetReader(_Weakrefable):
def open(self, object source not None, *, bint use_memory_map=False,
read_dictionary=None, binary_type=None, list_type=None,
FileMetaData metadata=None,
- int buffer_size=0, bint pre_buffer=False,
+ int buffer_size=0, bint pre_buffer=True,
coerce_int96_timestamp_unit=None,
FileDecryptionProperties decryption_properties=None,
thrift_string_size_limit=None,
@@ -1584,7 +1584,7 @@ cdef class ParquetReader(_Weakrefable):
list_type : subclass of pyarrow.DataType, optional
metadata : FileMetaData, optional
buffer_size : int, default 0
- pre_buffer : bool, default False
+ pre_buffer : bool, default True
coerce_int96_timestamp_unit : str, optional
decryption_properties : FileDecryptionProperties, optional
thrift_string_size_limit : int, optional
diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py
index 5234976a92..080bfa55c2 100644
--- a/python/pyarrow/parquet/core.py
+++ b/python/pyarrow/parquet/core.py
@@ -236,10 +236,13 @@ class ParquetFile:
buffer_size : int, default 0
If positive, perform read buffering when deserializing individual
column chunks. Otherwise IO calls are unbuffered.
- pre_buffer : bool, default False
+ pre_buffer : bool, default True
Coalesce and issue file reads in parallel to improve performance on
- high-latency filesystems (e.g. S3). If True, Arrow will use a
- background I/O thread pool.
+ high-latency filesystems (e.g. S3, GCS). If True, Arrow will use a
+ background I/O thread pool. If using a filesystem layer that itself
+ performs readahead (e.g. fsspec's S3FS), disable readahead for best
+ results. Set to False if you want to prioritize minimal memory usage
+ over maximum speed.
coerce_int96_timestamp_unit : str, default None
Cast timestamps that are stored in INT96 format to a particular
resolution (e.g. 'ms'). Setting to None is equivalent to 'ns'
@@ -310,7 +313,7 @@ class ParquetFile:
def __init__(self, source, *, metadata=None, common_metadata=None,
read_dictionary=None, binary_type=None, list_type=None,
- memory_map=False, buffer_size=0, pre_buffer=False,
+ memory_map=False, buffer_size=0, pre_buffer=True,
coerce_int96_timestamp_unit=None,
decryption_properties=None, thrift_string_size_limit=None,
thrift_container_size_limit=None, filesystem=None,