This is an automated email from the ASF dual-hosted git repository.
alenka pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 9b36c709a5 GH-44790: [Python] Remove use_legacy_dataset from code base
(#45742)
9b36c709a5 is described below
commit 9b36c709a52caabd3579d006bb92379e1b263e52
Author: Alenka Frim <[email protected]>
AuthorDate: Wed Mar 12 11:07:19 2025 +0100
GH-44790: [Python] Remove use_legacy_dataset from code base (#45742)
### Rationale for this change
Legacy dataset in `ParquetDataset` has been deprecated since Arrow 15.0.0.
We should remove the warnings and the deprecated `use_legacy_dataset`
keyword arguments.
### What changes are included in this PR?
The warnings and the `use_legacy_dataset` keyword argument are removed from
`ParquetDataset`, together with the related test
`test_deprecated_use_legacy_dataset`.
### Are these changes tested?
Existing tests should pass.
### Are there any user-facing changes?
Legacy dataset is removed from `ParquetDataset`.
* GitHub Issue: #44790
Authored-by: AlenkaF <[email protected]>
Signed-off-by: AlenkaF <[email protected]>
---
python/pyarrow/parquet/core.py | 36 ++++--------------------------
python/pyarrow/tests/parquet/test_basic.py | 20 -----------------
2 files changed, 4 insertions(+), 52 deletions(-)
diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py
index 6ca6f7089e..8d3dec96a6 100644
--- a/python/pyarrow/parquet/core.py
+++ b/python/pyarrow/parquet/core.py
@@ -25,7 +25,6 @@ import json
import os
import re
import operator
-import warnings
import pyarrow as pa
@@ -1265,8 +1264,6 @@ thrift_container_size_limit : int, default None
sufficient for most Parquet files.
page_checksum_verification : bool, default False
If True, verify the page checksum for each page read from the file.
-use_legacy_dataset : bool, optional
- Deprecated and has no effect from PyArrow version 15.0.0.
Examples
--------
@@ -1279,14 +1276,7 @@ Examples
coerce_int96_timestamp_unit=None,
decryption_properties=None, thrift_string_size_limit=None,
thrift_container_size_limit=None,
- page_checksum_verification=False,
- use_legacy_dataset=None):
-
- if use_legacy_dataset is not None:
- warnings.warn(
- "Passing 'use_legacy_dataset' is deprecated as of pyarrow
15.0.0 "
- "and will be removed in a future version.",
- FutureWarning, stacklevel=2)
+ page_checksum_verification=False):
import pyarrow.dataset as ds
@@ -1653,8 +1643,6 @@ filters : pyarrow.compute.Expression or List[Tuple] or
List[List[Tuple]], defaul
Within-file level filtering and different partitioning schemes are
supported.
{3}
-use_legacy_dataset : bool, optional
- Deprecated and has no effect from PyArrow version 15.0.0.
ignore_prefixes : list, optional
Files matching any of these prefixes will be ignored by the
discovery process.
@@ -1776,19 +1764,12 @@ Read data from a single Parquet file:
def read_table(source, *, columns=None, use_threads=True,
schema=None, use_pandas_metadata=False, read_dictionary=None,
memory_map=False, buffer_size=0, partitioning="hive",
- filesystem=None, filters=None, use_legacy_dataset=None,
- ignore_prefixes=None, pre_buffer=True,
- coerce_int96_timestamp_unit=None,
+ filesystem=None, filters=None, ignore_prefixes=None,
+ pre_buffer=True, coerce_int96_timestamp_unit=None,
decryption_properties=None, thrift_string_size_limit=None,
thrift_container_size_limit=None,
page_checksum_verification=False):
- if use_legacy_dataset is not None:
- warnings.warn(
- "Passing 'use_legacy_dataset' is deprecated as of pyarrow 15.0.0 "
- "and will be removed in a future version.",
- FutureWarning, stacklevel=2)
-
try:
dataset = ParquetDataset(
source,
@@ -1991,8 +1972,7 @@ Examples
def write_to_dataset(table, root_path, partition_cols=None,
- filesystem=None, use_legacy_dataset=None,
- schema=None, partitioning=None,
+ filesystem=None, schema=None, partitioning=None,
basename_template=None, use_threads=None,
file_visitor=None, existing_data_behavior=None,
**kwargs):
@@ -2026,8 +2006,6 @@ def write_to_dataset(table, root_path,
partition_cols=None,
If nothing passed, will be inferred based on path.
Path will try to be found in the local on-disk filesystem otherwise
it will be parsed as an URI to determine the filesystem.
- use_legacy_dataset : bool, optional
- Deprecated and has no effect from PyArrow version 15.0.0.
schema : Schema, optional
This Schema of the dataset.
partitioning : Partitioning or list[str], optional
@@ -2114,12 +2092,6 @@ def write_to_dataset(table, root_path,
partition_cols=None,
>>> pq.ParquetDataset('dataset_name_4/').files
['dataset_name_4/...-0.parquet']
"""
- if use_legacy_dataset is not None:
- warnings.warn(
- "Passing 'use_legacy_dataset' is deprecated as of pyarrow 15.0.0 "
- "and will be removed in a future version.",
- FutureWarning, stacklevel=2)
-
metadata_collector = kwargs.pop('metadata_collector', None)
# Check for conflicting keywords
diff --git a/python/pyarrow/tests/parquet/test_basic.py
b/python/pyarrow/tests/parquet/test_basic.py
index 6496aa9909..43fddd413a 100644
--- a/python/pyarrow/tests/parquet/test_basic.py
+++ b/python/pyarrow/tests/parquet/test_basic.py
@@ -970,23 +970,3 @@ def test_checksum_write_to_dataset(tempdir):
# checksum verification enabled raises an exception
with pytest.raises(OSError, match="CRC checksum verification"):
_ = pq.read_table(corrupted_file_path, page_checksum_verification=True)
-
-
[email protected]
-def test_deprecated_use_legacy_dataset(tempdir):
- # Test that specifying use_legacy_dataset in ParquetDataset,
write_to_dataset
- # and read_table doesn't raise an error but gives a warning.
- table = pa.table({"a": [1, 2, 3]})
- path = tempdir / "deprecate_legacy"
-
- msg = "Passing 'use_legacy_dataset'"
- with pytest.warns(FutureWarning, match=msg):
- pq.write_to_dataset(table, path, use_legacy_dataset=False)
-
- pq.write_to_dataset(table, path)
-
- with pytest.warns(FutureWarning, match=msg):
- pq.read_table(path, use_legacy_dataset=False)
-
- with pytest.warns(FutureWarning, match=msg):
- pq.ParquetDataset(path, use_legacy_dataset=False)