This is an automated email from the ASF dual-hosted git repository.
alenka pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 9b36c709a5 GH-44790: [Python] Remove use_legacy_dataset from code base
(#45742)
9b36c709a5 is described below
commit 9b36c709a52caabd3579d006bb92379e1b263e52
Author: Alenka Frim <[email protected]>
AuthorDate: Wed Mar 12 11:07:19 2025 +0100
GH-44790: [Python] Remove use_legacy_dataset from code base (#45742)
### Rationale for this change
Legacy dataset in `ParquetDataset` has been deprecated since Arrow 15.0.0.
We should remove the warnings and the deprecated `use_legacy_dataset`
keyword arguments.
### What changes are included in this PR?
The warnings and the `use_legacy_dataset` keyword argument are removed from
`ParquetDataset`, together with the related test
`test_deprecated_use_legacy_dataset`.
### Are these changes tested?
Existing tests should pass.
### Are there any user-facing changes?
Legacy dataset is removed from `ParquetDataset`.
* GitHub Issue: #44790
Authored-by: AlenkaF <[email protected]>
Signed-off-by: AlenkaF <[email protected]>
---
python/pyarrow/parquet/core.py | 36 ++++--------------------------
python/pyarrow/tests/parquet/test_basic.py | 20 -----------------
2 files changed, 4 insertions(+), 52 deletions(-)
diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py
index 6ca6f7089e..8d3dec96a6 100644
--- a/python/pyarrow/parquet/core.py
+++ b/python/pyarrow/parquet/core.py
@@ -25,7 +25,6 @@ import json
import os
import re
import operator
-import warnings
import pyarrow as pa
@@ -1265,8 +1264,6 @@ thrift_container_size_limit : int, default None
sufficient for most Parquet files.
page_checksum_verification : bool, default False
If True, verify the page checksum for each page read from the file.
-use_legacy_dataset : bool, optional
- Deprecated and has no effect from PyArrow version 15.0.0.
Examples
--------
@@ -1279,14 +1276,7 @@ Examples
coerce_int96_timestamp_unit=None,
decryption_properties=None, thrift_string_size_limit=None,
thrift_container_size_limit=None,
- page_checksum_verification=False,
- use_legacy_dataset=None):
-
- if use_legacy_dataset is not None:
- warnings.warn(
- "Passing 'use_legacy_dataset' is deprecated as of pyarrow
15.0.0 "
- "and will be removed in a future version.",
- FutureWarning, stacklevel=2)
+ page_checksum_verification=False):
import pyarrow.dataset as ds
@@ -1653,8 +1643,6 @@ filters : pyarrow.compute.Expression or List[Tuple] or
List[List[Tuple]], defaul
Within-file level filtering and different partitioning schemes are
supported.
{3}
-use_legacy_dataset : bool, optional
- Deprecated and has no effect from PyArrow version 15.0.0.
ignore_prefixes : list, optional
Files matching any of these prefixes will be ignored by the
discovery process.
@@ -1776,19 +1764,12 @@ Read data from a single Parquet file:
def read_table(source, *, columns=None, use_threads=True,
schema=None, use_pandas_metadata=False, read_dictionary=None,
memory_map=False, buffer_size=0, partitioning="hive",
- filesystem=None, filters=None, use_legacy_dataset=None,
- ignore_prefixes=None, pre_buffer=True,
- coerce_int96_timestamp_unit=None,
+ filesystem=None, filters=None, ignore_prefixes=None,
+ pre_buffer=True, coerce_int96_timestamp_unit=None,
decryption_properties=None, thrift_string_size_limit=None,
thrift_container_size_limit=None,
page_checksum_verification=False):
- if use_legacy_dataset is not None:
- warnings.warn(
- "Passing 'use_legacy_dataset' is deprecated as of pyarrow 15.0.0 "
- "and will be removed in a future version.",
- FutureWarning, stacklevel=2)
-
try:
dataset = ParquetDataset(
source,
@@ -1991,8 +1972,7 @@ Examples
def write_to_dataset(table, root_path, partition_cols=None,
- filesystem=None, use_legacy_dataset=None,
- schema=None, partitioning=None,
+ filesystem=None, schema=None, partitioning=None,
basename_template=None, use_threads=None,
file_visitor=None, existing_data_behavior=None,
**kwargs):
@@ -2026,8 +2006,6 @@ def write_to_dataset(table, root_path,
partition_cols=None,
If nothing passed, will be inferred based on path.
Path will try to be found in the local on-disk filesystem otherwise
it will be parsed as an URI to determine the filesystem.
- use_legacy_dataset : bool, optional
- Deprecated and has no effect from PyArrow version 15.0.0.
schema : Schema, optional
This Schema of the dataset.
partitioning : Partitioning or list[str], optional
@@ -2114,12 +2092,6 @@ def write_to_dataset(table, root_path,
partition_cols=None,
>>> pq.ParquetDataset('dataset_name_4/').files
['dataset_name_4/...-0.parquet']
"""
- if use_legacy_dataset is not None:
- warnings.warn(
- "Passing 'use_legacy_dataset' is deprecated as of pyarrow 15.0.0 "
- "and will be removed in a future version.",
- FutureWarning, stacklevel=2)
-
metadata_collector = kwargs.pop('metadata_collector', None)
# Check for conflicting keywords
diff --git a/python/pyarrow/tests/parquet/test_basic.py
b/python/pyarrow/tests/parquet/test_basic.py
index 6496aa9909..43fddd413a 100644
--- a/python/pyarrow/tests/parquet/test_basic.py
+++ b/python/pyarrow/tests/parquet/test_basic.py
@@ -970,23 +970,3 @@ def test_checksum_write_to_dataset(tempdir):
# checksum verification enabled raises an exception
with pytest.raises(OSError, match="CRC checksum verification"):
_ = pq.read_table(corrupted_file_path, page_checksum_verification=True)
-
-
[email protected]
-def test_deprecated_use_legacy_dataset(tempdir):
- # Test that specifying use_legacy_dataset in ParquetDataset,
write_to_dataset
- # and read_table doesn't raise an error but gives a warning.
- table = pa.table({"a": [1, 2, 3]})
- path = tempdir / "deprecate_legacy"
-
- msg = "Passing 'use_legacy_dataset'"
- with pytest.warns(FutureWarning, match=msg):
- pq.write_to_dataset(table, path, use_legacy_dataset=False)
-
- pq.write_to_dataset(table, path)
-
- with pytest.warns(FutureWarning, match=msg):
- pq.read_table(path, use_legacy_dataset=False)
-
- with pytest.warns(FutureWarning, match=msg):
- pq.ParquetDataset(path, use_legacy_dataset=False)