This is an automated email from the ASF dual-hosted git repository.
raulcd pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 46b033f0bd GH-47728: [Python] Check the source argument in
parquet.read_table (#48008)
46b033f0bd is described below
commit 46b033f0bd18ce5c0fcc69cd6523cdbb7acd48cd
Author: Bogdan Romenskii <[email protected]>
AuthorDate: Fri Nov 14 09:52:04 2025 +0100
GH-47728: [Python] Check the source argument in parquet.read_table (#48008)
### Rationale for this change
See #47728. Check the `source` argument in `pyarrow.parquet.read_table`
when `pyarrow.dataset` is not available.
### What changes are included in this PR?
Check the `source` argument, raise `ValueError` if the `source` argument is
either a list of `.parquet` files or a directory.
### Are these changes tested?
Yes
### Are there any user-facing changes?
No
If the `source` argument is a directory, I decided not to check it
directly, but to catch the exceptions coming from `fs.open_input_file`,
since it already checks for this, and to add an extra exception on top of
the stack that explains the actual reason.
* GitHub Issue: #47728
Authored-by: Bogdan Romenskii <[email protected]>
Signed-off-by: Raúl Cumplido <[email protected]>
---
python/pyarrow/parquet/core.py | 15 ++++++++++++++-
python/pyarrow/tests/parquet/test_basic.py | 14 ++++++++++++--
2 files changed, 26 insertions(+), 3 deletions(-)
diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py
index 24cb586c82..5f62a3fc4f 100644
--- a/python/pyarrow/parquet/core.py
+++ b/python/pyarrow/parquet/core.py
@@ -1887,10 +1887,23 @@ def read_table(source, *, columns=None,
use_threads=True,
"the 'schema' argument is not supported when the "
"pyarrow.dataset module is not available"
)
+ if isinstance(source, list):
+ raise ValueError(
+ "the 'source' argument cannot be a list of files "
+ "when the pyarrow.dataset module is not available"
+ )
+
filesystem, path = _resolve_filesystem_and_path(source, filesystem)
if filesystem is not None:
+ if not filesystem.get_file_info(path).is_file:
+ raise ValueError(
+ "the 'source' argument should be "
+ "an existing parquet file and not a directory "
+ "when the pyarrow.dataset module is not available"
+ )
+
source = filesystem.open_input_file(path)
- # TODO test that source is not a directory or a list
+
dataset = ParquetFile(
source, read_dictionary=read_dictionary,
binary_type=binary_type,
diff --git a/python/pyarrow/tests/parquet/test_basic.py
b/python/pyarrow/tests/parquet/test_basic.py
index 591bcffc1a..3b991fdd57 100644
--- a/python/pyarrow/tests/parquet/test_basic.py
+++ b/python/pyarrow/tests/parquet/test_basic.py
@@ -16,6 +16,7 @@
# under the License.
import os
+import sys
from collections import OrderedDict
import io
import warnings
@@ -185,8 +186,7 @@ def test_read_table_without_dataset(tempdir):
pq.read_table(path, partitioning=['week', 'color'])
with pytest.raises(ValueError, match="the 'schema' argument"):
pq.read_table(path, schema=table.schema)
- # Error message varies depending on OS
- with pytest.raises(OSError):
+ with pytest.raises(ValueError, match="the 'source' argument"):
pq.read_table(tempdir)
result = pq.read_table(path)
assert result == table
@@ -993,3 +993,13 @@ def test_checksum_write_to_dataset(tempdir):
# checksum verification enabled raises an exception
with pytest.raises(OSError, match="CRC checksum verification"):
_ = pq.read_table(corrupted_file_path, page_checksum_verification=True)
+
+
+@pytest.mark.parametrize(
+ "source", ["/tmp/", ["/tmp/file1.parquet", "/tmp/file2.parquet"]])
+def test_read_table_raises_value_error_when_ds_is_unavailable(monkeypatch,
source):
+ # GH-47728
+ monkeypatch.setitem(sys.modules, "pyarrow.dataset", None)
+
+ with pytest.raises(ValueError, match="the 'source' argument"):
+ pq.read_table(source=source)