This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new e279a7e ARROW-8213: [Python][Dataset] Opening a dataset with a local
incorrect path gives confusing error message
e279a7e is described below
commit e279a7e06e61c14868ca7d71dea795420aea6539
Author: Krisztián Szűcs <[email protected]>
AuthorDate: Tue Apr 7 19:47:55 2020 -0500
ARROW-8213: [Python][Dataset] Opening a dataset with a local incorrect path
gives confusing error message
Workaround until it is properly handled in the C++ implementation.
Closes #6854 from kszucs/ARROW-8213
Lead-authored-by: Krisztián Szűcs <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Wes McKinney <[email protected]>
---
python/pyarrow/dataset.py | 22 ++++++++++++++++------
python/pyarrow/tests/test_dataset.py | 7 +++++++
2 files changed, 23 insertions(+), 6 deletions(-)
diff --git a/python/pyarrow/dataset.py b/python/pyarrow/dataset.py
index fff11be..daacf2b 100644
--- a/python/pyarrow/dataset.py
+++ b/python/pyarrow/dataset.py
@@ -159,15 +159,24 @@ def _ensure_fs(filesystem, path):
FileSystem, LocalFileSystem, FileType, _normalize_path)
if filesystem is None:
- # first check if the file exists as a local (relative) file path
+ # First check if the file exists as a local (relative) file path
filesystem = LocalFileSystem()
try:
infos = filesystem.get_file_info([path])[0]
except OSError:
- return FileSystem.from_uri(path)
-
- if infos.type == FileType.NotFound:
- return FileSystem.from_uri(path)
+ local_path_exists = False
+ else:
+ local_path_exists = (infos.type != FileType.NotFound)
+
+ if not local_path_exists:
+ # Perhaps it's a URI?
+ try:
+ return FileSystem.from_uri(path)
+ except ValueError as e:
+ if "empty scheme" not in str(e):
+ raise
+ # ARROW-8213: not a URI, assume local path
+ # to get a nice error message.
# ensure we have a proper path (eg no backslashes on Windows)
path = _normalize_path(filesystem, path)
@@ -179,7 +188,8 @@ def _ensure_fs_and_paths(path, filesystem=None):
# Return filesystem and list of string paths or FileSelector
from pyarrow.fs import FileType, FileSelector
- filesystem, path = _ensure_fs(filesystem, _stringify_path(path))
+ path = _stringify_path(path)
+ filesystem, path = _ensure_fs(filesystem, path)
infos = filesystem.get_file_info([path])[0]
if infos.type == FileType.Directory:
# for directory, pass a selector
diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py
index dc28512..9bb0b91 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -1042,6 +1042,13 @@ def test_open_dataset_from_source_additional_kwargs(multisourcefs):
ds.dataset(child, format="parquet")
+def test_open_dataset_non_existing_file():
+ # ARROW-8213: Opening a dataset with a local incorrect path gives confusing
+ # error message
+ with pytest.raises(FileNotFoundError):
+ ds.dataset('i-am-not-existing.parquet', format='parquet')
+
+
@pytest.mark.parquet
@pytest.mark.s3
def test_open_dataset_from_uri_s3(s3_connection, s3_server):