This is an automated email from the ASF dual-hosted git repository.
kszucs pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new af925d9 ARROW-2038: [Python] Strip s3:// scheme in S3FSWrapper
isdir() and isfile()
af925d9 is described below
commit af925d9395bd8f5cf435f379e389633bd3acfdfd
Author: Dmitry Vukolov <[email protected]>
AuthorDate: Wed Jan 9 13:58:48 2019 +0100
ARROW-2038: [Python] Strip s3:// scheme in S3FSWrapper isdir() and isfile()
This fixes an exception from ParquetDataset arising when the supplied path
contains the `s3://` scheme specifier. The issue stemmed from the fact that
while the underlying S3FileSystem does support both types of paths, with and
without an explicit `s3://`, its function calls always return paths stripped
of the scheme. This messed up the logic in isdir() and isfile().
An alternative solution would be to strip the scheme in parquet.py (by
adding it to _URI_STRIP_SCHEMES). This however would require additional code
changes along the lines of:
```python
_URI_STRIP_SCHEMES = ('hdfs', 's3')
def _parse_uri(path):
path = _stringify_path(path)
parsed_uri = urlparse(path)
if parsed_uri.scheme in _URI_STRIP_SCHEMES:
scheme = '{0}://'.format(parsed_uri.scheme)
path = parsed_uri.geturl().replace(scheme, '', 1)
return path
else:
# ARROW-4073: On Windows returning the path with the scheme
# stripped removes the drive letter, if any
return path
```
Not sure if that would have any impact on handling HDFS. Therefore this
patch proposes a safer, more localised approach, already used in other parts of
S3FSWrapper.
Author: Dmitry Vukolov <[email protected]>
Closes #3286 from dvukolov/master and squashes the following commits:
8de916c5 <Dmitry Vukolov> Strip s3:// scheme in S3FSWrapper isdir() and
isfile()
---
python/pyarrow/filesystem.py | 13 ++++++++++---
1 file changed, 10 insertions(+), 3 deletions(-)
diff --git a/python/pyarrow/filesystem.py b/python/pyarrow/filesystem.py
index 98efb1e..92a65ce 100644
--- a/python/pyarrow/filesystem.py
+++ b/python/pyarrow/filesystem.py
@@ -319,7 +319,7 @@ class S3FSWrapper(DaskFileSystem):
@implements(FileSystem.isdir)
def isdir(self, path):
- path = _stringify_path(path)
+ path = _sanitize_s3(_stringify_path(path))
try:
contents = self.fs.ls(path)
if len(contents) == 1 and contents[0] == path:
@@ -331,7 +331,7 @@ class S3FSWrapper(DaskFileSystem):
@implements(FileSystem.isfile)
def isfile(self, path):
- path = _stringify_path(path)
+ path = _sanitize_s3(_stringify_path(path))
try:
contents = self.fs.ls(path)
return len(contents) == 1 and contents[0] == path
@@ -345,7 +345,7 @@ class S3FSWrapper(DaskFileSystem):
Generator version of what is in s3fs, which yields a flattened list of
files
"""
- path = _stringify_path(path).replace('s3://', '')
+ path = _sanitize_s3(_stringify_path(path))
directories = set()
files = set()
@@ -371,6 +371,13 @@ class S3FSWrapper(DaskFileSystem):
yield tup
+def _sanitize_s3(path):
+ if path.startswith('s3://'):
+ return path.replace('s3://', '')
+ else:
+ return path
+
+
def _ensure_filesystem(fs):
fs_type = type(fs)