[ https://issues.apache.org/jira/browse/ARROW-16438?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17531116#comment-17531116 ]
Joris Van den Bossche edited comment on ARROW-16438 at 5/3/22 9:25 AM: ----------------------------------------------------------------------- So reproducing this locally with a pyarrow local filesystem: {code} import pyarrow as pa import pyarrow.parquet as pq import pyarrow.dataset as ds table = pa.table({'a': [1, 2, 3]}) pq.write_to_dataset(table, "test_parquet_dataset/") In [9]: ds.dataset(["test_parquet_dataset/"], format="parquet", filesystem=LocalFileSystem()) --------------------------------------------------------------------------- IsADirectoryError Traceback (most recent call last) <ipython-input-9-8e554a28b381> in <module> ----> 1 ds.dataset(["test_parquet_dataset/"], format="parquet", filesystem=LocalFileSystem()) ~/scipy/repos/arrow/python/pyarrow/dataset.py in dataset(source, schema, format, filesystem, partitioning, partition_base_dir, exclude_invalid_files, ignore_prefixes) 695 elif isinstance(source, (tuple, list)): 696 if all(_is_path_like(elem) for elem in source): --> 697 return _filesystem_dataset(source, **kwargs) 698 elif all(isinstance(elem, Dataset) for elem in source): 699 return _union_dataset(source, **kwargs) ~/scipy/repos/arrow/python/pyarrow/dataset.py in _filesystem_dataset(source, schema, filesystem, partitioning, format, partition_base_dir, exclude_invalid_files, selector_ignore_prefixes) 435 436 if isinstance(source, (list, tuple)): --> 437 fs, paths_or_selector = _ensure_multiple_sources(source, filesystem) 438 else: 439 fs, paths_or_selector = _ensure_single_source(source, filesystem) ~/scipy/repos/arrow/python/pyarrow/dataset.py in _ensure_multiple_sources(paths, filesystem) 356 raise FileNotFoundError(info.path) 357 elif file_type == FileType.Directory: --> 358 raise IsADirectoryError( 359 'Path {} points to a directory, but only file paths are ' 360 'supported. To construct a nested or union dataset pass ' IsADirectoryError: Path test_parquet_dataset/ points to a directory, but only file paths are supported. 
To construct a nested or union dataset pass a list of dataset objects instead. {code} So it also errors, although it gives a clearer error message stating that a directory is not supported (this error message comes from an additional check that we only perform if the filesystem is local, I suppose because those checks can potentially be costly for remote filesystems). was (Author: jorisvandenbossche): So reproducing this locally with a pyarrow local filesystem: {code} import pyarrow as pa import pyarrow.parquet as pq import pyarrow.dataset as ds table = pa.table({'a': [1, 2, 3]}) pq.write_to_dataset(table, "test_parquet_dataset/") In [9]: ds.dataset(["test_parquet_dataset/"], format="parquet", filesystem=LocalFileSystem()) --------------------------------------------------------------------------- IsADirectoryError Traceback (most recent call last) <ipython-input-9-8e554a28b381> in <module> ----> 1 ds.dataset(["test_parquet_dataset/"], format="parquet", filesystem=LocalFileSystem()) ~/scipy/repos/arrow/python/pyarrow/dataset.py in dataset(source, schema, format, filesystem, partitioning, partition_base_dir, exclude_invalid_files, ignore_prefixes) 695 elif isinstance(source, (tuple, list)): 696 if all(_is_path_like(elem) for elem in source): --> 697 return _filesystem_dataset(source, **kwargs) 698 elif all(isinstance(elem, Dataset) for elem in source): 699 return _union_dataset(source, **kwargs) ~/scipy/repos/arrow/python/pyarrow/dataset.py in _filesystem_dataset(source, schema, filesystem, partitioning, format, partition_base_dir, exclude_invalid_files, selector_ignore_prefixes) 435 436 if isinstance(source, (list, tuple)): --> 437 fs, paths_or_selector = _ensure_multiple_sources(source, filesystem) 438 else: 439 fs, paths_or_selector = _ensure_single_source(source, filesystem) ~/scipy/repos/arrow/python/pyarrow/dataset.py in _ensure_multiple_sources(paths, filesystem) 356 raise FileNotFoundError(info.path) 357 elif file_type == FileType.Directory: --> 358 raise 
IsADirectoryError( 359 'Path {} points to a directory, but only file paths are ' 360 'supported. To construct a nested or union dataset pass ' IsADirectoryError: Path test_parquet_dataset/ points to a directory, but only file paths are supported. To construct a nested or union dataset pass a list of dataset objects instead. {code} So it also errors, although it gives a more clear error message about a directory not being supported. > pyarrow dataset API fails to read s3 directory > ---------------------------------------------- > > Key: ARROW-16438 > URL: https://issues.apache.org/jira/browse/ARROW-16438 > Project: Apache Arrow > Issue Type: Bug > Components: Python > Affects Versions: 7.0.0 > Reporter: Prem Sagar Gali > Priority: Major > > When an s3 file system as `file_system` is passed to > [pyarrow.dataset.dataset|https://arrow.apache.org/docs/python/generated/pyarrow.dataset.dataset.html#pyarrow.dataset.dataset] > API and the `source` is a directory name with bucket, there is an error: > {code:python} > In [5]: from fsspec.core import get_fs_token_paths > In [6]: fs, _, path = get_fs_token_paths("s3://prem-rapids-test/folder/", > mode="rb") > In [7]: fs > Out[7]: <s3fs.core.S3FileSystem at 0x7f3d02cc1460> > In [8]: path > Out[8]: ['prem-rapids-test/folder'] > In [10]: pa.dataset.dataset(path, filesystem=fs, format="parquet") > --------------------------------------------------------------------------- > FileNotFoundError Traceback (most recent call last) > Input In [10], in <cell line: 1>() > ----> 1 pa.dataset.dataset(path, filesystem=fs, format="parquet") > File > /nvme/0/pgali/envs/cudfdev/lib/python3.8/site-packages/pyarrow/dataset.py:670, > in dataset(source, schema, format, filesystem, partitioning, > partition_base_dir, exclude_invalid_files, ignore_prefixes) > 668 elif isinstance(source, (tuple, list)): > 669 if all(_is_path_like(elem) for elem in source): > --> 670 return _filesystem_dataset(source, **kwargs) > 671 elif all(isinstance(elem, Dataset) 
for elem in source): > 672 return _union_dataset(source, **kwargs) > File > /nvme/0/pgali/envs/cudfdev/lib/python3.8/site-packages/pyarrow/dataset.py:422, > in _filesystem_dataset(source, schema, filesystem, partitioning, format, > partition_base_dir, exclude_invalid_files, selector_ignore_prefixes) > 414 options = FileSystemFactoryOptions( > 415 partitioning=partitioning, > 416 partition_base_dir=partition_base_dir, > 417 exclude_invalid_files=exclude_invalid_files, > 418 selector_ignore_prefixes=selector_ignore_prefixes > 419 ) > 420 factory = FileSystemDatasetFactory(fs, paths_or_selector, format, > options) > --> 422 return factory.finish(schema) > File > /nvme/0/pgali/envs/cudfdev/lib/python3.8/site-packages/pyarrow/_dataset.pyx:1680, > in pyarrow._dataset.DatasetFactory.finish() > File > /nvme/0/pgali/envs/cudfdev/lib/python3.8/site-packages/pyarrow/error.pxi:143, > in pyarrow.lib.pyarrow_internal_check_status() > File > /nvme/0/pgali/envs/cudfdev/lib/python3.8/site-packages/pyarrow/_fs.pyx:1179, > in pyarrow._fs._cb_open_input_file() > File > /nvme/0/pgali/envs/cudfdev/lib/python3.8/site-packages/pyarrow/fs.py:394, in > FSSpecHandler.open_input_file(self, path) > 391 from pyarrow import PythonFile > 393 if not self.fs.isfile(path): > --> 394 raise FileNotFoundError(path) > 396 return PythonFile(self.fs.open(path, mode="rb"), mode="r") > FileNotFoundError: prem-rapids-test/folder > {code} > But it works only if the folder is passed as a full string: > {code:python} > In [3]: import pyarrow.dataset > In [4]: pa.dataset.dataset("s3://prem-rapids-test/folder/", format="parquet") > Out[4]: <pyarrow._dataset.FileSystemDataset at 0x7f3ce502d870> > {code} > -- This message was sent by Atlassian Jira (v8.20.7#820007)