This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 1a00cf6 ARROW-4723: [Python] Ignore "hidden" files that start with
underscore
1a00cf6 is described below
commit 1a00cf61aba51ab58196128749c0d123c50f400d
Author: HyukjinKwon <[email protected]>
AuthorDate: Fri Jun 7 14:24:40 2019 -0500
ARROW-4723: [Python] Ignore "hidden" files that start with an underscore
This PR proposes to also ignore "hidden" files whose names start with an
underscore, in addition to those starting with a dot. Treating
underscore-prefixed files as hidden is a Hadoop convention.
Author: HyukjinKwon <[email protected]>
Closes #4478 from HyukjinKwon/regex-support and squashes the following
commits:
bb6218f53 <HyukjinKwon> Ignore "hidden" files that start with underscore
---
python/pyarrow/parquet.py | 3 ++-
python/pyarrow/tests/test_parquet.py | 20 +++++++++++++++++++-
2 files changed, 21 insertions(+), 2 deletions(-)
diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py
index 34a9c42..754d3c1 100644
--- a/python/pyarrow/parquet.py
+++ b/python/pyarrow/parquet.py
@@ -832,7 +832,8 @@ class ParquetManifest(object):
def _should_silently_exclude(self, file_name):
return (file_name.endswith('.crc') or # Checksums
file_name.endswith('_$folder$') or # HDFS directories in S3
- file_name.startswith('.') or # Hidden files
+ file_name.startswith('.') or # Hidden files starting with .
+ file_name.startswith('_') or # Hidden files starting with _
file_name in EXCLUDED_PARQUET_PATHS)
def _visit_directories(self, level, directories, part_keys):
diff --git a/python/pyarrow/tests/test_parquet.py
b/python/pyarrow/tests/test_parquet.py
index 0e6d636..4598bb9 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -1953,7 +1953,7 @@ def test_ignore_private_directories(tempdir):
@pytest.mark.pandas
-def test_ignore_hidden_files(tempdir):
+def test_ignore_hidden_files_dot(tempdir):
dirpath = tempdir / guid()
dirpath.mkdir()
@@ -1971,6 +1971,24 @@ def test_ignore_hidden_files(tempdir):
@pytest.mark.pandas
+def test_ignore_hidden_files_underscore(tempdir):
+ dirpath = tempdir / guid()
+ dirpath.mkdir()
+
+ paths = _make_example_multifile_dataset(dirpath, nfiles=10,
+ file_nrows=5)
+
+ with (dirpath / '_committed_123').open('wb') as f:
+ f.write(b'abcd')
+
+ with (dirpath / '_started_321').open('wb') as f:
+ f.write(b'abcd')
+
+ dataset = pq.ParquetDataset(dirpath)
+ assert set(map(str, paths)) == set(x.path for x in dataset.pieces)
+
+
[email protected]
def test_multiindex_duplicate_values(tempdir):
num_rows = 3
numbers = list(range(num_rows))