This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 1a00cf6  ARROW-4723: [Python] Ignore "hidden" files that starts with underscore
1a00cf6 is described below

commit 1a00cf61aba51ab58196128749c0d123c50f400d
Author: HyukjinKwon <[email protected]>
AuthorDate: Fri Jun 7 14:24:40 2019 -0500

    ARROW-4723: [Python] Ignore "hidden" files that starts with underscore
    
    This PR proposes to ignore "hidden" files that start with underscore as well. This is a Hadoop convention.
    
    Author: HyukjinKwon <[email protected]>
    
    Closes #4478 from HyukjinKwon/regex-support and squashes the following commits:
    
    bb6218f53 <HyukjinKwon> Ignore "hidden" files that starts with underscore
---
 python/pyarrow/parquet.py            |  3 ++-
 python/pyarrow/tests/test_parquet.py | 20 +++++++++++++++++++-
 2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py
index 34a9c42..754d3c1 100644
--- a/python/pyarrow/parquet.py
+++ b/python/pyarrow/parquet.py
@@ -832,7 +832,8 @@ class ParquetManifest(object):
     def _should_silently_exclude(self, file_name):
         return (file_name.endswith('.crc') or  # Checksums
                 file_name.endswith('_$folder$') or  # HDFS directories in S3
-                file_name.startswith('.') or  # Hidden files
+                file_name.startswith('.') or  # Hidden files starting with .
+                file_name.startswith('_') or  # Hidden files starting with _
                 file_name in EXCLUDED_PARQUET_PATHS)
 
     def _visit_directories(self, level, directories, part_keys):
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index 0e6d636..4598bb9 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -1953,7 +1953,7 @@ def test_ignore_private_directories(tempdir):
 
 
 @pytest.mark.pandas
-def test_ignore_hidden_files(tempdir):
+def test_ignore_hidden_files_dot(tempdir):
     dirpath = tempdir / guid()
     dirpath.mkdir()
 
@@ -1971,6 +1971,24 @@ def test_ignore_hidden_files(tempdir):
 
 
 @pytest.mark.pandas
+def test_ignore_hidden_files_underscore(tempdir):
+    dirpath = tempdir / guid()
+    dirpath.mkdir()
+
+    paths = _make_example_multifile_dataset(dirpath, nfiles=10,
+                                            file_nrows=5)
+
+    with (dirpath / '_committed_123').open('wb') as f:
+        f.write(b'abcd')
+
+    with (dirpath / '_started_321').open('wb') as f:
+        f.write(b'abcd')
+
+    dataset = pq.ParquetDataset(dirpath)
+    assert set(map(str, paths)) == set(x.path for x in dataset.pieces)
+
+
+@pytest.mark.pandas
 def test_multiindex_duplicate_values(tempdir):
     num_rows = 3
     numbers = list(range(num_rows))

Reply via email to