Repository: arrow
Updated Branches:
  refs/heads/master 0bdfd5efb -> 31f145dc5
ARROW-545: [Python] Ignore non .parq/.parquet files when reading directories as Parquet datasets Author: Wes McKinney <[email protected]> Closes #331 from wesm/ARROW-545 and squashes the following commits: 5494167 [Wes McKinney] Docstring typo 92b274c [Wes McKinney] Ignore non .parq/.parquet files when reading directories-as-Parquet-datasets Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/31f145dc Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/31f145dc Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/31f145dc Branch: refs/heads/master Commit: 31f145dc5296d27cc8010a4cd17ca5b4ae461dff Parents: 0bdfd5e Author: Wes McKinney <[email protected]> Authored: Thu Feb 9 13:47:09 2017 +0100 Committer: Uwe L. Korn <[email protected]> Committed: Thu Feb 9 13:47:09 2017 +0100 ---------------------------------------------------------------------- python/pyarrow/__init__.py | 2 +- python/pyarrow/filesystem.py | 23 +++++++++++++++++------ python/pyarrow/parquet.py | 18 ++++++++++++++++-- python/pyarrow/tests/test_parquet.py | 4 ++++ 4 files changed, 38 insertions(+), 9 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/31f145dc/python/pyarrow/__init__.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index ea4710d..6724b52 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -67,4 +67,4 @@ from pyarrow.schema import (null, bool_, from pyarrow.table import Column, RecordBatch, Table, concat_tables -localfs = LocalFilesystem() +localfs = LocalFilesystem.get_instance() http://git-wip-us.apache.org/repos/asf/arrow/blob/31f145dc/python/pyarrow/filesystem.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/filesystem.py b/python/pyarrow/filesystem.py index 
82409b7..55bcad0 100644 --- a/python/pyarrow/filesystem.py +++ b/python/pyarrow/filesystem.py @@ -62,7 +62,7 @@ class Filesystem(object): """ raise NotImplementedError - def read_parquet(self, path, columns=None, schema=None): + def read_parquet(self, path, columns=None, metadata=None, schema=None): """ Read Parquet data from path in file system. Can read from a single file or a directory of files @@ -73,8 +73,11 @@ class Filesystem(object): Single file path or directory columns : List[str], optional Subset of columns to read + metadata : pyarrow.parquet.FileMetaData + Known metadata to validate files against schema : pyarrow.parquet.Schema - Known schema to validate files against + Known schema to validate files against. Alternative to metadata + argument Returns ------- @@ -85,18 +88,26 @@ class Filesystem(object): if self.isdir(path): paths_to_read = [] for path in self.ls(path): - if path == '_metadata' or path == '_common_metadata': - raise ValueError('No support yet for common metadata file') - paths_to_read.append(path) + if path.endswith('parq') or path.endswith('parquet'): + paths_to_read.append(path) else: paths_to_read = [path] return read_multiple_files(paths_to_read, columns=columns, - filesystem=self, schema=schema) + filesystem=self, schema=schema, + metadata=metadata) class LocalFilesystem(Filesystem): + _instance = None + + @classmethod + def get_instance(cls): + if cls._instance is None: + cls._instance = LocalFilesystem() + return cls._instance + @implements(Filesystem.ls) def ls(self, path): return sorted(pjoin(path, x) for x in os.listdir(path)) http://git-wip-us.apache.org/repos/asf/arrow/blob/31f145dc/python/pyarrow/parquet.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index 6654b77..9766ff6 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -15,12 +15,17 @@ # specific language governing permissions and limitations # under the 
License. +import six + from pyarrow._parquet import (ParquetReader, FileMetaData, # noqa RowGroupMetaData, Schema, ParquetWriter) import pyarrow._parquet as _parquet # noqa from pyarrow.table import concat_tables +EXCLUDED_PARQUET_PATHS = {'_metadata', '_common_metadata', '_SUCCESS'} + + class ParquetFile(object): """ Open a Parquet binary file for reading @@ -82,8 +87,9 @@ def read_table(source, columns=None, nthreads=1, metadata=None): Parameters ---------- source: str or pyarrow.io.NativeFile - Readable source. For passing Python file objects or byte buffers, see - pyarrow.io.PythonFileInterface or pyarrow.io.BufferReader. + Location of Parquet dataset. If a string passed, can be a single file + name or directory name. For passing Python file objects or byte + buffers, see pyarrow.io.PythonFileInterface or pyarrow.io.BufferReader. columns: list If not None, only these columns will be read from the file. nthreads : int, default 1 @@ -97,6 +103,14 @@ def read_table(source, columns=None, nthreads=1, metadata=None): pyarrow.Table Content of the file as a table (of columns) """ + from pyarrow.filesystem import LocalFilesystem + + if isinstance(source, six.string_types): + fs = LocalFilesystem.get_instance() + if fs.isdir(source): + return fs.read_parquet(source, columns=columns, + metadata=metadata) + pf = ParquetFile(source, metadata=metadata) return pf.read(columns=columns, nthreads=nthreads) http://git-wip-us.apache.org/repos/asf/arrow/blob/31f145dc/python/pyarrow/tests/test_parquet.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 80a995f..969f68b 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -393,6 +393,10 @@ def test_read_multiple_files(tmpdir): test_data.append(table) paths.append(path) + # Write a _SUCCESS.crc file + with open(pjoin(dirpath, '_SUCCESS.crc'), 'wb') as f: + f.write(b'0') + 
result = pq.read_multiple_files(paths) expected = pa.concat_tables(test_data)
