[ https://issues.apache.org/jira/browse/ARROW-2306?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16398360#comment-16398360 ]

ASF GitHub Bot commented on ARROW-2306:
---------------------------------------

xhochy closed pull request #1742: ARROW-2306: [Python] Fix partitioned Parquet test against HDFS
URL: https://github.com/apache/arrow/pull/1742
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:


diff --git a/python/pyarrow/hdfs.py b/python/pyarrow/hdfs.py
index 3f2014b65..34ddfaef3 100644
--- a/python/pyarrow/hdfs.py
+++ b/python/pyarrow/hdfs.py
@@ -40,6 +40,13 @@ def __reduce__(self):
         return (HadoopFileSystem, (self.host, self.port, self.user,
                                    self.kerb_ticket, self.driver))
 
+    def _isfilestore(self):
+        """
+        Returns True if this FileSystem is a unix-style file store with
+        directories.
+        """
+        return True
+
     @implements(FileSystem.isdir)
     def isdir(self, path):
         return super(HadoopFileSystem, self).isdir(path)
diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py
index fd9c740f1..0929a1549 100644
--- a/python/pyarrow/parquet.py
+++ b/python/pyarrow/parquet.py
@@ -1103,6 +1103,9 @@ def write_metadata(schema, where, version='1.0',
     coerce_timestamps : string, default None
         Cast timestamps a particular resolution.
         Valid values: {None, 'ms', 'us'}
+    filesystem : FileSystem, default None
+        If nothing passed, paths assumed to be found in the local on-disk
+        filesystem
     """
     writer = ParquetWriter(
         where, schema, version=version,
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index a3da05fe3..b301de606 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -1431,8 +1431,15 @@ def _test_write_to_dataset_with_partitions(base_path, filesystem=None):
     output_table = pa.Table.from_pandas(output_df)
     pq.write_to_dataset(output_table, base_path, partition_by,
                         filesystem=filesystem)
-    pq.write_metadata(output_table.schema,
-                      os.path.join(base_path, '_common_metadata'))
+
+    metadata_path = os.path.join(base_path, '_common_metadata')
+
+    if filesystem is not None:
+        with filesystem.open(metadata_path, 'wb') as f:
+            pq.write_metadata(output_table.schema, f)
+    else:
+        pq.write_metadata(output_table.schema, metadata_path)
+
     dataset = pq.ParquetDataset(base_path, filesystem=filesystem)
     # ARROW-2209: Ensure the dataset schema also includes the partition columns
     dataset_cols = set(dataset.schema.to_arrow_schema().names)
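
In short, the hdfs.py change makes HadoopFileSystem._isfilestore() return True so that write_to_dataset() is allowed to create partition directories on HDFS, and the test change writes the _common_metadata file through the target filesystem rather than assuming a local path. A minimal usage sketch of that pattern (the host, port, and paths below are placeholders, not part of the PR):

{code}
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# Placeholder connection details; assumes a reachable HDFS cluster.
fs = pa.hdfs.connect('namenode-host', 8020)

table = pa.Table.from_pandas(
    pd.DataFrame({'group': ['a', 'a', 'b'], 'value': [1, 2, 3]}))

base_path = '/tmp/partitioned-dataset-example'
fs.mkdir(base_path)

# Relies on HadoopFileSystem._isfilestore() returning True, since
# write_to_dataset() guards directory creation with that check.
pq.write_to_dataset(table, base_path, ['group'], filesystem=fs)

# _common_metadata is written through the target filesystem's open(),
# not a bare local path, when the dataset lives on HDFS.
with fs.open(base_path + '/_common_metadata', 'wb') as f:
    pq.write_metadata(table.schema, f)

# As in the test, the dataset schema should include the partition
# column as well as the data columns (see ARROW-2209).
dataset = pq.ParquetDataset(base_path, filesystem=fs)
print(dataset.schema.to_arrow_schema().names)
{code}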


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


> [Python] HDFS test failures
> ---------------------------
>
>                 Key: ARROW-2306
>                 URL: https://issues.apache.org/jira/browse/ARROW-2306
>             Project: Apache Arrow
>          Issue Type: Bug
>          Components: Python
>            Reporter: Wes McKinney
>            Assignee: Wes McKinney
>            Priority: Blocker
>              Labels: pull-request-available
>             Fix For: 0.9.0
>
>
> These weren't caught because we aren't running the HDFS tests in Travis CI
> {code}
> pyarrow/tests/test_hdfs.py::TestLibHdfs::test_write_to_dataset_no_partitions FAILED
> >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> traceback >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
> self = <pyarrow.tests.test_hdfs.TestLibHdfs testMethod=test_write_to_dataset_no_partitions>
>     @test_parquet.parquet
>     def test_write_to_dataset_no_partitions(self):
>         tmpdir = pjoin(self.tmp_path, 'write-no_partitions-' + guid())
>         self.hdfs.mkdir(tmpdir)
>         test_parquet._test_write_to_dataset_no_partitions(
> >           tmpdir, filesystem=self.hdfs)
> pyarrow/tests/test_hdfs.py:367: 
> _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
> pyarrow/tests/test_parquet.py:1475: in _test_write_to_dataset_no_partitions
>     filesystem=filesystem)
> pyarrow/parquet.py:1059: in write_to_dataset
>     _mkdir_if_not_exists(fs, root_path)
> pyarrow/parquet.py:1006: in _mkdir_if_not_exists
>     if fs._isfilestore() and not fs.exists(path):
> _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
> self = <pyarrow.hdfs.HadoopFileSystem object at 0x7f09e87a4c48>
>     def _isfilestore(self):
>         """
>             Returns True if this FileSystem is a unix-style file store with
>             directories.
>             """
> >       raise NotImplementedError
> E       NotImplementedError
> pyarrow/filesystem.py:143: NotImplementedError
> >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> entering PDB >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
> > /home/wesm/code/arrow/python/pyarrow/filesystem.py(143)_isfilestore()
> -> raise NotImplementedError
> (Pdb) c
> pyarrow/tests/test_hdfs.py::TestLibHdfs::test_write_to_dataset_with_partitions FAILED
> >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> traceback >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
> self = <pyarrow.tests.test_hdfs.TestLibHdfs testMethod=test_write_to_dataset_with_partitions>
>     @test_parquet.parquet
>     def test_write_to_dataset_with_partitions(self):
>         tmpdir = pjoin(self.tmp_path, 'write-partitions-' + guid())
>         self.hdfs.mkdir(tmpdir)
>         test_parquet._test_write_to_dataset_with_partitions(
> >           tmpdir, filesystem=self.hdfs)
> pyarrow/tests/test_hdfs.py:360: 
> _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
> pyarrow/tests/test_parquet.py:1433: in _test_write_to_dataset_with_partitions
>     filesystem=filesystem)
> pyarrow/parquet.py:1059: in write_to_dataset
>     _mkdir_if_not_exists(fs, root_path)
> pyarrow/parquet.py:1006: in _mkdir_if_not_exists
>     if fs._isfilestore() and not fs.exists(path):
> _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
> self = <pyarrow.hdfs.HadoopFileSystem object at 0x7f09e87a4c48>
>     def _isfilestore(self):
>         """
>             Returns True if this FileSystem is a unix-style file store with
>             directories.
>             """
> >       raise NotImplementedError
> E       NotImplementedError
> pyarrow/filesystem.py:143: NotImplementedError
> >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> entering PDB >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
> > /home/wesm/code/arrow/python/pyarrow/filesystem.py(143)_isfilestore()
> -> raise NotImplementedError
> (Pdb) c
> {code}
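
The failure boils down to the call path below: write_to_dataset() guards directory creation with fs._isfilestore(), which the FileSystem base class leaves unimplemented, so the HadoopFileSystem override added in the PR above is what unblocks these tests. A condensed sketch (not verbatim pyarrow source):

{code}
# Condensed sketch of the relevant pieces, not verbatim pyarrow source.

class FileSystem(object):
    def _isfilestore(self):
        # Base class (filesystem.py:143 in the traceback): no default answer.
        raise NotImplementedError

class HadoopFileSystem(FileSystem):
    def _isfilestore(self):
        # Override added by the PR: HDFS is a unix-style file store with
        # directories, so directory creation is allowed.
        return True

def _mkdir_if_not_exists(fs, path):
    # The guard seen in the traceback (parquet.py:1006); with the old
    # base-class behaviour the first call raised NotImplementedError.
    if fs._isfilestore() and not fs.exists(path):
        fs.mkdir(path)
{code}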



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)
