[ https://issues.apache.org/jira/browse/ARROW-2031?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16340155#comment-16340155 ]
ASF GitHub Bot commented on ARROW-2031:
---------------------------------------
wesm closed pull request #1505: ARROW-2031: [Python] HadoopFileSystem is pickleable
URL: https://github.com/apache/arrow/pull/1505
This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:
diff --git a/python/pyarrow/hdfs.py b/python/pyarrow/hdfs.py
index 3c9d04188..3f2014b65 100644
--- a/python/pyarrow/hdfs.py
+++ b/python/pyarrow/hdfs.py
@@ -36,6 +36,10 @@ def __init__(self, host="default", port=0, user=None, kerb_ticket=None,
self._connect(host, port, user, kerb_ticket, driver)
+ def __reduce__(self):
+ return (HadoopFileSystem, (self.host, self.port, self.user,
+ self.kerb_ticket, self.driver))
+
@implements(FileSystem.isdir)
def isdir(self, path):
return super(HadoopFileSystem, self).isdir(path)
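For reference, the `__reduce__` hook added above uses the standard pickle protocol: returning a (callable, args) pair tells pickle how to rebuild the object on load. A minimal sketch of the mechanism, using a hypothetical Connection class (not part of this patch) in place of HadoopFileSystem:

import pickle


class Connection(object):
    """Hypothetical stand-in for HadoopFileSystem: it only records the
    parameters needed to re-establish a connection."""

    def __init__(self, host="default", port=0, user=None):
        self.host = host
        self.port = port
        self.user = user

    def __reduce__(self):
        # On unpickling, pickle calls Connection(host, port, user), so the
        # copy reconnects instead of trying to serialize live client state.
        return (Connection, (self.host, self.port, self.user))


conn = Connection("namenode", 8020, user="alice")
restored = pickle.loads(pickle.dumps(conn))
assert (restored.host, restored.port, restored.user) == ("namenode", 8020, "alice")
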
diff --git a/python/pyarrow/io-hdfs.pxi b/python/pyarrow/io-hdfs.pxi
index e65381323..3abf045f9 100644
--- a/python/pyarrow/io-hdfs.pxi
+++ b/python/pyarrow/io-hdfs.pxi
@@ -59,29 +59,41 @@ cdef class HadoopFileSystem:
cdef readonly:
bint is_open
-
- def __cinit__(self):
- pass
+ str host
+ str user
+ str kerb_ticket
+ str driver
+ int port
def _connect(self, host, port, user, kerb_ticket, driver):
cdef HdfsConnectionConfig conf
if host is not None:
conf.host = tobytes(host)
+ self.host = host
+
conf.port = port
+ self.port = port
+
if user is not None:
conf.user = tobytes(user)
+ self.user = user
+
if kerb_ticket is not None:
conf.kerb_ticket = tobytes(kerb_ticket)
+ self.kerb_ticket = kerb_ticket
if driver == 'libhdfs':
with nogil:
check_status(HaveLibHdfs())
conf.driver = HdfsDriver_LIBHDFS
- else:
+ elif driver == 'libhdfs3':
with nogil:
check_status(HaveLibHdfs3())
conf.driver = HdfsDriver_LIBHDFS3
+ else:
+ raise ValueError("unknown driver: %r" % driver)
+ self.driver = driver
with nogil:
check_status(CHadoopFileSystem.Connect(&conf, &self.client))
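A side effect of the _connect() change above: an unrecognized driver name now raises ValueError before any connection is attempted, whereas previously any value other than 'libhdfs' silently selected libhdfs3. A hedged usage sketch, assuming the pyarrow.hdfs.connect helper of this release forwards the driver argument:

import pyarrow as pa

# No cluster is needed for this path: the driver name is validated before
# HaveLibHdfs/Connect are ever called.
try:
    pa.hdfs.connect(driver="not_a_driver_name")
except ValueError as exc:
    print(exc)  # unknown driver: 'not_a_driver_name'
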
diff --git a/python/pyarrow/tests/test_hdfs.py b/python/pyarrow/tests/test_hdfs.py
index 51b6ba25b..b62458cd7 100644
--- a/python/pyarrow/tests/test_hdfs.py
+++ b/python/pyarrow/tests/test_hdfs.py
@@ -18,6 +18,7 @@
from io import BytesIO
from os.path import join as pjoin
import os
+import pickle
import random
import unittest
@@ -36,7 +37,7 @@
def hdfs_test_client(driver='libhdfs'):
host = os.environ.get('ARROW_HDFS_TEST_HOST', 'localhost')
- user = os.environ['ARROW_HDFS_TEST_USER']
+ user = os.environ.get('ARROW_HDFS_TEST_USER', None)
try:
port = int(os.environ.get('ARROW_HDFS_TEST_PORT', 20500))
except ValueError:
@@ -72,6 +73,22 @@ def tearDownClass(cls):
cls.hdfs.delete(cls.tmp_path, recursive=True)
cls.hdfs.close()
+ def test_unknown_driver(self):
+ with pytest.raises(ValueError):
+ hdfs_test_client(driver="not_a_driver_name")
+
+ def test_pickle(self):
+ s = pickle.dumps(self.hdfs)
+ h2 = pickle.loads(s)
+ assert h2.is_open
+ assert h2.host == self.hdfs.host
+ assert h2.port == self.hdfs.port
+ assert h2.user == self.hdfs.user
+ assert h2.kerb_ticket == self.hdfs.kerb_ticket
+ assert h2.driver == self.hdfs.driver
+ # smoketest unpickled client works
+ h2.ls(self.tmp_path)
+
def test_cat(self):
path = pjoin(self.tmp_path, 'cat-test')
@@ -299,7 +316,7 @@ class TestLibHdfs(HdfsTestCases, unittest.TestCase):
@classmethod
def check_driver(cls):
if not pa.have_libhdfs():
- pytest.fail('No libhdfs available on system')
+ pytest.skip('No libhdfs available on system')
def test_orphaned_file(self):
hdfs = hdfs_test_client()
@@ -318,4 +335,4 @@ class TestLibHdfs3(HdfsTestCases, unittest.TestCase):
@classmethod
def check_driver(cls):
if not pa.have_libhdfs3():
- pytest.fail('No libhdfs3 available on system')
+ pytest.skip('No libhdfs3 available on system')
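
Taken together, the new behaviour mirrors the test_pickle case: a connected client round-trips through pickle and the unpickled copy reconnects with the same parameters. A rough sketch, assuming a reachable HDFS cluster and placeholder connection parameters:

import pickle

import pyarrow as pa

# Placeholder host/port/user; point these at a real cluster.
fs = pa.hdfs.connect(host="namenode", port=8020, user="hdfs", driver="libhdfs")

restored = pickle.loads(pickle.dumps(fs))

# The copy is a fresh, open client built from the same connection parameters.
assert restored.is_open
assert (restored.host, restored.port, restored.user) == (fs.host, fs.port, fs.user)
restored.ls("/")
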
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
> HadoopFileSystem isn't pickleable
> ---------------------------------
>
> Key: ARROW-2031
> URL: https://issues.apache.org/jira/browse/ARROW-2031
> Project: Apache Arrow
> Issue Type: Improvement
> Components: Python
> Reporter: Jim Crist
> Priority: Minor
> Labels: pull-request-available
> Fix For: 0.9.0
>
>
> Currently instances of `pa.hdfs.HadoopFileSystem` aren't pickleable.
> Pickling/unpickling of a `HadoopFileSystem` should create a new instance with
> the same connection parameters.
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)