Repository: arrow
Updated Branches:
  refs/heads/master 2660dda40 -> f355354c2
ARROW-1317: [Python] Attempt to set Hadoop CLASSPATH when using JNI

Author: Wes McKinney <[email protected]>

Closes #1040 from wesm/ARROW-1317 and squashes the following commits:

274f839c [Wes McKinney] Add note to documentation
1d664b29 [Wes McKinney] If HADOOP_HOME is not set, see if 'hadoop' is in PATH
c6e52d12 [Wes McKinney] Try to set CLASSPATH if HADOOP_HOME is set but classpath is not

Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/f355354c
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/f355354c
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/f355354c

Branch: refs/heads/master
Commit: f355354c24d58ea86558e8fc46f33b47d402ac04
Parents: 2660dda
Author: Wes McKinney <[email protected]>
Authored: Tue Sep 5 19:17:19 2017 -0400
Committer: Wes McKinney <[email protected]>
Committed: Tue Sep 5 19:17:19 2017 -0400

----------------------------------------------------------------------
 python/doc/source/filesystems.rst |  3 +++
 python/pyarrow/hdfs.py            | 19 +++++++++++++++++++
 2 files changed, 22 insertions(+)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/arrow/blob/f355354c/python/doc/source/filesystems.rst
----------------------------------------------------------------------
diff --git a/python/doc/source/filesystems.rst b/python/doc/source/filesystems.rst
index c0530f9..5c3297b 100644
--- a/python/doc/source/filesystems.rst
+++ b/python/doc/source/filesystems.rst
@@ -54,6 +54,9 @@ LD_LIBRARY_PATH), and relies on some environment variables.
 
    export CLASSPATH=`$HADOOP_HOME/bin/hdfs classpath --glob`
 
+If ``CLASSPATH`` is not set, then it will be set automatically if the
+``hadoop`` executable is in your system path, or if ``HADOOP_HOME`` is set.
+
 You can also use libhdfs3, a thirdparty C++ library for HDFS from Pivotal Labs:
 
 .. code-block:: python

http://git-wip-us.apache.org/repos/asf/arrow/blob/f355354c/python/pyarrow/hdfs.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/hdfs.py b/python/pyarrow/hdfs.py
index 2f20be2..3c9d041 100644
--- a/python/pyarrow/hdfs.py
+++ b/python/pyarrow/hdfs.py
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
+import os
 import posixpath
 
 from pyarrow.util import implements
@@ -30,6 +31,9 @@ class HadoopFileSystem(lib.HadoopFileSystem, FileSystem):
 
     def __init__(self, host="default", port=0, user=None, kerb_ticket=None,
                  driver='libhdfs'):
+        if driver == 'libhdfs':
+            _maybe_set_hadoop_classpath()
+
         self._connect(host, port, user, kerb_ticket, driver)
 
     @implements(FileSystem.isdir)
@@ -105,6 +109,21 @@ class HadoopFileSystem(lib.HadoopFileSystem, FileSystem):
                 yield tup
 
 
+def _maybe_set_hadoop_classpath():
+    import subprocess
+
+    if 'hadoop' in os.environ.get('CLASSPATH', ''):
+        return
+
+    if 'HADOOP_HOME' in os.environ:
+        hadoop_bin = '{0}/bin/hadoop'.format(os.environ['HADOOP_HOME'])
+    else:
+        hadoop_bin = 'hadoop'
+
+    classpath = subprocess.check_output([hadoop_bin, 'classpath', '--glob'])
+    os.environ['CLASSPATH'] = classpath.decode('utf-8')
+
+
 def _libhdfs_walk_files_dirs(top_path, contents):
     files = []
     directories = []
