Repository: arrow Updated Branches: refs/heads/master a4f3259b0 -> fee447510
ARROW-1029: [Python] Fixes for building pyarrow with Parquet support on MSVC. Add to appveyor build. Miscellaneous fixes to build with ``--with-parquet`` and pass the test suite. We still have a bunch of compiler warnings; not blocking for 0.4.0, but after PARQUET-991 is resolved we should fix the MSVC compiler warnings in pyarrow. Author: Wes McKinney <wes.mckin...@twosigma.com> Closes #700 from wesm/ARROW-1029 and squashes the following commits: b651169c [Wes McKinney] Fix post rebase issue 0f438488 [Wes McKinney] Fixes for building pyarrow with Parquet support on MSVC. Add to appveyor build Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/fee44751 Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/fee44751 Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/fee44751 Branch: refs/heads/master Commit: fee44751048847ce1e08039ab72c09918b11b31b Parents: a4f3259 Author: Wes McKinney <wes.mckin...@twosigma.com> Authored: Wed May 17 13:32:08 2017 -0400 Committer: Wes McKinney <wes.mckin...@twosigma.com> Committed: Wed May 17 13:32:08 2017 -0400 ---------------------------------------------------------------------- ci/msvc-build.bat | 32 ++++++++++++++---- python/CMakeLists.txt | 16 +++++---- python/cmake_modules/FindArrow.cmake | 2 -- python/cmake_modules/FindParquet.cmake | 51 ++++++++++++++++++----------- python/doc/source/development.rst | 51 +++++++++++++++++++++++++++-- python/pyarrow/filesystem.py | 8 +++++ python/pyarrow/parquet.py | 2 +- python/pyarrow/tests/test_parquet.py | 6 ++-- 8 files changed, 127 insertions(+), 41 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/fee44751/ci/msvc-build.bat ---------------------------------------------------------------------- diff --git a/ci/msvc-build.bat b/ci/msvc-build.bat index 504da76..f756fc5 100644 --- a/ci/msvc-build.bat +++ b/ci/msvc-build.bat @@ -19,17 
+19,19 @@ conda create -n arrow -q -y python=%PYTHON% ^ six pytest setuptools numpy pandas cython -conda install -n arrow -q -y -c conda-forge flatbuffers rapidjson +conda install -n arrow -q -y -c conda-forge ^ + flatbuffers rapidjson ^ + cmake git boost-cpp thrift-cpp snappy zlib brotli + call activate arrow set ARROW_HOME=%CONDA_PREFIX%\Library -set FLATBUFFERS_HOME=%CONDA_PREFIX%\Library -set RAPIDJSON_HOME=%CONDA_PREFIX%\Library +set ARROW_BUILD_TOOLCHAIN=%CONDA_PREFIX%\Library @rem Build and test Arrow C++ libraries mkdir cpp\build -cd cpp\build +pushd cpp\build cmake -G "%GENERATOR%" ^ -DCMAKE_INSTALL_PREFIX=%CONDA_PREFIX%\Library ^ @@ -44,10 +46,28 @@ cmake --build . --target INSTALL --config Release || exit /B set PYTHONPATH=%CONDA_PREFIX%\Lib;%CONDA_PREFIX%\Lib\site-packages;%CONDA_PREFIX%\python35.zip;%CONDA_PREFIX%\DLLs;%CONDA_PREFIX% ctest -VV || exit /B +popd + +@rem Build parquet-cpp + +git clone https://github.com/apache/parquet-cpp.git || exit /B +mkdir parquet-cpp\build +pushd parquet-cpp\build + +set PARQUET_BUILD_TOOLCHAIN=%CONDA_PREFIX%\Library +set PARQUET_HOME=%CONDA_PREFIX%\Library +cmake -G "%GENERATOR%" ^ + -DCMAKE_INSTALL_PREFIX=%PARQUET_HOME% ^ + -DCMAKE_BUILD_TYPE=Release ^ + -DPARQUET_ZLIB_VENDORED=off ^ + -DPARQUET_BUILD_TESTS=off .. || exit /B +cmake --build . 
--target INSTALL --config Release || exit /B +popd @rem Build and import pyarrow set PYTHONPATH= -cd ..\..\python -python setup.py build_ext --inplace || exit /B +pushd python +python setup.py build_ext --inplace --with-parquet || exit /B py.test pyarrow -v -s || exit /B +popd http://git-wip-us.apache.org/repos/asf/arrow/blob/fee44751/python/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index ded85e8..6f48f7f 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -219,12 +219,6 @@ include_directories(SYSTEM # Dependencies ############################################################ -## Parquet -find_package(Parquet) -if(PARQUET_FOUND) - include_directories(SYSTEM ${PARQUET_INCLUDE_DIR}) -endif() - ## Arrow find_package(Arrow REQUIRED) include_directories(SYSTEM ${ARROW_INCLUDE_DIR}) @@ -286,9 +280,14 @@ set(LINK_LIBS ) if (PYARROW_BUILD_PARQUET) + ## Parquet + find_package(Parquet) + if(NOT (PARQUET_FOUND AND PARQUET_ARROW_FOUND)) message(FATAL_ERROR "Unable to locate Parquet libraries") endif() + include_directories(SYSTEM ${PARQUET_INCLUDE_DIR}) + if (PYARROW_BUNDLE_ARROW_CPP) get_filename_component(PARQUET_LIBRARY_DIR ${PARQUET_SHARED_LIB} DIRECTORY) get_filename_component(PARQUET_LIBRARY_NAME ${PARQUET_SHARED_LIB} NAME_WE) @@ -333,11 +332,14 @@ if (PYARROW_BUILD_PARQUET) #SET(PARQUET_ARROW_SHARED_LIB # ${BUILD_OUTPUT_ROOT_DIRECTORY}/libparquet_arrow${CMAKE_SHARED_LIBRARY_SUFFIX}) endif() + ADD_THIRDPARTY_LIB(parquet + SHARED_LIB ${PARQUET_SHARED_LIB}) ADD_THIRDPARTY_LIB(parquet_arrow SHARED_LIB ${PARQUET_ARROW_SHARED_LIB}) set(LINK_LIBS ${LINK_LIBS} - parquet_arrow) + parquet_shared + parquet_arrow_shared) set(CYTHON_EXTENSIONS ${CYTHON_EXTENSIONS} _parquet) http://git-wip-us.apache.org/repos/asf/arrow/blob/fee44751/python/cmake_modules/FindArrow.cmake ---------------------------------------------------------------------- diff --git 
a/python/cmake_modules/FindArrow.cmake b/python/cmake_modules/FindArrow.cmake index 9fb1355..c16a4bf 100644 --- a/python/cmake_modules/FindArrow.cmake +++ b/python/cmake_modules/FindArrow.cmake @@ -76,10 +76,8 @@ if (ARROW_INCLUDE_DIR AND ARROW_LIBS) if (MSVC) set(ARROW_STATIC_LIB ${ARROW_LIB_PATH}) set(ARROW_PYTHON_STATIC_LIB ${ARROW_PYTHON_LIB_PATH}) - set(ARROW_JEMALLOC_STATIC_LIB ${ARROW_JEMALLOC_LIB_PATH}) set(ARROW_SHARED_LIB ${ARROW_STATIC_LIB}) set(ARROW_PYTHON_SHARED_LIB ${ARROW_PYTHON_STATIC_LIB}) - set(ARROW_JEMALLOC_SHARED_LIB ${ARROW_JEMALLOC_STATIC_LIB}) else() set(ARROW_STATIC_LIB ${ARROW_PYTHON_LIB_PATH}/libarrow.a) set(ARROW_PYTHON_STATIC_LIB ${ARROW_PYTHON_LIB_PATH}/libarrow_python.a) http://git-wip-us.apache.org/repos/asf/arrow/blob/fee44751/python/cmake_modules/FindParquet.cmake ---------------------------------------------------------------------- diff --git a/python/cmake_modules/FindParquet.cmake b/python/cmake_modules/FindParquet.cmake index ffd043d..de53a29 100644 --- a/python/cmake_modules/FindParquet.cmake +++ b/python/cmake_modules/FindParquet.cmake @@ -1,16 +1,19 @@ -# Copyright 2012 Cloudera Inc. +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at # -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 # -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. # - Find PARQUET (parquet/parquet.h, libparquet.a, libparquet.so) # This module defines @@ -71,9 +74,14 @@ endif() if (PARQUET_INCLUDE_DIR AND PARQUET_LIBRARIES) set(PARQUET_FOUND TRUE) - set(PARQUET_LIB_NAME libparquet) - set(PARQUET_STATIC_LIB ${PARQUET_LIBS}/${PARQUET_LIB_NAME}.a) - set(PARQUET_SHARED_LIB ${PARQUET_LIBS}/${PARQUET_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}) + if (MSVC) + set(PARQUET_STATIC_LIB "${PARQUET_LIBRARIES}_static") + set(PARQUET_SHARED_LIB "${PARQUET_LIBRARIES}") + else() + set(PARQUET_LIB_NAME libparquet) + set(PARQUET_STATIC_LIB ${PARQUET_LIBS}/${PARQUET_LIB_NAME}.a) + set(PARQUET_SHARED_LIB ${PARQUET_LIBS}/${PARQUET_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}) + endif() else () set(PARQUET_FOUND FALSE) endif () @@ -81,11 +89,16 @@ endif () if (PARQUET_INCLUDE_DIR AND PARQUET_ARROW_LIBRARIES) set(PARQUET_ARROW_FOUND TRUE) get_filename_component(PARQUET_ARROW_LIBS ${PARQUET_ARROW_LIBRARIES} PATH) - set(PARQUET_ARROW_LIB_NAME libparquet_arrow) - set(PARQUET_ARROW_STATIC_LIB - ${PARQUET_ARROW_LIBS}/${PARQUET_ARROW_LIB_NAME}.a) - set(PARQUET_ARROW_SHARED_LIB - ${PARQUET_ARROW_LIBS}/${PARQUET_ARROW_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}) + if (MSVC) + 
set(PARQUET_ARROW_STATIC_LIB "${PARQUET_ARROW_LIBRARIES}_static") + set(PARQUET_ARROW_SHARED_LIB "${PARQUET_ARROW_LIBRARIES}") + else() + set(PARQUET_ARROW_LIB_NAME libparquet_arrow) + set(PARQUET_ARROW_STATIC_LIB + ${PARQUET_ARROW_LIBS}/${PARQUET_ARROW_LIB_NAME}.a) + set(PARQUET_ARROW_SHARED_LIB + ${PARQUET_ARROW_LIBS}/${PARQUET_ARROW_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}) + endif() else () set(PARQUET_ARROW_FOUND FALSE) endif () http://git-wip-us.apache.org/repos/asf/arrow/blob/fee44751/python/doc/source/development.rst ---------------------------------------------------------------------- diff --git a/python/doc/source/development.rst b/python/doc/source/development.rst index 440c1c4..6ec563e 100644 --- a/python/doc/source/development.rst +++ b/python/doc/source/development.rst @@ -174,14 +174,37 @@ You should be able to run the unit tests with: Windows ======= -First, make sure you can `build the C++ library <https://github.com/apache/arrow/blob/master/cpp/doc/Windows.md>`_. +First, we bootstrap a conda environment similar to the `C++ build instructions +<https://github.com/apache/arrow/blob/master/cpp/doc/Windows.md>`_. This +includes all the dependencies for Arrow and the Apache Parquet C++ libraries. -Now, we need to build and install the C++ libraries someplace. +First, starting from fresh clones of Apache Arrow and parquet-cpp: + +.. code-block:: shell + + git clone https://github.com/apache/arrow.git + git clone https://github.com/apache/parquet-cpp.git + +.. code-block:: shell + + conda create -n arrow-dev cmake git boost-cpp ^ + flatbuffers snappy zlib brotli thrift-cpp rapidjson + activate arrow-dev + +As one git housekeeping item, we must run this command in our Arrow clone: + +.. code-block:: shell + + cd arrow + git config core.symlinks true + +Now, we build and install Arrow C++ libraries .. 
code-block:: shell mkdir cpp\build cd cpp\build + set ARROW_BUILD_TOOLCHAIN=%CONDA_PREFIX%\Library set ARROW_HOME=C:\thirdparty cmake -G "Visual Studio 14 2015 Win64" ^ -DCMAKE_INSTALL_PREFIX=%ARROW_HOME% ^ @@ -191,6 +214,22 @@ Now, we need to build and install the C++ libraries someplace. cmake --build . --target INSTALL --config Release cd ..\.. +Now, we build parquet-cpp and install the result in the same place: + +.. code-block:: shell + + mkdir ..\parquet-cpp\build + pushd ..\parquet-cpp\build + set PARQUET_BUILD_TOOLCHAIN=%CONDA_PREFIX%\Library + set PARQUET_HOME=C:\thirdparty + cmake -G "Visual Studio 14 2015 Win64" ^ + -DCMAKE_INSTALL_PREFIX=%PARQUET_HOME% ^ + -DCMAKE_BUILD_TYPE=Release ^ + -DPARQUET_ZLIB_VENDORED=off ^ + -DPARQUET_BUILD_TESTS=off .. + cmake --build . --target INSTALL --config Release + popd + After that, we must put the install directory's bin path in our ``%PATH%``: .. code-block:: shell @@ -202,7 +241,13 @@ Now, we can build pyarrow: .. code-block:: shell cd python - python setup.py build_ext --inplace + python setup.py build_ext --inplace --with-parquet + +Then run the unit tests with: + +.. 
code-block:: shell + + py.test pyarrow -v Running C++ unit tests with Python ---------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/fee44751/python/pyarrow/filesystem.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/filesystem.py b/python/pyarrow/filesystem.py index ac37fd8..798d96b 100644 --- a/python/pyarrow/filesystem.py +++ b/python/pyarrow/filesystem.py @@ -92,6 +92,10 @@ class Filesystem(object): filesystem=self) return dataset.read(columns=columns, nthreads=nthreads) + @property + def pathsep(self): + return '/' + class LocalFilesystem(Filesystem): @@ -132,6 +136,10 @@ class LocalFilesystem(Filesystem): """ return open(path, mode=mode) + @property + def pathsep(self): + return os.path.sep + class HdfsClient(lib._HdfsClient, Filesystem): """ http://git-wip-us.apache.org/repos/asf/arrow/blob/fee44751/python/pyarrow/parquet.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index e69d85e..f59a719 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -582,7 +582,7 @@ def _make_manifest(path_or_paths, fs, pathsep='/'): if is_string(path_or_paths) and fs.isdir(path_or_paths): manifest = ParquetManifest(path_or_paths, filesystem=fs, - pathsep=pathsep) + pathsep=fs.pathsep) metadata_path = manifest.metadata_path pieces = manifest.pieces partitions = manifest.partitions http://git-wip-us.apache.org/repos/asf/arrow/blob/fee44751/python/pyarrow/tests/test_parquet.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index db446d3..5f65f28 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -192,10 +192,10 @@ def test_pandas_column_selection(tmpdir): def _random_integers(size, dtype): # We do not generate integers outside the 
int64 range - i64_info = np.iinfo('int64') + platform_int_info = np.iinfo('int_') iinfo = np.iinfo(dtype) - return np.random.randint(max(iinfo.min, i64_info.min), - min(iinfo.max, i64_info.max), + return np.random.randint(max(iinfo.min, platform_int_info.min), + min(iinfo.max, platform_int_info.max), size=size).astype(dtype)