Repository: arrow
Updated Branches:
  refs/heads/master a4f3259b0 -> fee447510


ARROW-1029: [Python] Fixes for building pyarrow with Parquet support on MSVC. 
Add to appveyor build

Miscellaneous fixes to build with ``--with-parquet`` and pass the test suite.
We still have a bunch of compiler warnings; these are not blocking for 0.4.0, but
after PARQUET-991 is resolved we should fix the MSVC compiler warnings in pyarrow.

Author: Wes McKinney <wes.mckin...@twosigma.com>

Closes #700 from wesm/ARROW-1029 and squashes the following commits:

b651169c [Wes McKinney] Fix post rebase issue
0f438488 [Wes McKinney] Fixes for building pyarrow with Parquet support on MSVC. Add to appveyor build


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/fee44751
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/fee44751
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/fee44751

Branch: refs/heads/master
Commit: fee44751048847ce1e08039ab72c09918b11b31b
Parents: a4f3259
Author: Wes McKinney <wes.mckin...@twosigma.com>
Authored: Wed May 17 13:32:08 2017 -0400
Committer: Wes McKinney <wes.mckin...@twosigma.com>
Committed: Wed May 17 13:32:08 2017 -0400

----------------------------------------------------------------------
 ci/msvc-build.bat                      | 32 ++++++++++++++----
 python/CMakeLists.txt                  | 16 +++++----
 python/cmake_modules/FindArrow.cmake   |  2 --
 python/cmake_modules/FindParquet.cmake | 51 ++++++++++++++++++-----------
 python/doc/source/development.rst      | 51 +++++++++++++++++++++++++++--
 python/pyarrow/filesystem.py           |  8 +++++
 python/pyarrow/parquet.py              |  2 +-
 python/pyarrow/tests/test_parquet.py   |  6 ++--
 8 files changed, 127 insertions(+), 41 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/fee44751/ci/msvc-build.bat
----------------------------------------------------------------------
diff --git a/ci/msvc-build.bat b/ci/msvc-build.bat
index 504da76..f756fc5 100644
--- a/ci/msvc-build.bat
+++ b/ci/msvc-build.bat
@@ -19,17 +19,19 @@
 
 conda create -n arrow -q -y python=%PYTHON% ^
       six pytest setuptools numpy pandas cython
-conda install -n arrow -q -y -c conda-forge flatbuffers rapidjson
+conda install -n arrow -q -y -c conda-forge ^
+      flatbuffers rapidjson ^
+      cmake git boost-cpp thrift-cpp snappy zlib brotli
+
 call activate arrow
 
 set ARROW_HOME=%CONDA_PREFIX%\Library
-set FLATBUFFERS_HOME=%CONDA_PREFIX%\Library
-set RAPIDJSON_HOME=%CONDA_PREFIX%\Library
+set ARROW_BUILD_TOOLCHAIN=%CONDA_PREFIX%\Library
 
 @rem Build and test Arrow C++ libraries
 
 mkdir cpp\build
-cd cpp\build
+pushd cpp\build
 
 cmake -G "%GENERATOR%" ^
       -DCMAKE_INSTALL_PREFIX=%CONDA_PREFIX%\Library ^
@@ -44,10 +46,28 @@ cmake --build . --target INSTALL --config Release  || exit /B
 set PYTHONPATH=%CONDA_PREFIX%\Lib;%CONDA_PREFIX%\Lib\site-packages;%CONDA_PREFIX%\python35.zip;%CONDA_PREFIX%\DLLs;%CONDA_PREFIX%
 
 ctest -VV  || exit /B
+popd
+
+@rem Build parquet-cpp
+
+git clone https://github.com/apache/parquet-cpp.git || exit /B
+mkdir parquet-cpp\build
+pushd parquet-cpp\build
+
+set PARQUET_BUILD_TOOLCHAIN=%CONDA_PREFIX%\Library
+set PARQUET_HOME=%CONDA_PREFIX%\Library
+cmake -G "%GENERATOR%" ^
+     -DCMAKE_INSTALL_PREFIX=%PARQUET_HOME% ^
+     -DCMAKE_BUILD_TYPE=Release ^
+     -DPARQUET_ZLIB_VENDORED=off ^
+     -DPARQUET_BUILD_TESTS=off .. || exit /B
+cmake --build . --target INSTALL --config Release || exit /B
+popd
 
 @rem Build and import pyarrow
 set PYTHONPATH=
 
-cd ..\..\python
-python setup.py build_ext --inplace  || exit /B
+pushd python
+python setup.py build_ext --inplace --with-parquet  || exit /B
 py.test pyarrow -v -s || exit /B
+popd

http://git-wip-us.apache.org/repos/asf/arrow/blob/fee44751/python/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index ded85e8..6f48f7f 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -219,12 +219,6 @@ include_directories(SYSTEM
 # Dependencies
 ############################################################
 
-## Parquet
-find_package(Parquet)
-if(PARQUET_FOUND)
-  include_directories(SYSTEM ${PARQUET_INCLUDE_DIR})
-endif()
-
 ## Arrow
 find_package(Arrow REQUIRED)
 include_directories(SYSTEM ${ARROW_INCLUDE_DIR})
@@ -286,9 +280,14 @@ set(LINK_LIBS
 )
 
 if (PYARROW_BUILD_PARQUET)
+  ## Parquet
+  find_package(Parquet)
+
   if(NOT (PARQUET_FOUND AND PARQUET_ARROW_FOUND))
     message(FATAL_ERROR "Unable to locate Parquet libraries")
   endif()
+  include_directories(SYSTEM ${PARQUET_INCLUDE_DIR})
+
   if (PYARROW_BUNDLE_ARROW_CPP)
       get_filename_component(PARQUET_LIBRARY_DIR ${PARQUET_SHARED_LIB} DIRECTORY)
       get_filename_component(PARQUET_LIBRARY_NAME ${PARQUET_SHARED_LIB} NAME_WE)
@@ -333,11 +332,14 @@ if (PYARROW_BUILD_PARQUET)
       #SET(PARQUET_ARROW_SHARED_LIB
       #    ${BUILD_OUTPUT_ROOT_DIRECTORY}/libparquet_arrow${CMAKE_SHARED_LIBRARY_SUFFIX})
   endif()
+  ADD_THIRDPARTY_LIB(parquet
+    SHARED_LIB ${PARQUET_SHARED_LIB})
   ADD_THIRDPARTY_LIB(parquet_arrow
     SHARED_LIB ${PARQUET_ARROW_SHARED_LIB})
   set(LINK_LIBS
     ${LINK_LIBS}
-    parquet_arrow)
+    parquet_shared
+    parquet_arrow_shared)
   set(CYTHON_EXTENSIONS
     ${CYTHON_EXTENSIONS}
     _parquet)

http://git-wip-us.apache.org/repos/asf/arrow/blob/fee44751/python/cmake_modules/FindArrow.cmake
----------------------------------------------------------------------
diff --git a/python/cmake_modules/FindArrow.cmake b/python/cmake_modules/FindArrow.cmake
index 9fb1355..c16a4bf 100644
--- a/python/cmake_modules/FindArrow.cmake
+++ b/python/cmake_modules/FindArrow.cmake
@@ -76,10 +76,8 @@ if (ARROW_INCLUDE_DIR AND ARROW_LIBS)
   if (MSVC)
     set(ARROW_STATIC_LIB ${ARROW_LIB_PATH})
     set(ARROW_PYTHON_STATIC_LIB ${ARROW_PYTHON_LIB_PATH})
-    set(ARROW_JEMALLOC_STATIC_LIB ${ARROW_JEMALLOC_LIB_PATH})
     set(ARROW_SHARED_LIB ${ARROW_STATIC_LIB})
     set(ARROW_PYTHON_SHARED_LIB ${ARROW_PYTHON_STATIC_LIB})
-    set(ARROW_JEMALLOC_SHARED_LIB ${ARROW_JEMALLOC_STATIC_LIB})
   else()
     set(ARROW_STATIC_LIB ${ARROW_PYTHON_LIB_PATH}/libarrow.a)
     set(ARROW_PYTHON_STATIC_LIB ${ARROW_PYTHON_LIB_PATH}/libarrow_python.a)

http://git-wip-us.apache.org/repos/asf/arrow/blob/fee44751/python/cmake_modules/FindParquet.cmake
----------------------------------------------------------------------
diff --git a/python/cmake_modules/FindParquet.cmake b/python/cmake_modules/FindParquet.cmake
index ffd043d..de53a29 100644
--- a/python/cmake_modules/FindParquet.cmake
+++ b/python/cmake_modules/FindParquet.cmake
@@ -1,16 +1,19 @@
-# Copyright 2012 Cloudera Inc.
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
 #
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
+#   http://www.apache.org/licenses/LICENSE-2.0
 #
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
 
 # - Find PARQUET (parquet/parquet.h, libparquet.a, libparquet.so)
 # This module defines
@@ -71,9 +74,14 @@ endif()
 
 if (PARQUET_INCLUDE_DIR AND PARQUET_LIBRARIES)
   set(PARQUET_FOUND TRUE)
-  set(PARQUET_LIB_NAME libparquet)
-  set(PARQUET_STATIC_LIB ${PARQUET_LIBS}/${PARQUET_LIB_NAME}.a)
-  set(PARQUET_SHARED_LIB ${PARQUET_LIBS}/${PARQUET_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX})
+  if (MSVC)
+    set(PARQUET_STATIC_LIB "${PARQUET_LIBRARIES}_static")
+    set(PARQUET_SHARED_LIB "${PARQUET_LIBRARIES}")
+  else()
+    set(PARQUET_LIB_NAME libparquet)
+    set(PARQUET_STATIC_LIB ${PARQUET_LIBS}/${PARQUET_LIB_NAME}.a)
+    set(PARQUET_SHARED_LIB ${PARQUET_LIBS}/${PARQUET_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX})
+  endif()
 else ()
   set(PARQUET_FOUND FALSE)
 endif ()
@@ -81,11 +89,16 @@ endif ()
 if (PARQUET_INCLUDE_DIR AND PARQUET_ARROW_LIBRARIES)
   set(PARQUET_ARROW_FOUND TRUE)
   get_filename_component(PARQUET_ARROW_LIBS ${PARQUET_ARROW_LIBRARIES} PATH)
-  set(PARQUET_ARROW_LIB_NAME libparquet_arrow)
-  set(PARQUET_ARROW_STATIC_LIB
-    ${PARQUET_ARROW_LIBS}/${PARQUET_ARROW_LIB_NAME}.a)
-  set(PARQUET_ARROW_SHARED_LIB
-    ${PARQUET_ARROW_LIBS}/${PARQUET_ARROW_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX})
+  if (MSVC)
+    set(PARQUET_ARROW_STATIC_LIB "${PARQUET_ARROW_LIBRARIES}_static")
+    set(PARQUET_ARROW_SHARED_LIB "${PARQUET_ARROW_LIBRARIES}")
+  else()
+    set(PARQUET_ARROW_LIB_NAME libparquet_arrow)
+    set(PARQUET_ARROW_STATIC_LIB
+      ${PARQUET_ARROW_LIBS}/${PARQUET_ARROW_LIB_NAME}.a)
+    set(PARQUET_ARROW_SHARED_LIB
+      ${PARQUET_ARROW_LIBS}/${PARQUET_ARROW_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX})
+  endif()
 else ()
   set(PARQUET_ARROW_FOUND FALSE)
 endif ()

http://git-wip-us.apache.org/repos/asf/arrow/blob/fee44751/python/doc/source/development.rst
----------------------------------------------------------------------
diff --git a/python/doc/source/development.rst b/python/doc/source/development.rst
index 440c1c4..6ec563e 100644
--- a/python/doc/source/development.rst
+++ b/python/doc/source/development.rst
@@ -174,14 +174,37 @@ You should be able to run the unit tests with:
 Windows
 =======
 
-First, make sure you can `build the C++ library <https://github.com/apache/arrow/blob/master/cpp/doc/Windows.md>`_.
+First, we bootstrap a conda environment similar to the `C++ build instructions
+<https://github.com/apache/arrow/blob/master/cpp/doc/Windows.md>`_. This
+includes all the dependencies for Arrow and the Apache Parquet C++ libraries.
 
-Now, we need to build and install the C++ libraries someplace.
+First, starting from fresh clones of Apache Arrow and parquet-cpp:
+
+.. code-block:: shell
+
+   git clone https://github.com/apache/arrow.git
+   git clone https://github.com/apache/parquet-cpp.git
+
+.. code-block:: shell
+
+   conda create -n arrow-dev cmake git boost-cpp ^
+         flatbuffers snappy zlib brotli thrift-cpp rapidjson
+   activate arrow-dev
+
+As one git housekeeping item, we must run this command in our Arrow clone:
+
+.. code-block:: shell
+
+   cd arrow
+   git config core.symlinks true
+
+Now, we build and install Arrow C++ libraries
 
 .. code-block:: shell
 
    mkdir cpp\build
    cd cpp\build
+   set ARROW_BUILD_TOOLCHAIN=%CONDA_PREFIX%\Library
    set ARROW_HOME=C:\thirdparty
    cmake -G "Visual Studio 14 2015 Win64" ^
          -DCMAKE_INSTALL_PREFIX=%ARROW_HOME% ^
@@ -191,6 +214,22 @@ Now, we need to build and install the C++ libraries someplace.
    cmake --build . --target INSTALL --config Release
    cd ..\..
 
+Now, we build parquet-cpp and install the result in the same place:
+
+.. code-block:: shell
+
+   mkdir ..\parquet-cpp\build
+   pushd ..\parquet-cpp\build
+   set PARQUET_BUILD_TOOLCHAIN=%CONDA_PREFIX%\Library
+   set PARQUET_HOME=C:\thirdparty
+   cmake -G "Visual Studio 14 2015 Win64" ^
+         -DCMAKE_INSTALL_PREFIX=%PARQUET_HOME% ^
+         -DCMAKE_BUILD_TYPE=Release ^
+         -DPARQUET_ZLIB_VENDORED=off ^
+         -DPARQUET_BUILD_TESTS=off ..
+   cmake --build . --target INSTALL --config Release
+   popd
+
 After that, we must put the install directory's bin path in our ``%PATH%``:
 
 .. code-block:: shell
@@ -202,7 +241,13 @@ Now, we can build pyarrow:
 .. code-block:: shell
 
    cd python
-   python setup.py build_ext --inplace
+   python setup.py build_ext --inplace --with-parquet
+
+Then run the unit tests with:
+
+.. code-block:: shell
+
+   py.test pyarrow -v
 
 Running C++ unit tests with Python
 ----------------------------------

http://git-wip-us.apache.org/repos/asf/arrow/blob/fee44751/python/pyarrow/filesystem.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/filesystem.py b/python/pyarrow/filesystem.py
index ac37fd8..798d96b 100644
--- a/python/pyarrow/filesystem.py
+++ b/python/pyarrow/filesystem.py
@@ -92,6 +92,10 @@ class Filesystem(object):
                                  filesystem=self)
         return dataset.read(columns=columns, nthreads=nthreads)
 
+    @property
+    def pathsep(self):
+        return '/'
+
 
 class LocalFilesystem(Filesystem):
 
@@ -132,6 +136,10 @@ class LocalFilesystem(Filesystem):
         """
         return open(path, mode=mode)
 
+    @property
+    def pathsep(self):
+        return os.path.sep
+
 
 class HdfsClient(lib._HdfsClient, Filesystem):
     """

http://git-wip-us.apache.org/repos/asf/arrow/blob/fee44751/python/pyarrow/parquet.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py
index e69d85e..f59a719 100644
--- a/python/pyarrow/parquet.py
+++ b/python/pyarrow/parquet.py
@@ -582,7 +582,7 @@ def _make_manifest(path_or_paths, fs, pathsep='/'):
 
     if is_string(path_or_paths) and fs.isdir(path_or_paths):
         manifest = ParquetManifest(path_or_paths, filesystem=fs,
-                                   pathsep=pathsep)
+                                   pathsep=fs.pathsep)
         metadata_path = manifest.metadata_path
         pieces = manifest.pieces
         partitions = manifest.partitions

http://git-wip-us.apache.org/repos/asf/arrow/blob/fee44751/python/pyarrow/tests/test_parquet.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index db446d3..5f65f28 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -192,10 +192,10 @@ def test_pandas_column_selection(tmpdir):
 
 def _random_integers(size, dtype):
     # We do not generate integers outside the int64 range
-    i64_info = np.iinfo('int64')
+    platform_int_info = np.iinfo('int_')
     iinfo = np.iinfo(dtype)
-    return np.random.randint(max(iinfo.min, i64_info.min),
-                             min(iinfo.max, i64_info.max),
+    return np.random.randint(max(iinfo.min, platform_int_info.min),
+                             min(iinfo.max, platform_int_info.max),
                              size=size).astype(dtype)
 
 

Reply via email to