This is an automated email from the ASF dual-hosted git repository.
jorisvandenbossche pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 1c546fb3c1 GH-41480: [Python] Building PyArrow: enable/disable python
components by default based on availability in Arrow C++ (#41494)
1c546fb3c1 is described below
commit 1c546fb3c130fc6a4f3e06ad31dc49d923785104
Author: Joris Van den Bossche <[email protected]>
AuthorDate: Thu May 16 14:15:57 2024 +0200
GH-41480: [Python] Building PyArrow: enable/disable python components by
default based on availability in Arrow C++ (#41494)
### Rationale for this change
Currently, when building pyarrow from source, one needs to manually enable
the optional components by setting `PYARROW_WITH_...` environment
variables. However, we could also make a default choice of components based on
which ones were enabled in the Arrow C++ build.
### What changes are included in this PR?
Set defaults for the various `PYARROW_BUILD_<component>` based on the
`ARROW_<component>` setting. Keep the current `PYARROW_WITH_<component>`
environment variables working so that users can override this default.
### Are there any user-facing changes?
No
* GitHub Issue: #41480
Lead-authored-by: Joris Van den Bossche <[email protected]>
Co-authored-by: Sutou Kouhei <[email protected]>
Signed-off-by: Joris Van den Bossche <[email protected]>
---
ci/appveyor-cpp-build.bat | 1 -
python/CMakeLists.txt | 115 +++++++++++++++++++++++++++++----------
python/setup.py | 134 +++++++++++++---------------------------------
3 files changed, 123 insertions(+), 127 deletions(-)
diff --git a/ci/appveyor-cpp-build.bat b/ci/appveyor-cpp-build.bat
index 8cfa67c437..f688fbb63a 100644
--- a/ci/appveyor-cpp-build.bat
+++ b/ci/appveyor-cpp-build.bat
@@ -129,7 +129,6 @@ set PYARROW_WITH_ORC=%ARROW_ORC%
set PYARROW_WITH_PARQUET=ON
set PYARROW_WITH_PARQUET_ENCRYPTION=ON
set PYARROW_WITH_S3=%ARROW_S3%
-set PYARROW_WITH_STATIC_BOOST=ON
set PYARROW_WITH_SUBSTRAIT=ON
set ARROW_HOME=%CONDA_PREFIX%\Library
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 212862357a..07acb9e31a 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -108,25 +108,6 @@ if(UNIX)
endif()
endif()
-# Top level cmake dir
-if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}")
- option(PYARROW_BUILD_ACERO "Build the PyArrow Acero integration" OFF)
- option(PYARROW_BUILD_CUDA "Build the PyArrow CUDA support" OFF)
- option(PYARROW_BUILD_DATASET "Build the PyArrow Dataset integration" OFF)
- option(PYARROW_BUILD_FLIGHT "Build the PyArrow Flight integration" OFF)
- option(PYARROW_BUILD_GANDIVA "Build the PyArrow Gandiva integration" OFF)
- option(PYARROW_BUILD_ORC "Build the PyArrow ORC integration" OFF)
- option(PYARROW_BUILD_PARQUET "Build the PyArrow Parquet integration" OFF)
- option(PYARROW_BUILD_PARQUET_ENCRYPTION
- "Build the PyArrow Parquet encryption integration" OFF)
- option(PYARROW_BUNDLE_ARROW_CPP "Bundle the Arrow C++ libraries" OFF)
- option(PYARROW_BUNDLE_CYTHON_CPP "Bundle the C++ files generated by Cython"
OFF)
- option(PYARROW_GENERATE_COVERAGE "Build with Cython code coverage enabled"
OFF)
- set(PYARROW_CXXFLAGS
- ""
- CACHE STRING "Compiler flags to append when compiling Arrow")
-endif()
-
find_program(CCACHE_FOUND ccache)
if(CCACHE_FOUND
AND NOT CMAKE_C_COMPILER_LAUNCHER
@@ -265,11 +246,70 @@ message(STATUS "NumPy include dir: ${NUMPY_INCLUDE_DIRS}")
include(UseCython)
-# PyArrow C++
+# Arrow C++ and set default PyArrow build options
include(GNUInstallDirs)
-
find_package(Arrow REQUIRED)
+macro(define_option name description arrow_option)
+ set("PYARROW_${name}"
+ "AUTO"
+ CACHE STRING ${description})
+
+ if("${PYARROW_${name}}" STREQUAL "AUTO")
+ # by default, first check if env variable exists, otherwise use Arrow C++ config
+ set(env_variable "PYARROW_WITH_${name}")
+ if(DEFINED ENV{${env_variable}})
+ if($ENV{${env_variable}})
+ set("PYARROW_BUILD_${name}" ON)
+ else()
+ set("PYARROW_BUILD_${name}" OFF)
+ endif()
+ else()
+ if(${arrow_option})
+ set("PYARROW_BUILD_${name}" ON)
+ else()
+ set("PYARROW_BUILD_${name}" OFF)
+ endif()
+ endif()
+ else()
+ if("${PYARROW_${name}}")
+ set("PYARROW_BUILD_${name}" ON)
+ else()
+ set("PYARROW_BUILD_${name}" OFF)
+ endif()
+ endif()
+endmacro()
+
+define_option(ACERO "Build the PyArrow Acero integration" ARROW_ACERO)
+define_option(CUDA "Build the PyArrow CUDA support" ARROW_CUDA)
+define_option(DATASET "Build the PyArrow Dataset integration" ARROW_DATASET)
+define_option(FLIGHT "Build the PyArrow Flight integration" ARROW_FLIGHT)
+define_option(GANDIVA "Build the PyArrow Gandiva integration" ARROW_GANDIVA)
+define_option(ORC "Build the PyArrow ORC integration" ARROW_ORC)
+define_option(PARQUET "Build the PyArrow Parquet integration" ARROW_PARQUET)
+define_option(PARQUET_ENCRYPTION "Build the PyArrow Parquet encryption integration"
+ PARQUET_REQUIRE_ENCRYPTION)
+define_option(SUBSTRAIT "Build the PyArrow Substrait integration"
ARROW_SUBSTRAIT)
+define_option(AZURE "Build the PyArrow Azure integration" ARROW_AZURE)
+define_option(GCS "Build the PyArrow GCS integration" ARROW_GCS)
+define_option(S3 "Build the PyArrow S3 integration" ARROW_S3)
+define_option(HDFS "Build the PyArrow HDFS integration" ARROW_HDFS)
+option(PYARROW_BUNDLE_ARROW_CPP "Bundle the Arrow C++ libraries" OFF)
+option(PYARROW_BUNDLE_CYTHON_CPP "Bundle the C++ files generated by Cython"
OFF)
+option(PYARROW_GENERATE_COVERAGE "Build with Cython code coverage enabled" OFF)
+set(PYARROW_CXXFLAGS
+ ""
+ CACHE STRING "Compiler flags to append when compiling PyArrow C++")
+
+# enforce module dependencies
+if(PYARROW_BUILD_SUBSTRAIT)
+ set(PYARROW_BUILD_DATASET ON)
+endif()
+if(PYARROW_BUILD_DATASET)
+ set(PYARROW_BUILD_ACERO ON)
+endif()
+
+# PyArrow C++
set(PYARROW_CPP_ROOT_DIR pyarrow/src)
set(PYARROW_CPP_SOURCE_DIR ${PYARROW_CPP_ROOT_DIR}/arrow/python)
set(PYARROW_CPP_SRCS
@@ -305,6 +345,7 @@ set(PYARROW_CPP_LINK_LIBS "")
# Check all the options from Arrow and PyArrow C++ to be in line
if(PYARROW_BUILD_DATASET)
+ message(STATUS "Building PyArrow with Dataset")
if(NOT ARROW_DATASET)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_DATASET=ON")
endif()
@@ -317,6 +358,7 @@ if(PYARROW_BUILD_DATASET)
endif()
if(PYARROW_BUILD_ACERO)
+ message(STATUS "Building PyArrow with Acero")
if(NOT ARROW_ACERO)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_ACERO=ON")
endif()
@@ -329,18 +371,13 @@ if(PYARROW_BUILD_ACERO)
endif()
if(PYARROW_BUILD_PARQUET OR PYARROW_BUILD_PARQUET_ENCRYPTION)
+ message(STATUS "Building PyArrow with Parquet")
if(NOT ARROW_PARQUET)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_PARQUET=ON")
endif()
find_package(Parquet REQUIRED)
endif()
-if(PYARROW_BUILD_HDFS)
- if(NOT ARROW_HDFS)
- message(FATAL_ERROR "You must build Arrow C++ with ARROW_HDFS=ON")
- endif()
-endif()
-
# Check for only Arrow C++ options
if(ARROW_CSV)
list(APPEND PYARROW_CPP_SRCS ${PYARROW_CPP_SOURCE_DIR}/csv.cc)
@@ -400,6 +437,7 @@ endif()
set(PYARROW_CPP_FLIGHT_SRCS ${PYARROW_CPP_SOURCE_DIR}/flight.cc)
if(PYARROW_BUILD_FLIGHT)
+ message(STATUS "Building PyArrow with Flight")
if(NOT ARROW_FLIGHT)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_FLIGHT=ON")
endif()
@@ -555,23 +593,39 @@ set_source_files_properties(pyarrow/lib.pyx PROPERTIES
CYTHON_API TRUE)
set(LINK_LIBS arrow_python)
if(PYARROW_BUILD_AZURE)
+ message(STATUS "Building PyArrow with Azure")
+ if(NOT ARROW_AZURE)
+ message(FATAL_ERROR "You must build Arrow C++ with ARROW_AZURE=ON")
+ endif()
list(APPEND CYTHON_EXTENSIONS _azurefs)
endif()
if(PYARROW_BUILD_GCS)
+ message(STATUS "Building PyArrow with GCS")
+ if(NOT ARROW_GCS)
+ message(FATAL_ERROR "You must build Arrow C++ with ARROW_GCS=ON")
+ endif()
list(APPEND CYTHON_EXTENSIONS _gcsfs)
endif()
if(PYARROW_BUILD_S3)
+ message(STATUS "Building PyArrow with S3")
+ if(NOT ARROW_S3)
+ message(FATAL_ERROR "You must build Arrow C++ with ARROW_S3=ON")
+ endif()
list(APPEND CYTHON_EXTENSIONS _s3fs)
endif()
if(PYARROW_BUILD_HDFS)
+ message(STATUS "Building PyArrow with HDFS")
+ if(NOT ARROW_HDFS)
+ message(FATAL_ERROR "You must build Arrow C++ with ARROW_HDFS=ON")
+ endif()
list(APPEND CYTHON_EXTENSIONS _hdfs)
endif()
if(PYARROW_BUILD_CUDA)
- # Arrow CUDA
+ message(STATUS "Building PyArrow with CUDA")
if(NOT ARROW_CUDA)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_CUDA=ON")
endif()
@@ -646,8 +700,9 @@ if(PYARROW_BUILD_PARQUET)
endif()
endif()
+# ORC
if(PYARROW_BUILD_ORC)
- # ORC
+ message(STATUS "Building PyArrow with ORC")
if(NOT ARROW_ORC)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_ORC=ON")
endif()
@@ -679,6 +734,7 @@ endif()
# Substrait
if(PYARROW_BUILD_SUBSTRAIT)
+ message(STATUS "Building PyArrow with Substrait")
if(NOT ARROW_SUBSTRAIT)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_SUBSTRAIT=ON")
endif()
@@ -696,6 +752,7 @@ endif()
# Gandiva
if(PYARROW_BUILD_GANDIVA)
+ message(STATUS "Building PyArrow with Gandiva")
if(NOT ARROW_GANDIVA)
message(FATAL_ERROR "You must build Arrow C++ with ARROW_GANDIVA=ON")
endif()
diff --git a/python/setup.py b/python/setup.py
index 6f3dddb29d..ed2b7961e5 100755
--- a/python/setup.py
+++ b/python/setup.py
@@ -152,32 +152,20 @@ class build_ext(_build_ext):
if not hasattr(sys, 'gettotalrefcount'):
self.build_type = 'release'
- self.with_azure = strtobool(
- os.environ.get('PYARROW_WITH_AZURE', '0'))
- self.with_gcs = strtobool(
- os.environ.get('PYARROW_WITH_GCS', '0'))
- self.with_s3 = strtobool(
- os.environ.get('PYARROW_WITH_S3', '0'))
- self.with_hdfs = strtobool(
- os.environ.get('PYARROW_WITH_HDFS', '0'))
- self.with_cuda = strtobool(
- os.environ.get('PYARROW_WITH_CUDA', '0'))
- self.with_substrait = strtobool(
- os.environ.get('PYARROW_WITH_SUBSTRAIT', '0'))
- self.with_flight = strtobool(
- os.environ.get('PYARROW_WITH_FLIGHT', '0'))
- self.with_acero = strtobool(
- os.environ.get('PYARROW_WITH_ACERO', '0'))
- self.with_dataset = strtobool(
- os.environ.get('PYARROW_WITH_DATASET', '0'))
- self.with_parquet = strtobool(
- os.environ.get('PYARROW_WITH_PARQUET', '0'))
- self.with_parquet_encryption = strtobool(
- os.environ.get('PYARROW_WITH_PARQUET_ENCRYPTION', '0'))
- self.with_orc = strtobool(
- os.environ.get('PYARROW_WITH_ORC', '0'))
- self.with_gandiva = strtobool(
- os.environ.get('PYARROW_WITH_GANDIVA', '0'))
+ self.with_azure = None
+ self.with_gcs = None
+ self.with_s3 = None
+ self.with_hdfs = None
+ self.with_cuda = None
+ self.with_substrait = None
+ self.with_flight = None
+ self.with_acero = None
+ self.with_dataset = None
+ self.with_parquet = None
+ self.with_parquet_encryption = None
+ self.with_orc = None
+ self.with_gandiva = None
+
self.generate_coverage = strtobool(
os.environ.get('PYARROW_GENERATE_COVERAGE', '0'))
self.bundle_arrow_cpp = strtobool(
@@ -185,15 +173,6 @@ class build_ext(_build_ext):
self.bundle_cython_cpp = strtobool(
os.environ.get('PYARROW_BUNDLE_CYTHON_CPP', '0'))
- self.with_parquet_encryption = (self.with_parquet_encryption and
- self.with_parquet)
-
- # enforce module dependencies
- if self.with_substrait:
- self.with_dataset = True
- if self.with_dataset:
- self.with_acero = True
-
CYTHON_MODULE_NAMES = [
'lib',
'_fs',
@@ -270,23 +249,30 @@ class build_ext(_build_ext):
cmake_options.append('-D{0}={1}'.format(
varname, 'on' if value else 'off'))
+ def append_cmake_component(flag, varname):
+ # only pass this to cmake if the user passes the --with-component
+ # flag to setup.py build_ext
+ if flag is not None:
+ append_cmake_bool(flag, varname)
+
if self.cmake_generator:
cmake_options += ['-G', self.cmake_generator]
- append_cmake_bool(self.with_cuda, 'PYARROW_BUILD_CUDA')
- append_cmake_bool(self.with_substrait, 'PYARROW_BUILD_SUBSTRAIT')
- append_cmake_bool(self.with_flight, 'PYARROW_BUILD_FLIGHT')
- append_cmake_bool(self.with_gandiva, 'PYARROW_BUILD_GANDIVA')
- append_cmake_bool(self.with_acero, 'PYARROW_BUILD_ACERO')
- append_cmake_bool(self.with_dataset, 'PYARROW_BUILD_DATASET')
- append_cmake_bool(self.with_orc, 'PYARROW_BUILD_ORC')
- append_cmake_bool(self.with_parquet, 'PYARROW_BUILD_PARQUET')
- append_cmake_bool(self.with_parquet_encryption,
- 'PYARROW_BUILD_PARQUET_ENCRYPTION')
- append_cmake_bool(self.with_azure, 'PYARROW_BUILD_AZURE')
- append_cmake_bool(self.with_gcs, 'PYARROW_BUILD_GCS')
- append_cmake_bool(self.with_s3, 'PYARROW_BUILD_S3')
- append_cmake_bool(self.with_hdfs, 'PYARROW_BUILD_HDFS')
+ append_cmake_component(self.with_cuda, 'PYARROW_CUDA')
+ append_cmake_component(self.with_substrait, 'PYARROW_SUBSTRAIT')
+ append_cmake_component(self.with_flight, 'PYARROW_FLIGHT')
+ append_cmake_component(self.with_gandiva, 'PYARROW_GANDIVA')
+ append_cmake_component(self.with_acero, 'PYARROW_ACERO')
+ append_cmake_component(self.with_dataset, 'PYARROW_DATASET')
+ append_cmake_component(self.with_orc, 'PYARROW_ORC')
+ append_cmake_component(self.with_parquet, 'PYARROW_PARQUET')
+ append_cmake_component(self.with_parquet_encryption,
+ 'PYARROW_PARQUET_ENCRYPTION')
+ append_cmake_component(self.with_azure, 'PYARROW_AZURE')
+ append_cmake_component(self.with_gcs, 'PYARROW_GCS')
+ append_cmake_component(self.with_s3, 'PYARROW_S3')
+ append_cmake_component(self.with_hdfs, 'PYARROW_HDFS')
+
append_cmake_bool(self.bundle_arrow_cpp,
'PYARROW_BUNDLE_ARROW_CPP')
append_cmake_bool(self.bundle_cython_cpp,
@@ -329,54 +315,8 @@ class build_ext(_build_ext):
self._found_names = []
for name in self.CYTHON_MODULE_NAMES:
built_path = pjoin(install_prefix, name + ext_suffix)
- if not os.path.exists(built_path):
- print(f'Did not find {built_path}')
- if self._failure_permitted(name):
- print(f'Cython module {name} failure permitted')
- continue
- raise RuntimeError('PyArrow C-extension failed to build:',
- os.path.abspath(built_path))
-
- self._found_names.append(name)
-
- def _failure_permitted(self, name):
- if name == '_parquet' and not self.with_parquet:
- return True
- if name == '_parquet_encryption' and not self.with_parquet_encryption:
- return True
- if name == '_orc' and not self.with_orc:
- return True
- if name == '_flight' and not self.with_flight:
- return True
- if name == '_substrait' and not self.with_substrait:
- return True
- if name == '_azurefs' and not self.with_azure:
- return True
- if name == '_gcsfs' and not self.with_gcs:
- return True
- if name == '_s3fs' and not self.with_s3:
- return True
- if name == '_hdfs' and not self.with_hdfs:
- return True
- if name == '_dataset' and not self.with_dataset:
- return True
- if name == '_acero' and not self.with_acero:
- return True
- if name == '_exec_plan' and not self.with_acero:
- return True
- if name == '_dataset_orc' and not (
- self.with_orc and self.with_dataset
- ):
- return True
- if name == '_dataset_parquet' and not (
- self.with_parquet and self.with_dataset
- ):
- return True
- if name == '_cuda' and not self.with_cuda:
- return True
- if name == 'gandiva' and not self.with_gandiva:
- return True
- return False
+ if os.path.exists(built_path):
+ self._found_names.append(name)
def _get_build_dir(self):
# Get the package directory from build_py