This is an automated email from the ASF dual-hosted git repository.

jorisvandenbossche pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 1c546fb3c1 GH-41480: [Python] Building PyArrow: enable/disable python 
components by default based on availability in Arrow C++ (#41494)
1c546fb3c1 is described below

commit 1c546fb3c130fc6a4f3e06ad31dc49d923785104
Author: Joris Van den Bossche <[email protected]>
AuthorDate: Thu May 16 14:15:57 2024 +0200

    GH-41480: [Python] Building PyArrow: enable/disable python components by default based on availability in Arrow C++ (#41494)
    
    ### Rationale for this change
    
    Currently, when building pyarrow from source, one needs to manually enable the optional components through setting `PYARROW_WITH_...` environment variables. However, we could also make a default choice of components based on which ones were enabled in the Arrow C++ build.
    
    ### What changes are included in this PR?
    
    Set defaults for the various `PYARROW_BUILD_<component>` based on the `ARROW_<component>` setting. Keep the current `PYARROW_WITH_<component>` environment variables working so that users can still override this default.
    
    ### Are there any user-facing changes?
    
    No
    * GitHub Issue: #41480
    
    Lead-authored-by: Joris Van den Bossche <[email protected]>
    Co-authored-by: Sutou Kouhei <[email protected]>
    Signed-off-by: Joris Van den Bossche <[email protected]>
---
 ci/appveyor-cpp-build.bat |   1 -
 python/CMakeLists.txt     | 115 +++++++++++++++++++++++++++++----------
 python/setup.py           | 134 +++++++++++++---------------------------------
 3 files changed, 123 insertions(+), 127 deletions(-)

diff --git a/ci/appveyor-cpp-build.bat b/ci/appveyor-cpp-build.bat
index 8cfa67c437..f688fbb63a 100644
--- a/ci/appveyor-cpp-build.bat
+++ b/ci/appveyor-cpp-build.bat
@@ -129,7 +129,6 @@ set PYARROW_WITH_ORC=%ARROW_ORC%
 set PYARROW_WITH_PARQUET=ON
 set PYARROW_WITH_PARQUET_ENCRYPTION=ON
 set PYARROW_WITH_S3=%ARROW_S3%
-set PYARROW_WITH_STATIC_BOOST=ON
 set PYARROW_WITH_SUBSTRAIT=ON
 
 set ARROW_HOME=%CONDA_PREFIX%\Library
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 212862357a..07acb9e31a 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -108,25 +108,6 @@ if(UNIX)
   endif()
 endif()
 
-# Top level cmake dir
-if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}")
-  option(PYARROW_BUILD_ACERO "Build the PyArrow Acero integration" OFF)
-  option(PYARROW_BUILD_CUDA "Build the PyArrow CUDA support" OFF)
-  option(PYARROW_BUILD_DATASET "Build the PyArrow Dataset integration" OFF)
-  option(PYARROW_BUILD_FLIGHT "Build the PyArrow Flight integration" OFF)
-  option(PYARROW_BUILD_GANDIVA "Build the PyArrow Gandiva integration" OFF)
-  option(PYARROW_BUILD_ORC "Build the PyArrow ORC integration" OFF)
-  option(PYARROW_BUILD_PARQUET "Build the PyArrow Parquet integration" OFF)
-  option(PYARROW_BUILD_PARQUET_ENCRYPTION
-         "Build the PyArrow Parquet encryption integration" OFF)
-  option(PYARROW_BUNDLE_ARROW_CPP "Bundle the Arrow C++ libraries" OFF)
-  option(PYARROW_BUNDLE_CYTHON_CPP "Bundle the C++ files generated by Cython" OFF)
-  option(PYARROW_GENERATE_COVERAGE "Build with Cython code coverage enabled" OFF)
-  set(PYARROW_CXXFLAGS
-      ""
-      CACHE STRING "Compiler flags to append when compiling Arrow")
-endif()
-
 find_program(CCACHE_FOUND ccache)
 if(CCACHE_FOUND
    AND NOT CMAKE_C_COMPILER_LAUNCHER
@@ -265,11 +246,70 @@ message(STATUS "NumPy include dir: ${NUMPY_INCLUDE_DIRS}")
 
 include(UseCython)
 
-# PyArrow C++
+# Arrow C++ and set default PyArrow build options
 include(GNUInstallDirs)
-
 find_package(Arrow REQUIRED)
 
+macro(define_option name description arrow_option)
+  set("PYARROW_${name}"
+      "AUTO"
+      CACHE STRING ${description})
+
+  if("${PYARROW_${name}}" STREQUAL "AUTO")
+    # by default, first check if env variable exists, otherwise use Arrow C++ config
+    set(env_variable "PYARROW_WITH_${name}")
+    if(DEFINED ENV{${env_variable}})
+      if($ENV{${env_variable}})
+        set("PYARROW_BUILD_${name}" ON)
+      else()
+        set("PYARROW_BUILD_${name}" OFF)
+      endif()
+    else()
+      if(${arrow_option})
+        set("PYARROW_BUILD_${name}" ON)
+      else()
+        set("PYARROW_BUILD_${name}" OFF)
+      endif()
+    endif()
+  else()
+    if("${PYARROW_${name}}")
+      set("PYARROW_BUILD_${name}" ON)
+    else()
+      set("PYARROW_BUILD_${name}" OFF)
+    endif()
+  endif()
+endmacro()
+
+define_option(ACERO "Build the PyArrow Acero integration" ARROW_ACERO)
+define_option(CUDA "Build the PyArrow CUDA support" ARROW_CUDA)
+define_option(DATASET "Build the PyArrow Dataset integration" ARROW_DATASET)
+define_option(FLIGHT "Build the PyArrow Flight integration" ARROW_FLIGHT)
+define_option(GANDIVA "Build the PyArrow Gandiva integration" ARROW_GANDIVA)
+define_option(ORC "Build the PyArrow ORC integration" ARROW_ORC)
+define_option(PARQUET "Build the PyArrow Parquet integration" ARROW_PARQUET)
+define_option(PARQUET_ENCRYPTION "Build the PyArrow Parquet encryption integration"
+              PARQUET_REQUIRE_ENCRYPTION)
+define_option(SUBSTRAIT "Build the PyArrow Substrait integration" ARROW_SUBSTRAIT)
+define_option(AZURE "Build the PyArrow Azure integration" ARROW_AZURE)
+define_option(GCS "Build the PyArrow GCS integration" ARROW_GCS)
+define_option(S3 "Build the PyArrow S3 integration" ARROW_S3)
+define_option(HDFS "Build the PyArrow HDFS integration" ARROW_HDFS)
+option(PYARROW_BUNDLE_ARROW_CPP "Bundle the Arrow C++ libraries" OFF)
+option(PYARROW_BUNDLE_CYTHON_CPP "Bundle the C++ files generated by Cython" OFF)
+option(PYARROW_GENERATE_COVERAGE "Build with Cython code coverage enabled" OFF)
+set(PYARROW_CXXFLAGS
+    ""
+    CACHE STRING "Compiler flags to append when compiling PyArrow C++")
+
+# enforce module dependencies
+if(PYARROW_BUILD_SUBSTRAIT)
+  set(PYARROW_BUILD_DATASET ON)
+endif()
+if(PYARROW_BUILD_DATASET)
+  set(PYARROW_BUILD_ACERO ON)
+endif()
+
+# PyArrow C++
 set(PYARROW_CPP_ROOT_DIR pyarrow/src)
 set(PYARROW_CPP_SOURCE_DIR ${PYARROW_CPP_ROOT_DIR}/arrow/python)
 set(PYARROW_CPP_SRCS
@@ -305,6 +345,7 @@ set(PYARROW_CPP_LINK_LIBS "")
 
 # Check all the options from Arrow and PyArrow C++ to be in line
 if(PYARROW_BUILD_DATASET)
+  message(STATUS "Building PyArrow with Dataset")
   if(NOT ARROW_DATASET)
     message(FATAL_ERROR "You must build Arrow C++ with ARROW_DATASET=ON")
   endif()
@@ -317,6 +358,7 @@ if(PYARROW_BUILD_DATASET)
 endif()
 
 if(PYARROW_BUILD_ACERO)
+  message(STATUS "Building PyArrow with Acero")
   if(NOT ARROW_ACERO)
     message(FATAL_ERROR "You must build Arrow C++ with ARROW_ACERO=ON")
   endif()
@@ -329,18 +371,13 @@ if(PYARROW_BUILD_ACERO)
 endif()
 
 if(PYARROW_BUILD_PARQUET OR PYARROW_BUILD_PARQUET_ENCRYPTION)
+  message(STATUS "Building PyArrow with Parquet")
   if(NOT ARROW_PARQUET)
     message(FATAL_ERROR "You must build Arrow C++ with ARROW_PARQUET=ON")
   endif()
   find_package(Parquet REQUIRED)
 endif()
 
-if(PYARROW_BUILD_HDFS)
-  if(NOT ARROW_HDFS)
-    message(FATAL_ERROR "You must build Arrow C++ with ARROW_HDFS=ON")
-  endif()
-endif()
-
 # Check for only Arrow C++ options
 if(ARROW_CSV)
   list(APPEND PYARROW_CPP_SRCS ${PYARROW_CPP_SOURCE_DIR}/csv.cc)
@@ -400,6 +437,7 @@ endif()
 
 set(PYARROW_CPP_FLIGHT_SRCS ${PYARROW_CPP_SOURCE_DIR}/flight.cc)
 if(PYARROW_BUILD_FLIGHT)
+  message(STATUS "Building PyArrow with Flight")
   if(NOT ARROW_FLIGHT)
     message(FATAL_ERROR "You must build Arrow C++ with ARROW_FLIGHT=ON")
   endif()
@@ -555,23 +593,39 @@ set_source_files_properties(pyarrow/lib.pyx PROPERTIES CYTHON_API TRUE)
 set(LINK_LIBS arrow_python)
 
 if(PYARROW_BUILD_AZURE)
+  message(STATUS "Building PyArrow with Azure")
+  if(NOT ARROW_AZURE)
+    message(FATAL_ERROR "You must build Arrow C++ with ARROW_AZURE=ON")
+  endif()
   list(APPEND CYTHON_EXTENSIONS _azurefs)
 endif()
 
 if(PYARROW_BUILD_GCS)
+  message(STATUS "Building PyArrow with GCS")
+  if(NOT ARROW_GCS)
+    message(FATAL_ERROR "You must build Arrow C++ with ARROW_GCS=ON")
+  endif()
   list(APPEND CYTHON_EXTENSIONS _gcsfs)
 endif()
 
 if(PYARROW_BUILD_S3)
+  message(STATUS "Building PyArrow with S3")
+  if(NOT ARROW_S3)
+    message(FATAL_ERROR "You must build Arrow C++ with ARROW_S3=ON")
+  endif()
   list(APPEND CYTHON_EXTENSIONS _s3fs)
 endif()
 
 if(PYARROW_BUILD_HDFS)
+  message(STATUS "Building PyArrow with HDFS")
+  if(NOT ARROW_HDFS)
+    message(FATAL_ERROR "You must build Arrow C++ with ARROW_HDFS=ON")
+  endif()
   list(APPEND CYTHON_EXTENSIONS _hdfs)
 endif()
 
 if(PYARROW_BUILD_CUDA)
-  # Arrow CUDA
+  message(STATUS "Building PyArrow with CUDA")
   if(NOT ARROW_CUDA)
     message(FATAL_ERROR "You must build Arrow C++ with ARROW_CUDA=ON")
   endif()
@@ -646,8 +700,9 @@ if(PYARROW_BUILD_PARQUET)
   endif()
 endif()
 
+# ORC
 if(PYARROW_BUILD_ORC)
-  # ORC
+  message(STATUS "Building PyArrow with ORC")
   if(NOT ARROW_ORC)
     message(FATAL_ERROR "You must build Arrow C++ with ARROW_ORC=ON")
   endif()
@@ -679,6 +734,7 @@ endif()
 
 # Substrait
 if(PYARROW_BUILD_SUBSTRAIT)
+  message(STATUS "Building PyArrow with Substrait")
   if(NOT ARROW_SUBSTRAIT)
     message(FATAL_ERROR "You must build Arrow C++ with ARROW_SUBSTRAIT=ON")
   endif()
@@ -696,6 +752,7 @@ endif()
 
 # Gandiva
 if(PYARROW_BUILD_GANDIVA)
+  message(STATUS "Building PyArrow with Gandiva")
   if(NOT ARROW_GANDIVA)
     message(FATAL_ERROR "You must build Arrow C++ with ARROW_GANDIVA=ON")
   endif()
diff --git a/python/setup.py b/python/setup.py
index 6f3dddb29d..ed2b7961e5 100755
--- a/python/setup.py
+++ b/python/setup.py
@@ -152,32 +152,20 @@ class build_ext(_build_ext):
             if not hasattr(sys, 'gettotalrefcount'):
                 self.build_type = 'release'
 
-        self.with_azure = strtobool(
-            os.environ.get('PYARROW_WITH_AZURE', '0'))
-        self.with_gcs = strtobool(
-            os.environ.get('PYARROW_WITH_GCS', '0'))
-        self.with_s3 = strtobool(
-            os.environ.get('PYARROW_WITH_S3', '0'))
-        self.with_hdfs = strtobool(
-            os.environ.get('PYARROW_WITH_HDFS', '0'))
-        self.with_cuda = strtobool(
-            os.environ.get('PYARROW_WITH_CUDA', '0'))
-        self.with_substrait = strtobool(
-            os.environ.get('PYARROW_WITH_SUBSTRAIT', '0'))
-        self.with_flight = strtobool(
-            os.environ.get('PYARROW_WITH_FLIGHT', '0'))
-        self.with_acero = strtobool(
-            os.environ.get('PYARROW_WITH_ACERO', '0'))
-        self.with_dataset = strtobool(
-            os.environ.get('PYARROW_WITH_DATASET', '0'))
-        self.with_parquet = strtobool(
-            os.environ.get('PYARROW_WITH_PARQUET', '0'))
-        self.with_parquet_encryption = strtobool(
-            os.environ.get('PYARROW_WITH_PARQUET_ENCRYPTION', '0'))
-        self.with_orc = strtobool(
-            os.environ.get('PYARROW_WITH_ORC', '0'))
-        self.with_gandiva = strtobool(
-            os.environ.get('PYARROW_WITH_GANDIVA', '0'))
+        self.with_azure = None
+        self.with_gcs = None
+        self.with_s3 = None
+        self.with_hdfs = None
+        self.with_cuda = None
+        self.with_substrait = None
+        self.with_flight = None
+        self.with_acero = None
+        self.with_dataset = None
+        self.with_parquet = None
+        self.with_parquet_encryption = None
+        self.with_orc = None
+        self.with_gandiva = None
+
         self.generate_coverage = strtobool(
             os.environ.get('PYARROW_GENERATE_COVERAGE', '0'))
         self.bundle_arrow_cpp = strtobool(
@@ -185,15 +173,6 @@ class build_ext(_build_ext):
         self.bundle_cython_cpp = strtobool(
             os.environ.get('PYARROW_BUNDLE_CYTHON_CPP', '0'))
 
-        self.with_parquet_encryption = (self.with_parquet_encryption and
-                                        self.with_parquet)
-
-        # enforce module dependencies
-        if self.with_substrait:
-            self.with_dataset = True
-        if self.with_dataset:
-            self.with_acero = True
-
     CYTHON_MODULE_NAMES = [
         'lib',
         '_fs',
@@ -270,23 +249,30 @@ class build_ext(_build_ext):
                 cmake_options.append('-D{0}={1}'.format(
                     varname, 'on' if value else 'off'))
 
+            def append_cmake_component(flag, varname):
+                # only pass this to cmake if the user passes the --with-component
+                # flag to setup.py build_ext
+                if flag is not None:
+                    append_cmake_bool(flag, varname)
+
             if self.cmake_generator:
                 cmake_options += ['-G', self.cmake_generator]
 
-            append_cmake_bool(self.with_cuda, 'PYARROW_BUILD_CUDA')
-            append_cmake_bool(self.with_substrait, 'PYARROW_BUILD_SUBSTRAIT')
-            append_cmake_bool(self.with_flight, 'PYARROW_BUILD_FLIGHT')
-            append_cmake_bool(self.with_gandiva, 'PYARROW_BUILD_GANDIVA')
-            append_cmake_bool(self.with_acero, 'PYARROW_BUILD_ACERO')
-            append_cmake_bool(self.with_dataset, 'PYARROW_BUILD_DATASET')
-            append_cmake_bool(self.with_orc, 'PYARROW_BUILD_ORC')
-            append_cmake_bool(self.with_parquet, 'PYARROW_BUILD_PARQUET')
-            append_cmake_bool(self.with_parquet_encryption,
-                              'PYARROW_BUILD_PARQUET_ENCRYPTION')
-            append_cmake_bool(self.with_azure, 'PYARROW_BUILD_AZURE')
-            append_cmake_bool(self.with_gcs, 'PYARROW_BUILD_GCS')
-            append_cmake_bool(self.with_s3, 'PYARROW_BUILD_S3')
-            append_cmake_bool(self.with_hdfs, 'PYARROW_BUILD_HDFS')
+            append_cmake_component(self.with_cuda, 'PYARROW_CUDA')
+            append_cmake_component(self.with_substrait, 'PYARROW_SUBSTRAIT')
+            append_cmake_component(self.with_flight, 'PYARROW_FLIGHT')
+            append_cmake_component(self.with_gandiva, 'PYARROW_GANDIVA')
+            append_cmake_component(self.with_acero, 'PYARROW_ACERO')
+            append_cmake_component(self.with_dataset, 'PYARROW_DATASET')
+            append_cmake_component(self.with_orc, 'PYARROW_ORC')
+            append_cmake_component(self.with_parquet, 'PYARROW_PARQUET')
+            append_cmake_component(self.with_parquet_encryption,
+                                   'PYARROW_PARQUET_ENCRYPTION')
+            append_cmake_component(self.with_azure, 'PYARROW_AZURE')
+            append_cmake_component(self.with_gcs, 'PYARROW_GCS')
+            append_cmake_component(self.with_s3, 'PYARROW_S3')
+            append_cmake_component(self.with_hdfs, 'PYARROW_HDFS')
+
             append_cmake_bool(self.bundle_arrow_cpp,
                               'PYARROW_BUNDLE_ARROW_CPP')
             append_cmake_bool(self.bundle_cython_cpp,
@@ -329,54 +315,8 @@ class build_ext(_build_ext):
             self._found_names = []
             for name in self.CYTHON_MODULE_NAMES:
                 built_path = pjoin(install_prefix, name + ext_suffix)
-                if not os.path.exists(built_path):
-                    print(f'Did not find {built_path}')
-                    if self._failure_permitted(name):
-                        print(f'Cython module {name} failure permitted')
-                        continue
-                    raise RuntimeError('PyArrow C-extension failed to build:',
-                                       os.path.abspath(built_path))
-
-                self._found_names.append(name)
-
-    def _failure_permitted(self, name):
-        if name == '_parquet' and not self.with_parquet:
-            return True
-        if name == '_parquet_encryption' and not self.with_parquet_encryption:
-            return True
-        if name == '_orc' and not self.with_orc:
-            return True
-        if name == '_flight' and not self.with_flight:
-            return True
-        if name == '_substrait' and not self.with_substrait:
-            return True
-        if name == '_azurefs' and not self.with_azure:
-            return True
-        if name == '_gcsfs' and not self.with_gcs:
-            return True
-        if name == '_s3fs' and not self.with_s3:
-            return True
-        if name == '_hdfs' and not self.with_hdfs:
-            return True
-        if name == '_dataset' and not self.with_dataset:
-            return True
-        if name == '_acero' and not self.with_acero:
-            return True
-        if name == '_exec_plan' and not self.with_acero:
-            return True
-        if name == '_dataset_orc' and not (
-                self.with_orc and self.with_dataset
-        ):
-            return True
-        if name == '_dataset_parquet' and not (
-                self.with_parquet and self.with_dataset
-        ):
-            return True
-        if name == '_cuda' and not self.with_cuda:
-            return True
-        if name == 'gandiva' and not self.with_gandiva:
-            return True
-        return False
+                if os.path.exists(built_path):
+                    self._found_names.append(name)
 
     def _get_build_dir(self):
         # Get the package directory from build_py

Reply via email to