kou commented on code in PR #37822:
URL: https://github.com/apache/arrow/pull/37822#discussion_r1622047559


##########
ci/docker/ubuntu-22.04-cpp-emscripten.dockerfile:
##########


Review Comment:
   Can we use `ci/docker/linux-apt-python-3.dockerfile` like `ubuntu-python` 
service in `docker-compose.yml` does?



##########
ci/scripts/emscripten_python_build.sh:
##########


Review Comment:
   Could you rename this to `python_build_emscripten.sh`?
   We use `${language}_XXX.sh` naming convention.
   



##########
ci/scripts/emscripten_python_build.sh:
##########
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -ex
+
+source ~/emsdk/emsdk_env.sh
+
+source_dir=${1}/python
+cd ${source_dir}
+rm -rf dist
+pyodide build

Review Comment:
   Could you use build directory instead of changing source directory?
   It creates files owned by `root` on host.
   See also: https://github.com/apache/arrow/issues/41429
   
   `python_build.sh` copies sources for now: 
https://github.com/apache/arrow/blob/255dbf990c3d3e5fb1270a2a11efe0af2be195ab/ci/scripts/python_build.sh#L81-L93



##########
python/CMakeLists.txt:
##########
@@ -650,28 +671,42 @@ endif()
 
 # Acero
 if(PYARROW_BUILD_ACERO)
-  if(PYARROW_BUNDLE_ARROW_CPP)
-    bundle_arrow_lib(${ARROW_ACERO_SHARED_LIB} SO_VERSION ${ARROW_SO_VERSION})
-    if(MSVC)
-      bundle_arrow_import_lib(${ARROW_ACERO_IMPORT_LIB})
+  list(APPEND CYTHON_EXTENSIONS _acero)
+
+  if(ARROW_BUILD_SHARED)
+
+    if(PYARROW_BUNDLE_ARROW_CPP)
+      bundle_arrow_lib(${ARROW_ACERO_SHARED_LIB} SO_VERSION 
${ARROW_SO_VERSION})
+      if(MSVC)
+        bundle_arrow_import_lib(${ARROW_ACERO_IMPORT_LIB})
+      endif()
     endif()
-  endif()
 
-  set(ACERO_LINK_LIBS ArrowAcero::arrow_acero_shared)
-  list(APPEND CYTHON_EXTENSIONS _acero)
+    set(ACERO_LINK_LIBS ArrowAcero::arrow_acero_shared)
+  else()
+    # ACERO is statically linked into libarrow_python already
+    set(ACERO_LINK_LIBS)
+  endif()
 endif()
 
 # Dataset
 if(PYARROW_BUILD_DATASET)
-  if(PYARROW_BUNDLE_ARROW_CPP)
-    bundle_arrow_lib(${ARROW_DATASET_SHARED_LIB} SO_VERSION 
${ARROW_SO_VERSION})
-    if(MSVC)
-      bundle_arrow_import_lib(${ARROW_DATASET_IMPORT_LIB})
+  if(ARROW_BUILD_SHARED)
+

Review Comment:
   ```suggestion
   ```



##########
python/setup.py:
##########
@@ -40,6 +40,14 @@
 # Check if we're running 64-bit Python
 is_64_bit = sys.maxsize > 2**32
 
+is_emscripten = False
+if (
+    sysconfig.get_config_var("SOABI")
+    and sysconfig.get_config_var("SOABI").find("emscripten") != -1
+):
+    is_emscripten = True

Review Comment:
   ```suggestion
   is_emscripten = (
       sysconfig.get_config_var("SOABI")
       and sysconfig.get_config_var("SOABI").find("emscripten") != -1
   )
   ```



##########
python/CMakeLists.txt:
##########
@@ -143,6 +146,22 @@ if(NOT DEFINED ARROW_RUNTIME_SIMD_LEVEL)
 endif()
 include(SetupCxxFlags)
 
+if($ENV{PYODIDE})
+  # these variables are needed for building pyarrow on Emscripten

Review Comment:
   ```suggestion
     # These variables are needed for building PyArrow on Emscripten.
   ```



##########
python/CMakeLists.txt:
##########
@@ -143,6 +146,22 @@ if(NOT DEFINED ARROW_RUNTIME_SIMD_LEVEL)
 endif()
 include(SetupCxxFlags)
 
+if($ENV{PYODIDE})
+  # these variables are needed for building pyarrow on Emscripten
+  # if they aren't set, cmake cross compiling fails for python
+  # modules (at least under pyodide it does)
+  set(Python3_INCLUDE_DIR $ENV{PYTHONINCLUDE})
+  set(Python3_LIBRARY $ENV{CPYTHONLIB})
+  set(Python3_NumPy_INCLUDE_DIR $ENV{NUMPY_LIB}/core/include)
+  set(Python3_EXECUTABLE)
+  set(ENV{_PYTHON_SYSCONFIGDATA_NAME} $ENV{SYSCONFIG_NAME})
+  # we set the c and cxx compiler manually to bypass pywasmcross
+  # which is pyodide's way of messing with C++ build parameters.
+  set(CMAKE_C_COMPILER emcc)
+  set(CMAKE_CXX_COMPILER em++)
+

Review Comment:
   ```suggestion
   ```



##########
ci/scripts/emscripten_python_test.sh:
##########


Review Comment:
   Could you rename this to `python_test_emscripten.sh`?



##########
python/CMakeLists.txt:
##########
@@ -68,6 +68,9 @@ if(POLICY CMP0095)
   cmake_policy(SET CMP0095 NEW)
 endif()
 
+# this option is used to auto-set defaults for pyarrow build
+option(DUMP_ARROW_ARGUMENTS "Dump the arrow arguments then quit" OFF)
+

Review Comment:
   Can we remove this?



##########
ci/scripts/cpp_build.sh:
##########
@@ -105,8 +105,6 @@ if [ "${ARROW_EMSCRIPTEN:-OFF}" = "ON" ]; then
     -DCMAKE_C_FLAGS="${CFLAGS:-}" \
     -DCMAKE_CXX_FLAGS="${CXXFLAGS:-}" \
     -DCMAKE_CXX_STANDARD="${CMAKE_CXX_STANDARD:-17}" \
-    -DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR:-lib} \
-    -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX:-${ARROW_HOME}} \

Review Comment:
   Why do we need to remove them?
   



##########
python/CMakeLists.txt:
##########
@@ -408,13 +427,15 @@ if(NOT PYARROW_CPP_LINK_LIBS)
 endif()
 
 add_library(arrow_python SHARED ${PYARROW_CPP_SRCS})
+

Review Comment:
   Could you revert needless changes?
   
   ```suggestion
   ```



##########
python/scripts/run_emscripten_tests.py:
##########
@@ -0,0 +1,330 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+import argparse
+import contextlib
+import http.server
+import multiprocessing
+import os
+import shutil
+import subprocess
+import sys
+import time
+
+from pathlib import Path
+from io import BytesIO
+
+from selenium import webdriver
+
+
+class TemplateOverrider(http.server.SimpleHTTPRequestHandler):
+    def log_request(self, code="-", size="-"):
+        # don't log successful requests
+        return
+
+    def do_GET(self) -> bytes | None:
+        if self.path.endswith(PYARROW_WHEEL_PATH.name):
+            self.send_response(200)
+            self.send_header("Content-type", "application/x-zip")
+            self.end_headers()
+            self.copyfile(PYARROW_WHEEL_PATH.open(mode="rb"), self.wfile)
+        if self.path.endswith("/test.html"):
+            body = b"""
+                <!doctype html>
+                <html>
+                <head>
+                    <script>
+                        window.python_done_callback=undefined;
+                        window.python_logs=[];
+                        function capturelogs(evt)
+                        {
+                            if('results' in evt.data){
+                                if(window.python_done_callback){
+                                    let callback=window.python_done_callback;
+                                    window.python_done_callback=undefined;
+                                    callback({result:evt.data.results});
+                                }
+                            }
+                            if('print' in evt.data){
+                                
evt.data.print.forEach((x)=>{window.python_logs.push(x)});
+                            }
+                        }
+                        window.pyworker = new Worker("worker.js");
+                        window.pyworker.onmessage=capturelogs;
+                    </script>
+                </head>
+                <body></body>
+                </html>
+                """
+            self.send_response(200)
+            self.send_header("Content-type", "text/html")
+            self.send_header("Content-length", len(body))
+            self.end_headers()
+            self.copyfile(BytesIO(body), self.wfile)
+        elif self.path.endswith("/worker.js"):
+            body = b"""
+                importScripts("./pyodide.js");
+                onmessage = async function (e) {
+                    const data = e.data;
+                    if(!self.pyodide){
+                        self.pyodide = await loadPyodide()
+                    }
+                    function do_print(arg){
+                        let databytes = Array.from(arg)
+                        self.postMessage({print:databytes})
+                        return databytes.length
+                    }
+                    
self.pyodide.setStdout({write:do_print,isatty:data.isatty});
+                    
self.pyodide.setStderr({write:do_print,isatty:data.isatty});
+
+                    await self.pyodide.loadPackagesFromImports(data.python);
+                    let results = await 
self.pyodide.runPythonAsync(data.python);
+                    self.postMessage({results});
+                    console.log('FINISHED_WEBWORKER')
+                }
+                """
+            self.send_response(200)
+            self.send_header("Content-type", "application/javascript")
+            self.send_header("Content-length", len(body))
+            self.end_headers()
+            self.copyfile(BytesIO(body), self.wfile)
+
+        else:
+            return super().do_GET()
+
+    def end_headers(self):
+        # Enable Cross-Origin Resource Sharing (CORS)
+        self.send_header("Access-Control-Allow-Origin", "*")
+        super().end_headers()
+
+
+def run_server_thread(dist_dir, q):
+    global _SERVER_ADDRESS
+    os.chdir(dist_dir)
+    server = http.server.HTTPServer(("", 0), TemplateOverrider)
+    q.put(server.server_address)
+    print(f"Starting server for {dist_dir} at: {server.server_address}")
+    server.serve_forever()
+
+
[email protected]
+def launch_server(dist_dir):
+    q = multiprocessing.Queue()
+    p = multiprocessing.Process(target=run_server_thread, args=[dist_dir, q])
+    p.start()
+    address = q.get(timeout=50)
+    yield address
+    p.terminate()
+
+
+class NodeDriver:
+    import subprocess
+
+    def __init__(self, hostname, port):
+        self.process = subprocess.Popen(
+            [shutil.which("node"), "-i"], stdin=subprocess.PIPE, bufsize=0
+        )
+        print(self.process)
+        self.hostname = hostname
+        self.port = port
+        self.last_ret_code = None
+
+    def load_pyodide(self, dist_dir):
+        self.execute_js(
+            f"""
+        const {{ loadPyodide }} = require('{dist_dir}/pyodide.js')
+        let pyodide = await loadPyodide()
+        """
+        )
+
+    def clear_logs(self):
+        pass  # we don't handle logs for node
+
+    def execute_js(self, code, wait_for_terminate=True):
+        self.process.stdin.write((code + "\n").encode("utf-8"))
+
+    def load_arrow(self):
+        self.execute_js(f"await pyodide.loadPackage('{PYARROW_WHEEL_PATH}')")
+
+    def execute_python(self, code, wait_for_terminate=True):
+        js_code = f"""
+            python = `{code}`;
+            await pyodide.loadPackagesFromImports(python);
+            await pyodide.runPythonAsync(python);
+        """
+        self.last_ret_code = self.execute_js(js_code, wait_for_terminate)
+        return self.last_ret_code
+
+    def wait_for_done(self):
+        # in node we just let it run above
+        # then send EOF
+        self.process.stdin.write(b".exit\n")
+        return self.last_ret_code
+
+
+class BrowserDriver:
+    def __init__(self, hostname, port, driver):
+        self.driver = driver
+        self.driver.get(f"http://{hostname}:{port}/test.html";)
+        self.driver.set_script_timeout(100)
+
+    def load_pyodide(self, dist_dir):
+        pass
+
+    def load_arrow(self):
+        self.execute_python(
+            f"import pyodide_js as pjs\n"
+            f"await pjs.loadPackage('{PYARROW_WHEEL_PATH.name}')\n"
+        )
+
+    def execute_python(self, code, wait_for_terminate=True):
+        if wait_for_terminate:
+            self.driver.execute_async_script(
+                f"""
+                let callback=arguments[arguments.length-1];
+                python = `{code}`;
+                window.python_done_callback=callback
+                window.pyworker.postMessage(
+                    {{python,isatty:{'true' if sys.stdout.isatty() else 
'false'}}})
+                """
+            )
+        else:
+            self.driver.execute_script(
+                f"""
+                let python = `{code}`;
+                window.python_done_callback= (x) => 
{{window.python_script_done=x}}
+                window.pyworker.postMessage(
+                    {{python,isatty:{'true' if sys.stdout.isatty() else 
'false'}}});
+                """
+            )
+
+    def clear_logs(self):
+        self.driver.execute_script("window.python_logs = []")
+
+    def wait_for_done(self):
+        while True:
+            # poll for console.log messages from our webworker
+            # which are the output of pytest
+            lines = self.driver.execute_script(
+                "let temp = window.python_logs;window.python_logs=[];return 
temp;"
+            )
+            if len(lines) > 0:
+                sys.stdout.buffer.write(bytes(lines))
+            done = self.driver.execute_script("return 
window.python_script_done")
+            if done is not None:
+                value = done["result"]
+                self.driver.execute_script("delete window.python_script_done")
+                return value
+            time.sleep(0.1)
+
+
+class ChromeDriver(BrowserDriver):
+    def __init__(self, hostname, port):
+        from selenium.webdriver.chrome.options import Options
+
+        options = Options()
+        options.add_argument("--headless")
+        options.add_argument("--no-sandbox")
+        super().__init__(hostname, port, webdriver.Chrome(options=options))
+
+
+class FirefoxDriver(BrowserDriver):
+    def __init__(self, hostname, port):
+        from selenium.webdriver.firefox.options import Options
+
+        options = Options()
+        options.add_argument("--headless")
+
+        super().__init__(hostname, port, webdriver.Firefox(options=options))
+
+
+def _load_pyarrow_in_runner(driver, wheel_name):
+    driver.load_arrow()
+    driver.execute_python(
+        """import sys
+import micropip
+if "pyarrow" not in sys.modules:
+    try:
+        import tzdata
+    except:
+        await micropip.install("tzdata")
+    await micropip.install("hypothesis")
+    import pyodide_js as pjs
+    await pjs.loadPackage("numpy")
+    await pjs.loadPackage("pandas")
+    import pytest
+    import pandas # import pandas after pyarrow package load for pandas/pyarrow
+                    #functions to work

Review Comment:
   ```suggestion
                     # functions to work
   ```



##########
python/CMakeLists.txt:
##########
@@ -650,28 +671,42 @@ endif()
 
 # Acero
 if(PYARROW_BUILD_ACERO)
-  if(PYARROW_BUNDLE_ARROW_CPP)
-    bundle_arrow_lib(${ARROW_ACERO_SHARED_LIB} SO_VERSION ${ARROW_SO_VERSION})
-    if(MSVC)
-      bundle_arrow_import_lib(${ARROW_ACERO_IMPORT_LIB})
+  list(APPEND CYTHON_EXTENSIONS _acero)
+
+  if(ARROW_BUILD_SHARED)
+
+    if(PYARROW_BUNDLE_ARROW_CPP)
+      bundle_arrow_lib(${ARROW_ACERO_SHARED_LIB} SO_VERSION 
${ARROW_SO_VERSION})
+      if(MSVC)
+        bundle_arrow_import_lib(${ARROW_ACERO_IMPORT_LIB})
+      endif()
     endif()
-  endif()
 
-  set(ACERO_LINK_LIBS ArrowAcero::arrow_acero_shared)
-  list(APPEND CYTHON_EXTENSIONS _acero)
+    set(ACERO_LINK_LIBS ArrowAcero::arrow_acero_shared)
+  else()
+    # ACERO is statically linked into libarrow_python already
+    set(ACERO_LINK_LIBS)
+  endif()
 endif()
 
 # Dataset
 if(PYARROW_BUILD_DATASET)
-  if(PYARROW_BUNDLE_ARROW_CPP)
-    bundle_arrow_lib(${ARROW_DATASET_SHARED_LIB} SO_VERSION 
${ARROW_SO_VERSION})
-    if(MSVC)
-      bundle_arrow_import_lib(${ARROW_DATASET_IMPORT_LIB})
+  if(ARROW_BUILD_SHARED)
+
+    if(PYARROW_BUNDLE_ARROW_CPP)
+      bundle_arrow_lib(${ARROW_DATASET_SHARED_LIB} SO_VERSION 
${ARROW_SO_VERSION})
+      if(MSVC)
+        bundle_arrow_import_lib(${ARROW_DATASET_IMPORT_LIB})
+      endif()
     endif()
-  endif()
 
-  set(DATASET_LINK_LIBS ArrowDataset::arrow_dataset_shared)
+    set(DATASET_LINK_LIBS ArrowDataset::arrow_dataset_shared)
+  else()
+    # dataset is statically linked into libarrow_python already
+    set(DATASET_LINK_LIBS)
+  endif()
   list(APPEND CYTHON_EXTENSIONS _dataset)
+

Review Comment:
   ```suggestion
   ```



##########
python/CMakeLists.txt:
##########
@@ -650,28 +671,42 @@ endif()
 
 # Acero
 if(PYARROW_BUILD_ACERO)
-  if(PYARROW_BUNDLE_ARROW_CPP)
-    bundle_arrow_lib(${ARROW_ACERO_SHARED_LIB} SO_VERSION ${ARROW_SO_VERSION})
-    if(MSVC)
-      bundle_arrow_import_lib(${ARROW_ACERO_IMPORT_LIB})
+  list(APPEND CYTHON_EXTENSIONS _acero)
+
+  if(ARROW_BUILD_SHARED)
+
+    if(PYARROW_BUNDLE_ARROW_CPP)
+      bundle_arrow_lib(${ARROW_ACERO_SHARED_LIB} SO_VERSION 
${ARROW_SO_VERSION})
+      if(MSVC)
+        bundle_arrow_import_lib(${ARROW_ACERO_IMPORT_LIB})
+      endif()
     endif()
-  endif()
 
-  set(ACERO_LINK_LIBS ArrowAcero::arrow_acero_shared)
-  list(APPEND CYTHON_EXTENSIONS _acero)
+    set(ACERO_LINK_LIBS ArrowAcero::arrow_acero_shared)
+  else()
+    # ACERO is statically linked into libarrow_python already

Review Comment:
   ```suggestion
       # Acero is statically linked into libarrow_python already
   ```



##########
python/CMakeLists.txt:
##########
@@ -745,14 +782,22 @@ if(PYARROW_BUILD_SUBSTRAIT)
     message(FATAL_ERROR "You must build Arrow C++ with ARROW_SUBSTRAIT=ON")
   endif()
   find_package(ArrowSubstrait REQUIRED)
-  if(PYARROW_BUNDLE_ARROW_CPP)
-    bundle_arrow_lib(${ARROW_SUBSTRAIT_SHARED_LIB} SO_VERSION 
${ARROW_SO_VERSION})
-    if(MSVC)
-      bundle_arrow_import_lib(${ARROW_SUBSTRAIT_IMPORT_LIB})
+
+  if(ARROW_BUILD_SHARED)
+    if(PYARROW_BUNDLE_ARROW_CPP)
+      bundle_arrow_lib(${ARROW_SUBSTRAIT_SHARED_LIB} SO_VERSION 
${ARROW_SO_VERSION})
+      if(MSVC)
+        bundle_arrow_import_lib(${ARROW_SUBSTRAIT_IMPORT_LIB})
+      endif()
     endif()
+    set(SUBSTRAIT_LINK_LIBS ArrowSubstrait::arrow_substrait_shared)
+  else()
+    # only link to the output of substrait, not to libarrow etc.
+    # if we link to the target directly we end up with multiple copies

Review Comment:
   ```suggestion
       # Only link to the output of substrait, not to libarrow etc.
       # If we link to the target directly we end up with multiple copies
   ```



##########
python/CMakeLists.txt:
##########
@@ -650,28 +671,42 @@ endif()
 
 # Acero
 if(PYARROW_BUILD_ACERO)
-  if(PYARROW_BUNDLE_ARROW_CPP)
-    bundle_arrow_lib(${ARROW_ACERO_SHARED_LIB} SO_VERSION ${ARROW_SO_VERSION})
-    if(MSVC)
-      bundle_arrow_import_lib(${ARROW_ACERO_IMPORT_LIB})
+  list(APPEND CYTHON_EXTENSIONS _acero)
+
+  if(ARROW_BUILD_SHARED)
+

Review Comment:
   ```suggestion
   ```



##########
python/CMakeLists.txt:
##########
@@ -143,6 +146,22 @@ if(NOT DEFINED ARROW_RUNTIME_SIMD_LEVEL)
 endif()
 include(SetupCxxFlags)
 
+if($ENV{PYODIDE})
+  # these variables are needed for building pyarrow on Emscripten
+  # if they aren't set, cmake cross compiling fails for python
+  # modules (at least under pyodide it does)

Review Comment:
   ```suggestion
     # If they aren't set, CMake cross compiling fails for Python
     # modules (at least under Pyodide it does).
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to