This is an automated email from the ASF dual-hosted git repository.
kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 58415d1fac GH-38183: [CI][Python] Use pipx to install GCS testbench (#43852)
58415d1fac is described below
commit 58415d1fac50cb829b3dcf08526033d6db8c30db
Author: Antoine Pitrou <[email protected]>
AuthorDate: Thu Aug 29 02:54:32 2024 +0200
GH-38183: [CI][Python] Use pipx to install GCS testbench (#43852)
### Rationale for this change
Installing the GCS testbench using the same Python that's being used to
test PyArrow is fragile: some testbench versions may not be compatible, or
there could be conflicts among the dependencies of the respective libraries.
### What changes are included in this PR?
Use `pipx` to install the GCS testbench in a separate, controlled
environment, using an appropriate Python version.
### Are these changes tested?
Yes, by CI.
### Are there any user-facing changes?
No.
* GitHub Issue: #38183
Authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
---
.github/workflows/cpp.yml | 8 ++-
appveyor.yml | 1 +
ci/appveyor-cpp-build.bat | 2 +
ci/docker/conda-cpp.dockerfile | 12 ++--
ci/docker/conda-python.dockerfile | 5 --
.../python-wheel-windows-test-vs2019.dockerfile | 27 ++++++---
ci/docker/ubuntu-20.04-cpp-minimal.dockerfile | 1 +
ci/docker/ubuntu-22.04-cpp-minimal.dockerfile | 1 +
ci/docker/ubuntu-24.04-cpp-minimal.dockerfile | 1 +
ci/scripts/install_gcs_testbench.bat | 13 ++++-
ci/scripts/install_gcs_testbench.sh | 20 ++++---
ci/scripts/python_wheel_windows_test.bat | 40 +++++++------
cpp/src/arrow/filesystem/gcsfs_test.cc | 68 +++++++++++-----------
python/pyarrow/tests/conftest.py | 7 +--
python/scripts/run_emscripten_tests.py | 2 +-
r/tests/testthat/test-gcs.R | 4 +-
16 files changed, 122 insertions(+), 90 deletions(-)
diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml
index c5482f7308..fd23e0cf21 100644
--- a/.github/workflows/cpp.yml
+++ b/.github/workflows/cpp.yml
@@ -465,15 +465,17 @@ jobs:
chmod +x /usr/local/bin/minio.exe
- name: Set up Python
uses: actions/[email protected]
+ id: python-install
with:
python-version: 3.9
- name: Install Google Cloud Storage Testbench
- shell: bash
+ shell: msys2 {0}
+ env:
+ PIPX_BIN_DIR: /usr/local/bin
+ PIPX_PYTHON: ${{ steps.python-install.outputs.python-path }}
run: |
ci/scripts/install_gcs_testbench.sh default
- echo "PYTHON_BIN_DIR=$(cygpath --windows $(dirname $(which
python3.exe)))" >> $GITHUB_ENV
- name: Test
shell: msys2 {0}
run: |
- PATH="$(cygpath --unix ${PYTHON_BIN_DIR}):${PATH}"
ci/scripts/cpp_test.sh "$(pwd)" "$(pwd)/build"
diff --git a/appveyor.yml b/appveyor.yml
index 5954251d34..9e4582f1d8 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -24,6 +24,7 @@ only_commits:
- appveyor.yml
- ci/appveyor*
- ci/conda*
+ - ci/scripts/*.bat
- cpp/
- format/
- python/
diff --git a/ci/appveyor-cpp-build.bat b/ci/appveyor-cpp-build.bat
index f688fbb63a..08a052e82f 100644
--- a/ci/appveyor-cpp-build.bat
+++ b/ci/appveyor-cpp-build.bat
@@ -46,7 +46,9 @@ set ARROW_CMAKE_ARGS=-DARROW_DEPENDENCY_SOURCE=CONDA
-DARROW_WITH_BZ2=ON
set ARROW_CXXFLAGS=/WX /MP
@rem Install GCS testbench
+set PIPX_BIN_DIR=C:\Windows\
call %CD%\ci\scripts\install_gcs_testbench.bat
+storage-testbench -h || exit /B
@rem
@rem Build and test Arrow C++ libraries (including Parquet)
diff --git a/ci/docker/conda-cpp.dockerfile b/ci/docker/conda-cpp.dockerfile
index dff1f22248..eb035d887a 100644
--- a/ci/docker/conda-cpp.dockerfile
+++ b/ci/docker/conda-cpp.dockerfile
@@ -42,17 +42,19 @@ RUN mamba install -q -y \
valgrind && \
mamba clean --all
+# We want to install the GCS testbench using the Conda base environment's Python,
+# because the test environment's Python may later change.
+ENV PIPX_PYTHON=/opt/conda/bin/python3
+COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts
+RUN /arrow/ci/scripts/install_gcs_testbench.sh default
+
# Ensure npm, node and azurite are on path. npm and node are required to
install azurite, which will then need to
-# be on the path for the tests to run.
+# be on the path for the tests to run.
ENV PATH=/opt/conda/envs/arrow/bin:$PATH
COPY ci/scripts/install_azurite.sh /arrow/ci/scripts/
RUN /arrow/ci/scripts/install_azurite.sh
-# We want to install the GCS testbench using the same Python binary that the
Conda code will use.
-COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts
-RUN /arrow/ci/scripts/install_gcs_testbench.sh default
-
COPY ci/scripts/install_sccache.sh /arrow/ci/scripts/
RUN /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin
diff --git a/ci/docker/conda-python.dockerfile
b/ci/docker/conda-python.dockerfile
index 027fd589ce..7e8dbe76f6 100644
--- a/ci/docker/conda-python.dockerfile
+++ b/ci/docker/conda-python.dockerfile
@@ -32,11 +32,6 @@ RUN mamba install -q -y \
nomkl && \
mamba clean --all
-# XXX The GCS testbench was already installed in conda-cpp.dockerfile,
-# but we changed the installed Python version above, so we need to reinstall
it.
-COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts
-RUN /arrow/ci/scripts/install_gcs_testbench.sh default
-
ENV ARROW_ACERO=ON \
ARROW_BUILD_STATIC=OFF \
ARROW_BUILD_TESTS=OFF \
diff --git a/ci/docker/python-wheel-windows-test-vs2019.dockerfile
b/ci/docker/python-wheel-windows-test-vs2019.dockerfile
index 5f488a4c28..625ab25f84 100644
--- a/ci/docker/python-wheel-windows-test-vs2019.dockerfile
+++ b/ci/docker/python-wheel-windows-test-vs2019.dockerfile
@@ -35,16 +35,27 @@ RUN setx path "%path%;C:\Program Files\Git\usr\bin"
RUN wmic product where "name like 'python%%'" call uninstall /nointeractive &&
\
rm -rf Python*
+# Install the GCS testbench using a well-known Python version.
+# NOTE: cannot use pipx's `--fetch-missing-python` because of
+# https://github.com/pypa/pipx/issues/1521, therefore download Python ourselves.
+RUN choco install -r -y --pre --no-progress python --version=3.11.9
+ENV PIPX_BIN_DIR=C:\\Windows\\
+ENV PIPX_PYTHON="C:\Python311\python.exe"
+COPY ci/scripts/install_gcs_testbench.bat C:/arrow/ci/scripts/
+RUN call "C:\arrow\ci\scripts\install_gcs_testbench.bat" && \
+ storage-testbench -h
+
# Define the full version number otherwise choco falls back to patch number 0
(3.8 => 3.8.0)
ARG python=3.8
-RUN (if "%python%"=="3.8" setx PYTHON_VERSION "3.8.10" && setx PATH
"%PATH%;C:\Python38;C:\Python38\Scripts") & \
- (if "%python%"=="3.9" setx PYTHON_VERSION "3.9.13" && setx PATH
"%PATH%;C:\Python39;C:\Python39\Scripts") & \
- (if "%python%"=="3.10" setx PYTHON_VERSION "3.10.11" && setx PATH
"%PATH%;C:\Python310;C:\Python310\Scripts") & \
- (if "%python%"=="3.11" setx PYTHON_VERSION "3.11.9" && setx PATH
"%PATH%;C:\Python311;C:\Python311\Scripts") & \
- (if "%python%"=="3.12" setx PYTHON_VERSION "3.12.4" && setx PATH
"%PATH%;C:\Python312;C:\Python312\Scripts") & \
- (if "%python%"=="3.13" setx PYTHON_VERSION "3.13.0-rc1" && setx PATH
"%PATH%;C:\Python313;C:\Python313\Scripts")
+RUN (if "%python%"=="3.8" setx PYTHON_VERSION "3.8.10") & \
+ (if "%python%"=="3.9" setx PYTHON_VERSION "3.9.13") & \
+ (if "%python%"=="3.10" setx PYTHON_VERSION "3.10.11") & \
+ (if "%python%"=="3.11" setx PYTHON_VERSION "3.11.9") & \
+ (if "%python%"=="3.12" setx PYTHON_VERSION "3.12.4") & \
+ (if "%python%"=="3.13" setx PYTHON_VERSION "3.13.0-rc1")
# Install archiver to extract xz archives
-RUN choco install -r -y --pre --no-progress python --version=%PYTHON_VERSION%
& \
- python -m pip install --no-cache-dir -U pip setuptools & \
+RUN choco install -r -y --pre --no-progress --force python
--version=%PYTHON_VERSION% && \
choco install --no-progress -r -y archiver
+
+ENV PYTHON=$python
diff --git a/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile
b/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile
index e17c0306f1..4d867a448c 100644
--- a/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile
+++ b/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile
@@ -33,6 +33,7 @@ RUN apt-get update -y -q && \
libssl-dev \
libcurl4-openssl-dev \
python3-pip \
+ python3-venv \
tzdata \
wget && \
apt-get clean && \
diff --git a/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile
b/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile
index 341d8a87e8..f26cad51f0 100644
--- a/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile
+++ b/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile
@@ -33,6 +33,7 @@ RUN apt-get update -y -q && \
libssl-dev \
libcurl4-openssl-dev \
python3-pip \
+ python3-venv \
tzdata \
wget && \
apt-get clean && \
diff --git a/ci/docker/ubuntu-24.04-cpp-minimal.dockerfile
b/ci/docker/ubuntu-24.04-cpp-minimal.dockerfile
index a995ab2a8b..125bc7ba46 100644
--- a/ci/docker/ubuntu-24.04-cpp-minimal.dockerfile
+++ b/ci/docker/ubuntu-24.04-cpp-minimal.dockerfile
@@ -33,6 +33,7 @@ RUN apt-get update -y -q && \
libssl-dev \
libcurl4-openssl-dev \
python3-pip \
+ python3-venv \
tzdata \
tzdata-legacy \
wget && \
diff --git a/ci/scripts/install_gcs_testbench.bat
b/ci/scripts/install_gcs_testbench.bat
index b03d0c2ad6..f54f98db7c 100644
--- a/ci/scripts/install_gcs_testbench.bat
+++ b/ci/scripts/install_gcs_testbench.bat
@@ -17,9 +17,18 @@
@echo on
-set GCS_TESTBENCH_VERSION="v0.36.0"
+set GCS_TESTBENCH_VERSION="v0.40.0"
+
+set PIPX_FLAGS=--verbose
+if NOT "%PIPX_PYTHON%"=="" (
+ set PIPX_FLAGS=--python %PIPX_PYTHON% %PIPX_FLAGS%
+)
+
+python -m pip install -U pipx || exit /B 1
@REM Install GCS testbench %GCS_TESTBENCH_VERSION%
-python -m pip install ^
+pipx install %PIPX_FLAGS% ^
"https://github.com/googleapis/storage-testbench/archive/%GCS_TESTBENCH_VERSION%.tar.gz"
^
|| exit /B 1
+
+pipx list --verbose
diff --git a/ci/scripts/install_gcs_testbench.sh
b/ci/scripts/install_gcs_testbench.sh
index 5471b3cc23..78826e94d3 100755
--- a/ci/scripts/install_gcs_testbench.sh
+++ b/ci/scripts/install_gcs_testbench.sh
@@ -17,7 +17,7 @@
# specific language governing permissions and limitations
# under the License.
-set -e
+set -ex
if [ "$#" -ne 1 ]; then
echo "Usage: $0 <storage-testbench version>"
@@ -34,19 +34,23 @@ case "$(uname -m)" in
;;
esac
-# On newer pythons install into the system will fail, so override that
-export PIP_BREAK_SYSTEM_PACKAGES=1
-
version=$1
if [[ "${version}" -eq "default" ]]; then
version="v0.39.0"
- # Latests versions of Testbench require newer setuptools
- python3 -m pip install --upgrade setuptools
fi
+: ${PIPX_PYTHON:=$(which python3)}
+
+export PIP_BREAK_SYSTEM_PACKAGES=1
+${PIPX_PYTHON} -m pip install -U pipx
+
# This script is run with PYTHON undefined in some places,
# but those only use older pythons.
if [[ -z "${PYTHON_VERSION}" ]] || [[ "${PYTHON_VERSION}" != "3.13" ]]; then
- python3 -m pip install \
- "https://github.com/googleapis/storage-testbench/archive/${version}.tar.gz"
+ pipx_flags=--verbose
+ if [[ $(id -un) == "root" ]]; then
+ # Install globally as /root/.local/bin is typically not in $PATH
+ pipx_flags="${pipx_flags} --global"
+ fi
+ ${PIPX_PYTHON} -m pipx install ${pipx_flags}
"https://github.com/googleapis/storage-testbench/archive/${version}.tar.gz"
fi
diff --git a/ci/scripts/python_wheel_windows_test.bat
b/ci/scripts/python_wheel_windows_test.bat
index 87c0bb1252..cac3f18434 100755
--- a/ci/scripts/python_wheel_windows_test.bat
+++ b/ci/scripts/python_wheel_windows_test.bat
@@ -37,28 +37,32 @@ set PYARROW_TEST_TENSORFLOW=ON
set ARROW_TEST_DATA=C:\arrow\testing\data
set PARQUET_TEST_DATA=C:\arrow\cpp\submodules\parquet-testing\data
-@REM Install testing dependencies
-pip install -r C:\arrow\python\requirements-wheel-test.txt || exit /B 1
+@REM List installed Pythons
+py -0p
+
+set PYTHON_CMD=py -%PYTHON%
-@REM Install GCS testbench
-call "C:\arrow\ci\scripts\install_gcs_testbench.bat"
+%PYTHON_CMD% -m pip install -U pip setuptools || exit /B 1
+
+@REM Install testing dependencies
+%PYTHON_CMD% -m pip install -r C:\arrow\python\requirements-wheel-test.txt ||
exit /B 1
@REM Install the built wheels
-python -m pip install --no-index --find-links=C:\arrow\python\dist\ pyarrow ||
exit /B 1
+%PYTHON_CMD% -m pip install --no-index --find-links=C:\arrow\python\dist\
pyarrow || exit /B 1
@REM Test that the modules are importable
-python -c "import pyarrow" || exit /B 1
-python -c "import pyarrow._gcsfs" || exit /B 1
-python -c "import pyarrow._hdfs" || exit /B 1
-python -c "import pyarrow._s3fs" || exit /B 1
-python -c "import pyarrow.csv" || exit /B 1
-python -c "import pyarrow.dataset" || exit /B 1
-python -c "import pyarrow.flight" || exit /B 1
-python -c "import pyarrow.fs" || exit /B 1
-python -c "import pyarrow.json" || exit /B 1
-python -c "import pyarrow.orc" || exit /B 1
-python -c "import pyarrow.parquet" || exit /B 1
-python -c "import pyarrow.substrait" || exit /B 1
+%PYTHON_CMD% -c "import pyarrow" || exit /B 1
+%PYTHON_CMD% -c "import pyarrow._gcsfs" || exit /B 1
+%PYTHON_CMD% -c "import pyarrow._hdfs" || exit /B 1
+%PYTHON_CMD% -c "import pyarrow._s3fs" || exit /B 1
+%PYTHON_CMD% -c "import pyarrow.csv" || exit /B 1
+%PYTHON_CMD% -c "import pyarrow.dataset" || exit /B 1
+%PYTHON_CMD% -c "import pyarrow.flight" || exit /B 1
+%PYTHON_CMD% -c "import pyarrow.fs" || exit /B 1
+%PYTHON_CMD% -c "import pyarrow.json" || exit /B 1
+%PYTHON_CMD% -c "import pyarrow.orc" || exit /B 1
+%PYTHON_CMD% -c "import pyarrow.parquet" || exit /B 1
+%PYTHON_CMD% -c "import pyarrow.substrait" || exit /B 1
@rem Download IANA Timezone Database for ORC C++
curl https://cygwin.osuosl.org/noarch/release/tzdata/tzdata-2024a-1.tar.xz
--output tzdata.tar.xz || exit /B
@@ -67,4 +71,4 @@ arc unarchive tzdata.tar.xz
%USERPROFILE%\Downloads\test\tzdata
set TZDIR=%USERPROFILE%\Downloads\test\tzdata\usr\share\zoneinfo
@REM Execute unittest
-pytest -r s --pyargs pyarrow || exit /B 1
+%PYTHON_CMD% -m pytest -r s --pyargs pyarrow || exit /B 1
diff --git a/cpp/src/arrow/filesystem/gcsfs_test.cc
b/cpp/src/arrow/filesystem/gcsfs_test.cc
index a6022a8d21..2098cf4d7f 100644
--- a/cpp/src/arrow/filesystem/gcsfs_test.cc
+++ b/cpp/src/arrow/filesystem/gcsfs_test.cc
@@ -95,44 +95,41 @@ class GcsTestbench : public ::testing::Environment {
if (const auto* env = std::getenv("PYTHON")) {
names = {env};
}
- auto error = std::string(
- "Could not start GCS emulator."
- " Used the following list of python interpreter names:");
- for (const auto& interpreter : names) {
- auto exe_path = bp::search_path(interpreter);
- error += " " + interpreter;
- if (exe_path.empty()) {
- error += " (exe not found)";
- continue;
- }
+ auto error = std::string("Could not start GCS emulator
'storage-testbench'");
- bp::ipstream output;
- server_process_ = bp::child(exe_path, "-m", "testbench", "--port",
port_, group_,
- bp::std_err > output);
+ auto testbench_is_running = [](bp::child& process, bp::ipstream& output) {
// Wait for message: "* Restarting with"
- auto testbench_is_running = [&output, this](bp::child& process) {
- std::string line;
- std::chrono::time_point<std::chrono::steady_clock> end =
- std::chrono::steady_clock::now() + std::chrono::seconds(10);
- while (server_process_.valid() && server_process_.running() &&
- std::chrono::steady_clock::now() < end) {
- if (output.peek() && std::getline(output, line)) {
- std::cerr << line << std::endl;
- if (line.find("* Restarting with") != std::string::npos) return
true;
- } else {
- std::this_thread::sleep_for(std::chrono::milliseconds(20));
- }
+ std::string line;
+ std::chrono::time_point<std::chrono::steady_clock> end =
+ std::chrono::steady_clock::now() + std::chrono::seconds(10);
+ while (process.valid() && process.running() &&
+ std::chrono::steady_clock::now() < end) {
+ if (output.peek() && std::getline(output, line)) {
+ std::cerr << line << std::endl;
+ if (line.find("* Restarting with") != std::string::npos) return true;
+ } else {
+ std::this_thread::sleep_for(std::chrono::milliseconds(20));
}
- return false;
- };
+ }
+ return false;
+ };
- if (testbench_is_running(server_process_)) break;
- error += " (failed to start)";
- server_process_.terminate();
- server_process_.wait();
+ auto exe_path = bp::search_path("storage-testbench");
+ if (!exe_path.empty()) {
+ bp::ipstream output;
+ server_process_ =
+ bp::child(exe_path, "--port", port_, group_, bp::std_err > output);
+ if (!testbench_is_running(server_process_, output)) {
+ error += " (failed to start)";
+ server_process_.terminate();
+ server_process_.wait();
+ }
+ } else {
+ error += " (exe not found)";
+ }
+ if (!server_process_.valid()) {
+ error_ = std::move(error);
}
- if (server_process_.valid() && server_process_.valid()) return;
- error_ = std::move(error);
}
bool running() { return server_process_.running(); }
@@ -140,7 +137,10 @@ class GcsTestbench : public ::testing::Environment {
~GcsTestbench() override {
// Brutal shutdown, kill the full process group because the GCS testbench
may launch
// additional children.
- group_.terminate();
+ try {
+ group_.terminate();
+ } catch (bp::process_error&) {
+ }
if (server_process_.valid()) {
server_process_.wait();
}
diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py
index e1919497b5..7a222cec8a 100644
--- a/python/pyarrow/tests/conftest.py
+++ b/python/pyarrow/tests/conftest.py
@@ -233,17 +233,16 @@ def s3_server(s3_connection, tmpdir_factory):
def gcs_server():
port = find_free_port()
env = os.environ.copy()
- args = [sys.executable, '-m', 'testbench', '--port', str(port)]
+ exe = 'storage-testbench'
+ args = [exe, '--port', str(port)]
proc = None
try:
- # check first if testbench module is available
- import testbench # noqa:F401
# start server
proc = subprocess.Popen(args, env=env)
# Make sure the server is alive.
if proc.poll() is not None:
pytest.skip(f"Command {args} did not start server successfully!")
- except (ModuleNotFoundError, OSError) as e:
+ except OSError as e:
pytest.skip(f"Command {args} failed to execute: {e}")
else:
yield {
diff --git a/python/scripts/run_emscripten_tests.py
b/python/scripts/run_emscripten_tests.py
index 1a4b4a4e05..53d3dd52bd 100644
--- a/python/scripts/run_emscripten_tests.py
+++ b/python/scripts/run_emscripten_tests.py
@@ -335,7 +335,7 @@ with launch_server(dist_dir) as (hostname, port):
"""
import pyarrow,pathlib
pyarrow_dir = pathlib.Path(pyarrow.__file__).parent
-pytest.main([pyarrow_dir, '-v'])
+pytest.main([pyarrow_dir, '-r', 's'])
""",
wait_for_terminate=False,
)
diff --git a/r/tests/testthat/test-gcs.R b/r/tests/testthat/test-gcs.R
index d671c12138..54159e82ca 100644
--- a/r/tests/testthat/test-gcs.R
+++ b/r/tests/testthat/test-gcs.R
@@ -116,12 +116,12 @@ test_that("GcsFileSystem$create() can read
json_credentials", {
})
skip_on_cran()
-skip_if_not(system('python -c "import testbench"') == 0, message =
"googleapis-storage-testbench is not installed.")
+skip_if_not(system("storage-testbench -h") == 0, message =
"googleapis-storage-testbench is not installed.")
library(dplyr)
testbench_port <- Sys.getenv("TESTBENCH_PORT", "9001")
-pid_minio <- sys::exec_background("python", c("-m", "testbench", "--port",
testbench_port),
+pid_minio <- sys::exec_background("storage-testbench", c("--port",
testbench_port),
std_out = FALSE,
std_err = FALSE # TODO: is there a good place to send output?
)