This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 6b59098698 GH-41922: [CI][C++] Update Minio version (#44225)
6b59098698 is described below
commit 6b59098698712a5c14dfdbfa1a6d7b16ccb2085b
Author: Antoine Pitrou <[email protected]>
AuthorDate: Tue Oct 1 16:49:47 2024 +0200
GH-41922: [CI][C++] Update Minio version (#44225)
We were using a rather old version of Minio for CI tests.
Update the Minio version and ensure the S3FileSystem implementation passes
all the tests with it.
* GitHub Issue: #41922
Lead-authored-by: Antoine Pitrou <[email protected]>
Co-authored-by: Sutou Kouhei <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
.github/workflows/cpp.yml | 2 +-
ci/appveyor-cpp-setup.bat | 2 +-
.../python-wheel-windows-test-vs2019.dockerfile | 2 +-
ci/scripts/cpp_build.sh | 8 ++-
ci/scripts/install_minio.sh | 4 +-
cpp/src/arrow/filesystem/s3fs.cc | 58 ++++++++++++++++------
cpp/src/arrow/filesystem/s3fs_test.cc | 15 +++++-
cpp/src/arrow/filesystem/test_util.cc | 10 ++--
cpp/src/arrow/filesystem/test_util.h | 3 ++
python/pyarrow/tests/conftest.py | 4 +-
python/pyarrow/tests/parquet/test_dataset.py | 4 ++
python/pyarrow/tests/test_dataset.py | 8 +--
python/pyarrow/tests/test_fs.py | 5 +-
python/pyarrow/tests/util.py | 17 +++----
14 files changed, 99 insertions(+), 43 deletions(-)
diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml
index 762032f8e4..93bc723cd4 100644
--- a/.github/workflows/cpp.yml
+++ b/.github/workflows/cpp.yml
@@ -460,7 +460,7 @@ jobs:
mkdir -p /usr/local/bin
wget \
--output-document /usr/local/bin/minio.exe \
-
https://dl.min.io/server/minio/release/windows-amd64/archive/minio.RELEASE.2022-05-26T05-48-41Z
+
https://dl.min.io/server/minio/release/windows-amd64/archive/minio.RELEASE.2024-09-13T20-26-02Z
chmod +x /usr/local/bin/minio.exe
- name: Set up Python
uses: actions/[email protected]
diff --git a/ci/appveyor-cpp-setup.bat b/ci/appveyor-cpp-setup.bat
index 092bf2640c..f9463e5074 100644
--- a/ci/appveyor-cpp-setup.bat
+++ b/ci/appveyor-cpp-setup.bat
@@ -94,7 +94,7 @@ set CXX=cl.exe
@rem Download Minio somewhere on PATH, for unit tests
@rem
if "%ARROW_S3%" == "ON" (
- appveyor DownloadFile
https://dl.min.io/server/minio/release/windows-amd64/archive/minio.RELEASE.2022-05-26T05-48-41Z
-FileName C:\Windows\Minio.exe || exit /B
+ appveyor DownloadFile
https://dl.min.io/server/minio/release/windows-amd64/archive/minio.RELEASE.2024-09-13T20-26-02Z
-FileName C:\Windows\Minio.exe || exit /B
)
diff --git a/ci/docker/python-wheel-windows-test-vs2019.dockerfile
b/ci/docker/python-wheel-windows-test-vs2019.dockerfile
index 8c17ebfa2f..564bca329f 100644
--- a/ci/docker/python-wheel-windows-test-vs2019.dockerfile
+++ b/ci/docker/python-wheel-windows-test-vs2019.dockerfile
@@ -35,7 +35,7 @@ RUN setx path "%path%;C:\Program Files\Git\usr\bin"
# 2. Install Minio for S3 testing.
RUN wmic product where "name like 'python%%'" call uninstall /nointeractive &&
\
rm -rf Python* && \
- curl
https://dl.min.io/server/minio/release/windows-amd64/archive/minio.RELEASE.2022-05-26T05-48-41Z
\
+ curl
https://dl.min.io/server/minio/release/windows-amd64/archive/minio.RELEASE.2024-09-13T20-26-02Z
\
--output "C:\Windows\Minio.exe"
# Install the GCS testbench using a well-known Python version.
diff --git a/ci/scripts/cpp_build.sh b/ci/scripts/cpp_build.sh
index bc2bba915f..ac0ba2bf72 100755
--- a/ci/scripts/cpp_build.sh
+++ b/ci/scripts/cpp_build.sh
@@ -34,7 +34,13 @@ if [ ! -z "${CONDA_PREFIX}" ] && [
"${ARROW_EMSCRIPTEN:-OFF}" = "OFF" ]; then
echo -e "===\n=== Conda environment for build\n==="
conda list
- export ARROW_CMAKE_ARGS="${ARROW_CMAKE_ARGS} -DCMAKE_AR=${AR}
-DCMAKE_RANLIB=${RANLIB}"
+ if [ -n "${AR}" ]; then
+ ARROW_CMAKE_ARGS+=" -DCMAKE_AR=${AR}"
+ fi
+ if [ -n "${RANLIB}" ]; then
+ ARROW_CMAKE_ARGS+=" -DCMAKE_RANLIB=${RANLIB}"
+ fi
+ export ARROW_CMAKE_ARGS
export ARROW_GANDIVA_PC_CXX_FLAGS=$(echo | ${CXX} -E -Wp,-v -xc++ - 2>&1 |
grep '^ ' | awk '{print "-isystem;" substr($1, 1)}' | tr '\n' ';')
elif [ -x "$(command -v xcrun)" ]; then
export ARROW_GANDIVA_PC_CXX_FLAGS="-isysroot;$(xcrun --show-sdk-path)"
diff --git a/ci/scripts/install_minio.sh b/ci/scripts/install_minio.sh
index 40762c9f32..6f9701ab5a 100755
--- a/ci/scripts/install_minio.sh
+++ b/ci/scripts/install_minio.sh
@@ -63,8 +63,8 @@ if [ "${version}" != "latest" ]; then
fi
# Use specific versions for minio server and client to avoid CI failures on
new releases.
-minio_version="minio.RELEASE.2022-05-26T05-48-41Z"
-mc_version="mc.RELEASE.2022-05-09T04-08-26Z"
+minio_version="minio.RELEASE.2024-09-13T20-26-02Z"
+mc_version="mc.RELEASE.2024-09-16T17-43-14Z"
download()
{
diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc
index 77b111f61b..3a0ade3d2e 100644
--- a/cpp/src/arrow/filesystem/s3fs.cc
+++ b/cpp/src/arrow/filesystem/s3fs.cc
@@ -2218,11 +2218,28 @@ class S3FileSystem::Impl : public
std::enable_shared_from_this<S3FileSystem::Imp
return std::string(FromAwsString(builder_.config().region));
}
+ // TODO: for every returned error, call GetOrSetBackend()?
+
template <typename Error>
- void SaveBackend(const Aws::Client::AWSError<Error>& error) {
+ S3Backend GetOrSetBackend(const Aws::Client::AWSError<Error>& error) {
if (!backend_ || *backend_ == S3Backend::Other) {
backend_ = DetectS3Backend(error);
}
+ DCHECK(backend_.has_value());
+ return *backend_;
+ }
+
+ Result<S3Backend> GetBackend() {
+ if (!backend_) {
+ ARROW_ASSIGN_OR_RAISE(auto client_lock, holder_->Lock());
+
+ S3Model::HeadBucketRequest req;
+ req.SetBucket("$extremelyunlikelytoexist$");
+ auto outcome = client_lock.Move()->HeadBucket(req);
+ DCHECK(!outcome.IsSuccess());
+ return GetOrSetBackend(outcome.GetError());
+ }
+ return *backend_;
}
// Tests to see if a bucket exists
@@ -2345,12 +2362,7 @@ class S3FileSystem::Impl : public
std::enable_shared_from_this<S3FileSystem::Imp
if (previous_outcome) {
// Fetch the backend from the previous error
- DCHECK(!previous_outcome->IsSuccess());
- if (!backend_) {
- SaveBackend(previous_outcome->GetError());
- DCHECK(backend_);
- }
- if (backend_ != S3Backend::Minio) {
+ if (GetOrSetBackend(previous_outcome->GetError()) != S3Backend::Minio) {
// HEAD already returned a 404, nothing more to do
return false;
}
@@ -2373,9 +2385,7 @@ class S3FileSystem::Impl : public
std::enable_shared_from_this<S3FileSystem::Imp
return true;
}
if (!backend_) {
- SaveBackend(outcome.GetError());
- DCHECK(backend_);
- if (*backend_ == S3Backend::Minio) {
+ if (GetOrSetBackend(outcome.GetError()) == S3Backend::Minio) {
// Try again with separator-terminated key (see above)
return IsEmptyDirectory(bucket, key);
}
@@ -3053,6 +3063,7 @@ Result<FileInfo> S3FileSystem::GetFileInfo(const
std::string& s) {
auto outcome = client_lock.Move()->HeadBucket(req);
if (!outcome.IsSuccess()) {
+ impl_->GetOrSetBackend(outcome.GetError());
if (!IsNotFound(outcome.GetError())) {
const auto msg = "When getting information for bucket '" + path.bucket
+ "': ";
return ErrorToStatus(msg, "HeadBucket", outcome.GetError(),
@@ -3077,6 +3088,7 @@ Result<FileInfo> S3FileSystem::GetFileInfo(const
std::string& s) {
FileObjectToInfo(path.key, outcome.GetResult(), &info);
return info;
}
+ impl_->GetOrSetBackend(outcome.GetError());
if (!IsNotFound(outcome.GetError())) {
const auto msg = "When getting information for key '" + path.key + "' in
bucket '" +
path.bucket + "': ";
@@ -3124,8 +3136,9 @@ Status S3FileSystem::CreateDir(const std::string& s, bool
recursive) {
return impl_->CreateBucket(path.bucket);
}
+ ARROW_ASSIGN_OR_RAISE(auto backend, impl_->GetBackend());
+
FileInfo file_info;
- // Create object
if (recursive) {
// Ensure bucket exists
ARROW_ASSIGN_OR_RAISE(bool bucket_exists,
impl_->BucketExists(path.bucket));
@@ -3135,7 +3148,8 @@ Status S3FileSystem::CreateDir(const std::string& s, bool
recursive) {
auto key_i = path.key_parts.begin();
std::string parent_key{};
- if (options().check_directory_existence_before_creation) {
+ if (options().check_directory_existence_before_creation ||
+ backend == S3Backend::Minio) {
// Walk up the directory first to find the first existing parent
for (const auto& part : path.key_parts) {
parent_key += part;
@@ -3146,6 +3160,10 @@ Status S3FileSystem::CreateDir(const std::string& s,
bool recursive) {
this->GetFileInfo(path.bucket + kSep +
parent_key));
if (file_info.type() != FileType::NotFound) {
// Found!
+ if (file_info.type() != FileType::Directory) {
+ return Status::IOError("Cannot create directory '",
file_info.path(),
+ "': a non-directory entry already exists");
+ }
break;
} else {
// remove the kSep and the part
@@ -3178,15 +3196,23 @@ Status S3FileSystem::CreateDir(const std::string& s,
bool recursive) {
}
}
- // Check if the directory exists already
- if (options().check_directory_existence_before_creation) {
+ // Non-recursive operation
+
+ // Check if the entry exists already
+ if (options().check_directory_existence_before_creation ||
+ backend == S3Backend::Minio) {
ARROW_ASSIGN_OR_RAISE(file_info, this->GetFileInfo(path.full_path));
if (file_info.type() != FileType::NotFound) {
+ if (file_info.type() != FileType::Directory) {
+ return Status::IOError("Cannot create directory '", file_info.path(),
+ "': a non-directory entry already exists");
+ }
return Status::OK();
}
}
- // XXX Should we check that no non-directory entry exists?
- // Minio does it for us, not sure about other S3 implementations.
+ // NOTE: this won't check that no non-directory entry exists with the same
name
+ // (unlike when `check_directory_existence_before_creation` is enabled).
+ // Old versions of Minio do it for us, newer versions don't.
return impl_->CreateEmptyDir(path.bucket, path.key);
}
diff --git a/cpp/src/arrow/filesystem/s3fs_test.cc
b/cpp/src/arrow/filesystem/s3fs_test.cc
index 82a7d6e546..b8f497d23c 100644
--- a/cpp/src/arrow/filesystem/s3fs_test.cc
+++ b/cpp/src/arrow/filesystem/s3fs_test.cc
@@ -995,7 +995,7 @@ TEST_F(TestS3FS, CreateDir) {
ASSERT_OK(fs_->CreateDir("bucket/newdir", /*recursive=*/false));
AssertFileInfo(fs_.get(), "bucket/newdir", FileType::Directory);
- // By default CreateDir uses recursvie mode, make it explictly to be false
+ // Non-recursive, but parent does not exist
ASSERT_RAISES(IOError,
fs_->CreateDir("bucket/newdir/newsub/newsubsub",
/*recursive=*/false));
@@ -1005,7 +1005,8 @@ TEST_F(TestS3FS, CreateDir) {
AssertFileInfo(fs_.get(), "bucket/newdir/newsub/newsubsub",
FileType::Directory);
// Existing "file", should fail
- ASSERT_RAISES(IOError, fs_->CreateDir("bucket/somefile"));
+ ASSERT_RAISES(IOError, fs_->CreateDir("bucket/somefile",
/*recursive=*/false));
+ ASSERT_RAISES(IOError, fs_->CreateDir("bucket/somefile",
/*recursive=*/true));
// URI
ASSERT_RAISES(Invalid, fs_->CreateDir("s3:bucket/newdir2"));
@@ -1017,6 +1018,11 @@ TEST_F(TestS3FS, CreateDir) {
// check existing before creation
options_.check_directory_existence_before_creation = true;
MakeFileSystem();
+
+ // Existing "file", should fail again
+ ASSERT_RAISES(IOError, fs_->CreateDir("bucket/somefile",
/*recursive=*/false));
+ ASSERT_RAISES(IOError, fs_->CreateDir("bucket/somefile",
/*recursive=*/true));
+
// New "directory" again
AssertFileInfo(fs_.get(), "bucket/checknewdir", FileType::NotFound);
ASSERT_OK(fs_->CreateDir("bucket/checknewdir"));
@@ -1031,6 +1037,7 @@ TEST_F(TestS3FS, CreateDir) {
AssertFileInfo(fs_.get(), "bucket/checknewdir/newsub/newsubsub",
FileType::Directory);
AssertFileInfo(fs_.get(), "bucket/checknewdir/newsub/newsubsub/newsubsub",
FileType::NotFound);
+
// Try creation with the same name
ASSERT_OK(fs_->CreateDir("bucket/checknewdir/newsub/newsubsub/newsubsub/",
/*recursive=*/true));
@@ -1532,6 +1539,10 @@ class TestS3FSGeneric : public S3TestMixin, public
GenericFileSystemTest {
bool have_implicit_directories() const override { return true; }
bool allow_write_file_over_dir() const override { return true; }
+ bool allow_write_implicit_dir_over_file() const override {
+ // Recent Minio versions allow this
+ return true;
+ }
bool allow_move_dir() const override { return false; }
bool allow_append_to_file() const override { return false; }
bool have_directory_mtimes() const override { return false; }
diff --git a/cpp/src/arrow/filesystem/test_util.cc
b/cpp/src/arrow/filesystem/test_util.cc
index 8eb00b8ae4..a6c8976360 100644
--- a/cpp/src/arrow/filesystem/test_util.cc
+++ b/cpp/src/arrow/filesystem/test_util.cc
@@ -447,7 +447,9 @@ void GenericFileSystemTest::TestMoveFile(FileSystem* fs) {
}
// Parent destination is not a directory
CreateFile(fs, "xxx", "");
- ASSERT_RAISES(IOError, fs->Move("AB/pqr", "xxx/mno"));
+ if (!allow_write_implicit_dir_over_file()) {
+ ASSERT_RAISES(IOError, fs->Move("AB/pqr", "xxx/mno"));
+ }
if (!allow_write_file_over_dir()) {
// Destination is a directory
ASSERT_RAISES(IOError, fs->Move("AB/pqr", "EF"));
@@ -568,8 +570,10 @@ void GenericFileSystemTest::TestCopyFile(FileSystem* fs) {
// Parent destination doesn't exist
ASSERT_RAISES(IOError, fs->CopyFile("AB/abc", "XX/mno"));
}
- // Parent destination is not a directory
- ASSERT_RAISES(IOError, fs->CopyFile("AB/abc", "def/mno"));
+ // Parent destination is not a directory ("def" is a file)
+ if (!allow_write_implicit_dir_over_file()) {
+ ASSERT_RAISES(IOError, fs->CopyFile("AB/abc", "def/mno"));
+ }
AssertAllDirs(fs, all_dirs);
AssertAllFiles(fs, {"AB/abc", "EF/ghi", "def"});
}
diff --git a/cpp/src/arrow/filesystem/test_util.h
b/cpp/src/arrow/filesystem/test_util.h
index e70c787aa8..04000c14e9 100644
--- a/cpp/src/arrow/filesystem/test_util.h
+++ b/cpp/src/arrow/filesystem/test_util.h
@@ -166,6 +166,9 @@ class ARROW_TESTING_EXPORT GenericFileSystemTest {
virtual bool have_implicit_directories() const { return false; }
// - Whether the filesystem may allow writing a file "over" a directory
virtual bool allow_write_file_over_dir() const { return false; }
+ // - Whether the filesystem may allow writing a directory "over" a file,
+ // for example copying file "A" to "B/C" while "B" exists and is a file.
+ virtual bool allow_write_implicit_dir_over_file() const { return false; }
// - Whether the filesystem allows reading a directory
virtual bool allow_read_dir_as_file() const { return false; }
// - Whether the filesystem allows moving a file
diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py
index 0b82696d0a..63518beeba 100644
--- a/python/pyarrow/tests/conftest.py
+++ b/python/pyarrow/tests/conftest.py
@@ -151,7 +151,7 @@ def hdfs_connection():
@pytest.fixture(scope='session')
def s3_connection():
- host, port = 'localhost', find_free_port()
+ host, port = '127.0.0.1', find_free_port()
access_key, secret_key = 'arrow', 'apachearrow'
return host, port, access_key, secret_key
@@ -195,7 +195,7 @@ def retry(attempts=3, delay=1.0, max_delay=None, backoff=1):
def s3_server(s3_connection, tmpdir_factory):
@retry(attempts=5, delay=1, backoff=2)
def minio_server_health_check(address):
- resp = urllib.request.urlopen(f"http://{address}/minio/health/cluster")
+ resp = urllib.request.urlopen(f"http://{address}/minio/health/live")
assert resp.getcode() == 200
tmpdir = tmpdir_factory.getbasetemp()
diff --git a/python/pyarrow/tests/parquet/test_dataset.py
b/python/pyarrow/tests/parquet/test_dataset.py
index f68f1aa9cd..5cab902fda 100644
--- a/python/pyarrow/tests/parquet/test_dataset.py
+++ b/python/pyarrow/tests/parquet/test_dataset.py
@@ -19,6 +19,7 @@ import datetime
import inspect
import os
import pathlib
+import sys
try:
import numpy as np
@@ -1075,6 +1076,9 @@ def test_write_to_dataset_pathlib_nonlocal(tempdir,
s3_example_s3fs):
@pytest.mark.pandas
@pytest.mark.s3
+# See https://github.com/apache/arrow/pull/44225#issuecomment-2378365291
[email protected](sys.platform == "win32",
+ reason="test fails because of unsupported characters")
def test_write_to_dataset_with_partitions_s3fs(s3_example_s3fs):
fs, path = s3_example_s3fs
diff --git a/python/pyarrow/tests/test_dataset.py
b/python/pyarrow/tests/test_dataset.py
index 276cd2e78d..ab18159091 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -4914,13 +4914,15 @@ def test_write_dataset_s3_put_only(s3_server):
# write dataset with s3 filesystem
host, port, _, _ = s3_server['connection']
+
+ _configure_s3_limited_user(s3_server, _minio_put_only_policy,
+ 'test_dataset_limited_user', 'limited123')
fs = S3FileSystem(
- access_key='limited',
+ access_key='test_dataset_limited_user',
secret_key='limited123',
endpoint_override='{}:{}'.format(host, port),
scheme='http'
)
- _configure_s3_limited_user(s3_server, _minio_put_only_policy)
table = pa.table([
pa.array(range(20)), pa.array(random.random() for _ in range(20)),
@@ -4970,7 +4972,7 @@ def test_write_dataset_s3_put_only(s3_server):
scheme='http',
allow_bucket_creation=True,
)
- with pytest.raises(OSError, match="Access Denied"):
+ with pytest.raises(OSError, match="(Access Denied|ACCESS_DENIED)"):
ds.write_dataset(
table, "non-existing-bucket", filesystem=fs,
format="feather", create_dir=True,
diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py
index 1c639412cd..99d82c3cbf 100644
--- a/python/pyarrow/tests/test_fs.py
+++ b/python/pyarrow/tests/test_fs.py
@@ -515,11 +515,12 @@ def skip_azure(fs, reason):
@pytest.mark.s3
def test_s3fs_limited_permissions_create_bucket(s3_server):
from pyarrow.fs import S3FileSystem
- _configure_s3_limited_user(s3_server, _minio_limited_policy)
+ _configure_s3_limited_user(s3_server, _minio_limited_policy,
+ 'test_fs_limited_user', 'limited123')
host, port, _, _ = s3_server['connection']
fs = S3FileSystem(
- access_key='limited',
+ access_key='test_fs_limited_user',
secret_key='limited123',
endpoint_override='{}:{}'.format(host, port),
scheme='http'
diff --git a/python/pyarrow/tests/util.py b/python/pyarrow/tests/util.py
index acbb2a5c0d..84215d30ef 100644
--- a/python/pyarrow/tests/util.py
+++ b/python/pyarrow/tests/util.py
@@ -371,7 +371,7 @@ def _run_mc_command(mcdir, *args):
raise ChildProcessError("Could not run mc")
-def _configure_s3_limited_user(s3_server, policy):
+def _configure_s3_limited_user(s3_server, policy, username, password):
"""
Attempts to use the mc command to configure the minio server
with a special user limited:limited123 which does not have
@@ -407,15 +407,14 @@ def _configure_s3_limited_user(s3_server, policy):
# The s3_server fixture starts the minio process but
# it takes a few moments for the process to become available
_wait_for_minio_startup(mcdir, address, access_key, secret_key)
- # These commands create a limited user with a specific
- # policy and creates a sample bucket for that user to
- # write to
- _run_mc_command(mcdir, 'admin', 'policy', 'add',
- 'myminio/', 'no-create-buckets', policy_path)
+ # Create a limited user with a specific policy ...
_run_mc_command(mcdir, 'admin', 'user', 'add',
- 'myminio/', 'limited', 'limited123')
- _run_mc_command(mcdir, 'admin', 'policy', 'set',
- 'myminio', 'no-create-buckets', 'user=limited')
+ 'myminio/', username, password)
+ _run_mc_command(mcdir, 'admin', 'policy', 'create',
+ 'myminio/', 'no-create-buckets', policy_path)
+ _run_mc_command(mcdir, 'admin', 'policy', 'attach',
+ 'myminio/', 'no-create-buckets', '--user', username)
+ # ... and a sample bucket for that user to write to
_run_mc_command(mcdir, 'mb', 'myminio/existing-bucket',
'--ignore-existing')