This is an automated email from the ASF dual-hosted git repository. raulcd pushed a commit to branch maint-14.0.x in repository https://gitbox.apache.org/repos/asf/arrow.git
commit ae8ea4d7109f66a2819c3a6586779564eb4d737e Author: Joris Van den Bossche <[email protected]> AuthorDate: Tue Dec 5 18:23:15 2023 +0100 GH-38618: [C++] S3FileSystem: fix regression in deleting explicitly created sub-directories (#38845) ### Rationale for this change See https://github.com/apache/arrow/issues/38618#issuecomment-1821252024 and below for the analysis. When deleting the dir contents, we use a GetFileInfo with recursive FileSelector to list all objects to delete, but when doing that the file paths for directories don't end in a trailing `/`, so for deleting explicitly created directories we need to add the `kSep` here as well to properly delete the object. ### Are these changes tested? I tested them manually with an actual S3 bucket. The problem is that MinIO doesn't have the same problem, and so it's not actually tested with the test I added using our MinIO testing setup. ### Are there any user-facing changes? Fixes the regression * Closes: #38618 Lead-authored-by: Joris Van den Bossche <[email protected]> Co-authored-by: Antoine Pitrou <[email protected]> Signed-off-by: Joris Van den Bossche <[email protected]> --- cpp/src/arrow/filesystem/s3fs.cc | 11 ++++++++++- python/pyarrow/tests/test_fs.py | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc index 08fbcde6fd..b2fd3715dd 100644 --- a/cpp/src/arrow/filesystem/s3fs.cc +++ b/cpp/src/arrow/filesystem/s3fs.cc @@ -2408,7 +2408,16 @@ class S3FileSystem::Impl : public std::enable_shared_from_this<S3FileSystem::Imp std::vector<std::string> file_paths; for (const auto& file_info : file_infos) { DCHECK_GT(file_info.path().size(), bucket.size()); - file_paths.push_back(file_info.path().substr(bucket.size() + 1)); + auto file_path = file_info.path().substr(bucket.size() + 1); + if (file_info.IsDirectory()) { + // The selector returns FileInfo objects for directories with a + // a path that never ends in a trailing slash, but for AWS the file + // needs to have a trailing slash to recognize it as directory + // (https://github.com/apache/arrow/issues/38618) + DCHECK_OK(internal::AssertNoTrailingSlash(file_path)); + file_path = file_path + kSep; + } + file_paths.push_back(std::move(file_path)); } scheduler->AddSimpleTask( [=, file_paths = std::move(file_paths)] { diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index c540bf9681..6f91b7d4d1 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -760,6 +760,38 @@ def test_delete_dir(fs, pathfn): fs.delete_dir(d) +def test_delete_dir_with_explicit_subdir(fs, pathfn): + # GH-38618: regression with AWS failing to delete directories, + # depending on whether they were created explicitly. Note that + # Minio doesn't reproduce the issue, so this test is not a regression + # test in itself. + skip_fsspec_s3fs(fs) + + d = pathfn('directory/') + nd = pathfn('directory/nested/') + + # deleting dir with explicit subdir + fs.create_dir(d) + fs.create_dir(nd) + fs.delete_dir(d) + dir_info = fs.get_file_info(d) + assert dir_info.type == FileType.NotFound + + # deleting dir with blob in explicit subdir + d = pathfn('directory2') + nd = pathfn('directory2/nested') + f = pathfn('directory2/nested/target-file') + + fs.create_dir(d) + fs.create_dir(nd) + with fs.open_output_stream(f) as s: + s.write(b'data') + + fs.delete_dir(d) + dir_info = fs.get_file_info(d) + assert dir_info.type == FileType.NotFound + + def test_delete_dir_contents(fs, pathfn): skip_fsspec_s3fs(fs)
