This is an automated email from the ASF dual-hosted git repository.
kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 06d4495173 GH-41034: [C++][FS][Azure] Adjust
DeleteDir/DeleteDirContents/GetFileInfoSelector behaviors against Azure for
generic filesystem tests (#41068)
06d4495173 is described below
commit 06d44951736b5a6e5836f2ea2fc988d935ea7f32
Author: Sutou Kouhei <[email protected]>
AuthorDate: Thu Apr 11 08:59:58 2024 +0900
GH-41034: [C++][FS][Azure] Adjust
DeleteDir/DeleteDirContents/GetFileInfoSelector behaviors against Azure for
generic filesystem tests (#41068)
### Rationale for this change
They are failing:
```text
[ FAILED ] TestAzureHierarchicalNSGeneric.DeleteDir
[ FAILED ] TestAzureHierarchicalNSGeneric.DeleteDirContents
[ FAILED ] TestAzureHierarchicalNSGeneric.GetFileInfoSelector
```
### What changes are included in this PR?
* `DeleteDir()`: Check not a directory case
* `DeleteDirContents()`: Check not a directory case
* `GetFileInfoSelector()`:
* Add not a directory check for input
* Add support for returning metadata for directory
### Are these changes tested?
Yes.
### Are there any user-facing changes?
Yes.
* GitHub Issue: #41034
Authored-by: Sutou Kouhei <[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
---
cpp/src/arrow/filesystem/azurefs.cc | 83 +++++++++++++++++++++++++++++++-
cpp/src/arrow/filesystem/azurefs_test.cc | 6 ++-
2 files changed, 87 insertions(+), 2 deletions(-)
diff --git a/cpp/src/arrow/filesystem/azurefs.cc
b/cpp/src/arrow/filesystem/azurefs.cc
index 84733a824e..bb8309a247 100644
--- a/cpp/src/arrow/filesystem/azurefs.cc
+++ b/cpp/src/arrow/filesystem/azurefs.cc
@@ -1179,6 +1179,15 @@ Status CreateContainerIfNotExists(const std::string&
container_name,
}
}
+FileInfo FileInfoFromPath(std::string_view container,
+ const DataLake::Models::PathItem& path) {
+ FileInfo info{internal::ConcatAbstractPath(container, path.Name),
+ path.IsDirectory ? FileType::Directory : FileType::File};
+ info.set_size(path.FileSize);
+ info.set_mtime(std::chrono::system_clock::time_point{path.LastModified});
+ return info;
+}
+
FileInfo DirectoryFileInfoFromPath(std::string_view path) {
return FileInfo{std::string{internal::RemoveTrailingSlash(path)},
FileType::Directory};
}
@@ -1576,7 +1585,7 @@ class AzureFileSystem::Impl {
}
private:
- /// \pref location.container is not empty.
+ /// \pre location.container is not empty.
template <typename ContainerClient>
Status CheckDirExists(const ContainerClient& container_client,
const AzureLocation& location) {
@@ -1623,6 +1632,50 @@ class AzureFileSystem::Impl {
return result;
}
+ /// \brief List the paths at the root of a filesystem or some dir in a
filesystem.
+ ///
+ /// \pre adlfs_client is the client for the filesystem named like the first
+ /// segment of select.base_dir.
+ Status GetFileInfoWithSelectorFromFileSystem(
+ const DataLake::DataLakeFileSystemClient& adlfs_client,
+ const Core::Context& context, Azure::Nullable<int32_t> page_size_hint,
+ const FileSelector& select, FileInfoVector* acc_results) {
+ ARROW_ASSIGN_OR_RAISE(auto base_location,
AzureLocation::FromString(select.base_dir));
+
+ auto directory_client =
adlfs_client.GetDirectoryClient(base_location.path);
+ bool found = false;
+ DataLake::ListPathsOptions options;
+ options.PageSizeHint = page_size_hint;
+
+ try {
+ auto list_response = directory_client.ListPaths(select.recursive,
options, context);
+ for (; list_response.HasPage(); list_response.MoveToNextPage(context)) {
+ if (list_response.Paths.empty()) {
+ continue;
+ }
+ found = true;
+ for (const auto& path : list_response.Paths) {
+ if (path.Name == base_location.path && !path.IsDirectory) {
+ return NotADir(base_location);
+ }
+ acc_results->push_back(FileInfoFromPath(base_location.container,
path));
+ }
+ }
+ } catch (const Storage::StorageException& exception) {
+ if (IsContainerNotFound(exception) || exception.ErrorCode ==
"PathNotFound") {
+ found = false;
+ } else {
+ return ExceptionToStatus(exception,
+ "Failed to list paths in a directory: ",
select.base_dir,
+ ": ", directory_client.GetUrl());
+ }
+ }
+
+ return found || select.allow_not_found
+ ? Status::OK()
+ : ::arrow::fs::internal::PathNotFound(select.base_dir);
+ }
+
/// \brief List the blobs at the root of a container or some dir in a
container.
///
/// \pre container_client is the client for the container named like the
first
@@ -1772,6 +1825,20 @@ class AzureFileSystem::Impl {
return VisitContainers(context, std::move(on_container));
}
+ auto adlfs_client = GetFileSystemClient(base_location.container);
+ ARROW_ASSIGN_OR_RAISE(auto hns_support,
HierarchicalNamespaceSupport(adlfs_client));
+ if (hns_support == HNSSupport::kContainerNotFound) {
+ if (select.allow_not_found) {
+ return Status::OK();
+ } else {
+ return ::arrow::fs::internal::PathNotFound(select.base_dir);
+ }
+ }
+ if (hns_support == HNSSupport::kEnabled) {
+ return GetFileInfoWithSelectorFromFileSystem(adlfs_client, context,
page_size_hint,
+ select, acc_results);
+ }
+ DCHECK_EQ(hns_support, HNSSupport::kDisabled);
auto container_client =
blob_service_client_->GetBlobContainerClient(base_location.container);
return GetFileInfoWithSelectorFromContainer(container_client, context,
page_size_hint,
@@ -2157,6 +2224,17 @@ class AzureFileSystem::Impl {
Azure::Nullable<std::string> lease_id = {}) {
DCHECK(!location.container.empty());
DCHECK(!location.path.empty());
+ ARROW_ASSIGN_OR_RAISE(auto file_info, GetFileInfo(adlfs_client, location,
lease_id));
+ if (file_info.type() == FileType::NotFound) {
+ if (require_dir_to_exist) {
+ return PathNotFound(location);
+ } else {
+ return Status::OK();
+ }
+ }
+ if (file_info.type() != FileType::Directory) {
+ return NotADir(location);
+ }
auto directory_client = adlfs_client.GetDirectoryClient(
std::string(internal::RemoveTrailingSlash(location.path)));
DataLake::DeleteDirectoryOptions options;
@@ -2200,6 +2278,9 @@ class AzureFileSystem::Impl {
kDelimiter, path.Name, ": ", sub_directory_client.GetUrl());
}
} else {
+ if (path.Name == location.path) {
+ return NotADir(location);
+ }
auto sub_file_client = adlfs_client.GetFileClient(path.Name);
try {
sub_file_client.Delete();
diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc
b/cpp/src/arrow/filesystem/azurefs_test.cc
index 24031e313f..ed09bfc2fa 100644
--- a/cpp/src/arrow/filesystem/azurefs_test.cc
+++ b/cpp/src/arrow/filesystem/azurefs_test.cc
@@ -392,7 +392,7 @@ class TestGeneric : public ::testing::Test, public
GenericFileSystemTest {
bool allow_move_dir() const override { return false; }
bool allow_move_file() const override { return true; }
bool allow_append_to_file() const override { return true; }
- bool have_directory_mtimes() const override { return false; }
+ bool have_directory_mtimes() const override { return true; }
bool have_flaky_directory_tree_deletion() const override { return false; }
bool have_file_metadata() const override { return true; }
// calloc() used in libxml2's xmlNewGlobalState() is detected as a
@@ -429,6 +429,8 @@ class TestAzuriteGeneric : public TestGeneric {
protected:
// Azurite doesn't support moving files over containers.
bool allow_move_file() const override { return false; }
+ // Azurite doesn't support directory mtime.
+ bool have_directory_mtimes() const override { return false; }
// DeleteDir() doesn't work with Azurite on macOS
bool have_flaky_directory_tree_deletion() const override {
return env_->HasSubmitBatchBug();
@@ -449,6 +451,8 @@ class TestAzureFlatNSGeneric : public TestGeneric {
protected:
// Flat namespace account doesn't support moving files over containers.
bool allow_move_file() const override { return false; }
+ // Flat namespace account doesn't support directory mtime.
+ bool have_directory_mtimes() const override { return false; }
};
class TestAzureHierarchicalNSGeneric : public TestGeneric {