kou commented on code in PR #39009:
URL: https://github.com/apache/arrow/pull/39009#discussion_r1418394235


##########
cpp/src/arrow/filesystem/azurefs_test.cc:
##########
@@ -518,6 +572,180 @@ TEST_F(AzureHierarchicalNamespaceFileSystemTest, 
GetFileInfoObject) {
   RunGetFileInfoObjectTest();
 }
 
+TEST_F(AzuriteFileSystemTest, GetFileInfoSelector) {
+  SetUpSmallFileSystemTree();
+
+  FileSelector select;
+  std::vector<FileInfo> infos;
+
+  // Root dir
+  select.base_dir = "";
+  ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+  ASSERT_EQ(infos.size(), 3);
+  ASSERT_EQ(infos, SortedInfos(infos));
+  AssertFileInfo(infos[0], "container", FileType::Directory);
+  AssertFileInfo(infos[1], "empty-container", FileType::Directory);
+  AssertFileInfo(infos[2], container_name_, FileType::Directory);
+
+  // Empty container
+  select.base_dir = "empty-container";
+  ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+  ASSERT_EQ(infos.size(), 0);
+  // Nonexistent container
+  select.base_dir = "nonexistent-container";
+  ASSERT_RAISES(IOError, fs_->GetFileInfo(select));
+  select.allow_not_found = true;
+  ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+  ASSERT_EQ(infos.size(), 0);
+  select.allow_not_found = false;
+  // Non-empty container
+  select.base_dir = "container";
+  ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+  ASSERT_EQ(infos, SortedInfos(infos));
+  ASSERT_EQ(infos.size(), 4);
+  AssertFileInfo(infos[0], "container/emptydir", FileType::Directory);
+  AssertFileInfo(infos[1], "container/otherdir", FileType::Directory);
+  AssertFileInfo(infos[2], "container/somedir", FileType::Directory);
+  AssertFileInfo(infos[3], "container/somefile", FileType::File, 9);
+
+  // Empty "directory"
+  select.base_dir = "container/emptydir";
+  ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+  ASSERT_EQ(infos.size(), 0);
+  // Non-empty "directories"
+  select.base_dir = "container/somedir";
+  ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+  ASSERT_EQ(infos.size(), 1);
+  AssertFileInfo(infos[0], "container/somedir/subdir", FileType::Directory);
+  select.base_dir = "container/somedir/subdir";
+  ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+  ASSERT_EQ(infos.size(), 1);
+  AssertFileInfo(infos[0], "container/somedir/subdir/subfile", FileType::File, 
8);
+  // Nonexistent
+  select.base_dir = "container/nonexistent";
+  ASSERT_RAISES(IOError, fs_->GetFileInfo(select));
+  select.allow_not_found = true;
+  ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+  ASSERT_EQ(infos.size(), 0);
+  select.allow_not_found = false;
+
+  // Trailing slashes
+  select.base_dir = "empty-container/";
+  ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+  ASSERT_EQ(infos.size(), 0);
+  select.base_dir = "nonexistent-container/";
+  ASSERT_RAISES(IOError, fs_->GetFileInfo(select));
+  select.base_dir = "container/";
+  ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+  ASSERT_EQ(infos, SortedInfos(infos));
+  ASSERT_EQ(infos.size(), 4);
+}
+
+TEST_F(AzuriteFileSystemTest, GetFileInfoSelectorRecursive) {
+  SetUpSmallFileSystemTree();
+
+  FileSelector select;
+  select.recursive = true;
+
+  std::vector<FileInfo> infos;
+  // Root dir
+  select.base_dir = "";
+  ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+  ASSERT_EQ(infos.size(), 14);
+  ASSERT_EQ(infos, SortedInfos(infos));
+  AssertInfoAllContainersRecursive(infos);
+
+  // Empty container
+  select.base_dir = "empty-container";
+  ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+  ASSERT_EQ(infos.size(), 0);
+
+  // Non-empty container
+  select.base_dir = "container";
+  ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+  ASSERT_EQ(infos, SortedInfos(infos));
+  ASSERT_EQ(infos.size(), 10);
+  AssertFileInfo(infos[0], "container/emptydir", FileType::Directory);
+  AssertFileInfo(infos[1], "container/otherdir", FileType::Directory);
+  AssertFileInfo(infos[2], "container/otherdir/1", FileType::Directory);
+  AssertFileInfo(infos[3], "container/otherdir/1/2", FileType::Directory);
+  AssertFileInfo(infos[4], "container/otherdir/1/2/3", FileType::Directory);
+  AssertFileInfo(infos[5], "container/otherdir/1/2/3/otherfile", 
FileType::File, 10);
+  AssertFileInfo(infos[6], "container/somedir", FileType::Directory);
+  AssertFileInfo(infos[7], "container/somedir/subdir", FileType::Directory);
+  AssertFileInfo(infos[8], "container/somedir/subdir/subfile", FileType::File, 
8);
+  AssertFileInfo(infos[9], "container/somefile", FileType::File, 9);
+
+  // Empty "directory"
+  select.base_dir = "container/emptydir";
+  ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+  ASSERT_EQ(infos.size(), 0);
+
+  // Non-empty "directories"
+  select.base_dir = "container/somedir";
+  ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+  ASSERT_EQ(infos, SortedInfos(infos));
+  ASSERT_EQ(infos.size(), 2);
+  AssertFileInfo(infos[0], "container/somedir/subdir", FileType::Directory);
+  AssertFileInfo(infos[1], "container/somedir/subdir/subfile", FileType::File, 
8);
+
+  select.base_dir = "container/otherdir";
+  ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+  ASSERT_EQ(infos, SortedInfos(infos));
+  ASSERT_EQ(infos.size(), 4);
+  AssertFileInfo(infos[0], "container/otherdir/1", FileType::Directory);
+  AssertFileInfo(infos[1], "container/otherdir/1/2", FileType::Directory);
+  AssertFileInfo(infos[2], "container/otherdir/1/2/3", FileType::Directory);
+  AssertFileInfo(infos[3], "container/otherdir/1/2/3/otherfile", 
FileType::File, 10);
+}
+
+TEST_F(AzuriteFileSystemTest, TestExplicitImplicitDirDeduplication) {

Review Comment:
   Could you use a `GetFileInfoSelectorXXX`-style test name, like the other added tests?



##########
cpp/src/arrow/filesystem/azurefs.cc:
##########
@@ -815,6 +815,233 @@ class AzureFileSystem::Impl {
     }
   }
 
+ private:
+  template <typename OnContainer>
+  Status ListContainers(const Azure::Core::Context& context,
+                        OnContainer&& on_container) const {
+    Azure::Storage::Blobs::ListBlobContainersOptions options;
+    // Deleted containers are not returned.
+    options.Include = 
Azure::Storage::Blobs::Models::ListBlobContainersIncludeFlags::None;
+    try {
+      auto container_list_response =
+          blob_service_client_->ListBlobContainers(options, context);
+      for (; container_list_response.HasPage();
+           container_list_response.MoveToNextPage(context)) {
+        for (const auto& container : container_list_response.BlobContainers) {
+          RETURN_NOT_OK(on_container(container));
+        }
+      }
+    } catch (const Azure::Storage::StorageException& exception) {
+      return internal::ExceptionToStatus("Failed to list account containers.", 
exception);
+    }
+    return Status::OK();
+  }
+
+  static FileInfo FileInfoFromBlob(const std::string& container,
+                                   const 
Azure::Storage::Blobs::Models::BlobItem& blob) {
+    if (blob.Name.back() == internal::kSep) {
+      return DirectoryFileInfoFromPath(container + internal::kSep + blob.Name);
+    }
+    std::string path;
+    path.reserve(container.size() + 1 + blob.Name.size());
+    path += container;
+    path += internal::kSep;
+    path += blob.Name;
+    FileInfo info{std::move(path), FileType::File};
+    info.set_size(blob.BlobSize);
+    
info.set_mtime(std::chrono::system_clock::time_point{blob.Details.LastModified});
+    return info;
+  }
+
+  static FileInfo DirectoryFileInfoFromPath(const std::string& path) {
+    return FileInfo{std::string{internal::RemoveTrailingSlash(path)},
+                    FileType::Directory};
+  }
+
+  static std::string_view BasenameView(std::string_view s) {
+    auto offset = s.find_last_of(internal::kSep);
+    auto tail = (offset == std::string_view::npos) ? s : s.substr(offset);
+    return internal::RemoveTrailingSlash(tail, /*preserve_root=*/false);
+  }
+
+  /// \brief List the blobs at the root of a container or some dir in a 
container.
+  ///
+  /// \pre container_client is the client for the container named like the 
first
+  /// segment of select.base_dir.
+  Status GetFileInfoWithSelectorFromContainer(
+      const Azure::Storage::Blobs::BlobContainerClient& container_client,
+      const Azure::Core::Context& context, Azure::Nullable<int32_t> 
page_size_hint,
+      const FileSelector& select, FileInfoVector* acc_results) {
+    ARROW_ASSIGN_OR_RAISE(auto base_location, 
AzureLocation::FromString(select.base_dir));
+
+    bool found = false;
+    Azure::Storage::Blobs::ListBlobsOptions options;
+    if (internal::GetAbstractPathDepth(base_location.path) == 0) {
+      // If the base_dir is the root of the container, then we want to list 
all blobs in
+      // the container and the Prefix should be empty and not even include the 
trailing
+      // slash because the container itself represents the `<container>/` 
directory.
+      options.Prefix = {};
+      found = true;  // Unless the container itself is not found later!
+    } else {
+      options.Prefix = internal::EnsureTrailingSlash(base_location.path);
+    }
+    options.PageSizeHint = page_size_hint;
+    options.Include = 
Azure::Storage::Blobs::Models::ListBlobsIncludeFlags::Metadata;
+
+    // When Prefix.Value() contains a trailing slash and we find a blob that
+    // matches it completely, it is an empty directory marker blob for the
+    // directory we're listing from, and we should skip it.
+    auto is_empty_dir_marker =
+        [&options](const Azure::Storage::Blobs::Models::BlobItem& blob) 
noexcept -> bool {
+      return options.Prefix.HasValue() && blob.Name == options.Prefix.Value();

Review Comment:
   Why do we need to differentiate them?
   It seems that `HasTrailingSlash(blob.Name)` is true only when
`options.Prefix.Value()` doesn't have any files. I confirmed this with the added
tests. If that assumption is wrong, we may need to add one more test for that case.
   
   How about simplifying this?
   
   ```diff
   diff --git a/cpp/src/arrow/filesystem/azurefs.cc 
b/cpp/src/arrow/filesystem/azurefs.cc
   index 7355f5440..8a3f4ef35 100644
   --- a/cpp/src/arrow/filesystem/azurefs.cc
   +++ b/cpp/src/arrow/filesystem/azurefs.cc
   @@ -884,14 +884,6 @@ class AzureFileSystem::Impl {
        options.PageSizeHint = page_size_hint;
        options.Include = 
Azure::Storage::Blobs::Models::ListBlobsIncludeFlags::Metadata;
    
   -    // When Prefix contains a value and we find a blob that matches it 
completely, it is
   -    // an empty directory marker blob for the directory we're listing from, 
and we should
   -    // skip it.
   -    auto is_the_root_empty_dir_marker =
   -        [&options](const Azure::Storage::Blobs::Models::BlobItem& blob) 
noexcept -> bool {
   -      return options.Prefix.HasValue() && blob.Name == 
options.Prefix.Value();
   -    };
   -
        auto recurse = [&](const std::string& blob_prefix) noexcept -> Status {
          if (select.recursive && select.max_recursion > 0) {
            FileSelector sub_select;
   @@ -909,7 +901,10 @@ class AzureFileSystem::Impl {
    
        auto process_blob =
            [&](const Azure::Storage::Blobs::Models::BlobItem& blob) noexcept {
   -          if (!is_the_root_empty_dir_marker(blob)) {
   +          // blob.Name has trailing slash only when Prefix is an empty
   +          // directory marker blob for the directory we're listing
   +          // from, and we should skip it.
   +          if (!internal::HasTrailingSlash(blob.Name)) {
                
acc_results->push_back(FileInfoFromBlob(base_location.container, blob));
              }
            };
   ```



##########
cpp/src/arrow/filesystem/azurefs.cc:
##########
@@ -815,6 +815,233 @@ class AzureFileSystem::Impl {
     }
   }
 
+ private:
+  template <typename OnContainer>
+  Status ListContainers(const Azure::Core::Context& context,
+                        OnContainer&& on_container) const {
+    Azure::Storage::Blobs::ListBlobContainersOptions options;
+    // Deleted containers are not returned.
+    options.Include = 
Azure::Storage::Blobs::Models::ListBlobContainersIncludeFlags::None;
+    try {
+      auto container_list_response =
+          blob_service_client_->ListBlobContainers(options, context);
+      for (; container_list_response.HasPage();
+           container_list_response.MoveToNextPage(context)) {
+        for (const auto& container : container_list_response.BlobContainers) {
+          RETURN_NOT_OK(on_container(container));
+        }
+      }
+    } catch (const Azure::Storage::StorageException& exception) {
+      return internal::ExceptionToStatus("Failed to list account containers.", 
exception);
+    }
+    return Status::OK();
+  }
+
+  static FileInfo FileInfoFromBlob(const std::string& container,
+                                   const 
Azure::Storage::Blobs::Models::BlobItem& blob) {
+    if (blob.Name.back() == internal::kSep) {
+      return DirectoryFileInfoFromPath(container + internal::kSep + blob.Name);
+    }
+    std::string path;
+    path.reserve(container.size() + 1 + blob.Name.size());
+    path += container;
+    path += internal::kSep;
+    path += blob.Name;
+    FileInfo info{std::move(path), FileType::File};
+    info.set_size(blob.BlobSize);
+    
info.set_mtime(std::chrono::system_clock::time_point{blob.Details.LastModified});
+    return info;
+  }
+
+  static FileInfo DirectoryFileInfoFromPath(const std::string& path) {
+    return FileInfo{std::string{internal::RemoveTrailingSlash(path)},
+                    FileType::Directory};
+  }
+
+  static std::string_view BasenameView(std::string_view s) {
+    auto offset = s.find_last_of(internal::kSep);
+    auto tail = (offset == std::string_view::npos) ? s : s.substr(offset);
+    return internal::RemoveTrailingSlash(tail, /*preserve_root=*/false);
+  }
+
+  /// \brief List the blobs at the root of a container or some dir in a 
container.
+  ///
+  /// \pre container_client is the client for the container named like the 
first
+  /// segment of select.base_dir.
+  Status GetFileInfoWithSelectorFromContainer(
+      const Azure::Storage::Blobs::BlobContainerClient& container_client,
+      const Azure::Core::Context& context, Azure::Nullable<int32_t> 
page_size_hint,
+      const FileSelector& select, FileInfoVector* acc_results) {
+    ARROW_ASSIGN_OR_RAISE(auto base_location, 
AzureLocation::FromString(select.base_dir));
+
+    bool found = false;
+    Azure::Storage::Blobs::ListBlobsOptions options;
+    if (internal::GetAbstractPathDepth(base_location.path) == 0) {
+      // If the base_dir is the root of the container, then we want to list 
all blobs in
+      // the container and the Prefix should be empty and not even include the 
trailing
+      // slash because the container itself represents the `<container>/` 
directory.
+      options.Prefix = {};
+      found = true;  // Unless the container itself is not found later!
+    } else {
+      options.Prefix = internal::EnsureTrailingSlash(base_location.path);
+    }
+    options.PageSizeHint = page_size_hint;
+    options.Include = 
Azure::Storage::Blobs::Models::ListBlobsIncludeFlags::Metadata;
+
+    // When Prefix.Value() contains a trailing slash and we find a blob that
+    // matches it completely, it is an empty directory marker blob for the
+    // directory we're listing from, and we should skip it.
+    auto is_empty_dir_marker =
+        [&options](const Azure::Storage::Blobs::Models::BlobItem& blob) 
noexcept -> bool {
+      return options.Prefix.HasValue() && blob.Name == options.Prefix.Value();
+    };
+
+    auto recurse = [&](const std::string& blob_prefix) noexcept -> Status {
+      if (select.recursive && select.max_recursion > 0) {
+        FileSelector sub_select;
+        sub_select.base_dir = base_location.container;
+        sub_select.base_dir += internal::kSep;
+        sub_select.base_dir += internal::RemoveTrailingSlash(blob_prefix);

Review Comment:
   @felipecrv Have you tried `ConcatAbstractPath`?



##########
cpp/src/arrow/filesystem/azurefs.cc:
##########
@@ -815,6 +815,215 @@ class AzureFileSystem::Impl {
     }
   }
 
+ private:
+  template <typename OnContainer>
+  Status VisitContainers(const Azure::Core::Context& context,
+                         OnContainer&& on_container) const {
+    Azure::Storage::Blobs::ListBlobContainersOptions options;
+    try {
+      auto container_list_response =
+          blob_service_client_->ListBlobContainers(options, context);
+      for (; container_list_response.HasPage();
+           container_list_response.MoveToNextPage(context)) {
+        for (const auto& container : container_list_response.BlobContainers) {
+          RETURN_NOT_OK(on_container(container));
+        }
+      }
+    } catch (const Azure::Storage::StorageException& exception) {
+      return internal::ExceptionToStatus("Failed to list account containers.", 
exception);
+    }
+    return Status::OK();
+  }
+
+  static FileInfo FileInfoFromBlob(const std::string& container,
+                                   const 
Azure::Storage::Blobs::Models::BlobItem& blob) {
+    auto path = internal::ConcatAbstractPath(container, blob.Name);
+    if (internal::HasTrailingSlash(blob.Name)) {
+      return DirectoryFileInfoFromPath(path);
+    }
+    FileInfo info{std::move(path), FileType::File};
+    info.set_size(blob.BlobSize);
+    
info.set_mtime(std::chrono::system_clock::time_point{blob.Details.LastModified});
+    return info;
+  }
+
+  static FileInfo DirectoryFileInfoFromPath(const std::string& path) {
+    return FileInfo{std::string{internal::RemoveTrailingSlash(path)},
+                    FileType::Directory};
+  }
+
+  static std::string_view BasenameView(std::string_view s) {
+    DCHECK(!internal::HasTrailingSlash(s));
+    auto offset = s.find_last_of(internal::kSep);
+    auto result = (offset == std::string_view::npos) ? s : s.substr(offset);
+    DCHECK(!result.empty() && result.back() != internal::kSep);
+    return result;
+  }
+
+  /// \brief List the blobs at the root of a container or some dir in a 
container.
+  ///
+  /// \pre container_client is the client for the container named like the 
first
+  /// segment of select.base_dir.
+  Status GetFileInfoWithSelectorFromContainer(
+      const Azure::Storage::Blobs::BlobContainerClient& container_client,
+      const Azure::Core::Context& context, Azure::Nullable<int32_t> 
page_size_hint,
+      const FileSelector& select, FileInfoVector* acc_results) {
+    ARROW_ASSIGN_OR_RAISE(auto base_location, 
AzureLocation::FromString(select.base_dir));
+
+    bool found = false;
+    Azure::Storage::Blobs::ListBlobsOptions options;
+    if (internal::GetAbstractPathDepth(base_location.path) == 0) {

Review Comment:
   We want to check `base_location.path.empty() || base_location.path == "/"` 
here, right?
   How about defining `internal::IsAbstractRootPath()` and using it here?
   It seems that counting all `kSep`s in `internal::GetAbstractPathDepth()` is
a bit inefficient here for a long path.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to