kou commented on code in PR #39009:
URL: https://github.com/apache/arrow/pull/39009#discussion_r1412585789
##########
cpp/src/arrow/filesystem/azurefs.cc:
##########
@@ -815,6 +815,233 @@ class AzureFileSystem::Impl {
}
}
+ private:
+ template <typename OnContainer>
+ Status ListContainers(const Azure::Core::Context& context,
+ OnContainer&& on_container) const {
+ Azure::Storage::Blobs::ListBlobContainersOptions options;
+ // Deleted containers are not returned.
+ options.Include =
Azure::Storage::Blobs::Models::ListBlobContainersIncludeFlags::None;
+ try {
+ auto container_list_response =
+ blob_service_client_->ListBlobContainers(options, context);
+ for (; container_list_response.HasPage();
+ container_list_response.MoveToNextPage(context)) {
+ for (const auto& container : container_list_response.BlobContainers) {
+ RETURN_NOT_OK(on_container(container));
+ }
+ }
+ } catch (const Azure::Storage::StorageException& exception) {
+ return internal::ExceptionToStatus("Failed to list account containers.",
exception);
+ }
+ return Status::OK();
+ }
+
+ static FileInfo FileInfoFromBlob(const std::string& container,
+ const
Azure::Storage::Blobs::Models::BlobItem& blob) {
+ if (blob.Name.back() == internal::kSep) {
+ return DirectoryFileInfoFromPath(container + internal::kSep + blob.Name);
+ }
+ std::string path;
+ path.reserve(container.size() + 1 + blob.Name.size());
+ path += container;
+ path += internal::kSep;
+ path += blob.Name;
+ FileInfo info{std::move(path), FileType::File};
+ info.set_size(blob.BlobSize);
+
info.set_mtime(std::chrono::system_clock::time_point{blob.Details.LastModified});
+ return info;
+ }
+
+ static FileInfo DirectoryFileInfoFromPath(const std::string& path) {
+ return FileInfo{std::string{internal::RemoveTrailingSlash(path)},
+ FileType::Directory};
+ }
+
+ static std::string_view BasenameView(std::string_view s) {
+ auto offset = s.find_last_of(internal::kSep);
+ auto tail = (offset == std::string_view::npos) ? s : s.substr(offset);
+ return internal::RemoveTrailingSlash(tail, /*preserve_root=*/false);
+ }
+
+ /// \brief List the blobs at the root of a container or some dir in a
container.
+ ///
+ /// \pre container_client is the client for the container named like the
first
+ /// segment of select.base_dir.
+ Status GetFileInfoWithSelectorFromContainer(
+ const Azure::Storage::Blobs::BlobContainerClient& container_client,
+ const Azure::Core::Context& context, Azure::Nullable<int32_t>
page_size_hint,
+ const FileSelector& select, FileInfoVector* acc_results) {
+ ARROW_ASSIGN_OR_RAISE(auto base_location,
AzureLocation::FromString(select.base_dir));
+
+ bool found = false;
Review Comment:
```suggestion
auto found = false;
```
##########
cpp/src/arrow/filesystem/azurefs.cc:
##########
@@ -815,6 +815,233 @@ class AzureFileSystem::Impl {
}
}
+ private:
+ template <typename OnContainer>
+ Status ListContainers(const Azure::Core::Context& context,
+ OnContainer&& on_container) const {
+ Azure::Storage::Blobs::ListBlobContainersOptions options;
+ // Deleted containers are not returned.
+ options.Include =
Azure::Storage::Blobs::Models::ListBlobContainersIncludeFlags::None;
+ try {
+ auto container_list_response =
+ blob_service_client_->ListBlobContainers(options, context);
+ for (; container_list_response.HasPage();
+ container_list_response.MoveToNextPage(context)) {
+ for (const auto& container : container_list_response.BlobContainers) {
+ RETURN_NOT_OK(on_container(container));
+ }
+ }
+ } catch (const Azure::Storage::StorageException& exception) {
+ return internal::ExceptionToStatus("Failed to list account containers.",
exception);
+ }
+ return Status::OK();
+ }
+
+ static FileInfo FileInfoFromBlob(const std::string& container,
+ const
Azure::Storage::Blobs::Models::BlobItem& blob) {
+ if (blob.Name.back() == internal::kSep) {
+ return DirectoryFileInfoFromPath(container + internal::kSep + blob.Name);
+ }
+ std::string path;
+ path.reserve(container.size() + 1 + blob.Name.size());
+ path += container;
+ path += internal::kSep;
+ path += blob.Name;
+ FileInfo info{std::move(path), FileType::File};
+ info.set_size(blob.BlobSize);
+
info.set_mtime(std::chrono::system_clock::time_point{blob.Details.LastModified});
+ return info;
+ }
+
+ static FileInfo DirectoryFileInfoFromPath(const std::string& path) {
+ return FileInfo{std::string{internal::RemoveTrailingSlash(path)},
+ FileType::Directory};
+ }
+
+ static std::string_view BasenameView(std::string_view s) {
+ auto offset = s.find_last_of(internal::kSep);
+ auto tail = (offset == std::string_view::npos) ? s : s.substr(offset);
+ return internal::RemoveTrailingSlash(tail, /*preserve_root=*/false);
+ }
Review Comment:
How about moving this to `path_util.{cc,h}`?
Extracting the basename from a path is a generic path operation.
We would then be able to implement it with `GetAbstractPathParent()`, like
`return GetAbstractPathParent(s).second;`.
##########
cpp/src/arrow/filesystem/azurefs.cc:
##########
@@ -815,6 +815,233 @@ class AzureFileSystem::Impl {
}
}
+ private:
+ template <typename OnContainer>
+ Status ListContainers(const Azure::Core::Context& context,
+ OnContainer&& on_container) const {
+ Azure::Storage::Blobs::ListBlobContainersOptions options;
+ // Deleted containers are not returned.
+ options.Include =
Azure::Storage::Blobs::Models::ListBlobContainersIncludeFlags::None;
Review Comment:
I think that `None` is the default value.
Should we set it explicitly?
##########
cpp/src/arrow/filesystem/azurefs.cc:
##########
@@ -815,6 +815,233 @@ class AzureFileSystem::Impl {
}
}
+ private:
+ template <typename OnContainer>
+ Status ListContainers(const Azure::Core::Context& context,
+ OnContainer&& on_container) const {
+ Azure::Storage::Blobs::ListBlobContainersOptions options;
+ // Deleted containers are not returned.
+ options.Include =
Azure::Storage::Blobs::Models::ListBlobContainersIncludeFlags::None;
+ try {
+ auto container_list_response =
+ blob_service_client_->ListBlobContainers(options, context);
+ for (; container_list_response.HasPage();
+ container_list_response.MoveToNextPage(context)) {
+ for (const auto& container : container_list_response.BlobContainers) {
+ RETURN_NOT_OK(on_container(container));
+ }
+ }
+ } catch (const Azure::Storage::StorageException& exception) {
+ return internal::ExceptionToStatus("Failed to list account containers.",
exception);
+ }
+ return Status::OK();
+ }
+
+ static FileInfo FileInfoFromBlob(const std::string& container,
+ const
Azure::Storage::Blobs::Models::BlobItem& blob) {
+ if (blob.Name.back() == internal::kSep) {
+ return DirectoryFileInfoFromPath(container + internal::kSep + blob.Name);
+ }
+ std::string path;
+ path.reserve(container.size() + 1 + blob.Name.size());
Review Comment:
I want to avoid the `1` magic number, but the following isn't readable:
```suggestion
path.reserve(container.size() + sizeof(internal::kSep) +
blob.Name.size());
```
Should we define `constexpr size_t kSepSize = sizeof(kSep);` in
`path_util.h` and use it here?
##########
cpp/src/arrow/filesystem/azurefs.cc:
##########
@@ -815,6 +815,233 @@ class AzureFileSystem::Impl {
}
}
+ private:
+ template <typename OnContainer>
+ Status ListContainers(const Azure::Core::Context& context,
+ OnContainer&& on_container) const {
+ Azure::Storage::Blobs::ListBlobContainersOptions options;
+ // Deleted containers are not returned.
+ options.Include =
Azure::Storage::Blobs::Models::ListBlobContainersIncludeFlags::None;
+ try {
+ auto container_list_response =
+ blob_service_client_->ListBlobContainers(options, context);
+ for (; container_list_response.HasPage();
+ container_list_response.MoveToNextPage(context)) {
+ for (const auto& container : container_list_response.BlobContainers) {
+ RETURN_NOT_OK(on_container(container));
+ }
+ }
+ } catch (const Azure::Storage::StorageException& exception) {
+ return internal::ExceptionToStatus("Failed to list account containers.",
exception);
+ }
+ return Status::OK();
+ }
+
+ static FileInfo FileInfoFromBlob(const std::string& container,
+ const
Azure::Storage::Blobs::Models::BlobItem& blob) {
+ if (blob.Name.back() == internal::kSep) {
+ return DirectoryFileInfoFromPath(container + internal::kSep + blob.Name);
+ }
+ std::string path;
+ path.reserve(container.size() + 1 + blob.Name.size());
+ path += container;
+ path += internal::kSep;
+ path += blob.Name;
+ FileInfo info{std::move(path), FileType::File};
+ info.set_size(blob.BlobSize);
+
info.set_mtime(std::chrono::system_clock::time_point{blob.Details.LastModified});
+ return info;
+ }
+
+ static FileInfo DirectoryFileInfoFromPath(const std::string& path) {
+ return FileInfo{std::string{internal::RemoveTrailingSlash(path)},
+ FileType::Directory};
+ }
+
+ static std::string_view BasenameView(std::string_view s) {
+ auto offset = s.find_last_of(internal::kSep);
+ auto tail = (offset == std::string_view::npos) ? s : s.substr(offset);
+ return internal::RemoveTrailingSlash(tail, /*preserve_root=*/false);
+ }
+
+ /// \brief List the blobs at the root of a container or some dir in a
container.
+ ///
+ /// \pre container_client is the client for the container named like the
first
+ /// segment of select.base_dir.
+ Status GetFileInfoWithSelectorFromContainer(
+ const Azure::Storage::Blobs::BlobContainerClient& container_client,
+ const Azure::Core::Context& context, Azure::Nullable<int32_t>
page_size_hint,
+ const FileSelector& select, FileInfoVector* acc_results) {
+ ARROW_ASSIGN_OR_RAISE(auto base_location,
AzureLocation::FromString(select.base_dir));
+
+ bool found = false;
+ Azure::Storage::Blobs::ListBlobsOptions options;
+ if (internal::GetAbstractPathDepth(base_location.path) == 0) {
+ // If the base_dir is the root of the container, then we want to list
all blobs in
+ // the container and the Prefix should be empty and not even include the
trailing
+ // slash because the container itself represents the `<container>/`
directory.
+ options.Prefix = {};
+ found = true; // Unless the container itself is not found later!
+ } else {
+ options.Prefix = internal::EnsureTrailingSlash(base_location.path);
+ }
+ options.PageSizeHint = page_size_hint;
+ options.Include =
Azure::Storage::Blobs::Models::ListBlobsIncludeFlags::Metadata;
+
+ // When Prefix.Value() contains a trailing slash and we find a blob that
+ // matches it completely, it is an empty directory marker blob for the
+ // directory we're listing from, and we should skip it.
+ auto is_empty_dir_marker =
+ [&options](const Azure::Storage::Blobs::Models::BlobItem& blob)
noexcept -> bool {
+ return options.Prefix.HasValue() && blob.Name == options.Prefix.Value();
+ };
+
+ auto recurse = [&](const std::string& blob_prefix) noexcept -> Status {
+ if (select.recursive && select.max_recursion > 0) {
+ FileSelector sub_select;
+ sub_select.base_dir = base_location.container;
+ sub_select.base_dir += internal::kSep;
+ sub_select.base_dir += internal::RemoveTrailingSlash(blob_prefix);
+ sub_select.allow_not_found = true;
+ sub_select.recursive = true;
+ sub_select.max_recursion = select.max_recursion - 1;
+ return GetFileInfoWithSelectorFromContainer(
+ container_client, context, page_size_hint, sub_select,
acc_results);
+ }
+ return Status::OK();
+ };
+
+ // (*acc_results)[*last_dir_reported] is the last FileType::Directory in
the results
+ // produced through this loop over the response pages.
+ std::optional<size_t> last_dir_reported{};
+ auto matches_last_dir_reported = [&last_dir_reported,
+ acc_results](const FileInfo& info)
noexcept {
+ if (!last_dir_reported.has_value() || info.type() !=
FileType::Directory) {
+ return false;
+ }
+ const auto& last_dir = (*acc_results)[*last_dir_reported];
+ return BasenameView(info.path()) == BasenameView(last_dir.path());
+ };
+
+ auto process_blob =
+ [&](const Azure::Storage::Blobs::Models::BlobItem& blob) noexcept {
+ if (!is_empty_dir_marker(blob)) {
+ const auto& info = acc_results->emplace_back(
+ FileInfoFromBlob(base_location.container, blob));
+ if (info.type() == FileType::Directory) {
+ last_dir_reported = acc_results->size() - 1;
+ }
+ }
+ };
+ auto process_prefix = [&](const std::string& prefix) noexcept -> Status {
+ const std::string path = base_location.container + internal::kSep +
prefix;
Review Comment:
```suggestion
const auto path = base_location.container + internal::kSep + prefix;
```
BTW, we may want to use `ConcatAbstractPath()` instead of joining with
`internal::kSep` manually.
##########
cpp/src/arrow/filesystem/azurefs.cc:
##########
@@ -815,6 +815,233 @@ class AzureFileSystem::Impl {
}
}
+ private:
+ template <typename OnContainer>
+ Status ListContainers(const Azure::Core::Context& context,
+ OnContainer&& on_container) const {
+ Azure::Storage::Blobs::ListBlobContainersOptions options;
+ // Deleted containers are not returned.
+ options.Include =
Azure::Storage::Blobs::Models::ListBlobContainersIncludeFlags::None;
+ try {
+ auto container_list_response =
+ blob_service_client_->ListBlobContainers(options, context);
+ for (; container_list_response.HasPage();
+ container_list_response.MoveToNextPage(context)) {
+ for (const auto& container : container_list_response.BlobContainers) {
+ RETURN_NOT_OK(on_container(container));
+ }
+ }
+ } catch (const Azure::Storage::StorageException& exception) {
+ return internal::ExceptionToStatus("Failed to list account containers.",
exception);
+ }
+ return Status::OK();
+ }
+
+ static FileInfo FileInfoFromBlob(const std::string& container,
+ const
Azure::Storage::Blobs::Models::BlobItem& blob) {
+ if (blob.Name.back() == internal::kSep) {
+ return DirectoryFileInfoFromPath(container + internal::kSep + blob.Name);
+ }
+ std::string path;
+ path.reserve(container.size() + 1 + blob.Name.size());
+ path += container;
+ path += internal::kSep;
+ path += blob.Name;
Review Comment:
How about reusing `path`?
```suggestion
std::string path;
path.reserve(container.size() + 1 + blob.Name.size());
path += container;
path += internal::kSep;
path += blob.Name;
if (blob.Name.back() == internal::kSep) {
return DirectoryFileInfoFromPath(path);
}
```
##########
cpp/src/arrow/filesystem/azurefs.cc:
##########
@@ -815,6 +815,233 @@ class AzureFileSystem::Impl {
}
}
+ private:
+ template <typename OnContainer>
+ Status ListContainers(const Azure::Core::Context& context,
+ OnContainer&& on_container) const {
+ Azure::Storage::Blobs::ListBlobContainersOptions options;
+ // Deleted containers are not returned.
+ options.Include =
Azure::Storage::Blobs::Models::ListBlobContainersIncludeFlags::None;
+ try {
+ auto container_list_response =
+ blob_service_client_->ListBlobContainers(options, context);
+ for (; container_list_response.HasPage();
+ container_list_response.MoveToNextPage(context)) {
+ for (const auto& container : container_list_response.BlobContainers) {
+ RETURN_NOT_OK(on_container(container));
+ }
+ }
+ } catch (const Azure::Storage::StorageException& exception) {
+ return internal::ExceptionToStatus("Failed to list account containers.",
exception);
+ }
+ return Status::OK();
+ }
+
+ static FileInfo FileInfoFromBlob(const std::string& container,
+ const
Azure::Storage::Blobs::Models::BlobItem& blob) {
+ if (blob.Name.back() == internal::kSep) {
+ return DirectoryFileInfoFromPath(container + internal::kSep + blob.Name);
+ }
+ std::string path;
+ path.reserve(container.size() + 1 + blob.Name.size());
+ path += container;
+ path += internal::kSep;
+ path += blob.Name;
+ FileInfo info{std::move(path), FileType::File};
+ info.set_size(blob.BlobSize);
+
info.set_mtime(std::chrono::system_clock::time_point{blob.Details.LastModified});
+ return info;
+ }
+
+ static FileInfo DirectoryFileInfoFromPath(const std::string& path) {
+ return FileInfo{std::string{internal::RemoveTrailingSlash(path)},
+ FileType::Directory};
+ }
+
+ static std::string_view BasenameView(std::string_view s) {
+ auto offset = s.find_last_of(internal::kSep);
+ auto tail = (offset == std::string_view::npos) ? s : s.substr(offset);
+ return internal::RemoveTrailingSlash(tail, /*preserve_root=*/false);
+ }
+
+ /// \brief List the blobs at the root of a container or some dir in a
container.
+ ///
+ /// \pre container_client is the client for the container named like the
first
+ /// segment of select.base_dir.
+ Status GetFileInfoWithSelectorFromContainer(
+ const Azure::Storage::Blobs::BlobContainerClient& container_client,
+ const Azure::Core::Context& context, Azure::Nullable<int32_t>
page_size_hint,
+ const FileSelector& select, FileInfoVector* acc_results) {
+ ARROW_ASSIGN_OR_RAISE(auto base_location,
AzureLocation::FromString(select.base_dir));
+
+ bool found = false;
+ Azure::Storage::Blobs::ListBlobsOptions options;
+ if (internal::GetAbstractPathDepth(base_location.path) == 0) {
+ // If the base_dir is the root of the container, then we want to list
all blobs in
+ // the container and the Prefix should be empty and not even include the
trailing
+ // slash because the container itself represents the `<container>/`
directory.
+ options.Prefix = {};
Review Comment:
I think that we should not set `options.Prefix` explicitly.
(I don't object to this if it is more readable.)
##########
cpp/src/arrow/filesystem/azurefs.cc:
##########
@@ -815,6 +815,233 @@ class AzureFileSystem::Impl {
}
}
+ private:
+ template <typename OnContainer>
+ Status ListContainers(const Azure::Core::Context& context,
+ OnContainer&& on_container) const {
+ Azure::Storage::Blobs::ListBlobContainersOptions options;
+ // Deleted containers are not returned.
+ options.Include =
Azure::Storage::Blobs::Models::ListBlobContainersIncludeFlags::None;
+ try {
+ auto container_list_response =
+ blob_service_client_->ListBlobContainers(options, context);
+ for (; container_list_response.HasPage();
+ container_list_response.MoveToNextPage(context)) {
+ for (const auto& container : container_list_response.BlobContainers) {
+ RETURN_NOT_OK(on_container(container));
+ }
+ }
+ } catch (const Azure::Storage::StorageException& exception) {
+ return internal::ExceptionToStatus("Failed to list account containers.",
exception);
+ }
+ return Status::OK();
+ }
+
+ static FileInfo FileInfoFromBlob(const std::string& container,
+ const
Azure::Storage::Blobs::Models::BlobItem& blob) {
+ if (blob.Name.back() == internal::kSep) {
+ return DirectoryFileInfoFromPath(container + internal::kSep + blob.Name);
+ }
+ std::string path;
+ path.reserve(container.size() + 1 + blob.Name.size());
+ path += container;
+ path += internal::kSep;
+ path += blob.Name;
+ FileInfo info{std::move(path), FileType::File};
+ info.set_size(blob.BlobSize);
+
info.set_mtime(std::chrono::system_clock::time_point{blob.Details.LastModified});
+ return info;
+ }
+
+ static FileInfo DirectoryFileInfoFromPath(const std::string& path) {
+ return FileInfo{std::string{internal::RemoveTrailingSlash(path)},
+ FileType::Directory};
+ }
+
+ static std::string_view BasenameView(std::string_view s) {
+ auto offset = s.find_last_of(internal::kSep);
+ auto tail = (offset == std::string_view::npos) ? s : s.substr(offset);
+ return internal::RemoveTrailingSlash(tail, /*preserve_root=*/false);
+ }
+
+ /// \brief List the blobs at the root of a container or some dir in a
container.
+ ///
+ /// \pre container_client is the client for the container named like the
first
+ /// segment of select.base_dir.
+ Status GetFileInfoWithSelectorFromContainer(
+ const Azure::Storage::Blobs::BlobContainerClient& container_client,
+ const Azure::Core::Context& context, Azure::Nullable<int32_t>
page_size_hint,
+ const FileSelector& select, FileInfoVector* acc_results) {
+ ARROW_ASSIGN_OR_RAISE(auto base_location,
AzureLocation::FromString(select.base_dir));
+
+ bool found = false;
+ Azure::Storage::Blobs::ListBlobsOptions options;
+ if (internal::GetAbstractPathDepth(base_location.path) == 0) {
+ // If the base_dir is the root of the container, then we want to list
all blobs in
+ // the container and the Prefix should be empty and not even include the
trailing
+ // slash because the container itself represents the `<container>/`
directory.
+ options.Prefix = {};
+ found = true; // Unless the container itself is not found later!
+ } else {
+ options.Prefix = internal::EnsureTrailingSlash(base_location.path);
+ }
+ options.PageSizeHint = page_size_hint;
+ options.Include =
Azure::Storage::Blobs::Models::ListBlobsIncludeFlags::Metadata;
+
+ // When Prefix.Value() contains a trailing slash and we find a blob that
+ // matches it completely, it is an empty directory marker blob for the
+ // directory we're listing from, and we should skip it.
+ auto is_empty_dir_marker =
+ [&options](const Azure::Storage::Blobs::Models::BlobItem& blob)
noexcept -> bool {
+ return options.Prefix.HasValue() && blob.Name == options.Prefix.Value();
+ };
+
+ auto recurse = [&](const std::string& blob_prefix) noexcept -> Status {
+ if (select.recursive && select.max_recursion > 0) {
+ FileSelector sub_select;
+ sub_select.base_dir = base_location.container;
+ sub_select.base_dir += internal::kSep;
+ sub_select.base_dir += internal::RemoveTrailingSlash(blob_prefix);
+ sub_select.allow_not_found = true;
+ sub_select.recursive = true;
+ sub_select.max_recursion = select.max_recursion - 1;
+ return GetFileInfoWithSelectorFromContainer(
+ container_client, context, page_size_hint, sub_select,
acc_results);
+ }
+ return Status::OK();
+ };
+
+ // (*acc_results)[*last_dir_reported] is the last FileType::Directory in
the results
+ // produced through this loop over the response pages.
+ std::optional<size_t> last_dir_reported{};
+ auto matches_last_dir_reported = [&last_dir_reported,
+ acc_results](const FileInfo& info)
noexcept {
+ if (!last_dir_reported.has_value() || info.type() !=
FileType::Directory) {
+ return false;
+ }
+ const auto& last_dir = (*acc_results)[*last_dir_reported];
+ return BasenameView(info.path()) == BasenameView(last_dir.path());
+ };
+
+ auto process_blob =
+ [&](const Azure::Storage::Blobs::Models::BlobItem& blob) noexcept {
+ if (!is_empty_dir_marker(blob)) {
+ const auto& info = acc_results->emplace_back(
+ FileInfoFromBlob(base_location.container, blob));
+ if (info.type() == FileType::Directory) {
+ last_dir_reported = acc_results->size() - 1;
+ }
+ }
+ };
+ auto process_prefix = [&](const std::string& prefix) noexcept -> Status {
+ const std::string path = base_location.container + internal::kSep +
prefix;
+ const auto& info =
acc_results->emplace_back(DirectoryFileInfoFromPath(path));
+ if (ARROW_PREDICT_FALSE(matches_last_dir_reported(info))) {
+ acc_results->pop_back();
+ } else {
+ last_dir_reported = acc_results->size() - 1;
+ return recurse(prefix);
+ }
+ return Status::OK();
+ };
+
+ try {
+ auto list_response =
+ container_client.ListBlobsByHierarchy(/*delimiter=*/"/", options,
context);
+ for (; list_response.HasPage(); list_response.MoveToNextPage(context)) {
+ if (list_response.Blobs.empty() && list_response.BlobPrefixes.empty())
{
+ continue;
+ }
+ found = true;
+ // Blob and BlobPrefixes are sorted by name, so we can merge-iterate
+ // them to ensure returned results are all sorted.
+ size_t blob_index = 0;
+ size_t blob_prefix_index = 0;
+ while (blob_index < list_response.Blobs.size() &&
+ blob_prefix_index < list_response.BlobPrefixes.size()) {
+ const auto& blob = list_response.Blobs[blob_index];
+ const auto& prefix = list_response.BlobPrefixes[blob_prefix_index];
+ const int cmp = blob.Name.compare(prefix);
Review Comment:
```suggestion
const auto cmp = blob.Name.compare(prefix);
```
##########
cpp/src/arrow/filesystem/azurefs.cc:
##########
@@ -815,6 +815,233 @@ class AzureFileSystem::Impl {
}
}
+ private:
+ template <typename OnContainer>
+ Status ListContainers(const Azure::Core::Context& context,
+ OnContainer&& on_container) const {
+ Azure::Storage::Blobs::ListBlobContainersOptions options;
+ // Deleted containers are not returned.
+ options.Include =
Azure::Storage::Blobs::Models::ListBlobContainersIncludeFlags::None;
+ try {
+ auto container_list_response =
+ blob_service_client_->ListBlobContainers(options, context);
+ for (; container_list_response.HasPage();
+ container_list_response.MoveToNextPage(context)) {
+ for (const auto& container : container_list_response.BlobContainers) {
+ RETURN_NOT_OK(on_container(container));
+ }
+ }
+ } catch (const Azure::Storage::StorageException& exception) {
+ return internal::ExceptionToStatus("Failed to list account containers.",
exception);
+ }
+ return Status::OK();
+ }
+
+ static FileInfo FileInfoFromBlob(const std::string& container,
+ const
Azure::Storage::Blobs::Models::BlobItem& blob) {
+ if (blob.Name.back() == internal::kSep) {
+ return DirectoryFileInfoFromPath(container + internal::kSep + blob.Name);
+ }
+ std::string path;
+ path.reserve(container.size() + 1 + blob.Name.size());
+ path += container;
+ path += internal::kSep;
+ path += blob.Name;
+ FileInfo info{std::move(path), FileType::File};
+ info.set_size(blob.BlobSize);
+
info.set_mtime(std::chrono::system_clock::time_point{blob.Details.LastModified});
+ return info;
+ }
+
+ static FileInfo DirectoryFileInfoFromPath(const std::string& path) {
+ return FileInfo{std::string{internal::RemoveTrailingSlash(path)},
+ FileType::Directory};
+ }
+
+ static std::string_view BasenameView(std::string_view s) {
+ auto offset = s.find_last_of(internal::kSep);
+ auto tail = (offset == std::string_view::npos) ? s : s.substr(offset);
+ return internal::RemoveTrailingSlash(tail, /*preserve_root=*/false);
+ }
+
+ /// \brief List the blobs at the root of a container or some dir in a
container.
+ ///
+ /// \pre container_client is the client for the container named like the
first
+ /// segment of select.base_dir.
+ Status GetFileInfoWithSelectorFromContainer(
+ const Azure::Storage::Blobs::BlobContainerClient& container_client,
+ const Azure::Core::Context& context, Azure::Nullable<int32_t>
page_size_hint,
+ const FileSelector& select, FileInfoVector* acc_results) {
+ ARROW_ASSIGN_OR_RAISE(auto base_location,
AzureLocation::FromString(select.base_dir));
+
+ bool found = false;
+ Azure::Storage::Blobs::ListBlobsOptions options;
+ if (internal::GetAbstractPathDepth(base_location.path) == 0) {
+ // If the base_dir is the root of the container, then we want to list
all blobs in
+ // the container and the Prefix should be empty and not even include the
trailing
+ // slash because the container itself represents the `<container>/`
directory.
+ options.Prefix = {};
+ found = true; // Unless the container itself is not found later!
+ } else {
+ options.Prefix = internal::EnsureTrailingSlash(base_location.path);
+ }
+ options.PageSizeHint = page_size_hint;
+ options.Include =
Azure::Storage::Blobs::Models::ListBlobsIncludeFlags::Metadata;
+
+ // When Prefix.Value() contains a trailing slash and we find a blob that
+ // matches it completely, it is an empty directory marker blob for the
+ // directory we're listing from, and we should skip it.
+ auto is_empty_dir_marker =
+ [&options](const Azure::Storage::Blobs::Models::BlobItem& blob)
noexcept -> bool {
+ return options.Prefix.HasValue() && blob.Name == options.Prefix.Value();
+ };
+
+ auto recurse = [&](const std::string& blob_prefix) noexcept -> Status {
+ if (select.recursive && select.max_recursion > 0) {
+ FileSelector sub_select;
+ sub_select.base_dir = base_location.container;
+ sub_select.base_dir += internal::kSep;
+ sub_select.base_dir += internal::RemoveTrailingSlash(blob_prefix);
+ sub_select.allow_not_found = true;
+ sub_select.recursive = true;
+ sub_select.max_recursion = select.max_recursion - 1;
+ return GetFileInfoWithSelectorFromContainer(
+ container_client, context, page_size_hint, sub_select,
acc_results);
+ }
+ return Status::OK();
+ };
+
+ // (*acc_results)[*last_dir_reported] is the last FileType::Directory in
the results
+ // produced through this loop over the response pages.
+ std::optional<size_t> last_dir_reported{};
+ auto matches_last_dir_reported = [&last_dir_reported,
+ acc_results](const FileInfo& info)
noexcept {
+ if (!last_dir_reported.has_value() || info.type() !=
FileType::Directory) {
+ return false;
+ }
+ const auto& last_dir = (*acc_results)[*last_dir_reported];
+ return BasenameView(info.path()) == BasenameView(last_dir.path());
+ };
+
+ auto process_blob =
+ [&](const Azure::Storage::Blobs::Models::BlobItem& blob) noexcept {
+ if (!is_empty_dir_marker(blob)) {
+ const auto& info = acc_results->emplace_back(
+ FileInfoFromBlob(base_location.container, blob));
+ if (info.type() == FileType::Directory) {
+ last_dir_reported = acc_results->size() - 1;
+ }
+ }
+ };
+ auto process_prefix = [&](const std::string& prefix) noexcept -> Status {
+ const std::string path = base_location.container + internal::kSep +
prefix;
+ const auto& info =
acc_results->emplace_back(DirectoryFileInfoFromPath(path));
+ if (ARROW_PREDICT_FALSE(matches_last_dir_reported(info))) {
+ acc_results->pop_back();
+ } else {
+ last_dir_reported = acc_results->size() - 1;
+ return recurse(prefix);
+ }
+ return Status::OK();
+ };
+
+ try {
+ auto list_response =
+ container_client.ListBlobsByHierarchy(/*delimiter=*/"/", options,
context);
+ for (; list_response.HasPage(); list_response.MoveToNextPage(context)) {
+ if (list_response.Blobs.empty() && list_response.BlobPrefixes.empty())
{
+ continue;
+ }
+ found = true;
+ // Blob and BlobPrefixes are sorted by name, so we can merge-iterate
+ // them to ensure returned results are all sorted.
+ size_t blob_index = 0;
+ size_t blob_prefix_index = 0;
+ while (blob_index < list_response.Blobs.size() &&
+ blob_prefix_index < list_response.BlobPrefixes.size()) {
+ const auto& blob = list_response.Blobs[blob_index];
+ const auto& prefix = list_response.BlobPrefixes[blob_prefix_index];
+ const int cmp = blob.Name.compare(prefix);
+ if (cmp < 0) {
+ process_blob(blob);
+ blob_index += 1;
Review Comment:
```suggestion
++blob_index;
```
##########
cpp/src/arrow/filesystem/azurefs.cc:
##########
@@ -815,6 +815,233 @@ class AzureFileSystem::Impl {
}
}
+ private:
+ template <typename OnContainer>
+ Status ListContainers(const Azure::Core::Context& context,
+ OnContainer&& on_container) const {
+ Azure::Storage::Blobs::ListBlobContainersOptions options;
+ // Deleted containers are not returned.
+ options.Include =
Azure::Storage::Blobs::Models::ListBlobContainersIncludeFlags::None;
+ try {
+ auto container_list_response =
+ blob_service_client_->ListBlobContainers(options, context);
+ for (; container_list_response.HasPage();
+ container_list_response.MoveToNextPage(context)) {
+ for (const auto& container : container_list_response.BlobContainers) {
+ RETURN_NOT_OK(on_container(container));
+ }
+ }
+ } catch (const Azure::Storage::StorageException& exception) {
+ return internal::ExceptionToStatus("Failed to list account containers.",
exception);
+ }
+ return Status::OK();
+ }
+
+ static FileInfo FileInfoFromBlob(const std::string& container,
+ const
Azure::Storage::Blobs::Models::BlobItem& blob) {
+ if (blob.Name.back() == internal::kSep) {
+ return DirectoryFileInfoFromPath(container + internal::kSep + blob.Name);
+ }
+ std::string path;
+ path.reserve(container.size() + 1 + blob.Name.size());
+ path += container;
+ path += internal::kSep;
+ path += blob.Name;
+ FileInfo info{std::move(path), FileType::File};
+ info.set_size(blob.BlobSize);
+
info.set_mtime(std::chrono::system_clock::time_point{blob.Details.LastModified});
+ return info;
+ }
+
+ static FileInfo DirectoryFileInfoFromPath(const std::string& path) {
+ return FileInfo{std::string{internal::RemoveTrailingSlash(path)},
+ FileType::Directory};
+ }
+
+ static std::string_view BasenameView(std::string_view s) {
+ auto offset = s.find_last_of(internal::kSep);
+ auto tail = (offset == std::string_view::npos) ? s : s.substr(offset);
+ return internal::RemoveTrailingSlash(tail, /*preserve_root=*/false);
+ }
+
+ /// \brief List the blobs at the root of a container or some dir in a
container.
+ ///
+ /// \pre container_client is the client for the container named like the
first
+ /// segment of select.base_dir.
+ Status GetFileInfoWithSelectorFromContainer(
+ const Azure::Storage::Blobs::BlobContainerClient& container_client,
+ const Azure::Core::Context& context, Azure::Nullable<int32_t>
page_size_hint,
+ const FileSelector& select, FileInfoVector* acc_results) {
+ ARROW_ASSIGN_OR_RAISE(auto base_location,
AzureLocation::FromString(select.base_dir));
+
+ bool found = false;
+ Azure::Storage::Blobs::ListBlobsOptions options;
+ if (internal::GetAbstractPathDepth(base_location.path) == 0) {
+ // If the base_dir is the root of the container, then we want to list
all blobs in
+ // the container and the Prefix should be empty and not even include the
trailing
+ // slash because the container itself represents the `<container>/`
directory.
+ options.Prefix = {};
+ found = true; // Unless the container itself is not found later!
+ } else {
+ options.Prefix = internal::EnsureTrailingSlash(base_location.path);
+ }
+ options.PageSizeHint = page_size_hint;
+ options.Include =
Azure::Storage::Blobs::Models::ListBlobsIncludeFlags::Metadata;
+
+ // When Prefix.Value() contains a trailing slash and we find a blob that
+ // matches it completely, it is an empty directory marker blob for the
+ // directory we're listing from, and we should skip it.
+ auto is_empty_dir_marker =
+ [&options](const Azure::Storage::Blobs::Models::BlobItem& blob)
noexcept -> bool {
+ return options.Prefix.HasValue() && blob.Name == options.Prefix.Value();
+ };
+
+ auto recurse = [&](const std::string& blob_prefix) noexcept -> Status {
+ if (select.recursive && select.max_recursion > 0) {
+ FileSelector sub_select;
+ sub_select.base_dir = base_location.container;
+ sub_select.base_dir += internal::kSep;
+ sub_select.base_dir += internal::RemoveTrailingSlash(blob_prefix);
+ sub_select.allow_not_found = true;
+ sub_select.recursive = true;
+ sub_select.max_recursion = select.max_recursion - 1;
+ return GetFileInfoWithSelectorFromContainer(
+ container_client, context, page_size_hint, sub_select,
acc_results);
+ }
+ return Status::OK();
+ };
+
+ // (*acc_results)[*last_dir_reported] is the last FileType::Directory in
the results
+ // produced through this loop over the response pages.
+ std::optional<size_t> last_dir_reported{};
+ auto matches_last_dir_reported = [&last_dir_reported,
+ acc_results](const FileInfo& info)
noexcept {
+ if (!last_dir_reported.has_value() || info.type() !=
FileType::Directory) {
+ return false;
+ }
+ const auto& last_dir = (*acc_results)[*last_dir_reported];
+ return BasenameView(info.path()) == BasenameView(last_dir.path());
+ };
+
+ auto process_blob =
+ [&](const Azure::Storage::Blobs::Models::BlobItem& blob) noexcept {
+ if (!is_empty_dir_marker(blob)) {
+ const auto& info = acc_results->emplace_back(
+ FileInfoFromBlob(base_location.container, blob));
+ if (info.type() == FileType::Directory) {
+ last_dir_reported = acc_results->size() - 1;
+ }
+ }
+ };
+ auto process_prefix = [&](const std::string& prefix) noexcept -> Status {
+ const std::string path = base_location.container + internal::kSep +
prefix;
+ const auto& info =
acc_results->emplace_back(DirectoryFileInfoFromPath(path));
+ if (ARROW_PREDICT_FALSE(matches_last_dir_reported(info))) {
+ acc_results->pop_back();
+ } else {
+ last_dir_reported = acc_results->size() - 1;
+ return recurse(prefix);
+ }
+ return Status::OK();
+ };
+
+ try {
+ auto list_response =
+ container_client.ListBlobsByHierarchy(/*delimiter=*/"/", options,
context);
+ for (; list_response.HasPage(); list_response.MoveToNextPage(context)) {
+ if (list_response.Blobs.empty() && list_response.BlobPrefixes.empty())
{
+ continue;
+ }
+ found = true;
+ // Blob and BlobPrefixes are sorted by name, so we can merge-iterate
+ // them to ensure returned results are all sorted.
+ size_t blob_index = 0;
+ size_t blob_prefix_index = 0;
+ while (blob_index < list_response.Blobs.size() &&
+ blob_prefix_index < list_response.BlobPrefixes.size()) {
+ const auto& blob = list_response.Blobs[blob_index];
+ const auto& prefix = list_response.BlobPrefixes[blob_prefix_index];
+ const int cmp = blob.Name.compare(prefix);
+ if (cmp < 0) {
+ process_blob(blob);
+ blob_index += 1;
+ } else if (cmp > 0) {
+ RETURN_NOT_OK(process_prefix(prefix));
+ blob_prefix_index += 1;
+ } else { // there is a blob (empty dir marker) and a prefix with
the same name
+ DCHECK_EQ(blob.Name, prefix);
+ RETURN_NOT_OK(process_prefix(prefix));
+ blob_index += 1;
+ blob_prefix_index += 1;
Review Comment:
```suggestion
++blob_index;
++blob_prefix_index;
```
##########
cpp/src/arrow/filesystem/azurefs_test.cc:
##########
@@ -287,6 +287,49 @@ class AzureFileSystemTest : public ::testing::Test {
void RunGetFileInfoObjectWithNestedStructureTest();
void RunGetFileInfoObjectTest();
+
+ void SetUpSmallFileSystemTree() {
+ // Set up test containers
+
blob_service_client_->GetBlobContainerClient("empty-container").CreateIfNotExists();
+
+ auto container_client =
blob_service_client_->GetBlobContainerClient("container");
+ container_client.CreateIfNotExists();
+
+ auto blob_client = container_client.GetBlockBlobClient("emptydir/");
+ blob_client.UploadFrom(reinterpret_cast<const uint8_t*>(""), 0);
+
+ blob_client =
container_client.GetBlockBlobClient("somedir/subdir/subfile");
+ const char* sub_data = "sub data";
+ blob_client.UploadFrom(reinterpret_cast<const uint8_t*>(sub_data),
strlen(sub_data));
+
+ blob_client = container_client.GetBlockBlobClient("somefile");
+ const char* some_data = "some data";
+ blob_client.UploadFrom(reinterpret_cast<const uint8_t*>(some_data),
+ strlen(some_data));
+
+ blob_client =
container_client.GetBlockBlobClient("otherdir/1/2/3/otherfile");
+ const char* other_data = "other data";
+ blob_client.UploadFrom(reinterpret_cast<const uint8_t*>(other_data),
+ strlen(other_data));
Review Comment:
How about using `fs_` instead?
##########
cpp/src/arrow/filesystem/azurefs_test.cc:
##########
@@ -479,6 +522,133 @@ TEST_F(AzureHierarchicalNamespaceFileSystemTest,
GetFileInfoObject) {
RunGetFileInfoObjectTest();
}
+TEST_F(AzuriteFileSystemTest, GetFileInfoSelector) {
+ SetUpSmallFileSystemTree();
+
+ FileSelector select;
+ std::vector<FileInfo> infos;
+
+ // Root dir
+ select.base_dir = "";
+ ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+ ASSERT_EQ(infos.size(), 3);
+ ASSERT_EQ(infos, SortedInfos(infos));
+ AssertFileInfo(infos[0], "container", FileType::Directory);
+ AssertFileInfo(infos[1], "empty-container", FileType::Directory);
+ AssertFileInfo(infos[2], container_name_, FileType::Directory);
+
+ // Empty container
+ select.base_dir = "empty-container";
+ ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+ ASSERT_EQ(infos.size(), 0);
+ // Nonexistent container
+ select.base_dir = "nonexistent-container";
+ ASSERT_RAISES(IOError, fs_->GetFileInfo(select));
+ select.allow_not_found = true;
+ ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+ ASSERT_EQ(infos.size(), 0);
+ select.allow_not_found = false;
+ // Non-empty container
+ select.base_dir = "container";
+ ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+ ASSERT_EQ(infos, SortedInfos(infos));
+ ASSERT_EQ(infos.size(), 4);
+ AssertFileInfo(infos[0], "container/emptydir", FileType::Directory);
+ AssertFileInfo(infos[1], "container/otherdir", FileType::Directory);
+ AssertFileInfo(infos[2], "container/somedir", FileType::Directory);
+ AssertFileInfo(infos[3], "container/somefile", FileType::File, 9);
Review Comment:
Can we avoid a magic number here?
##########
cpp/src/arrow/filesystem/azurefs_test.cc:
##########
@@ -213,7 +213,7 @@ class AzureFileSystemTest : public ::testing::Test {
suite_skipped_ = true;
GTEST_SKIP() << options.status().message();
}
- container_name_ = RandomChars(32);
+ container_name_ = "z" + RandomChars(31);
Review Comment:
Why do we want to use `z` as the first character?
Is it to make the `Preexisting*` entries sort last in
`AssertInfoAllContainersRecursive()`?
If so, how about removing `Preexisting*` in the tests that use
`AssertInfoAllContainersRecursive()` (or `SetUpSmallFileSystemTree()`) and
not checking them in `AssertInfoAllContainersRecursive()`?
##########
cpp/src/arrow/filesystem/azurefs.cc:
##########
@@ -815,6 +815,233 @@ class AzureFileSystem::Impl {
}
}
+ private:
+ template <typename OnContainer>
+ Status ListContainers(const Azure::Core::Context& context,
+ OnContainer&& on_container) const {
+ Azure::Storage::Blobs::ListBlobContainersOptions options;
+ // Deleted containers are not returned.
+ options.Include =
Azure::Storage::Blobs::Models::ListBlobContainersIncludeFlags::None;
+ try {
+ auto container_list_response =
+ blob_service_client_->ListBlobContainers(options, context);
+ for (; container_list_response.HasPage();
+ container_list_response.MoveToNextPage(context)) {
+ for (const auto& container : container_list_response.BlobContainers) {
+ RETURN_NOT_OK(on_container(container));
+ }
+ }
+ } catch (const Azure::Storage::StorageException& exception) {
+ return internal::ExceptionToStatus("Failed to list account containers.",
exception);
+ }
+ return Status::OK();
+ }
+
+ static FileInfo FileInfoFromBlob(const std::string& container,
+ const
Azure::Storage::Blobs::Models::BlobItem& blob) {
+ if (blob.Name.back() == internal::kSep) {
+ return DirectoryFileInfoFromPath(container + internal::kSep + blob.Name);
+ }
+ std::string path;
+ path.reserve(container.size() + 1 + blob.Name.size());
+ path += container;
+ path += internal::kSep;
+ path += blob.Name;
+ FileInfo info{std::move(path), FileType::File};
+ info.set_size(blob.BlobSize);
+
info.set_mtime(std::chrono::system_clock::time_point{blob.Details.LastModified});
+ return info;
+ }
+
+ static FileInfo DirectoryFileInfoFromPath(const std::string& path) {
+ return FileInfo{std::string{internal::RemoveTrailingSlash(path)},
+ FileType::Directory};
+ }
+
+ static std::string_view BasenameView(std::string_view s) {
+ auto offset = s.find_last_of(internal::kSep);
+ auto tail = (offset == std::string_view::npos) ? s : s.substr(offset);
+ return internal::RemoveTrailingSlash(tail, /*preserve_root=*/false);
+ }
+
+ /// \brief List the blobs at the root of a container or some dir in a
container.
+ ///
+ /// \pre container_client is the client for the container named like the
first
+ /// segment of select.base_dir.
+ Status GetFileInfoWithSelectorFromContainer(
+ const Azure::Storage::Blobs::BlobContainerClient& container_client,
+ const Azure::Core::Context& context, Azure::Nullable<int32_t>
page_size_hint,
+ const FileSelector& select, FileInfoVector* acc_results) {
+ ARROW_ASSIGN_OR_RAISE(auto base_location,
AzureLocation::FromString(select.base_dir));
+
+ bool found = false;
+ Azure::Storage::Blobs::ListBlobsOptions options;
+ if (internal::GetAbstractPathDepth(base_location.path) == 0) {
+ // If the base_dir is the root of the container, then we want to list
all blobs in
+ // the container and the Prefix should be empty and not even include the
trailing
+ // slash because the container itself represents the `<container>/`
directory.
+ options.Prefix = {};
+ found = true; // Unless the container itself is not found later!
+ } else {
+ options.Prefix = internal::EnsureTrailingSlash(base_location.path);
+ }
+ options.PageSizeHint = page_size_hint;
+ options.Include =
Azure::Storage::Blobs::Models::ListBlobsIncludeFlags::Metadata;
+
+ // When Prefix.Value() contains a trailing slash and we find a blob that
+ // matches it completely, it is an empty directory marker blob for the
+ // directory we're listing from, and we should skip it.
+ auto is_empty_dir_marker =
+ [&options](const Azure::Storage::Blobs::Models::BlobItem& blob)
noexcept -> bool {
+ return options.Prefix.HasValue() && blob.Name == options.Prefix.Value();
+ };
+
+ auto recurse = [&](const std::string& blob_prefix) noexcept -> Status {
+ if (select.recursive && select.max_recursion > 0) {
+ FileSelector sub_select;
+ sub_select.base_dir = base_location.container;
+ sub_select.base_dir += internal::kSep;
+ sub_select.base_dir += internal::RemoveTrailingSlash(blob_prefix);
+ sub_select.allow_not_found = true;
+ sub_select.recursive = true;
+ sub_select.max_recursion = select.max_recursion - 1;
+ return GetFileInfoWithSelectorFromContainer(
+ container_client, context, page_size_hint, sub_select,
acc_results);
+ }
+ return Status::OK();
+ };
+
+ // (*acc_results)[*last_dir_reported] is the last FileType::Directory in
the results
+ // produced through this loop over the response pages.
+ std::optional<size_t> last_dir_reported{};
+ auto matches_last_dir_reported = [&last_dir_reported,
+ acc_results](const FileInfo& info)
noexcept {
+ if (!last_dir_reported.has_value() || info.type() !=
FileType::Directory) {
+ return false;
+ }
+ const auto& last_dir = (*acc_results)[*last_dir_reported];
+ return BasenameView(info.path()) == BasenameView(last_dir.path());
+ };
+
+ auto process_blob =
+ [&](const Azure::Storage::Blobs::Models::BlobItem& blob) noexcept {
+ if (!is_empty_dir_marker(blob)) {
+ const auto& info = acc_results->emplace_back(
+ FileInfoFromBlob(base_location.container, blob));
+ if (info.type() == FileType::Directory) {
+ last_dir_reported = acc_results->size() - 1;
+ }
+ }
+ };
+ auto process_prefix = [&](const std::string& prefix) noexcept -> Status {
+ const std::string path = base_location.container + internal::kSep +
prefix;
+ const auto& info =
acc_results->emplace_back(DirectoryFileInfoFromPath(path));
+ if (ARROW_PREDICT_FALSE(matches_last_dir_reported(info))) {
+ acc_results->pop_back();
+ } else {
+ last_dir_reported = acc_results->size() - 1;
+ return recurse(prefix);
+ }
+ return Status::OK();
+ };
+
+ try {
+ auto list_response =
+ container_client.ListBlobsByHierarchy(/*delimiter=*/"/", options,
context);
+ for (; list_response.HasPage(); list_response.MoveToNextPage(context)) {
+ if (list_response.Blobs.empty() && list_response.BlobPrefixes.empty())
{
+ continue;
+ }
+ found = true;
+ // Blob and BlobPrefixes are sorted by name, so we can merge-iterate
+ // them to ensure returned results are all sorted.
+ size_t blob_index = 0;
+ size_t blob_prefix_index = 0;
+ while (blob_index < list_response.Blobs.size() &&
+ blob_prefix_index < list_response.BlobPrefixes.size()) {
+ const auto& blob = list_response.Blobs[blob_index];
+ const auto& prefix = list_response.BlobPrefixes[blob_prefix_index];
+ const int cmp = blob.Name.compare(prefix);
+ if (cmp < 0) {
+ process_blob(blob);
+ blob_index += 1;
+ } else if (cmp > 0) {
+ RETURN_NOT_OK(process_prefix(prefix));
+ blob_prefix_index += 1;
Review Comment:
```suggestion
++blob_prefix_index;
```
##########
cpp/src/arrow/filesystem/azurefs_test.cc:
##########
@@ -479,6 +522,133 @@ TEST_F(AzureHierarchicalNamespaceFileSystemTest,
GetFileInfoObject) {
RunGetFileInfoObjectTest();
}
+TEST_F(AzuriteFileSystemTest, GetFileInfoSelector) {
+ SetUpSmallFileSystemTree();
+
+ FileSelector select;
+ std::vector<FileInfo> infos;
+
+ // Root dir
+ select.base_dir = "";
+ ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+ ASSERT_EQ(infos.size(), 3);
+ ASSERT_EQ(infos, SortedInfos(infos));
+ AssertFileInfo(infos[0], "container", FileType::Directory);
+ AssertFileInfo(infos[1], "empty-container", FileType::Directory);
+ AssertFileInfo(infos[2], container_name_, FileType::Directory);
+
+ // Empty container
+ select.base_dir = "empty-container";
+ ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+ ASSERT_EQ(infos.size(), 0);
+ // Nonexistent container
+ select.base_dir = "nonexistent-container";
+ ASSERT_RAISES(IOError, fs_->GetFileInfo(select));
+ select.allow_not_found = true;
+ ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+ ASSERT_EQ(infos.size(), 0);
+ select.allow_not_found = false;
+ // Non-empty container
+ select.base_dir = "container";
+ ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+ ASSERT_EQ(infos, SortedInfos(infos));
+ ASSERT_EQ(infos.size(), 4);
+ AssertFileInfo(infos[0], "container/emptydir", FileType::Directory);
+ AssertFileInfo(infos[1], "container/otherdir", FileType::Directory);
+ AssertFileInfo(infos[2], "container/somedir", FileType::Directory);
+ AssertFileInfo(infos[3], "container/somefile", FileType::File, 9);
+
+ // Empty "directory"
+ select.base_dir = "container/emptydir";
+ ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+ ASSERT_EQ(infos.size(), 0);
+ // Non-empty "directories"
+ select.base_dir = "container/somedir";
+ ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+ ASSERT_EQ(infos.size(), 1);
+ AssertFileInfo(infos[0], "container/somedir/subdir", FileType::Directory);
+ select.base_dir = "container/somedir/subdir";
+ ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+ ASSERT_EQ(infos.size(), 1);
+ AssertFileInfo(infos[0], "container/somedir/subdir/subfile", FileType::File,
8);
+ // Nonexistent
+ select.base_dir = "container/nonexistent";
+ ASSERT_RAISES(IOError, fs_->GetFileInfo(select));
+ select.allow_not_found = true;
+ ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+ ASSERT_EQ(infos.size(), 0);
+ select.allow_not_found = false;
+
+ // Trailing slashes
+ select.base_dir = "empty-container/";
+ ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+ ASSERT_EQ(infos.size(), 0);
+ select.base_dir = "nonexistent-container/";
+ ASSERT_RAISES(IOError, fs_->GetFileInfo(select));
+ select.base_dir = "container/";
+ ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+ ASSERT_EQ(infos, SortedInfos(infos));
+ ASSERT_EQ(infos.size(), 4);
+}
+
+TEST_F(AzuriteFileSystemTest, GetFileInfoSelectorRecursive) {
+ SetUpSmallFileSystemTree();
+
+ FileSelector select;
+ select.recursive = true;
+
+ std::vector<FileInfo> infos;
+ // Root dir
+ select.base_dir = "";
+ ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+ ASSERT_EQ(infos.size(), 14);
Review Comment:
This is redundant because it's also checked in
`AssertInfoAllContainersRecursive()`.
##########
cpp/src/arrow/filesystem/azurefs_test.cc:
##########
@@ -479,6 +522,133 @@ TEST_F(AzureHierarchicalNamespaceFileSystemTest,
GetFileInfoObject) {
RunGetFileInfoObjectTest();
}
+TEST_F(AzuriteFileSystemTest, GetFileInfoSelector) {
+ SetUpSmallFileSystemTree();
+
+ FileSelector select;
+ std::vector<FileInfo> infos;
+
+ // Root dir
+ select.base_dir = "";
+ ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+ ASSERT_EQ(infos.size(), 3);
+ ASSERT_EQ(infos, SortedInfos(infos));
+ AssertFileInfo(infos[0], "container", FileType::Directory);
+ AssertFileInfo(infos[1], "empty-container", FileType::Directory);
+ AssertFileInfo(infos[2], container_name_, FileType::Directory);
+
+ // Empty container
+ select.base_dir = "empty-container";
+ ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+ ASSERT_EQ(infos.size(), 0);
+ // Nonexistent container
+ select.base_dir = "nonexistent-container";
+ ASSERT_RAISES(IOError, fs_->GetFileInfo(select));
+ select.allow_not_found = true;
+ ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+ ASSERT_EQ(infos.size(), 0);
+ select.allow_not_found = false;
+ // Non-empty container
+ select.base_dir = "container";
+ ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+ ASSERT_EQ(infos, SortedInfos(infos));
+ ASSERT_EQ(infos.size(), 4);
+ AssertFileInfo(infos[0], "container/emptydir", FileType::Directory);
+ AssertFileInfo(infos[1], "container/otherdir", FileType::Directory);
+ AssertFileInfo(infos[2], "container/somedir", FileType::Directory);
+ AssertFileInfo(infos[3], "container/somefile", FileType::File, 9);
+
+ // Empty "directory"
+ select.base_dir = "container/emptydir";
+ ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+ ASSERT_EQ(infos.size(), 0);
+ // Non-empty "directories"
+ select.base_dir = "container/somedir";
+ ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+ ASSERT_EQ(infos.size(), 1);
+ AssertFileInfo(infos[0], "container/somedir/subdir", FileType::Directory);
+ select.base_dir = "container/somedir/subdir";
+ ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select));
+ ASSERT_EQ(infos.size(), 1);
+ AssertFileInfo(infos[0], "container/somedir/subdir/subfile", FileType::File,
8);
Review Comment:
Ditto: can we avoid a magic number here too?
##########
cpp/src/arrow/filesystem/path_util.cc:
##########
@@ -72,15 +72,12 @@ std::string SliceAbstractPath(const std::string& s, int
offset, int length, char
return "";
}
std::vector<std::string> components = SplitAbstractPath(s, sep);
- std::stringstream combined;
if (offset >= static_cast<int>(components.size())) {
return "";
}
- int end = offset + length;
- if (end > static_cast<int>(components.size())) {
- end = static_cast<int>(components.size());
- }
- for (int i = offset; i < end; i++) {
+ const size_t end = std::min(static_cast<size_t>(offset) + length,
components.size());
Review Comment:
```suggestion
const auto end = std::min(static_cast<size_t>(offset) + length,
components.size());
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]