kou commented on code in PR #38793: URL: https://github.com/apache/arrow/pull/38793#discussion_r1401240179
########## cpp/src/arrow/filesystem/azurefs.cc: ########## @@ -969,6 +969,95 @@ class AzureFileSystem::Impl { RETURN_NOT_OK(stream->Init()); return stream; } + + Status DeleteDir(const AzureLocation& location) { + if (location.container.empty()) { + return Status::Invalid("Cannot delete an empty container"); + } + + if (location.path.empty()) { + auto container_client = + blob_service_client_->GetBlobContainerClient(location.container); + try { + auto response = container_client.Delete(); + if (response.Value.Deleted) { + return Status::OK(); + } else { + return StatusFromErrorResponse( + container_client.GetUrl(), response.RawResponse.get(), + "Failed to delete a container: " + location.container); + } + } catch (const Azure::Storage::StorageException& exception) { + return internal::ExceptionToStatus( + "Failed to delete a container: " + location.container + ": " + + container_client.GetUrl(), + exception); + } + } + + ARROW_ASSIGN_OR_RAISE(auto hierarchical_namespace_enabled, + hierarchical_namespace_.Enabled(location.container)); + if (hierarchical_namespace_enabled) { + auto directory_client = + datalake_service_client_->GetFileSystemClient(location.container) + .GetDirectoryClient(location.path); + try { + auto response = directory_client.DeleteRecursive(); + if (response.Value.Deleted) { + return Status::OK(); + } else { + return StatusFromErrorResponse( + directory_client.GetUrl(), response.RawResponse.get(), + "Failed to delete a directory: " + location.path); + } + } catch (const Azure::Storage::StorageException& exception) { + return internal::ExceptionToStatus( + "Failed to delete a directory: " + location.path + ": " + + directory_client.GetUrl(), + exception); + } + } else { + auto container_client = + blob_service_client_->GetBlobContainerClient(location.container); + Azure::Storage::Blobs::ListBlobsOptions options; + options.Prefix = internal::EnsureTrailingSlash(location.path); + // https://learn.microsoft.com/en-us/rest/api/storageservices/blob-batch#remarks + // + // Only supports up to 256 subrequests in a single batch. The + // size of the body for a batch request can't exceed 4 MB. + const int32_t kNumMaxRequestsInBatch = 256; + options.PageSizeHint = kNumMaxRequestsInBatch; + auto list_response = container_client.ListBlobs(options); + while (list_response.HasPage() && !list_response.Blobs.empty()) { Review Comment: Oh, sorry. I missed it. I'll add `try`/`catch` for it. ########## cpp/src/arrow/filesystem/azurefs.cc: ########## @@ -969,6 +969,95 @@ class AzureFileSystem::Impl { RETURN_NOT_OK(stream->Init()); return stream; } + + Status DeleteDir(const AzureLocation& location) { + if (location.container.empty()) { + return Status::Invalid("Cannot delete an empty container"); + } + + if (location.path.empty()) { + auto container_client = + blob_service_client_->GetBlobContainerClient(location.container); + try { + auto response = container_client.Delete(); + if (response.Value.Deleted) { + return Status::OK(); + } else { + return StatusFromErrorResponse( + container_client.GetUrl(), response.RawResponse.get(), + "Failed to delete a container: " + location.container); + } + } catch (const Azure::Storage::StorageException& exception) { + return internal::ExceptionToStatus( + "Failed to delete a container: " + location.container + ": " + + container_client.GetUrl(), + exception); + } + } + + ARROW_ASSIGN_OR_RAISE(auto hierarchical_namespace_enabled, + hierarchical_namespace_.Enabled(location.container)); + if (hierarchical_namespace_enabled) { + auto directory_client = + datalake_service_client_->GetFileSystemClient(location.container) + .GetDirectoryClient(location.path); + try { + auto response = directory_client.DeleteRecursive(); + if (response.Value.Deleted) { + return Status::OK(); + } else { + return StatusFromErrorResponse( + directory_client.GetUrl(), response.RawResponse.get(), + "Failed to delete a directory: " + location.path); + } + } catch (const Azure::Storage::StorageException& exception) { + return internal::ExceptionToStatus( + "Failed to delete a directory: " + location.path + ": " + + directory_client.GetUrl(), + exception); + } + } else { + auto container_client = + blob_service_client_->GetBlobContainerClient(location.container); + Azure::Storage::Blobs::ListBlobsOptions options; + options.Prefix = internal::EnsureTrailingSlash(location.path); + // https://learn.microsoft.com/en-us/rest/api/storageservices/blob-batch#remarks + // + // Only supports up to 256 subrequests in a single batch. The + // size of the body for a batch request can't exceed 4 MB. + const int32_t kNumMaxRequestsInBatch = 256; + options.PageSizeHint = kNumMaxRequestsInBatch; + auto list_response = container_client.ListBlobs(options); + while (list_response.HasPage() && !list_response.Blobs.empty()) { + auto batch = container_client.CreateBatch(); + std::vector<Azure::Storage::DeferredResponse< + Azure::Storage::Blobs::Models::DeleteBlobResult>> + deferred_responses; + for (const auto& blob_item : list_response.Blobs) { + deferred_responses.push_back(batch.DeleteBlob(blob_item.Name)); + } + try { + container_client.SubmitBatch(batch); + } catch (const Azure::Storage::StorageException& exception) { + return internal::ExceptionToStatus( + "Failed to delete blobs in a directory: " + location.path + ": " + + container_client.GetUrl(), + exception); + } + for (size_t i = 0; i < deferred_responses.size(); ++i) { + const auto& deferred_response = deferred_responses[i]; + auto delete_response = deferred_response.GetResponse(); + if (!delete_response.Value.Deleted) { + const auto& blob_item = list_response.Blobs[i]; + return Status::IOError("Failed to delete a blob: ", blob_item.Name, + ": " + container_client.GetUrl()); Review Comment: We don't need to continue the loop to avoid wasting the list effort. Because we already submitted all delete requests by `SubmitBatch()`. This loop just checks the results. > I think `GetResponse` should be called on all the deferred responses we have sent. Why? Is it for avoiding a resource leak? I think that not calling `GetResponse()` doesn't leak any resource. Because I think that `GetResponse()` just returns a sub response returned by `SubmitBatch()`. I think that all sub responses are managed by `SubmitBatch()` response and the response is already read. See also a `SubmitBatch()` response example: https://learn.microsoft.com/en-us/rest/api/storageservices/blob-batch#sample-response Anyway, I'll check all deferred responses to show all failed blob names in error message. It'll help debug. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org