kou commented on code in PR #40075:
URL: https://github.com/apache/arrow/pull/40075#discussion_r1496566444
##########
cpp/src/arrow/filesystem/azurefs.cc:
##########
@@ -2055,6 +2039,132 @@ class AzureFileSystem::Impl {
static constexpr auto kTimeNeededForFileOrDirectoryRename =
std::chrono::seconds{3};
public:
+ /// \pre location.container is not empty.
+ /// \pre location.path is not empty.
+ Status DeleteFileOnFileSystem(const DataLake::DataLakeFileSystemClient&
adlfs_client,
+ const AzureLocation& location,
+ bool require_file_to_exist) {
+ DCHECK(!location.container.empty());
+ DCHECK(!location.path.empty());
+ auto path_no_trailing_slash =
+ std::string{internal::RemoveTrailingSlash(location.path)};
+ auto file_client = adlfs_client.GetFileClient(path_no_trailing_slash);
+ try {
+ // This is necessary to avoid deletion of directories via DeleteFile.
+ auto properties = file_client.GetProperties();
+ if (properties.Value.IsDirectory) {
+ return internal::NotAFile(location.all);
+ }
+ if (internal::HasTrailingSlash(location.path)) {
+ return internal::NotADir(location.all);
+ }
+ auto response = file_client.Delete();
+ // Only the "*IfExists" functions ever set Deleted to false.
+ // All the others either succeed or throw an exception.
+ DCHECK(response.Value.Deleted);
+ } catch (const Storage::StorageException& exception) {
+ if (exception.StatusCode == Http::HttpStatusCode::NotFound) {
+ // ErrorCode can be "FilesystemNotFound", "PathNotFound"...
+ if (require_file_to_exist) {
+ return PathNotFound(location);
+ }
+ return Status::OK();
+ }
+ return ExceptionToStatus(exception, "Failed to delete a file: ",
location.path,
+ ": ", file_client.GetUrl());
+ }
+ return Status::OK();
+ }
+
+ /// \pre location.container is not empty.
+ /// \pre location.path is not empty.
+ Status DeleteFileOnContainer(const Blobs::BlobContainerClient&
container_client,
+ const AzureLocation& location, bool
require_file_to_exist,
+ const char* operation) {
Review Comment:
It seems that we don't need to receive `operation` as an argument. How about
defining it as a local variable instead of an argument?
##########
cpp/src/arrow/filesystem/azurefs.cc:
##########
@@ -2055,6 +2039,132 @@ class AzureFileSystem::Impl {
static constexpr auto kTimeNeededForFileOrDirectoryRename =
std::chrono::seconds{3};
public:
+ /// \pre location.container is not empty.
+ /// \pre location.path is not empty.
+ Status DeleteFileOnFileSystem(const DataLake::DataLakeFileSystemClient&
adlfs_client,
+ const AzureLocation& location,
+ bool require_file_to_exist) {
+ DCHECK(!location.container.empty());
+ DCHECK(!location.path.empty());
+ auto path_no_trailing_slash =
+ std::string{internal::RemoveTrailingSlash(location.path)};
+ auto file_client = adlfs_client.GetFileClient(path_no_trailing_slash);
+ try {
+ // This is necessary to avoid deletion of directories via DeleteFile.
+ auto properties = file_client.GetProperties();
+ if (properties.Value.IsDirectory) {
+ return internal::NotAFile(location.all);
+ }
+ if (internal::HasTrailingSlash(location.path)) {
+ return internal::NotADir(location.all);
+ }
+ auto response = file_client.Delete();
+ // Only the "*IfExists" functions ever set Deleted to false.
+ // All the others either succeed or throw an exception.
+ DCHECK(response.Value.Deleted);
+ } catch (const Storage::StorageException& exception) {
+ if (exception.StatusCode == Http::HttpStatusCode::NotFound) {
+ // ErrorCode can be "FilesystemNotFound", "PathNotFound"...
+ if (require_file_to_exist) {
+ return PathNotFound(location);
+ }
+ return Status::OK();
+ }
+ return ExceptionToStatus(exception, "Failed to delete a file: ",
location.path,
+ ": ", file_client.GetUrl());
+ }
+ return Status::OK();
+ }
+
+ /// \pre location.container is not empty.
+ /// \pre location.path is not empty.
+ Status DeleteFileOnContainer(const Blobs::BlobContainerClient&
container_client,
+ const AzureLocation& location, bool
require_file_to_exist,
+ const char* operation) {
+ DCHECK(!location.container.empty());
+ DCHECK(!location.path.empty());
+ constexpr auto kFileBlobLeaseTime = std::chrono::seconds{15};
+
+ // When it's known that the blob doesn't exist as a file, check if it
exists as a
+ // directory to generate the appropriate error message.
+ auto check_if_location_exists_as_dir = [&]() -> Status {
+ auto no_trailing_slash = location;
+ no_trailing_slash.path = internal::RemoveTrailingSlash(location.path);
+ no_trailing_slash.all = internal::RemoveTrailingSlash(location.all);
+ ARROW_ASSIGN_OR_RAISE(auto file_info,
+ GetFileInfo(container_client, no_trailing_slash));
+ if (file_info.type() == FileType::NotFound) {
+ return require_file_to_exist ? PathNotFound(location) : Status::OK();
+ }
+ if (file_info.type() == FileType::Directory) {
+ return internal::NotAFile(location.all);
+ }
+ return internal::HasTrailingSlash(location.path) ?
internal::NotADir(location.all)
+ :
internal::NotAFile(location.all);
+ };
+
+ // Paths ending with trailing slashes are never leading to a deletion,
+ // but the correct error message requires a check of the path.
+ if (internal::HasTrailingSlash(location.path)) {
+ return check_if_location_exists_as_dir();
+ }
+
+ // If the parent directory of a file is not the container itself, there is
a
+ // risk that deleting the file also deletes the *implied directory* -- the
+ // directory that is implied by the existence of the file path.
+ //
+ // In this case, we must ensure that the deletion is not semantically
+ // equivalent to also deleting the directory. This is done by ensuring the
+ // directory marker blob exists before the file is deleted.
+ std::optional<LeaseGuard> file_blob_lease_guard;
+ const auto parent = location.parent();
+ if (!parent.path.empty()) {
+ // We have to check the existence of the file before checking the
+ // existence of the parent directory marker, so we acquire a lease on the
+ // file first.
+ ARROW_ASSIGN_OR_RAISE(auto file_blob_lease_client,
+ AcquireBlobLease(location, kFileBlobLeaseTime,
+ /*allow_missing=*/true));
+ if (file_blob_lease_client) {
+ file_blob_lease_guard.emplace(std::move(file_blob_lease_client),
+ kFileBlobLeaseTime);
+ // Ensure the empty directory marker blob of the parent exists before
the file is
+ // deleted.
+ //
+ // There is not need to hold a lease on the directory marker because if
+ // a concurrent client deletes the directory marker right after we
+ // create it, the file deletion itself won't be the cause of the
directory
+ // deletion. Additionally, the fact that a lease is held on the blob
path
+ // semantically preserves the directory -- its existence is implied
+ // until the blob representing the file is deleted -- even if another
+ // client deletes the directory marker.
+ RETURN_NOT_OK(EnsureEmptyDirExists(container_client, parent,
operation));
+ } else {
+ return check_if_location_exists_as_dir();
+ }
+ }
+
+ DCHECK(!internal::HasTrailingSlash(location.path));
Review Comment:
Can we remove this because we have `if
(internal::HasTrailingSlash(location.path)) {` above?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]