Tom-Newton commented on code in PR #39361:
URL: https://github.com/apache/arrow/pull/39361#discussion_r1435827376


##########
cpp/src/arrow/filesystem/azurefs.cc:
##########
@@ -1404,11 +1428,76 @@ class AzureFileSystem::Impl {
     return stream;
   }
 
- private:
-  Status DeleteDirContentsWithoutHierarchicalNamespace(const AzureLocation& 
location,
-                                                       bool missing_dir_ok) {
-    auto container_client =
-        blob_service_client_->GetBlobContainerClient(location.container);
+  /// This function assumes the container already exists. So it can only be
+  /// called after that has been verified.
+  ///
+  /// \pre location.container is not empty.
+  /// \pre The location.container container already exists.
+  Status EnsureEmptyDirExists(const Blobs::BlobContainerClient& 
container_client,
+                              const AzureLocation& location) {
+    DCHECK(!location.container.empty());
+    if (location.path.empty()) {
+      // Nothing to do. The container already exists per the preconditions.
+      return Status::OK();
+    }
+    auto dir_marker_blob_path = internal::EnsureTrailingSlash(location.path);
+    auto block_blob_client =
+        
container_client.GetBlobClient(dir_marker_blob_path).AsBlockBlobClient();
+    try {
+      block_blob_client.UploadFrom(nullptr, 0);
+      return Status::OK();
+    } catch (const Storage::StorageException& exception) {
+      return ExceptionToStatus(exception);

Review Comment:
   Could you add a bit more information to this error so it will be clear what 
the filesystem was trying to do when the error occurred. 



##########
cpp/src/arrow/filesystem/azurefs.cc:
##########
@@ -1288,97 +1328,81 @@ class AzureFileSystem::Impl {
     return ptr;
   }
 
-  Status CreateDir(const AzureLocation& location) {
-    if (location.container.empty()) {
-      return Status::Invalid("CreateDir requires a non-empty path.");
-    }
-
-    auto container_client =
-        blob_service_client_->GetBlobContainerClient(location.container);
-    if (location.path.empty()) {
-      try {
-        auto response = container_client.Create();
-        return response.Value.Created
-                   ? Status::OK()
-                   : Status::AlreadyExists("Directory already exists: " + 
location.all);
-      } catch (const Storage::StorageException& exception) {
-        return ExceptionToStatus(exception,
-                                 "Failed to create a container: ", 
location.container,
-                                 ": ", container_client.GetUrl());
-      }
-    }
-
-    auto adlfs_client = 
datalake_service_client_->GetFileSystemClient(location.container);
-    ARROW_ASSIGN_OR_RAISE(auto hns_support, 
HierarchicalNamespaceSupport(adlfs_client));
-    if (hns_support == HNSSupport::kContainerNotFound) {
-      return PathNotFound(location);
-    }
-    if (hns_support == HNSSupport::kDisabled) {
-      ARROW_ASSIGN_OR_RAISE(
-          auto container_info,
-          GetContainerPropsAsFileInfo(location.container, container_client));
-      if (container_info.type() == FileType::NotFound) {
-        return PathNotFound(location);
-      }
-      // Without hierarchical namespace enabled Azure blob storage has no 
directories.
-      // Therefore we can't, and don't need to create one. Simply creating a 
blob with `/`
-      // in the name implies directories.
-      return Status::OK();
-    }
-
-    auto directory_client = adlfs_client.GetDirectoryClient(location.path);
-    try {
-      auto response = directory_client.Create();
-      if (response.Value.Created) {
-        return Status::OK();
-      } else {
-        return StatusFromErrorResponse(directory_client.GetUrl(), 
*response.RawResponse,
-                                       "Failed to create a directory: " + 
location.path);
+ private:
+  /// This function cannot assume the filesystem/container already exists.
+  ///
+  /// \pre location.container is not empty.
+  /// \pre location.path is not empty.
+  template <class ContainerClient, class GetDirectoryClient, class 
CreateDirIfNotExists>
+  Status CreateDirTemplate(const ContainerClient& container_client,
+                           GetDirectoryClient&& get_directory_client,
+                           CreateDirIfNotExists&& create_if_not_exists,
+                           const AzureLocation& location, bool recursive) {
+    DCHECK(!location.container.empty());
+    DCHECK(!location.path.empty());
+    // Non-recursive CreateDir calls require the parent directory to exist.
+    if (!recursive) {
+      auto parent = location.parent();
+      if (!parent.path.empty()) {
+        RETURN_NOT_OK(CheckDirExists(container_client, parent));
       }
-    } catch (const Storage::StorageException& exception) {
-      return ExceptionToStatus(exception, "Failed to create a directory: ", 
location.path,
-                               ": ", directory_client.GetUrl());
-    }
-  }
-
-  Status CreateDirRecursive(const AzureLocation& location) {
-    if (location.container.empty()) {
-      return Status::Invalid("CreateDir requires a non-empty path.");
+      // If the parent location is just the container, we don't need to check 
if it
+      // exists because the operation we perform below will fail if the 
container
+      // doesn't exist and we can handle that error according to the recursive 
flag.
     }
-
-    auto container_client =
-        blob_service_client_->GetBlobContainerClient(location.container);
+    auto directory_client = get_directory_client(container_client, location);
     try {
-      container_client.CreateIfNotExists();
-    } catch (const Storage::StorageException& exception) {
-      return ExceptionToStatus(exception,
-                               "Failed to create a container: ", 
location.container, " (",
-                               container_client.GetUrl(), ")");
-    }
-
-    auto adlfs_client = 
datalake_service_client_->GetFileSystemClient(location.container);
-    ARROW_ASSIGN_OR_RAISE(auto hns_support, 
HierarchicalNamespaceSupport(adlfs_client));
-    if (hns_support == HNSSupport::kDisabled) {
-      // Without hierarchical namespace enabled Azure blob storage has no 
directories.
-      // Therefore we can't, and don't need to create one. Simply creating a 
blob with `/`
-      // in the name implies directories.
+      create_if_not_exists(directory_client);
       return Status::OK();
-    }
-    // Don't handle HNSSupport::kContainerNotFound, just assume it still 
exists (because
-    // it was created above) and try to create the directory.
-
-    if (!location.path.empty()) {
-      auto directory_client = adlfs_client.GetDirectoryClient(location.path);
-      try {
-        directory_client.CreateIfNotExists();
-      } catch (const Storage::StorageException& exception) {
-        return ExceptionToStatus(exception,
-                                 "Failed to create a directory: ", 
location.path, " (",
-                                 directory_client.GetUrl(), ")");
+    } catch (const Storage::StorageException& exception) {
+      if (IsContainerNotFound(exception)) {
+        try {
+          if (recursive) {
+            container_client.CreateIfNotExists();
+            create_if_not_exists(directory_client);
+            return Status::OK();
+          } else {
+            auto parent = location.parent();
+            return PathNotFound(parent);
+          }
+        } catch (const Storage::StorageException& second_exception) {
+          return ExceptionToStatus(second_exception);
+        }
       }
+      return ExceptionToStatus(exception);
     }
+  }
 
-    return Status::OK();
+ public:
+  /// This function cannot assume the filesystem already exists.
+  ///
+  /// \pre location.container is not empty.
+  /// \pre location.path is not empty.
+  Status CreateDirOnFileSystem(const DataLake::DataLakeFileSystemClient& 
adlfs_client,
+                               const AzureLocation& location, bool recursive) {
+    return CreateDirTemplate(
+        adlfs_client,
+        [](auto& adlfs_client, auto& location) {
+          return adlfs_client.GetDirectoryClient(location.path);
+        },
+        [](auto& directory_client) { directory_client.CreateIfNotExists(); }, 
location,
+        recursive);
+  }
+
+  /// This function cannot assume the container already exists.
+  ///
+  /// \pre location.container is not empty.
+  /// \pre location.path is not empty.
+  Status CreateDirOnContainer(const Blobs::BlobContainerClient& 
container_client,
+                              const AzureLocation& location, bool recursive) {
+    return CreateDirTemplate(
+        container_client,
+        [](auto& container_client, auto& location) {
+          auto dir_marker_blob_path = 
internal::EnsureTrailingSlash(location.path);
+          return 
container_client.GetBlobClient(dir_marker_blob_path).AsBlockBlobClient();
+        },
+        [](auto& block_blob_client) { block_blob_client.UploadFrom(nullptr, 
0); },

Review Comment:
   I think it would be a good idea to put some metadata on all directory marker 
blobs we create. That would allow us to add a simple checks in 
`ObjectInputFile::Init` and `ObjectAppendStream::Init` to ensure they are not 
being initialised as directories. 
   
   Additionally I think this will not be compatible with `adlfs` if it does not 
add metadata to directory markers. I think its very likely that people will end 
up using a combination of this arrow filesystem and `adlfs`. 
   
   



##########
cpp/src/arrow/filesystem/azurefs_test.cc:
##########
@@ -566,86 +574,259 @@ class TestAzureFileSystem : public ::testing::Test {
     return env->WithHierarchicalNamespace();
   }
 
+  constexpr static const char* const kSubmitBatchBugMessage =
+      "This test is affected by an Azurite isse: "

Review Comment:
   typo
   ```suggestion
         "This test is affected by an Azurite issue: "
   ```



##########
cpp/src/arrow/filesystem/azurefs.cc:
##########
@@ -1033,47 +1042,78 @@ class AzureFileSystem::Impl {
       return info;
     } catch (const Storage::StorageException& exception) {
       if (exception.StatusCode == Http::HttpStatusCode::NotFound) {
-        ARROW_ASSIGN_OR_RAISE(auto hns_support,
-                              HierarchicalNamespaceSupport(adlfs_client));
-        if (hns_support == HNSSupport::kContainerNotFound ||
-            hns_support == HNSSupport::kEnabled) {
-          // If the hierarchical namespace is enabled, then the storage 
account will
-          // have explicit directories. Neither a file nor a directory was 
found.
-          info.set_type(FileType::NotFound);
+        return FileInfo{location.all, FileType::NotFound};
+      }
+      return ExceptionToStatus(
+          exception, "GetProperties for '", file_client.GetUrl(),
+          "' failed. GetFileInfo is unable to determine whether the path 
exists.");
+    }
+  }
+
+  /// On flat namespace accounts there are no real directories. Directories are
+  /// implied by empty directory marker blobs with names ending in "/" or there
+  /// being blobs with names starting with the directory path.
+  ///
+  /// \pre location.path is not empty.
+  Result<FileInfo> GetFileInfo(const Blobs::BlobContainerClient& 
container_client,
+                               const AzureLocation& location) {
+    DCHECK(!location.path.empty());
+    Blobs::ListBlobsOptions options;
+    options.Prefix = internal::RemoveTrailingSlash(location.path);
+    options.PageSizeHint = 1;
+
+    try {
+      FileInfo info{location.all};
+      auto list_response = container_client.ListBlobsByHierarchy(kDelimiter, 
options);
+      if (!list_response.BlobPrefixes.empty()) {

Review Comment:
   Could you add a comment to explain when `list_response.BlobPrefixes` will be 
non-empty compared to when `list_response.BlobPrefixes` is non-empty. 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to