This is an automated email from the ASF dual-hosted git repository.

kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new a394a399ee GH-38699: [C++][FS][Azure] Implement `CreateDir()` (#38708)
a394a399ee is described below

commit a394a399ee0d0e25587b31b5c2d2abcdc0e67234
Author: Sutou Kouhei <[email protected]>
AuthorDate: Sat Nov 18 11:09:15 2023 +0900

    GH-38699: [C++][FS][Azure] Implement `CreateDir()` (#38708)
    
    ### Rationale for this change
    
    It seems that we can't create a directory explicitly without hierarchical 
namespace support.
    
    It seems that Azure Blob Storage supports only virtual directory. There is 
no directory. If a file (blob) name has "/", it's treated that the file (blob) 
exists under a virtual directory.
    
    It seems that Azure Data Lake Storage Gen2 supports a real directory.
    
    See also:
    
https://learn.microsoft.com/en-us/azure/storage/blobs/storage-blobs-introduction
    
    ### What changes are included in this PR?
    
    This change chooses the following behavior:
    
    * Container can be created with/without hierarchical namespace support.
    * Directory can be created with hierarchical namespace support.
    * Directory can't be created without hierarchical namespace support. So do 
nothing without hierachical namespace support. (`arrow::Status::OK()` is just 
returned.)
    
    ### Are these changes tested?
    
    Azurite doesn't support hierarchical namespace yet. So I can't test the 
implementation for hierarchical namespace yet. Sorry.
    
    ### Are there any user-facing changes?
    
    Yes.
    * Closes: #38699
    
    Lead-authored-by: Sutou Kouhei <[email protected]>
    Co-authored-by: Sutou Kouhei <[email protected]>
    Co-authored-by: Thomas Newton <[email protected]>
    Signed-off-by: Sutou Kouhei <[email protected]>
---
 cpp/src/arrow/filesystem/azurefs.cc      | 113 ++++++++++++++++++++++++++++++-
 cpp/src/arrow/filesystem/azurefs_test.cc | 113 ++++++++++++++++++++++++++++++-
 2 files changed, 224 insertions(+), 2 deletions(-)

diff --git a/cpp/src/arrow/filesystem/azurefs.cc 
b/cpp/src/arrow/filesystem/azurefs.cc
index 6359183d90..fdf119477a 100644
--- a/cpp/src/arrow/filesystem/azurefs.cc
+++ b/cpp/src/arrow/filesystem/azurefs.cc
@@ -148,6 +148,19 @@ Status ValidateFilePath(const AzurePath& path) {
   return Status::OK();
 }
 
+Status StatusFromErrorResponse(const std::string& url,
+                               Azure::Core::Http::RawResponse* raw_response,
+                               const std::string& context) {
+  const auto& body = raw_response->GetBody();
+  // There isn't an Azure specification that response body on error
+  // doesn't contain any binary data but we assume it. We hope that
+  // error response body has useful information for the error.
+  std::string_view body_text(reinterpret_cast<const char*>(body.data()), 
body.size());
+  return Status::IOError(context, ": ", url, ": ", 
raw_response->GetReasonPhrase(), " (",
+                         static_cast<int>(raw_response->GetStatusCode()),
+                         "): ", body_text);
+}
+
 template <typename ArrowType>
 std::string FormatValue(typename TypeTraits<ArrowType>::CType value) {
   struct StringAppender {
@@ -611,6 +624,99 @@ class AzureFileSystem::Impl {
     RETURN_NOT_OK(ptr->Init());
     return ptr;
   }
+
+  Status CreateDir(const AzurePath& path) {
+    if (path.container.empty()) {
+      return Status::Invalid("Cannot create an empty container");
+    }
+
+    if (path.path_to_file.empty()) {
+      auto container_client =
+          blob_service_client_->GetBlobContainerClient(path.container);
+      try {
+        auto response = container_client.Create();
+        if (response.Value.Created) {
+          return Status::OK();
+        } else {
+          return StatusFromErrorResponse(
+              container_client.GetUrl(), response.RawResponse.get(),
+              "Failed to create a container: " + path.container);
+        }
+      } catch (const Azure::Storage::StorageException& exception) {
+        return internal::ExceptionToStatus(
+            "Failed to create a container: " + path.container + ": " +
+                container_client.GetUrl(),
+            exception);
+      }
+    }
+
+    ARROW_ASSIGN_OR_RAISE(auto hierarchical_namespace_enabled,
+                          hierarchical_namespace_.Enabled(path.container));
+    if (!hierarchical_namespace_enabled) {
+      // Without hierarchical namespace enabled Azure blob storage has no 
directories.
+      // Therefore we can't, and don't need to create one. Simply creating a 
blob with `/`
+      // in the name implies directories.
+      return Status::OK();
+    }
+
+    auto directory_client = 
datalake_service_client_->GetFileSystemClient(path.container)
+                                .GetDirectoryClient(path.path_to_file);
+    try {
+      auto response = directory_client.Create();
+      if (response.Value.Created) {
+        return Status::OK();
+      } else {
+        return StatusFromErrorResponse(
+            directory_client.GetUrl(), response.RawResponse.get(),
+            "Failed to create a directory: " + path.path_to_file);
+      }
+    } catch (const Azure::Storage::StorageException& exception) {
+      return internal::ExceptionToStatus(
+          "Failed to create a directory: " + path.path_to_file + ": " +
+              directory_client.GetUrl(),
+          exception);
+    }
+  }
+
+  Status CreateDirRecursive(const AzurePath& path) {
+    if (path.container.empty()) {
+      return Status::Invalid("Cannot create an empty container");
+    }
+
+    auto container_client = 
blob_service_client_->GetBlobContainerClient(path.container);
+    try {
+      container_client.CreateIfNotExists();
+    } catch (const Azure::Storage::StorageException& exception) {
+      return internal::ExceptionToStatus(
+          "Failed to create a container: " + path.container + " (" +
+              container_client.GetUrl() + ")",
+          exception);
+    }
+
+    ARROW_ASSIGN_OR_RAISE(auto hierarchical_namespace_enabled,
+                          hierarchical_namespace_.Enabled(path.container));
+    if (!hierarchical_namespace_enabled) {
+      // We can't create a directory without hierarchical namespace
+      // support. There is only "virtual directory" without
+      // hierarchical namespace support. And a "virtual directory" is
+      // (virtually) created a blob with ".../.../blob" blob name
+      // automatically.
+      return Status::OK();
+    }
+
+    auto directory_client = 
datalake_service_client_->GetFileSystemClient(path.container)
+                                .GetDirectoryClient(path.path_to_file);
+    try {
+      directory_client.CreateIfNotExists();
+    } catch (const Azure::Storage::StorageException& exception) {
+      return internal::ExceptionToStatus(
+          "Failed to create a directory: " + path.path_to_file + " (" +
+              directory_client.GetUrl() + ")",
+          exception);
+    }
+
+    return Status::OK();
+  }
 };
 
 const AzureOptions& AzureFileSystem::options() const { return 
impl_->options(); }
@@ -636,7 +742,12 @@ Result<FileInfoVector> AzureFileSystem::GetFileInfo(const 
FileSelector& select)
 }
 
 Status AzureFileSystem::CreateDir(const std::string& path, bool recursive) {
-  return Status::NotImplemented("The Azure FileSystem is not fully 
implemented");
+  ARROW_ASSIGN_OR_RAISE(auto p, AzurePath::FromString(path));
+  if (recursive) {
+    return impl_->CreateDirRecursive(p);
+  } else {
+    return impl_->CreateDir(p);
+  }
 }
 
 Status AzureFileSystem::DeleteDir(const std::string& path) {
diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc 
b/cpp/src/arrow/filesystem/azurefs_test.cc
index c08a4b50b7..ecf0a19f68 100644
--- a/cpp/src/arrow/filesystem/azurefs_test.cc
+++ b/cpp/src/arrow/filesystem/azurefs_test.cc
@@ -49,6 +49,7 @@
 #include <azure/storage/common/storage_credential.hpp>
 #include <azure/storage/files/datalake.hpp>
 
+#include "arrow/filesystem/path_util.h"
 #include "arrow/filesystem/test_util.h"
 #include "arrow/result.h"
 #include "arrow/testing/gtest_util.h"
@@ -225,6 +226,10 @@ class AzureFileSystemTest : public ::testing::Test {
     return s;
   }
 
+  std::string RandomContainerName() { return RandomChars(32); }
+
+  std::string RandomDirectoryName() { return RandomChars(32); }
+
   void UploadLines(const std::vector<std::string>& lines, const char* 
path_to_file,
                    int total_size) {
     // TODO(GH-38333): Switch to using Azure filesystem to write once its 
implemented.
@@ -267,6 +272,22 @@ class AzureFlatNamespaceFileSystemTest : public 
AzureFileSystemTest {
   }
 };
 
+// How to enable this test:
+//
+// You need an Azure account. You should be able to create a free
+// account at https://azure.microsoft.com/en-gb/free/ . You should be
+// able to create a storage account through the portal Web UI.
+//
+// See also the official document how to create a storage account:
+// 
https://learn.microsoft.com/en-us/azure/storage/blobs/create-data-lake-storage-account
+//
+// A few suggestions on configuration:
+//
+// * Use Standard general-purpose v2 not premium
+// * Use LRS redundancy
+// * Obviously you need to enable hierarchical namespace.
+// * Set the default access tier to hot
+// * SFTP, NFS and file shares are not required.
 class AzureHierarchicalNamespaceFileSystemTest : public AzureFileSystemTest {
   Result<AzureOptions> MakeOptions() override {
     AzureOptions options;
@@ -396,6 +417,96 @@ TEST_F(AzureHierarchicalNamespaceFileSystemTest, 
GetFileInfoObject) {
   RunGetFileInfoObjectTest();
 }
 
+TEST_F(AzuriteFileSystemTest, CreateDirFailureNoContainer) {
+  ASSERT_RAISES(Invalid, fs_->CreateDir("", false));
+}
+
+TEST_F(AzuriteFileSystemTest, CreateDirSuccessContainerOnly) {
+  auto container_name = RandomContainerName();
+  ASSERT_OK(fs_->CreateDir(container_name, false));
+  arrow::fs::AssertFileInfo(fs_.get(), container_name, FileType::Directory);
+}
+
+TEST_F(AzuriteFileSystemTest, CreateDirSuccessContainerAndDirectory) {
+  const auto path = PreexistingContainerPath() + RandomDirectoryName();
+  ASSERT_OK(fs_->CreateDir(path, false));
+  // There is only virtual directory without hierarchical namespace
+  // support. So the CreateDir() does nothing.
+  arrow::fs::AssertFileInfo(fs_.get(), path, FileType::NotFound);
+}
+
+TEST_F(AzureHierarchicalNamespaceFileSystemTest, 
CreateDirSuccessContainerAndDirectory) {
+  const auto path = PreexistingContainerPath() + RandomDirectoryName();
+  ASSERT_OK(fs_->CreateDir(path, false));
+  arrow::fs::AssertFileInfo(fs_.get(), path, FileType::Directory);
+}
+
+TEST_F(AzuriteFileSystemTest, CreateDirFailureDirectoryWithMissingContainer) {
+  const auto path = std::string("not-a-container/new-directory");
+  ASSERT_RAISES(IOError, fs_->CreateDir(path, false));
+}
+
+TEST_F(AzuriteFileSystemTest, CreateDirRecursiveFailureNoContainer) {
+  ASSERT_RAISES(Invalid, fs_->CreateDir("", true));
+}
+
+TEST_F(AzureHierarchicalNamespaceFileSystemTest, 
CreateDirRecursiveSuccessContainerOnly) {
+  auto container_name = RandomContainerName();
+  ASSERT_OK(fs_->CreateDir(container_name, true));
+  arrow::fs::AssertFileInfo(fs_.get(), container_name, FileType::Directory);
+}
+
+TEST_F(AzuriteFileSystemTest, CreateDirRecursiveSuccessContainerOnly) {
+  auto container_name = RandomContainerName();
+  ASSERT_OK(fs_->CreateDir(container_name, true));
+  arrow::fs::AssertFileInfo(fs_.get(), container_name, FileType::Directory);
+}
+
+TEST_F(AzureHierarchicalNamespaceFileSystemTest, 
CreateDirRecursiveSuccessDirectoryOnly) {
+  const auto parent = PreexistingContainerPath() + RandomDirectoryName();
+  const auto path = internal::ConcatAbstractPath(parent, "new-sub");
+  ASSERT_OK(fs_->CreateDir(path, true));
+  arrow::fs::AssertFileInfo(fs_.get(), path, FileType::Directory);
+  arrow::fs::AssertFileInfo(fs_.get(), parent, FileType::Directory);
+}
+
+TEST_F(AzuriteFileSystemTest, CreateDirRecursiveSuccessDirectoryOnly) {
+  const auto parent = PreexistingContainerPath() + RandomDirectoryName();
+  const auto path = internal::ConcatAbstractPath(parent, "new-sub");
+  ASSERT_OK(fs_->CreateDir(path, true));
+  // There is only virtual directory without hierarchical namespace
+  // support. So the CreateDir() does nothing.
+  arrow::fs::AssertFileInfo(fs_.get(), path, FileType::NotFound);
+  arrow::fs::AssertFileInfo(fs_.get(), parent, FileType::NotFound);
+}
+
+TEST_F(AzureHierarchicalNamespaceFileSystemTest,
+       CreateDirRecursiveSuccessContainerAndDirectory) {
+  auto container_name = RandomContainerName();
+  const auto parent = internal::ConcatAbstractPath(container_name, 
RandomDirectoryName());
+  const auto path = internal::ConcatAbstractPath(parent, "new-sub");
+  ASSERT_OK(fs_->CreateDir(path, true));
+  arrow::fs::AssertFileInfo(fs_.get(), path, FileType::Directory);
+  arrow::fs::AssertFileInfo(fs_.get(), parent, FileType::Directory);
+  arrow::fs::AssertFileInfo(fs_.get(), container_name, FileType::Directory);
+}
+
+TEST_F(AzuriteFileSystemTest, CreateDirRecursiveSuccessContainerAndDirectory) {
+  auto container_name = RandomContainerName();
+  const auto parent = internal::ConcatAbstractPath(container_name, 
RandomDirectoryName());
+  const auto path = internal::ConcatAbstractPath(parent, "new-sub");
+  ASSERT_OK(fs_->CreateDir(path, true));
+  // There is only virtual directory without hierarchical namespace
+  // support. So the CreateDir() does nothing.
+  arrow::fs::AssertFileInfo(fs_.get(), path, FileType::NotFound);
+  arrow::fs::AssertFileInfo(fs_.get(), parent, FileType::NotFound);
+  arrow::fs::AssertFileInfo(fs_.get(), container_name, FileType::Directory);
+}
+
+TEST_F(AzuriteFileSystemTest, CreateDirUri) {
+  ASSERT_RAISES(Invalid, fs_->CreateDir("abfs://" + RandomContainerName(), 
true));
+}
+
 TEST_F(AzuriteFileSystemTest, OpenInputStreamString) {
   std::shared_ptr<io::InputStream> stream;
   ASSERT_OK_AND_ASSIGN(stream, fs_->OpenInputStream(PreexistingObjectPath()));
@@ -455,7 +566,7 @@ TEST_F(AzuriteFileSystemTest, OpenInputStreamInfoInvalid) {
 }
 
 TEST_F(AzuriteFileSystemTest, OpenInputStreamUri) {
-  ASSERT_RAISES(Invalid, fs_->OpenInputStream("abfss://" + 
PreexistingObjectPath()));
+  ASSERT_RAISES(Invalid, fs_->OpenInputStream("abfs://" + 
PreexistingObjectPath()));
 }
 
 TEST_F(AzuriteFileSystemTest, OpenInputStreamTrailingSlash) {

Reply via email to