This is an automated email from the ASF dual-hosted git repository.
kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new a394a399ee GH-38699: [C++][FS][Azure] Implement `CreateDir()` (#38708)
a394a399ee is described below
commit a394a399ee0d0e25587b31b5c2d2abcdc0e67234
Author: Sutou Kouhei <[email protected]>
AuthorDate: Sat Nov 18 11:09:15 2023 +0900
GH-38699: [C++][FS][Azure] Implement `CreateDir()` (#38708)
### Rationale for this change
It seems that we can't create a directory explicitly without hierarchical
namespace support.
It seems that Azure Blob Storage supports only virtual directories. There are
no real directories. If a file (blob) name contains "/", the file (blob) is
treated as existing under a virtual directory.
It seems that Azure Data Lake Storage Gen2 supports real directories.
See also:
https://learn.microsoft.com/en-us/azure/storage/blobs/storage-blobs-introduction
### What changes are included in this PR?
This change chooses the following behavior:
* Container can be created with/without hierarchical namespace support.
* Directory can be created with hierarchical namespace support.
* Directory can't be created without hierarchical namespace support. So we do
nothing without hierarchical namespace support. (`arrow::Status::OK()` is just
returned.)
### Are these changes tested?
Azurite doesn't support hierarchical namespace yet. So I can't test the
implementation for hierarchical namespace yet. Sorry.
### Are there any user-facing changes?
Yes.
* Closes: #38699
Lead-authored-by: Sutou Kouhei <[email protected]>
Co-authored-by: Sutou Kouhei <[email protected]>
Co-authored-by: Thomas Newton <[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
---
cpp/src/arrow/filesystem/azurefs.cc | 113 ++++++++++++++++++++++++++++++-
cpp/src/arrow/filesystem/azurefs_test.cc | 113 ++++++++++++++++++++++++++++++-
2 files changed, 224 insertions(+), 2 deletions(-)
diff --git a/cpp/src/arrow/filesystem/azurefs.cc
b/cpp/src/arrow/filesystem/azurefs.cc
index 6359183d90..fdf119477a 100644
--- a/cpp/src/arrow/filesystem/azurefs.cc
+++ b/cpp/src/arrow/filesystem/azurefs.cc
@@ -148,6 +148,19 @@ Status ValidateFilePath(const AzurePath& path) {
return Status::OK();
}
+Status StatusFromErrorResponse(const std::string& url,
+ Azure::Core::Http::RawResponse* raw_response,
+ const std::string& context) {
+ const auto& body = raw_response->GetBody();
+ // There isn't an Azure specification that response body on error
+ // doesn't contain any binary data but we assume it. We hope that
+ // error response body has useful information for the error.
+ std::string_view body_text(reinterpret_cast<const char*>(body.data()),
body.size());
+ return Status::IOError(context, ": ", url, ": ",
raw_response->GetReasonPhrase(), " (",
+ static_cast<int>(raw_response->GetStatusCode()),
+ "): ", body_text);
+}
+
template <typename ArrowType>
std::string FormatValue(typename TypeTraits<ArrowType>::CType value) {
struct StringAppender {
@@ -611,6 +624,99 @@ class AzureFileSystem::Impl {
RETURN_NOT_OK(ptr->Init());
return ptr;
}
+
+ Status CreateDir(const AzurePath& path) {
+ if (path.container.empty()) {
+ return Status::Invalid("Cannot create an empty container");
+ }
+
+ if (path.path_to_file.empty()) {
+ auto container_client =
+ blob_service_client_->GetBlobContainerClient(path.container);
+ try {
+ auto response = container_client.Create();
+ if (response.Value.Created) {
+ return Status::OK();
+ } else {
+ return StatusFromErrorResponse(
+ container_client.GetUrl(), response.RawResponse.get(),
+ "Failed to create a container: " + path.container);
+ }
+ } catch (const Azure::Storage::StorageException& exception) {
+ return internal::ExceptionToStatus(
+ "Failed to create a container: " + path.container + ": " +
+ container_client.GetUrl(),
+ exception);
+ }
+ }
+
+ ARROW_ASSIGN_OR_RAISE(auto hierarchical_namespace_enabled,
+ hierarchical_namespace_.Enabled(path.container));
+ if (!hierarchical_namespace_enabled) {
+ // Without hierarchical namespace enabled Azure blob storage has no
directories.
+ // Therefore we can't, and don't need to create one. Simply creating a
blob with `/`
+ // in the name implies directories.
+ return Status::OK();
+ }
+
+ auto directory_client =
datalake_service_client_->GetFileSystemClient(path.container)
+ .GetDirectoryClient(path.path_to_file);
+ try {
+ auto response = directory_client.Create();
+ if (response.Value.Created) {
+ return Status::OK();
+ } else {
+ return StatusFromErrorResponse(
+ directory_client.GetUrl(), response.RawResponse.get(),
+ "Failed to create a directory: " + path.path_to_file);
+ }
+ } catch (const Azure::Storage::StorageException& exception) {
+ return internal::ExceptionToStatus(
+ "Failed to create a directory: " + path.path_to_file + ": " +
+ directory_client.GetUrl(),
+ exception);
+ }
+ }
+
+ Status CreateDirRecursive(const AzurePath& path) {
+ if (path.container.empty()) {
+ return Status::Invalid("Cannot create an empty container");
+ }
+
+ auto container_client =
blob_service_client_->GetBlobContainerClient(path.container);
+ try {
+ container_client.CreateIfNotExists();
+ } catch (const Azure::Storage::StorageException& exception) {
+ return internal::ExceptionToStatus(
+ "Failed to create a container: " + path.container + " (" +
+ container_client.GetUrl() + ")",
+ exception);
+ }
+
+ ARROW_ASSIGN_OR_RAISE(auto hierarchical_namespace_enabled,
+ hierarchical_namespace_.Enabled(path.container));
+ if (!hierarchical_namespace_enabled) {
+ // We can't create a directory without hierarchical namespace
+ // support. There is only "virtual directory" without
+ // hierarchical namespace support. And a "virtual directory" is
+ // (virtually) created a blob with ".../.../blob" blob name
+ // automatically.
+ return Status::OK();
+ }
+
+ auto directory_client =
datalake_service_client_->GetFileSystemClient(path.container)
+ .GetDirectoryClient(path.path_to_file);
+ try {
+ directory_client.CreateIfNotExists();
+ } catch (const Azure::Storage::StorageException& exception) {
+ return internal::ExceptionToStatus(
+ "Failed to create a directory: " + path.path_to_file + " (" +
+ directory_client.GetUrl() + ")",
+ exception);
+ }
+
+ return Status::OK();
+ }
};
const AzureOptions& AzureFileSystem::options() const { return
impl_->options(); }
@@ -636,7 +742,12 @@ Result<FileInfoVector> AzureFileSystem::GetFileInfo(const
FileSelector& select)
}
Status AzureFileSystem::CreateDir(const std::string& path, bool recursive) {
- return Status::NotImplemented("The Azure FileSystem is not fully
implemented");
+ ARROW_ASSIGN_OR_RAISE(auto p, AzurePath::FromString(path));
+ if (recursive) {
+ return impl_->CreateDirRecursive(p);
+ } else {
+ return impl_->CreateDir(p);
+ }
}
Status AzureFileSystem::DeleteDir(const std::string& path) {
diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc
b/cpp/src/arrow/filesystem/azurefs_test.cc
index c08a4b50b7..ecf0a19f68 100644
--- a/cpp/src/arrow/filesystem/azurefs_test.cc
+++ b/cpp/src/arrow/filesystem/azurefs_test.cc
@@ -49,6 +49,7 @@
#include <azure/storage/common/storage_credential.hpp>
#include <azure/storage/files/datalake.hpp>
+#include "arrow/filesystem/path_util.h"
#include "arrow/filesystem/test_util.h"
#include "arrow/result.h"
#include "arrow/testing/gtest_util.h"
@@ -225,6 +226,10 @@ class AzureFileSystemTest : public ::testing::Test {
return s;
}
+ std::string RandomContainerName() { return RandomChars(32); }
+
+ std::string RandomDirectoryName() { return RandomChars(32); }
+
void UploadLines(const std::vector<std::string>& lines, const char*
path_to_file,
int total_size) {
// TODO(GH-38333): Switch to using Azure filesystem to write once its
implemented.
@@ -267,6 +272,22 @@ class AzureFlatNamespaceFileSystemTest : public
AzureFileSystemTest {
}
};
+// How to enable this test:
+//
+// You need an Azure account. You should be able to create a free
+// account at https://azure.microsoft.com/en-gb/free/ . You should be
+// able to create a storage account through the portal Web UI.
+//
+// See also the official document how to create a storage account:
+//
https://learn.microsoft.com/en-us/azure/storage/blobs/create-data-lake-storage-account
+//
+// A few suggestions on configuration:
+//
+// * Use Standard general-purpose v2 not premium
+// * Use LRS redundancy
+// * Obviously you need to enable hierarchical namespace.
+// * Set the default access tier to hot
+// * SFTP, NFS and file shares are not required.
class AzureHierarchicalNamespaceFileSystemTest : public AzureFileSystemTest {
Result<AzureOptions> MakeOptions() override {
AzureOptions options;
@@ -396,6 +417,96 @@ TEST_F(AzureHierarchicalNamespaceFileSystemTest,
GetFileInfoObject) {
RunGetFileInfoObjectTest();
}
+TEST_F(AzuriteFileSystemTest, CreateDirFailureNoContainer) {
+ ASSERT_RAISES(Invalid, fs_->CreateDir("", false));
+}
+
+TEST_F(AzuriteFileSystemTest, CreateDirSuccessContainerOnly) {
+ auto container_name = RandomContainerName();
+ ASSERT_OK(fs_->CreateDir(container_name, false));
+ arrow::fs::AssertFileInfo(fs_.get(), container_name, FileType::Directory);
+}
+
+TEST_F(AzuriteFileSystemTest, CreateDirSuccessContainerAndDirectory) {
+ const auto path = PreexistingContainerPath() + RandomDirectoryName();
+ ASSERT_OK(fs_->CreateDir(path, false));
+ // There is only virtual directory without hierarchical namespace
+ // support. So the CreateDir() does nothing.
+ arrow::fs::AssertFileInfo(fs_.get(), path, FileType::NotFound);
+}
+
+TEST_F(AzureHierarchicalNamespaceFileSystemTest,
CreateDirSuccessContainerAndDirectory) {
+ const auto path = PreexistingContainerPath() + RandomDirectoryName();
+ ASSERT_OK(fs_->CreateDir(path, false));
+ arrow::fs::AssertFileInfo(fs_.get(), path, FileType::Directory);
+}
+
+TEST_F(AzuriteFileSystemTest, CreateDirFailureDirectoryWithMissingContainer) {
+ const auto path = std::string("not-a-container/new-directory");
+ ASSERT_RAISES(IOError, fs_->CreateDir(path, false));
+}
+
+TEST_F(AzuriteFileSystemTest, CreateDirRecursiveFailureNoContainer) {
+ ASSERT_RAISES(Invalid, fs_->CreateDir("", true));
+}
+
+TEST_F(AzureHierarchicalNamespaceFileSystemTest,
CreateDirRecursiveSuccessContainerOnly) {
+ auto container_name = RandomContainerName();
+ ASSERT_OK(fs_->CreateDir(container_name, true));
+ arrow::fs::AssertFileInfo(fs_.get(), container_name, FileType::Directory);
+}
+
+TEST_F(AzuriteFileSystemTest, CreateDirRecursiveSuccessContainerOnly) {
+ auto container_name = RandomContainerName();
+ ASSERT_OK(fs_->CreateDir(container_name, true));
+ arrow::fs::AssertFileInfo(fs_.get(), container_name, FileType::Directory);
+}
+
+TEST_F(AzureHierarchicalNamespaceFileSystemTest,
CreateDirRecursiveSuccessDirectoryOnly) {
+ const auto parent = PreexistingContainerPath() + RandomDirectoryName();
+ const auto path = internal::ConcatAbstractPath(parent, "new-sub");
+ ASSERT_OK(fs_->CreateDir(path, true));
+ arrow::fs::AssertFileInfo(fs_.get(), path, FileType::Directory);
+ arrow::fs::AssertFileInfo(fs_.get(), parent, FileType::Directory);
+}
+
+TEST_F(AzuriteFileSystemTest, CreateDirRecursiveSuccessDirectoryOnly) {
+ const auto parent = PreexistingContainerPath() + RandomDirectoryName();
+ const auto path = internal::ConcatAbstractPath(parent, "new-sub");
+ ASSERT_OK(fs_->CreateDir(path, true));
+ // There is only virtual directory without hierarchical namespace
+ // support. So the CreateDir() does nothing.
+ arrow::fs::AssertFileInfo(fs_.get(), path, FileType::NotFound);
+ arrow::fs::AssertFileInfo(fs_.get(), parent, FileType::NotFound);
+}
+
+TEST_F(AzureHierarchicalNamespaceFileSystemTest,
+ CreateDirRecursiveSuccessContainerAndDirectory) {
+ auto container_name = RandomContainerName();
+ const auto parent = internal::ConcatAbstractPath(container_name,
RandomDirectoryName());
+ const auto path = internal::ConcatAbstractPath(parent, "new-sub");
+ ASSERT_OK(fs_->CreateDir(path, true));
+ arrow::fs::AssertFileInfo(fs_.get(), path, FileType::Directory);
+ arrow::fs::AssertFileInfo(fs_.get(), parent, FileType::Directory);
+ arrow::fs::AssertFileInfo(fs_.get(), container_name, FileType::Directory);
+}
+
+TEST_F(AzuriteFileSystemTest, CreateDirRecursiveSuccessContainerAndDirectory) {
+ auto container_name = RandomContainerName();
+ const auto parent = internal::ConcatAbstractPath(container_name,
RandomDirectoryName());
+ const auto path = internal::ConcatAbstractPath(parent, "new-sub");
+ ASSERT_OK(fs_->CreateDir(path, true));
+ // There is only virtual directory without hierarchical namespace
+ // support. So the CreateDir() does nothing.
+ arrow::fs::AssertFileInfo(fs_.get(), path, FileType::NotFound);
+ arrow::fs::AssertFileInfo(fs_.get(), parent, FileType::NotFound);
+ arrow::fs::AssertFileInfo(fs_.get(), container_name, FileType::Directory);
+}
+
+TEST_F(AzuriteFileSystemTest, CreateDirUri) {
+ ASSERT_RAISES(Invalid, fs_->CreateDir("abfs://" + RandomContainerName(),
true));
+}
+
TEST_F(AzuriteFileSystemTest, OpenInputStreamString) {
std::shared_ptr<io::InputStream> stream;
ASSERT_OK_AND_ASSIGN(stream, fs_->OpenInputStream(PreexistingObjectPath()));
@@ -455,7 +566,7 @@ TEST_F(AzuriteFileSystemTest, OpenInputStreamInfoInvalid) {
}
TEST_F(AzuriteFileSystemTest, OpenInputStreamUri) {
- ASSERT_RAISES(Invalid, fs_->OpenInputStream("abfss://" +
PreexistingObjectPath()));
+ ASSERT_RAISES(Invalid, fs_->OpenInputStream("abfs://" +
PreexistingObjectPath()));
}
TEST_F(AzuriteFileSystemTest, OpenInputStreamTrailingSlash) {