Tom-Newton commented on code in PR #38269:
URL: https://github.com/apache/arrow/pull/38269#discussion_r1364449056
##########
cpp/src/arrow/filesystem/azurefs.cc:
##########
@@ -37,34 +43,330 @@ bool AzureOptions::Equals(const AzureOptions& other) const
{
credentials_kind == other.credentials_kind);
}
+Status AzureOptions::ConfigureAccountKeyCredentials(const std::string&
account_name,
+ const std::string&
account_key) {
+ if (this->backend == AzureBackend::Azurite) {
+ account_blob_url = "http://127.0.0.1:10000/" + account_name + "/";
+ account_dfs_url = "http://127.0.0.1:10000/" + account_name + "/";
+ } else {
+ account_dfs_url = "https://" + account_name + ".dfs.core.windows.net/";
+ account_blob_url = "https://" + account_name + ".blob.core.windows.net/";
+ }
+ storage_credentials_provider =
+
std::make_shared<Azure::Storage::StorageSharedKeyCredential>(account_name,
+
account_key);
+ credentials_kind = AzureCredentialsKind::StorageCredentials;
+ return Status::OK();
+}
+namespace {
+
+// An AzureFileSystem represents a single Azure storage account. AzurePath
describes a
+// container and path within that storage account.
+struct AzurePath {
+ std::string full_path;
+ std::string container;
+ std::string path_to_file;
+ std::vector<std::string> path_to_file_parts;
+
+ static Result<AzurePath> FromString(const std::string& s) {
+ // Example expected string format: testcontainer/testdir/testfile.txt
+ // container = testcontainer
+ // path_to_file = testdir/testfile.txt
+ // path_to_file_parts = [testdir, testfile.txt]
+ if (internal::IsLikelyUri(s)) {
+ return Status::Invalid(
+ "Expected an Azure object path of the form 'container/path...', got
a URI: '",
+ s, "'");
+ }
+ auto src = internal::RemoveTrailingSlash(s);
+ auto input_path = std::string(src.data());
+ src = internal::RemoveLeadingSlash(src);
+ auto first_sep = src.find_first_of(internal::kSep);
+ if (first_sep == 0) {
+ return Status::Invalid("Path cannot start with a separator ('",
input_path, "')");
+ }
+ if (first_sep == std::string::npos) {
+ return AzurePath{std::string(src), std::string(src), "", {}};
+ }
+ AzurePath path;
+ path.full_path = std::string(src);
+ path.container = std::string(src.substr(0, first_sep));
+ path.path_to_file = std::string(src.substr(first_sep + 1));
+ path.path_to_file_parts = internal::SplitAbstractPath(path.path_to_file);
+ RETURN_NOT_OK(Validate(path));
+ return path;
+ }
+
+ static Status Validate(const AzurePath& path) {
+ auto status = internal::ValidateAbstractPathParts(path.path_to_file_parts);
+ if (!status.ok()) {
+ return Status::Invalid(status.message(), " in path ", path.full_path);
+ } else {
+ return status;
+ }
+ }
+
+ AzurePath parent() const {
+ DCHECK(has_parent());
+ auto parent = AzurePath{"", container, "", path_to_file_parts};
+ parent.path_to_file_parts.pop_back();
+ parent.path_to_file =
internal::JoinAbstractPath(parent.path_to_file_parts);
+ if (parent.path_to_file.empty()) {
+ parent.full_path = parent.container;
+ } else {
+ parent.full_path = parent.container + internal::kSep +
parent.path_to_file;
+ }
+ return parent;
+ }
+
+ bool has_parent() const { return !path_to_file.empty(); }
+
+ bool empty() const { return container.empty() && path_to_file.empty(); }
+
+ bool operator==(const AzurePath& other) const {
+ return container == other.container && path_to_file == other.path_to_file;
+ }
+};
+
+Status PathNotFound(const AzurePath& path) {
+ return ::arrow::fs::internal::PathNotFound(path.full_path);
+}
+
+Status NotAFile(const AzurePath& path) {
+ return ::arrow::fs::internal::NotAFile(path.full_path);
+}
+
+Status ValidateFilePath(const AzurePath& path) {
+ if (path.container.empty()) {
+ return PathNotFound(path);
+ }
+
+ if (path.path_to_file.empty()) {
+ return NotAFile(path);
+ }
+ return Status::OK();
+}
Review Comment:
This is for validating that an `AzurePath` might refer to a file, rather
than a directory. If we want list the root of a container then its expected
that `path_to_file` will be empty. Additionally if we consider the storage
account, rather than the container, to be the root of the `AzureFileSystem`
then it would also be valid for `container` to be empty to.
Whether to make the container or the storage account the root of the
`AzureFileSystem` is a good question. Makng the container the root would
simplify things but if you look at other Azure filesystems it seems like the
storage account is considered the root. For example:
`fsspec` does listing assuming the storage account is the root
https://github.com/fsspec/adlfs/blob/092685f102c5cd215550d10e8347e5bce0e2b93d/adlfs/spec.py#L766
So does the original AzureFileSystem PR #12914
https://github.com/apache/arrow/blob/3ea2d7fae20742baaf670b81dfabdd33fcad0258/cpp/src/arrow/filesystem/azurefs.cc#L1786C27-L1786C27
Given this precident I was planning to consider the storage account to be
the root of the filesystem. That means that its valid for `AzurePath` to have
empty `container` if `path_to_file` is also empty, so neither of these checks
can be moved to `Validate`.
##########
cpp/src/arrow/filesystem/azurefs.cc:
##########
@@ -37,34 +43,330 @@ bool AzureOptions::Equals(const AzureOptions& other) const
{
credentials_kind == other.credentials_kind);
}
+Status AzureOptions::ConfigureAccountKeyCredentials(const std::string&
account_name,
+ const std::string&
account_key) {
+ if (this->backend == AzureBackend::Azurite) {
+ account_blob_url = "http://127.0.0.1:10000/" + account_name + "/";
+ account_dfs_url = "http://127.0.0.1:10000/" + account_name + "/";
+ } else {
+ account_dfs_url = "https://" + account_name + ".dfs.core.windows.net/";
+ account_blob_url = "https://" + account_name + ".blob.core.windows.net/";
+ }
+ storage_credentials_provider =
+
std::make_shared<Azure::Storage::StorageSharedKeyCredential>(account_name,
+
account_key);
+ credentials_kind = AzureCredentialsKind::StorageCredentials;
+ return Status::OK();
+}
+namespace {
+
+// An AzureFileSystem represents a single Azure storage account. AzurePath
describes a
+// container and path within that storage account.
+struct AzurePath {
+ std::string full_path;
+ std::string container;
+ std::string path_to_file;
+ std::vector<std::string> path_to_file_parts;
+
+ static Result<AzurePath> FromString(const std::string& s) {
+ // Example expected string format: testcontainer/testdir/testfile.txt
+ // container = testcontainer
+ // path_to_file = testdir/testfile.txt
+ // path_to_file_parts = [testdir, testfile.txt]
+ if (internal::IsLikelyUri(s)) {
+ return Status::Invalid(
+ "Expected an Azure object path of the form 'container/path...', got
a URI: '",
+ s, "'");
+ }
+ auto src = internal::RemoveTrailingSlash(s);
+ auto input_path = std::string(src.data());
+ src = internal::RemoveLeadingSlash(src);
+ auto first_sep = src.find_first_of(internal::kSep);
+ if (first_sep == 0) {
+ return Status::Invalid("Path cannot start with a separator ('",
input_path, "')");
+ }
+ if (first_sep == std::string::npos) {
+ return AzurePath{std::string(src), std::string(src), "", {}};
+ }
+ AzurePath path;
+ path.full_path = std::string(src);
+ path.container = std::string(src.substr(0, first_sep));
+ path.path_to_file = std::string(src.substr(first_sep + 1));
+ path.path_to_file_parts = internal::SplitAbstractPath(path.path_to_file);
+ RETURN_NOT_OK(Validate(path));
+ return path;
+ }
+
+ static Status Validate(const AzurePath& path) {
+ auto status = internal::ValidateAbstractPathParts(path.path_to_file_parts);
+ if (!status.ok()) {
+ return Status::Invalid(status.message(), " in path ", path.full_path);
+ } else {
+ return status;
+ }
+ }
+
+ AzurePath parent() const {
+ DCHECK(has_parent());
+ auto parent = AzurePath{"", container, "", path_to_file_parts};
+ parent.path_to_file_parts.pop_back();
+ parent.path_to_file =
internal::JoinAbstractPath(parent.path_to_file_parts);
+ if (parent.path_to_file.empty()) {
+ parent.full_path = parent.container;
+ } else {
+ parent.full_path = parent.container + internal::kSep +
parent.path_to_file;
+ }
+ return parent;
+ }
+
+ bool has_parent() const { return !path_to_file.empty(); }
+
+ bool empty() const { return container.empty() && path_to_file.empty(); }
+
+ bool operator==(const AzurePath& other) const {
+ return container == other.container && path_to_file == other.path_to_file;
+ }
+};
+
+Status PathNotFound(const AzurePath& path) {
+ return ::arrow::fs::internal::PathNotFound(path.full_path);
+}
+
+Status NotAFile(const AzurePath& path) {
+ return ::arrow::fs::internal::NotAFile(path.full_path);
+}
+
+Status ValidateFilePath(const AzurePath& path) {
+ if (path.container.empty()) {
+ return PathNotFound(path);
+ }
+
+ if (path.path_to_file.empty()) {
+ return NotAFile(path);
+ }
+ return Status::OK();
+}
Review Comment:
This is for validating that an `AzurePath` might refer to a file, rather
than a directory. If we want to list the root of a container then its expected
that `path_to_file` will be empty. Additionally if we consider the storage
account, rather than the container, to be the root of the `AzureFileSystem`
then it would also be valid for `container` to be empty to.
Whether to make the container or the storage account the root of the
`AzureFileSystem` is a good question. Makng the container the root would
simplify things but if you look at other Azure filesystems it seems like the
storage account is considered the root. For example:
`fsspec` does listing assuming the storage account is the root
https://github.com/fsspec/adlfs/blob/092685f102c5cd215550d10e8347e5bce0e2b93d/adlfs/spec.py#L766
So does the original AzureFileSystem PR #12914
https://github.com/apache/arrow/blob/3ea2d7fae20742baaf670b81dfabdd33fcad0258/cpp/src/arrow/filesystem/azurefs.cc#L1786C27-L1786C27
Given this precident I was planning to consider the storage account to be
the root of the filesystem. That means that its valid for `AzurePath` to have
empty `container` if `path_to_file` is also empty, so neither of these checks
can be moved to `Validate`.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]