felipecrv commented on code in PR #40325:
URL: https://github.com/apache/arrow/pull/40325#discussion_r1511534137
##########
cpp/src/arrow/filesystem/azurefs.cc:
##########
@@ -65,6 +65,133 @@ AzureOptions::AzureOptions() = default;
AzureOptions::~AzureOptions() = default;
+Result<AzureOptions> AzureOptions::FromUri(const arrow::internal::Uri& uri,
+ std::string* out_path) {
Review Comment:
A PR by @bkietz is moving `Uri` out of `internal` so we should be careful
with the merges.
##########
cpp/src/arrow/filesystem/azurefs.cc:
##########
@@ -65,6 +65,133 @@ AzureOptions::AzureOptions() = default;
AzureOptions::~AzureOptions() = default;
+Result<AzureOptions> AzureOptions::FromUri(const arrow::internal::Uri& uri,
+ std::string* out_path) {
+ AzureOptions options;
+ const auto host = uri.host();
+ std::string container;
+ std::string path;
+ if (arrow::internal::EndsWith(host, options.blob_storage_authority)) {
+ options.account_name =
+ host.substr(0, host.size() - options.blob_storage_authority.size());
+ auto components = internal::SplitAbstractPath(uri.path());
+ if (!components.empty()) {
+ container = components[0];
+ path = internal::JoinAbstractPath(components.begin() + 1,
components.end());
+ }
+ } else if (arrow::internal::EndsWith(host, options.dfs_storage_authority)) {
+ options.account_name =
+ host.substr(0, host.size() - options.dfs_storage_authority.size());
+ container = uri.username();
+ path = uri.path();
+ } else {
+ options.account_name = uri.username();
+ std::string host_port = host;
+ const auto port_text = uri.port_text();
+ if (!port_text.empty()) {
+ host_port += ":" + port_text;
+ }
+ options.blob_storage_authority = host_port;
+ options.dfs_storage_authority = host_port;
+ if (uri.scheme() == "abfs") {
+ options.blob_storage_scheme = "http";
+ options.dfs_storage_scheme = "http";
Review Comment:
Why would we ever not use HTTPS in this day and age?
##########
cpp/src/arrow/filesystem/azurefs.h:
##########
@@ -123,6 +126,13 @@ struct ARROW_EXPORT AzureOptions {
AzureOptions();
~AzureOptions();
+ /// Initialize from URIs such as
+ /// "abfs://account.blob.core.windows.net/container/dir/blob" and
+ /// "abfs://[email protected]/dir/file".
Review Comment:
These should be just
`abfs://<account_name>/container/dir/filename`
because we are not exclusively using Blobs or ADLFS APIs in the filesystem
implementation. We automatically detect what works and use the most appropriate
API at different times.
##########
cpp/src/arrow/filesystem/azurefs.cc:
##########
@@ -65,6 +65,133 @@ AzureOptions::AzureOptions() = default;
AzureOptions::~AzureOptions() = default;
+Result<AzureOptions> AzureOptions::FromUri(const arrow::internal::Uri& uri,
+ std::string* out_path) {
+ AzureOptions options;
+ const auto host = uri.host();
+ std::string container;
+ std::string path;
+ if (arrow::internal::EndsWith(host, options.blob_storage_authority)) {
+ options.account_name =
+ host.substr(0, host.size() - options.blob_storage_authority.size());
+ auto components = internal::SplitAbstractPath(uri.path());
+ if (!components.empty()) {
+ container = components[0];
+ path = internal::JoinAbstractPath(components.begin() + 1,
components.end());
+ }
+ } else if (arrow::internal::EndsWith(host, options.dfs_storage_authority)) {
+ options.account_name =
+ host.substr(0, host.size() - options.dfs_storage_authority.size());
+ container = uri.username();
+ path = uri.path();
+ } else {
+ options.account_name = uri.username();
+ std::string host_port = host;
+ const auto port_text = uri.port_text();
+ if (!port_text.empty()) {
+ host_port += ":" + port_text;
+ }
+ options.blob_storage_authority = host_port;
+ options.dfs_storage_authority = host_port;
+ if (uri.scheme() == "abfs") {
+ options.blob_storage_scheme = "http";
+ options.dfs_storage_scheme = "http";
+ }
+ auto components = internal::SplitAbstractPath(uri.path());
+ if (!components.empty()) {
+ container = components[0];
+ path = internal::JoinAbstractPath(components.begin() + 1,
components.end());
+ }
+ }
+ const auto account_key = uri.password();
+
+ if (container.empty()) {
+ return Status::Invalid("Missing container name in Azure Blob File System
URI");
+ }
Review Comment:
Why you need a container name if the filesystem wraps the entire storage
account?
##########
cpp/src/arrow/filesystem/azurefs.cc:
##########
@@ -65,6 +65,133 @@ AzureOptions::AzureOptions() = default;
AzureOptions::~AzureOptions() = default;
+Result<AzureOptions> AzureOptions::FromUri(const arrow::internal::Uri& uri,
+ std::string* out_path) {
+ AzureOptions options;
+ const auto host = uri.host();
+ std::string container;
+ std::string path;
+ if (arrow::internal::EndsWith(host, options.blob_storage_authority)) {
+ options.account_name =
+ host.substr(0, host.size() - options.blob_storage_authority.size());
+ auto components = internal::SplitAbstractPath(uri.path());
+ if (!components.empty()) {
+ container = components[0];
+ path = internal::JoinAbstractPath(components.begin() + 1,
components.end());
+ }
+ } else if (arrow::internal::EndsWith(host, options.dfs_storage_authority)) {
+ options.account_name =
+ host.substr(0, host.size() - options.dfs_storage_authority.size());
+ container = uri.username();
+ path = uri.path();
+ } else {
+ options.account_name = uri.username();
+ std::string host_port = host;
+ const auto port_text = uri.port_text();
+ if (!port_text.empty()) {
+ host_port += ":" + port_text;
+ }
+ options.blob_storage_authority = host_port;
+ options.dfs_storage_authority = host_port;
+ if (uri.scheme() == "abfs") {
+ options.blob_storage_scheme = "http";
+ options.dfs_storage_scheme = "http";
+ }
+ auto components = internal::SplitAbstractPath(uri.path());
+ if (!components.empty()) {
+ container = components[0];
+ path = internal::JoinAbstractPath(components.begin() + 1,
components.end());
+ }
+ }
+ const auto account_key = uri.password();
+
+ if (container.empty()) {
+ return Status::Invalid("Missing container name in Azure Blob File System
URI");
+ }
+ if (out_path != nullptr) {
+ *out_path = std::string(internal::ConcatAbstractPath(container, path));
+ }
+
+ std::unordered_map<std::string, std::string> options_map;
+ ARROW_ASSIGN_OR_RAISE(const auto options_items, uri.query_items());
+ for (const auto& kv : options_items) {
+ options_map.emplace(kv.first, kv.second);
+ }
+
+ CredentialKind credential_kind = options.account_name.empty()
+ ? CredentialKind::kAnonymous
+ : CredentialKind::kDefault;
+ std::string tenant_id;
+ std::string client_id;
+ std::string client_secret;
+ for (const auto& kv : options_map) {
+ if (kv.first == "blob_storage_authority") {
+ options.blob_storage_authority = kv.second;
+ } else if (kv.first == "dfs_storage_authority") {
+ options.dfs_storage_authority = kv.second;
+ } else if (kv.first == "blob_storage_scheme") {
+ options.blob_storage_scheme = kv.second;
+ } else if (kv.first == "dfs_storage_scheme") {
+ options.dfs_storage_scheme = kv.second;
+ } else if (kv.first == "credential_kind") {
Review Comment:
`credential_kind_` should be inferred from what you find on the URI without
the user having to set both the credential kind *and* the credentials.
##########
cpp/src/arrow/filesystem/azurefs.cc:
##########
@@ -65,6 +65,133 @@ AzureOptions::AzureOptions() = default;
AzureOptions::~AzureOptions() = default;
+Result<AzureOptions> AzureOptions::FromUri(const arrow::internal::Uri& uri,
+ std::string* out_path) {
+ AzureOptions options;
+ const auto host = uri.host();
+ std::string container;
+ std::string path;
+ if (arrow::internal::EndsWith(host, options.blob_storage_authority)) {
+ options.account_name =
+ host.substr(0, host.size() - options.blob_storage_authority.size());
+ auto components = internal::SplitAbstractPath(uri.path());
+ if (!components.empty()) {
+ container = components[0];
+ path = internal::JoinAbstractPath(components.begin() + 1,
components.end());
+ }
+ } else if (arrow::internal::EndsWith(host, options.dfs_storage_authority)) {
+ options.account_name =
+ host.substr(0, host.size() - options.dfs_storage_authority.size());
+ container = uri.username();
+ path = uri.path();
+ } else {
+ options.account_name = uri.username();
+ std::string host_port = host;
+ const auto port_text = uri.port_text();
+ if (!port_text.empty()) {
+ host_port += ":" + port_text;
+ }
+ options.blob_storage_authority = host_port;
+ options.dfs_storage_authority = host_port;
+ if (uri.scheme() == "abfs") {
+ options.blob_storage_scheme = "http";
+ options.dfs_storage_scheme = "http";
+ }
+ auto components = internal::SplitAbstractPath(uri.path());
+ if (!components.empty()) {
+ container = components[0];
+ path = internal::JoinAbstractPath(components.begin() + 1,
components.end());
+ }
+ }
+ const auto account_key = uri.password();
+
+ if (container.empty()) {
+ return Status::Invalid("Missing container name in Azure Blob File System
URI");
+ }
+ if (out_path != nullptr) {
+ *out_path = std::string(internal::ConcatAbstractPath(container, path));
+ }
+
+ std::unordered_map<std::string, std::string> options_map;
+ ARROW_ASSIGN_OR_RAISE(const auto options_items, uri.query_items());
+ for (const auto& kv : options_items) {
+ options_map.emplace(kv.first, kv.second);
+ }
Review Comment:
No need to build the map if you're going to iterate over the kv pairs and
switch. This is just randomizing the iteration order.
##########
cpp/src/arrow/filesystem/filesystem.cc:
##########
@@ -690,6 +693,21 @@ Result<std::shared_ptr<FileSystem>>
FileSystemFromUriReal(const Uri& uri,
}
return std::make_shared<LocalFileSystem>(options, io_context);
}
+ /// "abfs" and "abfss" schemes are taken from
+ /// the Azure Data Lake Storage Gen2 URI:
+ ///
https://learn.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-introduction-abfs-uri
+ ///
+ ///
abfs[s]://<file_system>@<account_name>.dfs.core.windows.net/<path>/<file_name>
Review Comment:
`arrow::AzureFileSystem` is not an alternative implementation of the Data
Lake storage APIs provided by Azure, it's a file-system abstraction that can
use both Blobs and Data Lake APIs, so copying this format is both unnecessary
and extremely confusing to users.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]