felipecrv commented on code in PR #40325:
URL: https://github.com/apache/arrow/pull/40325#discussion_r1511534137


##########
cpp/src/arrow/filesystem/azurefs.cc:
##########
@@ -65,6 +65,133 @@ AzureOptions::AzureOptions() = default;
 
 AzureOptions::~AzureOptions() = default;
 
+Result<AzureOptions> AzureOptions::FromUri(const arrow::internal::Uri& uri,
+                                           std::string* out_path) {

Review Comment:
   A PR by @bkietz is moving `Uri` out of `internal` so we should be careful 
with the merges.



##########
cpp/src/arrow/filesystem/azurefs.cc:
##########
@@ -65,6 +65,133 @@ AzureOptions::AzureOptions() = default;
 
 AzureOptions::~AzureOptions() = default;
 
+Result<AzureOptions> AzureOptions::FromUri(const arrow::internal::Uri& uri,
+                                           std::string* out_path) {
+  AzureOptions options;
+  const auto host = uri.host();
+  std::string container;
+  std::string path;
+  if (arrow::internal::EndsWith(host, options.blob_storage_authority)) {
+    options.account_name =
+        host.substr(0, host.size() - options.blob_storage_authority.size());
+    auto components = internal::SplitAbstractPath(uri.path());
+    if (!components.empty()) {
+      container = components[0];
+      path = internal::JoinAbstractPath(components.begin() + 1, 
components.end());
+    }
+  } else if (arrow::internal::EndsWith(host, options.dfs_storage_authority)) {
+    options.account_name =
+        host.substr(0, host.size() - options.dfs_storage_authority.size());
+    container = uri.username();
+    path = uri.path();
+  } else {
+    options.account_name = uri.username();
+    std::string host_port = host;
+    const auto port_text = uri.port_text();
+    if (!port_text.empty()) {
+      host_port += ":" + port_text;
+    }
+    options.blob_storage_authority = host_port;
+    options.dfs_storage_authority = host_port;
+    if (uri.scheme() == "abfs") {
+      options.blob_storage_scheme = "http";
+      options.dfs_storage_scheme = "http";

Review Comment:
   Why would we ever not use HTTPS in this day and age?



##########
cpp/src/arrow/filesystem/azurefs.h:
##########
@@ -123,6 +126,13 @@ struct ARROW_EXPORT AzureOptions {
   AzureOptions();
   ~AzureOptions();
 
+  /// Initialize from URIs such as
+  /// "abfs://account.blob.core.windows.net/container/dir/blob" and
+  /// "abfs://[email protected]/dir/file".

Review Comment:
   These should be just
   
   `abfs://<account_name>/container/dir/filename`
   
   because we are not exclusively using Blobs or ADLFS APIs in the filesystem 
implementation. We automatically detect what works and use the most appropriate 
API at different times.



##########
cpp/src/arrow/filesystem/azurefs.cc:
##########
@@ -65,6 +65,133 @@ AzureOptions::AzureOptions() = default;
 
 AzureOptions::~AzureOptions() = default;
 
+Result<AzureOptions> AzureOptions::FromUri(const arrow::internal::Uri& uri,
+                                           std::string* out_path) {
+  AzureOptions options;
+  const auto host = uri.host();
+  std::string container;
+  std::string path;
+  if (arrow::internal::EndsWith(host, options.blob_storage_authority)) {
+    options.account_name =
+        host.substr(0, host.size() - options.blob_storage_authority.size());
+    auto components = internal::SplitAbstractPath(uri.path());
+    if (!components.empty()) {
+      container = components[0];
+      path = internal::JoinAbstractPath(components.begin() + 1, 
components.end());
+    }
+  } else if (arrow::internal::EndsWith(host, options.dfs_storage_authority)) {
+    options.account_name =
+        host.substr(0, host.size() - options.dfs_storage_authority.size());
+    container = uri.username();
+    path = uri.path();
+  } else {
+    options.account_name = uri.username();
+    std::string host_port = host;
+    const auto port_text = uri.port_text();
+    if (!port_text.empty()) {
+      host_port += ":" + port_text;
+    }
+    options.blob_storage_authority = host_port;
+    options.dfs_storage_authority = host_port;
+    if (uri.scheme() == "abfs") {
+      options.blob_storage_scheme = "http";
+      options.dfs_storage_scheme = "http";
+    }
+    auto components = internal::SplitAbstractPath(uri.path());
+    if (!components.empty()) {
+      container = components[0];
+      path = internal::JoinAbstractPath(components.begin() + 1, 
components.end());
+    }
+  }
+  const auto account_key = uri.password();
+
+  if (container.empty()) {
+    return Status::Invalid("Missing container name in Azure Blob File System 
URI");
+  }

Review Comment:
   Why you need a container name if the filesystem wraps the entire storage 
account?



##########
cpp/src/arrow/filesystem/azurefs.cc:
##########
@@ -65,6 +65,133 @@ AzureOptions::AzureOptions() = default;
 
 AzureOptions::~AzureOptions() = default;
 
+Result<AzureOptions> AzureOptions::FromUri(const arrow::internal::Uri& uri,
+                                           std::string* out_path) {
+  AzureOptions options;
+  const auto host = uri.host();
+  std::string container;
+  std::string path;
+  if (arrow::internal::EndsWith(host, options.blob_storage_authority)) {
+    options.account_name =
+        host.substr(0, host.size() - options.blob_storage_authority.size());
+    auto components = internal::SplitAbstractPath(uri.path());
+    if (!components.empty()) {
+      container = components[0];
+      path = internal::JoinAbstractPath(components.begin() + 1, 
components.end());
+    }
+  } else if (arrow::internal::EndsWith(host, options.dfs_storage_authority)) {
+    options.account_name =
+        host.substr(0, host.size() - options.dfs_storage_authority.size());
+    container = uri.username();
+    path = uri.path();
+  } else {
+    options.account_name = uri.username();
+    std::string host_port = host;
+    const auto port_text = uri.port_text();
+    if (!port_text.empty()) {
+      host_port += ":" + port_text;
+    }
+    options.blob_storage_authority = host_port;
+    options.dfs_storage_authority = host_port;
+    if (uri.scheme() == "abfs") {
+      options.blob_storage_scheme = "http";
+      options.dfs_storage_scheme = "http";
+    }
+    auto components = internal::SplitAbstractPath(uri.path());
+    if (!components.empty()) {
+      container = components[0];
+      path = internal::JoinAbstractPath(components.begin() + 1, 
components.end());
+    }
+  }
+  const auto account_key = uri.password();
+
+  if (container.empty()) {
+    return Status::Invalid("Missing container name in Azure Blob File System 
URI");
+  }
+  if (out_path != nullptr) {
+    *out_path = std::string(internal::ConcatAbstractPath(container, path));
+  }
+
+  std::unordered_map<std::string, std::string> options_map;
+  ARROW_ASSIGN_OR_RAISE(const auto options_items, uri.query_items());
+  for (const auto& kv : options_items) {
+    options_map.emplace(kv.first, kv.second);
+  }
+
+  CredentialKind credential_kind = options.account_name.empty()
+                                       ? CredentialKind::kAnonymous
+                                       : CredentialKind::kDefault;
+  std::string tenant_id;
+  std::string client_id;
+  std::string client_secret;
+  for (const auto& kv : options_map) {
+    if (kv.first == "blob_storage_authority") {
+      options.blob_storage_authority = kv.second;
+    } else if (kv.first == "dfs_storage_authority") {
+      options.dfs_storage_authority = kv.second;
+    } else if (kv.first == "blob_storage_scheme") {
+      options.blob_storage_scheme = kv.second;
+    } else if (kv.first == "dfs_storage_scheme") {
+      options.dfs_storage_scheme = kv.second;
+    } else if (kv.first == "credential_kind") {

Review Comment:
   `credential_kind_` should be inferred from what you find on the URI without 
the user having to set both the credential kind *and* the credentials.



##########
cpp/src/arrow/filesystem/azurefs.cc:
##########
@@ -65,6 +65,133 @@ AzureOptions::AzureOptions() = default;
 
 AzureOptions::~AzureOptions() = default;
 
+Result<AzureOptions> AzureOptions::FromUri(const arrow::internal::Uri& uri,
+                                           std::string* out_path) {
+  AzureOptions options;
+  const auto host = uri.host();
+  std::string container;
+  std::string path;
+  if (arrow::internal::EndsWith(host, options.blob_storage_authority)) {
+    options.account_name =
+        host.substr(0, host.size() - options.blob_storage_authority.size());
+    auto components = internal::SplitAbstractPath(uri.path());
+    if (!components.empty()) {
+      container = components[0];
+      path = internal::JoinAbstractPath(components.begin() + 1, 
components.end());
+    }
+  } else if (arrow::internal::EndsWith(host, options.dfs_storage_authority)) {
+    options.account_name =
+        host.substr(0, host.size() - options.dfs_storage_authority.size());
+    container = uri.username();
+    path = uri.path();
+  } else {
+    options.account_name = uri.username();
+    std::string host_port = host;
+    const auto port_text = uri.port_text();
+    if (!port_text.empty()) {
+      host_port += ":" + port_text;
+    }
+    options.blob_storage_authority = host_port;
+    options.dfs_storage_authority = host_port;
+    if (uri.scheme() == "abfs") {
+      options.blob_storage_scheme = "http";
+      options.dfs_storage_scheme = "http";
+    }
+    auto components = internal::SplitAbstractPath(uri.path());
+    if (!components.empty()) {
+      container = components[0];
+      path = internal::JoinAbstractPath(components.begin() + 1, 
components.end());
+    }
+  }
+  const auto account_key = uri.password();
+
+  if (container.empty()) {
+    return Status::Invalid("Missing container name in Azure Blob File System 
URI");
+  }
+  if (out_path != nullptr) {
+    *out_path = std::string(internal::ConcatAbstractPath(container, path));
+  }
+
+  std::unordered_map<std::string, std::string> options_map;
+  ARROW_ASSIGN_OR_RAISE(const auto options_items, uri.query_items());
+  for (const auto& kv : options_items) {
+    options_map.emplace(kv.first, kv.second);
+  }

Review Comment:
   No need to build the map if you're going to iterate over the kv pairs and 
switch. This is just randomizing the iteration order.



##########
cpp/src/arrow/filesystem/filesystem.cc:
##########
@@ -690,6 +693,21 @@ Result<std::shared_ptr<FileSystem>> 
FileSystemFromUriReal(const Uri& uri,
     }
     return std::make_shared<LocalFileSystem>(options, io_context);
   }
+  /// "abfs" and "abfss" schemes are taken from
+  /// the Azure Data Lake Storage Gen2 URI:
+  /// 
https://learn.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-introduction-abfs-uri
+  ///
+  /// 
abfs[s]://<file_system>@<account_name>.dfs.core.windows.net/<path>/<file_name>

Review Comment:
   `arrow::AzureFileSystem` is not an alternative implementation of the Data 
Lake storage APIs provided by Azure, it's a file-system abstraction that can 
use both Blobs and Data Lake APIs, so copying this format is both unnecessary 
and extremely confusing to users.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to