kashyap-kunal commented on code in PR #552:
URL: 
https://github.com/apache/arrow-rs-object-store/pull/552#discussion_r2638720561


##########
src/azure/builder.rs:
##########
@@ -675,32 +679,79 @@ impl MicrosoftAzureBuilder {
                     return Err(Error::UrlNotRecognised { url: url.into() 
}.into());
                 }
             }
-            "https" => match host.split_once('.') {
-                Some((a, "dfs.core.windows.net")) | Some((a, 
"blob.core.windows.net")) => {
-                    self.account_name = Some(validate(a)?);
-                    let container = 
parsed.path_segments().unwrap().next().expect(
+            "https" => {
+                const DFS_FABRIC_SUFFIX: &str = "dfs.fabric.microsoft.com";
+                const BLOB_FABRIC_SUFFIX: &str = "blob.fabric.microsoft.com";
+                const DFS_AZURE_SUFFIX: &str = "dfs.core.windows.net";
+                const BLOB_AZURE_SUFFIX: &str = "blob.core.windows.net";
+
+                // Regex to match WS-PL FQDN: 
"{workspaceid}.z??.dfs.fabric.microsoft.com"
+                // workspaceid = 32 hex chars, z?? = z + first two chars of 
workspaceid
+                lazy_static::lazy_static! {
+                    static ref WS_PL_REGEX: Regex = 
Regex::new(r"^(?P<workspaceid>[0-9a-f]{32})\.z(?P<xy>[0-9a-f]{2})\.(dfs|blob)\.fabric\.microsoft\.com$").unwrap();

Review Comment:
   Let's add support for .onelake.fabric.microsoft.com also



##########
src/azure/builder.rs:
##########
@@ -675,32 +679,79 @@ impl MicrosoftAzureBuilder {
                     return Err(Error::UrlNotRecognised { url: url.into() 
}.into());
                 }
             }
-            "https" => match host.split_once('.') {
-                Some((a, "dfs.core.windows.net")) | Some((a, 
"blob.core.windows.net")) => {
-                    self.account_name = Some(validate(a)?);
-                    let container = 
parsed.path_segments().unwrap().next().expect(
+            "https" => {
+                const DFS_FABRIC_SUFFIX: &str = "dfs.fabric.microsoft.com";
+                const BLOB_FABRIC_SUFFIX: &str = "blob.fabric.microsoft.com";
+                const DFS_AZURE_SUFFIX: &str = "dfs.core.windows.net";
+                const BLOB_AZURE_SUFFIX: &str = "blob.core.windows.net";
+
+                // Regex to match WS-PL FQDN: 
"{workspaceid}.z??.dfs.fabric.microsoft.com"
+                // workspaceid = 32 hex chars, z?? = z + first two chars of 
workspaceid
+                lazy_static::lazy_static! {
+                    static ref WS_PL_REGEX: Regex = 
Regex::new(r"^(?P<workspaceid>[0-9a-f]{32})\.z(?P<xy>[0-9a-f]{2})\.(dfs|blob)\.fabric\.microsoft\.com$").unwrap();
+                }
+
+                if let Some(captures) = WS_PL_REGEX.captures(host) {
+                    let workspaceid = 
captures.name("workspaceid").unwrap().as_str();
+                    let xy = captures.name("xy").unwrap().as_str();
+
+                    // Validate z?? matches first 2 chars of workspaceid
+                    if &workspaceid[0..2] != xy {

Review Comment:
   remove this validation



##########
src/azure/builder.rs:
##########
@@ -675,32 +679,79 @@ impl MicrosoftAzureBuilder {
                     return Err(Error::UrlNotRecognised { url: url.into() 
}.into());
                 }
             }
-            "https" => match host.split_once('.') {
-                Some((a, "dfs.core.windows.net")) | Some((a, 
"blob.core.windows.net")) => {
-                    self.account_name = Some(validate(a)?);
-                    let container = 
parsed.path_segments().unwrap().next().expect(
+            "https" => {
+                const DFS_FABRIC_SUFFIX: &str = "dfs.fabric.microsoft.com";
+                const BLOB_FABRIC_SUFFIX: &str = "blob.fabric.microsoft.com";
+                const DFS_AZURE_SUFFIX: &str = "dfs.core.windows.net";
+                const BLOB_AZURE_SUFFIX: &str = "blob.core.windows.net";
+
+                // Regex to match WS-PL FQDN: 
"{workspaceid}.z??.dfs.fabric.microsoft.com"
+                // workspaceid = 32 hex chars, z?? = z + first two chars of 
workspaceid
+                lazy_static::lazy_static! {
+                    static ref WS_PL_REGEX: Regex = 
Regex::new(r"^(?P<workspaceid>[0-9a-f]{32})\.z(?P<xy>[0-9a-f]{2})\.(dfs|blob)\.fabric\.microsoft\.com$").unwrap();
+                }
+
+                if let Some(captures) = WS_PL_REGEX.captures(host) {
+                    let workspaceid = 
captures.name("workspaceid").unwrap().as_str();
+                    let xy = captures.name("xy").unwrap().as_str();
+
+                    // Validate z?? matches first 2 chars of workspaceid
+                    if &workspaceid[0..2] != xy {
+                        return Err(Error::UrlNotRecognised { url: url.into() 
}.into());
+                    }
+
+                    self.account_name = Some(validate(workspaceid)?);
+                    self.use_fabric_endpoint = true;
+
+                    let container = parsed
+                        .path_segments()
+                        .and_then(|mut s| s.next())
+                        .unwrap_or("");
+                    if !container.is_empty() {
+                        self.container_name = Some(validate(container)?);
+                    }
+
+                    return Ok(());
+                }
+
+                // Otherwise, check Fabric global / Onelake API FQDN 
+                if host.ends_with(DFS_FABRIC_SUFFIX) || 
host.ends_with(BLOB_FABRIC_SUFFIX) {
+                let labels: Vec<&str> = host.split('.').collect();  
+                let account_name = if labels.len() >= 2 && 
labels[0].contains("api") && labels[1] == "onelake" {
+                                        format!("{}-{}", labels[0], labels[1])

Review Comment:
   If we are referring workspace id as account_name, then this will not work 
for non pl scenario. In those case it will give account_name as 
"westus-api-onelake"



##########
src/azure/builder.rs:
##########
@@ -675,32 +679,79 @@ impl MicrosoftAzureBuilder {
                     return Err(Error::UrlNotRecognised { url: url.into() 
}.into());
                 }
             }
-            "https" => match host.split_once('.') {
-                Some((a, "dfs.core.windows.net")) | Some((a, 
"blob.core.windows.net")) => {
-                    self.account_name = Some(validate(a)?);
-                    let container = 
parsed.path_segments().unwrap().next().expect(
+            "https" => {
+                const DFS_FABRIC_SUFFIX: &str = "dfs.fabric.microsoft.com";
+                const BLOB_FABRIC_SUFFIX: &str = "blob.fabric.microsoft.com";
+                const DFS_AZURE_SUFFIX: &str = "dfs.core.windows.net";
+                const BLOB_AZURE_SUFFIX: &str = "blob.core.windows.net";
+
+                // Regex to match WS-PL FQDN: 
"{workspaceid}.z??.dfs.fabric.microsoft.com"
+                // workspaceid = 32 hex chars, z?? = z + first two chars of 
workspaceid
+                lazy_static::lazy_static! {
+                    static ref WS_PL_REGEX: Regex = 
Regex::new(r"^(?P<workspaceid>[0-9a-f]{32})\.z(?P<xy>[0-9a-f]{2})\.(dfs|blob)\.fabric\.microsoft\.com$").unwrap();
+                }
+
+                if let Some(captures) = WS_PL_REGEX.captures(host) {
+                    let workspaceid = 
captures.name("workspaceid").unwrap().as_str();
+                    let xy = captures.name("xy").unwrap().as_str();
+
+                    // Validate z?? matches first 2 chars of workspaceid
+                    if &workspaceid[0..2] != xy {
+                        return Err(Error::UrlNotRecognised { url: url.into() 
}.into());
+                    }
+
+                    self.account_name = Some(validate(workspaceid)?);
+                    self.use_fabric_endpoint = true;
+
+                    let container = parsed
+                        .path_segments()
+                        .and_then(|mut s| s.next())
+                        .unwrap_or("");
+                    if !container.is_empty() {
+                        self.container_name = Some(validate(container)?);
+                    }
+
+                    return Ok(());
+                }
+
+                // Otherwise, check Fabric global / Onelake API FQDN 
+                if host.ends_with(DFS_FABRIC_SUFFIX) || 
host.ends_with(BLOB_FABRIC_SUFFIX) {

Review Comment:
   if we are checking for global endpoint, should these not be with not like 
(!host.ends_with(DFS_FABRIC_SUFFIX) &&  !host.ends_with(BLOB_FABRIC_SUFFIX))



##########
src/azure/builder.rs:
##########
@@ -675,32 +679,79 @@ impl MicrosoftAzureBuilder {
                     return Err(Error::UrlNotRecognised { url: url.into() 
}.into());
                 }
             }
-            "https" => match host.split_once('.') {
-                Some((a, "dfs.core.windows.net")) | Some((a, 
"blob.core.windows.net")) => {
-                    self.account_name = Some(validate(a)?);
-                    let container = 
parsed.path_segments().unwrap().next().expect(
+            "https" => {
+                const DFS_FABRIC_SUFFIX: &str = "dfs.fabric.microsoft.com";
+                const BLOB_FABRIC_SUFFIX: &str = "blob.fabric.microsoft.com";
+                const DFS_AZURE_SUFFIX: &str = "dfs.core.windows.net";
+                const BLOB_AZURE_SUFFIX: &str = "blob.core.windows.net";
+
+                // Regex to match WS-PL FQDN: 
"{workspaceid}.z??.dfs.fabric.microsoft.com"
+                // workspaceid = 32 hex chars, z?? = z + first two chars of 
workspaceid
+                lazy_static::lazy_static! {
+                    static ref WS_PL_REGEX: Regex = 
Regex::new(r"^(?P<workspaceid>[0-9a-f]{32})\.z(?P<xy>[0-9a-f]{2})\.(dfs|blob)\.fabric\.microsoft\.com$").unwrap();
+                }
+
+                if let Some(captures) = WS_PL_REGEX.captures(host) {
+                    let workspaceid = 
captures.name("workspaceid").unwrap().as_str();
+                    let xy = captures.name("xy").unwrap().as_str();
+
+                    // Validate z?? matches first 2 chars of workspaceid
+                    if &workspaceid[0..2] != xy {
+                        return Err(Error::UrlNotRecognised { url: url.into() 
}.into());
+                    }
+
+                    self.account_name = Some(validate(workspaceid)?);
+                    self.use_fabric_endpoint = true;
+
+                    let container = parsed
+                        .path_segments()
+                        .and_then(|mut s| s.next())
+                        .unwrap_or("");
+                    if !container.is_empty() {
+                        self.container_name = Some(validate(container)?);
+                    }
+
+                    return Ok(());
+                }
+
+                // Otherwise, check Fabric global / Onelake API FQDN 
+                if host.ends_with(DFS_FABRIC_SUFFIX) || 
host.ends_with(BLOB_FABRIC_SUFFIX) {
+                let labels: Vec<&str> = host.split('.').collect();  
+                let account_name = if labels.len() >= 2 && 
labels[0].contains("api") && labels[1] == "onelake" {
+                                        format!("{}-{}", labels[0], labels[1])
+                                    } else {
+                                        labels[0].to_string()
+                                    };
+
+                self.account_name = Some(validate(&account_name)?);
+                self.use_fabric_endpoint = true;
+
+                let container = parsed.path_segments().unwrap().next().expect(
                         "iterator always contains at least one string (which 
may be empty)",
                     );
                     if !container.is_empty() {
                         self.container_name = Some(validate(container)?);
                     }
+
+                    return Ok(());
                 }
-                Some((a, "dfs.fabric.microsoft.com")) | Some((a, 
"blob.fabric.microsoft.com")) => {
-                    self.account_name = Some(validate(a)?);
-                    // Attempt to infer the container name from the URL
-                    // - 
https://onelake.dfs.fabric.microsoft.com/<workspaceGUID>/<itemGUID>/Files/test.csv
-                    // - 
https://onelake.dfs.fabric.microsoft.com/<workspace>/<item>.<itemtype>/<path>/<fileName>
-                    //
-                    // See 
<https://learn.microsoft.com/en-us/fabric/onelake/onelake-access-api>
-                    let workspace = 
parsed.path_segments().unwrap().next().expect(
+
+                // Azure Storage public
+                if host.ends_with(DFS_AZURE_SUFFIX) || 
host.ends_with(BLOB_AZURE_SUFFIX) {
+                    let first_label = 
host.split('.').next().unwrap_or_default();
+                    self.account_name = Some(validate(first_label)?);
+
+                    let container = 
parsed.path_segments().unwrap().next().expect(

Review Comment:
   nit: onelake uses workspace terminology



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to