alamb commented on code in PR #552:
URL:
https://github.com/apache/arrow-rs-object-store/pull/552#discussion_r2701236996
##########
Cargo.toml:
##########
@@ -40,6 +40,7 @@ humantime = "2.1"
itertools = "0.14.0"
parking_lot = { version = "0.12" }
percent-encoding = "2.1"
+regex = "1.11.1"
Review Comment:
Since this is only used for azure, in order to keep the dependency chain
minimal, can we please make this an optional dependency, following for example
`base64`?
And then activate like this?
```
azure = ["cloud", "httparse"]
```
##########
src/azure/builder.rs:
##########
@@ -671,36 +673,86 @@ impl MicrosoftAzureBuilder {
self.container_name = Some(validate(parsed.username())?);
self.account_name = Some(validate(a)?);
self.use_fabric_endpoint = true.into();
+ } else if let Some(a) =
host.strip_suffix(".blob.core.windows.net") {
+ self.container_name = Some(validate(parsed.username())?);
+ self.account_name = Some(validate(a)?);
+ } else if let Some(a) =
host.strip_suffix(".blob.fabric.microsoft.com") {
+ self.container_name = Some(validate(parsed.username())?);
+ self.account_name = Some(validate(a)?);
+ self.use_fabric_endpoint = true.into();
+ } else if let Some(a) =
host.strip_suffix("-api.onelake.fabric.microsoft.com") {
Review Comment:
This comment is still outstanding I think. Is this a vaild url?
##########
src/azure/builder.rs:
##########
@@ -1211,6 +1264,38 @@ mod tests {
assert_eq!(builder.container_name.as_deref(), Some("container"));
assert!(builder.use_fabric_endpoint.get().unwrap());
+ let mut builder = MicrosoftAzureBuilder::new();
+ builder
+
.parse_url("https://Ab000000000000000000000000000000.zAb.dfs.fabric.microsoft.com/")
+ .unwrap();
+ assert_eq!(builder.account_name,
Some("ab000000000000000000000000000000.zab".to_string()));
Review Comment:
is the account name really supposed to have the
`ab000000000000000000000000000000` in it? It seems confusing that the container
name is also `ab000000000000000000000000000000`
##########
src/azure/builder.rs:
##########
@@ -664,48 +666,99 @@ impl MicrosoftAzureBuilder {
// or the convention for the hadoop driver
abfs[s]://<file_system>@<account_name>.dfs.core.windows.net/<path>
if parsed.username().is_empty() {
self.container_name = Some(validate(host)?);
+ } else if let Some(a) =
host.strip_suffix(".dfs.core.windows.net") {
+ self.container_name = Some(validate(parsed.username())?);
+ self.account_name = Some(validate(a)?);
+ } else if let Some(a) =
host.strip_suffix(".dfs.fabric.microsoft.com") {
+ self.container_name = Some(validate(parsed.username())?);
+ self.account_name = Some(validate(a)?);
+ self.use_fabric_endpoint = true.into();
+ } else if let Some(a) =
host.strip_suffix(".blob.core.windows.net") {
+ self.container_name = Some(validate(parsed.username())?);
+ self.account_name = Some(validate(a)?);
+ } else if let Some(a) =
host.strip_suffix(".blob.fabric.microsoft.com") {
+ self.container_name = Some(validate(parsed.username())?);
+ self.account_name = Some(validate(a)?);
+ self.use_fabric_endpoint = true.into();
+ } else if let Some(a) =
host.strip_suffix("-api.onelake.fabric.microsoft.com") {
+ self.container_name = Some(validate(parsed.username())?);
+ self.account_name = Some(validate(a)?);
+ self.use_fabric_endpoint = true.into();
} else {
- match host.split_once('.') {
- Some((a, "dfs.core.windows.net")) | Some((a,
"blob.core.windows.net")) => {
- self.account_name = Some(validate(a)?);
- self.container_name =
Some(validate(parsed.username())?);
- }
- Some((a, "dfs.fabric.microsoft.com")) | Some((a,
"blob.fabric.microsoft.net")) => {
- self.account_name = Some(validate(a)?);
- self.container_name =
Some(validate(parsed.username())?);
- self.use_fabric_endpoint = true.into();
- }
- _ => return Err(Error::UrlNotRecognised { url:
url.into() }.into())
-
- }
+ return Err(Error::UrlNotRecognised { url: url.into()
}.into());
}
}
- "https" => match host.split_once('.') {
- Some((a, "dfs.core.windows.net")) | Some((a,
"blob.core.windows.net")) => {
- self.account_name = Some(validate(a)?);
- let container =
parsed.path_segments().unwrap().next().expect(
- "iterator always contains at least one string (which
may be empty)",
- );
- if !container.is_empty() {
- self.container_name = Some(validate(container)?);
+ "https" => {
+ // Regex to match WS-PL FQDN:
+ // "{workspaceid}.z??.(onelake|dfs|blob).fabric.microsoft.com"
+ static WS_PL_REGEX: OnceLock<Regex> = OnceLock::new();
+
+ let ws_pl_regex = WS_PL_REGEX.get_or_init(|| {
+ Regex::new(
+
r"(?i)^(?P<workspaceid>[0-9a-f]{32})\.z(?P<xy>[0-9a-f]{2})\.(onelake|dfs|blob)\.fabric\.microsoft\.com$"
Review Comment:
What is the rationale for using regexp math here rather than just hostname
splitting? The examples / tests you have added don't seem to test for invalid
URLs / that don't follow a simple <container>.<account>... format
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]