tustvold commented on code in PR #362: URL: https://github.com/apache/arrow-rs-object-store/pull/362#discussion_r2088319276
########## src/parse.rs: ########## @@ -43,6 +43,162 @@ impl From<Error> for super::Error { } } +/// Path to an object in an ObjectStore +#[non_exhaustive] // permit new variants +#[derive(Debug, Eq, PartialEq, Clone)] +pub struct ObjectStoreUrl { + scheme: ObjectStoreScheme, + store: Url, + path: Path, +} + +impl ObjectStoreUrl { + /// Parse a URL into an ObjectStoreUrl + /// + /// Extracts the relevant ObjectStoreScheme, store URL (for identification), + /// and path within the store. + /// + /// # Example + /// + /// ``` + /// # use url::Url; + /// # use object_store::{ObjectStoreScheme, ObjectStoreUrl}; + /// let url: Url = "s3://bucket/path/to/file".parse().unwrap(); + /// let os_url = ObjectStoreUrl::parse(&url).unwrap(); + /// assert_eq!(os_url.scheme(), ObjectStoreScheme::AmazonS3); + /// assert_eq!(os_url.store().as_str(), "s3://bucket/"); + /// assert_eq!(os_url.path().as_ref(), "path/to/file"); + /// ``` + pub fn parse(url: &Url) -> Result<Self, Error> { + let strip_bucket = || Some(url.path().strip_prefix('/')?.split_once('/')?.1); + let store_with_bucket = || { Review Comment: IIRC you can pop segments off a URL ########## src/parse.rs: ########## @@ -43,6 +43,162 @@ impl From<Error> for super::Error { } } +/// Path to an object in an ObjectStore +#[non_exhaustive] // permit new variants +#[derive(Debug, Eq, PartialEq, Clone)] +pub struct ObjectStoreUrl { + scheme: ObjectStoreScheme, + store: Url, + path: Path, +} + +impl ObjectStoreUrl { + /// Parse a URL into an ObjectStoreUrl + /// + /// Extracts the relevant ObjectStoreScheme, store URL (for identification), + /// and path within the store. 
+ /// + /// # Example + /// + /// ``` + /// # use url::Url; + /// # use object_store::{ObjectStoreScheme, ObjectStoreUrl}; + /// let url: Url = "s3://bucket/path/to/file".parse().unwrap(); + /// let os_url = ObjectStoreUrl::parse(&url).unwrap(); + /// assert_eq!(os_url.scheme(), ObjectStoreScheme::AmazonS3); + /// assert_eq!(os_url.store().as_str(), "s3://bucket/"); + /// assert_eq!(os_url.path().as_ref(), "path/to/file"); + /// ``` + pub fn parse(url: &Url) -> Result<Self, Error> { + let strip_bucket = || Some(url.path().strip_prefix('/')?.split_once('/')?.1); + let store_with_bucket = || { + let bucket = url + .path_segments() + .map(|mut segments| segments.next()) + .flatten() + .ok_or(Error::Unrecognised { url: url.clone() })?; + Url::parse(&format!( + "{}/{}", + &url[url::Position::BeforeScheme..url::Position::AfterPort], + bucket, + )) + .map_err(|_| Error::Unrecognised { url: url.clone() }) + }; + + let (scheme, store_url, path) = match (url.scheme(), url.host_str()) { + ("file", None) => ( + ObjectStoreScheme::Local, + url[url::Position::BeforeScheme..url::Position::AfterHost].to_string(), Review Comment: Is there a reason we don't just use BeforePath? I worry we're discarding part of the URL that could matter ########## src/parse.rs: ########## @@ -43,6 +43,162 @@ impl From<Error> for super::Error { } } +/// Path to an object in an ObjectStore +#[non_exhaustive] // permit new variants +#[derive(Debug, Eq, PartialEq, Clone)] +pub struct ObjectStoreUrl { + scheme: ObjectStoreScheme, + store: Url, + path: Path, +} + +impl ObjectStoreUrl { + /// Parse a URL into an ObjectStoreUrl + /// + /// Extracts the relevant ObjectStoreScheme, store URL (for identification), + /// and path within the store. 
+ /// + /// # Example + /// + /// ``` + /// # use url::Url; + /// # use object_store::{ObjectStoreScheme, ObjectStoreUrl}; + /// let url: Url = "s3://bucket/path/to/file".parse().unwrap(); + /// let os_url = ObjectStoreUrl::parse(&url).unwrap(); + /// assert_eq!(os_url.scheme(), ObjectStoreScheme::AmazonS3); + /// assert_eq!(os_url.store().as_str(), "s3://bucket/"); + /// assert_eq!(os_url.path().as_ref(), "path/to/file"); + /// ``` + pub fn parse(url: &Url) -> Result<Self, Error> { + let strip_bucket = || Some(url.path().strip_prefix('/')?.split_once('/')?.1); + let store_with_bucket = || { + let bucket = url + .path_segments() + .map(|mut segments| segments.next()) + .flatten() + .ok_or(Error::Unrecognised { url: url.clone() })?; + Url::parse(&format!( + "{}/{}", + &url[url::Position::BeforeScheme..url::Position::AfterPort], + bucket, + )) + .map_err(|_| Error::Unrecognised { url: url.clone() }) + }; + + let (scheme, store_url, path) = match (url.scheme(), url.host_str()) { + ("file", None) => ( + ObjectStoreScheme::Local, + url[url::Position::BeforeScheme..url::Position::AfterHost].to_string(), + &url[url::Position::BeforePath..url::Position::AfterPath], + ), + ("memory", None) => ( + ObjectStoreScheme::Memory, + url[url::Position::BeforeScheme..url::Position::AfterHost].to_string(), + &url[url::Position::BeforePath..url::Position::AfterPath], + ), + ("s3" | "s3a", Some(_)) => ( + ObjectStoreScheme::AmazonS3, + url[url::Position::BeforeScheme..url::Position::AfterHost].to_string(), + url.path(), + ), + ("gs", Some(_)) => ( + ObjectStoreScheme::GoogleCloudStorage, + url[url::Position::BeforeScheme..url::Position::AfterHost].to_string(), + url.path(), + ), + ("az" | "adl" | "azure" | "abfs" | "abfss", Some(_)) => ( Review Comment: IIRC there is a pre-existing bug where the bucket is part of the path for some of these, check AzureBuilder ########## src/parse.rs: ########## @@ -43,6 +43,162 @@ impl From<Error> for super::Error { } } +/// Path to an object in an 
ObjectStore +#[non_exhaustive] // permit new variants +#[derive(Debug, Eq, PartialEq, Clone)] +pub struct ObjectStoreUrl { + scheme: ObjectStoreScheme, + store: Url, + path: Path, +} + +impl ObjectStoreUrl { + /// Parse a URL into an ObjectStoreUrl + /// + /// Extracts the relevant ObjectStoreScheme, store URL (for identification), + /// and path within the store. + /// + /// # Example + /// + /// ``` + /// # use url::Url; + /// # use object_store::{ObjectStoreScheme, ObjectStoreUrl}; + /// let url: Url = "s3://bucket/path/to/file".parse().unwrap(); + /// let os_url = ObjectStoreUrl::parse(&url).unwrap(); + /// assert_eq!(os_url.scheme(), ObjectStoreScheme::AmazonS3); + /// assert_eq!(os_url.store().as_str(), "s3://bucket/"); + /// assert_eq!(os_url.path().as_ref(), "path/to/file"); + /// ``` + pub fn parse(url: &Url) -> Result<Self, Error> { + let strip_bucket = || Some(url.path().strip_prefix('/')?.split_once('/')?.1); + let store_with_bucket = || { + let bucket = url + .path_segments() + .map(|mut segments| segments.next()) + .flatten() + .ok_or(Error::Unrecognised { url: url.clone() })?; + Url::parse(&format!( + "{}/{}", + &url[url::Position::BeforeScheme..url::Position::AfterPort], + bucket, + )) + .map_err(|_| Error::Unrecognised { url: url.clone() }) + }; + + let (scheme, store_url, path) = match (url.scheme(), url.host_str()) { + ("file", None) => ( + ObjectStoreScheme::Local, + url[url::Position::BeforeScheme..url::Position::AfterHost].to_string(), + &url[url::Position::BeforePath..url::Position::AfterPath], + ), + ("memory", None) => ( + ObjectStoreScheme::Memory, + url[url::Position::BeforeScheme..url::Position::AfterHost].to_string(), + &url[url::Position::BeforePath..url::Position::AfterPath], + ), + ("s3" | "s3a", Some(_)) => ( + ObjectStoreScheme::AmazonS3, + url[url::Position::BeforeScheme..url::Position::AfterHost].to_string(), + url.path(), + ), + ("gs", Some(_)) => ( + ObjectStoreScheme::GoogleCloudStorage, + 
url[url::Position::BeforeScheme..url::Position::AfterHost].to_string(), + url.path(), + ), + ("az" | "adl" | "azure" | "abfs" | "abfss", Some(_)) => ( + ObjectStoreScheme::MicrosoftAzure, + url[url::Position::BeforeScheme..url::Position::AfterHost].to_string(), + url.path(), + ), + ("http", Some(_)) => ( + ObjectStoreScheme::Http, + url[url::Position::BeforeScheme..url::Position::AfterPort].to_string(), + url.path(), + ), + ("https", Some(host)) => { + if host.ends_with("dfs.core.windows.net") + || host.ends_with("blob.core.windows.net") + || host.ends_with("dfs.fabric.microsoft.com") + || host.ends_with("blob.fabric.microsoft.com") + { + ( + ObjectStoreScheme::MicrosoftAzure, + url[url::Position::BeforeScheme..url::Position::AfterHost].to_string(), + url.path(), + ) + } else if host.ends_with("amazonaws.com") { + match host.starts_with("s3") { + true => { + let scheme_host_port_bucket = store_with_bucket()?; + eprintln!("scheme_host_port_bucket: {}", scheme_host_port_bucket); Review Comment: ? ########## src/parse.rs: ########## @@ -43,6 +43,162 @@ impl From<Error> for super::Error { } } +/// Path to an object in an ObjectStore +#[non_exhaustive] // permit new variants +#[derive(Debug, Eq, PartialEq, Clone)] +pub struct ObjectStoreUrl { + scheme: ObjectStoreScheme, + store: Url, + path: Path, +} + +impl ObjectStoreUrl { + /// Parse a URL into an ObjectStoreUrl + /// + /// Extracts the relevant ObjectStoreScheme, store URL (for identification), + /// and path within the store. 
+ /// + /// # Example + /// + /// ``` + /// # use url::Url; + /// # use object_store::{ObjectStoreScheme, ObjectStoreUrl}; + /// let url: Url = "s3://bucket/path/to/file".parse().unwrap(); + /// let os_url = ObjectStoreUrl::parse(&url).unwrap(); + /// assert_eq!(os_url.scheme(), ObjectStoreScheme::AmazonS3); + /// assert_eq!(os_url.store().as_str(), "s3://bucket/"); + /// assert_eq!(os_url.path().as_ref(), "path/to/file"); + /// ``` + pub fn parse(url: &Url) -> Result<Self, Error> { + let strip_bucket = || Some(url.path().strip_prefix('/')?.split_once('/')?.1); + let store_with_bucket = || { + let bucket = url + .path_segments() + .map(|mut segments| segments.next()) + .flatten() + .ok_or(Error::Unrecognised { url: url.clone() })?; + Url::parse(&format!( + "{}/{}", + &url[url::Position::BeforeScheme..url::Position::AfterPort], + bucket, + )) + .map_err(|_| Error::Unrecognised { url: url.clone() }) + }; + + let (scheme, store_url, path) = match (url.scheme(), url.host_str()) { + ("file", None) => ( + ObjectStoreScheme::Local, + url[url::Position::BeforeScheme..url::Position::AfterHost].to_string(), + &url[url::Position::BeforePath..url::Position::AfterPath], + ), + ("memory", None) => ( + ObjectStoreScheme::Memory, + url[url::Position::BeforeScheme..url::Position::AfterHost].to_string(), + &url[url::Position::BeforePath..url::Position::AfterPath], + ), + ("s3" | "s3a", Some(_)) => ( + ObjectStoreScheme::AmazonS3, + url[url::Position::BeforeScheme..url::Position::AfterHost].to_string(), + url.path(), + ), + ("gs", Some(_)) => ( + ObjectStoreScheme::GoogleCloudStorage, + url[url::Position::BeforeScheme..url::Position::AfterHost].to_string(), + url.path(), + ), + ("az" | "adl" | "azure" | "abfs" | "abfss", Some(_)) => ( + ObjectStoreScheme::MicrosoftAzure, + url[url::Position::BeforeScheme..url::Position::AfterHost].to_string(), + url.path(), + ), + ("http", Some(_)) => ( + ObjectStoreScheme::Http, + 
url[url::Position::BeforeScheme..url::Position::AfterPort].to_string(), + url.path(), + ), + ("https", Some(host)) => { + if host.ends_with("dfs.core.windows.net") + || host.ends_with("blob.core.windows.net") + || host.ends_with("dfs.fabric.microsoft.com") + || host.ends_with("blob.fabric.microsoft.com") + { + ( + ObjectStoreScheme::MicrosoftAzure, + url[url::Position::BeforeScheme..url::Position::AfterHost].to_string(), + url.path(), + ) + } else if host.ends_with("amazonaws.com") { + match host.starts_with("s3") { + true => { + let scheme_host_port_bucket = store_with_bucket()?; + eprintln!("scheme_host_port_bucket: {}", scheme_host_port_bucket); + ( + ObjectStoreScheme::AmazonS3, + scheme_host_port_bucket.as_str().to_string(), + strip_bucket().unwrap_or_default(), + ) + } + false => ( + ObjectStoreScheme::AmazonS3, + url[url::Position::BeforeScheme..url::Position::AfterHost].to_string(), + url.path(), + ), + } + } else if host.ends_with("r2.cloudflarestorage.com") { + let scheme_host_port_bucket = store_with_bucket()?; + ( + ObjectStoreScheme::AmazonS3, + scheme_host_port_bucket.as_str().to_string(), + strip_bucket().unwrap_or_default(), + ) + } else { + ( + ObjectStoreScheme::Http, + url[url::Position::BeforeScheme..url::Position::AfterPort].to_string(), + url.path(), + ) + } + } + _ => return Err(Error::Unrecognised { url: url.clone() }), + }; + + let mut store_url = + Url::parse(&store_url).map_err(|_| Error::Unrecognised { url: url.clone() })?; + + // Ensure path ends with a '/' + if !store_url.path().ends_with('/') { + store_url = Url::parse(&format!("{}/", store_url)) + .map_err(|_| Error::Unrecognised { url: url.clone() })?; + } + + // Strip password, leave username Review Comment: The intent is the store URL could be passed to parse_url or similar, I therefore don't think we should do this ########## src/parse.rs: ########## @@ -43,6 +43,162 @@ impl From<Error> for super::Error { } } +/// Path to an object in an ObjectStore +#[non_exhaustive] // permit 
new variants Review Comment: This is unnecessary as the struct has private fields ########## src/parse.rs: ########## @@ -43,6 +43,162 @@ impl From<Error> for super::Error { } } +/// Path to an object in an ObjectStore +#[non_exhaustive] // permit new variants +#[derive(Debug, Eq, PartialEq, Clone)] +pub struct ObjectStoreUrl { + scheme: ObjectStoreScheme, + store: Url, + path: Path, +} + +impl ObjectStoreUrl { + /// Parse a URL into an ObjectStoreUrl + /// + /// Extracts the relevant ObjectStoreScheme, store URL (for identification), + /// and path within the store. + /// + /// # Example + /// + /// ``` + /// # use url::Url; + /// # use object_store::{ObjectStoreScheme, ObjectStoreUrl}; + /// let url: Url = "s3://bucket/path/to/file".parse().unwrap(); + /// let os_url = ObjectStoreUrl::parse(&url).unwrap(); + /// assert_eq!(os_url.scheme(), ObjectStoreScheme::AmazonS3); + /// assert_eq!(os_url.store().as_str(), "s3://bucket/"); + /// assert_eq!(os_url.path().as_ref(), "path/to/file"); + /// ``` + pub fn parse(url: &Url) -> Result<Self, Error> { + let strip_bucket = || Some(url.path().strip_prefix('/')?.split_once('/')?.1); + let store_with_bucket = || { + let bucket = url + .path_segments() + .map(|mut segments| segments.next()) + .flatten() + .ok_or(Error::Unrecognised { url: url.clone() })?; + Url::parse(&format!( + "{}/{}", + &url[url::Position::BeforeScheme..url::Position::AfterPort], + bucket, + )) + .map_err(|_| Error::Unrecognised { url: url.clone() }) + }; + + let (scheme, store_url, path) = match (url.scheme(), url.host_str()) { + ("file", None) => ( + ObjectStoreScheme::Local, + url[url::Position::BeforeScheme..url::Position::AfterHost].to_string(), + &url[url::Position::BeforePath..url::Position::AfterPath], + ), + ("memory", None) => ( + ObjectStoreScheme::Memory, + url[url::Position::BeforeScheme..url::Position::AfterHost].to_string(), + &url[url::Position::BeforePath..url::Position::AfterPath], + ), + ("s3" | "s3a", Some(_)) => ( + 
ObjectStoreScheme::AmazonS3, + url[url::Position::BeforeScheme..url::Position::AfterHost].to_string(), + url.path(), + ), + ("gs", Some(_)) => ( + ObjectStoreScheme::GoogleCloudStorage, + url[url::Position::BeforeScheme..url::Position::AfterHost].to_string(), + url.path(), + ), + ("az" | "adl" | "azure" | "abfs" | "abfss", Some(_)) => ( + ObjectStoreScheme::MicrosoftAzure, + url[url::Position::BeforeScheme..url::Position::AfterHost].to_string(), + url.path(), + ), + ("http", Some(_)) => ( + ObjectStoreScheme::Http, + url[url::Position::BeforeScheme..url::Position::AfterPort].to_string(), + url.path(), + ), + ("https", Some(host)) => { + if host.ends_with("dfs.core.windows.net") + || host.ends_with("blob.core.windows.net") + || host.ends_with("dfs.fabric.microsoft.com") + || host.ends_with("blob.fabric.microsoft.com") + { + ( + ObjectStoreScheme::MicrosoftAzure, + url[url::Position::BeforeScheme..url::Position::AfterHost].to_string(), + url.path(), + ) + } else if host.ends_with("amazonaws.com") { + match host.starts_with("s3") { + true => { + let scheme_host_port_bucket = store_with_bucket()?; + eprintln!("scheme_host_port_bucket: {}", scheme_host_port_bucket); + ( + ObjectStoreScheme::AmazonS3, + scheme_host_port_bucket.as_str().to_string(), + strip_bucket().unwrap_or_default(), + ) + } + false => ( + ObjectStoreScheme::AmazonS3, + url[url::Position::BeforeScheme..url::Position::AfterHost].to_string(), + url.path(), + ), + } + } else if host.ends_with("r2.cloudflarestorage.com") { + let scheme_host_port_bucket = store_with_bucket()?; + ( + ObjectStoreScheme::AmazonS3, + scheme_host_port_bucket.as_str().to_string(), + strip_bucket().unwrap_or_default(), + ) + } else { + ( + ObjectStoreScheme::Http, + url[url::Position::BeforeScheme..url::Position::AfterPort].to_string(), + url.path(), + ) + } + } + _ => return Err(Error::Unrecognised { url: url.clone() }), + }; + + let mut store_url = + Url::parse(&store_url).map_err(|_| Error::Unrecognised { url: url.clone() })?; 
+ + // Ensure path ends with a '/' + if !store_url.path().ends_with('/') { Review Comment: Is this necessary? I seem to remember URL handling this automatically -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org