tustvold commented on code in PR #362:
URL: https://github.com/apache/arrow-rs-object-store/pull/362#discussion_r2088319276


##########
src/parse.rs:
##########
@@ -43,6 +43,162 @@ impl From<Error> for super::Error {
     }
 }
 
+/// Path to an object in an ObjectStore
+#[non_exhaustive] // permit new variants
+#[derive(Debug, Eq, PartialEq, Clone)]
+pub struct ObjectStoreUrl {
+    scheme: ObjectStoreScheme,
+    store: Url,
+    path: Path,
+}
+
+impl ObjectStoreUrl {
+    /// Parse a URL into an ObjectStoreUrl
+    ///
+    /// Extracts the relevant ObjectStoreScheme, store URL (for identification),
+    /// and path within the store.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// # use url::Url;
+    /// # use object_store::{ObjectStoreScheme, ObjectStoreUrl};
+    /// let url: Url = "s3://bucket/path/to/file".parse().unwrap();
+    /// let os_url = ObjectStoreUrl::parse(&url).unwrap();
+    /// assert_eq!(os_url.scheme(), ObjectStoreScheme::AmazonS3);
+    /// assert_eq!(os_url.store().as_str(), "s3://bucket/");
+    /// assert_eq!(os_url.path().as_ref(), "path/to/file");
+    /// ```
+    pub fn parse(url: &Url) -> Result<Self, Error> {
+        let strip_bucket = || Some(url.path().strip_prefix('/')?.split_once('/')?.1);
+        let store_with_bucket = || {

Review Comment:
   IIRC you can pop segments off a URL
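   For reference, a minimal sketch of that segment-based approach (not the PR's code; a hypothetical helper using the `url` crate's `PathSegmentsMut` API):

   ```rust
   use url::Url;

   /// Derive the store URL by keeping only the bucket segment of a cloned URL,
   /// instead of re-parsing a formatted string.
   fn store_with_bucket(url: &Url) -> Option<Url> {
       let bucket = url.path_segments()?.next()?.to_owned();
       let mut store = url.clone();
       store.set_query(None);
       store.set_fragment(None);
       // path_segments_mut() errors for cannot-be-a-base URLs
       store.path_segments_mut().ok()?.clear().push(&bucket);
       Some(store)
   }
   ```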



##########
src/parse.rs:
##########
@@ -43,6 +43,162 @@ impl From<Error> for super::Error {
     }
 }
 
+/// Path to an object in an ObjectStore
+#[non_exhaustive] // permit new variants
+#[derive(Debug, Eq, PartialEq, Clone)]
+pub struct ObjectStoreUrl {
+    scheme: ObjectStoreScheme,
+    store: Url,
+    path: Path,
+}
+
+impl ObjectStoreUrl {
+    /// Parse a URL into an ObjectStoreUrl
+    ///
+    /// Extracts the relevant ObjectStoreScheme, store URL (for identification),
+    /// and path within the store.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// # use url::Url;
+    /// # use object_store::{ObjectStoreScheme, ObjectStoreUrl};
+    /// let url: Url = "s3://bucket/path/to/file".parse().unwrap();
+    /// let os_url = ObjectStoreUrl::parse(&url).unwrap();
+    /// assert_eq!(os_url.scheme(), ObjectStoreScheme::AmazonS3);
+    /// assert_eq!(os_url.store().as_str(), "s3://bucket/");
+    /// assert_eq!(os_url.path().as_ref(), "path/to/file");
+    /// ```
+    pub fn parse(url: &Url) -> Result<Self, Error> {
+        let strip_bucket = || Some(url.path().strip_prefix('/')?.split_once('/')?.1);
+        let store_with_bucket = || {
+            let bucket = url
+                .path_segments()
+                .map(|mut segments| segments.next())
+                .flatten()
+                .ok_or(Error::Unrecognised { url: url.clone() })?;
+            Url::parse(&format!(
+                "{}/{}",
+                &url[url::Position::BeforeScheme..url::Position::AfterPort],
+                bucket,
+            ))
+            .map_err(|_| Error::Unrecognised { url: url.clone() })
+        };
+
+        let (scheme, store_url, path) = match (url.scheme(), url.host_str()) {
+            ("file", None) => (
+                ObjectStoreScheme::Local,
+                url[url::Position::BeforeScheme..url::Position::AfterHost].to_string(),

Review Comment:
   Is there a reason we don't just use BeforePath? I worry we're discarding 
part of the URL that could matter
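   For comparison, a quick illustration (hypothetical URL, not from the PR) of what AfterHost drops relative to BeforePath:

   ```rust
   use url::{Position, Url};

   fn main() {
       let url = Url::parse("http://user:pass@example.com:8080/path").unwrap();
       // AfterHost stops before the port...
       assert_eq!(
           &url[Position::BeforeScheme..Position::AfterHost],
           "http://user:pass@example.com"
       );
       // ...while BeforePath keeps everything up to (but excluding) the path
       assert_eq!(
           &url[Position::BeforeScheme..Position::BeforePath],
           "http://user:pass@example.com:8080"
       );
   }
   ```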



##########
src/parse.rs:
##########
@@ -43,6 +43,162 @@ impl From<Error> for super::Error {
     }
 }
 
+/// Path to an object in an ObjectStore
+#[non_exhaustive] // permit new variants
+#[derive(Debug, Eq, PartialEq, Clone)]
+pub struct ObjectStoreUrl {
+    scheme: ObjectStoreScheme,
+    store: Url,
+    path: Path,
+}
+
+impl ObjectStoreUrl {
+    /// Parse a URL into an ObjectStoreUrl
+    ///
+    /// Extracts the relevant ObjectStoreScheme, store URL (for identification),
+    /// and path within the store.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// # use url::Url;
+    /// # use object_store::{ObjectStoreScheme, ObjectStoreUrl};
+    /// let url: Url = "s3://bucket/path/to/file".parse().unwrap();
+    /// let os_url = ObjectStoreUrl::parse(&url).unwrap();
+    /// assert_eq!(os_url.scheme(), ObjectStoreScheme::AmazonS3);
+    /// assert_eq!(os_url.store().as_str(), "s3://bucket/");
+    /// assert_eq!(os_url.path().as_ref(), "path/to/file");
+    /// ```
+    pub fn parse(url: &Url) -> Result<Self, Error> {
+        let strip_bucket = || Some(url.path().strip_prefix('/')?.split_once('/')?.1);
+        let store_with_bucket = || {
+            let bucket = url
+                .path_segments()
+                .map(|mut segments| segments.next())
+                .flatten()
+                .ok_or(Error::Unrecognised { url: url.clone() })?;
+            Url::parse(&format!(
+                "{}/{}",
+                &url[url::Position::BeforeScheme..url::Position::AfterPort],
+                bucket,
+            ))
+            .map_err(|_| Error::Unrecognised { url: url.clone() })
+        };
+
+        let (scheme, store_url, path) = match (url.scheme(), url.host_str()) {
+            ("file", None) => (
+                ObjectStoreScheme::Local,
+                url[url::Position::BeforeScheme..url::Position::AfterHost].to_string(),
+                &url[url::Position::BeforePath..url::Position::AfterPath],
+            ),
+            ("memory", None) => (
+                ObjectStoreScheme::Memory,
+                url[url::Position::BeforeScheme..url::Position::AfterHost].to_string(),
+                &url[url::Position::BeforePath..url::Position::AfterPath],
+            ),
+            ("s3" | "s3a", Some(_)) => (
+                ObjectStoreScheme::AmazonS3,
+                url[url::Position::BeforeScheme..url::Position::AfterHost].to_string(),
+                url.path(),
+            ),
+            ("gs", Some(_)) => (
+                ObjectStoreScheme::GoogleCloudStorage,
+                url[url::Position::BeforeScheme..url::Position::AfterHost].to_string(),
+                url.path(),
+            ),
+            ("az" | "adl" | "azure" | "abfs" | "abfss", Some(_)) => (

Review Comment:
   IIRC there is a pre-existing bug where the bucket is part of the path for 
some of these, check AzureBuilder
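   For context, a quick check (hypothetical, not from the PR) of where the container lands for two of the URL forms the Azure builder documents, using the url crate alone; whether that matches what MicrosoftAzureBuilder expects is what needs verifying:

   ```rust
   use url::Url;

   fn main() {
       // az://container/path – the container parses as the host
       let az = Url::parse("az://container/path/to/file").unwrap();
       assert_eq!(az.host_str(), Some("container"));
       assert_eq!(az.path(), "/path/to/file");

       // abfss://container@account.dfs.core.windows.net/path – the container
       // parses as the userinfo and the host is the account endpoint
       let abfss = Url::parse("abfss://container@account.dfs.core.windows.net/path").unwrap();
       assert_eq!(abfss.username(), "container");
       assert_eq!(abfss.host_str(), Some("account.dfs.core.windows.net"));
       assert_eq!(abfss.path(), "/path");
   }
   ```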



##########
src/parse.rs:
##########
@@ -43,6 +43,162 @@ impl From<Error> for super::Error {
     }
 }
 
+/// Path to an object in an ObjectStore
+#[non_exhaustive] // permit new variants
+#[derive(Debug, Eq, PartialEq, Clone)]
+pub struct ObjectStoreUrl {
+    scheme: ObjectStoreScheme,
+    store: Url,
+    path: Path,
+}
+
+impl ObjectStoreUrl {
+    /// Parse a URL into an ObjectStoreUrl
+    ///
+    /// Extracts the relevant ObjectStoreScheme, store URL (for identification),
+    /// and path within the store.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// # use url::Url;
+    /// # use object_store::{ObjectStoreScheme, ObjectStoreUrl};
+    /// let url: Url = "s3://bucket/path/to/file".parse().unwrap();
+    /// let os_url = ObjectStoreUrl::parse(&url).unwrap();
+    /// assert_eq!(os_url.scheme(), ObjectStoreScheme::AmazonS3);
+    /// assert_eq!(os_url.store().as_str(), "s3://bucket/");
+    /// assert_eq!(os_url.path().as_ref(), "path/to/file");
+    /// ```
+    pub fn parse(url: &Url) -> Result<Self, Error> {
+        let strip_bucket = || Some(url.path().strip_prefix('/')?.split_once('/')?.1);
+        let store_with_bucket = || {
+            let bucket = url
+                .path_segments()
+                .map(|mut segments| segments.next())
+                .flatten()
+                .ok_or(Error::Unrecognised { url: url.clone() })?;
+            Url::parse(&format!(
+                "{}/{}",
+                &url[url::Position::BeforeScheme..url::Position::AfterPort],
+                bucket,
+            ))
+            .map_err(|_| Error::Unrecognised { url: url.clone() })
+        };
+
+        let (scheme, store_url, path) = match (url.scheme(), url.host_str()) {
+            ("file", None) => (
+                ObjectStoreScheme::Local,
+                url[url::Position::BeforeScheme..url::Position::AfterHost].to_string(),
+                &url[url::Position::BeforePath..url::Position::AfterPath],
+            ),
+            ("memory", None) => (
+                ObjectStoreScheme::Memory,
+                url[url::Position::BeforeScheme..url::Position::AfterHost].to_string(),
+                &url[url::Position::BeforePath..url::Position::AfterPath],
+            ),
+            ("s3" | "s3a", Some(_)) => (
+                ObjectStoreScheme::AmazonS3,
+                url[url::Position::BeforeScheme..url::Position::AfterHost].to_string(),
+                url.path(),
+            ),
+            ("gs", Some(_)) => (
+                ObjectStoreScheme::GoogleCloudStorage,
+                url[url::Position::BeforeScheme..url::Position::AfterHost].to_string(),
+                url.path(),
+            ),
+            ("az" | "adl" | "azure" | "abfs" | "abfss", Some(_)) => (
+                ObjectStoreScheme::MicrosoftAzure,
+                url[url::Position::BeforeScheme..url::Position::AfterHost].to_string(),
+                url.path(),
+            ),
+            ("http", Some(_)) => (
+                ObjectStoreScheme::Http,
+                url[url::Position::BeforeScheme..url::Position::AfterPort].to_string(),
+                url.path(),
+            ),
+            ("https", Some(host)) => {
+                if host.ends_with("dfs.core.windows.net")
+                    || host.ends_with("blob.core.windows.net")
+                    || host.ends_with("dfs.fabric.microsoft.com")
+                    || host.ends_with("blob.fabric.microsoft.com")
+                {
+                    (
+                        ObjectStoreScheme::MicrosoftAzure,
+                        url[url::Position::BeforeScheme..url::Position::AfterHost].to_string(),
+                        url.path(),
+                    )
+                } else if host.ends_with("amazonaws.com") {
+                    match host.starts_with("s3") {
+                        true => {
+                            let scheme_host_port_bucket = store_with_bucket()?;
+                            eprintln!("scheme_host_port_bucket: {}", scheme_host_port_bucket);

Review Comment:
   ?



##########
src/parse.rs:
##########
@@ -43,6 +43,162 @@ impl From<Error> for super::Error {
     }
 }
 
+/// Path to an object in an ObjectStore
+#[non_exhaustive] // permit new variants
+#[derive(Debug, Eq, PartialEq, Clone)]
+pub struct ObjectStoreUrl {
+    scheme: ObjectStoreScheme,
+    store: Url,
+    path: Path,
+}
+
+impl ObjectStoreUrl {
+    /// Parse a URL into an ObjectStoreUrl
+    ///
+    /// Extracts the relevant ObjectStoreScheme, store URL (for identification),
+    /// and path within the store.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// # use url::Url;
+    /// # use object_store::{ObjectStoreScheme, ObjectStoreUrl};
+    /// let url: Url = "s3://bucket/path/to/file".parse().unwrap();
+    /// let os_url = ObjectStoreUrl::parse(&url).unwrap();
+    /// assert_eq!(os_url.scheme(), ObjectStoreScheme::AmazonS3);
+    /// assert_eq!(os_url.store().as_str(), "s3://bucket/");
+    /// assert_eq!(os_url.path().as_ref(), "path/to/file");
+    /// ```
+    pub fn parse(url: &Url) -> Result<Self, Error> {
+        let strip_bucket = || Some(url.path().strip_prefix('/')?.split_once('/')?.1);
+        let store_with_bucket = || {
+            let bucket = url
+                .path_segments()
+                .map(|mut segments| segments.next())
+                .flatten()
+                .ok_or(Error::Unrecognised { url: url.clone() })?;
+            Url::parse(&format!(
+                "{}/{}",
+                &url[url::Position::BeforeScheme..url::Position::AfterPort],
+                bucket,
+            ))
+            .map_err(|_| Error::Unrecognised { url: url.clone() })
+        };
+
+        let (scheme, store_url, path) = match (url.scheme(), url.host_str()) {
+            ("file", None) => (
+                ObjectStoreScheme::Local,
+                url[url::Position::BeforeScheme..url::Position::AfterHost].to_string(),
+                &url[url::Position::BeforePath..url::Position::AfterPath],
+            ),
+            ("memory", None) => (
+                ObjectStoreScheme::Memory,
+                url[url::Position::BeforeScheme..url::Position::AfterHost].to_string(),
+                &url[url::Position::BeforePath..url::Position::AfterPath],
+            ),
+            ("s3" | "s3a", Some(_)) => (
+                ObjectStoreScheme::AmazonS3,
+                url[url::Position::BeforeScheme..url::Position::AfterHost].to_string(),
+                url.path(),
+            ),
+            ("gs", Some(_)) => (
+                ObjectStoreScheme::GoogleCloudStorage,
+                url[url::Position::BeforeScheme..url::Position::AfterHost].to_string(),
+                url.path(),
+            ),
+            ("az" | "adl" | "azure" | "abfs" | "abfss", Some(_)) => (
+                ObjectStoreScheme::MicrosoftAzure,
+                url[url::Position::BeforeScheme..url::Position::AfterHost].to_string(),
+                url.path(),
+            ),
+            ("http", Some(_)) => (
+                ObjectStoreScheme::Http,
+                url[url::Position::BeforeScheme..url::Position::AfterPort].to_string(),
+                url.path(),
+            ),
+            ("https", Some(host)) => {
+                if host.ends_with("dfs.core.windows.net")
+                    || host.ends_with("blob.core.windows.net")
+                    || host.ends_with("dfs.fabric.microsoft.com")
+                    || host.ends_with("blob.fabric.microsoft.com")
+                {
+                    (
+                        ObjectStoreScheme::MicrosoftAzure,
+                        url[url::Position::BeforeScheme..url::Position::AfterHost].to_string(),
+                        url.path(),
+                    )
+                } else if host.ends_with("amazonaws.com") {
+                    match host.starts_with("s3") {
+                        true => {
+                            let scheme_host_port_bucket = store_with_bucket()?;
+                            eprintln!("scheme_host_port_bucket: {}", scheme_host_port_bucket);
+                            (
+                                ObjectStoreScheme::AmazonS3,
+                                scheme_host_port_bucket.as_str().to_string(),
+                                strip_bucket().unwrap_or_default(),
+                            )
+                        }
+                        false => (
+                            ObjectStoreScheme::AmazonS3,
+                            url[url::Position::BeforeScheme..url::Position::AfterHost].to_string(),
+                            url.path(),
+                        ),
+                    }
+                } else if host.ends_with("r2.cloudflarestorage.com") {
+                    let scheme_host_port_bucket = store_with_bucket()?;
+                    (
+                        ObjectStoreScheme::AmazonS3,
+                        scheme_host_port_bucket.as_str().to_string(),
+                        strip_bucket().unwrap_or_default(),
+                    )
+                } else {
+                    (
+                        ObjectStoreScheme::Http,
+                        url[url::Position::BeforeScheme..url::Position::AfterPort].to_string(),
+                        url.path(),
+                    )
+                }
+            }
+            _ => return Err(Error::Unrecognised { url: url.clone() }),
+        };
+
+        let mut store_url =
+            Url::parse(&store_url).map_err(|_| Error::Unrecognised { url: url.clone() })?;
+
+        // Ensure path ends with a '/'
+        if !store_url.path().ends_with('/') {
+            store_url = Url::parse(&format!("{}/", store_url))
+                .map_err(|_| Error::Unrecognised { url: url.clone() })?;
+        }
+
+        // Strip password, leave username

Review Comment:
   The intent is that the store URL could be passed to parse_url or similar, so I don't think we should do this
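   A sketch of the round-trip being described, assuming `store()` hands back the `&Url` shown in the doc example above (hypothetical usage, not part of the PR):

   ```rust
   use object_store::{parse_url, path::Path, ObjectStore, ObjectStoreUrl};

   // Hand the store URL back to object_store::parse_url to build a store for it;
   // anything stripped from that URL here would not be visible downstream.
   fn store_for(os_url: &ObjectStoreUrl) -> object_store::Result<(Box<dyn ObjectStore>, Path)> {
       parse_url(os_url.store())
   }
   ```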



##########
src/parse.rs:
##########
@@ -43,6 +43,162 @@ impl From<Error> for super::Error {
     }
 }
 
+/// Path to an object in an ObjectStore
+#[non_exhaustive] // permit new variants

Review Comment:
   This is unnecessary as the struct has private fields
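   For illustration (hypothetical downstream code): with private fields the struct already cannot be constructed or exhaustively matched outside the crate, which is what the attribute would otherwise guard against:

   ```rust
   use object_store::ObjectStoreUrl;

   fn demo(u: ObjectStoreUrl) {
       // Neither of these compiles outside object_store, #[non_exhaustive] or not,
       // because `scheme`, `store` and `path` are private (E0451):
       // let copy = ObjectStoreUrl { scheme: ..., store: ..., path: ... };
       // let ObjectStoreUrl { scheme, .. } = u;
       let _ = u;
   }
   ```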



##########
src/parse.rs:
##########
@@ -43,6 +43,162 @@ impl From<Error> for super::Error {
     }
 }
 
+/// Path to an object in an ObjectStore
+#[non_exhaustive] // permit new variants
+#[derive(Debug, Eq, PartialEq, Clone)]
+pub struct ObjectStoreUrl {
+    scheme: ObjectStoreScheme,
+    store: Url,
+    path: Path,
+}
+
+impl ObjectStoreUrl {
+    /// Parse a URL into an ObjectStoreUrl
+    ///
+    /// Extracts the relevant ObjectStoreScheme, store URL (for identification),
+    /// and path within the store.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// # use url::Url;
+    /// # use object_store::{ObjectStoreScheme, ObjectStoreUrl};
+    /// let url: Url = "s3://bucket/path/to/file".parse().unwrap();
+    /// let os_url = ObjectStoreUrl::parse(&url).unwrap();
+    /// assert_eq!(os_url.scheme(), ObjectStoreScheme::AmazonS3);
+    /// assert_eq!(os_url.store().as_str(), "s3://bucket/");
+    /// assert_eq!(os_url.path().as_ref(), "path/to/file");
+    /// ```
+    pub fn parse(url: &Url) -> Result<Self, Error> {
+        let strip_bucket = || Some(url.path().strip_prefix('/')?.split_once('/')?.1);
+        let store_with_bucket = || {
+            let bucket = url
+                .path_segments()
+                .map(|mut segments| segments.next())
+                .flatten()
+                .ok_or(Error::Unrecognised { url: url.clone() })?;
+            Url::parse(&format!(
+                "{}/{}",
+                &url[url::Position::BeforeScheme..url::Position::AfterPort],
+                bucket,
+            ))
+            .map_err(|_| Error::Unrecognised { url: url.clone() })
+        };
+
+        let (scheme, store_url, path) = match (url.scheme(), url.host_str()) {
+            ("file", None) => (
+                ObjectStoreScheme::Local,
+                url[url::Position::BeforeScheme..url::Position::AfterHost].to_string(),
+                &url[url::Position::BeforePath..url::Position::AfterPath],
+            ),
+            ("memory", None) => (
+                ObjectStoreScheme::Memory,
+                url[url::Position::BeforeScheme..url::Position::AfterHost].to_string(),
+                &url[url::Position::BeforePath..url::Position::AfterPath],
+            ),
+            ("s3" | "s3a", Some(_)) => (
+                ObjectStoreScheme::AmazonS3,
+                url[url::Position::BeforeScheme..url::Position::AfterHost].to_string(),
+                url.path(),
+            ),
+            ("gs", Some(_)) => (
+                ObjectStoreScheme::GoogleCloudStorage,
+                url[url::Position::BeforeScheme..url::Position::AfterHost].to_string(),
+                url.path(),
+            ),
+            ("az" | "adl" | "azure" | "abfs" | "abfss", Some(_)) => (
+                ObjectStoreScheme::MicrosoftAzure,
+                url[url::Position::BeforeScheme..url::Position::AfterHost].to_string(),
+                url.path(),
+            ),
+            ("http", Some(_)) => (
+                ObjectStoreScheme::Http,
+                url[url::Position::BeforeScheme..url::Position::AfterPort].to_string(),
+                url.path(),
+            ),
+            ("https", Some(host)) => {
+                if host.ends_with("dfs.core.windows.net")
+                    || host.ends_with("blob.core.windows.net")
+                    || host.ends_with("dfs.fabric.microsoft.com")
+                    || host.ends_with("blob.fabric.microsoft.com")
+                {
+                    (
+                        ObjectStoreScheme::MicrosoftAzure,
+                        url[url::Position::BeforeScheme..url::Position::AfterHost].to_string(),
+                        url.path(),
+                    )
+                } else if host.ends_with("amazonaws.com") {
+                    match host.starts_with("s3") {
+                        true => {
+                            let scheme_host_port_bucket = store_with_bucket()?;
+                            eprintln!("scheme_host_port_bucket: {}", scheme_host_port_bucket);
+                            (
+                                ObjectStoreScheme::AmazonS3,
+                                scheme_host_port_bucket.as_str().to_string(),
+                                strip_bucket().unwrap_or_default(),
+                            )
+                        }
+                        false => (
+                            ObjectStoreScheme::AmazonS3,
+                            url[url::Position::BeforeScheme..url::Position::AfterHost].to_string(),
+                            url.path(),
+                        ),
+                    }
+                } else if host.ends_with("r2.cloudflarestorage.com") {
+                    let scheme_host_port_bucket = store_with_bucket()?;
+                    (
+                        ObjectStoreScheme::AmazonS3,
+                        scheme_host_port_bucket.as_str().to_string(),
+                        strip_bucket().unwrap_or_default(),
+                    )
+                } else {
+                    (
+                        ObjectStoreScheme::Http,
+                        url[url::Position::BeforeScheme..url::Position::AfterPort].to_string(),
+                        url.path(),
+                    )
+                }
+            }
+            _ => return Err(Error::Unrecognised { url: url.clone() }),
+        };
+
+        let mut store_url =
+            Url::parse(&store_url).map_err(|_| Error::Unrecognised { url: url.clone() })?;
+
+        // Ensure path ends with a '/'
+        if !store_url.path().ends_with('/') {

Review Comment:
   Is this necessary? I seem to remember URL handling this automatically
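   For what it's worth, a quick check with the url crate alone (assumed behaviour, worth confirming): the empty path of a special scheme is normalised to "/", but nothing appends a trailing slash to a non-empty path, so the explicit normalisation still looks necessary for Url::join-style uses of the store URL:

   ```rust
   use url::Url;

   fn main() {
       // Special schemes normalise an empty path to "/" ...
       assert_eq!(Url::parse("https://example.com").unwrap().path(), "/");
       // ...but a non-empty path is kept as given, with no trailing slash added
       assert_eq!(Url::parse("https://example.com/bucket").unwrap().path(), "/bucket");
       // Custom schemes like s3:// get no path normalisation at all
       println!("{:?}", Url::parse("s3://bucket").unwrap().path()); // expected: ""
   }
   ```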



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
