roeap commented on code in PR #4200: URL: https://github.com/apache/arrow-rs/pull/4200#discussion_r1191736849
########## object_store/src/parse.rs: ########## @@ -0,0 +1,240 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::local::LocalFileSystem; +use crate::memory::InMemory; +use crate::path::Path; +use crate::ObjectStore; +use snafu::Snafu; +use url::Url; + +#[derive(Debug, Snafu)] +enum Error { + #[snafu(display("Unable to convert URL \"{}\" to filesystem path", url))] + InvalidUrl { url: Url }, + + #[snafu(display("Unable to recognise URL \"{}\"", url))] + Unrecognised { url: Url }, + + #[snafu(display("Feature {scheme:?} not enabled"))] + NotEnabled { scheme: ObjectStoreScheme }, +} + +impl From<Error> for super::Error { + fn from(e: Error) -> Self { + Self::Generic { + store: "URL", + source: Box::new(e), + } + } +} + +/// Recognises various URL formats, identifying the relevant [`ObjectStore`](crate::ObjectStore) +#[derive(Debug, Eq, PartialEq)] +pub enum ObjectStoreScheme { + /// Url corresponding to [`LocalFileSystem`](crate::local::LocalFileSystem) + Local, + /// Url corresponding to [`InMemory`](crate::memory::InMemory) + Memory, + /// Url corresponding to [`AmazonS3`](crate::aws::AmazonS3) + AmazonS3, + /// Url corresponding to [`GoogleCloudStorage`](crate::gcp::GoogleCloudStorage) + GoogleCloudStorage, + /// Url corresponding to [`MicrosoftAzure`](crate::azure::MicrosoftAzure) + MicrosoftAzure, + /// Url corresponding to [`HttpStore`](crate::http::HttpStore) + Http, +} + +impl ObjectStoreScheme { + /// Create an [`ObjectStoreScheme`] from the provided [`Url`] + fn parse(url: &Url) -> Result<Self, Error> { + match (url.scheme(), url.host_str()) { + ("file", None) => Ok(Self::Local), + ("memory", None) => Ok(Self::Memory), + ("s3" | "s3a", Some(_)) => Ok(Self::AmazonS3), + ("gs", Some(_)) => Ok(Self::GoogleCloudStorage), + ("az" | "adl" | "azure" | "abfs" | "abfss", Some(_)) => { + Ok(Self::MicrosoftAzure) + } + ("http", Some(_)) => Ok(Self::Http), + ("https", Some(host)) => { + if host.ends_with("dfs.core.windows.net") + || host.ends_with("blob.core.windows.net") + { + Ok(Self::MicrosoftAzure) + } else if host.ends_with("amazonaws.com") + || host.ends_with("r2.cloudflarestorage.com") + { + Ok(Self::AmazonS3) + } else { + Ok(Self::Http) + } + } + _ => Err(Error::Unrecognised { url: url.clone() }), + } + } +} + +#[cfg(any(feature = "aws", feature = "gcp", feature = "azure", feature = "http"))] +macro_rules! builder_opts { + ($builder:ty, $url:expr, $options:expr) => {{ + let builder = $options.into_iter().fold( + <$builder>::from_env().with_url($url.as_str()), Review Comment: This is actually something we have been struggling a bit with, and I am unsure what the best solution is without introducing too much complexity. Essentially we end up in a situation where any valid credentials configured in the environment could take precedence over a credentials passed via the storage options. Priority would be based on the order in which we check credentials during build. To work around this I was contemplating several options, none of which really resonated so far. but they all boil down to having some sort of notion on what combinations of config options constitute a credential, then checking if the options contain a full credential and skipping parsing from env if they do. Things get a bit messier if the options contain a partial credential that can be imputed via the environment, as especially in the azure case there are several permutations. Within delta-rs we are unfortunately in a situation right now, where this is affecting users. This becomes even more relevant as we begin to introduce catalog integrations where different locations are associated with different credentials. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
