This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 4e60cddb1c Percent Decode URL Paths (#8009) (#8012)
4e60cddb1c is described below
commit 4e60cddb1caa8f3f552a6f9d4aa6fd3a3653fb58
Author: Raphael Taylor-Davies <[email protected]>
AuthorDate: Wed Nov 1 12:14:52 2023 +0000
Percent Decode URL Paths (#8009) (#8012)
* Treat ListingTableUrl as URL-encoded (#8009)
* Update lockfile
* Review feedback
---
datafusion-cli/Cargo.lock | 1 -
datafusion/core/Cargo.toml | 1 -
datafusion/core/src/datasource/listing/url.rs | 48 +++++++++++++++++++++------
3 files changed, 38 insertions(+), 12 deletions(-)
diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock
index 268f9e2842..dc828f018f 100644
--- a/datafusion-cli/Cargo.lock
+++ b/datafusion-cli/Cargo.lock
@@ -1130,7 +1130,6 @@ dependencies = [
"object_store",
"parking_lot",
"parquet",
- "percent-encoding",
"pin-project-lite",
"rand",
"sqlparser",
diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml
index a961493467..f1542fca00 100644
--- a/datafusion/core/Cargo.toml
+++ b/datafusion/core/Cargo.toml
@@ -82,7 +82,6 @@ num_cpus = "1.13.0"
object_store = "0.7.0"
parking_lot = "0.12"
parquet = { workspace = true, optional = true }
-percent-encoding = "2.2.0"
pin-project-lite = "^0.2.7"
rand = "0.8"
sqlparser = { workspace = true }
diff --git a/datafusion/core/src/datasource/listing/url.rs
b/datafusion/core/src/datasource/listing/url.rs
index 4d1ca4853a..9197e37adb 100644
--- a/datafusion/core/src/datasource/listing/url.rs
+++ b/datafusion/core/src/datasource/listing/url.rs
@@ -27,7 +27,6 @@ use itertools::Itertools;
use log::debug;
use object_store::path::Path;
use object_store::{ObjectMeta, ObjectStore};
-use percent_encoding;
use std::sync::Arc;
use url::Url;
@@ -46,6 +45,16 @@ pub struct ListingTableUrl {
impl ListingTableUrl {
/// Parse a provided string as a `ListingTableUrl`
///
+ /// # URL Encoding
+ ///
+ /// URL paths are expected to be URL-encoded. That is, the URL for a file
named `bar%2Efoo`
+ /// would be `file:///bar%252Efoo`, as per the [URL] specification.
+ ///
+ /// It should be noted that some tools, such as the AWS CLI, take a
different approach and
+ /// instead interpret the URL path verbatim. For example the object
`bar%2Efoo` would be
+ /// addressed as `s3://BUCKET/bar%252Efoo` using [`ListingTableUrl`] but
`s3://BUCKET/bar%2Efoo`
+ /// when using the aws-cli.
+ ///
/// # Paths without a Scheme
///
/// If no scheme is provided, or the string is an absolute filesystem path
@@ -77,6 +86,7 @@ impl ListingTableUrl {
/// filter when listing files from object storage
///
/// [file URI]: https://en.wikipedia.org/wiki/File_URI_scheme
+ /// [URL]: https://url.spec.whatwg.org/
pub fn parse(s: impl AsRef<str>) -> Result<Self> {
let s = s.as_ref();
@@ -86,7 +96,7 @@ impl ListingTableUrl {
}
match Url::parse(s) {
- Ok(url) => Ok(Self::new(url, None)),
+ Ok(url) => Self::try_new(url, None),
Err(url::ParseError::RelativeUrlWithoutBase) =>
Self::parse_path(s),
Err(e) => Err(DataFusionError::External(Box::new(e))),
}
@@ -138,15 +148,13 @@ impl ListingTableUrl {
.map_err(|_| DataFusionError::Internal(format!("Can not open path:
{s}")))?;
// TODO: Currently we do not have an IO-related error variant that
accepts ()
// or a string. Once we have such a variant, change the error
type above.
- Ok(Self::new(url, glob))
+ Self::try_new(url, glob)
}
/// Creates a new [`ListingTableUrl`] from a url and optional glob
expression
- fn new(url: Url, glob: Option<Pattern>) -> Self {
- let decoded_path =
-
percent_encoding::percent_decode_str(url.path()).decode_utf8_lossy();
- let prefix = Path::from(decoded_path.as_ref());
- Self { url, prefix, glob }
+ fn try_new(url: Url, glob: Option<Pattern>) -> Result<Self> {
+ let prefix = Path::from_url_path(url.path())?;
+ Ok(Self { url, prefix, glob })
}
/// Returns the URL scheme
@@ -286,6 +294,7 @@ fn split_glob_expression(path: &str) -> Option<(&str,
&str)> {
#[cfg(test)]
mod tests {
use super::*;
+ use tempfile::tempdir;
#[test]
fn test_prefix_path() {
@@ -317,8 +326,27 @@ mod tests {
let url = ListingTableUrl::parse("file:///foo/bar?").unwrap();
assert_eq!(url.prefix.as_ref(), "foo/bar");
- let url = ListingTableUrl::parse("file:///foo/😺").unwrap();
- assert_eq!(url.prefix.as_ref(), "foo/%F0%9F%98%BA");
+ let err = ListingTableUrl::parse("file:///foo/😺").unwrap_err();
+ assert_eq!(err.to_string(), "Object Store error: Encountered object
with invalid path: Error parsing Path \"/foo/😺\": Encountered illegal character
sequence \"😺\" whilst parsing path segment \"😺\"");
+
+ let url = ListingTableUrl::parse("file:///foo/bar%2Efoo").unwrap();
+ assert_eq!(url.prefix.as_ref(), "foo/bar.foo");
+
+ let url = ListingTableUrl::parse("file:///foo/bar%2Efoo").unwrap();
+ assert_eq!(url.prefix.as_ref(), "foo/bar.foo");
+
+ let url = ListingTableUrl::parse("file:///foo/bar%252Ffoo").unwrap();
+ assert_eq!(url.prefix.as_ref(), "foo/bar%2Ffoo");
+
+ let url = ListingTableUrl::parse("file:///foo/a%252Fb.txt").unwrap();
+ assert_eq!(url.prefix.as_ref(), "foo/a%2Fb.txt");
+
+ let dir = tempdir().unwrap();
+ let path = dir.path().join("bar%2Ffoo");
+ std::fs::File::create(&path).unwrap();
+
+ let url = ListingTableUrl::parse(path.to_str().unwrap()).unwrap();
+ assert!(url.prefix.as_ref().ends_with("bar%2Ffoo"), "{}", url.prefix);
}
#[test]