This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 53d5df2d5c Don't canonicalize ListingTableUrl (#8014)
53d5df2d5c is described below
commit 53d5df2d5c97667729ac94dde9f1a3c1d5d3b4e0
Author: Raphael Taylor-Davies <[email protected]>
AuthorDate: Mon Nov 20 13:15:00 2023 +0000
Don't canonicalize ListingTableUrl (#8014)
---
datafusion-cli/src/exec.rs | 10 +--
datafusion/core/src/datasource/listing/url.rs | 110 +++++++++++++++++++++-----
2 files changed, 91 insertions(+), 29 deletions(-)
diff --git a/datafusion-cli/src/exec.rs b/datafusion-cli/src/exec.rs
index 14ac22687b..1869e15ef5 100644
--- a/datafusion-cli/src/exec.rs
+++ b/datafusion-cli/src/exec.rs
@@ -388,15 +388,7 @@ mod tests {
// Ensure that local files are also registered
let sql =
format!("CREATE EXTERNAL TABLE test STORED AS PARQUET LOCATION
'{location}'");
- let err = create_external_table_test(location, &sql)
- .await
- .unwrap_err();
-
- if let DataFusionError::IoError(e) = err {
- assert_eq!(e.kind(), std::io::ErrorKind::NotFound);
- } else {
- return Err(err);
- }
+ create_external_table_test(location, &sql).await.unwrap();
Ok(())
}
diff --git a/datafusion/core/src/datasource/listing/url.rs
b/datafusion/core/src/datasource/listing/url.rs
index 45845916a9..9e9fb92100 100644
--- a/datafusion/core/src/datasource/listing/url.rs
+++ b/datafusion/core/src/datasource/listing/url.rs
@@ -45,6 +45,17 @@ pub struct ListingTableUrl {
impl ListingTableUrl {
/// Parse a provided string as a `ListingTableUrl`
///
+ /// A URL can either refer to a single object, or a collection of objects
with a
+ /// common prefix, with the presence of a trailing `/` indicating a
collection.
+ ///
+ /// For example, `file:///foo.txt` refers to the file at `/foo.txt`,
whereas
+ /// `file:///foo/` refers to all the files under the directory `/foo` and
its
+ /// subdirectories.
+ ///
+ /// Similarly `s3://BUCKET/blob.csv` refers to `blob.csv` in the S3 bucket
`BUCKET`,
+ /// wherease `s3://BUCKET/foo/` refers to all objects with the prefix
`foo/` in the
+ /// S3 bucket `BUCKET`
+ ///
/// # URL Encoding
///
/// URL paths are expected to be URL-encoded. That is, the URL for a file
named `bar%2Efoo`
@@ -58,19 +69,21 @@ impl ListingTableUrl {
/// # Paths without a Scheme
///
/// If no scheme is provided, or the string is an absolute filesystem path
- /// as determined [`std::path::Path::is_absolute`], the string will be
+ /// as determined by [`std::path::Path::is_absolute`], the string will be
/// interpreted as a path on the local filesystem using the operating
/// system's standard path delimiter, i.e. `\` on Windows, `/` on Unix.
///
/// If the path contains any of `'?', '*', '['`, it will be considered
/// a glob expression and resolved as described in the section below.
///
- /// Otherwise, the path will be resolved to an absolute path, returning
- /// an error if it does not exist, and converted to a [file URI]
+ /// Otherwise, the path will be resolved to an absolute path based on the
current
+ /// working directory, and converted to a [file URI].
///
- /// If you wish to specify a path that does not exist on the local
- /// machine you must provide it as a fully-qualified [file URI]
- /// e.g. `file:///myfile.txt`
+ /// If the path already exists in the local filesystem this will be used
to determine if this
+ /// [`ListingTableUrl`] refers to a collection or a single object,
otherwise the presence
+ /// of a trailing path delimiter will be used to indicate a directory. For
the avoidance
+ /// of ambiguity it is recommended users always include trailing `/` when
intending to
+ /// refer to a directory.
///
/// ## Glob File Paths
///
@@ -78,9 +91,7 @@ impl ListingTableUrl {
/// be resolved as follows.
///
/// The string up to the first path segment containing a glob expression
will be extracted,
- /// and resolved in the same manner as a normal scheme-less path. That is,
resolved to
- /// an absolute path on the local filesystem, returning an error if it
does not exist,
- /// and converted to a [file URI]
+ /// and resolved in the same manner as a normal scheme-less path above.
///
/// The remaining string will be interpreted as a [`glob::Pattern`] and
used as a
/// filter when listing files from object storage
@@ -130,7 +141,7 @@ impl ListingTableUrl {
/// Creates a new [`ListingTableUrl`] interpreting `s` as a filesystem path
fn parse_path(s: &str) -> Result<Self> {
- let (prefix, glob) = match split_glob_expression(s) {
+ let (path, glob) = match split_glob_expression(s) {
Some((prefix, glob)) => {
let glob = Pattern::new(glob)
.map_err(|e| DataFusionError::External(Box::new(e)))?;
@@ -139,15 +150,12 @@ impl ListingTableUrl {
None => (s, None),
};
- let path = std::path::Path::new(prefix).canonicalize()?;
- let url = if path.is_dir() {
- Url::from_directory_path(path)
- } else {
- Url::from_file_path(path)
- }
- .map_err(|_| DataFusionError::Internal(format!("Can not open path:
{s}")))?;
- // TODO: Currently we do not have an IO-related error variant that
accepts ()
- // or a string. Once we have such a variant, change the error
type above.
+ let url = url_from_filesystem_path(path).ok_or_else(|| {
+ DataFusionError::External(
+ format!("Failed to convert path to URL: {path}").into(),
+ )
+ })?;
+
Self::try_new(url, glob)
}
@@ -162,7 +170,10 @@ impl ListingTableUrl {
self.url.scheme()
}
- /// Return the prefix from which to list files
+ /// Return the URL path not excluding any glob expression
+ ///
+ /// If [`Self::is_collection`], this is the listing prefix
+ /// Otherwise, this is the path to the object
pub fn prefix(&self) -> &Path {
&self.prefix
}
@@ -249,6 +260,34 @@ impl ListingTableUrl {
}
}
+/// Creates a file URL from a potentially relative filesystem path
+fn url_from_filesystem_path(s: &str) -> Option<Url> {
+ let path = std::path::Path::new(s);
+ let is_dir = match path.exists() {
+ true => path.is_dir(),
+ // Fallback to inferring from trailing separator
+ false => std::path::is_separator(s.chars().last()?),
+ };
+
+ let from_absolute_path = |p| {
+ let first = match is_dir {
+ true => Url::from_directory_path(p).ok(),
+ false => Url::from_file_path(p).ok(),
+ }?;
+
+ // By default from_*_path preserve relative path segments
+ // We therefore parse the URL again to resolve these
+ Url::parse(first.as_str()).ok()
+ };
+
+ if path.is_absolute() {
+ return from_absolute_path(path);
+ }
+
+ let absolute = std::env::current_dir().ok()?.join(path);
+ from_absolute_path(&absolute)
+}
+
impl AsRef<str> for ListingTableUrl {
fn as_ref(&self) -> &str {
self.url.as_ref()
@@ -349,6 +388,37 @@ mod tests {
let url = ListingTableUrl::parse(path.to_str().unwrap()).unwrap();
assert!(url.prefix.as_ref().ends_with("bar%2Ffoo"), "{}", url.prefix);
+
+ let url =
ListingTableUrl::parse("file:///foo/../a%252Fb.txt").unwrap();
+ assert_eq!(url.prefix.as_ref(), "a%2Fb.txt");
+
+ let url =
+
ListingTableUrl::parse("file:///foo/./bar/../../baz/./test.txt").unwrap();
+ assert_eq!(url.prefix.as_ref(), "baz/test.txt");
+
+ let workdir = std::env::current_dir().unwrap();
+ let t = workdir.join("non-existent");
+ let a = ListingTableUrl::parse(t.to_str().unwrap()).unwrap();
+ let b = ListingTableUrl::parse("non-existent").unwrap();
+ assert_eq!(a, b);
+ assert!(a.prefix.as_ref().ends_with("non-existent"));
+
+ let t = workdir.parent().unwrap();
+ let a = ListingTableUrl::parse(t.to_str().unwrap()).unwrap();
+ let b = ListingTableUrl::parse("..").unwrap();
+ assert_eq!(a, b);
+
+ let t = t.join("bar");
+ let a = ListingTableUrl::parse(t.to_str().unwrap()).unwrap();
+ let b = ListingTableUrl::parse("../bar").unwrap();
+ assert_eq!(a, b);
+ assert!(a.prefix.as_ref().ends_with("bar"));
+
+ let t = t.join(".").join("foo").join("..").join("baz");
+ let a = ListingTableUrl::parse(t.to_str().unwrap()).unwrap();
+ let b = ListingTableUrl::parse("../bar/./foo/../baz").unwrap();
+ assert_eq!(a, b);
+ assert!(a.prefix.as_ref().ends_with("bar/baz"));
}
#[test]