This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 7281a0c167 Relax path safety (#5019) (#5020)
7281a0c167 is described below
commit 7281a0c167554d5fa69055a3f3ee46108254c9c9
Author: Raphael Taylor-Davies <[email protected]>
AuthorDate: Thu Nov 2 10:27:53 2023 +0000
Relax path safety (#5019) (#5020)
* Relax path safety (#5019)
* Review feedback
* WASM
---
object_store/src/lib.rs | 17 ++++
object_store/src/local.rs | 174 +++++++++++++++++++++++++++++++----------
object_store/src/path/mod.rs | 59 +++++++-------
object_store/src/path/parts.rs | 23 ++----
4 files changed, 184 insertions(+), 89 deletions(-)
diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs
index 1b94f816b1..cdd572dd9b 100644
--- a/object_store/src/lib.rs
+++ b/object_store/src/lib.rs
@@ -1442,6 +1442,23 @@ mod tests {
storage.delete(&path).await.unwrap();
+ // Test handling of unicode paths
+ let path = Path::parse("🇦🇺/$shenanigans@@~.txt").unwrap();
+ storage.put(&path, "test".into()).await.unwrap();
+
+ let r = storage.get(&path).await.unwrap();
+ assert_eq!(r.bytes().await.unwrap(), "test");
+
+ let dir = Path::parse("🇦🇺").unwrap();
+ let r = storage.list_with_delimiter(None).await.unwrap();
+ assert!(r.common_prefixes.contains(&dir));
+
+ let r = storage.list_with_delimiter(Some(&dir)).await.unwrap();
+ assert_eq!(r.objects.len(), 1);
+ assert_eq!(r.objects[0].location, path);
+
+ storage.delete(&path).await.unwrap();
+
// Can also write non-percent encoded sequences
let path = Path::parse("%Q.parquet").unwrap();
storage.put(&path, Bytes::from(vec![0, 1])).await.unwrap();
diff --git a/object_store/src/local.rs b/object_store/src/local.rs
index 1a87dc33c7..e5c4e32046 100644
--- a/object_store/src/local.rs
+++ b/object_store/src/local.rs
@@ -144,6 +144,11 @@ pub(crate) enum Error {
path: PathBuf,
source: io::Error,
},
+
+ #[snafu(display("Filenames containing trailing '/#\\d+/' are not
supported: {}", path))]
+ InvalidPath {
+ path: String,
+ },
}
impl From<Error> for super::Error {
@@ -176,6 +181,30 @@ impl From<Error> for super::Error {
/// [file URI]: https://en.wikipedia.org/wiki/File_URI_scheme
/// [RFC 3986]: https://www.rfc-editor.org/rfc/rfc3986
///
+/// # Path Semantics
+///
+/// [`LocalFileSystem`] will expose the path semantics of the underlying
filesystem, which may
+/// have additional restrictions beyond those enforced by [`Path`].
+///
+/// For example:
+///
+/// * Windows forbids certain filenames, e.g. `COM0`,
+/// * Windows forbids folders with trailing `.`
+/// * Windows forbids certain ASCII characters, e.g. `<` or `|`
+/// * OS X forbids filenames containing `:`
+/// * Leading `-` are discouraged on Unix systems where they may be
interpreted as CLI flags
+/// * Filesystems may have restrictions on the maximum path or path segment
length
+/// * Filesystem support for non-ASCII characters is inconsistent
+///
+/// Additionally some filesystems, such as NTFS, are case-insensitive, whilst
others like
+/// FAT don't preserve case at all. Further some filesystems support
non-unicode character
+/// sequences, such as unpaired UTF-16 surrogates, and [`LocalFileSystem`]
will error on
+/// encountering such sequences.
+///
+/// Finally, filenames matching the regex `/.*#\d+/`, e.g. `foo.parquet#123`,
are not supported
+/// by [`LocalFileSystem`] as they are used to provide atomic writes. Such
files will be ignored
+/// for listing operations, and attempting to address such a file will error.
+///
/// # Tokio Compatibility
///
/// Tokio discourages performing blocking IO on a tokio worker thread, however,
@@ -196,6 +225,11 @@ impl From<Error> for super::Error {
/// * Mutating a file through one or more symlinks will mutate the underlying
file
/// * Deleting a path that resolves to a symlink will only delete the symlink
///
+/// # Cross-Filesystem Copy
+///
+/// [`LocalFileSystem::copy`] is implemented using [`std::fs::hard_link`], and
therefore
+/// does not support copying across filesystem boundaries.
+///
#[derive(Debug)]
pub struct LocalFileSystem {
config: Arc<Config>,
@@ -246,8 +280,19 @@ impl LocalFileSystem {
}
impl Config {
- /// Return an absolute filesystem path of the given location
+ /// Return an absolute filesystem path of the given file location
fn path_to_filesystem(&self, location: &Path) -> Result<PathBuf> {
+ ensure!(
+ is_valid_file_path(location),
+ InvalidPathSnafu {
+ path: location.as_ref()
+ }
+ );
+ self.prefix_to_filesystem(location)
+ }
+
+ /// Return an absolute filesystem path of the given location
+ fn prefix_to_filesystem(&self, location: &Path) -> Result<PathBuf> {
let mut url = self.root.clone();
url.path_segments_mut()
.expect("url path")
@@ -269,6 +314,19 @@ impl Config {
}
}
+fn is_valid_file_path(path: &Path) -> bool {
+ match path.filename() {
+ Some(p) => match p.split_once('#') {
+ Some((_, suffix)) if !suffix.is_empty() => {
+ // Valid if contains non-digits
+ !suffix.as_bytes().iter().all(|x| x.is_ascii_digit())
+ }
+ _ => true,
+ },
+ None => false,
+ }
+}
+
#[async_trait]
impl ObjectStore for LocalFileSystem {
async fn put_opts(&self, location: &Path, bytes: Bytes, opts: PutOptions)
-> Result<PutResult> {
@@ -406,7 +464,7 @@ impl ObjectStore for LocalFileSystem {
let config = Arc::clone(&self.config);
let root_path = match prefix {
- Some(prefix) => match config.path_to_filesystem(prefix) {
+ Some(prefix) => match config.prefix_to_filesystem(prefix) {
Ok(path) => path,
Err(e) => return
futures::future::ready(Err(e)).into_stream().boxed(),
},
@@ -419,20 +477,21 @@ impl ObjectStore for LocalFileSystem {
.follow_links(true);
let s = walkdir.into_iter().flat_map(move |result_dir_entry| {
- match convert_walkdir_result(result_dir_entry) {
+ let entry = match
convert_walkdir_result(result_dir_entry).transpose()? {
+ Ok(entry) => entry,
+ Err(e) => return Some(Err(e)),
+ };
+
+ if !entry.path().is_file() {
+ return None;
+ }
+
+ match config.filesystem_to_path(entry.path()) {
+ Ok(path) => match is_valid_file_path(&path) {
+ true => Some(convert_entry(entry, path)),
+ false => None,
+ },
Err(e) => Some(Err(e)),
- Ok(None) => None,
- Ok(entry @ Some(_)) => entry
- .filter(|dir_entry| {
- dir_entry.file_type().is_file()
- // Ignore file names with # in them, since they
might be in-progress uploads.
- // They would be rejected anyways by
filesystem_to_path below.
- &&
!dir_entry.file_name().to_string_lossy().contains('#')
- })
- .map(|entry| {
- let location =
config.filesystem_to_path(entry.path())?;
- convert_entry(entry, location)
- }),
}
});
@@ -473,7 +532,7 @@ impl ObjectStore for LocalFileSystem {
let config = Arc::clone(&self.config);
let prefix = prefix.cloned().unwrap_or_default();
- let resolved_prefix = config.path_to_filesystem(&prefix)?;
+ let resolved_prefix = config.prefix_to_filesystem(&prefix)?;
maybe_spawn_blocking(move || {
let walkdir = WalkDir::new(&resolved_prefix)
@@ -486,15 +545,11 @@ impl ObjectStore for LocalFileSystem {
for entry_res in walkdir.into_iter().map(convert_walkdir_result) {
if let Some(entry) = entry_res? {
- if entry.file_type().is_file()
- // Ignore file names with # in them, since they might
be in-progress uploads.
- // They would be rejected anyways by
filesystem_to_path below.
- && entry.file_name().to_string_lossy().contains('#')
- {
- continue;
- }
let is_directory = entry.file_type().is_dir();
let entry_location =
config.filesystem_to_path(entry.path())?;
+ if !is_directory && !is_valid_file_path(&entry_location) {
+ continue;
+ }
let mut parts = match entry_location.prefix_match(&prefix)
{
Some(parts) => parts,
@@ -1325,26 +1380,18 @@ mod tests {
assert!(result.common_prefixes.is_empty());
assert_eq!(result.objects[0].location, object);
- let illegal = root.join("💀");
- std::fs::write(illegal, "foo").unwrap();
-
- // Can list directory that doesn't contain illegal path
- flatten_list_stream(&integration, Some(&directory))
- .await
- .unwrap();
+ let emoji = root.join("💀");
+ std::fs::write(emoji, "foo").unwrap();
- // Cannot list illegal file
- let err = flatten_list_stream(&integration, None)
- .await
- .unwrap_err()
- .to_string();
+ // Can list illegal file
+ let paths = flatten_list_stream(&integration, None).await.unwrap();
- assert!(
- err.contains(
- "Encountered illegal character sequence \"💀\" whilst parsing
path segment \"💀\""
- ),
- "{}",
- err
+ assert_eq!(
+ paths,
+ vec![
+ Path::parse("💀").unwrap(),
+ Path::parse("directory/child.txt").unwrap()
+ ]
);
}
@@ -1403,6 +1450,51 @@ mod tests {
let path = Path::from_filesystem_path(".").unwrap();
integration.list_with_delimiter(Some(&path)).await.unwrap();
}
+
+ #[test]
+ fn test_valid_path() {
+ let cases = [
+ ("foo#123/test.txt", true),
+ ("foo#123/test#23.txt", true),
+ ("foo#123/test#34", false),
+ ("foo😁/test#34", false),
+ ("foo/test#😁34", true),
+ ];
+
+ for (case, expected) in cases {
+ let path = Path::parse(case).unwrap();
+ assert_eq!(is_valid_file_path(&path), expected);
+ }
+ }
+
+ #[tokio::test]
+ async fn test_intermediate_files() {
+ let root = TempDir::new().unwrap();
+ let integration =
LocalFileSystem::new_with_prefix(root.path()).unwrap();
+
+ let a = Path::parse("foo#123/test.txt").unwrap();
+ integration.put(&a, "test".into()).await.unwrap();
+
+ let list = flatten_list_stream(&integration, None).await.unwrap();
+ assert_eq!(list, vec![a.clone()]);
+
+ std::fs::write(root.path().join("bar#123"), "test").unwrap();
+
+ // Should ignore file
+ let list = flatten_list_stream(&integration, None).await.unwrap();
+ assert_eq!(list, vec![a.clone()]);
+
+ let b = Path::parse("bar#123").unwrap();
+ let err = integration.get(&b).await.unwrap_err().to_string();
+ assert_eq!(err, "Generic LocalFileSystem error: Filenames containing
trailing '/#\\d+/' are not supported: bar#123");
+
+ let c = Path::parse("foo#123.txt").unwrap();
+ integration.put(&c, "test".into()).await.unwrap();
+
+ let mut list = flatten_list_stream(&integration, None).await.unwrap();
+ list.sort_unstable();
+ assert_eq!(list, vec![c, a]);
+ }
}
#[cfg(not(target_arch = "wasm32"))]
diff --git a/object_store/src/path/mod.rs b/object_store/src/path/mod.rs
index e065c31d31..f914862bc5 100644
--- a/object_store/src/path/mod.rs
+++ b/object_store/src/path/mod.rs
@@ -65,10 +65,23 @@ pub enum Error {
/// A parsed path representation that can be safely written to object storage
///
-/// # Path Safety
+/// A [`Path`] maintains the following invariants:
+///
+/// * Paths are delimited by `/`
+/// * Paths do not contain leading or trailing `/`
+/// * Paths do not contain relative path segments, i.e. `.` or `..`
+/// * Paths do not contain empty path segments
+/// * Paths do not contain any ASCII control characters
+///
+/// There are no enforced restrictions on path length, however, it should be
noted that most
+/// object stores do not permit paths longer than 1024 bytes, and many
filesystems do not
+/// support path segments longer than 255 bytes.
+///
+/// # Encode
///
/// In theory object stores support any UTF-8 character sequence, however,
certain character
-/// sequences cause compatibility problems with some applications and
protocols. As such the
+/// sequences cause compatibility problems with some applications and
protocols. Additionally
+/// some filesystems may impose character restrictions, see
[`LocalFileSystem`]. As such the
/// naming guidelines for [S3], [GCS] and [Azure Blob Storage] all recommend
sticking to a
/// limited character subset.
///
@@ -76,34 +89,16 @@ pub enum Error {
/// [GCS]: https://cloud.google.com/storage/docs/naming-objects
/// [Azure Blob Storage]:
https://docs.microsoft.com/en-us/rest/api/storageservices/Naming-and-Referencing-Containers--Blobs--and-Metadata#blob-names
///
-/// This presents libraries with two options for consistent path handling:
-///
-/// 1. Allow constructing unsafe paths, allowing for both reading and writing
of data to paths
-/// that may not be consistently understood or supported
-/// 2. Disallow constructing unsafe paths, ensuring data written can be
consistently handled by
-/// all other systems, but preventing interaction with objects at unsafe paths
-///
-/// This library takes the second approach, in particular:
-///
-/// * Paths are delimited by `/`
-/// * Paths do not start with a `/`
-/// * Empty path segments are discarded (e.g. `//` is treated as though it
were `/`)
-/// * Relative path segments, i.e. `.` and `..` are percent encoded
-/// * Unsafe characters are percent encoded, as described by [RFC 1738]
-/// * All paths are relative to the root of the object store
-///
-/// In order to provide these guarantees there are two ways to safely
construct a [`Path`]
-///
-/// # Encode
-///
-/// A string containing potentially illegal path segments can be encoded to a
[`Path`]
-/// using [`Path::from`] or [`Path::from_iter`].
+/// A string containing potentially problematic path segments can therefore be
encoded to a [`Path`]
+/// using [`Path::from`] or [`Path::from_iter`]. This will percent encode any
problematic
+/// segments according to [RFC 1738].
///
/// ```
/// # use object_store::path::Path;
/// assert_eq!(Path::from("foo/bar").as_ref(), "foo/bar");
/// assert_eq!(Path::from("foo//bar").as_ref(), "foo/bar");
/// assert_eq!(Path::from("foo/../bar").as_ref(), "foo/%2E%2E/bar");
+/// assert_eq!(Path::from("/").as_ref(), "");
/// assert_eq!(Path::from_iter(["foo", "foo/bar"]).as_ref(), "foo/foo%2Fbar");
/// ```
///
@@ -116,20 +111,20 @@ pub enum Error {
///
/// # Parse
///
-/// Alternatively a [`Path`] can be created from an existing string, returning
an
-/// error if it is invalid. Unlike the encoding methods, this will permit
-/// valid percent encoded sequences.
+/// Alternatively a [`Path`] can be parsed from an existing string, returning
an
+/// error if it is invalid. Unlike the encoding methods above, this will permit
+/// arbitrary unicode, including percent encoded sequences.
///
/// ```
/// # use object_store::path::Path;
-///
/// assert_eq!(Path::parse("/foo/foo%2Fbar").unwrap().as_ref(),
"foo/foo%2Fbar");
-/// Path::parse("..").unwrap_err();
-/// Path::parse("/foo//").unwrap_err();
-/// Path::parse("😀").unwrap_err();
+/// Path::parse("..").unwrap_err(); // Relative path segments are disallowed
+/// Path::parse("/foo//").unwrap_err(); // Empty path segments are disallowed
+/// Path::parse("\x00").unwrap_err(); // ASCII control characters are
disallowed
/// ```
///
/// [RFC 1738]: https://www.ietf.org/rfc/rfc1738.txt
+/// [`LocalFileSystem`]: crate::local::LocalFileSystem
#[derive(Debug, Clone, Default, PartialEq, Eq, Hash, Ord, PartialOrd)]
pub struct Path {
/// The raw path with no leading or trailing delimiters
@@ -236,7 +231,7 @@ impl Path {
pub fn filename(&self) -> Option<&str> {
match self.raw.is_empty() {
true => None,
- false => self.raw.split(DELIMITER).last(),
+ false => self.raw.rsplit(DELIMITER).next(),
}
}
diff --git a/object_store/src/path/parts.rs b/object_store/src/path/parts.rs
index 9da4815712..df7097cbe9 100644
--- a/object_store/src/path/parts.rs
+++ b/object_store/src/path/parts.rs
@@ -37,8 +37,10 @@ pub struct InvalidPart {
/// The PathPart type exists to validate the directory/file names that form
part
/// of a path.
///
-/// A PathPart instance is guaranteed to to contain no illegal characters
(e.g. `/`)
-/// as it can only be constructed by going through the `from` impl.
+/// A [`PathPart`] is guaranteed to:
+///
+/// * Contain no ASCII control characters or `/`
+/// * Not be a relative path segment, i.e. `.` or `..`
#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Default, Hash)]
pub struct PathPart<'a> {
pub(super) raw: Cow<'a, str>,
@@ -54,19 +56,12 @@ impl<'a> PathPart<'a> {
});
}
- for (idx, b) in segment.as_bytes().iter().cloned().enumerate() {
- // A percent character is always valid, even if not
- // followed by a valid 2-digit hex code
- // https://url.spec.whatwg.org/#percent-encoded-bytes
- if b == b'%' {
- continue;
- }
-
- if !b.is_ascii() || should_percent_encode(b) {
+ for c in segment.chars() {
+ if c.is_ascii_control() || c == '/' {
return Err(InvalidPart {
segment: segment.to_string(),
// This is correct as only single byte characters up to
this point
- illegal: segment.chars().nth(idx).unwrap().to_string(),
+ illegal: c.to_string(),
});
}
}
@@ -77,10 +72,6 @@ impl<'a> PathPart<'a> {
}
}
-fn should_percent_encode(c: u8) -> bool {
- percent_encode(&[c], INVALID).next().unwrap().len() != 1
-}
-
/// Characters we want to encode.
const INVALID: &AsciiSet = &CONTROLS
// The delimiter we are reserving for internal hierarchy