(arrow-rs) branch master updated: Relax path safety (#5019) (#5020)

tustvold Thu, 02 Nov 2023 03:28:03 -0700

This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git



The following commit(s) were added to refs/heads/master by this push:
     new 7281a0c167 Relax path safety (#5019) (#5020)
7281a0c167 is described below

commit 7281a0c167554d5fa69055a3f3ee46108254c9c9
Author: Raphael Taylor-Davies <[email protected]>
AuthorDate: Thu Nov 2 10:27:53 2023 +0000

    Relax path safety (#5019) (#5020)
    
    * Relax path safety (#5019)
    
    * Review feedback
    
    * WASM
---
 object_store/src/lib.rs        |  17 ++++
 object_store/src/local.rs      | 174 +++++++++++++++++++++++++++++++----------
 object_store/src/path/mod.rs   |  59 +++++++-------
 object_store/src/path/parts.rs |  23 ++----
 4 files changed, 184 insertions(+), 89 deletions(-)

diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs
index 1b94f816b1..cdd572dd9b 100644
--- a/object_store/src/lib.rs
+++ b/object_store/src/lib.rs
@@ -1442,6 +1442,23 @@ mod tests {
 
         storage.delete(&path).await.unwrap();
 
+        // Test handling of unicode paths
+        let path = Path::parse("🇦🇺/$shenanigans@@~.txt").unwrap();
+        storage.put(&path, "test".into()).await.unwrap();
+
+        let r = storage.get(&path).await.unwrap();
+        assert_eq!(r.bytes().await.unwrap(), "test");
+
+        let dir = Path::parse("🇦🇺").unwrap();
+        let r = storage.list_with_delimiter(None).await.unwrap();
+        assert!(r.common_prefixes.contains(&dir));
+
+        let r = storage.list_with_delimiter(Some(&dir)).await.unwrap();
+        assert_eq!(r.objects.len(), 1);
+        assert_eq!(r.objects[0].location, path);
+
+        storage.delete(&path).await.unwrap();
+
         // Can also write non-percent encoded sequences
         let path = Path::parse("%Q.parquet").unwrap();
         storage.put(&path, Bytes::from(vec![0, 1])).await.unwrap();
diff --git a/object_store/src/local.rs b/object_store/src/local.rs
index 1a87dc33c7..e5c4e32046 100644
--- a/object_store/src/local.rs
+++ b/object_store/src/local.rs
@@ -144,6 +144,11 @@ pub(crate) enum Error {
         path: PathBuf,
         source: io::Error,
     },
+
+    #[snafu(display("Filenames containing trailing '/#\\d+/' are not 
supported: {}", path))]
+    InvalidPath {
+        path: String,
+    },
 }
 
 impl From<Error> for super::Error {
@@ -176,6 +181,30 @@ impl From<Error> for super::Error {
 /// [file URI]: https://en.wikipedia.org/wiki/File_URI_scheme
 /// [RFC 3986]: https://www.rfc-editor.org/rfc/rfc3986
 ///
+/// # Path Semantics
+///
+/// [`LocalFileSystem`] will expose the path semantics of the underlying 
filesystem, which may
+/// have additional restrictions beyond those enforced by [`Path`].
+///
+/// For example:
+///
+/// * Windows forbids certain filenames, e.g. `COM0`,
+/// * Windows forbids folders with trailing `.`
+/// * Windows forbids certain ASCII characters, e.g. `<` or `|`
+/// * OS X forbids filenames containing `:`
+/// * Leading `-` are discouraged on Unix systems where they may be 
interpreted as CLI flags
+/// * Filesystems may have restrictions on the maximum path or path segment 
length
+/// * Filesystem support for non-ASCII characters is inconsistent
+///
+/// Additionally some filesystems, such as NTFS, are case-insensitive, whilst 
others like
+/// FAT don't preserve case at all. Further some filesystems support 
non-unicode character
+/// sequences, such as unpaired UTF-16 surrogates, and [`LocalFileSystem`] 
will error on
+/// encountering such sequences.
+///
+/// Finally, filenames matching the regex `/.*#\d+/`, e.g. `foo.parquet#123`, 
are not supported
+/// by [`LocalFileSystem`] as they are used to provide atomic writes. Such 
files will be ignored
+/// for listing operations, and attempting to address such a file will error.
+///
 /// # Tokio Compatibility
 ///
 /// Tokio discourages performing blocking IO on a tokio worker thread, however,
@@ -196,6 +225,11 @@ impl From<Error> for super::Error {
 /// * Mutating a file through one or more symlinks will mutate the underlying 
file
 /// * Deleting a path that resolves to a symlink will only delete the symlink
 ///
+/// # Cross-Filesystem Copy
+///
+/// [`LocalFileSystem::copy`] is implemented using [`std::fs::hard_link`], and 
therefore
+/// does not support copying across filesystem boundaries.
+///
 #[derive(Debug)]
 pub struct LocalFileSystem {
     config: Arc<Config>,
@@ -246,8 +280,19 @@ impl LocalFileSystem {
 }
 
 impl Config {
-    /// Return an absolute filesystem path of the given location
+    /// Return an absolute filesystem path of the given file location
     fn path_to_filesystem(&self, location: &Path) -> Result<PathBuf> {
+        ensure!(
+            is_valid_file_path(location),
+            InvalidPathSnafu {
+                path: location.as_ref()
+            }
+        );
+        self.prefix_to_filesystem(location)
+    }
+
+    /// Return an absolute filesystem path of the given location
+    fn prefix_to_filesystem(&self, location: &Path) -> Result<PathBuf> {
         let mut url = self.root.clone();
         url.path_segments_mut()
             .expect("url path")
@@ -269,6 +314,19 @@ impl Config {
     }
 }
 
+fn is_valid_file_path(path: &Path) -> bool {
+    match path.filename() {
+        Some(p) => match p.split_once('#') {
+            Some((_, suffix)) if !suffix.is_empty() => {
+                // Valid if contains non-digits
+                !suffix.as_bytes().iter().all(|x| x.is_ascii_digit())
+            }
+            _ => true,
+        },
+        None => false,
+    }
+}
+
 #[async_trait]
 impl ObjectStore for LocalFileSystem {
     async fn put_opts(&self, location: &Path, bytes: Bytes, opts: PutOptions) 
-> Result<PutResult> {
@@ -406,7 +464,7 @@ impl ObjectStore for LocalFileSystem {
         let config = Arc::clone(&self.config);
 
         let root_path = match prefix {
-            Some(prefix) => match config.path_to_filesystem(prefix) {
+            Some(prefix) => match config.prefix_to_filesystem(prefix) {
                 Ok(path) => path,
                 Err(e) => return 
futures::future::ready(Err(e)).into_stream().boxed(),
             },
@@ -419,20 +477,21 @@ impl ObjectStore for LocalFileSystem {
             .follow_links(true);
 
         let s = walkdir.into_iter().flat_map(move |result_dir_entry| {
-            match convert_walkdir_result(result_dir_entry) {
+            let entry = match 
convert_walkdir_result(result_dir_entry).transpose()? {
+                Ok(entry) => entry,
+                Err(e) => return Some(Err(e)),
+            };
+
+            if !entry.path().is_file() {
+                return None;
+            }
+
+            match config.filesystem_to_path(entry.path()) {
+                Ok(path) => match is_valid_file_path(&path) {
+                    true => Some(convert_entry(entry, path)),
+                    false => None,
+                },
                 Err(e) => Some(Err(e)),
-                Ok(None) => None,
-                Ok(entry @ Some(_)) => entry
-                    .filter(|dir_entry| {
-                        dir_entry.file_type().is_file()
-                            // Ignore file names with # in them, since they 
might be in-progress uploads.
-                            // They would be rejected anyways by 
filesystem_to_path below.
-                            && 
!dir_entry.file_name().to_string_lossy().contains('#')
-                    })
-                    .map(|entry| {
-                        let location = 
config.filesystem_to_path(entry.path())?;
-                        convert_entry(entry, location)
-                    }),
             }
         });
 
@@ -473,7 +532,7 @@ impl ObjectStore for LocalFileSystem {
         let config = Arc::clone(&self.config);
 
         let prefix = prefix.cloned().unwrap_or_default();
-        let resolved_prefix = config.path_to_filesystem(&prefix)?;
+        let resolved_prefix = config.prefix_to_filesystem(&prefix)?;
 
         maybe_spawn_blocking(move || {
             let walkdir = WalkDir::new(&resolved_prefix)
@@ -486,15 +545,11 @@ impl ObjectStore for LocalFileSystem {
 
             for entry_res in walkdir.into_iter().map(convert_walkdir_result) {
                 if let Some(entry) = entry_res? {
-                    if entry.file_type().is_file()
-                        // Ignore file names with # in them, since they might 
be in-progress uploads.
-                        // They would be rejected anyways by 
filesystem_to_path below.
-                        && entry.file_name().to_string_lossy().contains('#')
-                    {
-                        continue;
-                    }
                     let is_directory = entry.file_type().is_dir();
                     let entry_location = 
config.filesystem_to_path(entry.path())?;
+                    if !is_directory && !is_valid_file_path(&entry_location) {
+                        continue;
+                    }
 
                     let mut parts = match entry_location.prefix_match(&prefix) 
{
                         Some(parts) => parts,
@@ -1325,26 +1380,18 @@ mod tests {
         assert!(result.common_prefixes.is_empty());
         assert_eq!(result.objects[0].location, object);
 
-        let illegal = root.join("💀");
-        std::fs::write(illegal, "foo").unwrap();
-
-        // Can list directory that doesn't contain illegal path
-        flatten_list_stream(&integration, Some(&directory))
-            .await
-            .unwrap();
+        let emoji = root.join("💀");
+        std::fs::write(emoji, "foo").unwrap();
 
-        // Cannot list illegal file
-        let err = flatten_list_stream(&integration, None)
-            .await
-            .unwrap_err()
-            .to_string();
+        // Can list illegal file
+        let paths = flatten_list_stream(&integration, None).await.unwrap();
 
-        assert!(
-            err.contains(
-                "Encountered illegal character sequence \"💀\" whilst parsing 
path segment \"💀\""
-            ),
-            "{}",
-            err
+        assert_eq!(
+            paths,
+            vec![
+                Path::parse("💀").unwrap(),
+                Path::parse("directory/child.txt").unwrap()
+            ]
         );
     }
 
@@ -1403,6 +1450,51 @@ mod tests {
         let path = Path::from_filesystem_path(".").unwrap();
         integration.list_with_delimiter(Some(&path)).await.unwrap();
     }
+
+    #[test]
+    fn test_valid_path() {
+        let cases = [
+            ("foo#123/test.txt", true),
+            ("foo#123/test#23.txt", true),
+            ("foo#123/test#34", false),
+            ("foo😁/test#34", false),
+            ("foo/test#😁34", true),
+        ];
+
+        for (case, expected) in cases {
+            let path = Path::parse(case).unwrap();
+            assert_eq!(is_valid_file_path(&path), expected);
+        }
+    }
+
+    #[tokio::test]
+    async fn test_intermediate_files() {
+        let root = TempDir::new().unwrap();
+        let integration = 
LocalFileSystem::new_with_prefix(root.path()).unwrap();
+
+        let a = Path::parse("foo#123/test.txt").unwrap();
+        integration.put(&a, "test".into()).await.unwrap();
+
+        let list = flatten_list_stream(&integration, None).await.unwrap();
+        assert_eq!(list, vec![a.clone()]);
+
+        std::fs::write(root.path().join("bar#123"), "test").unwrap();
+
+        // Should ignore file
+        let list = flatten_list_stream(&integration, None).await.unwrap();
+        assert_eq!(list, vec![a.clone()]);
+
+        let b = Path::parse("bar#123").unwrap();
+        let err = integration.get(&b).await.unwrap_err().to_string();
+        assert_eq!(err, "Generic LocalFileSystem error: Filenames containing 
trailing '/#\\d+/' are not supported: bar#123");
+
+        let c = Path::parse("foo#123.txt").unwrap();
+        integration.put(&c, "test".into()).await.unwrap();
+
+        let mut list = flatten_list_stream(&integration, None).await.unwrap();
+        list.sort_unstable();
+        assert_eq!(list, vec![c, a]);
+    }
 }
 
 #[cfg(not(target_arch = "wasm32"))]
diff --git a/object_store/src/path/mod.rs b/object_store/src/path/mod.rs
index e065c31d31..f914862bc5 100644
--- a/object_store/src/path/mod.rs
+++ b/object_store/src/path/mod.rs
@@ -65,10 +65,23 @@ pub enum Error {
 
 /// A parsed path representation that can be safely written to object storage
 ///
-/// # Path Safety
+/// A [`Path`] maintains the following invariants:
+///
+/// * Paths are delimited by `/`
+/// * Paths do not contain leading or trailing `/`
+/// * Paths do not contain relative path segments, i.e. `.` or `..`
+/// * Paths do not contain empty path segments
+/// * Paths do not contain any ASCII control characters
+///
+/// There are no enforced restrictions on path length, however, it should be 
noted that most
+/// object stores do not permit paths longer than 1024 bytes, and many 
filesystems do not
+/// support path segments longer than 255 bytes.
+///
+/// # Encode
 ///
 /// In theory object stores support any UTF-8 character sequence, however, 
certain character
-/// sequences cause compatibility problems with some applications and 
protocols. As such the
+/// sequences cause compatibility problems with some applications and 
protocols. Additionally
+/// some filesystems may impose character restrictions, see 
[`LocalFileSystem`]. As such the
 /// naming guidelines for [S3], [GCS] and [Azure Blob Storage] all recommend 
sticking to a
 /// limited character subset.
 ///
@@ -76,34 +89,16 @@ pub enum Error {
 /// [GCS]: https://cloud.google.com/storage/docs/naming-objects
 /// [Azure Blob Storage]: 
https://docs.microsoft.com/en-us/rest/api/storageservices/Naming-and-Referencing-Containers--Blobs--and-Metadata#blob-names
 ///
-/// This presents libraries with two options for consistent path handling:
-///
-/// 1. Allow constructing unsafe paths, allowing for both reading and writing 
of data to paths
-/// that may not be consistently understood or supported
-/// 2. Disallow constructing unsafe paths, ensuring data written can be 
consistently handled by
-/// all other systems, but preventing interaction with objects at unsafe paths
-///
-/// This library takes the second approach, in particular:
-///
-/// * Paths are delimited by `/`
-/// * Paths do not start with a `/`
-/// * Empty path segments are discarded (e.g. `//` is treated as though it 
were `/`)
-/// * Relative path segments, i.e. `.` and `..` are percent encoded
-/// * Unsafe characters are percent encoded, as described by [RFC 1738]
-/// * All paths are relative to the root of the object store
-///
-/// In order to provide these guarantees there are two ways to safely 
construct a [`Path`]
-///
-/// # Encode
-///
-/// A string containing potentially illegal path segments can be encoded to a 
[`Path`]
-/// using [`Path::from`] or [`Path::from_iter`].
+/// A string containing potentially problematic path segments can therefore be 
encoded to a [`Path`]
+/// using [`Path::from`] or [`Path::from_iter`]. This will percent encode any 
problematic
+/// segments according to [RFC 1738].
 ///
 /// ```
 /// # use object_store::path::Path;
 /// assert_eq!(Path::from("foo/bar").as_ref(), "foo/bar");
 /// assert_eq!(Path::from("foo//bar").as_ref(), "foo/bar");
 /// assert_eq!(Path::from("foo/../bar").as_ref(), "foo/%2E%2E/bar");
+/// assert_eq!(Path::from("/").as_ref(), "");
 /// assert_eq!(Path::from_iter(["foo", "foo/bar"]).as_ref(), "foo/foo%2Fbar");
 /// ```
 ///
@@ -116,20 +111,20 @@ pub enum Error {
 ///
 /// # Parse
 ///
-/// Alternatively a [`Path`] can be created from an existing string, returning 
an
-/// error if it is invalid. Unlike the encoding methods, this will permit
-/// valid percent encoded sequences.
+/// Alternatively a [`Path`] can be parsed from an existing string, returning 
an
+/// error if it is invalid. Unlike the encoding methods above, this will permit
+/// arbitrary unicode, including percent encoded sequences.
 ///
 /// ```
 /// # use object_store::path::Path;
-///
 /// assert_eq!(Path::parse("/foo/foo%2Fbar").unwrap().as_ref(), 
"foo/foo%2Fbar");
-/// Path::parse("..").unwrap_err();
-/// Path::parse("/foo//").unwrap_err();
-/// Path::parse("😀").unwrap_err();
+/// Path::parse("..").unwrap_err(); // Relative path segments are disallowed
+/// Path::parse("/foo//").unwrap_err(); // Empty path segments are disallowed
+/// Path::parse("\x00").unwrap_err(); // ASCII control characters are 
disallowed
 /// ```
 ///
 /// [RFC 1738]: https://www.ietf.org/rfc/rfc1738.txt
+/// [`LocalFileSystem`]: crate::local::LocalFileSystem
 #[derive(Debug, Clone, Default, PartialEq, Eq, Hash, Ord, PartialOrd)]
 pub struct Path {
     /// The raw path with no leading or trailing delimiters
@@ -236,7 +231,7 @@ impl Path {
     pub fn filename(&self) -> Option<&str> {
         match self.raw.is_empty() {
             true => None,
-            false => self.raw.split(DELIMITER).last(),
+            false => self.raw.rsplit(DELIMITER).next(),
         }
     }
 
diff --git a/object_store/src/path/parts.rs b/object_store/src/path/parts.rs
index 9da4815712..df7097cbe9 100644
--- a/object_store/src/path/parts.rs
+++ b/object_store/src/path/parts.rs
@@ -37,8 +37,10 @@ pub struct InvalidPart {
 /// The PathPart type exists to validate the directory/file names that form 
part
 /// of a path.
 ///
-/// A PathPart instance is guaranteed to to contain no illegal characters 
(e.g. `/`)
-/// as it can only be constructed by going through the `from` impl.
+/// A [`PathPart`] is guaranteed to:
+///
+/// * Contain no ASCII control characters or `/`
+/// * Not be a relative path segment, i.e. `.` or `..`
 #[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Default, Hash)]
 pub struct PathPart<'a> {
     pub(super) raw: Cow<'a, str>,
@@ -54,19 +56,12 @@ impl<'a> PathPart<'a> {
             });
         }
 
-        for (idx, b) in segment.as_bytes().iter().cloned().enumerate() {
-            // A percent character is always valid, even if not
-            // followed by a valid 2-digit hex code
-            // https://url.spec.whatwg.org/#percent-encoded-bytes
-            if b == b'%' {
-                continue;
-            }
-
-            if !b.is_ascii() || should_percent_encode(b) {
+        for c in segment.chars() {
+            if c.is_ascii_control() || c == '/' {
                 return Err(InvalidPart {
                     segment: segment.to_string(),
                     // This is correct as only single byte characters up to 
this point
-                    illegal: segment.chars().nth(idx).unwrap().to_string(),
+                    illegal: c.to_string(),
                 });
             }
         }
@@ -77,10 +72,6 @@ impl<'a> PathPart<'a> {
     }
 }
 
-fn should_percent_encode(c: u8) -> bool {
-    percent_encode(&[c], INVALID).next().unwrap().len() != 1
-}
-
 /// Characters we want to encode.
 const INVALID: &AsciiSet = &CONTROLS
     // The delimiter we are reserving for internal hierarchy

(arrow-rs) branch master updated: Relax path safety (#5019) (#5020)

Reply via email to