This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 0024962ff7 ARROW-17045: [C++] Reject trailing slashes on file path 
(#13577)
0024962ff7 is described below

commit 0024962ff761d1d5f3a63013e67886334f1e57ca
Author: Will Jones <[email protected]>
AuthorDate: Wed Jul 13 10:12:06 2022 -0700

    ARROW-17045: [C++] Reject trailing slashes on file path (#13577)
    
    BREAKING CHANGE: We had several different behaviors when passing in file 
paths with trailing slashes: LocalFileSystem would return IOError, S3 would 
trim off the trailing slash, and GCS would keep the trailing slash as part of 
the file name (later creating confusion as the file would be labelled a 
"directory" in list calls). This PR moves them all to the behavior of 
LocalFileSystem: return IOError.
    
    The R filesystem bindings relied on the behavior provided by S3, so they 
are now modified to trim the trailing slash before passing down to C++.
    
    Here is an example of the differences in behavior between S3 and GCS:
    
    ```python
    import pyarrow.fs
    from pyarrow.fs import FileSelector
    from datetime import timedelta
    
    gcs = pyarrow.fs.GcsFileSystem(
        endpoint_override="localhost:9001",
        scheme="http",
        anonymous=True,
        retry_time_limit=timedelta(seconds=1),
    )
    
    gcs.create_dir("py_test")
    
    # Writing to test.txt with and without slash produces a file and a 
directory!?
    with gcs.open_output_stream("py_test/test.txt") as out_stream:
        out_stream.write(b"Hello world!")
    with gcs.open_output_stream("py_test/test.txt/") as out_stream:
        out_stream.write(b"Hello world!")
    gcs.get_file_info(FileSelector("py_test"))
    # [<FileInfo for 'py_test/test.txt': type=FileType.File, size=12>, 
<FileInfo for 'py_test/test.txt': type=FileType.Directory>]
    
    s3 = pyarrow.fs.S3FileSystem(
        access_key="minioadmin",
        secret_key="minioadmin",
        scheme="http",
        endpoint_override="localhost:9000",
        allow_bucket_creation=True,
        allow_bucket_deletion=True,
    )
    
    s3.create_dir("py-test")
    
    # Writing to test.txt with and without slash writes to same file
    with s3.open_output_stream("py-test/test.txt") as out_stream:
        out_stream.write(b"Hello world!")
    with s3.open_output_stream("py-test/test.txt/") as out_stream:
        out_stream.write(b"Hello world!")
    s3.get_file_info(FileSelector("py-test"))
    # [<FileInfo for 'py-test/test.txt': type=FileType.File, size=12>]
    ```
    
    
    Authored-by: Will Jones <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/arrow/filesystem/gcsfs.cc         |  5 +++++
 cpp/src/arrow/filesystem/mockfs.cc        |  3 +++
 cpp/src/arrow/filesystem/path_util.cc     |  8 ++++++++
 cpp/src/arrow/filesystem/path_util.h      |  3 +++
 cpp/src/arrow/filesystem/s3fs.cc          |  3 +++
 cpp/src/arrow/filesystem/test_util.cc     | 33 +++++++++++++++++++++++++++++++
 cpp/src/arrow/filesystem/util_internal.cc |  8 ++++----
 cpp/src/arrow/filesystem/util_internal.h  |  9 +++++----
 r/R/io.R                                  | 12 +++++++----
 9 files changed, 72 insertions(+), 12 deletions(-)

diff --git a/cpp/src/arrow/filesystem/gcsfs.cc 
b/cpp/src/arrow/filesystem/gcsfs.cc
index 82d2439a60..da7b856be4 100644
--- a/cpp/src/arrow/filesystem/gcsfs.cc
+++ b/cpp/src/arrow/filesystem/gcsfs.cc
@@ -887,6 +887,7 @@ Status GcsFileSystem::CopyFile(const std::string& src, 
const std::string& dest)
 
 Result<std::shared_ptr<io::InputStream>> GcsFileSystem::OpenInputStream(
     const std::string& path) {
+  ARROW_RETURN_NOT_OK(internal::AssertNoTrailingSlash(path));
   ARROW_ASSIGN_OR_RAISE(auto p, GcsPath::FromString(path));
   return impl_->OpenInputStream(p, gcs::Generation(), gcs::ReadFromOffset());
 }
@@ -897,12 +898,14 @@ Result<std::shared_ptr<io::InputStream>> 
GcsFileSystem::OpenInputStream(
     return Status::IOError("Cannot open directory '", info.path(),
                            "' as an input stream");
   }
+  ARROW_RETURN_NOT_OK(internal::AssertNoTrailingSlash(info.path()));
   ARROW_ASSIGN_OR_RAISE(auto p, GcsPath::FromString(info.path()));
   return impl_->OpenInputStream(p, gcs::Generation(), gcs::ReadFromOffset());
 }
 
 Result<std::shared_ptr<io::RandomAccessFile>> GcsFileSystem::OpenInputFile(
     const std::string& path) {
+  ARROW_RETURN_NOT_OK(internal::AssertNoTrailingSlash(path));
   ARROW_ASSIGN_OR_RAISE(auto p, GcsPath::FromString(path));
   auto metadata = impl_->GetObjectMetadata(p);
   ARROW_GCS_RETURN_NOT_OK(metadata.status());
@@ -924,6 +927,7 @@ Result<std::shared_ptr<io::RandomAccessFile>> 
GcsFileSystem::OpenInputFile(
     return Status::IOError("Cannot open directory '", info.path(),
                            "' as an input stream");
   }
+  ARROW_RETURN_NOT_OK(internal::AssertNoTrailingSlash(info.path()));
   ARROW_ASSIGN_OR_RAISE(auto p, GcsPath::FromString(info.path()));
   auto metadata = impl_->GetObjectMetadata(p);
   ARROW_GCS_RETURN_NOT_OK(metadata.status());
@@ -941,6 +945,7 @@ Result<std::shared_ptr<io::RandomAccessFile>> 
GcsFileSystem::OpenInputFile(
 
 Result<std::shared_ptr<io::OutputStream>> GcsFileSystem::OpenOutputStream(
     const std::string& path, const std::shared_ptr<const KeyValueMetadata>& 
metadata) {
+  ARROW_RETURN_NOT_OK(internal::AssertNoTrailingSlash(path));
   ARROW_ASSIGN_OR_RAISE(auto p, GcsPath::FromString(path));
   return impl_->OpenOutputStream(p, metadata);
 }
diff --git a/cpp/src/arrow/filesystem/mockfs.cc 
b/cpp/src/arrow/filesystem/mockfs.cc
index 31dba27326..69e49b3204 100644
--- a/cpp/src/arrow/filesystem/mockfs.cc
+++ b/cpp/src/arrow/filesystem/mockfs.cc
@@ -382,6 +382,7 @@ class MockFileSystem::Impl {
   Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
       const std::string& path, bool append,
       const std::shared_ptr<const KeyValueMetadata>& metadata) {
+    ARROW_RETURN_NOT_OK(internal::AssertNoTrailingSlash(path));
     auto parts = SplitAbstractPath(path);
     RETURN_NOT_OK(ValidateAbstractPathParts(parts));
 
@@ -412,6 +413,7 @@ class MockFileSystem::Impl {
   }
 
   Result<std::shared_ptr<io::BufferReader>> OpenInputReader(const std::string& 
path) {
+    ARROW_RETURN_NOT_OK(internal::AssertNoTrailingSlash(path));
     auto parts = SplitAbstractPath(path);
     RETURN_NOT_OK(ValidateAbstractPathParts(parts));
 
@@ -727,6 +729,7 @@ Result<std::shared_ptr<io::OutputStream>> 
MockFileSystem::OpenOutputStream(
 
 Result<std::shared_ptr<io::OutputStream>> MockFileSystem::OpenAppendStream(
     const std::string& path, const std::shared_ptr<const KeyValueMetadata>& 
metadata) {
+  ARROW_RETURN_NOT_OK(internal::AssertNoTrailingSlash(path));
   RETURN_NOT_OK(ValidatePath(path));
   auto guard = impl_->lock_guard();
 
diff --git a/cpp/src/arrow/filesystem/path_util.cc 
b/cpp/src/arrow/filesystem/path_util.cc
index 04484dd5bd..1afc3b2a89 100644
--- a/cpp/src/arrow/filesystem/path_util.cc
+++ b/cpp/src/arrow/filesystem/path_util.cc
@@ -19,6 +19,7 @@
 #include <regex>
 
 #include "arrow/filesystem/path_util.h"
+#include "arrow/filesystem/util_internal.h"
 #include "arrow/result.h"
 #include "arrow/status.h"
 #include "arrow/util/logging.h"
@@ -139,6 +140,13 @@ util::string_view RemoveLeadingSlash(util::string_view 
key) {
   return key;
 }
 
+Status AssertNoTrailingSlash(util::string_view key) {
+  if (key.back() == '/') {
+    return NotAFile(key);
+  }
+  return Status::OK();
+}
+
 Result<std::string> MakeAbstractPathRelative(const std::string& base,
                                              const std::string& path) {
   if (base.empty() || base.front() != kSep) {
diff --git a/cpp/src/arrow/filesystem/path_util.h 
b/cpp/src/arrow/filesystem/path_util.h
index 75745ee44c..d4083d3b5c 100644
--- a/cpp/src/arrow/filesystem/path_util.h
+++ b/cpp/src/arrow/filesystem/path_util.h
@@ -72,6 +72,9 @@ std::string EnsureTrailingSlash(util::string_view s);
 ARROW_EXPORT
 util::string_view RemoveTrailingSlash(util::string_view s);
 
+ARROW_EXPORT
+Status AssertNoTrailingSlash(util::string_view s);
+
 ARROW_EXPORT
 bool IsAncestorOf(util::string_view ancestor, util::string_view descendant);
 
diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc
index 7a3df54350..dd3973ba77 100644
--- a/cpp/src/arrow/filesystem/s3fs.cc
+++ b/cpp/src/arrow/filesystem/s3fs.cc
@@ -2169,6 +2169,7 @@ class S3FileSystem::Impl : public 
std::enable_shared_from_this<S3FileSystem::Imp
 
   Result<std::shared_ptr<ObjectInputFile>> OpenInputFile(const std::string& s,
                                                          S3FileSystem* fs) {
+    ARROW_RETURN_NOT_OK(internal::AssertNoTrailingSlash(s));
     ARROW_ASSIGN_OR_RAISE(auto path, S3Path::FromString(s));
     RETURN_NOT_OK(ValidateFilePath(path));
 
@@ -2179,6 +2180,7 @@ class S3FileSystem::Impl : public 
std::enable_shared_from_this<S3FileSystem::Imp
 
   Result<std::shared_ptr<ObjectInputFile>> OpenInputFile(const FileInfo& info,
                                                          S3FileSystem* fs) {
+    ARROW_RETURN_NOT_OK(internal::AssertNoTrailingSlash(info.path()));
     if (info.type() == FileType::NotFound) {
       return ::arrow::fs::internal::PathNotFound(info.path());
     }
@@ -2537,6 +2539,7 @@ Result<std::shared_ptr<io::RandomAccessFile>> 
S3FileSystem::OpenInputFile(
 
 Result<std::shared_ptr<io::OutputStream>> S3FileSystem::OpenOutputStream(
     const std::string& s, const std::shared_ptr<const KeyValueMetadata>& 
metadata) {
+  ARROW_RETURN_NOT_OK(internal::AssertNoTrailingSlash(s));
   ARROW_ASSIGN_OR_RAISE(auto path, S3Path::FromString(s));
   RETURN_NOT_OK(ValidateFilePath(path));
 
diff --git a/cpp/src/arrow/filesystem/test_util.cc 
b/cpp/src/arrow/filesystem/test_util.cc
index 3e197e110b..d72386a64a 100644
--- a/cpp/src/arrow/filesystem/test_util.cc
+++ b/cpp/src/arrow/filesystem/test_util.cc
@@ -899,6 +899,9 @@ void 
GenericFileSystemTest::TestOpenOutputStream(FileSystem* fs) {
 
   ASSERT_RAISES(Invalid, stream->Write("x"));  // Stream is closed
 
+  // Trailing slash rejected
+  ASSERT_RAISES(IOError, fs->OpenOutputStream("CD/ghi/"));
+
   // Storing metadata along file
   auto metadata = KeyValueMetadata::Make({"Content-Type", "Content-Language"},
                                          {"x-arrow/filesystem-test", "fr_FR"});
@@ -931,6 +934,9 @@ void 
GenericFileSystemTest::TestOpenAppendStream(FileSystem* fs) {
 
   std::shared_ptr<io::OutputStream> stream;
 
+  // Trailing slash rejected
+  ASSERT_RAISES(IOError, fs->OpenAppendStream("abc/"));
+
   if (allow_append_to_new_file()) {
     ASSERT_OK_AND_ASSIGN(stream, fs->OpenAppendStream("abc"));
   } else {
@@ -974,6 +980,9 @@ void GenericFileSystemTest::TestOpenInputStream(FileSystem* 
fs) {
   ASSERT_OK(stream->Close());
   ASSERT_RAISES(Invalid, stream->Read(1));  // Stream is closed
 
+  // Trailing slash rejected
+  ASSERT_RAISES(IOError, fs->OpenInputStream("AB/abc/"));
+
   // File does not exist
   AssertRaisesWithErrno(ENOENT, fs->OpenInputStream("AB/def"));
   AssertRaisesWithErrno(ENOENT, fs->OpenInputStream("def"));
@@ -1009,6 +1018,15 @@ void 
GenericFileSystemTest::TestOpenInputStreamWithFileInfo(FileSystem* fs) {
   info.set_type(FileType::Unknown);
   AssertRaisesWithErrno(ENOENT, fs->OpenInputStream(info));
 
+  // Trailing slash rejected
+  auto maybe_info = fs->GetFileInfo("AB/abc/");
+  if (maybe_info.ok()) {
+    ASSERT_OK_AND_ASSIGN(info, maybe_info);
+    ASSERT_RAISES(IOError, fs->OpenInputStream(info));
+  } else {
+    ASSERT_RAISES(IOError, maybe_info);
+  }
+
   // Cannot open directory
   ASSERT_OK_AND_ASSIGN(info, fs->GetFileInfo("AB"));
   ASSERT_RAISES(IOError, fs->OpenInputStream(info));
@@ -1044,6 +1062,9 @@ void GenericFileSystemTest::TestOpenInputFile(FileSystem* 
fs) {
   ASSERT_OK(file->Close());
   ASSERT_RAISES(Invalid, file->ReadAt(1, 1));  // Stream is closed
 
+  // Trailing slash rejected
+  ASSERT_RAISES(IOError, fs->OpenInputFile("AB/abc/"));
+
   // File does not exist
   AssertRaisesWithErrno(ENOENT, fs->OpenInputFile("AB/def"));
   AssertRaisesWithErrno(ENOENT, fs->OpenInputFile("def"));
@@ -1067,6 +1088,9 @@ void 
GenericFileSystemTest::TestOpenInputFileAsync(FileSystem* fs) {
 
   // File does not exist
   AssertRaisesWithErrno(ENOENT, fs->OpenInputFileAsync("AB/def").result());
+
+  // Trailing slash rejected
+  ASSERT_RAISES(IOError, fs->OpenInputFileAsync("AB/abc/").result());
 }
 
 void GenericFileSystemTest::TestOpenInputFileWithFileInfo(FileSystem* fs) {
@@ -1096,6 +1120,15 @@ void 
GenericFileSystemTest::TestOpenInputFileWithFileInfo(FileSystem* fs) {
   info.set_type(FileType::Unknown);
   AssertRaisesWithErrno(ENOENT, fs->OpenInputFile(info));
 
+  // Trailing slash rejected
+  auto maybe_info = fs->GetFileInfo("AB/abc/");
+  if (maybe_info.ok()) {
+    ASSERT_OK_AND_ASSIGN(info, maybe_info);
+    ASSERT_RAISES(IOError, fs->OpenInputFile(info));
+  } else {
+    ASSERT_RAISES(IOError, maybe_info);
+  }
+
   // Cannot open directory
   ASSERT_OK_AND_ASSIGN(info, fs->GetFileInfo("AB"));
   ASSERT_RAISES(IOError, fs->OpenInputFile(info));
diff --git a/cpp/src/arrow/filesystem/util_internal.cc 
b/cpp/src/arrow/filesystem/util_internal.cc
index 4851fccd55..0d2ad70902 100644
--- a/cpp/src/arrow/filesystem/util_internal.cc
+++ b/cpp/src/arrow/filesystem/util_internal.cc
@@ -56,21 +56,21 @@ Status CopyStream(const std::shared_ptr<io::InputStream>& 
src,
   return Status::OK();
 }
 
-Status PathNotFound(const std::string& path) {
+Status PathNotFound(util::string_view path) {
   return Status::IOError("Path does not exist '", path, "'")
       .WithDetail(StatusDetailFromErrno(ENOENT));
 }
 
-Status NotADir(const std::string& path) {
+Status NotADir(util::string_view path) {
   return Status::IOError("Not a directory: '", path, "'")
       .WithDetail(StatusDetailFromErrno(ENOTDIR));
 }
 
-Status NotAFile(const std::string& path) {
+Status NotAFile(util::string_view path) {
   return Status::IOError("Not a regular file: '", path, "'");
 }
 
-Status InvalidDeleteDirContents(const std::string& path) {
+Status InvalidDeleteDirContents(util::string_view path) {
   return Status::Invalid(
       "DeleteDirContents called on invalid path '", path, "'. ",
       "If you wish to delete the root directory's contents, call 
DeleteRootDirContents.");
diff --git a/cpp/src/arrow/filesystem/util_internal.h 
b/cpp/src/arrow/filesystem/util_internal.h
index 80a29feb85..75a2d3a2ef 100644
--- a/cpp/src/arrow/filesystem/util_internal.h
+++ b/cpp/src/arrow/filesystem/util_internal.h
@@ -23,6 +23,7 @@
 #include "arrow/filesystem/filesystem.h"
 #include "arrow/io/interfaces.h"
 #include "arrow/status.h"
+#include "arrow/util/string_view.h"
 #include "arrow/util/visibility.h"
 
 namespace arrow {
@@ -38,16 +39,16 @@ Status CopyStream(const std::shared_ptr<io::InputStream>& 
src,
                   const io::IOContext& io_context);
 
 ARROW_EXPORT
-Status PathNotFound(const std::string& path);
+Status PathNotFound(util::string_view path);
 
 ARROW_EXPORT
-Status NotADir(const std::string& path);
+Status NotADir(util::string_view path);
 
 ARROW_EXPORT
-Status NotAFile(const std::string& path);
+Status NotAFile(util::string_view path);
 
 ARROW_EXPORT
-Status InvalidDeleteDirContents(const std::string& path);
+Status InvalidDeleteDirContents(util::string_view path);
 
 /// \brief Return files matching the glob pattern on the filesystem
 ///
diff --git a/r/R/io.R b/r/R/io.R
index 8e72187b43..82e3847df5 100644
--- a/r/R/io.R
+++ b/r/R/io.R
@@ -241,7 +241,9 @@ mmap_open <- function(path, mode = c("read", "write", 
"readwrite")) {
 make_readable_file <- function(file, mmap = TRUE, compression = NULL, 
filesystem = NULL) {
   if (inherits(file, "SubTreeFileSystem")) {
     filesystem <- file$base_fs
-    file <- file$base_path
+    # SubTreeFileSystem adds a slash to base_path, but filesystems will reject 
file names
+    # with trailing slashes, so we need to remove it here.
+    file <- sub("/$", "", file$base_path)
   }
   if (is.string(file)) {
     if (is_url(file)) {
@@ -303,7 +305,9 @@ make_output_stream <- function(x, filesystem = NULL, 
compression = NULL) {
 
   if (inherits(x, "SubTreeFileSystem")) {
     filesystem <- x$base_fs
-    x <- x$base_path
+    # SubTreeFileSystem adds a slash to base_path, but filesystems will reject 
file names
+    # with trailing slashes, so we need to remove it here.
+    x <- sub("/$", "", x$base_path)
   } else if (is_url(x)) {
     fs_and_path <- FileSystem$from_uri(x)
     filesystem <- fs_and_path$fs
@@ -317,7 +321,7 @@ make_output_stream <- function(x, filesystem = NULL, 
compression = NULL) {
 
   assert_that(is.string(x))
   if (is.null(filesystem) && is_compressed(compression)) {
-    CompressedOutputStream$create(x) ##compressed local
+    CompressedOutputStream$create(x) ## compressed local
   } else if (is.null(filesystem) && !is_compressed(compression)) {
     FileOutputStream$create(x) ## uncompressed local
   } else if (!is.null(filesystem) && is_compressed(compression)) {
@@ -333,7 +337,7 @@ detect_compression <- function(path) {
   }
 
   # Remove any trailing slashes, which FileSystem$from_uri may add
-  path <- gsub("/$", "", path)
+  path <- sub("/$", "", path)
 
   switch(tools::file_ext(path),
     bz2 = "bz2",

Reply via email to