This is an automated email from the ASF dual-hosted git repository.

westonpace pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 7ca7724139 GH-34386: [C++] Add a PathFromUriOrPath method (#34420)
7ca7724139 is described below

commit 7ca7724139d3b04161369ffce04cf53e74eec54c
Author: Weston Pace <[email protected]>
AuthorDate: Thu May 4 15:22:44 2023 -0700

    GH-34386: [C++] Add a PathFromUriOrPath method (#34420)
    
    ### Rationale for this change
    
    We have some URI parsing indirectly exposed through FilesystemFromUri.  
However, this isn't very useful if the user has multiple URIs or if they 
already have a filesystem.  This method allows that same URI handling to be 
used even if the user already has a filesystem.
    
    ### What changes are included in this PR?
    
    Adds a new arrow::fs::PathFromUriOrPath method
    
    ### Are these changes tested?
    
    Yes, via unit tests
    
    ### Are there any user-facing changes?
    
    There is a new API method but no changes to any existing APIs
    * Closes: #34386
    
    Authored-by: Weston Pace <[email protected]>
    Signed-off-by: Weston Pace <[email protected]>
---
 cpp/src/arrow/filesystem/filesystem.cc    |  33 ++++-----
 cpp/src/arrow/filesystem/filesystem.h     |  22 ++++++
 cpp/src/arrow/filesystem/gcsfs.cc         |   6 ++
 cpp/src/arrow/filesystem/gcsfs.h          |   1 +
 cpp/src/arrow/filesystem/gcsfs_test.cc    |  17 ++++-
 cpp/src/arrow/filesystem/hdfs.cc          |   6 ++
 cpp/src/arrow/filesystem/hdfs.h           |   1 +
 cpp/src/arrow/filesystem/hdfs_test.cc     |   2 +
 cpp/src/arrow/filesystem/localfs.cc       |  57 ++++++----------
 cpp/src/arrow/filesystem/localfs.h        |   9 +--
 cpp/src/arrow/filesystem/localfs_test.cc  |  26 ++++++-
 cpp/src/arrow/filesystem/mockfs.cc        |   8 +++
 cpp/src/arrow/filesystem/mockfs.h         |   1 +
 cpp/src/arrow/filesystem/path_util.cc     |  12 +++-
 cpp/src/arrow/filesystem/path_util.h      |   5 +-
 cpp/src/arrow/filesystem/s3fs.cc          |   5 ++
 cpp/src/arrow/filesystem/s3fs.h           |   1 +
 cpp/src/arrow/filesystem/s3fs_test.cc     |  13 ++++
 cpp/src/arrow/filesystem/util_internal.cc | 110 ++++++++++++++++++++++++++++++
 cpp/src/arrow/filesystem/util_internal.h  |  36 ++++++++++
 20 files changed, 303 insertions(+), 68 deletions(-)

diff --git a/cpp/src/arrow/filesystem/filesystem.cc 
b/cpp/src/arrow/filesystem/filesystem.cc
index 73b94d3828..6296dd8d85 100644
--- a/cpp/src/arrow/filesystem/filesystem.cc
+++ b/cpp/src/arrow/filesystem/filesystem.cc
@@ -60,6 +60,7 @@ using internal::ConcatAbstractPath;
 using internal::EnsureTrailingSlash;
 using internal::GetAbstractPathParent;
 using internal::kSep;
+using internal::ParseFileSystemUri;
 using internal::RemoveLeadingSlash;
 using internal::RemoveTrailingSlash;
 using internal::ToSlashes;
@@ -254,6 +255,10 @@ Result<std::shared_ptr<io::OutputStream>> 
FileSystem::OpenAppendStream(
   return OpenAppendStream(path, std::shared_ptr<const KeyValueMetadata>{});
 }
 
+Result<std::string> FileSystem::PathFromUri(const std::string& uri_string) 
const {
+  return Status::NotImplemented("PathFromUri is not yet supported on this 
filesystem");
+}
+
 //////////////////////////////////////////////////////////////////////////
 // SubTreeFileSystem implementation
 
@@ -484,6 +489,10 @@ Result<std::shared_ptr<io::OutputStream>> 
SubTreeFileSystem::OpenAppendStream(
   return base_fs_->OpenAppendStream(real_path, metadata);
 }
 
+Result<std::string> SubTreeFileSystem::PathFromUri(const std::string& 
uri_string) const {
+  return base_fs_->PathFromUri(uri_string);
+}
+
 //////////////////////////////////////////////////////////////////////////
 // SlowFileSystem implementation
 
@@ -505,6 +514,10 @@ SlowFileSystem::SlowFileSystem(std::shared_ptr<FileSystem> 
base_fs,
 
 bool SlowFileSystem::Equals(const FileSystem& other) const { return this == 
&other; }
 
+Result<std::string> SlowFileSystem::PathFromUri(const std::string& uri_string) 
const {
+  return base_fs_->PathFromUri(uri_string);
+}
+
 Result<FileInfo> SlowFileSystem::GetFileInfo(const std::string& path) {
   latencies_->Sleep();
   return base_fs_->GetFileInfo(path);
@@ -662,23 +675,6 @@ Status CopyFiles(const std::shared_ptr<FileSystem>& 
source_fs,
 
 namespace {
 
-Result<Uri> ParseFileSystemUri(const std::string& uri_string) {
-  Uri uri;
-  auto status = uri.Parse(uri_string);
-  if (!status.ok()) {
-#ifdef _WIN32
-    // Could be a "file:..." URI with backslashes instead of regular slashes.
-    RETURN_NOT_OK(uri.Parse(ToSlashes(uri_string)));
-    if (uri.scheme() != "file") {
-      return status;
-    }
-#else
-    return status;
-#endif
-  }
-  return std::move(uri);
-}
-
 Result<std::shared_ptr<FileSystem>> FileSystemFromUriReal(const Uri& uri,
                                                           const std::string& 
uri_string,
                                                           const io::IOContext& 
io_context,
@@ -763,7 +759,8 @@ Result<std::shared_ptr<FileSystem>> FileSystemFromUriOrPath(
   if (internal::DetectAbsolutePath(uri_string)) {
     // Normalize path separators
     if (out_path != nullptr) {
-      *out_path = ToSlashes(uri_string);
+      *out_path =
+          std::string(RemoveTrailingSlash(ToSlashes(uri_string), 
/*preserve_root=*/true));
     }
     return std::make_shared<LocalFileSystem>();
   }
diff --git a/cpp/src/arrow/filesystem/filesystem.h 
b/cpp/src/arrow/filesystem/filesystem.h
index 6dc18d7de8..cfadaeb0ce 100644
--- a/cpp/src/arrow/filesystem/filesystem.h
+++ b/cpp/src/arrow/filesystem/filesystem.h
@@ -171,6 +171,26 @@ class ARROW_EXPORT FileSystem : public 
std::enable_shared_from_this<FileSystem>
   /// may allow normalizing irregular path forms (such as Windows local paths).
   virtual Result<std::string> NormalizePath(std::string path);
 
+  /// \brief Ensure a URI (or path) is compatible with the given filesystem 
and return the
+  ///        path
+  ///
+  /// \param uri_string A URI representing a resource in the given filesystem.
+  ///
+  /// This method will check to ensure the given filesystem is compatible with 
the
+  /// URI. This can be useful when the user provides both a URI and a 
filesystem or
+  /// when a user provides multiple URIs that should be compatible with the 
same
+  /// filesystem.
+  ///
+  /// uri_string can be an absolute path instead of a URI.  In that case it 
will ensure
+  /// the filesystem (if supplied) is the local filesystem (or some custom 
filesystem that
+  /// is capable of reading local paths) and will normalize the path's file 
separators.
+  ///
+  /// Note, this method only checks to ensure the URI scheme is valid.  It 
will not detect
+  /// inconsistencies like a mismatching region or endpoint override.
+  ///
+  /// \return The path inside the filesystem that is indicated by the URI.
+  virtual Result<std::string> PathFromUri(const std::string& uri_string) const;
+
   virtual bool Equals(const FileSystem& other) const = 0;
 
   virtual bool Equals(const std::shared_ptr<FileSystem>& other) const {
@@ -336,6 +356,7 @@ class ARROW_EXPORT SubTreeFileSystem : public FileSystem {
   std::shared_ptr<FileSystem> base_fs() const { return base_fs_; }
 
   Result<std::string> NormalizePath(std::string path) override;
+  Result<std::string> PathFromUri(const std::string& uri_string) const 
override;
 
   bool Equals(const FileSystem& other) const override;
 
@@ -410,6 +431,7 @@ class ARROW_EXPORT SlowFileSystem : public FileSystem {
 
   std::string type_name() const override { return "slow"; }
   bool Equals(const FileSystem& other) const override;
+  Result<std::string> PathFromUri(const std::string& uri_string) const 
override;
 
   using FileSystem::GetFileInfo;
   Result<FileInfo> GetFileInfo(const std::string& path) override;
diff --git a/cpp/src/arrow/filesystem/gcsfs.cc 
b/cpp/src/arrow/filesystem/gcsfs.cc
index f063e31b5c..6fc75589b0 100644
--- a/cpp/src/arrow/filesystem/gcsfs.cc
+++ b/cpp/src/arrow/filesystem/gcsfs.cc
@@ -873,6 +873,12 @@ bool GcsFileSystem::Equals(const FileSystem& other) const {
   return impl_->options().Equals(fs.impl_->options());
 }
 
+Result<std::string> GcsFileSystem::PathFromUri(const std::string& uri_string) 
const {
+  return internal::PathFromUriHelper(uri_string, {"gs", "gcs"},
+                                     /*accept_local_paths=*/false,
+                                     
internal::AuthorityHandlingBehavior::kPrepend);
+}
+
 Result<FileInfo> GcsFileSystem::GetFileInfo(const std::string& path) {
   ARROW_ASSIGN_OR_RAISE(auto p, GcsPath::FromString(path));
   return impl_->GetFileInfo(p);
diff --git a/cpp/src/arrow/filesystem/gcsfs.h b/cpp/src/arrow/filesystem/gcsfs.h
index c3d03b5cb2..d4b919ec81 100644
--- a/cpp/src/arrow/filesystem/gcsfs.h
+++ b/cpp/src/arrow/filesystem/gcsfs.h
@@ -178,6 +178,7 @@ class ARROW_EXPORT GcsFileSystem : public FileSystem {
   const GcsOptions& options() const;
 
   bool Equals(const FileSystem& other) const override;
+  Result<std::string> PathFromUri(const std::string& uri_string) const 
override;
 
   Result<FileInfo> GetFileInfo(const std::string& path) override;
   Result<FileInfoVector> GetFileInfo(const FileSelector& select) override;
diff --git a/cpp/src/arrow/filesystem/gcsfs_test.cc 
b/cpp/src/arrow/filesystem/gcsfs_test.cc
index 15d596dc0b..9d5136b47c 100644
--- a/cpp/src/arrow/filesystem/gcsfs_test.cc
+++ b/cpp/src/arrow/filesystem/gcsfs_test.cc
@@ -54,6 +54,7 @@
 #include "arrow/filesystem/path_util.h"
 #include "arrow/filesystem/test_util.h"
 #include "arrow/testing/gtest_util.h"
+#include "arrow/testing/matchers.h"
 #include "arrow/testing/util.h"
 #include "arrow/util/future.h"
 #include "arrow/util/key_value_metadata.h"
@@ -1383,12 +1384,24 @@ TEST_F(GcsIntegrationTest, OpenInputFileClosed) {
 
 TEST_F(GcsIntegrationTest, TestFileSystemFromUri) {
   // Smoke test for FileSystemFromUri
-  ASSERT_OK_AND_ASSIGN(auto fs, 
FileSystemFromUri(std::string("gs://anonymous@") +
-                                                  PreexistingBucketPath()));
+  std::string path;
+  ASSERT_OK_AND_ASSIGN(
+      auto fs,
+      FileSystemFromUri(std::string("gs://anonymous@") + 
PreexistingBucketPath(), &path));
   EXPECT_EQ(fs->type_name(), "gcs");
+  EXPECT_EQ(path, PreexistingBucketName());
+  ASSERT_OK_AND_ASSIGN(
+      path, fs->PathFromUri(std::string("gs://anonymous@") + 
PreexistingBucketPath()));
+  EXPECT_EQ(path, PreexistingBucketName());
   ASSERT_OK_AND_ASSIGN(auto fs2, 
FileSystemFromUri(std::string("gcs://anonymous@") +
                                                    PreexistingBucketPath()));
   EXPECT_EQ(fs2->type_name(), "gcs");
+  ASSERT_THAT(fs->PathFromUri("/foo/bar"),
+              Raises(StatusCode::Invalid, testing::HasSubstr("Expected a 
URI")));
+  ASSERT_THAT(
+      fs->PathFromUri("s3:///foo/bar"),
+      Raises(StatusCode::Invalid,
+             testing::HasSubstr("expected a URI with one of the schemes (gs, 
gcs)")));
 }
 
 }  // namespace
diff --git a/cpp/src/arrow/filesystem/hdfs.cc b/cpp/src/arrow/filesystem/hdfs.cc
index 8709ab4562..b227aae65d 100644
--- a/cpp/src/arrow/filesystem/hdfs.cc
+++ b/cpp/src/arrow/filesystem/hdfs.cc
@@ -473,6 +473,12 @@ bool HadoopFileSystem::Equals(const FileSystem& other) 
const {
   return options().Equals(hdfs.options());
 }
 
+Result<std::string> HadoopFileSystem::PathFromUri(const std::string& 
uri_string) const {
+  return internal::PathFromUriHelper(uri_string, {"hdfs", "viewfs"},
+                                     /*accept_local_paths=*/false,
+                                     
internal::AuthorityHandlingBehavior::kIgnore);
+}
+
 Result<std::vector<FileInfo>> HadoopFileSystem::GetFileInfo(const 
FileSelector& select) {
   return impl_->GetFileInfo(select);
 }
diff --git a/cpp/src/arrow/filesystem/hdfs.h b/cpp/src/arrow/filesystem/hdfs.h
index bed0ac4c61..798aac0ea9 100644
--- a/cpp/src/arrow/filesystem/hdfs.h
+++ b/cpp/src/arrow/filesystem/hdfs.h
@@ -66,6 +66,7 @@ class ARROW_EXPORT HadoopFileSystem : public FileSystem {
   std::string type_name() const override { return "hdfs"; }
   HdfsOptions options() const;
   bool Equals(const FileSystem& other) const override;
+  Result<std::string> PathFromUri(const std::string& uri_string) const 
override;
 
   /// \cond FALSE
   using FileSystem::GetFileInfo;
diff --git a/cpp/src/arrow/filesystem/hdfs_test.cc 
b/cpp/src/arrow/filesystem/hdfs_test.cc
index b1020231be..7ad9e6cd40 100644
--- a/cpp/src/arrow/filesystem/hdfs_test.cc
+++ b/cpp/src/arrow/filesystem/hdfs_test.cc
@@ -119,6 +119,8 @@ class TestHadoopFileSystem : public ::testing::Test, public 
HadoopFileSystemTest
     ARROW_LOG(INFO) << "!!! uri = " << ss.str();
     ASSERT_OK_AND_ASSIGN(uri_fs, FileSystemFromUri(ss.str(), &path));
     ASSERT_EQ(path, "/");
+    ASSERT_OK_AND_ASSIGN(path, uri_fs->PathFromUri(ss.str()));
+    ASSERT_EQ(path, "/");
 
     // Sanity check
     ASSERT_OK(uri_fs->CreateDir("AB"));
diff --git a/cpp/src/arrow/filesystem/localfs.cc 
b/cpp/src/arrow/filesystem/localfs.cc
index 03b4ad3bc7..e030014159 100644
--- a/cpp/src/arrow/filesystem/localfs.cc
+++ b/cpp/src/arrow/filesystem/localfs.cc
@@ -52,37 +52,6 @@ using ::arrow::internal::IOErrorFromWinError;
 using ::arrow::internal::NativePathString;
 using ::arrow::internal::PlatformFilename;
 
-namespace internal {
-
-#ifdef _WIN32
-static bool IsDriveLetter(char c) {
-  // Can't use locale-dependent functions from the C/C++ stdlib
-  return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
-}
-#endif
-
-bool DetectAbsolutePath(const std::string& s) {
-  // Is it a /-prefixed local path?
-  if (s.length() >= 1 && s[0] == '/') {
-    return true;
-  }
-#ifdef _WIN32
-  // Is it a \-prefixed local path?
-  if (s.length() >= 1 && s[0] == '\\') {
-    return true;
-  }
-  // Does it start with a drive letter in addition to being /- or \-prefixed,
-  // e.g. "C:\..."?
-  if (s.length() >= 3 && s[1] == ':' && (s[2] == '/' || s[2] == '\\') &&
-      IsDriveLetter(s[0])) {
-    return true;
-  }
-#endif
-  return false;
-}
-
-}  // namespace internal
-
 namespace {
 
 Status ValidatePath(std::string_view s) {
@@ -92,6 +61,12 @@ Status ValidatePath(std::string_view s) {
   return Status::OK();
 }
 
+Result<std::string> DoNormalizePath(std::string path) {
+  RETURN_NOT_OK(ValidatePath(path));
+  ARROW_ASSIGN_OR_RAISE(auto fn, PlatformFilename::FromString(path));
+  return fn.ToString();
+}
+
 #ifdef _WIN32
 
 std::string NativeToString(const NativePathString& ns) {
@@ -263,13 +238,15 @@ Result<LocalFileSystemOptions> 
LocalFileSystemOptions::FromUri(
 #ifdef _WIN32
     std::stringstream ss;
     ss << "//" << host << "/" << internal::RemoveLeadingSlash(uri.path());
-    *out_path = ss.str();
+    *out_path =
+        std::string(internal::RemoveTrailingSlash(ss.str(), 
/*preserve_root=*/true));
 #else
     return Status::Invalid("Unsupported hostname in non-Windows local URI: '",
                            uri.ToString(), "'");
 #endif
   } else {
-    *out_path = uri.path();
+    *out_path =
+        std::string(internal::RemoveTrailingSlash(uri.path(), 
/*preserve_root=*/true));
   }
 
   // TODO handle use_mmap option
@@ -286,9 +263,17 @@ LocalFileSystem::LocalFileSystem(const 
LocalFileSystemOptions& options,
 LocalFileSystem::~LocalFileSystem() {}
 
 Result<std::string> LocalFileSystem::NormalizePath(std::string path) {
-  RETURN_NOT_OK(ValidatePath(path));
-  ARROW_ASSIGN_OR_RAISE(auto fn, PlatformFilename::FromString(path));
-  return fn.ToString();
+  return DoNormalizePath(std::move(path));
+}
+
+Result<std::string> LocalFileSystem::PathFromUri(const std::string& 
uri_string) const {
+#ifdef _WIN32
+  auto authority_handling = internal::AuthorityHandlingBehavior::kWindows;
+#else
+  auto authority_handling = internal::AuthorityHandlingBehavior::kDisallow;
+#endif
+  return internal::PathFromUriHelper(uri_string, {"file"}, 
/*accept_local_paths=*/true,
+                                     authority_handling);
 }
 
 bool LocalFileSystem::Equals(const FileSystem& other) const {
diff --git a/cpp/src/arrow/filesystem/localfs.h 
b/cpp/src/arrow/filesystem/localfs.h
index 75eaf314e4..108530c2b2 100644
--- a/cpp/src/arrow/filesystem/localfs.h
+++ b/cpp/src/arrow/filesystem/localfs.h
@@ -82,6 +82,7 @@ class ARROW_EXPORT LocalFileSystem : public FileSystem {
   std::string type_name() const override { return "local"; }
 
   Result<std::string> NormalizePath(std::string path) override;
+  Result<std::string> PathFromUri(const std::string& uri_string) const 
override;
 
   bool Equals(const FileSystem& other) const override;
 
@@ -121,13 +122,5 @@ class ARROW_EXPORT LocalFileSystem : public FileSystem {
   LocalFileSystemOptions options_;
 };
 
-namespace internal {
-
-// Return whether the string is detected as a local absolute path.
-ARROW_EXPORT
-bool DetectAbsolutePath(const std::string& s);
-
-}  // namespace internal
-
 }  // namespace fs
 }  // namespace arrow
diff --git a/cpp/src/arrow/filesystem/localfs_test.cc 
b/cpp/src/arrow/filesystem/localfs_test.cc
index 33f75dd845..7ce2a56968 100644
--- a/cpp/src/arrow/filesystem/localfs_test.cc
+++ b/cpp/src/arrow/filesystem/localfs_test.cc
@@ -31,6 +31,7 @@
 #include "arrow/filesystem/test_util.h"
 #include "arrow/filesystem/util_internal.h"
 #include "arrow/testing/gtest_util.h"
+#include "arrow/testing/matchers.h"
 #include "arrow/util/io_util.h"
 #include "arrow/util/uri.h"
 
@@ -183,7 +184,7 @@ class TestLocalFS : public LocalFSTestMixin {
     }
     std::string path;
     ASSERT_OK_AND_ASSIGN(fs_, fs_from_uri(uri, &path));
-    ASSERT_EQ(path, local_path_);
+    ASSERT_EQ(path, std::string(RemoveTrailingSlash(local_path_)));
 
     // Test that the right location on disk is accessed
     CreateFile(fs_.get(), local_path_ + "abc", "some data");
@@ -201,6 +202,7 @@ class TestLocalFS : public LocalFSTestMixin {
   template <typename FileSystemFromUriFunc>
   void CheckLocalUri(const std::string& uri, const std::string& expected_path,
                      FileSystemFromUriFunc&& fs_from_uri) {
+    ARROW_SCOPED_TRACE("uri = ", uri);
     if (!path_formatter_.supports_uri()) {
       return;  // skip
     }
@@ -208,6 +210,8 @@ class TestLocalFS : public LocalFSTestMixin {
     ASSERT_OK_AND_ASSIGN(fs_, fs_from_uri(uri, &path));
     ASSERT_EQ(fs_->type_name(), "local");
     ASSERT_EQ(path, expected_path);
+    ASSERT_OK_AND_ASSIGN(path, fs_->PathFromUri(uri));
+    ASSERT_EQ(path, expected_path);
   }
 
   // Like TestFileSystemFromUri, but with an arbitrary non-existing path
@@ -220,6 +224,7 @@ class TestLocalFS : public LocalFSTestMixin {
   }
 
   void TestInvalidUri(const std::string& uri) {
+    ARROW_SCOPED_TRACE("uri = ", uri);
     if (!path_formatter_.supports_uri()) {
       return;  // skip
     }
@@ -227,10 +232,20 @@ class TestLocalFS : public LocalFSTestMixin {
   }
 
   void TestInvalidUriOrPath(const std::string& uri) {
+    ARROW_SCOPED_TRACE("uri = ", uri);
     if (!path_formatter_.supports_uri()) {
       return;  // skip
     }
     ASSERT_RAISES(Invalid, FileSystemFromUriOrPath(uri));
+    LocalFileSystem lfs;
+    ASSERT_RAISES(Invalid, lfs.PathFromUri(uri));
+  }
+
+  void TestInvalidPathFromUri(const std::string& uri, const std::string& 
expected_err) {
+    // Legitimate URI for the wrong filesystem
+    LocalFileSystem lfs;
+    ASSERT_THAT(lfs.PathFromUri(uri),
+                Raises(StatusCode::Invalid, testing::HasSubstr(expected_err)));
   }
 
   void CheckConcreteFile(const std::string& path, int64_t expected_size) {
@@ -320,6 +335,7 @@ TYPED_TEST(TestLocalFS, FileSystemFromUriFile) {
                      "//some server/some share/some path");
 #else
   this->TestInvalidUri("file://server/share/foo/bar");
+  this->TestInvalidUriOrPath("file://server/share/foo/bar");
 #endif
 
   // Relative paths
@@ -334,9 +350,10 @@ TYPED_TEST(TestLocalFS, FileSystemFromUriNoScheme) {
 
   // Variations
   this->TestLocalUriOrPath(this->path_formatter_("/foo/bar"), "/foo/bar");
+  this->TestLocalUriOrPath(this->path_formatter_("/"), "/");
 
 #ifdef _WIN32
-  this->TestLocalUriOrPath(this->path_formatter_("C:/foo/bar/"), 
"C:/foo/bar/");
+  this->TestLocalUriOrPath(this->path_formatter_("C:/foo/bar/"), "C:/foo/bar");
 #endif
 
   // Relative paths
@@ -360,6 +377,11 @@ TYPED_TEST(TestLocalFS, 
FileSystemFromUriNoSchemeBackslashes) {
   this->TestInvalidUriOrPath("foo\\bar");
 }
 
+TYPED_TEST(TestLocalFS, MismatchedFilesystemPathFromUri) {
+  this->TestInvalidPathFromUri("s3://foo",
+                               "expected a URI with one of the schemes 
(file)");
+}
+
 TYPED_TEST(TestLocalFS, DirectoryMTime) {
   TimePoint t1 = CurrentTimePoint();
   ASSERT_OK(this->fs_->CreateDir("AB/CD/EF"));
diff --git a/cpp/src/arrow/filesystem/mockfs.cc 
b/cpp/src/arrow/filesystem/mockfs.cc
index 3bc6f4464e..8eff8ecc2f 100644
--- a/cpp/src/arrow/filesystem/mockfs.cc
+++ b/cpp/src/arrow/filesystem/mockfs.cc
@@ -436,6 +436,14 @@ MockFileSystem::MockFileSystem(TimePoint current_time, 
const io::IOContext& io_c
 
 bool MockFileSystem::Equals(const FileSystem& other) const { return this == 
&other; }
 
+Result<std::string> MockFileSystem::PathFromUri(const std::string& uri_string) 
const {
+  ARROW_ASSIGN_OR_RAISE(
+      std::string parsed_path,
+      internal::PathFromUriHelper(uri_string, {"mock"}, 
/*accept_local_paths=*/true,
+                                  
internal::AuthorityHandlingBehavior::kDisallow));
+  return std::string(internal::RemoveLeadingSlash(parsed_path));
+}
+
 Status MockFileSystem::CreateDir(const std::string& path, bool recursive) {
   RETURN_NOT_OK(ValidatePath(path));
   auto parts = SplitAbstractPath(path);
diff --git a/cpp/src/arrow/filesystem/mockfs.h 
b/cpp/src/arrow/filesystem/mockfs.h
index e12408f52c..32d06e5910 100644
--- a/cpp/src/arrow/filesystem/mockfs.h
+++ b/cpp/src/arrow/filesystem/mockfs.h
@@ -66,6 +66,7 @@ class ARROW_EXPORT MockFileSystem : public FileSystem {
   std::string type_name() const override { return "mock"; }
 
   bool Equals(const FileSystem& other) const override;
+  Result<std::string> PathFromUri(const std::string& uri_string) const 
override;
 
   // XXX It's not very practical to have to explicitly declare inheritance
   // of default overrides.
diff --git a/cpp/src/arrow/filesystem/path_util.cc 
b/cpp/src/arrow/filesystem/path_util.cc
index ba4892a0ac..e25e544f03 100644
--- a/cpp/src/arrow/filesystem/path_util.cc
+++ b/cpp/src/arrow/filesystem/path_util.cc
@@ -129,7 +129,17 @@ std::string EnsureLeadingSlash(std::string_view v) {
     return std::string(v);
   }
 }
-std::string_view RemoveTrailingSlash(std::string_view key) {
+std::string_view RemoveTrailingSlash(std::string_view key, bool preserve_root) 
{
+  if (preserve_root && key.size() == 1) {
+    // If the user gives us "/" then don't return ""
+    return key;
+  }
+#ifdef _WIN32
+  if (preserve_root && key.size() == 3 && key[1] == ':' && key[0] != '/') {
+    // If the user gives us C:/ then don't return C:
+    return key;
+  }
+#endif
   while (!key.empty() && key.back() == kSep) {
     key.remove_suffix(1);
   }
diff --git a/cpp/src/arrow/filesystem/path_util.h 
b/cpp/src/arrow/filesystem/path_util.h
index 059827fb0a..b821e79338 100644
--- a/cpp/src/arrow/filesystem/path_util.h
+++ b/cpp/src/arrow/filesystem/path_util.h
@@ -69,8 +69,11 @@ std::string_view RemoveLeadingSlash(std::string_view s);
 ARROW_EXPORT
 std::string EnsureTrailingSlash(std::string_view s);
 
+/// \brief remove the forward slash (if any) from the given path
+/// \param s the input path
+/// \param preserve_root if true, allow a path of just "/" to remain unchanged
 ARROW_EXPORT
-std::string_view RemoveTrailingSlash(std::string_view s);
+std::string_view RemoveTrailingSlash(std::string_view s, bool preserve_root = 
false);
 
 ARROW_EXPORT
 Status AssertNoTrailingSlash(std::string_view s);
diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc
index a22d9c10be..dc033b9958 100644
--- a/cpp/src/arrow/filesystem/s3fs.cc
+++ b/cpp/src/arrow/filesystem/s3fs.cc
@@ -2235,6 +2235,11 @@ bool S3FileSystem::Equals(const FileSystem& other) const 
{
   return options().Equals(s3fs.options());
 }
 
+Result<std::string> S3FileSystem::PathFromUri(const std::string& uri_string) 
const {
+  return internal::PathFromUriHelper(uri_string, {"s3"}, 
/*accept_local_paths=*/false,
+                                     
internal::AuthorityHandlingBehavior::kPrepend);
+}
+
 S3Options S3FileSystem::options() const { return impl_->options(); }
 
 std::string S3FileSystem::region() const { return impl_->region(); }
diff --git a/cpp/src/arrow/filesystem/s3fs.h b/cpp/src/arrow/filesystem/s3fs.h
index 2bccecafe8..0a7ca73ccb 100644
--- a/cpp/src/arrow/filesystem/s3fs.h
+++ b/cpp/src/arrow/filesystem/s3fs.h
@@ -247,6 +247,7 @@ class ARROW_EXPORT S3FileSystem : public FileSystem {
   std::string region() const;
 
   bool Equals(const FileSystem& other) const override;
+  Result<std::string> PathFromUri(const std::string& uri_string) const 
override;
 
   /// \cond FALSE
   using FileSystem::GetFileInfo;
diff --git a/cpp/src/arrow/filesystem/s3fs_test.cc 
b/cpp/src/arrow/filesystem/s3fs_test.cc
index 38df84bded..5b0287d997 100644
--- a/cpp/src/arrow/filesystem/s3fs_test.cc
+++ b/cpp/src/arrow/filesystem/s3fs_test.cc
@@ -58,6 +58,7 @@
 #include "arrow/status.h"
 #include "arrow/testing/future_util.h"
 #include "arrow/testing/gtest_util.h"
+#include "arrow/testing/matchers.h"
 #include "arrow/testing/util.h"
 #include "arrow/util/async_generator.h"
 #include "arrow/util/checked_cast.h"
@@ -1143,6 +1144,18 @@ TEST_F(TestS3FS, FileSystemFromUri) {
   ASSERT_OK_AND_ASSIGN(auto fs, FileSystemFromUri(ss.str(), &path));
   ASSERT_EQ(path, "bucket/somedir/subdir/subfile");
 
+  ASSERT_OK_AND_ASSIGN(path, fs->PathFromUri(ss.str()));
+  ASSERT_EQ(path, "bucket/somedir/subdir/subfile");
+
+  // Incorrect scheme
+  ASSERT_THAT(fs->PathFromUri("file:///@bucket/somedir/subdir/subfile"),
+              Raises(StatusCode::Invalid,
+                     testing::HasSubstr("expected a URI with one of the 
schemes (s3)")));
+
+  // Not a URI
+  ASSERT_THAT(fs->PathFromUri("/@bucket/somedir/subdir/subfile"),
+              Raises(StatusCode::Invalid, testing::HasSubstr("Expected a 
URI")));
+
   // Check the filesystem has the right connection parameters
   AssertFileInfo(fs.get(), path, FileType::File, 8);
 }
diff --git a/cpp/src/arrow/filesystem/util_internal.cc 
b/cpp/src/arrow/filesystem/util_internal.cc
index 79e8503818..a2f34fb1bb 100644
--- a/cpp/src/arrow/filesystem/util_internal.cc
+++ b/cpp/src/arrow/filesystem/util_internal.cc
@@ -24,10 +24,12 @@
 #include "arrow/result.h"
 #include "arrow/status.h"
 #include "arrow/util/io_util.h"
+#include "arrow/util/string.h"
 
 namespace arrow {
 
 using internal::StatusDetailFromErrno;
+using internal::Uri;
 
 namespace fs {
 namespace internal {
@@ -76,6 +78,114 @@ Status InvalidDeleteDirContents(std::string_view path) {
       "If you wish to delete the root directory's contents, call 
DeleteRootDirContents.");
 }
 
+Result<Uri> ParseFileSystemUri(const std::string& uri_string) {
+  Uri uri;
+  auto status = uri.Parse(uri_string);
+  if (!status.ok()) {
+#ifdef _WIN32
+    // Could be a "file:..." URI with backslashes instead of regular slashes.
+    RETURN_NOT_OK(uri.Parse(ToSlashes(uri_string)));
+    if (uri.scheme() != "file") {
+      return status;
+    }
+#else
+    return status;
+#endif
+  }
+  return std::move(uri);
+}
+
+#ifdef _WIN32
+static bool IsDriveLetter(char c) {
+  // Can't use locale-dependent functions from the C/C++ stdlib
+  return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
+}
+#endif
+
+bool DetectAbsolutePath(const std::string& s) {
+  // Is it a /-prefixed local path?
+  if (s.length() >= 1 && s[0] == '/') {
+    return true;
+  }
+#ifdef _WIN32
+  // Is it a \-prefixed local path?
+  if (s.length() >= 1 && s[0] == '\\') {
+    return true;
+  }
+  // Does it start with a drive letter in addition to being /- or \-prefixed,
+  // e.g. "C:\..."?
+  if (s.length() >= 3 && s[1] == ':' && (s[2] == '/' || s[2] == '\\') &&
+      IsDriveLetter(s[0])) {
+    return true;
+  }
+#endif
+  return false;
+}
+
+Result<std::string> PathFromUriHelper(const std::string& uri_string,
+                                      std::vector<std::string> 
supported_schemes,
+                                      bool accept_local_paths,
+                                      AuthorityHandlingBehavior 
authority_handling) {
+  if (internal::DetectAbsolutePath(uri_string)) {
+    if (accept_local_paths) {
+      // Normalize the path and remove any trailing slash
+      return std::string(
+          internal::RemoveTrailingSlash(ToSlashes(uri_string), 
/*preserve_root=*/true));
+    }
+    return Status::Invalid(
+        "The filesystem is not capable of loading local paths.  Expected a URI 
but "
+        "received ",
+        uri_string);
+  }
+  Uri uri;
+  ARROW_RETURN_NOT_OK(uri.Parse(uri_string));
+  const auto scheme = uri.scheme();
+  if (std::find(supported_schemes.begin(), supported_schemes.end(), scheme) ==
+      supported_schemes.end()) {
+    std::string expected_schemes =
+        ::arrow::internal::JoinStrings(supported_schemes, ", ");
+    return Status::Invalid("The filesystem expected a URI with one of the 
schemes (",
+                           expected_schemes, ") but received ", uri_string);
+  }
+  std::string host = uri.host();
+  std::string path = uri.path();
+  if (host.empty()) {
+    // Just a path, may be absolute or relative, only allow relative paths if 
local
+    if (path[0] == '/') {
+      return std::string(internal::RemoveTrailingSlash(path));
+    }
+    if (accept_local_paths) {
+      return std::string(internal::RemoveTrailingSlash(path));
+    }
+    return Status::Invalid("The filesystem does not support relative paths.  
Received ",
+                           uri_string);
+  }
+  if (authority_handling == AuthorityHandlingBehavior::kDisallow) {
+    return Status::Invalid(
+        "The filesystem does not support the authority (host) component of a 
URI.  "
+        "Received ",
+        uri_string);
+  }
+  if (path[0] != '/') {
+    // This should not be possible
+    return Status::Invalid(
+        "The provided URI has a host component but a relative path which is 
not "
+        "supported. "
+        "Received ",
+        uri_string);
+  }
+  switch (authority_handling) {
+    case AuthorityHandlingBehavior::kPrepend:
+      return std::string(internal::RemoveTrailingSlash(host + path));
+    case AuthorityHandlingBehavior::kWindows:
+      return std::string(internal::RemoveTrailingSlash("//" + host + path));
+    case AuthorityHandlingBehavior::kIgnore:
+      return std::string(internal::RemoveTrailingSlash(path));
+    default:
+      return Status::Invalid("Unrecognized authority_handling value");
+  }
+}
+
 Result<FileInfoVector> GlobFiles(const std::shared_ptr<FileSystem>& filesystem,
                                  const std::string& glob) {
   // TODO: ARROW-17640
diff --git a/cpp/src/arrow/filesystem/util_internal.h 
b/cpp/src/arrow/filesystem/util_internal.h
index cc16dbba10..29a51512d0 100644
--- a/cpp/src/arrow/filesystem/util_internal.h
+++ b/cpp/src/arrow/filesystem/util_internal.h
@@ -24,9 +24,11 @@
 #include "arrow/filesystem/filesystem.h"
 #include "arrow/io/interfaces.h"
 #include "arrow/status.h"
+#include "arrow/util/uri.h"
 #include "arrow/util/visibility.h"
 
 namespace arrow {
+using internal::Uri;
 namespace fs {
 namespace internal {
 
@@ -50,6 +52,40 @@ Status NotAFile(std::string_view path);
 ARROW_EXPORT
 Status InvalidDeleteDirContents(std::string_view path);
 
+/// \brief Parse the string as a URI
+/// \param uri_string the string to parse
+///
+/// This is the same as Uri::Parse except it tolerates Windows
+/// file URIs that contain backslash instead of /
+Result<Uri> ParseFileSystemUri(const std::string& uri_string);
+
+/// \brief check if the string is a local absolute path
+ARROW_EXPORT
+bool DetectAbsolutePath(const std::string& s);
+
+/// \brief describes how to handle the authority (host) component of the URI
+enum class AuthorityHandlingBehavior {
+  // Return an invalid status if the authority is non-empty
+  kDisallow = 0,
+  // Prepend the authority to the path (e.g. authority/some/path)
+  kPrepend = 1,
+  // Convert to a Windows style network path (e.g. //authority/some/path)
+  kWindows = 2,
+  // Ignore the authority and just use the path
+  kIgnore = 3
+};
+
+/// \brief check to see if uri_string matches one of the supported schemes and 
return the
+/// path component
+/// \param uri_string a uri or local path to test and convert
+/// \param supported_schemes the set of URI schemes that should be accepted
+/// \param accept_local_paths if true, allow an absolute path
+/// \return the path portion of the URI
+Result<std::string> PathFromUriHelper(const std::string& uri_string,
+                                      std::vector<std::string> 
supported_schemes,
+                                      bool accept_local_paths,
+                                      AuthorityHandlingBehavior 
authority_handling);
+
 /// \brief Return files matching the glob pattern on the filesystem
 ///
 /// Globbing starts from the root of the filesystem.

Reply via email to