This is an automated email from the ASF dual-hosted git repository.
westonpace pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 7ca7724139 GH-34386: [C++] Add a PathFromUriOrPath method (#34420)
7ca7724139 is described below
commit 7ca7724139d3b04161369ffce04cf53e74eec54c
Author: Weston Pace <[email protected]>
AuthorDate: Thu May 4 15:22:44 2023 -0700
GH-34386: [C++] Add a PathFromUriOrPath method (#34420)
### Rationale for this change
We have some URI parsing indirectly exposed through FilesystemFromUri.
However, this isn't very useful if the user has multiple URIs or if they
already have a filesystem. This method allows that same URI handling to be
used even if the user already has a filesystem.
### What changes are included in this PR?
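Adds a new virtual method, arrow::fs::FileSystem::PathFromUri (the "PathFromUriOrPath" method named in the title), which accepts a URI or, for filesystems that can read local paths, an absolute path.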
### Are these changes tested?
Yes, via unit tests
### Are there any user-facing changes?
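There is a new API method but no changes to any existing APIs. Note that, per the diff below, the local paths returned through `out_path` by FileSystemFromUriOrPath and LocalFileSystemOptions::FromUri now have any trailing slash removed (a bare root such as "/" is preserved).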
* Closes: #34386
Authored-by: Weston Pace <[email protected]>
Signed-off-by: Weston Pace <[email protected]>
---
cpp/src/arrow/filesystem/filesystem.cc | 33 ++++-----
cpp/src/arrow/filesystem/filesystem.h | 22 ++++++
cpp/src/arrow/filesystem/gcsfs.cc | 6 ++
cpp/src/arrow/filesystem/gcsfs.h | 1 +
cpp/src/arrow/filesystem/gcsfs_test.cc | 17 ++++-
cpp/src/arrow/filesystem/hdfs.cc | 6 ++
cpp/src/arrow/filesystem/hdfs.h | 1 +
cpp/src/arrow/filesystem/hdfs_test.cc | 2 +
cpp/src/arrow/filesystem/localfs.cc | 57 ++++++----------
cpp/src/arrow/filesystem/localfs.h | 9 +--
cpp/src/arrow/filesystem/localfs_test.cc | 26 ++++++-
cpp/src/arrow/filesystem/mockfs.cc | 8 +++
cpp/src/arrow/filesystem/mockfs.h | 1 +
cpp/src/arrow/filesystem/path_util.cc | 12 +++-
cpp/src/arrow/filesystem/path_util.h | 5 +-
cpp/src/arrow/filesystem/s3fs.cc | 5 ++
cpp/src/arrow/filesystem/s3fs.h | 1 +
cpp/src/arrow/filesystem/s3fs_test.cc | 13 ++++
cpp/src/arrow/filesystem/util_internal.cc | 110 ++++++++++++++++++++++++++++++
cpp/src/arrow/filesystem/util_internal.h | 36 ++++++++++
20 files changed, 303 insertions(+), 68 deletions(-)
diff --git a/cpp/src/arrow/filesystem/filesystem.cc
b/cpp/src/arrow/filesystem/filesystem.cc
index 73b94d3828..6296dd8d85 100644
--- a/cpp/src/arrow/filesystem/filesystem.cc
+++ b/cpp/src/arrow/filesystem/filesystem.cc
@@ -60,6 +60,7 @@ using internal::ConcatAbstractPath;
using internal::EnsureTrailingSlash;
using internal::GetAbstractPathParent;
using internal::kSep;
+using internal::ParseFileSystemUri;
using internal::RemoveLeadingSlash;
using internal::RemoveTrailingSlash;
using internal::ToSlashes;
@@ -254,6 +255,10 @@ Result<std::shared_ptr<io::OutputStream>>
FileSystem::OpenAppendStream(
return OpenAppendStream(path, std::shared_ptr<const KeyValueMetadata>{});
}
+Result<std::string> FileSystem::PathFromUri(const std::string& uri_string)
const {
+ return Status::NotImplemented("PathFromUri is not yet supported on this
filesystem");
+}
+
//////////////////////////////////////////////////////////////////////////
// SubTreeFileSystem implementation
@@ -484,6 +489,10 @@ Result<std::shared_ptr<io::OutputStream>>
SubTreeFileSystem::OpenAppendStream(
return base_fs_->OpenAppendStream(real_path, metadata);
}
+Result<std::string> SubTreeFileSystem::PathFromUri(const std::string&
uri_string) const {
+ return base_fs_->PathFromUri(uri_string);
+}
+
//////////////////////////////////////////////////////////////////////////
// SlowFileSystem implementation
@@ -505,6 +514,10 @@ SlowFileSystem::SlowFileSystem(std::shared_ptr<FileSystem>
base_fs,
bool SlowFileSystem::Equals(const FileSystem& other) const { return this ==
&other; }
+Result<std::string> SlowFileSystem::PathFromUri(const std::string& uri_string)
const {
+ return base_fs_->PathFromUri(uri_string);
+}
+
Result<FileInfo> SlowFileSystem::GetFileInfo(const std::string& path) {
latencies_->Sleep();
return base_fs_->GetFileInfo(path);
@@ -662,23 +675,6 @@ Status CopyFiles(const std::shared_ptr<FileSystem>&
source_fs,
namespace {
-Result<Uri> ParseFileSystemUri(const std::string& uri_string) {
- Uri uri;
- auto status = uri.Parse(uri_string);
- if (!status.ok()) {
-#ifdef _WIN32
- // Could be a "file:..." URI with backslashes instead of regular slashes.
- RETURN_NOT_OK(uri.Parse(ToSlashes(uri_string)));
- if (uri.scheme() != "file") {
- return status;
- }
-#else
- return status;
-#endif
- }
- return std::move(uri);
-}
-
Result<std::shared_ptr<FileSystem>> FileSystemFromUriReal(const Uri& uri,
const std::string&
uri_string,
const io::IOContext&
io_context,
@@ -763,7 +759,8 @@ Result<std::shared_ptr<FileSystem>> FileSystemFromUriOrPath(
if (internal::DetectAbsolutePath(uri_string)) {
// Normalize path separators
if (out_path != nullptr) {
- *out_path = ToSlashes(uri_string);
+ *out_path =
+ std::string(RemoveTrailingSlash(ToSlashes(uri_string),
/*preserve_root=*/true));
}
return std::make_shared<LocalFileSystem>();
}
diff --git a/cpp/src/arrow/filesystem/filesystem.h
b/cpp/src/arrow/filesystem/filesystem.h
index 6dc18d7de8..cfadaeb0ce 100644
--- a/cpp/src/arrow/filesystem/filesystem.h
+++ b/cpp/src/arrow/filesystem/filesystem.h
@@ -171,6 +171,26 @@ class ARROW_EXPORT FileSystem : public
std::enable_shared_from_this<FileSystem>
/// may allow normalizing irregular path forms (such as Windows local paths).
virtual Result<std::string> NormalizePath(std::string path);
+ /// \brief Ensure a URI (or path) is compatible with the given filesystem and return the
+ /// path
+ ///
+ /// \param uri_string A URI representing a resource in the given filesystem.
+ ///
+ /// This method will check to ensure the given filesystem is compatible with the
+ /// URI. This can be useful when the user provides both a URI and a filesystem or
+ /// when a user provides multiple URIs that should be compatible with the same
+ /// filesystem.
+ ///
+ /// uri_string can be an absolute path instead of a URI. In that case it will ensure
+ /// the filesystem (if supplied) is the local filesystem (or some custom filesystem that
+ /// is capable of reading local paths) and will normalize the path's file separators.
+ ///
+ /// Note, this method only checks to ensure the URI scheme is valid. It will not detect
+ /// inconsistencies like a mismatching region or endpoint override.
+ ///
+ /// \return The path inside the filesystem that is indicated by the URI.
+ virtual Result<std::string> PathFromUri(const std::string& uri_string) const;
+
virtual bool Equals(const FileSystem& other) const = 0;
virtual bool Equals(const std::shared_ptr<FileSystem>& other) const {
@@ -336,6 +356,7 @@ class ARROW_EXPORT SubTreeFileSystem : public FileSystem {
std::shared_ptr<FileSystem> base_fs() const { return base_fs_; }
Result<std::string> NormalizePath(std::string path) override;
+ Result<std::string> PathFromUri(const std::string& uri_string) const
override;
bool Equals(const FileSystem& other) const override;
@@ -410,6 +431,7 @@ class ARROW_EXPORT SlowFileSystem : public FileSystem {
std::string type_name() const override { return "slow"; }
bool Equals(const FileSystem& other) const override;
+ Result<std::string> PathFromUri(const std::string& uri_string) const
override;
using FileSystem::GetFileInfo;
Result<FileInfo> GetFileInfo(const std::string& path) override;
diff --git a/cpp/src/arrow/filesystem/gcsfs.cc
b/cpp/src/arrow/filesystem/gcsfs.cc
index f063e31b5c..6fc75589b0 100644
--- a/cpp/src/arrow/filesystem/gcsfs.cc
+++ b/cpp/src/arrow/filesystem/gcsfs.cc
@@ -873,6 +873,12 @@ bool GcsFileSystem::Equals(const FileSystem& other) const {
return impl_->options().Equals(fs.impl_->options());
}
+Result<std::string> GcsFileSystem::PathFromUri(const std::string& uri_string)
const {
+ return internal::PathFromUriHelper(uri_string, {"gs", "gcs"},
+ /*accept_local_paths=*/false,
+
internal::AuthorityHandlingBehavior::kPrepend);
+}
+
Result<FileInfo> GcsFileSystem::GetFileInfo(const std::string& path) {
ARROW_ASSIGN_OR_RAISE(auto p, GcsPath::FromString(path));
return impl_->GetFileInfo(p);
diff --git a/cpp/src/arrow/filesystem/gcsfs.h b/cpp/src/arrow/filesystem/gcsfs.h
index c3d03b5cb2..d4b919ec81 100644
--- a/cpp/src/arrow/filesystem/gcsfs.h
+++ b/cpp/src/arrow/filesystem/gcsfs.h
@@ -178,6 +178,7 @@ class ARROW_EXPORT GcsFileSystem : public FileSystem {
const GcsOptions& options() const;
bool Equals(const FileSystem& other) const override;
+ Result<std::string> PathFromUri(const std::string& uri_string) const
override;
Result<FileInfo> GetFileInfo(const std::string& path) override;
Result<FileInfoVector> GetFileInfo(const FileSelector& select) override;
diff --git a/cpp/src/arrow/filesystem/gcsfs_test.cc
b/cpp/src/arrow/filesystem/gcsfs_test.cc
index 15d596dc0b..9d5136b47c 100644
--- a/cpp/src/arrow/filesystem/gcsfs_test.cc
+++ b/cpp/src/arrow/filesystem/gcsfs_test.cc
@@ -54,6 +54,7 @@
#include "arrow/filesystem/path_util.h"
#include "arrow/filesystem/test_util.h"
#include "arrow/testing/gtest_util.h"
+#include "arrow/testing/matchers.h"
#include "arrow/testing/util.h"
#include "arrow/util/future.h"
#include "arrow/util/key_value_metadata.h"
@@ -1383,12 +1384,24 @@ TEST_F(GcsIntegrationTest, OpenInputFileClosed) {
TEST_F(GcsIntegrationTest, TestFileSystemFromUri) {
// Smoke test for FileSystemFromUri
- ASSERT_OK_AND_ASSIGN(auto fs,
FileSystemFromUri(std::string("gs://anonymous@") +
- PreexistingBucketPath()));
+ std::string path;
+ ASSERT_OK_AND_ASSIGN(
+ auto fs,
+ FileSystemFromUri(std::string("gs://anonymous@") +
PreexistingBucketPath(), &path));
EXPECT_EQ(fs->type_name(), "gcs");
+ EXPECT_EQ(path, PreexistingBucketName());
+ ASSERT_OK_AND_ASSIGN(
+ path, fs->PathFromUri(std::string("gs://anonymous@") +
PreexistingBucketPath()));
+ EXPECT_EQ(path, PreexistingBucketName());
ASSERT_OK_AND_ASSIGN(auto fs2,
FileSystemFromUri(std::string("gcs://anonymous@") +
PreexistingBucketPath()));
EXPECT_EQ(fs2->type_name(), "gcs");
+ ASSERT_THAT(fs->PathFromUri("/foo/bar"),
+ Raises(StatusCode::Invalid, testing::HasSubstr("Expected a
URI")));
+ ASSERT_THAT(
+ fs->PathFromUri("s3:///foo/bar"),
+ Raises(StatusCode::Invalid,
+ testing::HasSubstr("expected a URI with one of the schemes (gs,
gcs)")));
}
} // namespace
diff --git a/cpp/src/arrow/filesystem/hdfs.cc b/cpp/src/arrow/filesystem/hdfs.cc
index 8709ab4562..b227aae65d 100644
--- a/cpp/src/arrow/filesystem/hdfs.cc
+++ b/cpp/src/arrow/filesystem/hdfs.cc
@@ -473,6 +473,12 @@ bool HadoopFileSystem::Equals(const FileSystem& other)
const {
return options().Equals(hdfs.options());
}
+Result<std::string> HadoopFileSystem::PathFromUri(const std::string&
uri_string) const {
+ return internal::PathFromUriHelper(uri_string, {"hdfs", "viewfs"},
+ /*accept_local_paths=*/false,
+
internal::AuthorityHandlingBehavior::kIgnore);
+}
+
Result<std::vector<FileInfo>> HadoopFileSystem::GetFileInfo(const
FileSelector& select) {
return impl_->GetFileInfo(select);
}
diff --git a/cpp/src/arrow/filesystem/hdfs.h b/cpp/src/arrow/filesystem/hdfs.h
index bed0ac4c61..798aac0ea9 100644
--- a/cpp/src/arrow/filesystem/hdfs.h
+++ b/cpp/src/arrow/filesystem/hdfs.h
@@ -66,6 +66,7 @@ class ARROW_EXPORT HadoopFileSystem : public FileSystem {
std::string type_name() const override { return "hdfs"; }
HdfsOptions options() const;
bool Equals(const FileSystem& other) const override;
+ Result<std::string> PathFromUri(const std::string& uri_string) const
override;
/// \cond FALSE
using FileSystem::GetFileInfo;
diff --git a/cpp/src/arrow/filesystem/hdfs_test.cc
b/cpp/src/arrow/filesystem/hdfs_test.cc
index b1020231be..7ad9e6cd40 100644
--- a/cpp/src/arrow/filesystem/hdfs_test.cc
+++ b/cpp/src/arrow/filesystem/hdfs_test.cc
@@ -119,6 +119,8 @@ class TestHadoopFileSystem : public ::testing::Test, public
HadoopFileSystemTest
ARROW_LOG(INFO) << "!!! uri = " << ss.str();
ASSERT_OK_AND_ASSIGN(uri_fs, FileSystemFromUri(ss.str(), &path));
ASSERT_EQ(path, "/");
+ ASSERT_OK_AND_ASSIGN(path, uri_fs->PathFromUri(ss.str()));
+ ASSERT_EQ(path, "/");
// Sanity check
ASSERT_OK(uri_fs->CreateDir("AB"));
diff --git a/cpp/src/arrow/filesystem/localfs.cc
b/cpp/src/arrow/filesystem/localfs.cc
index 03b4ad3bc7..e030014159 100644
--- a/cpp/src/arrow/filesystem/localfs.cc
+++ b/cpp/src/arrow/filesystem/localfs.cc
@@ -52,37 +52,6 @@ using ::arrow::internal::IOErrorFromWinError;
using ::arrow::internal::NativePathString;
using ::arrow::internal::PlatformFilename;
-namespace internal {
-
-#ifdef _WIN32
-static bool IsDriveLetter(char c) {
- // Can't use locale-dependent functions from the C/C++ stdlib
- return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
-}
-#endif
-
-bool DetectAbsolutePath(const std::string& s) {
- // Is it a /-prefixed local path?
- if (s.length() >= 1 && s[0] == '/') {
- return true;
- }
-#ifdef _WIN32
- // Is it a \-prefixed local path?
- if (s.length() >= 1 && s[0] == '\\') {
- return true;
- }
- // Does it start with a drive letter in addition to being /- or \-prefixed,
- // e.g. "C:\..."?
- if (s.length() >= 3 && s[1] == ':' && (s[2] == '/' || s[2] == '\\') &&
- IsDriveLetter(s[0])) {
- return true;
- }
-#endif
- return false;
-}
-
-} // namespace internal
-
namespace {
Status ValidatePath(std::string_view s) {
@@ -92,6 +61,12 @@ Status ValidatePath(std::string_view s) {
return Status::OK();
}
+Result<std::string> DoNormalizePath(std::string path) {
+ RETURN_NOT_OK(ValidatePath(path));
+ ARROW_ASSIGN_OR_RAISE(auto fn, PlatformFilename::FromString(path));
+ return fn.ToString();
+}
+
#ifdef _WIN32
std::string NativeToString(const NativePathString& ns) {
@@ -263,13 +238,15 @@ Result<LocalFileSystemOptions>
LocalFileSystemOptions::FromUri(
#ifdef _WIN32
std::stringstream ss;
ss << "//" << host << "/" << internal::RemoveLeadingSlash(uri.path());
- *out_path = ss.str();
+ *out_path =
+ std::string(internal::RemoveTrailingSlash(ss.str(),
/*preserve_root=*/true));
#else
return Status::Invalid("Unsupported hostname in non-Windows local URI: '",
uri.ToString(), "'");
#endif
} else {
- *out_path = uri.path();
+ *out_path =
+ std::string(internal::RemoveTrailingSlash(uri.path(),
/*preserve_root=*/true));
}
// TODO handle use_mmap option
@@ -286,9 +263,17 @@ LocalFileSystem::LocalFileSystem(const
LocalFileSystemOptions& options,
LocalFileSystem::~LocalFileSystem() {}
Result<std::string> LocalFileSystem::NormalizePath(std::string path) {
- RETURN_NOT_OK(ValidatePath(path));
- ARROW_ASSIGN_OR_RAISE(auto fn, PlatformFilename::FromString(path));
- return fn.ToString();
+ return DoNormalizePath(std::move(path));
+}
+
+Result<std::string> LocalFileSystem::PathFromUri(const std::string&
uri_string) const {
+#ifdef _WIN32
+ auto authority_handling = internal::AuthorityHandlingBehavior::kWindows;
+#else
+ auto authority_handling = internal::AuthorityHandlingBehavior::kDisallow;
+#endif
+ return internal::PathFromUriHelper(uri_string, {"file"},
/*accept_local_paths=*/true,
+ authority_handling);
}
bool LocalFileSystem::Equals(const FileSystem& other) const {
diff --git a/cpp/src/arrow/filesystem/localfs.h
b/cpp/src/arrow/filesystem/localfs.h
index 75eaf314e4..108530c2b2 100644
--- a/cpp/src/arrow/filesystem/localfs.h
+++ b/cpp/src/arrow/filesystem/localfs.h
@@ -82,6 +82,7 @@ class ARROW_EXPORT LocalFileSystem : public FileSystem {
std::string type_name() const override { return "local"; }
Result<std::string> NormalizePath(std::string path) override;
+ Result<std::string> PathFromUri(const std::string& uri_string) const
override;
bool Equals(const FileSystem& other) const override;
@@ -121,13 +122,5 @@ class ARROW_EXPORT LocalFileSystem : public FileSystem {
LocalFileSystemOptions options_;
};
-namespace internal {
-
-// Return whether the string is detected as a local absolute path.
-ARROW_EXPORT
-bool DetectAbsolutePath(const std::string& s);
-
-} // namespace internal
-
} // namespace fs
} // namespace arrow
diff --git a/cpp/src/arrow/filesystem/localfs_test.cc
b/cpp/src/arrow/filesystem/localfs_test.cc
index 33f75dd845..7ce2a56968 100644
--- a/cpp/src/arrow/filesystem/localfs_test.cc
+++ b/cpp/src/arrow/filesystem/localfs_test.cc
@@ -31,6 +31,7 @@
#include "arrow/filesystem/test_util.h"
#include "arrow/filesystem/util_internal.h"
#include "arrow/testing/gtest_util.h"
+#include "arrow/testing/matchers.h"
#include "arrow/util/io_util.h"
#include "arrow/util/uri.h"
@@ -183,7 +184,7 @@ class TestLocalFS : public LocalFSTestMixin {
}
std::string path;
ASSERT_OK_AND_ASSIGN(fs_, fs_from_uri(uri, &path));
- ASSERT_EQ(path, local_path_);
+ ASSERT_EQ(path, std::string(RemoveTrailingSlash(local_path_)));
// Test that the right location on disk is accessed
CreateFile(fs_.get(), local_path_ + "abc", "some data");
@@ -201,6 +202,7 @@ class TestLocalFS : public LocalFSTestMixin {
template <typename FileSystemFromUriFunc>
void CheckLocalUri(const std::string& uri, const std::string& expected_path,
FileSystemFromUriFunc&& fs_from_uri) {
+ ARROW_SCOPED_TRACE("uri = ", uri);
if (!path_formatter_.supports_uri()) {
return; // skip
}
@@ -208,6 +210,8 @@ class TestLocalFS : public LocalFSTestMixin {
ASSERT_OK_AND_ASSIGN(fs_, fs_from_uri(uri, &path));
ASSERT_EQ(fs_->type_name(), "local");
ASSERT_EQ(path, expected_path);
+ ASSERT_OK_AND_ASSIGN(path, fs_->PathFromUri(uri));
+ ASSERT_EQ(path, expected_path);
}
// Like TestFileSystemFromUri, but with an arbitrary non-existing path
@@ -220,6 +224,7 @@ class TestLocalFS : public LocalFSTestMixin {
}
void TestInvalidUri(const std::string& uri) {
+ ARROW_SCOPED_TRACE("uri = ", uri);
if (!path_formatter_.supports_uri()) {
return; // skip
}
@@ -227,10 +232,20 @@ class TestLocalFS : public LocalFSTestMixin {
}
void TestInvalidUriOrPath(const std::string& uri) {
+ ARROW_SCOPED_TRACE("uri = ", uri);
if (!path_formatter_.supports_uri()) {
return; // skip
}
ASSERT_RAISES(Invalid, FileSystemFromUriOrPath(uri));
+ LocalFileSystem lfs;
+ ASSERT_RAISES(Invalid, lfs.PathFromUri(uri));
+ }
+
+ void TestInvalidPathFromUri(const std::string& uri, const std::string&
expected_err) {
+ // Legitimate URI for the wrong filesystem
+ LocalFileSystem lfs;
+ ASSERT_THAT(lfs.PathFromUri(uri),
+ Raises(StatusCode::Invalid, testing::HasSubstr(expected_err)));
}
void CheckConcreteFile(const std::string& path, int64_t expected_size) {
@@ -320,6 +335,7 @@ TYPED_TEST(TestLocalFS, FileSystemFromUriFile) {
"//some server/some share/some path");
#else
this->TestInvalidUri("file://server/share/foo/bar");
+ this->TestInvalidUriOrPath("file://server/share/foo/bar");
#endif
// Relative paths
@@ -334,9 +350,10 @@ TYPED_TEST(TestLocalFS, FileSystemFromUriNoScheme) {
// Variations
this->TestLocalUriOrPath(this->path_formatter_("/foo/bar"), "/foo/bar");
+ this->TestLocalUriOrPath(this->path_formatter_("/"), "/");
#ifdef _WIN32
- this->TestLocalUriOrPath(this->path_formatter_("C:/foo/bar/"),
"C:/foo/bar/");
+ this->TestLocalUriOrPath(this->path_formatter_("C:/foo/bar/"), "C:/foo/bar");
#endif
// Relative paths
@@ -360,6 +377,11 @@ TYPED_TEST(TestLocalFS,
FileSystemFromUriNoSchemeBackslashes) {
this->TestInvalidUriOrPath("foo\\bar");
}
+TYPED_TEST(TestLocalFS, MismatchedFilesystemPathFromUri) {
+ this->TestInvalidPathFromUri("s3://foo",
+ "expected a URI with one of the schemes
(file)");
+}
+
TYPED_TEST(TestLocalFS, DirectoryMTime) {
TimePoint t1 = CurrentTimePoint();
ASSERT_OK(this->fs_->CreateDir("AB/CD/EF"));
diff --git a/cpp/src/arrow/filesystem/mockfs.cc
b/cpp/src/arrow/filesystem/mockfs.cc
index 3bc6f4464e..8eff8ecc2f 100644
--- a/cpp/src/arrow/filesystem/mockfs.cc
+++ b/cpp/src/arrow/filesystem/mockfs.cc
@@ -436,6 +436,14 @@ MockFileSystem::MockFileSystem(TimePoint current_time,
const io::IOContext& io_c
bool MockFileSystem::Equals(const FileSystem& other) const { return this ==
&other; }
+Result<std::string> MockFileSystem::PathFromUri(const std::string& uri_string)
const {
+ ARROW_ASSIGN_OR_RAISE(
+ std::string parsed_path,
+ internal::PathFromUriHelper(uri_string, {"mock"},
/*accept_local_paths=*/true,
+
internal::AuthorityHandlingBehavior::kDisallow));
+ return std::string(internal::RemoveLeadingSlash(parsed_path));
+}
+
Status MockFileSystem::CreateDir(const std::string& path, bool recursive) {
RETURN_NOT_OK(ValidatePath(path));
auto parts = SplitAbstractPath(path);
diff --git a/cpp/src/arrow/filesystem/mockfs.h
b/cpp/src/arrow/filesystem/mockfs.h
index e12408f52c..32d06e5910 100644
--- a/cpp/src/arrow/filesystem/mockfs.h
+++ b/cpp/src/arrow/filesystem/mockfs.h
@@ -66,6 +66,7 @@ class ARROW_EXPORT MockFileSystem : public FileSystem {
std::string type_name() const override { return "mock"; }
bool Equals(const FileSystem& other) const override;
+ Result<std::string> PathFromUri(const std::string& uri_string) const
override;
// XXX It's not very practical to have to explicitly declare inheritance
// of default overrides.
diff --git a/cpp/src/arrow/filesystem/path_util.cc
b/cpp/src/arrow/filesystem/path_util.cc
index ba4892a0ac..e25e544f03 100644
--- a/cpp/src/arrow/filesystem/path_util.cc
+++ b/cpp/src/arrow/filesystem/path_util.cc
@@ -129,7 +129,17 @@ std::string EnsureLeadingSlash(std::string_view v) {
return std::string(v);
}
}
-std::string_view RemoveTrailingSlash(std::string_view key) {
+std::string_view RemoveTrailingSlash(std::string_view key, bool preserve_root)
{
+ if (preserve_root && key.size() == 1) {
+ // If the user gives us "/" then don't return ""
+ return key;
+ }
+#ifdef _WIN32
+ if (preserve_root && key.size() == 3 && key[1] == ':' && key[0] != '/') {
+ // If the user gives us C:/ then don't return C:
+ return key;
+ }
+#endif
while (!key.empty() && key.back() == kSep) {
key.remove_suffix(1);
}
diff --git a/cpp/src/arrow/filesystem/path_util.h
b/cpp/src/arrow/filesystem/path_util.h
index 059827fb0a..b821e79338 100644
--- a/cpp/src/arrow/filesystem/path_util.h
+++ b/cpp/src/arrow/filesystem/path_util.h
@@ -69,8 +69,11 @@ std::string_view RemoveLeadingSlash(std::string_view s);
ARROW_EXPORT
std::string EnsureTrailingSlash(std::string_view s);
+/// \brief remove any trailing forward slashes from the given path
+/// \param s the input path
+/// \param preserve_root if true, a bare root ("/", or "C:/" on Windows) is left unchanged
ARROW_EXPORT
-std::string_view RemoveTrailingSlash(std::string_view s);
+std::string_view RemoveTrailingSlash(std::string_view s, bool preserve_root =
false);
ARROW_EXPORT
Status AssertNoTrailingSlash(std::string_view s);
diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc
index a22d9c10be..dc033b9958 100644
--- a/cpp/src/arrow/filesystem/s3fs.cc
+++ b/cpp/src/arrow/filesystem/s3fs.cc
@@ -2235,6 +2235,11 @@ bool S3FileSystem::Equals(const FileSystem& other) const
{
return options().Equals(s3fs.options());
}
+Result<std::string> S3FileSystem::PathFromUri(const std::string& uri_string)
const {
+ return internal::PathFromUriHelper(uri_string, {"s3"},
/*accept_local_paths=*/false,
+
internal::AuthorityHandlingBehavior::kPrepend);
+}
+
S3Options S3FileSystem::options() const { return impl_->options(); }
std::string S3FileSystem::region() const { return impl_->region(); }
diff --git a/cpp/src/arrow/filesystem/s3fs.h b/cpp/src/arrow/filesystem/s3fs.h
index 2bccecafe8..0a7ca73ccb 100644
--- a/cpp/src/arrow/filesystem/s3fs.h
+++ b/cpp/src/arrow/filesystem/s3fs.h
@@ -247,6 +247,7 @@ class ARROW_EXPORT S3FileSystem : public FileSystem {
std::string region() const;
bool Equals(const FileSystem& other) const override;
+ Result<std::string> PathFromUri(const std::string& uri_string) const
override;
/// \cond FALSE
using FileSystem::GetFileInfo;
diff --git a/cpp/src/arrow/filesystem/s3fs_test.cc
b/cpp/src/arrow/filesystem/s3fs_test.cc
index 38df84bded..5b0287d997 100644
--- a/cpp/src/arrow/filesystem/s3fs_test.cc
+++ b/cpp/src/arrow/filesystem/s3fs_test.cc
@@ -58,6 +58,7 @@
#include "arrow/status.h"
#include "arrow/testing/future_util.h"
#include "arrow/testing/gtest_util.h"
+#include "arrow/testing/matchers.h"
#include "arrow/testing/util.h"
#include "arrow/util/async_generator.h"
#include "arrow/util/checked_cast.h"
@@ -1143,6 +1144,18 @@ TEST_F(TestS3FS, FileSystemFromUri) {
ASSERT_OK_AND_ASSIGN(auto fs, FileSystemFromUri(ss.str(), &path));
ASSERT_EQ(path, "bucket/somedir/subdir/subfile");
+ ASSERT_OK_AND_ASSIGN(path, fs->PathFromUri(ss.str()));
+ ASSERT_EQ(path, "bucket/somedir/subdir/subfile");
+
+ // Incorrect scheme
+ ASSERT_THAT(fs->PathFromUri("file:///@bucket/somedir/subdir/subfile"),
+ Raises(StatusCode::Invalid,
+ testing::HasSubstr("expected a URI with one of the
schemes (s3)")));
+
+ // Not a URI
+ ASSERT_THAT(fs->PathFromUri("/@bucket/somedir/subdir/subfile"),
+ Raises(StatusCode::Invalid, testing::HasSubstr("Expected a
URI")));
+
// Check the filesystem has the right connection parameters
AssertFileInfo(fs.get(), path, FileType::File, 8);
}
diff --git a/cpp/src/arrow/filesystem/util_internal.cc
b/cpp/src/arrow/filesystem/util_internal.cc
index 79e8503818..a2f34fb1bb 100644
--- a/cpp/src/arrow/filesystem/util_internal.cc
+++ b/cpp/src/arrow/filesystem/util_internal.cc
@@ -24,10 +24,12 @@
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/util/io_util.h"
+#include "arrow/util/string.h"
namespace arrow {
using internal::StatusDetailFromErrno;
+using internal::Uri;
namespace fs {
namespace internal {
@@ -76,6 +78,114 @@ Status InvalidDeleteDirContents(std::string_view path) {
"If you wish to delete the root directory's contents, call
DeleteRootDirContents.");
}
+Result<Uri> ParseFileSystemUri(const std::string& uri_string) {
+ Uri uri;
+ auto status = uri.Parse(uri_string);
+ if (!status.ok()) {
+#ifdef _WIN32
+ // Could be a "file:..." URI with backslashes instead of regular slashes.
+ RETURN_NOT_OK(uri.Parse(ToSlashes(uri_string)));
+ if (uri.scheme() != "file") {
+ return status;
+ }
+#else
+ return status;
+#endif
+ }
+ return std::move(uri);
+}
+
+#ifdef _WIN32
+static bool IsDriveLetter(char c) {
+ // Can't use locale-dependent functions from the C/C++ stdlib
+ return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
+}
+#endif
+
+bool DetectAbsolutePath(const std::string& s) {
+ // Is it a /-prefixed local path?
+ if (s.length() >= 1 && s[0] == '/') {
+ return true;
+ }
+#ifdef _WIN32
+ // Is it a \-prefixed local path?
+ if (s.length() >= 1 && s[0] == '\\') {
+ return true;
+ }
+ // Does it start with a drive letter in addition to being /- or \-prefixed,
+ // e.g. "C:\..."?
+ if (s.length() >= 3 && s[1] == ':' && (s[2] == '/' || s[2] == '\\') &&
+ IsDriveLetter(s[0])) {
+ return true;
+ }
+#endif
+ return false;
+}
+
+Result<std::string> PathFromUriHelper(const std::string& uri_string,
+ std::vector<std::string>
supported_schemes,
+ bool accept_local_paths,
+ AuthorityHandlingBehavior
authority_handling) {
+ if (internal::DetectAbsolutePath(uri_string)) {
+ if (accept_local_paths) {
+ // Normalize the path and remove any trailing slash
+ return std::string(
+ internal::RemoveTrailingSlash(ToSlashes(uri_string),
/*preserve_root=*/true));
+ }
+ return Status::Invalid(
+ "The filesystem is not capable of loading local paths. Expected a URI
but "
+ "received ",
+ uri_string);
+ }
+ Uri uri;
+ ARROW_RETURN_NOT_OK(uri.Parse(uri_string));
+ const auto scheme = uri.scheme();
+ if (std::find(supported_schemes.begin(), supported_schemes.end(), scheme) ==
+ supported_schemes.end()) {
+ std::string expected_schemes =
+ ::arrow::internal::JoinStrings(supported_schemes, ", ");
+ return Status::Invalid("The filesystem expected a URI with one of the
schemes (",
+ expected_schemes, ") but received ", uri_string);
+ }
+ std::string host = uri.host();
+ std::string path = uri.path();
+ if (host.empty()) {
+ // Just a path, may be absolute or relative, only allow relative paths if
local
+ if (path[0] == '/') {
+ return std::string(internal::RemoveTrailingSlash(path));
+ }
+ if (accept_local_paths) {
+ return std::string(internal::RemoveTrailingSlash(path));
+ }
+ return Status::Invalid("The filesystem does not support relative paths.
Received ",
+ uri_string);
+ }
+ if (authority_handling == AuthorityHandlingBehavior::kDisallow) {
+ return Status::Invalid(
+ "The filesystem does not support the authority (host) component of a
URI. "
+ "Received ",
+ uri_string);
+ }
+ if (path[0] != '/') {
+ // This should not be possible
+ return Status::Invalid(
+ "The provided URI has a host component but a relative path which is
not "
+ "supported. "
+ "Received ",
+ uri_string);
+ }
+ switch (authority_handling) {
+ case AuthorityHandlingBehavior::kPrepend:
+ return std::string(internal::RemoveTrailingSlash(host + path));
+ case AuthorityHandlingBehavior::kWindows:
+ return std::string(internal::RemoveTrailingSlash("//" + host + path));
+ case AuthorityHandlingBehavior::kIgnore:
+ return std::string(internal::RemoveTrailingSlash(path));
+ default:
+ return Status::Invalid("Unrecognized authority_handling value");
+ }
+}
+
Result<FileInfoVector> GlobFiles(const std::shared_ptr<FileSystem>& filesystem,
const std::string& glob) {
// TODO: ARROW-17640
diff --git a/cpp/src/arrow/filesystem/util_internal.h
b/cpp/src/arrow/filesystem/util_internal.h
index cc16dbba10..29a51512d0 100644
--- a/cpp/src/arrow/filesystem/util_internal.h
+++ b/cpp/src/arrow/filesystem/util_internal.h
@@ -24,9 +24,11 @@
#include "arrow/filesystem/filesystem.h"
#include "arrow/io/interfaces.h"
#include "arrow/status.h"
+#include "arrow/util/uri.h"
#include "arrow/util/visibility.h"
namespace arrow {
+using internal::Uri;
namespace fs {
namespace internal {
@@ -50,6 +52,40 @@ Status NotAFile(std::string_view path);
ARROW_EXPORT
Status InvalidDeleteDirContents(std::string_view path);
+/// \brief Parse the string as a URI
+/// \param uri_string the string to parse
+///
+/// This is the same as Uri::Parse except it tolerates Windows
+/// file URIs that contain backslash instead of /
+Result<Uri> ParseFileSystemUri(const std::string& uri_string);
+
+/// \brief check if the string is a local absolute path
+ARROW_EXPORT
+bool DetectAbsolutePath(const std::string& s);
+
+/// \brief describes how to handle the authority (host) component of the URI
+enum class AuthorityHandlingBehavior {
+ // Return an invalid status if the authority is non-empty
+ kDisallow = 0,
+ // Prepend the authority to the path (e.g. authority/some/path)
+ kPrepend = 1,
+ // Convert to a Windows style network path (e.g. //authority/some/path)
+ kWindows = 2,
+ // Ignore the authority and just use the path
+ kIgnore = 3
+};
+
+/// \brief check to see if uri_string matches one of the supported schemes and
return the
+/// path component
+/// \param uri_string a uri or local path to test and convert
+/// \param supported_schemes the set of URI schemes that should be accepted
+/// \param accept_local_paths if true, allow an absolute path
+/// \return the path portion of the URI
+Result<std::string> PathFromUriHelper(const std::string& uri_string,
+ std::vector<std::string>
supported_schemes,
+ bool accept_local_paths,
+ AuthorityHandlingBehavior
authority_handling);
+
/// \brief Return files matching the glob pattern on the filesystem
///
/// Globbing starts from the root of the filesystem.