felipecrv commented on code in PR #39067:
URL: https://github.com/apache/arrow/pull/39067#discussion_r1505256437
##########
cpp/src/arrow/filesystem/filesystem.cc:
##########
@@ -674,6 +684,53 @@ Status CopyFiles(const std::shared_ptr<FileSystem>&
source_fs,
return CopyFiles(sources, destinations, io_context, chunk_size, use_threads);
}
+struct Registry {
+ std::shared_mutex mutex;
+ std::unordered_map<std::string, FileSystem::Factory*> scheme_to_factory;
+ std::vector<void (*)()> finalizers;
+ bool finalized = false;
+};
Review Comment:
Could these members be made `private` and all functions that manipulate them
(including the mutex) become part of the class itself?
##########
cpp/src/arrow/filesystem/filesystem.cc:
##########
@@ -682,6 +739,19 @@ Result<std::shared_ptr<FileSystem>>
FileSystemFromUriReal(const Uri& uri,
std::string*
out_path) {
const auto scheme = uri.scheme();
+ {
+ auto& [mutex, scheme_to_factory, _, finalized] =
*GetFileSystemFactoryRegistry();
+ std::shared_lock lock{mutex};
+ if (finalized) {
+ return Status::Invalid("FileSystem factories were already finalized!");
+ }
+
+ auto it = scheme_to_factory.find(scheme);
+ if (it != scheme_to_factory.end()) {
+ return it->second(uri, io_context, out_path);
+ }
+ }
+
Review Comment:
Another block that can be moved to `Registry`.
##########
cpp/examples/arrow/filesystem_definition_example.cc:
##########
@@ -0,0 +1,150 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <arrow/filesystem/filesystem.h>
+#include <arrow/io/memory.h>
+#include <arrow/result.h>
+#include <arrow/util/uri.h>
+
+// Demonstrate registering a user-defined Arrow FileSystem outside
+// of the Arrow source tree.
+
+using arrow::Result;
+using arrow::Status;
+namespace io = arrow::io;
+namespace fs = arrow::fs;
+
+class ExampleFileSystem : public fs::FileSystem {
+ public:
+ explicit ExampleFileSystem(const io::IOContext& io_context)
+ : fs::FileSystem{io_context} {}
+
+ // This is a mock filesystem whose root directory contains a single file.
+ // All operations which would mutate will simply raise an error.
+ static constexpr std::string_view kPath = "example_file";
+ static constexpr std::string_view kContents = "hello world";
+ static fs::FileInfo info() {
+ fs::FileInfo info;
+ info.set_path(std::string{kPath});
+ info.set_type(fs::FileType::File);
+ info.set_size(kContents.size());
+ return info;
+ }
+
+ static Status DoesntExist(std::string_view path) {
Review Comment:
nit: `NotFound` or `PathNotFound` maybe
##########
cpp/src/arrow/filesystem/filesystem.cc:
##########
@@ -674,6 +684,53 @@ Status CopyFiles(const std::shared_ptr<FileSystem>&
source_fs,
return CopyFiles(sources, destinations, io_context, chunk_size, use_threads);
}
+struct Registry {
+ std::shared_mutex mutex;
+ std::unordered_map<std::string, FileSystem::Factory*> scheme_to_factory;
+ std::vector<void (*)()> finalizers;
+ bool finalized = false;
+};
+
+extern "C" ARROW_EXPORT auto* GetFileSystemFactoryRegistry() {
+ // Check if this function is the one linked to main
+ static Registry registry;
+ return &registry;
+}
+
+extern "C" {
+ARROW_EXPORT void MergeFileSystemRegistry(void* main_registry) {
+ if (GetFileSystemFactoryRegistry() == main_registry) return;
+
+ auto& [mutex, scheme_to_factory, finalizers, _f] =
*GetFileSystemFactoryRegistry();
+ auto& [main_mutex, main_scheme_to_factory, main_finalizers, _mf] =
+ *static_cast<Registry*>(main_registry);
+
+ std::unique_lock lock{mutex}, main_lock{main_mutex};
+
+ for (auto& [scheme, factory] : scheme_to_factory) {
+ auto& ref = main_scheme_to_factory[scheme];
+ if (ref) continue;
+ ref = factory;
+ }
+ scheme_to_factory.clear();
+
+ for (auto* finalizer : finalizers) {
+ main_finalizers.push_back(finalizer);
+ }
+ finalizers.clear();
+}
Review Comment:
This can become `GetFileSystemFactoryRegistry()->Merge(main_registry);` with
`Merge` being defined on `Registry`. The initial check would be `this ==
main_registry` and all the member variables would already be in scope.
##########
cpp/src/arrow/filesystem/filesystem.h:
##########
@@ -225,7 +232,8 @@ class ARROW_EXPORT FileSystem : public
std::enable_shared_from_this<FileSystem>
/// Create a directory and subdirectories.
///
/// This function succeeds if the directory already exists.
- virtual Status CreateDir(const std::string& path, bool recursive = true) = 0;
+ virtual Status CreateDir(const std::string& path, bool recursive) = 0;
+ Status CreateDir(const std::string& path) { return CreateDir(path, true); }
Review Comment:
All power to you for changing this @bkietz. Default arguments in virtual
functions are so confusing!
##########
cpp/examples/arrow/filesystem_definition_example.cc:
##########
@@ -0,0 +1,150 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <arrow/filesystem/filesystem.h>
+#include <arrow/io/memory.h>
+#include <arrow/result.h>
+#include <arrow/util/uri.h>
+
+// Demonstrate registering a user-defined Arrow FileSystem outside
+// of the Arrow source tree.
+
+using arrow::Result;
+using arrow::Status;
+namespace io = arrow::io;
+namespace fs = arrow::fs;
+
+class ExampleFileSystem : public fs::FileSystem {
+ public:
+ explicit ExampleFileSystem(const io::IOContext& io_context)
+ : fs::FileSystem{io_context} {}
+
+ // This is a mock filesystem whose root directory contains a single file.
+ // All operations which would mutate will simply raise an error.
+ static constexpr std::string_view kPath = "example_file";
+ static constexpr std::string_view kContents = "hello world";
+ static fs::FileInfo info() {
+ fs::FileInfo info;
+ info.set_path(std::string{kPath});
+ info.set_type(fs::FileType::File);
+ info.set_size(kContents.size());
+ return info;
+ }
+
+ static Status DoesntExist(std::string_view path) {
+ return Status::IOError("Path does not exist '", path, "'");
+ }
+
+ static Status NoMutation() {
+ return Status::IOError("operations which would mutate are not permitted");
Review Comment:
Why not `Status::NotImplemented`?
##########
cpp/src/arrow/filesystem/filesystem.cc:
##########
@@ -737,6 +807,31 @@ Result<std::shared_ptr<FileSystem>>
FileSystemFromUriReal(const Uri& uri,
} // namespace
+void RegisterFileSystemFactory(std::string scheme, FileSystem::Factory factory,
+ void finalizer()) {
+ auto& [mutex, scheme_to_factory, finalizers, finalized] =
+ *GetFileSystemFactoryRegistry();
+ std::unique_lock lock{mutex};
+ if (finalized) return;
+
+ auto& ref = scheme_to_factory[scheme];
+ if (ref) return;
+ ref = factory;
+
+ finalizers.push_back(finalizer);
+}
+
+void EnsureFinalized() {
+ auto& [mutex, scheme_to_factory, finalizers, finalized] =
+ *GetFileSystemFactoryRegistry();
+ std::unique_lock lock{mutex};
+ if (finalized) return;
+
+ for (auto finalizer : finalizers) {
+ finalizer();
+ }
+}
Review Comment:
These two can also be moved to `Registry`.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]