pitrou commented on code in PR #39067:
URL: https://github.com/apache/arrow/pull/39067#discussion_r1516197413


##########
cpp/src/arrow/util/visibility.h:
##########
@@ -80,4 +82,6 @@
 #define ARROW_FRIEND_EXPORT
 #define ARROW_TEMPLATE_EXPORT
 
+#define ARROW_FORCE_EXPORT [[gnu::visibility("default")]]

Review Comment:
   Is this required in some particular case? Unless instructed otherwise, gcc 
exports all non-static symbols by default.



##########
cpp/src/arrow/filesystem/filesystem.cc:
##########
@@ -674,6 +685,153 @@ Status CopyFiles(const std::shared_ptr<FileSystem>& 
source_fs,
   return CopyFiles(sources, destinations, io_context, chunk_size, use_threads);
 }
 
+class FileSystemFactoryRegistry {
+ public:
+  static FileSystemFactoryRegistry* GetInstance() {
+    static FileSystemFactoryRegistry registry;
+    if (registry.merged_into_ != nullptr) {
+      return registry.merged_into_;
+    }
+    return &registry;
+  }
+
+  Result<const FileSystemFactory*> FactoryForScheme(const std::string& scheme) 
{
+    std::shared_lock lock{mutex_};
+    RETURN_NOT_OK(CheckValid());
+
+    auto it = scheme_to_factory_.find(scheme);
+    if (it == scheme_to_factory_.end()) return nullptr;
+
+    return it->second.Map([](const auto& r) { return &r.factory; });
+  }
+
+  Status MergeInto(FileSystemFactoryRegistry* main_registry) {
+    std::unique_lock lock{mutex_};
+    RETURN_NOT_OK(CheckValid());
+
+    std::unique_lock main_lock{main_registry->mutex_};
+    RETURN_NOT_OK(main_registry->CheckValid());
+
+    std::vector<std::string_view> duplicated_schemes;
+    for (auto& [scheme, registered] : scheme_to_factory_) {
+      if (!registered.ok()) {
+        duplicated_schemes.emplace_back(scheme);
+        continue;
+      }
+
+      auto [it, success] =
+          main_registry->scheme_to_factory_.emplace(std::move(scheme), 
registered);
+      if (success) continue;
+
+      duplicated_schemes.emplace_back(it->first);
+    }
+    scheme_to_factory_.clear();
+    merged_into_ = main_registry;
+
+    if (duplicated_schemes.empty()) return Status::OK();
+    return Status::KeyError("Attempted to register ", 
duplicated_schemes.size(),
+                            " factories for schemes ['",
+                            arrow::internal::JoinStrings(duplicated_schemes, 
"', '"),
+                            "'] but those schemes were already registered.");
+  }
+
+  void EnsureFinalized() {
+    std::unique_lock lock{mutex_};
+    if (finalized_) return;
+
+    for (const auto& [_, registered_or_error] : scheme_to_factory_) {
+      if (!registered_or_error.ok()) continue;
+      registered_or_error->finalizer();
+    }
+    finalized_ = true;
+  }
+
+  Status RegisterFactory(std::string scheme, FileSystemFactory factory,
+                         std::function<void()> finalizer, bool defer_error) {
+    std::unique_lock lock{mutex_};
+    RETURN_NOT_OK(CheckValid());
+
+    auto [it, success] = scheme_to_factory_.emplace(
+        std::move(scheme), Registered{std::move(factory), 
std::move(finalizer)});
+    if (success) {
+      return Status::OK();
+    }
+
+    auto st = Status::KeyError(
+        "Attempted to register factory for "
+        "scheme '",
+        it->first,
+        "' but that scheme is already "
+        "registered.");
+    if (!defer_error) return st;
+
+    it->second = std::move(st);
+    return Status::OK();
+  }
+
+ private:
+  struct Registered {
+    FileSystemFactory factory;
+    std::function<void()> finalizer;
+  };
+
+  Status CheckValid() {
+    if (finalized_) {
+      return Status::Invalid("FileSystem factories were already finalized!");
+    }
+    if (merged_into_ != nullptr) {
+      return Status::Invalid(
+          "FileSystem factories were merged into a different registry!");
+    }
+    return Status::OK();
+  }
+
+  std::shared_mutex mutex_;
+  std::unordered_map<std::string, Result<Registered>> scheme_to_factory_;
+  bool finalized_ = false;
+  FileSystemFactoryRegistry* merged_into_ = nullptr;
+};
+
+Status RegisterFileSystemFactory(std::string scheme, FileSystemFactory factory,
+                                 std::function<void()> finalizer) {
+  return FileSystemFactoryRegistry::GetInstance()->RegisterFactory(
+      std::move(scheme), factory, std::move(finalizer),
+      /*defer_error=*/false);
+}
+
+void EnsureFinalized() { 
FileSystemFactoryRegistry::GetInstance()->EnsureFinalized(); }
+
+FileSystemRegistrar::FileSystemRegistrar(std::string scheme, FileSystemFactory 
factory,
+                                         std::function<void()> finalizer) {
+  DCHECK_OK(FileSystemFactoryRegistry::GetInstance()->RegisterFactory(
+      std::move(scheme), std::move(factory), std::move(finalizer),
+      /*defer_error=*/true));
+}
+
+namespace internal {
+void* GetFileSystemRegistry() { return 
FileSystemFactoryRegistry::GetInstance(); }
+}  // namespace internal
+
+Status LoadFileSystemFactories(const char* libpath) {
+  using ::arrow::internal::GetSymbolAs;
+  using ::arrow::internal::LoadDynamicLibrary;
+
+  ARROW_ASSIGN_OR_RAISE(void* lib, LoadDynamicLibrary(libpath));
+  auto* get_instance =
+      GetSymbolAs<void*()>(lib, 
"arrow_filesystem_get_registry").ValueOr(nullptr);
+  if (get_instance == nullptr) {
+    return Status::OK();

Review Comment:
   Add comment explaining why this consider this ok?



##########
cpp/src/arrow/testing/examplefs.cc:
##########
@@ -0,0 +1,35 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/filesystem/filesystem.h"
+#include "arrow/filesystem/filesystem_library.h"
+#include "arrow/result.h"
+#include "arrow/util/uri.h"
+
+namespace arrow::fs {
+
+FileSystemRegistrar kExampleFileSystemModule{
+    "example",
+    [](const Uri& uri, const io::IOContext& io_context,
+       std::string* out_path) -> Result<std::shared_ptr<FileSystem>> {
+      constexpr std::string_view kScheme = "example";

Review Comment:
   Can we assert that the URI scheme is equal to `kScheme`?



##########
docs/source/cpp/io.rst:
##########
@@ -91,4 +101,46 @@ Concrete implementations are available for
 
   Tasks that use filesystems will typically run on the
   :ref:`I/O thread pool<io_thread_pool>`.  For filesystems that support high 
levels
-  of concurrency you may get a benefit from increasing the size of the I/O 
thread pool.
\ No newline at end of file
+  of concurrency you may get a benefit from increasing the size of the I/O 
thread pool.
+
+Defining new FileSystems

Review Comment:
   filesystems?



##########
docs/source/cpp/io.rst:
##########
@@ -81,7 +84,14 @@ underlying storage, are automatically dereferenced.  Only 
basic
 :class:`metadata <FileStats>` about file entries, such as the file size
 and modification time, is made available.
 
-Concrete implementations are available for
+Filesystem instances are constructed from URI strings using one of the

Review Comment:
   ```suggestion
   Filesystem instances can be constructed from URI strings using one of the
   ```



##########
cpp/src/arrow/filesystem/localfs_test.cc:
##########
@@ -86,6 +84,86 @@ Result<std::shared_ptr<FileSystem>> FSFromUriOrPath(const 
std::string& uri,
 ////////////////////////////////////////////////////////////////////////////
 // Misc tests
 
+Result<std::shared_ptr<FileSystem>> SlowFileSystemFactory(const Uri& uri,

Review Comment:
   Could add a dedicated heading instead of adding these tests under the "Misc 
tests" heading.



##########
docs/source/cpp/io.rst:
##########
@@ -73,6 +73,9 @@ The :class:`filesystem interface <FileSystem>` allows 
abstracted access over
 various data storage backends such as the local filesystem or a S3 bucket.
 It provides input and output streams as well as directory operations.
 
+.. seealso::
+    :ref:`FileSystems API reference <cpp-api-filesystems>`.

Review Comment:
   `FileSystem` is the C++ class, but otherwise I think we would just write 
"Filesystems" or "filesystems".



##########
docs/source/cpp/io.rst:
##########
@@ -91,4 +101,46 @@ Concrete implementations are available for
 
   Tasks that use filesystems will typically run on the
   :ref:`I/O thread pool<io_thread_pool>`.  For filesystems that support high 
levels
-  of concurrency you may get a benefit from increasing the size of the I/O 
thread pool.
\ No newline at end of file
+  of concurrency you may get a benefit from increasing the size of the I/O 
thread pool.
+
+Defining new FileSystems
+========================
+
+Support for additional URI schemes can be added to the
+:ref:`FromUri factories <filesystem-factory-functions>`
+by registering a factory for each new URI scheme with
+:func:`~arrow::fs::RegisterFileSystemFactory`. To enable the common case
+wherein it is preferred that registration be automatic, an instance of
+:class:`~arrow::fs::FileSystemRegistrar` can be defined at namespace
+scope, which will register a factory whenever the instance is loaded:
+
+.. code-block:: cpp
+
+    arrow::fs::FileSystemRegistrar kExampleFileSystemModule{
+      "example",
+      [](const Uri& uri, const io::IOContext& io_context,
+          std::string* out_path) -> 
Result<std::shared_ptr<arrow::fs::FileSystem>> {
+        EnsureExampleFileSystemInitialized();
+        return std::make_shared<ExampleFileSystem>();
+      },
+      &EnsureExampleFileSystemFinalized,
+    };
+
+If a filesystem implementation requires initialization before any instances
+may be constructed, this should be included in the corresponding factory or
+otherwise automatically ensured before the factory is invoked. Likewise if
+a filesystem implementation requires tear down before the process ends, this
+can be wrapped in a function and registered alongside the factory. All
+finalizers will be called by :func:`~arrow::fs::EnsureFinalized`.
+
+Build complexity can be decreased by compartmentalizing a FileSystem
+implementation into a separate shared library, which applications may
+link or load dynamically. Arrow's built-in FileSystem implementations
+also follow this pattern. If a shared library containing instances of
+:class:`~arrow::fs::FileSystemRegistrar` must be dynamically loaded,
+:func:`~arrow::fs::LoadFileSystemFactories` should be used to load it.
+Such a library should have exactly one of its sources
+``#include "arrow/filesystem/filesystem_library.h"``

Review Comment:
   Did we determine this is required for shared libraries unless they 
statically link to Arrow?



##########
cpp/src/arrow/filesystem/localfs_test.cc:
##########
@@ -86,6 +84,86 @@ Result<std::shared_ptr<FileSystem>> FSFromUriOrPath(const 
std::string& uri,
 ////////////////////////////////////////////////////////////////////////////
 // Misc tests
 
+Result<std::shared_ptr<FileSystem>> SlowFileSystemFactory(const Uri& uri,
+                                                          const io::IOContext& 
io_context,
+                                                          std::string* 
out_path) {
+  auto local_uri = "file" + uri.ToString().substr(uri.scheme().size());
+  ARROW_ASSIGN_OR_RAISE(auto base_fs, FileSystemFromUri(local_uri, io_context, 
out_path));
+  double average_latency = 1;
+  int32_t seed = 0xDEADBEEF;
+  ARROW_ASSIGN_OR_RAISE(auto params, uri.query_items());

Review Comment:
   There seems to be no test passing any query parameters, is it useful to 
parse them?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to