pitrou commented on code in PR #39067:
URL: https://github.com/apache/arrow/pull/39067#discussion_r1505835423


##########
cpp/src/arrow/filesystem/filesystem.h:
##########
@@ -519,6 +574,82 @@ Result<std::shared_ptr<FileSystem>> 
FileSystemFromUriOrPath(
 
 /// @}
 
+/// \defgroup filesystem-factory-registration Helpers for FileSystem 
registration
+///
+/// @{
+
+/// \brief Register a Filesystem factory
+///
+/// Support for custom Uri schemes can be added by registering a factory
+/// for the corresponding FileSystem.
+///
+/// \param[in] scheme a Uri scheme which the factory will handle.
+///            If a factory has already been registered for a scheme,
+///            the new factory will be ignored.

Review Comment:
   Can we perhaps return a `Result<bool>`? Successful return would be true if 
the factory was actually added, false otherwise.



##########
cpp/src/arrow/util/library.cc:
##########
@@ -0,0 +1,59 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/library.h"
+
+#include "arrow/util/io_util.h"
+
+#ifdef _WIN32
+#include <Windows.h>
+#else
+#include <dlfcn.h>
+#endif
+
+namespace arrow::util {
+
+// TODO(bkietz) unify with the utilities in hdfs_internal.cc
+Result<void*> LoadDynamicLibrary(const char* path) {
+#ifdef _WIN32
+  if (void* handle = LoadLibraryA(path)) return handle;

Review Comment:
   Ideally we should use `LoadLibraryW` after converting from utf8 to wide 
char. There are probably the required utilities in `io_util.cc` already (+ 
other functions doing the same conversion).



##########
cpp/src/arrow/util/library.h:
##########
@@ -0,0 +1,33 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/result.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow::util {
+
+/// \brief Load a dynamic library
+///
+/// This wraps dlopen() except on Windows, where LoadLibrary() is called.

Review Comment:
   Can you please explain what this returns? 



##########
cpp/src/arrow/util/library.h:
##########
@@ -0,0 +1,33 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/result.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow::util {
+

Review Comment:
   I would put these in `io_util.h` with other platform-related utilities.



##########
cpp/src/arrow/util/library.cc:
##########
@@ -0,0 +1,59 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/library.h"
+
+#include "arrow/util/io_util.h"
+
+#ifdef _WIN32
+#include <Windows.h>
+#else
+#include <dlfcn.h>
+#endif
+
+namespace arrow::util {
+
+// TODO(bkietz) unify with the utilities in hdfs_internal.cc
+Result<void*> LoadDynamicLibrary(const char* path) {
+#ifdef _WIN32
+  if (void* handle = LoadLibraryA(path)) return handle;
+  return arrow::internal::IOErrorFromWinError(GetLastError(), "LoadLibrary 
failed.");

Review Comment:
   People will probably want to have the library path in the error message (for 
dlopen as well below).



##########
cpp/src/arrow/filesystem/filesystem.h:
##########
@@ -324,6 +346,10 @@ class ARROW_EXPORT FileSystem : public 
std::enable_shared_from_this<FileSystem>
       const std::shared_ptr<const KeyValueMetadata>& metadata) = 0;
   Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(const 
std::string& path);
 
+  using Factory = Result<std::shared_ptr<FileSystem>>(const Uri& uri,

Review Comment:
   Style nit: we generally don't use nested type definitions except when 
they're used specifically in the enclosing class's APIs (which does not seem to 
be the case here). Should we make this a top-level `FileSystemFactory` instead?
   



##########
cpp/src/arrow/util/library.cc:
##########
@@ -0,0 +1,59 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/library.h"
+
+#include "arrow/util/io_util.h"
+
+#ifdef _WIN32
+#include <Windows.h>
+#else
+#include <dlfcn.h>
+#endif
+
+namespace arrow::util {
+
+// TODO(bkietz) unify with the utilities in hdfs_internal.cc

Review Comment:
   Should this be done in this PR or is it hdfs_internal.cc too intrincate?



##########
cpp/src/arrow/filesystem/filesystem.h:
##########
@@ -519,6 +574,82 @@ Result<std::shared_ptr<FileSystem>> 
FileSystemFromUriOrPath(
 
 /// @}
 
+/// \defgroup filesystem-factory-registration Helpers for FileSystem 
registration
+///
+/// @{
+
+/// \brief Register a Filesystem factory
+///
+/// Support for custom Uri schemes can be added by registering a factory
+/// for the corresponding FileSystem.
+///
+/// \param[in] scheme a Uri scheme which the factory will handle.
+///            If a factory has already been registered for a scheme,
+///            the new factory will be ignored.
+/// \param[in] factory a function which can produce a FileSystem for Uris 
which match
+///            scheme.
+/// \param[in] finalizer a function which must be called to finalize the 
factory before
+///            the process exits, or nullptr if no finalization is necessary.
+ARROW_EXPORT void RegisterFileSystemFactory(std::string scheme,
+                                            FileSystem::Factory factory,
+                                            void finalizer() = NULLPTR);

Review Comment:
   If we make the finalizer a `std::function<void()>`, wouldn't that give more 
flexibility? Or is that technically impossible?



##########
cpp/src/arrow/filesystem/filesystem.h:
##########
@@ -519,6 +574,82 @@ Result<std::shared_ptr<FileSystem>> 
FileSystemFromUriOrPath(
 
 /// @}
 
+/// \defgroup filesystem-factory-registration Helpers for FileSystem 
registration
+///
+/// @{
+
+/// \brief Register a Filesystem factory
+///
+/// Support for custom Uri schemes can be added by registering a factory
+/// for the corresponding FileSystem.
+///
+/// \param[in] scheme a Uri scheme which the factory will handle.
+///            If a factory has already been registered for a scheme,
+///            the new factory will be ignored.
+/// \param[in] factory a function which can produce a FileSystem for Uris 
which match
+///            scheme.
+/// \param[in] finalizer a function which must be called to finalize the 
factory before
+///            the process exits, or nullptr if no finalization is necessary.
+ARROW_EXPORT void RegisterFileSystemFactory(std::string scheme,
+                                            FileSystem::Factory factory,
+                                            void finalizer() = NULLPTR);
+
+/// \brief Register Filesystem factories from a shared library
+///
+/// In addition to dynamically loading the indicated library, registries are 
merged if
+/// necessary (static linkage can produce isolated registries).
+ARROW_EXPORT Status LoadFileSystemFactories(const char* libpath);

Review Comment:
   Can the docstring make the contract more explicit? i.e., IIUC, the library 
is supposed to instantiate static FileSystemRegistrar instances that 
automatically register the provided factories?
   



##########
cpp/src/arrow/filesystem/filesystem.cc:
##########
@@ -674,6 +684,53 @@ Status CopyFiles(const std::shared_ptr<FileSystem>& 
source_fs,
   return CopyFiles(sources, destinations, io_context, chunk_size, use_threads);
 }
 
+struct Registry {
+  std::shared_mutex mutex;
+  std::unordered_map<std::string, FileSystem::Factory*> scheme_to_factory;
+  std::vector<void (*)()> finalizers;
+  bool finalized = false;
+};
+
+extern "C" ARROW_EXPORT auto* GetFileSystemFactoryRegistry() {
+  // Check if this function is the one linked to main
+  static Registry registry;
+  return &registry;
+}
+
+extern "C" {
+ARROW_EXPORT void MergeFileSystemRegistry(void* main_registry) {
+  if (GetFileSystemFactoryRegistry() == main_registry) return;
+
+  auto& [mutex, scheme_to_factory, finalizers, _f] = 
*GetFileSystemFactoryRegistry();
+  auto& [main_mutex, main_scheme_to_factory, main_finalizers, _mf] =
+      *static_cast<Registry*>(main_registry);
+
+  std::unique_lock lock{mutex}, main_lock{main_mutex};
+
+  for (auto& [scheme, factory] : scheme_to_factory) {
+    auto& ref = main_scheme_to_factory[scheme];
+    if (ref) continue;
+    ref = factory;
+  }
+  scheme_to_factory.clear();
+
+  for (auto* finalizer : finalizers) {
+    main_finalizers.push_back(finalizer);
+  }
+  finalizers.clear();
+}
+}
+
+Status LoadFileSystemFactories(const char* libpath) {
+  ARROW_ASSIGN_OR_RAISE(void* lib, util::LoadDynamicLibrary(libpath));
+
+  if (void* merge = util::GetSymbol(lib, 
"MergeFileSystemRegistry").ValueOr(nullptr)) {

Review Comment:
   I'm curious: why do we lookup `MergeFileSystemRegistry`, while we could 
simply lookup `GetFileSystemFactoryRegistry` and use our own merge 
function/method?



##########
cpp/src/arrow/util/uri.h:
##########
@@ -86,6 +85,9 @@ class ARROW_EXPORT Uri {
   /// Factory function to parse a URI from its string representation.
   Status Parse(const std::string& uri_string);
 
+  /// Factory function to parse a URI from its string representation.
+  static Result<Uri> FromString(const std::string& uri_string);

Review Comment:
   +1 :-)



##########
cpp/src/arrow/filesystem/filesystem.cc:
##########
@@ -674,6 +684,53 @@ Status CopyFiles(const std::shared_ptr<FileSystem>& 
source_fs,
   return CopyFiles(sources, destinations, io_context, chunk_size, use_threads);
 }
 
+struct Registry {
+  std::shared_mutex mutex;
+  std::unordered_map<std::string, FileSystem::Factory*> scheme_to_factory;
+  std::vector<void (*)()> finalizers;
+  bool finalized = false;
+};
+
+extern "C" ARROW_EXPORT auto* GetFileSystemFactoryRegistry() {
+  // Check if this function is the one linked to main
+  static Registry registry;
+  return &registry;
+}
+
+extern "C" {
+ARROW_EXPORT void MergeFileSystemRegistry(void* main_registry) {

Review Comment:
   If these "extern C" functions are not adorned with the `arrow` namespace, 
then they should probably be prefix with "Arrow".



##########
cpp/src/arrow/util/uri.cc:
##########
@@ -218,6 +215,9 @@ std::string Uri::path() const {
 std::string Uri::query_string() const { return 
TextRangeToString(impl_->uri_.query); }
 
 Result<std::vector<std::pair<std::string, std::string>>> Uri::query_items() 
const {
+  // XXX would it be worthwhile to fold this parsing into Uri::parse() or maybe
+  // cache these lazily in an unordered_map? Then we could provide
+  // Uri::query_item(std::string name)

Review Comment:
   Not all URIs may use this format for query strings, so this can't be folded 
into `Parse`.
   
   We can perhaps cache the result and also provide a `query_items_map()` to 
return an `unordered_map` if we want.



##########
cpp/src/arrow/filesystem/filesystem.h:
##########
@@ -36,8 +36,15 @@
 #include "arrow/util/windows_fixup.h"
 
 namespace arrow {
+
+namespace util {
+class Uri;

Review Comment:
   Can be added to `arrow/util/type_fwd.h` instead?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to