westonpace commented on a change in pull request #12279:
URL: https://github.com/apache/arrow/pull/12279#discussion_r799918352



##########
File path: cpp/src/arrow/engine/substrait/extension_set.h
##########
@@ -0,0 +1,137 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This API is EXPERIMENTAL.
+
+#pragma once
+
+#include <vector>
+
+#include "arrow/engine/visibility.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/optional.h"
+#include "arrow/util/string_view.h"
+
+namespace arrow {
+namespace engine {
+
+/// A mapping from arrow types and functions to the (uri, name) which 
identifies
+/// the corresponding substrait extension. Substrait types and variations must 
be
+/// registered with their corresponding arrow::DataType before they can be 
used!
+class ARROW_ENGINE_EXPORT ExtensionIdRegistry {
+ public:
+  /// All uris registered in this ExtensionIdRegistry
+  virtual std::vector<util::string_view> Uris() const = 0;
+
+  struct Id {
+    util::string_view uri, name;
+
+    bool empty() const { return uri.empty() && name.empty(); }
+  };
+
+  struct TypeRecord {
+    Id id;
+    const std::shared_ptr<DataType>& type;
+    bool is_variation;
+  };
+  virtual util::optional<TypeRecord> GetType(const DataType&) const = 0;
+  virtual util::optional<TypeRecord> GetType(Id, bool is_variation) const = 0;
+  virtual Status RegisterType(Id, std::shared_ptr<DataType>, bool 
is_variation) = 0;
+
+  // FIXME some functions will not be simple enough to convert without access 
to their
+  // arguments/options. For example is_in embeds the set in options rather 
than using an
+  // argument:
+  //     is_in(x, SetLookupOptions(set)) <-> (k...Uri, "is_in")(x, set)
+  //
+  // ... for another example, depending on the value of the first argument to
+  // substrait::add it either corresponds to arrow::add or arrow::add_checked
+  struct FunctionRecord {
+    Id id;
+    const std::string& function_name;
+  };
+  virtual util::optional<FunctionRecord> GetFunction(Id) const = 0;
+  virtual util::optional<FunctionRecord> GetFunction(
+      util::string_view arrow_function_name) const = 0;
+  virtual Status RegisterFunction(Id, std::string arrow_function_name) = 0;
+};
+
+constexpr util::string_view kArrowExtTypesUri =
+    "https://github.com/apache/arrow/blob/master/format/substrait/";
+    "extension_types.yaml";
+
+ARROW_ENGINE_EXPORT ExtensionIdRegistry* default_extension_id_registry();
+
+/// A subset of an ExtensionIdRegistry with extensions identifiable by an 
integer.
+///
+/// ExtensionSet does not own strings; it only refers to strings in an
+/// ExtensionIdRegistry.
+class ARROW_ENGINE_EXPORT ExtensionSet {
+ public:
+  using Id = ExtensionIdRegistry::Id;
+
+  /// Construct an empty ExtensionSet to be populated during serialization.
+  explicit ExtensionSet(ExtensionIdRegistry* = 
default_extension_id_registry());
+  ARROW_DEFAULT_MOVE_AND_ASSIGN(ExtensionSet);
+
+  /// Construct an ExtensionSet with explicit extension ids for efficient 
referencing
+  /// during deserialization. Note that input vectors need not be densely 
packed; an empty
+  /// (default constructed) Id may be used as a placeholder to indicate an 
unused
+  /// _anchor/_reference. This factory will be used to wrap the extensions 
declared in a
+  /// substrait::Plan before deserializing the plan's relations.
+  ///
+  /// Views will be replaced with equivalent views pointing to memory owned by 
the
+  /// registry.
+  static Result<ExtensionSet> Make(
+      std::vector<util::string_view> uris, std::vector<Id> type_ids,
+      std::vector<bool> type_is_variation, std::vector<Id> function_ids,
+      ExtensionIdRegistry* = default_extension_id_registry());
+
+  // index in these vectors == value of _anchor/_reference fields
+  /// FIXME this assumes that _anchor/_references won't be huge, which is not 
guaranteed.
+  /// Could it be?
+  const std::vector<util::string_view>& uris() const { return uris_; }

Review comment:
       ARROW-15583




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to